In [3]:
!pip install pinecone-client cohere datasets

Collecting datasets
  Downloading datasets-3.0.0-py3-none-any.whl.metadata (19 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.0-py3-none-any.whl (474 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m474.3/474.3 kB[0m [31m23.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl (39.9 MB)
[2K 

In [4]:
import os
import pinecone
import cohere
import torch
from datasets import load_dataset


In [5]:
PINECONE_API_KEY = "your key"
COHERE_API_KEY = "key"


In [6]:
def init_pinecone():
    pc = pinecone.Pinecone(api_key=PINECONE_API_KEY)
    index_name = "qa-bot"

    # Checking if the index already exists
    if index_name in pc.list_indexes().names():
        print(f"Deleting existing index '{index_name}'...")
        pc.delete_index(index_name)

    # Create a new index with the correct dimension
    print(f"Creating index '{index_name}' with dimension 4096...")
    pc.create_index(
        name=index_name,
        dimension=4096,
        metric='cosine',
        spec=pinecone.ServerlessSpec(
            cloud='aws',
            region='us-east-1'
        )
    )
    return pc, index_name


In [7]:
def load_and_preprocess_data():
    print("Loading and preprocessing dataset...")
    dataset = load_dataset('wikipedia', '20220301.en', split='train', streaming=True)

    documents = []
    for i, article in enumerate(dataset.take(25)):
        documents.append(article['text'])

    print("Documents loaded and preprocessed.")
    return documents


In [8]:
def generate_embeddings(documents):
    print("Generating embeddings using Cohere...")
    co = cohere.Client(COHERE_API_KEY)
    embeddings = co.embed(texts=documents).embeddings
    print(f"Generated {len(embeddings)} embeddings.")
    return embeddings


In [9]:
def store_embeddings_in_pinecone(embeddings, pc, index_name):
    print("Storing embeddings in Pinecone...")
    index = pc.Index(index_name)

    for i, embed in enumerate(embeddings):
        index.upsert([(f'doc_{i}', embed)])

    print("Embeddings stored successfully in Pinecone.")
    return index


In [10]:
def query_pinecone(query_embedding, pc, index_name, top_k=5):
    print("Querying Pinecone...")
    index = pc.Index(index_name)
    query_result = index.query(vector=query_embedding, top_k=top_k)

    return query_result


In [11]:
def generate_answer(query, retrieved_texts, max_tokens=1500):
    print("Generating answer using Cohere's generate API...")
    co = cohere.Client(COHERE_API_KEY)

    context = ' '.join(retrieved_texts)
    prompt = f"Question: {query}\n\nContext: {context[:max_tokens]}...\n\nPlease provide a concise answer about machine learning."

    response = co.generate(
        prompt=prompt,
        max_tokens=150
    )
    return response.generations[0].text


In [12]:
def main():
    pc, index_name = init_pinecone()
    documents = load_and_preprocess_data()
    embeddings = generate_embeddings(documents)
    index = store_embeddings_in_pinecone(embeddings, pc, index_name)

    # Sample query
    query = "What is machine learning?"
    co = cohere.Client(COHERE_API_KEY)
    query_embedding = co.embed(texts=[query]).embeddings[0]

    query_result = query_pinecone(query_embedding, pc, index_name)
    retrieved_texts = [documents[int(match.id.split('_')[1])] for match in query_result.matches]
    answer = generate_answer(query, retrieved_texts, max_tokens=1500)
    print(f"Answer: {answer}")

if __name__ == "__main__":
    main()


Deleting existing index 'qa-bot'...
Creating index 'qa-bot' with dimension 4096...
Loading and preprocessing dataset...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


wikipedia.py:   0%|          | 0.00/36.7k [00:00<?, ?B/s]

README.md:   0%|          | 0.00/16.0k [00:00<?, ?B/s]

The repository for wikipedia contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/wikipedia.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y
Documents loaded and preprocessed.
Generating embeddings using Cohere...
Generated 25 embeddings.
Storing embeddings in Pinecone...
Embeddings stored successfully in Pinecone.
Querying Pinecone...
Generating answer using Cohere's generate API...
Answer:  Machine learning (ML) is a type of artificial intelligence (AI) that allows software applications to become more accurate at predicting outcomes without being explicitly programmed to do so. Machine learning algorithms use historical data as input to predict new output values. ML uses computational statistics and algorithms to learn from and make predictions based on data, all without being programmed with explicit instructions. 


In [13]:
def main():
    pc, index_name = init_pinecone()
    documents = load_and_preprocess_data()
    embeddings = generate_embeddings(documents)
    index = store_embeddings_in_pinecone(embeddings, pc, index_name)

    # Sample query
    query = "What is supervised learning?"
    co = cohere.Client(COHERE_API_KEY)
    query_embedding = co.embed(texts=[query]).embeddings[0]

    query_result = query_pinecone(query_embedding, pc, index_name)
    retrieved_texts = [documents[int(match.id.split('_')[1])] for match in query_result.matches]
    answer = generate_answer(query, retrieved_texts, max_tokens=1500)
    print(f"Answer: {answer}")

if __name__ == "__main__":
    main()


Deleting existing index 'qa-bot'...
Creating index 'qa-bot' with dimension 4096...
Loading and preprocessing dataset...
Documents loaded and preprocessed.
Generating embeddings using Cohere...
Generated 25 embeddings.
Storing embeddings in Pinecone...
Embeddings stored successfully in Pinecone.
Querying Pinecone...
Generating answer using Cohere's generate API...
Answer:  Supervised learning is a type of machine learning algorithm where the model is trained using labeled data. During training, the algorithm receives input data and corresponding correct output labels to predict future data labels accurately. The goal is to predict the label of new, unseen data based on the relationship between the input data and corresponding labels learned during the training process. It involves input data and corresponding correct output labels during training. It learns to predict accurate output labels for new, unseen data. It is widely used in various applications, such as classification, regressi

In [14]:
def main():
    pc, index_name = init_pinecone()
    documents = load_and_preprocess_data()
    embeddings = generate_embeddings(documents)
    index = store_embeddings_in_pinecone(embeddings, pc, index_name)

    # Sample query
    query = "What are Large Language Models?"
    co = cohere.Client(COHERE_API_KEY)
    query_embedding = co.embed(texts=[query]).embeddings[0]

    query_result = query_pinecone(query_embedding, pc, index_name)
    retrieved_texts = [documents[int(match.id.split('_')[1])] for match in query_result.matches]
    answer = generate_answer(query, retrieved_texts, max_tokens=1500)
    print(f"Answer: {answer}")

if __name__ == "__main__":
    main()


Deleting existing index 'qa-bot'...
Creating index 'qa-bot' with dimension 4096...
Loading and preprocessing dataset...
Documents loaded and preprocessed.
Generating embeddings using Cohere...
Generated 25 embeddings.
Storing embeddings in Pinecone...
Embeddings stored successfully in Pinecone.
Querying Pinecone...
Generating answer using Cohere's generate API...
Answer:  Large Language Models are artificial intelligence tools that have been trained on massive amounts of text data and have the ability to understand, summarize and generate content in response to a wide array of prompts. They enable you to have conversations with them, providing you with answers to questions and helping you with your tasks. Some well-known examples are Cohere's own Command Model, as well as ChatGPT, LaMDA, and BARD. 
