In [65]:
import os
import warnings
from typing import Optional

import cohere
import numpy as np
from datasets import load_dataset
from IPython.display import display
from pinecone import Pinecone, ServerlessSpec
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
warnings.filterwarnings("ignore")

API keys:

In [66]:
def _load_api_key(path: str, env_var: str) -> str:
    """
    Read an API key from a text file, falling back to an environment variable
    Args:
        path: Path of a text file whose whole content is the key
        env_var: Name of the environment variable to use when the file is missing or empty
    Returns:
        str: The key with surrounding whitespace removed
    Raises:
        RuntimeError: If neither the file nor the environment variable provides a key
    """
    key = ""
    if os.path.isfile(path):
        with open(path, encoding="utf-8") as f:
            key = f.read().strip()
    # Fall back to the environment so the notebook also runs where the key files do not exist
    if not key:
        key = os.environ.get(env_var, "").strip()
    if not key:
        raise RuntimeError(f"No API key found in {path!r} or in the {env_var} environment variable")
    return key


# The key files keep priority, so a setup that already has them behaves exactly as before
COHERE_API_KEY = _load_api_key("/content/cohere_api_key.txt", "COHERE_API_KEY")
PINECONE_API_KEY = _load_api_key("/content/pinecone_api_key.txt", "PINECONE_API_KEY")

# **Preprocessing & Embedding the data**

In [67]:
def load_and_embedd_dataset(
        dataset_name: str = 'community-datasets/yahoo_answers_topics',
        split: str = 'train',
        model: Optional[SentenceTransformer] = None,
        text_field: str = 'best_answer',
        rec_num: int = 400
) -> tuple:
    """
    Load a dataset and embedd the text field using a sentence-transformer model
    Args:
        dataset_name: The name of the dataset to load
        split: The split of the dataset to load
        model: The model to use for embedding. When None, an 'all-MiniLM-L6-v2'
            model is created on first use and reused by later calls
        text_field: The field in the dataset that contains the text
        rec_num: The number of records to load and embedd
    Returns:
        tuple: A tuple containing the dataset and the embeddings. The dataset is
            returned in full; the embeddings cover only its first `rec_num` rows
    """

    print("Loading and embedding the dataset")

    # Build the default model lazily instead of in the signature, so defining the
    # function never loads a model that the caller may not need
    if model is None:
        cached_model = getattr(load_and_embedd_dataset, "_default_model", None)
        if cached_model is None:
            cached_model = SentenceTransformer('all-MiniLM-L6-v2')
            load_and_embedd_dataset._default_model = cached_model
        model = cached_model

    # Load the dataset
    dataset = load_dataset(dataset_name, split=split)

    # Slice the rows first and pick the column second: this reads only `rec_num`
    # texts instead of reading the whole column and discarding most of it
    texts = dataset[:rec_num][text_field]

    # Embed the first `rec_num` rows of the dataset
    embeddings = model.encode(texts)

    print("Done!")
    return dataset, embeddings

In [68]:
# Name of the sentence-transformers model used to embed the documents and the queries
EMBEDDING_MODEL = 'all-MiniLM-L6-v2'
# Shared encoder instance; later cells pass it explicitly to the embedding and retrieval helpers
model = SentenceTransformer(EMBEDDING_MODEL)

In [69]:
# Hugging Face dataset that serves as the knowledge base
DATASET_NAME = 'community-datasets/yahoo_answers_topics'

# Embed the `best_answer` text of the first 2000 records with the shared model
dataset, embeddings = load_and_embedd_dataset(
    dataset_name=DATASET_NAME,
    text_field='best_answer',
    model=model,
    rec_num=2000,
)

# Kept at notebook level: the index-creation cell reads the embedding size from it
shape = embeddings.shape

Loading and embedding the dataset
Done!


# **Inserting the data into Pinecone VectorDB**


### Creating the index

In [72]:
def create_pinecone_index(
        index_name: str,
        dimension: int,
        metric: str = 'cosine',
        cloud: str = 'aws',
        region: str = 'us-east-1',
        api_key: Optional[str] = None,
):
    """
    Create a pinecone index if it does not exist
    Args:
        index_name: The name of the index
        dimension: The dimension of the index
        metric: The metric to use for the index
        cloud: The cloud provider of the serverless index
        region: The cloud region of the serverless index
        api_key: The Pinecone API key. When None, the notebook-level
            PINECONE_API_KEY is used
    Returns:
        Pinecone: A pinecone object which can later be used for upserting vectors and connecting to VectorDBs
    """
    print("Creating a Pinecone index...")
    if api_key is None:
        api_key = PINECONE_API_KEY
    pc = Pinecone(api_key=api_key)

    # An index that already exists is reused as is; its dimension and metric are not checked here
    existing_indexes = [index_info["name"] for index_info in pc.list_indexes()]
    if index_name not in existing_indexes:
        pc.create_index(
            name=index_name,
            dimension=dimension,
            metric=metric,
            spec=ServerlessSpec(
                cloud=cloud,
                region=region
            )
        )
    print("Done!")
    return pc

In [74]:
# Name of the Pinecone index that stores the embedded answers
INDEX_NAME = 'yahoo-answers'

# Create the vector database
# We are passing the index_name and the size of our embeddings
# `shape` is `embeddings.shape` from the embedding cell, so shape[1] is the length of one embedding vector
pc = create_pinecone_index(INDEX_NAME, shape[1])

Creating a Pinecone index...
Done!


### Inserting data into the index

In [75]:
def upsert_vectors(
        index,
        embeddings: np.ndarray,
        dataset: dict,
        text_field: str = 'best_answer',
        batch_size: int = 128
):
    """
    Upsert vectors to a pinecone index
    Args:
        index: The pinecone index object (what `pc.Index(name)` returns, not the
            `Pinecone` client itself)
        embeddings: The embeddings to upsert, one row per record
        dataset: The dataset containing the metadata. Row i of `dataset[text_field]`
            must be the text that row i of `embeddings` was computed from
        text_field: The dataset field stored as metadata next to each vector
        batch_size: The batch size to use for upserting
    Returns:
        An updated pinecone index
    Raises:
        ValueError: If `batch_size` is not a positive integer
    """
    print("Upserting the embeddings to the Pinecone index...")
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    num_vectors = embeddings.shape[0]

    # Only the rows that have an embedding are needed; slicing here avoids
    # building a metadata dict for every row of a much larger dataset
    texts = dataset[text_field][:num_vectors]

    # Pair vectors and texts up to the shorter of the two, so no empty batch is sent
    total = min(num_vectors, len(texts))

    for start in tqdm(range(0, total, batch_size)):
        stop = min(start + batch_size, total)
        # (id, vector, metadata) tuples; vectors are sent as plain lists of floats
        batch = [
            (str(row), embeddings[row].tolist(), {text_field: texts[row]})
            for row in range(start, stop)
        ]
        index.upsert(vectors=batch)
    return index


In [76]:
# Upsert the embeddings to the Pinecone index
# `index` is the handle to the index named INDEX_NAME; later cells query it directly
index = pc.Index(INDEX_NAME)
# upsert_vectors returns the index object it was given, so `index_upserted` and `index` are the same object
index_upserted = upsert_vectors(index, embeddings, dataset)

Upserting the embeddings to the Pinecone index...


100%|██████████| 16/16 [00:07<00:00,  2.19it/s]


In [77]:
# Show the index statistics (dimension and vector counts) as the cell output to confirm the upsert
index.describe_index_stats()

{'dimension': 384,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 2000}},
 'total_vector_count': 2000}

# **Retrieving relevant documents and generating answers to the given questions using the retrieved documents and an LLM**


In [216]:
def augment_prompt(
        query: str,
        model: Optional[SentenceTransformer] = None,
        index=None,
        top_k: int = 10,
        text_field: str = 'best_answer',
) -> tuple:
    """
    Augment the prompt with the top `top_k` results from the knowledge base
    Args:
        query: The query to augment
        model: The model used to embed the query. When None, an 'all-MiniLM-L6-v2'
            model is created on first use and reused by later calls
        index: The vectorstore object (required)
        top_k: The number of matches to retrieve and put in the prompt
        text_field: The metadata field that holds the text of each match
    Returns:
        tuple: (augmented_prompt, source_knowledge) - the prompt to send to the
            LLM and the retrieved texts joined by blank lines
    Raises:
        ValueError: If `index` is None
    """
    if index is None:
        raise ValueError("augment_prompt requires the Pinecone index to query (index=...)")

    # Build the default model lazily instead of in the signature, so defining the
    # function never loads a model that the caller may not need
    if model is None:
        cached_model = getattr(augment_prompt, "_default_model", None)
        if cached_model is None:
            cached_model = SentenceTransformer('all-MiniLM-L6-v2')
            augment_prompt._default_model = cached_model
        model = cached_model

    # The query vector is sent as plain Python floats
    query_vector = [float(val) for val in model.encode(query)]

    # get the top `top_k` results from the knowledge base; only the metadata is
    # used below, so the stored vectors themselves are not requested
    query_results = index.query(
        vector=query_vector,
        top_k=top_k,
        include_values=False,
        include_metadata=True
    )['matches']
    text_matches = [match['metadata'][text_field] for match in query_results]

    # get the text from the results
    source_knowledge = "\n\n".join(text_matches)

    # feed into an augmented prompt
    augmented_prompt = f"""Using the contexts below, answer the query.
    Contexts:
    {source_knowledge}
    If the answer is not included in the source knowledge - say that you don't know. please answer shortly.
    Query: {query}"""
    return augmented_prompt, source_knowledge

In [217]:
def generate_simple_answer(query):
    """
    Send the query to the Cohere chat model as is and print the reply
    Args:
        query: The question to ask, sent without any retrieved context
    Returns:
        None. The text of the reply is printed
    """
    client = cohere.Client(api_key=COHERE_API_KEY)
    reply = client.chat(message=query, model='command-r-plus')
    print(reply.text)

In [218]:
def generate_augmented_answer(query):
    """
    Answer the query with retrieved context (RAG) and print the reply
    Args:
        query: The question to ask. It is augmented by `augment_prompt` using the
            notebook-level `model` and `index`
    Returns:
        None. The text of the reply is printed
    """
    # Only the prompt is sent on; the raw retrieved texts are not needed here
    prompt_with_context, _ = augment_prompt(query, model=model, index=index)
    client = cohere.Client(api_key=COHERE_API_KEY)
    reply = client.chat(message=prompt_with_context, model='command-r-plus')
    print(reply.text)

Questions:
1. What is the name of the first person who trained dogs to help blind people?
2. Where did the band, The Dismemberment Plan, get their name from?
3. What year did Virgin Galactic say they would fly tourists to space?

In [219]:
queries = [
    "what is the name of the first person who trained dogs to help blind people?",
    "Where did the band, The Dismemberment Plan, get their name from?",
    "what year did Virgin Galactic say they would fly tourists to space?",
]

# Answer every question twice, without and with retrieved context, so the two can be compared
for number, query in enumerate(queries, start=1):
    print(f"query {number}:\n")
    print("answer without RAG:")
    generate_simple_answer(query)
    print("\n")
    print("RAG answer:")
    generate_augmented_answer(query)
    print("-" * 50)

query 1:

answer without RAG:
The first person to train dogs to assist blind people was a German pastor named Johann Wilhelm Klein. He founded an institution in 1819 that taught crafts and skills to blind people to help them become more independent. As part of this mission, he began training dogs to guide the blind, and in 1835, he successfully paired the first guide dog with a blind person. This pioneering work laid the foundation for the modern guide dog programs that have since transformed the lives of countless individuals with visual impairments.


RAG answer:
Josef Riesinger
--------------------------------------------------
query 2:

answer without RAG:
The band The Dismemberment Plan, formed in Washington, D.C. in 1993, chose their unique and attention-grabbing name through a democratic process. The band members put suggested names into a hat, and "The Dismemberment Plan" was the one randomly drawn out. The name stuck, and the band went on to build a successful career with this