In [1]:
from sentence_transformers import SentenceTransformer
from datasets import load_dataset
from pinecone import Pinecone, ServerlessSpec
import os
from tqdm import tqdm
import cohere
import numpy as np
import warnings
from IPython.display import display
warnings.filterwarnings("ignore")

In [2]:
# API keys are read from the environment so that no secret is stored in the notebook.
# SECURITY: any key that was ever committed in this cell must be treated as leaked —
# revoke/rotate it in the Cohere and Pinecone consoles.
COHERE_API_KEY = os.environ.get("COHERE_API_KEY")
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")

# Surface a missing key here rather than as an opaque client error in a later cell.
# (print is used because warnings are globally silenced in the imports cell.)
_missing_keys = [
    name
    for name, value in (("COHERE_API_KEY", COHERE_API_KEY), ("PINECONE_API_KEY", PINECONE_API_KEY))
    if not value
]
if _missing_keys:
    print(f"WARNING: missing environment variable(s): {', '.join(_missing_keys)}")

In [3]:
# Baseline: send a question straight to the LLM, without any retrieved context.
query = "How many people served as U.S. president between 1880 and 2000?"

co = cohere.Client(api_key=COHERE_API_KEY)
response = co.chat(message=query, model='command-r-plus')
# Last expression of the cell: display the raw text of the model's answer.
response.text

'There were 21 U.S. presidents who served between 1880 and 2000. They are, in order of their presidency:\n\n1. Rutherford B. Hayes (1877-1881)\n2. James A. Garfield (1881)\n3. Chester A. Arthur (1881-1885)\n4. Grover Cleveland (1885-1889)\n5. Benjamin Harrison (1889-1893)\n6. Grover Cleveland (1893-1897)\n7. William McKinley (1897-1901)\n8. Theodore Roosevelt (1901-1909)\n9. William Howard Taft (1909-1913)\n10. Woodrow Wilson (1913-1921)\n11. Warren G. Harding (1921-1923)\n1Multiplier-2Calvin Coolidge (1923-1929)\n13. Herbert Hoover (1929-1933)\n14. Franklin D. Roosevelt (1933-1945)\n15. Harry S. Truman (1945-1953)\n16. Dwight D. Eisenhower (1953-1961)\n17. John F. Kennedy (1961-1963)\n18. Lyndon B. Johnson (1963-1969)\n19. Richard Nixon (1969-1974)\n20. Gerald Ford (1974-1977)\n21. Jimmy Carter (1977-1981)\n22. Ronald Reagan (1981-1989)\n23. George H.W. Bush (1989-1993)\n24. Bill Clinton (1993-2001)\n\nNote that Grover Cleveland served two non-consecutive terms, so he is counted twice

In [4]:
# Load the 'wikitext-2-raw-v1' configuration of the 'wikitext' dataset (no split is selected here).
# NOTE(review): `dataset` is reassigned by the load_and_embedd_dataset(...) call in a later
# cell before it is read anywhere, so this load looks redundant — confirm and remove.
dataset = load_dataset('wikitext', 'wikitext-2-raw-v1')


In [5]:

# Sentence-transformer shared by the notebook: it is passed to load_and_embedd_dataset
# to embed the dataset rows and to augment_prompt (via compare_models) to embed queries.
EMBEDDING_MODEL = 'all-MiniLM-L6-v2'
model = SentenceTransformer(EMBEDDING_MODEL)

In [6]:
def load_and_embedd_dataset(
        dataset_path: str = 'wikitext',
        dataset_name: str = 'wikitext-2-raw-v1',
        split: str = 'train',
        model: "SentenceTransformer | None" = None,
        text_field: str = 'text',
        rec_num: int = 400
) -> tuple:
    """
    Load a dataset and embed the text field using a sentence-transformer model
    Args:
        dataset_path: The path/identifier of the dataset, passed to `load_dataset`
        dataset_name: The name (configuration) of the dataset to load
        split: The split of the dataset to load
        model: The model to use for embedding. When None, an 'all-MiniLM-L6-v2'
            SentenceTransformer is created on first use. (A model instance must not be
            used as the default value: defaults are evaluated once, when the function is
            defined, which would load a second model even if the caller supplies one.)
        text_field: The field in the dataset that contains the text
        rec_num: The number of records to load and embed
    Returns:
        tuple: A tuple `(dataset, embeddings)` containing the whole loaded split and the
            embeddings of its first `rec_num` rows (so the two can differ in length)
    """
    print("Loading and embedding the dataset")

    # Build the fallback model lazily, only when the caller did not provide one
    if model is None:
        model = SentenceTransformer('all-MiniLM-L6-v2')

    # Load the dataset
    dataset = load_dataset(dataset_path, dataset_name, split=split)

    # Embed the first `rec_num` rows of the dataset
    texts = dataset[text_field][:rec_num]
    embeddings = model.encode(texts)

    print("Done!")
    return dataset, embeddings

In [7]:
# Identifiers of the dataset to load
dataset_path, dataset_name = 'wikitext', 'wikitext-2-raw-v1'

# Load the function's default split and embed its first 400 rows with the shared `model`
dataset, embeddings = load_and_embedd_dataset(
    dataset_path=dataset_path, dataset_name=dataset_name, model=model, rec_num=400,
)

# Kept for a later cell, which reads shape[1] to size the Pinecone index
shape = embeddings.shape

Loading and embedding the dataset
Done!


In [8]:
# Convert the loaded split to a pandas DataFrame and preview its first 7 rows.
pd_dataset = dataset.to_pandas()
pd_dataset.head(7)

Unnamed: 0,text
0,
1,= Valkyria Chronicles III = \n
2,
3,Senjō no Valkyria 3 : Unrecorded Chronicles (...
4,"The game began development in 2010 , carrying..."
5,"It met with positive sales in Japan , and was..."
6,


In [9]:
# Show the shape of the embeddings array returned by load_and_embedd_dataset.
print(f"The embeddings shape: {embeddings.shape}")

The embeddings shape: (400, 384)


In [10]:
def create_pinecone_index(
        index_name: str,
        dimension: int,
        metric: str = 'cosine',
        cloud: str = 'aws',
        region: str = 'us-east-1',
):
    """
    Create a serverless pinecone index if it does not exist
    Args:
        index_name: The name of the index
        dimension: The dimension of the index (must match the embedding size)
        metric: The metric to use for the index
        cloud: The cloud provider used for the serverless index
        region: The cloud region used for the serverless index
    Returns:
        Pinecone: A pinecone object which can later be used for upserting vectors and connecting to VectorDBs
    Notes:
        The client is authenticated with the module-level `PINECONE_API_KEY`.
        An index that already exists under `index_name` is reused as-is: its dimension and
        metric are NOT compared with the arguments given here.
    """
    # `Pinecone` and `ServerlessSpec` come from the notebook's import cell; no local re-import needed
    print("Creating a Pinecone index...")
    pc = Pinecone(api_key=PINECONE_API_KEY)
    existing_indexes = {index_info["name"] for index_info in pc.list_indexes()}
    if index_name not in existing_indexes:
        pc.create_index(
            name=index_name,
            dimension=dimension,
            # Remember! It is crucial that the metric you will use in your VectorDB will also be a metric your embedding
            # model works well with!
            metric=metric,
            spec=ServerlessSpec(
                cloud=cloud,
                region=region
            )
        )
    print("Done!")
    return pc

In [11]:
# NOTE(review): the index is named 'cnn-dailymail' although the rows embedded in this
# notebook come from the 'wikitext' dataset. create_pinecone_index reuses any existing
# index with this name without checking its contents — confirm the name is intentional.
INDEX_NAME = 'cnn-dailymail'

# Create the vector database
# We are passing the index_name and the size of our embeddings
# (shape[1] is the second entry of `embeddings.shape`, stored in an earlier cell)
pc = create_pinecone_index(INDEX_NAME, shape[1])

Creating a Pinecone index...
Done!


In [12]:
def upsert_vectors(
        index,
        embeddings: np.ndarray,
        dataset: dict,
        text_field: str = 'text',
        batch_size: int = 128
):
    """
    Upsert vectors to a pinecone index
    Args:
        index: The pinecone index object (the handle returned by `pc.Index(name)`,
            not the `Pinecone` client itself); only its `upsert` method is used
        embeddings: The embeddings to upsert, one row per record
        dataset: The dataset containing the metadata; `dataset[text_field]` must be a
            sliceable sequence whose first rows correspond to the rows of `embeddings`
        text_field: The field of `dataset` holding the text; also used as the metadata key
        batch_size: The batch size to use for upserting
    Returns:
        An updated pinecone index
    Raises:
        ValueError: If `batch_size` is not positive, or if `dataset` has fewer texts than
            there are embeddings (previously the extra embeddings were silently dropped)
    """
    print("Upserting the embeddings to the Pinecone index...")
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    embeddings = np.asarray(embeddings)
    num_vectors = embeddings.shape[0]

    # Only fetch the texts that actually have an embedding: the dataset may hold many more
    # rows than were embedded, and building metadata for all of them is wasted work.
    texts = list(dataset[text_field][:num_vectors])
    if len(texts) < num_vectors:
        raise ValueError(
            f"dataset has {len(texts)} '{text_field}' entries but {num_vectors} embeddings were given"
        )

    # create list of (id, vector, metadata) tuples to be upserted;
    # vectors are sent as plain lists of Python floats rather than numpy rows
    to_upsert = [
        (str(i), vector.tolist(), {text_field: text})
        for i, (vector, text) in enumerate(zip(embeddings, texts))
    ]

    for start in tqdm(range(0, num_vectors, batch_size)):
        # slicing clamps at the end of the list, so the last batch may be shorter
        index.upsert(vectors=to_upsert[start:start + batch_size])
    return index


In [13]:
# Upsert the embeddings to the Pinecone index
# `pc.Index(INDEX_NAME)` gives the handle for the named index; upsert_vectors returns the
# index object it was given, so `index_upserted` refers to the same object as `index`.
index = pc.Index(INDEX_NAME)
index_upserted = upsert_vectors(index, embeddings, dataset)

Upserting the embeddings to the Pinecone index...


100%|██████████| 4/4 [00:16<00:00,  4.04s/it]


In [14]:
# Display the index statistics (dimension and vector counts) to check the upsert.
index.describe_index_stats()

{'dimension': 384,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 400}},
 'total_vector_count': 400}

In [15]:
def augment_prompt(
        query: str,
        model: "SentenceTransformer | None" = None,
        index=None,
        top_k: int = 3,
        text_field: str = 'text',
) -> tuple:
    """
    Augment the prompt with the top results from the knowledge base
    Args:
        query: The query to augment
        model: The model used to embed the query. When None, an 'all-MiniLM-L6-v2'
            SentenceTransformer is created on first use (not at function-definition time).
        index: The vectorstore object; required, even though it defaults to None
        top_k: The number of matches to retrieve from the knowledge base
        text_field: The metadata key under which each match stores its text
    Returns:
        tuple: `(augmented_prompt, source_knowledge)` - the prompt to send to the LLM and
            the retrieved texts joined by blank lines
    Raises:
        ValueError: If no index is provided
    """
    # Fail with a clear message before doing any (potentially expensive) model work
    if index is None:
        raise ValueError("augment_prompt requires a vector index: pass index=<your index>")
    if model is None:
        model = SentenceTransformer('all-MiniLM-L6-v2')

    # the query vector is sent as a plain list of Python floats
    query_vector = [float(val) for val in model.encode(query)]

    # get top results from knowledge base; only the metadata is read below, so the
    # stored vector values are not requested
    query_results = index.query(
        vector=query_vector,
        top_k=top_k,
        include_values=False,
        include_metadata=True
    )['matches']
    text_matches = [match['metadata'][text_field] for match in query_results]

    # get the text from the results
    source_knowledge = "\n\n".join(text_matches)

    # feed into an augmented prompt
    augmented_prompt = f"""Using the contexts below, answer the query.
    Contexts:
    {source_knowledge}
    If the answer is not included in the source knowledge - say that you don't know.
    Query: {query}"""
    return augmented_prompt, source_knowledge

In [16]:
def compare_models(query, llm_model='command-r-plus'):
    """
    Print the LLM's answer to a query with and without retrieved context
    Args:
        query: The question to ask
        llm_model: The name of the Cohere chat model used for both answers
    Returns:
        None. Three sections are printed: the answer to the raw query, the answer to the
        augmented prompt, and the source knowledge that was added to that prompt.
    Notes:
        Relies on the notebook-level `COHERE_API_KEY`, `model` (embedding model) and
        `index` (vector index), so the cells defining them must have been run first.
    """
    separator = '-------------------------------------------------------------------------------------------------'
    co = cohere.Client(api_key=COHERE_API_KEY)

    def ask(message):
        # Single place for the chat call so both answers use identical settings
        return co.chat(model=llm_model, message=message).text

    baseline_answer = ask(query)
    print("Original response:")
    print(baseline_answer)
    print(separator)

    augmented_prompt, source_knowledge = augment_prompt(query, model=model, index=index)
    augmented_answer = ask(augmented_prompt)
    print("Augmented response:")
    print(augmented_answer)
    print(separator)
    print("Source knowledge:")
    print(source_knowledge)

In [17]:
# Compare the plain and the retrieval-augmented answers for a question about the
# 'Valkyria Chronicles III' article.
query = "Who sung the opening theme in Valkyria Chronicles III?"
compare_models(query)

Original response:
*Valkyria Chronicles III* is an amazing game with a beautiful opening theme! The song is called “Blue Star” and it is performed by Japanese singer and voice actress, Kanako Kotera.
-------------------------------------------------------------------------------------------------
Augmented response:
May'n
-------------------------------------------------------------------------------------------------
Source knowledge:
 The game began development in 2010 , carrying over a large portion of the work done on Valkyria Chronicles II . While it retained the standard features of the series , it also underwent multiple adjustments , such as making the game more forgiving for series newcomers . Character designer Raita Honjou and composer Hitoshi Sakimoto both returned from previous entries , along with Valkyria Chronicles II director Takeshi Ozawa . A large team of writers handled the script . The game 's opening theme was sung by May 'n . 


 = Valkyria Chronicles III = 


 T

In [18]:
# Same comparison for a date question; the printed "Source knowledge" shows the
# passage that was retrieved for it.
query = "When did Raphael Tuck buy four of Barker's 'little drawings'?"
compare_models(query)

Original response:
Raphael Tuck bought four of Barker's "little drawings" in 1866.
-------------------------------------------------------------------------------------------------
Augmented response:
In 1911, Raphael Tuck & Sons bought four of Barker's "little drawings."
-------------------------------------------------------------------------------------------------
Source knowledge:
 In 1911 , Raphael Tuck & Sons bought four of Barker 's " little drawings " for half a sovereign , and published them as postcards . In October 1911 , she won second prize in the Croydon Art Society 's poster competition , and shortly afterward was elected the youngest member of the Society . The art critic for the Croydon Advertiser remarked , " Her drawings show a remarkable freedom of spirit . She has distinct promise . " 


 Following her father ’ s death in June 1912 , the seventeen @-@ year @-@ old Barker submitted art and poetry to My Magazine , Child ’ s Own , Leading Strings , and Raphael Tuck a

In [19]:
# Same comparison for a question that is ambiguous without the retrieved context.
query = "When did Columbus have the best chance of receiving the first overall pick?"
compare_models(query)

Original response:
2015
-------------------------------------------------------------------------------------------------
Augmented response:
Columbus had the best chance of receiving the first overall pick in the 2012 NHL Entry Draft lottery.
-------------------------------------------------------------------------------------------------
Source knowledge:
 Finishing with the worst record in the NHL , Columbus had the best chance of receiving the first overall pick in the 2012 draft . With the NHL 's weighted draft lottery the Blue Jackets had a 48 @.@ 2 % chance of drafting first overall . However , the lottery was won by the Edmonton Oilers , who proceeded to leapfrog Columbus and secure the number one draft pick for a third consecutive year . It was the fifth time that the Blue Jackets were dropped one draft position in the franchise 's 12 lottery participations . 


 Two weeks prior to the NHL trade deadline , Columbus announced that unlike earlier in the season , they would liste