In [2]:
!pip install sentence_transformers
!pip install datasets
!pip install pinecone-client
!pip install cohere

Collecting sentence_transformers
  Downloading sentence_transformers-3.0.1-py3-none-any.whl (227 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m227.1/227.1 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.11.0->sentence_transformers)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.11.0->sentence_transformers)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.11.0->sentence_transformers)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=1.11.0->sentence_transformers)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch>=1.11.0->sentence_transform

In [3]:
from sentence_transformers import SentenceTransformer
from datasets import load_dataset
from pinecone import Pinecone, ServerlessSpec
import os
from tqdm import tqdm
import cohere
import numpy as np
import warnings
from IPython.display import display
warnings.filterwarnings("ignore")

In [5]:
# Read the Cohere and Pinecone credentials from local text files instead of
# embedding them in the notebook; surrounding whitespace/newlines are removed.
with open("chohere_api_keys.txt") as cohere_key_file:
    COHERE_API_KEY = cohere_key_file.read().strip()

with open("pinecone_api_key.txt") as pinecone_key_file:
    PINECONE_API_KEY = pinecone_key_file.read().strip()

In [6]:
# Checkpoint name of the sentence-transformers model; the resulting `model`
# object is passed to the embedding and prompt-augmentation functions below.
# (SentenceTransformer is already imported in the imports cell at the top.)
EMBEDDING_MODEL = 'all-MiniLM-L6-v2'
model = SentenceTransformer(EMBEDDING_MODEL)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [15]:
def load_and_embedd_dataset(
        dataset_name: str = 'nihal-mp/worst_cars_in_history',
        split: str = 'train',
        model: "SentenceTransformer | None" = None,
        text_field: str = 'Content',
        rec_num: int = 400
) -> tuple:
    """
    Load a dataset and embed its text field using a sentence-transformer model
    Args:
        dataset_name: The name of the dataset to load
        split: The split of the dataset to load
        model: The model to use for embedding. When None, an
            'all-MiniLM-L6-v2' SentenceTransformer is created on demand.
        text_field: The field in the dataset that contains the text
        rec_num: The number of records (from the start of the split) to embed
    Returns:
        tuple: (dataset, embeddings). `dataset` is the full loaded split;
            `embeddings` covers only its first `rec_num` rows.
    """
    from datasets import load_dataset

    # Build the default model lazily. A model constructed in the signature is
    # created once at definition time, even when the caller passes their own.
    if model is None:
        from sentence_transformers import SentenceTransformer
        model = SentenceTransformer('all-MiniLM-L6-v2')

    print("Loading and embedding the dataset")

    # Load the dataset
    dataset = load_dataset(dataset_name, split=split)

    # Embed the first `rec_num` rows of the dataset
    embeddings = model.encode(dataset[text_field][:rec_num])

    print("Done!")
    return dataset, embeddings

In [16]:
DATASET_NAME = 'nihal-mp/worst_cars_in_history'

# Embed only the first 40 records with the model loaded above, then keep the
# embeddings' shape around for sizing the vector index later.
dataset, embeddings = load_and_embedd_dataset(dataset_name=DATASET_NAME, model=model, rec_num=40)
shape = embeddings.shape

Loading and embedding the dataset
Done!


Let us look at the dataset and the embeddings.

In [17]:
# Convert the loaded dataset to a pandas DataFrame and preview its first 5 rows
pd_dataset = dataset.to_pandas()
pd_dataset.head(5)

Unnamed: 0,Car,Content
0,Triumph Mayflower (1949-53),Triumph Mayflower The Triumph Mayflower was an...
1,Nash/Austin Metropolitan (1954-62),Nash Metropolitan The Nash Metropolitan was on...
2,Renault Dauphine (North American version) (195...,Renault Dauphine While the Renault Dauphine wa...
3,Trabant (1957-90),Trabant P50 Limousine The Trabant P50 was intr...
4,Edsel (1958),Edsel Corsair Launched with considerable publi...


In [18]:
# Show the shape of the embeddings array returned by load_and_embedd_dataset
print(f"The embeddings shape: {embeddings.shape}")

The embeddings shape: (40, 384)


## Second Element - Vector Database
We will use Pinecone's free-to-use vector database.

In [19]:
def create_pinecone_index(
        index_name: str,
        dimension: int,
        metric: str = 'cosine',
        cloud: str = 'aws',
        region: str = 'us-east-1',
        api_key: "str | None" = None,
):
    """
    Create a serverless pinecone index if it does not exist
    Args:
        index_name: The name of the index
        dimension: The dimension of the index
        metric: The metric to use for the index
        cloud: The cloud provider passed to the serverless spec
        region: The cloud region passed to the serverless spec
        api_key: The Pinecone API key. When None, the notebook-level
            PINECONE_API_KEY is used.
    Returns:
        Pinecone: A pinecone object which can later be used for upserting vectors and connecting to VectorDBs
    """
    from pinecone import Pinecone, ServerlessSpec

    print("Creating a Pinecone index...")
    # Only fall back to the notebook-level key when the caller gave none, so
    # the function can also be used outside this notebook.
    if api_key is None:
        api_key = PINECONE_API_KEY
    pc = Pinecone(api_key=api_key)

    existing_indexes = {index_info["name"] for index_info in pc.list_indexes()}
    if index_name not in existing_indexes:
        pc.create_index(
            name=index_name,
            dimension=dimension,
            # Remember! It is crucial that the metric you will use in your VectorDB will also be a metric your embedding
            # model works well with!
            metric=metric,
            spec=ServerlessSpec(
                cloud=cloud,
                region=region
            )
        )
    print("Done!")
    return pc

In [21]:
INDEX_NAME = 'worst-cars-in-history'

# Create the vector database (only if it is missing). The index dimension is
# the second entry of the embeddings' shape computed earlier.
pc = create_pinecone_index(index_name=INDEX_NAME, dimension=shape[1])

Creating a Pinecone index...
Done!


Now that we have created the vector database, let's add some data to it!

In [22]:
def upsert_vectors(
        index,
        embeddings: np.ndarray,
        dataset: dict,
        text_field: str = 'Content',
        batch_size: int = 128
):
    """
    Upsert vectors to a pinecone index
    Args:
        index: The pinecone index object (anything exposing `upsert(vectors=...)`)
        embeddings: The embeddings to upsert, one row per record
        dataset: The dataset containing the metadata; `dataset[text_field]`
            must yield the texts in the same order as the embeddings
        text_field: The dataset field stored as metadata next to each vector
        batch_size: The batch size to use for upserting
    Returns:
        An updated pinecone index
    Raises:
        ValueError: If `batch_size` is smaller than 1
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    print("Upserting the embeddings to the Pinecone index...")
    vectors = np.asarray(embeddings)
    num_vectors = vectors.shape[0]

    # Only the texts that have a matching embedding are needed; the dataset may
    # hold more rows than were embedded.
    texts = dataset[text_field][:num_vectors]

    # create list of (id, vector, metadata) tuples to be upserted; the id is the
    # row position and the vector is sent as a plain list of Python floats
    to_upsert = [
        (str(position), vector.tolist(), {text_field: text})
        for position, (vector, text) in enumerate(zip(vectors, texts))
    ]

    # Iterate over what will actually be sent so no empty batch is upserted
    # when there are fewer texts than vectors.
    for start in tqdm(range(0, len(to_upsert), batch_size)):
        index.upsert(vectors=to_upsert[start:start + batch_size])
    return index


In [24]:
# Open a handle to the index created above, then push the embeddings (with
# their source text as metadata) into it
index = pc.Index(INDEX_NAME)
index_upserted = upsert_vectors(index=index, embeddings=embeddings, dataset=dataset)

Upserting the embeddings to the Pinecone index...


100%|██████████| 1/1 [00:01<00:00,  1.14s/it]


Let's view the index statistics!

In [25]:
# Display the index statistics (dimension, fullness, namespaces, vector count).
# NOTE(review): the recorded output reports total_vector_count 0 right after the
# upsert; presumably the stats lag behind writes — re-run this cell to confirm.
index.describe_index_stats()

{'dimension': 384,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}

## Third Element - LLM
We will use [Cohere's chat API](https://cohere.com/chat)

In [35]:
# First, let's write a query for the LLM and send it as-is, without any
# retrieved context, to get a baseline answer.
# (`cohere` is already imported in the imports cell at the top.)
query = "Where was Renault Dauphine included in a list of cars? and which list was it?"

co = cohere.Client(api_key=COHERE_API_KEY)
response = co.chat(
        model='command-r-plus',
        message=query,
    )
response.text

'The Renault Dauphine was included in a list of "The 10 Most Influential Cars of the 20th Century" published by *Popular Mechanics* in 1999. \n\nThe Renault Dauphine, introduced in 1956, was a small, economical car that became popular in Europe and was also one of the first foreign cars to be widely sold in the United States. It represented a shift in automotive design and consumer preferences, as it offered a more compact and affordable alternative to the larger, more luxurious cars that had dominated the market. \n\nHere is an excerpt from the article:"*The 10 Most Influential Cars of the 20th Century*" by *Popular Mechanics*:"6. 1956 Renault Dauphine: The Dauphine was one of the first successful "people\'s cars," offering basic transportation at a low price. It was a huge seller in Europe and one of the first imported cars to make serious inroads in the U.S. market."'

In [27]:
def augment_prompt(
        query: str,
        model: "SentenceTransformer | None" = None,
        index=None,
        top_k: int = 3,
        text_field: str = 'Content',
) -> tuple:
    """
    Augment the prompt with the top results from the knowledge base
    Args:
        query: The query to augment
        model: The model used to embed the query. When None, an
            'all-MiniLM-L6-v2' SentenceTransformer is created on demand.
        index: The vectorstore object (required)
        top_k: The number of matches to retrieve from the knowledge base
        text_field: The metadata field holding each match's text
    Returns:
        tuple: (augmented_prompt, source_knowledge) - the prompt to send to
            the LLM and the retrieved texts joined by blank lines
    Raises:
        ValueError: If no index is provided
    """
    if index is None:
        raise ValueError("An index is required to retrieve the source knowledge")

    # Build the default model lazily. A model constructed in the signature is
    # created once at definition time, even when the caller passes their own.
    if model is None:
        from sentence_transformers import SentenceTransformer
        model = SentenceTransformer('all-MiniLM-L6-v2')

    # The query vector is sent as plain Python floats
    query_vector = [float(val) for val in model.encode(query)]

    # get the top results from the knowledge base; only the metadata is read
    # below, so the stored vector values are not requested
    query_results = index.query(
        vector=query_vector,
        top_k=top_k,
        include_values=False,
        include_metadata=True
    )['matches']
    text_matches = [match['metadata'][text_field] for match in query_results]

    # get the text from the results
    source_knowledge = "\n\n".join(text_matches)

    # feed into an augmented prompt
    augmented_prompt = f"""Using the contexts below, answer the query.
    Contexts:
    {source_knowledge}
    If the answer is not included in the source knowledge - say that you don't know.
    Query: {query}"""
    return augmented_prompt, source_knowledge

In [36]:
# Same question as before, this time wrapped in a prompt that carries the
# contexts retrieved from the vector index
query = "Where was Renault Dauphine included in a list of cars? and which list was it?"
augmented_prompt, source_knowledge = augment_prompt(query=query, model=model, index=index)

response = co.chat(message=augmented_prompt, model='command-r-plus')
response.text

'The Renault Dauphine was included in the following lists:\n- Autoblog\'s "The 20 Dumbest Cars of All Time"\n- Time\'s "50 Worst Cars of All Time"\n- Car Talk\'s "Worst Car of the Millennium" poll, where it placed 9th\n- Edmunds.com\'s list of the worst cars of all time, where it ranked 67th'

In [34]:
# Show the retrieved contexts that were inserted into the augmented prompt
print(source_knowledge)

Nash Metropolitan The Nash Metropolitan was one of the first attempts by a US car maker to produce a small car "which would be mainly used as a second car."[13] Compared to other small cars of the era, it was to be much more luxurious, intended to be "a big car in miniature" by its main developer George Mason.[14] It was also one of the first cars specifically designed for and marketed towards women,[15] being advertised as "a motorized shopping cart for affluent urban gals."[16] It was developed by Nash in co-operation with British Austin Motors, who also produced it at their Longbridge plant, making it the first US-developed car to be entirely produced in a different country. However, despite having large marketing efforts put into the project, it never sold as well as planned and ended up as a commercial failure. The main reasons for this were considered its poor performance, poor handling, poor reliability, and the small market for small economy cars on the US market, with a rising