In [1]:
#Suppress Python warnings to keep the notebook output clean.
#This is common in Jupyter notebooks to avoid cluttering the output with non-critical warning messages.
import warnings
warnings.filterwarnings('ignore')


In [2]:
#Importing Libraries for Dataset, Sentence Embeddings, and Vector Database
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
from pinecone import Pinecone, ServerlessSpec


In [None]:
#Util functions
from DLAIUtils import Utils
import DLAIUtils


In [None]:
#os and time are standard Python libraries for interacting with the operating system and handling time-related functions.
#torch is used later to check whether a CUDA GPU is available.
#tqdm is for displaying progress bars, useful for long-running operations.
import os
import time
import torch
from tqdm.auto import tqdm


In [None]:
#The Quora dataset typically contains question pairs with a label indicating if they are duplicates.
#Only rows 240000-289999 of the train split are loaded.
dataset = load_dataset('quora', split='train[240000:290000]')

#Displaying the First Five Records
#Only the last expression of a cell is rendered automatically, so this mid-cell preview is printed explicitly.
print(dataset[:5])

#Iterate through the dataset, flattening each record's list of question texts into a single list.
questions = [text for record in dataset['questions'] for text in record['text']]

#Remove duplicate questions while keeping first-seen order.
#dict.fromkeys is used instead of set(): set iteration order for strings changes between kernel
#restarts, which would change which questions are kept and the IDs they get when the list is
#later truncated and indexed.
question = list(dict.fromkeys(questions))


In [None]:
#Displaying Sample Questions
#Take the first ten entries of the flattened question list and print them one per line.
sample_questions = questions[:10]
print('\n'.join(sample_questions))


In [None]:
#Print a 50-character dashed separator, then the length of the `questions` list.
print('-' * 50, f'Number of questions: {len(questions)}', sep='\n')


In [None]:
#Pick the device for the embedding model: use the GPU when CUDA is available, otherwise
#fall back to the CPU and tell the user.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
    print('Sorry no cuda.')


In [None]:
#all-MiniLM-L6-v2 sentence-transformers model maps sentences to a 384 dimensional dense vector space.
#The model is placed on the device selected earlier ('cuda' or 'cpu').
model = SentenceTransformer(
    model_name_or_path='all-MiniLM-L6-v2',
    device=device,
)


In [None]:
#Encoding a Query
#Embed a single example query string with the sentence-transformers model.
query = 'which city is the most populated in the world?'
xq = model.encode(query)

#Display the shape of the resulting vector; as the last expression of the cell it is rendered as output.
#For a single string this is presumably (embedding_dim,) -- confirm against the rendered output.
xq.shape


In [None]:

#Utils is the helper class imported from DLAIUtils.
#get_pinecone_api_key() presumably reads the key from the environment or a .env file -- TODO confirm in DLAIUtils.
utils = Utils()
PINECONE_API_KEY = utils.get_pinecone_api_key()

In [None]:

#Create the Pinecone client, authenticated with the API key.
pinecone = Pinecone(api_key=PINECONE_API_KEY)

#Creating and Managing the Index:

#Build the index name from the 'dl-ai' base string via the DLAI helper
#(presumably made unique by the helper -- confirm in DLAIUtils).
INDEX_NAME = utils.create_dlai_index_name('dl-ai')

#Start from a fresh index: if one with this name already exists, delete it first.
existing_index_names = [existing.name for existing in pinecone.list_indexes()]
if INDEX_NAME in existing_index_names:
    pinecone.delete_index(INDEX_NAME)
print(INDEX_NAME)

#Create the index: the vector dimension is taken from the embedding model, similarity is cosine,
#and the spec selects a serverless deployment on AWS in us-west-2.
serverless_spec = ServerlessSpec(cloud='aws', region='us-west-2')
pinecone.create_index(
    name=INDEX_NAME,
    dimension=model.get_sentence_embedding_dimension(),
    metric='cosine',
    spec=serverless_spec,
)

#Get a handle to the newly created index and print it for confirmation.
index = pinecone.Index(INDEX_NAME)
print(index)

In [None]:
#Create Embeddings and Upsert to Pinecone

#Number of questions embedded and upserted per loop iteration.
batch_size=200

#Sets a vector_limit to limit the number of questions processed.
vector_limit=10000

#Truncates the list of questions to this vector_limit.
#NOTE: `question` (singular) is the deduplicated list built in the data-loading cell; `questions`
#is rebound here from the raw flattened list to the first vector_limit deduplicated entries.
questions = question[:vector_limit]

In [None]:
#Embed the questions batch by batch and upsert each batch into Pinecone (tqdm shows a progress bar).
for batch_start in tqdm(range(0, len(questions), batch_size)):
    #Slice out the current batch once; slicing clips at the end of the list, so the last batch may be shorter.
    batch_questions = questions[batch_start:batch_start + batch_size]
    batch_end = batch_start + len(batch_questions)
    #IDs are the string form of each question's position in `questions`.
    ids = list(map(str, range(batch_start, batch_end)))
    #Metadata for each question (here, just the text of the question).
    metadatas = [{'text': text} for text in batch_questions]
    #Encode the batch of questions to create the embeddings (xc).
    xc = model.encode(batch_questions)
    #Pair each ID with its embedding and metadata to form the records.
    records = zip(ids, xc, metadatas)
    #upsert is a combination of insert and update: it adds new vectors and updates existing ones if the ID already exists.
    index.upsert(vectors=records)


In [None]:
#Retrieves summary statistics for the Pinecone index (e.g. how many vectors it currently stores).
#As the last expression in the cell, the returned object is rendered as the cell output.
index.describe_index_stats()


In [None]:
#Defining a Helper Function for Queries
def run_query(query, top_k=10):
    """Embed `query`, search the Pinecone index, and print the closest matches.

    Relies on the notebook-level `model` (used to encode the query) and `index`
    (the Pinecone index handle) being defined before the function is called.

    Args:
        query: Text to search for.
        top_k: Number of closest matches to request. Defaults to 10, the value
            that was previously hard-coded, so existing calls behave the same.

    Returns:
        None. One line per match is printed in the form "<score>: <question text>",
        with the score rounded to two decimal places.
    """
    #encodes the query into an embedding and converts it to a plain Python list for the query call
    embedding = model.encode(query).tolist()
    #include_metadata=True ensures the stored question text is returned with each match;
    #include_values=False leaves the raw vectors out of the response.
    results = index.query(top_k=top_k, vector=embedding, include_metadata=True, include_values=False)
    #prints each result: the similarity score and the question text from the metadata
    for result in results['matches']:
        print(f"{round(result['score'], 2)}: {result['metadata']['text']}")


In [None]:
#Running a Sample Query
#Rebinds the notebook-level `query` and prints the closest stored questions via run_query.
query = 'how do i make chocolate cake?'
run_query(query)
