In [None]:
import warnings
warnings.filterwarnings('ignore')
from datasets import load_dataset
from openai import OpenAI
from pinecone import Pinecone, ServerlessSpec
from tqdm.auto import tqdm
from DLAIUtils import Utils

import ast
import os
import pandas as pd

# Fetch the Pinecone API key through the course helper rather than hard-coding it.
utils = Utils()
PINECONE_API_KEY = utils.get_pinecone_api_key()

# Client used for every index-management call below.
pinecone = Pinecone(api_key=PINECONE_API_KEY)

# Reuse the helper created above; a second Utils() instance is not needed.
INDEX_NAME = utils.create_dlai_index_name('dl-ai')

# Start from a clean slate: if an index with this name already exists, drop it
# so that re-running the cell always yields a fresh, empty index.
existing_index_names = [idx.name for idx in pinecone.list_indexes()]
if INDEX_NAME in existing_index_names:
    pinecone.delete_index(INDEX_NAME)

# dimension must equal the length of the vectors upserted and queried later
# (1536 is the output size of text-embedding-ada-002, the model this notebook
# uses for query embeddings).
pinecone.create_index(
    name=INDEX_NAME,
    dimension=1536,
    metric='cosine',
    spec=ServerlessSpec(cloud='aws', region='us-west-2'),
)

# Handle to the newly created index, used for upserts and queries.
index = pinecone.Index(INDEX_NAME)

In [None]:
#downloading and unzipping a dataset.

#wget is a command-line utility for downloading files from the web.
#-q stands for "quiet" and tells wget to download the file without displaying progress or messages.
#-O lesson2-wiki.csv.zip specifies the output filename. It means the downloaded file will be saved as lesson2-wiki.csv.zip.
#The URL following -O is the direct link to the dataset hosted on Dropbox.

#!wget -q -O lesson2-wiki.csv.zip "https://www.dropbox.com/scl/fi/yxzmsrv2sgl249zcspeqb/lesson2-wiki.csv.zip?rlkey=paehnoxjl3s5x53d1bedt4pmc&dl=0"

#!unzip lesson2-wiki.csv.zip


(Note: max_articles_num = 500): Retrieving from a larger set of articles generally gives the large language model (LLM) more comprehensive context. In this lab, max_articles_num is initially set to 500 so that you can see results faster. After an initial run, consider increasing this value to 750 or 1,000; you'll likely notice that the context provided to the LLM becomes richer and better. You can experiment by gradually raising this variable for different queries to observe the improvements in the LLM's contextual understanding.

In [None]:
# Upper bound on the number of CSV rows (articles) loaded into the DataFrame.
max_articles_num = 500
# nrows makes pandas stop reading after max_articles_num rows.
df = pd.read_csv('./data/wiki.csv', nrows=max_articles_num)
# Preview the first rows of the loaded data.
df.head()


In [None]:
#Unlike eval, ast.literal_eval only allows certain simple literal structures: strings, bytes, numbers, tuples, lists, dicts, sets, booleans, and None.
# import ast

# # String representation of a list
# list_str = "[1, 2, 3, 4, 5]"

# # Using ast.literal_eval to convert it back to a list
# converted_list = ast.literal_eval(list_str)

# print(type(converted_list))  # This will show <class 'list'>
# print(converted_list)        # This will print [1, 2, 3, 4, 5]


In [None]:
# Prepare the Embeddings and Upsert to Pinecone

# Number of records sent to Pinecone per upsert call.
BATCH_SIZE = 250

# Buffer of parsed records waiting to be upserted.
prepped = []

# tqdm draws a progress bar; total=df.shape[0] tells it how many rows to expect.
for i, row in tqdm(df.iterrows(), total=df.shape[0]):
    # The 'metadata' and 'values' columns hold Python literals stored as text;
    # ast.literal_eval parses them back into Python objects without running code.
    meta = ast.literal_eval(row['metadata'])
    prepped.append({'id': row['id'],
                    'values': ast.literal_eval(row['values']),
                    'metadata': meta})

    # Send a full batch and start a new one.
    if len(prepped) >= BATCH_SIZE:
        index.upsert(prepped)
        prepped = []

# Flush the final partial batch. Without this, any rows left over when the row
# count is not an exact multiple of BATCH_SIZE would never reach the index.
if prepped:
    index.upsert(prepped)
    prepped = []


In [None]:
# Display the statistics Pinecone reports for the index, as a quick check after the upserts.
index.describe_index_stats()

In [None]:
# Connect to OpenAI: the API key is obtained from the `utils` helper (not hard-coded),
# and `openai_client` is the client object used for the OpenAI calls in this notebook.
OPENAI_API_KEY = utils.get_openai_api_key()
openai_client = OpenAI(api_key=OPENAI_API_KEY)

def get_embeddings(articles, model="text-embedding-ada-002"):
    """Request embeddings for `articles` from the OpenAI embeddings endpoint.

    Args:
        articles: Passed through unchanged as the `input` argument of the API call.
        model: Name of the embedding model to use.

    Returns:
        The response object from `openai_client.embeddings.create`, unmodified.
    """
    embeddings_api = openai_client.embeddings
    response = embeddings_api.create(model=model, input=articles)
    return response

In [None]:
# Run Your Query
query = "what is the berlin wall?"

# Ask OpenAI for the embedding of the query text.
# The response's `data` list holds one item per input string; a single query
# was sent, so its result is at position 0, and that item's `.embedding`
# attribute is the numeric vector for the query.
embed = get_embeddings([query])
query_vector = embed.data[0].embedding

# Search the Pinecone index with that vector. include_metadata=True makes each
# match carry its metadata (which includes the actual text).
res = index.query(vector=query_vector, top_k=3, include_metadata=True)

# Collect the text stored in the metadata of each match, then print the
# passages separated by newlines.
text = []
for hit in res['matches']:
    text.append(hit['metadata']['text'])
print('\n'.join(text))


In [None]:
# Build the Prompt

# The request to answer, and its embedding.
query = "write an article titled: what is the berlin wall?"
embed = get_embeddings([query])

# Fetch the 3 closest entries from the Pinecone index, metadata included.
res = index.query(vector=embed.data[0].embedding, top_k=3, include_metadata=True)

# Text of each match; these passages become the context section of the prompt.
contexts = [hit['metadata']['text'] for hit in res['matches']]

# Fixed text placed before the context passages.
prompt_start = (
    "Answer the question based on the context below.\n\n"
    "Context:\n"
)

# Text placed after the context passages: the question and the answer cue.
prompt_end = f"\n\nQuestion: {query}\nAnswer:"

# Full prompt = header + passages (separated by '---' rules) + question.
context_block = "\n\n---\n\n".join(contexts)
prompt = "".join([prompt_start, context_block, prompt_end])

print(prompt)

In [1]:
# Generate a response to the prompt with OpenAI's GPT-3.5 instruct model.
res = openai_client.completions.create(
    prompt=prompt,
    model="gpt-3.5-turbo-instruct",
    max_tokens=636,
    temperature=0,
    top_p=1,
    # Penalties meant to discourage repetition; both are 0 here.
    presence_penalty=0,
    frequency_penalty=0,
    # No stop sequence is specified.
    stop=None,
)

# Separator line, followed by the generated text of the first returned choice.
print('-' * 80)
first_choice = res.choices[0]
print(first_choice.text)

### After running this code, you should see a generated text block that answers the query based on the supplied contexts.