In [1]:
# Silence every Python warning for the rest of this notebook session.
import warnings

warnings.filterwarnings(action="ignore")

In [19]:
from datasets import load_dataset
from openai import OpenAI
from pinecone import Pinecone, ServerlessSpec
from tqdm.auto import tqdm
from dotenv import load_dotenv
import os
import pandas as pd
import ast

# Load environment variables via python-dotenv (presumably from a local .env file -- confirm);
# the API keys are read from os.environ in a later cell.
load_dotenv()

True

In [3]:
# Read the service credentials from the environment; a missing variable raises KeyError.
PINECONE_API_KEY = os.environ["PINECONE_API_KEY"]  # Pinecone (vector database)
OPENAI_API_KEY_VAL = os.environ["OPENAI_API_KEY"]  # OpenAI (embeddings and text generation)

In [4]:
# Connect to Pinecone and start from a fresh, empty index.
pc = Pinecone(api_key=PINECONE_API_KEY)
INDEX_NAME = "ragindex1"

# Drop any existing index with this name so that re-running the cell rebuilds it.
existing_index_names = [existing.name for existing in pc.list_indexes()]
if INDEX_NAME in existing_index_names:
    pc.delete_index(INDEX_NAME)

# Create the index as a serverless (cloud-hosted) index using cosine similarity.
pc.create_index(
    name=INDEX_NAME,
    dimension=1536,
    metric='cosine',
    spec=ServerlessSpec(cloud='aws', region='us-west-2'),
)

index = pc.Index(INDEX_NAME)  # handle used by later cells for upserts and queries

In [8]:
# Download the dataset archive with wget (wget must be installed before running this command).
!wget -q -O lesson2-wiki.csv.zip "https://www.dropbox.com/scl/fi/yxzmsrv2sgl249zcspeqb/lesson2-wiki.csv.zip?rlkey=paehnoxjl3s5x53d1bedt4pmc&dl=0"

In [9]:
!unzip lesson2-wiki.csv.zip # Extract wiki.csv from the downloaded archive

Archive:  lesson2-wiki.csv.zip
  inflating: wiki.csv                


In [11]:
# Load only the first rows of the pre-embedded dataset to keep the demo small.
# Note: nrows caps CSV rows, and each row looks like one chunk (ids such as 1-0, 1-1),
# so this limit counts chunks rather than whole articles.
max_articles_num = 500
df = pd.read_csv(filepath_or_buffer='./wiki.csv', nrows=max_articles_num)
df.head(5)


Unnamed: 0,id,metadata,values
1,1-0,"{'chunk': 0, 'source': 'https://simple.wikiped...","[-0.011254455894231796, -0.01698738895356655, ..."
2,1-1,"{'chunk': 1, 'source': 'https://simple.wikiped...","[-0.0015197008615359664, -0.007858820259571075..."
3,1-2,"{'chunk': 2, 'source': 'https://simple.wikiped...","[-0.009930099360644817, -0.012211072258651257,..."
4,1-3,"{'chunk': 3, 'source': 'https://simple.wikiped...","[-0.011600767262279987, -0.012608098797500134,..."
5,1-4,"{'chunk': 4, 'source': 'https://simple.wikiped...","[-0.026462381705641747, -0.016362832859158516,..."


# Prepare the Embeddings and Upsert to Pinecone

In [12]:
# Upsert the pre-computed embeddings into Pinecone in batches of BATCH_SIZE rows
# (250 is an arbitrary pick; roughly 100-500 is the suggested range for good efficiency).
BATCH_SIZE = 250
prepped = []

for i, row in tqdm(df.iterrows(), total=df.shape[0]):
    # 'metadata' and 'values' are stored in the CSV as Python-literal strings
    # (a dict and a list of floats), so parse them back into objects.
    meta = ast.literal_eval(row['metadata'])
    prepped.append({'id': row['id'],
                    'values': ast.literal_eval(row['values']),
                    'metadata': meta})
    if len(prepped) >= BATCH_SIZE:
        index.upsert(prepped)
        prepped = []

# Flush the final partial batch: without this, rows left over when the row count
# is not a multiple of BATCH_SIZE would never reach the index.
if prepped:
    index.upsert(prepped)
    prepped = []


100%|██████████| 500/500 [00:07<00:00, 64.13it/s] 


In [13]:
index.describe_index_stats() # Show index stats (dimension, fullness, per-namespace and total vector counts)

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 500}},
 'total_vector_count': 500}

# OpenAI connection setup

In [20]:

# Create the OpenAI client used by the embedding and completion calls in this notebook.
openai_client = OpenAI(api_key=OPENAI_API_KEY_VAL)

def get_embeddings(articles, model="text-embedding-ada-002"):
    """Request embeddings for ``articles`` from the OpenAI embeddings endpoint.

    Args:
        articles: Passed unchanged as ``input`` to ``embeddings.create``
            (callers in this notebook pass a list of strings).
        model: Name of the embedding model; defaults to
            "text-embedding-ada-002".

    Returns:
        Whatever ``openai_client.embeddings.create`` returns; callers in this
        notebook read the vectors from ``response.data[i].embedding``.
    """
    response = openai_client.embeddings.create(model=model, input=articles)
    return response

In [24]:
# Smoke test: embed a question and print the text of the three closest matches in the index.
query = "what is the dragon ball?"

embed = get_embeddings([query])
query_vector = embed.data[0].embedding
res = index.query(vector=query_vector, top_k=3, include_metadata=True)

text = []
for match in res['matches']:
    text.append(match['metadata']['text'])
print('\n'.join(text))


Festivals 
Spring Festival is the Chinese New Year.

Dragon Boat Festival is celebrated to commemorate the death of Qu Yuan, a patriotic poet of the State of Chu during the Warring States period. He persuaded his emperor not to accept Qin's diplomats' offers several times but his emperor did not listen to him. He was very sad and ended up jumping into the river to end his life. The people loved him so much that they did not want the fish to eat his corpse. They made and threw rice dumplings into the river. They hope the fish eat these dumplings instead of the poet's corpse. They also rowed dragon boats in the river to get rid of the fish. Such practices, eating rice dumplings and holding dragon boat races, become what Chinese do in this festival nowadays.

Held on the fifteenth day of the eighth lunar month, Mid-Autumn Festival is a festival for families. Now when the festival sets in, people would sit together to eat moon cakes, appreciate the bright full moon cakes, appreciate the br

# PROMPT DESIGN FOR OPENAI CALL

In [26]:
# Retrieve the three closest chunks for the request and wrap them in a
# "context + question" prompt for the completion model.
query = "write an article titled: what is the DRAGON BALL?"
embed = get_embeddings([query])
query_vector = embed.data[0].embedding
res = index.query(vector=query_vector, top_k=3, include_metadata=True)

# Text of each retrieved match, in the order the index returned them.
contexts = []
for match in res['matches']:
    contexts.append(match['metadata']['text'])

prompt_start = (
    "Answer the question based on the context below.\n\n"
    "Context:\n"
)
prompt_end = f"\n\nQuestion: {query}\nAnswer:"

# Contexts are separated by a horizontal-rule style delimiter.
context_block = "\n\n---\n\n".join(contexts)
prompt = "".join([prompt_start, context_block, prompt_end])

print(prompt)

Answer the question based on the context below.

Context:
Everything2 or E2 is a website.  It lets people make pages about many different things, and some people use it as a diary.

E2 users create pages called nodes and add stuff in writeups.  Only logged-in users can create writeups.  Only the person who created the writeup or someone who the website owners (called "gods") choose can edit the writeup.  On the other hand, on Wikipedia, anyone can edit pages, but on Everything2 only those who can edit the writeup can edit pages.

Everything2 does not require a neutral point of view like Wikipedia does. So, it is possible to have more than one article (writeups) under the same title (node), each by different authors, and presenting different points of view.

Other websites 
 Everything2 website
 Everything2.com article about Wikipedia

Websites

---

Festivals 
Spring Festival is the Chinese New Year.

Dragon Boat Festival is celebrated to commemorate the death of Qu Yuan, a patriotic p

# TEXT-SUMMARIZATION USING OPENAI

In [27]:
# Send the assembled prompt to the completions endpoint and print the generated text
# under an 80-character separator line.
completion_params = dict(
    model="gpt-3.5-turbo-instruct",
    prompt=prompt,
    temperature=0,
    max_tokens=636,
    top_p=1,
    frequency_penalty=0,
    presence_penalty=0,
    stop=None,
)
res = openai_client.completions.create(**completion_params)

separator = '-' * 80
print(separator)
print(res.choices[0].text)

--------------------------------------------------------------------------------


Dragon Ball is a popular Japanese manga series created by Akira Toriyama. It was first published in 1984 and has since become a worldwide phenomenon, with millions of fans across the globe. The series follows the adventures of a young boy named Goku, who possesses incredible strength and abilities, as he goes on a journey to find the seven mystical Dragon Balls.

The Dragon Balls are powerful orbs that, when gathered together, can summon a wish-granting dragon named Shenron. Throughout the series, Goku and his friends must battle powerful enemies and overcome challenges in order to collect all seven Dragon Balls and make their wishes come true.

The series is known for its action-packed fight scenes, colorful characters, and unique blend of humor and drama. It has spawned numerous spin-offs, including anime series, movies, video games, and merchandise. Dragon Ball has also been adapted into multiple lang