In [1]:
import fitz  

def extract_text_from_pdf(pdf_path):
    text = ""
    pdf_document = fitz.open(pdf_path)
    for page_num in range(pdf_document.page_count):
        page = pdf_document.load_page(page_num)
        text += page.get_text()
    pdf_document.close()
    return text


DOC_PATH = "harrypotter1.pdf"
text = extract_text_from_pdf(DOC_PATH)
print(text[:500])

1
Harry Potter and the Sorcerer's Stone
CHAPTER ONE
THE BOY WHO LIVED
Mr. and Mrs. Dursley, of number four, Privet Drive, were proud to say
that they were perfectly normal, thank you very much. They were the last
people you'd expect to be involved in anything strange or mysterious,
because they just didn't hold with such nonsense.
Mr. Dursley was the director of a firm called Grunnings, which made
drills. He was a big, beefy man with hardly any neck, although he did
have a very large mustache. M


In [2]:
from langchain_text_splitters import CharacterTextSplitter

text_splitter = CharacterTextSplitter(
    separator="\n",
    chunk_size=500,
    chunk_overlap=200,
    length_function=len,
    is_separator_regex=False
)

In [3]:
chunks = text_splitter.create_documents([text])
print(chunks[0])

page_content="1\nHarry Potter and the Sorcerer's Stone\nCHAPTER ONE\nTHE BOY WHO LIVED\nMr. and Mrs. Dursley, of number four, Privet Drive, were proud to say\nthat they were perfectly normal, thank you very much. They were the last\npeople you'd expect to be involved in anything strange or mysterious,\nbecause they just didn't hold with such nonsense.\nMr. Dursley was the director of a firm called Grunnings, which made\ndrills. He was a big, beefy man with hardly any neck, although he did"


In [4]:
OPENAI_API_KEY ='<OPEN AI KEY>'
CHROMA_PATH = "hp_db" 

In [5]:
chunks

[Document(page_content="1\nHarry Potter and the Sorcerer's Stone\nCHAPTER ONE\nTHE BOY WHO LIVED\nMr. and Mrs. Dursley, of number four, Privet Drive, were proud to say\nthat they were perfectly normal, thank you very much. They were the last\npeople you'd expect to be involved in anything strange or mysterious,\nbecause they just didn't hold with such nonsense.\nMr. Dursley was the director of a firm called Grunnings, which made\ndrills. He was a big, beefy man with hardly any neck, although he did"),
 Document(page_content="because they just didn't hold with such nonsense.\nMr. Dursley was the director of a firm called Grunnings, which made\ndrills. He was a big, beefy man with hardly any neck, although he did\nhave a very large mustache. Mrs. Dursley was thin and blonde and had\nnearly twice the usual amount of neck, which came in very useful as she\nspent so much of her time craning over garden fences, spying on the\nneighbors. The Dursleys had a small son called Dudley and in their

In [6]:
import openai
import pandas as pd
import numpy as np

openai.api_key = "<OPEN AI KEY>"

def get_embedding(text, model="text-embedding-3-small"):
   text = text.replace("\n", " ")
   return openai.Embedding.create(input=[text], model=model).data[0].embedding


In [7]:
df = pd.DataFrame([doc.page_content for doc in chunks], columns=['text_chunk'])

In [12]:
df['ada_embedding'] = df['text_chunk'].apply(lambda x: get_embedding(x, model='text-embedding-3-small'))

In [13]:
[doc.page_content for doc in chunks]

["1\nHarry Potter and the Sorcerer's Stone\nCHAPTER ONE\nTHE BOY WHO LIVED\nMr. and Mrs. Dursley, of number four, Privet Drive, were proud to say\nthat they were perfectly normal, thank you very much. They were the last\npeople you'd expect to be involved in anything strange or mysterious,\nbecause they just didn't hold with such nonsense.\nMr. Dursley was the director of a firm called Grunnings, which made\ndrills. He was a big, beefy man with hardly any neck, although he did",
 "because they just didn't hold with such nonsense.\nMr. Dursley was the director of a firm called Grunnings, which made\ndrills. He was a big, beefy man with hardly any neck, although he did\nhave a very large mustache. Mrs. Dursley was thin and blonde and had\nnearly twice the usual amount of neck, which came in very useful as she\nspent so much of her time craning over garden fences, spying on the\nneighbors. The Dursleys had a small son called Dudley and in their",
 "spent so much of her time craning over g

In [14]:
print(df)
df.to_csv('embeddings_hp1.csv', index=False)

                                             text_chunk  \
0     1\nHarry Potter and the Sorcerer's Stone\nCHAP...   
1     because they just didn't hold with such nonsen...   
2     spent so much of her time craning over garden ...   
3     think they could bear it if anyone found out a...   
4     say if the Potters arrived in the street. The ...   
...                                                 ...   
1429  world. Some of them called:\n"Bye, Harry!"\n"S...   
1430  Ron.\n"Harry Potter!" she squealed. "Look, Mom...   
1431  furious at the nerve of Harry, carrying an owl...   
1432  got all day." He walked away.\nHarry hung back...   
1433  spreading over his face. "They don't know we'r...   

                                          ada_embedding  
0     [0.030016308650374413, 0.06340831518173218, -0...  
1     [0.031364940106868744, 0.06092509254813194, -0...  
2     [0.05524228513240814, 0.015667768195271492, -0...  
3     [0.04070686921477318, -0.0052930801175534725, ...  
4

In [15]:
question = "Who are the Dursleys?"
question_vector = get_embedding(question, model='text-embedding-ada-002')

In [16]:
from sklearn.metrics.pairwise import cosine_similarity
df['cosine_similarity'] = df['ada_embedding'].apply(lambda x: cosine_similarity([question_vector], [x])[0][0])

In [17]:

df = df.sort_values(by='cosine_similarity', ascending=False)

top_50 = df.head(50)

print(top_50)

                                             text_chunk  \
692   flashed furiously, "-- how dare you -- might h...   
840   The Gryffindor common room was very noisy that...   
107   17\n"What about what's-her-name, your friend -...   
702   I'd say."\nI shall speak to Professor Dumbledo...   
1041  busy... excellent..."\nSnape spat bitterly on ...   
1241  about the Sorcerer's tone --"\n214\nWhatever P...   
280   said. "I'm -- er -- not supposed ter do magic,...   
1128  "Professor -- please\n"You can't --"\n195\n"Do...   
706   hurried over.\n"Well done," said George in a l...   
1423  to normal next year, or as normal as it ever w...   
1242  and steal the Stone. I've got to talk to Profe...   
860   the head by a Bludger -- Quaffle taken by the ...   
1392  choked and said, "Alas! Ear wax!"\nMadam Pomfr...   
92    second television and the racing bike. Exactly...   
691   "HARRY POTTER!"\nHis heart sank faster than he...   
1335  referee your next match? He was trying to make... 

In [18]:
top_50_list = str(top_50['text_chunk'].tolist())

In [19]:
print(len(top_50_list)/4)

6112.0


In [20]:
top_50.shape

(50, 3)

In [23]:
from langchain.prompts import ChatPromptTemplate
from langchain.chat_models import ChatOpenAI

PROMPT_TEMPLATE = """
Below is the context of a story, answer the question based on this context:
{context}
Give an answer based on the above context: {question}.
Provide a detailed answer. It is a story so provide relevant answers to it.
Don not justify your answers.
Don not give information not mentioned in the CONTEXT.
Do not say "according to the context" or "mentioned in the context" or similar.
"""

prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)
prompt = prompt_template.format(context=top_50_list, question=question)
messages = [{"role":"user", "content":prompt}]

model = ChatOpenAI(openai_api_key='<OPEN AI KEY>', model = 'gpt-4-turbo', temperature=0.3)
response_text = model.invoke(messages)

In [22]:
print(response_text.content)

The Dursleys are a family that includes Mr. Dursley, who is the director of a firm called Grunnings, which made drills. He is described as a big, beefy man with hardly any neck but a very large mustache. Mrs. Dursley is thin and blonde and has nearly twice the usual amount of neck, which she uses to spy over garden fences on the neighbors. They have a small son named Dudley. The family does not hold with nonsense, as indicated by their disdain for anything out of the ordinary.
