# Ingesting PDF

In [49]:
# Didn't work with a conda environment or with Python version 12 (presumably 3.12).
# Report the interpreter the kernel itself is running: `!python` is resolved through the
# shell's PATH and can point at a different installation than the notebook kernel.
import sys
print(f"Python {sys.version.split()[0]}")

Python 3.11.7


In [50]:
!pip install --q unstructured[all-docs] langchain langchain-community
# Chromadb is a vector database, alternatives could be used
!pip install --q chromadb
!pip install --q langchain-text-splitters

In [51]:
import os
from langchain_community.document_loaders import UnstructuredPDFLoader
from langchain_community.embeddings import OllamaEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain.prompts import ChatPromptTemplate, PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_community.chat_models import ChatOllama
from langchain_core.runnables import RunnablePassthrough
from langchain.retrievers.multi_query import MultiQueryRetriever
from tqdm import tqdm

# Single PDF

In [36]:
# Path to the one book used by the single-PDF example in the next cell.
# NOTE: the directory-loading loop further down rebinds `file_path` on every iteration.
file_path = "books/J.K. Rowling - HP 1 - Harry Potter and the Sorcerer's Stone.pdf"

In [None]:
# Single-PDF variant: only needed when working with one PDF instead of the whole directory.
# Alternative: UnstructuredPDFLoader(file_path, mode="elements") - presumably keeps the individual
# elements that unstructured creates rather than merging them; confirm against the loader docs.
# NOTE: the directory-loading loop further down reassigns both `loader` and `data`.
loader = UnstructuredPDFLoader(file_path)
data = loader.load()

# Directory of PDFs

In [52]:
# Directory containing the Harry Potter books (a relative path, resolved against the working directory)
directory = "books/"

# List to store all documents; the loading loop in the next cell extends it with each PDF's documents
all_data = []

In [53]:
# Load each PDF file in the directory.
# Start from an empty list so that re-running this cell replaces the previous
# result instead of appending every book a second time.
all_data = []

# os.listdir returns entries in arbitrary order; sorting makes the load order
# (and therefore the chunk order downstream) deterministic.
for filename in sorted(os.listdir(directory)):
    # Case-insensitive match so files named "*.PDF" are not silently skipped
    if filename.lower().endswith(".pdf"):
        file_path = os.path.join(directory, filename)
        print(f"Loading {filename}...")
        loader = UnstructuredPDFLoader(file_path)
        data = loader.load()
        all_data.extend(data)

print(f"Loaded {len(all_data)} documents in total.")

Loading J.K. Rowling - HP 1 - Harry Potter and the Sorcerer's Stone.pdf...
Loading J.K. Rowling - HP 2 - Harry Potter and the Chamber of Secrets.pdf...
Loading J.K. Rowling - HP 3 - Harry Potter and the Prisoner of Azkaban.pdf...
Loading J.K. Rowling - HP 4 - Harry Potter and the Goblet of Fire.pdf...
Loading J.K. Rowling - HP 5 - Harry Potter and the Order of the Phoenix.pdf...
Loading J.K. Rowling - HP 6 - Harry Potter and the Half-Blood Prince.pdf...
Loading J.K. Rowling - HP 7 - Harry Potter and the Deathly Hallows.pdf...
Loaded 7 documents in total.


# Vector Embeddings

In [6]:
# Download the embedding model that OllamaEmbeddings(model="nomic-embed-text") uses further down.
# Shell command: requires the `ollama` CLI to be available on PATH.
!ollama pull nomic-embed-text

pulling manifest 
pulling 970aa74c0a90... 100% ▕████████████████▏ 274 MB
pulling c71d239df917... 100% ▕████████████████▏  11 KB
pulling ce4a164fc046... 100% ▕████████████████▏   17 B
pulling 31df23ea7daa... 100% ▕████████████████▏  420 B
verifying sha256 digest 
writing manifest 
removing any unused layers 
success 


In [7]:
# List the models available locally, to confirm the embedding model and the chat model are present
!ollama list

NAME                   	ID          	SIZE  	MODIFIED     
nomic-embed-text:latest	0a109f422b47	274 MB	14 hours ago	
phi3:latest            	64c1188f2485	2.4 GB	11 days ago 	
llama3:latest          	365c0bd3c000	4.7 GB	11 days ago 	


In [54]:
# Break the loaded documents into overlapping character-based chunks for embedding
CHUNK_SIZE = 7500     # maximum characters per chunk
CHUNK_OVERLAP = 200   # characters shared between neighbouring chunks

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)
chunks = text_splitter.split_documents(all_data)

In [55]:
# Embed every chunk with the local Ollama embedding model and store the vectors in Chroma
embedding_model = OllamaEmbeddings(model="nomic-embed-text", show_progress=True)

vector_db = Chroma.from_documents(
    documents=chunks,
    embedding=embedding_model,
    collection_name="harry-potter-rag",
)

OllamaEmbeddings: 100%|██████████| 900/900 [49:36<00:00,  3.31s/it]


# Retrieval

In [61]:
# LLM from Ollama: chat model used both for query rewriting and for answering.
# "llama3" is one of the models shown in the `ollama list` output above.
local_model = "llama3"
llm = ChatOllama(model=local_model)

In [62]:
# Prompt used by the multi-query retriever: the LLM rewrites the user's question into
# five variants (one per line) so that retrieval is less sensitive to exact wording.
# The text is sent to the model verbatim, so spelling and punctuation matter here.
QUERY_PROMPT = PromptTemplate(
    input_variables=["question"],
    template="""You are an AI language model assistant. Your task is to generate five
    different versions of the given user question to retrieve relevant documents from
    a vector database. By generating multiple perspectives on the user question, your
    goal is to help the user overcome some of the limitations of the distance-based
    similarity search. Provide these alternative questions separated by newlines.
    Original question: {question}""",
)

In [63]:
# Plain similarity retriever over the vector store: top 5 chunks per query
base_retriever = vector_db.as_retriever(search_kwargs={"k": 5})

# Wrap it so the LLM first expands the question into several variants (see QUERY_PROMPT)
retriever = MultiQueryRetriever.from_llm(
    retriever=base_retriever,
    llm=llm,
    prompt=QUERY_PROMPT,
)

# Enhanced RAG prompt: {context} receives the retrieved material, {question} the user's question
template = """Answer the question based on the following context from the Harry Potter book series. 
If the answer isn't explicitly stated in the context, say so, but try to provide the most relevant information available.
If the question is vague, consider multiple interpretations across all the books.

Context: {context}

Question: {question}

Answer:"""

prompt = ChatPromptTemplate.from_template(template)

In [64]:
def format_docs(docs):
    """Join the text of the retrieved documents into a single context string.

    Args:
        docs: iterable of retrieved documents, each exposing a `page_content` string.

    Returns:
        The documents' text separated by blank lines.

    Without this step the retriever's list of Document objects is interpolated into
    the prompt as its Python repr (metadata, quoting and escaped newlines included)
    rather than as readable text.
    """
    return "\n\n".join(doc.page_content for doc in docs)


# RAG pipeline: retrieve + format context, pass the question through unchanged,
# fill the prompt, call the LLM, and return the answer as a plain string.
chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

In [68]:
# The question is written out literally so the notebook can run unattended
# (Restart & Run All); input("") would block until someone types into the prompt.
# This is the question that was typed when the recorded output was produced.
chain.invoke("Does Harry beat Voldemort?")

 Does Harry beat Voldemort?


OllamaEmbeddings: 100%|██████████| 1/1 [00:04<00:00,  4.96s/it]
OllamaEmbeddings: 100%|██████████| 1/1 [00:02<00:00,  2.10s/it]
OllamaEmbeddings: 100%|██████████| 1/1 [00:02<00:00,  2.12s/it]
OllamaEmbeddings: 100%|██████████| 1/1 [00:02<00:00,  2.17s/it]
OllamaEmbeddings: 100%|██████████| 1/1 [00:02<00:00,  2.20s/it]
OllamaEmbeddings: 100%|██████████| 1/1 [00:02<00:00,  2.17s/it]
OllamaEmbeddings: 100%|██████████| 1/1 [00:02<00:00,  2.17s/it]
OllamaEmbeddings: 100%|██████████| 1/1 [00:02<00:00,  2.18s/it]
OllamaEmbeddings: 100%|██████████| 1/1 [00:02<00:00,  2.18s/it]
OllamaEmbeddings: 100%|██████████| 1/1 [00:02<00:00,  2.18s/it]
OllamaEmbeddings: 100%|██████████| 1/1 [00:02<00:00,  2.18s/it]


'According to the text, yes, Harry beats Voldemort. The exact mechanism of how this happens is described in the passage:\n\n"Your courage won, your wand overpowered his. And in doing so, something happened between those wands, something that echoed the relationship between their masters."\n\nIt seems that the shared core of the two wands reacted to each other in a way that was unique and powerful, allowing Harry\'s wand to overpower Voldemort\'s.'

In [69]:
# Opinion-style question: checks how the chain answers when the books hold no single factual answer
chain.invoke("In your opinion, who is the best character?")

OllamaEmbeddings: 100%|██████████| 1/1 [00:06<00:00,  6.73s/it]
OllamaEmbeddings: 100%|██████████| 1/1 [00:02<00:00,  2.10s/it]
OllamaEmbeddings: 100%|██████████| 1/1 [00:02<00:00,  2.17s/it]
OllamaEmbeddings: 100%|██████████| 1/1 [00:02<00:00,  2.17s/it]
OllamaEmbeddings: 100%|██████████| 1/1 [00:02<00:00,  2.21s/it]
OllamaEmbeddings: 100%|██████████| 1/1 [00:02<00:00,  2.17s/it]
OllamaEmbeddings: 100%|██████████| 1/1 [00:02<00:00,  2.19s/it]
OllamaEmbeddings: 100%|██████████| 1/1 [00:02<00:00,  2.16s/it]
OllamaEmbeddings: 100%|██████████| 1/1 [00:02<00:00,  2.19s/it]
OllamaEmbeddings: 100%|██████████| 1/1 [00:02<00:00,  2.16s/it]
OllamaEmbeddings: 100%|██████████| 1/1 [00:02<00:00,  2.18s/it]


"Based on the context from the Harry Potter book series, I think Dumbledore is an excellent character. He is wise, compassionate, and merciful, even in the face of danger. He has a remarkable ability to understand and connect with others, as seen in his conversation with Malfoy. His calm demeanor and sense of humor, even when facing death, are truly admirable.\n\nIn this specific scene, Dumbledore shows great mercy towards Malfoy, offering him a chance to change sides and escape the clutches of Voldemort. He also displays remarkable bravery and composure in the face of the Death Eaters' attack, using his wit and charm to try to reason with them. Overall, I believe Dumbledore is one of the most compelling and admirable characters in the series!"

In [70]:
# Presumably a deliberate out-of-scope question: the RAG prompt tells the model to say so
# when the answer is not stated in the retrieved context.
chain.invoke("At what point does Luke die?")

OllamaEmbeddings: 100%|██████████| 1/1 [00:05<00:00,  5.78s/it]
OllamaEmbeddings: 100%|██████████| 1/1 [00:02<00:00,  2.07s/it]
OllamaEmbeddings: 100%|██████████| 1/1 [00:02<00:00,  2.08s/it]
OllamaEmbeddings: 100%|██████████| 1/1 [00:02<00:00,  2.17s/it]
OllamaEmbeddings: 100%|██████████| 1/1 [00:02<00:00,  2.16s/it]
OllamaEmbeddings: 100%|██████████| 1/1 [00:02<00:00,  2.17s/it]
OllamaEmbeddings: 100%|██████████| 1/1 [00:02<00:00,  2.19s/it]
OllamaEmbeddings: 100%|██████████| 1/1 [00:02<00:00,  2.18s/it]
OllamaEmbeddings: 100%|██████████| 1/1 [00:02<00:00,  2.18s/it]
OllamaEmbeddings: 100%|██████████| 1/1 [00:02<00:00,  2.15s/it]
OllamaEmbeddings: 100%|██████████| 1/1 [00:02<00:00,  2.19s/it]


'Luke does not appear in this context from the Harry Potter book series. The passage appears to be describing the thoughts and actions of Harry Potter, his parents, Sirius Black, James Potter, Remus Lupin, and Lily Potter as they approach their deaths at the hands of Lord Voldemort. There is no mention of Luke or any other character with that name dying in this context.'

In [48]:
## Delete the collection backing `vector_db`
# NOTE(review): the call is made on this one vector store, so it presumably removes only its own
# collection ("harry-potter-rag") rather than every collection in the database - confirm.
# After this runs, the vector store has to be rebuilt before the chain can retrieve anything.
vector_db.delete_collection()