# Ingesting PDF

In [49]:
# Didn't work with conda environment or python version 12
!python --version

Python 3.11.7


In [50]:
!pip install --q unstructured[all-docs] langchain langchain-community
# Chromadb is a vector database, alternatives could be used
!pip install --q chromadb
!pip install --q langchain-text-splitters

In [1]:
import os
from langchain_community.document_loaders import UnstructuredPDFLoader
from langchain_community.embeddings import OllamaEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain.prompts import ChatPromptTemplate, PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_community.chat_models import ChatOllama
from langchain_core.runnables import RunnablePassthrough
from langchain.retrievers.multi_query import MultiQueryRetriever
from tqdm import tqdm

# Single PDF

In [36]:
file_path = "books/J.K. Rowling - HP 1 - Harry Potter and the Sorcerer's Stone.pdf"

In [8]:
# This would be used if we were using a singular PDF
# loader = UnstructuredPDFLoader(file_path, mode="elements") - Elements is what unstructured creates and state to keep
loader = UnstructuredPDFLoader(file_path)
all_data = loader.load()

# Directory of PDFs

In [2]:
# Directory containing the Harry Potter books
directory = "books/"

# List to store all documents
all_data = []

In [3]:
# Load each PDF file in the directory
for filename in os.listdir(directory):
    if filename.endswith(".pdf"):
        file_path = os.path.join(directory, filename)
        print(f"Loading {filename}...")
        loader = UnstructuredPDFLoader(file_path)
        data = loader.load()
        all_data.extend(data)

print(f"Loaded {len(all_data)} documents in total.")

Loading J.K. Rowling - HP 1 - Harry Potter and the Sorcerer's Stone.pdf...


  from .autonotebook import tqdm as notebook_tqdm


Loading J.K. Rowling - HP 2 - Harry Potter and the Chamber of Secrets.pdf...
Loading J.K. Rowling - HP 3 - Harry Potter and the Prisoner of Azkaban.pdf...
Loading J.K. Rowling - HP 4 - Harry Potter and the Goblet of Fire.pdf...
Loading J.K. Rowling - HP 5 - Harry Potter and the Order of the Phoenix.pdf...
Loading J.K. Rowling - HP 6 - Harry Potter and the Half-Blood Prince.pdf...
Loading J.K. Rowling - HP 7 - Harry Potter and the Deathly Hallows.pdf...
Loaded 7 documents in total.


# Vector Embeddings

In [6]:
!ollama pull nomic-embed-text

[?25lpulling manifest â ‹ [?25h[?25l[2K[1Gpulling manifest â ™ [?25h[?25l[2K[1Gpulling manifest â ¹ [?25h[?25l[2K[1Gpulling manifest â ¼ [?25h[?25l[2K[1Gpulling manifest â ´ [?25h[?25l[2K[1Gpulling manifest â ¦ [?25h[?25l[2K[1Gpulling manifest 
pulling 970aa74c0a90... 100% â–•â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–� 274 MB                         
pulling c71d239df917... 100% â–•â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–�  11 KB                         
pulling ce4a164fc046... 100% â–•â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–�   17 B                         
pulling 31df23ea7daa... 100% â–•â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–�  420 B                         
verifying sha256 digest 
writing manifest 
removing any unused layers 
success [?25h


In [4]:
!ollama list

NAME                    	ID          	SIZE  	MODIFIED       
mxbai-embed-large:latest	468836162de7	669 MB	19 minutes ago	
nomic-embed-text:latest 	0a109f422b47	274 MB	2 days ago    	
phi3:latest             	64c1188f2485	2.4 GB	2 weeks ago   	
llama3:latest           	365c0bd3c000	4.7 GB	2 weeks ago   	


In [9]:
# Split and chunk
text_splitter = RecursiveCharacterTextSplitter(chunk_size=5000, chunk_overlap=200)
chunks = text_splitter.split_documents(all_data)

In [10]:
# Add to vector database
vector_db = Chroma.from_documents(
    documents=chunks,
    embedding=OllamaEmbeddings(model="mxbai-embed-large", show_progress=True),
    collection_name="harry-potter-rag"
)


OllamaEmbeddings:   0%|          | 0/249 [00:00<?, ?it/s][A
OllamaEmbeddings:   0%|          | 1/249 [00:02<10:50,  2.62s/it][A
OllamaEmbeddings:   1%|          | 2/249 [00:05<10:21,  2.51s/it][A
OllamaEmbeddings:   1%|          | 3/249 [00:07<10:23,  2.54s/it][A
OllamaEmbeddings:   2%|▏         | 4/249 [00:10<10:14,  2.51s/it][A
OllamaEmbeddings:   2%|▏         | 5/249 [00:12<10:14,  2.52s/it][A
OllamaEmbeddings:   2%|▏         | 6/249 [00:15<10:04,  2.49s/it][A
OllamaEmbeddings:   3%|▎         | 7/249 [00:17<10:07,  2.51s/it][A
OllamaEmbeddings:   3%|▎         | 8/249 [00:20<09:57,  2.48s/it][A
OllamaEmbeddings:   4%|▎         | 9/249 [00:22<10:02,  2.51s/it][A
OllamaEmbeddings:   4%|▍         | 10/249 [00:25<09:57,  2.50s/it][A
OllamaEmbeddings:   4%|▍         | 11/249 [00:27<09:58,  2.51s/it][A
OllamaEmbeddings:   5%|▍         | 12/249 [00:30<09:49,  2.49s/it][A
OllamaEmbeddings:   5%|▌         | 13/249 [00:32<09:52,  2.51s/it][A
OllamaEmbeddings:   6%|▌         | 14

# Retrieval

In [11]:
# LLM from Ollama
local_model = "llama3"
llm = ChatOllama(model=local_model)

In [12]:
QUERY_PROMPT = PromptTemplate(
    input_variables=["question"],
    template="""You are an AI language model assistant, Your task is to generate five
    different versions of the given user question to retrieve relevanta documents from
    a vector database. By generating multiple perspectives on the user question, your
    goal is to help the user overcom some of the limitations of the distance-based
    similarity search. Provide these alternative questions separated by newlines.
    Original question: {question}""",
)

In [13]:
retriever = MultiQueryRetriever.from_llm(
    vector_db.as_retriever(search_kwargs={"k": 5}),
    llm,
    prompt=QUERY_PROMPT
)

# Enhanced RAG prompt
template = """Answer the question based on the following context from the Harry Potter book series. 
If the answer isn't explicitly stated in the context, say so, but try to provide the most relevant information available.
If the question is vague, consider multiple interpretations across all the books.

Context: {context}

Question: {question}

Answer:"""

prompt = ChatPromptTemplate.from_template(template)

In [14]:
chain = (
    {"context": retriever, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

In [16]:
chain.invoke(input(""))

 How does it end?



OllamaEmbeddings:   0%|          | 0/1 [00:00<?, ?it/s][A
OllamaEmbeddings: 100%|██████████| 1/1 [00:05<00:00,  5.83s/it][A

OllamaEmbeddings:   0%|          | 0/1 [00:00<?, ?it/s][A
OllamaEmbeddings: 100%|██████████| 1/1 [00:02<00:00,  2.13s/it][A

OllamaEmbeddings:   0%|          | 0/1 [00:00<?, ?it/s][A
OllamaEmbeddings: 100%|██████████| 1/1 [00:02<00:00,  2.15s/it][A

OllamaEmbeddings:   0%|          | 0/1 [00:00<?, ?it/s][A
OllamaEmbeddings: 100%|██████████| 1/1 [00:02<00:00,  2.27s/it][A

OllamaEmbeddings:   0%|          | 0/1 [00:00<?, ?it/s][A
OllamaEmbeddings: 100%|██████████| 1/1 [00:02<00:00,  2.22s/it][A

OllamaEmbeddings:   0%|          | 0/1 [00:00<?, ?it/s][A
OllamaEmbeddings: 100%|██████████| 1/1 [00:02<00:00,  2.28s/it][A

OllamaEmbeddings:   0%|          | 0/1 [00:00<?, ?it/s][A
OllamaEmbeddings: 100%|██████████| 1/1 [00:02<00:00,  2.26s/it][A

OllamaEmbeddings:   0%|          | 0/1 [00:00<?, ?it/s][A
OllamaEmbeddings: 100%|██████████| 1/1 [00:02<00:00

"The story ends with Dumbledore reminiscing about his past obsession with the Deathly Hallows and how he had finally acquired two of them (the Cloak and the Resurrection Stone). He realizes that he was no match for Voldemort, but Harry is the better man. The chapter ends with a sense of closure and reflection on the part of Dumbledore, as he acknowledges his mistakes in the past and finds comfort in Harry's forgiveness."

In [17]:
chain.invoke("In your opinion, who is the best character?")


OllamaEmbeddings:   0%|          | 0/1 [00:00<?, ?it/s][A
OllamaEmbeddings: 100%|██████████| 1/1 [00:05<00:00,  5.40s/it][A

OllamaEmbeddings:   0%|          | 0/1 [00:00<?, ?it/s][A
OllamaEmbeddings: 100%|██████████| 1/1 [00:02<00:00,  2.14s/it][A

OllamaEmbeddings:   0%|          | 0/1 [00:00<?, ?it/s][A
OllamaEmbeddings: 100%|██████████| 1/1 [00:02<00:00,  2.15s/it][A

OllamaEmbeddings:   0%|          | 0/1 [00:00<?, ?it/s][A
OllamaEmbeddings: 100%|██████████| 1/1 [00:02<00:00,  2.27s/it][A

OllamaEmbeddings:   0%|          | 0/1 [00:00<?, ?it/s][A
OllamaEmbeddings: 100%|██████████| 1/1 [00:02<00:00,  2.35s/it][A

OllamaEmbeddings:   0%|          | 0/1 [00:00<?, ?it/s][A
OllamaEmbeddings: 100%|██████████| 1/1 [00:02<00:00,  2.26s/it][A

OllamaEmbeddings:   0%|          | 0/1 [00:00<?, ?it/s][A
OllamaEmbeddings: 100%|██████████| 1/1 [00:02<00:00,  2.33s/it][A

OllamaEmbeddings:   0%|          | 0/1 [00:00<?, ?it/s][A
OllamaEmbeddings: 100%|██████████| 1/1 [00:02<00:00

'A subjective question!\n\nBased on the context from the Harry Potter book series, it seems that Harry is reflecting on his mother\'s letter and thinking about Dumbledore. While there isn\'t a direct statement about who is the "best" character, we can infer that Harry has strong feelings of joy and grief while reading his mother\'s letter, which suggests a deep emotional connection to her.\n\nAs for Dumbledore, Harry seems to have mixed feelings - he\'s impressed by Dumbledore\'s actions, but also acknowledges that some things might seem "incredible" about him. This hints at a sense of respect and admiration for the Headmaster, without necessarily implying that he is the "best" character.\n\nOf course, this is just one possible interpretation!'

In [18]:
chain.invoke("At what point does Luke die?")


OllamaEmbeddings:   0%|          | 0/1 [00:00<?, ?it/s][A
OllamaEmbeddings: 100%|██████████| 1/1 [00:05<00:00,  5.38s/it][A

OllamaEmbeddings:   0%|          | 0/1 [00:00<?, ?it/s][A
OllamaEmbeddings: 100%|██████████| 1/1 [00:02<00:00,  2.16s/it][A

OllamaEmbeddings:   0%|          | 0/1 [00:00<?, ?it/s][A
OllamaEmbeddings: 100%|██████████| 1/1 [00:02<00:00,  2.15s/it][A

OllamaEmbeddings:   0%|          | 0/1 [00:00<?, ?it/s][A
OllamaEmbeddings: 100%|██████████| 1/1 [00:02<00:00,  2.29s/it][A

OllamaEmbeddings:   0%|          | 0/1 [00:00<?, ?it/s][A
OllamaEmbeddings: 100%|██████████| 1/1 [00:02<00:00,  2.28s/it][A

OllamaEmbeddings:   0%|          | 0/1 [00:00<?, ?it/s][A
OllamaEmbeddings: 100%|██████████| 1/1 [00:02<00:00,  2.28s/it][A

OllamaEmbeddings:   0%|          | 0/1 [00:00<?, ?it/s][A
OllamaEmbeddings: 100%|██████████| 1/1 [00:02<00:00,  2.29s/it][A

OllamaEmbeddings:   0%|          | 0/1 [00:00<?, ?it/s][A
OllamaEmbeddings: 100%|██████████| 1/1 [00:02<00:00

'Luke does not die in this passage. In fact, Luke is not mentioned at all in this context from the Harry Potter book series. The story only mentions Hermione, Ron, Fred, Percy, and Harry, but there is no mention of a character named Luke.'

In [5]:
## Delete all collections in the db
vector_db.delete_collection()

NameError: name 'vector_db' is not defined