## Ingesting PDF

In [1]:
from langchain_community.document_loaders import UnstructuredPDFLoader
from IPython.display import display as Markdown
from tqdm.autonotebook import tqdm as notebook_tqdm

  from tqdm.autonotebook import tqdm as notebook_tqdm


In [2]:
# Local PDF file to ingest.
local_path = "3dtbp_2024.pdf"

# Fail fast when no path is configured. Merely printing a hint would leave
# `data` undefined, and the cells below that read it would then fail with an
# unrelated-looking NameError.
if not local_path:
    raise ValueError("Upload a PDF file: set local_path to the PDF to ingest")

# Load the PDF into a list of LangChain documents; `data` is consumed by the
# preview and chunking cells further down.
loader = UnstructuredPDFLoader(file_path=local_path)
data = loader.load()

In [8]:
# Preview the text of the first loaded document.
# NOTE: in this notebook `Markdown` is bound to IPython's `display` by the
# import cell at the top (`from IPython.display import display as Markdown`),
# so this call displays the raw string; it does not render it as Markdown.
Markdown(data[0].page_content)

'5 1 3 3 6 7 0 1 4 2 0 2 5 4 4 3 6 S E S C I / 9 0 1 1 0 1 : I\n\n.\n\n.\n\n.\n\nO D | E E E I\n\n4 2 0 2 © 0 0 1 3 $ / 4 2 / 4 - 6 3 0 4 - 5 1 3 3 - 8 - 9 7 9 | ) S E S C I (\n\n.\n\ns\n\nm e t s y S t r e p x E e b a n a t s u S n o e c n e r e f n o C\n\nl\n\ni\n\nl\n\na n o i t a n r e t n\n\nI\n\nh t 4 4 2 0 2\n\nProceedings of the International Conference on Sustainable Expert Systems (ICSES-2024) IEEE Xplore Part Number: CFP24VS6-ART; ISBN: 979-8-3315-4036-4\n\nComparative Analysis of Deep Learning Models for Early Skin Cancer Detection Using 3D Total Body Photography\n\n1st Keshavagari Smithin Reddy Department of Computer Science and Engineering, Amrita School of Computing, Amrita Vishwa Vidyapeetham Chennai, India smithinreddy4@gmail.com\n\n2nd Ramya Polaki Department of Computer Science and Engineering, Amrita School of Computing, Amrita Vishwa Vidyapeetham Chennai, India ramyapolaki6046@gmail.com\n\n3rd V Sulochana Anna Administrative Staff College, Chennai, India sulo62002@

## Vector Embeddings

In [17]:
# Shell out to the Ollama CLI to list the locally available models; the
# embedding model used later in this section (nomic-embed-text) should be listed.
!ollama list

NAME                       ID              SIZE      MODIFIED    
minicpm-v:latest           1862d7d5fee5    5.5 GB    11 days ago    
llava:latest               8dd30f6b0cb1    4.7 GB    2 weeks ago    
nomic-embed-text:latest    0a109f422b47    274 MB    2 weeks ago    
llama3.2:latest            a80c4f17acd5    2.0 GB    4 weeks ago    
gemma2:latest              ff02c3702f32    5.4 GB    4 weeks ago    


In [18]:
# # Pull nomic-embed-text model from Ollama if you don't have it
# !ollama pull nomic-embed-text
# # List models again to confirm it's available
# !ollama list

In [20]:
# 1. First clean up any existing ChromaDB installations
%pip uninstall -y chromadb
%pip uninstall -y protobuf

# 2. Install specific versions known to work together
%pip install -q protobuf==3.20.3
%pip install -q chromadb==0.4.22  # Using a stable older version
%pip install -q langchain-ollama

# 3. Set the environment variable
import os
os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"

In [21]:
from langchain_ollama import OllamaEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma

In [22]:
# Break the loaded documents into overlapping text chunks; `chunks` is what
# gets embedded and stored in the vector database.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=7500,
    chunk_overlap=100,
)
chunks = text_splitter.split_documents(data)

In [25]:
# 1. First clean up any existing ChromaDB installations
%pip uninstall -y chromadb
%pip uninstall -y protobuf

# 2. Install specific versions known to work together
%pip install -q protobuf==3.20.3
%pip install -q chromadb==0.4.22  # Using a stable older version
%pip install -q langchain-ollama

# 3. Set the environment variable
import os
os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"

# 4. Now reimport with the new versions
from langchain_ollama import OllamaEmbeddings
from langchain_community.vectorstores import Chroma

# 5. Try creating the vector database: embed every chunk with the local
#    nomic-embed-text model and store the result in the "local-rag" collection
embedding_model = OllamaEmbeddings(model="nomic-embed-text")
vector_db = Chroma.from_documents(
    documents=chunks,
    embedding=embedding_model,
    collection_name="local-rag",
)

/Users/tonykipkemboi/YouTube/Coding Projects/ollama_pdf_rag/.venv/bin/python: No module named pip
Note: you may need to restart the kernel to use updated packages.
/Users/tonykipkemboi/YouTube/Coding Projects/ollama_pdf_rag/.venv/bin/python: No module named pip
Note: you may need to restart the kernel to use updated packages.
/Users/tonykipkemboi/YouTube/Coding Projects/ollama_pdf_rag/.venv/bin/python: No module named pip
Note: you may need to restart the kernel to use updated packages.
/Users/tonykipkemboi/YouTube/Coding Projects/ollama_pdf_rag/.venv/bin/python: No module named pip
Note: you may need to restart the kernel to use updated packages.
/Users/tonykipkemboi/YouTube/Coding Projects/ollama_pdf_rag/.venv/bin/python: No module named pip
Note: you may need to restart the kernel to use updated packages.


## Retrieval

In [27]:
from langchain.prompts import ChatPromptTemplate, PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_ollama.chat_models import ChatOllama
from langchain_core.runnables import RunnablePassthrough
from langchain.retrievers.multi_query import MultiQueryRetriever

In [28]:
# List the locally available Ollama models again; the chat model selected in
# the next cell (llama3.2) should appear in this list.
!ollama list

NAME                       ID              SIZE      MODIFIED    
minicpm-v:latest           1862d7d5fee5    5.5 GB    11 days ago    
llava:latest               8dd30f6b0cb1    4.7 GB    2 weeks ago    
nomic-embed-text:latest    0a109f422b47    274 MB    2 weeks ago    
llama3.2:latest            a80c4f17acd5    2.0 GB    4 weeks ago    
gemma2:latest              ff02c3702f32    5.4 GB    4 weeks ago    


In [29]:
# LLM from Ollama.
# `local_model` is the Ollama model tag; `llm` is the chat model object that is
# passed to the multi-query retriever and used as the answering step of the chain.
local_model = "llama3.2"
llm = ChatOllama(model=local_model)

In [30]:
# Prompt given to the LLM to rewrite the user's question into five alternative
# phrasings, one per line. `{question}` is the only placeholder.
QUERY_PROMPT = PromptTemplate(
    template="""You are an AI language model assistant. Your task is to generate five
    different versions of the given user question to retrieve relevant documents from
    a vector database. By generating multiple perspectives on the user question, your
    goal is to help the user overcome some of the limitations of the distance-based
    similarity search. Provide these alternative questions separated by newlines.
    Original question: {question}""",
    input_variables=["question"],
)

In [31]:
# Wrap the vector store's retriever in a multi-query retriever: the LLM is
# prompted with QUERY_PROMPT to produce alternative phrasings of the question.
retriever = MultiQueryRetriever.from_llm(
    retriever=vector_db.as_retriever(),
    llm=llm,
    prompt=QUERY_PROMPT,
)

# RAG prompt: instructs the model to answer only from the retrieved context.
# `{context}` and `{question}` are filled in by the chain at invocation time.
template = (
    "Answer the question based ONLY on the following context:\n"
    "{context}\n"
    "Question: {question}\n"
)

prompt = ChatPromptTemplate.from_template(template)

In [32]:
# Inputs for the prompt: the incoming question is sent to the retriever to
# produce "context" and is also passed through unchanged as "question".
rag_inputs = {"context": retriever, "question": RunnablePassthrough()}

# Pipeline: gather inputs -> fill the RAG prompt -> call the LLM -> plain string.
chain = rag_inputs | prompt | llm | StrOutputParser()

In [33]:
# Run the RAG chain on a sample question; as the last expression of the cell,
# the returned answer string is shown as the cell output.
# NOTE(review): this question is about global cooperation, whereas the page
# previewed earlier in this notebook comes from a skin-cancer-detection paper —
# confirm the question matches the PDF that was actually ingested.
chain.invoke("What are the 5 pillars of global cooperation?")

'The 5 pillars of global cooperation, as analyzed by the Global Cooperation Barometer, are:\n\n1. **Trade and Capital**: Promote global development and resilience through the presence of global economic flows that promote likely opportunities for these outcomes.\n2. **Innovation and Technology**: Accelerate innovation and beneficial technological progress through the global sharing of underlying knowledge that contributes to these outcomes by fostering collaboration across global talent.\n3. **Climate and Natural Capital**: Focus on addressing climate change and promoting sustainable development through cooperation and collective action.\n4. **Health and Wellness**: Promote global health and well-being through cooperative efforts in areas such as disease prevention, health security, and access to healthcare.\n5. **Peace and Security**: Foster international peace and security through cooperation and conflict resolution mechanisms that promote stability and predictability.\n\nThese 5 pil

In [34]:
# Delete the collection backing `vector_db` (created above as "local-rag").
# NOTE(review): this presumably removes only that one collection, not every
# collection in the database — confirm against the Chroma wrapper's docs.
vector_db.delete_collection()