In [22]:
from langchain_ollama import OllamaLLM
from langchain_chroma import Chroma
from langchain_community.embeddings import OllamaEmbeddings



# One Ollama model name, reused for both text generation and embeddings.
MODEL = "llama3.1"

# LLM used by every chain below (temperature pinned to 0).
model = OllamaLLM(model=MODEL, temperature=0)

# Embedding function handed to the vector store.
embeddings = OllamaEmbeddings(model=MODEL)

# Chroma collection holding the embedded PDF chunks; `persist_directory`
# points at ./chroma_pdf_db.
vectorstore = Chroma(
    persist_directory="./chroma_pdf_db",
    collection_name="pdf_collection",
    embedding_function=embeddings,
)


In [None]:
from langchain_core.output_parsers import StrOutputParser
# Parser placed after the model in every chain in this notebook.
parser = StrOutputParser()

# Smoke test: send one prompt through model -> parser.
smoke_test_prompt = "tell me a joke"
chain = model | parser
chain.invoke(smoke_test_prompt)

## Text Splitter

In [None]:
from langchain.document_loaders.pdf import PyPDFDirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Splitter settings; sizes are in characters because length_function=len.
CHUNK_SIZE = 700
CHUNK_OVERLAP = 100
PDF_DIR = 'Documents/'

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
    length_function=len,
    is_separator_regex=False,
)

# Load whatever the directory loader finds under PDF_DIR, then split it.
loader = PyPDFDirectoryLoader(PDF_DIR)
docs = loader.load()
chunks = text_splitter.split_documents(docs)

# Show a summary and a short preview instead of dumping every chunk into the
# cell output, which bloats the notebook for any non-trivial document set.
print(f"Loaded {len(docs)} documents, split into {len(chunks)} chunks")
chunks[:3]

## Chunk IDs

In [None]:
def assign_chunk_ids(chunks, source_prefix="Documents\\"):
    """Store an id of the form ``<source>:<page>:<index>`` on every chunk.

    ``<source>`` is the chunk's ``source`` metadata (converted with ``str``)
    with every occurrence of ``source_prefix`` removed, ``<page>`` is its
    ``page`` metadata, and ``<index>`` counts consecutive chunks that share
    the same source and page, starting at 0.

    Args:
        chunks: Iterable of objects exposing a mutable ``metadata`` dict.
        source_prefix: Text removed from the source value. The default uses a
            backslash separator, so sources written with ``/`` are left
            unchanged.

    Returns:
        None. Each chunk is updated in place via ``chunk.metadata["id"]``.
    """
    previous_page_id = None
    index_on_page = 0
    for chunk in chunks:
        # Built outside the f-string: a backslash inside an f-string
        # replacement field is a SyntaxError before Python 3.12.
        source = str(chunk.metadata.get("source")).replace(source_prefix, "")
        page = chunk.metadata.get("page")
        page_id = f"{source}:{page}"

        # Consecutive chunks from the same page get increasing indices;
        # the counter restarts whenever the source/page pair changes.
        if page_id == previous_page_id:
            index_on_page += 1
        else:
            index_on_page = 0
        previous_page_id = page_id

        chunk.metadata["id"] = f"{page_id}:{index_on_page}"


assign_chunk_ids(chunks)

    

## Add to ChromaDB

In [None]:
# Ask the store only for what is needed here: IDs are always included by
# default, and include=[] requests nothing beyond them.
existing_items = vectorstore.get(include=[])
existing_ids = set(existing_items["ids"])
print(f"Number of existing documents in DB: {len(existing_ids)}")

# Only add chunks whose id is not already stored, so re-running this cell
# does not insert the same chunk twice.
new_chunks = [chunk for chunk in chunks if chunk.metadata["id"] not in existing_ids]

if new_chunks:
    print(f"Adding new documents: {len(new_chunks)}")
    new_chunk_ids = [chunk.metadata["id"] for chunk in new_chunks]
    vectorstore.add_documents(new_chunks, ids=new_chunk_ids)
else:
    print("✅ No new documents to add")

In [None]:
from langchain.prompts import PromptTemplate

# Prompt text with two placeholders, {question} and {context}. Written as
# explicit line literals so the leading blank line and the trailing space
# after {context} are visible.
template = (
    "\n"
    "Answer the question below using the context. if you do not know the answer reply \"I Don't Know.\"\n"
    "\n"
    "question = {question}\n"
    "context = {context} \n"
)
prompt = PromptTemplate.from_template(template)

# Render the template once with sample values to inspect the final text.
rendered_prompt = prompt.format(question="what are you?", context="You are a wizard")
print(rendered_prompt)

In [None]:
# Hand-written context and question to exercise prompt -> model -> parser
# without involving the retriever.
sample_inputs = {"context": "you are a dog", "question": "what are you?"}

chain = prompt | model | parser
chain.invoke(sample_inputs)

In [None]:
# Expose the vector store through the retriever interface and try one query.
sample_query = "what are the steps to install python?"
retriever = vectorstore.as_retriever()
retriever.invoke(sample_query)

In [50]:
from operator import itemgetter
# Both prompt variables are derived from the "question" key of the input dict:
# the context is the retriever's result for that question, and the question
# itself is passed through unchanged.
pick_question = itemgetter("question")
prompt_inputs = {
    "context": pick_question | retriever,
    "question": pick_question,
}

# Full pipeline: build prompt inputs -> fill template -> LLM -> parser.
chain = prompt_inputs | prompt | model | parser

In [None]:
# Run the retrieval-backed chain on one question and print the answer.
answer = chain.invoke({"question": "how can I install python?"})
print(answer)