In [1]:
import argparse
import os
import shutil
from langchain.document_loaders.pdf import PyPDFDirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.schema.document import Document
from langchain_community.embeddings.ollama import OllamaEmbeddings
from langchain.vectorstores.chroma import Chroma

In [2]:
# Directory containing the source PDFs. The original workstation path stays the
# default; set the RAG_DATA_PATH environment variable to point the notebook at
# another location without editing this cell.
# NOTE: the chunk IDs stored in Chroma embed each document's source path, so
# loading the same PDFs from a different directory produces different IDs.
DATA_PATH = os.environ.get("RAG_DATA_PATH", r"C:\Users\EK115DQ\GenAI Training\data")

# Directory handed to Chroma as persist_directory (relative to the working directory).
CHROMA_PATH = "chroma"

In [3]:
# Load the documents found under DATA_PATH.
document_loader = PyPDFDirectoryLoader(DATA_PATH)
documents = document_loader.load()

# Split the loaded documents into chunks of at most 800 characters (measured
# with len) that overlap by 80 characters; separators are treated as plain
# strings, not regular expressions.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=800, chunk_overlap=80, length_function=len, is_separator_regex=False
)
chunks = text_splitter.split_documents(documents)

  from cryptography.hazmat.primitives.ciphers.algorithms import AES, ARC4


In [4]:
def calculate_chunk_ids(chunks):
    """Assign a unique ``id`` to every chunk's metadata and return the chunks.

    IDs have the form ``"<source>:<page>:<chunk index>"``, for example
    ``"data/monopoly.pdf:6:2"``. The index counts the chunks that share the
    same source and page, starting at 0.

    Args:
        chunks: Iterable of objects exposing a ``metadata`` dict. The
            ``"source"`` and ``"page"`` entries are read with ``.get``, so a
            missing entry appears as ``None`` in the generated ID.

    Returns:
        The ``chunks`` object that was passed in; each chunk's
        ``metadata["id"]`` is set in place.
    """
    # Track the next index per page instead of comparing with the previous
    # chunk only. For chunks that arrive grouped by page this yields the same
    # IDs as before, and chunks of one page that are not adjacent still get
    # distinct IDs rather than restarting at 0 and colliding.
    next_index_by_page = {}

    for chunk in chunks:
        source = chunk.metadata.get("source")
        page = chunk.metadata.get("page")
        page_id = f"{source}:{page}"

        chunk_index = next_index_by_page.get(page_id, 0)
        next_index_by_page[page_id] = chunk_index + 1

        # Store the ID in the chunk's own metadata.
        chunk.metadata["id"] = f"{page_id}:{chunk_index}"

    return chunks

In [5]:
# Reuse calculate_chunk_ids (defined in the previous cell) instead of keeping a
# second copy of its loop here; it sets metadata["id"] on every chunk in place.
calculate_chunk_ids(chunks)
chunk_ids = [chunk.metadata["id"] for chunk in chunks]

In [6]:
def get_embedding_function():
    """Return a new ``OllamaEmbeddings`` instance for the ``nomic-embed-text`` model."""
    return OllamaEmbeddings(model="nomic-embed-text")

In [7]:
# Open the Chroma store kept under CHROMA_PATH.
db = Chroma(
    embedding_function=get_embedding_function(),
    persist_directory=CHROMA_PATH,
)

# Make sure every chunk carries its metadata["id"].
chunks_with_ids = calculate_chunk_ids(chunks)

# Ask the store for its items with an empty include list and collect the IDs.
existing_items = db.get(include=[])
existing_ids = set(existing_items["ids"])
print(f"Number of existing documents in DB: {len(existing_ids)}")

# Keep only the chunks whose ID is not in the store yet.
new_chunks = [
    chunk for chunk in chunks_with_ids if chunk.metadata["id"] not in existing_ids
]

if not new_chunks:
    print("✅ No new documents to add")
else:
    print(f"👉 Adding new documents: {len(new_chunks)}")
    new_chunk_ids = [chunk.metadata["id"] for chunk in new_chunks]
    db.add_documents(new_chunks, ids=new_chunk_ids)
    db.persist()

  embedddings = OllamaEmbeddings(model="nomic-embed-text")
  db = Chroma(


Number of existing documents in DB: 567
✅ No new documents to add


In [8]:
from langchain.prompts import ChatPromptTemplate
from langchain_community.llms.ollama import Ollama

In [23]:
# Prompt template used by query_rag, which fills {context} with the joined text
# of the retrieved chunks and {question} with the user's query. {question}
# appears twice, so the query is substituted in both places.
PROMPT_TEMPLATE = """
If the {question} is a generic conversation like "hi","hello", or similar questions like "how do you do" which do not seek knowledge
from the context, give a natural response without considering the below context. Otherwise answer the question based on the context: {context}.
QUESTION: {question}
ANSWER:
"""

In [27]:
def query_rag(query_text):
    """Answer ``query_text`` with the LLM, using chunks retrieved from Chroma as context.

    Runs a similarity search (``k=5``) against the store at ``CHROMA_PATH``,
    fills ``PROMPT_TEMPLATE`` with the retrieved text and the query, and
    invokes the ``qwen2.5:1.5b`` Ollama model with the resulting prompt.

    Args:
        query_text: The user's question.

    Returns:
        The model's response. The response and the ``id`` metadata of the
        retrieved chunks are also printed.
    """
    store = Chroma(
        persist_directory=CHROMA_PATH,
        embedding_function=get_embedding_function(),
    )

    # Each hit is a (document, score) pair; the score is not used below.
    hits = store.similarity_search_with_score(query_text, k=5)
    matched_docs = [doc for doc, _score in hits]

    context_text = "\n\n---\n\n".join(doc.page_content for doc in matched_docs)
    prompt = ChatPromptTemplate.from_template(PROMPT_TEMPLATE).format(
        context=context_text, question=query_text
    )

    response_text = Ollama(model="qwen2.5:1.5b").invoke(prompt)

    # Chunks without an "id" entry are reported as None.
    sources = [doc.metadata.get("id", None) for doc in matched_docs]
    print(f"Response: {response_text}\nSources: {sources}")
    return response_text

In [33]:
# No trailing comma after input(...): with one, query_text becomes a
# one-element tuple and that tuple (not the typed string) is passed to query_rag.
query_text = input("Enter query: ")
query_rag(query_text)

Response: ITC stands for Input Tax Credit.
Sources: ['C:\\Users\\EK115DQ\\GenAI Training\\data\\Aug 22.pdf:34:1', 'C:\\Users\\EK115DQ\\GenAI Training\\data\\Aug 22.pdf:29:2', 'C:\\Users\\EK115DQ\\GenAI Training\\data\\Aug 22.pdf:18:3', 'C:\\Users\\EK115DQ\\GenAI Training\\data\\Aug 22.pdf:25:4', 'C:\\Users\\EK115DQ\\GenAI Training\\data\\Aug 22.pdf:29:3']


'ITC stands for Input Tax Credit.'

In [None]:
# No trailing comma after input(...): with one, query_text becomes a
# one-element tuple and that tuple (not the typed string) is passed to query_rag.
query_text = input("Enter query: ")
query_rag(query_text)

In [None]:
# import chromadb
# chroma_client = chromadb.HttpClient(host='localhost', port=8000)
# collection = chroma_client.get_collection(name="gstbot")

In [None]:
# import os
# import certifi

# os.environ["SSL_CERT_FILE"] = certifi.where()
# import ssl

# ssl.SSLContext.verify_mode = ssl.VerifyMode.CERT_NONE

In [None]:
# documents = [chunk.page_content for chunk in chunks]  # LangChain uses page_content
# metadatas = [chunk.metadata for chunk in chunks]
# ids = chunk_ids

# # Add documents to ChromaDB collection
# collection.add(
#     documents=documents,
#     metadatas=metadatas,
#     ids=ids
# )

ConnectError: [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: Basic Constraints of CA cert not marked critical (_ssl.c:1028) in add.