In [8]:
import os
from typing import List, Any
from langchain_ollama import ChatOllama
from langchain_community.document_loaders import PyMuPDFLoader, DirectoryLoader
from langchain_chroma import Chroma
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate

# --- VECTOR STORE LOGIC ---
class ChromaVectorStore:
    """Wrapper around a persisted Chroma collection using HuggingFace embeddings.

    On construction, a store under ``persist_dir`` is reopened when its
    ``chroma.sqlite3`` file is present; otherwise ``vectorstore`` stays ``None``
    until :meth:`build_from_documents` is called.

    Attributes are documented inline in ``__init__``.
    """

    def __init__(self, persist_dir: str = "chroma_store",
                 embedding_model_name: str = "all-MiniLM-L6-v2"):
        """Create the embedding function and reopen an existing store if present.

        Args:
            persist_dir: Directory in which the Chroma data is persisted.
            embedding_model_name: Model name handed to ``HuggingFaceEmbeddings``.
        """
        # Directory used both for the existence check and for persistence.
        self.persist_dir = persist_dir
        # Embedding function shared by loading and building.
        self.embedding_function = HuggingFaceEmbeddings(model_name=embedding_model_name)
        # Stays None until a store is loaded or built.
        self.vectorstore = None

        # Load existing store if it exists. The presence of the sqlite file is
        # the only signal used; its contents are not inspected here.
        sqlite_path = os.path.join(self.persist_dir, "chroma.sqlite3")
        if os.path.exists(sqlite_path):
            self.vectorstore = Chroma(persist_directory=self.persist_dir,
                                      embedding_function=self.embedding_function)

    def build_from_documents(self, documents: List[Any],
                             chunk_size: int = 1000,
                             chunk_overlap: int = 200) -> None:
        """Split ``documents`` into chunks and index them in a persisted store.

        Args:
            documents: Documents accepted by
                ``RecursiveCharacterTextSplitter.split_documents``.
            chunk_size: Chunk size passed to the splitter (default 1000,
                the value previously hard-coded).
            chunk_overlap: Overlap between chunks passed to the splitter
                (default 200, the value previously hard-coded).

        Raises:
            ValueError: If ``documents`` is empty or splitting yields no chunks.
        """
        # Fail before touching Chroma: an indexing attempt with nothing to
        # index would, at best, leave an empty store behind.
        # NOTE(review): presumably such a store would then be picked up by the
        # chroma.sqlite3 check in __init__ on the next run -- confirm.
        if not documents:
            raise ValueError("build_from_documents() requires at least one document.")

        splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size,
                                                  chunk_overlap=chunk_overlap)
        chunks = splitter.split_documents(documents)
        if not chunks:
            raise ValueError("The supplied documents produced no text chunks to index.")

        self.vectorstore = Chroma.from_documents(
            documents=chunks,
            embedding=self.embedding_function,
            persist_directory=self.persist_dir
        )

# --- LCEL RAG IMPLEMENTATION ---
class RAGSearch:
    """Retrieval-augmented question answering assembled as an LCEL pipeline.

    The pipeline retrieves the top matching chunks from a ``ChromaVectorStore``,
    formats them into a prompt, sends the prompt to a ``ChatOllama`` model and
    parses the reply into a string.
    """

    def __init__(self, pdf_docs: List[Any],
                 persist_dir: str = "chroma_store",
                 llm_model: str = "llama3.2:latest",
                 top_k: int = 3):
        """Prepare the vector store, retriever, LLM, prompt and chain.

        Args:
            pdf_docs: Documents to index. They are only used when no persisted
                store is found; if one already exists it is reused as-is and
                ``pdf_docs`` is ignored.
            persist_dir: Directory of the persisted Chroma store (default
                matches ``ChromaVectorStore``'s own default).
            llm_model: Model name passed to ``ChatOllama``.
            top_k: Number of chunks the retriever is asked for per query.

        Raises:
            ValueError: If ``top_k`` is smaller than 1.
        """
        # Validate before any expensive model/store initialisation happens.
        if top_k < 1:
            raise ValueError("top_k must be a positive integer.")

        # Initialize Vector Store; build it only when nothing was loaded.
        self.cvs = ChromaVectorStore(persist_dir=persist_dir)
        if self.cvs.vectorstore is None:
            self.cvs.build_from_documents(pdf_docs)

        # Define Retriever as a Runnable
        self.retriever = self.cvs.vectorstore.as_retriever(search_kwargs={"k": top_k})

        # Initialize LLM
        self.llm = ChatOllama(model=llm_model)

        # Define Prompt Template (text kept exactly as sent to the model).
        template = """Answer the question based only on the following context:
        {context}

        Question: {question}
        """
        self.prompt = ChatPromptTemplate.from_template(template)

        # --- THE LCEL PIPELINE ---
        # The dict fans the incoming query out: one branch retrieves and
        # formats context, the other passes the query through unchanged.
        self.rag_chain = (
            {"context": self.retriever | self._format_docs, "question": RunnablePassthrough()}
            | self.prompt
            | self.llm
            | StrOutputParser()
        )

    def _format_docs(self, docs) -> str:
        """Join the ``page_content`` of each retrieved document with blank lines."""
        return "\n\n".join(doc.page_content for doc in docs)

    def ask(self, query: str):
        """Run ``query`` through the RAG chain and return the chain's output."""
        return self.rag_chain.invoke(query)

# --- EXECUTION ---
# Ensure a folder named 'pdf-files' exists in your Colab file explorer.
# Three outcomes are reported explicitly: folder just created, folder present
# but nothing loaded, or documents loaded and a question answered.
if not os.path.exists("pdf-files"):
    os.makedirs("pdf-files")
    print("Please upload your PDFs to the 'pdf-files' folder and run again.")
else:
    # Recursively load every PDF under the folder with PyMuPDF.
    loader = DirectoryLoader("pdf-files", glob="**/*.pdf", loader_cls=PyMuPDFLoader)
    docs = loader.load()

    if docs:
        rag_system = RAGSearch(pdf_docs=docs)

        question = "What does the Part-8 of the constitution talk about? "
        print(rag_system.ask(question))
    else:
        # Previously this case ended silently, giving no hint why nothing ran.
        print("No PDF content was loaded from 'pdf-files'. Upload at least one PDF and run again.")

The context provided only shows 3 instances of "7" with a common text. There is no specific mention of what Part-8 of the constitution talks about. The instances only describe the passage and promulgation of the Constitution, but do not provide any information about its content or structure.


In [9]:
!pip install langchain-chroma
!pip install langchain-ollama
!pip install langchain-huggingface



In [10]:
!pip install langchain-core langchain-community pymupdf chromadb langchain_google_genai

