In [64]:
! pip install langchain_community tiktoken langchain-openai langchainhub chromadb langchain




[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [65]:
from langchain_core.documents import Document
from langchain_community.document_loaders import DirectoryLoader, PyMuPDFLoader, PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
import os
from pathlib import Path

In [66]:
def process_all_pdfs(pdf_directory : str):
    
    all_documents = []
    pdf_dir = Path(pdf_directory)

    total_pages = 0
    max_page = 0
    min_page = float('inf')

    # find all the pdf in the dir 
    pdfs = list(pdf_dir.glob("**/*.pdf"))

    print(f"Found : {len(pdfs)} PDFs files to Process")

    for pdf in pdfs:
        print(f"Procesing: {pdf.name}")

        try:
            loader = PyPDFLoader(str(pdf))
            documents = loader.load()

            # Add source informaion to the metadata
            for doc in documents:
                doc.metadata["source"] = pdf.name
                doc.metadata["format"] = 'pdf'
            
            total_pages += len(documents)
            max_page = max(max_page, len(documents))
            min_page = min(min_page, len(documents))
            all_documents.extend(documents)
            print(f"  ✓ Loaded {len(documents)} pages")
        except Exception as e:
            print(f"  ✗ Error: {e}")
    
    print(f"Documents Loaded: {len(documents)}")
    return {
        "all_pdf_documents" : all_documents,
        "total_documnets" : total_pages,
        "max_pages" : max_page,
        "min_pages" : min_page 
    }

In [67]:
all_pdf_documents = process_all_pdfs("../data")

Found : 49 PDFs files to Process
Procesing: 1.pdf
  ✓ Loaded 4 pages
Procesing: 10.pdf
  ✓ Loaded 3 pages
Procesing: 11.pdf
  ✓ Loaded 3 pages
Procesing: 12.pdf
  ✓ Loaded 2 pages
Procesing: 13.pdf
  ✓ Loaded 3 pages
Procesing: 14.pdf
  ✓ Loaded 4 pages
Procesing: 15.pdf
  ✓ Loaded 3 pages
Procesing: 16.pdf
  ✓ Loaded 3 pages
Procesing: 17.pdf
  ✓ Loaded 5 pages
Procesing: 18.pdf
  ✓ Loaded 6 pages
Procesing: 19.pdf
  ✓ Loaded 2 pages
Procesing: 2.pdf
  ✓ Loaded 3 pages
Procesing: 20.pdf
  ✓ Loaded 2 pages
Procesing: 21.pdf
  ✓ Loaded 3 pages
Procesing: 22.pdf
  ✓ Loaded 3 pages
Procesing: 23.pdf
  ✓ Loaded 2 pages
Procesing: 24.pdf
  ✓ Loaded 3 pages
Procesing: 25.pdf
  ✓ Loaded 3 pages
Procesing: 26.pdf
  ✓ Loaded 2 pages
Procesing: 27.pdf
  ✓ Loaded 2 pages
Procesing: 3.pdf
  ✓ Loaded 4 pages
Procesing: 4.pdf
  ✓ Loaded 4 pages
Procesing: 5.pdf
  ✓ Loaded 2 pages
Procesing: 6.pdf
  ✓ Loaded 5 pages
Procesing: 7.pdf
  ✓ Loaded 3 pages
Procesing: 8.pdf
  ✓ Loaded 3 pages
Procesing: 9.

In [68]:
all_pdf_documents

{'all_pdf_documents': [Document(metadata={'producer': 'Microsoft: Print To PDF', 'creator': 'PyPDF', 'creationdate': '2025-12-27T14:42:57+05:30', 'author': '', 'moddate': '2025-12-27T14:42:57+05:30', 'title': 'Microsoft Word - datahacathon', 'source': '1.pdf', 'total_pages': 4, 'page': 0, 'page_label': '1', 'format': 'pdf'}, page_content='1. National Overseas Scholarship For Students With Disabilities \nDetails \nA scholarship scheme by the Ministry of Social Justice & Empowerment for regular, full-\ntime students with disabilities to obtain higher education viz., Master\'s degree, or Ph.D. \ncourses from foreign universities, in one of the speciﬁed ﬁelds of study. \nThe students with the speciﬁed disabilities deﬁned in Schedule of the "Rights of \nPersons with Disabilities Act, 2016" will be eligible under this scheme. This includes \npersons with visual, hearing, speech, loco-motor, mental retardation, and other \ndisabilities. NOS is implemented oƯline by the DEPwD. \nAt times SwDs 

In [69]:
def splitter(documents, chunk_size = 1000, chunk_overlap=200):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size = chunk_size,
        chunk_overlap = chunk_overlap,
        length_function = len,
        separators = ["\n", "\n\n", " ", ""]
    )

    split_docs = text_splitter.split_documents(documents=documents)
    print(f"Split {len(documents)} documents into {len(split_docs)} chunks")

    if split_docs:
        print(f"Example chunks: ")
        print(f"Content {split_docs[0].page_content[:]}")
        print(f"MetaData {split_docs[0].metadata}")
    
    return split_docs


In [70]:
# Ensure we pass the list of Document objects (not the dict) to splitter
docs = all_pdf_documents['all_pdf_documents'] if isinstance(all_pdf_documents, dict) and 'all_pdf_documents' in all_pdf_documents else all_pdf_documents

chunks = splitter(docs)
chunks

Split 243 documents into 412 chunks
Example chunks: 
Content 1. National Overseas Scholarship For Students With Disabilities 
Details 
A scholarship scheme by the Ministry of Social Justice & Empowerment for regular, full-
time students with disabilities to obtain higher education viz., Master's degree, or Ph.D. 
courses from foreign universities, in one of the speciﬁed ﬁelds of study. 
The students with the speciﬁed disabilities deﬁned in Schedule of the "Rights of 
Persons with Disabilities Act, 2016" will be eligible under this scheme. This includes 
persons with visual, hearing, speech, loco-motor, mental retardation, and other 
disabilities. NOS is implemented oƯline by the DEPwD. 
At times SwDs are deprived of harnessing their latent skills and thereby miss the 
opportunity. This scheme aims to support SwDs to study further in order to prepare 
themselves to earn their livelihood and to ﬁnd a digniﬁed place for themselves in the 
society as they face several barriers physical, ﬁn

[Document(metadata={'producer': 'Microsoft: Print To PDF', 'creator': 'PyPDF', 'creationdate': '2025-12-27T14:42:57+05:30', 'author': '', 'moddate': '2025-12-27T14:42:57+05:30', 'title': 'Microsoft Word - datahacathon', 'source': '1.pdf', 'total_pages': 4, 'page': 0, 'page_label': '1', 'format': 'pdf'}, page_content='1. National Overseas Scholarship For Students With Disabilities \nDetails \nA scholarship scheme by the Ministry of Social Justice & Empowerment for regular, full-\ntime students with disabilities to obtain higher education viz., Master\'s degree, or Ph.D. \ncourses from foreign universities, in one of the speciﬁed ﬁelds of study. \nThe students with the speciﬁed disabilities deﬁned in Schedule of the "Rights of \nPersons with Disabilities Act, 2016" will be eligible under this scheme. This includes \npersons with visual, hearing, speech, loco-motor, mental retardation, and other \ndisabilities. NOS is implemented oƯline by the DEPwD. \nAt times SwDs are deprived of harnes

In [71]:
type(chunks)

list

In [72]:
import numpy as np
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings
import uuid
from typing import List, Tuple, Dict, Any
from sklearn.metrics.pairwise import cosine_similarity

In [82]:
class EmbeddingManager:
    """
    Handles document embedding generation using SentenceTransformer
    """

    def __init__(self, model_name: str = "all-MiniLM-L6-v2"):
        """
        Initialize the embedding manager.

        Args:
            model_name: HuggingFace model name for sentence embeddings
        """
        self.model_name = model_name
        self.model: SentenceTransformer | None = None
        self._load_model()

    def _load_model(self) -> None:
        """Load the SentenceTransformer model."""
        try:
            print(f"Loading embedding model: {self.model_name}")
            self.model = SentenceTransformer(self.model_name)
            dim = self.model.get_sentence_embedding_dimension()
            print(f"Model loaded successfully. Embedding dimension: {dim}")
        except Exception as e:
            print(f"Error loading model '{self.model_name}': {e}")
            raise

    def generate_embeddings(self, texts: List[str]) -> np.ndarray:
        """
        Generate embeddings for a list of texts.

        Args:
            texts: List of text strings to embed

        Returns:
            NumPy array of shape (len(texts), embedding_dim)
        """
        if self.model is None:
            raise RuntimeError("Embedding model is not loaded.")

        if not texts:
            raise ValueError("Input text list is empty.")

        print(f"Generating embeddings for {len(texts)} texts...")
        embeddings = self.model.encode(
            texts,
            show_progress_bar=True,
            convert_to_numpy=True,
            normalize_embeddings=True
        )
        print(f"Generated embeddings with shape: {embeddings.shape}")
        return embeddings

In [83]:
embedding_manager = EmbeddingManager()

Loading embedding model: all-MiniLM-L6-v2
Model loaded successfully. Embedding dimension: 384


In [93]:
class VectorStore:

    def __init__(self, collection_name: str = "pdf_documents"):
        self.collection_name = collection_name

        BASE_DIR = Path(__file__).resolve().parent if "__file__" in globals() else Path.cwd()
        self.persist_directory = BASE_DIR / "data" / "dataBase"

        self.client = None
        self.collection = None
        self._initialize_store()

    def _initialize_store(self):
        try:
            os.makedirs(self.persist_directory, exist_ok=True)

            self.client = chromadb.Client(
                Settings(
                    persist_directory=str(self.persist_directory),
                    anonymized_telemetry=False
                )
            )

            self.collection = self.client.get_or_create_collection(
                name=self.collection_name,
                metadata={"description": "PDF document embeddings for RAG"}
            )

            print(f"Vector store initialized: {self.collection_name}")
            print(f"Persist directory: {self.persist_directory}")
            print(f"Documents in collection: {self.collection.count()}")

        except Exception as e:
            print(f"Error initializing vector store: {e}")
            raise

    def add_documents(self, documents: List[Any], embeddings: np.ndarray):

        if len(documents) != len(embeddings):
            raise ValueError("Documents and embeddings count mismatch")

        ids, metadatas, texts, embedding_list = [], [], [], []

        for i, (doc, embedding) in enumerate(zip(documents, embeddings)):
            doc_id = f"doc_{uuid.uuid4().hex[:8]}_{i}"
            ids.append(doc_id)

            metadata = dict(doc.metadata)
            metadata["doc_index"] = i
            metadata["content_length"] = len(doc.page_content)
            metadatas.append(metadata)

            texts.append(doc.page_content)
            embedding_list.append(embedding.tolist())

        self.collection.add(
            ids=ids,
            documents=texts,
            metadatas=metadatas,
            embeddings=embedding_list
        )

        print(f"Added {len(documents)} documents")
        print(f"Total documents: {self.collection.count()}")

In [94]:
vector_store = VectorStore()

Vector store initialized: pdf_documents
Persist directory: c:\Users\hp\Desktop\Sarkari-Dost\NoteBooks\data\dataBase
Documents in collection: 412


In [95]:
text = [ doc.page_content for doc in chunks]
embeddings = embedding_manager.generate_embeddings(text)


vector_store.add_documents(chunks, embeddings)

Generating embeddings for 412 texts...


Batches: 100%|██████████| 13/13 [00:13<00:00,  1.04s/it]


Generated embeddings with shape: (412, 384)
Added 412 documents
Total documents: 824


In [97]:
class RAGRetriever:
    """Handles query-based retrieval from the vector store"""

    def __init__(self, vector_store: VectorStore, embedding_manager: EmbeddingManager):
        self.vector_store = vector_store
        self.embedding_manager = embedding_manager

    def retrieve(
        self,
        query: str,
        top_k: int = 7,
        score_threshold: float = 0.0
    ) -> List[Dict[str, Any]]:
        """
        Retrieve relevant documents for a query.
        """

        if not query or not query.strip():
            raise ValueError("Query string is empty")

        print(f"Retrieving documents for query: '{query}'")
        print(f"Top-K: {top_k}, Score threshold: {score_threshold}")

        # Generate query embedding
        query_embedding: np.ndarray = (
            self.embedding_manager.generate_embeddings([query])[0]
        )

        retrieved_docs: List[Dict[str, Any]] = []

        try:
            results = self.vector_store.collection.query(
                query_embeddings=[query_embedding.tolist()],
                n_results=top_k
            )

            documents = results.get("documents", [[]])[0]
            metadatas = results.get("metadatas", [[]])[0]
            distances = results.get("distances", [[]])[0]
            ids = results.get("ids", [[]])[0]

            if not documents:
                print("No matching documents found")
                return []

            for rank, (doc_id, document, metadata, distance) in enumerate(
                zip(ids, documents, metadatas, distances), start=1
            ):
                # Cosine similarity (requires normalized embeddings)
                similarity_score = 1.0 - distance

                if similarity_score >= score_threshold:
                    retrieved_docs.append({
                        "id": doc_id,
                        "content": document,
                        "metadata": metadata,
                        "similarity_score": similarity_score,
                        "distance": distance,
                        "rank": rank
                    })

            print(f"Retrieved {len(retrieved_docs)} documents")

        except Exception as e:
            print(f"Error during retrieval: {e}")
            raise

        return retrieved_docs
    
ragRetriever = RAGRetriever(vector_store, embedding_manager)

In [100]:
ragRetriever.retrieve("A scholarship scheme by the Ministry of Social Justice & Empowerment for regular")

Retrieving documents for query: 'A scholarship scheme by the Ministry of Social Justice & Empowerment for regular'
Top-K: 7, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches: 100%|██████████| 1/1 [00:00<00:00,  1.56it/s]


Generated embeddings with shape: (1, 384)
Retrieved 7 documents


[{'id': 'doc_0d0c810d_0',
  'content': '1. National Overseas Scholarship For Students With Disabilities \nDetails \nA scholarship scheme by the Ministry of Social Justice & Empowerment for regular, full-\ntime students with disabilities to obtain higher education viz., Master\'s degree, or Ph.D. \ncourses from foreign universities, in one of the speciﬁed ﬁelds of study. \nThe students with the speciﬁed disabilities deﬁned in Schedule of the "Rights of \nPersons with Disabilities Act, 2016" will be eligible under this scheme. This includes \npersons with visual, hearing, speech, loco-motor, mental retardation, and other \ndisabilities. NOS is implemented oƯline by the DEPwD. \nAt times SwDs are deprived of harnessing their latent skills and thereby miss the \nopportunity. This scheme aims to support SwDs to study further in order to prepare \nthemselves to earn their livelihood and to ﬁnd a digniﬁed place for themselves in the \nsociety as they face several barriers physical, ﬁnancial, 