Data Ingestion

In [1]:
###document structure

from langchain_core.documents import Document

In [2]:
# create a sample txt file
import os
os.makedirs("../data/text_files", exist_ok=True)


In [3]:
# Teaxt Loader


from langchain_community.document_loaders import TextLoader

loader = TextLoader("../data/text_files/python_intro.txt", encoding="utf8")
document = loader.load()
print(document)

  from .autonotebook import tqdm as notebook_tqdm


[Document(metadata={'source': '../data/text_files/python_intro.txt'}, page_content='Python is a popular, high-level programming language known for its simplicity and readability. It was designed to allow developers to express ideas in fewer lines of code compared to many other languages, making it an excellent choice for beginners as well as experienced programmers. Python’s clean syntax emphasizes clarity, which helps reduce the cognitive load when learning how programs work.\n\nOne of Python’s key features is that it is an interpreted language. This means code is executed line by line, which makes debugging easier and allows for rapid experimentation. Python also uses dynamic typing, so variables do not need explicit type declarations, enabling faster development and more flexible coding styles.\n\nPython supports multiple programming paradigms, including procedural, object-oriented, and functional programming. Basic building blocks such as variables, data types (integers, floats, st

In [4]:
### directory loader
from langchain_community.document_loaders import DirectoryLoader

# Load all .txt files from the specified directory
dir_loader = DirectoryLoader(
    "../data/text_files", 
    glob="**/*.txt",   #pattern to match files
    loader_cls = TextLoader,    #loader class to use for each file
    loader_kwargs = {"encoding":"utf8"},
    show_progress = True   #another librarry reqired to shoe progress (tqdm)
    )

documents = dir_loader.load()
print(f"Number of documents loaded: {len(documents)}")
documents

100%|██████████| 2/2 [00:00<00:00, 1103.33it/s]

Number of documents loaded: 2





[Document(metadata={'source': '..\\data\\text_files\\machine_learning.txt'}, page_content='Machine learning is a branch of artificial intelligence that focuses on enabling systems to learn patterns from data rather than relying solely on explicit programming. At its core, a machine learning model is exposed to examples, identifies relationships within them, and then uses those relationships to make predictions or decisions on new, unseen data.\n\nA typical machine learning workflow begins with data collection and preprocessing. Raw data often contains noise, missing values, or inconsistencies, so cleaning and transforming it is a critical step. Once prepared, the data is split into training and testing sets to evaluate how well a model generalizes beyond the data it has already seen.\n\nThere are several major categories of machine learning. Supervised learning uses labeled data and is commonly applied to tasks such as image classification, spam detection, and price prediction. Unsuper

In [5]:
from langchain_community.document_loaders import PyPDFLoader, PyMuPDFLoader

# Load all .txt files from the specified directory
dir_loader = DirectoryLoader(
    "../data/pdf", 
    glob="**/*.pdf",   #pattern to match files
    loader_cls = PyMuPDFLoader,    #loader class to use for each file

    show_progress = True   #another librarry reqired to shoe progress (tqdm)
    )

pdf_documents = dir_loader.load()
print(f"Number of documents loaded: {len(pdf_documents)}")
pdf_documents

100%|██████████| 35/35 [00:04<00:00,  8.65it/s]

Number of documents loaded: 3000





[Document(metadata={'producer': 'cairo 1.17.4 (https://cairographics.org)', 'creator': 'Mozilla Firefox', 'creationdate': '2023-06-26T15:59:50+05:30', 'source': '..\\data\\pdf\\Additive Manufacturing END SEM 2023 QP.pdf', 'file_path': '..\\data\\pdf\\Additive Manufacturing END SEM 2023 QP.pdf', 'total_pages': 2, 'format': 'PDF 1.5', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'moddate': '2025-06-05T12:19:17+05:30', 'trapped': '', 'modDate': "D:20250605121917+05'30'", 'creationDate': "D:20230626155950+05'30", 'page': 0}, page_content='Total No. of Questions : 8]\n[Total No. of Pages : 2\n[6004]-613\nB. E. (Mechanical Engineering)\nELECTIVE IV: ADDITIVE MANUFACTURING\n(2019 Pattern) (Semester - VII) (402045C)\nTime : 2½ Hours]\n[Max. Marks : 70\nInstructions to the candidates:\n1)\nSolve Q.1 or Q.2, Q.3 or Q.4, Q.5 or Q.6, Q.7 or Q.8\n2)\nNeat diagrams must be drawn wherever necessary.\n3)\nFigures to the right indicate full marks.\n4)\nUse of electronic pocket calculator i

Chunking

In [6]:
# Creating Data Chunks 

from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import DirectoryLoader, PyMuPDFLoader
from pathlib import Path

def split_documents(documents, chunk_size=1000, chunk_overlap=200):
    """
    Split documents into smaller chunks for better RAG performance.

    Accepts:
    - list[Document]
    - str / Path (directory containing PDFs)
    """

    # ---- NORMALIZE INPUT ----
    if isinstance(documents, (str, Path)):
        loader = DirectoryLoader(
            str(documents),
            glob="**/*.pdf",
            loader_cls=PyMuPDFLoader,
            show_progress=True
        )
        documents = loader.load()

    if not isinstance(documents, list):
        raise TypeError(
            f"Expected list of Documents or path, got {type(documents)}"
        )

    # ---- SPLITTER ----
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        separators=["\n\n", "\n", " ", ""]
    )

    split_docs = text_splitter.split_documents(documents)
    print(f"Split {len(documents)} documents into {len(split_docs)} chunks")

    # ---- DEBUG SAMPLE ----
    if split_docs:
        print("\nExample chunk:")
        print(f"Content: {split_docs[0].page_content[:200]}...")
        print(f"Metadata: {split_docs[0].metadata}")

    return split_docs


In [7]:
chunks = split_documents(pdf_documents)
chunks

Split 3000 documents into 2199 chunks

Example chunk:
Content: Total No. of Questions : 8]
[Total No. of Pages : 2
[6004]-613
B. E. (Mechanical Engineering)
ELECTIVE IV: ADDITIVE MANUFACTURING
(2019 Pattern) (Semester - VII) (402045C)
Time : 2½ Hours]
[Max. Marks...
Metadata: {'producer': 'cairo 1.17.4 (https://cairographics.org)', 'creator': 'Mozilla Firefox', 'creationdate': '2023-06-26T15:59:50+05:30', 'source': '..\\data\\pdf\\Additive Manufacturing END SEM 2023 QP.pdf', 'file_path': '..\\data\\pdf\\Additive Manufacturing END SEM 2023 QP.pdf', 'total_pages': 2, 'format': 'PDF 1.5', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'moddate': '2025-06-05T12:19:17+05:30', 'trapped': '', 'modDate': "D:20250605121917+05'30'", 'creationDate': "D:20230626155950+05'30", 'page': 0}


[Document(metadata={'producer': 'cairo 1.17.4 (https://cairographics.org)', 'creator': 'Mozilla Firefox', 'creationdate': '2023-06-26T15:59:50+05:30', 'source': '..\\data\\pdf\\Additive Manufacturing END SEM 2023 QP.pdf', 'file_path': '..\\data\\pdf\\Additive Manufacturing END SEM 2023 QP.pdf', 'total_pages': 2, 'format': 'PDF 1.5', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'moddate': '2025-06-05T12:19:17+05:30', 'trapped': '', 'modDate': "D:20250605121917+05'30'", 'creationDate': "D:20230626155950+05'30", 'page': 0}, page_content='Total No. of Questions : 8]\n[Total No. of Pages : 2\n[6004]-613\nB. E. (Mechanical Engineering)\nELECTIVE IV: ADDITIVE MANUFACTURING\n(2019 Pattern) (Semester - VII) (402045C)\nTime : 2½ Hours]\n[Max. Marks : 70\nInstructions to the candidates:\n1)\nSolve Q.1 or Q.2, Q.3 or Q.4, Q.5 or Q.6, Q.7 or Q.8\n2)\nNeat diagrams must be drawn wherever necessary.\n3)\nFigures to the right indicate full marks.\n4)\nUse of electronic pocket calculator i

embedding and vector store DB

In [8]:
import numpy as np  
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings
import uuid
from typing import List, Dict, Any, Tuple
from sklearn.metrics.pairwise import cosine_similarity
import os
from pydoc import doc


In [9]:
class EmbeddingManager:
    
    #fnadles document embeddings generation using sentence transformer and storage in ChromaDB
    def __init__(self, model_name: str = "all-MiniLM-L6-v2"):

        # Initialize the embedding manager
        '''args:
                model_name: HuggingFace model name for sentece embeddings
        '''
        self.model_name = model_name
        self.model = None
        self._load_model()
        
    def _load_model(self):
        # Load the sentence transformer model
        try:
            print(f"Loading embedding model: {self.model_name}")
            self.model = SentenceTransformer(self.model_name)
            print(f"Model Loaded successfully. Embedding dimension: {self.model.get_sentence_embedding_dimension()}")

        except Exception as e:
            print(f"Error loading model: {e}")
            raise 

    def generate_embeddings(self, texts: List[str]) -> np.ndarray:
        ''' Generate embeddings for a list of texts
        args:
            texts: List of strings to generate embeddings for   
        returns:
            np.ndarray: Array of embeddings with shape (len(texts), embedding_dimension)
        '''
        if not self.model:
            raise ValueError("Embedding model is not loaded.")
        
        print(f"Generating embeddings for {len(texts)} texts...")
        embeddings = self.model.encode(texts, convert_to_numpy=True)
        print (f"Generated embeddings shape: {embeddings.shape}")
        return embeddings
        
    def get_embedding_dimension(self) -> int:
        ''' Get the dimension of the embeddings generated by the model
        returns:
            int: Embedding dimension
        '''
        if not self.model:
            raise ValueError("Embedding model is not loaded.")
        return self.model.get_sentence_embedding_dimension()
    
#initialize the embedding manager
embedding_manager = EmbeddingManager(model_name="all-MiniLM-L6-v2")
embedding_manager
        

Loading embedding model: all-MiniLM-L6-v2
Model Loaded successfully. Embedding dimension: 384


<__main__.EmbeddingManager at 0x12e66edf4d0>

Vector Store

In [10]:
class VectorStore:
    #Handles storage and retrieval of embeddings using ChromaDB
    def __init__(self, collection_name: str = "pdf_document", persist_directory: str = "../data/vector_store"):
        '''
        Initialize the vector store 

        args:
            collection_name: Name of the ChromaDB collection
            persist_directory: Directory to persist the ChromaDB data
        '''
        self.collection_name = collection_name
        self.persist_directory = persist_directory
        self.client = None
        self.collection = None
        self._initialize_store()
        
    def _initialize_store(self):
        # Initialize ChromaDB client and collection

        try:
            #create persistent ChromaDB client
            print("Initializing ChromaDB client...")
            os.makedirs(self.persist_directory, exist_ok=True)
            self.client = chromadb.PersistentClient(path=self.persist_directory)

            #get or create collection
            self.collection = self.client.get_or_create_collection(
                name=self.collection_name, 
                metadata={"description": "PDF Document Embeddings for RAG"}
                )
            
            print(f"Vector store initialized with collection: {self.collection_name}")
            print(f"existing documents in collection: {self.collection.count()}")

        except Exception as e:
            print(f"Error initializing ChromaDB: {e}")
            raise

    def add_documents(self, documents: List[Any], embedding: np.ndarray):
        '''
        Add documents and their embeddings to the vector store after generating embeddings

        args:
            documents: List of Document objects to add
            embedding_manager: Instance of EmbeddingManager to generate embeddings
        '''
        if len(documents) != len(embedding):
            raise ValueError("Number of documents and embeddings must match.")
        
        print(f"Adding {len(documents)} documents to the vector store...")
        
        # prepare data for chromadb
        ids =[]
        metadatas = []
        document_texts = []
        embeddings_list = []

        for i,(doc, embedding) in enumerate (zip(documents, embedding)):
            #generate unique ID
            doc_id = f"doc_{uuid.uuid4().hex[:8]}_{i}"
            ids.append(doc_id)

            #prepare metadata
            metadata = dict(doc.metadata)  # Copy existing metadata
            metadata['doc_index'] = i  # Add document index
            metadata['content_length'] = len(doc.page_content)  # Add content length
            metadatas.append(metadata)

            #document content
            document_texts.append(doc.page_content)

            #document embedding
            embeddings_list.append(embedding.tolist())  # Convert np.ndarray to list for ChromaDB
        
        #add to chromadb collection
        try:
            self.collection.add(
                ids=ids,
                metadatas=metadatas,
                documents=document_texts,
                embeddings=embeddings_list
            )
            print(f"Successfully added {len(documents)} documents to the vector store.")
            print(f"Total documents in collection now: {self.collection.count()}")

        except Exception as e:
            print(f"Error adding documents to vector store: {e}")
            raise

#initialize vector store
vectorstore = VectorStore()
vectorstore

Initializing ChromaDB client...
Vector store initialized with collection: pdf_document
existing documents in collection: 0


<__main__.VectorStore at 0x12e721f8980>

In [11]:
chunks

[Document(metadata={'producer': 'cairo 1.17.4 (https://cairographics.org)', 'creator': 'Mozilla Firefox', 'creationdate': '2023-06-26T15:59:50+05:30', 'source': '..\\data\\pdf\\Additive Manufacturing END SEM 2023 QP.pdf', 'file_path': '..\\data\\pdf\\Additive Manufacturing END SEM 2023 QP.pdf', 'total_pages': 2, 'format': 'PDF 1.5', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'moddate': '2025-06-05T12:19:17+05:30', 'trapped': '', 'modDate': "D:20250605121917+05'30'", 'creationDate': "D:20230626155950+05'30", 'page': 0}, page_content='Total No. of Questions : 8]\n[Total No. of Pages : 2\n[6004]-613\nB. E. (Mechanical Engineering)\nELECTIVE IV: ADDITIVE MANUFACTURING\n(2019 Pattern) (Semester - VII) (402045C)\nTime : 2½ Hours]\n[Max. Marks : 70\nInstructions to the candidates:\n1)\nSolve Q.1 or Q.2, Q.3 or Q.4, Q.5 or Q.6, Q.7 or Q.8\n2)\nNeat diagrams must be drawn wherever necessary.\n3)\nFigures to the right indicate full marks.\n4)\nUse of electronic pocket calculator i

In [12]:
#convert chunks to texts for embedding generation
texts = [doc.page_content for doc in chunks]

#generate embeddings for the chunks
embeddings = embedding_manager.generate_embeddings(texts)

#store chunks and embeddings in vector store
vectorstore.add_documents(chunks, embeddings)

Generating embeddings for 2199 texts...
Generated embeddings shape: (2199, 384)
Adding 2199 documents to the vector store...
Successfully added 2199 documents to the vector store.
Total documents in collection now: 2199


Retrieve Pipeline From the Vector Store

In [13]:
class RAGRetriever:
    #Handles retrieval of relevant documents from vector store based on query embeddings
    
    def __init__(self, vector_store: VectorStore, embedding_manager: EmbeddingManager, ):
        '''
        Initialize the RAG retriever

        args:
            vector_store: Instance of VectorStore for retrieval
            embedding_manager: Instance of EmbeddingManager for query embeddings
            top_k: Number of top similar documents to retrieve
        '''
        self.vector_store = vector_store
        self.embedding_manager = embedding_manager
        

    def retrieve(self, query: str, top_k: int = 5, score_threshold: float = 0.0) -> List[Dict[str, Any]]:
        '''
        Retrieve top-k relevant documents for a given query

        args:
            query: User query string
            top_k: Number of top similar documents to retrieve
            score_threshold: Minimum similarity score threshold for filtering results

        returns:
            List of metadata dictionaries for the top-k similar documents and metadata
        '''
        print(f"Retrieving top {top_k} documents for query: '{query}'")
        print(f"Top K: {top_k} Using score threshold: {score_threshold}")

        #generate embedding for the query
        query_embedding = self.embedding_manager.generate_embeddings([query])[0]

        #serch in vector store 
        try:
            results = self.vector_store.collection.query(
            query_embeddings=[query_embedding.tolist()],
            n_results=top_k
        )
        #extract relevant documents and metadata
            retrieved_docs = []

            if results['documents'] and results['metadatas'][0]:
                documents = results.get('distances')[0]
                metadatas = results.get('metadatas')[0]
                distances = results.get('distances')[0]
                ids = results.get('ids')[0]

                for i, (doc_id, document, metadata, distance ) in enumerate(zip(ids, documents, metadatas, distances)):
                    #convert distance to similarity score
                    similarity_score = 1 - distance  

                    if similarity_score >= score_threshold:
                        retrieved_docs.append({
                            "id": doc_id,
                            "document": document,
                            "metadata": metadata,
                            "similarity_score": similarity_score,
                            "distance": distance,
                            "rank": i+ 1
                    })
                print(f"Retrieved {len(retrieved_docs)} documents after applying score threshold.")
            else:
                print("No documents retrieved from vector store.")
        
            return retrieved_docs
    
        except Exception as e:
            print(f"Error during retrieval: {e}")
            raise

#initialize RAG retriever
rag_retriever = RAGRetriever(vector_store=vectorstore, embedding_manager=embedding_manager)

In [14]:
rag_retriever

<__main__.RAGRetriever at 0x12e721faa50>

In [15]:
rag_retriever.retrieve("What is Python programming?", top_k=3, score_threshold=0.1)

Retrieving top 3 documents for query: 'What is Python programming?'
Top K: 3 Using score threshold: 0.1
Generating embeddings for 1 texts...
Generated embeddings shape: (1, 384)
Retrieved 0 documents after applying score threshold.


[]

In [16]:
rag_retriever.retrieve("Who is Elon Musk?")

Retrieving top 5 documents for query: 'Who is Elon Musk?'
Top K: 5 Using score threshold: 0.0
Generating embeddings for 1 texts...
Generated embeddings shape: (1, 384)
Retrieved 0 documents after applying score threshold.


[]

In [17]:
rag_retriever.retrieve("tell me something about OS.", top_k=3, score_threshold=0.1)

Retrieving top 3 documents for query: 'tell me something about OS.'
Top K: 3 Using score threshold: 0.1
Generating embeddings for 1 texts...
Generated embeddings shape: (1, 384)
Retrieved 0 documents after applying score threshold.


[]

In [241]:
rag_retriever.retrieve("tell me something about turbo machinery.")

Retrieving top 5 documents for query: 'tell me something about turbo machinery.'
Top K: 5 Using score threshold: 0.0
Generating embeddings for 1 texts...
Generated embeddings shape: (1, 384)
Retrieved 5 documents after applying score threshold.


[{'id': 'doc_d37a9d9b_3531',
  'document': 0.2872103452682495,
  'metadata': {'trapped': '',
   'creationDate': '',
   'source': '..\\data\\pdf\\Turbo Short Notes.pdf',
   'subject': '',
   'keywords': '',
   'doc_index': 3531,
   'total_pages': 403,
   'content_length': 26,
   'author': '',
   'format': 'PDF 1.4',
   'modDate': '',
   'creationdate': '',
   'file_path': '..\\data\\pdf\\Turbo Short Notes.pdf',
   'moddate': '',
   'producer': '',
   'creator': 'Google',
   'title': '',
   'page': 3},
  'similarity_score': 0.7127896547317505,
  'distance': 0.2872103452682495,
  'rank': 1},
 {'id': 'doc_29fbfcd9_3534',
  'document': 0.43703293800354004,
  'metadata': {'producer': '',
   'total_pages': 403,
   'moddate': '',
   'format': 'PDF 1.4',
   'content_length': 24,
   'creationDate': '',
   'author': '',
   'page': 6,
   'title': '',
   'subject': '',
   'trapped': '',
   'modDate': '',
   'file_path': '..\\data\\pdf\\Turbo Short Notes.pdf',
   'creator': 'Google',
   'creationdat