### RAG Pipeline - Data Ingestion to Vector DB

In [1]:
%pip install -q langchain-text-splitters

import os
from pathlib import Path
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter


Note: you may need to restart the kernel to use updated packages.


In [2]:

def load_all_pdfs(pdf_directory):
    """
    Recursively load every PDF under a directory into document objects.

    Each file is read with PyMuPDFLoader. Every document the loader returns
    is tagged in its metadata with ``source_file`` (the PDF's file name) and
    ``file_type`` ("pdf"). A file that fails to load is reported and skipped,
    so one bad PDF does not abort the whole ingestion run.

    Args:
        pdf_directory: directory searched recursively for ``*.pdf`` files.

    Returns:
        list of all loaded documents, grouped by file in sorted path order.
    """
    all_documents=[]
    pdf_dir=Path(pdf_directory)

    # glob() yields entries in whatever order the filesystem reports them.
    # Sorting keeps the document order (and anything derived from it, such as
    # positional indexes assigned downstream) the same on every run/machine.
    pdf_files=sorted(pdf_dir.glob("**/*.pdf"))

    print(f"found {len(pdf_files)} PDF files in {pdf_directory}")

    for pdf in pdf_files:
        print(f"\nprocessing:{pdf.name}")
        try:
            loader=PyMuPDFLoader(str(pdf))
            documents=loader.load()

            # Record where each document came from for later filtering/citation.
            for doc in documents:
                doc.metadata["source_file"]=pdf.name
                doc.metadata["file_type"]="pdf"

            all_documents.extend(documents)
            print(f"loaded {len(documents)} pages from the {pdf.name}")

        except Exception as e:
            # Deliberate best-effort: report the failure and continue with the
            # remaining files.
            print(f"error loading {pdf.name}: {e}")

    print(f"\ntotal documents loaded: {len(all_documents)}")
    return all_documents

# Load every PDF found (recursively) under ../data; later cells read this list.
all_pdf_documents=load_all_pdfs("../data")





found 6 PDF files in ../data

processing:1_Final_DFA.pdf
loaded 160 pages from the 1_Final_DFA.pdf

processing:HackWithInfy Practice Questions 2025_Part 1.pdf
loaded 27 pages from the HackWithInfy Practice Questions 2025_Part 1.pdf

processing:HackWithInfy Practice Questions 2025_Part 2.pdf
loaded 6 pages from the HackWithInfy Practice Questions 2025_Part 2.pdf

processing:MSU Baroda _ OA Shortlist.pdf
loaded 2 pages from the MSU Baroda _ OA Shortlist.pdf

processing:SP-DSE Process - Sample Question Paper_Nov. 2025.pdf
loaded 16 pages from the SP-DSE Process - Sample Question Paper_Nov. 2025.pdf

processing:SSBEIII_AJT_AssignmentList (1).pdf
loaded 2 pages from the SSBEIII_AJT_AssignmentList (1).pdf

total documents loaded: 213


In [3]:
# Preview only the first two documents; displaying the whole list floods the
# notebook output and bloats the saved file.
all_pdf_documents[:2]

[Document(metadata={'producer': 'Microsoft: Print To PDF', 'creator': '', 'creationdate': '2024-09-18T14:45:11+05:30', 'source': '..\\data\\pdf\\1_Final_DFA.pdf', 'file_path': '..\\data\\pdf\\1_Final_DFA.pdf', 'total_pages': 160, 'format': 'PDF 1.7', 'title': 'Microsoft PowerPoint - 1_Final_DFA (2) (4) [Compatibility Mode]', 'author': '', 'subject': '', 'keywords': '', 'moddate': '2024-09-18T14:45:11+05:30', 'trapped': '', 'modDate': "D:20240918144511+05'30'", 'creationDate': "D:20240918144511+05'30'", 'page': 0, 'source_file': '1_Final_DFA.pdf', 'file_type': 'pdf'}, page_content='Deterministic\nFinite Automata\n9/18/2024\n1\nFinite Automata\nAnd Regular Languages'),
 Document(metadata={'producer': 'Microsoft: Print To PDF', 'creator': '', 'creationdate': '2024-09-18T14:45:11+05:30', 'source': '..\\data\\pdf\\1_Final_DFA.pdf', 'file_path': '..\\data\\pdf\\1_Final_DFA.pdf', 'total_pages': 160, 'format': 'PDF 1.7', 'title': 'Microsoft PowerPoint - 1_Final_DFA (2) (4) [Compatibility Mode]

In [4]:
### Split the loaded documents into chunks

from langchain_core import documents


def split_documents(documents,chunk_size=1000,chunk_overlap=200):
    """
    Split documents into overlapping, character-measured chunks.

    Args:
        documents: documents to split; passed to
            ``RecursiveCharacterTextSplitter.split_documents``.
        chunk_size: maximum chunk length, measured with ``len`` (characters).
        chunk_overlap: number of characters repeated at the start of the next
            chunk so that context carries across the chunk boundary.

    Returns:
        list of chunk documents produced by the splitter.
    """
    text_splitter=RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len, # chunk length is counted in characters
        # Separators are tried in order: paragraph breaks, line breaks, spaces,
        # and finally "" (split anywhere), so natural break points are
        # preferred over cutting in the middle of a word.
        separators=["\n\n","\n"," ",""]
    )

    # Use the ``documents`` argument rather than the notebook-global
    # ``all_pdf_documents``, so the function splits whatever it is given.
    split_docs=text_splitter.split_documents(documents)
    print(f"split {len(documents)} documents into {len(split_docs)} chunks")

    # Show one sample chunk as a quick sanity check (skipped when empty).
    if split_docs:
        print("\n example chunk:")
        print(f"content: {split_docs[0].page_content[:200]}")
        print(f"metadata: {split_docs[0].metadata}")
    return split_docs

chunks=split_documents(all_pdf_documents)
# Preview only the first two chunks; displaying the whole list floods the
# notebook output and bloats the saved file.
chunks[:2]

split 213 documents into 268 chunks

 example chunk:
content: Deterministic
Finite Automata
9/18/2024
1
Finite Automata
And Regular Languages
metadata: {'producer': 'Microsoft: Print To PDF', 'creator': '', 'creationdate': '2024-09-18T14:45:11+05:30', 'source': '..\\data\\pdf\\1_Final_DFA.pdf', 'file_path': '..\\data\\pdf\\1_Final_DFA.pdf', 'total_pages': 160, 'format': 'PDF 1.7', 'title': 'Microsoft PowerPoint - 1_Final_DFA (2) (4) [Compatibility Mode]', 'author': '', 'subject': '', 'keywords': '', 'moddate': '2024-09-18T14:45:11+05:30', 'trapped': '', 'modDate': "D:20240918144511+05'30'", 'creationDate': "D:20240918144511+05'30'", 'page': 0, 'source_file': '1_Final_DFA.pdf', 'file_type': 'pdf'}


[Document(metadata={'producer': 'Microsoft: Print To PDF', 'creator': '', 'creationdate': '2024-09-18T14:45:11+05:30', 'source': '..\\data\\pdf\\1_Final_DFA.pdf', 'file_path': '..\\data\\pdf\\1_Final_DFA.pdf', 'total_pages': 160, 'format': 'PDF 1.7', 'title': 'Microsoft PowerPoint - 1_Final_DFA (2) (4) [Compatibility Mode]', 'author': '', 'subject': '', 'keywords': '', 'moddate': '2024-09-18T14:45:11+05:30', 'trapped': '', 'modDate': "D:20240918144511+05'30'", 'creationDate': "D:20240918144511+05'30'", 'page': 0, 'source_file': '1_Final_DFA.pdf', 'file_type': 'pdf'}, page_content='Deterministic\nFinite Automata\n9/18/2024\n1\nFinite Automata\nAnd Regular Languages'),
 Document(metadata={'producer': 'Microsoft: Print To PDF', 'creator': '', 'creationdate': '2024-09-18T14:45:11+05:30', 'source': '..\\data\\pdf\\1_Final_DFA.pdf', 'file_path': '..\\data\\pdf\\1_Final_DFA.pdf', 'total_pages': 160, 'format': 'PDF 1.7', 'title': 'Microsoft PowerPoint - 1_Final_DFA (2) (4) [Compatibility Mode]

### Embedding & Vector DB

In [5]:
import numpy as np
from sentence_transformers import SentenceTransformer
import chromadb
import uuid
from typing  import List , Dict , Any , Tuple
from sklearn.metrics.pairwise import cosine_similarity

In [6]:
class EmbeddingManager:
    """Loads a SentenceTransformer model and turns lists of texts into embeddings."""

    def __init__(self,model_name:str="all-MiniLM-L6-v2"):
        """
        Initialize the embedding manager and load the model immediately.

        Args:
            model_name: HuggingFace model name for sentence embeddings
        """
        self.model_name=model_name
        self.model=None
        # _load_model is private by convention; it runs eagerly here so the
        # model is ready as soon as the manager has been constructed.
        self._load_model()


    def _load_model(self):
        """
        Load the SentenceTransformer model named by ``self.model_name``.

        Raises:
            Exception: whatever the model constructor raised, re-raised
                unchanged after the failure has been printed.
        """
        try:
            print(f"Loading embedding model:{self.model_name}")
            self.model=SentenceTransformer(self.model_name)
            print(f"Model loaded successfully. Embedding dimension:{self.model.get_sentence_embedding_dimension()}")
        except Exception as e:
            print(f"error loading model {self.model_name}:{e}")
            # Bare raise keeps the original traceback intact.
            raise

    def generate_embeddings(self,texts:List[str])->np.ndarray:
        """
        Generate embeddings for a list of texts

        Args:
            texts: list of text strings to embed

        Returns:
            numpy array of embeddings with shape (len(texts),embedding_dim)

        Raises:
            ValueError: if no model has been loaded.
        """
        # Compare against None explicitly: a model object may define its own
        # truthiness (e.g. through __len__), which would make ``not model``
        # unreliable as a "not loaded" check.
        if self.model is None:
            raise ValueError("Model not loaded. Call _load_model() to load the model before generating embeddings.")

        print(f"Generating embeddings fro {len(texts)} texts")
        embeddings=self.model.encode(texts,show_progress_bar=True)
        print(f"Generated embedings with stage:{embeddings.shape}")
        return embeddings
    
# Build the manager with the default model name; the constructor loads the
# model right away, so this cell prints the loading messages.
embedding_manager=EmbeddingManager()
embedding_manager

 
    

Loading embedding model:all-MiniLM-L6-v2


Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

[1mBertModel LOAD REPORT[0m from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


Model loaded successfully. Embedding dimension:384


<__main__.EmbeddingManager at 0x1c3c47ad550>

### Vector Store

In [7]:
class VectorStore:
    """Manages a vector store using ChromaDB for storing and retrieving document embeddings."""

    def __init__(self,collection_name:str ="pdf_documents",persist_directory: str ="../data/vector_store"):
        """
        Create the store and open (or create) its collection.

        Args:
            collection_name: name of the ChromaDB collection to use.
            persist_directory: directory handed to the persistent ChromaDB
                client; it is created if it does not exist.
        """
        self.collection_name=collection_name
        self.persist_directory=persist_directory
        self.client=None
        self.collection=None
        self._initialize_store()

    def _initialize_store(self):
        """
        Initialize ChromaDB client and collection

        Raises:
            Exception: whatever directory creation or ChromaDB raised,
                re-raised unchanged after the failure has been printed.
        """

        try:
            os.makedirs(self.persist_directory,exist_ok=True)
            self.client=chromadb.PersistentClient(path=self.persist_directory)

            self.collection=self.client.get_or_create_collection(
                name=self.collection_name,
                metadata={"description":"PDF document embeddings for RAG"}
            )
            print(f"vector store initialize. collection:{self.collection_name}")
            print(f"Existing documents in collection:{self.collection.count()}")

        except Exception as e:
            print(f"Error initializing vector store: {e}")
            # Bare raise keeps the original traceback intact.
            raise


    def add_documents(self,documents:List[Any],embeddings:np.ndarray) -> None:

        """
        Add documents and their embeddings to the vector store.

        A fresh ID is generated for every document on each call, so callers do
        not supply IDs. Each stored metadata dict is a copy of the document's
        metadata plus ``doc_index`` (position in ``documents``) and
        ``context_length`` (``len`` of the document text).

        Args:
            documents: list of documents to add; each must expose
                ``page_content`` and ``metadata``.
            embeddings: numpy array of embeddings corresponding to the documents, with shape (len(documents), embedding_dim)

        Raises:
            ValueError: if ``documents`` and ``embeddings`` differ in length.
            Exception: whatever the collection raised while adding, re-raised
                unchanged after the failure has been printed.
        """

        if(len(documents)!=len(embeddings)):
            raise ValueError("number of documents and embeddings must be the same" )

        # Nothing to store: return early instead of sending an empty batch
        # to the collection.
        if len(documents)==0:
            print("no documents to add to vector store")
            return

        print(f"adding {len(documents)} document to vector store...")

        #prepare data for chroma db
        ids=[]
        metadatas=[]
        documents_text=[]
        embeddings_list=[]

        for i,(doc,embedding) in enumerate(zip(documents,embeddings)):

            # Random hex prefix plus the position: duplicate documents still
            # get distinct IDs.
            doc_id=f"doc_{uuid.uuid4().hex[:8]}_{i}"
            ids.append(doc_id)

            # Copy so the caller's document metadata is not modified.
            metadata=dict(doc.metadata)
            metadata['doc_index']=i
            metadata['context_length']=len(doc.page_content)
            metadatas.append(metadata)

            documents_text.append(doc.page_content)

            # Store each embedding row as a plain Python list.
            embeddings_list.append(embedding.tolist())

        try:
            self.collection.add(
                ids=ids,
                embeddings=embeddings_list,
                metadatas=metadatas,
                documents=documents_text
            )
            print(f"succesfully added {len(documents)} documents to vector store")
            print(f"total documnets in collection:{self.collection.count()}")

        except Exception as e:
            print(f"error adding documents to vector store: {e}")
            # Bare raise keeps the original traceback intact.
            raise
        

# Open the store with its default collection name and persist directory; the
# constructor prints how many entries the collection already holds.
vector_store=VectorStore()
vector_store


vector store initialize. collection:pdf_documents
Existing documents in collection:0


<__main__.VectorStore at 0x1c3c71252b0>

In [8]:
# Preview only the first two chunks; displaying the whole list floods the
# notebook output and bloats the saved file.
chunks[:2]

[Document(metadata={'producer': 'Microsoft: Print To PDF', 'creator': '', 'creationdate': '2024-09-18T14:45:11+05:30', 'source': '..\\data\\pdf\\1_Final_DFA.pdf', 'file_path': '..\\data\\pdf\\1_Final_DFA.pdf', 'total_pages': 160, 'format': 'PDF 1.7', 'title': 'Microsoft PowerPoint - 1_Final_DFA (2) (4) [Compatibility Mode]', 'author': '', 'subject': '', 'keywords': '', 'moddate': '2024-09-18T14:45:11+05:30', 'trapped': '', 'modDate': "D:20240918144511+05'30'", 'creationDate': "D:20240918144511+05'30'", 'page': 0, 'source_file': '1_Final_DFA.pdf', 'file_type': 'pdf'}, page_content='Deterministic\nFinite Automata\n9/18/2024\n1\nFinite Automata\nAnd Regular Languages'),
 Document(metadata={'producer': 'Microsoft: Print To PDF', 'creator': '', 'creationdate': '2024-09-18T14:45:11+05:30', 'source': '..\\data\\pdf\\1_Final_DFA.pdf', 'file_path': '..\\data\\pdf\\1_Final_DFA.pdf', 'total_pages': 160, 'format': 'PDF 1.7', 'title': 'Microsoft PowerPoint - 1_Final_DFA (2) (4) [Compatibility Mode]

## Convert the text to embeddings

In [12]:
# extraxt the text
# Extract the text: one string per chunk, in the same order as ``chunks`` so
# that row i of the embeddings lines up with chunks[i].
texts=[doc.page_content for doc in chunks]

# Generate the embeddings
embeddings=embedding_manager.generate_embeddings(texts)

# Store in the vector DB
# NOTE(review): add_documents generates new random IDs on every call and the
# store uses a persistent client, so re-running this cell appears to append
# the same chunks again - confirm whether that is intended.
vector_store.add_documents(chunks,embeddings)

Generating embeddings fro 268 texts


Batches:   0%|          | 0/9 [00:00<?, ?it/s]

Generated embedings with stage:(268, 384)
adding 268 document to vector store...
succesfully added 268 documents to vector store
total documnets in collection:268
