### RAG Pipelines- Data Ingestion to Vector DB Pipeline 

In [1]:
import os
from langchain_community.document_loaders import PyPDFLoader, PyMuPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from pathlib import Path

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def process_all_pdf(pdf_directory):
    """Process all PDF files in a directory."""
    all_documents = []
    pdf_dir = Path(pdf_directory)

    pdf_files = list(pdf_dir.glob("**/*.pdf"))

    print(f"Found {len(pdf_files)} PDF files to process.")

    for pdf_file in pdf_files:
        print(f"\n Processing file: {pdf_file.name}")
        try:
            loader = PyMuPDFLoader(str(pdf_file))
            documents = loader.load()

            for doc in documents:
                doc.metadata["source_file"]= pdf_file.name
                doc.metadata["file_type"]= "pdf"
            
            all_documents.extend(documents)
            print(f"Loaded {len(documents)} pages from {pdf_file.name}")

        except Exception as e:
            print(f"Error loading {pdf_file.name}: {e}")

    print(f"\nTotal documents loaded: {len(all_documents)}")
    return all_documents            

all_pdf_documents=process_all_pdf("../data/pdf")



Found 1 PDF files to process.

 Processing file: agent_ppt.pdf
Loaded 43 pages from agent_ppt.pdf

Total documents loaded: 43


In [4]:
def split_documents(documents, chunk_size=1000, chunk_overlap=200):
    """Split the document into smaller chunks for better RAG performance."""
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        separators=["\n\n","\n"," ",""]
    )
    split_docs = text_splitter.split_documents(documents)
    print(f"Split {len(documents)} documents into {len(split_docs)} chunks.")


    if split_docs:
        print("Sample chunk")
        print(f"Content: {split_docs[0].page_content[:200]}....")
        print(f"Metadata: {split_docs[0].metadata}")

    return split_docs


In [5]:
chunks = split_documents(all_pdf_documents)

Split 43 documents into 46 chunks.
Sample chunk
Content: LangChain Essentials
1....
Metadata: {'producer': '', 'creator': 'Google', 'creationdate': '', 'source': '..\\data\\pdf\\agent_ppt.pdf', 'file_path': '..\\data\\pdf\\agent_ppt.pdf', 'total_pages': 43, 'format': 'PDF 1.4', 'title': 'LangChain V1 Essentials', 'author': '', 'subject': '', 'keywords': '', 'moddate': '', 'trapped': '', 'modDate': '', 'creationDate': '', 'page': 0, 'source_file': 'agent_ppt.pdf', 'file_type': 'pdf'}


### Embedding and Vector Store DB

In [6]:
import numpy as np
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings
import uuid
from typing import List, Dict, Any, Tuple
from sklearn.metrics.pairwise import cosine_similarity

In [8]:
class EmbeddingManager:
    """Handles Documents Embedding Generation using Sentence Transformers"""

    def __init__(self, model_name:str="all-MiniLM-L6-v2"):
        """
        Initialize the Embedding Manager with a specified model.

        Args:
            model_name (str): HuggingFace model name for Sentence Embedding
        """
        self.model_name = model_name
        self.model = None
        self._load_model()


    def _load_model(self):
        """Load the Sentence Transformers model."""
        try:
            print(f"Loading Embedding model: {self.model_name}")
            self.model = SentenceTransformer(self.model_name)
            print(f"Model Loaded Successfully. Embedding dimensions: {self.model.get_sentence_embedding_dimension()}")
        except Exception as e:
            print(f"Error loading model {self.model_name}: {e}")
            raise

    def generate_embeddings(self, texts: List[str]) -> np.ndarray:
        """
        Generate embedding for a list of texts

        Args:
            texts(List[str]): List of text string to generate embedding for 

        Returns:
            numpy array for embeddings of shape (len(texts), embedding_dimension)
        """

        if not self.model:
            raise ValueError("Model Not Loaded Properly Try Loading Model Again.")
        
        print(f"Generating embedding for {len(texts)} texts....")
        embeddings = self.model.encode(texts, show_progress_bar=True)
        print(f"Generated embedding with shape : {embeddings.shape}")
        return embeddings
    

### intialize the embedding manager 
embedding_manager = EmbeddingManager()
embedding_manager

Loading Embedding model: all-MiniLM-L6-v2


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


Model Loaded Successfully. Embedding dimensions: 384


<__main__.EmbeddingManager at 0x25c910e8c20>

In [None]:
class VectorStore:
    """ Manages Documents embedding in a ChromaDB Vector Store """

    def __init__(self, collection_name: str="pdf_documents", persist_directory: str= "../data/vector_store"):
        """
        Initialize the Vector Store with ChromaDB

        Args: 
            collection_name (str): Name of the ChromaDB Collection
            persist_directory (str): Directory to persist the Vector store 
        """
        self.collection_name = collection_name
        self.persist_directory = persist_directory
        self.client = None
        self.collection = None
        self._initialize_store()

    def _initialize_store(self):
        """Initialize the ChromaDB Client and Collection"""
        try:
            os.makedirs(self.persist_directory, exist_ok=True)
            self.client = chromadb.PersistentClient(path = self.persist_directory)

            self.collection = self.client.get_or_create_collection(
                name=self.collection_name,
                metadata={"description": "PDF Documents Embedding for RAG"}
                )
            print(f"Vector Store Initialized. Collection: {self.collection_name}")
            print(f"Existing Documents in Collection: {self.collection.count()}")

        except Exception as e:
            print(f"Error Initializing Vector Store: {e}")
            raise 

    def add_documents(self, documents: List[Any], embedding: np.ndarray):
        """
        Add Documents and their Embedding to the vector Store


        Args:
            documents List[Any]: List of all the Langchain Documents 
            embedding np.ndarray: Corresponding Embedding to the documents 
        """
        if len(documents) != len(embedding):
            raise ValueError("Number of documents and embedding must match the number of embeddings.")
        
        print(f"Adding {len(documents)} documents to vector Store...")

        ids = []
        metadatas = []
        documents_text = []
        embeddings_list = []

        for i,(doc, embedding) in enumerate(zip(documents, embedding)):
            doc_id = f"doc_{uuid.uuid4().hex[:8]}_{i}"
            ids.append(doc_id)


            metadata = dict(doc.metadata)
            metadata['doc_index'] = i
            metadata['content_length'] = len(doc.page_content)
            metadatas.append(metadata)


            documents_text.append(doc.page_content)

            embeddings_list.append(embedding.tolist())

        try:
            self.collection.add(
                ids=ids,
                embeddings=embeddings_list,
                metadatas=metadatas,
                documents=documents_text
            )
            print(f"Successfully added {len(documents)} documents to the vector store")
            print(f"Total documents in collection: {self.collection.count()}")

        except Exception as e:
            print(f"Error adding documents to the vector store {e}")
            raise

vector_store = VectorStore()
vector_store

            

Vector Store Initialized. Collection: pdf_documents
Existing Documents in Collection: 0


<__main__.VectorStore at 0x25ca2bf9940>

In [None]:
# Lets Convert the chunk to embedding
texts=[doc.page_content for doc in chunks ]

embeddings=embedding_manager.generate_embeddings(texts)

vector_store.add_documents(chunks,embeddings)


Generating embedding for 46 texts....


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches: 100%|██████████| 2/2 [00:06<00:00,  3.02s/it]

Generated embedding with shape : (46, 384)
Adding 46 documents to vector Store...
Successfully added 46 documents to the vector store
Total documents in collection: 92
<__main__.VectorStore object at 0x0000025CA2BF9940>



