### Data Ingestion

In [23]:
### Document Data Structure
# A LangChain `Document` is built from two fields: the text itself
# (`page_content`) and a `metadata` dict describing that text.

from langchain_core.documents import Document

# Minimal instance; as the last expression of the cell, its repr is displayed.
Document(page_content="ok", metadata={})

Document(metadata={}, page_content='ok')

In [24]:
# Build a Document whose metadata records where the text came from:
# source file, page count, author and creation date.
doc = Document(
    metadata=dict(
        source="example.txt",
        pages=1,
        author="Anshuman",
        date_created="2026-01-27",
    ),
    page_content="this is the main content I am using to create a RAG",
)
doc  # last expression -> the cell displays the Document's repr

Document(metadata={'source': 'example.txt', 'pages': 1, 'author': 'Anshuman', 'date_created': '2026-01-27'}, page_content='this is the main content I am using to create a RAG')

In [25]:
### Create the directory that will hold the sample text files
import os
# exist_ok=True keeps the cell re-runnable: no error if the directory already exists.
os.makedirs('../data/text_files', exist_ok=True)

In [26]:
# Sample files to create, keyed by destination path.
# machine_learning_intro.txt is written here as well: the directory-loading
# output recorded in this notebook contains that file, so it has to be created
# by the notebook itself for a clean "Restart Kernel -> Run All" to reproduce it.
sample_text = {
    '../data/text_files/python_intro.txt': """python programming introduction""",
    '../data/text_files/machine_learning_intro.txt': """Machine learning introduction""",
}
for filepath, content in sample_text.items():
    # 'w' overwrites any previous copy, so re-running the cell is idempotent.
    with open(filepath, 'w', encoding='utf-8') as f:
        f.write(content)

print("Sample txt file created!!!")

Sample txt file created!!!


In [27]:
### TextLoader
from langchain_community.document_loaders import TextLoader

# Load a single text file, decoding it as UTF-8.
loader = TextLoader('../data/text_files/python_intro.txt', encoding='utf-8')
# load() returns a list of Document objects (a single one for this file,
# with the file path stored under metadata['source']).
document = loader.load()
print(document)

[Document(metadata={'source': '../data/text_files/python_intro.txt'}, page_content='python programming introduction')]


In [28]:
### Directory Loader
from langchain_community.document_loaders import DirectoryLoader

## Load every .txt file found under the directory ('**' also matches sub-directories)
dir_loader = DirectoryLoader(
    '../data/text_files',
    glob='**/*.txt', ## pattern to match the files
    loader_cls=TextLoader, ## loader class used for each matched file
    loader_kwargs={'encoding': 'utf-8'}, ## keyword arguments handed to loader_cls
    show_progress=False
)

# One Document per matched file; the bare name below displays the list.
documents=dir_loader.load()
documents

[Document(metadata={'source': '../data/text_files/python_intro.txt'}, page_content='python programming introduction'),
 Document(metadata={'source': '../data/text_files/machine_learning_intro.txt'}, page_content='Machine learning introduction')]

In [29]:
### PDF files
from langchain_community.document_loaders import PyPDFLoader, PyMuPDFLoader

### Directory Loader
# NOTE(review): DirectoryLoader is already imported by the text-file cell;
# this repeated import is redundant but harmless.
from langchain_community.document_loaders import DirectoryLoader

## Load all the PDF files from the directory
## NOTE: this rebinds `dir_loader`, which previously pointed at the text-file loader.
dir_loader = DirectoryLoader(
    '../data/pdf_files',
    glob='**/*.pdf', ## pattern to match the files
    loader_cls=PyMuPDFLoader, ## loader class used for each matched PDF
    show_progress=False
)

# The bare name below displays the loaded Documents, including extracted text and PDF metadata.
pdf_documents=dir_loader.load()
pdf_documents

[Document(metadata={'producer': 'xdvipdfmx (20250410)', 'creator': 'LaTeX with hyperref', 'creationdate': '2025-10-03T14:47:38+00:00', 'source': '../data/pdf_files/AnshResume.pdf', 'file_path': '../data/pdf_files/AnshResume.pdf', 'total_pages': 1, 'format': 'PDF 1.5', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'moddate': '', 'trapped': '', 'modDate': '', 'creationDate': 'D:20251003144738Z', 'page': 0}, page_content='Anshuman\nAspiring Full Stack Developer\nanshuman302004@gmail.com | +91 9871980990 | Faridabad, Haryana, India\nlinkedin.com/in/heyansh | github.com/Ansh30a | Portfolio\nSUMMARY\nHighly motivated MERN Stack Developer skilled in building scalable full-stack applications using MongoDB,\nExpress.js, React, and Node.js. Experienced in developing RESTful APIs and integrating third-party services.\nQuick learner with strong problem-solving and debugging skills.\nEXPERIENCE\nMERN Stack Developer Intern\nJun 2025 – Jul 2025\nCodec Technologies\n• Developed full-stack

In [30]:
# Check which class the loader produced for the first loaded item.
type(pdf_documents[0])

langchain_core.documents.base.Document

In [31]:
import os
from langchain_community.document_loaders import PyPDFLoader, PyMuPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from pathlib import Path

In [32]:
### Read all the PDFs inside the directory
def process_all_pdfs(pdf_directory) -> list:
    """Load every PDF found under ``pdf_directory`` into a flat list of documents.

    The directory is searched recursively for ``*.pdf`` files. Each file is
    loaded with ``PyPDFLoader`` and every document it returns is tagged with
    two extra metadata keys: ``source_file`` (the file name) and ``file_type``
    (always ``'pdf'``).

    Files are visited in sorted path order so the returned list has the same
    order on every run instead of depending on directory enumeration order.

    Loading is best-effort: a PDF that raises while loading is reported on
    stdout and skipped; the remaining files are still processed.

    Args:
        pdf_directory: Directory to search (string or path-like).

    Returns:
        All documents from the PDFs that loaded successfully; an empty list
        when no PDF was found or none could be loaded.
    """
    all_documents = []
    pdf_dir = Path(pdf_directory)

    # Find all PDF files recursively.
    # sorted(): glob yields entries in filesystem-dependent order.
    # is_file(): skip directories whose name merely ends in ".pdf".
    pdf_files = sorted(path for path in pdf_dir.glob("**/*.pdf") if path.is_file())

    print(f"Found {len(pdf_files)} PDF files to process")

    for pdf_file in pdf_files:
        print(f"\nProcessing: {pdf_file.name}")
        try:
            loader = PyPDFLoader(str(pdf_file))
            documents = loader.load()

            # Add source information to metadata
            for doc in documents:
                doc.metadata['source_file'] = pdf_file.name
                doc.metadata['file_type'] = 'pdf'

            all_documents.extend(documents)
            print(f"  ✓ Loaded {len(documents)} pages")

        except Exception as e:
            # Deliberately broad: one unreadable PDF must not abort the whole batch.
            print(f"  ✗ Error: {e}")

    print(f"\nTotal documents loaded: {len(all_documents)}")
    return all_documents

# Process all PDFs in the data directory.
# The search is recursive, so PDFs in sub-folders of ../data are picked up too.
all_pdf_documents = process_all_pdfs("../data")

Found 1 PDF files to process

Processing: AnshResume.pdf
  ✓ Loaded 1 pages

Total documents loaded: 1


In [33]:
# Display the loaded pages, including the added 'source_file' / 'file_type' metadata keys.
all_pdf_documents

[Document(metadata={'producer': 'xdvipdfmx (20250410)', 'creator': 'LaTeX with hyperref', 'creationdate': '2025-10-03T14:47:38+00:00', 'source': '../data/pdf_files/AnshResume.pdf', 'total_pages': 1, 'page': 0, 'page_label': '1', 'source_file': 'AnshResume.pdf', 'file_type': 'pdf'}, page_content='Anshuman\nAspiring Full Stack Developer\nanshuman302004@gmail.com | +91 9871980990 | Faridabad, Haryana, India\nlinkedin.com/in/heyansh | github.com/Ansh30a | Portfolio\nSUMMARY\nHighly motivated MERN Stack Developer skilled in building scalable full-stack applications using MongoDB,\nExpress.js, React, and Node.js. Experienced in developing RESTful APIs and integrating third-party services.\nQuick learner with strong problem-solving and debugging skills.\nEXPERIENCE\nMERN Stack Developer Intern Jun 2025 – Jul 2025\nCodec Technologies\n• Developed full-stack applications using the MERN stack.\n• Worked on designing and implementing scalable APIs, dynamic user interfaces, and seamless integratio

In [34]:
### Text splitting get into chunks

from langchain_text_splitters import RecursiveCharacterTextSplitter

def split_documents(documents,chunk_size=1000,chunk_overlap=200):
    """Break documents into chunks with a RecursiveCharacterTextSplitter.

    Args:
        documents: Documents to split.
        chunk_size: Value passed to the splitter as ``chunk_size``
            (lengths are measured with ``len``).
        chunk_overlap: Value passed to the splitter as ``chunk_overlap``.

    Returns:
        The list of chunks produced by the splitter. A short summary, plus a
        preview of the first chunk when there is one, is printed on the way.
    """
    splitter_options = {
        "chunk_size": chunk_size,
        "chunk_overlap": chunk_overlap,
        "length_function": len,
        # Candidate separators, listed from coarsest to finest.
        "separators": ["\n\n", "\n", " ", ""],
    }
    pieces = RecursiveCharacterTextSplitter(**splitter_options).split_documents(documents)
    print(f"Split {len(documents)} documents into {len(pieces)} chunks")

    # Nothing to preview when the splitter produced no chunks.
    if not pieces:
        return pieces

    # Show the first chunk as an example: first 200 characters plus its metadata.
    first = pieces[0]
    print("\nExample chunk:")
    print(f"Content: {first.page_content[:200]}...")
    print(f"Metadata: {first.metadata}")
    return pieces

In [35]:
# Split the loaded PDF pages using the defaults (chunk_size=1000, chunk_overlap=200).
chunks=split_documents(all_pdf_documents)
chunks

Split 1 documents into 4 chunks

Example chunk:
Content: Anshuman
Aspiring Full Stack Developer
anshuman302004@gmail.com | +91 9871980990 | Faridabad, Haryana, India
linkedin.com/in/heyansh | github.com/Ansh30a | Portfolio
SUMMARY
Highly motivated MERN Stac...
Metadata: {'producer': 'xdvipdfmx (20250410)', 'creator': 'LaTeX with hyperref', 'creationdate': '2025-10-03T14:47:38+00:00', 'source': '../data/pdf_files/AnshResume.pdf', 'total_pages': 1, 'page': 0, 'page_label': '1', 'source_file': 'AnshResume.pdf', 'file_type': 'pdf'}


[Document(metadata={'producer': 'xdvipdfmx (20250410)', 'creator': 'LaTeX with hyperref', 'creationdate': '2025-10-03T14:47:38+00:00', 'source': '../data/pdf_files/AnshResume.pdf', 'total_pages': 1, 'page': 0, 'page_label': '1', 'source_file': 'AnshResume.pdf', 'file_type': 'pdf'}, page_content='Anshuman\nAspiring Full Stack Developer\nanshuman302004@gmail.com | +91 9871980990 | Faridabad, Haryana, India\nlinkedin.com/in/heyansh | github.com/Ansh30a | Portfolio\nSUMMARY\nHighly motivated MERN Stack Developer skilled in building scalable full-stack applications using MongoDB,\nExpress.js, React, and Node.js. Experienced in developing RESTful APIs and integrating third-party services.\nQuick learner with strong problem-solving and debugging skills.\nEXPERIENCE\nMERN Stack Developer Intern Jun 2025 – Jul 2025\nCodec Technologies\n• Developed full-stack applications using the MERN stack.\n• Worked on designing and implementing scalable APIs, dynamic user interfaces, and seamless integratio

### Embedding and VectorStoreDB

In [36]:
import numpy as np
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings
import uuid
from typing import List, Dict, Any, Tuple
from sklearn.metrics.pairwise import cosine_similarity

In [37]:
class EmbeddingManager:
    """Handles document embedding generation using SentenceTransformer"""

    def __init__(self, model_name: str = "all-miniLM-L6-v2"):
        """
        Initialise the embedding manager and load the model immediately.

        Args:
            model_name: HuggingFace model name for sentence embeddings

        Raises:
            Exception: whatever the model constructor raised; it is re-raised
                after being reported.
        """
        self.model_name = model_name
        self.model = None
        self._load_model()

    def _load_model(self):
        """Load the SentenceTransformer model named by ``self.model_name``.

        Prints the embedding dimension on success. On failure the error is
        printed and re-raised, so a manager never silently ends up without a
        model.
        """
        try:
            print(f"Loading embedding model: {self.model_name}")
            self.model = SentenceTransformer(self.model_name)
            print(f"Model Loaded successfully. Embedding dimension: {self.model.get_sentence_embedding_dimension()}")
        except Exception as e:
            print(f"Error loading model {self.model_name}: {e}")
            raise

    def generate_embeddings(self, texts: List[str]) -> np.ndarray:
        """
        Generate embeddings for a list of texts

        Args:
            texts: List of text strings to embed

        Returns:
            numpy array of embeddings with shape (len(texts), embedding_dim)

        Raises:
            ValueError: if no model has been loaded.
        """
        # Identity check rather than truthiness: for an object that defines
        # __len__, `not obj` depends on its length, not on whether it was loaded.
        if self.model is None:
            raise ValueError("Model not loaded!!!")

        print(f"Generating embeddings for {len(texts)} text...")
        embeddings = self.model.encode(texts, show_progress_bar=True)
        print(f"Generated embeddings with shape: {embeddings.shape}")
        return embeddings
    
    ## initialise the embedding manager
# Uses the default model name; the constructor loads the model right away.
embedding_manager=EmbeddingManager()
embedding_manager

Loading embedding model: all-miniLM-L6-v2


Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-miniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


Model Loaded successfully. Embedding dimension: 384


<__main__.EmbeddingManager at 0x76874f976180>

### Vector Store

In [41]:
class VectorStore:
    """Manage document embeddings in a ChromaDB vector store"""

    def __init__(self, collection_name: str = "pdf_documents", persist_directory: str = '../data/vector_store'):
        """
        Initialise the vector store and open (or create) its collection.

        Args:
            collection_name: Name of the ChromaDB collection
            persist_directory: Directory to persist the vector store

        Raises:
            Exception: whatever the ChromaDB client raised while initialising;
                it is re-raised after being reported.
        """
        self.collection_name = collection_name
        self.persist_directory = persist_directory
        self.client = None
        self.collection = None
        self._initialize_store()

    def _initialize_store(self):
        """Initialise the ChromaDB client and collection.

        Creates ``persist_directory`` if needed, opens a persistent client on
        it, then gets or creates the collection. Errors are printed and
        re-raised.
        """
        try:
            # create persistent chromaDB client
            os.makedirs(self.persist_directory, exist_ok=True)
            self.client = chromadb.PersistentClient(path=self.persist_directory)

            # get or create collection
            self.collection = self.client.get_or_create_collection(
                name=self.collection_name,
                metadata={"description": "PDF document embeddings for RAG"}
            )
            print(f"Vector Store initialised. Collection: {self.collection_name}")
            print(f"Existing documents in Collection: {self.collection.count()}")

        except Exception as e:
            print(f"Error initialsing vector store {e}")
            raise

    def add_documents(self, documents: List[Any], embeddings: np.ndarray):
        """
        Add documents and their embeddings to the vector store

        Every document gets a fresh random ID, so calling this twice with the
        same documents stores them twice.

        Args:
            documents: List of LangChain documents (objects exposing
                ``page_content`` and ``metadata``)
            embeddings: Corresponding embeddings, one row per document

        Raises:
            ValueError: if the number of documents and embeddings differ.
            Exception: whatever the collection raised while adding; it is
                re-raised after being reported.
        """
        if len(documents) != len(embeddings):
            raise ValueError("Number of documents must match the number of embeddings!!!")

        # Nothing to store: return early instead of sending empty lists to the
        # collection (ChromaDB's add() is expected to reject empty input).
        if len(documents) == 0:
            print("No documents to add to vector store; skipping")
            return

        print(f"Adding {len(documents)} documents to vector store...")

        # Prepare data for ChromaDB
        ids = []
        metadatas = []
        documents_text = []
        embeddings_list = []

        for i, (doc, embedding) in enumerate(zip(documents, embeddings)):
            # Generate unique ID: random 8-hex-digit prefix plus position in this batch
            ids.append(f"doc_{uuid.uuid4().hex[:8]}_{i}")

            # Prepare metadata on a copy so the caller's document is not mutated
            metadata = dict(doc.metadata)
            metadata['doc_index'] = i
            # NOTE(review): the key is misspelled ("lenght"); kept as-is because
            # already-persisted collections and any readers use this spelling.
            metadata['content_lenght'] = len(doc.page_content)
            metadatas.append(metadata)

            # Document content
            documents_text.append(doc.page_content)

            # Embedding as a plain Python list; np.asarray also accepts rows
            # supplied as ordinary sequences rather than ndarray rows.
            embeddings_list.append(np.asarray(embedding).tolist())

        # Add to collection
        try:
            self.collection.add(
                ids=ids,
                embeddings=embeddings_list,
                metadatas=metadatas,
                documents=documents_text
            )
            print(f"Successfully added {len(documents)} documents to vector store")
            print(f"Total documents in collection: {self.collection.count()}")

        except Exception as e:
            print(f"Error adding documents to vector store: {e}")
            raise

# Defaults: collection "pdf_documents", persisted under ../data/vector_store.
# The collection is fetched if it already exists, otherwise created.
vectorstore=VectorStore()
vectorstore

Vector Store initialised. Collection: pdf_documents
Existing documents in Collection: 0


<__main__.VectorStore at 0x76874d74d370>

In [42]:
### Convert the text to embeddings
texts=[doc.page_content for doc in chunks]
# texts  # uncomment to inspect the raw chunk strings

## Generate the embeddings
embeddings=embedding_manager.generate_embeddings(texts)

## Store in the vectorDB
## NOTE: add_documents assigns fresh random IDs and the store is persisted on disk,
## so re-running this cell adds the same chunks to the collection again.
vectorstore.add_documents(chunks, embeddings)

Generating embeddings for 4 text...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (4, 384)
Adding 4 documents to vector store...
Successfully added 4 documents to vector store
Total documents in collection: 4


### Retriever Pipeline from VectorStore

In [None]:
class RAGRetriever:
    """Handles query-based retrieval from the vector store"""

    def __init__(self, vectorstore: VectorStore, embedding_manager: EmbeddingManager):
        """
        Initialise the retriever

        Args: 
            vector_store: vectorstore containing document embeddings
            embedding_manager: Manager for generating query embeeddings
        """

        self.vectorstore = vectorstore
        self.embedding_manager = embedding_manager

    def retrieve(self, query: str, top_k: int = 5, score_threshold: float = 0.0) -> List[Dict[str, Any]]:
        """
        Retrieve relevant documents for a query
        
        Args:
            query: the search query 
            top_k: number of top results to return 
            score_threshold: minimum simialarity score threshold 

        Returns:
            list of dictionaries containing retreieved documents and metadata 
        """
        print(f"Retrieving documents for query: '{query}'")
        print(f"Top K: {top_k}, Score threhold: {score_threshold}")

        # generate query embedding
        query_embedding = self.embedding_manager.generate_embeddings([query])[0]

        # Search in vector store
        try:
            results = self.vectorstore.collection.query(

            )

        
