### Data Ingestion


In [2]:
from langchain_core.documents import Document

In [3]:
# Hand-build one LangChain Document: descriptive fields go in `metadata`,
# the actual text goes in `page_content`.
doc = Document(
    metadata=dict(
        source="example.txt",
        author="Akshat",
        page=1,
        date_created="2025-10-01",
    ),
    page_content="Main text content i am using to make the RAG",
)

In [4]:
# Echo the Document so the notebook renders its repr (metadata and page_content).
doc

Document(metadata={'source': 'example.txt', 'author': 'Akshat', 'page': 1, 'date_created': '2025-10-01'}, page_content='Main text content i am using to make the RAG')

In [5]:
## Create the directory that will hold the sample .txt files used to try out the loaders

import os
# exist_ok=True keeps this cell re-runnable when the directory already exists.
os.makedirs("../data/text_files", exist_ok=True)

In [6]:
# Sample corpus: maps each destination file path to the text written to that file.
# NOTE: the machine_learning text ends with blank, indented lines that sit inside
# the string literal, so that trailing whitespace is part of the written content.
sample_texts={
    "../data/text_files/python_intro.txt":"""Python Programming Introduction

Python is a high-level, interpreted programming language known for its simplicity and readability.
Created by Guido van Rossum and first released in 1991, Python has become one of the most popular
programming languages in the world.

Key Features:
- Easy to learn and use
- Extensive standard library
- Cross-platform compatibility
- Strong community support

Python is widely used in web development, data science, artificial intelligence, and automation.""",
    
    "../data/text_files/machine_learning.txt": """Machine Learning Basics

Machine learning is a subset of artificial intelligence that enables systems to learn and improve
from experience without being explicitly programmed. It focuses on developing computer programs
that can access data and use it to learn for themselves.

Types of Machine Learning:
1. Supervised Learning: Learning with labeled data
2. Unsupervised Learning: Finding patterns in unlabeled data
3. Reinforcement Learning: Learning through rewards and penalties

Applications include image recognition, speech processing, and recommendation systems
    
    
    """

}

# Write each sample to disk; the dict keys double as the destination paths.
for filepath in sample_texts:
    content = sample_texts[filepath]
    with open(filepath, mode='w', encoding="utf-8") as f:
        f.write(content)

print("✅ Sample text files created!")

✅ Sample text files created!


In [8]:
### Reading a single file using TextLoader

# Import TextLoader from langchain_community only: the older
# `langchain.document_loaders` path is deprecated and was immediately
# shadowed by this import anyway.
from langchain_community.document_loaders import TextLoader

# Load one text file as UTF-8; load() returns a list of Document objects.
loader = TextLoader("../data/text_files/python_intro.txt", encoding="utf-8")
data = loader.load()
print(data)

[Document(metadata={'source': '../data/text_files/python_intro.txt'}, page_content='Python Programming Introduction\n\nPython is a high-level, interpreted programming language known for its simplicity and readability.\nCreated by Guido van Rossum and first released in 1991, Python has become one of the most popular\nprogramming languages in the world.\n\nKey Features:\n- Easy to learn and use\n- Extensive standard library\n- Cross-platform compatibility\n- Strong community support\n\nPython is widely used in web development, data science, artificial intelligence, and automation.')]


In [11]:
### Directory loader

from langchain_community.document_loaders import DirectoryLoader

# Pick up every file under ../data/text_files that matches the glob and read
# each one with TextLoader as UTF-8.
dir_loader = DirectoryLoader(
    "../data/text_files",
    glob="**/*.txt",
    loader_cls=TextLoader,
    loader_kwargs={"encoding": "utf-8"},
    show_progress=True,
)
dir_data = dir_loader.load()
print(dir_data)

100%|██████████| 2/2 [00:00<00:00, 1294.34it/s]

[Document(metadata={'source': '..\\data\\text_files\\machine_learning.txt'}, page_content='Machine Learning Basics\n\nMachine learning is a subset of artificial intelligence that enables systems to learn and improve\nfrom experience without being explicitly programmed. It focuses on developing computer programs\nthat can access data and use it to learn for themselves.\n\nTypes of Machine Learning:\n1. Supervised Learning: Learning with labeled data\n2. Unsupervised Learning: Finding patterns in unlabeled data\n3. Reinforcement Learning: Learning through rewards and penalties\n\nApplications include image recognition, speech processing, and recommendation systems\n\n\n    '), Document(metadata={'source': '..\\data\\text_files\\python_intro.txt'}, page_content='Python Programming Introduction\n\nPython is a high-level, interpreted programming language known for its simplicity and readability.\nCreated by Guido van Rossum and first released in 1991, Python has become one of the most popul




In [14]:
### Directory loader (PDF files)

from langchain_community.document_loaders import PyMuPDFLoader, PyPDFLoader

# Point a DirectoryLoader at ../data/pdf and hand each matched *.pdf to
# PyMuPDFLoader. Note that this rebinds the `dir_loader` name.
dir_loader = DirectoryLoader(
    "../data/pdf",
    glob="**/*.pdf",
    loader_cls=PyMuPDFLoader,
    show_progress=True,
)
pdf_data = dir_loader.load()
print(pdf_data)

100%|██████████| 2/2 [00:00<00:00, 19.00it/s]

[Document(metadata={'producer': 'Skia/PDF m136 Google Docs Renderer', 'creator': '', 'creationdate': '', 'source': '..\\data\\pdf\\Akshat Tyagi Final Internship Report 8th Sem.pdf', 'file_path': '..\\data\\pdf\\Akshat Tyagi Final Internship Report 8th Sem.pdf', 'total_pages': 17, 'format': 'PDF 1.4', 'title': 'Akshat Tyagi Final Internship Report 8th Sem', 'author': '', 'subject': '', 'keywords': '', 'moddate': '', 'trapped': '', 'modDate': '', 'creationDate': '', 'page': 0}, page_content='Professional and Practical Internship Report \nSubmitted in partial fulfillment of the requirements for the award of \nthe \nBachelor of Technology in \nSchool of Computer Science, Engineering & Technology \nBennett University \n \n \nBy \n \nAkshat Tyagi \nRoll Number: E21CSEU0165'), Document(metadata={'producer': 'Skia/PDF m136 Google Docs Renderer', 'creator': '', 'creationdate': '', 'source': '..\\data\\pdf\\Akshat Tyagi Final Internship Report 8th Sem.pdf', 'file_path': '..\\data\\pdf\\Akshat Ty




In [17]:
# Check what kind of object the PDF loader produced for its first entry.
type(pdf_data[0])

langchain_core.documents.base.Document

### Embeddings and vector store db

In [5]:
import os
from langchain_community.document_loaders import PyPDFLoader, PyMuPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from pathlib import Path

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
### Read all the pdf's inside the directory
def process_all_pdfs(pdf_directory):
    """Load every PDF found under a directory into LangChain documents.

    The directory is searched recursively for ``*.pdf`` files. The matches are
    processed in sorted path order so that the order of the returned list (and
    anything derived from list position later on) is reproducible across runs
    and platforms; ``Path.glob`` on its own does not promise a particular order.

    Args:
        pdf_directory: Directory to search, as a ``str`` or ``Path``.

    Returns:
        A flat list of the documents returned by ``PyPDFLoader(...).load()``
        for each file. Every document's metadata additionally receives
        ``source_file`` (the PDF's file name) and ``file_type`` (``'pdf'``).

    Note:
        Loading is best effort: a file that raises while loading is reported
        on stdout and skipped, and the remaining files are still processed.
    """
    all_documents = []
    pdf_dir = Path(pdf_directory)

    # Find all PDF files recursively; sort them because glob order is arbitrary.
    pdf_files = sorted(pdf_dir.glob("**/*.pdf"))

    print(f"Found {len(pdf_files)} PDF files to process")

    for pdf_file in pdf_files:
        print(f"\nProcessing: {pdf_file.name}")
        try:
            loader = PyPDFLoader(str(pdf_file))
            documents = loader.load()

            # Add source information to metadata
            for doc in documents:
                doc.metadata['source_file'] = pdf_file.name
                doc.metadata['file_type'] = 'pdf'

            all_documents.extend(documents)
            print(f"  ✓ Loaded {len(documents)} pages")

        except Exception as e:
            # Deliberately keep going: one unreadable PDF should not abort the batch.
            print(f"  ✗ Error: {e}")

    print(f"\nTotal documents loaded: {len(all_documents)}")
    return all_documents

# Load every PDF found anywhere under ../data (the function searches recursively).
all_pdf_documents = process_all_pdfs("../data")

Ignoring wrong pointing object 525 0 (offset 0)


Found 3 PDF files to process

Processing: Akshat Tyagi Final Internship Report 8th Sem.pdf
  ✓ Loaded 17 pages

Processing: CDX_26C_1026669_1Aug2025.pdf
  ✓ Loaded 3 pages

Processing: Psychology.pdf
  ✓ Loaded 5 pages

Total documents loaded: 25


In [8]:
# Display the loaded documents.
all_pdf_documents


[Document(metadata={'producer': 'Skia/PDF m136 Google Docs Renderer', 'creator': 'PyPDF', 'creationdate': '', 'title': 'Akshat Tyagi Final Internship Report 8th Sem', 'source': '..\\data\\pdf\\Akshat Tyagi Final Internship Report 8th Sem.pdf', 'total_pages': 17, 'page': 0, 'page_label': '1', 'source_file': 'Akshat Tyagi Final Internship Report 8th Sem.pdf', 'file_type': 'pdf'}, page_content='Professional  and  Practical  Internship  Report  \nSubmitted  in  partial  fulfillment  of  the  requirements  for  the  award  of  \nthe\n Bachelor  of  Technology  in  School  of  Computer  Science,  Engineering  &  Technology  \nBennett\n \nUniversity\n   By   Akshat  Tyagi  Roll  Number:  E21CSEU0165'),
 Document(metadata={'producer': 'Skia/PDF m136 Google Docs Renderer', 'creator': 'PyPDF', 'creationdate': '', 'title': 'Akshat Tyagi Final Internship Report 8th Sem', 'source': '..\\data\\pdf\\Akshat Tyagi Final Internship Report 8th Sem.pdf', 'total_pages': 17, 'page': 1, 'page_label': '2', 's

In [9]:
### Text splitting get into chunks

def split_documents(documents,chunk_size=1000,chunk_overlap=200):
    """Break documents into smaller chunks using RecursiveCharacterTextSplitter.

    Args:
        documents: Documents to split.
        chunk_size: Forwarded to the splitter as ``chunk_size``; lengths are
            measured with ``len``.
        chunk_overlap: Forwarded to the splitter as ``chunk_overlap``.

    Returns:
        The list of chunks produced by the splitter.

    Prints a summary line and, when at least one chunk exists, a preview of
    the first chunk (first 200 characters plus its metadata).
    """
    splitter_options = {
        "chunk_size": chunk_size,
        "chunk_overlap": chunk_overlap,
        "length_function": len,
        # Separator candidates: paragraph break, newline, space, empty string.
        "separators": ["\n\n", "\n", " ", ""],
    }
    split_docs = RecursiveCharacterTextSplitter(**splitter_options).split_documents(documents)
    print(f"Split {len(documents)} documents into {len(split_docs)} chunks")

    # Nothing to preview when the splitter produced no chunks.
    if not split_docs:
        return split_docs

    first_chunk = split_docs[0]
    print("\nExample chunk:")
    print(f"Content: {first_chunk.page_content[:200]}...")
    print(f"Metadata: {first_chunk.metadata}")
    return split_docs

In [10]:
# Split with the function's default chunk_size/chunk_overlap, then display the chunks.
chunks=split_documents(all_pdf_documents)
chunks

Split 25 documents into 61 chunks

Example chunk:
Content: Professional  and  Practical  Internship  Report  
Submitted  in  partial  fulfillment  of  the  requirements  for  the  award  of  
the
 Bachelor  of  Technology  in  School  of  Computer  Science,  ...
Metadata: {'producer': 'Skia/PDF m136 Google Docs Renderer', 'creator': 'PyPDF', 'creationdate': '', 'title': 'Akshat Tyagi Final Internship Report 8th Sem', 'source': '..\\data\\pdf\\Akshat Tyagi Final Internship Report 8th Sem.pdf', 'total_pages': 17, 'page': 0, 'page_label': '1', 'source_file': 'Akshat Tyagi Final Internship Report 8th Sem.pdf', 'file_type': 'pdf'}


[Document(metadata={'producer': 'Skia/PDF m136 Google Docs Renderer', 'creator': 'PyPDF', 'creationdate': '', 'title': 'Akshat Tyagi Final Internship Report 8th Sem', 'source': '..\\data\\pdf\\Akshat Tyagi Final Internship Report 8th Sem.pdf', 'total_pages': 17, 'page': 0, 'page_label': '1', 'source_file': 'Akshat Tyagi Final Internship Report 8th Sem.pdf', 'file_type': 'pdf'}, page_content='Professional  and  Practical  Internship  Report  \nSubmitted  in  partial  fulfillment  of  the  requirements  for  the  award  of  \nthe\n Bachelor  of  Technology  in  School  of  Computer  Science,  Engineering  &  Technology  \nBennett\n \nUniversity\n   By   Akshat  Tyagi  Roll  Number:  E21CSEU0165'),
 Document(metadata={'producer': 'Skia/PDF m136 Google Docs Renderer', 'creator': 'PyPDF', 'creationdate': '', 'title': 'Akshat Tyagi Final Internship Report 8th Sem', 'source': '..\\data\\pdf\\Akshat Tyagi Final Internship Report 8th Sem.pdf', 'total_pages': 17, 'page': 1, 'page_label': '2', 's

In [22]:
import numpy as np
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings
import uuid
from typing import List, Dict, Tuple, Any
from sklearn.metrics.pairwise import cosine_similarity

In [23]:

class EmbeddingManager:
    """Wraps a Sentence Transformers model for turning texts into embeddings.

    The model named by ``model_name`` is loaded eagerly in the constructor and
    kept on ``self.model``. This class only produces embeddings; it does not
    store or search them.
    """

    def __init__(self, model_name: str = "all-MiniLM-L6-v2"):
        """Remember the model name and load the model immediately.

        Args:
            model_name: Name passed to ``SentenceTransformer``.

        Raises:
            Exception: Whatever ``SentenceTransformer`` raises while loading.
        """
        self.model_name = model_name
        self.model = None
        self._load_model()

    def _load_model(self):
        """Load the sentence transformer model.

        Raises:
            Exception: Any loading error is printed and then re-raised unchanged.
        """
        try:
            print(f"Loading model '{self.model_name}'...")
            self.model = SentenceTransformer(self.model_name)
            print(f"Model '{self.model_name}' loaded successfully. Embedding dimension: {self.model.get_sentence_embedding_dimension()}")
        except Exception as e:
            print(f"Error loading model '{self.model_name}': {e}")
            raise

    def generate_embedding(self, text: List[str]) -> np.ndarray:
        """Generate embeddings for a list of texts.

        Args:
            text: The texts to embed.

        Returns:
            Whatever ``self.model.encode`` returns for ``text``.

        Raises:
            ValueError: If no model is loaded.
        """
        # Compare against None explicitly: the model object may define __len__,
        # so a plain truthiness test could misreport a loaded model as missing.
        if self.model is None:
            raise ValueError("Model not loaded.")

        print(f"Generating embeddings for {len(text)} texts...")
        embeddings = self.model.encode(text, show_progress_bar=True)
        print("Embeddings generated.")
        return embeddings

    def get_embedding_dimension(self) -> int:
        """Get the dimension of the embeddings.

        Raises:
            ValueError: If no model is loaded.
        """
        if self.model is None:
            raise ValueError("Model not loaded.")
        return self.model.get_sentence_embedding_dimension()
    
    
    
## Initialize the EmbeddingManager
# No model_name is passed, so the constructor default ("all-MiniLM-L6-v2") is used.
embedding_manager = EmbeddingManager()
embedding_manager

Loading model 'all-MiniLM-L6-v2'...
Model 'all-MiniLM-L6-v2' loaded successfully. Embedding dimension: 384


<__main__.EmbeddingManager at 0x1e70f911fd0>

### Vector Store

In [24]:
from typing import List, Any
import numpy as np
import os
import uuid
import chromadb

class VectorStore:
    """Manages document embeddings in a chromaDB vector store."""

    def __init__(self, collection_name: str = 'pdf_documents', persist_directory: str = "../data/vector_store"):
        """Initialize the VectorStore with ChromaDB.

        Args:
            collection_name (str): Name of the ChromaDB collection.
            persist_directory (str): Directory to persist the ChromaDB data.
        """
        self.collection_name = collection_name
        self.persist_directory = persist_directory
        self.client = None
        self.collection = None
        self._initialize_store()

    def _initialize_store(self):
        """Create the persistent ChromaDB client and open (or create) the collection.

        Raises:
            Exception: Any error from directory creation or ChromaDB is printed
                and then re-raised unchanged.
        """
        try:
            # Create persistent chromadb client
            os.makedirs(self.persist_directory, exist_ok=True)
            self.client = chromadb.PersistentClient(path=self.persist_directory)

            # Get or create collection
            self.collection = self.client.get_or_create_collection(
                name=self.collection_name,
                metadata={"description": "Document embeddings collection"},
            )
            print(f"ChromaDB collection '{self.collection_name}' initialized successfully.")
            print(f"Existing number of documents in the collection: {self.collection.count()}")

        except Exception as e:
            print(f"Error initializing ChromaDB: {e}")
            raise

    def add_documents(self, documents: List[Any], embeddings: np.ndarray):
        """Add documents and their embeddings to the vector store.

        Each document is stored with a freshly generated random ID, a copy of
        its metadata extended with ``doc_index`` (position in ``documents``)
        and ``content_length`` (``len(page_content)``), its text, and its
        embedding converted to a plain Python list.

        Args:
            documents: List of LangChain documents (objects with
                ``page_content`` and ``metadata``).
            embeddings: One embedding per document, in the same order. A NumPy
                array or any sequence of numeric vectors is accepted.

        Raises:
            ValueError: If the number of documents and embeddings differ.
            Exception: Any error from ``collection.add`` is printed and re-raised.
        """
        if len(documents) != len(embeddings):
            raise ValueError("Number of documents must match number of embeddings")

        # Nothing to store: skip the round-trip instead of sending empty lists
        # to the collection (ChromaDB is expected to reject an empty add).
        if len(documents) == 0:
            print("No documents to add to vector store.")
            return

        print(f"Adding {len(documents)} documents to vector store...")

        # Prepare data for ChromaDB
        ids = []
        metadatas = []
        documents_text = []
        embeddings_list = []

        for i, (doc, embedding) in enumerate(zip(documents, embeddings)):
            # Generate unique ID. IDs are random, so adding the same documents
            # twice stores them twice.
            doc_id = f"doc_{uuid.uuid4().hex[:8]}_{i}"
            ids.append(doc_id)

            # Prepare metadata on a copy so the caller's document is not mutated
            metadata = dict(doc.metadata)
            metadata['doc_index'] = i
            metadata['content_length'] = len(doc.page_content)
            metadatas.append(metadata)

            # Document content
            documents_text.append(doc.page_content)

            # Embedding: asarray() lets plain lists through as well as NumPy rows
            embeddings_list.append(np.asarray(embedding).tolist())

        # Add to collection
        try:
            self.collection.add(
                ids=ids,
                embeddings=embeddings_list,
                metadatas=metadatas,
                documents=documents_text
            )
            print(f"Successfully added {len(documents)} documents to vector store")
            print(f"Total documents in collection: {self.collection.count()}")

        except Exception as e:
            print(f"Error adding documents to vector store: {e}")
            raise

# Use the constructor defaults: collection 'pdf_documents' persisted under ../data/vector_store.
vectorstore=VectorStore()
vectorstore

ChromaDB collection 'pdf_documents' initialized successfully.
Existing number of documents in the collection: 0


<__main__.VectorStore at 0x1e70fa6e660>

In [25]:
# Display the chunks that are about to be embedded.
chunks 

[Document(metadata={'producer': 'Skia/PDF m136 Google Docs Renderer', 'creator': 'PyPDF', 'creationdate': '', 'title': 'Akshat Tyagi Final Internship Report 8th Sem', 'source': '..\\data\\pdf\\Akshat Tyagi Final Internship Report 8th Sem.pdf', 'total_pages': 17, 'page': 0, 'page_label': '1', 'source_file': 'Akshat Tyagi Final Internship Report 8th Sem.pdf', 'file_type': 'pdf'}, page_content='Professional  and  Practical  Internship  Report  \nSubmitted  in  partial  fulfillment  of  the  requirements  for  the  award  of  \nthe\n Bachelor  of  Technology  in  School  of  Computer  Science,  Engineering  &  Technology  \nBennett\n \nUniversity\n   By   Akshat  Tyagi  Roll  Number:  E21CSEU0165'),
 Document(metadata={'producer': 'Skia/PDF m136 Google Docs Renderer', 'creator': 'PyPDF', 'creationdate': '', 'title': 'Akshat Tyagi Final Internship Report 8th Sem', 'source': '..\\data\\pdf\\Akshat Tyagi Final Internship Report 8th Sem.pdf', 'total_pages': 17, 'page': 1, 'page_label': '2', 's

In [26]:
### Converting chunks to embeddings and adding to vector store

# Pull the raw text out of every chunk, keeping chunk order.
texts = [chunk.page_content for chunk in chunks]

# Embed all chunk texts in one call.
embeddings = embedding_manager.generate_embedding(text=texts)

# Persist chunks with their embeddings; both sequences are matched by position.
vectorstore.add_documents(documents=chunks, embeddings=embeddings)


Generating embeddings for 61 texts...


Batches: 100%|██████████| 2/2 [00:02<00:00,  1.20s/it]


Embeddings generated.
Adding 61 documents to vector store...
Successfully added 61 documents to vector store
Total documents in collection: 61
