In [30]:
#importing libraries
from langchain_core.documents import Document
from langchain_community.document_loaders import TextLoader
from langchain_community.document_loaders import DirectoryLoader
from langchain_community.document_loaders import PyPDFLoader, PyMuPDFLoader
from sklearn.metrics.pairwise import cosine_similarity
from typing import List,Dict,Any,Tuple
from chromadb.config import Settings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
from pathlib import Path
import os
import numpy as np
import uuid
import chromadb

In [31]:
#Document Object
doc=Document(
    page_content="this is the main context I am using to create RAG",
    metadata={
        "source":"example.txt",
        "pages":1,
        "author":"Krish Naik",
        "date_created":"2025-11-05"
    }
)

In [32]:
#Creating a txt file
os.makedirs("../data/text_files",exist_ok=True)

In [33]:
sample_texts=sample_texts={
    "../data/text_files/python_intro.txt":"""Python Programming Introduction

Python is a high-level, interpreted programming language known for its simplicity and readability.
Created by Guido van Rossum and first released in 1991, Python has become one of the most popular
programming languages in the world.

Key Features:
- Easy to learn and use
- Extensive standard library
- Cross-platform compatibility
- Strong community support

Python is widely used in web development, data science, artificial intelligence, and automation.""",
    
    "../data/text_files/machine_learning.txt": """Machine Learning Basics

Machine learning is a subset of artificial intelligence that enables systems to learn and improve
from experience without being explicitly programmed. It focuses on developing computer programs
that can access data and use it to learn for themselves.

Types of Machine Learning:
1. Supervised Learning: Learning with labeled data
2. Unsupervised Learning: Finding patterns in unlabeled data
3. Reinforcement Learning: Learning through rewards and penalties

Applications include image recognition, speech processing, and recommendation systems
    
    
    """
}

for filepath,content in sample_texts.items():
    with open(filepath,"w",encoding="utf-8") as f:
        f.write(content)
print("Sample files created")

Sample files created


In [34]:
#Textloader
#Structure raw text into Langchain Document format
loader=TextLoader("../data/text_files/python_intro.txt",encoding="utf-8")
document=loader.load()

In [35]:
#Directory Loader
#Load all text files from the directory 
dir_loader=DirectoryLoader(
    "../data/text_files",
    glob="**/*.txt",
    loader_cls=TextLoader,
    loader_kwargs={"encoding":"utf-8"},
    show_progress=True
)
documents=dir_loader.load()

100%|██████████| 2/2 [00:00<00:00, 4862.96it/s]


In [36]:
dir_loader=DirectoryLoader(
    "../data/pdf",
    glob="**/*.pdf",
    loader_cls=PyMuPDFLoader,
    show_progress=True
)
pdf_documents=dir_loader.load()

100%|██████████| 3/3 [00:01<00:00,  2.22it/s]


In [37]:
### Read all the pdf's inside the directory
def process_all_pdfs(pdf_directory):
    """Process all PDF files in a directory"""
    all_documents = []
    pdf_dir = Path(pdf_directory)
    
    # Find all PDF files recursively
    pdf_files = list(pdf_dir.glob("**/*.pdf"))
    
    print(f"Found {len(pdf_files)} PDF files to process")
    
    for pdf_file in pdf_files:
        print(f"\nProcessing: {pdf_file.name}")
        try:
            loader = PyPDFLoader(str(pdf_file))
            documents = loader.load()
            
            # Add source information to metadata
            for doc in documents:
                doc.metadata['source_file'] = pdf_file.name
                doc.metadata['file_type'] = 'pdf'
            
            all_documents.extend(documents)
            print(f"  ✓ Loaded {len(documents)} pages")
            
        except Exception as e:
            print(f"  ✗ Error: {e}")
    
    print(f"\nTotal documents loaded: {len(all_documents)}")
    return all_documents

# Process all PDFs in the data directory
all_pdf_documents = process_all_pdfs("../data")

Found 3 PDF files to process

Processing: javanotes5.pdf
  ✓ Loaded 699 pages

Processing: Dsa.pdf
  ✓ Loaded 112 pages

Processing: 245078a9e1ffa18cdbd78bd9f50285d9.pdf
  ✓ Loaded 17 pages

Total documents loaded: 828


In [38]:
# Text splitting get into chunks
def split_documents(documents,chunk_size=1000,chunk_overlap=200):
    """Split documents into smaller chunks for better RAG performance"""
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        separators=["\n\n", "\n", " ", ""]
    )
    split_docs = text_splitter.split_documents(documents)
    print(f"Split {len(documents)} documents into {len(split_docs)} chunks")
    
   
    if split_docs:
        print(f"\nExample chunk:")
        print(f"Content: {split_docs[0].page_content[:200]}...")
        print(f"Metadata: {split_docs[0].metadata}")
    
    return split_docs
chunks=split_documents(all_pdf_documents)

Split 828 documents into 2916 chunks

Example chunk:
Content: Introduction to Programming Using Java
V ersion 5.0, December 2006
(Version 5.0.2, with minor corrections, November 2007)
David J. Eck
Hobart and William Smith Colleges...
Metadata: {'producer': 'AFPL Ghostscript 8.51', 'creator': 'dvips(k) 5.95b Copyright 2005 Radical Eye Software', 'creationdate': 'D:20071115202101', 'moddate': 'D:20071115202101', 'title': 'javanotes.dvi', 'source': '../data/pdf/javanotes5.pdf', 'total_pages': 699, 'page': 0, 'page_label': '1', 'source_file': 'javanotes5.pdf', 'file_type': 'pdf'}


In [39]:
#Embeddings
class EmbeddingManager:
    def __init__(self,model_name:str="all-MiniLM-L6-v2"):
        self.model_name=model_name
        self.model=None
        self._load_model()

    def _load_model(self):
        try:
            print(f"Loading embedding model: {self.model_name}")
            self.model=SentenceTransformer(self.model_name)
            print(f"Model loaded sucessfully.")
        except Exception as e:
            print(f"Error loading model {self.model_name}: {e}")
            raise
    
    def generate_embeddings(self,texts: List[str]) -> np.ndarray:
        if not self.model:
            raise ValueError("Model not loaded")
        print(f"Generating embeddings for {len(texts)} texts....")
        embeddings=self.model.encode(texts,show_progress_bar=True)
        print(f"Generated embeddings with shape: {embeddings.shape}")
        return embeddings
    

embedding_manager=EmbeddingManager()
embedding_manager

Loading embedding model: all-MiniLM-L6-v2
Model loaded sucessfully.


<__main__.EmbeddingManager at 0x756b33c87200>

In [40]:
#VectoreStore
class VectorStore:
    def __init__(self,collection_name: str="pdf_documents",persist_directory:str="../data/vector_store"):
        self.collection_name=collection_name
        self.persist_directory=persist_directory
        self.client=None
        self.collection=None
        self._initialize_store()


    def _initialize_store(self):
        try:
            os.makedirs(self.persist_directory,exist_ok=True)
            self.client=chromadb.PersistentClient(path=self.persist_directory)
            
            self.collection=self.client.get_or_create_collection(

                name=self.collection_name,
                metadata={"description":"PDF documents embeddings for RAG"}
            )
        except Exception as e:
            print(f"Error initializing vector store: {e}")
            raise


    def add_documents(self,documents:List[Any],embeddings:np.ndarray):
        if len(documents)!=len(embeddings):
            raise ValueError("Number of documents must match the embeddings")
        print(f"Adding {len(document)} douments to vector store")

        ids=[]
        metadatas=[]
        documents_text=[]
        embeddings_list=[]

        for i, (doc,embedding) in enumerate(zip(documents,embeddings)):
            doc_id=f"doc_{uuid.uuid4().hex[:8]}_{i}"
            ids.append(doc_id)

            metadata=dict(doc.metadata)
            metadata['doc_index']=i
            metadata['context_length']=len(doc.page_content)
            metadatas.append(metadata)

            documents_text.append(doc.page_content)
            embeddings_list.append(embedding.tolist())


        try:
            self.collection.add(
                ids=ids,
                embeddings=embeddings_list,
                metadatas=metadatas,
                documents=documents_text
            )
            print(f'Sucessfully added {len(documents)} documents to vector store')
            print(f'Total documents in collection: {self.collection.count()}')
        except Exception as e:
            print(f"Error adding documents to vector store: {e}")
            raise

vectorstore=VectorStore()
vectorstore

<__main__.VectorStore at 0x756b33cdd7c0>

In [41]:
#Converting texts to embeddings
texts=[doc.page_content for doc in chunks]
embeddings=embedding_manager.generate_embeddings(texts)

vectorstore.add_documents(chunks,embeddings)

Generating embeddings for 2916 texts....


Batches: 100%|██████████| 92/92 [01:21<00:00,  1.13it/s]


Generated embeddings with shape: (2916, 384)
Adding 1 douments to vector store
Sucessfully added 2916 documents to vector store
Total documents in collection: 3147


In [42]:
#Retriever Pipeline from Vector Store
class RAGRetriever:
    def __init__(self,vector_store: VectorStore, embedding_manager: EmbeddingManager):
        self.vector_store=vector_store
        self.embedding_manager=embedding_manager
         

    def retrieve(self, query: str, top_k: int = 5, score_threshold: float = 0.0) -> List[Dict[str, Any]]:
  
        print(f"Retrieving documents for query: '{query}'")
        print(f"Top K: {top_k}, Score threshold: {score_threshold}")
        
        query_embedding = self.embedding_manager.generate_embeddings([query])[0]
        
        try:
            results = self.vector_store.collection.query(
                query_embeddings=[query_embedding.tolist()],
                n_results=top_k
            )
            retrieved_docs = []
            
            if results['documents'] and results['documents'][0]:
                documents = results['documents'][0]
                metadatas = results['metadatas'][0]
                distances = results['distances'][0]
                ids = results['ids'][0]
                
                for i, (doc_id, document, metadata, distance) in enumerate(zip(ids, documents, metadatas, distances)):

                    similarity_score = 1 - distance
                    
                    if similarity_score >= score_threshold:
                        retrieved_docs.append({
                            'id': doc_id,
                            'content': document,
                            'metadata': metadata,
                            'similarity_score': similarity_score,
                            'distance': distance,
                            'rank': i + 1
                        })
                
                print(f"Retrieved {len(retrieved_docs)} documents (after filtering)")
            else:
                print("No documents found")
            
            return retrieved_docs
            
        except Exception as e:
            print(f"Error during retrieval: {e}")
            return []

rag_retriever=RAGRetriever(vectorstore,embedding_manager)

In [44]:
rag_retriever.retrieve("What is Chemistry")

Retrieving documents for query: 'What is Chemistry'
Top K: 5, Score threshold: 0.0
Generating embeddings for 1 texts....


Batches: 100%|██████████| 1/1 [00:00<00:00, 109.88it/s]

Generated embeddings with shape: (1, 384)
Retrieved 1 documents (after filtering)





[{'id': 'doc_8c8d0daa_2869',
  'content': 'chemistry and also realised that it influences every\nsphere of human life.  The principles of chemistry have\nbeen used for the benefit of mankind. Think of\ncleanliness — the materials like soaps, detergents,\nhousehold bleaches, tooth pastes, etc. will come to your\nmind. Look towards the beautiful clothes — immediately\nchemicals of the synthetic fibres used for making clothes\nand chemicals giving colours to them will come to your\nmind. Food materials — again a number of chemicals\nabout which you have learnt in the previous Unit will\nappear in your mind. Of course, sickness and diseases\nremind us of medicines — again chemicals. Explosives,\nfuels, rocket propellents, building and electronic\nmaterials, etc., are all chemicals. Chemistry has\ninfluenced our life so much that we do not even realise\nthat we come across chemicals at every moment; that\nwe ourselves are beautiful chemical creations and all\nour activities are controlled b