### Necessary Libraries

In [1]:
import os 
from langchain_community.document_loaders import DirectoryLoader, PyMuPDFLoader
from tqdm import tqdm
from langchain_text_splitters import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
import chromadb 
from chromadb.config import Settings
import uuid
import numpy as np
from typing import List, Dict, Tuple, Any
from dotenv import load_dotenv

  from .autonotebook import tqdm as notebook_tqdm


### Loading Data

In [2]:
# Recursively collect every PDF under ../data/pdf and parse it with PyMuPDF.
data = DirectoryLoader(
    path="../data/pdf",
    glob="**/*.pdf",
    loader_cls=PyMuPDFLoader,
    show_progress=True
    )
datas = data.load()
# Preview a single document: echoing the full list floods the notebook output
# and bloats the saved file.
print(f"{len(datas)} documents loaded")
datas[:1]

100%|██████████| 12/12 [00:01<00:00,  6.51it/s]


[Document(metadata={'producer': 'pdfTeX-1.40.26', 'creator': 'LaTeX with hyperref', 'creationdate': '2024-11-10T11:43:58+00:00', 'source': '..\\data\\pdf\\LangChain.pdf', 'file_path': '..\\data\\pdf\\LangChain.pdf', 'total_pages': 14, 'format': 'PDF 1.5', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'moddate': '2024-11-10T11:43:58+00:00', 'trapped': '', 'modDate': 'D:20241110114358Z', 'creationDate': 'D:20241110114358Z', 'page': 0}, page_content='LangChain\nVasilios Mavroudis\nAlan Turing Institute\nvmavroudis@turing.ac.uk\nAbstract. LangChain is a rapidly emerging framework that offers a ver-\nsatile and modular approach to developing applications powered by large\nlanguage models (LLMs). By leveraging LangChain, developers can sim-\nplify complex stages of the application lifecycle—such as development,\nproductionization, and deployment—making it easier to build scalable,\nstateful, and contextually aware applications. It provides tools for han-\ndling chat models, integ

### Chunking

In [3]:
def text_splitting(document, chunk_size:int=1000, chunk_overlap:int=200):
    """Split loaded documents into overlapping chunks ready for embedding.

    Args:
        document: List of documents to split (e.g. the list returned by
            ``DirectoryLoader.load()``).
        chunk_size: Maximum chunk length, measured with ``len``.
        chunk_overlap: Overlap between consecutive chunks, in the same unit.

    Returns:
        The list of chunks produced by ``split_documents``.
    """
    text_splits = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        # Coarse-to-fine fallbacks: paragraph, line, word, single character.
        # Without the " " level a line longer than chunk_size falls straight
        # through to "" and is cut at an arbitrary character, mid-word.
        separators=["\n\n", "\n", " ", ""],
    )

    chunks = text_splits.split_documents(documents=document)
    print(f"{len(document)} document split into {len(chunks)} chunks")
    return chunks

In [4]:
chunk = text_splitting(datas)
# Show a small sample only; the full chunk list is hundreds of documents long.
chunk[:2]

155 document split into 497 chunks


[Document(metadata={'producer': 'pdfTeX-1.40.26', 'creator': 'LaTeX with hyperref', 'creationdate': '2024-11-10T11:43:58+00:00', 'source': '..\\data\\pdf\\LangChain.pdf', 'file_path': '..\\data\\pdf\\LangChain.pdf', 'total_pages': 14, 'format': 'PDF 1.5', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'moddate': '2024-11-10T11:43:58+00:00', 'trapped': '', 'modDate': 'D:20241110114358Z', 'creationDate': 'D:20241110114358Z', 'page': 0}, page_content='LangChain\nVasilios Mavroudis\nAlan Turing Institute\nvmavroudis@turing.ac.uk\nAbstract. LangChain is a rapidly emerging framework that offers a ver-\nsatile and modular approach to developing applications powered by large\nlanguage models (LLMs). By leveraging LangChain, developers can sim-\nplify complex stages of the application lifecycle—such as development,\nproductionization, and deployment—making it easier to build scalable,\nstateful, and contextually aware applications. It provides tools for han-\ndling chat models, integ

### Embedding Manager 


In [5]:
class Embedding_manager:
    """Thin wrapper around a SentenceTransformer model for embedding text.

    The model is loaded once, when the object is constructed.
    """

    def __init__(self, model_name:str="BAAI/bge-small-en-v1.5"):
        """Remember ``model_name`` and load the model immediately.

        Args:
            model_name: Model identifier passed to ``SentenceTransformer``.

        Raises:
            RuntimeError: If the model cannot be loaded.
        """
        self.model_name = model_name
        self.model = None
        self._load_model()

    def _load_model(self):
        """Instantiate the SentenceTransformer named by ``self.model_name``.

        Raises:
            RuntimeError: Wraps (and chains) whatever error the load raised.
        """
        try:
            self.model = SentenceTransformer(model_name_or_path=self.model_name)
            print(f"{self.model_name} loaded from Hugging_face")
        except Exception as e:
            # Raising a bare string is itself a TypeError in Python 3 and
            # would mask the real failure; raise a real exception chained to
            # the original cause instead.
            raise RuntimeError(f"Model {self.model_name} not Loaded: error - {e}") from e

    def generate(self, text:List[str])->np.ndarray:
        """Encode ``text`` with the loaded model.

        Args:
            text: The strings to embed.

        Returns:
            Whatever ``self.model.encode(text)`` returns (annotated as an
            ``np.ndarray``).

        Raises:
            RuntimeError: If no model is loaded.
        """
        # Compare against None explicitly so truthiness of the model object
        # itself never decides whether it counts as "loaded".
        if self.model is None:
            raise RuntimeError("Model Not loaded")
        embeddings = self.model.encode(text)
        print(f"Embeddings {len(embeddings)} generated")
        return embeddings

In [6]:
# Build the embedder with its default model name; the constructor loads the
# model right away (see the "loaded" message printed below).
embedding_manager = Embedding_manager()
embedding_manager

BAAI/bge-small-en-v1.5 loaded from Hugging_face


<__main__.Embedding_manager at 0x217dae6bf50>

### Vector Store

In [7]:
class VectorDB:
    """Chroma collection wrapper storing chunk text, metadata and embeddings.

    The collection is opened (or created) under ``persist_directory`` when the
    object is constructed.
    """

    def __init__(self, collection_name:str = "pdf_info",
                 persist_directory:str = "../vector_store"):
        """Record the configuration and open the store.

        Args:
            collection_name: Name of the Chroma collection to get or create.
            persist_directory: Directory in which Chroma keeps its data.
        """
        self.collection_name = collection_name
        self.persist_directory = persist_directory
        self.client = None
        # Defined up front so the attribute exists even when initialization fails.
        self.collection = None
        self._initialize_store()

    def _initialize_store(self):
        """Open the on-disk Chroma client and get or create the collection.

        Raises:
            Exception: Re-raises whatever the filesystem or Chroma raised,
                after printing it and resetting ``client`` and ``collection``.
        """
        try:
            os.makedirs(self.persist_directory, exist_ok=True)
            # chromadb.Client(Settings(persist_directory=...)) yields an
            # in-memory client in chromadb >= 0.4, so nothing was ever written
            # to persist_directory. PersistentClient is the on-disk entry point.
            self.client = chromadb.PersistentClient(
                path=self.persist_directory,
                settings=Settings(anonymized_telemetry=False),
            )

            self.collection = self.client.get_or_create_collection(
                name=self.collection_name,
                metadata={"info":"PDF Embeddings"}
            )

            print(f"Vector_store initialized {self.collection_name}")
        except Exception as e:
            print(f"Store Not loaded {e}")
            self.client = None
            self.collection = None
            raise

    @staticmethod
    def _clean_metadata(metadata:Dict[str, Any]) -> Dict[str, Any]:
        """Return a copy of ``metadata`` restricted to str/int/float/bool values.

        ``None`` values are dropped and any other type is stringified, because
        Chroma metadata values are limited to those primitive types.
        """
        cleaned = {}
        for key, value in metadata.items():
            if value is None:
                continue
            if isinstance(value, (str, int, float, bool)):
                cleaned[key] = value
            else:
                cleaned[key] = str(value)
        return cleaned

    def add_docs(self, documents:List[Any], embeddings:np.ndarray):
        """Write documents and their embeddings to the collection.

        Args:
            documents: Objects exposing ``page_content`` and ``metadata``.
            embeddings: One embedding per document, in the same order.

        Raises:
            ValueError: If ``documents`` and ``embeddings`` differ in length.
            RuntimeError: If the collection was never initialized.
            Exception: Re-raises any error from the Chroma write after
                printing it.
        """
        if len(documents) != len(embeddings):
            raise ValueError("Length of Documents and embeddings should be same")
        if len(documents) == 0:
            # Nothing to write; avoid handing Chroma an empty batch.
            print("No documents to add")
            return
        if self.collection is None:
            raise RuntimeError("Vector store is not initialized")

        ids = []
        document_text = []
        metadatas = []
        embedding_list = []

        for i, (document, embedding) in enumerate(zip(documents, embeddings)):
            metadata = self._clean_metadata(dict(document.metadata))
            # Derive the id from the chunk itself (source, page, text) plus its
            # position, so re-running the notebook on the same chunks
            # overwrites the stored rows instead of appending duplicates to
            # the persistent collection.
            fingerprint = f"{metadata.get('source', '')}|{metadata.get('page', '')}|{document.page_content}"
            doc_id = f"Doc_{uuid.uuid5(uuid.NAMESPACE_URL, fingerprint).hex[:10]}_{i}"
            ids.append(doc_id)

            metadata["id"] = i
            # NOTE: "content" holds the chunk's character count, not its text.
            metadata["content"] = len(document.page_content)
            metadatas.append(metadata)

            document_text.append(document.page_content)
            embedding_list.append(np.asarray(embedding).tolist())

        try:
            # upsert (not add) so an id that already exists is replaced.
            self.collection.upsert(
                ids=ids,
                embeddings=embedding_list,
                metadatas=metadatas,
                documents=document_text
            )
            print(f"successfully added {len(documents)} documents to vector store")
            print(f"Total documents in collection: {self.collection.count()}")
        except Exception as e:
            print(f"Error adding docs to vector store: {e}")
            raise
# Create the store with the default collection name ("pdf_info") and
# directory ("../vector_store"); the constructor initializes the collection.
vector_store = VectorDB()
vector_store

Vector_store initialized pdf_info


<__main__.VectorDB at 0x217907cda10>

In [8]:
# Raw chunk strings, in the same order as `chunk`, to feed the embedder.
texts = [doc.page_content for doc in chunk]
# Show a small sample only; echoing every chunk floods the notebook output.
texts[:2]

['LangChain\nVasilios Mavroudis\nAlan Turing Institute\nvmavroudis@turing.ac.uk\nAbstract. LangChain is a rapidly emerging framework that offers a ver-\nsatile and modular approach to developing applications powered by large\nlanguage models (LLMs). By leveraging LangChain, developers can sim-\nplify complex stages of the application lifecycle—such as development,\nproductionization, and deployment—making it easier to build scalable,\nstateful, and contextually aware applications. It provides tools for han-\ndling chat models, integrating retrieval-augmented generation (RAG),\nand offering secure API interactions. With LangChain, rapid deployment\nof sophisticated LLM solutions across diverse domains becomes feasible.\nHowever, despite its strengths, LangChain’s emphasis on modularity and\nintegration introduces complexities and potential security concerns that\nwarrant critical examination. This paper provides an in-depth analysis\nof LangChain’s architecture and core components, incl

In [9]:
# Embed every chunk text, then store the chunks together with their vectors.
# add_docs pairs them by position and rejects inputs of different lengths.
embeddings = embedding_manager.generate(texts)
vector_store.add_docs(chunk, embeddings)


Embeddings 497 generated
successfully added 497 documents to vector store
Total documents in collection: 497


In [20]:
# Retrieval Pipeline

class RAG:
    """Retrieve the stored chunks closest to a free-text query."""

    def __init__(self, vector_store:"VectorDB", embedding_manager:"Embedding_manager"):
        """Keep references to the store to search and the embedder for queries.

        Args:
            vector_store: Object exposing a Chroma ``collection`` attribute.
            embedding_manager: Object exposing ``generate(list_of_str)``.
        """
        self.vector_store = vector_store
        self.embedding_manager = embedding_manager

    @staticmethod
    def _to_similarity(distance:float, space:str) -> float:
        """Convert a Chroma distance into a similarity score.

        Chroma's "cosine" and "ip" distances are both ``1 - score``. Its
        default "l2" distance is the *squared* Euclidean distance, which for
        unit-length vectors equals ``2 - 2*cos``, hence ``cos = 1 - d/2``.
        NOTE(review): the l2 branch assumes the embedding model emits
        normalized vectors; confirm for the model in use.
        """
        if space == "l2":
            return 1.0 - distance / 2.0
        return 1.0 - distance

    def retrieve(self, query:str, top_k:int = 5, threshold:float = 0.0)-> List[Dict[str, Any]]:
        """Return up to ``top_k`` stored chunks similar to ``query``.

        Args:
            query: Text to search for.
            top_k: Number of neighbours requested from the collection.
            threshold: Minimum similarity a hit needs to be kept.

        Returns:
            A list of dicts with keys ``ids``, ``content``, ``metadata``,
            ``distance`` (raw Chroma distance), ``similarity`` and ``rank``
            (1-based position in the unfiltered result list). Returns an empty
            list when nothing qualifies or when any error occurs; errors are
            printed rather than raised.
        """
        print(f"Top_k = {top_k}, threshold = {threshold}")
        try:
            query_embeddings = self.embedding_manager.generate([query])[0]

            collection = self.vector_store.collection
            results = collection.query(
                query_embeddings = [query_embeddings.tolist()],
                n_results = top_k,
                include=["metadatas", "documents", "distances"]
            )

            # The distance metric is recorded in the collection metadata under
            # "hnsw:space"; when it is absent Chroma uses "l2".
            collection_meta = getattr(collection, "metadata", None) or {}
            space = collection_meta.get("hnsw:space", "l2")

            retrieved_docs = []

            if results["documents"] and results["documents"][0]:
                documents = results["documents"][0]
                metadatas = results["metadatas"][0]
                distances = results["distances"][0]
                ids = results["ids"][0]

                for i, (doc_id, document, metadata, distance) in enumerate(zip(ids, documents, metadatas, distances)):
                    # A plain `1 - distance` misreads squared-L2 distances: it
                    # goes negative for any distance above 1, so the default
                    # threshold of 0.0 silently discarded valid hits.
                    similarity = self._to_similarity(distance, space)
                    if similarity < threshold:
                        continue
                    retrieved_docs.append({
                        "ids":doc_id,
                        "content":document,
                        "metadata":metadata,
                        "distance":distance,
                        # Expose the score the threshold was applied to.
                        "similarity":similarity,
                        "rank":i+1
                    })
            if not retrieved_docs:
                print("Document not found")

            return retrieved_docs

        except Exception as e:
            # Deliberate best-effort: report the failure and return no hits.
            print(f"error during retrivel |-> {e}")
            return []
        
# Wire the retriever to the populated store and the same embedder that
# produced the stored vectors.
rag = RAG(vector_store, embedding_manager)
rag

<__main__.RAG at 0x21794998510>

In [26]:
# Smoke-test retrieval with the default top_k (5) and threshold (0.0).
response = rag.retrieve("what is RAG")
response

Top_k = 5, threshold = 0.0
Embeddings 1 generated


[{'ids': 'Doc_47ae39f46f_265',
  'content': 'Highlights in Science, Engineering and Technology \nCSIC 2024\nVolume 124 (2025) \n \n136 \n4. Solutions and Advancements in RAG Systems \nRecent years have witnessed significant advancements in RAG systems, addressing key challenges \nin retrieval efficiency, scalability, and knowledge integration. This section explores two innovative \napproaches that represent the cutting edge of RAG technology. Self-RAG, introduced by Asai et al., \nand represents a significant shift in RAG system design. It incorporates retrieval, generation, and \nevaluation into a single framework, allowing the model to iteratively improve its own performance. \nThis self-improving capability addresses challenges in query reformulation and result quality \nassessment, potentially leading to more accurate and relevant responses over time [15]. \nGraphRAG, developed by Microsoft, takes a different approach by leveraging graph structures for',
  'metadata': {'modDate': "