### Data Ingestion


In [26]:
from langchain_community.document_loaders import PyPDFLoader,PyMuPDFLoader
from langchain_community.document_loaders import DirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
import os
from pathlib import Path

In [27]:
# Directory loader rooted at ../docs/pdfs; each file it picks up is parsed with PyMuPDFLoader.
dir_loader = DirectoryLoader("../docs/pdfs", loader_cls=PyMuPDFLoader)

In [28]:
# Run the directory loader and keep every loaded Document for the later cells.
all_documents = dir_loader.load()
# Preview only the first Document: echoing the whole list floods the cell output
# and bloats the saved notebook.
all_documents[:1]

[Document(metadata={'producer': 'PDFlib+PDI 9.2.0 (C++/Win64); modified using iTextSharp™ 5.5.13.4 ©2000-2024 iText Group NV (AGPL-version)', 'creator': 'PTC Arbortext Layout Developer 12.0.5638/W-x64', 'creationdate': '2026-01-02T16:45:07+05:30', 'source': '..\\docs\\pdfs\\A systematic review and comparative analysis of deep learning models for Twitter X-based traffic event detection.pdf', 'file_path': '..\\docs\\pdfs\\A systematic review and comparative analysis of deep learning models for Twitter X-based traffic event detection.pdf', 'total_pages': 29, 'format': 'PDF 1.5', 'title': 'A systematic review and comparative analysis of deep learning models for Twitter/X-based traffic eve', 'author': '', 'subject': '', 'keywords': '', 'moddate': '2026-01-02T16:46:42+05:30', 'trapped': '', 'modDate': "D:20260102164642+05'30'", 'creationDate': "D:20260102164507+05'30'", 'page': 0}, page_content='International Journal of Digital Earth\nISSN: 1753-8947 (Print) 1753-8955 (Online) Journal homepa

Chunks


In [29]:
def split_documents(documents, chunk_size=1000, chunk_overlap=200):
    """Split documents into overlapping chunks with RecursiveCharacterTextSplitter.

    Args:
        documents: Documents to split; passed straight to the splitter's
            ``split_documents`` method.
        chunk_size: Maximum chunk length, measured with ``len``.
        chunk_overlap: Number of characters shared between neighbouring chunks.

    Returns:
        The list of chunk documents produced by the splitter.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        # Ordered coarse -> fine. The empty string must be LAST: the splitter
        # stops at the first separator that applies, and "" always applies, so
        # anything listed after it (previously " ") was never tried and long
        # lines were cut in the middle of words.
        separators=["\n\n", "\n", " ", ""],
    )
    split_docs = text_splitter.split_documents(documents)
    print(f"Split {len(documents)} documents into {len(split_docs)} chunks")
    return split_docs



In [30]:
# Chunk every loaded document using the default chunk size and overlap.
chunks = split_documents(all_documents)
# Preview only the first chunk: echoing the whole list floods the cell output.
chunks[:1]

Split 82 documents into 492 chunks


[Document(metadata={'producer': 'PDFlib+PDI 9.2.0 (C++/Win64); modified using iTextSharp™ 5.5.13.4 ©2000-2024 iText Group NV (AGPL-version)', 'creator': 'PTC Arbortext Layout Developer 12.0.5638/W-x64', 'creationdate': '2026-01-02T16:45:07+05:30', 'source': '..\\docs\\pdfs\\A systematic review and comparative analysis of deep learning models for Twitter X-based traffic event detection.pdf', 'file_path': '..\\docs\\pdfs\\A systematic review and comparative analysis of deep learning models for Twitter X-based traffic event detection.pdf', 'total_pages': 29, 'format': 'PDF 1.5', 'title': 'A systematic review and comparative analysis of deep learning models for Twitter/X-based traffic eve', 'author': '', 'subject': '', 'keywords': '', 'moddate': '2026-01-02T16:46:42+05:30', 'trapped': '', 'modDate': "D:20260102164642+05'30'", 'creationDate': "D:20260102164507+05'30'", 'page': 0}, page_content='International Journal of Digital Earth\nISSN: 1753-8947 (Print) 1753-8955 (Online) Journal homepa

In [31]:
import numpy as np
import chromadb
from sentence_transformers import SentenceTransformer
from chromadb.config import Settings
import uuid
from typing import List,Tuple,Dict,Any
from sklearn.metrics.pairwise import cosine_similarity


In [32]:
# Embedding manager backed by a Hugging Face sentence-transformers model (default: all-MiniLM-L6-v2)
class embeddingManager:
    """Wrapper around a SentenceTransformer model that turns texts into embeddings.

    The model named by ``model_name`` is loaded as soon as the manager is constructed.
    """

    def __init__(self, model_name: str = "all-MiniLM-L6-v2"):
        """Store the model name and load the model immediately.

        Args:
            model_name: Name handed to ``SentenceTransformer``.

        Raises:
            Exception: Any error raised while loading the model is re-raised.
        """
        self.model_name = model_name
        self.model = None
        self.load_model()

    def load_model(self):
        """Instantiate the SentenceTransformer and print its embedding dimension.

        Raises:
            Exception: Loading errors are printed and then re-raised unchanged.
        """
        try:
            print(f"Loading embedding model:{self.model_name}")
            self.model = SentenceTransformer(self.model_name)
            print(f"Loaded model successfully,embedding dimensions:{self.model.get_sentence_embedding_dimension()}")
        except Exception as e:
            print(f"Error in Loading the model:{e}")
            raise

    def generate_embeddings(self, texts: List[str]) -> np.ndarray:
        """Encode a list of texts with the loaded model.

        Args:
            texts: Texts to embed.

        Returns:
            Whatever ``model.encode`` returns for ``texts``; its ``shape`` is printed.

        Raises:
            ValueError: If no model has been loaded (``self.model`` is ``None``).
        """
        # Compare against None explicitly: relying on truthiness would treat a
        # loaded model object whose __len__ is 0 as "missing".
        if self.model is None:
            raise ValueError("Model not found")
        embeddings = self.model.encode(texts, show_progress_bar=True)
        print(f"Generated Embeddings with shape {embeddings.shape}")
        return embeddings
# Shared embedding manager for the rest of the notebook (default model name).
embedding_manager = embeddingManager()
embedding_manager


    



Loading embedding model:all-MiniLM-L6-v2


Loading weights: 100%|██████████| 103/103 [00:00<00:00, 1042.51it/s, Materializing param=pooler.dense.weight]                             
BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


Loaded model successfully,embedding dimensions:384


<__main__.embeddingManager at 0x22777202f10>

Vector store


In [33]:
class vectorstore:
    """Persistent ChromaDB collection holding document chunks and their embeddings."""

    def __init__(self, collection_name: str = "pdf_docs", persist_directory: str = "../docs/vector_store"):
        """Store the settings and open (or create) the collection.

        Args:
            collection_name: Name of the Chroma collection to get or create.
            persist_directory: Directory handed to ``chromadb.PersistentClient``.

        Raises:
            Exception: Any error raised while initialising the store is re-raised.
        """
        self.collection_name = collection_name
        self.persist_directory = persist_directory
        self.client = None
        self.collection = None
        self.initialize_store()

    def initialize_store(self):
        """Create the persist directory, open the client and get/create the collection.

        Prints the collection name and how many entries it already contains.

        Raises:
            Exception: Errors are printed and then re-raised unchanged.
        """
        try:
            # Make sure the on-disk location exists before Chroma opens it.
            os.makedirs(self.persist_directory, exist_ok=True)
            self.client = chromadb.PersistentClient(path=self.persist_directory)
            self.collection = self.client.get_or_create_collection(
                name=self.collection_name,
                metadata={"description": "PDF's documentation embedding for RAGs"},
            )
            print(f"Collection:{self.collection_name}")
            print(f"Existing docs in collection:{self.collection.count()}")
        except Exception as e:
            print(e)
            raise

    def add_documents(self, documents: List[Any], embeddings: np.ndarray):
        """Add documents and their embeddings to the vector store.

        IDs are derived deterministically from each document's source, page,
        position and text, and rows are written with ``upsert``. Re-running the
        ingestion cell on the same chunks therefore overwrites the existing
        rows instead of inserting duplicates (random IDs combined with ``add``
        grew the collection on every run).

        Args:
            documents: Objects exposing ``metadata`` (a mapping) and
                ``page_content`` (text).
            embeddings: One embedding per document, in the same order.

        Raises:
            ValueError: If the number of documents and embeddings differ.
            Exception: Errors from the collection are printed and re-raised.
        """
        if len(documents) != len(embeddings):
            raise ValueError("No of documents should match with no of embeddings")
        if len(documents) == 0:
            # Nothing to write; avoid calling the collection with empty lists.
            print("No documents to add to the store")
            return
        print(f"adding {len(documents)} documents to the store")

        ids = []
        metadatas = []
        document_texts = []
        embedding_list = []
        for i, (doc, embedding) in enumerate(zip(documents, embeddings)):
            # Copy so the caller's metadata dict is not mutated.
            metadata = dict(doc.metadata)

            # Stable ID: same source/page/position/text -> same ID on every run.
            # The index suffix keeps IDs unique inside one batch.
            fingerprint = "|".join(
                [
                    str(metadata.get("source", "")),
                    str(metadata.get("page", "")),
                    str(i),
                    doc.page_content,
                ]
            )
            ids.append(f"doc_{uuid.uuid5(uuid.NAMESPACE_URL, fingerprint).hex}_{i}")

            metadata["doc_index"] = i
            metadata["content_length"] = len(doc.page_content)
            metadatas.append(metadata)

            document_texts.append(doc.page_content)
            # asarray lets rows be numpy arrays or plain Python sequences.
            embedding_list.append(np.asarray(embedding).tolist())
        try:
            self.collection.upsert(
                ids=ids,
                embeddings=embedding_list,
                metadatas=metadatas,
                documents=document_texts,
            )
            print(f"Successfully added {len(documents)} in the vector store")
        except Exception as e:
            print(f"Error loading documents in the vector store:{e}")
            raise

# Open (or create) the default persistent collection used by the cells below.
Vectorstore = vectorstore()
Vectorstore

            

Collection:pdf_docs
Existing docs in collection:1476


<__main__.vectorstore at 0x227760c2610>

In [34]:
# Pull the raw text out of every chunk.
texts = [chunk.page_content for chunk in chunks]
# Turn the chunk texts into embedding vectors.
embeddings = embedding_manager.generate_embeddings(texts)
# Write the chunks together with their embeddings into the vector store.
Vectorstore.add_documents(chunks, embeddings)

Batches: 100%|██████████| 16/16 [00:09<00:00,  1.62it/s]


Generated Embeddings with shape (492, 384)
adding 492 documents to the store
Successfully added 492 in the vector store


Retriever pipeline from the vector store


In [62]:
class retriever:
    """Query-time helper: embeds a query and looks up the nearest stored chunks."""

    def __init__(self, vector_store: "vectorstore", embedding_manager: "embeddingManager"):
        """Keep references to the vector store and the embedding manager.

        Args:
            vector_store: Object whose ``collection`` attribute supports ``query``.
            embedding_manager: Object providing ``generate_embeddings(texts)``.
        """
        # Forward-reference annotations keep this class definable on its own,
        # independent of the cells that define the two collaborator classes.
        self.vector_store = vector_store
        self.embedding_manager = embedding_manager

    def retrieve(self, query: str, top_k: int = 5, score_threshold: float = 0) -> List[Dict[str, Any]]:
        """Return the stored chunks most similar to ``query``.

        Args:
            query: Natural-language query text.
            top_k: Number of results requested from the collection.
            score_threshold: Minimum ``similarity_score`` a hit needs to be kept.

        Returns:
            A list of dicts with the keys ``id``, ``content``, ``metadata``,
            ``similarity_score``, ``distance`` and ``rank``. ``rank`` is the
            1-based position in the unfiltered result list. An empty list is
            returned when nothing matches or when the collection query fails.
        """
        print(f"Retrieving documents for query:{query}")
        print(f"Top k:{top_k},score_threshold:{score_threshold}")
        # Embed the query; the embedding call runs outside the try block, so
        # its errors propagate to the caller.
        query_embedding = self.embedding_manager.generate_embeddings([query])[0]
        try:
            results = self.vector_store.collection.query(
                query_embeddings=[query_embedding.tolist()],
                n_results=top_k,
            )
            retrived_docs = []
            # Results are nested one list per query; only one query is sent,
            # so index 0 holds its hits.
            if results['documents'] and results['documents'][0]:
                hits = zip(
                    results['documents'][0],
                    results['metadatas'][0],
                    results['distances'][0],
                    results['ids'][0],
                )
                for i, (document, metadata, distance, doc_id) in enumerate(hits):
                    # Monotone mapping of distance to a score: distance 0 -> 1,
                    # larger distance -> closer to 0.
                    # NOTE(review): assumes non-negative distances — confirm
                    # against the collection's distance metric.
                    similarity_score = 1 / (1 + distance)
                    if similarity_score >= score_threshold:
                        retrived_docs.append({
                            'id': doc_id,
                            'content': document,
                            'metadata': metadata,
                            'similarity_score': similarity_score,
                            'distance': distance,
                            'rank': i + 1,
                        })
                print(f"retreived {len(retrived_docs)} documents after filtering")
            else:
                print("No documents found")
            return retrived_docs
        except Exception as e:
            # Best-effort: report the failure and hand back an empty result.
            print(f"Error during retrieval {e}")
            return []
# Wire the retriever to the shared vector store and embedding manager.
rag_retriever = retriever(vector_store=Vectorstore, embedding_manager=embedding_manager)


In [63]:
# Smoke-test the retriever with a sample query (default top_k and threshold).
docs = rag_retriever.retrieve(query="Explain about the docs")

Retrieving documents for query:Explain about the docs
Top k:5,score_threshold:0


Batches: 100%|██████████| 1/1 [00:00<00:00, 92.48it/s]

Generated Embeddings with shape (1, 384)
retreived 5 documents after filtering





In [49]:
# Print the text of every retrieved hit under a numbered banner.
for position, hit in enumerate(docs, start=1):
    banner = f"\n========== Document {position} =========="
    print(banner)
    print(hit["content"])