### DATA INGESTION

In [11]:
### document datastructure

from langchain_core.documents import Document

In [12]:
from langchain_community.document_loaders import DirectoryLoader
from langchain_community.document_loaders import PyMuPDFLoader

# 1. Point this to the folder containing your PDFs
folder_path = "../data/pdf"

# 2. Configure the DirectoryLoader
loader = DirectoryLoader(
    path=folder_path,
    glob="**/*.pdf",             # Looks for all .pdf files, including in subfolders
    loader_cls=PyMuPDFLoader,    # Tells it to use the faster PyMuPDFLoader for each file
    show_progress=True           # Shows a handy progress bar in your notebook
)

# 3. Load everything into a single list of Document objects
documents = loader.load()

# print(f"Loaded a total of {len(docs)} pages from the folder.")

100%|██████████| 2/2 [00:00<00:00, 10.71it/s]


In [13]:
# embedding and vectorstore db
import numpy as np
import os
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings
import uuid
from typing import List, Dict, Any, Tuple
from sklearn.metrics.pairwise import cosine_similarity

In [14]:
class EmbeddingManager:
    def __init__(self, model_name: str = "all-MiniLM-L6-v2"):
        """
        Initializes the EmbeddingManager with a specified sentence transformer model.

        Args:
            model_name (str): The name of the sentence transformer model to use for generating embeddings.
        """
 
        self.model_name = model_name
        self.model=None
        self._load_model()

    def _load_model(self):
        """
        Loads the specified sentence transformer model for generating embeddings.
        """
        try:
            print(f"Loading model '{self.model_name}'...")
            self.model = SentenceTransformer(self.model_name)
            print(f"Model '{self.model_name}' loaded successfully.embedding dimension: {self.model.get_sentence_embedding_dimension()}")
        except Exception as e:
            print(f"Error loading model '{self.model_name}': {e}")
            self.model = None

    def generate_embeddings(self,text: List[str]) -> np.ndarray:
        """
        Generates embeddings for a list of texts using the specified sentence transformer model.

        Args:
            text (List[str]): A list of strings for which to generate embeddings.

        Returns:
            np.ndarray: A 2D numpy array containing the embeddings for each input text.
        """

        if not self.model:
            raise ValueError("Model not loaded. Please check the model name and try again.")
        
        print(f"Generating embeddings for {len(text)} texts using model '{self.model_name}'...")
        embeddings = self.model.encode(text, show_progress_bar=True)
        print(f"Generated embeddings with shape: {embeddings.shape}")
        return embeddings
        
        
##initialize embedding manager
embedding_manager = EmbeddingManager()
embedding_manager

Loading model 'all-MiniLM-L6-v2'...


Loading weights: 100%|██████████| 103/103 [00:00<00:00, 754.64it/s, Materializing param=pooler.dense.weight]                             
[1mBertModel LOAD REPORT[0m from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


Model 'all-MiniLM-L6-v2' loaded successfully.embedding dimension: 384


<__main__.EmbeddingManager at 0x73cad3667ec0>

In [15]:
class VectorStore:
    """Manages document embeddings in a chromadb document"""

    def __init__(self, collection_name: str = "document_embeddings", persist_directory: str = "../data/vector_store"):
        """
        Initializes the VectorStore with a specified collection name for storing document embeddings.

        Args:
            collection_name (str): The name of the collection in ChromaDB to store document embeddings.
            persist_directory: The directory path where the ChromaDB database will be persisted.
        """
        self.collection_name = collection_name
        self.persist_directory = persist_directory
        self.client = None
        self.collection = None
        self.initialize_store()

    def initialize_store(self):
        """
        Initializes the ChromaDB client and collection for storing document embeddings.
        """
        try:
            os.makedirs(self.persist_directory, exist_ok=True)
            self.client = chromadb.PersistentClient(path=self.persist_directory)

            self.collection= self.client.get_or_create_collection(
                name=self.collection_name,
                metadata={"description": "pdf doc embeddings for RAG system"}
            )
            print(f"VectorStore initialized with collection '{self.collection_name}' at '{self.persist_directory}'")
            print(f"Existing document IDs in collection: {self.collection.count()}")

        except Exception as e:
            print(f"Error initializing VectorStore: {e}")
            raise
            
    
    def add_documents(self, documents: List[Document], embeddings: np.ndarray):
        """
        Adds a list of documents to the vector store by generating embeddings and storing them in ChromaDB.

        Args:
            documents (List[Document]): A list of Document objects to be added to the vector store.
            embeddings: corresponding list of embeddings for the documents
        """
        if len(documents) != len(embeddings):
            raise ValueError("The number of documents must match the number of embeddings.")
        
        print(f"Adding {len(documents)} documents to the vector store...")

        ids=[]
        metadatas=[]
        document_texts=[]
        embeddings_list=[]

        for i , (doc, embedding) in enumerate(zip(documents, embeddings)):
            doc_id = str(uuid.uuid4())
            ids.append(doc_id)

            metadata = dict(doc.metadata)
            metadata['doc_index']=i
            metadata['content-length']=len(doc.page_content)
            metadatas.append(metadata)

            document_texts.append(doc.page_content)

            embeddings_list.append(embedding.tolist())

        try:
            self.collection.add(
                ids=ids,
                metadatas=metadatas,
                documents=document_texts,
                embeddings=embeddings_list
            )
            print(f"Successfully added {len(documents)} documents to the vector store.")
        except Exception as e:
            print(f"Error adding documents to the vector store: {e}")
            raise


vectorstore=VectorStore()
vectorstore

VectorStore initialized with collection 'document_embeddings' at '../data/vector_store'
Existing document IDs in collection: 222


<__main__.VectorStore at 0x73cad362de80>

In [16]:
texts=[doc.page_content for doc in documents]

embeddings=embedding_manager.generate_embeddings(texts)

vectorstore.add_documents(documents,embeddings)

Generating embeddings for 66 texts using model 'all-MiniLM-L6-v2'...


Batches: 100%|██████████| 3/3 [00:01<00:00,  1.82it/s]

Generated embeddings with shape: (66, 384)
Adding 66 documents to the vector store...
Successfully added 66 documents to the vector store.





In [17]:
class RAGRetriever:
    def __init__(self, vector_store: VectorStore, embedding_manager: EmbeddingManager):
        self.vector_store = vector_store
        self.embedding_manager = embedding_manager

    def retrieve(self, query: str, top_k: int = 5 , score_threshold: float=0.0) -> List[Dict[str, Any]]:
        """Retrieves relevant documents from the vector store based on a query.
            Args:
                query (str): The input query for which to retrieve relevant documents.
                top_k (int): The number of top relevant documents to retrieve.
                score_threshold (float): The minimum cosine similarity score required for a document to be considered relevant.

                
        Returns:
            List of dictionaries containing retrieved documents and metadata
        """
        print(f"Retrieving relevant documents for query: '{query}'")
        print(f"top k: {top_k}")

        query_embedding= self.embedding_manager.generate_embeddings([query])[0]
        
        try:
            results=self.vector_store.collection.query(
                query_embeddings=[query_embedding.tolist()],
                n_results=top_k
            )

            retrieved_docs= []

            if results['documents'] and results['documents'][0]:
                documents=results['documents'][0]
                metadatas=results['metadatas'][0]
                distances=results['distances'][0]
                ids=results['ids'][0]

                for i ,(doc_id, document, metadata , distance) in enumerate(zip(results['ids'][0], documents, metadatas, results['distances'][0])):
                    similarity_score=1-distance
                    if similarity_score >= score_threshold:
                        retrieved_docs.append({
                            "id": doc_id,
                            "content": document,
                            "metadata": metadata,
                            "similarity_score": similarity_score,
                            'distance': distance,
                            'rank': i+1
                        })
                    print(f"Retrieved document {i+1}: ID={doc_id}, Similarity Score={similarity_score:.4f}, Metadata={metadata}")

                    
            else:
                print("No relevant documents found for the query.")

            return retrieved_docs
        
        except Exception as e:
            print(f"Error retrieving documents: {e}")
            return []


In [18]:
ragretriever=RAGRetriever(vectorstore,embedding_manager)
ragretriever.retrieve('What are the signs of stroke and what should you do?')

Retrieving relevant documents for query: 'What are the signs of stroke and what should you do?'
top k: 5
Generating embeddings for 1 texts using model 'all-MiniLM-L6-v2'...


Batches: 100%|██████████| 1/1 [00:00<00:00, 54.02it/s]

Generated embeddings with shape: (1, 384)
Retrieved document 1: ID=f742ac25-66f4-4c68-a0c1-0eb9dfd1e53a, Similarity Score=0.4540, Metadata={'modDate': "D:20050607145633+03'00'", 'content-length': 1569, 'doc_index': 37, 'trapped': '', 'total_pages': 48, 'creationdate': '2005-04-27T17:21:18+00:00', 'title': '39140_OMS.indd', 'format': 'PDF 1.3', 'creationDate': 'D:20050427172118Z', 'author': '', 'source': '../data/pdf/publications-avoiding-english.pdf', 'subject': '', 'producer': 'Adobe PDF Library 6.0', 'moddate': '2005-06-07T14:56:33+03:00', 'page': 19, 'file_path': '../data/pdf/publications-avoiding-english.pdf', 'keywords': '', 'creator': 'Adobe InDesign CS (3.0)'}
Retrieved document 2: ID=af2dac22-fc89-45cc-85ef-6340ede3bf98, Similarity Score=0.4540, Metadata={'content-length': 1569, 'total_pages': 48, 'author': '', 'source': '../data/pdf/publications-avoiding-english.pdf', 'moddate': '2005-06-07T14:56:33+03:00', 'creationdate': '2005-04-27T17:21:18+00:00', 'file_path': '../data/pdf




[{'id': 'f742ac25-66f4-4c68-a0c1-0eb9dfd1e53a',
  'metadata': {'modDate': "D:20050607145633+03'00'",
   'content-length': 1569,
   'doc_index': 37,
   'trapped': '',
   'total_pages': 48,
   'creationdate': '2005-04-27T17:21:18+00:00',
   'title': '39140_OMS.indd',
   'format': 'PDF 1.3',
   'creationDate': 'D:20050427172118Z',
   'author': '',
   'source': '../data/pdf/publications-avoiding-english.pdf',
   'subject': '',
   'producer': 'Adobe PDF Library 6.0',
   'moddate': '2005-06-07T14:56:33+03:00',
   'page': 19,
   'file_path': '../data/pdf/publications-avoiding-english.pdf',
   'keywords': '',
   'creator': 'Adobe InDesign CS (3.0)'},
  'similarity_score': 0.4540315866470337,
  'distance': 0.5459684133529663,
  'rank': 1},
 {'id': 'af2dac22-fc89-45cc-85ef-6340ede3bf98',
  'metadata': {'content-length': 1569,
   'total_pages': 48,
   'author': '',
   'source': '../data/pdf/publications-avoiding-english.pdf',
   'moddate': '2005-06-07T14:56:33+03:00',
   'creationdate': '2005-04-

In [32]:
##integration vectordb context pipeline with LLM output
import os
from langchain_groq import ChatGroq
from dotenv import load_dotenv
load_dotenv()

groq_api_key = os.getenv("GROQ_API_KEY")

llm= ChatGroq(groq_api_key=groq_api_key,model_name="llama-3.1-8b-instant",temperature=0.1,max_tokens=1024)


## rag function
def rag_simple(query, retriever,llm,top_k=3):
    ##retrieve the context  
    results= retriever.retrieve(query,top_k=top_k)
    context="\n\n".join([doc['content'] for doc in results]) if results else ""

    if not context:
        return "No relevant context found in answer the question."
    

    prompt=f""" 
    Use the following context to answer questions concisely
    context:{context}

    Question:{query}
    Answer:
    
    """

    response=llm.invoke(prompt.format(contex=context,query=query))
    return response.content

In [34]:
answer=rag_simple("How is the prime minister of India?",ragretriever,llm,3)
print(answer)

Retrieving relevant documents for query: 'How is the prime minister of India?'
top k: 3
Generating embeddings for 1 texts using model 'all-MiniLM-L6-v2'...


Batches: 100%|██████████| 1/1 [00:00<00:00, 45.31it/s]

Generated embeddings with shape: (1, 384)
Retrieved document 1: ID=b0caf4b2-b5ec-41af-80e7-5feed75223a2, Similarity Score=-0.4507, Metadata={'author': '', 'total_pages': 48, 'keywords': '', 'page': 1, 'subject': '', 'content-length': 2201, 'title': '39140_OMS.indd', 'doc_index': 19, 'format': 'PDF 1.3', 'creationDate': 'D:20050427172118Z', 'trapped': '', 'moddate': '2005-06-07T14:56:33+03:00', 'file_path': '../data/pdf/publications-avoiding-english.pdf', 'producer': 'Adobe PDF Library 6.0', 'creationdate': '2005-04-27T17:21:18+00:00', 'source': '../data/pdf/publications-avoiding-english.pdf', 'creator': 'Adobe InDesign CS (3.0)', 'modDate': "D:20050607145633+03'00'"}
Retrieved document 2: ID=a4087a46-d646-4f0e-9342-7e23ce52dc79, Similarity Score=-0.4507, Metadata={'author': '', 'source': '../data/pdf/publications-avoiding-english.pdf', 'creator': 'Adobe InDesign CS (3.0)', 'title': '39140_OMS.indd', 'page': 1, 'subject': '', 'content-length': 2201, 'trapped': '', 'keywords': '', 'creat


