### RAG Pipelines – Data Ingestion to Vector DB Pipeline

In [1]:
import os
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from pathlib import Path

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
### Read all the PDFs inside the directory
def process_all_pdfs(pdf_directory):
    """Load every PDF found under a directory into a single list of documents.

    The directory is searched recursively for ``*.pdf`` files. Each file is
    read with ``PyPDFLoader`` and every document the loader returns is tagged
    with two extra metadata keys: ``source_file`` (the PDF's file name) and
    ``file_type`` (always ``'pdf'``).

    Args:
        pdf_directory: Path (string or path-like) of the directory to scan.

    Returns:
        A list with the documents of all PDFs, grouped per file in sorted
        path order. Empty when no PDF file is found.
    """
    all_documents = []
    pdf_dir = Path(pdf_directory)

    # Find all PDF files recursively. glob() yields entries in filesystem
    # order, which is not stable across machines; sorting keeps the document
    # order (and anything indexed from it later) reproducible between runs.
    pdf_files = sorted(pdf_dir.glob("**/*.pdf"))

    for pdf_file in pdf_files:
        loader = PyPDFLoader(str(pdf_file))
        documents = loader.load()
        # Add source information to metadata
        for doc in documents:
            doc.metadata['source_file'] = pdf_file.name
            doc.metadata['file_type'] = 'pdf'
        all_documents.extend(documents)

    print(f"\nTotal documents loaded: {len(all_documents)}")
    return all_documents

# Process all PDFs in the data directory.
# The location can be overridden with the TRIPTELLER_DATA_DIR environment
# variable so the notebook also runs on machines where the original absolute
# path does not exist; when the variable is unset the original path is used.
PDF_DATA_DIR = os.environ.get("TRIPTELLER_DATA_DIR", r"D:\ML_Projects\TripTeller-RAG-Chatbot\Data")
all_pdf_documents = process_all_pdfs(PDF_DATA_DIR)


Total documents loaded: 285


In [5]:
### Split the text into chunks

def split_documents(documents, chunk_size, chunk_overlap):
    """Break documents into smaller pieces with a recursive character splitter.

    Args:
        documents: Documents to split; handed unchanged to the splitter's
            ``split_documents`` method.
        chunk_size: Forwarded as the splitter's ``chunk_size``.
        chunk_overlap: Forwarded as the splitter's ``chunk_overlap``.

    Returns:
        Whatever the splitter's ``split_documents`` call returns.
    """
    # Lengths are measured with len(); separators are tried in the listed
    # order: paragraph break, line break, space, then the empty string.
    splitter_options = {
        "chunk_size": chunk_size,
        "chunk_overlap": chunk_overlap,
        "length_function": len,
        "separators": ["\n\n", "\n", " ", ""],
    }
    return RecursiveCharacterTextSplitter(**splitter_options).split_documents(documents)


In [6]:
# Split the loaded PDF documents using chunk_size=1000 and chunk_overlap=200.
chunks=split_documents(all_pdf_documents, chunk_size=1000, chunk_overlap=200)
# Bare last expression: the notebook displays the complete list of chunks.
chunks

[Document(metadata={'producer': 'iLovePDF', 'creator': 'CorelDRAW X8', 'creationdate': '2024-02-28T16:06:20+05:30', 'author': 'Admin', 'title': 'Full book Text.cdr', 'moddate': '2024-02-28T11:56:15+00:00', 'source': 'D:\\ML_Projects\\TripTeller-RAG-Chatbot\\Data\\India Tourism Statistics 2023-English.pdf', 'total_pages': 216, 'page': 0, 'page_label': '1', 'source_file': 'India Tourism Statistics 2023-English.pdf', 'file_type': 'pdf'}, page_content='INDIA\nTourism\nStatistics\n2023\nMinistry of Tourism\n Government of India'),
 Document(metadata={'producer': 'iLovePDF', 'creator': 'CorelDRAW X8', 'creationdate': '2024-02-28T16:06:20+05:30', 'author': 'Admin', 'title': 'Full book Text.cdr', 'moddate': '2024-02-28T11:56:15+00:00', 'source': 'D:\\ML_Projects\\TripTeller-RAG-Chatbot\\Data\\India Tourism Statistics 2023-English.pdf', 'total_pages': 216, 'page': 1, 'page_label': '2', 'source_file': 'India Tourism Statistics 2023-English.pdf', 'file_type': 'pdf'}, page_content='iMinistry of To

### Embedding and Vector Store DB

In [None]:
import numpy as np
from sentence_transformers import SentenceTransformer
import chromadb
import uuid
from typing import List, Dict, Any

In [7]:
class EmbeddingManager:
    """Thin wrapper that owns a SentenceTransformer model and embeds text with it."""

    def __init__(self, model_name: str = "all-MiniLM-L6-v2"):
        """Store the model name and load the model right away.

        Args:
            model_name: Name passed to ``SentenceTransformer``.
        """
        self.model_name = model_name
        self.model = None  # replaced by the loaded model in _load_model()
        self._load_model()

    def _load_model(self):
        """Instantiate the SentenceTransformer named by ``self.model_name``."""
        self.model = SentenceTransformer(self.model_name)

    def generate_embeddings(self, texts: List[str]) -> np.ndarray:
        """Encode ``texts`` with the loaded model, showing a progress bar.

        Args:
            texts: Strings to embed.

        Returns:
            The value returned by the model's ``encode`` call.
        """
        return self.model.encode(texts, show_progress_bar=True)

# Build the shared embedding manager; constructing it loads the default
# "all-MiniLM-L6-v2" model.
embedding_manager=EmbeddingManager()
embedding_manager

<__main__.EmbeddingManager at 0x273dea1a2d0>

### VectorStore

In [None]:
class VectorStore:
    """Manages document embeddings in a ChromaDB vector store"""

    def __init__(self, collection_name: str = "pdf_documents", persist_directory: str = "../data/vector_store"):
        """Record the store settings and open the persistent collection.

        Args:
            collection_name: Name of the ChromaDB collection to get or create.
            persist_directory: Directory handed to ``chromadb.PersistentClient``;
                it is created if it does not exist.
        """
        self.collection_name = collection_name
        self.persist_directory = persist_directory
        self.client = None
        self.collection = None
        self._initialize_store()

    def _initialize_store(self):
        """Create the persist directory, the Chroma client and the collection."""
        os.makedirs(self.persist_directory, exist_ok=True)
        self.client = chromadb.PersistentClient(path=self.persist_directory)

        self.collection = self.client.get_or_create_collection(
            name=self.collection_name,
            metadata={"description": "PDF document embeddings for RAG"})

    def add_documents(self, documents: List[Any], embeddings: np.ndarray):
        """Store documents and their embeddings in the collection.

        IDs are derived deterministically from each document's ``source`` and
        ``page`` metadata plus its text, and rows are written with ``upsert``.
        Re-running ingestion on the same documents therefore overwrites the
        existing rows instead of appending duplicates to the persistent
        collection.

        Args:
            documents: Objects exposing ``metadata`` (a mapping) and
                ``page_content`` (a string).
            embeddings: One embedding per document, in the same order.

        Raises:
            ValueError: If the number of documents and embeddings differ.
        """
        if len(documents) != len(embeddings):
            raise ValueError("Number of documents must match number of embeddings")

        # Nothing to write; skip the collection call for an empty batch.
        if len(documents) == 0:
            print("No documents to add to vector store")
            return

        # Prepare data for ChromaDB
        ids = []
        metadatas = []
        documents_text = []
        embeddings_list = []
        # Counts how often an identical (source, page, text) key has been seen
        # in this batch so that repeated chunks still receive distinct IDs.
        seen_keys: Dict[str, int] = {}

        for i, (doc, embedding) in enumerate(zip(documents, embeddings)):
            metadata = dict(doc.metadata)

            # Generate a stable, unique ID for this chunk
            key = f"{metadata.get('source', '')}|{metadata.get('page', '')}|{doc.page_content}"
            occurrence = seen_keys.get(key, 0)
            seen_keys[key] = occurrence + 1
            stable_id = uuid.uuid5(uuid.NAMESPACE_OID, f"{key}|{occurrence}").hex
            ids.append(f"doc_{stable_id}")

            metadata['doc_index'] = i
            metadata['content_length'] = len(doc.page_content)
            metadatas.append(metadata)

            documents_text.append(doc.page_content)
            # np.asarray lets plain Python sequences through as well as ndarray rows.
            embeddings_list.append(np.asarray(embedding).tolist())

        # Insert new rows / overwrite rows that already exist under the same ID
        self.collection.upsert(
            ids=ids,
            embeddings=embeddings_list,
            metadatas=metadatas,
            documents=documents_text
        )
        print(f"Successfully added {len(documents)} documents to vector store")
        print(f"Total documents in collection: {self.collection.count()}")

# Open (or create) the persistent collection with the class defaults:
# collection "pdf_documents" stored under "../data/vector_store".
vectorstore=VectorStore()
vectorstore

<__main__.VectorStore at 0x273dfe2ff90>

In [None]:
### Convert the text to embeddings
texts=[doc.page_content for doc in chunks]
## Generate the embeddings
embeddings=embedding_manager.generate_embeddings(texts)
## Store in the vector database
vectorstore.add_documents(chunks,embeddings)

Batches:   0%|          | 0/16 [00:00<?, ?it/s]

Batches: 100%|██████████| 16/16 [00:12<00:00,  1.23it/s]


Successfully added 506 documents to vector store
Total documents in collection: 506


### Retriever Pipeline From VectorStore

In [10]:
class RAGRetriever:
    """Answers a text query with the closest entries of a vector store collection."""

    def __init__(self, vector_store: VectorStore, embedding_manager: EmbeddingManager):
        """Keep references to the store to search and the embedder used for queries."""
        self.vector_store = vector_store
        self.embedding_manager = embedding_manager

    def retrieve(self, query: str, top_k: int = 5, score_threshold: float = 0.0) -> List[Dict[str, Any]]:
        """Embed ``query``, search the collection and return the matching entries.

        Args:
            query: Text to search for.
            top_k: Number of results requested from the collection.
            score_threshold: Minimum ``similarity_score`` an entry needs to be kept.

        Returns:
            A list of dicts with the keys ``id``, ``content``, ``metadata``,
            ``similarity_score``, ``distance`` and ``rank``. ``rank`` is the
            1-based position in the raw query result, assigned before the
            threshold filter is applied. The list is empty when nothing is
            found or nothing passes the threshold.
        """
        # Embed the query text (single-item batch, take the only row)
        query_vector = self.embedding_manager.generate_embeddings([query])[0]

        # Ask the collection for the nearest entries
        results = self.vector_store.collection.query(
            query_embeddings=[query_vector.tolist()],
            n_results=top_k,
        )

        # Guard: no result rows for our single query
        if not (results['documents'] and results['documents'][0]):
            print("No documents found")
            return []

        # Each result field is a list per query; index 0 is our only query
        hits = zip(
            results['ids'][0],
            results['documents'][0],
            results['metadatas'][0],
            results['distances'][0],
        )

        retrieved_docs = []
        for rank, (doc_id, document, metadata, distance) in enumerate(hits, start=1):
            # Score is simply 1 - distance.
            # NOTE(review): this equals cosine similarity only when the
            # collection uses cosine distance; the collection's distance
            # metric is not configured in the visible code - confirm.
            similarity_score = 1 - distance
            if similarity_score >= score_threshold:
                retrieved_docs.append({
                    'id': doc_id,
                    'content': document,
                    'metadata': metadata,
                    'similarity_score': similarity_score,
                    'distance': distance,
                    'rank': rank
                })

        print(f"Retrieved {len(retrieved_docs)} documents (after filtering)")
        return retrieved_docs

# Wire the retriever to the vector store and embedding manager created in earlier cells.
rag_retriever=RAGRetriever(vectorstore,embedding_manager)

In [None]:
# Try a retrieval with the method defaults (top_k=5, score_threshold=0.0).
rag_retriever.retrieve("List nearby places of Munnar")

### RAG Pipeline- VectorDB To LLM Output Generation

In [None]:
from dotenv import load_dotenv

# Load variables from a local .env file (if one exists) into the environment.
load_dotenv()

GROQ_API_KEY = os.getenv("GROQ_API_KEY")
if GROQ_API_KEY:
    os.environ["GROQ_API_KEY"] = GROQ_API_KEY
else:
    # os.environ only accepts string values: assigning the None returned by
    # os.getenv() for a missing key would raise a TypeError and abort the
    # cell, so report the problem instead.
    print("Warning: GROQ_API_KEY is not set. Add it to your environment or .env file before using the Groq LLM.")

In [None]:
from langchain_groq import ChatGroq
from langchain.prompts import PromptTemplate
from langchain.schema import HumanMessage

In [14]:
class GroqLLM:
    """Small wrapper around a ``ChatGroq`` client for context-based question answering."""

    def __init__(self, model_name: str = "gemma2-9b-it", api_key: str =None):
        """Create the ChatGroq client.

        Args:
            model_name: Model name passed to ``ChatGroq``.
            api_key: Groq API key. When omitted, the ``GROQ_API_KEY``
                environment variable is used.

        Raises:
            ValueError: If no API key is given and ``GROQ_API_KEY`` is unset
                or empty.
        """
        self.model_name = model_name
        self.api_key = api_key or os.environ.get("GROQ_API_KEY")
        # Fail early with a clear, catchable error rather than relying on
        # whatever the client does when it is built without a key.
        if not self.api_key:
            raise ValueError("Groq API key is required. Set GROQ_API_KEY environment variable or pass api_key parameter.")
        self.llm = ChatGroq(
            groq_api_key=self.api_key,
            model_name=self.model_name,
            temperature=0.1,
            max_tokens=1024
        )

    def generate_response(self, query: str, context: str, max_length: int = 500) -> str:
        """Answer ``query`` from ``context`` using the instruction-style prompt.

        Args:
            query: The user's question.
            context: Text the answer should be based on.
            max_length: Currently unused; kept so existing callers keep
                working. The client itself is created with ``max_tokens=1024``.

        Returns:
            The ``content`` of the model's response.
        """
        # Create prompt template
        prompt_template = PromptTemplate(
            input_variables=["context", "question"],
            template="""You are a helpful AI assistant. Use the following context to answer the question accurately and concisely.
            Context: {context}
            Question: {question}
            Answer: Provide a clear and informative answer based on the context above.
                    If the context doesn't contain enough information to answer the question, say so."""
        )

        # Format the prompt
        formatted_prompt = prompt_template.format(context=context, question=query)
        # Generate response
        messages = [HumanMessage(content=formatted_prompt)]
        response = self.llm.invoke(messages)
        return response.content

    def generate_response_simple(self, query: str, context: str) -> str:
        """Answer ``query`` from ``context`` using a minimal prompt.

        Args:
            query: The user's question.
            context: Text the answer should be based on.

        Returns:
            The ``content`` of the model's response.
        """
        simple_prompt = f"""Based on this context: {context}
                            Question: {query}
                            Answer:"""
        messages = [HumanMessage(content=simple_prompt)]
        response = self.llm.invoke(messages)
        return response.content

In [15]:
# Initialize Groq LLM (you'll need to set GROQ_API_KEY environment variable)
# Only ValueError is handled here; any other exception raised while building
# the client propagates and stops the cell.
try:
    groq_llm = GroqLLM(api_key=os.getenv("GROQ_API_KEY"))
except ValueError as err:
    print(f"Warning: {err}")
    print("Please set your GROQ_API_KEY environment variable to use the LLM.")
    groq_llm = None  # lets later cells detect that no LLM is available
else:
    print("Groq LLM initialized successfully!")

Groq LLM initialized successfully!


### Integrating the Vector DB Context Pipeline with LLM Output

In [29]:
### Simple RAG pipeline with Groq LLM
# Read the key from the environment; os.getenv returns None when it is unset.
groq_api_key = os.getenv("GROQ_API_KEY")

# Standalone ChatGroq client, separate from the one the GroqLLM wrapper builds;
# it is the `llm` passed to rag_simple in a later cell.
llm=ChatGroq(groq_api_key=groq_api_key,model_name="gemma2-9b-it",temperature=0.1,max_tokens=1024)

## 2. Simple RAG function: retrieve context + generate response
def rag_simple(query,retriever,llm,top_k=3):
    """Answer a question from retrieved context with a single LLM call.

    Args:
        query: The user's question.
        retriever: Object whose ``retrieve(query, top_k=...)`` returns a list
            of dicts that each carry a ``'content'`` entry.
        llm: Object whose ``invoke(...)`` returns a response exposing ``content``.
        top_k: Number of entries requested from the retriever.

    Returns:
        The LLM's answer text, or a fixed notice when no context was retrieved.
    """
    ## retrieve the context
    results=retriever.retrieve(query,top_k=top_k)
    context="\n\n".join(doc['content'] for doc in results) if results else ""
    if not context:
        return "No relevant context found to answer the question."

    ## generate the answer using GROQ LLM
    # The f-string already interpolates context and query. It must not be run
    # through str.format() afterwards: braces inside the retrieved text would
    # then be parsed as placeholders and raise (or silently be altered).
    prompt=f"""Use the following context to answer user's question.
                If the context does not contain enough information, mention it in the chat and answer the question using your general knowledge.
                Answer in a friendly, conversational way. Start with greetings or some friendly note.
                Context: {context}
                Question: {query}
                Answer:"""

    response=llm.invoke([prompt])
    return response.content

In [31]:
# End-to-end check: retrieve context for the question, then let the LLM answer it.
answer=rag_simple("Is there any reviews of Munnar from tourists?",rag_retriever,llm)
print(answer)

Batches: 100%|██████████| 1/1 [00:00<00:00, 19.98it/s]


Retrieved 3 documents (after filtering)
Hey there!  

That's a great question!  While the text talks about how popular Munnar is with tourists, it doesn't actually include any reviews from them.  

I can tell you that Munnar is known for its beautiful scenery, pleasant weather, and tea estates, so I imagine tourists have lots of positive things to say about it!  


