# 🔧 RAG System - FIXES APPLIED

## Problems Identified:
1. ❌ **Chunk size too small** (100 chars → only captures headers, not content)
2. ❌ **No score threshold** (accepting very poor matches)
3. ❌ **Only 1 result returned** (vector store might not have relevant chunks)

## Solutions Applied:
1. ✅ Increased chunk size to 800 characters
2. ✅ Set minimum similarity threshold to 0.3
3. ✅ Added debugging utilities
4. ✅ Better query strategies

## Step 1: Clear Old Vector Store and Start Fresh

In [None]:
import shutil
import os

# Clear every on-disk vector store so stale embeddings cannot leak into retrieval.
# '../chroma_db' is the legacy location; '../data/vector_store' is the default
# persist_directory of the VectorStore class defined later in this notebook, so
# it must be removed too or re-running the notebook keeps (and duplicates) the
# old chunks.
cleared_any = False
for store_dir in ('../chroma_db', '../data/vector_store'):
    if os.path.exists(store_dir):
        shutil.rmtree(store_dir)
        print(f"‚úì Cleared old vector store: {store_dir}")
        cleared_any = True

if not cleared_any:
    print("‚Ñπ No existing vector store found")

## Step 2: Load Documents (Same as Before)

In [None]:
from pathlib import Path
from langchain_community.document_loaders import PyPDFLoader

def process_all_pdfs(pdf_directory):
    """
    Load every PDF found under a directory into LangChain documents.

    The directory is searched recursively. Each loaded page is tagged with
    the originating file name ('source_file') and a 'file_type' of 'pdf'.
    A file that fails to load is reported and skipped, so one bad PDF does
    not abort the whole run.

    Args:
        pdf_directory: Directory to search for ``*.pdf`` files.

    Returns:
        List with the documents of all successfully loaded PDFs.
    """
    found = list(Path(pdf_directory).glob("**/*.pdf"))
    print(f"Found {len(found)} PDF files to process")

    collected = []
    for path in found:
        print(f"\nProcessing: {path.name}")
        try:
            pages = PyPDFLoader(str(path)).load()

            # Tag each page so retrieval results can be traced back to a file.
            for page in pages:
                page.metadata.update({'source_file': path.name, 'file_type': 'pdf'})

            collected += pages
            print(f"  ‚úì Loaded {len(pages)} pages")
        except Exception as err:
            # Best effort: report the failure and continue with the next file.
            print(f"  ‚úó Error: {err}")

    print(f"\nTotal documents loaded: {len(collected)}")
    return collected

# Load every PDF under ../data (searched recursively); later cells read `all_pdf_documents`.
all_pdf_documents = process_all_pdfs("../data")

## Step 3: FIXED - Split with Optimized Chunk Size

In [None]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
import numpy as np

def split_documents(documents, chunk_size=800, chunk_overlap=200):
    """
    Split documents into overlapping chunks and print summary statistics.

    The chunk size was raised from 100 to 800 characters: at 100 characters
    a chunk only holds a heading-sized fragment, while 800 characters keeps
    a whole section together with its context.

    Args:
        documents: Documents to split.
        chunk_size: Maximum chunk length, measured with ``len``.
        chunk_overlap: Number of characters shared by neighbouring chunks.

    Returns:
        List of chunk documents produced by the splitter.
    """
    splitter = RecursiveCharacterTextSplitter(
        separators=["\n\n", "\n", ". ", " ", ""],
        length_function=len,
        chunk_overlap=chunk_overlap,
        chunk_size=chunk_size,
    )
    pieces = splitter.split_documents(documents)

    print(f"‚úì Split {len(documents)} documents into {len(pieces)} chunks")

    # Nothing to summarise when the splitter produced no chunks.
    if not pieces:
        return pieces

    sizes = [len(piece.page_content) for piece in pieces]
    divider = "=" * 70
    print(f"  Average chunk length: {np.mean(sizes):.0f} characters")
    print(f"  Min/Max lengths: {min(sizes)}/{max(sizes)} characters")
    print(f"\nüìÑ Example chunk (first 500 chars):")
    print(divider)
    print(pieces[0].page_content[:500])
    print(divider)

    return pieces

# Create chunks with the optimized size; later cells read `chunks`.
chunks = split_documents(
    all_pdf_documents,
    chunk_size=800,      # changed from 100
    chunk_overlap=200    # changed from 25
)

## Step 4: Inspect Chunks for Tech Stack Keywords

In [None]:
def inspect_chunks(chunks, keywords=("tech", "python", "java", "javascript", "stack")):
    """
    Debug utility that reports which chunks contain the given keywords.

    Matching is a case-insensitive substring test, so 'java' also matches
    'javascript'. For every keyword the number of matching chunks is printed,
    followed by the first 200 characters of the first match.

    Args:
        chunks: Sized collection of objects exposing a ``page_content`` string.
        keywords: Keywords to look for. The default is an immutable tuple so
            it can never be modified between calls.

    Returns:
        None. The report is printed.
    """
    print(f"\n{'='*70}")
    print(f"üîç INSPECTING {len(chunks)} CHUNKS FOR TECHNICAL CONTENT")
    print(f"{'='*70}\n")

    # Lower-case every chunk once instead of once per keyword.
    indexed = [(i, chunk, chunk.page_content.lower()) for i, chunk in enumerate(chunks)]

    for keyword in keywords:
        needle = keyword.lower()
        matches = [(i, chunk) for i, chunk, text in indexed if needle in text]

        print(f"Keyword: '{keyword}' ‚Üí Found in {len(matches)} chunks")

        if matches:
            # Show first match
            idx, chunk = matches[0]
            print(f"  Example from chunk #{idx}:")
            print(f"  {chunk.page_content[:200]}...")
        print()

    print(f"{'='*70}\n")

# Run the inspection with the default keyword list to confirm the chunks contain tech-stack terms
inspect_chunks(chunks)

## Step 5: Generate Embeddings and Store (Same as Before)

In [None]:
import os
import uuid
import numpy as np
import chromadb

from typing import List, Any
from sentence_transformers import SentenceTransformer

In [None]:
# EmbeddingManager and VectorStore are defined in this cell so the notebook is
# self-contained; the initialisation code at the bottom of the cell uses both.

class VectorStore:
    """Manage document embeddings in a persistent ChromaDB collection.

    New collections are created with cosine distance. The retriever in this
    notebook converts distances with ``1 - distance``, which is a cosine
    similarity only under that metric.
    """

    def __init__(self, collection_name: str = "pdf_documents", persist_directory: str = '../data/vector_store'):
        """
        Initialise the vector store

        Args:
            collection_name: Name of the chromaDB collection
            persist_directory: Directory to persist the vector store
        """
        self.collection_name = collection_name
        self.persist_directory = persist_directory
        self.client = None
        self.collection = None
        self._initialize_store()

    def _initialize_store(self):
        """Create the persistent chromaDB client and get or create the collection.

        Raises:
            Exception: Any error raised while creating the directory, client
                or collection is reported and re-raised.
        """
        try:
            # create persistent chromaDB client
            os.makedirs(self.persist_directory, exist_ok=True)
            self.client = chromadb.PersistentClient(path=self.persist_directory)

            # get or create collection
            self.collection = self.client.get_or_create_collection(
                name=self.collection_name,
                metadata={
                    "description": "PDF document embeddings for RAG",
                    # Chroma's default metric is squared L2; cosine keeps
                    # `1 - distance` a meaningful similarity score.
                    # NOTE(review): a collection that already exists keeps the
                    # metric it was created with - delete the persist
                    # directory to rebuild it with cosine distance.
                    "hnsw:space": "cosine",
                },
            )
            print(f"Vector Store initialised. Collection: {self.collection_name}")
            print(f"Existing documents in Collection: {self.collection.count()}")

        except Exception as e:
            print(f"Error initialsing vector store {e}")
            raise

    def add_documents(self, documents: List[Any], embeddings: np.ndarray):
        """
        Add documents and their embeddings to the vector store

        Documents and embeddings are paired by position. Every stored entry
        gets a fresh random id, so adding the same documents twice stores
        them twice.

        Args:
            documents: List of LangChain documents
            embeddings: Corresponding embeddings for the documents

        Raises:
            ValueError: If the number of documents and embeddings differ.
            Exception: Any error raised by the collection while adding is
                reported and re-raised.
        """
        if len(documents) != len(embeddings):
            raise ValueError("Number of documents must match the number of embeddings!!!")

        # An empty batch is a no-op; passing empty lists to the collection
        # would only produce an error.
        if len(documents) == 0:
            print("No documents to add to vector store")
            return

        print(f"Adding {len(documents)} documents to vector store...")

        # Prepare data for ChromaDB
        ids = []
        metadatas = []
        documents_text = []

        for i, doc in enumerate(documents):
            # Random prefix keeps ids unique across calls; the index keeps
            # them unique within one call.
            ids.append(f"doc_{uuid.uuid4().hex[:8]}_{i}")

            # Copy the metadata so the caller's document is not modified.
            metadatas.append({
                **doc.metadata,
                'doc_index': i,
                'content_length': len(doc.page_content),
            })

            documents_text.append(doc.page_content)

        # Convert to plain nested lists of floats in a single step.
        embeddings_list = np.asarray(embeddings).tolist()

        # Add to collection
        try:
            self.collection.add(
                ids=ids,
                embeddings=embeddings_list,
                metadatas=metadatas,
                documents=documents_text
            )
            print(f"Successfully added {len(documents)} documents to vector store")
            print(f"Total documents in collection: {self.collection.count()}")

        except Exception as e:
            print(f"Error adding documents to vector store: {e}")
            raise


class EmbeddingManager:
    """Wrap a SentenceTransformer model that turns text into embeddings."""

    def __init__(self, model_name: str = "all-miniLM-L6-v2"):
        """
        Store the model name and load the model immediately

        Args:
            model_name: HuggingFace model name for sentence embeddings
        """
        self.model = None
        self.model_name = model_name
        self._load_model()

    def _load_model(self):
        """Instantiate the SentenceTransformer; report and re-raise any failure."""
        name = self.model_name
        try:
            print(f"Loading embedding model: {name}")
            self.model = SentenceTransformer(name)
            dimension = self.model.get_sentence_embedding_dimension()
            print(f"Model Loaded successfully. Embedding dimension: {dimension}")
        except Exception as err:
            print(f"Error loading model {name}: {err}")
            raise

    def generate_embeddings(self, texts: List[str]) -> np.ndarray:
        """
        Encode a list of texts with the loaded model

        Args:
            texts: List of text strings to embed

        Returns:
            The embeddings returned by ``model.encode`` for ``texts``.

        Raises:
            ValueError: If no model is loaded.
        """
        if not self.model:
            raise ValueError("Model not loaded!!!")

        text_count = len(texts)
        print(f"Generating embeddings for {text_count} text...")
        vectors = self.model.encode(texts, show_progress_bar=True)
        print(f"Generated embeddings with shape: {vectors.shape}")
        return vectors
    
    ## initialise the embedding manager

# Initialize the embedding model wrapper and the persistent vector store
# (both use their constructor defaults); later cells read these two names.
embedding_manager = EmbeddingManager()
vectorstore = VectorStore()

# Convert text to embeddings: one text per chunk, in the same order as `chunks`
texts = [doc.page_content for doc in chunks]
embeddings = embedding_manager.generate_embeddings(texts)

# Store in vector DB; add_documents pairs chunks and embeddings by position
vectorstore.add_documents(chunks, embeddings)

## Step 6: FIXED - Better Retrieval with Score Threshold

In [None]:
from typing import List, Dict, Any

class ImprovedRAGRetriever:
    """Retrieve the stored chunks most similar to a query.

    Args:
        vectorstore: Object exposing a Chroma ``collection`` attribute.
        embedding_manager: Object whose ``generate_embeddings(texts)``
            returns one embedding per text.
    """

    # (minimum score, label) pairs, checked from best to worst.
    _QUALITY_BANDS = (
        (0.7, "‚≠ê‚≠ê‚≠ê"),
        (0.5, "‚≠ê‚≠ê"),
        (0.3, "‚≠ê"),
    )
    _POOR_QUALITY = "‚ùå"

    def __init__(self, vectorstore, embedding_manager):
        self.vectorstore = vectorstore
        self.embedding_manager = embedding_manager

    @classmethod
    def _quality_label(cls, similarity_score):
        """Map a similarity score to its quality indicator."""
        for minimum, label in cls._QUALITY_BANDS:
            if similarity_score >= minimum:
                return label
        return cls._POOR_QUALITY

    def retrieve(self, query: str, top_k: int = 5, score_threshold: float = 0.0):
        """
        Return up to ``top_k`` chunks whose score reaches ``score_threshold``.

        The default threshold is 0.0, which only drops negative scores. Pass
        0.3 to discard everything below a moderate match.

        Similarity score bands:
        - 0.7-1.0: excellent match
        - 0.5-0.7: good match
        - 0.3-0.5: moderate match
        - below 0.3: poor match

        NOTE(review): the score is ``1 - distance``. That is a cosine
        similarity only when the collection was created with cosine distance
        (``hnsw:space``). With another metric it is merely a monotonic proxy
        and the bands above do not apply - verify the collection's
        configuration.

        Args:
            query: Text to search for.
            top_k: Maximum number of results to return.
            score_threshold: Minimum similarity score a result must reach.

        Returns:
            List of dicts with the keys 'id', 'content', 'metadata',
            'similarity_score', 'distance', 'rank' and 'quality', best match
            first. An empty list is returned when nothing qualifies or when
            the query fails (the error is printed, not raised).
        """
        print(f"\n{'='*70}")
        print(f"üîç QUERY: '{query}'")
        print(f"‚öôÔ∏è  Settings: top_k={top_k}, score_threshold={score_threshold}")
        print(f"{'='*70}\n")

        query_embedding = self.embedding_manager.generate_embeddings([query])[0]

        try:
            collection = self.vectorstore.collection

            available = collection.count()
            if available == 0:
                print("‚ùå No documents found in vector store")
                return []

            # Fetch twice top_k so threshold filtering can still fill top_k,
            # but never request more rows than the collection holds.
            results = collection.query(
                query_embeddings=[query_embedding.tolist()],
                n_results=min(top_k * 2, available),
            )

            if not (results['documents'] and results['documents'][0]):
                print("‚ùå No documents found in vector store")
                return []

            candidates = zip(
                results['ids'][0],
                results['documents'][0],
                results['metadatas'][0],
                results['distances'][0],
            )

            retrieved_documents = []
            # 'rank' is the position in the raw result list, counted from 1.
            for rank, (doc_id, document, metadata, distance) in enumerate(candidates, start=1):
                similarity_score = 1 - distance
                if similarity_score < score_threshold:
                    continue

                retrieved_documents.append({
                    'id': doc_id,
                    'content': document,
                    'metadata': metadata,
                    'similarity_score': similarity_score,
                    'distance': distance,
                    'rank': rank,
                    'quality': self._quality_label(similarity_score),
                })

            retrieved_documents = retrieved_documents[:top_k]

            if not retrieved_documents:
                print(f"‚ö†Ô∏è  No documents found above threshold {score_threshold}")
                print("   Try lowering score_threshold or rephrasing your query")
                return retrieved_documents

            print(f"‚úÖ Retrieved {len(retrieved_documents)} relevant documents:\n")

            for doc in retrieved_documents:
                # Entries stored without metadata must not break the summary.
                source = (doc['metadata'] or {}).get('source_file', 'unknown')
                print(f"{doc['quality']} Rank {doc['rank']} | Score: {doc['similarity_score']:.3f} | "
                      f"Source: {source}")
                print(f"   Content: {doc['content'][:150]}...")
                print()

            return retrieved_documents

        except Exception as e:
            # Best effort: a failed lookup is reported and yields no results.
            print(f"‚ùå Error during retrieval: {e}")
            return []

# Initialize the improved retriever on top of the store and embedding model built above;
# later cells read `rag_retriever`.
rag_retriever = ImprovedRAGRetriever(vectorstore, embedding_manager)

## Step 7: Test Different Queries

In [None]:
# Test Query 1: the original query, with a permissive threshold of 0.2
results = rag_retriever.retrieve(
    "ANshuman full stack github?",
    top_k=5,
    score_threshold=0.2
)

In [None]:
# Test Query 2: a more specific, keyword-rich query at the 0.3 threshold
results = rag_retriever.retrieve(
    "python java javascript programming languages skills",
    top_k=5,
    score_threshold=0.3
)

In [None]:
# Test Query 3: broader query with a lower threshold and more results;
# `results` from this cell is what the analysis step below inspects.
results = rag_retriever.retrieve(
    "technical skills programming",
    top_k=10,
    score_threshold=0.2  # Lower threshold to see more results
)

## Step 8: Analyze Results

In [None]:
def analyze_results(results):
    """
    Print summary statistics for a list of retrieval results.

    Reports the number of results, the lowest, highest and mean
    'similarity_score', and the full 'content' of the first result.
    An empty or missing result list only prints a notice.

    Args:
        results: List of result dicts with 'similarity_score' and 'content' keys.

    Returns:
        None. The analysis is printed.
    """
    if not results:
        print("‚ùå No results to analyze")
        return

    banner = "=" * 70
    print(f"\n{banner}")
    print("üìä RESULTS ANALYSIS")
    print(f"{banner}\n")

    similarity_scores = [entry['similarity_score'] for entry in results]
    lowest, highest = min(similarity_scores), max(similarity_scores)
    print(f"Number of results: {len(results)}")
    print(f"Score range: {lowest:.3f} - {highest:.3f}")
    print(f"Average score: {np.mean(similarity_scores):.3f}")

    top_result = results[0]
    print("\nFull content of top result:\n")
    print(banner)
    print(top_result['content'])
    print(banner)

# Analyze the results of the most recently executed query cell (whatever `results` currently holds)
analyze_results(results)

## Troubleshooting Guide

### If you're STILL getting poor results:

1. **Check if tech stack info exists in your PDF:**
   ```python
   for chunk in chunks:
       if 'python' in chunk.page_content.lower():
           print(chunk.page_content)
   ```

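2. **Try a different embedding model:**
   ```python
   # Pass the model name to the constructor instead of editing the class:
   embedding_manager = EmbeddingManager(model_name='all-mpnet-base-v2')  # Better but slower
   # Then clear the vector store and re-embed all chunks: vectors produced by
   # different models cannot be mixed in one collection.
   ```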

3. **Adjust chunk size based on your content:**
   ```python
   # For very technical resumes:
   chunks = split_documents(all_pdf_documents, chunk_size=1000, chunk_overlap=250)
   
   # For shorter documents:
   chunks = split_documents(all_pdf_documents, chunk_size=500, chunk_overlap=100)
   ```

4. **Try hybrid search (keyword + semantic):**
   ```python
   # First filter by keywords, then rank by similarity
   keyword_matches = [c for c in chunks if 'python' in c.page_content.lower()]
   # Then search within keyword_matches
   ```

### Integration Of VectorDB Context Pipeline with LLM Output

In [None]:
### Simple RAG pipeline with Groq LLM
from langchain_groq import ChatGroq
import os
from dotenv import load_dotenv
load_dotenv()

### Initialise Groq LLM
# Read the API key from the environment (load_dotenv() was called above);
# os.getenv returns None when GROQ_API_KEY is not set.
groq_api_key = os.getenv("GROQ_API_KEY")

# Chat model used by rag_simple below; later cells read `llm`.
llm=ChatGroq(groq_api_key=groq_api_key, model_name="llama-3.1-8b-instant", temperature=0.1, max_tokens=1024)
# Alternative model, same settings:
# llm=ChatGroq(groq_api_key=groq_api_key, model_name="llama-3.3-70b-versatile", temperature=0.1, max_tokens=1024)

### Simple RAG Function: retrieve context + generate response
def rag_simple(query, retriever, llm, top_k=3):
    """
    Answer a question with retrieved context and an LLM.

    Args:
        query: The user's question.
        retriever: Object whose ``retrieve(query, top_k=...)`` returns a list
            of dicts that each carry a 'content' string.
        llm: Chat model whose ``invoke(...)`` returns an object with a
            ``content`` attribute.
        top_k: Number of chunks to request from the retriever.

    Returns:
        The LLM's answer text, or a fixed notice when no context was found.
    """
    ## retrieve the context
    results = retriever.retrieve(query, top_k=top_k)
    context = "\n\n".join(doc['content'] for doc in results) if results else ""
    if not context:
        return "NO Relevant context found to answer the question!!!"

    ## Generate the answer using the LLM
    prompt = f"""Use the following context to answer the question concisely.
        Context:
        {context}

        Question: {query}

        Answer:"""

    # The f-string above is already fully interpolated. Running str.format on
    # it again would treat any '{' or '}' in the retrieved text or the question
    # as a placeholder and raise, so the prompt is sent as-is.
    response = llm.invoke([prompt])
    return response.content

In [None]:
# End-to-end check: retrieve context for the question and print the LLM's answer
answer=rag_simple("what is anshuman's cgpa?", rag_retriever, llm)
print(answer)