# In [None]:
"""
FIXED RAG CODE - Key Changes:
1. Increased chunk size to 500-1000 characters
2. Better chunk overlap ratio
3. Added debugging utilities
4. Improved retrieval with better score threshold
"""

import os
from pathlib import Path
from typing import List, Dict, Any
import uuid
import numpy as np
import chromadb
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer

# ============================================================================
# FIXED: Better Chunk Size Configuration
# ============================================================================

def split_documents_optimized(documents, chunk_size=800, chunk_overlap=200):
    """
    Split documents into overlapping character chunks and print a short report.

    The defaults (800 / 200) replace the earlier 100 / 25 configuration, which
    was too small to keep a full resume section (skills, experience, ...) in
    one chunk. 800 characters is roughly 120-150 words.

    Args:
        documents: Documents handed unchanged to
            ``RecursiveCharacterTextSplitter.split_documents``.
        chunk_size: Target chunk length, measured with ``len`` (characters).
        chunk_overlap: Number of characters shared by neighbouring chunks.

    Returns:
        The list of chunks produced by the splitter.
    """
    # Coarsest separator first: paragraph, line, sentence (". "), word, char.
    separator_priority = ["\n\n", "\n", ". ", " ", ""]
    splitter = RecursiveCharacterTextSplitter(
        separators=separator_priority,
        length_function=len,
        chunk_overlap=chunk_overlap,
        chunk_size=chunk_size,
    )

    pieces = splitter.split_documents(documents)
    print(f"✓ Split {len(documents)} documents into {len(pieces)} chunks")

    # Nothing to summarise when the splitter produced no chunks.
    if not pieces:
        return pieces

    sizes = [len(piece.page_content) for piece in pieces]
    shortest, longest = min(sizes), max(sizes)
    print(f"  - Avg chunk length: {np.mean(sizes):.0f} chars")
    print(f"  - Min/Max: {shortest}/{longest} chars")
    print("\nExample chunk (first 300 chars):")
    print(f"{pieces[0].page_content[:300]}...")

    return pieces


# ============================================================================
# DEBUGGING: Inspect your chunks
# ============================================================================

def inspect_chunks(chunks, search_term="tech"):
    """
    Report which chunks mention ``search_term`` (case-insensitive substring).

    Prints a banner, then either up to three matching chunks (first 500
    characters each) or a list of likely reasons for finding nothing.

    Args:
        chunks: Iterable of objects exposing ``page_content`` (text) and
            ``metadata`` (mapping; ``source_file`` is read if present).
        search_term: Text to look for inside each chunk.

    Returns:
        A list of dicts with keys ``index``, ``content``, ``length`` and
        ``source`` for every matching chunk, in input order.
    """
    banner = '=' * 70
    print(f"\n{banner}")
    print(f"INSPECTING CHUNKS FOR: '{search_term}'")
    print(f"{banner}\n")

    def _mentions_term(chunk):
        # Lower-case both sides so the comparison ignores case.
        haystack = chunk.page_content.lower()
        return search_term.lower() in haystack

    matches = [
        {
            'index': position,
            'content': chunk.page_content,
            'length': len(chunk.page_content),
            'source': chunk.metadata.get('source_file', 'unknown'),
        }
        for position, chunk in enumerate(chunks)
        if _mentions_term(chunk)
    ]

    if not matches:
        print(f"✗ No chunks found containing '{search_term}'")
        print("This might mean:")
        print("  1. The term doesn't appear in your documents")
        print("  2. Chunk size is too small to capture it")
        print("  3. The document wasn't loaded properly\n")
        return matches

    print(f"✓ Found {len(matches)} chunks containing '{search_term}':\n")
    divider = '-' * 70
    # Only the first three hits are printed to keep the output readable.
    for hit in matches[:3]:
        print(f"Chunk #{hit['index']} ({hit['length']} chars) from {hit['source']}:")
        print(f"{hit['content'][:500]}...")
        print(f"{divider}\n")

    return matches


# ============================================================================
# FIXED: Better Retrieval with Score Threshold
# ============================================================================

class ImprovedRAGRetriever:
    """Query-time retriever: embed the query, search the store, filter by score.

    The class relies on two collaborators supplied by the caller:
    ``vectorstore.collection.query(...)`` and
    ``embedding_manager.generate_embeddings(list_of_texts)``.
    """

    def __init__(self, vectorstore, embedding_manager):
        """Keep references to the vector store wrapper and the embedding manager."""
        self.vectorstore = vectorstore
        self.embedding_manager = embedding_manager

    @staticmethod
    def _first_batch(results, key):
        """Return the per-hit list for the single query sent, or [] if absent/None."""
        batches = results.get(key)
        if not batches:
            return []
        first = batches[0]
        return [] if first is None else first

    def retrieve(self, query: str, top_k: int = 5, score_threshold: float = 0.3) -> List[Dict[str, Any]]:
        """
        Return up to ``top_k`` stored chunks whose score meets ``score_threshold``.

        The score is computed as ``1 - distance``. The interpretation below
        only holds when the store reports cosine distance.
        NOTE(review): confirm the collection's distance metric; with another
        metric (e.g. squared L2) the score can be negative and the threshold
        needs re-tuning.

        Similarity score interpretation (cosine):
        - 0.7-1.0: Excellent match
        - 0.5-0.7: Good match
        - 0.3-0.5: Moderate match (might be relevant)
        - 0.0-0.3: Poor match (likely not relevant)

        Args:
            query: Natural-language query text.
            top_k: Maximum number of results to return; must be positive.
            score_threshold: Minimum ``1 - distance`` a hit needs to be kept.

        Returns:
            A list of dicts with keys ``id``, ``content``, ``metadata``
            (always a dict, ``{}`` when the store returned none),
            ``similarity_score``, ``distance`` and ``rank`` (1-based position
            in the raw store results, before threshold filtering). An empty
            list is returned when nothing qualifies, when ``top_k`` is not
            positive, or when the store query fails.
        """
        print(f"\n{'='*70}")
        print(f"QUERY: '{query}'")
        print(f"Settings: top_k={top_k}, score_threshold={score_threshold}")
        print(f"{'='*70}\n")

        # A non-positive top_k can never yield results; fail fast instead of
        # sending an invalid n_results to the store or slicing with [:-n].
        if top_k <= 0:
            print("✗ Error during retrieval: top_k must be a positive integer")
            return []

        # Generate query embedding; accept numpy arrays and plain sequences.
        query_embedding = self.embedding_manager.generate_embeddings([query])[0]
        if hasattr(query_embedding, "tolist"):
            query_vector = query_embedding.tolist()
        else:
            query_vector = list(query_embedding)

        # Only the store call is guarded: it is the external boundary. Result
        # handling below is defensive, so a formatting problem can no longer
        # throw away hits that were successfully retrieved.
        try:
            results = self.vectorstore.collection.query(
                query_embeddings=[query_vector],
                n_results=top_k * 2,  # over-fetch so filtering can still fill top_k
            )
        except Exception as e:
            print(f"✗ Error during retrieval: {e}")
            return []

        documents = self._first_batch(results, 'documents')
        if len(documents) == 0:
            print("✗ No documents found")
            return []

        rows = zip(
            self._first_batch(results, 'ids'),
            documents,
            self._first_batch(results, 'metadatas'),
            self._first_batch(results, 'distances'),
        )

        retrieved_documents = []
        for rank, (doc_id, document, metadata, distance) in enumerate(rows, start=1):
            similarity_score = 1 - distance
            if similarity_score < score_threshold:
                continue
            retrieved_documents.append({
                'id': doc_id,
                'content': document,
                # The store may return None for chunks saved without metadata.
                'metadata': metadata or {},
                'similarity_score': similarity_score,
                'distance': distance,
                'rank': rank,
            })

        # Limit to top_k after filtering
        retrieved_documents = retrieved_documents[:top_k]

        print(f"✓ Retrieved {len(retrieved_documents)} documents:\n")
        for doc in retrieved_documents:
            preview = (doc['content'] or '')[:150]
            print(f"  Rank {doc['rank']} | Score: {doc['similarity_score']:.3f} | "
                  f"Source: {doc['metadata'].get('source_file', 'unknown')}")
            print(f"  Content preview: {preview}...")
            print()

        return retrieved_documents


# ============================================================================
# USAGE EXAMPLE
# ============================================================================

def reprocess_documents_with_better_chunks(
    pdf_dir="../data/pdf_files",
    chunk_size=800,
    chunk_overlap=200,
    search_terms=("tech", "python", "java"),
):
    """
    Load every PDF under ``pdf_dir``, re-chunk it and print keyword diagnostics.

    Calling the function with no arguments behaves as before (same directory,
    800/200 chunking, same three keywords); the parameters exist so the
    workflow can be pointed at another folder or tuned without editing code.

    Args:
        pdf_dir: Directory searched recursively for ``*.pdf`` files. A
            relative path is resolved against the current working directory.
        chunk_size: Chunk size forwarded to ``split_documents_optimized``.
        chunk_overlap: Chunk overlap forwarded to ``split_documents_optimized``.
        search_terms: Keywords passed one by one to ``inspect_chunks``.

    Returns:
        The chunks returned by ``split_documents_optimized``.
    """
    print("\n" + "="*70)
    print("REPROCESSING DOCUMENTS WITH OPTIMIZED SETTINGS")
    print("="*70 + "\n")

    # 1. Load PDFs
    print("Step 1: Loading PDFs...")
    # Sorted so document (and therefore chunk) order is reproducible across
    # runs and machines instead of following filesystem enumeration order.
    pdf_files = sorted(Path(pdf_dir).glob("**/*.pdf"))
    print(f"Found {len(pdf_files)} PDF files\n")

    all_documents = []
    for pdf_file in pdf_files:
        # Best effort per file: one unreadable PDF is reported and skipped so
        # the remaining files are still processed.
        try:
            documents = PyPDFLoader(str(pdf_file)).load()
        except Exception as e:
            print(f"  ✗ Error loading {pdf_file.name}: {e}")
            continue

        # Tag every page so later stages can report where a chunk came from.
        for doc in documents:
            doc.metadata['source_file'] = pdf_file.name
            doc.metadata['file_type'] = 'pdf'
        all_documents.extend(documents)
        print(f"  ✓ Loaded {pdf_file.name}: {len(documents)} pages")

    print(f"\nTotal pages loaded: {len(all_documents)}\n")

    # 2. Split with the configured chunk size
    print("Step 2: Splitting documents with OPTIMIZED chunk size...")
    chunks = split_documents_optimized(
        all_documents,
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )

    # 3. Inspect chunks for the requested keywords
    print("\nStep 3: Inspecting chunks for technical content...")
    for term in search_terms:
        inspect_chunks(chunks, term)

    return chunks


# ============================================================================
# QUICK FIX INSTRUCTIONS
# ============================================================================

# Emit a human-readable checklist every time this cell/module is executed.
# The snippets inside the string are advice for the reader, not executed code:
# they mention names (split_documents, rag_retriever, VectorStore,
# embedding_manager) that are not defined in the visible part of this file and
# are presumably provided by the reader's own notebook.
print("""
╔══════════════════════════════════════════════════════════════════════╗
║                    HOW TO FIX YOUR RAG SYSTEM                        ║
╚══════════════════════════════════════════════════════════════════════╝

IMMEDIATE FIXES:

1. CHANGE YOUR CHUNK SIZE (Most Important!):
   
   Replace this line:
   chunks = split_documents(all_pdf_documents)
   
   With:
   chunks = split_documents(all_pdf_documents, chunk_size=800, chunk_overlap=200)

2. ADJUST SCORE THRESHOLD:
   
   Replace:
   rag_retriever.retrieve("which tech stack does anshuman use?")
   
   With:
   rag_retriever.retrieve("which tech stack does anshuman use?", 
                          top_k=10, 
                          score_threshold=0.3)

3. CLEAR AND REBUILD YOUR VECTOR STORE:
   
   # Delete old vector store
   import shutil
   shutil.rmtree('../chroma_db', ignore_errors=True)
   
   # Re-initialize with new chunks
   vectorstore = VectorStore()
   texts = [doc.page_content for doc in chunks]
   embeddings = embedding_manager.generate_embeddings(texts)
   vectorstore.add_documents(chunks, embeddings)

4. VERIFY YOUR CHUNKS:
   
   # Check if tech stack info exists
   for i, chunk in enumerate(chunks):
       if 'python' in chunk.page_content.lower():
           print(f"Chunk {i}: {chunk.page_content[:200]}")

ALTERNATIVE APPROACHES:

Option A - Try Different Query Phrasings:
   - "python java javascript programming languages"
   - "technical skills programming"
   - "software development experience"

Option B - Use Larger Model (if available):
   - Switch from 'all-MiniLM-L6-v2' to 'all-mpnet-base-v2'
   - Better semantic understanding, but slower

Option C - Use MMR (Maximal Marginal Relevance):
   - Retrieve more diverse results
   - Reduces redundancy in retrieved chunks
""")