# Advanced RAG Exploration 🔬

This notebook explores cutting-edge Retrieval-Augmented Generation (RAG) techniques.

## Exploration Areas
- Hierarchical document chunking
- Query decomposition strategies
- Contextual compression
- Multi-vector retrieval
- Performance analysis

## Setup

In [None]:
import os
import sys
import numpy as np
from pathlib import Path
from typing import List, Dict, Any

# Add utils to path
sys.path.append(str(Path.cwd().parent / 'utils'))

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chat_models import ChatOpenAI
from langchain.prompts import PromptTemplate
from langchain.schema import Document

# Configuration: look up the OpenAI key through the project's config helper,
# or read it from the environment when that helper cannot be imported.
try:
    from utils.config import get_api_key
    api_key = get_api_key('openai')
except ImportError:
    api_key = os.getenv('OPENAI_API_KEY')

# A missing or empty key switches the notebook into demo mode.
DEMO_MODE = not api_key

print("🔬 Advanced RAG Exploration Lab")
print(f"Demo mode: {DEMO_MODE}")

## Experiment 1: Hierarchical Document Chunking

In [None]:
class HierarchicalChunker:
    """Two-level chunker that links every small chunk to its parent chunk.

    The text is first split into large "context" chunks. Each large chunk is
    then split again into small chunks meant for precise retrieval, and every
    small chunk keeps a short preview of the large chunk it was cut from.

    Args:
        chunk_sizes: Chunk sizes, largest first. The first entry sizes the
            context chunks and the last entry sizes the retrieval chunks;
            entries in between are not used by ``chunk_with_hierarchy``.
            Defaults to ``[2000, 1000, 500]``.
        overlap: ``chunk_overlap`` handed to both splitters.
        context_preview_chars: Maximum number of parent-chunk characters
            stored in each record's ``parent_context``.

    Raises:
        ValueError: If ``chunk_sizes`` is given but empty.
    """

    def __init__(self, chunk_sizes=None, overlap: int = 100,
                 context_preview_chars: int = 200):
        if chunk_sizes is None:
            chunk_sizes = [2000, 1000, 500]
        if not chunk_sizes:
            raise ValueError("chunk_sizes must contain at least one size")
        self.chunk_sizes = list(chunk_sizes)
        self.overlap = overlap
        self.context_preview_chars = context_preview_chars

    def chunk_with_hierarchy(self, text: str) -> List[Dict[str, Any]]:
        """Split ``text`` into small chunks annotated with their parent context.

        Args:
            text: The document text to split.

        Returns:
            One dict per small chunk with the keys ``content`` (the chunk
            text), ``chunk_id`` (``"<large index>-<small index>"``),
            ``parent_context`` (a preview of the enclosing large chunk, ending
            in ``"..."`` only when the large chunk was actually truncated) and
            ``size`` (``len`` of the chunk text).
        """
        # Both splitters depend only on the configuration, so build them once
        # instead of once per large chunk.
        large_splitter = RecursiveCharacterTextSplitter(
            chunk_size=self.chunk_sizes[0], chunk_overlap=self.overlap
        )
        small_splitter = RecursiveCharacterTextSplitter(
            chunk_size=self.chunk_sizes[-1], chunk_overlap=self.overlap
        )

        preview_limit = self.context_preview_chars
        chunks = []

        for i, large_chunk in enumerate(large_splitter.split_text(text)):
            parent_context = large_chunk[:preview_limit]
            # Only mark the preview as truncated when text was really dropped.
            if len(large_chunk) > preview_limit:
                parent_context += "..."

            for j, small_chunk in enumerate(small_splitter.split_text(large_chunk)):
                chunks.append({
                    'content': small_chunk,
                    'chunk_id': f"{i}-{j}",
                    'parent_context': parent_context,
                    'size': len(small_chunk)
                })

        return chunks

# Run the chunker on a short passage repeated five times.
sample_text = 5 * """
Artificial Intelligence (AI) is transforming industries worldwide.

Machine Learning, a subset of AI, enables computers to learn from data without explicit programming.
Deep Learning uses neural networks to solve complex problems.
Natural Language Processing helps computers understand human language.

Applications include healthcare diagnostics, autonomous vehicles, and financial fraud detection.
"""

chunker = HierarchicalChunker()
hierarchical_chunks = chunker.chunk_with_hierarchy(sample_text)

# Summary statistics over all produced chunks.
chunk_lengths = [chunk['size'] for chunk in hierarchical_chunks]
print("📊 Hierarchical Chunking Results:")
print(f"Total chunks: {len(hierarchical_chunks)}")
print(f"Average chunk size: {np.mean(chunk_lengths):.0f} chars")

# Preview the first two chunks next to the start of their parent context.
print("\n🔍 Sample chunks:")
for chunk in hierarchical_chunks[:2]:
    content_preview = chunk['content'][:100]
    context_preview = chunk['parent_context'][:100]
    print(f"Chunk {chunk['chunk_id']}: {content_preview}...")
    print(f"Context: {context_preview}...\n")

## Experiment 2: Query Decomposition

In [None]:
class QueryDecomposer:
    """Decompose a complex query into simpler sub-queries.

    When the module-level ``DEMO_MODE`` flag is true, a canned decomposition
    is returned and the LLM is never called. Otherwise ``llm.predict`` is
    asked for a numbered list of questions, which is then parsed.

    Args:
        llm: Object exposing ``predict(prompt) -> str``. It is not used in
            demo mode, so ``None`` is acceptable there.
    """

    def __init__(self, llm):
        self.llm = llm

    def decompose_query(self, query: str) -> List[str]:
        """Break ``query`` down into at most three simpler questions.

        Args:
            query: The original user question.

        Returns:
            Up to three sub-questions. Falls back to ``[query]`` when the LLM
            call fails or its reply contains no usable numbered question, so
            the result is never empty.
        """
        if DEMO_MODE:
            # Mock decomposition
            if "machine learning" in query.lower():
                return [
                    "What is machine learning?",
                    "How does machine learning work?",
                    "What are machine learning applications?"
                ]
            return [query]

        prompt = f"""
Break down this query into 3 simpler questions:
Query: {query}

Questions:
1.
2.
3.
"""

        try:
            response = self.llm.predict(prompt)
            questions = self._parse_numbered_list(response)
        except Exception:
            # Best effort: any LLM or parsing failure degrades to the original
            # query. Unlike a bare ``except``, this lets KeyboardInterrupt and
            # SystemExit propagate.
            return [query]

        # An unparseable reply must not leave the caller with nothing to search.
        return questions[:3] if questions else [query]

    @staticmethod
    def _parse_numbered_list(response: str) -> List[str]:
        """Extract the entries of a ``1.`` / ``2)`` style numbered list."""
        questions = []
        for line in response.splitlines():
            entry = line.strip()
            # Test the stripped text so indented list items are recognised.
            if not entry or not entry[0].isdigit():
                continue
            # Drop the list number and its delimiter ("1.", "2)", "3:").
            question = entry.lstrip('0123456789').lstrip('.):').strip()
            # Skip fragments too short to be a question, e.g. an empty "2.".
            if len(question) > 10:
                questions.append(question)
        return questions

# Decompose one compound question; no chat client is created in demo mode.
llm = None if DEMO_MODE else ChatOpenAI(openai_api_key=api_key, temperature=0)
decomposer = QueryDecomposer(llm)

test_query = "How does machine learning work and what are its applications?"
sub_queries = decomposer.decompose_query(test_query)

print("🧪 Query Decomposition:")
print(f"Original: {test_query}")
print("Decomposed into:")
for number, sub_query in enumerate(sub_queries, start=1):
    print(f"  {number}. {sub_query}")

## Experiment 3: Contextual Compression

In [None]:
class ContextualCompressor:
    """Compress documents down to the parts relevant to a query.

    When the module-level ``DEMO_MODE`` flag is true, both methods use cheap
    local heuristics and never call the LLM. Otherwise they prompt
    ``llm.predict``.

    Args:
        llm: Object exposing ``predict(prompt) -> str``. It is not used in
            demo mode, so ``None`` is acceptable there.
    """

    def __init__(self, llm):
        self.llm = llm

    def compress_document(self, query: str, document: str) -> str:
        """Extract only the query-relevant information from ``document``.

        Args:
            query: The question the document should be compressed for.
            document: Full document text.

        Returns:
            The LLM's extraction; in demo mode the first 200 characters
            followed by ``"... [compressed]"``; on LLM failure the first 300
            characters of the document.
        """
        if DEMO_MODE:
            # Mock compression - return first 200 chars
            return document[:200] + "... [compressed]"

        prompt = f"""
Extract only information relevant to this query:
Query: {query}

Document: {document}

Relevant information:
"""

        try:
            return self.llm.predict(prompt)
        except Exception:
            # Best effort: fall back to plain truncation, but let
            # KeyboardInterrupt and SystemExit propagate.
            return document[:300]

    def score_relevance(self, query: str, document: str) -> float:
        """Score how relevant ``document`` is to ``query`` on a 10-point scale.

        Args:
            query: The question to score against.
            document: Document text; only the first 500 characters are sent
                to the LLM.

        Returns:
            A score no greater than 10.0. In demo mode this is two points per
            whitespace-separated word shared by query and document. Otherwise
            it is the first number in the LLM reply, or 5.0 when the call
            fails or the reply contains no number.
        """
        import re

        if DEMO_MODE:
            # Mock scoring based on keyword overlap
            query_words = set(query.lower().split())
            doc_words = set(document.lower().split())
            overlap = len(query_words & doc_words)
            return min(overlap * 2.0, 10.0)

        prompt = f"""
Rate relevance (1-10) of this document to the query:
Query: {query}
Document: {document[:500]}

Score (number only):
"""

        try:
            response = self.llm.predict(prompt)
        except Exception:
            return 5.0  # Default score

        # Take the first number as written. Joining every digit of the first
        # token would read "8.5" as 85 and "7/10" as 710, and would miss
        # replies such as "Score: 8".
        match = re.search(r'\d+(?:\.\d+)?', str(response))
        if match is None:
            return 5.0  # Default score
        # Keep the result on the 10-point scale even if the model overshoots.
        return min(float(match.group()), 10.0)

# Score and compress each sample document against a single query.
compressor = ContextualCompressor(llm)

sample_docs = [
    "Machine learning is a subset of AI that enables computers to learn from data.",
    "Python is a programming language popular in data science and web development.",
    "Deep learning uses neural networks to solve complex pattern recognition problems."
]

query = "What is machine learning?"

print(f"🔍 Contextual Compression for: '{query}'\n")

for doc_number, doc in enumerate(sample_docs, start=1):
    relevance = compressor.score_relevance(query, doc)
    compressed = compressor.compress_document(query, doc)

    # One report per document: heading, original text, truncated compression.
    report_lines = (
        f"Document {doc_number} (Relevance: {relevance:.1f}/10):",
        f"Original: {doc}",
        f"Compressed: {compressed[:100]}...\n",
    )
    print(*report_lines, sep="\n")

## Experiment 4: Multi-Vector Retrieval

In [None]:
class MultiVectorRetriever:
    """In-memory retriever that ranks documents using several mock vectors.

    Each added document is stored with three representations: a mock
    embedding of its full content, a mock embedding of its first 100
    characters, and a keyword-count vector. ``similarity_search`` ranks
    documents by a weighted sum of the per-representation similarities.

    Args:
        content_weight: Weight of the full-content cosine similarity.
        keyword_weight: Weight of the keyword-overlap similarity.
        summary_weight: Weight of the summary-embedding cosine similarity.
            Defaults to 0.0, which leaves the summary vector out of the score.
    """

    def __init__(self, content_weight: float = 0.7, keyword_weight: float = 0.3,
                 summary_weight: float = 0.0):
        self.documents = []
        self.document_vectors = {}  # Simulated vector storage
        self.content_weight = content_weight
        self.keyword_weight = keyword_weight
        self.summary_weight = summary_weight

    def add_document(self, doc: "Document"):
        """Store ``doc`` and its vector representations.

        Args:
            doc: Object with a ``page_content`` string attribute.
        """
        self.documents.append(doc)
        # The document's position in ``self.documents`` doubles as its id.
        doc_id = len(self.documents) - 1

        # Simulate different vector types
        self.document_vectors[doc_id] = {
            'content_vector': self._mock_embedding(doc.page_content),
            'summary_vector': self._mock_embedding(doc.page_content[:100]),
            'keyword_vector': self._keyword_vector(doc.page_content)
        }

    def _mock_embedding(self, text: str) -> List[float]:
        """Return a deterministic 10-dimensional mock embedding of ``text``.

        The values are derived from the MD5 digest of the text, so equal texts
        map to equal vectors. This is a stand-in for a real embedding model
        and carries no semantic meaning.
        """
        import hashlib
        hash_val = int(hashlib.md5(text.encode()).hexdigest(), 16)
        return [(hash_val >> i) % 100 / 100.0 for i in range(10)]

    def _keyword_vector(self, text: str) -> Dict[str, float]:
        """Count lower-cased words in ``text``, ignoring surrounding punctuation.

        Punctuation is stripped from both ends of every token so that
        "learning." and "learning" count as the same keyword. Tokens that
        consist only of punctuation are dropped.
        """
        import string
        from collections import Counter

        tokens = (word.strip(string.punctuation) for word in text.lower().split())
        return dict(Counter(token for token in tokens if token))

    def similarity_search(self, query: str, k: int = 3) -> List["Document"]:
        """Return the ``k`` highest-scoring documents for ``query``.

        Args:
            query: Search text.
            k: Maximum number of documents to return. Values below 1 yield an
                empty list.

        Returns:
            Stored documents ordered from highest to lowest combined score.
        """
        if k <= 0:
            # A negative k would otherwise slice off the tail of the ranking.
            return []

        query_vector = self._mock_embedding(query)
        query_keywords = self._keyword_vector(query)

        doc_scores = []

        for doc_id, vectors in self.document_vectors.items():
            # Combine different similarity scores
            content_sim = self._cosine_similarity(query_vector, vectors['content_vector'])
            keyword_sim = self._keyword_similarity(query_keywords, vectors['keyword_vector'])

            # Weighted combination
            combined_score = (self.content_weight * content_sim
                              + self.keyword_weight * keyword_sim)
            if self.summary_weight:
                summary_sim = self._cosine_similarity(query_vector, vectors['summary_vector'])
                combined_score += self.summary_weight * summary_sim
            doc_scores.append((doc_id, combined_score))

        # Sort by score and return top k
        doc_scores.sort(key=lambda pair: pair[1], reverse=True)
        return [self.documents[doc_id] for doc_id, _ in doc_scores[:k]]

    def _cosine_similarity(self, vec1: List[float], vec2: List[float]) -> float:
        """Return the cosine similarity of two vectors, or 0.0 if either is all zeros."""
        dot_product = sum(a * b for a, b in zip(vec1, vec2))
        norm1 = sum(a * a for a in vec1) ** 0.5
        norm2 = sum(b * b for b in vec2) ** 0.5
        denominator = norm1 * norm2
        return dot_product / denominator if denominator > 0 else 0.0

    def _keyword_similarity(self, keywords1: Dict, keywords2: Dict) -> float:
        """Return the overlap of two keyword-count vectors, scaled to [0, 1].

        The score is twice the shared count mass divided by the total count
        mass of both vectors. The factor of two makes identical vectors score
        1.0 rather than 0.5, which puts this score on the same scale as the
        cosine similarity it is combined with.
        """
        common_words = set(keywords1.keys()) & set(keywords2.keys())
        if not common_words:
            return 0.0

        overlap_score = sum(min(keywords1[word], keywords2[word]) for word in common_words)
        total_words = sum(keywords1.values()) + sum(keywords2.values())
        return 2.0 * overlap_score / total_words if total_words > 0 else 0.0

# Index four sample documents, then fetch the two best matches for a query.
retriever = MultiVectorRetriever()

docs = [
    Document(page_content=text)
    for text in (
        "Machine learning algorithms learn patterns from data to make predictions.",
        "Deep learning is a subset of machine learning using neural networks.",
        "Python programming is essential for data science and machine learning.",
        "Natural language processing helps computers understand human language.",
    )
]

for doc in docs:
    retriever.add_document(doc)

query = "machine learning algorithms"
results = retriever.similarity_search(query, k=2)

print(f"🔍 Multi-Vector Retrieval for: '{query}'\n")
print(f"Top {len(results)} results:")
for rank, hit in enumerate(results, start=1):
    print(f"{rank}. {hit.page_content}")

## Experiment 5: Performance Analysis

In [None]:
import time

def benchmark_retrieval_methods(runs: int = 5, methods=None):
    """Time a set of retrieval methods and summarise each one's latency.

    Args:
        runs: Number of timed calls per method. Must be at least 1.
        methods: Mapping of method name to a zero-argument callable to time.
            When omitted, a built-in table of ``time.sleep`` stand-ins is
            used, so the default timings are simulated rather than measured
            against real retrieval code.

    Returns:
        A dict mapping each method name to
        ``{'avg_time': mean seconds, 'std_time': standard deviation in seconds}``.

    Raises:
        ValueError: If ``runs`` is less than 1.
    """
    if runs < 1:
        raise ValueError("runs must be at least 1")

    if methods is None:
        # Simulate different methods
        methods = {
            'Basic Vector Search': lambda: time.sleep(0.1),
            'Hierarchical Chunking': lambda: time.sleep(0.15),
            'Query Decomposition': lambda: time.sleep(0.25),
            'Contextual Compression': lambda: time.sleep(0.3),
            'Multi-Vector': lambda: time.sleep(0.2)
        }

    results = {}

    for method_name, method_func in methods.items():
        times = []
        for _ in range(runs):
            # perf_counter is monotonic and high-resolution; time.time() can
            # jump when the system clock is adjusted.
            start_time = time.perf_counter()
            method_func()
            times.append(time.perf_counter() - start_time)

        results[method_name] = {
            'avg_time': np.mean(times),
            'std_time': np.std(times)
        }

    return results

# Run the benchmark and report the mean and spread for every method.
print("⚡ Benchmarking Retrieval Methods:\n")
benchmark_results = benchmark_retrieval_methods()

for method in benchmark_results:
    timing = benchmark_results[method]
    mean_seconds = timing['avg_time']
    spread_seconds = timing['std_time']
    print(
        f"{method}:\n"
        f"  Average time: {mean_seconds:.3f}s\n"
        f"  Std deviation: {spread_seconds:.3f}s\n"
    )

# Hand-assigned quality and speed ratings per method; efficiency is their mean.
print("📊 Quality vs Speed Trade-offs:")
tradeoffs = {
    name: {'quality': quality, 'speed': speed}
    for name, quality, speed in (
        ('Basic Vector Search', 7, 10),
        ('Hierarchical Chunking', 8, 8),
        ('Query Decomposition', 9, 6),
        ('Contextual Compression', 9, 5),
        ('Multi-Vector', 8, 7),
    )
}

for method, scores in tradeoffs.items():
    quality, speed = scores['quality'], scores['speed']
    efficiency = (quality + speed) / 2
    print(f"{method}: Quality={quality}/10, Speed={speed}/10, Efficiency={efficiency:.1f}/10")

## Key Findings and Recommendations

### 🎯 Best Practices Discovered

1. **Hierarchical Chunking**: Preserves context while enabling precise retrieval
2. **Query Decomposition**: Improves complex query handling
3. **Contextual Compression**: Reduces noise and improves relevance
4. **Multi-Vector Approaches**: Combine semantic and keyword-based retrieval

### 🔄 Trade-offs

- **Quality vs Speed**: Advanced methods improve quality but increase latency
- **Complexity vs Reliability**: Simple methods are more robust
- **Cost vs Performance**: LLM-based compression is expensive but effective

### 🚀 Next Steps

1. Test with real datasets, replacing the mock embeddings and simulated timings used above
2. Implement caching for expensive operations
3. Explore async processing for better performance
4. Develop domain-specific optimizations