In [3]:
# First, install additional dependencies
!pip install sentence-transformers[train] datasets accelerate

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers[train])
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers[train])
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers[train])
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers[train])
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.11.0->sentence-transformers[train])
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torc

In [1]:
!pip install transformers sentence-transformers faiss-cpu PyPDF2 torch openai python-dotenv
!pip install --upgrade langchain langchain-community

Collecting faiss-cpu
  Downloading faiss_cpu-1.11.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.8 kB)
Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Collecting python-dotenv
  Downloading python_dotenv-1.1.1-py3-none-any.whl.metadata (24 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-m

In [40]:
import os
import re
import numpy as np
import pandas as pd
from typing import List, Dict, Tuple, Optional
import PyPDF2
import faiss
import torch
from google.colab import files
import warnings
warnings.filterwarnings('ignore')

In [41]:
from sentence_transformers import SentenceTransformer
import random
from collections import defaultdict

In [42]:
# Your existing classes (keeping them as they work well)
class DocumentProcessor:
    """Extract, tag and clean the text of uploaded PDF files.

    Successfully processed documents are kept in ``self.documents`` as a
    mapping of filename -> cleaned text.
    """

    # Ordered (pattern, tag, pipe_separate) rules for enhance_table_extraction.
    # The first rule that matches a line wins, so the order is significant.
    _LINE_TAG_RULES = (
        (re.compile(r'(self-route|routing|self-reflection)', re.IGNORECASE), "SELF_ROUTE_CONTENT", False),
        (re.compile(r'(failure|error).*(type|case|category)', re.IGNORECASE), "FAILURE_TYPES", False),
        (re.compile(r'(multi-step|general knowledge|implicit|long.?complex)', re.IGNORECASE), "FAILURE_DETAIL", False),
        (re.compile(r'(mrr|recall@|ndcg@|precision|f1)', re.IGNORECASE), "METRICS_TABLE", True),
        (re.compile(r'(chunk|segment|overlap|window)', re.IGNORECASE), "CHUNKING_STRATEGY", False),
        (re.compile(r'(outperform|superior|better|vs|versus|comparison)', re.IGNORECASE), "PERFORMANCE_COMPARISON", False),
        (re.compile(r'(goal|objective|aim|purpose|method)', re.IGNORECASE), "METHOD_GOAL", False),
    )

    def __init__(self):
        # filename -> cleaned document text
        self.documents = {}

    def extract_text_from_pdf(self, pdf_path: str) -> str:
        """Return the tagged text of every readable page of ``pdf_path``.

        Each page contributes a ``"\\nPage <n>: <text>\\n"`` block. Extraction is
        best-effort: a page that fails to extract is reported and skipped
        without discarding the pages after it, and a file that cannot be
        opened or parsed is reported and yields whatever was collected so far
        (possibly the empty string).
        """
        page_blocks = []
        try:
            with open(pdf_path, 'rb') as file:
                pdf_reader = PyPDF2.PdfReader(file)
                for page_num, page in enumerate(pdf_reader.pages):
                    try:
                        # Treat a missing extraction result as an empty page.
                        page_text = page.extract_text() or ""
                    except Exception as e:
                        print(f"Error reading page {page_num + 1} of {pdf_path}: {e}")
                        continue
                    page_text = self.enhance_table_extraction(page_text)
                    page_blocks.append(f"\nPage {page_num + 1}: {page_text}\n")
        except Exception as e:
            print(f"Error reading PDF {pdf_path}: {e}")
        return "".join(page_blocks)

    def enhance_table_extraction(self, text: str) -> str:
        """Prefix recognised lines with a topic tag and drop blank lines.

        Every non-empty, stripped line is checked against ``_LINE_TAG_RULES``
        in order; the first matching rule prefixes the line with
        ``"<TAG>: "``. Lines matching the metrics rule additionally have each
        whitespace run replaced by ``" | "``. Lines matching no rule are kept
        unchanged.
        """
        processed_lines = []

        for raw_line in text.split('\n'):
            line = raw_line.strip()
            if not line:
                continue

            for pattern, tag, pipe_separate in self._LINE_TAG_RULES:
                if pattern.search(line):
                    if pipe_separate:
                        line = re.sub(r'\s+', ' | ', line)
                    line = f"{tag}: {line}"
                    break

            processed_lines.append(line)

        return '\n'.join(processed_lines)

    def clean_text(self, text: str) -> str:
        """Normalise whitespace, break sentences onto lines and drop stray symbols.

        Steps, in order: collapse blank-line runs to one blank line, collapse
        space/tab runs to a single space, replace the whitespace between
        sentence-ending punctuation and a following capital letter with a
        newline, and remove every character outside the allowed set
        (word characters, whitespace and ``.,;:!?()%@-[]{}|``).
        """
        text = re.sub(r'\n\s*\n', '\n\n', text)
        text = re.sub(r'[ \t]+', ' ', text)
        text = re.sub(r'([.!?])\s+([A-Z])', r'\1\n\2', text)
        text = re.sub(r'[^\w\s.,;:!?()\%@\-\[\]{}|]', '', text)
        return text.strip()

    def upload_and_process_pdfs(self) -> Dict[str, str]:
        """Prompt for an upload and process every uploaded PDF.

        Files whose name does not end in ``.pdf`` (case-insensitive) are
        skipped with a notice, as are PDFs from which no text at all could be
        extracted, so ``self.documents`` only ever holds non-empty texts.

        Returns:
            ``self.documents`` (filename -> cleaned text), including documents
            processed by earlier calls on the same instance.
        """
        print("Please upload your PDF files:")
        uploaded = files.upload()

        for filename, content in uploaded.items():
            if not filename.lower().endswith('.pdf'):
                print(f"⚠️ Skipping {filename}: not a PDF file")
                continue

            with open(filename, 'wb') as f:
                f.write(content)

            text = self.extract_text_from_pdf(filename)
            cleaned_text = self.clean_text(text)
            if not cleaned_text:
                # Storing an empty document would only surface later as an
                # opaque failure when embeddings are built from zero chunks.
                print(f"⚠️ Skipping {filename}: no text could be extracted")
                continue

            self.documents[filename] = cleaned_text
            print(f"✅ Processed {filename}: {len(cleaned_text)} characters")

        return self.documents

class TextChunker:
    """Split document text into sentence-aligned chunks with word overlap.

    Both ``chunk_size`` and ``overlap`` are measured in whitespace-separated
    words. ``overlap`` is the number of trailing words of a finished chunk
    that are repeated at the start of the next one; ``0`` (or a negative
    value) disables overlap.
    """

    def __init__(self, chunk_size: int = 512, overlap: int = 100):
        self.chunk_size = chunk_size
        self.overlap = overlap

    @staticmethod
    def _build_chunk(text: str, document_name: str, chunk_id: int, word_count: int) -> Dict:
        """Assemble the chunk record stored for one finished chunk."""
        return {
            'text': text.strip(),
            'document': document_name,
            'chunk_id': chunk_id,
            'word_count': word_count
        }

    def chunk_text(self, text: str, document_name: str) -> List[Dict]:
        """Chunk one document.

        The text is split into paragraphs on blank lines and into sentences
        after ``.``, ``!`` or ``?``. Sentences are appended to the current
        chunk until adding the next one would exceed ``chunk_size`` words; the
        chunk is then emitted and the next chunk starts with the last
        ``overlap`` words of the emitted one. A single sentence longer than
        ``chunk_size`` is kept whole.

        Returns:
            A list of dicts with keys ``text``, ``document``, ``chunk_id``
            (position within this document) and ``word_count``.
        """
        chunks = []
        current_chunk = ""
        word_count = 0

        for para in text.split('\n\n'):
            for sentence in re.split(r'(?<=[.!?])\s+', para):
                sentence_words = sentence.split()
                if not sentence_words:
                    # Blank paragraphs/sentences carry no words; skipping them
                    # prevents a whitespace-only buffer from being emitted as
                    # an empty chunk.
                    continue

                if word_count + len(sentence_words) > self.chunk_size and current_chunk.strip():
                    chunks.append(self._build_chunk(current_chunk, document_name, len(chunks), word_count))

                    # words[-0:] would be the WHOLE list, so overlap <= 0 must
                    # be handled explicitly to mean "no overlap".
                    if self.overlap > 0:
                        overlap_text = ' '.join(current_chunk.split()[-self.overlap:])
                    else:
                        overlap_text = ""
                    current_chunk = overlap_text + " " + sentence
                    word_count = len(current_chunk.split())
                else:
                    current_chunk += " " + sentence
                    word_count += len(sentence_words)

        if current_chunk.strip():
            chunks.append(self._build_chunk(current_chunk, document_name, len(chunks), word_count))

        return chunks

    def chunk_documents(self, documents: Dict[str, str]) -> List[Dict]:
        """Chunk every document in ``documents`` (name -> text) into one flat list.

        ``chunk_id`` values restart at 0 for each document.
        """
        all_chunks = []
        for doc_name, text in documents.items():
            all_chunks.extend(self.chunk_text(text, doc_name))

        print(f"✅ Created {len(all_chunks)} chunks total")
        return all_chunks

# Simplified embedding manager - NO FINE-TUNING
class SimpleEmbeddingManager:
    """Simple embedding manager with no fine-tuning - just different base models.

    Wraps one sentence-embedding model together with an inner-product FAISS
    index over L2-normalised chunk embeddings, so search scores are cosine
    similarities.
    """

    def __init__(self, model_name: str):
        self.model_name = model_name
        self.model = None        # loaded lazily by load_model()
        self.embeddings = None   # raw (un-normalised) embeddings from create_embeddings()
        self.chunks = None       # chunk dicts, in the same order as the index rows
        self.index = None        # FAISS index over the normalised embeddings

    @staticmethod
    def _l2_normalize(vectors) -> np.ndarray:
        """Return ``vectors`` as float32 rows scaled to unit L2 norm.

        The norm is floored at a tiny epsilon so that an all-zero row stays
        all-zero instead of turning into NaN.
        """
        vectors = np.asarray(vectors, dtype='float32')
        norms = np.linalg.norm(vectors, axis=1, keepdims=True)
        return vectors / np.maximum(norms, 1e-12)

    def load_model(self):
        """Load the model"""
        print(f"🔄 Loading model: {self.model_name}")
        self.model = SentenceTransformer(self.model_name)
        print("✅ Model loaded successfully")

    def create_embeddings(self, chunks: List[Dict]) -> np.ndarray:
        """Embed ``chunks`` and (re)build the similarity index.

        Args:
            chunks: Chunk dicts; only the ``'text'`` key is read here.

        Returns:
            The raw embeddings returned by the model (one row per chunk).

        Raises:
            ValueError: If ``chunks`` is empty, since no index can be built.
        """
        if not chunks:
            raise ValueError("No chunks provided; cannot create embeddings.")

        if self.model is None:
            self.load_model()

        texts = [chunk['text'] for chunk in chunks]
        embeddings = self.model.encode(texts, show_progress_bar=True, batch_size=32)

        self.chunks = chunks
        self.embeddings = embeddings

        # Inner product over unit vectors == cosine similarity.
        dimension = embeddings.shape[1]
        self.index = faiss.IndexFlatIP(dimension)
        self.index.add(self._l2_normalize(embeddings))

        print(f"✅ Created {len(embeddings)} embeddings with {self.model_name} (dim: {dimension})")
        return embeddings

    def retrieve_relevant_chunks(self, query: str, top_k: int = 8) -> List[Dict]:
        """Return up to ``top_k`` chunks most similar to ``query``, best first.

        Each result is a shallow copy of the stored chunk dict with an added
        ``'similarity_score'`` float; the stored chunks are not modified.

        Raises:
            ValueError: If no index has been built yet.
        """
        if self.index is None:
            raise ValueError("Index not created. Please create embeddings first.")

        query_normalized = self._l2_normalize(self.model.encode([query]))
        scores, indices = self.index.search(query_normalized, top_k)

        results = []
        for score, idx in zip(scores[0], indices[0]):
            idx = int(idx)
            # When fewer than top_k vectors are indexed the search pads the
            # result with index -1; without the lower bound that would pick
            # self.chunks[-1] with a meaningless score.
            if 0 <= idx < len(self.chunks):
                chunk = self.chunks[idx].copy()
                chunk['similarity_score'] = float(score)
                results.append(chunk)

        return results

# Evaluation system for measuring improvement
class RetrievalEvaluator:
    """Score a retriever by the similarity of its top hits on a fixed query set.

    No relevance labels are used: the reported numbers are the similarity
    scores returned by the retriever itself, plus the share of queries whose
    best hit clears a similarity threshold.
    """

    # Query set used when the caller does not supply one.
    DEFAULT_TEST_QUERIES = (
        "What is the primary goal of the SELF-ROUTE method proposed by Zhuowan Li?",
        "Explain why the researchers believe RAG might still be useful despite the superior performance of long-context LLMs",
        "Compare the reranking techniques mentioned in the Wang paper. How do they impact the retrieval quality?",
        "What are the trade-offs involved when using different chunking strategies in RAG systems?",
        "How does multimodal retrieval enhance the capabilities of RAG?",
        "What were the key failure cases for RAG in handling long context retrievals, as noted by Zhuowan Li?",
        "Why does the Zhuowan paper claim that long-context LLMs outperformed RAG in most cases? What benefits does RAG still offer?",
        "Describe the metrics used to evaluate the different embedding models for RAG in Wang's paper",
        "Discuss the implications of using self-reflection in routing queries between RAG and long-context LLMs",
        "How does query rewriting contribute to the overall efficiency of RAG according to Wang's findings?",
        "Compare the cost-efficiency and performance trade-offs between RAG and long-context language models (LC) as discussed in the Wang and Zhuowan Li papers. How do these methods balance the ability to handle large volumes of text with computational demands?",
        "In terms of chunking methods in Wang's paper, what is the difference in performance between the best and second-best methods in Table 4?",
        "What are the best approaches for the retrieval and reranking modules according to Table 11 in Wang paper?",
    )

    def __init__(self, test_queries: Optional[List[str]] = None):
        """Use ``test_queries`` if given, otherwise the built-in query set."""
        if test_queries is None:
            test_queries = self.DEFAULT_TEST_QUERIES
        self.test_queries = list(test_queries)

    def evaluate_system(self, embedding_manager: "SimpleEmbeddingManager",
                       system_name: str = "RAG System",
                       top_k: int = 5,
                       success_threshold: float = 0.3) -> Dict:
        """Run every test query through ``embedding_manager`` and aggregate scores.

        Args:
            embedding_manager: Object exposing
                ``retrieve_relevant_chunks(query, top_k=...)`` that returns
                dicts carrying a ``'similarity_score'``.
            system_name: Label used in the printed report and the result dict.
            top_k: Number of chunks requested per query.
            success_threshold: A query counts as successful when its top-1
                score is strictly greater than this value.

        Returns:
            Dict with ``system_name``, per-query records in ``queries``,
            ``top_1_scores``, ``top_3_avg_scores``, ``avg_similarity`` (mean
            top-1 score over queries that returned results),
            ``retrieval_success_rate`` (successful queries divided by ALL test
            queries, so queries that raised or returned nothing count as
            failures) and ``num_failed_queries``.
        """
        print(f"📊 Evaluating {system_name}...")

        results = {
            'system_name': system_name,
            'queries': [],
            'avg_similarity': 0.0,
            'top_1_scores': [],
            'top_3_avg_scores': [],
            'retrieval_success_rate': 0.0,
            'num_failed_queries': 0
        }

        successful_retrievals = 0

        for i, query in enumerate(self.test_queries):
            try:
                chunks = embedding_manager.retrieve_relevant_chunks(query, top_k=top_k)
            except Exception as e:
                print(f"❌ Error evaluating query {i+1}: {e}")
                results['num_failed_queries'] += 1
                continue

            if not chunks:
                results['num_failed_queries'] += 1
                continue

            top_1_score = chunks[0]['similarity_score']
            top_3_avg = float(np.mean([c['similarity_score'] for c in chunks[:3]]))

            results['queries'].append({
                'query': query,
                'top_1_score': top_1_score,
                'top_3_avg': top_3_avg,
                'num_results': len(chunks)
            })
            results['top_1_scores'].append(top_1_score)
            results['top_3_avg_scores'].append(top_3_avg)

            if top_1_score > success_threshold:
                successful_retrievals += 1

        # Calculate aggregate metrics
        if results['top_1_scores']:
            results['avg_similarity'] = sum(results['top_1_scores']) / len(results['top_1_scores'])
        if self.test_queries:
            # Divide by every query asked, not only by those that produced a
            # score; otherwise failed queries silently inflate the rate.
            results['retrieval_success_rate'] = successful_retrievals / len(self.test_queries)

        self._print_evaluation_results(results)
        return results

    def _print_evaluation_results(self, results: Dict):
        """Print the aggregate metrics and a one-line breakdown per query."""
        print(f"\n📊 EVALUATION RESULTS: {results['system_name']}")
        print("=" * 50)
        print(f"Average Top-1 Similarity: {results['avg_similarity']:.4f}")
        print(f"Retrieval Success Rate: {results['retrieval_success_rate']:.2%}")

        if results['top_1_scores']:
            print(f"Best Query Score: {max(results['top_1_scores']):.4f}")
            print(f"Worst Query Score: {min(results['top_1_scores']):.4f}")
            print(f"Total Queries Evaluated: {len(results['top_1_scores'])}")

        # Only shown when something went wrong, so a clean run prints exactly
        # what it always did.
        if results.get('num_failed_queries', 0):
            print(f"Queries Without Results: {results['num_failed_queries']}")

        # Show per-query breakdown for detailed analysis
        print(f"\n📋 PER-QUERY BREAKDOWN:")
        for i, query_result in enumerate(results['queries'], 1):
            query_text = query_result['query']
            query_short = query_text[:60] + "..." if len(query_text) > 60 else query_text
            print(f"{i:2d}. Score: {query_result['top_1_score']:.3f} | {query_short}")

        print("=" * 50)

# Simple RAG system with model comparison (NO FINE-TUNING)
class SimpleRAGComparison:
    """Simple RAG system that compares different base models.

    Two ``SimpleEmbeddingManager`` instances (a "base" and an "enhanced"
    model) are built over the same chunks so their retrieval scores can be
    compared and used for simple extractive answers.
    """

    def __init__(self):
        self.base_embedding_manager = None      # set by setup_documents()
        self.enhanced_embedding_manager = None  # set by setup_documents()
        self.documents = {}
        self.chunks = []

    def setup_documents(self, documents: Dict[str, str]):
        """Chunk ``documents`` (name -> text) and index them with both models."""
        self.documents = documents

        # Create chunks
        chunker = TextChunker(chunk_size=400, overlap=80)
        self.chunks = chunker.chunk_documents(documents)

        # Setup base system (MPNet)
        print("🔧 Setting up base embedding system (MPNet)...")
        self.base_embedding_manager = SimpleEmbeddingManager("sentence-transformers/all-mpnet-base-v2")
        self.base_embedding_manager.create_embeddings(self.chunks)

        # Setup enhanced system (BGE)
        print("🔧 Setting up enhanced embedding system (BGE)...")
        self.enhanced_embedding_manager = SimpleEmbeddingManager("BAAI/bge-base-en-v1.5")
        self.enhanced_embedding_manager.create_embeddings(self.chunks)

        print("✅ Both RAG systems ready!")

    def compare_systems(self) -> Dict:
        """Evaluate both systems and report the change in average top-1 score.

        Returns:
            Dict with the evaluator results under ``'base'`` and
            ``'enhanced'`` and an ``'improvement'`` dict holding the
            ``'absolute'`` difference and ``'relative_pct'`` (0 when the base
            average is not positive).
        """
        evaluator = RetrievalEvaluator()

        base_results = evaluator.evaluate_system(self.base_embedding_manager, "Base System (MPNet)")
        enhanced_results = evaluator.evaluate_system(self.enhanced_embedding_manager, "Enhanced System (BGE)")

        results = {
            'base': base_results,
            'enhanced': enhanced_results
        }

        # Calculate improvement
        base_avg = base_results['avg_similarity']
        improvement = enhanced_results['avg_similarity'] - base_avg
        improvement_pct = (improvement / base_avg) * 100 if base_avg > 0 else 0

        # The "+" format flag prints the real sign, so a regression shows as
        # "-0.0123" rather than "+-0.0123".
        print(f"\n🎯 IMPROVEMENT SUMMARY:")
        print(f"Absolute Improvement: {improvement:+.4f}")
        print(f"Relative Improvement: {improvement_pct:+.1f}%")

        results['improvement'] = {
            'absolute': improvement,
            'relative_pct': improvement_pct
        }

        return results

    def generate_answer(self, query: str, use_enhanced: bool = True) -> Dict:
        """Retrieve chunks for ``query`` and build an extractive answer.

        Returns:
            Dict with ``query``, ``answer``, the retrieved ``chunks`` and
            ``system_used`` (``'enhanced'`` or ``'base'``). Any exception is
            caught and reported as an ``"Error: ..."`` answer with no chunks
            and ``system_used == 'error'``.
        """
        manager = self.enhanced_embedding_manager if use_enhanced else self.base_embedding_manager

        try:
            chunks = manager.retrieve_relevant_chunks(query, top_k=5)
            answer = self._create_answer_from_chunks(query, chunks)

            return {
                'query': query,
                'answer': answer,
                'chunks': chunks,
                'system_used': 'enhanced' if use_enhanced else 'base'
            }
        except Exception as e:
            return {
                'query': query,
                'answer': f"Error: {str(e)}",
                'chunks': [],
                'system_used': 'error'
            }

    def _create_answer_from_chunks(self, query: str, chunks: List[Dict]) -> str:
        """Pick the two sentences from the top three chunks that best match ``query``.

        A sentence scores one point per distinct lower-cased word it shares
        with the query, plus two points if it contains any of a few
        domain terms; sentences shorter than 20 characters or scoring zero are
        ignored. If nothing scores, the first 200 characters of the combined
        chunk text are returned instead.
        """
        if not chunks:
            return "No relevant information found."

        # Combine top chunks
        combined_text = " ".join([chunk['text'] for chunk in chunks[:3]])

        # Simple extractive approach
        sentences = re.split(r'(?<=[.!?])\s+', combined_text)
        query_words = set(query.lower().split())

        scored_sentences = []
        for sentence in sentences:
            sentence = sentence.strip()
            if len(sentence) < 20:
                continue

            lowered = sentence.lower()
            overlap = len(query_words.intersection(lowered.split()))

            # Bonus for academic terms
            if any(term in lowered for term in ['rag', 'retrieval', 'embedding', 'method', 'performance']):
                overlap += 2

            if overlap > 0:
                scored_sentences.append((overlap, sentence))

        if scored_sentences:
            scored_sentences.sort(reverse=True, key=lambda x: x[0])
            top_sentences = [sent for _, sent in scored_sentences[:2]]
            # The sentence split keeps each sentence's own terminator, so a
            # period is only added where one is actually missing (avoids
            # "sentence.. sentence..").
            return " ".join(
                sent if sent.endswith(('.', '!', '?')) else sent + "."
                for sent in top_sentences
            )

        return "Based on the retrieved information: " + combined_text[:200] + "..."

# Simple demo function
def run_simple_rag_comparison(output_csv: str = 'simple_model_comparison.csv'):
    """Run simple RAG comparison without fine-tuning.

    Uploads and processes PDFs, builds the base and enhanced retrieval
    systems, evaluates both, then answers every evaluated query with each
    system and prints and saves a per-query comparison.

    Args:
        output_csv: Path of the CSV file the per-query comparison is written to.

    Returns:
        ``None`` when no documents were uploaded; otherwise the tuple
        ``(rag_system, comparison_results)``, where ``comparison_results``
        gains a ``'detailed_query_results'`` list when at least one query was
        compared.
    """
    print("🚀 Starting Simple RAG Model Comparison")
    print("=" * 60)

    # Step 1: Upload and process documents
    doc_processor = DocumentProcessor()
    documents = doc_processor.upload_and_process_pdfs()

    if not documents:
        print("❌ No documents uploaded.")
        return None

    # Step 2: Setup systems
    rag_system = SimpleRAGComparison()
    rag_system.setup_documents(documents)

    # Step 3: Compare systems
    comparison_results = rag_system.compare_systems()

    # Step 4: Test both systems on ALL queries
    test_queries = comparison_results['base']['queries'] if 'base' in comparison_results else []

    print(f"\n💬 Testing All {len(test_queries)} Queries:")
    print("=" * 60)

    detailed_results = []

    for i, query_data in enumerate(test_queries, 1):
        query = query_data['query']
        print(f"\n🔍 Query {i}: {query}")
        print("-" * 60)

        # Test base system
        base_result = rag_system.generate_answer(query, use_enhanced=False)
        base_score = base_result['chunks'][0]['similarity_score'] if base_result['chunks'] else 0.0

        # Test enhanced system
        enhanced_result = rag_system.generate_answer(query, use_enhanced=True)
        enhanced_score = enhanced_result['chunks'][0]['similarity_score'] if enhanced_result['chunks'] else 0.0

        # Calculate improvement (relative change is reported as 0 when the
        # base score is not positive, to avoid dividing by zero)
        improvement = enhanced_score - base_score
        improvement_pct = (improvement / base_score * 100) if base_score > 0 else 0

        # The "+" format flag prints the real sign; a hard-coded "+" prefix
        # would render regressions as "+-0.0123".
        print(f"📊 Similarity Scores:")
        print(f"   Base System (MPNet):  {base_score:.4f}")
        print(f"   Enhanced System (BGE): {enhanced_score:.4f}")
        print(f"   Improvement:          {improvement:+.4f} ({improvement_pct:+.1f}%)")

        print(f"\n📄 Base Answer: {base_result['answer'][:200]}...")
        print(f"\n🚀 Enhanced Answer: {enhanced_result['answer'][:200]}...")

        detailed_results.append({
            'query_id': i,
            'query': query,
            'base_score': base_score,
            'enhanced_score': enhanced_score,
            'improvement': improvement,
            'improvement_pct': improvement_pct,
            'base_answer': base_result['answer'],
            'enhanced_answer': enhanced_result['answer']
        })

        print("-" * 60)

    # Summary statistics
    if detailed_results:
        avg_base = np.mean([r['base_score'] for r in detailed_results])
        avg_enhanced = np.mean([r['enhanced_score'] for r in detailed_results])
        avg_improvement = avg_enhanced - avg_base
        avg_improvement_pct = (avg_improvement / avg_base * 100) if avg_base > 0 else 0

        print(f"\n📈 OVERALL SUMMARY:")
        print("=" * 60)
        print(f"Average Base Score (MPNet):  {avg_base:.4f}")
        print(f"Average Enhanced Score (BGE): {avg_enhanced:.4f}")
        print(f"Average Improvement:         {avg_improvement:+.4f} ({avg_improvement_pct:+.1f}%)")

        # Best improvements
        best_improvements = sorted(detailed_results, key=lambda x: x['improvement_pct'], reverse=True)[:3]
        print(f"\n🏆 TOP 3 IMPROVEMENTS:")
        for j, result in enumerate(best_improvements, 1):
            print(f"{j}. Query {result['query_id']}: {result['improvement_pct']:+.1f}% improvement")
            print(f"   {result['query'][:80]}...")

        # Save detailed results
        results_df = pd.DataFrame(detailed_results)
        results_df.to_csv(output_csv, index=False)
        print(f"\n💾 Detailed results saved to '{output_csv}'")

        comparison_results['detailed_query_results'] = detailed_results

    return rag_system, comparison_results

# When this cell/module runs as the main program, print a short usage hint.
if __name__ == "__main__":
    for banner_line in (
        "Simple RAG Model Comparison Ready!",
        "Run: rag_system, results = run_simple_rag_comparison()",
    ):
        print(banner_line)

Simple RAG Model Comparison Ready!
Run: rag_system, results = run_simple_rag_comparison()


In [43]:
# NOTE(review): on success run_simple_rag_comparison() returns a
# (rag_system, comparison_results) tuple, so `results` holds that whole tuple
# here, and None when no documents were uploaded. Confirm how later cells
# index it before switching to `rag_system, results = ...`.
results = run_simple_rag_comparison()

🚀 Starting Simple RAG Model Comparison
Please upload your PDF files:


Saving 2407.pdf to 2407 (7).pdf
✅ Processed 2407 (7).pdf: 127122 characters
✅ Created 62 chunks total
🔧 Setting up base embedding system (MPNet)...
🔄 Loading model: sentence-transformers/all-mpnet-base-v2
✅ Model loaded successfully


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

✅ Created 62 embeddings with sentence-transformers/all-mpnet-base-v2 (dim: 768)
🔧 Setting up enhanced embedding system (BGE)...
🔄 Loading model: BAAI/bge-base-en-v1.5
✅ Model loaded successfully


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

✅ Created 62 embeddings with BAAI/bge-base-en-v1.5 (dim: 768)
✅ Both RAG systems ready!
📊 Evaluating Base System (MPNet)...

📊 EVALUATION RESULTS: Base System (MPNet)
Average Top-1 Similarity: 0.6151
Retrieval Success Rate: 100.00%
Best Query Score: 0.7465
Worst Query Score: 0.3508
Total Queries Evaluated: 13

📋 PER-QUERY BREAKDOWN:
 1. Score: 0.351 | What is the primary goal of the SELF-ROUTE method proposed b...
 2. Score: 0.704 | Explain why the researchers believe RAG might still be usefu...
 3. Score: 0.726 | Compare the reranking techniques mentioned in the Wang paper...
 4. Score: 0.472 | What are the trade-offs involved when using different chunki...
 5. Score: 0.630 | How does multimodal retrieval enhance the capabilities of RA...
 6. Score: 0.662 | What were the key failure cases for RAG in handling long con...
 7. Score: 0.639 | Why does the Zhuowan paper claim that long-context LLMs outp...
 8. Score: 0.482 | Describe the metrics used to evaluate the different embeddin...
 