In [3]:
# First, install additional dependencies
!pip install sentence-transformers[train] datasets accelerate

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers[train])
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers[train])
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers[train])
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers[train])
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.11.0->sentence-transformers[train])
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torc

In [1]:
!pip install transformers sentence-transformers faiss-cpu PyPDF2 torch openai python-dotenv
!pip install --upgrade langchain langchain-community

Collecting faiss-cpu
  Downloading faiss_cpu-1.11.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.8 kB)
Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Collecting python-dotenv
  Downloading python_dotenv-1.1.1-py3-none-any.whl.metadata (24 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-m

In [11]:
import os
import re
import numpy as np
import pandas as pd
from typing import List, Dict, Tuple
import PyPDF2
from io import BytesIO
import faiss
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
import torch
from google.colab import files
import warnings
warnings.filterwarnings('ignore')

In [12]:
import json
from sentence_transformers import SentenceTransformer, losses, InputExample
from sentence_transformers.evaluation import InformationRetrievalEvaluator
from torch.utils.data import DataLoader

In [14]:
# Document processor for handling PDF files
class DocumentProcessor:
    def __init__(self):
        self.documents = {}

    def extract_text_from_pdf(self, pdf_path: str) -> str:
        """Extract text from PDF file"""
        text = ""
        try:
            with open(pdf_path, 'rb') as file:
                pdf_reader = PyPDF2.PdfReader(file)
                for page_num, page in enumerate(pdf_reader.pages):
                    page_text = page.extract_text()
                    # Enhanced table extraction
                    page_text = self.enhance_table_extraction(page_text)
                    text += f"\nPage {page_num + 1}: {page_text}\n"
        except Exception as e:
            print(f"Error reading PDF {pdf_path}: {e}")
        return text

    def enhance_table_extraction(self, text: str) -> str:
        """Improve table formatting and preserve critical academic content"""
        lines = text.split('\n')
        processed_lines = []

        for line in lines:
            line = line.strip()
            if not line:
                continue

            # Self-route method detection
            if re.search(r'(self-route|routing|self-reflection)', line, re.IGNORECASE):
                processed_lines.append(f"SELF_ROUTE_CONTENT: {line}")

            # Failure type lists
            elif re.search(r'(failure|error).*(type|case|category)', line, re.IGNORECASE):
                processed_lines.append(f"FAILURE_TYPES: {line}")
            elif re.search(r'(multi-step|general knowledge|implicit|long.?complex)', line, re.IGNORECASE):
                processed_lines.append(f"FAILURE_DETAIL: {line}")

            # Evaluation metrics and tables
            elif re.search(r'(mrr|recall@|ndcg@|precision|f1)', line, re.IGNORECASE):
                line = re.sub(r'\s+', ' | ', line)
                processed_lines.append(f"METRICS_TABLE: {line}")

            # Chunking strategy content
            elif re.search(r'(chunk|segment|overlap|window)', line, re.IGNORECASE):
                processed_lines.append(f"CHUNKING_STRATEGY: {line}")

            # Performance comparisons
            elif re.search(r'(outperform|superior|better|vs|versus|comparison)', line, re.IGNORECASE):
                processed_lines.append(f"PERFORMANCE_COMPARISON: {line}")

            # Method objectives and goals
            elif re.search(r'(goal|objective|aim|purpose|method)', line, re.IGNORECASE):
                processed_lines.append(f"METHOD_GOAL: {line}")

            else:
                processed_lines.append(line)

        return '\n'.join(processed_lines)

    def clean_text(self, text: str) -> str:
        """Clean and normalize text"""
        text = re.sub(r'\n\s*\n', '\n\n', text)
        text = re.sub(r'[ \t]+', ' ', text)
        text = re.sub(r'([.!?])\s+([A-Z])', r'\1\n\2', text)
        text = re.sub(r'[^\w\s.,;:!?()\%@\-\[\]{}|]', '', text)
        return text.strip()

    def upload_and_process_pdfs(self) -> Dict[str, str]:
        """Upload and process PDF files"""
        print("Please upload your PDF files:")
        uploaded = files.upload()

        for filename, content in uploaded.items():
            if filename.endswith('.pdf'):
                # Save uploaded file
                with open(filename, 'wb') as f:
                    f.write(content)

                # Process the PDF
                text = self.extract_text_from_pdf(filename)
                cleaned_text = self.clean_text(text)
                self.documents[filename] = cleaned_text
                print(f"✅ Processed {filename}: {len(cleaned_text)} characters")

        return self.documents


In [15]:
# Text chunking with overlap
class TextChunker:
    def __init__(self, chunk_size: int = 512, overlap: int = 100):
        self.chunk_size = chunk_size
        self.overlap = overlap

    def chunk_text(self, text: str, document_name: str) -> List[Dict]:
        """Split text into overlapping chunks"""
        paragraphs = text.split('\n\n')
        chunks = []
        current_chunk = ""
        word_count = 0

        for para in paragraphs:
            sentences = re.split(r'(?<=[.!?])\s+', para)

            for sentence in sentences:
                sentence_words = sentence.split()

                if word_count + len(sentence_words) > self.chunk_size and current_chunk:
                    chunks.append({
                        'text': current_chunk.strip(),
                        'document': document_name,
                        'chunk_id': len(chunks),
                        'word_count': word_count
                    })

                    # Create new chunk with overlap
                    overlap_text = ' '.join(current_chunk.split()[-self.overlap:])
                    current_chunk = overlap_text + " " + sentence
                    word_count = len(current_chunk.split())
                else:
                    current_chunk += " " + sentence
                    word_count += len(sentence_words)

        # Add final chunk
        if current_chunk.strip():
            chunks.append({
                'text': current_chunk.strip(),
                'document': document_name,
                'chunk_id': len(chunks),
                'word_count': word_count
            })

        return chunks

    def chunk_documents(self, documents: Dict[str, str]) -> List[Dict]:
        """Chunk all documents"""
        all_chunks = []
        for doc_name, text in documents.items():
            chunks = self.chunk_text(text, doc_name)
            all_chunks.extend(chunks)

        print(f"✅ Created {len(all_chunks)} chunks total")
        return all_chunks

# Embedding manager for document retrieval
class EmbeddingManager:
    def __init__(self, model_name: str = 'all-MiniLM-L6-v2'):
        self.model = SentenceTransformer(model_name)
        self.embeddings = None
        self.chunks = None
        self.index = None

    def create_embeddings(self, chunks: List[Dict]) -> np.ndarray:
        """Create embeddings for chunks"""
        texts = [chunk['text'] for chunk in chunks]
        embeddings = self.model.encode(texts, show_progress_bar=True)

        self.chunks = chunks
        self.embeddings = embeddings

        # Create FAISS index
        dimension = embeddings.shape[1]
        self.index = faiss.IndexFlatIP(dimension)

        # Normalize embeddings for cosine similarity
        embeddings_normalized = embeddings / np.linalg.norm(embeddings, axis=1, keepdims=True)
        self.index.add(embeddings_normalized.astype('float32'))

        print(f"✅ Created {len(embeddings)} embeddings with dimension {dimension}")
        return embeddings

    def expand_query(self, query: str) -> List[str]:
        """Expand query for better retrieval"""
        base_query = query.lower()
        expanded_queries = [query]

        # Self-route specific expansions
        if 'self-route' in base_query and 'goal' in base_query:
            expanded_queries.extend([
                "SELF-ROUTE self-reflection routing decision",
                "model self-reflection dynamically route queries",
                "routing between RAG and LC cost context length",
                "Zhuowan Li SELF-ROUTE method objective",
                "self-reflection mechanism route queries RAG long-context"
            ])

        # Failure types expansions
        if 'failure' in base_query and ('four' in base_query or 'types' in base_query or 'cases' in base_query):
            expanded_queries.extend([
                "four failure types RAG multi-step general implicit long complex",
                "Multi-step reasoning failure General knowledge failure",
                "Implicit knowledge failure Long complex context failure",
                "failure categories RAG handling long context",
                "Zhuowan Li four key failure cases"
            ])

        # Add more expansions as needed...
        return expanded_queries

    def retrieve_relevant_chunks(self, query: str, top_k: int = 8) -> List[Dict]:
        """Retrieve relevant chunks for query"""
        if self.index is None:
            raise ValueError("Index not created. Please create embeddings first.")

        expanded_queries = self.expand_query(query)
        all_results = []

        for exp_query in expanded_queries:
            query_embedding = self.model.encode([exp_query])
            query_normalized = query_embedding / np.linalg.norm(query_embedding, axis=1, keepdims=True)

            scores, indices = self.index.search(query_normalized.astype('float32'), top_k)

            for score, idx in zip(scores[0], indices[0]):
                if idx < len(self.chunks):  # Safety check
                    chunk = self.chunks[idx].copy()
                    chunk['similarity_score'] = float(score)
                    chunk['query_variant'] = exp_query
                    all_results.append(chunk)

        # Remove duplicates and sort by score
        seen_chunks = set()
        unique_results = []
        for result in all_results:
            chunk_id = result['chunk_id']
            if chunk_id not in seen_chunks:
                seen_chunks.add(chunk_id)
                unique_results.append(result)

        unique_results.sort(key=lambda x: x['similarity_score'], reverse=True)
        return unique_results[:top_k]

In [16]:
# Main RAG System
class ImprovedRAGSystem:
    def __init__(self):
        self.embedding_manager = EmbeddingManager()
        self.documents = {}

    def setup_documents(self, documents: Dict[str, str]):
        """Setup documents for RAG system"""
        self.documents = documents
        chunker = TextChunker(chunk_size=400, overlap=80)
        chunks = chunker.chunk_documents(documents)
        self.embedding_manager.create_embeddings(chunks)
        print("✅ RAG system ready!")

    def generate_answer(self, query: str, top_k: int = 8) -> Dict:
        """Generate answer for query"""
        try:
            # Retrieve relevant chunks
            relevant_chunks = self.embedding_manager.retrieve_relevant_chunks(query, top_k)

            # Create context from chunks
            context = self.create_context(relevant_chunks, query)

            # Generate extractive answer
            answer = self.generate_extractive_answer(query, context)

            return {
                'query': query,
                'answer': answer,
                'relevant_chunks': relevant_chunks,
                'context_used': context
            }

        except Exception as e:
            print(f"Error in generate_answer: {e}")
            return {
                'query': query,
                'answer': f"Error processing query: {str(e)}",
                'relevant_chunks': [],
                'context_used': ""
            }

    def create_context(self, chunks: List[Dict], query: str, max_length: int = 1000) -> str:
        """Create focused context from chunks"""
        context_parts = []
        current_length = 0

        sorted_chunks = sorted(chunks, key=lambda x: x['similarity_score'], reverse=True)

        for i, chunk in enumerate(sorted_chunks):
            relevant_sentences = self.extract_relevant_sentences(chunk['text'], query)

            if relevant_sentences:
                chunk_context = f"[Source {i+1}]: {relevant_sentences}"

                if current_length + len(chunk_context) > max_length:
                    break

                context_parts.append(chunk_context)
                current_length += len(chunk_context)

        return "\n\n".join(context_parts)

    def extract_relevant_sentences(self, text: str, query: str) -> str:
        """Extract most relevant sentences from text"""
        sentences = re.split(r'(?<=[.!?])\s+', text)
        query_words = set(query.lower().split())

        scored_sentences = []
        for sentence in sentences:
            if len(sentence.strip()) < 15:
                continue

            sentence_words = set(sentence.lower().split())
            overlap = len(query_words.intersection(sentence_words))

            # Bonus for academic terms
            if any(term in sentence.lower() for term in ['rag', 'llm', 'retrieval', 'embedding', 'failure', 'performance']):
                overlap += 1

            if overlap > 0:
                scored_sentences.append((overlap, sentence))

        scored_sentences.sort(reverse=True, key=lambda x: x[0])
        top_sentences = [sent for _, sent in scored_sentences[:3]]

        return ". ".join(top_sentences) + "." if top_sentences else ""

    def generate_extractive_answer(self, query: str, context: str) -> str:
        """Generate extractive answer from context"""
        if not context:
            return "No relevant information found in the documents."

        query_lower = query.lower()

        # Extract based on query type
        sentences = re.split(r'(?<=[.!?])\s+', context)
        scored_sentences = []
        query_words = set(query.split())

        for sentence in sentences:
            if len(sentence.strip()) < 25:
                continue

            sentence_lower = sentence.lower()
            sentence_words = set(sentence_lower.split())

            # Calculate relevance score
            overlap = len(query_words.intersection(sentence_words))

            # Add bonuses for specific terms
            bonus = 0
            if 'self-route' in query_lower:
                if any(term in sentence_lower for term in ['self-route', 'routing', 'reflection']):
                    bonus += 3

            if 'failure' in query_lower:
                if any(term in sentence_lower for term in ['failure', 'error', 'problem']):
                    bonus += 3

            total_score = overlap + bonus

            if total_score >= 3:
                scored_sentences.append((total_score, sentence.strip()))

        if scored_sentences:
            scored_sentences.sort(reverse=True, key=lambda x: x[0])
            top_sentences = [sent for _, sent in scored_sentences[:3]]
            return ". ".join(top_sentences) + "."

        return "Based on the available documents, no specific information was found."

In [17]:
# Demo function
def run_rag_demo():
    """Run complete RAG demonstration"""
    print("🚀 Starting Advanced RAG System Demo")
    print("=" * 50)

    # Step 1: Initialize components
    doc_processor = DocumentProcessor()
    rag_system = ImprovedRAGSystem()

    # Step 2: Upload and process documents
    print("\n📁 Step 1: Document Processing")
    documents = doc_processor.upload_and_process_pdfs()

    if not documents:
        print("❌ No documents uploaded. Demo cannot continue.")
        return None

    # Step 3: Setup RAG system
    print("\n🔧 Step 2: Setting up RAG System")
    rag_system.setup_documents(documents)

    # Step 4: Demo queries
    demo_queries = [
        "What is the primary goal of the SELF-ROUTE method proposed by Zhuowan Li?",
        "What are the four failure types for RAG identified in the research?",
        "How does chunking strategy affect RAG performance?",
        "What metrics are used to evaluate embedding models?",
        "Why might RAG still be useful despite long-context LLM superiority?"
    ]

    print("\n💬 Step 3: Testing Queries")
    print("-" * 30)

    results = []
    for i, query in enumerate(demo_queries, 1):
        print(f"\n🔍 Query {i}: {query}")
        result = rag_system.generate_answer(query)
        results.append(result)

        print(f"📄 Answer: {result['answer'][:200]}...")
        if result['relevant_chunks']:
            print(f"📊 Top similarity score: {result['relevant_chunks'][0]['similarity_score']:.3f}")
        print("-" * 30)

    print(f"\n✅ Demo completed! Processed {len(results)} queries.")
    return rag_system, results

# Test queries for evaluation
TEST_QUERIES = [
    "What is the primary goal of the SELF-ROUTE method proposed by Zhuowan Li?",
    "Explain why the researchers believe RAG might still be useful despite the superior performance of long-context LLMs",
    "Compare the reranking techniques mentioned in the Wang paper. How do they impact the retrieval quality?",
    "What are the trade-offs involved when using different chunking strategies in RAG systems?",
    "How does multimodal retrieval enhance the capabilities of RAG?",
    "What were the key failure cases for RAG in handling long context retrievals, as noted by Zhuowan Li?",
    "Why does the Zhuowan paper claim that long-context LLMs outperformed RAG in most cases? What benefits does RAG still offer?",
    "Describe the metrics used to evaluate the different embedding models for RAG in Wang's paper",
    "Discuss the implications of using self-reflection in routing queries between RAG and long-context LLMs",
    "How does query rewriting contribute to the overall efficiency of RAG according to Wang's findings?"
]

if __name__ == "__main__":
    print("Advanced RAG System Ready!")
    print("Run: rag_system, results = run_rag_demo()")

Advanced RAG System Ready!
Run: rag_system, results = run_rag_demo()


In [19]:
results = run_rag_demo()

🚀 Starting Advanced RAG System Demo


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]


📁 Step 1: Document Processing
Please upload your PDF files:


Saving 2407.pdf to 2407.pdf
✅ Processed 2407.pdf: 127122 characters

🔧 Step 2: Setting up RAG System
✅ Created 62 chunks total


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

✅ Created 62 embeddings with dimension 384
✅ RAG system ready!

💬 Step 3: Testing Queries
------------------------------

🔍 Query 1: What is the primary goal of the SELF-ROUTE method proposed by Zhuowan Li?
📄 Answer: [Source 1]: While recent LLMs like Gemini-1.5 (Reid et al., 2024), GPT-4 (Achiam et al., 2023), Claude- 3 (Anthropic, 2024) achieve significantly larger CHUNKING_STRATEGY: context window size, long-co...
📊 Top similarity score: 0.528
------------------------------

🔍 Query 2: What are the four failure types for RAG identified in the research?
📄 Answer: For NarrativeQA, which are long stories containing
FAILURE_TYPES: a lot of dialogues, most failure cases are due to im-
plicit queries that requires understanding the whole
context (shown in green)......
📊 Top similarity score: 0.482
------------------------------

🔍 Query 3: How does chunking strategy affect RAG performance?
📄 Answer: Based on the available documents, no specific information was found....
📊 Top similarity s