# Notebook 04: RAG System Prototype

### Build end-to-end retrieval-augmented generation system

In [1]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
import faiss
import requests
import os
from dotenv import load_dotenv
import time
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load environment variables
load_dotenv()
HF_API_KEY = os.getenv('HUGGINGFACE_API_KEY')

if not HF_API_KEY:
    print("⚠️  Warning: HUGGINGFACE_API_KEY not found in .env file")
    print("Please create a .env file with your API key")

print("="*60)
print("RAG SYSTEM PROTOTYPE")
print("="*60)

RAG SYSTEM PROTOTYPE


In [3]:
# ================================
# 1. LOAD DATA & INDEX
# ================================

print("\n📂 Loading data and index...")

# Load cleaned papers
df = pd.read_csv('../data/arxiv_papers_clean.csv')
print(f"✓ Loaded {len(df)} papers")

# Load embedding model
model = SentenceTransformer('all-MiniLM-L6-v2')
print("✓ Loaded embedding model")

# Load FAISS index
index = faiss.read_index('../data/faiss_index.bin')
print(f"✓ Loaded FAISS index ({index.ntotal} vectors)")



📂 Loading data and index...
✓ Loaded 2130 papers
✓ Loaded embedding model
✓ Loaded FAISS index (2130 vectors)


In [4]:

# ================================
# 2. RETRIEVAL FUNCTION
# ================================

def retrieve_relevant_papers(query, top_k=5):
    """
    Retrieve papers most relevant to the query
    
    Args:
        query: User's search query
        top_k: Number of papers to retrieve
    
    Returns:
        DataFrame with retrieved papers and scores
    """
    start_time = time.time() # START TIMING

    # Encode query
    query_embedding = model.encode([query], convert_to_numpy=True)
    query_normalized = query_embedding / np.linalg.norm(query_embedding)
    
    # Search in FAISS
    distances, indices = index.search(query_normalized.astype('float32'), top_k)
    
    # Get papers
    retrieved_papers = df.iloc[indices[0]].copy()
    retrieved_papers['relevance_score'] = distances[0]
    
    end_time = time.time() # END TIMING
    search_latency = end_time - start_time

    return retrieved_papers,search_latency


In [5]:
# ================================
# 3. SUMMARIZATION FUNCTION
# ================================

def summarize_with_llm(text, model_name="facebook/bart-large-cnn", max_length=130):
    """
    Summarize text using Hugging Face Inference API
    ...
    Returns:
        Generated summary and the generation latency (in seconds)
    """
    API_URL = f"https://api-inference.huggingface.co/models/{model_name}"
    headers = {"Authorization": f"Bearer {HF_API_KEY}"}
    
    # Truncate if too long (API limits)
    if len(text.split()) > 500:
        text = ' '.join(text.split()[:500])
    
    payload = {
        "inputs": text,
        "parameters": {
            "max_length": max_length,
            "min_length": 30,
            "do_sample": False
        }
    }
    
    start_time = time.time() # START TIMING
    
    # Inner function to handle API call and retry
    def api_call(url, headers, json_payload):
        try:
            response = requests.post(url, headers=headers, json=json_payload, timeout=30)
            if response.status_code == 200:
                result = response.json()
                if isinstance(result, list) and len(result) > 0:
                    return result[0].get('summary_text', ''), response.status_code
            return f"Error: Could not generate summary (Status {response.status_code})", response.status_code
        except Exception as e:
            return f"Error: {str(e)}", -1 # Use -1 for generic exception

    summary, status_code = api_call(API_URL, headers, payload)
    
    # If model is loading, wait and retry
    if status_code == 503:
        print("  Model loading... waiting 20 seconds...")
        time.sleep(20)
        summary, status_code = api_call(API_URL, headers, payload)

    end_time = time.time() # END TIMING
    latency = end_time - start_time

    return summary, latency # RETURN LATENCY

In [6]:
# ================================
# 4. RAG PIPELINE
# ================================

def rag_pipeline(query, top_k=3, summarize=True):
    """
    Full RAG pipeline: Retrieve + Generate
    
    Args:
        query: User's question
        top_k: Number of papers to retrieve
        summarize: Whether to generate summaries
    
    Returns:
        Dictionary with retrieved papers and summaries
    """
    print(f"\n🔍 Query: '{query}'")
    print("-" * 60)
    
    # Step 1: Retrieve relevant papers
    print(f"\n📚 Retrieving top {top_k} relevant papers...")
    papers, search_latency = retrieve_relevant_papers(query, top_k) # CAPTURE LATENCY
    
    print(f"✓ Retrieved {len(papers)} papers (Time: {search_latency:.3f}s)")
    
    # Step 2: Display papers
    results = []
    total_summary_latency = 0 # NEW VARIABLE
    for idx, (i, paper) in enumerate(papers.iterrows(), 1):
        print(f"\n{idx}. {paper['title']}")
        print(f"   Score: {paper['relevance_score']:.4f}")
        print(f"   Authors: {paper['authors'][:60]}...")
        print(f"   Categories: {paper['categories']}")
        
        result = {
            'rank': idx,
            'title': paper['title'],
            'abstract': paper['abstract_clean'],
            'authors': paper['authors'],
            'score': paper['relevance_score'],
            'pdf_url': paper['pdf_url']
        }
        
        # Step 3: Generate summary if requested
        if summarize and HF_API_KEY:
            
            summary, summary_latency = summarize_with_llm(paper['abstract_clean']) # CAPTURE LATENCY
            total_summary_latency += summary_latency # ACCUMULATE LATENCY

            print(f"  Summary: {summary} (Time: {summary_latency:.3f}s)")
            result['summary'] = summary
            
        
        results.append(result)
        
        if idx < len(papers):  # Avoid rate limiting
            time.sleep(1)
    
    return {
        'results': results,
        'search_latency': search_latency,
        'avg_summary_latency': total_summary_latency / len(papers) if len(papers) > 0 and summarize else 0
           }

In [7]:
# ================================
# 5. TEST RAG SYSTEM
# ================================

print("\n" + "="*60)
print("TESTING RAG SYSTEM")
print("="*60)

# Test queries
test_queries = [
    "attention mechanisms in transformers",
    "graph neural networks applications",
    "few-shot learning methods"
]

all_results = {}
all_search_latencies = [] # NEW LIST
all_summary_latencies = []

for query in test_queries:
    metrics = rag_pipeline(query, top_k=3, summarize=True)
    all_results[query] = metrics['results']
    print("\n" + "="*60)


    all_search_latencies.append(metrics['search_latency']) # STORE
    if metrics['avg_summary_latency'] > 0:
        all_summary_latencies.append(metrics['avg_summary_latency']) # STORE
    
    print("\n" + "="*60)


TESTING RAG SYSTEM

🔍 Query: 'attention mechanisms in transformers'
------------------------------------------------------------

📚 Retrieving top 3 relevant papers...
✓ Retrieved 3 papers (Time: 0.046s)

1. Vision Transformers: State of the Art and Research Challenges
   Score: 0.5049
   Authors: Bo-Kai Ruan, Hong-Han Shuai, Wen-Huang Cheng...
   Categories: cs.CV
  Summary: Transformers have achieved great success in natural language processing. This paper presents a comprehensive overview of the literature on different architecture designs and training tricks. Our goal is to provide a systematic review with the open research opportunities. (Time: 1.846s)

2. Tensor-to-Image: Image-to-Image Translation with Vision Transformers
   Score: 0.4946
   Authors: Yiğit Gündüç...
   Categories: cs.CV, cs.AI, cs.LG
  Summary: Vision transformers paper also proved that they can be used for computer vision tasks. With the help of self-attention, our model was able to generalize and apply to dif

In [8]:

# ================================
# 6. EVALUATE RETRIEVAL QUALITY
# ================================

print("\n" + "="*60)
print("RETRIEVAL QUALITY EVALUATION")
print("="*60)

# Manual evaluation: Check if retrieved papers are relevant
print("\nManual Relevance Assessment:")
print("For each query, review if retrieved papers match the topic\n")

for query, results in all_results.items():
    print(f"\n Query: '{query}'")
    for result in results:
        print(f"  {result['rank']}. {result['title'][:70]}...")
        print(f"     Relevance score: {result['score']:.4f}")



RETRIEVAL QUALITY EVALUATION

Manual Relevance Assessment:
For each query, review if retrieved papers match the topic


 Query: 'attention mechanisms in transformers'
  1. Vision Transformers: State of the Art and Research Challenges...
     Relevance score: 0.5049
  2. Tensor-to-Image: Image-to-Image Translation with Vision Transformers...
     Relevance score: 0.4946
  3. Perspectives and Prospects on Transformer Architecture for Cross-Modal...
     Relevance score: 0.4841

 Query: 'graph neural networks applications'
  1. Theory of Graph Neural Networks: Representation and Learning...
     Relevance score: 0.6616
  2. Deep Learning and Geometric Deep Learning: an introduction for mathema...
     Relevance score: 0.6293
  3. Automated Graph Machine Learning: Approaches, Libraries, Benchmarks an...
     Relevance score: 0.5737

 Query: 'few-shot learning methods'
  1. An Overview of Deep Learning Architectures in Few-Shot Learning Domain...
     Relevance score: 0.6662
  2. How to fi

In [9]:
# 7. COMPARE CONTEXT WINDOW SIZES
# ================================

print("\n" + "="*60)
print("CONTEXT WINDOW EXPERIMENT")
print("="*60)

query = "deep learning for image classification"

for k in [1, 3, 5]:
    print(f"\n Retrieving top-{k} papers:")
    papers,_ = retrieve_relevant_papers(query, top_k=k)
    
    context_length = sum(papers['abstract_clean'].str.split().str.len())
    print(f"  Total context words: {context_length}")
    print(f"  Average relevance: {papers['relevance_score'].mean():.4f}")
    print(f"  Min relevance: {papers['relevance_score'].min():.4f}")

print("\n Insight: top-3 provides good balance of relevance and context")



CONTEXT WINDOW EXPERIMENT

 Retrieving top-1 papers:
  Total context words: 97
  Average relevance: 0.6542
  Min relevance: 0.6542

 Retrieving top-3 papers:
  Total context words: 420
  Average relevance: 0.6202
  Min relevance: 0.5872

 Retrieving top-5 papers:
  Total context words: 676
  Average relevance: 0.6049
  Min relevance: 0.5774

 Insight: top-3 provides good balance of relevance and context


In [10]:
# ================================
# 8. PROMPT ENGINEERING EXPERIMENTS
# ================================

print("\n" + "="*60)
print("PROMPT ENGINEERING FOR BETTER SUMMARIES")
print("="*60)

# Experiment with adding context to summarization
def summarize_with_context(abstract, query):
    """Generate query-focused summary"""
    # Prepend query context
    contextualized_text = f"Research question: {query}\n\nAbstract: {abstract}"
    return summarize_with_llm(contextualized_text, max_length=150)

test_paper = df.iloc[0]
test_query = "machine learning"

print(f"\n📄 Paper: {test_paper['title'][:60]}...")
print(f"  Query: {test_query}")

print("\n1. Standard summary:")
summary1 = summarize_with_llm(test_paper['abstract_clean'])
print(f"   {summary1}")

print("\n2. Query-focused summary:")
summary2 = summarize_with_context(test_paper['abstract_clean'], test_query)
print(f"   {summary2}")


PROMPT ENGINEERING FOR BETTER SUMMARIES

📄 Paper: An Optimal Control View of Adversarial Machine Learning...
  Query: machine learning

1. Standard summary:
   ("I describe an optimal control view of adversarial machine learning. The control costs are defined by the adversary's goals to do harm and be hard to detect. This view encompasses many types of adversaries, including test-item attacks, training-data poisoning, and adversarial reward shaping.", 2.2076175212860107)

2. Query-focused summary:
   ("An optimal control view of adversarial machine learning. The control costs are defined by the adversary's goals to do harm and be hard to detect. This view encompasses many types of adversaria machine learning, including test-item attacks, training-data poisoning, and adversarial reward shaping.", 2.236933946609497)


In [11]:
# ================================
# 9. SAVE RAG FUNCTIONS
# ================================

print("\n" + "="*60)
print("SAVING RAG FUNCTIONS FOR APP")
print("="*60)

# These functions will be moved to utils.py for the Streamlit app
rag_functions = """
# Core RAG functions (to be used in app/utils.py):
# - retrieve_relevant_papers()
# - summarize_with_llm()
# - rag_pipeline()
"""

print(rag_functions)
print("\n✅ RAG prototype complete!")
print("Functions ready to be integrated into Streamlit app")


SAVING RAG FUNCTIONS FOR APP

# Core RAG functions (to be used in app/utils.py):
# - retrieve_relevant_papers()
# - summarize_with_llm()
# - rag_pipeline()


✅ RAG prototype complete!
Functions ready to be integrated into Streamlit app


In [12]:
# ================================
# 10. SUMMARY
# ================================

print("\n" + "="*60)
print("RAG SYSTEM SUMMARY")
print("="*60)

# CALCULATE ACTUAL AVERAGES
avg_search_latency = np.mean(all_search_latencies) if all_search_latencies else 0
avg_summary_latency = np.mean(all_summary_latencies) if all_summary_latencies else 0

summary_stats = {
    'Embedding Model': 'all-MiniLM-L6-v2',
    'Summarization Model': 'facebook/bart-large-cnn',
    'Retrieval Method': 'FAISS + Cosine Similarity',
    'Optimal Context Window': 'top-3 papers',
    'Avg Search Latency': f'{avg_search_latency*1000:.2f}ms', # Use real value
    'Avg Summary Generation Time (per paper)': f'{avg_summary_latency:.2f} seconds' # Use real value
}




for key, value in summary_stats.items():
    print(f"{key}: {value}")

print("\n✅ Ready for A/B testing in notebook 05")
print("="*60)


RAG SYSTEM SUMMARY
Embedding Model: all-MiniLM-L6-v2
Summarization Model: facebook/bart-large-cnn
Retrieval Method: FAISS + Cosine Similarity
Optimal Context Window: top-3 papers
Avg Search Latency: 29.68ms
Avg Summary Generation Time (per paper): 2.26 seconds

✅ Ready for A/B testing in notebook 05
