# 04 - Retrieval System Comparison

**Tesis:** Diseño y Validación de un Modelo Semántico Híbrido para Optimizar Sistemas RAG

Within-subjects experiment comparing 3 retrieval systems:
1. **Control 1 - BM25**: Pure lexical retrieval (BM25Okapi)
2. **Control 2 - Dense**: Pure semantic retrieval (BGE-large + FAISS)
3. **Experimental - Hybrid**: BM25 + Dense with RRF/Linear fusion + optional reranking

**Variables:** Fusion method (RRF vs Linear), alpha values, reranker presence.

In [None]:
import sys
sys.path.insert(0, '..')

import json
import time
import warnings
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Silence library warnings so the notebook output stays readable.
# NOTE(review): this blanket filter also hides deprecation warnings from the
# plotting/data libraries; narrow it if a plot starts behaving unexpectedly.
warnings.filterwarnings('ignore')
sns.set_theme(style='whitegrid', font_scale=1.1)

# Every figure in this notebook is written to ../output/figures, and
# plt.savefig() does not create missing directories. Create it up front so a
# fresh checkout does not fail after the retrieval runs have already finished.
Path('../output/figures').mkdir(parents=True, exist_ok=True)

print('Imports OK')

## 1. Load Hybrid Index

Load the pre-built FAISS + BM25 indices (adaptive chunking, chunk size 500, bge-large embeddings).

In [None]:
from src.embedding.embedding_manager import EmbeddingManager
from src.embedding.index.hybrid_index import HybridIndex
from src.retrieval.query_processor import QueryProcessor
from src.retrieval.bm25_retriever import BM25Retriever
from src.retrieval.dense_retriever import DenseRetriever
from src.retrieval.hybrid_retriever import HybridRetriever
from src.reranking.no_reranker import NoReranker

# Initialize components
# Embedding manager configured for the 'bge-large' model, i.e. the dense
# encoder named in the notebook header (BGE-large + FAISS).
embedding_manager = EmbeddingManager(
    model_name='bge-large',
    cache_dir='../data/embeddings',
    batch_size=64,
)

# Index object that reports both FAISS and BM25 statistics (see the prints
# below).
# NOTE(review): bm25_k1 / bm25_b are presumably the usual BM25 term-saturation
# and length-normalization parameters -- confirm against HybridIndex.
hybrid_index = HybridIndex(
    embedding_manager=embedding_manager,
    bm25_k1=1.2,
    bm25_b=0.75,
    indices_dir='../data/indices',
)

# Load pre-built indices
# Must run before get_stats(): the stats printed below describe the loaded
# indices for the 'adaptive' strategy with chunk_size=500.
hybrid_index.load(chunk_strategy='adaptive', chunk_size=500)
stats = hybrid_index.get_stats()

# Sanity check: print the size of each index so a mismatch between the FAISS
# vectors, the BM25 documents and the chunk map is visible at a glance.
print(f"FAISS: {stats['faiss']['total_vectors']} vectors, dim={stats['faiss']['dimension']}")
print(f"BM25: {stats['bm25']['num_documents']} documents")
print(f"Chunk map: {stats['chunk_map_size']} entries")

In [None]:
# Initialize retrievers
# One QueryProcessor instance is shared by all three systems so that every
# retriever is built with the same query-processing component.
query_processor = QueryProcessor()

# Control 1 (lexical) and Control 2 (semantic) both read from the same
# HybridIndex object as the experimental system.
bm25_retriever = BM25Retriever(hybrid_index, query_processor)
dense_retriever = DenseRetriever(hybrid_index, query_processor)

# Experimental system. The constructor values act as defaults; the benchmark
# cell passes fusion/use_reranker explicitly on each search call.
# NOTE(review): NoReranker is imported above but None is passed here --
# confirm HybridRetriever treats reranker=None as "no reranking".
hybrid_retriever = HybridRetriever(
    hybrid_index,
    query_processor=query_processor,
    reranker=None,  # No reranker for baseline comparison
    fusion_method='rrf',
    alpha=0.5,
    rrf_k=60,
)

print('All 3 retrieval systems ready!')

## 2. Test Queries

Compare all 3 systems on single-provider, cross-cloud, and conceptual queries.

In [None]:
# Define test queries
# The queries are declared per category and then flattened, so the category
# boundaries stay explicit while `test_queries` remains a flat, ordered list.
_QUERY_GROUPS = (
    # Single-provider (AWS-focused)
    (
        'How to create a VPC in AWS?',
        'Configure an S3 bucket lifecycle policy',
        'AWS Lambda function deployment with environment variables',
    ),
    # Single-provider (Azure-focused)
    (
        'Deploy Azure Functions with HTTP trigger',
        'Azure Kubernetes Service AKS cluster setup',
    ),
    # Single-provider (GCP-focused)
    (
        'Google Cloud Storage bucket permissions',
        'GCP Compute Engine instance types and pricing',
    ),
    # Cross-cloud queries
    (
        'Compare serverless computing options between AWS, Azure and GCP',
        'Container orchestration best practices across cloud providers',
        'IAM identity and access management policies comparison',
    ),
    # Conceptual
    (
        'What is a service mesh and how does it work?',
        'Kubernetes pod lifecycle and restart policies',
    ),
)

# Flatten in declaration order; later cells iterate this list directly.
test_queries = [query_text for group in _QUERY_GROUPS for query_text in group]

print(f'Total test queries: {len(test_queries)}')

In [None]:
def _timed_search(retriever, query, **search_kwargs):
    """Call ``retriever.search`` and return its results with the elapsed time.

    Args:
        retriever: Object exposing ``search(query, **kwargs)``.
        query: Query string forwarded to the retriever.
        **search_kwargs: Extra keyword arguments forwarded to ``search``.

    Returns:
        Dict with ``'results'`` (whatever ``search`` returned) and ``'time'``
        (elapsed seconds as a float).
    """
    # perf_counter() is a monotonic, high-resolution clock intended for
    # measuring short intervals. time.time() is a wall clock that can jump
    # when the system clock is adjusted and is coarse on some platforms.
    start = time.perf_counter()
    results = retriever.search(query, **search_kwargs)
    return {'results': results, 'time': time.perf_counter() - start}


# Run all 3 systems on each query
TOP_K = 5
all_results = {}

# NOTE(review): the first query may absorb one-time warm-up cost (e.g. lazy
# model loading) -- consider an untimed warm-up call before this loop.
for query in test_queries:
    print(f'\nQuery: "{query}"')
    print('-' * 60)

    # The dict literal is evaluated top to bottom, so the systems still run in
    # the order BM25 -> Dense -> Hybrid (RRF) for every query.
    all_results[query] = {
        'bm25': _timed_search(bm25_retriever, query, top_k=TOP_K),
        'dense': _timed_search(dense_retriever, query, top_k=TOP_K),
        'hybrid_rrf': _timed_search(
            hybrid_retriever, query, top_k=TOP_K, fusion='rrf', use_reranker=False
        ),
    }

    # Summary
    for sys_name, data in all_results[query].items():
        providers = {r.cloud_provider for r in data['results']}
        print(f'  {sys_name:12s}: {len(data["results"])} results, '
              f'providers={providers}, time={data["time"]*1000:.0f}ms')

print(f'\nCompleted {len(test_queries)} queries x 3 systems = {len(test_queries)*3} searches')

## 3. Detailed Result Comparison

In [None]:
def show_results(query, results_dict, top_n=5):
    """Print the top hits of every retrieval system for a single query.

    Args:
        query: Query text shown in the banner.
        results_dict: Mapping of system name to a dict holding ``'time'``
            (seconds) and ``'results'`` (a sequence of hit objects).
        top_n: Maximum number of hits printed per system.
    """
    banner = '=' * 80
    print(f'\n{banner}')
    print(f'Query: "{query}"')
    print(banner)

    for system_label, payload in results_dict.items():
        elapsed_ms = payload['time'] * 1000
        print(f'\n--- {system_label.upper()} ({elapsed_ms:.0f}ms) ---')

        for rank, hit in enumerate(payload['results'][:top_n], start=1):
            # Collapse newlines so each preview stays on one output line.
            snippet = hit.chunk_text[:120].replace('\n', ' ')
            print(f'  [{rank}] score={hit.score:.4f} | {hit.cloud_provider}/{hit.service_name} | {hit.doc_type}')
            print(f'      {snippet}...')
            # The heading path is only printed when it is non-empty.
            if hit.heading_path:
                print(f'      Path: {hit.heading_path}')

# Show detailed results for key queries: one single-provider query and one
# cross-cloud query.
for key_query in (
    'How to create a VPC in AWS?',
    'Compare serverless computing options between AWS, Azure and GCP',
):
    show_results(key_query, all_results[key_query])

## 4. Response Time Comparison

In [None]:
# Collect timing data: one row per (query, system) pair, converted from
# seconds to milliseconds. Query labels are cut to 40 characters for display.
timing_data = [
    {
        'Query': query_text[:40] + '...',
        'System': system_label,
        'Time (ms)': payload['time'] * 1000,
    }
    for query_text, per_system in all_results.items()
    for system_label, payload in per_system.items()
]

df_timing = pd.DataFrame(timing_data)

# Summary stats
print('Average Response Time by System:')
timing_summary = df_timing.groupby('System')['Time (ms)'].agg(['mean', 'std', 'min', 'max'])
print(timing_summary.round(1))

In [None]:
# Latency distribution per system: box plot with the raw per-query
# measurements overlaid as semi-transparent points.
fig, ax = plt.subplots(figsize=(10, 5))

system_colors = {'bm25': '#FF9800', 'dense': '#2196F3', 'hybrid_rrf': '#4CAF50'}

# Both layers share the same data mapping and target axes.
shared_plot_args = dict(data=df_timing, x='System', y='Time (ms)', ax=ax)
sns.boxplot(palette=system_colors, **shared_plot_args)
sns.stripplot(color='black', alpha=0.3, **shared_plot_args)

ax.set_title('Response Time Distribution by Retrieval System')
ax.set_ylabel('Time (milliseconds)')

fig.tight_layout()
plt.savefig('../output/figures/retrieval_timing.png', dpi=150, bbox_inches='tight')
plt.show()

## 5. Result Overlap Analysis

How much do the results from different systems overlap?

In [None]:
def jaccard(a, b):
    """Return the Jaccard similarity ``|a & b| / |a | b|`` of two sets.

    Two empty sets are treated as identical and yield 1.0.
    """
    union = a | b
    if not union:
        return 1.0
    return len(a & b) / len(union)


# Compute Jaccard similarity between result sets
overlap_data = []

for query in test_queries:
    res = all_results[query]

    # Compare systems by the identity of the chunks they returned.
    bm25_ids = {r.chunk_id for r in res['bm25']['results']}
    dense_ids = {r.chunk_id for r in res['dense']['results']}
    hybrid_ids = {r.chunk_id for r in res['hybrid_rrf']['results']}

    overlap_data.append({
        'Query': query[:40] + '...',
        # The pairwise columns hold Jaccard similarities in [0, 1], not raw
        # intersection sizes, despite the intersection symbol in the label.
        'BM25 \u2229 Dense': jaccard(bm25_ids, dense_ids),
        'BM25 \u2229 Hybrid': jaccard(bm25_ids, hybrid_ids),
        'Dense \u2229 Hybrid': jaccard(dense_ids, hybrid_ids),
        # "Unique" = number of chunks returned by this system only.
        'BM25 unique': len(bm25_ids - dense_ids - hybrid_ids),
        'Dense unique': len(dense_ids - bm25_ids - hybrid_ids),
        'Hybrid unique': len(hybrid_ids - bm25_ids - dense_ids),
    })

df_overlap = pd.DataFrame(overlap_data)
display(df_overlap.round(3))

In [None]:
# Overlap heatmap: average each pairwise Jaccard column over all queries.
pair_columns = ['BM25 \u2229 Dense', 'BM25 \u2229 Hybrid', 'Dense \u2229 Hybrid']
avg_overlap = df_overlap[pair_columns].mean()

# Build symmetric matrix; the diagonal stays at 1.0 (a system fully overlaps
# with itself) and every pair is mirrored across it.
systems = ['BM25', 'Dense', 'Hybrid']
overlap_matrix = np.ones((len(systems), len(systems)))
pair_positions = [(0, 1), (0, 2), (1, 2)]
for (row, col), column_name in zip(pair_positions, pair_columns):
    overlap_matrix[row, col] = overlap_matrix[col, row] = avg_overlap[column_name]

fig, ax = plt.subplots(figsize=(7, 5))
overlap_frame = pd.DataFrame(overlap_matrix, index=systems, columns=systems)
sns.heatmap(overlap_frame, annot=True, fmt='.3f', cmap='YlGnBu', vmin=0, vmax=1, ax=ax)
ax.set_title('Average Jaccard Overlap Between Retrieval Systems (Top-5)')

fig.tight_layout()
plt.savefig('../output/figures/retrieval_overlap.png', dpi=150, bbox_inches='tight')
plt.show()

## 6. Provider Diversity Analysis

Which system retrieves from the most diverse set of providers?

In [None]:
def _diversity_row(query_text, system_label, payload):
    """Summarise which providers appear in one system's hits for one query."""
    unique_providers = {hit.cloud_provider for hit in payload['results']}
    return {
        'Query': query_text[:40] + '...',
        'System': system_label,
        'Unique Providers': len(unique_providers),
        'Providers': ', '.join(sorted(unique_providers)),
    }


# Provider diversity per query per system
diversity_data = [
    _diversity_row(query_text, system_label, payload)
    for query_text in test_queries
    for system_label, payload in all_results[query_text].items()
]

df_diversity = pd.DataFrame(diversity_data)

# Average diversity
print('Average Provider Diversity (unique providers in top-5):')
diversity_summary = df_diversity.groupby('System')['Unique Providers'].agg(['mean', 'std', 'min', 'max'])
print(diversity_summary.round(2))

In [None]:
# Cross-cloud queries: check if hybrid gets chunks from all relevant providers
# These strings must match entries of `test_queries` exactly, because they are
# used as keys into `all_results`.
cross_cloud_queries = [
    'Compare serverless computing options between AWS, Azure and GCP',
    'Container orchestration best practices across cloud providers',
    'IAM identity and access management policies comparison',
]

print('Cross-Cloud Query Results - Provider Coverage:')
print('=' * 70)

for query in cross_cloud_queries:
    print(f'\nQuery: "{query}"')

    for sys_name, data in all_results[query].items():
        labels = [r.cloud_provider for r in data['results']]
        # Hits per provider, keyed in sorted provider order for stable output.
        provider_counts = {label: labels.count(label) for label in sorted(set(labels))}
        print(f'  {sys_name:12s}: {provider_counts}')

## 7. Fusion Method Comparison

Compare Linear vs RRF fusion with different alpha values.

In [None]:
# Grid search: fusion methods x alpha values, on a single representative query
test_query_for_grid = 'How to create a VPC in AWS?'

# top_k is the number of hits kept per configuration.
# NOTE(review): top_k_candidates is presumably the size of the candidate pool
# fed into fusion -- confirm against HybridRetriever.grid_search_alpha.
grid_results = hybrid_retriever.grid_search_alpha(
    query=test_query_for_grid,
    alpha_values=[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
    fusion_methods=['linear', 'rrf'],
    top_k=5,
    top_k_candidates=50,
)

print(f'Grid search configs tested: {len(grid_results)}')
for key, data in grid_results.items():
    # Only the set of providers is reported per configuration; the unused
    # chunk-id preview that used to be computed here has been removed.
    providers = {r.cloud_provider for r in data['results']}
    print(f'  {key:25s}: providers={providers}')

In [None]:
# Multi-query grid search to find best alpha/fusion
grid_queries = [
    'How to create a VPC in AWS?',
    'Compare serverless computing options between AWS, Azure and GCP',
    'Kubernetes pod lifecycle and restart policies',
    'Configure an S3 bucket lifecycle policy',
    'Azure Kubernetes Service AKS cluster setup',
]

fusion_methods = ['linear', 'rrf']
alpha_values = [0.3, 0.5, 0.7]

# Track how results differ across configs: for every configuration key we
# collect one provider count and one mean hit score per query.
config_scores = {}

for query in grid_queries:
    grid_res = hybrid_retriever.grid_search_alpha(
        query=query,
        alpha_values=alpha_values,
        fusion_methods=fusion_methods,
        top_k=5,
    )

    for key, data in grid_res.items():
        hits = data['results']
        bucket = config_scores.setdefault(key, {'n_providers': [], 'avg_score': []})
        bucket['n_providers'].append(len({hit.cloud_provider for hit in hits}))
        # An empty result list contributes a score of 0 instead of NaN.
        bucket['avg_score'].append(np.mean([hit.score for hit in hits]) if hits else 0)

# Display summary
# NOTE(review): scores produced by different fusion methods may live on
# different scales, so the ranking by 'Avg Score' is presumably only
# meaningful within one fusion method -- confirm before drawing conclusions.
config_summary = [
    {
        'Config': key,
        'Avg Providers': np.mean(scores['n_providers']),
        'Avg Score': np.mean(scores['avg_score']),
    }
    for key, scores in config_scores.items()
]

df_configs = pd.DataFrame(config_summary).sort_values('Avg Score', ascending=False)
display(df_configs.round(3))

In [None]:
# Plot alpha sensitivity for each fusion method (one subplot per method).
# The figure has exactly two columns, so `fusion_methods` is expected to hold
# two entries; axes[fusion_idx] would fail for a third method.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

for fusion_idx, fusion in enumerate(fusion_methods):
    alphas = []
    avg_providers_list = []
    avg_scores_list = []

    for alpha in alpha_values:
        # NOTE(review): assumes grid_search_alpha names its result keys
        # '<fusion>_alpha_<alpha>'; configurations whose key is missing are
        # silently skipped, which leaves that subplot sparse or empty.
        key = f'{fusion}_alpha_{alpha}'
        if key in config_scores:
            alphas.append(alpha)
            avg_providers_list.append(np.mean(config_scores[key]['n_providers']))
            avg_scores_list.append(np.mean(config_scores[key]['avg_score']))

    # Two y-axes per subplot: mean score on the left, provider count on the
    # right, sharing the alpha x-axis.
    ax = axes[fusion_idx]
    ax2 = ax.twinx()

    line1 = ax.plot(alphas, avg_scores_list, 'o-', color='#2196F3', label='Avg Score', linewidth=2)
    line2 = ax2.plot(alphas, avg_providers_list, 's--', color='#4CAF50', label='Avg Providers', linewidth=2)

    # NOTE(review): the label states that alpha is the BM25 weight -- confirm
    # this matches how HybridRetriever applies alpha.
    ax.set_xlabel('Alpha (BM25 weight)')
    ax.set_ylabel('Average Score', color='#2196F3')
    ax2.set_ylabel('Avg Unique Providers', color='#4CAF50')
    ax.set_title(f'{fusion.upper()} Fusion')

    # Merge the handles of both axes so a single legend covers both lines.
    lines = line1 + line2
    labels = [l.get_label() for l in lines]
    ax.legend(lines, labels, loc='best')

plt.suptitle('Fusion Parameter Sensitivity Analysis', fontsize=14, y=1.02)
plt.tight_layout()
plt.savefig('../output/figures/retrieval_fusion_sensitivity.png', dpi=150, bbox_inches='tight')
plt.show()

## 8. System Comparison Summary

In [None]:
# Build comprehensive comparison table: one row per retrieval system,
# aggregated over all test queries.
summary_rows = []

for sys_name in ['bm25', 'dense', 'hybrid_rrf']:
    per_query = [all_results[query][sys_name] for query in test_queries]

    # Latency in milliseconds and number of distinct providers, per query.
    times = [data['time'] * 1000 for data in per_query]
    n_providers = [len({r.cloud_provider for r in data['results']}) for data in per_query]

    summary_rows.append({
        'System': sys_name.replace('_', ' ').upper(),
        'Avg Time (ms)': round(np.mean(times), 1),
        # ddof=1 gives the sample standard deviation, matching the pandas
        # `std` aggregation used for the response-time table in section 4.
        # NumPy's default (ddof=0) would report a smaller value for the same
        # timings.
        'Std Time (ms)': round(np.std(times, ddof=1), 1),
        'Avg Providers': round(np.mean(n_providers), 2),
        'Max Providers': max(n_providers),
    })

df_summary = pd.DataFrame(summary_rows).set_index('System')
display(df_summary)

In [None]:
# Final comparison chart: two bar panels built from the summary table.
fig, axes = plt.subplots(1, 2, figsize=(12, 5))

systems = df_summary.index.tolist()
colors = ['#FF9800', '#2196F3', '#4CAF50']

# (summary column, y-axis label, panel title) for each panel, left to right.
panel_specs = [
    ('Avg Time (ms)', 'Average Time (ms)', 'Response Time'),
    ('Avg Providers', 'Avg Unique Providers in Top-5', 'Provider Diversity'),
]

for panel_ax, (column, y_label, panel_title) in zip(axes, panel_specs):
    panel_ax.bar(systems, df_summary[column], color=colors, alpha=0.8)
    panel_ax.set_ylabel(y_label)
    panel_ax.set_title(panel_title)
    panel_ax.tick_params(axis='x', rotation=15)

plt.suptitle('Retrieval System Comparison', fontsize=14, y=1.02)
plt.tight_layout()
plt.savefig('../output/figures/retrieval_comparison_summary.png', dpi=150, bbox_inches='tight')
plt.show()

In [None]:
# Static summary text: these lines are hard-coded, not derived from the
# measurements computed above.
rule = '=' * 60
findings = [
    rule,
    'RETRIEVAL SYSTEM COMPARISON - KEY FINDINGS',
    rule,
    '',
    '1. BM25 (Control 1): Fast lexical matching, good for exact keyword queries',
    '   - Best for: Single-provider, specific technical terms',
    '   - Limitation: Cannot capture semantic similarity',
    '',
    '2. Dense (Control 2): Semantic understanding, captures meaning',
    '   - Best for: Conceptual queries, paraphrased questions',
    '   - Limitation: Can miss exact keyword matches',
    '',
    '3. Hybrid (Experimental): Combines both strengths',
    '   - Best for: Cross-cloud queries, complex questions',
    '   - Advantage: Higher provider diversity in results',
    '   - Trade-off: Slightly higher latency',
    '',
    'Next steps: Evaluate with ground truth labels (Phase 3)',
]

# One print call; joining with newlines yields the same output as printing
# each line separately.
print('\n'.join(findings))