# 03 - Embedding Model Comparison

**Tesis:** Diseño y Validación de un Modelo Semántico Híbrido para Optimizar Sistemas RAG

This notebook compares 4 embedding models on our cloud documentation corpus:
1. **all-MiniLM-L6-v2** (384d) - Lightweight baseline
2. **BGE-large-en-v1.5** (1024d) - State-of-the-art (primary candidate)
3. **E5-large-v2** (1024d) - Competitive alternative
4. **Instructor-large** (768d) - Task-specific instructions

**Metrics:** Embedding time, file size, VRAM usage, t-SNE visualization, cross-provider similarity.

In [None]:
import sys
sys.path.insert(0, '..')

import json
import time
import warnings
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.manifold import TSNE
from sklearn.metrics.pairwise import cosine_similarity

# Notebook-wide setup: silence every warning category so cell output stays
# readable, then apply the seaborn theme shared by all figures below.
# NOTE(review): a blanket 'ignore' also hides deprecation warnings from the
# libraries used later in this notebook.
warnings.simplefilter('ignore')
sns.set_theme(font_scale=1.1, style='whitegrid')

print('Imports OK')

## 1. Load Sample Chunks

We use a representative sample of the chunks produced by the adaptive chunker at size 500 (`data/chunks/adaptive/size_500`) for the comparison.

In [None]:
# Read every chunk file of the adaptive / size-500 run, one JSON document per
# file, in sorted filename order.
chunks_dir = Path('../data/chunks/adaptive/size_500')
all_chunks = [
    json.loads(chunk_file.read_text(encoding='utf-8'))
    for chunk_file in sorted(chunks_dir.glob('*.json'))
]

print(f'Total chunks: {len(all_chunks)}')

# Tally chunks per cloud provider; chunks without the key count as 'unknown'.
providers = {}
for chunk in all_chunks:
    provider_name = chunk.get('cloud_provider', 'unknown')
    providers[provider_name] = providers.get(provider_name, 0) + 1

# Report providers from most to least represented, with their corpus share.
ranked_providers = sorted(providers.items(), key=lambda item: item[1], reverse=True)
for provider_name, n_chunks in ranked_providers:
    print(f'  {provider_name}: {n_chunks} chunks ({n_chunks/len(all_chunks)*100:.1f}%)')

In [None]:
# Take a stratified sample for faster comparison: at most
# SAMPLE_SIZE_PER_PROVIDER chunks from each provider (400 each, i.e. at most
# 2000 chunks when the corpus has 5 providers). Providers with fewer chunks
# contribute everything they have.
import random
random.seed(42)  # fixed seed so the same sample is drawn on every full run

SAMPLE_SIZE_PER_PROVIDER = 400

# Group chunks by provider; chunks without the key are grouped as 'unknown'.
by_provider = {}
for c in all_chunks:
    by_provider.setdefault(c.get('cloud_provider', 'unknown'), []).append(c)

# Draw from each provider in first-seen order, then shuffle so that the
# providers are interleaved in the final sample.
sample_chunks = []
for chunks_list in by_provider.values():
    n = min(SAMPLE_SIZE_PER_PROVIDER, len(chunks_list))
    sample_chunks.extend(random.sample(chunks_list, n))

random.shuffle(sample_chunks)

# Fail here rather than deep inside a later embedding / t-SNE cell.
assert sample_chunks, 'No chunks sampled - check that the chunk directory contains JSON files'

# Parallel lists: position i in each list refers to the same chunk.
sample_texts = [c['text'] for c in sample_chunks]
sample_providers = [c.get('cloud_provider', 'unknown') for c in sample_chunks]
sample_ids = [c['chunk_id'] for c in sample_chunks]

print(f'Sample size: {len(sample_chunks)}')

# Single pass over the sample instead of one list.count() scan per provider.
sample_counts = {}
for p in sample_providers:
    sample_counts[p] = sample_counts.get(p, 0) + 1
for p in sorted(sample_counts):
    print(f'  {p}: {sample_counts[p]}')

## 2. Embed with All 4 Models

We measure: embedding time, throughput (docs/sec), VRAM usage.

In [None]:
import torch
from src.embedding.embedding_manager import EmbeddingManager, MODEL_CONFIGS

# All four models are always benchmarked; instructor-large is included whether
# or not the optional InstructorEmbedding package is installed.
models_to_test = ['all-MiniLM-L6-v2', 'bge-large', 'e5-large', 'instructor-large']

# The import is only a probe, used to report which backend instructor-large
# will run on.
try:
    import InstructorEmbedding  # noqa: F401
    print('InstructorEmbedding available - testing 4 models')
except ImportError:
    print('InstructorEmbedding not installed - testing 4 models (instructor-large will use SentenceTransformer fallback)')

print(f'\nModels to test: {models_to_test}')
print(f'Device: {"cuda" if torch.cuda.is_available() else "cpu"}')
if torch.cuda.is_available():
    print(f'GPU: {torch.cuda.get_device_name(0)}')
    # The device-properties attribute is `total_memory` (in bytes).
    print(f'VRAM: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.1f} GB')

In [None]:
# Embed the sample with each model and collect timing / memory metrics.
#
# For every model the loop records: embedding dimension, wall-clock time,
# throughput (docs/sec), VRAM held after model load, peak VRAM while
# embedding, and the in-memory size of the embedding matrix.
import gc

results = {}
use_cuda = torch.cuda.is_available()
BYTES_PER_MB = 1024**2

for model_name in models_to_test:
    print(f'\n{"="*60}')
    print(f'Testing: {model_name}')
    print(f'{"="*60}')

    # Start from a clean allocator state and remember the baseline, so the
    # figures below describe this model only and not leftovers of the last one.
    vram_before = 0.0
    if use_cuda:
        gc.collect()
        torch.cuda.empty_cache()
        torch.cuda.reset_peak_memory_stats()
        vram_before = torch.cuda.memory_allocated() / BYTES_PER_MB

    mgr = EmbeddingManager(
        model_name=model_name,
        cache_dir='../data/embeddings',
        batch_size=64,
    )

    # Force model load (presumably `model` is loaded lazily - confirm in
    # EmbeddingManager). The reference is bound to a named variable that is
    # deleted at the end of the iteration; binding it to `_` would keep the
    # model alive while the next one is loaded and inflate its VRAM numbers.
    loaded_model = mgr.model

    vram_model = 0.0
    if use_cuda:
        vram_model = torch.cuda.memory_allocated() / BYTES_PER_MB - vram_before

    # Embed; perf_counter is monotonic and meant for measuring intervals.
    start = time.perf_counter()
    embeddings = mgr.embed_documents(sample_texts, show_progress=True)
    elapsed = time.perf_counter() - start

    vram_peak = 0.0
    if use_cuda:
        vram_peak = torch.cuda.max_memory_allocated() / BYTES_PER_MB - vram_before

    # Size of the embedding matrix in memory, used as a file-size estimate.
    file_size_mb = embeddings.nbytes / (1024 * 1024)

    results[model_name] = {
        'embeddings': embeddings,
        'dimension': embeddings.shape[1],
        'time_seconds': elapsed,
        'throughput': len(sample_texts) / elapsed,
        'vram_model_mb': vram_model,
        'vram_peak_mb': vram_peak,
        'file_size_mb': file_size_mb,
    }

    print(f'  Dimension: {embeddings.shape[1]}')
    print(f'  Time: {elapsed:.1f}s ({len(sample_texts)/elapsed:.0f} docs/sec)')
    print(f'  VRAM (model): {vram_model:.0f} MB')
    print(f'  VRAM (peak): {vram_peak:.0f} MB')
    print(f'  File size ({len(sample_texts)} docs): {file_size_mb:.1f} MB')

    # Cleanup: drop every reference to the model before releasing cached blocks.
    del mgr, loaded_model
    if use_cuda:
        gc.collect()
        torch.cuda.empty_cache()

## 3. Performance Comparison Table

In [None]:
# Assemble one summary row per benchmarked model (rounded for display) and
# index the resulting table by the short model name.
rows = [
    {
        'Model': name,
        'Full Name': MODEL_CONFIGS[name]['full_name'],
        'Dimension': metrics['dimension'],
        'Time (s)': round(metrics['time_seconds'], 1),
        'Throughput (docs/s)': round(metrics['throughput'], 0),
        'VRAM Model (MB)': round(metrics['vram_model_mb'], 0),
        'VRAM Peak (MB)': round(metrics['vram_peak_mb'], 0),
        'File Size (MB)': round(metrics['file_size_mb'], 1),
    }
    for name, metrics in results.items()
]

df_comparison = pd.DataFrame(rows).set_index('Model')
display(df_comparison)

In [None]:
# Bar charts: embedding time, throughput and peak VRAM, one panel per metric.

# savefig does not create missing directories, so make sure the target exists.
figures_dir = Path('../output/figures')
figures_dir.mkdir(parents=True, exist_ok=True)

models = list(results.keys())
colors = ['#2196F3', '#4CAF50', '#FF9800', '#9C27B0']

# (results key, x-axis label, panel title, label offset, value label format)
panels = [
    ('time_seconds', 'Time (seconds)', 'Embedding Time', 0.3, '{:.1f}s'),
    ('throughput', 'Documents / second', 'Throughput', 1, '{:.0f}'),
    ('vram_peak_mb', 'Peak VRAM (MB)', 'GPU Memory Usage', 10, '{:.0f} MB'),
]

fig, axes = plt.subplots(1, len(panels), figsize=(16, 5))

for ax, (metric, xlabel, title, offset, label_fmt) in zip(axes, panels):
    values = [results[m][metric] for m in models]
    ax.barh(models, values, color=colors[:len(models)])
    ax.set_xlabel(xlabel)
    ax.set_title(title)
    # Print each value just to the right of its bar.
    for i, v in enumerate(values):
        ax.text(v + offset, i, label_fmt.format(v), va='center')

plt.tight_layout()
plt.savefig(figures_dir / 'embedding_performance.png', dpi=150, bbox_inches='tight')
plt.show()
print('Saved: output/figures/embedding_performance.png')

## 4. t-SNE Visualization

Visualize how each model clusters documents by cloud provider.

In [None]:
# t-SNE projection of each model's embeddings, coloured by cloud provider.
# A subsample of at most 1000 chunks keeps the run time manageable.
TSNE_SAMPLE = min(1000, len(sample_chunks))

# Dedicated seeded generator: the subsample no longer depends on how much of
# the global `random` state earlier cells consumed, so re-running this cell
# alone reproduces the same figure.
tsne_rng = random.Random(42)
tsne_idx = tsne_rng.sample(range(len(sample_chunks)), TSNE_SAMPLE)
tsne_providers = [sample_providers[i] for i in tsne_idx]
tsne_provider_arr = np.array(tsne_providers)

provider_colors = {
    'aws': '#FF9900',
    'azure': '#0078D4',
    'gcp': '#4285F4',
    'kubernetes': '#326CE5',
    'cncf': '#00B39F',
}

# scikit-learn requires perplexity < n_samples; cap it for small samples.
tsne_perplexity = min(30, TSNE_SAMPLE - 1)

figures_dir = Path('../output/figures')
figures_dir.mkdir(parents=True, exist_ok=True)

n_models = len(models)
# squeeze=False always returns a 2-D array of axes, even for a single model.
fig, axes = plt.subplots(1, n_models, figsize=(6 * n_models, 5), squeeze=False)
axes = axes.ravel()

for ax, model_name in zip(axes, models):
    emb_sub = results[model_name]['embeddings'][tsne_idx]

    # The iteration count is left at the library default (1000): the keyword
    # was renamed from `n_iter` to `max_iter` in newer scikit-learn releases,
    # so omitting it works across versions.
    tsne = TSNE(n_components=2, random_state=42, perplexity=tsne_perplexity)
    coords = tsne.fit_transform(emb_sub)

    for provider in sorted(set(tsne_providers)):
        mask = tsne_provider_arr == provider
        # Providers without a configured colour are drawn in grey.
        color = provider_colors.get(provider, '#999999')
        ax.scatter(
            coords[mask, 0], coords[mask, 1],
            c=color, label=provider, alpha=0.5, s=10,
        )

    ax.set_title(f'{model_name}\n(dim={results[model_name]["dimension"]})', fontsize=11)
    ax.set_xticks([])
    ax.set_yticks([])

# One legend is enough: every panel uses the same provider colours.
if n_models:
    axes[0].legend(fontsize=8, loc='lower left')

plt.suptitle('t-SNE: Embedding Space by Cloud Provider', fontsize=14, y=1.02)
plt.tight_layout()
plt.savefig(figures_dir / 'embedding_tsne.png', dpi=150, bbox_inches='tight')
plt.show()
print('Saved: output/figures/embedding_tsne.png')

## 5. Cross-Provider Similarity Analysis

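For generic, provider-agnostic queries about equivalent services (e.g., serverless functions: AWS Lambda vs Azure Functions vs GCP Cloud Functions), how similar is the best-matching chunk from each provider's documentation?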

In [None]:
# Provider-neutral queries: each one names a capability that the providers
# offer under a different product name.
cross_provider_queries = [
    # compute
    'How to create a virtual machine instance',
    'Serverless function deployment and configuration',
    # storage and containers
    'Object storage bucket creation and management',
    'Container orchestration with Kubernetes',
    # security and networking
    'Identity and access management IAM policies',
    'Virtual private cloud VPC networking setup',
    'Load balancer configuration and health checks',
    # databases
    'Database managed service setup and scaling',
]

n_queries = len(cross_provider_queries)
print(f'Testing {n_queries} cross-provider queries')

In [None]:
# Per model: embed the generic queries, then score every provider by the
# average (over queries) of the best cosine similarity among its chunks.
cross_provider_results = {}

# Row masks selecting each provider's chunks; they depend only on the sample,
# so they are built once and reused for every model.
provider_masks = {
    provider: np.array([p == provider for p in sample_providers])
    for provider in sorted(set(sample_providers))
}

for model_name in models:
    print(f'\nAnalyzing: {model_name}')
    mgr = EmbeddingManager(model_name=model_name, cache_dir='../data/embeddings', batch_size=64)

    # One embed_query call per query, stacked into a single array.
    query_embs = np.array([mgr.embed_query(q) for q in cross_provider_queries])
    model_embs = results[model_name]['embeddings']

    provider_sims = {}
    for provider, mask in provider_masks.items():
        # sims[i, j] = cosine similarity between query i and provider chunk j
        sims = cosine_similarity(query_embs, model_embs[mask])
        provider_sims[provider] = np.mean(np.max(sims, axis=1))

    cross_provider_results[model_name] = provider_sims

    for provider, score in sorted(provider_sims.items()):
        print(f'  {provider}: avg max sim = {score:.4f}')

    del mgr

In [None]:
# Heatmap: average best-match similarity per provider, one panel per model.

# savefig does not create missing directories, so make sure the target exists.
figures_dir = Path('../output/figures')
figures_dir.mkdir(parents=True, exist_ok=True)

# squeeze=False always returns a 2-D array of axes, even for a single model.
fig, axes = plt.subplots(1, len(models), figsize=(5 * len(models), 4), squeeze=False)
axes = axes.ravel()

for ax, model_name in zip(axes, models):
    # One-column frame indexed by provider. It gets its own name instead of
    # reusing `data`, which the chunk-loading cell already uses for a dict.
    sim_frame = pd.Series(cross_provider_results[model_name], name='Avg Max Sim').to_frame()

    # Fixed colour range so the panels are visually comparable across models.
    sns.heatmap(
        sim_frame, annot=True, fmt='.3f', cmap='YlOrRd',
        vmin=0.3, vmax=1.0, ax=ax
    )
    ax.set_title(model_name, fontsize=11)

plt.suptitle('Cross-Provider Query Similarity by Model', fontsize=13, y=1.02)
plt.tight_layout()
plt.savefig(figures_dir / 'embedding_cross_provider_sim.png', dpi=150, bbox_inches='tight')
plt.show()

## 6. Intra-Cluster vs Inter-Cluster Similarity

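Measure how well each model separates providers: compare the average cosine similarity between chunks from the same provider (intra-provider) with the average between chunks from different providers (inter-provider).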

In [None]:
# Average intra-provider vs inter-provider cosine similarity per model,
# computed over all unordered pairs of a random subsample of chunks.
CLUSTER_SAMPLE = min(200, len(sample_chunks))

# Dedicated seeded generator: the subsample (and therefore the reported
# metrics) no longer depends on the global `random` state left by earlier
# cells, so re-running this cell alone reproduces the same numbers.
cluster_rng = random.Random(42)
cluster_idx = cluster_rng.sample(range(len(sample_chunks)), CLUSTER_SAMPLE)
cluster_providers = np.array([sample_providers[i] for i in cluster_idx])

# Index pairs (i < j) of the strict upper triangle, i.e. each unordered pair
# once, and a flag telling whether both chunks share a provider. Both are
# model-independent, so they are computed once.
pair_i, pair_j = np.triu_indices(CLUSTER_SAMPLE, k=1)
same_provider = cluster_providers[pair_i] == cluster_providers[pair_j]


def _mean_std(values):
    """Return (mean, std) of a 1-D array, or (nan, nan) when it is empty."""
    if values.size == 0:
        return float('nan'), float('nan')
    return np.mean(values), np.std(values)


cluster_metrics = {}
for model_name in models:
    emb_sub = results[model_name]['embeddings'][cluster_idx]
    pair_sims = cosine_similarity(emb_sub)[pair_i, pair_j]

    intra_mean, intra_std = _mean_std(pair_sims[same_provider])
    inter_mean, inter_std = _mean_std(pair_sims[~same_provider])

    cluster_metrics[model_name] = {
        'intra_mean': intra_mean,
        'intra_std': intra_std,
        'inter_mean': inter_mean,
        'inter_std': inter_std,
        # Positive when same-provider chunks are, on average, closer together.
        'separation': intra_mean - inter_mean,
    }

# Rename by key rather than by position so a reordered dict cannot mislabel
# the columns.
df_cluster = pd.DataFrame(cluster_metrics).T.rename(columns={
    'intra_mean': 'Intra-Provider Mean',
    'intra_std': 'Intra-Provider Std',
    'inter_mean': 'Inter-Provider Mean',
    'inter_std': 'Inter-Provider Std',
    'separation': 'Separation',
})
display(df_cluster.round(4))

In [None]:
# Grouped bar chart: intra-provider vs inter-provider mean similarity per
# model, annotated with the separation score (intra minus inter).

# savefig does not create missing directories, so make sure the target exists.
figures_dir = Path('../output/figures')
figures_dir.mkdir(parents=True, exist_ok=True)

fig, ax = plt.subplots(figsize=(10, 5))

x = np.arange(len(models))
width = 0.35

intra = [cluster_metrics[m]['intra_mean'] for m in models]
inter = [cluster_metrics[m]['inter_mean'] for m in models]

# Two bars per model, centred on the model's tick position.
ax.bar(x - width/2, intra, width, label='Intra-Provider', color='#4CAF50', alpha=0.8)
ax.bar(x + width/2, inter, width, label='Inter-Provider', color='#FF5722', alpha=0.8)

ax.set_ylabel('Average Cosine Similarity')
ax.set_title('Intra-Provider vs Inter-Provider Similarity')
ax.set_xticks(x)
ax.set_xticklabels(models, rotation=15)
ax.legend()
ax.set_ylim(0, 1)

# Add separation score annotations just above the taller bar of each pair.
for i, m in enumerate(models):
    sep = cluster_metrics[m]['separation']
    ax.annotate(
        f'\u0394={sep:.3f}', xy=(i, max(intra[i], inter[i]) + 0.02),
        ha='center', fontsize=9, color='navy', fontweight='bold'
    )

plt.tight_layout()
plt.savefig(figures_dir / 'embedding_cluster_separation.png', dpi=150, bbox_inches='tight')
plt.show()

## 7. Full Corpus Embedding (with selected model)

Embed the full adaptive/500 corpus with the primary model (BGE-large) for use in retrieval experiments.

In [None]:
# Look for previously cached full-corpus embeddings of the primary model.
from src.embedding.embedding_manager import EmbeddingManager

primary_model = 'bge-large'
mgr = EmbeddingManager(model_name=primary_model, cache_dir='../data/embeddings', batch_size=64)

# load_embeddings returns None on a cache miss; on a hit, item 0 exposes
# `.shape` and item 1 is a sized collection of chunk IDs.
cached = mgr.load_embeddings('adaptive', 500)
if cached is None:
    for hint in (
        'Full embeddings not cached yet.',
        'Run: python scripts/build_index.py --embedding bge-large --chunker adaptive --size 500',
        'Or run the next cell to embed now.',
    ):
        print(hint)
else:
    print(f'Full embeddings already cached: {cached[0].shape}')
    print(f'Chunk IDs: {len(cached[1])}')

In [None]:
# Embed the whole corpus with the primary model, but only when the previous
# cell found no cached embeddings (this can take several minutes on GPU).
if cached is not None:
    print('Using cached embeddings.')
else:
    all_texts = [chunk['text'] for chunk in all_chunks]
    all_ids = [chunk['chunk_id'] for chunk in all_chunks]
    n_docs = len(all_texts)

    print(f'Embedding {n_docs} chunks with {primary_model}...')
    start = time.time()
    # embed_and_cache is called with the same ('adaptive', 500) key that
    # load_embeddings was queried with.
    embeddings, ids = mgr.embed_and_cache(all_texts, all_ids, 'adaptive', 500)
    elapsed = time.time() - start
    print(f'Done in {elapsed:.1f}s ({n_docs/elapsed:.0f} docs/sec)')
    print(f'Embeddings shape: {embeddings.shape}')

## 8. Summary

Key findings from embedding model comparison.

In [None]:
# Print the headline result for each metric plus the final recommendation.
banner = '=' * 60
print(banner)
print('EMBEDDING MODEL COMPARISON - SUMMARY')
print(banner)

# Winner per metric; on ties the first model in dict order is kept.
fastest = min(results.items(), key=lambda kv: kv[1]['time_seconds'])[0]
smallest = min(results.items(), key=lambda kv: kv[1]['file_size_mb'])[0]
best_sep = max(cluster_metrics.items(), key=lambda kv: kv[1]['separation'])[0]
least_vram = min(results.items(), key=lambda kv: kv[1]['vram_peak_mb'])[0]

fastest_seconds = results[fastest]['time_seconds']
smallest_mb = results[smallest]['file_size_mb']
best_delta = cluster_metrics[best_sep]['separation']
least_vram_mb = results[least_vram]['vram_peak_mb']

print(f'\nFastest model:         {fastest} ({fastest_seconds:.1f}s)')
print(f'Smallest file size:    {smallest} ({smallest_mb:.1f} MB)')
print(f'Best cluster sep.:     {best_sep} (delta={best_delta:.4f})')
print(f'Least VRAM:            {least_vram} ({least_vram_mb:.0f} MB)')

# NOTE(review): the recommendation is fixed to `primary_model`; it is not
# derived from the metrics printed above.
print(f'\nRecommendation: Use {primary_model} as primary embedding model for thesis experiments.')
print('Rationale: Best balance of quality (MTEB benchmarks) and cloud-domain performance.')