# Profile Cactus Models for Router

This notebook creates performance profiles for Cactus Compute models to enable intelligent routing.

**What this does:**
1. Loads a benchmark dataset (Alpaca, MMLU, etc. in production; a synthetic dataset in this demo)
2. Runs inference on all Cactus models (simulated from model size in this demo)
3. Creates clusters based on prompt similarity
4. Computes per-cluster error rates for each model
5. Saves mobile-optimized router profile

In [None]:
import sys
sys.path.append('../')

import json
import numpy as np
import pandas as pd
from pathlib import Path
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import normalize
from tqdm import tqdm

from core import ProfileConverter, ModelInfo

## 1. Configuration

In [None]:
# Configuration
# Run-wide settings; the later cells look these up by key.
CONFIG = dict(
    n_clusters=10,  # Smaller for mobile (original uses 20)
    embedding_model='sentence-transformers/all-MiniLM-L6-v2',
    random_seed=42,
    max_samples=500,  # For quick testing, increase for production
)

# Cactus models to profile
# One dict per model, ordered from smallest to largest size_mb.
CACTUS_MODELS = [
    dict(
        model_id='gemma-270m',
        model_path='google/gemma-3-270m-it',
        size_mb=172,
        avg_tokens_per_sec=173,
        context_size=2048,
        capabilities=['text'],
    ),
    dict(
        model_id='smollm-360m',
        model_path='HuggingFaceTB/SmolLM2-360m-Instruct',
        size_mb=227,
        avg_tokens_per_sec=150,
        context_size=2048,
        capabilities=['text'],
    ),
    dict(
        model_id='qwen-600m',
        model_path='Qwen/Qwen3-0.6B',
        size_mb=394,
        avg_tokens_per_sec=129,
        context_size=2048,
        capabilities=['text', 'tools', 'embed'],
    ),
    dict(
        model_id='lfm2-700m',
        model_path='LiquidAI/LFM2-700M',
        size_mb=467,
        avg_tokens_per_sec=115,
        context_size=2048,
        capabilities=['text', 'tools', 'embed'],
    ),
    dict(
        model_id='qwen-1.7b',
        model_path='Qwen/Qwen3-1.7B',
        size_mb=1161,
        avg_tokens_per_sec=75,
        context_size=2048,
        capabilities=['text', 'tools', 'embed'],
    ),
]

print(f"Profiling {len(CACTUS_MODELS)} models with {CONFIG['n_clusters']} clusters")

## 2. Load Dataset

For demonstration, we'll create a synthetic dataset. In production, use real benchmarks like Alpaca, MMLU, or your custom dataset.

In [None]:
# Option A: Load from file (if you have one)
# df = pd.read_csv('../data/benchmark_dataset.csv')

# Option B: Create synthetic dataset for demonstration
def create_synthetic_dataset(n_samples=500):
    """Create synthetic QA pairs for demonstration.

    Cycles through a fixed list of 15 seed prompts and applies one of three
    phrasings to each: a "Please " prefix, an " Explain briefly." suffix, or
    the prompt unchanged. The phrasing advances once per full pass over the
    prompt list, so up to 45 distinct inputs are produced.

    Args:
        n_samples: Number of rows to generate.

    Returns:
        A pandas DataFrame with columns ``input`` (the prompt text) and
        ``expected_output`` (a placeholder answer string). Both columns are
        present even when ``n_samples`` is 0.
    """
    prompts = [
        # Simple factual questions (easy)
        "What is the capital of France?",
        "How many continents are there?",
        "What is 2+2?",
        "What color is the sky?",
        "What is water made of?",

        # Coding questions (medium)
        "Write a Python function to reverse a string",
        "Explain what a for loop does",
        "How do you sort a list in Python?",
        "What is a dictionary in programming?",
        "Explain recursion with an example",

        # Complex reasoning (hard)
        "Explain quantum entanglement",
        "What are the implications of climate change?",
        "Describe the theory of relativity",
        "Explain how neural networks work",
        "What is the meaning of consciousness?",
    ]

    columns = ['input', 'expected_output']

    # Repeat and vary prompts
    data = []
    for i in range(n_samples):
        # cycle = which pass over the prompt list; prompt_idx = which prompt.
        cycle, prompt_idx = divmod(i, len(prompts))
        prompt = prompts[prompt_idx]

        # Choose the variation from the pass number rather than from i:
        # len(prompts) is a multiple of 3, so `i % 3` would pair each prompt
        # with the same variation forever and yield only 15 unique inputs.
        variant = cycle % 3
        if variant == 0:
            prompt = "Please " + prompt
        elif variant == 1:
            prompt = prompt + " Explain briefly."

        data.append({
            'input': prompt,
            'expected_output': f"Answer to: {prompt}",  # Placeholder
        })

    # Explicit columns keep the schema stable when no rows were generated.
    return pd.DataFrame(data, columns=columns)

# Generate the demo dataset at the configured size, report its row count,
# and end the cell with a preview of the first rows.
df = create_synthetic_dataset(n_samples=CONFIG['max_samples'])
print(f"Dataset: {len(df)} samples")
df.head()

## 3. Extract Embeddings & Create Clusters

In [None]:
# Instantiate the sentence encoder named in CONFIG
print("Loading embedding model...")
encoder_name = CONFIG['embedding_model']
embedding_model = SentenceTransformer(encoder_name)

# Encode every prompt in the dataset, 32 prompts per batch
print("Extracting embeddings...")
prompt_texts = df['input'].tolist()
embeddings = embedding_model.encode(
    prompt_texts,
    batch_size=32,
    show_progress_bar=True,
)

# L2-normalize the embeddings (spherical K-means setup)
embeddings_normalized = normalize(embeddings, norm='l2')

print(f"Embeddings shape: {embeddings_normalized.shape}")

In [None]:
# Fit K-means on the normalized embeddings and label every prompt
print(f"Clustering into {CONFIG['n_clusters']} clusters...")
kmeans_params = {
    'n_clusters': CONFIG['n_clusters'],
    'random_state': CONFIG['random_seed'],
    'n_init': 10,
    'max_iter': 300,
}
kmeans = KMeans(**kmeans_params)

# fit(...).labels_ is what fit_predict(...) returns
cluster_labels = kmeans.fit(embeddings_normalized).labels_
df['cluster'] = cluster_labels

# Clustering quality on the same embeddings
silhouette = silhouette_score(embeddings_normalized, cluster_labels)
print(f"Silhouette score: {silhouette:.3f}")

# Number of prompts per cluster, ordered by cluster id
per_cluster = df['cluster'].value_counts()
cluster_counts = per_cluster.sort_index()
print("\nCluster distribution:")
print(cluster_counts)

## 4. Simulate Model Performance

**Note:** In production, you would run actual inference using Cactus models here.
For demonstration, we'll simulate performance based on model size.

In [None]:
def simulate_model_performance(model_id, cluster_id, model_size_mb, seed=None):
    """
    Simulate model performance (replace with actual Cactus inference).

    Smaller models have higher error rates, larger models have lower.
    The result is deterministic for a given (seed, cluster_id, model_size_mb)
    and does not touch NumPy's global random state.

    Args:
        model_id: Model identifier. Currently unused by the simulation.
        cluster_id: Integer cluster index. It is added to the seed, so every
            cluster gets its own variation, shared by all models.
        model_size_mb: Model size in MB; drives the base error rate.
        seed: Base random seed. When None (the default), falls back to
            CONFIG['random_seed'].

    Returns:
        The simulated error rate, clipped to the range [0.01, 0.50].
    """
    if seed is None:
        seed = CONFIG['random_seed']

    # A private RandomState yields the same stream as np.random.seed(...)
    # followed by np.random.uniform(...), without reseeding the global RNG
    # that other cells may rely on.
    rng = np.random.RandomState(seed + cluster_id)

    # Base error rate falls linearly with model size, floored at 5%
    # 200MB model: ~17% error, 1000MB (or larger) model: 5% error
    base_error = max(0.05, 0.20 - (model_size_mb / 1000) * 0.15)

    # Add cluster-specific variation
    cluster_variation = rng.uniform(-0.05, 0.05)

    error_rate = np.clip(base_error + cluster_variation, 0.01, 0.50)
    return error_rate

# Compute error rates for each model per cluster
error_rates = {}

for model in CACTUS_MODELS:
    model_id = model['model_id']
    rates = []
    
    for cluster_id in range(CONFIG['n_clusters']):
        error_rate = simulate_model_performance(
            model_id,
            cluster_id,
            model['size_mb']
        )
        rates.append(float(error_rate))
    
    error_rates[model_id] = rates
    avg_error = np.mean(rates)
    print(f"{model_id:15s}: {avg_error:.2%} avg error rate")

print("\nError rates computed!")

## 5. Create & Save Router Profile

In [None]:
# Ensure the output folder exists, then create the profile via ProfileConverter
output_dir = Path('../profiles')
output_dir.mkdir(exist_ok=True)
profile_path = output_dir / 'cactus_models_profile.json'

profile = ProfileConverter.create_cactus_profile(
    models_info=CACTUS_MODELS,
    error_rates=error_rates,
    cluster_centers=kmeans.cluster_centers_,
    embedding_model=CONFIG['embedding_model'],
    output_path=profile_path,
    lambda_min=0.0,
    lambda_max=2.0,
    default_cost_preference=0.5,
)

print("Profile saved!")

In [None]:
# Check the profile file written to the output folder
profile_path = output_dir / 'cactus_models_profile.json'
is_valid = ProfileConverter.validate_profile(profile_path)
print(f"Profile valid: {is_valid}")

# Print each reported statistic on its own line
stats = ProfileConverter.get_profile_stats(profile_path)
print("\nProfile stats:")
for key, value in stats.items():
    print(f"  {key}: {value}")

## 6. Visualize Clusters (Optional)

In [None]:
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

# Project the normalized embeddings onto two PCA components for plotting
pca = PCA(n_components=2, random_state=CONFIG['random_seed'])
embeddings_2d = pca.fit_transform(embeddings_normalized)
pc1 = embeddings_2d[:, 0]
pc2 = embeddings_2d[:, 1]

# Scatter plot colored by cluster label
plt.figure(figsize=(12, 8))
scatter = plt.scatter(pc1, pc2, c=cluster_labels, cmap='tab10', alpha=0.6, s=50)
plt.colorbar(scatter, label='Cluster')
plt.title(f'Prompt Clusters (n={CONFIG["n_clusters"]}, silhouette={silhouette:.3f})')
plt.xlabel('PCA Component 1')
plt.ylabel('PCA Component 2')
plt.grid(alpha=0.3)
plt.tight_layout()

# Write the figure next to the profile, then display it
plt.savefig(output_dir / 'clusters_visualization.png', dpi=150)
plt.show()

print(f"Visualization saved to {output_dir / 'clusters_visualization.png'}")

In [None]:
# Plot error rates heatmap
import seaborn as sns

# Rows = models (in CACTUS_MODELS order), columns = clusters
model_names = [entry['model_id'] for entry in CACTUS_MODELS]
error_matrix = np.array([error_rates[name] for name in model_names])
cluster_tick_labels = [f"C{i}" for i in range(CONFIG['n_clusters'])]

plt.figure(figsize=(12, 6))
heatmap_options = dict(
    xticklabels=cluster_tick_labels,
    yticklabels=model_names,
    annot=True,
    fmt='.2f',
    cmap='RdYlGn_r',
    vmin=0.0,
    vmax=0.3,
    cbar_kws={'label': 'Error Rate'},
)
sns.heatmap(error_matrix, **heatmap_options)
plt.title('Per-Cluster Error Rates by Model')
plt.xlabel('Cluster')
plt.ylabel('Model')
plt.tight_layout()

# Write the figure to the output folder, then display it
plt.savefig(output_dir / 'error_rates_heatmap.png', dpi=150)
plt.show()

print(f"Heatmap saved to {output_dir / 'error_rates_heatmap.png'}")

## ✅ Done!

You now have:
1. `cactus_models_profile.json` - Router profile for Cactus models
2. Cluster visualizations
3. Error rate heatmaps

Next: Use notebook `02_test_routing.ipynb` to test the router!