# SWE-bench Verified - Adaptive Router Analysis

Analysis of 500 instances from the SWE-bench Verified benchmark, run with the Adaptive AI router.

**Routing Models:** Claude Opus 4.5 + Claude Sonnet 4.5

**Run ID:** adaptive_20251215_124758

In [None]:
import json
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Global plot styling: whitegrid theme, 'husl' colour palette, and the
# default figure size / font size used by every chart below.
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette('husl')
plt.rcParams.update({
    'figure.figsize': (12, 6),
    'font.size': 11,
})

## 1. Load Data

In [None]:
# Load the generation results for this run.
# The encoding is passed explicitly: JSON is UTF-8, and relying on the
# platform default (e.g. cp1252 on Windows) can mis-decode non-ASCII text.
results_path = Path('results/adaptive/adaptive_20251215_124758_generation.json')
with results_path.open(encoding='utf-8') as f:
    data = json.load(f)

# Print a short header identifying the run (model, dataset, timestamp).
print("=" * 60)
print("SWE-bench Verified - Adaptive Router Results")
print("=" * 60)
print(f"\nModel: {data['model_name']}")
print(f"Dataset: {data['dataset']}")
print(f"Timestamp: {data['timestamp']}")
print("\n" + "-" * 40)

## 2. Summary Statistics

In [None]:
# Top-level sections of the results file; these names are reused by later cells.
summary = data['summary']
cost = data['cost_metrics']
tokens = data['token_metrics']

total_instances = summary['total_instances']
# A patch counts as generated when the instance neither failed nor errored.
patches_generated = total_instances - summary['failed_instances'] - summary['error_instances']
# Guard the per-instance average against an empty run (zero instances).
avg_tokens_per_instance = tokens['total_tokens'] / total_instances if total_instances else 0.0

print("\n📊 SUMMARY STATISTICS")
print("-" * 40)
print(f"Total Instances:     {total_instances}")
print(f"Patches Generated:   {patches_generated}")
print(f"Failed Instances:    {summary['failed_instances']}")
print(f"Error Instances:     {summary['error_instances']}")

print("\n💰 COST METRICS")
print("-" * 40)
print(f"Total Cost:          ${cost['total_cost_usd']:.4f}")
print(f"Cost per Instance:   ${cost['cost_per_instance']:.4f}")

print("\n🔢 TOKEN METRICS")
print("-" * 40)
print(f"Total Tokens:        {tokens['total_tokens']:,}")
print(f"Input Tokens:        {tokens['total_input_tokens']:,}")
print(f"Output Tokens:       {tokens['total_output_tokens']:,}")
print(f"Avg Tokens/Instance: {avg_tokens_per_instance:.0f}")

## 3. Create DataFrame from Instance Results

In [None]:
# One row per entry of data['instance_results'].
instances = data['instance_results']
df = pd.DataFrame(instances)

# Flatten the nested generation_metrics dicts into their own columns, each
# prefixed with "gen_", and swap them in for the original nested column.
gen_metrics = pd.json_normalize(df['generation_metrics']).add_prefix('gen_')
df = pd.concat([df.drop(columns='generation_metrics'), gen_metrics], axis=1)


def _repo_from_instance_id(instance_id):
    """Return the text before the first '__', with '_' replaced by '-'."""
    prefix = instance_id.partition('__')[0]
    return prefix.replace('_', '-')


# Derive a repository label from each instance_id.
df['repo_name'] = df['instance_id'].map(_repo_from_instance_id)

# Report the shape, then leave the preview as the cell's displayed value.
print(f"\nDataFrame shape: {df.shape}")
df.head()

## 4. Model Selection Analysis

In [None]:
# Number of instances routed to each model, most frequent first.
model_counts = df['gen_model_used'].value_counts()

print("\n🤖 MODEL SELECTION DISTRIBUTION")
print("-" * 40)
for model, count in model_counts.items():
    pct = count / len(df) * 100
    print(f"{model}: {count} ({pct:.1f}%)")

# Two side-by-side views of the same distribution.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Shared colour list (also used by later cells) and shortened display names:
# the provider prefix and the '-20250929' date suffix are stripped.
colors = ['#3498db', '#e74c3c', '#2ecc71']
model_labels = [m.replace('anthropic/', '').replace('-20250929', '') for m in model_counts.index]

# Left: share of requests per model.
axes[0].pie(model_counts.values, labels=model_labels, autopct='%1.1f%%',
            colors=colors[:len(model_counts)], startangle=90)
axes[0].set_title('Model Selection Distribution', fontsize=14, fontweight='bold')

# Right: absolute request count per model, with the count written above each bar.
bars = axes[1].bar(model_labels, model_counts.values, color=colors[:len(model_counts)])
axes[1].set_xlabel('Model')
axes[1].set_ylabel('Count')
axes[1].set_title('Requests per Model', fontsize=14, fontweight='bold')
for bar, count in zip(bars, model_counts.values):
    axes[1].text(bar.get_x() + bar.get_width() / 2, bar.get_height() + 5,
                 str(count), ha='center', va='bottom', fontweight='bold')

plt.tight_layout()
plt.savefig('model_selection_distribution.png', dpi=150, bbox_inches='tight')
plt.show()

## 5. Cost Analysis

In [None]:
# Per-model cost summary: total, mean and standard deviation of the request
# cost, plus the number of instances routed to each model.
cost_by_model = df.groupby('gen_model_used').agg({
    'gen_cost_usd': ['sum', 'mean', 'std'],
    'instance_id': 'count'
}).round(4)
cost_by_model.columns = ['total_cost', 'avg_cost', 'std_cost', 'count']

print("\n💰 COST BY MODEL")
print("-" * 60)
print(cost_by_model.to_string())

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Left: box plot of per-request cost, one box per model.
# `models` and `model_labels_short` are built from the same sequence so that
# box i and label i always refer to the same model.
models = df['gen_model_used'].unique()
model_labels_short = [m.replace('anthropic/', '').replace('-20250929', '') for m in models]
data_for_box = [df[df['gen_model_used'] == m]['gen_cost_usd'] for m in models]
# Tick labels are set after plotting instead of through boxplot's `labels`
# keyword, which newer matplotlib releases deprecate in favour of `tick_labels`.
bp = axes[0].boxplot(data_for_box, patch_artist=True)
axes[0].set_xticklabels(model_labels_short)
for patch, color in zip(bp['boxes'], colors[:len(models)]):
    patch.set_facecolor(color)
    patch.set_alpha(0.7)
axes[0].set_xlabel('Model')
axes[0].set_ylabel('Cost per Request ($)')
axes[0].set_title('Cost Distribution by Model', fontsize=14, fontweight='bold')

# Right: histogram of per-request cost with mean and median markers.
mean_cost = df['gen_cost_usd'].mean()
median_cost = df['gen_cost_usd'].median()
axes[1].hist(df['gen_cost_usd'], bins=50, edgecolor='black', alpha=0.7, color='#3498db')
axes[1].axvline(mean_cost, color='red', linestyle='--', label=f'Mean: ${mean_cost:.4f}')
axes[1].axvline(median_cost, color='orange', linestyle='--', label=f'Median: ${median_cost:.4f}')
axes[1].set_xlabel('Cost ($)')
axes[1].set_ylabel('Frequency')
axes[1].set_title('Cost Distribution (All Instances)', fontsize=14, fontweight='bold')
axes[1].legend()

plt.tight_layout()
plt.savefig('cost_analysis.png', dpi=150, bbox_inches='tight')
plt.show()

## 6. Token Analysis

In [None]:
# Headline input/output token statistics across all instances.
print("\n🔢 TOKEN STATISTICS")
print("-" * 60)
print(f"Input Tokens  - Mean: {df['gen_input_tokens'].mean():.0f}, Median: {df['gen_input_tokens'].median():.0f}, Max: {df['gen_input_tokens'].max()}")
print(f"Output Tokens - Mean: {df['gen_output_tokens'].mean():.0f}, Median: {df['gen_output_tokens'].median():.0f}, Max: {df['gen_output_tokens'].max()}")

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Left: input vs. output tokens per instance, coloured by request cost.
scatter = axes[0].scatter(df['gen_input_tokens'], df['gen_output_tokens'],
                          c=df['gen_cost_usd'], cmap='viridis', alpha=0.6, s=30)
axes[0].set_xlabel('Input Tokens')
axes[0].set_ylabel('Output Tokens')
axes[0].set_title('Input vs Output Tokens (colored by cost)', fontsize=14, fontweight='bold')
plt.colorbar(scatter, ax=axes[0], label='Cost ($)')

# Right: average total tokens per model.
# gen_total_tokens is stored on df because later cells read it as well.
df['gen_total_tokens'] = df['gen_input_tokens'] + df['gen_output_tokens']
token_by_model = df.groupby('gen_model_used')['gen_total_tokens'].mean()
# Labels follow the groupby result's (sorted) index so they line up with the bars.
model_labels_short = [m.replace('anthropic/', '').replace('-20250929', '') for m in token_by_model.index]
bars = axes[1].bar(model_labels_short, token_by_model.values, color=colors[:len(token_by_model)])
axes[1].set_xlabel('Model')
axes[1].set_ylabel('Average Tokens')
axes[1].set_title('Average Tokens per Model', fontsize=14, fontweight='bold')
# Write the rounded average above each bar.
for bar, val in zip(bars, token_by_model.values):
    axes[1].text(bar.get_x() + bar.get_width() / 2, bar.get_height() + 10,
                 f'{val:.0f}', ha='center', va='bottom', fontweight='bold')

plt.tight_layout()
plt.savefig('token_analysis.png', dpi=150, bbox_inches='tight')
plt.show()

## 7. Latency Analysis

In [None]:
# Headline latency statistics across all instances.
print("\n⏱️ LATENCY STATISTICS")
print("-" * 60)
print(f"Overall - Mean: {df['gen_latency_seconds'].mean():.2f}s, Median: {df['gen_latency_seconds'].median():.2f}s")
print(f"Min: {df['gen_latency_seconds'].min():.2f}s, Max: {df['gen_latency_seconds'].max():.2f}s")

# Mean / median / standard deviation of latency per model.
latency_by_model = df.groupby('gen_model_used')['gen_latency_seconds'].agg(['mean', 'median', 'std']).round(2)
print("\nLatency by Model:")
print(latency_by_model.to_string())

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Left: box plot of latency, one box per model.
# The model order and its labels are derived together, right here, so box i
# and label i always refer to the same model. Reusing a label list built
# elsewhere is unsafe: it may follow a different order (e.g. a sorted groupby
# index) than the first-appearance order returned by unique().
models = df['gen_model_used'].unique()
latency_labels = [m.replace('anthropic/', '').replace('-20250929', '') for m in models]
data_for_box = [df[df['gen_model_used'] == m]['gen_latency_seconds'] for m in models]
# Tick labels are set after plotting instead of through boxplot's `labels`
# keyword, which newer matplotlib releases deprecate in favour of `tick_labels`.
bp = axes[0].boxplot(data_for_box, patch_artist=True)
axes[0].set_xticklabels(latency_labels)
for patch, color in zip(bp['boxes'], colors[:len(models)]):
    patch.set_facecolor(color)
    patch.set_alpha(0.7)
axes[0].set_xlabel('Model')
axes[0].set_ylabel('Latency (seconds)')
axes[0].set_title('Latency Distribution by Model', fontsize=14, fontweight='bold')

# Right: histogram of latency with a marker at the mean.
mean_latency = df['gen_latency_seconds'].mean()
axes[1].hist(df['gen_latency_seconds'], bins=50, edgecolor='black', alpha=0.7, color='#9b59b6')
axes[1].axvline(mean_latency, color='red', linestyle='--',
                label=f'Mean: {mean_latency:.2f}s')
axes[1].set_xlabel('Latency (seconds)')
axes[1].set_ylabel('Frequency')
axes[1].set_title('Latency Distribution (All Instances)', fontsize=14, fontweight='bold')
axes[1].legend()

plt.tight_layout()
plt.savefig('latency_analysis.png', dpi=150, bbox_inches='tight')
plt.show()

## 8. Repository Analysis

In [None]:
# Number of instances per repository label, most frequent first.
repo_counts = df['repo_name'].value_counts()

print("\n📁 INSTANCES BY REPOSITORY")
print("-" * 60)
for repo, count in repo_counts.items():
    pct = count / len(df) * 100
    print(f"{repo}: {count} ({pct:.1f}%)")

# Horizontal bar chart of the ten most frequent repositories.
fig, ax = plt.subplots(figsize=(14, 6))
top_repos = repo_counts.head(10)
# barh draws from the bottom up, so reverse to put the largest bar on top.
bars = ax.barh(top_repos.index[::-1], top_repos.values[::-1], color='#3498db')
ax.set_xlabel('Number of Instances')
ax.set_title('Top 10 Repositories by Instance Count', fontsize=14, fontweight='bold')

# Write the count just to the right of each bar.
for bar, count in zip(bars, top_repos.values[::-1]):
    ax.text(bar.get_width() + 1, bar.get_y() + bar.get_height() / 2,
            str(count), ha='left', va='center', fontweight='bold')

plt.tight_layout()
plt.savefig('repository_distribution.png', dpi=150, bbox_inches='tight')
plt.show()

## 9. Model Selection by Repository

In [None]:
# Row-normalised crosstab: for every repository, the percentage of its
# instances that went to each model.
cross_tab = pd.crosstab(df['repo_name'], df['gen_model_used'], normalize='index').mul(100)
# Shorten the model names used as column headers.
cross_tab.columns = [
    name.replace('anthropic/', '').replace('-20250929', '')
    for name in cross_tab.columns
]

# Keep only the ten repositories with the most instances.
top_repo_names = repo_counts.index[:10]
cross_tab_top = cross_tab.loc[top_repo_names]

# Stacked horizontal bars: one bar per repository, one segment per model.
fig, ax = plt.subplots(figsize=(14, 8))
segment_colors = colors[:len(cross_tab_top.columns)]
cross_tab_top.plot.barh(stacked=True, ax=ax, color=segment_colors)
ax.set_title('Model Selection by Repository (Top 10)', fontsize=14, fontweight='bold')
ax.set_xlabel('Percentage (%)')
ax.legend(title='Model', bbox_to_anchor=(1.02, 1), loc='upper left')

plt.tight_layout()
plt.savefig('model_selection_by_repo.png', dpi=150, bbox_inches='tight')
plt.show()

# The same numbers as a table, rounded to one decimal place.
print("\nModel Selection % by Repository:")
print(cross_tab_top.round(1).to_string())

## 10. Cost Efficiency Analysis

In [None]:
# Cost per 1K tokens for every instance.
# Rows with zero total tokens are masked to NaN first: dividing a positive
# cost by zero would give inf and turn the per-model mean into inf as well.
nonzero_tokens = df['gen_total_tokens'].where(df['gen_total_tokens'] > 0)
df['cost_per_1k_tokens'] = (df['gen_cost_usd'] / nonzero_tokens) * 1000

print("\n💰 COST EFFICIENCY")
print("-" * 60)
efficiency = df.groupby('gen_model_used').agg({
    'cost_per_1k_tokens': 'mean',
    'gen_cost_usd': 'sum',
    'gen_total_tokens': 'sum'
})
efficiency.columns = ['avg_cost_per_1k', 'total_cost', 'total_tokens']
# avg_cost_per_1k is the mean of per-instance ratios; overall_cost_per_1k is
# the ratio of the per-model totals, so large instances weigh more.
efficiency['overall_cost_per_1k'] = (efficiency['total_cost'] / efficiency['total_tokens']) * 1000
print(efficiency.round(4).to_string())

# Hypothetical comparison: What if we only used one model?
print("\n📊 HYPOTHETICAL COMPARISON")
print("-" * 60)

# Get actual costs
actual_cost = df['gen_cost_usd'].sum()
print(f"Actual Total Cost (Adaptive Router): ${actual_cost:.4f}")

# Note: Hypothetical costs would require knowing the per-token pricing for each model
# Since the router dynamically selects based on task, we can analyze the selection pattern
print("\nInstances routed to each model:")
# sort=False keeps the models in order of first appearance.
# The per-model total is deliberately NOT called `cost`: that name holds the
# cost-metrics dict that other cells index into, and rebinding it to a number
# here would break them.
for model, model_rows in df.groupby('gen_model_used', sort=False):
    model_cost = model_rows['gen_cost_usd'].sum()
    print(f"  {model.replace('anthropic/', '')}: {len(model_rows)} instances, ${model_cost:.4f} total")

## 11. Summary Dashboard

In [None]:
# Six-panel summary dashboard (2 rows x 3 columns).
fig = plt.figure(figsize=(16, 12))

# 1. Model distribution pie
ax1 = fig.add_subplot(2, 3, 1)
model_labels = [m.replace('anthropic/', '').replace('-20250929', '') for m in model_counts.index]
ax1.pie(model_counts.values, labels=model_labels, autopct='%1.1f%%', colors=colors[:len(model_counts)])
ax1.set_title('Model Selection', fontsize=12, fontweight='bold')

# 2. Cost distribution
ax2 = fig.add_subplot(2, 3, 2)
mean_cost = df['gen_cost_usd'].mean()
ax2.hist(df['gen_cost_usd'], bins=30, edgecolor='black', alpha=0.7, color='#3498db')
ax2.axvline(mean_cost, color='red', linestyle='--', label=f'Mean: ${mean_cost:.4f}')
ax2.set_xlabel('Cost ($)')
ax2.set_ylabel('Frequency')
ax2.set_title('Cost Distribution', fontsize=12, fontweight='bold')
ax2.legend(fontsize=8)

# 3. Latency distribution
ax3 = fig.add_subplot(2, 3, 3)
mean_latency = df['gen_latency_seconds'].mean()
ax3.hist(df['gen_latency_seconds'], bins=30, edgecolor='black', alpha=0.7, color='#9b59b6')
ax3.axvline(mean_latency, color='red', linestyle='--', label=f'Mean: {mean_latency:.1f}s')
ax3.set_xlabel('Latency (s)')
ax3.set_ylabel('Frequency')
ax3.set_title('Latency Distribution', fontsize=12, fontweight='bold')
ax3.legend(fontsize=8)

# 4. Repository distribution (reversed so the largest bar is drawn on top)
ax4 = fig.add_subplot(2, 3, 4)
top_repos = repo_counts.head(5)
ax4.barh(top_repos.index[::-1], top_repos.values[::-1], color='#2ecc71')
ax4.set_xlabel('Count')
ax4.set_title('Top 5 Repositories', fontsize=12, fontweight='bold')

# 5. Tokens by model
ax5 = fig.add_subplot(2, 3, 5)
token_means = df.groupby('gen_model_used')['gen_total_tokens'].mean()
model_labels_short = [m.replace('anthropic/', '').replace('-20250929', '') for m in token_means.index]
ax5.bar(model_labels_short, token_means.values, color=colors[:len(token_means)])
ax5.set_xlabel('Model')
ax5.set_ylabel('Avg Tokens')
ax5.set_title('Avg Tokens by Model', fontsize=12, fontweight='bold')

# 6. Summary text
ax6 = fig.add_subplot(2, 3, 6)
ax6.axis('off')
# Read the cost metrics straight from `data` rather than through the short
# global name `cost`, which is easy to rebind elsewhere in the notebook.
cost_metrics = data['cost_metrics']
# Same definition as the printed summary and the exported report: a patch
# counts as generated when the instance neither failed nor errored.
patches_generated = (
    summary['total_instances'] - summary['failed_instances'] - summary['error_instances']
)
# NOTE(review): the monospace font used below may lack emoji glyphs — confirm rendering.
summary_text = f"""
📊 SWE-bench Verified Results
{'='*30}

Total Instances: {summary['total_instances']}
Patches Generated: {patches_generated}

💰 Cost Metrics:
  Total: ${cost_metrics['total_cost_usd']:.4f}
  Per Instance: ${cost_metrics['cost_per_instance']:.4f}

🔢 Token Metrics:
  Total: {tokens['total_tokens']:,}
  Avg/Instance: {tokens['total_tokens'] // summary['total_instances']}

⏱️ Latency:
  Mean: {df['gen_latency_seconds'].mean():.1f}s
  Median: {df['gen_latency_seconds'].median():.1f}s
"""
ax6.text(0.1, 0.9, summary_text, transform=ax6.transAxes, fontsize=11,
         verticalalignment='top', fontfamily='monospace',
         bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5))

plt.suptitle('SWE-bench Verified - Adaptive Router Analysis Dashboard',
             fontsize=16, fontweight='bold', y=1.02)
plt.tight_layout()
plt.savefig('summary_dashboard.png', dpi=150, bbox_inches='tight')
plt.show()

## 12. Export Summary Statistics

In [None]:
# Assemble the machine-readable summary report.
# Read the cost metrics straight from `data` rather than through the short
# global name `cost`, which is easy to rebind elsewhere in the notebook.
cost_metrics = data['cost_metrics']
# Per-model total cost, computed once instead of re-filtering df per model.
cost_total_by_model = df.groupby('gen_model_used')['gen_cost_usd'].sum()

summary_report = {
    'run_info': {
        'model_name': data['model_name'],
        'dataset': data['dataset'],
        'timestamp': data['timestamp'],
        'run_id': 'adaptive_20251215_124758'
    },
    'instance_summary': {
        'total': summary['total_instances'],
        'patches_generated': summary['total_instances'] - summary['failed_instances'] - summary['error_instances'],
        'failed': summary['failed_instances'],
        'errors': summary['error_instances']
    },
    'cost_summary': {
        'total_usd': round(cost_metrics['total_cost_usd'], 4),
        'per_instance_usd': round(cost_metrics['cost_per_instance'], 4),
        'mean_usd': round(df['gen_cost_usd'].mean(), 4),
        'median_usd': round(df['gen_cost_usd'].median(), 4)
    },
    'token_summary': {
        'total': tokens['total_tokens'],
        'input': tokens['total_input_tokens'],
        'output': tokens['total_output_tokens'],
        'avg_per_instance': tokens['total_tokens'] // summary['total_instances']
    },
    'latency_summary': {
        'mean_seconds': round(df['gen_latency_seconds'].mean(), 2),
        'median_seconds': round(df['gen_latency_seconds'].median(), 2),
        'min_seconds': round(df['gen_latency_seconds'].min(), 2),
        'max_seconds': round(df['gen_latency_seconds'].max(), 2)
    },
    'model_selection': {
        model.replace('anthropic/', ''): {
            # int() converts the pandas/numpy integer, which json cannot serialise.
            'count': int(count),
            'percentage': round(count / len(df) * 100, 1),
            'total_cost': round(cost_total_by_model[model], 4)
        }
        for model, count in model_counts.items()
    },
    'repository_distribution': {
        repo: int(count) for repo, count in repo_counts.items()
    }
}

# Save summary (explicit encoding so the output does not depend on the platform default).
with open('analysis_summary.json', 'w', encoding='utf-8') as f:
    json.dump(summary_report, f, indent=2)

print("\n✅ Analysis complete!")
print("\nFiles generated:")
# Every artifact written by this notebook, in the order it was produced.
generated_files = (
    'model_selection_distribution.png',
    'cost_analysis.png',
    'token_analysis.png',
    'latency_analysis.png',
    'repository_distribution.png',
    'model_selection_by_repo.png',
    'summary_dashboard.png',
    'analysis_summary.json',
)
for filename in generated_files:
    print(f"  - {filename}")