# Adaptive API - Cost Bias Testing

Test how different `cost_bias` values affect model selection between Claude Opus 4.5 and Claude Sonnet 4.5.

**Cost Bias Values:**
- `0.0` = Quality focused (prefer more capable models)
- `0.5` = Balanced
- `1.0` = Cost focused (prefer cheaper models)

In [None]:
import os
import json
import time
from dotenv import load_dotenv
from openai import OpenAI
import pandas as pd
import matplotlib.pyplot as plt

# Read settings from a .env file (if one is present) into the environment
load_dotenv()

# OpenAI-compatible client pointed at the Adaptive endpoint; the base URL can
# be overridden through ADAPTIVE_API_BASE
client = OpenAI(
    base_url=os.getenv("ADAPTIVE_API_BASE", "https://api.llmadaptive.uk/v1/"),
    api_key=os.getenv("ADAPTIVE_API_KEY"),
)

# Candidate models handed to the router in every request
MODELS = ["anthropic/claude-opus-4-5", "anthropic/claude-sonnet-4-5"]

# Echo the effective configuration so the run is self-describing
for banner_line in (
    "Adaptive API Configuration",
    "=" * 40,
    f"API Base: {client.base_url}",
    f"Models: {MODELS}",
):
    print(banner_line)

## 1. Define Test Prompts

The prompts below span low, medium, and high complexity so we can see how routing changes with task difficulty.

In [None]:
# Prompt catalogue written as (name, complexity, prompt) rows and expanded
# into the list of dicts the rest of the notebook consumes.
_PROMPT_ROWS = [
    ("simple_greeting", "low", "Say hello in 3 words."),
    ("math_question", "low", "What is 15 * 23?"),
    (
        "code_simple",
        "medium",
        "Write a Python function to check if a number is prime.",
    ),
    (
        "code_medium",
        "medium",
        "Write a Python function to implement binary search on a sorted list. Include docstring and type hints.",
    ),
    (
        "code_complex",
        "high",
        "Design and implement a Python class for a thread-safe LRU cache with the following requirements:\n"
        "1. O(1) get and put operations\n"
        "2. Thread-safe for concurrent access\n"
        "3. Configurable max size\n"
        "4. Optional TTL (time-to-live) for entries\n"
        "Include comprehensive docstrings, type hints, and example usage.",
    ),
    (
        "reasoning",
        "high",
        # The first line intentionally keeps its trailing space before the newline.
        "Analyze the trade-offs between microservices and monolithic architecture for a startup building a social media platform. \n"
        "Consider: team size (5 developers), expected scale (100K users in year 1), budget constraints, and time-to-market pressure.\n"
        "Provide a concrete recommendation with justification.",
    ),
]

TEST_PROMPTS = [
    {"name": row_name, "complexity": row_level, "prompt": row_text}
    for row_name, row_level, row_text in _PROMPT_ROWS
]

# One header line followed by one bullet per prompt
_summary_lines = [f"Defined {len(TEST_PROMPTS)} test prompts:"]
_summary_lines.extend(
    f"  - {entry['name']} ({entry['complexity']} complexity)" for entry in TEST_PROMPTS
)
print("\n".join(_summary_lines))

## 2. Helper Function for API Calls

In [None]:
def call_adaptive_api(prompt: str, cost_bias: float, max_tokens: int = 1024) -> dict:
    """
    Send one prompt through the Adaptive router and summarise the outcome.

    The request targets the ``adaptive/auto`` model and passes the candidate
    ``MODELS`` plus ``cost_bias`` in the ``model_router`` section of the
    request body. Any exception raised while calling the API or reading the
    response is captured and reported in the returned dictionary instead of
    being propagated, so a batch of calls can keep running.

    Args:
        prompt: The user prompt
        cost_bias: 0.0 (quality) to 1.0 (cost)
        max_tokens: Maximum tokens to generate

    Returns:
        Dictionary with keys ``success``, ``model_used``, ``input_tokens``,
        ``output_tokens``, ``total_tokens``, ``latency_seconds``, ``content``
        (at most 200 characters, followed by "..." when truncated) and
        ``error`` (``None`` on success, the exception text on failure).
        On failure ``model_used`` is the literal string ``"error"``.
    """
    # perf_counter is monotonic, so the measured duration cannot be skewed by
    # system clock adjustments during the request.
    start_time = time.perf_counter()

    try:
        response = client.chat.completions.create(
            model="adaptive/auto",
            messages=[
                {"role": "user", "content": prompt}
            ],
            max_tokens=max_tokens,
            temperature=0.7,
            extra_body={
                "model_router": {
                    "models": MODELS,
                    "cost_bias": cost_bias
                }
            }
        )

        latency = time.perf_counter() - start_time

        # Usage may be absent from the response; report zeros in that case
        usage = response.usage

        # message.content can be None; treat that as empty text so a
        # successful call is not misreported as a failure by len(None).
        content = response.choices[0].message.content or ""
        preview = content[:200] + "..." if len(content) > 200 else content

        return {
            "success": True,
            "model_used": response.model,
            "input_tokens": usage.prompt_tokens if usage else 0,
            "output_tokens": usage.completion_tokens if usage else 0,
            "total_tokens": usage.total_tokens if usage else 0,
            "latency_seconds": latency,
            "content": preview,
            "error": None
        }

    except Exception as e:
        # Deliberately broad: the caller records the failure and continues
        return {
            "success": False,
            "model_used": "error",
            "input_tokens": 0,
            "output_tokens": 0,
            "total_tokens": 0,
            "latency_seconds": time.perf_counter() - start_time,
            "content": "",
            "error": str(e)
        }

print("Helper function defined.")

## 3. Run Tests with Different Cost Biases

In [None]:
# Cost bias values to test
COST_BIASES = [0.0, 0.25, 0.5, 0.75, 1.0]

# One dict per API call, in call order
results = []


def _short_model_name(model: str) -> str:
    """Return a display label: drop the ``anthropic/`` prefix and a trailing
    ``-YYYYMMDD`` style suffix (any 8-digit stamp, not one hard-coded date)."""
    name = model.replace("anthropic/", "")
    base, sep, tail = name.rpartition("-")
    if sep and len(tail) == 8 and tail.isdigit():
        return base
    return name


print("Running tests...")
print("=" * 60)

for prompt_info in TEST_PROMPTS:
    print(f"\nTesting: {prompt_info['name']} ({prompt_info['complexity']} complexity)")

    for cost_bias in COST_BIASES:
        # flush so the progress text is visible while the request is in flight
        print(f"  cost_bias={cost_bias}...", end=" ", flush=True)

        result = call_adaptive_api(
            prompt=prompt_info["prompt"],
            cost_bias=cost_bias
        )

        # Tag the result with the inputs that produced it
        result["prompt_name"] = prompt_info["name"]
        result["prompt_complexity"] = prompt_info["complexity"]
        result["cost_bias"] = cost_bias
        results.append(result)

        if result["success"]:
            model_short = _short_model_name(result["model_used"])
            print(f"-> {model_short} ({result['latency_seconds']:.2f}s)")
        else:
            print(f"-> ERROR: {result['error']}")

        # Small delay to avoid rate limiting
        time.sleep(0.5)

print("\n" + "=" * 60)
print(f"Completed {len(results)} API calls")

## 4. Analyze Results

In [None]:
# Convert to DataFrame (one row per API call)
df = pd.DataFrame(results)


def _short_model_name(model: str) -> str:
    """Return a display label: drop the ``anthropic/`` prefix and a trailing
    ``-YYYYMMDD`` style suffix (any 8-digit stamp, not one hard-coded date).

    The ``'error'`` placeholder used for failed calls passes through unchanged.
    """
    name = model.replace("anthropic/", "")
    base, sep, tail = name.rpartition("-")
    if sep and len(tail) == 8 and tail.isdigit():
        return base
    return name


# Clean model names; every later table and chart groups on this column, so
# all date-stamped ids must collapse to the same short label.
df['model_short'] = df['model_used'].map(_short_model_name)

print("Results Summary")
print("=" * 60)
print(df[['prompt_name', 'cost_bias', 'model_short', 'latency_seconds', 'total_tokens']].to_string())

## 5. Model Selection by Cost Bias

In [None]:
# Pivot table: share of calls (%) per model label for each cost_bias value.
# Failed calls carry the label 'error' and therefore show up as their own column.
selection_pivot = pd.crosstab(df['cost_bias'], df['model_short'], normalize='index') * 100

print("\nModel Selection % by Cost Bias:")
print("-" * 40)
print(selection_pivot.round(1).to_string())


def _model_color(model_name):
    """Return a fixed colour per model label (grey for anything unrecognised).

    Colouring by name rather than by column position keeps the mapping
    stable when a model is never selected or an 'error' column is present.
    """
    lowered = str(model_name).lower()
    if 'opus' in lowered:
        return '#e74c3c'
    if 'sonnet' in lowered:
        return '#3498db'
    return '#95a5a6'


# Visualize
fig, ax = plt.subplots(figsize=(10, 6))
selection_pivot.plot(
    kind='bar',
    ax=ax,
    color=[_model_color(col) for col in selection_pivot.columns],
)
ax.set_xlabel('Cost Bias')
ax.set_ylabel('Selection %')
ax.set_title('Model Selection by Cost Bias', fontsize=14, fontweight='bold')
# Label ticks from the plotted index so labels always match the bars drawn
ax.set_xticklabels([f'{b}' for b in selection_pivot.index], rotation=0)
ax.legend(title='Model')
ax.set_ylim(0, 105)

for container in ax.containers:
    ax.bar_label(container, fmt='%.0f%%', fontsize=9)

plt.tight_layout()
plt.savefig('cost_bias_selection.png', dpi=150)
plt.show()

## 6. Model Selection by Prompt Complexity

In [None]:
# One bar chart per complexity level: model selection % by cost_bias
fig, axes = plt.subplots(1, 3, figsize=(15, 5))

complexities = ['low', 'medium', 'high']
# Positional palette retained for any later cell that still references `colors`;
# the bars below are coloured by model name instead.
colors = ['#3498db', '#e74c3c']


def _model_color(model_name):
    """Return a fixed colour per model label (grey for anything unrecognised).

    Colouring by name keeps a colour tied to one model across all panels,
    even when a panel's subset contains only one of the models.
    """
    lowered = str(model_name).lower()
    if 'opus' in lowered:
        return '#e74c3c'
    if 'sonnet' in lowered:
        return '#3498db'
    return '#95a5a6'


for i, complexity in enumerate(complexities):
    subset = df[df['prompt_complexity'] == complexity]
    if not subset.empty:
        pivot = pd.crosstab(subset['cost_bias'], subset['model_short'], normalize='index') * 100
        pivot.plot(
            kind='bar',
            ax=axes[i],
            color=[_model_color(col) for col in pivot.columns],
            legend=(i == 2),
        )
        axes[i].set_title(f'{complexity.capitalize()} Complexity', fontsize=12, fontweight='bold')
        axes[i].set_xlabel('Cost Bias')
        axes[i].set_ylabel('Selection %' if i == 0 else '')
        axes[i].set_xticklabels([f'{b}' for b in pivot.index], rotation=0)
        axes[i].set_ylim(0, 105)

# The last panel only has a legend if it was actually plotted
if axes[2].get_legend():
    axes[2].legend(title='Model', bbox_to_anchor=(1.02, 1))

plt.suptitle('Model Selection by Complexity and Cost Bias', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.savefig('cost_bias_by_complexity.png', dpi=150, bbox_inches='tight')
plt.show()

## 7. Latency Comparison

In [None]:
# Latency by model (failed calls appear under the 'error' label)
print("\nLatency by Model:")
print("-" * 40)
latency_stats = df.groupby('model_short')['latency_seconds'].agg(['mean', 'median', 'std', 'count'])
print(latency_stats.round(2).to_string())


def _model_color(model_name):
    """Return a fixed colour per model label (grey for anything unrecognised)."""
    lowered = str(model_name).lower()
    if 'opus' in lowered:
        return '#e74c3c'
    if 'sonnet' in lowered:
        return '#3498db'
    return '#95a5a6'


# Visualize
fig, axes = plt.subplots(1, 2, figsize=(12, 5))

# Box plot: one box per model, excluding the 'error' placeholder
models = df['model_short'].unique()
labels = [m for m in models if m != 'error']
data_for_box = [df.loc[df['model_short'] == m, 'latency_seconds'] for m in labels]
if data_for_box:  # nothing to draw when every call failed
    bp = axes[0].boxplot(data_for_box, patch_artist=True)
    # Tick labels are set separately because boxplot's `labels=` keyword is
    # deprecated in recent Matplotlib releases; this form works on old and new.
    axes[0].set_xticks(list(range(1, len(labels) + 1)))
    axes[0].set_xticklabels(labels)
    # Colour by model name so the box colour does not depend on the order
    # in which models first appear in the results
    for patch, label in zip(bp['boxes'], labels):
        patch.set_facecolor(_model_color(label))
        patch.set_alpha(0.7)
axes[0].set_xlabel('Model')
axes[0].set_ylabel('Latency (seconds)')
axes[0].set_title('Latency Distribution by Model', fontsize=12, fontweight='bold')

# Latency by cost_bias: successful calls only, so the time spent on failed
# requests does not distort the average
latency_by_bias = df[df['success']].groupby('cost_bias')['latency_seconds'].mean()
axes[1].plot(latency_by_bias.index, latency_by_bias.values, 'o-', markersize=10, linewidth=2)
axes[1].set_xlabel('Cost Bias')
axes[1].set_ylabel('Average Latency (seconds)')
axes[1].set_title('Average Latency by Cost Bias', fontsize=12, fontweight='bold')
axes[1].set_xticks(COST_BIASES)

plt.tight_layout()
plt.savefig('latency_comparison.png', dpi=150)
plt.show()

## 8. Detailed Results Table

In [None]:
# Prompt x cost_bias grid showing the model label recorded for each pair
# ('first' picks the first recorded label for each cell)
detail_table = pd.pivot_table(
    df,
    values='model_short',
    index='prompt_name',
    columns='cost_bias',
    aggfunc='first',
)

_detail_heading = "\nModel Selected by Prompt and Cost Bias:"
_detail_rule = "=" * 80
print(_detail_heading)
print(_detail_rule)
print(detail_table.to_string())

## 9. Summary and Recommendations

In [None]:
# Calculate summary statistics over successful calls only
successful = df[df['success']]
n_successful = len(successful)

# Count selections by substring so any opus/sonnet label variant is matched
opus_count = int(successful['model_short'].str.contains('opus', case=False).sum())
sonnet_count = int(successful['model_short'].str.contains('sonnet', case=False).sum())


def _share(count, total):
    """Format count/total as a percentage, or 'n/a' when there is no total."""
    return f"{count / total * 100:.1f}%" if total else "n/a"


print("\n" + "=" * 60)
print("SUMMARY")
print("=" * 60)
print(f"\nTotal API Calls: {len(df)}")
print(f"Successful: {n_successful}")
print(f"Failed: {len(df) - n_successful}")

print("\nModel Distribution:")
print(f"  Claude Opus 4.5:   {opus_count} ({_share(opus_count, n_successful)})")
print(f"  Claude Sonnet 4.5: {sonnet_count} ({_share(sonnet_count, n_successful)})")

# With no successful calls the means are undefined; say so instead of
# printing "nan" (the percentages above would otherwise divide by zero)
if n_successful:
    print(f"\nAverage Latency: {successful['latency_seconds'].mean():.2f}s")
    print(f"Average Tokens:  {successful['total_tokens'].mean():.0f}")
else:
    print("\nAverage Latency: n/a")
    print("Average Tokens:  n/a")

print("\n" + "-" * 60)
print("RECOMMENDATIONS")
print("-" * 60)
# Static guidance text; it is not derived from the measurements above
print("""
Based on the test results:

1. cost_bias=0.0: Use for critical/complex tasks requiring best quality
   - More likely to select Claude Opus 4.5
   - Higher cost but better reasoning

2. cost_bias=0.5: Balanced approach for general use
   - Router intelligently selects based on task complexity
   - Good trade-off between cost and quality

3. cost_bias=1.0: Use for high-volume, simpler tasks
   - More likely to select Claude Sonnet 4.5
   - Lower cost, still good quality for routine tasks
""")

## 10. Export Results

In [None]:
def _rounded_mean(series, ndigits):
    """Return the rounded mean of *series*, or None when it is undefined.

    An empty selection yields NaN, which json.dump would write as a bare
    ``NaN`` token that strict JSON parsers reject; None becomes ``null``.
    """
    value = series.mean()
    return None if pd.isna(value) else round(float(value), ndigits)


# Export to JSON: run configuration, aggregate summary, and every raw result
export_data = {
    "test_config": {
        "models": MODELS,
        "cost_biases_tested": COST_BIASES,
        "prompts_tested": len(TEST_PROMPTS)
    },
    "summary": {
        "total_calls": len(df),
        "successful_calls": len(successful),
        "opus_selections": opus_count,
        "sonnet_selections": sonnet_count,
        "avg_latency": _rounded_mean(successful['latency_seconds'], 2),
        "avg_tokens": _rounded_mean(successful['total_tokens'], 0)
    },
    "results": df.to_dict(orient='records')
}

# Explicit encoding so the output does not depend on the platform default;
# default=str is a fallback for any value json cannot serialise natively
with open('cost_bias_test_results.json', 'w', encoding='utf-8') as f:
    json.dump(export_data, f, indent=2, default=str)

# Export to CSV
df.to_csv('cost_bias_test_results.csv', index=False)

# The PNG files listed here are written by the earlier plotting cells
print("\nResults exported:")
print("  - cost_bias_test_results.json")
print("  - cost_bias_test_results.csv")
print("  - cost_bias_selection.png")
print("  - cost_bias_by_complexity.png")
print("  - latency_comparison.png")