In [None]:
# Experiment with different parameters
def create_experimental_data(n_examples=200, proposed_fusion_rate=0.65,
                            baseline_error_rate=0.08, proposed_error_rate=0.09):
    """Build synthetic baseline/proposed predictions for a what-if scenario.

    The baseline method always decides "fission". The proposed method decides
    "fusion" for the leading ``proposed_fusion_rate`` share of the examples
    and "fission" for the remainder. In both lists the erroneous examples are
    the leading ones.

    Args:
        n_examples: Number of prediction records generated per method.
        proposed_fusion_rate: Share of proposed-method examples marked "fusion".
        baseline_error_rate: Share of baseline examples flagged as errors.
        proposed_error_rate: Share of proposed-method examples flagged as errors.

    Returns:
        A dict with keys "baseline" and "proposed"; each value is a list of
        ``n_examples`` dicts of the form ``{"decision": str, "error": bool}``.
    """

    def _count(rate):
        # Round to the nearest whole example instead of truncating with int():
        # float products such as 100 * 0.29 evaluate to 28.999999999999996,
        # which int() would silently turn into 28.
        return round(n_examples * rate)

    n_fusion = _count(proposed_fusion_rate)
    n_baseline_errors = _count(baseline_error_rate)
    n_proposed_errors = _count(proposed_error_rate)

    experimental_results = {
        "baseline": [
            {"decision": "fission", "error": i < n_baseline_errors}
            for i in range(n_examples)
        ],
        "proposed": [
            {
                "decision": "fusion" if i < n_fusion else "fission",
                "error": i < n_proposed_errors,
            }
            for i in range(n_examples)
        ],
    }

    return experimental_results

# Try different scenarios - modify these values to experiment!
# Each entry maps a scenario label to the rates passed to
# create_experimental_data(); n_examples is left at that function's default.
experimental_scenarios = {
    "Current": {"fusion_rate": 0.65, "baseline_error": 0.08, "proposed_error": 0.09},
    "Conservative": {"fusion_rate": 0.40, "baseline_error": 0.08, "proposed_error": 0.06},
    "Aggressive": {"fusion_rate": 0.85, "baseline_error": 0.08, "proposed_error": 0.12},
}

print("Experimental Scenario Comparison:")
print("=" * 80)

for scenario_name, params in experimental_scenarios.items():
    # Generate synthetic predictions for this scenario...
    exp_data = create_experimental_data(
        proposed_fusion_rate=params["fusion_rate"],
        baseline_error_rate=params["baseline_error"],
        proposed_error_rate=params["proposed_error"],
    )

    # ...and score them with the same metric function used for the main results.
    exp_metrics = compute_metrics(exp_data)

    print(f"\n{scenario_name} Scenario:")
    print(f"  Fusion Rate: {params['fusion_rate']:.1%}")
    print(f"  API Reduction: {exp_metrics['improvement']['api_reduction_pct']:.1f}%")
    print(f"  Error Change: {exp_metrics['improvement']['error_rate_diff']:+.1%}")
    print(f"  Avg Calls/Example: {exp_metrics['proposed']['avg_calls_per_example']:.2f}")

print(f"\n{'='*80}")
# The tip starts with the light-bulb emoji (U+1F4A1). It is written as an
# escape because the previous literal held the emoji's UTF-8 bytes mis-decoded
# as Windows-1252, which printed as garbage.
print("\U0001F4A1 TIP: Modify the experimental_scenarios dictionary above to test your own scenarios!")

## 5. Interactive Experimentation

You can modify the parameters below to experiment with different scenarios and see how they affect the metrics.

In [None]:
# Build a 2x2 figure comparing the baseline and proposed methods using the
# values already stored in `metrics`.
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(15, 10))
(ax1, ax2), (ax3, ax4) = axes
fig.suptitle('DKW Controller Performance Analysis', fontsize=16, fontweight='bold')

methods = ['Baseline', 'Proposed']

# --- Panel 1: grouped bars of fusion vs. fission decision rates -------------
fusion_rates = [metrics[key]['fusion_rate'] for key in ('baseline', 'proposed')]
fission_rates = [metrics[key]['fission_rate'] for key in ('baseline', 'proposed')]

x = np.arange(len(methods))
width = 0.35

# The two series sit half a bar-width to either side of each tick.
ax1.bar(x - width/2, fusion_rates, width, label='Fusion', alpha=0.8, color='skyblue')
ax1.bar(x + width/2, fission_rates, width, label='Fission', alpha=0.8, color='lightcoral')
ax1.set(xlabel='Method', ylabel='Decision Rate', title='Decision Distribution')
ax1.set_xticks(x)
ax1.set_xticklabels(methods)
ax1.legend()
ax1.set_ylim(0, 1.1)

# --- Panel 2: error rate per method, labelled as percentages ----------------
error_rates = [metrics[key]['error_rate'] for key in ('baseline', 'proposed')]
ax2.bar(methods, error_rates, alpha=0.8, color=['red', 'orange'])
ax2.set(ylabel='Error Rate', title='Error Rate Comparison',
        ylim=(0, max(error_rates) * 1.2))
for position, rate in enumerate(error_rates):
    ax2.text(position, rate + 0.002, f'{rate:.1%}', ha='center', va='bottom')

# --- Panel 3: average API calls per example ---------------------------------
api_calls_avg = [metrics[key]['avg_calls_per_example'] for key in ('baseline', 'proposed')]
bars = ax3.bar(methods, api_calls_avg, alpha=0.8, color=['lightblue', 'lightgreen'])
ax3.set(ylabel='Average API Calls per Example', title='API Call Efficiency',
        ylim=(0, max(api_calls_avg) * 1.2))
for position, average in enumerate(api_calls_avg):
    ax3.text(position, average + 0.05, f'{average:.2f}', ha='center', va='bottom')

# --- Panel 4: total API calls per method ------------------------------------
total_api_calls = [metrics[key]['api_calls'] for key in ('baseline', 'proposed')]
ax4.bar(methods, total_api_calls, alpha=0.8, color=['coral', 'lightseagreen'])
ax4.set(ylabel='Total API Calls', title='Total API Call Usage')
for position, total in enumerate(total_api_calls):
    ax4.text(position, total + 5, f'{total}', ha='center', va='bottom')

plt.tight_layout()
plt.show()

# Summary statistics: headline numbers from `metrics`, printed as a bulleted list.
# The non-ASCII symbols are written as escapes (U+1F3AF direct-hit emoji,
# U+2022 bullet, U+2192 rightwards arrow) because the previous literals held
# their UTF-8 bytes mis-decoded as Windows-1252, which printed as garbage.
print("\n\U0001F3AF KEY FINDINGS:")
print(f"   \u2022 API calls reduced by {metrics['improvement']['api_reduction_pct']:.1f}%")
print(f"   \u2022 Proposed method uses {metrics['proposed']['fusion_rate']:.1%} fusion decisions")
print(f"   \u2022 Error rate changed by {metrics['improvement']['error_rate_diff']:+.1%}")
print(f"   \u2022 Total API calls: {metrics['baseline']['api_calls']} \u2192 {metrics['proposed']['api_calls']}")

## 4. Visualizations

Let's create some visualizations to better understand the performance differences between the methods.

In [None]:
# Score the sample predictions; later cells read `metrics` as well.
metrics = compute_metrics(results)

# Header banner for the text report.
banner = "=" * 60
print(banner)
print("           DKW CONTROLLER EVALUATION RESULTS")
print(banner)

# One pre-formatted table row per method: rates as percentages, the total
# API-call count as-is, and the per-example average to two decimals.
methods_data = [
    {
        'Method': name.title(),
        'Fusion Rate': f"{stats['fusion_rate']:.1%}",
        'Fission Rate': f"{stats['fission_rate']:.1%}",
        'Error Rate': f"{stats['error_rate']:.1%}",
        'API Calls': stats['api_calls'],
        'Avg Calls/Example': f"{stats['avg_calls_per_example']:.2f}"
    }
    for name, stats in ((key, metrics[key]) for key in ("baseline", "proposed"))
]

df = pd.DataFrame(methods_data)
print("\nMethod Comparison:")
print(df.to_string(index=False))

# Proposed-vs-baseline deltas.
print(f"\n{banner}")
print("                    IMPROVEMENTS")
print(banner)
print(f"API Call Reduction: {metrics['improvement']['api_reduction_pct']:.1f}%")
print(f"Error Rate Change:  {metrics['improvement']['error_rate_diff']:+.1%}")

# Nothing is written to disk here: the metrics dict is only printed as the
# JSON text that the original script stored in eval_out.json.
eval_output = metrics
print("\nEquivalent to eval_out.json content:")
print(json.dumps(eval_output, indent=2))

## 3. Run Evaluation and Display Results

Now let's compute the metrics and display the results in a readable format.

In [None]:
def compute_metrics(results: dict) -> dict:
    """Compute evaluation metrics for both baseline and proposed methods.

    Args:
        results: Mapping with the keys "baseline" and "proposed". Each value
            is a sequence of prediction dicts carrying a "decision" entry
            ("fusion" or "fission") and a truthy/falsy "error" entry.

    Returns:
        A dict with three entries:

        * "baseline" / "proposed": per-method dicts with "fusion_rate",
          "fission_rate" and "error_rate" (fractions of the method's
          predictions), "api_calls" (total, counting 1 per fusion and 2 per
          fission) and "avg_calls_per_example".
        * "improvement": "api_reduction_pct" (relative drop in average calls
          per example, in percent) and "error_rate_diff" (proposed minus
          baseline error rate).

        Every ratio whose denominator is zero (an empty prediction list, or a
        baseline that makes no API calls) is reported as 0.0 instead of
        raising ZeroDivisionError.

    Raises:
        KeyError: If ``results`` lacks "baseline"/"proposed" or a prediction
            lacks "decision"/"error".
    """

    def _ratio(numerator, denominator):
        # Guard the empty-input case; non-zero denominators divide as before.
        return numerator / denominator if denominator else 0.0

    metrics = {}

    for method in ["baseline", "proposed"]:
        preds = results[method]
        n_preds = len(preds)

        # Count decisions
        fusion_count = sum(1 for p in preds if p["decision"] == "fusion")
        fission_count = sum(1 for p in preds if p["decision"] == "fission")

        # Count erroneous predictions
        errors = sum(1 for p in preds if p["error"])

        # API calls (fusion=1, fission=2)
        api_calls = fusion_count + 2 * fission_count

        metrics[method] = {
            "fusion_rate": _ratio(fusion_count, n_preds),
            "fission_rate": _ratio(fission_count, n_preds),
            "error_rate": _ratio(errors, n_preds),
            "api_calls": api_calls,
            "avg_calls_per_example": _ratio(api_calls, n_preds),
        }

    # Compute improvement metrics
    baseline_calls = metrics["baseline"]["avg_calls_per_example"]
    proposed_calls = metrics["proposed"]["avg_calls_per_example"]
    metrics["improvement"] = {
        # Same expression as before for a non-zero baseline, so existing
        # results are numerically unchanged.
        "api_reduction_pct": (
            (baseline_calls - proposed_calls) / baseline_calls * 100
            if baseline_calls
            else 0.0
        ),
        "error_rate_diff": metrics["proposed"]["error_rate"] - metrics["baseline"]["error_rate"],
    }

    return metrics

print("Metrics computation function defined successfully!")

## 2. Metrics Computation Function

This function analyzes the predictions from both methods and computes key performance metrics:

- **Fusion Rate**: Fraction of examples (0–1, displayed as a percentage) that chose fusion (1 API call each)
- **Fission Rate**: Fraction of examples (0–1, displayed as a percentage) that chose fission (2 API calls each)
- **Error Rate**: Fraction of predictions (0–1, displayed as a percentage) that resulted in errors
- **API Calls**: Total number of API calls made
- **Average Calls per Example**: Efficiency metric for API usage

In [None]:
"""Evaluation script for DKW Controller."""
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Sample data standing in for what would normally be read from method_out.json.
# Each method maps to 200 prediction records of the form
# {"decision": ..., "error": ...}.
results = {
    # Baseline: every example is "fission"; the first 16 of 200 are errors (8%).
    "baseline": [
        dict(decision="fission", error=(idx < 16)) for idx in range(200)
    ],
    # Proposed: the first 130 examples are "fusion" (65%), the remaining 70 are
    # "fission" (35%); the first 18 of 200 are errors (9%).
    "proposed": [
        dict(decision=("fission" if idx >= 130 else "fusion"), error=(idx < 18))
        for idx in range(200)
    ],
}

# Quick sanity output: record counts and the first record of each method.
print("Sample data loaded successfully!")
print(f"Baseline examples: {len(results['baseline'])}")
print(f"Proposed examples: {len(results['proposed'])}")
print(f"Sample baseline prediction: {results['baseline'][0]}")
print(f"Sample proposed prediction: {results['proposed'][0]}")

## 1. Setup and Data Import

First, let's import the required libraries and set up our sample data. In the original script, this data would be read from `../experiment_001/method_out.json`, but we'll inline it here for a self-contained notebook.

# DKW Controller Evaluation

This notebook contains an evaluation script for the DKW Controller, analyzing the performance metrics between baseline and proposed methods for fusion/fission decisions.

## Overview
- **Baseline Method**: Always chooses fission (2 API calls per example)
- **Proposed Method**: Intelligently chooses between fusion (1 API call) and fission (2 API calls)
- **Metrics**: Fusion rate, fission rate, error rate, and API call efficiency