## Customization

To modify this notebook for your own data:

1. **Update the sample data** in the "Sample Data" section:
   - Replace the `baseline_data` and `proposed_data` lists with your actual experimental results
   - Each item should be a dictionary with `"decision"` ("fusion" or "fission") and `"error"` (True/False) keys

2. **Modify metrics calculation** if needed:
   - The `compute_metrics` function assumes fusion=1 API call and fission=2 API calls
   - Adjust these values in the function if your system uses different call patterns

3. **Add new visualizations** or analysis by creating additional code cells below

The notebook is completely self-contained and doesn't require any external files to run!

In [None]:
import matplotlib.pyplot as plt

# Build a 2x2 dashboard comparing the baseline and proposed methods.
# Relies on `metrics` produced by compute_metrics() in the evaluation cell.
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(12, 8))

methods = ['Baseline', 'Proposed']

# 1. Decision Types (stacked bars: fusion at the bottom, fission on top)
fusion_rates = [metrics['baseline']['fusion_rate'], metrics['proposed']['fusion_rate']]
fission_rates = [metrics['baseline']['fission_rate'], metrics['proposed']['fission_rate']]

ax1.bar(methods, fusion_rates, label='Fusion Rate', alpha=0.7)
ax1.bar(methods, fission_rates, bottom=fusion_rates, label='Fission Rate', alpha=0.7)
ax1.set_ylabel('Rate')
ax1.set_title('Decision Type Distribution')
ax1.legend()

# 2. API Calls per Example
api_calls = [metrics['baseline']['avg_calls_per_example'], metrics['proposed']['avg_calls_per_example']]
bars = ax2.bar(methods, api_calls, color=['red', 'green'], alpha=0.7)
ax2.set_ylabel('Average API Calls')
ax2.set_title('API Efficiency')
for bar in bars:
    height = bar.get_height()
    # Value label centered just above each bar.
    ax2.text(bar.get_x() + bar.get_width()/2., height + 0.05,
             f'{height:.2f}', ha='center', va='bottom')

# 3. Error Rates
error_rates = [metrics['baseline']['error_rate'], metrics['proposed']['error_rate']]
ax3.bar(methods, error_rates, color=['orange', 'blue'], alpha=0.7)
ax3.set_ylabel('Error Rate')
ax3.set_title('Error Rate Comparison')
for bar in ax3.patches:
    height = bar.get_height()
    ax3.text(bar.get_x() + bar.get_width()/2., height + 0.002,
             f'{height:.3f}', ha='center', va='bottom')

# 4. Key Improvements
api_reduction_pct = metrics['improvement']['api_reduction_pct']
# error_rate_diff is a 0-1 fraction (proposed - baseline); scale it to percentage points.
error_rate_diff_pp = metrics['improvement']['error_rate_diff'] * 100

improvements = ['API Reduction %', 'Error Rate Diff']
values = [api_reduction_pct, error_rate_diff_pp]
# The two bars have opposite "good" directions: a positive API reduction is an
# improvement, whereas a positive error-rate difference means more errors.
colors = [
    'green' if api_reduction_pct > 0 else 'red',
    'green' if error_rate_diff_pp <= 0 else 'red',
]
bars = ax4.bar(improvements, values, color=colors, alpha=0.7)
ax4.set_ylabel('Improvement (%)')
ax4.set_title('Key Improvements')
ax4.axhline(y=0, color='black', linestyle='-', alpha=0.3)
for bar in bars:
    height = bar.get_height()
    # Place the label above positive bars and below non-positive ones.
    ax4.text(bar.get_x() + bar.get_width()/2., height + (0.5 if height > 0 else -1),
             f'{height:.1f}%', ha='center', va='bottom' if height > 0 else 'top')

plt.tight_layout()
plt.show()

# Summary
print("\n🎯 KEY FINDINGS:")
print(f"   • API calls reduced by {api_reduction_pct:.1f}%")
print(f"   • Error rate changed by {error_rate_diff_pp:+.1f} percentage points")
print(f"   • Proposed method uses {metrics['proposed']['fusion_rate']:.0%} fusion vs {metrics['baseline']['fusion_rate']:.0%} baseline")

## Visualization

Let's create some simple visualizations to better understand the results:

In [None]:
# Show the metrics exactly as they would be serialized to eval_out.json.
metrics_json = json.dumps(metrics, indent=2)
print("Contents of eval_out.json:", metrics_json, sep="\n")

# To also persist the results to disk, uncomment the following lines:
# with open("eval_out.json", "w") as f:
#     json.dump(metrics, f, indent=2)

## Save Results

The original script saved results to `eval_out.json`. Here we'll display the JSON output:

In [None]:
# Evaluate both methods on the loaded results.
metrics = compute_metrics(results)

# Headline number first (equivalent to the original script's print statement).
print(f"API reduction: {metrics['improvement']['api_reduction_pct']:.1f}%")
print()

print("=== DETAILED RESULTS ===")
# Every section is rendered the same way: a blank line, a heading, then one
# "  key: value" row per metric, with floats shown to three decimals.
for heading, section in (
    ("Baseline Method:", "baseline"),
    ("Proposed Method:", "proposed"),
    ("Improvements:", "improvement"),
):
    print()
    print(heading)
    for name, val in metrics[section].items():
        shown = f"{val:.3f}" if isinstance(val, float) else f"{val}"
        print(f"  {name}: {shown}")

## Run Evaluation

Now let's compute the metrics and display the results:

In [None]:
def compute_metrics(results: dict, fusion_calls: int = 1, fission_calls: int = 2) -> dict:
    """Compute evaluation metrics for the baseline and proposed methods.

    Args:
        results: Mapping with the keys ``"baseline"`` and ``"proposed"``. Each
            value is a sequence of prediction dicts carrying a ``"decision"``
            entry (``"fusion"`` or ``"fission"``) and a truthy/falsy
            ``"error"`` entry.
        fusion_calls: API calls charged for one fusion decision (default 1).
        fission_calls: API calls charged for one fission decision (default 2).

    Returns:
        A dict with three entries:

        * ``"baseline"`` / ``"proposed"``: ``fusion_rate``, ``fission_rate``
          and ``error_rate`` as fractions of the examples, the total
          ``api_calls``, and ``avg_calls_per_example``.
        * ``"improvement"``: ``api_reduction_pct`` (percentage drop in average
          calls per example relative to the baseline) and ``error_rate_diff``
          (proposed error rate minus baseline error rate, as a fraction).

        A method with no examples reports 0.0 for all of its rates, and
        ``api_reduction_pct`` is 0.0 when the baseline averages zero calls,
        instead of raising ``ZeroDivisionError``.

    Raises:
        KeyError: If ``results`` lacks a method key, or a prediction lacks
            ``"decision"`` or ``"error"``.
    """

    def ratio(numerator, denominator):
        # Guarded division so empty inputs yield 0.0 rather than an exception.
        return numerator / denominator if denominator else 0.0

    metrics = {}

    for method in ["baseline", "proposed"]:
        preds = results[method]
        total = len(preds)

        # Count decisions; any value other than "fusion"/"fission" is counted in neither.
        fusion_count = sum(1 for p in preds if p["decision"] == "fusion")
        fission_count = sum(1 for p in preds if p["decision"] == "fission")

        errors = sum(1 for p in preds if p["error"])

        # Total API calls weighted by the per-decision cost.
        api_calls = fusion_calls * fusion_count + fission_calls * fission_count

        metrics[method] = {
            "fusion_rate": ratio(fusion_count, total),
            "fission_rate": ratio(fission_count, total),
            "error_rate": ratio(errors, total),
            "api_calls": api_calls,
            "avg_calls_per_example": ratio(api_calls, total),
        }

    # Compare the proposed method against the baseline.
    baseline_calls = metrics["baseline"]["avg_calls_per_example"]
    proposed_calls = metrics["proposed"]["avg_calls_per_example"]
    metrics["improvement"] = {
        "api_reduction_pct": ratio(baseline_calls - proposed_calls, baseline_calls) * 100,
        "error_rate_diff": metrics["proposed"]["error_rate"] - metrics["baseline"]["error_rate"],
    }

    return metrics

## Evaluation Function

The `compute_metrics` function analyzes the experimental results and calculates:
- **Fusion/Fission rates**: Fraction (0–1) of examples assigned each decision type
- **Error rate**: Fraction (0–1) of examples with errors
- **API calls**: Total calls (fusion=1 call, fission=2 calls) and the average number of calls per example
- **Improvement metrics**: Comparison between baseline and proposed methods

In [None]:
# Sample experimental results data (normally loaded from method_out.json).
# The counts below are chosen to reproduce the expected output metrics.

# Baseline: 200 examples, every decision is "fission"; the first 16 examples
# are flagged as errors (8% of 200).
baseline_data = [
    {"decision": "fission", "error": idx < 16}
    for idx in range(200)
]

# Proposed: 200 examples; the first 130 are "fusion" (65% of 200) and the
# remaining 70 are "fission" (35% of 200). The first 18 examples are flagged
# as errors (9% of 200).
proposed_data = [
    {"decision": "fusion" if idx < 130 else "fission", "error": idx < 18}
    for idx in range(200)
]

# Bundle both methods in the layout expected by compute_metrics.
results = dict(baseline=baseline_data, proposed=proposed_data)

print("Loaded data:")
for label, key in (("Baseline", "baseline"), ("Proposed", "proposed")):
    print(f"- {label}: {len(results[key])} examples")

## Sample Data

The following cell contains sample experimental results data that would normally be loaded from `method_out.json`. This data represents 200 examples for each method (baseline and proposed) with their decisions and error status.

In [None]:
"""Required imports for the evaluation."""
import json
import numpy as np

# DKW Controller Evaluation

This notebook evaluates the performance of a DKW (Decision Knowledge Worker) Controller, comparing baseline and proposed methods across several metrics including API call efficiency and error rates.