In [None]:
# Render the full metrics dict as indented JSON text and show it
formatted_metrics = json.dumps(metrics, indent=2)
print(formatted_metrics)

# Optional: persist the same output to disk (uncomment to enable)
# with open("eval_out.json", "w") as f:
#     json.dump(metrics, f, indent=2)

## Complete Metrics Output

The cell below prints the full metrics dictionary as JSON — the same content that would normally be saved to `eval_out.json`:

In [None]:
# Compute metrics for both methods from the sample results
metrics = compute_metrics(results)

# Headline comparison between the two methods
improvement = metrics["improvement"]
print("=== DKW Controller Evaluation Results ===")
print(f"\nAPI reduction: {improvement['api_reduction_pct']:.1f}%")
print(f"Error rate difference: {improvement['error_rate_diff']:.3f}")

# Per-method breakdown; both methods share one report layout
for method in ("baseline", "proposed"):
    stats = metrics[method]
    print(f"\n{method.capitalize()} Method:")
    print(f"  Fusion rate: {stats['fusion_rate']:.1%}")
    print(f"  Fission rate: {stats['fission_rate']:.1%}")
    print(f"  Error rate: {stats['error_rate']:.1%}")
    print(f"  Avg API calls per example: {stats['avg_calls_per_example']:.2f}")

## Run Evaluation

In [None]:
def compute_metrics(results: dict) -> dict:
    """Compute evaluation metrics for the baseline and proposed methods.

    Args:
        results: Mapping with "baseline" and "proposed" keys, each holding a
            sized collection of prediction dicts. Every prediction has a
            "decision" entry (counted when it equals "fusion" or "fission")
            and an "error" entry that is evaluated for truthiness.

    Returns:
        A dict with one entry per method containing "fusion_rate",
        "fission_rate", "error_rate", "api_calls" and
        "avg_calls_per_example", plus an "improvement" entry containing
        "api_reduction_pct" and "error_rate_diff" (proposed minus baseline).

        A method with no predictions reports 0.0 for every rate, and
        "api_reduction_pct" is 0.0 when the baseline averages zero API
        calls, rather than raising ZeroDivisionError.

    Raises:
        KeyError: If a method key or a required prediction field is missing.
    """

    def _ratio(numerator, denominator):
        # Guarded division: an empty denominator yields 0.0 instead of crashing.
        return numerator / denominator if denominator else 0.0

    metrics = {}

    for method in ["baseline", "proposed"]:
        preds = results[method]
        total = len(preds)

        # Count decisions
        fusion_count = sum(1 for p in preds if p["decision"] == "fusion")
        fission_count = sum(1 for p in preds if p["decision"] == "fission")

        # Count predictions flagged as errors
        errors = sum(1 for p in preds if p["error"])

        # API calls (fusion=1, fission=2)
        api_calls = fusion_count + 2 * fission_count

        metrics[method] = {
            "fusion_rate": _ratio(fusion_count, total),
            "fission_rate": _ratio(fission_count, total),
            "error_rate": _ratio(errors, total),
            "api_calls": api_calls,
            "avg_calls_per_example": _ratio(api_calls, total),
        }

    # Compute improvement of the proposed method relative to the baseline
    baseline_calls = metrics["baseline"]["avg_calls_per_example"]
    proposed_calls = metrics["proposed"]["avg_calls_per_example"]
    metrics["improvement"] = {
        # Relative reduction is undefined for a zero baseline; report 0.0.
        "api_reduction_pct": _ratio(baseline_calls - proposed_calls, baseline_calls) * 100,
        "error_rate_diff": metrics["proposed"]["error_rate"] - metrics["baseline"]["error_rate"],
    }

    return metrics

## Metrics Computation

The `compute_metrics` function analyzes the results and calculates key performance indicators:

1. **Decision rates**: Proportion of fusion vs fission decisions
2. **Error rate**: Fraction of predictions flagged as incorrect
3. **API efficiency**: Total and average API calls per example
4. **Improvements**: Comparison between baseline and proposed methods

In [None]:
# Synthetic stand-in for the predictions normally loaded from method_out.json.

# Baseline: 200 examples, every one a fission decision; the first 16 are
# flagged as errors (8% error rate).
baseline_preds = [
    {"decision": "fission", "error": idx < 16}
    for idx in range(200)
]

# Proposed: 200 examples, the first 130 fusion (65%) and the remaining 70
# fission (35%); the first 18 are flagged as errors (9% error rate).
proposed_preds = [
    {"decision": "fission" if idx >= 130 else "fusion", "error": idx < 18}
    for idx in range(200)
]

results = {"baseline": baseline_preds, "proposed": proposed_preds}

# Quick sanity check of the fixture: sizes first, then error counts
for method, preds in results.items():
    print(f"{method.capitalize()} examples: {len(preds)}")
for method, preds in results.items():
    print(f"{method.capitalize()} errors: {len([p for p in preds if p['error']])}")

## Sample Data

Here we define sample results that stand in for the data that would normally be read from `method_out.json`.

The data contains predictions from both baseline and proposed methods, where each prediction includes:
- `decision`: Either "fusion" (1 API call) or "fission" (2 API calls)
- `error`: Boolean indicating if the prediction was incorrect

In [None]:
"""Evaluation script for DKW Controller."""
import json
import numpy as np

# DKW Controller Evaluation

This notebook evaluates the performance of a DKW (Decision-based Knowledge Worker) controller by comparing baseline and proposed methods. 

The evaluation computes:
- **Fusion/Fission rates**: How often each decision type is made
- **Error rates**: Frequency of incorrect decisions  
- **API efficiency**: Number of API calls required
- **Performance improvements**: Reduction in API calls and change in error rate between the baseline and proposed methods