In [None]:
# Display comprehensive results.
# Reads the `metrics` dict produced by the evaluation cell and prints one
# section per method, the improvement summary, and finally the whole
# structure as JSON.
print("📈 DETAILED EVALUATION METRICS")
print("=" * 50)

# Both methods share one formatting loop so their output cannot drift apart.
for heading, method in (
    ("\n🔹 BASELINE METHOD:", "baseline"),
    ("\n🔸 PROPOSED METHOD:", "proposed"),
):
    print(heading)
    for key, value in metrics[method].items():
        if "rate" in key:
            # Rates are stored as fractions; render them as percentages.
            print(f"  {key}: {value:.1%}")
        elif "calls" in key:
            print(f"  {key}: {value:.2f}")
        else:
            print(f"  {key}: {value}")

improvement = metrics["improvement"]
print("\n🚀 IMPROVEMENT:")
# api_reduction_pct is already scaled to percent, so only append the sign.
print(f"  API Reduction: {improvement['api_reduction_pct']:.1f}%")
# The explicit "+" makes the direction of the error-rate change visible.
print(f"  Error Rate Change: {improvement['error_rate_diff']:+.1%}")

# Pretty print the complete results (equivalent to the JSON output)
print("\n" + "=" * 50)
print("📋 COMPLETE RESULTS (JSON format):")
print(json.dumps(metrics, indent=2))

## 5. Detailed Results

Let's examine the detailed metrics to understand the performance comparison between baseline and proposed methods.

In [None]:
# Run the evaluation (equivalent to the original script's main section).
# `results` maps each method name to its list of prediction records.
metrics = compute_metrics(results)

# Display the key result (equivalent to the original print statement).
# api_reduction_pct is already expressed in percent.
print(f"🎯 API reduction: {metrics['improvement']['api_reduction_pct']:.1f}%")

# Keep the results in a variable (stands in for writing eval_out.json).
# Note: eval_output refers to the same dict object as metrics, not a copy.
eval_output = metrics
print("📊 Evaluation completed successfully!")

## 4. Run Evaluation

Let's compute the metrics and keep the results in a notebook variable. This replaces the file I/O from the original script.

In [None]:
def _ratio(numerator: float, denominator: float) -> float:
    """Return numerator / denominator, or 0.0 when the denominator is zero."""
    return numerator / denominator if denominator else 0.0


def compute_metrics(results: dict) -> dict:
    """Compute evaluation metrics for the baseline and proposed methods.

    Args:
        results: Mapping with the keys "baseline" and "proposed". Each value
            is a sequence of prediction dicts that carry a "decision" entry
            (compared against "fusion" and "fission") and an "error" entry
            that is interpreted by truthiness.

    Returns:
        A dict with three entries:

        * "baseline" / "proposed": per-method dicts holding "fusion_rate",
          "fission_rate" and "error_rate" (fractions of the predictions),
          "api_calls" (total, counting 1 per fusion and 2 per fission) and
          "avg_calls_per_example".
        * "improvement": "api_reduction_pct" (relative drop in average calls
          from baseline to proposed, in percent) and "error_rate_diff"
          (proposed error rate minus baseline error rate).

        A method with no predictions reports 0.0 for every rate and average
        instead of raising ZeroDivisionError; likewise "api_reduction_pct"
        is 0.0 when the baseline averages zero calls.

    Raises:
        KeyError: If a method key is missing from ``results`` or a
            prediction lacks "decision" or "error".
    """
    metrics = {}

    for method in ["baseline", "proposed"]:
        preds = results[method]
        total = len(preds)

        # Count decisions; anything other than fusion/fission is ignored.
        fusion_count = sum(1 for p in preds if p["decision"] == "fusion")
        fission_count = sum(1 for p in preds if p["decision"] == "fission")

        # Count predictions flagged as errors.
        errors = sum(1 for p in preds if p["error"])

        # API calls (fusion=1, fission=2)
        api_calls = fusion_count + 2 * fission_count

        metrics[method] = {
            "fusion_rate": _ratio(fusion_count, total),
            "fission_rate": _ratio(fission_count, total),
            "error_rate": _ratio(errors, total),
            "api_calls": api_calls,
            "avg_calls_per_example": _ratio(api_calls, total),
        }

    # Compute improvement of the proposed method relative to the baseline.
    baseline_calls = metrics["baseline"]["avg_calls_per_example"]
    proposed_calls = metrics["proposed"]["avg_calls_per_example"]
    # A relative reduction is undefined for a zero baseline; report 0.0.
    if baseline_calls:
        api_reduction_pct = (baseline_calls - proposed_calls) / baseline_calls * 100
    else:
        api_reduction_pct = 0.0
    metrics["improvement"] = {
        "api_reduction_pct": api_reduction_pct,
        "error_rate_diff": metrics["proposed"]["error_rate"] - metrics["baseline"]["error_rate"],
    }

    return metrics

print("✅ Metrics computation function defined successfully!")

## 3. Evaluation Metrics Function

This function computes various metrics to compare the baseline and proposed methods:
- **Fusion/Fission Rates**: Percentage of decisions for each operation type
- **Error Rate**: Percentage of predictions that resulted in errors
- **API Calls**: Total and average API calls (fusion=1 call, fission=2 calls)
- **Improvement**: Percentage reduction in average API calls per example (baseline → proposed), and the error rate difference (proposed minus baseline)

In [None]:
# Synthetic evaluation results, built inline.
# Stands in for loading "../experiment_001/method_out.json".

# Baseline: every one of the 200 examples takes the fission path, and the
# first 16 are flagged as errors (8% error rate).
baseline_results = [
    {"decision": "fission", "error": idx < 16}
    for idx in range(200)
]

# Proposed: the first 130 examples (65%) choose fusion, the remaining 70
# (35%) choose fission, and the first 18 are flagged as errors (9%).
proposed_results = [
    {
        "decision": "fusion" if idx < 130 else "fission",
        "error": idx < 18,
    }
    for idx in range(200)
]

# Bundle both methods into one mapping keyed by method name.
results = {"baseline": baseline_results, "proposed": proposed_results}

# Summary: prediction counts first, then the fusion share of each method.
print("Generated sample data:")
for label, rows in (("Baseline", baseline_results), ("Proposed", proposed_results)):
    print(f"- {label}: {len(rows)} predictions")
for label, rows in (("Baseline", baseline_results), ("Proposed", proposed_results)):
    fusion_total = sum(1 for row in rows if row["decision"] == "fusion")
    print(f"- {label} fusion rate: {fusion_total / len(rows):.1%}")

## 2. Sample Evaluation Data

Instead of reading from external JSON files, we'll inline the sample data directly into the notebook. 
This data represents evaluation results from both the baseline and proposed methods on 200 test examples.

In [None]:
import json
import numpy as np
from pprint import pprint

## 1. Import Dependencies

Let's start by importing the necessary libraries for our evaluation.

# DKW Controller Evaluation

This notebook contains an evaluation script for the DKW Controller that compares baseline and proposed methods for decision-making between fusion and fission operations.

## Overview
- **Baseline Method**: Always chooses fission operations
- **Proposed Method**: Intelligently chooses between fusion and fission operations
- **Metrics**: Fusion/fission rates, error rates, API call efficiency