In [None]:
# Example: Create a custom dataset for experimentation
def create_custom_dataset(num_examples=100, fusion_rate=0.5, error_rate=0.05):
    """Create a synthetic prediction list with the requested characteristics.

    The first ``round(num_examples * fusion_rate)`` examples are labelled
    "fusion" and the remainder "fission"; the first
    ``round(num_examples * error_rate)`` examples are flagged as errors.

    Args:
        num_examples: Number of examples to generate
        fusion_rate: Proportion of fusion decisions (rest will be fission)
        error_rate: Proportion of examples with errors

    Returns:
        A list of dicts, one per example, with the keys "decision",
        "error" and "example_id".
    """
    # round() rather than int(): binary floating point makes products such as
    # 100 * 0.29 evaluate to 28.999999999999996, which int() would truncate
    # to 28 and silently drop one example from the requested share.
    fusion_count = round(num_examples * fusion_rate)
    error_count = round(num_examples * error_rate)

    return [
        {
            "decision": "fusion" if i < fusion_count else "fission",
            "error": i < error_count,
            "example_id": i,
        }
        for i in range(num_examples)
    ]

# Uncomment and modify these lines to test your own scenarios.
# NOTE: the example calls compute_metrics, so that function must already be
# defined in the session before these lines are run.
# custom_results = {
#     "baseline": create_custom_dataset(num_examples=100, fusion_rate=0.2, error_rate=0.1),
#     "proposed": create_custom_dataset(num_examples=100, fusion_rate=0.8, error_rate=0.05)
# }
# custom_metrics = compute_metrics(custom_results)
# print(f"Custom API reduction: {custom_metrics['improvement']['api_reduction_pct']:.1f}%")

print("Customization example ready! Uncomment the lines above to test your own scenarios.")

## Customization and Experimentation

You can easily modify the data to test different scenarios. Here's an example of how to create your own dataset:

In [None]:
# Print a detailed, per-method comparison of the metrics held in `metrics`
# (the dictionary returned by compute_metrics).
print("=== DETAILED EVALUATION RESULTS ===\n")

for method in ["baseline", "proposed"]:
    m = metrics[method]
    print(f"{method.upper()} METHOD:")
    print(f"  Total Examples: {m['total_examples']}")
    print(f"  Fusion Decisions: {m['fusion_count']} ({m['fusion_rate']:.1%})")
    print(f"  Fission Decisions: {m['fission_count']} ({m['fission_rate']:.1%})")
    print(f"  Error Count: {m['error_count']} ({m['error_rate']:.1%})")
    print(f"  Total API Calls: {m['api_calls']}")
    print(f"  Avg Calls/Example: {m['avg_calls_per_example']:.2f}")
    print()

print("=== IMPROVEMENTS ===")
improvement = metrics["improvement"]
print(f"API Call Reduction: {improvement['api_reduction_pct']:.1f}%")
print(f"Error Rate Change: {improvement['error_rate_diff']:+.1%}")

# Calculate additional insights
baseline_avg = metrics["baseline"]["avg_calls_per_example"]
proposed_avg = metrics["proposed"]["avg_calls_per_example"]
call_savings = metrics["baseline"]["api_calls"] - metrics["proposed"]["api_calls"]
# Guard the ratio: a baseline that made no API calls would otherwise raise
# ZeroDivisionError here.
efficiency_gain = (baseline_avg - proposed_avg) / baseline_avg if baseline_avg else 0.0

print(f"Absolute Call Savings: {call_savings} API calls")
print(f"Efficiency Gain: {efficiency_gain:.1%}")

# Quick validation against the values expected for the inlined dataset.
# Both expectations are checked; tolerances are half a unit of the last
# digit shown in the corresponding "Expected" line.
expected_api_reduction_pct = 32.5
expected_error_rate_diff = 0.01
api_reduction_ok = abs(improvement['api_reduction_pct'] - expected_api_reduction_pct) < 0.1
error_diff_ok = abs(improvement['error_rate_diff'] - expected_error_rate_diff) < 0.005

print("\n=== VALIDATION ===")
print(f"Expected API reduction: 32.5% | Actual: {improvement['api_reduction_pct']:.1f}%")
print(f"Expected error diff: +0.01 | Actual: {improvement['error_rate_diff']:+.2f}")
print("✓ Results match expected values!" if api_reduction_ok and error_diff_ok else "⚠ Results don't match expected values")

## Detailed Results Analysis

Let's examine the detailed metrics and visualize the comparison between methods:

In [None]:
# Run the evaluation over the inlined results
metrics = compute_metrics(results)

# Headline number first (same line the original script printed)
api_reduction = metrics["improvement"]["api_reduction_pct"]
print(f"API reduction: {api_reduction:.1f}%")

# Persist a trimmed copy of the metrics as JSON, keeping the original
# script's output format: five fields per method plus the improvement entry.
output_file = "eval_out.json"
with open(output_file, "w") as f:
    summary_fields = (
        "fusion_rate",
        "fission_rate",
        "error_rate",
        "api_calls",
        "avg_calls_per_example",
    )
    simplified_metrics = {
        method: {field: metrics[method][field] for field in summary_fields}
        for method in ("baseline", "proposed")
    }
    simplified_metrics["improvement"] = metrics["improvement"]
    json.dump(simplified_metrics, f, indent=2)

print(f"Results saved to {output_file}")
print("\nEvaluation completed successfully!")

## Execute Evaluation

Now let's run the evaluation and see the results!

In [None]:
def _summarize_predictions(preds: list) -> dict:
    """Aggregate decision counts, error counts and API usage for one method.

    Rates and the per-example average are reported as 0.0 when ``preds`` is
    empty, so an empty run does not raise ZeroDivisionError.
    """
    total = len(preds)

    # Count decisions
    fusion_count = sum(1 for p in preds if p["decision"] == "fusion")
    fission_count = sum(1 for p in preds if p["decision"] == "fission")
    errors = sum(1 for p in preds if p["error"])

    # API calls (fusion=1, fission=2)
    api_calls = fusion_count + 2 * fission_count

    def per_example(count):
        return count / total if total else 0.0

    return {
        "fusion_rate": per_example(fusion_count),
        "fission_rate": per_example(fission_count),
        "error_rate": per_example(errors),
        "api_calls": api_calls,
        "avg_calls_per_example": per_example(api_calls),
        "total_examples": total,
        "fusion_count": fusion_count,
        "fission_count": fission_count,
        "error_count": errors,
    }


def compute_metrics(results: dict) -> dict:
    """Compute evaluation metrics for both baseline and proposed methods.

    Args:
        results: Dictionary containing 'baseline' and 'proposed' keys,
                each with a list of prediction dictionaries containing
                'decision' and 'error' fields.

    Returns:
        Dictionary with a 'baseline' and a 'proposed' entry (rates, counts and
        API-call totals for that method) plus an 'improvement' entry holding
        'api_reduction_pct' (relative drop in average calls per example, in
        percent) and 'error_rate_diff' (proposed minus baseline error rate).
        'api_reduction_pct' is 0.0 when the baseline made no API calls.

    Raises:
        KeyError: If 'baseline' or 'proposed' is missing from ``results``, or
            a prediction lacks the 'decision' or 'error' field.
    """
    metrics = {
        method: _summarize_predictions(results[method])
        for method in ("baseline", "proposed")
    }

    # Compute improvement
    baseline_calls = metrics["baseline"]["avg_calls_per_example"]
    proposed_calls = metrics["proposed"]["avg_calls_per_example"]
    # A zero baseline has no meaningful relative reduction; report 0.0
    # instead of dividing by zero.
    if baseline_calls:
        api_reduction_pct = (baseline_calls - proposed_calls) / baseline_calls * 100
    else:
        api_reduction_pct = 0.0

    metrics["improvement"] = {
        "api_reduction_pct": api_reduction_pct,
        "error_rate_diff": metrics["proposed"]["error_rate"] - metrics["baseline"]["error_rate"],
    }

    return metrics

# Status message printed once the definition above has been executed.
print("Evaluation function defined successfully!")

## Evaluation Function

The `compute_metrics` function analyzes the prediction results and computes various performance metrics for both methods.

In [None]:
# Inlined evaluation data - replaces reading from ../experiment_001/method_out.json
# Each method gets 200 synthetic test examples, built with comprehensions.

# Baseline: every decision is fission; examples 0-15 carry an error (16/200 = 8%)
baseline_predictions = [
    {"decision": "fission", "error": idx < 16, "example_id": idx}
    for idx in range(200)
]

# Proposed: examples 0-129 are fusion (65%), 130-199 are fission (35%);
# examples 0-17 carry an error (18/200 = 9%)
proposed_predictions = [
    {
        "decision": "fusion" if idx < 130 else "fission",
        "error": idx < 18,
        "example_id": idx,
    }
    for idx in range(200)
]

# Bundle both methods into the structure the evaluation function expects
results = dict(baseline=baseline_predictions, proposed=proposed_predictions)

print(f"Loaded data for {len(results['baseline'])} baseline and {len(results['proposed'])} proposed predictions")
print(f"Baseline sample: {results['baseline'][0]}")
print(f"Proposed sample: {results['proposed'][0]}")

## Data Setup

Instead of reading from external JSON files, we'll define the evaluation data directly in the notebook. The data represents results from 200 test examples, with each method's predictions including decision type (fusion/fission) and error status.

In [None]:
"""Evaluation script for DKW Controller."""
import json
import numpy as np
from typing import Dict, List, Any

# Status message printed once the imports above have been executed.
print("Libraries imported successfully!")

## Overview

This evaluation compares two methods:
- **Baseline**: A conservative approach that always chooses fission (splits tasks)
- **Proposed**: An intelligent approach that selectively chooses between fusion (merging) and fission

The key metrics computed are:
- **Fusion/Fission rates**: Proportion of each decision type
- **Error rate**: Proportion of decisions that resulted in errors
- **API calls**: Total API usage (fusion=1 call, fission=2 calls)
- **Efficiency**: Average API calls per example

All data is embedded directly in this notebook for complete self-containment.

# DKW Controller Evaluation

This notebook contains the evaluation script for the DKW Controller, converted into an interactive format. The script computes metrics comparing a baseline method against a proposed method for decision-making tasks involving fusion and fission operations.