In [None]:
# Create comprehensive comparison table.
# Every cell is a pre-formatted string so that each row can use its own display
# format (percentages, thousands separators, signed deltas) within one column.
n_baseline = len(results['baseline'])
n_proposed = len(results['proposed'])

comparison_data = {
    'Metric': [
        'Fusion Rate (%)',
        'Fission Rate (%)',
        'Error Rate (%)',
        'Total API Calls',
        'Avg Calls per Example',
        'Total Examples',
    ],
    'Baseline': [
        f"{metrics['baseline']['fusion_rate']:.1%}",
        f"{metrics['baseline']['fission_rate']:.1%}",
        f"{metrics['baseline']['error_rate']:.1%}",
        f"{metrics['baseline']['api_calls']:,}",
        f"{metrics['baseline']['avg_calls_per_example']:.2f}",
        f"{n_baseline:,}",
    ],
    'Proposed': [
        f"{metrics['proposed']['fusion_rate']:.1%}",
        f"{metrics['proposed']['fission_rate']:.1%}",
        f"{metrics['proposed']['error_rate']:.1%}",
        f"{metrics['proposed']['api_calls']:,}",
        f"{metrics['proposed']['avg_calls_per_example']:.2f}",
        f"{n_proposed:,}",
    ],
    'Improvement': [
        # Rely on the '+' format flag for every delta rather than a literal "+"
        # prefix, so a negative delta renders as "-5.0%" and not "+-5.0%".
        f"{(metrics['proposed']['fusion_rate'] - metrics['baseline']['fusion_rate']):+.1%}",
        f"{(metrics['proposed']['fission_rate'] - metrics['baseline']['fission_rate']):+.1%}",
        f"{metrics['improvement']['error_rate_diff']:+.1%}",
        f"{metrics['proposed']['api_calls'] - metrics['baseline']['api_calls']:+,}",
        f"{metrics['proposed']['avg_calls_per_example'] - metrics['baseline']['avg_calls_per_example']:+.2f}",
        # Only claim "Same" when the two result sets really have equal size.
        "Same" if n_proposed == n_baseline else f"{n_proposed - n_baseline:+,}",
    ]
}

df_comparison = pd.DataFrame(comparison_data)

print("üìã COMPREHENSIVE METRICS COMPARISON")
print("=" * 80)
print(df_comparison.to_string(index=False))

# Key improvements summary
print(f"\nüéØ KEY IMPROVEMENTS SUMMARY:")
print(f"  ‚Ä¢ API Efficiency: {metrics['improvement']['api_reduction_pct']:.1f}% reduction in API calls")
print(f"  ‚Ä¢ Strategy Shift: {metrics['proposed']['fusion_rate']:.0%} adoption of fusion strategy")
print(f"  ‚Ä¢ Cost: {metrics['improvement']['error_rate_diff']:.1%} increase in error rate")

# Create a styled version for better visualization
try:
    # No background gradient here: the 'Baseline' and 'Proposed' columns hold
    # formatted strings (e.g. "65.0%"), and a colour gradient needs numeric data.
    styled_df = (
        df_comparison.style
        .set_caption("DKW Controller Performance Comparison")
        .set_properties(**{'text-align': 'center'})
    )
    display(styled_df)
except Exception as exc:
    # Best-effort fallback (e.g. `display` is undefined outside IPython or the
    # optional styling dependency is missing). Report the reason instead of
    # silently swallowing it; KeyboardInterrupt/SystemExit still propagate.
    print("\n(Styled table would appear here in Jupyter environment)")
    print(f"  (styling skipped: {exc!r})")

# Summary metrics for easy reference
summary_metrics = {
    'API_Reduction_Percent': metrics['improvement']['api_reduction_pct'],
    'Error_Rate_Change_Percent': metrics['improvement']['error_rate_diff'] * 100,
    'Calls_Saved': metrics['baseline']['api_calls'] - metrics['proposed']['api_calls'],
    'Fusion_Adoption_Rate': metrics['proposed']['fusion_rate'] * 100,
}

print(f"\nüìä SUMMARY FOR REPORTING:")
for key, value in summary_metrics.items():
    # Keys mentioning "Percent" or "Rate" are percentages; the rest are raw counts.
    print(f"  {key.replace('_', ' ')}: {value:.1f}{'%' if 'Percent' in key or 'Rate' in key else ''}")

## 6. Comparison Table

Here's a comprehensive comparison table that summarizes all the key metrics side by side.

In [None]:
# Detailed Analysis: volume, decision pattern, accuracy trade-off and
# cost-benefit figures derived from `metrics` and `results`.
print("üî¨ DETAILED PERFORMANCE ANALYSIS")
print("=" * 60)

# Calculate cost savings
total_examples = len(results['baseline'])
proposed_examples = len(results['proposed'])
baseline_total_calls = metrics['baseline']['api_calls']
proposed_total_calls = metrics['proposed']['api_calls']
calls_saved = baseline_total_calls - proposed_total_calls

print(f"\nüìä VOLUME ANALYSIS (Total: {total_examples} examples)")
print(f"  ‚Ä¢ Baseline Total API Calls: {baseline_total_calls}")
print(f"  ‚Ä¢ Proposed Total API Calls: {proposed_total_calls}")
print(f"  ‚Ä¢ Total API Calls Saved: {calls_saved}")
if baseline_total_calls:
    print(f"  ‚Ä¢ Efficiency Gain: {calls_saved/baseline_total_calls:.1%}")
else:
    # Avoid dividing by zero when the baseline made no API calls at all.
    print("  ‚Ä¢ Efficiency Gain: n/a (baseline made no API calls)")

# Decision pattern analysis
print(f"\nüéØ DECISION PATTERN ANALYSIS")
print(f"  ‚Ä¢ Baseline Strategy: 100% Fission (conservative, always 2 calls)")
print(f"  ‚Ä¢ Proposed Strategy: {metrics['proposed']['fusion_rate']:.0%} Fusion + {metrics['proposed']['fission_rate']:.0%} Fission (adaptive)")
print(f"  ‚Ä¢ Strategic Shift: {metrics['proposed']['fusion_rate']:.0%} reduction in conservative decisions")

# Error trade-off analysis.
# round() rather than int(): rate * count is a float product that can land a
# hair below the true integer (e.g. 28.999999999999996), which int() would
# truncate to one error too few. Each count uses its own method's sample size.
baseline_errors = round(metrics['baseline']['error_rate'] * total_examples)
proposed_errors = round(metrics['proposed']['error_rate'] * proposed_examples)
error_difference = proposed_errors - baseline_errors

print(f"\n‚ö†Ô∏è ACCURACY TRADE-OFF ANALYSIS")
print(f"  ‚Ä¢ Baseline Errors: {baseline_errors}/{total_examples} ({metrics['baseline']['error_rate']:.1%})")
print(f"  ‚Ä¢ Proposed Errors: {proposed_errors}/{proposed_examples} ({metrics['proposed']['error_rate']:.1%})")
# The '+' format flag yields a correct sign for both increases and decreases.
print(f"  ‚Ä¢ Additional Errors: {error_difference:+d} ({metrics['improvement']['error_rate_diff']:+.1%})")
if metrics['baseline']['error_rate']:
    print(f"  ‚Ä¢ Error Rate Increase: {metrics['improvement']['error_rate_diff']/metrics['baseline']['error_rate']:.1%} relative increase")
else:
    # A relative change is undefined against a zero baseline error rate.
    print("  ‚Ä¢ Error Rate Increase: n/a (baseline error rate is 0)")

# Cost-benefit analysis
print(f"\nüí∞ COST-BENEFIT ANALYSIS")
if error_difference > 0:
    print(f"  ‚Ä¢ API Calls Saved per Additional Error: {calls_saved/error_difference:.1f}")
else:
    # The ratio is only meaningful (and only defined) when errors actually rose.
    print("  ‚Ä¢ API Calls Saved per Additional Error: n/a (no additional errors)")
print(f"  ‚Ä¢ Efficiency vs Accuracy Trade-off: {metrics['improvement']['api_reduction_pct']:.1f}% efficiency gain for {metrics['improvement']['error_rate_diff']:.1%} accuracy cost")

# Threshold-based rating of the API reduction and the error-rate change
print(f"\nüìà PERFORMANCE INSIGHTS")
if metrics['improvement']['api_reduction_pct'] > 30:
    print(f"  ‚úÖ EXCELLENT: >30% API reduction achieved ({metrics['improvement']['api_reduction_pct']:.1f}%)")
elif metrics['improvement']['api_reduction_pct'] > 20:
    print(f"  ‚úÖ GOOD: >20% API reduction achieved ({metrics['improvement']['api_reduction_pct']:.1f}%)")
else:
    print(f"  ‚ö†Ô∏è MODERATE: {metrics['improvement']['api_reduction_pct']:.1f}% API reduction")

if abs(metrics['improvement']['error_rate_diff']) < 0.02:
    print(f"  ‚úÖ LOW IMPACT: Error rate change is minimal (<2%)")
else:
    print(f"  ‚ö†Ô∏è NOTICEABLE: Error rate change of {metrics['improvement']['error_rate_diff']:.1%} requires attention")

print(f"\nüèÜ OVERALL ASSESSMENT:")
print(f"  The proposed method demonstrates a strong efficiency improvement with acceptable")
print(f"  accuracy trade-offs. The {metrics['improvement']['api_reduction_pct']:.1f}% reduction in API calls significantly")
print(f"  outweighs the {metrics['improvement']['error_rate_diff']:.1%} increase in error rate.")

## 5. Detailed Analysis and Interpretation

Let's dive deeper into what these results mean and analyze the trade-offs between the baseline and proposed approaches.

In [None]:
# Create comparison visualizations: a 2x2 grid covering decision mix, error
# rate, API calls per example, and an overall improvement summary.
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(15, 10))

# 1. Decision Strategy Comparison
methods = ['Baseline', 'Proposed']
fusion_rates = [metrics['baseline']['fusion_rate'], metrics['proposed']['fusion_rate']]
fission_rates = [metrics['baseline']['fission_rate'], metrics['proposed']['fission_rate']]

x = np.arange(len(methods))
width = 0.35

ax1.bar(x - width/2, fusion_rates, width, label='Fusion', color='lightblue')
ax1.bar(x + width/2, fission_rates, width, label='Fission', color='lightcoral')
ax1.set_ylabel('Decision Rate')
ax1.set_title('Decision Strategy Comparison')
ax1.set_xticks(x)
ax1.set_xticklabels(methods)
ax1.legend()
ax1.set_ylim(0, 1.1)

# Add percentage labels on bars
for i, (fusion, fission) in enumerate(zip(fusion_rates, fission_rates)):
    ax1.text(i - width/2, fusion + 0.02, f'{fusion:.0%}', ha='center')
    ax1.text(i + width/2, fission + 0.02, f'{fission:.0%}', ha='center')

# 2. Error Rate Comparison
error_rates = [metrics['baseline']['error_rate'], metrics['proposed']['error_rate']]
# Fall back to a unit scale when every rate is 0 so the axis range is not (0, 0).
error_scale = max(error_rates) or 1.0
ax2.bar(methods, error_rates, color=['red', 'orange'], alpha=0.7)
ax2.set_ylabel('Error Rate')
ax2.set_title('Error Rate Comparison')
ax2.set_ylim(0, error_scale * 1.2)

# Add percentage labels
for i, rate in enumerate(error_rates):
    ax2.text(i, rate + error_scale * 0.02, f'{rate:.1%}', ha='center')

# 3. API Calls per Example
api_calls = [metrics['baseline']['avg_calls_per_example'], metrics['proposed']['avg_calls_per_example']]
# Same zero-range guard as for the error-rate panel.
calls_scale = max(api_calls) or 1.0
ax3.bar(methods, api_calls, color=['darkblue', 'green'], alpha=0.7)
ax3.set_ylabel('Average API Calls per Example')
ax3.set_title('API Efficiency Comparison')
ax3.set_ylim(0, calls_scale * 1.2)

# Add value labels
for i, calls in enumerate(api_calls):
    ax3.text(i, calls + calls_scale * 0.02, f'{calls:.2f}', ha='center')

# 4. Overall Improvement Summary
categories = ['API Reduction\n(%)', 'Error Rate\nChange (%)']
improvements = [metrics['improvement']['api_reduction_pct'],
                metrics['improvement']['error_rate_diff'] * 100]
# A positive API reduction is a gain, but a positive error-rate change is a
# regression, so each bar carries its own "higher is better" flag instead of
# colouring every positive value green.
higher_is_better = [True, False]
colors = ['green' if (value > 0) == good_when_up else 'red'
          for value, good_when_up in zip(improvements, higher_is_better)]

ax4.bar(categories, improvements, color=colors, alpha=0.7)
ax4.set_ylabel('Improvement (%)')
ax4.set_title('Improvement Summary')
ax4.axhline(y=0, color='black', linestyle='-', alpha=0.3)

# Add value labels: above the bar for non-negative values, below it for
# negative ones so the text never sits inside a downward bar.
label_offset = max(abs(value) for value in improvements) * 0.05
for i, improvement in enumerate(improvements):
    if improvement >= 0:
        ax4.text(i, improvement + label_offset, f'{improvement:+.1f}%', ha='center')
    else:
        ax4.text(i, improvement - label_offset, f'{improvement:+.1f}%', ha='center', va='top')

plt.tight_layout()
plt.show()

print("üìà Key Takeaways from Visualizations:")
print(f"‚úì The proposed method reduces API calls by {metrics['improvement']['api_reduction_pct']:.1f}%")
print(f"‚úì Fusion strategy used in {metrics['proposed']['fusion_rate']:.0%} of cases vs {metrics['baseline']['fusion_rate']:.0%} in baseline")
print(f"‚ö†Ô∏è Error rate slightly increased by {metrics['improvement']['error_rate_diff']:.1%} (trade-off for efficiency)")

## 4. Visualizations

Let's create some visualizations to better understand the performance differences between the baseline and proposed methods.

In [None]:
def _metrics_match(actual, expected, tol=1e-9):
    """Recursively compare two metric structures, tolerating float rounding.

    Args:
        actual: Computed value (nested dicts of numbers).
        expected: Reference value with the same structure.
        tol: Maximum absolute difference allowed between numeric leaves.

    Returns:
        True when both structures have identical keys and every numeric leaf
        differs by at most ``tol``; non-numeric leaves must compare equal.
    """
    if isinstance(expected, dict):
        return (
            isinstance(actual, dict)
            and actual.keys() == expected.keys()
            and all(_metrics_match(actual[key], expected[key], tol) for key in expected)
        )
    if isinstance(actual, (int, float)) and isinstance(expected, (int, float)):
        return abs(actual - expected) <= tol
    return actual == expected


# Compute the metrics
metrics = compute_metrics(results)

# Display key results
print("üîç DKW Controller Evaluation Results")
print("=" * 50)

print("\nüìä BASELINE METHOD:")
print(f"  ‚Ä¢ Fusion Rate: {metrics['baseline']['fusion_rate']:.1%}")
print(f"  ‚Ä¢ Fission Rate: {metrics['baseline']['fission_rate']:.1%}")
print(f"  ‚Ä¢ Error Rate: {metrics['baseline']['error_rate']:.1%}")
print(f"  ‚Ä¢ Average API Calls per Example: {metrics['baseline']['avg_calls_per_example']:.2f}")

print("\nüöÄ PROPOSED METHOD:")
print(f"  ‚Ä¢ Fusion Rate: {metrics['proposed']['fusion_rate']:.1%}")
print(f"  ‚Ä¢ Fission Rate: {metrics['proposed']['fission_rate']:.1%}")
print(f"  ‚Ä¢ Error Rate: {metrics['proposed']['error_rate']:.1%}")
print(f"  ‚Ä¢ Average API Calls per Example: {metrics['proposed']['avg_calls_per_example']:.2f}")

print("\nüí° IMPROVEMENTS:")
print(f"  ‚Ä¢ API Reduction: {metrics['improvement']['api_reduction_pct']:.1f}%")
print(f"  ‚Ä¢ Error Rate Change: {metrics['improvement']['error_rate_diff']:+.1%}")

# Store the expected output for validation
expected_metrics = {
  "baseline": {
    "fusion_rate": 0.0,
    "fission_rate": 1.0,
    "error_rate": 0.08,
    "api_calls": 400,
    "avg_calls_per_example": 2.0
  },
  "proposed": {
    "fusion_rate": 0.65,
    "fission_rate": 0.35,
    "error_rate": 0.09,
    "api_calls": 270,
    "avg_calls_per_example": 1.35
  },
  "improvement": {
    "api_reduction_pct": 32.5,
    "error_rate_diff": 0.01
  }
}

# Compare with a tolerance: derived floats such as 0.09 - 0.08 evaluate to
# 0.009999999999999995, so exact `==` against 0.01 would report a false mismatch.
print(f"\n‚úÖ Validation: Results match expected output: {_metrics_match(metrics, expected_metrics)}")

## 3. Compute and Display Results

Now let's run the evaluation and display the key metrics. This will show us exactly how much the proposed method improves upon the baseline.

In [None]:
def compute_metrics(results: dict) -> dict:
    """Compute evaluation metrics for baseline and proposed methods.

    Each prediction is a dict with a "decision" key ("fusion" or "fission")
    and a truthy/falsy "error" key. A fusion decision costs one API call and a
    fission decision costs two. Decisions with any other value are counted in
    the example total but contribute to neither rate nor to the call count.

    Args:
        results: Dict containing 'baseline' and 'proposed' keys with prediction lists

    Returns:
        Dict with per-method metrics under 'baseline' and 'proposed'
        (fusion_rate, fission_rate, error_rate, api_calls,
        avg_calls_per_example) and an 'improvement' entry holding
        api_reduction_pct (percentage drop in average calls per example
        relative to baseline) and error_rate_diff (proposed minus baseline).
        Ratios whose denominator is zero (empty prediction list, or a baseline
        that made no calls) are reported as 0.0 instead of raising.

    Raises:
        KeyError: If a method key is missing from ``results`` or a prediction
            lacks the "decision" or "error" key.
    """

    def ratio(numerator, denominator):
        # Guarded division so empty inputs yield 0.0 rather than ZeroDivisionError.
        return numerator / denominator if denominator else 0.0

    metrics = {}

    for method in ("baseline", "proposed"):
        preds = results[method]
        total = len(preds)

        # Count decisions
        fusion_count = sum(1 for p in preds if p["decision"] == "fusion")
        fission_count = sum(1 for p in preds if p["decision"] == "fission")

        # Count erroneous predictions
        errors = sum(1 for p in preds if p["error"])

        # API calls (fusion=1, fission=2)
        api_calls = fusion_count + 2 * fission_count

        metrics[method] = {
            "fusion_rate": ratio(fusion_count, total),
            "fission_rate": ratio(fission_count, total),
            "error_rate": ratio(errors, total),
            "api_calls": api_calls,
            "avg_calls_per_example": ratio(api_calls, total),
        }

    # Compute improvement
    baseline_calls = metrics["baseline"]["avg_calls_per_example"]
    proposed_calls = metrics["proposed"]["avg_calls_per_example"]
    metrics["improvement"] = {
        # Same operation order as a plain (b - p) / b * 100 so results are
        # bit-identical whenever the baseline is non-zero.
        "api_reduction_pct": (
            (baseline_calls - proposed_calls) / baseline_calls * 100
            if baseline_calls
            else 0.0
        ),
        "error_rate_diff": metrics["proposed"]["error_rate"] - metrics["baseline"]["error_rate"],
    }

    return metrics

print("Evaluation function defined successfully!")

## 2. Evaluation Function

The `compute_metrics` function analyzes the results and calculates key performance indicators for both methods.

In [None]:
import json
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Inline stand-in for the data that would normally be read from
# "../experiment_001/method_out.json": one record per example, per method.

# Baseline method: always chooses fission; the first 16 of 200 examples are
# flagged as errors (8% error rate).
baseline_records = [
    dict(decision="fission", error=(idx < 16))
    for idx in range(200)
]

# Proposed method: the first 130 examples use fusion (65%), the remaining 70
# use fission (35%); the first 18 examples are flagged as errors (9%).
proposed_records = [
    dict(decision=("fusion" if idx < 130 else "fission"), error=(idx < 18))
    for idx in range(200)
]

results = {"baseline": baseline_records, "proposed": proposed_records}

# Quick sanity printout: sizes first, then one sample record from each method.
print("Data loaded successfully!")
print(f"Baseline examples: {len(results['baseline'])}")
print(f"Proposed examples: {len(results['proposed'])}")
print(f"Sample baseline entry: {results['baseline'][0]}")
print(f"Sample proposed entry: {results['proposed'][0]}")

## 1. Setup and Data Preparation

First, let's import the required libraries and define our evaluation data inline.

# DKW Controller Evaluation

This notebook evaluates the performance of a DKW (Decision-Knowledge-Workflow) Controller comparing baseline and proposed methods. The evaluation focuses on:

- **Fusion vs Fission decisions**: How often each method chooses fusion (1 API call) vs fission (2 API calls)
- **Error rates**: Frequency of incorrect decisions
- **API efficiency**: Total API calls and reduction achieved by the proposed method

## Overview
The proposed method aims to reduce API calls while maintaining acceptable error rates by intelligently choosing between fusion and fission strategies.