In [None]:
# Echo the results in the layout the original script wrote to method_out.json.
print("üìù RESULTS OUTPUT (method_out.json format):", "=" * 50, sep="\n")

# Keep only the three reported fields of every record, for both arms.
formatted_results = {
    arm: [
        {
            "id": record["id"],
            "decision": record["decision"],
            "error": record["error"],
        }
        for record in experiment_results[arm]
    ]
    for arm in ("baseline", "proposed")
}

# Pretty-print the structure as JSON.
print(json.dumps(formatted_results, indent=2))

# Optional: Save to file (uncomment to enable)
# with open("method_out.json", "w") as f:
#     json.dump(formatted_results, f, indent=2)
# print("\n‚úì Results saved to method_out.json")

# Closing summary; only the example count is computed at runtime.
print(
    "\n‚úÖ NOTEBOOK COMPLETE!",
    "   - DKW Controller implemented and tested",
    f"   - {len(test_data)} examples processed",
    "   - Baseline vs. proposed comparison completed",
    "   - Ready for further experimentation!",
    sep="\n",
)

## Save Results (Optional)

The original script saved results to `method_out.json`. Here we show the equivalent output format and provide an option to save to file if desired.

In [None]:
# Experiment with different parameters!
# Try changing these values and re-running to see the effects:

def experiment_with_parameters(epsilon_target=0.15, delta=0.05, min_samples=50, hysteresis=0.02):
    """Replay ``test_data`` through a DKWController built from custom settings.

    Prints the chosen settings, draws one simulated error per example (an
    error occurs when a uniform random draw falls below the example's
    ``difficulty``), feeds it to the controller, and records the controller's
    decision next to an always-"fission" baseline.

    Args:
        epsilon_target: Target error rate handed to the controller.
        delta: Confidence parameter handed to the controller.
        min_samples: Minimum observation count handed to the controller.
        hysteresis: Anti-oscillation buffer handed to the controller.

    Returns:
        Dict with "baseline" and "proposed" lists holding one
        ``{"id", "decision", "error"}`` record per example.
    """
    print("üß™ CUSTOM EXPERIMENT")
    print(f"   epsilon_target: {epsilon_target} (target error rate)")
    print(f"   delta: {delta} (confidence parameter)")
    print(f"   min_samples: {min_samples} (minimum observations)")
    print(f"   hysteresis: {hysteresis} (anti-oscillation buffer)")
    print()

    # Controller configured with the caller's settings.
    tuned = DKWController(
        epsilon_target=epsilon_target,
        delta=delta,
        min_samples=min_samples,
        hysteresis=hysteresis,
    )

    baseline_log = []
    proposed_log = []
    for item in test_data:
        # Simulated outcome: True means this example produced an error.
        failed = np.random.random() < item["difficulty"]
        tuned.add_observation(float(failed))
        mode = tuned.decide()

        proposed_log.append({"id": item["id"], "decision": mode, "error": failed})
        # The baseline never leaves fission but shares the same error draw.
        baseline_log.append({"id": item["id"], "decision": "fission", "error": failed})

    custom_results = {"baseline": baseline_log, "proposed": proposed_log}

    # Quick tallies over the proposed arm.
    proposed_fusion_count = len([r for r in proposed_log if r["decision"] == "fusion"])
    proposed_error_count = len([r for r in proposed_log if r["error"]])

    print(f"üìä Results: {proposed_fusion_count} fusion decisions, {proposed_error_count} errors")
    return custom_results

# Run once with the function's default parameters.
print("=" * 50)
default_results = experiment_with_parameters()

# Suggest follow-up runs.  The controller only switches to fusion when the
# error upper bound drops below epsilon_target - hysteresis, so a LOWER
# epsilon_target keeps it in fission longer (more conservative) and a HIGHER
# one lets it reach fusion sooner (more aggressive).
print("\n" + "=" * 50)
print("Try experimenting with different parameters!")
print("For example:")
print("  experiment_with_parameters(epsilon_target=0.05)  # More conservative")
print("  experiment_with_parameters(epsilon_target=0.20)  # More aggressive")
print("  experiment_with_parameters(min_samples=20)       # Faster decisions")
print("  experiment_with_parameters(hysteresis=0.10)      # Less switching")

## Interactive Experimentation

Try modifying the controller parameters to see how they affect performance! This section lets you explore different configurations.

In [None]:
# Analyze results
def _error_rate(error_count, total):
    """Return ``error_count / total``, or 0.0 when there are no records."""
    return error_count / total if total else 0.0


def analyze_results(results):
    """Print a comparison of the baseline and proposed arms and return tallies.

    Args:
        results: Dict with "baseline" and "proposed" keys, each a list of
            records carrying "id", "decision" ("fission"/"fusion") and
            "error" (truthy when the example failed).  Either list may be
            empty; error rates are then reported as 0.0%.

    Returns:
        Dict with "baseline_errors", "proposed_errors",
        "baseline_performance" and "proposed_performance".  The two
        performance values are the number of "fusion" decisions in each arm.
    """
    baseline = results["baseline"]
    proposed = results["proposed"]

    print("=== EXPERIMENT RESULTS ANALYSIS ===\n")

    # Count decisions
    baseline_fission = sum(1 for r in baseline if r["decision"] == "fission")
    baseline_fusion = sum(1 for r in baseline if r["decision"] == "fusion")
    proposed_fission = sum(1 for r in proposed if r["decision"] == "fission")
    proposed_fusion = sum(1 for r in proposed if r["decision"] == "fusion")

    print("üìä DECISION BREAKDOWN:")
    print(f"  Baseline:  {baseline_fission} fission, {baseline_fusion} fusion")
    print(f"  Proposed:  {proposed_fission} fission, {proposed_fusion} fusion")

    # Count errors
    baseline_errors = sum(1 for r in baseline if r["error"])
    proposed_errors = sum(1 for r in proposed if r["error"])

    # _error_rate guards the division so an empty arm no longer raises
    # ZeroDivisionError.
    baseline_rate = _error_rate(baseline_errors, len(baseline))
    proposed_rate = _error_rate(proposed_errors, len(proposed))

    print("\n‚ùå ERROR COUNTS:")
    print(f"  Baseline:  {baseline_errors}/{len(baseline)} errors ({baseline_rate:.1%})")
    print(f"  Proposed:  {proposed_errors}/{len(proposed)} errors ({proposed_rate:.1%})")

    # Performance metric: only fusion decisions count as high-performance.
    baseline_performance = baseline_fusion
    proposed_performance = proposed_fusion

    print("\nüöÄ PERFORMANCE OPPORTUNITIES:")
    print(f"  Baseline:  {baseline_performance} high-performance decisions")
    print(f"  Proposed:  {proposed_performance} high-performance decisions")

    if proposed_performance > baseline_performance:
        print(f"  üìà Proposed approach used {proposed_performance - baseline_performance} more high-performance decisions")

    # Show decision timeline; rows where the two arms disagree get a marker.
    print("\nüìã DECISION TIMELINE:")
    print(f"{'Example':<12} {'Error':<8} {'Baseline':<10} {'Proposed':<10}")
    print("-" * 42)
    for b, p in zip(baseline, proposed):
        error_symbol = "‚úó" if b["error"] else "‚úì"
        highlight = " üîÑ" if b["decision"] != p["decision"] else ""
        print(f"{b['id']:<12} {error_symbol:<8} {b['decision']:<10} {p['decision']:<10}{highlight}")

    return {
        "baseline_errors": baseline_errors,
        "proposed_errors": proposed_errors,
        "baseline_performance": baseline_performance,
        "proposed_performance": proposed_performance,
    }

# Print the comparison report for the records held in experiment_results and
# keep the returned dict of error / fusion tallies in `analysis`.
analysis = analyze_results(experiment_results)

## Results Analysis

Let's analyze the performance of both approaches and see how the DKW controller adapts its decisions based on the observed error rates.

In [None]:
def run_experiment(data):
    """Simulate the baseline and the DKW-controlled arm over ``data``.

    For every example an error is drawn (a uniform random number below the
    example's ``difficulty`` counts as an error), the observation is fed to a
    default-configured DKWController, and the controller's decision is logged
    next to an always-"fission" baseline.  A progress table is printed.

    Args:
        data: Iterable of dicts with "id" and "difficulty" keys.

    Returns:
        Dict with "baseline" and "proposed" lists holding one
        ``{"id", "decision", "error"}`` record per example.
    """
    dkw = DKWController()
    baseline_log = []
    proposed_log = []

    print("Running experiment...")
    print(f"{'Example':<12} {'Difficulty':<12} {'Error':<8} {'Baseline':<10} {'Proposed':<10} {'Samples':<8}")
    print("-" * 70)

    for sample in data:
        # Simulated outcome: True means this example produced an error.
        failed = np.random.random() < sample["difficulty"]
        dkw.add_observation(float(failed))
        mode = dkw.decide()

        proposed_log.append({"id": sample["id"], "decision": mode, "error": failed})
        # The baseline is always conservative but shares the same error draw.
        baseline_log.append({"id": sample["id"], "decision": "fission", "error": failed})

        # One progress row per example.
        mark = '‚úó' if failed else '‚úì'
        print(f"{sample['id']:<12} {sample['difficulty']:<12.3f} {mark:<8} {'fission':<10} {mode:<10} {len(dkw.samples):<8}")

    print("\n‚úì Experiment completed")
    return {"baseline": baseline_log, "proposed": proposed_log}

# Run the baseline-vs-proposed simulation on the inline test_data; the
# per-example records are kept in experiment_results for later cells.
experiment_results = run_experiment(test_data)

## Experiment Function

The `run_experiment` function compares two approaches:

1. **Baseline**: Always uses conservative "fission" mode
2. **Proposed**: Uses DKW controller to adaptively switch between modes

The function simulates errors based on each example's difficulty level and tracks the controller's decisions over time.

In [None]:
# Inline stand-in for the contents of "../dataset_001/data_out.json".
# Every example gets a sequential id and a difficulty in [0.0, 1.0]
# (0.0 = easy, 1.0 = very hard); the difficulty is used as the error chance.
test_data = [
    {"id": f"example_{index:03d}", "difficulty": difficulty}
    for index, difficulty in enumerate(
        [
            0.05,  # Very easy - 5% error chance
            0.08,  # Easy - 8% error chance
            0.15,  # Medium - 15% error chance
            0.12,  # Medium-easy - 12% error chance
            0.18,  # Medium-hard - 18% error chance
            0.22,  # Hard - 22% error chance
            0.09,  # Easy - 9% error chance
            0.14,  # Medium - 14% error chance
            0.25,  # Very hard - 25% error chance
            0.11,  # Medium-easy - 11% error chance
        ]
    )
]

# Reference output of the original script for the first three examples,
# written as (decision, error) pairs per arm and expanded into records.
expected_results = {
    arm: [
        {"id": f"example_{index:03d}", "decision": decision, "error": error}
        for index, (decision, error) in enumerate(rows)
    ]
    for arm, rows in (
        ("baseline", [("fission", False), ("fission", False), ("fission", True)]),
        ("proposed", [("fission", False), ("fusion", False), ("fusion", True)]),
    )
}

print(f"‚úì Loaded {len(test_data)} test examples")
print("‚úì Loaded expected results for comparison")
print(
    "  - Difficulty range: "
    f"{min(test_data, key=lambda ex: ex['difficulty'])['difficulty']:.2f}"
    " to "
    f"{max(test_data, key=lambda ex: ex['difficulty'])['difficulty']:.2f}"
)

## Inline Test Data

Instead of reading from external JSON files, we'll define the test data directly in the notebook to make it self-contained.

The data simulates examples with varying difficulty levels that affect error probability.

In [None]:
@dataclass
class DKWController:
    """Two-state (fusion/fission) controller driven by a DKW-style error bound.

    Observations are appended with ``add_observation``; ``decide`` compares an
    upper bound on the recent error rate against ``epsilon_target`` widened by
    ``hysteresis`` and flips ``current_state`` when the bound crosses it.
    """

    # Target error rate the upper bound is compared against.
    epsilon_target: float = 0.10
    # Failure probability used in the DKW epsilon formula.
    delta: float = 0.05
    # decide() leaves the state untouched until this many observations exist;
    # it is also the length of the averaging window.
    min_samples: int = 100
    # Margin subtracted from / added to epsilon_target when switching.
    hysteresis: float = 0.05

    # Every observation recorded so far, oldest first.
    samples: list = field(default_factory=list)
    # Either "fission" (initial) or "fusion".
    current_state: str = "fission"

    def dkw_epsilon(self, n: int) -> float:
        """Return sqrt(ln(2 / delta) / (2 n)), or 1.0 when ``n`` is below 2."""
        if n >= 2:
            return np.sqrt(np.log(2 / self.delta) / (2 * n))
        return 1.0

    def add_observation(self, error: float) -> None:
        """Record one error observation."""
        self.samples.append(error)

    def decide(self) -> str:
        """Update and return the current mode.

        Returns the unchanged state while fewer than ``min_samples``
        observations exist.  Otherwise the mean of the last ``min_samples``
        observations plus the DKW epsilon forms the upper bound:

        * in "fusion", switch to "fission" when the bound exceeds
          ``epsilon_target + hysteresis``;
        * in "fission", switch to "fusion" when the bound is below
          ``epsilon_target - hysteresis``.
        """
        seen = len(self.samples)
        if seen < self.min_samples:
            return self.current_state

        # NOTE(review): epsilon is sized by the total observation count while
        # the mean covers only the last min_samples entries -- confirm this
        # mismatch is intended.
        margin = self.dkw_epsilon(seen)
        recent = self.samples[-self.min_samples:]
        upper = np.mean(recent) + margin

        if self.current_state == "fusion":
            if upper > self.epsilon_target + self.hysteresis:
                self.current_state = "fission"
        elif upper < self.epsilon_target - self.hysteresis:
            self.current_state = "fusion"

        return self.current_state

# Smoke test: build a controller with the default settings and report them.
controller = DKWController()
print(
    "‚úì Created DKW Controller",
    f"  - Target error rate: {controller.epsilon_target}",
    f"  - Confidence level: {1 - controller.delta:.0%}",
    f"  - Initial state: {controller.current_state}",
    sep="\n",
)

## DKW Controller Class

The `DKWController` implements a statistical decision-making system using the **Dvoretzky-Kiefer-Wolfowitz inequality**.

### Key Parameters:
- `epsilon_target`: Target error rate threshold (default: 0.10 = 10%)
- `delta`: Allowed failure probability of the DKW bound; the confidence level is `1 - delta` (default: 0.05 = 95% confidence)
- `min_samples`: Minimum observations before making decisions (default: 100); until then `decide()` returns the current state unchanged
- `hysteresis`: Prevents oscillation between modes (default: 0.05)

### How it works:
1. **Collects error observations** from the system
2. **Computes DKW epsilon** - statistical bound on estimation error, $\varepsilon_n = \sqrt{\ln(2/\delta) / (2n)}$ for $n$ samples (fixed at 1.0 when $n < 2$)
3. **Calculates upper confidence bound** on error rate 
4. **Makes switching decisions** with hysteresis to prevent oscillation

In [None]:
"""DKW Controller Implementation."""
import json
import numpy as np
from dataclasses import dataclass, field

# Fix NumPy's global RNG seed so the random draws repeat from run to run.
np.random.seed(42)

print(
    "‚úì Imported required libraries",
    "‚úì Set random seed for reproducibility",
    sep="\n",
)

# DKW Controller Implementation Demo

This notebook demonstrates a **DKW-guided fusion/fission controller** that uses the Dvoretzky-Kiefer-Wolfowitz (DKW) inequality to make statistically-grounded decisions between fusion and fission modes.

## Overview
- **Artifact ID**: experiment_001
- **Original file**: method.py
- **Purpose**: Adaptive controller that switches between fusion/fission based on error rate observations

The controller uses statistical guarantees to decide when to switch between:
- **Fusion mode**: More aggressive, higher performance but potentially higher error rate
- **Fission mode**: Conservative, lower performance but more reliable