In [None]:
# Interactive parameter exploration function
def run_experiment_with_params(epsilon_target=0.10, delta=0.05, min_samples=100, hysteresis=0.05, data=None):
    """Run the simulated experiment with custom controller parameters.

    Args:
        epsilon_target: Target error rate forwarded to ``DKWController``.
        delta: Confidence parameter forwarded to ``DKWController``.
        min_samples: Minimum observation count forwarded to ``DKWController``.
        hysteresis: Switching margin forwarded to ``DKWController``.
        data: Iterable of example dicts with ``"id"`` and ``"difficulty"``
            keys. Defaults to the notebook-level ``sample_data`` when omitted.

    Returns:
        A ``(results, proposed_df)`` tuple. ``results`` is a dict with
        ``"baseline"`` and ``"proposed"`` lists of per-example records, and
        ``proposed_df`` is the ``"proposed"`` list as a DataFrame.

    Errors are drawn from NumPy's global RNG, so repeated calls give different
    outcomes unless the seed is reset first.
    """
    if data is None:
        data = sample_data

    # Create controller with custom parameters
    controller = DKWController(
        epsilon_target=epsilon_target,
        delta=delta,
        min_samples=min_samples,
        hysteresis=hysteresis
    )

    results = {"baseline": [], "proposed": []}

    for example in data:
        # An error occurs with probability equal to the example's difficulty.
        error = np.random.random() < example["difficulty"]
        controller.add_observation(float(error))
        decision = controller.decide()

        results["proposed"].append({
            "id": example["id"],
            "decision": decision,
            "error": error,
            # Running mean over every observation recorded so far.
            "empirical_error": np.mean(controller.samples) if controller.samples else 0.0
        })
        results["baseline"].append({
            "id": example["id"],
            "decision": "fission",
            "error": error,
        })

    # Quick analysis (only the proposed method is summarised here; the baseline
    # records remain available to callers through ``results``).
    proposed_df = pd.DataFrame(results["proposed"])

    fusion_rate = (proposed_df['decision'] == 'fusion').mean()
    error_rate = proposed_df['error'].mean()

    print(f"Parameters: target={epsilon_target}, δ={delta}, min_samples={min_samples}, hysteresis={hysteresis}")
    print(f"Results: {fusion_rate:.1%} fusion decisions, {error_rate:.3f} error rate")

    return results, proposed_df

# Example: sweep the target error rate, leaving every other parameter at its default
print("=== Comparing Different Target Error Rates ===")
for target in (0.05, 0.10, 0.15):
    run_experiment_with_params(epsilon_target=target)

## 7. Interactive Parameter Exploration

Try modifying the controller parameters below and re-run the experiment to see how they affect performance:

In [None]:
# Create visualizations: a 2x2 grid summarising the proposed controller's run
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# Target error rate drawn as the reference line in plot 2. It mirrors the
# default DKWController.epsilon_target that run_experiment() relies on.
target_error = 0.10

# Plot 1: Decision timeline for proposed method
axes[0,0].plot(range(len(proposed_df)), proposed_df['decision'].map({'fission': 0, 'fusion': 1}), 'b-', linewidth=2)
axes[0,0].set_title('DKW Controller Decision Timeline')
axes[0,0].set_xlabel('Example Index')
axes[0,0].set_ylabel('Decision (0=Fission, 1=Fusion)')
axes[0,0].grid(True, alpha=0.3)

# Plot 2: Empirical error rate over time
axes[0,1].plot(range(len(proposed_df)), proposed_df['empirical_error'], 'r-', linewidth=2, label='Empirical Error')
axes[0,1].axhline(y=target_error, color='g', linestyle='--', label=f'Target ({target_error:.0%})')
axes[0,1].set_title('Empirical Error Rate Over Time')
axes[0,1].set_xlabel('Example Index')
axes[0,1].set_ylabel('Error Rate')
axes[0,1].legend()
axes[0,1].grid(True, alpha=0.3)

# Plot 3: Error comparison bar chart
# Count errors vs correct for each method. run_experiment() stores the same
# simulated error in the baseline and proposed records, so the two bar groups
# are expected to match.
baseline_counts = baseline_df['error'].value_counts().sort_index()
proposed_counts = proposed_df['error'].value_counts().sort_index()

x = ['Correct', 'Error']
# .get(..., 0) covers the case where an outcome never occurred.
baseline_vals = [baseline_counts.get(False, 0), baseline_counts.get(True, 0)]
proposed_vals = [proposed_counts.get(False, 0), proposed_counts.get(True, 0)]

x_pos = np.arange(len(x))
width = 0.35

# Offset the two series by half a bar width so they sit side by side.
axes[1,0].bar(x_pos - width/2, baseline_vals, width, label='Baseline', alpha=0.7)
axes[1,0].bar(x_pos + width/2, proposed_vals, width, label='Proposed', alpha=0.7)
axes[1,0].set_title('Error Comparison')
axes[1,0].set_xlabel('Outcome')
axes[1,0].set_ylabel('Count')
axes[1,0].set_xticks(x_pos)
axes[1,0].set_xticklabels(x)
axes[1,0].legend()
axes[1,0].grid(True, alpha=0.3)

# Plot 4: Decision distribution pie chart
decision_counts = proposed_df['decision'].value_counts()
axes[1,1].pie(decision_counts.values, labels=decision_counts.index, autopct='%1.1f%%', startangle=90)
axes[1,1].set_title('Decision Distribution (DKW Controller)')

plt.tight_layout()
plt.show()

# Calculate and display key metrics
# The leading "\n" prints a blank line between the figure and the metrics.
print("\n=== Performance Metrics ===")
print(f"Total examples: {len(proposed_df)}")
print(f"Final empirical error rate: {proposed_df['empirical_error'].iloc[-1]:.4f}")

fusion_decisions = (proposed_df['decision'] == 'fusion').sum()
fission_decisions = (proposed_df['decision'] == 'fission').sum()
print(f"Fusion decisions: {fusion_decisions} ({fusion_decisions/len(proposed_df)*100:.1f}%)")
print(f"Fission decisions: {fission_decisions} ({fission_decisions/len(proposed_df)*100:.1f}%)")

# Calculate efficiency gains (assuming fusion is more efficient when safe).
# The baseline never chooses fusion, so the gain is the number of fusion
# decisions expressed as a percentage of the baseline's example count.
baseline_always_conservative = len(baseline_df)  # Always uses fission (conservative)
proposed_fusion_usage = fusion_decisions
efficiency_gain = proposed_fusion_usage / baseline_always_conservative * 100
print(f"Efficiency gain over baseline: {efficiency_gain:.1f}% more fusion usage")

## 6. Results Analysis and Visualization

Let's visualize the results to better understand how the DKW controller behaves:

In [None]:
# Execute the simulation over the inlined dataset
results = run_experiment(sample_data)

# One DataFrame per method so the summaries below are plain column operations
baseline_df, proposed_df = (
    pd.DataFrame(results[method]) for method in ("baseline", "proposed")
)

# Headline numbers, emitted one per line
summary_lines = [
    "Experiment completed!",
    f"Total examples processed: {len(baseline_df)}",
    f"Baseline (always fission) error rate: {baseline_df['error'].mean():.3f}",
    f"Proposed method error rate: {proposed_df['error'].mean():.3f}",
]
print("\n".join(summary_lines))

# How often each method chose each decision
baseline_decision_counts = baseline_df['decision'].value_counts().to_dict()
proposed_decision_counts = proposed_df['decision'].value_counts().to_dict()
print(f"\nBaseline decisions: {baseline_decision_counts}")
print(f"Proposed decisions: {proposed_decision_counts}")

# Preview the first rows of the proposed method's records
preview_columns = ['id', 'decision', 'error', 'difficulty', 'samples_so_far', 'empirical_error']
print("\nFirst 10 results (Proposed method):")
print(proposed_df[preview_columns].head(10))

## 5. Run the Experiment

Let's run our experiment and compare the DKW controller against the conservative baseline:

In [None]:
def run_experiment(data):
    """Simulate the DKW controller and the always-fission baseline over ``data``.

    Each example is a dict with ``"id"`` and ``"difficulty"`` keys. An error
    is drawn for it with probability ``difficulty``, fed to a default
    ``DKWController``, and the controller's decision is recorded next to a
    baseline record that always chooses ``"fission"``.

    Returns a dict with ``"baseline"`` and ``"proposed"`` lists of
    per-example records.
    """
    controller = DKWController()
    baseline_rows = []
    proposed_rows = []

    for example in data:
        # Bernoulli draw: True means this example produced an error
        error = np.random.random() < example["difficulty"]
        controller.add_observation(float(error))
        decision = controller.decide()

        observed = controller.samples
        running_error = np.mean(observed) if observed else 0.0

        proposed_rows.append(dict(
            id=example["id"],
            decision=decision,
            error=error,
            difficulty=example["difficulty"],
            samples_so_far=len(observed),
            empirical_error=running_error,
        ))
        # The baseline is always conservative and never leaves fission
        baseline_rows.append(dict(
            id=example["id"],
            decision="fission",
            error=error,
            difficulty=example["difficulty"],
        ))

    return {"baseline": baseline_rows, "proposed": proposed_rows}

print("Experiment function defined successfully!")

## 4. Experiment Function

Now let's define the experiment function that will test both our DKW controller and a conservative baseline that always chooses "fission".

In [None]:
# Sample dataset (inlined from ../dataset_001/data_out.json)
# Every example carries an ID and a difficulty level (probability of error).
# The first ten difficulties are written out by hand; the rest are synthetic.
seed_difficulties = [
    0.02,  # Easy example
    0.05,  # Medium-easy example
    0.15,  # Harder example
    0.08,  # Medium example
    0.03,  # Easy example
    0.12,  # Hard example
    0.06,  # Medium example
    0.18,  # Very hard example
    0.04,  # Easy example
    0.09,  # Medium example
]
sample_data = [
    {"id": f"example_{idx:03d}", "difficulty": level}
    for idx, level in enumerate(seed_difficulties)
]

# Extend the dataset with more examples for a more realistic experiment.
# Beta(2, 8) is skewed toward lower difficulties; each draw is capped at 25%.
sample_data.extend(
    {"id": f"example_{idx:03d}", "difficulty": min(np.random.beta(2, 8), 0.25)}
    for idx in range(10, 150)
)

difficulties = [ex['difficulty'] for ex in sample_data]
print(f"Generated {len(sample_data)} examples")
print(f"Difficulty range: {min(difficulties):.3f} - {max(difficulties):.3f}")
print(f"Average difficulty: {np.mean(difficulties):.3f}")

## 3. Sample Data (Inlined)

Instead of reading from external files, we'll define our sample data directly in the notebook. This data represents examples with varying difficulty levels that will be used to test our controller.

In [None]:
@dataclass
class DKWController:
    """DKW-guided fusion/fission controller.

    Collects binary error observations and switches between the ``"fission"``
    and ``"fusion"`` states by comparing an upper bound on the recent error
    rate against ``epsilon_target``, with a ``hysteresis`` margin on either
    side of the target.
    """
    # Target error rate that the upper bound is compared against.
    epsilon_target: float = 0.10
    # Confidence parameter used in the DKW epsilon formula.
    delta: float = 0.05
    # Observations required before deciding; also the size of the window of
    # most recent observations averaged in decide().
    min_samples: int = 100
    # Margin around epsilon_target that must be crossed to change state.
    hysteresis: float = 0.05

    # Error observations in arrival order.
    samples: list = field(default_factory=list)
    # Current decision; the controller starts in "fission".
    current_state: str = "fission"

    def dkw_epsilon(self, n: int) -> float:
        """Compute DKW epsilon for n samples.

        Returns ``sqrt(ln(2 / delta) / (2 * n))``, or ``1.0`` when fewer than
        two samples are available.
        """
        if n < 2:
            return 1.0
        return np.sqrt(np.log(2 / self.delta) / (2 * n))

    def add_observation(self, error: float) -> None:
        """Add error observation for calibration."""
        self.samples.append(error)

    def decide(self) -> str:
        """Make fusion/fission decision with DKW guarantee.

        Returns the current state unchanged while there are no observations
        or fewer than ``min_samples`` of them. Otherwise the mean of the most
        recent ``min_samples`` observations plus the DKW epsilon forms an
        upper bound, and the state flips only when that bound crosses
        ``epsilon_target`` by more than ``hysteresis``.
        """
        n = len(self.samples)
        # The n == 0 guard matters when min_samples <= 0: without it the mean
        # below would be taken over an empty list (NaN plus a RuntimeWarning).
        if n == 0 or n < self.min_samples:
            return self.current_state

        # A plain ``samples[-0:]`` slice selects the whole list; spell that
        # case out so a non-positive min_samples means "use every sample".
        window = self.samples[-self.min_samples:] if self.min_samples > 0 else self.samples

        # NOTE(review): epsilon is computed from the total sample count n,
        # while the mean covers only the most recent window. Confirm whether
        # the bound should use the window size instead.
        epsilon = self.dkw_epsilon(n)
        empirical_error = np.mean(window)
        error_upper_bound = empirical_error + epsilon

        if self.current_state == "fusion":
            # Leave fusion only once the bound exceeds the target by the margin.
            if error_upper_bound > self.epsilon_target + self.hysteresis:
                self.current_state = "fission"
        else:
            # Enter fusion only once the bound is below the target by the margin.
            if error_upper_bound < self.epsilon_target - self.hysteresis:
                self.current_state = "fusion"

        return self.current_state

# Smoke-test the controller: report its starting state and a reference bound
controller = DKWController()
initial_state = controller.current_state
epsilon_at_100 = controller.dkw_epsilon(100)
print(f"Initial state: {initial_state}")
print(f"DKW epsilon for 100 samples: {epsilon_at_100:.4f}")

## 2. DKW Controller Class

The core of our implementation is the `DKWController` class. This controller uses the **Dvoretzky-Kiefer-Wolfowitz inequality** to bound the error rate with high confidence.

### Key Parameters:
- `epsilon_target`: Target error rate threshold (10% by default)
- `delta`: Confidence parameter for DKW bound (95% confidence with δ=0.05)
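- `min_samples`: Minimum number of observations required before the controller makes a decision; it is also the size of the window of most recent observations used to estimate the error rate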
- `hysteresis`: Margin around `epsilon_target` that the error bound must cross before the state changes, which prevents oscillation between states

In [None]:
import json
import numpy as np
from dataclasses import dataclass, field
import matplotlib.pyplot as plt
import pandas as pd

# Set random seed for reproducibility
# This seeds NumPy's global RNG, which the np.random.beta and np.random.random
# calls in the other cells draw from. The simulated numbers therefore depend
# on the order in which cells are run after this one.
np.random.seed(42)

print("All imports successful!")

## 1. Imports and Setup

First, let's import the required libraries:

# DKW Controller Implementation - Interactive Demo

This notebook demonstrates a **DKW-guided fusion/fission controller** implementation from `method.py`. 

The controller uses the **Dvoretzky-Kiefer-Wolfowitz (DKW) inequality** to provide statistical guarantees when deciding between "fusion" and "fission" modes based on observed error rates.

## Key Features:
- Statistical error bounds using DKW inequality
- Adaptive decision making with hysteresis
- Comparison between proposed method and conservative baseline