In [None]:
# 💾 Optional: Save results to JSON file (matches original script output)
save_results = True  # Set to False if you don't want to save

if save_results:
    # Keep only the fields the original script wrote ("id", "decision", "error"),
    # for both methods, in the same key order as before.
    output_results = {
        method: [
            {
                "id": r["id"],
                "decision": r["decision"],
                # Coerce to a plain bool so json.dump also works if the flag
                # ever arrives as a numpy bool scalar.
                "error": bool(r["error"]),
            }
            for r in results[method]
        ]
        for method in ("baseline", "proposed")
    }

    # Save to file (explicit encoding keeps the output platform-independent)
    with open("method_out.json", "w", encoding="utf-8") as f:
        json.dump(output_results, f, indent=2)

    print("✅ Results saved to 'method_out.json'")
    print(f"   Baseline results: {len(output_results['baseline'])} entries")
    print(f"   Proposed results: {len(output_results['proposed'])} entries")
else:
    print("💡 Set save_results=True above to save results to JSON file")

print()
print("🎉 Notebook complete! The DKW Controller has been successfully demonstrated.")
print("   Feel free to modify parameters and re-run cells to explore different behaviors.")

## Conclusions & Next Steps

### Key Findings:
1. **Adaptive Control**: The DKW controller automatically adjusts between fusion and fission based on observed error rates
2. **Statistical Guarantees**: The DKW bound provides high-confidence upper bounds on true error rates
3. **Hysteresis Effect**: Prevents rapid switching between modes, providing stability
4. **Trade-offs**: More fusion usage can improve efficiency but may increase error rates

### Parameters to Experiment With:
- **`epsilon_target`**: Lower values = more conservative, higher values = more aggressive
- **`delta`**: Lower values = higher confidence but wider (more conservative) bounds
- **`min_samples`**: Higher values = more data before decisions but slower adaptation
- **`hysteresis`**: Higher values = less switching but potentially suboptimal decisions

### Potential Applications:
- Adaptive system configuration
- Quality vs. speed trade-offs in ML pipelines
- Resource allocation under uncertainty
- A/B testing with statistical guarantees

In [None]:
# 🧪 INTERACTIVE EXPERIMENT - Try different parameters!

# Modify these parameters and re-run to see the impact
CUSTOM_EPSILON_TARGET = 0.08   # Try values like 0.05, 0.10, 0.15
CUSTOM_DELTA = 0.05           # Try values like 0.01, 0.05, 0.10
CUSTOM_MIN_SAMPLES = 75       # Try values like 50, 100, 150
CUSTOM_HYSTERESIS = 0.03      # Try values like 0.01, 0.05, 0.10

# Create custom controller
custom_controller = DKWController(
    epsilon_target=CUSTOM_EPSILON_TARGET,
    delta=CUSTOM_DELTA,
    min_samples=CUSTOM_MIN_SAMPLES,
    hysteresis=CUSTOM_HYSTERESIS
)

print("🔬 Testing custom parameters:")
print(f"  Target error rate: {CUSTOM_EPSILON_TARGET:.1%}")
print(f"  Confidence level: {100*(1-CUSTOM_DELTA):.0f}%")
print(f"  Min samples: {CUSTOM_MIN_SAMPLES}")
print(f"  Hysteresis: {CUSTOM_HYSTERESIS:.3f}")
print()

# Quick simulation with custom parameters
custom_results = {"decisions": [], "errors": [], "state_changes": 0}
previous_decision = "fission"  # Matches the controller's initial state

for i, example in enumerate(sample_data[:200]):  # Test on first 200 examples
    # An error occurs with probability equal to the example's difficulty
    error = np.random.random() < example["difficulty"]
    custom_controller.add_observation(float(error))
    decision = custom_controller.decide()

    custom_results["decisions"].append(decision)
    custom_results["errors"].append(error)

    # Report every switch between fusion and fission as it happens
    if decision != previous_decision:
        custom_results["state_changes"] += 1
        print(f"  State change at example {i}: {previous_decision} → {decision}")
        previous_decision = decision

custom_fusion_rate = sum(1 for d in custom_results["decisions"] if d == "fusion") / len(custom_results["decisions"])
custom_error_rate = sum(custom_results["errors"]) / len(custom_results["errors"])

print()
print("📊 Custom controller results (first 200 examples):")
print(f"  Fusion rate: {custom_fusion_rate:.1%}")
print(f"  Error rate: {custom_error_rate:.3f}")
print(f"  State changes: {custom_results['state_changes']}")

## Interactive Exploration

Try modifying the parameters below to see how they affect the controller's behavior!

In [None]:
# Create visualizations
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(15, 10))

# Reference levels drawn on the plots. They mirror the DKWController defaults
# that run_experiment() uses; update them if the controller is configured differently.
TARGET_ERROR_RATE = 0.10
HYSTERESIS = 0.05

# Extract data for plotting
examples = range(len(results["proposed"]))
difficulties = [r["difficulty"] for r in results["proposed"]]
decisions = [1 if r["decision"] == "fusion" else 0 for r in results["proposed"]]
errors = [1 if r["error"] else 0 for r in results["proposed"]]
empirical_errors = [r["empirical_error"] for r in results["proposed"]]
dkw_bounds = [r["dkw_bound"] for r in results["proposed"]]

# Plot 1: Controller decisions over time
ax1.fill_between(examples, decisions, alpha=0.7, label="DKW Controller")
ax1.axhline(y=0.5, color='gray', linestyle='--', alpha=0.5)
ax1.set_xlabel("Example Index")
ax1.set_ylabel("Decision (0=Fission, 1=Fusion)")
ax1.set_title("Controller Decisions Over Time")
ax1.set_ylim(-0.1, 1.1)
ax1.legend()

# Plot 2: Difficulty and errors
error_indices = [i for i, e in enumerate(errors) if e]
ax2.scatter(examples, difficulties, alpha=0.6, s=10, label="Difficulty", color='orange')
ax2.scatter(error_indices, [difficulties[i] for i in error_indices],
            s=20, color='red', alpha=0.8, label="Errors", marker='x')
ax2.axhline(y=TARGET_ERROR_RATE, color='blue', linestyle='--', alpha=0.7, label="Target Error Rate")
ax2.set_xlabel("Example Index")
ax2.set_ylabel("Difficulty / Error Indicator")
ax2.set_title("Difficulty and Error Distribution")
ax2.legend()

# Plot 3: Empirical error rate with DKW bounds
# Rolling mean of the error indicator over the trailing `window_size` examples
# (shorter at the start, where fewer examples are available).
window_size = 50
rolling_error = [
    np.mean(errors[max(0, i - window_size + 1):i + 1])
    for i in range(len(errors))
]

ax3.plot(examples, rolling_error, label="Rolling Error Rate", color='red', linewidth=2)
ax3.plot(examples, [r + b for r, b in zip(empirical_errors, dkw_bounds)],
         label="Empirical + DKW Bound", color='orange', alpha=0.8)
ax3.axhline(y=TARGET_ERROR_RATE, color='blue', linestyle='--', label="Target (10%)")
ax3.axhline(y=TARGET_ERROR_RATE + HYSTERESIS, color='blue', linestyle=':', alpha=0.7, label="Target + Hysteresis")
ax3.axhline(y=TARGET_ERROR_RATE - HYSTERESIS, color='blue', linestyle=':', alpha=0.7, label="Target - Hysteresis")
ax3.set_xlabel("Example Index")
ax3.set_ylabel("Error Rate")
ax3.set_title("Error Rate Evolution with DKW Bounds")
ax3.legend()

# Plot 4: Performance comparison
methods = ['Baseline\n(Always Fission)', 'Proposed\n(DKW Controller)']
error_rates = [baseline_error_rate, proposed_error_rate]
fusion_rates = [baseline_fusion_count/len(results['baseline']), proposed_fusion_count/len(results['proposed'])]

x = np.arange(len(methods))
width = 0.35

bars1 = ax4.bar(x - width/2, error_rates, width, label='Error Rate', color='lightcoral')
bars2 = ax4.bar(x + width/2, fusion_rates, width, label='Fusion Rate', color='lightblue')

ax4.set_xlabel("Method")
ax4.set_ylabel("Rate")
ax4.set_title("Performance Comparison")
ax4.set_xticks(x)
ax4.set_xticklabels(methods)
ax4.legend()

# Add value labels on bars
for bars in [bars1, bars2]:
    for bar in bars:
        height = bar.get_height()
        ax4.annotate(f'{height:.3f}',
                    xy=(bar.get_x() + bar.get_width() / 2, height),
                    xytext=(0, 3),  # 3 points vertical offset
                    textcoords="offset points",
                    ha='center', va='bottom')

plt.tight_layout()
plt.show()

print("\n📈 Visualization shows:")
print("• Top-left: When the controller chose fusion vs fission")
print("• Top-right: Example difficulty and where errors occurred")
print("• Bottom-left: How error rates evolved with statistical bounds")
print("• Bottom-right: Overall performance comparison")

## Visualization

Let's create visualizations to better understand the controller's behavior over time.

In [None]:
# Basic statistics
n_baseline = len(results["baseline"])
n_proposed = len(results["proposed"])

baseline_errors = sum(1 for r in results["baseline"] if r["error"])
proposed_errors = sum(1 for r in results["proposed"] if r["error"])

baseline_error_rate = baseline_errors / n_baseline
proposed_error_rate = proposed_errors / n_proposed

# Count decisions
baseline_fusion_count = sum(1 for r in results["baseline"] if r["decision"] == "fusion")
proposed_fusion_count = sum(1 for r in results["proposed"] if r["decision"] == "fusion")

print("📊 EXPERIMENT RESULTS")
print("=" * 50)
print(f"Total examples processed: {n_baseline}")
print()

print("🎯 ERROR RATES:")
print(f"  Baseline (always fission): {baseline_error_rate:.3f} ({baseline_errors}/{n_baseline})")
print(f"  Proposed (DKW controller): {proposed_error_rate:.3f} ({proposed_errors}/{n_proposed})")
print()

print("⚡ FUSION USAGE:")
print(f"  Baseline: {baseline_fusion_count} fusion decisions ({baseline_fusion_count/n_baseline:.1%})")
print(f"  Proposed: {proposed_fusion_count} fusion decisions ({proposed_fusion_count/n_proposed:.1%})")
print()

print("🔄 STATE CHANGES:")
if state_changes:
    print(f"  Total changes: {len(state_changes)}")
    for change in state_changes:
        print(f"    Example {change['example_id']:3d}: {change['from_state']} → {change['to_state']} " +
              f"(samples: {change['samples_so_far']}, error rate: {change['empirical_error']:.3f})")
else:
    print("  No state changes occurred")

## Results Analysis

Let's analyze the performance of both approaches and visualize the controller's behavior.

In [None]:
def run_experiment(data):
    """Run the DKW controller experiment against an always-fission baseline.

    Each example's 'difficulty' is used as the probability that an error
    occurs; the same simulated error is recorded for both methods.

    Args:
        data: Iterable of examples, each a dict with 'id' and 'difficulty' fields.

    Returns:
        A ``(results, state_changes)`` tuple:
            results: dict with 'baseline' and 'proposed' lists holding one
                record per example.
            state_changes: list of dicts, one for every example whose decision
                differs from the previous example's decision.
    """
    controller = DKWController()
    results = {"baseline": [], "proposed": []}

    # Track controller state changes for analysis
    state_changes = []
    previous_decision = None  # No previous decision before the first example

    for i, example in enumerate(data):
        # Simulate error occurrence based on difficulty
        # Higher difficulty = higher probability of error
        error = np.random.random() < example["difficulty"]

        # Add observation to controller and get decision
        controller.add_observation(float(error))
        decision = controller.decide()

        # The controller holds at least one sample here (one was just added),
        # so the running statistics are always defined.
        n_samples = len(controller.samples)
        empirical_error = np.mean(controller.samples)

        # Track state changes
        if previous_decision is not None and decision != previous_decision:
            state_changes.append({
                "example_id": i,
                "from_state": previous_decision,
                "to_state": decision,
                "samples_so_far": n_samples,
                "empirical_error": empirical_error
            })

        # Store results for proposed method (DKW controller)
        results["proposed"].append({
            "id": example["id"],
            "decision": decision,
            "error": error,
            "difficulty": example["difficulty"],
            "empirical_error": empirical_error,
            "dkw_bound": controller.dkw_epsilon(n_samples)
        })

        # Store results for baseline (always fission)
        results["baseline"].append({
            "id": example["id"],
            "decision": "fission",  # Always conservative
            "error": error,
            "difficulty": example["difficulty"]
        })

        previous_decision = decision

    return results, state_changes

# Run the experiment
print("🚀 Running experiment...")
results, state_changes = run_experiment(sample_data)

print("✅ Experiment completed!")
print(f"Processed {len(results['proposed'])} examples")
print(f"Controller made {len(state_changes)} state changes")

## Experiment Function

The experiment compares two approaches:
1. **Baseline**: Always uses fission (conservative) mode
2. **Proposed**: Uses the DKW controller to adaptively switch between fusion and fission

Both approaches process the same sequence of examples and observe the same errors (based on difficulty).

## Usage Notes

- **Self-contained**: This notebook requires no external files - all sample data is inlined
- **Customizable**: You can modify the data collection parameters (e.g., change `test[:200]` to get more/fewer examples)
- **Interactive**: Run cells individually to explore the data step by step
- **Original functionality**: The core logic matches the original `data.py` script

To get started, simply run all cells from top to bottom!

In [None]:
# Build the synthetic dataset in-notebook (no external JSON file needed).
# Difficulty is drawn uniformly from a range that depends on the example's position.

np.random.seed(42)  # Fixed seed so the dataset is identical on every run
n_examples = 500

# Index boundaries between the easy / medium / hard thirds of the dataset
first_cut = n_examples // 3
second_cut = 2 * n_examples // 3

sample_data = []
for idx in range(n_examples):
    if idx >= second_cut:
        low, high = 0.15, 0.35   # Hard examples (last third)
    elif idx >= first_cut:
        low, high = 0.08, 0.25   # Medium examples (middle third)
    else:
        low, high = 0.05, 0.15   # Easy examples (first third)

    sample_data.append({
        "id": f"example_{idx:03d}",
        "difficulty": np.random.uniform(low, high),
    })

all_difficulties = [ex["difficulty"] for ex in sample_data]

print(f"Created {len(sample_data)} examples")
print(f"Difficulty range: {min(all_difficulties):.3f} - {max(all_difficulties):.3f}")

# Preview the head of the dataset
print("\nFirst 5 examples:")
for ex in sample_data[:5]:
    print(f"  {ex['id']}: difficulty = {ex['difficulty']:.3f}")

In [None]:
# Optional: Save the collected data to a JSON file
# Uncomment the lines below if you want to save the data
# (`data` is the list returned by collect_data() in the collection cell).

# with open("data_out.json", "w") as f:
#     json.dump(data, f, indent=2)
# print(f"Saved {len(data)} examples to data_out.json")

# As shipped, this cell only prints the reminder below and writes nothing.
print("To save data, uncomment the lines above and run this cell")

## Optional: Save Data to File

If you want to save the collected data to a JSON file (as the original script did), you can run the following cell:

## Sample Dataset

Instead of reading from external files, we'll create sample data inline. The dataset simulates examples with varying difficulty levels that influence error probability.

In [None]:
# Example of what the original script would have written to data_out.json.
# Inlined so the notebook stays self-contained.
_sample_rows = [
    # (question, answer, difficulty)
    ("What is 2+2?", "4", 0.15),
    ("If x=5, what is 2x?", "10", 0.22),
    ("Solve: 3y + 6 = 15", "y=3", 0.28),
]

sample_output_data = [
    {
        "id": f"example_{position:03d}",
        "question": question,
        "answer": answer,
        "difficulty": difficulty,
    }
    for position, (question, answer, difficulty) in enumerate(_sample_rows)
]

print("Sample output data structure:")
print(json.dumps(sample_output_data, indent=2))

In [None]:
@dataclass
class DKWController:
    """DKW-guided fusion/fission controller.

    Collects per-example error observations and switches between the
    'fusion' and 'fission' states by comparing an upper bound on the error
    rate against ``epsilon_target``, with a hysteresis band around it.

    Raises:
        ValueError: If ``delta`` is not strictly between 0 and 1, or if
            ``min_samples`` is less than 1.
    """
    epsilon_target: float = 0.10    # Target error rate (10%)
    delta: float = 0.05             # Confidence parameter (95% confidence)
    min_samples: int = 100           # Minimum samples before decision making
    hysteresis: float = 0.05         # Hysteresis to prevent oscillation

    samples: list = field(default_factory=list)
    current_state: str = "fission"   # Start conservatively

    def __post_init__(self) -> None:
        """Reject parameters that would break the bound computation."""
        # delta <= 0 makes 2 / delta undefined or negative, and delta >= 1
        # corresponds to a confidence level of 0% or less.
        if not 0.0 < self.delta < 1.0:
            raise ValueError(f"delta must be in (0, 1), got {self.delta!r}")
        # decide() averages the last `min_samples` observations, so the window
        # must contain at least one sample.
        if self.min_samples < 1:
            raise ValueError(f"min_samples must be >= 1, got {self.min_samples!r}")

    def dkw_epsilon(self, n: int) -> float:
        """Compute DKW epsilon for n samples.

        This uses the Dvoretzky-Kiefer-Wolfowitz inequality to bound
        the difference between empirical and true error rates.

        Args:
            n: Number of samples the bound is computed for.

        Returns:
            ``sqrt(ln(2 / delta) / (2 * n))``, or 1.0 when ``n < 2``.
        """
        if n < 2:
            return 1.0  # Very conservative bound for small samples
        return np.sqrt(np.log(2 / self.delta) / (2 * n))

    def add_observation(self, error: float) -> None:
        """Add error observation for calibration.

        Args:
            error: Error indicator for one example (callers pass 0.0 or 1.0).
        """
        self.samples.append(error)

    def decide(self) -> str:
        """Make fusion/fission decision with DKW guarantee.

        Until ``min_samples`` observations have been collected the current
        state is returned unchanged.

        Returns:
            'fusion' for aggressive mode, 'fission' for conservative mode
        """
        n = len(self.samples)
        if n < self.min_samples:
            return self.current_state

        # Compute statistical bound
        # NOTE(review): epsilon is computed for all n samples, while the
        # empirical error only averages the most recent `min_samples` of them.
        # Confirm whether the bound should use the window size instead.
        epsilon = self.dkw_epsilon(n)
        empirical_error = np.mean(self.samples[-self.min_samples:])
        error_upper_bound = empirical_error + epsilon

        # State transition logic with hysteresis
        if self.current_state == "fusion":
            # Switch to fission if error bound exceeds target + hysteresis
            if error_upper_bound > self.epsilon_target + self.hysteresis:
                self.current_state = "fission"
        else:  # current_state == "fission"
            # Switch to fusion if error bound is below target - hysteresis
            if error_upper_bound < self.epsilon_target - self.hysteresis:
                self.current_state = "fusion"

        return self.current_state

# Instantiate a controller with default settings and show its configuration
controller = DKWController()
confidence_percent = 100 * (1 - controller.delta)

print(f"Initial state: {controller.current_state}")
print(f"Target error rate: {controller.epsilon_target:.1%}")
print(f"Confidence level: {confidence_percent:.0f}%")

## Sample Output Data

For reference, here's what the original script would have written to `data_out.json`. This sample data is now inlined to make the notebook self-contained:

In [None]:
# Run the collection step and report how many examples came back
data = collect_data()
print(f"Collected {len(data)} examples")

# Preview up to three records (fewer if the dataset is smaller)
print("\nFirst 3 examples:")
for position, record in enumerate(data[:3], start=1):
    question_preview = record['question'][:100]  # Truncate for display
    print(f"\nExample {position}:")
    print(f"  ID: {record['id']}")
    print(f"  Question: {question_preview}...")
    print(f"  Answer: {record['answer']}")
    print(f"  Difficulty: {record['difficulty']:.2f}")

## Collect and Process Data

Now let's run the data collection function and see how many examples we get:

## DKW Controller Implementation

The `DKWController` class implements an adaptive controller that switches between fusion (aggressive) and fission (conservative) modes based on empirical error observations and statistical confidence bounds.

### Key Parameters:
- **`epsilon_target`**: Target error rate (10% by default)
- **`delta`**: Confidence level parameter (5% means 95% confidence)
- **`min_samples`**: Minimum samples needed before making decisions
- **`hysteresis`**: Buffer to prevent rapid oscillations between states

In [None]:
def collect_data(split="test[:200]"):
    """Collect benchmark data for DKW controller evaluation.

    Args:
        split: Split expression passed to ``load_dataset``. Defaults to
            ``"test[:200]"``, the value that was previously hard-coded.

    Returns:
        List of dicts with 'id', 'question', 'answer' and 'difficulty' keys,
        one per dataset example.
    """
    # Load HuggingFace dataset
    ds = load_dataset("gsm8k", "main", split=split)

    return [
        {
            "id": f"example_{i:03d}",
            "question": example["question"],
            "answer": example["answer"],
            "difficulty": len(example["question"]) / 100,  # Simple proxy
        }
        for i, example in enumerate(ds)
    ]

In [None]:
# Import required libraries
import json
import numpy as np
from dataclasses import dataclass, field
import matplotlib.pyplot as plt
import pandas as pd

# Set random seed for reproducibility
np.random.seed(42)

print("✅ All libraries imported successfully!")

## Data Collection Function

The `collect_data()` function loads the GSM8K dataset from HuggingFace and processes it into a standardized format. Each example includes:
- **id**: Unique identifier for the example
- **question**: The math problem text
- **answer**: The correct answer
- **difficulty**: A simple proxy based on question length

# DKW Controller Implementation (Experiment 001)

This notebook demonstrates a **DKW-guided fusion/fission controller** that makes adaptive decisions based on statistical confidence bounds. The controller uses the Dvoretzky-Kiefer-Wolfowitz (DKW) inequality to provide probabilistic guarantees on error rates.

## Key Concepts:
- **Fusion**: Aggressive strategy (faster but potentially more errors)
- **Fission**: Conservative strategy (slower but safer) 
- **DKW Bound**: Statistical bound on how far the empirical error rate can deviate from the true error rate, holding with high confidence
- **Hysteresis**: Prevents oscillation between states

## Notebook Structure:
1. **Imports & Setup**: Required libraries
2. **DKW Controller Class**: Core implementation with adaptive decision making
3. **Sample Data**: Inline dataset for experiments
4. **Experiment Function**: Comparison between baseline and proposed methods  
5. **Results & Analysis**: Visualization and interpretation

In [None]:
"""Dataset collection script for DKW benchmark."""
import json
from datasets import load_dataset

## Setup and Imports

First, let's import the required libraries. This notebook uses the HuggingFace `datasets` library to load the GSM8K dataset.

# Dataset Collection for DKW Benchmark

This notebook demonstrates the `data.py` script converted into an interactive format. It collects benchmark data for DKW controller evaluation using the GSM8K dataset.

In [None]:
# Experiment with different parameters
def experiment_with_parameters(epsilon_target=0.10, delta=0.05, min_samples=100, hysteresis=0.05):
    """Simulate the controller on `sample_data` with the given settings.

    Reseeds NumPy's global generator with 42, feeds one simulated error per
    example to a fresh controller, prints a summary of the decisions, and
    returns the per-example records for the baseline and proposed methods.
    """
    np.random.seed(42)  # Same error sequence for every parameter choice

    custom_controller = DKWController(
        epsilon_target=epsilon_target,
        delta=delta,
        min_samples=min_samples,
        hysteresis=hysteresis,
    )

    baseline_records = []
    proposed_records = []
    for example in sample_data:
        example_id = example["id"]
        error = np.random.random() < example["difficulty"]
        custom_controller.add_observation(float(error))
        decision = custom_controller.decide()

        proposed_records.append({"id": example_id, "decision": decision, "error": error})
        baseline_records.append({"id": example_id, "decision": "fission", "error": error})

    results = {"baseline": baseline_records, "proposed": proposed_records}

    # Tally how often each mode was chosen
    proposed_decisions = [record["decision"] for record in proposed_records]
    fusion_count = sum(1 for d in proposed_decisions if d == "fusion")
    fission_count = sum(1 for d in proposed_decisions if d == "fission")
    decision_sequence = ' -> '.join(proposed_decisions)

    print(f"Controller Parameters:")
    print(f"  epsilon_target: {epsilon_target}")
    print(f"  delta: {delta}")
    print(f"  min_samples: {min_samples}")
    print(f"  hysteresis: {hysteresis}")
    print(f"\nDecisions: {fusion_count} fusion, {fission_count} fission")
    print(f"Decision sequence: {decision_sequence}")

    return results

# Try default parameters
print("=== DEFAULT PARAMETERS ===")
default_results = experiment_with_parameters()

# A lower target error makes it harder for the error bound to fall below the
# fusion threshold, so the controller behaves more conservatively.
print("\n=== MORE CONSERVATIVE (lower target error) ===")
conservative_results = experiment_with_parameters(epsilon_target=0.05)

# A higher target error tolerates more errors before leaving fusion,
# so the controller behaves more aggressively.
print("\n=== MORE AGGRESSIVE (higher target error) ===")
aggressive_results = experiment_with_parameters(epsilon_target=0.15)

## Interactive Parameter Tuning

Try experimenting with different controller parameters to see how they affect behavior:

In [None]:
# Create a more detailed analysis with visualization
def analyze_controller_behavior(data, results, epsilon_target=0.10):
    """Analyze and visualize controller behavior.

    Draws a 2x2 figure (decision timeline, errors vs difficulty, decisions
    vs difficulty, running error rate) and prints a short summary.

    Args:
        data: Not used by the function; kept so existing calls keep working.
        results: Dict whose 'proposed' list holds per-example records with
            'decision', 'error' and 'difficulty' keys.
        epsilon_target: Target error rate shown as the reference line and in
            the printed summary. Defaults to 0.10, the previously
            hard-coded value.
    """
    records = results["proposed"]
    if not records:
        # Nothing to plot, and the final-state summary would index an empty list.
        print("No results to analyze")
        return

    # Extract decision sequences
    proposed_decisions = [r["decision"] for r in records]
    errors = [r["error"] for r in records]
    difficulties = [r["difficulty"] for r in records]
    decision_numeric = [1 if d == "fusion" else 0 for d in proposed_decisions]

    fig, ((ax_timeline, ax_errors), (ax_decisions, ax_running)) = plt.subplots(
        2, 2, figsize=(12, 8)
    )

    # Plot 1: Decision timeline
    ax_timeline.plot(decision_numeric, 'bo-', label='DKW Controller')
    ax_timeline.axhline(y=0, color='r', linestyle='--', alpha=0.5, label='Baseline (Always Fission)')
    ax_timeline.set_ylabel('Decision (0=Fission, 1=Fusion)')
    ax_timeline.set_xlabel('Example Index')
    ax_timeline.set_title('Controller Decision Timeline')
    ax_timeline.legend()
    ax_timeline.grid(True, alpha=0.3)

    # Plot 2: Error rate vs difficulty
    ax_errors.scatter(difficulties, errors, c=['red' if e else 'green' for e in errors], alpha=0.7)
    ax_errors.set_xlabel('Difficulty Level')
    ax_errors.set_ylabel('Error Occurred')
    ax_errors.set_title('Errors vs Difficulty')
    ax_errors.grid(True, alpha=0.3)

    # Plot 3: Decisions vs difficulty
    colors = ['blue' if d == 'fusion' else 'orange' for d in proposed_decisions]
    ax_decisions.scatter(difficulties, decision_numeric, c=colors, alpha=0.7)
    ax_decisions.set_xlabel('Difficulty Level')
    ax_decisions.set_ylabel('Decision (0=Fission, 1=Fusion)')
    ax_decisions.set_title('Decisions vs Difficulty')
    ax_decisions.grid(True, alpha=0.3)

    # Plot 4: Running error rate (cumulative errors / examples seen so far)
    running_errors = np.cumsum(errors) / np.arange(1, len(errors) + 1)

    ax_running.plot(running_errors, 'g-', linewidth=2, label='Running Error Rate')
    ax_running.axhline(y=epsilon_target, color='r', linestyle='--',
                       label=f'Target Error Rate ({epsilon_target:.2f})')
    ax_running.set_ylabel('Error Rate')
    ax_running.set_xlabel('Example Index')
    ax_running.set_title('Running Error Rate')
    ax_running.legend()
    ax_running.grid(True, alpha=0.3)

    fig.tight_layout()
    plt.show()

    # Print controller state information
    print(f"Final running error rate: {running_errors[-1]:.3f}")
    print(f"Target error rate: {epsilon_target:.2f}")
    print(f"Controller final state: {proposed_decisions[-1]}")

# Run the analysis on the experiment results: draws the four-panel figure
# and prints the final running error rate, target, and controller state.
analyze_controller_behavior(sample_data, results)

## Analysis and Visualization

Let's analyze the controller's behavior and visualize how it adapts based on the error observations.

In [None]:
# Execute the simulation on the inline dataset
results = run_experiment(sample_data)

# Print one table per method, each with the same row layout
for heading, method in (
    ("BASELINE RESULTS (Always Fission):", "baseline"),
    ("\nPROPOSED DKW CONTROLLER RESULTS:", "proposed"),
):
    print(heading)
    print("-" * 40)
    for row in results[method]:
        print(f"ID: {row['id']:12} | Decision: {row['decision']:8} | Error: {str(row['error']):5} | Difficulty: {row['difficulty']:.2f}")

# Tally the decisions made by each method
baseline_choices = [r["decision"] for r in results["baseline"]]
proposed_choices = [r["decision"] for r in results["proposed"]]

baseline_fissions = baseline_choices.count("fission")
proposed_fissions = proposed_choices.count("fission")
proposed_fusions = proposed_choices.count("fusion")

print(f"\nSUMMARY:")
print(f"Baseline: {baseline_fissions} fission decisions, 0 fusion decisions")
print(f"Proposed: {proposed_fissions} fission decisions, {proposed_fusions} fusion decisions")

## Running the Experiment

In [None]:
def run_experiment(data, seed=42):
    """Simulate the DKW controller and an always-fission baseline on `data`.

    Args:
        data: Iterable of dicts with 'id' and 'difficulty' keys; 'difficulty'
            is used as the probability that the example produces an error.
        seed: Value passed to ``np.random.seed`` before simulating.

    Returns:
        Dict with 'baseline' and 'proposed' lists of per-example records
        ('id', 'decision', 'error', 'difficulty').
    """
    # Reseed the global generator so repeated runs draw the same errors
    np.random.seed(seed)

    controller = DKWController()
    baseline_records = []
    proposed_records = []

    def make_record(example, decision, error):
        # Same field order for both methods
        return {
            "id": example["id"],
            "decision": decision,
            "error": error,
            "difficulty": example["difficulty"],
        }

    for example in data:
        # One random draw per example decides whether an error occurred
        error = np.random.random() < example["difficulty"]
        controller.add_observation(float(error))
        chosen = controller.decide()

        proposed_records.append(make_record(example, chosen, error))
        # The baseline never leaves the conservative mode
        baseline_records.append(make_record(example, "fission", error))

    return {"baseline": baseline_records, "proposed": proposed_records}

In [None]:
# Inline dataset so the notebook has no external file dependency.
# One difficulty value per example, in order; ids are generated from the position.
_difficulty_levels = [
    0.1,   # Low difficulty
    0.05,  # Very low difficulty
    0.8,   # High difficulty
    0.2,   # Medium-low difficulty
    0.15,  # Low-medium difficulty
    0.9,   # Very high difficulty
    0.3,   # Medium difficulty
    0.05,  # Very low difficulty
    0.6,   # Medium-high difficulty
    0.1,   # Low difficulty
]

sample_data = [
    {"id": f"example_{position:03d}", "difficulty": level}
    for position, level in enumerate(_difficulty_levels)
]

print(f"Created sample dataset with {len(sample_data)} examples")
print("Difficulty levels:", [x["difficulty"] for x in sample_data])

## Experiment Setup

We'll simulate an experiment comparing the DKW controller's decisions against a baseline (always conservative "fission") approach. The simulation uses sample data with varying difficulty levels to generate error probabilities.

In [None]:
@dataclass
class DKWController:
    """Fusion/fission controller driven by a DKW-style error bound.

    Stores error observations and moves between the 'fusion' and 'fission'
    states when the bounded error crosses the target by more than the
    hysteresis margin.
    """
    epsilon_target: float = 0.10
    delta: float = 0.05
    min_samples: int = 100
    hysteresis: float = 0.05

    samples: list = field(default_factory=list)
    current_state: str = "fission"

    def dkw_epsilon(self, n: int) -> float:
        """Return sqrt(ln(2/delta) / (2n)), or 1.0 when fewer than 2 samples."""
        if n >= 2:
            return np.sqrt(np.log(2 / self.delta) / (2 * n))
        return 1.0

    def add_observation(self, error: float) -> None:
        """Record one error observation."""
        self.samples.append(error)

    def decide(self) -> str:
        """Update the state from the collected observations and return it.

        The state is left untouched until at least `min_samples`
        observations are available.
        """
        total = len(self.samples)
        if total >= self.min_samples:
            margin = self.dkw_epsilon(total)
            recent = self.samples[-self.min_samples:]
            upper = np.mean(recent) + margin

            if self.current_state == "fusion":
                # Leave fusion only once the bound clears the upper edge of the band
                if upper > self.epsilon_target + self.hysteresis:
                    self.current_state = "fission"
            elif upper < self.epsilon_target - self.hysteresis:
                # Enter fusion only once the bound drops below the lower edge
                self.current_state = "fusion"

        return self.current_state

## DKW Controller Class

The DKWController uses the Dvoretzky-Kiefer-Wolfowitz inequality to provide statistical guarantees when making fusion/fission decisions. Key parameters:

- `epsilon_target`: Target error rate (default: 0.10)
- `delta`: Confidence parameter for DKW bound (default: 0.05)  
- `min_samples`: Minimum samples before making decisions (default: 100)
- `hysteresis`: Prevents oscillation between states (default: 0.05)

In [None]:
import json
import numpy as np
from dataclasses import dataclass, field
import matplotlib.pyplot as plt

# DKW Controller Implementation
## experiment_001: method.py Demo

This notebook demonstrates a DKW-guided fusion/fission controller implementation. The DKW (Dvoretzky-Kiefer-Wolfowitz) inequality provides statistical guarantees for decision making under uncertainty.

## Conclusion and Next Steps

This notebook demonstrates how the DKW controller adapts its decisions based on observed error rates while providing statistical guarantees. Key observations:

1. **Adaptive Behavior**: The controller switches between fusion and fission based on error observations
2. **Statistical Guarantees**: Uses DKW inequality to bound estimation error
3. **Hysteresis**: Prevents oscillation between states

### Experiment with Parameters

You can modify the controller parameters to see different behaviors:

```python
# Create a new controller with different parameters
custom_controller = DKWController(
    epsilon_target=0.05,    # Tighter error tolerance
    delta=0.01,            # Higher confidence
    min_samples=50,        # Faster adaptation
    hysteresis=0.02        # Less hysteresis
)
```

### Modify the Data

Change the `sample_data` generation to test different scenarios:
- Different difficulty patterns
- More or fewer examples
- Varying error rates over time

### Save Results

Uncomment the save block in the previous cell to save results to a JSON file.

In [None]:
# Preview only the first 5 entries of each method so the printed JSON stays short
sample_results = {
    method: results[method][:5]
    for method in ("baseline", "proposed")
}

print("Sample Results (first 5 examples):")
preview_json = json.dumps(sample_results, indent=2)
print(preview_json)

# Optional: persist the complete results (uncomment to enable)
# with open("method_out.json", "w") as f:
#     json.dump(results, f, indent=2)
# print(f"\nFull results saved to method_out.json")

total_examples = len(results['baseline'])
print(f"\nTotal examples in full results: {total_examples}")

## Sample Output

Here's a sample of the results in the same format as the original `method_out.json` file:

In [None]:
# Create visualization of decision patterns over time
window_size = 20          # trailing window length for the running error rate
target_error_rate = 0.10  # reference line; keep in sync with the controller's epsilon_target

fig, (ax1, ax2, ax3) = plt.subplots(3, 1, figsize=(12, 10))

# Encode every record as 0/1 so it can be plotted directly
proposed_decisions = [1 if r['decision'] == 'fusion' else 0 for r in results['proposed']]
errors = [1 if r['error'] else 0 for r in results['proposed']]
example_ids = list(range(len(results['proposed'])))

# Plot 1: Decision patterns over time
ax1.plot(example_ids, proposed_decisions, 'b-', alpha=0.7, linewidth=2, label='DKW Controller (1=Fusion, 0=Fission)')
ax1.fill_between(example_ids, proposed_decisions, alpha=0.3, color='blue')
ax1.set_ylabel('Decision')
ax1.set_title('DKW Controller Decisions Over Time')
ax1.set_ylim(-0.1, 1.1)
ax1.legend()
ax1.grid(True, alpha=0.3)

# Plot 2: Error occurrences
ax2.scatter(example_ids, errors, alpha=0.6, c='red', s=20)
ax2.set_ylabel('Error Occurred')
ax2.set_title('Error Occurrences Over Time')
ax2.set_ylim(-0.1, 1.1)
ax2.grid(True, alpha=0.3)

# Plot 3: Running error rate
# Each point averages the current example and up to window_size - 1
# predecessors; the first few points therefore use a shorter window.
running_error_rate = [
    np.mean(errors[max(0, i - window_size + 1):i + 1])
    for i in range(len(errors))
]

# Labels are built from the constants above so that editing window_size or
# target_error_rate can never leave the legend out of date.
ax3.plot(example_ids, running_error_rate, 'g-', linewidth=2,
         label=f'Running Error Rate ({window_size}-sample window)')
ax3.axhline(y=target_error_rate, color='red', linestyle='--', alpha=0.7,
            label=f'Target Error Rate ({target_error_rate:.2f})')
ax3.set_xlabel('Example Index')
ax3.set_ylabel('Error Rate')
ax3.set_title('Running Error Rate vs Target')
ax3.legend()
ax3.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

In [None]:
# Experiment: What if we had a different proposed method?
# Modify these parameters to see different scenarios

def create_experimental_results(n_examples=200, fusion_rate=0.8, error_rate=0.05):
    """Create experimental results with custom parameters.

    Args:
        n_examples: Number of records to generate.
        fusion_rate: Fraction of records (taken from the front) whose
            decision is "fusion"; the remainder are "fission".
        error_rate: Fraction of records (taken from the front) whose
            "error" flag is True.

    Returns:
        A list of ``n_examples`` dicts with "decision" and "error" keys.
        Both patterns are deterministic and front-loaded, not randomized.
    """
    # The tolerance absorbs binary floating-point error before truncating:
    # 100 * 0.29 evaluates to 28.999999999999996, which int() alone floors
    # to 28 instead of the intended 29.
    tolerance = 1e-9
    n_fusion = int(n_examples * fusion_rate + tolerance)
    n_errors = int(n_examples * error_rate + tolerance)

    return [
        {
            "decision": "fusion" if i < n_fusion else "fission",
            "error": i < n_errors,
        }
        for i in range(n_examples)
    ]

# Candidate "proposed" methods; the baseline entry is kept for reference only
experimental_scenarios = {
    "baseline": baseline_results,  # Keep baseline the same
    "high_fusion_low_error": create_experimental_results(fusion_rate=0.9, error_rate=0.03),
    "medium_fusion": create_experimental_results(fusion_rate=0.5, error_rate=0.06),
    "original_proposed": proposed_results
}

print("Comparing different scenarios:\n")
for scenario_name, scenario_results in experimental_scenarios.items():
    # The baseline is the reference every scenario is measured against,
    # so it is not compared with itself.
    if scenario_name != "baseline":
        scenario_data = {"baseline": baseline_results, "proposed": scenario_results}
        scenario_metrics = compute_metrics(scenario_data)
        scenario_improvement = scenario_metrics['improvement']

        report_lines = [
            f"{scenario_name.upper()}:",
            f"  API reduction: {scenario_improvement['api_reduction_pct']:.1f}%",
            f"  Error rate change: {scenario_improvement['error_rate_diff']:+.1%}",
            f"  Fusion rate: {scenario_metrics['proposed']['fusion_rate']:.1%}",
        ]
        # Trailing newline reproduces the blank separator line between scenarios
        print("\n".join(report_lines) + "\n")

## Results Analysis

Let's analyze the behavior of both methods and visualize how the DKW controller adapts over time.

## Experiment with Different Scenarios

You can modify the parameters below to see how different controller behaviors would affect the metrics:

In [None]:
# Run the experiment
results = run_experiment(sample_data)

# Display basic statistics
print("Experiment Results:")
print(f"Total examples processed: {len(results['baseline'])}")

# Tally how often each method chose each mode
_baseline_modes = [r['decision'] for r in results['baseline']]
_proposed_modes = [r['decision'] for r in results['proposed']]

baseline_fission = _baseline_modes.count('fission')
baseline_fusion = _baseline_modes.count('fusion')

proposed_fission = _proposed_modes.count('fission')
proposed_fusion = _proposed_modes.count('fusion')

for label, fission, fusion in (
    ("Baseline method", baseline_fission, baseline_fusion),
    ("Proposed DKW method", proposed_fission, proposed_fusion),
):
    print(f"\n{label}:")
    print(f"  Fission decisions: {fission}")
    print(f"  Fusion decisions: {fusion}")

# Tally the examples flagged as errors for each method
baseline_errors = sum(bool(r['error']) for r in results['baseline'])
proposed_errors = sum(bool(r['error']) for r in results['proposed'])

print("\nError counts:")
print(f"  Baseline: {baseline_errors} errors")
print(f"  Proposed: {proposed_errors} errors")

In [None]:
# Compute metrics
metrics = compute_metrics(results)

# Print a banner, then one section per method
banner = "=" * 50
print(banner)
print("DKW CONTROLLER EVALUATION RESULTS")
print(banner)

# One template per report row; the fields are looked up in the per-method dict
row_templates = (
    "  Fusion rate:     {fusion_rate:.1%}",
    "  Fission rate:    {fission_rate:.1%}",
    "  Error rate:      {error_rate:.1%}",
    "  Total API calls: {api_calls}",
    "  Avg calls/example: {avg_calls_per_example:.2f}",
)

for method in ("baseline", "proposed"):
    print(f"\n{method.upper()} METHOD:")
    m = metrics[method]
    for template in row_templates:
        print(template.format(**m))

improvement = metrics['improvement']
print("\nIMPROVEMENT:")
print(f"  API reduction: {improvement['api_reduction_pct']:.1f}%")
print(f"  Error rate change: {improvement['error_rate_diff']:+.1%}")

# Dump the complete metrics dict as JSON text (printed, not written to a file)
print("\nFull metrics as JSON:")
print(json.dumps(metrics, indent=2))

## Run Experiment

Let's execute the experiment and collect results from both the proposed DKW controller and the baseline always-fission approach.

In [None]:
def run_experiment(data, controller=None):
    """Run DKW controller experiment.

    For each example one error outcome is drawn from NumPy's global RNG (an
    error occurs when a uniform draw is below ``example["difficulty"]``). The
    outcome is fed to the controller, and the controller's decision is then
    recorded next to an always-"fission" baseline.

    Args:
        data: Iterable of dicts with "id" and "difficulty" keys.
        controller: Optional controller exposing ``add_observation(error)``
            and ``decide()``. Defaults to a fresh ``DKWController()`` so
            existing calls behave as before; pass a customized instance to
            explore other parameter settings.

    Returns:
        Dict with "baseline" and "proposed" lists, one record per example,
        each holding "id", "decision" and "error".
    """
    if controller is None:
        controller = DKWController()
    results = {"baseline": [], "proposed": []}

    for example in data:
        # Simulate error occurrence based on difficulty. bool() keeps the
        # flag a builtin bool even if difficulty is a NumPy scalar, whose
        # comparison result (numpy.bool_) the json module cannot serialize.
        error = bool(np.random.random() < example["difficulty"])

        # NOTE(review): the controller observes this example's outcome
        # before deciding for the same example -- confirm that ordering is
        # intended.
        controller.add_observation(float(error))
        decision = controller.decide()

        # Both methods record the same simulated outcome, so their error
        # rates are identical by construction; only the decisions differ.
        results["proposed"].append({
            "id": example["id"],
            "decision": decision,
            "error": error,
        })
        results["baseline"].append({
            "id": example["id"],
            "decision": "fission",  # Always conservative
            "error": error,
        })

    return results

## Run Evaluation

Now let's compute the metrics and display the results:

In [None]:
def compute_metrics(results: dict) -> dict:
    """Compute evaluation metrics for the baseline and proposed methods.

    Args:
        results: Dict with "baseline" and "proposed" keys, each a list of
            records holding a "decision" ("fusion" or "fission") and a
            truthy/falsy "error" value.

    Returns:
        Dict with one entry per method (``fusion_rate``, ``fission_rate``,
        ``error_rate``, ``api_calls``, ``avg_calls_per_example``) plus an
        "improvement" entry holding ``api_reduction_pct`` and
        ``error_rate_diff`` (proposed minus baseline). Rates are fractions in
        [0, 1]. An empty method list yields zeros instead of raising
        ZeroDivisionError, and ``api_reduction_pct`` is 0.0 when the baseline
        made no API calls.
    """
    metrics = {}

    for method in ["baseline", "proposed"]:
        preds = results[method]
        # Every count below is 0 for an empty list, so dividing by 1 gives
        # 0.0 where len(preds) would raise ZeroDivisionError.
        denom = len(preds) or 1

        # Count decisions
        fusion_count = sum(1 for p in preds if p["decision"] == "fusion")
        fission_count = sum(1 for p in preds if p["decision"] == "fission")

        # Count examples flagged as errors
        error_count = sum(1 for p in preds if p["error"])

        # API calls (fusion=1, fission=2)
        api_calls = fusion_count + 2 * fission_count

        metrics[method] = {
            "fusion_rate": fusion_count / denom,
            "fission_rate": fission_count / denom,
            "error_rate": error_count / denom,
            "api_calls": api_calls,
            "avg_calls_per_example": api_calls / denom,
        }

    # Compute improvement
    baseline_calls = metrics["baseline"]["avg_calls_per_example"]
    proposed_calls = metrics["proposed"]["avg_calls_per_example"]
    # A relative reduction is undefined without baseline calls; report 0.0.
    if baseline_calls:
        api_reduction_pct = (baseline_calls - proposed_calls) / baseline_calls * 100
    else:
        api_reduction_pct = 0.0
    metrics["improvement"] = {
        "api_reduction_pct": api_reduction_pct,
        "error_rate_diff": metrics["proposed"]["error_rate"] - metrics["baseline"]["error_rate"],
    }

    return metrics

## Experiment Function

The `run_experiment` function simulates the controller's behavior over a sequence of examples, comparing:
- **Proposed method**: Uses DKW controller for adaptive decisions
- **Baseline method**: Always uses conservative "fission" mode

For each example, we simulate whether an error occurs based on the difficulty level.

In [None]:
# Sample data - normally read from "../dataset_001/data_out.json"
# Synthetic stand-in: 300 examples split into three difficulty tiers
sample_data = []

for i in range(300):
    # Tiers by position: 0-99 easy (0.05), 100-199 hardest (0.15),
    # 200-299 in between (0.08)
    difficulty = 0.05 if i < 100 else (0.15 if i < 200 else 0.08)
    sample_data.append({"id": f"example_{i:03d}", "difficulty": difficulty})

print(f"Created {len(sample_data)} sample examples")
print(f"Sample data preview: {sample_data[:3]}")

## Evaluation Metrics Function

The `compute_metrics` function calculates several key performance indicators:

- **Fusion/Fission rates**: Proportion of decisions for each strategy
- **Error rate**: Fraction of examples whose `error` flag is set (a value between 0 and 1, not a percentage)
- **API calls**: Total API usage (fusion = 1 call, fission = 2 calls)
- **Efficiency improvements**: Comparison between methods

## Sample Data

We'll create sample data to simulate the input that would normally be read from a JSON file. Each example has:
- `id`: Unique identifier
- `difficulty`: Probability of error occurring (0.0 to 1.0)

In [None]:
# Hand-built experimental results standing in for real runs
# Each record holds the decision a method made and whether an error occurred

# Baseline method: fission every time; the first 16 of 200 records (8%) are errors
baseline_results = []
for i in range(200):
    baseline_results.append({"decision": "fission", "error": i < 16})

# Proposed method: fusion for the first 130 records (65%), fission for the
# remaining 70 (35%); the first 18 of 200 records (9%) are errors
proposed_results = []
for i in range(200):
    decision = "fusion" if i < 130 else "fission"
    proposed_results.append({"decision": decision, "error": i < 18})

# Combine into results structure
results = dict(baseline=baseline_results, proposed=proposed_results)

print(f"Generated {len(baseline_results)} baseline results and {len(proposed_results)} proposed results")
for label, method_results in (("Baseline", baseline_results), ("Proposed", proposed_results)):
    modes = [r['decision'] for r in method_results]
    print(f"{label} decisions: {modes.count('fusion')} fusion, {modes.count('fission')} fission")

In [None]:
@dataclass
class DKWController:
    """DKW-guided fusion/fission controller.

    Stores error observations and toggles ``current_state`` between
    "fission" and "fusion". The state moves to "fusion" only when the error
    upper bound drops below ``epsilon_target - hysteresis`` and moves back to
    "fission" only when it rises above ``epsilon_target + hysteresis``.
    """
    # Error-rate threshold the upper bound is compared against.
    epsilon_target: float = 0.10
    # Delta term of the DKW width computed in dkw_epsilon().
    delta: float = 0.05
    # Observations required before the state may change; also the length of
    # the trailing window that is averaged in decide().
    min_samples: int = 100
    # Margin applied on either side of epsilon_target.
    hysteresis: float = 0.05

    # Every observation recorded so far, oldest first.
    samples: list = field(default_factory=list)
    current_state: str = "fission"

    def dkw_epsilon(self, n: int) -> float:
        """Compute the DKW width ``sqrt(ln(2 / delta) / (2 * n))`` for n samples.

        Returns 1.0 when ``n < 2``.
        """
        if n < 2:
            return 1.0
        return np.sqrt(np.log(2 / self.delta) / (2 * n))

    def add_observation(self, error: float) -> None:
        """Append one error observation to ``samples``."""
        self.samples.append(error)

    def decide(self) -> str:
        """Make fusion/fission decision with DKW guarantee.

        Returns:
            The (possibly updated) ``current_state``. The state is returned
            unchanged while fewer than ``min_samples`` observations exist or
            while there are no observations at all.
        """
        n = len(self.samples)
        # With no observations there is nothing to average: np.mean([]) would
        # produce NaN plus a RuntimeWarning (reachable when min_samples <= 0).
        if n == 0 or n < self.min_samples:
            return self.current_state

        # A non-positive min_samples would turn the negative slice into a
        # slice from the front of the list, so use every sample instead.
        window = self.samples[-self.min_samples:] if self.min_samples > 0 else self.samples

        # NOTE(review): epsilon is derived from the total sample count while
        # the mean covers only the trailing window, so the width keeps
        # shrinking as n grows although the window does not -- confirm this
        # mismatch is intended.
        epsilon = self.dkw_epsilon(n)
        empirical_error = np.mean(window)
        error_upper_bound = empirical_error + epsilon

        if self.current_state == "fusion":
            if error_upper_bound > self.epsilon_target + self.hysteresis:
                self.current_state = "fission"
        else:
            if error_upper_bound < self.epsilon_target - self.hysteresis:
                self.current_state = "fusion"

        return self.current_state

## Sample Data

Instead of reading from external JSON files, we'll create sample data inline that represents the experimental results from both baseline and proposed methods.

## DKW Controller Class

The core controller that implements the DKW-guided decision making algorithm.

**Parameters:**
- `epsilon_target`: Target error threshold (0.10)
- `delta`: Confidence parameter for DKW bound (0.05)
- `min_samples`: Minimum samples before making decisions (100)
- `hysteresis`: Buffer to prevent oscillation (0.05)

**Key Methods:**
- `dkw_epsilon()`: Computes the DKW confidence interval width
- `add_observation()`: Records error observations
- `decide()`: Makes fusion/fission decision with statistical guarantees

In [None]:
import json
import numpy as np

# DKW Controller Evaluation

This notebook evaluates the performance of a DKW Controller, comparing baseline and proposed methods. The evaluation focuses on:
- **Fusion vs Fission decisions**: The controller can choose to fuse or split operations
- **API efficiency**: Fusion requires 1 API call, fission requires 2 API calls
- **Error rates**: How often the controller makes incorrect decisions

The goal is to measure the improvement in API efficiency while maintaining acceptable error rates.

In [None]:
"""DKW Controller Implementation."""
import json
import numpy as np
from dataclasses import dataclass, field
import matplotlib.pyplot as plt

# Set random seed for reproducibility.
# This seeds NumPy's global RNG, which run_experiment draws from through
# np.random.random(); run this cell before the experiment to get repeatable
# simulated errors.
np.random.seed(42)

# DKW Controller Implementation Demo

This notebook demonstrates a **DKW-guided fusion/fission controller** that makes adaptive decisions based on error observations with statistical guarantees.

## Overview
The DKW (Dvoretzky-Kiefer-Wolfowitz) inequality provides a way to bound the difference between empirical and true error rates, enabling principled decision-making between fusion and fission states.

**Key Features:**
- Statistical guarantees via DKW inequality
- Adaptive switching between fusion/fission modes
- Hysteresis to prevent oscillation
- Real-time error calibration

# Dataset Collection for DKW Benchmark

This notebook demonstrates dataset collection for DKW controller evaluation using the GSM8K dataset from HuggingFace. The script processes mathematical word problems and creates structured benchmark data with difficulty estimates.

**Artifact ID:** dataset_001  
**Original file:** data.py

## Import Required Libraries

We'll need the `datasets` library to load data from HuggingFace and `json` for data serialization.

In [None]:
"""Dataset collection script for DKW benchmark."""
import json
from datasets import load_dataset

## Data Collection Function

The `collect_data()` function loads the GSM8K dataset from HuggingFace and processes it into a structured format suitable for benchmarking. Each example includes:
- **id**: Unique identifier for the example
- **question**: The mathematical word problem
- **answer**: The correct answer
- **difficulty**: A simple difficulty estimate based on question length (length in characters divided by 100, so it is not capped at 1.0)

In [None]:
def collect_data(n_examples=200, split="test"):
    """Collect benchmark data for DKW controller evaluation.

    Args:
        n_examples: Number of leading examples to take from the split.
            Pass ``None`` to load the whole split. Defaults to 200.
        split: Name of the GSM8K split to load. Defaults to "test".

    Returns:
        A list of dicts, one per example, with "id" (``example_NNN``),
        "question", "answer" and "difficulty" keys.
    """
    # Build the slice expression, e.g. "test[:200]" for the default arguments
    split_spec = split if n_examples is None else f"{split}[:{n_examples}]"

    # Load HuggingFace dataset
    ds = load_dataset("gsm8k", "main", split=split_spec)

    return [
        {
            "id": f"example_{i:03d}",
            "question": example["question"],
            "answer": example["answer"],
            # Simple proxy: question length in characters / 100.
            # NOTE(review): this value is unbounded, while the simulation
            # elsewhere in this file compares difficulty against a uniform
            # draw as if it were a probability -- confirm whether it should
            # be rescaled before being used there.
            "difficulty": len(example["question"]) / 100,
        }
        for i, example in enumerate(ds)
    ]

## Execute Data Collection

Run the data collection function and display the results. This will download 200 examples from the GSM8K test set and process them into our benchmark format.

**Note:** No local input files are needed, but the dataset is downloaded from HuggingFace, so network access is required.

In [None]:
# Collect the data
data = collect_data()

# Report how many examples arrived, then show up to three of them
print(f"Collected {len(data)} examples")
print("\nFirst 3 examples:")
for i, example in enumerate(data[:3]):
    print(f"\nExample {i + 1}:")
    print(f"  ID: {example['id']}")
    # Long questions are cut to their first 100 characters
    print(f"  Question: {example['question'][:100]}...")
    print(f"  Answer: {example['answer']}")
    print(f"  Difficulty: {example['difficulty']:.2f}")

## Save Data to JSON File

Save the collected data to `data_out.json` for later use, mirroring the original script's behavior. Skip this cell if you do not want the file written.

In [None]:
# Write every collected example to disk as indented JSON
output_path = "data_out.json"
with open(output_path, "w") as f:
    json.dump(data, f, indent=2)

print("Data saved to 'data_out.json'")

# Echo the first two records so the saved structure is visible
preview = data[:2]
print(f"\nJSON file contains {len(data)} examples")
print("Sample JSON structure:")
print(json.dumps(preview, indent=2))

## Example Output Format

Here's an example of what the processed data looks like. This demonstrates the expected structure and format:

In [None]:
# Reference records illustrating the processed format (one record per line)
sample_data = [
    {"id": "example_000", "question": "What is 2+2?", "answer": "4", "difficulty": 0.15},
    {"id": "example_001", "question": "If x=5, what is 2x?", "answer": "10", "difficulty": 0.22},
    {"id": "example_002", "question": "Solve: 3y + 6 = 15", "answer": "y=3", "difficulty": 0.28},
]

print("Sample data structure:")
sample_json = json.dumps(sample_data, indent=2)
print(sample_json)

## Usage and Customization

This notebook is completely self-contained and ready to run! Here are some ways you can customize it:

### Modify Dataset Parameters:
- Change the number of examples: `split="test[:200]"` → `split="test[:500]"`
- Use different splits: `split="test"` or `split="train"`
- GSM8K provides only `train` and `test` splits; if you need a validation set, hold out part of `train` (e.g. `split="train[:500]"`)

### Adjust Difficulty Calculation:
The current difficulty is based on question length. You could modify it to use:
- Number of mathematical operations
- Presence of certain keywords
- Complexity scoring algorithms

### Data Processing:
- Add additional fields (e.g., topic classification, solution steps)
- Filter examples by certain criteria
- Apply text preprocessing

### Running the Notebook:
1. Make sure you have the required packages: `pip install datasets`
2. Run all cells in order
3. The data will be collected from HuggingFace automatically
4. No external files needed!