## Notebook setup and assumptions
- Paths are relative to the project root.
- No experiment code is modified.
- The notebook scans `results/*/exp_*` for artifacts.
- This notebook does not make strong claims; it reports observations from the saved artifacts.

In [1]:
import json
import pandas as pd
import numpy as np
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style="whitegrid")

RESULTS_DIR = Path('results')
FIGS_DIR = Path('notebooks/figures')
FIGS_DIR.mkdir(parents=True, exist_ok=True)

def find_experiments(results_dir: Path = RESULTS_DIR):
    



    experiments = []
    if not results_dir.exists():
        return experiments
    for model_dir in results_dir.iterdir():
        if not model_dir.is_dir():
            continue
        model_name = model_dir.name
        for exp_sub in model_dir.iterdir():
            if not exp_sub.is_dir():
                continue
            exp_id = exp_sub.name
            experiments.append((model_name, exp_id, exp_sub))
    return experiments

def load_metrics(exp_dir: Path):
    metrics_file = next(exp_dir.glob('exp_*_metrics.json'), None)
    if metrics_file is None:
        return None
    with open(metrics_file, 'r') as f:
        return json.load(f)

def load_summary(exp_dir: Path):
    summary_file = next(exp_dir.glob('exp_*_summary.csv'), None)
    if summary_file is None:
        return None
    return pd.read_csv(summary_file)

def load_logs(exp_dir: Path):
    logs_file = next(exp_dir.glob('exp_*_logs.csv'), None)
    if logs_file is None:
        return None
    return pd.read_csv(logs_file)

In [2]:
# Build a dataframe of per-experiment metrics
experiments = find_experiments()
rows = []
for model_name, exp_id, exp_dir in experiments:
    metrics = load_metrics(exp_dir)
    if metrics is None:
        continue
    # canonical fields expected in metrics.json: statistics + classification_metrics (optional)
    stats = metrics.get('statistics', {})
    classification = metrics.get('classification_metrics', {})
    row = {
        'model': model_name,
        'experiment_id': exp_id,
        'mean_reward': stats.get('mean_reward'),
        'std_reward': stats.get('std_reward'),
        'min_reward': stats.get('min_reward'),
        'max_reward': stats.get('max_reward'),
        'median_reward': stats.get('median_reward'),
        'accuracy': classification.get('accuracy'),
        'precision': classification.get('precision'),
        'recall': classification.get('recall'),
        'f1': classification.get('f1'),
        'success_rate': metrics.get('success_rate'),
    }
    rows.append(row)

metrics_df = pd.DataFrame(rows)
metrics_df = metrics_df.sort_values(['model', 'experiment_id']).reset_index(drop=True)
metrics_df.head(10)

KeyError: 'model'

## Aggregated results by model
Compute mean and standard deviation across experiments (typically seeds) for each model. This provides a concise comparison while preserving per-experiment evidence in the `results/` folder.

In [None]:
# Aggregate by model
agg = metrics_df.groupby('model').agg({
    'mean_reward': ['mean', 'std'],
    'std_reward': ['mean', 'std'],
    'accuracy': ['mean', 'std'],
    'precision': ['mean', 'std'],
    'recall': ['mean', 'std'],
    'f1': ['mean', 'std'],
    'success_rate': ['mean', 'std'],
})
# Flatten columns for readability
agg.columns = ['_'.join(col).strip() for col in agg.columns.values]
agg = agg.reset_index()
agg

## Tables: Classification metrics (baselines) and Reward statistics
Below are two tables: one focused on classification metrics for baselines and one showing reward-based statistics across models. All numbers are computed from saved artifacts.

In [None]:
# Classification metrics table (baseline models may have NaN for these columns for RL)
class_cols = ['model', 'accuracy', 'precision', 'recall', 'f1']
class_table = metrics_df[class_cols].groupby('model').agg(['mean', 'std'])
# Flatten
class_table.columns = ['_'.join(col).strip() for col in class_table.columns.values]
class_table = class_table.reset_index()
class_table

In [None]:
# Reward statistics table
reward_cols = ['model', 'mean_reward', 'std_reward', 'success_rate']
reward_table = metrics_df[reward_cols].groupby('model').agg(['mean', 'std'])
reward_table.columns = ['_'.join(col).strip() for col in reward_table.columns.values]
reward_table = reward_table.reset_index()
reward_table

## Plots: Model comparisons
The following static plots visualize model performance. These figures are reproducible and can be exported for reports.

In [None]:
# Bar chart: mean_reward with error bars (std)
plt.figure(figsize=(8, 5))
sns.barplot(x='model', y='mean_reward', data=metrics_df, ci='sd', palette='muted')
plt.title('Mean Episode Reward by Model (error bars = std)')
plt.ylabel('Mean Reward')
plt.xlabel('Model')
plt.xticks(rotation=30)
plt.tight_layout()
plt.savefig(FIGS_DIR / 'mean_reward_by_model.png')
plt.show()

In [None]:
# Bar chart: success_rate by model
plt.figure(figsize=(8, 5))
sns.barplot(x='model', y='success_rate', data=metrics_df, ci='sd', palette='pastel')
plt.title('Success Rate by Model (fraction of episodes with positive total reward)')
plt.ylabel('Success Rate')
plt.xlabel('Model')
plt.xticks(rotation=30)
plt.tight_layout()
plt.savefig(FIGS_DIR / 'success_rate_by_model.png')
plt.show()

## RL: Reward vs Episode
For RL-type experiments (where per-episode reward series are available in `summary.csv`), this section plots reward as a function of episode index for each RL run found in `results/`. If multiple RL runs (different seeds) exist, each is plotted separately for visual comparison.

In [None]:
# Reward vs episode for RL experiments
plt.figure(figsize=(10, 6))
rl_found = False
for model_name, exp_id, exp_dir in experiments:
    if model_name.lower().startswith('dqn') or model_name.lower() == 'dqn' or model_name.lower() == 'rl' or model_name.lower() == 'ppo':
        summary = load_summary(exp_dir)
        if summary is None:
            continue
        rl_found = True
        # plot per-episode total_reward (episodes may be many)
        if 'episode' in summary.columns and 'total_reward' in summary.columns:
            plt.plot(summary['episode'], summary['total_reward'], alpha=0.6, label=f
)

if rl_found:
    plt.title('RL: Total Reward per Episode (each line = one experiment run)')
    plt.xlabel('Episode')
    plt.ylabel('Total Reward')
    plt.legend(fontsize='small', ncol=2)
    plt.tight_layout()
    plt.savefig(FIGS_DIR / 'rl_reward_vs_episode.png')
    plt.show()
else:
    print('No RL experiments found under results/ to plot reward vs episode.')

---
### Notes for reviewers
- All data are read from the `results/` folder.
- Per-experiment artifacts (config, metrics, summary, logs) are preserved for audit.
- This notebook reports observations and descriptive statistics only. It does not claim causal superiority of any model.