# TCN Final Evaluation Notebook

This notebook is evaluation-only and designed for final model selection, ablations, and benchmarking.
Configuration constants use the `EVAL_` prefix, and most runtime variables and helpers use the `eval_` prefix, to avoid conflicts with training notebooks.

## 1) Connect to Colab VM and sync repository
Run this first in a fresh Colab runtime.

In [None]:
import os

EVAL_REPO_URL = "https://github.com/Dave-DKings/tape_tcn_project.git"
EVAL_REPO_DIR = "/content/adaptive_portfolio_rl"

if not os.path.exists(f"{EVAL_REPO_DIR}/.git"):
    !git clone {EVAL_REPO_URL} {EVAL_REPO_DIR}

%cd /content/adaptive_portfolio_rl
!git fetch origin
!git reset --hard origin/main

## 2) Optional: mount Drive and restore saved results zip
Set `EVAL_RESTORE_FROM_ZIP=True` only when needed.

In [None]:
from pathlib import Path

EVAL_RESTORE_FROM_ZIP = False
EVAL_ZIP_PATH = "/content/drive/MyDrive/tcn_fusion_results_download_new2.zip"

if EVAL_RESTORE_FROM_ZIP:
    from google.colab import drive
    drive.mount('/content/drive')

    zip_path = Path(EVAL_ZIP_PATH)
    if not zip_path.exists():
        raise FileNotFoundError(f"Zip not found: {zip_path}")

    !mkdir -p /content/adaptive_portfolio_rl
    !unzip -q -o {zip_path} -d /content/adaptive_portfolio_rl
    print("‚úÖ Restored results from zip")
else:
    print("‚ÑπÔ∏è EVAL_RESTORE_FROM_ZIP=False")

## 3) Imports

In [None]:
import copy
import json
import re
from dataclasses import replace
from datetime import datetime
from pathlib import Path

import numpy as np
import pandas as pd

from src.config import get_active_config
from src.notebook_helpers.tcn_phase1 import (
    prepare_phase1_dataset,
    create_experiment6_result_stub,
    evaluate_experiment6_checkpoint,
    load_training_metadata_into_config,
    build_evaluation_track_summary,
    build_ablation_table,
    compare_agent_vs_baseline,
)

## 4) Evaluation run settings
Adjust once here.

In [None]:
# Seed forwarded to each checkpoint evaluation and to the bootstrap resampler.
EVAL_RANDOM_SEED = 42
# Root folder searched recursively for checkpoints; its `logs/` subfolder holds
# the training metadata/CSV files and receives the exported evaluation package.
EVAL_RESULTS_ROOT = Path('/content/adaptive_portfolio_rl/tcn_fusion_results')

# Deterministic policy mode: 'mean' is recommended for stable ranking.
EVAL_DETERMINISTIC_MODE = 'mean'

# Stochastic robustness checks per checkpoint.
EVAL_NUM_STOCHASTIC_RUNS = 10
# Passed as `stochastic_episode_length_limit`
# (252 is presumably one trading year of daily steps — TODO confirm).
EVAL_STOCHASTIC_EPISODE_LIMIT = 252

# Selection for ablation basket
EVAL_TOP_HW = 8         # high-watermark checkpoints by filename Sharpe tag
EVAL_TOP_PERIODIC = 4   # periodic step checkpoints by most recent step
# When True, the two most recently modified 'root' checkpoints are included.
EVAL_INCLUDE_ROOT = True
# When True, the top three 'rare' checkpoints by Sharpe tag are included.
EVAL_INCLUDE_RARE = False

# Save outputs (forwarded as `save_eval_logs` / `save_eval_artifacts`).
EVAL_SAVE_LOGS = True
EVAL_SAVE_ARTIFACTS = False

## 5) Build evaluation dataset and load latest metadata config

In [None]:
# Fail fast: every later cell reads checkpoints or logs from below this directory.
if not EVAL_RESULTS_ROOT.exists():
    raise FileNotFoundError(f"Missing results root: {EVAL_RESULTS_ROOT}")


def eval_build_core_active_feature_columns(cfg):
    """Return the phase-1 feature columns that ``DataProcessor`` reports for ``cfg``.

    Duplicates are removed while keeping the order of first occurrence.
    """
    from src.data_utils import DataProcessor

    phase1_columns = DataProcessor(cfg).get_feature_columns("phase1")
    # dict keys are unique and remember insertion order.
    unique_in_order = dict.fromkeys(phase1_columns)
    return list(unique_in_order)


def eval_apply_core_feature_lock(cfg, active_feature_columns):
    """Disable every phase-1 feature that is not in ``active_feature_columns``.

    The full column list is obtained from a deep copy of ``cfg`` in which
    feature disabling is switched off. ``cfg`` itself is then mutated in place:
    ``feature_params.feature_selection.disable_features`` is set to True and
    ``disabled_features`` to the sorted list of non-active columns.

    Returns:
        Tuple ``(core_all_cols, disabled)``: the de-duplicated full column list
        and the sorted list of columns that were disabled.
    """
    from src.data_utils import DataProcessor

    # Probe on a copy so the caller's config is not touched by the "unlock" step.
    unlocked_cfg = copy.deepcopy(cfg)
    unlocked_selection = (
        unlocked_cfg
        .setdefault("feature_params", {})
        .setdefault("feature_selection", {})
    )
    unlocked_selection["disable_features"] = False
    unlocked_selection["disabled_features"] = []

    every_column = DataProcessor(unlocked_cfg).get_feature_columns("phase1")
    core_all_cols = list(dict.fromkeys(every_column))

    keep = set(active_feature_columns)
    to_disable = sorted(col for col in core_all_cols if col not in keep)

    # Write the lock into the real config.
    selection = cfg.setdefault("feature_params", {}).setdefault("feature_selection", {})
    selection["disable_features"] = True
    selection["disabled_features"] = to_disable

    return core_all_cols, to_disable


# Start from a private copy of the active phase-1 configuration.
eval_config = copy.deepcopy(get_active_config('phase1'))

# Pick the most recently modified `*_metadata.json` in the logs folder.
eval_logs_dir = EVAL_RESULTS_ROOT / 'logs'
meta_files = sorted(eval_logs_dir.glob('*_metadata.json'), key=lambda p: p.stat().st_mtime, reverse=True)
if not meta_files:
    raise FileNotFoundError(f"No metadata JSON in {eval_logs_dir}")

EVAL_METADATA_PATH = meta_files[0]
print('üìÑ Using metadata:', EVAL_METADATA_PATH)

# Apply run-time training settings (architecture/reward/etc.) from metadata.
eval_config = load_training_metadata_into_config(
    EVAL_METADATA_PATH,
    copy.deepcopy(eval_config),
    verbose=True,
)

# Enforce architecture family used by checkpoints
# (these assignments override whatever the metadata load produced for the same keys).
eval_config['agent_params']['actor_critic_type'] = 'TCN_FUSION'
eval_config['agent_params']['use_fusion'] = True
eval_config['agent_params']['use_attention'] = False

# IMPORTANT: feature list is derived from core project pipeline, not manifest.
eval_active_feature_columns = eval_build_core_active_feature_columns(eval_config)
# Mutates eval_config in place; only the disabled list is kept from the result.
_, eval_disabled_features = eval_apply_core_feature_lock(eval_config, eval_active_feature_columns)

print('‚úÖ Eval core feature lock applied')
print('   active_feature_columns:', len(eval_active_feature_columns))
print('   disabled_features:', len(eval_disabled_features))

# Build the dataset once per runtime. NOTE: on re-runs the existing
# eval_phase1_data is reused even if eval_config has changed since it was built.
if 'eval_phase1_data' not in globals():
    eval_phase1_data = prepare_phase1_dataset(eval_config, force_download=False)
else:
    print('‚ÑπÔ∏è Reusing eval_phase1_data from current runtime')


## 6) Inspect latest training CSV logs (for diagnostics context)

In [None]:
def eval_latest_csv(pattern):
    """Return the newest file (by modification time) in ``eval_logs_dir`` matching ``pattern``.

    Returns None when nothing matches.
    """
    newest_first = sorted(
        eval_logs_dir.glob(pattern),
        key=lambda candidate: candidate.stat().st_mtime,
        reverse=True,
    )
    if not newest_first:
        return None
    return newest_first[0]

# Newest training log of each kind; any of these is None when no file matches.
eval_latest_episodes_csv = eval_latest_csv('*episodes*.csv')
eval_latest_step_diag_csv = eval_latest_csv('*step_diagnostics*.csv')
eval_latest_summary_csv = eval_latest_csv('*summary*.csv')

print('episodes:', eval_latest_episodes_csv)
print('step diagnostics:', eval_latest_step_diag_csv)
print('summary:', eval_latest_summary_csv)

# Preview the last five rows of the episode log when one exists.
if eval_latest_episodes_csv:
    eval_episodes_df = pd.read_csv(eval_latest_episodes_csv)
    display(eval_episodes_df.tail(5))

## 7) Checkpoint discovery and ablation basket construction

In [None]:
def eval_parse_sharpe_from_name(path: Path):
    """Decode the Sharpe tag embedded in a checkpoint file name.

    The tag has the form ``_sh<p|m><int>p<frac>``: ``p``/``m`` select a positive
    or negative sign and the second ``p`` stands for the decimal point.
    Returns the signed float, or None when the name carries no such tag.
    """
    found = re.search(r'_sh([pm])(\d+)p(\d+)', path.name)
    if found is None:
        return None
    sign_flag, whole_part, frac_part = found.groups()
    magnitude = float(f"{whole_part}.{frac_part}")
    return magnitude if sign_flag == 'p' else -magnitude


def eval_parse_episode(path: Path):
    """Return the integer following ``_ep`` in the file name, or None when absent."""
    found = re.search(r'_ep(\d+)', path.name)
    if found is None:
        return None
    return int(found.group(1))


def eval_parse_step(path: Path):
    """Return the integer following ``_step`` in the file name, or None when absent."""
    found = re.search(r'_step(\d+)', path.name)
    if found is None:
        return None
    return int(found.group(1))


def eval_discover_actor_files(results_root: Path):
    """Find every actor checkpoint under ``results_root`` that has a matching critic file.

    An actor file ``<prefix>_actor.weights.h5`` is kept only when
    ``<prefix>_critic.weights.h5`` exists. Each pair is classified as:

    * ``high_watermark`` / ``step_sharpe`` / ``rare`` when the parent folder is
      ``high_watermark_checkpoints`` / ``step_sharpe_checkpoints`` / ``rare_models``;
    * ``periodic_step`` when the file name carries a ``_step<N>`` tag (including step 0);
    * ``root`` otherwise.

    Args:
        results_root: Directory searched recursively.

    Returns:
        DataFrame with one row per pair and the columns ``actor_path``,
        ``critic_path``, ``checkpoint_prefix``, ``checkpoint_kind``, ``episode``,
        ``step``, ``sharpe_tag`` and ``mtime``. The columns are present even
        when no pair is found.
    """
    actor_suffix = '_actor.weights.h5'
    kind_by_parent = {
        'high_watermark_checkpoints': 'high_watermark',
        'step_sharpe_checkpoints': 'step_sharpe',
        'rare_models': 'rare',
    }
    columns = [
        'actor_path',
        'critic_path',
        'checkpoint_prefix',
        'checkpoint_kind',
        'episode',
        'step',
        'sharpe_tag',
        'mtime',
    ]

    rows = []
    for actor in sorted(results_root.rglob('*' + actor_suffix)):
        # Strip only the trailing suffix; a directory component that happens to
        # contain the same text must stay part of the prefix.
        prefix = str(actor)[:-len(actor_suffix)]
        critic = Path(prefix + '_critic.weights.h5')
        if not critic.exists():
            continue

        step = eval_parse_step(actor)
        kind = kind_by_parent.get(actor.parent.name)
        if kind is None:
            # Compare against None so a legitimate `_step0` tag still counts
            # as a periodic checkpoint (0 is falsy).
            kind = 'periodic_step' if step is not None else 'root'

        rows.append({
            'actor_path': str(actor),
            'critic_path': str(critic),
            'checkpoint_prefix': prefix,
            'checkpoint_kind': kind,
            'episode': eval_parse_episode(actor),
            'step': step,
            'sharpe_tag': eval_parse_sharpe_from_name(actor),
            'mtime': actor.stat().st_mtime,
        })

    # Passing the columns explicitly keeps the schema stable for an empty result.
    return pd.DataFrame(rows, columns=columns)


def eval_select_ablation_basket(df_ckpt: pd.DataFrame):
    """Choose the subset of discovered checkpoints to evaluate.

    Picks, in this order:
      * the 2 newest ``root`` checkpoints (when ``EVAL_INCLUDE_ROOT``);
      * the top ``EVAL_TOP_HW`` ``high_watermark`` checkpoints by Sharpe tag,
        then episode, then mtime (all descending; a missing tag ranks last);
      * the top ``EVAL_TOP_PERIODIC`` ``periodic_step`` checkpoints by step, then mtime;
      * the top 3 ``rare`` checkpoints by the same Sharpe ordering (when ``EVAL_INCLUDE_RARE``).

    Rows sharing a ``checkpoint_prefix`` are de-duplicated, keeping the first.
    Returns an empty frame with ``df_ckpt``'s columns when nothing is picked.
    """
    sharpe_order = ['sharpe_rank_key', 'episode', 'mtime']

    def _of_kind(kind):
        # Copy so the rank-key column can be added without touching df_ckpt.
        return df_ckpt[df_ckpt['checkpoint_kind'] == kind].copy()

    def _best_by_sharpe(frame, limit):
        frame['sharpe_rank_key'] = frame['sharpe_tag'].fillna(-np.inf)
        ranked = frame.sort_values(sharpe_order, ascending=[False, False, False])
        return ranked.head(limit)

    selected = []

    if EVAL_INCLUDE_ROOT:
        roots = _of_kind('root')
        if not roots.empty:
            selected.append(roots.sort_values('mtime', ascending=False).head(2))

    watermarks = _of_kind('high_watermark')
    if not watermarks.empty:
        selected.append(_best_by_sharpe(watermarks, EVAL_TOP_HW))

    periodic = _of_kind('periodic_step')
    if not periodic.empty:
        newest_steps = periodic.sort_values(['step', 'mtime'], ascending=[False, False])
        selected.append(newest_steps.head(EVAL_TOP_PERIODIC))

    if EVAL_INCLUDE_RARE:
        rare = _of_kind('rare')
        if not rare.empty:
            selected.append(_best_by_sharpe(rare, 3))

    if not selected:
        return pd.DataFrame(columns=df_ckpt.columns)

    combined = pd.concat(selected, ignore_index=True)
    return combined.drop_duplicates(subset=['checkpoint_prefix']).reset_index(drop=True)


# Discover every actor+critic pair; abort when there is nothing to evaluate.
eval_ckpt_df = eval_discover_actor_files(EVAL_RESULTS_ROOT)
if eval_ckpt_df.empty:
    raise RuntimeError(f'No valid actor+critic checkpoint pairs found under {EVAL_RESULTS_ROOT}')

print('All checkpoint pairs:', len(eval_ckpt_df))
# Count of discovered pairs per checkpoint kind.
print(eval_ckpt_df['checkpoint_kind'].value_counts().to_dict())

# Reduce the full list to the ablation basket and show up to 30 selected rows.
eval_ablation_ckpts = eval_select_ablation_basket(eval_ckpt_df)
print('Selected for ablation:', len(eval_ablation_ckpts))
display(eval_ablation_ckpts[['checkpoint_kind', 'episode', 'step', 'sharpe_tag', 'actor_path']].head(30))

## 8) Evaluate ablation basket (deterministic + stochastic)

In [None]:
def eval_run_one_checkpoint(eval_cfg, phase1_data, ckpt_prefix, seed=42):
    """Evaluate a single checkpoint and return the evaluation result.

    A result stub is created for ``ckpt_prefix`` (architecture and base agent
    parameters are taken from ``eval_cfg['agent_params']``, with
    ``use_covariance=True``) and handed to ``evaluate_experiment6_checkpoint``
    together with the module-level ``EVAL_*`` evaluation settings.

    Args:
        eval_cfg: Evaluation configuration mapping.
        phase1_data: Dataset object forwarded to the evaluator.
        ckpt_prefix: Checkpoint path prefix (without the actor/critic suffix).
        seed: Random seed used for both the stub and the evaluation.
    """
    architecture = eval_cfg['agent_params']['actor_critic_type']
    base_params = eval_cfg.get('agent_params')

    result_stub = create_experiment6_result_stub(
        random_seed=seed,
        use_covariance=True,
        architecture=architecture,
        checkpoint_path=ckpt_prefix,
        base_agent_params=base_params,
    )

    evaluation = evaluate_experiment6_checkpoint(
        experiment6=result_stub,
        phase1_data=phase1_data,
        config=eval_cfg,
        random_seed=seed,
        checkpoint_path_override=ckpt_prefix,
        deterministic_eval_mode=EVAL_DETERMINISTIC_MODE,
        num_eval_runs=EVAL_NUM_STOCHASTIC_RUNS,
        stochastic_episode_length_limit=EVAL_STOCHASTIC_EPISODE_LIMIT,
        save_eval_logs=EVAL_SAVE_LOGS,
        save_eval_artifacts=EVAL_SAVE_ARTIFACTS,
    )
    return evaluation


# Successful evaluations keyed by a unique, human-readable checkpoint label.
eval_evaluations = {}
# Every label handed out so far (failed runs included), so that checkpoints
# sharing kind/episode/step never overwrite one another in eval_evaluations.
eval_used_labels = set()
eval_total_ckpts = len(eval_ablation_ckpts)

for position, (_row_index, row) in enumerate(eval_ablation_ckpts.iterrows(), start=1):
    base_label = f"{row['checkpoint_kind']}__ep{row['episode']}__step{row['step']}"

    # Disambiguate repeats (e.g. two 'root' checkpoints without episode/step tags)
    # with a numeric suffix: base, base__2, base__3, ...
    label = base_label
    duplicate_no = 1
    while label in eval_used_labels:
        duplicate_no += 1
        label = f"{base_label}__{duplicate_no}"
    eval_used_labels.add(label)

    prefix = row['checkpoint_prefix']
    print(f"\n[{position}/{eval_total_ckpts}] Evaluating: {label}")
    try:
        ev = eval_run_one_checkpoint(eval_config, eval_phase1_data, prefix, seed=EVAL_RANDOM_SEED)
        eval_evaluations[label] = ev
    except Exception as e:
        # Best effort: one broken checkpoint must not abort the whole basket.
        print(f"‚ùå Failed {label}: {type(e).__name__}: {e}")

print('‚úÖ Completed evaluations:', len(eval_evaluations))

## 9) Ablation table and leaderboard

In [None]:
# Nothing to rank when every checkpoint evaluation failed.
if not eval_evaluations:
    raise RuntimeError('No successful evaluations to summarize.')

eval_ablation_table = build_ablation_table(eval_evaluations)

display(eval_ablation_table.head(30))

# Deterministic-first leaderboard view
# score = det_sharpe - 0.5 * det_max_drawdown - 0.1 * det_turnover.
# Missing values are filled pessimistically (-999 Sharpe, 1.0 drawdown, 1.0 turnover).
# NOTE(review): assumes det_max_drawdown is a non-negative magnitude — confirm
# against build_ablation_table.
eval_leaderboard = eval_ablation_table.copy()
eval_leaderboard['risk_adjusted_score'] = (
    eval_leaderboard['det_sharpe'].fillna(-999)
    - 0.5 * eval_leaderboard['det_max_drawdown'].fillna(1.0)
    - 0.1 * eval_leaderboard['det_turnover'].fillna(1.0)
)
# Highest score first; det_sharpe breaks ties.
eval_leaderboard = eval_leaderboard.sort_values(['risk_adjusted_score', 'det_sharpe'], ascending=False).reset_index(drop=True)

print('Top by risk-adjusted score:')
display(eval_leaderboard.head(10))

## 10) Build industry baseline returns (equal-weight, cash, and S&P 500)

In [None]:
def eval_identify_asset_column(df: pd.DataFrame):
    """Return the first known asset-identifier column present in ``df``, or None.

    Names are tried in priority order: Ticker, ticker, tic, asset, Asset, symbol, Symbol.
    """
    known_names = ('Ticker', 'ticker', 'tic', 'asset', 'Asset', 'symbol', 'Symbol')
    return next((name for name in known_names if name in df.columns), None)


def eval_identify_return_column(df: pd.DataFrame):
    """Return the first known daily-return column present in ``df``, or None.

    Names are tried in priority order: LogReturn_1d, log_return_1d, Return_1d,
    return_1d, daily_return.
    """
    known_names = ('LogReturn_1d', 'log_return_1d', 'Return_1d', 'return_1d', 'daily_return')
    return next((name for name in known_names if name in df.columns), None)


def eval_fetch_sp500_returns(start_date: pd.Timestamp, end_date: pd.Timestamp) -> pd.Series:
    """
    Fetch S&P500 daily simple returns for benchmark comparison.

    Primary source: yfinance '^GSPC' (auto-adjusted close), downloaded for
    ``start_date`` .. ``end_date`` (one day is added to the end date passed to
    the downloader). When yfinance is not importable, one installation attempt
    is made with pip for the running interpreter.

    Args:
        start_date: First date requested.
        end_date: Last date requested.

    Returns:
        Series of ``pct_change`` returns indexed by timezone-naive timestamps.
        An empty float Series when yfinance is unavailable, the download is
        empty, or any error occurs (this function never raises).
    """
    try:
        import yfinance as yf
    except Exception:
        try:
            import importlib
            import subprocess
            import sys

            # Install into the interpreter that is actually running this code;
            # unlike the `!pip` magic this is plain Python and works outside IPython.
            subprocess.check_call([sys.executable, '-m', 'pip', 'install', '-q', 'yfinance'])
            importlib.invalidate_caches()
            import yfinance as yf
        except Exception:
            print('‚ö†Ô∏è Could not install/import yfinance; SP500 benchmark disabled.')
            return pd.Series(dtype=float)

    try:
        df = yf.download('^GSPC', start=str(start_date.date()), end=str((end_date + pd.Timedelta(days=1)).date()), auto_adjust=True, progress=False)
        if df is None or df.empty or 'Close' not in df.columns:
            print('‚ö†Ô∏è SP500 download returned empty data.')
            return pd.Series(dtype=float)

        close = df['Close']
        if isinstance(close, pd.DataFrame):
            # Some yfinance versions return (field, ticker) MultiIndex columns, so
            # df['Close'] is a one-column frame; take that column as the price series.
            close = close.iloc[:, 0]
        close = close.dropna()

        ret = close.pct_change().dropna().astype(float)
        index = pd.to_datetime(ret.index)
        if getattr(index, 'tz', None) is not None:
            # Drop the timezone so the result can be aligned with naive dates.
            index = index.tz_localize(None)
        ret.index = index
        return ret
    except Exception as e:
        print(f'‚ö†Ô∏è SP500 fetch failed: {type(e).__name__}: {e}')
        return pd.Series(dtype=float)


def eval_build_baselines_from_phase1(phase1_data):
    """Build daily baseline return series from ``phase1_data.test_df``.

    Returns:
        Tuple ``(eqw, cash, sp500)`` of Series on a plain 0..n-1 index:
          * ``eqw``   - per-date mean of the simple returns of all rows on that date;
          * ``cash``  - zeros, one per date;
          * ``sp500`` - S&P 500 returns reindexed onto the test dates, with dates
            missing from the download filled with 0.0; an empty Series when the
            fetch yields nothing.

    Raises:
        ValueError: if ``test_df`` has no ``Date`` column or no recognised return column.
    """
    frame = phase1_data.test_df.copy()
    if 'Date' not in frame.columns:
        raise ValueError('test_df must contain Date column')

    ret_col = eval_identify_return_column(frame)
    if ret_col is None:
        raise ValueError('Could not identify return column in test_df')

    # Log returns are converted to simple returns; anything else is used as-is.
    raw_returns = frame[ret_col].astype(float)
    is_log_return = 'LogReturn' in ret_col or 'log' in ret_col.lower()
    frame['_simple_ret'] = np.expm1(raw_returns) if is_log_return else raw_returns

    # Industry baseline 1: equal-weight over available assets each day
    daily_mean = frame.groupby('Date')['_simple_ret'].mean()
    eqw = daily_mean.sort_index().astype(float)
    dt_index = pd.to_datetime(eqw.index)

    # Industry baseline 2: cash (0% daily return)
    cash = pd.Series(np.zeros(len(eqw)), index=dt_index, name='cash')

    # Industry baseline 3: S&P 500 (^GSPC)
    fetched = eval_fetch_sp500_returns(dt_index.min(), dt_index.max())
    if fetched.empty:
        sp500 = pd.Series(dtype=float)
    else:
        # align to model dates; missing market holidays become 0 return for alignment stability
        aligned = fetched.reindex(dt_index).fillna(0.0)
        sp500 = aligned.reset_index(drop=True) if not aligned.empty else pd.Series(dtype=float)

    # reset to plain 0..n index for compare_agent_vs_baseline
    return eqw.reset_index(drop=True), cash.reset_index(drop=True), sp500


# Build the three baseline return series once; the S&P 500 series has length 0
# when the download was unavailable.
eval_baseline_eqw, eval_baseline_cash, eval_baseline_sp500 = eval_build_baselines_from_phase1(eval_phase1_data)
print('Baseline lengths | EQW:', len(eval_baseline_eqw), 'Cash:', len(eval_baseline_cash), 'SP500:', len(eval_baseline_sp500))

## 11) Benchmark each evaluated checkpoint vs baselines

In [None]:
# One row per evaluated checkpoint: deterministic metrics plus the comparison
# against each baseline (or an '<baseline>_error' message when it failed).
benchmark_rows = []
for label, ev in eval_evaluations.items():
    # Each comparison is guarded separately so one failing baseline does not
    # hide the results of the others.
    try:
        cmp_eqw = compare_agent_vs_baseline(ev, eval_baseline_eqw)
    except Exception as e:
        cmp_eqw = {'error': str(e)}

    try:
        cmp_cash = compare_agent_vs_baseline(ev, eval_baseline_cash)
    except Exception as e:
        cmp_cash = {'error': str(e)}

    try:
        # The S&P 500 series is empty when the download was unavailable.
        if len(eval_baseline_sp500) > 0:
            cmp_sp500 = compare_agent_vs_baseline(ev, eval_baseline_sp500)
        else:
            cmp_sp500 = {'error': 'SP500 baseline unavailable'}
    except Exception as e:
        cmp_sp500 = {'error': str(e)}

    # `deterministic_metrics` may be None/empty; missing metrics become NaN.
    row = {
        'label': label,
        'det_sharpe': (ev.deterministic_metrics or {}).get('sharpe_ratio', np.nan),
        'det_return': (ev.deterministic_metrics or {}).get('annualized_return', np.nan),
        'det_mdd': (ev.deterministic_metrics or {}).get('max_drawdown_abs', np.nan),
        'det_turnover': (ev.deterministic_metrics or {}).get('turnover', np.nan),
    }

    # Flatten each comparison dict into '<baseline>_<key>' columns.
    for prefix, comp in [('eqw', cmp_eqw), ('cash', cmp_cash), ('sp500', cmp_sp500)]:
        if isinstance(comp, dict) and 'error' not in comp:
            for k, v in comp.items():
                row[f'{prefix}_{k}'] = v
        else:
            # Error dicts and non-dict results are both recorded as an error column.
            row[f'{prefix}_error'] = comp.get('error', 'unknown') if isinstance(comp, dict) else 'unknown'

    benchmark_rows.append(row)

eval_benchmark_df = pd.DataFrame(benchmark_rows)

display(eval_benchmark_df.sort_values('det_sharpe', ascending=False).head(20))

## 12) Champion selection (production candidate)

In [None]:
if eval_benchmark_df.empty:
    raise RuntimeError('No benchmark rows available.')

# Balanced production-style objective: reward risk-adjusted return, penalize drawdown/turnover.
# score = det_sharpe + 0.2 * det_return - 0.7 * det_mdd - 0.1 * det_turnover,
# with missing metrics filled pessimistically (-999 Sharpe, 0.0 return, 1.0 drawdown, 1.0 turnover).
# The score is stored as a new column on eval_benchmark_df.
eval_benchmark_df['selection_score'] = (
    eval_benchmark_df['det_sharpe'].fillna(-999)
    + 0.2 * eval_benchmark_df['det_return'].fillna(0.0)
    - 0.7 * eval_benchmark_df['det_mdd'].fillna(1.0)
    - 0.1 * eval_benchmark_df['det_turnover'].fillna(1.0)
)

# The highest-scoring row is the champion; its label indexes back into eval_evaluations.
champion_row = eval_benchmark_df.sort_values('selection_score', ascending=False).iloc[0]
EVAL_CHAMPION_LABEL = champion_row['label']
EVAL_CHAMPION = eval_evaluations[EVAL_CHAMPION_LABEL]

print('üèÜ Champion label:', EVAL_CHAMPION_LABEL)
print(champion_row[['det_sharpe', 'det_return', 'det_mdd', 'det_turnover', 'selection_score']])

## 13) Regime-sliced performance (champion vs equal-weight)

In [None]:
def eval_regime_tag(dates: pd.Series):
    """Label each date with the market regime it falls into.

    Regimes (half-open intervals, so timestamps carrying a time of day on a
    boundary day are still classified):

      * ``pre_covid``            date <  2020-02-20
      * ``covid_crash``          2020-02-20 <= date < 2020-07-01
      * ``post_covid_recovery``  2020-07-01 <= date < 2022-01-01
      * ``inflation_rates``      2022-01-01 <= date < 2024-01-01
      * ``recent``               date >= 2024-01-01

    Values that cannot be compared (e.g. NaT) are labelled ``other``.

    Args:
        dates: Date-like values accepted by ``pd.to_datetime``.

    Returns:
        Series of regime labels. When ``dates`` is a Series its index is
        preserved so the result aligns on assignment; otherwise a default
        0..n-1 index is used.
    """
    d = pd.to_datetime(dates)

    covid_start = pd.Timestamp('2020-02-20')
    recovery_start = pd.Timestamp('2020-07-01')
    inflation_start = pd.Timestamp('2022-01-01')
    recent_start = pd.Timestamp('2024-01-01')

    conds = [
        (d < covid_start),
        (d >= covid_start) & (d < recovery_start),
        (d >= recovery_start) & (d < inflation_start),
        (d >= inflation_start) & (d < recent_start),
        (d >= recent_start),
    ]
    labels = ['pre_covid', 'covid_crash', 'post_covid_recovery', 'inflation_rates', 'recent']
    out = np.select(conds, labels, default='other')

    # Keep the caller's index; a fresh RangeIndex would misalign the labels
    # when they are assigned back to a frame with a non-default index.
    index = d.index if isinstance(d, pd.Series) else None
    return pd.Series(out, index=index)


def eval_sharpe(x):
    """Annualised Sharpe ratio (252 periods, zero risk-free rate) of a return series.

    NaN entries are ignored. Returns NaN when fewer than two observations
    remain or when the sample standard deviation is at most 1e-12.
    """
    clean = pd.Series(x).dropna()
    if len(clean) < 2:
        return np.nan
    sigma = clean.std(ddof=1)
    return np.nan if sigma <= 1e-12 else np.sqrt(252.0) * clean.mean() / sigma

# Build aligned daily return series for champion and baselines
# Champion returns are the step-to-step relative changes of the deterministic portfolio values.
champ_port = np.array(EVAL_CHAMPION.deterministic_portfolio)
champ_ret = pd.Series(np.diff(champ_port) / champ_port[:-1]).reset_index(drop=True)
eqw_ret = eval_baseline_eqw.reset_index(drop=True)
sp500_ret = eval_baseline_sp500.reset_index(drop=True) if len(eval_baseline_sp500) > 0 else pd.Series(dtype=float)

# Common length across all series; one less than the number of unique test
# dates because the date column below starts at the second date.
n_core = min(len(champ_ret), len(eqw_ret), len(eval_phase1_data.test_df['Date'].drop_duplicates()) - 1)
if len(sp500_ret) > 0:
    n = min(n_core, len(sp500_ret))
else:
    n = n_core

# Dates 1..n of the sorted unique test dates (the first date is skipped).
dates = pd.to_datetime(eval_phase1_data.test_df['Date'].drop_duplicates().sort_values()).reset_index(drop=True).iloc[1:n+1]

# NOTE(review): 'Date' uses date positions 1..n while eqw_ret/sp500_ret are taken
# from positions 0..n-1, so each baseline value sits one row before its own date.
# Whether champion_ret[i] really belongs to date i+1 depends on where
# deterministic_portfolio starts — confirm the intended alignment.
reg_df = pd.DataFrame({
    'Date': dates.reset_index(drop=True),
    'champion_ret': champ_ret.iloc[:n].reset_index(drop=True),
    'eqw_ret': eqw_ret.iloc[:n].reset_index(drop=True),
})
if len(sp500_ret) > 0:
    reg_df['sp500_ret'] = sp500_ret.iloc[:n].reset_index(drop=True)
else:
    reg_df['sp500_ret'] = np.nan

reg_df['regime'] = eval_regime_tag(reg_df['Date'])

# Per-regime Sharpe and compounded total return for champion and baselines.
regime_rows = []
for regime, g in reg_df.groupby('regime'):
    row = {
        'regime': regime,
        'n_days': len(g),
        'champion_sharpe': eval_sharpe(g['champion_ret']),
        'eqw_sharpe': eval_sharpe(g['eqw_ret']),
        'champion_total_return': float((1.0 + g['champion_ret']).prod() - 1.0),
        'eqw_total_return': float((1.0 + g['eqw_ret']).prod() - 1.0),
    }
    # S&P 500 columns stay NaN when the benchmark has no data in this regime.
    if g['sp500_ret'].notna().any():
        row['sp500_sharpe'] = eval_sharpe(g['sp500_ret'])
        row['sp500_total_return'] = float((1.0 + g['sp500_ret'].fillna(0.0)).prod() - 1.0)
    else:
        row['sp500_sharpe'] = np.nan
        row['sp500_total_return'] = np.nan
    regime_rows.append(row)

# Sorted alphabetically by regime name (not chronologically).
eval_regime_df = pd.DataFrame(regime_rows).sort_values('regime').reset_index(drop=True)
display(eval_regime_df)

## 14) Statistical confidence: bootstrap Sharpe difference (champion - equal-weight, champion - S&P 500)

In [None]:
def eval_block_bootstrap_sharpe_diff(agent_ret, base_ret, n_boot=2000, block=20, seed=42):
    """Block-bootstrap the difference in annualised Sharpe ratio (agent - baseline).

    Both series are truncated to their common length ``n``. Each replicate
    concatenates ``ceil(n / block)`` blocks of ``block`` consecutive positions
    whose start is drawn uniformly from ``[0, max(1, n - block + 1))``, trims
    the result to ``n`` positions, and applies the same positions to both
    series. Replicates whose Sharpe difference is not finite are discarded.

    Args:
        agent_ret: Agent daily returns (array-like).
        base_ret: Baseline daily returns (array-like).
        n_boot: Number of bootstrap replicates.
        block: Block length in observations; must be >= 1.
        seed: Seed for ``numpy.random.default_rng``.

    Returns:
        Dict with ``n_boot_eff`` (replicates kept), ``mean``, ``ci_low`` /
        ``ci_high`` (2.5% / 97.5% quantiles) and ``p_le_zero`` (share of
        replicates with a difference <= 0). All statistics are NaN and
        ``n_boot_eff`` is 0 when no replicate is usable, including when fewer
        than two paired observations are supplied.

    Raises:
        ValueError: if ``block`` is smaller than 1.
    """
    if block < 1:
        raise ValueError(f'block must be >= 1, got {block}')

    rng = np.random.default_rng(seed)
    a = np.asarray(agent_ret, dtype=float)
    b = np.asarray(base_ret, dtype=float)
    n = min(len(a), len(b))
    a = a[:n]
    b = b[:n]

    no_result = {'n_boot_eff': 0, 'mean': np.nan, 'ci_low': np.nan, 'ci_high': np.nan, 'p_le_zero': np.nan}

    # A Sharpe ratio needs at least two observations. Returning early also avoids
    # indexing with an empty position array, which raised for empty input.
    if n < 2:
        return no_result

    def _sharpe(x):
        x = pd.Series(x).dropna()
        if len(x) < 2:
            return np.nan
        s = x.std(ddof=1)
        if s <= 1e-12:
            return np.nan
        return np.sqrt(252.0) * x.mean() / s

    diffs = []
    n_blocks = int(np.ceil(n / block))
    max_start = max(1, n - block + 1)

    for _ in range(n_boot):
        idx = []
        # Start positions are drawn one at a time so results for a given seed
        # stay reproducible.
        for __ in range(n_blocks):
            st = int(rng.integers(0, max_start))
            idx.extend(range(st, min(st + block, n)))
        # Explicit integer dtype keeps the array valid for indexing.
        idx = np.asarray(idx[:n], dtype=int)
        d = _sharpe(a[idx]) - _sharpe(b[idx])
        if np.isfinite(d):
            diffs.append(float(d))

    if not diffs:
        return no_result

    diffs = np.asarray(diffs)
    return {
        'n_boot_eff': int(len(diffs)),
        'mean': float(np.mean(diffs)),
        'ci_low': float(np.quantile(diffs, 0.025)),
        'ci_high': float(np.quantile(diffs, 0.975)),
        'p_le_zero': float(np.mean(diffs <= 0.0)),
    }

# Champion vs equal-weight, using the aligned series from the regime table.
bootstrap_eqw = eval_block_bootstrap_sharpe_diff(
    reg_df['champion_ret'].values,
    reg_df['eqw_ret'].values,
    n_boot=2000,
    block=20,
    seed=EVAL_RANDOM_SEED,
)

# Champion vs S&P 500, only when the benchmark has data; remaining gaps are
# treated as 0% return days.
if reg_df['sp500_ret'].notna().any():
    bootstrap_sp500 = eval_block_bootstrap_sharpe_diff(
        reg_df['champion_ret'].values,
        reg_df['sp500_ret'].fillna(0.0).values,
        n_boot=2000,
        block=20,
        seed=EVAL_RANDOM_SEED,
    )
else:
    # Same shape as a real result so the export cell needs no special case.
    bootstrap_sp500 = {'n_boot_eff': 0, 'mean': np.nan, 'ci_low': np.nan, 'ci_high': np.nan, 'p_le_zero': np.nan}

print('Bootstrap Sharpe diff (Champion - EQW):')
print(bootstrap_eqw)
print('Bootstrap Sharpe diff (Champion - SP500):')
print(bootstrap_sp500)

## 15) Training-diagnostics quality checks from CSV metrics
Uses saved CSVs to report KL stability, turnover drivers, and execution quality.

In [None]:
# Flat dict of summary statistics; keys are added only for columns that exist
# in the CSVs and contain at least one numeric value.
diag_report = {}

if eval_latest_episodes_csv and Path(eval_latest_episodes_csv).exists():
    ep = pd.read_csv(eval_latest_episodes_csv)
    diag_report['episodes_rows'] = len(ep)

    # Mean, median and 90th percentile of approx_kl.
    if 'approx_kl' in ep.columns:
        kl = pd.to_numeric(ep['approx_kl'], errors='coerce').dropna()
        if len(kl):
            diag_report['approx_kl_mean'] = float(kl.mean())
            diag_report['approx_kl_p50'] = float(kl.quantile(0.50))
            diag_report['approx_kl_p90'] = float(kl.quantile(0.90))

    # Pearson correlation between episode turnover and approx_kl over rows
    # where both parse as numbers.
    # NOTE(review): with a single valid row or a constant column the
    # correlation is NaN, which json.dumps writes as the non-standard token NaN.
    if {'episode_turnover_pct', 'approx_kl'}.issubset(ep.columns):
        x = pd.to_numeric(ep['episode_turnover_pct'], errors='coerce')
        y = pd.to_numeric(ep['approx_kl'], errors='coerce')
        valid = x.notna() & y.notna()
        if valid.any():
            diag_report['corr_turnoverpct_kl'] = float(np.corrcoef(x[valid], y[valid])[0, 1])

if eval_latest_step_diag_csv and Path(eval_latest_step_diag_csv).exists():
    sd = pd.read_csv(eval_latest_step_diag_csv)
    diag_report['step_diag_rows'] = len(sd)

    # Mean and 90th percentile for each step-level diagnostic column that is present.
    for col in ['l1_w_delta', 'turnover_penalty_contrib', 'tx_cost_contrib_reward_pts', 'action_realization_l1']:
        if col in sd.columns:
            s = pd.to_numeric(sd[col], errors='coerce').dropna()
            if len(s):
                diag_report[f'{col}_mean'] = float(s.mean())
                diag_report[f'{col}_p90'] = float(s.quantile(0.90))

print(json.dumps(diag_report, indent=2))

## 16) Save final evaluation package
Exports leaderboard, benchmark table, regime table, diagnostics, and champion metadata.

In [None]:
# Outputs go next to the training logs.
eval_out_dir = EVAL_RESULTS_ROOT / 'logs'
eval_out_dir.mkdir(parents=True, exist_ok=True)

# One timestamp (local time, second resolution) shared by all files of this export.
ts = datetime.now().strftime('%Y%m%d_%H%M%S')

eval_ablation_path = eval_out_dir / f'final_eval_ablation_{ts}.csv'
eval_benchmark_path = eval_out_dir / f'final_eval_benchmark_{ts}.csv'
eval_regime_path = eval_out_dir / f'final_eval_regime_{ts}.csv'
eval_diag_path = eval_out_dir / f'final_eval_diagnostics_{ts}.json'
eval_meta_path = eval_out_dir / f'final_eval_champion_{ts}.json'

eval_ablation_table.to_csv(eval_ablation_path, index=False)
eval_benchmark_df.to_csv(eval_benchmark_path, index=False)
eval_regime_df.to_csv(eval_regime_path, index=False)

# Bootstrap results and training diagnostics.
# NOTE: NaN values are written as the non-standard JSON token NaN.
with open(eval_diag_path, 'w', encoding='utf-8') as f:
    json.dump({
        'bootstrap_sharpe_diff_eqw': bootstrap_eqw,
        'bootstrap_sharpe_diff_sp500': bootstrap_sp500,
        'diagnostics': diag_report,
        'metadata_path': str(EVAL_METADATA_PATH),
    }, f, indent=2)

# Champion metadata; `default=str` stringifies any value json cannot encode natively.
with open(eval_meta_path, 'w', encoding='utf-8') as f:
    json.dump({
        'champion_label': EVAL_CHAMPION_LABEL,
        'selection_row': champion_row.to_dict(),
        'deterministic_metrics': EVAL_CHAMPION.deterministic_metrics,
        'checkpoint_description': EVAL_CHAMPION.checkpoint_description,
    }, f, indent=2, default=str)

print('‚úÖ Saved evaluation package:')
print('-', eval_ablation_path)
print('-', eval_benchmark_path)
print('-', eval_regime_path)
print('-', eval_diag_path)
print('-', eval_meta_path)