# TCN Final Evaluation Notebook

This notebook is evaluation-only and designed for final model selection, ablations, and benchmarking.
Runtime variables use the `eval_` prefix (constants use `EVAL_`) to avoid conflicts with training notebooks.

## 1) Connect to Colab VM and sync repository
Run this first in a fresh Colab runtime.

In [1]:
import os

EVAL_REPO_URL = "https://github.com/Dave-DKings/tape_tcn_project.git"
EVAL_REPO_DIR = "/content/adaptive_portfolio_rl"

if not os.path.exists(f"{EVAL_REPO_DIR}/.git"):
    !git clone {EVAL_REPO_URL} {EVAL_REPO_DIR}

%cd /content/adaptive_portfolio_rl
!git fetch origin
!git reset --hard origin/main

/content/adaptive_portfolio_rl
HEAD is now at 2fb3a67 Harden mixed-precision dtype handling across train and eval


## 2) Optional: mount Drive and restore saved results zip
Set `EVAL_RESTORE_FROM_ZIP=True` only when saved results must be restored from the Drive zip at `EVAL_ZIP_PATH`; leave it `False` otherwise.

In [2]:
from pathlib import Path

EVAL_RESTORE_FROM_ZIP = True
EVAL_ZIP_PATH = "/content/drive/MyDrive/tcn_fusion_results_run3.zip"

if EVAL_RESTORE_FROM_ZIP:
    from google.colab import drive
    drive.mount('/content/drive')

    zip_path = Path(EVAL_ZIP_PATH)
    if not zip_path.exists():
        raise FileNotFoundError(f"Zip not found: {zip_path}")

    !mkdir -p /content/adaptive_portfolio_rl
    !unzip -q -o {zip_path} -d /content/adaptive_portfolio_rl
    print("‚úÖ Restored results from zip")
else:
    print("‚ÑπÔ∏è EVAL_RESTORE_FROM_ZIP=False")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
‚úÖ Restored results from zip


In [3]:
from pathlib import Path

EVAL_ZIP_PATH = Path("/content/drive/MyDrive/tcn_fusion_results_run3.zip")

# 1) Verify zip exists
print("zip exists:", EVAL_ZIP_PATH.exists())
if not EVAL_ZIP_PATH.exists():
    raise FileNotFoundError(EVAL_ZIP_PATH)

# 2) Inspect zip top-level structure
!unzip -l "{EVAL_ZIP_PATH}" | head -n 40

# 3) Extract to /content (clean target)
!mkdir -p /content/eval_restore
!unzip -q -o "{EVAL_ZIP_PATH}" -d /content/eval_restore

# 4) Auto-detect correct results root
candidates = [
    Path("/content/eval_restore/tcn_fusion_results"),
    Path("/content/eval_restore/tcn_fusion_results_run3/tcn_fusion_results"),
    Path("/content/eval_restore/tcn_fusion_results_run3"),
]
for c in candidates:
    print(c, "logs:", (c / "logs").exists(), "actors:", len(list(c.rglob("*_actor.weights.h5"))))

EVAL_RESULTS_ROOT = next(
    c for c in candidates
    if (c / "logs").exists() and len(list(c.rglob("*_actor.weights.h5"))) > 0
)
print("‚úÖ EVAL_RESULTS_ROOT =", EVAL_RESULTS_ROOT)


zip exists: True
Archive:  /content/drive/MyDrive/tcn_fusion_results_run3.zip
  Length      Date    Time    Name
---------  ---------- -----   ----
        0  2026-02-22 15:15   tcn_fusion_results/
        0  2026-02-22 19:42   tcn_fusion_results/high_watermark_checkpoints/
  1897264  2026-02-22 16:17   tcn_fusion_results/high_watermark_checkpoints/exp6_tape_hw_ep00067_shp0p602_actor.weights.h5
  1893680  2026-02-22 16:45   tcn_fusion_results/high_watermark_checkpoints/exp6_tape_hw_ep00082_shp0p560_critic.weights.h5
  1897264  2026-02-22 17:26   tcn_fusion_results/high_watermark_checkpoints/exp6_tape_hw_ep00102_shp0p702_actor.weights.h5
  1893680  2026-02-22 15:43   tcn_fusion_results/high_watermark_checkpoints/exp6_tape_hw_ep00042_shp0p994_critic.weights.h5
  1893680  2026-02-22 17:44   tcn_fusion_results/high_watermark_checkpoints/exp6_tape_hw_ep00111_shp0p553_critic.weights.h5
  1893680  2026-02-22 15:27   tcn_fusion_results/high_watermark_checkpoints/exp6_tape_hw_ep00018_shp0p761_c

In [4]:
# Install project requirements in Colab VM
import subprocess, sys
from pathlib import Path

# Location of the synced repository and its pinned dependency list.
REPO_DIR = Path("/content/adaptive_portfolio_rl")
REQ_FILE = REPO_DIR / "requirements.txt"

if not REQ_FILE.exists():
    raise FileNotFoundError(f"Missing requirements file: {REQ_FILE}")

print("Using python:", sys.executable)

# Two pip invocations against the running interpreter: packaging tools first,
# then the project requirements. check=True aborts the cell on a non-zero exit.
_pip_install = [sys.executable, "-m", "pip", "install"]
for _pip_args in (
    ["--upgrade", "pip", "setuptools", "wheel"],
    ["-r", str(REQ_FILE)],
):
    subprocess.run(_pip_install + _pip_args, check=True)

print("‚úÖ Requirements installed")

Using python: /usr/bin/python3
‚úÖ Requirements installed


In [5]:
import tensorflow as tf
# Set the global Keras dtype policy to float32 for this evaluation session,
# then print the active policy to confirm it took effect.
tf.keras.mixed_precision.set_global_policy("float32")
print(tf.keras.mixed_precision.global_policy())

<DTypePolicy "float32">


## 3) Imports

In [6]:
import copy
import json
import re
from dataclasses import replace
from datetime import datetime
from pathlib import Path

import numpy as np
import pandas as pd

from src.config import get_active_config
from src.data_utils import DataProcessor
from src.notebook_helpers.tcn_phase1 import (
    prepare_phase1_dataset,
    create_experiment6_result_stub,
    evaluate_experiment6_checkpoint,
    load_training_metadata_into_config,
    build_evaluation_track_summary,
    build_ablation_table,
    compare_agent_vs_baseline,
    Phase1Dataset,
    split_dataset_by_date,
    identify_covariance_columns,
)

## 4) Evaluation run settings
Adjust once here.

In [None]:
EVAL_RANDOM_SEED = 42

# Keep a results root that an earlier cell already detected; fall back to the
# default restore location only when none is defined. Unconditionally assigning
# the literal path here would silently discard the auto-detected root.
EVAL_RESULTS_ROOT = Path(
    globals().get("EVAL_RESULTS_ROOT", "/content/eval_restore/tcn_fusion_results")
)

# Deterministic / stochastic policy modes forwarded to the checkpoint evaluator.
# NOTE(review): the previous comment recommended 'mean' for stable ranking while
# the configured value is 'mode' — confirm which spelling the evaluator expects.
EVAL_DETERMINISTIC_MODE = 'mode'
EVAL_STOCHASTIC_MODE = "sample"

# Stochastic robustness checks per checkpoint.
EVAL_NUM_STOCHASTIC_RUNS = 5
EVAL_STOCHASTIC_EPISODE_LIMIT = 252

# Selection for ablation basket
EVAL_TOP_HW = 8         # high-watermark checkpoints by filename Sharpe tag
EVAL_TOP_PERIODIC = 4   # periodic step checkpoints by most recent step
EVAL_INCLUDE_ROOT = True
EVAL_INCLUDE_RARE = False

# Save outputs
EVAL_SAVE_LOGS = True
EVAL_SAVE_ARTIFACTS = True

In [8]:
# Quick sanity check of the results root: does logs/ exist, and how many
# actor weight files are found anywhere below the root?
print((EVAL_RESULTS_ROOT / "logs").exists())
print(len(list(EVAL_RESULTS_ROOT.rglob("*_actor.weights.h5"))))

True
104


In [9]:
# Labeled variant of the results-root sanity check: root directory, logs/
# directory, and the recursive count of actor weight files.
print("root exists:", EVAL_RESULTS_ROOT.exists())
print("logs exists:", (EVAL_RESULTS_ROOT / "logs").exists())
print("actor ckpts:", len(list(EVAL_RESULTS_ROOT.rglob("*_actor.weights.h5"))))

root exists: True
logs exists: True
actor ckpts: 104


## 5) Build evaluation dataset and load latest metadata config

In [10]:
# Drop any dataset left in the namespace by a previous run; no-op when absent.
globals().pop("eval_phase1_data", None)

In [11]:
# ============================================================================
# EVAL CONFIG + FEATURE LOCK (metadata-trained layout) + DATASET BUILD
# ============================================================================

from src.config import get_active_config
from src.data_utils import DataProcessor
from src.notebook_helpers.tcn_phase1 import (
    load_training_metadata_into_config,
    Phase1Dataset,
    prepare_phase1_dataset,
    split_dataset_by_date,
    identify_covariance_columns,
)

if not EVAL_RESULTS_ROOT.exists():
    raise FileNotFoundError(f"Missing results root: {EVAL_RESULTS_ROOT}")


def eval_extract_trained_state_layout(metadata_dict: dict):
    """Pull the trained state layout and its active feature columns from run metadata.

    The layout is looked up under ``Architecture_Settings`` in
    ``agent_params_effective`` first and ``agent_params_template`` second; the
    first non-empty dict found under ``state_layout`` is used.

    Returns:
        ``(layout, active_columns)`` where ``layout`` is the dict found in the
        metadata and ``active_columns`` is its ``active_feature_columns`` list
        with duplicates removed (first occurrence kept, order preserved).

    Raises:
        ValueError: if no non-empty ``state_layout`` dict exists, or if its
            ``active_feature_columns`` is not a non-empty list.
    """
    arch_settings = metadata_dict.get("Architecture_Settings", {}) or {}

    # Effective parameters take priority over the template.
    state_layout = None
    for params_key in ("agent_params_effective", "agent_params_template"):
        params = arch_settings.get(params_key, {}) or {}
        candidate = params.get("state_layout")
        if isinstance(candidate, dict) and candidate:
            state_layout = candidate
            break

    if state_layout is None:
        raise ValueError("Could not find state_layout in metadata (agent_params_effective/template).")

    columns = state_layout.get("active_feature_columns")
    if not (isinstance(columns, list) and columns):
        raise ValueError("state_layout.active_feature_columns missing/empty in metadata.")

    # dict.fromkeys de-duplicates while keeping first-seen order.
    return state_layout, list(dict.fromkeys(columns))


def eval_apply_metadata_feature_lock(cfg, trained_active_feature_columns):
    """
    Restrict feature selection in ``cfg`` to the trained layout, keeping existing drops.

    The candidate universe is the phase1 column list reported by a
    ``DataProcessor`` built from a copy of ``cfg`` with feature disabling
    switched off, extended with any trained columns it does not report
    (e.g., Actuarial_* runtime groups) so counts stay aligned with training.

    ``cfg`` is mutated in place: ``feature_selection.disable_features`` is set
    to True and ``feature_selection.disabled_features`` to the sorted union of
    the previously disabled features and every candidate column that is not
    part of the trained layout.

    Returns:
        ``(core_all_cols, disabled)`` — the candidate universe and the final
        sorted disabled list.
    """
    # Probe on a deep copy so switching the disables off never touches cfg.
    unlocked_cfg = copy.deepcopy(cfg)
    unlocked_selection = (
        unlocked_cfg.setdefault("feature_params", {}).setdefault("feature_selection", {})
    )
    unlocked_selection["disable_features"] = False
    unlocked_selection["disabled_features"] = []

    probe_processor = DataProcessor(unlocked_cfg)
    core_all_cols = list(dict.fromkeys(probe_processor.get_feature_columns("phase1")))

    # Ensure runtime-only trained columns are represented in lock universe.
    known_cols = set(core_all_cols)
    for trained_col in trained_active_feature_columns:
        if trained_col not in known_cols:
            known_cols.add(trained_col)
            core_all_cols.append(trained_col)

    # Every candidate column outside the trained layout gets disabled.
    untrained_cols = set(core_all_cols) - set(trained_active_feature_columns)

    # Preserve preconfigured drops (audit + curated disables).
    preconfigured = (
        cfg.get("feature_params", {})
        .get("feature_selection", {})
        .get("disabled_features", [])
    )
    disabled = sorted(set(preconfigured) | untrained_cols)

    selection = cfg.setdefault("feature_params", {}).setdefault("feature_selection", {})
    selection["disable_features"] = True
    selection["disabled_features"] = disabled

    return core_all_cols, disabled


def eval_bind_trained_feature_layout(processor, trained_active_feature_columns):
    """
    Pin the processor's phase1 feature list to the exact trained state layout.

    ``processor.get_feature_columns`` is replaced on the instance: a phase
    whose string form equals 'phase1' (case-insensitive) gets a fresh copy of
    the de-duplicated trained columns; every other phase is delegated to the
    processor's original method. This prevents runtime-family omissions
    (e.g., actuarial) when loading from a pre-normalized CSV without
    re-running full feature engineering.

    Returns:
        The same ``processor`` object, patched in place.
    """
    locked_columns = list(dict.fromkeys(trained_active_feature_columns))
    original_getter = processor.get_feature_columns

    def _locked_get_feature_columns(phase='phase1'):
        wants_phase1 = str(phase).lower() == 'phase1'
        # Return a copy so callers cannot mutate the locked list.
        return locked_columns.copy() if wants_phase1 else original_getter(phase)

    processor.get_feature_columns = _locked_get_feature_columns
    return processor


# ------------------------------------------------------------------
# Build eval config from latest metadata
# ------------------------------------------------------------------
# Start from a private copy of the active phase1 config so later edits stay local.
eval_config = copy.deepcopy(get_active_config("phase1"))

# Pick the most recently modified *_metadata.json in the restored logs directory.
eval_logs_dir = EVAL_RESULTS_ROOT / "logs"
meta_files = sorted(eval_logs_dir.glob("*_metadata.json"), key=lambda p: p.stat().st_mtime, reverse=True)
if not meta_files:
    raise FileNotFoundError(f"No metadata JSON in {eval_logs_dir}")

EVAL_METADATA_PATH = meta_files[0]
print("üìÑ Using metadata:", EVAL_METADATA_PATH)

# Keep the raw metadata dict as well; the state layout is read from it below.
with open(EVAL_METADATA_PATH, "r", encoding="utf-8") as f:
    eval_metadata = json.load(f)

# Apply the training-run metadata on top of the config copy.
eval_config = load_training_metadata_into_config(
    EVAL_METADATA_PATH,
    copy.deepcopy(eval_config),
    verbose=True,
)

# Enforce architecture family used by checkpoints
eval_config["agent_params"]["actor_critic_type"] = "TCN_FUSION"
eval_config["agent_params"]["use_fusion"] = True
eval_config["agent_params"]["use_attention"] = False

# Extract trained state layout and lock features to it
trained_state_layout, trained_active_feature_columns = eval_extract_trained_state_layout(eval_metadata)

# Keep layout in config for agent reconstruction compatibility
# Missing or falsy dims fall back to 0; num_assets falls back to 10.
# NOTE(review): the fallback of 10 assets is a hard-coded assumption — confirm.
eval_config["agent_params"]["state_layout"] = copy.deepcopy(trained_state_layout)
eval_config["agent_params"]["asset_feature_dim"] = int(trained_state_layout.get("asset_feature_dim", 0) or 0)
eval_config["agent_params"]["global_feature_dim"] = int(trained_state_layout.get("global_feature_dim", 0) or 0)
eval_config["agent_params"]["num_assets"] = int(trained_state_layout.get("num_assets", 10) or 10)

# Mutates eval_config's feature_selection in place and returns the candidate
# universe plus the final disabled list.
core_all_cols, eval_disabled_features = eval_apply_metadata_feature_lock(
    eval_config, trained_active_feature_columns
)

# Summary of the lock for visual verification.
act_trained = [c for c in trained_active_feature_columns if c.startswith("Actuarial_")]
print("‚úÖ Eval metadata feature lock applied")
print("   trained active_feature_columns:", len(trained_active_feature_columns))
print("   trained actuarial columns:", len(act_trained), act_trained)
print("   core feature_columns (+runtime groups):", len(core_all_cols))
print("   disabled_features:", len(eval_disabled_features))
# NOTE(review): this difference equals the active count only when every disabled
# feature is also in core_all_cols; preconfigured disables outside that list
# would skew it — confirm before relying on the number.
print("   expected active after lock:", len(core_all_cols) - len(eval_disabled_features))
print("   state_layout asset/global dims:",
      trained_state_layout.get("asset_feature_dim"),
      trained_state_layout.get("global_feature_dim"))

# ------------------------------------------------------------------
# Build eval dataset from SAVED normalized master features (no rebuild)
# ------------------------------------------------------------------
# Switches for this cell: load the saved normalized CSV instead of rebuilding,
# and always discard any dataset left over in the runtime.
EVAL_USE_SAVED_NORMALIZED = True
EVAL_FORCE_REBUILD_PHASE1 = True  # IMPORTANT: avoid stale globals from prior runs

if EVAL_FORCE_REBUILD_PHASE1 and "eval_phase1_data" in globals():
    del eval_phase1_data
    print("üßπ Cleared stale eval_phase1_data from runtime")

# With EVAL_FORCE_REBUILD_PHASE1=True the name was just deleted, so the reuse
# branch below is only reachable when the flag is turned off.
if "eval_phase1_data" in globals():
    print("‚ÑπÔ∏è Reusing eval_phase1_data from current runtime")
else:
    if not EVAL_USE_SAVED_NORMALIZED:
        # Full rebuild through the project helper, then pin the feature list
        # to the trained layout.
        eval_phase1_data = prepare_phase1_dataset(eval_config, force_download=False)
        eval_phase1_data.data_processor = eval_bind_trained_feature_layout(
            eval_phase1_data.data_processor,
            trained_active_feature_columns,
        )
    else:
        # Search order for the saved normalized master CSV; the first existing path wins.
        normalized_candidates = [
            EVAL_RESULTS_ROOT / "data" / "master_features_NORMALIZED.csv",
            Path("/content/adaptive_portfolio_rl/tcn_fusion_results_export2/data/master_features_NORMALIZED.csv"),
            Path(eval_config.get("BASE_DATA_PATH", "/content/adaptive_portfolio_rl/data")) / "master_features_NORMALIZED.csv",
            Path("/content/adaptive_portfolio_rl/data/master_features_NORMALIZED.csv"),
        ]
        normalized_path = next((p for p in normalized_candidates if p.exists()), None)
        if normalized_path is None:
            raise FileNotFoundError(
                "Could not find master_features_NORMALIZED.csv in expected locations:\n"
                + "\n".join(str(p) for p in normalized_candidates)
            )

        print("üì¶ Loading normalized master from:", normalized_path)
        master_df_norm = pd.read_csv(normalized_path)

        # Both key columns are required by the sort and filtering below.
        if "Date" not in master_df_norm.columns:
            raise ValueError("Normalized CSV missing required 'Date' column")
        if "Ticker" not in master_df_norm.columns:
            raise ValueError("Normalized CSV missing required 'Ticker' column")

        # Parse dates as UTC, then drop the timezone so they compare against the
        # naive analysis bounds; unparseable dates become NaT and are dropped.
        master_df_norm["Date"] = pd.to_datetime(
            master_df_norm["Date"], utc=True, errors="coerce"
        ).dt.tz_localize(None)
        master_df_norm = master_df_norm.dropna(subset=["Date"]).sort_values(["Date", "Ticker"]).reset_index(drop=True)

        # Restrict to the configured analysis window (inclusive on both ends).
        analysis_start = pd.to_datetime(eval_config.get("ANALYSIS_START_DATE", "2003-09-02"))
        analysis_end = pd.to_datetime(eval_config.get("ANALYSIS_END_DATE", "2025-09-01"))
        master_df_norm = master_df_norm[
            (master_df_norm["Date"] >= analysis_start) &
            (master_df_norm["Date"] <= analysis_end)
        ].copy()

        # Every trained feature must be present in the CSV; fail with a sample otherwise.
        missing_trained = [c for c in trained_active_feature_columns if c not in master_df_norm.columns]
        if missing_trained:
            raise ValueError(
                f"Saved normalized CSV missing {len(missing_trained)} trained active columns. "
                f"Sample: {missing_trained[:10]}"
            )

        # Processor whose phase1 feature list is pinned to the trained columns.
        eval_processor = DataProcessor(eval_config)
        eval_processor = eval_bind_trained_feature_layout(eval_processor, trained_active_feature_columns)

        # Split on the configured date when one is set, otherwise 80/20 by time.
        split_date = eval_config.get("TRAIN_TEST_SPLIT_DATE")
        if split_date:
            train_df, test_df, train_end_date, test_start_date = split_dataset_by_date(
                master_df_norm, date_column="Date", split_date=split_date
            )
        else:
            train_df, test_df, train_end_date, test_start_date = split_dataset_by_date(
                master_df_norm, date_column="Date", train_fraction=0.8
            )

        # scalers is left empty here — presumably because the CSV is already
        # normalized; TODO confirm nothing downstream needs the fitted scalers.
        eval_phase1_data = Phase1Dataset(
            master_df=master_df_norm,
            train_df=train_df,
            test_df=test_df,
            scalers={},
            train_end_date=train_end_date,
            test_start_date=test_start_date,
            covariance_columns=identify_covariance_columns(master_df_norm.columns),
            data_processor=eval_processor,
        )

        print("‚úÖ Built eval_phase1_data from saved normalized master")
        print("   Train shape:", eval_phase1_data.train_df.shape)
        print("   Test shape:", eval_phase1_data.test_df.shape)
        print("   Covariance cols:", len(eval_phase1_data.covariance_columns))



üìÑ Using metadata: /content/eval_restore/tcn_fusion_results/logs/Exp6_TCN_FUSION_Enhanced_TAPE_training_20260222_151442_metadata.json
‚úÖ Applied training metadata to config
   Metadata: /content/eval_restore/tcn_fusion_results/logs/Exp6_TCN_FUSION_Enhanced_TAPE_training_20260222_151442_metadata.json
   Run timestamp: 20260222_151442
   Architecture: TCN_FUSION
   Turnover target: 0.35
   DSR scalar: 2.0
   PPO update timesteps: scheduled
   Episode length curriculum: True
   RA-KL enabled: True
   Profile override loaded: True
   Credit assignment mode: step_reward_plus_terminal_bonus
   Retroactive episode scaling: False
‚úÖ Eval metadata feature lock applied
   trained active_feature_columns: 73
   core feature_columns: 75
   disabled_features: 25
   expected active after lock: 50
   state_layout asset/global dims: 37 36
üì¶ Loading normalized master from: /content/adaptive_portfolio_rl/data/master_features_NORMALIZED.csv
‚úÇÔ∏è  TIME-BASED TRAIN/TEST SPLIT (80/20 split)
   Train

In [12]:
# Report split shapes and the date span covered by the test split.
print(eval_phase1_data.train_df.shape, eval_phase1_data.test_df.shape)
print("Date min/max test:", eval_phase1_data.test_df["Date"].min(), eval_phase1_data.test_df["Date"].max())

# Confirm which phase1 features the processor reports, and how many of them
# belong to the Actuarial_* family.
_eval_used = list(dict.fromkeys(eval_phase1_data.data_processor.get_feature_columns("phase1")))
_eval_act = [c for c in _eval_used if c.startswith("Actuarial_")]
print("Eval phase1 feature count:", len(_eval_used))
print("Eval actuarial features:", len(_eval_act), _eval_act)


(43867, 105) (11030, 105)
Date min/max test: 2021-04-12 00:00:00 2025-08-29 00:00:00


## 6) Inspect latest training CSV logs (for diagnostics context)

In [13]:
# Stop early when the logs directory is absent; the lookups below glob inside it.
if not eval_logs_dir.exists():
    raise FileNotFoundError(f"Missing logs dir: {eval_logs_dir}")

def eval_latest_csv(pattern):
    """Return the most recently modified file in ``eval_logs_dir`` matching ``pattern``.

    ``pattern`` is a glob relative to the logs directory. Returns ``None``
    when nothing matches.
    """
    matches = eval_logs_dir.glob(pattern)
    return max(matches, key=lambda candidate: candidate.stat().st_mtime, default=None)

# Newest file of each training-log type; each is None when no file matches.
eval_latest_episodes_csv = eval_latest_csv('*episodes*.csv')
eval_latest_step_diag_csv = eval_latest_csv('*step_diagnostics*.csv')
eval_latest_summary_csv = eval_latest_csv('*summary*.csv')

print('episodes:', eval_latest_episodes_csv)
print('step diagnostics:', eval_latest_step_diag_csv)
print('summary:', eval_latest_summary_csv)

# Show only the last few episode rows for context.
if eval_latest_episodes_csv:
    eval_episodes_df = pd.read_csv(eval_latest_episodes_csv)
    display(eval_episodes_df.tail(5))

episodes: /content/eval_restore/tcn_fusion_results/logs/Exp6_TCN_FUSION_Enhanced_TAPE_training_20260222_151442_episodes.csv
step diagnostics: /content/eval_restore/tcn_fusion_results/logs/Exp6_TCN_FUSION_Enhanced_TAPE_training_20260222_151442_step_diagnostics.csv
summary: /content/eval_restore/tcn_fusion_results/logs/Exp6_TCN_FUSION_Enhanced_TAPE_training_20260222_151442_summary.csv


Unnamed: 0,update,timestep,episode,elapsed_time,episode_return_pct,episode_sharpe,episode_sortino,episode_max_dd,episode_volatility,episode_win_rate,...,actor_grad_norm,critic_grad_norm,alpha_min,alpha_max,alpha_mean,ratio_mean,ratio_std,drawdown_lambda_peak,episode_length,termination_reason
44,225,92416,158,14838.857658,48.391193,0.692723,1.042249,13.488559,0.125335,52.03575,...,1.625257,2.862172,1.132812,3.09668,1.94873,1.010993,0.241467,0.0,1008.0,episode_limit
45,230,94656,160,15195.194719,83.526459,0.871294,1.223971,17.313385,0.167852,54.617676,...,1.562851,1.024781,1.135742,3.087891,1.935547,0.975577,0.205509,0.000813,1008.0,episode_limit
46,235,96896,162,15554.685364,86.818874,1.21488,1.606043,12.643,0.118229,53.525323,...,1.066061,2.458133,1.217773,3.195312,1.929199,1.015326,0.333674,0.0,1008.0,episode_limit
47,240,99136,165,15911.863909,57.332436,0.690668,0.948153,15.314398,0.152376,53.823237,...,1.264512,1.392787,1.262695,2.782227,1.970703,0.993374,0.183947,0.0,1008.0,episode_limit
48,242,100000,166,16050.095959,41.295427,0.615622,0.909083,12.647292,0.12008,51.340616,...,1.1152,3.511396,1.214844,2.888672,1.931152,0.968386,0.240597,0.0,1008.0,episode_limit


## 7) Checkpoint discovery and ablation basket construction

In [20]:
# ============================================================================
# CHECKPOINT DISCOVERY + ABLATION BASKET (with forced episode includes)
# ============================================================================

#import re
#from pathlib import Path
#import numpy as np
#import pandas as pd

# -----------------------
# Config knobs
# -----------------------
# Each knob keeps a value already defined in the runtime (e.g. by the settings
# cell) and otherwise falls back to the default given here.
EVAL_TOP_ROOT = globals().get("EVAL_TOP_ROOT", 2)
EVAL_TOP_HW = globals().get("EVAL_TOP_HW", 8)
EVAL_TOP_PERIODIC = globals().get("EVAL_TOP_PERIODIC", 4)
EVAL_TOP_RARE = globals().get("EVAL_TOP_RARE", 3)

EVAL_INCLUDE_ROOT = globals().get("EVAL_INCLUDE_ROOT", True)
EVAL_INCLUDE_RARE = globals().get("EVAL_INCLUDE_RARE", False)

# Force-include these episodes even if they are not in top-sharpe basket
# Hard-set forced episodes for this run
EVAL_FORCE_EPISODES = [90, 102, 106, 118, 134, 144, 162, 166]
# Normalize to a sorted list of unique ints.
EVAL_FORCE_EPISODES = sorted({int(x) for x in EVAL_FORCE_EPISODES})

# The results root must come from an earlier cell; it is not defaulted here.
if "EVAL_RESULTS_ROOT" not in globals():
    raise NameError("EVAL_RESULTS_ROOT is not defined")

# Accept either a str or a Path for the root.
EVAL_RESULTS_ROOT = Path(EVAL_RESULTS_ROOT)
if not EVAL_RESULTS_ROOT.exists():
    raise FileNotFoundError(f"Missing results root: {EVAL_RESULTS_ROOT}")


# -----------------------
# Parse helpers
# -----------------------
def _ckpt_name(x) -> str:
    """Return the file name for ``Path`` inputs and ``str(x)`` for anything else."""
    if isinstance(x, Path):
        return x.name
    return str(x)


def eval_parse_sharpe_from_name(x):
    """Decode the signed Sharpe tag embedded in a checkpoint name.

    Supports ``..._shp1p234...`` (positive) and ``..._shm0p456...`` (negative),
    where the inner ``p`` stands for the decimal point. Returns a float, or
    ``None`` when the name carries no such tag.
    """
    found = re.search(r"_sh([pm])(\d+)p(\d+)", _ckpt_name(x))
    if found is None:
        return None
    polarity, whole, fraction = found.groups()
    magnitude = float(f"{whole}.{fraction}")
    return magnitude if polarity == "p" else -magnitude


def eval_parse_episode(x):
    """Return the integer following ``_ep`` in the name, or ``None`` if absent."""
    found = re.search(r"_ep(\d+)", _ckpt_name(x))
    return None if found is None else int(found.group(1))


def eval_parse_step(x):
    """Return the integer following ``_step`` in the name, or ``None`` if absent."""
    found = re.search(r"_step(\d+)", _ckpt_name(x))
    return None if found is None else int(found.group(1))


# -----------------------
# Discovery
# -----------------------
def eval_discover_actor_files(results_root: Path) -> pd.DataFrame:
    """Index every actor/critic checkpoint pair found below ``results_root``.

    Actor files are found recursively via ``*_actor.weights.h5``; an actor is
    kept only if the matching ``*_critic.weights.h5`` exists next to it.

    The checkpoint kind comes from the parent directory name
    (``high_watermark_checkpoints``, ``step_sharpe_checkpoints``,
    ``rare_models``); elsewhere it is ``periodic_step`` when the file name
    carries a ``_step`` tag and ``root`` otherwise.

    Returns:
        DataFrame with columns actor_path, critic_path, checkpoint_prefix,
        checkpoint_kind, episode, step, sharpe_tag, mtime (empty, with the
        same columns, when no pair is found).
    """
    actor_suffix = "_actor.weights.h5"
    kind_by_parent = {
        "high_watermark_checkpoints": "high_watermark",
        "step_sharpe_checkpoints": "step_sharpe",
        "rare_models": "rare",
    }
    cols = [
        "actor_path", "critic_path", "checkpoint_prefix",
        "checkpoint_kind", "episode", "step", "sharpe_tag", "mtime"
    ]

    rows = []
    for actor in sorted(results_root.rglob(f"*{actor_suffix}")):
        # Strip only the trailing suffix. str.replace would also rewrite a
        # parent directory whose name happens to contain the same token,
        # producing a prefix that points at a non-existent location.
        prefix = str(actor)[: -len(actor_suffix)]
        critic = Path(prefix + "_critic.weights.h5")
        if not critic.exists():
            continue

        step = eval_parse_step(actor)
        kind = kind_by_parent.get(actor.parent.name)
        if kind is None:
            kind = "periodic_step" if step is not None else "root"

        rows.append({
            "actor_path": str(actor),
            "critic_path": str(critic),
            "checkpoint_prefix": prefix,
            "checkpoint_kind": kind,
            "episode": eval_parse_episode(actor),
            "step": step,
            "sharpe_tag": eval_parse_sharpe_from_name(actor),
            "mtime": actor.stat().st_mtime,
        })

    # An empty `rows` list still yields a frame with the expected columns.
    return pd.DataFrame(rows, columns=cols)


# -----------------------
# Basket selection
# -----------------------
def eval_select_ablation_basket(df_ckpt: pd.DataFrame) -> pd.DataFrame:
    """Choose the checkpoints to evaluate from the discovery table.

    Picks, in this order:
      * newest ``root`` checkpoints (only if ``EVAL_INCLUDE_ROOT``), up to ``EVAL_TOP_ROOT``;
      * ``high_watermark`` checkpoints with the highest Sharpe tag, up to ``EVAL_TOP_HW``;
      * ``periodic_step`` checkpoints with the highest step, up to ``EVAL_TOP_PERIODIC``;
      * ``rare`` checkpoints by Sharpe tag (only if ``EVAL_INCLUDE_RARE``), up to ``EVAL_TOP_RARE``;
      * every ``high_watermark``/``root`` checkpoint whose episode is in ``EVAL_FORCE_EPISODES``.

    Duplicates (same ``checkpoint_prefix``) are dropped and the result is
    sorted by kind, episode, Sharpe tag, step and mtime. The knobs are read
    from module-level globals at call time.
    """
    def _rows_of_kind(kind_name):
        return df_ckpt[df_ckpt["checkpoint_kind"] == kind_name].copy()

    def _top_by_sharpe(frame, limit):
        # Missing Sharpe tags rank last via a -inf sort key.
        frame["sharpe_rank_key"] = frame["sharpe_tag"].fillna(-np.inf)
        ranked = frame.sort_values(
            ["sharpe_rank_key", "episode", "mtime"],
            ascending=[False, False, False],
        )
        return ranked.head(limit)

    selections = []

    if EVAL_INCLUDE_ROOT:
        root_rows = _rows_of_kind("root")
        if not root_rows.empty:
            newest_first = root_rows.sort_values("mtime", ascending=False)
            selections.append(newest_first.head(EVAL_TOP_ROOT))

    hw_rows = _rows_of_kind("high_watermark")
    if not hw_rows.empty:
        selections.append(_top_by_sharpe(hw_rows, EVAL_TOP_HW))

    periodic_rows = _rows_of_kind("periodic_step")
    if not periodic_rows.empty:
        latest_steps = periodic_rows.sort_values(["step", "mtime"], ascending=[False, False])
        selections.append(latest_steps.head(EVAL_TOP_PERIODIC))

    if EVAL_INCLUDE_RARE:
        rare_rows = _rows_of_kind("rare")
        if not rare_rows.empty:
            selections.append(_top_by_sharpe(rare_rows, EVAL_TOP_RARE))

    if selections:
        basket = pd.concat(selections, ignore_index=True)
    else:
        basket = pd.DataFrame(columns=df_ckpt.columns)

    # Force include specific episodes (prefer high_watermark/root)
    if EVAL_FORCE_EPISODES:
        is_forced_episode = df_ckpt["episode"].isin(EVAL_FORCE_EPISODES)
        is_preferred_kind = df_ckpt["checkpoint_kind"].isin(["high_watermark", "root"])
        forced_rows = df_ckpt[is_forced_episode & is_preferred_kind].copy()
        if not forced_rows.empty:
            basket = pd.concat([basket, forced_rows], ignore_index=True)

    basket = basket.drop_duplicates(subset=["checkpoint_prefix"]).reset_index(drop=True)
    ordered = basket.sort_values(
        ["checkpoint_kind", "episode", "sharpe_tag", "step", "mtime"],
        ascending=[True, True, False, False, False],
    )
    return ordered.reset_index(drop=True)


# -----------------------
# Run
# -----------------------
# Index all actor+critic pairs below the results root; nothing to evaluate otherwise.
eval_ckpt_df = eval_discover_actor_files(EVAL_RESULTS_ROOT)
if eval_ckpt_df.empty:
    raise RuntimeError(f"No valid actor+critic checkpoint pairs found under {EVAL_RESULTS_ROOT}")

print("All checkpoint pairs:", len(eval_ckpt_df))
print("By kind:", eval_ckpt_df["checkpoint_kind"].value_counts().to_dict())

eval_ablation_ckpts = eval_select_ablation_basket(eval_ckpt_df)

# Report which forced episodes made it into the basket. An episode counts as
# "found" when any selected checkpoint carries it, whichever rule selected it.
found_forced = sorted(
    set(eval_ablation_ckpts["episode"].dropna().astype(int).tolist()) & set(EVAL_FORCE_EPISODES)
)
missing_forced = sorted(set(EVAL_FORCE_EPISODES) - set(found_forced))

print("Selected for ablation:", len(eval_ablation_ckpts))
print("Forced requested:", EVAL_FORCE_EPISODES)
print("Forced found:", found_forced)
print("Forced missing:", missing_forced)

# Show at most 100 rows of the basket.
display(
    eval_ablation_ckpts[
        ["checkpoint_kind", "episode", "step", "sharpe_tag", "actor_path"]
    ].head(100)
)

All checkpoint pairs: 104
By kind: {'high_watermark': 104}
Selected for ablation: 16
Forced requested: [90, 102, 106, 118, 134, 144, 162, 166]
Forced found: [90, 102, 106, 118, 134, 144, 162, 166]
Forced missing: []


Unnamed: 0,checkpoint_kind,episode,step,sharpe_tag,actor_path
0,high_watermark,3,,1.916,/content/eval_restore/tcn_fusion_results/high_...
1,high_watermark,6,,3.044,/content/eval_restore/tcn_fusion_results/high_...
2,high_watermark,15,,1.935,/content/eval_restore/tcn_fusion_results/high_...
3,high_watermark,26,,1.775,/content/eval_restore/tcn_fusion_results/high_...
4,high_watermark,31,,2.62,/content/eval_restore/tcn_fusion_results/high_...
5,high_watermark,36,,1.87,/content/eval_restore/tcn_fusion_results/high_...
6,high_watermark,48,,1.617,/content/eval_restore/tcn_fusion_results/high_...
7,high_watermark,58,,2.091,/content/eval_restore/tcn_fusion_results/high_...
8,high_watermark,90,,1.531,/content/eval_restore/tcn_fusion_results/high_...
9,high_watermark,102,,0.702,/content/eval_restore/tcn_fusion_results/high_...


## 8) Evaluate ablation basket (deterministic + stochastic)

In [None]:
from src.notebook_helpers.tcn_phase1 import (
    load_run_checkpoint_prefixes_from_metadata,
    preflight_checkpoint_loadability,
)

# Checkpoint prefixes recorded in the run metadata, limited to the listed
# checkpoint types and to entries whose actor and critic files both exist.
run_prefixes = load_run_checkpoint_prefixes_from_metadata(
    EVAL_METADATA_PATH,
    results_root=EVAL_RESULTS_ROOT,
    allowed_types={"high_watermark", "deterministic_validation_high_watermark", "final_high_watermark_style"},
    require_both_files=True,
)

# Fallback for older metadata
if not run_prefixes:
    print("‚ÑπÔ∏è No run-scoped checkpoint records in metadata; falling back to discovered checkpoints.")
    # Re-run discovery only when the table from the discovery cell is missing or empty.
    if "eval_ckpt_df" not in globals() or eval_ckpt_df is None or len(eval_ckpt_df) == 0:
        eval_ckpt_df = eval_discover_actor_files(EVAL_RESULTS_ROOT)

    fallback_df = eval_ckpt_df[eval_ckpt_df["checkpoint_kind"].isin(["high_watermark", "root"])].copy()
    run_prefixes = fallback_df["checkpoint_prefix"].dropna().unique().tolist()

print("run checkpoints:", len(run_prefixes))

# NOTE(review): the preflight runs over all run prefixes, not only the selected
# ablation basket — confirm the extra load attempts are intended.
preflight_df = preflight_checkpoint_loadability(
    checkpoint_prefixes=run_prefixes,
    phase1_data=eval_phase1_data,
    config=eval_config,
    random_seed=EVAL_RANDOM_SEED,
    use_covariance=True,
    architecture=eval_config["agent_params"]["actor_critic_type"],
)

if preflight_df is None or preflight_df.empty:
    print("‚ö†Ô∏è preflight returned empty; keeping current eval_ablation_ckpts")
else:
    display(preflight_df.head())
    # Narrow the basket to checkpoints the preflight marked compatible; the
    # basket is left untouched when the expected columns are absent.
    # assumes `compatible` is a boolean column — TODO confirm against the helper.
    if "compatible" in preflight_df.columns and "checkpoint_prefix" in preflight_df.columns:
        compatible_prefixes = set(preflight_df.loc[preflight_df["compatible"], "checkpoint_prefix"])
        eval_ablation_ckpts = eval_ablation_ckpts[
            eval_ablation_ckpts["checkpoint_prefix"].isin(compatible_prefixes)
        ].reset_index(drop=True)
        print("compatible selected:", len(eval_ablation_ckpts))

In [None]:
# Final basket, ordered by episode then step with missing values last.
display(
    eval_ablation_ckpts[["checkpoint_kind", "episode", "step", "sharpe_tag", "checkpoint_prefix"]]
    .sort_values(["episode", "step"], na_position="last")
    .reset_index(drop=True)
)

In [21]:
# before running evaluate loop
# before running evaluate loop
# Evaluation-time overrides written into the training_params section of the config.
# NOTE(review): 0.25 and 2.5 are unexplained literals — confirm they match the
# intended evaluation protocol, and that "training_params" exists in eval_config.
eval_config["training_params"]["evaluation_action_execution_beta"] = 0.25
eval_config["training_params"]["evaluation_turnover_penalty_scalar"] = 2.5

In [None]:
def eval_run_one_checkpoint(eval_cfg, phase1_data, ckpt_prefix, seed=42):
    """Evaluate a single checkpoint and return the evaluation result.

    A result stub is created with ``create_experiment6_result_stub`` (covariance
    enabled, architecture taken from ``eval_cfg["agent_params"]``) and handed to
    ``evaluate_experiment6_checkpoint`` together with the notebook-level
    ``EVAL_*`` evaluation settings.

    Args:
        eval_cfg: Configuration mapping; must provide ``agent_params`` with an
            ``actor_critic_type`` entry.
        phase1_data: Data object forwarded unchanged to the evaluator.
        ckpt_prefix: Checkpoint path prefix, used both for the stub and as the
            checkpoint override of the evaluator.
        seed: Random seed passed to the stub and to the evaluator.

    Returns:
        Whatever ``evaluate_experiment6_checkpoint`` returns.
    """
    agent_params = eval_cfg["agent_params"]

    result_stub = create_experiment6_result_stub(
        random_seed=seed,
        use_covariance=True,
        architecture=agent_params["actor_critic_type"],
        checkpoint_path=ckpt_prefix,
        # The stub gets its own deep copy; the original code marks this as
        # important (presumably so the shared config is never mutated).
        agent_config=copy.deepcopy(agent_params),
        base_agent_params=None,
    )

    evaluation_kwargs = dict(
        experiment6=result_stub,
        phase1_data=phase1_data,
        config=eval_cfg,
        random_seed=seed,
        checkpoint_path_override=ckpt_prefix,
        deterministic_eval_mode=EVAL_DETERMINISTIC_MODE,
        num_eval_runs=EVAL_NUM_STOCHASTIC_RUNS,
        stochastic_eval_mode=EVAL_STOCHASTIC_MODE,
        stochastic_episode_length_limit=EVAL_STOCHASTIC_EPISODE_LIMIT,
        save_eval_logs=EVAL_SAVE_LOGS,
        save_eval_artifacts=EVAL_SAVE_ARTIFACTS,
    )
    return evaluate_experiment6_checkpoint(**evaluation_kwargs)


# Results keyed by a human-readable checkpoint label; failures keep the error text.
eval_evaluations = {}
eval_failures = {}

_eval_total = len(eval_ablation_ckpts)

# enumerate() drives the progress counter, so it stays correct even when the
# frame's index is not a clean 0..n-1 range.
for position, (_, row) in enumerate(eval_ablation_ckpts.iterrows(), start=1):
    # Missing episode/step values are encoded as -1 in the label.
    ep = int(row["episode"]) if pd.notna(row["episode"]) else -1
    st = int(row["step"]) if pd.notna(row["step"]) else -1
    base_label = f"{row['checkpoint_kind']}__ep{ep:04d}__step{st:06d}"
    prefix = row["checkpoint_prefix"]

    # Different checkpoints can share kind/episode/step (for example when the
    # step is missing for both). Suffix repeats so a later result never
    # silently overwrites an earlier one; unique labels are left unchanged.
    label = base_label
    dup = 1
    while label in eval_evaluations or label in eval_failures:
        dup += 1
        label = f"{base_label}__dup{dup}"

    print(f"\n[{position}/{_eval_total}] Evaluating: {label}")
    try:
        ev = eval_run_one_checkpoint(eval_config, eval_phase1_data, prefix, seed=EVAL_RANDOM_SEED)
        eval_evaluations[label] = ev
    except Exception as e:
        # Best effort: record the failure and continue with the next checkpoint.
        eval_failures[label] = f"{type(e).__name__}: {e}"
        print(f"‚ùå Failed {label}: {eval_failures[label]}")

print("‚úÖ Completed evaluations:", len(eval_evaluations))
print("‚ö†Ô∏è Failed evaluations:", len(eval_failures))

if eval_failures:
    # Show at most ten failures to keep the cell output readable.
    print("\nFailure samples:")
    for k, v in list(eval_failures.items())[:10]:
        print(" -", k, "->", v)


[1/16] Evaluating: high_watermark__ep0003__step-00001

LOADING CUSTOM CHECKPOINT: /content/eval_restore/tcn_fusion_results/high_watermark_checkpoints/exp6_tape_hw_ep00003_shp1p916
‚úÖ Found actor weights: /content/eval_restore/tcn_fusion_results/high_watermark_checkpoints/exp6_tape_hw_ep00003_shp1p916_actor.weights.h5
‚úÖ Found critic weights: /content/eval_restore/tcn_fusion_results/high_watermark_checkpoints/exp6_tape_hw_ep00003_shp1p916_critic.weights.h5
üèóÔ∏è Recreating evaluation environments...
üîß Building models before loading weights...
   ‚úÖ Models built successfully
üìÇ Loading checkpoint weights...
   ‚úÖ Weights loaded successfully
   üéØ Deterministic eval policy modes: ['mean']
   üéØ Stochastic eval policy mode:     sample

DETERMINISTIC EVALUATION (det_mean)

üìä DETERMINISTIC TEST RESULTS:
   Eval Track: det_mean
   Start Date: 2021-04-12
   Market Regime: Post-Pandemic Rally (2021)
   Episode Length: 1103 days (4.38 years)
   Final Portfolio Value: $165,761.

: 

## 9) Ablation table and leaderboard

In [None]:
# Nothing to summarize when every checkpoint evaluation failed.
if len(eval_evaluations) == 0:
    raise RuntimeError('No successful evaluations to summarize.')

eval_ablation_table = build_ablation_table(eval_evaluations)

display(eval_ablation_table.head(30))

# Deterministic-first leaderboard view: Sharpe minus weighted drawdown and
# turnover. Missing Sharpe is replaced by -999, missing drawdown/turnover by 1.0.
_sharpe_term = eval_ablation_table['det_sharpe'].fillna(-999)
_drawdown_penalty = 0.5 * eval_ablation_table['det_max_drawdown'].fillna(1.0)
_turnover_penalty = 0.1 * eval_ablation_table['det_turnover'].fillna(1.0)

eval_leaderboard = (
    eval_ablation_table
    .assign(risk_adjusted_score=_sharpe_term - _drawdown_penalty - _turnover_penalty)
    .sort_values(['risk_adjusted_score', 'det_sharpe'], ascending=False)
    .reset_index(drop=True)
)

print('Top by risk-adjusted score:')
display(eval_leaderboard.head(10))

## 10) Build industry baseline returns (equal-weight, cash, and S&P 500)

In [None]:
def eval_identify_asset_column(df: pd.DataFrame):
    """Return the name of the asset-identifier column in ``df``.

    Known spellings are tried in a fixed priority order and the first one that
    is present among ``df.columns`` wins. Returns ``None`` when none is present.
    """
    known_names = ('Ticker', 'ticker', 'tic', 'asset', 'Asset', 'symbol', 'Symbol')
    return next((name for name in known_names if name in df.columns), None)


def eval_identify_return_column(df: pd.DataFrame):
    """Return the name of the daily-return column in ``df``.

    Known spellings are tried in a fixed priority order (log-return names
    first) and the first one present among ``df.columns`` wins. Returns
    ``None`` when none is present.
    """
    known_names = ('LogReturn_1d', 'log_return_1d', 'Return_1d', 'return_1d', 'daily_return')
    return next((name for name in known_names if name in df.columns), None)


def eval_fetch_sp500_returns(start_date: pd.Timestamp, end_date: pd.Timestamp) -> pd.Series:
    """
    Fetch S&P500 daily simple returns for benchmark comparison.

    Primary source: yfinance '^GSPC'.
    Fallback: empty series if the import, the install attempt, or the download fails.

    Args:
        start_date: First calendar day to request (inclusive).
        end_date: Last calendar day to request; one day is added before calling
            yfinance.

    Returns:
        Float Series of simple returns (``pct_change`` of the close) indexed by
        timezone-naive timestamps, or an empty float Series on any failure.
    """
    try:
        import yfinance as yf
    except Exception:
        # Best-effort install into the interpreter running this kernel. A plain
        # subprocess call keeps the function valid Python (no IPython `!` syntax).
        import subprocess
        import sys
        try:
            subprocess.check_call([sys.executable, '-m', 'pip', 'install', '-q', 'yfinance'])
            import yfinance as yf
        except Exception:
            print('‚ö†Ô∏è Could not install/import yfinance; SP500 benchmark disabled.')
            return pd.Series(dtype=float)

    try:
        df = yf.download(
            '^GSPC',
            start=str(start_date.date()),
            end=str((end_date + pd.Timedelta(days=1)).date()),
            auto_adjust=True,
            progress=False,
        )
        if df is None or df.empty or 'Close' not in df.columns:
            print('‚ö†Ô∏è SP500 download returned empty data.')
            return pd.Series(dtype=float)

        close = df['Close']
        # With (field, ticker) MultiIndex columns, df['Close'] is a one-column
        # DataFrame rather than a Series; reduce it to that single column.
        if isinstance(close, pd.DataFrame):
            if close.shape[1] == 0:
                print('‚ö†Ô∏è SP500 download returned empty data.')
                return pd.Series(dtype=float)
            close = close.iloc[:, 0]

        close = close.dropna()
        ret = close.pct_change().dropna().astype(float)
        ret.index = pd.to_datetime(ret.index)
        # Drop any timezone so the series can be reindexed onto tz-naive dates.
        if ret.index.tz is not None:
            ret.index = ret.index.tz_localize(None)
        return ret
    except Exception as e:
        print(f'‚ö†Ô∏è SP500 fetch failed: {type(e).__name__}: {e}')
        return pd.Series(dtype=float)


def eval_build_baselines_from_phase1(phase1_data):
    """Build the baseline daily-return series from ``phase1_data.test_df``.

    Returns:
        Tuple ``(eqw, cash, sp500)`` of Series with a plain 0..n-1 index:
        ``eqw`` is the per-date mean of simple returns over the rows of that
        date, ``cash`` is all zeros with the same length, and ``sp500`` holds
        S&P 500 returns aligned to the same dates (0.0 where the download has
        no value) or is an empty Series when the download is unavailable.

    Raises:
        ValueError: If ``test_df`` has no ``Date`` column or no recognised
            return column.
    """
    frame = phase1_data.test_df.copy()
    if 'Date' not in frame.columns:
        raise ValueError('test_df must contain Date column')

    return_column = eval_identify_return_column(frame)
    if return_column is None:
        raise ValueError('Could not identify return column in test_df')

    # Log returns are converted to simple returns; any column name containing
    # "log" (case-insensitive) is treated as a log return.
    raw_returns = frame[return_column].astype(float)
    is_log_return = 'log' in return_column.lower()
    frame['_simple_ret'] = np.expm1(raw_returns) if is_log_return else raw_returns

    # Industry baseline 1: equal-weight over available assets each day
    daily_mean = frame.groupby('Date')['_simple_ret'].mean()
    eqw = daily_mean.sort_index().astype(float)

    calendar = pd.to_datetime(eqw.index)

    # Industry baseline 2: cash (0% daily return)
    cash = pd.Series(np.zeros(len(eqw)), index=calendar, name='cash')

    # Industry baseline 3: S&P 500 (^GSPC), aligned to the model dates; dates
    # missing from the download become a 0 return for alignment stability.
    sp500_ret = eval_fetch_sp500_returns(calendar.min(), calendar.max())
    if not sp500_ret.empty:
        sp500_ret = sp500_ret.reindex(calendar).fillna(0.0)

    # Positional 0..n-1 index for compare_agent_vs_baseline.
    sp500 = pd.Series(dtype=float) if sp500_ret.empty else sp500_ret.reset_index(drop=True)
    return eqw.reset_index(drop=True), cash.reset_index(drop=True), sp500


# Build the three baseline return series once; later cells reuse them.
_eval_baselines = eval_build_baselines_from_phase1(eval_phase1_data)
eval_baseline_eqw, eval_baseline_cash, eval_baseline_sp500 = _eval_baselines
print(
    'Baseline lengths | EQW:', len(eval_baseline_eqw),
    'Cash:', len(eval_baseline_cash),
    'SP500:', len(eval_baseline_sp500),
)

## 11) Benchmark each evaluated checkpoint vs baselines

In [None]:
# Baselines in the order their columns are added to each benchmark row.
_benchmark_baselines = (
    ('eqw', eval_baseline_eqw),
    ('cash', eval_baseline_cash),
    ('sp500', eval_baseline_sp500),
)

benchmark_rows = []
for label, ev in eval_evaluations.items():
    # Compare against every baseline; a failing comparison is recorded as an
    # error entry instead of aborting the whole benchmark.
    comparisons = {}
    for prefix, baseline in _benchmark_baselines:
        try:
            if prefix == 'sp500' and len(baseline) == 0:
                comparisons[prefix] = {'error': 'SP500 baseline unavailable'}
            else:
                comparisons[prefix] = compare_agent_vs_baseline(ev, baseline)
        except Exception as e:
            comparisons[prefix] = {'error': str(e)}

    det_metrics = ev.deterministic_metrics or {}
    row = {
        'label': label,
        'det_sharpe': det_metrics.get('sharpe_ratio', np.nan),
        'det_return': det_metrics.get('annualized_return', np.nan),
        'det_mdd': det_metrics.get('max_drawdown_abs', np.nan),
        'det_turnover': det_metrics.get('turnover', np.nan),
    }

    # Flatten each comparison into prefixed columns, or a single <prefix>_error.
    for prefix, comp in comparisons.items():
        if not isinstance(comp, dict):
            row[f'{prefix}_error'] = 'unknown'
        elif 'error' in comp:
            row[f'{prefix}_error'] = comp.get('error', 'unknown')
        else:
            row.update({f'{prefix}_{k}': v for k, v in comp.items()})

    benchmark_rows.append(row)

eval_benchmark_df = pd.DataFrame(benchmark_rows)

display(eval_benchmark_df.sort_values('det_sharpe', ascending=False).head(20))

## 12) Champion selection (production candidate)

In [None]:
if eval_benchmark_df.empty:
    raise RuntimeError('No benchmark rows available.')

# Balanced production-style objective: reward risk-adjusted return, penalize
# drawdown/turnover. Missing inputs are filled first: Sharpe -> -999,
# return -> 0.0, drawdown -> 1.0, turnover -> 1.0.
_selection_inputs = eval_benchmark_df[['det_sharpe', 'det_return', 'det_mdd', 'det_turnover']].fillna(
    {'det_sharpe': -999, 'det_return': 0.0, 'det_mdd': 1.0, 'det_turnover': 1.0}
)
eval_benchmark_df['selection_score'] = (
    _selection_inputs['det_sharpe']
    + 0.2 * _selection_inputs['det_return']
    - 0.7 * _selection_inputs['det_mdd']
    - 0.1 * _selection_inputs['det_turnover']
)

# The champion is the row with the highest selection score.
_ranked_benchmark = eval_benchmark_df.sort_values('selection_score', ascending=False)
champion_row = _ranked_benchmark.iloc[0]
EVAL_CHAMPION_LABEL = champion_row['label']
EVAL_CHAMPION = eval_evaluations[EVAL_CHAMPION_LABEL]

print('üèÜ Champion label:', EVAL_CHAMPION_LABEL)
_champion_summary_columns = ['det_sharpe', 'det_return', 'det_mdd', 'det_turnover', 'selection_score']
print(champion_row[_champion_summary_columns])

## 13) Regime-sliced performance (champion vs equal-weight and S&P 500)

In [None]:
def eval_regime_tag(dates: pd.Series):
    """Label each date with a market-regime name.

    Regimes (start inclusive, end exclusive):
        pre_covid            before 2020-02-20
        covid_crash          2020-02-20 up to 2020-07-01
        post_covid_recovery  2020-07-01 up to 2022-01-01
        inflation_rates      2022-01-01 up to 2024-01-01
        recent               2024-01-01 onwards

    Args:
        dates: Date-like values accepted by ``pd.to_datetime``.

    Returns:
        Series of regime names. When ``dates`` is a Series its index is kept,
        so the result aligns with the input on assignment; otherwise a default
        0..n-1 index is used. Values that match no regime (for example missing
        dates) are labelled ``'other'``.
    """
    d = pd.to_datetime(dates)

    covid_start = pd.Timestamp('2020-02-20')
    recovery_start = pd.Timestamp('2020-07-01')
    inflation_start = pd.Timestamp('2022-01-01')
    recent_start = pd.Timestamp('2024-01-01')

    # Half-open intervals leave no gap between regimes, so timestamps carrying
    # a time of day on a regime's last calendar day are still classified.
    conds = [
        (d < covid_start),
        (d >= covid_start) & (d < recovery_start),
        (d >= recovery_start) & (d < inflation_start),
        (d >= inflation_start) & (d < recent_start),
        (d >= recent_start),
    ]
    labels = ['pre_covid', 'covid_crash', 'post_covid_recovery', 'inflation_rates', 'recent']
    out = np.select(conds, labels, default='other')

    # Keep the caller's index so `frame['regime'] = eval_regime_tag(frame['Date'])`
    # aligns row by row even when the frame does not use a default index.
    index = d.index if isinstance(d, pd.Series) else None
    return pd.Series(out, index=index)


def eval_sharpe(x):
    """Annualised Sharpe ratio of a daily return series (252 periods per year).

    Missing values are dropped first. Returns ``np.nan`` when fewer than two
    observations remain or when the sample standard deviation (ddof=1) is at
    most 1e-12.
    """
    clean = pd.Series(x).dropna()
    if len(clean) >= 2:
        sigma = clean.std(ddof=1)
        if sigma > 1e-12:
            return np.sqrt(252.0) * clean.mean() / sigma
    return np.nan

# Build daily return series for the champion and the baselines, all indexed
# positionally (0..k), then slice performance by market regime.
champ_port = np.array(EVAL_CHAMPION.deterministic_portfolio)
# Simple return between consecutive portfolio values (one fewer than the values).
champ_ret = pd.Series(np.diff(champ_port) / champ_port[:-1]).reset_index(drop=True)
eqw_ret = eval_baseline_eqw.reset_index(drop=True)
sp500_ret = eval_baseline_sp500.reset_index(drop=True) if len(eval_baseline_sp500) > 0 else pd.Series(dtype=float)

# Common length: the shortest available series; the date count is reduced by
# one because the first test date is skipped when `dates` is built below.
n_core = min(len(champ_ret), len(eqw_ret), len(eval_phase1_data.test_df['Date'].drop_duplicates()) - 1)
if len(sp500_ret) > 0:
    n = min(n_core, len(sp500_ret))
else:
    n = n_core

# Sorted unique test dates without the first one, truncated to n entries.
dates = pd.to_datetime(eval_phase1_data.test_df['Date'].drop_duplicates().sort_values()).reset_index(drop=True).iloc[1:n+1]

# NOTE(review): row k pairs the (k+1)-th sorted test date with champ_ret[k],
# eqw_ret[k] and sp500_ret[k]. The baseline series are built per sorted test
# date starting at the first date, so their values may be labelled one day
# late relative to `Date` - confirm the intended alignment.
reg_df = pd.DataFrame({
    'Date': dates.reset_index(drop=True),
    'champion_ret': champ_ret.iloc[:n].reset_index(drop=True),
    'eqw_ret': eqw_ret.iloc[:n].reset_index(drop=True),
})
if len(sp500_ret) > 0:
    reg_df['sp500_ret'] = sp500_ret.iloc[:n].reset_index(drop=True)
else:
    # No S&P 500 data: keep the column so later cells can test for it.
    reg_df['sp500_ret'] = np.nan

reg_df['regime'] = eval_regime_tag(reg_df['Date'])

# One summary row per regime: annualised Sharpe and compounded total return.
regime_rows = []
for regime, g in reg_df.groupby('regime'):
    row = {
        'regime': regime,
        'n_days': len(g),
        'champion_sharpe': eval_sharpe(g['champion_ret']),
        'eqw_sharpe': eval_sharpe(g['eqw_ret']),
        'champion_total_return': float((1.0 + g['champion_ret']).prod() - 1.0),
        'eqw_total_return': float((1.0 + g['eqw_ret']).prod() - 1.0),
    }
    # S&P 500 figures are reported only when the regime has at least one value.
    if g['sp500_ret'].notna().any():
        row['sp500_sharpe'] = eval_sharpe(g['sp500_ret'])
        row['sp500_total_return'] = float((1.0 + g['sp500_ret'].fillna(0.0)).prod() - 1.0)
    else:
        row['sp500_sharpe'] = np.nan
        row['sp500_total_return'] = np.nan
    regime_rows.append(row)

# Rows are ordered alphabetically by regime name, not chronologically.
eval_regime_df = pd.DataFrame(regime_rows).sort_values('regime').reset_index(drop=True)
display(eval_regime_df)

## 14) Statistical confidence: bootstrap Sharpe difference (champion - equal-weight, champion - S&P 500)

In [None]:
def eval_block_bootstrap_sharpe_diff(agent_ret, base_ret, n_boot=2000, block=20, seed=42):
    """Bootstrap the difference in annualised Sharpe ratio (agent - baseline).

    Both series are truncated to their common length and resampled with the
    same indices, so the pairing of agent and baseline returns is preserved.
    Each resample concatenates fixed-length blocks of consecutive observations
    with uniformly random start positions and is cut to the original length.

    Args:
        agent_ret: Daily returns of the agent (array-like of floats).
        base_ret: Daily returns of the baseline (array-like of floats).
        n_boot: Number of bootstrap resamples.
        block: Block length in observations; must be at least 1.
        seed: Seed for ``np.random.default_rng``.

    Returns:
        Dict with ``n_boot_eff`` (resamples with a finite difference), ``mean``,
        ``ci_low`` / ``ci_high`` (2.5% and 97.5% quantiles) and ``p_le_zero``
        (share of resamples with difference <= 0). All statistics are NaN and
        ``n_boot_eff`` is 0 when no finite difference could be computed, which
        includes inputs with fewer than two paired observations.

    Raises:
        ValueError: If ``block`` is smaller than 1.
    """
    if block < 1:
        raise ValueError(f'block must be >= 1, got {block}')

    rng = np.random.default_rng(seed)
    a = np.asarray(agent_ret, dtype=float)
    b = np.asarray(base_ret, dtype=float)
    n = min(len(a), len(b))
    a = a[:n]
    b = b[:n]

    no_result = {'n_boot_eff': 0, 'mean': np.nan, 'ci_low': np.nan, 'ci_high': np.nan, 'p_le_zero': np.nan}

    # A Sharpe ratio needs at least two observations; returning early also
    # avoids indexing with an empty (float-typed) index array for n == 0.
    if n < 2:
        return no_result

    def _sharpe(x):
        x = pd.Series(x).dropna()
        if len(x) < 2:
            return np.nan
        s = x.std(ddof=1)
        if s <= 1e-12:
            return np.nan
        return np.sqrt(252.0) * x.mean() / s

    diffs = []
    n_blocks = int(np.ceil(n / block))
    max_start = max(1, n - block + 1)

    for _ in range(n_boot):
        idx = []
        # Start positions are drawn one at a time so results for a given seed
        # stay identical to earlier runs of this notebook.
        for __ in range(n_blocks):
            st = int(rng.integers(0, max_start))
            idx.extend(range(st, min(st + block, n)))
        idx = np.asarray(idx[:n], dtype=np.intp)
        d = _sharpe(a[idx]) - _sharpe(b[idx])
        if np.isfinite(d):
            diffs.append(float(d))

    if not diffs:
        return no_result

    diffs = np.asarray(diffs)
    return {
        'n_boot_eff': int(len(diffs)),
        'mean': float(np.mean(diffs)),
        'ci_low': float(np.quantile(diffs, 0.025)),
        'ci_high': float(np.quantile(diffs, 0.975)),
        'p_le_zero': float(np.mean(diffs <= 0.0)),
    }

# Both comparisons share the same resample count, block length and seed.
_bootstrap_settings = {'n_boot': 2000, 'block': 20, 'seed': EVAL_RANDOM_SEED}
_champion_returns = reg_df['champion_ret'].values

bootstrap_eqw = eval_block_bootstrap_sharpe_diff(
    _champion_returns,
    reg_df['eqw_ret'].values,
    **_bootstrap_settings,
)

if not reg_df['sp500_ret'].notna().any():
    # No S&P 500 data at all: report an explicit "not available" result.
    bootstrap_sp500 = {'n_boot_eff': 0, 'mean': np.nan, 'ci_low': np.nan, 'ci_high': np.nan, 'p_le_zero': np.nan}
else:
    bootstrap_sp500 = eval_block_bootstrap_sharpe_diff(
        _champion_returns,
        reg_df['sp500_ret'].fillna(0.0).values,
        **_bootstrap_settings,
    )

for _bootstrap_title, _bootstrap_result in (
    ('Bootstrap Sharpe diff (Champion - EQW):', bootstrap_eqw),
    ('Bootstrap Sharpe diff (Champion - SP500):', bootstrap_sp500),
):
    print(_bootstrap_title)
    print(_bootstrap_result)

## 15) Training-diagnostics quality checks from CSV metrics
Uses saved CSVs to report KL stability, turnover drivers, and execution quality.

In [None]:
# Summary statistics read from the training CSV logs; a key is only added
# when its source file, column and at least one numeric value are available.
diag_report = {}

# Episode-level metrics: approximate KL statistics and its link to turnover.
if eval_latest_episodes_csv and Path(eval_latest_episodes_csv).exists():
    ep = pd.read_csv(eval_latest_episodes_csv)
    diag_report['episodes_rows'] = len(ep)

    if 'approx_kl' in ep.columns:
        kl = pd.to_numeric(ep['approx_kl'], errors='coerce').dropna()
        if len(kl):
            diag_report['approx_kl_mean'] = float(kl.mean())
            diag_report['approx_kl_p50'] = float(kl.quantile(0.50))
            diag_report['approx_kl_p90'] = float(kl.quantile(0.90))

    if {'episode_turnover_pct', 'approx_kl'}.issubset(ep.columns):
        x = pd.to_numeric(ep['episode_turnover_pct'], errors='coerce')
        y = pd.to_numeric(ep['approx_kl'], errors='coerce')
        valid = x.notna() & y.notna()
        # A correlation needs at least two paired observations.
        if valid.sum() >= 2:
            with np.errstate(invalid='ignore', divide='ignore'):
                corr = float(np.corrcoef(x[valid], y[valid])[0, 1])
            # A constant column yields NaN; skip it so the report stays valid
            # JSON (json would otherwise emit a bare NaN token).
            if np.isfinite(corr):
                diag_report['corr_turnoverpct_kl'] = corr

# Step-level metrics: mean and 90th percentile of selected diagnostic columns.
if eval_latest_step_diag_csv and Path(eval_latest_step_diag_csv).exists():
    sd = pd.read_csv(eval_latest_step_diag_csv)
    diag_report['step_diag_rows'] = len(sd)

    for col in ['l1_w_delta', 'turnover_penalty_contrib', 'tx_cost_contrib_reward_pts', 'action_realization_l1']:
        if col in sd.columns:
            s = pd.to_numeric(sd[col], errors='coerce').dropna()
            if len(s):
                diag_report[f'{col}_mean'] = float(s.mean())
                diag_report[f'{col}_p90'] = float(s.quantile(0.90))

print(json.dumps(diag_report, indent=2))

## 16) Save final evaluation package
Exports the ablation table, benchmark table, regime table, diagnostics (including the bootstrap results), and champion metadata.

In [None]:
eval_out_dir = EVAL_RESULTS_ROOT / 'logs'
eval_out_dir.mkdir(parents=True, exist_ok=True)

# One timestamp shared by every file of this export.
ts = datetime.now().strftime('%Y%m%d_%H%M%S')

eval_ablation_path = eval_out_dir / f'final_eval_ablation_{ts}.csv'
eval_benchmark_path = eval_out_dir / f'final_eval_benchmark_{ts}.csv'
eval_regime_path = eval_out_dir / f'final_eval_regime_{ts}.csv'
eval_diag_path = eval_out_dir / f'final_eval_diagnostics_{ts}.json'
eval_meta_path = eval_out_dir / f'final_eval_champion_{ts}.json'

# Tabular results are written as CSV without the index column.
for _export_table, _export_csv_path in (
    (eval_ablation_table, eval_ablation_path),
    (eval_benchmark_df, eval_benchmark_path),
    (eval_regime_df, eval_regime_path),
):
    _export_table.to_csv(_export_csv_path, index=False)

# Bootstrap results and training diagnostics.
with open(eval_diag_path, 'w', encoding='utf-8') as f:
    _diag_payload = {
        'bootstrap_sharpe_diff_eqw': bootstrap_eqw,
        'bootstrap_sharpe_diff_sp500': bootstrap_sp500,
        'diagnostics': diag_report,
        'metadata_path': str(EVAL_METADATA_PATH),
    }
    json.dump(_diag_payload, f, indent=2)

# Champion metadata; values json cannot serialize natively are written via str().
with open(eval_meta_path, 'w', encoding='utf-8') as f:
    _champion_payload = {
        'champion_label': EVAL_CHAMPION_LABEL,
        'selection_row': champion_row.to_dict(),
        'deterministic_metrics': EVAL_CHAMPION.deterministic_metrics,
        'checkpoint_description': EVAL_CHAMPION.checkpoint_description,
    }
    json.dump(_champion_payload, f, indent=2, default=str)

print('‚úÖ Saved evaluation package:')
for _saved_path in (
    eval_ablation_path,
    eval_benchmark_path,
    eval_regime_path,
    eval_diag_path,
    eval_meta_path,
):
    print('-', _saved_path)