In [None]:
# ===== Cell 0: 環境設定（全セル共通で利用）=====
from __future__ import annotations

from typing import Any, Callable, Dict, Optional

import os
import numpy as np
import pandas as pd
import matplotlib           # 追加
import matplotlib.backends  # 追加（←これがポイント）
import matplotlib.pyplot as plt
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier

# ------------------------
# Experiment switches (shared by every cell of the notebook)
# ------------------------
FMS_THRESHOLD: int = 1              # FMS >= 1 is treated as the positive label
EPOCH_LEN: int = 30                 # one of 30 / 60 / 120 (this experiment always runs with 30)
MODEL_BACKEND: str = "xgb"          # either "xgb" or "rf"

USE_MSSQ_FEATURE: bool = False      # True: include MSSQ as a feature
USE_VIMSSQ_FEATURE: bool = False    # True: include VIMSSQ as a feature

SEED_BASE: int = 20251101
TOP_SUBSET_K: int = 15              # number of top-ranked features used by the subset search

# ---- XGB regularization level (preset selector) ----
# 0: regularization 0 (initial parameters)
# 1: regularization 1 (modified parameters)
# Read through globals() so that a value already defined by a driver notebook
# is kept; falls back to 0 when nothing is defined.
XGB_REG_LEVEL: int = int(globals().get("XGB_REG_LEVEL", 0))

# ---- Time-series features (history concatenation) ----
# HISTORY_N_EPOCHS = 1: no history, as before (f_k)
# HISTORY_N_EPOCHS = H>1: concatenate the most recent H epochs [f_{k-H+1}, ..., f_k]
HISTORY_N_EPOCHS: int = 1
if HISTORY_N_EPOCHS < 1:
    raise ValueError("HISTORY_N_EPOCHS は 1 以上で指定してください。")

# ---- High / Low cut-offs for MSSQ / VIMSSQ ----
MSSQ_THRESHOLD_FIXED: float   = 11.0  # e.g. MSSQ >= 11 is "High"
VIMSSQ_THRESHOLD_FIXED: float = 3     # e.g. VIMSSQ >= 3 is "High"

# ---- Grouping basis (used by the FMS trajectory plot, etc.) ----
# Set to "MSSQ" or "VIMSSQ"
GROUPING_BASIS_FOR_PLOTS: str = "MSSQ"

# Only these three epoch lengths are accepted.
if EPOCH_LEN not in (30, 60, 120):
    raise ValueError("EPOCH_LEN は 30/60/120 から選択してください。")

# ---------------- Settings (evenly spaced grid + local re-search) ----------------
COARSE_STEPS = 101      # evenly spaced over 0.0 to 1.0
FINE_STEPS   = 101      # resolution of the local re-search
FINE_MARGIN  = 0.01     # neighborhood half-width (+/- 0.01)

# Read through globals() so that a driver notebook can override the value
CORR_THRESHOLD = float(globals().get("CORR_THRESHOLD", 0.7))  # correlation-pruning threshold (also used in Cell3A-pre)

VERBOSE      = True

# ---- Execution flags (switch Cell4 / Cell5 / Cell6) ----
# The Cell6 flags simply mirror the Cell4 / Cell5 flags.
RUN_CELL4: bool = bool(globals().get("RUN_CELL4", True))
RUN_CELL5: bool = bool(globals().get("RUN_CELL5", True))
RUN_CELL6_NEUTRAL: bool = RUN_CELL4
RUN_CELL6_STRAT: bool = RUN_CELL5
# ------------------------
# File input / output roots
# ------------------------
# NOTE(review): this literal is a raw string that ALSO doubles every backslash,
# so the resulting path contains doubled separators - confirm this is intended.
BASE_INPUT_DIR = r"C:\\Users\\taiki\\OneDrive - Science Tokyo\\デスクトップ\\研究\\本実験結果"
BASE_ANALYSIS_DIR = os.path.join(BASE_INPUT_DIR, "ANALYSIS")
BASE_LEVEL0_DIR = os.path.join(BASE_ANALYSIS_DIR, "機械学習")  # level 0

# --- Level-1 parameter tags (the variable parts) ---
# The feature-selection mode is derived from the string FS_MODE, which a
# driver notebook may pre-define; the default is "mean".
FS_MODE = globals().get("FS_MODE", "mean")  # "mean" / "rank" / "rfe"

USE_FS_SHAP_MEAN: bool = (FS_MODE == "mean")  # mean SHAP value (TreeSHAP mean(|SHAP|))
USE_FS_SHAP_RANK: bool = (FS_MODE == "rank")  # mean SHAP rank (average of per-fold ranks)
USE_FS_RFE: bool        = (FS_MODE == "rfe")  # RFE (fold rank)

# Exactly one flag must be True; any unrecognised FS_MODE leaves all three False.
_fs_flags = [USE_FS_SHAP_MEAN, USE_FS_SHAP_RANK, USE_FS_RFE]
if sum(_fs_flags) != 1:
    raise ValueError("FS_MODE は 'mean' / 'rank' / 'rfe' のいずれかになるようにしてください")

# Label of the regularization preset (generated from XGB_REG_LEVEL)
REGULARIZATION_TAG = f"正則化{XGB_REG_LEVEL}"

# Pick the method label and the ranking file names that go with the active mode.
if USE_FS_SHAP_MEAN:
    FEATURE_SELECTION_METHOD = "SHAP-mean"
    FEATURE_RANKING_FILE = "SHAP_MEAN_FEATURE_RANKING.CSV"
    GROUP_RANKING_FILE = "SHAP_MEAN_GROUP_RANKING.CSV"
elif USE_FS_SHAP_RANK:
    FEATURE_SELECTION_METHOD = "SHAP-rank"
    FEATURE_RANKING_FILE = "SHAP_RANK_FEATURE_RANKING.CSV"
    GROUP_RANKING_FILE = "SHAP_RANK_GROUP_RANKING.CSV"
else:
    FEATURE_SELECTION_METHOD = "RFE"
    FEATURE_RANKING_FILE = "RFE_FEATURE_RANKING.CSV"
    GROUP_RANKING_FILE = "RFE_GROUP_RANKING.CSV"


def build_level1_dir(
    feature_selection: str,
    history_epochs: int,
    coarse_steps: int,
    fine_steps: int,
    corr_threshold: float,
) -> str:
    """Compose the level-1 output directory path for one parameter combination.

    The folder name is the "__"-joined sequence of the feature-selection
    label, ``H`` + zero-padded history length, ``Grid`` + coarse step count,
    ``Corr`` + threshold with two decimals, and the module-level
    ``REGULARIZATION_TAG``. The folder is placed under ``BASE_LEVEL0_DIR``.

    ``fine_steps`` is accepted but does not take part in the name.
    """
    folder_name = "__".join((
        feature_selection,
        "H{:02d}".format(history_epochs),
        "Grid{}".format(coarse_steps),
        "Corr{:.2f}".format(corr_threshold),
        REGULARIZATION_TAG,
    ))
    return os.path.join(BASE_LEVEL0_DIR, folder_name)


# Level-1 directory for the current parameter combination; created on disk
# right away so later cells can write into it.
LEVEL1_DIR = build_level1_dir(
    FEATURE_SELECTION_METHOD,
    HISTORY_N_EPOCHS,
    COARSE_STEPS,
    FINE_STEPS,
    CORR_THRESHOLD,
)
os.makedirs(LEVEL1_DIR, exist_ok=True)

# Per-cell output state; both stay None until set_cell_output() is called.
CURRENT_CELL_ID = None
OUT_DIR = None


def set_cell_output(cell_id: int) -> None:
    """Level 2: select the output directory of one cell (Cell0, Cell1, ...).

    Updates the module globals ``CURRENT_CELL_ID`` and ``OUT_DIR``, creates
    ``<LEVEL1_DIR>/Cell<cell_id>`` when it does not exist yet, and prints the
    chosen path.
    """
    global CURRENT_CELL_ID, OUT_DIR
    cell_dir = os.path.join(LEVEL1_DIR, f"Cell{cell_id}")
    CURRENT_CELL_ID, OUT_DIR = cell_id, cell_dir
    os.makedirs(cell_dir, exist_ok=True)
    print(f"[Cell{cell_id}] OUT_DIR -> {cell_dir}")


def outpath(filename: str) -> str:
    """Return ``filename`` located inside the current cell's output directory.

    Raises:
        RuntimeError: if ``OUT_DIR`` is still None, i.e. ``set_cell_output``
            has not been called yet.
    """
    if OUT_DIR is not None:
        return os.path.join(OUT_DIR, filename)
    raise RuntimeError("先に set_cell_output(cell_id) を呼んでください。")


def cell_output_path(cell_id: int, filename: str) -> str:
    """Return the path of ``filename`` inside the output folder of ``cell_id``.

    Meant for referring to CSVs or figures written by an earlier cell; the
    path is only composed, nothing is created or checked on disk.
    """
    cell_folder = f"Cell{cell_id}"
    return os.path.join(LEVEL1_DIR, cell_folder, filename)


# One-line summary of the active configuration.
print(
    f"[LEVEL1_DIR] {LEVEL1_DIR}  |  "
    f"EPOCH_LEN={EPOCH_LEN}s | HISTORY_N_EPOCHS={HISTORY_N_EPOCHS} | "
    f"FS_MODE={FS_MODE} | CORR_THRESHOLD={CORR_THRESHOLD} | XGB_REG_LEVEL={XGB_REG_LEVEL}"
)
set_cell_output(0)  # select the output directory of Cell0

# ------------------------
# Target subjects and time window
# ------------------------
SUBJECT_IDS = [
    "10061","10063","10064",
    "10071","10072","10073","10074",
    "10081","10082","10083",
    "10091","10092","10093","10094",
    "10101","10102","10103",
]

BASELINE_EPOCH = 1770               # epoch_start of the baseline row (required)
ML_START, ML_END = 1800, 2400       # epoch_start range used for learning: [start, end)

# ------------------------
# Plot style
# ------------------------
plt.rcParams.update({
    "figure.dpi": 120, "savefig.dpi": 300,
    "font.size": 20, "axes.titlesize": 26, "axes.labelsize": 22,
    "xtick.labelsize": 20, "ytick.labelsize": 20, "legend.fontsize": 20,
})

# ------------------------
# FMS二値化ヘルパ
# ------------------------
def binarize_fms(series: pd.Series, threshold: Optional[int] = None) -> pd.Series:
    """Convert FMS values into 0/1 labels.

    A value is labelled 1 when it is greater than or equal to the cut-off.
    The cut-off is ``int(threshold)`` when given, otherwise the module-level
    ``FMS_THRESHOLD``.
    """
    if threshold is None:
        cutoff = FMS_THRESHOLD
    else:
        cutoff = int(threshold)
    is_positive = series >= cutoff
    return is_positive.astype(int)

# ------------------------
# Model registry
# ------------------------
# A builder is called as builder(params, scale_pos_weight=...) by
# build_estimator and returns an unfitted estimator.
ModelBuilder = Callable[..., Any]
# backend name -> {"params": <hyper-parameter dict>, "builder": <ModelBuilder>}
MODEL_REGISTRY: Dict[str, Dict[str, Any]] = {}


def register_backend(name: str, params: Dict[str, Any], builder: ModelBuilder) -> None:
    """Register (or replace) a model backend in ``MODEL_REGISTRY``.

    The name is stored lower-cased because ``set_model_backend`` and
    ``build_estimator`` lower-case the requested name before looking it up;
    without this, a backend registered with upper-case letters could never
    be found.

    Args:
        name: Backend name; matched case-insensitively.
        params: Base hyper-parameters. The dict is stored by reference.
        builder: Callable invoked as ``builder(params, scale_pos_weight=...)``.
    """
    MODEL_REGISTRY[name.lower()] = {"params": params, "builder": builder}


def _build_xgb(params: Dict[str, Any], *, scale_pos_weight: Optional[float] = None):
    """Builder for the XGBoost backend.

    Returns an ``xgb.XGBClassifier`` created from a copy of ``params``. When
    ``scale_pos_weight`` is given, it is added to (or replaces the entry in)
    that copy as a float; ``params`` itself is never modified.
    """
    extra = {} if scale_pos_weight is None else {"scale_pos_weight": float(scale_pos_weight)}
    return xgb.XGBClassifier(**{**params, **extra})


def _build_rf(params: Dict[str, Any], **_):
    """Builder for the RandomForest backend.

    Any extra keyword arguments (for example ``scale_pos_weight`` passed by
    ``build_estimator``) are accepted and ignored.
    """
    forest = RandomForestClassifier(**params)
    return forest


def set_model_backend(name: str) -> None:
    """Make ``name`` (case-insensitive) the default backend.

    Raises:
        KeyError: if the lower-cased name is not present in ``MODEL_REGISTRY``.
    """
    global MODEL_BACKEND
    key = name.lower()
    if key in MODEL_REGISTRY:
        MODEL_BACKEND = key
        return
    raise KeyError(f"[ERROR] backend '{key}' は未登録: {list(MODEL_REGISTRY.keys())}")


def build_estimator(
    backend: Optional[str] = None,
    *,
    scale_pos_weight: Optional[float] = None,
    overrides: Optional[Dict[str, Any]] = None,
):
    """Create an unfitted estimator for a registered backend.

    Args:
        backend: Backend name (case-insensitive); the current
            ``MODEL_BACKEND`` is used when omitted or empty.
        scale_pos_weight: Forwarded to the backend's builder.
        overrides: Hyper-parameters layered on top of a copy of the
            registered base parameters.

    Raises:
        KeyError: if the backend is not registered.
    """
    key = (backend or MODEL_BACKEND).lower()
    if key not in MODEL_REGISTRY:
        raise KeyError(f"[ERROR] backend '{key}' は未登録。")
    entry = MODEL_REGISTRY[key]
    # Work on a copy so the registered defaults are never mutated.
    merged = entry["params"].copy()
    if overrides:
        merged.update(overrides)
    return entry["builder"](merged, scale_pos_weight=scale_pos_weight)


def fit_estimator(
    X_train: pd.DataFrame,
    y_train: pd.Series,
    *,
    backend: Optional[str] = None,
    scale_pos_weight: Optional[float] = None,
    overrides: Optional[Dict[str, Any]] = None,
):
    """Build an estimator via ``build_estimator`` and fit it.

    Features are cast to float32 and labels to int32 (without copying when
    the dtype already matches) before ``fit`` is called. Returns the fitted
    estimator.
    """
    features = X_train.astype(np.float32, copy=False)
    labels = y_train.astype(np.int32, copy=False)
    estimator = build_estimator(
        backend=backend,
        scale_pos_weight=scale_pos_weight,
        overrides=overrides,
    )
    estimator.fit(features, labels)
    return estimator


def predict_positive_score(model, X: pd.DataFrame) -> np.ndarray:
    """Return one score per row for the positive class (label 1).

    Dispatch order:
      1. ``predict_proba``: the column belonging to class ``1`` is located
         through ``model.classes_`` when that attribute exists. A model that
         was fitted on a single class no longer fails with an IndexError: if
         the only class is not ``1`` the positive probability is 0 for every
         row. Without ``classes_`` (or when label 1 is absent from a
         multi-column output) column 1 is used, as before.
      2. ``decision_function``: raw scores as float.
      3. ``predict``: predicted labels as float.

    Args:
        model: Fitted estimator.
        X: Feature matrix; cast to float32 before prediction.
    """
    X = X.astype(np.float32, copy=False)
    if hasattr(model, "predict_proba"):
        proba = np.asarray(model.predict_proba(X))
        classes = getattr(model, "classes_", None)
        if classes is not None:
            hits = np.flatnonzero(np.asarray(classes) == 1)
            if hits.size:
                return proba[:, int(hits[0])]
            if proba.ndim == 2 and proba.shape[1] < 2:
                # Single-class model that never saw a positive sample.
                return np.zeros(proba.shape[0], dtype=float)
        return proba[:, 1]
    if hasattr(model, "decision_function"):
        return np.asarray(model.decision_function(X), dtype=float)
    return model.predict(X).astype(float)


In [None]:
# ===== Cell 0A: XGBoost 正則化プリセット定義＆登録 =====
from __future__ import annotations

from typing import Any, Dict

# ------------------------
# XGB parameter presets
# ------------------------
# level:
#   0 -> regularization 0 (initial parameters)
#   1 -> regularization 1 (variant 1: stronger regularization for few samples / many features)
XGB_PARAM_PRESETS: Dict[int, Dict[str, Any]] = {
    0: dict(
        # regularization 0: initial parameters
        n_estimators=100,
        eval_metric="logloss",
        n_jobs=1,
        tree_method="hist",
        device="cpu",
        seed=0,
        random_state=0,
    ),
    1: dict(
        # regularization 1: variant 1 (shallow trees + stronger regularization)
        n_estimators=100,
        eval_metric="logloss",
        n_jobs=1,
        tree_method="hist",
        device="cpu",
        seed=0,
        random_state=0,

        # ---- learning rate (slightly lower) ----
        learning_rate=0.1,      # default 0.3 -> 0.1, learn in smaller steps

        # ---- tree complexity (mildly restricted) ----
        max_depth=3,            # default 6 -> 3 (shallower trees)
        min_child_weight=5,     # default 1 -> 5 (discourage splits on very few samples)
        gamma=0.5,              # default 0 -> 0.5 (prune splits with little gain)

        # ---- subsampling (mild regularization) ----
        subsample=0.8,          # 1.0 -> 0.8 (each tree sees 80% of the rows)
        colsample_bytree=0.6,   # 1.0 -> 0.6 (each tree sees 60% of the features)

        # ---- L2 / L1 regularization (the main lever) ----
        reg_lambda=20.0,        # L2 regularization
        reg_alpha=2.0,          # L1 regularization, adds a little implicit feature selection
    ),
}

# ------------------------
# Level descriptions (for logs / documentation)
# ------------------------
XGB_REG_DESCRIPTIONS: Dict[int, str] = {
    0: "正則化0: 初期パラ（ほぼデフォルト設定に近い XGB）",
    1: "正則化1: 改変①（浅い木 + 強めの L1/L2 正則化 + サブサンプリング）",
}


def get_xgb_params_by_level(level: int) -> Dict[str, Any]:
    """Return a copy of the XGB hyper-parameter preset for ``level``.

    level:
        0 -> regularization 0 (initial parameters)
        1 -> regularization 1 (variant 1)

    Raises:
        ValueError: if ``level`` is not a key of ``XGB_PARAM_PRESETS``.
    """
    if level in XGB_PARAM_PRESETS:
        # Copy so that callers cannot alter the preset table.
        return XGB_PARAM_PRESETS[level].copy()
    known_levels = sorted(XGB_PARAM_PRESETS.keys())
    raise ValueError(
        f"[get_xgb_params_by_level] 未定義の level={level} です。"
        f" 定義済み: {known_levels}"
    )


def describe_xgb_reg_level(level: int) -> str:
    """Return the short description of a regularization level (for logs / docs).

    Raises:
        ValueError: if ``level`` is not a key of ``XGB_REG_DESCRIPTIONS``.
    """
    if level in XGB_REG_DESCRIPTIONS:
        return XGB_REG_DESCRIPTIONS[level]
    known_levels = sorted(XGB_REG_DESCRIPTIONS.keys())
    raise ValueError(
        f"[describe_xgb_reg_level] 未定義の level={level} です。"
        f" 定義済み: {known_levels}"
    )


# ------------------------
# Register XGB using the level chosen in Cell0
# ------------------------
XGB_PARAMS: Dict[str, Any] = get_xgb_params_by_level(XGB_REG_LEVEL)

# Uses register_backend and _build_xgb, which are defined in Cell0
register_backend("xgb", XGB_PARAMS, _build_xgb)

# Short log
print(f"[Cell0A] XGB_REG_LEVEL={XGB_REG_LEVEL} ({REGULARIZATION_TAG})")
# The description is only informational, so a failure is reported as a
# warning instead of aborting the cell.
try:
    print(f"[Cell0A] {describe_xgb_reg_level(XGB_REG_LEVEL)}")
except Exception as e:
    print(f"[Cell0A][WARN] 正則化レベル説明の取得に失敗: {e}")

# Print only the tuning keys that the active preset actually sets.
main_keys = [
    "learning_rate", "max_depth", "min_child_weight", "gamma",
    "subsample", "colsample_bytree", "reg_lambda", "reg_alpha",
]
print("[Cell0A] XGB_PARAMS (main):",
      {k: XGB_PARAMS[k] for k in main_keys if k in XGB_PARAMS})


In [None]:
# ===== Cell 0B: RandomForest パラメータ定義＆登録 =====
from __future__ import annotations

from typing import Any, Dict

# ------------------------
# RF hyper-parameters
# ------------------------
RF_PARAMS: Dict[str, Any] = dict(
    n_estimators=100,      # paper: 100 decision trees
    max_features=1,        # paper: max feature of one

    # The rest is not stated in the paper: near-default values plus reproducibility
    bootstrap=True,        # scikit-learn default
    random_state=SEED_BASE,
    n_jobs=1,
)

# Uses register_backend and _build_rf, which are defined in Cell0
register_backend("rf", RF_PARAMS, _build_rf)

# ------------------------
# Final log (list of available backends, etc.)
# ------------------------
MODEL_ID = MODEL_BACKEND.upper()
print(f"[Cell0B] MODEL_BACKEND={MODEL_ID} / SEED={SEED_BASE} / backends={list(MODEL_REGISTRY.keys())}")
print(f"[Cell0B] XGB_REG_LEVEL={XGB_REG_LEVEL} ({REGULARIZATION_TAG})")

# The description is only informational, so a failure is reported as a warning.
try:
    desc = describe_xgb_reg_level(XGB_REG_LEVEL)
    print(f"[Cell0B] {desc}")
except Exception as e:
    print(f"[Cell0B][WARN] 正則化レベル説明の取得に失敗: {e}")

# Rough overview of the main XGB / RF parameters
xgb_main_keys = [
    "learning_rate", "max_depth", "min_child_weight", "gamma",
    "subsample", "colsample_bytree", "reg_lambda", "reg_alpha",
]
# XGB is only shown when it has been registered (Cell 0A).
if "xgb" in MODEL_REGISTRY:
    xgb_cfg = MODEL_REGISTRY["xgb"]["params"]
    print("[Cell0B] XGB_PARAMS (main):",
          {k: xgb_cfg[k] for k in xgb_main_keys if k in xgb_cfg})

print("[Cell0B] RF_PARAMS:", RF_PARAMS)


In [None]:
# ===== Cell 1: Data preparation (read CSVs -> merge epochs -> SUBJECT_META -> write matrices) =====
# Everything saved through outpath() in this cell goes to <LEVEL1_DIR>/Cell1.
set_cell_output(1)



import pandas as pd
import numpy as np
import os

# --------------------------------------------
# ① 30秒EPOCH CSVの読み込み・検証
# --------------------------------------------
def subject_csv_path(sid: str) -> str:
    """Return the path ``<BASE_INPUT_DIR>/<sid>/EPOCH/<sid>_epoch.csv``.

    Raises:
        FileNotFoundError: if that file does not exist.
    """
    csv_path = os.path.join(BASE_INPUT_DIR, sid, "EPOCH", f"{sid}_epoch.csv")
    if os.path.exists(csv_path):
        return csv_path
    raise FileNotFoundError(f"[Cell1] CSV missing for subject {sid}: {csv_path}")

# Load one CSV per subject, normalise the first three columns and stack them.
dfs = []
for sid in SUBJECT_IDS:
    df = pd.read_csv(subject_csv_path(sid))
    if df.shape[1] < 4:
        raise ValueError(f"[Cell1] {sid}: 列数が不足（>=4 必須）")
    df = df.copy()

    # Stringify the column names from the 4th column on (guards against numeric column names)
    df.columns = list(df.columns[:3]) + [str(c) for c in df.columns[3:]]
    # The first three columns are taken positionally, whatever they are called in the file.
    c1, c2, c3 = df.columns[:3]
    df = df.rename(columns={c1: "epoch_start", c2: "epoch_end", c3: "FMS"})

    # Unparseable cells become NA here and are rejected by the check below.
    df["epoch_start"] = pd.to_numeric(df["epoch_start"], errors="coerce").astype("Int64")
    df["epoch_end"]   = pd.to_numeric(df["epoch_end"],   errors="coerce").astype("Int64")
    df["FMS"]         = pd.to_numeric(df["FMS"],         errors="coerce").astype("Int64")

    if df[["epoch_start", "epoch_end", "FMS"]].isna().any().any():
        raise ValueError(f"[Cell1] {sid}: epoch_start/epoch_end/FMS に NaN があります。")

    df.insert(0, "subject_id", sid)
    dfs.append(df)

combined_raw = pd.concat(dfs, ignore_index=True)

# Features to exclude (frequency-domain ones, etc.)
exclude_feats = {"HF_power", "LF_power", "LF_HF_ratio"}
# Every remaining column that is neither a key column nor excluded is a feature.
feature_cols_all = [
    c for c in combined_raw.columns
    if c not in {"subject_id", "epoch_start", "epoch_end", "FMS"} and c not in exclude_feats
]
if not feature_cols_all:
    raise RuntimeError("[Cell1] 特徴量列が0です。列名や除外設定を確認してください。")

print(f"[Cell1] Loaded subjects={len(SUBJECT_IDS)}, rows={len(combined_raw)}, "
      f"features(after drop)={len(feature_cols_all)}")

print("[Cell1] Physiological feature columns (feature_cols_all):")
for col in feature_cols_all:
    print("  -", col)

# --------------------------------------------
# (2) Merge into EPOCH_LEN-second bins + baseline delta + label generation
# --------------------------------------------
if (ML_END - ML_START) % EPOCH_LEN != 0:
    raise ValueError(f"[Cell1] ML window {ML_END-ML_START} が EPOCH_LEN={EPOCH_LEN} で割り切れません。")

rows_per_bin = EPOCH_LEN // 30  # how many 30-second epochs are merged into one bin
df_out_list = []

# Directory for debug output
DEBUG_DIR = os.path.join(OUT_DIR, "Cell1_デバッグ")
os.makedirs(DEBUG_DIR, exist_ok=True)

for sid, sdf in combined_raw.groupby("subject_id", sort=False):
    # Fetch the baseline row (epoch_start == BASELINE_EPOCH); exactly one is required
    base_row = sdf.loc[sdf["epoch_start"] == BASELINE_EPOCH]
    if len(base_row) != 1:
        raise ValueError(f"[Cell1] {sid}: baseline epoch_start=={BASELINE_EPOCH} が見つからない（{len(base_row)}件）")

    base_vals = base_row[feature_cols_all].astype(float).iloc[0]
    if base_vals.isna().any():
        bad_cols = base_vals.index[base_vals.isna()].tolist()
        raise ValueError(f"[Cell1] {sid}: baselineにNaN -> {bad_cols}")

    # Keep only the time window used for learning
    sdf_ml = sdf[(sdf["epoch_start"] >= ML_START) & (sdf["epoch_start"] < ML_END)].copy()
    if sdf_ml.empty:
        raise ValueError(f"[Cell1] {sid}: ML window [{ML_START},{ML_END}) が空です。")

    # Bin edges used to merge the 30-second epochs into EPOCH_LEN seconds
    sdf_ml["bin_start"] = ML_START + ((sdf_ml["epoch_start"] - ML_START) // EPOCH_LEN) * EPOCH_LEN
    sdf_ml["bin_end"]   = sdf_ml["bin_start"] + EPOCH_LEN

    # Keep only bins that contain the full number of rows; incomplete bins are
    # dropped without a message.
    bin_counts = sdf_ml.groupby(["bin_start", "bin_end"]).size()
    complete_bins = bin_counts[bin_counts == rows_per_bin].index
    sdf_ml = sdf_ml.set_index(["bin_start", "bin_end"]).loc[complete_bins].reset_index()
    if sdf_ml.empty:
        raise ValueError(f"[Cell1] {sid}: EPOCH_LEN={EPOCH_LEN} で完全なbinが無い")

    # Average within each bin (FMS is averaged as well)
    agg_dict = {c: "mean" for c in feature_cols_all}
    agg_dict["FMS"] = "mean"
    g = sdf_ml.groupby(["subject_id", "bin_start", "bin_end"], as_index=False).agg(agg_dict)

    # Baseline delta (physiological features only)
    g_features = g[feature_cols_all].astype(float) - base_vals.values
    if g_features.isna().any().any():
        bad = g_features.columns[g_features.isna().any()].tolist()
        raise ValueError(f"[Cell1] {sid}: baseline差分後にNaN -> {bad}")

    # Assemble the output frame
    g_out = pd.concat(
        [g[["subject_id", "bin_start", "bin_end", "FMS"]], g_features],
        axis=1
    )
    g_out = g_out.rename(columns={"bin_start": "epoch_start", "bin_end": "epoch_end"})

    # Binarize FMS (applied to the per-bin mean computed above)
    g_out["label"] = binarize_fms(g_out["FMS"])

    # Fix the column order
    g_out = g_out[["subject_id", "epoch_start", "epoch_end", "FMS", "label"] + feature_cols_all]

    # Debug: save this subject's baseline-delta data as CSV
    debug_path = os.path.join(DEBUG_DIR, f"Cell1_debug_{sid}_E{EPOCH_LEN}s.csv")
    g_out.to_csv(debug_path, index=False, encoding="utf-8-sig")
    print(f"[Cell1-DEBUG] Saved baseline-diff data for subject {sid} -> {debug_path}")

    df_out_list.append(g_out)

df_ml_epoch = pd.concat(df_out_list, ignore_index=True)

print(f"[Cell1] df_ml_epoch shape={df_ml_epoch.shape}")

# --------------------------------------------
# (3) SUBJECT_META & MSSQ / VIMSSQ group (load subject attributes)
# --------------------------------------------
# Candidate locations are tried in order; the first existing file wins.
CANDIDATE_SCORE_PATHS = [
    "/mnt/data/summary_scores.xlsx",
    os.path.join(BASE_ANALYSIS_DIR, "summary_scores.xlsx"),
    os.path.join(BASE_ANALYSIS_DIR, "機械学習", "summary_scores.xlsx"),
    os.path.join(BASE_INPUT_DIR, "summary_scores.xlsx"),
]
score_path = next((p for p in CANDIDATE_SCORE_PATHS if os.path.exists(p)), None)
if score_path is None:
    raise FileNotFoundError("[Cell1] summary_scores.xlsx が見つかりません。")

meta_raw = pd.read_excel(score_path, sheet_name="Summary")

required = ["ID", "MSSQ", "VIMSSQ"]
missing = [c for c in required if c not in meta_raw.columns]
if missing:
    raise ValueError(f"[Cell1] summary_scores.xlsx に必須列がありません -> {missing}")

meta = meta_raw[required].copy()
# Normalise IDs to plain strings: strip whitespace and a trailing ".0".
meta["ID"] = (
    meta["ID"].astype(str).str.strip().str.replace(r"\.0$", "", regex=True)
)
# Scores must be numeric; anything unparseable raises here.
for c in ["MSSQ", "VIMSSQ"]:
    meta[c] = pd.to_numeric(meta[c], errors="raise")

# Keep only the subjects listed in SUBJECT_IDS and reject duplicated IDs.
sid_set = set(map(str, SUBJECT_IDS))
meta = meta[meta["ID"].isin(sid_set)].copy()
if meta["ID"].duplicated().any():
    dup_ids = meta.loc[meta["ID"].duplicated(), "ID"].tolist()
    raise ValueError(f"[Cell1] ID 重複 -> {dup_ids}")

# Create MSSQ_group / VIMSSQ_group ("High" when score >= the fixed threshold)
meta["MSSQ_group"]   = np.where(meta["MSSQ"]   >= MSSQ_THRESHOLD_FIXED,   "High", "Low")
meta["VIMSSQ_group"] = np.where(meta["VIMSSQ"] >= VIMSSQ_THRESHOLD_FIXED, "High", "Low")

SUBJECT_META = (
    meta.rename(columns={"ID": "subject_id"})
        .set_index("subject_id")[["MSSQ", "VIMSSQ", "MSSQ_group", "VIMSSQ_group"]]
        .copy()
)

SUBJECT_META.to_csv(outpath("subject_meta.csv"), encoding="utf-8-sig")
print(f"[Cell1] SUBJECT_META saved -> {outpath('subject_meta.csv')} (source='{score_path}')")

# --------------------------------------------
# (4) Add MSSQ / VIMSSQ as features according to the flags (done once)
# --------------------------------------------
trait_cols_to_use: list[str] = []
if USE_MSSQ_FEATURE:
    trait_cols_to_use.append("MSSQ")
if USE_VIMSSQ_FEATURE:
    trait_cols_to_use.append("VIMSSQ")

if trait_cols_to_use:
    merge_cols = ["subject_id"] + trait_cols_to_use
    # Left join: every epoch row is kept; subjects absent from SUBJECT_META
    # get NaN traits and are rejected below.
    df_ml_epoch = df_ml_epoch.merge(
        SUBJECT_META.reset_index()[merge_cols],
        on="subject_id",
        how="left",
    )

    if df_ml_epoch[trait_cols_to_use].isna().any().any():
        bad_sids = df_ml_epoch.loc[
            df_ml_epoch[trait_cols_to_use].isna().any(axis=1), "subject_id"
        ].unique().tolist()
        raise ValueError(f"[Cell1] MSSQ/VIMSSQ が欠損の subject_id があります -> {bad_sids}")

    print("[Cell1] df_ml_epoch with trait features (MSSQ/VIMSSQ):")
    print(df_ml_epoch[["subject_id"] + trait_cols_to_use].drop_duplicates().head())
else:
    print("[Cell1] Trait features (MSSQ/VIMSSQ) are disabled by flags.")

# Final feature list: physiological features plus the optional traits
feature_cols_full = feature_cols_all + trait_cols_to_use

print(f"[Cell1] Final feature columns (n={len(feature_cols_full)}):")
for col in feature_cols_full:
    print("  -", col)

# --------------------------------------------
# (5) Label distribution check BEFORE history is introduced (diagnostic)
# --------------------------------------------
# Per-subject counts of label 0 / label 1.
label_counts = (
    df_ml_epoch
    .groupby("subject_id")["label"]
    .value_counts()
    .unstack(fill_value=0)
)

# Make sure columns 0 and 1 both exist before renaming
for val in (0, 1):
    if val not in label_counts.columns:
        label_counts[val] = 0
label_counts = label_counts[[0, 1]].rename(columns={0: "neg_before", 1: "pos_before"})

LABEL_BEFORE_PATH = outpath("LABEL_DIST_BEFORE_HISTORY.csv")
label_counts.to_csv(LABEL_BEFORE_PATH, encoding="utf-8-sig")
print(f"[Cell1] Saved label distribution BEFORE history -> {LABEL_BEFORE_PATH}")

print("[Cell1] Label distribution BEFORE history (per subject):")
print(label_counts)

# --------------------------------------------
# (6) Build the learning matrices once (legacy layout)
#     NOTE: per the original note, X_all / y_all / groups are overwritten in
#     Cell2 after the time-series transformation
# --------------------------------------------
fname_raw = f"ML_DATA_DELTA_{EPOCH_LEN}S_RAW.CSV"
df_ml_epoch.to_csv(outpath(fname_raw), index=False, encoding="utf-8-sig")

X_all = df_ml_epoch[feature_cols_full].copy().astype(float)
y_all = df_ml_epoch["label"].copy().astype(int)
groups = df_ml_epoch["subject_id"].copy()

# The "scaled" file holds the same unscaled matrix as the raw file.
X_all.to_csv(outpath(f"X_RAW_ALL_{EPOCH_LEN}S.CSV"), index=False, encoding="utf-8-sig")
X_all.to_csv(outpath(f"X_SCALED_ALL_{EPOCH_LEN}S.CSV"), index=False, encoding="utf-8-sig")  # tree models need no scaling
pd.DataFrame({
    "subject_id": groups,
    "label": y_all,
    "FMS_mean": df_ml_epoch["FMS"],
}).to_csv(outpath(f"Y_AND_GROUPS_{EPOCH_LEN}S.CSV"), index=False, encoding="utf-8-sig")

print(f"[Cell1] Saved -> {outpath(fname_raw)} / X_RAW_ALL / X_SCALED_ALL / Y_AND_GROUPS")
print(f"[Cell1] Matrices ready (pre-history): X_all={X_all.shape}, y_all={y_all.shape}, "
      f"SUBJECT_META={SUBJECT_META.shape}")
print(f"[Cell1] n_features(physio)={len(feature_cols_all)}, "
      f"+ traits({len(trait_cols_to_use)}) -> {len(feature_cols_full)}")


In [None]:
# ===== Cell 1A: FMS trajectory plot per MSSQ / VIMSSQ group =====
# Re-uses the Cell1 output directory.
set_cell_output(1)



import matplotlib.pyplot as plt
import numpy as np

# Output directory
FMS_PLOT_DIR = os.path.join(OUT_DIR, "Cell1A_FMS_trajectory")
os.makedirs(FMS_PLOT_DIR, exist_ok=True)

# Plot style (user convention)
LW = 1.5
FS_TITLE, FS_LABEL, FS_LEGEND, FS_TICK = 30, 24, 20, 20

COLOR_HIGH = "red"
COLOR_LOW  = "blue"


def _prepare_fms_long_use_epoch_end():
    """Return the FMS time series of ``df_ml_epoch`` in long form.

    The result holds the columns ``subject_id``, ``epoch_end`` and ``FMS``
    plus ``t_min``: the time elapsed since ``ML_START`` in minutes, measured
    at ``epoch_end``.

    Raises:
        RuntimeError: if ``df_ml_epoch`` lacks one of the three columns.
    """
    needed = ["subject_id", "epoch_end", "FMS"]
    missing = set(needed) - set(df_ml_epoch.columns)
    if missing:
        raise RuntimeError(f"[Cell1A] df_ml_epoch に必須列がありません -> {missing}")

    long_df = df_ml_epoch[needed].copy()

    # Seconds since ML_START (at epoch_end), converted to minutes.
    elapsed_sec = (long_df["epoch_end"] - ML_START).astype(float)
    long_df["t_min"] = elapsed_sec / 60.0
    return long_df


def _plot_fms_by_group_min_axis(group_col: str, title_prefix: str, save_name: str):
    """
    Plot the mean FMS trajectory (+/- 1 SD band) of the High and Low groups
    and save the figure into FMS_PLOT_DIR.

    group_col: "MSSQ_group" or "VIMSSQ_group" (a column of SUBJECT_META)
    title_prefix: prefix of the figure title ("MSSQ group", etc.)
    save_name: file name of the saved figure

    Raises KeyError when SUBJECT_META has no such column and ValueError when
    a subject in the FMS data has no group assigned.
    """
    if group_col not in SUBJECT_META.columns:
        raise KeyError(
            f"[Cell1A] SUBJECT_META に {group_col} 列がありません。"
            "Cell1 の SUBJECT_META 作成部を確認してください。"
        )

    # Print the subject IDs that belong to each group
    for level in ["High", "Low"]:
        sids = SUBJECT_META.index[SUBJECT_META[group_col] == level].tolist()
        print(f"[Cell1A] {group_col} = {level}: subjects = {sorted(sids)}")

    # Merge group_col into the long FMS data (time based on epoch_end)
    df_long = _prepare_fms_long_use_epoch_end()
    df_long = df_long.merge(
        SUBJECT_META.reset_index()[["subject_id", group_col]],
        on="subject_id",
        how="left",
    )

    if df_long[group_col].isna().any():
        bad = df_long.loc[df_long[group_col].isna(), "subject_id"].unique().tolist()
        raise ValueError(f"[Cell1A] {group_col} が欠損の subject_id があります -> {bad}")

    # Mean and standard deviation of FMS per group x t_min
    agg = (
        df_long
        .groupby([group_col, "t_min"])["FMS"]
        .agg(["mean", "std"])
        .reset_index()
    )
    # A missing std (NaN) is drawn as a zero-width band.
    agg["std"] = agg["std"].fillna(0.0)

    # Plot
    fig, ax = plt.subplots(figsize=(10, 6))

    for level, color in [("High", COLOR_HIGH), ("Low", COLOR_LOW)]:
        sub = agg[agg[group_col] == level].sort_values("t_min")
        if sub.empty:
            print(f"[Cell1A] 注意: {group_col}={level} のデータがありません。")
            continue

        t = sub["t_min"].values  # in minutes
        m = sub["mean"].values
        s = sub["std"].values

        # Mean (thick line)
        ax.plot(t, m, label=f"{level} (mean)", linewidth=LW * 2.0, color=color)
        # +/- 1 SD band
        ax.fill_between(t, m - s, m + s, alpha=0.2, color=color, linewidth=0)

    # ----- Axis settings -----

    # x-axis: from 0 to the length of the ML window in minutes
    duration_min = (ML_END - ML_START) / 60.0
    # Integer ticks (0, 1, 2, ...) up to the floor of the duration
    max_tick = int(np.floor(duration_min))
    xticks = np.arange(0, max_tick + 1, 1)
    ax.set_xlim(0.0, duration_min)
    ax.set_xticks(xticks)
    # Tick labels are shown as integers (0, 1, 2, ...)
    ax.set_xlabel("Time [min]", fontsize=FS_LABEL)

    # y-axis: FMS 0 to 4, integer ticks
    ax.set_ylim(0, 4)
    ax.set_yticks([0, 1, 2, 3, 4])
    ax.set_ylabel("FMS", fontsize=FS_LABEL)

    # Title
    ax.set_title(f"{title_prefix}別 FMS 推移", fontsize=FS_TITLE)

    # Grid
    ax.grid(True)

    # Vertical dashed lines at 3 min and 6 min 30 s
    ax.axvline(3.0,  linestyle="--", linewidth=LW, color="gray")
    ax.axvline(6.5,  linestyle="--", linewidth=LW, color="gray")

    # Tick label font size
    ax.tick_params(axis="both", labelsize=FS_TICK)

    # Legend
    ax.legend(fontsize=FS_LEGEND)

    fig.tight_layout()
    save_path = os.path.join(FMS_PLOT_DIR, save_name)
    fig.savefig(save_path, dpi=300)
    # Close the figure after saving so it does not stay open.
    plt.close(fig)

    print(f"[Cell1A] Saved FMS trajectory plot -> {save_path}")


# ---- Choose the grouping basis: MSSQ / VIMSSQ ----
basis = GROUPING_BASIS_FOR_PLOTS.upper()

# Reject anything other than the two supported bases up front.
if basis not in ("MSSQ", "VIMSSQ"):
    raise ValueError(
        f"[Cell1A] GROUPING_BASIS_FOR_PLOTS は 'MSSQ' か 'VIMSSQ' を指定してください "
        f"（現在: {GROUPING_BASIS_FOR_PLOTS}）"
    )

# Group column, title prefix and file name that go with the chosen basis.
if basis == "VIMSSQ":
    group_col, title_prefix = "VIMSSQ_group", "VIMSSQ group"
    save_name = f"FMS_VIMSSQ_group_E{EPOCH_LEN}s.png"
else:
    group_col, title_prefix = "MSSQ_group", "MSSQ group"
    save_name = f"FMS_MSSQ_group_E{EPOCH_LEN}s.png"

print(f"[Cell1A] GROUPING_BASIS_FOR_PLOTS = {GROUPING_BASIS_FOR_PLOTS} で FMS 推移を描画します。")

_plot_fms_by_group_min_axis(
    group_col=group_col,
    title_prefix=title_prefix,
    save_name=save_name,
)


In [None]:
# ===== Cell 2: Shared modeling helpers (fit / SHAP / evaluation) =====
set_cell_output(2)



# NOTE(review): `Any` is used in an annotation further down in this cell but is
# not imported here; it appears to rely on the import done in Cell 0 - confirm.
from typing import Dict, Optional, Tuple

import numpy as np
import pandas as pd
import shap
from sklearn.metrics import roc_auc_score, accuracy_score


# --------------------------------------------
# 学習ラッパー（Cell0のレジストリAPIを利用）
# --------------------------------------------
def fit_classifier(
    X_train: pd.DataFrame,
    y_train: pd.Series,
    *,
    backend: Optional[str] = None,
    scale_pos_weight: Optional[float] = None,
    overrides: Optional[Dict[str, Any]] = None,
):
    """
    Thin wrapper around Cell0's fit_estimator.
    - Pass backend / overrides only when a SHAP or evaluation cell needs a
      backend other than the default one.
    - Raises RuntimeError when fit_estimator is not defined yet (Cell0 not run).
    """
    if "fit_estimator" not in globals():
        raise RuntimeError("[Cell2] fit_estimator が未定義です。Cell0 を先に実行してください。")
    features = X_train.astype(np.float32, copy=False)
    labels = y_train.astype(np.int32, copy=False)
    options = dict(
        backend=backend,
        scale_pos_weight=scale_pos_weight,
        overrides=overrides,
    )
    return fit_estimator(features, labels, **options)


# --------------------------------------------
# TreeSHAP ベースの特徴重要度算出
# --------------------------------------------
def compute_train_shap_abs_mean(model, X_ref: pd.DataFrame) -> pd.Series:
    """
    Mean absolute SHAP value per feature on X_ref, sorted in descending order.

    A shap.TreeExplainer is first built in probability space with an
    interventional background sample (at most 128 rows of X_ref, drawn with
    SEED_BASE). If that fails, the function falls back to raw model output
    with tree_path_dependent perturbation and prints a warning, because the
    two modes produce values on different scales.

    Args:
        model: Fitted tree model (XGB / RF are the intended backends).
        X_ref: Reference feature matrix; cast to float32.

    Returns:
        pd.Series named "mean_abs", indexed by X_ref's column names.

    Raises:
        RuntimeError: if the SHAP matrix does not have one column per feature.
        Any exception raised by shap in the fallback path propagates unchanged.
    """
    X_ref = X_ref.astype(np.float32, copy=False)

    # Background data (at most 128 rows)
    bg_n = min(128, len(X_ref))
    X_bg = X_ref.sample(n=bg_n, random_state=SEED_BASE) if bg_n >= 2 else X_ref

    try:
        explainer = shap.TreeExplainer(
            model,
            data=X_bg,
            model_output="probability",
            feature_perturbation="interventional",
        )
        sv_any = explainer.shap_values(X_ref)
    except Exception as exc:
        # Fall back to raw output when the probability setup is not supported.
        # Report it: the resulting values are no longer in probability units.
        print(
            "[Cell2][WARN] TreeExplainer(model_output='probability') failed "
            f"({type(exc).__name__}: {exc}); falling back to model_output='raw'."
        )
        explainer = shap.TreeExplainer(
            model,
            model_output="raw",
            feature_perturbation="tree_path_dependent",
        )
        sv_any = explainer.shap_values(X_ref)

    # Index of the positive class (label 1); the last class when it is unknown.
    # np.asarray makes the lookup work for list-like classes_ as well.
    pos_idx = -1
    classes = getattr(model, "classes_", None)
    if classes is not None:
        hits = np.flatnonzero(np.asarray(classes) == 1)
        if hits.size:
            pos_idx = int(hits[0])

    # Normalise the shap_values result to 2D (n_samples x n_features)
    if isinstance(sv_any, list):
        sv = np.asarray(sv_any[pos_idx])
    else:
        sv = np.asarray(getattr(sv_any, "values", sv_any))
        if sv.ndim == 3:
            sv = sv[..., pos_idx]
        elif sv.ndim == 1:
            sv = sv.reshape(-1, 1)

    if sv.ndim != 2 or sv.shape[1] != X_ref.shape[1]:
        raise RuntimeError(
            f"[Cell2] SHAP shape mismatch: sv.shape={sv.shape}, X_ref.shape={X_ref.shape}"
        )

    abs_mean = np.mean(np.abs(sv), axis=0)
    return pd.Series(abs_mean, index=X_ref.columns, name="mean_abs").sort_values(ascending=False)


# --------------------------------------------
# 評価ユーティリティ
# --------------------------------------------
def _is_probability_like(scores: np.ndarray) -> bool:
    return np.isfinite(scores).all() and 0.0 <= scores.min() and scores.max() <= 1.0


def evaluate_fold(model, X_test: pd.DataFrame, y_test: pd.Series) -> Dict[str, float]:
    """
    Evaluate a fitted model on one fold.

    - ROC AUC: computed only when y_test contains exactly two classes,
      otherwise NaN.
    - Accuracy: the cut-off follows the kind of score that
      predict_positive_score returns for this model: 0.5 for predict_proba,
      0.0 for decision_function. Only for models that offer neither is the
      score range inspected. Previously the range was always inspected, so a
      decision_function model whose margins all happened to lie in [0, 1]
      was cut at 0.5 instead of 0.0.
      (Threshold optimisation is done in another cell.)

    Returns:
        {"roc_auc": float, "accuracy": float}
    """
    X_test = X_test.astype(np.float32, copy=False)
    scores = predict_positive_score(model, X_test)

    if len(np.unique(y_test)) == 2:
        roc_auc = roc_auc_score(y_test, scores)
    else:
        roc_auc = float("nan")

    # Same precedence as predict_positive_score: predict_proba wins.
    if hasattr(model, "predict_proba"):
        thr = 0.5
    elif hasattr(model, "decision_function"):
        thr = 0.0
    else:
        thr = 0.5 if _is_probability_like(scores) else 0.0
    pred = (scores >= thr).astype(int)
    acc = accuracy_score(y_test.astype(int), pred)

    return {"roc_auc": float(roc_auc), "accuracy": float(acc)}


# Signal that the Cell 2 helper functions are now defined.
print("[Cell2] Modeling helpers ready (fit_classifier / compute_train_shap_abs_mean / evaluate_fold)")


In [None]:
# ===== Cell 3: Time-series feature transformation (fixed width of H epochs) + statistics log =====
set_cell_output(3)



import numpy as np
import pandas as pd

# Check that the required objects exist
required = [
    "df_ml_epoch",
    "feature_cols_all",
    "trait_cols_to_use",
    "HISTORY_N_EPOCHS",
    "EPOCH_LEN",
    "outpath",
]
missing = [name for name in required if name not in globals()]
if missing:
    raise RuntimeError(f"[Cell3] 未定義の変数があります: {missing}")

H = int(HISTORY_N_EPOCHS)
if H < 1:
    raise ValueError("[Cell3] HISTORY_N_EPOCHS は 1 以上である必要があります。")

print(f"[Cell3] 時系列特徴量変換を開始: HISTORY_N_EPOCHS={H}, EPOCH_LEN={EPOCH_LEN}")

# --------------------------------------------
# (1) Recompute the label distribution BEFORE history is introduced (safety net)
#     NOTE: Cell1 already saves it; it is rebuilt here as well
# --------------------------------------------
label_before = (
    df_ml_epoch
    .groupby("subject_id")["label"]
    .value_counts()
    .unstack(fill_value=0)
)

# Make sure columns 0 and 1 both exist before renaming
for val in (0, 1):
    if val not in label_before.columns:
        label_before[val] = 0
label_before = label_before[[0, 1]].rename(columns={0: "neg_before", 1: "pos_before"})

print("[Cell3] Label distribution BEFORE history (recomputed):")
print(label_before)

# --------------------------------------------
# ② HISTORY_N_EPOCHS == 1 の場合は変換せず、そのまま利用
# --------------------------------------------
if H == 1:
    print("[Cell3] HISTORY_N_EPOCHS == 1 のため、履歴連結は行わず df_ml_epoch をそのまま使用します。")

    df_ml_ts = df_ml_epoch.copy()

    # サンプル統計（履歴導入後＝beforeと同じ）
    per_subject_stats = []
    for sid, g in df_ml_epoch.groupby("subject_id", sort=False):
        n_raw = len(g)
        pos_after = int((g["label"] == 1).sum())
        neg_after = int((g["label"] == 0).sum())
        per_subject_stats.append({
            "subject_id": sid,
            "n_raw": n_raw,
            "n_ts": n_raw,
            "pos_after": pos_after,
            "neg_after": neg_after,
        })

    subject_stats_df = pd.DataFrame(per_subject_stats).set_index("subject_id")
    subject_stats_df = subject_stats_df.join(label_before, how="left")

    # 比率なども一応追加
    subject_stats_df["ratio_n_ts"] = subject_stats_df["n_ts"] / subject_stats_df["n_raw"].replace(0, np.nan)

    STATS_PATH = outpath("SUBJECT_TS_STATS.csv")
    subject_stats_df.to_csv(STATS_PATH, encoding="utf-8-sig")
    print(f"[Cell3] SUBJECT_TS_STATS saved -> {STATS_PATH}")
    print(subject_stats_df)

    # 特徴量リストは Cell1 での feature_cols_full をそのまま使用
    if "feature_cols_full" not in globals():
        raise RuntimeError("[Cell3] feature_cols_full が未定義です。Cell1 を先に実行してください。")

    ts_feature_cols = feature_cols_full

    # グローバル行列を設定（従来仕様）
    X_all = df_ml_ts[ts_feature_cols].astype(float)
    y_all = df_ml_ts["label"].astype(int)
    groups = df_ml_ts["subject_id"].copy()

    df_ml_ts.to_csv(outpath(f"ML_DATA_TS_{EPOCH_LEN}S_H{H}.CSV"), index=False, encoding="utf-8-sig")
    print(f"[Cell3] df_ml_ts saved -> {outpath(f'ML_DATA_TS_{EPOCH_LEN}S_H{H}.CSV')}")
    print(f"[Cell3] Matrices ready (H=1): X_all={X_all.shape}, y_all={y_all.shape}")
else:
    # --------------------------------------------
    # ③ HISTORY_N_EPOCHS >= 2: 固定幅 H エポック履歴に変換
    # --------------------------------------------
    # 物理特徴量部分の lag付き列名
    physio_lag_cols = [
        f"{col}_lag{lag}"
        for lag in range(H - 1, -1, -1)  # 例: H=3 -> lag2, lag1, lag0
        for col in feature_cols_all
    ]
    ts_feature_cols = physio_lag_cols + trait_cols_to_use

    print(f"[Cell3] physio_lag_cols: {len(physio_lag_cols)} 列, traits: {trait_cols_to_use}")

    df_ts_list = []
    per_subject_stats = []

    for sid, g in df_ml_epoch.groupby("subject_id", sort=False):
        g = g.sort_values("epoch_start").reset_index(drop=True)
        n_raw = len(g)

        rows_ts = []

        # i: 履歴ウィンドウの末尾インデックス
        for i in range(H - 1, n_raw):
            block = g.iloc[i - H + 1: i + 1]

            # epoch_start が EPOCH_LEN 刻みで連続しているか確認
            starts = block["epoch_start"].to_numpy()
            diffs = np.diff(starts)
            if not np.all(diffs == EPOCH_LEN):
                # 欠損をまたぐウィンドウは使わない
                continue

            # 直近Hエポック分の生理特徴量を連結
            features_seq = []
            for row_idx in range(i - H + 1, i + 1):
                features_seq.append(
                    g.loc[row_idx, feature_cols_all].to_numpy(dtype=float)
                )
            features_concat = np.concatenate(features_seq, axis=0)

            if features_concat.shape[0] != len(physio_lag_cols):
                raise RuntimeError(
                    f"[Cell3] features_concat length mismatch: "
                    f"{features_concat.shape[0]} vs {len(physio_lag_cols)}"
                )

            row = {
                "subject_id": g.loc[i, "subject_id"],
                "epoch_start": int(g.loc[i, "epoch_start"]),
                "epoch_end": int(g.loc[i, "epoch_end"]),
                "FMS": float(g.loc[i, "FMS"]),
                "label": int(g.loc[i, "label"]),
            }

            # lag付き物理特徴
            for c_idx, col_name in enumerate(physio_lag_cols):
                row[col_name] = float(features_concat[c_idx])

            # trait（MSSQ/VIMSSQ）はlagなしでそのまま
            for tcol in trait_cols_to_use:
                row[tcol] = float(g.loc[i, tcol])

            rows_ts.append(row)

        df_sub_ts = pd.DataFrame(rows_ts)
        n_ts = len(df_sub_ts)

        if n_ts > 0:
            pos_after = int((df_sub_ts["label"] == 1).sum())
            neg_after = int((df_sub_ts["label"] == 0).sum())
        else:
            pos_after = 0
            neg_after = 0

        per_subject_stats.append({
            "subject_id": sid,
            "n_raw": n_raw,
            "n_ts": n_ts,
            "pos_after": pos_after,
            "neg_after": neg_after,
        })

        df_ts_list.append(df_sub_ts)

        print(f"[Cell3] subject {sid}: n_raw={n_raw}, n_ts={n_ts}, "
              f"pos_after={pos_after}, neg_after={neg_after}")

    # 被験者ごと時系列ウィンドウを縦結合
    df_ml_ts = pd.concat(df_ts_list, ignore_index=True) if df_ts_list else pd.DataFrame()

    print(f"[Cell3] df_ml_ts shape={df_ml_ts.shape}")

    # --------------------------------------------
    # ④ サンプル数・ラベル分布統計（before/after）を集計
    # --------------------------------------------
    subject_stats_df = pd.DataFrame(per_subject_stats).set_index("subject_id")
    subject_stats_df = subject_stats_df.join(label_before, how="left")

    # 比率などを追加
    subject_stats_df["ratio_n_ts"] = subject_stats_df["n_ts"] / subject_stats_df["n_raw"].replace(0, np.nan)

    STATS_PATH = outpath("SUBJECT_TS_STATS.csv")
    subject_stats_df.to_csv(STATS_PATH, encoding="utf-8-sig")
    print(f"[Cell3] SUBJECT_TS_STATS saved -> {STATS_PATH}")
    print(subject_stats_df)

    # --------------------------------------------
    # ⑤ グローバル行列 X_all / y_all / groups を履歴版に差し替え
    # --------------------------------------------
    if df_ml_ts.empty:
        raise RuntimeError("[Cell3] df_ml_ts が空です。履歴ウィンドウ条件が厳しすぎる可能性があります。")

    X_all = df_ml_ts[ts_feature_cols].astype(float)
    y_all = df_ml_ts["label"].astype(int)
    groups = df_ml_ts["subject_id"].copy()

    df_ml_ts.to_csv(outpath(f"ML_DATA_TS_{EPOCH_LEN}S_H{H}.CSV"), index=False, encoding="utf-8-sig")
    print(f"[Cell3] df_ml_ts saved -> {outpath(f'ML_DATA_TS_{EPOCH_LEN}S_H{H}.CSV')}")
    print(f"[Cell3] Matrices ready (H={H}): X_all={X_all.shape}, y_all={y_all.shape}")
    print(f"[Cell3] ts_feature_cols n={len(ts_feature_cols)}")


In [None]:
# ===== Cell 3A: Pre-removal of highly correlated features (per feature group) =====
set_cell_output(3)


import json
import re
import numpy as np
import pandas as pd
from collections import defaultdict

# Fail fast when an object this cell reads has not been defined yet.
required = ["X_all", "outpath"]
missing = [name for name in required if name not in globals()]
if missing:
    raise RuntimeError(f"[Cell3A-pre] 未定義の変数/関数があります: {missing}")

# Groups whose representative series has variance at or below this are excluded.
MIN_VARIANCE = 1e-8
# JSON file that receives the list of columns kept by this cell.
FEATURE_LIST_PATH = outpath("FEATURES_AFTER_CORR.json")

# ---------- Helper: resolve a column's feature group ----------
def get_feature_group(col: str) -> str:
    """
    Return the base feature name of ``col``:
      - 'xxx_lag0', 'xxx_lag1', ... -> 'xxx'
      - any other column name is returned unchanged
    """
    matched = re.match(r"(.+)_lag\d+$", col)
    if matched is None:
        return col
    return matched.group(1)

# ---------- Keep numeric columns only ----------
X_num = X_all.select_dtypes(include=[np.number]).copy()
if X_num.empty:
    raise RuntimeError("[Cell3A-pre] 数値列がありません。")

# ---------- column -> group / group -> columns ----------
col_to_group: dict[str, str] = {}
group_to_cols: dict[str, list[str]] = defaultdict(list)

for col in X_num.columns:
    g = get_feature_group(col)
    col_to_group[col] = g
    group_to_cols[g].append(col)

# Dict insertion order already is "first appearance in X_num.columns".
group_names_in_order: list[str] = list(group_to_cols)

# ---------- One representative series per group (mean over its lag columns) ----------
# Built in a single constructor call rather than inserting column by column.
X_group = pd.DataFrame(
    {g: X_num[cols].mean(axis=1) for g, cols in group_to_cols.items()},
    index=X_num.index,
)

# ---------- Exclude groups with (near-)zero variance ----------
var = X_group.var(axis=0, ddof=1).fillna(0.0)
valid_groups = var[var > MIN_VARIANCE].index.tolist()
if not valid_groups:
    raise RuntimeError("[Cell3A-pre] 分散がほぼゼロのため使用可能なグループがありません。")

valid_group_set = set(valid_groups)

# Priority follows the column order of X_all: earlier groups win conflicts.
priority_groups = [g for g in group_names_in_order if g in valid_group_set]
# Groups removed by the variance filter; recorded so the output explains
# every column that does not end up in "keep".
low_variance_groups = [g for g in group_names_in_order if g not in valid_group_set]
X_use = X_group[priority_groups]

# ---------- Absolute Pearson correlation between group representatives ----------
corr = X_use.corr(method="pearson").abs()

keep_groups: list[str] = []
dropped_groups_detail: list[dict] = []

for g in priority_groups:
    # First already-kept group that g is highly correlated with, if any.
    conflict = next(
        (kept for kept in keep_groups if corr.loc[g, kept] >= CORR_THRESHOLD),
        None,
    )
    if conflict is None:
        # Not highly correlated with any kept group -> keep as a representative.
        keep_groups.append(g)
    else:
        # Keep the earlier representative (conflict) and drop the later group g.
        dropped_groups_detail.append({
            "group": g,
            "representative_group": conflict,
            "abs_corr": float(corr.loc[g, conflict]),
            "dropped_columns": group_to_cols[g],
            "representative_columns": group_to_cols[conflict],
        })

# ---------- Expand groups back to columns ----------
keep_group_set = set(keep_groups)
keep_columns = [c for c in X_num.columns if col_to_group[c] in keep_group_set]
dropped_columns = [c for c in X_num.columns if col_to_group[c] not in keep_group_set]

payload = {
    # Read by later cells: the list of column names to keep.
    "keep": keep_columns,

    # Additional group-level information.
    "keep_groups": keep_groups,
    "dropped_groups": dropped_groups_detail,
    "low_variance_groups": low_variance_groups,
    "threshold": CORR_THRESHOLD,
    "total_groups": len(priority_groups),
    "total_columns": len(X_num.columns),
}
with open(FEATURE_LIST_PATH, "w", encoding="utf-8") as f:
    json.dump(payload, f, ensure_ascii=False, indent=2)

print(f"[Cell3A-pre] group_keep={len(keep_groups)} / total_groups={len(priority_groups)}, "
      f"drop_groups={len(dropped_groups_detail)}")
print(f"[Cell3A-pre] keep_columns={len(keep_columns)} / total_columns={len(X_num.columns)}")
print(f"[Cell3A-pre] JSON -> {FEATURE_LIST_PATH}")

# ---- Log output ----
if low_variance_groups:
    print(f"[Cell3A-pre] low-variance groups excluded (var <= {MIN_VARIANCE}): {low_variance_groups}")

if dropped_groups_detail:
    print(f"[Cell3A-pre] |r| >= {CORR_THRESHOLD:.2f} のグループペア（後から出てきた方を DROP）：")
    for d in dropped_groups_detail:
        print(
            f"  [KEEP-G] {d['representative_group']}  "
            f"[DROP-G] {d['group']}  "
            f"(abs_corr={d['abs_corr']:.3f})  "
            f"cols_keep={len(d['representative_columns'])}, "
            f"cols_drop={len(d['dropped_columns'])}"
        )
else:
    print("[Cell3A-pre] 高相関によるグループ除去はありませんでした。")


In [None]:
# ===== Cell 3B-mean: mean SHAP value (LOSO, TreeSHAP, per feature group) =====
# For every leave-one-subject-out fold: fit a model, compute mean(|SHAP|) per
# column on the training rows, sum it per feature group, and evaluate the
# held-out subject. Fold results are then averaged into rankings and plotted.
set_cell_output(3)


RUN_CELL_3A_SHAP = USE_FS_SHAP_MEAN  # original note: defaults to True (flag is defined in another cell)

if RUN_CELL_3A_SHAP:
    from sklearn.model_selection import LeaveOneGroupOut
    import json
    import os
    import re
    from collections import defaultdict

    import numpy as np
    import pandas as pd
    import matplotlib
    import matplotlib.backends
    import matplotlib.pyplot as plt

    # Fail fast when an object this cell reads has not been defined yet.
    required = [
        "X_all", "y_all", "groups",
        "fit_classifier", "evaluate_fold",
        "outpath", "compute_train_shap_abs_mean"
    ]
    missing = [name for name in required if name not in globals()]
    if missing:
        raise RuntimeError(f"[Cell3A-SHAP-RANK] 未定義の変数/関数があります: {missing}")

    # ---------- Helper: resolve a column's feature group ----------
    def get_feature_group(col: str) -> str:
        """Return 'xxx' for 'xxx_lag<digits>'; any other name is returned unchanged."""
        m = re.match(r"(.+)_lag\d+$", col)
        return m.group(1) if m else col

    # Backend passed to fit_classifier for the TreeSHAP model (a tree model).
    SHAP_BACKEND = "xgb"

    FEATURE_LIST_PATH = cell_output_path(3, "FEATURES_AFTER_CORR.json")

    # --- Choose the feature pool (use the correlation pre-pruning result if present) ---
    if os.path.exists(FEATURE_LIST_PATH):
        with open(FEATURE_LIST_PATH, "r", encoding="utf-8") as f:
            keep_payload = json.load(f)
        # Use only the columns marked KEEP by the pre-pruning step.
        keep_cols = keep_payload.get("keep", [])
        feature_pool = [c for c in keep_cols if c in X_all.columns]
        print(f"[Cell3A-SHAP-RANK] correlation-pruned columns loaded ({len(feature_pool)} cols)")
    else:
        feature_pool = list(X_all.columns)
        print("[Cell3A-SHAP-RANK] correlation-pruned list not found. Using all columns.")

    if not feature_pool:
        raise RuntimeError("[Cell3A-SHAP-RANK] feature_pool が空です。Cell3A-pre の結果を確認してください。")

    X_source = X_all[feature_pool].copy()

    # column -> group / group -> columns
    col_to_group: dict[str, str] = {}
    group_to_cols: dict[str, list[str]] = defaultdict(list)
    for col in X_source.columns:
        g = get_feature_group(col)
        col_to_group[col] = g
        group_to_cols[g].append(col)

    group_names = list(group_to_cols.keys())

    logo = LeaveOneGroupOut()

    # Per column: mean(|SHAP|) of each fold.
    col_shap_frames = []
    # Per group: mean(|SHAP|) of each fold.
    group_shap_frames = []
    # Per-fold performance.
    metrics_rows = []

    # --- LOSO loop ---
    for fold_id, (tr_idx, te_idx) in enumerate(logo.split(X_source, y_all, groups), start=1):
        X_tr = X_source.iloc[tr_idx].astype(np.float32)
        y_tr = y_all.iloc[tr_idx].astype(int)
        X_te = X_source.iloc[te_idx].astype(np.float32)
        y_te = y_all.iloc[te_idx].astype(int)

        # A single-class training set cannot be fitted as a classifier here.
        if len(np.unique(y_tr)) < 2:
            raise RuntimeError(f"[Cell3A-SHAP-RANK] fold{fold_id}: 学習側が単一クラスです。")

        # --- Fit the tree model (TreeSHAP-compatible) ---
        model = fit_classifier(X_tr, y_tr, backend=SHAP_BACKEND)

        # --- Column-level mean(|SHAP|) on the training rows ---
        shap_mean_col = compute_train_shap_abs_mean(model, X_tr)
        # Restore the training column order so the index aligns across folds.
        shap_mean_col = shap_mean_col.reindex(X_tr.columns)

        # Store the raw mean(|SHAP|); ranks are derived after all folds.
        shap_mean_col.name = f"fold{fold_id}"
        col_shap_frames.append(shap_mean_col)

        # --- Aggregate to groups: SUM of the member columns' mean_abs ---
        group_importance: dict[str, float] = defaultdict(float)
        for col, val in shap_mean_col.items():
            g = col_to_group[col]
            group_importance[g] += float(val)

        shap_mean_group = pd.Series(group_importance)

        # Store the group-level value for this fold.
        shap_mean_group.name = f"fold{fold_id}"
        group_shap_frames.append(shap_mean_group)

        # --- Per-fold performance on the held-out subject ---
        metrics = evaluate_fold(model, X_te, y_te)
        metrics.update({
            "fold_id": fold_id,
            "test_subject": groups.iloc[te_idx].iloc[0],
        })
        metrics_rows.append(metrics)

        preview_groups = shap_mean_group.sort_values(ascending=False).head(5).index.tolist()
        print(f"[Cell3A-SHAP-RANK] fold{fold_id}: ranked groups={len(shap_mean_group)} (top5 groups={preview_groups})")

    # ---------- Column-level ranking (based purely on mean |SHAP|) ----------
    # Table of mean(|SHAP|) per column x fold.
    col_shap_df = pd.concat(col_shap_frames, axis=1)

    # Fold average / median, both computed from the fold-only table.
    col_rank_df = col_shap_df.copy()
    col_rank_df["shap_mean"] = col_shap_df.mean(axis=1)
    col_rank_df["shap_median"] = col_shap_df.median(axis=1)

    # Larger shap_mean means more important, so rank in descending order.
    #   rank_mean is the overall rank by fold-averaged |SHAP| (1 = most important).
    col_rank_df["rank_mean"] = col_rank_df["shap_mean"].rank(
        ascending=False, method="min"
    ).astype(int)

    # Sort by that rank.
    col_rank_df = col_rank_df.sort_values("rank_mean")

    rank_path_cols = outpath("SHAP_MEAN_FEATURE_RANKING.CSV")
    col_rank_df.to_csv(rank_path_cols, encoding="utf-8-sig")
    col_rank_df.to_csv(outpath("SHAP_FEATURE_RANKING_LABELED.CSV"), encoding="utf-8-sig")

    print(f"[Cell3A-SHAP-RANK] 列単位ランキング saved -> {rank_path_cols}")

    # ---------- Group-level ranking (based purely on mean |SHAP|; main result) ----------
    group_shap_df = pd.concat(group_shap_frames, axis=1)
    group_rank_df = group_shap_df.copy()
    group_rank_df["shap_mean"] = group_shap_df.mean(axis=1)
    group_rank_df["shap_median"] = group_shap_df.median(axis=1)

    # Groups are ranked by shap_mean as well (descending).
    group_rank_df["rank_mean"] = group_rank_df["shap_mean"].rank(
        ascending=False, method="min"
    ).astype(int)

    group_rank_df = group_rank_df.sort_values("rank_mean")

    rank_path_groups = outpath("SHAP_MEAN_GROUP_RANKING.CSV")
    group_rank_df.to_csv(rank_path_groups, encoding="utf-8-sig")
    group_rank_df.to_csv(outpath("SHAP_GROUP_RANKING_LABELED.CSV"), encoding="utf-8-sig")

    print(f"[Cell3A-SHAP-RANK] グループ単位ランキング saved -> {rank_path_groups}")

    # ---------- Per-fold performance ----------
    metrics_path = outpath("LOSO_METRICS.CSV")
    pd.DataFrame(metrics_rows).to_csv(metrics_path, index=False, encoding="utf-8-sig")
    print(f"[Cell3A-SHAP-RANK] LOSO metrics saved -> {metrics_path}")

    # ---------- Plot the group ranking ----------
    # NOTE(review): the output file names below hard-code "TOP8"; they will not
    # follow TOP_K if this value is changed.
    TOP_K = 8

    # All groups (plots shap_mean, not the rank). Reversed so the best group is on top.
    plt.figure(figsize=(10, max(5, len(group_rank_df)//3)))
    plt.barh(group_rank_df.index[::-1], group_rank_df["shap_mean"][::-1])
    plt.xlabel("Mean |SHAP| over folds")
    plt.ylabel("Feature group")
    plt.title("SHAP-based Feature Group Importance (All)")
    plt.tight_layout()
    plt.savefig(outpath("SHAP_MEAN_GROUP_RANKING_ALL.PNG"), dpi=300)
    plt.close()

    # Top TOP_K groups.
    topk = group_rank_df.head(TOP_K).iloc[::-1]
    plt.figure(figsize=(12, 7))
    ax = plt.gca()
    ax.barh(topk.index, topk["shap_mean"])
    ax.set_xlabel("Mean |SHAP| over folds")
    ax.set_ylabel("Feature group")
    ax.set_title(f"Top-{TOP_K} SHAP-based Feature Group Importance")
    plt.tight_layout()
    plt.savefig(outpath("SHAP_MEAN_GROUP_TOP8_RANKING.PNG"), dpi=300)
    plt.close()

    print(f"[Cell3A-SHAP-RANK] 図を保存 -> "
          f"{outpath('SHAP_MEAN_GROUP_RANKING_ALL.PNG')} / {outpath('SHAP_MEAN_GROUP_TOP8_RANKING.PNG')}")


In [None]:
# ===== Cell 3B-rank: mean of SHAP ranks (LOSO, TreeSHAP, per feature group) =====
# For every leave-one-subject-out fold: fit a model, compute mean(|SHAP|) per
# column on the training rows, convert it to ranks (per column and per group),
# and evaluate the held-out subject. Fold ranks are then averaged and plotted.
set_cell_output(3)


RUN_CELL_3A_SHAP = USE_FS_SHAP_RANK  # original note: defaults to True (flag is defined in another cell)

if RUN_CELL_3A_SHAP:
    from sklearn.model_selection import LeaveOneGroupOut
    import json
    import os
    import re
    from collections import defaultdict

    import numpy as np
    import pandas as pd
    import matplotlib
    import matplotlib.backends
    import matplotlib.pyplot as plt

    # Fail fast when an object this cell reads has not been defined yet.
    required = [
        "X_all", "y_all", "groups",
        "fit_classifier", "evaluate_fold",
        "outpath", "compute_train_shap_abs_mean"
    ]
    missing = [name for name in required if name not in globals()]
    if missing:
        raise RuntimeError(f"[Cell3A-SHAP-RANK] 未定義の変数/関数があります: {missing}")

    # ---------- Helper: resolve a column's feature group ----------
    def get_feature_group(col: str) -> str:
        """Return 'xxx' for 'xxx_lag<digits>'; any other name is returned unchanged."""
        m = re.match(r"(.+)_lag\d+$", col)
        return m.group(1) if m else col

    # Backend passed to fit_classifier for the TreeSHAP model (a tree model).
    SHAP_BACKEND = "xgb"

    FEATURE_LIST_PATH = cell_output_path(3, "FEATURES_AFTER_CORR.json")

    # --- Choose the feature pool (use the correlation pre-pruning result if present) ---
    if os.path.exists(FEATURE_LIST_PATH):
        with open(FEATURE_LIST_PATH, "r", encoding="utf-8") as f:
            keep_payload = json.load(f)
        # Use only the columns marked KEEP by the pre-pruning step.
        keep_cols = keep_payload.get("keep", [])
        feature_pool = [c for c in keep_cols if c in X_all.columns]
        print(f"[Cell3A-SHAP-RANK] correlation-pruned columns loaded ({len(feature_pool)} cols)")
    else:
        feature_pool = list(X_all.columns)
        print("[Cell3A-SHAP-RANK] correlation-pruned list not found. Using all columns.")

    if not feature_pool:
        raise RuntimeError("[Cell3A-SHAP-RANK] feature_pool が空です。Cell3A-pre の結果を確認してください。")

    X_source = X_all[feature_pool].copy()

    # column -> group / group -> columns
    col_to_group: dict[str, str] = {}
    group_to_cols: dict[str, list[str]] = defaultdict(list)
    for col in X_source.columns:
        g = get_feature_group(col)
        col_to_group[col] = g
        group_to_cols[g].append(col)

    group_names = list(group_to_cols.keys())

    logo = LeaveOneGroupOut()

    # Per-fold column ranks (kept for compatibility).
    col_ranking_frames = []
    # Per-fold group ranks (main result).
    group_ranking_frames = []
    # Per-fold performance.
    metrics_rows = []

    # --- LOSO loop ---
    for fold_id, (tr_idx, te_idx) in enumerate(logo.split(X_source, y_all, groups), start=1):
        X_tr = X_source.iloc[tr_idx].astype(np.float32)
        y_tr = y_all.iloc[tr_idx].astype(int)
        X_te = X_source.iloc[te_idx].astype(np.float32)
        y_te = y_all.iloc[te_idx].astype(int)

        # A single-class training set cannot be fitted as a classifier here.
        if len(np.unique(y_tr)) < 2:
            raise RuntimeError(f"[Cell3A-SHAP-RANK] fold{fold_id}: 学習側が単一クラスです。")

        # --- Fit the tree model (TreeSHAP-compatible) ---
        model = fit_classifier(X_tr, y_tr, backend=SHAP_BACKEND)

        # --- Column-level mean(|SHAP|) on the training rows ---
        shap_mean_col = compute_train_shap_abs_mean(model, X_tr)
        # Restore the training column order so the index aligns across folds.
        shap_mean_col = shap_mean_col.reindex(X_tr.columns)

        # --- Column ranks for this fold (1 = largest mean |SHAP|) ---
        col_ranks = shap_mean_col.rank(ascending=False, method="min").astype(int)
        col_ranks.name = f"fold{fold_id}"
        col_ranking_frames.append(col_ranks)

        # --- Aggregate to groups: SUM of the member columns' mean_abs ---
        group_importance: dict[str, float] = defaultdict(float)
        for col, val in shap_mean_col.items():
            g = col_to_group[col]
            group_importance[g] += float(val)

        shap_mean_group = pd.Series(group_importance)
        # Group ranks for this fold.
        group_ranks = shap_mean_group.rank(ascending=False, method="min").astype(int)
        group_ranks.name = f"fold{fold_id}"
        group_ranking_frames.append(group_ranks)

        # --- Per-fold performance on the held-out subject ---
        metrics = evaluate_fold(model, X_te, y_te)
        metrics.update({
            "fold_id": fold_id,
            "test_subject": groups.iloc[te_idx].iloc[0],
        })
        metrics_rows.append(metrics)

        preview_groups = shap_mean_group.sort_values(ascending=False).head(5).index.tolist()
        print(f"[Cell3A-SHAP-RANK] fold{fold_id}: ranked groups={len(shap_mean_group)} (top5 groups={preview_groups})")

    # ---------- Column-level ranking (kept for compatibility) ----------
    # Both summaries are computed from the fold-only table. Taking the median
    # after rank_mean has been added to the same frame would treat the mean as
    # one more fold and skew rank_median.
    col_fold_ranks = pd.concat(col_ranking_frames, axis=1)
    col_rank_df = col_fold_ranks.copy()
    col_rank_df["rank_mean"] = col_fold_ranks.mean(axis=1)
    col_rank_df["rank_median"] = col_fold_ranks.median(axis=1)
    col_rank_df = col_rank_df.sort_values("rank_mean")

    rank_path_cols = outpath("SHAP_RANK_FEATURE_RANKING.CSV")
    col_rank_df.to_csv(rank_path_cols, encoding="utf-8-sig")
    col_rank_df.to_csv(outpath("SHAP_FEATURE_RANKING_LABELED.CSV"), encoding="utf-8-sig")

    print(f"[Cell3A-SHAP-RANK] 列単位ランキング saved -> {rank_path_cols}")

    # ---------- Group-level ranking (main result) ----------
    # Same rule: summaries come from the fold-only table.
    group_fold_ranks = pd.concat(group_ranking_frames, axis=1)
    group_rank_df = group_fold_ranks.copy()
    group_rank_df["rank_mean"] = group_fold_ranks.mean(axis=1)
    group_rank_df["rank_median"] = group_fold_ranks.median(axis=1)
    group_rank_df = group_rank_df.sort_values("rank_mean")

    rank_path_groups = outpath("SHAP_RANK_GROUP_RANKING.CSV")
    group_rank_df.to_csv(rank_path_groups, encoding="utf-8-sig")
    group_rank_df.to_csv(outpath("SHAP_GROUP_RANKING_LABELED.CSV"), encoding="utf-8-sig")

    print(f"[Cell3A-SHAP-RANK] グループ単位ランキング saved -> {rank_path_groups}")

    # ---------- Per-fold performance ----------
    metrics_path = outpath("LOSO_METRICS.CSV")
    pd.DataFrame(metrics_rows).to_csv(metrics_path, index=False, encoding="utf-8-sig")
    print(f"[Cell3A-SHAP-RANK] LOSO metrics saved -> {metrics_path}")

    # ---------- Plot the group ranking ----------
    # NOTE(review): the output file names below hard-code "TOP8"; they will not
    # follow TOP_K if this value is changed.
    TOP_K = 8

    # All groups. Reversed so the best-ranked group is drawn on top.
    plt.figure(figsize=(10, max(5, len(group_rank_df)//3)))
    plt.barh(group_rank_df.index[::-1], group_rank_df["rank_mean"][::-1])
    plt.xlabel("Average rank (lower=better)")
    plt.ylabel("Feature group")
    plt.title("SHAP-based Feature Group Ranking (All)")
    plt.tight_layout()
    plt.savefig(outpath("SHAP_RANK_GROUP_RANKING_ALL.PNG"), dpi=300)
    plt.close()

    # Top TOP_K groups.
    topk = group_rank_df.head(TOP_K).iloc[::-1]
    plt.figure(figsize=(12, 7))
    ax = plt.gca()
    ax.barh(topk.index, topk["rank_mean"])
    ax.set_xlabel("Average rank (lower=better)")
    ax.set_ylabel("Feature group")
    ax.set_title(f"Top-{TOP_K} SHAP-based Feature Group Ranking")
    plt.tight_layout()
    plt.savefig(outpath("SHAP_RANK_GROUP_TOP8_RANKING.PNG"), dpi=300)
    plt.close()

    print(f"[Cell3A-SHAP-RANK] 図を保存 -> "
          f"{outpath('SHAP_RANK_GROUP_RANKING_ALL.PNG')} / {outpath('SHAP_RANK_GROUP_TOP8_RANKING.PNG')}")


In [None]:
# ===== Cell 3B-RFE: RFE feature ranking (LOSO, per feature group) =====
# For every leave-one-subject-out fold: run RFE down to the configured minimum
# number of columns to obtain a full column ranking, reduce it to group ranks,
# and evaluate a model refitted on the columns RFE selected.
set_cell_output(3)



RUN_CELL_3A_RFE = USE_FS_RFE  # original note: not used by default (flag is defined in another cell)

if RUN_CELL_3A_RFE:

    from sklearn.feature_selection import RFE
    from sklearn.model_selection import LeaveOneGroupOut
    import json
    import os
    import re
    from collections import defaultdict

    import numpy as np
    import pandas as pd
    import matplotlib
    import matplotlib.backends
    import matplotlib.pyplot as plt

    # Fail fast when an object this cell reads has not been defined yet.
    required = ["X_all", "y_all", "groups", "build_estimator", "fit_classifier", "evaluate_fold", "outpath"]
    missing = [name for name in required if name not in globals()]
    if missing:
        raise RuntimeError(f"[Cell3A-RFE] 未定義の変数/関数があります: {missing}")

    # ---------- Helper: resolve a column's feature group ----------
    def get_feature_group(col: str) -> str:
        """
        Return the base feature name of ``col``:
          - 'xxx_lag0', 'xxx_lag1', ... -> 'xxx'
          - any other column name is returned unchanged
        """
        m = re.match(r"(.+)_lag\d+$", col)
        return m.group(1) if m else col

    RFE_BACKEND = "xgb"   # RFE always uses the XGB backend
    RFE_STEP = 1          # eliminate one column per iteration
    RFE_MIN_FEATURES = 1  # go down to a single column to obtain a full ranking

    FEATURE_LIST_PATH = cell_output_path(3, "FEATURES_AFTER_CORR.json")

    # --- Choose the feature pool (use the correlation pre-pruning result if present) ---
    if os.path.exists(FEATURE_LIST_PATH):
        with open(FEATURE_LIST_PATH, "r", encoding="utf-8") as f:
            keep_payload = json.load(f)
        feature_pool = [c for c in keep_payload.get("keep", []) if c in X_all.columns]
        print(f"[Cell3A-RFE] correlation-pruned columns loaded ({len(feature_pool)} cols)")
    else:
        feature_pool = list(X_all.columns)
        print("[Cell3A-RFE] correlation-pruned list not found. Using all columns.")

    if not feature_pool:
        raise RuntimeError("[Cell3A-RFE] feature_pool が空です。Cell3A-pre の結果を確認してください。")

    X_source = X_all[feature_pool].copy()

    # column -> group / group -> columns
    col_to_group: dict[str, str] = {}
    group_to_cols: dict[str, list[str]] = defaultdict(list)
    for col in X_source.columns:
        g = get_feature_group(col)
        col_to_group[col] = g
        group_to_cols[g].append(col)

    group_names = list(group_to_cols.keys())

    logo = LeaveOneGroupOut()
    # Per-fold column RFE ranks (for reference).
    col_ranking_frames = []
    # Per-fold group RFE ranks (main result).
    group_ranking_frames = []
    # Per-fold performance metrics.
    metrics_rows = []

    # --- LOSO loop ---
    for fold_id, (tr_idx, te_idx) in enumerate(logo.split(X_source, y_all, groups), start=1):
        X_tr = X_source.iloc[tr_idx].astype(np.float32)
        y_tr = y_all.iloc[tr_idx].astype(int)
        X_te = X_source.iloc[te_idx].astype(np.float32)
        y_te = y_all.iloc[te_idx].astype(int)

        # A single-class training set cannot be fitted as a classifier here.
        if len(np.unique(y_tr)) < 2:
            raise RuntimeError(f"[Cell3A-RFE] fold{fold_id}: 学習側が単一クラスです。")

        # Unfitted template estimator that RFE fits repeatedly.
        base_estimator = build_estimator(backend=RFE_BACKEND)

        selector = RFE(
            estimator=base_estimator,
            step=max(1, int(RFE_STEP)),
            n_features_to_select=max(1, int(RFE_MIN_FEATURES)),
        )
        selector.fit(X_tr, y_tr)

        # --- Column RFE ranks (1 = most important) ---
        ranks_col = pd.Series(selector.ranking_, index=X_tr.columns, name=f"fold{fold_id}")
        col_ranking_frames.append(ranks_col)

        # --- Column ranks -> group ranks ---
        #   A group's rank is the smallest (best) rank among its columns.
        group_rank_dict: dict[str, int] = {}
        for col, r in ranks_col.items():
            g = col_to_group[col]
            if g not in group_rank_dict:
                group_rank_dict[g] = int(r)
            else:
                group_rank_dict[g] = min(group_rank_dict[g], int(r))

        group_ranks = pd.Series(group_rank_dict, name=f"fold{fold_id}")
        group_ranking_frames.append(group_ranks)

        # --- Per-fold performance ---
        # The selector only decides which columns survive; a fresh model is
        # then trained on exactly those columns and used for evaluation.
        selected_cols = list(X_tr.columns[selector.support_])

        model = fit_classifier(
            X_tr[selected_cols],
            y_tr,
            backend=RFE_BACKEND,
        )

        X_te_sel = X_te[selected_cols]
        metrics = evaluate_fold(model, X_te_sel, y_te)
        metrics.update({
            "fold_id": fold_id,
            "test_subject": groups.iloc[te_idx].iloc[0],
            "n_selected_features": len(selected_cols),
        })
        metrics_rows.append(metrics)

        # Progress log.
        preview_groups = group_ranks.sort_values().head(5).index.tolist()
        print(f"[Cell3A-RFE] fold{fold_id}: groups={len(group_ranks)}, "
              f"top5_groups(by rank)={preview_groups}")

    # ---------- Column-level RFE ranking (reference / compatibility) ----------
    # Both summaries are computed from the fold-only table. Taking the median
    # after rank_mean has been added to the same frame would treat the mean as
    # one more fold and skew rank_median.
    rfe_col_fold_ranks = pd.concat(col_ranking_frames, axis=1)
    rfe_col_rank = rfe_col_fold_ranks.copy()
    rfe_col_rank["rank_mean"] = rfe_col_fold_ranks.mean(axis=1)
    rfe_col_rank["rank_median"] = rfe_col_fold_ranks.median(axis=1)
    rfe_col_rank = rfe_col_rank.sort_values("rank_mean")

    col_rank_path = outpath("RFE_FEATURE_RANKING.CSV")
    rfe_col_rank.to_csv(col_rank_path, encoding="utf-8-sig")
    rfe_col_rank.to_csv(outpath("RFE_FEATURE_RANKING_LABELED.CSV"), encoding="utf-8-sig")
    print(f"[Cell3A-RFE] 列単位RFEランキング saved -> {col_rank_path}")

    # ---------- Group-level RFE ranking (main result) ----------
    # Same rule: summaries come from the fold-only table.
    rfe_group_fold_ranks = pd.concat(group_ranking_frames, axis=1)
    rfe_group_rank = rfe_group_fold_ranks.copy()
    rfe_group_rank["rank_mean"] = rfe_group_fold_ranks.mean(axis=1)
    rfe_group_rank["rank_median"] = rfe_group_fold_ranks.median(axis=1)
    rfe_group_rank = rfe_group_rank.sort_values("rank_mean")

    group_rank_path = outpath("RFE_GROUP_RANKING.CSV")
    rfe_group_rank.to_csv(group_rank_path, encoding="utf-8-sig")
    rfe_group_rank.to_csv(outpath("RFE_GROUP_RANKING_LABELED.CSV"), encoding="utf-8-sig")
    print(f"[Cell3A-RFE] グループ単位RFEランキング saved -> {group_rank_path}")

    # ---------- LOSO performance log ----------
    metrics_path = outpath("RFE_LOSO_METRICS.CSV")
    pd.DataFrame(metrics_rows).to_csv(metrics_path, index=False, encoding="utf-8-sig")
    print(f"[Cell3A-RFE] LOSO metrics saved -> {metrics_path}")

    # ---------- Plot the group RFE ranking ----------
    # NOTE(review): the output file names below hard-code "TOP8"; they will not
    # follow TOP_K if this value is changed.
    TOP_K = 8

    # All groups. Reversed so the best-ranked group is drawn on top.
    plt.figure(figsize=(10, max(5, len(rfe_group_rank)//3)))
    plt.barh(rfe_group_rank.index[::-1], rfe_group_rank["rank_mean"][::-1])
    plt.xlabel("Average RFE rank (lower=better)")
    plt.ylabel("Feature group")
    plt.title("RFE-based Feature Group Ranking (All)")
    plt.tight_layout()
    plt.savefig(outpath("RFE_GROUP_RANKING_ALL.PNG"), dpi=300)
    plt.close()

    # Top TOP_K groups.
    topk = rfe_group_rank.head(TOP_K).iloc[::-1]
    plt.figure(figsize=(12, 7))
    ax = plt.gca()
    ax.barh(topk.index, topk["rank_mean"])
    ax.set_xlabel("Average RFE rank (lower=better)")
    ax.set_ylabel("Feature group")
    ax.set_title(f"Top-{TOP_K} RFE-based Feature Group Ranking")
    plt.tight_layout()
    plt.savefig(outpath("RFE_GROUP_TOP8_RANKING.PNG"), dpi=300)
    plt.close()

    print(f"[Cell3A-RFE] 図を保存 -> "
          f"{outpath('RFE_GROUP_RANKING_ALL.PNG')} / {outpath('RFE_GROUP_TOP8_RANKING.PNG')}")


In [None]:
# ===== Cell 4: Top-k ROC-AUC（ニュートラル） =====
set_cell_output(4)


import os
from collections import defaultdict
import re

from sklearn.metrics import roc_auc_score
from sklearn.model_selection import LeaveOneGroupOut

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import json

# Fail fast with one readable error if an upstream cell has not been run.
# Every name listed here is read from the notebook globals further down in
# this cell (cell_output_path / FEATURE_RANKING_FILE are needed to locate the
# ranking CSV, so they are checked as well).
required = ["X_all", "y_all", "groups",
            "fit_classifier", "predict_positive_score",
            "outpath", "cell_output_path", "FEATURE_RANKING_FILE"]
missing = [name for name in required if name not in globals()]
if missing:
    raise RuntimeError(f"[Cell4-Subset] 未定義の変数/関数があります: {missing}")

# ---------- グループ名取得ヘルパ（lag付き列を1つのグループにまとめる） ----------
def get_feature_group(col: str) -> str:
    """Return the feature-group name that a column belongs to.

    A column named ``'<base>_lag<digits>'`` (for example ``'xxx_lag0'`` or
    ``'xxx_lag1'``) is mapped to ``'<base>'``. Any other column name is
    returned unchanged.
    """
    lagged = re.match(r"(.+)_lag\d+$", col)
    if lagged is None:
        return col
    return lagged.group(1)

# Build both lookup directions, walking X_all's columns in their own order:
#   col_to_group : column name -> group name
#   group_to_cols: group name  -> member columns
_column_groups = [get_feature_group(col) for col in X_all.columns]
col_to_group: dict[str, str] = dict(zip(X_all.columns, _column_groups))
group_to_cols: dict[str, list[str]] = defaultdict(list)
for col, g in zip(X_all.columns, _column_groups):
    group_to_cols[g].append(col)

print(f"[Cell4-Subset] feature groups = {len(group_to_cols)} (columns={len(X_all.columns)})")

# ---------- Load the feature ranking (rows are groups or columns) ----------
rank_csv = cell_output_path(3, FEATURE_RANKING_FILE)
if not os.path.exists(rank_csv):
    raise FileNotFoundError("[Cell4-Subset] ランキングCSVがありません。選択した特徴量選定セルを実行してください。")

rank_df = pd.read_csv(rank_csv, index_col=0)

# Derive a "smaller means more important" score. Preference order:
#   1. "rank_mean" as-is
#   2. "mean_abs" negated (a larger mean_abs means more important)
#   3. the row position in the CSV (first row is the most important)
_rank_columns = rank_df.columns
if "rank_mean" not in _rank_columns and "mean_abs" not in _rank_columns:
    base_score = pd.Series(np.arange(len(rank_df), dtype=float), index=rank_df.index)
elif "rank_mean" in _rank_columns:
    base_score = rank_df["rank_mean"].astype(float)
else:
    base_score = -(rank_df["mean_abs"].astype(float))

# ---------- Collapse row scores (features or groups) into group scores ----------
group_score: dict[str, float] = {}
for name, score in base_score.items():
    g = get_feature_group(str(name))  # a name that is already a group maps to itself
    _candidate = float(score)
    _current = group_score.get(g)
    # A group inherits the score of its most important member (the minimum).
    group_score[g] = _candidate if _current is None else min(_current, _candidate)

group_rank = pd.Series(group_score, name="score").sort_values()

# Keep only the groups that actually have columns in X_all.
group_order = [name for name in group_rank.index if name in group_to_cols]
if len(group_order) == 0:
    raise RuntimeError("[Cell4-Subset] ランキングに該当するグループが X_all に存在しません。")

total_groups = len(group_order)
print(f"[Cell4-Subset] Using SHAP group ranking ({total_groups} groups):")
print(group_order)

# Compatibility with the older code: honour TOP_SUBSET_K from the notebook
# globals when it lies in 1..total_groups, otherwise fall back to
# total_groups. The resolved value is written back to the globals.
_requested_k = int(globals().get("TOP_SUBSET_K", 0))
TOP_SUBSET_K = _requested_k if 0 < _requested_k <= total_groups else total_groups
globals()["TOP_SUBSET_K"] = TOP_SUBSET_K

logo = LeaveOneGroupOut()
results = []

# The leave-one-group-out partition and the integer labels do not depend on
# the feature subset, so they are computed once instead of once per k.
_loso_splits = list(logo.split(X_all, y_all, groups))
_y_int = y_all.astype(int)

# ---- Evaluate the cumulative set of the top-k groups (k = 1..total_groups) ----
for k in range(1, total_groups + 1):
    use_groups = group_order[:k]

    # All columns of the selected groups, in group order then column order.
    # dict.fromkeys removes duplicates while keeping first-seen order.
    feats: list[str] = list(dict.fromkeys(
        col for grp in use_groups for col in group_to_cols[grp]
    ))

    # Select and cast the feature matrix once per k rather than once per fold.
    _X_k = X_all[feats].astype(np.float32)

    y_true_all = []
    y_score_all = []

    for tr_idx, te_idx in _loso_splits:
        X_tr = _X_k.iloc[tr_idx]
        y_tr = _y_int.iloc[tr_idx]
        X_te = _X_k.iloc[te_idx]
        y_te = _y_int.iloc[te_idx]

        # Skip the fold when the training labels contain a single class.
        if len(np.unique(y_tr)) < 2:
            continue

        model = fit_classifier(X_tr, y_tr)
        proba = predict_positive_score(model, X_te)

        y_true_all.append(y_te)
        y_score_all.append(proba)

    # Pooled AUC over all evaluated folds; NaN when it cannot be computed.
    if not y_true_all:
        auc = float("nan")
    else:
        y_true = np.concatenate(y_true_all)
        y_score = np.concatenate(y_score_all)
        if len(np.unique(y_true)) < 2:
            auc = float("nan")
        else:
            auc = float(roc_auc_score(y_true, y_score))

    results.append({
        "size": k,                 # number of groups used
        "n_features": len(feats),  # number of columns actually used
        "groups": use_groups,      # names of the groups used
        "features": feats,         # names of the columns used
        "auc": auc,
    })
    print(f"[Cell4-Subset] k(groups)={k}/{total_groups}, "
          f"n_features={len(feats)}, AUC={auc:.4f}, groups={use_groups}")

# ---- Shape and persist the results ----
results_df = pd.DataFrame(results).sort_values("size")  # ascending k

# Flatten the list-valued columns into comma-joined strings for the CSV.
results_df["groups_str"] = results_df["groups"].map(",".join)
results_df["features_str"] = results_df["features"].map(",".join)

_csv_columns = ["size", "n_features", "groups_str", "features_str", "auc"]

# 1) This cell's own CSV (ALLK_*)
subset_csv_name_allk = "ALLK_TOPORDER_AUC.csv"
subset_path_allk = outpath(subset_csv_name_allk)
results_df[_csv_columns].to_csv(subset_path_allk, index=False, encoding="utf-8-sig")
print(f"[Cell4-Subset] 保存 (ALLK, group-based) -> {subset_path_allk}")

# Best k: highest AUC, ties broken in favour of the smaller k.
best_row = results_df.sort_values(["auc", "size"], ascending=[False, True]).iloc[0]
print(f"[Cell4-Subset] best k(groups)={int(best_row['size'])}, "
      f"n_features={int(best_row['n_features'])}, "
      f"auc={best_row['auc']:.4f}, "
      f"groups={best_row['groups']}")

# The same payload is written to both JSON files below.
# "size" is the number of groups, "n_features" the number of columns.
_best_payload = {
    "size": int(best_row["size"]),
    "n_features": int(best_row["n_features"]),
    "auc": float(best_row["auc"]),
    "groups": best_row["groups"],
    "features": best_row["features"],
}

# 2) This cell's own JSON (ALLK_SUBSET_BEST.json)
best_json_name_allk = "ALLK_SUBSET_BEST.json"
with open(outpath(best_json_name_allk), "w", encoding="utf-8") as f:
    json.dump(_best_payload, f, ensure_ascii=False, indent=2)
print(f"[Cell4-Subset] BEST (ALLK, group-based) -> {outpath(best_json_name_allk)}")

# 3) Also save under the file names used by the older combination-search
#    version (compatibility).
subset_csv_name_compat = f"TOP{TOP_SUBSET_K}_SUBSET_AUC.csv"
subset_path_compat = outpath(subset_csv_name_compat)
results_df[_csv_columns].to_csv(subset_path_compat, index=False, encoding="utf-8-sig")
print(f"[Cell4-Subset] 互換CSV (group-based) -> {subset_path_compat}")

best_json_name_compat = f"TOP{TOP_SUBSET_K}_SUBSET_BEST.json"
with open(outpath(best_json_name_compat), "w", encoding="utf-8") as f:
    json.dump(_best_payload, f, ensure_ascii=False, indent=2)
print(f"[Cell4-Subset] 互換BEST JSON (group-based) -> {outpath(best_json_name_compat)}")

# Publish the winner through the notebook globals as the older code did.
globals().update(
    BEST_SUBSET_FEATURES=best_row["features"],  # columns actually used
    BEST_SUBSET_GROUPS=best_row["groups"],      # names of the groups used
    BEST_SUBSET_K=int(best_row["size"]),        # interpreted as a group count
)

# ---- Plot: x = number of groups k (fewer towards the right), y = ROC-AUC ----
FS_TITLE, FS_LABEL, FS_TICK = 30, 24, 20
LW = 1.5

_fig, ax = plt.subplots(figsize=(8, 8))
ax.plot(results_df["size"], results_df["auc"], marker="o", linewidth=LW)

ax.set_title("ROC-AUC vs. number of feature groups (all SHAP-ranked groups)", fontsize=FS_TITLE)
ax.set_xlabel("Number of feature groups (k)", fontsize=FS_LABEL)
ax.set_ylabel("ROC-AUC (pooled LOSO)", fontsize=FS_LABEL)

# Reversed x axis: k = total_groups on the left, k = 1 on the right.
ax.set_xlim(total_groups, 1)
ax.set_xticks(range(1, total_groups + 1))
ax.set_ylim(0.0, 1.0)

ax.tick_params(axis="both", labelsize=FS_TICK)
ax.grid(True, alpha=0.3)

_fig.tight_layout()
_fig.savefig(outpath("ALLK_TOPORDER_AUC.png"), dpi=300)
plt.close(_fig)

print(f"[Cell4-Subset] 図を保存 -> {outpath('ALLK_TOPORDER_AUC.png')}")


In [None]:
# ===== Cell 3C-Subset: Top-k 組合せ探索 =====
set_cell_output(3)


# Switch for the exhaustive subset search below. It is hard-coded off; note
# that the search evaluates every non-empty combination of the top features
# (2**n - 1 subsets), each with a full leave-one-group-out run.
RUN_CELL_3A_Topk_FEATURE = False

if RUN_CELL_3A_Topk_FEATURE:
    # json is imported here because json.dump is used below and this cell
    # otherwise relied on another cell having imported it.
    import json
    import os
    from itertools import combinations
    from sklearn.metrics import roc_auc_score
    from sklearn.model_selection import LeaveOneGroupOut

    # Fail fast if an upstream cell has not been run; cell_output_path and
    # FEATURE_RANKING_FILE are needed to locate the ranking CSV.
    required = ["X_all", "y_all", "groups", "fit_classifier", "predict_positive_score",
                "outpath", "cell_output_path", "FEATURE_RANKING_FILE"]
    missing = [name for name in required if name not in globals()]
    if missing:
        raise RuntimeError(f"[Cell3A-Subset] 未定義の変数/関数があります: {missing}")

    TOP_SUBSET_K = int(globals().get("TOP_SUBSET_K", 15))
    if TOP_SUBSET_K <= 0:
        raise ValueError("TOP_SUBSET_K must be positive")

    rank_csv = cell_output_path(3, FEATURE_RANKING_FILE)
    if not os.path.exists(rank_csv):
        raise FileNotFoundError("[Cell3A-Subset] ランキングCSVがありません。選択した特徴量選定セルを実行してください。")

    # Order features from most to least important: ascending "rank_mean",
    # else descending "mean_abs", else the row order of the CSV.
    rank_df = pd.read_csv(rank_csv, index_col=0)
    if "rank_mean" in rank_df.columns:
        feature_order = rank_df.sort_values("rank_mean").index.tolist()
    elif "mean_abs" in rank_df.columns:
        feature_order = rank_df.sort_values("mean_abs", ascending=False).index.tolist()
    else:
        feature_order = list(rank_df.index)

    # Keep only ranked names that are actual columns of X_all.
    feature_order = [f for f in feature_order if f in X_all.columns]
    if not feature_order:
        raise RuntimeError("[Cell3A-Subset] ランキングに該当する特徴が X_all に存在しません。")

    limit = min(TOP_SUBSET_K, len(feature_order))
    top_features = feature_order[:limit]
    print(f"[Cell3A-Subset] Top features ({len(top_features)}): {top_features}")

    logo = LeaveOneGroupOut()
    results = []

    # Loop-invariant work, done once instead of once per combination and fold:
    # the LOSO partition, the float32 cast of the candidate columns and the
    # integer cast of the labels.
    _loso_splits = list(logo.split(X_all, y_all, groups))
    _X_top = X_all[top_features].astype(np.float32)
    _y_int = y_all.astype(int)

    for r in range(1, len(top_features) + 1):
        for comb in combinations(top_features, r):
            feats = list(comb)
            _X_sub = _X_top[feats]
            y_true_all = []
            y_score_all = []
            for tr_idx, te_idx in _loso_splits:
                X_tr = _X_sub.iloc[tr_idx]
                y_tr = _y_int.iloc[tr_idx]
                X_te = _X_sub.iloc[te_idx]
                y_te = _y_int.iloc[te_idx]
                # Skip the fold when the training labels contain a single class.
                if len(np.unique(y_tr)) < 2:
                    continue
                model = fit_classifier(X_tr, y_tr)
                proba = predict_positive_score(model, X_te)
                y_true_all.append(y_te)
                y_score_all.append(proba)
            # Pooled AUC over all evaluated folds; NaN when not computable.
            if not y_true_all:
                auc = float("nan")
            else:
                y_true = np.concatenate(y_true_all)
                y_score = np.concatenate(y_score_all)
                if len(np.unique(y_true)) < 2:
                    auc = float("nan")
                else:
                    auc = float(roc_auc_score(y_true, y_score))
            results.append({
                "size": r,
                "features": feats,
                "auc": auc,
            })

    # Best first: highest AUC, ties broken in favour of the smaller subset.
    results_df = pd.DataFrame(results)
    results_df = results_df.sort_values(["auc", "size"], ascending=[False, True])
    results_df["features_str"] = results_df["features"].apply(lambda lst: ",".join(lst))
    subset_csv_name = f"TOP{TOP_SUBSET_K}_SUBSET_AUC.csv"
    subset_path = outpath(subset_csv_name)
    results_df[["size", "features_str", "auc"]].to_csv(subset_path, index=False, encoding="utf-8-sig")

    best_row = results_df.iloc[0]
    print(f"[Cell3A-Subset] best size={int(best_row['size'])}, auc={best_row['auc']:.4f}, features={best_row['features']}")

    best_json_name = f"TOP{TOP_SUBSET_K}_SUBSET_BEST.json"
    with open(outpath(best_json_name), "w", encoding="utf-8") as f:
        json.dump({
            "size": int(best_row["size"]),
            "auc": float(best_row["auc"]),
            "features": best_row["features"],
        }, f, ensure_ascii=False, indent=2)

    # Publish the winning subset through the notebook globals.
    globals()["BEST_SUBSET_FEATURES"] = best_row["features"]
    globals()["BEST_SUBSET_K"] = len(best_row["features"])

    print(f"[Cell3A-Subset] 保存 -> {subset_path}")



In [None]:
# ===== Cell 4A: MAXk ROC-AUC（CI計算＋ROC曲線, ニュートラル） =====
RUN_CELL4 = bool(globals().get('RUN_CELL4', True))
if not RUN_CELL4:
    print('[Cell4-14] RUN_CELL4=False -> skip')
else:
    set_cell_output(4)
    
    
    import os
    import json
    import numpy as np
    import pandas as pd
    import matplotlib.pyplot as plt
    from sklearn.metrics import roc_auc_score, roc_curve
    from sklearn.model_selection import LeaveOneGroupOut
    
    
    
    # Locate the best-subset JSON. Search order: cell 4 output, then cell 3
    # output; within each, the file named after the current TOP_SUBSET_K
    # first and "TOP10_SUBSET_BEST.json" second.
    subset_primary = f"TOP{int(globals().get('TOP_SUBSET_K', 15))}_SUBSET_BEST.json"
    subset_candidates = [
        cell_output_path(_cell_no, _fname)
        for _cell_no in (4, 3)
        for _fname in (subset_primary, "TOP10_SUBSET_BEST.json")
    ]
    subset_json = None
    for _candidate in subset_candidates:
        if os.path.exists(_candidate):
            subset_json = _candidate
            break
    if subset_json is None:
        raise FileNotFoundError("[Cell4A] TOP*_SUBSET_BEST.json がありません。Cell4 を実行してください。")
    
    with open(subset_json, "r", encoding="utf-8") as f:
        subset_info = json.load(f)

    best_features = subset_info.get("features", [])
    if len(best_features) == 0:
        raise RuntimeError("[Cell4A] JSON 内に features がありません。")

    # Drop names that are not columns of X_all.
    best_features = [name for name in best_features if name in X_all.columns]
    if len(best_features) == 0:
        raise RuntimeError("[Cell4A] X_all に存在する特徴がありません。")

    # Optionally append MSSQ / VIMSSQ: a trait is added when its switch is
    # truthy in the notebook globals, the column exists in X_all, and it is
    # not already part of the subset.
    extra_traits = [
        trait
        for flag, trait in (("USE_MSSQ_FEATURE", "MSSQ"), ("USE_VIMSSQ_FEATURE", "VIMSSQ"))
        if globals().get(flag, False)
        and trait in X_all.columns
        and trait not in best_features
    ]
    if extra_traits:
        print(f"[Cell4A] 追加で使用する属性特徴: {extra_traits}")
        best_features = [*best_features, *extra_traits]

    best_k = len(best_features)

    # Summary line
    print(f"[Cell4A] 使用特徴 ({best_k}) from {os.path.basename(subset_json)}: {best_features}")

    # Debug aid: numbered list of the final features passed to the model.
    print("[Cell4A][DEBUG] 実際にモデルに渡した特徴量リスト:")
    for i, f_name in enumerate(best_features, start=1):
        print(f"  {i:2d}: {f_name}")
    
    # Leave-one-group-out evaluation with the chosen features. Labels, scores
    # and group ids of every evaluated test fold are collected, then pooled.
    logo = LeaveOneGroupOut()
    y_true_all = []
    proba_all = []
    subj_all = []
    for tr_idx, te_idx in logo.split(X_all, y_all, groups):
        X_tr = X_all.iloc[tr_idx][best_features].astype(np.float32)
        y_tr = y_all.iloc[tr_idx].astype(int)
        X_te = X_all.iloc[te_idx][best_features].astype(np.float32)
        y_te = y_all.iloc[te_idx].astype(int)
        # Only folds whose training labels contain both classes are used.
        if np.unique(y_tr).size >= 2:
            model = fit_classifier(X_tr, y_tr)
            proba = predict_positive_score(model, X_te)
            y_true_all.append(y_te)
            proba_all.append(proba)
            subj_all.append(groups.iloc[te_idx].values)

    if len(y_true_all) == 0:
        raise RuntimeError("[Cell4A] 評価に必要な fold が得られませんでした。")

    y_pool = np.concatenate(y_true_all)
    s_pool = np.concatenate(proba_all)
    subj_pool = np.concatenate(subj_all)
    if np.unique(y_pool).size < 2:
        raise RuntimeError("[Cell4A] 真値が単一クラスのため ROC-AUC を計算できません。")

    # Observed AUC on the pooled out-of-fold predictions.
    auc_obs = float(roc_auc_score(y_pool, s_pool))
    
    # Bootstrap CI for the pooled AUC, resampling whole subjects with
    # replacement (2000 resamples, fixed seed for reproducibility).
    rng = np.random.default_rng(20251101)
    df_pool = pd.DataFrame({"subject": subj_pool, "y_true": y_pool, "y_score": s_pool})
    subj_ids = df_pool["subject"].unique()

    # Slice the pooled predictions per subject once. The slices are the same
    # in every resample, so re-filtering the DataFrame inside the loop was
    # repeated loop-invariant work.
    _per_subject = {}
    for _sid in subj_ids:
        _rows = df_pool["subject"] == _sid
        _per_subject[_sid] = (
            df_pool.loc[_rows, "y_true"].to_numpy(),
            df_pool.loc[_rows, "y_score"].to_numpy(),
        )

    auc_boot = []
    for _ in range(2000):
        sampled = rng.choice(subj_ids, size=len(subj_ids), replace=True)
        _y_boot = np.concatenate([_per_subject[sid][0] for sid in sampled])
        _s_boot = np.concatenate([_per_subject[sid][1] for sid in sampled])
        # A resample with a single class has no defined AUC; it is skipped.
        if len(np.unique(_y_boot)) < 2:
            continue
        auc_boot.append(float(roc_auc_score(_y_boot, _s_boot)))

    # Percentile interval (2.5% / 97.5%); NaN when no resample was usable.
    if auc_boot:
        ci_low = float(np.quantile(auc_boot, 0.025))
        ci_high = float(np.quantile(auc_boot, 0.975))
    else:
        ci_low = ci_high = float("nan")

    pd.DataFrame([{"k": best_k, "auc": auc_obs, "ci_low": ci_low, "ci_high": ci_high}]).to_csv(
        outpath("AUC_K_CI.csv"), index=False, encoding="utf-8-sig"
    )
    print(f"[Cell4A] AUC={auc_obs:.4f} (95% CI [{ci_low:.4f}, {ci_high:.4f}])")
    
    # ROC curve of the pooled predictions, with the chance diagonal.
    fpr, tpr, _ = roc_curve(y_pool, s_pool)
    _roc_fig, _roc_ax = plt.subplots(figsize=(7, 7))
    _roc_ax.plot(fpr, tpr, label=f"AUC = {auc_obs:.3f}")
    _roc_ax.plot([0, 1], [0, 1], linestyle="--", color="gray", label="Chance")
    _roc_ax.set_xlabel("False Positive Rate")
    _roc_ax.set_ylabel("True Positive Rate")
    _roc_ax.set_title("ROC Curve (Best Subset)")
    _roc_ax.legend(loc="lower right")
    _roc_ax.grid(True, alpha=0.4)
    _roc_fig.tight_layout()
    _roc_fig.savefig(outpath("AUC_K_CI.png"), dpi=300)
    plt.close(_roc_fig)
    print(f"[Cell4A] ROC 図を保存 -> {outpath('AUC_K_CI.png')}")
    


In [None]:
# ===== Cell 4B: SHAP 可視化（ニュートラル・MAXkハイライト） =====
RUN_CELL4 = bool(globals().get('RUN_CELL4', True))
if not RUN_CELL4:
    print('[Cell4-13] RUN_CELL4=False -> skip')
else:
    set_cell_output(4)
    
    
    import os
    import re
    import json
    import shap
    import numpy as np
    import pandas as pd
    import matplotlib.pyplot as plt
    
    # Fail fast with one readable error if an upstream cell has not been run.
    # cell_output_path and GROUP_RANKING_FILE are read further down in this
    # cell, so they are checked together with the other prerequisites.
    required = ["X_all", "y_all", "fit_classifier", "outpath", "SEED_BASE", "OUT_DIR",
                "cell_output_path", "GROUP_RANKING_FILE"]
    missing = [name for name in required if name not in globals()]
    if missing:
        raise RuntimeError(f"[Cell4B-SHAP] 未定義の変数/関数があります: {missing}")
    
    # ---------- グループ名取得ヘルパ ----------
    def get_feature_group(col: str) -> str:
        """Return the feature-group name that a column belongs to.

        A column named ``'<base>_lag<digits>'`` (for example ``'xxx_lag0'``
        or ``'xxx_lag1'``) is mapped to ``'<base>'``. Any other column name
        is returned unchanged.
        """
        lagged = re.match(r"(.+)_lag\d+$", col)
        if lagged is None:
            return col
        return lagged.group(1)
    
    # ---------- Column list after correlation pruning ----------
    # When FEATURES_AFTER_CORR.json exists in the cell-3 output, its "keep"
    # list (restricted to columns of X_all) is used; otherwise all columns.
    FEATURE_LIST_PATH = cell_output_path(3, "FEATURES_AFTER_CORR.json")
    if not os.path.exists(FEATURE_LIST_PATH):
        cols_for_model = list(X_all.columns)
        print("[Cell4B-SHAP] FEATURES_AFTER_CORR.json が無いため、全列を使用します。")
    else:
        with open(FEATURE_LIST_PATH, "r", encoding="utf-8") as f:
            keep_payload = json.load(f)
        keep_cols = [c for c in keep_payload.get("keep", []) if c in X_all.columns]
        if len(keep_cols) == 0:
            raise RuntimeError("[Cell4B-SHAP] FEATURES_AFTER_CORR.json の keep に有効な列がありません。")
        cols_for_model = keep_cols
        print(f"[Cell4B-SHAP] Using correlation-pruned columns ({len(cols_for_model)})")

    # ---------- column -> group and group -> columns ----------
    col_to_group: dict[str, str] = {}
    group_to_cols: dict[str, list[str]] = {}
    for col in cols_for_model:
        g = col_to_group[col] = get_feature_group(col)
        if g in group_to_cols:
            group_to_cols[g].append(col)
        else:
            group_to_cols[g] = [col]

    # Group names in first-seen order.
    group_names = list(group_to_cols)
    
    # ---------- Group ranking (decides the drawing order) ----------
    # Use the ranking CSV from the cell-3 output when it exists: ascending
    # "rank_mean" if that column is present, else the CSV row order. Without
    # the CSV, fall back to the order in which the groups were discovered.
    group_rank_csv = cell_output_path(3, GROUP_RANKING_FILE)
    if os.path.exists(group_rank_csv):
        group_rank_df = pd.read_csv(group_rank_csv, index_col=0)
        if "rank_mean" in group_rank_df.columns:
            group_order = group_rank_df.sort_values("rank_mean").index.tolist()
        else:
            group_order = list(group_rank_df.index)
        print(f"[Cell4B-SHAP] グループランキング順に従って描画します。groups={len(group_order)}")
    else:
        group_order = group_names
        print("[Cell4B-SHAP] GROUP_RANKING_FILE が無いため、グループ名の順で描画します。")

    # Restrict to groups that exist in the current group_to_cols. Groups that
    # are in the model columns but absent from the ranking are not drawn.
    group_order = [g for g in group_order if g in group_to_cols]

    # Without any group the plots below have nothing to show, so stop here
    # with an explicit message instead of failing later inside the plotting.
    if not group_order:
        raise RuntimeError("[Cell4B-SHAP] ランキングに該当するグループが存在しません。")
    
    # ---------- Data used for the SHAP computation (column level) ----------
    X_shap = X_all[cols_for_model].astype(np.float32)
    y_shap = y_all.astype(int)

    print(f"[Cell4B-SHAP] samples={X_shap.shape[0]}, columns={X_shap.shape[1]}")

    # ---------- Fit the model on all samples ----------
    model = fit_classifier(X_shap, y_shap)

    # ---------- Column-level SHAP values via TreeExplainer ----------
    # The background set is a sample of at most 256 rows of X_shap.
    background = shap.sample(X_shap, min(256, len(X_shap)), random_state=SEED_BASE)
    explainer = shap.TreeExplainer(
        model,
        data=background,
        model_output="probability",
        feature_perturbation="interventional",
    )

    # Index of the positive class: the position of label 1 in model.classes_
    # when available, otherwise the last class.
    if hasattr(model, "classes_") and 1 in list(model.classes_):
        _pos_class = list(model.classes_).index(1)
    else:
        _pos_class = -1

    shap_values_any = explainer.shap_values(X_shap)
    # A list result holds one array per class: take the positive class.
    shap_values_col = np.asarray(
        shap_values_any[_pos_class] if isinstance(shap_values_any, list) else shap_values_any
    )

    if shap_values_col.ndim == 3:
        # Treated as (n_samples, n_features, n_classes): take the positive class.
        shap_values_col = shap_values_col[..., _pos_class]
    elif shap_values_col.ndim == 1:
        shap_values_col = shap_values_col.reshape(-1, 1)

    # The result must have one SHAP column per model column.
    if shap_values_col.shape[1] != X_shap.shape[1]:
        raise RuntimeError(f"[Cell4B-SHAP] shap_values 形状が一致しません: {shap_values_col.shape} vs {X_shap.shape}")

    print("[Cell4B-SHAP] 列レベルの SHAP 値を計算しました。")
    
    # ---------- Aggregate column-level SHAP into group-level SHAP ----------
    n_samples = shap_values_col.shape[0]
    n_groups = len(group_order)

    # column name -> positional index in X_shap
    col_index_map = dict(zip(X_shap.columns, range(X_shap.shape[1])))

    # One SHAP column and one representative-value column per group.
    shap_values_group = np.zeros((n_samples, n_groups))
    X_group_df = pd.DataFrame(index=X_shap.index, columns=group_order, dtype=float)

    for g_idx, g in enumerate(group_order):
        # Member columns of this group that are present in X_shap.
        cols = [c for c in group_to_cols[g] if c in X_shap.columns]
        if cols:
            col_indices = [col_index_map[c] for c in cols]

            # Group SHAP = per-sample sum of the member columns' SHAP values.
            shap_values_group[:, g_idx] = np.sum(shap_values_col[:, col_indices], axis=1)

            # Representative value (used for colouring) = per-sample mean of
            # the member columns.
            X_group_df[g] = X_shap[cols].mean(axis=1)

    print(f"[Cell4B-SHAP] グループ数={n_groups}, shap_values_group.shape={shap_values_group.shape}")
    
    # ---------- Top-K setting ----------
    # The raw TOP_SUBSET_K is kept separately: the plot uses the value clipped
    # to 1..n_groups, while the subset JSON is looked up by the unclipped
    # value, which is how the Cell 4A lookup in this notebook names it.
    _subset_k_setting = int(globals().get("TOP_SUBSET_K", 15))
    TOP_K = max(1, min(_subset_k_setting, n_groups))

    # ---------- Map the best subset onto groups (for highlighting) ----------
    highlight_groups: list[str] = []
    try:
        # File names to try, most specific first. The clipped name is kept as
        # a fallback so files found before this change are still found;
        # dict.fromkeys removes the duplicate when both names coincide.
        _subset_json_names = list(dict.fromkeys([
            f"TOP{_subset_k_setting}_SUBSET_BEST.json",
            f"TOP{TOP_K}_SUBSET_BEST.json",
            "TOP10_SUBSET_BEST.json",
        ]))
        # Cell-4 output is searched before cell-3 output.
        subset_json_candidates = [
            cell_output_path(_cell_no, _fname)
            for _cell_no in (4, 3)
            for _fname in _subset_json_names
        ]
        subset_json_path = next((p for p in subset_json_candidates if os.path.exists(p)), None)

        if subset_json_path is not None:
            with open(subset_json_path, "r", encoding="utf-8") as f:
                info = json.load(f)
            feature_cols_subset = info.get("features", [])
            # Column names -> unique, sorted group names (known columns only).
            highlight_groups = sorted({get_feature_group(c) for c in feature_cols_subset if c in col_to_group})
            print(f"[Cell4B-SHAP] subset highlight groups (from {os.path.basename(subset_json_path)}):")
            print(f"  {highlight_groups}")
        else:
            print("[Cell4B-SHAP] subset JSON が見つからなかったため、ハイライトなし。")
    except Exception as e:
        # Highlighting is best-effort: report the problem and draw without it.
        print(f"[Cell4B-SHAP][WARN] subset読み込み失敗: {e}")
        highlight_groups = []
    
    # ---------- Output directory ----------
    shap_dir = os.path.join(OUT_DIR, "SHAP_GROUP")
    os.makedirs(shap_dir, exist_ok=True)

    # =============================================================================
    # 1. Summary plot (Top-K groups)  /  2. Summary plot (all groups)
    # =============================================================================
    # Slice the first TOP_K groups of group_order for the Top-K plot.
    top_groups = group_order[:TOP_K]
    top_idx = list(map(group_order.index, top_groups))

    X_top = X_group_df[top_groups]
    shap_top = shap_values_group[:, top_idx]

    summary_top_path = os.path.join(shap_dir, f"SHAP_GROUP_SUMMARY_TOP{TOP_K}.png")
    summary_all_path = os.path.join(shap_dir, "SHAP_GROUP_SUMMARY_ALL.png")

    # (SHAP matrix, feature values, max_display, output path, label in the log)
    _summary_jobs = [
        (shap_top, X_top, TOP_K, summary_top_path, f"Top-{TOP_K} groups"),
        (shap_values_group, X_group_df[group_order], len(group_order), summary_all_path, "ALL groups"),
    ]
    for _shap_mat, _values, _n_display, _out_path, _desc in _summary_jobs:
        plt.figure()
        shap.summary_plot(
            _shap_mat,
            _values,
            show=False,
            plot_type="dot",
            max_display=_n_display,
            sort=False,  # the column order is controlled by the caller
        )
        ax = plt.gca()
        # Colour the tick labels of groups that belong to the best subset.
        for label in ax.get_yticklabels():
            if label.get_text() not in highlight_groups:
                continue
            label.set_color("red")
        plt.tight_layout()
        plt.savefig(_out_path, dpi=300, bbox_inches="tight")
        plt.close()
        print(f"[Cell4B-SHAP] Summary plot ({_desc}) -> {_out_path}")
    
    # =============================================================================
    # 3. Dependence plot for every group
    # =============================================================================
    # The feature-value frame is the same for every group: build it once.
    _X_groups_ordered = X_group_df[group_order]
    for g in group_order:
        # No plt.figure() here. NOTE(review): shap.dependence_plot appears to
        # open its own figure when no `ax` is passed — confirm against the
        # installed shap version. If so, a figure opened here was never drawn
        # on and never closed, leaking one figure per group. Without it,
        # savefig/close below act on the figure that was actually drawn.
        shap.dependence_plot(
            g,
            shap_values_group,
            _X_groups_ordered,       # all groups are passed
            show=False,
            interaction_index=None,  # plot the single-variable relation only
        )
        plt.tight_layout()
        # Path separators in a group name would change the output location.
        safe_name = g.replace("/", "_").replace("\\", "_")
        dep_path = os.path.join(shap_dir, f"SHAP_GROUP_DEP_{safe_name}.png")
        plt.savefig(dep_path, dpi=300, bbox_inches="tight")
        plt.close()
        print(f"[Cell4B-SHAP] Dependence plot (group={g}) -> {dep_path}")
    


In [None]:
# ===== Cell 5: Top-k ROC-AUC（MSSQ層別） =====
RUN_CELL5 = bool(globals().get('RUN_CELL5', True))
if not RUN_CELL5:
    print('[Cell5-15] RUN_CELL5=False -> skip')
else:
    set_cell_output(5)
    
    
    import os
    from collections import defaultdict
    import re
    
    from sklearn.metrics import roc_auc_score
    from sklearn.model_selection import LeaveOneGroupOut
    
    import numpy as np
    import pandas as pd
    import matplotlib.pyplot as plt
    import json
    
    required = [
        "X_all", "y_all", "groups",
        "SUBJECT_META",
        "fit_classifier", "predict_positive_score",
        "outpath", "cell_output_path",
        "FEATURE_RANKING_FILE",
    ]
    missing = [name for name in required if name not in globals()]
    if missing:
        raise RuntimeError(f"[Cell5-MSSQ] 未定義の変数/関数があります: {missing}")
    
    # ---------- MSSQグループ情報の準備 ----------
    if "MSSQ_group" not in SUBJECT_META.columns:
        raise RuntimeError("[Cell5-MSSQ] SUBJECT_META に 'MSSQ_group' 列がありません。")
    
    # subject_id → MSSQ_group の辞書
    _subj_mssq_group = SUBJECT_META["MSSQ_group"].astype(str).to_dict()
    
    # 各サンプル（行）ごとの subject_id / MSSQ_group
    subj_series = groups.astype(str)
    mssq_group_series = subj_series.map(_subj_mssq_group)
    
    if mssq_group_series.isna().any():
        missing_ids = sorted(subj_series[mssq_group_series.isna()].unique())
        raise RuntimeError(
            "[Cell5-MSSQ] MSSQ_group が未設定の被験者があります: "
            + ", ".join(map(str, missing_ids))
        )
    
    # ラベル名（既定: "Low" / "High"）
    MSSQ_LOW_LABEL = str(globals().get("MSSQ_LOW_LABEL", "Low"))
    MSSQ_HIGH_LABEL = str(globals().get("MSSQ_HIGH_LABEL", "High"))
    
    print(f"[Cell5-MSSQ] MSSQ_group ラベル: Low='{MSSQ_LOW_LABEL}', High='{MSSQ_HIGH_LABEL}'")
    print(f"[Cell5-MSSQ] MSSQ_group 割合:\n{mssq_group_series.value_counts()}")
    
    # ---------- グループ名取得ヘルパ（lag付き列を1つのグループにまとめる） ----------
    def get_feature_group(col: str) -> str:
        """Return the feature-group name that a column belongs to.

        A column named ``'<base>_lag<digits>'`` (for example ``'xxx_lag0'``
        or ``'xxx_lag1'``) is mapped to ``'<base>'``. Any other column name
        is returned unchanged.
        """
        lagged = re.match(r"(.+)_lag\d+$", col)
        if lagged is None:
            return col
        return lagged.group(1)
    
    # 列→グループ / グループ→列 の対応を作成
    col_to_group: dict[str, str] = {}
    group_to_cols: dict[str, list[str]] = defaultdict(list)
    for col in X_all.columns:
        g = get_feature_group(col)
        col_to_group[col] = g
        group_to_cols[g].append(col)
    
    print(f"[Cell5-MSSQ] feature groups = {len(group_to_cols)} "
          f"(columns={len(X_all.columns)})")
    
    # ---------- SHAPランキングの読み込み（行＝グループまたは列） ----------
    rank_csv = cell_output_path(3, FEATURE_RANKING_FILE)
    if not os.path.exists(rank_csv):
        raise FileNotFoundError("[Cell5-MSSQ] ランキングCSVがありません。Cell3A-SHAP を実行してください。")
    
    rank_df = pd.read_csv(rank_csv, index_col=0)
    
    # 「小さいほど重要」なスコアを作る（rank_mean 優先, なければ mean_abs に基づく）
    if "rank_mean" in rank_df.columns:
        base_score = rank_df["rank_mean"].astype(float)
    elif "mean_abs" in rank_df.columns:
        # mean_abs が大きいほど重要 → 符号を反転して「小さいほど重要」にする
        base_score = (-rank_df["mean_abs"].astype(float))
    else:
        # 何もなければ行順そのものをスコアにする（最初がもっとも重要）
        base_score = pd.Series(
            np.arange(len(rank_df), dtype=float),
            index=rank_df.index,
        )
    
    # ---------- 行（特徴orグループ）→ グループスコアへ集約 ----------
    group_score: dict[str, float] = {}
    for name, score in base_score.items():
        g = get_feature_group(str(name))  # すでにグループ名ならそのまま
        if g not in group_score:
            group_score[g] = float(score)
        else:
            # より重要な列が1つでもあれば、そのグループはそのスコアを採用（min）
            group_score[g] = min(group_score[g], float(score))
    
    group_rank = pd.Series(group_score, name="score").sort_values(ascending=True)
    
    # 実際に X_all に存在するグループだけに絞る
    group_order = [g for g in group_rank.index if g in group_to_cols]
    if not group_order:
        raise RuntimeError("[Cell5-MSSQ] ランキングに該当するグループが X_all に存在しません。")
    
    total_groups = len(group_order)
    print(f"[Cell5-MSSQ] Using SHAP group ranking ({total_groups} groups):")
    print(group_order)
    
    # 旧コードとの互換用：TOP_SUBSET_K があればそれを使い，なければ total_groups で上書き
    TOP_SUBSET_K = int(globals().get("TOP_SUBSET_K", 0))
    if TOP_SUBSET_K <= 0 or TOP_SUBSET_K > total_groups:
        TOP_SUBSET_K = total_groups
    globals()["TOP_SUBSET_K"] = TOP_SUBSET_K
    
    logo = LeaveOneGroupOut()
    results = []
    
    # ---- 上位kグループの累積セットで評価（k=1..total_groups）----
    for k in range(1, total_groups + 1):
        use_groups = group_order[:k]
    
        # この k グループに属する全ての列を集約
        feats: list[str] = []
        for g in use_groups:
            feats.extend(group_to_cols[g])
        # 念のため重複を除去（順序はグループ順→列順を維持）
        seen = set()
        feats_unique = []
        for f in feats:
            if f not in seen:
                feats_unique.append(f)
                seen.add(f)
        feats = feats_unique
    
        y_true_all = []
        y_score_all = []
        mssq_group_all = []
    
        skip_folds = 0
    
        for tr_idx, te_idx in logo.split(X_all, y_all, groups):
            tr_idx = np.asarray(tr_idx)
            te_idx = np.asarray(te_idx)
    
            # テスト側の被験者ID（LOSOなので1名のみを想定）
            te_subj_ids = np.unique(subj_series.iloc[te_idx].astype(str))
            if len(te_subj_ids) != 1:
                raise RuntimeError(
                    f"[Cell5-MSSQ] te_idx に複数被験者が含まれています: {te_subj_ids}"
                )
            test_sid = te_subj_ids[0]
            test_group = _subj_mssq_group.get(test_sid, None)
            if test_group is None:
                skip_folds += 1
                continue
    
            # 訓練側を「テスト被験者と同じ MSSQ_group」に絞る
            tr_groups = subj_series.iloc[tr_idx].map(_subj_mssq_group)
            tr_mask_same_group = (tr_groups == test_group)
            tr_idx_group = tr_idx[tr_mask_same_group.to_numpy()]
    
            if tr_idx_group.size == 0:
                skip_folds += 1
                continue
    
            X_tr = X_all.iloc[tr_idx_group][feats].astype(np.float32)
            y_tr = y_all.iloc[tr_idx_group].astype(int)
            X_te = X_all.iloc[te_idx][feats].astype(np.float32)
            y_te = y_all.iloc[te_idx].astype(int)
    
            # 学習側 or テスト側が単一クラスならこの fold はスキップ
            if len(np.unique(y_tr)) < 2 or len(np.unique(y_te)) < 2:
                skip_folds += 1
                continue
    
            model = fit_classifier(X_tr, y_tr)
            proba = predict_positive_score(model, X_te)
    
            y_true_all.append(y_te.to_numpy())
            y_score_all.append(proba)
            # この fold のテストサンプルは全て同じ MSSQ_group
            mssq_group_all.append(
                np.full_like(y_te.to_numpy(), fill_value=test_group, dtype=object)
            )
    
        if not y_true_all:
            auc_total = float("nan")
            auc_low = float("nan")
            auc_high = float("nan")
        else:
            y_true = np.concatenate(y_true_all)
            y_score = np.concatenate(y_score_all)
            g_all = np.concatenate(mssq_group_all)
    
            if len(np.unique(y_true)) < 2:
                auc_total = float("nan")
                auc_low = float("nan")
                auc_high = float("nan")
            else:
                auc_total = float(roc_auc_score(y_true, y_score))
    
                # MSSQ-Low / High の部分 AUC
                def _safe_auc(mask: np.ndarray) -> float:
                    """Return the ROC-AUC over the pooled rows selected by ``mask``.

                    Reads ``y_true`` / ``y_score`` from the enclosing scope.
                    Yields NaN when the mask selects nothing or when the
                    selected labels contain fewer than two classes.
                    """
                    nothing_selected = mask.sum() == 0
                    # Short-circuit keeps the label lookup from running on an empty mask.
                    if nothing_selected or len(np.unique(y_true[mask])) < 2:
                        return float("nan")
                    return float(roc_auc_score(y_true[mask], y_score[mask]))
    
                auc_low = _safe_auc(g_all == MSSQ_LOW_LABEL)
                auc_high = _safe_auc(g_all == MSSQ_HIGH_LABEL)
    
        results.append({
            "size": k,                 # 使用した「グループ数」k
            "n_features": len(feats),  # 実際に使った列数
            "groups": use_groups,      # 使用したグループ名
            "features": feats,         # 使用した列名
            "auc_total": auc_total,
            "auc_low": auc_low,
            "auc_high": auc_high,
            "skip_folds": skip_folds,
        })
        print(
            f"[Cell5-MSSQ] k(groups)={k}/{total_groups}, "
            f"n_features={len(feats)}, "
            f"AUC_total={auc_total:.4f}, "
            f"AUC_Low={auc_low:.4f}, AUC_High={auc_high:.4f}, "
            f"skip_folds={skip_folds}"
        )
    
    # ---- 結果の整形・保存 ----
    results_df = pd.DataFrame(results)
    results_df = results_df.sort_values("size")  # k昇順
    
    results_df["groups_str"] = results_df["groups"].apply(lambda lst: ",".join(lst))
    results_df["features_str"] = results_df["features"].apply(lambda lst: ",".join(lst))
    
    # 1) このセル独自のファイル（ALLK_*）
    subset_csv_name_allk = "ALLK_TOPORDER_AUC_MSSQ_SPLIT.csv"
    subset_path_allk = outpath(subset_csv_name_allk)
    results_df[
        ["size", "n_features", "groups_str", "features_str",
         "auc_total", "auc_low", "auc_high", "skip_folds"]
    ].to_csv(subset_path_allk, index=False, encoding="utf-8-sig")
    print(f"[Cell5-MSSQ] 保存 (ALLK, MSSQ-split) -> {subset_path_allk}")
    
    # 最良のk（AUC_total 最大、同点なら小さいk）を取得
    valid_df = results_df.dropna(subset=["auc_total"])
    if valid_df.empty:
        raise RuntimeError("[Cell5-MSSQ] 有効な AUC_total が得られませんでした。")
    
    best_row = valid_df.sort_values(
        ["auc_total", "size"], ascending=[False, True]
    ).iloc[0]
    print(
        f"[Cell5-MSSQ] best k(groups)={int(best_row['size'])}, "
        f"n_features={int(best_row['n_features'])}, "
        f"AUC_total={best_row['auc_total']:.4f}, "
        f"AUC_Low={best_row['auc_low']:.4f}, "
        f"AUC_High={best_row['auc_high']:.4f}, "
        f"groups={best_row['groups']}"
    )
    
    # 2) このセル独自の JSON（ALLK_SUBSET_BEST_MSSQ_SPLIT.json）
    best_json_name_allk = "ALLK_SUBSET_BEST_MSSQ_SPLIT.json"
    with open(outpath(best_json_name_allk), "w", encoding="utf-8") as f:
        json.dump({
            "size": int(best_row["size"]),              # グループ数
            "n_features": int(best_row["n_features"]),  # 列数
            "auc_total": float(best_row["auc_total"]),
            "auc_low": float(best_row["auc_low"]),
            "auc_high": float(best_row["auc_high"]),
            "groups": best_row["groups"],
            "features": best_row["features"],
        }, f, ensure_ascii=False, indent=2)
    print(f"[Cell5-MSSQ] BEST (ALLK, MSSQ-split) -> {outpath(best_json_name_allk)}")
    
    # 3) 旧「組合せ探索版」と同じ名前・形式でも保存（互換用）
    subset_csv_name_compat = f"TOP{TOP_SUBSET_K}_SUBSET_AUC_MSSQ_SPLIT.csv"
    subset_path_compat = outpath(subset_csv_name_compat)
    results_df[
        ["size", "n_features", "groups_str", "features_str", "auc_total"]
    ].rename(columns={"auc_total": "auc"}).to_csv(
        subset_path_compat, index=False, encoding="utf-8-sig"
    )
    print(f"[Cell5-MSSQ] 互換CSV (MSSQ-split) -> {subset_path_compat}")
    
    best_json_name_compat = f"TOP{TOP_SUBSET_K}_SUBSET_BEST_MSSQ_SPLIT.json"
    with open(outpath(best_json_name_compat), "w", encoding="utf-8") as f:
        json.dump({
            "size": int(best_row["size"]),              # グループ数
            "n_features": int(best_row["n_features"]),  # 列数
            "auc": float(best_row["auc_total"]),        # 互換用：auc_total を auc として保存
            "groups": best_row["groups"],
            "features": best_row["features"],
        }, f, ensure_ascii=False, indent=2)
    print(f"[Cell5-MSSQ] 互換BEST JSON (MSSQ-split) -> {outpath(best_json_name_compat)}")
    
    # グローバル変数も旧仕様に合わせて更新（別名変数として保持）
    globals()["BEST_SUBSET_FEATURES_MSSQ_SPLIT"] = best_row["features"]   # 実際に使う列
    globals()["BEST_SUBSET_GROUPS_MSSQ_SPLIT"] = best_row["groups"]       # 使ったグループ名
    globals()["BEST_SUBSET_K_MSSQ_SPLIT"] = int(best_row["size"])         # グループ数として解釈
    
    # ---- グラフ描画：横軸=グループ数k（右ほど少ない）, 縦軸=ROC-AUC ----
    FS_TITLE, FS_LABEL, FS_TICK = 30, 24, 20
    LW = 1.5
    
    plt.figure(figsize=(8, 8))
    plt.plot(results_df["size"], results_df["auc_total"], marker="o",
             linewidth=LW, label="Overall")
    plt.plot(results_df["size"], results_df["auc_low"], marker="s",
             linewidth=LW, linestyle="--", label=f"MSSQ={MSSQ_LOW_LABEL}")
    plt.plot(results_df["size"], results_df["auc_high"], marker="^",
             linewidth=LW, linestyle=":", label=f"MSSQ={MSSQ_HIGH_LABEL}")
    
    ax = plt.gca()
    ax.set_xlabel("Number of feature groups (k)", fontsize=FS_LABEL)
    ax.set_ylabel("ROC-AUC (pooled LOSO, MSSQ-split models)", fontsize=FS_LABEL)
    ax.set_title(
        "ROC-AUC vs. number of feature groups\n(MSSQ-split models, all SHAP-ranked groups)",
        fontsize=FS_TITLE
    )
    
    # 横軸：左が k = total_groups（全部）、右が k = 1（1グループだけ）
    ax.set_xlim(total_groups, 1)
    ax.set_xticks(range(1, total_groups + 1))
    
    ax.set_ylim(0.0, 1.0)
    
    ax.tick_params(axis="x", labelsize=FS_TICK)
    ax.tick_params(axis="y", labelsize=FS_TICK)
    ax.grid(True, alpha=0.3)
    ax.legend(fontsize=FS_TICK)
    
    plt.tight_layout()
    plt.savefig(outpath("ALLK_TOPORDER_AUC_MSSQ_SPLIT.png"), dpi=300)
    plt.close()
    
    print(f"[Cell5-MSSQ] 図を保存 -> {outpath('ALLK_TOPORDER_AUC_MSSQ_SPLIT.png')}")
    


In [None]:
# ===== Cell 5A: MAXk ROC-AUC（CI計算＋ROC曲線, MSSQ層別） =====
RUN_CELL5 = bool(globals().get('RUN_CELL5', True))
if not RUN_CELL5:
    print('[Cell5-16] RUN_CELL5=False -> skip')
else:
    set_cell_output(5)
    
    
    import os
    import json
    import numpy as np
    import pandas as pd
    import matplotlib.pyplot as plt
    from sklearn.metrics import roc_auc_score, roc_curve
    from sklearn.model_selection import LeaveOneGroupOut
    
    
    # ---------- Prerequisite check ----------
    # Names that earlier cells must already have placed in the notebook namespace.
    required = [
        "X_all", "y_all", "groups",
        "SUBJECT_META",
        "fit_classifier", "predict_positive_score",
        "outpath", "cell_output_path",
    ]
    missing = [name for name in required if name not in globals()]
    if missing:
        raise RuntimeError(f"[Cell5A-MSSQ] 未定義の変数/関数があります: {missing}")

    # ---------- MSSQ group information ----------
    if "MSSQ_group" not in SUBJECT_META.columns:
        raise RuntimeError("[Cell5A-MSSQ] SUBJECT_META に 'MSSQ_group' 列がありません。")

    # Drop missing entries BEFORE the string cast. astype(str) would otherwise turn
    # NaN/None into the literal strings "nan"/"None"; those subjects would then pass
    # the isna() check below and be modelled as a bogus extra group instead of
    # triggering the "MSSQ_group not set" error.
    # NOTE(review): the dict keys are the SUBJECT_META index values, which are
    # matched against the stringified subject IDs below - confirm the index holds
    # string subject IDs.
    _subj_mssq_group = SUBJECT_META["MSSQ_group"].dropna().astype(str).to_dict()
    subj_series = groups.astype(str)
    mssq_group_series = subj_series.map(_subj_mssq_group)

    # Every sample must map to a group; otherwise list the offending subjects.
    if mssq_group_series.isna().any():
        missing_ids = sorted(subj_series[mssq_group_series.isna()].unique())
        raise RuntimeError(
            "[Cell5A-MSSQ] MSSQ_group が未設定の被験者があります: "
            + ", ".join(map(str, missing_ids))
        )

    # Group labels can be overridden from the notebook namespace.
    MSSQ_LOW_LABEL = str(globals().get("MSSQ_LOW_LABEL", "Low"))
    MSSQ_HIGH_LABEL = str(globals().get("MSSQ_HIGH_LABEL", "High"))

    print(f"[Cell5A-MSSQ] MSSQ_group ラベル: Low='{MSSQ_LOW_LABEL}', High='{MSSQ_HIGH_LABEL}'")
    print(f"[Cell5A-MSSQ] MSSQ_group 割合:\n{mssq_group_series.value_counts()}")
    
    # ---------- Decide which feature-subset JSON to use ----------
    # Candidates are tried in order; the first existing file wins.
    _top_k_cfg = int(globals().get("TOP_SUBSET_K", 15))

    # Cell5-MSSQ saves its compat JSON as TOP{TOP_SUBSET_K}_SUBSET_BEST_MSSQ_SPLIT.json,
    # so the name must be built from TOP_SUBSET_K to find the file it wrote.
    subset_primary_mssq = f"TOP{_top_k_cfg}_SUBSET_BEST_MSSQ_SPLIT.json"
    # Name previously looked up here (keyed by the best k); kept so files that
    # happen to carry that name are still picked up.
    subset_legacy_mssq = (
        f"TOP{int(globals().get('BEST_SUBSET_K_MSSQ_SPLIT', _top_k_cfg))}"
        "_SUBSET_BEST_MSSQ_SPLIT.json"
    )

    subset_candidates = [
        cell_output_path(5, "ALLK_SUBSET_BEST_MSSQ_SPLIT.json"),
        cell_output_path(5, subset_primary_mssq),
        cell_output_path(5, subset_legacy_mssq),
    ]

    # Fall back to the non-split TOP*_SUBSET_BEST.json files saved under cell 3.
    subset_primary = f"TOP{_top_k_cfg}_SUBSET_BEST.json"
    subset_candidates.extend([
        cell_output_path(3, subset_primary),
        cell_output_path(3, "TOP10_SUBSET_BEST.json"),
    ])

    subset_json = next((p for p in subset_candidates if os.path.exists(p)), None)
    if subset_json is None:
        raise FileNotFoundError(
            "[Cell5A-MSSQ] TOP*_SUBSET_BEST(_MSSQ_SPLIT).json が見つかりません。"
            " Cell3A-Subset や Cell5-MSSQ を実行してください。"
        )

    with open(subset_json, "r", encoding="utf-8") as f:
        subset_info = json.load(f)

    requested_features = subset_info.get("features", [])
    if not requested_features:
        raise RuntimeError("[Cell5A-MSSQ] JSON 内に features がありません。")

    # Keep only the features that exist in X_all, and report any that were
    # dropped so a stale JSON does not silently change the evaluated model.
    best_features = [f for f in requested_features if f in X_all.columns]
    dropped_features = [f for f in requested_features if f not in X_all.columns]
    if dropped_features:
        print(f"[Cell5A-MSSQ][WARN] X_all に存在しない特徴を除外しました: {dropped_features}")
    if not best_features:
        raise RuntimeError("[Cell5A-MSSQ] X_all に存在する特徴がありません。")

    # NOTE: best_k is the number of feature columns actually used.
    best_k = len(best_features)

    print(f"[Cell5A-MSSQ] 使用特徴 ({best_k}) from {os.path.basename(subset_json)}: {best_features}")
    print("[Cell5A-MSSQ][DEBUG] 実際にモデルに渡した特徴量リスト:")
    for i, f_name in enumerate(best_features, start=1):
        print(f"  {i:2d}: {f_name}")
    
    # ---------- Pooled LOSO scores, one model per MSSQ group ----------
    # Each fold holds out one subject; the classifier for that fold is fitted
    # only on training rows whose subject shares the held-out subject's MSSQ group.
    logo = LeaveOneGroupOut()
    y_true_all, proba_all, subj_all, group_all = [], [], [], []

    skip_folds = 0

    for tr_idx, te_idx in logo.split(X_all, y_all, groups):
        tr_idx, te_idx = np.asarray(tr_idx), np.asarray(te_idx)

        # A fold's test side is expected to contain exactly one subject.
        te_subjects = subj_series.iloc[te_idx]
        te_subj_ids = np.unique(te_subjects.astype(str))
        if len(te_subj_ids) != 1:
            raise RuntimeError(
                f"[Cell5A-MSSQ] te_idx に複数被験者が含まれています: {te_subj_ids}"
            )
        test_sid = te_subj_ids[0]
        test_group = _subj_mssq_group.get(test_sid, None)
        if test_group is None:
            skip_folds += 1
            continue

        # Narrow the training rows to the held-out subject's MSSQ group.
        tr_groups = subj_series.iloc[tr_idx].map(_subj_mssq_group)
        tr_idx_group = tr_idx[(tr_groups == test_group).to_numpy()]
        if tr_idx_group.size == 0:
            skip_folds += 1
            continue

        X_tr = X_all.iloc[tr_idx_group][best_features].astype(np.float32)
        y_tr = y_all.iloc[tr_idx_group].astype(int)
        X_te = X_all.iloc[te_idx][best_features].astype(np.float32)
        y_te = y_all.iloc[te_idx].astype(int)

        # A fold is skipped when either side contains a single class only.
        n_classes_tr = len(np.unique(y_tr))
        n_classes_te = len(np.unique(y_te))
        if min(n_classes_tr, n_classes_te) < 2:
            skip_folds += 1
            continue

        model = fit_classifier(X_tr, y_tr)
        proba = predict_positive_score(model, X_te)

        y_te_values = y_te.to_numpy()
        y_true_all.append(y_te_values)
        proba_all.append(proba)
        subj_all.append(te_subjects.to_numpy())
        # All test rows of a fold belong to the same MSSQ group.
        group_all.append(np.full(y_te_values.shape, test_group, dtype=object))

    print(f"[Cell5A-MSSQ] スキップされた fold 数: {skip_folds}")

    if len(y_true_all) == 0:
        raise RuntimeError("[Cell5A-MSSQ] 評価に必要な fold が得られませんでした。")

    y_pool, s_pool, subj_pool, g_pool = (
        np.concatenate(chunks)
        for chunks in (y_true_all, proba_all, subj_all, group_all)
    )

    if len(np.unique(y_pool)) < 2:
        raise RuntimeError("[Cell5A-MSSQ] 真値が単一クラスのため ROC-AUC を計算できません。")

    # ---------- Pooled predictions table (also the input of the bootstrap) ----------
    # Fixed-seed generator used for the subject-level bootstrap below.
    rng = np.random.default_rng(20251101)

    df_pool = pd.DataFrame({
        "subject": subj_pool,
        "mssq_group": g_pool,
        "y_true": y_pool,
        "y_score": s_pool,
    })
    pred_csv_path = cell_output_path(5, "MSSQ_SPLIT_PREDICTIONS.csv")
    df_pool.to_csv(pred_csv_path, index=False, encoding="utf-8-sig")
    print(f"[Cell5A-MSSQ] cross-validated predictions -> {pred_csv_path}")
    
    def _bootstrap_auc_by_subject(df: pd.DataFrame, n_boot: int = 2000) -> tuple[float, float, float]:
        """Estimate the ROC-AUC and a 95% percentile interval by resampling subjects.

        Whole subjects (all of their rows) are drawn with replacement, using the
        ``rng`` generator of the enclosing cell.

        Args:
            df: Frame with ``subject``, ``y_true`` and ``y_score`` columns.
            n_boot: Number of bootstrap resamples.

        Returns:
            ``(auc, ci_low, ci_high)``. All three are NaN when ``y_true`` holds
            fewer than two classes; the interval bounds are NaN when no resample
            contained both classes.
        """
        # Observed value on the full frame.
        if df["y_true"].nunique() < 2:
            return (float("nan"), float("nan"), float("nan"))
        y_arr = df["y_true"].to_numpy()
        s_arr = df["y_score"].to_numpy()
        auc_obs = float(roc_auc_score(y_arr, s_arr))

        # Slice every subject's rows once, instead of re-filtering the whole frame
        # for each sampled subject in each resample.
        subj_arr = df["subject"].to_numpy()
        subj_ids = df["subject"].unique()
        rows_by_subject = {}
        for sid in subj_ids:
            rows = np.flatnonzero(subj_arr == sid)
            rows_by_subject[sid] = (y_arr[rows], s_arr[rows])

        auc_boot = []
        for _ in range(n_boot):
            # Same draw as before, so the rng stream (and the result) is unchanged.
            sampled = rng.choice(subj_ids, size=len(subj_ids), replace=True)
            y_boot = np.concatenate([rows_by_subject[sid][0] for sid in sampled])
            # A resample containing a single class has no defined AUC; skip it.
            if len(np.unique(y_boot)) < 2:
                continue
            s_boot = np.concatenate([rows_by_subject[sid][1] for sid in sampled])
            auc_boot.append(float(roc_auc_score(y_boot, s_boot)))

        if auc_boot:
            ci_low = float(np.quantile(auc_boot, 0.025))
            ci_high = float(np.quantile(auc_boot, 0.975))
        else:
            ci_low = ci_high = float("nan")
        return auc_obs, ci_low, ci_high
    
    # ---------- AUC and bootstrap interval: overall, MSSQ-Low, MSSQ-High ----------
    # The call order (overall -> Low -> High) is kept as is.
    auc_overall, ci_low_overall, ci_high_overall = _bootstrap_auc_by_subject(df_pool)

    df_low = df_pool.loc[df_pool["mssq_group"] == MSSQ_LOW_LABEL]
    auc_low, ci_low_low, ci_high_low = _bootstrap_auc_by_subject(df_low)

    df_high = df_pool.loc[df_pool["mssq_group"] == MSSQ_HIGH_LABEL]
    auc_high, ci_low_high, ci_high_high = _bootstrap_auc_by_subject(df_high)

    def _summary_record(name: str, frame: pd.DataFrame,
                        auc: float, lo: float, hi: float) -> dict:
        """Build one row of the AUC summary table for the given subset."""
        return {
            "group": name,
            "k": best_k,
            "auc": auc,
            "ci_low": lo,
            "ci_high": hi,
            "n_subjects": int(frame["subject"].nunique()),
            "n_samples": int(len(frame)),
        }

    records = [
        _summary_record("overall", df_pool,
                        auc_overall, ci_low_overall, ci_high_overall),
        _summary_record(f"MSSQ={MSSQ_LOW_LABEL}", df_low,
                        auc_low, ci_low_low, ci_high_low),
        _summary_record(f"MSSQ={MSSQ_HIGH_LABEL}", df_high,
                        auc_high, ci_low_high, ci_high_high),
    ]

    auc_summary_path = outpath("AUC_K_CI_MSSQ_SPLIT.csv")
    summary_df = pd.DataFrame(records)
    summary_df.to_csv(auc_summary_path, index=False, encoding="utf-8-sig")
    print(f"[Cell5A-MSSQ] AUC summary を保存 -> {auc_summary_path}")

    # Console report, one line per subset.
    print(
        f"[Cell5A-MSSQ] overall AUC={auc_overall:.4f} "
        f"(95% CI [{ci_low_overall:.4f}, {ci_high_overall:.4f}])"
    )
    print(
        f"[Cell5A-MSSQ] MSSQ={MSSQ_LOW_LABEL} AUC={auc_low:.4f} "
        f"(95% CI [{ci_low_low:.4f}, {ci_high_low:.4f}])"
    )
    print(
        f"[Cell5A-MSSQ] MSSQ={MSSQ_HIGH_LABEL} AUC={auc_high:.4f} "
        f"(95% CI [{ci_low_high:.4f}, {ci_high_high:.4f}])"
    )
    
    # ---------- ROC curve over the pooled predictions (overall) ----------
    fpr, tpr, _ = roc_curve(y_pool, s_pool)

    FS_TITLE, FS_LABEL, FS_TICK = 30, 24, 20
    LW = 1.5

    fig, ax = plt.subplots(figsize=(7, 7))
    ax.plot(fpr, tpr, linewidth=LW,
            label=f"MSSQ-split models (AUC = {auc_overall:.3f})")
    # Diagonal reference line.
    ax.plot([0, 1], [0, 1], linestyle="--", color="gray", label="Chance")

    ax.set_xlabel("False Positive Rate", fontsize=FS_LABEL)
    ax.set_ylabel("True Positive Rate", fontsize=FS_LABEL)
    ax.set_title("ROC Curve (Best Subset, MSSQ-split models)", fontsize=FS_TITLE)
    ax.legend(loc="lower right", fontsize=FS_TICK)
    ax.grid(True, alpha=0.4)

    for axis_name in ("x", "y"):
        ax.tick_params(axis=axis_name, labelsize=FS_TICK)

    fig.tight_layout()
    fig.savefig(outpath("AUC_K_CI_MSSQ_SPLIT.png"), dpi=300)
    plt.close(fig)
    print(f"[Cell5A-MSSQ] ROC 図を保存 -> {outpath('AUC_K_CI_MSSQ_SPLIT.png')}")
    


In [None]:
# ===== Cell 5B: SHAP 可視化（MSSQ層別・MAXkハイライト） =====
RUN_CELL5 = bool(globals().get('RUN_CELL5', True))
if not RUN_CELL5:
    print('[Cell5B] RUN_CELL5=False -> skip')
else:
    set_cell_output(5)

    import os
    import re
    import json
    import shap
    import numpy as np
    import pandas as pd
    import matplotlib.pyplot as plt

    # Names that earlier cells must already have placed in the notebook namespace.
    # cell_output_path and GROUP_RANKING_FILE are used further down in this cell,
    # so they are checked here too; a missing one then fails with the same clear
    # message instead of a bare NameError part-way through the cell.
    required = [
        "X_all", "y_all", "fit_classifier", "outpath", "SEED_BASE", "OUT_DIR",
        "cell_output_path", "GROUP_RANKING_FILE",
    ]
    missing = [name for name in required if name not in globals()]
    if missing:
        raise RuntimeError(f"[Cell5B-SHAP] 未定義の変数/関数があります: {missing}")

    def get_feature_group(col: str) -> str:
        """Return the group name of a column: the name with a trailing ``_lag<digits>`` removed.

        Columns without such a suffix form their own group and are returned unchanged.
        """
        lagged = re.match(r"(.+)_lag\d+$", col)
        if lagged is None:
            return col
        return lagged.group(1)

    # ---------- Columns fed to the model ----------
    # Use the "keep" list of FEATURES_AFTER_CORR.json when that file exists,
    # otherwise every column of X_all.
    FEATURE_LIST_PATH = cell_output_path(3, "FEATURES_AFTER_CORR.json")
    if not os.path.exists(FEATURE_LIST_PATH):
        cols_for_model = list(X_all.columns)
        print("[Cell5B-SHAP] FEATURES_AFTER_CORR.json が無いため、全列を使用します。")
    else:
        with open(FEATURE_LIST_PATH, "r", encoding="utf-8") as f:
            keep_payload = json.load(f)
        keep_cols = [c for c in keep_payload.get("keep", []) if c in X_all.columns]
        if not keep_cols:
            raise RuntimeError("[Cell5B-SHAP] FEATURES_AFTER_CORR.json の keep に有効な列がありません。")
        cols_for_model = keep_cols
        print(f"[Cell5B-SHAP] Using correlation-pruned columns ({len(cols_for_model)})")

    # ---------- Column <-> group lookup tables ----------
    col_to_group: dict[str, str] = {c: get_feature_group(c) for c in cols_for_model}
    group_to_cols: dict[str, list[str]] = {}
    for col in cols_for_model:
        group_to_cols.setdefault(col_to_group[col], []).append(col)

    group_names = list(group_to_cols)

    # ---------- Group display order ----------
    # Follow the saved group ranking when present (ascending rank_mean if that
    # column exists, otherwise the file's row order); else use first-seen order.
    group_rank_csv = cell_output_path(3, GROUP_RANKING_FILE)
    if os.path.exists(group_rank_csv):
        group_rank_df = pd.read_csv(group_rank_csv, index_col=0)
        if "rank_mean" not in group_rank_df.columns:
            group_order = list(group_rank_df.index)
        else:
            group_order = group_rank_df.sort_values("rank_mean").index.tolist()
        print(f"[Cell5B-SHAP] グループランキング順に従って描画します。groups={len(group_order)}")
    else:
        group_order = group_names
        print("[Cell5B-SHAP] GROUP_RANKING_FILE が無いため、グループ名の順で描画します。")
    # Ranked groups that have no column in the model are discarded.
    group_order = [g for g in group_order if g in group_to_cols]

    # ---------- Fit on all samples and compute per-column SHAP values ----------
    X_shap = X_all[cols_for_model].astype(np.float32)
    y_shap = y_all.astype(int)
    print(f"[Cell5B-SHAP] samples={X_shap.shape[0]}, columns={X_shap.shape[1]}")

    model = fit_classifier(X_shap, y_shap)

    # Background sample (at most 256 rows) for the interventional explainer.
    background = shap.sample(X_shap, min(256, len(X_shap)), random_state=SEED_BASE)
    explainer = shap.TreeExplainer(
        model,
        data=background,
        model_output="probability",
        feature_perturbation="interventional",
    )
    shap_values_any = explainer.shap_values(X_shap)

    # Position of class label 1 in the model's classes_; the last entry is used
    # when the model has no classes_ attribute or does not list label 1.
    model_classes = list(model.classes_) if hasattr(model, "classes_") else []
    pos_idx = model_classes.index(1) if 1 in model_classes else -1

    # Reduce whatever the explainer returned to a 2-D (samples x columns) array:
    # a list is indexed by class, a 3-D array carries the class on its last axis.
    if isinstance(shap_values_any, list):
        shap_values_col = np.asarray(shap_values_any[pos_idx])
    else:
        shap_values_col = np.asarray(shap_values_any)

    if shap_values_col.ndim == 3:
        shap_values_col = shap_values_col[:, :, pos_idx]
    elif shap_values_col.ndim == 1:
        shap_values_col = shap_values_col.reshape(-1, 1)

    if shap_values_col.shape[1] != X_shap.shape[1]:
        raise RuntimeError(f"[Cell5B-SHAP] shap_values 形状が一致しません: {shap_values_col.shape} vs {X_shap.shape}")

    # ---------- Aggregate column SHAP values into group SHAP values ----------
    n_samples = shap_values_col.shape[0]
    n_groups = len(group_order)

    # A group's SHAP value for a sample is the sum over its member columns.
    shap_values_group = np.zeros((n_samples, n_groups), dtype=float)
    for j, g in enumerate(group_order):
        member_positions = [X_shap.columns.get_loc(c) for c in group_to_cols[g]]
        shap_values_group[:, j] = shap_values_col[:, member_positions].sum(axis=1)

    # Mean absolute group SHAP value over all samples.
    shap_abs_mean_group = np.abs(shap_values_group).mean(axis=0)
    shap_abs_mean_group_series = pd.Series(shap_abs_mean_group, index=group_order, name="mean_abs")

    shap_mean_df = pd.DataFrame({
        "group": group_order,
        "mean_abs": shap_abs_mean_group,
    })
    shap_mean_df = shap_mean_df.sort_values("mean_abs", ascending=False)

    shap_csv_path = cell_output_path(5, "SHAP_GROUP_MEAN_ABS.csv")
    shap_mean_df.to_csv(shap_csv_path, index=False, encoding="utf-8-sig")
    print(f"[Cell5B-SHAP] グループ SHAP 平均絶対値を保存 -> {shap_csv_path}")

    # ---------- Groups to highlight: those of the best-subset JSON ----------
    TOP_K = int(globals().get("TOP_SUBSET_K", 15))
    highlight_groups: list[str] = []

    # Candidate files in priority order; the first one that exists is used.
    subset_json_candidates = [
        cell_output_path(cell_no, file_name)
        for cell_no, file_name in (
            (5, f"TOP{TOP_K}_SUBSET_BEST_MSSQ_SPLIT.json"),
            (5, "TOP10_SUBSET_BEST_MSSQ_SPLIT.json"),
            (5, "ALLK_SUBSET_BEST_MSSQ_SPLIT.json"),
            (4, f"TOP{TOP_K}_SUBSET_BEST.json"),
            (3, f"TOP{TOP_K}_SUBSET_BEST.json"),
        )
    ]
    subset_json_path = None
    for candidate in subset_json_candidates:
        if os.path.exists(candidate):
            subset_json_path = candidate
            break

    if subset_json_path is None:
        print("[Cell5B-SHAP] subset JSON が見つからなかったため、ハイライトなし。")
    else:
        # Best effort: a file that cannot be read only disables the highlighting.
        try:
            with open(subset_json_path, "r", encoding="utf-8") as f:
                info = json.load(f)
            feature_cols_subset = info.get("features", [])
            highlight_groups = sorted(
                {get_feature_group(c) for c in feature_cols_subset if c in col_to_group}
            )
            print(f"[Cell5B-SHAP] subset highlight groups (from {os.path.basename(subset_json_path)}):")
            print(f"  {highlight_groups}")
        except Exception as e:
            print(f"[Cell5B-SHAP][WARN] subset読み込み失敗: {e}")
            highlight_groups = []

    def _render_group_summary(max_display: int, file_name: str) -> str:
        """Draw the group-level SHAP bar summary, save it under cell 5, and return the path.

        Tick labels of groups listed in ``highlight_groups`` are shown in bold red.
        """
        plt.figure(figsize=(12, 8))
        shap.summary_plot(
            shap_values_group,
            features=pd.DataFrame(shap_values_group, columns=group_order),
            feature_names=group_order,
            max_display=max_display,
            show=False,
            plot_type="bar",
        )
        if highlight_groups:
            for tick_label in plt.gca().get_yticklabels():
                if tick_label.get_text() in highlight_groups:
                    tick_label.set_color("red")
                    tick_label.set_fontweight("bold")
        plt.tight_layout()
        target_path = cell_output_path(5, file_name)
        plt.savefig(target_path, dpi=300)
        plt.close()
        return target_path

    # Top-K groups only.
    summary_top_path = _render_group_summary(TOP_K, f"SHAP_GROUP_SUMMARY_TOP{TOP_K}.png")
    print(f"[Cell5B-SHAP] Summary plot (Top-{TOP_K} groups) -> {summary_top_path}")

    # Every group.
    summary_all_path = _render_group_summary(len(group_order), "SHAP_GROUP_SUMMARY_ALL.png")
    print(f"[Cell5B-SHAP] Summary plot (ALL groups) -> {summary_all_path}")


In [None]:
# ===== Cell 6 helper: Inner LOSO folds builder =====
set_cell_output(6)



from typing import List
import pandas as pd

def choose_inner_folds_loso(train_subject_ids: List[str]) -> List[List[str]]:
    """
    Build inner leave-one-subject-out folds from the training-side subject IDs.

    Args:
        train_subject_ids: Subject IDs on the training side of an outer fold
            (list or tuple). Every ID is converted with ``str`` and duplicates
            are dropped.

    Returns:
        One single-element list per distinct subject (``[[sid1], [sid2], ...]``),
        ordered by ID length first and then lexicographically.

    Raises:
        RuntimeError: If the argument is not a list/tuple, or contains no IDs.
    """
    if not isinstance(train_subject_ids, (list, tuple)):
        raise RuntimeError("[inner folds] train_subject_ids は list/tuple を想定しています。")

    # dict.fromkeys removes duplicates while keeping first-seen order.
    distinct_ids = list(dict.fromkeys(str(sid) for sid in train_subject_ids))
    if not distinct_ids:
        raise RuntimeError("[inner folds] train_subject_ids が空です。")

    # Shorter IDs sort first, so that e.g. "2" comes before "10".
    distinct_ids.sort(key=lambda sid: (len(sid), sid))
    folds = [[sid] for sid in distinct_ids]
    print(f"[inner folds] {len(folds)} splits -> val subjects = {', '.join(distinct_ids)}")
    return folds


In [None]:
# ===== Cell 6-Neutral: inner-LOSO τ最適化（グリッド探索→CSV出力のみ） =====
RUN_CELL6_NEUTRAL = bool(globals().get('RUN_CELL6_NEUTRAL', True))
if not RUN_CELL6_NEUTRAL:
    print('[Cell6N-18] RUN_CELL6_NEUTRAL=False -> skip')
else:
    set_cell_output(6)
    
    import os
    import json
    import numpy as np
    import pandas as pd
    import sklearn.metrics as skm
    from sklearn.model_selection import LeaveOneGroupOut
    
    # ---------------- モード切替（F1 / BA） ----------------
    CELL6_MODE = str(globals().get("CELL6_MODE", "F1")).upper()
    if CELL6_MODE not in {"F1", "BA"}:
        raise ValueError(f"[Cell6] CELL6_MODE は 'F1' または 'BA' を指定してください（今: {CELL6_MODE}）")
    MODE_TAG = CELL6_MODE
    METRIC_LABEL = MODE_TAG
    
    # グリッド探索の分解能（未定義ならデフォルト値）
    COARSE_STEPS = int(globals().get("COARSE_STEPS", 51))
    FINE_STEPS   = int(globals().get("FINE_STEPS", 51))
    FINE_MARGIN  = float(globals().get("FINE_MARGIN", 0.05))
    
    # ---------------- 出力ディレクトリ（Cell6 内に F1/BA サブディレクトリ） ----------------
    CELL6_ROOT = OUT_DIR  # 例: .../Cell6
    MODE_DIR = os.path.join(CELL6_ROOT, MODE_TAG)
    os.makedirs(MODE_DIR, exist_ok=True)
    
    def cell6_out(filename: str) -> str:
        """Return the path of `filename` under MODE_DIR, creating its parent directory if needed."""
        target = os.path.join(MODE_DIR, filename)
        parent_dir = os.path.dirname(target)
        os.makedirs(parent_dir, exist_ok=True)
        return target
    
    GROUP_AWARE_DIR = os.path.join(MODE_DIR, "GROUP_AWARE")
    os.makedirs(GROUP_AWARE_DIR, exist_ok=True)
    
    def groupaware_out(filename: str) -> str:
        """Return the path of `filename` under GROUP_AWARE_DIR, creating its parent directory if needed."""
        target = os.path.join(GROUP_AWARE_DIR, filename)
        parent_dir = os.path.dirname(target)
        os.makedirs(parent_dir, exist_ok=True)
        return target
    
    # ---------------- 群分け基準（MSSQ / VIMSSQ 切り替え） ----------------
    GROUPING_BASIS_FOR_FAIRNESS = globals().get(
        "GROUPING_BASIS_FOR_FAIRNESS",
        globals().get("GROUPING_BASIS_FOR_PLOTS", "MSSQ"),
    )
    basis = str(GROUPING_BASIS_FOR_FAIRNESS).upper()
    if basis == "MSSQ":
        GROUP_COL_NAME = "MSSQ_group"
    elif basis == "VIMSSQ":
        GROUP_COL_NAME = "VIMSSQ_group"
    else:
        raise ValueError(
            f"[Cell6] GROUPING_BASIS_FOR_FAIRNESS は 'MSSQ' か 'VIMSSQ' を指定してください（今: {GROUPING_BASIS_FOR_FAIRNESS}）"
        )
    print(f"[Cell6] MODE={MODE_TAG} / group_col={GROUP_COL_NAME}")
    
    # ---------------- 基本チェック ----------------
    req = [
        "X_all", "y_all", "groups", "SUBJECT_META",
        "choose_inner_folds_loso", "fit_classifier",
        "predict_positive_score", "outpath"
    ]
    missing = [v for v in req if v not in globals()]
    if missing:
        raise RuntimeError(f"[Cell6] 未定義の変数/関数があります: {missing}")
    
    # ---------------- 入力整形 ----------------
    X_base = X_all.astype(np.float32)
    y_base = y_all.astype(int)
    g_base = groups.astype(str)
    
    # Normalise the per-sample group labels to exactly "High" / "Low".
    # Subject -> group label lookup taken from SUBJECT_META (either the
    # "subject_id" column or, if that column is absent, the index).
    if "subject_id" in SUBJECT_META.columns:
        meta_groups = SUBJECT_META.set_index("subject_id")[GROUP_COL_NAME]
    else:
        meta_groups = SUBJECT_META[GROUP_COL_NAME]
    # g_base holds subject IDs as str (see `groups.astype(str)` above), so the
    # lookup keys are cast to str as well; otherwise integer IDs never match.
    mapper = {str(sid): label for sid, label in meta_groups.astype(str).items()}

    fair_groups_raw = g_base.map(mapper)
    if fair_groups_raw.isna().any():
        raise RuntimeError(f"[Cell6] {GROUP_COL_NAME} 未割当ID: {sorted(set(g_base[fair_groups_raw.isna()]))}")

    fair_groups = (
        fair_groups_raw.astype(str).str.strip().str.lower().map({"high": "High", "low": "Low"})
    )
    unmapped = fair_groups.isna()
    if unmapped.any():
        # Report exactly the raw labels that failed the strip/lower normalisation
        # (a literal, case-sensitive list would also flag valid ones like "HIGH").
        bad_labels = sorted(set(fair_groups_raw[unmapped]))
        raise RuntimeError(f"[Cell6] {GROUP_COL_NAME} に 'High'/'Low' 以外のラベル: {bad_labels}")
    
    print(f"\n[Cell6-DEBUG] SUBJECT_META {GROUP_COL_NAME} 分類一覧")
    if "subject_id" in SUBJECT_META.columns:
        dbg_meta = SUBJECT_META[["subject_id", GROUP_COL_NAME]].copy()
    else:
        dbg_meta = SUBJECT_META.reset_index()[["subject_id", GROUP_COL_NAME]].copy()
    print(dbg_meta.sort_values([GROUP_COL_NAME, "subject_id"]).to_string(index=False))
    
    # ---------------- 特徴選抜（Cell3A-Subset の JSON を流用） ----------------
    subset_primary = f"TOP{int(globals().get('TOP_SUBSET_K', 15))}_SUBSET_BEST.json"
    subset_candidates = [
        cell_output_path(4, subset_primary),
        cell_output_path(4, "TOP10_SUBSET_BEST.json"),
        cell_output_path(3, subset_primary),
        cell_output_path(3, "TOP10_SUBSET_BEST.json"),
    ]
    subset_json = next((p for p in subset_candidates if os.path.exists(p)), None)
    if subset_json is None:
        raise FileNotFoundError("[Cell6] TOP*_SUBSET_BEST.json が見つからない．Cell4 を実行すること．")
    
    with open(subset_json, "r", encoding="utf-8") as f:
        subset_info = json.load(f)
    raw_features = subset_info.get("features", [])
    if not raw_features:
        raise RuntimeError(f"[Cell6] JSON 内に 'features' が空です -> {os.path.basename(subset_json)}")
    
    feature_order = [f for f in raw_features if f in X_base.columns]
    if not feature_order:
        raise RuntimeError(f"[Cell6] JSON の features が X_all に1つも存在しない: {raw_features}")
    
    extra_traits = []
    if globals().get("USE_MSSQ_FEATURE", False) and "MSSQ" in X_base.columns:
        extra_traits.append("MSSQ")
    if globals().get("USE_VIMSSQ_FEATURE", False) and "VIMSSQ" in X_base.columns:
        extra_traits.append("VIMSSQ")
    extra_traits = [f for f in extra_traits if f not in feature_order]
    
    feats_k = feature_order + extra_traits
    print(f"[Cell6] Using subset features from {os.path.basename(subset_json)}")
    print(f"[Cell6] JSON features (base) k={len(feature_order)}: {feature_order}")
    if extra_traits:
        print(f"[Cell6] 追加で使用する属性特徴: {extra_traits}")
    print(f"[Cell6] 最終的に使用する特徴数 = {len(feats_k)}")
    
    X_k = X_base[feats_k]
    
    # ---------------- 指標ユーティリティ（F1/BA をモードで切替） ----------------
    def _conf_from_preds(y_true: np.ndarray, y_pred: np.ndarray):
        """Return the binary confusion counts as (TP, FP, FN, TN) for labels 0/1."""
        cm = skm.confusion_matrix(y_true, y_pred, labels=[0,1])
        # sklearn layout: rows = true class, columns = predicted class.
        TN, FP = cm[0]
        FN, TP = cm[1]
        return TP, FP, FN, TN
    
    def _metric_from_conf(TP, FP, FN, TN) -> float:
        """
        Compute the metric selected by MODE_TAG from confusion counts.

        "F1" -> 2*TP / (2*TP + FP + FN); anything else -> balanced accuracy
        (mean of TPR and TNR). An undefined ratio (zero denominator) counts as 0.0.
        """
        tp, fp, fn, tn = (float(v) for v in (TP, FP, FN, TN))
        if MODE_TAG != "F1":
            # BA
            sensitivity = tp / (tp + fn) if (tp + fn) > 0 else 0.0
            specificity = tn / (tn + fp) if (tn + fp) > 0 else 0.0
            return 0.5 * (sensitivity + specificity)
        f1_denom = (2*tp + fp + fn)
        if f1_denom > 0:
            return 2*tp / f1_denom
        return 0.0
    
    def _metric_binary(y_true: np.ndarray, y_pred: np.ndarray) -> float:
        """Score binary predictions with the metric selected by MODE_TAG (F1 or BA)."""
        return _metric_from_conf(*_conf_from_preds(y_true, y_pred))
    
    def _grid(l, r, steps):
        l = float(max(0.0, l)); r = float(min(1.0, r))
        if l > r: l, r = r, l
        return np.linspace(l, r, int(steps), dtype=float)
    
    # ---------------- τ最適化（Single / WG / Group / Attr） ----------------
    def _single_tau_opt(scores: np.ndarray, y: np.ndarray):
        """
        Find one threshold tau maximising the active metric with a two-stage grid search.

        Stage 1 scans COARSE_STEPS points on [0, 1]; stage 2 rescans FINE_STEPS
        points within +/-FINE_MARGIN of the coarse optimum. Ties resolve to the
        first (smallest) candidate. Predictions are `scores >= tau`.

        Returns:
            dict with the refined tau and metric plus the coarse-stage tau and metric.
        """
        def _scan(cands):
            # Evaluate every candidate and return (best threshold, best metric).
            vals = np.array([_metric_binary(y, (scores >= t).astype(int)) for t in cands])
            k = int(np.nanargmax(vals))
            return float(cands[k]), float(vals[k])

        tau0, best0 = _scan(_grid(0.0, 1.0, COARSE_STEPS))

        lo = max(0.0, tau0 - FINE_MARGIN)
        hi = min(1.0, tau0 + FINE_MARGIN)
        tau, best = _scan(_grid(lo, hi, FINE_STEPS))

        return {
            "tau": tau,
            f"{METRIC_LABEL}_val": best,
            "tau_coarse": tau0,
            f"{METRIC_LABEL}_coarse": best0,
        }
    
    def _wg_opt_joint(scores: np.ndarray, y: np.ndarray, grp: np.ndarray):
        """
        Search group-wise thresholds (tauH, tauL) that maximise "WG", i.e. the
        smaller of the High-group metric and the Low-group metric.

        Two stages: a coarse COARSE_STEPS grid on [0, 1] for each group, then a
        FINE_STEPS grid within +/-FINE_MARGIN of the coarse optimum.

        Args:
            scores: positive-class scores of the concatenated validation samples.
            y: 0/1 labels aligned with `scores`.
            grp: "High"/"Low" group label per sample, aligned with `scores`.

        Returns:
            dict with tauH, tauL, the per-group metric values, the WG value and
            the pooled (High+Low) metric at the selected thresholds.

        Raises:
            RuntimeError: if either group has no samples.
        """
        maskH = (grp == "High"); maskL = (grp == "Low")
        if (maskH.sum()==0) or (maskL.sum()==0):
            raise RuntimeError("[WG] 連結valに High/Low の両群が必要")

        sH, yH = scores[maskH], y[maskH]
        sL, yL = scores[maskL], y[maskL]

        candH = _grid(0.0, 1.0, COARSE_STEPS)
        candL = _grid(0.0, 1.0, COARSE_STEPS)

        def _metric_vec(s, yy, cands):
            # Metric of one group at every candidate threshold.
            return np.array([_metric_binary(yy, (s >= t).astype(int)) for t in cands])

        mH = _metric_vec(sH, yH, candH)
        mL = _metric_vec(sL, yL, candL)

        # WG starts at -inf so the first evaluated candidate always replaces this placeholder.
        best = {"WG": -np.inf, "pooled": -np.inf, "tH": 0.5, "tL": 0.5, "mH": 0.0, "mL": 0.0}

        for i, tH in enumerate(candH):
            # For this tH, pair it with the tL giving the largest min(mH[i], mL).
            # NOTE(review): nanargmax returns the FIRST maximum, so when mH[i] is the
            # limiting value the smallest qualifying tL is taken; pooled is not used
            # to choose among equally good tL for a given tH — confirm this is intended.
            wg_row = np.minimum(mH[i], mL)
            j = int(np.nanargmax(wg_row))
            WG = float(wg_row[j])

            yhatH = (sH >= tH).astype(int)
            yhatL = (sL >= candL[j]).astype(int)
            # Metric over both groups together, used as the first tie-breaker.
            pooled = _metric_binary(
                np.concatenate([yH, yL]),
                np.concatenate([yhatH, yhatL]),
            )

            cand = {
                "WG": WG,
                "pooled": float(pooled),
                "tH": float(tH),
                "tL": float(candL[j]),
                "mH": float(mH[i]),
                "mL": float(mL[j]),
            }

            def _is_better(cur, new):
                # Preference order: higher WG, then higher pooled metric, then a
                # smaller |tH - tL| gap, then the lexicographically smaller (tH, tL).
                if new["WG"] > cur["WG"]: return True
                if new["WG"] < cur["WG"]: return False
                if new["pooled"] > cur["pooled"]: return True
                if new["pooled"] < cur["pooled"]: return False
                if abs(new["tH"]-new["tL"]) < abs(cur["tH"]-cur["tL"]): return True
                if abs(new["tH"]-new["tL"]) > abs(cur["tH"]-cur["tL"]): return False
                if (new["tH"], new["tL"]) < (cur["tH"], cur["tL"]): return True
                return False

            if _is_better(best, cand):
                best = cand

        # fine search
        lH, rH = max(0.0, best["tH"] - FINE_MARGIN), min(1.0, best["tH"] + FINE_MARGIN)
        lL, rL = max(0.0, best["tL"] - FINE_MARGIN), min(1.0, best["tL"] + FINE_MARGIN)
        candH2 = _grid(lH, rH, FINE_STEPS)
        candL2 = _grid(lL, rL, FINE_STEPS)

        # The coarse optimum is the incumbent; fine candidates must match or beat it.
        best2 = dict(best)

        def _metric_vec2(s, yy, cands):
            # Same computation as _metric_vec, applied to the fine grids.
            return np.array([_metric_binary(yy, (s >= t).astype(int)) for t in cands])

        mH2 = _metric_vec2(sH, yH, candH2)
        mL2 = _metric_vec2(sL, yL, candL2)

        for i, tH in enumerate(candH2):
            wg_row = np.minimum(mH2[i], mL2)
            j = int(np.nanargmax(wg_row))
            WG = float(wg_row[j])

            yhatH = (sH >= tH).astype(int)
            yhatL = (sL >= candL2[j]).astype(int)
            pooled = _metric_binary(
                np.concatenate([yH, yL]),
                np.concatenate([yhatH, yhatL]),
            )

            cand = {
                "WG": WG,
                "pooled": float(pooled),
                "tH": float(tH),
                "tL": float(candL2[j]),
                "mH": float(mH2[i]),
                "mL": float(mL2[j]),
            }

            # NOTE(review): unlike the coarse stage, the gap comparison here uses "<=",
            # so on a full tie the LATER fine candidate replaces the incumbent and there
            # is no lexicographic tie-break — confirm the asymmetry is intended.
            if (cand["WG"] > best2["WG"] or
                (cand["WG"] == best2["WG"] and
                 (cand["pooled"] > best2["pooled"] or
                  (cand["pooled"] == best2["pooled"] and
                   abs(cand["tH"]-cand["tL"]) <= abs(best2["tH"]-best2["tL"])) ))):
                best2 = cand

        return {
            "tauH": best2["tH"], "tauL": best2["tL"],
            f"{METRIC_LABEL}_H_val": best2["mH"],
            f"{METRIC_LABEL}_L_val": best2["mL"],
            f"WG_{METRIC_LABEL}_val": best2["WG"],
            f"{METRIC_LABEL}_pooled_val": best2["pooled"],
        }
    
    def _group_opt(scores: np.ndarray, y: np.ndarray, grp: np.ndarray):
        """
        Search group-wise thresholds (tauH, tauL) that maximise the pooled
        (High + Low together) metric selected by MODE_TAG.

        Two stages: a coarse COARSE_STEPS x COARSE_STEPS grid on [0, 1]^2, then a
        FINE_STEPS x FINE_STEPS grid within +/-FINE_MARGIN of the coarse optimum.
        A candidate only replaces the incumbent when it is strictly better, so
        ties resolve to the first pair in (tauH, tauL) scan order.

        The pooled confusion matrix for a pair (tauH, tauL) is the sum of the
        High-group counts at tauH and the Low-group counts at tauL. The counts
        are therefore computed once per threshold and per group instead of
        once per threshold pair; the selected thresholds are unchanged.

        Args:
            scores: positive-class scores of the concatenated validation samples.
            y: 0/1 labels aligned with `scores`.
            grp: "High"/"Low" group label per sample, aligned with `scores`.

        Returns:
            dict with "tauH", "tauL" and the pooled metric under f"{METRIC_LABEL}_val".

        Raises:
            RuntimeError: if either group has no samples.
        """
        maskH = (grp == "High"); maskL = (grp == "Low")
        if (maskH.sum()==0) or (maskL.sum()==0):
            raise RuntimeError("[Group] 連結valに High/Low の両群が必要")

        sH, yH = scores[maskH], y[maskH]
        sL, yL = scores[maskL], y[maskL]

        def _counts(s, yy, cands):
            # One (TP, FP, FN, TN) row per candidate threshold for a single group.
            return np.array(
                [_conf_from_preds(yy, (s >= t).astype(int)) for t in cands],
                dtype=float,
            )

        def _search(candH, candL, incumbent):
            # Scan the candH x candL grid; keep the first strictly better pair.
            cntH = _counts(sH, yH, candH)
            cntL = _counts(sL, yL, candL)
            for i, tH in enumerate(candH):
                # Pooled counts for this tH against every tL (row-wise sum).
                pooled_counts = cntH[i] + cntL
                vals = np.array([_metric_from_conf(*row) for row in pooled_counts])
                jmax = int(np.nanargmax(vals))
                if float(vals[jmax]) > incumbent["val"]:
                    incumbent = {"val": float(vals[jmax]), "tH": float(tH), "tL": float(candL[jmax])}
            return incumbent

        # Coarse stage over the full [0, 1] range for both groups.
        best = _search(
            _grid(0.0, 1.0, COARSE_STEPS),
            _grid(0.0, 1.0, COARSE_STEPS),
            {"val": -np.inf, "tH": None, "tL": None},
        )

        # Fine stage around the coarse optimum; the coarse result stays the incumbent.
        lH, rH = max(0.0, best["tH"] - FINE_MARGIN), min(1.0, best["tH"] + FINE_MARGIN)
        lL, rL = max(0.0, best["tL"] - FINE_MARGIN), min(1.0, best["tL"] + FINE_MARGIN)
        best2 = _search(
            _grid(lH, rH, FINE_STEPS),
            _grid(lL, rL, FINE_STEPS),
            dict(best),
        )

        return {"tauH": best2["tH"], "tauL": best2["tL"], f"{METRIC_LABEL}_val": best2["val"]}
    
    def _attr_independent_opt(scores: np.ndarray, y: np.ndarray, grp: np.ndarray):
        """
        Optimise the threshold of each attribute group independently.

        The samples are split into "High" and "Low"; for each group the tau that
        maximises that group's own metric (F1 or BA, per MODE_TAG) is found with
        the same coarse-then-fine grid search used for the single threshold.

        Returns:
            dict with "tauH", "tauL" and the per-group best metric values.

        Raises:
            RuntimeError: if either group has no samples.
        """
        maskH = (grp == "High"); maskL = (grp == "Low")
        if (maskH.sum()==0) or (maskL.sum()==0):
            raise RuntimeError("[Attr] 連結valに High/Low の両群が必要")

        def _best_for_mask(mask):
            group_scores, group_y = scores[mask], y[mask]
            if group_scores.size == 0:
                raise RuntimeError("[Attr] ある属性群の val が空です。")

            def _scan(cands):
                # Evaluate every candidate and return (best threshold, best metric).
                vals = np.array(
                    [_metric_binary(group_y, (group_scores >= t).astype(int)) for t in cands]
                )
                k = int(np.nanargmax(vals))
                return float(cands[k]), float(vals[k])

            tau0, _ = _scan(_grid(0.0, 1.0, COARSE_STEPS))
            lo = max(0.0, tau0 - FINE_MARGIN)
            hi = min(1.0, tau0 + FINE_MARGIN)
            return _scan(_grid(lo, hi, FINE_STEPS))

        tauH, bestH = _best_for_mask(maskH)
        tauL, bestL = _best_for_mask(maskL)

        return {
            "tauH": tauH,
            "tauL": tauL,
            f"{METRIC_LABEL}_H_val": bestH,
            f"{METRIC_LABEL}_L_val": bestL,
        }
    
    # ---------------- outer LOSO with inner concatenation ----------------
    # For every outer fold (one held-out subject):
    #   1. run an inner LOSO over the remaining subjects and concatenate all
    #      inner validation scores,
    #   2. pick thresholds on those concatenated scores with the four strategies
    #      (single / WG / group / attr),
    #   3. refit on the full outer-train set and apply the thresholds to the
    #      held-out subject.
    logo_outer = LeaveOneGroupOut()
    rows, pred_rows = [], []

    for fold_id, (tr_idx, te_idx) in enumerate(logo_outer.split(X_k, y_base.values, g_base.values), start=1):
        # Boolean masks aligned with g_base's index, built from positional indices.
        train_mask = pd.Series(False, index=g_base.index); train_mask.iloc[tr_idx] = True
        test_mask  = pd.Series(False, index=g_base.index);  test_mask.iloc[te_idx]  = True
        # The test split of LeaveOneGroupOut contains a single group, so the first ID identifies it.
        test_sid = g_base.iloc[te_idx].iloc[0]

        inner_ids   = sorted(g_base[train_mask].unique())
        inner_folds = choose_inner_folds_loso(inner_ids)

        # Accumulators for the concatenated inner-validation results.
        val_scores_all, val_y_all, val_grp_all = [], [], []

        # Group-wise thresholds need samples from both groups on the training side.
        inner_train_groups = fair_groups[train_mask]
        if not (("High" in set(inner_train_groups)) and ("Low" in set(inner_train_groups))):
            raise RuntimeError(f"[Cell6] fold{fold_id}: inner-train に両群(High/Low)が必要")

        for inner_val in inner_folds:
            # Validation = this inner subject; training = the other outer-train subjects.
            val_mask  = g_base.isin(inner_val) & train_mask
            trn_mask  = train_mask & (~val_mask)
            if not trn_mask.any() or not val_mask.any():
                continue

            X_tr, y_tr = X_k[trn_mask], y_base[trn_mask]
            X_vl, y_vl = X_k[val_mask], y_base[val_mask]
            grp_vl     = fair_groups[val_mask].to_numpy()

            model_inner = fit_classifier(X_tr, y_tr)
            sc_vl = predict_positive_score(model_inner, X_vl).astype(float)

            val_scores_all.append(sc_vl)
            val_y_all.append(y_vl.to_numpy())
            val_grp_all.append(grp_vl)

        if not val_scores_all:
            raise RuntimeError(f"[Cell6] fold{fold_id}: inner val が空です。")

        val_scores = np.concatenate(val_scores_all)
        val_y = np.concatenate(val_y_all)
        val_grp = np.concatenate(val_grp_all)

        # Threshold selection on the concatenated inner-validation scores.
        res_single = _single_tau_opt(val_scores, val_y)
        res_wg     = _wg_opt_joint(val_scores, val_y, val_grp)
        res_group  = _group_opt(val_scores, val_y, val_grp)
        res_attr   = _attr_independent_opt(val_scores, val_y, val_grp)

        # outer train/test
        X_tr_out, y_tr_out = X_k[train_mask], y_base[train_mask]
        X_te_out, y_te_out = X_k[test_mask], y_base[test_mask]
        grp_te_out = fair_groups[test_mask].to_numpy()

        model_outer = fit_classifier(X_tr_out, y_tr_out)
        sc_te = predict_positive_score(model_outer, X_te_out).astype(float)

        tau_single      = float(res_single["tau"])
        tau_high_group  = float(res_group["tauH"])
        tau_low_group   = float(res_group["tauL"])
        tau_high_wg     = float(res_wg["tauH"])
        tau_low_wg      = float(res_wg["tauL"])
        tau_high_attr   = float(res_attr["tauH"])
        tau_low_attr    = float(res_attr["tauL"])

        # Apply the thresholds to the held-out subject. For the group-wise strategies
        # the High threshold is used where the group is "High", otherwise the Low one.
        yhat_single = (sc_te >= tau_single).astype(int)
        yhat_group  = np.where(
            grp_te_out == "High",
            (sc_te >= tau_high_group).astype(int),
            (sc_te >= tau_low_group).astype(int),
        )
        yhat_wg = np.where(
            grp_te_out == "High",
            (sc_te >= tau_high_wg).astype(int),
            (sc_te >= tau_low_wg).astype(int),
        )
        yhat_attr = np.where(
            grp_te_out == "High",
            (sc_te >= tau_high_attr).astype(int),
            (sc_te >= tau_low_attr).astype(int),
        )

        # Per-fold test metric (F1 or BA, per MODE_TAG) for each strategy.
        metric_single = _metric_binary(y_te_out, yhat_single)
        metric_group  = _metric_binary(y_te_out, yhat_group)
        metric_wg     = _metric_binary(y_te_out, yhat_wg)
        metric_attr   = _metric_binary(y_te_out, yhat_attr)

        # One row per test sample; scalar values (thresholds, fold metrics) are
        # broadcast to every row of this fold.
        pred_rows.append(pd.DataFrame({
            "fold_id": fold_id,
            "test_id": test_sid,
            "group": grp_te_out,
            "y_true": y_te_out,
            "proba": sc_te,
            "tau_single": tau_single,
            "tau_high_group": tau_high_group,
            "tau_low_group": tau_low_group,
            "tau_high_wg": tau_high_wg,
            "tau_low_wg": tau_low_wg,
            "tau_high_attr": tau_high_attr,
            "tau_low_attr": tau_low_attr,
            "y_pred_single": yhat_single,
            "y_pred_group": yhat_group,
            "y_pred_wg": yhat_wg,
            "y_pred_attr": yhat_attr,
            f"metric_single_{MODE_TAG}": metric_single,
            f"metric_group_{MODE_TAG}": metric_group,
            f"metric_wg_{MODE_TAG}": metric_wg,
            f"metric_attr_{MODE_TAG}": metric_attr,
        }))

        # Mode-tagged column names of the per-fold threshold table
        # (e.g. "tau_high_GroupF1" when MODE_TAG is "F1").
        col_high_group = f"tau_high_Group{MODE_TAG}"
        col_low_group  = f"tau_low_Group{MODE_TAG}"
        col_high_wg    = f"tau_high_WG{MODE_TAG}"
        col_low_wg     = f"tau_low_WG{MODE_TAG}"
        col_high_attr  = f"tau_high_Attr{MODE_TAG}"
        col_low_attr   = f"tau_low_Attr{MODE_TAG}"

        # One summary row per outer fold: chosen thresholds and their validation metrics.
        rows.append({
            "fold_id": fold_id,
            "test_id": test_sid,
            "tau_single": tau_single,
            col_high_group: tau_high_group,
            col_low_group:  tau_low_group,
            col_high_wg:    tau_high_wg,
            col_low_wg:     tau_low_wg,
            col_high_attr:  tau_high_attr,
            col_low_attr:   tau_low_attr,
            f"val_single_{MODE_TAG}": res_single[f"{METRIC_LABEL}_val"],
            f"val_group_{MODE_TAG}":  res_group[f"{METRIC_LABEL}_val"],
            f"val_wg_{MODE_TAG}":     res_wg[f"WG_{METRIC_LABEL}_val"],
            f"val_attr_H_{MODE_TAG}": res_attr[f"{METRIC_LABEL}_H_val"],
            f"val_attr_L_{MODE_TAG}": res_attr[f"{METRIC_LABEL}_L_val"],
        })
    
    # ---------------- Result CSV output + reference value at the fixed 0.5 threshold ----------------
    df_pred = pd.concat(pred_rows, ignore_index=True)
    df_rows = pd.DataFrame(rows)

    # F1 at fixed threshold 0.5 (reference)
    # The pooled labels are kept under their own name. Binding them to `y_all`
    # would overwrite the notebook-wide label vector that this cell reads at its
    # top (`y_all.astype(int)`), replacing it with a fold-ordered ndarray and
    # breaking any later use or re-run of the cell.
    y_true_fixed05 = df_pred["y_true"].to_numpy(int)
    y_pred_fixed05 = (df_pred["proba"] >= 0.5).astype(int)
    try:
        # F1 is only reported when both classes are present in the pooled labels.
        if np.unique(y_true_fixed05).size == 2:
            f1_fixed05 = skm.f1_score(y_true_fixed05, y_pred_fixed05)
        else:
            f1_fixed05 = float("nan")
    except Exception as e:
        # Reference value only: report the problem and continue with NaN.
        print(f"[Cell6][WARN] F1@0.5 could not be computed: {e}")
        f1_fixed05 = float("nan")
    df_pred["y_pred_fixed05"] = y_pred_fixed05
    print(f"[Cell6] F1@0.5 = {f1_fixed05:.4f} (overall)")

    # Per-sample predictions and per-fold thresholds go to MODE_DIR/GROUP_AWARE.
    pred_path = groupaware_out("GROUP_AWARE_PREDICTIONS.CSV")
    df_pred.to_csv(pred_path, index=False, encoding="utf-8-sig")
    fold_path = groupaware_out("GROUP_AWARE_THRESH_BY_FOLD.CSV")
    df_rows.to_csv(fold_path, index=False, encoding="utf-8-sig")

    print(f"[Cell6] predictions -> {pred_path}")
    print(f"[Cell6] thresholds  -> {fold_path}")
    print(f"[Cell6] DONE. MODE={MODE_TAG}, k={len(feats_k)} features.")
    


In [None]:
# ===== Cell 6-Neutral: F1/BA メトリクス計算＋混同行列図＋ROC 図 =====
RUN_CELL6_NEUTRAL = bool(globals().get('RUN_CELL6_NEUTRAL', True))
if not RUN_CELL6_NEUTRAL:
    print('[Cell6N-19] RUN_CELL6_NEUTRAL=False -> skip')
else:
    set_cell_output(6)
    
    import os
    import numpy as np
    import pandas as pd
    import sklearn.metrics as skm
    import matplotlib.pyplot as plt
    
    # ---------------- モード切替（F1 / BA） ----------------
    CELL6_MODE = str(globals().get("CELL6_MODE", "F1")).upper()
    if CELL6_MODE not in {"F1", "BA"}:
        raise ValueError(f"[Cell6N.1] CELL6_MODE は 'F1' または 'BA' を指定してください（今: {CELL6_MODE}）")
    MODE_TAG = CELL6_MODE
    METRIC_LABEL = MODE_TAG
    
    # 描画体裁
    LW = 1.5
    FS_TITLE, FS_LABEL, FS_LEGEND, FS_TICK = 30, 24, 20, 20
    
    # ---------------- パス設定 ----------------
    CELL6_ROOT = OUT_DIR              # 例: .../Cell6
    MODE_DIR   = os.path.join(CELL6_ROOT, MODE_TAG)
    GROUP_AWARE_DIR = os.path.join(MODE_DIR, "GROUP_AWARE")
    os.makedirs(GROUP_AWARE_DIR, exist_ok=True)
    
    def cell6_out(filename: str) -> str:
        """Return the path of `filename` under MODE_DIR, creating its parent directory if needed."""
        target = os.path.join(MODE_DIR, filename)
        parent_dir = os.path.dirname(target)
        os.makedirs(parent_dir, exist_ok=True)
        return target
    
    MAT_DIR = os.path.join(MODE_DIR, f"MATRIX_{MODE_TAG}")
    os.makedirs(MAT_DIR, exist_ok=True)
    
    pred_path = os.path.join(GROUP_AWARE_DIR, "GROUP_AWARE_PREDICTIONS.CSV")
    fold_path = os.path.join(GROUP_AWARE_DIR, "GROUP_AWARE_THRESH_BY_FOLD.CSV")
    if not (os.path.exists(pred_path) and os.path.exists(fold_path)):
        raise FileNotFoundError(f"[Cell6N.1] 必要CSVが見つからない（先に Cell6 {MODE_TAG} を実行しておくこと）")
    
    df_pred = pd.read_csv(pred_path, encoding="utf-8-sig")
    df_fold = pd.read_csv(fold_path, encoding="utf-8-sig")
    
    # ---------------- メトリクス用ユーティリティ ----------------
    def _conf_from_preds(y_true: np.ndarray, y_pred: np.ndarray):
        """Return the binary confusion counts as (TP, FP, FN, TN) for labels 0/1."""
        cm = skm.confusion_matrix(y_true, y_pred, labels=[0, 1])
        # sklearn layout: rows = true class, columns = predicted class.
        TN, FP = cm[0]
        FN, TP = cm[1]
        return TP, FP, FN, TN
    
    def _metric_from_conf(TP, FP, FN, TN) -> float:
        """
        Compute the metric selected by MODE_TAG from confusion counts.

        "F1" -> 2*TP / (2*TP + FP + FN); anything else -> balanced accuracy
        (mean of TPR and TNR). An undefined ratio (zero denominator) counts as 0.0.
        """
        tp, fp, fn, tn = (float(v) for v in (TP, FP, FN, TN))
        if MODE_TAG != "F1":
            # BA
            sensitivity = tp / (tp + fn) if (tp + fn) > 0 else 0.0
            specificity = tn / (tn + fp) if (tn + fp) > 0 else 0.0
            return 0.5 * (sensitivity + specificity)
        f1_denom = (2 * tp + fp + fn)
        if f1_denom > 0:
            return 2 * tp / f1_denom
        return 0.0
    
    def _metric_binary(y_true: np.ndarray, y_pred: np.ndarray) -> float:
        """Score binary predictions with the metric selected by MODE_TAG (F1 or BA)."""
        return _metric_from_conf(*_conf_from_preds(y_true, y_pred))
    
    # ---------------- AUC + ROC figure ----------------
    # Pooled out-of-fold labels, scores and subject IDs from the Cell 6 predictions CSV.
    y_pool = df_pred["y_true"].to_numpy()
    s_pool = df_pred["proba"].to_numpy()
    subj_pool = df_pred["test_id"].to_numpy()

    if len(np.unique(y_pool)) < 2:
        raise RuntimeError("[Cell6N.1] 真値が単一クラスのため ROC-AUC を計算できません。")

    auc_obs = float(skm.roc_auc_score(y_pool, s_pool))

    # Subject-level (cluster) bootstrap: whole subjects are resampled with
    # replacement so that samples of one subject stay together.
    rng = np.random.default_rng(20251101)
    df_pool = pd.DataFrame({"subject": subj_pool, "y_true": y_pool, "y_score": s_pool})
    subj_ids = df_pool["subject"].unique()

    # Slice each subject's labels/scores once, outside the resampling loop;
    # filtering the frame again for every sampled subject in every iteration
    # repeats the same work thousands of times.
    per_subject = {}
    for sid in subj_ids:
        sel = (df_pool["subject"] == sid).to_numpy()
        per_subject[sid] = (y_pool[sel], s_pool[sel])

    auc_boot = []
    for _ in range(2000):
        sampled = rng.choice(subj_ids, size=len(subj_ids), replace=True)
        y_boot = np.concatenate([per_subject[sid][0] for sid in sampled])
        s_boot = np.concatenate([per_subject[sid][1] for sid in sampled])
        # AUC is undefined when a replicate contains a single class; skip it.
        if np.unique(y_boot).size < 2:
            continue
        auc_boot.append(float(skm.roc_auc_score(y_boot, s_boot)))

    # Percentile 95% interval over the valid replicates (NaN if none were valid).
    if auc_boot:
        ci_low = float(np.quantile(auc_boot, 0.025))
        ci_high = float(np.quantile(auc_boot, 0.975))
    else:
        ci_low = ci_high = float("nan")

    pd.DataFrame([{
        "mode": MODE_TAG,
        "auc": auc_obs,
        "ci_low": ci_low,
        "ci_high": ci_high,
    }]).to_csv(cell6_out(f"AUC_K_CI_{MODE_TAG}.csv"), index=False, encoding="utf-8-sig")

    print(f"[Cell6N.1] AUC={auc_obs:.4f} (95% CI [{ci_low:.4f}, {ci_high:.4f}])")
    
    fpr, tpr, _ = skm.roc_curve(y_pool, s_pool)
    plt.figure(figsize=(7, 7))
    plt.plot(fpr, tpr, label=f"AUC = {auc_obs:.3f}")
    plt.plot([0, 1], [0, 1], linestyle="--", linewidth=LW, color="gray", label="Chance")
    plt.xlabel("False Positive Rate", fontsize=FS_LABEL)
    plt.ylabel("True Positive Rate", fontsize=FS_LABEL)
    plt.title(f"ROC Curve (Best Subset, {MODE_TAG})", fontsize=FS_TITLE)
    plt.legend(loc="lower right", fontsize=FS_LEGEND)
    plt.grid(True, alpha=0.4)
    plt.xticks(fontsize=FS_TICK)
    plt.yticks(fontsize=FS_TICK)
    plt.tight_layout()
    roc_path = cell6_out(f"AUC_K_CI_{MODE_TAG}.png")
    plt.savefig(roc_path, dpi=300)
    plt.close()
    print(f"[Cell6N.1] ROC 図を保存 -> {roc_path}")
    
    # ---------------- F1/BA メトリクス & 混同行列図 ----------------
    summary_rows = []
    y_true_all = df_pred["y_true"].to_numpy(int)
    
    # クラスラベル（必要なら「Not CS」「CS」などに変更）
    CLASS_LABELS = ["Non-Sick", "Sick"]
    
    def _plot_confusion(cm_counts: np.ndarray, key: str, metric_val: float):
        """
        Draw and save a 2x2 confusion-matrix heatmap for one decision rule.

        Args:
            cm_counts: counts laid out as [[TN, FP], [FN, TP]]
                (rows = true label, columns = predicted label).
            key: decision-rule name; used upper-cased in the title and file name.
            metric_val: metric value shown in the title.

        Each row is normalised to sum to 1 (per true label) for the colour scale,
        and every cell is annotated with its name, raw count and row percentage.
        The figure is written to MAT_DIR as CONF_MATRIX_<MODE_TAG>_<KEY>.png.
        """
        # ---- Normalise per true label (each row sums to 1); empty rows stay at 0 ----
        counts_f = cm_counts.astype(float)
        per_true_total = counts_f.sum(axis=1, keepdims=True)
        ratios = np.divide(
            counts_f,
            per_true_total,
            out=np.zeros_like(counts_f),
            where=per_true_total != 0,
        )

        # Cell names (row = true, column = predicted)
        cell_names = (("TN", "FP"), ("FN", "TP"))

        fig, ax = plt.subplots(figsize=(6, 6))

        # Fixed 0..1 colour scale, with a slightly shorter and thinner colour bar
        heat = ax.imshow(ratios, vmin=0.0, vmax=1.0, cmap="Blues")
        colorbar = fig.colorbar(heat, ax=ax, shrink=0.8, fraction=0.046, pad=0.04)
        colorbar.ax.tick_params(labelsize=FS_TICK)

        # Axis ticks (x = predicted, y = true)
        ax.set_xticks([0, 1])
        ax.set_xticklabels(CLASS_LABELS, fontsize=FS_TICK)
        ax.set_yticks([0, 1])
        ax.set_yticklabels(CLASS_LABELS, fontsize=FS_TICK, rotation=90, va="center")

        # Annotate every cell like "TN\n40 (50%)", a little smaller than the legend font
        annotation_size = FS_LEGEND - 2
        for row, col in np.ndindex(2, 2):
            share = ratios[row, col]              # row-normalised fraction (0..1)
            count = int(cm_counts[row, col])      # raw count
            percent = share * 100.0
            # White text on dark cells, black on light ones.
            text_color = "white" if share >= 0.5 else "black"
            ax.text(
                col, row,
                f"{cell_names[row][col]}\n{count} ({percent:.0f}%)",
                ha="center", va="center",
                fontsize=annotation_size,
                color=text_color,
            )

        # Axis labels; a small labelpad keeps "True Label" close to the plot
        ax.set_xlabel("Predicted Label", fontsize=FS_LABEL)
        ax.set_ylabel("True Label", fontsize=FS_LABEL,
                      rotation=90, labelpad=10)

        # Title, raised slightly above the axes
        title_size = 26
        ax.set_title(
            f"{MODE_TAG} — {key.upper()}  ({METRIC_LABEL}={metric_val:.3f})",
            fontsize=title_size,
            pad=20
        )

        fig.tight_layout()
        out_path = os.path.join(MAT_DIR, f"CONF_MATRIX_{MODE_TAG}_{key.upper()}.png")
        fig.savefig(out_path, dpi=300)
        plt.close(fig)
        print(f"[Cell6N.1] Confusion matrix ({key}) -> {out_path}")
    
    
    for key, pred_col in [
        ("single", "y_pred_single"),
        ("group",  "y_pred_group"),
        ("wg",     "y_pred_wg"),
        ("attr",   "y_pred_attr"),
    ]:
        if pred_col not in df_pred.columns:
            print(f"[Cell6N.1] 列 {pred_col} が存在しないため {key} はスキップ")
            continue
    
        y_pred_all = df_pred[pred_col].to_numpy(int)
    
        # メトリクス用（カウント）
        TP, FP, FN, TN = _conf_from_preds(y_true_all, y_pred_all)
        metric_val = _metric_from_conf(TP, FP, FN, TN)
    
        # 混同行列のカウント（行=True, 列=Pred）
        cm_counts = skm.confusion_matrix(y_true_all, y_pred_all, labels=[0, 1])
    
        summary_rows.append({
            "mode": MODE_TAG,
            "decision": key,
            "TP": TP, "FP": FP, "FN": FN, "TN": TN,
            METRIC_LABEL: metric_val,
        })
    
        print(
            f"[Cell6N.1] {MODE_TAG}-{key.upper()} => "
            f"{METRIC_LABEL}={metric_val:.4f}  TP={TP}, FP={FP}, FN={FN}, TN={TN}"
        )
        _plot_confusion(cm_counts, key, metric_val)
    
    summary_df = pd.DataFrame(summary_rows)
    summary_df.to_csv(
        cell6_out(f"METRICS_SUMMARY_{MODE_TAG}.csv"),
        index=False, encoding="utf-8-sig"
    )
    print(f"[Cell6N.1] METRICS_SUMMARY_{MODE_TAG}.csv を保存しました。")
    


In [None]:
# ===== Cell 6-Neutral: predicted-probability distribution plots (F1/BA switchable) =====
# The whole cell is skipped when the notebook-level flag RUN_CELL6_NEUTRAL is falsy.
RUN_CELL6_NEUTRAL = bool(globals().get('RUN_CELL6_NEUTRAL', True))
if not RUN_CELL6_NEUTRAL:
    print('[Cell6N-20] RUN_CELL6_NEUTRAL=False -> skip')
else:
    # set_cell_output is defined elsewhere in the notebook; presumably it points
    # OUT_DIR at the Cell6 output folder -- TODO confirm.
    set_cell_output(6)

    import os
    import numpy as np
    import pandas as pd
    import matplotlib.pyplot as plt

    # ---------------- Mode switch (F1 / BA) ----------------
    # CELL6_MODE picks the sub-folder of the Cell6 output that is read and written.
    CELL6_MODE = str(globals().get("CELL6_MODE", "F1")).upper()
    if CELL6_MODE not in {"F1", "BA"}:
        raise ValueError(f"[Cell6N.2] CELL6_MODE は 'F1' または 'BA' を指定してください（今: {CELL6_MODE}）")

    # ---------- Plot settings ----------
    BINS = 40   # number of histogram bins
    LW = 1.5    # line width of the threshold lines
    FS_TITLE, FS_LABEL, FS_LEGEND, FS_TICK = 30, 24, 20, 20  # font sizes

    COLOR_SICK = "red"     # histogram of samples whose true label is Sick
    COLOR_NON  = "blue"    # histogram of samples whose true label is Non-Sick

    COLOR_SINGLE = "black" # single threshold
    COLOR_GROUP  = "green" # group thresholds
    COLOR_WG     = "purple"# WG thresholds

    # ---------- Directories ----------
    RUN_ROOT = OUT_DIR  # Level1/Cell6
    MODE_DIR = os.path.join(RUN_ROOT, CELL6_MODE)
    IMG_DIR  = os.path.join(MODE_DIR, f"PROBA_DIST_{CELL6_MODE}")
    os.makedirs(IMG_DIR, exist_ok=True)

    # Output paths of the four summary figures.
    SAVE_OVERALL_SvG   = os.path.join(IMG_DIR, "OVERALL_SvGroup.png")
    SAVE_OVERALL_SvWG  = os.path.join(IMG_DIR, "OVERALL_SvWG.png")
    SAVE_BYGROUP_SvG   = os.path.join(IMG_DIR, "BYGROUP_SvGroup.png")
    SAVE_BYGROUP_SvWG  = os.path.join(IMG_DIR, "BYGROUP_SvWG.png")

    GROUP_AWARE_DIR = os.path.join(MODE_DIR, "GROUP_AWARE")
    os.makedirs(GROUP_AWARE_DIR, exist_ok=True)

    # Build a path inside GROUP_AWARE_DIR for the given file name.
    def groupaware_path(filename: str) -> str:
        return os.path.join(GROUP_AWARE_DIR, filename)

    # Both CSVs must already exist; the error message tells the user to run
    # the Cell6 step for the selected mode first.
    pred_path = groupaware_path("GROUP_AWARE_PREDICTIONS.CSV")
    fold_path = groupaware_path("GROUP_AWARE_THRESH_BY_FOLD.CSV")
    if not (os.path.exists(pred_path) and os.path.exists(fold_path)):
        raise FileNotFoundError(f"[Cell6N.2] 必要CSVが見つからない（先に Cell6 {CELL6_MODE} を実行）")

    df_pred = pd.read_csv(pred_path, encoding="utf-8-sig")
    df_fold = pd.read_csv(fold_path, encoding="utf-8-sig")

    # ---------- Automatic mode detection (F1 or BA, decided by column names) ----------
    cols_f1 = {"high":"tau_high_GroupF1", "low":"tau_low_GroupF1",
               "wgh":"tau_high_WGF1",     "wgl":"tau_low_WGF1"}
    cols_ba = {"high":"tau_high_GroupBA", "low":"tau_low_GroupBA",
               "wgh":"tau_high_WGBA",     "wgl":"tau_low_WGBA"}

    # Only the Group columns are required here; the WG columns are optional
    # and are checked again where they are used. F1 columns take precedence
    # when both sets are present.
    if all(c in df_fold.columns for c in [cols_f1["high"], cols_f1["low"]]):
        mode = "F1"
        c_high, c_low, c_wgh, c_wgl = (
            cols_f1["high"], cols_f1["low"], cols_f1["wgh"], cols_f1["wgl"]
        )
    elif all(c in df_fold.columns for c in [cols_ba["high"], cols_ba["low"]]):
        mode = "BA"
        c_high, c_low, c_wgh, c_wgl = (
            cols_ba["high"], cols_ba["low"], cols_ba["wgh"], cols_ba["wgl"]
        )
    else:
        raise RuntimeError("[Cell6N.2] しきい値列が見つからない（F1/BAどちらかのCell6出力が必要）")
    
    # ---------- 集約: 中央値/IQR（Q1〜Q3） ----------  
    def _qstats(s):
        s = pd.to_numeric(s, errors="coerce")
        s = s[np.isfinite(s)]
        if s.size == 0:
            return np.nan, np.nan, np.nan, np.nan
        med = float(np.nanmedian(s))
        q1, q3 = np.nanpercentile(s, [25, 75])
        half = float((q3 - q1)/2.0)
        return float(med), float(q1), float(q3), half
    
    # Across-fold median and quartiles of each threshold column; the fourth
    # value returned by _qstats (half IQR) is not needed here.
    tau_single_med, tau_single_q1, tau_single_q3, _ = _qstats(df_fold["tau_single"])
    tau_high_med,   tau_high_q1,   tau_high_q3,   _ = _qstats(df_fold[c_high])
    tau_low_med,    tau_low_q1,    tau_low_q3,    _ = _qstats(df_fold[c_low])
    # The WG columns are optional: fall back to NaN when they are missing.
    tau_high_wg,    tau_high_wg_q1, tau_high_wg_q3, _ = (
        _qstats(df_fold[c_wgh]) if c_wgh in df_fold.columns else (np.nan, np.nan, np.nan, np.nan)
    )
    tau_low_wg,     tau_low_wg_q1,  tau_low_wg_q3,  _ = (
        _qstats(df_fold[c_wgl]) if c_wgl in df_fold.columns else (np.nan, np.nan, np.nan, np.nan)
    )

    # ---------- Split the prediction data ----------
    proba = pd.to_numeric(df_pred["proba"], errors="coerce").values
    ytrue = pd.to_numeric(df_pred["y_true"], errors="coerce").values.astype(int)
    grp   = df_pred["group"].astype(str).str.strip()

    # Scores of the positive (1) and negative (0) class, and their counts.
    p_sick = proba[ytrue == 1]
    p_non  = proba[ytrue == 0]
    n_sick, n_non = len(p_sick), len(p_non)
    # Row masks for the two groups, matched on the exact labels "High" / "Low".
    maskH = (grp == "High"); maskL = (grp == "Low")
    
    # ---------- Utilities ----------
    def _style_axes(ax, title=None):
        """Apply the shared title, axis labels, tick size and 0-1 x-range to ``ax``."""
        if title:
            ax.set_title(title, fontsize=FS_TITLE)
        for set_label, text in ((ax.set_xlabel, "Predicted probability"),
                                (ax.set_ylabel, "Density")):
            set_label(text, fontsize=FS_LABEL)
        ax.tick_params(axis="both", labelsize=FS_TICK)
        ax.set_xlim(0, 1)
    
    def _hist_overall(ax):
        """Overlay the density histograms of p_sick and p_non on ``ax``."""
        for sample, count, class_name, colour in (
            (p_sick, n_sick, "Sick", COLOR_SICK),
            (p_non, n_non, "Non-Sick", COLOR_NON),
        ):
            ax.hist(sample, bins=BINS, density=True, alpha=0.5,
                    label=f"True:{class_name} (n={count})", color=colour)
    
    def _hist_bygroup(axes):
        """Draw per-group Sick/Non-Sick histograms: High on axes[0], Low on axes[1]."""
        for position, member, heading in ((0, maskH, "High group"),
                                          (1, maskL, "Low group")):
            panel = axes[position]
            sick_scores = proba[(ytrue==1) & member]
            non_scores  = proba[(ytrue==0) & member]
            panel.hist(sick_scores, bins=BINS, density=True, alpha=0.5,
                       label=f"True:Sick (n={len(sick_scores)})", color=COLOR_SICK)
            panel.hist(non_scores, bins=BINS, density=True, alpha=0.5,
                       label=f"True:Non-Sick (n={len(non_scores)})", color=COLOR_NON)
            _style_axes(panel, heading)
    
    def _vline_with_iqr(ax, x_med, q1, q3, color, ls, label_core):
        """Mark a threshold on ``ax``: a vertical line at the median and a shaded Q1-Q3 band.

        The line is drawn only when ``x_med`` is finite; its legend label
        carries the half-IQR when both quartiles are finite. The band is
        drawn whenever both quartiles are finite.
        """
        has_band = np.isfinite(q1) and np.isfinite(q3)
        if np.isfinite(x_med):
            text = f"{label_core} = {x_med:.3f}"
            if has_band:
                text = f"{text} ± {(q3-q1)/2:.3f}"
            ax.axvline(x_med, color=color, linestyle=ls, linewidth=LW, label=text)
        if has_band:
            ax.axvspan(q1, q3, color=color, alpha=0.12)
    
    # ---------- 1) OVERALL: Single vs Group ----------
    # All samples in one panel; the single threshold and the two group
    # thresholds are shown as median lines with their Q1-Q3 bands.
    fig, ax = plt.subplots(figsize=(9,6))
    _hist_overall(ax)
    _vline_with_iqr(ax, tau_single_med, tau_single_q1, tau_single_q3,
                    COLOR_SINGLE, "-", "τ_single")
    _vline_with_iqr(ax, tau_high_med,   tau_high_q1,   tau_high_q3,
                    COLOR_GROUP,  "--", f"τ_high_{mode}")
    _vline_with_iqr(ax, tau_low_med,    tau_low_q1,    tau_low_q3,
                    COLOR_GROUP,  "--", f"τ_low_{mode}")
    _style_axes(ax, title=f"Probability distribution (OVERALL) — Single vs Group [{mode}]")
    ax.legend(fontsize=FS_LEGEND)
    plt.tight_layout()
    plt.savefig(SAVE_OVERALL_SvG, dpi=300)
    plt.close()
    print(f"[Cell6N.2] Saved -> {SAVE_OVERALL_SvG}")

    # ---------- 2) OVERALL: Single vs WG ----------
    # Same panel, with the WG thresholds in place of the group thresholds.
    # WG lines are silently omitted when their statistics are NaN.
    fig, ax = plt.subplots(figsize=(9,6))
    _hist_overall(ax)
    _vline_with_iqr(ax, tau_single_med, tau_single_q1, tau_single_q3,
                    COLOR_SINGLE, "-", "τ_single")
    _vline_with_iqr(ax, tau_high_wg,    tau_high_wg_q1, tau_high_wg_q3,
                    COLOR_WG, ":", "τ_high_WG")
    _vline_with_iqr(ax, tau_low_wg,     tau_low_wg_q1,  tau_low_wg_q3,
                    COLOR_WG, ":", "τ_low_WG")
    _style_axes(ax, title=f"Probability distribution (OVERALL) — Single vs WG [{mode}]")
    ax.legend(fontsize=FS_LEGEND)
    plt.tight_layout()
    plt.savefig(SAVE_OVERALL_SvWG, dpi=300)
    plt.close()
    print(f"[Cell6N.2] Saved -> {SAVE_OVERALL_SvWG}")

    # ---------- 3) BY_GROUP: Single vs Group ----------
    # Two stacked panels (High on top, Low below). Each panel shows the single
    # threshold plus the group threshold of its own group only.
    fig, axes = plt.subplots(2, 1, figsize=(9,10), sharex=True)
    _hist_bygroup(axes)
    _vline_with_iqr(axes[0], tau_single_med, tau_single_q1, tau_single_q3,
                    COLOR_SINGLE, "-", "τ_single")
    _vline_with_iqr(axes[1], tau_single_med, tau_single_q1, tau_single_q3,
                    COLOR_SINGLE, "-", "τ_single")
    _vline_with_iqr(axes[0], tau_high_med, tau_high_q1, tau_high_q3,
                    COLOR_GROUP, "--", f"τ_high_{mode}")
    _vline_with_iqr(axes[1], tau_low_med,  tau_low_q1,  tau_low_q3,
                    COLOR_GROUP, "--", f"τ_low_{mode}")
    axes[0].legend(fontsize=FS_LEGEND)
    axes[1].legend(fontsize=FS_LEGEND)
    plt.tight_layout()
    plt.savefig(SAVE_BYGROUP_SvG, dpi=300)
    plt.close()
    print(f"[Cell6N.2] Saved -> {SAVE_BYGROUP_SvG}")

    # ---------- 4) BY_GROUP: Single vs WG ----------
    # Same two-panel layout with the WG thresholds.
    fig, axes = plt.subplots(2, 1, figsize=(9,10), sharex=True)
    _hist_bygroup(axes)
    _vline_with_iqr(axes[0], tau_single_med, tau_single_q1, tau_single_q3,
                    COLOR_SINGLE, "-", "τ_single")
    _vline_with_iqr(axes[1], tau_single_med, tau_single_q1, tau_single_q3,
                    COLOR_SINGLE, "-", "τ_single")
    _vline_with_iqr(axes[0], tau_high_wg, tau_high_wg_q1, tau_high_wg_q3,
                    COLOR_WG, ":", "τ_high_WG")
    _vline_with_iqr(axes[1], tau_low_wg,  tau_low_wg_q1,  tau_low_wg_q3,
                    COLOR_WG, ":", "τ_low_WG")
    axes[0].legend(fontsize=FS_LEGEND)
    axes[1].legend(fontsize=FS_LEGEND)
    plt.tight_layout()
    plt.savefig(SAVE_BYGROUP_SvWG, dpi=300)
    plt.close()
    print(f"[Cell6N.2] Saved -> {SAVE_BYGROUP_SvWG}")
    
    # ---------- 5) Per fold: OVERALL probability distribution with that fold's thresholds ----------
    def _save_fold_figure(save_path, sick_scores, non_scores, thresholds, title):
        """Plot Sick/Non-Sick score histograms with threshold lines and save the figure.

        ``thresholds`` is an iterable of ``(value, color, linestyle, label)``
        tuples; entries whose value is not finite are skipped.
        """
        fig, ax = plt.subplots(figsize=(9,6))
        ax.hist(sick_scores, bins=BINS, density=True, alpha=0.5,
                label=f"True:Sick (n={len(sick_scores)})", color=COLOR_SICK)
        ax.hist(non_scores, bins=BINS, density=True, alpha=0.5,
                label=f"True:Non-Sick (n={len(non_scores)})", color=COLOR_NON)
        for value, color, ls, label in thresholds:
            if np.isfinite(value):
                ax.axvline(value, color=color, linestyle=ls, linewidth=LW, label=label)
        _style_axes(ax, title=title)
        ax.legend(fontsize=FS_LEGEND)
        fig.tight_layout()
        fig.savefig(save_path, dpi=300)
        plt.close(fig)
        print(f"[Cell6N.2] Saved -> {save_path}")

    for row_pos, (_, row) in enumerate(df_fold.iterrows()):
        # fold_id may be missing or unparsable (e.g. NaN). In that case fid is
        # None and the row position stands in for the fold number, so the
        # ":02d" file-name formatting below cannot fail and names stay unique.
        try:
            fid = int(row["fold_id"]) if "fold_id" in row else None
        except (TypeError, ValueError, OverflowError):
            fid = None
        fold_no = fid if fid is not None else row_pos
        test_id = str(row.get("test_id", f"fold{fold_no}"))

        # Restrict the predictions to this fold when both tables carry a
        # usable fold_id; otherwise fall back to all predictions.
        if fid is not None and "fold_id" in df_pred.columns:
            sub = df_pred[df_pred["fold_id"] == fid]
        else:
            sub = df_pred.copy()

        p = pd.to_numeric(sub["proba"], errors="coerce").values
        yt = pd.to_numeric(sub["y_true"], errors="coerce").values.astype(int)
        p_s, p_n = p[yt==1], p[yt==0]

        # Thresholds of this fold; optional columns default to NaN (not drawn).
        t_single = float(row["tau_single"])
        t_high   = float(row[c_high]) if c_high in row else np.nan
        t_low    = float(row[c_low])  if c_low  in row else np.nan
        t_high_w = float(row[c_wgh])  if c_wgh  in row else np.nan
        t_low_w  = float(row[c_wgl])  if c_wgl  in row else np.nan

        p_sg  = os.path.join(IMG_DIR, f"FOLD{fold_no:02d}_{test_id}_OVERALL_SvGroup.png")
        p_wg  = os.path.join(IMG_DIR, f"FOLD{fold_no:02d}_{test_id}_OVERALL_SvWG.png")

        single_line = (t_single, COLOR_SINGLE, "-", f"τ_single = {t_single:.3f}")

        # Single vs Group
        _save_fold_figure(
            p_sg, p_s, p_n,
            [
                single_line,
                (t_high, COLOR_GROUP, "--", f"τ_high_{mode} = {t_high:.3f}"),
                (t_low,  COLOR_GROUP, "--", f"τ_low_{mode}  = {t_low:.3f}"),
            ],
            f"[Fold {fold_no}] OVERALL — Single vs Group [{mode}]  (test={test_id})",
        )

        # Single vs WG
        _save_fold_figure(
            p_wg, p_s, p_n,
            [
                single_line,
                (t_high_w, COLOR_WG, ":", f"τ_high_WG = {t_high_w:.3f}"),
                (t_low_w,  COLOR_WG, ":", f"τ_low_WG  = {t_low_w:.3f}"),
            ],
            f"[Fold {fold_no}] OVERALL — Single vs WG [{mode}]  (test={test_id})",
        )

    print(f"[Cell6N.2] All images saved in: {IMG_DIR}")
    


In [None]:
# ===== Cell 6-Stratified: per-MSSQ-group threshold optimisation (single threshold per group) + F1 =====
# The whole cell is skipped when the notebook-level flag RUN_CELL6_STRAT is falsy.
RUN_CELL6_STRAT = bool(globals().get('RUN_CELL6_STRAT', True))
if not RUN_CELL6_STRAT:
    print('[Cell6S] RUN_CELL6_STRAT=False -> skip')
else:
    # set_cell_output / cell_output_path are defined elsewhere in the notebook.
    set_cell_output(6)

    import os
    import numpy as np
    import pandas as pd
    import sklearn.metrics as skm

    # Input: predictions stored under the Cell5 output (the error message
    # names Cell5B as the producer).
    pred_path = cell_output_path(5, "MSSQ_SPLIT_PREDICTIONS.csv")
    if not os.path.exists(pred_path):
        raise FileNotFoundError("[Cell6S] MSSQ_SPLIT_PREDICTIONS.CSV が見つかりません。Cell5B を先に実行してください。")

    df_pred = pd.read_csv(pred_path, encoding="utf-8-sig")
    required = {"mssq_group", "y_true", "y_score"}
    missing = required - set(df_pred.columns)
    if missing:
        raise RuntimeError(f"[Cell6S] 予測CSVに必要列がありません -> {missing}")

    # Group labels as they appear in the CSV; overridable from the notebook globals.
    MSSQ_LOW_LABEL = str(globals().get("MSSQ_LOW_LABEL", "Low"))
    MSSQ_HIGH_LABEL = str(globals().get("MSSQ_HIGH_LABEL", "High"))

    # Normalise the labels to exactly "Low" / "High". Only the configured
    # spelling and its all-lowercase form are recognised; anything else maps
    # to NaN and is rejected below.
    df_pred = df_pred.copy()
    df_pred["mssq_group_norm"] = (
        df_pred["mssq_group"].astype(str).str.strip().map({
            MSSQ_LOW_LABEL: "Low",
            MSSQ_LOW_LABEL.lower(): "Low",
            MSSQ_HIGH_LABEL: "High",
            MSSQ_HIGH_LABEL.lower(): "High",
        })
    )
    if df_pred["mssq_group_norm"].isna().any():
        bad = df_pred.loc[df_pred["mssq_group_norm"].isna(), "mssq_group"].unique()
        raise RuntimeError(f"[Cell6S] MSSQ_group に未対応ラベルがあります: {bad}")

    scores = df_pred["y_score"].to_numpy(dtype=float)
    y_true = df_pred["y_true"].to_numpy(dtype=int)
    grp = df_pred["mssq_group_norm"].to_numpy()

    # overall ROC-AUC (reference); NaN when only one class is present or on error
    try:
        auc_overall = skm.roc_auc_score(y_true, scores) if np.unique(y_true).size == 2 else float("nan")
    except Exception:
        auc_overall = float("nan")
    print(f"[Cell6S] overall ROC-AUC = {auc_overall:.4f}")

    # F1 at fixed threshold 0.5 (reference); NaN when only one class is present or on error
    y_pred_fixed05 = (scores >= 0.5).astype(int)
    try:
        f1_fixed05 = skm.f1_score(y_true, y_pred_fixed05) if np.unique(y_true).size == 2 else float("nan")
    except Exception:
        f1_fixed05 = float("nan")
    print(f"[Cell6S] F1@0.5 = {f1_fixed05:.4f}")

    # Both groups must contain data, otherwise per-group thresholds are undefined.
    maskH = grp == "High"
    maskL = grp == "Low"
    if maskH.sum() == 0 or maskL.sum() == 0:
        raise RuntimeError("[Cell6S] High/Low 両群のデータが必要です。Cell5B の出力を確認してください。")

    # Grid-search settings. Values already present in the notebook globals
    # win; the literals here are used only when no such global exists.
    COARSE_STEPS = int(globals().get("COARSE_STEPS", 51))
    FINE_STEPS   = int(globals().get("FINE_STEPS", 51))
    FINE_MARGIN  = float(globals().get("FINE_MARGIN", 0.05))

    def _conf_from_preds(y_t, y_p):
        """Return the binary confusion counts as ``(TP, FP, FN, TN)``."""
        matrix = skm.confusion_matrix(y_t, y_p, labels=[0, 1])
        # Rows are true labels and columns are predictions, both ordered 0, 1.
        (tn, fp), (fn, tp) = matrix
        return tp, fp, fn, tn

    def _f1_score(y_t, y_p):
        """Return F1 = 2TP / (2TP + FP + FN), or 0.0 when the denominator is zero."""
        tp, fp, fn, _ = _conf_from_preds(y_t, y_p)
        total = 2 * tp + fp + fn
        if total > 0:
            return 2 * tp / total
        return 0.0

    def _grid(l, r, steps):
        l = float(max(0.0, l)); r = float(min(1.0, r))
        if l > r:
            l, r = r, l
        return np.linspace(l, r, int(steps), dtype=float)

    def _best_tau(scores_arr, labels_arr):
        """Find the F1-maximising threshold with a coarse-then-fine grid search.

        A coarse grid over [0, 1] locates ``tau0``; a fine grid over
        ``tau0 +/- FINE_MARGIN`` (clipped to [0, 1]) then refines it.

        Returns ``(tau, best, tau0, best0)``: the final threshold and its F1,
        followed by the coarse threshold and its F1. ``best`` is never lower
        than ``best0``.
        """
        def _scan(grid):
            # F1 at every candidate; the first maximum wins.
            vals = np.array([_f1_score(labels_arr, (scores_arr >= t).astype(int)) for t in grid])
            idx = int(np.nanargmax(vals))
            return float(grid[idx]), float(vals[idx])

        tau0, best0 = _scan(_grid(0.0, 1.0, COARSE_STEPS))

        left, right = max(0.0, tau0 - FINE_MARGIN), min(1.0, tau0 + FINE_MARGIN)
        tau, best = _scan(_grid(left, right, FINE_STEPS))

        # The fine grid does not always contain tau0 itself (window partially
        # clipped at 0 or 1, an even number of steps, or float rounding), so
        # the refinement can score lower than the coarse optimum. Keep the
        # coarse result in that case.
        if best < best0:
            tau, best = tau0, best0
        return tau, best, tau0, best0

    # Optimise one threshold per group, each on that group's own samples.
    tauH, f1H, tauH0, f1H0 = _best_tau(scores[maskH], y_true[maskH])
    tauL, f1L, tauL0, f1L0 = _best_tau(scores[maskL], y_true[maskL])

    # Apply the High threshold to High rows and the Low threshold to all
    # other rows (only "High"/"Low" exist after the normalisation above).
    y_pred = np.where(
        grp == "High",
        (scores >= tauH).astype(int),
        (scores >= tauL).astype(int),
    )
    f1_all = _f1_score(y_true, y_pred)

    # Summary table: the pooled F1 with group-wise thresholds, each group's
    # optimum (fine and coarse), and the fixed-0.5 reference.
    summary = pd.DataFrame([
        {"group": "overall", "tau": None, "F1": f1_all},
        {"group": "High", "tau": tauH, "F1": f1H, "tau_coarse": tauH0, "F1_coarse": f1H0},
        {"group": "Low",  "tau": tauL, "F1": f1L, "tau_coarse": tauL0, "F1_coarse": f1L0},
        {"group": "overall_tau0.5", "tau": 0.5, "F1": f1_fixed05},
    ])

    # save predictions with applied thresholds
    df_pred_out = df_pred.copy()
    df_pred_out["y_pred_best"] = y_pred
    df_pred_out["tau_used"] = np.where(grp == "High", tauH, tauL)
    pred_with_tau_path = cell_output_path(6, "MSSQ_SPLIT_PREDICTIONS_WITH_TAU.csv")
    df_pred_out.to_csv(pred_with_tau_path, index=False, encoding="utf-8-sig")
    print(f"[Cell6S] predictions with tau -> {pred_with_tau_path}")

    # save the summary table
    out_csv = cell_output_path(6, "MSSQ_SPLIT_F1_SINGLE.csv")
    summary.to_csv(out_csv, index=False, encoding="utf-8-sig")
    print(f"[Cell6S] MSSQ-split thresholds -> High={tauH:.3f}, Low={tauL:.3f}, F1_overall={f1_all:.3f}")
    print(f"[Cell6S] 保存 -> {out_csv}")


In [None]:
# ===== Cell 6-Stratified: F1 metrics + confusion-matrix figure + ROC =====
# The whole cell is skipped when the notebook-level flag RUN_CELL6_STRAT is falsy.
RUN_CELL6_STRAT = bool(globals().get('RUN_CELL6_STRAT', True))
if not RUN_CELL6_STRAT:
    print('[Cell6S-metrics] RUN_CELL6_STRAT=False -> skip')
else:
    set_cell_output(6)

    import os
    import numpy as np
    import pandas as pd
    import sklearn.metrics as skm
    import matplotlib.pyplot as plt

    # Both inputs are produced by the Cell6-Stratified threshold cell.
    pred_path = cell_output_path(6, "MSSQ_SPLIT_PREDICTIONS_WITH_TAU.csv")
    summary_path = cell_output_path(6, "MSSQ_SPLIT_F1_SINGLE.csv")
    if not (os.path.exists(pred_path) and os.path.exists(summary_path)):
        raise FileNotFoundError("[Cell6S-metrics] 必要ファイルがありません。Cell6-Stratified を先に実行してください。")

    df_pred = pd.read_csv(pred_path, encoding="utf-8-sig")
    # NOTE(review): df_sum is loaded but not used anywhere in this cell.
    df_sum = pd.read_csv(summary_path, encoding="utf-8-sig")

    y_true = df_pred["y_true"].to_numpy(int)
    y_pred = df_pred["y_pred_best"].to_numpy(int)
    scores = df_pred["y_score"].to_numpy(float)
    grp = df_pred["mssq_group_norm"].astype(str).to_numpy()

    # The ROC/AUC computed below need both classes in the ground truth.
    if np.unique(y_true).size < 2:
        raise RuntimeError("[Cell6S-metrics] 真値が単一クラスのためメトリクス計算不可")

    def _conf(y_t, y_p):
        """Return the binary confusion counts as ``(TP, FP, FN, TN)``."""
        matrix = skm.confusion_matrix(y_t, y_p, labels=[0,1])
        # Rows are true labels and columns are predictions, both ordered 0, 1.
        (tn, fp), (fn, tp) = matrix
        return tp, fp, fn, tn

    def _f1(y_t, y_p):
        """Return F1 = 2TP / (2TP + FP + FN), or 0.0 when the denominator is zero."""
        tp, fp, fn, _ = _conf(y_t, y_p)
        total = 2*tp + fp + fn
        if total > 0:
            return 2*tp/total
        return 0.0

    # F1 over all rows and within each group (NaN for a group with no rows).
    f1_overall = _f1(y_true, y_pred)
    maskH = grp == 'High'
    maskL = grp == 'Low'
    f1_H = _f1(y_true[maskH], y_pred[maskH]) if maskH.sum()>0 else float('nan')
    f1_L = _f1(y_true[maskL], y_pred[maskL]) if maskL.sum()>0 else float('nan')

    # ROC computed on the pooled scores of all rows
    auc_overall = skm.roc_auc_score(y_true, scores)
    fpr, tpr, _ = skm.roc_curve(y_true, scores)

    # Confusion-matrix figure (overall): rows are true labels, columns are predictions
    cm = skm.confusion_matrix(y_true, y_pred, labels=[0,1])
    fig, ax = plt.subplots(figsize=(6,6))
    im = ax.imshow(cm, cmap='Blues')
    # Write the count into each cell; text x is the column index, y the row index.
    for (i,j), val in np.ndenumerate(cm):
        ax.text(j, i, int(val), ha='center', va='center', color='black', fontsize=14)
    ax.set_xlabel('Predicted')
    ax.set_ylabel('True')
    ax.set_xticks([0,1]); ax.set_yticks([0,1]);
    ax.set_title('Confusion matrix (overall)')
    plt.tight_layout()
    cm_path = cell_output_path(6, 'MSSQ_SPLIT_CM_OVERALL.png')
    plt.savefig(cm_path, dpi=300)
    plt.close()

    # ROC figure, with the chance diagonal as a dashed grey line
    plt.figure(figsize=(7,7))
    plt.plot(fpr, tpr, label=f'AUC = {auc_overall:.3f}')
    plt.plot([0,1],[0,1],'--',color='gray')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC (MSSQ-split models)')
    plt.legend(loc='lower right')
    plt.grid(True, alpha=0.4)
    plt.tight_layout()
    roc_path = cell_output_path(6, 'MSSQ_SPLIT_ROC.png')
    plt.savefig(roc_path, dpi=300)
    plt.close()

    # Save the aggregated metrics; AUC is recorded on the overall row only
    metrics_path = cell_output_path(6, 'MSSQ_SPLIT_METRICS.csv')
    pd.DataFrame([
        {'group':'overall','F1':f1_overall,'AUC':auc_overall},
        {'group':'High','F1':f1_H},
        {'group':'Low','F1':f1_L},
    ]).to_csv(metrics_path, index=False, encoding='utf-8-sig')

    print(f"[Cell6S-metrics] F1_overall={f1_overall:.3f}, F1_H={f1_H:.3f}, F1_L={f1_L:.3f}")
    print(f"[Cell6S-metrics] AUC_overall={auc_overall:.3f}")
    print(f"[Cell6S-metrics] saved cm->{cm_path}, roc->{roc_path}, metrics->{metrics_path}")


In [None]:
# ===== Cell 6-Stratified: predicted-probability distribution plots =====
# The whole cell is skipped when the notebook-level flag RUN_CELL6_STRAT is falsy.
RUN_CELL6_STRAT = bool(globals().get('RUN_CELL6_STRAT', True))
if not RUN_CELL6_STRAT:
    print('[Cell6S-proba] RUN_CELL6_STRAT=False -> skip')
else:
    set_cell_output(6)

    import os
    import numpy as np
    import pandas as pd
    import matplotlib.pyplot as plt

    # Input: predictions with the applied per-row threshold (column tau_used).
    pred_path = cell_output_path(6, "MSSQ_SPLIT_PREDICTIONS_WITH_TAU.csv")
    if not os.path.exists(pred_path):
        raise FileNotFoundError("[Cell6S-proba] MSSQ_SPLIT_PREDICTIONS_WITH_TAU.csv がありません。Cell6-Stratified を実行してください。")

    df_pred = pd.read_csv(pred_path, encoding="utf-8-sig")
    proba = df_pred["y_score"].to_numpy(float)
    ytrue = df_pred["y_true"].to_numpy(int)
    grp   = df_pred["mssq_group_norm"].astype(str).to_numpy()
    tau_used = df_pred["tau_used"].to_numpy(float)

    # Plot settings: histogram bins, threshold line width, font sizes.
    BINS = 40
    LW = 1.5
    FS_TITLE, FS_LABEL, FS_LEGEND, FS_TICK = 24, 18, 16, 14

    def _style(ax, title=None):
        """Apply the shared title, axis labels, tick size and 0-1 x-range to ``ax``."""
        if title:
            ax.set_title(title, fontsize=FS_TITLE)
        for set_label, text in ((ax.set_xlabel, 'Predicted probability'),
                                (ax.set_ylabel, 'Density')):
            set_label(text, fontsize=FS_LABEL)
        ax.tick_params(axis='both', labelsize=FS_TICK)
        ax.set_xlim(0,1)

    # overall hist: density histograms of the scores, split by true label
    plt.figure(figsize=(7,5))
    plt.hist(proba[ytrue==1], bins=BINS, density=True, alpha=0.5, label=f'True=1 (n={(ytrue==1).sum()})', color='red')
    plt.hist(proba[ytrue==0], bins=BINS, density=True, alpha=0.5, label=f'True=0 (n={(ytrue==0).sum()})', color='blue')
    # draw representative tau (median of tau_used over all rows; 0.5 when there are no rows)
    tau_med = float(np.nanmedian(tau_used)) if tau_used.size else 0.5
    plt.axvline(tau_med, color='black', linestyle='--', linewidth=LW, label=f'tau_med={tau_med:.2f}')
    _style(plt.gca(), 'Probability distribution (overall)')
    plt.legend(fontsize=FS_LEGEND)
    plt.tight_layout()
    path_overall = cell_output_path(6, 'MSSQ_SPLIT_PROBA_OVERALL.png')
    plt.savefig(path_overall, dpi=300)
    plt.close()

    # by group: one panel per group, sharing both axes so the panels are comparable
    fig, axes = plt.subplots(1,2, figsize=(12,5), sharex=True, sharey=True)
    for ax, label, mask, color in [
        (axes[0], 'High', grp=='High', 'purple'),
        (axes[1], 'Low', grp=='Low', 'green'),
    ]:
        ax.hist(proba[(ytrue==1)&mask], bins=BINS, density=True, alpha=0.5, label=f'True=1 (n={( (ytrue==1)&mask ).sum()})', color='red')
        ax.hist(proba[(ytrue==0)&mask], bins=BINS, density=True, alpha=0.5, label=f'True=0 (n={( (ytrue==0)&mask ).sum()})', color='blue')
        # median of tau_used within the group; 0.5 when the group has no rows
        tau_g = float(np.nanmedian(tau_used[mask])) if mask.any() else 0.5
        ax.axvline(tau_g, color=color, linestyle='--', linewidth=LW, label=f'tau_med={tau_g:.2f}')
        _style(ax, f'Group={label}')
        ax.legend(fontsize=FS_LEGEND)
    plt.tight_layout()
    path_bygrp = cell_output_path(6, 'MSSQ_SPLIT_PROBA_BYGROUP.png')
    plt.savefig(path_bygrp, dpi=300)
    plt.close()

    print(f"[Cell6S-proba] saved -> {path_overall}, {path_bygrp}")


In [None]:
# ===== Summary: ANALYSIS/機械学習 配下の実験を集約 =====
import os, json
from pathlib import Path
import pandas as pd
import numpy as np

# Folder whose immediate sub-directories are scanned below, one per experiment run.
# The aggregated summary CSV is also written here.
# NOTE(review): absolute, machine-specific path -- adjust before running elsewhere.
ROOT = Path(r"C:\Users\taiki\OneDrive - Science Tokyo\デスクトップ\研究\本実験結果\ANALYSIS\機械学習(層別学習)")


def load_subset_features(base_dir: Path, *, include=None, exclude=None):
    """Return ``(file_name, features)`` from the first readable ``*SUBSET_BEST*.json``.

    Files are searched recursively under ``base_dir`` in sorted path order.
    A file is considered only if its name contains at least one ``include``
    token (when given) and none of the ``exclude`` tokens (when given);
    matching is case-insensitive. Files that cannot be read or parsed are
    skipped. Returns ``(None, [])`` when nothing usable is found.
    """
    def _mentions(tokens, lowered_name: str) -> bool:
        return any(tok.lower() in lowered_name for tok in tokens)

    for path in sorted(base_dir.glob('**/*SUBSET_BEST*.json')):
        lowered = path.name.lower()
        if include and not _mentions(include, lowered):
            continue
        if exclude and _mentions(exclude, lowered):
            continue
        try:
            with open(path, 'r', encoding='utf-8') as handle:
                payload = json.load(handle)
            # A missing or null "features" entry yields an empty list.
            return path.name, (payload.get('features', []) or [])
        except Exception:
            # Best effort: try the next candidate file.
            continue
    return None, []


def read_metrics_neutral(level1: Path, mode: str):
    """Read ``Cell6/<mode>/METRICS_SUMMARY_<mode>.csv`` for one run.

    Returns a dict with keys ``single``, ``group``, ``wg`` and ``attr``; each
    value is the metric in column ``mode`` of the first row whose
    ``decision`` matches, or ``None`` when the file, column, row or value is
    missing. Read errors are swallowed and leave the remaining entries as
    ``None``.
    """
    decisions = ('single', 'group', 'wg', 'attr')
    res = dict.fromkeys(decisions)
    csv_path = level1 / 'Cell6' / mode / f'METRICS_SUMMARY_{mode}.csv'
    if not csv_path.exists():
        return res
    try:
        table = pd.read_csv(csv_path)
        if not {mode, 'decision'}.issubset(table.columns):
            return res
        for dec in decisions:
            hits = table.loc[table['decision'] == dec, mode]
            if hits.empty:
                continue
            first = hits.iloc[0]
            if not pd.isna(first):
                res[dec] = float(first)
    except Exception:
        # Best effort: keep whatever was collected so far.
        pass
    return res


def read_f1_fixed05_neutral(level1: Path):
    """Return the F1 score at a fixed 0.5 threshold for the neutral F1 run.

    Reads ``Cell6/F1/GROUP_AWARE/GROUP_AWARE_PREDICTIONS.CSV``. Uses column
    ``y_pred_fixed05`` when present, otherwise thresholds ``proba`` at 0.5.
    Returns ``None`` when the file or the needed columns are missing, when
    ``y_true`` has fewer than two distinct values, or on any read/parse error.
    """
    csv_path = level1.joinpath('Cell6', 'F1', 'GROUP_AWARE', 'GROUP_AWARE_PREDICTIONS.CSV')
    if not csv_path.exists():
        return None
    try:
        frame = pd.read_csv(csv_path)
        available = set(frame.columns)
        if 'y_pred_fixed05' in available:
            predicted = frame['y_pred_fixed05']
        elif available >= {'proba', 'y_true'}:
            predicted = (pd.to_numeric(frame['proba'], errors='coerce') >= 0.5).astype(int)
        else:
            return None
        actual = pd.to_numeric(frame['y_true'], errors='coerce').astype(int)
        if actual.nunique() < 2:
            return None
        # 2x2 table with true labels as rows and predictions as columns, both ordered 0, 1.
        table = pd.crosstab(actual, predicted, dropna=False)
        table = table.reindex(index=[0, 1], columns=[0, 1], fill_value=0)
        (tn, fp), (fn, tp) = table.values
        total = 2*tp + fp + fn
        if total > 0:
            return float(2*tp/total)
        return 0.0
    except Exception:
        return None


def read_auc_neutral(level1: Path):
    path = level1 / 'Cell4' / 'AUC_K_CI.csv'
    if not path.exists():
        return None
    try:
        df = pd.read_csv(path)
        return float(df.get('auc', pd.Series([None])).iloc[0])
    except Exception:
        return None


def read_auc_strat(level1: Path):
    """Read overall/low/high AUC values from ``Cell5/AUC_K_CI_MSSQ_SPLIT.csv``.

    Returns a dict with keys ``overall``, ``low`` and ``high``. A row counts
    as overall when its lower-cased ``group`` equals ``'overall'``, otherwise
    as low/high when it contains that word. Rows with a missing ``auc`` are
    ignored; unmatched entries stay ``None``. Read errors are swallowed.
    """
    res = dict.fromkeys(("overall", "low", "high"))
    csv_path = level1 / 'Cell5' / 'AUC_K_CI_MSSQ_SPLIT.csv'
    if not csv_path.exists():
        return res
    try:
        table = pd.read_csv(csv_path)
        if not {'group', 'auc'}.issubset(table.columns):
            return res
        for raw_group, value in zip(table['group'], table['auc']):
            if pd.isna(value):
                continue
            name = str(raw_group).lower()
            if name == 'overall':
                slot = 'overall'
            elif 'low' in name:
                slot = 'low'
            elif 'high' in name:
                slot = 'high'
            else:
                continue
            res[slot] = float(value)
    except Exception:
        # Best effort: keep whatever was collected so far.
        pass
    return res


def read_metrics_strat(level1: Path):
    """Collect the Cell6-Stratified metrics of one run.

    From ``Cell6/MSSQ_SPLIT_METRICS.csv``: F1 (and AUC) of rows whose group
    contains ``overall``, and F1 of the rows named exactly ``high`` / ``low``
    (case-insensitive). From ``Cell6/MSSQ_SPLIT_F1_SINGLE.csv``: F1 of the
    ``overall_tau0.5`` row. Missing files or values leave the corresponding
    entries as ``None``; read errors are swallowed.
    """
    res = {"F1_overall": None, "F1_H": None, "F1_L": None, "F1_tau05": None, "AUC_overall": None}
    cell6_dir = level1 / 'Cell6'

    def _float_or_none(value):
        return None if pd.isna(value) else float(value)

    group_slot = {'high': 'F1_H', 'low': 'F1_L'}

    metrics_csv = cell6_dir / 'MSSQ_SPLIT_METRICS.csv'
    if metrics_csv.exists():
        try:
            for _, record in pd.read_csv(metrics_csv).iterrows():
                name = str(record.get('group', '')).lower()
                if 'overall' in name:
                    res['F1_overall'] = _float_or_none(record.get('F1', None))
                    # A missing AUC keeps whatever was stored before.
                    auc_value = _float_or_none(record.get('AUC', None))
                    if auc_value is not None:
                        res['AUC_overall'] = auc_value
                elif name in group_slot:
                    res[group_slot[name]] = _float_or_none(record.get('F1', None))
        except Exception:
            pass

    tau05_csv = cell6_dir / 'MSSQ_SPLIT_F1_SINGLE.csv'
    if tau05_csv.exists():
        try:
            table = pd.read_csv(tau05_csv)
            hits = table[table['group'] == 'overall_tau0.5']
            if not hits.empty and 'F1' in hits.columns:
                first = hits['F1'].iloc[0]
                if not pd.isna(first):
                    res['F1_tau05'] = float(first)
        except Exception:
            pass
    return res


# Build one summary row per run directory directly under ROOT.
# NOTE(review): glob() order is not sorted, so the row order of the output
# CSV follows whatever order the directory listing yields.
rows = []
for level1 in ROOT.glob('*'):
    if not level1.is_dir():
        continue
    # The directory name is split on "__" and each part is assigned to a tag
    # by its prefix. The checks run in order, so a part starting with "H" is
    # taken as H_TAG unless it already matched "FS".
    parts = level1.name.split('__')
    tags = {"FS_TAG": "", "H_TAG": "", "GRID_TAG": "", "CORR_TAG": "", "REG_TAG": ""}
    for p in parts:
        if p.startswith('FS'):
            tags['FS_TAG'] = p
        elif p.startswith('H'):
            tags['H_TAG'] = p
        elif p.startswith('GridC'):
            tags['GRID_TAG'] = p
        elif p.startswith('Corr'):
            # A "Corr..." part may carry the regularisation tag in the same
            # token; split it at the first occurrence of the marker.
            if '正則化' in p:
                corr_part, reg_rest = p.split('正則化', 1)
                tags['CORR_TAG'] = corr_part
                tags['REG_TAG'] = '正則化' + reg_rest
            else:
                tags['CORR_TAG'] = p
        elif p.startswith('正則化'):
            tags['REG_TAG'] = p

    # ----- Feature subsets (fetched separately for neutral / stratified) -----
    # Neutral: files under Cell4 whose name does not contain "mssq".
    # Stratified: files under Cell5 whose name contains "mssq".
    neutral_subset_file, neutral_feats = load_subset_features(level1 / 'Cell4', exclude=['mssq'])
    strat_subset_file, strat_feats = load_subset_features(level1 / 'Cell5', include=['mssq'])
    n_feat_neutral = len(neutral_feats)
    n_feat_strat = len(strat_feats)

    # ----- ROC-AUC -----
    auc_neutral = read_auc_neutral(level1)
    auc_strat = read_auc_strat(level1)

    # ----- F1/BA (neutral) -----
    f1 = read_metrics_neutral(level1, 'F1')
    ba = read_metrics_neutral(level1, 'BA')
    f1_tau05_neutral = read_f1_fixed05_neutral(level1)

    # ----- Stratified metrics -----
    strat = read_metrics_strat(level1)

    # Column suffixes: *_N / *N_* = neutral, *_S / *S_* = stratified.
    rows.append({
        'Path': str(level1),
        'FS_TAG': tags['FS_TAG'], 'H_TAG': tags['H_TAG'], 'GRID_TAG': tags['GRID_TAG'], 'CORR_TAG': tags['CORR_TAG'], 'REG_TAG': tags['REG_TAG'],
        'NFEAT_N': n_feat_neutral, 'FEAT_N': ','.join(neutral_feats),
        'NFEAT_S': n_feat_strat,   'FEAT_S': ','.join(strat_feats),
        'AUC_N': auc_neutral,
        'AUC_S': auc_strat.get('overall'), 'AUC_S_L': auc_strat.get('low'), 'AUC_S_H': auc_strat.get('high'),
        'F1N_single': f1['single'], 'F1N_group': f1['group'], 'F1N_wg': f1['wg'], 'F1N_attr': f1['attr'], 'F1N_tau05': f1_tau05_neutral,
        'F1S': strat['F1_overall'], 'F1S_H': strat['F1_H'], 'F1S_L': strat['F1_L'], 'F1S_tau05': strat['F1_tau05'],
        'BAN_single': ba['single'], 'BAN_group': ba['group'], 'BAN_wg': ba['wg'], 'BAN_attr': ba['attr'],
    })

# Write the combined table next to the run directories and preview it.
summary_df = pd.DataFrame(rows)
out_path = ROOT / 'summary_all_runs.csv'
summary_df.to_csv(out_path, index=False, encoding='utf-8-sig')
print(f"[SUMMARY] rows={len(summary_df)} -> {out_path}")
summary_df.head()
