In [None]:
# ===== Cell 0: 環境設定（全セル共通で利用）=====

from __future__ import annotations

from typing import Any, Callable, Dict, Optional

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

# ------------------------
# Experiment switches (shared across the whole notebook)
# ------------------------
FMS_THRESHOLD: int = 2            # rows with FMS >= FMS_THRESHOLD are labelled positive (see binarize_fms)
EPOCH_LEN: int = 30               # analysis epoch length; one of 30 / 60 / 120
MODEL_BACKEND: str = "xgb"        # "xgb" / "rf" / "svm"
USE_AP_FOR_K: bool = False         # whether best_k is overwritten by the AP-based choice
METRIC: str = "ba"
METRIC_NAME: str = "BA"           # display name
SEED_BASE: int = 20251101

# Fail fast when an unsupported epoch length is configured.
if EPOCH_LEN not in (30, 60, 120):
    raise ValueError("EPOCH_LEN は 30/60/120 から選択してください。")

# ------------------------
# File input/output roots
# ------------------------
BASE_INPUT_DIR = r"C:\Users\taiki\OneDrive - Science Tokyo\デスクトップ\研究\本実験結果"
BASE_ANALYSIS_DIR = os.path.join(BASE_INPUT_DIR, "ANALYSIS")
# The output folder name embeds FMS_THRESHOLD, so each threshold gets its own folder.
OUT_DIR = os.path.join(BASE_ANALYSIS_DIR, "機械学習(MSSQ込み)", f"閾値FMS{FMS_THRESHOLD}")
os.makedirs(OUT_DIR, exist_ok=True)

def outpath(filename: str) -> str:
    """Return ``filename`` joined onto OUT_DIR."""
    return os.path.join(OUT_DIR, filename)

print(f"[OUT_DIR] {OUT_DIR}  |  EPOCH_LEN={EPOCH_LEN}s")

# ------------------------
# Target subjects and time window
# ------------------------
# Subject ids are kept as strings; they are used both as folder names and as group keys.
SUBJECT_IDS = [
    "10061","10063","10064",
    "10071","10072","10073","10074",
    "10081","10082","10083",
    "10091","10092","10093","10094",
    "10101","10102","10103",
]

BASELINE_EPOCH = 1770               # epoch_start of the baseline row (required)
ML_START, ML_END = 1800, 2400       # epoch_start range used for training: [start, end)

# ------------------------
# Plot style
# ------------------------
# Global matplotlib defaults; individual cells still pass explicit font sizes where needed.
plt.rcParams.update({
    "figure.dpi": 120, "savefig.dpi": 300,
    "font.size": 20, "axes.titlesize": 26, "axes.labelsize": 22,
    "xtick.labelsize": 20, "ytick.labelsize": 20, "legend.fontsize": 20,
})

# ------------------------
# FMS binarisation helper
# ------------------------
def binarize_fms(series: pd.Series, threshold: Optional[int] = None) -> pd.Series:
    """Turn FMS values into 0/1 labels: 1 where ``series >= threshold``.

    Args:
        series: FMS values to binarise.
        threshold: Cut-off (converted with ``int``). When ``None`` the
            notebook-wide ``FMS_THRESHOLD`` is used.

    Returns:
        An integer Series with the same index as ``series``.
    """
    if threshold is None:
        cutoff = FMS_THRESHOLD
    else:
        cutoff = int(threshold)
    is_positive = series >= cutoff
    return is_positive.astype(int)

# ------------------------
# Model registry
# ------------------------
ModelBuilder = Callable[..., Any]
MODEL_REGISTRY: Dict[str, Dict[str, Any]] = {}

def register_backend(name: str, params: Dict[str, Any], builder: ModelBuilder) -> None:
    """Store a backend in MODEL_REGISTRY under ``name``.

    The entry holds the default ``params`` dict (kept by reference) and the
    ``builder`` callable. Registering an existing name replaces its entry.
    """
    entry: Dict[str, Any] = {}
    entry["params"] = params
    entry["builder"] = builder
    MODEL_REGISTRY[name] = entry

def _build_xgb(params: Dict[str, Any], *, scale_pos_weight: Optional[float] = None):
    """Build an ``xgb.XGBClassifier`` from ``params``.

    ``params`` is not modified. When ``scale_pos_weight`` is given it is
    converted to ``float`` and overrides any value already in ``params``.
    """
    if scale_pos_weight is None:
        extra: Dict[str, Any] = {}
    else:
        extra = {"scale_pos_weight": float(scale_pos_weight)}
    return xgb.XGBClassifier(**{**params, **extra})

def _build_rf(params: Dict[str, Any], **_):
    """Build a ``RandomForestClassifier`` from ``params``; extra keyword arguments are ignored."""
    forest = RandomForestClassifier(**params)
    return forest

def _build_svm(params: Dict[str, Any], **_):
    """Build an ``SVC`` from ``params``; extra keyword arguments are ignored."""
    classifier = SVC(**params)
    return classifier

# Default constructor arguments for each backend. build_estimator() copies these
# before applying per-call overrides, so the dicts below act as shared defaults.
XGB_PARAMS: Dict[str, Any] = dict(
    n_estimators=100,
    eval_metric="logloss",
    subsample=1.0,
    colsample_bytree=1.0,
    n_jobs=1,
    tree_method="hist",
    device="cpu",
    # NOTE(review): the XGB seed is fixed to 0 while RF/SVM below use SEED_BASE —
    # confirm this difference is intentional.
    seed=0,
    random_state=0,
)

RF_PARAMS: Dict[str, Any] = dict(
    n_estimators=439,
    max_depth=14,
    min_samples_split=10,
    min_samples_leaf=4,
    max_features="sqrt",
    bootstrap=False,
    class_weight="balanced",
    random_state=SEED_BASE,
    n_jobs=1,
)

SVM_PARAMS: Dict[str, Any] = dict(
    C=1.0,
    kernel="rbf",
    gamma="scale",
    probability=True,  # exposes predict_proba, which predict_positive_score() prefers
    class_weight="balanced",
    random_state=SEED_BASE,
)

# Register the three backends under the names accepted by MODEL_BACKEND.
register_backend("xgb", XGB_PARAMS, _build_xgb)
register_backend("rf",  RF_PARAMS,  _build_rf)
register_backend("svm", SVM_PARAMS, _build_svm)

def set_model_backend(name: str) -> None:
    """Switch the notebook-wide default backend.

    ``name`` is lower-cased before lookup. Only the global ``MODEL_BACKEND``
    is reassigned here.

    Raises:
        KeyError: if the lower-cased name is not in MODEL_REGISTRY.
    """
    global MODEL_BACKEND
    name = name.lower()
    if name in MODEL_REGISTRY:
        MODEL_BACKEND = name
        return
    raise KeyError(f"[ERROR] backend '{name}' は未登録: {list(MODEL_REGISTRY.keys())}")

def build_estimator(
    backend: Optional[str] = None,
    *,
    scale_pos_weight: Optional[float] = None,
    overrides: Optional[Dict[str, Any]] = None,
):
    """Create an unfitted estimator for a registered backend.

    Args:
        backend: Registry key (case-insensitive). Falls back to the global
            ``MODEL_BACKEND`` when empty or ``None``.
        scale_pos_weight: Forwarded to the backend builder as a keyword.
        overrides: Parameters layered on top of a copy of the registered
            defaults; the registered dict itself is left untouched.

    Raises:
        KeyError: if the backend is not registered.
    """
    name = (MODEL_BACKEND if not backend else backend).lower()
    entry = MODEL_REGISTRY.get(name)
    if entry is None:
        raise KeyError(f"[ERROR] backend '{name}' は未登録。")

    merged = dict(entry["params"])
    if overrides:
        merged.update(overrides)
    make = entry["builder"]
    return make(merged, scale_pos_weight=scale_pos_weight)

def fit_estimator(
    X_train: pd.DataFrame,
    y_train: pd.Series,
    *,
    backend: Optional[str] = None,
    scale_pos_weight: Optional[float] = None,
    overrides: Optional[Dict[str, Any]] = None,
):
    """Build an estimator via ``build_estimator`` and fit it.

    Features are cast to float32 and labels to int32 before fitting.
    Returns the fitted estimator.
    """
    features = X_train.astype(np.float32, copy=False)
    target = y_train.astype(np.int32, copy=False)
    estimator = build_estimator(
        backend=backend,
        scale_pos_weight=scale_pos_weight,
        overrides=overrides,
    )
    estimator.fit(features, target)
    return estimator

def predict_positive_score(model, X: pd.DataFrame) -> np.ndarray:
    """Return one score per row, higher meaning "more positive".

    The first available method on ``model`` is used, in this order:
    ``predict_proba`` (column 1), ``decision_function`` (as a float array),
    then ``predict`` (cast to float). ``X`` is cast to float32 first.
    """
    features = X.astype(np.float32, copy=False)
    if hasattr(model, "predict_proba"):
        scores = model.predict_proba(features)[:, 1]
    elif hasattr(model, "decision_function"):
        scores = np.asarray(model.decision_function(features), dtype=float)
    else:
        scores = model.predict(features).astype(float)
    return scores

# Upper-cased backend name, computed once at this point; set_model_backend() only
# reassigns MODEL_BACKEND and does not refresh MODEL_ID.
MODEL_ID = MODEL_BACKEND.upper()
print(f"[INFO] MODEL_BACKEND={MODEL_ID} / SEED={SEED_BASE} / backends={list(MODEL_REGISTRY.keys())}")


In [None]:
# ===== Cell 1: データ準備（CSV読込 → EPOCH合成 → SUBJECT_META → 行列出力）=====

import pandas as pd
import numpy as np

# --------------------------------------------
# (1) Load and validate the 30-second EPOCH CSVs
# --------------------------------------------
def subject_csv_path(sid: str) -> str:
    """Return ``<BASE_INPUT_DIR>/<sid>/EPOCH/<sid>_epoch.csv``.

    Raises:
        FileNotFoundError: if that file does not exist.
    """
    csv_file = os.path.join(BASE_INPUT_DIR, sid, "EPOCH", f"{sid}_epoch.csv")
    if os.path.exists(csv_file):
        return csv_file
    raise FileNotFoundError(f"[Cell1] CSV missing for subject {sid}: {csv_file}")

# Read every subject's CSV, normalise the first three columns and stack the results.
dfs = []
for sid in SUBJECT_IDS:
    df = pd.read_csv(subject_csv_path(sid))
    if df.shape[1] < 4:
        raise ValueError(f"[Cell1] {sid}: 列数が不足（>=4 必須）")
    df = df.copy()
    # Feature column names (4th column onwards) are forced to str.
    df.columns = list(df.columns[:3]) + [str(c) for c in df.columns[3:]]
    # The first three columns are identified by position, whatever their header says.
    c1, c2, c3 = df.columns[:3]
    df = df.rename(columns={c1: "epoch_start", c2: "epoch_end", c3: "FMS"})
    # Unparseable cells become NA via errors="coerce" and are rejected just below.
    df["epoch_start"] = pd.to_numeric(df["epoch_start"], errors="coerce").astype("Int64")
    df["epoch_end"]   = pd.to_numeric(df["epoch_end"],   errors="coerce").astype("Int64")
    df["FMS"]         = pd.to_numeric(df["FMS"],         errors="coerce").astype("Int64")
    if df[["epoch_start","epoch_end","FMS"]].isna().any().any():
        raise ValueError(f"[Cell1] {sid}: epoch_start/epoch_end/FMS に NaN")
    df.insert(0, "subject_id", sid)
    dfs.append(df)

combined_raw = pd.concat(dfs, ignore_index=True)
# Columns dropped from the feature set.
exclude_feats = {"HF_power", "LF_power", "LF_HF_ratio"}
# Every remaining column that is neither a key/label column nor excluded is a feature.
feature_cols_all = [
    c for c in combined_raw.columns
    if c not in {"subject_id","epoch_start","epoch_end","FMS"} and c not in exclude_feats
]
if not feature_cols_all:
    raise RuntimeError("[Cell1] 特徴量列が0です。列名や除外設定を確認してください。")

print(f"[Cell1] Loaded subjects={len(SUBJECT_IDS)}, rows={len(combined_raw)}, features(after drop)={len(feature_cols_all)}")

# --------------------------------------------
# (2) Aggregate to EPOCH_LEN-second bins + baseline difference + label generation
# --------------------------------------------
# The training window must split into a whole number of bins.
if (ML_END - ML_START) % EPOCH_LEN != 0:
    raise ValueError(f"[Cell1] ML window {ML_END-ML_START} が EPOCH_LEN={EPOCH_LEN} で割り切れません。")

# Number of source rows expected per bin (assumes source rows are 30 apart in
# epoch_start, as the section header for the CSV load states — TODO confirm).
rows_per_bin = EPOCH_LEN // 30
df_out_list = []

for sid, sdf in combined_raw.groupby("subject_id", sort=False):
    # Exactly one baseline row is required. NOTE(review): the message says
    # "not found", but this check also fires when more than one row matches.
    base_row = sdf.loc[sdf["epoch_start"] == BASELINE_EPOCH]
    if len(base_row) != 1:
        raise ValueError(f"[Cell1] {sid}: baseline epoch_start=={BASELINE_EPOCH} が見つからない")
    base_vals = base_row[feature_cols_all].astype(float).iloc[0]
    if base_vals.isna().any():
        raise ValueError(f"[Cell1] {sid}: baselineにNaN -> {base_vals.index[base_vals.isna()].tolist()}")

    # Keep only rows whose epoch_start lies in [ML_START, ML_END).
    sdf_ml = sdf[(sdf["epoch_start"] >= ML_START) & (sdf["epoch_start"] < ML_END)].copy()
    if sdf_ml.empty:
        raise ValueError(f"[Cell1] {sid}: ML window [{ML_START},{ML_END}) が空です。")

    # Assign every row to the EPOCH_LEN-wide bin that contains its epoch_start.
    sdf_ml["bin_start"] = ML_START + ((sdf_ml["epoch_start"] - ML_START) // EPOCH_LEN) * EPOCH_LEN
    sdf_ml["bin_end"]   = sdf_ml["bin_start"] + EPOCH_LEN

    # Drop bins that do not contain exactly rows_per_bin source rows.
    bin_counts = sdf_ml.groupby(["bin_start","bin_end"]).size()
    complete_bins = bin_counts[bin_counts == rows_per_bin].index
    sdf_ml = sdf_ml.set_index(["bin_start","bin_end"]).loc[complete_bins].reset_index()
    if sdf_ml.empty:
        raise ValueError(f"[Cell1] {sid}: EPOCH_LEN={EPOCH_LEN} で完全なbinが無い")

    # Features and FMS are both averaged within each bin.
    agg_dict = {c: "mean" for c in feature_cols_all}
    agg_dict["FMS"] = "mean"
    g = sdf_ml.groupby(["subject_id","bin_start","bin_end"], as_index=False).agg(agg_dict)

    # Subtract the subject's baseline row from every bin (delta features).
    g_features = g[feature_cols_all].astype(float) - base_vals.values
    if g_features.isna().any().any():
        bad = g_features.columns[g_features.isna().any()].tolist()
        raise ValueError(f"[Cell1] {sid}: baseline差分後にNaN -> {bad}")

    g_out = pd.concat([g[["subject_id","bin_start","bin_end","FMS"]], g_features], axis=1)
    g_out = g_out.rename(columns={"bin_start":"epoch_start","bin_end":"epoch_end"})
    # The label is derived from the bin-mean FMS using the global FMS_THRESHOLD.
    g_out["label"] = binarize_fms(g_out["FMS"])
    g_out = g_out[["subject_id","epoch_start","epoch_end","FMS","label"] + feature_cols_all]
    df_out_list.append(g_out)

df_ml_epoch = pd.concat(df_out_list, ignore_index=True)

# --------------------------------------------
# (3) SUBJECT_META & MSSQ group
# --------------------------------------------
# Candidate locations are tried in order; the first existing file is used.
CANDIDATE_SCORE_PATHS = [
    "/mnt/data/summary_scores.xlsx",
    os.path.join(BASE_ANALYSIS_DIR, "summary_scores.xlsx"),
    os.path.join(BASE_ANALYSIS_DIR, "機械学習", "summary_scores.xlsx"),
    os.path.join(BASE_INPUT_DIR, "summary_scores.xlsx"),
]
score_path = next((p for p in CANDIDATE_SCORE_PATHS if os.path.exists(p)), None)
if score_path is None:
    raise FileNotFoundError("[Cell1] summary_scores.xlsx が見つかりません。")
meta_raw = pd.read_excel(score_path, sheet_name="Summary")

required = ["ID", "MSSQ", "VIMSSQ"]
missing = [c for c in required if c not in meta_raw.columns]
if missing:
    raise ValueError(f"[Cell1] summary_scores.xlsx に必須列がありません -> {missing}")

meta = meta_raw[required].copy()
# Normalise IDs to strings: trim whitespace and drop a trailing ".0".
meta["ID"] = (
    meta["ID"].astype(str).str.strip().str.replace(r"\.0$", "", regex=True)
)
# Non-numeric scores are an error (errors="raise").
for c in ["MSSQ", "VIMSSQ"]:
    meta[c] = pd.to_numeric(meta[c], errors="raise")

# Keep only the subjects listed in SUBJECT_IDS.
# NOTE(review): subjects in SUBJECT_IDS that are absent from the sheet are not
# detected here; Cell 3D raises later when it maps groups onto SUBJECT_META.
sid_set = set(map(str, SUBJECT_IDS))
meta = meta[meta["ID"].isin(sid_set)].copy()
if meta["ID"].duplicated().any():
    raise ValueError(f"[Cell1] ID 重複 -> {meta.loc[meta['ID'].duplicated(), 'ID'].tolist()}")

# Fixed cut-off: MSSQ >= 10.0 is "High", otherwise "Low".
MSSQ_THRESHOLD_FIXED = 10.0
meta["MSSQ_group"] = np.where(meta["MSSQ"] >= MSSQ_THRESHOLD_FIXED, "High", "Low")
SUBJECT_META = (
    meta.rename(columns={"ID": "subject_id"})
        .set_index("subject_id")[["MSSQ", "VIMSSQ", "MSSQ_group"]]
        .copy()
)
SUBJECT_META.to_csv(outpath("subject_meta.csv"), encoding="utf-8-sig")
print(f"[Cell1] SUBJECT_META saved -> {outpath('subject_meta.csv')} (source='{score_path}')")

# --------------------------------------------
# (4) Build the training matrices and save them
# --------------------------------------------
fname_raw = f"ML_DATA_DELTA_{EPOCH_LEN}S_RAW.CSV"
df_ml_epoch.to_csv(outpath(fname_raw), index=False, encoding="utf-8-sig")

# Global matrices used by the later cells: features, binary labels, and subject ids as groups.
X_all = df_ml_epoch[feature_cols_all].copy().astype(float)
y_all = df_ml_epoch["label"].copy().astype(int)
groups = df_ml_epoch["subject_id"].copy()

X_all.to_csv(outpath(f"X_RAW_ALL_{EPOCH_LEN}S.CSV"), index=False, encoding="utf-8-sig")
# The "scaled" file is written from the same unscaled X_all.
X_all.to_csv(outpath(f"X_SCALED_ALL_{EPOCH_LEN}S.CSV"), index=False, encoding="utf-8-sig")  # tree-based models need no scaling
pd.DataFrame({"subject_id": groups, "label": y_all, "FMS_mean": df_ml_epoch["FMS"]}).to_csv(
    outpath(f"Y_AND_GROUPS_{EPOCH_LEN}S.CSV"), index=False, encoding="utf-8-sig"
)

print(f"[Cell1] Saved -> {outpath(fname_raw)} / X_RAW_ALL / X_SCALED_ALL / Y_AND_GROUPS")
print(f"[Cell1] Matrices ready: X_all={X_all.shape}, y_all={y_all.shape}, SUBJECT_META={SUBJECT_META.shape}")


In [None]:
# ===== Cell 2: モデリング共通ヘルパ（fit / SHAP / 評価）=====

from typing import Dict, Optional, Tuple

import numpy as np
import pandas as pd
import shap
from sklearn.metrics import roc_auc_score, accuracy_score


# --------------------------------------------
# Training wrapper (uses the registry API from Cell 0)
# --------------------------------------------
def fit_classifier(
    X_train: pd.DataFrame,
    y_train: pd.Series,
    *,
    backend: Optional[str] = None,
    scale_pos_weight: Optional[float] = None,
    overrides: Optional[Dict[str, Any]] = None,
):
    """
    Thin wrapper around Cell 0's ``fit_estimator``.

    Pass ``backend`` / ``overrides`` only when a SHAP or evaluation cell needs
    a model other than the notebook default. Features are cast to float32 and
    labels to int32 before delegating.

    Raises:
        RuntimeError: if ``fit_estimator`` is not defined (Cell 0 not run).
    """
    if "fit_estimator" in globals():
        features = X_train.astype(np.float32, copy=False)
        target = y_train.astype(np.int32, copy=False)
        options = {
            "backend": backend,
            "scale_pos_weight": scale_pos_weight,
            "overrides": overrides,
        }
        return fit_estimator(features, target, **options)
    raise RuntimeError("[Cell2] fit_estimator が未定義です。Cell0 を先に実行してください。")


# --------------------------------------------
# TreeSHAP-based feature importance
# --------------------------------------------
def compute_train_shap_abs_mean(model, X_ref: pd.DataFrame) -> pd.Series:
    """
    Return the mean absolute SHAP value per feature over ``X_ref``, sorted descending.

    - Intended for tree models (XGB / RF) via ``shap.TreeExplainer``.
    - The probability-scale interventional explainer is tried first. If it
      raises, a warning is printed and the raw-output, path-dependent explainer
      is used instead. Values from the two modes are on different scales, so
      the warning matters when per-fold results are averaged.
    - For models TreeExplainer does not support (e.g. SVM), the exception
      raised by the fallback explainer propagates to the caller.

    Returns:
        Series named "mean_abs", indexed by the columns of ``X_ref``.

    Raises:
        RuntimeError: if the SHAP matrix width differs from ``X_ref``'s.
    """
    X_ref = X_ref.astype(np.float32, copy=False)

    # Background data for the interventional explainer (at most 128 rows).
    bg_n = min(128, len(X_ref))
    X_bg = X_ref.sample(n=bg_n, random_state=SEED_BASE) if bg_n >= 2 else X_ref

    try:
        explainer = shap.TreeExplainer(
            model,
            data=X_bg,
            model_output="probability",
            feature_perturbation="interventional",
        )
        sv_any = explainer.shap_values(X_ref)
    except Exception as exc:
        # Deliberately broad: the exception type raised for an unsupported
        # probability output is not pinned down here. The fallback is made
        # visible instead of silent because it changes the SHAP scale.
        print(
            "[Cell2][WARN] TreeExplainer(model_output='probability') failed "
            f"({type(exc).__name__}: {exc}); falling back to model_output='raw'."
        )
        explainer = shap.TreeExplainer(
            model,
            model_output="raw",
            feature_perturbation="tree_path_dependent",
        )
        sv_any = explainer.shap_values(X_ref)

    # Normalise the shap_values result to 2D (n_samples x n_features),
    # selecting the positive class (label 1) when per-class values are returned.
    classes = getattr(model, "classes_", None)
    pos_idx = int(np.where(classes == 1)[0][0]) if classes is not None and 1 in list(classes) else -1

    if isinstance(sv_any, list):
        sv = np.asarray(sv_any[pos_idx])
    else:
        sv = getattr(sv_any, "values", sv_any)
        sv = np.asarray(sv)
        if sv.ndim == 3:
            sv = sv[..., pos_idx]
        elif sv.ndim == 1:
            sv = sv.reshape(-1, 1)

    if sv.shape[1] != X_ref.shape[1]:
        raise RuntimeError(
            f"[Cell2] SHAP shape mismatch: sv.shape={sv.shape}, X_ref.shape={X_ref.shape}"
        )

    abs_mean = np.mean(np.abs(sv), axis=0)
    return pd.Series(abs_mean, index=X_ref.columns, name="mean_abs").sort_values(ascending=False)


# --------------------------------------------
# Evaluation utilities
# --------------------------------------------
def _is_probability_like(scores: np.ndarray) -> bool:
    """Tell whether every score is finite and lies within [0, 1]."""
    all_finite = np.isfinite(scores).all()
    if not all_finite:
        return all_finite
    lower_ok = 0.0 <= scores.min()
    if not lower_ok:
        return lower_ok
    return scores.max() <= 1.0


def evaluate_fold(model, X_test: pd.DataFrame, y_test: pd.Series) -> Dict[str, float]:
    """
    Score one held-out fold.

    - ROC AUC is computed only when ``y_test`` has exactly two classes;
      otherwise it is NaN.
    - Accuracy thresholds the scores at 0.5 when they look like probabilities
      and at 0.0 otherwise (threshold tuning is done in a separate cell).

    Returns:
        ``{"roc_auc": float, "accuracy": float}``
    """
    features = X_test.astype(np.float32, copy=False)
    scores = predict_positive_score(model, features)

    has_two_classes = len(np.unique(y_test)) == 2
    roc_auc = roc_auc_score(y_test, scores) if has_two_classes else float("nan")

    if _is_probability_like(scores):
        cutoff = 0.5
    else:
        cutoff = 0.0
    predicted = (scores >= cutoff).astype(int)
    accuracy = accuracy_score(y_test.astype(int), predicted)

    result = {"roc_auc": float(roc_auc), "accuracy": float(accuracy)}
    return result


# Signal that the Cell 2 helper functions have been defined.
print("[Cell2] Modeling helpers ready (fit_classifier / compute_train_shap_abs_mean / evaluate_fold)")


# ===== Section: 特徴重要度と best_k 探索 =====

| Cell | 目的 | 主な出力 (OUT_DIR 配下) |
| ---- | ---- | ----------------------- |
| 3A | LOSO学習でSHAP重要度を算出しランキング化 | `SHAP_FEATURE_RANKING.CSV`, `SHAP_FEATURE_RANKING_LABELED.CSV`, `SHAP_RANKING_ALL.PNG`, `SHAP_TOP8_RANKING.PNG`, `LOSO_METRICS.CSV` |
| 3B | SHAP順の特徴を使って k 本ごとの pooled ROC-AUC を計測し best_k を決定 | `AUC_PER_K.CSV`, `AUC_VS_NUM_FEATURES.PNG`, `best_k` (グローバル変数) |
| 3C | 同様に AUPRC/AP で k を走査し、必要に応じ best_k を APベースで上書きし PR 曲線を出力 | `AUPRC_PER_K.CSV`, `AP_VS_NUM_FEATURES.PNG`, `PR_CURVE_AT_BEST_K.CSV`, `PR_CURVE_AT_BEST_K.PNG` |
| 3D | MSSQ High/Low 各群で in-group LOSO を行い、群別の AUC vs k と best_k を算出 | `AUC_VS_K_BY_GROUP.PNG`, `BEST_K_BY_GROUP.JSON` |


In [None]:
# ===== Cell 3A: SHAPランキング（LOSO学習側のみ）=====

from sklearn.model_selection import LeaveOneGroupOut

# Leave-one-subject-out: every fold holds out all rows of one subject_id.
logo = LeaveOneGroupOut()
shap_frames = []
metrics_rows = []

for fold_id, (tr_idx, te_idx) in enumerate(logo.split(X_all, y_all, groups), start=1):
    X_tr, X_te = X_all.iloc[tr_idx], X_all.iloc[te_idx]
    y_tr, y_te = y_all.iloc[tr_idx], y_all.iloc[te_idx]
    if len(np.unique(y_tr)) < 2:
        raise RuntimeError(f"[Cell3A] fold{fold_id}: 学習側が単一クラス")

    model = fit_classifier(X_tr, y_tr)
    # SHAP is computed on the training rows only; the held-out subject is used
    # solely for the fold metrics below.
    abs_mean = compute_train_shap_abs_mean(model, X_tr).rename(f"fold{fold_id}")
    shap_frames.append(abs_mean)

    m = evaluate_fold(model, X_te, y_te)
    metrics_rows.append({
        "fold_id": fold_id,
        "test_subject": groups.iloc[te_idx].iloc[0],
        "roc_auc": m["roc_auc"],
        "accuracy": m["accuracy"],
    })

# One column per fold, aligned on feature name; rank by the across-fold mean.
shap_rank = pd.concat(shap_frames, axis=1)
shap_rank["mean_abs"] = shap_rank.mean(axis=1)
shap_rank = shap_rank.sort_values("mean_abs", ascending=False)

# Written under the name that Cells 3B / 3C / 3D read back
# (previously "SHAP_FEATURE_RANKING1.CSV", which no later cell reads).
shap_rank.to_csv(outpath("SHAP_FEATURE_RANKING.CSV"), encoding="utf-8-sig")
shap_rank.to_csv(outpath("SHAP_FEATURE_RANKING_LABELED.CSV"), encoding="utf-8-sig")
pd.DataFrame(metrics_rows).to_csv(outpath("LOSO_METRICS.CSV"), index=False, encoding="utf-8-sig")

print("[Cell3A] Saved SHAP ranking & LOSO metrics")

# Bar chart of all features; reversed so the top-ranked feature is drawn at the top.
plt.figure(figsize=(10, max(5, len(shap_rank)//3)))
plt.barh(shap_rank.index[::-1], shap_rank["mean_abs"][::-1])
plt.xlabel("Mean |SHAP|"); plt.ylabel("Feature"); plt.title("SHAP Ranking (All)")
plt.tight_layout(); plt.savefig(outpath("SHAP_RANKING_ALL.PNG"), dpi=300); plt.close()

# Top-8 chart with larger fonts.
topk = shap_rank.head(8).iloc[::-1]
plt.figure(figsize=(12, 7))
ax = plt.gca()
ax.barh(topk.index, topk["mean_abs"])
mx = float(topk["mean_abs"].max()) if len(topk) else 1.0
ax.set_xlim(0, mx * 1.08)  # leave some headroom to the right of the longest bar
ax.set_xlabel("Mean |SHAP value|", fontsize=26)
ax.set_ylabel("Feature", fontsize=26)
ax.tick_params(axis="both", labelsize=22)
ax.set_title("Top-8 SHAP Feature Ranking", fontsize=34, pad=10)
plt.tight_layout()
plt.savefig(outpath("SHAP_TOP8_RANKING.PNG"), dpi=300)
plt.close()


In [None]:
# ===== Cell 3A (Legacy): SHAPランキング（XGB pred_contribs版） =====
# - XGBoost専用（pred_contribs=True）でSHAP値を算出
# - 生成物は現行セルと同名ファイルに上書き保存

from sklearn.model_selection import LeaveOneGroupOut
from xgboost import XGBClassifier
import xgboost as xgb

# Leave-one-subject-out over subject_id groups.
logo = LeaveOneGroupOut()
shap_frames = []
metrics_rows = []

for fold_id, (tr_idx, te_idx) in enumerate(logo.split(X_all, y_all, groups), start=1):
    X_tr, X_te = X_all.iloc[tr_idx], X_all.iloc[te_idx]
    y_tr, y_te = y_all.iloc[tr_idx], y_all.iloc[te_idx]
    if len(np.unique(y_tr)) < 2:
        raise RuntimeError(f"[Cell3A-legacy] fold{fold_id}: 学習側が単一クラスです。")

    # Always XGBoost here, regardless of MODEL_BACKEND; the values repeat
    # those in XGB_PARAMS (Cell 0).
    model = XGBClassifier(
        n_estimators=100,
        eval_metric="logloss",
        subsample=1.0,
        colsample_bytree=1.0,
        n_jobs=1,
        tree_method="hist",
        device="cpu",
        seed=0,
        random_state=0,
    )
    model.fit(X_tr.astype(np.float32), y_tr.astype(np.int32))

    # Per-feature contributions on the training rows via the booster itself.
    dm = xgb.DMatrix(X_tr.astype(np.float32), feature_names=list(X_tr.columns))
    contribs = model.get_booster().predict(dm, pred_contribs=True)  # (n_samples, n_features+1)
    shap_vals = contribs[:, :-1]                                   # the last column is the bias term
    abs_mean = np.abs(shap_vals).mean(axis=0)
    shap_frames.append(pd.Series(abs_mean, index=X_tr.columns, name=f"fold{fold_id}"))

    m = evaluate_fold(model, X_te, y_te)
    metrics_rows.append({
        "fold_id": fold_id,
        "test_subject": groups.iloc[te_idx].iloc[0],
        "roc_auc": m["roc_auc"],
        "accuracy": m["accuracy"],
    })

# One column per fold; rank by the across-fold mean.
shap_rank = pd.concat(shap_frames, axis=1)
shap_rank["mean_abs"] = shap_rank.mean(axis=1)
shap_rank = shap_rank.sort_values("mean_abs", ascending=False)

# Overwrite the existing file names (SHAP_FEATURE_RANKING.CSV is what Cells 3B-3D read).
shap_rank.to_csv(outpath("SHAP_FEATURE_RANKING.CSV"), encoding="utf-8-sig")
shap_rank.to_csv(outpath("SHAP_FEATURE_RANKING_LABELED.CSV"), encoding="utf-8-sig")
pd.DataFrame(metrics_rows).to_csv(outpath("LOSO_METRICS.CSV"), index=False, encoding="utf-8-sig")

print("[Cell3A-legacy] SHAP_FEATURE_RANKING*.CSV を旧ロジックで上書きしました。")


In [None]:
# ===== Cell 3B: 全kで pooled ROC-AUC → best_k 決定 =====

from sklearn.model_selection import LeaveOneGroupOut
from sklearn.metrics import roc_auc_score

# Feature order comes from the saved SHAP ranking (index = feature name, best first).
rank_df = pd.read_csv(outpath("SHAP_FEATURE_RANKING.CSV"), index_col=0, encoding="utf-8-sig")
feature_order = [f for f in rank_df.index if f in X_all.columns]
if not feature_order:
    raise RuntimeError("[Cell3B] ランキング上位特徴が X_all に存在しません。")

# k runs from "all features" down to 1.
ks = list(range(len(feature_order), 0, -1))
logo = LeaveOneGroupOut()
auc_list = []

for k in ks:
    # Use the top-k features of the ranking.
    feats = feature_order[:k]
    X = X_all[feats].astype(np.float32)
    y = y_all.values
    g = groups.values

    # Collect held-out truths and scores from every fold, then score them together.
    y_true_all, proba_all = [], []
    for tr_idx, te_idx in logo.split(X, y, g):
        X_tr, X_te = X.iloc[tr_idx], X.iloc[te_idx]
        y_tr, y_te = y[tr_idx], y[te_idx]
        if len(np.unique(y_tr)) < 2:
            raise RuntimeError("[Cell3B] 学習foldが単一クラス。閾値/期間の見直しが必要です。")
        model = fit_classifier(X_tr, pd.Series(y_tr))
        proba = predict_positive_score(model, X_te)
        y_true_all.append(y_te); proba_all.append(proba)

    y_true_k = np.concatenate(y_true_all)
    proba_k = np.concatenate(proba_all)
    if len(np.unique(y_true_k)) < 2:
        raise RuntimeError("[Cell3B] pooled 真値が単一クラスで AUC 計算不可。")
    auc_list.append(float(roc_auc_score(y_true_k, proba_k)))

pd.DataFrame({"k": ks, "auc_pooled": auc_list}).to_csv(outpath("AUC_PER_K.CSV"), index=False, encoding="utf-8-sig")
auc_array = np.asarray(auc_list, dtype=float)
# nanargmax returns the first maximum; ks is descending, so ties go to the larger k.
best_idx = int(np.nanargmax(auc_array))
best_k = ks[best_idx]
best_auc = auc_list[best_idx]
print(f"[Cell3B] Best k (AUC) = {best_k}, AUC={best_auc:.3f}")
globals()["best_k"] = best_k  # used from Cell7 onwards (per the original note)

# AUC-vs-k curve with the best point highlighted.
plt.figure(figsize=(12, 7))
ax = plt.gca()
ax.plot(ks, auc_list, marker='o', linewidth=1.5)
ax.scatter([best_k], [best_auc], s=180, color="red", zorder=5)
ax.annotate(f"Max AUC = {best_auc:.3f} (k={best_k})", xy=(best_k, best_auc),
            xytext=(best_k, best_auc + 0.02), ha="center", va="bottom",
            fontsize=20, color="red")
ax.invert_xaxis()  # x axis runs from many features (left) to few (right)
ax.set_xlabel("Number of Features (k)", fontsize=26)
ax.set_ylabel("ROC AUC (pooled)", fontsize=26)
ax.tick_params(axis="both", labelsize=22)
ax.set_title("AUC vs Number of Features", fontsize=34, pad=10)
ax.grid(True, alpha=0.4)
plt.tight_layout()
plt.savefig(outpath("AUC_VS_NUM_FEATURES.PNG"), dpi=300)
plt.close()


In [None]:
# ===== Cell 3C: 全kで pooled AUPRC → best_k (AP) 決定 ＋ PR曲線 =====

from sklearn.metrics import average_precision_score, precision_recall_curve, auc
from sklearn.model_selection import LeaveOneGroupOut

# Feature order comes from the saved SHAP ranking (index = feature name, best first).
rank_df = pd.read_csv(outpath("SHAP_FEATURE_RANKING.CSV"), index_col=0, encoding="utf-8-sig")
feature_order = [f for f in rank_df.index if f in X_all.columns]
if not feature_order:
    raise RuntimeError("[Cell3C] ランキング上位特徴が X_all に存在しません。")

# k runs from "all features" down to 1.
ks = list(range(len(feature_order), 0, -1))
logo = LeaveOneGroupOut()
ap_list, prauc_list, pi_list = [], [], []

for k in ks:
    feats = feature_order[:k]
    X = X_all[feats].astype(np.float32)
    y = y_all.values
    g = groups.values

    # Collect held-out truths and scores from every fold, then score them together.
    y_true_all, proba_all = [], []
    for tr_idx, te_idx in logo.split(X, y, g):
        X_tr, X_te = X.iloc[tr_idx], X.iloc[te_idx]
        y_tr, y_te = y[tr_idx], y[te_idx]
        if len(np.unique(y_tr)) < 2:
            raise RuntimeError("[Cell3C] 学習foldが単一クラス。")
        model = fit_classifier(X_tr, pd.Series(y_tr))
        proba = predict_positive_score(model, X_te)
        y_true_all.append(y_te); proba_all.append(proba)

    y_true_k = np.concatenate(y_true_all)
    proba_k = np.concatenate(proba_all)
    if len(np.unique(y_true_k)) < 2:
        raise RuntimeError("[Cell3C] pooled 真値が単一クラスで AUPRC 計算不可。")

    ap = float(average_precision_score(y_true_k, proba_k))
    prec, rec, _ = precision_recall_curve(y_true_k, proba_k)
    prauc = float(auc(rec, prec))
    ap_list.append(ap)
    prauc_list.append(prauc)
    # Positive-class prevalence among the pooled held-out rows.
    pi_list.append(float((y_true_k == 1).mean()))

pd.DataFrame({"k": ks, "ap": ap_list, "prauc": prauc_list, "pi": pi_list}).to_csv(
    outpath("AUPRC_PER_K.CSV"), index=False, encoding="utf-8-sig"
)

ap_array = np.asarray(ap_list, dtype=float)
# nanargmax returns the first maximum; ks is descending, so ties go to the larger k.
best_idx_ap = int(np.nanargmax(ap_array))
best_k_ap = ks[best_idx_ap]
print(f"[Cell3C] Best k (AP) = {best_k_ap}, AP={ap_list[best_idx_ap]:.3f}")

# AP-vs-k curve with the best point highlighted.
plt.figure(figsize=(12, 7))
ax = plt.gca()
ax.plot(ks, ap_list, marker='o', linewidth=1.5, label="AP")
ax.scatter([best_k_ap], [ap_list[best_idx_ap]], s=150, color="red", zorder=5)
ax.invert_xaxis()
ax.set_xlabel("Number of Features (k)", fontsize=26)
ax.set_ylabel("Average Precision", fontsize=26)
ax.tick_params(axis="both", labelsize=22)
ax.set_title("AP vs Number of Features", fontsize=34, pad=10)
ax.grid(True, alpha=0.4)
plt.tight_layout()
plt.savefig(outpath("AP_VS_NUM_FEATURES.PNG"), dpi=300)
plt.close()

# Optionally replace the AUC-based best_k from Cell 3B with the AP-based one.
if USE_AP_FOR_K:
    globals()["best_k"] = best_k_ap
    print(f"[Cell3C] USE_AP_FOR_K=True → best_k を {best_k_ap} に上書き")
else:
    print("[Cell3C] USE_AP_FOR_K=False → Cell3Bの best_k を維持")

# Re-run LOSO once at the final best_k to obtain the pooled PR curve.
feats_best = feature_order[:globals()["best_k"]]
logo = LeaveOneGroupOut()
y_true_best, proba_best = [], []
for tr_idx, te_idx in logo.split(X_all[feats_best], y_all.values, groups.values):
    X_tr, X_te = X_all[feats_best].iloc[tr_idx], X_all[feats_best].iloc[te_idx]
    y_tr, y_te = y_all.values[tr_idx], y_all.values[te_idx]
    if len(np.unique(y_tr)) < 2:
        raise RuntimeError("[Cell3C] best_k fold が単一クラス。")
    model = fit_classifier(X_tr, pd.Series(y_tr))
    proba = predict_positive_score(model, X_te)
    y_true_best.append(y_te); proba_best.append(proba)

y_true_best = np.concatenate(y_true_best)
proba_best = np.concatenate(proba_best)
prec, rec, thr = precision_recall_curve(y_true_best, proba_best)
ap_best = float(average_precision_score(y_true_best, proba_best))
prauc_best = float(auc(rec, prec))
pi_best = float((y_true_best == 1).mean())

# precision/recall have one more element than thresholds: element i belongs to
# thr[i], and the final (precision=1, recall=0) point has no threshold. The NaN
# pad therefore goes at the END so each row keeps its own threshold.
pd.DataFrame({"recall": rec, "precision": prec, "threshold": np.r_[thr, np.nan]}).to_csv(
    outpath("PR_CURVE_AT_BEST_K.CSV"), index=False, encoding="utf-8-sig"
)

# PR curve with the prevalence baseline drawn as a horizontal line.
plt.figure(figsize=(10, 7))
ax = plt.gca()
ax.step(rec, prec, where="post", linewidth=1.5,
        label=f"PR (AP={ap_best:.3f}, PR-AUC={prauc_best:.3f})")
ax.axhline(pi_best, linestyle="--", linewidth=1.5, label=f"Baseline π={pi_best:.3f}", alpha=0.8)
ax.set_xlabel("Recall", fontsize=24)
ax.set_ylabel("Precision", fontsize=24)
ax.tick_params(axis="both", labelsize=20)
ax.set_title(f"Precision–Recall at best k = {globals()['best_k']}", fontsize=30, pad=10)
ax.set_xlim([0.0, 1.0]); ax.set_ylim([0.0, 1.05])
ax.grid(True, alpha=0.4)
ax.legend(fontsize=20)
plt.tight_layout()
plt.savefig(outpath("PR_CURVE_AT_BEST_K.PNG"), dpi=300)
plt.close()


In [None]:
# ===== Cell 3D: MSSQ群別 AUC vs k（in-group LOSO）=====

from sklearn.model_selection import LeaveOneGroupOut
from sklearn.metrics import roc_auc_score
import json

# Make sure the earlier cells have been run before doing any work.
req_vars = ["X_all", "y_all", "groups", "SUBJECT_META"]
missing = [v for v in req_vars if v not in globals()]
if missing:
    raise RuntimeError(f"[Cell3D] 必要変数が未定義: {missing}")
if "MSSQ_group" not in SUBJECT_META.columns:
    raise RuntimeError("[Cell3D] SUBJECT_META に MSSQ_group 列が必要です。")

# Feature order comes from the saved SHAP ranking (index = feature name, best first).
rank_df = pd.read_csv(outpath("SHAP_FEATURE_RANKING.CSV"), index_col=0, encoding="utf-8-sig")
feature_order = [f for f in rank_df.index if f in X_all.columns]
if not feature_order:
    raise RuntimeError("[Cell3D] ランキング上位特徴が存在しません。")

# Row-level "High"/"Low" label, looked up from each row's subject_id.
fair_groups = groups.astype(str).map(SUBJECT_META["MSSQ_group"])
if fair_groups.isna().any():
    raise RuntimeError("[Cell3D] SUBJECT_META に存在しない subject_id があります。")

# k runs from "all features" down to 1.
ks = list(range(len(feature_order), 0, -1))
logo = LeaveOneGroupOut()

def _in_group_auc(mask):
    """
    Pooled in-group LOSO ROC-AUC for every k in ``ks``.

    Only the rows selected by ``mask`` are used. For each k the top-k features
    of ``feature_order`` are taken, a leave-one-subject-out loop is run inside
    the selected rows, and the held-out scores of all folds are pooled into a
    single AUC.

    Returns:
        A list with exactly ``len(ks)`` entries, aligned with ``ks``. An entry
        is NaN when the selected rows, any training fold, or the pooled truth
        contain a single class only.
    """
    # Labels and subject ids do not depend on k, so slice them once.
    y = y_all.loc[mask].values
    g = groups.loc[mask].values
    has_both_classes = len(np.unique(y)) >= 2

    aucs = []
    for k in ks:
        auc_k = np.nan
        if has_both_classes:
            X = X_all.loc[mask, feature_order[:k]].astype(np.float32)
            y_true_all, proba_all = [], []
            for tr_idx, te_idx in logo.split(X, y, g):
                y_tr = y[tr_idx]
                if len(np.unique(y_tr)) < 2:
                    # A single-class training fold cannot be fitted; this k
                    # stays NaN and the next k is tried.
                    break
                model = fit_classifier(X.iloc[tr_idx], pd.Series(y_tr))
                proba_all.append(predict_positive_score(model, X.iloc[te_idx]))
                y_true_all.append(y[te_idx])
            else:
                # Reached only when every fold was trained (no break).
                y_true = np.concatenate(y_true_all)
                if len(np.unique(y_true)) > 1:
                    auc_k = float(roc_auc_score(y_true, np.concatenate(proba_all)))
        # Exactly one value per k keeps the result aligned with ``ks``.
        aucs.append(auc_k)
    return aucs

# Row masks for the two MSSQ groups, then one AUC-vs-k curve per group.
mask_H = (fair_groups == "High")
mask_L = (fair_groups == "Low")
auc_high = _in_group_auc(mask_H)
auc_low = _in_group_auc(mask_L)

def _best_k(ks_list, auc_vals):
    """Pick the smallest k whose AUC ties (within tolerance) with the maximum.

    Returns ``(k, max_auc)``, or ``(None, nan)`` when no AUC value is finite.
    NaN entries are ignored when locating the maximum.
    """
    scores = np.asarray(auc_vals, dtype=float)
    if not np.isfinite(scores).any():
        return None, np.nan
    top = np.nanmax(scores)
    is_top = np.isclose(scores, top, rtol=1e-6, atol=1e-12)
    tied_ks = np.array(ks_list)[is_top]
    return int(tied_ks.min()), float(top)

best_k_high, best_auc_high = _best_k(ks, auc_high)
best_k_low, best_auc_low = _best_k(ks, auc_low)

# Persist the per-group choice.
# NOTE(review): when a group has no finite AUC the value written is a float NaN,
# which json.dump emits as the bare token NaN by default — confirm consumers accept it.
with open(outpath("BEST_K_BY_GROUP.JSON"), "w", encoding="utf-8") as f:
    json.dump({
        "BEST_K_HIGH": best_k_high,
        "BEST_AUC_HIGH": best_auc_high,
        "BEST_K_LOW": best_k_low,
        "BEST_AUC_LOW": best_auc_low,
    }, f, ensure_ascii=False, indent=2)

# Both group curves on one axis; the best point of each group is marked when it exists.
plt.figure(figsize=(12, 7))
ax = plt.gca()
ax.plot(ks, auc_high, marker="o", label="MSSQ High")
ax.plot(ks, auc_low, marker="s", label="MSSQ Low")
if best_k_high is not None:
    ax.scatter([best_k_high], [best_auc_high], s=160, zorder=5)
    ax.annotate(f"High max={best_auc_high:.3f} (k={best_k_high})",
                xy=(best_k_high, best_auc_high),
                xytext=(best_k_high, best_auc_high + 0.02),
                ha="center", va="bottom", fontsize=18)
if best_k_low is not None:
    ax.scatter([best_k_low], [best_auc_low], s=160, zorder=5)
    ax.annotate(f"Low max={best_auc_low:.3f} (k={best_k_low})",
                xy=(best_k_low, best_auc_low),
                xytext=(best_k_low, best_auc_low + 0.02),
                ha="center", va="bottom", fontsize=18)

ax.invert_xaxis()  # x axis runs from many features (left) to few (right)
ax.set_xlabel("Number of Features (k)")
ax.set_ylabel("ROC AUC (pooled, in-group LOSO)")
ax.set_title("AUC vs Number of Features by MSSQ Group")
ax.grid(True, alpha=0.4)
ax.legend(loc="best")
plt.tight_layout()
plt.savefig(outpath("AUC_VS_K_BY_GROUP.PNG"), dpi=300)
plt.close()

print(f"[Cell3D] BEST_K_HIGH={best_k_high}, BEST_K_LOW={best_k_low}")


# ===== Section: 固定ハイパラ・診断 =====

| Cell | 目的 | 主な出力 |
| ---- | ---- | -------- |
| 4A | best_k を確定し、閾値探索・検証ポリシー・スケール設定など実験パラメータを一括定義 | ー（設定ログのみ） |
| 4B | 被験者ごとのラベル分布（件数/陽性/陰性/MSSQ群）を集計 | `SUBJECT_LABEL_STATS.CSV` |
| 4C | 外側LOSOの学習被験者から inner-LOSO 検証者を1名ずつ回す fold リストを構築 | 関数 `choose_inner_folds_loso` |


In [None]:
# ===== Cell 4A: 実験設定（best_k fix / 閾値探索ハイパラ）=====

# best_k must already exist (set by Cell 3B / 3C). An explicit raise is used
# instead of assert so the check still runs under ``python -O`` and matches the
# precondition checks in Cell 3D.
if "best_k" not in globals():
    raise RuntimeError("[Cell4A] best_k が未定義です。特徴選択セクションを先に実行してください。")

USE_GLOBAL_BESTK: bool = True
BEST_K: int = int(best_k)

THRESH_COARSE_STEP: float = 0.01   # coarse search step
THRESH_FINE_STEP:   float = 0.001  # fine search step
THRESH_MARGIN:      float = 0.03   # +/- width of the fine search range
THRESH_SEARCH_MODE: str = "exact"  # threshold search mode (e.g. "exact")
THRESH_WG_MODE:     str = "min"    # worst-group objective ("min": e.g. maximise min(TPR/TNR))
THRESH_CALIB:       str = "none"   # calibration method ("none" / "platt" / "isotonic", ...)

VAL_RETRY_MAX: int = 30                # max retries when drawing the inner-validation HL pair
VAL_REQUIRE_BOTH_CLASSES: bool = True  # require both positive and negative rows in validation data
VAL_MIN_SAMPLES: Optional[int] = None  # lower bound on validation sample count (None = no bound)

def _scale_pos_weight_from_y(y_binary: np.ndarray) -> float:
    """Return the negative-to-positive count ratio of a binary label array.

    Counts entries equal to 1 (positives) and equal to 0 (negatives).
    When there are no positives, a warning is printed and 1.0 is returned.
    """
    n_pos = int((y_binary == 1).sum())
    n_neg = int((y_binary == 0).sum())
    if n_pos > 0:
        return float(n_neg / n_pos)
    print("[Cell4A][WARN] y に陽性が存在しないため scale_pos_weight=1.0")
    return 1.0

# Echo the experiment settings defined in this cell, plus XGB_PARAMS and
# SEED_BASE, so the run log records the configuration in use.
# NOTE(review): XGB_PARAMS is not defined in this cell — it must already exist
# in the notebook namespace, otherwise the corresponding print raises NameError.
print("[Cell4A] Experiment settings:")
print(f"  BEST_K = {BEST_K} (USE_GLOBAL_BESTK={USE_GLOBAL_BESTK})")
print(f"  Threshold search: coarse={THRESH_COARSE_STEP}, fine={THRESH_FINE_STEP}, margin=±{THRESH_MARGIN}")
print(f"  Search mode={THRESH_SEARCH_MODE}, WG mode={THRESH_WG_MODE}, calib={THRESH_CALIB}")
print(f"  HL validator: retry_max={VAL_RETRY_MAX}, both_classes={VAL_REQUIRE_BOTH_CLASSES}, min_samples={VAL_MIN_SAMPLES}")
print(f"  XGB_PARAMS = {XGB_PARAMS}")
print(f"  SEED_BASE = {SEED_BASE}")


In [None]:
# ===== Cell 4B: 被験者ラベル分布の集計 =====

# Subject IDs are handled as strings from here on; labels as ints.
if groups.dtype != "O":
    groups = groups.astype(str)
y_bin = y_all.astype(int)

# One row per subject: total sample count and number of positive labels.
_label_frame = pd.DataFrame({"subject_id": groups.values, "label": y_bin.values})
_per_subject = _label_frame.groupby("subject_id")["label"]
subj_stats = _per_subject.agg(n_total="count", pos="sum").reset_index()
subj_stats["neg"] = subj_stats["n_total"] - subj_stats["pos"]

# Attach the MSSQ group when subject metadata is available.
# reset_index() must yield a "subject_id" column for the merge key to exist.
if "SUBJECT_META" in globals() and "MSSQ_group" in SUBJECT_META.columns:
    _mssq_lookup = SUBJECT_META[["MSSQ_group"]].reset_index()
    subj_stats = subj_stats.merge(_mssq_lookup, on="subject_id", how="left")

SUBJECT_LABEL_STATS = subj_stats.set_index("subject_id").sort_index()
label_stats_path = outpath("SUBJECT_LABEL_STATS.CSV")
SUBJECT_LABEL_STATS.to_csv(label_stats_path, encoding="utf-8-sig")
print(f"[Cell4B] SUBJECT_LABEL_STATS saved -> {label_stats_path} (subjects={len(SUBJECT_LABEL_STATS)})")

display(SUBJECT_LABEL_STATS.head())


In [None]:
# ===== Cell 4C: inner-LOSO folds builder =====

from typing import List

def choose_inner_folds_loso(train_subject_ids: List[str]) -> List[List[str]]:
    """Build inner leave-one-subject-out folds from the outer training subjects.

    IDs are converted to ``str`` and de-duplicated, then ordered by
    (length, lexicographic value). Each fold holds exactly one validation
    subject: ``[[sid1], [sid2], ..., [sidN]]``.

    Raises:
        RuntimeError: if ``train_subject_ids`` is not a list/tuple, or is empty.
    """
    if not isinstance(train_subject_ids, (list, tuple)):
        raise RuntimeError("[Cell4C] train_subject_ids は list/tuple である必要があります。")

    distinct_ids = {str(sid) for sid in train_subject_ids}
    if not distinct_ids:
        raise RuntimeError("[Cell4C] train_subject_ids が空です。")

    # Shorter IDs first, ties broken lexicographically.
    ordered = sorted(distinct_ids, key=lambda sid: (len(sid), sid))
    folds = [[sid] for sid in ordered]
    print(f"[Cell4C] inner-LOSO folds: {len(folds)} splits -> val subjects = {', '.join(ordered)}")
    return folds

print("[Cell4C] Function choose_inner_folds_loso ready.")


# ===== Section: モデリング本体 =====

| Cell | 位置づけ | 目的 | 主な出力 |
| ---- | -------- | ---- | -------- |
| 5A | メイン実験 | inner-LOSOで τ を探索し、outer-LOSOで Single / Group-GLOBAL / WG-1D / WG-2D の予測と評価を取得 | `GROUP_AWARE_THRESH_BY_FOLD.CSV`, `GROUP_AWARE_PREDICTIONS.CSV`, `GROUP_AWARE_SUMMARY.CSV` |
| 5B | 派生1 (post-hoc) | outer予測のみを使って post-hoc exact thresholding を実施 | `PREDICTIONS_OUTERONLY.CSV`, `FINAL_THRESHOLDS_POSTHOC.CSV`, `METRICS_POSTHOC.CSV` |
| 5C | 派生2 (群別best_k) | MSSQ High/Low で best_k を変えて Single τ を最適化し、群別指標と混同行列を出力 | `METRICS_SINGLE_BY_GROUPK.CSV`, `CONFMAT_SINGLE_GROUPK_*.png` |
| 5D | 派生3 (被験者ブートストラップ) | best_k の pooled 予測(OoF)を使い、被験者単位ブートストラップで AUC のCIを推定 | `OOF_PRED_BESTK.CSV`, `AUC_BOOTSTRAP_SUBJECT.csv`, `AUC_BOOTSTRAP_SUMMARY.csv`, `AUC_BOOTSTRAP_{HIST,ECDF}.png` |


In [None]:
# ===== Cell 5A: inner-LOSO τ最適化 → outer予測・評価 =====
# 前提: Cell1〜4 まで実行済み（X_all, y_all, groups, SUBJECT_META, BEST_K, choose_inner_folds_loso などが存在）

import numpy as np
import pandas as pd
import sklearn.metrics as skm
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.metrics import roc_auc_score

# --- Required-object check ---
# Fail fast when any object produced by earlier cells is missing.
req = ["X_all", "y_all", "groups", "SUBJECT_META", "BEST_K", "METRIC", "METRIC_NAME", "choose_inner_folds_loso"]
missing = [v for v in req if v not in globals()]
if missing:
    raise RuntimeError(f"[Cell5A] 未定義の変数/関数があります: {missing}")
if "MSSQ_group" not in SUBJECT_META.columns and "MSSQ_group" not in SUBJECT_META.index.names:
    raise RuntimeError("[Cell5A] SUBJECT_META に MSSQ_group 列（またはインデックス）が必要です。")

X_base = X_all.astype(np.float32)
y_base = y_all.astype(int)
g_base = groups.astype(str)

# MSSQ group mapping (subject_id -> "High"/"Low").
# The check above accepts MSSQ_group as an index level, so promote it to a
# column first; otherwise the column lookup below would raise KeyError.
_meta = SUBJECT_META
if "MSSQ_group" not in _meta.columns:
    _meta = _meta.reset_index("MSSQ_group")
if "subject_id" in _meta.columns:
    _mssq = _meta.set_index("subject_id")["MSSQ_group"]
else:
    _mssq = _meta["MSSQ_group"]
# Keys are coerced to str because g_base holds string IDs; a non-string
# metadata index would otherwise leave every subject unmapped.
mapper = {str(sid): grp for sid, grp in _mssq.astype(str).items()}
fair_groups = g_base.map(mapper).str.strip().str.lower().map({"high": "High", "low": "Low"})
_unmapped = fair_groups.isna()
if _unmapped.any():
    # Report the subject IDs themselves (not row index labels) so the message is actionable.
    raise RuntimeError(f"[Cell5A] MSSQ_group 未割当ID: {sorted(g_base[_unmapped].unique().tolist())}")

# Feature selection: top-K features in SHAP ranking order.
rank_df = pd.read_csv(outpath("SHAP_FEATURE_RANKING.CSV"), index_col=0, encoding="utf-8-sig")
feature_order = [f for f in rank_df.index if f in X_base.columns]
feats_k = feature_order[:BEST_K]
if not feats_k:
    # Without this guard the model fit below would fail on a zero-column frame.
    raise RuntimeError("[Cell5A] SHAP ランキングと X に共通する特徴がありません（BEST_K と SHAP_FEATURE_RANKING.CSV を確認してください）。")
if len(feats_k) < BEST_K:
    print(f"[Cell5A][WARN] ランキング上位に X に無い特徴が混在 → {len(feats_k)} 列で実行。")
X_k = X_base[feats_k]

# ---------- 内部ユーティリティ ----------
def _prep_sorted(scores, labels):
    """Sort scores in descending order and precompute cumulative class counts.

    Returns a 6-tuple ``(sorted_scores, sorted_labels, cum_pos, cum_neg, P, N)``
    where ``cum_pos[i]`` / ``cum_neg[i]`` count the labels equal to 1 / 0 among
    the ``i + 1`` highest-scoring samples, and ``P`` / ``N`` are the totals.
    """
    descending = np.argsort(-scores)
    ranked_scores = scores[descending]
    ranked_labels = labels[descending]
    is_pos = ranked_labels == 1
    is_neg = ranked_labels == 0
    return (
        ranked_scores,
        ranked_labels,
        np.cumsum(is_pos),
        np.cumsum(is_neg),
        int(is_pos.sum()),
        int(is_neg.sum()),
    )

def _conf_stats(tab, taus):
    """Return (TP, FP, FN, TN) arrays for each threshold in ``taus``.

    ``tab`` is the tuple produced by ``_prep_sorted``. A sample is counted as
    predicted positive when its score is >= the threshold.
    """
    thresholds = np.atleast_1d(taus).astype(float)
    desc_scores, _, cum_pos, cum_neg, n_pos, n_neg = tab
    # Scores are descending, so their negation is ascending; side="right"
    # yields the number of samples with score >= threshold.
    n_flagged = np.searchsorted(-desc_scores, -thresholds, side="right")
    any_flagged = n_flagged > 0
    last = n_flagged - 1
    TP = np.where(any_flagged, cum_pos[last], 0)
    FP = np.where(any_flagged, cum_neg[last], 0)
    return TP, FP, n_pos - TP, n_neg - FP

def _score_from_conf(metric, TP, FP, FN, TN):
    """Compute balanced accuracy ("ba") or F1 ("f1") from confusion counts.

    ``metric`` is case-insensitive. Counts may be scalars or arrays; any ratio
    whose denominator is not positive is reported as 0.

    Raises:
        ValueError: for any other metric name.
    """
    metric = metric.lower()
    tp, fp, fn, tn = (np.asarray(v, float) for v in (TP, FP, FN, TN))

    def _ratio(num, den):
        # Division that yields 0 wherever the denominator is not positive.
        return np.divide(num, den, out=np.zeros_like(num), where=den > 0)

    if metric == "ba":
        sensitivity = _ratio(tp, tp + fn)
        specificity = _ratio(tn, tn + fp)
        return 0.5 * (sensitivity + specificity)
    if metric == "f1":
        precision = _ratio(tp, tp + fp)
        recall = _ratio(tp, tp + fn)
        return _ratio(2 * precision * recall, precision + recall)
    raise ValueError(f"[Cell5A] metric '{metric}' 未対応")

def _best_single_threshold(tab):
    """Return the threshold that maximises METRIC on one sorted score table.

    Candidates are every distinct score (highest first), plus one value just
    above the maximum and one just below the minimum. The first candidate
    reaching the best score is returned, so ties go to the highest threshold.
    """
    distinct = np.unique(tab[0])
    just_above = np.nextafter(distinct.max(), np.inf)
    just_below = np.nextafter(distinct.min(), -np.inf)
    cands = np.concatenate([[just_above], distinct[::-1], [just_below]])
    metric_vals = _score_from_conf(METRIC, *_conf_stats(tab, cands))
    return float(cands[int(np.nanargmax(metric_vals))])

def _grid_search_group_thresholds_exact(sH, yH, sL, yL):
    """Exhaustively search (tauH, tauL) threshold pairs for the High/Low groups.

    Args:
        sH, yH: scores and binary labels of the High group.
        sL, yL: scores and binary labels of the Low group.

    Returns:
        ``(best_pooled, best_wg)``:
        * ``best_pooled`` — dict with keys "BA", "tauH", "tauL": the pair
          maximising balanced accuracy of the two groups' summed confusion
          counts (always "ba", independent of METRIC).
        * ``best_wg`` — dict with keys "WG", "tauH", "tauL": the pair
          maximising min(score_High, score_Low), each scored with METRIC.
    """
    tabH = _prep_sorted(sH, yH); tabL = _prep_sorted(sL, yL)
    uniqH = np.unique(tabH[0]); uniqL = np.unique(tabL[0])
    # Candidates per group: just above the max score, every distinct score in
    # descending order, and just below the min score.
    candsH = np.concatenate([[np.nextafter(uniqH.max(), np.inf)], uniqH[::-1],
                             [np.nextafter(uniqH.min(), -np.inf)]])
    candsL = np.concatenate([[np.nextafter(uniqL.max(), np.inf)], uniqL[::-1],
                             [np.nextafter(uniqL.min(), -np.inf)]])
    TP_H, FP_H, FN_H, TN_H = _conf_stats(tabH, candsH)
    TP_L, FP_L, FN_L, TN_L = _conf_stats(tabL, candsL)
    # Per-group metric at every candidate (used for the worst-group objective).
    scrH = _score_from_conf(METRIC, TP_H, FP_H, FN_H, TN_H)
    scrL = _score_from_conf(METRIC, TP_L, FP_L, FN_L, TN_L)

    best_pooled = {"BA": -np.inf, "tauH": None, "tauL": None}
    best_wg = {"WG": -np.inf, "tauH": None, "tauL": None}

    # Fix tauH, then evaluate every tauL at once (vectorised over candsL).
    # The strict ">" keeps the first-found optimum when scores tie.
    for i, tauH in enumerate(candsH):
        TP_all = TP_H[i] + TP_L
        FP_all = FP_H[i] + FP_L
        FN_all = FN_H[i] + FN_L
        TN_all = TN_H[i] + TN_L
        BA_all = _score_from_conf("ba", TP_all, FP_all, FN_all, TN_all)
        j = int(np.nanargmax(BA_all))
        if BA_all[j] > best_pooled["BA"]:
            best_pooled.update({"BA": float(BA_all[j]),
                                "tauH": float(tauH), "tauL": float(candsL[j])})
        # Worst-group objective: the weaker of the two per-group scores.
        WG_vec = np.minimum(float(scrH[i]), scrL)
        jwg = int(np.nanargmax(WG_vec))
        if WG_vec[jwg] > best_wg["WG"]:
            best_wg.update({"WG": float(WG_vec[jwg]),
                            "tauH": float(tauH), "tauL": float(candsL[jwg])})
    return best_pooled, best_wg

def _fold_score(y_true, y_pred):
    """Score one fold's hard predictions with METRIC and return a float."""
    cm = skm.confusion_matrix(y_true, y_pred, labels=[0, 1])
    tn, fp, fn, tp = cm.ravel()
    return float(_score_from_conf(METRIC, tp, fp, fn, tn))

# ---------- outer LOSO ----------
# For every held-out test subject:
#   1. run an inner LOSO over the remaining subjects to pick thresholds,
#   2. refit on all training subjects and score the held-out subject,
#   3. apply the single / group-BA / group-WG thresholds and record results.
logo_outer = LeaveOneGroupOut()
rows, pred_rows = [], []

for fold_id, (tr_idx, te_idx) in enumerate(logo_outer.split(X_k, y_base.values, g_base.values), start=1):
    train_mask = pd.Series(False, index=g_base.index)
    test_mask  = pd.Series(False, index=g_base.index)
    train_mask.iloc[tr_idx] = True
    test_mask.iloc[te_idx] = True
    test_sid = g_base.iloc[te_idx].iloc[0]

    inner_ids = sorted(g_base[train_mask].unique())
    inner_folds = choose_inner_folds_loso(inner_ids)

    tau_single_list = []
    # Inner out-of-fold predictions, pooled across inner folds. Each inner fold
    # validates on ONE subject, and a subject belongs to exactly one MSSQ group,
    # so a per-fold High/Low search can never see both groups. The group-aware
    # search is therefore run once on the pooled out-of-fold predictions.
    oof_scores, oof_labels, oof_groups = [], [], []

    for inner_val in inner_folds:
        val_mask = g_base.isin(inner_val) & train_mask
        inner_train = train_mask & (~val_mask)
        if not inner_train.any() or not val_mask.any():
            continue

        model_inner = fit_classifier(X_k[inner_train], y_base[inner_train])
        scores_val = predict_positive_score(model_inner, X_k[val_mask]).astype(float)
        y_val = y_base[val_mask].to_numpy()
        grp_val = fair_groups[val_mask].to_numpy()

        # Single threshold: best threshold on this validation subject.
        tau_single_list.append(_best_single_threshold(_prep_sorted(scores_val, y_val)))

        oof_scores.append(np.asarray(scores_val, dtype=float))
        oof_labels.append(y_val)
        oof_groups.append(grp_val)

    if not tau_single_list:
        raise RuntimeError(f"[Cell5A] fold{fold_id}: inner validation が空です。")

    # Single threshold = median over the inner validation subjects.
    tau_single = float(np.nanmedian(tau_single_list))

    # Group-aware thresholds default to the single threshold and are replaced
    # when the pooled inner predictions contain both MSSQ groups.
    tauH_BA = tauL_BA = tau_single
    tauH_WG = tauL_WG = tau_single
    s_oof = np.concatenate(oof_scores)
    y_oof = np.concatenate(oof_labels)
    g_oof = np.concatenate(oof_groups)
    in_high = g_oof == "High"
    in_low = g_oof == "Low"
    if in_high.any() and in_low.any():
        best_pooled, best_wg = _grid_search_group_thresholds_exact(
            s_oof[in_high], y_oof[in_high], s_oof[in_low], y_oof[in_low]
        )
        tauH_BA, tauL_BA = best_pooled["tauH"], best_pooled["tauL"]
        tauH_WG, tauL_WG = best_wg["tauH"], best_wg["tauL"]
    else:
        print(f"[Cell5A][WARN] fold{fold_id}: inner 検証に High/Low の両群が揃わないため群別 τ は tau_single を使用")

    # Refit on every training subject and score the held-out subject.
    model_outer = fit_classifier(X_k[train_mask], y_base[train_mask])
    scores_test = predict_positive_score(model_outer, X_k[test_mask]).astype(float)
    y_test = y_base[test_mask]
    grp_test = fair_groups[test_mask].to_numpy()

    # Hard predictions: one shared threshold vs. per-group thresholds.
    y_pred_single   = (scores_test >= tau_single).astype(int)
    y_pred_group_BA = (scores_test >= np.where(grp_test=="High", tauH_BA, tauL_BA)).astype(int)
    y_pred_group_WG = (scores_test >= np.where(grp_test=="High", tauH_WG, tauL_WG)).astype(int)

    rows.append({
        "fold_id": fold_id,
        "test_id": test_sid,
        "best_k": BEST_K,
        "tau_single": tau_single,
        "tau_high_BA": tauH_BA, "tau_low_BA": tauL_BA,
        "tau_high_WG": tauH_WG, "tau_low_WG": tauL_WG,
        "BA_single": _fold_score(y_test, y_pred_single),
        "BA_group": _fold_score(y_test, y_pred_group_BA),
        "BA_group_WG": _fold_score(y_test, y_pred_group_WG),
        "n_test": len(y_test),
    })

    # One prediction row per test sample.
    for yy, ss, gg, ys, yb, yw in zip(y_test, scores_test, grp_test,
                                      y_pred_single, y_pred_group_BA, y_pred_group_WG):
        pred_rows.append({
            "fold_id": fold_id,
            "test_id": test_sid,
            "y_true": int(yy),
            "proba": float(ss),
            "group": str(gg),
            "y_pred_single": int(ys),
            "y_pred_group_BA": int(yb),
            "y_pred_group_WG": int(yw),
        })

# Persist the per-fold thresholds/scores and the per-sample predictions.
df_fold = pd.DataFrame(rows)
df_pred = pd.DataFrame(pred_rows)
for _frame, _fname in (
    (df_fold, "GROUP_AWARE_THRESH_BY_FOLD.CSV"),
    (df_pred, "GROUP_AWARE_PREDICTIONS.CSV"),
):
    _frame.to_csv(outpath(_fname), index=False, encoding="utf-8-sig")

# Pooled (all outer folds concatenated) ROC AUC on the raw scores.
y_pool = df_pred["y_true"].to_numpy()
s_pool = df_pred["proba"].to_numpy()
auc_pool = float(roc_auc_score(y_pool, s_pool))

def _pooled_ba(col):
    """Balanced accuracy of prediction column ``col`` over all pooled samples."""
    predicted = df_pred[col].to_numpy().astype(int)
    cm = skm.confusion_matrix(y_pool, predicted, labels=[0, 1])
    tn, fp, fn, tp = cm.ravel()
    return float(_score_from_conf("ba", tp, fp, fn, tn))

# Single-row summary; key order defines the CSV column order.
_truth = df_pred["y_true"]
summary = {"best_k": BEST_K, "AUC_pooled": auc_pool}
summary.update({
    f"BA_pooled_{_tag}": _pooled_ba(f"y_pred_{_tag}")
    for _tag in ("single", "group_BA", "group_WG")
})
summary.update({
    "metric": METRIC_NAME,
    "n_samples": len(df_pred),
    "n_pos": int((_truth == 1).sum()),
    "n_neg": int((_truth == 0).sum()),
})
pd.DataFrame([summary]).to_csv(outpath("GROUP_AWARE_SUMMARY.CSV"), index=False, encoding="utf-8-sig")
print(f"[Cell5A] Done: outer folds={len(df_fold)}, pooled AUC={auc_pool:.3f}")


In [None]:
# ===== Cell 5C: MSSQ群別 best_k で Single τ を最適化 =====
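# Depends on: Cell3D (BEST_K_BY_GROUP.JSON); Cell5A (feature_order, fair_groups, _score_from_conf, LeaveOneGroupOut);
#             plus _best_tau and confusion_matrix, which must be defined/imported by an earlier cell.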

import json
import matplotlib.pyplot as plt

# Load the per-group best k written by Cell3D; fall back to the shared BEST_K
# when the file is missing or an entry is absent / null.
bestk_json = outpath("BEST_K_BY_GROUP.JSON")
if os.path.exists(bestk_json):
    with open(bestk_json, "r", encoding="utf-8") as f:
        best_k_meta = json.load(f)
    _k_high = best_k_meta.get("BEST_K_HIGH")
    _k_low = best_k_meta.get("BEST_K_LOW")
    # A JSON null loads as None, and dict.get's default does not apply to a
    # present-but-null key, so handle None explicitly instead of int(None).
    if _k_high is None or _k_low is None:
        print("[Cell5C][WARN] BEST_K_BY_GROUP.JSON に未設定の群があるため、その群は共通BEST_Kを使用")
    BEST_K_HIGH = int(BEST_K if _k_high is None else _k_high)
    BEST_K_LOW  = int(BEST_K if _k_low is None else _k_low)
else:
    BEST_K_HIGH = BEST_K_LOW = BEST_K
    print("[Cell5C][WARN] BEST_K_BY_GROUP.JSON が無いため両群で共通BEST_Kを使用")

def _feats_for_k(k):
    """Return the first ``k`` ranked features, with ``k`` clamped to [1, len(feature_order)]."""
    n_keep = min(int(k), len(feature_order))
    if n_keep < 1:
        n_keep = 1
    return feature_order[:n_keep]

def _single_tau_preds(feats, group_mask):
    """Collect leave-one-subject-out scores restricted to one MSSQ group.

    Args:
        feats: feature column names to select from ``X_all``.
        group_mask: boolean array with one entry per row of ``X_all``; only
            rows where it is True are used (both for training and testing,
            i.e. an in-group LOSO). ``None`` keeps every row.

    Returns:
        ``(y_true, proba, subject_ids)`` — concatenated over all usable folds.

    Raises:
        ValueError: if ``group_mask`` does not have one entry per row.
        RuntimeError: if no fold could be evaluated.
    """
    X = X_all[feats].astype(np.float32)
    y = np.asarray(y_all.values)
    g = np.asarray(groups.values)

    # Apply the group restriction before splitting; without it every call
    # would score all subjects regardless of the requested group.
    if group_mask is not None:
        keep = np.asarray(group_mask, dtype=bool)
        if keep.shape != (len(X),):
            raise ValueError(
                f"[Cell5C] group_mask の長さ ({keep.shape}) が X_all の行数 ({len(X)}) と一致しません。"
            )
        row_pos = np.flatnonzero(keep)
        X, y, g = X.iloc[row_pos], y[row_pos], g[row_pos]

    logo = LeaveOneGroupOut()
    y_true_all, proba_all, group_all = [], [], []
    for tr_idx, te_idx in logo.split(X, y, g):
        X_tr, X_te = X.iloc[tr_idx], X.iloc[te_idx]
        y_tr, y_te = y[tr_idx], y[te_idx]
        # A classifier cannot be fit on a single-class training set.
        if len(np.unique(y_tr)) < 2:
            continue
        model = fit_classifier(X_tr, pd.Series(y_tr))
        proba = predict_positive_score(model, X_te)
        y_true_all.append(y_te); proba_all.append(proba); group_all.append(g[te_idx])

    if not y_true_all:
        # np.concatenate([]) would otherwise fail with an opaque message.
        raise RuntimeError("[Cell5C] 有効な LOSO fold がありません（学習データに両クラスが必要です）。")
    return np.concatenate(y_true_all), np.concatenate(proba_all), np.concatenate(group_all)

# --- High group: scores with its own best k, then its own single threshold ---
# NOTE(review): _best_tau is not defined in this cell; it is called as
# _best_tau(scores, labels) and unpacked as (threshold, score) — confirm.
_mask_high = fair_groups.values == "High"
y_true_high, proba_high, group_high = _single_tau_preds(_feats_for_k(BEST_K_HIGH), _mask_high)
tau_single_high, BA_high = _best_tau(proba_high, y_true_high)
y_pred_high = (proba_high >= tau_single_high).astype(int)

# --- Low group: same procedure ---
_mask_low = fair_groups.values == "Low"
y_true_low, proba_low, group_low = _single_tau_preds(_feats_for_k(BEST_K_LOW), _mask_low)
tau_single_low, BA_low = _best_tau(proba_low, y_true_low)
y_pred_low = (proba_low >= tau_single_low).astype(int)

# Stack both groups into one table for the overall confusion matrix.
_group_tags = np.concatenate([
    np.repeat("High", len(y_true_high)),
    np.repeat("Low", len(y_true_low)),
])
df_single = pd.DataFrame({
    "y_true": np.concatenate([y_true_high, y_true_low]),
    "y_pred": np.concatenate([y_pred_high, y_pred_low]),
    "group": _group_tags,
})
# NOTE(review): bare confusion_matrix must be imported by an earlier cell.
cm_all = confusion_matrix(df_single["y_true"], df_single["y_pred"], labels=[0,1])
TN, FP, FN, TP = cm_all.ravel()
BA_all = _score_from_conf(METRIC, TP, FP, FN, TN)

# Single-row summary; key order defines the CSV column order.
summary = dict(
    best_k_high=BEST_K_HIGH, tau_high=tau_single_high, BA_high=BA_high,
    best_k_low=BEST_K_LOW, tau_low=tau_single_low, BA_low=BA_low,
    BA_all=BA_all,
)
pd.DataFrame([summary]).to_csv(outpath("METRICS_SINGLE_BY_GROUPK.CSV"), index=False, encoding="utf-8-sig")
print(f"[Cell5C] Saved METRICS_SINGLE_BY_GROUPK (BA_all={BA_all:.3f})")

def _draw_cm(ax, cm, title):
    """Draw a 2x2 confusion matrix on ``ax`` as a labelled heat map.

    ``cm`` is unpacked via ``ravel()`` in the order TN, FP, FN, TP. Each cell
    shows its name and count; the text turns white on cells whose count
    exceeds 60% of the colour-scale maximum.
    """
    TN, FP, FN, TP = cm.ravel()
    grid = np.array([[TN, FP], [FN, TP]])
    ceiling = max(grid.max(), 1)  # at least 1, so vmin=0 and vmax never coincide
    ax.imshow(grid, cmap="Blues", vmin=0, vmax=ceiling)

    cell_names = (("TN", "FP"), ("FN", "TP"))
    for (row, col), count in np.ndenumerate(grid):
        text_color = "white" if count > 0.6*ceiling else "black"
        ax.text(col, row, f"{cell_names[row][col]} {count}", ha="center", va="center",
                fontsize=22, fontweight="bold", color=text_color)

    ax.set_xticks([0,1])
    ax.set_xticklabels(["Pred: Non-Sick","Pred: Sick"])
    ax.set_yticks([0,1])
    ax.set_yticklabels(["True: Non-Sick","True: Sick"], rotation=90, va="center")
    ax.set_title(title)
    ax.grid(False)

# Overall (High + Low) confusion matrix figure, saved as PNG and then closed.
_fig_all = plt.figure(figsize=(9,9))
_cm_for_plot = confusion_matrix(df_single["y_true"], df_single["y_pred"], labels=[0,1])
_draw_cm(_fig_all.gca(), _cm_for_plot, f"Single τ (All) — {METRIC_NAME}={BA_all:.3f}")
plt.tight_layout()
plt.savefig(outpath("CONFMAT_SINGLE_GROUPK_ALL.png"), dpi=300)
plt.close(_fig_all)

print("[Cell5C] Confusion matrix saved (All). High/Low個別も必要なら追加描画してください。")
