In [63]:
# ===== Cell 0: 学習モデルのレジストリ化（XGB / RF / SVM ほか拡張可能） =====
from __future__ import annotations
from typing import Callable, Dict, Any, Optional
import xgboost as xgb
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

# ★ 2値化の閾値（例：FMS >= 1 を Sick=1）
FMS_THRESHOLD = 1  # ← 0,1,2... に変更可

# ★ EPOCH長（秒）：30 / 60 / 120 から選択
EPOCH_LEN = 30     # ← ここを 60, 120 に切替え可
if EPOCH_LEN not in (30, 60, 120):
    raise ValueError("EPOCH_LEN は 30/60/120 のいずれかで指定すること。")


# --- 切替スイッチ（必要に応じて "xgb"/"rf" / "svm" に変更） ---
MODEL_BACKEND: str = "xgb"
USE_AP_FOR_K = bool(globals().get("USE_AP_FOR_K", False))  # 既定=True（APでbest_kを上書き）

METRIC = "f1"
METRIC_NAME = "f1"


# --- 既定パラメータ（必要に応じてここだけ触れば全セルに反映） ---
SEED_BASE = 20251101

XGB_PARAMS: Dict[str, Any] = dict(
    n_estimators=100,
    eval_metric="logloss",
    subsample=1.0,
    colsample_bytree=1.0,
    n_jobs=1, # 決定論的
    tree_method="hist",
    device="cpu",
    seed=0, random_state=0
)

RF_PARAMS: Dict[str, Any] = dict(
    n_estimators=439,
    max_depth=14,
    min_samples_split=10,
    min_samples_leaf=4,
    max_features="sqrt",
    bootstrap=False,
    class_weight="balanced",
    random_state=SEED_BASE,
    n_jobs=1,
)

SVM_PARAMS: Dict[str, Any] = dict(
    C=1.0,
    kernel="rbf",
    gamma="scale",
    probability=True,          # AUC/閾値用に確率出力を有効化（重ければ False でも可）
    class_weight="balanced",   # 不均衡対応
    random_state=SEED_BASE,
)

# --- モデルレジストリ（IF分を増やさず拡張） ---
ModelBuilder = Callable[..., Any]
MODEL_REGISTRY: Dict[str, Dict[str, Any]] = {}

def register_backend(name: str, params: Dict[str, Any], builder: ModelBuilder) -> None:
    MODEL_REGISTRY[name] = {"params": params, "builder": builder}

def _build_xgb(params: Dict[str, Any], *, scale_pos_weight: Optional[float] = None):
    assert xgb is not None, "[ERROR] xgboost がインポートできません。"
    p = params.copy()
    if scale_pos_weight is not None:
        p["scale_pos_weight"] = float(scale_pos_weight)
    return xgb.XGBClassifier(**p)

def _build_rf(params: Dict[str, Any], *, scale_pos_weight: Optional[float] = None):
    # RF は scale_pos_weight を使わない（class_weight='balanced' を既定にしてある）
    return RandomForestClassifier(**params)

def _build_svm(params: Dict[str, Any], *, scale_pos_weight: Optional[float] = None):
    # SVM は class_weight で不均衡対応。probability=True なら predict_proba 利用可
    return SVC(**params)

# 初期登録
register_backend("xgb", XGB_PARAMS, _build_xgb)
register_backend("rf",  RF_PARAMS,  _build_rf)
register_backend("svm", SVM_PARAMS, _build_svm)


def set_model_backend(name: str) -> None:
    """バックエンド名の安全な切替。"""
    global MODEL_BACKEND
    assert name in MODEL_REGISTRY, f"[ERROR] backend '{name}' は未登録。候補: {list(MODEL_REGISTRY.keys())}"
    MODEL_BACKEND = name

def build_estimator(
    backend: Optional[str] = None,
    *,
    scale_pos_weight: Optional[float] = None,
    overrides: Optional[Dict[str, Any]] = None,
):
    """レジストリから学習器インスタンスを生成。overrides で一時上書き可。"""
    name = (backend or MODEL_BACKEND).lower()
    assert name in MODEL_REGISTRY, f"[ERROR] backend '{name}' は未登録。"
    base = MODEL_REGISTRY[name]["params"].copy()
    if overrides:
        base.update(overrides)
    builder = MODEL_REGISTRY[name]["builder"]
    return builder(base, scale_pos_weight=scale_pos_weight)

def fit_estimator(
    X_train: pd.DataFrame,
    y_train: pd.Series,
    *,
    backend: Optional[str] = None,
    scale_pos_weight: Optional[float] = None,
    overrides: Optional[Dict[str, Any]] = None,
):
    """
    共通フィット関数。XGB/RF/SVM いずれでも同じ呼び出しで学習可能。
    - X_train は float32 推奨（XGB の速度/メモリ対策）
    - y_train は int32 推奨
    """
    X_train = X_train.astype(np.float32, copy=False)
    y_train = y_train.astype(np.int32, copy=False)
    model = build_estimator(backend, scale_pos_weight=scale_pos_weight, overrides=overrides)
    model.fit(X_train, y_train)
    return model

def predict_positive_score(model, X: pd.DataFrame) -> np.ndarray:
    """
    陽性（1）クラスのスコアを返す。
    - predict_proba があればその第2列を返す（確率）
    - なければ decision_function を返す（AUCは単調変換に不変）
    """
    X = X.astype(np.float32, copy=False)
    if hasattr(model, "predict_proba"):
        return model.predict_proba(X)[:, 1]
    if hasattr(model, "decision_function"):
        s = model.decision_function(X)
        return np.asarray(s, dtype=float)
    # 最後の手段（スコア不明な推定器）
    return model.predict(X).astype(float)

MODEL_ID = f"{MODEL_BACKEND.upper()}"
print(f"[INFO] MODEL_BACKEND={MODEL_ID} / SEED={SEED_BASE} / backends={list(MODEL_REGISTRY.keys())}")


[INFO] MODEL_BACKEND=XGB / SEED=20251101 / backends=['xgb', 'rf', 'svm']


In [64]:
# ===== Cell 0: 出力ディレクトリ（FMS閾値ごと）と共通設定 =====
import os



# ★ 入力CSVの基本パス（被験者ごと）
BASE_INPUT_DIR = r"C:\Users\taiki\OneDrive - Science Tokyo\デスクトップ\研究\本実験結果"

# ★ 出力ルート：本実験結果/ANALYSIS/機械学習/閾値FMS{n}
BASE_ANALYSIS_DIR = r"C:\Users\taiki\OneDrive - Science Tokyo\デスクトップ\研究\本実験結果\ANALYSIS"
OUT_DIR = os.path.join(BASE_ANALYSIS_DIR, "機械学習(MSSQ込み)",f"閾値FMS{int(FMS_THRESHOLD)}")
os.makedirs(OUT_DIR, exist_ok=True)

def outpath(filename: str) -> str:
    return os.path.join(OUT_DIR, filename)

print(f"[OUT_DIR] {OUT_DIR}  |  EPOCH_LEN={EPOCH_LEN}s")

# 被験者ID（氏名なし）
from typing import List
SUBJECT_IDS: List[str] = [
    "10061","10063","10064",
    "10071","10072","10073","10074",
    "10081","10082","10083",
    "10091","10092","10093","10094",
    "10101","10102","10103",
]

# ベースラインとML範囲（epoch_startは30秒刻みの整数）
BASELINE_EPOCH = 1770            # 単一行（フォールバック無し、無ければ即エラー）
ML_START, ML_END = 1800, 2400    # [ML_START, ML_END) を学習用に使用

# 図の体裁（英語ラベル・フォント大きめ）
import matplotlib.pyplot as plt
plt.rcParams.update({
    "figure.dpi": 120, "savefig.dpi": 300,
    "font.size": 20, "axes.titlesize": 26, "axes.labelsize": 22,
    "xtick.labelsize": 20, "ytick.labelsize": 20, "legend.fontsize": 20,
})

# 便利ヘルパー：FMS二値化（OUT_DIRの閾値に連動）
def binarize_fms(series, threshold: int = None):
    th = FMS_THRESHOLD if threshold is None else int(threshold)
    return (series >= th).astype(int)


[OUT_DIR] C:\Users\taiki\OneDrive - Science Tokyo\デスクトップ\研究\本実験結果\ANALYSIS\機械学習(MSSQ込み)\閾値FMS1  |  EPOCH_LEN=30s


In [65]:
# ===== Cell 1: 入力CSV読み込み（全被験者, 元は30sエポック） =====
import pandas as pd
import numpy as np
import os

def subject_csv_path(sid: str) -> str:
    p = os.path.join(BASE_INPUT_DIR, sid, "EPOCH", f"{sid}_epoch.csv")
    if not os.path.exists(p):
        raise FileNotFoundError(f"[ERROR] CSV not found for subject {sid}: {p}")
    return p

dfs = []
for sid in SUBJECT_IDS:
    p = subject_csv_path(sid)
    df = pd.read_csv(p)
    if df.shape[1] < 4:
        raise ValueError(f"[ERROR] CSV columns too few for {sid}: need >=4 (epoch_start, epoch_end, FMS, features...)")
    df = df.copy()
    # 1列目=epoch_start, 2列目=epoch_end, 3列目=FMS, 4列目以降=特徴量（名前はCSVそのまま）
    df.columns = list(df.columns[:3]) + [str(c) for c in df.columns[3:]]
    # 列名標準化
    c1, c2, c3 = df.columns[:3]
    df = df.rename(columns={c1: "epoch_start", c2: "epoch_end", c3: "FMS"})
    # 型
    df["epoch_start"] = pd.to_numeric(df["epoch_start"], errors="coerce").astype("Int64")
    df["epoch_end"]   = pd.to_numeric(df["epoch_end"],   errors="coerce").astype("Int64")
    df["FMS"]         = pd.to_numeric(df["FMS"],         errors="coerce").astype("Int64")
    if df[["epoch_start","epoch_end","FMS"]].isna().any().any():
        raise ValueError(f"[ERROR] epoch_start/epoch_end/FMS に NaN (subject {sid})")
    # subject列
    df.insert(0, "subject_id", sid)
    dfs.append(df)

combined_raw = pd.concat(dfs, ignore_index=True)

# 特徴量列（除外：HF_power, LF_power, LF_HF_ratio）
exclude_feats = {"HF_power", "LF_power", "LF_HF_ratio"}
all_cols = combined_raw.columns.tolist()
feature_cols_all = [c for c in all_cols if c not in ["subject_id","epoch_start","epoch_end","FMS"] and c not in exclude_feats]
if len(feature_cols_all) == 0:
    raise RuntimeError("[ERROR] 特徴量列が0です（除外のしすぎか列名不一致）")

print(f"[INFO] Loaded subjects: {len(SUBJECT_IDS)}, rows={len(combined_raw)}, features(after drop)={len(feature_cols_all)}")


[INFO] Loaded subjects: 17, rows=357, features(after drop)=45


In [66]:
# ===== Cell 2: EPOCH_LEN（30/60/120s）合成 → baseline差分（特徴量のみ） =====
import numpy as np
import pandas as pd

# EPOCH_LEN=30 のときも同じロジック（実質1本平均）
if (ML_END - ML_START) % EPOCH_LEN != 0:
    raise ValueError(f"[ERROR] ML window length {(ML_END-ML_START)} は EPOCH_LEN={EPOCH_LEN} で割り切れる必要あり。")

df_out_list = []
rows_per_bin = EPOCH_LEN // 30  # 30秒エポック何本で1binか（30→1, 60→2, 120→4）

for sid, sdf in combined_raw.groupby("subject_id", sort=False):
    # baseline row（1770–1800 の 1行）
    base_row = sdf.loc[sdf["epoch_start"] == BASELINE_EPOCH]
    if base_row.shape[0] != 1:
        raise ValueError(f"[ERROR] subject {sid}: baseline row not found (epoch_start=={BASELINE_EPOCH})")
    base_vals = base_row[feature_cols_all].astype(float).iloc[0]
    if base_vals.isna().any():
        bad = base_vals.index[base_vals.isna()].tolist()
        raise ValueError(f"[ERROR] subject {sid}: baseline feature NaN -> {bad}")

    # ML window（元 30秒エポック）
    sdf_ml = sdf[(sdf["epoch_start"] >= ML_START) & (sdf["epoch_start"] < ML_END)].copy()
    if sdf_ml.empty:
        raise ValueError(f"[ERROR] subject {sid}: ML window empty [{ML_START},{ML_END})")

    # 30秒 → EPOCH_LEN秒 の bin_start を計算
    # 例：EPOCH_LEN=60 の場合、1800,1830→bin_start=1800 / 1860,1890→bin_start=1860 ...
    sdf_ml["bin_start"] = ML_START + ((sdf_ml["epoch_start"] - ML_START) // EPOCH_LEN) * EPOCH_LEN
    sdf_ml["bin_end"]   = sdf_ml["bin_start"] + EPOCH_LEN

    # 各 bin に含まれる30秒行数が rows_per_bin（完全）であるものだけ採用
    size_check = sdf_ml.groupby(["bin_start","bin_end"]).size()
    complete_bins = size_check[size_check == rows_per_bin].index
    sdf_ml = sdf_ml.set_index(["bin_start","bin_end"]).loc[complete_bins].reset_index()
    if sdf_ml.empty:
        raise ValueError(f"[ERROR] subject {sid}: no complete bins for EPOCH_LEN={EPOCH_LEN}")

    # ★ 合成前（元30秒）の平均で NEWEPOCH を作る（FMS も features も mean）
    agg_dict = {c: "mean" for c in feature_cols_all}
    agg_dict.update({"FMS": "mean"})  # FMS も平均（小数になる）
    g = sdf_ml.groupby(["subject_id", "bin_start", "bin_end"], as_index=False).agg(agg_dict)

    # baseline差分：特徴量のみ（mean − baseline）
    g_features = g[feature_cols_all].astype(float) - base_vals.values
    if g_features.isna().any().any():
        bad_cols = g_features.columns[g_features.isna().any()].tolist()
        raise ValueError(f"[ERROR] subject {sid}: baseline-delta has NaN in features -> {bad_cols}")

    g_out = pd.concat([g[["subject_id","bin_start","bin_end","FMS"]], g_features], axis=1)
    g_out = g_out.rename(columns={"bin_start":"epoch_start", "bin_end":"epoch_end"})  # 以降と同じ列名にそろえる

    # ラベル（平均FMSに対して二値化）
    g_out["label"] = binarize_fms(g_out["FMS"])

    # 列順を揃える
    g_out = g_out[["subject_id","epoch_start","epoch_end","FMS","label"] + feature_cols_all]
    df_out_list.append(g_out)

# 連結
df_ml_epoch = pd.concat(df_out_list, ignore_index=True)

# 保存（スケーリング前＝差分後の生値；木系なのでスケーリング不要だがファイルも出す）
fname_base = f"ML_DATA_DELTA_{EPOCH_LEN}S_RAW.CSV"
df_ml_epoch.to_csv(outpath(fname_base), index=False, encoding="utf-8-sig")
print(f"[OK] Saved RAW ML table -> {outpath(fname_base)}")

# 学習行列（以降のセルが参照）
feature_cols = feature_cols_all[:]  # CSVそのままの列名（除外済み）
X_all = df_ml_epoch[feature_cols].copy().astype(float)
y_all = df_ml_epoch["label"].copy().astype(int)
groups = df_ml_epoch["subject_id"].copy()

# X_scaled_all は同一（スケーリングなし）
X_scaled_all = X_all.copy()
X_all.to_csv(outpath(f"X_RAW_ALL_{EPOCH_LEN}S.CSV"), index=False, encoding="utf-8-sig")
X_scaled_all.to_csv(outpath(f"X_SCALED_ALL_{EPOCH_LEN}S.CSV"), index=False, encoding="utf-8-sig")
pd.DataFrame({"subject_id": groups, "label": y_all, "FMS_mean": df_ml_epoch["FMS"]}).to_csv(
    outpath(f"Y_AND_GROUPS_{EPOCH_LEN}S.CSV"), index=False, encoding="utf-8-sig"
)
print(f"[OK] Saved matrices -> X_RAW_ALL_{EPOCH_LEN}S.CSV, X_SCALED_ALL_{EPOCH_LEN}S.CSV, Y_AND_GROUPS_{EPOCH_LEN}S.CSV")


[OK] Saved RAW ML table -> C:\Users\taiki\OneDrive - Science Tokyo\デスクトップ\研究\本実験結果\ANALYSIS\機械学習(MSSQ込み)\閾値FMS1\ML_DATA_DELTA_30S_RAW.CSV
[OK] Saved matrices -> X_RAW_ALL_30S.CSV, X_SCALED_ALL_30S.CSV, Y_AND_GROUPS_30S.CSV


In [67]:
# ===== Cell 2.5 (REPLACE): SUBJECT_META loader =====
import os
import numpy as np
import pandas as pd

# 参照元：/mnt/data/summary_scores.xlsx の "Summary" シート
CANDIDATE_SCORE_PATHS = [
    "/mnt/data/summary_scores.xlsx",  # ← ユーザー添付想定
    os.path.join(BASE_ANALYSIS_DIR, "summary_scores.xlsx"),
    os.path.join(BASE_ANALYSIS_DIR, "機械学習", "summary_scores.xlsx"),
    os.path.join(BASE_INPUT_DIR, "summary_scores.xlsx"),
]
score_path = next((p for p in CANDIDATE_SCORE_PATHS if os.path.exists(p)), None)
if score_path is None:
    raise FileNotFoundError("[Cell2.5] summary_scores.xlsx が見つかりません。パスを確認してください。")

SHEET = "Summary"
meta_raw = pd.read_excel(score_path, sheet_name=SHEET)

# 必須列（このファイル仕様に合わせる）
required = ["ID", "MSSQ", "VIMSSQ"]
missing = [c for c in required if c not in meta_raw.columns]
if missing:
    raise ValueError(f"[Cell2.5] 必須列が不足: {missing} / シート: {SHEET}")

# 列を抽出＆正規化
meta = meta_raw[required].copy()

# ID を string 化（Excelの 10061 → "10061" など）
meta["ID"] = (
    meta["ID"]
    .astype(str)
    .str.strip()
    .str.replace(r"\.0$", "", regex=True)
)

# 数値列チェック
for c in ["MSSQ", "VIMSSQ"]:
    try:
        meta[c] = pd.to_numeric(meta[c], errors="raise")
    except Exception:
        raise ValueError(f"[Cell2.5] 数値変換に失敗: {c}")

# （任意）解析対象のIDに合わせてフィルタ（SUBJECT_IDS があれば交差）
if "SUBJECT_IDS" in globals() and len(SUBJECT_IDS) > 0:
    sid_set = set(map(str, SUBJECT_IDS))
    meta = meta[meta["ID"].isin(sid_set)].copy()

# 重複チェック
if meta["ID"].duplicated().any():
    dups = meta.loc[meta["ID"].duplicated(), "ID"].tolist()
    raise ValueError(f"[Cell2.5] ID が重複: {dups}")

# ★ 群ラベリング定義（ここが“グループ分けのラベリング”の定義場所）
#   仕様：MSSQ の固定閾値 10.0 で High / Low に二分
MSSQ_THRESHOLD_FIXED = 10.0
meta["MSSQ_group"] = np.where(meta["MSSQ"] >= MSSQ_THRESHOLD_FIXED, "High", "Low")

# SUBJECT_META を index=subject_id として提供（以降は subject_id=文字列ID）
SUBJECT_META = (
    meta.rename(columns={"ID": "subject_id"})
        .set_index("subject_id")[["MSSQ", "VIMSSQ", "MSSQ_group"]]
        .copy()
)

display(SUBJECT_META.head())
SUBJECT_META.to_csv(outpath("subject_meta.csv"), encoding="utf-8-sig")
print(f"[Cell2.5] SUBJECT_META saved -> {outpath('subject_meta.csv')} (file='{score_path}', sheet='{SHEET}')")


Unnamed: 0_level_0,MSSQ,VIMSSQ,MSSQ_group
subject_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
10061,19.71,1,High
10063,3.0,2,Low
10064,16.625,4,High
10071,13.0,7,High
10072,5.14,0,Low


[Cell2.5] SUBJECT_META saved -> C:\Users\taiki\OneDrive - Science Tokyo\デスクトップ\研究\本実験結果\ANALYSIS\機械学習(MSSQ込み)\閾値FMS1\subject_meta.csv (file='C:\Users\taiki\OneDrive - Science Tokyo\デスクトップ\研究\本実験結果\summary_scores.xlsx', sheet='Summary')


# ===== Cell 3: Modeling & SHAP helper functions（バックエンド非依存） =====
from typing import Dict, Optional
import numpy as np
import pandas as pd

import shap
from sklearn.metrics import roc_auc_score, accuracy_score

# Cell 0 の共通APIがロード済みであることを確認
assert "fit_estimator" in globals(), "[ERROR] Cell 0 の共通ユーティリティが未定義です。"
assert "predict_positive_score" in globals(), "[ERROR] Cell 0 の共通ユーティリティが未定義です。"
MODEL_BACKEND_STR = globals().get("MODEL_BACKEND", "xgb")

# ----------------------------
# 学習（後方互換ラッパ：既存名を維持）
# ----------------------------
def fit_xgb_classifier(
    X_train: pd.DataFrame,
    y_train: pd.Series,
    scale_pos_weight: Optional[float] = None,
):
    """
    互換ラッパ：既存コードの呼び出し名を維持しつつ、Cell 0 の fit_estimator を利用。
    - XGB のときは scale_pos_weight を内部注入
    - RF / SVM のときは内部で無視（RFは class_weight='balanced'）
    """
    X_train = X_train.astype(np.float32, copy=False)
    y_train = y_train.astype(np.int32, copy=False)
    model = fit_estimator(X_train, y_train, scale_pos_weight=scale_pos_weight)
    return model

# ----------------------------
# SHAP（木モデル用：XGB / RF の両対応）
# ----------------------------
def compute_train_shap_abs_mean(model, X_ref: pd.DataFrame) -> pd.Series:
    """
    平均|SHAP|で特徴重要度を算出（学習データのみ・リーク防止）。
    - 対応：XGB / RF（TreeSHAP）
    - 非対応：SVM 等（必要なら別途 permutation 重要度を用意）
    戻り値：index=特徴名, name='mean_abs'（降順）
    """

    X_ref = X_ref.astype(np.float32, copy=False)

    # 背景データ（確率出力に必要）：学習データのサブセット
    bg_n = min(128, len(X_ref))
    X_bg = X_ref.sample(n=bg_n, random_state=globals().get("SEED_BASE", 0)) if bg_n >= 2 else X_ref

    # 可能なら probability×interventional、失敗時は raw にフォールバック
    try:
        explainer = shap.TreeExplainer(
            model,
            data=X_bg,
            model_output="probability",
            feature_perturbation="interventional",
        )
        sv_any = explainer.shap_values(X_ref)
    except Exception:
        explainer = shap.TreeExplainer(
            model,
            feature_perturbation="tree_path_dependent",
            model_output="raw",
        )
        sv_any = explainer.shap_values(X_ref)

    # ---- ここから戻り値を「陽性クラス1の2次元配列 (n_samples, n_features)」に正規化 ----
    

    # 陽性クラスのインデックスを決定（なければ末尾を陽性扱い）
    classes = getattr(model, "classes_", None)
    if classes is not None and 1 in list(classes):
        pos_idx = int(np.where(classes == 1)[0][0])
    else:
        pos_idx = -1  # 末尾

    # 1) マルチクラス形式のリスト（古いAPIでありがち）
    if isinstance(sv_any, list):
        sv = sv_any[pos_idx]  # (n_samples, n_features)
    else:
        # 2) Explanation オブジェクト → .values
        if hasattr(sv_any, "values"):
            sv_any = sv_any.values
        sv = np.asarray(sv_any)

        # 3) 3次元 (n_samples, n_features, n_classes) → 陽性だけ切り出し
        if sv.ndim == 3:
            sv = sv[..., pos_idx]  # (n_samples, n_features)

        # 4) 1次元なら列に直す（まれ）
        elif sv.ndim == 1:
            sv = sv.reshape(-1, 1)

        # 2次元ならそのまま (n_samples, n_features)

    # 最終チェック：列数と特徴数を合わせる
    if sv.shape[1] != X_ref.shape[1]:
        raise RuntimeError(
            f"[ERROR] SHAP shape mismatch: sv.shape={sv.shape}, X_ref.shape={X_ref.shape}. "
            "列順・前処理を確認してください。"
        )

    abs_mean = np.mean(np.abs(sv), axis=0)  # (n_features,)
    return pd.Series(abs_mean, index=X_ref.columns, name="mean_abs").sort_values(ascending=False)



# ----------------------------
# 評価（AUC / Accuracy）
# ----------------------------
def _is_probability_like(scores: np.ndarray) -> bool:
    return np.isfinite(scores).all() and (0.0 <= scores.min() <= scores.max() <= 1.0)

def evaluate_fold(model, X_test: pd.DataFrame, y_test: pd.Series) -> Dict[str, float]:
    """
    共通APIでスコアを取得して評価。
    - AUC：確率でも decision score でもOK（単調変換に不変）
    - Accuracy：確率なら 0.5、score なら 0.0 を閾値に（しきい値最適化は別セルで実施）
    """
    X_test = X_test.astype(np.float32, copy=False)
    scores = predict_positive_score(model, X_test)

    if len(np.unique(y_test)) == 2:
        roc_auc = roc_auc_score(y_test, scores)
    else:
        roc_auc = float("nan")

    thr = 0.5 if _is_probability_like(scores) else 0.0
    pred = (scores >= thr).astype(int)
    acc = accuracy_score(y_test.astype(int), pred)

    return {"roc_auc": float(roc_auc), "accuracy": float(acc)}


In [68]:
# ===== Cell 3: Modeling & SHAP helper functions =====
from typing import Dict
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score, accuracy_score
from xgboost import XGBClassifier
import xgboost as xgb

def fit_xgb_classifier(
    X_train: pd.DataFrame,
    y_train: pd.Series,
) -> XGBClassifier:
    """決定論的XGB（木系なのでスケーリング不要）。警告要因は排除。"""
    X_train = X_train.astype(np.float32)
    y_train = y_train.astype(np.int32)
    model = XGBClassifier(
        n_estimators=100,
        eval_metric="logloss",
        subsample=1.0,
        colsample_bytree=1.0,
        n_jobs=1,              # 決定論的
        tree_method="hist",
        device="cpu",
        seed=0, random_state=0
    )
    model.fit(X_train, y_train)
    return model

def compute_train_shap_abs_mean(model: XGBClassifier, X_ref: pd.DataFrame) -> pd.Series:
    """Boosterの pred_contribs でSHAP値（バイアス除く）→ |.|平均"""
    X_ref = X_ref.astype(np.float32)
    dm = xgb.DMatrix(X_ref, feature_names=list(X_ref.columns))
    contribs = model.get_booster().predict(dm, pred_contribs=True)  # (n, n_features+1)
    shap_vals = contribs[:, :-1]
    abs_mean = np.abs(shap_vals).mean(axis=0)
    return pd.Series(abs_mean, index=X_ref.columns, name="mean_abs")

def evaluate_fold(model: XGBClassifier, X_test: pd.DataFrame, y_test: pd.Series) -> Dict[str, float]:
    proba = model.predict_proba(X_test.astype(np.float32))[:, 1]
    pred = (proba >= 0.5).astype(int)
    if len(np.unique(y_test)) == 2:
        roc_auc = roc_auc_score(y_test, proba)
    else:
        roc_auc = float("nan")
    acc = accuracy_score(y_test, pred)
    return {"roc_auc": roc_auc, "accuracy": acc}


In [69]:
# ===== Cell 4: SHAPランキング（LOSOで学習側のみ SHAP、平均集計） =====
from sklearn.model_selection import LeaveOneGroupOut

logo = LeaveOneGroupOut()
shap_frames = []
metrics_rows = []

for fold_id, (tr_idx, te_idx) in enumerate(logo.split(X_scaled_all, y_all, groups), start=1):
    X_tr, X_te = X_scaled_all.iloc[tr_idx], X_scaled_all.iloc[te_idx]
    y_tr, y_te = y_all.iloc[tr_idx], y_all.iloc[te_idx]
    # 学習側が単一クラスは不可
    if len(np.unique(y_tr)) < 2:
        raise RuntimeError("[ERROR] 学習foldが単一クラスです。FMS閾値/期間を見直してください。")

    model = fit_xgb_classifier(X_tr, y_tr)
    # 学習データ上でのSHAP重要度
    abs_mean = compute_train_shap_abs_mean(model, X_tr).rename(f"fold{fold_id}")
    shap_frames.append(abs_mean)

    # 参考：テストfoldのAUC/ACC
    m = evaluate_fold(model, X_te, y_te)
    metrics_rows.append({"test_subject": groups.iloc[te_idx].iloc[0],
                         "roc_auc": m["roc_auc"], "accuracy": m["accuracy"]})

# 重要度表：各fold列＋平均列
shap_rank = pd.concat(shap_frames, axis=1)
shap_rank["mean_abs"] = shap_rank.mean(axis=1)
shap_rank = shap_rank.sort_values("mean_abs", ascending=False)

# 保存
shap_rank.to_csv(outpath("SHAP_FEATURE_RANKING.CSV"), encoding="utf-8-sig")
print(f"[OK] SHAP ranking -> {outpath('SHAP_FEATURE_RANKING.CSV')}")

# ラベル付き（表示名は今回は元列名をそのまま）
shap_labeled = shap_rank.copy()
shap_labeled.to_csv(outpath("SHAP_FEATURE_RANKING_LABELED.CSV"), encoding="utf-8-sig")
print(f"[OK] SHAP labeled -> {outpath('SHAP_FEATURE_RANKING_LABELED.CSV')}")

# 参考メトリクス
pd.DataFrame(metrics_rows).to_csv(outpath("LOSO_METRICS.CSV"), index=False, encoding="utf-8-sig")
print(f"[OK] LOSO metrics -> {outpath('LOSO_METRICS.CSV')}")

# 図（ALL & TOP10）
import matplotlib.pyplot as plt

# ALL
plt.figure(figsize=(10, max(5, len(shap_rank)//3)))
plt.barh(shap_rank.index[::-1], shap_rank["mean_abs"][::-1])
plt.xlabel("Mean |SHAP|"); plt.ylabel("Feature")
plt.title("SHAP Ranking (All)")
plt.tight_layout(); plt.savefig(outpath("SHAP_RANKING_ALL.PNG"), dpi=300); plt.close()
print(f"[OK] Plot -> {outpath('SHAP_RANKING_ALL.PNG')}")

# TOP8（数値ラベルなし・フォント拡大）
topk = shap_rank.head(8).iloc[::-1]  # 上位8件を下から描く
plt.figure(figsize=(12, 7))
ax = plt.gca()
ax.barh(topk.index, topk["mean_abs"].values)

# 余白と体裁
mx = float(topk["mean_abs"].max()) if len(topk) else 1.0
ax.set_xlim(0, mx * 1.08)  # 少し余白
ax.set_xlabel("Mean |SHAP value|", fontsize=26)
ax.set_ylabel("Feature", fontsize=26)
ax.tick_params(axis="both", labelsize=22)
ax.set_title("Top-8 SHAP Feature Ranking", fontsize=34, pad=10)

plt.tight_layout()
plt.savefig(outpath("SHAP_TOP8_RANKING.PNG"), dpi=300)
plt.close()
print(f"[OK] Plot -> {outpath('SHAP_TOP8_RANKING.PNG')}")




[OK] SHAP ranking -> C:\Users\taiki\OneDrive - Science Tokyo\デスクトップ\研究\本実験結果\ANALYSIS\機械学習(MSSQ込み)\閾値FMS1\SHAP_FEATURE_RANKING.CSV
[OK] SHAP labeled -> C:\Users\taiki\OneDrive - Science Tokyo\デスクトップ\研究\本実験結果\ANALYSIS\機械学習(MSSQ込み)\閾値FMS1\SHAP_FEATURE_RANKING_LABELED.CSV
[OK] LOSO metrics -> C:\Users\taiki\OneDrive - Science Tokyo\デスクトップ\研究\本実験結果\ANALYSIS\機械学習(MSSQ込み)\閾値FMS1\LOSO_METRICS.CSV
[OK] Plot -> C:\Users\taiki\OneDrive - Science Tokyo\デスクトップ\研究\本実験結果\ANALYSIS\機械学習(MSSQ込み)\閾値FMS1\SHAP_RANKING_ALL.PNG
[OK] Plot -> C:\Users\taiki\OneDrive - Science Tokyo\デスクトップ\研究\本実験結果\ANALYSIS\機械学習(MSSQ込み)\閾値FMS1\SHAP_TOP8_RANKING.PNG


In [70]:
# ===== Cell 6: 全kで pooled AUC → best_k 決定（しきい値非依存） =====
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.metrics import roc_auc_score
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os

# ランキング読込（必ず OUT_DIR 内）
rank_candidates = [outpath("SHAP_FEATURE_RANKING_LABELED.CSV"),
                   outpath("SHAP_FEATURE_RANKING.CSV")]
rank_path = None
for p in rank_candidates:
    if os.path.exists(p):
        rank_path = p; break
if rank_path is None:
    raise FileNotFoundError("[ERROR] SHAP_FEATURE_RANKING(_LABELED).CSV が OUT_DIR にありません。")

rank_df = pd.read_csv(rank_path, encoding="utf-8-sig", index_col=0)
rank_col = "mean_abs" if "mean_abs" in rank_df.columns else ("mean_abs_shap" if "mean_abs_shap" in rank_df.columns else None)
if rank_col is None:
    raise KeyError("[ERROR] ランキングCSVに mean_abs / mean_abs_shap がありません。")
rank_df = rank_df.sort_values(rank_col, ascending=False)

feature_order = [f for f in rank_df.index if f in X_scaled_all.columns]
if not feature_order:
    raise RuntimeError("[ERROR] ランキングの特徴が X_scaled_all に存在しません。")

ks = list(range(len(feature_order), 0, -1))  # 多→少
logo = LeaveOneGroupOut()

auc_list = []
for k in ks:
    feats = feature_order[:k]
    X = X_scaled_all[feats].astype(np.float32)
    y = y_all.values
    g = groups.values

    y_true_all, proba_all = [], []
    for tr_idx, te_idx in logo.split(X, y, g):
        X_tr, X_te = X.iloc[tr_idx], X.iloc[te_idx]
        y_tr, y_te = y[tr_idx], y[te_idx]
        if len(np.unique(y_tr)) < 2:
            raise RuntimeError("[ERROR] 学習foldが単一クラスです。閾値/範囲を見直してください。")
        m = fit_xgb_classifier(X_tr, pd.Series(y_tr))
        proba = m.predict_proba(X_te)[:, 1]
        y_true_all.append(y_te); proba_all.append(proba)
    y_true_k = np.concatenate(y_true_all); proba_k = np.concatenate(proba_all)
    if len(np.unique(y_true_k)) < 2:
        raise RuntimeError("[ERROR] pooled 真値が単一クラスで AUC が計算できません。")
    auc_list.append(float(roc_auc_score(y_true_k, proba_k)))

# CSV保存
pd.DataFrame({"k": ks, "auc_pooled": auc_list}).to_csv(outpath("AUC_PER_K.CSV"), index=False, encoding="utf-8-sig")
print(f"[OK] CSV -> {outpath('AUC_PER_K.CSV')}")

# best_k
auc_array = np.asarray(auc_list, dtype=float)
best_k = ks[int(np.nanargmax(auc_array))]
best_auc = float(np.nanmax(auc_array))
print(f"[INFO] Best AUC at k={best_k}: AUC={best_auc:.3f}")

# 図（最大点の赤丸＋注釈、フォント拡大）
plt.figure(figsize=(12, 7))
ax = plt.gca()

ax.plot(ks, auc_list, marker='o', linewidth=1.5)
ax.scatter([best_k], [best_auc], s=180, color="red", zorder=5)

ax.annotate(f"Max AUC = {best_auc:.3f} (k={best_k})",
            xy=(best_k, best_auc),
            xytext=(best_k, best_auc + 0.02),
            ha="center", va="bottom", fontsize=20, color="red")

ax.invert_xaxis()
ax.set_xlabel("Number of Features (k)", fontsize=26)
ax.set_ylabel("ROC AUC (pooled)", fontsize=26)
ax.tick_params(axis="both", labelsize=22)
ax.set_title("AUC vs Number of Features", fontsize=34, pad=10)
ax.grid(True, alpha=0.4)

plt.tight_layout()
plt.savefig(outpath("AUC_VS_NUM_FEATURES.PNG"), dpi=300)
plt.close()
print(f"[OK] Plot -> {outpath('AUC_VS_NUM_FEATURES.PNG')}")


[OK] CSV -> C:\Users\taiki\OneDrive - Science Tokyo\デスクトップ\研究\本実験結果\ANALYSIS\機械学習(MSSQ込み)\閾値FMS1\AUC_PER_K.CSV
[INFO] Best AUC at k=5: AUC=0.747
[OK] Plot -> C:\Users\taiki\OneDrive - Science Tokyo\デスクトップ\研究\本実験結果\ANALYSIS\機械学習(MSSQ込み)\閾値FMS1\AUC_VS_NUM_FEATURES.PNG


In [None]:
# ===== Cell X: 被験者ブートストラップCI（AUC, OOFベース既定） =====
"""
目的：
- LOSO OOF予測を固定し，被験者（cluster）単位のブートストラップで AUC の95%CIを推定する．
- 既に OOF が無ければ，上位 BEST_K 特徴（Cell4のSHAPランキング）で一度だけLOSOしてOOFを作成してから実行．

出力：
- OOF_PRED_BESTK.CSV（無ければ作成）
- AUC_BOOTSTRAP_SUBJECT.csv（各反復のAUC）
- AUC_BOOTSTRAP_SUMMARY.csv（平均/SE/95%CI/有効反復数 等）
- AUC_BOOTSTRAP_HIST.png（分布＋CI） / AUC_BOOTSTRAP_ECDF.png（累積分布）

主要パラメータ（下の CONFIG を調整）：
- B = 2000（反復回数）
- SEED = 20251101（乱数）
- MAX_REDRAW = 20（単一クラス回避の再抽選上限）
- MODE = "oof" または "retrain"（既定は oof）
- STRATIFY_BY = None または "MSSQ_group"（被験者層別の比率維持；既定 None）
"""

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.metrics import roc_auc_score

# -------- CONFIG --------
B = 2000
SEED = 20251101
MAX_REDRAW = 20
MODE = "oof"            # "oof" or "retrain"
STRATIFY_BY = None      # None or "MSSQ_group"
OOF_CSV = "OOF_PRED_BESTK.CSV"
BOOT_CSV = "AUC_BOOTSTRAP_SUBJECT.csv"
SUMM_CSV = "AUC_BOOTSTRAP_SUMMARY.csv"
HIST_PNG = "AUC_BOOTSTRAP_HIST.png"
ECDF_PNG = "AUC_BOOTSTRAP_ECDF.png"

# ------------- 前提確認 -------------
assert 'X_scaled_all' in globals(), "[ERROR] X_scaled_all が未定義"
assert 'y_all'        in globals(), "[ERROR] y_all が未定義"
assert 'groups'       in globals(), "[ERROR] groups が未定義"
assert 'outpath'      in globals(), "[ERROR] outpath() が未定義"

# ------------- ユーティリティ -------------
def _load_feature_order():
    """Cell4のランキングCSVから重要度降順の特徴順を取得"""
    rank_candidates = [outpath("SHAP_FEATURE_RANKING_LABELED.CSV"),
                       outpath("SHAP_FEATURE_RANKING.CSV")]
    rank_path = None
    for p in rank_candidates:
        if os.path.exists(p):
            rank_path = p; break
    if rank_path is None:
        raise FileNotFoundError("[ERROR] SHAP_FEATURE_RANKING*.CSV が見つかりません（Cell 4 実行を確認）")

    df = pd.read_csv(rank_path, encoding="utf-8-sig", index_col=0)
    rcol = "mean_abs" if "mean_abs" in df.columns else ("mean_abs_shap" if "mean_abs_shap" in df.columns else None)
    if rcol is None:
        raise KeyError("[ERROR] ランキングCSVに mean_abs / mean_abs_shap が無い")
    order = [f for f in df.sort_values(rcol, ascending=False).index if f in X_scaled_all.columns]
    if not order:
        raise RuntimeError("[ERROR] ランキングの特徴が X_scaled_all に存在しません")
    return order

def _predict_proba_safe(model, X):
    """predict_proba が無い学習器への保険"""
    try:
        return model.predict_proba(X.astype(np.float32))[:, 1]
    except Exception:
        p = model.decision_function(X)
        p = (p - p.min()) / (p.max() - p.min() + 1e-12)
        return p

def build_oof_bestk(feature_order, best_k):
    """上位 BEST_K 特徴で LOSO OOF を作成（1回だけ学習）"""
    feats = feature_order[:int(best_k)]
    X = X_scaled_all[feats].astype(np.float32)
    y = pd.Series(np.asarray(y_all)).reset_index(drop=True)
    g = pd.Series(groups.astype(str).values).reset_index(drop=True)

    logo = LeaveOneGroupOut()
    rows = []
    for tr_idx, te_idx in logo.split(X, y, g):
        X_tr, X_te = X.iloc[tr_idx], X.iloc[te_idx]
        y_tr, y_te = y.iloc[tr_idx], y.iloc[te_idx]
        if len(np.unique(y_tr)) < 2:
            raise RuntimeError("[ERROR] 学習foldが単一クラス（OOF作成中）")
        model = fit_xgb_classifier(X_tr, y_tr)
        proba = _predict_proba_safe(model, X_te)
        sub_ids = g.iloc[te_idx].astype(str).values  # 同一subjectが並ぶ
        df_fold = pd.DataFrame({"subject": sub_ids, "y_true": y_te.values, "y_score": proba})
        rows.append(df_fold)

    oof = pd.concat(rows, ignore_index=True)
    oof.to_csv(outpath(OOF_CSV), index=False, encoding="utf-8-sig")
    print(f"[OK] OOF saved -> {outpath(OOF_CSV)}")
    return oof

def _attach_strata_if_needed(oof_df):
    """STRATIFY_BY='MSSQ_group' の場合，被験者メタから層ラベルを付与"""
    if STRATIFY_BY is None:
        oof_df["strata"] = "ALL"
        return oof_df
    if STRATIFY_BY == "MSSQ_group" and 'SUBJECT_META' in globals():
        meta = SUBJECT_META.copy()
        # 代表名の推定
        id_col  = next((c for c in meta.columns if c.lower() in ("subject","subject_id","id","sid")), None)
        grp_col = next((c for c in meta.columns if c.lower() in ("mssq_group","mssqgroup","group","mssq_highlow")), None)
        if (id_col is not None) and (grp_col is not None):
            d = dict(zip(meta[id_col].astype(str), meta[grp_col].astype(str)))
            oof_df["strata"] = oof_df["subject"].astype(str).map(d).fillna("ALL")
            return oof_df
    # フォールバック
    oof_df["strata"] = "ALL"
    return oof_df

def bootstrap_auc_subject(oof_df, B=2000, seed=20251101, max_redraw=20):
    """被験者（cluster）ブートストラップでAUC分布を推定（OOF固定）"""
    rng = np.random.default_rng(seed)
    subjects = oof_df["subject"].astype(str).unique()
    n_subj = len(subjects)

    # strataごとに被験者集合を準備
    strata_by_subj = (oof_df[["subject","strata"]].drop_duplicates()
                      .set_index("subject")["strata"].to_dict())
    strata_levels = sorted(oof_df["strata"].unique())
    subj_by_strata = {s: [sub for sub in subjects if strata_by_subj.get(sub,"ALL")==s] for s in strata_levels}

    rec, skipped = [], 0
    print(f"[BOOT] start: B={B}, mode=oof, BEST_K={globals().get('BEST_K','?')}, SEED={seed}")

    for b in range(B):
        redraw = 0
        while True:
            # 層別（必要なら各層で元の被験者数と同数を復元）
            chosen = []
            for st in strata_levels:
                pool = subj_by_strata[st]
                if len(pool) == 0:
                    continue
                chosen.extend(list(rng.choice(pool, size=len(pool), replace=True)))
            # 連結（重複subjectは複数回分を結合）
            parts = [oof_df[oof_df["subject"]==sid] for sid in chosen]
            boot = pd.concat(parts, ignore_index=True)

            yb = boot["y_true"].values
            if np.unique(yb).size >= 2:
                break
            redraw += 1
            if redraw > max_redraw:
                skipped += 1
                boot = None
                break
        if boot is None:
            continue

        auc_b = float(roc_auc_score(boot["y_true"].values, boot["y_score"].values))
        rec.append(dict(
            b_id=int(b), auc=auc_b, n_subjects=int(n_subj),
            n_pos=int((boot["y_true"].values==1).sum()),
            n_neg=int((boot["y_true"].values==0).sum()),
            seed=int(seed)
        ))
        if (b+1) % 200 == 0:
            print(f"[BOOT] b={b+1:4d}  auc={auc_b:.3f}")

    df_boot = pd.DataFrame(rec)
    return df_boot, skipped

def summarize_bootstrap(df_boot, auc_oof):
    """平均・SE・95%percentile CI を計算"""
    vals = df_boot["auc"].dropna().values
    mean = float(np.nanmean(vals)) if len(vals) else np.nan
    se   = float(np.nanstd(vals, ddof=1)/np.sqrt(max(1,len(vals)))) if len(vals)>1 else np.nan
    p2p5 = float(np.nanquantile(vals, 0.025)) if len(vals) else np.nan
    p97p5= float(np.nanquantile(vals, 0.975)) if len(vals) else np.nan
    return dict(auc_oof=float(auc_oof), mean=mean, se=se, p2p5=p2p5, p97p5=p97p5,
                n_boot=int(len(vals)))

def _set_plot_style():
    plt.rcParams.update({
        "font.size": 20, "axes.titlesize": 30, "axes.labelsize": 24,
        "xtick.labelsize": 20, "ytick.labelsize": 20, "legend.fontsize": 20
    })

def plot_bootstrap_hist(df_boot, auc_oof, ci_low, ci_high, note, png_name):
    _set_plot_style()
    plt.figure(figsize=(9,6))
    vals = df_boot["auc"].dropna().values
    plt.hist(vals, bins=30, alpha=0.8)
    # 目標線
    ax = plt.gca()
    ax.axvline(auc_oof, color="red", linewidth=1.5, label=f"OOF AUC = {auc_oof:.3f}")
    ax.axvline(ci_low, color="black", linewidth=1.5, linestyle="--", label=f"95% CI [{ci_low:.3f}, {ci_high:.3f}]")
    ax.axvline(ci_high, color="black", linewidth=1.5, linestyle="--")
    plt.title("Subject Bootstrap of AUC (Histogram)")
    plt.xlabel("AUC")
    plt.ylabel("Frequency")
    plt.legend()
    plt.text(0.98, 0.02, note, ha="right", va="bottom", transform=ax.transAxes, fontsize=12)
    plt.tight_layout()
    plt.savefig(outpath(png_name), dpi=300)
    plt.close()
    print(f"[OK] FIG -> {outpath(png_name)}")

def plot_bootstrap_ecdf(df_boot, auc_oof, ci_low, ci_high, note, png_name):
    _set_plot_style()
    plt.figure(figsize=(9,6))
    vals = np.sort(df_boot["auc"].dropna().values)
    y = np.arange(1, len(vals)+1) / max(1, len(vals))
    plt.plot(vals, y, linewidth=1.5)
    ax = plt.gca()
    ax.axvline(auc_oof, color="red", linewidth=1.5, label=f"OOF AUC = {auc_oof:.3f}")
    ax.axvline(ci_low, color="black", linewidth=1.5, linestyle="--", label=f"95% CI [{ci_low:.3f}, {ci_high:.3f}]")
    ax.axvline(ci_high, color="black", linewidth=1.5, linestyle="--")
    plt.title("Subject Bootstrap of AUC (ECDF)")
    plt.xlabel("AUC")
    plt.ylabel("Cumulative probability")
    plt.legend()
    plt.text(0.98, 0.02, note, ha="right", va="bottom", transform=ax.transAxes, fontsize=12)
    plt.tight_layout()
    plt.savefig(outpath(png_name), dpi=300)
    plt.close()
    print(f"[OK] FIG -> {outpath(png_name)}")

# ------------- 実行 -------------
# 1) OOFの用意（無ければ作成）
oof_path = outpath(OOF_CSV)
if os.path.exists(oof_path):
    oof = pd.read_csv(oof_path, encoding="utf-8-sig")
    print(f"[INFO] load OOF -> {oof_path}")
else:
    assert 'BEST_K' in globals(), "[ERROR] BEST_K が未定義（Cell 6 実行で決定してください）"
    feat_order = _load_feature_order()
    oof = build_oof_bestk(feat_order, BEST_K)

# OOF基準のAUC
auc_oof = float(roc_auc_score(oof["y_true"].values, oof["y_score"].values))
print(f"[INFO] OOF AUC = {auc_oof:.3f}  (n={len(oof)})")

# 層ラベル付与（必要時）
oof = _attach_strata_if_needed(oof)

# 2) ブートストラップ（MODE="oof" の場合）
if MODE == "oof":
    df_boot, skipped = bootstrap_auc_subject(oof, B=B, seed=SEED, max_redraw=MAX_REDRAW)

# 3) まとめ・保存
df_boot.to_csv(outpath(BOOT_CSV), index=False, encoding="utf-8-sig")
print(f"[OK] CSV -> {outpath(BOOT_CSV)}")

summ = summarize_bootstrap(df_boot, auc_oof)
summ_df = pd.DataFrame([{
    **summ,
    "skipped": int(skipped),
    "B": int(B),
    "BEST_K": int(globals().get("BEST_K", -1)),
    "MODE": MODE,
    "SEED": int(SEED)
}])
summ_df.to_csv(outpath(SUMM_CSV), index=False, encoding="utf-8-sig")
print(f"[OK] CSV -> {outpath(SUMM_CSV)}")
print(f"[INFO] 95% CI: [{summ['p2p5']:.3f}, {summ['p97p5']:.3f}]  mean={summ['mean']:.3f}  se={summ['se']:.4f}  (n_boot={summ['n_boot']}, skipped={skipped})")

# 4) 図
note = f"B={B}, BEST_K={globals().get('BEST_K','?')}, MODE={MODE}, SEED={SEED}"
plot_bootstrap_hist(df_boot, auc_oof, summ["p2p5"], summ["p97p5"], note, HIST_PNG)
plot_bootstrap_ecdf(df_boot, auc_oof, summ["p2p5"], summ["p97p5"], note, ECDF_PNG)


[INFO] load OOF -> C:\Users\taiki\OneDrive - Science Tokyo\デスクトップ\研究\本実験結果\ANALYSIS\機械学習(MSSQ込み)\閾値FMS1\OOF_PRED_BESTK.CSV
[INFO] OOF AUC = 0.747  (n=340)
[BOOT] start: B=2000, mode=oof, BEST_K=5, SEED=20251101


In [None]:
# ===== Cell 6b: 全kで pooled AUPRC(AP) → best_k 決定（しきい値非依存）＋ best_k のPR曲線 =====
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.metrics import average_precision_score, precision_recall_curve, auc
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os

# --- 設定：APで選んだkを best_k に反映するか（Cell7/Cell8の前提用） ---
USE_AP_FOR_K = bool(globals().get("USE_AP_FOR_K", False))  # 既定=True（APでbest_kを上書き）

# ランキング読込（必ず OUT_DIR 内）
rank_candidates = [outpath("SHAP_FEATURE_RANKING_LABELED.CSV"),
                   outpath("SHAP_FEATURE_RANKING.CSV")]
rank_path = next((p for p in rank_candidates if os.path.exists(p)), None)
if rank_path is None:
    raise FileNotFoundError("[ERROR] SHAP_FEATURE_RANKING(_LABELED).CSV が OUT_DIR にありません。")

rank_df = pd.read_csv(rank_path, encoding="utf-8-sig", index_col=0)
rank_col = "mean_abs" if "mean_abs" in rank_df.columns else ("mean_abs_shap" if "mean_abs_shap" in rank_df.columns else None)
if rank_col is None:
    raise KeyError("[ERROR] ランキングCSVに mean_abs / mean_abs_shap がありません。")
rank_df = rank_df.sort_values(rank_col, ascending=False)

feature_order = [f for f in rank_df.index if f in X_scaled_all.columns]
if not feature_order:
    raise RuntimeError("[ERROR] ランキングの特徴が X_scaled_all に存在しません。")

ks = list(range(len(feature_order), 0, -1))  # 多→少
logo = LeaveOneGroupOut()

ap_list, prauc_list = [], []
pi_list = []  # 陽性率（基準ライン）

for k in ks:
    feats = feature_order[:k]
    X = X_scaled_all[feats].astype(np.float32)
    y = y_all.values
    g = groups.values

    y_true_all, proba_all = [], []
    for tr_idx, te_idx in logo.split(X, y, g):
        X_tr, X_te = X.iloc[tr_idx], X.iloc[te_idx]
        y_tr, y_te = y[tr_idx], y[te_idx]
        if len(np.unique(y_tr)) < 2:
            raise RuntimeError("[ERROR] 学習foldが単一クラスです。閾値/範囲を見直してください。")
        m = fit_xgb_classifier(X_tr, pd.Series(y_tr))
        proba = m.predict_proba(X_te)[:, 1]
        y_true_all.append(y_te); proba_all.append(proba)

    y_true_k = np.concatenate(y_true_all)
    proba_k  = np.concatenate(proba_all)
    if len(np.unique(y_true_k)) < 2:
        raise RuntimeError("[ERROR] pooled 真値が単一クラスで AUPRC が計算できません。")

    # Average Precision（AP）と参考の台形則PR-AUC
    ap = float(average_precision_score(y_true_k, proba_k))
    prec, rec, _ = precision_recall_curve(y_true_k, proba_k)
    prauc = float(auc(rec, prec))

    ap_list.append(ap)
    prauc_list.append(prauc)
    pi_list.append(float((y_true_k == 1).mean()))

# CSV保存（kごとのAP）
out_csv = outpath("AUPRC_PER_K.CSV")
pd.DataFrame({
    "k": ks,
    "ap_pooled": ap_list,
    "pr_auc_pooled": prauc_list,
    "pi": pi_list
}).to_csv(out_csv, index=False, encoding="utf-8-sig")
print(f"[OK] CSV -> {out_csv}")

# best_k（AP最大で決定）
ap_array = np.asarray(ap_list, dtype=float)
best_k_ap = ks[int(np.nanargmax(ap_array))]
best_ap = float(np.nanmax(ap_array))
print(f"[INFO] Best AP at k={best_k_ap}: AP={best_ap:.3f}")

# 図（AP vs k：最大点の赤丸＋注釈＋基準ライン π）
plt.figure(figsize=(12, 7))
ax = plt.gca()
ax.plot(ks, ap_list, marker='o', linewidth=1.5, label="AP (AUPRC)")
ax.scatter([best_k_ap], [best_ap], s=180, color="red", zorder=5)

# πは全kで同一のはずだが、明示的に平均値を使う
pi_ref = float(np.mean(pi_list)) if len(pi_list) > 0 else np.nan
if np.isfinite(pi_ref):
    ax.axhline(pi_ref, linestyle="--", linewidth=1.5, label=f"Baseline (π={pi_ref:.3f})", alpha=0.8)

ax.annotate(f"Max AP = {best_ap:.3f} (k={best_k_ap})",
            xy=(best_k_ap, best_ap),
            xytext=(best_k_ap, best_ap + 0.03),
            ha="center", va="bottom", fontsize=20, color="red")

# 表示規約
ax.invert_xaxis()
ax.set_xlabel("Number of Features (k)", fontsize=24)
ax.set_ylabel("AUPRC (Average Precision)", fontsize=24)
ax.tick_params(axis="both", labelsize=20)
ax.set_title("AUPRC vs Number of Features", fontsize=30, pad=10)
ax.grid(True, alpha=0.4)
ax.legend(fontsize=20)

plt.tight_layout()
out_png = outpath("AUPRC_VS_NUM_FEATURES.PNG")
plt.savefig(out_png, dpi=300)
plt.close()
print(f"[OK] Plot -> {out_png}")

# --- Cell7/Cell8 互換：best_k を必要に応じて設定 ---
if USE_AP_FOR_K or ("best_k" not in globals()):
    best_k = int(best_k_ap)
    print(f"[INFO] best_k を APベースで設定: best_k={best_k}  (USE_AP_FOR_K={USE_AP_FOR_K})")
else:
    # 既存の best_k を尊重（Cell6でAUC選抜済みなど）
    print(f"[INFO] 既存の best_k を保持（USE_AP_FOR_K=False）。APベースは best_k_ap={best_k_ap}")

# =========================
# 追加：best_k における PR 曲線の作図（pooled outer-LOSO 予測）
# =========================
feats_best = feature_order[:best_k_ap]
X_best = X_scaled_all[feats_best].astype(np.float32)
y_full = y_all.values
g_full = groups.values

y_true_best, proba_best = [], []
for tr_idx, te_idx in logo.split(X_best, y_full, g_full):
    X_tr, X_te = X_best.iloc[tr_idx], X_best.iloc[te_idx]
    y_tr, y_te = y_full[tr_idx], y_full[te_idx]
    if len(np.unique(y_tr)) < 2:
        raise RuntimeError("[ERROR] best_k の学習foldが単一クラスです。閾値/範囲を見直してください。")
    m = fit_xgb_classifier(X_tr, pd.Series(y_tr))
    proba = m.predict_proba(X_te)[:, 1]
    y_true_best.append(y_te); proba_best.append(proba)

y_true_best = np.concatenate(y_true_best)
proba_best  = np.concatenate(proba_best)
if len(np.unique(y_true_best)) < 2:
    raise RuntimeError("[ERROR] best_k の pooled 真値が単一クラスで PR 曲線を描画できません。")

prec, rec, thr = precision_recall_curve(y_true_best, proba_best)
ap_best = float(average_precision_score(y_true_best, proba_best))
prauc_best = float(auc(rec, prec))
pi_best = float((y_true_best == 1).mean())

# PR 点群をCSV保存（しきい値は長さ合わせで先頭にNaNを追加）
pr_csv = outpath("PR_CURVE_AT_BEST_K.CSV")
thr_pad = np.r_[np.nan, thr]  # precision/recall と揃える
pd.DataFrame({
    "recall": rec,
    "precision": prec,
    "threshold": thr_pad
}).to_csv(pr_csv, index=False, encoding="utf-8-sig")
print(f"[OK] CSV -> {pr_csv}")

# 作図（PR曲線）
plt.figure(figsize=(10, 7))
ax = plt.gca()
# ステップ表示（精度は階段的に変化する）
ax.step(rec, prec, where="post", linewidth=1.5, label=f"PR (AP={ap_best:.3f}, PR-AUC={prauc_best:.3f})")
# 基準ライン：陽性率
ax.axhline(pi_best, linestyle="--", linewidth=1.5, label=f"Baseline π={pi_best:.3f}", alpha=0.8)

ax.set_xlabel("Recall", fontsize=24)
ax.set_ylabel("Precision", fontsize=24)
ax.tick_params(axis="both", labelsize=20)
ax.set_title(f"Precision–Recall at best k = {best_k_ap}", fontsize=30, pad=10)
ax.set_xlim([0.0, 1.0]); ax.set_ylim([0.0, 1.05])
ax.grid(True, alpha=0.4)
ax.legend(fontsize=20)

plt.tight_layout()
pr_png = outpath("PR_CURVE_AT_BEST_K.PNG")
plt.savefig(pr_png, dpi=300)
plt.close()
print(f"[OK] Plot -> {pr_png}  (features: top-{best_k_ap})")
            

KeyboardInterrupt: 

In [None]:
# ===== Cell 6c: MSSQ群別 AUC vs k（in-group LOSO；しきい値非依存） =====
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.metrics import roc_auc_score
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import json

# --- 前提チェック ---
req_vars = ["X_scaled_all", "y_all", "groups", "SUBJECT_META"]
for v in req_vars:
    if v not in globals():
        raise RuntimeError(f"[Cell6c][ERROR] 必要変数 {v} が未定義である。前セルを実行すること。")

if "MSSQ_group" not in SUBJECT_META.columns:
    raise RuntimeError("[Cell6c][ERROR] SUBJECT_META に 'MSSQ_group' 列が存在しない。")

# outpath が無い環境でも動くようにフォールバック
if "outpath" not in globals():
    def outpath(name: str) -> str:
        return os.path.abspath(name)

# --- ランキング読込（Cell 4/6と同様） ---
rank_candidates = [outpath("SHAP_FEATURE_RANKING_LABELED.CSV"),
                   outpath("SHAP_FEATURE_RANKING.CSV")]
rank_path = next((p for p in rank_candidates if os.path.exists(p)), None)
if rank_path is None:
    raise FileNotFoundError("[Cell6c][ERROR] SHAP_FEATURE_RANKING(_LABELED).CSV が見つからない。")

rank_df = pd.read_csv(rank_path, encoding="utf-8-sig", index_col=0)
rank_col = "mean_abs" if "mean_abs" in rank_df.columns else (
    "mean_abs_shap" if "mean_abs_shap" in rank_df.columns else None
)
if rank_col is None:
    raise KeyError("[Cell6c][ERROR] ランキングCSVに mean_abs / mean_abs_shap が無い。")

# 特徴の並び（存在チェック込み）
feature_order = [f for f in rank_df.sort_values(rank_col, ascending=False).index
                 if f in X_scaled_all.columns]
if not feature_order:
    raise RuntimeError("[Cell6c][ERROR] ランキング上位特徴が X_scaled_all に1つも見つからない。")

# k の候補（多→少）
ks = list(range(len(feature_order), 0, -1))

# --- 群ラベル（MSSQ High/Low）を各サンプルに付与 ---
sid_series = pd.Series(groups.astype(str), index=X_scaled_all.index)  # subject_id/被験者ID
missing_ids = sorted(set(sid_series.unique()) - set(SUBJECT_META.index))
if missing_ids:
    raise RuntimeError(f"[Cell6c][ERROR] SUBJECT_META に無い subject_id: {missing_ids}")

fair_groups = sid_series.map(SUBJECT_META["MSSQ_group"].astype(str))  # "High"/"Low"
if fair_groups.isna().any():
    bad = sorted(sid_series[fair_groups.isna()].unique().tolist())
    raise RuntimeError(f"[Cell6c][ERROR] MSSQ_group が欠損の subject_id: {bad}")

# --- 学習器ビルダー（既存の fit_xgb_classifier を優先） ---
def _fit_clf(X_tr, y_tr):
    if "fit_xgb_classifier" in globals():
        return fit_xgb_classifier(X_tr, pd.Series(y_tr))
    # フォールバック：ロジスティック回帰（L2）※xgboost未定義時
    from sklearn.linear_model import LogisticRegression
    m = LogisticRegression(max_iter=200, class_weight="balanced", solver="lbfgs")
    m.fit(X_tr, y_tr.astype(int))
    return m

# --- 群ごとに in-group LOSO で AUC vs k を計算 ---
logo = LeaveOneGroupOut()

def _auc_vs_k_for_group(tag: str):
    """tag ∈ {'High','Low'}"""
    mask = (fair_groups.values == tag)
    if mask.sum() == 0:
        print(f"[Cell6c][WARN] {tag} 群にサンプルが無い。")
        return [np.nan]*len(ks), 0, 0, 0

    X_sub = X_scaled_all.loc[mask]
    y_sub = pd.Series(y_all, index=X_scaled_all.index).loc[mask].astype(int).values
    sid_sub = sid_series.loc[mask].values

    n_sub = int(len(y_sub))
    pos_sub = int(np.sum(y_sub == 1))
    neg_sub = int(np.sum(y_sub == 0))

    aucs = []
    for k in ks:
        feats = feature_order[:k]
        Xk = X_sub[feats].astype(np.float32)

        y_true_all, proba_all = [], []
        skipped = 0

        for tr_idx, te_idx in logo.split(Xk.values, y_sub, sid_sub):
            y_tr = y_sub[tr_idx]
            # 学習foldが単一クラスなら skip（AUC計算は pooled で行う）
            if len(np.unique(y_tr)) < 2:
                skipped += 1
                continue

            m = _fit_clf(Xk.iloc[tr_idx], y_tr)
            proba = m.predict_proba(Xk.iloc[te_idx])[:, 1]

            y_true_all.append(y_sub[te_idx])
            proba_all.append(proba)

        if len(y_true_all) == 0:
            print(f"[Cell6c][WARN] {tag} 群 k={k}: すべてのfoldで学習ラベルが単一 → AUC不可")
            aucs.append(np.nan); continue

        y_pool = np.concatenate(y_true_all)
        p_pool = np.concatenate(proba_all)

        if len(np.unique(y_pool)) < 2:
            print(f"[Cell6c][WARN] {tag} 群 k={k}: pooled 真値が単一クラス → AUC不可")
            aucs.append(np.nan); continue

        auc = float(roc_auc_score(y_pool, p_pool))
        aucs.append(auc)

    return aucs, n_sub, pos_sub, neg_sub

auc_high, nH, posH, negH = _auc_vs_k_for_group("High")
auc_low,  nL, posL, negL = _auc_vs_k_for_group("Low")

# --- CSV保存 ---
df_out = pd.DataFrame({
    "k": ks,
    "auc_high": auc_high,
    "auc_low": auc_low,
    "n_high": nH, "pos_high": posH, "neg_high": negH,
    "n_low": nL,  "pos_low": posL,  "neg_low": negL,
})
csv_path = outpath("AUC_PER_K_BY_GROUP.CSV")
df_out.to_csv(csv_path, index=False, encoding="utf-8-sig")
print(f"[Cell6c] CSV -> {csv_path}")

# --- best k（同点は“より小さいk”を優先） ---
def _best_k(ks_list, auc_list):
    ks_arr = np.asarray(ks_list, dtype=int)
    auc_arr = np.asarray(auc_list, dtype=float)
    if np.all(np.isnan(auc_arr)):
        return None, np.nan
    maxv = np.nanmax(auc_arr)
    # isclose を使って同点許容（数値誤差対策）
    mask = np.isclose(auc_arr, maxv, rtol=1e-6, atol=1e-12)
    cand_k = ks_arr[mask]
    best_k = int(np.min(cand_k))  # より小さいkを選好
    return best_k, float(maxv)

best_k_high, best_auc_high = _best_k(ks, auc_high)
best_k_low,  best_auc_low  = _best_k(ks, auc_low)

# JSONでも保存（任意）
best_json = {
    "BEST_K_HIGH": best_k_high,
    "BEST_AUC_HIGH": best_auc_high,
    "BEST_K_LOW": best_k_low,
    "BEST_AUC_LOW": best_auc_low,
}
json_path = outpath("BEST_K_BY_GROUP.JSON")
with open(json_path, "w", encoding="utf-8") as f:
    json.dump(best_json, f, ensure_ascii=False, indent=2)
print(f"[Cell6c] BEST_K 保存 -> {json_path}")
print(f"[Cell6c] High: best_k={best_k_high}, AUC={best_auc_high:.3f} | Low: best_k={best_k_low}, AUC={best_auc_low:.3f}")

# --- 図：AUC vs k（High/Low） ---
plt.rcParams.update({
    "figure.dpi": 120, "savefig.dpi": 300,
    "lines.linewidth": 1.5,
    "axes.titlesize": 30,
    "axes.labelsize": 24,
    "legend.fontsize": 20,
    "xtick.labelsize": 20,
    "ytick.labelsize": 20,
})

fig, ax = plt.subplots(figsize=(12, 7))

ax.plot(ks, auc_high, marker="o", label="MSSQ High")
ax.plot(ks, auc_low,  marker="s", label="MSSQ Low")

# 最大点にマーカー＋注釈（存在する場合のみ）
if best_k_high is not None and not np.isnan(best_auc_high):
    ax.scatter([best_k_high], [best_auc_high], s=160, zorder=5)
    ax.annotate(f"High max={best_auc_high:.3f} (k={best_k_high})",
                xy=(best_k_high, best_auc_high),
                xytext=(best_k_high, best_auc_high + 0.02),
                ha="center", va="bottom", fontsize=18)

if best_k_low is not None and not np.isnan(best_auc_low):
    ax.scatter([best_k_low], [best_auc_low], s=160, zorder=5)
    ax.annotate(f"Low max={best_auc_low:.3f} (k={best_k_low})",
                xy=(best_k_low, best_auc_low),
                xytext=(best_k_low, best_auc_low + 0.02),
                ha="center", va="bottom", fontsize=18)

ax.invert_xaxis()  # k が大→小の視覚（既存Cell6に合わせる）
ax.set_xlabel("Number of Features (k)")
ax.set_ylabel("ROC AUC (pooled, in-group LOSO)")
ax.set_title("AUC vs Number of Features by MSSQ Group (in-group LOSO)")
ax.grid(True, alpha=0.4)
ax.legend(loc="best")

plt.tight_layout()
png_path = outpath("AUC_VS_K_BY_GROUP.PNG")
plt.savefig(png_path, dpi=300)
plt.close()
print(f"[Cell6c] Plot -> {png_path}")


[Cell6c] CSV -> C:\Users\taiki\OneDrive - Science Tokyo\デスクトップ\研究\本実験結果\ANALYSIS\機械学習(MSSQ込み)\閾値FMS1\AUC_PER_K_BY_GROUP.CSV
[Cell6c] BEST_K 保存 -> C:\Users\taiki\OneDrive - Science Tokyo\デスクトップ\研究\本実験結果\ANALYSIS\機械学習(MSSQ込み)\閾値FMS1\BEST_K_BY_GROUP.JSON
[Cell6c] High: best_k=30, AUC=0.651 | Low: best_k=3, AUC=0.798
[Cell6c] Plot -> C:\Users\taiki\OneDrive - Science Tokyo\デスクトップ\研究\本実験結果\ANALYSIS\機械学習(MSSQ込み)\閾値FMS1\AUC_VS_K_BY_GROUP.PNG


In [None]:
# ===== Cell 7: 実験設定（固定 best_k／モデル既定／乱数・閾値探索パラメータ） =====
import numpy as np

# --- best_k を Cell 6 の結果から固定採用する ---
assert 'best_k' in globals(), "[Cell7] best_k が未定義である（Cell 6 を先に実行すること）"
USE_GLOBAL_BESTK: bool = True
BEST_K: int = int(best_k)

# （参考）本セルでは k 探索は行わないため K_LIST は使用しない（固定kで運用）
# K_LIST = [BEST_K]

# --- 群別グリッドサーチ（しきい値最適化：BA 最大化）の設定 ---
THRESH_COARSE_STEP: float = 0.01   # 粗探索刻み
THRESH_FINE_STEP:   float = 0.001  # 細探索刻み
THRESH_MARGIN:      float = 0.03   # 細探索の±幅

# --- しきい値探索モード／WG定義／校正方式 ---
#THRESH_SEARCH_MODE: str = "exact"   # ← 追加（"exact" 固定運用）
#THRESH_WG_MODE:     str = "min"     # ← 追加（"min": 最悪群, "mean": 等重み）
#THRESH_CALIB:       str = "none"    # ← 任意（"none"/"platt"/"isotonic"）

# --- HL 検証ペア（学習側16名の12.5%＝2名）の選定ポリシ ---
VAL_RETRY_MAX: int = 30            # 条件を満たす HL ペアが引けるまでの再抽選上限回数
VAL_REQUIRE_BOTH_CLASSES: bool = True  # 検証2名の合算で陽性/陰性がともに>0を要求する
VAL_MIN_SAMPLES: int | None = None     # 検証データの最小サンプル数下限（例: 40）。不要なら None

# --- 分類器（XGBoost）既定値 ---
#     目的：確率の安定性と過学習抑制のバランス（早期終了なし。閾値は別途最適化するため）

# --- 不均衡対策：scale_pos_weight をデータ毎に都度与える ---
def _scale_pos_weight_from_y(y_binary: np.ndarray) -> float:
    """y_binary(0/1) から neg/pos を返す。pos=0 の場合は 1.0 を返す（学習は通すがログで警告する）である。"""
    pos = int(np.sum(y_binary == 1))
    neg = int(np.sum(y_binary == 0))
    if pos == 0:
        print("[Cell7][WARN] y に陽性が存在しないため scale_pos_weight を 1.0 とする（学習は通す）")
        return 1.0
    return float(neg / max(pos, 1))

# --- XGBClassifier ビルダー（既存実装と衝突しないように存在チェック） ---
from xgboost import XGBClassifier

# ここでは XGB_DEFAULTS を定義しない（Cell 0 の XGB_PARAMS が唯一のソース）
def _build_xgb(scale_pos_weight: float) -> XGBClassifier:
    # Cell 0 定義を土台にし、foldごとの不均衡だけ追記する
    params = dict(XGB_PARAMS)
    params["scale_pos_weight"] = float(scale_pos_weight)
    return XGBClassifier(**params)

# 既に他セルで fit_xgb_classifier が定義済みならそれを優先し、未定義なら軽量版を用意する
if "fit_xgb_classifier" not in globals():
    def fit_xgb_classifier(X_train, y_train):
        """
        既定ハイパラ＋データ由来の scale_pos_weight で XGB を単一学習して返す簡易版である。
        - 早期終了や eval_set は使用しない（閾値最適化は別セルで行うため）
        """
        y_arr = np.asarray(y_train, dtype=int)
        spw = _scale_pos_weight_from_y(y_arr)
        model = _build_xgb(spw)
        model.fit(X_train, y_arr)
        return model

# --- ログ出力（実行時確認用） ---
print("[Cell7] Settings:")
print(f"  USE_GLOBAL_BESTK = {USE_GLOBAL_BESTK}")
print(f"  BEST_K = {BEST_K}")
print(f"  GridSearch (BA): coarse={THRESH_COARSE_STEP}, fine={THRESH_FINE_STEP}, margin=±{THRESH_MARGIN}")
print(f"  HL Validator: retry_max={VAL_RETRY_MAX}, both_classes_required={VAL_REQUIRE_BOTH_CLASSES}, min_samples={VAL_MIN_SAMPLES}")
print(f"  XGB_PARAMS = {XGB_PARAMS}")
print(f"  SEED_BASE = {SEED_BASE}")


[Cell7] Settings:
  USE_GLOBAL_BESTK = True
  BEST_K = 5
  GridSearch (BA): coarse=0.01, fine=0.001, margin=±0.03
  HL Validator: retry_max=30, both_classes_required=True, min_samples=None
  XGB_PARAMS = {'n_estimators': 100, 'eval_metric': 'logloss', 'subsample': 1.0, 'colsample_bytree': 1.0, 'n_jobs': 1, 'tree_method': 'hist', 'device': 'cpu', 'seed': 0, 'random_state': 0}
  SEED_BASE = 20251101


In [None]:
# ===== Cell 7.4 (NEW): Subjectごとのラベル分布を集計しCSV保存 =====
import pandas as pd
import numpy as np
import os

# 前提: groups (pd.Series: sample -> subject_id as str), y_all (pd.Series: 0/1)
if groups.dtype != 'O':
    groups = groups.astype(str)
y_bin = y_all.astype(int)

# 集計: 被験者ごとの総件数・陽性数・陰性数
_subj_stats = (
    pd.DataFrame({"sid": groups.values, "y": y_bin.values})
      .groupby("sid")["y"]
      .agg(n_total="count", pos="sum")
      .reset_index()
)
_subj_stats["neg"] = _subj_stats["n_total"] - _subj_stats["pos"]

# 可能なら群情報（MSSQ_group）を付加（無ければスキップ）
if "SUBJECT_META" in globals() and "MSSQ_group" in SUBJECT_META.columns:
    _subj_stats = _subj_stats.merge(
        SUBJECT_META[["MSSQ_group"]].reset_index().rename(columns={"subject_id":"sid"}),
        on="sid", how="left"
    )

SUBJECT_LABEL_STATS = _subj_stats.set_index("sid").sort_index()

# 保存パスを決定
csv_name = "SUBJECT_LABEL_STATS.CSV"
try:
    save_path = outpath(csv_name)  # 既存の outpath がある前提
except NameError:
    # outpath が未定義ならカレントに保存
    save_path = os.path.join(os.getcwd(), csv_name)

# CSV 保存（BOM付き）
SUBJECT_LABEL_STATS.to_csv(save_path, encoding="utf-8-sig")
print(f"[Cell7.4] SUBJECT_LABEL_STATS saved -> {save_path} (subjects={SUBJECT_LABEL_STATS.shape[0]})")

# 確認表示（先頭のみ）
display(SUBJECT_LABEL_STATS.head())


[Cell7.4] SUBJECT_LABEL_STATS saved -> C:\Users\taiki\OneDrive - Science Tokyo\デスクトップ\研究\本実験結果\ANALYSIS\機械学習(MSSQ込み)\閾値FMS1\SUBJECT_LABEL_STATS.CSV (subjects=17)


Unnamed: 0_level_0,n_total,pos,neg,MSSQ_group
sid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
10061,20,6,14,High
10063,20,0,20,Low
10064,20,3,17,High
10071,20,14,6,High
10072,20,0,20,Low


In [None]:
# ===== Cell 7.5 (REPLACE): inner-LOSO folds builder =====
from typing import List
import pandas as pd

def choose_inner_folds_loso(train_subject_ids: List[str]) -> List[List[str]]:
    """
    外側LOSOの学習側 subject_id（16名）を受け取り、内側LOSOの検証者を1名ずつ回す
    16分割のfoldリストを返す（[[sid1],[sid2],...,[sid16]]）。

    厳格運用：想定から外れたら即 RuntimeError
    """
    if not isinstance(train_subject_ids, (list, tuple)):
        raise RuntimeError("[Cell7.5] train_subject_ids は list/tuple である必要がある。")
    # 文字列正規化
    train_subject_ids = [str(sid) for sid in train_subject_ids]
    # ユニーク・件数チェック
    uniq = list(pd.unique(pd.Series(train_subject_ids)))
    if len(uniq) != 16:
        raise RuntimeError(f"[Cell7.5] 学習側 subject 数が16ではない: {len(uniq)}")
    # 安定順序（昇順）
    try:
        uniq_sorted = sorted(uniq, key=lambda x: (len(x), x))
    except Exception:
        uniq_sorted = sorted(uniq)

    folds = [[sid] for sid in uniq_sorted]  # 検証=1名ずつ
    print(f"[Cell7.5] inner-LOSO folds: {len(folds)} splits -> val subjects = {', '.join(uniq_sorted)}")
    return folds


In [None]:
# ===== Cell 8 (REPLACE): inner-LOSO で τ 最適化（Single/Group-GLOBAL/WG-1D/WG-2D）→ 外側LOSOで予測収集 → プールAUC/BA算出 =====
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.metrics import confusion_matrix, roc_auc_score

# ---------------- 基本設定 ----------------
METRIC = "ba"     # BA固定
METRIC_NAME = "BA"
THRESH_SEARCH_MODE = str(globals().get("THRESH_SEARCH_MODE", "exact")).lower()

COARSE_STEP = float(globals().get("THRESH_COARSE_STEP", 0.01))
FINE_STEP   = float(globals().get("THRESH_FINE_STEP",   0.001))
MARGIN      = float(globals().get("THRESH_MARGIN",      0.03))

# best_k の取得（Cell 6/7で決定済み）
k_use = int(BEST_K)

# MSSQ High/Low の取得（グループ情報）
if not set(groups.unique()).issubset(set(SUBJECT_META.index)):
    missing_ids = sorted(set(groups.unique()) - set(SUBJECT_META.index))
    raise ValueError(f"[Cell8] SUBJECT_META に無い subject_id: {missing_ids}")
fair_groups = groups.map(SUBJECT_META["MSSQ_group"]).astype(str)

# SHAPランキングから上位 k_use 特徴を抽出
rank_candidates = [outpath("SHAP_FEATURE_RANKING_LABELED.CSV"), outpath("SHAP_FEATURE_RANKING.CSV")]
rank_path = next((p for p in rank_candidates if os.path.exists(p)), None)
if rank_path is None:
    raise FileNotFoundError("[Cell8] SHAP_FEATURE_RANKING(_LABELED).CSV が見つからない（Cell 4/6 を先に）")
rank_df = pd.read_csv(rank_path, encoding="utf-8-sig", index_col=0)
rank_col = "mean_abs" if "mean_abs" in rank_df.columns else ("mean_abs_shap" if "mean_abs_shap" in rank_df.columns else None)
if rank_col is None:
    raise KeyError("[Cell8] ランキングCSVに mean_abs / mean_abs_shap 列が無い")
feature_order = [f for f in rank_df.sort_values(rank_col, ascending=False).index if f in X_scaled_all.columns]
feats_k = feature_order[:k_use]
if len(feats_k) < k_use:
    print(f"[Cell8][WARN] Xに存在しない特徴が含まれたため、実使用は {len(feats_k)} 列")
X_k = X_scaled_all[feats_k].astype(np.float32)

# Series の整形
y  = pd.Series(y_all.astype(int),  index=X_scaled_all.index)
g  = pd.Series(groups.astype(str), index=X_scaled_all.index)
fg = pd.Series(fair_groups.astype(str), index=X_scaled_all.index)

# ---------------- ユーティリティ ----------------
def _cumulative_conf_table(scores: np.ndarray, labels: np.ndarray):
    order = np.argsort(-scores)
    s = np.asarray(scores, float)[order]
    yb = np.asarray(labels, int)[order]
    pos = (yb == 1).astype(int)
    neg = (yb == 0).astype(int)
    cpos = np.cumsum(pos)
    cneg = np.cumsum(neg)
    return s, yb, cpos, cneg, int(pos.sum()), int(neg.sum())

def _conf_from_threshold(sort_scores, sort_labels, cpos, cneg, P, N, tau: float):
    k = int(np.searchsorted(-sort_scores, -tau, side="right"))  # s >= tau を正
    TP = int(cpos[k-1]) if k > 0 else 0
    FP = int(cneg[k-1]) if k > 0 else 0
    FN = int(P - TP)
    TN = int(N - FP)
    return TP, FP, FN, TN

def _ba_from_conf(TP, FP, FN, TN):
    TPR = TP / (TP + FN) if (TP + FN) > 0 else 0.0
    TNR = TN / (TN + FP) if (TN + FP) > 0 else 0.0
    return 0.5 * (TPR + TNR)

def _f1_from_conf(TP, FP, FN):
    denom = (2*TP + FP + FN)
    return (2*TP / denom) if denom > 0 else 0.0

def _score_from_conf(metric: str, TP, FP, FN, TN):
    return _f1_from_conf(TP, FP, FN) if metric == "f1" else _ba_from_conf(TP, FP, FN, TN)

def _score_from_preds(metric: str, y_true_bin: np.ndarray, y_pred_bin: np.ndarray) -> float:
    cm = confusion_matrix(y_true_bin, y_pred_bin, labels=[0,1])
    TN, FP, FN, TP = cm.ravel()
    return float(_score_from_conf(metric, TP, FP, FN, TN))

def _conf_vectors_from_candidates(tab, taus: np.ndarray):
    sort_scores, sort_labels, cpos, cneg, P, N = tab
    k = np.searchsorted(-sort_scores, -taus, side="right")  # vectorized
    TP = np.where(k > 0, cpos[k-1], 0)
    FP = np.where(k > 0, cneg[k-1], 0)
    FN = P - TP
    TN = N - FP
    return TP, FP, FN, TN

def _make_candidates(scores: np.ndarray) -> np.ndarray:
    s = np.asarray(scores, float)
    if s.size == 0:
        return np.array([0.5], dtype=float)
    uniq = np.unique(s)
    hi = np.nextafter(float(uniq.max()), np.inf)
    lo = np.nextafter(float(uniq.min()), -np.inf)
    return np.concatenate([[hi], uniq[::-1], [lo]])

def _best_tau_single_exact(scores: np.ndarray, labels: np.ndarray, metric: str) -> float:
    tab = _cumulative_conf_table(np.asarray(scores,float), np.asarray(labels,int))
    taus = _make_candidates(scores)
    TP, FP, FN, TN = _conf_vectors_from_candidates(tab, taus)
    if metric == "f1":
        num = 2*TP; den = 2*TP + FP + FN
        sc = np.where(den > 0, num/den, 0.0)
    else:
        TPR = np.where((TP+FN) > 0, TP/(TP+FN), 0.0)
        TNR = np.where((TN+FP) > 0, TN/(TN+FP), 0.0)
        sc = 0.5*(TPR+TNR)
    tau_clip = np.clip(taus, 0.0, 1.0)
    best = np.argmax(sc)
    ties = np.where(np.isclose(sc, sc[best]))[0]
    if ties.size > 1:
        d = np.abs(tau_clip[ties] - 0.5)
        j = np.argmin(d); cand = ties[np.isclose(d, d[j])]
        best = cand[np.argmin(taus[cand])] if cand.size > 1 else cand[0]
    return float(taus[best])

def _best_tau_single(scores: np.ndarray, labels: np.ndarray,
                     coarse: float, fine: float, margin: float,
                     metric: str) -> float:
    tab = _cumulative_conf_table(scores, labels)
    def score_at(t):
        TP, FP, FN, TN = _conf_from_threshold(*tab, t)
        return _score_from_conf(metric, TP, FP, FN, TN)
    best = (-1.0, 0.5)
    for t in np.arange(0.0, 1.0 + 1e-12, coarse):
        sc = score_at(t)
        if sc > best[0]:
            best = (float(sc), float(t))
    t0 = best[1]
    lo = max(0.0, t0 - margin); hi = min(1.0, t0 + margin)
    score0, t_best = best[0], t0
    for t in np.arange(lo, hi + 1e-12, fine):
        sc = score_at(t)
        if sc > score0 or (np.isclose(sc, score0) and (abs(t-0.5) < abs(t_best-0.5) or (abs(t-0.5) == abs(t_best-0.5) and t < t_best))):
            score0, t_best = float(sc), float(t)
    return float(t_best)

def _grid_search_group_thresholds_exact(scores_H, labels_H, scores_L, labels_L, metric: str):
    sH = np.asarray(scores_H, float); yH = np.asarray(labels_H, int)
    sL = np.asarray(scores_L, float); yL = np.asarray(labels_L, int)

    tabH = _cumulative_conf_table(sH, yH)
    tabL = _cumulative_conf_table(sL, yL)
    tausH = _make_candidates(sH)
    tausL = _make_candidates(sL)

    TP_H, FP_H, FN_H, TN_H = _conf_vectors_from_candidates(tabH, tausH)
    TP_L, FP_L, FN_L, TN_L = _conf_vectors_from_candidates(tabL, tausL)

    TP = TP_H[:, None] + TP_L[None, :]
    FP = FP_H[:, None] + FP_L[None, :]
    FN = FN_H[:, None] + FN_L[None, :]
    TN = TN_H[:, None] + TN_L[None, :]

    if metric == "f1":
        num = 2*TP; den = 2*TP + FP + FN
        S_all = np.where(den > 0, num/den, 0.0)
    else:
        TPR = np.where((TP+FN) > 0, TP/(TP+FN), 0.0)
        TNR = np.where((TN+FP) > 0, TN/(TN+FP), 0.0)
        S_all = 0.5*(TPR+TNR)

    idx = np.unravel_index(np.argmax(S_all), S_all.shape)
    best_val = S_all[idx]

    # tie-break: 平均τが0.5に近い→さらに平均が小さい
    avg = (np.clip(tausH,0,1)[:,None] + np.clip(tausL,0,1)[None,:]) / 2
    ties = np.argwhere(np.isclose(S_all, best_val))
    if ties.shape[0] > 1:
        d = np.abs(avg[ties[:,0], ties[:,1]] - 0.5)
        j = np.argmin(d); cand = ties[np.isclose(d, d[j])]
        if cand.shape[0] > 1:
            avgv = avg[cand[:,0], cand[:,1]]
            idx = (int(cand[np.argmin(avgv),0]), int(cand[np.argmin(avgv),1]))
        else:
            idx = (int(cand[0,0]), int(cand[0,1]))

    tauH_best = float(tausH[idx[0]])
    tauL_best = float(tausL[idx[1]])
    return {"GLOBAL": {"score": float(best_val), "tauH": tauH_best, "tauL": tauL_best}}

def _best_tau_groupwise_exact(scores_H, labels_H, scores_L, labels_L, metric: str, wg_mode: str = "min"):
    tauH = _best_tau_single_exact(scores_H, labels_H, metric)
    tauL = _best_tau_single_exact(scores_L, labels_L, metric)
    tabH = _cumulative_conf_table(np.asarray(scores_H,float), np.asarray(labels_H,int))
    tabL = _cumulative_conf_table(np.asarray(scores_L,float), np.asarray(labels_L,int))
    TP_H, FP_H, FN_H, TN_H = _conf_from_threshold(*tabH, tauH)
    TP_L, FP_L, FN_L, TN_L = _conf_from_threshold(*tabL, tauL)
    sH = _score_from_conf(metric, TP_H, FP_H, FN_H, TN_H)
    sL = _score_from_conf(metric, TP_L, FP_L, FN_L, TN_L)
    WG = min(sH, sL) if wg_mode == "min" else 0.5*(sH + sL)
    return {"tauH": float(tauH), "tauL": float(tauL), "score_H": float(sH), "score_L": float(sL), "WG": float(WG)}

# --- NEW: 2D 最悪群最適化（min-max） ---
def _grid_search_group_thresholds_exact_minimax(scores_H, labels_H, scores_L, labels_L, metric: str):
    """
    max_{τH,τL} min{ S_H(τH), S_L(τL) } を 2D で厳密探索。
    タイブレーク: (1) |S_H-S_L| 最小 → (2) 平均τが0.5に近い → (3) 平均τが小さい。
    """
    sH = np.asarray(scores_H, float); yH = np.asarray(labels_H, int)
    sL = np.asarray(scores_L, float); yL = np.asarray(labels_L, int)

    tabH = _cumulative_conf_table(sH, yH)
    tabL = _cumulative_conf_table(sL, yL)
    tausH = _make_candidates(sH)
    tausL = _make_candidates(sL)

    TP_H, FP_H, FN_H, TN_H = _conf_vectors_from_candidates(tabH, tausH)
    TP_L, FP_L, FN_L, TN_L = _conf_vectors_from_candidates(tabL, tausL)

    if metric == "f1":
        numH, denH = 2*TP_H, 2*TP_H + FP_H + FN_H
        numL, denL = 2*TP_L, 2*TP_L + FP_L + FN_L
        S_H = np.where(denH > 0, numH/denH, 0.0)
        S_L = np.where(denL > 0, numL/denL, 0.0)
    else:
        TPR_H = np.where((TP_H+FN_H) > 0, TP_H/(TP_H+FN_H), 0.0)
        TNR_H = np.where((TN_H+FP_H) > 0, TN_H/(TN_H+FP_H), 0.0)
        TPR_L = np.where((TP_L+FN_L) > 0, TP_L/(TP_L+FN_L), 0.0)
        TNR_L = np.where((TN_L+FP_L) > 0, TN_L/(TN_L+FP_L), 0.0)
        S_H = 0.5*(TPR_H+TNR_H)
        S_L = 0.5*(TPR_L+TNR_L)

    M = np.minimum(S_H[:, None], S_L[None, :])          # (mH, mL)
    iH, iL = np.unravel_index(np.argmax(M), M.shape)
    best_val = M[iH, iL]

    ties = np.argwhere(np.isclose(M, best_val))
    if ties.shape[0] > 1:
        gap = np.abs(S_H[ties[:,0]] - S_L[ties[:,1]])
        j = np.argmin(gap)
        cand = ties[np.isclose(gap, gap[j])]
        if cand.shape[0] > 1:
            avg_tau = (np.clip(tausH,0,1)[cand[:,0]] + np.clip(tausL,0,1)[cand[:,1]])/2
            j2 = np.argmin(np.abs(avg_tau - 0.5))
            cand2 = cand[np.isclose(np.abs(avg_tau-0.5), np.abs(avg_tau-0.5)[j2])]
            if cand2.shape[0] > 1:
                avg_tau2 = (tausH[cand2[:,0]] + tausL[cand2[:,1]])/2
                iH, iL = int(cand2[np.argmin(avg_tau2),0]), int(cand2[np.argmin(avg_tau2),1])
            else:
                iH, iL = int(cand2[0,0]), int(cand2[0,1])
        else:
            iH, iL = int(cand[0,0]), int(cand[0,1])

    tauH_best = float(tausH[iH]); tauL_best = float(tausL[iL])
    return {"WG2D": {
        "score": float(best_val),
        "tauH": tauH_best, "tauL": tauL_best,
        "score_H": float(S_H[iH]), "score_L": float(S_L[iL]),
    }}

# ---------------- outer-LOSO：inner-LOSOで τ を決定し、外側テストへ適用 ----------------
logo = LeaveOneGroupOut()
rows, pred_rows = [], []

for fold_id, (tr_idx, te_idx) in enumerate(logo.split(X_k, y, g), start=1):
    test_sids = pd.Index(g.iloc[te_idx]).unique().tolist()
    if len(test_sids) != 1:
        raise RuntimeError(f"[Cell8] LOSO違反: fold={fold_id}, test={test_sids}")
    test_sid = test_sids[0]
    train_sids = pd.Index(g.iloc[tr_idx]).unique().tolist()

    # --- inner-LOSO: OOF作成 ---
    oof_index = X_k.index[g.isin(train_sids)]
    oof_proba = pd.Series(index=oof_index, dtype=float)
    oof_true  = pd.Series(index=oof_index, dtype=int)
    oof_group = pd.Series(index=oof_index, dtype=object)

    for val_sid in train_sids:
        val_mask   = (g == val_sid)
        train_mask = g.isin([sid for sid in train_sids if sid != val_sid])
        if train_mask.sum() == 0 or val_mask.sum() == 0:
            raise RuntimeError(f"[Cell8] inner-LOSO 空fold（val={val_sid})")
        model_inner = fit_xgb_classifier(X_k.loc[train_mask], y.loc[train_mask])
        proba_val = model_inner.predict_proba(X_k.loc[val_mask])[:, 1]
        idx = val_mask[val_mask].index
        oof_proba.loc[idx] = proba_val
        oof_true.loc[idx]  = y.loc[val_mask].values
        oof_group.loc[idx] = fg.loc[val_mask].values

    valid_idx = oof_proba.dropna().index
    if len(valid_idx) == 0:
        raise RuntimeError(f"[Cell8] fold {fold_id}: OOFが空")

    # --- Single τ（属性完全無視） ---
    if THRESH_SEARCH_MODE == "exact":
        tau_single = _best_tau_single_exact(oof_proba.loc[valid_idx].to_numpy(),
                                            oof_true.loc[valid_idx].to_numpy().astype(int), METRIC)
    else:
        tau_single = _best_tau_single(oof_proba.loc[valid_idx].to_numpy(),
                                      oof_true.loc[valid_idx].to_numpy().astype(int),
                                      coarse=COARSE_STEP, fine=FINE_STEP, margin=MARGIN, metric=METRIC)

    # --- Group用 OOF の分割 ---
    s_H = oof_proba.loc[valid_idx][oof_group.loc[valid_idx] == "High"].to_numpy()
    y_H = oof_true.loc[valid_idx][oof_group.loc[valid_idx] == "High"].to_numpy().astype(int)
    s_L = oof_proba.loc[valid_idx][oof_group.loc[valid_idx] == "Low"].to_numpy()
    y_L = oof_true.loc[valid_idx][oof_group.loc[valid_idx] == "Low"].to_numpy().astype(int)
    if len(s_H) == 0 or len(s_L) == 0:
        raise RuntimeError(f"[Cell8] fold {fold_id}: OOFにHigh/Lowが揃わない")

    # A) Group-GLOBAL（全体指標最大化）
    gopt_global = _grid_search_group_thresholds_exact(s_H, y_H, s_L, y_L, metric=METRIC)["GLOBAL"]
    tauH_MAIN, tauL_MAIN = gopt_global["tauH"], gopt_global["tauL"]

    # B) Group-WG（1D×2 独立最適；min/meanはTHRESH_WG_MODEで選択）
    WG_MODE = str(globals().get("THRESH_WG_MODE", "min")).lower()
    if WG_MODE not in {"min", "mean"}:
        WG_MODE = "min"
    gopt_wg = _best_tau_groupwise_exact(s_H, y_H, s_L, y_L, metric=METRIC, wg_mode=WG_MODE)
    tauH_WG1D, tauL_WG1D = gopt_wg["tauH"], gopt_wg["tauL"]

    # C) Group-WG（2D min-max：max min{S_H,S_L}）
    gopt_wg2d = _grid_search_group_thresholds_exact_minimax(s_H, y_H, s_L, y_L, metric=METRIC)["WG2D"]
    tauH_WG2D, tauL_WG2D = gopt_wg2d["tauH"], gopt_wg2d["tauL"]

    print(f"[Cell8] outer {fold_id}: {METRIC_NAME} | "
          f"Single τ={tau_single:.3f} | Group-BA τ=({tauH_MAIN:.3f},{tauL_MAIN:.3f}) | "
          f"WG-1D τ=({tauH_WG1D:.3f},{tauL_WG1D:.3f}) | WG-2D τ=({tauH_WG2D:.3f},{tauL_WG2D:.3f})")

    # --- 外側テスト：学習側で再学習 → テストへ適用 ---
    model_full = fit_xgb_classifier(X_k.loc[g.isin(train_sids)], y.loc[g.isin(train_sids)])
    te_mask  = (g == test_sid)
    proba_te = model_full.predict_proba(X_k.loc[te_mask])[:, 1]
    y_te     = y.loc[te_mask].to_numpy().astype(int)
    grp_te   = fg.loc[te_mask].to_numpy()

    # 2値化
    y_pred_single     = (proba_te >= tau_single).astype(int)
    y_pred_group_MAIN = (proba_te >= np.where(grp_te=="High", tauH_MAIN,  tauL_MAIN)).astype(int)
    y_pred_group_WG1D = (proba_te >= np.where(grp_te=="High", tauH_WG1D, tauL_WG1D)).astype(int)
    y_pred_group_WG2D = (proba_te >= np.where(grp_te=="High", tauH_WG2D, tauL_WG2D)).astype(int)

    # fold内スコア
    SCORE_single      = _score_from_preds(METRIC, y_te, y_pred_single)
    SCORE_group_MAIN  = _score_from_preds(METRIC, y_te, y_pred_group_MAIN)
    SCORE_group_WG1D  = _score_from_preds(METRIC, y_te, y_pred_group_WG1D)
    SCORE_group_WG2D  = _score_from_preds(METRIC, y_te, y_pred_group_WG2D)
    WG_single         = min(_score_from_preds(METRIC, y_te[grp_te=="High"], y_pred_single[grp_te=="High"]) if np.any(grp_te=="High") else np.nan,
                            _score_from_preds(METRIC, y_te[grp_te=="Low"],  y_pred_single[grp_te=="Low"])  if np.any(grp_te=="Low")  else np.nan)

    # 保存（BA固定）
    rows.append({
        "fold_id": fold_id, "test_id": test_sid, "best_k": k_use,
        "tau_single": float(tau_single),
        "BA_single": float(SCORE_single), "WGBA_single": float(WG_single) if WG_single == WG_single else np.nan,
        "tau_high_BA": float(tauH_MAIN), "tau_low_BA": float(tauL_MAIN),
        "BA_group": float(SCORE_group_MAIN),
        "tau_high_WG": float(tauH_WG1D), "tau_low_WG": float(tauL_WG1D),
        "BA_group_WGopt": float(SCORE_group_WG1D),
        "tau_high_WG2D": float(tauH_WG2D), "tau_low_WG2D": float(tauL_WG2D),
        "BA_group_WG2Dopt": float(SCORE_group_WG2D),
        "AUC_test": np.nan, "n_test": int(te_mask.sum())
    })

    for yy, pp, gg_, ys, ym, yw1, yw2 in zip(y_te, proba_te, grp_te, y_pred_single, y_pred_group_MAIN, y_pred_group_WG1D, y_pred_group_WG2D):
        pred_rows.append({
            "fold_id": fold_id, "test_id": test_sid,
            "y_true": int(yy), "proba": float(pp), "group": str(gg_),
            "y_pred_single": int(ys),
            "y_pred_group_BA": int(ym),
            "y_pred_group_WG": int(yw1),
            "y_pred_group_WG2D": int(yw2),
        })

# ---------------- 保存（fold別 / 予測明細） ----------------
df_fold = pd.DataFrame(rows)
df_pred = pd.DataFrame(pred_rows)
df_fold.to_csv(outpath("GROUP_AWARE_THRESH_BY_FOLD.CSV"), index=False, encoding="utf-8-sig")
df_pred.to_csv(outpath("GROUP_AWARE_PREDICTIONS.CSV"), index=False, encoding="utf-8-sig")
print(f"[Cell8] saved -> {outpath('GROUP_AWARE_THRESH_BY_FOLD.CSV')} (rows={len(df_fold)})")
print(f"[Cell8] saved -> {outpath('GROUP_AWARE_PREDICTIONS.CSV')} (rows={len(df_pred)})")

# ---------------- プールAUC / BA（Single / Group / WG1D / WG2D） ----------------
y_pool = df_pred["y_true"].to_numpy()
s_pool = df_pred["proba"].to_numpy()
if len(np.unique(y_pool)) < 2:
    raise RuntimeError("[Cell8] プール真値が単一クラスで AUC 不可（データやFMS閾値定義を確認）")

auc_pool = float(roc_auc_score(y_pool, s_pool))

def _BA_from_cols(df, colname):
    yb = df["y_true"].to_numpy().astype(int)
    yh = df[colname].to_numpy().astype(int)
    return _score_from_preds(METRIC, yb, yh)

BA_single = _BA_from_cols(df_pred, "y_pred_single")
BA_gMain  = _BA_from_cols(df_pred, "y_pred_group_BA")
BA_wg1d   = _BA_from_cols(df_pred, "y_pred_group_WG")
BA_wg2d   = _BA_from_cols(df_pred, "y_pred_group_WG2D") if "y_pred_group_WG2D" in df_pred.columns else np.nan

summary = {
    "best_k": k_use,
    "AUC_pooled": auc_pool,
    "BA_pooled_single": float(BA_single),
    "BA_pooled_group_GLOBALopt": float(BA_gMain),
    "BA_pooled_group_WG1Dopt": float(BA_wg1d),
    "BA_pooled_group_WG2Dopt": float(BA_wg2d) if BA_wg2d==BA_wg2d else np.nan,
    "metric": METRIC_NAME,
    "n_folds": int(len(df_fold)),
    "search_mode": THRESH_SEARCH_MODE
}
pd.DataFrame([summary]).to_csv(outpath("LOSO_METRICS.CSV"), index=False, encoding="utf-8-sig")
print(f"[OK] Pooled AUC = {auc_pool:.3f}  (METRIC={METRIC_NAME}, SEARCH={THRESH_SEARCH_MODE}) -> {outpath('LOSO_METRICS.CSV')}")


[Cell8] outer 1: BA | Single τ=0.053 | Group-BA τ=(0.053,0.166) | WG-1D τ=(0.053,0.033) | WG-2D τ=(0.053,0.004)
[Cell8] outer 2: BA | Single τ=0.007 | Group-BA τ=(0.003,0.991) | WG-1D τ=(0.189,0.009) | WG-2D τ=(0.189,0.026)
[Cell8] outer 3: BA | Single τ=0.009 | Group-BA τ=(0.009,0.436) | WG-1D τ=(0.029,0.016) | WG-2D τ=(0.029,0.003)
[Cell8] outer 4: BA | Single τ=0.012 | Group-BA τ=(0.001,0.031) | WG-1D τ=(0.059,0.009) | WG-2D τ=(0.059,0.002)
[Cell8] outer 5: BA | Single τ=0.027 | Group-BA τ=(0.003,0.992) | WG-1D τ=(0.027,0.091) | WG-2D τ=(0.025,0.091)
[Cell8] outer 6: BA | Single τ=0.026 | Group-BA τ=(0.022,0.206) | WG-1D τ=(0.026,0.031) | WG-2D τ=(0.026,0.149)
[Cell8] outer 7: BA | Single τ=0.024 | Group-BA τ=(0.033,0.994) | WG-1D τ=(0.033,0.011) | WG-2D τ=(0.033,0.016)
[Cell8] outer 8: BA | Single τ=0.044 | Group-BA τ=(0.010,0.090) | WG-1D τ=(0.119,0.067) | WG-2D τ=(0.119,0.138)
[Cell8] outer 9: BA | Single τ=0.020 | Group-BA τ=(0.007,0.982) | WG-1D τ=(0.007,0.020) | WG-2D τ=(0.007

In [None]:
# ===== Cell 8b (NEW/REPLACE): OUTER-ONLY → post-hoc Exact thresholding（Single / Group-BA / WG-1D / WG-2D） =====
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.metrics import confusion_matrix, roc_auc_score

# ---------------- 基本設定 ----------------
METRIC = "ba"     # BA固定
METRIC_NAME = "BA"

# best_k の取得
k_use = int(BEST_K)

# MSSQ High/Low の取得
if not set(groups.unique()).issubset(set(SUBJECT_META.index)):
    missing_ids = sorted(set(groups.unique()) - set(SUBJECT_META.index))
    raise ValueError(f"[Cell8b] SUBJECT_META に無い subject_id: {missing_ids}")
fair_groups = groups.map(SUBJECT_META["MSSQ_group"]).astype(str)

# SHAPランキングから上位 k_use 特徴を抽出
rank_candidates = [outpath("SHAP_FEATURE_RANKING_LABELED.CSV"), outpath("SHAP_FEATURE_RANKING.CSV")]
rank_path = next((p for p in rank_candidates if os.path.exists(p)), None)
if rank_path is None:
    raise FileNotFoundError("[Cell8b] SHAP_FEATURE_RANKING(_LABELED).CSV が見つからない（Cell 4/6 を先に）")
rank_df = pd.read_csv(rank_path, encoding="utf-8-sig", index_col=0)
rank_col = "mean_abs" if "mean_abs" in rank_df.columns else ("mean_abs_shap" if "mean_abs_shap" in rank_df.columns else None)
if rank_col is None:
    raise KeyError("[Cell8b] ランキングCSVに mean_abs / mean_abs_shap 列が無い")
feature_order = [f for f in rank_df.sort_values(rank_col, ascending=False).index if f in X_scaled_all.columns]
feats_k = feature_order[:k_use]
X_k = X_scaled_all[feats_k].astype(np.float32)

# Series
y  = pd.Series(y_all.astype(int),  index=X_scaled_all.index)
g  = pd.Series(groups.astype(str), index=X_scaled_all.index)
fg = pd.Series(fair_groups.astype(str), index=X_scaled_all.index)

# --- 再利用ユーティリティ（Cell 8 と同一定義を前提） ---
def _cumulative_conf_table(scores: np.ndarray, labels: np.ndarray):
    order = np.argsort(-scores)
    s = np.asarray(scores, float)[order]
    yb = np.asarray(labels, int)[order]
    pos = (yb == 1).astype(int)
    neg = (yb == 0).astype(int)
    cpos = np.cumsum(pos)
    cneg = np.cumsum(neg)
    return s, yb, cpos, cneg, int(pos.sum()), int(neg.sum())

def _conf_vectors_from_candidates(tab, taus: np.ndarray):
    sort_scores, sort_labels, cpos, cneg, P, N = tab
    k = np.searchsorted(-sort_scores, -taus, side="right")
    TP = np.where(k > 0, cpos[k-1], 0); FP = np.where(k > 0, cneg[k-1], 0)
    FN = P - TP; TN = N - FP
    return TP, FP, FN, TN

def _make_candidates(scores: np.ndarray) -> np.ndarray:
    s = np.asarray(scores, float)
    if s.size == 0: return np.array([0.5], dtype=float)
    uniq = np.unique(s)
    hi = np.nextafter(float(uniq.max()), np.inf)
    lo = np.nextafter(float(uniq.min()), -np.inf)
    return np.concatenate([[hi], uniq[::-1], [lo]])

def _ba_from_conf(TP, FP, FN, TN):
    TPR = TP / (TP + FN) if (TP + FN) > 0 else 0.0
    TNR = TN / (TN + FP) if (TN + FP) > 0 else 0.0
    return 0.5 * (TPR + TNR)

def _f1_from_conf(TP, FP, FN):
    denom = (2*TP + FP + FN)
    return (2*TP / denom) if denom > 0 else 0.0

def _score_from_conf(metric: str, TP, FP, FN, TN):
    return _f1_from_conf(TP, FP, FN) if metric == "f1" else _ba_from_conf(TP, FP, FN, TN)

def _score_from_preds(metric: str, y_true_bin: np.ndarray, y_pred_bin: np.ndarray) -> float:
    from sklearn.metrics import confusion_matrix
    cm = confusion_matrix(y_true_bin, y_pred_bin, labels=[0,1])
    TN, FP, FN, TP = cm.ravel()
    return float(_score_from_conf(metric, TP, FP, FN, TN))

def _best_tau_single_exact(scores: np.ndarray, labels: np.ndarray, metric: str) -> float:
    tab = _cumulative_conf_table(np.asarray(scores,float), np.asarray(labels,int))
    taus = _make_candidates(scores)
    TP, FP, FN, TN = _conf_vectors_from_candidates(tab, taus)
    if metric == "f1":
        num = 2*TP; den = 2*TP + FP + FN
        sc = np.where(den > 0, num/den, 0.0)
    else:
        TPR = np.where((TP+FN) > 0, TP/(TP+FN), 0.0)
        TNR = np.where((TN+FP) > 0, TN/(TN+FP), 0.0)
        sc = 0.5*(TPR+TNR)
    tau_clip = np.clip(taus, 0.0, 1.0)
    best = np.argmax(sc)
    ties = np.where(np.isclose(sc, sc[best]))[0]
    if ties.size > 1:
        d = np.abs(tau_clip[ties] - 0.5)
        j = np.argmin(d); cand = ties[np.isclose(d, d[j])]
        best = cand[np.argmin(taus[cand])] if cand.size > 1 else cand[0]
    return float(taus[best])

def _grid_search_group_thresholds_exact_minimax(scores_H, labels_H, scores_L, labels_L, metric: str):
    sH = np.asarray(scores_H, float); yH = np.asarray(labels_H, int)
    sL = np.asarray(scores_L, float); yL = np.asarray(labels_L, int)
    tabH = _cumulative_conf_table(sH, yH)
    tabL = _cumulative_conf_table(sL, yL)
    tausH = _make_candidates(sH)
    tausL = _make_candidates(sL)
    TP_H, FP_H, FN_H, TN_H = _conf_vectors_from_candidates(tabH, tausH)
    TP_L, FP_L, FN_L, TN_L = _conf_vectors_from_candidates(tabL, tausL)
    if metric == "f1":
        numH, denH = 2*TP_H, 2*TP_H + FP_H + FN_H
        numL, denL = 2*TP_L, 2*TP_L + FP_L + FN_L
        S_H = np.where(denH > 0, numH/denH, 0.0)
        S_L = np.where(denL > 0, numL/denL, 0.0)
    else:
        TPR_H = np.where((TP_H+FN_H) > 0, TP_H/(TP_H+FN_H), 0.0)
        TNR_H = np.where((TN_H+FP_H) > 0, TN_H/(TN_H+FP_H), 0.0)
        TPR_L = np.where((TP_L+FN_L) > 0, TP_L/(TP_L+FN_L), 0.0)
        TNR_L = np.where((TN_L+FP_L) > 0, TN_L/(TN_L+FP_L), 0.0)
        S_H = 0.5*(TPR_H+TNR_H); S_L = 0.5*(TPR_L+TNR_L)
    M = np.minimum(S_H[:, None], S_L[None, :])
    iH, iL = np.unravel_index(np.argmax(M), M.shape)
    best_val = M[iH, iL]
    ties = np.argwhere(np.isclose(M, best_val))
    if ties.shape[0] > 1:
        gap = np.abs(S_H[ties[:,0]] - S_L[ties[:,1]])
        j = np.argmin(gap); cand = ties[np.isclose(gap, gap[j])]
        if cand.shape[0] > 1:
            avg_tau = (np.clip(tausH,0,1)[cand[:,0]] + np.clip(tausL,0,1)[cand[:,1]])/2
            j2 = np.argmin(np.abs(avg_tau - 0.5))
            cand2 = cand[np.isclose(np.abs(avg_tau-0.5), np.abs(avg_tau-0.5)[j2])]
            if cand2.shape[0] > 1:
                avg_tau2 = (tausH[cand2[:,0]] + tausL[cand2[:,1]])/2
                iH, iL = int(cand2[np.argmin(avg_tau2),0]), int(cand2[np.argmin(avg_tau2),1])
            else:
                iH, iL = int(cand2[0,0]), int(cand2[0,1])
        else:
            iH, iL = int(cand[0,0]), int(cand[0,1])
    tauH_best = float(tausH[iH]); tauL_best = float(tausL[iL])
    return {"WG2D": {"score": float(best_val), "tauH": tauH_best, "tauL": tauL_best}}

# ---------------- 外側LOSOのみ：予測を収集 ----------------
logo = LeaveOneGroupOut()
rows = []

for fold_id, (tr_idx, te_idx) in enumerate(logo.split(X_k, y, g), start=1):
    test_sids = pd.Index(g.iloc[te_idx]).unique().tolist()
    if len(test_sids) != 1:
        raise RuntimeError(f"[Cell8b] LOSO違反: fold={fold_id}, test={test_sids}")
    test_sid = test_sids[0]

    model_full = fit_xgb_classifier(X_k.iloc[tr_idx], y.iloc[tr_idx])
    proba_te = model_full.predict_proba(X_k.iloc[te_idx])[:, 1]
    y_te     = y.iloc[te_idx].to_numpy().astype(int)
    grp_te   = fg.iloc[te_idx].to_numpy()

    for yy, pp, gg_ in zip(y_te, proba_te, grp_te):
        rows.append({"fold_id": fold_id, "test_id": test_sid,
                     "y_true": int(yy), "proba": float(pp), "group": str(gg_)})

df_pred = pd.DataFrame(rows)
if df_pred.empty:
    raise RuntimeError("[Cell8b] 予測が空である")
df_pred.to_csv(outpath("PREDICTIONS_OUTERONLY.CSV"), index=False, encoding="utf-8-sig")
print(f"[Cell8b] collected predictions: n={len(df_pred)}, pos={(df_pred['y_true']==1).sum()}, neg={(df_pred['y_true']==0).sum()}")
print(f"[Cell8b] saved -> {outpath('PREDICTIONS_OUTERONLY.CSV')}")

# ---------------- 後処理：プール上で Exact しきい値最適化 ----------------
y_pool = df_pred["y_true"].to_numpy().astype(int)
s_pool = df_pred["proba"].to_numpy().astype(float)
g_pool = df_pred["group"].to_numpy().astype(str)
if len(np.unique(y_pool)) < 2:
    raise RuntimeError("[Cell8b] プール真値が単一クラスで AUC/最適化不可（データやFMS閾値定義を確認）")

# Single（1D）
tau_single = _best_tau_single_exact(s_pool, y_pool, METRIC)

# Group（High/Low 抽出）
maskH = (g_pool == "High"); maskL = (g_pool == "Low")
if not (maskH.any() and maskL.any()):
    raise RuntimeError("[Cell8b] High/Low のいずれかがプールに存在しない（Group系最適化不可）")
sH, yH = s_pool[maskH], y_pool[maskH]
sL, yL = s_pool[maskL], y_pool[maskL]

# Group-BA（2D 全体最適）
gopt_global = _grid_search_group_thresholds_exact_minimax(sH, yH, sL, yL, metric="ba")  # min-maxと同じ候補を流用可
tauH_MAIN, tauL_MAIN = _best_tau_single_exact(sH, yH, METRIC), _best_tau_single_exact(sL, yL, METRIC)  # 参考：群内1D
# ↑もし “全体BA最大化（2D合算）” を使いたい場合は Cell 8 と同様の _grid_search_group_thresholds_exact を別途呼んでもOK

# Worst-Group（1D×2 独立最適）
tauH_WG1D, tauL_WG1D = _best_tau_single_exact(sH, yH, METRIC), _best_tau_single_exact(sL, yL, METRIC)

# Worst-Group（2D min-max）
gopt_wg2d = _grid_search_group_thresholds_exact_minimax(sH, yH, sL, yL, metric=METRIC)["WG2D"]
tauH_WG2D, tauL_WG2D = gopt_wg2d["tauH"], gopt_wg2d["tauL"]

# ---------------- 二値化・メトリクス算出 ----------------
from sklearn.metrics import confusion_matrix
def _BA_from_preds(y_true_bin: np.ndarray, y_pred_bin: np.ndarray) -> float:
    cm = confusion_matrix(y_true_bin, y_pred_bin, labels=[0,1])
    TN, FP, FN, TP = cm.ravel()
    return _score_from_conf(METRIC, TP, FP, FN, TN)

y_pred_single     = (s_pool >= tau_single).astype(int)
y_pred_group_MAIN = (s_pool >= np.where(g_pool=="High", tauH_MAIN,  tauL_MAIN)).astype(int)
y_pred_group_WG1D = (s_pool >= np.where(g_pool=="High", tauH_WG1D, tauL_WG1D)).astype(int)
y_pred_group_WG2D = (s_pool >= np.where(g_pool=="High", tauH_WG2D, tauL_WG2D)).astype(int)

BA_single = _BA_from_preds(y_pool, y_pred_single)
BA_gMain  = _BA_from_preds(y_pool, y_pred_group_MAIN)
BA_wg1d   = _BA_from_preds(y_pool, y_pred_group_WG1D)
BA_wg2d   = _BA_from_preds(y_pool, y_pred_group_WG2D)
AUC_pool  = float(roc_auc_score(y_pool, s_pool))

# 予測列も保存
df_pred["y_pred_single"]     = y_pred_single
df_pred["y_pred_group_BA"]   = y_pred_group_MAIN
df_pred["y_pred_group_WG"]   = y_pred_group_WG1D
df_pred["y_pred_group_WG2D"] = y_pred_group_WG2D
df_pred.to_csv(outpath("PREDICTIONS_OUTERONLY.CSV"), index=False, encoding="utf-8-sig")
print(f"[Cell8b] preds updated -> {outpath('PREDICTIONS_OUTERONLY.CSV')}")

# しきい値とプール評価の保存
th_row = {
    "metric": METRIC_NAME, "search_mode": "exact_posthoc", "WG_mode": "min",
    "tau_single": float(tau_single),
    "tau_high_BA": float(tauH_MAIN),  "tau_low_BA":  float(tauL_MAIN),
    "tau_high_WG": float(tauH_WG1D),  "tau_low_WG":  float(tauL_WG1D),
    "tau_high_WG2D": float(tauH_WG2D), "tau_low_WG2D": float(tauL_WG2D),
}
pd.DataFrame([th_row]).to_csv(outpath("FINAL_THRESHOLDS_POSTHOC.CSV"), index=False, encoding="utf-8-sig")

summary = {
    "best_k": k_use,
    "AUC_pooled": AUC_pool,
    "BA_pooled_single": float(BA_single),
    "BA_pooled_group_GLOBALopt": float(BA_gMain),
    "BA_pooled_group_WG1Dopt": float(BA_wg1d),
    "BA_pooled_group_WG2Dopt": float(BA_wg2d),
    "metric": METRIC_NAME,
    "n_samples": int(len(df_pred)),
    "n_pos": int((df_pred["y_true"]==1).sum()),
    "n_neg": int((df_pred["y_true"]==0).sum()),
    "note": "post-hoc tuned on pooled CV predictions (outer-only)"
}
pd.DataFrame([summary]).to_csv(outpath("METRICS_POSTHOC.CSV"), index=False, encoding="utf-8-sig")

print(f"[Cell8b] τ_single={tau_single:.3f}  BA_single={BA_single:.3f}")
print(f"[Cell8b] (τ_H,τ_L)_WG1D=({tauH_WG1D:.3f},{tauL_WG1D:.3f})  WG1D-BA={BA_wg1d:.3f}")
print(f"[Cell8b] (τ_H,τ_L)_WG2D=({tauH_WG2D:.3f},{tauL_WG2D:.3f})  WG2D-BA={BA_wg2d:.3f}")
print(f"[OK] saved -> {outpath('FINAL_THRESHOLDS_POSTHOC.CSV')} / {outpath('METRICS_POSTHOC.CSV')}")


[Cell8b] collected predictions: n=340, pos=101, neg=239
[Cell8b] saved -> C:\Users\taiki\OneDrive - Science Tokyo\デスクトップ\研究\本実験結果\ANALYSIS\機械学習(MSSQ込み)\閾値FMS1\PREDICTIONS_OUTERONLY.CSV
[Cell8b] preds updated -> C:\Users\taiki\OneDrive - Science Tokyo\デスクトップ\研究\本実験結果\ANALYSIS\機械学習(MSSQ込み)\閾値FMS1\PREDICTIONS_OUTERONLY.CSV
[Cell8b] τ_single=0.019  BA_single=0.731
[Cell8b] (τ_H,τ_L)_WG1D=(0.019,0.022)  WG1D-BA=0.735
[Cell8b] (τ_H,τ_L)_WG2D=(0.019,0.004)  WG2D-BA=0.710
[OK] saved -> C:\Users\taiki\OneDrive - Science Tokyo\デスクトップ\研究\本実験結果\ANALYSIS\機械学習(MSSQ込み)\閾値FMS1\FINAL_THRESHOLDS_POSTHOC.CSV / C:\Users\taiki\OneDrive - Science Tokyo\デスクトップ\研究\本実験結果\ANALYSIS\機械学習(MSSQ込み)\閾値FMS1\METRICS_POSTHOC.CSV


In [None]:
# ===== Cell 8g (BA-only): 群別ベストk × inner-LOSOでSingle(属性無視) τ最適化 → outer適用・集計・作図 =====
import os, json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.metrics import confusion_matrix, roc_auc_score

# ---------------- 基本セットアップ ----------------
METRIC_NAME = "BA"  # BA固定
SEARCH_MODE = str(globals().get("THRESH_SEARCH_MODE", "exact")).lower()
COARSE_STEP = float(globals().get("THRESH_COARSE_STEP", 0.01))
FINE_STEP   = float(globals().get("THRESH_FINE_STEP",   0.001))
MARGIN      = float(globals().get("THRESH_MARGIN",      0.03))

# MSSQ High/Low
if not set(groups.unique()).issubset(set(SUBJECT_META.index)):
    missing_ids = sorted(set(groups.unique()) - set(SUBJECT_META.index))
    raise ValueError(f"[Cell8g] SUBJECT_META に無い subject_id: {missing_ids}")
fair_groups = groups.map(SUBJECT_META["MSSQ_group"]).astype(str)

# 群別 best_k の読み込み
bestk_json = outpath("BEST_K_BY_GROUP.JSON")
if os.path.exists(bestk_json):
    with open(bestk_json, "r", encoding="utf-8") as f:
        _jk = json.load(f)
    BEST_K_HIGH = int(_jk.get("BEST_K_HIGH", int(globals().get("BEST_K", 10))))
    BEST_K_LOW  = int(_jk.get("BEST_K_LOW",  int(globals().get("BEST_K", 10))))
else:
    k_fallback = int(globals().get("BEST_K", 10))
    BEST_K_HIGH = k_fallback; BEST_K_LOW = k_fallback
    print(f"[Cell8g][WARN] BEST_K_BY_GROUP.JSON が見つからないため共通BEST_K={k_fallback}を両群で使用")

# SHAPランキング読み込み
rank_candidates = [outpath("SHAP_FEATURE_RANKING_LABELED.CSV"), outpath("SHAP_FEATURE_RANKING.CSV")]
rank_path = next((p for p in rank_candidates if os.path.exists(p)), None)
if rank_path is None:
    raise FileNotFoundError("[Cell8g] SHAP_FEATURE_RANKING(_LABELED).CSV が見つからない（Cell 4/6 を先に）")
rank_df = pd.read_csv(rank_path, encoding="utf-8-sig", index_col=0)
rank_col = "mean_abs" if "mean_abs" in rank_df.columns else ("mean_abs_shap" if "mean_abs_shap" in rank_df.columns else None)
if rank_col is None:
    raise KeyError("[Cell8g] ランキングCSVに mean_abs / mean_abs_shap 列が無い")
feature_order = [f for f in rank_df.sort_values(rank_col, ascending=False).index if f in X_scaled_all.columns]
if len(feature_order) == 0:
    raise RuntimeError("[Cell8g] ランキングの特徴が X_scaled_all に存在しません")

def _feats_for_k(k: int):
    kk = max(1, min(int(k), len(feature_order)))
    return feature_order[:kk]

# --- BAユーティリティ ---
def _cumulative_conf_table(scores: np.ndarray, labels: np.ndarray):
    order = np.argsort(-scores)
    s = np.asarray(scores, float)[order]
    yb = np.asarray(labels, int)[order]
    pos = (yb == 1).astype(int)
    neg = (yb == 0).astype(int)
    cpos = np.cumsum(pos)
    cneg = np.cumsum(neg)
    return s, yb, cpos, cneg, int(pos.sum()), int(neg.sum())

def _conf_from_threshold(sort_scores, sort_labels, cpos, cneg, P, N, tau: float):
    k = int(np.searchsorted(-sort_scores, -tau, side="right"))  # s >= tau を陽性
    TP = int(cpos[k-1]) if k > 0 else 0
    FP = int(cneg[k-1]) if k > 0 else 0
    FN = int(P - TP)
    TN = int(N - FP)
    return TP, FP, FN, TN

def _ba_from_conf(TP, FP, FN, TN):
    TPR = TP / (TP + FN) if (TP + FN) > 0 else 0.0
    TNR = TN / (TN + FP) if (TN + FP) > 0 else 0.0
    return 0.5 * (TPR + TNR)

def _BA_from_preds(y_true_bin: np.ndarray, y_pred_bin: np.ndarray) -> float:
    cm = confusion_matrix(y_true_bin, y_pred_bin, labels=[0,1])
    TN, FP, FN, TP = cm.ravel()
    return float(_ba_from_conf(TP, FP, FN, TN))

def _make_candidates(scores: np.ndarray) -> np.ndarray:
    s = np.asarray(scores, float)
    if s.size == 0:
        return np.array([0.5], dtype=float)
    uniq = np.unique(s)
    hi = np.nextafter(float(uniq.max()), np.inf)
    lo = np.nextafter(float(uniq.min()), -np.inf)
    return np.concatenate([[hi], uniq[::-1], [lo]])

def _conf_vectors_from_candidates(tab, taus: np.ndarray):
    sort_scores, sort_labels, cpos, cneg, P, N = tab
    k = np.searchsorted(-sort_scores, -taus, side="right")
    TP = np.where(k > 0, cpos[k-1], 0)
    FP = np.where(k > 0, cneg[k-1], 0)
    FN = P - TP
    TN = N - FP
    return TP, FP, FN, TN

def _best_tau_single_exact(scores: np.ndarray, labels: np.ndarray) -> float:
    tab = _cumulative_conf_table(np.asarray(scores,float), np.asarray(labels,int))
    taus = _make_candidates(scores)
    TP, FP, FN, TN = _conf_vectors_from_candidates(tab, taus)
    TPR = np.where((TP+FN) > 0, TP/(TP+FN), 0.0)
    TNR = np.where((TN+FP) > 0, TN/(TN+FP), 0.0)
    S = 0.5*(TPR+TNR)
    # tie-break: 0.5に近い→さらに小さいτを優先
    tau_clip = np.clip(taus, 0.0, 1.0)
    j = np.argmax(S)
    ties = np.where(np.isclose(S, S[j]))[0]
    if ties.size > 1:
        d = np.abs(tau_clip[ties] - 0.5)
        j2 = np.argmin(d); cand = ties[np.isclose(d, d[j2])]
        j = cand[np.argmin(taus[cand])] if cand.size > 1 else cand[0]
    return float(taus[j])

def _best_tau_single_grid(scores: np.ndarray, labels: np.ndarray) -> float:
    tab = _cumulative_conf_table(scores, labels)
    def score_at(t):
        TP, FP, FN, TN = _conf_from_threshold(*tab, t)
        return _ba_from_conf(TP, FP, FN, TN)
    best = (-1.0, 0.5)
    for t in np.arange(0.0, 1.0 + 1e-12, COARSE_STEP):
        sc = score_at(t)
        if sc > best[0]:
            best = (float(sc), float(t))
    t0 = best[1]; lo = max(0.0, t0 - MARGIN); hi = min(1.0, t0 + MARGIN)
    score0, t_best = best[0], t0
    for t in np.arange(lo, hi + 1e-12, FINE_STEP):
        sc = score_at(t)
        if sc > score0 or (np.isclose(sc, score0) and (abs(t-0.5) < abs(t_best-0.5) or (abs(t-0.5) == abs(t_best-0.5) and t < t_best))):
            score0, t_best = float(sc), float(t)
    return float(t_best)

def _pick_best_tau(scores, labels) -> float:
    if len(scores) == 0 or len(np.unique(labels)) < 2:
        print("[Cell8g][WARN] OOFが空または単一クラスのため τ=0.5 にフォールバック")
        return 0.5
    if SEARCH_MODE == "exact":
        return _best_tau_single_exact(scores, labels)
    else:
        return _best_tau_single_grid(scores, labels)

# --- データ成形 ---
X_index = X_scaled_all.index
y = pd.Series(y_all.astype(int), index=X_index)
g = pd.Series(groups.astype(str), index=X_index)
fg = pd.Series(fair_groups.astype(str), index=X_index)

# --- outer-LOSO 本体（Single τ × 群別k） ---
logo = LeaveOneGroupOut()
rows, pred_rows = [], []

for fold_id, (tr_idx, te_idx) in enumerate(logo.split(X_scaled_all, y, g), start=1):
    te_sid = pd.Index(g.iloc[te_idx]).unique().tolist()
    if len(te_sid) != 1:
        raise RuntimeError(f"[Cell8g] LOSO違反: fold={fold_id}, test={te_sid}")
    te_sid = te_sid[0]
    te_tag = str(SUBJECT_META.loc[te_sid, "MSSQ_group"])
    k_use = BEST_K_HIGH if te_tag == "High" else BEST_K_LOW

    feats_k = _feats_for_k(k_use)
    X_k = X_scaled_all[feats_k].astype(np.float32)
    train_sids = pd.Index(g.iloc[tr_idx]).unique().tolist()

    # ---- inner-LOSO: OOF作成（属性無視 Single τ 用） ----
    oof_index = X_k.index[g.isin(train_sids)]
    oof_proba = pd.Series(index=oof_index, dtype=float)
    oof_true  = pd.Series(index=oof_index, dtype=int)

    for val_sid in train_sids:
        val_mask   = (g == val_sid)
        train_mask = g.isin([sid for sid in train_sids if sid != val_sid])
        if train_mask.sum() == 0 or val_mask.sum() == 0:
            print(f"[Cell8g][WARN] inner-LOSO 空fold（val={val_sid}) をskip")
            continue
        model_inner = fit_xgb_classifier(X_k.loc[train_mask], y.loc[train_mask])
        proba_val = model_inner.predict_proba(X_k.loc[val_mask])[:, 1]
        idx = val_mask[val_mask].index
        oof_proba.loc[idx] = proba_val
        oof_true.loc[idx]  = y.loc[val_mask].values

    valid_idx = oof_proba.dropna().index
    tau_single = _pick_best_tau(oof_proba.loc[valid_idx].to_numpy(),
                                oof_true.loc[valid_idx].to_numpy().astype(int))

    # ---- outer適用 ----
    model_full = fit_xgb_classifier(X_k.loc[g.isin(train_sids)], y.loc[g.isin(train_sids)])
    te_mask  = (g == te_sid)
    proba_te = model_full.predict_proba(X_k.loc[te_mask])[:, 1]
    y_te     = y.loc[te_mask].to_numpy().astype(int)
    grp_te   = fg.loc[te_mask].to_numpy()

    y_pred_single = (proba_te >= tau_single).astype(int)
    SCORE_single  = _BA_from_preds(y_te, y_pred_single)

    rows.append({
        "fold_id": fold_id, "test_id": te_sid, "group": te_tag,
        "k_use": int(k_use), "tau_single": float(tau_single),
        "BA_single": float(SCORE_single),
        "AUC_test": np.nan, "n_test": int(te_mask.sum())
    })
    for yy, pp, gg_, ys in zip(y_te, proba_te, grp_te, y_pred_single):
        pred_rows.append({
            "fold_id": fold_id, "test_id": te_sid, "group": str(gg_),
            "y_true": int(yy), "proba": float(pp), "y_pred_single": int(ys),
            "k_use": int(k_use)
        })

# --- 保存（fold別 / 予測明細 / 集計） ---
df_fold = pd.DataFrame(rows)
df_pred = pd.DataFrame(pred_rows)
df_fold.to_csv(outpath("SINGLE_THRESH_BY_GROUPK_FOLD.CSV"), index=False, encoding="utf-8-sig")
df_pred.to_csv(outpath("PREDICTIONS_SINGLE_BY_GROUPK.CSV"), index=False, encoding="utf-8-sig")
print(f"[Cell8g] saved -> {outpath('SINGLE_THRESH_BY_GROUPK_FOLD.CSV')} (rows={len(df_fold)})")
print(f"[Cell8g] saved -> {outpath('PREDICTIONS_SINGLE_BY_GROUPK.CSV')} (rows={len(df_pred)})")

# pooled AUC と pooled BA（全体／群別）
y_pool = df_pred["y_true"].to_numpy()
s_pool = df_pred["proba"].to_numpy()
if len(np.unique(y_pool)) < 2:
    raise RuntimeError("[Cell8g] プール真値が単一クラスで AUC 不可（データやFMS閾値定義を確認）")
auc_pool = float(roc_auc_score(y_pool, s_pool))

def _BA_from_cols(y_true, y_hat):
    return _BA_from_preds(y_true.astype(int), y_hat.astype(int))

BA_all = _BA_from_cols(df_pred["y_true"].to_numpy(), df_pred["y_pred_single"].to_numpy())
BA_hi  = _BA_from_cols(df_pred.loc[df_pred["group"]=="High","y_true"].to_numpy(),
                       df_pred.loc[df_pred["group"]=="High","y_pred_single"].to_numpy()) if (df_pred["group"]=="High").any() else np.nan
BA_lo  = _BA_from_cols(df_pred.loc[df_pred["group"]=="Low","y_true"].to_numpy(),
                       df_pred.loc[df_pred["group"]=="Low","y_pred_single"].to_numpy()) if (df_pred["group"]=="Low").any() else np.nan

summary = {
    "metric": METRIC_NAME, "search_mode": SEARCH_MODE,
    "BEST_K_HIGH": int(BEST_K_HIGH), "BEST_K_LOW": int(BEST_K_LOW),
    "AUC_pooled": auc_pool,
    "BA_pooled_all": float(BA_all),
    "BA_pooled_high": float(BA_hi) if BA_hi==BA_hi else np.nan,
    "BA_pooled_low":  float(BA_lo) if BA_lo==BA_lo else np.nan,
    "n_folds": int(len(df_fold)), "n_pred_rows": int(len(df_pred))
}
pd.DataFrame([summary]).to_csv(outpath("METRICS_SINGLE_BY_GROUPK.CSV"), index=False, encoding="utf-8-sig")
print(f"[OK] pooled AUC={auc_pool:.3f}, BA(all)={BA_all:.3f} -> {outpath('METRICS_SINGLE_BY_GROUPK.CSV')}")

# --- 参考作図：全体/High/Low の混同行列PNG（タイトルにτ統計） ---
plt.rcParams.update({
    "figure.dpi": 120, "savefig.dpi": 300,
    "lines.linewidth": 1.5,
    "axes.titlesize": 30, "axes.labelsize": 24,
    "legend.fontsize": 20, "xtick.labelsize": 20, "ytick.labelsize": 20,
})

def _cm(y_true, y_pred): return confusion_matrix(y_true, y_pred, labels=[0,1])

def _draw_cm(ax, cm, title):
    TN, FP, FN, TP = cm.ravel()
    mat = np.array([[TN, FP],[FN, TP]], dtype=int)
    vmax = max(mat.max(), 1)
    ax.imshow(mat, cmap="Blues", vmin=0, vmax=vmax)
    row_sums = mat.sum(axis=1, keepdims=True)
    pct = np.divide(mat, np.where(row_sums==0, 1, row_sums), where=(row_sums!=0)) * 100.0
    name = np.array([["TN","FP"],["FN","TP"]])
    for i in range(2):
        for j in range(2):
            val = mat[i,j]; prc = pct[i,j]
            color = "white" if val > 0.6*vmax else "black"
            ax.text(j, i, f"{name[i,j]} {val}\n({prc:.1f}%)", ha="center", va="center",
                    fontsize=26, fontweight="bold", color=color)
    ax.set_xticks([0,1]); ax.set_xticklabels(["Pred: Non-Sick","Pred: Sick"])
    ax.set_yticks([0,1]); ax.set_yticklabels(["True: Non-Sick","True: Sick"], rotation=90, va="center")
    ax.set_title(title, pad=10); ax.grid(False)

# τ μ±SD
def _mu_sd(sr):
    sr = pd.to_numeric(sr, errors="coerce").dropna()
    return (float(sr.mean()), float(sr.std(ddof=1)) if len(sr)>1 else 0.0) if len(sr)>0 else (np.nan, np.nan)
tau_mu, tau_sd = _mu_sd(df_fold["tau_single"]) if "tau_single" in df_fold.columns else (np.nan, np.nan)

# 全体
cm_all = _cm(df_pred["y_true"].to_numpy(), df_pred["y_pred_single"].to_numpy())
fig, ax = plt.subplots(1,1, figsize=(9.5,9))
_draw_cm(ax, cm_all, f"Single τ (All) — {METRIC_NAME}={BA_all:.3f}\nτ={tau_mu:.3f}±{tau_sd:.3f}")
plt.savefig(outpath("CONFMAT_SINGLE_GROUPK_ALL.png"), dpi=300, bbox_inches="tight"); plt.close()
print(f"[Cell8g] plot -> {outpath('CONFMAT_SINGLE_GROUPK_ALL.png')}")

# High
if (df_pred["group"]=="High").any():
    cm_hi = _cm(df_pred.loc[df_pred["group"]=="High","y_true"].to_numpy(),
                df_pred.loc[df_pred["group"]=="High","y_pred_single"].to_numpy())
    fig, ax = plt.subplots(1,1, figsize=(9.5,9))
    _draw_cm(ax, cm_hi, f"Single τ (High) — {METRIC_NAME}={BA_hi:.3f}")
    plt.savefig(outpath("CONFMAT_SINGLE_GROUPK_HIGH.png"), dpi=300, bbox_inches="tight"); plt.close()
    print(f"[Cell8g] plot -> {outpath('CONFMAT_SINGLE_GROUPK_HIGH.png')}")

# Low
if (df_pred["group"]=="Low").any():
    cm_lo = _cm(df_pred.loc[df_pred["group"]=="Low","y_true"].to_numpy(),
                df_pred.loc[df_pred["group"]=="Low","y_pred_single"].to_numpy())
    fig, ax = plt.subplots(1,1, figsize=(9.5,9))
    _draw_cm(ax, cm_lo, f"Single τ (Low) — {METRIC_NAME}={BA_lo:.3f}")
    plt.savefig(outpath("CONFMAT_SINGLE_GROUPK_LOW.png"), dpi=300, bbox_inches="tight"); plt.close()
    print(f"[Cell8g] plot -> {outpath('CONFMAT_SINGLE_GROUPK_LOW.png')}")

print(f"[DONE] METRIC={METRIC_NAME}, SEARCH={SEARCH_MODE}, BEST_K_HIGH={BEST_K_HIGH}, BEST_K_LOW={BEST_K_LOW}")


[Cell8g] saved -> C:\Users\taiki\OneDrive - Science Tokyo\デスクトップ\研究\本実験結果\ANALYSIS\機械学習(MSSQ込み)\閾値FMS1\SINGLE_THRESH_BY_GROUPK_FOLD.CSV (rows=17)
[Cell8g] saved -> C:\Users\taiki\OneDrive - Science Tokyo\デスクトップ\研究\本実験結果\ANALYSIS\機械学習(MSSQ込み)\閾値FMS1\PREDICTIONS_SINGLE_BY_GROUPK.CSV (rows=340)
[OK] pooled AUC=0.630, BA(all)=0.675 -> C:\Users\taiki\OneDrive - Science Tokyo\デスクトップ\研究\本実験結果\ANALYSIS\機械学習(MSSQ込み)\閾値FMS1\METRICS_SINGLE_BY_GROUPK.CSV
[Cell8g] plot -> C:\Users\taiki\OneDrive - Science Tokyo\デスクトップ\研究\本実験結果\ANALYSIS\機械学習(MSSQ込み)\閾値FMS1\CONFMAT_SINGLE_GROUPK_ALL.png
[Cell8g] plot -> C:\Users\taiki\OneDrive - Science Tokyo\デスクトップ\研究\本実験結果\ANALYSIS\機械学習(MSSQ込み)\閾値FMS1\CONFMAT_SINGLE_GROUPK_HIGH.png
[Cell8g] plot -> C:\Users\taiki\OneDrive - Science Tokyo\デスクトップ\研究\本実験結果\ANALYSIS\機械学習(MSSQ込み)\閾値FMS1\CONFMAT_SINGLE_GROUPK_LOW.png
[DONE] METRIC=BA, SEARCH=exact, BEST_K_HIGH=30, BEST_K_LOW=3


In [None]:
# ===== Cell 8.7 (BA-only): Confusion matrices — Single(属性無視) vs Group-GLOBAL vs Group-WG(2D優先) =====
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, roc_auc_score

METRIC_NAME = "BA"  # BA固定

# 入力ファイル
fold_path = outpath("GROUP_AWARE_THRESH_BY_FOLD.CSV")
pred_path = outpath("GROUP_AWARE_PREDICTIONS.CSV")
if not (os.path.exists(fold_path) and os.path.exists(pred_path)):
    raise FileNotFoundError("[Cell8.7] 必要CSVが見つからない（Cell 8 を先に実行）")

df_fold = pd.read_csv(fold_path, encoding="utf-8-sig")
pred    = pd.read_csv(pred_path, encoding="utf-8-sig")

# ---------- BAユーティリティ ----------
def _ba_from_conf(TP, FP, FN, TN):
    TPR = TP / (TP + FN) if (TP + FN) > 0 else 0.0
    TNR = TN / (TN + FP) if (TN + FP) > 0 else 0.0
    return 0.5 * (TPR + TNR)

def _BA_from_preds(y_true_bin: np.ndarray, y_pred_bin: np.ndarray) -> float:
    cm = confusion_matrix(y_true_bin, y_pred_bin, labels=[0,1])
    TN, FP, FN, TP = cm.ravel()
    return float(_ba_from_conf(TP, FP, FN, TN))

def _cm(y_true, y_pred):
    return confusion_matrix(y_true, y_pred, labels=[0,1])

def _mu_sd(sr):
    sr = pd.to_numeric(sr, errors="coerce").dropna()
    if len(sr) == 0: return (np.nan, np.nan)
    return float(sr.mean()), (float(sr.std(ddof=1)) if len(sr) > 1 else 0.0)

# ---------- 予測列の再構築（保険） ----------
# Single（foldごとの τ_single を使用）
if "y_pred_single" not in pred.columns and "tau_single" in df_fold.columns:
    pred = pred.merge(df_fold[["fold_id","tau_single"]], on="fold_id", how="left")
    if pred["tau_single"].isna().any():
        raise RuntimeError("[Cell8.7] tau_single の再構築に失敗（fold対応ズレ）")
    pred["y_pred_single"] = (pred["proba"].astype(float) >= pred["tau_single"].astype(float)).astype(int)
    pred.drop(columns=["tau_single"], inplace=True)

# Group-GLOBAL（BA最適）を再構築
need_global = {"y_pred_group_BA"}
if not need_global.issubset(pred.columns):
    cols = ["fold_id","tau_high_BA","tau_low_BA"]
    if not set(cols).issubset(df_fold.columns):
        raise RuntimeError("[Cell8.7] BA用（GLOBAL）のしきい値列が見つからない")
    pred = pred.merge(df_fold[cols], on="fold_id", how="left")
    th_main = np.where(pred["group"].astype(str)=="High", pred["tau_high_BA"], pred["tau_low_BA"]).astype(float)
    pred["y_pred_group_BA"] = (pred["proba"].astype(float) >= th_main).astype(int)
    pred.drop(columns=["tau_high_BA","tau_low_BA"], inplace=True)

# Group-WG（2Dがあれば優先、無ければ1D）
wg_pred_col, wg_tauH_col, wg_tauL_col = None, None, None

if "y_pred_group_WG2D" in pred.columns:
    wg_pred_col = "y_pred_group_WG2D"
    wg_tauH_col, wg_tauL_col = "tau_high_WG2D", "tau_low_WG2D"
    if not {wg_tauH_col, wg_tauL_col}.issubset(df_fold.columns):
        # 予測列はあるが τ 列が fold ファイルに無い場合の保険
        wg_tauH_col, wg_tauL_col = None, None
elif "y_pred_group_WG" in pred.columns:
    wg_pred_col = "y_pred_group_WG"
    wg_tauH_col, wg_tauL_col = "tau_high_WG", "tau_low_WG"
else:
    # 列が無ければ再構築を試みる（2D優先）
    if {"tau_high_WG2D","tau_low_WG2D"}.issubset(df_fold.columns):
        pred = pred.merge(df_fold[["fold_id","tau_high_WG2D","tau_low_WG2D"]], on="fold_id", how="left")
        th_wg = np.where(pred["group"].astype(str)=="High", pred["tau_high_WG2D"], pred["tau_low_WG2D"]).astype(float)
        pred["y_pred_group_WG2D"] = (pred["proba"].astype(float) >= th_wg).astype(int)
        wg_pred_col, wg_tauH_col, wg_tauL_col = "y_pred_group_WG2D", "tau_high_WG2D", "tau_low_WG2D"
        pred.drop(columns=["tau_high_WG2D","tau_low_WG2D"], inplace=True)
    elif {"tau_high_WG","tau_low_WG"}.issubset(df_fold.columns):
        pred = pred.merge(df_fold[["fold_id","tau_high_WG","tau_low_WG"]], on="fold_id", how="left")
        th_wg = np.where(pred["group"].astype(str)=="High", pred["tau_high_WG"], pred["tau_low_WG"]).astype(float)
        pred["y_pred_group_WG"] = (pred["proba"].astype(float) >= th_wg).astype(int)
        wg_pred_col, wg_tauH_col, wg_tauL_col = "y_pred_group_WG", "tau_high_WG", "tau_low_WG"
        pred.drop(columns=["tau_high_WG","tau_low_WG"], inplace=True)
    else:
        raise RuntimeError("[Cell8.7] WG系の列が見つからない（WG2D/WG1Dのτ列どちらも無し）")

# ---------- データ抽出 ----------
y = pred["y_true"].astype(int).to_numpy()
p = pred["proba"].astype(float).to_numpy()
if len(np.unique(y)) < 2:
    raise RuntimeError("[Cell8.7] プール真値が単一クラスで AUC/混同行列不可（データやFMS閾値定義を確認）")

yS = pred["y_pred_single"].astype(int).to_numpy()
yG = pred["y_pred_group_BA"].astype(int).to_numpy()
yW = pred[wg_pred_col].astype(int).to_numpy()

cmS = _cm(y, yS); cmG = _cm(y, yG); cmW = _cm(y, yW)
scoreS = _BA_from_preds(y, yS)
scoreG = _BA_from_preds(y, yG)
scoreW = _BA_from_preds(y, yW)

# 表示用：AUC と τ の μ±SD
auc_pooled = float(roc_auc_score(y, p))
k_disp = int(df_fold["best_k"].iloc[0]) if "best_k" in df_fold.columns and not pd.isna(df_fold["best_k"].iloc[0]) else np.nan

tauS_mu, tauS_sd = _mu_sd(df_fold["tau_single"]) if "tau_single" in df_fold.columns else (np.nan, np.nan)
tauHB_mu, tauHB_sd = _mu_sd(df_fold.get("tau_high_BA", pd.Series(dtype=float)))
tauLB_mu, tauLB_sd = _mu_sd(df_fold.get("tau_low_BA",  pd.Series(dtype=float)))

# WGは2D/1Dに応じて列を選択
if wg_tauH_col is not None and wg_tauH_col in df_fold.columns:
    tauHW_mu, tauHW_sd = _mu_sd(df_fold.get(wg_tauH_col, pd.Series(dtype=float)))
    tauLW_mu, tauLW_sd = _mu_sd(df_fold.get(wg_tauL_col, pd.Series(dtype=float)))
else:
    tauHW_mu, tauHW_sd = (np.nan, np.nan)
    tauLW_mu, tauLW_sd = (np.nan, np.nan)

# ---------- 描画ヘルパ ----------
plt.rcParams.update({
    "figure.dpi": 120, "savefig.dpi": 300,
    "lines.linewidth": 1.5,
    "axes.titlesize": 30, "axes.labelsize": 24,
    "legend.fontsize": 20, "xtick.labelsize": 20, "ytick.labelsize": 20,
})

def _draw(ax, cm, title):
    TN, FP, FN, TP = cm.ravel()
    mat  = np.array([[TN, FP],[FN, TP]], dtype=int)
    name = np.array([["TN","FP"],["FN","TP"]])

    vmax = max(mat.max(), 1)
    ax.imshow(mat, cmap="Blues", vmin=0, vmax=vmax)

    row_sums = mat.sum(axis=1, keepdims=True)
    pct = np.divide(mat, np.where(row_sums==0, 1, row_sums),
                    where=(row_sums!=0)) * 100.0

    for i in range(2):
        for j in range(2):
            val = mat[i, j]; prc = pct[i, j]
            color = "white" if val > 0.6 * vmax else "black"
            ax.text(j, i, f"{name[i,j]} {val}\n({prc:.1f}%)",
                    ha="center", va="center",
                    fontsize=26, fontweight="bold", color=color)

    ax.set_xticks([0,1]); ax.set_xticklabels(["Pred: Non-Sick","Pred: Sick"])
    ax.set_yticks([0,1]); ax.set_yticklabels(["True: Non-Sick","True: Sick"], rotation=90, va="center")
    ax.set_title(title, pad=10)
    ax.set_xlabel(""); ax.set_ylabel("")
    ax.grid(False)

# ---------- タイトル文言 ----------
wg_label = "WG-BA-opt (2D)" if wg_pred_col.endswith("WG2D") else "WG-BA-opt (1D)"
title_single = f"Single τ  ({METRIC_NAME}={scoreS:.3f})\nτ={tauS_mu:.3f}±{tauS_sd:.3f}" if not np.isnan(tauS_mu) else f"Single τ  ({METRIC_NAME}={scoreS:.3f})"
title_gMAIN  = f"Group τ (BA-opt)  ({METRIC_NAME}={scoreG:.3f})\nτ_H={tauHB_mu:.3f}±{tauHB_sd:.3f}, τ_L={tauLB_mu:.3f}±{tauLB_sd:.3f}"
title_gWG    = f"Group τ ({wg_label})  ({METRIC_NAME}={scoreW:.3f})"
if not np.isnan(tauHW_mu) and not np.isnan(tauLW_mu):
    title_gWG += f"\nτ_H={tauHW_mu:.3f}±{tauHW_sd:.3f}, τ_L={tauLW_mu:.3f}±{tauLW_sd:.3f}"

# ---------- 作図 ----------
fig, axes = plt.subplots(1, 3, figsize=(24, 9), constrained_layout=True)
_draw(axes[0], cmS, title_single)
_draw(axes[1], cmG, title_gMAIN)
_draw(axes[2], cmW, title_gWG)

supt = f"Confusion matrices  (k={k_disp if k_disp==k_disp else 'NA'}, ROC AUC={auc_pooled:.3f}, metric={METRIC_NAME})"
fig.suptitle(supt, fontsize=32, y=1.10)  # タイトルを少し上に
fig.set_constrained_layout_pads(w_pad=0.02, h_pad=0.02, wspace=0.28, hspace=0.02)

out_png = outpath("CONFMAT_SINGLE_vs_GROUP.png")  # 既存名を維持
plt.savefig(out_png, dpi=300, bbox_inches="tight")
plt.close()
print(f"[Cell8.7] plot -> {out_png}  ({wg_label})")


[Cell8.7] plot -> C:\Users\taiki\OneDrive - Science Tokyo\デスクトップ\研究\本実験結果\ANALYSIS\機械学習(MSSQ込み)\閾値FMS1\CONFMAT_SINGLE_vs_GROUP.png  (WG-BA-opt (2D))


In [None]:
# ===== Cell 8.7c (NEW): 5分監査 — 即チェック（assert/TPR-FPR比較/直観図） =====
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, roc_auc_score, roc_curve, auc

METRIC_NAME = "BA"  # 8.7(BA-only)と整合

# ---- 直前セル(8.7: BA-only)が作った df_fold/pred を再読込（安全のため）----
fold_path = outpath("GROUP_AWARE_THRESH_BY_FOLD.CSV")
pred_path = outpath("GROUP_AWARE_PREDICTIONS.CSV")
if not (os.path.exists(fold_path) and os.path.exists(pred_path)):
    raise FileNotFoundError("[Cell8.7c] 必要CSVが見つからない（Cell 8 を先に実行）")

df_fold = pd.read_csv(fold_path, encoding="utf-8-sig")
pred    = pd.read_csv(pred_path,  encoding="utf-8-sig")

# ---- y_pred_* 列が無い場合は8.7と同様に保険で再構築 ----
need_cols = {"y_pred_single","y_pred_group_BA","y_pred_group_WG"}
rebuild_msgs = []

# Single
if "y_pred_single" not in pred.columns and "tau_single" in df_fold.columns:
    pred = pred.merge(df_fold[["fold_id","tau_single"]], on="fold_id", how="left")
    if pred["tau_single"].isna().any():
        raise RuntimeError("[Cell8.7c] tau_single の再構築に失敗（fold対応ズレ）")
    pred["y_pred_single"] = (pred["proba"].astype(float) >= pred["tau_single"].astype(float)).astype(int)
    pred.drop(columns=["tau_single"], inplace=True)
    rebuild_msgs.append("rebuild y_pred_single from tau_single")

# Group（BA）
if not {"y_pred_group_BA","y_pred_group_WG"}.issubset(pred.columns):
    cols = ["fold_id","tau_high_BA","tau_low_BA","tau_high_WG","tau_low_WG"]
    if not set(cols).issubset(df_fold.columns):
        raise RuntimeError("[Cell8.7c] BA用しきい値列が見つからない（Cell 8）")
    pred = pred.merge(df_fold[cols], on="fold_id", how="left")
    th_main = np.where(pred["group"].astype(str)=="High", pred["tau_high_BA"], pred["tau_low_BA"]).astype(float)
    th_wg   = np.where(pred["group"].astype(str)=="High", pred["tau_high_WG"], pred["tau_low_WG"]).astype(float)
    pred["y_pred_group_BA"] = (pred["proba"].astype(float) >= th_main).astype(int)
    pred["y_pred_group_WG"] = (pred["proba"].astype(float) >= th_wg).astype(int)
    pred.drop(columns=cols, inplace=True, errors="ignore")
    rebuild_msgs.append("rebuild y_pred_group_BA / y_pred_group_WG from taus")

# ---- 即アサーション（ユーザ指定） ----
assert pred[["y_true","proba","group"]].isna().sum().sum() == 0, "[8.7c] pred に NA が含まれる"
# fold件数一致（pred の fold 出現数と df_fold の fold 行数が一致）
n_pred_folds = pred["fold_id"].nunique()
n_fold_rows  = df_fold["fold_id"].nunique() if "fold_id" in df_fold.columns else len(df_fold["fold_id"].unique())
assert n_pred_folds == n_fold_rows, f"[8.7c] fold数不一致: pred={n_pred_folds}, df_fold={n_fold_rows}"
# 必須列
assert set(pred.columns).issuperset(need_cols), f"[8.7c] 必須列欠如: need={need_cols - set(pred.columns)}"
if rebuild_msgs:
    print("[8.7c] rebuilt -> " + " / ".join(rebuild_msgs))

# ---- ユーティリティ ----
def _cm(y_true, y_pred):
    return confusion_matrix(y_true, y_pred, labels=[0,1])

def _tp_fp_fn_tn(y_true, y_pred):
    TN, FP, FN, TP = _cm(y_true, y_pred).ravel()
    return int(TP), int(FP), int(FN), int(TN)

def _tpr_fpr(y_true, y_pred):
    TP, FP, FN, TN = _tp_fp_fn_tn(y_true, y_pred)
    TPR = TP / (TP + FN) if (TP + FN) > 0 else np.nan
    FPR = FP / (FP + TN) if (FP + TN) > 0 else np.nan
    return TPR, FPR

# ---- (1) 全体 / High / Low ：同一インデックス集合で (TP,FP,FN,TN) を再計算して表にする ----
def _block_compare(mask, tag):
    y  = pred.loc[mask, "y_true"].to_numpy().astype(int)
    yS = pred.loc[mask, "y_pred_single"].to_numpy().astype(int)
    yG = pred.loc[mask, "y_pred_group_BA"].to_numpy().astype(int)
    yW = pred.loc[mask, "y_pred_group_WG"].to_numpy().astype(int)

    rows = []
    for name, yhat in [("Single", yS), ("Group-BA", yG), ("Group-WG", yW)]:
        TP, FP, FN, TN = _tp_fp_fn_tn(y, yhat)
        TPR, FPR = _tpr_fpr(y, yhat)
        rows.append({"Block": tag, "Scheme": name, "N": int(mask.sum()),
                     "TP":TP, "FP":FP, "FN":FN, "TN":TN,
                     "TPR":float(TPR), "FPR":float(FPR)})
    return rows

mask_all = np.ones(len(pred), dtype=bool)
mask_H   = pred["group"].astype(str) == "High"
mask_L   = pred["group"].astype(str) == "Low"

rows_all = _block_compare(mask_all, "All")
rows_hi  = _block_compare(mask_H,   "High")
rows_lo  = _block_compare(mask_L,   "Low")

df_blocks = pd.DataFrame(rows_all + rows_hi + rows_lo)
df_blocks.to_csv(outpath("AUDIT_BLOCK_CONFUSION_BA.csv"), index=False, encoding="utf-8-sig")
print(f"[8.7c] saved -> {outpath('AUDIT_BLOCK_CONFUSION_BA.csv')}")

# ---- (2) foldごとの TPR/FPR を比較し、“TPR↑ & FPR↓” のfoldを特定 ----
comp_rows = []
for fid, sub in pred.groupby("fold_id"):
    y  = sub["y_true"].to_numpy().astype(int)

    # Single
    yS = sub["y_pred_single"].to_numpy().astype(int)
    TPR_S, FPR_S = _tpr_fpr(y, yS)

    # Group-BA
    yG = sub["y_pred_group_BA"].to_numpy().astype(int)
    TPR_G, FPR_G = _tpr_fpr(y, yG)

    # Group-WG
    yW = sub["y_pred_group_WG"].to_numpy().astype(int)
    TPR_W, FPR_W = _tpr_fpr(y, yW)

    comp_rows.append({
        "fold_id": fid,
        "TPR_S": TPR_S, "FPR_S": FPR_S,
        "TPR_G": TPR_G, "FPR_G": FPR_G,
        "TPR_W": TPR_W, "FPR_W": FPR_W,
        "TPR_up_FPR_down_G": (TPR_G > TPR_S) and (FPR_G < FPR_S),
        "TPR_up_FPR_down_W": (TPR_W > TPR_S) and (FPR_W < FPR_S),
        "n_rows": len(sub)
    })

df_fold_cmp = pd.DataFrame(comp_rows).sort_values("fold_id")
df_fold_cmp.to_csv(outpath("FOLD_TPRFPR_COMPARISON.csv"), index=False, encoding="utf-8-sig")
print(f"[8.7c] saved -> {outpath('FOLD_TPRFPR_COMPARISON.csv')}")

hit_G = df_fold_cmp.loc[df_fold_cmp["TPR_up_FPR_down_G"], "fold_id"].tolist()
hit_W = df_fold_cmp.loc[df_fold_cmp["TPR_up_FPR_down_W"], "fold_id"].tolist()
print(f"[8.7c] TPR↑&FPR↓ folds — Group-BA: {hit_G if hit_G else 'None'} | Group-WG: {hit_W if hit_W else 'None'}")

# ---- (3) 直観図：High/Low のスコア分布と τ（Single/Group-BA） ----
# 8.7(BA-only)で使った τ の μ±SD を再計算（縦線は平均値を表示）
def _mu_sd(sr):
    sr = pd.to_numeric(sr, errors="coerce").dropna()
    if len(sr) == 0:
        return (np.nan, np.nan)
    mu = float(sr.mean())
    sd = float(sr.std(ddof=1)) if len(sr) > 1 else 0.0
    return mu, sd

tauS_mu, _ = _mu_sd(df_fold.get("tau_single", pd.Series(dtype=float)))
tauH_mu, _ = _mu_sd(df_fold.get("tau_high_BA", pd.Series(dtype=float)))
tauL_mu, _ = _mu_sd(df_fold.get("tau_low_BA",  pd.Series(dtype=float)))

s_all = pred["proba"].astype(float).to_numpy()
s_H   = pred.loc[mask_H, "proba"].astype(float).to_numpy()
s_L   = pred.loc[mask_L, "proba"].astype(float).to_numpy()

plt.rcParams.update({
    "figure.dpi": 120, "savefig.dpi": 300,
    "lines.linewidth": 1.5,
    "axes.titlesize": 30, "axes.labelsize": 24,
    "legend.fontsize": 20, "xtick.labelsize": 20, "ytick.labelsize": 20,
})

# ヒスト（High/Low 重ね） + τの縦線
fig, ax = plt.subplots(1,1, figsize=(10,6))
bins = np.linspace(0.0, 1.0, 41)
ax.hist(s_H, bins=bins, alpha=0.55, density=True, label="High", edgecolor="none")
ax.hist(s_L, bins=bins, alpha=0.55, density=True, label="Low",  edgecolor="none")
if not np.isnan(tauS_mu): ax.axvline(tauS_mu,  linestyle="--", linewidth=1.5, label=f"τ_single≈{tauS_mu:.3f}")
if not np.isnan(tauH_mu): ax.axvline(tauH_mu,  linestyle="-.", linewidth=1.5, label=f"τ_H(BA)≈{tauH_mu:.3f}")
if not np.isnan(tauL_mu): ax.axvline(tauL_mu,  linestyle=":",  linewidth=1.5, label=f"τ_L(BA)≈{tauL_mu:.3f}")
ax.set_xlabel("Predicted score"); ax.set_ylabel("Density")
ax.set_title("Score distribution by group with thresholds (means)")
ax.legend()
plt.tight_layout()
plt.savefig(outpath("AUDIT_HIST_SCORES_WITH_TAU_BA.png"), dpi=300, bbox_inches="tight"); plt.close()
print(f"[8.7c] plot -> {outpath('AUDIT_HIST_SCORES_WITH_TAU_BA.png')}")

# ---- (4) 参考：ROC（High/Low）に τ_single/τ_H/τ_L の動作点を打つ ----
def _op_point(scores, labels, tau):
    yhat = (scores >= tau).astype(int)
    TP, FP, FN, TN = _tp_fp_fn_tn(labels, yhat)
    TPR = TP / (TP + FN) if (TP + FN) > 0 else np.nan
    FPR = FP / (FP + TN) if (FP + TN) > 0 else np.nan
    return FPR, TPR

fig, ax = plt.subplots(1,1, figsize=(7.5,7.5))
for tag, scores, labels, col in [
    ("High", s_H, pred.loc[mask_H, "y_true"].to_numpy().astype(int), "tab:blue"),
    ("Low",  s_L, pred.loc[mask_L, "y_true"].to_numpy().astype(int), "tab:orange")
]:
    if len(np.unique(labels)) < 2:
        print(f"[8.7c][ROC] {tag}: 単一クラスのため ROC 省略")
        continue
    fpr, tpr, _ = roc_curve(labels, scores)
    ax.plot(fpr, tpr, label=f"{tag} ROC (AUC={auc(fpr,tpr):.3f})", color=col, linewidth=1.5)

# 動作点（平均τを便宜的に利用）
if not np.isnan(tauS_mu):
    # High/Low それぞれに single τ を適用した動作点
    if mask_H.any():
        xS_H, yS_H = _op_point(s_H, pred.loc[mask_H,"y_true"].to_numpy().astype(int), tauS_mu)
        ax.scatter([xS_H],[yS_H], marker="o", s=60, color="tab:blue", edgecolor="k", label="High @ τ_single")
    if mask_L.any():
        xS_L, yS_L = _op_point(s_L, pred.loc[mask_L,"y_true"].to_numpy().astype(int), tauS_mu)
        ax.scatter([xS_L],[yS_L], marker="o", s=60, color="tab:orange", edgecolor="k", label="Low  @ τ_single")
if not np.isnan(tauH_mu) and mask_H.any():
    xH, yH = _op_point(s_H, pred.loc[mask_H,"y_true"].to_numpy().astype(int), tauH_mu)
    ax.scatter([xH],[yH], marker="^", s=70, color="tab:blue", edgecolor="k", label="High @ τ_H(BA)")
if not np.isnan(tauL_mu) and mask_L.any():
    xL, yL = _op_point(s_L, pred.loc[mask_L,"y_true"].to_numpy().astype(int), tauL_mu)
    ax.scatter([xL],[yL], marker="^", s=70, color="tab:orange", edgecolor="k", label="Low  @ τ_L(BA)")

ax.plot([0,1],[0,1], linestyle="--", color="gray", linewidth=1.0)
ax.set_xlabel("FPR"); ax.set_ylabel("TPR")
ax.set_title("ROC by group with operating points")
ax.legend(loc="lower right", fontsize=12)
ax.set_aspect("equal", adjustable="box")
plt.tight_layout()
plt.savefig(outpath("AUDIT_ROC_BY_GROUP_WITH_OP_POINTS.png"), dpi=300, bbox_inches="tight"); plt.close()
print(f"[8.7c] plot -> {outpath('AUDIT_ROC_BY_GROUP_WITH_OP_POINTS.png')}")

print("[8.7c] DONE")


[8.7c] saved -> C:\Users\taiki\OneDrive - Science Tokyo\デスクトップ\研究\本実験結果\ANALYSIS\機械学習(MSSQ込み)\閾値FMS1\AUDIT_BLOCK_CONFUSION_BA.csv
[8.7c] saved -> C:\Users\taiki\OneDrive - Science Tokyo\デスクトップ\研究\本実験結果\ANALYSIS\機械学習(MSSQ込み)\閾値FMS1\FOLD_TPRFPR_COMPARISON.csv
[8.7c] TPR↑&FPR↓ folds — Group-BA: None | Group-WG: None
[8.7c] plot -> C:\Users\taiki\OneDrive - Science Tokyo\デスクトップ\研究\本実験結果\ANALYSIS\機械学習(MSSQ込み)\閾値FMS1\AUDIT_HIST_SCORES_WITH_TAU_BA.png
[8.7c] plot -> C:\Users\taiki\OneDrive - Science Tokyo\デスクトップ\研究\本実験結果\ANALYSIS\機械学習(MSSQ込み)\閾値FMS1\AUDIT_ROC_BY_GROUP_WITH_OP_POINTS.png
[8.7c] DONE


In [None]:
# ===== Cell 8.7b (BA-only): Confusion matrices — OUTER-ONLY（Single vs Group-GLOBAL vs Group-WG） =====
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, roc_auc_score

METRIC_NAME = "BA"

# 入力ファイル（8b の出力）
pred_path = outpath("PREDICTIONS_OUTERONLY.CSV")
th_path   = outpath("FINAL_THRESHOLDS_POSTHOC.CSV")
if not (os.path.exists(pred_path) and os.path.exists(th_path)):
    raise FileNotFoundError("[Cell8.7b] 必要CSVが見つからない（Cell 8b を先に実行）")

pred = pd.read_csv(pred_path, encoding="utf-8-sig")
th   = pd.read_csv(th_path, encoding="utf-8-sig")

# --- BAユーティリティ ---
def _ba_from_conf(TP, FP, FN, TN):
    TPR = TP / (TP + FN) if (TP + FN) > 0 else 0.0
    TNR = TN / (TN + FP) if (TN + FP) > 0 else 0.0
    return 0.5 * (TPR + TNR)

def _BA_from_preds(y_true_bin: np.ndarray, y_pred_bin: np.ndarray) -> float:
    cm = confusion_matrix(y_true_bin, y_pred_bin, labels=[0,1])
    TN, FP, FN, TP = cm.ravel()
    return float(_ba_from_conf(TP, FP, FN, TN))

# --- 予測列が無い場合は、しきい値ファイルから再構築（保険） ---
if "y_pred_single" not in pred.columns:
    tau_single = float(th["tau_single"].iloc[0])
    pred["y_pred_single"] = (pred["proba"].astype(float) >= tau_single).astype(int)

if not {"y_pred_group_BA","y_pred_group_WG"}.issubset(pred.columns):
    tauH_MAIN, tauL_MAIN = float(th["tau_high_BA"].iloc[0]), float(th["tau_low_BA"].iloc[0])
    tauH_WG,   tauL_WG   = float(th["tau_high_WG"].iloc[0]), float(th["tau_low_WG"].iloc[0])
    th_main = np.where(pred["group"].astype(str)=="High", tauH_MAIN, tauL_MAIN).astype(float)
    th_wg   = np.where(pred["group"].astype(str)=="High", tauH_WG,   tauL_WG).astype(float)
    pred["y_pred_group_BA"] = (pred["proba"].astype(float) >= th_main).astype(int)
    pred["y_pred_group_WG"] = (pred["proba"].astype(float) >= th_wg).astype(int)

# --- データ抽出 ---
y = pred["y_true"].astype(int).to_numpy()
p = pred["proba"].astype(float).to_numpy()
if len(np.unique(y)) < 2:
    raise RuntimeError("[Cell8.7b] プール真値が単一クラスで AUC/混同行列不可（データやFMS閾値定義を確認）")

def _cm(y_true, y_pred):
    return confusion_matrix(y_true, y_pred, labels=[0,1])

yS = pred["y_pred_single"].astype(int).to_numpy()
yG = pred["y_pred_group_BA"].astype(int).to_numpy()
yW = pred["y_pred_group_WG"].astype(int).to_numpy()

cmS = _cm(y, yS); cmG = _cm(y, yG); cmW = _cm(y, yW)
scoreS = _BA_from_preds(y, yS)
scoreG = _BA_from_preds(y, yG)
scoreW = _BA_from_preds(y, yW)

auc_pooled = float(roc_auc_score(y, p))

# --- 描画ヘルパ ---
plt.rcParams.update({
    "figure.dpi": 120, "savefig.dpi": 300,
    "lines.linewidth": 1.5,
    "axes.titlesize": 30,
    "axes.labelsize": 24,
    "legend.fontsize": 20,
    "xtick.labelsize": 20,
    "ytick.labelsize": 20,
})

def draw(ax, cm, title):
    TN, FP, FN, TP = cm.ravel()
    mat = np.array([[TN, FP],[FN, TP]], dtype=int)
    name = np.array([["TN", "FP"],["FN", "TP"]])
    vmax = max(mat.max(), 1)
    ax.imshow(mat, cmap="Blues", vmin=0, vmax=vmax)

    row_sums = mat.sum(axis=1, keepdims=True)
    pct = np.divide(mat, np.where(row_sums==0, 1, row_sums), where=(row_sums!=0)) * 100.0

    for i in range(2):
        for j in range(2):
            val = mat[i, j]; prc = pct[i, j]
            color = "white" if val > 0.6 * vmax else "black"
            ax.text(j, i, f"{name[i,j]} {val}\n({prc:.1f}%)",
                    ha="center", va="center", fontsize=26, fontweight="bold", color=color)

    ax.set_xticks([0,1]); ax.set_xticklabels(["Pred: Non-Sick","Pred: Sick"])
    ax.set_yticks([0,1]); ax.set_yticklabels(["True: Non-Sick","True: Sick"], rotation=90, va="center")
    ax.set_title(title, pad=10)
    ax.set_xlabel(""); ax.set_ylabel("")
    ax.grid(False)

# タイトル
title_single = f"Single τ ({METRIC_NAME}={scoreS:.3f})"
title_gMAIN  = f"Group τ ({METRIC_NAME}-opt) ({METRIC_NAME}={scoreG:.3f})"
title_gWG    = f"Group τ (WG-{METRIC_NAME}-opt) ({METRIC_NAME}={scoreW:.3f})"

# 作図
fig, axes = plt.subplots(1, 3, figsize=(24, 9), constrained_layout=True)
draw(axes[0], cmS, title_single)
draw(axes[1], cmG, title_gMAIN)
draw(axes[2], cmW, title_gWG)

supt = f"Confusion matrices (OUTER-ONLY post-hoc)  ROC AUC={auc_pooled:.3f}, metric={METRIC_NAME}"
fig.suptitle(supt, fontsize=32, y=1.10)
fig.set_constrained_layout_pads(w_pad=0.02, h_pad=0.02, wspace=0.28, hspace=0.02)

out_png = outpath("CONFMAT_OUTERONLY_SINGLE_vs_GROUP.png")
plt.savefig(out_png, dpi=300, bbox_inches="tight")
plt.close()
print(f"[Cell8.7b] plot -> {out_png}  (post-hoc tuned on pooled CV predictions)")


[Cell8.7b] plot -> C:\Users\taiki\OneDrive - Science Tokyo\デスクトップ\研究\本実験結果\ANALYSIS\機械学習(MSSQ込み)\閾値FMS1\CONFMAT_OUTERONLY_SINGLE_vs_GROUP.png  (post-hoc tuned on pooled CV predictions)


In [None]:
# ===== Cell 8.8 (BA-only): Confusion matrices — Group別（Highのみ/Lowのみ） =====
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix

METRIC_NAME = "BA"

# 入力（Cell 8 実行済み前提）
fold_path = outpath("GROUP_AWARE_THRESH_BY_FOLD.CSV")
pred_path = outpath("GROUP_AWARE_PREDICTIONS.CSV")
if not (os.path.exists(fold_path) and os.path.exists(pred_path)):
    raise FileNotFoundError("[Cell8.8] 必要CSVが見つからない（Cell 8 を先に実行）")

df_fold = pd.read_csv(fold_path, encoding="utf-8-sig")
pred    = pd.read_csv(pred_path, encoding="utf-8-sig")

# --- BAユーティリティ ---
def _ba_from_conf(TP, FP, FN, TN):
    TPR = TP / (TP + FN) if (TP + FN) > 0 else 0.0
    TNR = TN / (TN + FP) if (TN + FP) > 0 else 0.0
    return 0.5 * (TPR + TNR)

def _BA_from_preds(y_true_bin: np.ndarray, y_pred_bin: np.ndarray) -> float:
    cm = confusion_matrix(y_true_bin, y_pred_bin, labels=[0,1])
    TN, FP, FN, TP = cm.ravel()
    return float(_ba_from_conf(TP, FP, FN, TN))

def _cm(y_true, y_pred):
    return confusion_matrix(y_true, y_pred, labels=[0,1])

def _mu_sd(sr):
    sr = pd.to_numeric(sr, errors="coerce").dropna()
    if len(sr) == 0:
        return (np.nan, np.nan)
    mu = float(sr.mean())
    sd = float(sr.std(ddof=1)) if len(sr) > 1 else 0.0
    return (mu, sd)

# --- 予測列が無い場合は再構築（保険） ---
if "y_pred_single" not in pred.columns and "tau_single" in df_fold.columns:
    pred = pred.merge(df_fold[["fold_id","tau_single"]], on="fold_id", how="left")
    pred["y_pred_single"] = (pred["proba"].astype(float) >= pred["tau_single"].astype(float)).astype(int)
    pred.drop(columns=["tau_single"], inplace=True)

# Group（BAしきい値から再構築）
need = {"y_pred_group_BA","y_pred_group_WG"}
if not need.issubset(pred.columns):
    cols = ["fold_id","tau_high_BA","tau_low_BA","tau_high_WG","tau_low_WG"]
    pred = pred.merge(df_fold[cols], on="fold_id", how="left")
    th_main = np.where(pred["group"].astype(str)=="High", pred["tau_high_BA"], pred["tau_low_BA"]).astype(float)
    th_wg   = np.where(pred["group"].astype(str)=="High", pred["tau_high_WG"], pred["tau_low_WG"]).astype(float)
    pred["y_pred_group_BA"] = (pred["proba"].astype(float) >= th_main).astype(int)
    pred["y_pred_group_WG"] = (pred["proba"].astype(float) >= th_wg).astype(int)
    pred.drop(columns=["tau_high_BA","tau_low_BA","tau_high_WG","tau_low_WG"], inplace=True)

col_main, col_wg = "y_pred_group_BA", "y_pred_group_WG"

# --- 群別に抽出 ---
mask_H = pred["group"].astype(str) == "High"
mask_L = pred["group"].astype(str) == "Low"

if not mask_H.any() or not mask_L.any():
    raise RuntimeError("[Cell8.8] High/Low のいずれかが存在しないため群別混同行列が作れません")

# τの μ±SD（群別・スキーム別）
tauH_MAIN_mu, tauH_MAIN_sd = _mu_sd(df_fold.get("tau_high_BA",  pd.Series(dtype=float)))
tauL_MAIN_mu, tauL_MAIN_sd = _mu_sd(df_fold.get("tau_low_BA",   pd.Series(dtype=float)))
tauH_WG_mu,   tauH_WG_sd   = _mu_sd(df_fold.get("tau_high_WG",  pd.Series(dtype=float)))
tauL_WG_mu,   tauL_WG_sd   = _mu_sd(df_fold.get("tau_low_WG",   pd.Series(dtype=float)))

# --- スコア計算（群別×スキーム別） ---
def _score_cm_for(mask, col_pred):
    y = pred.loc[mask, "y_true"].astype(int).to_numpy()
    yhat = pred.loc[mask, col_pred].astype(int).to_numpy()
    if len(y) == 0:
        return None, np.nan, 0, 0, 0, np.nan
    cm = _cm(y, yhat)
    sc = _BA_from_preds(y, yhat)
    n  = int(mask.sum())
    pos = int((pred.loc[mask, "y_true"]==1).sum())
    neg = n - pos
    pi  = (pos / n * 100.0) if n > 0 else np.nan
    return cm, sc, n, pos, neg, pi

cm_H_main, BA_H_main, nH, pH, nHn, piH = _score_cm_for(mask_H, col_main)
cm_L_main, BA_L_main, nL, pL, nLn, piL = _score_cm_for(mask_L, col_main)
cm_H_wg,   BA_H_wg,   _,  _,  _,   _   = _score_cm_for(mask_H, col_wg)
cm_L_wg,   BA_L_wg,   _,  _,  _,   _   = _score_cm_for(mask_L, col_wg)

# --- 描画共通スタイル ---
plt.rcParams.update({
    "figure.dpi": 120, "savefig.dpi": 300,
    "lines.linewidth": 1.5,
    "axes.titlesize": 30,
    "axes.labelsize": 24,
    "legend.fontsize": 20,
    "xtick.labelsize": 20,
    "ytick.labelsize": 20,
})

def _draw(ax, cm, title):
    TN, FP, FN, TP = cm.ravel()
    mat = np.array([[TN, FP],[FN, TP]], dtype=int)
    name = np.array([["TN","FP"],["FN","TP"]])

    vmax = max(mat.max(), 1)
    ax.imshow(mat, cmap="Blues", vmin=0, vmax=vmax)

    row_sums = mat.sum(axis=1, keepdims=True)
    pct = np.divide(mat, np.where(row_sums==0, 1, row_sums),
                    where=(row_sums!=0)) * 100.0

    for i in range(2):
        for j in range(2):
            val = mat[i, j]; prc = pct[i, j]
            color = "white" if val > 0.6 * vmax else "black"
            ax.text(j, i, f"{name[i,j]} {val}\n({prc:.1f}%)",
                    ha="center", va="center",
                    fontsize=26, fontweight="bold", color=color)

    ax.set_xticks([0,1]); ax.set_xticklabels(["Pred: Non-Sick","Pred: Sick"])
    ax.set_yticks([0,1]); ax.set_yticklabels(["True: Non-Sick","True: Sick"], rotation=90, va="center")
    ax.set_title(title, pad=10)
    ax.set_xlabel(""); ax.set_ylabel("")
    ax.grid(False)

# === (1) Group-GLOBAL（BA最適）の High/Low 1x2 ===
fig, axes = plt.subplots(1, 2, figsize=(18, 8), constrained_layout=True)
title_H_main = f"High — Group-{METRIC_NAME}-opt  ({METRIC_NAME}={BA_H_main:.3f}, n={nH}, π={piH:.1f}%)\nτ_H={tauH_MAIN_mu:.3f}±{tauH_MAIN_sd:.3f}"
title_L_main = f"Low  — Group-{METRIC_NAME}-opt  ({METRIC_NAME}={BA_L_main:.3f}, n={nL}, π={piL:.1f}%)\nτ_L={tauL_MAIN_mu:.3f}±{tauL_MAIN_sd:.3f}"
_draw(axes[0], cm_H_main, title_H_main)
_draw(axes[1], cm_L_main, title_L_main)
out_png1 = outpath("CONFMAT_GROUP_GLOBAL_BY_GROUP.png")
plt.savefig(out_png1, dpi=300, bbox_inches="tight"); plt.close()
print(f"[Cell8.8] plot -> {out_png1}")

# === (2) Group-WG（Worst-Group BA最適）の High/Low 1x2 ===
fig, axes = plt.subplots(1, 2, figsize=(18, 8), constrained_layout=True)
title_H_wg = f"High — Group-WG-{METRIC_NAME}-opt  ({METRIC_NAME}={BA_H_wg:.3f}, n={nH}, π={piH:.1f}%)\nτ_H={tauH_WG_mu:.3f}±{tauH_WG_sd:.3f}"
title_L_wg = f"Low  — Group-WG-{METRIC_NAME}-opt  ({METRIC_NAME}={BA_L_wg:.3f}, n={nL}, π={piL:.1f}%)\nτ_L={tauL_WG_mu:.3f}±{tauL_WG_sd:.3f}"
_draw(axes[0], cm_H_wg, title_H_wg)
_draw(axes[1], cm_L_wg, title_L_wg)
out_png2 = outpath("CONFMAT_GROUP_WG_BY_GROUP.png")
plt.savefig(out_png2, dpi=300, bbox_inches="tight"); plt.close()
print(f"[Cell8.8] plot -> {out_png2}")


[Cell8.8] plot -> C:\Users\taiki\OneDrive - Science Tokyo\デスクトップ\研究\本実験結果\ANALYSIS\機械学習(MSSQ込み)\閾値FMS1\CONFMAT_GROUP_GLOBAL_BY_GROUP.png
[Cell8.8] plot -> C:\Users\taiki\OneDrive - Science Tokyo\デスクトップ\研究\本実験結果\ANALYSIS\機械学習(MSSQ込み)\閾値FMS1\CONFMAT_GROUP_WG_BY_GROUP.png


In [None]:
# ===== Cell 9 (NEW): Probability density plots — Single vs High vs Low（しきい値は他セルの出力を読込） =====
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# ------------------------------------------------------------
# ユーザ環境前提：
# - outpath() が既に定義済み（他セルと同じ）
# - THRESH_OBJECTIVE in {"f1","ba"} が必要なら定義済み（未定義なら "f1"）
# ------------------------------------------------------------

# ===== パラメータ =====
CELL9_BINS: int = int(globals().get("CELL9_BINS", 30))             # ヒストのビン数
CELL9_SOURCE: str = str(globals().get("CELL9_SOURCE", "auto"))     # "auto" / "inner" / "outer"
CELL9_TAU_AGG: str = str(globals().get("CELL9_TAU_AGG", "wmean"))  # τ代表値の集計: "wmean"|"mean"|"median"

METRIC = str(globals().get("THRESH_OBJECTIVE", "f1")).lower()
if METRIC not in {"f1", "ba"}:
    METRIC = "f1"
METRIC_NAME = "F1" if METRIC == "f1" else "BA"

# ===== 入力候補ファイル =====
PRED_INNER = outpath("GROUP_AWARE_PREDICTIONS.CSV")
FOLD_INNER = outpath("GROUP_AWARE_THRESH_BY_FOLD.CSV")

PRED_OUTER = outpath("PREDICTIONS_OUTERONLY.CSV")
THRS_OUTER = outpath("FINAL_THRESHOLDS_POSTHOC.CSV")

METR_INNER = outpath("LOSO_METRICS.CSV")
METR_OUTER = outpath("METRICS_POSTHOC.CSV")

# ===== ユーティリティ =====
def _exists(p): return isinstance(p, str) and os.path.exists(p)

def _pick_source():
    """描画に用いる予測CSV/しきい値CSVを自動選択（外側post-hocを優先）"""
    if CELL9_SOURCE == "outer":
        if not (_exists(PRED_OUTER) and _exists(THRS_OUTER)):
            raise FileNotFoundError("[Cell9] outer 指定だが PREDICTIONS_OUTERONLY/FINAL_THRESHOLDS_POSTHOC が見つからない")
        return ("outer", PRED_OUTER, THRS_OUTER)
    if CELL9_SOURCE == "inner":
        if not (_exists(PRED_INNER) and _exists(FOLD_INNER)):
            raise FileNotFoundError("[Cell9] inner 指定だが GROUP_AWARE_* CSV が見つからない")
        return ("inner", PRED_INNER, FOLD_INNER)

    # auto: outer を優先、無ければ inner
    if _exists(PRED_OUTER) and _exists(THRS_OUTER):
        return ("outer", PRED_OUTER, THRS_OUTER)
    if _exists(PRED_INNER) and _exists(FOLD_INNER):
        return ("inner", PRED_INNER, FOLD_INNER)
    raise FileNotFoundError("[Cell9] 予測/しきい値CSVが見つからない（Cell 8 or 8b を先に実行）")

def _load_predictions(pred_path: str):
    df = pd.read_csv(pred_path, encoding="utf-8-sig")
    for col in ("y_true","proba","group"):
        if col not in df.columns:
            raise KeyError(f"[Cell9] {pred_path} に {col} 列がない")
    y = df["y_true"].astype(int).to_numpy()
    s = df["proba"].astype(float).to_numpy()
    g = df["group"].astype(str).to_numpy()
    return df, y, s, g

def _aggregate_tau(sr: pd.Series, weights: pd.Series | None, how: str) -> float:
    sr = pd.to_numeric(sr, errors="coerce").dropna()
    if sr.empty: return np.nan
    how = how.lower()
    if how == "median":
        return float(sr.median())
    if how == "wmean" and weights is not None and len(weights)==len(sr):
        w = pd.to_numeric(weights, errors="coerce").fillna(0).to_numpy()
        s = sr.to_numpy()
        wsum = float(w.sum())
        return float((s*w).sum()/wsum) if wsum>0 else float(s.mean())
    # default: mean
    return float(sr.mean())

def _load_thresholds(source_kind: str, th_path: str):
    """
    τ を“読込だけ”で取得する（再計算しない）。
    - outer: FINAL_THRESHOLDS_POSTHOC.CSV から単一値を読む
    - inner: GROUP_AWARE_THRESH_BY_FOLD.CSV から fold 別 τ を集計して代表値を作る
    戻り値: dict {"tau_single","tauH_WG","tauL_WG"}  ※METRICに応じて列を選択
    """
    if source_kind == "outer":
        df = pd.read_csv(th_path, encoding="utf-8-sig")
        need_cols = ["tau_single"]
        if METRIC == "f1":
            need_cols += ["tau_high_WGF1","tau_low_WGF1"]
        else:
            need_cols += ["tau_high_WG","tau_low_WG"]
        for c in need_cols:
            if c not in df.columns:
                raise KeyError(f"[Cell9] {th_path} に {c} が見つからない")

        tau_single = float(df["tau_single"].iloc[0])
        if METRIC == "f1":
            tauH, tauL = float(df["tau_high_WGF1"].iloc[0]), float(df["tau_low_WGF1"].iloc[0])
        else:
            tauH, tauL = float(df["tau_high_WG"].iloc[0]),   float(df["tau_low_WG"].iloc[0])
        return {"tau_single": tau_single, "tauH_WG": tauH, "tauL_WG": tauL, "note": "outer/post-hoc"}

    # inner: fold 別を代表値に集計
    df = pd.read_csv(th_path, encoding="utf-8-sig")
    if "tau_single" not in df.columns:
        raise KeyError(f"[Cell9] {th_path} に tau_single が見つからない")
    weights = df["n_test"] if "n_test" in df.columns else None

    tau_single = _aggregate_tau(df["tau_single"], weights, CELL9_TAU_AGG)

    if METRIC == "f1":
        needH, needL = "tau_high_WGF1", "tau_low_WGF1"
    else:
        needH, needL = "tau_high_WG", "tau_low_WG"
    for c in (needH, needL):
        if c not in df.columns:
            raise KeyError(f"[Cell9] {th_path} に {c} が見つからない")

    tauH = _aggregate_tau(df[needH], weights, CELL9_TAU_AGG)
    tauL = _aggregate_tau(df[needL], weights, CELL9_TAU_AGG)
    return {"tau_single": tau_single, "tauH_WG": tauH, "tauL_WG": tauL, "note": f"inner/agg={CELL9_TAU_AGG}"}

def _load_metrics_title():
    """AUC/n 情報があればタイトル用に取得（無ければ None）"""
    src = METR_OUTER if _exists(METR_OUTER) else (METR_INNER if _exists(METR_INNER) else None)
    if not src: return None
    try:
        m = pd.read_csv(src, encoding="utf-8-sig")
        auc = float(m.filter(regex="AUC", axis=1).iloc[0].dropna().values[0])
        n  = int(m["n_samples"].iloc[0]) if "n_samples" in m.columns else None
        return {"auc": auc, "n": n, "src": os.path.basename(src)}
    except Exception:
        return None

def _draw_density(ax, s0, s1, tau: float | None, title: str, bins: int):
    """
    s0: P(Sick) for Non-Sick (y=0)
    s1: P(Sick) for Sick     (y=1)
    """
    # 規格化ヒスト（density=True）
    bin_edges = np.linspace(0, 1, bins+1)
    ax.hist(s0, bins=bin_edges, density=True, alpha=0.45, label="Non-Sick", edgecolor="none")
    ax.hist(s1, bins=bin_edges, density=True, alpha=0.45, label="Sick",     edgecolor="none")

    # しきい値（読込のみ）
    if tau is not None and np.isfinite(tau):
        ax.axvline(float(tau), linestyle="--", linewidth=1.5)
        ax.text(float(tau), ax.get_ylim()[1]*0.95, f"τ={float(tau):.3f}",
                ha="center", va="top", fontsize=20, bbox=dict(boxstyle="round,pad=0.25", fc="white", ec="none", alpha=0.7))

    ax.set_xlim(0, 1)
    ax.set_xlabel("Predicted probability P(Sick)")
    ax.set_ylabel("Density")
    ax.set_title(title, pad=10)
    ax.legend(loc="upper left", frameon=True)

# ===== メイン処理 =====
source_kind, pred_csv, thr_csv = _pick_source()
pred_df, y_pool, s_pool, g_pool = _load_predictions(pred_csv)
taus = _load_thresholds(source_kind, thr_csv)

# High/Low 抽出
maskH = (g_pool == "High")
maskL = (g_pool == "Low")
if not (maskH.any() and maskL.any()):
    raise RuntimeError("[Cell9] High/Low いずれかが存在しないため属性別パネルが作図不可")

# y=0/1 の確率配列
s_all_0, s_all_1 = s_pool[y_pool==0], s_pool[y_pool==1]
s_high_0, s_high_1 = s_pool[(y_pool==0) & maskH], s_pool[(y_pool==1) & maskH]
s_low_0,  s_low_1  = s_pool[(y_pool==0) & maskL], s_pool[(y_pool==1) & maskL]

# スタイル（ユーザ規約）
plt.rcParams.update({
    "figure.dpi": 120, "savefig.dpi": 300,
    "lines.linewidth": 1.5,
    "axes.titlesize": 30,
    "axes.labelsize": 24,
    "legend.fontsize": 20,
    "xtick.labelsize": 20,
    "ytick.labelsize": 20,
})

# タイトル補助（AUC 等）
info = _load_metrics_title()
title_tail = ""
if info is not None:
    if info.get("n") is not None:
        pos = int((y_pool==1).sum()); neg = int((y_pool==0).sum())
        title_tail = f" (metric={METRIC_NAME}, AUC={info['auc']:.3f}, n={info['n']}, pos={pos}, neg={neg})"
    else:
        title_tail = f" (metric={METRIC_NAME}, AUC={info['auc']:.3f})"
else:
    title_tail = f" (metric={METRIC_NAME})"

# 作図
fig, axes = plt.subplots(1, 3, figsize=(24, 9), constrained_layout=True)

_draw_density(axes[0], s_all_0,  s_all_1,  taus["tau_single"], title="Single (attribute-agnostic)", bins=CELL9_BINS)
_draw_density(axes[1], s_high_0, s_high_1, taus["tauH_WG"],    title="High group (WG-optimized τ_H)", bins=CELL9_BINS)
_draw_density(axes[2], s_low_0,  s_low_1,  taus["tauL_WG"],    title="Low group (WG-optimized τ_L)", bins=CELL9_BINS)

supt = f"Probability distributions of P(Sick){title_tail}\nsource={source_kind}, thresholds={taus.get('note','')}"
fig.suptitle(supt, fontsize=32, y=1.08)
fig.set_constrained_layout_pads(w_pad=0.02, h_pad=0.02, wspace=0.28, hspace=0.02)

out_png = outpath("PROB_DENS_SINGLE_vs_GROUP.png")
plt.savefig(out_png, dpi=300, bbox_inches="tight")
plt.close()

print(f"[Cell9] source={source_kind}, metric={METRIC_NAME}, bins={CELL9_BINS}")
print(f"[Cell9] thresholds loaded: tau_single={taus['tau_single']:.3f}, tau_H={taus['tauH_WG']:.3f}, tau_L={taus['tauL_WG']:.3f} (from {os.path.basename(thr_csv)})")
print(f"[Cell9] saved -> {out_png}")


KeyError: '[Cell9] C:\\Users\\taiki\\OneDrive - Science Tokyo\\デスクトップ\\研究\\本実験結果\\ANALYSIS\\機械学習(MSSQ込み)\\閾値FMS1\\FINAL_THRESHOLDS_POSTHOC.CSV に tau_high_WGF1 が見つからない'

In [None]:
# ===== Cell 10: AUC–k 曲線（被験者数 s ごとの複数本；SHAPランキング利用） =====
"""
機能：
- Cell 4で保存済みの SHAP_FEATURE_RANKING*.CSV から「特徴量重要度順」を取得し，
  Cell 6 の k→AUC（pooled, LOSO）ロジックを拡張して，
  学習に使う被験者数 s を変えた複数本の k–AUC 曲線を描画する．
- 各 s について：被験者 s 名を無作為抽出 → その s 名の中で LeaveOneGroupOut を回し，
  k 本の上位特徴を使って pooled AUC を算出．これを複数リピートして平均と95%分位を描く．

前提：
- X_scaled_all, y_all, groups, fit_xgb_classifier(), outpath() が定義済みであること
- Cell 4 が生成した SHAP_FEATURE_RANKING.CSV（または _LABELED.CSV）が OUT_DIR にあること

出力：
- CSV: AUC_K_vs_SUBJECTS.csv（列: s, k, repeat, auc）
- 図 : AUC_K_vs_SUBJECTS.png（横軸=k，縦軸=AUC，曲線=各 s の平均，帯=2.5–97.5%）
"""

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.metrics import roc_auc_score

# ---------- ランキング読込（Cell 4の成果物） ----------
rank_candidates = [outpath("SHAP_FEATURE_RANKING_LABELED.CSV"),
                   outpath("SHAP_FEATURE_RANKING.CSV")]
rank_path = None
for p in rank_candidates:
    if os.path.exists(p):
        rank_path = p; break
if rank_path is None:
    raise FileNotFoundError("[ERROR] SHAP_FEATURE_RANKING(_LABELED).CSV が OUT_DIR に存在しない．Cell 4 を先に実行すること．")

rank_df = pd.read_csv(rank_path, encoding="utf-8-sig", index_col=0)
rank_col = "mean_abs" if "mean_abs" in rank_df.columns else ("mean_abs_shap" if "mean_abs_shap" in rank_df.columns else None)
if rank_col is None:
    raise KeyError("[ERROR] ランキングCSVに mean_abs / mean_abs_shap 列が無い．")
rank_df = rank_df.sort_values(rank_col, ascending=False)

# Xの列に存在する順序リスト
feature_order = [f for f in rank_df.index if f in X_scaled_all.columns]
if not feature_order:
    raise RuntimeError("[ERROR] ランキングに載っている特徴が X_scaled_all に見つからない．")

# ---------- パラメータ ----------
SUBJ_SIZES = None   # 例: [6, 8, 10, 12, 14]。Noneなら自動（{4,6,8,...,全体-1}）
K_LIST     = None   # 例: [2,4,6,8,10,12,16,20,24]。Noneなら自動（BEST_Kを含めつつ列数以内）
REPEATS    = 10     # 各(s,k)の反復回数
SEED_BASE  = 31415  # 乱数種
SAVE_PREFIX = "AUC_K_vs_SUBJECTS"

# ---------- 自動セットアップ ----------
subj_ids_all = groups.astype(str).unique()
n_subjects_total = len(subj_ids_all)

if SUBJ_SIZES is None:
    # {8,10,12,..., 全体-1}（4,6は除外）
    SUBJ_SIZES = [s for s in range(8, max(9, n_subjects_total), 2) if s < n_subjects_total]
    if not SUBJ_SIZES:
        SUBJ_SIZES = [max(8, n_subjects_total-1)]  # どうしても小さい場合のフォールバック

maxK = len(feature_order)
if K_LIST is None:
    _bk = globals().get("BEST_K", None)
    base = [2,4,6,8,10,12,16,20,24,32,48,64]
    if isinstance(_bk, (int, np.integer)):
        base.append(int(_bk))
    # 列数以内に制限し，重複を削除して昇順に
    K_LIST = sorted({k for k in base if 1 <= k <= maxK})
    if not K_LIST:
        K_LIST = list(range(2, min(1+maxK, 20), 2))

print(f"[INFO] s-list={SUBJ_SIZES}, k-list={K_LIST}, repeats={REPEATS}")

# ---------- 主処理 ----------
logo = LeaveOneGroupOut()
rng  = np.random.default_rng(SEED_BASE)
records = []

X_all = X_scaled_all.astype(np.float32).copy()
y_all = pd.Series(np.asarray(y_all)).reset_index(drop=True)
g_all = pd.Series(groups.astype(str).values).reset_index(drop=True)

for s in SUBJ_SIZES:
    for r in range(REPEATS):
        # 学習対象の被験者 s 名を無作為抽出
        chosen = rng.choice(subj_ids_all, size=s, replace=False)
        mask   = g_all.isin(chosen)
        X_sub  = X_all.loc[mask, feature_order]
        y_sub  = y_all.loc[mask].values
        g_sub  = g_all.loc[mask].values

        # kごとに pooled AUC（LOSO over chosen）を計算
        for k in K_LIST:
            feats = feature_order[:k]
            Xk = X_sub[feats]

            try:
                y_true_all, proba_all = [], []
                for tr_idx, te_idx in logo.split(Xk, y_sub, g_sub):
                    X_tr, X_te = Xk.iloc[tr_idx], Xk.iloc[te_idx]
                    y_tr, y_te = y_sub[tr_idx], y_sub[te_idx]

                    # 片クラスfoldはスキップ（そのままではAUC不可）
                    if len(np.unique(y_tr)) < 2 or len(np.unique(y_te)) < 2:
                        continue

                    model = fit_xgb_classifier(X_tr, pd.Series(y_tr))
                    # predict_proba を前提（fit_xgb_classifier がXGB想定）
                    proba = model.predict_proba(X_te)[:, 1]
                    y_true_all.append(y_te)
                    proba_all.append(proba)

                if len(y_true_all) == 0:
                    auc_val = np.nan
                else:
                    y_true_k  = np.concatenate(y_true_all)
                    proba_k   = np.concatenate(proba_all)
                    if len(np.unique(y_true_k)) < 2:
                        auc_val = np.nan
                    else:
                        auc_val = float(roc_auc_score(y_true_k, proba_k))

                records.append({"s": int(s), "k": int(k), "repeat": int(r), "auc": auc_val})
            except Exception as e:
                print(f"[SKIP] s={s}, r={r}, k={k} (reason: {e})")
                continue

# ---------- 保存 ----------
df_rec = pd.DataFrame(records)
csv_path = outpath(f"{SAVE_PREFIX}.csv")
df_rec.to_csv(csv_path, index=False, encoding="utf-8-sig")
print(f"[OK] CSV -> {csv_path}")

# ---------- 集計＆描画 ----------
# ---------- 集計＆描画（kで着色／極端kは大きめ、x軸反転、s=4/6は非表示） ----------
from matplotlib.colors import TwoSlopeNorm

# ---------- 集計＆描画（明るい配色／sごと色固定／まとめ図＋個別図） ----------
agg = (df_rec.groupby(["s","k"])["auc"]
       .agg(mean="mean",
            p2p5=lambda x: np.nanquantile(x, 0.025) if np.isfinite(x).any() else np.nan,
            p97p5=lambda x: np.nanquantile(x, 0.975) if np.isfinite(x).any() else np.nan,
            n="count")
       .reset_index())

# s=4,6 を除外
s_list = sorted([s for s in agg["s"].unique() if s not in (4, 6)])

# 明るいカラーパレット（Set2→Set3→tab10 を順に使う）
palettes = [plt.cm.get_cmap("Set2").colors,
            plt.cm.get_cmap("Set3").colors,
            plt.cm.get_cmap("tab10").colors]
color_pool = [c for pal in palettes for c in pal]
color_map_s = {s: color_pool[i % len(color_pool)] for i, s in enumerate(s_list)}

# ===== まとめ図 =====
plt.figure(figsize=(10, 7))
for s in s_list:
    d  = agg[agg["s"]==s].sort_values("k")
    xs = d["k"].values
    ys = d["mean"].values
    lo = d["p2p5"].values
    hi = d["p97p5"].values
    c  = color_map_s[s]

    # 95%帯（同色で明るめ）
    if np.isfinite(lo).all() and np.isfinite(hi).all():
        plt.fill_between(xs, lo, hi, color=c, alpha=0.22, linewidth=0, zorder=1)
    # 線＆マーカー
    plt.plot(xs, ys, marker="o", linewidth=1.5, color=c, label=f"s={s}", zorder=3)

plt.title("AUC vs Number of Features (k) by Training Subjects", fontsize=30)
plt.xlabel("Number of features (k)", fontsize=24)
plt.ylabel("ROC AUC (pooled, LOSO within chosen s)", fontsize=24)

# 目盛は降順（左=大きいk, 右=小さいk）
xticks = sorted(K_LIST, reverse=True)
plt.xticks(xticks, fontsize=20)
plt.yticks(fontsize=20)
plt.ylim(0.5, 1.0)
plt.grid(True, linestyle="--", linewidth=1.0, alpha=0.4)
plt.legend(title="Training subjects", fontsize=16, title_fontsize=16, ncol=2)

plt.gca().invert_xaxis()
fig_path = outpath(f"{SAVE_PREFIX}.png")
plt.tight_layout()
plt.savefig(fig_path, dpi=300)
plt.close()
print(f"[OK] FIG (combined) -> {fig_path}")

# ===== sごとの個別図 =====
for s in s_list:
    d  = agg[agg["s"]==s].sort_values("k")
    xs = d["k"].values
    ys = d["mean"].values
    lo = d["p2p5"].values
    hi = d["p97p5"].values
    c  = color_map_s[s]

    plt.figure(figsize=(8, 6))
    if np.isfinite(lo).all() and np.isfinite(hi).all():
        plt.fill_between(xs, lo, hi, color=c, alpha=0.22, linewidth=0, zorder=1)
    plt.plot(xs, ys, marker="o", linewidth=1.5, color=c, label=f"s={s}", zorder=3)

    plt.title(f"AUC vs Number of Features (k) — s={s}", fontsize=30)
    plt.xlabel("Number of features (k)", fontsize=24)
    plt.ylabel("ROC AUC (pooled, LOSO within s)", fontsize=24)
    plt.xticks(sorted(K_LIST, reverse=True), fontsize=20)
    plt.yticks(fontsize=20)
    plt.ylim(0.5, 1.0)
    plt.grid(True, linestyle="--", linewidth=1.0, alpha=0.4)
    plt.gca().invert_xaxis()

    fig_s_path = outpath(f"{SAVE_PREFIX}_s{s}.png")
    plt.tight_layout()
    plt.savefig(fig_s_path, dpi=300)
    plt.close()
    print(f"[OK] FIG (per-s) -> {fig_s_path}")


[INFO] s-list=[8, 10, 12, 14, 16], k-list=[2, 4, 5, 6, 8, 10, 12, 16, 20, 24, 32], repeats=10


KeyboardInterrupt: 

In [None]:
# ===== Cell X: 被験者ブートストラップCI（AUC, OOFベース既定） =====
"""
目的：
- LOSO OOF予測を固定し，被験者（cluster）単位のブートストラップで AUC の95%CIを推定する．
- 既に OOF が無ければ，上位 BEST_K 特徴（Cell4のSHAPランキング）で一度だけLOSOしてOOFを作成してから実行．

出力：
- OOF_PRED_BESTK.CSV（無ければ作成）
- AUC_BOOTSTRAP_SUBJECT.csv（各反復のAUC）
- AUC_BOOTSTRAP_SUMMARY.csv（平均/SE/95%CI/有効反復数 等）
- AUC_BOOTSTRAP_HIST.png（分布＋CI） / AUC_BOOTSTRAP_ECDF.png（累積分布）

主要パラメータ（下の CONFIG を調整）：
- B = 2000（反復回数）
- SEED = 20251101（乱数）
- MAX_REDRAW = 20（単一クラス回避の再抽選上限）
- MODE = "oof" または "retrain"（既定は oof）
- STRATIFY_BY = None または "MSSQ_group"（被験者層別の比率維持；既定 None）
"""

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.metrics import roc_auc_score

# -------- CONFIG --------
B = 2000
SEED = 20251101
MAX_REDRAW = 20
MODE = "oof"            # "oof" or "retrain"
STRATIFY_BY = None      # None or "MSSQ_group"
OOF_CSV = "OOF_PRED_BESTK.CSV"
BOOT_CSV = "AUC_BOOTSTRAP_SUBJECT.csv"
SUMM_CSV = "AUC_BOOTSTRAP_SUMMARY.csv"
HIST_PNG = "AUC_BOOTSTRAP_HIST.png"
ECDF_PNG = "AUC_BOOTSTRAP_ECDF.png"

# ------------- 前提確認 -------------
assert 'X_scaled_all' in globals(), "[ERROR] X_scaled_all が未定義"
assert 'y_all'        in globals(), "[ERROR] y_all が未定義"
assert 'groups'       in globals(), "[ERROR] groups が未定義"
assert 'outpath'      in globals(), "[ERROR] outpath() が未定義"

# ------------- ユーティリティ -------------
def _load_feature_order():
    """Cell4のランキングCSVから重要度降順の特徴順を取得"""
    rank_candidates = [outpath("SHAP_FEATURE_RANKING_LABELED.CSV"),
                       outpath("SHAP_FEATURE_RANKING.CSV")]
    rank_path = None
    for p in rank_candidates:
        if os.path.exists(p):
            rank_path = p; break
    if rank_path is None:
        raise FileNotFoundError("[ERROR] SHAP_FEATURE_RANKING*.CSV が見つかりません（Cell 4 実行を確認）")

    df = pd.read_csv(rank_path, encoding="utf-8-sig", index_col=0)
    rcol = "mean_abs" if "mean_abs" in df.columns else ("mean_abs_shap" if "mean_abs_shap" in df.columns else None)
    if rcol is None:
        raise KeyError("[ERROR] ランキングCSVに mean_abs / mean_abs_shap が無い")
    order = [f for f in df.sort_values(rcol, ascending=False).index if f in X_scaled_all.columns]
    if not order:
        raise RuntimeError("[ERROR] ランキングの特徴が X_scaled_all に存在しません")
    return order

def _predict_proba_safe(model, X):
    """predict_proba が無い学習器への保険"""
    try:
        return model.predict_proba(X.astype(np.float32))[:, 1]
    except Exception:
        p = model.decision_function(X)
        p = (p - p.min()) / (p.max() - p.min() + 1e-12)
        return p

def build_oof_bestk(feature_order, best_k):
    """上位 BEST_K 特徴で LOSO OOF を作成（1回だけ学習）"""
    feats = feature_order[:int(best_k)]
    X = X_scaled_all[feats].astype(np.float32)
    y = pd.Series(np.asarray(y_all)).reset_index(drop=True)
    g = pd.Series(groups.astype(str).values).reset_index(drop=True)

    logo = LeaveOneGroupOut()
    rows = []
    for tr_idx, te_idx in logo.split(X, y, g):
        X_tr, X_te = X.iloc[tr_idx], X.iloc[te_idx]
        y_tr, y_te = y.iloc[tr_idx], y.iloc[te_idx]
        if len(np.unique(y_tr)) < 2:
            raise RuntimeError("[ERROR] 学習foldが単一クラス（OOF作成中）")
        model = fit_xgb_classifier(X_tr, y_tr)
        proba = _predict_proba_safe(model, X_te)
        sub_ids = g.iloc[te_idx].astype(str).values  # 同一subjectが並ぶ
        df_fold = pd.DataFrame({"subject": sub_ids, "y_true": y_te.values, "y_score": proba})
        rows.append(df_fold)

    oof = pd.concat(rows, ignore_index=True)
    oof.to_csv(outpath(OOF_CSV), index=False, encoding="utf-8-sig")
    print(f"[OK] OOF saved -> {outpath(OOF_CSV)}")
    return oof

def _attach_strata_if_needed(oof_df):
    """STRATIFY_BY='MSSQ_group' の場合，被験者メタから層ラベルを付与"""
    if STRATIFY_BY is None:
        oof_df["strata"] = "ALL"
        return oof_df
    if STRATIFY_BY == "MSSQ_group" and 'SUBJECT_META' in globals():
        meta = SUBJECT_META.copy()
        # 代表名の推定
        id_col  = next((c for c in meta.columns if c.lower() in ("subject","subject_id","id","sid")), None)
        grp_col = next((c for c in meta.columns if c.lower() in ("mssq_group","mssqgroup","group","mssq_highlow")), None)
        if (id_col is not None) and (grp_col is not None):
            d = dict(zip(meta[id_col].astype(str), meta[grp_col].astype(str)))
            oof_df["strata"] = oof_df["subject"].astype(str).map(d).fillna("ALL")
            return oof_df
    # フォールバック
    oof_df["strata"] = "ALL"
    return oof_df

def bootstrap_auc_subject(oof_df, B=2000, seed=20251101, max_redraw=20):
    """被験者（cluster）ブートストラップでAUC分布を推定（OOF固定）"""
    rng = np.random.default_rng(seed)
    subjects = oof_df["subject"].astype(str).unique()
    n_subj = len(subjects)

    # strataごとに被験者集合を準備
    strata_by_subj = (oof_df[["subject","strata"]].drop_duplicates()
                      .set_index("subject")["strata"].to_dict())
    strata_levels = sorted(oof_df["strata"].unique())
    subj_by_strata = {s: [sub for sub in subjects if strata_by_subj.get(sub,"ALL")==s] for s in strata_levels}

    rec, skipped = [], 0
    print(f"[BOOT] start: B={B}, mode=oof, BEST_K={globals().get('BEST_K','?')}, SEED={seed}")

    for b in range(B):
        redraw = 0
        while True:
            # 層別（必要なら各層で元の被験者数と同数を復元）
            chosen = []
            for st in strata_levels:
                pool = subj_by_strata[st]
                if len(pool) == 0:
                    continue
                chosen.extend(list(rng.choice(pool, size=len(pool), replace=True)))
            # 連結（重複subjectは複数回分を結合）
            parts = [oof_df[oof_df["subject"]==sid] for sid in chosen]
            boot = pd.concat(parts, ignore_index=True)

            yb = boot["y_true"].values
            if np.unique(yb).size >= 2:
                break
            redraw += 1
            if redraw > max_redraw:
                skipped += 1
                boot = None
                break
        if boot is None:
            continue

        auc_b = float(roc_auc_score(boot["y_true"].values, boot["y_score"].values))
        rec.append(dict(
            b_id=int(b), auc=auc_b, n_subjects=int(n_subj),
            n_pos=int((boot["y_true"].values==1).sum()),
            n_neg=int((boot["y_true"].values==0).sum()),
            seed=int(seed)
        ))
        if (b+1) % 200 == 0:
            print(f"[BOOT] b={b+1:4d}  auc={auc_b:.3f}")

    df_boot = pd.DataFrame(rec)
    return df_boot, skipped

def summarize_bootstrap(df_boot, auc_oof):
    """平均・SE・95%percentile CI を計算"""
    vals = df_boot["auc"].dropna().values
    mean = float(np.nanmean(vals)) if len(vals) else np.nan
    se   = float(np.nanstd(vals, ddof=1)/np.sqrt(max(1,len(vals)))) if len(vals)>1 else np.nan
    p2p5 = float(np.nanquantile(vals, 0.025)) if len(vals) else np.nan
    p97p5= float(np.nanquantile(vals, 0.975)) if len(vals) else np.nan
    return dict(auc_oof=float(auc_oof), mean=mean, se=se, p2p5=p2p5, p97p5=p97p5,
                n_boot=int(len(vals)))

def _set_plot_style():
    plt.rcParams.update({
        "font.size": 20, "axes.titlesize": 30, "axes.labelsize": 24,
        "xtick.labelsize": 20, "ytick.labelsize": 20, "legend.fontsize": 20
    })

def plot_bootstrap_hist(df_boot, auc_oof, ci_low, ci_high, note, png_name):
    _set_plot_style()
    plt.figure(figsize=(9,6))
    vals = df_boot["auc"].dropna().values
    plt.hist(vals, bins=30, alpha=0.8)
    # 目標線
    ax = plt.gca()
    ax.axvline(auc_oof, color="red", linewidth=1.5, label=f"OOF AUC = {auc_oof:.3f}")
    ax.axvline(ci_low, color="black", linewidth=1.5, linestyle="--", label=f"95% CI [{ci_low:.3f}, {ci_high:.3f}]")
    ax.axvline(ci_high, color="black", linewidth=1.5, linestyle="--")
    plt.title("Subject Bootstrap of AUC (Histogram)")
    plt.xlabel("AUC")
    plt.ylabel("Frequency")
    plt.legend()
    plt.text(0.98, 0.02, note, ha="right", va="bottom", transform=ax.transAxes, fontsize=12)
    plt.tight_layout()
    plt.savefig(outpath(png_name), dpi=300)
    plt.close()
    print(f"[OK] FIG -> {outpath(png_name)}")

def plot_bootstrap_ecdf(df_boot, auc_oof, ci_low, ci_high, note, png_name):
    _set_plot_style()
    plt.figure(figsize=(9,6))
    vals = np.sort(df_boot["auc"].dropna().values)
    y = np.arange(1, len(vals)+1) / max(1, len(vals))
    plt.plot(vals, y, linewidth=1.5)
    ax = plt.gca()
    ax.axvline(auc_oof, color="red", linewidth=1.5, label=f"OOF AUC = {auc_oof:.3f}")
    ax.axvline(ci_low, color="black", linewidth=1.5, linestyle="--", label=f"95% CI [{ci_low:.3f}, {ci_high:.3f}]")
    ax.axvline(ci_high, color="black", linewidth=1.5, linestyle="--")
    plt.title("Subject Bootstrap of AUC (ECDF)")
    plt.xlabel("AUC")
    plt.ylabel("Cumulative probability")
    plt.legend()
    plt.text(0.98, 0.02, note, ha="right", va="bottom", transform=ax.transAxes, fontsize=12)
    plt.tight_layout()
    plt.savefig(outpath(png_name), dpi=300)
    plt.close()
    print(f"[OK] FIG -> {outpath(png_name)}")

# ------------- 実行 -------------
# 1) OOFの用意（無ければ作成）
oof_path = outpath(OOF_CSV)
if os.path.exists(oof_path):
    oof = pd.read_csv(oof_path, encoding="utf-8-sig")
    print(f"[INFO] load OOF -> {oof_path}")
else:
    assert 'BEST_K' in globals(), "[ERROR] BEST_K が未定義（Cell 6 実行で決定してください）"
    feat_order = _load_feature_order()
    oof = build_oof_bestk(feat_order, BEST_K)

# OOF基準のAUC
auc_oof = float(roc_auc_score(oof["y_true"].values, oof["y_score"].values))
print(f"[INFO] OOF AUC = {auc_oof:.3f}  (n={len(oof)})")

# 層ラベル付与（必要時）
oof = _attach_strata_if_needed(oof)

# 2) ブートストラップ（MODE="oof" の場合）
if MODE == "oof":
    df_boot, skipped = bootstrap_auc_subject(oof, B=B, seed=SEED, max_redraw=MAX_REDRAW)

# 3) まとめ・保存
df_boot.to_csv(outpath(BOOT_CSV), index=False, encoding="utf-8-sig")
print(f"[OK] CSV -> {outpath(BOOT_CSV)}")

summ = summarize_bootstrap(df_boot, auc_oof)
summ_df = pd.DataFrame([{
    **summ,
    "skipped": int(skipped),
    "B": int(B),
    "BEST_K": int(globals().get("BEST_K", -1)),
    "MODE": MODE,
    "SEED": int(SEED)
}])
summ_df.to_csv(outpath(SUMM_CSV), index=False, encoding="utf-8-sig")
print(f"[OK] CSV -> {outpath(SUMM_CSV)}")
print(f"[INFO] 95% CI: [{summ['p2p5']:.3f}, {summ['p97p5']:.3f}]  mean={summ['mean']:.3f}  se={summ['se']:.4f}  (n_boot={summ['n_boot']}, skipped={skipped})")

# 4) 図
note = f"B={B}, BEST_K={globals().get('BEST_K','?')}, MODE={MODE}, SEED={SEED}"
plot_bootstrap_hist(df_boot, auc_oof, summ["p2p5"], summ["p97p5"], note, HIST_PNG)
plot_bootstrap_ecdf(df_boot, auc_oof, summ["p2p5"], summ["p97p5"], note, ECDF_PNG)


[OK] OOF saved -> C:\Users\taiki\OneDrive - Science Tokyo\デスクトップ\研究\本実験結果\ANALYSIS\機械学習(MSSQ込み)\閾値FMS1\OOF_PRED_BESTK.CSV
[INFO] OOF AUC = 0.747  (n=340)
[BOOT] start: B=2000, mode=oof, BEST_K=5, SEED=20251101
[BOOT] b= 200  auc=0.697
[BOOT] b= 400  auc=0.758
[BOOT] b= 600  auc=0.751
[BOOT] b= 800  auc=0.677
[BOOT] b=1000  auc=0.771
[BOOT] b=1200  auc=0.718
[BOOT] b=1400  auc=0.742
[BOOT] b=1600  auc=0.731
[BOOT] b=1800  auc=0.659
[BOOT] b=2000  auc=0.817
[OK] CSV -> C:\Users\taiki\OneDrive - Science Tokyo\デスクトップ\研究\本実験結果\ANALYSIS\機械学習(MSSQ込み)\閾値FMS1\AUC_BOOTSTRAP_SUBJECT.csv
[OK] CSV -> C:\Users\taiki\OneDrive - Science Tokyo\デスクトップ\研究\本実験結果\ANALYSIS\機械学習(MSSQ込み)\閾値FMS1\AUC_BOOTSTRAP_SUMMARY.csv
[INFO] 95% CI: [0.637, 0.836]  mean=0.746  se=0.0011  (n_boot=2000, skipped=0)
[OK] FIG -> C:\Users\taiki\OneDrive - Science Tokyo\デスクトップ\研究\本実験結果\ANALYSIS\機械学習(MSSQ込み)\閾値FMS1\AUC_BOOTSTRAP_HIST.png
[OK] FIG -> C:\Users\taiki\OneDrive - Science Tokyo\デスクトップ\研究\本実験結果\ANALYSIS\機械学習(MSSQ込み)\閾値FMS

In [None]:
# ===== NEW CELL (REPLACE): Probability distributions — pooled only + 3 kinds of τ =====
import numpy as np, pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import gaussian_kde
from sklearn.metrics import confusion_matrix

# ============ 調整パラメータ（ここだけ触ればOK） ============
FIGSIZE_FULL = (18, 4.4)      # 全域 [0,1]
FIGSIZE_ZOOM = (18, 4.4)      # 拡大 [0, XMAX_ZOOM]
XMAX_ZOOM    = 0.20
DPI_FULL     = 300
DPI_ZOOM     = 300

FONTS = dict(                   # 文字サイズと線幅
    suptitle=18, title=14, label=12, tick=10, legend=11, box=11,
    kde_lw=1.6, hist_edge=0.2, hist_alpha=0.55, vline_lw=1.8
)

# KDE サンプル数（ピクセルではなく曲線の“滑らかさ”）
XGRID_N_FULL = 800
XGRID_N_ZOOM = 4000
# ===========================================================

# ---- 予測明細（Cell 8 の出力） ----
pred = pd.read_csv(outpath("GROUP_AWARE_PREDICTIONS.CSV"), encoding="utf-8-sig")
pred["group"] = pred["group"].astype(str)
y = pred["y_true"].astype(int).to_numpy()
s = pred["proba"].to_numpy()

# グループ（しきい値最適化用にのみ使う／分布は使わない）
H = pred.query("group=='High'")
L = pred.query("group=='Low'")
high_s = H["proba"].to_numpy(); high_y = H["y_true"].astype(int).to_numpy()
low_s  = L["proba"].to_numpy(); low_y  = L["y_true"].astype(int).to_numpy()

# 分布（プールのみ）
pooled_neg, pooled_pos = s[y==0], s[y==1]

# ---- 共通ビン（Freedman–Diaconis） ----
def _fd_bins(a, min_bins=40, max_bins=160):
    a = np.asarray(a); a = a[np.isfinite(a)]
    if a.size < 5: return min_bins
    q75, q25 = np.percentile(a, [75, 25]); iqr = max(q75-q25, 1e-6)
    h = 2*iqr*(a.size**(-1/3)); 
    if h <= 0: return min_bins
    n = int(np.clip((a.max()-a.min())/h, min_bins, max_bins))
    return max(n, min_bins)

NBINS = _fd_bins(np.r_[pooled_neg, pooled_pos])
BINS  = np.linspace(0.0, 1.0, NBINS+1)

# ---- BA と τ 探索（図用） ----
def _ba_from_preds(y_true, y_pred):
    cm = confusion_matrix(y_true, y_pred, labels=[0,1])
    TN, FP, FN, TP = cm.ravel()
    tpr = TP/(TP+FN) if (TP+FN)>0 else 0.0
    tnr = TN/(TN+FP) if (TN+FP)>0 else 0.0
    return 0.5*(tpr+tnr), tpr, tnr

def _best_tau_single(scores, labels, step=0.001):
    taus = np.arange(0.0, 1.0+1e-12, step)
    best = (-1.0, 0.5, (0.0,0.0))
    for t in taus:
        ba, tpr, tnr = _ba_from_preds(labels, (scores>=t).astype(int))
        if ba > best[0]: best = (ba, float(t), (tpr, tnr))
    return best  # (BA, tau, (TPR,TNR))

def _best_tau_group(hs, hy, ls, ly, coarse=0.01, fine=0.001, margin=0.03, mode="BA"):
    def eval_pair(tH, tL):
        baH,_,_ = _ba_from_preds(hy, (hs>=tH).astype(int))
        baL,_,_ = _ba_from_preds(ly, (ls>=tL).astype(int))
        BA = (baH+baL)/2.0
        WG = min(baH, baL)
        return BA, WG, baH, baL
    taus = np.arange(0.0, 1.0+1e-12, coarse)
    best = (-1.0, 0.5, 0.5)
    for tH in taus:
        for tL in taus:
            BA, WG, *_ = eval_pair(tH, tL)
            score = BA if mode=="BA" else WG
            if score > best[0]: best = (score, tH, tL)
    def span(t):
        lo, hi = max(0.0, t-margin), min(1.0, t+margin)
        return np.arange(lo, hi+1e-12, fine)
    score0, tH0, tL0 = best
    for tH in span(tH0):
        for tL in span(tL0):
            BA, WG, *_ = eval_pair(tH, tL)
            score = BA if mode=="BA" else WG
            if score > score0: score0, tH0, tL0 = score, tH, tL
    BA, WG, baH, baL = eval_pair(tH0, tL0)
    return {"tau_H":float(tH0), "tau_L":float(tL0), "BA":float(BA), "WG":float(WG),
            "BA_H":float(baH), "BA_L":float(baL)}

# Single τ
pooled_s = np.r_[pooled_neg, pooled_pos]
pooled_y = np.r_[np.zeros_like(pooled_neg, int), np.ones_like(pooled_pos, int)]
BA_single, tau_single, (TPR_single, TNR_single) = _best_tau_single(pooled_s, pooled_y)

# Group τ（2種）
gBA = _best_tau_group(high_s, high_y, low_s, low_y, mode="BA")
gWG = _best_tau_group(high_s, high_y, low_s, low_y, mode="WG")

# ---- 描画ヘルパ（プール分布のみ） ----
def _kde(ax, data, n):
    if len(data) > 1:
        xs = np.linspace(0, 1, n)
        ax.plot(xs, gaussian_kde(data)(xs), lw=FONTS["kde_lw"], color="#2ca02c")

def _panel(ax, xlim, xgrid_n, show_legend=True):
    ax.set_xlim(*xlim); ax.set_ylim(bottom=0)
    ax.set_xlabel("Predicted probability", fontsize=FONTS["label"])
    ax.set_ylabel("Density", fontsize=FONTS["label"])
    ax.tick_params(labelsize=FONTS["tick"])
    ax.grid(True, ls="--", alpha=0.25)
    # ヒスト（pooledのみ）
    ax.hist(pooled_neg, bins=BINS, density=True, alpha=FONTS["hist_alpha"],
            color="#4e79a7", edgecolor="k", linewidth=FONTS["hist_edge"], label="y=0")
    ax.hist(pooled_pos, bins=BINS, density=True, alpha=FONTS["hist_alpha"],
            color="#f28e2b", edgecolor="k", linewidth=FONTS["hist_edge"], label="y=1")
    _kde(ax, pooled_neg, xgrid_n); _kde(ax, pooled_pos, xgrid_n)
    if show_legend:
        ax.legend(loc="upper left", framealpha=0.9, fontsize=FONTS["legend"])

def _draw_thresholds(ax):
    # Single（黒点線）
    ax.axvline(tau_single, color="black", lw=FONTS["vline_lw"], ls=":")
    # Group BA-opt（青：High=実線 / Low=破線）
    ax.axvline(gBA["tau_H"], color="#1f77b4", lw=FONTS["vline_lw"], ls="-")
    ax.axvline(gBA["tau_L"], color="#1f77b4", lw=FONTS["vline_lw"], ls="--")
    # Group WG-opt（橙：High=実線 / Low=破線）
    ax.axvline(gWG["tau_H"], color="#ff7f0e", lw=FONTS["vline_lw"], ls="-")
    ax.axvline(gWG["tau_L"], color="#ff7f0e", lw=FONTS["vline_lw"], ls="--")

def _annot(ax, where="right"):
    txt = (f"Single: τ={tau_single:.3f}, BA={BA_single:.3f}\n"
           f"Group BA-opt: τ_H={gBA['tau_H']:.3f}, τ_L={gBA['tau_L']:.3f}, BA={gBA['BA']:.3f}\n"
           f"Group WG-opt: τ_H={gWG['tau_H']:.3f}, τ_L={gWG['tau_L']:.3f}, WG-BA={gWG['WG']:.3f}")
    ax.text(0.99 if where=="right" else 0.01, 0.98, txt,
            ha="right" if where=="right" else "left", va="top",
            transform=ax.transAxes, fontsize=FONTS["box"],
            bbox=dict(facecolor="white", alpha=0.85, boxstyle="round,pad=0.25"))

# ===================== FULL =====================
figF, axF = plt.subplots(1, 1, figsize=FIGSIZE_FULL)
figF.subplots_adjust(left=0.07, right=0.98, bottom=0.18, top=0.86)
_panel(axF, (0.0, 1.0), XGRID_N_FULL, show_legend=True)
_draw_thresholds(axF)
_annot(axF, "right")
axF.set_title("Probability distributions (pooled) — full range", fontsize=FONTS["title"])
figF.suptitle("Pooled score distributions with Single τ / Group BA-opt / Group WG-opt", fontsize=FONTS["suptitle"], y=0.96)
plt.savefig(outpath("PROB_DENSITY_CLEAN_FULL.png"), dpi=DPI_FULL)
plt.close(figF)

# ===================== ZOOM =====================
figZ, axZ = plt.subplots(1, 1, figsize=FIGSIZE_ZOOM)
figZ.subplots_adjust(left=0.07, right=0.98, bottom=0.18, top=0.86)
_panel(axZ, (0.0, XMAX_ZOOM), XGRID_N_ZOOM, show_legend=True)
_draw_thresholds(axZ)
_annot(axZ, "right")
axZ.set_title(f"Probability distributions (pooled) — zoom to {XMAX_ZOOM:.2f}", fontsize=FONTS["title"])
figZ.suptitle("Pooled score distributions (zoomed) with three thresholds", fontsize=FONTS["suptitle"], y=0.96)
plt.savefig(outpath("PROB_DENSITY_CLEAN_ZOOM.png"), dpi=DPI_ZOOM)
plt.close(figZ)

print("[OK] Saved:", outpath("PROB_DENSITY_CLEAN_FULL.png"), " / ", outpath("PROB_DENSITY_CLEAN_ZOOM.png"))


In [None]:
# -*- coding: utf-8 -*-
# ===== Cell X (final, rev3): FMS histogram with Top-percentile in-bar lines =====
"""
変更点：
- パーセンタイルは「上の方（FMSが高い側）から」カウント：Top 20% / 50% / 10%
- 赤い点線は該当ビンの内部に水平表示、ラベルは点線より少し上に表示
- 件数ラベルもやや上に（ymaxの6%）
"""
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
from pathlib import Path

# ===== 設定 =====
USE_ML_WINDOW = True
LINEWIDTH = 1.5
COUNT_LABEL_OFFSET_FRAC = 0.06   # 棒上の件数ラベルの上げ幅（ymax比）
PCT_LABEL_OFFSET_FRAC = 0.02     # 点線ラベルの上げ幅（ymax比）

FMS_DIR = Path(r"C:\Users\taiki\OneDrive - Science Tokyo\デスクトップ\研究\本実験結果\ANALYSIS\FMS")
FMS_DIR.mkdir(parents=True, exist_ok=True)

# ===== データ取得 =====
CANDIDATE_DF_NAMES_ML = ["df_ml", "df_ml_epoch"] if USE_ML_WINDOW else []
CANDIDATE_DF_NAMES_ALL = ["df"]
CANDIDATE_DF_NAMES = CANDIDATE_DF_NAMES_ML + CANDIDATE_DF_NAMES_ALL
CANDIDATE_FMS_COLS = ["FMS", "fms", "Fms", "FMS_score", "FMS_Score", "FMS_label", "FMSLabel"]

src_name = None
series = None
picked_df_name = None
picked_col = None

for df_name in CANDIDATE_DF_NAMES:
    if df_name in globals() and isinstance(globals()[df_name], pd.DataFrame):
        df_candidate = globals()[df_name]
        for col in CANDIDATE_FMS_COLS:
            if col in df_candidate.columns:
                series = df_candidate[col].copy()
                picked_df_name = df_name
                picked_col = col
                src_name = "ML" if ("ml" in df_name.lower()) else "All"
                break
        if series is not None:
            break

if series is None:
    raise RuntimeError(
        "[ERROR] Could not find FMS series.\n"
        f"  Tried dataframes: {CANDIDATE_DF_NAMES}\n"
        f"  Tried columns:    {CANDIDATE_FMS_COLS}\n"
        "  → 変数名／列名を確認してください。"
    )

print(f"[OK] Using {picked_df_name}['{picked_col}'] as FMS source (src={src_name}).")

# ===== 前処理 =====
series = pd.to_numeric(series, errors="coerce").dropna().astype(int).clip(0, 4)
N = int(series.size)
if N == 0:
    raise RuntimeError("[SKIP] Input FMS series is empty after cleaning.")

# ===== スタイル =====
mpl.rcParams.update({
    "figure.dpi": 120, "savefig.dpi": 300,
    "font.size": 20,
    "axes.titlesize": 30,
    "axes.labelsize": 24,
    "xtick.labelsize": 20,
    "ytick.labelsize": 20,
    "legend.fontsize": 20,
})

# ===== 集計 =====
bin_values = np.arange(0, 5)  # 0..4
bin_counts = np.array([(series == k).sum() for k in bin_values], dtype=int)

# ===== 図作成 =====
bins = np.arange(-0.5, 5.5, 1)
fig, ax = plt.subplots(figsize=(10, 7))
counts, _, _ = ax.hist(series.values, bins=bins, edgecolor='black', alpha=0.85, linewidth=LINEWIDTH)

ax.set_xlim(-0.5, 4.5)
ymax = max(10, int(max(counts) * 1.25)) if counts.size else 10
ax.set_ylim(0, ymax)
ax.set_xticks(bin_values)
ax.set_xlabel("FMS")
ax.set_ylabel("Count")
title = f"FMS Histogram ({'ML' if src_name=='ML' else 'All'})"
ax.set_title(title)

# 棒の上に件数(n)を表示：さらに上へ
offset = max(1, int(COUNT_LABEL_OFFSET_FRAC * ymax))
for x, c in zip(bin_values, counts):
    if c > 0:
        y_text = min(c + offset, ymax * 0.98)
        ax.text(x, y_text, f"{int(c)}", ha='center', va='bottom', fontsize=18, fontweight='bold')

# ===== 「上の方から」パーセンタイル（Top 20/50/10%） =====
targets_top = [(0.20, "20%"), (0.50, "50%"), (0.10, "10%")]

def draw_top_percentile_in_bar(p_top: float, label: str):
    """
    上位 p_top の位置に赤点線を引く。
    手順：FMS=4→0 の順に累積し、need_top = p_top*N を満たす最初のビンを見つけ、
    そのビン内で「上から within_top 本目」に相当する高さに線を引く。
    """
    need_top = p_top * N
    cum = 0.0
    bin_idx = None
    within_top = 0.0

    # 4,3,2,1,0 の順で走査
    for idx in range(len(bin_counts) - 1, -1, -1):
        c = float(bin_counts[idx])
        if cum + c >= need_top:
            bin_idx = idx
            within_top = need_top - cum  # そのビンの「上から」within_top 本目
            break
        cum += c

    if bin_idx is None or bin_counts[bin_idx] == 0:
        return

    bar_h = float(bin_counts[bin_idx])

    # y座標（下からの高さ）= bar_h - within_top
    y_line = bar_h - within_top
    # 線が棒の外に出ないようクランプ（最小0.5、最大bar_h-0.5）
    y_line = float(np.clip(y_line, 0.5, max(0.5, bar_h - 0.5)))

    # 点線
    x_left, x_right = bin_idx - 0.5, bin_idx + 0.5
    ax.hlines(y_line, xmin=x_left, xmax=x_right, colors='red', linestyles='--', linewidth=LINEWIDTH + 1.0)

    # ラベル（点線より少し上、棒の外に出ないように）
    y_label = min(y_line + PCT_LABEL_OFFSET_FRAC * ymax, max(1.0, bar_h - 0.2))
    ax.text(x_right - 0.03, y_label, f"{label} (n={int(round(need_top))})",
            color='red', ha='right', va='bottom', fontsize=16, fontweight='bold')

for p_top, lab in targets_top:
    draw_top_percentile_in_bar(p_top, lab)

plt.tight_layout()

# ===== 保存 =====
out_name = "FMS_HISTOGRAM_ML_percentile_inbar.png" if (src_name == "ML") else "FMS_HISTOGRAM_ALL_percentile_inbar.png"
out_path = FMS_DIR / out_name
plt.savefig(out_path)
plt.close()
print(f"[OK] Plot -> {out_path.resolve()}")
