In [39]:
# ===== Cell 1: Load & assemble dataset =====
import os
import pandas as pd
import numpy as np

# --- 前提（ユーザー定義） ---
subjects = [
    ("0521", "因幡先生"),
    ("06021", "今村さん"),
    ("06022", "梅野さん"),
    ("06271", ""),
    ("06272", ""),
    ("06273", ""),
    ("06274", ""),
    ("06275", "")
]

title_map = { 'Face_Temp_Max_mean': 'Face_Temp Max', 'Face_Temp_Mean_mean': 'Face_Temp Mean', 'Face_Temp_Max_Diff_mean': 'Face_Temp Max Diff', 'Face_Temp_Mean_Diff_mean': 'Face_Temp Mean Diff', 'HeartRate_BPM_mean': 'Pulse HR', 'RR_interval_sec_mean': 'RR Interval', 'RMSSD': 'RMSSD', 'watch_Heart_Rate(bpm)_mean': 'Watch HR', 'watch_Sweat_Rate(mg/cm^2/min)_mean': 'Watch ACC', 'watch_Skin_Temperature(C)_mean': 'Watch Temp', 'SDSD': 'SDSD', 'pNN50': 'pNN50', 'HF_power': 'HF Power', 'LF_power': 'LF Power', 'LF_HF_ratio': 'HF/LF', 'CSI': 'CSI', 'CVI': 'CVI', 'SD1': 'SD1', 'SD2': 'SD2' }
interval_map = { 'Face_Temp_Max_mean': 30, 'Face_Temp_Mean_mean': 30, 'Face_Temp_Max_Diff_mean': 30, 'Face_Temp_Mean_Diff_mean': 30, 'HeartRate_BPM_mean': 30, 'RR_interval_sec_mean': 30, 'RMSSD': 30, 'watch_Heart_Rate(bpm)_mean': 30, 'watch_Sweat_Rate(mg/cm^2/min)_mean': 30, 'watch_Skin_Temperature(C)_mean': 30, 'SDSD': 30, 'pNN50': 60, 'HF_power': 120, 'LF_power': 120, 'LF_HF_ratio': 120, 'CSI': 30, 'CVI': 30, 'SD1': 30, 'SD2': 30 }

BASE_DIR = r"C:\Users\taiki\OneDrive - Science Tokyo\デスクトップ\研究\実験結果"

# --- 探索する候補パスのテンプレ ---
def candidate_paths(base_dir, sid, name):
    fname = f"{sid}_epoch30_merged.csv"
    cand = [
        os.path.join(base_dir, f"{sid}{name}", "EPOCH", fname),
        os.path.join(base_dir, f"{sid}{name}", fname),
        os.path.join(base_dir, fname),
    ]
    # nameが空文字のときの余分なスラッシュなども吸収
    return list(dict.fromkeys([p.replace("//", "/").replace("\\\\", "\\") for p in cand]))

# --- 列名を仕様どおりに正規化（位置で強制） ---
def normalize_columns(df):
    cols = list(df.columns)
    if len(cols) < 4:
        raise ValueError(f"列数が不足しています（{len(cols)}列）。最低4列必要です。")
    # 位置ベースで先頭3列名を固定、それ以降は既存名を尊重（重複回避）
    new_cols = ["epoch_start", "epoch_end", "FMS"] + cols[3:]
    df = df.copy()
    df.columns = new_cols
    return df

# --- 型の統一：4列目以降は数値に、FMSは数値化 ---
def enforce_types(df):
    df = df.copy()
    # FMS
    df["FMS"] = pd.to_numeric(df["FMS"], errors="coerce")
    # 4列目以降（特徴量）をfloat化
    feature_cols = df.columns[3:]
    for c in feature_cols:
        df[c] = pd.to_numeric(df[c], errors="coerce")
    return df

# --- 読み込み本体 ---
all_frames = []
for sid, name in subjects:
    # ログの見出し
    print(f"# Subject {sid}{name}")
    found = None
    for p in candidate_paths(BASE_DIR, sid, name):
        if os.path.exists(p):
            found = p
            break

    if found is None:
        print(f"[SKIP] CSV: {sid}_epoch30_merged.csv が見つかりません")
        continue

    try:
        df_i = pd.read_csv(found, encoding="utf-8-sig")
        df_i = normalize_columns(df_i)
        df_i = enforce_types(df_i)
        # 被験者情報の付与
        df_i["subject_id"] = sid
        df_i["person_name"] = name
        all_frames.append(df_i)
        print(f"[OK]  CSV -> {found}")
    except Exception as e:
        print(f"[SKIP] CSV: 読み込み/正規化に失敗: {e}")

# --- 縦結合と基本チェック ---
if len(all_frames) == 0:
    raise RuntimeError("有効なCSVが1件も読み込めませんでした。パス設定を確認してください。")

df = pd.concat(all_frames, axis=0, ignore_index=True)

# --- 学習用の列指定を確定 ---
label_col = "FMS"
group_col = "subject_id"
# 4列目以降が特徴量、ただし末尾2列（subject_id, person_name）は除外
feature_cols = list(df.columns[3:-2])

# --- 確認出力（先頭行と基本統計） ---
print("\n[INFO] df shape:", df.shape)
print("[INFO] label_col:", label_col)
print("[INFO] group_col:", group_col)
print("[INFO] #features:", len(feature_cols))
print("[INFO] head():")
print(df.iloc[:5, :8])  # 先頭8列だけ表示


# Subject 0521因幡先生
[OK]  CSV -> C:\Users\taiki\OneDrive - Science Tokyo\デスクトップ\研究\実験結果\0521因幡先生\EPOCH\0521_epoch30_merged.csv
# Subject 06021今村さん
[OK]  CSV -> C:\Users\taiki\OneDrive - Science Tokyo\デスクトップ\研究\実験結果\06021今村さん\EPOCH\06021_epoch30_merged.csv
# Subject 06022梅野さん
[OK]  CSV -> C:\Users\taiki\OneDrive - Science Tokyo\デスクトップ\研究\実験結果\06022梅野さん\EPOCH\06022_epoch30_merged.csv
# Subject 06271
[OK]  CSV -> C:\Users\taiki\OneDrive - Science Tokyo\デスクトップ\研究\実験結果\06271\EPOCH\06271_epoch30_merged.csv
# Subject 06272
[OK]  CSV -> C:\Users\taiki\OneDrive - Science Tokyo\デスクトップ\研究\実験結果\06272\EPOCH\06272_epoch30_merged.csv
# Subject 06273
[OK]  CSV -> C:\Users\taiki\OneDrive - Science Tokyo\デスクトップ\研究\実験結果\06273\EPOCH\06273_epoch30_merged.csv
# Subject 06274
[OK]  CSV -> C:\Users\taiki\OneDrive - Science Tokyo\デスクトップ\研究\実験結果\06274\EPOCH\06274_epoch30_merged.csv
# Subject 06275
[OK]  CSV -> C:\Users\taiki\OneDrive - Science Tokyo\デスクトップ\研究\実験結果\06275\EPOCH\06275_epoch30_merged.csv

[INFO] df 

In [40]:
# ===== Cell 2: Label binarization & sanity checks =====
import numpy as np
import pandas as pd

# 二値化（仕様：FMS ≥ 2 → 1）
y_all = (df[label_col] >= 2).astype(int)

# 特徴量行列とグループ列
X_all = df[feature_cols].copy()
groups = df[group_col].copy()

# 型・欠損の最終チェック（必要最低限）
# 特徴量はfloat想定：object等が混ざっていないか確認
non_numeric = [c for c in X_all.columns if not np.issubdtype(X_all[c].dtype, np.number)]
if non_numeric:
    print(f"[SKIP] 非数値列を検出: {non_numeric} -> 前処理でfloat化が必要です")
else:
    print(f"[OK]  Features dtype check -> all numeric ({len(feature_cols)} cols)")

# 欠損の有無（ここでは警告のみ。補完は前工程の責務）
na_counts = X_all.isna().sum()
n_cols_with_na = int((na_counts > 0).sum())
if n_cols_with_na > 0:
    print(f"[SKIP] 欠損あり列: {n_cols_with_na} / {len(feature_cols)} 列 -> 前工程の補完を確認してください")
else:
    print("[OK]  Missing check -> no NaNs in features")

# 被験者ごとのクラス分布を表示（LOSOの健全性確認）
print("\n[INFO] Class balance per subject (y=1 counts / total):")
warn_single_class = []
for sid, grp in df.groupby(group_col):
    y_sub = (grp[label_col] >= 2).astype(int)
    pos = int(y_sub.sum())
    tot = int(len(y_sub))
    neg = tot - pos
    print(f"# Subject {sid}")
    print(f"[OK]  Class -> pos={pos}, neg={neg}, total={tot}")
    if pos == 0 or neg == 0:
        warn_single_class.append(sid)

# 単一クラスfoldの警告（AUCはNaNになるため）
if warn_single_class:
    ids = ", ".join(map(str, warn_single_class))
    print(f"\n[SKIP] 単一クラスfold検出: {ids} -> 当該foldのROC AUCはNaNになります（仕様どおり継続）")
else:
    print("\n[OK]  全foldで2クラス確認 -> ROC AUC計算は可能")

# 形状の要約
print("\n[INFO] Shapes:")
print("X_all:", X_all.shape, "/ y_all:", y_all.shape, "/ groups:", groups.shape)


[OK]  Features dtype check -> all numeric (25 cols)
[SKIP] 欠損あり列: 12 / 25 列 -> 前工程の補完を確認してください

[INFO] Class balance per subject (y=1 counts / total):
# Subject 0521
[OK]  Class -> pos=2, neg=22, total=24
# Subject 06021
[OK]  Class -> pos=8, neg=16, total=24
# Subject 06022
[OK]  Class -> pos=1, neg=23, total=24
# Subject 06271
[OK]  Class -> pos=2, neg=22, total=24
# Subject 06272
[OK]  Class -> pos=12, neg=12, total=24
# Subject 06273
[OK]  Class -> pos=5, neg=19, total=24
# Subject 06274
[OK]  Class -> pos=0, neg=24, total=24
# Subject 06275
[OK]  Class -> pos=0, neg=24, total=24

[SKIP] 単一クラスfold検出: 06274, 06275 -> 当該foldのROC AUCはNaNになります（仕様どおり継続）

[INFO] Shapes:
X_all: (192, 25) / y_all: (192,) / groups: (192,)


In [41]:
# ===== Cell 2.5 : Baseline(単一行) & Scaling (delta/relative/robust) =====
import numpy as np
import pandas as pd

# ---- 必須カラム確認 ----
for col in ["epoch_start", "epoch_end", "FMS"]:
    if col not in df.columns:
        raise ValueError(f"[ERROR] 必須カラムが見つかりません: {col}")

# subject_id が無いCSVでも動くように（ただし単一被験者に限定）
if "subject_id" not in df.columns:
    df["subject_id"] = "SINGLE"

# ---- パラメータ ----
BASELINE_START = 1470   # ← 仕様どおり
BASELINE_END   = 1500   # ← 仕様どおり（半開区間: [1500,1530)）
ML_START       = 1500
ML_END         = 2100

# スケーリング選択: 'delta' | 'relative' | 'robust'
SCALING_MODE     = "delta"
RELATIVE_PERCENT = False  # Trueで%（×100）

# ---- 学習対象行（ML窓） ----
df_ml = df[(df["epoch_start"] >= ML_START) & (df["epoch_start"] < ML_END)].copy()
if df_ml.empty:
    raise ValueError(f"[ERROR] ML窓({ML_START} <= epoch_start < {ML_END})にデータがありません。")

# ---- 特徴量確認 ----
if "feature_cols" not in globals():
    # 4列目以降〜末尾2列手前（subject_id, person_nameなど）を想定
    feature_cols = df.columns[3:]
    # subject_id/名前系が紛れていたら除外（既に変数がある場合はそのまま使用）
    feature_cols = [c for c in feature_cols if c not in {"subject_id", "person_name"}]

if not feature_cols:
    raise ValueError("[ERROR] feature_cols が空です。")

# すべて数値化（エラーはここでSTOP）
try:
    df_ml[feature_cols] = df_ml[feature_cols].astype(float)
except Exception as e:
    raise ValueError(f"[ERROR] 特徴量に数値化できない列があります: {e}")

# ---- ベースライン行（各被験者ちょうど1行）抽出（フォールバック禁止）----
bl_rows = []
for sid, g in df.groupby("subject_id"):
    bl = g[(g["epoch_start"] >= BASELINE_START) & (g["epoch_start"] < BASELINE_END)]
    if len(bl) != 1:
        raise ValueError(
            f"[ERROR] ベースライン行がちょうど1行ではありません: subject={sid}, 該当行数={len(bl)} "
            f"(必要条件: {BASELINE_START} <= epoch_start < {BASELINE_END})"
        )
    bl_rows.append(bl.iloc[0].copy())

baseline_df = pd.DataFrame(bl_rows).set_index("subject_id")

# ベースライン値（数値化 & 欠損チェック：1つでもNaNがあればSTOP）
baseline_mat = baseline_df[feature_cols].copy()
try:
    baseline_mat = baseline_mat.astype(float)
except Exception as e:
    raise ValueError(f"[ERROR] ベースライン行に数値化できない値が含まれます: {e}")

na_mask = baseline_mat.isna()
if na_mask.any().any():
    bad = {sid: baseline_mat.columns[cols].tolist()
           for sid, cols in na_mask.index.to_series().map(lambda s: np.where(na_mask.loc[s])[0]).items()
           if len(cols) > 0}
    raise ValueError(f"[ERROR] ベースラインにNaNがあります。詳細: {bad}")

# ---- スケーリング関数（フォールバック禁止版）----
def transform_delta(X_sub: pd.DataFrame, b_vec: pd.Series) -> pd.DataFrame:
    return X_sub - b_vec

def transform_relative(X_sub: pd.DataFrame, b_vec: pd.Series, percent: bool=False) -> pd.DataFrame:
    denom = b_vec.abs()
    zero_feats = denom.index[denom == 0.0].tolist()
    if zero_feats:
        raise ValueError(f"[ERROR] relative変換の分母が0です（ベースライン=0）: features={zero_feats}")
    Z = (X_sub - b_vec) / denom
    return Z * 100.0 if percent else Z

def transform_robust(X_sub: pd.DataFrame, b_vec: pd.Series) -> pd.DataFrame:
    # IQRは同被験者のML窓から算出（フォールバック無し）
    q1 = X_sub.quantile(0.25, axis=0)
    q3 = X_sub.quantile(0.75, axis=0)
    iqr = q3 - q1
    if iqr.isna().any():
        raise ValueError(f"[ERROR] robust変換でIQRがNaNの特徴があります: {iqr[iqr.isna()].index.tolist()}")
    nonpos = iqr.index[iqr <= 0.0].tolist()
    if nonpos:
        raise ValueError(f"[ERROR] robust変換でIQR<=0の特徴があります（定数/ほぼ定数）: {nonpos}")
    return (X_sub - b_vec) / iqr

# ---- 変換の実行（被験者単位）----
scaled_blocks = []
for sid, g in df_ml.groupby("subject_id"):
    X_sub = g[feature_cols].copy()

    # ML窓側の欠損チェック：1つでもNaNがあれば即STOP
    na_locs = np.where(X_sub.isna())
    if len(na_locs[0]) > 0:
        # どの行・列にNaNがあるか簡潔に表示
        bad_rows = X_sub.index[na_locs[0]].unique().tolist()
        bad_cols = list(set([feature_cols[j] for j in na_locs[1]]))
        raise ValueError(f"[ERROR] ML窓にNaNがあります: subject={sid}, rows={bad_rows[:5]}..., cols={bad_cols}")

    b_vec = baseline_mat.loc[sid]  # 1行のベクトル（NaNなしは前段で保証済み）

    if SCALING_MODE == "delta":
        X_trf = transform_delta(X_sub, b_vec)
        mode_str = "delta"
    elif SCALING_MODE == "relative":
        X_trf = transform_relative(X_sub, b_vec, percent=RELATIVE_PERCENT)
        mode_str = "relative_pct" if RELATIVE_PERCENT else "relative_ratio"
    elif SCALING_MODE == "robust":
        X_trf = transform_robust(X_sub, b_vec)
        mode_str = "robust_iqr"
    else:
        raise ValueError(f"[ERROR] 未知のSCALING_MODE: {SCALING_MODE}")

    X_trf.index = g.index
    scaled_blocks.append(X_trf)

# ---- 全被験者を縦結合 → X_scaled_all / y_all / groups ----
X_scaled_all = pd.concat(scaled_blocks, axis=0).sort_index().astype(np.float32)
y_all = (df_ml["FMS"] >= 2).astype(int)
groups = df_ml["subject_id"]

# ---- 保存（大文字ファイル名）----
baseline_out = baseline_mat.copy()
baseline_out.index.name = "subject_id"
baseline_out.to_csv("BASELINE_VALUES.CSV", encoding="utf-8-sig")

pd.DataFrame([{
    "mode": mode_str,
    "baseline_window": f"[{BASELINE_START},{BASELINE_END})",
    "ml_window": f"[{ML_START},{ML_END})",
    "n_subjects": df_ml["subject_id"].nunique(),
    "n_rows_ml": len(df_ml)
}]).to_csv("SCALING_META.CSV", index=False, encoding="utf-8-sig")

print(f"[OK] Scaling completed (strict). mode={mode_str}  subjects={df_ml['subject_id'].nunique()}  rows={len(df_ml)}")


ValueError: [ERROR] ベースラインにNaNがあります。詳細: {'06272': ['CSI', 'CVI', 'HeartRate', 'pNN50', 'RMSSD', 'RR_interval', 'SD1', 'SD2', 'SDSD']}

In [None]:
# ===== Cell 3: Modeling & SHAP helper functions (with deterministic settings) =====
from typing import Dict, Optional
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score, accuracy_score
from xgboost import XGBClassifier
import xgboost as xgb  # for pred_contribs SHAP

def fit_xgb_classifier(
    X_train: pd.DataFrame,
    y_train: pd.Series,
    random_state: Optional[int] = None,  # 互換のため受け取るが未使用
) -> XGBClassifier:
    # 決定論性のため型を固定
    X_train = X_train.astype(np.float32)
    y_train = y_train.astype(np.int32)

    model = XGBClassifier(
        n_estimators=100,
        eval_metric="logloss",
        subsample=1.0,          # 明示
        colsample_bytree=1.0,   # 明示
        n_jobs=1,               # 単一スレッドで決定論的に
        tree_method="hist",
        # xgboost<2.0 の場合は device 引数が無いので下行はコメントアウトしてください
        device="cpu",
        seed=0, random_state=0
    )
    model.fit(X_train, y_train)
    return model

def compute_train_shap_abs_mean(model: XGBClassifier, X_ref: pd.DataFrame) -> pd.Series:
    # XGBoost組み込みSHAP（pred_contribs）で決定論的に
    X_ref = X_ref.astype(np.float32)
    dm = xgb.DMatrix(X_ref, feature_names=list(X_ref.columns))
    contribs = model.get_booster().predict(dm, pred_contribs=True)  # (n_samples, n_features+1)
    shap_vals = contribs[:, :-1]  # 最後の列はバイアス項
    abs_mean = np.abs(shap_vals).mean(axis=0)
    return pd.Series(abs_mean, index=X_ref.columns, name="abs_mean")

def evaluate_fold(
    model: XGBClassifier,
    X_test: pd.DataFrame,
    y_test: pd.Series,
) -> Dict[str, float]:
    # 型を揃えて微小差を抑制
    X_test = X_test.astype(np.float32)

    proba = model.predict_proba(X_test)[:, 1]
    pred = (proba >= 0.5).astype(int)

    if len(np.unique(y_test)) == 2:
        roc_auc = roc_auc_score(y_test, proba)
    else:
        roc_auc = float("nan")

    acc = accuracy_score(y_test, pred)
    return {"roc_auc": roc_auc, "accuracy": acc}


In [None]:
# ===== Cell 4 : LOSO → SHAP集計 & 保存（StandardScalerなし） =====
from typing import Dict
import numpy as np
import pandas as pd
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.metrics import roc_auc_score, accuracy_score

# 前セルで X_scaled_all, y_all, groups, feature_cols が定義済み前提
X_all = X_scaled_all.copy()   # 追加の標準化はしない
y_all = y_all.copy()
groups = groups.copy()

logo = LeaveOneGroupOut()

# foldごとの |SHAP|平均を記録
shap_abs_means = []
metrics_rows = []

fold_id = 0
for tr_idx, te_idx in logo.split(X_all, y_all, groups):
    fold_id += 1
    test_sid = pd.Series(groups).iloc[te_idx].unique()[0]

    X_tr, X_te = X_all.iloc[tr_idx], X_all.iloc[te_idx]
    y_tr, y_te = y_all.iloc[tr_idx], y_all.iloc[te_idx]

    # 学習（Cell 3の決定論版）
    model = fit_xgb_classifier(X_tr, y_tr)

    # 学習データ上の SHAP（|値|平均）
    abs_mean = compute_train_shap_abs_mean(model, X_tr).rename(f"fold{fold_id}")
    shap_abs_means.append(abs_mean)

    # 評価
    proba = model.predict_proba(X_te.astype(np.float32))[:, 1]
    if len(np.unique(y_te)) == 2:
        roc_auc = roc_auc_score(y_te, proba)
    else:
        roc_auc = float("nan")
    acc = accuracy_score(y_te, (proba >= 0.5).astype(int))

    metrics_rows.append({"test_subject": str(test_sid), "roc_auc": roc_auc, "accuracy": acc})

# SHAPランキング（fold列＋mean_abs）
shap_rank = pd.concat(shap_abs_means, axis=1)
shap_rank["mean_abs"] = shap_rank.mean(axis=1)
shap_rank = shap_rank.sort_values("mean_abs", ascending=False)

# 保存（大文字ファイル名）
shap_rank.to_csv("SHAP_FEATURE_RANKING.CSV", encoding="utf-8-sig")
shap_rank.head(10).to_csv("SHAP_TOP10.CSV", encoding="utf-8-sig")

metrics_df = pd.DataFrame(metrics_rows)
metrics_df.to_csv("LOSO_METRICS.CSV", index=False, encoding="utf-8-sig")

print("[OK]  SHAP Ranking -> ./SHAP_FEATURE_RANKING.CSV")
print("[OK]  SHAP TopK -> ./SHAP_TOP10.CSV")
print("[OK]  Metrics -> ./LOSO_METRICS.CSV")


# Subject 0521因幡先生
[OK]  Eval -> ROC AUC = 0.806, ACC = 0.850
# Subject 06021因幡先生
[OK]  Eval -> ROC AUC = 0.927, ACC = 0.650
# Subject 06022今村さん
[OK]  Eval -> ROC AUC = 0.947, ACC = 0.950
# Subject 06271梅野さん
[OK]  Eval -> ROC AUC = 0.972, ACC = 0.950
# Subject 06272
[OK]  Eval -> ROC AUC = 0.573, ACC = 0.400
# Subject 06273
[OK]  Eval -> ROC AUC = 0.333, ACC = 0.750
# Subject 06274
[SKIP] Eval: ROC AUC = NaN（評価不可）, ACC = 1.000
# Subject 06275
[SKIP] Eval: ROC AUC = NaN（評価不可）, ACC = 0.550
[OK]  SHAP Ranking -> ./SHAP_FEATURE_RANKING.CSV
[OK]  SHAP TopK -> ./SHAP_TOP10.CSV
[OK]  Metrics -> ./LOSO_METRICS.CSV

[INFO] shap_rank head():
                          fold1     fold2     fold3     fold4     fold5  \
HF_power               2.250399  0.824969  1.744066  1.726002  1.807171   
watch_Sweat_Rate_mean  0.969304  1.221709  1.010570  0.608620  1.153376   
watch_Heart_Rate_mean  0.694299  0.679267  0.723402  0.957407  1.037829   
FaceTemp_Max_Diff      0.592341  0.607686  0.525797  0.65685

In [None]:
# ===== Cell 5: Label SHAP table with display names & intervals =====
import pandas as pd
import numpy as np

# 既存のマップ（ノート上で定義済みのものを使う）
# title_map, interval_map が既にある前提

IN_PATH  = "SHAP_FEATURE_RANKING.CSV"
OUT_PATH = "SHAP_FEATURE_RANKING_LABELED.CSV"

shap_df = pd.read_csv(IN_PATH, encoding="utf-8-sig", index_col=0)

# 表示名と間隔を列として追加
def _disp_name(feat: str) -> str:
    return title_map.get(feat, feat)

def _interval_sec(feat: str):
    return interval_map.get(feat, np.nan)

labeled = shap_df.copy()
labeled.insert(0, "display_name", [ _disp_name(f) for f in labeled.index ])
labeled.insert(1, "interval_sec", [ _interval_sec(f) for f in labeled.index ])

# 上位10も別名で保存（任意）
TOPK_LABELED = "SHAP_TOP10_LABELED.CSV"
labeled.to_csv(OUT_PATH, encoding="utf-8-sig")
labeled.head(10).to_csv(TOPK_LABELED, encoding="utf-8-sig")

print(f"[OK]  SHAP Ranking (labeled) -> ./{OUT_PATH}")
print(f"[OK]  SHAP Top10 (labeled)   -> ./{TOPK_LABELED}")
print("\n[INFO] head(10):")
print(labeled.head(10)[["display_name", "interval_sec", "mean_abs"]])


[OK]  SHAP Ranking (labeled) -> ./SHAP_FEATURE_RANKING_LABELED.CSV
[OK]  SHAP Top10 (labeled)   -> ./SHAP_TOP10_LABELED.CSV

[INFO] head(10):
                                display_name  interval_sec  mean_abs
HF_power                            HF Power         120.0  1.651866
watch_Sweat_Rate_mean  watch_Sweat_Rate_mean           NaN  0.958438
watch_Heart_Rate_mean  watch_Heart_Rate_mean           NaN  0.898372
FaceTemp_Max_Diff          FaceTemp_Max_Diff           NaN  0.708284
LF_HF_ratio                            HF/LF         120.0  0.421509
FaceTemp_Mean_Diff        FaceTemp_Mean_Diff           NaN  0.409479
FaceTemp_Mean                  FaceTemp_Mean           NaN  0.301988
watch_Heart_Rate_std    watch_Heart_Rate_std           NaN  0.230990
pNN50                                  pNN50          60.0  0.186186
FaceTemp_Max                    FaceTemp_Max           NaN  0.182586


In [None]:
# ===== Cell 6: Plot SHAP ranking (all features & Top10) =====
import matplotlib.pyplot as plt

# 入力は前セルで作った labeled DataFrame を再利用
# まだ残ってなければ CSV を再読込
if "labeled" not in locals():
    labeled = pd.read_csv("SHAP_FEATURE_RANKING_LABELED.CSV", encoding="utf-8-sig", index_col=0)

# ソート済みのランキングを使用
ranking_all = labeled.sort_values("mean_abs", ascending=True)  # 棒を下から上に
ranking_top10 = ranking_all.tail(10)

# --- 全特徴量ランキング ---
plt.figure(figsize=(8, max(6, len(ranking_all) * 0.35)))
plt.barh(ranking_all["display_name"], ranking_all["mean_abs"])
plt.xlabel("Mean |SHAP|", fontsize=24)
plt.ylabel("Features", fontsize=24)
plt.title("SHAP Feature Importance (All)", fontsize=30)
plt.xticks(fontsize=20)
plt.yticks(fontsize=20)
plt.tight_layout()
plt.savefig("SHAP_RANKING_ALL.PNG", dpi=300)
plt.close()
print("[OK]  Plot -> ./SHAP_RANKING_ALL.PNG")

# --- Top10ランキング ---
plt.figure(figsize=(8, 6))
plt.barh(ranking_top10["display_name"], ranking_top10["mean_abs"])
plt.xlabel("Mean |SHAP|", fontsize=24)
plt.ylabel("Top 10 Features", fontsize=24)
plt.title("SHAP Feature Importance (Top10)", fontsize=30)
plt.xticks(fontsize=20)
plt.yticks(fontsize=20)
plt.tight_layout()
plt.savefig("SHAP_RANKING_TOP10.PNG", dpi=300)
plt.close()
print("[OK]  Plot -> ./SHAP_RANKING_TOP10.PNG")


[OK]  Plot -> ./SHAP_RANKING_ALL.PNG
[OK]  Plot -> ./SHAP_RANKING_TOP10.PNG


In [None]:
# ===== Cell 7 : Per-k metrics CSV + AUC plot + Confusion Matrix at best k =====
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.metrics import (
    roc_auc_score, confusion_matrix,
    accuracy_score, precision_score, recall_score, f1_score
)

RANK_LABELED = "SHAP_FEATURE_RANKING_LABELED.CSV"
RANK_RAW     = "SHAP_FEATURE_RANKING.CSV"
OUT_CSV      = "AUC_ACC_PR_REC_F1_PER_K.CSV"   # すべてのkの指標（pooled）

# ---------- 1) SHAPランキングを読み込み（labeled優先） ----------
if os.path.exists(RANK_LABELED):
    rank_df = pd.read_csv(RANK_LABELED, encoding="utf-8-sig", index_col=0)
elif os.path.exists(RANK_RAW):
    rank_df = pd.read_csv(RANK_RAW, encoding="utf-8-sig", index_col=0)
else:
    raise FileNotFoundError("SHAP_FEATURE_RANKING_LABELED.CSV / SHAP_FEATURE_RANKING.CSV が見つかりません。")

rank_df = rank_df.sort_values("mean_abs", ascending=False)
feature_order = [f for f in rank_df.index if f in df.columns]
if len(feature_order) == 0:
    raise RuntimeError("ランキングの特徴量名が df の列と一致していません。")

# ---------- 2) 学習行列の選択（スケーリング済みがあれば優先） ----------
if "X_scaled_all" in globals():
    X_source = X_scaled_all.copy()
    y_source = y_all.copy()
    g_source = groups.copy()
else:
    # スケーリング未使用のとき：ML窓を定義して取り出す
    ML_START, ML_END = 1500, 2100
    df_ml = df[(df["epoch_start"] >= ML_START) & (df["epoch_start"] < ML_END)].copy()
    if "subject_id" not in df_ml.columns:
        df_ml["subject_id"] = "SINGLE"
    X_source = df_ml[feature_cols].copy()
    y_source = (df_ml[label_col] >= 2).astype(int)
    g_source = df_ml[group_col]

# 列順をランキング順に固定 + 型
X_source = X_source.reindex(columns=[c for c in feature_order if c in X_source.columns]).astype(np.float32)

# ---------- 3) k を全探索（多 → 少）し、各 k で pooled metrics を算出 ----------
ks = list(range(len(feature_order), 0, -1))
rows = []
preds_by_k = {}   # k -> (y_true_pooled, y_pred_pooled, proba_pooled)  可視化/CM用

logo = LeaveOneGroupOut()

for k in ks:
    feats = feature_order[:k]
    X = X_source[feats]

    y_true_all, y_pred_all, proba_all = [], [], []

    for tr_idx, te_idx in logo.split(X, y_source, g_source):
        X_tr, X_te = X.iloc[tr_idx], X.iloc[te_idx]
        y_tr, y_te = y_source.iloc[tr_idx], y_source.iloc[te_idx]

        # 既存の決定論設定版トレーナ（Cell 3の関数）を使用
        model = fit_xgb_classifier(X_tr, y_tr)

        proba = model.predict_proba(X_te.astype(np.float32))[:, 1]
        y_pred = (proba >= 0.5).astype(int)

        y_true_all.append(y_te.values)
        y_pred_all.append(y_pred)
        proba_all.append(proba)

    # ---- pooled結合 ----
    y_true_k = np.concatenate(y_true_all) if y_true_all else np.array([])
    y_pred_k = np.concatenate(y_pred_all) if y_pred_all else np.array([])
    proba_k  = np.concatenate(proba_all)  if proba_all  else np.array([])

    # ---- AUC（pooled）----
    if y_true_k.size > 0 and len(np.unique(y_true_k)) == 2:
        auc_val = float(roc_auc_score(y_true_k, proba_k))
    else:
        auc_val = float("nan")

    # ---- 二値指標（pooled, 閾値0.5）----
    if y_true_k.size > 0:
        acc = float(accuracy_score(y_true_k, y_pred_k))
        pr  = float(precision_score(y_true_k, y_pred_k, zero_division=0))
        rec = float(recall_score(y_true_k, y_pred_k, zero_division=0))
        f1  = float(f1_score(y_true_k, y_pred_k, zero_division=0))
        n_pos_all = int((y_true_k == 1).sum())
        n_neg_all = int((y_true_k == 0).sum())
    else:
        acc = pr = rec = f1 = float("nan")
        n_pos_all = n_neg_all = 0

    rows.append({
        "k": int(k),
        "auc": auc_val,
        "accuracy": acc,
        "precision": pr,
        "recall": rec,
        "f1": f1,
        "n_pos": n_pos_all,
        "n_neg": n_neg_all
    })
    preds_by_k[k] = (y_true_k, y_pred_k, proba_k)

# ---------- 4) CSV保存（降順：特徴量が多い→少ない） ----------
metrics_per_k = pd.DataFrame(rows).sort_values("k", ascending=False)
metrics_per_k.to_csv(OUT_CSV, index=False, encoding="utf-8-sig")
print(f"[OK]  Per-k metrics CSV -> ./{OUT_CSV}")

# ---------- 5) AUC 図：AUC vs Number of Features（最大点＝赤い太丸、ラベルに k を表示） ----------
auc_values = metrics_per_k.set_index("k").loc[ks, "auc"].values  # ks順で配列化
valid_mask = ~np.isnan(auc_values)
if valid_mask.any():
    best_idx_in_valid = np.argmax(auc_values[valid_mask])
    best_k = np.array(ks)[np.where(valid_mask)[0][best_idx_in_valid]]
    best_auc = auc_values[valid_mask][best_idx_in_valid]
else:
    best_k, best_auc = ks[0], np.nan  # AUCが全NaNの場合のフォールバック

plt.figure(figsize=(8, 6))
plt.plot(ks, auc_values, marker='o')
if not np.isnan(best_auc):
    plt.scatter([best_k], [best_auc], s=160, color="red", edgecolors="none",
                label=f"Max AUC = {best_auc:.3f} (k={best_k})", zorder=5)
plt.gca().invert_xaxis()
plt.xlabel("Number of Features", fontsize=24)
plt.ylabel("ROC AUC (pooled)", fontsize=24)
plt.title("AUC vs Number of Features", fontsize=24)
if not np.isnan(best_auc):
    plt.legend(fontsize=20)
plt.grid(True)
plt.tight_layout()
plt.savefig("AUC_VS_NUM_FEATURES.PNG", dpi=300)
plt.close()
print("[OK]  Plot -> ./AUC_VS_NUM_FEATURES.PNG")

# ---------- 6) 最大AUC時の混同行列（pooledの0.5閾値） ----------
if not np.isnan(best_auc):
    y_true_best, y_pred_best, _ = preds_by_k[int(best_k)]
    if y_true_best.size > 0:
        cm = confusion_matrix(y_true_best, y_pred_best, labels=[0, 1])
        plt.figure(figsize=(8, 6))
        plt.imshow(cm, cmap="Blues")
        for (i, j), val in np.ndenumerate(cm):
            plt.text(j, i, f"{val:d}", ha="center", va="center",
                     fontsize=20, color=("white" if val > cm.max()/2 else "black"))
        plt.xticks([0, 1], ["Pred 0", "Pred 1"], fontsize=20)
        plt.yticks([0, 1], ["True 0", "True 1"], fontsize=20)
        plt.xlabel("Predicted", fontsize=24)
        plt.ylabel("True", fontsize=24)
        plt.title(f"Confusion Matrix at Best AUC (k={best_k})", fontsize=24)
        plt.tight_layout()
        plt.savefig("CONFUSION_MATRIX_BESTK.PNG", dpi=300)
        plt.close()
        print("[OK]  Plot -> ./CONFUSION_MATRIX_BESTK.PNG")
    else:
        print("⚠️ サンプルが空のため、混同行列は作成しませんでした。")
else:
    print("⚠️ どの k でも AUC を計算できなかったため、混同行列の作成はスキップしました。")


[OK]  Per-k metrics CSV -> ./AUC_ACC_PR_REC_F1_PER_K.CSV
[OK]  Plot -> ./AUC_VS_NUM_FEATURES.PNG
[OK]  Plot -> ./CONFUSION_MATRIX_BESTK.PNG
