In [None]:
#Cell0: 特徴量設計
import os
from pathlib import Path
from typing import Dict, Tuple

import numpy as np
import pandas as pd


# =========================
# Key parameter block
# =========================

BASE_DIR = Path(r"C:\Users\taiki\OneDrive - Science Tokyo\デスクトップ\研究\本実験結果")

SUBJECT_IDS = [
    "10061", "10063", "10064",
    "10071", "10072", "10073", "10074",
    "10081", "10082", "10083",
    "10091", "10092", "10093", "10094",
    "10101", "10102", "10103",
]

# Analysis time range (seconds)
T_START = 1770.0
T_END = 2400.0

# Sliding-window settings (this study)
WINDOW_SEC = 3            # feature window width, in seconds
SLIDE_STEP_SEC = 3      # spacing of the window end times t, in seconds (currently equal to WINDOW_SEC); compute_pc_from_mean requires PC_LAG_SEC to be a multiple of this
PC_LAG_SEC = 3            # pc compares each window mean with the one PC_LAG_SEC seconds earlier (= the previous 3 s block)

# Time shift applied to the label (FMS)
#   FMS_SHIFT_SEC = 0 : no shift (the FMS at t is that of the 30 s block containing t)
#   FMS_SHIFT_SEC > 0 : the FMS at t is that of the block containing t + FMS_SHIFT_SEC (for predicting ahead)
FMS_SHIFT_SEC = 0

# Reference: prior study (3) (VR roller coaster, LSTM)
#   WINDOW_SEC      ~ 3     # rma/max/min/pc likewise computed over a 3 s rolling window
#   SLIDE_STEP_SEC  ~ 0.5   # feature series f(t) built every 0.5 s (a 2 Hz time series)
#   sequence length = 30    # input sequence of 30 steps (= the past 15 s of f(t))
#   label           = one   # a single CS (0/1) attached to the end time of each 15 s block

# Parameters of the pc (percentage change) computation
PC_DEFAULT_VALUE = 0.0   # value used when the previous mean is missing or near zero
PC_EPS = 1e-6            # threshold of the "near zero" test

# Range of the window end times t
T_MIN_OUT = T_START + WINDOW_SEC   # 1770 + 3 = 1773.0
T_MAX_OUT = T_END                  # 2400.0

# FMS_TEXT (used exactly as provided): one string per subject holding
# space-separated integer scores. build_fms_series_for_t_grid expects 21 of
# them and maps each to a 30 s block counted from T_START.
FMS_TEXT: Dict[str, str] = {
    "10061": "0 0 0 0 0 0 0 1 1 0 0 1 0 0 0 0 0 1 0 2 1",
    "10063": "0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0",
    "10064": "0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1",
    "10071": "0 0 0 0 0 0 0 1 1 1 1 1 1 1 2 2 1 1 1 1 1",
    "10072": "0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0",
    "10073": "0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0",
    "10074": "0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0",
    "10081": "0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1",
    "10082": "0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1",
    "10083": "0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1",
    "10091": "0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1",
    "10092": "0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0",
    "10093": "0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 2 2 3 3 4 4",
    "10094": "0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 2 2 2",
    "10101": "0 0 0 0 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 3 3",
    "10102": "0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 2 2 2 2 2",
    "10103": "0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 2 3 4",
}


# Load MSSQ_percentile from the summary CSV into a dict: subject ID -> value on a 0-1 scale.
# This runs at import / cell-execution time and needs the CSV to exist.
SUMMARY_PATH = r"C:\Users\taiki\OneDrive - Science Tokyo\デスクトップ\研究\本実験結果\summary_scores.csv"
_df_summary = pd.read_csv(SUMMARY_PATH)
MSSQ_PCT_BY_ID: Dict[str, float] = {}
for _, row in _df_summary.iterrows():
    raw_id = row["ID"]
    # Skip rows whose ID is NaN (just in case)
    if pd.isna(raw_id):
        continue
    # Cast to int before str() so the key is "10061" rather than "10061.0"
    sid_key = str(int(raw_id))
    # The column is divided by 100 to obtain the 0-1 scale
    MSSQ_PCT_BY_ID[sid_key] = float(row["MSSQ_percentile"]) / 100.0


# =========================
# ユーティリティ関数
# =========================

def minmax_scale(values: np.ndarray, channel_name: str, sid: str) -> np.ndarray:
    """Min-max scale ``values`` to [0, 1] using only its finite entries.

    Non-finite entries are ignored when the range is determined and stay
    non-finite in the result.  When there is no finite entry at all, or when
    every finite entry has the same value, a warning is printed and an
    all-zero array of the same shape is returned.

    ``channel_name`` and ``sid`` are used only in the warning text.
    """
    data = np.asarray(values, float)
    finite_vals = data[np.isfinite(data)]

    if finite_vals.size == 0:
        print(f"[WARN] {sid} {channel_name}: all NaN -> set all zeros")
        return np.zeros_like(data, float)

    lo = finite_vals.min()
    hi = finite_vals.max()
    span = hi - lo
    if span == 0:
        print(f"[WARN] {sid} {channel_name}: min==max ({lo}) -> set all zeros")
        return np.zeros_like(data, float)

    return (data - lo) / span


def compute_window_features_continuous(
    times: np.ndarray,
    values: np.ndarray,
    t_grid: np.ndarray,
    window_len: float,
) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
    """Mean / max / min of ``values`` over the closed window [t - window_len, t].

    One window is evaluated for every end time ``t`` in ``t_grid``.  The
    sampling of ``times`` may be irregular; only the sample time stamps are
    used to decide window membership.

    Args:
        times: 1-D sample times, sorted in ascending order.
        values: samples aligned with ``times``; non-finite samples are ignored.
        t_grid: 1-D window end times.  Any array-like is accepted and the
            entries may come in any order, because each window is located
            independently by binary search.
        window_len: window length, in the same unit as ``times``.

    Returns:
        ``(means, maxs, mins)``, each shaped like ``t_grid``.  A window that
        contains no finite sample yields NaN in all three arrays.
    """
    times = np.asarray(times, float)
    values = np.asarray(values, float)
    t_grid = np.asarray(t_grid, float)

    means = np.full(t_grid.shape, np.nan, dtype=float)
    vmaxs = np.full(t_grid.shape, np.nan, dtype=float)
    vmins = np.full(t_grid.shape, np.nan, dtype=float)

    # Index of the first sample with time >= t - window_len ...
    starts = np.searchsorted(times, t_grid - window_len, side="left")
    # ... and one past the last sample with time <= t.
    stops = np.searchsorted(times, t_grid, side="right")

    for i, (lo, hi) in enumerate(zip(starts, stops)):
        if hi <= lo:
            continue  # no sample falls inside this window
        seg = values[lo:hi]
        segv = seg[np.isfinite(seg)]
        if segv.size == 0:
            continue  # only NaN/inf samples inside this window
        means[i] = segv.mean()
        vmaxs[i] = segv.max()
        vmins[i] = segv.min()

    return means, vmaxs, vmins


def build_hr_1s_from_rr(
    r_times: np.ndarray,
    rr_intervals: np.ndarray,
    t_start: int,
    t_end: int,
    sid: str,
) -> Tuple[np.ndarray, np.ndarray]:
    """Resample beat-wise heart rate onto a 1-step integer grid.

    Each beat k with a finite, positive RR interval contributes
    ``HR_k = 60 / RR_k``.  A grid time t takes the HR of the latest such beat
    whose time stamp is <= t, i.e. HR_k holds on [R_k, R_{k+1}) and the last
    beat's value is carried forward.  Grid times before the first usable beat
    are NaN.

    Args:
        r_times: beat time stamps (need not be sorted).
        rr_intervals: RR interval attached to each beat.
        t_start, t_end: grid limits; the grid is
            ``int(t_start), int(t_start) + 1, ..., int(t_end)``.
        sid: subject ID, used only in the error message.

    Returns:
        ``(t_grid, hr_1s)``.

    Raises:
        RuntimeError: if no RR interval is finite and > 0.
    """
    beat_t = np.asarray(r_times, float)
    rr = np.asarray(rr_intervals, float)

    usable = np.isfinite(rr) & (rr > 0)
    if not usable.any():
        raise RuntimeError(f"[ERROR] {sid} RRtime: no valid RR_interval_sec (>0)")

    # Keep the usable beats only and put them in chronological order.
    beat_t = beat_t[usable]
    beat_hr = 60.0 / rr[usable]
    chrono = np.argsort(beat_t)
    beat_t = beat_t[chrono]
    beat_hr = beat_hr[chrono]

    t_grid = np.arange(int(t_start), int(t_end) + 1)

    # Index of the latest beat at or before each grid time (-1 when none).
    latest = np.searchsorted(beat_t, t_grid, side="right") - 1
    covered = latest >= 0

    hr_1s = np.full_like(t_grid, np.nan, dtype=float)
    hr_1s[covered] = beat_hr[latest[covered]]
    return t_grid, hr_1s


def forward_fill_to_grid(
    times: np.ndarray,
    values: np.ndarray,
    t_grid: np.ndarray,
) -> np.ndarray:
    """Forward-fill irregularly sampled ``values`` onto ``t_grid``.

    Every grid time receives the value of the latest sample whose time stamp
    is <= that grid time; grid times that precede all samples are NaN.  The
    samples are sorted by time internally, and the grid may be finer or
    coarser than the sampling.
    """
    sample_t = np.asarray(times, float)
    sample_v = np.asarray(values, float)

    chrono = np.argsort(sample_t)
    sample_t, sample_v = sample_t[chrono], sample_v[chrono]

    # Number of samples at or before each grid time.
    n_seen = np.searchsorted(sample_t, t_grid, side="right")
    has_sample = n_seen > 0

    filled = np.full_like(t_grid, np.nan, dtype=float)
    filled[has_sample] = sample_v[n_seen[has_sample] - 1]
    return filled


def compute_pc_from_mean(
    mean_arr: np.ndarray,
    step_sec: float,
) -> np.ndarray:
    """Relative change of a regularly spaced mean series versus PC_LAG_SEC earlier.

    ``mean_arr`` is assumed to be sampled every ``step_sec`` seconds, so the
    reference point lies ``lag_steps = round(PC_LAG_SEC / step_sec)`` entries
    back:

      step_sec = 1.0, PC_LAG_SEC = 3  -> 3 steps back
      step_sec = 0.5, PC_LAG_SEC = 3  -> 6 steps back

    ``pc[i] = (cur - prev) / prev`` when both values are finite and
    ``|prev| > PC_EPS``; every other entry (including those without a
    reference point) is PC_DEFAULT_VALUE.

    Raises:
        RuntimeError: if PC_LAG_SEC is not an integer multiple of ``step_sec``.
    """
    series = np.asarray(mean_arr, float)
    n_pts = len(series)
    pc = np.full(n_pts, PC_DEFAULT_VALUE, float)
    if n_pts == 0:
        return pc

    lag_ratio = PC_LAG_SEC / step_sec
    lag_steps = int(round(lag_ratio))
    if not np.isclose(lag_ratio, lag_steps):
        raise RuntimeError(
            f"[ERROR] PC_LAG_SEC({PC_LAG_SEC}) is not a multiple of step_sec({step_sec})"
        )

    # Pair every position with its reference position lag_steps earlier and
    # drop the leading positions that have no reference.
    cur_idx = np.arange(n_pts)
    prev_idx = cur_idx - lag_steps
    has_prev = prev_idx >= 0
    cur_idx, prev_idx = cur_idx[has_prev], prev_idx[has_prev]

    cur = series[cur_idx]
    prev = series[prev_idx]

    # Only well-defined ratios are written; the rest keeps the default value.
    usable = np.isfinite(prev) & (np.abs(prev) > PC_EPS) & np.isfinite(cur)
    pc[cur_idx[usable]] = (cur[usable] - prev[usable]) / prev[usable]
    return pc


def build_fms_series_for_t_grid(sid: str, t_grid: np.ndarray) -> np.ndarray:
    """Look up the FMS label of subject ``sid`` for every time in ``t_grid``.

    FMS_TEXT[sid] must hold 21 integer scores, one per 30 s block counted
    from T_START.  The label at time t is the score of block

        floor((t - T_START + FMS_SHIFT_SEC) / 30)

    so FMS_SHIFT_SEC = 0 gives the block containing t, and a positive shift
    gives the block containing t + FMS_SHIFT_SEC.  Block numbers outside
    0..20 are clamped to the nearest valid block.

    Raises:
        ValueError: if FMS_TEXT[sid] does not contain exactly 21 values.
    """
    scores = [int(token) for token in FMS_TEXT[sid].split()]
    if len(scores) != 21:
        raise ValueError(f"[FMS] {sid}: expected 21 values, got {len(scores)}")
    score_arr = np.array(scores, int)

    last_block = 20
    labels = np.zeros_like(t_grid, int)
    for pos, t in enumerate(t_grid):
        block = int((t - T_START + FMS_SHIFT_SEC) // 30)
        # Clamp to the first / last block instead of indexing out of range.
        labels[pos] = score_arr[min(max(block, 0), last_block)]
    return labels


# =========================
# メイン処理
# =========================

def process_subject(sid: str) -> None:
    """Build the windowed feature table of one subject and write it as CSV.

    Reads the subject's Pulse, Sweat, FaceA, FaceB and Skinos CSVs from
    ``BASE_DIR/<sid>/OFFSET`` and the RRtime CSV from ``BASE_DIR/<sid>/FEATURE``,
    restricts them to [T_START, T_END], min-max scales each channel per
    subject, and evaluates features on the grid of window end times ``t_out``
    (T_MIN_OUT .. T_MAX_OUT in steps of SLIDE_STEP_SEC).  The result, together
    with the FMS label and the subject's MSSQ percentile, is written to
    ``BASE_DIR/<sid>/FEATURE2/<sid>_3sFeat_<SLIDE_STEP_SEC>sSlide.csv``.

    Raises:
        RuntimeError: if the subject has no MSSQ percentile, if a channel has
            no rows inside [T_START, T_END], or if FaceA and FaceB do not
            share the same time stamps.
    """
    print(f"[INFO] Subject {sid} start")

    # ---- Look up MSSQ_percentile (0-1 scale) ----
    if sid not in MSSQ_PCT_BY_ID:
        raise RuntimeError(f"[ERROR] {sid}: MSSQ_percentile not found in summary_scores.csv")
    mssq_pct01 = MSSQ_PCT_BY_ID[sid]

    # ---- Input / output paths ----
    offset_dir = BASE_DIR / sid / "OFFSET"
    feat_dir = BASE_DIR / sid / "FEATURE"
    out_dir = BASE_DIR / sid / "FEATURE2"
    out_dir.mkdir(parents=True, exist_ok=True)

    path_pulse = offset_dir / f"{sid}_Pulse.csv"
    path_sweat = offset_dir / f"{sid}_Sweat.csv"
    path_faceA = offset_dir / f"{sid}_FaceA.csv"
    path_faceB = offset_dir / f"{sid}_FaceB.csv"
    path_skinos = offset_dir / f"{sid}_Skinos.csv"
    path_rr = feat_dir / f"{sid}_RRtime.csv"

    # ---- Output time grid (window end times) ----
    # Works for any step size; the small +step/2 on the stop value keeps
    # T_MAX_OUT itself in the grid when it falls exactly on a step.
    t_out = np.arange(T_MIN_OUT, T_MAX_OUT + SLIDE_STEP_SEC / 2, SLIDE_STEP_SEC)

    # ---- FMS column ----
    fms_out = build_fms_series_for_t_grid(sid, t_out)

    # =====================
    # Pulse: 1000Hz
    # =====================
    df_pulse = pd.read_csv(path_pulse)
    df_pulse = df_pulse[(df_pulse["Time_sec"] >= T_START) & (df_pulse["Time_sec"] <= T_END)].copy()
    if df_pulse.empty:
        raise RuntimeError(f"[ERROR] {sid} Pulse: no data in [{T_START}, {T_END}]")
    # The window helper expects ascending time stamps
    df_pulse = df_pulse.sort_values("Time_sec")
    pulse_norm = minmax_scale(df_pulse["Pulse"].to_numpy(), "Pulse", sid)
    times_pulse = df_pulse["Time_sec"].to_numpy()
    pulse_mean3, pulse_max3, pulse_min3 = compute_window_features_continuous(
        times_pulse, pulse_norm, t_out, window_len=WINDOW_SEC
    )
    pulse_pc3 = compute_pc_from_mean(pulse_mean3, step_sec=SLIDE_STEP_SEC)

    # =====================
    # Sweat (GSR): 1000Hz
    # =====================
    df_sweat = pd.read_csv(path_sweat)
    df_sweat = df_sweat[(df_sweat["Time_sec"] >= T_START) & (df_sweat["Time_sec"] <= T_END)].copy()
    if df_sweat.empty:
        raise RuntimeError(f"[ERROR] {sid} Sweat: no data in [{T_START}, {T_END}]")
    df_sweat = df_sweat.sort_values("Time_sec")
    gsr_norm = minmax_scale(df_sweat["Sweat"].to_numpy(), "Sweat", sid)
    times_gsr = df_sweat["Time_sec"].to_numpy()
    gsr_mean3, gsr_max3, gsr_min3 = compute_window_features_continuous(
        times_gsr, gsr_norm, t_out, window_len=WINDOW_SEC
    )
    gsr_pc3 = compute_pc_from_mean(gsr_mean3, step_sec=SLIDE_STEP_SEC)

    # =====================
    # FaceA/B: 15Hz
    # =====================
    df_faceA = pd.read_csv(path_faceA)
    df_faceB = pd.read_csv(path_faceB)
    df_faceA = df_faceA[(df_faceA["Time_sec"] >= T_START) & (df_faceA["Time_sec"] <= T_END)].copy()
    df_faceB = df_faceB[(df_faceB["Time_sec"] >= T_START) & (df_faceB["Time_sec"] <= T_END)].copy()

    if df_faceA.empty:
        raise RuntimeError(f"[ERROR] {sid} FaceA: no data in [{T_START}, {T_END}]")
    if df_faceB.empty:
        raise RuntimeError(f"[ERROR] {sid} FaceB: no data in [{T_START}, {T_END}]")

    df_faceA = df_faceA.sort_values("Time_sec").reset_index(drop=True)
    df_faceB = df_faceB.sort_values("Time_sec").reset_index(drop=True)

    # The two face channels are combined row by row below, so they must
    # have the same number of rows and (numerically) the same time stamps.
    if len(df_faceA) != len(df_faceB):
        raise RuntimeError(f"[ERROR] {sid} FaceA/FaceB length mismatch: "
                           f"{len(df_faceA)} vs {len(df_faceB)}")
    if not np.allclose(df_faceA["Time_sec"].to_numpy(),
                       df_faceB["Time_sec"].to_numpy()):
        raise RuntimeError(f"[ERROR] {sid} FaceA/FaceB Time_sec mismatch")

    df_face = pd.DataFrame({
        "Time_sec": df_faceA["Time_sec"].to_numpy(),
        "FaceA_BoxAve": df_faceA["FaceA_BoxAve"].to_numpy(),
        "FaceB_BoxAve": df_faceB["FaceB_BoxAve"].to_numpy(),
    })

    # Smooth each channel with a centered 15-sample moving average
    # (about 1 s if the 15 Hz rate noted above holds).
    sA = df_face["FaceA_BoxAve"].astype(float)
    sB = df_face["FaceB_BoxAve"].astype(float)
    faceA_lp = sA.rolling(window=15, center=True, min_periods=1).mean().to_numpy()
    faceB_lp = sB.rolling(window=15, center=True, min_periods=1).mean().to_numpy()

    def clean_face_channel(arr: np.ndarray) -> np.ndarray:
        """Replace outliers by linear interpolation.

        Samples outside median +/- 3 * IQR are set to NaN, then all NaNs are
        filled by linear interpolation in both directions.  When the IQR is
        not positive no sample is flagged and only existing NaNs are filled.
        """
        x = arr.astype(float).copy()
        med = np.nanmedian(x)
        q1 = np.nanpercentile(x, 25)
        q3 = np.nanpercentile(x, 75)
        iqr = q3 - q1
        if iqr <= 0:
            s = pd.Series(x)
            return s.interpolate(method="linear", limit_direction="both").to_numpy()
        lower = med - 3 * iqr
        upper = med + 3 * iqr
        mask_out = (x < lower) | (x > upper)
        x[mask_out] = np.nan
        s = pd.Series(x)
        x_filled = s.interpolate(method="linear", limit_direction="both").to_numpy()
        return x_filled

    faceA_clean = clean_face_channel(faceA_lp)
    faceB_clean = clean_face_channel(faceB_lp)

    faceA_norm = minmax_scale(faceA_clean, "FaceA_BoxAve", sid)
    faceB_norm = minmax_scale(faceB_clean, "FaceB_BoxAve", sid)

    # Only the sum and the difference (B - A) of the scaled channels are used as features
    face_sum = faceA_norm + faceB_norm
    face_diff = faceB_norm - faceA_norm
    times_face = df_face["Time_sec"].to_numpy()

    face_sum_mean3, _, _ = compute_window_features_continuous(
        times_face, face_sum, t_out, window_len=WINDOW_SEC
    )
    face_diff_mean3, _, _ = compute_window_features_continuous(
        times_face, face_diff, t_out, window_len=WINDOW_SEC
    )
    face_sum_pc3 = compute_pc_from_mean(face_sum_mean3, step_sec=SLIDE_STEP_SEC)
    face_diff_pc3 = compute_pc_from_mean(face_diff_mean3, step_sec=SLIDE_STEP_SEC)

    # =====================
    # RRtime -> HR_1s -> windowed HR features
    # =====================
    df_rr = pd.read_csv(path_rr)
    if df_rr.empty:
        raise RuntimeError(f"[ERROR] {sid} RRtime: file is empty")

    # df_rr_win is used only to verify that some beat lies inside the range
    df_rr_win = df_rr[(df_rr["Time_sec"] >= T_START) & (df_rr["Time_sec"] <= T_END)]
    if df_rr_win.empty:
        raise RuntimeError(f"[ERROR] {sid} RRtime: no R waves in [{T_START}, {T_END}]")

    # NOTE(review): the unfiltered df_rr is passed on, so beats before T_START
    # can supply the HR at the start of the grid - presumably intentional; confirm.
    r_times = df_rr["Time_sec"].to_numpy()
    rr_int = df_rr["RR_interval_sec"].to_numpy()
    t_grid_hr, hr_1s = build_hr_1s_from_rr(r_times, rr_int, int(T_START), int(T_END), sid)

    hr_norm = minmax_scale(hr_1s, "HR_1s", sid)

    # Treat the 1 s HR grid like any other sampled signal and take the
    # WINDOW_SEC window features at t_out
    hr_rma3, hr_max3, hr_min3 = compute_window_features_continuous(
        t_grid_hr.astype(float), hr_norm, t_out, window_len=WINDOW_SEC
    )
    hr_pc3 = compute_pc_from_mean(hr_rma3, step_sec=SLIDE_STEP_SEC)

    # =====================
    # Skinos: 10 s sampling (per the original note) -> forward fill onto t_out
    # =====================
    df_skin = pd.read_csv(path_skinos)
    df_skin = df_skin[(df_skin["Time_sec"] >= T_START) & (df_skin["Time_sec"] <= T_END)].copy()
    if df_skin.empty:
        raise RuntimeError(f"[ERROR] {sid} Skinos: no data in [{T_START}, {T_END}]")
    df_skin = df_skin.sort_values("Time_sec")

    skin_sweat_norm = minmax_scale(df_skin["Sweat_Rate"].to_numpy(), "Skinos_Sweat_Rate", sid)
    skin_hr_norm = minmax_scale(df_skin["Heart_Rate"].to_numpy(), "Skinos_HeartRate", sid)
    skin_temp_norm = minmax_scale(df_skin["Skin_Temp"].to_numpy(), "Skinos_SkinTemp", sid)

    times_skin = df_skin["Time_sec"].to_numpy()
    skin_sweat_out = forward_fill_to_grid(times_skin, skin_sweat_norm, t_out)
    skin_hr_out = forward_fill_to_grid(times_skin, skin_hr_norm, t_out)
    skin_temp_out = forward_fill_to_grid(times_skin, skin_temp_norm, t_out)

    # Attach the subject's MSSQ_percentile (0-1 scale) to every row
    mssq_col = np.full_like(t_out, mssq_pct01, dtype=float)

    # =====================
    # Collect everything in a DataFrame and save it
    # =====================
    df_out = pd.DataFrame({
        "Time_sec": t_out,               # window end times (float), SLIDE_STEP_SEC apart
        "FMS": fms_out.astype(int),
        "Pulse_rma3": pulse_mean3,
        "Pulse_max3": pulse_max3,
        "Pulse_min3": pulse_min3,
        "Pulse_pc3": pulse_pc3,
        "HR_rma3": hr_rma3,
        "HR_max3": hr_max3,
        "HR_min3": hr_min3,
        "HR_pc3": hr_pc3,
        "GSR_rma3": gsr_mean3,
        "GSR_max3": gsr_max3,
        "GSR_min3": gsr_min3,
        "GSR_pc3": gsr_pc3,
        "FaceSum_mean3": face_sum_mean3,
        "FaceDiff_mean3": face_diff_mean3,
        "FaceSum_pc3": face_sum_pc3,
        "FaceDiff_pc3": face_diff_pc3,
        "Skinos_SweatRate": skin_sweat_out,
        "Skinos_HeartRate": skin_hr_out,
        "Skinos_SkinTemp": skin_temp_out,
        "MSSQ_percentile01": mssq_col,   # MSSQ percentile on a 0-1 scale
    })

    # The step size is part of the file name, so runs with different steps do not overwrite each other
    out_path = out_dir / f"{sid}_3sFeat_{SLIDE_STEP_SEC}sSlide.csv"
    df_out.to_csv(out_path, index=False)
    print(f"[INFO] Subject {sid} done -> {out_path}")


# Entry point: build and save the feature table of every listed subject, in order.
if __name__ == "__main__":
    for sid in SUBJECT_IDS:
        process_subject(sid)


In [None]:
#Cell1-LSTM: LOSO＋ROC-AUC 単体

import os
from pathlib import Path
from typing import Dict, List, Tuple

import numpy as np
import pandas as pd

import torch
from torch import nn
from torch.utils.data import TensorDataset, DataLoader
from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt


# -----------------------------
# Paths and basic settings
# -----------------------------
# Root folder that holds one sub-folder per subject ID
BASE_DIR = Path(r"C:\Users\taiki\OneDrive - Science Tokyo\デスクトップ\研究\本実験結果")

# Subjects taking part in the leave-one-subject-out evaluation
SUBJECT_IDS = [
    "10061", "10063", "10064",
    "10071", "10072", "10073", "10074",
    "10081", "10082", "10083",
    "10091", "10092", "10093", "10094",
    "10101", "10102", "10103",
]

# Step (seconds) between consecutive feature rows used by this cell.
# It is assigned before OUT_DIR, whose name embeds it: previously OUT_DIR read
# SLIDE_STEP_SEC before this cell had set it, which raised NameError in a fresh
# session or silently reused the value left behind by an earlier cell.
SLIDE_STEP_SEC = 1

# Output directory of this cell
CELL_NAME = "Cell1-LSTM"
OUT_DIR = BASE_DIR / f"解析{SLIDE_STEP_SEC}" / "Cell1"
OUT_DIR.mkdir(parents=True, exist_ok=True)
print(f"[INFO] Cell: {CELL_NAME}, OUT_DIR = {OUT_DIR}")

# Directory for the probability-distribution plots
PROB_PLOT_DIR = OUT_DIR / "prob_dist"
PROB_PLOT_DIR.mkdir(parents=True, exist_ok=True)
print(f"[INFO] Prob. plot dir = {PROB_PLOT_DIR}")

# -----------------------------
# Time / sequence specification
# -----------------------------

# Number of past steps fed to the LSTM; the covered time span is
# SLIDE_STEP_SEC * SEQ_LEN seconds.
SEQ_LEN = 15
Time_LEN=SLIDE_STEP_SEC*SEQ_LEN
print(f"過去{Time_LEN}sのデータ")

# First output time in the FEATURE2 files (T_START + WINDOW_SEC = 1770 + 3)
BASE_T_MIN = 1773

# Smallest target time: first output time + (SEQ_LEN - 1)
# e.g. BASE_T_MIN=1773, SEQ_LEN=30 -> 1773+29 = 1802
TARGET_T_MIN = BASE_T_MIN + (SEQ_LEN - 1)
TARGET_T_MAX = 2400     # upper limit stays at 2400 s

# Label threshold: FMS >= 1 counts as positive
FMS_POS_THRESHOLD = 1

# -----------------------------
# LSTM hyperparameters
# (per the original note, candidate settings are exported to CSV;
#  that export is not part of this section)
# -----------------------------
HIDDEN_SIZE = 32
FC_HIDDEN_SIZE = 8
DROPOUT_LSTM = 0.0
DROPOUT_FC = 0.5
LEARNING_RATE = 0.005
BATCH_SIZE = 256
N_EPOCHS = 30
WEIGHT_DECAY = 1e-4  # L2 regularization (passed to Adam as weight_decay)

# -----------------------------
# Feature ON/OFF switches
# -----------------------------
# Each entry is (column name in the FEATURE2 CSV, use it as a model input?).
FEATURE_SWITCHES: List[Tuple[str, bool]] = [
    ("Pulse_rma3",       True),
    ("Pulse_max3",       True),
    ("Pulse_min3",       True),
    ("Pulse_pc3",        True),
    ("HR_rma3",          True),
    ("HR_max3",          True),
    ("HR_min3",          True),
    ("HR_pc3",           True),
    ("GSR_rma3",         True),
    ("GSR_max3",         True),
    ("GSR_min3",         True),
    ("GSR_pc3",          True),
    ("FaceSum_mean3",    True),
    ("FaceDiff_mean3",   True),
    ("FaceSum_pc3",      True),
    ("FaceDiff_pc3",     True),
    ("Skinos_SweatRate", True),
    ("Skinos_HeartRate", False),
    ("Skinos_SkinTemp",  True),
    ("MSSQ_percentile01",  True),
]

# Names of the enabled features, in the order listed above
FEATURE_COLS: List[str] = [name for name, use in FEATURE_SWITCHES if use]
if len(FEATURE_COLS) == 0:
    raise RuntimeError("[ERROR] FEATURE_SWITCHES: 有効な特徴量が0個です（すべてFalse）。")

# Input width of the LSTM
N_FEATURES = len(FEATURE_COLS)
print(f"[INFO] Using {N_FEATURES} features:", ", ".join(FEATURE_COLS))


# -----------------------------
# LSTM モデル定義
# -----------------------------
class LSTMMotionSickness(nn.Module):
    """Sequence classifier: 1-layer unidirectional LSTM followed by a small MLP head.

    Layout: LSTM -> Dropout -> Linear(hidden_size -> fc_hidden_size) -> ReLU
    -> Linear(fc_hidden_size -> 1).  The network returns raw logits; applying
    the sigmoid is left to the loss / evaluation code.
    """

    def __init__(
        self,
        input_size: int,
        hidden_size: int = HIDDEN_SIZE,
        fc_hidden_size: int = FC_HIDDEN_SIZE,
        dropout_lstm: float = DROPOUT_LSTM,
        dropout_fc: float = DROPOUT_FC,
    ):
        super().__init__()
        self.hidden_size = hidden_size

        # Sub-modules are created in a fixed order (lstm, fc1, fc_out) so that
        # seeded weight initialisation stays reproducible.
        recurrent_kwargs = dict(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=1,
            batch_first=True,
            bidirectional=False,
            # LSTM dropout acts between stacked layers only, so with a single
            # layer it is effectively ignored.
            dropout=dropout_lstm,
        )
        self.lstm = nn.LSTM(**recurrent_kwargs)
        self.dropout = nn.Dropout(dropout_fc)
        self.fc1 = nn.Linear(hidden_size, fc_hidden_size)
        self.relu = nn.ReLU()
        self.fc_out = nn.Linear(fc_hidden_size, 1)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Map a batch of sequences to one logit per sequence.

        x: (batch, seq_len, input_size)
        returns: logits of shape (batch,)
        """
        # Only the final hidden state of the (single) layer is used.
        _, (final_hidden, _) = self.lstm(x)
        summary = final_hidden[-1]                       # (batch, hidden_size)
        head = self.relu(self.fc1(self.dropout(summary)))
        logits = self.fc_out(head)                       # (batch, 1)
        return logits.squeeze(-1)                        # (batch,)


# -----------------------------
# データ読み込み & シーケンス生成
# -----------------------------
def load_subject_df(sid: str) -> pd.DataFrame:
    """Load the FEATURE2 table of one subject, sorted by Time_sec.

    The file read is ``FEATURE2/{sid}_3sFeat_{SLIDE_STEP_SEC}sSlide.csv``.
    The step size in the name follows SLIDE_STEP_SEC (instead of a hard-coded
    "1s") so the loader picks the file that matches the step this analysis
    is configured for, using the same naming pattern as the feature
    extraction step.

    Raises:
        FileNotFoundError: if the CSV does not exist.
    """
    file_name = f"{sid}_3sFeat_{SLIDE_STEP_SEC}sSlide.csv"
    path = BASE_DIR / sid / "FEATURE2" / file_name
    if not path.exists():
        raise FileNotFoundError(f"[ERROR] Subject {sid}: file not found: {path}")
    df = pd.read_csv(path)
    # Sequences are built by row position later on, so enforce time order
    # and a clean 0..N-1 index.
    return df.sort_values("Time_sec").reset_index(drop=True)


def build_sequences_for_subject(
    sid: str,
) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
    """Turn one subject's FEATURE2 table into LSTM input sequences.

    For every row whose Time_sec lies in [TARGET_T_MIN, TARGET_T_MAX], the
    input is the block of SEQ_LEN consecutive rows ending at that row, and
    the binary label is 1 when the row's FMS >= FMS_POS_THRESHOLD.

    Returns:
        X_seq:   (N_seq, SEQ_LEN, N_FEATURES) float32
        y_seq:   (N_seq,) binary labels
        t_seq:   (N_seq,) target times (Time_sec cast to int)
        fms_seq: (N_seq,) raw FMS values

    Raises:
        RuntimeError: if a required column is missing, if any feature value
            is NaN / non-finite, if no row falls inside the target range, or
            if a target row has fewer than SEQ_LEN rows of history.
    """
    df = load_subject_df(sid)

    # Every column we are about to read must exist.
    missing = [col for col in ["Time_sec", "FMS"] + FEATURE_COLS if col not in df.columns]
    if missing:
        raise RuntimeError(f"[ERROR] Subject {sid}: missing columns in FEATURE2 csv: {missing}")

    # NaN in any feature is a hard error; report the first offending row.
    nan_mask = df[FEATURE_COLS].isna()
    rows_with_nan = nan_mask.values.any(axis=1)
    if rows_with_nan.any():
        bad_idx = int(np.argmax(rows_with_nan))
        bad_time = df.loc[bad_idx, "Time_sec"]
        bad_cols = list(nan_mask.columns[nan_mask.iloc[bad_idx]])
        raise RuntimeError(
            f"[ERROR] Subject {sid}: NaN detected at Time_sec={bad_time}, cols={bad_cols}"
        )

    times = df["Time_sec"].to_numpy().astype(int)
    fms = df["FMS"].to_numpy().astype(int)
    features = df[FEATURE_COLS].to_numpy().astype(np.float32)

    # Row positions of all target times.
    target_rows = np.flatnonzero((times >= TARGET_T_MIN) & (times <= TARGET_T_MAX))
    if target_rows.size == 0:
        raise RuntimeError(f"[ERROR] Subject {sid}: no Time_sec in [{TARGET_T_MIN}, {TARGET_T_MAX}]")

    windows: List[np.ndarray] = []
    for row in target_rows:
        idx = int(row)
        t = times[idx]

        if idx < SEQ_LEN - 1:
            raise RuntimeError(
                f"[ERROR] Subject {sid}: idx={idx}, Time_sec={t} has no enough history (need {SEQ_LEN})."
            )

        window_feat = features[idx - SEQ_LEN + 1: idx + 1, :]  # (SEQ_LEN, N_FEATURES)
        if not np.isfinite(window_feat).all():
            raise RuntimeError(
                f"[ERROR] Subject {sid}: non-finite value in sequence ending at Time_sec={t}"
            )
        windows.append(window_feat)

    X_seq = np.stack(windows).astype(np.float32)   # (N_seq, SEQ_LEN, N_FEATURES)
    t_seq = times[target_rows].astype(np.int64)
    fms_seq = fms[target_rows].astype(np.int64)
    # Binary label: FMS at or above the threshold
    y_seq = (fms_seq >= FMS_POS_THRESHOLD).astype(np.int64)

    n_pos = y_seq.sum()
    print(
        f"[INFO] Subject {sid}: target Time_sec range = {t_seq[0]}–{t_seq[-1]}, "
        f"N_seq = {len(t_seq)}, N_pos = {n_pos}, N_neg = {len(y_seq) - n_pos}"
    )

    return X_seq, y_seq, t_seq, fms_seq


# -----------------------------
# LOSO 学習・評価ループ
# -----------------------------
def train_one_fold(
    train_X: np.ndarray,
    train_y: np.ndarray,
    device: torch.device,
) -> Tuple[LSTMMotionSickness, List[float]]:
    """Train a fresh LSTM on the training data of one LOSO fold.

    Uses Adam (LEARNING_RATE, WEIGHT_DECAY) with BCEWithLogitsLoss for
    N_EPOCHS epochs over shuffled mini-batches of BATCH_SIZE.

    Returns:
        (trained model, mean training loss of each epoch)
    """
    model = LSTMMotionSickness(input_size=N_FEATURES).to(device)
    optimizer = torch.optim.Adam(
        model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY
    )
    criterion = nn.BCEWithLogitsLoss()

    # Report the class balance of the training set.
    total = len(train_y)
    positives = int(train_y.sum())
    negatives = total - positives
    share = positives / total if total > 0 else 0.0
    print(
        f"[INFO] Train stats: N={total}, N_pos={positives}, N_neg={negatives}, "
        f"pos_ratio={share:.3f}"
    )

    # Mini-batch loader, reshuffled every epoch.
    loader = DataLoader(
        TensorDataset(
            torch.from_numpy(train_X).float(),
            torch.from_numpy(train_y).float(),
        ),
        batch_size=BATCH_SIZE,
        shuffle=True,
    )

    history: List[float] = []
    for epoch in range(1, N_EPOCHS + 1):
        model.train()
        loss_total = 0.0
        batch_count = 0

        for inputs, targets in loader:
            inputs = inputs.to(device)
            targets = targets.to(device)

            optimizer.zero_grad()
            loss = criterion(model(inputs), targets)  # logits (batch,) vs. targets
            loss.backward()
            optimizer.step()

            loss_total += loss.item()
            batch_count += 1

        mean_loss = loss_total / max(batch_count, 1)
        history.append(mean_loss)

        # Log the first and last epoch and every fifth one in between.
        if epoch in (1, N_EPOCHS) or epoch % 5 == 0:
            print(f"[INFO] Epoch {epoch:02d}/{N_EPOCHS} - train_loss={mean_loss:.4f}")

    return model, history


# -----------------------------
# 確率分布プロット
# -----------------------------
def plot_probability_distributions(
    df_pred: pd.DataFrame,
    out_dir: Path,
) -> None:
    """Save histograms of the predicted probability, overall and per subject.

    One figure is written for all rows of ``df_pred`` and one for each
    SubjectID group.  Each figure overlays the density histograms of the
    two label classes.

    df_pred:
        columns used: 'SubjectID', 'Label_bin', 'Prob_FMS_ge1'
    out_dir:
        directory that receives the PNG files
    """
    # Figure styling (as specified by the user)
    TITLE_FONTSIZE = 30
    LABEL_FONTSIZE = 24
    TICK_FONTSIZE = 20
    LEGEND_FONTSIZE = 20
    LINEWIDTH = 1.5

    # (label value, alpha, legend text) of the two overlaid histograms
    class_specs = [
        (0, 0.6, "Label=0 (FMS<1)"),
        (1, 0.6, "Label=1 (FMS>=1)"),
    ]

    def _save_histogram(frame: pd.DataFrame, title: str, target: Path) -> None:
        # Draw both class histograms of `frame` and write the figure to `target`.
        fig, ax = plt.subplots(figsize=(8, 6))

        for label, alpha, lab_name in class_specs:
            vals = frame.loc[frame["Label_bin"] == label, "Prob_FMS_ge1"].values
            if len(vals) > 0:
                ax.hist(
                    vals,
                    bins=20,
                    range=(0.0, 1.0),
                    density=True,
                    alpha=alpha,
                    label=lab_name,
                    edgecolor="black",
                    linewidth=LINEWIDTH,
                )

        ax.set_xlim(0.0, 1.0)
        ax.set_xlabel("Predicted probability (FMS ≥ 1)", fontsize=LABEL_FONTSIZE)
        ax.set_ylabel("Density", fontsize=LABEL_FONTSIZE)
        ax.set_title(title, fontsize=TITLE_FONTSIZE)
        ax.tick_params(axis="both", labelsize=TICK_FONTSIZE)
        ax.legend(fontsize=LEGEND_FONTSIZE)
        plt.tight_layout()

        fig.savefig(target, dpi=300)
        plt.close(fig)

    # ---- Overall distribution (all folds combined) ----
    out_path_all = out_dir / "Cell1_LSTM_ProbDist_ALL.png"
    _save_histogram(df_pred, "All subjects – Probability distribution", out_path_all)
    print(f"[INFO] Saved global probability distribution plot to: {out_path_all}")

    # ---- Distribution per subject ----
    for sid, df_sub in df_pred.groupby("SubjectID"):
        out_path_sid = out_dir / f"Cell1_LSTM_ProbDist_{sid}.png"
        _save_histogram(df_sub, f"Subject {sid} – Probability distribution", out_path_sid)
        print(f"[INFO] Saved probability distribution plot for {sid} to: {out_path_sid}")


# -----------------------------
# main
# -----------------------------
def main():
    """
    Run the leave-one-subject-out (LOSO) LSTM experiment and save every result.

    Steps:
      1. Select the device and fix the torch / numpy seeds.
      2. Build the sequences of every subject in SUBJECT_IDS.
      3. For each subject, train on all other subjects with train_one_fold,
         predict the held-out subject and record its per-fold ROC-AUC.
      4. Compute the ROC-AUC over the predictions of all folds pooled together.
      5. Write the result CSV files and the probability-distribution plots,
         then print a per-subject summary to the console.

    Raises:
      RuntimeError: if the pooled labels of all folds contain a single class,
        in which case the global ROC-AUC is undefined.
    """
    # Device selection
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"[INFO] Using device: {device}")

    # Fix the seeds for reproducibility
    torch.manual_seed(20251206)
    np.random.seed(20251206)

    # ---- Build the sequences of all subjects ----
    # sid -> (X_seq, y_seq, t_seq, fms_seq)
    seqs_by_sid = {}
    for sid in SUBJECT_IDS:
        print(f"[INFO] ==== Build sequences: Subject {sid} ====")
        X_seq, y_seq, t_seq, fms_seq = build_sequences_for_subject(sid)
        seqs_by_sid[sid] = (X_seq, y_seq, t_seq, fms_seq)

    labels_everyone = np.concatenate([seqs_by_sid[sid][1] for sid in SUBJECT_IDS])
    print(
        f"[INFO] Overall (all subjects) target stats: "
        f"N={len(labels_everyone)}, N_pos={labels_everyone.sum()}, "
        f"pos_ratio={labels_everyone.mean():.3f}"
    )

    def predict_proba(net, X):
        """Return sigmoid probabilities of `net` for the numpy batch X (eval mode, no grad)."""
        net.eval()
        with torch.no_grad():
            batch = torch.from_numpy(X).float().to(device)
            return torch.sigmoid(net(batch)).cpu().numpy()  # (N_test,)

    # ---- LOSO training and evaluation ----
    fold_probs: List[np.ndarray] = []
    fold_labels: List[np.ndarray] = []
    pred_frames: List[pd.DataFrame] = []
    fold_summary_rows: List[Dict] = []
    epoch_loss_records: List[Dict] = []

    for test_sid in SUBJECT_IDS:
        print(f"\n[INFO] ===== LOSO fold: Test Subject {test_sid} =====")

        # Train on everyone except the held-out subject
        train_sids = [sid for sid in SUBJECT_IDS if sid != test_sid]
        train_X = np.concatenate([seqs_by_sid[sid][0] for sid in train_sids], axis=0)
        train_y = np.concatenate([seqs_by_sid[sid][1] for sid in train_sids], axis=0)
        test_X, test_y, test_t, test_fms = seqs_by_sid[test_sid]

        print(
            f"[INFO] Fold data sizes: "
            f"Train N_seq={len(train_y)}, Test N_seq={len(test_y)}"
        )

        # Train one fold
        model, epoch_losses = train_one_fold(train_X, train_y, device=device)

        # Keep the per-epoch training loss for the log file
        epoch_loss_records.extend(
            {"SubjectID": test_sid, "Epoch": ep, "TrainLoss": value}
            for ep, value in enumerate(epoch_losses, start=1)
        )

        # Predicted probabilities for the held-out subject
        probs = predict_proba(model, test_X)
        fold_probs.append(probs)
        fold_labels.append(test_y.astype(int))

        # Per-fold ROC-AUC (undefined when the test labels hold a single class)
        n_pos_test = int(test_y.sum())
        n_neg_test = int(len(test_y) - n_pos_test)
        if n_pos_test > 0 and n_neg_test > 0:
            rocauc_fold = roc_auc_score(test_y, probs)
            print(
                f"[INFO] Subject {test_sid}: ROC-AUC(test fold) = {rocauc_fold:.4f} "
                f"(N_test={len(test_y)}, N_pos={n_pos_test}, N_neg={n_neg_test})"
            )
        else:
            rocauc_fold = float("nan")
            print(
                f"[INFO] Subject {test_sid}: ROC-AUC undefined (N_pos={n_pos_test}, N_neg={n_neg_test})"
            )

        fold_summary_rows.append(
            {
                "SubjectID": test_sid,
                "N_test": int(len(test_y)),
                "N_pos_test": n_pos_test,
                "N_neg_test": n_neg_test,
                "pos_ratio_test": float(test_y.mean()),
                "ROC_AUC_test": rocauc_fold,
            }
        )

        # Per-sequence predictions of this fold
        pred_frames.append(
            pd.DataFrame(
                {
                    "SubjectID": test_sid,
                    "Time_sec": test_t,
                    "FMS": test_fms,
                    "Label_bin": test_y.astype(int),
                    "Prob_FMS_ge1": probs,
                }
            )
        )

    # ---- ROC-AUC over all folds pooled together ----
    y_all = np.concatenate(fold_labels)
    p_all = np.concatenate(fold_probs)

    n_total = len(y_all)
    n_pos = int(y_all.sum())
    n_neg = n_total - n_pos
    pos_ratio = n_pos / n_total if n_total > 0 else 0.0

    print("\n[INFO] ===== Overall LOSO result =====")
    print(
        f"[INFO] All folds combined: N={n_total}, N_pos={n_pos}, "
        f"N_neg={n_neg}, pos_ratio={pos_ratio:.3f}"
    )

    if n_pos in (0, n_total):
        raise RuntimeError(
            f"[ERROR] ROC-AUC undefined: labels are all the same "
            f"(N={n_total}, N_pos={n_pos})."
        )

    rocauc = roc_auc_score(y_all, p_all)
    print(f"[RESULT] Global ROC-AUC (LOSO, LSTM, FMS>=1) = {rocauc:.4f}")

    # ---- Save the results ----
    # 1) Global ROC-AUC summary, together with every tunable hyperparameter
    run_summary = {
        "ROC_AUC_global": rocauc,
        "N_total": n_total,
        "N_pos": n_pos,
        "N_neg": n_neg,
        "pos_ratio": pos_ratio,
        "N_features": N_FEATURES,
        "feature_list": ",".join(FEATURE_COLS),
        "WINDOW_SEC": WINDOW_SEC,
        "SLIDE_STEP_SEC": SLIDE_STEP_SEC,
        "SEQ_LEN": SEQ_LEN,
        "HIDDEN_SIZE": HIDDEN_SIZE,
        "FC_HIDDEN_SIZE": FC_HIDDEN_SIZE,
        "DROPOUT_LSTM": DROPOUT_LSTM,
        "DROPOUT_FC": DROPOUT_FC,
        "LEARNING_RATE": LEARNING_RATE,
        "BATCH_SIZE": BATCH_SIZE,
        "N_EPOCHS": N_EPOCHS,
        "WEIGHT_DECAY": WEIGHT_DECAY,
    }
    result_path = OUT_DIR / "Cell1_LSTM_LOSO_ROCAUC.csv"
    df_result = pd.DataFrame({key: [value] for key, value in run_summary.items()})
    df_result.to_csv(result_path, index=False)
    print(f"[INFO] Saved ROC-AUC result to: {result_path}")

    # 2) Detailed per-sequence predictions
    df_pred = pd.concat(pred_frames, ignore_index=True)
    pred_path = OUT_DIR / "Cell1_LSTM_LOSO_pred_detail.csv"
    df_pred.to_csv(pred_path, index=False)
    print(f"[INFO] Saved per-sequence predictions to: {pred_path}")

    # 2.5) Probability-distribution plots (per subject and overall)
    plot_probability_distributions(df_pred, PROB_PLOT_DIR)

    # 3) Per-fold summary (per-subject ROC-AUC)
    df_fold_summary = pd.DataFrame(fold_summary_rows)
    fold_summary_path = OUT_DIR / "Cell1_LSTM_LOSO_fold_summary.csv"
    df_fold_summary.to_csv(fold_summary_path, index=False)
    print(f"[INFO] Saved per-fold summary to: {fold_summary_path}")

    # 4) Training loss per epoch
    df_loss = pd.DataFrame(epoch_loss_records)
    loss_path = OUT_DIR / "Cell1_LSTM_LOSO_train_loss_by_epoch.csv"
    df_loss.to_csv(loss_path, index=False)
    print(f"[INFO] Saved train loss by epoch to: {loss_path}")

    # ---- Finally, print each subject's ROC-AUC in parentheses ----
    print("\n[SUMMARY] ===== Per-subject ROC-AUC (LOSO) =====")
    print(f"[SUMMARY] Global ROC-AUC (all folds combined) = {rocauc:.4f}")

    auc_column = df_fold_summary["ROC_AUC_test"]
    undefined = auc_column.isna()
    above_chance = ~undefined & (auc_column > 0.5)
    at_or_below_chance = ~undefined & (auc_column <= 0.5)

    def describe_subjects(mask) -> str:
        """Format the selected subjects as 'sid(auc), ...', or the 'none' marker when empty."""
        picked = df_fold_summary.loc[mask, ["SubjectID", "ROC_AUC_test"]]
        if picked.empty:
            return "なし"
        pairs = zip(picked["SubjectID"], picked["ROC_AUC_test"])
        return ", ".join(f"{subject}({auc:.3f})" for subject, auc in pairs)

    good_str = describe_subjects(above_chance)
    bad_str = describe_subjects(at_or_below_chance)
    undefined_sids = df_fold_summary.loc[undefined, "SubjectID"].tolist()
    nan_str = ", ".join(undefined_sids) if undefined_sids else "なし"

    print(f"[SUMMARY] よく当たっている被験者(>0.5): {good_str}")
    print(f"[SUMMARY] あまり当たっていない被験者(<=0.5): {bad_str}")
    print(f"[SUMMARY] 評価不能(ROC-AUC算出不可): {nan_str}")

# Entry point: run the experiment when this code is executed directly
# (not when it is imported as a module).
if __name__ == "__main__":
    main()


In [None]:
#Cell1-LSTM: LOSO＋ROC-AUC Inner GroupKFold

import os
from pathlib import Path
from typing import Dict, List, Tuple

import numpy as np
import pandas as pd

import torch
from torch import nn
from torch.utils.data import TensorDataset, DataLoader
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GroupKFold


# -----------------------------
# Paths and basic settings
# -----------------------------
BASE_DIR = Path(r"C:\Users\taiki\OneDrive - Science Tokyo\デスクトップ\研究\本実験結果")

SUBJECT_IDS = [
    "10061", "10063", "10064",
    "10071", "10072", "10073", "10074",
    "10081", "10082", "10083",
    "10091", "10092", "10093", "10094",
    "10101", "10102", "10103",
]

# Output directory for this cell (created here, at definition time, if missing)
CELL_NAME = "Cell1-LSTM"
OUT_DIR = BASE_DIR / "解析" / "Cell1"
OUT_DIR.mkdir(parents=True, exist_ok=True)
print(f"[INFO] Cell: {CELL_NAME}, OUT_DIR = {OUT_DIR}")

# -----------------------------
# Time / sequence specification
# -----------------------------
WINDOW_SEC = 3          # 3-second feature window (already applied in the FEATURE2 files)
SLIDE_STEP_SEC = 1      # 1-second slide step (already applied in the FEATURE2 files)

SEQ_LEN = 30            # Number of past steps fed to the LSTM (= the past 30 seconds)

# First output time in FEATURE2 (T_START + WINDOW_SEC = 1770 + 3)
BASE_T_MIN = 1773

# Smallest target time: first output time + (SEQ_LEN - 1)
# Example: BASE_T_MIN=1773, SEQ_LEN=30 -> 1773 + 29 = 1802
TARGET_T_MIN = BASE_T_MIN + (SEQ_LEN - 1)

TARGET_T_MAX = 2400     # Upper bound stays at 2400 seconds, as before

# Label threshold: FMS >= 1 is treated as positive
FMS_POS_THRESHOLD = 1

# -----------------------------
# LSTM hyperparameters (unidirectional, single layer)
# -----------------------------
HIDDEN_SIZE = 32
FC_HIDDEN_SIZE = 8
DROPOUT_LSTM = 0.0
DROPOUT_FC = 0.5
LEARNING_RATE = 0.005
BATCH_SIZE = 256
WEIGHT_DECAY = 1e-4          # L2 regularisation (Adam's weight_decay)

INNER_MAX_EPOCHS = 30        # Maximum number of epochs explored by the inner GroupKFold
INNER_N_SPLITS = 4           # Number of splits of the inner GroupKFold

# -----------------------------
# Feature ON/OFF switches
# -----------------------------
# Each entry is (column name in the FEATURE2 csv, use it or not).
FEATURE_SWITCHES: List[Tuple[str, bool]] = [
    ("Pulse_rma3",       True),
    ("Pulse_max3",       True),
    ("Pulse_min3",       True),
    ("Pulse_pc3",        True),
    ("HR_rma3",          True),
    ("HR_max3",          True),
    ("HR_min3",          True),
    ("HR_pc3",           True),
    ("GSR_rma3",         True),
    ("GSR_max3",         True),
    ("GSR_min3",         True),
    ("GSR_pc3",          True),
    ("FaceSum_mean3",    True),
    ("FaceDiff_mean3",   True),
    ("FaceSum_pc3",      True),
    ("FaceDiff_pc3",     True),
    ("Skinos_SweatRate", True),
    ("Skinos_HeartRate", True),
    ("Skinos_SkinTemp",  True),
]

# Names of the enabled features, in switch order; this is the model's input column order.
FEATURE_COLS: List[str] = [name for name, use in FEATURE_SWITCHES if use]
if len(FEATURE_COLS) == 0:
    # Fail fast: a model with zero input features cannot be built.
    raise RuntimeError("[ERROR] FEATURE_SWITCHES: 有効な特徴量が0個です（すべてFalse）。")

N_FEATURES = len(FEATURE_COLS)
print(f"[INFO] Using {N_FEATURES} features:", ", ".join(FEATURE_COLS))


# -----------------------------
# LSTM モデル定義
# -----------------------------
class LSTMMotionSickness(nn.Module):
    """
    Binary classifier built as:
    unidirectional single-layer LSTM -> Dropout
    -> FC(hidden_size -> fc_hidden_size) -> ReLU -> FC(fc_hidden_size -> 1).

    The network returns raw logits; the sigmoid is applied by the loss
    (BCEWithLogitsLoss) and by the evaluation code.

    NOTE: per the PyTorch documentation, nn.LSTM applies its `dropout` argument
    only between stacked layers, so with num_layers=1 a non-zero dropout_lstm
    does not regularise anything (PyTorch only emits a warning).
    """

    def __init__(
        self,
        input_size: int,
        hidden_size: int = HIDDEN_SIZE,
        fc_hidden_size: int = FC_HIDDEN_SIZE,
        dropout_lstm: float = DROPOUT_LSTM,
        dropout_fc: float = DROPOUT_FC,
    ):
        super().__init__()
        self.hidden_size = hidden_size

        # Recurrent encoder over the (batch, seq_len, input_size) input.
        self.lstm = nn.LSTM(
            input_size,
            hidden_size,
            num_layers=1,
            batch_first=True,
            dropout=dropout_lstm,
            bidirectional=False,
        )
        # Classification head applied to the final hidden state.
        self.dropout = nn.Dropout(dropout_fc)
        self.fc1 = nn.Linear(hidden_size, fc_hidden_size)
        self.relu = nn.ReLU()
        self.fc_out = nn.Linear(fc_hidden_size, 1)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        x: (batch, seq_len, input_size)
        return: logits of shape (batch,)
        """
        # Only the final hidden state is needed; per-step outputs and the cell state are discarded.
        _, (final_hidden, _) = self.lstm(x)
        # final_hidden: (num_layers, batch, hidden_size) -> last layer: (batch, hidden_size)
        summary = self.dropout(final_hidden[-1])
        logits = self.fc_out(self.relu(self.fc1(summary)))  # (batch, 1)
        return logits.squeeze(-1)                           # (batch,)


# -----------------------------
# データ読み込み & シーケンス生成
# -----------------------------
def load_subject_df(sid: str) -> pd.DataFrame:
    """
    Load FEATURE2/{sid}_3sFeat_1sSlide.csv of one subject.

    The rows are returned sorted by Time_sec with a fresh 0..N-1 index.

    Raises:
      FileNotFoundError: if the subject's CSV file does not exist.
    """
    csv_path = BASE_DIR / sid / "FEATURE2" / f"{sid}_3sFeat_1sSlide.csv"
    if csv_path.exists():
        table = pd.read_csv(csv_path)
        # Keep the rows in chronological order.
        return table.sort_values("Time_sec").reset_index(drop=True)
    raise FileNotFoundError(f"[ERROR] Subject {sid}: file not found: {csv_path}")


def build_sequences_for_subject(
    sid: str,
) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
    """
    Build the LSTM input sequences and labels of one subject.

    - Loads the subject's FEATURE2 CSV.
    - Labels every row as positive when FMS >= FMS_POS_THRESHOLD.
    - For every row whose Time_sec lies in [TARGET_T_MIN, TARGET_T_MAX], takes the
      SEQ_LEN consecutive rows ending at that row as the input sequence.
      (Windows are cut by row position, not by time value.)
    - Any NaN / non-finite feature value is an immediate error.

    Returns:
      X_seq:   (N_seq, SEQ_LEN, N_FEATURES) float32
      y_seq:   (N_seq,)  0/1 labels
      t_seq:   (N_seq,)  Time_sec of the last step of each sequence
      fms_seq: (N_seq,)  original FMS score of the last step

    Raises:
      RuntimeError: on missing columns, NaN / non-finite features, no row inside
        the target time range, or a target row with fewer than SEQ_LEN rows of history.
    """
    df = load_subject_df(sid)

    # Make sure every required column is present
    missing = [c for c in ["Time_sec", "FMS"] + FEATURE_COLS if c not in df.columns]
    if missing:
        raise RuntimeError(f"[ERROR] Subject {sid}: missing columns in FEATURE2 csv: {missing}")

    # NaN check (by specification a NaN is an immediate error); report the first offending row
    nan_table = df[FEATURE_COLS].isna()
    if nan_table.values.any():
        first_bad_row = int(np.argmax(nan_table.values.any(axis=1)))
        bad_time = df.loc[first_bad_row, "Time_sec"]
        bad_cols = [
            col
            for col, is_nan in zip(nan_table.columns, nan_table.iloc[first_bad_row])
            if is_nan
        ]
        raise RuntimeError(
            f"[ERROR] Subject {sid}: NaN detected at Time_sec={bad_time}, cols={bad_cols}"
        )

    times = df["Time_sec"].to_numpy().astype(int)
    fms = df["FMS"].to_numpy().astype(int)
    features = df[FEATURE_COLS].to_numpy().astype(np.float32)

    # Row positions whose time falls inside the target range
    in_range = (times >= TARGET_T_MIN) & (times <= TARGET_T_MAX)
    if not in_range.any():
        raise RuntimeError(f"[ERROR] Subject {sid}: no Time_sec in [{TARGET_T_MIN}, {TARGET_T_MAX}]")
    target_rows = np.flatnonzero(in_range)

    windows: List[np.ndarray] = []
    for row in target_rows:
        row = int(row)
        end_time = times[row]

        if row < SEQ_LEN - 1:
            # Should not happen in theory, but guard against too little history
            raise RuntimeError(
                f"[ERROR] Subject {sid}: idx={row}, Time_sec={end_time} has no enough history (need {SEQ_LEN})."
            )

        window = features[row - SEQ_LEN + 1 : row + 1, :]  # (SEQ_LEN, N_FEATURES)
        if not np.isfinite(window).all():
            raise RuntimeError(
                f"[ERROR] Subject {sid}: non-finite value in sequence ending at Time_sec={end_time}"
            )
        windows.append(window)

    X_seq = np.stack(windows).astype(np.float32)                              # (N_seq, SEQ_LEN, N_FEATURES)
    y_seq = (fms[target_rows] >= FMS_POS_THRESHOLD).astype(np.int64)          # (N_seq,) label: FMS >= threshold
    t_seq = times[target_rows].astype(np.int64)                               # (N_seq,)
    fms_seq = fms[target_rows].astype(np.int64)                               # (N_seq,)

    print(
        f"[INFO] Subject {sid}: target Time_sec range = {t_seq[0]}–{t_seq[-1]}, "
        f"N_seq = {len(t_seq)}, N_pos = {y_seq.sum()}, N_neg = {len(y_seq) - y_seq.sum()}"
    )

    return X_seq, y_seq, t_seq, fms_seq


# -----------------------------
# Inner GroupKFold (4分割) による best epoch 探索
# -----------------------------
def run_inner_groupkfold_for_outer_fold(
    outer_test_sid: str,
    train_X: np.ndarray,
    train_y: np.ndarray,
    train_groups: np.ndarray,
    device: torch.device,
    max_epochs: int = INNER_MAX_EPOCHS,
) -> Tuple[int, np.ndarray]:
    """
    Choose the number of training epochs for one outer LOSO fold.

    The outer-training data are split with GroupKFold(n_splits=INNER_N_SPLITS),
    using train_groups as the groups. For every inner fold a fresh
    LSTMMotionSickness is trained for max_epochs epochs and the validation
    ROC-AUC is measured after each epoch. The per-epoch AUCs are averaged over
    the inner folds and the epoch with the highest mean is selected.

    An inner fold whose validation labels contain a single class has an
    undefined ROC-AUC and cannot contribute to the mean, so it is skipped before
    any training. (When no fold is skipped, the random-number consumption is
    unchanged; a skipped fold no longer consumes random numbers.)

    Args:
      outer_test_sid: held-out subject of the outer fold (used for logging only).
      train_X: outer-training sequences, indexed along axis 0.
      train_y: 0/1 labels aligned with train_X.
      train_groups: group id (subject) of every training sequence.
      device: torch device used for training and validation.
      max_epochs: number of epochs trained and evaluated per inner fold.

    Returns:
      (best_epoch, mean_val_auc):
        best_epoch is 1-based; mean_val_auc has shape (max_epochs,) and is NaN
        for epochs where no inner fold produced a defined AUC. If every entry
        is NaN, best_epoch falls back to 1.
    """
    gkf = GroupKFold(n_splits=INNER_N_SPLITS)

    val_auc_sum = np.zeros(max_epochs, dtype=float)
    val_auc_cnt = np.zeros(max_epochs, dtype=int)

    print(
        f"[INFO][InnerCV] Start {INNER_N_SPLITS}-fold GroupKFold CV "
        f"for outer test subject {outer_test_sid}"
    )

    for inner_fold, (tr_idx, val_idx) in enumerate(
        gkf.split(train_X, train_y, groups=train_groups),
        start=1,
    ):
        X_tr = train_X[tr_idx]
        y_tr = train_y[tr_idx]
        X_val = train_X[val_idx]
        y_val = train_y[val_idx]

        print(
            f"[INFO][InnerCV]  Fold {inner_fold}: "
            f"N_train={len(y_tr)}, N_val={len(y_val)}"
        )

        # The class counts of the validation set do not depend on the epoch,
        # so decide once whether ROC-AUC is defined for this fold.
        n_pos = int(y_val.sum())
        n_neg = len(y_val) - n_pos
        if n_pos == 0 or n_neg == 0:
            # ROC-AUC undefined -> this fold is excluded from the mean; skip the training too.
            print(
                f"[INFO][InnerCV]  Fold {inner_fold}: validation ROC-AUC undefined "
                f"(N_pos={n_pos}, N_neg={n_neg}) -> fold skipped"
            )
            continue

        # Validation inputs/labels are fixed for the fold: prepare them once, not every epoch.
        X_val_dev = torch.from_numpy(X_val).float().to(device)
        y_val_int = y_val.astype(int)

        # Build the DataLoader
        train_ds = TensorDataset(
            torch.from_numpy(X_tr).float(),
            torch.from_numpy(y_tr).float(),
        )
        train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True)

        # Fresh model for this fold
        model = LSTMMotionSickness(input_size=N_FEATURES).to(device)
        optimizer = torch.optim.Adam(
            model.parameters(),
            lr=LEARNING_RATE,
            weight_decay=WEIGHT_DECAY,
        )
        criterion = nn.BCEWithLogitsLoss()

        for epoch in range(1, max_epochs + 1):
            # ---- train for one epoch ----
            model.train()
            for batch_X, batch_y in train_loader:
                batch_X = batch_X.to(device)
                batch_y = batch_y.to(device)

                optimizer.zero_grad()
                loss = criterion(model(batch_X), batch_y)
                loss.backward()
                optimizer.step()

            # ---- validation ROC-AUC after this epoch ----
            model.eval()
            with torch.no_grad():
                prob_val = torch.sigmoid(model(X_val_dev)).cpu().numpy()

            val_auc_sum[epoch - 1] += roc_auc_score(y_val_int, prob_val)
            val_auc_cnt[epoch - 1] += 1

    # ---- mean ROC-AUC per epoch (NaN where no fold contributed) ----
    mean_val_auc = np.full(max_epochs, np.nan, dtype=float)
    mask = val_auc_cnt > 0
    mean_val_auc[mask] = val_auc_sum[mask] / val_auc_cnt[mask]

    print(
        f"[INFO][InnerCV] mean validation ROC-AUC by epoch "
        f"for outer test subject {outer_test_sid}:"
    )
    for epoch in range(1, max_epochs + 1):
        m = mean_val_auc[epoch - 1]
        if np.isnan(m):
            print(f"[INFO][InnerCV]  epoch={epoch:02d}: mean_val_ROC-AUC = nan (no valid folds)")
        else:
            print(f"[INFO][InnerCV]  epoch={epoch:02d}: mean_val_ROC-AUC = {m:.4f}")

    # Pick the epoch with the highest mean among the epochs that have a defined mean
    if np.all(np.isnan(mean_val_auc)):
        best_epoch = 1
        best_auc = float("nan")
    else:
        best_idx = int(np.nanargmax(mean_val_auc))
        best_epoch = best_idx + 1
        best_auc = mean_val_auc[best_idx]

    print(
        f"[INFO][InnerCV] Selected best_epoch = {best_epoch} "
        f"(mean_val_ROC-AUC = {best_auc}) for outer test subject {outer_test_sid}"
    )

    return best_epoch, mean_val_auc


# -----------------------------
# Outer training（best_epoch だけ学習）
# -----------------------------
def train_outer_model(
    outer_test_sid: str,
    train_X: np.ndarray,
    train_y: np.ndarray,
    device: torch.device,
    n_epochs: int,
) -> Tuple[LSTMMotionSickness, List[Dict]]:
    """
    Train the final model of one outer fold on the whole outer-training set.

    The model is trained for exactly n_epochs epochs, i.e. the best_epoch
    chosen by the inner GroupKFold.

    Returns:
      model: the trained model
      loss_log: [{SubjectID, epoch, train_loss}, ...], one entry per epoch
    """
    net = LSTMMotionSickness(input_size=N_FEATURES).to(device)
    adam = torch.optim.Adam(
        net.parameters(),
        lr=LEARNING_RATE,
        weight_decay=WEIGHT_DECAY,
    )
    bce_with_logits = nn.BCEWithLogitsLoss()

    # Class balance of the training set (logged for reference only)
    total = len(train_y)
    positives = int(train_y.sum())
    negatives = total - positives
    positive_share = positives / total if total > 0 else 0.0

    print(
        f"[INFO] Train stats (outer test {outer_test_sid}): "
        f"N={total}, N_pos={positives}, N_neg={negatives}, pos_ratio={positive_share:.3f}"
    )
    print(f"[INFO] Train final model for outer test subject {outer_test_sid} with best_epoch={n_epochs}")

    batches = DataLoader(
        TensorDataset(
            torch.from_numpy(train_X).float(),
            torch.from_numpy(train_y).float(),
        ),
        batch_size=BATCH_SIZE,
        shuffle=True,
    )

    def fit_one_epoch() -> float:
        """Run one pass over the training batches and return the mean batch loss."""
        net.train()
        loss_total = 0.0
        steps = 0
        for inputs, targets in batches:
            inputs = inputs.to(device)
            targets = targets.to(device)

            adam.zero_grad()
            batch_loss = bce_with_logits(net(inputs), targets)
            batch_loss.backward()
            adam.step()

            loss_total += batch_loss.item()
            steps += 1
        # max(..., 1) avoids a division by zero when the loader yields no batch
        return loss_total / max(steps, 1)

    history: List[Dict] = []
    for epoch_no in range(1, n_epochs + 1):
        mean_loss = fit_one_epoch()
        print(
            f"[INFO] Epoch {epoch_no:02d}/{n_epochs:02d} "
            f"(outer test {outer_test_sid}) - train_loss={mean_loss:.4f}"
        )
        history.append(
            {
                "SubjectID": outer_test_sid,
                "epoch": epoch_no,
                "train_loss": mean_loss,
            }
        )

    return net, history


# -----------------------------
# main
# -----------------------------
def main():
    """
    Run the nested LOSO experiment (outer LOSO, inner GroupKFold) and save the results.

    Steps:
      1. Select the device and fix the torch / numpy seeds.
      2. Build the sequences of every subject in SUBJECT_IDS.
      3. For each held-out subject:
           - choose best_epoch with run_inner_groupkfold_for_outer_fold,
           - train the final model for best_epoch epochs with train_outer_model,
           - predict the held-out subject and record its per-fold ROC-AUC.
      4. Compute the ROC-AUC over the predictions of all folds pooled together.
      5. Write the result CSV files and print a per-subject summary.

    Raises:
      RuntimeError: if the pooled labels of all folds contain a single class,
        in which case the global ROC-AUC is undefined.
    """
    # Device selection
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"[INFO] Using device: {device}")

    # Fix the seeds for reproducibility
    torch.manual_seed(20251206)
    np.random.seed(20251206)

    # ---- Build the sequences of all subjects ----
    # sid -> (X_seq, y_seq, t_seq, fms_seq)
    seqs_by_sid = {}
    for sid in SUBJECT_IDS:
        print(f"[INFO] ==== Build sequences: Subject {sid} ====")
        X_seq, y_seq, t_seq, fms_seq = build_sequences_for_subject(sid)
        seqs_by_sid[sid] = (X_seq, y_seq, t_seq, fms_seq)

    # Overall positive ratio (for reference)
    labels_everyone = np.concatenate([seqs_by_sid[sid][1] for sid in SUBJECT_IDS])
    print(
        f"[INFO] Overall (all subjects) target stats: "
        f"N={len(labels_everyone)}, N_pos={labels_everyone.sum()}, "
        f"pos_ratio={labels_everyone.mean():.3f}"
    )

    # ---- Outer LOSO training and evaluation ----
    fold_probs: List[np.ndarray] = []
    fold_labels: List[np.ndarray] = []
    pred_frames: List[pd.DataFrame] = []      # per-sequence details
    fold_summary_rows: List[Dict] = []        # one summary row per subject
    train_loss_rows: List[Dict] = []          # loss log of the outer training

    for test_sid in SUBJECT_IDS:
        print(f"\n[INFO] ===== LOSO fold: Test Subject {test_sid} =====")

        # LOSO split: every other subject goes to the training side
        train_sids = [sid for sid in SUBJECT_IDS if sid != test_sid]
        train_X = np.concatenate([seqs_by_sid[sid][0] for sid in train_sids], axis=0)
        train_y = np.concatenate([seqs_by_sid[sid][1] for sid in train_sids], axis=0)
        # One group id (the subject id) per training sequence, for the inner GroupKFold
        train_groups = np.concatenate(
            [np.full(len(seqs_by_sid[sid][1]), sid) for sid in train_sids],
            axis=0,
        )

        test_X, test_y, test_t, test_fms = seqs_by_sid[test_sid]

        print(
            f"[INFO] Fold data sizes: "
            f"Train N_seq={len(train_y)}, Test N_seq={len(test_y)}"
        )

        # ---- Search best_epoch with the inner GroupKFold ----
        best_epoch, mean_val_auc = run_inner_groupkfold_for_outer_fold(
            outer_test_sid=test_sid,
            train_X=train_X,
            train_y=train_y,
            train_groups=train_groups,
            device=device,
            max_epochs=INNER_MAX_EPOCHS,
        )
        # Mean inner validation AUC at the chosen epoch (NaN stays NaN)
        best_inner_auc = float(mean_val_auc[best_epoch - 1])

        # ---- Train on the whole outer-training set for best_epoch epochs ----
        model, loss_log = train_outer_model(
            outer_test_sid=test_sid,
            train_X=train_X,
            train_y=train_y,
            device=device,
            n_epochs=best_epoch,
        )
        train_loss_rows.extend(loss_log)

        # ---- Predict the held-out subject ----
        model.eval()
        with torch.no_grad():
            test_batch = torch.from_numpy(test_X).float().to(device)
            probs = torch.sigmoid(model(test_batch)).cpu().numpy()

        test_labels = test_y.astype(int)
        fold_probs.append(probs)
        fold_labels.append(test_labels)

        # Per-fold ROC-AUC (undefined when the test labels hold a single class)
        n_test = len(test_y)
        n_test_pos = int(test_y.sum())
        n_test_neg = n_test - n_test_pos

        if n_test_pos > 0 and n_test_neg > 0:
            roc_auc_test = roc_auc_score(test_labels, probs)
            print(
                f"[INFO] Subject {test_sid}: ROC-AUC(test fold) = {roc_auc_test:.4f} "
                f"(N_test={n_test}, N_pos={n_test_pos}, N_neg={n_test_neg})"
            )
        else:
            roc_auc_test = float("nan")
            print(
                f"[INFO] Subject {test_sid}: ROC-AUC undefined "
                f"(N_pos={n_test_pos}, N_neg={n_test_neg})"
            )

        # Summary row of this fold
        fold_summary_rows.append(
            {
                "SubjectID": test_sid,
                "N_train": len(train_y),
                "N_train_pos": int(train_y.sum()),
                "N_train_neg": int(len(train_y) - train_y.sum()),
                "N_test": n_test,
                "N_test_pos": n_test_pos,
                "N_test_neg": n_test_neg,
                "best_epoch": best_epoch,
                "best_inner_mean_val_roc_auc": best_inner_auc,
                "test_roc_auc": roc_auc_test,
            }
        )

        # Per-sequence predictions of this fold
        pred_frames.append(
            pd.DataFrame(
                {
                    "SubjectID": test_sid,
                    "Time_sec": test_t,
                    "FMS": test_fms,
                    "Label_bin": test_labels,
                    "Prob_FMS_ge1": probs,
                }
            )
        )

    # ---- ROC-AUC over all folds pooled together ----
    y_all = np.concatenate(fold_labels)
    p_all = np.concatenate(fold_probs)

    n_total = len(y_all)
    n_pos = int(y_all.sum())
    n_neg = n_total - n_pos
    pos_ratio = n_pos / n_total if n_total > 0 else 0.0

    print("\n[INFO] ===== Overall LOSO result =====")
    print(
        f"[INFO] All folds combined: N={n_total}, N_pos={n_pos}, "
        f"N_neg={n_neg}, pos_ratio={pos_ratio:.3f}"
    )

    if n_pos in (0, n_total):
        raise RuntimeError(
            f"[ERROR] ROC-AUC undefined: labels are all the same "
            f"(N={n_total}, N_pos={n_pos})."
        )

    rocauc_global = roc_auc_score(y_all, p_all)
    print(f"[RESULT] Global ROC-AUC (LOSO, LSTM, FMS>=1) = {rocauc_global:.4f}")

    # -----------------------------
    # Save the results
    # -----------------------------
    # 1) Global ROC-AUC summary, together with the hyperparameters
    run_summary = {
        "ROC_AUC_global": rocauc_global,
        "N_total": n_total,
        "N_pos": n_pos,
        "N_neg": n_neg,
        "pos_ratio": pos_ratio,
        "N_features": N_FEATURES,
        "feature_list": ",".join(FEATURE_COLS),
        "SEQ_LEN": SEQ_LEN,
        "HIDDEN_SIZE": HIDDEN_SIZE,
        "FC_HIDDEN_SIZE": FC_HIDDEN_SIZE,
        "DROPOUT_LSTM": DROPOUT_LSTM,
        "DROPOUT_FC": DROPOUT_FC,
        "LEARNING_RATE": LEARNING_RATE,
        "BATCH_SIZE": BATCH_SIZE,
        "WEIGHT_DECAY": WEIGHT_DECAY,
        "INNER_MAX_EPOCHS": INNER_MAX_EPOCHS,
        "INNER_N_SPLITS": INNER_N_SPLITS,
    }
    result_path = OUT_DIR / "Cell1_LSTM_LOSO_ROCAUC.csv"
    df_result = pd.DataFrame({key: [value] for key, value in run_summary.items()})
    df_result.to_csv(result_path, index=False)
    print(f"[INFO] Saved ROC-AUC result to: {result_path}")

    # 2) Detailed per-sequence predictions
    df_pred = pd.concat(pred_frames, ignore_index=True)
    pred_path = OUT_DIR / "Cell1_LSTM_LOSO_pred_detail.csv"
    df_pred.to_csv(pred_path, index=False)
    print(f"[INFO] Saved per-sequence predictions to: {pred_path}")

    # 3) Per-subject fold summary
    df_fold_summary = pd.DataFrame(fold_summary_rows)
    fold_summary_path = OUT_DIR / "Cell1_LSTM_LOSO_fold_summary.csv"
    df_fold_summary.to_csv(fold_summary_path, index=False)
    print(f"[INFO] Saved per-fold summary to: {fold_summary_path}")

    # 4) Training-loss log of the outer training
    df_train_loss = pd.DataFrame(train_loss_rows)
    loss_path = OUT_DIR / "Cell1_LSTM_LOSO_train_loss_by_epoch.csv"
    df_train_loss.to_csv(loss_path, index=False)
    print(f"[INFO] Saved train loss by epoch to: {loss_path}")

    # -----------------------------
    # Short console summary
    # -----------------------------
    print("\n[SUMMARY] ===== Per-subject ROC-AUC (LOSO) =====")
    print(f"[SUMMARY] Global ROC-AUC (all folds combined) = {rocauc_global:.4f}")

    # Sort the subjects into ROC-AUC > 0.5 / <= 0.5 / undefined (NaN)
    buckets = {"good": [], "bad": [], "nan": []}
    for summary in fold_summary_rows:
        subject = summary["SubjectID"]
        fold_auc = summary["test_roc_auc"]
        if np.isnan(fold_auc):
            buckets["nan"].append(subject)
        else:
            bucket = "good" if fold_auc > 0.5 else "bad"
            buckets[bucket].append(f"{subject}({fold_auc:.3f})")

    def join_or_none(entries) -> str:
        """Comma-join the entries, or return the 'none' marker for an empty list."""
        return ", ".join(entries) if entries else "なし"

    print(f"[SUMMARY] よく当たっている被験者(>0.5): {join_or_none(buckets['good'])}")
    print(f"[SUMMARY] あまり当たっていない被験者(<=0.5): {join_or_none(buckets['bad'])}")
    print(f"[SUMMARY] 評価不能(ROC-AUC算出不可): {join_or_none(buckets['nan'])}")


# Entry point: run the nested-CV experiment when this code is executed directly
# (not when it is imported as a module).
if __name__ == "__main__":
    main()


In [None]:
#Cell2-LSTM-SHAP: SHAP重要度可視化

import os
from pathlib import Path
from typing import Dict, List, Tuple

import numpy as np
import pandas as pd
import torch
from torch import nn
from torch.utils.data import TensorDataset, DataLoader
import shap
import matplotlib.pyplot as plt


# --------------------------------
# Precondition check (assumes Cell1-LSTM has already been executed)
# --------------------------------
# Names this cell expects to find in the notebook's global namespace.
# NOTE(review): N_EPOCHS is required here, but the inner-GroupKFold Cell1-LSTM cell
# defines INNER_MAX_EPOCHS and no N_EPOCHS; presumably N_EPOCHS comes from an
# earlier fixed-epoch Cell1 variant — confirm which Cell1 must be run first.
required_globals = [
    "BASE_DIR",
    "SUBJECT_IDS",
    "FEATURE_COLS",
    "SEQ_LEN",
    "TARGET_T_MIN",
    "TARGET_T_MAX",
    "FMS_POS_THRESHOLD",
    "LSTMMotionSickness",
    "build_sequences_for_subject",
    "HIDDEN_SIZE",
    "FC_HIDDEN_SIZE",
    "DROPOUT_LSTM",
    "DROPOUT_FC",
    "LEARNING_RATE",
    "BATCH_SIZE",
    "N_EPOCHS",
]
# Fail fast with the list of missing names instead of a NameError later on.
missing = [name for name in required_globals if name not in globals()]
if missing:
    raise RuntimeError(
        "[Cell2-LSTM-SHAP] 必要な定義が見つかりません。"
        "先に Cell1-LSTM を同じノートブック上で実行してください。\n"
        f"不足: {missing}"
    )

N_FEATURES = len(FEATURE_COLS)

# Output directory for this cell (created here if missing).
# Note: this rebinds the CELL_NAME / OUT_DIR globals set by the Cell1 cell.
CELL_NAME = "Cell2-LSTM-SHAP"
OUT_DIR = BASE_DIR / "解析" / "Cell2"
OUT_DIR.mkdir(parents=True, exist_ok=True)
print(f"[INFO][SHAP] Cell: {CELL_NAME}, OUT_DIR = {OUT_DIR}")

# Sample-count settings for SHAP
N_BACKGROUND_MAX = 200   # Max number of samples used as the DeepExplainer background in each fold
N_SHAP_EVAL_MAX = 2000   # Max number of samples (training sequences) SHAP is computed for in each fold

# --------------------------------
# SHAP用ラッパーモデル
#   base_model: (batch, seq_len, feat) -> (batch,)
#   → DeepExplainer 用に (batch, 1) に変形
# --------------------------------
class WrappedLSTMForSHAP(nn.Module):
    """Adapter that adds an explicit output axis to the wrapped model's result.

    ``forward`` calls ``base_model`` and inserts a new axis at position 1, so
    a ``(batch,)`` result is returned as ``(batch, 1)``.
    """

    def __init__(self, base_model: nn.Module):
        super().__init__()
        self.base_model = base_model

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        raw_output = self.base_model(x)
        # Insert the output axis right after the batch axis.
        return torch.unsqueeze(raw_output, dim=1)


# -----------------------------
# 1 fold 学習（Cell1と同仕様）
# -----------------------------
def train_one_fold_for_shap(
    train_X: np.ndarray,
    train_y: np.ndarray,
    device: torch.device,
) -> "LSTMMotionSickness":
    """Train one LSTM on a single fold's training data for the SHAP analysis.

    The model and optimizer are configured from the hyper-parameter globals
    that this cell's prerequisite check requires (HIDDEN_SIZE, FC_HIDDEN_SIZE,
    DROPOUT_LSTM, DROPOUT_FC, LEARNING_RATE, BATCH_SIZE, N_EPOCHS). They are
    passed explicitly so the call works whether or not the model class
    declares defaults for them. If a global WEIGHT_DECAY is defined it is
    used as Adam's weight decay; otherwise 0.0 (Adam's default) is used.

    Args:
        train_X: Training sequences; converted to a float tensor.
        train_y: Binary labels, one per sequence.
        device: Device on which the model is trained.

    Returns:
        The trained model (left in training mode; callers switch to eval).
    """
    model = LSTMMotionSickness(
        input_size=N_FEATURES,
        hidden_size=HIDDEN_SIZE,
        fc_hidden_size=FC_HIDDEN_SIZE,
        dropout_lstm=DROPOUT_LSTM,
        dropout_fc=DROPOUT_FC,
    ).to(device)
    optimizer = torch.optim.Adam(
        model.parameters(),
        lr=LEARNING_RATE,
        # WEIGHT_DECAY is optional: not every Cell1 variant defines it globally.
        weight_decay=globals().get("WEIGHT_DECAY", 0.0),
    )
    criterion = nn.BCEWithLogitsLoss()

    n_train = len(train_y)
    n_pos = int(train_y.sum())
    n_neg = n_train - n_pos
    pos_ratio = n_pos / n_train if n_train > 0 else 0.0
    print(
        f"[INFO][SHAP] Train stats: N={n_train}, N_pos={n_pos}, N_neg={n_neg}, "
        f"pos_ratio={pos_ratio:.3f}"
    )

    X_tensor = torch.from_numpy(train_X).float()
    y_tensor = torch.from_numpy(train_y).float()
    dataset = TensorDataset(X_tensor, y_tensor)
    loader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)

    for epoch in range(1, N_EPOCHS + 1):
        model.train()
        running_loss = 0.0
        n_batches = 0

        for batch_X, batch_y in loader:
            batch_X = batch_X.to(device)
            batch_y = batch_y.to(device)

            optimizer.zero_grad()
            logits = model(batch_X)
            loss = criterion(logits, batch_y)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            n_batches += 1

        # max(..., 1) avoids a division by zero when the loader is empty.
        avg_loss = running_loss / max(n_batches, 1)
        # Log the first epoch, every fifth epoch, and the final epoch.
        if epoch % 5 == 0 or epoch == 1 or epoch == N_EPOCHS:
            print(f"[INFO][SHAP] Epoch {epoch:02d}/{N_EPOCHS} - train_loss={avg_loss:.4f}")

    return model


# -----------------------------
# メイン処理（LOSO＋SHAP）
# -----------------------------
def main():
    """Train one LSTM per LOSO fold, compute SHAP values, and save the results.

    For each subject in SUBJECT_IDS a model is trained on the sequences of
    all other subjects. SHAP values are then computed with
    ``shap.DeepExplainer`` on a random subset of those training sequences,
    and mean(|SHAP|) per feature is recorded for the fold. After all folds,
    the across-fold mean/std ranking and the fold-wise importance matrix are
    written as CSV files under OUT_DIR, together with a bar plot and a
    beeswarm summary plot.

    Raises:
        RuntimeError: If the SHAP array does not have shape
            (N_eval, SEQ_LEN, N_FEATURES) after normalisation.
    """
    # Device selection
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"[INFO][SHAP] Using device: {device}")

    torch.manual_seed(20251206)
    np.random.seed(20251206)

    # --- Build sequences for every subject (reuses the Cell1 helper) ---
    X_by_sid: Dict[str, np.ndarray] = {}
    y_by_sid: Dict[str, np.ndarray] = {}

    for sid in SUBJECT_IDS:
        print(f"[INFO][SHAP] ==== Build sequences: Subject {sid} ====")
        # Only features and labels are needed here; times and raw FMS are unused.
        X_seq, y_seq, _t_seq, _fms_seq = build_sequences_for_subject(sid)
        X_by_sid[sid] = X_seq
        y_by_sid[sid] = y_seq

    # Overall positive ratio (for reference only)
    all_y_tmp = np.concatenate([y_by_sid[sid] for sid in SUBJECT_IDS])
    print(
        f"[INFO][SHAP] Overall (all subjects) target stats: "
        f"N={len(all_y_tmp)}, N_pos={all_y_tmp.sum()}, "
        f"pos_ratio={all_y_tmp.mean():.3f}"
    )

    # --- Compute SHAP in every LOSO fold ---
    fold_importances: List[np.ndarray] = []
    fold_meta: List[Dict[str, object]] = []

    # For the beeswarm plot: per-fold SHAP values and feature values,
    # both averaged over the time axis.
    shap_samples_list: List[np.ndarray] = []
    feature_values_list: List[np.ndarray] = []

    for fold_idx, test_sid in enumerate(SUBJECT_IDS):
        print(f"\n[INFO][SHAP] ===== LOSO fold {fold_idx+1}/{len(SUBJECT_IDS)}: Test {test_sid} =====")

        # Training data: every subject except the held-out one
        train_sids = [sid for sid in SUBJECT_IDS if sid != test_sid]
        train_X = np.concatenate([X_by_sid[sid] for sid in train_sids], axis=0)  # (N_train, SEQ_LEN, N_FEATURES)
        train_y = np.concatenate([y_by_sid[sid] for sid in train_sids], axis=0)  # (N_train,)

        print(
            f"[INFO][SHAP] Fold data sizes: "
            f"Train N_seq={len(train_y)}, Test N_seq={len(y_by_sid[test_sid])}"
        )

        # --- Train the model ---
        model = train_one_fold_for_shap(train_X, train_y, device=device)
        model.eval()

        # --- Sample the SHAP background set ---
        n_train = train_X.shape[0]
        if n_train > N_BACKGROUND_MAX:
            idx_bg = np.random.choice(n_train, size=N_BACKGROUND_MAX, replace=False)
            bg_X = train_X[idx_bg]
        else:
            bg_X = train_X

        print(f"[INFO][SHAP] Using {len(bg_X)} samples as background")
        background = torch.from_numpy(bg_X).float().to(device)

        # --- Samples to explain (a subset of the training sequences) ---
        if n_train > N_SHAP_EVAL_MAX:
            idx_eval = np.random.choice(n_train, size=N_SHAP_EVAL_MAX, replace=False)
            X_eval = train_X[idx_eval]
        else:
            X_eval = train_X

        print(f"[INFO][SHAP] Computing SHAP on {len(X_eval)} training sequences")
        X_eval_tensor = torch.from_numpy(X_eval).float().to(device)

        # --- Build the DeepExplainer and compute SHAP values ---
        wrapped_model = WrappedLSTMForSHAP(model).to(device)
        # cuDNN's RNN kernels refuse to run backward while the module is in
        # eval mode ("cudnn RNN backward can only be called in training mode").
        # The explainer needs gradients of the eval-mode model, so cuDNN is
        # disabled for this section; the context manager restores the flags.
        with torch.backends.cudnn.flags(enabled=False):
            explainer = shap.DeepExplainer(wrapped_model, background)
            # The additivity check is turned off because it frequently fails
            # for recurrent models.
            shap_values = explainer.shap_values(X_eval_tensor, check_additivity=False)

        # The result is either an array shaped
        # (N_eval, SEQ_LEN, N_FEATURES, [output_dim]) or a list of such arrays.
        if isinstance(shap_values, list):
            sv = shap_values[0]
        else:
            sv = shap_values

        if isinstance(sv, torch.Tensor):
            sv = sv.detach().cpu().numpy()
        else:
            sv = np.array(sv)

        # Drop a trailing singleton output axis:
        # (N_eval, SEQ_LEN, N_FEATURES, 1) -> (N_eval, SEQ_LEN, N_FEATURES)
        if sv.ndim == 4 and sv.shape[-1] == 1:
            sv = sv[..., 0]

        # Shape check
        if sv.ndim != 3 or sv.shape[1] != SEQ_LEN or sv.shape[2] != N_FEATURES:
            raise RuntimeError(
                f"[ERROR][SHAP] Unexpected SHAP shape: {sv.shape}, "
                f"expected (N_eval, {SEQ_LEN}, {N_FEATURES})"
            )

        # --- For the beeswarm plot: average over time to get a 2-D array ---
        # sv_mean_t: (N_eval, N_FEATURES), mean contribution over the sequence
        sv_mean_t = sv.mean(axis=1)

        # Time-averaged feature values (used to colour the points)
        X_eval_np = X_eval_tensor.detach().cpu().numpy()  # (N_eval, SEQ_LEN, N_FEATURES)
        X_mean_t = X_eval_np.mean(axis=1)                 # (N_eval, N_FEATURES)

        shap_samples_list.append(sv_mean_t)
        feature_values_list.append(X_mean_t)

        # --- Fold-level importance: mean(|SHAP|) over samples and time ---
        abs_sv = np.abs(sv)
        imp_fold = abs_sv.mean(axis=(0, 1))  # (N_FEATURES,)
        fold_importances.append(imp_fold)
        fold_meta.append({"fold_idx": fold_idx, "test_subject": test_sid})

        # Print this fold's top features
        order = np.argsort(-imp_fold)  # descending
        top_k = min(5, N_FEATURES)
        print("[INFO][SHAP] Top features in this fold:")
        for i in range(top_k):
            j = order[i]
            print(f"  {i+1}. {FEATURE_COLS[j]} : mean|SHAP| = {imp_fold[j]:.4e}")

    # --- Average across folds to obtain the final importance ---
    imp_mat = np.stack(fold_importances, axis=0)  # (n_folds, N_FEATURES)
    mean_imp = imp_mat.mean(axis=0)               # (N_FEATURES,)
    std_imp = imp_mat.std(axis=0)                 # (N_FEATURES,)

    order_global = np.argsort(-mean_imp)          # descending

    # Build the ranking table
    rows = [
        {
            "rank": rank,
            "feature": FEATURE_COLS[idx],
            "mean_abs_shap": float(mean_imp[idx]),
            "std_abs_shap": float(std_imp[idx]),
        }
        for rank, idx in enumerate(order_global, start=1)
    ]
    df_rank = pd.DataFrame(rows)

    # --- Save the overall ranking as CSV ---
    rank_path = OUT_DIR / "Cell2_LSTM_SHAP_feature_importance.csv"
    df_rank.to_csv(rank_path, index=False)
    print(f"[INFO][SHAP] Saved SHAP feature ranking to: {rank_path}")

    # Also save the fold-wise importance matrix
    df_fold_imp = pd.DataFrame(
        imp_mat,
        columns=[f"SHAP_{name}" for name in FEATURE_COLS],
    )
    df_fold_imp.insert(0, "test_subject", [m["test_subject"] for m in fold_meta])
    df_fold_imp.insert(0, "fold_idx", [m["fold_idx"] for m in fold_meta])
    fold_imp_path = OUT_DIR / "Cell2_LSTM_SHAP_feature_importance_by_fold.csv"
    df_fold_imp.to_csv(fold_imp_path, index=False)
    print(f"[INFO][SHAP] Saved fold-wise SHAP importances to: {fold_imp_path}")

    # --- Bar plot of feature importance, mean(|SHAP|) ---
    plt.figure(figsize=(8, 6))
    idxs = order_global  # descending importance
    y_pos = np.arange(len(idxs))

    plt.barh(y_pos, mean_imp[idxs])
    plt.yticks(y_pos, [FEATURE_COLS[i] for i in idxs], fontsize=20)
    plt.gca().invert_yaxis()

    plt.xlabel("Mean |SHAP| (over samples & time)", fontsize=24)
    plt.title("LSTM (30s history) SHAP feature importance", fontsize=30)
    plt.xticks(fontsize=20)

    plt.tight_layout()

    fig_path = OUT_DIR / "Cell2_LSTM_SHAP_feature_importance_bar.png"
    plt.savefig(fig_path, dpi=300)
    plt.close()
    print(f"[INFO][SHAP] Saved SHAP bar plot to: {fig_path}")

    # --- SHAP summary beeswarm plot (all folds combined) ---
    # Best effort: a plotting failure is reported as a warning and must not
    # discard the CSV files and bar plot already written above.
    try:
        shap_all = np.concatenate(shap_samples_list, axis=0)   # (N_total, N_FEATURES)
        X_all_plot = np.concatenate(feature_values_list, axis=0)
        X_all_df = pd.DataFrame(X_all_plot, columns=FEATURE_COLS)

        shap.summary_plot(
            shap_all,
            X_all_df,
            feature_names=FEATURE_COLS,
            show=False,
            max_display=len(FEATURE_COLS),
        )

        fig = plt.gcf()
        ax = plt.gca()
        ax.set_xlabel("SHAP value (impact on model output)", fontsize=24)
        fig.suptitle("LSTM (30s history) SHAP summary (all folds)", fontsize=30, y=1.02)

        for lbl in ax.get_xticklabels():
            lbl.set_fontsize(20)
        for lbl in ax.get_yticklabels():
            lbl.set_fontsize(20)

        fig.tight_layout()

        beeswarm_path = OUT_DIR / "Cell2_LSTM_SHAP_summary_beeswarm.png"
        fig.savefig(beeswarm_path, dpi=300, bbox_inches="tight")
        plt.close(fig)
        print(f"[INFO][SHAP] Saved SHAP summary beeswarm plot to: {beeswarm_path}")
    except Exception as e:
        print(f"[WARN][SHAP] Failed to create SHAP summary beeswarm plot: {e}")


# Entry point: run the LOSO + SHAP analysis only when executed as the top-level script.
if __name__ == "__main__":
    main()


In [None]:
#Cell1-LSTM: LSTMパラメータ探索
import os
from pathlib import Path
from typing import Dict, List, Tuple, Any

import numpy as np
import pandas as pd

import torch
from torch import nn
from torch.utils.data import TensorDataset, DataLoader
from sklearn.metrics import roc_auc_score


# -----------------------------
# Paths and basic settings
# -----------------------------
BASE_DIR = Path(r"C:\Users\taiki\OneDrive - Science Tokyo\デスクトップ\研究\本実験結果")

SUBJECT_IDS = [
    "10061", "10063", "10064",
    "10071", "10072", "10073", "10074",
    "10081", "10082", "10083",
    "10091", "10092", "10093", "10094",
    "10101", "10102", "10103",
]

# Output directory for this cell (created if it does not exist yet)
CELL_NAME = "Cell1-LSTM"
OUT_DIR = BASE_DIR / "解析" / "Cell1_12071"
OUT_DIR.mkdir(parents=True, exist_ok=True)
print(f"[INFO] Cell: {CELL_NAME}, OUT_DIR = {OUT_DIR}")

# -----------------------------
# Time / sequence specification (SEQ_LEN is fixed)
# -----------------------------
WINDOW_SEC = 3          # 3-second window (already applied when FEATURE2 was produced)
SLIDE_STEP_SEC = 1      # 1-second step (already applied when FEATURE2 was produced)

# Number of past steps fed to the LSTM (= the past SEQ_LEN seconds); fixed
SEQ_LEN = 10

# First output time in FEATURE2 (T_START + WINDOW_SEC = 1770 + 3)
BASE_T_MIN = 1773

# Earliest target time: first output time + (SEQ_LEN - 1)
TARGET_T_MIN = BASE_T_MIN + (SEQ_LEN - 1)
TARGET_T_MAX = 2400     # upper bound stays at 2400 seconds

# Label threshold: FMS >= 1 is treated as positive
FMS_POS_THRESHOLD = 1

# -----------------------------
# LSTM hyper-parameters (default values)
#   -> CONFIG_LIST in main() sets the values actually used for each run
# -----------------------------
DEFAULT_FC_HIDDEN_SIZE = 8
DEFAULT_DROPOUT_LSTM = 0.0
DEFAULT_DROPOUT_FC = 0.5
DEFAULT_LEARNING_RATE = 0.005
DEFAULT_BATCH_SIZE = 256
DEFAULT_N_EPOCHS = 10   # baseline value only (the epoch count actually used is config["N_EPOCHS"])

# -----------------------------
# Feature on/off switches
# -----------------------------
# Each entry is (column name, use it?); only entries set to True become model inputs.
FEATURE_SWITCHES: List[Tuple[str, bool]] = [
    ("Pulse_rma3",       True),
    ("Pulse_max3",       True),
    ("Pulse_min3",       True),
    ("Pulse_pc3",        True),
    ("HR_rma3",          True),
    ("HR_max3",          True),
    ("HR_min3",          True),
    ("HR_pc3",           True),
    ("GSR_rma3",         True),
    ("GSR_max3",         True),
    ("GSR_min3",         True),
    ("GSR_pc3",          True),
    ("FaceSum_mean3",    True),
    ("FaceDiff_mean3",   True),
    ("FaceSum_pc3",      True),
    ("FaceDiff_pc3",     True),
    ("Skinos_SweatRate", True),
    ("Skinos_HeartRate", False),
    ("Skinos_SkinTemp",  True),
    ("MSSQ_percentile01",  True),
]

# Enabled feature columns, in the order listed above.
FEATURE_COLS: List[str] = [name for name, use in FEATURE_SWITCHES if use]
if len(FEATURE_COLS) == 0:
    raise RuntimeError("[ERROR] FEATURE_SWITCHES: 有効な特徴量が0個です（すべてFalse）。")

N_FEATURES = len(FEATURE_COLS)
print(f"[INFO] Using {N_FEATURES} features:", ", ".join(FEATURE_COLS))


# -----------------------------
# LSTM モデル定義
# -----------------------------
class LSTMMotionSickness(nn.Module):
    """Binary classifier: single-layer unidirectional LSTM plus a small FC head.

    Pipeline: LSTM -> Dropout -> Linear(hidden_size -> fc_hidden_size) -> ReLU
    -> Linear(fc_hidden_size -> 1). The output is a raw logit; the sigmoid is
    applied by the loss / evaluation code.
    """

    def __init__(
        self,
        input_size: int,
        hidden_size: int,
        fc_hidden_size: int,
        dropout_lstm: float,
        dropout_fc: float,
    ):
        super().__init__()
        self.hidden_size = hidden_size

        # Layers are created in a fixed order (lstm, fc1, fc_out) so that
        # seeded parameter initialisation stays reproducible.
        lstm_kwargs = dict(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=1,
            batch_first=True,
            bidirectional=False,
            dropout=dropout_lstm,
        )
        self.lstm = nn.LSTM(**lstm_kwargs)
        self.dropout = nn.Dropout(dropout_fc)
        self.fc1 = nn.Linear(hidden_size, fc_hidden_size)
        self.relu = nn.ReLU()
        self.fc_out = nn.Linear(fc_hidden_size, 1)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Map ``x`` of shape (batch, seq_len, input_size) to logits of shape (batch,)."""
        _, (final_hidden, _) = self.lstm(x)
        # Final hidden state of the last (only) layer: (batch, hidden_size)
        features = self.dropout(final_hidden[-1])
        activated = self.relu(self.fc1(features))
        logit = self.fc_out(activated)       # (batch, 1)
        return logit.squeeze(-1)             # (batch,)


# -----------------------------
# データ読み込み & シーケンス生成
# -----------------------------
def load_subject_df(sid: str) -> pd.DataFrame:
    """Read FEATURE2/{sid}_3sFeat_1sSlide.csv and return it sorted by Time_sec.

    Raises:
        FileNotFoundError: If the subject's CSV file does not exist.
    """
    csv_path = BASE_DIR / sid / "FEATURE2" / f"{sid}_3sFeat_1sSlide.csv"
    if csv_path.exists():
        frame = pd.read_csv(csv_path)
        # Sort by time and renumber rows from 0.
        return frame.sort_values("Time_sec").reset_index(drop=True)
    raise FileNotFoundError(f"[ERROR] Subject {sid}: file not found: {csv_path}")


def build_sequences_for_subject(
    sid: str,
) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
    """Build the LSTM input sequences and labels for one subject.

    Steps:
      - load the subject's FEATURE2 CSV;
      - for every row whose Time_sec lies in [TARGET_T_MIN, TARGET_T_MAX],
        take that row and the SEQ_LEN - 1 rows before it as one sequence;
      - label the sequence 1 if the row's FMS >= FMS_POS_THRESHOLD, else 0.

    History is taken by row position, so the rows are presumably expected to
    be contiguous in time (not verified here).

    Returns:
        (X_seq, y_seq, t_seq, fms_seq) where X_seq is float32 with shape
        (N_seq, SEQ_LEN, N_FEATURES) and the others are int64 with shape
        (N_seq,).

    Raises:
        FileNotFoundError: Propagated from ``load_subject_df``.
        RuntimeError: If required columns are missing, a feature value is
            NaN or non-finite, FMS is NaN in a target row, no row falls in
            the target time range, or a target row lacks SEQ_LEN rows of
            history.
    """
    df = load_subject_df(sid)

    # All required columns must be present.
    required_cols = ["Time_sec", "FMS"] + FEATURE_COLS
    missing = [c for c in required_cols if c not in df.columns]
    if missing:
        raise RuntimeError(f"[ERROR] Subject {sid}: missing columns in FEATURE2 csv: {missing}")

    # NaN check on features (spec: any NaN is an immediate error).
    nan_mask = df[FEATURE_COLS].isna()
    if nan_mask.values.any():
        bad_idx = np.where(nan_mask.values)[0][0]
        bad_time = df.loc[bad_idx, "Time_sec"]
        bad_cols = list(nan_mask.columns[nan_mask.iloc[bad_idx]])
        raise RuntimeError(
            f"[ERROR] Subject {sid}: NaN detected at Time_sec={bad_time}, cols={bad_cols}"
        )

    times = df["Time_sec"].to_numpy().astype(int)
    features = df[FEATURE_COLS].to_numpy().astype(np.float32)

    # The target time range must contain at least one row.
    target_mask = (times >= TARGET_T_MIN) & (times <= TARGET_T_MAX)
    if not target_mask.any():
        raise RuntimeError(f"[ERROR] Subject {sid}: no Time_sec in [{TARGET_T_MIN}, {TARGET_T_MAX}]")

    # A NaN FMS cast to int becomes an arbitrary value and would silently
    # turn into a negative label, so reject it for any row used as a target.
    fms_missing = df["FMS"].isna().to_numpy() & target_mask
    if fms_missing.any():
        bad_time = df.loc[int(np.flatnonzero(fms_missing)[0]), "Time_sec"]
        raise RuntimeError(
            f"[ERROR] Subject {sid}: NaN detected at Time_sec={bad_time}, cols=['FMS']"
        )
    fms = df["FMS"].to_numpy().astype(int)

    X_list: List[np.ndarray] = []
    y_list: List[int] = []
    t_list: List[int] = []
    fms_list: List[int] = []

    for idx in np.flatnonzero(target_mask):
        t = times[idx]

        if idx < SEQ_LEN - 1:
            raise RuntimeError(
                f"[ERROR] Subject {sid}: idx={idx}, Time_sec={t} has no enough history (need {SEQ_LEN})."
            )

        window_feat = features[idx - SEQ_LEN + 1: idx + 1, :]  # (SEQ_LEN, N_FEATURES)
        if not np.isfinite(window_feat).all():
            raise RuntimeError(
                f"[ERROR] Subject {sid}: non-finite value in sequence ending at Time_sec={t}"
            )

        X_list.append(window_feat)
        # Label: positive when FMS reaches the threshold.
        y_list.append(1 if fms[idx] >= FMS_POS_THRESHOLD else 0)
        t_list.append(t)
        fms_list.append(int(fms[idx]))

    X_seq = np.stack(X_list).astype(np.float32)   # (N_seq, SEQ_LEN, N_FEATURES)
    y_seq = np.array(y_list, dtype=np.int64)
    t_seq = np.array(t_list, dtype=np.int64)
    fms_seq = np.array(fms_list, dtype=np.int64)

    # No logging here; details are inspected through the saved CSV files.
    return X_seq, y_seq, t_seq, fms_seq


def build_sequences_all_subjects() -> Tuple[
    Dict[str, np.ndarray],
    Dict[str, np.ndarray],
    Dict[str, np.ndarray],
    Dict[str, np.ndarray],
]:
    """Build sequences for every subject in SUBJECT_IDS with the fixed SEQ_LEN.

    Returns four dicts keyed by subject ID: features, binary labels, target
    times, and raw FMS values. Also prints the pooled label statistics.
    """
    per_subject = {sid: build_sequences_for_subject(sid) for sid in SUBJECT_IDS}

    # Split the per-subject 4-tuples into one dict per component.
    X_by_sid: Dict[str, np.ndarray] = {sid: parts[0] for sid, parts in per_subject.items()}
    y_by_sid: Dict[str, np.ndarray] = {sid: parts[1] for sid, parts in per_subject.items()}
    t_by_sid: Dict[str, np.ndarray] = {sid: parts[2] for sid, parts in per_subject.items()}
    fms_by_sid: Dict[str, np.ndarray] = {sid: parts[3] for sid, parts in per_subject.items()}

    pooled_labels = np.concatenate([y_by_sid[sid] for sid in SUBJECT_IDS])
    print(
        f"[INFO] Overall (all subjects) target stats (SEQ_LEN={SEQ_LEN}): "
        f"N={len(pooled_labels)}, N_pos={pooled_labels.sum()}, "
        f"pos_ratio={pooled_labels.mean():.3f}"
    )

    return X_by_sid, y_by_sid, t_by_sid, fms_by_sid


# -----------------------------
# 1 fold 学習
# -----------------------------
def train_one_fold(
    train_X: np.ndarray,
    train_y: np.ndarray,
    device: torch.device,
    config: Dict[str, Any],
) -> Tuple[LSTMMotionSickness, List[float]]:
    """Train an LSTM on the training data of a single LOSO fold.

    Model size, dropout, learning rate, weight decay, batch size and epoch
    count are all read from ``config``.

    Returns:
        (trained model, list with the mean training loss of each epoch).
    """
    net = LSTMMotionSickness(
        input_size=N_FEATURES,
        hidden_size=config["HIDDEN_SIZE"],
        fc_hidden_size=config["FC_HIDDEN_SIZE"],
        dropout_lstm=config["DROPOUT_LSTM"],
        dropout_fc=config["DROPOUT_FC"],
    ).to(device)

    adam = torch.optim.Adam(
        net.parameters(),
        lr=config["LEARNING_RATE"],
        weight_decay=config["WEIGHT_DECAY"],
    )
    loss_fn = nn.BCEWithLogitsLoss()

    mini_batch = config["BATCH_SIZE"]
    total_epochs = config["N_EPOCHS"]

    # Shuffled mini-batches over the fold's training sequences.
    batches = DataLoader(
        TensorDataset(
            torch.from_numpy(train_X).float(),
            torch.from_numpy(train_y).float(),
        ),
        batch_size=mini_batch,
        shuffle=True,
    )

    epoch_loss_list: List[float] = []

    for _ in range(total_epochs):
        net.train()
        batch_losses: List[float] = []

        for inputs, targets in batches:
            inputs = inputs.to(device)
            targets = targets.to(device)

            adam.zero_grad()
            step_loss = loss_fn(net(inputs), targets)  # logits are (batch,)
            step_loss.backward()
            adam.step()

            batch_losses.append(step_loss.item())

        # Mean loss over the epoch's batches (0.0 when there were none).
        # No per-epoch logging here; the caller stores the values.
        epoch_loss_list.append(sum(batch_losses) / max(len(batch_losses), 1))

    return net, epoch_loss_list


# -----------------------------
# 1 config について LOSO を回す
# -----------------------------
def run_loso_for_config(
    config: Dict[str, Any],
    device: torch.device,
    X_by_sid: Dict[str, np.ndarray],
    y_by_sid: Dict[str, np.ndarray],
    t_by_sid: Dict[str, np.ndarray],
    fms_by_sid: Dict[str, np.ndarray],
) -> Dict[str, Any]:
    """Run one full LOSO pass for ``config``, evaluate it, and save CSV files.

    Each subject in SUBJECT_IDS is held out once; a model is trained on the
    remaining subjects and scored on the held-out one. Four CSV files are
    written to OUT_DIR: the global ROC-AUC summary, per-sequence predictions,
    the per-fold summary, and the per-epoch training loss.

    Returns:
        A dict with the global ROC-AUC, label counts, feature list and the
        hyper-parameters of ``config``.

    Raises:
        RuntimeError: If the pooled labels are all identical, so that the
            global ROC-AUC is undefined.
    """
    config_name = config["NAME"]

    all_probs: List[np.ndarray] = []
    all_true: List[np.ndarray] = []
    pred_rows: List[pd.DataFrame] = []
    fold_summary_rows: List[Dict[str, Any]] = []
    epoch_loss_records: List[Dict[str, Any]] = []

    for test_sid in SUBJECT_IDS:
        # Train / test split: everything except the held-out subject is training data.
        train_sids = [sid for sid in SUBJECT_IDS if sid != test_sid]
        train_X = np.concatenate([X_by_sid[sid] for sid in train_sids], axis=0)
        train_y = np.concatenate([y_by_sid[sid] for sid in train_sids], axis=0)
        test_X, test_y = X_by_sid[test_sid], y_by_sid[test_sid]
        test_t, test_fms = t_by_sid[test_sid], fms_by_sid[test_sid]

        # Train this fold.
        model, epoch_loss_list = train_one_fold(train_X, train_y, device=device, config=config)

        # Keep the per-epoch loss for the log CSV (nothing is printed).
        epoch_loss_records.extend(
            {
                "ConfigName": config_name,
                "SEQ_LEN": SEQ_LEN,
                "SubjectID": test_sid,
                "Epoch": ep_idx,
                "TrainLoss": loss_val,
            }
            for ep_idx, loss_val in enumerate(epoch_loss_list, start=1)
        )

        # Predicted probabilities for the held-out subject.
        model.eval()
        with torch.no_grad():
            X_test_tensor = torch.from_numpy(test_X).float().to(device)
            probs = torch.sigmoid(model(X_test_tensor)).cpu().numpy()  # (N_test,)

        all_probs.append(probs)
        all_true.append(test_y.astype(int))

        # Per-fold ROC-AUC; NaN when the held-out subject has a single class.
        n_pos_test = int(test_y.sum())
        n_neg_test = int(len(test_y) - n_pos_test)
        if n_pos_test != 0 and n_neg_test != 0:
            rocauc_fold = roc_auc_score(test_y, probs)
        else:
            rocauc_fold = float("nan")

        fold_summary_rows.append(
            {
                "ConfigName": config_name,
                "SEQ_LEN": SEQ_LEN,
                "SubjectID": test_sid,
                "N_test": int(len(test_y)),
                "N_pos_test": n_pos_test,
                "N_neg_test": n_neg_test,
                "pos_ratio_test": float(test_y.mean()),
                "ROC_AUC_test": rocauc_fold,
            }
        )

        # Detailed predictions of this fold.
        pred_rows.append(
            pd.DataFrame(
                {
                    "ConfigName": config_name,
                    "SEQ_LEN": SEQ_LEN,
                    "SubjectID": test_sid,
                    "Time_sec": test_t,
                    "FMS": test_fms,
                    "Label_bin": test_y.astype(int),
                    "Prob_FMS_ge1": probs,
                }
            )
        )

    # ---- ROC-AUC over all folds pooled together ----
    y_all = np.concatenate(all_true)
    p_all = np.concatenate(all_probs)

    n_total = len(y_all)
    n_pos = int(y_all.sum())
    n_neg = n_total - n_pos
    pos_ratio = n_pos / n_total if n_total > 0 else 0.0

    if n_pos in (0, n_total):
        raise RuntimeError(
            f"[ERROR] ROC-AUC undefined: labels are all the same "
            f"(N={n_total}, N_pos={n_pos})."
        )

    rocauc = roc_auc_score(y_all, p_all)

    # Summary record shared by the result CSV and the return value.
    summary: Dict[str, Any] = {
        "ConfigName": config_name,
        "ROC_AUC_global": rocauc,
        "N_total": n_total,
        "N_pos": n_pos,
        "N_neg": n_neg,
        "pos_ratio": pos_ratio,
        "N_features": N_FEATURES,
        "feature_list": ",".join(FEATURE_COLS),
        "SEQ_LEN": SEQ_LEN,
        "HIDDEN_SIZE": config["HIDDEN_SIZE"],
        "FC_HIDDEN_SIZE": config["FC_HIDDEN_SIZE"],
        "DROPOUT_LSTM": config["DROPOUT_LSTM"],
        "DROPOUT_FC": config["DROPOUT_FC"],
        "LEARNING_RATE": config["LEARNING_RATE"],
        "BATCH_SIZE": config["BATCH_SIZE"],
        "N_EPOCHS": config["N_EPOCHS"],
        "WEIGHT_DECAY": config["WEIGHT_DECAY"],
    }

    # ---- Save results ----
    file_prefix = f"Cell1-LSTM_{config_name}_LOSO"

    # 1) ROC-AUC summary (including hyper-parameters), one row
    result_path = OUT_DIR / f"{file_prefix}_ROCAUC.csv"
    pd.DataFrame({key: [value] for key, value in summary.items()}).to_csv(
        result_path, index=False
    )

    # 2) Per-sequence detailed predictions
    pred_path = OUT_DIR / f"{file_prefix}_pred_detail.csv"
    pd.concat(pred_rows, ignore_index=True).to_csv(pred_path, index=False)

    # 3) Per-fold summary (ROC-AUC per subject)
    fold_summary_path = OUT_DIR / f"{file_prefix}_fold_summary.csv"
    pd.DataFrame(fold_summary_rows).to_csv(fold_summary_path, index=False)

    # 4) Training loss per epoch
    loss_path = OUT_DIR / f"{file_prefix}_train_loss_by_epoch.csv"
    pd.DataFrame(epoch_loss_records).to_csv(loss_path, index=False)

    # Returned for the config summary; logging is done by the caller.
    return summary


# -----------------------------
# メイン
# -----------------------------
def main():
    """Grid-search LSTM hyper-parameters with LOSO evaluation.

    Builds the sequences once, runs ``run_loso_for_config`` for every
    combination of hidden size, weight decay and epoch count, prints one
    result line per configuration, and saves a summary CSV in OUT_DIR.
    """
    # Device selection
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"[INFO] Using device: {device}")

    # Fix the seeds for reproducibility.
    torch.manual_seed(20251206)
    np.random.seed(20251206)

    # Build all subjects' sequences once with the fixed SEQ_LEN.
    X_by_sid, y_by_sid, t_by_sid, fms_by_sid = build_sequences_all_subjects()

    # -------------------------
    # Hyper-parameter grid
    # -------------------------
    HIDDEN_SIZE_LIST = [32, 64]
    WEIGHT_DECAY_LIST = [0.0, 1e-6, 1e-5, 1e-4]
    N_EPOCHS_LIST = [10, 30, 45, 60]

    # Weight-decay values with a fixed short tag; any other value uses its
    # default string form.
    fixed_wd_tags = {0.0: "0", 1e-4: "1e-4", 1e-3: "1e-3"}
    wd_tags = {wd: fixed_wd_tags.get(wd, f"{wd}") for wd in WEIGHT_DECAY_LIST}

    CONFIG_LIST: List[Dict[str, Any]] = [
        dict(
            NAME=f"H{h}_WD{wd_tags[wd]}_EP{n_ep}",
            HIDDEN_SIZE=h,
            FC_HIDDEN_SIZE=DEFAULT_FC_HIDDEN_SIZE,
            DROPOUT_LSTM=DEFAULT_DROPOUT_LSTM,
            DROPOUT_FC=DEFAULT_DROPOUT_FC,
            LEARNING_RATE=DEFAULT_LEARNING_RATE,
            BATCH_SIZE=DEFAULT_BATCH_SIZE,
            N_EPOCHS=n_ep,
            WEIGHT_DECAY=wd,
        )
        for h in HIDDEN_SIZE_LIST
        for wd in WEIGHT_DECAY_LIST
        for n_ep in N_EPOCHS_LIST
    ]

    n_configs = len(CONFIG_LIST)
    print(f"[INFO] Total number of configs = {n_configs}")

    config_summary_rows: List[Dict[str, Any]] = []

    # -------------------------
    # LOSO experiment for each config
    # -------------------------
    for i, config in enumerate(CONFIG_LIST, start=1):
        summary = run_loso_for_config(
            config=config,
            device=device,
            X_by_sid=X_by_sid,
            y_by_sid=y_by_sid,
            t_by_sid=t_by_sid,
            fms_by_sid=fms_by_sid,
        )
        config_summary_rows.append(summary)

        # Print only the parameters and the resulting ROC-AUC.
        result_line = (
            f"[RESULT] ({i}/{n_configs}) "
            f"Config={summary['ConfigName']}, "
            f"H={summary['HIDDEN_SIZE']}, "
            f"WD={summary['WEIGHT_DECAY']}, "
            f"N_EPOCHS={summary['N_EPOCHS']}, "
            f"ROC_AUC_global={summary['ROC_AUC_global']:.4f}"
        )
        print(result_line)

    # -------------------------
    # Save the summary of all configs
    # -------------------------
    summary_path = OUT_DIR / "Cell1-LSTM_LOSO_config_summary.csv"
    pd.DataFrame(config_summary_rows).to_csv(summary_path, index=False)
    print(f"[INFO] Saved config summary to: {summary_path}")


# Entry point: run the hyper-parameter search only when executed as the top-level script.
if __name__ == "__main__":
    main()


In [None]:
# Cell1-LSTM: LSTMパラメータ探索（SLIDE_STEP_SEC × SEQ_LEN）

import os
from pathlib import Path
from typing import Dict, List, Tuple

import numpy as np
import pandas as pd

import torch
from torch import nn
from torch.utils.data import TensorDataset, DataLoader
from sklearn.metrics import roc_auc_score


# -----------------------------
# Paths and basic settings
# -----------------------------
BASE_DIR = Path(r"C:\Users\taiki\OneDrive - Science Tokyo\デスクトップ\研究\本実験結果")

SUBJECT_IDS = [
    "10061", "10063", "10064",
    "10071", "10072", "10073", "10074",
    "10081", "10082", "10083",
    "10091", "10092", "10093", "10094",
    "10101", "10102", "10103",
]

# Base settings of the 3-second-window features (FEATURE2)
T_START = 1770           # start time of the feature computation (seconds)
WINDOW_SEC = 3           # window width used for FEATURE2 (seconds)
BASE_T_MIN = float(T_START + WINDOW_SEC)  # 1773.0
TARGET_T_MAX = 2400.0    # upper bound of the target time (seconds)

# Label threshold: FMS >= 1 is treated as positive
FMS_POS_THRESHOLD = 1


# -----------------------------
# LSTM hyper-parameters
# -----------------------------
HIDDEN_SIZE = 32
FC_HIDDEN_SIZE = 8
DROPOUT_LSTM = 0.0
DROPOUT_FC = 0.5
LEARNING_RATE = 0.005
BATCH_SIZE = 256
N_EPOCHS = 30
WEIGHT_DECAY = 1e-4  # L2 regularisation (Adam's weight_decay)


# -----------------------------
# Feature on/off switches
# -----------------------------
# Each entry is (column name, use it?); only entries set to True become model inputs.
FEATURE_SWITCHES: List[Tuple[str, bool]] = [
    ("Pulse_rma3",       True),
    ("Pulse_max3",       True),
    ("Pulse_min3",       True),
    ("Pulse_pc3",        True),
    ("HR_rma3",          True),
    ("HR_max3",          True),
    ("HR_min3",          True),
    ("HR_pc3",           True),
    ("GSR_rma3",         True),
    ("GSR_max3",         True),
    ("GSR_min3",         True),
    ("GSR_pc3",          True),
    ("FaceSum_mean3",    True),
    ("FaceDiff_mean3",   True),
    ("FaceSum_pc3",      True),
    ("FaceDiff_pc3",     True),
    ("Skinos_SweatRate", True),
    ("Skinos_HeartRate", False),
    ("Skinos_SkinTemp",  True),
    ("MSSQ_percentile01",  True),
]

# Enabled feature columns, in the order listed above.
FEATURE_COLS: List[str] = [name for name, use in FEATURE_SWITCHES if use]
if len(FEATURE_COLS) == 0:
    raise RuntimeError("[ERROR] FEATURE_SWITCHES: 有効な特徴量が0個です（すべてFalse）。")

N_FEATURES = len(FEATURE_COLS)


# -----------------------------
# LSTM モデル定義
# -----------------------------
class LSTMMotionSickness(nn.Module):
    """Binary classifier: single-layer unidirectional LSTM plus a small FC head.

    Pipeline: LSTM -> Dropout -> Linear(hidden_size -> fc_hidden_size) -> ReLU
    -> Linear(fc_hidden_size -> 1). The output is a raw logit; the sigmoid is
    applied by the loss / evaluation code. Size and dropout arguments default
    to the module-level hyper-parameter constants.
    """

    def __init__(
        self,
        input_size: int,
        hidden_size: int = HIDDEN_SIZE,
        fc_hidden_size: int = FC_HIDDEN_SIZE,
        dropout_lstm: float = DROPOUT_LSTM,
        dropout_fc: float = DROPOUT_FC,
    ):
        super().__init__()
        self.hidden_size = hidden_size

        # Layers are created in a fixed order (lstm, fc1, fc_out) so that
        # seeded parameter initialisation stays reproducible.
        lstm_kwargs = dict(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=1,
            batch_first=True,
            bidirectional=False,
            dropout=dropout_lstm,  # effectively ignored when num_layers=1
        )
        self.lstm = nn.LSTM(**lstm_kwargs)
        self.dropout = nn.Dropout(dropout_fc)
        self.fc1 = nn.Linear(hidden_size, fc_hidden_size)
        self.relu = nn.ReLU()
        self.fc_out = nn.Linear(fc_hidden_size, 1)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Map ``x`` of shape (batch, seq_len, input_size) to logits of shape (batch,)."""
        _, (final_hidden, _) = self.lstm(x)
        # Final hidden state of the last (only) layer: (batch, hidden_size)
        features = self.dropout(final_hidden[-1])
        activated = self.relu(self.fc1(features))
        logit = self.fc_out(activated)       # (batch, 1)
        return logit.squeeze(-1)             # (batch,)


# -----------------------------
# データ読み込み & シーケンス生成
# -----------------------------
def get_feature2_path(sid: str, slide_step_sec: float) -> Path:
    """
    Return the path of a subject's FEATURE2 CSV for the given slide step.

    The step is rendered with the ``g`` format, which drops trailing zeros:
      slide_step_sec=0.5 -> {sid}_3sFeat_0.5sSlide.csv
      slide_step_sec=1.0 -> {sid}_3sFeat_1sSlide.csv
      slide_step_sec=1.5 -> {sid}_3sFeat_1.5sSlide.csv
      slide_step_sec=3.0 -> {sid}_3sFeat_3sSlide.csv
    """
    step_label = format(slide_step_sec, "g")  # 0.5 -> '0.5', 1.0 -> '1', 3.0 -> '3'
    feature_dir = BASE_DIR / sid / "FEATURE2"
    return feature_dir / f"{sid}_3sFeat_{step_label}sSlide.csv"


def load_subject_df(sid: str, slide_step_sec: float) -> pd.DataFrame:
    """
    Read FEATURE2/{sid}_3sFeat_{step}sSlide.csv for one subject.

    The rows are returned sorted by ``Time_sec`` with a fresh 0..N-1 index.
    Raises FileNotFoundError when the CSV does not exist.
    """
    csv_path = get_feature2_path(sid, slide_step_sec)
    if csv_path.exists():
        frame = pd.read_csv(csv_path)
        return frame.sort_values("Time_sec").reset_index(drop=True)
    raise FileNotFoundError(f"[ERROR] Subject {sid}: file not found: {csv_path}")


def build_sequences_for_subject(
    sid: str,
    slide_step_sec: float,
    seq_len: int,
    target_t_min: float,
    target_t_max: float,
) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
    """
    Build fixed-length input sequences and binary labels for one subject.

    Steps:
      - load the subject's FEATURE2 CSV for the given slide step
      - for every row whose Time_sec lies in [target_t_min, target_t_max],
        use the seq_len consecutive rows ending at that row as the input
      - label the sequence positive when the FMS of its last row is
        >= FMS_POS_THRESHOLD
    Target rows that do not have seq_len - 1 preceding rows are skipped.
    Sequences are taken by row position; uniform time spacing of the rows is
    not verified here.

    Returns:
      X_seq:   (N_seq, seq_len, N_FEATURES) float32
      y_seq:   (N_seq,) int64 binary labels
      t_seq:   (N_seq,) float64 Time_sec of the last row of each sequence
      fms_seq: (N_seq,) int64 FMS of the last row of each sequence

    Raises:
      RuntimeError: a required column is missing, a feature is NaN or
        non-finite, FMS is NaN on a row used as a label, no row falls in the
        target range, or no sequence could be built.
      FileNotFoundError: propagated from load_subject_df.
    """
    df = load_subject_df(sid, slide_step_sec=slide_step_sec)

    # Make sure every column read below exists.
    required_cols = ["Time_sec", "FMS"] + FEATURE_COLS
    missing = [c for c in required_cols if c not in df.columns]
    if missing:
        raise RuntimeError(f"[ERROR] Subject {sid}: missing columns in FEATURE2 csv: {missing}")

    # Spec: any NaN in a feature column is a hard error; report the first bad row.
    nan_mask = df[FEATURE_COLS].isna()
    if nan_mask.values.any():
        bad_idx = np.where(nan_mask.values)[0][0]
        bad_time = df.loc[bad_idx, "Time_sec"]
        bad_cols = list(nan_mask.columns[nan_mask.iloc[bad_idx]])
        raise RuntimeError(
            f"[ERROR] Subject {sid}: NaN detected at Time_sec={bad_time}, cols={bad_cols}"
        )

    times = df["Time_sec"].to_numpy().astype(float)
    features = df[FEATURE_COLS].to_numpy().astype(np.float32)

    # There must be at least one row inside the target time range.
    target_mask = (times >= target_t_min) & (times <= target_t_max)
    if not target_mask.any():
        raise RuntimeError(
            f"[ERROR] Subject {sid}: no Time_sec in [{target_t_min}, {target_t_max}]"
        )

    # Rows that can end a sequence: inside the target range AND preceded by
    # at least seq_len - 1 rows (rows lacking history are skipped, e.g. when
    # target_t_min is slightly off).
    usable_mask = target_mask & (np.arange(len(times)) >= seq_len - 1)

    # A NaN FMS would be turned into a meaningless integer by astype(int) and
    # silently become a negative label, so reject it on every row that is
    # actually used as a label.
    fms_missing = df["FMS"].isna().to_numpy() & usable_mask
    if fms_missing.any():
        bad_time = times[np.flatnonzero(fms_missing)[0]]
        raise RuntimeError(
            f"[ERROR] Subject {sid}: FMS is NaN at target Time_sec={bad_time}"
        )
    fms = df["FMS"].to_numpy().astype(int)

    X_list: List[np.ndarray] = []
    y_list: List[int] = []
    t_list: List[float] = []
    fms_list: List[int] = []

    for idx in np.flatnonzero(usable_mask):
        t = times[idx]

        window_feat = features[idx - seq_len + 1: idx + 1, :]  # (seq_len, N_FEATURES)
        if not np.isfinite(window_feat).all():
            raise RuntimeError(
                f"[ERROR] Subject {sid}: non-finite value in sequence ending at Time_sec={t}"
            )

        # Binary label: positive when FMS reaches the threshold.
        y = 1 if fms[idx] >= FMS_POS_THRESHOLD else 0

        X_list.append(window_feat)
        y_list.append(y)
        t_list.append(float(t))
        fms_list.append(int(fms[idx]))

    if len(X_list) == 0:
        raise RuntimeError(
            f"[ERROR] Subject {sid}: no valid sequence constructed "
            f"(check target_t_min={target_t_min}, seq_len={seq_len}, slide_step={slide_step_sec})"
        )

    X_seq = np.stack(X_list).astype(np.float32)   # (N_seq, seq_len, N_FEATURES)
    y_seq = np.array(y_list, dtype=np.int64)
    t_seq = np.array(t_list, dtype=np.float64)
    fms_seq = np.array(fms_list, dtype=np.int64)

    return X_seq, y_seq, t_seq, fms_seq


# -----------------------------
# LOSO 学習・評価（1設定分）
# -----------------------------
def train_one_fold(
    train_X: np.ndarray,
    train_y: np.ndarray,
    device: torch.device,
) -> LSTMMotionSickness:
    """
    Fit a fresh LSTM on the training data of a single LOSO fold.

    Uses Adam (LEARNING_RATE, WEIGHT_DECAY) with BCEWithLogitsLoss for
    N_EPOCHS epochs over shuffled mini-batches of BATCH_SIZE.
    Returns the trained model.
    """
    net = LSTMMotionSickness(input_size=N_FEATURES).to(device)
    adam = torch.optim.Adam(
        net.parameters(),
        lr=LEARNING_RATE,
        weight_decay=WEIGHT_DECAY,
    )
    bce = nn.BCEWithLogitsLoss()

    # Wrap the numpy arrays in a shuffled mini-batch loader.
    train_set = TensorDataset(
        torch.from_numpy(train_X).float(),
        torch.from_numpy(train_y).float(),
    )
    batches = DataLoader(train_set, batch_size=BATCH_SIZE, shuffle=True)

    for _ in range(N_EPOCHS):
        net.train()
        for inputs, targets in batches:
            inputs = inputs.to(device)
            targets = targets.to(device)

            adam.zero_grad()
            batch_loss = bce(net(inputs), targets)  # logits have shape (batch,)
            batch_loss.backward()
            adam.step()

    return net


def run_experiment_for_config(
    slide_step_sec: float,
    seq_len: int,
    device: torch.device,
) -> float:
    """
    Run a full LOSO evaluation for one (SLIDE_STEP_SEC, SEQ_LEN) pair.

    Every subject is held out once; the predicted probabilities of all folds
    are pooled and a single global ROC-AUC is returned.
    Raises RuntimeError when the pooled labels contain only one class.
    """
    # Fixed seeds for reproducibility.
    seed = 20251206
    torch.manual_seed(seed)
    np.random.seed(seed)

    # First target time, taking the slide step into account.
    first_target_t = BASE_T_MIN + (seq_len - 1) * float(slide_step_sec)
    last_target_t = TARGET_T_MAX

    # ---- Build the sequences of every subject once ----
    X_by_sid: Dict[str, np.ndarray] = {}
    y_by_sid: Dict[str, np.ndarray] = {}
    for sid in SUBJECT_IDS:
        # Time stamps and raw FMS values are not needed for the AUC.
        X_by_sid[sid], y_by_sid[sid], _, _ = build_sequences_for_subject(
            sid,
            slide_step_sec=slide_step_sec,
            seq_len=seq_len,
            target_t_min=first_target_t,
            target_t_max=last_target_t,
        )

    # ---- LOSO training / evaluation ----
    fold_probs: List[np.ndarray] = []
    fold_labels: List[np.ndarray] = []

    for held_out in SUBJECT_IDS:
        train_ids = [sid for sid in SUBJECT_IDS if sid != held_out]
        train_X = np.concatenate([X_by_sid[sid] for sid in train_ids], axis=0)
        train_y = np.concatenate([y_by_sid[sid] for sid in train_ids], axis=0)

        model = train_one_fold(train_X, train_y, device=device)

        # Predicted probabilities for the held-out subject.
        model.eval()
        with torch.no_grad():
            held_out_inputs = torch.from_numpy(X_by_sid[held_out]).float().to(device)
            probs = torch.sigmoid(model(held_out_inputs)).cpu().numpy()  # (N_test,)

        fold_probs.append(probs)
        fold_labels.append(y_by_sid[held_out].astype(int))

    # ---- ROC-AUC over all folds pooled together ----
    y_all = np.concatenate(fold_labels)
    p_all = np.concatenate(fold_probs)

    n_total = len(y_all)
    n_pos = int(y_all.sum())

    if n_pos in (0, n_total):
        raise RuntimeError(
            f"[ERROR] ROC-AUC undefined: labels are all the same "
            f"(N={n_total}, N_pos={n_pos})."
        )

    return roc_auc_score(y_all, p_all)


# -----------------------------
# main: (SLIDE_STEP_SEC, SEQ_LEN) グリッド探索
# -----------------------------
def main():
    """
    Sweep (SLIDE_STEP_SEC, SEQ_LEN) combinations and record the global ROC-AUC.

    One CSV-style line per configuration is printed, and the full table is
    written to Cell1_LSTM_slide_seq_ROCAUC.csv under BASE_DIR.
    """
    # Pick the GPU when available.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Candidate values to explore (slide step is the outer loop).
    slide_step_list = [0.5, 1]
    seq_len_list = [20, 60]
    grid = [(step, length) for step in slide_step_list for length in seq_len_list]

    # Header line for the per-configuration output below.
    print("SLIDE_STEP_SEC,SEQ_LEN,ROC_AUC_global")

    results = []
    for slide_step_sec, seq_len in grid:
        rocauc = run_experiment_for_config(
            slide_step_sec=slide_step_sec,
            seq_len=seq_len,
            device=device,
        )
        # This is the only per-configuration console output.
        print(f"{slide_step_sec},{seq_len},{rocauc:.4f}")
        results.append({
            "SLIDE_STEP_SEC": slide_step_sec,
            "SEQ_LEN": seq_len,
            "Time_LEN_sec": slide_step_sec * seq_len,
            "ROC_AUC_global": rocauc,
        })

    # Persist the results as CSV (not echoed to the screen).
    table = (
        pd.DataFrame(results)
        .sort_values(["SLIDE_STEP_SEC", "SEQ_LEN"])
        .reset_index(drop=True)
    )

    summary_dir = BASE_DIR / "ANALYSIS" / "機械学習" / "Cell1-LSTM_param_sweep"
    summary_dir.mkdir(parents=True, exist_ok=True)
    summary_path = summary_dir / "Cell1_LSTM_slide_seq_ROCAUC.csv"
    table.to_csv(summary_path, index=False, encoding="utf-8-sig")


if __name__ == "__main__":
    main()


In [None]:
#Cell1-LSTM: LOSO＋ROC Temperature Scaling(inner-LOSO)＋Label Smoothing
#1207 ROCAUC改善 3時間

import os
from pathlib import Path
from typing import Dict, List, Tuple

import numpy as np
import pandas as pd

import torch
from torch import nn
from torch.utils.data import TensorDataset, DataLoader
from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt


# -----------------------------
# パス・基本設定
# -----------------------------
BASE_DIR = Path(r"C:\Users\taiki\OneDrive - Science Tokyo\デスクトップ\研究\本実験結果")

SUBJECT_IDS = [
    "10061", "10063", "10064",
    "10071", "10072", "10073", "10074",
    "10081", "10082", "10083",
    "10091", "10092", "10093", "10094",
    "10101", "10102", "10103",
]

# このCell用の出力ディレクトリ
CELL_NAME = "Cell1-LSTM"
OUT_DIR = BASE_DIR / "解析" / "Cell1"
OUT_DIR.mkdir(parents=True, exist_ok=True)
print(f"[INFO] Cell: {CELL_NAME}, OUT_DIR = {OUT_DIR}")

# 確率分布プロット用ディレクトリ
PROB_PLOT_DIR = OUT_DIR / "prob_dist"
PROB_PLOT_DIR.mkdir(parents=True, exist_ok=True)
print(f"[INFO] Prob. plot dir = {PROB_PLOT_DIR}")

# -----------------------------
# Time / sequence specification
# -----------------------------
WINDOW_SEC = 3          # 3-second feature window (already reflected in the FEATURE2 files)
SLIDE_STEP_SEC = 1      # 1-second slide step (already reflected in the FEATURE2 files)

# Number of past steps fed to the LSTM (= the past SEQ_LEN seconds at a 1-second step)
SEQ_LEN = 10

# First output time in FEATURE2 (T_START + WINDOW_SEC = 1770 + 3)
BASE_T_MIN = 1773

# Smallest target time: first output time + (SEQ_LEN - 1)
# Example: BASE_T_MIN=1773, SEQ_LEN=30 -> 1773 + 29 = 1802
TARGET_T_MIN = BASE_T_MIN + (SEQ_LEN - 1)
TARGET_T_MAX = 2400     # upper bound stays at 2400 seconds, as before

# Label threshold: FMS >= 1 is treated as positive
FMS_POS_THRESHOLD = 1

# -----------------------------
# LSTM hyperparameters (tuning candidates; the original note says they are exported to a CSV)
# -----------------------------
HIDDEN_SIZE = 32
FC_HIDDEN_SIZE = 8
DROPOUT_LSTM = 0.0
DROPOUT_FC = 0.5
LEARNING_RATE = 0.005
BATCH_SIZE = 256
N_EPOCHS = 30
WEIGHT_DECAY = 1e-4  # L2 regularisation (Adam's weight_decay)

# -----------------------------
# Temperature scaling settings
# -----------------------------
TEMP_MAX_ITER = 200
TEMP_LR = 0.01

# -----------------------------
# Label smoothing settings
# -----------------------------
LABEL_SMOOTHING = 0.0  # 0.0 turns smoothing OFF; e.g. 0.05 maps 0 -> 0.05 and 1 -> 0.95

# -----------------------------
# 特徴量ON/OFF設定
# -----------------------------
FEATURE_SWITCHES: List[Tuple[str, bool]] = [
    ("Pulse_rma3",       True),
    ("Pulse_max3",       True),
    ("Pulse_min3",       True),
    ("Pulse_pc3",        True),
    ("HR_rma3",          True),
    ("HR_max3",          True),
    ("HR_min3",          True),
    ("HR_pc3",           True),
    ("GSR_rma3",         True),
    ("GSR_max3",         True),
    ("GSR_min3",         True),
    ("GSR_pc3",          True),
    ("FaceSum_mean3",    True),
    ("FaceDiff_mean3",   True),
    ("FaceSum_pc3",      True),
    ("FaceDiff_pc3",     True),
    ("Skinos_SweatRate", True),
    ("Skinos_HeartRate", False),
    ("Skinos_SkinTemp",  True),
    ("MSSQ_percentile01",  True),
]

FEATURE_COLS: List[str] = [name for name, use in FEATURE_SWITCHES if use]
if len(FEATURE_COLS) == 0:
    raise RuntimeError("[ERROR] FEATURE_SWITCHES: 有効な特徴量が0個です（すべてFalse）。")

N_FEATURES = len(FEATURE_COLS)
print(f"[INFO] Using {N_FEATURES} features:", ", ".join(FEATURE_COLS))


# -----------------------------
# LSTM モデル定義
# -----------------------------
class LSTMMotionSickness(nn.Module):
    """
    One-layer unidirectional LSTM followed by a small fully connected head.

    Layout: LSTM -> last hidden state -> Dropout ->
    Linear(hidden_size -> fc_hidden_size) -> ReLU -> Linear(fc_hidden_size -> 1).
    The output is a logit; the sigmoid is applied on the loss / evaluation side.
    """

    def __init__(
        self,
        input_size: int,
        hidden_size: int = HIDDEN_SIZE,
        fc_hidden_size: int = FC_HIDDEN_SIZE,
        dropout_lstm: float = DROPOUT_LSTM,
        dropout_fc: float = DROPOUT_FC,
    ):
        super().__init__()
        self.hidden_size = hidden_size

        # NOTE: the layers are instantiated in the order lstm, dropout, fc1,
        # relu, fc_out so that seeded parameter initialisation is unchanged.
        lstm_options = {
            "input_size": input_size,
            "hidden_size": hidden_size,
            "num_layers": 1,
            "batch_first": True,
            "bidirectional": False,
            # Practically ignored when num_layers=1 (no inter-layer output).
            "dropout": dropout_lstm,
        }
        self.lstm = nn.LSTM(**lstm_options)
        self.dropout = nn.Dropout(dropout_fc)
        self.fc1 = nn.Linear(hidden_size, fc_hidden_size)
        self.relu = nn.ReLU()
        self.fc_out = nn.Linear(fc_hidden_size, 1)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Compute one logit per input sequence.

        x:      (batch, seq_len, input_size)
        return: logits, shape (batch,)
        """
        # The per-step outputs and the cell state are not used.
        _, (h_n, _) = self.lstm(x)
        features = self.dropout(h_n[-1])           # (batch, hidden_size)
        features = self.relu(self.fc1(features))
        logits = self.fc_out(features)             # (batch, 1)
        return logits.squeeze(-1)                  # (batch,)


# -----------------------------
# データ読み込み & シーケンス生成
# -----------------------------
def load_subject_df(sid: str) -> pd.DataFrame:
    """
    Read FEATURE2/{sid}_3sFeat_1sSlide.csv for one subject.

    The rows are returned sorted by ``Time_sec`` with a fresh 0..N-1 index.
    Raises FileNotFoundError when the CSV does not exist.
    """
    csv_path = BASE_DIR / sid / "FEATURE2" / f"{sid}_3sFeat_1sSlide.csv"
    if csv_path.exists():
        frame = pd.read_csv(csv_path)
        return frame.sort_values("Time_sec").reset_index(drop=True)
    raise FileNotFoundError(f"[ERROR] Subject {sid}: file not found: {csv_path}")


def build_sequences_for_subject(
    sid: str,
    # Returns (X_seq, y_seq, t_seq, fms_seq); see the docstring for shapes.
) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
    """
    Build fixed-length input sequences and binary labels for one subject.

    Steps:
      - load the subject's FEATURE2 CSV
      - for every row whose Time_sec lies in [TARGET_T_MIN, TARGET_T_MAX],
        use the SEQ_LEN consecutive rows ending at that row as the input
      - label the sequence positive when the FMS of its last row is
        >= FMS_POS_THRESHOLD
    Sequences are taken by row position; uniform time spacing of the rows is
    not verified here. Time_sec is cast to int, so fractional time stamps
    would be truncated.

    Returns:
      X_seq:   (N_seq, SEQ_LEN, N_FEATURES) float32
      y_seq:   (N_seq,) int64 binary labels
      t_seq:   (N_seq,) int64 Time_sec of the last row of each sequence
      fms_seq: (N_seq,) int64 FMS of the last row of each sequence

    Raises:
      RuntimeError: a required column is missing, a feature is NaN or
        non-finite, FMS is NaN on a target row, no row falls in the target
        range, or a target row lacks SEQ_LEN - 1 preceding rows.
      FileNotFoundError: propagated from load_subject_df.
    """
    df = load_subject_df(sid)

    # Make sure every column read below exists.
    required_cols = ["Time_sec", "FMS"] + FEATURE_COLS
    missing = [c for c in required_cols if c not in df.columns]
    if missing:
        raise RuntimeError(f"[ERROR] Subject {sid}: missing columns in FEATURE2 csv: {missing}")

    # Spec: any NaN in a feature column is a hard error; report the first bad row.
    nan_mask = df[FEATURE_COLS].isna()
    if nan_mask.values.any():
        bad_idx = np.where(nan_mask.values)[0][0]
        bad_time = df.loc[bad_idx, "Time_sec"]
        bad_cols = list(nan_mask.columns[nan_mask.iloc[bad_idx]])
        raise RuntimeError(
            f"[ERROR] Subject {sid}: NaN detected at Time_sec={bad_time}, cols={bad_cols}"
        )

    times = df["Time_sec"].to_numpy().astype(int)
    features = df[FEATURE_COLS].to_numpy().astype(np.float32)

    # There must be at least one row inside the target time range.
    target_mask = (times >= TARGET_T_MIN) & (times <= TARGET_T_MAX)
    if not target_mask.any():
        raise RuntimeError(f"[ERROR] Subject {sid}: no Time_sec in [{TARGET_T_MIN}, {TARGET_T_MAX}]")

    # A NaN FMS would be turned into a meaningless integer by astype(int) and
    # silently become a negative label, so reject it on every target row.
    fms_missing = df["FMS"].isna().to_numpy() & target_mask
    if fms_missing.any():
        bad_time = times[np.flatnonzero(fms_missing)[0]]
        raise RuntimeError(
            f"[ERROR] Subject {sid}: FMS is NaN at target Time_sec={bad_time}"
        )
    fms = df["FMS"].to_numpy().astype(int)

    X_list: List[np.ndarray] = []
    y_list: List[int] = []
    t_list: List[int] = []
    fms_list: List[int] = []

    for idx, t in enumerate(times):
        if t < TARGET_T_MIN or t > TARGET_T_MAX:
            continue

        # A target row without enough history is treated as a hard error.
        if idx < SEQ_LEN - 1:
            raise RuntimeError(
                f"[ERROR] Subject {sid}: idx={idx}, Time_sec={t} has no enough history (need {SEQ_LEN})."
            )

        window_feat = features[idx - SEQ_LEN + 1: idx + 1, :]  # (SEQ_LEN, N_FEATURES)
        if not np.isfinite(window_feat).all():
            raise RuntimeError(
                f"[ERROR] Subject {sid}: non-finite value in sequence ending at Time_sec={t}"
            )

        # Binary label: positive when FMS reaches the threshold.
        y = 1 if fms[idx] >= FMS_POS_THRESHOLD else 0

        X_list.append(window_feat)
        y_list.append(y)
        t_list.append(t)
        fms_list.append(int(fms[idx]))

    X_seq = np.stack(X_list).astype(np.float32)   # (N_seq, SEQ_LEN, N_FEATURES)
    y_seq = np.array(y_list, dtype=np.int64)
    t_seq = np.array(t_list, dtype=np.int64)
    fms_seq = np.array(fms_list, dtype=np.int64)

    print(
        f"[INFO] Subject {sid}: target Time_sec range = {t_seq[0]}–{t_seq[-1]}, "
        f"N_seq = {len(t_seq)}, N_pos = {y_seq.sum()}, N_neg = {len(y_seq) - y_seq.sum()}"
    )

    return X_seq, y_seq, t_seq, fms_seq


# -----------------------------
# LOSO 学習・評価ループ（1モデル学習）
# -----------------------------
def train_one_fold(
    train_X: np.ndarray,
    train_y: np.ndarray,
    device: torch.device,
) -> Tuple[LSTMMotionSickness, List[float]]:
    """
    Train a fresh LSTM on one data set (train_X, train_y).

    Uses Adam (LEARNING_RATE, WEIGHT_DECAY) with BCEWithLogitsLoss for
    N_EPOCHS epochs over shuffled mini-batches of BATCH_SIZE. When
    LABEL_SMOOTHING > 0 the 0/1 targets are moved to eps / 1 - eps.
    Returns (trained model, list of mean training loss per epoch).
    """
    net = LSTMMotionSickness(input_size=N_FEATURES).to(device)
    adam = torch.optim.Adam(
        net.parameters(),
        lr=LEARNING_RATE,
        weight_decay=WEIGHT_DECAY,
    )
    bce = nn.BCEWithLogitsLoss()

    # Report the class balance, computed on the original (unsmoothed) labels.
    sample_count = len(train_y)
    positives = int(train_y.sum())
    negatives = sample_count - positives
    pos_ratio = positives / sample_count if sample_count > 0 else 0.0
    print(
        f"[INFO] Train stats: N={sample_count}, N_pos={positives}, N_neg={negatives}, "
        f"pos_ratio={pos_ratio:.3f}"
    )

    inputs_all = torch.from_numpy(train_X).float()
    targets_all = torch.from_numpy(train_y).float()  # 0/1

    if LABEL_SMOOTHING > 0.0:
        eps = float(LABEL_SMOOTHING)
        # 0 -> eps, 1 -> 1-eps
        targets_all = targets_all * (1.0 - eps) + (1.0 - targets_all) * eps
        print(f"[INFO] Label smoothing enabled: eps={eps:.3f}")

    batches = DataLoader(
        TensorDataset(inputs_all, targets_all),
        batch_size=BATCH_SIZE,
        shuffle=True,
    )

    loss_history: List[float] = []

    for epoch in range(1, N_EPOCHS + 1):
        net.train()
        batch_losses: List[float] = []

        for inputs, targets in batches:
            inputs = inputs.to(device)
            targets = targets.to(device)

            adam.zero_grad()
            loss = bce(net(inputs), targets)  # logits have shape (batch,)
            loss.backward()
            adam.step()

            batch_losses.append(loss.item())

        # Mean over batches; max(..., 1) guards against an empty loader.
        avg_loss = sum(batch_losses) / max(len(batch_losses), 1)
        loss_history.append(avg_loss)

        # Log the first epoch, every fifth epoch and the last epoch.
        if epoch in (1, N_EPOCHS) or epoch % 5 == 0:
            print(f"[INFO] Epoch {epoch:02d}/{N_EPOCHS} - train_loss={avg_loss:.4f}")

    return net, loss_history


# -----------------------------
# Temperature Scaling 用クラス・関数
# -----------------------------
class TemperatureScaler(nn.Module):
    """
    Divide logits by a single learnable temperature T.

    T is stored as log(T), so T = exp(log_T) is always positive.
    forward: logits -> logits / T
    """

    def __init__(self):
        super().__init__()
        # log_T starts at 0, i.e. the initial temperature is T = 1.
        self.log_T = nn.Parameter(torch.zeros(1))

    def temperature(self) -> torch.Tensor:
        """Return the current temperature T as a one-element tensor."""
        return self.log_T.exp()

    def forward(self, logits: torch.Tensor) -> torch.Tensor:
        """Return the logits scaled by 1/T."""
        return logits / self.temperature()


def fit_temperature_scaling(
    calib_logits_np: np.ndarray,
    calib_labels_np: np.ndarray,
    device: torch.device,
    max_iter: int = TEMP_MAX_ITER,
    lr: float = TEMP_LR,
) -> float:
    """
    Learn a temperature T from calibration data (logits, labels).

    T is optimised with Adam for max_iter steps so that
    BCEWithLogitsLoss(logits / T, y) is minimised on the full calibration set.
    When the labels contain a single class only, fitting is skipped and
    T = 1.0 is returned.

    Returns: T as a Python float.
    """
    logits_t = torch.from_numpy(calib_logits_np.astype(np.float32)).to(device)
    labels_t = torch.from_numpy(calib_labels_np.astype(np.float32)).to(device)

    # Calibration is impossible with all-0 or all-1 labels -> keep T = 1.0.
    n_total = labels_t.numel()
    n_pos = int(labels_t.sum().item())
    if n_pos in (0, n_total):
        print(
            f"[WARN] [TempScaling] calibration labels are all same "
            f"(N={n_total}, N_pos={n_pos}) -> skip (T=1.0)"
        )
        return 1.0

    scaler = TemperatureScaler().to(device)
    bce = nn.BCEWithLogitsLoss()
    adam = torch.optim.Adam(scaler.parameters(), lr=lr)

    scaler.train()
    for step in range(1, max_iter + 1):
        adam.zero_grad()
        loss = bce(scaler(logits_t), labels_t)
        loss.backward()
        adam.step()

        # Progress log on the first step, every 50th step and the last step.
        if step in (1, max_iter) or step % 50 == 0:
            T_val = scaler.temperature().item()
            print(
                f"[INFO] [TempScaling] step={step:03d}, "
                f"loss={loss.item():.6f}, T={T_val:.4f}"
            )

    T_final = scaler.temperature().item()
    print(f"[INFO] [TempScaling] Finished: T={T_final:.4f}")
    return T_final


def build_calibration_data_inner_loso(
    train_subject_ids: List[str],
    X_by_sid: Dict[str, np.ndarray],
    y_by_sid: Dict[str, np.ndarray],
    device: torch.device,
) -> Tuple[np.ndarray, np.ndarray]:
    """
    Pattern B: collect out-of-fold logits with an inner LOSO over the
    training subjects only.

    Each training subject is held out once; a model is trained on the
    remaining training subjects and its logits on the held-out subject are
    stored together with that subject's labels.

    Returns:
      calib_logits: shape (N_train_all,)
      calib_labels: shape (N_train_all,)
    """
    oof_logits: List[np.ndarray] = []
    oof_labels: List[np.ndarray] = []

    for val_sid in train_subject_ids:
        print(f"[INFO] [Calib-InnerLOSO] val subject = {val_sid}")
        fit_sids = [sid for sid in train_subject_ids if sid != val_sid]

        fit_X = np.concatenate([X_by_sid[sid] for sid in fit_sids], axis=0)
        fit_y = np.concatenate([y_by_sid[sid] for sid in fit_sids], axis=0)
        held_X = X_by_sid[val_sid]
        held_y = y_by_sid[val_sid]

        print(
            f"[INFO] [Calib-InnerLOSO] inner-train N={len(fit_y)}, "
            f"val N={len(held_y)}"
        )

        # Inner-fold training goes through train_one_fold, so the label
        # smoothing setting applies here as well.
        inner_model, _ = train_one_fold(fit_X, fit_y, device=device)

        # Out-of-fold logits for the held-out subject.
        inner_model.eval()
        with torch.no_grad():
            held_inputs = torch.from_numpy(held_X).float().to(device)
            held_logits = inner_model(held_inputs)  # (N_val,)
            oof_logits.append(held_logits.cpu().numpy())
            oof_labels.append(held_y.astype(int))

    calib_logits = np.concatenate(oof_logits, axis=0)
    calib_labels = np.concatenate(oof_labels, axis=0).astype(np.int64)

    # Summary of the collected calibration set.
    n_total = len(calib_labels)
    n_pos = int(calib_labels.sum())
    n_neg = n_total - n_pos
    pos_ratio = n_pos / n_total if n_total > 0 else 0.0

    print(
        f"[INFO] [Calib-InnerLOSO] Collected calibration data: "
        f"N={n_total}, N_pos={n_pos}, N_neg={n_neg}, pos_ratio={pos_ratio:.3f}"
    )

    return calib_logits, calib_labels


# -----------------------------
# 確率分布プロット
# -----------------------------
def plot_probability_distributions(
    df_pred: pd.DataFrame,
    out_dir: Path,
) -> None:
    """
    Save histograms of the predicted probability, one for all subjects
    together and one per subject (i.e. per fold).

    df_pred:
        This function reads the columns 'SubjectID', 'Label_bin' and
        'Prob_FMS_ge1' (the probability after temperature scaling).
    out_dir:
        Directory the PNG files are written to.
    """
    # Figure styling (user-specified).
    TITLE_FONTSIZE = 30
    LABEL_FONTSIZE = 24
    TICK_FONTSIZE = 20
    LEGEND_FONTSIZE = 20
    LINEWIDTH = 1.5

    prob_col = "Prob_FMS_ge1"  # calibrated

    # (label value, alpha, legend text) for the two overlaid histograms.
    class_specs = [
        (0, 0.6, "Label=0 (FMS<1)"),
        (1, 0.6, "Label=1 (FMS>=1)"),
    ]

    def _save_histogram(frame: pd.DataFrame, title: str, out_path: Path) -> None:
        """Draw the per-class probability histograms of `frame` and save them."""
        fig, ax = plt.subplots(figsize=(8, 6))

        for label, alpha, lab_name in class_specs:
            vals = frame.loc[frame["Label_bin"] == label, prob_col].values
            if len(vals) == 0:
                # Nothing to draw for a class that does not occur.
                continue
            ax.hist(
                vals,
                bins=20,
                range=(0.0, 1.0),
                density=True,
                alpha=alpha,
                label=lab_name,
                edgecolor="black",
                linewidth=LINEWIDTH,
            )

        ax.set_xlim(0.0, 1.0)
        ax.set_xlabel("Calibrated probability (FMS ≥ 1)", fontsize=LABEL_FONTSIZE)
        ax.set_ylabel("Density", fontsize=LABEL_FONTSIZE)
        ax.set_title(title, fontsize=TITLE_FONTSIZE)
        ax.tick_params(axis="both", labelsize=TICK_FONTSIZE)
        ax.legend(fontsize=LEGEND_FONTSIZE)
        plt.tight_layout()

        fig.savefig(out_path, dpi=300)
        plt.close(fig)

    # ---- Distribution over all subjects (all folds combined) ----
    out_path_all = out_dir / "Cell1_LSTM_ProbDist_ALL.png"
    _save_histogram(
        df_pred,
        "All subjects – Calibrated probability distribution",
        out_path_all,
    )
    print(f"[INFO] Saved global probability distribution plot to: {out_path_all}")

    # ---- Distribution per subject ----
    for sid, df_sub in df_pred.groupby("SubjectID"):
        out_path_sid = out_dir / f"Cell1_LSTM_ProbDist_{sid}.png"
        _save_histogram(
            df_sub,
            f"Subject {sid} – Calibrated probability distribution",
            out_path_sid,
        )
        print(f"[INFO] Saved probability distribution plot for {sid} to: {out_path_sid}")


# -----------------------------
# main
# -----------------------------
def main():
    """Run leave-one-subject-out (LOSO) training and evaluation, then save all results.

    Pipeline:
      1. Build the sequences of every subject in ``SUBJECT_IDS``.
      2. For each held-out subject: fit a temperature on inner-LOSO logits of
         the training subjects, train the outer-fold model on all training
         subjects, and predict raw and temperature-scaled probabilities for
         the held-out subject.
      3. Compute per-fold and pooled ROC-AUC, then write the result CSVs and
         the probability-distribution plots.

    Raises:
        RuntimeError: If the pooled labels contain a single class, so the
            global ROC-AUC is undefined. The per-sequence predictions, the
            fold summary and the training-loss log are written to ``OUT_DIR``
            before the error is raised so the trained folds are not lost.
    """
    # Device selection.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"[INFO] Using device: {device}")

    # Fix the seeds for reproducibility.
    torch.manual_seed(20251206)
    np.random.seed(20251206)

    # Fail fast: DataFrame.to_csv does not create missing parent directories,
    # so make sure the output directory exists before the long training loop
    # instead of discovering the problem after every fold has been trained.
    OUT_DIR.mkdir(parents=True, exist_ok=True)

    # ---- Build the sequences of every subject ----
    X_by_sid: Dict[str, np.ndarray] = {}
    y_by_sid: Dict[str, np.ndarray] = {}
    t_by_sid: Dict[str, np.ndarray] = {}
    fms_by_sid: Dict[str, np.ndarray] = {}

    for sid in SUBJECT_IDS:
        print(f"[INFO] ==== Build sequences: Subject {sid} ====")
        X_seq, y_seq, t_seq, fms_seq = build_sequences_for_subject(sid)
        X_by_sid[sid] = X_seq
        y_by_sid[sid] = y_seq
        t_by_sid[sid] = t_seq
        fms_by_sid[sid] = fms_seq

    all_y_tmp = np.concatenate([y_by_sid[sid] for sid in SUBJECT_IDS])
    print(
        f"[INFO] Overall (all subjects) target stats: "
        f"N={len(all_y_tmp)}, N_pos={all_y_tmp.sum()}, "
        f"pos_ratio={all_y_tmp.mean():.3f}"
    )

    # ---- LOSO training / evaluation ----
    all_probs_raw: List[np.ndarray] = []
    all_probs_cal: List[np.ndarray] = []
    all_true: List[np.ndarray] = []
    pred_rows: List[pd.DataFrame] = []
    fold_summary_rows: List[Dict] = []
    epoch_loss_records: List[Dict] = []

    for test_sid in SUBJECT_IDS:
        print(f"\n[INFO] ===== LOSO fold: Test Subject {test_sid} =====")

        # Every subject except the held-out one is used for training.
        train_sids = [sid for sid in SUBJECT_IDS if sid != test_sid]

        # ---------- Build temperature-scaling data via inner-LOSO ----------
        print("[INFO] ----- Build calibration data via inner-LOSO on train subjects -----")
        calib_logits, calib_labels = build_calibration_data_inner_loso(
            train_subject_ids=train_sids,
            X_by_sid=X_by_sid,
            y_by_sid=y_by_sid,
            device=device,
        )

        # Fit the temperature T for this outer fold.
        T_fold = fit_temperature_scaling(
            calib_logits_np=calib_logits,
            calib_labels_np=calib_labels,
            device=device,
        )
        print(f"[INFO] Fold (test={test_sid}) learned temperature T = {T_fold:.4f}")

        # ---------- Train the final outer-fold model on all train subjects ----------
        train_X = np.concatenate([X_by_sid[sid] for sid in train_sids], axis=0)
        train_y = np.concatenate([y_by_sid[sid] for sid in train_sids], axis=0)
        test_X = X_by_sid[test_sid]
        test_y = y_by_sid[test_sid]
        test_t = t_by_sid[test_sid]
        test_fms = fms_by_sid[test_sid]

        print(
            f"[INFO] Fold data sizes: "
            f"Train N_seq={len(train_y)}, Test N_seq={len(test_y)}"
        )

        # Train one fold (outer-fold model).
        model, epoch_loss_list = train_one_fold(train_X, train_y, device=device)

        # Keep the per-epoch training loss for the log CSV.
        epoch_loss_records.extend(
            {
                "SubjectID": test_sid,
                "Epoch": ep_idx,
                "TrainLoss": loss_val,
            }
            for ep_idx, loss_val in enumerate(epoch_loss_list, start=1)
        )

        # Predict the held-out subject: logits -> raw prob -> calibrated prob.
        model.eval()
        with torch.no_grad():
            X_test_tensor = torch.from_numpy(test_X).float().to(device)
            logits_test = model(X_test_tensor)
            probs_raw = torch.sigmoid(logits_test).cpu().numpy()
            logits_scaled = logits_test / T_fold
            probs_cal = torch.sigmoid(logits_scaled).cpu().numpy()

        all_probs_raw.append(probs_raw)
        all_probs_cal.append(probs_cal)
        all_true.append(test_y.astype(int))

        # Per-fold ROC-AUC (raw and calibrated); undefined for single-class folds.
        n_pos_test = int(test_y.sum())
        n_neg_test = int(len(test_y) - n_pos_test)
        if n_pos_test == 0 or n_neg_test == 0:
            rocauc_fold_raw = float("nan")
            rocauc_fold_cal = float("nan")
            print(
                f"[INFO] Subject {test_sid}: ROC-AUC undefined (N_pos={n_pos_test}, N_neg={n_neg_test})"
            )
        else:
            rocauc_fold_raw = roc_auc_score(test_y, probs_raw)
            rocauc_fold_cal = roc_auc_score(test_y, probs_cal)
            print(
                f"[INFO] Subject {test_sid}: "
                f"ROC-AUC raw = {rocauc_fold_raw:.4f}, "
                f"calib = {rocauc_fold_cal:.4f} "
                f"(N_test={len(test_y)}, N_pos={n_pos_test}, N_neg={n_neg_test})"
            )

        fold_summary_rows.append(
            {
                "SubjectID": test_sid,
                "N_test": int(len(test_y)),
                "N_pos_test": n_pos_test,
                "N_neg_test": n_neg_test,
                "pos_ratio_test": float(test_y.mean()),
                "Temp_T": float(T_fold),
                "ROC_AUC_test_raw": rocauc_fold_raw,
                "ROC_AUC_test_calib": rocauc_fold_cal,
            }
        )

        # Detailed predictions of this fold.
        pred_rows.append(
            pd.DataFrame(
                {
                    "SubjectID": test_sid,
                    "Time_sec": test_t,
                    "FMS": test_fms,
                    "Label_bin": test_y.astype(int),
                    "Prob_FMS_ge1_raw": probs_raw,
                    "Prob_FMS_ge1": probs_cal,  # after temperature scaling
                }
            )
        )

    # ---- ROC-AUC over all folds pooled together ----
    y_all = np.concatenate(all_true)
    p_all_raw = np.concatenate(all_probs_raw)
    p_all_cal = np.concatenate(all_probs_cal)

    n_total = len(y_all)
    n_pos = int(y_all.sum())
    n_neg = n_total - n_pos
    pos_ratio = n_pos / n_total if n_total > 0 else 0.0

    print("\n[INFO] ===== Overall LOSO result =====")
    print(
        f"[INFO] All folds combined: N={n_total}, N_pos={n_pos}, "
        f"N_neg={n_neg}, pos_ratio={pos_ratio:.3f}"
    )

    if n_pos == 0 or n_pos == n_total:
        # The global ROC-AUC cannot be computed, but the fold-level artefacts
        # are still valid: persist them before aborting so the completed
        # training runs are not thrown away.
        _save_prediction_details(pred_rows)
        _save_fold_summary(fold_summary_rows)
        _save_epoch_losses(epoch_loss_records)
        raise RuntimeError(
            f"[ERROR] ROC-AUC undefined: labels are all the same "
            f"(N={n_total}, N_pos={n_pos})."
        )

    rocauc_raw = roc_auc_score(y_all, p_all_raw)
    rocauc_cal = roc_auc_score(y_all, p_all_cal)
    # Dividing logits by a positive T is monotonic, so the AUC within a single
    # fold is unchanged. The pooled values can still differ because each fold
    # uses its own T, which can reorder scores across folds.
    print(f"[RESULT] Global ROC-AUC raw   (LOSO, LSTM, FMS>=1) = {rocauc_raw:.4f}")
    print(f"[RESULT] Global ROC-AUC calib (LOSO, LSTM, FMS>=1) = {rocauc_cal:.4f}")

    rocauc = rocauc_cal  # the calibrated value is reported as the headline number

    # ---- Save results ----
    # 1) ROC-AUC summary (including hyper-parameters).
    result_path = OUT_DIR / "Cell1_LSTM_LOSO_ROCAUC.csv"
    df_result = pd.DataFrame(
        {
            "ROC_AUC_global": [rocauc],
            "ROC_AUC_global_raw": [rocauc_raw],
            "ROC_AUC_global_calib": [rocauc_cal],
            "N_total": [n_total],
            "N_pos": [n_pos],
            "N_neg": [n_neg],
            "pos_ratio": [pos_ratio],
            "N_features": [N_FEATURES],
            "feature_list": [",".join(FEATURE_COLS)],
            # Record every tunable hyper-parameter alongside the score.
            "WINDOW_SEC": [WINDOW_SEC],
            "SLIDE_STEP_SEC": [SLIDE_STEP_SEC],
            "SEQ_LEN": [SEQ_LEN],
            "HIDDEN_SIZE": [HIDDEN_SIZE],
            "FC_HIDDEN_SIZE": [FC_HIDDEN_SIZE],
            "DROPOUT_LSTM": [DROPOUT_LSTM],
            "DROPOUT_FC": [DROPOUT_FC],
            "LEARNING_RATE": [LEARNING_RATE],
            "BATCH_SIZE": [BATCH_SIZE],
            "N_EPOCHS": [N_EPOCHS],
            "WEIGHT_DECAY": [WEIGHT_DECAY],
            "TEMP_MAX_ITER": [TEMP_MAX_ITER],
            "TEMP_LR": [TEMP_LR],
            "LABEL_SMOOTHING": [LABEL_SMOOTHING],
        }
    )
    df_result.to_csv(result_path, index=False)
    print(f"[INFO] Saved ROC-AUC result to: {result_path}")

    # 2) Per-sequence detailed predictions.
    df_pred = _save_prediction_details(pred_rows)

    # 2.5) Probability-distribution plots (calibrated probability).
    plot_probability_distributions(df_pred, PROB_PLOT_DIR)

    # 3) Per-fold summary (per-subject ROC-AUC).
    df_fold_summary = _save_fold_summary(fold_summary_rows)

    # 4) Training loss per epoch.
    _save_epoch_losses(epoch_loss_records)

    # ---- Finally, print each subject with its calibrated ROC-AUC in parentheses ----
    print("\n[SUMMARY] ===== Per-subject ROC-AUC (LOSO, calibrated) =====")
    print(f"[SUMMARY] Global ROC-AUC (all folds combined, calibrated) = {rocauc:.4f}")

    fold_auc = df_fold_summary["ROC_AUC_test"]
    good_mask = fold_auc.notna() & (fold_auc > 0.5)
    bad_mask = fold_auc.notna() & (fold_auc <= 0.5)
    nan_mask = fold_auc.isna()

    def format_sid_list(mask) -> str:
        """Render the selected folds as ``"sid(auc), ..."`` or "なし" when empty."""
        rows = df_fold_summary.loc[mask, ["SubjectID", "ROC_AUC_test"]]
        if rows.empty:
            return "なし"
        return ", ".join(
            f"{sid}({auc:.3f})"
            for sid, auc in zip(rows["SubjectID"], rows["ROC_AUC_test"])
        )

    good_str = format_sid_list(good_mask)
    bad_str = format_sid_list(bad_mask)
    nan_sids = df_fold_summary.loc[nan_mask, "SubjectID"].tolist()
    nan_str = ", ".join(nan_sids) if len(nan_sids) > 0 else "なし"

    print(f"[SUMMARY] よく当たっている被験者(>0.5): {good_str}")
    print(f"[SUMMARY] あまり当たっていない被験者(<=0.5): {bad_str}")
    print(f"[SUMMARY] 評価不能(ROC-AUC算出不可): {nan_str}")


def _save_prediction_details(pred_rows: List[pd.DataFrame]) -> pd.DataFrame:
    """Concatenate the per-fold prediction tables, write them to OUT_DIR and return them."""
    df_pred = pd.concat(pred_rows, ignore_index=True)
    pred_path = OUT_DIR / "Cell1_LSTM_LOSO_pred_detail.csv"
    df_pred.to_csv(pred_path, index=False)
    print(f"[INFO] Saved per-sequence predictions to: {pred_path}")
    return df_pred


def _save_fold_summary(fold_summary_rows: List[Dict]) -> pd.DataFrame:
    """Write the per-fold summary CSV to OUT_DIR and return the summary table."""
    df_fold_summary = pd.DataFrame(fold_summary_rows)
    # Backward compatibility: expose the calibrated AUC under ROC_AUC_test too.
    df_fold_summary["ROC_AUC_test"] = df_fold_summary["ROC_AUC_test_calib"]

    fold_summary_path = OUT_DIR / "Cell1_LSTM_LOSO_fold_summary.csv"
    df_fold_summary.to_csv(fold_summary_path, index=False)
    print(f"[INFO] Saved per-fold summary to: {fold_summary_path}")
    return df_fold_summary


def _save_epoch_losses(epoch_loss_records: List[Dict]) -> None:
    """Write the per-epoch training-loss log of every fold to OUT_DIR."""
    df_loss = pd.DataFrame(epoch_loss_records)
    loss_path = OUT_DIR / "Cell1_LSTM_LOSO_train_loss_by_epoch.csv"
    df_loss.to_csv(loss_path, index=False)
    print(f"[INFO] Saved train loss by epoch to: {loss_path}")


# Script entry point: call main() only when __name__ is "__main__"
# (i.e. the code is executed directly rather than imported as a module).
if __name__ == "__main__":
    main()
