In [5]:
# === Cell 1: setup & utilities (load from summary_scores.csv) ===
from __future__ import annotations
import os
import math
from typing import Dict, List, Tuple, Optional
import numpy as np
import pandas as pd
from scipy import stats
import matplotlib.pyplot as plt

# ===================== CONFIG =====================
# All inputs and outputs live under BASE_DIR.
BASE_DIR: str = r"C:\Users\taiki\OneDrive - Science Tokyo\デスクトップ\研究\実験結果"
INPUT_SUMMARY: str = os.path.join(BASE_DIR, "summary_scores.csv")  # fixed input file, always read from here
OUT_PLOTS_CORR: str = os.path.join(BASE_DIR, "ANALYSIS", "plots", "correlation")
OUT_PLOTS_DIST: str = os.path.join(BASE_DIR, "ANALYSIS", "plots", "distribution")
OUT_TABLES: str = os.path.join(BASE_DIR, "ANALYSIS", "tables")
CORR_TABLE_PATH: str = os.path.join(OUT_TABLES, "correlation_summary.csv")
REG_TABLE_PATH: str = os.path.join(OUT_TABLES, "regression_summary.csv")

# (subject id, name suffix) pairs; later cells concatenate the two parts to
# build each subject's folder name under BASE_DIR.
subjects = [
    ("0521", "因幡先生"),
    ("06021", "今村さん"),
    ("06022", "梅野さん"),
    ("06271", ""),
    ("06272", ""),
    ("06273", ""),
    ("06274", ""),
    ("06275", "")
]


# Figure styling (project conventions)
TITLE_FSIZE = 30
LABEL_FSIZE = 24
LEGEND_FSIZE = 20
TICK_FSIZE = 20
LINEWIDTH = 1.5

# Table-writing behaviour
OVERWRITE_TABLES: bool = True  # tables are reset when Cell 2 runs, then appended to

# Column-name aliases (matching the columns produced by the aggregation script).
# Keys are the canonical names used in this notebook; values are accepted
# spellings, compared after normalization by `_normalize`.
ALIASES: Dict[str, List[str]] = {
    "subject_id":  ["id", "subject_id", "subject"],
    "MSSQ":        ["mssq", "mssq_total", "mssq score", "mssq_total_score"],
    "VIMSSQ":      ["vimssq", "vims_susceptibility", "vims susceptibility", "mssq_v"],
    "SSQ_TOTAL":   ["ssq_total", "ssq total", "ssq sum", "ssq_sum", "ssqtotal", "ssq_total_score", "ssq_totalvalue", "SSQ_Total"],
    "SSQ_NAUSEA":  ["ssq_nausea", "nausea", "ssq nausea", "SSQ_Nausea"],
    "FMS_MAX":     ["max_fms", "fmsmax", "MAX_FMS"],
}

N_MIN: int = 3  # minimum sample size for correlation / regression

# ===================== UTILS =====================
def _normalize(s: str) -> str:
    import re
    return re.sub(r"[^0-9a-z_]", "", s.strip().lower().replace(" ", "_"))

def standardize_columns(df: pd.DataFrame, aliases: Dict[str, List[str]]) -> pd.DataFrame:
    """Rename recognised columns of *df* to their canonical names.

    For each canonical name in *aliases*, the canonical name itself is tried
    first and then its aliases in order; the first one that matches a column
    (after `_normalize` is applied to both sides) decides which column is
    renamed. Columns that match nothing are left as they are.

    Returns the DataFrame produced by ``df.rename``.
    """
    # normalized spelling -> original column label
    by_normalized: Dict[str, str] = {}
    for original in df.columns:
        by_normalized[_normalize(original)] = original

    renames: Dict[str, str] = {}
    for canonical, alternatives in aliases.items():
        normalized_candidates = [_normalize(name) for name in [canonical] + alternatives]
        matched = [k for k in normalized_candidates if k in by_normalized]
        if matched:
            # only the first matching spelling counts
            renames[by_normalized[matched[0]]] = canonical
    return df.rename(columns=renames)

def ensure_dirs(*paths: str) -> None:
    """Create every directory in *paths*, including missing parents.

    Directories that already exist are left alone. Empty strings are skipped:
    ``os.path.dirname`` returns ``""`` for a bare filename, and passing that to
    ``os.makedirs`` would raise even though the current directory needs no
    creation.
    """
    for p in paths:
        if not p:
            continue
        os.makedirs(p, exist_ok=True)

def fisher_z_ci(r: float, n: int, alpha: float = 0.05) -> Tuple[float, float]:
    """Confidence interval for a correlation coefficient via Fisher's z.

    Args:
        r: correlation coefficient.
        n: number of observations behind *r*.
        alpha: two-sided significance level (0.05 gives a 95% interval).

    Returns:
        ``(low, high)``. Both are NaN when ``n <= 3``, when *r* is not
        finite, or when ``|r| >= 0.999999``.
    """
    limit = 0.999999
    usable = n > 3 and np.isfinite(r) and abs(r) < limit
    if not usable:
        return (np.nan, np.nan)

    centre = np.arctanh(np.clip(r, -limit, limit))
    # normal critical value times the standard error 1/sqrt(n - 3)
    half_width = stats.norm.ppf(1 - alpha / 2) * (1.0 / math.sqrt(n - 3))
    return (np.tanh(centre - half_width), np.tanh(centre + half_width))

def compute_correlation(x: np.ndarray, y: np.ndarray) -> Tuple[int, float, float, float, float]:
    """Pearson correlation of *x* and *y* over pairs where both are finite.

    Returns:
        ``(n, r, p, r_ci_low, r_ci_high)`` where *n* is the number of finite
        pairs and the interval comes from `fisher_z_ci`. When fewer than
        ``N_MIN`` pairs remain, every statistic is NaN.
    """
    valid = np.isfinite(x) & np.isfinite(y)
    xs = x[valid]
    ys = y[valid]
    count = xs.shape[0]

    if count >= N_MIN:
        coef, pval = stats.pearsonr(xs, ys)
        ci_low, ci_high = fisher_z_ci(coef, count)
        return (count, coef, pval, ci_low, ci_high)
    return (count, np.nan, np.nan, np.nan, np.nan)

def compute_regression(x: np.ndarray, y: np.ndarray) -> Tuple[int, float, float, float, float, float]:
    """Simple linear regression of *y* on *x* over pairs where both are finite.

    Returns:
        ``(n, slope, intercept, r, r_squared, p_value)``. The statistics are
        NaN when fewer than ``N_MIN`` pairs remain or when either variable is
        constant.
    """
    keep = np.isfinite(x) & np.isfinite(y)
    xs, ys = x[keep], y[keep]
    count = xs.shape[0]

    # the size test comes first so np.std never sees an empty array
    degenerate = count < N_MIN or np.std(xs) == 0 or np.std(ys) == 0
    if degenerate:
        return (count, np.nan, np.nan, np.nan, np.nan, np.nan)

    fit = stats.linregress(xs, ys)
    r_value = fit.rvalue
    return (count, fit.slope, fit.intercept, r_value, r_value**2, fit.pvalue)

def plot_scatter_with_fit(df: pd.DataFrame, x_col: str, y_col: str, out_png: str,
                          title: Optional[str] = None,
                          annotate_stats: Optional[Tuple[int, float, float]] = None) -> bool:
    """Save a scatter plot of *y_col* against *x_col* with a least-squares line.

    Args:
        df: source table; both columns are coerced to numeric and rows where
            either value is not finite are dropped.
        x_col, y_col: column names, also used as axis labels.
        out_png: output image path; its parent directory is created if needed.
        title: plot title, defaulting to ``"<x_col> vs <y_col>"``.
        annotate_stats: optional ``(n, r, p)`` written in the lower-right
            corner of the axes.

    Returns:
        True when the image was written. False when the data were insufficient
        or constant, or when any step raised (the error is printed as a
        ``[SKIP]`` line instead of propagating).
    """
    fig = None
    try:
        x = pd.to_numeric(df[x_col], errors="coerce").to_numpy()
        y = pd.to_numeric(df[y_col], errors="coerce").to_numpy()
        mask = np.isfinite(x) & np.isfinite(y)
        x_, y_ = x[mask], y[mask]
        if x_.size < N_MIN or np.std(x_) == 0 or np.std(y_) == 0:
            print(f"[SKIP] {x_col} vs {y_col}: insufficient or constant data")
            return False
        reg = stats.linregress(x_, y_)
        xfit = np.linspace(x_.min(), x_.max(), 100)
        yfit = reg.slope * xfit + reg.intercept

        fig, ax = plt.subplots(figsize=(7, 5))
        ax.scatter(x_, y_, s=60, alpha=0.7)
        ax.plot(xfit, yfit, linewidth=LINEWIDTH)
        ax.set_title(title or f"{x_col} vs {y_col}", fontsize=TITLE_FSIZE)
        ax.set_xlabel(x_col, fontsize=LABEL_FSIZE)
        ax.set_ylabel(y_col, fontsize=LABEL_FSIZE)
        ax.tick_params(axis="both", labelsize=TICK_FSIZE)
        if annotate_stats is not None:
            n, r, p = annotate_stats
            ax.text(0.98, 0.02, f"N={n}, r={r:.3f}, p={p:.3g}",
                    transform=ax.transAxes, ha="right", va="bottom", fontsize=LEGEND_FSIZE)

        # dirname is "" for a bare filename; there is nothing to create then
        out_dir = os.path.dirname(out_png)
        if out_dir:
            ensure_dirs(out_dir)
        fig.tight_layout()
        fig.savefig(out_png, dpi=200, bbox_inches="tight")
        print(f"[OK]  {x_col} vs {y_col} -> {out_png}")
        return True
    except Exception as e:
        print(f"[SKIP] {x_col} vs {y_col}: {e}")
        return False
    finally:
        # close the figure on every path so a failed save does not leak it
        if fig is not None:
            plt.close(fig)

def init_tables(corr_csv: str, reg_csv: str, overwrite: bool) -> None:
    """Prepare the summary tables for a fresh run.

    Ensures the ``OUT_TABLES`` directory exists and, when *overwrite* is true,
    deletes *corr_csv* and *reg_csv* so later appends start from an empty
    file. A file that does not exist is ignored. Any other deletion failure
    is reported as a ``[WARN]`` line rather than raised, because the next
    append would otherwise silently extend the stale table.
    """
    ensure_dirs(OUT_TABLES)
    if not overwrite:
        return
    for p in (corr_csv, reg_csv):
        try:
            os.remove(p)
        except FileNotFoundError:
            pass  # nothing to reset
        except OSError as e:
            print(f"[WARN] could not remove {p}: {e}")

def _append_row_to_csv(path: str, row: Dict) -> None:
    """Append *row* as one CSV line to *path*, writing a header when the file is new or empty."""
    write_header = (not os.path.exists(path)) or os.path.getsize(path) == 0
    pd.DataFrame([row]).to_csv(path, mode="a", index=False, header=write_header)

def append_corr_row(path: str, row: Dict) -> None:
    """Append one correlation-summary row (column name -> value) to the CSV at *path*."""
    _append_row_to_csv(path, row)

def append_reg_row(path: str, row: Dict) -> None:
    """Append one regression-summary row (column name -> value) to the CSV at *path*."""
    _append_row_to_csv(path, row)

# ===================== LOAD DATA =====================
ensure_dirs(OUT_PLOTS_CORR, OUT_PLOTS_DIST, OUT_TABLES)

# df_in stays None unless the summary CSV was both read and standardized;
# later cells use None as the "nothing to analyse" signal.
df_in: Optional[pd.DataFrame] = None
if os.path.exists(INPUT_SUMMARY):
    try:
        df_in = pd.read_csv(INPUT_SUMMARY)
        print(f"[OK]  Load -> {INPUT_SUMMARY}")
        df_in = standardize_columns(df_in, ALIASES)
    except Exception as e:
        # Drop a frame that was read but could not be standardized, so later
        # cells never run on un-renamed columns after a reported [SKIP].
        df_in = None
        print(f"[SKIP] Load: {INPUT_SUMMARY} ({e})")
else:
    print(f"[SKIP] Load: not found {INPUT_SUMMARY}")


[OK]  Load -> C:\Users\taiki\OneDrive - Science Tokyo\デスクトップ\研究\実験結果\summary_scores.csv


In [6]:
# === Cell 2 : styled scatter with regression & 95% CI, labels, cutoffs ===
from typing import Optional, Tuple
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import os

# Cell 1 leaves df_in as None when the summary CSV could not be loaded;
# nothing in this cell can run without it, so stop here.
if df_in is None:
    raise SystemExit("[SKIP] No input DataFrame loaded in Cell 1.")

# ---------- helpers: labels / colors / display names ----------
def mssq_label(v: float) -> str:
    """Map an MSSQ score to the band label used to group points in the plots.

    Bands are closed on the right: ``<= 11.3`` Low, ``<= 17.9`` Medium,
    ``<= 25.9`` High, anything larger Very High. NaN yields "Unclassified";
    without that check NaN would fail every comparison and be reported as
    "Very High".
    """
    if np.isnan(v):
        return "Unclassified"
    if v <= 11.3:  return "Low"
    if v <= 17.9:  return "Medium"
    if v <= 25.9:  return "High"
    return "Very High"

label_order = ["Low", "Medium", "High", "Very High"]

# Colours approximating seaborn's Set2 palette (specified by hand)
SET2 = {
    "Low":       "#66c2a5",  # greenish
    "Medium":    "#fc8d62",  # orange
    "High":      "#8da0cb",  # blue
    "Very High": "#e78ac3",  # pink
}

VIM_COL = {"Low": "#66c2a5", "High": "#e78ac3"}  # two colours for the 7-point threshold

# Internal column name -> display name (used for axis labels and file names)
DISPLAY = {
    "MSSQ": "MSSQ",
    "VIMSSQ": "VIMSSQ",
    "SSQ_TOTAL": "SSQ_Total",
    "SSQ_NAUSEA": "SSQ_Nausea",
    "FMS_MAX": "MAX_FMS",
}

# ---------- prepare categorical columns ----------
df_plot = df_in.copy()
if "MSSQ" in df_plot.columns:
    # Coerce like the rest of the notebook does, so a stray non-numeric cell
    # becomes NaN instead of aborting the whole cell.
    df_plot["MSSQ_label"] = (
        pd.to_numeric(df_plot["MSSQ"], errors="coerce").astype(float).apply(mssq_label)
    )

if "VIMSSQ" in df_plot.columns:
    # Missing scores get their own label; NaN < 7 is False, which would
    # otherwise file them under "High".
    df_plot["VIMSSQ_label"] = (
        pd.to_numeric(df_plot["VIMSSQ"], errors="coerce").astype(float)
        .apply(lambda v: "Unclassified" if np.isnan(v) else ("Low" if v < 7 else "High"))
    )

# ID column (prefer subject_id, fall back to ID)
ID_COL = "subject_id" if "subject_id" in df_plot.columns else ("ID" if "ID" in df_plot.columns else None)

# ---------- math: regression & 95% CI for the mean prediction ----------
def regression_ci(x: np.ndarray, y: np.ndarray, x_grid: np.ndarray, alpha: float=0.05
                 ) -> Tuple[float, float, float, np.ndarray, np.ndarray, np.ndarray]:
    """OLS fit (with intercept) and a confidence band for the mean prediction.

    Args:
        x, y: paired observations.
        x_grid: x positions at which the fitted line and band are evaluated.
        alpha: two-sided significance level (0.05 gives a 95% band).

    Returns:
        ``(slope, intercept, r, yhat_grid, lo_grid, hi_grid)``. When fewer
        than three points are given or *x* has no spread, the three scalars
        are NaN and the three arrays are NaN-filled with the shape of
        *x_grid*, so callers can always unpack six values.
    """
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)
    x_grid = np.asarray(x_grid, dtype=float)
    n = x.size

    def _degenerate():
        # three independent float arrays, regardless of the grid's input dtype
        return (np.nan, np.nan, np.nan,
                np.full(x_grid.shape, np.nan),
                np.full(x_grid.shape, np.nan),
                np.full(x_grid.shape, np.nan))

    # check the size before taking means so empty input never reaches them
    if n < 3:
        return _degenerate()

    xbar = x.mean(); ybar = y.mean()
    Sxx = np.sum((x - xbar) ** 2)
    if Sxx <= 0:
        return _degenerate()

    slope = np.sum((x - xbar)*(y - ybar)) / Sxx
    intercept = ybar - slope * xbar
    resid = y - (slope * x + intercept)
    s = np.sqrt(np.sum(resid**2) / (n - 2))  # residual standard deviation
    r, _p = stats.pearsonr(x, y)

    tcrit = stats.t.ppf(1 - alpha/2, df=n-2)

    y_grid = slope * x_grid + intercept
    # standard error of the fitted mean at each grid point
    se_grid = s * np.sqrt(1/n + ((x_grid - xbar)**2 / Sxx))
    lo_grid = y_grid - tcrit * se_grid
    hi_grid = y_grid + tcrit * se_grid
    return slope, intercept, r, y_grid, lo_grid, hi_grid

# ---------- plotter ----------
def plot_with_ci(df: pd.DataFrame, x_col: str, y_col: str, out_path: str) -> bool:
    """Save a styled scatter plot of *y_col* vs *x_col* with fit line and 95% band.

    Points are coloured by ``MSSQ_label`` when *x_col* is ``"MSSQ"`` and by
    ``VIMSSQ_label`` when it is ``"VIMSSQ"`` (if those columns exist), with
    dotted vertical lines at the band cut-offs. Each point is annotated with
    its ``ID_COL`` value when that column is available, and a dotted line at
    40 is drawn when *y_col* is ``"SSQ_TOTAL"``.

    Args:
        df: source table; both columns are coerced to numeric and rows where
            either value is not finite are ignored.
        x_col, y_col: internal column names; ``DISPLAY`` supplies axis labels.
        out_path: output image path; its parent directory is created if needed.

    Returns:
        True when the image was written, False when the data were
        insufficient or constant. Drawing or saving errors propagate, but the
        figure is closed first.
    """
    # data
    x = pd.to_numeric(df[x_col], errors="coerce").to_numpy()
    y = pd.to_numeric(df[y_col], errors="coerce").to_numpy()
    m = np.isfinite(x) & np.isfinite(y)
    x, y = x[m], y[m]
    if x.size < N_MIN or np.std(x) == 0 or np.std(y) == 0:
        print(f"[SKIP] {x_col} vs {y_col}: insufficient or constant data")
        return False

    # regression & CI
    x_grid = np.linspace(x.min(), x.max(), 300)
    _slope, _intercept, _r, y_grid, lo_grid, hi_grid = regression_ci(x, y, x_grid, alpha=0.05)
    r_val, p_val = stats.pearsonr(x, y)

    # which label column (if any) colours the points
    if x_col == "MSSQ" and "MSSQ_label" in df.columns:
        group_col, group_labels, palette = "MSSQ_label", label_order, SET2
    elif x_col == "VIMSSQ" and "VIMSSQ_label" in df.columns:
        group_col, group_labels, palette = "VIMSSQ_label", ["Low", "High"], VIM_COL
    else:
        group_col, group_labels, palette = None, [], {}

    fig, ax = plt.subplots(figsize=(8, 6))
    try:
        # CI band
        if np.all(np.isfinite(lo_grid)):
            ax.fill_between(x_grid, lo_grid, hi_grid, color="gray", alpha=0.30, label="95% CI")

        # scatter by groups
        if group_col is None:
            ax.scatter(x, y, s=90, alpha=0.9)
        else:
            for lab in group_labels:
                sub = df[df[group_col] == lab]
                xs = pd.to_numeric(sub[x_col], errors="coerce").to_numpy()
                ys = pd.to_numeric(sub[y_col], errors="coerce").to_numpy()
                mm = np.isfinite(xs) & np.isfinite(ys)
                ax.scatter(xs[mm], ys[mm], s=90, alpha=0.9, label=lab, color=palette[lab])
            if group_col == "MSSQ_label":
                # each cut-off line takes the colour of the band that starts there
                for xv, lab in zip([11.3, 17.9, 25.9], ["Medium", "High", "Very High"]):
                    ax.axvline(xv, linestyle="dotted", linewidth=LINEWIDTH, color=SET2[lab])
            else:
                ax.axvline(7.0, linestyle="dotted", linewidth=LINEWIDTH, color="gray")

        # regression line
        if np.all(np.isfinite(y_grid)):
            ax.plot(x_grid, y_grid, linestyle="--", linewidth=LINEWIDTH, color="black", label="Regression line")

        # ID labels (small), nudged by 1% of the data range
        if ID_COL is not None and ID_COL in df.columns:
            dx = 0.01 * (x.max() - x.min() if x.max() > x.min() else 1.0)
            dy = 0.01 * (y.max() - y.min() if y.max() > y.min() else 1.0)
            for _, row in df[m].iterrows():
                try:
                    xv = float(row[x_col]); yv = float(row[y_col])
                except (TypeError, ValueError):
                    continue  # value not convertible; leave this point unlabeled
                ax.text(xv + dx, yv + dy, str(row[ID_COL]), fontsize=10, color="black")

        # SSQ_Total threshold
        if y_col == "SSQ_TOTAL":
            ax.axhline(40, color="black", linestyle="dotted", linewidth=LINEWIDTH)
            xmin = np.nanmin(x)
            ax.text(xmin, 40 + 1.5, "High Motion-Sickness", fontsize=12, color="black")

        # stats box
        ax.text(0.05, 0.95, f"r = {r_val:.3f}\np = {p_val:.4f}",
                transform=ax.transAxes, ha="left", va="top",
                fontsize=16, bbox=dict(facecolor="white", edgecolor="gray", alpha=0.7))

        # axes / legend (small)
        ax.set_xlabel(DISPLAY.get(x_col, x_col), fontsize=LABEL_FSIZE)
        ax.set_ylabel(DISPLAY.get(y_col, y_col), fontsize=LABEL_FSIZE)
        ax.set_title(f"{DISPLAY.get(x_col, x_col)} vs {DISPLAY.get(y_col, y_col)}", fontsize=TITLE_FSIZE)
        ax.tick_params(axis="both", labelsize=TICK_FSIZE)
        ax.grid(True)

        _, legend_labels = ax.get_legend_handles_labels()
        if legend_labels:
            ax.legend(
                loc="lower right",
                fontsize=12,
                title="label" if x_col in ("MSSQ", "VIMSSQ") else None,
                title_fontsize=12,
                handlelength=1.6, handletextpad=0.5,
                borderpad=0.3, labelspacing=0.4
            )

        # dirname is "" for a bare filename; there is nothing to create then
        out_dir = os.path.dirname(out_path)
        if out_dir:
            ensure_dirs(out_dir)
        fig.tight_layout()
        fig.savefig(out_path, dpi=200, bbox_inches="tight")
    finally:
        # always release the figure, even if drawing or saving failed
        plt.close(fig)
    print(f"[OK]  {x_col} vs {y_col} -> {out_path}")
    return True

# ---------- run & also record tables ----------
# Reset (or keep) the two summary CSVs before this cell appends to them.
init_tables(CORR_TABLE_PATH, REG_TABLE_PATH, overwrite=OVERWRITE_TABLES)

# (x column, y column) pairs to analyse, using the standardized column names.
pairs = [
    ("MSSQ",   "SSQ_TOTAL"),
    ("MSSQ",   "SSQ_NAUSEA"),
    ("MSSQ",   "FMS_MAX"),
    ("VIMSSQ", "SSQ_TOTAL"),
    ("VIMSSQ", "SSQ_NAUSEA"),
    ("VIMSSQ", "FMS_MAX"),
    ("MSSQ",   "VIMSSQ"),  # additional pair
]

for x_col, y_col in pairs:
    if (x_col not in df_plot.columns) or (y_col not in df_plot.columns):
        print(f"[SKIP] {x_col} vs {y_col}: missing column(s)")
        continue

    # stats rows
    x = pd.to_numeric(df_plot[x_col], errors="coerce").to_numpy()
    y = pd.to_numeric(df_plot[y_col], errors="coerce").to_numpy()
    n, r, p, r_lo, r_hi = compute_correlation(x, y)
    # Skip the pair entirely (no table rows, no figure) when r is unusable.
    if n < N_MIN or not np.isfinite(r):
        print(f"[SKIP] {x_col} vs {y_col}: insufficient data (N={n})")
        continue

    append_corr_row(CORR_TABLE_PATH, {
        "x_var": x_col, "y_var": y_col, "method": "pearson", "N": n,
        "r": r, "p_value": p, "r_ci_low": r_lo, "r_ci_high": r_hi, "note": ""
    })

    n2, slope, intercept, r_val, r2, p_lin = compute_regression(x, y)
    append_reg_row(REG_TABLE_PATH, {
        "x_var": x_col, "y_var": y_col, "slope": slope, "intercept": intercept,
        "r_value": r_val, "r_squared": r2, "p_value": p_lin, "N": n2
    })

    # figure (file name is built from the display names)
    out_png = os.path.join(OUT_PLOTS_CORR, f"{DISPLAY.get(x_col, x_col)}_vs_{DISPLAY.get(y_col, y_col)}.png")
    plot_with_ci(df_plot, x_col, y_col, out_png)

print(f"[OK]  Table(corr) -> {CORR_TABLE_PATH}")
print(f"[OK]  Table(reg)  -> {REG_TABLE_PATH}")


[OK]  MSSQ vs SSQ_TOTAL -> C:\Users\taiki\OneDrive - Science Tokyo\デスクトップ\研究\実験結果\ANALYSIS\plots\correlation\MSSQ_vs_SSQ_Total.png
[OK]  MSSQ vs SSQ_NAUSEA -> C:\Users\taiki\OneDrive - Science Tokyo\デスクトップ\研究\実験結果\ANALYSIS\plots\correlation\MSSQ_vs_SSQ_Nausea.png
[OK]  MSSQ vs FMS_MAX -> C:\Users\taiki\OneDrive - Science Tokyo\デスクトップ\研究\実験結果\ANALYSIS\plots\correlation\MSSQ_vs_MAX_FMS.png
[OK]  VIMSSQ vs SSQ_TOTAL -> C:\Users\taiki\OneDrive - Science Tokyo\デスクトップ\研究\実験結果\ANALYSIS\plots\correlation\VIMSSQ_vs_SSQ_Total.png
[OK]  VIMSSQ vs SSQ_NAUSEA -> C:\Users\taiki\OneDrive - Science Tokyo\デスクトップ\研究\実験結果\ANALYSIS\plots\correlation\VIMSSQ_vs_SSQ_Nausea.png
[OK]  VIMSSQ vs FMS_MAX -> C:\Users\taiki\OneDrive - Science Tokyo\デスクトップ\研究\実験結果\ANALYSIS\plots\correlation\VIMSSQ_vs_MAX_FMS.png
[OK]  MSSQ vs VIMSSQ -> C:\Users\taiki\OneDrive - Science Tokyo\デスクトップ\研究\実験結果\ANALYSIS\plots\correlation\MSSQ_vs_VIMSSQ.png
[OK]  Table(corr) -> C:\Users\taiki\OneDrive - Science Tokyo\デスクトップ\研究\実験結果\ANALY

In [7]:
# === Cell 3: MaxFMS vs SSQ plots (red OR-region, no CI) ===
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.patches import Rectangle
from scipy.stats import pearsonr, linregress

# Thresholds
X_THRESH = 1.9   # MaxFMS
Y_THRESH = 19.0  # SSQ (19 is used here for both Total and Nausea)

# Per-subject accumulators, filled in parallel (one entry per loaded subject)
fms_peaks, ssq_totals, ssq_nauseas, labels, groups = [], [], [], [], []

def classify(max_fms: float, ssq_value: float) -> str:
    """Assign a subject to "Sick", "Non-Sick" or "Unclassified".

    A subject is "Sick" when either score reaches its threshold
    (``max_fms >= X_THRESH`` or ``ssq_value >= Y_THRESH``) and "Non-Sick"
    otherwise. If either score is not finite the result is "Unclassified".
    """
    if np.isfinite(max_fms) and np.isfinite(ssq_value):
        over_threshold = max_fms >= X_THRESH or ssq_value >= Y_THRESH
        if over_threshold:
            return "Sick"
        return "Non-Sick"
    return "Unclassified"

print("【各被験者の MaxFMS・SSQ_Total・SSQ_Nausea】")
print("-" * 60)
for subject_id, person_name in subjects:
    # Each subject's files live in <BASE_DIR>/<id><name>/epoch_summary/
    folder = os.path.join(BASE_DIR, f"{subject_id}{person_name}", "epoch_summary")
    scores_csv = os.path.join(folder, f"{subject_id}_scores.csv")
    epoch_csv  = os.path.join(folder, f"{subject_id}_all_epoch_summary.csv")
    try:
        score_df = pd.read_csv(scores_csv)
        epoch_df = pd.read_csv(epoch_csv)

        # Max FMS: use the "FMS" column when it exists and otherwise fall back
        # to the third column, the same rule the FMS-distribution cell applies.
        fms_series = pd.to_numeric(
            epoch_df["FMS"] if "FMS" in epoch_df.columns else epoch_df.iloc[:, 2],
            errors="coerce",
        ).dropna()
        max_fms = float(fms_series.max()) if not fms_series.empty else np.nan

        # Scores are read from the first row of the per-subject scores file.
        ssq_total  = float(pd.to_numeric(score_df.loc[0, "SSQ_Total"],  errors="coerce"))
        ssq_nausea = float(pd.to_numeric(score_df.loc[0, "SSQ_Nausea"], errors="coerce"))

        # Grouping is based on MaxFMS and SSQ_Total (not SSQ_Nausea).
        grp = classify(max_fms, ssq_total)

        # Append to all five lists together so they stay index-aligned.
        fms_peaks.append(max_fms)
        ssq_totals.append(ssq_total)
        ssq_nauseas.append(ssq_nausea)
        labels.append(subject_id)
        groups.append(grp)

        print(f"{subject_id} {person_name:<6}: MaxFMS={max_fms:.1f}, SSQ_Total={ssq_total:.1f}, SSQ_Nausea={ssq_nausea:.1f} -> {grp}")
    except Exception as e:
        # A subject that fails to load is reported and left out of every list.
        print(f"{subject_id} {person_name:<6}: 読み込みエラー: {e}")

print("-" * 60)

# Point colours per group
COLORS = {"Sick": "red", "Non-Sick": "blue", "Unclassified": "gray"}

def plot_corr(x, y, x_label, y_label, title, out_png):
    """Save a MaxFMS-vs-SSQ scatter plot with threshold background and fit line.

    The axes background is tinted red, with a blue rectangle covering the
    region below both ``X_THRESH`` and ``Y_THRESH``. Points are coloured by
    the module-level ``groups`` list and annotated with the module-level
    ``labels`` list; both must be index-aligned with *x* and *y*. After
    saving, one row is appended to each summary table when the append helpers
    are defined.

    Args:
        x, y: paired values; pairs with a non-finite member are dropped.
        x_label, y_label: axis labels, also written to the tables with spaces
            replaced by underscores.
        title: plot title, also used in ``[SKIP]`` messages.
        out_png: output image path.

    Returns:
        None. Nothing is drawn when fewer than two valid pairs remain or when
        every x value is identical (no regression line can be fitted).
    """
    # ---- keep valid pairs only (arrays, one shared mask) ----
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)
    m = np.isfinite(x) & np.isfinite(y)
    x, y = x[m], y[m]
    if x.size < 2:
        print(f"[SKIP] {title}: data < 2")
        return
    if np.std(x) == 0:
        # linregress raises when all x values are identical
        print(f"[SKIP] {title}: constant x data")
        return

    # filter labels / groups with the same mask
    groups_f = np.asarray(groups, dtype=object)[m]
    labels_f = np.asarray(labels, dtype=object)[m]

    # ---- correlation & regression ----
    r, p = pearsonr(x, y)
    slope, intercept, *_ = linregress(x, y)

    # ---- axis ranges ----
    x_min, x_max = -0.5, 4.5
    y_pad = max(5.0, 0.1 * (y.max() - y.min() + 1e-9))
    y_min = np.floor(min(0.0, y.min() - y_pad))
    y_max = np.ceil(y.max() + y_pad)

    # offsets for the ID labels, as a fraction of the axis ranges
    dx = 0.03 * (x_max - x_min)
    dy = 0.02 * (y_max - y_min)

    fig, ax = plt.subplots(figsize=(6, 6))
    try:
        # background: red everywhere (OR region), blue only in the lower-left
        # rectangle where neither threshold is reached
        ax.set_facecolor("mistyrose")
        rect = Rectangle((x_min, y_min), X_THRESH - x_min, Y_THRESH - y_min,
                         facecolor="lightblue", alpha=0.35, zorder=0)
        ax.add_patch(rect)

        # ---- scatter, one series per group ----
        for grp in ["Sick", "Non-Sick", "Unclassified"]:
            idx = np.where(groups_f == grp)[0]
            if idx.size == 0:
                continue
            ax.scatter(x[idx], y[idx], s=90, color=COLORS[grp], label=grp)

            # ID labels (small)
            for j in idx:
                ax.text(x[j] + dx, y[j] + dy, str(labels_f[j]), fontsize=10, color="black")

        # ---- regression line across the full x range ----
        x_line = np.linspace(x_min, x_max, 100)
        y_line = slope * x_line + intercept
        ax.plot(x_line, y_line, linestyle="--", color="gray",
                label="Regression Line", linewidth=LINEWIDTH)

        # r and p in the lower-right area
        ax.text(0.70, 0.15, f"r = {r:.2f}\np = {p:.3f}",
                transform=ax.transAxes, fontsize=16,
                bbox=dict(facecolor="white", edgecolor="gray", alpha=0.7))

        # ---- styling ----
        ax.set_xlabel(x_label, fontsize=LABEL_FSIZE)
        ax.set_ylabel(y_label, fontsize=LABEL_FSIZE)
        ax.set_title(title, fontsize=TITLE_FSIZE)
        ax.set_xlim(x_min, x_max)
        ax.set_ylim(y_min, y_max)
        ax.set_xticks([0, 1, 2, 3, 4])
        ax.tick_params(axis="both", labelsize=TICK_FSIZE)
        ax.grid(True)
        ax.legend(loc="upper left", fontsize=12)

        fig.tight_layout()
        ensure_dirs(OUT_PLOTS_CORR)
        fig.savefig(out_png, dpi=200, bbox_inches="tight")
    finally:
        # always release the figure, even if drawing or saving failed
        plt.close(fig)
    print(f"[OK]  Plot -> {out_png}")

    # ---- append to the tables (optional) ----
    try:
        n = x.size
        append_corr_row(CORR_TABLE_PATH, {
            "x_var": x_label.replace(" ", "_"),
            "y_var": y_label.replace(" ", "_"),
            "method": "pearson", "N": n,
            "r": r, "p_value": p, "r_ci_low": np.nan, "r_ci_high": np.nan, "note": "bg OR region; no CI"
        })
        append_reg_row(REG_TABLE_PATH, {
            "x_var": x_label.replace(" ", "_"),
            "y_var": y_label.replace(" ", "_"),
            "slope": slope, "intercept": intercept,
            "r_value": r, "r_squared": r**2, "p_value": p, "N": n
        })
    except NameError:
        # the table helpers come from Cell 1; skip quietly if it was not run
        pass

# Run (two figures): MaxFMS against SSQ Total, then against SSQ Nausea.
plot_corr(fms_peaks, ssq_totals,
          x_label="Max FMS", y_label="SSQ Total",
          title="Max FMS vs SSQ Total",
          out_png=os.path.join(OUT_PLOTS_CORR, "MaxFMS_vs_SSQ_Total_thresh19.png"))

plot_corr(fms_peaks, ssq_nauseas,
          x_label="Max FMS", y_label="SSQ Nausea",
          title="Max FMS vs SSQ Nausea",
          out_png=os.path.join(OUT_PLOTS_CORR, "MaxFMS_vs_SSQ_Nausea_thresh19.png"))


【各被験者の MaxFMS・SSQ_Total・SSQ_Nausea】
------------------------------------------------------------
0521 因幡先生  : MaxFMS=2.0, SSQ_Total=11.2, SSQ_Nausea=9.5 -> Sick
06021 今村さん  : MaxFMS=2.0, SSQ_Total=26.2, SSQ_Nausea=28.6 -> Sick
06022 梅野さん  : MaxFMS=2.0, SSQ_Total=11.2, SSQ_Nausea=19.1 -> Sick
06271       : MaxFMS=2.0, SSQ_Total=52.4, SSQ_Nausea=38.2 -> Sick
06272       : MaxFMS=3.0, SSQ_Total=33.7, SSQ_Nausea=28.6 -> Sick
06273       : MaxFMS=3.0, SSQ_Total=29.9, SSQ_Nausea=47.7 -> Sick
06274       : MaxFMS=1.0, SSQ_Total=18.7, SSQ_Nausea=9.5 -> Non-Sick
06275       : MaxFMS=0.0, SSQ_Total=0.0, SSQ_Nausea=0.0 -> Non-Sick
------------------------------------------------------------
[OK]  Plot -> C:\Users\taiki\OneDrive - Science Tokyo\デスクトップ\研究\実験結果\ANALYSIS\plots\correlation\MaxFMS_vs_SSQ_Total_thresh19.png
[OK]  Plot -> C:\Users\taiki\OneDrive - Science Tokyo\デスクトップ\研究\実験結果\ANALYSIS\plots\correlation\MaxFMS_vs_SSQ_Nausea_thresh19.png


In [8]:
# === Cell 4: FMS score distribution with 80th percentile position ===
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

subfolder = "epoch_summary"

# ---- Concatenate every epoch of every subject ----
df_list = []
for subject_id, name in subjects:
    file_path = os.path.join(BASE_DIR, f"{subject_id}{name}", subfolder, f"{subject_id}_all_epoch_summary.csv")
    if os.path.exists(file_path):
        try:
            df = pd.read_csv(file_path)
            df["subject_id"] = subject_id
            df_list.append(df)
        except Exception as e:
            print(f"[SKIP] read: {file_path} ({e})")
    else:
        print(f"[SKIP] not found: {file_path}")

if not df_list:
    raise SystemExit("[SKIP] no epoch_summary CSVs")

combined_df = pd.concat(df_list, ignore_index=True)

# ---- Pick the FMS column (tolerates naming drift: without 'FMS', try the 3rd column) ----
if "FMS" in combined_df.columns:
    fms_series = pd.to_numeric(combined_df["FMS"], errors="coerce").dropna()
else:
    fms_series = pd.to_numeric(combined_df.iloc[:, 2], errors="coerce").dropna()

# Convert to integers (FMS is expected to be 0, 1, 2, 3 or 4).
# NOTE(review): astype(int) truncates any fractional score - confirm none occur.
fms_values = fms_series.astype(int).values
if fms_values.size == 0:
    raise SystemExit("[SKIP] no valid FMS values")

# ---- 80th percentile (both the interpolated value and the integer score) ----
percentile_80_value = float(np.percentile(fms_values, 80))
sorted_fms = np.sort(fms_values)
percentile_index = int(0.8 * len(sorted_fms))  # floor, matching the reference code
percentile_index = min(max(percentile_index, 0), len(sorted_fms)-1)
score_at_percentile = int(sorted_fms[percentile_index])

# ---- Count per score ----
fms_counts = pd.Series(fms_values).value_counts().sort_index()

# ---- Histogram (one bin per integer score 0-4) ----
bin_edges = np.arange(-0.5, 4.5 + 1, 1)  # [-0.5, 0.5, ..., 4.5]
fig, ax = plt.subplots(figsize=(7, 5))
n, bins, patches = ax.hist(fms_values, bins=bin_edges, edgecolor="black", rwidth=0.8)

# ---- Mark where the 80th-percentile sample sits inside its bar ----
target_score = score_at_percentile
count_in_bin = int(fms_counts.get(target_score, 0))
# rank of that sample among the samples sharing its score (0-based)
position_in_bin = int(percentile_index - np.sum(sorted_fms < target_score))
# The bins start at -0.5 with width 1, so bin index equals the score. The bar
# height is split evenly over its samples and the line drawn at that rank.
if 0 <= target_score < len(n) and count_in_bin > 0:
    step_height = n[target_score] / count_in_bin
    y_line = step_height * position_in_bin
    ax.hlines(y=y_line,
              xmin=target_score - 0.4, xmax=target_score + 0.4,
              color="red", linestyle="-", linewidth=LINEWIDTH,
              label=f"80th percentile in FMS = {target_score}")
else:
    y_line = None

# ---- Styling (shared figure constants from the setup cell) ----
ax.set_xticks(np.arange(0, 5, 1))
ax.set_xlabel("FMS Score", fontsize=LABEL_FSIZE)
ax.set_ylabel("Frequency", fontsize=LABEL_FSIZE)
ax.set_title("FMS Score Distribution", fontsize=TITLE_FSIZE)
ax.tick_params(axis="both", labelsize=TICK_FSIZE)
ax.grid(True)
# The percentile line is the only labelled artist; without it a legend would
# be empty and matplotlib would warn.
if y_line is not None:
    ax.legend(fontsize=12, loc="upper right")

ensure_dirs(OUT_PLOTS_DIST)
out_png = os.path.join(OUT_PLOTS_DIST, "FMS_histogram_80th_percentile_position.png")
fig.tight_layout()
fig.savefig(out_png, dpi=200, bbox_inches="tight")
plt.close(fig)
print(f"[OK]  FMS histogram (80th) -> {out_png}")

# ---- Print summary statistics ----
total = len(fms_values)
num_ge_threshold = int(np.sum(fms_values >= 2))
print("\n[FMS score stats]")
print(f"Total samples: {total}")
print(f"80th percentile (value): {percentile_80_value:.2f}")
print(f"80th percentile (score): {score_at_percentile}")
print(f"Count FMS >= 2: {num_ge_threshold} ({num_ge_threshold / total * 100:.1f}%)\n")

print("[Frequency by score]")
for score in range(0, 5):
    cnt = int(fms_counts.get(score, 0))
    pct = cnt / total * 100
    print(f"FMS = {score} : {cnt} ({pct:.1f}%)")


[OK]  FMS histogram (80th) -> C:\Users\taiki\OneDrive - Science Tokyo\デスクトップ\研究\実験結果\ANALYSIS\plots\distribution\FMS_histogram_80th_percentile_position.png

[FMS score stats]
Total samples: 160
80th percentile (value): 1.00
80th percentile (score): 1
Count FMS >= 2: 30 (18.8%)

[Frequency by score]
FMS = 0 : 81 (50.6%)
FMS = 1 : 49 (30.6%)
FMS = 2 : 25 (15.6%)
FMS = 3 : 5 (3.1%)
FMS = 4 : 0 (0.0%)
