In [None]:
# Mount Google Drive at /content/drive; the data and output paths used in the
# cells below all live under that mount point. Only works inside Google Colab.
from google.colab import drive
drive.mount('/content/drive')  

Mounted at /content/drive


### Ankle - Wrist - Dual

#### tsfel+4 (with Baseline)

In [None]:
# 折均值±std + CM(counts/normalized) + Report
# 导入
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import LeaveOneGroupOut
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import (
    accuracy_score, f1_score, balanced_accuracy_score,
    precision_score, recall_score, classification_report,
    confusion_matrix, ConfusionMatrixDisplay
)
import os
import warnings
warnings.filterwarnings("ignore")

# Data paths: the train and validation CSVs are concatenated into one DEV set
# by main().
TRAIN_CSV = "/content/drive/My Drive/final_project/ankle_wrist_only/train_filled.csv"
VAL_CSV = "/content/drive/My Drive/final_project/ankle_wrist_only/val_filled.csv"

# Column names: SUBJECT_COL supplies the LOSO groups, ACTIVITY_COL the labels.
SUBJECT_COL = "Subject"
ACTIVITY_COL = "Activity"

# Demographic features (categorical / numeric). Both lists are empty here, so
# no demographic columns are requested.
DEMOGRAPHIC_CAT_COLS = []
DEMOGRAPHIC_NUM_COLS = []

# Modalities to run, and the display name used for each in titles/file names.
MODALITIES_TO_RUN = ["wrist", "ankle", "dual"]
MODALITY_DISPLAY = {"wrist": "wrist", "ankle": "ankle", "dual": "dual"}

# Baseline: fixed hyperparameters (no tuning is performed in this cell).
SEED = 42
# Passed to PCA(n_components=...); as a float in (0, 1) scikit-learn treats it
# as the fraction of variance to retain.
PCA_N_COMPONENTS = 0.6
# Keyword arguments for the RandomForestClassifier used as the bagging base.
RF_PARAMS = dict(
    n_estimators = 200,
    max_depth = 3,
    min_samples_split = 2,
    min_samples_leaf  = 2,
    max_features = "sqrt",
    class_weight = "balanced",
    random_state = SEED,
    n_jobs = -1,
)
# Number of random forests inside the BaggingClassifier.
BAGGING_N_ESTIMATORS = 10

# Output directories for figures and CSV reports; created on import if missing.
OUT_DIR_FIGS = "/content/drive/My Drive/final_project/ankle_wrist_only/final_outputs/dev_figs"
OUT_DIR_REPORTS = "/content/drive/My Drive/final_project/ankle_wrist_only/final_outputs/dev_reports"
os.makedirs(OUT_DIR_FIGS, exist_ok=True)
os.makedirs(OUT_DIR_REPORTS, exist_ok=True)

# 工具函数
def pick_columns_by_prefix(df, prefixes):
    """Return the column labels of ``df`` that start with any of ``prefixes``.

    Args:
        df: Object exposing a ``columns`` iterable (e.g. a pandas DataFrame).
        prefixes: A single prefix string or an iterable of prefix strings.

    Returns:
        list: Matching column labels, in their original column order. Empty
        when ``prefixes`` is empty.
    """
    # A bare string would otherwise be iterated character by character and
    # match far more columns than intended.
    if isinstance(prefixes, str):
        prefixes = (prefixes,)
    wanted = tuple(prefixes)
    # Non-string labels (e.g. integer column names) cannot carry a prefix, so
    # skip them instead of failing on a missing ``startswith`` attribute.
    return [c for c in df.columns if isinstance(c, str) and c.startswith(wanted)]

def build_modality_view(df, modality, subject_col, activity_col, demo_cat_cols, demo_num_cols):
    """Build the feature table, labels, groups and preprocessor for one modality.

    Sensor features are selected by column prefix (``wrist_`` / ``ankle_``).
    Demographic columns that exist in ``df`` are appended: numeric ones are
    median-imputed together with the sensor features, categorical ones are
    mode-imputed and one-hot encoded.

    Args:
        df: Input DataFrame holding features, subject ids and activity labels.
        modality: One of ``"wrist"``, ``"ankle"`` or ``"dual"``.
        subject_col: Name of the subject-id column (used as CV groups).
        activity_col: Name of the label column.
        demo_cat_cols: Categorical demographic column names, or ``None``.
        demo_num_cols: Numeric demographic column names, or ``None``.

    Returns:
        tuple: ``(X_raw, y, groups, preprocessor, meta, feat_cols)`` where
        ``X_raw`` is the unprocessed feature frame, ``y`` and ``groups`` are
        string arrays, ``preprocessor`` is an unfitted ``ColumnTransformer``,
        ``meta`` holds the subject and activity columns, and ``feat_cols``
        lists the sensor feature columns only (demographics excluded).

    Raises:
        ValueError: If a required column is missing, ``modality`` is unknown,
            or no feature column could be selected.
    """
    # ``replace`` returns a new frame, so the caller's data is left untouched;
    # infinities become NaN so that the imputers below can fill them.
    d = df.replace([np.inf, -np.inf], np.nan)

    # Explicit raises instead of ``assert``: asserts vanish under ``python -O``.
    for required in (subject_col, activity_col):
        if required not in d.columns:
            raise ValueError(f"缺少列：{required}")

    wrist_cols = pick_columns_by_prefix(d, ["wrist_"])
    ankle_cols = pick_columns_by_prefix(d, ["ankle_"])

    if modality == "wrist":
        feat_cols = wrist_cols
    elif modality == "ankle":
        feat_cols = ankle_cols
    elif modality == "dual":
        feat_cols = wrist_cols + [c for c in ankle_cols if c not in wrist_cols]
    else:
        raise ValueError("modality 必须是 'wrist' | 'ankle' | 'dual'")

    # Keep only the demographic columns that are actually present.
    demo_cat = [c for c in (demo_cat_cols or []) if c in d.columns]
    demo_num = [c for c in (demo_num_cols or []) if c in d.columns]

    # Numeric demographics must be registered with the transformer; otherwise
    # ``remainder="drop"`` silently discards them even though they are in X.
    num_cols = feat_cols + [c for c in demo_num if c not in feat_cols]
    used_cols = num_cols + demo_cat
    if not used_cols:
        raise ValueError(f"No feature columns found for modality '{modality}'")

    # Numeric pipeline: median imputation only (scaling happens downstream).
    num_pipe = Pipeline([("imputer", SimpleImputer(strategy="median"))])
    transformers = [("num", num_pipe, num_cols)]

    # Categorical pipeline: mode imputation followed by dense one-hot encoding.
    if demo_cat:
        try:
            ohe = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
        except TypeError:
            # Older scikit-learn releases call this argument ``sparse``.
            ohe = OneHotEncoder(handle_unknown="ignore", sparse=False)
        cat_pipe = Pipeline([
            ("imputer", SimpleImputer(strategy="most_frequent")),
            ("ohe", ohe)
        ])
        transformers.append(("cat", cat_pipe, demo_cat))

    preprocessor = ColumnTransformer(
        transformers,
        remainder="drop",
        sparse_threshold=0.3
    )

    X_raw = d[used_cols].copy()
    y = d[activity_col].astype(str).values
    groups = d[subject_col].astype(str).values
    meta = d[[subject_col, activity_col]].copy()

    return X_raw, y, groups, preprocessor, meta, feat_cols

def build_fixed_baseline_pipeline(preprocessor):
    """Create a fresh, unfitted baseline pipeline with the fixed hyperparameters.

    The pipeline is: preprocessor -> StandardScaler -> PCA -> bagged random
    forests, configured from ``RF_PARAMS``, ``PCA_N_COMPONENTS``,
    ``BAGGING_N_ESTIMATORS`` and ``SEED``.

    Args:
        preprocessor: Unfitted scikit-learn transformer (e.g. the
            ``ColumnTransformer`` built for a modality). It is cloned, so the
            object passed in is never fitted or mutated.

    Returns:
        sklearn.pipeline.Pipeline: A new, unfitted pipeline.
    """
    # Local import: ``clone`` is not among the module-level imports.
    from sklearn.base import clone

    rf_base = RandomForestClassifier(**RF_PARAMS)
    bag_kwargs = dict(
        n_estimators=BAGGING_N_ESTIMATORS,
        random_state=SEED,
        n_jobs=-1,
    )
    try:
        bag = BaggingClassifier(estimator=rf_base, **bag_kwargs)
    except TypeError:
        # Older scikit-learn releases name this argument ``base_estimator``.
        bag = BaggingClassifier(base_estimator=rf_base, **bag_kwargs)

    # Pipeline.fit fits its steps in place, so without the clone every fold
    # would share (and overwrite) one preprocessor instance.
    pipe = Pipeline([
        ("prep", clone(preprocessor)),
        ("scaler", StandardScaler(with_mean=True)),
        ("pca", PCA(n_components=PCA_N_COMPONENTS, random_state=SEED)),
        ("bag", bag)
    ])
    return pipe

def plot_cm_counts_and_norm(y_true, y_pred, labels, title_prefix, out_png_prefix):
    """Plot and save the confusion matrix as raw counts and row-normalized.

    Args:
        y_true: True labels.
        y_pred: Predicted labels.
        labels: Label order used for the matrix rows/columns and tick labels.
        title_prefix: Text placed in front of each figure title.
        out_png_prefix: File-name prefix for the PNGs written to OUT_DIR_FIGS.

    Returns:
        tuple: ``(cm, cm_norm, png_counts, png_norm)`` — the count matrix, the
        row-normalized matrix and the two saved file paths.
    """
    def _draw_and_save(matrix, number_format, heading, target_path):
        # Render one matrix on its own 6x6 figure, save it, then release it.
        figure, axis = plt.subplots(figsize=(6,6))
        display = ConfusionMatrixDisplay(matrix, display_labels=labels)
        display.plot(ax=axis, cmap="Blues", colorbar=True, values_format=number_format)
        axis.set_title(heading)
        plt.tight_layout()
        plt.savefig(target_path, dpi=200)
        plt.close(figure)

    # Raw counts.
    cm = confusion_matrix(y_true, y_pred, labels=labels)
    png_counts = f"{OUT_DIR_FIGS}/{out_png_prefix}_tsfel_CM_counts.png"
    _draw_and_save(cm, "d", f"{title_prefix} — Confusion Matrix (Counts)", png_counts)

    # Row-wise normalization; the epsilon keeps all-zero rows from dividing by 0.
    row_totals = cm.sum(axis=1, keepdims=True)
    cm_norm = cm.astype(float) / (row_totals + 1e-12)
    png_norm = f"{OUT_DIR_FIGS}/{out_png_prefix}_tsfel_CM_normalized.png"
    _draw_and_save(cm_norm, ".2f", f"{title_prefix} — Confusion Matrix (Normalized)", png_norm)

    return cm, cm_norm, png_counts, png_norm

def loso_evaluate_fixed(X_df, y, groups, preprocessor, label_order=None, run_name=""):
    """Run leave-one-group-out evaluation of the fixed baseline pipeline.

    A new pipeline is built and fitted for every fold. Per-fold metrics are
    collected, their mean and sample standard deviation are printed, and both
    the per-fold table and a pooled classification report are written as CSV
    files into OUT_DIR_REPORTS.

    Args:
        X_df: Feature DataFrame (indexed positionally via ``iloc``).
        y: Array of labels aligned with ``X_df``.
        groups: Array of group ids aligned with ``X_df``; one fold per group.
        preprocessor: Transformer handed to ``build_fixed_baseline_pipeline``.
        label_order: Optional explicit label order; defaults to sorted unique ``y``.
        run_name: Tag used in the printed header and the output file names.

    Returns:
        dict: ``folds_df``, ``mean``, ``std``, ``y_true_all``, ``y_pred_all``,
        ``labels``, ``rep_df`` and ``rep_csv``.
    """
    metric_keys = ["acc","balAcc","macroF1","prec","rec"]
    macro_kwargs = dict(average="macro", zero_division=0)

    splitter = LeaveOneGroupOut()
    y_true_all = np.empty_like(y, dtype=object)
    y_pred_all = np.empty_like(y, dtype=object)
    fold_records = []

    # Fixed hyperparameters: build and fit a new pipeline in every fold.
    for fold_id, (train_idx, test_idx) in enumerate(splitter.split(X_df, y, groups), start=1):
        model = build_fixed_baseline_pipeline(preprocessor)
        model.fit(X_df.iloc[train_idx], y[train_idx])
        pred = model.predict(X_df.iloc[test_idx])
        truth = y[test_idx]

        # Store pooled out-of-fold truth/predictions at their original positions.
        y_true_all[test_idx] = truth
        y_pred_all[test_idx] = pred

        fold_records.append({
            "fold": fold_id,
            "subject": ",".join(sorted(set(groups[test_idx]))),
            "acc": accuracy_score(truth, pred),
            "balAcc": balanced_accuracy_score(truth, pred),
            "macroF1": f1_score(truth, pred, **macro_kwargs),
            "prec": precision_score(truth, pred, **macro_kwargs),
            "rec": recall_score(truth, pred, **macro_kwargs),
            "n": len(test_idx),
        })

    folds_df = pd.DataFrame(fold_records)
    m = folds_df[metric_keys].mean()
    s = folds_df[metric_keys].std(ddof=1)

    if label_order is None:
        labels = sorted(np.unique(y))
    else:
        labels = label_order

    # Pooled (all folds together) per-class report.
    rep = classification_report(
        y_true_all, y_pred_all, labels=labels, output_dict=True, zero_division=0
    )
    rep_df = pd.DataFrame(rep).T
    rep_df = rep_df.reset_index()
    rep_df = rep_df.rename(columns={"index":"class"})

    # Persist the report files.
    folds_csv = f"{OUT_DIR_REPORTS}/DEV_Folds__tsfel_{run_name}.csv"
    rep_csv = f"{OUT_DIR_REPORTS}/DEV_Report_tsfel_{run_name}.csv"
    folds_df.to_csv(folds_csv, index=False)
    rep_df.to_csv(rep_csv, index=False)

    print(f"\n[DEV-LOSO: {run_name}] folds={len(folds_df)} samples={len(y)} subjects={len(np.unique(groups))}")
    for template, key in (
        ("Acc : %.3f ± %.3f", "acc"),
        ("Balanced Acc : %.3f ± %.3f", "balAcc"),
        ("Macro-F1 : %.3f ± %.3f", "macroF1"),
        ("Macro-Prec : %.3f ± %.3f", "prec"),
        ("Macro-Rec : %.3f ± %.3f", "rec"),
    ):
        print(template % (m[key], s[key]))
    for saved_path in (folds_csv, rep_csv):
        print("Saved:", saved_path)

    return {
        "folds_df": folds_df,
        "mean": m,
        "std": s,
        "y_true_all": y_true_all,
        "y_pred_all": y_pred_all,
        "labels": labels,
        "rep_df": rep_df,
        "rep_csv": rep_csv,
    }

# 主流程
def main():
    """Run the DEV-set LOSO baseline for every configured modality.

    Loads and concatenates the train and validation CSVs, evaluates each
    modality in MODALITIES_TO_RUN, saves the confusion-matrix figures, and
    finally prints and saves a mean ± std summary table.
    """
    # Load DEV (train ∪ val).
    dev_parts = [pd.read_csv(TRAIN_CSV), pd.read_csv(VAL_CSV)]
    DEV_DF = pd.concat(dev_parts, ignore_index=True)

    # (column header, metric key) pairs for the summary table, in output order.
    summary_metrics = (
        ("Acc (mean±std)", "acc"),
        ("Balanced Acc (mean±std)", "balAcc"),
        ("Macro-F1 (mean±std)", "macroF1"),
        ("Macro-Prec (mean±std)", "prec"),
        ("Macro-Rec (mean±std)", "rec"),
    )

    # Summary table rows (fold mean ± std).
    summary_rows = []

    for modality in MODALITIES_TO_RUN:
        Xd, yd, gd, pre_d, _meta, feat_cols = build_modality_view(
            DEV_DF, modality, SUBJECT_COL, ACTIVITY_COL,
            DEMOGRAPHIC_CAT_COLS, DEMOGRAPHIC_NUM_COLS
        )
        disp_name = MODALITY_DISPLAY.get(modality, modality)
        run_name = f"LOSO_{disp_name}"

        print(f"\n Running LOSO for: {disp_name} | n_samples={len(yd)} | n_features={len(feat_cols)} ")
        res = loso_evaluate_fixed(
            X_df=Xd, y=yd, groups=gd, preprocessor=pre_d, run_name=run_name
        )

        # Draw and save the confusion matrices (counts + normalized).
        _, _, png_counts, png_norm = plot_cm_counts_and_norm(
            res["y_true_all"],
            res["y_pred_all"],
            res["labels"],
            f"{disp_name} [DEV LOSO]",
            f"DEV_{run_name}",
        )
        for saved_png in (png_counts, png_norm):
            print("Saved:", saved_png)

        # One summary row per modality.
        row = {"modality": disp_name}
        for header, key in summary_metrics:
            row[header] = f"{res['mean'][key]:.4f} ± {res['std'][key]:.4f}"
        row["n_samples"] = len(yd)
        row["n_features"] = len(feat_cols)
        summary_rows.append(row)

    # Print and save the summary table.
    summary_df = pd.DataFrame(summary_rows)
    print("\nSummary (DEV LOSO)")
    print(summary_df.to_string(index=False))

    summary_csv = f"{OUT_DIR_REPORTS}/DEV_Summary_LOSO_tsfel_.csv"
    summary_df.to_csv(summary_csv, index=False)
    print("Saved:", summary_csv)

if __name__ == "__main__":
    main()


-----

#### time-frequency (Baseline)

In [None]:
# 折均值±std + CM(counts/normalized) + Report
# 导入
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import LeaveOneGroupOut
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import (
    accuracy_score, f1_score, balanced_accuracy_score,
    precision_score, recall_score, classification_report,
    confusion_matrix, ConfusionMatrixDisplay
)
import os
import warnings
warnings.filterwarnings("ignore")

# Data paths: the train and validation CSVs are concatenated into one DEV set
# by main().
TRAIN_CSV = "/content/drive/My Drive/final_project/ankle_wrist_only/7_2_sets/train.csv"
VAL_CSV = "/content/drive/My Drive/final_project/ankle_wrist_only/7_2_sets/val.csv"

# Column names: SUBJECT_COL supplies the LOSO groups, ACTIVITY_COL the labels.
SUBJECT_COL = "Subject"
ACTIVITY_COL = "Activity"

# Demographic features (categorical / numeric). Both lists are empty here, so
# no demographic columns are requested.
DEMOGRAPHIC_CAT_COLS = []
DEMOGRAPHIC_NUM_COLS = []

# Modalities to run, and the display name used for each in titles/file names.
MODALITIES_TO_RUN = ["wrist", "ankle", "dual"]
MODALITY_DISPLAY = {"wrist": "wrist", "ankle": "ankle", "dual": "dual"}

# Baseline: fixed hyperparameters (no tuning is performed in this cell).
SEED = 42
# Passed to PCA(n_components=...); as a float in (0, 1) scikit-learn treats it
# as the fraction of variance to retain.
PCA_N_COMPONENTS = 0.6
# Keyword arguments for the RandomForestClassifier used as the bagging base.
RF_PARAMS = dict(
    n_estimators = 200,
    max_depth  = 3,
    min_samples_split = 2,
    min_samples_leaf  = 2,
    max_features = "sqrt",
    class_weight = "balanced",
    random_state = SEED,
    n_jobs = -1,
)
# Number of random forests inside the BaggingClassifier.
BAGGING_N_ESTIMATORS = 10

# Output directories for figures and CSV reports; created on import if missing.
OUT_DIR_FIGS = "/content/drive/My Drive/final_project/ankle_wrist_only/final_outputs/7_2_dev_figs"
OUT_DIR_REPORTS = "/content/drive/My Drive/final_project/ankle_wrist_only/final_outputs/7_2_dev_reports"
os.makedirs(OUT_DIR_FIGS, exist_ok=True)
os.makedirs(OUT_DIR_REPORTS, exist_ok=True)

# 工具函数
def pick_columns_by_prefix(df, prefixes):
    """Return the column labels of ``df`` that start with any of ``prefixes``.

    Args:
        df: Object exposing a ``columns`` iterable (e.g. a pandas DataFrame).
        prefixes: A single prefix string or an iterable of prefix strings.

    Returns:
        list: Matching column labels, in their original column order. Empty
        when ``prefixes`` is empty.
    """
    # A bare string would otherwise be iterated character by character and
    # match far more columns than intended.
    if isinstance(prefixes, str):
        prefixes = (prefixes,)
    wanted = tuple(prefixes)
    # Non-string labels (e.g. integer column names) cannot carry a prefix, so
    # skip them instead of failing on a missing ``startswith`` attribute.
    return [c for c in df.columns if isinstance(c, str) and c.startswith(wanted)]

def build_modality_view(df, modality, subject_col, activity_col, demo_cat_cols, demo_num_cols):
    """Build the feature table, labels, groups and preprocessor for one modality.

    Sensor features are selected by column prefix (``wrist_`` / ``ankle_``).
    Demographic columns that exist in ``df`` are appended: numeric ones are
    median-imputed together with the sensor features, categorical ones are
    mode-imputed and one-hot encoded.

    Args:
        df: Input DataFrame holding features, subject ids and activity labels.
        modality: One of ``"wrist"``, ``"ankle"`` or ``"dual"``.
        subject_col: Name of the subject-id column (used as CV groups).
        activity_col: Name of the label column.
        demo_cat_cols: Categorical demographic column names, or ``None``.
        demo_num_cols: Numeric demographic column names, or ``None``.

    Returns:
        tuple: ``(X_raw, y, groups, preprocessor, meta, feat_cols)`` where
        ``X_raw`` is the unprocessed feature frame, ``y`` and ``groups`` are
        string arrays, ``preprocessor`` is an unfitted ``ColumnTransformer``,
        ``meta`` holds the subject and activity columns, and ``feat_cols``
        lists the sensor feature columns only (demographics excluded).

    Raises:
        ValueError: If a required column is missing, ``modality`` is unknown,
            or no feature column could be selected.
    """
    # ``replace`` returns a new frame, so the caller's data is left untouched;
    # infinities become NaN so that the imputers below can fill them.
    d = df.replace([np.inf, -np.inf], np.nan)

    # Explicit raises instead of ``assert``: asserts vanish under ``python -O``.
    for required in (subject_col, activity_col):
        if required not in d.columns:
            raise ValueError(f"缺少列：{required}")

    wrist_cols = pick_columns_by_prefix(d, ["wrist_"])
    ankle_cols = pick_columns_by_prefix(d, ["ankle_"])

    if modality == "wrist":
        feat_cols = wrist_cols
    elif modality == "ankle":
        feat_cols = ankle_cols
    elif modality == "dual":
        feat_cols = wrist_cols + [c for c in ankle_cols if c not in wrist_cols]
    else:
        raise ValueError("modality 必须是 'wrist' | 'ankle' | 'dual'")

    # Keep only the demographic columns that are actually present.
    demo_cat = [c for c in (demo_cat_cols or []) if c in d.columns]
    demo_num = [c for c in (demo_num_cols or []) if c in d.columns]

    # Numeric demographics must be registered with the transformer; otherwise
    # ``remainder="drop"`` silently discards them even though they are in X.
    num_cols = feat_cols + [c for c in demo_num if c not in feat_cols]
    used_cols = num_cols + demo_cat
    if not used_cols:
        raise ValueError(f"No feature columns found for modality '{modality}'")

    # Numeric pipeline: median imputation fills missing values (scaling
    # happens downstream).
    num_pipe = Pipeline([("imputer", SimpleImputer(strategy="median"))])
    transformers = [("num", num_pipe, num_cols)]

    # Categorical pipeline: mode imputation followed by dense one-hot encoding.
    if demo_cat:
        try:
            ohe = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
        except TypeError:
            # Older scikit-learn releases call this argument ``sparse``.
            ohe = OneHotEncoder(handle_unknown="ignore", sparse=False)
        cat_pipe = Pipeline([
            ("imputer", SimpleImputer(strategy="most_frequent")),
            ("ohe", ohe)
        ])
        transformers.append(("cat", cat_pipe, demo_cat))

    # Every transformer above emits dense arrays, so the combined output is
    # expected to be dense as well.
    preprocessor = ColumnTransformer(
        transformers,
        remainder="drop",
        sparse_threshold=0.3
    )

    X_raw = d[used_cols].copy()
    y = d[activity_col].astype(str).values
    groups = d[subject_col].astype(str).values
    meta = d[[subject_col, activity_col]].copy()

    return X_raw, y, groups, preprocessor, meta, feat_cols

def build_fixed_baseline_pipeline(preprocessor):
    """Create a fresh, unfitted baseline pipeline with the fixed hyperparameters.

    The pipeline is: preprocessor -> StandardScaler -> PCA -> bagged random
    forests, configured from ``RF_PARAMS``, ``PCA_N_COMPONENTS``,
    ``BAGGING_N_ESTIMATORS`` and ``SEED``.

    Args:
        preprocessor: Unfitted scikit-learn transformer (e.g. the
            ``ColumnTransformer`` built for a modality). It is cloned, so the
            object passed in is never fitted or mutated.

    Returns:
        sklearn.pipeline.Pipeline: A new, unfitted pipeline.
    """
    # Local import: ``clone`` is not among the module-level imports.
    from sklearn.base import clone

    rf_base = RandomForestClassifier(**RF_PARAMS)
    bag_kwargs = dict(
        n_estimators=BAGGING_N_ESTIMATORS,
        random_state=SEED,
        n_jobs=-1,
    )
    try:
        bag = BaggingClassifier(estimator=rf_base, **bag_kwargs)
    except TypeError:
        # Older scikit-learn releases name this argument ``base_estimator``.
        bag = BaggingClassifier(base_estimator=rf_base, **bag_kwargs)

    # Pipeline.fit fits its steps in place, so without the clone every fold
    # would share (and overwrite) one preprocessor instance.
    pipe = Pipeline([
        ("prep", clone(preprocessor)),
        ("scaler", StandardScaler(with_mean=True)),
        ("pca", PCA(n_components=PCA_N_COMPONENTS, random_state=SEED)),
        ("bag", bag)
    ])
    return pipe

def plot_cm_counts_and_norm(y_true, y_pred, labels, title_prefix, out_png_prefix):
    """Plot and save the confusion matrix as raw counts and row-normalized.

    Args:
        y_true: True labels.
        y_pred: Predicted labels.
        labels: Label order used for the matrix rows/columns and tick labels.
        title_prefix: Text placed in front of each figure title.
        out_png_prefix: File-name prefix for the PNGs written to OUT_DIR_FIGS.

    Returns:
        tuple: ``(cm, cm_norm, png_counts, png_norm)`` — the count matrix, the
        row-normalized matrix and the two saved file paths.
    """
    def _draw_and_save(matrix, number_format, heading, target_path):
        # Render one matrix on its own 6x6 figure, save it, then release it.
        figure, axis = plt.subplots(figsize=(6,6))
        display = ConfusionMatrixDisplay(matrix, display_labels=labels)
        display.plot(ax=axis, cmap="Blues", colorbar=True, values_format=number_format)
        axis.set_title(heading)
        plt.tight_layout()
        plt.savefig(target_path, dpi=200)
        plt.close(figure)

    # Raw counts.
    cm = confusion_matrix(y_true, y_pred, labels=labels)
    png_counts = f"{OUT_DIR_FIGS}/{out_png_prefix}_CM_counts.png"
    _draw_and_save(cm, "d", f"{title_prefix} — Confusion Matrix (Counts)", png_counts)

    # Row-wise normalization; the epsilon keeps all-zero rows from dividing by 0.
    row_totals = cm.sum(axis=1, keepdims=True)
    cm_norm = cm.astype(float) / (row_totals + 1e-12)
    png_norm = f"{OUT_DIR_FIGS}/{out_png_prefix}_CM_normalized.png"
    _draw_and_save(cm_norm, ".2f", f"{title_prefix} — Confusion Matrix (Normalized)", png_norm)

    return cm, cm_norm, png_counts, png_norm

def loso_evaluate_fixed(X_df, y, groups, preprocessor, label_order=None, run_name=""):
    """Run leave-one-group-out evaluation of the fixed baseline pipeline.

    A new pipeline is built and fitted for every fold. Per-fold metrics are
    collected, their mean and sample standard deviation are printed, and both
    the per-fold table and a pooled classification report are written as CSV
    files into OUT_DIR_REPORTS.

    Args:
        X_df: Feature DataFrame (indexed positionally via ``iloc``).
        y: Array of labels aligned with ``X_df``.
        groups: Array of group ids aligned with ``X_df``; one fold per group.
        preprocessor: Transformer handed to ``build_fixed_baseline_pipeline``.
        label_order: Optional explicit label order; defaults to sorted unique ``y``.
        run_name: Tag used in the printed header and the output file names.

    Returns:
        dict: ``folds_df``, ``mean``, ``std``, ``y_true_all``, ``y_pred_all``,
        ``labels``, ``rep_df`` and ``rep_csv``.
    """
    metric_keys = ["acc","balAcc","macroF1","prec","rec"]
    macro_kwargs = dict(average="macro", zero_division=0)

    splitter = LeaveOneGroupOut()
    y_true_all = np.empty_like(y, dtype=object)
    y_pred_all = np.empty_like(y, dtype=object)
    fold_records = []

    # Fixed hyperparameters: build and fit a new pipeline in every fold.
    for fold_id, (train_idx, test_idx) in enumerate(splitter.split(X_df, y, groups), start=1):
        model = build_fixed_baseline_pipeline(preprocessor)
        model.fit(X_df.iloc[train_idx], y[train_idx])
        pred = model.predict(X_df.iloc[test_idx])
        truth = y[test_idx]

        # Store pooled out-of-fold truth/predictions at their original positions.
        y_true_all[test_idx] = truth
        y_pred_all[test_idx] = pred

        fold_records.append({
            "fold": fold_id,
            "subject": ",".join(sorted(set(groups[test_idx]))),
            "acc": accuracy_score(truth, pred),
            "balAcc": balanced_accuracy_score(truth, pred),
            "macroF1": f1_score(truth, pred, **macro_kwargs),
            "prec": precision_score(truth, pred, **macro_kwargs),
            "rec": recall_score(truth, pred, **macro_kwargs),
            "n": len(test_idx),
        })

    folds_df = pd.DataFrame(fold_records)
    m = folds_df[metric_keys].mean()
    s = folds_df[metric_keys].std(ddof=1)

    if label_order is None:
        labels = sorted(np.unique(y))
    else:
        labels = label_order

    # Pooled (all folds together) per-class report.
    rep = classification_report(
        y_true_all, y_pred_all, labels=labels, output_dict=True, zero_division=0
    )
    rep_df = pd.DataFrame(rep).T
    rep_df = rep_df.reset_index()
    rep_df = rep_df.rename(columns={"index":"class"})

    # Persist the report files.
    folds_csv = f"{OUT_DIR_REPORTS}/DEV_Folds_{run_name}.csv"
    rep_csv = f"{OUT_DIR_REPORTS}/DEV_Report_{run_name}.csv"
    folds_df.to_csv(folds_csv, index=False)
    rep_df.to_csv(rep_csv, index=False)

    print(f"\n[DEV-LOSO: {run_name}] folds={len(folds_df)} samples={len(y)} subjects={len(np.unique(groups))}")
    for template, key in (
        ("Acc : %.3f ± %.3f", "acc"),
        ("Balanced Acc : %.3f ± %.3f", "balAcc"),
        ("Macro-F1 : %.3f ± %.3f", "macroF1"),
        ("Macro-Prec : %.3f ± %.3f", "prec"),
        ("Macro-Rec : %.3f ± %.3f", "rec"),
    ):
        print(template % (m[key], s[key]))
    for saved_path in (folds_csv, rep_csv):
        print("Saved:", saved_path)

    return {
        "folds_df": folds_df,
        "mean": m,
        "std": s,
        "y_true_all": y_true_all,
        "y_pred_all": y_pred_all,
        "labels": labels,
        "rep_df": rep_df,
        "rep_csv": rep_csv,
    }

# 主流程
def main():
    """Run the DEV-set LOSO baseline for every configured modality.

    Loads and concatenates the train and validation CSVs, evaluates each
    modality in MODALITIES_TO_RUN, saves the confusion-matrix figures, and
    finally prints and saves a mean ± std summary table.
    """
    # Load DEV (train ∪ val).
    dev_parts = [pd.read_csv(TRAIN_CSV), pd.read_csv(VAL_CSV)]
    DEV_DF = pd.concat(dev_parts, ignore_index=True)

    # (column header, metric key) pairs for the summary table, in output order.
    summary_metrics = (
        ("Acc (mean±std)", "acc"),
        ("Balanced Acc (mean±std)", "balAcc"),
        ("Macro-F1 (mean±std)", "macroF1"),
        ("Macro-Prec (mean±std)", "prec"),
        ("Macro-Rec (mean±std)", "rec"),
    )

    # Summary table rows (fold mean ± std).
    summary_rows = []

    for modality in MODALITIES_TO_RUN:
        Xd, yd, gd, pre_d, _meta, feat_cols = build_modality_view(
            DEV_DF, modality, SUBJECT_COL, ACTIVITY_COL,
            DEMOGRAPHIC_CAT_COLS, DEMOGRAPHIC_NUM_COLS
        )
        disp_name = MODALITY_DISPLAY.get(modality, modality)
        run_name = f"LOSO_{disp_name}"

        print(f"\n Running LOSO for: {disp_name} | n_samples={len(yd)} | n_features={len(feat_cols)} ")
        res = loso_evaluate_fixed(
            X_df=Xd, y=yd, groups=gd, preprocessor=pre_d, run_name=run_name
        )

        # Draw and save the confusion matrices (counts + normalized).
        _, _, png_counts, png_norm = plot_cm_counts_and_norm(
            res["y_true_all"],
            res["y_pred_all"],
            res["labels"],
            f"{disp_name} [DEV LOSO]",
            f"DEV_{run_name}",
        )
        for saved_png in (png_counts, png_norm):
            print("Saved:", saved_png)

        # One summary row per modality.
        row = {"modality": disp_name}
        for header, key in summary_metrics:
            row[header] = f"{res['mean'][key]:.4f} ± {res['std'][key]:.4f}"
        row["n_samples"] = len(yd)
        row["n_features"] = len(feat_cols)
        summary_rows.append(row)

    # Print and save the summary table.
    summary_df = pd.DataFrame(summary_rows)
    print("\nSummary (DEV LOSO)")
    print(summary_df.to_string(index=False))

    summary_csv = f"{OUT_DIR_REPORTS}/DEV_Summary_LOSO.csv"
    summary_df.to_csv(summary_csv, index=False)
    print("Saved:", summary_csv)

if __name__ == "__main__":
    main()
