## Next Steps

Continue with:
1. **Noise covariance estimation** for each participant
2. **Forward model computation** (requires MRI/BEM)
3. **Inverse solution** (source reconstruction)
4. **Group-level analysis**


In [None]:
# installs

%pip install statsmodels

In [43]:
# imports

import os, re, glob, time
import numpy as np
import pandas as pd
from scipy.stats import ttest_1samp
import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt
import matplotlib as mpl
import mne
from mne.preprocessing import ICA
from mne.minimum_norm import apply_inverse, make_inverse_operator, write_inverse_operator
from sklearn.model_selection import StratifiedKFold, cross_val_score, cross_val_predict
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.inspection import permutation_importance
from statsmodels.stats.multitest import multipletests
from sklearn.metrics import (
    balanced_accuracy_score,
    precision_score,
    recall_score,
    confusion_matrix,
)
from scipy import stats
import seaborn as sns
from sklearn.utils import shuffle as sk_shuffle
from joblib import Parallel, delayed
from tqdm import tqdm

## Noise Covariance Estimation

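Compute each participant's noise covariance from the pre-stimulus baseline of their cleaned epochs; it is used to whiten the data during source reconstruction.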

In [None]:
## computing noise covariance for all participants
#
# For each cleaned epochs file: estimate the noise covariance from the
# pre-stimulus interval (tmax=0.0), write it to COV_DIR and keep it in
# `participant_noise_cov`, keyed by participant id.

SAVE_DIR   = "/work/GrétaHarsányi#3675/Assignment2/2025Neuro/events_epochs_evokeds"
EPOCHS_DIR = f"{SAVE_DIR}/epochs"
COV_DIR    = f"{SAVE_DIR}/covariance"

os.makedirs(COV_DIR, exist_ok=True)

# loading in epochs
# os.listdir() returns entries in arbitrary order; sorting makes the processing
# order (and the first key of participant_noise_cov) reproducible across runs.
epoch_files = sorted(
    f for f in os.listdir(EPOCHS_DIR) if f.endswith('-epo_stim_withPAS_clean.fif')
)

participant_noise_cov = {}

for fname in epoch_files:
    participant_id = fname.split('-epo')[0]
    print(f"\n=== Computing noise covariance for {participant_id} ===")
    epochs_path = os.path.join(EPOCHS_DIR, fname)
    epochs = mne.read_epochs(epochs_path, preload=True)

    # noise cov from baseline (epoch start up to stimulus onset)
    noise_cov = mne.compute_covariance(
        epochs,
        tmin=None,
        tmax=0.0,
        method='auto',
        rank='info',
        verbose=True
    )

    # saving; overwrite=True so re-running this cell does not fail on existing files
    cov_fname = os.path.join(COV_DIR, f"{participant_id}-cov.fif")
    mne.write_cov(cov_fname, noise_cov, overwrite=True)

    # storing in dict
    participant_noise_cov[participant_id] = noise_cov
    print(f"✓ Done computing and saving noise covariance for {participant_id}")

print("\nAll participant noise covariance matrices have been computed and saved!")


In [None]:
## plotting noise cov for first participant

SAVE_DIR   = "/work/GrétaHarsányi#3675/Assignment2/2025Neuro/events_epochs_evokeds"
EPOCHS_DIR = f"{SAVE_DIR}/epochs"
COV_DIR    = f"{SAVE_DIR}/covariance"

first_pid = list(participant_noise_cov.keys())[0]
print(f"\n=== Plotting noise covariance for {first_pid} ===")

# only the measurement info is needed for plotting, so the data are not preloaded
epochs_path = os.path.join(EPOCHS_DIR, f"{first_pid}-epo_stim_withPAS_clean.fif")
epochs = mne.read_epochs(epochs_path, preload=False)

# noise cov
noise_cov = participant_noise_cov[first_pid]

# plotting
# Covariance.plot() returns two figures (covariance matrices, eigenvalue
# spectrum). show=False defers display so the title can be attached to the
# covariance figure itself rather than to whichever figure is "current".
fig = noise_cov.plot(epochs.info, show=False)
fig_cov, fig_svd = fig
fig_cov.suptitle(f'Noise Covariance – {first_pid}', fontsize=14)
plt.show()


## Forward model
Compute each participant's forward solution from the precomputed BEM solution and the MEG–MRI coregistration (`trans`) file.

In [None]:
## computing forward solution

# Where the forward solutions are written (created if missing)
SAVE_DIR    = "/work/GrétaHarsányi#3675/Assignment2/2025Neuro/analysis_files"
FWD_DIR     = SAVE_DIR + "/forward"
os.makedirs(FWD_DIR, exist_ok=True)

# Inputs: cleaned epochs, FreeSurfer subjects directory, root of the trans files
EPOCHS_DIR   = "/work/GrétaHarsányi#3675/Assignment2/2025Neuro/events_epochs_evokeds/epochs"
SUBJECTS_DIR = "/work/freesurfer"
TRANS_ROOT   = "/work/MEG_data/workshop_data"


# helpers
def find_trans(pid):
    """Return the path of a trans file for `pid`, or None if none is found.

    Searches recursively below TRANS_ROOT/<pid>. A file named
    "workshop_2025-trans.fif" is preferred; otherwise any "*trans.fif" is used.
    The first glob hit is returned.
    """
    for pattern in ("workshop_2025-trans.fif", "*trans.fif"):
        matches = glob.glob(os.path.join(TRANS_ROOT, pid, "**", pattern), recursive=True)
        if matches:
            return matches[0]
    return None

def bem_path(pid):
    """Build the expected path of the BEM solution file for `pid` under SUBJECTS_DIR."""
    bem_dir = os.path.join(SUBJECTS_DIR, pid, "bem")
    return os.path.join(bem_dir, f"{pid}-5120-bem-sol.fif")

# batch: one forward solution per cleaned epochs file
epoch_files = sorted(glob.glob(os.path.join(EPOCHS_DIR, "*-epo_stim_withPAS_clean.fif")))
print(f"Found {len(epoch_files)} epoch files in {EPOCHS_DIR}")

for ep_path in epoch_files:
    pid = os.path.basename(ep_path).split("-epo")[0]
    print(f"\n→ {pid}")

    # both the coregistration and the BEM solution are required
    trans = find_trans(pid)
    bem_fif = bem_path(pid)
    bem_found = os.path.exists(bem_fif)
    if not (trans and bem_found):
        print(f"  Skipping (missing trans or BEM). trans_found={bool(trans)}, bem_found={bem_found}")
        continue

    epochs  = mne.read_epochs(ep_path, preload=False)
    bem_sol = mne.read_bem_solution(bem_fif)

    src = mne.setup_source_space(
        pid,
        spacing="oct6",
        subjects_dir=SUBJECTS_DIR,
        add_dist=False,
        verbose=False,
    )
    # MEG-only forward model
    fwd = mne.make_forward_solution(
        epochs.info,
        trans=trans,
        src=src,
        bem=bem_sol,
        meg=True,
        eeg=False,
        mindist=5.0,
        verbose=False,
    )
    # surface-oriented, fixed-orientation version is what gets saved
    fwd = mne.convert_forward_solution(fwd, surf_ori=True, force_fixed=True, use_cps=True)

    out = os.path.join(FWD_DIR, f"{pid}-fwd.fif")
    mne.write_forward_solution(out, fwd, overwrite=True)
    print(f"  ✓ saved {out}")


## Inverse operator

In [None]:
## building inverse operator
#
# For every participant with both a forward solution and a noise covariance
# on disk, build the inverse operator and write it to INV_DIR.

EEE_DIR   = "/work/GrétaHarsányi#3675/Assignment2/2025Neuro/events_epochs_evokeds"
SAVE_DIR  = "/work/GrétaHarsányi#3675/Assignment2/2025Neuro/analysis_files"

EPOCHS_DIR = f"{EEE_DIR}/epochs"
COV_DIR    = f"{EEE_DIR}/covariance"
FWD_DIR    = f"{SAVE_DIR}/forward"
INV_DIR    = f"{SAVE_DIR}/inverse"
os.makedirs(INV_DIR, exist_ok=True)

# NOTE: a stray read of one hard-coded participant's epochs (assigned to
# `epoch_files` and never used) was removed here; it only cost I/O and would
# crash the cell if that participant's file were absent.

# inverse settings
loose = 0.0          # fixed orientation - matches fwd


for ep_path in sorted(glob.glob(os.path.join(EPOCHS_DIR, "*-epo_stim_withPAS_clean.fif"))):
    pid = os.path.basename(ep_path).split("-epo")[0]
    fwd_path = os.path.join(FWD_DIR, f"{pid}-fwd.fif")
    cov_path = os.path.join(COV_DIR, f"{pid}-cov.fif")
    if not (os.path.exists(fwd_path) and os.path.exists(cov_path)):
        print(f"→ {pid}: missing fwd/cov, skipping."); continue

    # only the measurement info is needed, so the epochs data are not preloaded
    epochs    = mne.read_epochs(ep_path, preload=False)
    fwd       = mne.read_forward_solution(fwd_path)
    noise_cov = mne.read_cov(cov_path)

    inv = make_inverse_operator(
        info=epochs.info, forward=fwd, noise_cov=noise_cov,
        loose=loose, depth=None, rank="info"
    )
    out = os.path.join(INV_DIR, f"{pid}-inv.fif")
    write_inverse_operator(out, inv, overwrite=True)
    print(f"✓ {pid}: inverse → {out}")


In [None]:
## applying inv
#
# Trial-wise dSPM source estimates for every participant that has an inverse
# operator and PAS metadata. One STC is saved per trial, with the collapsed
# PAS label encoded in the file name.

STC_DIR= f"{SAVE_DIR}/stc"
os.makedirs(STC_DIR, exist_ok=True)


method  = "dSPM"
lambda2 = 1.0 / 9.0

for ep_path in sorted(glob.glob(os.path.join(EPOCHS_DIR, "*-epo_stim_withPAS_clean.fif"))):
    pid = os.path.basename(ep_path).split("-epo")[0]
    inv_path = os.path.join(INV_DIR, f"{pid}-inv.fif")
    if not os.path.exists(inv_path):
        print(f"→ {pid}: missing inverse; run the inverse build chunk first."); continue

    epochs = mne.read_epochs(ep_path, preload=False)
    if epochs.metadata is None or "PAS" not in epochs.metadata.columns:
        print(f"→ {pid}: no PAS metadata; skipping."); continue

    # collapsing PAS labels: 1→"1", 2→"2", 3/4→"3_4"; anything else is dropped
    pas_raw = epochs.metadata["PAS"].astype(float)
    pas_collapsed = pas_raw.map(lambda x: "1" if x==1 else "2" if x==2 else "3_4" if x in (3,4) else np.nan)
    keep = pas_collapsed.notna().to_numpy()
    if keep.sum() == 0:
        print(f"→ {pid}: no epochs with PAS 1/2/3_4 after collapsing; skipping."); continue

    epochs = epochs[keep]
    pas_collapsed = pas_collapsed[keep].reset_index(drop=True)

    inv = mne.minimum_norm.read_inverse_operator(inv_path)

    # trial-wise source estimates
    # return_generator=True yields one STC at a time, so a participant's whole
    # set of trial STCs is never held in memory at once.
    stcs = mne.minimum_norm.apply_inverse_epochs(epochs, inv, lambda2=lambda2, method=method,
                                return_generator=True, verbose=False)

    # one file per trial
    out_dir = os.path.join(STC_DIR, pid)
    os.makedirs(out_dir, exist_ok=True)
    n_saved = 0
    for i, (stc_tr, lbl) in enumerate(zip(stcs, pas_collapsed)):
        stc_tr.save(os.path.join(out_dir, f"{pid}-trial{i+1:04d}-PAS{lbl}-{method}.h5"), overwrite=True)
        n_saved = i + 1

    print(f"✓ {pid}: saved {n_saved} trial STCs → {out_dir}")


### Sanity checks

In [None]:
# whitening check - how do the noise covariance and the rank look? (with plot)

pid        = "0164"
EPOCHS_DIR = "/work/GrétaHarsányi#3675/Assignment2/2025Neuro/events_epochs_evokeds/epochs"
COV_DIR    = "/work/GrétaHarsányi#3675/Assignment2/2025Neuro/events_epochs_evokeds/covariance"

epochs_fname = os.path.join(EPOCHS_DIR, f"{pid}-epo_stim_withPAS_clean.fif")
cov_fname    = os.path.join(COV_DIR, f"{pid}-cov.fif")

epochs    = mne.read_epochs(epochs_fname, preload=False)
noise_cov = mne.read_cov(cov_fname)

# any condition will do: take the first one listed in event_id and average it
cond = list(epochs.event_id)[0]
evk  = epochs[cond].average()

# whitening plot - want flat traces around 0 in baseline, GFP near dashed line
evk.plot_white(noise_cov)


In [None]:
## SC (sanity check) on collapsed PAS evokeds

pid = "0163"
INV_DIR = "/work/GrétaHarsányi#3675/Assignment2/2025Neuro/analysis_files/inverse"
EVK_DIR = "/work/GrétaHarsányi#3675/Assignment2/2025Neuro/events_epochs_evokeds/evokeds/collapsed_evk"
COV_DIR = "/work/GrétaHarsányi#3675/Assignment2/2025Neuro/events_epochs_evokeds/covariance"

inv  = mne.minimum_norm.read_inverse_operator(f"{INV_DIR}/{pid}-inv.fif")

# use the first collapsed-PAS evoked that exists on disk (PAS2 is tried first)
evk = None
for name in [f"{pid}-PAS2-ave.fif", f"{pid}-PAS1-ave.fif", f"{pid}-PAS3_4-ave.fif"]:
    p = os.path.join(EVK_DIR, name)
    if not os.path.exists(p):
        continue
    evk = mne.read_evokeds(p, condition=0)
    break
if evk is None:
    raise FileNotFoundError("No collapsed PAS evoked found.")

# whitening check
noise_cov = mne.read_cov(os.path.join(COV_DIR, f"{pid}-cov.fif"))
evk.plot_white(noise_cov)

# Inverse
stc = mne.minimum_norm.apply_inverse(evk, inv, lambda2=1/9, method="dSPM")
try:
    vtx, t_peak, amp = stc.get_peak(hemi="lh", return_amplitude=True)
except TypeError:
    # get_peak() did not accept return_amplitude: look the amplitude up by hand
    vtx, t_peak = stc.get_peak(hemi="lh")
    ti = int(np.abs(stc.times - t_peak).argmin())
    li = int(np.flatnonzero(stc.vertices[0] == vtx)[0])
    amp = float(stc.data[li, ti])

t = stc.times
base = t < 0
# response window: ±30 ms around the peak, clipped to the available time range
win_lo = max(t[0], t_peak - 0.03)
win_hi = min(t[-1], t_peak + 0.03)
resp = (t >= win_lo) & (t <= win_hi)

# ratio of 95th-percentile absolute source amplitude: response vs. baseline
src_abs = np.abs(stc.data)
src_resp_p95 = np.nanpercentile(src_abs[:, resp], 95)
src_base_p95 = np.nanpercentile(src_abs[:, base], 95)
src_ratio = src_resp_p95 / max(src_base_p95, 1e-12)

# same comparison on the sensor data (root of summed squares across channels)
sensor_norm = np.sqrt((evk.data ** 2).sum(axis=0))
gfp_ratio = np.mean(sensor_norm[resp]) / max(np.mean(sensor_norm[base]), 1e-12)

print(f"dSPM peak: {amp:.2f} at {t_peak*1000:.0f} ms | Source ratio: {src_ratio:.2f} | GFP ratio: {gfp_ratio:.2f}")


# Last step: Modeling subjective experience

## Model #1: multinomial logistic regression classifier trained on predefined time intervals

In [None]:
# setup and ROI loading

SAVE_DIR   = "/work/GrétaHarsányi#3675/Assignment2/2025Neuro/analysis_files"
STC_DIR    = SAVE_DIR + "/stc"
INV_DIR    = SAVE_DIR + "/inverse"
FEAT_DIR   = SAVE_DIR + "/features"
EPOCHS_DIR = "/work/GrétaHarsányi#3675/Assignment2/2025Neuro/events_epochs_evokeds/epochs"
subjects_dir = "/work/freesurfer"

os.makedirs(FEAT_DIR, exist_ok=True)

# ROI groups (same as Model 3), given as lower-case aparc base names
# OT: occipital and temporal labels
OT = {
    'pericalcarine',
    'cuneus',
    'lateraloccipital',
    'lingual',
    'fusiform',
    'inferiortemporal',
    'middletemporal',
    'superiortemporal',
}

# FP: frontal, parietal and cingulate labels
FP = {
    'superiorfrontal',
    'rostralmiddlefrontal',
    'caudalmiddlefrontal',
    'lateralorbitofrontal',
    'medialorbitofrontal',
    'superiorparietal',
    'inferiorparietal',
    'precuneus',
    'postcentral',
    'rostralanteriorcingulate',
    'isthmuscingulate',
}

# ROI loading - Desikan–Killiany (aparc), native space
# roi_labels maps participant id -> list of aparc labels, excluding
# "unknown" and "corpuscallosum".
roi_labels = {}

for ep_path in sorted(glob.glob(os.path.join(EPOCHS_DIR, "*-epo_stim_withPAS_clean.fif"))):
    pid = os.path.basename(ep_path).split("-epo")[0]
    try:
        labs = mne.read_labels_from_annot(subject=pid, parc="aparc", subjects_dir=subjects_dir)
    except OSError:
        # MNE reports a missing annotation file/directory as a plain OSError,
        # which `except FileNotFoundError` would not catch (FileNotFoundError
        # is itself a subclass of OSError, so both cases are covered here).
        print(f"→ {pid}: missing aparc")
        continue

    # drop unknown
    sel = [lab for lab in labs if not lab.name.lower().startswith(("unknown","corpuscallosum"))]
    if not sel:
        print(f"→ {pid}: no cortical labels found")
        continue

    roi_labels[pid] = sel

    # just for information: how many ROIs fall into OT / FP / Other?
    # the base name is the label name without its hemisphere suffix
    base_names = [lab.name.split("-")[0].lower() for lab in sel]
    n_ot    = sum(b in OT for b in base_names)
    n_fp    = sum(b in FP for b in base_names)
    n_other = len(sel) - n_ot - n_fp

    print(f"✓ {pid}: cached {len(sel)} aparc labels "
          f"(OT={n_ot}, FP={n_fp}, Other={n_other})")


In [None]:
# feature extraction

# Time windows (seconds relative to stimulus onset) averaged into features
WIN_DEFS = dict(
    VAN=(0.150, 0.250),   # visual awareness negativity
    LP=(0.330, 0.550),    # late positivity
)
# Pre-stimulus interval used for baseline correction
BASELINE = (-0.200, 0.000)

def extract_features_for_participant(pid, do_baseline=True, save_to_disk=False):
    """Build the trial-by-feature matrix for one participant from saved trial STCs.

    For every trial STC of `pid` found under STC_DIR, one time course per
    cached ROI label is extracted (mode='mean_flip'), optionally
    baseline-corrected by subtracting its mean over BASELINE, and averaged
    within the VAN and LP windows of WIN_DEFS.

    Parameters
    ----------
    pid : str
        Participant id; used for the STC sub-directory, the inverse-operator
        file name and the `roi_labels` lookup.
    do_baseline : bool
        Subtract each ROI's mean over BASELINE before window averaging.
    save_to_disk : bool
        Also write the result to FEAT_DIR as a compressed .npz file.

    Returns
    -------
    X : ndarray, shape (n_trials, 2 * n_rois)
        All VAN features first, then all LP features.
    y : ndarray, shape (n_trials,)
        Class per trial: 0 = PAS1, 1 = PAS2, 2 = PAS3_4.
    feat_names : list of str
        "<label name>_<window>", in the column order of X.

    Raises
    ------
    FileNotFoundError
        No STC files matched for `pid`.
    RuntimeError
        No ROI labels are cached for `pid`.
    ValueError
        The baseline (when `do_baseline`) or a feature window contains no
        sample of the STC time axis.
    """
    win_order = ["VAN", "LP"]

    # find left-hemi files (reading one loads both hemispheres)
    stc_files = sorted(glob.glob(os.path.join(STC_DIR, pid, f"{pid}-trial*-PAS*-dSPM.h5-lh.stc")))
    if not stc_files:
        raise FileNotFoundError(f"No STCs matched for {pid} in {os.path.join(STC_DIR, pid)}")

    inv = mne.minimum_norm.read_inverse_operator(os.path.join(INV_DIR, f"{pid}-inv.fif"))
    src = inv["src"]

    labels = roi_labels.get(pid)
    if not labels:
        raise RuntimeError(f"No ROI labels cached for {pid}. Run the ROI-loading step first.")

    # time indices, taken from the first trial
    ex = mne.read_source_estimate(stc_files[0])
    times = ex.times
    idx_win  = {w: np.where((times >= a) & (times <= b))[0] for w, (a,b) in WIN_DEFS.items()}
    idx_base = np.where((times >= BASELINE[0]) & (times <= BASELINE[1]))[0]
    if do_baseline and idx_base.size == 0:
        raise ValueError(f"Baseline {BASELINE} outside STC range [{times[0]:.3f},{times[-1]:.3f}]")
    # an empty window would silently turn into NaN features (mean of nothing)
    empty_wins = [w for w in win_order if idx_win[w].size == 0]
    if empty_wins:
        raise ValueError(
            f"Window(s) {empty_wins} outside STC range [{times[0]:.3f},{times[-1]:.3f}]"
        )

    def pas_to_class(fp):
        # class index from the "PAS<label>" tag in the file name; None if absent
        m = re.search(r"PAS(1|2|3_4)", fp)
        return {'1':0, '2':1, '3_4':2}[m.group(1)] if m else None

    X, y = [], []
    for fp in stc_files:
        c = pas_to_class(fp)
        if c is None:
            continue
        y.append(c)

        stc = mne.read_source_estimate(fp)  # loads both hemis
        ltc = mne.extract_label_time_course(stc, labels, src=src, mode='mean_flip')  # (n_rois, n_times)

        if do_baseline:
            base = ltc[:, idx_base].mean(axis=1, keepdims=True)
            ltc  = ltc - base

        # averaging within VAN and LP windows
        feats = [ltc[:, idx_win[w]].mean(axis=1) for w in win_order]
        X.append(np.concatenate(feats))

    X = np.asarray(X)
    y = np.asarray(y)
    # window-major order, matching the concatenation above
    feat_names = [f"{lab.name}_{w}" for w in win_order for lab in labels]

    if save_to_disk:
        out_path = os.path.join(FEAT_DIR, f"{pid}_features_baselineVANLP.npz")
        # feature_names and windows are stored as object arrays, so loading
        # this file requires allow_pickle=True
        np.savez_compressed(
            out_path,
            X=X, y=y,
            feature_names=np.array(feat_names, dtype=object),
            pid=pid,
            windows=np.array(WIN_DEFS, dtype=object),
            baseline=np.array(BASELINE, dtype=float)
        )
        print(f"✓ {pid}: saved features → {out_path}")

    return X, y, feat_names


In [None]:
# executing feature extraction

# every entry directly under STC_DIR is treated as a participant id
stc_entries = sorted(glob.glob(os.path.join(STC_DIR, "*")))
pids = [os.path.basename(entry) for entry in stc_entries]
print(f"Found {len(pids)} participants in {STC_DIR}.")

for pid in pids:
    # a failure for one participant is reported and the loop moves on
    try:
        X, y, fn = extract_features_for_participant(
            pid,
            do_baseline=True,
            save_to_disk=True,
        )
        print(f"✓ {pid}: X={X.shape}, classes={np.bincount(y)}")
    except Exception as e:
        print(f"⚠ {pid}: {e}")


In [None]:
## checking how feature extraction and ROI are doing - also make a plot potentially!

pid = "0163"

# which files are we matching?
pattern = os.path.join(STC_DIR, pid, f"{pid}-trial*-PAS*-dSPM.h5-lh.stc")
files = sorted(glob.glob(pattern))
print(f"\n[{pid}] matched {len(files)} left-hemi files. First 3:\n", files[:3])

# loading one STC and timing the read
t0 = time.perf_counter()
stc = mne.read_source_estimate(files[0])
t1 = time.perf_counter()
print(f"read_source_estimate time: {(t1 - t0)*1000:.1f} ms")
n_vertices_per_hemi = [len(v) for v in stc.vertices]
print("stc vertices per hemi:", n_vertices_per_hemi, "| n_times:", len(stc.times))
print("time range:", f"{stc.times[0]:.3f} → {stc.times[-1]:.3f} s")

# doing one label time-course extraction
inv_fname = os.path.join(INV_DIR, f"{pid}-inv.fif")
inv = mne.minimum_norm.read_inverse_operator(inv_fname)
ltc = mne.extract_label_time_course(stc, roi_labels[pid], src=inv["src"], mode="mean_flip")
print("label time course shape:", ltc.shape)  # (n_rois, n_times)

# checking baseline: per-ROI mean over the pre-stimulus window
BASELINE = (-0.200, 0.000)
in_baseline = (stc.times >= BASELINE[0]) & (stc.times <= BASELINE[1])
idx_base = np.where(in_baseline)[0]
pre_means = ltc[:, idx_base].mean(axis=1)
print("baseline window samples:", idx_base.size, "| pre-stim means (first 5):", np.round(pre_means[:5], 3))

# running the full extraction for this participant, plus summary
t0 = time.perf_counter()
X_demo, y_demo, fn_demo = extract_features_for_participant(pid, do_baseline=True)
t1 = time.perf_counter()
print(f"\nextract_features_for_participant time: {(t1 - t0)*1000:.1f} ms")
print("X shape:", X_demo.shape, "| y counts:", np.bincount(y_demo))
print("First 5 feature names:", fn_demo[:5])
print("First row (first 8 features):", np.round(X_demo[0, :8], 3))


In [None]:
# decoding per participant + check for overfitting

OUT_CSV   = os.path.join(FEAT_DIR, "decoding_per_participant_VANLP_with_gap.csv")
CV_SPLITS = 5

def make_pipe():
    """Return a fresh, unfitted Model 1 pipeline.

    Steps: "z" standardises the features, "clf" is a class-balanced
    multinomial logistic regression (lbfgs, C=0.1).
    """
    scaler = StandardScaler()
    classifier = LogisticRegression(
        C=0.1,
        class_weight="balanced",
        max_iter=2000,
        multi_class="multinomial",
        random_state=42,
        solver="lbfgs",
    )
    return Pipeline(steps=[("z", scaler), ("clf", classifier)])

rows = []
cv = StratifiedKFold(n_splits=CV_SPLITS, shuffle=True, random_state=7)

for npz_path in sorted(glob.glob(os.path.join(FEAT_DIR, "*_features_baselineVANLP.npz"))):
    data = np.load(npz_path, allow_pickle=True)
    X, y  = data["X"], data["y"]
    # fall back to the file-name prefix when the archive has no "pid" entry
    pid   = str(data.get("pid", os.path.basename(npz_path).split("_features_")[0]))

    # per fold: (balanced accuracy on the training part, on the held-out part)
    fold_scores = []
    for tr, te in cv.split(X, y):
        clf = make_pipe()
        clf.fit(X[tr], y[tr])
        fold_scores.append((
            balanced_accuracy_score(y[tr], clf.predict(X[tr])),
            balanced_accuracy_score(y[te], clf.predict(X[te])),
        ))

    tr_accs, va_accs = np.array(fold_scores).T
    gap = tr_accs - va_accs   # overfitting gap per fold

    val_mean = va_accs.mean()
    val_sd   = va_accs.std(ddof=1)
    rows.append(dict(
        pid=pid,
        n_trials=int(X.shape[0]),
        n_features=int(X.shape[1]),
        val_bal_acc_mean=float(val_mean),
        val_bal_acc_sd=float(val_sd),
        train_bal_acc_mean=float(tr_accs.mean()),
        gap_mean=float(gap.mean()),
    ))
    print(f"{pid}: val={va_accs.mean():.3f}±{va_accs.std(ddof=1):.3f} | train={tr_accs.mean():.3f} | gap={gap.mean():.3f}")

df = pd.DataFrame(rows).sort_values("pid")
df.to_csv(OUT_CSV, index=False)
print("\nSaved per-participant results: ", OUT_CSV)
df


In [None]:
# quick sum across participants

## Purpose: compute group level summary of mean validation balanced accuracy and mean overfitting gap across all participants

# A cell only displays its last expression, so both metrics are put into one
# table (rows: mean / std, columns: validation accuracy / overfitting gap)
# instead of two separate tuple expressions, the first of which was discarded.
df[["val_bal_acc_mean", "gap_mean"]].agg(["mean", "std"])

In [None]:
# one-sample t-test comparing each participant's mean balanced accuracy score against chance (0.333)

from scipy.stats import ttest_1samp

chance = 1/3   # three classes
ttest_res = ttest_1samp(df["val_bal_acc_mean"], chance)
t, p = ttest_res
print(f"t={t:.2f}, p={p:.3f}")


In [None]:
# ranking participants

## Purpose: to rank participants based on which participants contributed most robustly to the group effect.

val_acc = df["val_bal_acc_mean"]
# fraction of the training accuracy that is retained on validation folds
df["val/train_ratio"] = val_acc / df["train_bal_acc_mean"]
# validation accuracy minus the train-validation gap (penalises overfitting)
df["efficiency"] = val_acc - df["gap_mean"]

df.sort_values(by="efficiency", ascending=False)

In [None]:
# plotting group summary

# loading data (participants ordered by validation accuracy, best first)
FEAT_DIR = "/work/GrétaHarsányi#3675/Assignment2/2025Neuro/analysis_files/features"
CSV = os.path.join(FEAT_DIR, "decoding_per_participant_VANLP_with_gap.csv")
df = pd.read_csv(CSV).sort_values("val_bal_acc_mean", ascending=False)

pids = df["pid"].astype(str).tolist()
chance = 1/3

# stats
val_mean = df["val_bal_acc_mean"].mean()
val_sd   = df["val_bal_acc_mean"].std(ddof=1)
gap_mean = df["gap_mean"].mean()
gap_sd   = df["gap_mean"].std(ddof=1)
t, p = ttest_1samp(df["val_bal_acc_mean"], chance)
print(f"Group val = {val_mean:.3f} ± {val_sd:.3f}")
print(f"Group gap = {gap_mean:.3f} ± {gap_sd:.3f}")
print(f"t-test vs chance (0.333): t={t:.2f}, p={p:.3f}")

# 1st plot: validation vs training accuracy, side by side per participant
x = np.arange(len(pids))
w = 0.38

fig, ax = plt.subplots(figsize=(8, 4))
ax.bar(x - w/2, df["val_bal_acc_mean"], width=w, label="Validation", alpha=0.9)
ax.bar(x + w/2, df["train_bal_acc_mean"], width=w, label="Training", alpha=0.5)
ax.axhline(chance, color="k", ls="--", lw=1, label="Chance (1/3)")
ax.set_xticks(x)
ax.set_xticklabels(pids, rotation=45, ha="right")
ax.set_ylabel("Balanced accuracy")
ax.set_title(f"Per-participant decoding (VAN + LP) — t={t:.2f}, p={p:.3f}")
ax.legend()
fig.tight_layout()
f1 = os.path.join(FEAT_DIR, "group_decoding_bar_VANLP.png")
fig.savefig(f1, dpi=300)
plt.close(fig)
print("Saved:", f1)

# 2nd plot: overfitting per participant
fig, ax = plt.subplots(figsize=(8, 3))
ax.bar(pids, df["gap_mean"], color="tab:orange")
plt.setp(ax.get_xticklabels(), rotation=45, ha="right")
ax.set_ylabel("Train − Val")
ax.set_title("Overfitting gap per participant (sorted by val. accuracy)")
fig.tight_layout()
f2 = os.path.join(FEAT_DIR, "overfit_gap_per_participant.png")
fig.savefig(f2, dpi=300)
plt.close(fig)
print("Saved:", f2)

In [None]:

# paths
# The permutation-importance step of this notebook writes its per-participant
# CSVs to <analysis_files>/model_1/importance, so they are read from there
# (that step must have been run at least once before this cell).
IMP_DIR = "/work/GrétaHarsányi#3675/Assignment2/2025Neuro/analysis_files/model_1/importance"
OUT_DIR = os.path.join(IMP_DIR, "plots")
os.makedirs(OUT_DIR, exist_ok=True)

# loading participants
files = sorted(glob.glob(os.path.join(IMP_DIR, "*_features.csv")))
if not files:
    raise FileNotFoundError(f"No importance files found in {IMP_DIR}")

dfs = [pd.read_csv(f) for f in files]
df_all = pd.concat(dfs, ignore_index=True)


# Helper to split "feature" to ROI + window
def split_feature(f):
    """Split "<roi>_<window>" at the last underscore into (roi, window)."""
    roi, win = f.rsplit("_", 1)
    return roi, win

# vectorised equivalent of applying split_feature() row by row
feature_parts = df_all["feature"].str.rsplit("_", n=1, expand=True)
df_all["roi"], df_all["window"] = feature_parts[0], feature_parts[1]

# keeping VAN and LP only
df_filtered = df_all[df_all["window"].isin(["VAN", "LP"])]

# rows: ROI, columns: window, values: importance averaged over participants
mean_imp = df_filtered.groupby(["roi", "window"])["importance"].mean().unstack()

# plotting heatmap
plt.figure(figsize=(8, 12))
sns.heatmap(
    mean_imp,
    cmap="magma",
    cbar_kws={"label": "Mean permutation importance"},
    annot=False
)

plt.title("ROI × window importance (Model 1)")
plt.xlabel("Time window")
plt.ylabel("ROI")
plt.tight_layout()

save_path = os.path.join(OUT_DIR, "roi_window_importance_heatmap.png")
plt.savefig(save_path, dpi=300)
plt.close()

print("Saved heatmap to:", save_path)


In [None]:

# Model 1 output locations and the per-participant decoding results
BASE_DIR = "/work/GrétaHarsányi#3675/Assignment2/2025Neuro/analysis_files"
MODEL_DIR = os.path.join(BASE_DIR, "model_1")
IMP_DIR   = os.path.join(MODEL_DIR, "importance")
for _needed_dir in (MODEL_DIR, IMP_DIR):
    os.makedirs(_needed_dir, exist_ok=True)

FEAT_DIR  = os.path.join(BASE_DIR, "features")
RES_CSV   = os.path.join(FEAT_DIR, "decoding_per_participant_VANLP_with_gap.csv")

# one row per participant, ordered by participant id
df = (
    pd.read_csv(RES_CSV)
    .sort_values("pid")
    .reset_index(drop=True)
)
chance = 1/3

In [None]:
## group stats plus 95% CI - pretty much the same as the ones before, but more report-ready and pretty!

mean_acc  = df["val_bal_acc_mean"].mean()
sem_acc   = df["val_bal_acc_mean"].std(ddof=1)/np.sqrt(len(df))
# Two-sided 95% critical value of Student's t with n-1 degrees of freedom.
# The normal value 1.96 is only appropriate for large n and would make the
# interval too narrow (and inconsistent with the t-test below) for a small
# group of participants.
t_crit = stats.t.ppf(0.975, len(df) - 1)
ci95 = (mean_acc - t_crit*sem_acc, mean_acc + t_crit*sem_acc)

t, p = ttest_1samp(df["val_bal_acc_mean"], chance)

summary = pd.DataFrame([{
    "n_participants": len(df),
    "group_mean_val": mean_acc,
    "ci95_low": ci95[0],
    "ci95_high": ci95[1],
    "t_vs_chance": t,
    "p_vs_chance": p,
    "group_mean_gap": df["gap_mean"].mean(),
}])
summary_path = os.path.join(MODEL_DIR, "group_summary.csv")
summary.to_csv(summary_path, index=False)

print(f"Group mean = {mean_acc:.3f} (95% CI {ci95[0]:.3f}–{ci95[1]:.3f})")
print(f"t-test vs chance (0.333): t={t:.2f}, p={p:.4f}")
print("Saved:", summary_path)


In [None]:
# overfitting measures, focusing on the gap itself

pid_labels = df["pid"].astype(str)

# side-by-side bars: validation vs training accuracy per participant
x = np.arange(len(df))
w = 0.38
fig, ax = plt.subplots(figsize=(8,4))
ax.bar(x - w/2, df["val_bal_acc_mean"], width=w, label="Validation", alpha=0.9)
ax.bar(x + w/2, df["train_bal_acc_mean"], width=w, label="Training", alpha=0.5)
ax.axhline(chance, color="k", ls="--", lw=1, label="Chance (1/3)")
ax.set_xticks(x)
ax.set_xticklabels(pid_labels, rotation=45, ha="right")
ax.set_ylabel("Balanced accuracy")
ax.set_title("Per-participant decoding (VAN + LP)")
ax.legend()
fig.tight_layout()
f1 = os.path.join(MODEL_DIR, "per_participant_val_vs_train.png")
fig.savefig(f1, dpi=200)
plt.close(fig)
print("Saved:", f1)

# gap per participant
fig, ax = plt.subplots(figsize=(8,3))
ax.bar(pid_labels, df["gap_mean"], color="tab:orange")
plt.setp(ax.get_xticklabels(), rotation=45, ha="right")
ax.set_ylabel("Train − Val")
ax.set_title("Overfitting gap per participant")
fig.tight_layout()
f2 = os.path.join(MODEL_DIR, "overfit_gap_per_participant.png")
fig.savefig(f2, dpi=200)
plt.close(fig)
print("Saved:", f2)

# histogram of the gap across participants
fig, ax = plt.subplots(figsize=(5,3))
ax.hist(df["gap_mean"], bins=8, edgecolor="k")
ax.set_xlabel("Train − Val")
ax.set_ylabel("Count")
ax.set_title("Distribution of overfitting gap")
fig.tight_layout()
f3 = os.path.join(MODEL_DIR, "overfit_gap_hist.png")
fig.savefig(f3, dpi=200)
plt.close(fig)
print("Saved:", f3)


In [None]:
# permutation importance per feature (fitted on all trials per pid)

## Purpose: answering the question: which source-space regions in VAN and LP regions drive awareness decoding most strongly?

def make_pipe_for_imp(C=0.1):
    """Return a fresh, unfitted pipeline for the permutation-importance analysis.

    Same steps as the Model 1 decoder: "z" standardises the features, "clf"
    is a class-balanced multinomial logistic regression (lbfgs).

    Parameters
    ----------
    C : float
        Inverse regularisation strength. Defaults to 0.1, the value the
        Model 1 decoding pipeline uses, so that importances describe the
        same model whose accuracy is reported (the previous version silently
        fell back to scikit-learn's default C).
    """
    return Pipeline([
        ("z", StandardScaler()),
        ("clf", LogisticRegression(multi_class="multinomial",
                                  solver="lbfgs", max_iter=2000,
                                  class_weight="balanced", C=C))
    ])

def compute_perm_importance_for_pid(npz_path, n_repeats=50, random_state=7):
    """Compute and save permutation importances for one participant's features.

    The pipeline is fitted on all trials in `npz_path` and the importance of
    each feature is scored on those same trials (balanced accuracy), i.e. the
    importances are in-sample.

    Parameters
    ----------
    npz_path : str
        Feature archive containing "X", "y", "feature_names" and, optionally, "pid".
    n_repeats : int
        Number of permutations per feature.
    random_state : int
        Seed passed to `permutation_importance`.

    Returns
    -------
    pandas.DataFrame
        Columns pid, feature, importance, importance_sd; sorted by descending
        importance. The same table is written to
        IMP_DIR/<pid>_perm_importance_features.csv.
    """
    # read everything that is needed inside the context manager so the
    # underlying file handle is closed afterwards
    with np.load(npz_path, allow_pickle=True) as dat:
        X, y = dat["X"], dat["y"]
        fn   = dat["feature_names"].astype(str)
        # fall back to the file-name prefix when the archive has no "pid" entry
        pid  = str(dat.get("pid", os.path.basename(npz_path).split("_features_")[0]))

    pipe = make_pipe_for_imp().fit(X, y)
    res  = permutation_importance(pipe, X, y, scoring="balanced_accuracy",
                                  n_repeats=n_repeats, random_state=random_state)
    imp  = pd.DataFrame({
        "pid": pid,
        "feature": fn,
        "importance": res.importances_mean,
        "importance_sd": res.importances_std
    }).sort_values("importance", ascending=False)

    out_csv = os.path.join(IMP_DIR, f"{pid}_perm_importance_features.csv")
    imp.to_csv(out_csv, index=False)
    print(f"Saved feature importance → {out_csv}")
    return imp

# one importance table per participant feature file, in sorted file order
feature_files = sorted(glob.glob(os.path.join(FEAT_DIR, "*_features_baselineVANLP.npz")))
all_imps = []
for npz in feature_files:
    imp_table = compute_perm_importance_for_pid(npz, n_repeats=50)
    all_imps.append(imp_table)


In [None]:
# aggregate confusion matrix - important, keep!

BASE_DIR  = "/work/GrétaHarsányi#3675/Assignment2/2025Neuro/analysis_files"
FEAT_DIR  = os.path.join(BASE_DIR, "features")
MODEL_DIR = os.path.join(BASE_DIR, "model_1")

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=7)

def make_pipe():
    """Return a fresh, unfitted pipeline: standardisation followed by a
    class-balanced multinomial logistic regression (lbfgs, C=0.1)."""
    classifier = LogisticRegression(
        multi_class="multinomial",
        solver="lbfgs",
        max_iter=2000,
        class_weight="balanced",
        C=0.1,
    )
    return Pipeline(steps=[("z", StandardScaler()), ("clf", classifier)])

labels = ["PAS1","PAS2","PAS3_4"]
cm = np.zeros((3,3), dtype=float)
n_tot = 0

# sum the cross-validated confusion matrices of all participants
for npz in sorted(glob.glob(os.path.join(FEAT_DIR, "*_features_baselineVANLP.npz"))):
    d = np.load(npz, allow_pickle=True)
    X, y = d["X"], d["y"]
    y_pred = cross_val_predict(make_pipe(), X, y, cv=cv, method="predict")
    cm = cm + confusion_matrix(y, y_pred, labels=[0,1,2]).astype(float)
    n_tot = n_tot + len(y)

# divide every row (true class) by its total, so each row sums to 1
row_totals = cm.sum(axis=1, keepdims=True)
cm_norm = cm / row_totals

fig, ax = plt.subplots(figsize=(4.2,3.8))
im = ax.imshow(cm_norm, cmap="Blues", vmin=0, vmax=1)
fig.colorbar(im, ax=ax, fraction=0.046, pad=0.04)
ax.set_xticks(range(3))
ax.set_xticklabels(labels)
ax.set_yticks(range(3))
ax.set_yticklabels(labels)
ax.set_xlabel("Predicted")
ax.set_ylabel("True")
ax.set_title("Aggregate confusion matrix (recall normalised)")
# annotate every cell with its normalised value
for i, j in np.ndindex(3, 3):
    ax.text(j, i, f"{cm_norm[i,j]:.2f}", ha="center", va="center", fontsize=9)
fig.tight_layout()
out = os.path.join(MODEL_DIR, "aggregate_confusion_matrix.png")
fig.savefig(out, dpi=200)
plt.close(fig)
print("Saved:", out)


In [None]:
# permutation null significance per participant

BASE_DIR  = "/work/GrétaHarsányi#3675/Assignment2/2025Neuro/analysis_files"
FEAT_DIR  = os.path.join(BASE_DIR, "features")
MODEL_DIR = os.path.join(BASE_DIR, "model_1")
os.makedirs(MODEL_DIR, exist_ok=True)

# CV + model
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=7)

def make_pipe():
    """Return a fresh, unfitted pipeline: standardisation followed by a
    class-balanced multinomial logistic regression (lbfgs)."""
    scaler = StandardScaler()
    classifier = LogisticRegression(
        multi_class="multinomial",
        solver="lbfgs",
        max_iter=2000,
        class_weight="balanced",
        C=0.1,  # keeping consistent with main Model 1
    )
    return Pipeline(steps=[("z", scaler), ("clf", classifier)])

def perm_p_value(obs, null):
    """Permutation p-value with add-one smoothing.

    Returns (1 + number of null values >= obs) / (1 + number of null values).
    """
    n_at_least_obs = np.asarray(null >= obs).sum()
    return (n_at_least_obs + 1) / (len(null) + 1)

def perm_null_stats(X, y, n_perm=200, rng_seed=0):
    """Cross-validated score plus a label-permutation null distribution.

    The observed score is the mean cross-validated balanced accuracy on the
    true labels; the null repeats the same cross-validation `n_perm` times
    with shuffled labels, seeded from `rng_seed`.

    Returns
    -------
    tuple of float
        (observed, null_mean, null_sd, z, p_perm); z is NaN unless the null
        standard deviation is greater than zero.
    """
    pipe = make_pipe()

    def _cv_score(labels):
        # mean balanced accuracy over the folds of the module-level `cv`
        return cross_val_score(pipe, X, labels, cv=cv, scoring="balanced_accuracy").mean()

    # observed (cross-validated)
    obs = _cv_score(y)

    # permutation null
    rng = np.random.RandomState(rng_seed)
    null = np.empty(n_perm, dtype=float)
    for i in range(n_perm):
        shuffle_seed = int(rng.randint(0, 1e9))
        null[i] = _cv_score(sk_shuffle(y, random_state=shuffle_seed))

    null_mean = float(null.mean())
    null_sd   = float(null.std(ddof=1))
    if null_sd > 0:
        z = (obs - null_mean) / null_sd
    else:
        z = np.nan
    p_perm = perm_p_value(obs, null)
    return float(obs), null_mean, null_sd, float(z), float(p_perm)

rows = []
for npz in sorted(glob.glob(os.path.join(FEAT_DIR, "*_features_baselineVANLP.npz"))):
    d = np.load(npz, allow_pickle=True)
    X, y = d["X"], d["y"]
    # fall back to the file-name prefix when the archive has no "pid" entry
    pid = str(d.get("pid", os.path.basename(npz).split("_features_")[0]))
    obs, m, s, z, p = perm_null_stats(X, y, n_perm=200, rng_seed=7)
    rows.append(dict(pid=pid, observed=obs, null_mean=m, null_sd=s, z_score=z, p_perm=p))
    print(f"{pid}: obs={obs:.3f}, null={m:.3f}±{s:.3f}, z={z:.2f}, p_perm={p:.4f}")

df_z = pd.DataFrame(rows).sort_values("pid").reset_index(drop=True)

# FDR (Benjamini-Hochberg) across participants
rej, p_fdr = multipletests(df_z["p_perm"].values, method="fdr_bh")[:2]
df_z["p_fdr"] = p_fdr
df_z["sig_fdr_05"] = rej

csv_path = os.path.join(MODEL_DIR, "perm_zscores.csv")
df_z.to_csv(csv_path, index=False)
print("✓ saved z-scores & permutation p-values →", csv_path)

# --- Plot: z-scores with significance threshold ---
# bars of participants that pass FDR are purple, the rest light gray
fig, ax = plt.subplots(figsize=(6,4))
colors = np.where(df_z["sig_fdr_05"], "tab:purple", "lightgray")
ax.bar(
    df_z["pid"].astype(str),
    df_z["z_score"],
    color=colors,
    alpha=0.9,
    edgecolor="k",
    linewidth=0.4,
)
ax.axhline(1.96, color="gray", ls="--", lw=1, label="≈ p < 0.05 (z=1.96)")
ax.axhline(0, color="black", lw=0.8)
ax.set_ylabel("Permutation z-score")
ax.set_title("Significance of decoding per participant (FDR-highlighted)")
ax.legend()
fig.tight_layout()
out = os.path.join(MODEL_DIR, "zscore_per_participant.png")
fig.savefig(out, dpi=200)
plt.close(fig)
print("Saved:", out)


In [None]:
# oading per participant permutation importances

IMP_DIR = "/work/GrétaHarsányi#3675/Assignment2/2025Neuro/analysis_files/importance"
files = sorted(glob.glob(os.path.join(IMP_DIR, "*features.csv")))
if not files:
    raise FileNotFoundError(f"No importance CSVs found in {IMP_DIR}")

dfs = [pd.read_csv(f) for f in files]
df_all = pd.concat(dfs, ignore_index=True)

# ean importance for participants
mean_imp = df_all.groupby("feature")[["LP", "VAN"]].mean()
mean_imp = mean_imp.sort_index()

# lotting heatmap
plt.figure(figsize=(8, 12))
sns.heatmap(
    mean_imp,
    cmap="magma",
    annot=False,
    cbar_kws={"label": "Mean permutation importance"}
)

plt.title("ROI × window importance (mean across participants)")
plt.xlabel("Time window")
plt.ylabel("ROI")
plt.tight_layout()
plt.show()


## Model #2: Pairwise PAS decoding

In [None]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=7)

def make_pipe():
    """Return a fresh, unfitted pipeline for the pairwise (binary) models:
    standardisation followed by a class-balanced logistic regression (lbfgs)."""
    classifier = LogisticRegression(
        solver="lbfgs",
        max_iter=2000,
        class_weight="balanced",
    )
    return Pipeline(steps=[("z", StandardScaler()), ("clf", classifier)])

# (class_a, class_b, label) contrasts; class codes follow the y vector stored in the feature files.
PAIRWISE = [(0,1,"PAS1_vs_PAS2"), (1,2,"PAS2_vs_PAS3_4"), (0,2,"PAS1_vs_PAS3_4")]
MIN_PAIR_TRIALS = 10   # minimum number of trials (both classes together) to attempt a contrast
rows = []
for npz_path in sorted(glob.glob(os.path.join(FEAT_DIR, "*_features_baselineVANLP.npz"))):
    # NOTE(review): allow_pickle=True unpickles object arrays, which can execute
    # arbitrary code — only load feature files produced by this project.
    # The context manager closes the archive once the arrays have been read.
    with np.load(npz_path, allow_pickle=True) as d:
        X, y = d["X"], d["y"]
        if "pid" in d.files:
            pid = str(d["pid"])
        else:
            # fall back to the participant id encoded in the file name
            pid = os.path.basename(npz_path).split("_features_")[0]

    for a, b, label in PAIRWISE:
        m = np.isin(y, [a, b])
        if m.sum() < MIN_PAIR_TRIALS:
            continue
        Xs, ys = X[m], y[m]
        ybin = (ys == b).astype(int)

        # Stratified k-fold needs at least one trial of each class per fold; a
        # missing or tiny class would make the fits fail or the balanced
        # accuracy degenerate, so such contrasts are skipped explicitly.
        if np.bincount(ybin, minlength=2).min() < cv.get_n_splits():
            print(f"→ {pid}: skipping {label} (too few trials in one class)")
            continue

        acc = cross_val_score(make_pipe(), Xs, ybin, cv=cv, scoring="balanced_accuracy").mean()
        rows.append({"pid": pid, "pair": label, "bal_acc": acc})

pair_df = pd.DataFrame(rows).sort_values(["pair","pid"])
pair_csv = os.path.join(MODEL_DIR, "pairwise_results.csv")
pair_df.to_csv(pair_csv, index=False)
print("Saved pairwise:", pair_csv)

# t-tests vs chance=0.5 for each pair (one sample per participant)
summ_rows = []
for p, grp in pair_df.groupby("pair", sort=False):
    vals = grp["bal_acc"].to_numpy()
    t, pval = ttest_1samp(vals, 0.5)
    summ_rows.append(
        dict(pair=p, mean=vals.mean(), sd=vals.std(ddof=1), t=t, p=pval)
    )

pair_summary = pd.DataFrame(summ_rows).sort_values("pair")
pair_summary.to_csv(os.path.join(MODEL_DIR, "pairwise_summary.csv"), index=False)
print(pair_summary)

# boxplot (matplotlib): one box per contrast, dashed line at chance (0.5)
fig, ax = plt.subplots(figsize=(6, 4))
groups = [
    pair_df.loc[pair_df["pair"] == p, "bal_acc"].values
    for p in pair_summary["pair"]
]
ax.boxplot(groups, labels=pair_summary["pair"], patch_artist=True)
ax.axhline(0.5, color="gray", ls="--")
ax.set_ylabel("Balanced accuracy")
ax.set_title("Pairwise PAS decoding")
fig.tight_layout()
bp = os.path.join(MODEL_DIR, "pairwise_boxplot.png")
fig.savefig(bp, dpi=200)
plt.close(fig)
print("Saved:", bp)


## Model #3: Cumulative time decoding model based on shrinkage LDA fitted on ROI activity averaged cumulatively over the epoch time points

Tasks: 

- build ROI×time data per participant (baseline + decimate)
- run cumulative decoding (all ROIs, OT-only, FP-only)
- save per-participant curves and a group plot with VAN/LP shading

In [None]:
# setup for Model 3: paths and analysis constants (the imports live in the import cell at the top)

# paths
# NOTE: this rebinds SAVE_DIR; the noise-covariance cell earlier in the notebook
# points the same name at the events_epochs_evokeds folder.
SAVE_DIR = "/work/GrétaHarsányi#3675/Assignment2/2025Neuro/analysis_files"
STC_DIR  = f"{SAVE_DIR}/stc"
INV_DIR  = f"{SAVE_DIR}/inverse"
OUT_DIR  = f"{SAVE_DIR}/cumulative_fast"
os.makedirs(OUT_DIR, exist_ok=True)

# time windows in seconds
BASELINE = (-0.200, 0.000)   # interval whose mean is subtracted from each ROI time course
VAN_WIN  = (0.150, 0.250)    # shaded as "VAN" in the group plots
LP_WIN   = (0.330, 0.550)    # shaded as "LP" in the group plots
chance   = 1/3               # chance level of the 3-class decoding

# runtime/sampling specs
DECIMATE  = 4                # keep every DECIMATE-th time sample when loading the STCs
STEP      = 3                # evaluate the cumulative decoder at every STEP-th (decimated) time index
CUTOFF    = 0.55             # samples later than this time (s) are dropped
CV_SPLITS = 5         # number of stratified cross-validation folds

# Regularisation/feature
SUPER_K     = 4        # trials to average into one super-trial (train side)
AVG_TEST    = False    # do not average test fold
TOPN_ROI    = 40       # keeping top-N ROIs by variance in training fold
RAND_SEED   = 13       # reproducibility

# ROI groups (Desikan–Killiany) - same as model 1!
# OT = occipito-temporal, FP = frontal–parietal; entries are matched against the
# lower-cased label name before its first "-".
OT = {'pericalcarine','cuneus','lateraloccipital','lingual',
      'fusiform','inferiortemporal','middletemporal','superiortemporal'}
FP = {'superiorfrontal','rostralmiddlefrontal','caudalmiddlefrontal',
      'lateralorbitofrontal','medialorbitofrontal',
      'superiorparietal','inferiorparietal','precuneus','postcentral',
      'rostralanteriorcingulate','isthmuscingulate'}



In [None]:
# getting subjects and labels

subjects_dir = "/work/freesurfer"
EPOCHS_DIR   = "/work/GrétaHarsányi#3675/Assignment2/2025Neuro/events_epochs_evokeds/epochs"

# participant id -> list of cortical parcellation labels ("aparc" annotation)
roi_labels = {}
for ep_path in sorted(glob.glob(os.path.join(EPOCHS_DIR, "*-epo_stim_withPAS_clean.fif"))):
    pid = os.path.basename(ep_path).split("-epo")[0]
    try:
        labs = mne.read_labels_from_annot(subject=pid, parc="aparc", subjects_dir=subjects_dir)
    except OSError:
        # A missing annotation file/directory may surface as a plain OSError rather
        # than FileNotFoundError; OSError covers both, so the participant is
        # skipped instead of aborting the whole loop.
        print(f"→ {pid}: missing aparc")
        continue
    # drop the non-cortical "unknown" and "corpuscallosum" entries
    sel = [lab for lab in labs if not lab.name.lower().startswith(("unknown","corpuscallosum"))]
    roi_labels[pid] = sel
    print(f"✓ {pid}: {len(sel)} labels")


In [29]:
def load_roi_time_data_fast(pid, labels_dict, decimate=DECIMATE):
    """Load one participant's single-trial source estimates as ROI × time arrays.

    Parameters
    ----------
    pid : str
        Participant id; selects the STC sub-folder and the inverse operator file.
    labels_dict : dict
        Maps participant id to the list of labels to extract.
    decimate : int
        Keep every ``decimate``-th time sample (after the CUTOFF crop).

    Returns
    -------
    X_time : ndarray, (n_trials, n_rois, n_times)
        Baseline-corrected label time courses.
    y : ndarray
        Class per trial parsed from the file name (PAS1 -> 0, PAS2 -> 1, PAS3_4 -> 2).
    times : ndarray
        Time stamps matching the last axis of ``X_time``.
    feat_names : list of str
        Label names, in ROI-axis order.

    Raises
    ------
    FileNotFoundError
        If no STC file matches for ``pid``.
    """
    pattern = os.path.join(STC_DIR, pid, f"{pid}-trial*-PAS*-dSPM.h5-lh.stc")
    stc_files = sorted(glob.glob(pattern))
    if len(stc_files) == 0:
        raise FileNotFoundError(f"No STCs for {pid}")

    inv_path = os.path.join(INV_DIR, f"{pid}-inv.fif")
    src = mne.minimum_norm.read_inverse_operator(inv_path)["src"]
    labels = labels_dict[pid]

    # The time axis of the first trial defines the baseline and crop indices for all trials.
    times_full = mne.read_source_estimate(stc_files[0]).times
    in_baseline = (times_full >= BASELINE[0]) & (times_full <= BASELINE[1])
    idx_base = np.flatnonzero(in_baseline)
    idx_cut = np.flatnonzero(times_full <= CUTOFF)

    pas_to_class = {'1': 0, '2': 1, '3_4': 2}
    trial_arrays, trial_classes = [], []
    for stc_path in stc_files:
        hit = re.search(r"PAS(1|2|3_4)", stc_path)
        if hit is None:
            continue
        trial_classes.append(pas_to_class[hit.group(1)])
        stc = mne.read_source_estimate(stc_path)
        ltc = mne.extract_label_time_course(stc, labels, src=src, mode='mean_flip')
        # subtract the per-ROI baseline mean, then crop and decimate
        baseline_mean = ltc[:, idx_base].mean(axis=1, keepdims=True)
        corrected = ltc - baseline_mean
        trial_arrays.append(corrected[:, idx_cut][:, ::decimate])

    X_time = np.stack(trial_arrays, axis=0)  # (n_trials, n_rois, n_times)
    y = np.asarray(trial_classes)
    times = times_full[idx_cut][::decimate]
    feat_names = [lab.name for lab in labels]
    return X_time, y, times, feat_names

In [30]:
def make_lda_shrink():
    """Return a fresh pipeline: standardisation followed by LDA with fixed shrinkage (0.5).

    The shrinkage is set comparatively strong to limit overfitting.
    """
    steps = [
        ("scaler", StandardScaler(with_mean=True, with_std=True)),
        ("lda", LinearDiscriminantAnalysis(solver="lsqr", shrinkage=0.5)),
    ]
    return Pipeline(steps)

def features_from_window(X_time, t_index):
    """Mean-pool the third axis of ``X_time`` over indices ``0..t_index`` (inclusive)."""
    stop = t_index + 1
    window = X_time[:, :, :stop]
    return window.mean(axis=2)

def _make_supertrials(X, y, k, random_state, avg_test=False, is_train=True):
    if not is_train and not avg_test or k <= 1:
        return X, y
    rs = np.random.RandomState(random_state)
    X_out, y_out = [], []
    for cls in np.unique(y):
        idx = np.where(y == cls)[0]
        rs.shuffle(idx)
        if len(idx) < k:
            X_out.append(X[idx].mean(axis=0, keepdims=True))
            y_out.append([cls]); continue
        n_grp = len(idx)//k
        idx = idx[:n_grp*k].reshape(n_grp, k)
        X_grp = X[idx].mean(axis=1)
        y_grp = np.full(n_grp, cls)
        X_out.append(X_grp); y_out.append(y_grp)
    return np.vstack(X_out), np.concatenate(y_out)

def _topn_roi_indices(X_train, topn):
    if topn is None or topn <= 0 or topn >= X_train.shape[1]:
        return None
    var = X_train.var(axis=0)
    return np.argsort(var)[-topn:]

def cumulative_decoding_curve_overfit(X_time, y, times, roi_mask=None):
    """Cumulative decoding curve with train and validation accuracy per time step.

    For every STEP-th time index ``t`` the ROI time courses are averaged from the
    first sample up to ``t`` and a shrinkage-LDA pipeline is evaluated with
    stratified CV_SPLITS-fold cross-validation. Inside each fold the TOPN_ROI
    highest-variance ROIs are selected on the training data and training trials
    are averaged into super-trials (test trials only if AVG_TEST is set).

    Parameters
    ----------
    X_time : ndarray, indexed as (trial, ROI, time)
    y : ndarray
        Class label per trial.
    times : ndarray
        Time stamps matching the last axis of ``X_time``.
    roi_mask : array-like or None
        Optional selector applied to the ROI axis before decoding.

    Returns
    -------
    acc_train, acc_val : ndarray
        Fold-averaged balanced accuracy on the training / held-out data.
    gap : ndarray
        ``acc_train - acc_val``.
    t_sub : ndarray
        Time stamps of the evaluated steps.

    Raises
    ------
    ValueError
        If no ROI is left after masking, fewer than two classes are present, or
        a present class has fewer than CV_SPLITS trials.
    """
    if roi_mask is not None:
        X_time = X_time[:, roi_mask, :]
    if X_time.shape[1] == 0:
        raise ValueError("No ROIs left after applying roi_mask.")

    # Count only the classes that actually occur. np.bincount would report a
    # zero for every unused integer below the largest label and wrongly reject
    # e.g. a participant who only has classes 0 and 2.
    classes, counts = np.unique(y, return_counts=True)
    if len(classes) < 2:
        raise ValueError("Only one label present.")
    if counts.min() < CV_SPLITS:
        raise ValueError(f"Too few samples for {CV_SPLITS}-fold CV.")

    n_times = X_time.shape[2]
    idxs = list(range(0, n_times, STEP))
    t_sub = times[idxs]

    acc_train, acc_val = np.zeros(len(idxs)), np.zeros(len(idxs))
    cv = StratifiedKFold(n_splits=CV_SPLITS, shuffle=True, random_state=7)

    for j, t in enumerate(idxs):
        X_feat = features_from_window(X_time, t)
        fold_tr, fold_va = [], []
        for fold_id, (tr, te) in enumerate(cv.split(X_feat, y)):
            X_tr, y_tr = X_feat[tr], y[tr]
            X_te, y_te = X_feat[te], y[te]

            # top-N ROI selection (chosen on the training fold, applied to both)
            roi_idx = _topn_roi_indices(X_tr, TOPN_ROI)
            if roi_idx is not None:
                X_tr, X_te = X_tr[:, roi_idx], X_te[:, roi_idx]

            # super-trial averaging; the seed varies with time step and fold
            fold_seed = RAND_SEED + j + 17*fold_id
            X_tr_st, y_tr_st = _make_supertrials(X_tr, y_tr, k=SUPER_K,
                random_state=fold_seed, avg_test=False, is_train=True)
            X_te_st, y_te_st = _make_supertrials(X_te, y_te, k=SUPER_K,
                random_state=fold_seed, avg_test=AVG_TEST, is_train=False)

            clf = make_lda_shrink()
            clf.fit(X_tr_st, y_tr_st)
            fold_tr.append(balanced_accuracy_score(y_tr_st, clf.predict(X_tr_st)))
            fold_va.append(balanced_accuracy_score(y_te_st, clf.predict(X_te_st)))

        acc_train[j] = np.mean(fold_tr)
        acc_val[j]   = np.mean(fold_va)

    gap = acc_train - acc_val
    return acc_train, acc_val, gap, t_sub


### NOTE: run the chunk below on a 64-core machine for quicker output. Otherwise lower the parallelisation settings (`n_jobs`) so the kernel doesn't crash! Make sure to switch back to a smaller machine afterwards.

In [None]:

# running for all participants

def run_participant(pid):
    """Run the cumulative decoding for one participant and write the curves to CSV.

    Decoding is run three times: with all ROIs, with the occipito-temporal (OT)
    subset and with the frontal–parietal (FP) subset. The train / validation /
    gap curves are saved as ``<pid>_cumulative_curves_overfit.csv`` in OUT_DIR.

    Returns
    -------
    dict
        ``{"pid", "status": "ok", "n_trials"}`` on success, otherwise
        ``{"pid", "status": "error", "msg"}``. Exceptions are caught and reported
        in the returned dict instead of being raised.
    """
    try:
        X_time, y, times, feat_labels = load_roi_time_data_fast(pid, roi_labels)
        # label name up to the first "-" (presumably strips the hemisphere suffix), lower-cased
        base_names = [nm.split("-")[0].lower() for nm in feat_labels]
        roi_mask_ot = np.array([bn in OT for bn in base_names])
        roi_mask_fp = np.array([bn in FP for bn in base_names])

        tr_all, va_all, gap_all, t_sub = cumulative_decoding_curve_overfit(X_time, y, times)
        tr_ot,  va_ot,  gap_ot,  _     = cumulative_decoding_curve_overfit(X_time, y, times, roi_mask_ot)
        tr_fp,  va_fp,  gap_fp,  _     = cumulative_decoding_curve_overfit(X_time, y, times, roi_mask_fp)

        df = pd.DataFrame({
            "time_s": t_sub,
            "train_all": tr_all, "val_all": va_all, "gap_all": gap_all,
            "train_OT": tr_ot, "val_OT": va_ot, "gap_OT": gap_ot,
            "train_FP": tr_fp, "val_FP": va_fp, "gap_FP": gap_fp
        })
        out_csv = os.path.join(OUT_DIR, f"{pid}_cumulative_curves_overfit.csv")
        df.to_csv(out_csv, index=False)
        return {"pid": pid, "status": "ok", "n_trials": len(y)}
    except Exception as e:
        # Keep the exception class in the report: str(KeyError("x")) is just "'x'",
        # which does not tell the reader what went wrong.
        return {"pid": pid, "status": "error", "msg": f"{type(e).__name__}: {e}"}

# One job per participant folder found directly under STC_DIR.
pids = sorted(next(os.walk(STC_DIR))[1])
print(f"Running {len(pids)} participants in parallel…")

jobs = (delayed(run_participant)(pid) for pid in tqdm(pids))
results = Parallel(n_jobs=8)(jobs)

print("\n=== Summary ===")
for r in results:
    if r["status"] != "ok":
        print(f"✗ {r['pid']}: {r['msg']}")
        continue
    print(f"✓ {r['pid']} ({r['n_trials']} trials)")


In [None]:
## group mean and plot

# getting participant info
files = sorted(glob.glob(os.path.join(OUT_DIR, "*_cumulative_curves_overfit.csv")))
if not files:
    raise FileNotFoundError(f"No overfit CSVs found in {OUT_DIR}")

# Parse every participant CSV once and reuse the frames for all curves below.
curve_dfs = [pd.read_csv(f) for f in files]
curves_all_val = [cdf["val_all"].to_numpy() for cdf in curve_dfs]
curves_ot_val  = [cdf["val_OT"].to_numpy()  for cdf in curve_dfs]
curves_fp_val  = [cdf["val_FP"].to_numpy()  for cdf in curve_dfs]
# Time axis taken from the first file; np.vstack below requires every
# participant's curve to have the same length.
times = curve_dfs[0]["time_s"].to_numpy()

gm_all = np.vstack(curves_all_val).mean(axis=0)
gm_ot  = np.vstack(curves_ot_val).mean(axis=0)
gm_fp  = np.vstack(curves_fp_val).mean(axis=0)

plt.figure(figsize=(7.5, 4.3))
plt.plot(times, gm_all, label="All ROIs (val)", lw=2)
plt.plot(times, gm_ot,  label="Occipito-temporal (val)", lw=1.8)
plt.plot(times, gm_fp,  label="Frontal–parietal (val)", lw=1.8)
plt.axvspan(VAN_WIN[0], VAN_WIN[1], color="grey", alpha=0.15, label="VAN (150–250 ms)")
plt.axvspan(LP_WIN[0],  LP_WIN[1],  color="grey", alpha=0.10, label="LP (330–550 ms)")
plt.axhline(1/3, color="k", ls="--", lw=0.8, label="Chance (3-class)")
plt.xlim(times[0], times[-1])
plt.ylim(0.30, max(gm_all.max(), gm_ot.max(), gm_fp.max()) + 0.06)
plt.xlabel("Time (s)"); plt.ylabel("Balanced accuracy (validation)")
plt.title("Cumulative decoding over time (group mean)")
plt.legend(loc="lower right", ncol=2, fontsize=8)
plt.tight_layout()
plt.savefig(os.path.join(OUT_DIR, "group_cumulative_plot_val.png"), dpi=200)
plt.show()
plt.close()  # release the figure; plt.show() does not close it

# overfitting gap plot (train minus validation accuracy, all ROIs)
gm_gap = np.vstack([cdf["gap_all"].to_numpy() for cdf in curve_dfs]).mean(axis=0)
plt.figure(figsize=(7.5, 2.8))
plt.plot(times, gm_gap, lw=2, label="All ROIs (mean gap)")
plt.axhline(0, color="k", ls="--", lw=0.8)
plt.axvspan(VAN_WIN[0], VAN_WIN[1], color="grey", alpha=0.15)
plt.axvspan(LP_WIN[0],  LP_WIN[1],  color="grey", alpha=0.10)
plt.xlabel("Time (s)"); plt.ylabel("Train − Val"); plt.title("Group mean overfitting gap")
plt.tight_layout()
plt.savefig(os.path.join(OUT_DIR, "group_overfit_gap_plot.png"), dpi=200)
plt.show()
plt.close()  # release the figure




In [None]:
## TRY: nicer and more informative plot: 

chance = 1/3


def mean_sem(arr_list):
    """Stack equally long 1-D curves row-wise and return (mean, standard error) per column.

    The standard error uses the sample standard deviation (``ddof=1``) divided
    by the square root of the number of curves.
    """
    stacked = np.vstack(arr_list)
    n_curves = stacked.shape[0]
    center = stacked.mean(axis=0)
    spread = stacked.std(axis=0, ddof=1) / np.sqrt(n_curves)
    return center, spread

def first_exceed(time, curve, base=chance, delta=0.02, min_dur=0.03):
    """Return the onset time of the first sustained excursion above ``base + delta``.

    Consecutive samples with ``curve > base + delta`` are counted; as soon as the
    run length times the median sampling step of ``time`` reaches ``min_dur``, the
    time stamp of the first sample of that run is returned. ``np.nan`` is
    returned when no run qualifies.
    """
    above = curve > (base + delta)
    step = float(np.median(np.diff(time)))
    streak = 0
    for i, is_above in enumerate(above):
        if is_above:
            streak += 1
        else:
            streak = 0
        if streak * step >= min_dur:
            return float(time[i - streak + 1])
    return np.nan

def peak_in_window(time, curve, w):
    """Return ``(peak value, peak time)`` of ``curve`` within the closed window ``w = (start, end)``.

    Returns ``(nan, nan)`` when no sample of ``time`` falls inside the window.
    On ties the earliest maximum is reported.
    """
    inside = (time >= w[0]) & (time <= w[1])
    if not np.any(inside):
        return np.nan, np.nan
    candidates = np.flatnonzero(inside)
    best = candidates[np.argmax(curve[candidates])]
    return float(curve[best]), float(time[best])


files = sorted(glob.glob(os.path.join(OUT_DIR, "*_cumulative_curves_overfit.csv")))
if len(files) == 0:
    raise FileNotFoundError(f"No overfit CSVs found in {OUT_DIR}")

# loading curves (validation accuracy per ROI set; time axis from the first file)
curves_all_val = [pd.read_csv(f)["val_all"].to_numpy() for f in files]
curves_ot_val  = [pd.read_csv(f)["val_OT"].to_numpy()  for f in files]
curves_fp_val  = [pd.read_csv(f)["val_FP"].to_numpy()  for f in files]
times = pd.read_csv(files[0])["time_s"].to_numpy()

# group mean ± SEM
gm_all, se_all = mean_sem(curves_all_val)
gm_ot,  se_ot  = mean_sem(curves_ot_val)
gm_fp,  se_fp  = mean_sem(curves_fp_val)

# 1st plot: cumulative decoding with SEM
fig, ax = plt.subplots(figsize=(7.8, 4.4))

curve_specs = [
    (gm_all, se_all, "All ROIs (val)", 2.2, 0.18),
    (gm_ot,  se_ot,  "Occipito-temporal (val)", 1.9, 0.18),
    (gm_fp,  se_fp,  "Frontal–parietal (val)", 1.9, 0.18),
]
for gm, se, lab, lw, alpha in curve_specs:
    ax.plot(times, gm, label=lab, lw=lw)
    ax.fill_between(times, gm - se, gm + se, alpha=alpha)

ax.axvspan(VAN_WIN[0], VAN_WIN[1], color="grey", alpha=0.15, label="VAN (150–250 ms)")
ax.axvspan(LP_WIN[0],  LP_WIN[1],  color="grey", alpha=0.10, label="LP (330–550 ms)")
ax.axhline(chance, color="k", ls="--", lw=0.9, label="Chance (3-class)")

ymax = max(gm_all.max(), gm_ot.max(), gm_fp.max()) + 0.06
ax.set_xlim(times[0], times[-1])
ax.set_ylim(0.30, ymax)
ax.set_xlabel("Time (s)")
ax.set_ylabel("Balanced accuracy (validation)")
ax.set_title("Cumulative decoding over time (group mean ± SEM)")
ax.legend(loc="lower right", ncol=2, fontsize=8)
fig.tight_layout()

f_val = os.path.join(OUT_DIR, "group_cumulative_plot_val_sem.png")
fig.savefig(f_val, dpi=300)
plt.close(fig)
print("Saved:", f_val)

# 2nd plot: overfitting gap (mean ± SEM, All ROIs)
curves_all_gap = [pd.read_csv(f)["gap_all"].to_numpy() for f in files]
gm_gap, se_gap = mean_sem(curves_all_gap)

fig, ax = plt.subplots(figsize=(7.8, 2.9))
ax.plot(times, gm_gap, lw=2.2, label="All ROIs (mean gap)")
ax.fill_between(times, gm_gap - se_gap, gm_gap + se_gap, alpha=0.2)
ax.axhline(0, color="k", ls="--", lw=0.9)
ax.axvspan(VAN_WIN[0], VAN_WIN[1], color="grey", alpha=0.15)
ax.axvspan(LP_WIN[0],  LP_WIN[1],  color="grey", alpha=0.10)
ax.set_xlabel("Time (s)")
ax.set_ylabel("Train − Val")
ax.set_title("Group mean overfitting gap (± SEM)")
fig.tight_layout()

f_gap = os.path.join(OUT_DIR, "group_overfit_gap_plot_sem.png")
fig.savefig(f_gap, dpi=300)
plt.close(fig)
print("Saved:", f_gap)

# onset and peak summaries (computed on the group-mean validation curves)
onset_kwargs = dict(base=chance, delta=0.02, min_dur=0.03)
ot_onset = first_exceed(times, gm_ot, **onset_kwargs)
fp_onset = first_exceed(times, gm_fp, **onset_kwargs)

# peaks are searched from the start of the VAN window to the end of the LP window
peak_win = (VAN_WIN[0], LP_WIN[1])
all_peak_val, all_peak_t = peak_in_window(times, gm_all, peak_win)
ot_peak_val,  ot_peak_t  = peak_in_window(times, gm_ot,  peak_win)
fp_peak_val,  fp_peak_t  = peak_in_window(times, gm_fp,  peak_win)

if np.isnan(ot_onset) or np.isnan(fp_onset):
    print("Onset criterion not reached for at least one curve.")
else:
    print(f"Onset > chance+0.02 for ≥30 ms — OT: {ot_onset*1000:.0f} ms, FP: {fp_onset*1000:.0f} ms")

peak_parts = [
    f"{name}: {val:.3f} @ {t_pk*1000:.0f} ms"
    for name, val, t_pk in (
        ("All", all_peak_val, all_peak_t),
        ("OT", ot_peak_val, ot_peak_t),
        ("FP", fp_peak_val, fp_peak_t),
    )
]
print("Peak (0.15–0.55 s) — " + " | ".join(peak_parts))

In [None]:
# VAN/LP window means + paired tests for Model 3 cumulative curves (might not use)
import os, numpy as np, pandas as pd
from scipy.stats import ttest_rel

# paths and time windows
OUT_DIR = "/work/GrétaHarsányi#3675/Assignment2/2025Neuro/analysis_files/cumulative_fast"
VAN_WIN = (0.150, 0.250)
LP_WIN  = (0.330, 0.550)

CURVE_SUFFIX = "_cumulative_curves_overfit.csv"
# Row label used in the window-means table -> validation-accuracy column of the
# per-participant CSVs. The CSVs hold train_*/val_*/gap_* columns, not acc_*.
ROI_COLUMNS = {"acc_all": "val_all", "acc_OT": "val_OT", "acc_FP": "val_FP"}

files = sorted(f for f in os.listdir(OUT_DIR) if f.endswith(CURVE_SUFFIX))
if not files:
    raise FileNotFoundError(f"No cumulative curve CSVs found in {OUT_DIR}")

# per-participant window means
rows = []
for f in files:
    df = pd.read_csv(os.path.join(OUT_DIR, f))
    # strip the known suffix instead of splitting on "_", so participant ids
    # that contain an underscore stay intact
    pid = f[:-len(CURVE_SUFFIX)]
    in_van = df["time_s"].between(VAN_WIN[0], VAN_WIN[1])   # closed interval
    in_lp  = df["time_s"].between(LP_WIN[0],  LP_WIN[1])
    for roi, val_col in ROI_COLUMNS.items():
        # prefer a column literally named like the label if a CSV provides one
        col = roi if roi in df.columns else val_col
        van_mean = df.loc[in_van, col].mean()
        lp_mean  = df.loc[in_lp,  col].mean()
        rows.append({"pid": pid, "roi": roi, "VAN": van_mean, "LP": lp_mean})

win_means = pd.DataFrame(rows).sort_values(["roi", "pid"]).reset_index(drop=True)

# saving
means_path = os.path.join(OUT_DIR, "cumulative_window_means.csv")
win_means.to_csv(means_path, index=False)
print("✓ Saved per-participant VAN/LP means →", means_path)

# paired tests: VAN vs LP for each ROI set
tests = []
for roi in ("acc_all", "acc_OT", "acc_FP"):
    # participants with both window means available
    m = win_means.loc[win_means["roi"] == roi, ["VAN", "LP"]].dropna()
    van, lp = m["VAN"], m["LP"]
    t, p = ttest_rel(van, lp)
    van_mean, van_sd = van.mean(), van.std(ddof=1)
    lp_mean, lp_sd = lp.mean(), lp.std(ddof=1)
    tests.append({
        "roi": roi,
        "n": int(len(m)),
        "VAN_mean": float(van_mean),
        "VAN_sd":   float(van_sd),
        "LP_mean":  float(lp_mean),
        "LP_sd":    float(lp_sd),
        "t_VAN_vs_LP": float(t),
        "p_VAN_vs_LP": float(p)
    })
    print(f"{roi}: VAN={van_mean:.3f}±{van_sd:.3f} "
          f"vs LP={lp_mean:.3f}±{lp_sd:.3f} | "
          f"t={t:.2f}, p={p:.4f}, n={len(m)}")

tests_df = pd.DataFrame(tests).sort_values("roi")
tests_path = os.path.join(OUT_DIR, "cumulative_window_tests.csv")
tests_df.to_csv(tests_path, index=False)
print("✓ Saved VAN vs LP paired-test summary →", tests_path)



In [None]:
## testing time window means against chance and comparing OT vs FP within each window 


chance = 1/3


def ci95(x):
    """Return the (low, high) bounds of the two-sided 95% t-interval for the mean of ``x``.

    NaN entries are ignored; the interval uses the sample standard deviation
    (``ddof=1``) and ``n - 1`` degrees of freedom.
    """
    vals = np.asarray(x, float)
    vals = vals[~np.isnan(vals)]
    n = len(vals)
    m = vals.mean()
    se = vals.std(ddof=1) / np.sqrt(n)
    half_width = stats.t.ppf(0.975, n - 1) * se
    return m - half_width, m + half_width

# t-tests: per-participant window means against chance, for each window and ROI set
rows_vs_chance = []
for w in ("VAN", "LP"):
    print(f"\n== {w} window ==")
    for roi in ("acc_all", "acc_OT", "acc_FP"):
        is_roi = win_means["roi"] == roi
        vals = win_means.loc[is_roi, w].astype(float).dropna().to_numpy()
        t, p = stats.ttest_1samp(vals, chance)
        lo, hi = ci95(vals)
        m = float(vals.mean())
        n_vals = len(vals)
        print(f"{roi:8s}: mean={m:.3f}  95%CI[{lo:.3f},{hi:.3f}]  t={t:.2f}, p={p:.4f}  (n={n_vals})")
        rows_vs_chance.append(dict(
            window=w, roi=roi, n=n_vals,
            mean=m, ci95_low=lo, ci95_high=hi,
            t=float(t), p=float(p),
        ))

# paired OT vs FP: within each window, compare the two ROI sets across participants
rows_pair = []
for w in ["VAN","LP"]:
    # long -> wide: one row per participant, one column per ROI set
    sub = win_means[win_means["roi"].isin(["acc_OT","acc_FP"])][["pid","roi",w]].dropna()
    # the second dropna keeps only participants that have both an OT and an FP value
    wide = sub.pivot(index="pid", columns="roi", values=w).dropna()
    ot = wide["acc_OT"].values
    fp = wide["acc_FP"].values
    t, p = stats.ttest_rel(ot, fp)
    print(f"OT vs FP ({w}): t={t:.2f}, p={p:.4f}  (n={len(wide)})")
    rows_pair.append({"window": w, "n": len(wide), "t": float(t), "p": float(p)})
