STEP 1: Preprocessing

In [None]:
# Install required libs
!pip install --quiet xgboost shap imbalanced-learn streamlit pyngrok tqdm

# Mount Drive
from google.colab import drive
import os, glob
drive.mount('/content/drive')

# Set dataset path (adjust if your STData sits elsewhere in Drive)
DATA_DIR = "/content/drive/MyDrive/STData/STData"  # <- change if needed

# Quick check
if not os.path.exists(DATA_DIR):
    raise FileNotFoundError(f"DATA_DIR not found: {DATA_DIR}. Check path in Drive.")

subjects = sorted([d for d in os.listdir(DATA_DIR) if d.isdigit()], key=lambda x:int(x))
print(f"Found {len(subjects)} subject folders (examples): {subjects[:6]}")
# show example files in first subject
if subjects:
    print("Example files in subject", subjects[0], ":", sorted(glob.glob(os.path.join(DATA_DIR, subjects[0], "*")) )[:12])

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.1/10.1 MB[0m [31m58.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m28.8 MB/s[0m eta [36m0:00:00[0m
[?25hMounted at /content/drive
Found 38 subject folders (examples): ['1', '2', '3', '4', '5', '6']
Example files in subject 1 : ['/content/drive/MyDrive/STData/STData/1/1_BlankScreenData.csv', '/content/drive/MyDrive/STData/STData/1/1_DLOT.xlsx', '/content/drive/MyDrive/STData/STData/1/1_EEG.csv', '/content/drive/MyDrive/STData/STData/1/1_EYE.csv', '/content/drive/MyDrive/STData/STData/1/1_GSR.csv', '/content/drive/MyDrive/STData/STData/1/1_IVT.csv', '/content/drive/MyDrive/STData/STData/1/1_NSTLX.csv', '/content/drive/MyDrive/STData/STData/1/1_PSY.csv', '/content/drive/MyDrive/STData/STData/1/1_TIVA.csv', '/content/drive/MyDrive/STData/STData/1/1_externalEvents.csv']


In [None]:
# Standard imports
import numpy as np, pandas as pd, os, math, gc
from pathlib import Path
from tqdm import tqdm
import scipy.signal as signal
from collections import Counter

# sklearn / xgboost
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, f1_score, confusion_matrix
import xgboost as xgb
from sklearn.utils.class_weight import compute_class_weight

import joblib
import matplotlib.pyplot as plt

# Hyperparams
# Assumed EEG sample rate in Hz (adjust if your data says otherwise).
FS = 256
# Window length and hop between window starts, in seconds. With equal values
# consecutive windows do not overlap.
WINDOW_SEC = 5
STEP_SEC = 5
# The same two quantities expressed in samples.
WINDOW_SIZE = WINDOW_SEC * FS
STEP_SIZE = STEP_SEC * FS

# Output folders are created up front so later cells can write into them.
OUT_DIR = "data_out"
MODEL_DIR = "models"
os.makedirs(OUT_DIR, exist_ok=True)
os.makedirs(MODEL_DIR, exist_ok=True)

print(
    "Config:",
    {
        "FS": FS,
        "WINDOW_SEC": WINDOW_SEC,
        "WINDOW_SIZE": WINDOW_SIZE,
        "STEP_SEC": STEP_SEC,
        "STEP_SIZE": STEP_SIZE,
    },
)

Config: {'FS': 256, 'WINDOW_SEC': 5, 'WINDOW_SIZE': 1280, 'STEP_SEC': 5, 'STEP_SIZE': 1280}


In [None]:
from pathlib import Path

def read_table(path):
    """Read a csv/txt, xls/xlsx or parquet file into a DataFrame without raising.

    Parameters
    ----------
    path : str or path-like
        File to read. It is converted with ``str()`` and the reader is chosen
        from the lower-cased file extension. Any other extension is attempted
        as CSV with the python parsing engine.

    Returns
    -------
    pandas.DataFrame
        The parsed table. An empty DataFrame is returned when the path does
        not exist or when reading fails; in the latter case the error is
        printed instead of being raised.
    """
    path = str(path)
    if not os.path.exists(path):
        return pd.DataFrame()
    ext = Path(path).suffix.lower()
    try:
        if ext in (".csv", ".txt"):
            # low_memory=False makes pandas infer each column's dtype from the
            # whole file instead of chunk by chunk. Chunked inference can leave
            # a column with mixed Python types (and emits DtypeWarning), which
            # makes later equality filters on that column unreliable.
            return pd.read_csv(path, low_memory=False)
        if ext in (".xls", ".xlsx"):
            return pd.read_excel(path)
        if ext == ".parquet":
            return pd.read_parquet(path)
        # Unknown extension: best-effort attempt with the more tolerant parser.
        return pd.read_csv(path, engine="python")
    except Exception as e:
        # Deliberate best effort: callers treat an empty frame as "no data".
        print(f"Failed to read {path}: {e}")
        return pd.DataFrame()

def find_file_with_prefix(folder, prefix):
    """Return the path of the alphabetically first entry in ``folder`` whose name starts with ``prefix``.

    The prefix is expected to include the subject id and underscore
    (e.g. ``"1_EEG"``). Returns ``None`` when nothing matches.
    """
    matches = [name for name in os.listdir(folder) if name.startswith(prefix)]
    if not matches:
        return None
    # The smallest name is the same entry a sorted list would put first.
    return os.path.join(folder, min(matches))

def window_iter(sig, window_size=WINDOW_SIZE, step=STEP_SIZE):
    """Cut a signal into fixed-length windows.

    ``sig`` is converted to a NumPy array and sliced along its first axis into
    consecutive windows of ``window_size`` samples whose starts are ``step``
    samples apart. A trailing remainder shorter than ``window_size`` is
    dropped. Returns a list of array slices, empty when the signal is shorter
    than one window.
    """
    samples = np.asarray(sig)
    total = len(samples)
    chunks = []
    if total >= window_size:
        last_start = total - window_size
        for start in range(0, last_start + 1, step):
            chunks.append(samples[start:start + window_size])
    return chunks

def eeg_bandpower(sig, fs=FS):
    """Estimate the power of a signal in the five classic EEG bands.

    Parameters
    ----------
    sig : array-like
        Samples of one channel; converted to a float array.
    fs : int
        Sampling rate in Hz passed to the Welch estimator.

    Returns
    -------
    dict
        Keys ``delta``, ``theta``, ``alpha``, ``beta``, ``gamma`` mapped to the
        integral of the Welch power spectral density over that band (float).
        All values are 0.0 when the signal has fewer than 4 samples, and a
        band is 0.0 when no frequency bin falls inside it.
    """
    sig = np.asarray(sig, dtype=float)
    if sig.size < 4:
        return {"delta":0.0,"theta":0.0,"alpha":0.0,"beta":0.0,"gamma":0.0}
    # Segments of at most two seconds, or the whole signal if it is shorter.
    freqs, psd = signal.welch(sig, fs=fs, nperseg=min(len(sig), fs*2))
    # np.trapz is deprecated since NumPy 2.0 in favour of np.trapezoid; use the
    # new name when it exists and fall back to the old one on older NumPy.
    integrate = np.trapezoid if hasattr(np, "trapezoid") else np.trapz
    bands = {"delta":(1,4),"theta":(4,8),"alpha":(8,12),"beta":(12,30),"gamma":(30,45)}
    out = {}
    for name,(lo,hi) in bands.items():
        # Both band edges are inclusive, so neighbouring bands share the bin
        # that sits exactly on their common boundary.
        mask = (freqs>=lo)&(freqs<=hi)
        out[name] = float(integrate(psd[mask], freqs[mask])) if mask.sum() else 0.0
    return out

In [None]:
def extract_eeg_windows(eeg_df):
    """Turn an EEG table into one feature dict per window.

    Columns whose name contains a band name (delta/theta/alpha/beta/gamma) are
    preferred; each yields ``<col>_mean`` and ``<col>_std`` per window. When no
    such column exists, columns that start with ``raw_`` or contain an
    electrode tag are used instead and each yields ``<col>_<band>`` band powers
    computed by ``eeg_bandpower``. Every column is coerced to numeric, has its
    NaNs dropped and is windowed independently; the number of rows returned is
    the smallest window count across the selected columns.

    Returns an empty list for a missing/empty table, when no usable column is
    found, or when any selected column is too short for a single window.
    """
    if eeg_df is None or eeg_df.empty:
        return []

    band_names = ("delta", "theta", "alpha", "beta", "gamma")
    electrode_tags = ("tp9", "af7", "af8", "tp10", "fp1", "fp2", "f3", "f4")

    # Attempt to find band columns first (like Alpha_..). Fallback to RAW channels.
    selected = [col for col in eeg_df.columns
                if any(name in col.lower() for name in band_names)]
    use_band_columns = bool(selected)
    if not use_band_columns:
        selected = [col for col in eeg_df.columns
                    if col.lower().startswith("raw_")
                    or any(tag in col.lower() for tag in electrode_tags)]
        if not selected:
            return []

    # Window every selected column on its own cleaned numeric samples.
    segments = {}
    for col in selected:
        numeric = pd.to_numeric(eeg_df[col], errors="coerce").dropna().values
        segments[col] = window_iter(numeric)

    # Only windows available in every column are emitted.
    n_common = min(len(chunks) for chunks in segments.values())
    if n_common == 0:
        return []

    feature_rows = []
    for idx in range(n_common):
        feats = {}
        for col in selected:
            chunk = segments[col][idx]
            if use_band_columns:
                has_samples = len(chunk) > 0
                feats[f"{col}_mean"] = float(np.mean(chunk)) if has_samples else 0.0
                feats[f"{col}_std"] = float(np.std(chunk)) if has_samples else 0.0
            else:
                for band, power in eeg_bandpower(chunk).items():
                    feats[f"{col}_{band}"] = float(power)
        feature_rows.append(feats)
    return feature_rows

def extract_gsr_windows(gsr_df):
    """Turn a GSR table into one feature dict per window.

    The signal column is the first one whose name contains "conductance",
    "gsr" or "eda" (case-insensitive); if none matches, the first numeric
    column is used. The column is coerced to numeric, NaNs are dropped and the
    result is windowed with ``window_iter``. Each window yields
    ``<col>_mean``, ``<col>_std`` and ``<col>_peaks``, the latter being the
    number of sample-to-sample increases larger than 5% of
    ``max(1.0, max |window|)``.

    Returns an empty list for a missing/empty table or when no column qualifies.
    """
    if gsr_df is None or gsr_df.empty:
        return []

    keywords = ("conductance", "gsr", "eda")
    candidate = next(
        (col for col in gsr_df.columns if any(key in col.lower() for key in keywords)),
        None,
    )
    # fallback to first numeric column if none matched
    if candidate is None:
        numeric_cols = [col for col in gsr_df.columns
                        if pd.api.types.is_numeric_dtype(gsr_df[col])]
        if not numeric_cols:
            return []
        candidate = numeric_cols[0]

    values = pd.to_numeric(gsr_df[candidate], errors="coerce").dropna().values
    features = []
    for chunk in window_iter(values):
        threshold = 0.05 * max(1.0, np.max(np.abs(chunk)))
        features.append({
            f"{candidate}_mean": float(np.mean(chunk)),
            f"{candidate}_std": float(np.std(chunk)),
            f"{candidate}_peaks": int(np.sum(np.diff(chunk) > threshold)),
        })
    return features

def extract_tiva_windows(tiva_df, n_windows):
    """Summarise a TIVA table and repeat the summary for every window.

    All columns except a fixed set of metadata columns are coerced to numeric;
    each contributes ``<col>_mean`` and ``<col>_std`` computed over the whole
    table (0.0 when the column has no numeric values). The same summary is
    copied ``max(1, n_windows)`` times. For a missing/empty table, or one with
    only metadata columns, ``n_windows`` empty dicts are returned instead.
    """
    # drop obvious meta columns but keep AUs/emotion cols
    ignored = {"UnixTime", "Row", "QuestionKey", "Timestamp", "SampleNumber", "Sample_Index"}

    if tiva_df is None or tiva_df.empty:
        return [{} for _ in range(n_windows)]

    kept = [col for col in tiva_df.columns if col not in ignored]
    if not kept:
        return [{} for _ in range(n_windows)]

    summary = {}
    for col in kept:
        numeric = pd.to_numeric(tiva_df[col], errors="coerce").dropna().values
        if numeric.size > 0:
            col_mean = float(numeric.mean())
            col_std = float(numeric.std())
        else:
            col_mean = 0.0
            col_std = 0.0
        summary[f"{col}_mean"] = col_mean
        summary[f"{col}_std"] = col_std

    # broadcast same summary across windows (TIVA often lower-rate)
    copies = max(1, n_windows)
    return [summary.copy() for _ in range(copies)]

In [None]:
def verdict_to_class(v):
    """Map various label formats to 0/1/2. Adjust heuristics as dataset dictates.

    Parameters
    ----------
    v : scalar
        Raw label value (string, number, bool or missing).

    Returns
    -------
    int
        1 for a positive label (known positive token or number > 0),
        2 for a negative label (known negative token or number < 0),
        0 for missing values, zero, and anything unrecognised.
    """
    if pd.isna(v):
        return 0
    s = str(v).strip().lower()
    if s in ("positive", "pos", "+", "1", "true", "yes", "correct"):
        return 1
    if s in ("negative", "neg", "-", "-1", "false", "no", "incorrect"):
        return 2
    # try numeric
    try:
        vi = float(s)
    except ValueError:
        # Only a failed parse means "unrecognised"; a bare except here would
        # also swallow KeyboardInterrupt and SystemExit.
        return 0
    if vi > 0:
        return 1
    if vi < 0:
        return 2
    return 0

def _rows_for_trial(df, qkey):
    """Return the rows of ``df`` whose QuestionKey equals ``qkey``, or ``df`` itself if it has no such column."""
    if "QuestionKey" in df.columns:
        return df[df["QuestionKey"] == qkey]
    return df


def build_windows_for_subject(sid):
    """Build the labelled per-window feature rows for one subject.

    Locates the subject's EEG, GSR, TIVA and PSY files under
    ``DATA_DIR/<sid>``, takes one label per PSY row (trial) and, for each
    trial, extracts windowed EEG/GSR features plus a TIVA summary.

    Parameters
    ----------
    sid : str or int
        Subject id; used both as folder name and as file-name prefix.

    Returns
    -------
    list of dict
        One dict per (trial, window) holding the extracted features and the
        bookkeeping keys ``_subject``, ``_trial``, ``_win`` and ``_y`` (class
        from ``verdict_to_class``). Every trial contributes at least one row,
        even when no signal window could be extracted. The list is empty when
        the PSY file is missing or empty.

    Raises
    ------
    OSError
        Propagated from ``os.listdir`` when the subject folder does not exist.

    Notes
    -----
    A signal table without a ``QuestionKey`` column cannot be split by trial,
    so the whole recording is reused for every trial of the subject.
    """
    subj_dir = os.path.join(DATA_DIR, str(sid))
    # find files by prefix robustly
    eeg_fp = find_file_with_prefix(subj_dir, f"{sid}_EEG")
    gsr_fp = find_file_with_prefix(subj_dir, f"{sid}_GSR")
    tiva_fp = find_file_with_prefix(subj_dir, f"{sid}_TIVA")
    psy_fp = find_file_with_prefix(subj_dir, f"{sid}_PSY")
    # read
    eeg = read_table(eeg_fp) if eeg_fp else pd.DataFrame()
    gsr = read_table(gsr_fp) if gsr_fp else pd.DataFrame()
    tiva = read_table(tiva_fp) if tiva_fp else pd.DataFrame()
    psy = read_table(psy_fp) if psy_fp else pd.DataFrame()

    rows = []
    if psy.empty:
        # no PSY file: cannot extract trial labels -> skip
        print(f"⚠️ subject {sid}: missing or empty PSY ({psy_fp})")
        return rows

    # try to infer trial id column names in PSY
    possible_key_cols = [c for c in psy.columns if c.lower() in ("key","trial","questionkey","question","trialid","question_id")]
    possible_label_cols = [c for c in psy.columns if c.lower() in ("verdict","label","sentiment","score","response","rating","valence")]
    # choose best column
    key_col = possible_key_cols[0] if possible_key_cols else None
    label_col = possible_label_cols[0] if possible_label_cols else None

    if key_col is None:
        # fallback: iterate rows with index as key
        psy["_key_fallback"] = psy.index.astype(str)
        key_col = "_key_fallback"
    if label_col is None:
        # fallback: first numeric column, else the last column. The trial-key
        # column (real or synthesized above) is excluded so that a trial
        # identifier is never interpreted as the label.
        candidates = [c for c in psy.columns if c != key_col]
        numeric = [c for c in candidates if pd.api.types.is_numeric_dtype(psy[c])]
        if numeric:
            label_col = numeric[0]
        elif candidates:
            label_col = candidates[-1]
        else:
            # PSY holds nothing but the key column; keep the previous behaviour.
            label_col = psy.columns[-1]

    for _, trial in psy.iterrows():
        qkey = trial.get(key_col, None)
        label_raw = trial.get(label_col, None)
        label = verdict_to_class(label_raw)

        # filter signals by QuestionKey if present, otherwise use whole file
        eeg_trial = _rows_for_trial(eeg, qkey)
        gsr_trial = _rows_for_trial(gsr, qkey)
        tiva_trial = _rows_for_trial(tiva, qkey)

        # extract windows; at least one row is produced per trial
        eeg_w = extract_eeg_windows(eeg_trial)
        gsr_w = extract_gsr_windows(gsr_trial)
        n_windows = max(len(eeg_w), len(gsr_w), 1)
        tiva_w = extract_tiva_windows(tiva_trial, n_windows)

        # combine windows; a modality with fewer windows simply contributes
        # nothing to the later rows
        for i in range(n_windows):
            r = {}
            if i < len(eeg_w): r.update(eeg_w[i])
            if i < len(gsr_w): r.update(gsr_w[i])
            if i < len(tiva_w): r.update(tiva_w[i])
            r["_subject"] = sid
            r["_trial"] = str(qkey)
            r["_win"] = int(i)
            r["_y"] = int(label)
            rows.append(r)
    return rows

In [None]:
# Build the window-level feature rows for every subject. A failure in one
# subject is reported and skipped so the remaining subjects are still processed.
rows = []
for sid in tqdm(subjects, desc="Building windows"):
    try:
        out = build_windows_for_subject(sid)
        if len(out) == 0:
            # print minimal info but don't spam
            print(f"Subject {sid}: 0 windows")
        rows.extend(out)
    except Exception as e:
        print("Error processing subject", sid, ":", e)

features_windows = pd.DataFrame(rows)
print("Built windows dataset shape:", features_windows.shape)
if not features_windows.empty:
    print("Columns sample:", features_windows.columns.tolist()[:30])
    print("Label distribution:", features_windows["_y"].value_counts().to_dict())
else:
    print("⚠️ No windows extracted. Inspect the prints above for missing PSY or files.")

# Save for later
features_path = os.path.join(OUT_DIR, "features_windows.parquet")
if not features_windows.empty:
    # Features missing for a row (modality absent for that window) become 0.
    # fillna alone leaves +/-inf untouched, so map them to NaN first; otherwise
    # non-finite values would be written to the saved feature table.
    features_windows = features_windows.replace([np.inf, -np.inf], np.nan).fillna(0)
    saved_msg = "Saved features to:"
else:
    saved_msg = "Saved empty dataframe to:"
features_windows.to_parquet(features_path, index=False)
print(saved_msg, features_path)

  return pd.read_csv(path)
  return pd.read_csv(path)
  return pd.read_csv(path)
  x = asanyarray(arr - arrmean)
  return pd.read_csv(path)
  return pd.read_csv(path)
  return pd.read_csv(path)
  return pd.read_csv(path)
  return pd.read_csv(path)
  return pd.read_csv(path)
  return pd.read_csv(path)
  return pd.read_csv(path)
  return pd.read_csv(path)
  return pd.read_csv(path)
  return pd.read_csv(path)
  return pd.read_csv(path)
  return pd.read_csv(path)
  return pd.read_csv(path)
  return pd.read_csv(path)
  return pd.read_csv(path)
  return pd.read_csv(path)
  return pd.read_csv(path)
  return pd.read_csv(path)
  return pd.read_csv(path)
  return pd.read_csv(path)
  return pd.read_csv(path)
  return pd.read_csv(path)
  return pd.read_csv(path)
  return pd.read_csv(path)
  return pd.read_csv(path)
  return pd.read_csv(path)
  return pd.read_csv(path)
  return pd.read_csv(path)
  return pd.read_csv(path)
  return pd.read_csv(path)
  return pd.read_csv(path)
  return pd.read_csv(pa

Built windows dataset shape: (4924, 125)
Columns sample: ['Delta_TP9_mean', 'Delta_TP9_std', 'Delta_AF7_mean', 'Delta_AF7_std', 'Delta_AF8_mean', 'Delta_AF8_std', 'Delta_TP10_mean', 'Delta_TP10_std', 'Theta_TP9_mean', 'Theta_TP9_std', 'Theta_AF7_mean', 'Theta_AF7_std', 'Theta_AF8_mean', 'Theta_AF8_std', 'Theta_TP10_mean', 'Theta_TP10_std', 'Alpha_TP9_mean', 'Alpha_TP9_std', 'Alpha_AF7_mean', 'Alpha_AF7_std', 'Alpha_AF8_mean', 'Alpha_AF8_std', 'Alpha_TP10_mean', 'Alpha_TP10_std', 'Beta_TP9_mean', 'Beta_TP9_std', 'Beta_AF7_mean', 'Beta_AF7_std', 'Beta_AF8_mean', 'Beta_AF8_std']
Label distribution: {1: 3422, 2: 1288, 0: 214}
Saved features to: data_out/features_windows.parquet
