STEP 2: Feature Engineering

In [None]:
import pandas as pd, numpy as np, os

# Read the windowed feature table written by the preprocessing step.
fw_path = "data_out/features_windows.parquet"
features_windows = pd.read_parquet(fw_path)

# Sanity summary: table size, the first 15 column names, and the class balance
# of the "_y" label column.
print("Loaded features:", features_windows.shape)

leading_columns = features_windows.columns[:15].tolist()
print("Columns sample:", leading_columns)

label_counts = features_windows["_y"].value_counts().to_dict()
print("Label distribution:", label_counts)

Loaded features: (4924, 125)
Columns sample: ['Delta_TP9_mean', 'Delta_TP9_std', 'Delta_AF7_mean', 'Delta_AF7_std', 'Delta_AF8_mean', 'Delta_AF8_std', 'Delta_TP10_mean', 'Delta_TP10_std', 'Theta_TP9_mean', 'Theta_TP9_std', 'Theta_AF7_mean', 'Theta_AF7_std', 'Theta_AF8_mean', 'Theta_AF8_std', 'Theta_TP10_mean']
Label distribution: {1: 3422, 2: 1288, 0: 214}


In [None]:
def add_eeg_features(df):
    """Add two derived EEG columns to ``df`` (modified in place) and return it.

    Columns written:

    * ``eeg_frontal_asymmetry`` -- ``log1p(right / (left + 1e-6))`` of the
      frontal alpha means. The AF8/AF7 pair is used when both columns exist,
      otherwise the F4/F3 pair, otherwise the constant ``0.0``.
      Note this is ``log1p`` of the ratio, not a plain ``log``.
    * ``eeg_engagement_index`` -- row-wise mean of the ``*Beta*_mean`` columns
      divided by (mean of ``*Alpha*_mean`` + mean of ``*Theta*_mean`` + 1e-6).
      Set to ``0.0`` when any of the three band groups has no matching column.

    Rows whose result is not finite are stored as NaN instead of +/-inf, and
    no NumPy RuntimeWarning is emitted for them. This happens when the ratio
    is <= -1 (``log1p`` undefined / -inf) or when a denominator cancels the
    1e-6 epsilon exactly. Keeping them as NaN lets a later ``fillna`` treat
    them like any other missing value; ``fillna`` does not touch +/-inf.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature table with per-band ``<Band>_<Channel>_mean`` columns.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the two columns added or overwritten.
    """
    eps = 1e-6  # keeps an exactly-zero denominator from dividing by zero

    def _finite_or_nan(values):
        # Map +/-inf to NaN; finite values and existing NaN pass through.
        return values.replace([np.inf, -np.inf], np.nan)

    # (right, left) frontal alpha column pairs, in order of preference.
    asymmetry_pairs = (
        ("Alpha_AF8_mean", "Alpha_AF7_mean"),
        ("Alpha_F4_mean", "Alpha_F3_mean"),
    )
    asymmetry = None
    for right_col, left_col in asymmetry_pairs:
        if right_col in df and left_col in df:
            ratio = df[right_col] / (df[left_col] + eps)
            # NOTE(review): the inputs can evidently be negative (the original
            # run emitted a ufunc RuntimeWarning here), so log1p is undefined
            # for some rows -- confirm log1p-of-ratio is the intended formula.
            with np.errstate(divide="ignore", invalid="ignore"):
                asymmetry = _finite_or_nan(np.log1p(ratio))
            break
    df["eeg_frontal_asymmetry"] = 0.0 if asymmetry is None else asymmetry

    # Engagement index = Beta / (Alpha + Theta), each averaged over channels.
    def _band_mean_columns(band):
        return [c for c in df.columns if band in c and c.endswith("_mean")]

    beta_cols = _band_mean_columns("Beta")
    alpha_cols = _band_mean_columns("Alpha")
    theta_cols = _band_mean_columns("Theta")

    if beta_cols and alpha_cols and theta_cols:
        denominator = df[alpha_cols].mean(axis=1) + df[theta_cols].mean(axis=1) + eps
        df["eeg_engagement_index"] = _finite_or_nan(df[beta_cols].mean(axis=1) / denominator)
    else:
        df["eeg_engagement_index"] = 0.0

    return df

# Run the EEG feature step on the windowed table and rebind the result.
features_windows = features_windows.pipe(add_eeg_features)
print("Added EEG features")

Added EEG features


  result = getattr(ufunc, method)(*inputs, **kwargs)


In [None]:
from scipy.stats import linregress
import numpy as np

def add_gsr_features(df, scale=0.01):
    """Add a ``gsr_slope`` column to ``df`` (modified in place) and return it.

    The source column is the first column whose lower-cased name contains
    ``"gsr"`` or ``"conductance"`` and whose name ends with ``"_mean"``.
    ``gsr_slope`` is that column multiplied by ``scale``; when no such column
    exists it is the constant ``0.0``.

    NOTE: despite its name this is not a fitted slope. No per-window time
    series is available here, so the value is only a rescaled copy of the
    window mean and carries no information beyond the source column.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature table, one row per window.
    scale : float, default 0.01
        Multiplier applied to the source column.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with ``gsr_slope`` added or overwritten.
    """
    gsr_mean_cols = [
        c for c in df.columns
        if ("gsr" in c.lower() or "conductance" in c.lower()) and c.endswith("_mean")
    ]

    if not gsr_mean_cols:
        df["gsr_slope"] = 0.0
        return df

    source_col = gsr_mean_cols[0]  # use the first GSR mean column
    # Vectorised multiply; the float64 array is assigned positionally.
    df["gsr_slope"] = df[source_col].to_numpy(dtype="float64", na_value=np.nan) * scale

    return df

# Run the GSR feature step and report the resulting table size.
features_windows = features_windows.pipe(add_gsr_features)
print("Added GSR features. Current shape:", features_windows.shape)

Added GSR features. Current shape: (4924, 128)


In [None]:
from scipy.stats import linregress
import numpy as np

def add_gsr_features(df):
    """Write a ``gsr_slope`` column into ``df`` and return the same frame.

    This redefines the function of the same name from the previous cell.

    The first column whose lower-cased name mentions "gsr" or "conductance"
    and whose name ends in "_mean" is taken as the source; ``gsr_slope`` is
    each of its values times 0.01 (a simple proxy, not a fitted slope).
    Without such a column, ``gsr_slope`` is the constant 0.0.
    """
    candidates = []
    for name in df.columns:
        lowered = name.lower()
        if "gsr" in lowered or "conductance" in lowered:
            if name.endswith("_mean"):
                candidates.append(name)

    if candidates:
        source = candidates[0]
        # Element-wise scaling of the chosen mean column.
        df["gsr_slope"] = [reading * 0.01 for reading in df[source].values]
    else:
        df["gsr_slope"] = 0.0

    return df

# Apply the GSR step again with the definition from this cell; it overwrites
# the existing gsr_slope column, then prints the table size.
features_windows = features_windows.pipe(add_gsr_features)
print("Added GSR features. Current shape:", features_windows.shape)

Added GSR features. Current shape: (4924, 128)


In [None]:
def add_tiva_features(df):
    """Add facial action-unit proxy columns to ``df`` (in place) and return it.

    Columns written:

    * ``tiva_valence_proxy`` -- first AU12 column minus first AU4 column, or
      ``0.0`` when either action unit has no matching column.
    * ``tiva_arousal_proxy`` -- first AU1 column, or ``0.0`` when none exists.

    Action-unit columns are matched by number, not by substring. ``AU1``
    matches ``AU1_mean`` or ``AU01_r`` but not ``AU10``/``AU12``/``AU14``, and
    ``AU4`` does not match ``AU45``. A plain ``"AU1" in name`` test would let
    the arousal proxy be taken from whichever AU1x column came first.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature table, one row per window.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the two columns added or overwritten.
    """
    import re  # local import so this cell does not rely on a top-level one

    def _au_columns(number):
        # "AU", optional zero padding, the AU number, then no further digit.
        pattern = re.compile(rf"AU0*{number}(?!\d)")
        return [c for c in df.columns if pattern.search(str(c))]

    au12 = _au_columns(12)
    au4 = _au_columns(4)
    au1 = _au_columns(1)

    if au12 and au4:
        df["tiva_valence_proxy"] = df[au12[0]] - df[au4[0]]
    else:
        df["tiva_valence_proxy"] = 0.0

    df["tiva_arousal_proxy"] = df[au1[0]] if au1 else 0.0

    return df

# Run the facial action-unit feature step and rebind the result.
features_windows = features_windows.pipe(add_tiva_features)
print("Added TIVA features")

Added TIVA features


In [None]:
# Fill missing values.
# fillna() only replaces NaN/None, so +/-inf is mapped to NaN first. The ratio
# and log1p features computed above can produce +/-inf, which would otherwise
# be written to the output file unchanged.
# NOTE(review): the fill value 0 is applied to every column, including the
# "_y" label column where 0 is also a class value -- confirm "_y" has no
# missing entries.
features_windows = features_windows.replace([np.inf, -np.inf], np.nan).fillna(0)

# Save engineered dataset
out_path = "data_out/features_engineered.parquet"
features_windows.to_parquet(out_path, index=False)

print("Final engineered dataset shape:", features_windows.shape)
print("Saved to:", out_path)

Final engineered dataset shape: (4924, 130)
Saved to: data_out/features_engineered.parquet
