In [14]:
# imports
import warnings
import numpy as np
import pandas as pd
from pathlib import Path
warnings.filterwarnings("ignore")

# sklearn imports
from sklearn.model_selection import GroupShuffleSplit, GroupKFold, cross_val_predict
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, 
    confusion_matrix, precision_recall_curve
)

# Reproducibility: one seed, applied to NumPy's global RNG here and reused
# as random_state further down.
SEED = 27
np.random.seed(SEED)

# Input locations, relative to the notebook's working directory.
DATA_DIR = Path("..") / "data"
EEG_FILE = DATA_DIR.joinpath("clean_eeg.csv")
SUMMARY_FILE = DATA_DIR.joinpath("challenger_insight_session_summary.csv")



In [15]:
# Read the EEG table and the per-session summary table from disk.
eeg, summary = (pd.read_csv(csv_path) for csv_path in (EEG_FILE, SUMMARY_FILE))

# Normalise headers (trim surrounding whitespace, lower-case) so that later
# column lookups do not depend on how the CSV headers were typed.
for frame in (eeg, summary):
    frame.columns = frame.columns.str.strip().str.lower()

print("EEG shape:", eeg.shape)
print("Summary shape:", summary.shape)
print("\nSummary columns:", list(summary.columns))

EEG shape: (4126, 8)
Summary shape: (76, 11)

Summary columns: ['subject_id', 'session_id', 'stimulus_type', 'task_difficulty', 'modality', 'start_time', 'end_time', 'n_segments', 'mean_cog_load', 'pct_engaged', 'mean_disengage_risk']


In [16]:

# CREATE LABELS USING PCT_ENGAGED
#
# A session is labelled disengaged (1) when its pct_engaged is strictly below
# the median pct_engaged over all rows of `summary`; otherwise it is engaged (0).
# The threshold is recomputed from the loaded data rather than hard-coded.
ENGAGEMENT_THRESHOLD = summary["pct_engaged"].median()
summary["disengaged"] = summary["pct_engaged"].lt(ENGAGEMENT_THRESHOLD).astype(int)

print(f"\nThreshold: pct_engaged < {ENGAGEMENT_THRESHOLD:.1f}% = disengaged")
print("\nClass distribution:")
# Absolute counts first, then the same breakdown as proportions.
for as_fraction in (False, True):
    print(summary["disengaged"].value_counts(normalize=as_fraction).sort_index())

# Preview of the labelled sessions next to their context columns.
print("\nExample sessions:")
display(
    summary.head(10)[
        ["subject_id", "session_id", "pct_engaged", "disengaged",
         "stimulus_type", "task_difficulty", "modality"]
    ]
)


LABELING USING PCT_ENGAGED (MEDIAN THRESHOLD)

Threshold: pct_engaged < 61.2% = disengaged

Class distribution:
disengaged
0    39
1    37
Name: count, dtype: int64
disengaged
0    0.513158
1    0.486842
Name: proportion, dtype: float64

Example sessions:


Unnamed: 0,subject_id,session_id,pct_engaged,disengaged,stimulus_type,task_difficulty,modality
0,S001,U8KXB4N9,46.8,1,Discussion,Easy,Live Workshop
1,S002,U5HKN6BB,66.7,0,Hands-on Lab,Moderate,Async Video
2,S005,UOWZ6RUE,62.2,0,Lecture,Moderate,Async Video
3,S006,U56BXK8B,59.4,1,Discussion,Moderate,Self-Paced
4,S007,U0L23F7L,66.1,0,Code-Along,Moderate,Async Video
5,S007,U3KXUTRO,57.1,1,Case Study,Moderate,Async Video
6,S008,UBQGOMTZ,73.0,0,Hands-on Lab,Moderate,Self-Paced
7,S010,U7YFF1W7,68.8,0,Code-Along,Moderate,Live Workshop
8,S011,UOVQNW8D,70.0,0,Lecture,Easy,Live Workshop
9,S011,UUEA40RV,69.1,0,Code-Along,Moderate,Self-Paced


In [17]:
# Attach each session's label and context columns to its EEG rows.
# validate="many_to_one" makes pandas raise if a (subject_id, session_id) pair
# appears more than once in `summary`.
df = (
    pd.merge(
        eeg,
        summary[["subject_id", "session_id", "disengaged",
                 "stimulus_type", "task_difficulty", "modality"]],
        how="left",
        on=["subject_id", "session_id"],
        validate="many_to_one",
    )
    # Rows without a matching summary entry have no label after the left join.
    .dropna(subset=["disengaged"])
    # The left join can upcast the label to float; restore the integer dtype.
    .astype({"disengaged": int})
)

print(f"\nMerged dataset shape: {df.shape}")
print(f"Class balance: {df['disengaged'].value_counts(normalize=True).to_dict()}")
print("\nCategorical feature values:")
for column in ("stimulus_type", "task_difficulty", "modality"):
    print(f"  {column}: {df[column].unique()}")


Merged dataset shape: (4126, 12)
Class balance: {0: 0.5004847309743092, 1: 0.49951526902569077}

Categorical feature values:
  stimulus_type: ['Discussion' 'Hands-on Lab' 'Lecture' 'Code-Along' 'Case Study' 'Quiz']
  task_difficulty: ['Easy' 'Moderate' 'Hard']
  modality: ['Live Workshop' 'Async Video' 'Self-Paced' 'Blended']


In [18]:
# Column roles used by every modelling cell below.
TARGET = "disengaged"   # binary label column
GROUPS = "session_id"   # grouping key for the group-aware splitters

# Numeric EEG band-power inputs.
eeg_features = ["alpha_power", "beta_power", "theta_power", "gamma_power"]

# Session-level context inputs (categorical).
categorical_features = ["stimulus_type", "task_difficulty", "modality"]

# Full model input: numeric columns first, then categorical ones.
all_features = [*eeg_features, *categorical_features]

print(f"\nEEG Features ({len(eeg_features)}): {eeg_features}")
print(f"Categorical Features ({len(categorical_features)}): {categorical_features}")
print(f"Total Features: {len(all_features)}")
print(f"Target: {TARGET}")
print(f"Groups: {GROUPS}")


EEG Features (4): ['alpha_power', 'beta_power', 'theta_power', 'gamma_power']
Categorical Features (3): ['stimulus_type', 'task_difficulty', 'modality']
Total Features: 7
Target: disengaged
Groups: session_id


In [19]:
# Train and test, group based
#
# All rows sharing a session_id go to the same side of the split, so the test
# score measures generalisation to sessions the model has never seen.
gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=SEED)
groups_all = df[GROUPS]
train_idx, test_idx = next(gss.split(df[all_features], df[TARGET], groups_all))

# The splitter returns *positional* row indices, so select with .iloc.
# Label-based .loc only gives the same rows while df has a gap-free 0..n-1
# index, which stops being true as soon as the earlier dropna removes a row.
X_train = df[all_features].iloc[train_idx].reset_index(drop=True)
X_test = df[all_features].iloc[test_idx].reset_index(drop=True)
y_train = df[TARGET].iloc[train_idx].reset_index(drop=True)
y_test = df[TARGET].iloc[test_idx].reset_index(drop=True)
groups_train = groups_all.iloc[train_idx].reset_index(drop=True)

# Sanity check: no session may appear on both sides of the split.
assert set(groups_train).isdisjoint(groups_all.iloc[test_idx]), \
    "a session_id appears in both train and test"

print(f"\nTrain size: {len(X_train)} segments")
print(f"Test size: {len(X_test)} segments")
print(f"Train class ratio (1s): {y_train.mean():.3f}")
print(f"Test class ratio (1s): {y_test.mean():.3f}")


Train size: 3244 segments
Test size: 882 segments
Train class ratio (1s): 0.482
Test class ratio (1s): 0.562


In [20]:



# Preprocessor that handles numeric and categorical features separately:
#   - EEG band powers are standardised;
#   - context columns are one-hot encoded, dropping the first level of each.
# Columns not listed in either transformer are dropped (ColumnTransformer default).
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), eeg_features),
        (
            "cat",
            # handle_unknown="ignore": with group-wise CV on a small number of
            # sessions, a rare category can be absent from a training fold yet
            # present in the validation fold or test split. Without this the
            # encoder raises at transform time; with it the unseen category is
            # encoded as all zeros (i.e. like the dropped reference level).
            OneHotEncoder(drop="first", handle_unknown="ignore", sparse_output=False),
            categorical_features,
        ),
    ]
)

print("\nPreprocessing pipeline created!")
print("   - Numeric features: StandardScaler")
print("   - Categorical features: OneHotEncoder")


Preprocessing pipeline created!
   - Numeric features: StandardScaler
   - Categorical features: OneHotEncoder


In [21]:

# HELPER FUNCTIONS


def choose_threshold_max_acc(y_true, proba, n_thresholds=100):
    """Return the probability cut-off in [0, 1] that maximises accuracy.

    Parameters
    ----------
    y_true : array-like of shape (n_samples,)
        Binary ground-truth labels (0 / 1).
    proba : array-like of shape (n_samples,)
        Scores for class 1. A sample is predicted 1 when ``proba >= threshold``.
    n_thresholds : int, default=100
        Number of evenly spaced candidate thresholds on [0, 1], both endpoints
        included.

    Returns
    -------
    float
        The candidate with the highest accuracy. When several candidates tie,
        the lowest one is returned.

    Raises
    ------
    ValueError
        If the inputs are empty or their lengths differ.
    """
    # Coerce so that plain lists and pandas Series work as well as ndarrays.
    labels = np.asarray(y_true).ravel()
    scores = np.asarray(proba, dtype=float).ravel()
    if labels.size == 0 or labels.size != scores.size:
        raise ValueError(
            "y_true and proba must be non-empty and of equal length "
            f"(got {labels.size} and {scores.size})"
        )

    thresholds = np.linspace(0, 1, n_thresholds)
    # One broadcast comparison: row i holds the predictions at thresholds[i].
    predictions = (scores[None, :] >= thresholds[:, None]).astype(int)
    accuracies = (predictions == labels[None, :]).mean(axis=1)
    # argmax returns the first maximum, i.e. the lowest tied threshold.
    return float(thresholds[np.argmax(accuracies)])

def show_confusion_matrix(model_name, y_true, proba, thr):
    """Render the thresholded confusion matrix and print summary metrics.

    Scores in ``proba`` are turned into 0/1 predictions with ``proba >= thr``.
    The 2x2 matrix is displayed with the positive class (1 = disengaged) in the
    first row and column, followed by one line reporting precision, recall and
    accuracy for ``model_name``.
    """
    predicted = (proba >= thr).astype(int)
    # labels=[1, 0] orders rows/columns as: disengaged first, engaged second.
    matrix = confusion_matrix(y_true, predicted, labels=[1, 0])
    display(
        pd.DataFrame(
            matrix,
            index=["Actual 1 (disengaged)", "Actual 0 (engaged)"],
            columns=["Pred 1 (disengaged)", "Pred 0 (engaged)"],
        )
    )

    # Row-major unpacking of [[TP, FN], [FP, TN]].
    true_pos, false_neg, false_pos, true_neg = matrix.ravel()
    # The 1e-9 term keeps the ratio finite when a denominator would be zero.
    precision = true_pos / (true_pos + false_pos + 1e-9)
    recall = true_pos / (true_pos + false_neg + 1e-9)
    accuracy = (true_pos + true_neg) / matrix.sum()

    print(f"{model_name} → Precision={precision:.3f}, Recall={recall:.3f}, Accuracy={accuracy:.3f}")

# Shared results log: train_and_evaluate() appends one metrics dict per model.
# Re-running this cell resets the log to empty.
experiment_log = []



In [22]:

# TRAIN AND EVALUATE FUNCTION


def train_and_evaluate(model, name, X_train, X_test, y_train, y_test, groups):
    """Fit ``model`` behind the shared preprocessor and score it on the test split.

    Steps:
      1. Wrap the module-level ``preprocessor`` and ``model`` in a Pipeline.
      2. Get out-of-fold class-1 probabilities on the training data with a
         5-fold GroupKFold (folds are formed from ``groups``).
      3. Choose the decision threshold that maximises accuracy on those
         out-of-fold probabilities.
      4. Refit on the full training data, apply the threshold to the test
         probabilities, then print the metrics and the confusion matrix.
      5. Append the rounded metrics to the module-level ``experiment_log``.

    Returns the pipeline fitted on the full training data.
    """
    pipeline = Pipeline(steps=[("preprocessor", preprocessor), ("classifier", model)])

    # Out-of-fold probabilities: each training row is scored by a model that
    # never saw any row from the same group.
    oof_proba = cross_val_predict(
        pipeline,
        X_train,
        y_train,
        groups=groups,
        cv=GroupKFold(5),
        method="predict_proba",
        n_jobs=-1,
    )
    threshold = choose_threshold_max_acc(y_train, oof_proba[:, 1])

    # Final fit on all training rows; the threshold is carried over unchanged.
    pipeline.fit(X_train, y_train)
    test_proba = pipeline.predict_proba(X_test)[:, 1]
    test_pred = (test_proba >= threshold).astype(int)

    scores = {
        "Accuracy": accuracy_score(y_test, test_pred),
        "Precision": precision_score(y_test, test_pred, zero_division=0),
        "Recall": recall_score(y_test, test_pred, zero_division=0),
        "F1": f1_score(y_test, test_pred, zero_division=0),
    }

    print(f"\n{name} Results:")
    print(
        f"Accuracy: {scores['Accuracy']*100:.2f}% | "
        f"Precision: {scores['Precision']:.3f} | "
        f"Recall: {scores['Recall']:.3f} | "
        f"F1: {scores['F1']:.3f}"
    )
    show_confusion_matrix(name, y_test, test_proba, threshold)

    # One row per model: rounded metrics plus the threshold that was applied.
    experiment_log.append({
        "Model": name,
        **{metric: round(value, 3) for metric, value in scores.items()},
        "Thr": round(threshold, 3),
    })

    return pipeline


In [23]:

# DEFINE MODELS WITH TUNED HYPERPARAMETERS
#
# Candidate classifiers keyed by display name. The "(Tuned)" tree and forest
# variants are deeper and allow smaller leaves than their baseline versions.
# Every estimator is seeded with SEED.
models = dict([
    (
        "Logistic Regression",
        LogisticRegression(max_iter=5000, random_state=SEED),
    ),
    (
        "Decision Tree",
        DecisionTreeClassifier(
            max_depth=10, min_samples_split=5, min_samples_leaf=2, random_state=SEED
        ),
    ),
    (
        "Decision Tree (Tuned)",
        DecisionTreeClassifier(
            max_depth=15, min_samples_split=3, min_samples_leaf=1, random_state=SEED
        ),
    ),
    (
        "Random Forest",
        RandomForestClassifier(
            n_estimators=200, max_depth=10, min_samples_leaf=2,
            random_state=SEED, n_jobs=-1,
        ),
    ),
    (
        "Random Forest (Tuned)",
        RandomForestClassifier(
            n_estimators=300, max_depth=15, min_samples_leaf=1,
            random_state=SEED, n_jobs=-1,
        ),
    ),
])

print("\n✅ Models defined:")
print("\n".join(f"   - {label}" for label in models))


✅ Models defined:
   - Logistic Regression
   - Decision Tree
   - Decision Tree (Tuned)
   - Random Forest
   - Random Forest (Tuned)


In [24]:

# TRAIN ALL MODELS


print("\n" + "="*60)
print("TRAINING MODELS WITH EEG + CATEGORICAL FEATURES")
print("="*60)

# train_and_evaluate() appends to the shared experiment_log, so re-running this
# cell would otherwise add a second copy of every row. Drop earlier rows for the
# models about to be trained; rows logged under other names are left untouched.
experiment_log[:] = [row for row in experiment_log if row.get("Model") not in models]

# Fitted pipelines keyed by model name.
trained_models = {}
for name, model in models.items():
    print(f"\n{'='*60}")
    print(f"Training: {name}")
    print(f"{'='*60}")
    trained_models[name] = train_and_evaluate(
        model, name, X_train, X_test, y_train, y_test, groups_train
    )


TRAINING MODELS WITH EEG + CATEGORICAL FEATURES

Training: Logistic Regression

Logistic Regression Results:
Accuracy: 62.24% | Precision: 0.625 | Recall: 0.823 | F1: 0.710


Unnamed: 0,Pred 1 (disengaged),Pred 0 (engaged)
Actual 1 (disengaged),408,88
Actual 0 (engaged),245,141


Logistic Regression → Precision=0.625, Recall=0.823, Accuracy=0.622

Training: Decision Tree

Decision Tree Results:
Accuracy: 63.49% | Precision: 0.815 | Recall: 0.454 | F1: 0.583


Unnamed: 0,Pred 1 (disengaged),Pred 0 (engaged)
Actual 1 (disengaged),225,271
Actual 0 (engaged),51,335


Decision Tree → Precision=0.815, Recall=0.454, Accuracy=0.635

Training: Decision Tree (Tuned)

Decision Tree (Tuned) Results:
Accuracy: 66.67% | Precision: 0.801 | Recall: 0.542 | F1: 0.647


Unnamed: 0,Pred 1 (disengaged),Pred 0 (engaged)
Actual 1 (disengaged),269,227
Actual 0 (engaged),67,319


Decision Tree (Tuned) → Precision=0.801, Recall=0.542, Accuracy=0.667

Training: Random Forest

Random Forest Results:
Accuracy: 50.23% | Precision: 0.535 | Recall: 0.881 | F1: 0.666


Unnamed: 0,Pred 1 (disengaged),Pred 0 (engaged)
Actual 1 (disengaged),437,59
Actual 0 (engaged),380,6


Random Forest → Precision=0.535, Recall=0.881, Accuracy=0.502

Training: Random Forest (Tuned)

Random Forest (Tuned) Results:
Accuracy: 49.32% | Precision: 0.530 | Recall: 0.871 | F1: 0.659


Unnamed: 0,Pred 1 (disengaged),Pred 0 (engaged)
Actual 1 (disengaged),432,64
Actual 0 (engaged),383,3


Random Forest (Tuned) → Precision=0.530, Recall=0.871, Accuracy=0.493


In [25]:

# FINAL RESULTS




# Fail with a clear message instead of an opaque KeyError on the "F1" column.
if not experiment_log:
    raise RuntimeError(
        "experiment_log is empty - run the training cell before summarising results"
    )

# Leaderboard of all logged runs, best F1 first.
df_results = (
    pd.DataFrame(experiment_log)
    .sort_values("F1", ascending=False)
    .reset_index(drop=True)
)
display(df_results)

# NOTE: the logged metrics were computed on the held-out test split, so the
# "best" row below is chosen using test data and is an optimistic estimate.
best = df_results.iloc[0]
print(f"\n Best Model: {best['Model']}")
# Accuracy is logged rounded to 3 decimals, so one decimal of a percent is all
# the precision available here (two decimals would show a spurious trailing 0).
print(f"   Accuracy: {best['Accuracy']:.1%}")
print(f"   F1 Score: {best['F1']:.3f}")
print(f"   Precision: {best['Precision']:.3f}")
print(f"   Recall: {best['Recall']:.3f}")
print(f"   Threshold: {best['Thr']:.3f}")

Unnamed: 0,Model,Accuracy,Precision,Recall,F1,Thr
0,Logistic Regression,0.622,0.625,0.823,0.71,0.212
1,Random Forest,0.502,0.535,0.881,0.666,0.101
2,Random Forest (Tuned),0.493,0.53,0.871,0.659,0.051
3,Decision Tree (Tuned),0.667,0.801,0.542,0.647,0.909
4,Decision Tree,0.635,0.815,0.454,0.583,0.97



 Best Model: Logistic Regression
   Accuracy: 62.20%
   F1 Score: 0.710
   Precision: 0.625
   Recall: 0.823
   Threshold: 0.212
