In [4]:
# --- Cell 1: Imports ---
import pandas as pd
import numpy as np
from pathlib import Path

from sklearn.model_selection import train_test_split, GroupKFold, cross_val_predict
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Seed for reproducibility
SEED = 42


In [5]:
# --- Cell 2: Load data using Path references ---
from pathlib import Path

# Define data directory
DATA_DIR = Path("../data")  # adjust relative path as needed

# Define file paths
EEG_FILE = DATA_DIR / "clean_eeg.csv"  # EEG segment-level data
SUMMARY_FILE = DATA_DIR / "challenger_insight_session_summary.csv"  # session summary

# Load data
eeg = pd.read_csv(EEG_FILE)
summary = pd.read_csv(SUMMARY_FILE)

# Quick look at data
display(eeg.head())
display(summary.head())


Unnamed: 0,subject_id,session_id,segment_idx,timestamp,alpha_power,beta_power,theta_power,gamma_power
0,S001,U8KXB4N9,0,2025-04-01 14:00:00,1.5964,0.9354,1.362,0.998
1,S001,U8KXB4N9,1,2025-04-01 14:00:10,0.9799,0.9322,1.4147,0.7717
2,S001,U8KXB4N9,2,2025-04-01 14:00:20,1.4633,0.7045,1.6506,1.0507
3,S001,U8KXB4N9,4,2025-04-01 14:00:40,1.1389,0.8947,0.8671,0.2875
4,S001,U8KXB4N9,6,2025-04-01 14:01:00,1.3752,0.8834,1.1537,0.5461


Unnamed: 0,subject_id,session_id,stimulus_type,task_difficulty,modality,start_time,end_time,n_segments,mean_cog_load,pct_engaged,mean_disengage_risk
0,S001,U8KXB4N9,Discussion,Easy,Live Workshop,2025-04-01 14:00:00,2025-04-01 14:10:10,62,0.29,46.8,0.466
1,S002,U5HKN6BB,Hands-on Lab,Moderate,Async Video,2025-02-10 14:00:00,2025-02-10 14:05:50,36,0.269,66.7,0.347
2,S005,UOWZ6RUE,Lecture,Moderate,Async Video,2025-01-16 14:00:00,2025-01-16 14:06:00,37,0.284,62.2,0.41
3,S006,U56BXK8B,Discussion,Moderate,Self-Paced,2025-02-18 13:00:00,2025-02-18 13:05:10,32,0.27,59.4,0.422
4,S007,U0L23F7L,Code-Along,Moderate,Async Video,2025-03-21 15:00:00,2025-03-21 15:19:00,115,0.28,66.1,0.373


In [6]:
# --- Cell 3: Prepare features and target ---
# For simplicity, use only EEG band powers as features
features = ['alpha_power', 'beta_power', 'theta_power', 'gamma_power']

X = eeg[features]
y = eeg['disengaged']  # target

# Optional: use session_id as groups to avoid leakage
groups = eeg['session_id']


KeyError: 'disengaged'

In [None]:
# --- Cell 4: Split train/test sets ---
X_train, X_test, y_train, y_test, train_groups, test_groups = train_test_split(
    X, y, groups, test_size=0.2, random_state=SEED, stratify=y
)

print(f"Train size: {X_train.shape[0]}, Test size: {X_test.shape[0]}")


Train size: 3300, Test size: 826


In [None]:
# --- Cell 5: Threshold selection and metrics functions ---
def choose_threshold_max_f1(y_true, proba):
    from sklearn.metrics import precision_recall_curve
    p, r, t = precision_recall_curve(y_true, proba)
    if len(t) == 0:
        return 0.5
    f1 = (2 * p * r) / np.clip(p + r, 1e-9, None)
    return float(t[np.nanargmax(f1[:-1])])

def show_confusion_and_explain(model_name, y_true, proba, thr):
    y_pred = (proba >= thr).astype(int)
    cm = confusion_matrix(y_true, y_pred, labels=[1, 0])
    df_cm = pd.DataFrame(
        cm,
        index=["Actual 1 (disengaged)", "Actual 0 (engaged)"],
        columns=["Pred 1 (disengaged)", "Pred 0 (engaged)"]
    )
    display(df_cm)
    
    TP, FN = cm[0, 0], cm[0, 1]
    FP, TN = cm[1, 0], cm[1, 1]
    prec = TP / (TP + FP + 1e-9)
    rec = TP / (TP + FN + 1e-9)
    acc = (TP + TN) / (TP + TN + FP + FN)

    print(f"{model_name} → Precision={prec:.3f}, Recall={rec:.3f}, Accuracy={acc:.3f}")


In [None]:
# --- Cell 6: Train & evaluate function ---
experiment_log = []

def train_and_evaluate(model, name, X_train, X_test, y_train, y_test, groups, scale_features=False):
    cv = GroupKFold(n_splits=5)
    
    if scale_features:
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)
    else:
        X_train_scaled, X_test_scaled = X_train, X_test
    
    # Cross-validated probabilities for threshold selection
    oof = cross_val_predict(
        model, X_train_scaled, y_train, groups=groups,
        cv=cv, method="predict_proba", n_jobs=-1
    )[:, 1]
    
    thr = choose_threshold_max_f1(y_train, oof)
    
    # Fit model
    model.fit(X_train_scaled, y_train)
    proba_test = model.predict_proba(X_test_scaled)[:, 1]
    
    # Metrics
    acc = accuracy_score(y_test, (proba_test >= thr).astype(int))
    prec = precision_score(y_test, (proba_test >= thr).astype(int))
    rec = recall_score(y_test, (proba_test >= thr).astype(int))
    f1 = f1_score(y_test, (proba_test >= thr).astype(int))
    
    print(f"\n{name} Results:")
    print(f"Accuracy: {acc*100:.2f}% | Precision: {prec:.3f} | Recall: {rec:.3f} | F1: {f1:.3f}")
    
    show_confusion_and_explain(name, y_test, proba_test, thr)
    
    experiment_log.append({
        "Model": name,
        "Accuracy": round(acc, 3),
        "Precision": round(prec, 3),
        "Recall": round(rec, 3),
        "F1": round(f1, 3),
        "Thr": round(thr, 3)
    })


In [None]:
# --- Cell 7: Define models ---
lr_model = LogisticRegression(max_iter=5000, class_weight="balanced", random_state=SEED)
dt_model = DecisionTreeClassifier(max_depth=12, min_samples_split=5, class_weight="balanced", random_state=SEED)
rf_model = RandomForestClassifier(
    n_estimators=1000, max_depth=16, min_samples_leaf=2,
    class_weight="balanced", random_state=SEED, n_jobs=-1
)


In [None]:
# --- Cell 8: Train & evaluate all models ---
train_and_evaluate(lr_model, "Logistic Regression", X_train, X_test, y_train, y_test, train_groups, scale_features=True)
train_and_evaluate(dt_model, "Decision Tree", X_train, X_test, y_train, y_test, train_groups)
train_and_evaluate(rf_model, "Random Forest", X_train, X_test, y_train, y_test, train_groups)



Logistic Regression Results:
Accuracy: 52.06% | Precision: 0.513 | Recall: 0.964 | F1: 0.669


Unnamed: 0,Pred 1 (disengaged),Pred 0 (engaged)
Actual 1 (disengaged),401,15
Actual 0 (engaged),381,29


Logistic Regression → Precision=0.513, Recall=0.964, Accuracy=0.521

Decision Tree Results:
Accuracy: 50.36% | Precision: 0.504 | Recall: 1.000 | F1: 0.670


Unnamed: 0,Pred 1 (disengaged),Pred 0 (engaged)
Actual 1 (disengaged),416,0
Actual 0 (engaged),410,0


Decision Tree → Precision=0.504, Recall=1.000, Accuracy=0.504

Random Forest Results:
Accuracy: 50.85% | Precision: 0.506 | Recall: 0.957 | F1: 0.662


Unnamed: 0,Pred 1 (disengaged),Pred 0 (engaged)
Actual 1 (disengaged),398,18
Actual 0 (engaged),388,22


Random Forest → Precision=0.506, Recall=0.957, Accuracy=0.508


In [None]:
# --- Cell 9: Leaderboard ---
df_results = pd.DataFrame(experiment_log)
df_results = df_results.sort_values("Accuracy", ascending=False).reset_index(drop=True)
display(df_results)
print(f"Best by Accuracy: {df_results.iloc[0]['Model']} ({df_results.iloc[0]['Accuracy']*100:.2f}%)")


Unnamed: 0,Model,Accuracy,Precision,Recall,F1,Thr
0,Logistic Regression,0.521,0.513,0.964,0.669,0.329
1,Random Forest,0.508,0.506,0.957,0.662,0.246
2,Decision Tree,0.504,0.504,1.0,0.67,0.0


Best by Accuracy: Logistic Regression (52.10%)


In [None]:
df_results = pd.DataFrame(experiment_log)
df_results = df_results.sort_values("Accuracy", ascending=False).reset_index(drop=True)
display(df_results)
print(f"Best by Accuracy: {df_results.iloc[0]['Model']} ({df_results.iloc[0]['Accuracy']*100:.2f}%)")


Unnamed: 0,Model,Accuracy,Precision,Recall,F1,Thr
0,Logistic Regression,0.521,0.513,0.964,0.669,0.329
1,Random Forest,0.508,0.506,0.957,0.662,0.246
2,Decision Tree,0.504,0.504,1.0,0.67,0.0


Best by Accuracy: Logistic Regression (52.10%)
