In [3]:
#imports
import pandas as pd
import numpy as np
from pathlib import Path
import matplotlib.pyplot as plt

# Base directory for every input/output file used in this notebook
DATA = Path("..") / "data"

In [4]:
# Load the cleaned per-segment EEG table and the per-session summary table
eeg = pd.read_csv(DATA / "clean_eeg.csv")
summary = pd.read_csv(DATA / "challenger_insight_session_summary.csv")

print("EEG shape:", eeg.shape)
print("Summary shape:", summary.shape)

# display() renders each preview as its own table; a bare `a, b` tuple falls
# back to a plain-text repr that wraps and truncates wide frames.
display(eeg.head(), summary.head())

EEG shape: (4126, 8)
Summary shape: (76, 11)


(  subject_id session_id  segment_idx            timestamp  alpha_power  \
 0       S001   U8KXB4N9            0  2025-04-01 14:00:00       1.5964   
 1       S001   U8KXB4N9            1  2025-04-01 14:00:10       0.9799   
 2       S001   U8KXB4N9            2  2025-04-01 14:00:20       1.4633   
 3       S001   U8KXB4N9            4  2025-04-01 14:00:40       1.1389   
 4       S001   U8KXB4N9            6  2025-04-01 14:01:00       1.3752   
 
    beta_power  theta_power  gamma_power  
 0      0.9354       1.3620       0.9980  
 1      0.9322       1.4147       0.7717  
 2      0.7045       1.6506       1.0507  
 3      0.8947       0.8671       0.2875  
 4      0.8834       1.1537       0.5461  ,
   subject_id session_id stimulus_type task_difficulty       modality  \
 0       S001   U8KXB4N9    Discussion            Easy  Live Workshop   
 1       S002   U5HKN6BB  Hands-on Lab        Moderate    Async Video   
 2       S005   UOWZ6RUE       Lecture        Moderate    Async Video 

In [5]:
# Binary label: a session is "disengaged" (1) when its mean disengage risk is
# strictly above the median over all sessions, otherwise 0.
threshold = summary["mean_disengage_risk"].median()
is_above_median = summary["mean_disengage_risk"].gt(threshold)
summary["disengaged"] = is_above_median.astype(int)

print("Threshold used:", round(threshold, 3))
class_share = summary["disengaged"].value_counts(normalize=True).rename("class_share")
print(class_share)
summary.head()

Threshold used: 0.388
disengaged
1    0.5
0    0.5
Name: class_share, dtype: float64


Unnamed: 0,subject_id,session_id,stimulus_type,task_difficulty,modality,start_time,end_time,n_segments,mean_cog_load,pct_engaged,mean_disengage_risk,disengaged
0,S001,U8KXB4N9,Discussion,Easy,Live Workshop,2025-04-01 14:00:00,2025-04-01 14:10:10,62,0.29,46.8,0.466,1
1,S002,U5HKN6BB,Hands-on Lab,Moderate,Async Video,2025-02-10 14:00:00,2025-02-10 14:05:50,36,0.269,66.7,0.347,0
2,S005,UOWZ6RUE,Lecture,Moderate,Async Video,2025-01-16 14:00:00,2025-01-16 14:06:00,37,0.284,62.2,0.41,1
3,S006,U56BXK8B,Discussion,Moderate,Self-Paced,2025-02-18 13:00:00,2025-02-18 13:05:10,32,0.27,59.4,0.422,1
4,S007,U0L23F7L,Code-Along,Moderate,Async Video,2025-03-21 15:00:00,2025-03-21 15:19:00,115,0.28,66.1,0.373,0


In [6]:
# Persist the session table, now carrying the `disengaged` label, alongside the inputs
out_sessions = DATA.joinpath("session_summary_labeled.csv")
summary.to_csv(path_or_buf=out_sessions, index=False)
out_sessions

PosixPath('../data/session_summary_labeled.csv')

In [7]:
# Normalize column names (trim whitespace, lowercase) so both frames use the same keys
eeg.columns = eeg.columns.str.strip().str.lower()
summary.columns = summary.columns.str.strip().str.lower()

# Attach the session-level label to every EEG segment.
# validate="many_to_one" makes pandas raise if a (subject_id, session_id) pair
# appears more than once in the summary table.
segments_labeled = eeg.merge(
    summary[["subject_id", "session_id", "disengaged"]],
    on=["subject_id", "session_id"],
    how="left",
    validate="many_to_one",
)

# A left merge keeps segments that have no matching session and gives them a
# NaN label; report that here instead of failing later on the integer cast.
n_unlabeled = int(segments_labeled["disengaged"].isna().sum())
if n_unlabeled:
    raise ValueError(f"{n_unlabeled} EEG segments have no matching session label")

print("Segments labeled shape:", segments_labeled.shape)
segments_labeled.head()

Segments labeleed shape: (4126, 9)


Unnamed: 0,subject_id,session_id,segment_idx,timestamp,alpha_power,beta_power,theta_power,gamma_power,disengaged
0,S001,U8KXB4N9,0,2025-04-01 14:00:00,1.5964,0.9354,1.362,0.998,1
1,S001,U8KXB4N9,1,2025-04-01 14:00:10,0.9799,0.9322,1.4147,0.7717,1
2,S001,U8KXB4N9,2,2025-04-01 14:00:20,1.4633,0.7045,1.6506,1.0507,1
3,S001,U8KXB4N9,4,2025-04-01 14:00:40,1.1389,0.8947,0.8671,0.2875,1
4,S001,U8KXB4N9,6,2025-04-01 14:01:00,1.3752,0.8834,1.1537,0.5461,1


In [8]:
# Persist the segment-level table with its inherited session label
out_segments = DATA.joinpath("eeg_segments_labeled.csv")
segments_labeled.to_csv(path_or_buf=out_segments, index=False)
out_segments


PosixPath('../data/eeg_segments_labeled.csv')

In [9]:
# Fail fast if any raw band-power column is absent
needed_cols = ["alpha_power", "beta_power", "theta_power", "gamma_power"]
missing = [c for c in needed_cols if c not in segments_labeled.columns]
if missing:
    # Adjacent string literals are concatenated as-is, so the space between
    # the two sentences has to be written explicitly.
    raise ValueError(
        f"Missing expected EEG columns: {missing}. "
        f"Got: {segments_labeled.columns.tolist()}"
    )

# Feature matrix: the four raw band powers plus two ratios relative to alpha.
# The small constant keeps the denominator non-zero when alpha_power is 0.
X = segments_labeled[needed_cols].copy()
alpha_safe = segments_labeled["alpha_power"] + 1e-6
X["beta_alpha"] = segments_labeled["beta_power"] / alpha_safe
X["theta_alpha"] = segments_labeled["theta_power"] / alpha_safe

# Target: the session-level label carried on every segment of that session
y = segments_labeled["disengaged"].astype(int)

X.shape, y.value_counts()



((4126, 6),
 disengaged
 1    2076
 0    2050
 Name: count, dtype: int64)

In [10]:
from sklearn.model_selection import GroupShuffleSplit

groups = segments_labeled["session_id"]
# Split by session rather than by row, so that all segments of one session
# land on the same side of the train/test boundary.
gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_idx, test_idx = next(gss.split(X, y, groups))

X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

# Only a cell's last expression is displayed, so the split sizes are printed
# explicitly instead of being left as a bare (discarded) tuple.
print("Train/test shapes:", X_train.shape, X_test.shape)

# even though the sessions were assigned at random, the test set ended up
# much more disengaged-heavy than the training set
train_ratio = y_train.value_counts(normalize=True)
test_ratio = y_test.value_counts(normalize=True)
pd.DataFrame({'Train': train_ratio, 'Test': test_ratio})

Unnamed: 0_level_0,Train,Test
disengaged,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0.566986,0.196931
1,0.433014,0.803069


In [11]:
# Compare the positive-class share in train vs. test for a few candidate seeds.
# Probe-only names are used so the loop does not overwrite train_idx/test_idx/
# y_train/y_test, which would leave them out of sync with X_train/X_test.
# NOTE(review): choosing the seed from test-set label balance uses information
# about the test labels; a stratified grouped splitter may be a cleaner option.
for seed in [1, 5, 12, 27, 42]:
    probe_split = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=seed)
    probe_train_idx, probe_test_idx = next(probe_split.split(X, y, groups))
    train_share = y.iloc[probe_train_idx].mean()
    test_share = y.iloc[probe_test_idx].mean()
    print(seed, np.round(train_share, 2), np.round(test_share, 2))

1 0.5 0.51
5 0.49 0.56
12 0.47 0.62
27 0.5 0.52
42 0.43 0.8


In [12]:
# Fix the split seed to 27, which gave a close-to-even class share in both sets above
BEST_SEED = 27

groups = segments_labeled["session_id"]
gss = GroupShuffleSplit(random_state=BEST_SEED, test_size=0.2, n_splits=1)
train_idx, test_idx = next(gss.split(X, y, groups=groups))

# The splitter yields integer positions, hence positional indexing
X_train = X.iloc[train_idx]
X_test = X.iloc[test_idx]
y_train = y.iloc[train_idx]
y_test = y.iloc[test_idx]

# Class share per side, shown next to each other
train_ratio = y_train.value_counts(normalize=True)
test_ratio = y_test.value_counts(normalize=True)
display(pd.DataFrame({"Train": train_ratio, "Test": test_ratio}))


Unnamed: 0_level_0,Train,Test
disengaged,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0.501541,0.479592
1,0.498459,0.520408


In [13]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Candidate classifiers to compare; each one re-weights classes to offset label imbalance
models = {
    "Logistic Regression": LogisticRegression(
        class_weight="balanced",
        max_iter=2000,
    ),
    "Decision Tree": DecisionTreeClassifier(
        class_weight="balanced",
        random_state=BEST_SEED,
    ),
    "Random Forest": RandomForestClassifier(
        class_weight="balanced_subsample",
        n_estimators=200,
        random_state=BEST_SEED,
    ),
}

# function to evaluate the models
def evaluate(m, X_fit=None, y_fit=None, X_eval=None, y_eval=None):
    """Fit ``m`` and score its predictions on a held-out set.

    Parameters
    ----------
    m : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``. It is fitted in place.
    X_fit, y_fit : optional
        Training features and labels. When omitted, the notebook-level
        ``X_train`` / ``y_train`` that exist at call time are used.
    X_eval, y_eval : optional
        Held-out features and labels. When omitted, the notebook-level
        ``X_test`` / ``y_test`` that exist at call time are used.

    Returns
    -------
    dict
        ``Accuracy``, ``Precision``, ``Recall`` and ``F1``, each rounded to
        three decimals.
    """
    # Fall back to the current notebook split so existing evaluate(m) calls keep working
    X_fit = X_train if X_fit is None else X_fit
    y_fit = y_train if y_fit is None else y_fit
    X_eval = X_test if X_eval is None else X_eval
    y_eval = y_test if y_eval is None else y_eval

    m.fit(X_fit, y_fit)
    p = m.predict(X_eval)
    # zero_division=0 returns the same 0.0 scikit-learn reports for an undefined
    # score (e.g. no positive predictions), but states it explicitly and
    # without emitting a warning.
    return dict(
        Accuracy = round(accuracy_score(y_eval, p), 3),
        Precision = round(precision_score(y_eval, p, zero_division=0), 3),
        Recall = round(recall_score(y_eval, p, zero_division=0), 3),
        F1 = round(f1_score(y_eval, p, zero_division=0), 3)
    )

# Fit and score every candidate: one row per model, one column per metric
results = pd.DataFrame.from_dict(
    {name: evaluate(m) for name, m in models.items()},
    orient="index",
)
results

# models aren't performing well, so they will need to be improved


Unnamed: 0,Accuracy,Precision,Recall,F1
Logistic Regression,0.541,0.565,0.51,0.536
Decision Tree,0.514,0.534,0.514,0.524
Random Forest,0.526,0.548,0.51,0.528


In [14]:
#First improvement attempt

#scaling and tuning the Logistic Regression model
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV, GroupKFold
from sklearn.metrics import confusion_matrix

# Reuse the session-grouped split chosen above, and keep the session id of
# every training row so cross-validation folds are also split by session.
X_tr = X.iloc[train_idx]
X_te = X.iloc[test_idx]
y_tr = y.iloc[train_idx]
y_te = y.iloc[test_idx]
groups_tr = segments_labeled["session_id"].iloc[train_idx]

# Standardize the features, then fit a class-weighted logistic regression
pipe_log = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("clf", LogisticRegression(class_weight="balanced", max_iter=5000, random_state=BEST_SEED)),
])

# Regularization strengths to try for the "clf" step
param_grid = {"clf__C": [0.1, 0.5, 1, 2, 5]}

gkf = GroupKFold(n_splits=5)

# The CV object itself is handed to the search; the groups are supplied at fit time
gs = GridSearchCV(
    estimator=pipe_log,
    param_grid=param_grid,
    scoring="f1",
    cv=gkf,
    n_jobs=-1,
)
gs.fit(X_tr, y_tr, groups=groups_tr)

# Score the best pipeline found by the search on the held-out sessions
best_log = gs.best_estimator_
pred = best_log.predict(X_te)

metrics_scaled = {
    "Accuracy": round(accuracy_score(y_te, pred), 3),
    "Precision": round(precision_score(y_te, pred), 3),
    "Recall": round(recall_score(y_te, pred), 3),
    "F1": round(f1_score(y_te, pred), 3),
    "Best_C": gs.best_params_["clf__C"],
}

metrics_scaled, confusion_matrix(y_te, pred)

({'Accuracy': 0.543,
  'Precision': 0.568,
  'Recall': 0.512,
  'F1': 0.538,
  'Best_C': 0.5},
 array([[244, 179],
        [224, 235]]))

In [15]:
# trying a new label for disengaged: sessions with under 40% engaged time
summary_clean = summary.copy()

# pct_engaged is stored on a 0-100 scale (e.g. 46.8), so the cutoff has to be
# 40 rather than 0.4; with 0.4 no session could ever be labeled disengaged.
summary_clean["disengaged"] = (summary_clean["pct_engaged"] < 40).astype(int)

# Only a cell's last expression is displayed, so the class counts are shown
# explicitly before the distribution summary.
display(summary_clean["disengaged"].value_counts())
summary["pct_engaged"].describe()

count    76.000000
mean     61.528947
std       8.555494
min      40.000000
25%      55.675000
50%      61.200000
75%      67.400000
max      79.400000
Name: pct_engaged, dtype: float64

In [16]:
# Label the lowest-engagement quartile of sessions as disengaged
# (the first quartile was about 55.7 in the recorded run).
q25 = summary["pct_engaged"].quantile(0.25)
summary_clean = summary.assign(
    disengaged=summary["pct_engaged"].le(q25).astype(int)
)
summary_clean["disengaged"].value_counts()

disengaged
0    57
1    19
Name: count, dtype: int64

In [17]:
# Re-attach the new quartile-based label to each EEG segment.
# Join on both identifiers and validate many-to-one, as the first labeling
# merge does: joining on session_id alone would silently duplicate segments
# if a session_id ever appeared on more than one summary row.
segments_labeled = pd.merge(
    eeg,
    summary_clean[["subject_id", "session_id", "disengaged"]],
    on=["subject_id", "session_id"],
    how="inner",
    validate="many_to_one",
)

# Create engineered features: band-power ratios relative to alpha.
# The small constant keeps the denominator non-zero when alpha_power is 0.
segments_labeled["beta_alpha"] = segments_labeled["beta_power"] / (segments_labeled["alpha_power"] + 1e-6)
segments_labeled["theta_alpha"] = segments_labeled["theta_power"] / (segments_labeled["alpha_power"] + 1e-6)

# Feature matrix (four raw band powers + two ratios) and the new target
X = segments_labeled[["alpha_power","beta_power","theta_power","gamma_power","beta_alpha","theta_alpha"]]
y = segments_labeled["disengaged"]

y.value_counts(normalize=True)

disengaged
0    0.759816
1    0.240184
Name: proportion, dtype: float64

In [18]:
# Session-grouped 80/20 split for the quartile-based label, seed 42 as a first try
groups = segments_labeled["session_id"]
gss = GroupShuffleSplit(random_state=42, test_size=0.2, n_splits=1)
train_idx, test_idx = next(gss.split(X, y, groups=groups))

X_train = X.iloc[train_idx]
X_test = X.iloc[test_idx]
y_train = y.iloc[train_idx]
y_test = y.iloc[test_idx]

# Class share on each side of the split
train_ratio = y_train.value_counts(normalize=True)
test_ratio = y_test.value_counts(normalize=True)
pd.DataFrame({"Train": train_ratio, "Test": test_ratio})


Unnamed: 0_level_0,Train,Test
disengaged,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0.82177,0.494885
1,0.17823,0.505115


In [19]:
# Among a few candidate seeds, keep the grouped split whose test-set positive
# rate is closest to the overall positive rate.
target = y.mean()
groups = segments_labeled["session_id"]
candidate_seeds = [1, 5, 12, 27, 42, 73, 99, 123, 333]

# (seed, |test rate - overall rate|, (train positions, test positions))
best = (None, 1.0, None)
for seed in candidate_seeds:
    gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=seed)
    tr_idx, te_idx = next(gss.split(X, y, groups))
    y_te_ratio = y.iloc[te_idx].mean()
    diff = abs(y_te_ratio - target)
    # Strict comparison: on a tie the earlier seed is kept
    if not diff < best[1]:
        continue
    best = (seed, diff, (tr_idx, te_idx))

BEST_SEED1, best_diff, best_split = best
train_idx, test_idx = best_split

print("Chosen seed:", BEST_SEED1)
print(
    "Overall:", round(target,3),
    "Train:", round(y.iloc[train_idx].mean(),3),
    "Test:", round(y.iloc[test_idx].mean(),3),
)


Chosen seed: 1
Overall: 0.24 Train: 0.239 Test: 0.243


In [20]:
# Rebuild the session-grouped split with the seed selected by the search above
groups = segments_labeled["session_id"]
gss = GroupShuffleSplit(random_state=BEST_SEED1, test_size=0.2, n_splits=1)
train_idx, test_idx = next(gss.split(X, y, groups=groups))

X_train = X.iloc[train_idx]
X_test = X.iloc[test_idx]
y_train = y.iloc[train_idx]
y_test = y.iloc[test_idx]

# Class share on each side of the split
train_ratio = y_train.value_counts(normalize=True)
test_ratio = y_test.value_counts(normalize=True)
pd.DataFrame({"Train": train_ratio, "Test": test_ratio})

Unnamed: 0_level_0,Train,Test
disengaged,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0.760668,0.756726
1,0.239332,0.243274


In [21]:
# Same three candidates as before, re-created so they are fitted from scratch
# on the quartile-based label and seeded with the newly chosen split seed.
models = {
    "Logistic Regression": LogisticRegression(
        class_weight="balanced",
        max_iter=2000,
    ),
    "Decision Tree": DecisionTreeClassifier(
        class_weight="balanced",
        random_state=BEST_SEED1,
    ),
    "Random Forest": RandomForestClassifier(
        class_weight="balanced_subsample",
        n_estimators=200,
        random_state=BEST_SEED1,
    ),
}

def evaluate(m, X_fit=None, y_fit=None, X_eval=None, y_eval=None):
    """Fit ``m`` and score its predictions on a held-out set.

    Parameters
    ----------
    m : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``. It is fitted in place.
    X_fit, y_fit : optional
        Training features and labels. When omitted, the notebook-level
        ``X_train`` / ``y_train`` that exist at call time are used.
    X_eval, y_eval : optional
        Held-out features and labels. When omitted, the notebook-level
        ``X_test`` / ``y_test`` that exist at call time are used.

    Returns
    -------
    dict
        ``Accuracy``, ``Precision``, ``Recall`` and ``F1``, each rounded to
        three decimals.
    """
    # Fall back to the current notebook split so existing evaluate(m) calls keep working
    X_fit = X_train if X_fit is None else X_fit
    y_fit = y_train if y_fit is None else y_fit
    X_eval = X_test if X_eval is None else X_eval
    y_eval = y_test if y_eval is None else y_eval

    m.fit(X_fit, y_fit)
    p = m.predict(X_eval)
    # zero_division=0 returns the same 0.0 scikit-learn reports for an undefined
    # score (e.g. no positive predictions), but states it explicitly and
    # without emitting a warning.
    return dict(
        Accuracy = round(accuracy_score(y_eval, p), 3),
        Precision = round(precision_score(y_eval, p, zero_division=0), 3),
        Recall = round(recall_score(y_eval, p, zero_division=0), 3),
        F1 = round(f1_score(y_eval, p, zero_division=0), 3)
    )

# Fit and score every candidate: one row per model, one column per metric
results = pd.DataFrame.from_dict(
    {name: evaluate(m) for name, m in models.items()},
    orient="index",
)
results


Unnamed: 0,Accuracy,Precision,Recall,F1
Logistic Regression,0.541,0.255,0.461,0.328
Decision Tree,0.62,0.218,0.217,0.217
Random Forest,0.737,0.263,0.046,0.078
