In [157]:
# imports
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_predict, GroupKFold
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
)

# Shared results store: one metrics dict per evaluated model is appended here
# and later turned into the leaderboard table. Re-running this cell clears it.
experiment_log: list = []

In [159]:
# functions for threshold selection
def choose_threshold_max_f1(y_true, proba):
    """Return the decision threshold with the highest F1 on the given data.

    Candidate thresholds come from ``precision_recall_curve``; F1 is computed
    from the matching precision/recall pairs and the threshold at the best
    (NaN-ignoring) F1 is returned as a plain float. If the curve yields no
    thresholds at all, 0.5 is returned as a fallback.
    """
    from sklearn.metrics import precision_recall_curve

    precision, recall, cutoffs = precision_recall_curve(y_true, proba)
    if not len(cutoffs):
        return 0.5

    # Clip the denominator so precision + recall == 0 gives F1 = 0 instead of 0/0.
    denominator = np.clip(precision + recall, 1e-9, None)
    f1_scores = 2 * precision * recall / denominator

    # The final F1 entry is dropped so indices line up with `cutoffs`
    # (per the sklearn docs, precision/recall carry one extra trailing point).
    best_idx = np.nanargmax(f1_scores[:-1])
    return float(cutoffs[best_idx])

def choose_threshold_max_acc(y_true, proba, n_thresholds=100):
    """Pick the probability cut-off that maximises accuracy on the given data.

    Candidates are ``n_thresholds`` evenly spaced values covering [0, 1]
    (both ends included). A sample is predicted as class 1 when its score is
    ``>=`` the candidate. When several candidates reach the same best
    accuracy the smallest one is returned, because ``np.argmax`` reports the
    first maximum.

    Parameters
    ----------
    y_true : array-like
        True labels, expected to be 0/1 so they compare equal to the
        thresholded predictions.
    proba : array-like
        Predicted scores for class 1, same length as ``y_true``. Lists,
        numpy arrays and pandas Series are all accepted.
    n_thresholds : int, default 100
        Number of candidate thresholds on the [0, 1] grid.

    Returns
    -------
    float
        The selected threshold.

    Raises
    ------
    ValueError
        If ``n_thresholds`` is below 1, the inputs are empty, or the inputs
        differ in length.
    """
    if n_thresholds < 1:
        raise ValueError("n_thresholds must be at least 1")

    # Coerce to flat arrays so plain lists work and pandas indexes are ignored.
    labels = np.asarray(y_true).ravel()
    scores = np.asarray(proba).ravel()
    if labels.size == 0:
        raise ValueError("cannot choose a threshold from empty input")
    if labels.shape != scores.shape:
        raise ValueError(
            f"y_true and proba must have the same length, got {labels.size} and {scores.size}"
        )

    thresholds = np.linspace(0, 1, n_thresholds)
    # Accuracy = fraction of samples whose thresholded prediction equals the label.
    accs = [np.mean((scores >= t).astype(int) == labels) for t in thresholds]
    return float(thresholds[np.argmax(accs)])

In [161]:
# function to display confusion matrix and key metrics
def show_confusion_and_explain(model_name, y_true, proba, thr):
    """Display the thresholded confusion matrix and print headline metrics.

    Scores in ``proba`` at or above ``thr`` are predicted as class 1. The
    matrix is laid out with class 1 (labelled "disengaged") first on both
    axes, then precision, recall and accuracy derived from its four cells are
    printed on one line prefixed by ``model_name``.
    """
    row_names = ["Actual 1 (disengaged)", "Actual 0 (engaged)"]
    col_names = ["Pred 1 (disengaged)", "Pred 0 (engaged)"]

    predicted = (proba >= thr).astype(int)
    matrix = confusion_matrix(y_true, predicted, labels=[1, 0])
    display(pd.DataFrame(matrix, index=row_names, columns=col_names))

    # labels=[1, 0] puts class 1 first: row 0 = actual 1, column 0 = predicted 1,
    # so the row-major flattening reads TP, FN, FP, TN.
    tp, fn, fp, tn = matrix.ravel()

    eps = 1e-9  # keeps precision/recall finite when their denominator is zero
    precision = tp / (tp + fp + eps)
    recall = tp / (tp + fn + eps)
    accuracy = (tp + tn) / (tp + tn + fp + fn)

    print(f"{model_name} → Precision={precision:.3f}, Recall={recall:.3f}, Accuracy={accuracy:.3f}")

In [163]:
# function to train, evaluate, and log results
def train_and_evaluate(model, name, X_train, X_test, y_train, y_test, groups):
    """Tune a decision threshold with grouped CV, score on the test set, and log it.

    Steps:
      1. Collect out-of-fold ``predict_proba`` scores (column 1) on the
         training data using 5-fold ``GroupKFold`` split by ``groups``.
      2. Choose the threshold that maximises accuracy on those scores.
      3. Refit ``model`` on the full training set and threshold its test
         scores.
      4. Print accuracy/precision/recall/F1, show the confusion matrix, and
         record the rounded metrics in the module-level ``experiment_log``.

    Parameters
    ----------
    model : estimator exposing ``fit`` and ``predict_proba``
        Fitted in place on ``X_train``/``y_train``.
    name : str
        Label used in printed output and as the ``"Model"`` key of the log row.
    X_train, X_test : feature matrices for training and evaluation.
    y_train, y_test : corresponding labels (binary; column 1 of
        ``predict_proba`` is treated as the positive-class score).
    groups : group label per training row, passed to the grouped CV splitter.

    Returns
    -------
    None

    Side effects
    ------------
    Prints and displays results and updates ``experiment_log``. Calling the
    function again with the same ``name`` replaces the earlier row instead of
    adding a duplicate, so re-running a cell does not bloat the leaderboard.
    """
    cv = GroupKFold(n_splits=5)

    # Out-of-fold scores: every training row is scored by a model that never
    # saw its group, so the threshold is not tuned on fitted-on data.
    oof = cross_val_predict(
        model, X_train, y_train, groups=groups,
        cv=cv, method="predict_proba", n_jobs=-1
    )[:, 1]
    thr = choose_threshold_max_acc(y_train, oof)

    # Fit on all training data and evaluate on the held-out test set.
    model.fit(X_train, y_train)
    proba_test = model.predict_proba(X_test)[:, 1]
    y_pred = (proba_test >= thr).astype(int)

    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred)
    rec = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    print(f"\n{name} Results:")
    print(f"Accuracy: {acc*100:.2f}% | Precision: {prec:.3f} | Recall: {rec:.3f} | F1: {f1:.3f}")
    show_confusion_and_explain(name, y_test, proba_test, thr)

    record = {
        "Model": name,
        "Accuracy": round(acc, 3),
        "Precision": round(prec, 3),
        "Recall": round(rec, 3),
        "F1": round(f1, 3),
        "Thr": round(thr, 3)
    }

    # Drop any stale row for this model before logging (in-place slice
    # assignment keeps the same list object that other cells reference).
    experiment_log[:] = [row for row in experiment_log if row.get("Model") != name]
    experiment_log.append(record)

In [165]:
# Baseline estimators: no class_weight is set, and every model shares SEED
# as its random_state so runs are repeatable.
lr_model = LogisticRegression(random_state=SEED, max_iter=5000)

dt_model = DecisionTreeClassifier(
    random_state=SEED,
    max_depth=10,
    min_samples_split=3,
)

rf_model = RandomForestClassifier(
    random_state=SEED,
    n_jobs=-1,
    n_estimators=800,
    max_depth=14,
    min_samples_leaf=3,
)

In [167]:
# Run the same train/evaluate/log routine for every candidate model, in the
# order they should appear in the printed output.
for estimator, label in (
    (lr_model, "Logistic Regression"),
    (dt_model, "Decision Tree"),
    (rf_model, "Random Forest"),
):
    train_and_evaluate(estimator, label,
                       X_tr_s, X_te_s, y_tr_s, y_te_s, g_tr_s)


Logistic Regression Results:
Accuracy: 52.94% | Precision: 0.455 | Recall: 0.714 | F1: 0.556


Unnamed: 0,Pred 1 (disengaged),Pred 0 (engaged)
Actual 1 (disengaged),5,2
Actual 0 (engaged),6,4


Logistic Regression → Precision=0.455, Recall=0.714, Accuracy=0.529

Decision Tree Results:
Accuracy: 82.35% | Precision: 0.700 | Recall: 1.000 | F1: 0.824


Unnamed: 0,Pred 1 (disengaged),Pred 0 (engaged)
Actual 1 (disengaged),7,0
Actual 0 (engaged),3,7


Decision Tree → Precision=0.700, Recall=1.000, Accuracy=0.824

Random Forest Results:
Accuracy: 76.47% | Precision: 0.636 | Recall: 1.000 | F1: 0.778


Unnamed: 0,Pred 1 (disengaged),Pred 0 (engaged)
Actual 1 (disengaged),7,0
Actual 0 (engaged),4,6


Random Forest → Precision=0.636, Recall=1.000, Accuracy=0.765


In [168]:
# final leaderboard: one row per logged model, best accuracy first
df = pd.DataFrame(experiment_log)
if df.empty:
    # Nothing has been logged (e.g. the evaluation cell was not run); sorting
    # by "Accuracy" would raise KeyError on an empty frame.
    print("No experiments logged yet - run train_and_evaluate first.")
else:
    df = df.sort_values("Accuracy", ascending=False).reset_index(drop=True)
    display(df)
    best = df.iloc[0]
    # Logged accuracy is rounded to 3 decimals, so only one decimal of the
    # percentage is meaningful (":.2f" printed a misleading 82.40% for 82.35%).
    print(f"Best by Accuracy: {best['Model']} ({best['Accuracy'] * 100:.1f}%)")

Unnamed: 0,Model,Accuracy,Precision,Recall,F1,Thr
0,Decision Tree,0.824,0.7,1.0,0.824,0.01
1,Random Forest,0.765,0.636,1.0,0.778,0.404
2,Logistic Regression,0.529,0.455,0.714,0.556,0.434


Best by Accuracy: Decision Tree (82.40%)
