In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Load both course files and tag every row with the course it came from.
mat = pd.read_csv("student-mat.csv", sep=";").assign(course="math")
por = pd.read_csv("student-por.csv", sep=";").assign(course="portuguese")

# Stack the two courses into one frame; the target is 1 when the final grade
# G3 is at least 10 and 0 otherwise.
df = pd.concat([mat, por], ignore_index=True)
df["pass"] = df["G3"].ge(10).astype(int)

# The period grades G1/G2 and the final grade G3 are removed from the features
# (together with the target itself) to avoid leakage.
grade_and_target_cols = ["G1", "G2", "G3", "pass"]
y = df["pass"]
X = df.drop(columns=grade_and_target_cols)

# 80 / 10 / 10 stratified split: hold out 20%, then halve it into val and test.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.20, stratify=y, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.50, stratify=y_temp, random_state=42
)

# Object-dtype columns are one-hot encoded; everything else is standardized.
cat_cols = X.select_dtypes(include=["object"]).columns
num_cols = X.select_dtypes(exclude=["object"]).columns

numeric_step = ("num", StandardScaler(), num_cols)
categorical_step = ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols)
preprocess = ColumnTransformer(transformers=[numeric_step, categorical_step])

# Baseline model: preprocessing followed by a plain logistic regression.
clf = Pipeline(steps=[
    ("prep", preprocess),
    ("lr", LogisticRegression(max_iter=5000)),
])

clf.fit(X_train, y_train)

def evaluate(name, X_split, y_true):
    """Score the fitted baseline pipeline ``clf`` on one split and print the metrics.

    Parameters
    ----------
    name : str
        Heading printed above the metrics (e.g. "VALIDATION").
    X_split : DataFrame
        Feature rows handed to ``clf.predict``.
    y_true : array-like
        True 0/1 labels for ``X_split`` (1 = pass, 0 = fail).

    Returns
    -------
    tuple
        ``(y_pred, cm)``: the predicted labels and the 2x2 confusion matrix
        with rows = actual and columns = predicted, both in the order [0, 1].
    """
    y_pred = clf.predict(X_split)
    # Pin the label order so the matrix is always 2x2, even when a split or
    # the predictions contain only one class; the plotting code indexes
    # cm[i, j] for i, j in {0, 1}.
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])

    print(f"\n{name}")
    print("Accuracy :", accuracy_score(y_true, y_pred))
    # zero_division=0 reports 0 instead of warning when a denominator is empty.
    print("Precision:", precision_score(y_true, y_pred, zero_division=0))
    print("Recall   :", recall_score(y_true, y_pred, zero_division=0))
    print("F1       :", f1_score(y_true, y_pred, zero_division=0))
    print("Confusion matrix:\n", cm)

    return y_pred, cm

val_pred, _ = evaluate("VALIDATION", X_val, y_val)
test_pred, test_cm = evaluate("TEST", X_test, y_test)

# Heat-map of the test confusion matrix with the count written in every cell.
class_labels = ["Fail (0)", "Pass (1)"]
fig, ax = plt.subplots(figsize=(5, 4))
heatmap = ax.imshow(test_cm, interpolation="nearest")
ax.set_title("Confusion Matrix (Test)")
fig.colorbar(heatmap, ax=ax)
ax.set_xticks([0, 1])
ax.set_xticklabels(class_labels)
ax.set_yticks([0, 1])
ax.set_yticklabels(class_labels)

# Rows are the actual class, columns the predicted class.
for i, j in np.ndindex(2, 2):
    ax.text(j, i, str(test_cm[i, j]), ha="center", va="center")

ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
fig.tight_layout()
plt.show()

# Rebuild the post-preprocessing feature names in the ColumnTransformer's
# output order: numeric columns first, then the one-hot columns.
ohe = clf.named_steps["prep"].named_transformers_["cat"]
cat_names = ohe.get_feature_names_out(cat_cols)
feature_names = np.concatenate([num_cols.to_numpy(), cat_names])

coefs = clf.named_steps["lr"].coef_[0]

# Rank coefficients by absolute size, largest first, and keep the top_k.
top_k = 15
order_by_magnitude = np.argsort(np.abs(coefs))[::-1]
top_idx = order_by_magnitude[:top_k]

top_features = feature_names[top_idx]
top_coefs = coefs[top_idx]

fig, ax = plt.subplots(figsize=(9, 5))
ax.barh(top_features, top_coefs)
ax.invert_yaxis()  # largest |coef| at the top of the chart
ax.set_title("Top 15 Logistic Regression Coefficients (by |coef|)")
ax.set_xlabel("Coefficient  ( + increases pass probability,  - decreases )")
fig.tight_layout()
plt.show()

print("\nTop 15 features:")
for feat, weight in zip(top_features, top_coefs):
    print(f"{feat:35s}  {weight:+.4f}")

FileNotFoundError: [Errno 2] No such file or directory: 'student-mat.csv'

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import (
    train_test_split, GridSearchCV, StratifiedKFold, cross_val_predict
)
from sklearn.preprocessing import OneHotEncoder, StandardScaler, PolynomialFeatures
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    confusion_matrix, balanced_accuracy_score
)

# progress bar stuff
from tqdm.auto import tqdm
import joblib


class TqdmJoblib(joblib.parallel.BatchCompletionCallBack):
    """joblib batch-completion callback that also advances a progress bar.

    Takes the same arguments as the parent callback plus one required
    keyword, ``pbar``: the progress bar whose ``update`` method is called
    each time a batch completes.
    """

    def __init__(self, *args, **kwargs):
        # Strip our extra keyword before delegating; the parent class does
        # not accept it.
        self.pbar = kwargs.pop("pbar")
        super().__init__(*args, **kwargs)

    def __call__(self, *args, **kwargs):
        # Advance the bar by the number of tasks in the finished batch, then
        # let the parent callback do its normal work.
        self.pbar.update(self.batch_size)
        return super().__call__(*args, **kwargs)


# ----------------------------
#  load + combine
# ----------------------------
# NOTE: unlike the baseline cell, no `course` column is added here, so the
# X / X_train / X_val / X_test names rebound below have a different schema
# from the frames the baseline `clf` was fitted on.
df_mat, df_por = (
    pd.read_csv(csv_path, sep=";")
    for csv_path in ("student-mat.csv", "student-por.csv")
)
df = pd.concat([df_mat, df_por], ignore_index=True)

# ----------------------------
# label (pass/fail)
# ----------------------------
# 1 when the final grade G3 is at least 10, otherwise 0.
df["pass"] = df["G3"].ge(10).astype(int)

# The grade columns and the target are excluded from the features.
y = df["pass"]
X = df.drop(columns=["G1", "G2", "G3", "pass"])

print("shape:", X.shape, flush=True)
print("pass rate:", y.mean(), flush=True)

# 80 / 10 / 10 stratified split: hold out 20%, then halve it into val and test.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.20, random_state=42, stratify=y
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.50, random_state=42, stratify=y_temp
)

print("train:", X_train.shape, "val:", X_val.shape, "test:", X_test.shape, flush=True)

# Object-dtype columns are one-hot encoded; everything else is standardized.
cat_cols = X.select_dtypes(include=["object"]).columns
num_cols = X.select_dtypes(exclude=["object"]).columns

scale_numeric = ("num", StandardScaler(), num_cols)
encode_categorical = ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols)
prep = ColumnTransformer(transformers=[scale_numeric, encode_categorical])

# Preprocess -> pairwise interaction terms -> elastic-net logistic regression.
pipe = Pipeline([
    ("prep", prep),
    ("interactions", PolynomialFeatures(
        degree=2,
        interaction_only=True,   # products of distinct features only, no squares
        include_bias=False
    )),
    ("lr", LogisticRegression(
        penalty="elasticnet",
        solver="saga",            # required for elastic net
        class_weight="balanced",
        max_iter=30000,
        random_state=42,          # saga shuffles the data; seed it so CV scores
                                  # and the refit model are reproducible
        n_jobs=-1
    ))
])

# ----------------------------
# 7) tune C + l1_ratio w/ 10-fold CV
# ----------------------------
# l1_ratio is supplied by the grid for every candidate, so it is not set on
# the estimator above.
param_grid = {
    "lr__C": [0.01, 0.05, 0.1, 0.2, 0.5, 1, 2, 5],
    "lr__l1_ratio": [0.1, 0.3, 0.5, 0.7, 0.9]
}

cv10 = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

grid = GridSearchCV(
    pipe,
    param_grid=param_grid,
    scoring="balanced_accuracy",
    cv=cv10,
    n_jobs=-1,
    refit=True   # refit the best candidate on all of X_train after the search
)

# Total number of cross-validation fits; used as the progress-bar length.
n_candidates = len(param_grid["lr__C"]) * len(param_grid["lr__l1_ratio"])
n_fits = n_candidates * cv10.get_n_splits()

print(f"\nstarting gridsearch: {n_candidates} param combos x {cv10.get_n_splits()} folds = {n_fits} fits", flush=True)

with tqdm(total=n_fits, desc="gridsearch fits", leave=True) as pbar:
    # Temporarily swap joblib's batch-completion callback for one that also
    # ticks the progress bar; the original is always restored in `finally`,
    # even if the fit raises.
    old_cb = joblib.parallel.BatchCompletionCallBack
    joblib.parallel.BatchCompletionCallBack = lambda *args, **kwargs: TqdmJoblib(*args, pbar=pbar, **kwargs)
    try:
        grid.fit(X_train, y_train)
    finally:
        joblib.parallel.BatchCompletionCallBack = old_cb

# Best pipeline, already refit on the full training split (refit=True).
model = grid.best_estimator_

print("\nbest params:", grid.best_params_, flush=True)
print("best cv balanced acc:", grid.best_score_, flush=True)

def eval_it(tag, X_split, y_true, thr=0.5):
    """Score the tuned pipeline ``model`` on one split at a given threshold.

    Parameters
    ----------
    tag : str
        Heading printed above the metrics.
    X_split : DataFrame
        Feature rows handed to ``model.predict_proba``.
    y_true : array-like
        True 0/1 labels for ``X_split`` (1 = pass, 0 = fail).
    thr : float, default 0.5
        A row is predicted "pass" when its class-1 probability is >= ``thr``.

    Returns
    -------
    tuple
        ``(probs, preds, cm)``: class-1 probabilities, thresholded 0/1
        predictions, and the 2x2 confusion matrix (rows = actual,
        columns = predicted, both in the order [0, 1]).
    """
    probs = model.predict_proba(X_split)[:, 1]
    preds = (probs >= thr).astype(int)

    # Pin the label order so the matrix is always 2x2; without it a split
    # whose labels and predictions share a single class gives a 1x1 matrix
    # and cm[0, 1] below raises IndexError.
    cm = confusion_matrix(y_true, preds, labels=[0, 1])
    TN, FP = cm[0, 0], cm[0, 1]
    # Recall for the fail class (0); 0.0 when the split has no true failures.
    fail_recall = TN / (TN + FP) if (TN + FP) else 0.0

    print("\n" + tag)
    print("threshold:", round(thr, 3))
    print("accuracy:", round(accuracy_score(y_true, preds), 4))
    print("balanced acc:", round(balanced_accuracy_score(y_true, preds), 4))
    print("precision (pass):", round(precision_score(y_true, preds, zero_division=0), 4))
    print("recall (pass):   ", round(recall_score(y_true, preds, zero_division=0), 4))
    print("f1 (pass):       ", round(f1_score(y_true, preds, zero_division=0), 4))
    print("recall (fail):   ", round(fail_recall, 4))
    print("confusion matrix:\n", cm)

    return probs, preds, cm

# ----------------------------
# 9) pick the decision threshold from OUT-OF-FOLD train probabilities
# ----------------------------
print("\ngetting out-of-fold probs for threshold (this part can take a bit)...", flush=True)

# Every training row is scored by a copy of the model that did not see it
# during fitting; column 1 is the probability of class 1 (pass).
oof_proba = cross_val_predict(
    model, X_train, y_train,
    cv=cv10,
    method="predict_proba",
    n_jobs=-1
)
oof_probs = oof_proba[:, 1]

# Candidate thresholds 0.10, 0.11, ..., 0.90.
ths = np.linspace(0.1, 0.9, 81)

# Balanced accuracy at each candidate; argmax keeps the first (lowest)
# threshold when several tie for the best score.
bal_by_thr = [
    balanced_accuracy_score(y_train, (oof_probs >= t).astype(int))
    for t in ths
]
winner = int(np.argmax(bal_by_thr))
best_thr = ths[winner]
best_bal = bal_by_thr[winner]

print("\nthreshold picked from OOF train preds:", round(best_thr, 3), flush=True)
print("OOF train balanced acc at that threshold:", round(best_bal, 4), flush=True)

# ----------------------------
# 10) final eval
# ----------------------------
# Score both held-out splits at the threshold chosen on the training folds.
_ = eval_it("VALIDATION (OOF threshold)", X_val, y_val, thr=best_thr)
test_probs, test_preds, test_cm = eval_it("TEST (OOF threshold)", X_test, y_test, thr=best_thr)

# ----------------------------
# 11) plot confusion matrix (test)
# ----------------------------
class_labels = ["fail (0)", "pass (1)"]
fig, ax = plt.subplots(figsize=(5, 4))
heatmap = ax.imshow(test_cm, interpolation="nearest")
ax.set_title("confusion matrix (test)")
fig.colorbar(heatmap, ax=ax)
ax.set_xticks([0, 1])
ax.set_xticklabels(class_labels)
ax.set_yticks([0, 1])
ax.set_yticklabels(class_labels)

# Write the count in every cell; rows are actual, columns are predicted.
for i, j in np.ndindex(2, 2):
    ax.text(j, i, str(test_cm[i, j]), ha="center", va="center")

ax.set_xlabel("predicted")
ax.set_ylabel("actual")
fig.tight_layout()
plt.show()

# ----------------------------
# 12) print top non-zero coefficients (includes interactions)
# ----------------------------
# Pull the three fitted stages out of the tuned pipeline.
prep_fitted, poly_fitted, lr_fitted = (
    model.named_steps[step] for step in ("prep", "interactions", "lr")
)

# Names after preprocessing (numeric columns first, then one-hot columns),
# then expanded by the interaction step so they line up with the coefficients.
ohe = prep_fitted.named_transformers_["cat"]
cat_names = ohe.get_feature_names_out(cat_cols)
base_feat_names = np.concatenate([num_cols.to_numpy(), cat_names])

poly_feat_names = poly_fitted.get_feature_names_out(base_feat_names)

# Indices of the coefficients that are not exactly zero.
coefs = lr_fitted.coef_[0]
nz = np.where(coefs != 0)[0]

print("\nnon-zero features kept:", len(nz), "out of", len(poly_feat_names), flush=True)

# Among the non-zero coefficients, keep the top_k largest by absolute value.
top_k = 25
nz_by_magnitude = np.argsort(np.abs(coefs[nz]))[::-1]
top_idx = nz[nz_by_magnitude[:top_k]]

print("\ntop features (by |coef|):", flush=True)
for feat, weight in zip(poly_feat_names[top_idx], coefs[top_idx]):
    print(f"{feat:55s}  {weight:+.4f}")

fig, ax = plt.subplots(figsize=(11, 6))
# Reversed so the largest |coef| ends up at the top of the horizontal chart.
ax.barh(poly_feat_names[top_idx][::-1], coefs[top_idx][::-1])
ax.set_title("top elastic net logistic regression coefficients (with interactions)")
ax.set_xlabel("coef  (+ helps pass, - hurts pass)")
fig.tight_layout()
plt.show()

In [None]:

# Minimum share of true failures (class 0) that must be predicted as fail.
target_recall_fail = 0.80

# Estimator whose decision threshold is tuned and reported in this cell.
# The X_val / X_test frames in scope here were rebuilt by the tuned-model cell
# WITHOUT the `course` column, so the baseline `clf` (fitted with `course`)
# cannot score them. `model` was fitted on exactly this schema. Only point
# this at `clf` if X_val / X_test still come from the baseline cell.
threshold_estimator = model


def _recall_fail(y_true, pred):
    """Return recall for the fail class (0): TN / (TN + FP), or 0.0 if there are no true failures."""
    TN, FP, FN, TP = confusion_matrix(y_true, pred, labels=[0, 1]).ravel()
    return TN / (TN + FP) if (TN + FP) else 0.0


val_probs_pass = threshold_estimator.predict_proba(X_val)[:, 1]

# Candidate thresholds 0.01, 0.02, ..., 0.99.
thresholds = np.linspace(0.01, 0.99, 99)

best_thr = None
best_score = -1

# Pass 1: among thresholds that meet the fail-recall target, keep the one with
# the best balanced accuracy (first one wins on ties).
for thr in thresholds:
    val_pred = (val_probs_pass >= thr).astype(int)  # 1=pass, 0=fail

    if _recall_fail(y_val, val_pred) >= target_recall_fail:
        score = balanced_accuracy_score(y_val, val_pred)
        if score > best_score:
            best_score = score
            best_thr = thr

# Pass 2 (fallback): no threshold met the target, so take the one with the
# highest fail recall instead; best_score stays -1 and is reported as "N/A".
if best_thr is None:
    best_thr = 0.5
    best_recall_fail = -1
    for thr in thresholds:
        val_pred = (val_probs_pass >= thr).astype(int)
        recall_fail = _recall_fail(y_val, val_pred)
        if recall_fail > best_recall_fail:
            best_recall_fail = recall_fail
            best_thr = thr

print("Chosen threshold (from VAL):", round(best_thr, 3))
print("VAL best balanced acc (subject to recall_fail target):", round(best_score, 4) if best_score != -1 else "N/A")

def report_split(tag, X_split, y_true, thr, estimator=None):
    """Print threshold-based metrics for one split.

    Parameters
    ----------
    tag : str
        Heading printed above the metrics.
    X_split : DataFrame
        Feature rows handed to ``predict_proba``.
    y_true : array-like
        True 0/1 labels for ``X_split`` (1 = pass, 0 = fail).
    thr : float
        A row is predicted "pass" when its class-1 probability is >= ``thr``.
    estimator : fitted classifier, optional
        Model to score with; defaults to ``threshold_estimator`` so the report
        uses the same model the threshold was tuned on.
    """
    est = threshold_estimator if estimator is None else estimator
    probs_pass = est.predict_proba(X_split)[:, 1]
    pred = (probs_pass >= thr).astype(int)

    cm = confusion_matrix(y_true, pred, labels=[0, 1])
    TN, FP, FN, TP = cm.ravel()

    recall_fail = TN / (TN + FP) if (TN + FP) else 0.0  # recall for fail (class 0)
    recall_pass = TP / (TP + FN) if (TP + FN) else 0.0  # recall for pass (class 1)

    print("\n" + tag)
    print("thr:", round(thr, 3))
    print("accuracy:", round(accuracy_score(y_true, pred), 4))
    print("balanced acc:", round(balanced_accuracy_score(y_true, pred), 4))
    print("precision (pass=1):", round(precision_score(y_true, pred, zero_division=0), 4))
    print("recall (pass=1):   ", round(recall_pass, 4))
    print("recall (fail=0):   ", round(recall_fail, 4))
    print("f1 (pass=1):       ", round(f1_score(y_true, pred, zero_division=0), 4))
    print("confusion matrix [ [TN FP], [FN TP] ]:\n", cm)

# The validation numbers are optimistic because the threshold was selected on
# this same split; the test split is the unbiased check.
report_split("VALIDATION (chosen thr)", X_val, y_val, best_thr)
report_split("TEST (chosen thr)", X_test, y_test, best_thr)