In [None]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import classification_report
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.linear_model import LogisticRegression
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE


#Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.model_selection import RandomizedSearchCV


#XGBoost
from xgboost import XGBClassifier


# Metrics
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, confusion_matrix, classification_report,
    average_precision_score
)

In [2]:
#Global Variables
csv_path = "StudentPerformanceFactors.csv"
target = "Pass"

In [3]:
#Load data

if not os.path.exists(csv_path):
    raise FileNotFoundError("Path/File not found")
else: 
    df = pd.read_csv(csv_path)


In [4]:
df.head()

Unnamed: 0,Hours_Studied,Attendance,Parental_Involvement,Access_to_Resources,Extracurricular_Activities,Sleep_Hours,Previous_Scores,Motivation_Level,Internet_Access,Tutoring_Sessions,Family_Income,Teacher_Quality,School_Type,Peer_Influence,Physical_Activity,Learning_Disabilities,Parental_Education_Level,Distance_from_Home,Gender,Exam_Score
0,23,84,Low,High,No,7,73,Low,Yes,0,Low,Medium,Public,Positive,3,No,High School,Near,Male,67
1,19,64,Low,Medium,No,8,59,Low,Yes,2,Medium,Medium,Public,Negative,4,No,College,Moderate,Female,61
2,24,98,Medium,Medium,Yes,7,91,Medium,Yes,2,Medium,Medium,Public,Neutral,4,No,Postgraduate,Near,Male,74
3,29,89,Low,Medium,Yes,8,98,Medium,Yes,1,Medium,Medium,Public,Negative,4,No,High School,Moderate,Male,71
4,19,92,Medium,Medium,Yes,6,65,Medium,Yes,3,Medium,High,Public,Neutral,4,No,College,Near,Female,70


In [5]:
#Cleaning data

df = df.drop_duplicates().copy()
for col in df.select_dtypes(include = ["object"]).columns:
        df[col] = df[col].astype(str).str.strip()
df = df.dropna(subset = ["Exam_Score"])

In [6]:
#Creating target column

df[target] = [0 if c<60 else 1 for c in df["Exam_Score"]]
df = df.drop(columns = ["Exam_Score"])
df.head()

Unnamed: 0,Hours_Studied,Attendance,Parental_Involvement,Access_to_Resources,Extracurricular_Activities,Sleep_Hours,Previous_Scores,Motivation_Level,Internet_Access,Tutoring_Sessions,Family_Income,Teacher_Quality,School_Type,Peer_Influence,Physical_Activity,Learning_Disabilities,Parental_Education_Level,Distance_from_Home,Gender,Pass
0,23,84,Low,High,No,7,73,Low,Yes,0,Low,Medium,Public,Positive,3,No,High School,Near,Male,1
1,19,64,Low,Medium,No,8,59,Low,Yes,2,Medium,Medium,Public,Negative,4,No,College,Moderate,Female,1
2,24,98,Medium,Medium,Yes,7,91,Medium,Yes,2,Medium,Medium,Public,Neutral,4,No,Postgraduate,Near,Male,1
3,29,89,Low,Medium,Yes,8,98,Medium,Yes,1,Medium,Medium,Public,Negative,4,No,High School,Moderate,Male,1
4,19,92,Medium,Medium,Yes,6,65,Medium,Yes,3,Medium,High,Public,Neutral,4,No,College,Near,Female,1


In [7]:
#Proportion of passing students
passed = df[target].sum()
print("% of Students Passing:", (passed/6606)*100)

% of Students Passing: 98.98577051165607


In [8]:
#Split data
X = df.drop(columns = [target])
y = df[target]

features = X.columns

cat_cols = [c for c in features if X[c].dtype == "object"]
num_cols = X.select_dtypes(include=['number']).columns

print("cat_cols:", cat_cols)
print("num_cols:", num_cols)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42, stratify = y)


cat_cols: ['Parental_Involvement', 'Access_to_Resources', 'Extracurricular_Activities', 'Motivation_Level', 'Internet_Access', 'Family_Income', 'Teacher_Quality', 'School_Type', 'Peer_Influence', 'Learning_Disabilities', 'Parental_Education_Level', 'Distance_from_Home', 'Gender']
num_cols: Index(['Hours_Studied', 'Attendance', 'Sleep_Hours', 'Previous_Scores',
       'Tutoring_Sessions', 'Physical_Activity'],
      dtype='object')


In [9]:


def evaluate_clf(y_true, y_pred, y_proba = None, label = "Model"):

    print(f"\n=== {label} ===")
    print(f"Accuracy : {accuracy_score(y_true, y_pred):.4f}")          # fraction of correct predictions
    print(f"Precision: {precision_score(y_true, y_pred):.4f}")         # of predicted 1s, how many are actually 1
    print(f"Recall   : {recall_score(y_true, y_pred):.4f}")            # of actual 1s, how many we caught
    print(f"F1       : {f1_score(y_true, y_pred):.4f}")                # harmonic mean of precision & recall
    if y_proba is not None:
        print(f"ROC-AUC : {roc_auc_score(y_true, y_proba):.4f}")       # probability threshold-independent metric
    print("\nConfusion Matrix:\n", confusion_matrix(y_true, y_pred))   # [[TN, FP],[FN, TP]]
    print("\nClassification Report:\n", classification_report(y_true, y_pred, digits=4))

def evaluate_cls(y_true, y_pred, y_proba=None, label="Model", pos_label=1):
    print(f"\n=== {label} ===")
    print("Accuracy :", accuracy_score(y_true, y_pred))
    print("Precision:", precision_score(y_true, y_pred, zero_division=0, pos_label=pos_label))
    print("Recall   :", recall_score(y_true, y_pred, zero_division=0, pos_label=pos_label))
    print("F1       :", f1_score(y_true, y_pred, zero_division=0, pos_label=pos_label))
    if y_proba is not None:
        # accept either 1D positive-class probs or 2D predict_proba output
        proba = y_proba[:, 1] if y_proba.ndim == 2 else y_proba
        print("ROC-AUC  :", roc_auc_score(y_true, proba))
        print("PR-AUC   :", average_precision_score(y_true, proba))
    print("\nConfusion Matrix:\n", confusion_matrix(y_true, y_pred))
    print("\nClassification Report:\n", classification_report(y_true, y_pred, zero_division=0))


In [10]:
#Pre processor

cat_pipe = Pipeline(steps = [
    ("imputer", SimpleImputer(strategy = "most_frequent")),
    ("ohe", OneHotEncoder(handle_unknown = "ignore", sparse_output = False))
])

num_pipe = Pipeline(steps = [
    ("imputer", SimpleImputer(strategy = "median")),
    ("scalar", StandardScaler())
])

preprocessor = ColumnTransformer(
    transformers = [
        ("num", num_pipe, num_cols),
        ("cat", cat_pipe, cat_cols)
    ],
    remainder = "drop",
    verbose_feature_names_out = False
)

In [11]:
logreg_pipe = Pipeline([
    ("preprocess", preprocessor),
    ("smote",SMOTE(random_state = 42)),
    ("selectkbest", SelectKBest(score_func=f_classif)),
    ("clf", LogisticRegression(class_weight="balanced"))
])

param_grid = {
    "selectkbest__k": [5, 10, 15, 20],  # must match step name
    "clf__C": [0.01, 0.1, 1, 10]
}

logreg_model = GridSearchCV(logreg_pipe, param_grid, cv=5, scoring="f1")
logreg_model.fit(X_train, y_train)

print("Best parameters:", logreg_model.best_params_)
print("Best score:", logreg_model.best_score_)


Best parameters: {'clf__C': 10, 'selectkbest__k': 20}
Best score: 0.9966459947387936


In [12]:
def majority_baseline(y_train, n_pred):
    majority = pd.Series(y_train).mode()[0]

    return np.full(shape = n_pred, fill_value = majority, dtype = int)

y_pred_base = majority_baseline(y_train, n_pred = len(y_test))
evaluate_clf(y_test, y_pred_base, label = "Baseline (Majority Class)")



=== Baseline (Majority Class) ===


NameError: name 'accuracy_score' is not defined

In [None]:
y_pred = logreg_model.predict(X_test)
evaluate_clf(y_test, y_pred, label = "Evaluate Logistic Regression")

In [None]:
#Random Forest

In [None]:
rf_pipeline = Pipeline([
        ("prep", preprocessor),
        ("smote",SMOTE(random_state = 42)),
        ("select", SelectKBest(score_func = mutual_info_classif, k = 10)),
        ("clf", RandomForestClassifier(random_state = 42))
    ])

y_perm = y.sample(frac=1.0, random_state=0).to_numpy()  # shuffled labels
print("CV on permuted labels:", cross_val_score(rf_pipeline, X, y_perm, cv=3, scoring="accuracy"))

rf_pipeline.fit(X_train, y_train)
print("Shallow tree test accuracy:", rf_pipeline.score(X_test, y_test))

param_dist = {
"select__k": [5, 10],
"clf__n_estimators": [50, 100],
"clf__max_depth": [8, 12, 16],
"clf__min_samples_split": [10, 20],
"clf__min_samples_leaf": [4, 8],
"clf__max_features": ["sqrt"],   # limit features per split
"clf__max_samples": [0.5, 0.75], # subsample rows per tree (bootstrap=True)
"clf__class_weight": ["balanced", "balanced_subsample"],
}

search = RandomizedSearchCV(
    rf_pipeline, 
    param_distributions = param_dist,
    n_iter = 20, #test 20 random combos
    scoring = "f1_weighted",
    cv = 3,
    n_jobs = -1,
    random_state = 42
)

search.fit(X_train, y_train)

print("Best params:", search.best_params_)
print("Best score:", search.best_score_)

best = search.best_estimator_

y_pred = search.predict(X_test)

y_proba = search.predict_proba(X_test)[:,1]
evaluate_clf(y_test, y_pred, y_proba = y_proba, label="Random Forest Tree")

In [None]:
xgb_pipeline = Pipeline([
        ("prep", preprocessor),
        ("smote",SMOTE(random_state = 42)),
        ("select", SelectKBest(score_func = mutual_info_classif, k = 10)),
        ("clf", XGBClassifier(objective='binary:logistic', eval_metric='logloss',random_state = 42, scale_pos_weight=len(y_train[y_train==0]) / len(y_train[y_train==1])))
    ])

param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [4, 6, 8],
    'learning_rate': [0.01, 0.05, 0.1],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0]
}

xg_model = RandomizedSearchCV(xgb_pipeline, param_grid, scoring='f1_macro', cv=3, n_iter=20, n_jobs=-1, random_state=42)
xg_model.fit(X_train, y_train)

print(xg_model.best_params_)
evaluate_clf(y_test, y_pred, y_proba = y_proba, label="XGBoost")