In [21]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from catboost import CatBoostClassifier
from xgboost import XGBClassifier

In [22]:
df = pd.read_csv("data/student_data.csv")

df.head()

Unnamed: 0,StudentID,Age,Gender,Ethnicity,ParentalEducation,StudyTimeWeekly,Absences,Tutoring,ParentalSupport,Extracurricular,Sports,Music,Volunteering,GPA,GradeClass
0,1001,17,1,0,2,19.833723,7,1,2,0,0,1,0,2.929196,2.0
1,1002,18,0,0,1,15.408756,0,0,1,0,0,0,0,3.042915,1.0
2,1003,15,0,2,3,4.21057,26,0,2,0,0,0,0,0.112602,4.0
3,1004,17,1,0,3,10.028829,14,0,3,1,0,0,0,2.054218,3.0
4,1005,17,1,0,2,4.672495,17,1,3,0,0,0,0,1.288061,4.0


In [23]:
df.drop("StudentID", axis=1, inplace=True)

In [24]:
df.head(2)

Unnamed: 0,Age,Gender,Ethnicity,ParentalEducation,StudyTimeWeekly,Absences,Tutoring,ParentalSupport,Extracurricular,Sports,Music,Volunteering,GPA,GradeClass
0,17,1,0,2,19.833723,7,1,2,0,0,1,0,2.929196,2.0
1,18,0,0,1,15.408756,0,0,1,0,0,0,0,3.042915,1.0


In [25]:
df_target = df[['GradeClass']]
df_features = df.drop(columns=["GradeClass"])

In [26]:
df_target.head(2)

Unnamed: 0,GradeClass
0,2.0
1,1.0


In [27]:
df_features.head(2)

Unnamed: 0,Age,Gender,Ethnicity,ParentalEducation,StudyTimeWeekly,Absences,Tutoring,ParentalSupport,Extracurricular,Sports,Music,Volunteering,GPA
0,17,1,0,2,19.833723,7,1,2,0,0,1,0,2.929196
1,18,0,0,1,15.408756,0,0,1,0,0,0,0,3.042915


In [28]:
numeric_cols = ['Age', 'StudyTimeWeekly', 'Absences']
onehot_cols  = ['Ethnicity']               
ordinal_cols = ['ParentalEducation', 'ParentalSupport']  
binary_cols  = ['Gender','Tutoring','Extracurricular','Sports','Music','Volunteering']

In [29]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline

In [30]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_cols),
        ('oh', OneHotEncoder(handle_unknown='ignore', sparse_output=False), onehot_cols),
        ('passthrough', 'passthrough', ordinal_cols + binary_cols),
    ]
)

In [31]:
X = df_features.drop(columns=['GPA'])
y = df_target['GradeClass']

In [32]:
X

Unnamed: 0,Age,Gender,Ethnicity,ParentalEducation,StudyTimeWeekly,Absences,Tutoring,ParentalSupport,Extracurricular,Sports,Music,Volunteering
0,17,1,0,2,19.833723,7,1,2,0,0,1,0
1,18,0,0,1,15.408756,0,0,1,0,0,0,0
2,15,0,2,3,4.210570,26,0,2,0,0,0,0
3,17,1,0,3,10.028829,14,0,3,1,0,0,0
4,17,1,0,2,4.672495,17,1,3,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
2387,18,1,0,3,10.680555,2,0,4,1,0,0,0
2388,17,0,0,1,7.583217,4,1,4,0,1,0,0
2389,16,1,0,2,6.805500,20,0,2,0,0,0,1
2390,16,1,1,0,12.416653,17,0,2,0,1,1,0


In [33]:
y

0       2.0
1       1.0
2       4.0
3       3.0
4       4.0
       ... 
2387    0.0
2388    4.0
2389    2.0
2390    1.0
2391    1.0
Name: GradeClass, Length: 2392, dtype: float64

In [34]:
from sklearn.model_selection import train_test_split

In [35]:
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

In [36]:
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp)

In [37]:
print("Train:", X_train.shape, " Val:", X_val.shape, " Test:", X_test.shape)

print("Class distribution (train):")
print(y_train.value_counts(normalize=True).sort_index())

print("Class distribution (val):")
print(y_val.value_counts(normalize=True).sort_index())

print("Class distribution (test):")
print(y_test.value_counts(normalize=True).sort_index())

Train: (1674, 12)  Val: (359, 12)  Test: (359, 12)
Class distribution (train):
GradeClass
0.0    0.044803
1.0    0.112306
2.0    0.163680
3.0    0.173238
4.0    0.505974
Name: proportion, dtype: float64
Class distribution (val):
GradeClass
0.0    0.044568
1.0    0.111421
2.0    0.164345
3.0    0.172702
4.0    0.506964
Name: proportion, dtype: float64
Class distribution (test):
GradeClass
0.0    0.044568
1.0    0.114206
2.0    0.161560
3.0    0.172702
4.0    0.506964
Name: proportion, dtype: float64


In [38]:
X_train

Unnamed: 0,Age,Gender,Ethnicity,ParentalEducation,StudyTimeWeekly,Absences,Tutoring,ParentalSupport,Extracurricular,Sports,Music,Volunteering
1238,15,1,2,3,2.670311,28,0,3,0,0,0,0
1932,15,1,2,0,4.452985,6,0,4,1,0,0,0
1640,18,0,0,2,11.723647,28,0,3,0,0,1,1
1599,15,0,0,1,2.527403,19,0,2,0,0,0,0
34,16,0,0,3,9.126336,27,1,2,1,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
739,18,0,2,2,4.895997,4,0,1,0,0,0,0
2008,15,1,0,1,14.041944,13,1,4,1,0,0,0
1709,16,1,0,1,13.794031,13,0,4,0,0,1,1
1557,17,0,0,2,9.083349,8,0,1,1,1,0,0


In [39]:
# All Models!

from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold
from sklearn.pipeline import Pipeline
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE

from sklearn.metrics import classification_report, f1_score, accuracy_score, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

from xgboost import XGBClassifier
from catboost import CatBoostClassifier

import numpy as np
import pandas as pd
import joblib

In [40]:
RANDOM_STATE = 42
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)
SCORING = 'f1_macro'
N_ITER = 20 
N_JOBS = -1

In [41]:
def build_pipeline(estimator, use_smote=False):
    """
    Returns a pipeline that applies:
        preprocessor -> (optionally) SMOTE -> estimator
    """
    if use_smote:
        return ImbPipeline(steps=[
            ('prep', preprocessor),
            ('smote', SMOTE(random_state=RANDOM_STATE)),
            ('model', estimator)
        ])
    else:
        return Pipeline(steps=[
            ('prep', preprocessor),
            ('model', estimator)
        ])

In [69]:
search_space = []

In [70]:
# logistic regression
search_space.append((
    "LogReg",
    LogisticRegression(multi_class='multinomial', solver='saga', max_iter=3000, n_jobs=N_JOBS, random_state=RANDOM_STATE),
    {
        'model__C': np.logspace(-3, 2, 20),
        'model__penalty': ['l1', 'l2', 'elasticnet'],
        'model__l1_ratio': [0.0, 0.25, 0.5, 0.75, 1.0], 
        'model__class_weight': [None, 'balanced']
    }
))


In [71]:
# SVC
search_space.append((
    "SVC",
    SVC(probability=False, random_state=RANDOM_STATE),
    {
        'model__C': np.logspace(-2, 2, 15),
        'model__kernel': ['rbf', 'linear'],
        'model__gamma': ['scale', 'auto'],
        'model__class_weight': [None, 'balanced']
    }
))

In [72]:
# Random Forest
search_space.append((
    "RandomForest",
    RandomForestClassifier(random_state=RANDOM_STATE, n_jobs=N_JOBS),
    {
        'model__n_estimators': [200, 300, 400, 600],
        'model__max_depth': [None, 10, 20, 30],
        'model__min_samples_split': [2, 5, 10],
        'model__min_samples_leaf': [1, 2, 4],
        'model__max_features': ['sqrt', 'log2', None],
        'model__class_weight': [None, 'balanced_subsample', 'balanced']
    }
))

In [73]:
# knn
search_space.append((
    "KNN",
    KNeighborsClassifier(),
    {
        'model__n_neighbors': list(range(3, 41, 2)),
        'model__weights': ['uniform', 'distance'],
        'model__p': [1, 2]  # Manhattan or Euclidean
    }
))

In [74]:
# XGB
search_space.append((
    "XGB",
    XGBClassifier(
        objective='multi:softprob',
        eval_metric='mlogloss',
        random_state=RANDOM_STATE,
        n_estimators=400,
        tree_method='hist'  # fast on CPU
    ),
    {
        'model__max_depth': [3, 4, 5, 6, 8],
        'model__learning_rate': np.logspace(-2, -0.1, 10),
        'model__subsample': [0.6, 0.8, 1.0],
        'model__colsample_bytree': [0.6, 0.8, 1.0],
        'model__min_child_weight': [1, 3, 5]
    }
))

In [75]:
search_space

[('LogReg',
  LogisticRegression(max_iter=3000, multi_class='multinomial', n_jobs=-1,
                     random_state=42, solver='saga'),
  {'model__C': array([1.00000000e-03, 1.83298071e-03, 3.35981829e-03, 6.15848211e-03,
          1.12883789e-02, 2.06913808e-02, 3.79269019e-02, 6.95192796e-02,
          1.27427499e-01, 2.33572147e-01, 4.28133240e-01, 7.84759970e-01,
          1.43844989e+00, 2.63665090e+00, 4.83293024e+00, 8.85866790e+00,
          1.62377674e+01, 2.97635144e+01, 5.45559478e+01, 1.00000000e+02]),
   'model__penalty': ['l1', 'l2', 'elasticnet'],
   'model__l1_ratio': [0.0, 0.25, 0.5, 0.75, 1.0],
   'model__class_weight': [None, 'balanced']}),
 ('SVC',
  SVC(random_state=42),
  {'model__C': array([1.00000000e-02, 1.93069773e-02, 3.72759372e-02, 7.19685673e-02,
          1.38949549e-01, 2.68269580e-01, 5.17947468e-01, 1.00000000e+00,
          1.93069773e+00, 3.72759372e+00, 7.19685673e+00, 1.38949549e+01,
          2.68269580e+01, 5.17947468e+01, 1.00000000e+02]),
 

In [76]:
len(search_space)

5

In [77]:
results = []

In [78]:
def run_search(name, estimator, param_dist, use_smote):
    pipe = build_pipeline(estimator, use_smote=use_smote)
    label = f"{name}{' + SMOTE' if use_smote else ''}"

    try:
        search = RandomizedSearchCV(
            estimator=pipe,
            param_distributions=param_dist,
            n_iter=N_ITER,
            scoring=SCORING,
            cv=cv,
            n_jobs=N_JOBS,
            random_state=RANDOM_STATE,
            refit=True,
            verbose=1
        )
        search.fit(X_train, y_train)
        y_val_pred = search.predict(X_val)
        val_f1 = f1_score(y_val, y_val_pred, average='macro')
        val_acc = accuracy_score(y_val, y_val_pred)

        print(f"\n=== {label} ===")
        print("Best CV macro-F1:", round(search.best_score_, 4))
        print("VAL macro-F1:", round(val_f1, 4), "| VAL acc:", round(val_acc, 4))
        print("VAL report:\n", classification_report(y_val, y_val_pred, digits=3))
        print("VAL confusion matrix:\n", confusion_matrix(y_val, y_val_pred))

        results.append({
            'model': label,
            'best_params': search.best_params_,
            'val_f1_macro': val_f1,
            'val_accuracy': val_acc,
            'cv_best_score': search.best_score_,
            'estimator': search.best_estimator_
        })
    except Exception as e:
        print(f"[SKIPPED] {label} due to error:", e)

In [79]:
for use_smote in [False, True]:
    for name, est, param_dist in search_space:
        run_search(name, est, param_dist, use_smote)

Fitting 5 folds for each of 20 candidates, totalling 100 fits





=== LogReg ===
Best CV macro-F1: 0.5204
VAL macro-F1: 0.5377 | VAL acc: 0.7298
VAL report:
               precision    recall  f1-score   support

         0.0      0.333     0.125     0.182        16
         1.0      0.487     0.475     0.481        40
         2.0      0.494     0.644     0.559        59
         3.0      0.667     0.452     0.538        62
         4.0      0.897     0.962     0.928       182

    accuracy                          0.730       359
   macro avg      0.576     0.531     0.538       359
weighted avg      0.720     0.730     0.717       359

VAL confusion matrix:
 [[  2   7   2   3   2]
 [  3  19  14   0   4]
 [  1   9  38   8   3]
 [  0   1  22  28  11]
 [  0   3   1   3 175]]
Fitting 5 folds for each of 20 candidates, totalling 100 fits

=== SVC ===
Best CV macro-F1: 0.5538
VAL macro-F1: 0.5114 | VAL acc: 0.6462
VAL report:
               precision    recall  f1-score   support

         0.0      0.250     0.438     0.318        16
         1.0      




=== LogReg + SMOTE ===
Best CV macro-F1: 0.5019
VAL macro-F1: 0.443 | VAL acc: 0.5877
VAL report:
               precision    recall  f1-score   support

         0.0      0.130     0.375     0.194        16
         1.0      0.364     0.400     0.381        40
         2.0      0.457     0.356     0.400        59
         3.0      0.351     0.419     0.382        62
         4.0      0.953     0.780     0.858       182

    accuracy                          0.588       359
   macro avg      0.451     0.466     0.443       359
weighted avg      0.665     0.588     0.618       359

VAL confusion matrix:
 [[  6   4   2   3   1]
 [ 16  16   4   1   3]
 [ 12  18  21   8   0]
 [  9   5  19  26   3]
 [  3   1   0  36 142]]
Fitting 5 folds for each of 20 candidates, totalling 100 fits

=== SVC + SMOTE ===
Best CV macro-F1: 0.5613
VAL macro-F1: 0.5314 | VAL acc: 0.6769
VAL report:
               precision    recall  f1-score   support

         0.0      0.140     0.375     0.203        16
   

In [80]:
results_df = pd.DataFrame(results).sort_values(by='val_f1_macro', ascending=False)

In [81]:
results_df

Unnamed: 0,model,best_params,val_f1_macro,val_accuracy,cv_best_score,estimator
9,XGB + SMOTE,"{'model__subsample': 0.8, 'model__min_child_we...",0.549036,0.690808,0.587219,"(ColumnTransformer(transformers=[('num', Stand..."
2,RandomForest,"{'model__n_estimators': 300, 'model__min_sampl...",0.543152,0.682451,0.593051,"(ColumnTransformer(transformers=[('num', Stand..."
7,RandomForest + SMOTE,"{'model__n_estimators': 300, 'model__min_sampl...",0.542618,0.67688,0.569333,"(ColumnTransformer(transformers=[('num', Stand..."
0,LogReg,"{'model__penalty': 'l1', 'model__l1_ratio': 0....",0.5377,0.729805,0.52045,"(ColumnTransformer(transformers=[('num', Stand..."
6,SVC + SMOTE,"{'model__kernel': 'linear', 'model__gamma': 'a...",0.531357,0.67688,0.561293,"(ColumnTransformer(transformers=[('num', Stand..."
4,XGB,"{'model__subsample': 0.6, 'model__min_child_we...",0.522773,0.682451,0.57701,"(ColumnTransformer(transformers=[('num', Stand..."
1,SVC,"{'model__kernel': 'rbf', 'model__gamma': 'auto...",0.511449,0.64624,0.553754,"(ColumnTransformer(transformers=[('num', Stand..."
5,LogReg + SMOTE,"{'model__penalty': 'l1', 'model__l1_ratio': 0....",0.442972,0.587744,0.501911,"(ColumnTransformer(transformers=[('num', Stand..."
3,KNN,"{'model__weights': 'uniform', 'model__p': 1, '...",0.419699,0.612813,0.426305,"(ColumnTransformer(transformers=[('num', Stand..."
8,KNN + SMOTE,"{'model__weights': 'distance', 'model__p': 1, ...",0.407797,0.568245,0.444974,"(ColumnTransformer(transformers=[('num', Stand..."
