In [5]:
import os
import sys
import joblib

# Run from the project root so that the `src` package imported below and the
# relative "models/..." paths used later resolve. The guard keeps this cell
# idempotent: an unconditional os.chdir("..") climbs one directory further
# every time the cell is re-executed.
if not os.path.isdir("src"):
    os.chdir("..")

# Make the working directory importable, without appending a duplicate entry
# when the cell is executed more than once.
if os.getcwd() not in sys.path:
    sys.path.append(os.getcwd())

from src.preprocessing import preprocess_pipeline
from src.evaluate_model import evaluate_model
from src.hyperparameter_optimization import *
import warnings
# Install a blanket "ignore" filter so that no warnings appear in the cell
# outputs below.
# NOTE(review): this hides every warning category, including ones raised by the
# modelling libraries — consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

In [6]:
# Load the training / validation data used by every model cell below.
# preprocess_pipeline() yields five values; only the first four are used in
# this notebook. The fifth is bound to a named throwaway rather than `_`:
# assigning to `_` in IPython shadows the shell's last-output variable and
# stops it from being updated for the rest of the session.
X_train, X_val, y_train, y_val, _unused = preprocess_pipeline()

In [3]:
# Random Forest
# Tune on the training split (method='grid' selects the search strategy inside
# src.hyperparameter_optimization), then report on the validation split.
best_rf, best_params_rf = optimize_random_forest(
    X_train,
    y_train,
    method='grid',
)
print('Best Random Forest Parameters:', best_params_rf)

# Left as the cell's final expression so the value returned by evaluate_model
# is displayed underneath the printed classification report.
evaluate_model(
    best_rf,
    X_val,
    y_val,
    model_name="Optimized RandomForest",
)

Best Random Forest Parameters: {'criterion': 'entropy', 'max_depth': 20, 'max_features': 'sqrt', 'n_estimators': 200}

 Classification Report for Optimized RandomForest:

              precision    recall  f1-score   support

           1       0.79      0.82      0.80       216
           2       0.79      0.77      0.78       216
           3       0.90      0.90      0.90       216
           4       0.97      0.96      0.97       216
           5       0.95      0.95      0.95       216
           6       0.89      0.91      0.90       216
           7       0.97      0.93      0.95       216

    accuracy                           0.89      1512
   macro avg       0.89      0.89      0.89      1512
weighted avg       0.89      0.89      0.89      1512



{'1': {'precision': 0.7866666666666666,
  'recall': 0.8194444444444444,
  'f1-score': 0.8027210884353742,
  'support': 216.0},
 '2': {'precision': 0.7877358490566038,
  'recall': 0.7731481481481481,
  'f1-score': 0.780373831775701,
  'support': 216.0},
 '3': {'precision': 0.8981481481481481,
  'recall': 0.8981481481481481,
  'f1-score': 0.8981481481481481,
  'support': 216.0},
 '4': {'precision': 0.9674418604651163,
  'recall': 0.9629629629629629,
  'f1-score': 0.9651972157772621,
  'support': 216.0},
 '5': {'precision': 0.9534883720930233,
  'recall': 0.9490740740740741,
  'f1-score': 0.951276102088167,
  'support': 216.0},
 '6': {'precision': 0.8873873873873874,
  'recall': 0.9120370370370371,
  'f1-score': 0.8995433789954338,
  'support': 216.0},
 '7': {'precision': 0.966183574879227,
  'recall': 0.9259259259259259,
  'f1-score': 0.9456264775413712,
  'support': 216.0},
 'accuracy': 0.8915343915343915,
 'macro avg': {'precision': 0.8924359798137388,
  'recall': 0.8915343915343915,
 

In [4]:
# LightGBM
# Tune on the training split (method='bayesian' selects the search strategy
# inside src.hyperparameter_optimization), then report on the validation split.
best_lgbm, best_params_lgbm = optimize_lightgbm(
    X_train,
    y_train,
    method='bayesian',
)
print('Best LightGBM Parameters:', best_params_lgbm)

# Left as the cell's final expression so the value returned by evaluate_model
# is displayed underneath the printed classification report.
evaluate_model(
    best_lgbm,
    X_val,
    y_val,
    model_name="Optimized LightGBM",
)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001396 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3920
[LightGBM] [Info] Number of data points in the train set: 19440, number of used features: 63
[LightGBM] [Info] Start training from score -1.609438
[LightGBM] [Info] Start training from score -1.203973
[LightGBM] [Info] Start training from score -2.302585
[LightGBM] [Info] Start training from score -2.302585
[LightGBM] [Info] Start training from score -2.302585
[LightGBM] [Info] Start training from score -2.302585
[LightGBM] [Info] Start training from score -2.302585
Best LightGBM Parameters: OrderedDict([('learning_rate', 0.1), ('n_estimators', 200), ('num_leaves', 127)])

 Classification Report for Optimized LightGBM:

              precision    recall  f1-score   support

           1       0.83      0.84      0.84       216
           2       0.83      0.79      0.81       216
           3 

{'1': {'precision': 0.8348623853211009,
  'recall': 0.8425925925925926,
  'f1-score': 0.8387096774193549,
  'support': 216.0},
 '2': {'precision': 0.8341463414634146,
  'recall': 0.7916666666666666,
  'f1-score': 0.8123515439429929,
  'support': 216.0},
 '3': {'precision': 0.9276018099547512,
  'recall': 0.9490740740740741,
  'f1-score': 0.9382151029748284,
  'support': 216.0},
 '4': {'precision': 0.986046511627907,
  'recall': 0.9814814814814815,
  'f1-score': 0.9837587006960556,
  'support': 216.0},
 '5': {'precision': 0.9324324324324325,
  'recall': 0.9583333333333334,
  'f1-score': 0.9452054794520548,
  'support': 216.0},
 '6': {'precision': 0.9227272727272727,
  'recall': 0.9398148148148148,
  'f1-score': 0.9311926605504587,
  'support': 216.0},
 '7': {'precision': 0.9715639810426541,
  'recall': 0.9490740740740741,
  'f1-score': 0.9601873536299765,
  'support': 216.0},
 'accuracy': 0.916005291005291,
 'macro avg': {'precision': 0.9156258192242189,
  'recall': 0.916005291005291,
 

In [6]:
# XGBoost + Optuna
# Run a 20-trial search on the training split, then report on the validation
# split.
best_xgb, best_params_xgb = optimize_xgboost_with_optuna(
    X_train,
    y_train,
    n_trials=20,
)
print('Best XGBoost Parameters:', best_params_xgb)

# NOTE(review): LabelEncoder is not imported explicitly in this notebook; it is
# presumably supplied by the star import from src.hyperparameter_optimization —
# confirm. A fresh, unfitted encoder is passed positionally as the fourth
# argument, and the report for this model is labelled 0-6 whereas the other
# models' reports are labelled 1-7 — verify this is intended.
evaluate_model(
    best_xgb,
    X_val,
    y_val,
    LabelEncoder(),
    model_name="Optimized XGBoost + Optuna",
)


[I 2025-04-13 21:52:18,570] A new study created in memory with name: no-name-8de3f468-ebb2-474f-ae3e-d21d29c8efc3
[I 2025-04-13 21:52:26,406] Trial 0 finished with value: 0.9726851851851852 and parameters: {'n_estimators': 429, 'learning_rate': 0.05041988031625554, 'max_depth': 7, 'subsample': 0.7401677738394841, 'colsample_bytree': 0.8217010356172736, 'gamma': 0.6148375933251887, 'reg_alpha': 0.8797841696272523, 'reg_lambda': 5.226291157532746, 'min_child_weight': 8}. Best is trial 0 with value: 0.9726851851851852.
[I 2025-04-13 21:52:35,155] Trial 1 finished with value: 0.8635802469135803 and parameters: {'n_estimators': 362, 'learning_rate': 0.022471283335776776, 'max_depth': 5, 'subsample': 0.7623296480049301, 'colsample_bytree': 0.8502472368733746, 'gamma': 0.97826758130165, 'reg_alpha': 0.885132962621057, 'reg_lambda': 9.850918345047036, 'min_child_weight': 2}. Best is trial 0 with value: 0.9726851851851852.
[I 2025-04-13 21:52:47,123] Trial 2 finished with value: 0.9939300411522

Best XGBoost Parameters: {'n_estimators': 395, 'learning_rate': 0.07416808349362083, 'max_depth': 12, 'subsample': 0.9103670296402947, 'colsample_bytree': 0.8942993312162386, 'gamma': 0.00998966257243918, 'reg_alpha': 0.7301952095976469, 'reg_lambda': 6.858922762004385, 'min_child_weight': 7}

 Classification Report for Optimized XGBoost + Optuna:

              precision    recall  f1-score   support

           0       0.81      0.81      0.81       216
           1       0.80      0.79      0.80       216
           2       0.90      0.92      0.91       216
           3       0.97      0.97      0.97       216
           4       0.93      0.94      0.93       216
           5       0.91      0.93      0.92       216
           6       0.97      0.94      0.95       216

    accuracy                           0.90      1512
   macro avg       0.90      0.90      0.90      1512
weighted avg       0.90      0.90      0.90      1512



{'0': {'precision': 0.8110599078341014,
  'recall': 0.8148148148148148,
  'f1-score': 0.812933025404157,
  'support': 216.0},
 '1': {'precision': 0.8028169014084507,
  'recall': 0.7916666666666666,
  'f1-score': 0.7972027972027972,
  'support': 216.0},
 '2': {'precision': 0.9041095890410958,
  'recall': 0.9166666666666666,
  'f1-score': 0.9103448275862069,
  'support': 216.0},
 '3': {'precision': 0.967741935483871,
  'recall': 0.9722222222222222,
  'f1-score': 0.9699769053117783,
  'support': 216.0},
 '4': {'precision': 0.9269406392694064,
  'recall': 0.9398148148148148,
  'f1-score': 0.9333333333333333,
  'support': 216.0},
 '5': {'precision': 0.91324200913242,
  'recall': 0.9259259259259259,
  'f1-score': 0.9195402298850575,
  'support': 216.0},
 '6': {'precision': 0.9711538461538461,
  'recall': 0.9351851851851852,
  'f1-score': 0.9528301886792453,
  'support': 216.0},
 'accuracy': 0.8994708994708994,
 'macro avg': {'precision': 0.8995806897604559,
  'recall': 0.8994708994708994,
  

In [12]:
# Histogram Gradient Boosting
# Tune on the training split (method='bayesian' selects the search strategy
# inside src.hyperparameter_optimization), then report on the validation split.
best_hgb, best_params_hgb = optimize_histgb(
    X_train,
    y_train,
    method='bayesian',
)
print('Best Histogram Gradient Boosting Parameters:', best_params_hgb)

# Left as the cell's final expression so the value returned by evaluate_model
# is displayed underneath the printed classification report.
evaluate_model(
    best_hgb,
    X_val,
    y_val,
    model_name="Optimized Histogram Gradient Boosting",
)

Best Histogram Gradient Boosting Parameters: OrderedDict([('l2_regularization', 0), ('learning_rate', 0.1), ('max_bins', 255), ('max_iter', 200), ('max_leaf_nodes', 127)])

 Classification Report for Optimized Histogram Gradient Boosting:

              precision    recall  f1-score   support

           1       0.83      0.83      0.83       216
           2       0.81      0.81      0.81       216
           3       0.93      0.93      0.93       216
           4       0.97      0.96      0.97       216
           5       0.94      0.95      0.94       216
           6       0.92      0.94      0.93       216
           7       0.97      0.94      0.95       216

    accuracy                           0.91      1512
   macro avg       0.91      0.91      0.91      1512
weighted avg       0.91      0.91      0.91      1512



{'1': {'precision': 0.8325581395348837,
  'recall': 0.8287037037037037,
  'f1-score': 0.8306264501160093,
  'support': 216.0},
 '2': {'precision': 0.813953488372093,
  'recall': 0.8101851851851852,
  'f1-score': 0.8120649651972158,
  'support': 216.0},
 '3': {'precision': 0.9262672811059908,
  'recall': 0.9305555555555556,
  'f1-score': 0.9284064665127021,
  'support': 216.0},
 '4': {'precision': 0.9719626168224299,
  'recall': 0.9629629629629629,
  'f1-score': 0.9674418604651163,
  'support': 216.0},
 '5': {'precision': 0.9363636363636364,
  'recall': 0.9537037037037037,
  'f1-score': 0.944954128440367,
  'support': 216.0},
 '6': {'precision': 0.918552036199095,
  'recall': 0.9398148148148148,
  'f1-score': 0.9290617848970252,
  'support': 216.0},
 '7': {'precision': 0.9666666666666667,
  'recall': 0.9398148148148148,
  'f1-score': 0.9530516431924883,
  'support': 216.0},
 'accuracy': 0.9093915343915344,
 'macro avg': {'precision': 0.9094748378663995,
  'recall': 0.9093915343915343,
 

In [13]:
# Persist the four tuned models. The paths are relative to the current working
# directory. joblib.dump does not create missing parent directories, so make
# sure the output folder exists before writing (no-op when it already does).
os.makedirs("models", exist_ok=True)

joblib.dump(best_rf, "models/optimized_randomforest_model.joblib")
joblib.dump(best_lgbm, "models/optimized_lightgbm_model.joblib")
joblib.dump(best_xgb, "models/optimized_xgboost_model.joblib")
# Final expression: the list of written filenames returned by joblib.dump is
# shown as the cell output.
joblib.dump(best_hgb, "models/optimized_hgboost_model.joblib")

['models/optimized_hgboost_model.joblib']

In [8]:
from src.ensemble import build_stacking_ensemble

# Build the stacking ensemble and train it on the training split.
stacking_model = build_stacking_ensemble()
stacking_model.fit(X_train, y_train)

# Print the validation-set classification report for the fitted ensemble.
evaluate_model(stacking_model, X_val, y_val, model_name="Stacking Ensemble")

# joblib.dump does not create missing parent directories, so make sure the
# output folder exists before writing (no-op when it already does).
os.makedirs("models", exist_ok=True)
joblib.dump(stacking_model, "models/stacking_ensemble_model.joblib")


 Classification Report for Stacking Ensemble:

              precision    recall  f1-score   support

           1       0.79      0.87      0.83       216
           2       0.88      0.71      0.79       216
           3       0.94      0.95      0.94       216
           4       0.99      0.98      0.98       216
           5       0.91      0.98      0.94       216
           6       0.92      0.95      0.94       216
           7       0.96      0.94      0.95       216

    accuracy                           0.91      1512
   macro avg       0.91      0.91      0.91      1512
weighted avg       0.91      0.91      0.91      1512



['models/stacking_ensemble_model.joblib']