In this notebook, we evaluate six tree-based classifiers on the validation split — **RandomForest**, **XGBoost**, **Histogram Gradient Boosting**, **LightGBM**, **CatBoost**, and **Extra Trees** — and compare them by weighted F1-score.

In [1]:
import os
import sys
import joblib

# Work from the project root so the `src.*` imports and the relative `models/`
# paths used in this notebook resolve.
# The guard keeps this cell idempotent: an unconditional os.chdir("..") climbs one
# more directory every time the cell is re-run in the same kernel.
# Assumes the `src` package sits directly under the project root — TODO confirm.
if not os.path.isdir("src"):
    os.chdir("..")

# Register the root on sys.path only once, so re-runs do not pile up duplicates.
project_root = os.getcwd()
if project_root not in sys.path:
    sys.path.append(project_root)

import pandas as pd
from src.preprocessing import preprocess_pipeline
from src.train_model import MODELS
from src.evaluate_model import evaluate_model, cross_validate_model
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder

In [2]:
# Run the project's preprocessing pipeline and unpack its five outputs:
# training features/labels, validation features/labels, and `test_df`
# (presumably the unlabeled test frame — confirm in src.preprocessing).
X_train, X_val, y_train, y_val, test_df = preprocess_pipeline()

In [3]:
# Random Forest: reuse the persisted model when it exists; otherwise train and
# persist it, so the notebook still runs top-to-bottom on a fresh checkout
# instead of failing in joblib.load.
rf_model_path = "models/random_forest_model.joblib"
if os.path.exists(rf_model_path):
    model_RF = joblib.load(rf_model_path)
else:
    # Training recipe restored from the previously commented-out lines.
    model_RF = MODELS["random_forest"](X_train, y_train)
    os.makedirs(os.path.dirname(rf_model_path), exist_ok=True)
    joblib.dump(model_RF, rf_model_path)

# Evaluate on the validation split; report_RF is indexed later as
# report_RF["weighted avg"]["f1-score"] in the comparison cell.
report_RF = evaluate_model(model_RF, X_val, y_val, model_name="Random Forest")

# Optional 5-fold cross-validation (kept disabled, as before):
# scores_rf = cross_validate_model(model_RF, X_train, y_train, cv=5, scoring='accuracy')


 Classification Report for Random Forest:

              precision    recall  f1-score   support

           1       0.78      0.81      0.79       216
           2       0.82      0.76      0.79       216
           3       0.89      0.91      0.90       216
           4       0.96      0.97      0.97       216
           5       0.94      0.95      0.94       216
           6       0.90      0.93      0.91       216
           7       0.95      0.92      0.93       216

    accuracy                           0.89      1512
   macro avg       0.89      0.89      0.89      1512
weighted avg       0.89      0.89      0.89      1512



In [4]:
# XGBoost: reuse the persisted model when it exists; otherwise train and persist
# it, so the notebook still runs top-to-bottom on a fresh checkout.
xgb_model_path = "models/xgboost_model.joblib"
if os.path.exists(xgb_model_path):
    model_XGB = joblib.load(xgb_model_path)
else:
    # Training recipe restored from the previously commented-out lines.
    # NOTE(review): the recipe passes the raw y_train — confirm that
    # MODELS["xgboost"] takes care of any label encoding it needs.
    model_XGB = MODELS["xgboost"](X_train, y_train)
    os.makedirs(os.path.dirname(xgb_model_path), exist_ok=True)
    joblib.dump(model_XGB, xgb_model_path)

# A fresh LabelEncoder is handed to evaluate_model (presumably used to encode
# y_val before scoring — confirm in src.evaluate_model).
report_XGB = evaluate_model(model_XGB, X_val, y_val, LabelEncoder(), model_name="XGBoost")

# Encoded training labels, only needed by the optional cross-validation below.
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)

# Optional 5-fold cross-validation (kept disabled, as before):
# scores_xgb = cross_validate_model(model_XGB, X_train, y_train_encoded, cv=5, scoring='accuracy')


 Classification Report for XGBoost:

              precision    recall  f1-score   support

           0       0.81      0.82      0.82       216
           1       0.80      0.79      0.80       216
           2       0.92      0.91      0.91       216
           3       0.98      0.97      0.97       216
           4       0.93      0.96      0.95       216
           5       0.92      0.94      0.93       216
           6       0.98      0.93      0.95       216

    accuracy                           0.90      1512
   macro avg       0.90      0.90      0.90      1512
weighted avg       0.90      0.90      0.90      1512



In [5]:
# Histogram Gradient Boosting: reuse the persisted model when it exists;
# otherwise train and persist it, so the notebook still runs top-to-bottom on a
# fresh checkout.
hgb_model_path = "models/histogram_gb_model.joblib"
if os.path.exists(hgb_model_path):
    model_HGB = joblib.load(hgb_model_path)
else:
    # Training recipe restored from the previously commented-out lines.
    model_HGB = MODELS["histogram_gb"](X_train, y_train)
    os.makedirs(os.path.dirname(hgb_model_path), exist_ok=True)
    joblib.dump(model_HGB, hgb_model_path)

# A fresh LabelEncoder is handed to evaluate_model (presumably used to encode
# y_val before scoring — confirm in src.evaluate_model).
report_HGB = evaluate_model(model_HGB, X_val, y_val, LabelEncoder(), model_name="HGB")

# Optional 5-fold cross-validation (kept disabled, as before):
# scores_hgb = cross_validate_model(model_HGB, X_train, y_train, cv=5, scoring='accuracy')


 Classification Report for HGB:

              precision    recall  f1-score   support

           0       0.84      0.83      0.84       216
           1       0.83      0.81      0.82       216
           2       0.93      0.94      0.94       216
           3       0.98      0.98      0.98       216
           4       0.93      0.94      0.94       216
           5       0.93      0.95      0.94       216
           6       0.97      0.94      0.96       216

    accuracy                           0.91      1512
   macro avg       0.91      0.91      0.91      1512
weighted avg       0.91      0.91      0.91      1512



In [6]:
# LightGBM: reuse the persisted model when it exists; otherwise train and
# persist it, so the notebook still runs top-to-bottom on a fresh checkout.
lgbm_model_path = "models/lightgbm_model.joblib"
if os.path.exists(lgbm_model_path):
    model_LGBM = joblib.load(lgbm_model_path)
else:
    # Training recipe restored from the previously commented-out lines.
    model_LGBM = MODELS["lightgbm"](X_train, y_train)
    os.makedirs(os.path.dirname(lgbm_model_path), exist_ok=True)
    joblib.dump(model_LGBM, lgbm_model_path)

# Evaluate on the validation split; report_LGBM is indexed later as
# report_LGBM["weighted avg"]["f1-score"] in the comparison cell.
report_LGBM = evaluate_model(model_LGBM, X_val, y_val, model_name="LightGBM")

# Optional 5-fold cross-validation (kept disabled, as before):
# scores_lgbm = cross_validate_model(model_LGBM, X_train, y_train, cv=5, scoring='accuracy')


 Classification Report for LightGBM:

              precision    recall  f1-score   support

           1       0.83      0.80      0.81       216
           2       0.77      0.80      0.78       216
           3       0.93      0.92      0.93       216
           4       0.97      0.98      0.97       216
           5       0.92      0.94      0.93       216
           6       0.92      0.94      0.93       216
           7       0.97      0.94      0.95       216

    accuracy                           0.90      1512
   macro avg       0.90      0.90      0.90      1512
weighted avg       0.90      0.90      0.90      1512



In [7]:
# CatBoost: reuse the persisted model when it exists; otherwise train and
# persist it, so the notebook still runs top-to-bottom on a fresh checkout.
# (LabelEncoder is already imported in the notebook's import cell, so the
# duplicate mid-notebook import was dropped.)
cb_model_path = "models/catboost_model.joblib"
if os.path.exists(cb_model_path):
    model_CB = joblib.load(cb_model_path)
else:
    # Training recipe restored from the previously commented-out lines.
    model_CB = MODELS["catboost"](X_train, y_train)
    os.makedirs(os.path.dirname(cb_model_path), exist_ok=True)
    joblib.dump(model_CB, cb_model_path)

# A fresh LabelEncoder is handed to evaluate_model (presumably used to encode
# y_val before scoring — confirm in src.evaluate_model).
report_CB = evaluate_model(model_CB, X_val, y_val, LabelEncoder(), model_name="CatBoost")

# Optional 5-fold cross-validation (kept disabled, as before):
# scores_cb = cross_validate_model(model_CB, X_train, y_train, cv=5, scoring='accuracy')


 Classification Report for CatBoost:

              precision    recall  f1-score   support

           0       0.80      0.82      0.81       216
           1       0.78      0.81      0.79       216
           2       0.89      0.88      0.88       216
           3       0.98      0.97      0.97       216
           4       0.93      0.90      0.92       216
           5       0.89      0.92      0.90       216
           6       0.97      0.92      0.94       216

    accuracy                           0.89      1512
   macro avg       0.89      0.89      0.89      1512
weighted avg       0.89      0.89      0.89      1512



In [9]:
# Extra Trees: reuse the persisted model when it exists; otherwise train and
# persist it, so the notebook still runs top-to-bottom on a fresh checkout.
et_model_path = "models/extratrees_model.joblib"
if os.path.exists(et_model_path):
    model_ET = joblib.load(et_model_path)
else:
    # Training recipe restored from the previously commented-out lines.
    model_ET = MODELS["extra_trees"](X_train, y_train)
    os.makedirs(os.path.dirname(et_model_path), exist_ok=True)
    joblib.dump(model_ET, et_model_path)

# Evaluate on the validation split; report_ET is indexed later as
# report_ET["weighted avg"]["f1-score"] in the comparison cell.
report_ET = evaluate_model(model_ET, X_val, y_val, model_name="Extra Trees")

# Optional 5-fold cross-validation (kept disabled, as before):
# scores_et = cross_validate_model(model_ET, X_train, y_train, cv=5, scoring='accuracy')


 Classification Report for Extra Trees:

              precision    recall  f1-score   support

           1       0.80      0.81      0.81       216
           2       0.85      0.75      0.80       216
           3       0.90      0.92      0.91       216
           4       0.98      0.98      0.98       216
           5       0.94      0.98      0.96       216
           6       0.88      0.92      0.90       216
           7       0.94      0.94      0.94       216

    accuracy                           0.90      1512
   macro avg       0.90      0.90      0.90      1512
weighted avg       0.90      0.90      0.90      1512



In [10]:
# Model comparison: collect each model's weighted-average F1 from its
# evaluation report and rank the models from best to worst.

# Display name -> evaluation report, in the order the models were evaluated.
reports_by_model = {
    "RandomForest": report_RF,
    "XGBoost": report_XGB,
    "Histogram GB": report_HGB,
    "LightGBM": report_LGBM,
    "CatBoost": report_CB,
    "Extra Trees": report_ET,
}

performance_summary = {
    model_label: {"F1-Score": model_report["weighted avg"]["f1-score"]}
    for model_label, model_report in reports_by_model.items()
}

# Outer keys become columns, so transpose to get one row per model.
performance_df = pd.DataFrame(performance_summary).T

display(performance_df.sort_values(by="F1-Score", ascending=False))

Unnamed: 0,F1-Score
Histogram GB,0.914498
XGBoost,0.902898
LightGBM,0.901013
Extra Trees,0.899895
RandomForest,0.891238
CatBoost,0.888904
