In [13]:
import joblib
import sys
sys.path.append("../")
from src.models import train_logistic_regression, train_random_forest, train_gradient_boosting, train_lightgbm, evaluate_model
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression
import numpy as np
import pandas as pd

### Loading Processed Data

We load the preprocessed and resampled training data and the scaled test data from the files saved in the Feature Engineering notebook.


In [2]:
X_train = joblib.load('../data/X_train_resampled.joblib')
y_train = joblib.load('../data/y_train_resampled.joblib')
X_test = joblib.load('../data/X_test_scaled.joblib')
y_test = joblib.load('../data/y_test.joblib')

# Model Training and Evaluation (Initial Baselines)

We train and evaluate several baseline classification models to get an initial understanding of their performance on this task. We focus on metrics relevant to imbalanced datasets, as provided by the `evaluate_model` function.

In [3]:
print("\n--- Logistic Regression ---")
lr_model = train_logistic_regression(X_train, y_train)
evaluate_model(lr_model, X_test, y_test)


--- Logistic Regression ---
Confusion Matrix:
 [[55322  1542]
 [    8    90]]

Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.97      0.99     56864
           1       0.06      0.92      0.10        98

    accuracy                           0.97     56962
   macro avg       0.53      0.95      0.55     56962
weighted avg       1.00      0.97      0.98     56962


AUC-ROC: 0.9714


(array([0, 0, 0, ..., 0, 0, 0], shape=(56962,)),
 array([4.94563032e-03, 4.23430942e-02, 2.27043661e-05, ...,
        5.28451843e-04, 1.96395096e-03, 5.25151879e-02], shape=(56962,)))

In [4]:
print("\n--- Random Forest ---")
rf_model = train_random_forest(X_train, y_train)
evaluate_model(rf_model, X_test, y_test)


--- Random Forest ---
Confusion Matrix:
 [[56850    14]
 [   19    79]]

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     56864
           1       0.85      0.81      0.83        98

    accuracy                           1.00     56962
   macro avg       0.92      0.90      0.91     56962
weighted avg       1.00      1.00      1.00     56962


AUC-ROC: 0.9783


(array([0, 0, 0, ..., 0, 0, 0], shape=(56962,)),
 array([0.  , 0.  , 0.03, ..., 0.  , 0.  , 0.  ], shape=(56962,)))

In [23]:
print("\n--- Gradient Boosting ---")
gb_model = train_gradient_boosting(X_train, y_train)
evaluate_model(gb_model, X_test, y_test)


--- Gradient Boosting ---
Confusion Matrix:
 [[56119   745]
 [    9    89]]

Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.99      0.99     56864
           1       0.11      0.91      0.19        98

    accuracy                           0.99     56962
   macro avg       0.55      0.95      0.59     56962
weighted avg       1.00      0.99      0.99     56962


AUC-ROC: 0.9807


(array([0, 0, 0, ..., 0, 0, 0], shape=(56962,)),
 array([0.0225519 , 0.00933714, 0.08887581, ..., 0.00340834, 0.01343252,
        0.06190657], shape=(56962,)))

In [10]:
print("\n--- LightGBM ---")
lgbm_model = train_lightgbm(X_train, y_train)
evaluate_model(lgbm_model, X_test, y_test)


--- LightGBM ---
[LightGBM] [Info] Number of positive: 227451, number of negative: 227451
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.020907 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7650
[LightGBM] [Info] Number of data points in the train set: 454902, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Confusion Matrix:
 [[56796    68]
 [   13    85]]

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     56864
           1       0.56      0.87      0.68        98

    accuracy                           1.00     56962
   macro avg       0.78      0.93      0.84     56962
weighted avg       1.00      1.00      1.00     56962


AUC-ROC: 0.9463




(array([0, 0, 0, ..., 0, 0, 0], shape=(56962,)),
 array([1.50629968e-04, 2.13502983e-04, 3.49314972e-04, ...,
        6.91920436e-05, 5.07660398e-04, 1.81072489e-04], shape=(56962,)))

# Hyperparameter Tuning

To potentially improve the performance of our models, we can tune their hyperparameters. This involves searching through a predefined set of parameter values to find the combination that yields the best performance on a validation set.

Logistic Regression Tuning

In [11]:
print("\nTuning Logistic Regression...")
param_distributions_lr = {
    'C': np.logspace(-4, 4, 10), 
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear', 'saga'],
    'class_weight': ['balanced', None]
}
random_search_lr = RandomizedSearchCV(LogisticRegression(random_state=42),
                                       param_distributions=param_distributions_lr,
                                       n_iter=10, cv=3, scoring='roc_auc', n_jobs=1, random_state=42)
random_search_lr.fit(X_train, y_train)
best_lr = random_search_lr.best_estimator_
print("Best Logistic Regression parameters:", random_search_lr.best_params_)
joblib.dump(best_lr, '../models/logistic_regression_tuned.joblib')
print("Tuned Logistic Regression model saved.")


Tuning Logistic Regression...




Best Logistic Regression parameters: {'solver': 'liblinear', 'penalty': 'l2', 'class_weight': None, 'C': np.float64(1291.5496650148827)}
Tuned Logistic Regression model saved.


Random Forest Tuning

In [15]:
# Sampling 10% of training data for faster hyperparameter tuning
np.random.seed(42) 
sample_indices_rf = np.random.choice(len(X_train), size=int(0.1 * len(X_train)), replace=False)

X_train = pd.DataFrame(X_train)
y_train = pd.Series(y_train)

X_sample_rf = X_train.iloc[sample_indices_rf]
y_sample_rf = y_train.iloc[sample_indices_rf]

# Define hyperparameter search space for Random Forest
param_distributions_rf = {
    'n_estimators': [100, 200],        # number of trees
    'max_depth': [None, 10],            # max depth of tree
    'min_samples_split': [2, 5],        # min samples needed to split
    'min_samples_leaf': [1, 3],         # min samples needed at a leaf
    'class_weight': ['balanced']        # handle class imbalance
}

# Setup RandomizedSearchCV for Random Forest
print("\nTuning Random Forest on 10% of training data...")
random_search_rf = RandomizedSearchCV(
    RandomForestClassifier(random_state=42),
    param_distributions=param_distributions_rf,
    n_iter=10,                  # number of random combinations to try
    cv=3,                       # 3-fold cross-validation
    scoring='roc_auc',           # optimize for ROC-AUC
    n_jobs=1,                   # run sequentially to avoid OSError in Jupyter
    random_state=42
)

# Perform the search
random_search_rf.fit(X_sample_rf, y_sample_rf)

# Save the best model
best_rf = random_search_rf.best_estimator_
print("Best Random Forest parameters:", random_search_rf.best_params_)
joblib.dump(best_rf, '../models/random_forest_tuned.joblib')
print("Tuned Random Forest model saved.")



Tuning Random Forest on 10% of training data...
Best Random Forest parameters: {'n_estimators': 200, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_depth': None, 'class_weight': 'balanced'}
Tuned Random Forest model saved.


Gradient Boosting Tuning

In [16]:
# Taking 20% sample of training data for faster tuning
X_sample, _, y_sample, _ = train_test_split(
    X_train, y_train,
    test_size=0.9,        # Keep 90% aside, use 10%
    stratify=y_train,     # Maintain class distribution
    random_state=42       # For reproducibility
)

# Define hyperparameter search space
param_distributions_gb = {
    'n_estimators': [100, 150],      # Number of trees
    'learning_rate': [0.01, 0.1],    # Step size shrinkage
    'max_depth': [3, 5],             # Max depth of trees
    'min_samples_split': [2, 4],     # Minimum samples to split node
    'min_samples_leaf': [1, 2]       # Minimum samples per leaf node
}

# Set up RandomizedSearchCV
random_search_gb = RandomizedSearchCV(
    GradientBoostingClassifier(random_state=42),
    param_distributions=param_distributions_gb,
    n_iter=10,              # Try 10 random combinations
    cv=3,                   # 3-fold cross validation
    scoring='roc_auc',      # Use ROC-AUC for scoring
    n_jobs=1,               # run sequentially
    random_state=42         # For reproducibility
)

# Perform hyperparameter tuning on 10% sample
print("\nTuning Gradient Boosting on 10% of training data...")
random_search_gb.fit(X_sample, y_sample)

# Get the best model from search
best_gb = random_search_gb.best_estimator_
print("Best Gradient Boosting parameters:", random_search_gb.best_params_)

# Retrain best model on FULL training data
print("\nRetraining best Gradient Boosting model on FULL training data...")
best_gb.fit(X_train, y_train)

# Save the final model
joblib.dump(best_gb, '../models/gradient_boosting_tuned.joblib')
print("Final tuned Gradient Boosting model saved.")



Tuning Gradient Boosting on 10% of training data...
Best Gradient Boosting parameters: {'n_estimators': 150, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_depth': 5, 'learning_rate': 0.1}

Retraining best Gradient Boosting model on FULL training data...
Final tuned Gradient Boosting model saved.


LightGBM Tuning

In [17]:
print("\nTuning LightGBM...")
param_distributions_lgbm = {
    'n_estimators': [100, 150],  
    'learning_rate': [0.01, 0.1],
    'num_leaves': [31, 40],      
    'class_weight': ['balanced', None]
}
random_search_lgbm = RandomizedSearchCV(LGBMClassifier(random_state=42),
                                        param_distributions=param_distributions_lgbm,
                                        n_iter=10, cv=3, scoring='roc_auc', n_jobs=1, random_state=42)
random_search_lgbm.fit(X_train, y_train)
best_lgbm = random_search_lgbm.best_estimator_
print("Best LightGBM parameters:", random_search_lgbm.best_params_)
joblib.dump(best_lgbm, '../models/lightgbm_tuned.joblib')
print("Tuned LightGBM model saved.")


Tuning LightGBM...
[LightGBM] [Info] Number of positive: 151634, number of negative: 151634
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.022070 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7650
[LightGBM] [Info] Number of data points in the train set: 303268, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 151634, number of negative: 151634
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.026996 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7650
[LightGBM] [Info] Number of data points in the train set: 303268, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 151634, number of negative: 151634
[LightGBM] [Info] Auto-

# FInding the best model

In [19]:
random_search_rf.best_score_

np.float64(0.99994430332102)

In [20]:
random_search_lr.best_score_

np.float64(0.9916568588276671)

In [21]:
random_search_gb.best_score_

np.float64(0.9998275846550692)

In [22]:
random_search_lgbm.best_score_

np.float64(0.9999481089281613)

In [18]:
best_model = None
best_auc = 0
best_model_name = ""

if random_search_lr.best_score_ > best_auc:
    best_auc = random_search_lr.best_score_
    best_model = best_lr
    best_model_name = "Logistic Regression"

if random_search_rf.best_score_ > best_auc:
    best_auc = random_search_rf.best_score_
    best_model = best_rf
    best_model_name = "Random Forest"

if random_search_gb.best_score_ > best_auc:
    best_auc = random_search_gb.best_score_
    best_model = best_gb
    best_model_name = "Gradient Boosting"

if random_search_lgbm.best_score_ > best_auc:
    best_auc = random_search_lgbm.best_score_
    best_model = best_lgbm
    best_model_name = "LightGBM"

if best_model:
    joblib.dump(best_model, '../models/best_model.joblib')
    print(f"\nBest model ({best_model_name}) with AUC: {best_auc:.4f} saved as best_model.joblib.")
else:
    print("\nNo best model found during hyperparameter tuning.")


Best model (LightGBM) with AUC: 0.9999 saved as best_model.joblib.
