In [1]:
import sys
import os
from pathlib import Path
import warnings

# Install a blanket "ignore" filter: no category or module is given, so every
# warning raised after this point in the kernel session is suppressed.
# NOTE(review): this would also hide any warnings raised by the model code run
# further down — consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

In [2]:
# Resolve the parent of the current working directory and make it importable,
# so the `src` package imported in the following cell can be found.
project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
if sys.path.count(project_root) == 0:
    # Appended (not prepended), so existing entries keep import precedence.
    sys.path.append(project_root)

In [3]:
from src.model import model, train, test
from src.utils import get_config

In [4]:
# Load the project configuration through the project's get_config helper.
# The resulting object is read later by run_model, which looks up
# config['paths']['model_data_directory'].
config = get_config.read_yaml_from_main()
# NOTE(review): printed unconditionally once the call above returns; the
# contents of `config` are not validated here.
print("Configuration loaded successfully!")

Configuration loaded successfully!


In [5]:
def run_model(model_name):
    """Run the train step and then the test step for one named model.

    The artifact path is built as ``<model_data_directory>/<model_name>_model.joblib``,
    where the directory comes from the notebook-level ``config`` under
    ``config['paths']['model_data_directory']``.

    Args:
        model_name: Identifier handed to both ``train.train_model`` and
            ``test.test_model`` (e.g. ``'lightgbm'``); also used as the
            artifact file-name prefix.

    Returns:
        None. Whatever the two project helpers print is the visible result.
    """
    artifact_path = Path(config['paths']['model_data_directory']).joinpath(
        f"{model_name}_model.joblib"
    )
    # Train first: the test step receives only the model name, not the path.
    train.train_model(model_name, artifact_path)
    test.test_model(model_name)

In [6]:
# LightGBM: run the train and test steps through run_model.
model_name = "lightgbm"
run_model(model_name=model_name)

--- Preparing to Train Model: lightgbm ---
Reading file: clean_train_data.csv
Successfully read file!
Reading file: clean_val_data.csv
Successfully read file!
--- Fitting LightGBMModel ---
[LightGBM] [Info] Number of positive: 17377, number of negative: 197880
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.045975 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 12872
[LightGBM] [Info] Number of data points in the train set: 215257, number of used features: 81
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000
Validation ROC AUC: 0.6132
Validation PR AUC (AUC-PR): 0.1283
Model saved to ..\models\lightgbm_model.joblib
Reading file: lightgbm_model.joblib
Successfully read file!
Reading file: clean_test_data.csv
Successfully read file!

--- Test Set Performance ---
Test ROC AUC: 0.6103
Test PR AUC: 0.1275


In [7]:
# XGBoost: run the train and test steps through run_model.
model_name = "xgboost"
run_model(model_name=model_name)

--- Preparing to Train Model: xgboost ---
Reading file: clean_train_data.csv
Successfully read file!
Reading file: clean_val_data.csv
Successfully read file!
--- Fitting XGBoostModel ---
Validation ROC AUC: 0.5884
Validation PR AUC (AUC-PR): 0.1171
Model saved to ..\models\xgboost_model.joblib
Reading file: xgboost_model.joblib
Successfully read file!
Reading file: clean_test_data.csv
Successfully read file!

--- Test Set Performance ---
Test ROC AUC: 0.5855
Test PR AUC: 0.1181


In [8]:
# CatBoost: run the train and test steps through run_model.
model_name = "catboost"
run_model(model_name=model_name)

--- Preparing to Train Model: catboost ---
Reading file: clean_train_data.csv
Successfully read file!
Reading file: clean_val_data.csv
Successfully read file!
--- Fitting CatBoostModel ---
Validation ROC AUC: 0.5945
Validation PR AUC (AUC-PR): 0.1230
Model saved to ..\models\catboost_model.joblib
Reading file: catboost_model.joblib
Successfully read file!
Reading file: clean_test_data.csv
Successfully read file!

--- Test Set Performance ---
Test ROC AUC: 0.5982
Test PR AUC: 0.1221


In [9]:
# Logistic regression: run the train and test steps through run_model.
model_name = "logistic_regression"
run_model(model_name=model_name)

--- Preparing to Train Model: logistic_regression ---
Reading file: clean_train_data.csv
Successfully read file!
Reading file: clean_val_data.csv
Successfully read file!
--- Fitting LogisticRegressionModel ---
Validation ROC AUC: 0.6143
Validation PR AUC (AUC-PR): 0.1252
Model saved to ..\models\logistic_regression_model.joblib
Reading file: logistic_regression_model.joblib
Successfully read file!
Reading file: clean_test_data.csv
Successfully read file!

--- Test Set Performance ---
Test ROC AUC: 0.6138
Test PR AUC: 0.1252


In [10]:
# Stacking ensemble: run the train and test steps through run_model.
model_name = "ensemble"
run_model(model_name=model_name)

--- Preparing to Train Model: ensemble ---
Reading file: clean_train_data.csv
Successfully read file!
Reading file: clean_val_data.csv
Successfully read file!
--- Fitting Stacking Ensemble ---
Fitting base model: LightGBMModel
--- Fitting LightGBMModel ---
[LightGBM] [Info] Number of positive: 17377, number of negative: 197880
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.045710 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 12872
[LightGBM] [Info] Number of data points in the train set: 215257, number of used features: 81
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000
Fitting base model: XGBoostModel
--- Fitting XGBoostModel ---
Fitting base model: CatBoostModel
--- Fitting CatBoostModel ---
Fitting base model: LogisticRegressionModel
--- Fitting LogisticRegressionModel ---
Fitting meta-learner...
--- Ensemble

In [11]:
# TabNet: run the train and test steps through run_model.
model_name = "tabnet"
run_model(model_name=model_name)

--- Preparing to Train Model: tabnet ---
Reading file: clean_train_data.csv
Successfully read file!
Reading file: clean_val_data.csv
Successfully read file!
--- Fitting TabNetModel ---
Validation ROC AUC: 0.6152
Validation PR AUC (AUC-PR): 0.1230
Model saved to ..\models\tabnet_model.joblib
Reading file: tabnet_model.joblib
Successfully read file!
Reading file: clean_test_data.csv
Successfully read file!

--- Test Set Performance ---
Test ROC AUC: 0.6155
Test PR AUC: 0.1232


In [12]:
# ZIBer: run the train and test steps through run_model.
# Bind the identifier to `model_name`, as every other cell does. Assigning it
# to `model` would rebind the `model` module imported from `src.model` to a
# plain string for the rest of the kernel session.
model_name = 'ziber'
run_model(model_name)

--- Preparing to Train Model: ziber ---
Reading file: clean_train_data.csv
Successfully read file!
Reading file: clean_val_data.csv
Successfully read file!
--- Fitting ZIBerModel ---
Iteration 1, Log-Likelihood: -72349.7560
Iteration 2, Log-Likelihood: -61216.9599
Iteration 3, Log-Likelihood: -59891.7834
Iteration 4, Log-Likelihood: -59463.4707
Iteration 5, Log-Likelihood: -59271.5100
Iteration 6, Log-Likelihood: -59168.7835
Iteration 7, Log-Likelihood: -59107.1671
Iteration 8, Log-Likelihood: -59066.5502
Iteration 9, Log-Likelihood: -59038.5401
Iteration 10, Log-Likelihood: -59018.1288
Iteration 11, Log-Likelihood: -59002.2041
Iteration 12, Log-Likelihood: -58989.8890
Iteration 13, Log-Likelihood: -58979.6718
Iteration 14, Log-Likelihood: -58971.6125
Iteration 15, Log-Likelihood: -58964.4212
Iteration 16, Log-Likelihood: -58958.4508
Iteration 17, Log-Likelihood: -58952.8019
Iteration 18, Log-Likelihood: -58947.4122
Iteration 19, Log-Likelihood: -58942.8256
Iteration 20, Log-Likelihood