In [1]:
import sys
import os
from pathlib import Path
import warnings

warnings.filterwarnings('ignore')

In [2]:
# Make the project's packages importable from this notebook by putting the
# parent of the current working directory on sys.path.
# NOTE(review): assumes the kernel was started one level below the project
# root (e.g. in a notebooks/ folder) — confirm.
project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
# Append only once so re-running the cell does not pile up duplicate entries.
if sys.path.count(project_root) == 0:
    sys.path.append(project_root)

In [3]:
from src.model import model, train, test
from src.utils import get_config

Reading file: clean_train_data.csv
Successfully read file!
Reading file: clean_val_data.csv
Successfully read file!


In [4]:
# Load the project configuration through get_config.read_yaml_from_main().
# The result is kept as a notebook-level global because run_model() indexes
# it as config['paths']['model_data_directory'].
config = get_config.read_yaml_from_main()
print("Configuration loaded successfully!")

Configuration loaded successfully!


In [5]:
def run_model(model_name, params=None):
    """Train the named model, then evaluate it.

    The artifact path handed to the trainer is
    ``<config['paths']['model_data_directory']>/<model_name>_model.joblib``.

    Args:
        model_name: Identifier of the model; forwarded to both
            ``train.train_model`` and ``test.test_model`` and used to build
            the artifact file name.
        params: Optional hyperparameters forwarded unchanged to
            ``train.train_model`` (``None`` when omitted).

    Returns:
        None.
    """
    artifact_name = "{}_model.joblib".format(model_name)
    artifact_path = Path(config['paths']['model_data_directory']).joinpath(artifact_name)
    train.train_model(model_name, artifact_path, params=params)
    # The evaluator receives only the name, not the path — presumably it
    # locates the saved artifact on its own; verify against src.model.test.
    test.test_model(model_name)

In [6]:
# LightGBM, step 1: baseline run with no explicit hyperparameters (params=None).
model_name = 'lightgbm'
run_model(model_name)
# Step 2: hyperparameter search with 250 trials; the returned value is passed
# straight back into run_model() as `params`.
params = train.tune_model_with_optuna(model_name, n_trials=250)
# Step 3: retrain and re-evaluate with the tuned parameters.
# NOTE(review): run_model() builds the same <model_name>_model.joblib path as in
# step 1, so this run presumably overwrites the baseline artifact — confirm.
run_model(model_name, params=params)

--- Preparing to Train Model: lightgbm ---
--- Fitting LightGBMModel ---
[LightGBM] [Info] Number of positive: 17377, number of negative: 197880
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.016029 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 14406
[LightGBM] [Info] Number of data points in the train set: 215257, number of used features: 88
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000
Validation accuracy: 0.9156
Validation ROC AUC: 0.6157
Validation PR AUC (AUC-PR): 0.1298
Model saved to ..\models\lightgbm_model.joblib
Reading file: lightgbm_model.joblib
Successfully read file!
Reading file: clean_test_data.csv


[I 2025-09-01 12:18:55,701] A new study created in memory with name: no-name-13d3abfb-1462-4ad5-8fc0-a54063fd8bae


Successfully read file!

--- Test Set Performance ---
Test accuracy: 0.9156
Test ROC AUC: 0.6143
Test PR AUC: 0.1273
--- Starting Hyperparameter Tuning for lightgbm ---


[I 2025-09-01 12:19:02,857] Trial 0 finished with value: 0.12104046467257669 and parameters: {'n_estimators': 690, 'learning_rate': 0.06826479968008267, 'num_leaves': 52, 'max_depth': 6, 'lambda_l1': 0.0007725281567892143, 'lambda_l2': 2.469155474113354e-05}. Best is trial 0 with value: 0.12104046467257669.
[I 2025-09-01 12:19:10,428] Trial 1 finished with value: 0.12331174063900445 and parameters: {'n_estimators': 595, 'learning_rate': 0.09628590698979872, 'num_leaves': 21, 'max_depth': 9, 'lambda_l1': 0.0008486088928431798, 'lambda_l2': 0.0019117128888578932}. Best is trial 1 with value: 0.12331174063900445.
[I 2025-09-01 12:20:16,938] Trial 2 finished with value: 0.1002667097861284 and parameters: {'n_estimators': 1935, 'learning_rate': 0.08717758250370142, 'num_leaves': 166, 'max_depth': 8, 'lambda_l1': 5.269240130438481e-07, 'lambda_l2': 7.3297897484069995e-06}. Best is trial 1 with value: 0.12331174063900445.
[I 2025-09-01 12:20:36,556] Trial 3 finished with value: 0.110357148378


--- Tuning Complete for lightgbm ---
Number of finished trials:  250
Best trial:
  Value (PR AUC):  0.13252206390173127
  Params: 
    n_estimators: 914
    learning_rate: 0.011824181816259443
    num_leaves: 196
    max_depth: 5
    lambda_l1: 5.42948906114636e-08
    lambda_l2: 1.3586612579055367e-07
--- Preparing to Train Model: lightgbm ---
--- Fitting LightGBMModel ---
Validation accuracy: 0.9156
Validation ROC AUC: 0.6189
Validation PR AUC (AUC-PR): 0.1325
Model saved to ..\models\lightgbm_model.joblib
Reading file: lightgbm_model.joblib
Successfully read file!
Reading file: clean_test_data.csv
Successfully read file!

--- Test Set Performance ---
Test accuracy: 0.9158
Test ROC AUC: 0.6154
Test PR AUC: 0.1298


In [7]:
# XGBoost: baseline run only (params=None).
model_name = 'xgboost'
run_model(model_name)
# Tuning is currently disabled for this model; uncomment both lines to
# search hyperparameters and retrain with the result.
# params = train.tune_model_with_optuna(model_name, n_trials=1)
# run_model(model_name, params=params)

--- Preparing to Train Model: xgboost ---
--- Fitting XGBoostModel ---
Validation accuracy: 0.9118
Validation ROC AUC: 0.5852
Validation PR AUC (AUC-PR): 0.1179
Model saved to ..\models\xgboost_model.joblib
Reading file: xgboost_model.joblib
Successfully read file!
Reading file: clean_test_data.csv
Successfully read file!

--- Test Set Performance ---
Test accuracy: 0.9115
Test ROC AUC: 0.5892
Test PR AUC: 0.1176


In [6]:
# CatBoost, step 1: baseline run with no explicit hyperparameters (params=None).
model_name = 'catboost'
run_model(model_name)
# Step 2: hyperparameter search with 10 trials; the returned value is passed
# straight back into run_model() as `params`.
params = train.tune_model_with_optuna(model_name, n_trials=10)
# Step 3: retrain and re-evaluate with the tuned parameters.
# NOTE(review): the same <model_name>_model.joblib path is used as in step 1,
# so this run presumably overwrites the baseline artifact — confirm.
run_model(model_name, params=params)

--- Preparing to Train Model: catboost ---
--- Fitting CatBoostModel ---
Validation accuracy: 0.9137
Validation ROC AUC: 0.5929
Validation PR AUC (AUC-PR): 0.1208
Model saved to ..\models\catboost_model.joblib
Reading file: catboost_model.joblib
Successfully read file!
Reading file: clean_test_data.csv


[I 2025-09-01 14:02:29,384] A new study created in memory with name: no-name-6f15410c-1038-49a6-a154-9cd1b2f735cf


Successfully read file!

--- Test Set Performance ---
Test accuracy: 0.9137
Test ROC AUC: 0.5939
Test PR AUC: 0.1193
--- Starting Hyperparameter Tuning for catboost ---


[I 2025-09-01 14:03:43,889] Trial 0 finished with value: 0.10892498334654069 and parameters: {'iterations': 1450, 'learning_rate': 0.09339280927795574, 'depth': 9, 'l2_leaf_reg': 5.05378953072099}. Best is trial 0 with value: 0.10892498334654069.
[I 2025-09-01 14:04:17,253] Trial 1 finished with value: 0.11995052432635887 and parameters: {'iterations': 1270, 'learning_rate': 0.074523140257616, 'depth': 7, 'l2_leaf_reg': 5.321588551624511}. Best is trial 1 with value: 0.11995052432635887.
[I 2025-09-01 14:04:55,708] Trial 2 finished with value: 0.11983915568138233 and parameters: {'iterations': 1805, 'learning_rate': 0.07781503365678426, 'depth': 6, 'l2_leaf_reg': 7.023193448523134}. Best is trial 1 with value: 0.11995052432635887.
[I 2025-09-01 14:05:56,016] Trial 3 finished with value: 0.11411673655748285 and parameters: {'iterations': 590, 'learning_rate': 0.06069179592570378, 'depth': 10, 'l2_leaf_reg': 2.127291852997087}. Best is trial 1 with value: 0.11995052432635887.
[I 2025-09-


--- Tuning Complete for catboost ---
Number of finished trials:  10
Best trial:
  Value (PR AUC):  0.12779226361731266
  Params: 
    iterations: 670
    learning_rate: 0.08467699248746405
    depth: 6
    l2_leaf_reg: 7.028043022644782
--- Preparing to Train Model: catboost ---
--- Fitting CatBoostModel ---
Validation accuracy: 0.9148
Validation ROC AUC: 0.6092
Validation PR AUC (AUC-PR): 0.1278
Model saved to ..\models\catboost_model.joblib
Reading file: catboost_model.joblib
Successfully read file!
Reading file: clean_test_data.csv
Successfully read file!

--- Test Set Performance ---
Test accuracy: 0.9148
Test ROC AUC: 0.6082
Test PR AUC: 0.1265


In [7]:
# Logistic regression: baseline run only (params=None).
model_name = 'logistic_regression'
run_model(model_name)
# Tuning is currently disabled for this model; uncomment both lines to
# search hyperparameters and retrain with the result.
# params = train.tune_model_with_optuna(model_name, n_trials=1)
# run_model(model_name, params=params)

--- Preparing to Train Model: logistic_regression ---
--- Fitting LogisticRegressionModel ---
Validation accuracy: 0.9193
Validation ROC AUC: 0.5521
Validation PR AUC (AUC-PR): 0.0988
Model saved to ..\models\logistic_regression_model.joblib
Reading file: logistic_regression_model.joblib
Successfully read file!
Reading file: clean_test_data.csv
Successfully read file!

--- Test Set Performance ---
Test accuracy: 0.9193
Test ROC AUC: 0.5478
Test PR AUC: 0.0981


In [8]:
# Ensemble model: single run with no explicit hyperparameters (params=None).
model_name = 'ensemble'
run_model(model_name)
# Tuning is currently disabled for this model; uncomment both lines to
# search hyperparameters and retrain with the result.
# params = train.tune_model_with_optuna(model_name, n_trials=1)
# run_model(model_name, params=params)

--- Preparing to Train Model: ensemble ---
--- Fitting Stacking Ensemble ---
Fitting base model: LightGBMModel
--- Fitting LightGBMModel ---
[LightGBM] [Info] Number of positive: 17377, number of negative: 197880
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.048380 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14406
[LightGBM] [Info] Number of data points in the train set: 215257, number of used features: 88
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000
Fitting base model: XGBoostModel
--- Fitting XGBoostModel ---
Fitting base model: CatBoostModel
--- Fitting CatBoostModel ---
Fitting base model: LogisticRegressionModel
--- Fitting LogisticRegressionModel ---
Fitting meta-learner...
--- Ensemble Fitting Complete ---
Validation accuracy: 0.9164
Validation ROC AUC: 0.5607
Validation PR AUC (AUC-PR): 0.1038
Ense

In [9]:
# TabNet: baseline run only (params=None).
model_name = 'tabnet'
run_model(model_name)
# Tuning is currently disabled for this model; uncomment both lines to
# search hyperparameters and retrain with the result.
# params = train.tune_model_with_optuna(model_name, n_trials=1)
# run_model(model_name, params=params)

--- Preparing to Train Model: tabnet ---
--- Fitting TabNetModel ---
Validation accuracy: 0.9193
Validation ROC AUC: 0.6145
Validation PR AUC (AUC-PR): 0.1226
Model saved to ..\models\tabnet_model.joblib
Reading file: tabnet_model.joblib
Successfully read file!
Reading file: clean_test_data.csv
Successfully read file!

--- Test Set Performance ---
Test accuracy: 0.9193
Test ROC AUC: 0.6123
Test PR AUC: 0.1248


In [10]:
# ZIBer: baseline run only (params=None).
# The identifier is stored in `model_name`, as in the other model cells.
# Assigning it to `model` would rebind the `model` module imported from
# src.model and leave `model_name` holding the previous cell's value, which
# the tuning lines below would then pick up if uncommented.
model_name = 'ziber'
run_model(model_name)
# Tuning is currently disabled for this model; uncomment both lines to
# search hyperparameters and retrain with the result.
# params = train.tune_model_with_optuna(model_name, n_trials=1)
# run_model(model_name, params=params)

--- Preparing to Train Model: ziber ---
--- Fitting ZIBerModel ---
Iteration 1, Log-Likelihood: -72254.0131
Iteration 2, Log-Likelihood: -61208.3912
Iteration 3, Log-Likelihood: -59895.4608
Iteration 4, Log-Likelihood: -59470.5533
Iteration 5, Log-Likelihood: -59281.7925
Iteration 6, Log-Likelihood: -59181.1916
Iteration 7, Log-Likelihood: -59151.8512
Iteration 8, Log-Likelihood: -59088.0712
Iteration 9, Log-Likelihood: -59067.8927
Iteration 10, Log-Likelihood: -59044.5221
Validation accuracy: 0.9192
Validation ROC AUC: 0.6138
Validation PR AUC (AUC-PR): 0.1264
Model saved to ..\models\ziber_model.joblib
Reading file: ziber_model.joblib
Successfully read file!
Reading file: clean_test_data.csv
Successfully read file!

--- Test Set Performance ---
Test accuracy: 0.9193
Test ROC AUC: 0.6126
Test PR AUC: 0.1267


In [11]:
# LightGBM + ZIBer: baseline run only (params=None).
# The identifier is stored in `model_name`, as in the other model cells.
# Assigning it to `model` would rebind the `model` module imported from
# src.model and leave `model_name` holding the previous cell's value, which
# the tuning lines below would then pick up if uncommented.
model_name = 'lightgbm-ziber'
run_model(model_name)
# Tuning is currently disabled for this model; uncomment both lines to
# search hyperparameters and retrain with the result.
# params = train.tune_model_with_optuna(model_name, n_trials=1)
# run_model(model_name, params=params)

--- Preparing to Train Model: lightgbm-ziber ---
--- Fitting LightGBMZIBerModel ---
Fitting base LightGBM model...
[LightGBM] [Info] Number of positive: 17377, number of negative: 197880
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.041951 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14406
[LightGBM] [Info] Number of data points in the train set: 215257, number of used features: 88
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000
Generating LightGBM predictions as a new feature...
Fitting ZIBer model on augmented data...
--- Fitting ZIBerModel ---
Iteration 1, Log-Likelihood: -65788.6043
Iteration 2, Log-Likelihood: -56327.0580
Iteration 3, Log-Likelihood: -55088.8254
Iteration 4, Log-Likelihood: -54632.9293
Iteration 5, Log-Likelihood: -54405.2684
Iteration 6, Log-Likelihood: -54232.6291
Iteration 7, Log-Likel