In [1]:
import sys
import os
import pandas as pd
from pathlib import Path
import warnings
import joblib

warnings.filterwarnings('ignore')

In [2]:
# Put the parent of the current working directory on the import path so the
# `src` package imported later in this notebook can be resolved.
# os.getcwd() is already absolute, so normalising the joined path is enough.
project_root = os.path.normpath(os.path.join(os.getcwd(), os.pardir))
if sys.path.count(project_root) == 0:
    sys.path.append(project_root)

In [3]:
from src.model import train, test
from src.model.model import (
    LightGBMModel,
    XGBoostModel,
    CatBoostModel,
    LogisticRegressionModel,
    StackingEnsemble
)
from src.utils import get_config

In [4]:
# Load the project configuration once; later cells read `config['paths']`
# and `config['data']` from this module-level mapping.
config = get_config.read_yaml_from_main()
print("Configuration loaded successfully!")

Configuration loaded successfully!


In [5]:
def run_model(model_name):
    """Train one model, then run the project's test routine for it.

    The artifact path is ``<model_data_directory>/<model_name>_model.joblib``,
    where the directory comes from the notebook-level ``config`` mapping.

    Args:
        model_name: Identifier handed to ``train.train_model`` and
            ``test.test_model`` (e.g. ``'lightgbm'``).
    """
    artifact_path = (
        Path(config['paths']['model_data_directory'])
        / f"{model_name}_model.joblib"
    )
    # Training receives the destination path; testing is keyed by name only.
    train.train_model(model_name, artifact_path)
    test.test_model(model_name)

In [6]:
# Train and evaluate the LightGBM model through run_model.
model_name = 'lightgbm'
run_model(model_name=model_name)

--- Preparing to Train Model: lightgbm ---
--- Fitting LightGBMModel ---
[LightGBM] [Info] Number of positive: 17377, number of negative: 197880
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.047921 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 15686
[LightGBM] [Info] Number of data points in the train set: 215257, number of used features: 96
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000
Validation ROC AUC: 0.6125
Validation PR AUC (AUC-PR): 0.1277
Model saved to ..\models\lightgbm_model.joblib
--- Testing Model from: ../models/lightgbm_model.joblib ---

--- Test Set Performance ---
Test ROC AUC: 0.6133
Test PR AUC: 0.1268


In [7]:
# Train and evaluate the XGBoost model through run_model.
model_name = 'xgboost'
run_model(model_name=model_name)

--- Preparing to Train Model: xgboost ---
--- Fitting XGBoostModel ---
Validation ROC AUC: 0.5797
Validation PR AUC (AUC-PR): 0.1156
Model saved to ..\models\xgboost_model.joblib
--- Testing Model from: ../models/xgboost_model.joblib ---

--- Test Set Performance ---
Test ROC AUC: 0.5875
Test PR AUC: 0.1158


In [8]:
# Train and evaluate the CatBoost model through run_model.
model_name = 'catboost'
run_model(model_name=model_name)

--- Preparing to Train Model: catboost ---
--- Fitting CatBoostModel ---
Validation ROC AUC: 0.5856
Validation PR AUC (AUC-PR): 0.1205
Model saved to ..\models\catboost_model.joblib
--- Testing Model from: ../models/catboost_model.joblib ---

--- Test Set Performance ---
Test ROC AUC: 0.5948
Test PR AUC: 0.1211


In [9]:
# Train and evaluate the logistic-regression model through run_model.
model_name = 'logistic_regression'
run_model(model_name=model_name)

--- Preparing to Train Model: logistic_regression ---
--- Fitting LogisticRegressionModel ---
Validation ROC AUC: 0.6114
Validation PR AUC (AUC-PR): 0.1242
Model saved to ..\models\logistic_regression_model.joblib
--- Testing Model from: ../models/logistic_regression_model.joblib ---

--- Test Set Performance ---
Test ROC AUC: 0.6140
Test PR AUC: 0.1263


In [10]:
# Train and evaluate the stacking ensemble through run_model.
model_name = 'ensemble'
run_model(model_name=model_name)

--- Preparing to Train Model: ensemble ---
--- Fitting Stacking Ensemble ---
Fitting base model: LightGBMModel
--- Fitting LightGBMModel ---
[LightGBM] [Info] Number of positive: 17377, number of negative: 197880
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.073025 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 15686
[LightGBM] [Info] Number of data points in the train set: 215257, number of used features: 96
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000
Fitting base model: XGBoostModel
--- Fitting XGBoostModel ---
Fitting base model: CatBoostModel
--- Fitting CatBoostModel ---
Fitting base model: LogisticRegressionModel
--- Fitting LogisticRegressionModel ---
Fitting meta-learner...
--- Ensemble Fitting Complete ---
Validation ROC AUC: 0.5483
Validation PR AUC (AUC-PR): 0.1002
Ensemble model saved to ..\model

In [None]:
import optuna
from sklearn.metrics import average_precision_score, roc_auc_score

# --- Hyperparameter search with Optuna ---

def objective(trial):
    """Optuna objective: retrain the ensemble and return its validation ROC AUC.

    NOTE(review): the ``params`` dict sampled below is never passed to
    ``train.train_model`` and is not read anywhere else in this function, so
    every trial retrains the ensemble with identical arguments. The values
    end up in ``trial.params`` but do not influence the returned score.
    Wire ``params`` into training before trusting the study's "best" trial.

    Args:
        trial: Optuna trial used to sample the hyperparameter values.

    Returns:
        ``roc_auc_score`` of column 1 of the reloaded model's
        ``predict_proba`` output (presumably the positive class) against the
        validation target.
    """
    # 1. Define the hyperparameters to search
    # NOTE(review): `params` is built here but not referenced again below.
    params = {
        'objective': 'binary',
        'metric': 'rocauc', # NOTE(review): this function scores ROC AUC, not PR AUC; LightGBM documents the metric name as 'auc' - verify 'rocauc' before using this dict
        'n_estimators': trial.suggest_int('n_estimators', 200, 2000),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'num_leaves': trial.suggest_int('num_leaves', 20, 300),
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'min_child_samples': trial.suggest_int('min_child_samples', 20, 200),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.6, 1.0),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.6, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),
        'lambda_l1': trial.suggest_float('lambda_l1', 1e-8, 10.0, log=True),
        'lambda_l2': trial.suggest_float('lambda_l2', 1e-8, 10.0, log=True),
        'class_weight': 'balanced',
        'random_state': 42,
        'verbose': -1
    }

    # The validation CSV is re-read from disk on every trial.
    data_dir = config['paths']['processed_data_directory']
    val_df = pd.read_csv(data_dir + "/clean_val_data.csv")

    # 2. Prepare Data
    id_col = config['data']['id']
    target_col = config['data']['target']

    # Features are every column except the id and the target.
    X_val = val_df.drop(columns=[id_col, target_col])
    y_val = val_df[target_col]
    # The model trained is always the 'ensemble', written to the same
    # <model_data_directory>/ensemble_model.joblib path on each trial.
    model_name = 'ensemble'
    model_dir = Path(config['paths']['model_data_directory'])
    model_path = model_dir / f"{model_name}_model.joblib"
    train.train_model(model_name, model_path)

    # Reload the artifact from model_path; assumes train_model saved it
    # there - TODO confirm.
    model = joblib.load(model_path)

    # 3. Evaluate on the validation set and return the score
    val_preds = model.predict_proba(X_val)[:, 1]
    roc_auc = roc_auc_score(y_val, val_preds)

    return roc_auc

# --- Create and run the study ---
# `objective` returns the validation ROC AUC, so the study maximizes it.
N_TRIALS = 10  # number of Optuna trials to run
SAMPLER_SEED = 42  # fixes the sequence of suggested hyperparameters

# TPE is Optuna's default sampler; constructing it explicitly only adds the
# seed so that re-running this cell proposes the same trials.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=SAMPLER_SEED),
)
study.optimize(objective, n_trials=N_TRIALS)

# Report the best-scoring trial and the hyperparameters it sampled.
print("Best trial:")
trial = study.best_trial
print(f"  Value: {trial.value:.4f}")
print("  Params: ")
for key, value in trial.params.items():
    print(f"    {key}: {value}")

# NOTE(review): retraining with these values is only meaningful once
# `objective` feeds the sampled hyperparameters into training - confirm
# that before relying on the reported parameters.

[I 2025-08-31 00:28:05,406] A new study created in memory with name: no-name-f4113462-db42-479d-9426-dca8e7b17c08


--- Preparing to Train Model: ensemble ---
--- Fitting Stacking Ensemble ---
Fitting base model: LightGBMModel
--- Fitting LightGBMModel ---
[LightGBM] [Info] Number of positive: 17377, number of negative: 197880
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.063770 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 15686
[LightGBM] [Info] Number of data points in the train set: 215257, number of used features: 96
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000
Fitting base model: XGBoostModel
--- Fitting XGBoostModel ---
Fitting base model: CatBoostModel
--- Fitting CatBoostModel ---
Fitting base model: LogisticRegressionModel
--- Fitting LogisticRegressionModel ---
Fitting meta-learner...
--- Ensemble Fitting Complete ---
Validation ROC AUC: 0.5483
Validation PR AUC (AUC-PR): 0.1002
Ensemble model saved to ..\model

[I 2025-08-31 00:28:39,180] Trial 0 finished with value: 0.5482650324201116 and parameters: {'n_estimators': 1916, 'learning_rate': 0.23270434773697649, 'num_leaves': 117, 'max_depth': 4, 'min_child_samples': 129, 'feature_fraction': 0.6452104266009124, 'bagging_fraction': 0.9490543271504135, 'bagging_freq': 5, 'lambda_l1': 1.1093170401418448e-08, 'lambda_l2': 6.4644230724332e-07}. Best is trial 0 with value: 0.5482650324201116.


--- Preparing to Train Model: ensemble ---
--- Fitting Stacking Ensemble ---
Fitting base model: LightGBMModel
--- Fitting LightGBMModel ---
[LightGBM] [Info] Number of positive: 17377, number of negative: 197880
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.050950 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 15686
[LightGBM] [Info] Number of data points in the train set: 215257, number of used features: 96
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000
Fitting base model: XGBoostModel
--- Fitting XGBoostModel ---
Fitting base model: CatBoostModel
--- Fitting CatBoostModel ---
Fitting base model: LogisticRegressionModel
--- Fitting LogisticRegressionModel ---
Fitting meta-learner...
