# Model development

In this document we develop and compare different models. The document has the following sections:

1. Model creation
2. Model evaluation
3. Model implementation on test data

Note that, for model creation, the saved best model can be loaded instead of re-running the training code each time.



### Import libraries

In [1]:
from preprocessing import preprocessor
import pandas as pd
import numpy as np
from sklearn.metrics import make_scorer, roc_auc_score, recall_score
from sklearn.model_selection import train_test_split, RandomizedSearchCV, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import classification_report, roc_auc_score
from scipy.stats import uniform, randint
import joblib

### Data collection

In [2]:
# Read the raw table from disk.
total_df = pd.read_csv('../Data/Base.csv')

# 'fraud_bool' is the label; every other column is kept as a feature.
y = total_df['fraud_bool']
X = total_df.drop(columns='fraud_bool')

# Hold out 20% of the rows for testing, stratified on the label so both
# splits keep the same class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

print("Data has been loaded")

# Fit the preprocessor on the training split only; the test split is
# transformed with the already-fitted preprocessor.
X_train_preprocessed = preprocessor.fit_transform(X_train)
X_test_preprocessed = preprocessor.transform(X_test)


Data has been loaded


## 1. Model Creation

### 1a. Hyperparameter specification

We will use the following hyperparameter search spaces for our models. They cover the hyperparameters most commonly tuned for each model, with commonly used ranges.

In [4]:
# Pipeline with placeholder classifier
pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('classifier', LogisticRegression())])

# Candidate models and their random-search spaces.
#
# Each entry maps a display name to (estimator, param_distributions). Keys are
# prefixed with 'classifier__' so they address the 'classifier' pipeline step.
# scipy conventions used below:
#   uniform(loc, scale) samples floats from [loc, loc + scale]
#   randint(low, high)  samples integers from [low, high)
models = {
    'Logistic Regression (lbfgs)': (
        LogisticRegression(class_weight='balanced', random_state=42, max_iter=1000),
        {
            'classifier__C': uniform(0.01, 10),  # Regularization strength, [0.01, 10.01]
            'classifier__solver': ['lbfgs'],  # Only lbfgs solver
            # The Python object None (not the string 'none') disables the
            # penalty; the string spelling is deprecated in scikit-learn 1.2
            # and rejected by later releases.
            'classifier__penalty': ['l2', None],  # Supported penalties
        }
    ),
    # Disabled alternatives, kept for reference:
    # 'Logistic Regression (liblinear)': (
    #     LogisticRegression(class_weight='balanced', random_state=42, max_iter=1000),
    #     {
    #         'classifier__C': uniform(0.01, 10),  # Regularization strength
    #         'classifier__solver': ['liblinear'],  # Only liblinear solver
    #         'classifier__penalty': ['l1', 'l2'],  # Supported penalties
    #     }
    # ),
    # 'Logistic Regression (saga)': (
    #     LogisticRegression(class_weight='balanced', random_state=42, max_iter=1000),
    #     {
    #         'classifier__C': uniform(0.01, 10),  # Regularization strength
    #         'classifier__solver': ['saga'],  # Only saga solver
    #         'classifier__penalty': ['l1', 'l2', 'elasticnet', None],  # All supported penalties
    #         'classifier__l1_ratio': uniform(0, 1),  # ElasticNet mixing parameter (only with saga and elasticnet)
    #     }
    # ),
    'Logistic Regression (default)': (
        LogisticRegression(class_weight='balanced', random_state=42, max_iter=1000),
        {
            'classifier__C': uniform(0.01, 10),  # Regularization strength, [0.01, 10.01]
        }
    ),
    'Random Forest': (
        RandomForestClassifier(class_weight='balanced', random_state=42),
        {
            'classifier__n_estimators': randint(100, 300),  # Number of trees
            'classifier__max_depth': randint(5, 20),  # Max tree depth
            'classifier__min_samples_split': randint(2, 10),  # Min samples to split
            'classifier__min_samples_leaf': randint(1, 5),  # Min samples per leaf
            'classifier__class_weight': ['balanced', {0: 1, 1: 10}],  # Explicit weights
        }
    ),
    'Gradient Boosting': (
        GradientBoostingClassifier(random_state=42),
        {
            'classifier__n_estimators': randint(50, 300),  # Number of boosting stages
            'classifier__learning_rate': uniform(0.01, 0.2),  # Shrinkage rate, [0.01, 0.21]
            'classifier__max_depth': randint(3, 15),  # Max tree depth
            'classifier__min_samples_split': randint(2, 10),  # Min samples to split
            'classifier__min_samples_leaf': randint(1, 5),  # Min samples per leaf
        }
    ),
    'XGBoost': (
        # GPU training is requested with tree_method='hist' + device='cuda',
        # the same spelling the Optuna cell uses for its XGBClassifier. The
        # older tree_method='gpu_hist' and use_label_encoder arguments are
        # deprecated in the XGBoost releases that accept `device`.
        XGBClassifier(random_state=42, eval_metric='logloss', tree_method='hist', device='cuda'),
        {
            'classifier__n_estimators': randint(50, 300),  # Number of boosting stages
            'classifier__learning_rate': uniform(0.01, 0.2),  # Learning rate, [0.01, 0.21]
            'classifier__max_depth': randint(3, 15),  # Maximum tree depth
            'classifier__gamma': uniform(0, 0.5),  # Min split loss
            'classifier__scale_pos_weight': [1, 5, 10, 20],  # Reweighting for imbalanced data
        }
    ),
    'LightGBM': (
        LGBMClassifier(random_state=42, device='gpu'),
        {
            'classifier__n_estimators': randint(50, 300),  # Number of boosting stages
            'classifier__learning_rate': uniform(0.01, 0.2),  # Learning rate, [0.01, 0.21]
            'classifier__max_depth': randint(3, 15),  # Maximum tree depth
            'classifier__num_leaves': randint(20, 50),  # Number of leaves
            'classifier__class_weight': ['balanced', {0: 1, 1: 10}],  # Explicit weights
        }
    ),
    'Naive Bayes': (
        GaussianNB(),
        {}  # No hyperparameters for Naive Bayes
    ),
}

### 1b. Exploratory Hyperparameter Search 

To find the best hyperparameters for each model we will use RandomizedSearchCV because it is more efficient than GridSearchCV for the exploration of the hyperparameter space.

Note: Running the following code is not recommended; it is included for reference only. Running it will train the models, and we have provided our trained models in the repository.

In [None]:
import time
import joblib
import os
from datetime import datetime
from tqdm import tqdm # Progress bar
from glob import glob # File search

# Recall (true-positive rate) read off the ROC curve at a false-positive budget
def recall_at_fpr(y_true, y_scores, target_fpr=0.05):
    """Return the recall reached without exceeding ``target_fpr``.

    The ROC curve of ``y_scores`` against ``y_true`` is computed, and the
    true-positive rate of the last curve point whose false-positive rate is
    at most ``target_fpr`` is returned. If no point qualifies, 0.0 is returned.
    """
    import numpy as np
    from sklearn.metrics import roc_curve

    fpr, tpr, _ = roc_curve(y_true, y_scores)

    # Indices of every ROC point that stays within the false-positive budget.
    within_budget = np.flatnonzero(fpr <= target_fpr)
    if within_budget.size == 0:
        return 0.0
    return tpr[within_budget[-1]]

# Two-argument wrapper that pins the false-positive budget at 5%
def recall_at_fpr_scorer(y_true, y_scores):
    """Return ``recall_at_fpr`` for ``y_scores`` at a 5% false-positive rate."""
    fixed_fpr = 0.05
    return recall_at_fpr(y_true, y_scores, target_fpr=fixed_fpr)

# Make a scorer for recall @ 5% FPR. needs_proba=True makes the scorer
# evaluate predict_proba output instead of hard class labels.
# NOTE(review): scikit-learn 1.4 deprecates needs_proba in favour of
# response_method='predict_proba' — switch once the minimum supported
# version allows it.
recall_5_fpr_scorer = make_scorer(recall_at_fpr_scorer, greater_is_better=True, needs_proba=True)

# Create directory for saving models if it doesn't exist
os.makedirs('saved_models', exist_ok=True)

# Stratified K-Fold Cross-Validation
stratified_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Number of random searches per model
n_iter_per_model = 5
best_models = []

# Dictionary to store all RandomizedSearchCV objects
search_results = {}

# Run one RandomizedSearchCV per entry in `models`, saving each result as
# soon as it finishes so an interrupted run can be resumed.
for name, (model, params) in models.items():
    print(f"\n--- Starting RandomizedSearchCV for {name} ---\n")

    # Skip models whose search result is already on disk. Search dumps are
    # named "<name>_<YYYY-MM-DD_HH-MM-SS>_<elapsed>s.joblib", so the pattern
    # requires the four-digit year right after the model name. A plain
    # "<name>_*" would also match the "XGBoost_trial_*", "XGBoost_optuna_*"
    # and "LightGBM_*" pipelines written into the same folder by the Optuna
    # cell and wrongly skip those searches.
    saved_files = glob(f"saved_models/{name}_[0-9][0-9][0-9][0-9]-*.joblib")
    if saved_files:
        print(f"Model {name} already trained. Skipping...")
        continue

    # Create pipeline
    pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('classifier', model)])

    # Create RandomizedSearchCV
    search = RandomizedSearchCV(
        pipeline,
        param_distributions=params,
        n_iter=n_iter_per_model,
        cv=stratified_cv,
        n_jobs=1,
        random_state=42,
        scoring=recall_5_fpr_scorer,
        verbose=3
    )

    # Start timing
    start_time = time.time()

    # Fit the model. A failure in one model is reported and the loop moves on
    # to the next model instead of aborting the whole run.
    try:
        print(f"Fitting the model {name} with {n_iter_per_model} iterations and {stratified_cv.get_n_splits()} cross-validation splits...")
        search.fit(X_train, y_train)
    except Exception as e:
        print(f"An error occurred while fitting {name}: {e}")
        continue

    # End timing
    end_time = time.time()

    # Calculate elapsed time
    elapsed_time = end_time - start_time
    elapsed_time_str = f"{elapsed_time:.2f}s"
    print(f"\n--- Finished RandomizedSearchCV for {name} in {elapsed_time_str} ---\n")

    # Store the search object in the dictionary
    search_results[name] = search

    # Get current date and time for naming
    current_datetime = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

    # Unique filename: model name, timestamp, and training time
    filename = f"saved_models/{name}_{current_datetime}_{elapsed_time_str}.joblib"

    # Save each RandomizedSearchCV object immediately after training
    print(f"Saving RandomizedSearchCV results for {name} as {filename}...\n")
    joblib.dump(search, filename)
    print(f"RandomizedSearchCV for {name} saved successfully.\n")

print("\n--- All Models Processed ---\n")





--- Starting RandomizedSearchCV for Logistic Regression (lbfgs) ---

Model Logistic Regression (lbfgs) already trained. Skipping...

--- Starting RandomizedSearchCV for Logistic Regression (default) ---

Model Logistic Regression (default) already trained. Skipping...

--- Starting RandomizedSearchCV for Random Forest ---

Fitting the model Random Forest with 5 iterations and 5 cross-validation splits...
Fitting 5 folds for each of 5 candidates, totalling 25 fits
[CV 1/5] END classifier__class_weight=balanced, classifier__max_depth=8, classifier__min_samples_leaf=1, classifier__min_samples_split=8, classifier__n_estimators=206;, score=0.458 total time= 1.9min
[CV 2/5] END classifier__class_weight=balanced, classifier__max_depth=8, classifier__min_samples_leaf=1, classifier__min_samples_split=8, classifier__n_estimators=206;, score=0.459 total time= 1.9min
[CV 3/5] END classifier__class_weight=balanced, classifier__max_depth=8, classifier__min_samples_leaf=1, classifier__min_samples_sp

### 1c. Search saving and loading

Save search:

In [None]:
# Imported here so this cell also works when the (optional) training cell,
# which is the only other place that imports os, has not been run.
import os

# Define the directory containing the saved joblib files
saved_models_dir = 'saved_models'

# Initialize the final search_results dictionary to store all models
search_results = {}

# Iterate over each joblib file in the directory (sorted for a stable order)
# NOTE(review): the searches were fitted with a scorer built from functions
# defined in the training cell; unpickling them presumably requires those
# functions to be defined in the current session — confirm.
for filename in sorted(os.listdir(saved_models_dir)):
    if not filename.endswith(".joblib"):
        continue

    # Construct the full path to the file
    filepath = os.path.join(saved_models_dir, filename)

    # Load the model information from the file
    model_info = joblib.load(filepath)

    # The folder also receives plain fitted pipelines from the Optuna cell.
    # Only fitted search objects expose best_estimator_/best_score_, which
    # the evaluation cells rely on, so anything else is left out.
    if not hasattr(model_info, 'best_estimator_'):
        print(f"Skipping {filename}: not a fitted search object.")
        continue

    # Use the filename (without extension) as the key in the search_results dictionary
    model_key = os.path.splitext(filename)[0]

    # Add the loaded model information to the search_results dictionary
    search_results[model_key] = model_info

# Save the combined search_results dictionary to a new joblib file
combined_filename = 'search_results.joblib'
joblib.dump(search_results, combined_filename)
print(f"Combined search results saved successfully as {combined_filename}")


Load search:

In [None]:
# Restore the combined dictionary of search objects from disk.
search_results = joblib.load("search_results.joblib")

## 2. Model evaluation

### 2a. Random search evaluation

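Because the data set is heavily imbalanced, plain accuracy is not informative. We therefore compare models by the cross-validation score stored in each search; the search in section 1b optimises recall at a 5% false-positive rate. We start by printing the best model of each model class from our search.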

In [None]:
# Column holding each search's best cross-validation score. The searches are
# fitted with the recall @ 5% FPR scorer, so the column is labelled with that
# metric rather than AUC.
score_column = 'Best Score (Recall @ 5% FPR)'

# Create a list for storing model information
results_summary = []

# Collect the best model, score, and parameters
for model_name, search in search_results.items():
    best_score = search.best_score_
    best_params = search.best_params_
    results_summary.append({
        'Model': model_name,
        score_column: best_score,  # kept numeric until after sorting
        'Best Parameters': best_params
    })

# Convert to a DataFrame; explicit columns keep this working when
# search_results is empty.
results_df = pd.DataFrame(results_summary, columns=['Model', score_column, 'Best Parameters'])

# Sort on the numeric score first, then format for display. Sorting the
# formatted strings would order them lexicographically.
results_df = results_df.sort_values(by=score_column, ascending=False)
results_df[score_column] = results_df[score_column].map("{:.4f}".format)

# Display the DataFrame in Jupyter
from IPython.display import display

display(results_df)

We can see that the best performing model on the training data is "TODO", with a cross-validation score of "TODO". We will now evaluate this model more closely in the following section.

### 2b. Best model evaluation

We begin the evaluation of the best model by extracting it from search_results.

In [None]:
# Initialize variables to track the best model
best_model_name = None
best_model_score = -float('inf')
best_model_params = None
best_model_object = None
best_classifier = None

# Iterate through the search results to find the best model. The strict '>'
# keeps the first entry on ties, and a NaN score never compares greater, so
# searches without a valid score are ignored.
for model_name, search in search_results.items():
    if search.best_score_ > best_model_score:
        best_model_name = model_name
        best_model_score = search.best_score_
        best_model_params = search.best_params_
        best_model_object = search.best_estimator_

# Extract the classifier from the winning pipeline (stays None when
# search_results is empty or no search has a valid score).
if best_model_object is not None:
    best_classifier = best_model_object.named_steps['classifier']

# Print the best model details. The score is the search's cross-validation
# score, i.e. recall @ 5% FPR, not AUC.
print(f"Best Model Name: {best_model_name}")
print(f"Best Model Score (Recall @ 5% FPR): {best_model_score:.4f}")
print(f"Best Model Parameters: {best_model_params}")

# Print the best classifier object
print(f"Best Classifier Object: {best_classifier}")


### Continued Model Development

Given that LightGBM and XGBoost are the top performing models, we will continue to develop these models further. We will start by tuning the hyperparameters of the models using Optuna, a library that applies Bayesian optimization to hyperparameter tuning. It adaptively explores the search space, focusing on more promising hyperparameter sets, which in theory makes it more effective than RandomizedSearchCV.

In [None]:
import time
import os
from datetime import datetime
from scipy.stats import uniform, randint
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
import optuna
from optuna.storages import RDBStorage
from optuna.pruners import HyperbandPruner
from optuna.samplers import TPESampler
import joblib

# Studies are persisted in a local SQLite file so a run can be resumed.
os.makedirs('saved_studies', exist_ok=True)
storage = RDBStorage(url='sqlite:///saved_studies/optuna_study.db')

# Stratified 5-fold splitter shared by both objectives.
stratified_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Base estimators; the objectives overwrite their hyperparameters per trial.
xgb_model = XGBClassifier(
    random_state=42,
    eval_metric='auc',
    tree_method='hist',
    device='cuda',
)
lgbm_model = LGBMClassifier(random_state=42, device='gpu')

# One pipeline per model; both reuse the same preprocessor instance.
pipeline_xgb = Pipeline(steps=[('preprocessor', preprocessor), ('classifier', xgb_model)])
pipeline_lgbm = Pipeline(steps=[('preprocessor', preprocessor), ('classifier', lgbm_model)])

# Optuna objective function for XGBoost
def objective_xgb(trial):
    """Return the mean cross-validated ROC AUC of ``pipeline_xgb`` for one trial.

    Hyperparameters are sampled from ``trial`` and applied to the module-level
    ``pipeline_xgb``, which is then fitted and scored on every fold of
    ``stratified_cv`` over ``X_train`` / ``y_train``. The pipeline is left
    fitted on the last fold.

    Changes from the previous version:

    * Folds are scored with ROC AUC on the predicted positive-class
      probability. ``Pipeline.score`` reports plain accuracy, which is not
      informative on a heavily imbalanced target.
    * The search space only contains parameters that are valid for this
      setup. The following were removed:

      - ``early_stopping_rounds``: ``fit`` is called without an ``eval_set``.
      - ``process_type`` / ``refresh_leaf``: the 'update' mode needs an
        existing booster.
      - ``monotone_constraints`` / ``interaction_constraints``: hard-coded
        strings unrelated to the actual feature set.
      - ``booster``: 'gblinear' ignores the tree parameters sampled here; the
        estimator default is used instead.
      - ``tree_method`` / ``predictor`` / ``updater`` /
        ``single_precision_histogram``: legacy GPU switches; the estimator is
        already built with ``tree_method='hist', device='cuda'``.
      - ``alpha`` / ``lambda`` / ``num_boost_round``: these duplicate
        ``reg_alpha`` / ``reg_lambda`` / ``n_estimators``.
      - ``learning_rate_decay_factor``, ``learning_rate_decay``,
        ``objective_sample_rate``, ``lambda_bias``: not parameters of the
        tree booster.
      - ``max_cat_to_onehot`` / ``max_cat_threshold``: these apply to native
        categorical support, which is not enabled on the estimator.
      - ``objective``: a single-choice constant equal to the classifier's
        binary default.

    NOTE(review): trials already stored under the previous accuracy-based
    objective are not comparable with these scores — consider a fresh study.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyperparameters.

    Returns
    -------
    float
        Mean ROC AUC over the cross-validation folds.
    """
    params = {
        'classifier__n_estimators': trial.suggest_int('classifier__n_estimators', 50, 2000),
        'classifier__learning_rate': trial.suggest_float('classifier__learning_rate', 0.001, 0.5, log=True),
        'classifier__max_depth': trial.suggest_int('classifier__max_depth', 3, 30),
        'classifier__min_child_weight': trial.suggest_int('classifier__min_child_weight', 1, 20),
        'classifier__gamma': trial.suggest_float('classifier__gamma', 0, 10),
        'classifier__subsample': trial.suggest_float('classifier__subsample', 0.5, 1.0),
        'classifier__colsample_bytree': trial.suggest_float('classifier__colsample_bytree', 0.5, 1.0),
        'classifier__colsample_bylevel': trial.suggest_float('classifier__colsample_bylevel', 0.5, 1.0),
        'classifier__colsample_bynode': trial.suggest_float('classifier__colsample_bynode', 0.5, 1.0),
        'classifier__reg_alpha': trial.suggest_float('classifier__reg_alpha', 1e-3, 20.0, log=True),
        'classifier__reg_lambda': trial.suggest_float('classifier__reg_lambda', 1e-3, 20.0, log=True),
        'classifier__scale_pos_weight': trial.suggest_float('classifier__scale_pos_weight', 1, 20),
        'classifier__max_delta_step': trial.suggest_int('classifier__max_delta_step', 0, 20),
        'classifier__grow_policy': trial.suggest_categorical('classifier__grow_policy', ['depthwise', 'lossguide']),
        'classifier__max_bin': trial.suggest_int('classifier__max_bin', 128, 1024),
        'classifier__num_parallel_tree': trial.suggest_int('classifier__num_parallel_tree', 1, 20),
        'classifier__max_leaves': trial.suggest_int('classifier__max_leaves', 0, 200),
        'classifier__sampling_method': trial.suggest_categorical('classifier__sampling_method', ['uniform', 'gradient_based']),
    }

    pipeline_xgb.set_params(**params)
    scores = []
    start_time = time.time()
    for fold, (train_idx, val_idx) in enumerate(stratified_cv.split(X_train, y_train)):
        fold_start_time = time.time()
        print(f"\n--- Starting fold {fold + 1} for XGBoost ---\n")
        X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
        y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]
        pipeline_xgb.fit(X_tr, y_tr)
        # Rank-based metric on the positive-class probability.
        val_proba = pipeline_xgb.predict_proba(X_val)[:, 1]
        scores.append(roc_auc_score(y_val, val_proba))
        fold_end_time = time.time()
        fold_duration = fold_end_time - fold_start_time
        print(f"--- Finished fold {fold + 1} for XGBoost in {fold_duration:.2f} seconds ---\n")
        # Estimate remaining time for folds from the duration of this fold
        remaining_folds = stratified_cv.get_n_splits() - (fold + 1)
        eta_folds = remaining_folds * fold_duration
        print(f"Estimated time remaining for current trial: {eta_folds:.2f} seconds\n")
    end_time = time.time()
    trial_duration = end_time - start_time
    print(f"\n--- Completed all folds for XGBoost in {trial_duration:.2f} seconds ---\n")
    return sum(scores) / len(scores)

# Define Optuna study with HyperbandPruner and TPESampler
# NOTE(review): the objective shown in this cell never calls trial.report()
# or trial.should_prune(), so the pruner presumably has no effect — confirm.
sampler = TPESampler()
pruner = HyperbandPruner()

# The per-trial and final dumps below go to saved_models/. Create it here,
# because the only other cell that does so is the optional training cell.
os.makedirs('saved_models', exist_ok=True)

# Optimize XGBoost using Optuna
study_xgb = optuna.create_study(direction='maximize', storage=storage, study_name='xgboost_study', load_if_exists=True, sampler=sampler, pruner=pruner)
print("\n--- Starting Optuna optimization for XGBoost ---\n")
total_trials = 200
start_time = time.time()
# One trial per optimize() call so progress can be saved and timed per trial.
for trial_idx in range(total_trials):
    trial_start_time = time.time()
    study_xgb.optimize(objective_xgb, n_trials=1)
    # Save intermediate model after each trial (the pipeline as the
    # objective left it)
    current_datetime = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    intermediate_xgb_filename = f"saved_models/XGBoost_trial_{trial_idx + 1}_{current_datetime}.joblib"
    print(f"Saving intermediate XGBoost model for trial {trial_idx + 1} as {intermediate_xgb_filename}...\n")
    joblib.dump(pipeline_xgb, intermediate_xgb_filename)
    print(f"Intermediate XGBoost model for trial {trial_idx + 1} saved successfully as {intermediate_xgb_filename}.\n")
    trial_end_time = time.time()
    trial_duration = trial_end_time - trial_start_time
    elapsed_time = trial_end_time - start_time
    # ETA extrapolates from the duration of the trial that just finished.
    remaining_trials = total_trials - (trial_idx + 1)
    eta_trials = remaining_trials * trial_duration
    print(f"\n--- Finished trial {trial_idx + 1}/{total_trials} for XGBoost in {trial_duration:.2f} seconds ---\n")
    print(f"Elapsed time: {elapsed_time:.2f} seconds. Estimated time remaining for all trials: {eta_trials:.2f} seconds\n")
end_time = time.time()
print(f"\n--- Finished Optuna optimization for XGBoost in {end_time - start_time:.2f} seconds ---\n")

# Refit on the full training set with the best parameters and save the result
xgb_best_params = study_xgb.best_params
pipeline_xgb.set_params(**xgb_best_params)
pipeline_xgb.fit(X_train, y_train)
current_datetime = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
xgb_filename = f"saved_models/XGBoost_optuna_{current_datetime}.joblib"
print(f"Saving best Optuna XGBoost model as {xgb_filename}...\n")
joblib.dump(pipeline_xgb, xgb_filename)
print(f"Best Optuna XGBoost model saved successfully as {xgb_filename}.\n")

# Optuna objective function for LightGBM
def objective_lgbm(trial):
    """Return the mean cross-validated ROC AUC of ``pipeline_lgbm`` for one trial.

    Hyperparameters are sampled from ``trial`` and applied to the
    'classifier' step of the module-level ``pipeline_lgbm``, which is then
    fitted and scored on every fold of ``stratified_cv`` over ``X_train`` /
    ``y_train``. The pipeline is left fitted on the last fold.

    Changes from the previous version:

    * Parameters are passed to the pipeline with the ``classifier__`` prefix.
      Without it ``Pipeline.set_params`` rejects every key. The Optuna
      parameter names themselves stay unprefixed.
    * Folds are scored with ROC AUC on the predicted positive-class
      probability. ``Pipeline.score`` reports plain accuracy, which is not
      informative on a heavily imbalanced target.
    * The search space only contains parameters that are valid for this
      setup. The following were removed:

      - ``objective``: 'regression' and 'multiclass' do not fit a binary
        classifier; the estimator default is used.
      - ``early_stopping_round``: ``fit`` is called without a validation set.
      - ``feature_fraction``, ``bagging_fraction``, ``bagging_freq``,
        ``lambda_l1``, ``lambda_l2``, ``min_data_in_leaf``,
        ``num_boost_round``: aliases of parameters that are already sampled.
      - ``top_rate`` / ``other_rate``: only used by GOSS, while
        ``boosting_type`` is fixed to 'gbdt'.
      - ``learning_rate_decay_factor``: not a LightGBM parameter.

    NOTE(review): trials already stored under the previous objective are not
    comparable with these scores — consider a fresh study.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyperparameters.

    Returns
    -------
    float
        Mean ROC AUC over the cross-validation folds.
    """
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 1000),
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.5, log=True),
        'max_depth': trial.suggest_int('max_depth', -1, 15),  # Use -1 for no limit on depth
        'num_leaves': trial.suggest_int('num_leaves', 20, 150),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'min_split_gain': trial.suggest_float('min_split_gain', 0.0, 1.0),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'subsample_freq': trial.suggest_int('subsample_freq', 0, 10),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-3, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-3, 10.0, log=True),
        'scale_pos_weight': trial.suggest_float('scale_pos_weight', 1, 10),
        'boosting_type': 'gbdt',  # Fixed, not tuned
        'device': 'gpu',  # This will enable GPU usage
        # NOTE(review): LightGBM's GPU tuning guide recommends a small
        # max_bin (e.g. 63), and the GPU learner is understood to reject
        # histograms above 255 bins — confirm against the installed build.
        'max_bin': trial.suggest_int('max_bin', 63, 255),
    }

    # Address the 'classifier' step of the pipeline.
    pipeline_lgbm.set_params(**{f'classifier__{key}': value for key, value in params.items()})
    scores = []
    start_time = time.time()
    for fold, (train_idx, val_idx) in enumerate(stratified_cv.split(X_train, y_train)):
        fold_start_time = time.time()
        print(f"\n--- Starting fold {fold + 1} for LightGBM ---\n")
        X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
        y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]
        pipeline_lgbm.fit(X_tr, y_tr)
        # Rank-based metric on the positive-class probability.
        val_proba = pipeline_lgbm.predict_proba(X_val)[:, 1]
        scores.append(roc_auc_score(y_val, val_proba))
        fold_end_time = time.time()
        fold_duration = fold_end_time - fold_start_time
        print(f"--- Finished fold {fold + 1} for LightGBM in {fold_duration:.2f} seconds ---\n")
        # Estimate remaining time for folds from the duration of this fold
        remaining_folds = stratified_cv.get_n_splits() - (fold + 1)
        eta_folds = remaining_folds * fold_duration
        print(f"Estimated time remaining for current trial: {eta_folds:.2f} seconds\n")
    end_time = time.time()
    trial_duration = end_time - start_time
    print(f"\n--- Completed all folds for LightGBM in {trial_duration:.2f} seconds ---\n")
    return sum(scores) / len(scores)

# The per-trial and final dumps below go to saved_models/. Create it here,
# because the only other cell that does so is the optional training cell.
os.makedirs('saved_models', exist_ok=True)

# Optimize LightGBM using Optuna
study_lgbm = optuna.create_study(direction='maximize', storage=storage, study_name='lgbm_study', load_if_exists=True, sampler=sampler, pruner=pruner)
print("\n--- Starting Optuna optimization for LightGBM ---\n")
total_trials = 200
start_time = time.time()
# One trial per optimize() call so progress can be saved and timed per trial.
for trial_idx in range(total_trials):
    trial_start_time = time.time()
    study_lgbm.optimize(objective_lgbm, n_trials=1)
    # Save intermediate model after each trial (the pipeline as the
    # objective left it)
    current_datetime = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    intermediate_lgbm_filename = f"saved_models/LightGBM_trial_{trial_idx + 1}_{current_datetime}.joblib"
    print(f"Saving intermediate LightGBM model for trial {trial_idx + 1} as {intermediate_lgbm_filename}...\n")
    joblib.dump(pipeline_lgbm, intermediate_lgbm_filename)
    print(f"Intermediate LightGBM model for trial {trial_idx + 1} saved successfully as {intermediate_lgbm_filename}.\n")
    trial_end_time = time.time()
    trial_duration = trial_end_time - trial_start_time
    elapsed_time = trial_end_time - start_time
    # ETA extrapolates from the duration of the trial that just finished.
    remaining_trials = total_trials - (trial_idx + 1)
    eta_trials = remaining_trials * trial_duration
    print(f"\n--- Finished trial {trial_idx + 1}/{total_trials} for LightGBM in {trial_duration:.2f} seconds ---\n")
    print(f"Elapsed time: {elapsed_time:.2f} seconds. Estimated time remaining for all trials: {eta_trials:.2f} seconds\n")
end_time = time.time()
print(f"\n--- Finished Optuna optimization for LightGBM in {end_time - start_time:.2f} seconds ---\n")

# Refit on the full training set with the best parameters and save the result.
# The study records LightGBM parameters under their bare names, but the
# pipeline only accepts them addressed to its 'classifier' step, so any key
# without the prefix gets it added here.
lgbm_best_params = study_lgbm.best_params
pipeline_lgbm.set_params(**{
    (key if key.startswith('classifier__') else f'classifier__{key}'): value
    for key, value in lgbm_best_params.items()
})
pipeline_lgbm.fit(X_train, y_train)
current_datetime = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
lgbm_filename = f"saved_models/LightGBM_optuna_{current_datetime}.joblib"
print(f"Saving best Optuna LightGBM model as {lgbm_filename}...\n")
joblib.dump(pipeline_lgbm, lgbm_filename)
print(f"Best Optuna LightGBM model saved successfully as {lgbm_filename}.\n")


Next we evaluate the model on the training data. This gives:

### Ruibin: TODO

## 3. Test data evaluation

In [None]:
# Evaluate the best model on the held-out test set.
# X_test is the raw, unpreprocessed split, so predictions must come from the
# full fitted pipeline (best_model_object) and not from the bare classifier
# extracted from it.
y_pred = best_model_object.predict(X_test)

# Prefer positive-class probabilities for the AUC; fall back to hard labels
# when the classifier cannot produce probabilities.
if hasattr(best_model_object.named_steps['classifier'], 'predict_proba'):
    y_pred_proba = best_model_object.predict_proba(X_test)[:, 1]
else:
    y_pred_proba = y_pred

# Print the best model and its cross-validation score. best_model_score is
# the score of the selected model; the search optimised recall @ 5% FPR.
print(f"\nBest Model: {best_model_name}")
print(f"Best Cross-Validation Score (Recall @ 5% FPR): {best_model_score:.4f}")

# Print classification report
print(classification_report(y_test, y_pred))

# Calculate and print AUC score on the test set
auc_score = roc_auc_score(y_test, y_pred_proba)
print(f"AUC Score on Test Set: {auc_score:.4f}")