# Model Development

Import libraries/packages + preprocessed data

In [1]:
import sys
import os
sys.path.append(os.path.abspath('../'))
from src.custom_transformers import remove_prefix
from src.build_dnn_model import build_nn_model
import pandas as pd
import numpy as np
from sklearn import svm, ensemble, linear_model
from sklearn.neighbors import KNeighborsClassifier
from lightgbm import LGBMClassifier
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV
import joblib
import matplotlib.pyplot as plt
##Neural Network
import tensorflow as tf
import keras_tuner as kt
from keras import layers, optimizers, metrics, callbacks, losses, initializers, Sequential, regularizers
from scikeras.wrappers import KerasClassifier
from sklearn.metrics import auc, roc_curve

from src.config import SEED

## Raw (un-processed) training split: features from parquet, labels from Excel
raw_X_train = pd.read_parquet('../data/raw/split/Raw_X_train.parquet')
# The first column of the label sheet is used as the DataFrame index
raw_y_train = pd.read_excel('../data/raw/split/Raw_y_train.xlsx', index_col=0)

## Already-transformed feature matrices
ml_X_train = pd.read_parquet('../data/processed/ml_train_transformed.parquet')
ml_X_test = pd.read_parquet('../data/processed/ml_test_transformed.parquet')
nomo_X_train = pd.read_parquet('../data/processed/nomo_train_transformed.parquet')

## Pre-processing pipeline persisted with joblib
ml_pipeline = joblib.load('../data/processed/ml_preprocessing_pipeline.pkl')

# RF, LightGBM, SVC, KNN, LR-Nomogram

In [None]:
### Param search space
# Maps a short model key to:
#   'model'  -> an unfitted estimator instance
#   'params' -> candidate values per hyper-parameter, passed to
#               RandomizedSearchCV as `param_distributions` by tune_w_random
model_params = {
    'RF': {
        'model': ensemble.RandomForestClassifier(random_state=SEED),
        'params': {
            'n_estimators': np.arange(5, 500, 10),
            'criterion': ['gini', 'entropy', 'log_loss'],
            'max_depth': np.arange(3, 50, 3),
            'min_samples_split': np.arange(2, 15, 1),
            'min_samples_leaf': np.arange(1, 15, 1),
            'min_weight_fraction_leaf': np.linspace(0, 0.5, 10),
            'max_features': ['sqrt', 'log2', None],
            'max_leaf_nodes': np.arange(3, 50, 5),
            'min_impurity_decrease': np.linspace(0, 3, 15),
            'class_weight': ['balanced', 'balanced_subsample', None],
            'max_samples': np.linspace(0.01, 1, 15),
        },
    },
    'LightGBM': {
        'model': LGBMClassifier(
            random_state=SEED,
            n_jobs=-1,
            class_weight='balanced',  # Handles class imbalance automatically
            verbosity=-1,
        ),
        'params': {
            'num_leaves': np.arange(5, 105, 10),
            # NOTE(review): LightGBM documents `min_child_samples` as an alias
            # of `min_data_in_leaf`; searching both likely means one of the two
            # is ignored on every draw -- confirm and keep only one.
            'min_data_in_leaf': np.arange(50, 500, 25),
            'max_depth': [3, 5, 7, 10, -1],  # -1 means no limit

            'n_estimators': [50, 100, 200, 300],
            'learning_rate': [0.01, 0.05, 0.1],

            'min_child_samples': [5, 10, 20, 40],
            # NOTE(review): row subsampling presumably needs `subsample_freq` > 0
            # to take effect in LightGBM -- verify against the library docs.
            'subsample': [0.6, 0.8, 1.0],
            'colsample_bytree': [0.6, 0.8, 1.0],
            'reg_alpha': [0, 0.1, 1],
            'reg_lambda': [0, 0.1, 1],
        },
    },
    'SVC': {
        'model': svm.SVC(probability=True, random_state=SEED),
        'params': {
            'C': np.linspace(0.005, 10, 20),
            'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
            # Only used for poly
            'degree': [2, 3, 4, 5, 6, 7, 8],
            # Only used for rbf, poly, sigmoid.
            # Ten evenly spaced numeric candidates in [0.05, 2]. The previous
            # np.arange(0.05, 2, 10) treated 10 as the *step* and therefore
            # produced the single value 0.05.
            'gamma': ['scale', 'auto'] + list(np.linspace(0.05, 2, 10)),
            # Only used for poly and sigmoid
            'coef0': np.linspace(0.0, 1.5, 20),
            'shrinking': [True, False],
            'class_weight': ['balanced', None, {0:1, 1:1.5}, {0:1, 1:2}, {0:1, 1:3}, {0:1, 1:4}],
        },
    },
    'KNN': {
        'model': KNeighborsClassifier(),
        'params': {
            'n_neighbors': [3, 4, 5, 6],
            'weights': ['uniform', 'distance', None],
            'algorithm': ['ball_tree', 'kd_tree', 'brute', 'auto'],
            # Only used for ball_tree or kd_tree
            'leaf_size': np.arange(15, 45, 5),
            'p': [1, 2],
            # NOTE(review): some metric/algorithm pairs (e.g. 'cosine' or
            # 'nan_euclidean' with a tree algorithm) may be rejected by
            # scikit-learn and then score as failed fits -- confirm.
            'metric': ['cityblock', 'cosine', 'euclidean', 'l1', 'l2', 'manhattan', 'nan_euclidean', 'minkowski'],
        },
    },
    'NLR': {
        'model': linear_model.LogisticRegression(max_iter=10000, random_state=SEED,
                                                 solver='liblinear'),
        'params': {
            'penalty': ['l1', 'l2'],
            'C': np.linspace(0.01, 3, 50),
            'class_weight': ['balanced', None, {0:1, 1: 1.15}, {0:1, 1: 1.25}, {0:1, 1: 1.5}, {0:1, 1: 1.75}, {0:1, 1: 2}, {0:1, 1: 2.5}, {0:1, 1: 3}],
            #'l1_ratio': np.linspace(0.01,1, 20)
        },
    },
}

def tune_w_random(X, y,
                  model_name, n_iters,
                  optimize_for, random_state = SEED,
                  export_model = False):
    """Run a randomized hyper-parameter search for one entry of `model_params`.

    Args:
        X: Feature matrix passed to `RandomizedSearchCV.fit`.
        y: Target labels passed to `RandomizedSearchCV.fit`.
        model_name: Key into `model_params` (raises KeyError if unknown).
        n_iters: Value forwarded as `n_iter` to `RandomizedSearchCV`.
        optimize_for: Scoring name forwarded as `scoring`.
        random_state: Seed for the parameter sampling. The CV splitter is
            always seeded with `SEED`, regardless of this argument.
        export_model: When True, the best estimator is written to
            `../models/<model_name>.joblib`.

    Returns:
        `search.best_estimator_`, i.e. the best configuration as refit by
        RandomizedSearchCV on the full data passed in.
    """
    spec = model_params[model_name]

    # Shuffled, stratified 5-fold split with a fixed seed
    cv_splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)
    search = RandomizedSearchCV(
        estimator=spec['model'],
        param_distributions=spec['params'],
        n_iter=n_iters,
        scoring=optimize_for,
        cv=cv_splitter,
        n_jobs=-1,
        random_state=random_state
    )
    search.fit(X, y)

    print(f"{model_name} Best Params: {search.best_params_}")
    print(f"{model_name} Best Average {optimize_for}: {search.best_score_:.4f}")

    best_model = search.best_estimator_
    if export_model:
        # Create the target directory first so a finished search is not lost
        # to a missing-folder error at dump time.
        os.makedirs('../models', exist_ok=True)
        joblib.dump(best_model, f'../models/{model_name}.joblib')
    # NOTE: RandomizedSearchCV automatically retrains the best model on the entire train set
    return best_model


In [None]:
# LightGBM: randomized search on the ML feature set, scored by ROC AUC
lightgbm_model = tune_w_random(
    X=ml_X_train,
    y=raw_y_train.values.ravel(),
    model_name='LightGBM',
    n_iters=500,
    optimize_for='roc_auc',
)

In [None]:
# SVC: randomized search on the ML feature set, scored by ROC AUC
svc_model = tune_w_random(
    X=ml_X_train,
    y=raw_y_train.values.ravel(),
    model_name='SVC',
    n_iters=500,
    optimize_for='roc_auc',
)

In [None]:
# KNN: randomized search on the ML feature set, scored by ROC AUC
knn_model = tune_w_random(
    X=ml_X_train,
    y=raw_y_train.values.ravel(),
    model_name='KNN',
    n_iters=500,
    optimize_for='roc_auc',
)

In [None]:
# Logistic regression ('NLR'): tuned on the nomogram feature set rather than
# the ML feature set, scored by ROC AUC
nomo_model = tune_w_random(
    X=nomo_X_train,
    y=raw_y_train.values.ravel(),
    model_name='NLR',
    n_iters=500,
    optimize_for='roc_auc',
)

# DNN

Set up search space for tuning

In [None]:
# Class counts from the flattened label array (labels are 0 / 1)
_labels = raw_y_train.values.ravel()
n_pos = (_labels == 1).sum()
n_neg = (_labels == 0).sum()

# Positive class proportion P(y = 1). The previous version divided the positive
# count by the negative count (the odds) and then applied the logit formula to
# it, which does not yield the log-odds of the positive class.
pos = n_pos / (n_pos + n_neg)
# Initial output-layer bias: logit of the prevalence, i.e. log(n_pos / n_neg)
log_bias = float(np.log(pos / (1 - pos)))

##Function used to build model (with search space) for keras-tuner
def model_builder(hp):
    """Build and compile a 2-hidden-layer binary classifier from tuner choices.

    Args:
        hp: keras-tuner hyper-parameter container; every `hp.Choice` / `hp.Int`
            call below registers one searchable dimension.

    Returns:
        A compiled `Sequential` model (SGD optimizer, binary focal
        cross-entropy loss, ROC AUC metric reported under the name 'AUCROC').
    """
    hp_activation = hp.Choice('activation', values = ['relu', 'leaky_relu', 'elu'])

    #Layer size
    hp_layer_1 = hp.Int('layer_1', min_value =5, max_value = 100, step = 5)
    hp_layer_2 = hp.Int('layer_2', min_value =5, max_value = 100, step = 5)
    #hp_layer_3 = hp.Int('layer_3', min_value =5, max_value = 100, step = 5)
    #Dropout rate
    hp_drop_1 = hp.Choice('drop_1', values = [0.25, 0.5, 0.75, 0.9, 0.99999])
    hp_drop_2 = hp.Choice('drop_2', values = [0.25, 0.5, 0.75, 0.9, 0.99999])
    #hp_drop_3 = hp.Choice('drop_3', values = [0.25, 0.5, 0.75, 0.9, 0.99999])
    # Regularization
    hp_l2 = hp.Choice('l2', values=[0.0, 0.001, 0.01, 0.05, 0.1])
    #Loss
    hp_gamma = hp.Choice('gamma', values= [0.5, 1.0, 1.5, 2.0, 2.5])
    hp_alpha = hp.Choice('alpha', values=[0.1, 0.15, 0.2, 0.25, 0.3])
    hp_loss = losses.BinaryFocalCrossentropy(gamma=hp_gamma, alpha=hp_alpha)
    #Optimizer
    hp_lr = hp.Choice('learning_rate', values = [0.001, 0.005, 0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2])
    hp_momentum = hp.Choice('momentum', values = [0.0, 0.2, 0.4, 0.6, 0.8, 0.9])

    model = Sequential()
    #Input layer
    # One input per feature column. Taken from the frame's shape so it does not
    # depend on a row labelled 0 being present in the index.
    model.add(layers.Input(shape = ml_X_train.shape[1:]))
    #Hidden layer 1: Dense -> BatchNorm -> activation -> Dropout
    model.add(layers.Dense(units = hp_layer_1,
                           kernel_initializer = initializers.HeUniform(seed=SEED),
                           bias_initializer = 'zeros',
                           kernel_regularizer = regularizers.l2(hp_l2)))
    model.add(layers.BatchNormalization())
    model.add(layers.Activation(hp_activation))
    model.add(layers.Dropout(hp_drop_1))
    #Hidden layer 2: same layout as hidden layer 1
    model.add(layers.Dense(units = hp_layer_2,
                           kernel_initializer = initializers.HeUniform(seed=SEED),
                           bias_initializer = 'zeros',
                           kernel_regularizer = regularizers.l2(hp_l2)))
    model.add(layers.BatchNormalization())
    model.add(layers.Activation(hp_activation))
    model.add(layers.Dropout(hp_drop_2))

    # NOTE: Param code for third layer is commented out since tuning with 2 layers was explored after 3-layer tuning
    ## Ultimately, 2 layers out-performed 3 layers in average AUROC across CV folds
    ################################### Uncomment below for 3-layer tuning ###################################
    #Hidden layer 3
    #model.add(layers.Dense(units = hp_layer_3,
    #                                kernel_initializer= initializers.HeUniform(seed=SEED),
    #                                bias_initializer='zeros'))
    #model.add(layers.BatchNormalization())
    #model.add(layers.Activation(hp_activation))
    #model.add(layers.Dropout(hp_drop_3))
    #########################################################################################################

    #Output layer: single sigmoid unit whose bias starts at the module-level `log_bias`
    model.add(layers.Dense(units = 1, activation = 'sigmoid',
                           kernel_initializer = initializers.GlorotUniform(seed=SEED),
                           bias_initializer = initializers.Constant(log_bias)))

    model.compile(optimizers.SGD(learning_rate=hp_lr, momentum=hp_momentum),
                  loss = hp_loss,
                  metrics = [metrics.AUC(curve = 'ROC',name = 'AUCROC')])
    return model


Tune architecture + hyper-parameters w/ stratified 5-fold CV

In [None]:
# One independent Hyperband search per stratified CV fold; the best trial's
# score and hyper-parameters are collected for every fold.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)

all_best_scores = []
all_best_hps = []

for fold_no, (train_idx, val_idx) in enumerate(skf.split(raw_X_train, raw_y_train), start=1):
    #Get raw train/val rows for this fold
    X_train, X_val = raw_X_train.iloc[train_idx], raw_X_train.iloc[val_idx]
    y_train, y_val = raw_y_train.iloc[train_idx], raw_y_train.iloc[val_idx]

    # Fit the preprocessing on the fold's training rows only, then apply it to
    # the validation rows. NOTE: this re-fits the shared `ml_pipeline` object.
    train_transformed = ml_pipeline.fit_transform(X_train)
    val_transformed = ml_pipeline.transform(X_val)

    tuner = kt.Hyperband(
        hypermodel=model_builder,
        objective=kt.Objective('val_AUCROC', 'max'),
        max_epochs=200,
        factor=3,
        seed=SEED,
        directory='../nn_tune_dir',
        project_name=f'x_fold_{fold_no}'
    )

    # AUROC must be maximised. State the direction explicitly: with the default
    # mode='auto' some Keras versions infer it from the metric name, and the
    # custom name 'AUCROC' may then be treated as a quantity to minimise.
    stop_early = callbacks.EarlyStopping(monitor='val_AUCROC', mode='max', patience=3)
    # NOTE(review): Hyperband schedules the per-trial epoch budget itself, so
    # `epochs=30` is presumably overridden -- confirm against keras-tuner docs.
    tuner.search(train_transformed, y_train,
                 epochs=30,
                 validation_data=(val_transformed, y_val),
                 callbacks=[stop_early])
    best_trial = tuner.oracle.get_best_trials(num_trials=1)[0]
    all_best_scores.append(best_trial.score)
    all_best_hps.append(tuner.get_best_hyperparameters(1)[0])

mean_score = np.mean(all_best_scores)
std_score = np.std(all_best_scores)
print(f"Mean CV val_AUCROC: {mean_score:.3f} ± {std_score:.3f}")

In [None]:
# Per-fold report: best score followed by the winning hyper-parameter values
print('-------------- Best HPs for each fold --------------')
for fold_no, (fold_hps, fold_score) in enumerate(zip(all_best_hps, all_best_scores), start=1):
    print(f"**************Fold {fold_no} best hyperparameters:**************")
    print(fold_score)
    print(fold_hps.values)

## Uncomment layer_3 and drop_3 entries for 3-layer tuning
numeric_keys = [
    'layer_1',
    'layer_2',
    #'layer_3',
    'drop_1',
    'drop_2',
    #'drop_3',
    'l2',
    'gamma',
    'alpha',
    'learning_rate',
    'momentum',
]
print('\t\t')
print('-------------- Average (numerical) hyper-parameter values amongst best combinations --------------')
# Mean of each numeric hyper-parameter over the folds whose best set contains it
average_hps = {
    name: np.mean([fold_hps[name] for fold_hps in all_best_hps if name in fold_hps])
    for name in numeric_keys
}

for name in sorted(average_hps):
    print(f"{name}: {average_hps[name]:.4f}")

Set final architecture + hyper-params

In [None]:
## Final hyper-parameters used for the DNN
# Architecture, regularization, loss and optimizer settings
chosen_hps = dict(
    activation='relu',
    layer_1=48,
    layer_2=35,
    drop_1=0.4,
    drop_2=0.6,
    l2=0.0022,
    gamma=1.2,
    alpha=0.22,
    learning_rate=0.0970,
    momentum=0.8,
)
## The 5 tuner bookkeeping entries below are arbitrary
chosen_hps.update({
    'tuner/epochs': 100,
    'tuner/initial_epoch': 20,
    'tuner/bracket': 2,
    'tuner/round': 2,
    'tuner/trial_id': '0035',
})

Note: 82 epochs chosen

In [None]:
# Pick the number of training epochs: for every CV fold, a fresh model is
# trained once per candidate epoch count and scored by validation AUROC.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)
all_val_metrics = []
##NOTE: Candidate epoch counts are range(min_epochs, max_epochs, step),
## i.e. max_epochs itself is not evaluated.
min_epochs = 80
max_epochs = 86
step = 1

for fold_no, (train_idx, val_idx) in enumerate(skf.split(raw_X_train, raw_y_train), start=1):
    #Get raw train/val rows for this fold
    X_train, X_val = raw_X_train.iloc[train_idx], raw_X_train.iloc[val_idx]
    y_train, y_val = raw_y_train.iloc[train_idx], raw_y_train.iloc[val_idx]

    # Fit the preprocessing on the fold's training rows only.
    # NOTE: this re-fits the shared `ml_pipeline` object.
    train_transformed = ml_pipeline.fit_transform(X_train)
    val_transformed = ml_pipeline.transform(X_val)

    fold_val_metrics = []
    for epoch in range(min_epochs, max_epochs, step):
        model = KerasClassifier(
            model = build_nn_model,
            # Width of the matrix the model is actually fit on; avoids relying
            # on a row labelled 0 existing in ml_X_train's index.
            input_shape = train_transformed.shape[1:],
            hidden_layer_1 = chosen_hps.get('layer_1'),
            hidden_layer_2 = chosen_hps.get('layer_2'),
            #hidden_layer_3 = chosen_hps.get('layer_3'),
            dropout_1 = chosen_hps.get('drop_1'),
            dropout_2 = chosen_hps.get('drop_2'),
            #dropout_3 = chosen_hps.get('drop_3'),
            l2 = chosen_hps.get('l2'),
            learning_rate = chosen_hps.get('learning_rate'),
            #Log bias calculated earlier (few cells up)
            log_bias = log_bias,
            activation = chosen_hps.get('activation'),
            momentum = chosen_hps.get('momentum'),
            loss_func = losses.BinaryFocalCrossentropy(gamma=chosen_hps.get('gamma'), alpha=chosen_hps.get('alpha')),
            seed = SEED, #model builder random state
            epochs = epoch,
            verbose = 0,
            random_state= SEED #keras classifier random state
        )
        model.fit(train_transformed, y_train)
        # Validation AUROC from the predicted probability of the positive class
        y_pred_proba = model.predict_proba(val_transformed)[:, 1]
        fpr, tpr, thresholds = roc_curve(y_val, y_pred_proba)
        fold_val_metrics.append(auc(fpr, tpr))
    all_val_metrics.append(fold_val_metrics)
    print(f'Done with fold {fold_no}')

# Compute average validation metric per epoch count (mean over folds)
avg_val_metrics = np.mean(all_val_metrics, axis=0)
# Map the winning position in the sweep back to its epoch count
best_epoch = (np.argmax(avg_val_metrics) * step) + min_epochs

best_score = np.max(avg_val_metrics)
print(f"Best CV val AUROC: {best_epoch} Epochs with score of {best_score:.3f}")

Fit final model

In [None]:
# Final DNN: chosen hyper-parameters, trained for `best_epoch` epochs
nn_model = KerasClassifier(
   model = build_nn_model,
   # One input per feature column; read from the frame's shape so it does not
   # depend on a row labelled 0 being present in the index.
   input_shape = ml_X_train.shape[1:],
   hidden_layer_1 = chosen_hps.get('layer_1'),
   hidden_layer_2 = chosen_hps.get('layer_2'),
   #hidden_layer_3 = chosen_hps.get('layer_3'),
   dropout_1 = chosen_hps.get('drop_1'),
   dropout_2 = chosen_hps.get('drop_2'),
   #dropout_3 = chosen_hps.get('drop_3'),
   l2 = chosen_hps.get('l2'),
   learning_rate = chosen_hps.get('learning_rate'),
   log_bias = log_bias,
   activation = chosen_hps.get('activation'),
   momentum = chosen_hps.get('momentum'),
   loss_func = losses.BinaryFocalCrossentropy(gamma=chosen_hps.get('gamma'), alpha=chosen_hps.get('alpha')),
   seed = SEED, # model builder random state
   epochs = best_epoch,
   verbose = 1,
   random_state= SEED # keras classifier random state
)

##Train model with tuned hyper params on the full processed training set
nn_model.fit(ml_X_train, raw_y_train)

Plot history and (optionally) export model

In [None]:
# Per-epoch training metrics recorded by the fitted KerasClassifier
history = nn_model.history_

### Plot AUROC and loss over epochs (one figure each)
# AUROC
train_auc_roc = history['AUCROC']
auc_fig, auc_ax = plt.subplots(figsize=(10, 6))
auc_ax.plot(train_auc_roc, label='Train AUC-ROC')
auc_ax.set_xlabel('Epochs')
auc_ax.set_ylabel('AUC-ROC')
auc_ax.set_title('AUC-ROC Over Epochs')
auc_ax.legend()
auc_ax.grid(True)
plt.show()

# Loss
train_loss = history['loss']
loss_fig, loss_ax = plt.subplots(figsize=(10, 6))
loss_ax.plot(train_loss, label='Train Loss')
loss_ax.set_xlabel('Epochs')
loss_ax.set_ylabel('Loss')
loss_ax.set_title('Loss Over Epochs')
loss_ax.legend()
loss_ax.grid(True)
plt.show()

# Persist the fitted DNN wrapper
joblib.dump(nn_model, '../models/DNN.joblib')


# Stack

In [None]:
# Base learners: the tuned LightGBM / SVC / KNN models plus the DNN
base_learners = [
    ('LightGBM', lightgbm_model),
    ('SVC', svc_model),
    ('KNN', knn_model),
    ('Neural Network', nn_model),
]
# A logistic regression combines the base learners' outputs
meta_learner = linear_model.LogisticRegression()

stack_model = ensemble.StackingClassifier(
    estimators=base_learners,
    cv=5,
    final_estimator=meta_learner,
)
stack_model.fit(ml_X_train, raw_y_train.values.ravel())

# Persist the fitted stacking ensemble
joblib.dump(stack_model, '../models/stack.joblib')