In this notebook, we build and evaluate several machine learning models to predict a target variable from a given dataset. We start by loading and preprocessing the data, including handling missing values and encoding categorical features. After splitting the data into training and testing sets, we train multiple models (Logistic Regression, Random Forest, XGBoost, and a Neural Network) and save the trained models for future use. Hyperparameters are tuned with Optuna, GridSearchCV, and RandomizedSearchCV. The purpose of this project is to compare the performance of the different models and select the best one for our prediction task.

# Libraries

In [1]:
# !python -m pip install scikit-learn==1.3.1
import pandas as pd
import numpy as np
from sklearn.preprocessing import TargetEncoder, LabelEncoder, OrdinalEncoder, MinMaxScaler, OneHotEncoder, power_transform, PowerTransformer
import pickle
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
import dill
import os
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score, log_loss
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier 
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.impute import KNNImputer
from xgboost import XGBClassifier
import imblearn
from imblearn.under_sampling import RandomUnderSampler
from helper_functions import *
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense, Input, BatchNormalization, Dropout
from keras.callbacks import EarlyStopping
import optuna
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
import warnings
# Silence every warning for the rest of the kernel session so cell outputs stay compact.
# NOTE(review): this is a blanket filter, so deprecation warnings raised by the
# libraries imported above are hidden as well; consider narrowing it.
warnings.filterwarnings('ignore')

  from .autonotebook import tqdm as notebook_tqdm


# Load Data

In [None]:
# Read the raw application table through the project's load_data helper.
# The file name is relative, so it is resolved against the notebook's working directory.
df = load_data('application_train.parquet')

In [3]:
# Display the loaded frame; pandas renders only the first and last rows and elides the middle columns.
df 

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,100002,1,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
1,100003,0,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
2,100004,0,Revolving loans,M,Y,Y,0,67500.0,135000.0,6750.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
3,100006,0,Cash loans,F,N,Y,0,135000.0,312682.5,29686.5,...,0,0,0,0,,,,,,
4,100007,0,Cash loans,M,N,Y,0,121500.0,513000.0,21865.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
307506,456251,0,Cash loans,M,N,N,0,157500.0,254700.0,27558.0,...,0,0,0,0,,,,,,
307507,456252,0,Cash loans,F,N,Y,0,72000.0,269550.0,12001.5,...,0,0,0,0,,,,,,
307508,456253,0,Cash loans,F,N,Y,0,153000.0,677664.0,29979.0,...,0,0,0,0,1.0,0.0,0.0,1.0,0.0,1.0
307509,456254,1,Cash loans,F,N,Y,0,171000.0,370107.0,20205.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0


# Drop Features

In [4]:
# Drop columns that are not needed as model inputs (currently only SK_ID_CURR).
df = drop_features(
    df,
    features_to_drop=['SK_ID_CURR'],
)

In [5]:
# Display the frame after the drop to confirm SK_ID_CURR is no longer among the columns.
df

Unnamed: 0,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,1,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,351000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
1,0,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,1129500.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
2,0,Revolving loans,M,Y,Y,0,67500.0,135000.0,6750.0,135000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
3,0,Cash loans,F,N,Y,0,135000.0,312682.5,29686.5,297000.0,...,0,0,0,0,,,,,,
4,0,Cash loans,M,N,Y,0,121500.0,513000.0,21865.5,513000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
307506,0,Cash loans,M,N,N,0,157500.0,254700.0,27558.0,225000.0,...,0,0,0,0,,,,,,
307507,0,Cash loans,F,N,Y,0,72000.0,269550.0,12001.5,225000.0,...,0,0,0,0,,,,,,
307508,0,Cash loans,F,N,Y,0,153000.0,677664.0,29979.0,585000.0,...,0,0,0,0,1.0,0.0,0.0,1.0,0.0,1.0
307509,1,Cash loans,F,N,Y,0,171000.0,370107.0,20205.0,319500.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0


# Impute Data

In [6]:
# impute missing values
# NOTE(review): this runs on the full frame before the train/test split below.
# If clean_data learns its fill values from the data, the test rows contribute
# to them; confirm against the helper's implementation.
df = clean_data(df)

In [7]:
# Grand total of missing cells over the whole frame; 0 means nothing is left to impute.
(
    df
    .isnull()   # boolean mask of missing cells (alias of isna)
    .sum()      # missing count per column
    .sum()      # total across columns
)

0

In [8]:
# Count rows per TARGET class to see how imbalanced the labels are
# (the recorded output shows class 0 outnumbering class 1 by roughly 11 to 1).
df['TARGET'].value_counts()

TARGET
0    282686
1     24825
Name: count, dtype: int64

# Encode Data

In [9]:
# Names of all object-dtype columns; these are the ones handed to the encoder.
categorical_cols = list(df.select_dtypes(include=['object']).columns)

# Replace the categorical columns with numeric values via scikit-learn's TargetEncoder,
# using the TARGET column as the label.
# NOTE(review): the encoder is applied to the full frame before the train/test split,
# so labels of rows that later land in the test set may shape the encoding (target
# leakage). Confirm how encode_data fits the encoder, or encode after splitting.
df = encode_data(
    df,
    'TARGET',
    categorical_cols,
    train=True,
    model=TargetEncoder,
)


<class 'pandas.core.frame.DataFrame'>
<class 'numpy.ndarray'>


In [10]:
# Display the encoded frame; the former string columns now hold float values.
df

Unnamed: 0,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,1,0.083459,0.101419,0.085002,0.079616,0,202500.0,406597.5,24700.5,351000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
1,0,0.083459,0.069993,0.085002,0.083249,0,270000.0,1293502.5,35698.5,1129500.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
2,0,0.054784,0.101419,0.072437,0.079616,0,67500.0,135000.0,6750.0,135000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
3,0,0.083459,0.069993,0.085002,0.079616,0,135000.0,312682.5,29686.5,297000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
4,0,0.083459,0.101419,0.085002,0.079616,0,121500.0,513000.0,21865.5,513000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
307506,0,0.083459,0.101419,0.085002,0.083249,0,157500.0,254700.0,27558.0,225000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
307507,0,0.083459,0.069993,0.085002,0.079616,0,72000.0,269550.0,12001.5,225000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
307508,0,0.083459,0.069993,0.085002,0.079616,0,153000.0,677664.0,29979.0,585000.0,...,0,0,0,0,1.0,0.0,0.0,1.0,0.0,1.0
307509,1,0.083459,0.069993,0.085002,0.079616,0,171000.0,370107.0,20205.0,319500.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0


# Split train_test 

In [11]:
# Partition the encoded frame into train/test features and labels, with TARGET as the label.
# No feature subset is selected, no extra columns are dropped, and balanced_data is off.
# NOTE(review): no seed is visible at this call site; check split_data for a fixed
# random_state if the split must be reproducible.
X_train, X_test, y_train, y_test = split_data(
    df,
    'TARGET',
    feature_selected=None,
    features_dropped=[],
    balanced_data=False,
)

# Persist the four pieces together as one tuple so later work can reuse this exact split.
with open('splitted_train_test.pickle', 'wb') as split_file:
    dill.dump((X_train, X_test, y_train, y_test), split_file)

# Train Models

### Logistic model

In [12]:
# Fit a LogisticRegression on the training split through the shared train_model helper.
# No param_grid is passed here, so whatever train_model does by default applies.
logistic_model = train_model(
    model_class=LogisticRegression,
    X_train=X_train,
    y_train=y_train,
)

# Serialize the fitted model with dill for later reuse.
with open('trained_logistic_model.pickle', 'wb') as model_file:
    dill.dump(logistic_model, model_file)

### Random Forest Model

In [13]:
# Candidate hyperparameter values for the random forest search.
param_grid = dict(
    n_estimators=[50, 100],
    max_depth=[3, 5],
    min_samples_split=[2, 5],
    min_samples_leaf=[1, 2],
    bootstrap=[True, False],
)

# Tune and fit a RandomForestClassifier with 10 trials.
# best_combination=True presumably makes train_model compare its search strategies;
# the recorded output ends with a "Best model: ..." line. Confirm in helper_functions.
random_forest_model = train_model(
    model_class=RandomForestClassifier,
    X_train=X_train,
    y_train=y_train,
    param_grid=param_grid,
    best_combination=True,
    n_trials=10,
)

# Serialize the fitted model with dill for later reuse.
with open('trained_random_forest_model.pickle', 'wb') as model_file:
    dill.dump(random_forest_model, model_file)

[I 2025-02-20 15:36:00,698] A new study created in memory with name: no-name-0a08e9db-ffdd-4071-a5f2-4ba5cc73b7c3
[I 2025-02-20 15:36:03,686] Trial 0 finished with value: 0.7234611585186985 and parameters: {'n_estimators': 100, 'max_depth': 3, 'min_samples_split': 5, 'min_samples_leaf': 1, 'bootstrap': False}. Best is trial 0 with value: 0.7234611585186985.
[I 2025-02-20 15:36:07,701] Trial 1 finished with value: 0.7392406754641911 and parameters: {'n_estimators': 100, 'max_depth': 5, 'min_samples_split': 2, 'min_samples_leaf': 1, 'bootstrap': True}. Best is trial 1 with value: 0.7392406754641911.
[I 2025-02-20 15:36:09,660] Trial 2 finished with value: 0.7378567568208463 and parameters: {'n_estimators': 50, 'max_depth': 5, 'min_samples_split': 2, 'min_samples_leaf': 1, 'bootstrap': True}. Best is trial 1 with value: 0.7392406754641911.
[I 2025-02-20 15:36:13,373] Trial 3 finished with value: 0.7406062814849531 and parameters: {'n_estimators': 100, 'max_depth': 5, 'min_samples_split': 

Best model: Optuna with AUC: 0.7432408096124732


### XGBoost Model

In [14]:
# Candidate hyperparameter values for the XGBoost search.
# Note that this rebinds the notebook-level name param_grid.
param_grid = dict(
    n_estimators=[50, 100],
    max_depth=[3, 5],
    learning_rate=[0.01, 0.1],
    gamma=[0, 0.1],
    subsample=[0.5, 0.8],
    colsample_bytree=[0.5, 0.8],
    reg_alpha=[0, 0.1],
    reg_lambda=[0, 0.1],
)

# Tune and fit an XGBClassifier with 10 trials, using the same train_model helper
# and flags as the random forest cell.
xgb_model = train_model(
    model_class=xgb.XGBClassifier,
    X_train=X_train,
    y_train=y_train,
    param_grid=param_grid,
    best_combination=True,
    n_trials=10,
)

# Serialize the fitted model with dill for later reuse.
with open('trained_xgb_model.pickle', 'wb') as model_file:
    dill.dump(xgb_model, model_file)

[I 2025-02-20 15:37:21,732] A new study created in memory with name: no-name-991a88c9-6053-4695-85ee-09d5572824e7
[I 2025-02-20 15:37:22,103] Trial 0 finished with value: 0.7663360490505007 and parameters: {'n_estimators': 100, 'max_depth': 3, 'learning_rate': 0.1, 'gamma': 0, 'subsample': 0.5, 'colsample_bytree': 0.5, 'reg_alpha': 0, 'reg_lambda': 0.1}. Best is trial 0 with value: 0.7663360490505007.
[I 2025-02-20 15:37:22,439] Trial 1 finished with value: 0.7520702958361969 and parameters: {'n_estimators': 50, 'max_depth': 5, 'learning_rate': 0.01, 'gamma': 0.1, 'subsample': 0.8, 'colsample_bytree': 0.5, 'reg_alpha': 0.1, 'reg_lambda': 0.1}. Best is trial 0 with value: 0.7663360490505007.
[I 2025-02-20 15:37:22,696] Trial 2 finished with value: 0.7550842467208951 and parameters: {'n_estimators': 50, 'max_depth': 3, 'learning_rate': 0.1, 'gamma': 0, 'subsample': 0.8, 'colsample_bytree': 0.8, 'reg_alpha': 0.1, 'reg_lambda': 0}. Best is trial 0 with value: 0.7663360490505007.
[I 2025-02

Best model: RandomizedSearchCV with AUC: 0.8065527742622087


### Neural Network Model

In [15]:
# Build and fit the neural network through the project's neural_network_model helper,
# holding out a third of the training data for validation.
# NOTE(review): TARGET is a 0/1 label, yet the loss is CategoricalCrossentropy with a
# softmax output. Confirm the helper one-hot encodes y and emits two units; otherwise
# BinaryCrossentropy with a sigmoid output is the usual pairing.
# NOTE(review): the recorded log shows losses in the thousands and val_auc near 0.02,
# which suggests the inputs reach the network unscaled and/or a label-layout problem.
# NOTE(review): widths lists three sizes while num_layers=2; check how the helper
# reconciles the two.
model_nn, history = neural_network_model(
    X=X_train,
    y=y_train,
    loss=tf.keras.losses.CategoricalCrossentropy(),
    metrics='auc',
    activations='relu',
    output_activation='softmax',
    widths=[32, 4, 2],
    num_layers=2,
    epochs=10,
    batch_size=32,
    learning_rate=0.00001,
    validation_split=0.333,
)

# Serialize the fitted network with dill, matching the other model cells.
# NOTE(review): Keras documents model.save(...) as its native persistence route;
# pickling depends on the installed Keras version.
with open('trained_nn_model.pickle', 'wb') as model_file:
    dill.dump(model_nn, model_file)

Epoch 1/10
[1m829/829[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - auc: 0.5956 - loss: 14909.7725 - val_auc: 0.0164 - val_loss: 31506.3867
Epoch 2/10
[1m829/829[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - auc: 0.6359 - loss: 11944.8213 - val_auc: 0.0172 - val_loss: 20868.8750
Epoch 3/10
[1m829/829[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - auc: 0.6375 - loss: 8398.9395 - val_auc: 0.0176 - val_loss: 12372.6885
Epoch 4/10
[1m829/829[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - auc: 0.6511 - loss: 6187.7148 - val_auc: 0.0168 - val_loss: 6255.3667
Epoch 5/10
[1m829/829[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - auc: 0.6644 - loss: 4755.8916 - val_auc: 0.0153 - val_loss: 2559.9412
Epoch 6/10
[1m829/829[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - auc: 0.6656 - loss: 3377.9719 - val_auc: 0.0148 - val_loss: 857.1994
Epoch 7/10
[1m829/829[0m [32m━━━━━━━━━━━━━━━━

In [16]:
# model summary
# Print the summary of the trained network (presumably a Keras model, so this
# lists its layers and parameter counts).
model_nn.summary()