In this notebook, we build and evaluate several machine learning models to predict the binary `TARGET` column of the loan-application dataset (`application_train.parquet`). We start by loading and preprocessing the data, which includes dropping unneeded columns, handling missing values, and encoding categorical features. After splitting the data into training and testing sets, we train four models — Logistic Regression, Random Forest, XGBoost, and a Neural Network — and save each trained model for future use. The purpose of this project is to compare the performance of these models and select the best one for our prediction task.

# Libraries

In [None]:
!python -m pip install scikit-learn==1.3.1
import pandas as pd
import numpy as np
from sklearn.preprocessing import TargetEncoder, LabelEncoder, OrdinalEncoder, MinMaxScaler, OneHotEncoder, power_transform, PowerTransformer
import pickle
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
import dill
import os
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score, log_loss
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier 
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.impute import KNNImputer
from xgboost import XGBClassifier
import imblearn
from imblearn.under_sampling import RandomUnderSampler
from helper_functions import *
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense, Input, BatchNormalization, Dropout
from keras.callbacks import EarlyStopping
import warnings
warnings.filterwarnings('ignore')

# Load Data

In [2]:
# Load the raw application table; the relative path is resolved against the working directory.
data_file = 'application_train.parquet'
df = load_data(data_file)

In [3]:
# Preview the freshly loaded frame (the notebook renders the cell's last expression).
df

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,100002,1,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
1,100003,0,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
2,100004,0,Revolving loans,M,Y,Y,0,67500.0,135000.0,6750.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
3,100006,0,Cash loans,F,N,Y,0,135000.0,312682.5,29686.5,...,0,0,0,0,,,,,,
4,100007,0,Cash loans,M,N,Y,0,121500.0,513000.0,21865.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
307506,456251,0,Cash loans,M,N,N,0,157500.0,254700.0,27558.0,...,0,0,0,0,,,,,,
307507,456252,0,Cash loans,F,N,Y,0,72000.0,269550.0,12001.5,...,0,0,0,0,,,,,,
307508,456253,0,Cash loans,F,N,Y,0,153000.0,677664.0,29979.0,...,0,0,0,0,1.0,0.0,0.0,1.0,0.0,1.0
307509,456254,1,Cash loans,F,N,Y,0,171000.0,370107.0,20205.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0


# Drop Features

In [4]:
# Drop unneeded columns before any modelling.
# NOTE(review): SK_ID_CURR looks like a per-row identifier — confirm it carries no signal.
columns_to_drop = ['SK_ID_CURR']
df = drop_features(df, features_to_drop=columns_to_drop)

In [5]:
# Preview the frame after the column drop; SK_ID_CURR should no longer appear.
df

Unnamed: 0,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,1,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,351000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
1,0,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,1129500.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
2,0,Revolving loans,M,Y,Y,0,67500.0,135000.0,6750.0,135000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
3,0,Cash loans,F,N,Y,0,135000.0,312682.5,29686.5,297000.0,...,0,0,0,0,,,,,,
4,0,Cash loans,M,N,Y,0,121500.0,513000.0,21865.5,513000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
307506,0,Cash loans,M,N,N,0,157500.0,254700.0,27558.0,225000.0,...,0,0,0,0,,,,,,
307507,0,Cash loans,F,N,Y,0,72000.0,269550.0,12001.5,225000.0,...,0,0,0,0,,,,,,
307508,0,Cash loans,F,N,Y,0,153000.0,677664.0,29979.0,585000.0,...,0,0,0,0,1.0,0.0,0.0,1.0,0.0,1.0
307509,1,Cash loans,F,N,Y,0,171000.0,370107.0,20205.0,319500.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0


# Impute Data

In [6]:
# Clean the frame with the project helper (intended to impute missing values).
# NOTE(review): the missing-value check that follows this cell still reports
# 764371 NaN cells, so confirm what clean_data actually imputes.
df = clean_data(df)

In [7]:
# Count NaN cells per column, then add them up for a frame-wide total.
nan_per_column = df.isna().sum()
nan_per_column.sum()

764371

In [8]:
# Class distribution of the label: class 0 heavily outnumbers class 1 (imbalanced data).
df['TARGET'].value_counts()

TARGET
0    282686
1     24825
Name: count, dtype: int64

# Encode Data

In [9]:
# Names of every object-dtype column; these are treated as the categorical features.
categorical_cols = list(df.select_dtypes(include='object').columns)

# Convert the categorical columns to numeric values using a target-based encoder.
# NOTE(review): this runs on the full frame before the train/test split, so the
# encoder may see labels of rows that later land in the test set — confirm
# whether encode_data guards against this leakage.
df = encode_data(
    df,
    'TARGET',
    categorical_cols,
    train=True,
    model=TargetEncoder,
)


<class 'pandas.core.frame.DataFrame'>
<class 'numpy.ndarray'>


In [10]:
# Preview the encoded frame; the former string columns now hold floats.
df

Unnamed: 0,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,1,0.083459,0.101419,0.085002,0.079616,0,202500.0,406597.5,24700.5,351000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
1,0,0.083459,0.069993,0.085002,0.083249,0,270000.0,1293502.5,35698.5,1129500.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
2,0,0.054784,0.101419,0.072437,0.079616,0,67500.0,135000.0,6750.0,135000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
3,0,0.083459,0.069993,0.085002,0.079616,0,135000.0,312682.5,29686.5,297000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
4,0,0.083459,0.101419,0.085002,0.079616,0,121500.0,513000.0,21865.5,513000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
307506,0,0.083459,0.101419,0.085002,0.083249,0,157500.0,254700.0,27558.0,225000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
307507,0,0.083459,0.069993,0.085002,0.079616,0,72000.0,269550.0,12001.5,225000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
307508,0,0.083459,0.069993,0.085002,0.079616,0,153000.0,677664.0,29979.0,585000.0,...,0,0,0,0,1.0,0.0,0.0,1.0,0.0,1.0
307509,1,0.083459,0.069993,0.085002,0.079616,0,171000.0,370107.0,20205.0,319500.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0


# Split train_test 

In [11]:
# Partition the encoded frame into train/test features and labels.
# NOTE(review): no feature subset is selected and nothing extra is dropped;
# balanced_data=False presumably leaves the class imbalance untouched — confirm in helper_functions.
X_train, X_test, y_train, y_test = split_data(
    df,
    'TARGET',
    feature_selected=None,
    features_dropped=[],
    balanced_data=False,
)

# Save the split data so the same partition can be reloaded later.
split_bundle = (X_train, X_test, y_train, y_test)
with open('splitted_train_test.pickle', 'wb') as split_file:
    dill.dump(split_bundle, split_file)

# Train Models

### Logistic Regression Model

In [12]:
# Fit a logistic regression (library defaults) on the training split.
logistic_model = train_model(
    xtrain=X_train,
    ytrain=y_train,
    model_class=LogisticRegression,
)

# Serialize the fitted model to disk with dill.
with open('trained_logistic_model.pickle', 'wb') as model_file:
    dill.dump(logistic_model, model_file)

### Random Forest Model

In [13]:
# Fit a random forest on the training split; the seed is fixed for reproducibility.
random_forest_model = train_model(
    xtrain=X_train,
    ytrain=y_train,
    model_class=RandomForestClassifier,
    n_estimators=100,
    random_state=42,
)

# Serialize the fitted model to disk with dill.
with open('trained_random_forest_model.pickle', 'wb') as model_file:
    dill.dump(random_forest_model, model_file)

### XGBoost Model

In [14]:
# Fit a gradient-boosted tree classifier (library defaults) on the training split.
xgb_model = train_model(
    xtrain=X_train,
    ytrain=y_train,
    model_class=XGBClassifier,
)

# Serialize the fitted model to disk with dill.
with open('trained_xgb_model.pickle', 'wb') as model_file:
    dill.dump(xgb_model, model_file)

### Neural Network Model

In [15]:
# Build and fit the neural network through the project helper, which returns
# the model together with its training history.
# NOTE(review): in the recorded run the training loss starts near 28812 and
# val_auc falls from 0.9995 to 0.0897 within six epochs — possibly unscaled
# inputs or a label/output mismatch; confirm before trusting this model.
# NOTE(review): three widths are given for num_layers=2 — confirm how the
# helper pairs them up.
model_nn, history = neural_network_model(
    X=X_train,
    y=y_train,
    loss=tf.keras.losses.CategoricalCrossentropy(),
    metrics='auc',
    activations='relu',
    output_activation='softmax',
    widths=[32, 4, 2],
    num_layers=2,
    epochs=10,
    batch_size=32,
    learning_rate=0.00001,
    validation_split=0.333,
)

# Serialize the fitted network to disk with dill.
with open('trained_nn_model.pickle', 'wb') as model_file:
    dill.dump(model_nn, model_file)

Epoch 1/10
[1m829/829[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - auc: 0.3387 - loss: 28812.0469 - val_auc: 0.9995 - val_loss: 3.1029
Epoch 2/10
[1m829/829[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - auc: 0.3567 - loss: 19727.8281 - val_auc: 0.9295 - val_loss: 4.2079
Epoch 3/10
[1m829/829[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - auc: 0.3885 - loss: 12184.9150 - val_auc: 0.3364 - val_loss: 4.2317
Epoch 4/10
[1m829/829[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - auc: 0.4427 - loss: 7601.9023 - val_auc: 0.2235 - val_loss: 3.3887
Epoch 5/10
[1m829/829[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - auc: 0.4897 - loss: 5399.7539 - val_auc: 0.1244 - val_loss: 2.2278
Epoch 6/10
[1m829/829[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - auc: 0.5317 - loss: 3650.8687 - val_auc: 0.0897 - val_loss: 1.2218
Epoch 7/10
[1m829/829[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m 

In [16]:
# Print the model summary for the trained network.
model_nn.summary()