This notebook builds and evaluates several machine learning models that predict the binary `TARGET` column of `application_train.csv`. We start by loading and preprocessing the data: dropping the identifier column, handling missing values, and encoding the categorical features. After splitting the data into training and testing sets, we train four models (Logistic Regression, Random Forest, XGBoost, and a Neural Network) and save each trained model for future use. The purpose of this project is to compare the performance of the different models and select the best one for our prediction task.

# Libraries

In [1]:
!python -m pip install scikit-learn==1.3.1

Collecting scikit-learn==1.3.1
  Using cached scikit_learn-1.3.1-cp311-cp311-win_amd64.whl.metadata (11 kB)
Using cached scikit_learn-1.3.1-cp311-cp311-win_amd64.whl (9.2 MB)
Installing collected packages: scikit-learn
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 1.6.1
    Uninstalling scikit-learn-1.6.1:
      Successfully uninstalled scikit-learn-1.6.1
Successfully installed scikit-learn-1.3.1


  You can safely remove it manually.
ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
category-encoders 2.8.0 requires scikit-learn>=1.6.0, but you have scikit-learn 1.3.1 which is incompatible.
imbalanced-learn 0.13.0 requires scikit-learn<2,>=1.3.2, but you have scikit-learn 1.3.1 which is incompatible.


In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import TargetEncoder, LabelEncoder, OrdinalEncoder, MinMaxScaler, OneHotEncoder, power_transform, PowerTransformer
import pickle
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
import dill
import os
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score, log_loss
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier 
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.impute import KNNImputer
from xgboost import XGBClassifier
import imblearn
from imblearn.under_sampling import RandomUnderSampler
from helper_functions import *
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense, Input, BatchNormalization, Dropout
from keras.callbacks import EarlyStopping

# Load Data

In [3]:
# Read the raw application table through the project helper `load_data`
# (made available by `from helper_functions import *`).
# NOTE(review): the file name is a relative path, so the CSV presumably has to
# sit in the notebook's working directory; confirm against load_data.
df = load_data('application_train.csv')

In [4]:
# Show the loaded frame; the notebook display abbreviates the middle rows and columns.
df 

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,100002,1,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
1,100003,0,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
2,100004,0,Revolving loans,M,Y,Y,0,67500.0,135000.0,6750.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
3,100006,0,Cash loans,F,N,Y,0,135000.0,312682.5,29686.5,...,0,0,0,0,,,,,,
4,100007,0,Cash loans,M,N,Y,0,121500.0,513000.0,21865.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
307506,456251,0,Cash loans,M,N,N,0,157500.0,254700.0,27558.0,...,0,0,0,0,,,,,,
307507,456252,0,Cash loans,F,N,Y,0,72000.0,269550.0,12001.5,...,0,0,0,0,,,,,,
307508,456253,0,Cash loans,F,N,Y,0,153000.0,677664.0,29979.0,...,0,0,0,0,1.0,0.0,0.0,1.0,0.0,1.0
307509,456254,1,Cash loans,F,N,Y,0,171000.0,370107.0,20205.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0


# Drop Features

In [5]:
# Drop SK_ID_CURR (presumably a per-application identifier with no predictive
# value; confirm) using the project helper `drop_features`.
# NOTE(review): this cell reassigns `df`, so re-running it operates on the
# already-reduced frame; whether drop_features tolerates a missing column is
# not visible here.
df = drop_features(df, features_to_drop=['SK_ID_CURR'])

In [6]:
# Show the frame after the drop; SK_ID_CURR should no longer be among the columns.
df

Unnamed: 0,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,1,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,351000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
1,0,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,1129500.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
2,0,Revolving loans,M,Y,Y,0,67500.0,135000.0,6750.0,135000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
3,0,Cash loans,F,N,Y,0,135000.0,312682.5,29686.5,297000.0,...,0,0,0,0,,,,,,
4,0,Cash loans,M,N,Y,0,121500.0,513000.0,21865.5,513000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
307506,0,Cash loans,M,N,N,0,157500.0,254700.0,27558.0,225000.0,...,0,0,0,0,,,,,,
307507,0,Cash loans,F,N,Y,0,72000.0,269550.0,12001.5,225000.0,...,0,0,0,0,,,,,,
307508,0,Cash loans,F,N,Y,0,153000.0,677664.0,29979.0,585000.0,...,0,0,0,0,1.0,0.0,0.0,1.0,0.0,1.0
307509,1,Cash loans,F,N,Y,0,171000.0,370107.0,20205.0,319500.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0


# Impute Data

In [7]:
# Handle missing values with the project helper `clean_data`.
# The imputation strategy lives in helper_functions and is not visible here.
# NOTE(review): `df` is overwritten, so the un-imputed frame is no longer
# available after this cell runs.
df = clean_data(df)

In [8]:
# Total number of missing cells in the frame: per-column NaN counts, summed
# again across columns. Expected to be 0 after cleaning.
df.isna().sum().sum()

0

In [9]:
# `DataFrame.isnull` is an alias of `DataFrame.isna` in pandas, so this counts
# the same missing cells as an `isna()` check and is kept only as a sanity check.
df.isnull().sum().sum()

0

In [10]:
# Class balance of the label column: the positive class (1) is a small
# minority of the rows, i.e. the data set is imbalanced.
df['TARGET'].value_counts()

TARGET
0    282686
1     24825
Name: count, dtype: int64

# Encode Data

In [11]:
# Collect the names of all object-dtype columns; these are the categorical features.
categorical_cols = list(df.select_dtypes(include='object').columns)

# Replace each categorical column with its target encoding via the project helper.
# NOTE(review): the encoder is applied to the full frame before the train/test
# split further down. If `encode_data` fits TargetEncoder on every row, the
# test-set labels influence the encoded features (target leakage); confirm
# against helper_functions.encode_data.
df = encode_data(
    df,
    'TARGET',
    categorical_cols,
    train=True,
    model=TargetEncoder,
)


<class 'pandas.core.frame.DataFrame'>
<class 'numpy.ndarray'>


In [12]:
df

Unnamed: 0,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,1,0.083459,0.101419,0.085002,0.079616,0,202500.0,406597.5,24700.5,351000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
1,0,0.083459,0.069993,0.085002,0.083249,0,270000.0,1293502.5,35698.5,1129500.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
2,0,0.054784,0.101419,0.072437,0.079616,0,67500.0,135000.0,6750.0,135000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
3,0,0.083459,0.069993,0.085002,0.079616,0,135000.0,312682.5,29686.5,297000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
4,0,0.083459,0.101419,0.085002,0.079616,0,121500.0,513000.0,21865.5,513000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
307506,0,0.083459,0.101419,0.085002,0.083249,0,157500.0,254700.0,27558.0,225000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
307507,0,0.083459,0.069993,0.085002,0.079616,0,72000.0,269550.0,12001.5,225000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
307508,0,0.083459,0.069993,0.085002,0.079616,0,153000.0,677664.0,29979.0,585000.0,...,0,0,0,0,1.0,0.0,0.0,1.0,0.0,1.0
307509,1,0.083459,0.069993,0.085002,0.079616,0,171000.0,370107.0,20205.0,319500.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0


# Train/Test Split

In [13]:
# Split the encoded frame into train/test features and labels, with 'TARGET'
# as the label column.
# feature_selected=None, features_dropped=[] and balanced_data=False presumably
# mean "keep every feature, do not re-balance the classes"; confirm against
# helper_functions.split_data.
X_train, X_test, y_train, y_test = split_data(
    df,
    'TARGET',
    feature_selected=None,
    features_dropped=[],
    balanced_data=False,
)

# Serialize the four pieces together as one tuple so the same split can be reloaded.
with open('splitted_train_test.pickle', mode='wb') as f:
    dill.dump((X_train, X_test, y_train, y_test), f)

# Train Models

### Logistic Regression Model

In [14]:
# Fit the logistic-regression baseline on the training split.
#
# The recorded run of this cell ended with "STOP: TOTAL NO. OF ITERATIONS
# REACHED LIMIT", i.e. the solver stopped at scikit-learn's default
# max_iter=100 before converging, so the saved model was not fully fitted.
# The iteration budget is therefore raised explicitly.
# Assumes `train_model` forwards extra keyword arguments to `model_class`, as
# the Random Forest cell's n_estimators / random_state arguments suggest; confirm.
# NOTE(review): the same warning also recommends scaling the features; the
# inputs here are unscaled, so scaling may still be needed for convergence.
logistic_model = train_model(
    xtrain=X_train,
    ytrain=y_train,
    model_class=LogisticRegression,
    max_iter=1000,
)

# Persist the fitted model for later evaluation.
with open('trained_logistic_model.pickle', 'wb') as f:
    dill.dump(logistic_model, f)

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


### Random Forest Model

In [15]:
# Fit a 100-tree random forest on the training split; the fixed random_state
# keeps the fitted forest reproducible between runs.
random_forest_model = train_model(
    xtrain=X_train,
    ytrain=y_train,
    model_class=RandomForestClassifier,
    n_estimators=100,
    random_state=42,
)

# Persist the fitted forest for later evaluation.
with open('trained_random_forest_model.pickle', mode='wb') as f:
    dill.dump(random_forest_model, f)

### XGBoost Model

In [16]:
# Fit a gradient-boosted tree classifier; no extra keyword arguments are passed
# here, so the hyper-parameters are whatever train_model / XGBClassifier default to.
xgb_model = train_model(
    xtrain=X_train,
    ytrain=y_train,
    model_class=xgb.XGBClassifier,
)

# Persist the fitted booster for later evaluation.
with open('trained_xgb_model.pickle', mode='wb') as f:
    dill.dump(xgb_model, f)

### Neural Network Model

In [18]:
# Train a small feed-forward classifier through the project helper
# `neural_network_model`: ReLU activations, softmax output, categorical
# cross-entropy loss, AUC as the tracked metric, one third of the training
# data held out for validation.
# An earlier experiment used widths=[32, 10, 2] with otherwise identical settings.
# NOTE(review): in the recorded run the training loss starts in the tens of
# thousands and val_auc falls from ~0.996 to below 0.01 within a few epochs.
# That pattern suggests unscaled inputs and/or a validation slice that is not
# representative of the training data; confirm how the helper prepares X and y
# before trusting this model.
model_nn, history = neural_network_model(
    X=X_train,
    y=y_train,
    loss=tf.keras.losses.CategoricalCrossentropy(),
    metrics='auc',
    activations='relu',
    output_activation='softmax',
    widths=[32, 4, 2],
    num_layers=2,
    epochs=10,
    batch_size=32,
    learning_rate=1e-5,
    validation_split=0.333,
)

# Persist the fitted network alongside the other trained models.
with open('trained_nn_model.pickle', mode='wb') as f:
    dill.dump(model_nn, f)

Epoch 1/10
[1m829/829[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - auc: 0.4666 - loss: 43337.9766 - val_auc: 0.9964 - val_loss: 4.9162
Epoch 2/10
[1m829/829[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - auc: 0.5200 - loss: 27775.2422 - val_auc: 0.8641 - val_loss: 485.9510
Epoch 3/10
[1m829/829[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - auc: 0.5820 - loss: 16730.8184 - val_auc: 0.1664 - val_loss: 4293.5342
Epoch 4/10
[1m829/829[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - auc: 0.6291 - loss: 10662.2178 - val_auc: 0.0090 - val_loss: 4994.0029
Epoch 5/10
[1m829/829[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - auc: 0.6587 - loss: 7703.9204 - val_auc: 0.0069 - val_loss: 3990.3523
Epoch 6/10
[1m829/829[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - auc: 0.6795 - loss: 5495.0508 - val_auc: 0.0049 - val_loss: 3123.1467
Epoch 7/10
[1m829/829[0m [32m━━━━━━━━━━━━━━━━━━━━

In [19]:
# Print the architecture overview of the trained network (assumes `model_nn`
# is a Keras model, as the training log format suggests).
model_nn.summary()