# Machine Learning & Increasing model penalization

For this notebook, I'll use the adjusted set from feature engineering notebook on Logit, XGBoost, Light GBM, SVM and Neural Network models.

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
import xgboost as xgb
import lightgbm as lgbm
from sklearn import svm
from sklearn import metrics
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import SGD, Adam

## Prepping data

### Loading datasets

In [2]:
# Import merged dataset (train + test)
default_df = pd.read_csv('default_df.csv')

# Separate between features (X) and answer (y)
x = default_df.drop(['Unnamed: 0','Loan Status'], axis=1)
y = default_df['Loan Status']

# Split dataset considering train and test must have default class
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=101, stratify=y)

### Standardizing data

In [3]:
# Create a standard scaler based on train set
scaler = StandardScaler()

# Use train set as reference
scaler.fit_transform(x_train)

# Transform test without contaminating it
scaler.transform(x_test)

array([[-0.72383308,  1.86829987, -1.12465675, ..., -0.07329356,
        -0.26825135,  1.132089  ],
       [-0.21837308, -1.48909439,  2.60800807, ..., -0.07329356,
        -0.26825135, -0.88332278],
       [ 0.39358773, -0.8656642 ,  1.4718614 , ..., -0.07329356,
        -0.26825135,  1.132089  ],
       ...,
       [ 0.37456118,  0.50831164,  0.74053434, ..., -0.07329356,
        -0.26825135, -0.88332278],
       [ 1.41037126,  0.8771428 , -0.8366547 , ..., -0.07329356,
        -0.26825135,  1.132089  ],
       [ 1.77462794,  0.62045791, -0.30038245, ..., -0.07329356,
        -0.26825135, -0.88332278]])

## Logit Model

In [5]:
# Create instance for Logit model
logit = LogisticRegression(class_weight='balanced')

# Fit model in my training set
logit.fit(x_train, y_train)

# Predict y_test
logit_predict = logit.predict(x_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [6]:
# Compare results
logit_matrix = metrics.confusion_matrix(y_test, logit_predict)
print('Confusion Matrix','\n',logit_matrix, '\n')

print(metrics.classification_report(y_test, logit_predict))

Confusion Matrix 
 [[12479 13658]
 [  780   996]] 

              precision    recall  f1-score   support

         0.0       0.94      0.48      0.63     26137
         1.0       0.07      0.56      0.12      1776

    accuracy                           0.48     27913
   macro avg       0.50      0.52      0.38     27913
weighted avg       0.89      0.48      0.60     27913



In [7]:
# Extract classification report
class_report2 = metrics.classification_report(y_test, logit_predict, output_dict=True)
class_report2 = pd.DataFrame(class_report2).round(2).transpose()
class_report2['Model'] = 'logit IP'
class_report2

Unnamed: 0,precision,recall,f1-score,support,Model
0.0,0.94,0.48,0.63,26137.0,logit IP
1.0,0.07,0.56,0.12,1776.0,logit IP
accuracy,0.48,0.48,0.48,0.48,logit IP
macro avg,0.5,0.52,0.38,27913.0,logit IP
weighted avg,0.89,0.48,0.6,27913.0,logit IP


## XGBoost

In [8]:
# Compute the positive class weight
pos_class_weight = (len(y) - np.sum(y)) / np.sum(y)

# Create XGBoost instance
XGB = xgb.XGBClassifier(scale_pos_weight=pos_class_weight)

# Fit he model
XGB.fit(x_train, y_train)

# Pedict
xgb_predict = XGB.predict(x_test)

In [9]:
# Analyze performance
print('Confusion matrix', '\n',metrics.confusion_matrix(y_test, xgb_predict), '\n')

print(metrics.classification_report(y_test, xgb_predict))

Confusion matrix 
 [[20886  5251]
 [ 1342   434]] 

              precision    recall  f1-score   support

         0.0       0.94      0.80      0.86     26137
         1.0       0.08      0.24      0.12      1776

    accuracy                           0.76     27913
   macro avg       0.51      0.52      0.49     27913
weighted avg       0.88      0.76      0.82     27913



In [10]:
# Create temporary class report 
temp_class_report2 = metrics.classification_report(y_test, xgb_predict, output_dict=True)
temp_class_report2 = pd.DataFrame(temp_class_report2).round(2).transpose()
temp_class_report2['Model'] = 'xgb IP'

# Concat with main df
class_report2 = pd.concat([class_report2, temp_class_report2], axis=0)

# Display final df
# class_report2

## Light GBM

In [11]:
# Create instance
lgb = lgbm.LGBMClassifier(scale_pos_weight=pos_class_weight)

# Fit in train set
lgb.fit(x_train, y_train)

# Predict
lgb_predict = lgb.predict(x_test)

[LightGBM] [Info] Number of positive: 4144, number of negative: 60985
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.007965 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4144
[LightGBM] [Info] Number of data points in the train set: 65129, number of used features: 35
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.063628 -> initscore=-2.688966
[LightGBM] [Info] Start training from score -2.688966


In [12]:
# Confusion matrix
print('Confusion Matrix:', '\n',metrics.confusion_matrix(y_test,lgb_predict), '\n')

# Classification report
print(metrics.classification_report(y_test, lgb_predict))

Confusion Matrix: 
 [[16971  9166]
 [ 1051   725]] 

              precision    recall  f1-score   support

         0.0       0.94      0.65      0.77     26137
         1.0       0.07      0.41      0.12      1776

    accuracy                           0.63     27913
   macro avg       0.51      0.53      0.45     27913
weighted avg       0.89      0.63      0.73     27913



In [13]:
# Create temporary classification report df
temp_class_report2 = metrics.classification_report(y_test, lgb_predict, output_dict=True)
temp_class_report2 = pd.DataFrame(temp_class_report2).round(2).transpose()
temp_class_report2['Model'] = 'Light GBM IP'

# Concat with main report
class_report2 = pd.concat([class_report2, temp_class_report2], axis=0)
# class_report2
class_report2.to_csv('classification_report2.csv')

## SVM

In [16]:
# Dictiornary with the ideal weight
dw = {0: 1, 1: pos_class_weight}

# Instance model
svc = svm.SVC(class_weight=dw)

# Fit
svc.fit(x_train, y_train)

# Predict
svc_predict = svc.predict(x_test)

In [17]:
# See results
print('Confusion matrix', '\n', metrics.confusion_matrix(y_test, svc_predict))

# Classification report
print(metrics.classification_report(y_test, svc_predict))

Confusion matrix 
 [[10454 15683]
 [  627  1149]]
              precision    recall  f1-score   support

         0.0       0.94      0.40      0.56     26137
         1.0       0.07      0.65      0.12      1776

    accuracy                           0.42     27913
   macro avg       0.51      0.52      0.34     27913
weighted avg       0.89      0.42      0.53     27913



In [None]:
# Temporary classification report
temp_class_report2 = metrics.classification_report(y_test, svc_predict, output_dict=True)
temp_class_report2 = pd.DataFrame(temp_class_report2).round(2).transpose()
temp_class_report2['Model'] = 'SVM IP'

# Merge with report df
class_report2 = pd.concat([class_report2, temp_class_report2], axis=0)
# class_report2
class_report2.to_csv('classification_report2.csv')

## Neural Network

In [19]:
# First transform df in arrays. This is the way tensorflow builds its models
x_train_a = x_train.to_numpy()
y_train_a = y_train.to_numpy()
x_test_a = x_test.to_numpy()
y_test_a = y_test.to_numpy()

x_train_a.shape

(65129, 36)

In [20]:
# Build neural netwrok
ann_sgd = tf.keras.models.Sequential([
  tf.keras.layers.Dense(64, input_shape=(36,), activation='tanh'),
  tf.keras.layers.Dense(32, activation='tanh'),
  tf.keras.layers.Dropout(0.20),
  tf.keras.layers.Dense(1, activation='sigmoid')
])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [22]:
# Compile and fit
opt = tf.keras.optimizers.SGD(learning_rate=0.01)

# Since this is a classification problem, our loss analysis also changes from MSE to binarycrossentropy
ann_sgd.compile(optimizer=opt, 
            loss='binary_crossentropy',
            metrics=['accuracy'])

# Predic
ann_sgd.fit(x_train_a, y_train_a, class_weight={0: 1.0, 1: pos_class_weight}, epochs=50)

Epoch 1/50
[1m2036/2036[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 1ms/step - accuracy: 0.5207 - loss: 1.3374
Epoch 2/50
[1m2036/2036[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - accuracy: 0.4865 - loss: 1.3119
Epoch 3/50
[1m2036/2036[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - accuracy: 0.4508 - loss: 1.3123
Epoch 4/50
[1m2036/2036[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - accuracy: 0.5345 - loss: 1.2872
Epoch 5/50
[1m2036/2036[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - accuracy: 0.4943 - loss: 1.2999
Epoch 6/50
[1m2036/2036[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - accuracy: 0.6374 - loss: 1.2843
Epoch 7/50
[1m2036/2036[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - accuracy: 0.5147 - loss: 1.2950
Epoch 8/50
[1m2036/2036[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - accuracy: 0.3659 - loss: 1.3113
Epoch 9/50
[1m2036/2036

<keras.src.callbacks.history.History at 0x23d80f5d390>

In [23]:
# Predict using ANN, and here I'll round rthe predictions so we have 1 (defualt) and 0 (non-default) instead of float numbers
ann_sgd_predict = (ann_sgd.predict(x_test_a) > 0.5).astype(int)

[1m873/873[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 866us/step


In [24]:
# See results
print('Confusion matrix', '\n', metrics.confusion_matrix(y_test_a, ann_sgd_predict))

# Classification report
print(metrics.classification_report(y_test_a, ann_sgd_predict))

Confusion matrix 
 [[25909   228]
 [ 1765    11]]
              precision    recall  f1-score   support

         0.0       0.94      0.99      0.96     26137
         1.0       0.05      0.01      0.01      1776

    accuracy                           0.93     27913
   macro avg       0.49      0.50      0.49     27913
weighted avg       0.88      0.93      0.90     27913



In [25]:
# Temporary classification report
temp_class_report2 = metrics.classification_report(y_test_a, ann_sgd_predict, output_dict=True)
temp_class_report2 = pd.DataFrame(temp_class_report2).round(2).transpose()
temp_class_report2['Model'] = 'ANN SGD IP'

# Merge with report df
class_report2 = pd.concat([class_report2, temp_class_report2], axis=0)

# Save
class_report2.to_csv('classification_report2.csv')

#### Adam optimizer

In [26]:
# Using similar architecture
ann_adam = tf.keras.models.Sequential([
  tf.keras.layers.Dense(64, input_shape=(36,), activation='relu'),
  tf.keras.layers.Dense(32, activation='tanh'),
  tf.keras.layers.Dropout(0.20),
  tf.keras.layers.Dense(10, activation='tanh'),
  tf.keras.layers.Dense(1, activation='sigmoid')
])

# Compile and fit
opt = tf.keras.optimizers.Adam(learning_rate=0.01)

# Since this is a classification problem, our loss analysis also changes from MSE to binarycrossentropy
ann_adam.compile(optimizer=opt, 
            loss='binary_crossentropy',
            metrics=['accuracy'])

# Fit in train sets
ann_adam.fit(x_train_a, y_train_a, class_weight={0: 1.0, 1: pos_class_weight}, epochs=50)

Epoch 1/50


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m2036/2036[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 1ms/step - accuracy: 0.4678 - loss: 1.3397
Epoch 2/50
[1m2036/2036[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 1ms/step - accuracy: 0.5522 - loss: 1.2952
Epoch 3/50
[1m2036/2036[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - accuracy: 0.5600 - loss: 1.2954
Epoch 4/50
[1m2036/2036[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 1ms/step - accuracy: 0.4660 - loss: 1.3110
Epoch 5/50
[1m2036/2036[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 1ms/step - accuracy: 0.4500 - loss: 1.3187
Epoch 6/50
[1m2036/2036[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 1ms/step - accuracy: 0.5112 - loss: 1.3143
Epoch 7/50
[1m2036/2036[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 1ms/step - accuracy: 0.5180 - loss: 1.2974
Epoch 8/50
[1m2036/2036[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 1ms/step - accuracy: 0.4860 - loss: 1.2977
Epoch 9/50
[1m2036/2036[0m [32m━

<keras.src.callbacks.history.History at 0x23da1dc8050>

In [27]:
# Predict using ANN, and here I'll round rthe predictions so we have 1 (defualt) and 0 (non-default) instead of float numbers
ann_adam_predict = (ann_adam.predict(x_test_a) > 0.5).astype(int)

[1m873/873[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 873us/step


In [28]:
# See results
print('Confusion matrix', '\n', metrics.confusion_matrix(y_test_a, ann_adam_predict))

# Classification report
print(metrics.classification_report(y_test_a, ann_adam_predict))

Confusion matrix 
 [[    0 26137]
 [    0  1776]]
              precision    recall  f1-score   support

         0.0       0.00      0.00      0.00     26137
         1.0       0.06      1.00      0.12      1776

    accuracy                           0.06     27913
   macro avg       0.03      0.50      0.06     27913
weighted avg       0.00      0.06      0.01     27913



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
# Temporary classification report
temp_class_report2 = metrics.classification_report(y_test_a, ann_adam_predict, output_dict=True)
temp_class_report2 = pd.DataFrame(temp_class_report2).round(2).transpose()
temp_class_report2['Model'] = 'ANN ADAM IP'

# Merge with report df
class_report2 = pd.concat([class_report2, temp_class_report2], axis=0)

# Save
class_report2.to_csv('classification_report2.csv')

# Display final results
class_report2

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Unnamed: 0,precision,recall,f1-score,support,Model
0.0,0.94,0.48,0.63,26137.0,logit IP
1.0,0.07,0.56,0.12,1776.0,logit IP
accuracy,0.48,0.48,0.48,0.48,logit IP
macro avg,0.5,0.52,0.38,27913.0,logit IP
weighted avg,0.89,0.48,0.6,27913.0,logit IP
0.0,0.94,0.8,0.86,26137.0,xgb IP
1.0,0.08,0.24,0.12,1776.0,xgb IP
accuracy,0.76,0.76,0.76,0.76,xgb IP
macro avg,0.51,0.52,0.49,27913.0,xgb IP
weighted avg,0.88,0.76,0.82,27913.0,xgb IP
