# Machine Learning & Increasing model penalization

For this notebook, I'll use the adjusted set from feature engineering notebook on Logit, XGBoost, Light GBM, SVM and Neural Network models.

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
import xgboost as xgb
import lightgbm as lgbm
from sklearn import svm
from sklearn import metrics
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import SGD, Adam

## Prepping data

### Loading datasets

In [2]:
# Import merged dataset (train + test)
default_df = pd.read_csv('train_adj.csv')

# Separate between features (X) and answer (y)
x = default_df.drop(['Unnamed: 0','Loan Status'], axis=1)
y = default_df['Loan Status']

# Split dataset considering train and test must have default class
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=101, stratify=y)

### Standardizing data

In [3]:
# Create a standard scaler based on train set
scaler = StandardScaler()

# Use train set as reference
scaler.fit_transform(x_train)

# Transform test without contaminating it
scaler.transform(x_test)

array([[ 2.09203679, -1.16221414, -0.18876999, ..., -0.07316163,
        -0.26651094, -0.91885213],
       [-0.5480217 , -0.77581639, -0.67685336, ..., -0.07316163,
        -0.26651094,  1.0883144 ],
       [ 0.28214712,  0.15508451,  1.17371725, ..., -0.07316163,
        -0.26651094,  1.0883144 ],
       ...,
       [ 0.89180421, -0.55860618,  1.24433392, ..., -0.07316163,
        -0.26651094, -0.91885213],
       [-1.50353277, -0.99327287,  0.98619904, ..., -0.07316163,
        -0.26651094,  1.0883144 ],
       [ 0.62704703,  2.3605562 ,  0.53540626, ..., -0.07316163,
        -0.26651094,  1.0883144 ]])

## Logit Model

In [4]:
# Create instance for Logit model
logit_m= LogisticRegression(class_weight='balanced')

# GridSearch parameters
param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],
    'penalty': ['l2', 'l1', 'elasticnet', 'none'],
    'solver': ['liblinear', 'lbfgs', 'saga']
    }

logit = GridSearchCV(logit_m, param_grid)

# Fit model in my training set
logit.fit(x_train, y_train)

# Predict y_test
logit_predict = logit.predict(x_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [5]:
# Compare results
logit_matrix = metrics.confusion_matrix(y_test, logit_predict)
print('Confusion Matrix','\n',logit_matrix, '\n')

print(metrics.classification_report(y_test, logit_predict))

Confusion Matrix 
 [[9647 7816]
 [ 967  809]] 

              precision    recall  f1-score   support

           0       0.91      0.55      0.69     17463
           1       0.09      0.46      0.16      1776

    accuracy                           0.54     19239
   macro avg       0.50      0.50      0.42     19239
weighted avg       0.83      0.54      0.64     19239



In [6]:
# Extract classification report
class_report2 = metrics.classification_report(y_test, logit_predict, output_dict=True)
class_report2 = pd.DataFrame(class_report2).round(2).transpose()
class_report2['Model'] = 'logit IP'
class_report2

Unnamed: 0,precision,recall,f1-score,support,Model
0,0.91,0.55,0.69,17463.0,logit IP
1,0.09,0.46,0.16,1776.0,logit IP
accuracy,0.54,0.54,0.54,0.54,logit IP
macro avg,0.5,0.5,0.42,19239.0,logit IP
weighted avg,0.83,0.54,0.64,19239.0,logit IP


## XGBoost

In [7]:
# Compute the positive class weight
pos_class_weight = (len(y) - np.sum(y)) / np.sum(y)

# Create XGBoost instance
XGB_m = xgb.XGBClassifier(scale_pos_weight=pos_class_weight)

# GridSearch
param_grid = {
    'n_estimators': [100, 200, 500],
    'learning_rate': [0.01, 0.1, 0.3],
    'max_depth': [3, 6, 10],
    'reg_alpha': [0, 0.01, 0.1, 1],
    'reg_lambda': [1, 1.5, 2],
    'scale_pos_weight': [1, 10, 20],
    'objective': ['binary:logistic','multi:softmax']
}

XGB = GridSearchCV(XGB_m, param_grid)

# Fit he model
XGB.fit(x_train, y_train)

# Pedict
xgb_predict = XGB.predict(x_test)

4860 fits failed out of a total of 9720.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
4860 fits failed with the following error:
Traceback (most recent call last):
  File "d:\Documentos\My_Py_Projects\github_DS_projects\.venv\Lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "d:\Documentos\My_Py_Projects\github_DS_projects\.venv\Lib\site-packages\xgboost\core.py", line 726, in inner_f
    return func(**kwargs)
           ^^^^^^^^^^^^^^
  File "d:\Documentos\My_Py_Projects\github_DS_projects\.venv\Lib\site-packages\xgboost\sklearn.py", line 1531, in fit
    self._Booster = train(
                    ^^^^^^
  File "d:\Documentos\My_Py_Projects\

In [8]:
# Analyze performance
print('Confusion matrix', '\n',metrics.confusion_matrix(y_test, xgb_predict), '\n')

print(metrics.classification_report(y_test, xgb_predict))

Confusion matrix 
 [[17463     0]
 [ 1776     0]] 

              precision    recall  f1-score   support

           0       0.91      1.00      0.95     17463
           1       0.00      0.00      0.00      1776

    accuracy                           0.91     19239
   macro avg       0.45      0.50      0.48     19239
weighted avg       0.82      0.91      0.86     19239



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [9]:
# Create temporary class report 
temp_class_report2 = metrics.classification_report(y_test, xgb_predict, output_dict=True)
temp_class_report2 = pd.DataFrame(temp_class_report2).round(2).transpose()
temp_class_report2['Model'] = 'xgb IP'

# Concat with main df
class_report2 = pd.concat([class_report2, temp_class_report2], axis=0)

# Display final df
# class_report2

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## Light GBM

In [11]:
# Create instance
lgb_m = lgbm.LGBMClassifier(scale_pos_weight=pos_class_weight)

# GridSearch
param_grid = {
    'num_leaves': [31, 50, 70],
    'learning_rate': [0.01, 0.05, 0.1],
    'n_estimators': [100, 200, 500],
    'max_depth': [-1, 10, 20],
    'reg_alpha': [0, 0.1, 1.0],
    'reg_lambda': [0, 0.1, 1.0],
    'scale_pos_weight': [1, 10, 50]
}

lgb = GridSearchCV(lgb_m, param_grid)

# Fit in train set
lgb.fit(x_train, y_train)

# Predict
lgb_predict = lgb.predict(x_test)

[LightGBM] [Info] Number of positive: 3316, number of negative: 32596
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001920 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4137
[LightGBM] [Info] Number of data points in the train set: 35912, number of used features: 35
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.092337 -> initscore=-2.285430
[LightGBM] [Info] Start training from score -2.285430
[LightGBM] [Info] Number of positive: 3315, number of negative: 32597
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002118 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4138
[LightGBM] [Info] Number of data points in the train set: 35912, number of used features: 35
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.092309 -> initscore=-2.285763
[Lig

In [12]:
# Confusion matrix
print('Confusion Matrix:', '\n',metrics.confusion_matrix(y_test,lgb_predict), '\n')

# Classification report
print(metrics.classification_report(y_test, lgb_predict))

Confusion Matrix: 
 [[17462     1]
 [ 1776     0]] 

              precision    recall  f1-score   support

           0       0.91      1.00      0.95     17463
           1       0.00      0.00      0.00      1776

    accuracy                           0.91     19239
   macro avg       0.45      0.50      0.48     19239
weighted avg       0.82      0.91      0.86     19239



In [13]:
# Create temporary classification report df
temp_class_report2 = metrics.classification_report(y_test, lgb_predict, output_dict=True)
temp_class_report2 = pd.DataFrame(temp_class_report2).round(2).transpose()
temp_class_report2['Model'] = 'Light GBM IP'

# Concat with main report
class_report2 = pd.concat([class_report2, temp_class_report2], axis=0)
# class_report2
class_report2.to_csv('classification_report2.csv')

## SVM

In [13]:
# Dictiornary with the ideal weight
dw = {0: 1, 1: pos_class_weight}

# Instance model
svc_m = svm.SVC(class_weight=dw)

# GridSearch
param_grid = {
    'C': [0.1, 1, 10, 100],
    'kernel': ['linear', 'rbf', 'poly', 'sigmoid'],
    'gamma': ['scale', 'auto'],
    'degree': [2, 3, 4],
    'class_weight': [None, 'balanced']
}

svc = GridSearchCV(svc_m, param_grid)

# Fit
svc.fit(x_train, y_train)

# Predict
svc_predict = svc.predict(x_test)

In [14]:
# See results
print('Confusion matrix', '\n', metrics.confusion_matrix(y_test, svc_predict))

# Classification report
print(metrics.classification_report(y_test, svc_predict))

Confusion matrix 
 [[11153  6310]
 [ 1106   670]]
              precision    recall  f1-score   support

           0       0.91      0.64      0.75     17463
           1       0.10      0.38      0.15      1776

    accuracy                           0.61     19239
   macro avg       0.50      0.51      0.45     19239
weighted avg       0.83      0.61      0.70     19239



In [15]:
# Temporary classification report
temp_class_report2 = metrics.classification_report(y_test, svc_predict, output_dict=True)
temp_class_report2 = pd.DataFrame(temp_class_report2).round(2).transpose()
temp_class_report2['Model'] = 'SVM IP'

# Merge with report df
class_report2 = pd.concat([class_report2, temp_class_report2], axis=0)
# class_report2
class_report2.to_csv('classification_report2.csv')

## Neural Network

In [16]:
# First transform df in arrays. This is the way tensorflow builds its models
x_train_a = x_train.to_numpy()
y_train_a = y_train.to_numpy()
x_test_a = x_test.to_numpy()
y_test_a = y_test.to_numpy()

x_train_a.shape

(44890, 36)

In [17]:
# Build neural netwrok
ann_sgd = tf.keras.models.Sequential([
  tf.keras.layers.Dense(64, input_shape=(36,), activation='tanh'),
  tf.keras.layers.Dense(32, activation='tanh'),
  tf.keras.layers.Dropout(0.20),
  tf.keras.layers.Dense(1, activation='sigmoid')
])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [18]:
# Compile and fit
opt = tf.keras.optimizers.SGD(learning_rate=0.01)

# Since this is a classification problem, our loss analysis also changes from MSE to binarycrossentropy
ann_sgd.compile(optimizer=opt, 
            loss='binary_crossentropy',
            metrics=['accuracy'])

# Predic
ann_sgd.fit(x_train_a, y_train_a, class_weight={0: 1.0, 1: pos_class_weight}, epochs=50)

Epoch 1/50
[1m1403/1403[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 897us/step - accuracy: 0.5276 - loss: 1.2998
Epoch 2/50
[1m1403/1403[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 867us/step - accuracy: 0.5047 - loss: 1.2610
Epoch 3/50
[1m1403/1403[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 833us/step - accuracy: 0.5145 - loss: 1.2574
Epoch 4/50
[1m1403/1403[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 872us/step - accuracy: 0.4432 - loss: 1.2634
Epoch 5/50
[1m1403/1403[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 889us/step - accuracy: 0.5215 - loss: 1.2523
Epoch 6/50
[1m1403/1403[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 859us/step - accuracy: 0.4355 - loss: 1.2673
Epoch 7/50
[1m1403/1403[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 857us/step - accuracy: 0.3911 - loss: 1.2772
Epoch 8/50
[1m1403/1403[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 849us/step - accuracy: 0.5183 - loss: 1.2555
Epoch 9/

<keras.src.callbacks.history.History at 0x1b5711caa90>

In [19]:
# Predict using ANN, and here I'll round rthe predictions so we have 1 (defualt) and 0 (non-default) instead of float numbers
ann_sgd_predict = (ann_sgd.predict(x_test_a) > 0.5).astype(int)

[1m602/602[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 651us/step


In [20]:
# See results
print('Confusion matrix', '\n', metrics.confusion_matrix(y_test_a, ann_sgd_predict))

# Classification report
print(metrics.classification_report(y_test_a, ann_sgd_predict))

Confusion matrix 
 [[   51 17412]
 [    5  1771]]
              precision    recall  f1-score   support

           0       0.91      0.00      0.01     17463
           1       0.09      1.00      0.17      1776

    accuracy                           0.09     19239
   macro avg       0.50      0.50      0.09     19239
weighted avg       0.84      0.09      0.02     19239



In [21]:
# Temporary classification report
temp_class_report2 = metrics.classification_report(y_test_a, ann_sgd_predict, output_dict=True)
temp_class_report2 = pd.DataFrame(temp_class_report2).round(2).transpose()
temp_class_report2['Model'] = 'ANN SGD IP'

# Merge with report df
class_report2 = pd.concat([class_report2, temp_class_report2], axis=0)

# Save
class_report2.to_csv('classification_report2.csv')

#### Adam optimizer

In [22]:
# Using similar architecture
ann_adam = tf.keras.models.Sequential([
  tf.keras.layers.Dense(64, input_shape=(36,), activation='relu'),
  tf.keras.layers.Dense(32, activation='tanh'),
  tf.keras.layers.Dropout(0.20),
  tf.keras.layers.Dense(10, activation='tanh'),
  tf.keras.layers.Dense(1, activation='sigmoid')
])

# Compile and fit
opt = tf.keras.optimizers.Adam(learning_rate=0.01)

# Since this is a classification problem, our loss analysis also changes from MSE to binarycrossentropy
ann_adam.compile(optimizer=opt, 
            loss='binary_crossentropy',
            metrics=['accuracy'])

# Fit in train sets
ann_adam.fit(x_train_a, y_train_a, class_weight={0: 1.0, 1: pos_class_weight}, epochs=50)

Epoch 1/50


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m1403/1403[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 983us/step - accuracy: 0.5060 - loss: 1.2975
Epoch 2/50
[1m1403/1403[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 960us/step - accuracy: 0.4963 - loss: 1.2653
Epoch 3/50
[1m1403/1403[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 955us/step - accuracy: 0.5037 - loss: 1.2697
Epoch 4/50
[1m1403/1403[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 959us/step - accuracy: 0.5489 - loss: 1.2582
Epoch 5/50
[1m1403/1403[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 978us/step - accuracy: 0.5158 - loss: 1.2596
Epoch 6/50
[1m1403/1403[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 962us/step - accuracy: 0.5059 - loss: 1.2691
Epoch 7/50
[1m1403/1403[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 971us/step - accuracy: 0.4520 - loss: 1.2849
Epoch 8/50
[1m1403/1403[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 940us/step - accuracy: 0.5300 - loss: 1.2536
Epoch 9/50
[1m1403

<keras.src.callbacks.history.History at 0x1b50dcc4bd0>

In [23]:
# Predict using ANN, and here I'll round rthe predictions so we have 1 (defualt) and 0 (non-default) instead of float numbers
ann_adam_predict = (ann_adam.predict(x_test_a) > 0.5).astype(int)

[1m602/602[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 670us/step


In [24]:
# See results
print('Confusion matrix', '\n', metrics.confusion_matrix(y_test_a, ann_adam_predict))

# Classification report
print(metrics.classification_report(y_test_a, ann_adam_predict))

Confusion matrix 
 [[17463     0]
 [ 1776     0]]
              precision    recall  f1-score   support

           0       0.91      1.00      0.95     17463
           1       0.00      0.00      0.00      1776

    accuracy                           0.91     19239
   macro avg       0.45      0.50      0.48     19239
weighted avg       0.82      0.91      0.86     19239



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [25]:
# Temporary classification report
temp_class_report2 = metrics.classification_report(y_test_a, ann_adam_predict, output_dict=True)
temp_class_report2 = pd.DataFrame(temp_class_report2).round(2).transpose()
temp_class_report2['Model'] = 'ANN ADAM IP'

# Merge with report df
class_report2 = pd.concat([class_report2, temp_class_report2], axis=0)

# Save
class_report2.to_csv('classification_report2.csv')

# Display final results
class_report2

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Unnamed: 0,precision,recall,f1-score,support,Model
0,0.91,0.52,0.66,17463.0,logit IP
1,0.1,0.5,0.16,1776.0,logit IP
accuracy,0.51,0.51,0.51,0.51,logit IP
macro avg,0.5,0.51,0.41,19239.0,logit IP
weighted avg,0.84,0.51,0.61,19239.0,logit IP
0,0.91,0.85,0.88,17463.0,xgb IP
1,0.09,0.15,0.11,1776.0,xgb IP
accuracy,0.78,0.78,0.78,0.78,xgb IP
macro avg,0.5,0.5,0.5,19239.0,xgb IP
weighted avg,0.83,0.78,0.81,19239.0,xgb IP
