In [12]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.utils import resample
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate, KFold

from sklearn.metrics import recall_score, roc_auc_score, f1_score
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report, confusion_matrix

In [13]:
df_selected = pd.read_csv("CleanData.csv")
df_selected.drop("Unnamed: 0", axis=1)

Unnamed: 0,Patron_Salary,Automobile_Possession,Two-Wheeler_Ownership,Ongoing_Borrowing,Residence_Proprietorship,Offspring_Number,Loan_Capital,Borrowing_Periodic_Payment,Customer_Revenue_Category,Patron_Academic_Qualification,...,Work_Duration_in_Days,Enlistment_Period_in_Days,Identity_Age_in_Days,Patron_Kin_Count,Customer_Urban_Area_Ranking,Patron_Constant_Correspondence_Marker,Sort_of_Institution,Solvency_Information_Agency,Combined_Rating,Default
0,6750.0,0.0,0.0,1.0,0.0,0.0,61190.55,3416.85,0,4,...,1062.0,6123.0,383.0,2.0,2.0,1.0,42,0.0,-0.030327,0
1,11250.0,0.0,1.0,1.0,1.0,1.0,13752.00,653.85,3,4,...,1184.0,3910.0,3910.0,2.0,2.0,1.0,33,0.0,0.188363,0
2,13500.0,1.0,1.0,0.0,1.0,1.0,45000.00,1200.15,0,0,...,7889.0,5455.0,2665.0,3.0,2.0,1.0,42,0.0,0.128827,0
3,27000.0,0.0,0.0,1.0,1.0,1.0,67500.00,3375.00,1,0,...,2263.0,2933.0,4640.0,2.0,2.0,1.0,11,1.0,0.262592,0
4,13500.0,1.0,1.0,1.0,0.0,1.0,18000.00,900.00,3,4,...,125.0,6775.0,2834.0,3.0,1.0,1.0,20,1.0,0.045462,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47110,18000.0,1.0,1.0,0.0,0.0,1.0,27302.40,2169.90,3,4,...,1521.0,4883.0,3602.0,3.0,2.0,1.0,13,2.0,0.114956,0
47111,10350.0,0.0,1.0,0.0,0.0,0.0,18792.90,1736.55,3,1,...,763.0,3773.0,1874.0,1.0,2.0,1.0,42,0.0,0.111925,0
47112,8100.0,0.0,1.0,0.0,1.0,1.0,55107.90,2989.35,1,4,...,1623.0,3980.0,353.0,3.0,3.0,0.0,50,0.0,-0.461028,0
47113,38250.0,1.0,1.0,0.0,1.0,0.0,45000.00,2719.35,3,0,...,847.0,895.0,2902.0,2.0,2.0,1.0,5,2.0,-0.408079,0


In [14]:
df_selected.Default.value_counts()

Default
0    42669
1     4446
Name: count, dtype: int64

In [15]:
## it is an imbalance dataset

In [16]:
#separate classes into two data frames: 1. df_major and 2. df_minor
df_major = df_selected[df_selected.Default == 0]
df_minor = df_selected[df_selected.Default == 1]

In [17]:
df_minor_upsmapled = resample(df_minor, replace = True, n_samples = 42669, random_state = 2018)

In [18]:
df_minor_upsmapled = pd.concat([df_minor_upsmapled, df_major])

In [19]:
df_minor_upsmapled.Default.value_counts()

Default
1    42669
0    42669
Name: count, dtype: int64

In [20]:
def evaluate_model(ytest, ypred, ypred_proba = None):
    if ypred_proba is not None:
        print('ROC-AUC score of the model:   {}'.format(roc_auc_score(ytest, ypred_proba[:, 1])))
    print('Accuracy of the model: {}\n'.format(accuracy_score(ytest, ypred)))
    print('Classification report: \n{}\n'.format(classification_report(ytest, ypred)))
    print('Confusion matrix: \n{}\n'.format(confusion_matrix(ytest, ypred)))

In [21]:
X = df_minor_upsmapled.drop('Default', axis = 1)
Y = df_minor_upsmapled.Default


#Splitting into test and train set
xtrain, xtest, ytrain, ytest = train_test_split(X, Y, test_size=0.25, random_state=0)
mms = StandardScaler()
mms.fit(xtrain)
xtrain_scaled = mms.transform(xtrain)

### LogisticRegression

In [23]:
from sklearn.linear_model import LogisticRegression
logisticRegr = LogisticRegression()

In [24]:
logisticRegr.fit(xtrain_scaled, ytrain)

In [25]:
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True, intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1, penalty='l2', random_state=None, solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

In [26]:
xtest_scaled = mms.transform(xtest)
lr_pred = logisticRegr.predict(xtest_scaled)

In [27]:
evaluate_model(ytest, lr_pred)

Accuracy of the model: 0.6355753456761191

Classification report: 
              precision    recall  f1-score   support

           0       0.64      0.64      0.64     10786
           1       0.63      0.63      0.63     10549

    accuracy                           0.64     21335
   macro avg       0.64      0.64      0.64     21335
weighted avg       0.64      0.64      0.64     21335


Confusion matrix: 
[[6929 3857]
 [3918 6631]]



In our final dataset, almost 60% of our features are categorical. 
Therefore, a tree-based model may be a better choice. Lets implement Random forest.

#### Random forest (RF) model

In [28]:
# Defining the hyperparameters
def random_forest(xtrain, xtest, ytrain):
    rf_params = {
        'n_estimators': 126, 
        'max_depth': 14
    }

    rf = RandomForestClassifier(**rf_params)
    rf.fit(xtrain, ytrain)
    rfpred = rf.predict(xtest)
    rfpred_proba = rf.predict_proba(xtest)
    
    return rfpred, rfpred_proba

In [29]:
rfpred, rfpred_proba = random_forest(xtrain_scaled, xtest_scaled, ytrain)

In [30]:
evaluate_model(ytest, rfpred, rfpred_proba)

ROC-AUC score of the model:   0.9836502527115257
Accuracy of the model: 0.9232247480665573

Classification report: 
              precision    recall  f1-score   support

           0       0.98      0.86      0.92     10786
           1       0.88      0.98      0.93     10549

    accuracy                           0.92     21335
   macro avg       0.93      0.92      0.92     21335
weighted avg       0.93      0.92      0.92     21335


Confusion matrix: 
[[ 9322  1464]
 [  174 10375]]

