## Credit Card Default Prediction - Random Forest

In [80]:
# importing required libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

In [81]:
# reading file
df=pd.read_csv('credit-card-default.csv')
df.head()

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,defaulted
0,1,20000,2,2,1,24,2,2,-1,-1,...,0,0,0,0,689,0,0,0,0,1
1,2,120000,2,2,2,26,-1,2,0,0,...,3272,3455,3261,0,1000,1000,1000,0,2000,1
2,3,90000,2,2,2,34,0,0,0,0,...,14331,14948,15549,1518,1500,1000,1000,1000,5000,0
3,4,50000,2,2,1,37,0,0,0,0,...,28314,28959,29547,2000,2019,1200,1100,1069,1000,0
4,5,50000,1,2,1,57,-1,0,-1,0,...,20940,19146,19131,2000,36681,10000,9000,689,679,0


In [82]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30000 entries, 0 to 29999
Data columns (total 25 columns):
 #   Column     Non-Null Count  Dtype
---  ------     --------------  -----
 0   ID         30000 non-null  int64
 1   LIMIT_BAL  30000 non-null  int64
 2   SEX        30000 non-null  int64
 3   EDUCATION  30000 non-null  int64
 4   MARRIAGE   30000 non-null  int64
 5   AGE        30000 non-null  int64
 6   PAY_0      30000 non-null  int64
 7   PAY_2      30000 non-null  int64
 8   PAY_3      30000 non-null  int64
 9   PAY_4      30000 non-null  int64
 10  PAY_5      30000 non-null  int64
 11  PAY_6      30000 non-null  int64
 12  BILL_AMT1  30000 non-null  int64
 13  BILL_AMT2  30000 non-null  int64
 14  BILL_AMT3  30000 non-null  int64
 15  BILL_AMT4  30000 non-null  int64
 16  BILL_AMT5  30000 non-null  int64
 17  BILL_AMT6  30000 non-null  int64
 18  PAY_AMT1   30000 non-null  int64
 19  PAY_AMT2   30000 non-null  int64
 20  PAY_AMT3   30000 non-null  int64
 21  PAY_AMT4   3

### Data preparation

In [83]:
from sklearn.model_selection import train_test_split
y=df.pop('defaulted')
X=df

X_train,X_test,y_train,y_test=train_test_split(X,y,train_size=0.7,test_size=0.3,random_state=101)
print(X_train.shape,X_test.shape,y_train.shape,y_test.shape)

(21000, 24) (9000, 24) (21000,) (9000,)


### Building a default model

In [84]:
from sklearn.ensemble import RandomForestClassifier
rfe=RandomForestClassifier()

rfe.fit(X_train,y_train)

RandomForestClassifier()

In [85]:
# evaluating the model
predictions=rfe.predict(X_test)
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
print(classification_report(y_test,predictions))

              precision    recall  f1-score   support

           0       0.84      0.95      0.89      7058
           1       0.65      0.36      0.46      1942

    accuracy                           0.82      9000
   macro avg       0.75      0.65      0.68      9000
weighted avg       0.80      0.82      0.80      9000



In [86]:
print(confusion_matrix(y_test,predictions))

[[6683  375]
 [1252  690]]


In [87]:
print("The accuract score is",accuracy_score(y_test,predictions))

The accuract score is 0.8192222222222222


### Building a model with hyperparameter tuning

In [88]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold

n_folds=5
param_grid={
    'max_depth':[4,8,10],
    'min_samples_leaf':range(100,400,200),
    'min_samples_split':range(200,500,200),
    'n_estimators':[100,200,300],
    'max_features':[5,10]
}

rf=RandomForestClassifier()

grid_search=GridSearchCV(estimator=rf,
                        cv=n_folds,
                        param_grid=param_grid,
                        n_jobs=-1,
                        verbose=1)

In [99]:
grid_search.fit(X_train,y_train)

Fitting 5 folds for each of 72 candidates, totalling 360 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   31.1s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  3.5min
[Parallel(n_jobs=-1)]: Done 360 out of 360 | elapsed:  8.7min finished


GridSearchCV(cv=5, estimator=RandomForestClassifier(), n_jobs=-1,
             param_grid={'max_depth': [4, 8, 10], 'max_features': [5, 10],
                         'min_samples_leaf': range(100, 400, 200),
                         'min_samples_split': range(200, 500, 200),
                         'n_estimators': [100, 200, 300]},
             verbose=1)

In [100]:
# printing the best score and the best_parameters
print("the best score achieved is ",grid_search.best_score_," and the best parameters are ",grid_search.best_params_)

the best score achieved is  0.818047619047619  and the best parameters are  {'max_depth': 4, 'max_features': 10, 'min_samples_leaf': 100, 'min_samples_split': 200, 'n_estimators': 100}


### Building the model with best hyperparameters

In [101]:
rf=RandomForestClassifier(max_depth= 4, max_features= 10, min_samples_leaf= 100, min_samples_split= 400, n_estimators=200)
rf.fit(X_train,y_train)

RandomForestClassifier(max_depth=4, max_features=10, min_samples_leaf=100,
                       min_samples_split=400, n_estimators=200)

In [102]:
# evaluating the model
predictions=rf.predict(X_test)
print(classification_report(y_test,predictions))

              precision    recall  f1-score   support

           0       0.84      0.96      0.90      7058
           1       0.69      0.35      0.47      1942

    accuracy                           0.83      9000
   macro avg       0.77      0.66      0.68      9000
weighted avg       0.81      0.83      0.80      9000



In [103]:
print(confusion_matrix(y_test,predictions))

[[6750  308]
 [1254  688]]


In [104]:
print("The accuract score is",accuracy_score(y_test,predictions))

The accuract score is 0.8264444444444444
