# Predicting default of credit card clients 
### Classification using SHAP for feature interpretation
Models tested:
* Logistic Regression
* Naive Bayes (Gaussian)
* Decision Tree
* Random Forest
* Extra Trees
* AdaBoost
* Gradient Boosting
* XGBoost

---

Data source:
https://archive.ics.uci.edu/ml/datasets/default+of+credit+card+clients

SHAP:
https://shap.readthedocs.io/en/latest/index.html


In [14]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap

# data preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score

# datasets to test
from sklearn.datasets import load_iris
from sklearn.datasets import make_moons, make_circles, make_classification

# classification models 
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier, GradientBoostingClassifier
# import xgboost as xgb
# !pip install -q catboost
# from catboost import Pool, CatBoostClassifier

from sklearn.metrics import accuracy_score, auc, classification_report
import shap

In [15]:
# Load the credit-card default workbook into df_cc.
# header=1 makes pandas take the column labels from sheet row index 1 (0-based).
df_cc = pd.read_excel(
    io='../data/defaultofcreditcardclients.xls',
    header=1,
)
# Report (rows, columns), then preview the first five records.
print(df_cc.shape)
df_cc.head()

(30000, 25)


Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month
0,1,20000,2,2,1,24,2,2,-1,-1,...,0,0,0,0,689,0,0,0,0,1
1,2,120000,2,2,2,26,-1,2,0,0,...,3272,3455,3261,0,1000,1000,1000,0,2000,1
2,3,90000,2,2,2,34,0,0,0,0,...,14331,14948,15549,1518,1500,1000,1000,1000,5000,0
3,4,50000,2,2,1,37,0,0,0,0,...,28314,28959,29547,2000,2019,1200,1100,1069,1000,0
4,5,50000,1,2,1,57,-1,0,-1,0,...,20940,19146,19131,2000,36681,10000,9000,689,679,0


In [24]:
# Summarise df_cc: column names, non-null counts and dtypes.
# NOTE(review): the saved output lists 24 columns starting at LIMIT_BAL and carries
# execution count 24, i.e. it was produced after the "Drop ID" cell further down
# (count 16). On a fresh Restart & Run All this cell runs before the drop and
# will therefore also list the ID column.
df_cc.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30000 entries, 0 to 29999
Data columns (total 24 columns):
 #   Column                      Non-Null Count  Dtype
---  ------                      --------------  -----
 0   LIMIT_BAL                   30000 non-null  int64
 1   SEX                         30000 non-null  int64
 2   EDUCATION                   30000 non-null  int64
 3   MARRIAGE                    30000 non-null  int64
 4   AGE                         30000 non-null  int64
 5   PAY_0                       30000 non-null  int64
 6   PAY_2                       30000 non-null  int64
 7   PAY_3                       30000 non-null  int64
 8   PAY_4                       30000 non-null  int64
 9   PAY_5                       30000 non-null  int64
 10  PAY_6                       30000 non-null  int64
 11  BILL_AMT1                   30000 non-null  int64
 12  BILL_AMT2                   30000 non-null  int64
 13  BILL_AMT3                   30000 non-null  int64
 14  BILL_A

In [16]:
# Drop ID
# ID looks like a running row number in the preview (1, 2, 3, ...), so it is
# removed before modelling.
# Rebinding df_cc (instead of inplace=True) together with errors='ignore' keeps
# this cell idempotent: re-running it after ID is already gone is a no-op rather
# than a KeyError.
df_cc = df_cc.drop(columns=['ID'], errors='ignore')

In [17]:
# Class balance check: count how often each value occurs in the last column of
# df_cc (the column later used as the target y).
df_cc.iloc[:,-1].value_counts()

0    23364
1     6636
Name: default payment next month, dtype: int64

In [26]:
# Create classifiers
# Every estimator that accepts a seed gets the same fixed random_state, so that
# cross-validation scores are reproducible from one run to the next. 42 matches
# the seed already used for train_test_split in this notebook.
RANDOM_STATE = 42

lr = LogisticRegression(solver='liblinear', random_state=RANDOM_STATE)
gnb = GaussianNB()  # GaussianNB takes no random_state argument
dtc = DecisionTreeClassifier(random_state=RANDOM_STATE)
rfc = RandomForestClassifier(random_state=RANDOM_STATE)
etc = ExtraTreesClassifier(random_state=RANDOM_STATE)
abc = AdaBoostClassifier(random_state=RANDOM_STATE)
gbc = GradientBoostingClassifier(random_state=RANDOM_STATE)

In [27]:
# Separate predictors from the target and hold out a test set.
# n_features is the total column count of df_cc (target column included).
n_features = len(df_cc.columns)

# X: every column except the last; y: the last column.
X, y = df_cc.iloc[:, :-1], df_cc.iloc[:, -1]

# 67 % / 33 % split with a fixed seed so the partition is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

# Sanity check: shapes of the four resulting pieces.
print(*(part.shape for part in (X_train, X_test, y_train, y_test)))

(20100, 23) (9900, 23) (20100,) (9900,)


In [31]:
# Cross-validate each selected classifier on the training split and print the
# mean of the fold scores (cross_val_score defaults for cv and scoring).
# NOTE(review): the features are passed unscaled; StandardScaler is imported but
# not used in the cells shown - confirm whether scaling was intended.
models = [lr, gnb]
# To evaluate every classifier created earlier, use instead:
#   models = [lr, gnb, dtc, rfc, etc, abc, gbc]
for model in models:
    print('Fitting Model:\n', model)
    scores = cross_val_score(estimator=model, X=X_train, y=y_train)
    # end="\n\n\n" reproduces the blank lines that separate the per-model reports.
    print("Mean Score: ", np.round(np.mean(scores), 4), end="\n\n\n")

Fitting Model:
 LogisticRegression(solver='liblinear')
Mean Score:  0.7771


Fitting Model:
 GaussianNB()
Mean Score:  0.3794


