In [13]:
# Dependencies

import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib
plt.style.use('ggplot')
%matplotlib inline

# Load libraries

from sklearn.tree import DecisionTreeClassifier # Import Decision Tree Classifier
from sklearn.model_selection import train_test_split # Import train_test_split function
from sklearn import metrics #Import scikit-learn metrics module for accuracy calculation

In [14]:
df = pd.read_excel('../default_of_credit_card_clients.xls', header=0)

df.columns = df.iloc[0,:]

df.rename(index=str, columns={"default payment next month": "Y"}, inplace=True)

df.drop('ID', inplace=True, axis=0)

y = df.Y

df.head()

ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,Y
1,20000,2,2,1,24,2,2,-1,-1,-2,...,0,0,0,0,689,0,0,0,0,1
2,120000,2,2,2,26,-1,2,0,0,0,...,3272,3455,3261,0,1000,1000,1000,0,2000,1
3,90000,2,2,2,34,0,0,0,0,0,...,14331,14948,15549,1518,1500,1000,1000,1000,5000,0
4,50000,2,2,1,37,0,0,0,0,0,...,28314,28959,29547,2000,2019,1200,1100,1069,1000,0
5,50000,1,2,1,57,-1,0,-1,0,0,...,20940,19146,19131,2000,36681,10000,9000,689,679,0


In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 30000 entries, 1 to 30000
Data columns (total 24 columns):
LIMIT_BAL    30000 non-null object
SEX          30000 non-null object
EDUCATION    30000 non-null object
MARRIAGE     30000 non-null object
AGE          30000 non-null object
PAY_0        30000 non-null object
PAY_2        30000 non-null object
PAY_3        30000 non-null object
PAY_4        30000 non-null object
PAY_5        30000 non-null object
PAY_6        30000 non-null object
BILL_AMT1    30000 non-null object
BILL_AMT2    30000 non-null object
BILL_AMT3    30000 non-null object
BILL_AMT4    30000 non-null object
BILL_AMT5    30000 non-null object
BILL_AMT6    30000 non-null object
PAY_AMT1     30000 non-null object
PAY_AMT2     30000 non-null object
PAY_AMT3     30000 non-null object
PAY_AMT4     30000 non-null object
PAY_AMT5     30000 non-null object
PAY_AMT6     30000 non-null object
Y            30000 non-null object
dtypes: object(24)
memory usage: 5.7+ MB


In [16]:
df = df.astype('float32')

In [17]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 30000 entries, 1 to 30000
Data columns (total 24 columns):
LIMIT_BAL    30000 non-null float32
SEX          30000 non-null float32
EDUCATION    30000 non-null float32
MARRIAGE     30000 non-null float32
AGE          30000 non-null float32
PAY_0        30000 non-null float32
PAY_2        30000 non-null float32
PAY_3        30000 non-null float32
PAY_4        30000 non-null float32
PAY_5        30000 non-null float32
PAY_6        30000 non-null float32
BILL_AMT1    30000 non-null float32
BILL_AMT2    30000 non-null float32
BILL_AMT3    30000 non-null float32
BILL_AMT4    30000 non-null float32
BILL_AMT5    30000 non-null float32
BILL_AMT6    30000 non-null float32
PAY_AMT1     30000 non-null float32
PAY_AMT2     30000 non-null float32
PAY_AMT3     30000 non-null float32
PAY_AMT4     30000 non-null float32
PAY_AMT5     30000 non-null float32
PAY_AMT6     30000 non-null float32
Y            30000 non-null float32
dtypes: float32(24)
memory usage

In [18]:
df.columns

Index(['LIMIT_BAL', 'SEX', 'EDUCATION', 'MARRIAGE', 'AGE', 'PAY_0', 'PAY_2',
       'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6', 'BILL_AMT1', 'BILL_AMT2',
       'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6', 'PAY_AMT1',
       'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6', 'Y'],
      dtype='object', name='ID')

In [40]:
# divided columns into two types of variables - target variable (y) and feature variables (X).
#X = df.drop(columns="Y")
#y = df.Y

In [41]:
# created new column OUTSTANDING_BAL for each month

df["OUTSTANDING_BAL1"] = (df.BILL_AMT1 - df.PAY_AMT1)
df["OUTSTANDING_BAL2"] = (df.BILL_AMT2 - df.PAY_AMT2)
df["OUTSTANDING_BAL3"] = (df.BILL_AMT3 - df.PAY_AMT3)
df["OUTSTANDING_BAL4"] = (df.BILL_AMT4 - df.PAY_AMT4)
df["OUTSTANDING_BAL5"] = (df.BILL_AMT5 - df.PAY_AMT5)
df["OUTSTANDING_BAL6"] = (df.BILL_AMT6 - df.PAY_AMT6)

df.head(5)

ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,...,OUTSTANDING_BAL3,OUTSTANDING_BAL4,OUTSTANDING_BAL5,OUTSTANDING_BAL6,%_LIMIT_BAL1,%_LIMIT_BAL2,%_LIMIT_BAL3,%_LIMIT_BAL4,%_LIMIT_BAL5,%_LIMIT_BAL6
1,20000.0,2.0,2.0,1.0,24.0,2.0,2.0,-1.0,-1.0,-2.0,...,689.0,0.0,0.0,0.0,0.19565,0.12065,0.03445,0.0,0.0,0.0
2,120000.0,2.0,2.0,2.0,26.0,-1.0,2.0,0.0,0.0,0.0,...,1682.0,2272.0,3455.0,1261.0,0.02235,0.006042,0.014017,0.018933,0.028792,0.010508
3,90000.0,2.0,2.0,2.0,34.0,0.0,0.0,0.0,0.0,0.0,...,12559.0,13331.0,13948.0,10549.0,0.308011,0.139189,0.139544,0.148122,0.154978,0.117211
4,50000.0,2.0,2.0,1.0,37.0,0.0,0.0,0.0,0.0,0.0,...,48091.0,27214.0,27890.0,28547.0,0.8998,0.92428,0.96182,0.54428,0.5578,0.57094
5,50000.0,1.0,2.0,1.0,57.0,-1.0,0.0,-1.0,0.0,0.0,...,25835.0,11940.0,18457.0,18452.0,0.13234,-0.62022,0.5167,0.2388,0.36914,0.36904


In [42]:
# created new column %_LIMIT_BAL for each month

df["%_LIMIT_BAL1"] = (df.OUTSTANDING_BAL1 / df.LIMIT_BAL)
df["%_LIMIT_BAL2"] = (df.OUTSTANDING_BAL2 / df.LIMIT_BAL)
df["%_LIMIT_BAL3"] = (df.OUTSTANDING_BAL3 / df.LIMIT_BAL)
df["%_LIMIT_BAL4"] = (df.OUTSTANDING_BAL4 / df.LIMIT_BAL)
df["%_LIMIT_BAL5"] = (df.OUTSTANDING_BAL5 / df.LIMIT_BAL)
df["%_LIMIT_BAL6"] = (df.OUTSTANDING_BAL6 / df.LIMIT_BAL)


df.head(5)

ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,...,OUTSTANDING_BAL3,OUTSTANDING_BAL4,OUTSTANDING_BAL5,OUTSTANDING_BAL6,%_LIMIT_BAL1,%_LIMIT_BAL2,%_LIMIT_BAL3,%_LIMIT_BAL4,%_LIMIT_BAL5,%_LIMIT_BAL6
1,20000.0,2.0,2.0,1.0,24.0,2.0,2.0,-1.0,-1.0,-2.0,...,689.0,0.0,0.0,0.0,0.19565,0.12065,0.03445,0.0,0.0,0.0
2,120000.0,2.0,2.0,2.0,26.0,-1.0,2.0,0.0,0.0,0.0,...,1682.0,2272.0,3455.0,1261.0,0.02235,0.006042,0.014017,0.018933,0.028792,0.010508
3,90000.0,2.0,2.0,2.0,34.0,0.0,0.0,0.0,0.0,0.0,...,12559.0,13331.0,13948.0,10549.0,0.308011,0.139189,0.139544,0.148122,0.154978,0.117211
4,50000.0,2.0,2.0,1.0,37.0,0.0,0.0,0.0,0.0,0.0,...,48091.0,27214.0,27890.0,28547.0,0.8998,0.92428,0.96182,0.54428,0.5578,0.57094
5,50000.0,1.0,2.0,1.0,57.0,-1.0,0.0,-1.0,0.0,0.0,...,25835.0,11940.0,18457.0,18452.0,0.13234,-0.62022,0.5167,0.2388,0.36914,0.36904


In [43]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 30000 entries, 1 to 30000
Data columns (total 36 columns):
LIMIT_BAL           30000 non-null float32
SEX                 30000 non-null float32
EDUCATION           30000 non-null float32
MARRIAGE            30000 non-null float32
AGE                 30000 non-null float32
PAY_0               30000 non-null float32
PAY_2               30000 non-null float32
PAY_3               30000 non-null float32
PAY_4               30000 non-null float32
PAY_5               30000 non-null float32
PAY_6               30000 non-null float32
BILL_AMT1           30000 non-null float32
BILL_AMT2           30000 non-null float32
BILL_AMT3           30000 non-null float32
BILL_AMT4           30000 non-null float32
BILL_AMT5           30000 non-null float32
BILL_AMT6           30000 non-null float32
PAY_AMT1            30000 non-null float32
PAY_AMT2            30000 non-null float32
PAY_AMT3            30000 non-null float32
PAY_AMT4            30000 non-null f

In [46]:
# divided columns into two types of variables - target variable (y) and feature variables (X).
#X = df.drop(columns="Y")
#y = df.Y

# divided columns into two types of variables - target variable (y) and feature variables (X).
# X = df.drop(columns=["PAY_0","PAY_2", "PAY_3", "PAY_4", "PAY_5", "PAY_6", "Y"])
# y = df.Y

In [48]:
# Split dataset into training set and test set
# 70% training and 30% test

X_train, X_test = train_test_split(X, test_size=0.3, random_state=2019) 
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=2019) 

In [49]:
print("***** X_train *****")
print(X_train.head())
print("\n")
print("***** X_test *****")
print(X_test.head())

***** X_train *****
ID     LIMIT_BAL  SEX  EDUCATION  MARRIAGE   AGE  PAY_0  PAY_2  PAY_3  PAY_4  \
20748   360000.0  1.0        2.0       1.0  36.0   -2.0   -2.0   -2.0   -2.0   
22071   100000.0  2.0        2.0       2.0  23.0    0.0    0.0    0.0    0.0   
3439    200000.0  2.0        1.0       1.0  34.0   -1.0   -1.0   -1.0   -1.0   
21757   270000.0  2.0        2.0       1.0  25.0    0.0    0.0    0.0    0.0   
7796    500000.0  2.0        1.0       1.0  38.0    0.0    0.0    0.0    2.0   

ID     PAY_5      ...       OUTSTANDING_BAL3  OUTSTANDING_BAL4  \
20748   -2.0      ...                 4884.0             -98.0   
22071    0.0      ...                17126.0           19225.0   
3439     0.0      ...                 9041.0           12088.0   
21757    0.0      ...               165153.0          151710.0   
7796    -1.0      ...                48785.0           -3095.0   

ID     OUTSTANDING_BAL5  OUTSTANDING_BAL6  %_LIMIT_BAL1  %_LIMIT_BAL2  \
20748           -1744.0      

In [50]:
# For the train set
X_train.isna().head()

ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,...,OUTSTANDING_BAL3,OUTSTANDING_BAL4,OUTSTANDING_BAL5,OUTSTANDING_BAL6,%_LIMIT_BAL1,%_LIMIT_BAL2,%_LIMIT_BAL3,%_LIMIT_BAL4,%_LIMIT_BAL5,%_LIMIT_BAL6
20748,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
22071,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3439,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
21757,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
7796,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [51]:
# For the test set
X_test.isna().head()

ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,...,OUTSTANDING_BAL3,OUTSTANDING_BAL4,OUTSTANDING_BAL5,OUTSTANDING_BAL6,%_LIMIT_BAL1,%_LIMIT_BAL2,%_LIMIT_BAL3,%_LIMIT_BAL4,%_LIMIT_BAL5,%_LIMIT_BAL6
9707,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
13035,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
23848,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
14913,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
11360,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [52]:
print("*****In the X_train set*****")
print(X_train.isna().sum())
print("\n")
print("*****In the X_test set*****")
print(X_test.isna().sum())

*****In the X_train set*****
ID
LIMIT_BAL           0
SEX                 0
EDUCATION           0
MARRIAGE            0
AGE                 0
PAY_0               0
PAY_2               0
PAY_3               0
PAY_4               0
PAY_5               0
PAY_6               0
BILL_AMT1           0
BILL_AMT2           0
BILL_AMT3           0
BILL_AMT4           0
BILL_AMT5           0
BILL_AMT6           0
PAY_AMT1            0
PAY_AMT2            0
PAY_AMT3            0
PAY_AMT4            0
PAY_AMT5            0
PAY_AMT6            0
OUTSTANDING_BAL1    0
OUTSTANDING_BAL2    0
OUTSTANDING_BAL3    0
OUTSTANDING_BAL4    0
OUTSTANDING_BAL5    0
OUTSTANDING_BAL6    0
%_LIMIT_BAL1        0
%_LIMIT_BAL2        0
%_LIMIT_BAL3        0
%_LIMIT_BAL4        0
%_LIMIT_BAL5        0
%_LIMIT_BAL6        0
dtype: int64


*****In the X_test set*****
ID
LIMIT_BAL           0
SEX                 0
EDUCATION           0
MARRIAGE            0
AGE                 0
PAY_0               0
PAY_2               