In [1]:
import numpy as np
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,confusion_matrix,mean_squared_error
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score
from sklearn.preprocessing import StandardScaler

import warnings
warnings.filterwarnings('ignore')

In [2]:
data = pd.read_csv('UCI_Credit_Card.csv')
data.head()

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default.payment.next.month
0,1,20000.0,2,2,1,24,2,2,-1,-1,...,0.0,0.0,0.0,0.0,689.0,0.0,0.0,0.0,0.0,1
1,2,120000.0,2,2,2,26,-1,2,0,0,...,3272.0,3455.0,3261.0,0.0,1000.0,1000.0,1000.0,0.0,2000.0,1
2,3,90000.0,2,2,2,34,0,0,0,0,...,14331.0,14948.0,15549.0,1518.0,1500.0,1000.0,1000.0,1000.0,5000.0,0
3,4,50000.0,2,2,1,37,0,0,0,0,...,28314.0,28959.0,29547.0,2000.0,2019.0,1200.0,1100.0,1069.0,1000.0,0
4,5,50000.0,1,2,1,57,-1,0,-1,0,...,20940.0,19146.0,19131.0,2000.0,36681.0,10000.0,9000.0,689.0,679.0,0


In [3]:
# Rename Column names
data.rename(columns={'PAY_0': 'PAY_1', 'default.payment.next.month': 'def_pay'}, inplace=True)

In [4]:
data.columns

Index(['ID', 'LIMIT_BAL', 'SEX', 'EDUCATION', 'MARRIAGE', 'AGE', 'PAY_1',
       'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6', 'BILL_AMT1', 'BILL_AMT2',
       'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6', 'PAY_AMT1',
       'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6', 'def_pay'],
      dtype='object')

In [6]:
#Education Column -- combine 5,6,0, into 4
data.EDUCATION.value_counts()

2    14030
1    10585
3     4917
5      280
4      123
6       51
0       14
Name: EDUCATION, dtype: int64

In [10]:
fill = (data['EDUCATION'] == 5) | (data['EDUCATION'] == 6) | (data['EDUCATION'] == 0)
data.loc[fill, 'EDUCATION'] = 4
data.EDUCATION.value_counts()

2    14030
1    10585
3     4917
4      468
Name: EDUCATION, dtype: int64

In [12]:
# Combine 0 and 3
data.loc[data.MARRIAGE == 0, 'MARRIAGE'] = 3
data.MARRIAGE.value_counts()

2    15964
1    13659
3      377
Name: MARRIAGE, dtype: int64

In [17]:
data.columns

Index(['ID', 'LIMIT_BAL', 'SEX', 'EDUCATION', 'MARRIAGE', 'AGE', 'PAY_1',
       'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6', 'BILL_AMT1', 'BILL_AMT2',
       'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6', 'PAY_AMT1',
       'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6', 'def_pay'],
      dtype='object')

In [18]:
#Explorign pay columns
data.PAY_1.value_counts()

 0    14737
-1     5686
 1     3688
-2     2759
 2     2667
 3      322
 4       76
 5       26
 8       19
 6       11
 7        9
Name: PAY_1, dtype: int64

In [15]:
#Everything 0 and under is considered ontime?
data.PAY_4.value_counts()

 0    16455
-1     5687
-2     4348
 2     3159
 3      180
 4       69
 7       58
 5       35
 6        5
 8        2
 1        2
Name: PAY_4, dtype: int64

In [23]:
#fill all pay columns 
fil = (data.PAY_1 == -2) | (data.PAY_1 == -1) | (data.PAY_1 == 0)
data.loc[fil, 'PAY_1'] = 0

In [24]:
fil = (data.PAY_2 == -2) | (data.PAY_2 == -1) | (data.PAY_2 == 0)
data.loc[fil, 'PAY_2'] = 0

In [25]:
fil = (data.PAY_3 == -2) | (data.PAY_3 == -1) | (data.PAY_3 == 0)
data.loc[fil, 'PAY_3'] = 0

In [26]:
fil = (data.PAY_4 == -2) | (data.PAY_4 == -1) | (data.PAY_4 == 0)
data.loc[fil, 'PAY_4'] = 0

In [27]:
fil = (data.PAY_5 == -2) | (data.PAY_5 == -1) | (data.PAY_5 == 0)
data.loc[fil, 'PAY_1'] = 0

In [28]:
fil = (data.PAY_6 == -2) | (data.PAY_6 == -1) | (data.PAY_6 == 0)
data.loc[fil, 'PAY_6'] = 0

In [29]:
data.PAY_6.value_counts()

0    26921
2     2766
3      184
4       49
7       46
6       19
5       13
8        2
Name: PAY_6, dtype: int64

In [32]:
#no cleaning needed for Bill amount or pay amount

In [33]:
data

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_1,PAY_2,PAY_3,PAY_4,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,def_pay
0,1,20000.0,2,2,1,24,0,2,0,0,...,0.0,0.0,0.0,0.0,689.0,0.0,0.0,0.0,0.0,1
1,2,120000.0,2,2,2,26,0,2,0,0,...,3272.0,3455.0,3261.0,0.0,1000.0,1000.0,1000.0,0.0,2000.0,1
2,3,90000.0,2,2,2,34,0,0,0,0,...,14331.0,14948.0,15549.0,1518.0,1500.0,1000.0,1000.0,1000.0,5000.0,0
3,4,50000.0,2,2,1,37,0,0,0,0,...,28314.0,28959.0,29547.0,2000.0,2019.0,1200.0,1100.0,1069.0,1000.0,0
4,5,50000.0,1,2,1,57,0,0,0,0,...,20940.0,19146.0,19131.0,2000.0,36681.0,10000.0,9000.0,689.0,679.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29995,29996,220000.0,1,3,1,39,0,0,0,0,...,88004.0,31237.0,15980.0,8500.0,20000.0,5003.0,3047.0,5000.0,1000.0,0
29996,29997,150000.0,1,3,2,43,0,0,0,0,...,8979.0,5190.0,0.0,1837.0,3526.0,8998.0,129.0,0.0,0.0,0
29997,29998,30000.0,1,2,2,37,0,3,2,0,...,20878.0,20582.0,19357.0,0.0,0.0,22000.0,4200.0,2000.0,3100.0,1
29998,29999,80000.0,1,3,1,41,0,0,0,0,...,52774.0,11855.0,48944.0,85900.0,3409.0,1178.0,1926.0,52964.0,1804.0,1


# Modeling

In [34]:
#Defining X and Y
X = data.drop(['def_pay'], axis=1)
y = data['def_pay']

## Logestic Regeression

In [38]:
#test-train-split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [41]:
#notmalizing data
from sklearn.preprocessing import Normalizer
#instantiate 
norm = Normalizer()

In [42]:
#Normalize the trainin and test sets
norm_data_train = norm.fit_transform(X_train)
norm_data_test = norm.transform(X_test)

In [50]:
#Convert normalized data into DF
norm_df_train = pd.DataFrame(norm_data_train, y_train)
norm_df_train

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,14,15,16,17,18,19,20,21,22,23
def_pay,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.057841,0.819361,0.000003,0.000008,0.000005,0.000085,0.000000,0.000000,0.000000,0.000000,...,0.230683,0.235602,0.238265,0.243558,0.010925,0.010925,0.011198,0.009013,0.009559,0.009559
0,0.404483,0.337872,0.000017,0.000034,0.000034,0.000405,0.000000,0.000000,0.000000,0.000000,...,0.591884,0.248234,0.285738,0.237760,0.022181,0.035645,0.067574,0.101362,0.067574,0.002939
0,0.013847,0.998590,0.000022,0.000022,0.000022,0.000388,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.029592,0.000000,0.000000,0.000000,0.000000,0.000000
0,0.078500,0.996914,0.000007,0.000007,0.000003,0.000133,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,0.196243,0.482762,0.000014,0.000014,0.000014,0.000248,0.000000,0.000000,0.000000,0.000000,...,0.408037,0.202167,0.198925,0.203057,0.023035,0.014097,0.012228,0.008145,0.008379,0.008945
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,0.203007,0.340581,0.000007,0.000014,0.000014,0.000218,0.000000,0.000000,0.000000,0.000000,...,0.379673,0.381158,0.199158,0.041680,0.013623,0.020435,0.010837,0.000490,0.007724,0.500116
1,0.011869,0.440342,0.000002,0.000002,0.000004,0.000081,0.000004,0.000004,0.000004,0.000004,...,0.370737,0.361481,0.372152,0.378879,0.029723,0.013210,0.000000,0.016513,0.013210,0.008807
0,0.017217,0.999852,0.000020,0.000020,0.000040,0.000520,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,0.108803,0.482159,0.000014,0.000014,0.000014,0.000172,0.000000,0.000000,0.000000,0.000000,...,0.352335,0.241920,0.197224,0.193132,0.020664,0.013776,0.030996,0.008266,0.000000,0.008266


In [44]:
from sklearn.linear_model import LogisticRegression, RidgeClassifier
#instantiate the model
logreg = LogisticRegression(fit_intercept=False, C=1e12, solver='liblinear')

In [51]:
#fit the model
logreg.fit(norm_data_train, y_train)

LogisticRegression(C=1000000000000.0, fit_intercept=False, solver='liblinear')

In [56]:
#predict
log_test_preds = logreg.predict(norm_data_test)

In [68]:
#print Metrics and confusion matrix
def print_metrics(labels, preds):
    print("Precision Score: {}".format(precision_score(labels, preds)))
    print("Recall Score: {}".format(recall_score(labels, preds)))
    print("Accuracy Score: {}".format(accuracy_score(labels, preds)))
    print("F1 Score: {}".format(f1_score(labels, preds)))
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, test_preds))
    
print_metrics(y_test, log_test_preds)

Precision Score: 0.45323741007194246
Recall Score: 0.03872157344806392
Accuracy Score: 0.7813333333333333
F1 Score: 0.07134767836919592
Confusion Matrix:
[[5797   76]
 [1564   63]]


In [None]:
# F-1 score super low, may have made mistake somew


## KNN

In [58]:
#instantiate standard scaler
scaler = StandardScaler()

In [59]:
#transform the trainin and test sets to scaled
scaled_data_train = scaler.fit_transform(X_train)
scaled_data_test = scaler.transform(X_test)

In [60]:
#Convert scaled data into DF
scaled_df_train = pd.DataFrame(scaled_data_train, y_train)
scaled_df_train

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,14,15,16,17,18,19,20,21,22,23
def_pay,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.710956,1.023649,-1.242733,1.554718,0.852757,-0.487699,-0.232843,-0.39970,-0.383663,-0.340216,...,0.545403,0.677926,0.779082,0.853431,-0.096853,-0.079817,-0.061177,-0.098131,-0.085901,-0.096464
0,1.029880,-1.137989,-1.242733,0.213056,0.852757,-1.245983,-0.232843,-0.39970,-0.383663,-0.340216,...,-0.168733,-0.441692,-0.383467,-0.417923,-0.254771,-0.156398,-0.066722,0.070718,-0.054272,-0.285969
0,-1.587837,-0.597580,0.804678,0.213056,0.852757,-0.054394,-0.232843,-0.39970,-0.383663,-0.340216,...,-0.674955,-0.671563,-0.663078,-0.656173,-0.175195,-0.241892,-0.288493,-0.304503,-0.307303,-0.295883
0,0.992970,1.023649,0.804678,0.213056,-1.064604,0.487237,-0.232843,-0.39970,-0.383663,-0.340216,...,-0.674955,-0.671563,-0.663078,-0.656173,-0.331937,-0.241892,-0.288493,-0.304503,-0.307303,-0.295883
1,1.550309,-0.751982,0.804678,0.213056,0.852757,0.053932,-0.232843,-0.39970,-0.383663,-0.340216,...,0.179897,-0.212978,-0.186248,-0.157751,-0.135642,-0.159072,-0.190193,-0.230647,-0.230445,-0.221984
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,1.705792,-0.906385,-1.242733,0.213056,0.852757,-0.379373,-0.232843,-0.39970,-0.383663,-0.340216,...,0.130396,0.203822,-0.179735,-0.552588,-0.214395,-0.120336,-0.200284,-0.300000,-0.235569,3.887428
1,-1.109969,0.251635,-1.242733,-1.128606,0.852757,0.162258,3.392738,2.09877,2.166912,2.296664,...,1.757988,1.896883,2.131196,2.256927,0.461471,0.001221,-0.288493,0.164524,0.072244,-0.067975
0,-1.632474,-0.906385,-1.242733,-1.128606,0.852757,-1.029331,-0.232843,-0.39970,-0.383663,-0.340216,...,-0.674955,-0.671563,-0.663078,-0.656173,-0.331937,-0.241892,-0.288493,-0.304503,-0.307303,-0.295883
1,0.090178,-0.751982,0.804678,0.213056,0.852757,-1.137657,-0.232843,-0.39970,-0.383663,-0.340216,...,0.064120,-0.122118,-0.189737,-0.181519,-0.155624,-0.160855,-0.039000,-0.229459,-0.307303,-0.227511


In [61]:
#import Classifier
from sklearn.neighbors import KNeighborsClassifier

In [62]:
#instantiate model
knn = KNeighborsClassifier()

In [64]:
#fit classifier
knn.fit(scaled_data_train, y_train)

KNeighborsClassifier()

In [66]:
#predict on the test set
knn_test_preds = knn.predict(scaled_data_test)

In [69]:
#print Metrics and Confusion Matrix
print_metrics(y_test, knn_test_preds)

Precision Score: 0.48662041625371655
Recall Score: 0.30178242163491087
Accuracy Score: 0.7794666666666666
F1 Score: 0.37253414264036416
Confusion Matrix:
[[5797   76]
 [1564   63]]


In [70]:
#model improvment function
def find_best_k(X_train, y_train, X_test, y_test, min_k=1, max_k=25):
    best_k = 0
    best_score = 0.0
    for k in range(min_k, max_k+1, 2):
        knn = KNeighborsClassifier(n_neighbors=k)
        knn.fit(X_train, y_train)
        preds = knn.predict(X_test)
        f1 = f1_score(y_test, preds)
        if f1 > best_score:
            best_k = k
            best_score = f1
    
    print("Best Value for k: {}".format(best_k))
    print("F1-Score: {}".format(best_score))

In [71]:
find_best_k(scaled_data_train, y_train, scaled_data_test, y_test)

Best Value for k: 5
F1-Score: 0.37253414264036416


In [72]:
# origional model was most accurate with k: 5

## Naive Bayes

In [73]:
#import model 
from sklearn.naive_bayes import GaussianNB

In [74]:
#instantiate model
gnb = GaussianNB()

In [75]:
#fit classifier
gnb.fit(X_train, y_train)

GaussianNB()

In [76]:
#predict on the test set
gnb_test_preds = gnb.predict(X_test)

In [77]:
#print Metrics and Confusion Matrix
print_metrics(y_test, gnb_test_preds)

Precision Score: 0.24356168995118666
Recall Score: 0.8893669330055316
Accuracy Score: 0.3768
F1 Score: 0.38239957716701894
Confusion Matrix:
[[5797   76]
 [1564   63]]
