# MODELS 101

In [1]:
## standard and modeling imports

import pandas as pd
import numpy as np
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns
import copy
from Modules import *
sns.set()
%matplotlib inline
import imblearn
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, cross_val_score,  KFold
from sklearn.ensemble import RandomForestClassifier
#import xgboost as xgb
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.tree import DecisionTreeClassifier # Import Decision Tree Classifier
from sklearn import metrics #Import scikit-learn metrics module for accuracy calculation
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import MinMaxScaler

In [2]:
# read the csv & clean up some of the unusual values
df, y = read_data('../default_of_credit_card_clients.xls')
df = proc_cat_df(df)

df.head().T

Unnamed: 0,1,2,3,4,5
LIMIT_BAL,20000,120000,90000,50000,50000
AGE,24,26,34,37,57
PAY_1,2,-1,0,0,-1
PAY_2,2,2,0,0,0
PAY_3,-1,0,0,0,-1
PAY_4,-1,0,0,0,0
PAY_5,-2,0,0,0,0
PAY_6,-2,2,0,0,0
BILL_AMT1,3913,2682,29239,46990,8617
BILL_AMT2,3102,1725,14027,48233,5670


In [3]:
X = df.drop(columns = 'Y')
y = df['Y']

In [4]:
df.describe()

Unnamed: 0,LIMIT_BAL,AGE,PAY_1,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,BILL_AMT1,BILL_AMT2,...,PAY_AMT5,PAY_AMT6,Y,SEX_Female,SEX_Male,EDUCATION_Graduate School,EDUCATION_Other,EDUCATION_University,MARRIAGE_Married,MARRIAGE_Non-married
count,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,...,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0
mean,167484.322667,35.4855,-0.0167,-0.133767,-0.1662,-0.220667,-0.2662,-0.2911,51223.3309,49179.075167,...,4799.387633,5215.502567,0.2212,0.603733,0.396267,0.352833,0.1795,0.467667,0.4553,0.5447
std,129747.661567,9.217904,1.123802,1.197186,1.196868,1.169139,1.133187,1.149988,73635.860576,71173.768783,...,15278.305679,17777.465775,0.415062,0.489129,0.489129,0.477859,0.383777,0.498962,0.498006,0.498006
min,10000.0,21.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-165580.0,-69777.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,50000.0,28.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,3558.75,2984.75,...,252.5,117.75,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,140000.0,34.0,0.0,0.0,0.0,0.0,0.0,0.0,22381.5,21200.0,...,1500.0,1500.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
75%,240000.0,41.0,0.0,0.0,0.0,0.0,0.0,0.0,67091.0,64006.25,...,4031.5,4000.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0
max,1000000.0,79.0,8.0,8.0,8.0,8.0,8.0,8.0,964511.0,983931.0,...,426529.0,528666.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [5]:
##splitting the data
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state= 2019, test_size = .2)

In [6]:
#create  oversampled data to train on
oversampler = SMOTE(random_state = 2019)
X_train_oversampled, y_train_oversampled = oversampler.fit_resample(X_train, y_train)

In [7]:
#Put the oversampled data back into a dataframe
X_train_oversampled = pd.DataFrame(X_train_oversampled, columns = X_train.columns)
y_train_oversampled = pd.Series(y_train_oversampled)

In [8]:
#check the size of the data
print("X_train_oversampled", X_train_oversampled.shape)
print("X_test",X_test.shape)
print("y_train_oversampled",y_train_oversampled.shape)
print("y_test",y_test.shape)

X_train_oversampled (37308, 27)
X_test (6000, 27)
y_train_oversampled (37308,)
y_test (6000,)


# Logistic Regression 

In [None]:
clfLR = LogisticRegression(solver = 'lbfgs',
                           max_iter = 500,
                          random_state = 2019)
clfLR.fit(X_train_oversampled,y_train_oversampled)

predLR = clfLR.predict(X_test)

In [None]:
# Cross Validation
cross_val_score_LR = cross_val_score(clfLR, X_test, y_test, cv = 10)
print('cross_val_score: ',cross_val_score_LR.mean().round(2))

# Precision Score
print('precision score is ',precision_score(y_test, predLR).round(2))

# Recall Score
print('recall_score is ',recall_score(y_test, predLR).round(4))
# F1 Score
print('f1 score is ',f1_score(y_test, predLR).round(3))

# ROC_AUC
print('ROC AUC is ',roc_auc_score(y_test, predLR).round(2))

In [None]:
#Use 5-fold cross validation to see how well the classfier you built is doing on test data.

kfold = KFold(n_splits=5, random_state=2019)
results = cross_val_score(clfLR, X_test, y_test, cv=kfold, scoring = 'f1')

print(f"5-fold cross-validation results: {np.mean(results)}")

# SVC

In [None]:
clfSVC = SVC(kernel = 'rbf',
             gamma = 'scale',
                random_state = 2019)

clfSVC.fit(X_train_oversampled,y_train_oversampled)

predSVC = clfSVC.predict(X_test)

In [None]:
# Cross Validation
cross_val_score_SVC = cross_val_score(clfSVC, X_test, y_test, cv = 10)
print('cross_val_score: ',cross_val_score_SVC.mean().round(2))

# Precision Score
print('precision score is ',precision_score(y_test, predSVC).round(2))

# Recall Score
print('recall_score is ',recall_score(y_test, predSVC).round(4))
# F1 Score
print('f1 score is ',f1_score(y_test, predSVC).round(3))

# ROC_AUC
print('ROC AUC is ',roc_auc_score(y_test, predSVC).round(2))

In [None]:
#Use 5-fold cross validation to see how well the classfier you built is doing on test data.

kfold = KFold(n_splits=5, random_state=2019)
results = cross_val_score(p, X_test, y_test, cv=kfold, scoring = 'f1')

print(f"5-fold cross-validation results: {np.mean(results)}")

# Kneighbours


In [None]:
clfKNN = KNeighborsClassifier(n_neighbors = 3)
k = clfKNN.fit(X_train_oversampled,y_train_oversampled)

predKNN = clfKNN.predict(X_test)

In [None]:
# Cross Validation
cross_val_score_KNN = cross_val_score(clfKNN, X_test, y_test, cv = 10)
print('cross_val_score: ',cross_val_score_KNN.mean().round(2))

# Precision Score
print('precision score is ',precision_score(y_test, predKNN).round(2))

# Recall Score
print('recall_score is ',recall_score(y_test, predKNN).round(4))
# F1 Score
print('f1 score is ',f1_score(y_test, predKNN).round(3))

# ROC_AUC
print('ROC AUC is ',roc_auc_score(y_test, predKNN).round(2))

In [None]:
#Use 5-fold cross validation to see how well the classfier you built is doing on test data.

kfold = KFold(n_splits=5, random_state=2019)
results = cross_val_score(k, X_test, y_test, cv=kfold, scoring = 'f1')

print(f"5-fold cross-validation results: {np.mean(results)}")

# Random Forest Classifier

In [None]:
clfRF = RandomForestClassifier(criterion = 'gini',
                              n_estimators = 100,
                              verbose = False,
                              random_state = 0)

r = clfRF.fit(X_train_oversampled,y_train_oversampled)

predRF = clfRF.predict(X_test)

In [None]:
# Cross Validation
cross_val_score_RF = cross_val_score(clfRF, X_test, y_test, cv = 10)
print('cross_val_score: ',cross_val_score_RF.mean().round(2))

# Precision Score
print('precision score is ',precision_score(y_test, predRF).round(2))

# Recall Score
print('recall_score is ',recall_score(y_test, predRF).round(4))
# F1 Score
print('f1 score is ',f1_score(y_test, predRF).round(3))

# ROC_AUC
print('ROC AUC is ',roc_auc_score(y_test, predRF).round(2))

In [None]:
#Use 5-fold cross validation to see how well the classfier you built is doing on test data.

kfold = KFold(n_splits=5, random_state=2019)
results = cross_val_score(r, X_test, y_test, cv=kfold, scoring = 'f1')

print(f"5-fold cross-validation results: {np.mean(results)}")

# light gbm 

In [None]:
clfLGB = LGBMClassifier(n_estimators = 100,
                           learning_rate = .2,
                           random_state = 0)

g = clfLGB.fit(X_train_oversampled,y_train_oversampled)

predLGB = g.predict(X_test)

In [None]:
# Cross Validation
cross_val_score_LGB = cross_val_score(clfLGB, X_test, y_test, cv = 10)
print('cross_val_score: ',cross_val_score_LGB.mean().round(2))

# Precision Score
print('precision score is ',precision_score(y_test, predLGB).round(2))

# Recall Score
print('recall_score is ',recall_score(y_test, predLGB).round(4))
# F1 Score
print('f1 score is ',f1_score(y_test, predLGB).round(3))

# ROC_AUC
print('ROC AUC is ',roc_auc_score(y_test, predLGB).round(2))

In [None]:
#Use 5-fold cross validation to see how well the classfier you built is doing on test data.

kfold = KFold(n_splits=5, random_state=2019)
results = cross_val_score(g, X_test, y_test, cv=kfold, scoring = 'f1')

print(f"5-fold cross-validation results: {np.mean(results)}")

# Cat Boost Classifier

https://catboost.ai/docs/concepts/python-reference_catboostclassifier.html

In [None]:
clfCB = CatBoostClassifier(iterations = 100,
                           learning_rate = .2,
                           depth = 5,
                           eval_metric = 'AUC',
                           random_seed = 0)

clfCB.fit(X_train_oversampled,y_train_oversampled)

predCB = clfCB.predict(X_test)

In [None]:
# Cross Validation
cross_val_score_LGB = cross_val_score(clfCB, X_test, y_test, cv = 10)
print('cross_val_score: ',cross_val_score_LGB.mean().round(2))

# Precision Score
print('precision score is ',precision_score(y_test, predCB).round(2))

# Recall Score
print('recall_score is ',recall_score(y_test, predCB).round(4))
# F1 Score
print('f1 score is ',f1_score(y_test, predCB).round(3))

# ROC_AUC
print('ROC AUC is ',roc_auc_score(y_test, predCB).round(2))

In [None]:
#Use 5-fold cross validation to see how well the classfier you built is doing on test data.

kfold = KFold(n_splits=5, random_state=2019)
results = cross_val_score(clfCB, X_test, y_test, cv=kfold, scoring = 'f1')

print(f"5-fold cross-validation results: {np.mean(results)}")

# Decision Tree

In [None]:
# Create Decision Tree classifer object
clfTR = DecisionTreeClassifier()

# Train Decision Tree Classifer
clfTR = clfTR.fit(X_train_oversampled,y_train_oversampled)

#Predict the response for test dataset
y_pred = clfTR.predict(X_test)

In [None]:
# Cross Validation
cross_val_score_LGB = cross_val_score(clfTR, X_test, y_test, cv = 10)
print('cross_val_score: ',cross_val_score_LGB.mean().round(2))

# Precision Score
print('precision score is ',precision_score(y_test, y_pred).round(2))

# Recall Score
print('recall_score is ',recall_score(y_test, y_pred).round(4))
# F1 Score
print('f1 score is ',f1_score(y_test, y_pred).round(3))

# ROC_AUC
print('ROC AUC is ',roc_auc_score(y_test, y_pred).round(2))

In [None]:
print(metrics.classification_report(y_test, y_pred))

In [None]:
#Use 5-fold cross validation to see how well the classfier you built is doing on test data.

kfold = KFold(n_splits=5, random_state=2019)
results = cross_val_score(clfTR, X_test, y_test, cv=kfold, scoring = 'f1')

print(f"5-fold cross-validation results: {np.mean(results)}")

In [None]:
from sklearn.tree import export_graphviz
from sklearn.externals.six import StringIO  
from IPython.display import Image  
import pydotplus

dot_data = StringIO()
export_graphviz(clfTR, out_file=dot_data,  
                filled=True, rounded=True,
                special_characters=True,feature_names = feature_cols,class_names=['0','1'])
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())  
#graph.write_png('diabetes.png')
#Image(graph.create_png())


# SVM - taking to long

In [None]:
#Import svm model
from sklearn import svm

#Create a svm Classifier
clfSVM = svm.SVC(kernel='linear',cache_size = 7000) # Linear Kernel


In [None]:

#Train the model using the training sets
#clfSVM.fit(X_train_oversampled,y_train_oversampled)


In [None]:

#Predict the response for test dataset
y_predSVM = clfSVM.predict(X_test)

In [None]:
# Cross Validation
cross_val_score_SVM = cross_val_score(clfSVM, X_test, y_test, cv = 10)
print('cross_val_score: ',cross_val_score_SVM.mean().round(2))

# Precision Score
print('precision score is ',precision_score(y_test, y_predSVM).round(2))

# Recall Score
print('recall_score is ',recall_score(y_test, y_predSVM).round(4))
# F1 Score
print('f1 score is ',f1_score(y_test, y_predSVM).round(3))

# ROC_AUC
print('ROC AUC is ',roc_auc_score(y_test, y_predSVM).round(2))

# GaussianNB

In [None]:
#Import Gaussian Naive Bayes model
from sklearn.naive_bayes import GaussianNB

#Create a Gaussian Classifier
gnb = GaussianNB()

#Train the model using the training sets
clfGNB = gnb.fit(X_train_oversampled,y_train_oversampled)

#Predict the response for test dataset
y_predgnb = gnb.predict(X_test)

In [None]:
# Cross Validation
cross_val_score_GNB = cross_val_score(clfGNB, X_test, y_test, cv = 10)
print('cross_val_score: ',cross_val_score_GNB.mean().round(2))

# Precision Score
print('precision score is ',precision_score(y_test, y_predgnb).round(2))

# Recall Score
print('recall_score is ',recall_score(y_test, y_predgnb).round(4))
# F1 Score
print('f1 score is ',f1_score(y_test, y_predgnb).round(3))

# ROC_AUC
print('ROC AUC is ',roc_auc_score(y_test, y_predgnb).round(2))

In [None]:
#Use 5-fold cross validation to see how well the classfier you built is doing on test data.

kfold = KFold(n_splits=5, random_state=2019)
results = cross_val_score(clfGNB, X_test, y_test, cv=kfold, scoring = 'f1')

print(f"5-fold cross-validation results: {np.mean(results)}")

In [None]:
### ENSEMBLE

In [None]:
# Bagged Decision Trees for Classification - necessary dependencies

from sklearn import model_selection
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

In [None]:
# AdaBoost Classification

from sklearn.ensemble import AdaBoostClassifier
seed = 2019
num_trees = 70
kfold = model_selection.KFold(n_splits=10, random_state=seed)
model = AdaBoostClassifier(n_estimators=num_trees, random_state=seed)
results = model_selection.cross_val_score(model, X, y, cv=kfold)
print(results.mean())

In [None]:
# Voting Ensemble for Classification

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier



In [None]:
kfold = model_selection.KFold(n_splits=10, random_state=seed)
# create the sub models
estimators = []
model1 = LogisticRegression()
estimators.append(('logistic', model1))


In [None]:
model2 = DecisionTreeClassifier()
estimators.append(('cart', model2))


In [None]:
model3 = SVC()
estimators.append(('svm', model3))


In [None]:
# create the ensemble model
ensemble = VotingClassifier(estimators)
results = model_selection.cross_val_score(ensemble, X, y, cv=kfold)
print(results.mean())

In [None]:
# Confusion Matrix
cmLR = confusion_matrix(y_test, predLR)
cmSVC = confusion_matrix(y_test, predSVC)
cmKNN = confusion_matrix(y_test, predKNN)
cmRF = confusion_matrix(y_test, predRF)
cmTR = confusion_matrix(y_test, y_pred)
cmLGB = confusion_matrix(y_test, predLGB)
cmCB = confusion_matrix(y_test, predCB)


# Confusion Matrix List
cmList = [cmLR, cmSVC,cmKNN, cmRF,cmTR,  cmLGB, cmCB]
cmTitle = ['Logistic Regression','Support Vector Machines classifier','K Nearest Neighbors','Random Forest',"Classification Tree", 'LightGB','CatGBM', None]
i = 0
plt.figure()
fig, ax = plt.subplots(2,4, num = 6, figsize = (30,10))
for cm in cmList:
    i += 1
    plt.subplot(2,4,i)
    plt.title(cmTitle[i-1])
    sns.heatmap(cm, annot = True, cmap = 'YlGnBu')
plt.show()
plt.savefig('matrix.png');

In [9]:
X.columns

Index(['LIMIT_BAL', 'AGE', 'PAY_1', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5',
       'PAY_6', 'BILL_AMT1', 'BILL_AMT2', 'BILL_AMT3', 'BILL_AMT4',
       'BILL_AMT5', 'BILL_AMT6', 'PAY_AMT1', 'PAY_AMT2', 'PAY_AMT3',
       'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6', 'SEX_Female', 'SEX_Male',
       'EDUCATION_Graduate School', 'EDUCATION_Other', 'EDUCATION_University',
       'MARRIAGE_Married', 'MARRIAGE_Non-married'],
      dtype='object')

In [10]:
continuous = X.loc[:,['LIMIT_BAL','BILL_AMT1', 'BILL_AMT2', 'BILL_AMT3', 'BILL_AMT4',
       'BILL_AMT5', 'BILL_AMT6', 'PAY_AMT1', 'PAY_AMT2', 'PAY_AMT3',
       'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6']]


In [11]:
categorical = X.loc[:,['AGE', 'PAY_1', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5',
       'PAY_6','SEX_Female', 'SEX_Male',
       'EDUCATION_Graduate School', 'EDUCATION_Other', 'EDUCATION_University',
       'MARRIAGE_Married', 'MARRIAGE_Non-married']]

In [21]:
continuous.head()

Unnamed: 0,LIMIT_BAL,BILL_AMT1,BILL_AMT2,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6
0,0.010101,0.149982,0.069164,0.086723,0.160138,0.080648,0.260979,0.0,0.000409,0.0,0.0,0.0,0.0
1,0.111111,0.148892,0.067858,0.087817,0.16322,0.084074,0.263485,0.0,0.000594,0.001116,0.00161,0.0,0.003783
2,0.080808,0.172392,0.079532,0.093789,0.173637,0.09547,0.272928,0.001738,0.000891,0.001116,0.00161,0.002345,0.009458
3,0.040404,0.1881,0.111995,0.113407,0.186809,0.109363,0.283685,0.00229,0.001199,0.001339,0.001771,0.002506,0.001892
4,0.040404,0.154144,0.071601,0.10602,0.179863,0.099633,0.275681,0.00229,0.021779,0.01116,0.014493,0.001615,0.001284


In [22]:
categorical.head()

Unnamed: 0,AGE,PAY_1,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,SEX_Female,SEX_Male,EDUCATION_Graduate School,EDUCATION_Other,EDUCATION_University,MARRIAGE_Married,MARRIAGE_Non-married
1,24,2,2,-1,-1,-2,-2,1,0,0,0,1,1,0
2,26,-1,2,0,0,0,2,1,0,0,0,1,0,1
3,34,0,0,0,0,0,0,1,0,0,0,1,0,1
4,37,0,0,0,0,0,0,1,0,0,0,1,1,0
5,57,-1,0,-1,0,0,0,0,1,0,0,1,1,0


In [13]:
m = continuous

In [14]:
scaler = MinMaxScaler(feature_range=(0, 1), copy=True)

In [15]:
continuous = scaler.fit_transform(continuous)

  return self.partial_fit(X, y)


In [16]:
continuous = pd.DataFrame(continuous, columns = m.columns)

In [17]:
continuous.head()

Unnamed: 0,LIMIT_BAL,BILL_AMT1,BILL_AMT2,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6
0,0.010101,0.149982,0.069164,0.086723,0.160138,0.080648,0.260979,0.0,0.000409,0.0,0.0,0.0,0.0
1,0.111111,0.148892,0.067858,0.087817,0.16322,0.084074,0.263485,0.0,0.000594,0.001116,0.00161,0.0,0.003783
2,0.080808,0.172392,0.079532,0.093789,0.173637,0.09547,0.272928,0.001738,0.000891,0.001116,0.00161,0.002345,0.009458
3,0.040404,0.1881,0.111995,0.113407,0.186809,0.109363,0.283685,0.00229,0.001199,0.001339,0.001771,0.002506,0.001892
4,0.040404,0.154144,0.071601,0.10602,0.179863,0.099633,0.275681,0.00229,0.021779,0.01116,0.014493,0.001615,0.001284


In [18]:
categorical.head()

Unnamed: 0,AGE,PAY_1,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,SEX_Female,SEX_Male,EDUCATION_Graduate School,EDUCATION_Other,EDUCATION_University,MARRIAGE_Married,MARRIAGE_Non-married
1,24,2,2,-1,-1,-2,-2,1,0,0,0,1,1,0
2,26,-1,2,0,0,0,2,1,0,0,0,1,0,1
3,34,0,0,0,0,0,0,1,0,0,0,1,0,1
4,37,0,0,0,0,0,0,1,0,0,0,1,1,0
5,57,-1,0,-1,0,0,0,0,1,0,0,1,1,0


In [19]:
X = pd.concat([continuous, categorical],axis=1)

  return self._int64index.union(other)


In [20]:
X.head().T

Unnamed: 0,0,1,2,3,4
LIMIT_BAL,0.010101,0.111111,0.080808,0.040404,0.040404
BILL_AMT1,0.149982,0.148892,0.172392,0.1881,0.154144
BILL_AMT2,0.069164,0.067858,0.079532,0.111995,0.071601
BILL_AMT3,0.086723,0.087817,0.093789,0.113407,0.10602
BILL_AMT4,0.160138,0.16322,0.173637,0.186809,0.179863
BILL_AMT5,0.080648,0.084074,0.09547,0.109363,0.099633
BILL_AMT6,0.260979,0.263485,0.272928,0.283685,0.275681
PAY_AMT1,0.0,0.0,0.001738,0.00229,0.00229
PAY_AMT2,0.000409,0.000594,0.000891,0.001199,0.021779
PAY_AMT3,0.0,0.001116,0.001116,0.001339,0.01116
