In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier

In [3]:
# Loaded in the data
df = pd.read_csv('../csv_files/p1_comp2_final_use.csv', index_col=0)
df.head()

Unnamed: 0,X1,X2,X3,X4,X5_Age_Bins,X6_Payment_Bins,X7_Payment_Bins,X8_Payment_Bins,X9_Payment_Bins,X10_Payment_Bins,...,X17_abs,X17_sign,X17_monthlydif,X18_binary,X19_binary,X20_binary,X21_binary,X22_binary,X23_binary,Y
1,0.46216,2,2,1,1,2,2,1,1,2,...,0.299742,0,1,0,1,0,0,0,0,1
2,0.394132,2,2,2,1,1,2,2,2,2,...,0.504505,1,1,0,1,1,1,0,1,1
3,0.278693,2,2,2,2,2,2,2,2,2,...,0.624888,1,1,1,1,1,1,1,1,0
4,0.278693,2,2,1,2,2,2,2,2,2,...,0.540601,1,1,1,1,1,1,1,1,0
5,0.278693,1,2,1,3,1,2,1,2,2,...,0.548885,1,1,1,1,1,1,1,1,0


In [4]:
# Splitting up our data into variable and target data
X = df.iloc[:, :-1] # Variable
Y = df.Y # Target

In [5]:
# import the RandomOverSampler package from imblearn 
from imblearn.over_sampling import RandomOverSampler

# define the model
ros = RandomOverSampler(random_state=2019)

# fit the training data only to the RandomOverSampler model
# this will help address the imbalanced nature of the target variable 
X_resample, Y_resample = ros.fit_resample(X, Y)

In [6]:
# save features and targets from the 'iris'
features, targets = X_resample, Y_resample

models = []
models.append(('LogisticRegression', LogisticRegression(solver='liblinear')))
models.append(('KNeighborsClassifier', KNeighborsClassifier()))
#models.append(('SVC', SVC(kernel='rbf',gamma='auto')))
models.append(('DecisionTreeClassifier', DecisionTreeClassifier()))
models.append(('AdaBoostClassifier', AdaBoostClassifier()))

# KFold with 'stratify' option
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=2019)
for name, model in models:
    score = cross_val_score(model, features, targets, cv=cv)
    print("Model:{0}, Score: mean={1:0.5f}, var={2:0.5f}".format(name, score.mean(), score.var()))

Model:LogisticRegression, Score: mean=0.65051, var=0.00005
Model:KNeighborsClassifier, Score: mean=0.72225, var=0.00009
Model:DecisionTreeClassifier, Score: mean=0.87663, var=0.00009
Model:AdaBoostClassifier, Score: mean=0.64848, var=0.00002


In [9]:
# Voting Ensemble for Classification
from sklearn.ensemble import VotingClassifier
from sklearn import model_selection

kfold = model_selection.KFold(n_splits=10, random_state=2019)
# create the sub models
estimators = []

model1 = LogisticRegression(solver='liblinear')
estimators.append(('Logistic Regression', model1))

model2 = DecisionTreeClassifier()
estimators.append(('Decision Tree', model2))

#model3 = SVC()
#estimators.append(('SVM', model3))

model4 = KNeighborsClassifier()
estimators.append(('KNeighborsClassifier', model4))

model5 = AdaBoostClassifier()
estimators.append(('AdaBoostClassifier', model5))

# create the ensemble model
ensemble = VotingClassifier(estimators)
results = model_selection.cross_val_score(ensemble, X_resample, Y_resample, cv=kfold)
print(results.mean())

0.654332048873007
