In [15]:
import warnings 
warnings.filterwarnings('ignore')

import time
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import model_selection

from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier

from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.neural_network import MLPClassifier

In [2]:
# load dataset

wd = '/Users/ewenwang/Documents/practice_data/'
file = 'credit_card.csv'
data = pd.read_csv(wd+file)
data.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [54]:
# prepare models
models = []
# linear
models.append(('LR', LogisticRegression()))
models.append(('SVC', LinearSVC(loss='hinge')))
models.append(('SDG', SGDClassifier()))

# 
models.append(('LDA', LinearDiscriminantAnalysis()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('NB', GaussianNB()))
# tree-based
models.append(('CART', DecisionTreeClassifier()))
models.append(('RF', RandomForestClassifier()))
models.append(('GBDT', GradientBoostingClassifier()))
# neutral network
models.append(('NN', MLPClassifier()))

In [55]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(data, test_size=0.2, random_state=0)

target = 'Class'
features = [x for x in data.columns if x not in [target]]

X = train[features]
y = train[target]

In [None]:
# prepare configuration for cross validation test harness
seed = 0

# evaluate each model in turn
results = []
names = []
cost = []
scoring = 'accuracy'
for name, model in models:
    start = time.time()
    kfold = model_selection.KFold(n_splits=10, random_state=seed)
    cv_results = model_selection.cross_val_score(model, X, y, cv=kfold, scoring=scoring)
    timecost = time.time()-start
    results.append(cv_results)
    names.append(name)
    cost.append(timecost)
    msg = "%s:\t%f (%f)\ttime: %f s" % (name, cv_results.mean(), cv_results.std(), timecost)
    print(msg)

LR:	0.998907 (0.000168)	time: 23.449516 s
SDG:	0.998069 (0.000360)	time: 3.162077 s
LDA:	0.999390 (0.000179)	time: 6.472701 s
KNN:	0.998398 (0.000192)	time: 17.784213 s
NB:	0.992609 (0.000470)	time: 1.385648 s


In [None]:
# boxplot algorithm comparison
fig = plt.figure(figsize=(8, 8))
ax = fig.add_subplot(111)
plt.boxplot(results)
ax.set_xticklabels(names)
plt.title('Algorithm Comparison')
plt.grid()
plt.show()

In [None]:
means = []
stds = []
for i in results:
    means.append(i.mean())
    stds.append(i.std())

In [None]:
AmongResults = pd.DataFrame(columns=['model', 'score_mean', 'score_std', 'time'])
AmongResults['model'] = names
AmongResults['score_mean'] = means
AmongResults['score_std'] = stds
AmongResults['time'] = cost
AmongResults

In [37]:
import seaborn as sns

In [None]:
plt.figure()
sns.boxplot(data=results)
plt.show()

In [None]:
x_values1=AmongResults['model']
y_values1=AmongResults['score_mean']
e_values1=AmongResults['score_std']

x_values2=AmongResults['model']
y_values2=AmongResults['time']

fig=plt.figure(figsize=(8,8))
fig.suptitle('Algorithm Comparison')

ax=fig.add_subplot(111, label="1")
ax2=fig.add_subplot(111, label="2", frame_on=False)

# ax.scatter(x_values1, y_values1, color="C0")
ax.errorbar(x_values1, y_values1, e_values1, color="C0", linestyle='None', marker='o')
ax.set_xlabel("model", color="C0")
ax.set_ylabel("score mean", color="C0")
ax.tick_params(axis="model", colors="C0")
ax.tick_params(axis="score mean", colors="C0")

ax2.scatter(x_values2, y_values2, color="C1")
ax2.xaxis.tick_top()
ax2.yaxis.tick_right()
ax2.set_xlabel('model', color="C1") 
ax2.set_ylabel('time', color="C1")   
ax2.xaxis.set_label_position('top') 
ax2.yaxis.set_label_position('right') 
ax2.tick_params(axis='model', colors="C1")
ax2.tick_params(axis='time', colors="C1")

plt.grid()
plt.show()