In [5]:
import time
import seaborn as sns
from sklearn import metrics
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score,f1_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import cross_val_score,cross_val_predict
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.preprocessing import  LabelEncoder
import pandas as pd
import numpy as np
data = pd.read_csv('/content/drive/My Drive/Colab Notebooks/Data/Adult-Data2018.csv', na_values='?')
data.rename(columns={'Class': 'class'}, inplace=True)

for column in data.columns:
    if data[column].dtype == type(object):
        le = LabelEncoder()
        data[column] = le.fit_transform(data[column].astype(str))

names = []
models = []

models.append(('LR', LogisticRegression()))
models.append(('LDA', LinearDiscriminantAnalysis()))
models.append(('CART', DecisionTreeClassifier()))
models.append(('NB', GaussianNB()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('SVM', SVC()))
models.append(('AB', AdaBoostClassifier()))
models.append(('GBM', GradientBoostingClassifier()))
models.append(('RF', RandomForestClassifier()))
models.append(('ET', ExtraTreesClassifier()))

X=data[['A1', 'A2', 'A3', 'A4', 'A5', 'A6','A7', 'A8', 'A9', 'A10']]
Y=data['class']

# Crossvalidation


In [6]:
seed = 7
kfold = KFold(n_splits=10, random_state=seed)
for name, m in models:
    start_time = time.time()
    pred = cross_val_predict(m, X,Y, cv=kfold )
    elapsed_time = time.time() - start_time   
    confusion = metrics.confusion_matrix(Y,pred)
    #[row, column]
    TP = confusion[1, 1]
    TN = confusion[0, 0]
    FP = confusion[0, 1]
    FN = confusion[1, 0]
    accuracy = (TP+TN)/(TP+TN+FP+FN)
    specificity = TN / (TN + FP)
    sensitivity = TP / float(FN + TP)
    error = (FP + FN) / float(TP + TN + FP + FN) # equal #error_rate = 1 - accuracy
    F1Score = f1_score(Y, pred, average='binary')
    AUC=metrics.roc_auc_score(Y, pred)
    print(name)
    print('%.4f' % elapsed_time)
    print('%.4f' % float(accuracy *100.0))
    print('%.4f' %float(specificity*100.0))
    print('%.4f' %float(sensitivity*100.0))
    print('%.4f' %float(AUC*100.0))
    print('%.4f' %float(F1Score*100.0))
    print('%.4f' %float(error*100.0))
    print(' =======================')      



LR
0.1568
100.0000
100.0000
100.0000
100.0000
100.0000
0.0000
LDA
0.0766
96.1538
96.5789
95.2514
95.9152
94.0690
3.8462
CART
0.0591
95.3488
96.1842
93.5754
94.8798
92.7978
4.6512
NB
0.0508
96.2433
97.8947
92.7374
95.3161
94.0510
3.7567
KNN
0.1148
96.1538
96.7105
94.9721
95.8413
94.0526
3.8462
SVM
0.1330
99.6422
99.8684
99.1620
99.5152
99.4398
0.3578
AB
0.9589
100.0000
100.0000
100.0000
100.0000
100.0000
0.0000
GBM
1.0443
98.4794
99.0789
97.2067
98.1428
97.6157
1.5206
RF
1.6019
97.1377
97.8947
95.5307
96.7127
95.5307
2.8623
ET
1.3088
95.9750
97.1053
93.5754
95.3403
93.7063
4.0250


# leave  


In [8]:
num_folds = 10
for name, m in models: 
    loocv = LeaveOneOut() 
    start_time=time.time()
    pred = cross_val_predict(m, X,Y, cv=loocv )
    elapsed_time = time.time() - start_time
    confusion = metrics.confusion_matrix(Y,pred)
    #[row, column]
    TP = confusion[1, 1]
    TN = confusion[0, 0]
    FP = confusion[0, 1]
    FN = confusion[1, 0]
    accuracy = (TP+TN)/(TP+TN+FP+FN)
    specificity = TN / (TN + FP)
    sensitivity = TP / float(FN + TP)
    F1Score = f1_score(Y, pred, average='binary')
    AUC=metrics.roc_auc_score(Y, pred)
    error = (FP + FN) / float(TP + TN + FP + FN)
    print(name)
    print('%.4f' % elapsed_time)
    print('%.4f' % float(accuracy *100.0))
    print('%.4f' %float(specificity*100.0))
    print('%.4f' %float(sensitivity*100.0))
    print('%.4f' %float(AUC*100.0))
    print('%.4f' %float(F1Score*100.0))
    print('%.4f' %float(error*100.0))
    print(' =======================') 

LR
14.2283
100.0000
100.0000
100.0000
100.0000
100.0000
0.0000
LDA
7.8433
95.9750
96.3158
95.2514
95.7836
93.8102
4.0250
CART
3.4084
97.0483
96.8421
97.4860
97.1641
95.4856
2.9517
NB
3.0109
96.6011
97.7632
94.1341
95.9486
94.6629
3.3989
KNN
4.8448
96.1538
96.5789
95.2514
95.9152
94.0690
3.8462
SVM
14.5764
99.8211
99.8684
99.7207
99.7945
99.7207
0.1789
AB
103.3836
100.0000
100.0000
100.0000
100.0000
100.0000
0.0000
GBM
123.4531
99.1055
99.4737
98.3240
98.8989
98.5994
0.8945
RF
181.4878
97.6744
97.8947
97.2067
97.5507
96.3989
2.3256
ET
145.5204
96.6905
97.2368
95.5307
96.3838
94.8682
3.3095


# train test split

In [9]:
test_size = 0.3
seed = 7
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=test_size, random_state=seed)
for name, m in models:
    model = m
    start_time = time.time()
    model.fit(X_train, Y_train)
    pred = model.predict(X_test)
    elapsed_time = time.time() - start_time
    confusion = metrics.confusion_matrix(Y_test,pred)
    #[row, column]
    TP = confusion[1, 1]
    TN = confusion[0, 0]
    FP = confusion[0, 1]
    FN = confusion[1, 0]
    accuracy = (TP+TN)/(TP+TN+FP+FN)
    specificity = TN / (TN + FP)
    sensitivity = TP / float(FN + TP)
    F1Score = f1_score(Y_test, pred, average='binary')
    AUC=metrics.roc_auc_score(Y_test, pred)
    class_error = (FP + FN) / float(TP + TN + FP + FN)
    print(name)
    print('%.4f' % elapsed_time)
    print('%.4f' % float(accuracy *100.0))
    print('%.4f' %float(specificity*100.0))
    print('%.4f' %float(sensitivity*100.0))
    print('%.4f' %float(AUC*100.0))
    print('%.4f' %float(F1Score*100.0))
    print('%.4f' %float(class_error*100.0))
    print(' ============================')

LR
0.0166
100.0000
100.0000
100.0000
100.0000
100.0000
0.0000
LDA
0.0061
95.8333
95.6522
96.2264
95.9393
93.5780
4.1667
CART
0.0032
94.6429
94.7826
94.3396
94.5611
91.7431
5.3571
NB
0.0048
97.0238
98.2609
94.3396
96.3002
95.2381
2.9762
KNN
0.0194
94.9405
96.5217
91.5094
94.0156
91.9431
5.0595
SVM
0.0119
99.4048
100.0000
98.1132
99.0566
99.0476
0.5952
AB
0.0924
100.0000
100.0000
100.0000
100.0000
100.0000
0.0000
GBM
0.0985
99.7024
99.5652
100.0000
99.7826
99.5305
0.2976
RF
0.1594
96.7262
97.3913
95.2830
96.3372
94.8357
3.2738
ET
0.1443
95.8333
96.5217
94.3396
95.4307
93.4579
4.1667
