## Simple Classifiers
Train classifiers based on annotations.

In [1]:
from platform import python_version
print("VERSION: ", python_version()) # expect 3.7.0

VERSION:  3.7.0


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import VotingClassifier
from imblearn.over_sampling import SMOTE

from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import roc_auc_score

import keras
from keras.models import Sequential
from keras.layers import Dense, Conv2D, MaxPooling2D, Dropout, Flatten, BatchNormalization

Using TensorFlow backend.


In [3]:
train_path = '../data/simple_classier_inputs/train.csv'
test_path = '../data/simple_classier_inputs/test.csv'

In [4]:
train = pd.read_csv(train_path)
test = pd.read_csv(test_path)

In [5]:
y_train = train['label']
y_test = test['label']

X_train = train.drop(['label', 'fileID'], axis=1)
X_test = test.drop(['label', 'fileID'], axis=1)

In [6]:
sum(y_train)

167

In [7]:
sm = SMOTE(random_state=12, ratio = 1.0)
X_train, y_train = sm.fit_sample(X_train, y_train)

In [8]:
sum(y_train)

1277

In [9]:
models = []

models.append(("KNeighbors",
               KNeighborsClassifier(weights='distance')))

models.append(("LogisticRegression",
               LogisticRegression(solver='liblinear',
                                  max_iter=1000000,
                                  class_weight={1:.21},
                                  penalty='l1')))

dtBase = DecisionTreeClassifier(max_depth=10, 
                               max_features=1000,
                               class_weight={1:.2})

models.append(("DecisionTree",dtBase))

rdfBase = RandomForestClassifier(n_estimators=1000,
                                class_weight={1:.2})

models.append(("RandomForest",rdfBase))


rf2 = RandomForestClassifier(n_estimators=1000, 
                             criterion='gini',
                             max_depth=10,
                             class_weight={1:.2})

models.append(("RandomForest-2",rf2))

models.append(("BaggingClassifier",
               BaggingClassifier(DecisionTreeClassifier(max_depth=3),
                                n_estimators=100,
                                max_features=1000)))

models.append(("BaggingClassifier-2",
               BaggingClassifier(dtBase,
                                n_estimators=100,
                                max_features=1000)))

models.append(("AdaBoostClassifier",
               AdaBoostClassifier(DecisionTreeClassifier(max_depth=1),
                                  algorithm="SAMME",
                                  n_estimators=1000)))
               
models.append(("AdaBoostClassifier-2",
               AdaBoostClassifier(dtBase,
                                  algorithm="SAMME",
                                  n_estimators=100)))

models.append(("GradientBoostingClassifier",
               GradientBoostingClassifier(n_estimators=100, 
                                          max_leaf_nodes=4, 
                                          max_depth = 10)))

In [10]:
result_rows = []
count = 1
results_frame = pd.DataFrame()

predictionsList = []

for name, model in models:
    model.fit(X_train, y_train)
    prediction_vec = model.predict(X_test)
    
    predictionsList.append((name, prediction_vec))
    
    report = classification_report(y_test, prediction_vec)
    print(' #### ', name, '\n', report, '\n')
    
    ac = accuracy_score(y_test, prediction_vec)
    pr = precision_score(y_test, prediction_vec)
    roc = roc_auc_score(y_test, prediction_vec)

    results_frame.loc[count, 'Model'] = name
    results_frame.loc[count, 'Accuracy'] = ac
    results_frame.loc[count, 'Precision'] = pr
    results_frame.loc[count, 'AUCROC'] = roc

    count = count + 1

 ####  KNeighbors 
               precision    recall  f1-score   support

           0       0.93      0.55      0.69       548
           1       0.16      0.66      0.26        71

   micro avg       0.56      0.56      0.56       619
   macro avg       0.54      0.61      0.47       619
weighted avg       0.84      0.56      0.64       619
 

 ####  LogisticRegression 
               precision    recall  f1-score   support

           0       0.91      0.97      0.94       548
           1       0.58      0.27      0.37        71

   micro avg       0.89      0.89      0.89       619
   macro avg       0.74      0.62      0.65       619
weighted avg       0.87      0.89      0.88       619
 

 ####  DecisionTree 
               precision    recall  f1-score   support

           0       0.91      0.93      0.92       548
           1       0.37      0.32      0.35        71

   micro avg       0.86      0.86      0.86       619
   macro avg       0.64      0.63      0.63       619


In [23]:
# add majority class classifier for comparison
majority_class = np.zeros(len(y_test))

ac = accuracy_score(y_test, majority_class)
pr = precision_score(y_test, majority_class)
roc = roc_auc_score(y_test, majority_class)

results_frame.loc[count, 'Model'] = 'MajorityClassClassifier-Naive'
results_frame.loc[count, 'Accuracy'] = ac
results_frame.loc[count, 'Precision'] = pr
results_frame.loc[count, 'AUCROC'] = roc

  'precision', 'predicted', average, warn_for)


In [24]:
results_frame.head(len(results_frame))

Unnamed: 0,Model,Accuracy,Precision,AUCROC
1,KNeighbors,0.562197,0.159864,0.605621
2,LogisticRegression,0.893376,0.575758,0.621029
3,DecisionTree,0.859451,0.370968,0.626388
4,RandomForest,0.880452,0.434783,0.558561
5,RandomForest-2,0.885299,0.5,0.50613
6,BaggingClassifier,0.798061,0.240385,0.603976
7,BaggingClassifier-2,0.890145,0.8,0.527257
8,AdaBoostClassifier,0.85622,0.333333,0.593914
9,AdaBoostClassifier-2,0.857835,0.319149,0.576437
10,GradientBoostingClassifier,0.85622,0.35,0.612303


## Neural Networks

In [13]:
# print(X_train.shape)
# print(X_test.shape)

In [14]:
# y_train = keras.utils.to_categorical(y_train, num_classes=2)
# y_test = keras.utils.to_categorical(y_test, num_classes=2)

In [15]:
# # Create model
# model = Sequential()
# model.add(Dense(units=1000, activation='relu', input_dim=5491))
# model.add(Dense(units=500, activation='relu'))
# model.add(Dense(units=70, activation='relu'))
# model.add(Dense(units=2, activation='softmax'))

# model.compile(loss='categorical_crossentropy',
#               optimizer=keras.optimizers.Adagrad(lr=0.01),
#               metrics=['accuracy'])

# # Train
# mod = model.fit(X_train, y_train, validation_data=(X_test,y_test), 
#           epochs=20, batch_size=10)

In [16]:
# %matplotlib inline
# plt.plot(mod.history['acc'])
# plt.plot(mod.history['val_acc'])
# plt.title('model accuracy')
# plt.ylabel('accuracy')
# plt.xlabel('epoch')
# plt.legend(['train', 'test'], loc='upper left')
# plt.show()

In [17]:
# plt.plot(mod.history['loss'])
# plt.plot(mod.history['val_loss'])
# plt.title('model loss')
# plt.ylabel('loss')
# plt.xlabel('epoch')
# plt.legend(['train', 'test'], loc='upper left')
# plt.show()