In [1]:
import pandas as pd
import numpy as np

import tensorflow
from tensorflow.keras import optimizers, losses, activations, models
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, LearningRateScheduler, ReduceLROnPlateau
from tensorflow.keras.layers import Dense, Input, Dropout, Convolution1D, MaxPool1D, GlobalMaxPool1D, GlobalAveragePooling1D, \
    concatenate, Add, Activation
from tensorflow.keras.models import load_model
from tensorflow.keras.utils import plot_model
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import make_scorer, f1_score, accuracy_score, roc_auc_score, precision_recall_curve, auc, confusion_matrix
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.base import BaseEstimator

In [2]:
# Load the MIT-BIH train/test CSVs. header=None means the files have no header
# row, so columns are addressed by integer position below.
df_train = pd.read_csv("data/mitbih_train.csv", header=None)
# Shuffle the training rows with a fixed seed. Downstream fits (e.g. the cv=5
# grid search) see the rows in this order, so an unseeded shuffle makes the
# results differ between kernel restarts.
df_train = df_train.sample(frac=1, random_state=1337)
df_test = pd.read_csv("data/mitbih_test.csv", header=None)

# Column 187 is read as the class label; columns 0..186 are read as the input
# sequence, with a trailing axis of size 1 appended.
Y = np.array(df_train[187].values).astype(np.int8)
X = np.array(df_train[list(range(187))].values)[..., np.newaxis]

Y_test = np.array(df_test[187].values).astype(np.int8)
X_test = np.array(df_test[list(range(187))].values)[..., np.newaxis]

In [3]:
# Load every saved MIT-BIH base network from its .h5 file, in a fixed order
# (the order determines the column layout of any stacked predictions).
base_models = [
    load_model(stem + '.h5')
    for stem in (
        'baseline/baseline_cnn_mitbih',
        'paper/cnn_paper_mitbih',
        'rnn/rnn_mitbih',
        'deep_res/cnn_res_deep_mitbih',
    )
]

### One approach: Take the softmax outputs, sum them up for each model, take the largest class

In [61]:
def simple_ensemble_predict(models, X):
    """Predict class indices by summing the models' outputs.

    Each object in ``models`` is asked for ``predict(X)``; the resulting
    arrays are added element-wise and, for every row, the index of the
    largest summed value along axis 1 is returned.
    """
    summed_scores = np.sum([member.predict(X) for member in models], axis=0)
    return summed_scores.argmax(axis=1)

##### Will take a few minutes

In [26]:
# Score the summed-output ensemble on the MIT-BIH test split.
predictions = simple_ensemble_predict(base_models, X_test)

# Compute the metrics first, then report them in a fixed order.
f1 = f1_score(y_true=Y_test, y_pred=predictions, average="macro")
acc = accuracy_score(y_true=Y_test, y_pred=predictions)

print("Test f1 score : %s "% f1)
print("Test accuracy score : %s "% acc)
print(confusion_matrix(y_true=Y_test, y_pred=predictions))

Test f1 score : 0.9233999817799375 
Test accuracy score : 0.9879864790791156 
[[18086    19     9     2     2]
 [  107   442     6     0     1]
 [   37     5  1387    16     3]
 [   27     0    16   119     0]
 [   13     0     0     0  1595]]


### Second approach: Take the softmax outputs as inputs to a model

In [4]:
# Stacking features: one row per sample, with the outputs of all base models
# placed side by side (model 0's columns first, then model 1's, and so on).
newX = np.concatenate([model.predict(X) for model in base_models], axis=1)

# Same layout for the test split.
newX_test = np.concatenate([model.predict(X_test) for model in base_models], axis=1)

In [33]:
# Meta-learner: a default-configured logistic regression fitted on the stacked
# base-model outputs of the training split.
log = LogisticRegression()
log = log.fit(newX, Y)

In [34]:
# Score the logistic-regression stacker on the stacked test features.
predictions = log.predict(newX_test)

# Compute the metrics first, then report them in a fixed order.
f1 = f1_score(y_true=Y_test, y_pred=predictions, average="macro")
acc = accuracy_score(y_true=Y_test, y_pred=predictions)

print("Test f1 score : %s "% f1)
print("Test accuracy score : %s "% acc)
print(confusion_matrix(y_true=Y_test, y_pred=predictions))

Test f1 score : 0.9227097706614031 
Test accuracy score : 0.9876210487849443 
[[18055    33    21     4     5]
 [   89   458     7     1     1]
 [   30     6  1396    14     2]
 [   30     0    14   118     0]
 [   14     0     0     0  1594]]


### Hyperparameter tuning for svm

In [5]:
# Search grid for the SVM meta-learner: 3 C values x 1 kernel x 2 gamma
# settings x 2 class-weight settings = 12 candidates.
params = dict(
    C=[0.01, 0.1, 1],
    kernel=['rbf'],
    gamma=['auto', 'scale'],
    class_weight=[None, 'balanced'],
)

# 5-fold grid search scored by macro-averaged F1; refit=True retrains the best
# candidate so that `clf` can be used directly for prediction afterwards.
clf = GridSearchCV(
    estimator=SVC(random_state=42),
    param_grid=params,
    scoring=make_scorer(f1_score, average='macro'),
    cv=5,
    refit=True,
    n_jobs=-1,
    verbose=10,
)
clf.fit(newX, Y)

Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:   22.9s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:   40.6s
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:  2.7min
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:  3.1min
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  4.0min
[Parallel(n_jobs=-1)]: Done  45 tasks      | elapsed:  4.2min
[Parallel(n_jobs=-1)]: Done  52 out of  60 | elapsed:  4.8min remaining:   44.1s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:  5.1min finished


GridSearchCV(cv=5, error_score=nan,
             estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                           class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='rbf', max_iter=-1,
                           probability=False, random_state=42, shrinking=True,
                           tol=0.001, verbose=False),
             iid='deprecated', n_jobs=-1,
             param_grid={'C': [0.01, 0.1, 1],
                         'class_weight': [None, 'balanced'],
                         'gamma': ['auto', 'scale'], 'kernel': ['rbf']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=make_scorer(f1_score, average=macro), verbose=10)

In [6]:
# Score the refitted grid-search SVM on the stacked test features.
predictions = clf.predict(newX_test)

# Compute the metrics first, then report them in a fixed order.
f1 = f1_score(y_true=Y_test, y_pred=predictions, average="macro")
acc = accuracy_score(y_true=Y_test, y_pred=predictions)

print("Test f1 score : %s "% f1)
print("Test accuracy score : %s "% acc)
print(confusion_matrix(y_true=Y_test, y_pred=predictions))

Test f1 score : 0.9237115143111663 
Test accuracy score : 0.9873926548510872 
[[18050    37    20     4     7]
 [   87   459     8     1     1]
 [   32     6  1389    18     3]
 [   28     0    11   123     0]
 [   13     0     0     0  1595]]


### We take the same approach for the PTBDB dataset

In [2]:
# Read the two PTBDB CSVs (no header row) and stack them into one frame.
# NOTE: from here on X, Y, X_test, Y_test, df_train and df_test replace the
# MIT-BIH arrays of the same names defined earlier in the notebook.
df_1 = pd.read_csv("data/ptbdb_normal.csv", header=None)
df_2 = pd.read_csv("data/ptbdb_abnormal.csv", header=None)
df = pd.concat([df_1, df_2])

# Seeded 80/20 split, stratified on the label column (187).
df_train, df_test = train_test_split(
    df,
    test_size=0.2,
    random_state=1337,
    stratify=df[187],
)

# Labels come from column 187; inputs from columns 0..186 with a trailing
# axis of size 1 appended.
Y = df_train[187].values.astype(np.int8)
X = np.expand_dims(np.array(df_train[list(range(187))].values), axis=-1)

Y_test = df_test[187].values.astype(np.int8)
X_test = np.expand_dims(np.array(df_test[list(range(187))].values), axis=-1)

In [3]:
# Load every saved PTBDB base network from its .h5 file, in a fixed order.
# 'rnn/rnn_ptbdb' is deliberately not part of this list (it is commented out
# in the original cell).
base_models = [
    load_model(stem + '.h5')
    for stem in (
        'baseline/baseline_cnn_ptbdb',
        'paper/cnn_paper_ptbdb',
        'deep_res/cnn_res_deep_ptbdb',
    )
]

### One approach: Take the softmax outputs, sum them up for each model, take the largest class

In [77]:
def simple_ensemble_predict(models, X):
    """Average the models' outputs and threshold the mean at 0.5.

    Parameters
    ----------
    models : iterable
        Objects exposing ``predict(X)``; all outputs must share one shape.
    X : array-like
        Input forwarded unchanged to every model's ``predict``.

    Returns
    -------
    numpy.ndarray
        ``int8`` array shaped like a single model's output: 1 where the
        averaged output is strictly greater than 0.5, otherwise 0.

    Raises
    ------
    ValueError
        If ``models`` is empty.
    """
    models = list(models)
    if not models:
        raise ValueError("simple_ensemble_predict needs at least one model")
    predicted = np.array([model.predict(X) for model in models])
    # Divide by the number of models actually passed in, not by the length of
    # the notebook-level `base_models` list, so the average is correct for any
    # argument and the function does not depend on global state.
    predicted = predicted.sum(axis=0) / len(models)
    return (predicted > 0.5).astype(np.int8)

In [79]:
# Score the averaged-output ensemble on the PTBDB test split.
predictions = simple_ensemble_predict(base_models, X_test)

# NOTE(review): AUROC and AUPRC below are computed from the thresholded 0/1
# predictions, not from continuous scores, so each curve is built from a
# single operating point. Consider passing the averaged model outputs instead.
f1 = f1_score(y_true=Y_test, y_pred=predictions, average="macro")
acc = accuracy_score(y_true=Y_test, y_pred=predictions)
auc_roc = roc_auc_score(Y_test, predictions)
precision, recall, _ = precision_recall_curve(Y_test, predictions)
auc_prc = auc(recall, precision)

print("Test f1 score : %s "% f1)
print("Test accuracy score : %s "% acc)
print("AUROC score : %s "% auc_roc)
print("AUPRC score : %s "% auc_prc)
print(confusion_matrix(y_true=Y_test, y_pred=predictions))

Test f1 score : 0.9948527374854859 
Test accuracy score : 0.9958777052559258 
AUROC score : 0.9937239711664329 
AUPRC score : 0.9976669558958856 
[[ 800    9]
 [   3 2099]]


### Second approach: Take the softmax outputs as inputs to a model

In [6]:
# Stacking features for PTBDB: one row per sample, with the outputs of all
# base models placed side by side (model 0's columns first, then model 1's, ...).
newX_pt = np.concatenate([model.predict(X) for model in base_models], axis=1)

# Same layout for the test split.
newX_test_pt = np.concatenate([model.predict(X_test) for model in base_models], axis=1)

In [7]:
# Meta-learner: a default-configured logistic regression fitted on the stacked
# base-model outputs of the PTBDB training split.
log = LogisticRegression()
log = log.fit(newX_pt, Y)

In [8]:
# Score the logistic-regression stacker on the stacked PTBDB test features.
predictions = log.predict(newX_test_pt)

# NOTE(review): AUROC and AUPRC below are computed from hard 0/1 predictions,
# not from continuous scores, so each curve is built from a single operating
# point. Consider scoring with the model's probability output instead.
f1 = f1_score(y_true=Y_test, y_pred=predictions, average="macro")
acc = accuracy_score(y_true=Y_test, y_pred=predictions)
auc_roc = roc_auc_score(Y_test, predictions)
precision, recall, _ = precision_recall_curve(Y_test, predictions)
auc_prc = auc(recall, precision)

print("Test f1 score : %s "% f1)
print("Test accuracy score : %s "% acc)
print("AUROC score : %s "% auc_roc)
print("AUPRC score : %s "% auc_prc)
print(confusion_matrix(y_true=Y_test, y_pred=predictions))

Test f1 score : 0.9948566723677329 
Test accuracy score : 0.9958777052559258 
AUROC score : 0.994104149441523 
AUPRC score : 0.9978362391054255 
[[ 801    8]
 [   4 2098]]


### Hyperparameter tuning for svm

In [28]:
# Search grid for the SVM meta-learner: 3 C values x 2 kernels x 2 gamma
# settings x 2 class-weight settings = 24 candidates.
params = dict(
    C=[0.01, 0.1, 1],
    kernel=['rbf', 'poly'],
    gamma=['auto', 'scale'],
    class_weight=[None, 'balanced'],
)

# 5-fold grid search scored by macro-averaged F1, run sequentially (n_jobs=1).
# Training-fold scores are recorded as well, and refit=True retrains the best
# candidate so that `clf` can be used directly for prediction afterwards.
clf = GridSearchCV(
    estimator=SVC(random_state=42),
    param_grid=params,
    scoring=make_scorer(f1_score, average='macro'),
    cv=5,
    refit=True,
    n_jobs=1,
    verbose=1,
    return_train_score=True,
)
clf.fit(newX_pt, Y)

Fitting 5 folds for each of 24 candidates, totalling 120 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 120 out of 120 | elapsed:   16.7s finished


GridSearchCV(cv=5, error_score=nan,
             estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                           class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='rbf', max_iter=-1,
                           probability=False, random_state=42, shrinking=True,
                           tol=0.001, verbose=False),
             iid='deprecated', n_jobs=1,
             param_grid={'C': [0.01, 0.1, 1],
                         'class_weight': [None, 'balanced'],
                         'gamma': ['auto', 'scale'],
                         'kernel': ['rbf', 'poly']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
             scoring=make_scorer(f1_score, average=macro), verbose=1)

In [29]:
# Score the refitted grid-search SVM on the stacked PTBDB test features.
predictions = clf.predict(newX_test_pt)

# NOTE(review): AUROC and AUPRC below are computed from hard 0/1 predictions,
# not from continuous scores, so each curve is built from a single operating
# point. Consider scoring with the classifier's decision values instead.
f1 = f1_score(y_true=Y_test, y_pred=predictions, average="macro")
acc = accuracy_score(y_true=Y_test, y_pred=predictions)
auc_roc = roc_auc_score(Y_test, predictions)
precision, recall, _ = precision_recall_curve(Y_test, predictions)
auc_prc = auc(recall, precision)

print("Test f1 score : %s "% f1)
print("Test accuracy score : %s "% acc)
print("AUROC score : %s "% auc_roc)
print("AUPRC score : %s "% auc_prc)
print(confusion_matrix(y_true=Y_test, y_pred=predictions))

Test f1 score : 0.9927384934681365 
Test accuracy score : 0.9941600824458949 
AUROC score : 0.9936751625093061 
AUPRC score : 0.9978422142652383 
[[ 803    6]
 [  11 2091]]
