In [52]:
import pandas as pd
import numpy as np

import tensorflow
from sklearn.model_selection import train_test_split
import sklearn.decomposition as decomposition
from sklearn.manifold import TSNE
from sklearn.svm import SVC
from sklearn.metrics import f1_score, precision_recall_curve, auc, accuracy_score, roc_auc_score, confusion_matrix

from xgboost import XGBClassifier

from tensorflow.keras import Model
from tensorflow.keras.models import load_model

### Load MIT dataset

In [8]:
df_train = pd.read_csv("data/mitbih_train.csv", header=None)
df_train = df_train.sample(frac=1)
df_test = pd.read_csv("data/mitbih_test.csv", header=None)

Y = np.array(df_train[187].values).astype(np.int8)
X = np.array(df_train[list(range(187))].values)[..., np.newaxis]

Y_test = np.array(df_test[187].values).astype(np.int8)
X_test = np.array(df_test[list(range(187))].values)[..., np.newaxis]

### Get embeddings from baseline model

In [13]:
model = load_model("baseline_cnn_mitbih.h5")
embedding_model = Model(inputs=model.inputs, outputs=model.layers[15].output) # get embeddings from last conv layer

embeddings = embedding_model.predict(X)
embeddings_test = embedding_model.predict(X_test)
embeddings.shape

(87554, 256)

### SVM (takes a while)

In [14]:
svm = SVC()
svm.fit(embeddings, Y)

0.9852457518728303

In [41]:
pred_test = svm.predict(embeddings_test)

f1 = f1_score(Y_test, pred_test, average="macro")

print("Test f1 score : %s "% f1)

acc = accuracy_score(Y_test, pred_test)

print("Test accuracy score : %s "% acc)

print(confusion_matrix(Y_test, pred_test))

Test f1 score : 0.9118967999599399 
Test accuracy score : 0.9852457518728303 
[[18082    17    13     4     2]
 [  126   420     9     0     1]
 [   57     3  1368    15     5]
 [   28     1    18   115     0]
 [   22     0     2     0  1584]]


### Try applying pca to embeddings (much faster)

In [71]:
embeddings_normalized = embeddings - embeddings.mean(axis=0)
embeddings_test_normalized = embeddings_test - embeddings_test.mean(axis=0)
pca = decomposition.PCA(n_components=64)
components = pca.fit_transform(embeddings_normalized)
components_test = pca.transform(embeddings_test_normalized)
svm_pca = SVC()
svm_pca.fit(components, Y)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [72]:
pred_test = svm_pca.predict(components_test)

f1 = f1_score(Y_test, pred_test, average="macro")

print("Test f1 score : %s "% f1)

acc = accuracy_score(Y_test, pred_test)

print("Test accuracy score : %s "% acc)

print(confusion_matrix(Y_test, pred_test))

Test f1 score : 0.9097184546721195 
Test accuracy score : 0.9848803215786589 
[[18078    22    11     5     2]
 [  121   421    13     0     1]
 [   62     3  1362    16     5]
 [   28     1    18   115     0]
 [   21     0     2     0  1585]]


### Forest

In [67]:
clf = XGBClassifier(objective='multi:softmax', n_jobs=-1)
clf.fit(embeddings, Y)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=-1,
              nthread=None, objective='multi:softprob', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [70]:
pred_test = clf.predict(embeddings_test)

f1 = f1_score(Y_test, pred_test, average="macro")

print("Test f1 score : %s "% f1)

acc = accuracy_score(Y_test, pred_test)

print("Test accuracy score : %s "% acc)

print(confusion_matrix(Y_test, pred_test))

Test f1 score : 0.8997709339167841 
Test accuracy score : 0.9817741640782021 
[[18070    19    25     1     3]
 [  147   401     6     0     2]
 [   87     6  1337    14     4]
 [   34     0    19   109     0]
 [   29     0     3     0  1576]]


### Try with PCA

In [65]:
clf_pca = XGBClassifier(objective='multi:softmax', n_jobs=-1)
clf_pca.fit(components, Y)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=-1,
              nthread=None, objective='multi:softprob', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [66]:
pred_test = svm_pca.predict(components_test)

f1 = f1_score(Y_test, pred_test, average="macro")

print("Test f1 score : %s "% f1)

acc = accuracy_score(Y_test, pred_test)

print("Test accuracy score : %s "% acc)

print(confusion_matrix(
    Y_test, pred_test))

Test f1 score : 0.8949858228853669 
Test accuracy score : 0.9810889822766308 
[[18055    29    25     7     2]
 [  148   395    12     0     1]
 [   80     8  1340    15     5]
 [   31     0    19   112     0]
 [   27     0     5     0  1576]]
