In [1]:
import pandas as pd
import numpy as np

import tensorflow
from sklearn.model_selection import train_test_split
import sklearn.decomposition as decomposition
from sklearn.manifold import TSNE
from sklearn.svm import SVC
from sklearn.metrics import f1_score, precision_recall_curve, auc, accuracy_score, roc_auc_score, confusion_matrix

from xgboost import XGBClassifier

from tensorflow.keras import Model
from tensorflow.keras.models import load_model

### Load PTBDB dataset

In [4]:
df_1 = pd.read_csv("../data/ptbdb_normal.csv", header=None)
df_2 = pd.read_csv("../data/ptbdb_abnormal.csv", header=None)
df = pd.concat([df_1, df_2])

df_train, df_test = train_test_split(df, test_size=0.2, random_state=1337, stratify=df[187])


Y = np.array(df_train[187].values).astype(np.int8)
X = np.array(df_train[list(range(187))].values)[..., np.newaxis]

Y_test = np.array(df_test[187].values).astype(np.int8)
X_test = np.array(df_test[list(range(187))].values)[..., np.newaxis]

### Get embeddings from baseline model

In [5]:
model = load_model("../baseline/baseline_cnn_ptbdb.h5")
embedding_model = Model(inputs=model.inputs, outputs=model.layers[15].output) # get embeddings from last conv layer

embeddings = embedding_model.predict(X)
embeddings_test = embedding_model.predict(X_test)
embeddings.shape

(11641, 256)

### SVM

In [6]:
svm = SVC()
svm.fit(embeddings, Y)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [7]:
pred_test = svm.predict(embeddings_test)

f1 = f1_score(Y_test, pred_test)

print("Test f1 score : %s "% f1)

acc = accuracy_score(Y_test, pred_test)

print("Test accuracy score : %s "% acc)

auc_roc = roc_auc_score(Y_test, pred_test)

print("AUROC score : %s "% acc)

precision, recall, _ = precision_recall_curve(Y_test, pred_test)

auc_prc = auc(recall, precision)
print("AUPRC score : %s "% auc_prc)

print(confusion_matrix(Y_test, pred_test))

Test f1 score : 0.9950225171841669 
Test accuracy score : 0.9927859841978701 
AUROC score : 0.9927859841978701 
AUPRC score : 0.9955503817455404 
[[ 791   18]
 [   3 2099]]


### Try applying pca to embeddings

In [8]:
embeddings_normalized = embeddings - embeddings.mean(axis=0)
embeddings_test_normalized = embeddings_test - embeddings_test.mean(axis=0)
pca = decomposition.PCA(n_components=64)
components = pca.fit_transform(embeddings_normalized)
components_test = pca.transform(embeddings_test_normalized)
svm_pca = SVC()
svm_pca.fit(components, Y)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [9]:
pred_test = svm_pca.predict(components_test)

f1 = f1_score(Y_test, pred_test)

print("Test f1 score : %s "% f1)

acc = accuracy_score(Y_test, pred_test)

print("Test accuracy score : %s "% acc)

auc_roc = roc_auc_score(Y_test, pred_test)

print("AUROC score : %s "% acc)

precision, recall, _ = precision_recall_curve(Y_test, pred_test)

auc_prc = auc(recall, precision)
print("AUPRC score : %s "% auc_prc)

print(confusion_matrix(Y_test, pred_test))

Test f1 score : 0.9945432977461446 
Test accuracy score : 0.9920989350738578 
AUROC score : 0.9920989350738578 
AUPRC score : 0.9955806449903406 
[[ 792   17]
 [   6 2096]]


### Forest

In [10]:
clf = XGBClassifier(n_jobs=-1)
clf.fit(embeddings, Y)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=-1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [11]:
pred_test = clf.predict(embeddings_test)

f1 = f1_score(Y_test, pred_test)

print("Test f1 score : %s "% f1)

acc = accuracy_score(Y_test, pred_test)

print("Test accuracy score : %s "% acc)

auc_roc = roc_auc_score(Y_test, pred_test)

print("AUROC score : %s "% acc)

precision, recall, _ = precision_recall_curve(Y_test, pred_test)

auc_prc = auc(recall, precision)
print("AUPRC score : %s "% auc_prc)

print(confusion_matrix(Y_test, pred_test))

Test f1 score : 0.993361782835467 
Test accuracy score : 0.9903813122638269 
AUROC score : 0.9903813122638269 
AUPRC score : 0.9945750622750296 
[[ 788   21]
 [   7 2095]]


### Try with PCA

In [12]:
clf_pca = XGBClassifier(n_jobs=-1)
clf_pca.fit(components, Y)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=-1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [13]:
pred_test = clf_pca.predict(components_test)

f1 = f1_score(Y_test, pred_test)

print("Test f1 score : %s "% f1)

acc = accuracy_score(Y_test, pred_test)

print("Test accuracy score : %s "% acc)

auc_roc = roc_auc_score(Y_test, pred_test)

print("AUROC score : %s "% acc)

precision, recall, _ = precision_recall_curve(Y_test, pred_test)

auc_prc = auc(recall, precision)
print("AUPRC score : %s "% auc_prc)

print(confusion_matrix(Y_test, pred_test))

Test f1 score : 0.9886148007590133 
Test accuracy score : 0.9835108210237032 
AUROC score : 0.9835108210237032 
AUPRC score : 0.9917145310682545 
[[ 779   30]
 [  18 2084]]
