# Setup

## Imports

In [None]:
import sys
sys.path.append("../")
from models import *

import pandas as pd
import numpy as np

from pathlib import Path
from typing import Tuple, Optional

import tensorflow as tf
from tensorflow.keras.optimizers import Adam
from keras.models import Model
from keras.callbacks import ModelCheckpoint, EarlyStopping, LearningRateScheduler, ReduceLROnPlateau
from keras.layers import Input, Dropout, Convolution1D, MaxPool1D, UpSampling1D, concatenate, GlobalMaxPool1D

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, average_precision_score
from sklearn.model_selection import train_test_split

## Setting up GPU

In [None]:
# NOTE: the Colab-only `%tensorflow_version 2.x` magic was dropped here. It has
# to run before TensorFlow is imported to have any effect (TensorFlow is
# already imported in the Imports cell), and it raises a UsageError on any
# non-Colab kernel, which breaks "Restart & Run All".

# Use the first GPU if TensorFlow reports one, otherwise fall back to the CPU.
# tf.test.gpu_device_name() returns an empty string when no GPU is available.
device_name = tf.test.gpu_device_name()
if device_name != "/device:GPU:0":
  device_name = "/cpu:0"
print('Found device at: {}'.format(device_name))

Found device at: /cpu:0


## Setting up Folder Structure

In [None]:
# Directory the input CSV files are read from (relative to the notebook)
data_dir = Path("../input/")
# Directory the trained model checkpoints (.h5 files) are written to
model_dir = Path(".")

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


# Data Loading

format_data is a utility function for reliably loading and lightly formatting the heartbeat signals. It can optionally zero-pad the signals so that they all have a given length.

In [None]:
def format_data(
    df : pd.DataFrame,
    padded_size : Optional[int] = None,
    signal_length : int = 187
) -> Tuple[np.ndarray, np.ndarray]:
    """
    Split a heartbeat dataframe into a signal array and a label array.

    :param df: Dataframe containing signal and labels. Columns
               0 .. signal_length - 1 hold the signal samples and column
               signal_length holds the label.
    :param padded_size: Integer indicating if signal should be zero-padded
                        (at the end) to a certain length. None disables padding.
    :param signal_length: Number of signal samples per row (187 by default)
    :return: Signal of shape (rows, length, 1), Labels of shape (rows,) as int8
    :raises ValueError: If padded_size is smaller than the signal length
    """

    # Load signal and labels from the dataframe
    Y = np.array(df[signal_length].values).astype(np.int8)
    X = np.array(df[list(range(signal_length))].values)[..., np.newaxis]

    # Add padding if padded_size is specified
    if padded_size is not None:
        pad_width = padded_size - X.shape[1]
        if pad_width < 0:
            # Fail with an explicit message instead of numpy's
            # "negative dimensions are not allowed"
            raise ValueError(
                f"padded_size ({padded_size}) must be at least the signal length ({X.shape[1]})"
            )
        X = np.concatenate([X, np.zeros((X.shape[0], pad_width, 1))], axis=1)

    return X, Y

mask is a function to softly mask a heartbeat signal.\
masked_loss is the loss function described in the report.\
We combine the soft mask and the true signal in one 4D tensor to avoid running into trouble with batching.

In [None]:
def mask(X, unpadded_size, mask_size , max_repeats, alpha):
    """
    Zero out random patches of each signal and build the matching soft mask.

    :param X: Signal to be masked, indexed as (sample, time, ...)
    :param unpadded_size: Length of signal before any padding
                          Only want to mask parts of the original signal
    :param mask_size: Length of patches to mask
    :param max_repeats: Maximum number of patches to mask for a single signal
                        (between 1 and max_repeats patches are drawn)
    :param alpha: Parameter to construct soft mask
    :return: Masked signal, and the original signal stacked with the soft mask
             along a new last axis. The soft mask is 1 on masked positions
             and 1 - alpha everywhere else.
    :raises ValueError: If mask_size is larger than unpadded_size or
                        max_repeats is smaller than 1 (raised by numpy)
    """

    # All part of signal is initially unmasked
    M = np.ones_like(X, dtype=np.float32)

    # Randomly mask patches of size mask_size
    for i in range(X.shape[0]):
        for _ in range(np.random.randint(low=0, high=max_repeats) + 1):
            # randint's upper bound is exclusive, so +1 is needed for the
            # last valid start (unpadded_size - mask_size) to be drawable;
            # without it the final sample of the signal is never masked.
            start = np.random.randint(low=0, high=unpadded_size - mask_size + 1)
            end = start + mask_size
            M[i, start : end] = 0

    return M * X, np.stack([X, 1 - alpha * M], axis=-1)

def masked_loss(stacked_input, pred):
    """
    Softly masked reconstruction loss.

    :param stacked_input: 4d Tensor.
                          stacked_input[..., 0] is the original unmasked signal
                          stacked_input[..., 1] is the soft mask
    :param pred: 3d tensor which is the predicted signal from the model
    :return: Squared reconstruction error weighted by the soft mask,
             averaged over the last axis
    """
    target = stacked_input[:, :, :, 0]
    soft_mask = stacked_input[:, : ,: , 1]
    squared_difference = soft_mask * tf.square(pred - target)
    return tf.reduce_mean(squared_difference, axis=-1)

We load the data using the previously defined utility functions

In [None]:

# Problem parameters
unpadded_size = 187
padded_size = 256

# Masking parameters
mask_size = 60
max_repeats = 2
alpha = 0.95

# One seed for every random step in this cell (split, shuffle, masking),
# so that re-running the notebook yields the same training data
random_seed = 1337


# Load data PTB
df_1 = pd.read_csv(data_dir.joinpath("ptbdb_normal.csv"), header=None)
df_2 = pd.read_csv(data_dir.joinpath("ptbdb_abnormal.csv"), header=None)
df   = pd.concat([df_1, df_2])

# Stratify on the label column (the column right after the signal)
df_train_ptb, df_test_ptb = train_test_split(
    df, test_size=0.2, 
    random_state=random_seed, stratify=df[unpadded_size]
)

# Load data MIT
df_train_mit = pd.read_csv(data_dir.joinpath("mitbih_train.csv"), header=None)
# Shuffle the rows; seeded so the shuffle is reproducible like the PTB split
df_train_mit = df_train_mit.sample(frac=1, random_state=random_seed)
df_test_mit = pd.read_csv(data_dir.joinpath("mitbih_test.csv"), header=None)

# Combine
df_train = pd.concat([df_train_ptb, df_train_mit])
df_test = pd.concat([df_test_ptb, df_test_mit])

# Format and mask data
X_test, Y_test     = format_data(df_test, padded_size)
X_train, Y_train   = format_data(df_train, padded_size)
# mask() draws from numpy's global random state, so seed it right before
np.random.seed(random_seed)
X_masked, Y_masked = mask(X_train, unpadded_size, mask_size , max_repeats, alpha)


# Training

Training procedure for the U-Net\
We don't do much hyperparameter tuning due to the immense training time for a single model.
We only make sure that $\alpha$ is not so high that the model collapses to predicting the mean signal strength. Otherwise, we use the parameters described by the authors of the U-Net paper. Moreover, we use a low patience for early stopping and for the reduction of the learning rate due to the large size of the model and the dataset.

In [None]:
# Callbacks for the U-Net run. A single epoch is slow, so both the
# learning-rate reduction and the early stop react after only a few
# epochs without improvement of the validation loss.
unet_file_path = model_dir.joinpath("unet.h5")

# Keep only the weights of the epoch with the lowest validation loss
checkpoint = ModelCheckpoint(
    unet_file_path,
    monitor="val_loss",
    verbose=1,
    save_best_only=True,
    mode="min",
)
# Stop after 5 epochs without improvement
early = EarlyStopping(
    monitor="val_loss",
    mode="min",
    patience=5,
    verbose=1,
)
# Lower the learning rate after 3 epochs without improvement
redonplat = ReduceLROnPlateau(
    monitor="val_loss",
    mode="min",
    patience=3,
    verbose=2,
)
callbacks_list = [checkpoint, early, redonplat]

# Train on the masked signals against the stacked (signal, soft mask) targets,
# then restore the weights written by the checkpoint callback
model = Unet(
    (padded_size,1),
    loss=masked_loss,
    epochs=100,
    verbose=2,
    callbacks=callbacks_list,
    validation_split=0.1,
)
model.fit(X_masked, Y_masked)
model.load_weights(unet_file_path)

# Train Models in Latent Space

## Train on PTBDB Dataset

In [None]:
# Load PTB Dataset
# (rebinds X_train / Y_train / X_test / Y_test, which previously held the
# combined PTB + MIT-BIH data used for the U-Net)
X_train, Y_train = format_data(df_train_ptb, padded_size)
X_test, Y_test   = format_data(df_test_ptb, padded_size)

# Parameters to test
# We adjust depth and width. We use power of two widths and depth of 1 or 2
parameters = {
    "depth": [1, 2],
    "width": [32, 64, 128],
}

with tf.device(device_name):

    # Set up callbacks for base model
    # NOTE(review): "val_acc" has to match the metric name LatentClassifier
    # compiles its model with (Keras reports "val_accuracy" when the metric is
    # given as "accuracy") — confirm against models.py
    file_path = model_dir.joinpath("latent_ptbdb.h5")
    checkpoint = ModelCheckpoint(file_path, monitor="val_acc", verbose=1, save_best_only=True, mode='max')
    early = EarlyStopping(monitor="val_acc", mode="max", patience=30, verbose=1)
    redonplat = ReduceLROnPlateau(monitor="val_acc", mode="max", patience=15, verbose=2)
    callbacks_list = [checkpoint, early, redonplat]

    # Set up base model
    # width and depth are left as None here; the grid search fills them in.
    # The trained U-Net is passed in as the encoder.
    base = LatentClassifier(classes = 1, width=None, depth=None, encoder=model, 
                            epochs=1000, verbose=0, callbacks=callbacks_list, validation_split=0.1)
    
    # Run grid search
    # 2-fold cross-validation over all depth/width combinations. No `scoring`
    # is passed, so the estimator's own score method ranks the candidates.
    search = GridSearchCV(base, parameters, verbose=3, cv=2)
    search.fit(X_train, Y_train)
    print(f"Finished CV for PTB Dataset: Top score {search.best_score_}\n"
          f"Best parameters: {search.cv_results_['params'][search.best_index_]}")
    
    # Run tests
    # Evaluate the best estimator found by the grid search on the held-out PTB test split
    pred_test = search.best_estimator_.predict(X_test)

    f1 = f1_score(Y_test, pred_test)
    print("Test f1 score : %s "% f1)

    acc = accuracy_score(Y_test, pred_test)
    print("Test accuracy score : %s "% acc)

    # NOTE(review): pred_test comes from predict(). If that returns hard class
    # labels (as f1/accuracy above require), AUROC and AUPRC are computed on
    # thresholded predictions instead of scores/probabilities — confirm
    auroc = roc_auc_score(Y_test, pred_test)
    print("Test AUROC : %s "% auroc)

    auprc = average_precision_score(Y_test, pred_test)
    print("Test AUPRC : %s "% auprc)


## Train on MIT-BIH Dataset

In [None]:
# Load MIT-BIH Dataset
# (rebinds X_train / Y_train / X_test / Y_test to the MIT-BIH splits)
X_train, Y_train = format_data(df_train_mit, padded_size)
X_test, Y_test   = format_data(df_test_mit, padded_size)

# Parameters to test
# We adjust depth and width. We use power of two widths and depth of 1 or 2
parameters = {
    "depth": [1, 2],
    "width": [32, 64, 128],
}

with tf.device(device_name):

    # Set up callbacks for base model
    # NOTE(review): "val_acc" has to match the metric name LatentClassifier
    # compiles its model with (Keras reports "val_accuracy" when the metric is
    # given as "accuracy") — confirm against models.py
    file_path = model_dir.joinpath("latent_mit.h5")
    checkpoint = ModelCheckpoint(file_path, monitor="val_acc", verbose=1, save_best_only=True, mode="max")
    early = EarlyStopping(monitor="val_acc", mode="max", patience=30, verbose=1)
    redonplat = ReduceLROnPlateau(monitor="val_acc", mode="max", patience=15, verbose=2)
    callbacks_list = [checkpoint, early, redonplat]

    # Set up base model
    # Five output classes for MIT-BIH; width and depth are left as None here
    # and filled in by the grid search. The trained U-Net is passed in as the
    # encoder.
    base = LatentClassifier(classes = 5, width=None, depth=None, encoder=model, 
                            epochs=1000, verbose=0, callbacks=callbacks_list, validation_split=0.1)
    
    # Run grid search
    # 2-fold cross-validation over all depth/width combinations. No `scoring`
    # is passed, so the estimator's own score method ranks the candidates.
    search = GridSearchCV(base, parameters, verbose=3, cv=2)
    search.fit(X_train, Y_train)
    print(f"Finished CV for MIT-BIH Dataset: Top score {search.best_score_}\n"
          f"Best parameters: {search.cv_results_['params'][search.best_index_]}")
    
    # Run tests
    # Evaluate the best estimator found by the grid search on the MIT-BIH test set
    pred_test = search.best_estimator_.predict(X_test)

    # Macro averaging: every class contributes equally to the F1 score
    f1 = f1_score(Y_test, pred_test, average="macro")
    print("Test f1 score : %s "% f1)

    acc = accuracy_score(Y_test, pred_test)
    print("Test accuracy score : %s "% acc)
