# Setup

## Imports

In [None]:
import sys
sys.path.append("../")
from models import *

import pandas as pd
import numpy as np

from pathlib import Path
from typing import Tuple, Optional

import tensorflow as tf
from tensorflow.keras.optimizers import Adam
from keras.models import Model
from keras.callbacks import ModelCheckpoint, EarlyStopping, LearningRateScheduler, ReduceLROnPlateau
from keras.layers import Input, Dropout, Convolution1D, MaxPool1D, UpSampling1D, concatenate, GlobalMaxPool1D

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, average_precision_score
from sklearn.model_selection import train_test_split
from sklearn.utils import resample

## Setting up GPU

In [None]:
%tensorflow_version 2.x
device_name = tf.test.gpu_device_name()
if device_name != "/device:GPU:0":
  device_name = "/cpu:0"
print('Found device at: {}'.format(device_name))

## Setting up Folder Structure

In [None]:
# data_dir: folder the dataset CSV files are read from.
# model_dir: the current working directory (presumably where model files go — confirm).
data_dir, model_dir = Path("../input/"), Path(".")

# Data Loading

`format_data` is a utility function for reliably loading and lightly formatting the heartbeat data signals. It can add padding to ensure that signals have a certain length.

In [5]:
"""
  :param df: Dataframe containing signal and labels
  :param padded_size: Integer indicating if signal should be padded
                      to certain length
  :return: Signal, Labels
"""
def format_data(
    df : pd.DataFrame,
    padded_size : Optional[int] = None,
    signal_length : int = 187
) -> Tuple[np.ndarray, np.ndarray]:
    """
    Split a heartbeat dataframe into a signal array and a label vector.

    :param df: Dataframe whose columns 0..signal_length-1 hold the signal
               samples and whose column ``signal_length`` holds the label
    :param padded_size: If given, every signal is zero-padded at its end
                        (along the sample axis) up to this length
    :param signal_length: Number of signal samples per row; also the name
                          of the label column (defaults to 187)
    :return: Signal of shape (rows, length, 1), Labels of shape (rows,)
             cast to int8
    :raises ValueError: If padded_size is smaller than the signal length
    """

    # Load signal and labels from the dataframe
    Y = np.array(df[signal_length].values).astype(np.int8)
    # Trailing axis of size 1 is appended so each sample has one channel
    X = np.array(df[list(range(signal_length))].values)[..., np.newaxis]

    # Add padding if padded_size is specified
    if padded_size is not None:
        n_missing = padded_size - X.shape[1]
        if n_missing < 0:
            # Fail with a readable message instead of numpy's
            # "negative dimensions are not allowed"
            raise ValueError(
                f"padded_size ({padded_size}) must be at least the "
                f"signal length ({X.shape[1]})"
            )
        X = np.concatenate([X, np.zeros((X.shape[0], n_missing, 1))], axis=1)

    return X, Y

# PTB Dataset

## Load Data

We load the data using the previously defined utility functions

In [8]:
# Problem parameters
unpadded_size = 187  # name of the label column (used for stratification below)
padded_size = 256

# Read the two PTB files (neither has a header row) and stack them
df_1, df_2 = (
    pd.read_csv(data_dir / csv_name, header=None)
    for csv_name in ("ptbdb_normal.csv", "ptbdb_abnormal.csv")
)
df = pd.concat([df_1, df_2])

# Stratified 80/20 split on the label column with a fixed seed
df_train, df_test = train_test_split(
    df,
    stratify=df[unpadded_size],
    test_size=0.2,
    random_state=1337,
)

# Turn both frames into (signal, label) arrays
X_test, Y_test   = format_data(df_test)
X_train, Y_train = format_data(df_train)


## Training

We first perform a grid search and then we extract the top-performing model and use this to get our results

In [None]:
# Hyper-parameter grid explored by the search
parameters = dict(
    classes=[1],
    n_estimators=[50, 100],
    n_filters=[32, 64, 128],
    n_dense=[16],
    kernel_size=[5, 8],
)

# Everything below runs on the device selected during setup
with tf.device(device_name):

    # Template estimator handed to the search; "classes" and
    # "n_estimators" are both overridden by the grid above
    base = BoostingCNN(classes=1, n_estimators=1)

    # 3-fold cross-validated grid search over the training data
    search = GridSearchCV(estimator=base, param_grid=parameters, cv=3, verbose=3)
    search.fit(X_train, Y_train)
    print(f"Finished CV for PTB Dataset: Top score {search.best_score_}\n"
              f"Best parameters: {search.cv_results_['params'][search.best_index_]}")

    # Evaluate the best estimator found by the search on the held-out test set
    pred_test = search.best_estimator_.predict(X_test)

    f1 = f1_score(Y_test, pred_test)
    print("Test f1 score : %s "% f1)

    acc = accuracy_score(Y_test, pred_test)
    print("Test accuracy score : %s "% acc)

    # NOTE(review): AUROC and AUPRC are computed from predict() output; if
    # predict() returns hard labels rather than scores these are computed
    # on thresholded predictions — confirm against BoostingCNN.
    auroc = roc_auc_score(Y_test, pred_test)
    print("Test AUROC : %s "% auroc)

    auprc = average_precision_score(Y_test, pred_test)
    print("Test AUPRC : %s "% auprc)


# MIT-BIH Dataset

## Load Data

We load the data using the previously defined utility functions\
In order to combat both class imbalance and exploding runtimes, we downsample the majority class and upsample the minority classes for the training set, as described in the report.

In [6]:
# Problem parameters
unpadded_size = 187  # name of the label column
padded_size = 256

# Seed for every sampling step below, so that a fresh kernel run rebuilds
# the same training set (same value as the PTB train/test split)
sampling_seed = 1337

# Load data MIT
df_train = pd.read_csv(data_dir.joinpath("mitbih_train.csv"), header=None)
# Shuffle the training rows
df_train = df_train.sample(frac=1, random_state=sampling_seed)
df_test = pd.read_csv(data_dir.joinpath("mitbih_test.csv"), header=None)

# Target number of rows for the majority class (label 0) and for each
# of the minority classes (labels 1-4)
majority_size = 8000
minority_size = 2000

# Downsample majority class (without replacement)
df_majority = resample(df_train.loc[df_train[unpadded_size] == 0],
                       replace=False,
                       n_samples=majority_size,
                       random_state=sampling_seed)

# Upsample minority classes (with replacement, so rows may repeat)
df_minority = [resample(df_train.loc[df_train[unpadded_size] == i],
                        replace=True,
                        n_samples=minority_size,
                        random_state=sampling_seed)
               for i in range(1, 5)]

# Combine minority classes with downsampled majority class
df_up_down_sampled = pd.concat([df_majority] + df_minority)

# Format data; the test set is left untouched, only training is resampled
X_test, Y_test   = format_data(df_test)
X_train, Y_train = format_data(df_up_down_sampled)


## Training

We first perform a grid search and then we extract the top-performing model and use this to get our results

In [None]:
# Parameters to test
parameters = {
    "classes": [5],
    "n_estimators":[50, 100],
    "n_filters": [32, 64, 128], 
    "n_dense": [16], 
    "kernel_size": [5, 8]
}

# Run CV and predict on the device selected during setup
with tf.device(device_name):

    # Initializing base learner; "classes" and "n_estimators" are
    # overridden by the grid above during the search
    base = BoostingCNN(classes = 5, n_estimators = 1)

    # Running grid search (3-fold cross-validation)
    search = GridSearchCV(base, parameters, verbose=3, cv=3)
    search.fit(X_train, Y_train)
    # This message used to name the PTB dataset (copy-paste from the PTB
    # cell); this cell runs on MIT-BIH
    print(f"Finished CV for MIT-BIH Dataset: Top score {search.best_score_}\n"
          f"Best parameters: {search.cv_results_['params'][search.best_index_]}")
    
    # Run tests with the best estimator found by the search
    pred_test = search.best_estimator_.predict(X_test)

    # Macro averaging weights every class equally
    f1 = f1_score(Y_test, pred_test, average="macro")
    print("Test f1 score : %s "% f1)

    acc = accuracy_score(Y_test, pred_test)
    print("Test accuracy score : %s "% acc)
