In [1]:
import os

import librosa
import logging
import numpy as np
import pandas as pd
import scipy.io as sio
import scipy.signal
import re

import pickle

from scipy.io import wavfile
import tensorflow as tf
import matplotlib.pyplot as plt

# Input files

In [2]:
# Load the pre-computed train / validation feature tables.
# NOTE(review): pd.read_pickle can execute arbitrary code while unpickling;
# only load these files from a trusted source.
train_df = pd.read_pickle('../train_physionet_2016.pkl')
val_df = pd.read_pickle('../validation_physionet_2016.pkl')
#test_df = pd.read_pickle('../test_physionet_2016.pkl')

# Column order of every row in the arrays built below:
# index 0 = patient id, 1-4 = feature channels, 5 = labels.
FEATURE_COLUMNS = ['Patient ID', 'Homomorphic', 'CWT_Morl', 'CWT_Mexh', 'Hilbert_Env', 'Labels']


def dataframe_to_rows(df):
    """Return one list per DataFrame row holding the FEATURE_COLUMNS values in order.

    A single helper replaces the per-split copy-pasted loops, so every split is
    guaranteed to be read from its own DataFrame with the same column order.
    """
    return [[row[column] for column in FEATURE_COLUMNS] for _, row in df.iterrows()]


train_data = dataframe_to_rows(train_df)
val_data = dataframe_to_rows(val_df)
#test_data = dataframe_to_rows(test_df)

# dtype=object keeps each cell as-is, since a row mixes an id with per-recording sequences
train = np.array(train_data, dtype=object)
val = np.array(val_data, dtype=object)
#test = np.array(test_data, dtype=object)

## Filtering recordings shorter than the patch size

In [3]:
def filter_smaller_than_patch(features, patch_size):
    """Return the positions of the entries that are long enough to be patched.

    An entry at position ``j`` is kept when ``len(features[j]) >= patch_size``.
    The result is an integer numpy array of the kept positions (not the entries
    themselves), suitable for indexing the dataset arrays.
    """
    kept_positions = []
    for position in range(len(features)):
        if len(features[position]) < patch_size:
            # Too short to hold a single patch: leave it out
            continue
        kept_positions.append(position)
    return np.array(kept_positions, dtype=int)

# Patch geometry used throughout the notebook
patch_size = 64  # samples per patch
nch = 4          # feature channels fed to the network
stride = 32      # offset between the starts of consecutive patches

# Column 2 (CWT_Morl) is used to measure each recording's length; keep only the
# recordings that can hold at least one full patch.
train_indices = filter_smaller_than_patch(features=train[:, 2], patch_size=patch_size)
val_indices = filter_smaller_than_patch(features=val[:, 2], patch_size=patch_size)
#test_indices = filter_smaller_than_patch(features=test[:, 2], patch_size=patch_size)

# Integer-array indexing on the first axis selects whole rows
train = train[train_indices]
val = val[val_indices]
#test = test[test_indices]

## Compute Patches

In [4]:
class PCGDataPreparer:
    """Callable generator that cuts recordings into fixed-length (features, labels) patches.

    Each recording is expected as a sequence of per-channel 1-D signals of equal
    length; ``__call__`` stacks them to ``(num_samples, channels)`` and slides a
    window of ``patch_size`` samples with hop ``stride`` over it. When the windows
    do not end exactly at the last sample, one extra patch aligned to the end of
    the recording is emitted so the tail is always covered.
    """

    def __init__(self, patch_size: int , stride: int, number_channels: int=4, num_states: int=4):
        self.patch_size = patch_size
        self.stride = stride
        self.number_channels = number_channels
        self.num_states = num_states
        self.features = None
        self.labels = None
        # Total number of patches over all recordings; filled by set_features_and_labels
        self.num_steps = 0

    def _count_patches(self, num_samples: int) -> int:
        """Return how many patches _compute_pcg_patches yields for a recording of this length."""
        if num_samples < self.patch_size:
            return 0
        last_start = num_samples - self.patch_size
        num_windows = last_start // self.stride + 1
        if last_start % self.stride > 0:
            # Extra end-aligned patch covering the tail
            num_windows += 1
        return num_windows

    def _compute_pcg_patches(self, sound, label):
        """Yield ``(sound_patch, label_patch)`` pairs of ``patch_size`` rows each.

        ``sound`` and ``label`` are 2-D and indexed by sample on the first axis.
        Recordings shorter than ``patch_size`` yield nothing (previously they
        produced under-sized patches because ``int()`` truncates toward zero).
        """
        num_samples = len(sound)
        if num_samples < self.patch_size:
            return
        last_start = num_samples - self.patch_size
        for patch_start in range(0, last_start + 1, self.stride):
            patch_end = patch_start + self.patch_size
            yield sound[patch_start:patch_end, :], label[patch_start:patch_end, :]

        if last_start % self.stride > 0:
            # The regular windows stopped short of the end: add one patch flush with the tail
            yield sound[last_start:, :], label[last_start:, :]

    def set_features_and_labels(self, features, labels):
        """Store the dataset and pre-compute ``num_steps``, the total patch count.

        ``features[i]`` is the sequence of channel signals of recording ``i`` and
        ``labels[i]`` its per-sample label matrix.
        """
        self.features = features
        self.labels = labels
        # The length of a recording is the length of one of its channels, not the
        # number of recordings (the earlier code used len(features) here).
        self.num_steps = sum(self._count_patches(len(obs[0])) for obs in features)

    def __call__(self):
        """Iterate over all recordings and yield every patch in order.

        Relies on ``tf`` (tensorflow) being imported at notebook level.
        """
        num_observations = len(self.labels)
        for obs_idx in range(num_observations):
            # Channels are stacked along axis 1 -> (num_samples, channels)
            features = tf.stack(self.features[obs_idx], axis=1) # np.column_stack
            labels = self.labels[obs_idx]
            for s, y in self._compute_pcg_patches(features, labels):
                yield s, y

### Instantiate the Data Preparers

In [5]:
# patch_size, nch and stride are defined once in the filtering cell above;
# re-declaring them here could let the patching parameters drift from the ones
# used to filter the recordings.
preparer_kwargs = dict(patch_size=patch_size,
                       number_channels=nch,
                       stride=stride,
                       num_states=4)

# Columns 1-4 hold the four feature channels, column 5 the labels
train_dp = PCGDataPreparer(**preparer_kwargs)
train_dp.set_features_and_labels(train[:, [1, 2, 3, 4]], train[:, 5])

val_dp = PCGDataPreparer(**preparer_kwargs)
val_dp.set_features_and_labels(val[:, [1, 2, 3, 4]], val[:, 5])

# Disabled code is kept as real comments: a trailing string literal is the
# cell's last expression and gets echoed as cell output.
# test_dp = PCGDataPreparer(**preparer_kwargs)
# test_dp.set_features_and_labels(test[:, [1, 2, 3, 4]], test[:, 5])

' test_dp = PCGDataPreparer(patch_size=patch_size,\n                     number_channels=nch,\n                     stride=stride,\n                     num_states=4)\ntest_dp.set_features_and_labels(test[:, [1,2,3,4]], test[:, 5]) '

### Tensorflow Dataset and caching

In [6]:
BATCH_SIZE = 32

def get_data_from_generator(*, data_processor, batch_size, patch_size, number_channels, number_classes, trainable=True):
    """Wrap a patch generator in a batched, prefetched ``tf.data.Dataset``.

    Args:
        data_processor: zero-argument callable returning an iterator of
            ``(features, labels)`` pairs shaped ``(patch_size, number_channels)``
            and ``(patch_size, number_classes)``.
        batch_size: number of patches per batch.
        patch_size: samples per patch.
        number_channels: feature channels per sample.
        number_classes: label columns per sample.
        trainable: when True the patches are cached and reshuffled on every pass;
            when False they are emitted in generator order.

    Returns:
        The batched ``tf.data.Dataset``.
    """
    data = tf.data.Dataset.from_generator(data_processor,
                                          output_signature=(
                                              tf.TensorSpec(shape=(patch_size, number_channels), dtype=tf.float32),
                                              tf.TensorSpec(shape=(patch_size, number_classes), dtype=tf.float32))
                                          )
    if trainable:
        # Dataset transformations return a new dataset, so the result of cache()
        # must be assigned (it was previously discarded and nothing was cached).
        # Caching happens before the shuffle so the Python generator runs only
        # during the first pass while the order is still re-drawn on every pass.
        data = data.cache()
        data = data.shuffle(5000, reshuffle_each_iteration=True)
    data = data.batch(batch_size)
    data = data.prefetch(tf.data.AUTOTUNE)
    return data

# Training split: cached and reshuffled on every pass
train_dataset = get_data_from_generator(data_processor=train_dp,
                                        batch_size=BATCH_SIZE,
                                        patch_size=patch_size,
                                        number_channels=nch,
                                        number_classes=4,
                                        trainable=True)

# Validation split: kept in generator order
val_dataset = get_data_from_generator(data_processor=val_dp,
                                      batch_size=BATCH_SIZE,
                                      patch_size=patch_size,
                                      number_channels=nch,
                                      number_classes=4,
                                      trainable=False)

# Disabled code is kept as real comments: a trailing string literal is the
# cell's last expression and gets echoed as cell output.
# test_dataset = get_data_from_generator(data_processor=test_dp,
#                                        batch_size=BATCH_SIZE,
#                                        patch_size=patch_size,
#                                        number_channels=nch,
#                                        number_classes=4,
#                                        trainable=False)

' test_dataset = get_data_from_generator(data_processor=test_dp,\n                                                batch_size=BATCH_SIZE,\n                                                patch_size=patch_size,\n                                                number_channels=nch,\n                                                number_classes=4,\n                                                trainable=False) '

# Training Pipeline

In [7]:
import tensorflow as tf
from tensorflow.keras import Input, Model
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Dropout, UpSampling1D, concatenate

# TODO: provide u-net with one encoder layer only and suggest for them to
# increase its size.
def unet_pcg(nch, patch_size, dropout=0.0, num_classes=4):
    """Build a 1-D U-Net that maps a patch to per-sample class probabilities.

    The network has four encoder stages (8, 16, 32, 64 filters), a 128-filter
    bottleneck and four decoder stages with skip connections, followed by a
    1x1 softmax convolution.

    Args:
        nch: number of input channels per sample.
        patch_size: samples per patch. Must be a multiple of 16 because the
            encoder halves the length four times and the decoder doubles it
            back before concatenating with the skip connections.
        dropout: dropout rate applied after every pooling and concatenation.
        num_classes: number of output classes per sample (default 4, the
            previously hard-coded value).

    Returns:
        An uncompiled ``tf.keras.Model`` with input ``(patch_size, nch)`` and
        output ``(patch_size, num_classes)``.

    Raises:
        ValueError: if ``patch_size`` is not a multiple of 16.
    """
    num_poolings = 4
    length_factor = 2 ** num_poolings
    if patch_size is not None and patch_size % length_factor != 0:
        # Otherwise the failure surfaces later as an obscure concatenate shape error
        raise ValueError(
            f"patch_size must be a multiple of {length_factor}, got {patch_size}")

    layers = tf.keras.layers

    def double_conv(x, filters, last_activation='relu'):
        # Two stacked width-3 convolutions; only the second activation varies
        x = layers.Conv1D(filters, 3, activation='relu', padding='same')(x)
        return layers.Conv1D(filters, 3, activation=last_activation, padding='same')(x)

    inputs = layers.Input(shape=(patch_size, nch))

    # Encoder: remember each stage's pre-pooling output for the skip connections
    skips = []
    x = inputs
    for filters in (8, 16, 32, 64):
        x = double_conv(x, filters)
        skips.append(x)
        x = layers.MaxPooling1D(pool_size=2)(x)
        x = layers.Dropout(dropout)(x)

    # Bottleneck
    x = double_conv(x, 128)

    # Decoder: (up-conv filters, double-conv filters, activation of the second conv).
    # The filter counts and the final 'tanh' are kept exactly as originally
    # written so that weights saved from the earlier definition still load.
    # NOTE(review): the up-conv filters (64, 64, 32, 8) do not follow a regular
    # halving pattern - confirm whether that is intended.
    decoder_plan = (
        (64, 64, 'relu'),
        (64, 32, 'relu'),
        (32, 16, 'relu'),
        (8, 8, 'tanh'),
    )
    for (up_filters, conv_filters, last_activation), skip in zip(decoder_plan, reversed(skips)):
        x = layers.UpSampling1D(size=2)(x)
        x = layers.Conv1D(up_filters, 2, padding='same')(x)
        x = layers.concatenate([x, skip], axis=2)
        x = layers.Dropout(dropout)(x)
        x = double_conv(x, conv_filters, last_activation)

    # Per-sample class probabilities
    outputs = layers.Conv1D(num_classes, 1, activation='softmax')(x)

    model = tf.keras.Model(inputs=[inputs], outputs=[outputs])
    return model

## Hyperparameter Setting

In [8]:
from sklearn.metrics import accuracy_score, precision_score
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.metrics import Precision

In [9]:
# Training configuration: tune the hyperparameters, epochs and optimizer here.
# Open questions: which metrics are adequate, and is cross-entropy the right loss?
checkpoint_path = './unet_weights/checkpoint.keras'
EPOCHS = 10
learning_rate = 1e-4

model = unet_pcg(nch, patch_size)
model.compile(
    optimizer=Adam(learning_rate=learning_rate),
    loss='categorical_crossentropy',
    metrics=['CategoricalAccuracy', 'Precision', 'Recall'],
)

# Save to checkpoint_path only when the validation loss improves
model_checkpoint = ModelCheckpoint(
    filepath=checkpoint_path,
    monitor='val_loss',
    save_best_only=True,
)
history = model.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=EPOCHS,
    # steps_per_epoch = int(np.floor(train_dp.num_steps / BATCH_SIZE)),
    verbose=1,
    shuffle=True,
    callbacks=[model_checkpoint],
)

# Reload the weights from the best checkpoint written during training
model.load_weights(checkpoint_path)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


## Inference pipeline
Collect the predictions of the U-Net

In [10]:
# train_dataset is reshuffled on every pass, so predictions made on it come back
# in an order that cannot be matched to the patches that produced them.
# Predict on an unshuffled view of the training patches instead.
train_inference_dataset = get_data_from_generator(data_processor=train_dp,
                                                  batch_size=BATCH_SIZE,
                                                  patch_size=patch_size,
                                                  number_channels=nch,
                                                  number_classes=4,
                                                  trainable=False)
predictions_train = model.predict(train_inference_dataset)
val_test = model.predict(val_dataset)



## Post processing