In [1]:
import os

import warnings
# Silence every warning category for the rest of the session. This also hides
# deprecation/dtype warnings from the libraries imported below, so narrow or
# remove this filter when debugging.
warnings.filterwarnings(action='ignore')

In [2]:
import numpy as np
import pandas as pd
from tqdm import tqdm

In [3]:
import neurokit2 as nk

In [4]:
from sklearn.model_selection import train_test_split

In [5]:
import multiprocessing
import joblib
from joblib import Parallel, delayed
# Number of CPUs in the system as reported by multiprocessing. Not referenced by
# the cells visible here (the Parallel call uses a literal n_jobs=4).
num_cores = multiprocessing.cpu_count()

In [6]:
import matplotlib.pyplot as plt
from matplotlib.colors import BoundaryNorm
import matplotlib.dates as md

In [7]:
import tensorflow as tf
from tensorflow import keras




In [8]:
def get_csv_file_paths(target):
    """Return the CSV file path(s) designated by ``target``.

    Parameters
    ----------
    target : str
        Either a directory, whose direct entries ending in ``.csv`` are
        returned (joined onto ``target``), or a single existing file, which is
        returned as-is without checking its extension.

    Returns
    -------
    list
        For a directory, the matching paths in sorted order, so the result
        does not depend on the arbitrary ordering of ``os.listdir``. For a
        file, a one-element list containing ``target``.

    Raises
    ------
    FileNotFoundError
        If ``target`` is neither an existing directory nor an existing file.
        ``FileNotFoundError`` is a subclass of ``Exception``, so handlers
        written against the generic exception keep working.
    """
    if os.path.isdir(target):
        # Sorting makes downstream steps (e.g. a seeded split over the loaded
        # files) see the same order on every run and every filesystem.
        return sorted(
            os.path.join(target, name)
            for name in os.listdir(target)
            if name.endswith('.csv')
        )

    if os.path.isfile(target):
        return [target]

    raise FileNotFoundError('Target is not a file or directory.')

In [9]:
# Collect the paths of the CSV files found in the preprocessed-ECG directory.
paths = get_csv_file_paths(
    './data/ecg_preprocessed/'
)

In [10]:
def get_data(path):
    """Load one preprocessed ECG CSV and return the four columns used downstream.

    The file's first row is skipped and the fixed 27-name list below is applied
    instead, so columns are identified by position. Only ``timestamp``,
    ``ECG_Clean``, ``subject_id`` and ``category`` are parsed (``usecols``);
    the other 23 columns are never materialised, which keeps per-file memory
    low when many files are loaded.

    Parameters
    ----------
    path : str
        Path of the CSV file to read.

    Returns
    -------
    pandas.DataFrame
        Columns ``['timestamp', 'ECG_Clean', 'subject_id', 'category']`` in
        that order; ``ECG_Clean`` is float, the other three are read as str.
    """
    # Full positional layout of the file; needed so the kept names map to the
    # right columns. (A trailing 'heartbeat' column was already disabled in the
    # original list.)
    all_columns = [
        'timestamp', 'signal', 'signal_normalised', 'subject_id', 'category', 'code',
        'ECG_Raw', 'ECG_Clean', 'ECG_Rate', 'ECG_Quality',
        'ECG_R_Peaks', 'ECG_P_Peaks', 'ECG_P_Onsets', 'ECG_P_Offsets',
        'ECG_Q_Peaks', 'ECG_R_Onsets', 'ECG_R_Offsets', 'ECG_S_Peaks',
        'ECG_T_Peaks', 'ECG_T_Onsets', 'ECG_T_Offsets',
        'ECG_Phase_Atrial', 'ECG_Phase_Completion_Atrial',
        'ECG_Phase_Ventricular', 'ECG_Phase_Completion_Ventricular',
        'Index', 'Label',
    ]
    kept_columns = ['timestamp', 'ECG_Clean', 'subject_id', 'category']

    df = pd.read_csv(
        path,
        skiprows=[0],
        names=all_columns,
        usecols=kept_columns,
        dtype={
            'timestamp': str,
            'ECG_Clean': float,
            'subject_id': str,
            'category': str,
        },
    )
    # read_csv returns usecols in file order, so re-select to restore the
    # column order callers received before.
    return df[kept_columns]

# Load the files with 4 joblib workers; `data` holds one DataFrame per entry
# of `paths`.
data = Parallel(n_jobs=4)(
    delayed(get_data)(csv_path)
    for csv_path in paths
)

In [11]:
# Split at file level: each element of `data` is the DataFrame read from one
# CSV, so all rows of a file land in exactly one subset.
# 20% of the files go to test, then 25% of the remaining 80% go to validation,
# i.e. roughly 60/20/20 train/val/test.
# A fixed seed makes the split repeatable across runs (for the same ordering of
# `data`); without it every re-run wrote a different partition to disk.
SPLIT_SEED = 42
train, test = train_test_split(data, test_size=0.2, random_state=SPLIT_SEED)
train, val = train_test_split(train, test_size=0.25, random_state=SPLIT_SEED)

In [12]:
# Write the training frames one at a time; concatenating them first
# (pd.concat(train).to_hdf(...)) raised a memory error.
# to_hdf defaults to append=False, which replaces the whole 'train' table on
# every call, so previously only the last frame survived. Here the first frame
# overwrites (also clearing a table left by an earlier run) and every later
# frame is appended.
# NOTE(review): appended string values must fit the column widths fixed by the
# first frame; pass min_itemsize if a later file can hold longer strings.
for i, df in enumerate(train):
    df.to_hdf(
        './data/dataset.h5',
        key='train',
        format='table',
        mode='a',
        append=i > 0,
    )

In [13]:
# Drop the reference to the training frames now that they have been written out.
del train

In [14]:
# Write the test frames one at a time; concatenating them first
# (pd.concat(test).to_hdf(...)) raised a memory error.
# to_hdf defaults to append=False, which replaces the whole 'test' table on
# every call, so previously only the last frame survived. Here the first frame
# overwrites (also clearing a table left by an earlier run) and every later
# frame is appended.
# NOTE(review): appended string values must fit the column widths fixed by the
# first frame; pass min_itemsize if a later file can hold longer strings.
for i, df in enumerate(test):
    df.to_hdf(
        './data/dataset.h5',
        key='test',
        format='table',
        mode='a',
        append=i > 0,
    )

In [15]:
# Drop the reference to the test frames now that they have been written out.
del test

In [16]:
# Write the validation frames one at a time; concatenating them first
# (pd.concat(val).to_hdf(...)) raised a memory error.
# to_hdf defaults to append=False, which replaces the whole 'val' table on
# every call, so previously only the last frame survived. Here the first frame
# overwrites (also clearing a table left by an earlier run) and every later
# frame is appended.
# NOTE(review): appended string values must fit the column widths fixed by the
# first frame; pass min_itemsize if a later file can hold longer strings.
for i, df in enumerate(val):
    df.to_hdf(
        './data/dataset.h5',
        key='val',
        format='table',
        mode='a',
        append=i > 0,
    )

In [17]:
# Drop the reference to the validation frames now that they have been written out.
del val