# Load all long ECG recordings and individual ECG recordings from the original PhysioNet dataset
This file shows an example of how to load the ECG data from the PhysioNet QT Database, which was originally downloaded from https://www.physionet.org/content/qtdb/1.0.0/

This is an exercise in refactoring code that I developed a couple of years ago.

This file uses different machine learning methods to perform signal segmentation.


Make sure to refer to 'requirements.txt' and use the packages at the listed versions. Other package versions may not be compatible with the 'wfdb' package; the most likely culprit is a newer version of the 'numpy' package.

In [9]:
import os, sys
import numpy as np
import matplotlib.pyplot as plt
import numpy as np
import wfdb  # waveform database package, a library of tools for reading, writing, and processing WFDB signals and annotations.
import pickle

from lib import HelpferFunctions as hf  # custom module with helperfunctions needed here

### Reorganize data files
Each data file contains recordings from two channels. The two channels are ECG recordings collected at the same time, and should come from two different recording electrodes. Different data files may contain a different number of samples.

Thus, we first find the minimum recording length, then truncate all the data to this minimum length and reorganize the data into one single large dataset.

In [3]:
#%% Scan the local QT database folder for '.pu1' annotation files and
# find the shortest signal length among the matching recordings.
data_path = 'qt-database-1.0.0'  # this cardiac data was downloaded and saved locally

# os.listdir() returns entries in arbitrary order; sorting keeps the record
# order (and therefore the row order of the dataset built from it) reproducible.
all_file_names = sorted(os.listdir(data_path))

min_sig_len = None  # shortest 'sig_len' seen so far; None until the first record is read
all_pu1s = []       # record names (without extension) that have a '.pu1' annotation file

for a_file in all_file_names:
    record_name, extension = os.path.splitext(a_file)

    # Match on the extension itself rather than on a 'pu1' substring anywhere
    # in the file name, and make sure one record is never added twice.
    if extension != '.pu1' or record_name in all_pu1s:
        continue

    all_pu1s.append(record_name)  # only add the record name, not the extension
    record_path = os.path.join(data_path, record_name)

    # A single read per record is enough: rdsamp() returns the samples together
    # with a metadata dict that holds both values needed here.
    signals, fields = wfdb.rdsamp(record_path)
    fs = fields['fs']
    length = fields['sig_len']

    min_sig_len = length if min_sig_len is None else min(min_sig_len, length)

print("Total number of recording files are:", len(all_pu1s))
print("Minimum signal length is:" , min_sig_len)

Total number of recording files are: 105
Minimum signal length is: 224993


Now reorganize the separate recordings into a single data file

In [6]:
# Build one large dataset out of the separate recordings listed in all_pu1s.
All_data_list = []          # truncated signals, one 2-D array per recording
All_annotation_sample = []  # original annotation.sample of every recording
All_annotation_symbol = []  # original annotation.symbol of every recording
All_annotation_list = []    # truncated expanded annotation of every recording

for record_name in all_pu1s:
    record_path = os.path.join(data_path, record_name)
    signals, fields = wfdb.rdsamp(record_path)  # samples and metadata of a single file

    # read the 'pu1' annotation for a single file
    annotation = wfdb.rdann(record_path, 'pu1')

    # Expand with THIS recording's own length. The previous code passed the
    # `length` variable left over from the file-scanning loop, i.e. the length
    # of whichever recording happened to be read last.
    annot_expand = hf.expand_annotation(annotation.sample, annotation.symbol, fields['sig_len'])

    # use the smallest signal length to truncate all the data, so that
    # we can concatenate all the signal recordings in a single large dataset
    signals2add = signals[0:min_sig_len, :]
    annotation2add = annot_expand[0:min_sig_len]

    All_data_list.append(signals2add)
    All_annotation_sample.append(annotation.sample)
    All_annotation_symbol.append(annotation.symbol)
    All_annotation_list.append(annotation2add)

All_data = np.asarray(All_data_list)  # size(num_obs, time_stamps, n_features)
All_annotation = np.asarray(All_annotation_list)  # size(num_obs, time_stamps)

print("Data file shape is [number of observations, time stamps, number of features]")
print(All_data.shape)
print("Annotation file shape is [number of observations, time stamps]")
print(All_annotation.shape)

# append a trailing axis of size 1: (num_obs, time_stamps) -> (num_obs, time_stamps, 1)
All_labels = All_annotation.reshape(All_annotation.shape[0], All_annotation.shape[1], 1)
print("Reshape annotation file to meet the requirement of the lstm model input")
print(All_labels.shape)

Data file shape is [number of observations, time stamps, number of features]
(105, 224993, 2)
Annotation file shape is [number of observations, time stamps]
(105, 224993)
Reshape annotation file to meet the requirement of the lstm model input
(105, 224993, 1)


### Save the data

In [7]:
# Bundle the long recordings, the sampling value `fs` and the annotations
# into one dict; every entry carries a short 'explain' text next to its data.
data = {
    "ECG": {'data': All_data, 
            'explain': 'data size is with [105, 224993, 2], 105 data files, with each recording length is 224993, and each recording has 2 channel of ECG',},
    "fs": fs,
    "All_annotation": {'data': All_annotation,
                       'explain': 'converted annotation of all data points. -1: baseline, 0: N, 1: st, 2:t, 3:iso, 4:p, 5:pq'},
    "All_annotation_sample": {'data': All_annotation_sample,
                              'explain': 'original annoation sample for all recordings files'},
    "All_annotation_symbol": {'data': All_annotation_symbol,
                              'explain': 'original annoation symbol for all recordings files'},
}

# File name for the pickle file
filename = "long_EKG_recording.pkl"
file_path = os.path.join("data", filename)

# Save the data to the pickle file.
# NOTE: an existing file is never overwritten, so a pickle written by an older
# version of this cell stays on disk until it is deleted manually.
if os.path.exists(file_path):
    print(f"File '{file_path}' already exists. Skipping save.")
else:
    # Write to a temporary file first and rename it afterwards, so an
    # interrupted dump can never leave a truncated pickle that the
    # "already exists" check above would later mistake for a finished save.
    tmp_path = file_path + ".tmp"
    try:
        # open() does not create missing folders, so make sure "data" exists
        os.makedirs(os.path.dirname(file_path), exist_ok=True)
        with open(tmp_path, "wb") as file:  # "wb" mode for writing binary data
            pickle.dump(data, file)
        os.replace(tmp_path, file_path)
        print(f"Data successfully saved to {filename}")

    except Exception as e:
        print(f"An error occurred: {e}")
        if os.path.exists(tmp_path):
            os.remove(tmp_path)  # do not leave a partial file behind
    



File 'data/long_EKG_recording.pkl' already exists. Skipping save.


#### After saving the data, load the data. 

In [16]:
# Read the pickle back from disk and list the top-level keys it contains.
# pickle.load() should only ever be used on trusted files such as this
# locally written one.
try:
    with open(file_path, mode="rb") as pickle_in:  # binary read mode
        loaded_data = pickle.load(pickle_in)
    loaded_keys = loaded_data.keys()
    print("Loaded data:", loaded_keys)

except FileNotFoundError:
    print(f"File {filename} not found.")
except Exception as load_error:
    print(f"An error occured during loading: {load_error}")

Loaded data: dict_keys(['ECG', 'fs', 'all_annotation'])


#### Save the long ECG recording files as individual ECG recordings
Use the custom function seg_single_ECGs() to separate individual ECG signals from the long recordings.

In [12]:
#%% Split every long ECG recording into smaller/individual ECG segments

all_ECG_data = data['ECG']['data']
all_annotation_expand = data['All_annotation']['data']
All_annotation_sample = data['All_annotation_sample']['data']
All_annotation_symbol = data['All_annotation_symbol']['data']

n = len(all_ECG_data)  # the total number of long ECG recording files

# Per-recording results are collected in lists and concatenated once after the
# loop; concatenating inside the loop would copy the growing array every time.
ECG_chunks = []         # one (num_obs, time_length, chs) array per long recording
annotation_chunks = []  # one (num_obs, time_length) array per long recording

# iterate through the long recordings together with their annotations
for my_signal_all_chs, one_anntype, one_annsamp, one_expand in zip(
        all_ECG_data, All_annotation_symbol, All_annotation_sample, all_annotation_expand):

    # the first channel ECG data
    my_signal1 = my_signal_all_chs[:, 0]
    single_ECG_list1, single_annotation_list1 = hf.seg_single_ECGs(
        my_signal1, one_anntype, one_annsamp, one_expand)

    # the 2nd channel ECG data
    my_signal2 = my_signal_all_chs[:, 1]
    single_ECG_list2, single_annotation_list2 = hf.seg_single_ECGs(
        my_signal2, one_anntype, one_annsamp, one_expand)

    # single_annotation_list1 and single_annotation_list2 should be the same,
    # as both calls start from the same annotation of the long recording.

    # put both channels side by side on a new last axis,
    # size (num_obs, time_length, chs)
    single_ECG_array = np.stack(
        (np.asarray(single_ECG_list1), np.asarray(single_ECG_list2)), axis=2)

    # size (num_obs, time_length)
    single_annotation = np.asarray(single_annotation_list2)

    ECG_chunks.append(single_ECG_array)
    annotation_chunks.append(single_annotation)

# stack the observations of all long recordings along the first axis;
# both results stay None when there was no recording to process
all_ECG_array = np.concatenate(ECG_chunks, axis=0) if ECG_chunks else None
all_annotation = np.concatenate(annotation_chunks, axis=0) if annotation_chunks else None
        


In [13]:
# Report the dimensions of the segmented ECG array and of its annotation array,
# each as a caption line followed by the shape on its own line.
print("Data file shape is [number of observations, time stamps, number of features]",
      all_ECG_array.shape, sep="\n")
print("Annotation file shape is [number of observations, time stamps]",
      all_annotation.shape, sep="\n")

Data file shape is [number of observations, time stamps, number of features]
(111167, 140, 2)
Annotation file shape is [number of observations, time stamps]
(111167, 140)


In [14]:
# Bundle the individual ECG segments, the sampling value `fs` and their
# annotations into one dict; entries carry a short 'explain' text next to the data.
data = {
    "ECG": {'data': all_ECG_array, 
            'explain': 'data size is with [111167, 140, 2], 111167 individual ECG recordings, with each recording length is 140, and each recording has 2 channel of ECG',},
    "fs": fs,
    "all_annotation": {'data': all_annotation,
                       'explain': 'converted annotation of all data points for each individual ECG. -1: baseline, 0: N, 1: st, 2:t, 3:iso, 4:p, 5:pq'},
}

# File name for the pickle file
filename = "individual_EKG_recording.pkl"
file_path = os.path.join("data", filename)

# Save the data to the pickle file.
# NOTE: an existing file is never overwritten, so a pickle written by an older
# version of this cell stays on disk until it is deleted manually.
if os.path.exists(file_path):
    print(f"File '{file_path}' already exists. Skipping save.")
else:
    # Write to a temporary file first and rename it afterwards, so an
    # interrupted dump can never leave a truncated pickle that the
    # "already exists" check above would later mistake for a finished save.
    tmp_path = file_path + ".tmp"
    try:
        # open() does not create missing folders, so make sure "data" exists
        os.makedirs(os.path.dirname(file_path), exist_ok=True)
        with open(tmp_path, "wb") as file:  # "wb" mode for writing binary data
            pickle.dump(data, file)
        os.replace(tmp_path, file_path)
        print(f"Data successfully saved to {filename}")

    except Exception as e:
        print(f"An error occurred: {e}")
        if os.path.exists(tmp_path):
            os.remove(tmp_path)  # do not leave a partial file behind


File 'data/individual_EKG_recording.pkl' already exists. Skipping save.


In [15]:
# Read the individual-ECG pickle back from disk and list its top-level keys.
# pickle.load() should only ever be used on trusted files such as this
# locally written one.
try:
    with open(file_path, mode="rb") as pickle_in:  # binary read mode
        individual_data = pickle.load(pickle_in)
    individual_keys = individual_data.keys()
    print("Loaded data:", individual_keys)

except FileNotFoundError:
    print(f"File {filename} not found.")
except Exception as load_error:
    print(f"An error occured during loading: {load_error}")

Loaded data: dict_keys(['ECG', 'fs', 'all_annotation'])


## Use LSTM layer
Decode the point-wise output of the ECG signal, so as to achieve the segmentation of the overall signal.

You might have problems installing tensorflow on a Mac M1 machine.
Below are the steps I followed:
#### 1.create a new conda environment:
conda create -n tensorflow-env python=3.10
conda activate tensorflow-env
#### 2.install the Apple tensorflow dependencies:
conda install -c apple tensorflow-deps
#### 3.install tensorflow for macOS:
pip install tensorflow-macos
#### 4.install the metal plugin for GPU support:
pip install tensorflow-metal


In [4]:
import tensorflow as tf   # tf.__version__ : '2.0.0'
from tensorflow.keras import optimizers
from sklearn.model_selection import train_test_split

ModuleNotFoundError: No module named 'sklearn'

Split the data into 80% training and 20% testing

In [None]:
# Split the samples and their labels into a training part and a test part.
# NOTE(review): `all_data` and `all_labels` are not assigned anywhere in the
# visible part of this notebook (earlier cells create `All_data`/`All_labels`
# and `all_ECG_array`/`all_annotation`) - confirm which arrays are meant here.
X_train, X_test, y_train, y_test = train_test_split(all_data, all_labels, test_size=0.2)  # 20% of the total data left as testing data
    
# get some data within training data, to be used as validation during training.
# The rows are selected by position (indices 500..599), so the training split
# must contain at least 600 rows.
# NOTE(review): the selected rows are copied into X_val/y_val but are not
# removed from X_train/y_train in this cell - confirm that this is intended.
num_validation = 100
start_index    = 500
mask = range(start_index, start_index + num_validation)
X_val = X_train[mask]
y_val = y_train[mask]