NOTE: This notebook preprocesses ECG samples. It takes EDF files previously downloaded to the desktop and converts them into usable ECG segments stored in pickle files.

Retrieve imports

In [None]:
import torch
from torch import nn
from torch.utils.data import DataLoader
from torchvision import datasets
from torchvision.transforms import ToTensor
from pyedflib import highlevel
import pyedflib as plib
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras import layers
from tensorflow import keras
import tensorflow as tf
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix, ConfusionMatrixDisplay
from tensorflow.keras.models import load_model
import neurokit2 as nk
import os

(Optional) Read the file's metadata (what the ECG channel is named, its sampling frequency, etc.).

In [None]:
# Inspect one EDF file to find out how its ECG channel is named and sampled.
file_path = "INSERT FILE PATH"

# List every channel in the file first. Reading with ch_names="ECG" alone only
# returns a header when the channel is literally named "ECG", so it cannot
# reveal the channel name for files that use a different label.
edf_header = highlevel.read_edf_header(file_path)
for channel_header in edf_header["SignalHeaders"]:
    print(channel_header["label"], channel_header["sample_frequency"])

# Original check: header of the channel named "ECG" (an empty list if the file
# has no channel with that name).
signals, signal_headers, header = highlevel.read_edf(file_path, ch_names="ECG")
print(signal_headers)

Preprocessing unlabeled data. Read directly from files stored on desktop (previously downloaded as EDFs). Apply Neurokit2 cleaning filter and save all the data in a DataFrame. Then save the dataframe as a pickle file (space-efficient).

Must run once for each database (change the root directory path and pickle file path as needed).

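Notes: Only takes 42 segments (240 samples, i.e. 30 s each) per ECG, starting at downsampled sample indices 10000 through 19840; at the nominal 8 Hz rate this corresponds to roughly t=1250 s to t=2500 s of the recording, not t=10000 s to t=20000 s. If the ECG channel is not named "ECG", then the read_edf parameter must be modified as well.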

In [None]:
# Preprocess unlabeled recordings: read the "ECG" channel of every file in
# root_directory_path, block-average it down to a nominal 8 Hz, cut fixed-length
# segments out of a fixed window, clean each segment with NeuroKit2 and store
# one row per segment (row index = source file name) in a gzip-compressed pickle.
root_directory_path = "F:\\Research\\wsc\\polysomnography"

pickle_file_path = "C:\\Users\\Gang Ren\\Research\\wsc_cleaned_ECG_data_30s_all.pkl.gz"

# Processing parameters (previously inline magic numbers).
TARGET_HZ = 8.0        # nominal rate after downsampling; the effective rate is
                       # fs / factor, which is exactly 8 Hz only if fs is a multiple of 8
SEGMENT_LEN = 240      # samples per segment (30 s at 8 Hz)
WINDOW_START = 10000   # first downsampled sample index used
WINDOW_STOP = 20000    # segment start indices stay below this value

master_df = pd.DataFrame({})

# Rows are collected in plain lists and turned into a DataFrame once per file;
# calling pd.concat once per segment copies the whole frame every time.
segment_rows = []
segment_names = []

# Iterate over files in directory
file_count = 0
sample_count = 0
for name in os.listdir(root_directory_path):
    try:
        # Open file
        file_path = os.path.join(root_directory_path, name)
        signals, signal_headers, header = highlevel.read_edf(file_path, ch_names="ECG")
        if len(signal_headers) == 0:
            raise ValueError('no channel named "ECG" in file')

        # Calculate the factor by which to downsample
        sample_frequency = signal_headers[0]['sample_frequency']
        factor = int(sample_frequency // TARGET_HZ)
        if factor < 1:
            raise ValueError(f"sampling rate {sample_frequency} Hz is below the {TARGET_HZ} Hz target")

        # Drop trailing samples so the length is divisible by the factor, then
        # average each group of `factor` consecutive samples.
        ecg = signals[0]
        usable_size = ecg.size - (ecg.size % factor)
        downsampled = ecg[:usable_size].reshape(-1, factor).mean(axis=1)

        # Clean every full-length segment inside the window. Segments are kept in
        # a per-file list so that a failure part-way through a file leaves no
        # partial data for that file in the output.
        file_rows = []
        for start in range(WINDOW_START, WINDOW_STOP, SEGMENT_LEN):
            segment = downsampled[start:start + SEGMENT_LEN]
            if segment.size < SEGMENT_LEN:
                # Recording ends inside the window: a short or empty slice would
                # either fail in the filter or produce a ragged, NaN-padded row.
                break
            file_rows.append(nk.ecg_clean(segment, sampling_rate=TARGET_HZ, method="neurokit"))

        segment_rows.extend(file_rows)
        segment_names.extend([name] * len(file_rows))
        sample_count += len(file_rows)
        file_count += 1

        print ("Done with file " + str(file_count) + ", Done with " + str(sample_count) + " samples, Original Sampling Rate " + str(signal_headers[0]['sample_frequency']))

        # Checkpoint after every file so an interrupted run keeps its progress.
        if segment_rows:
            master_df = pd.DataFrame(np.vstack(segment_rows), index=segment_names)
        master_df.to_pickle(pickle_file_path, compression="gzip")

    except Exception as e:
        # Catch and log any errors for the current file, then move on to the next one
        print(f"Error processing file {name}: {e}")

Preprocessing labeled data (CAP database). Read directly from files stored on desktop (previously downloaded as EDFs). Apply Neurokit2 cleaning filter and save all the data in a DataFrame. Then save the dataframe as a pickle file (space-efficient).

In [None]:
# Preprocess labeled CAP recordings: for every "rbd*" and "n*" EDF file (other
# prefixes such as "nfle*" / "narco*" are skipped), read the ECG channel,
# block-average it down to a nominal 8 Hz, split the whole recording into
# consecutive fixed-length segments, clean each one with NeuroKit2 and store it
# as one row with the class label in column 0 (row index = source file name).
root_directory_path = "F:\\Research\\capslpdb"

# File path for the pickle file
# NOTE(review): the extension is ".pk1.gz" (digit one), unlike the ".pkl.gz" used
# for the unlabeled data. It is left unchanged because other code may load this
# exact path -- confirm before renaming.
pickle_file_path = "C:\\Users\\Gang Ren\\Research\\capslpdb_cleaned_ECG_data_30s_all.pk1.gz"

# Processing parameters (previously inline magic numbers).
TARGET_HZ = 8.0      # nominal rate after downsampling; the effective rate is
                     # fs / factor, which is exactly 8 Hz only if fs is a multiple of 8
SEGMENT_LEN = 240    # samples per segment (30 s at 8 Hz)
RBD_LABEL = -1       # label stored for files whose name starts with "rbd"
NORMAL_LABEL = 1     # label stored for "n*" files
# Channel names tried in order -- the CAP files do not all use the same ECG label.
ECG_CHANNEL_NAMES = ("ECG1-ECG2", "ECG", "EKG")

master_df = pd.DataFrame({})

# Rows are collected in plain lists and turned into a DataFrame once at the end;
# calling pd.concat once per segment copies the whole frame every time.
labeled_rows = []
row_names = []

n_count = 0
rbd_count = 0

# Iterate over files in directory
for name in os.listdir(root_directory_path):
    if not name.endswith(".edf"):
        continue

    if name.startswith("rbd"):
        label = RBD_LABEL
    elif name.startswith("n") and not name.startswith(("nfle", "narco")):
        label = NORMAL_LABEL
    else:
        continue

    # Open file -- account for the different channel names in CAP database
    file_path = os.path.join(root_directory_path, name)

    # Reset the per-file state before reading. Without this, a failed read left
    # the previous file's signal in place and it was stored again under the
    # current file's name and label.
    signals, signal_headers = [], []
    try:
        for channel_name in ECG_CHANNEL_NAMES:
            signals, signal_headers, header = highlevel.read_edf(file_path, ch_names=channel_name)
            if len(signals) > 0:
                break
    except OSError as err:
        print("OS error: " + file_path)
        continue

    if len(signals) == 0:
        # None of the known ECG channel names is present in this file.
        print ("Index error: " + file_path)
        continue

    # Calculate the factor by which to downsample
    factor = int(signal_headers[0]['sample_frequency'] // TARGET_HZ)
    if factor < 1:
        print ("Sampling rate below target: " + file_path)
        continue

    # Drop trailing samples so the length is divisible by the factor (an exact
    # multiple is required by reshape), then average each group of `factor`
    # consecutive samples.
    ecg = signals[0]
    usable_size = ecg.size - (ecg.size % factor)
    downsampled = ecg[:usable_size].reshape(-1, factor).mean(axis=1)
    print(len(downsampled))

    # "+ 1" keeps the final segment when the length is an exact multiple of
    # SEGMENT_LEN; only full-length segments are produced.
    for start in range(0, len(downsampled) - SEGMENT_LEN + 1, SEGMENT_LEN):
        segment = downsampled[start:start + SEGMENT_LEN]
        cleaned = nk.ecg_clean(segment, sampling_rate=TARGET_HZ, method="neurokit")
        # Column 0 carries the label, columns 1..SEGMENT_LEN the cleaned signal.
        labeled_rows.append(np.insert(cleaned, 0, label))
        row_names.append(name)

    if label == RBD_LABEL:
        rbd_count += 1
    else:
        n_count += 1

    print ("Done with " + name)

if labeled_rows:
    master_df = pd.DataFrame(np.vstack(labeled_rows), index=row_names)

print(len(master_df))
print(rbd_count)
print(n_count)
master_df.to_pickle(pickle_file_path, compression="gzip")