## Pre-process data

In [1]:
# normalization
def normalize_data(data):
    """
    Normalize a signal using min-max normalization.
    
    Parameters:
    - signal: Input signal (numpy array or list)
    
    Returns:
    - normalized_signal: Normalized signal
    """
    
    row, __ = data.shape
    processed_data = np.zeros(data.shape)
    
    for lead in range(row):
        # Calculate the minimum and maximum values of the signal
        min_val = np.min(data[lead])
        max_val = np.max(data[lead])

        # Perform min-max normalization
        normalized_signal = (data[lead] - min_val) / (max_val - min_val)
        
        processed_data[lead] = normalized_signal
    
    return processed_data

# baseline 
def baseline(data):
    row,__ = data.shape
    sampling_frequency = 500

    win_size = int(np.round(0.2 * sampling_frequency)) + 1
    baseline = scipy.ndimage.median_filter(data, [1, win_size], mode="constant")
    win_size = int(np.round(0.6 * sampling_frequency)) + 1
    baseline = scipy.ndimage.median_filter(baseline, [1, win_size], mode="constant")
    filt_data = data - baseline

    return filt_data

# notch filter
def notch(data):
    sampling_frequency = 500
    row, __ = data.shape
    processed_data = np.zeros(data.shape)
    b = np.ones(int(0.02 * sampling_frequency)) / 50.
    a = [1]
    for lead in range(0, row):
        X = scipy.signal.filtfilt(b, a, data[lead,:])
        processed_data[lead,:] = X
        
    return processed_data


#### Dataset 1

In [None]:
# cfg["dataloader"]["normalize_y"]  # True
# cfg["dataloader"]["notch_filter"]   # False
# cfg["dataloader"]["baseline_filter"]  # True
# cfg["dataloader"]["mean_zero"]  # True

In [6]:
import time
import ecg
import config

import numpy as np
import pandas as pd
import scipy.signal
import torch
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset

mode = "train"
task = "scd_classification"
cfg_updates={}

# configaration
cfg = config.update_config_dict(
    config.cfg,
    config.task_cfg[task],
)
cfg = config.update_config_dict(
    config.cfg,
    cfg_updates,
)

if mode == "train":

    # using TensorDataset to creat dataset directly
    waveform_file_path = cfg["dataloader"]["waveforms_file"]
    ecg_waveforms = np.load(waveform_file_path).astype(np.float32)

    labels_file_path = cfg["dataloader"]["label_file"]
    ecg_labels = np.array(pd.read_csv(labels_file_path)[cfg["dataloader"]["label_keys"]])

    # do some preprocess to the waveforms data
    if (cfg["dataloader"]["normalize_y"] or cfg["dataloader"]["notch_filter"] 
        or cfg["dataloader"]["baseline_filter"] or cfg["dataloader"]["mean_zero"]):

        for i in range(len(ecg_waveforms)):

            # baseline filter
            if cfg["dataloader"]["baseline_filter"]:
                ecg_waveforms[i] = baseline(ecg_waveforms[i])

            # notch filter
            if cfg["dataloader"]["notch_filter"]:
                ecg_waveforms[i] = notch(ecg_waveforms[i])

            # normalization
            if cfg["dataloader"]["normalize_y"]:
                ecg_waveforms[i] = normalize_data(ecg_waveforms[i])

            # zero mean
            if cfg["dataloader"]["mean_zero"]:
                for lead in range(len(ecg_waveforms[i])):
                    ecg_waveforms[i][lead] = ecg_waveforms[i][lead] - np.mean(ecg_waveforms[i][lead])

        print('data preprocessed!')

    # Step 1: Split into training and test data 
    temp_data, test_data, temp_labels, test_labels = train_test_split(ecg_waveforms, ecg_labels, test_size = 0.2, random_state=42)

    # Step 2: Split the temporary data into training and validation sets
    train_data, val_data, train_labels, val_labels = train_test_split(temp_data, temp_labels, test_size=0.25, random_state=42)

    waveforms = {"train": train_data,
                "test": test_data,
                "valid": val_data,
                "all": ecg_waveforms}

    labels = {"train": train_labels,
            "test": test_labels,
            "valid": val_labels,
             "all": ecg_labels}

    # creat datasets
    datasets = {k: TensorDataset(torch.tensor(waveforms[k], dtype=torch.float32), 
                                 torch.tensor(labels[k], dtype=torch.float32)) 
                for k in ["train", "valid", "test", "all"]}
    

# Save the datasets to a file
torch.save(datasets, 'datasets_baseline_norm_zero.pth')

data preprocessed!


In [None]:
# remove nan

# load dataset directly
datasets = torch.load('datasets_baseline_norm_zero.pth')

filtered_datasets = {}

for k in ["train", "valid", "test", "all"]:
    filtered_data, filtered_labels = zip(*[(data, label) for data, label in datasets[k] if not torch.isnan(data).any().item()])
    filtered_datasets[k] = TensorDataset(torch.stack(filtered_data), torch.stack(filtered_labels))

# Save the datasets to a file
torch.save(filtered_datasets, 'datasets_baseline_norm_zero_remove_nan.pth')

#### Dataset 2

In [2]:
import time
import ecg
import config

import numpy as np
import pandas as pd
import scipy.signal
import torch
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset

mode = "train"
task = "scd_classification"
cfg_updates={}

# configaration
cfg = config.update_config_dict(
    config.cfg,
    config.task_cfg[task],
)
cfg = config.update_config_dict(
    config.cfg,
    cfg_updates,
)

cfg["dataloader"]["normalize_y"] = False

if mode == "train":

    # using the new dataset defined by myself
    # datasets = {k: ECGDataset(cfg["dataloader"], k) for k in ["train", "valid", "test", "all"]}

    # the original defined dataset
    # datasets = {k: ECGDataset(cfg["dataloader"], k, output=output, all_waveforms=True) for k in ["train", "valid", "test", "all"]}

    # using TensorDataset to creat dataset directly
    waveform_file_path = cfg["dataloader"]["waveforms_file"]
    ecg_waveforms = np.load(waveform_file_path).astype(np.float32)

    labels_file_path = cfg["dataloader"]["label_file"]
    ecg_labels = np.array(pd.read_csv(labels_file_path)[cfg["dataloader"]["label_keys"]])

    # do some preprocess to the waveforms data
    if (cfg["dataloader"]["normalize_y"] or cfg["dataloader"]["notch_filter"] 
        or cfg["dataloader"]["baseline_filter"] or cfg["dataloader"]["mean_zero"]):

        for i in range(len(ecg_waveforms)):

            # baseline filter
            if cfg["dataloader"]["baseline_filter"]:
                ecg_waveforms[i] = baseline(ecg_waveforms[i])

            # notch filter
            if cfg["dataloader"]["notch_filter"]:
                ecg_waveforms[i] = notch(ecg_waveforms[i])

            # normalization
            if cfg["dataloader"]["normalize_y"]:
                ecg_waveforms[i] = normalize_data(ecg_waveforms[i])

            # zero mean
            if cfg["dataloader"]["mean_zero"]:
                for lead in range(len(ecg_waveforms[i])):
                    ecg_waveforms[i][lead] = ecg_waveforms[i][lead] - np.mean(ecg_waveforms[i][lead])

        print('data preprocessed!')

    # Step 1: Split into training and test data 
    temp_data, test_data, temp_labels, test_labels = train_test_split(ecg_waveforms, ecg_labels, test_size = 0.2, random_state=42)

    # Step 2: Split the temporary data into training and validation sets
    train_data, val_data, train_labels, val_labels = train_test_split(temp_data, temp_labels, test_size=0.25, random_state=42)

    waveforms = {"train": train_data,
                "test": test_data,
                "valid": val_data,
                "all": ecg_waveforms}

    labels = {"train": train_labels,
            "test": test_labels,
            "valid": val_labels,
             "all": ecg_labels}

    # creat datasets
    datasets = {k: TensorDataset(torch.tensor(waveforms[k], dtype=torch.float32), 
                                 torch.tensor(labels[k], dtype=torch.float32)) 
                for k in ["train", "valid", "test", "all"]}
    

# Save the datasets to a file
torch.save(datasets, 'datasets_baseline_zero.pth')

data preprocessed!


#### Dataset 3

In [2]:
import time
import ecg
import config

import numpy as np
import pandas as pd
import scipy.signal
import torch
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset

mode = "train"
task = "scd_classification"
cfg_updates={}

# configaration
cfg = config.update_config_dict(
    config.cfg,
    config.task_cfg[task],
)
cfg = config.update_config_dict(
    config.cfg,
    cfg_updates,
)

cfg["dataloader"]["normalize_y"] = False
cfg["dataloader"]["baseline_filter"] = False

if mode == "train":

    # using the new dataset defined by myself
    # datasets = {k: ECGDataset(cfg["dataloader"], k) for k in ["train", "valid", "test", "all"]}

    # the original defined dataset
    # datasets = {k: ECGDataset(cfg["dataloader"], k, output=output, all_waveforms=True) for k in ["train", "valid", "test", "all"]}

    # using TensorDataset to creat dataset directly
    waveform_file_path = cfg["dataloader"]["waveforms_file"]
    ecg_waveforms = np.load(waveform_file_path).astype(np.float32)

    labels_file_path = cfg["dataloader"]["label_file"]
    ecg_labels = np.array(pd.read_csv(labels_file_path)[cfg["dataloader"]["label_keys"]])

    # do some preprocess to the waveforms data
    if (cfg["dataloader"]["normalize_y"] or cfg["dataloader"]["notch_filter"] 
        or cfg["dataloader"]["baseline_filter"] or cfg["dataloader"]["mean_zero"]):

        for i in range(len(ecg_waveforms)):

            # baseline filter
            if cfg["dataloader"]["baseline_filter"]:
                ecg_waveforms[i] = baseline(ecg_waveforms[i])

            # notch filter
            if cfg["dataloader"]["notch_filter"]:
                ecg_waveforms[i] = notch(ecg_waveforms[i])

            # normalization
            if cfg["dataloader"]["normalize_y"]:
                ecg_waveforms[i] = normalize_data(ecg_waveforms[i])

            # zero mean
            if cfg["dataloader"]["mean_zero"]:
                for lead in range(len(ecg_waveforms[i])):
                    ecg_waveforms[i][lead] = ecg_waveforms[i][lead] - np.mean(ecg_waveforms[i][lead])

        print('data preprocessed!')

    # Step 1: Split into training and test data 
    temp_data, test_data, temp_labels, test_labels = train_test_split(ecg_waveforms, ecg_labels, test_size = 0.2, random_state=42)

    # Step 2: Split the temporary data into training and validation sets
    train_data, val_data, train_labels, val_labels = train_test_split(temp_data, temp_labels, test_size=0.25, random_state=42)

    waveforms = {"train": train_data,
                "test": test_data,
                "valid": val_data,
                "all": ecg_waveforms}

    labels = {"train": train_labels,
            "test": test_labels,
            "valid": val_labels,
             "all": ecg_labels}

    # creat datasets
    datasets = {k: TensorDataset(torch.tensor(waveforms[k], dtype=torch.float32), 
                                 torch.tensor(labels[k], dtype=torch.float32)) 
                for k in ["train", "valid", "test", "all"]}
    

# Save the datasets to a file
torch.save(datasets, 'datasets_zero.pth')

data preprocessed!


#### Dataset 4

In [4]:
import time
import ecg
import config

import numpy as np
import pandas as pd
import scipy.signal
import torch
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset

mode = "train"
task = "scd_classification"
cfg_updates={}

# configaration
cfg = config.update_config_dict(
    config.cfg,
    config.task_cfg[task],
)
cfg = config.update_config_dict(
    config.cfg,
    cfg_updates,
)

cfg["dataloader"]["normalize_y"] = False
cfg["dataloader"]["baseline_filter"] = False
cfg["dataloader"]["mean_zero"] = False

if mode == "train":

    # using the new dataset defined by myself
    # datasets = {k: ECGDataset(cfg["dataloader"], k) for k in ["train", "valid", "test", "all"]}

    # the original defined dataset
    # datasets = {k: ECGDataset(cfg["dataloader"], k, output=output, all_waveforms=True) for k in ["train", "valid", "test", "all"]}

    # using TensorDataset to creat dataset directly
    waveform_file_path = cfg["dataloader"]["waveforms_file"]
    ecg_waveforms = np.load(waveform_file_path).astype(np.float32)

    labels_file_path = cfg["dataloader"]["label_file"]
    ecg_labels = np.array(pd.read_csv(labels_file_path)[cfg["dataloader"]["label_keys"]])

    # do some preprocess to the waveforms data
    if (cfg["dataloader"]["normalize_y"] or cfg["dataloader"]["notch_filter"] 
        or cfg["dataloader"]["baseline_filter"] or cfg["dataloader"]["mean_zero"]):

        for i in range(len(ecg_waveforms)):

            # baseline filter
            if cfg["dataloader"]["baseline_filter"]:
                ecg_waveforms[i] = baseline(ecg_waveforms[i])

            # notch filter
            if cfg["dataloader"]["notch_filter"]:
                ecg_waveforms[i] = notch(ecg_waveforms[i])

            # normalization
            if cfg["dataloader"]["normalize_y"]:
                ecg_waveforms[i] = normalize_data(ecg_waveforms[i])

            # zero mean
            if cfg["dataloader"]["mean_zero"]:
                for lead in range(len(ecg_waveforms[i])):
                    ecg_waveforms[i][lead] = ecg_waveforms[i][lead] - np.mean(ecg_waveforms[i][lead])

        print('data preprocessed!')

    # Step 1: Split into training and test data 
    temp_data, test_data, temp_labels, test_labels = train_test_split(ecg_waveforms, ecg_labels, test_size = 0.2, random_state=42)

    # Step 2: Split the temporary data into training and validation sets
    train_data, val_data, train_labels, val_labels = train_test_split(temp_data, temp_labels, test_size=0.25, random_state=42)

    waveforms = {"train": train_data,
                "test": test_data,
                "valid": val_data,
                "all": ecg_waveforms}

    labels = {"train": train_labels,
            "test": test_labels,
            "valid": val_labels,
             "all": ecg_labels}

    # creat datasets
    datasets = {k: TensorDataset(torch.tensor(waveforms[k], dtype=torch.float32), 
                                 torch.tensor(labels[k], dtype=torch.float32)) 
                for k in ["train", "valid", "test", "all"]}
    

# Save the datasets to a file
torch.save(datasets, 'datasets.pth')

#### Dataset 5

In [None]:
# cfg["dataloader"]["normalize_y"]  # True
# cfg["dataloader"]["notch_filter"]   # True
# cfg["dataloader"]["baseline_filter"]  # True
# cfg["dataloader"]["mean_zero"]  # True

In [2]:
import time
import ecg
import config

import numpy as np
import pandas as pd
import scipy.signal
import torch
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset

mode = "train"
task = "scd_classification"
cfg_updates={}

# configaration
cfg = config.update_config_dict(
    config.cfg,
    config.task_cfg[task],
)
cfg = config.update_config_dict(
    config.cfg,
    cfg_updates,
)

cfg["dataloader"]["notch_filter"] = True

if mode == "train":

    # using TensorDataset to creat dataset directly
    waveform_file_path = cfg["dataloader"]["waveforms_file"]
    ecg_waveforms = np.load(waveform_file_path).astype(np.float32)

    labels_file_path = cfg["dataloader"]["label_file"]
    ecg_labels = np.array(pd.read_csv(labels_file_path)[cfg["dataloader"]["label_keys"]])

    # do some preprocess to the waveforms data
    if (cfg["dataloader"]["normalize_y"] or cfg["dataloader"]["notch_filter"] 
        or cfg["dataloader"]["baseline_filter"] or cfg["dataloader"]["mean_zero"]):

        for i in range(len(ecg_waveforms)):

            # baseline filter
            if cfg["dataloader"]["baseline_filter"]:
                ecg_waveforms[i] = baseline(ecg_waveforms[i])

            # notch filter
            if cfg["dataloader"]["notch_filter"]:
                ecg_waveforms[i] = notch(ecg_waveforms[i])

            # normalization
            if cfg["dataloader"]["normalize_y"]:
                ecg_waveforms[i] = normalize_data(ecg_waveforms[i])

            # zero mean
            if cfg["dataloader"]["mean_zero"]:
                for lead in range(len(ecg_waveforms[i])):
                    ecg_waveforms[i][lead] = ecg_waveforms[i][lead] - np.mean(ecg_waveforms[i][lead])

        print('data preprocessed!')

    # Step 1: Split into training and test data 
    temp_data, test_data, temp_labels, test_labels = train_test_split(ecg_waveforms, ecg_labels, test_size = 0.2, random_state=42)

    # Step 2: Split the temporary data into training and validation sets
    train_data, val_data, train_labels, val_labels = train_test_split(temp_data, temp_labels, test_size=0.25, random_state=42)

    waveforms = {"train": train_data,
                "test": test_data,
                "valid": val_data,
                "all": ecg_waveforms}

    labels = {"train": train_labels,
            "test": test_labels,
            "valid": val_labels,
             "all": ecg_labels}

    # creat datasets
    datasets = {k: TensorDataset(torch.tensor(waveforms[k], dtype=torch.float32), 
                                 torch.tensor(labels[k], dtype=torch.float32)) 
                for k in ["train", "valid", "test", "all"]}
    

# Save the datasets to a file
torch.save(datasets, 'datasets_baseline_notch_norm_zero.pth')

  normalized_signal = (data[lead] - min_val) / (max_val - min_val)


data preprocessed!


In [3]:
# remove nan

# load dataset directly
datasets = torch.load('datasets_baseline_notch_norm_zero.pth')

filtered_datasets = {}

for k in ["train", "valid", "test", "all"]:
    filtered_data, filtered_labels = zip(*[(data, label) for data, label in datasets[k] if not torch.isnan(data).any().item()])
    filtered_datasets[k] = TensorDataset(torch.stack(filtered_data), torch.stack(filtered_labels))

# Save the datasets to a file
torch.save(filtered_datasets, 'datasets_baseline_notch_norm_zero_remove_nan.pth')