## Parameters

In [3]:
import torch
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
from torch.utils.data import DataLoader, Dataset

import pandas as pd
import os
import numpy as np
from functools import reduce

import matplotlib.pyplot as plt

In [4]:
# Root directory of the v1 arrest-ntuh-ecg dataset.
DATA_DIR = os.path.join('/', 'home', 'ngsci', 'datasets', 'arrest-ntuh-ecg', 'v1')

# Names of the 12 ECG leads
# (presumably in the order of the lead axis of the waveform arrays -- TODO confirm).
LEAD_ORDER = [
    'I', 'II', 'III',
    'aVR', 'aVL', 'aVF',
    'V1', 'V2', 'V3', 'V4', 'V5', 'V6',
]
# Lead name -> position of that lead in LEAD_ORDER.
LEAD_INDEX = dict(zip(LEAD_ORDER, range(len(LEAD_ORDER))))

ECG_CAPTURE_TIME = 10  # seconds of signal recorded per ECG
ECG_SAMPLE_RATE = 500  # sampling frequency of each ECG, in Hz

In [5]:
# Locations of the study-group rhythm waveforms and of their lookup table.
# Both are derived from DATA_DIR so the dataset root is configured in one place.
ecg_rhythm_fp = os.path.join(DATA_DIR, 'study-group', 'ecg-waveforms', 'waveform-rhythm.npy')
ecg_lookup_fp = os.path.join(DATA_DIR, 'study-group', 'ecg-waveforms', 'waveform-npy.csv')

# Preview the lookup table (one row per ecg_id, with its npy_index and ecg_timetag).
pd.read_csv(ecg_lookup_fp)

Unnamed: 0,ecg_id,year,npy_index,ecg_timetag
0,4d2b35fb8850b75dd3f6644978e542b5,2011-14,0,[1]ROSC
1,ea01a56650768f3f41d255fe546e27a9,2011-14,1,[1]ROSC
2,e4a8dfe722c4686695265586d8b5b6bc,2011-14,2,[0]pre
3,bfae87358404a1fb0f2ee6e71296f399,2011-14,3,[1]ROSC
4,041608622e1cbdad60f80257bae3ab61,2011-14,4,[2]24hr
...,...,...,...,...
1681,104702b301e22d70dee15936c8f94376,2019,1681,[0]pre
1682,29c030ee8ec6c95ba278659535b71e4b,2019,1682,[1]ROSC
1683,ffa67b11e9cda3c9f527683a7ede1028,2019,1683,[0]pre
1684,4f788b2e831afd48205783c6174f0293,2019,1684,[1]ROSC


## Load waveform data

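Load two files: `waveform-rhythm.npy` and `waveform-npy.csv`. The former contains the waveform data with shape (1686, 12, 5000): 1686 ECG recordings, 12 leads, and 5000 samples per lead (10 seconds at 500 Hz). The latter is a lookup table with one row per recording (a patient may contribute more than one recording) and four columns: `ecg_id`, `year`, `npy_index`, and `ecg_timetag`. Using `npy_index`, each row of the waveform array can be matched to its `ecg_id` and, through that, to the patient information.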

In [6]:
def load_study_group_rhythm_ecgs():
    '''Read the study-group rhythm waveforms and their lookup table.

    Both files are read from ``<DATA_DIR>/study-group/ecg-waveforms``; the path
    and shape of each loaded object are printed.

    Returns:
        tuple: ``(waveform array loaded with np.load, lookup DataFrame)``.
    '''
    waveform_dir = os.path.join(DATA_DIR, 'study-group', 'ecg-waveforms')
    report = 'Loaded {}: {}'

    npy_path = os.path.join(waveform_dir, 'waveform-rhythm.npy')
    rhythm_array = np.load(npy_path)
    print(report.format(npy_path, rhythm_array.shape))

    csv_path = os.path.join(waveform_dir, 'waveform-npy.csv')
    lookup_df = pd.read_csv(csv_path)
    print(report.format(csv_path, lookup_df.shape))

    return rhythm_array, lookup_df

In [7]:
# Load the study-group waveform array and its lookup table (the loader prints both shapes).
waveform_rhythm_npy, waveform_npy_df = load_study_group_rhythm_ecgs()

Loaded /home/ngsci/datasets/arrest-ntuh-ecg/v1/study-group/ecg-waveforms/waveform-rhythm.npy: (1686, 12, 5000)
Loaded /home/ngsci/datasets/arrest-ntuh-ecg/v1/study-group/ecg-waveforms/waveform-npy.csv: (1686, 4)


In [8]:
# Check the array dimensions: (recordings, leads, samples per lead).
waveform_rhythm_npy.shape

(1686, 12, 5000)

## Preprocess the data

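Combine the waveform lookup tables with the metadata files to build `cardiac_df`. Each row of `cardiac_df` describes one ECG: its labels and covariates, plus the file path and array index (`file_path`, `npy_index`) needed to locate its waveform.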

In [9]:
# ECG-derived covariates shared by the study and control tables.
CARDIAC_CONTROLS = [
    "qtc500",
    "lbbb",
    "stelev",
    "stdown",
    "dltwv",
    "ventricular_rate",
    "atrial_rate",
    "qrs_duration",
    "qt_interval",
    "qtc",
    "qtc_frederica",
    "avg_rr",
]

# Columns returned for control-group ECGs: identifiers and demographics first,
# followed by the ECG-derived covariates.
CONTROL_COLS = [
    "patient_ngsci_id",
    "ecg_id",
    "npy_index",
    "days_to_event",
    "file_path",
    "age",
    "sex",
    *CARDIAC_CONTROLS,
]

# Columns returned for study-group ECGs: the control columns plus two outcome columns.
COLS = [*CONTROL_COLS, "cause_of_cardiac_arrest", "VT/VF"]


In [10]:
def collapse_lead_df(lead_df):
    """Collapse a multi-row-per-ECG measurement table to one row per ECG.

    ``stelev``, ``stdown`` and ``dltwv`` are converted to booleans that are True
    where the raw value equals 8, 4 and 32 respectively. Rows are then grouped by
    ``ecg_id`` and reduced with ``max``, so each flag is True for an ECG when at
    least one of its rows matched.

    The input frame is not modified: the flags are computed on a copy, so the
    function can safely be called again on the same frame.

    Parameters
    ----------
    lead_df : pandas.DataFrame
        Must contain ``ecg_id``, ``stelev``, ``stdown`` and ``dltwv``.

    Returns
    -------
    pandas.DataFrame
        One row per ``ecg_id`` (as a regular column) with the boolean flags.
    """
    marker_codes = {"stelev": 8, "stdown": 4, "dltwv": 32}
    # assign() returns a new frame, leaving the caller's raw codes intact.
    flagged = lead_df.assign(
        **{column: lead_df[column] == code for column, code in marker_codes.items()}
    )
    return flagged.groupby("ecg_id").max().reset_index()


def process_code_df(code_df):
    """Derive per-ECG ``lbbb`` and ``paced`` flags from diagnosis statements.

    A statement sets ``lbbb`` when its text contains "left bundle branch block"
    and ``paced`` when it contains "paced" (both case-sensitive substring
    matches). Rows are grouped by ``ecg_id`` and reduced with ``max``, so a flag
    is True for an ECG when any of its statements matched.

    Missing statement text counts as "no match" (False) rather than NaN, which
    keeps both flags boolean. The input frame is not modified.

    Parameters
    ----------
    code_df : pandas.DataFrame
        Must contain ``ecg_id`` and ``stmt_text``.

    Returns
    -------
    pandas.DataFrame
        Columns ``ecg_id``, ``lbbb`` and ``paced``, one row per ``ecg_id``.
    """
    statements = code_df["stmt_text"]
    flags = pd.DataFrame(
        {
            "ecg_id": code_df["ecg_id"],
            "lbbb": statements.str.contains("left bundle branch block", regex=False, na=False),
            "paced": statements.str.contains("paced", regex=False, na=False),
        }
    )
    return flags.groupby("ecg_id").max().reset_index()

In [11]:
def process_study_df(study_path):
    """Assemble the per-ECG metadata table for the study (arrest) group.

    Starts from the waveform lookup table, keeps only ECGs tagged ``"[0]pre"``
    and left-joins, on ``ecg_id``, the patient id and acquisition time, the
    cohort demographics, the outcome data, the collapsed per-lead flags, the
    global and resting measurements, and the diagnosis-statement flags.

    ``days_to_event`` is the whole number of days from ECG acquisition to the
    hospitalization timestamp in ``rosc-outcomes.csv``.

    Parameters
    ----------
    study_path : str
        Directory of the study group; must contain ``ecg-waveforms/``,
        ``ecg-metadata/``, ``ecg.csv``, ``cohort.csv`` and ``rosc-outcomes.csv``.

    Returns
    -------
    pandas.DataFrame
        The columns listed in ``COLS``, one row per joined "[0]pre" ECG.
    """
    # base df: only ECGs tagged "[0]pre" in the waveform lookup table
    lookup_df = pd.read_csv(f"{study_path}/ecg-waveforms/waveform-npy.csv")[
        ["ecg_id", "npy_index", "ecg_timetag"]
    ]
    group_df = lookup_df.loc[lookup_df["ecg_timetag"] == "[0]pre"]

    # get additional information
    ecg_df = pd.read_csv(f"{study_path}/ecg.csv")[
        ["patient_ngsci_id", "ecg_id", "acquisition_datetime_offset"]
    ]
    cohort_df = pd.read_csv(f"{study_path}/cohort.csv")[["ecg_id", "age", "sex"]]
    # use hospitalization for study because it's more complete - we do not have for control cases
    outcomes_df = pd.read_csv(f"{study_path}/rosc-outcomes.csv")[
        ["ecg_id", "cause_of_cardiac_arrest", "hospitalization_datetime_offset", "initial_rhythm"]
    ]
    outcomes_df["VT/VF"] = outcomes_df["initial_rhythm"] == "VT/VF"

    # lead level measurement
    lead_df = pd.read_csv(f"{study_path}/ecg-metadata/measurement-matrix-per-lead.csv")[
        ["ecg_id", "stelev", "stdown", "dltwv"]
    ]
    lead_df_collapsed = collapse_lead_df(lead_df)

    # qtc measurement
    global_df = pd.read_csv(f"{study_path}/ecg-metadata/measurement-matrix-global.csv")[
        ["ecg_id", "qtc", "avg_rr"]
    ]
    global_df["qtc500"] = global_df["qtc"] > 500

    # other resting measurements
    resting_df = pd.read_csv(f"{study_path}/ecg-metadata/resting-ecg-measurements.csv")[
        ["ecg_id", "ventricular_rate", "atrial_rate", "qrs_duration", "qt_interval", "qtc_frederica"]
    ]

    # code processing
    code_df = pd.read_csv(f"{study_path}/ecg-metadata/diagnosis.csv")[["ecg_id", "stmt_text"]]
    code_df_collapsed = process_code_df(code_df)

    # merge: every table is keyed by ecg_id; the key is spelled out so that a
    # column added to one of the tables later cannot silently become a join key
    group_df = (
        group_df.merge(ecg_df, how="left", on="ecg_id")
        .merge(cohort_df, how="left", on="ecg_id")
        .merge(outcomes_df, how="left", on="ecg_id")
        .merge(lead_df_collapsed, how="left", on="ecg_id")
        .merge(global_df, how="left", on="ecg_id")
        .merge(resting_df, how="left", on="ecg_id")
        .merge(code_df_collapsed, how="left", on="ecg_id")
    )

    # Parse both timestamps the same way (and the same way as the control-group
    # table does) so a column with more than one timestamp layout still parses.
    group_df["days_to_event"] = (
        pd.to_datetime(group_df["hospitalization_datetime_offset"], format='mixed')
        - pd.to_datetime(group_df["acquisition_datetime_offset"], format='mixed')
    ).dt.days
    group_df["file_path"] = f"{study_path}/ecg-waveforms/waveform-rhythm.npy"
    return group_df[COLS]


In [12]:
def process_control_df(control_path):
    """Assemble the per-ECG metadata table for one control sub-group.

    Starts from the "pre" waveform lookup table under ``control_path`` and
    left-joins, on ``ecg_id``, the cohort information (patient id, age, sex,
    acquisition time), the collapsed per-lead flags, the global and resting
    measurements, and the diagnosis-statement flags.

    ``days_to_event`` is the whole number of days from the acquisition of a
    cohort row tagged ``"pre"`` to the acquisition of the same patient's row
    tagged ``"control"``.

    Parameters
    ----------
    control_path : str
        Directory of one control sub-group (for example
        ``.../control-group/<year>``). The shared ``ecg-cohort.csv`` is read
        from the parent directory of ``control_path``.

    Returns
    -------
    pandas.DataFrame
        The columns listed in ``CONTROL_COLS``.
    """
    # base df
    group_df = pd.read_csv(f"{control_path}/pre/ecg-waveforms/waveform-npy.csv")[["ecg_id", "npy_index"]]

    # get additional information: the cohort table sits next to the sub-group
    # folders, so locate it relative to control_path and read it only once
    control_root = os.path.dirname(os.path.normpath(control_path))
    cohort_all = pd.read_csv(os.path.join(control_root, "ecg-cohort.csv"))
    ecg_df = cohort_all[
        ["ecg_id", "patient_ngsci_id", "acquisition_datetime_offset", "ecg_timetag", "age"]
    ]
    cohort_df = cohort_all[["ecg_id", "sex"]]

    # subset to "pre" ECGs and get date of control event
    pre_df = ecg_df.loc[ecg_df["ecg_timetag"] == "pre"]
    control_df = ecg_df.loc[ecg_df["ecg_timetag"] == "control"][
        ["patient_ngsci_id", "acquisition_datetime_offset"]
    ].rename(columns={"acquisition_datetime_offset": "control_datetime"})
    final_df = pre_df.merge(control_df, how="left", on="patient_ngsci_id")
    final_df["days_to_event"] = (
        pd.to_datetime(final_df["control_datetime"], format='mixed')
        - pd.to_datetime(final_df["acquisition_datetime_offset"], format='mixed')
    ).dt.days

    # lead level measurements
    lead_df = pd.read_csv(f"{control_path}/pre/ecg-metadata/measurement-matrix-per-lead.csv")[
        ["ecg_id", "stelev", "stdown", "dltwv"]
    ]
    lead_df_collapsed = collapse_lead_df(lead_df)

    # global measurements
    global_df = pd.read_csv(f"{control_path}/pre/ecg-metadata/measurement-matrix-global.csv")[
        ["ecg_id", "qtc", "avg_rr"]
    ]
    global_df["qtc500"] = global_df["qtc"] > 500

    # other resting measurements
    resting_df = pd.read_csv(f"{control_path}/pre/ecg-metadata/resting-ecg-measurements.csv")[
        ["ecg_id", "ventricular_rate", "atrial_rate", "qrs_duration", "qt_interval", "qtc_frederica"]
    ]

    # code processing
    code_df = pd.read_csv(f"{control_path}/pre/ecg-metadata/diagnosis.csv")[["ecg_id", "stmt_text"]]
    code_df_collapsed = process_code_df(code_df)

    # merge: every table is keyed by ecg_id; the key is spelled out so that a
    # column added to one of the tables later cannot silently become a join key
    group_df = (
        group_df.merge(final_df, how="left", on="ecg_id")
        .merge(cohort_df, how="left", on="ecg_id")
        .merge(lead_df_collapsed, how="left", on="ecg_id")
        .merge(global_df, how="left", on="ecg_id")
        .merge(resting_df, how="left", on="ecg_id")
        .merge(code_df_collapsed, how="left", on="ecg_id")
    )
    group_df["file_path"] = f"{control_path}/pre/ecg-waveforms/waveform-rhythm.npy"
    return group_df[CONTROL_COLS]


In [13]:
def calculate_rr_std(ecg, sampling_rate=None):
    """Return the standard deviation of the R-R intervals of the first lead.

    R peaks are located with BioSPPy's ``christov_segmenter`` applied to
    ``ecg[0]``. The R-R intervals are the differences between consecutive peak
    positions as returned by the segmenter, and the result is their standard
    deviation (not their variance), in the same units as those positions
    (presumably samples -- TODO confirm).

    Parameters
    ----------
    ecg : sequence of 1-D signals
        Only the first element (the first lead) is used.
    sampling_rate : float, optional
        Forwarded to ``christov_segmenter`` when given. When omitted, the
        segmenter is called without it and uses its own default.
        NOTE(review): this notebook declares ECG_SAMPLE_RATE = 500 Hz; check the
        segmenter's default against that and pass ``sampling_rate`` if they differ.

    Returns
    -------
    float
        Standard deviation of the R-R intervals, or NaN when fewer than two
        R peaks were found (no interval can be formed).
    """
    # Imported lazily so the rest of the notebook runs without BioSPPy installed.
    from biosppy.signals import ecg as becg

    if sampling_rate is None:
        rpeaks = becg.christov_segmenter(ecg[0])[0]
    else:
        rpeaks = becg.christov_segmenter(ecg[0], sampling_rate=sampling_rate)[0]

    if len(rpeaks) < 2:
        # np.std of an empty array is NaN too, but emits a RuntimeWarning.
        return np.nan

    return np.std(np.diff(rpeaks))

In [14]:
    # Build cardiac_df: the study-group rows followed by the rows of every
    # control sub-group, then derive the SCD outcome labels and save the table.
    study_path = os.path.join(DATA_DIR, "study-group")
    control_root = os.path.join(DATA_DIR, "control-group")

    # assign() returns a new frame, so the frame returned by the helper is not written to
    study_group_df = process_study_df(study_path).assign(group="study")
    group_frames = [study_group_df]

    # establish control group by year; sorted() makes the row order reproducible
    # because os.listdir returns entries in arbitrary order
    for year in sorted(os.listdir(control_root)):
        control_path = os.path.join(control_root, year)
        # skip ecg-cohort.csv and any other non-directory or hidden entry
        if year.startswith(".") or not os.path.isdir(control_path):
            continue
        control_group_df = process_control_df(control_path).assign(group=year)
        group_frames.append(control_group_df)

    # concatenate once (instead of growing the frame inside the loop) and put
    # "group" first; columns missing from the control frames are filled with NaN
    cardiac_df = pd.concat(group_frames, ignore_index=True)[["group"] + COLS]

    # define SCD outcome as any study who had an SCD more than one day out from visit and within 1 year,
    # to be as consistent with Halland variable definition as possible (no visit information available in this dataset)
    is_cardiac_study = (
        (cardiac_df["group"] == "study")
        & (cardiac_df["cause_of_cardiac_arrest"] == "cardiac event")
    )
    after_visit = cardiac_df["days_to_event"] >= 2
    cardiac_df["scd1"] = is_cardiac_study & after_visit & (cardiac_df["days_to_event"] <= 365)
    # repeat for scd2 (within 2 years)
    cardiac_df["scd2"] = is_cardiac_study & after_visit & (cardiac_df["days_to_event"] <= 730)
    # repeat for scd (no maximum time between ECG reading and cardiac event)
    cardiac_df["scd"] = is_cardiac_study & after_visit

    # missing flags count as False; the explicit cast keeps the columns boolean
    # without relying on fillna's deprecated object-dtype downcasting
    flag_cols = ["lbbb", "qtc500", "stelev", "stdown", "dltwv"]
    cardiac_df[flag_cols] = cardiac_df[flag_cols].fillna(False).astype(bool)

    # age is stored as text; keep the first run of digits (raw string avoids an invalid "\d" escape)
    cardiac_df["age"] = cardiac_df["age"].str.extract(r"(\d+)", expand=False).astype(float)
    cardiac_df.to_csv("cardiac_df.csv", index=False)


  lead_df = pd.read_csv(f"{study_path}/ecg-metadata/measurement-matrix-per-lead.csv")[


In [15]:
# Display the combined study + control metadata table.
cardiac_df

Unnamed: 0,group,patient_ngsci_id,ecg_id,npy_index,days_to_event,file_path,age,sex,qtc500,lbbb,...,qrs_duration,qt_interval,qtc,qtc_frederica,avg_rr,cause_of_cardiac_arrest,VT/VF,scd1,scd2,scd
0,study,100005,e4a8dfe722c4686695265586d8b5b6bc,2,-1.0,/home/ngsci/datasets/arrest-ntuh-ecg/v1/study-...,81.0,female,False,False,...,90,374,436.0,415,734.0,cardiac event,False,False,False,False
1,study,100062,0ae546edc7a3c6c8e2b5f5a3b75b8e04,6,25.0,/home/ngsci/datasets/arrest-ntuh-ecg/v1/study-...,90.0,male,False,False,...,104,452,436.0,442,1062.0,cardiac event,False,True,True,True
2,study,100065,57684bedf6a0a85719c8e1259865ed3d,9,11.0,/home/ngsci/datasets/arrest-ntuh-ecg/v1/study-...,61.0,male,True,False,...,174,620,620.0,620,998.0,cardiac event,False,True,True,True
3,study,100066,287d0f46b4ed42b092cad9df775eca17,12,-1.0,/home/ngsci/datasets/arrest-ntuh-ecg/v1/study-...,79.0,male,True,False,...,162,440,507.0,484,750.0,cardiac event,False,False,False,False
4,study,100067,623a71fefa3402d9a4ad729f0ed27e2c,15,-1.0,/home/ngsci/datasets/arrest-ntuh-ecg/v1/study-...,83.0,male,False,False,...,86,340,440.0,404,594.0,infectious event,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7010,2018,1827871993,96bf60631096ceea0f232af70c403bfb,1405,-2679.0,/home/ngsci/datasets/arrest-ntuh-ecg/v1/contro...,51.0,male,False,False,...,98,368,447.0,419,674.0,,,False,False,False
7011,2018,1836101994,8e655408b5022499cb13a47a3ec56479,1406,12844.0,/home/ngsci/datasets/arrest-ntuh-ecg/v1/contro...,73.0,male,False,False,...,82,384,411.0,402,872.0,,,False,False,False
7012,2018,1828901995,83a0b16c8d4660656976ffc2a0c73ab8,1407,8335.0,/home/ngsci/datasets/arrest-ntuh-ecg/v1/contro...,71.0,female,False,False,...,92,440,446.0,445,970.0,,,False,False,False
7013,2018,1831321998,aabd57c5bacce494e40499a7f91e486f,1408,-20062.0,/home/ngsci/datasets/arrest-ntuh-ecg/v1/contro...,88.0,male,False,False,...,94,404,463.0,443,760.0,,,False,False,False


## Load data

Using the `file_path` and `npy_index` columns of `cardiac_df`, load the waveform of every ECG, then select the label columns.

In [40]:
# Load the ecg waveform: one (leads x samples) slab per row of cardiac_df, in row order

# samples per lead, from the config constants (10 s x 500 Hz = 5000)
n_samples = ECG_CAPTURE_TIME * ECG_SAMPLE_RATE
file_paths = cardiac_df['file_path'].unique()
ecg_waveform = np.zeros([len(cardiac_df), len(LEAD_ORDER), n_samples])

for file_path in file_paths:
    # load the data
    ecg_data = np.load(file_path)

    # rows of cardiac_df whose waveform lives in this file, as a positional
    # boolean mask, so the assignment does not depend on cardiac_df's index labels
    in_file = (cardiac_df['file_path'] == file_path).to_numpy()
    data_info = cardiac_df[in_file]
    ecg_waveform[in_file] = ecg_data[data_info['npy_index'].to_numpy().astype(int)]

In [55]:
# Select the three SCD outcome columns as the label table

scd_label = cardiac_df.loc[:, ['scd', 'scd1', 'scd2']]
scd_label

## Save data

In [57]:
# Write the metadata, labels and waveforms to ./data, creating the directory
# first so the cell also works in a fresh checkout.
os.makedirs('data', exist_ok=True)
cardiac_df.to_csv('data/meta_data.csv', index=False)
scd_label.to_csv('data/scd_label.csv', index=False)
np.save('data/ECG_waveform.npy', ecg_waveform)

In [1]:
# Environment check: list the GPU devices visible to TensorFlow.
import tensorflow as tf
tf.config.list_physical_devices('GPU')

2023-12-13 23:54:18.106836: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


[]

In [2]:
# Choose the PyTorch device: CUDA when it is available, otherwise the CPU,
# and print the choice.
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Device: {device}", flush=True)

Device: cpu


In [3]:
# Direct check of whether PyTorch can use CUDA.
torch.cuda.is_available()

False