In [9]:
import os
import pandas as pd
import numpy as np
import neurokit2 as nk
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from scipy import interpolate

In [2]:
data_path = '../ECGDataDenoised'
meta_path = '../ECGDataDenoised/Diagnostics.xlsx'

In [3]:
meta = pd.read_excel(meta_path)
# meta

In [4]:
records_name = meta['FileName'].values
len(records_name)

10646

In [5]:
meta['Rhythm'].value_counts()

Rhythm
SB       3889
SR       1826
AFIB     1780
ST       1568
SVT       587
AF        445
SA        399
AT        121
AVNRT      16
AVRT        8
SAAWR       7
Name: count, dtype: int64

In [6]:
label_dict = {'SB': 0, # SB
              'AFIB': 1, 'AF': 1, # AF
              'ST': 2, 'SVT': 2, 'AT': 2, 'AVNRT': 2, 'AVRT': 2, 'SAAWR': 2, # GSVT
              'SR': 3, 'SA': 3} # SR
labels = meta['Rhythm'].replace(label_dict).values
len(labels)

  labels = meta['Rhythm'].replace(label_dict).values


10646

In [None]:
def resample(data, freq1=500, freq2=250, kind='linear'):
    t = np.linspace(1, len(data), len(data))
    f = interpolate.interp1d(t, data, kind=kind)
    t_new = np.linspace(1, len(data), int(len(data)/freq1 * freq2))
    new_data = f(t_new)
    return new_data

 
def normalize(data):
    scaler = StandardScaler()
    norm_data = scaler.fit_transform(data)
    return norm_data


def R_Peaks(ecg_data):
    # get R Peak positions
    pos = []
    # get R Peak intervals
    trial_interval = []
    for ch in range(ecg_data.shape[1]):
        cleaned_ecg = nk.ecg_clean(ecg_data[:, ch], sampling_rate=250, method='neurokit')
        signals, _ = nk.ecg_peaks(cleaned_ecg, sampling_rate=250, correct_artifacts=False)
        peaks = signals[signals['ECG_R_Peaks']==1].index.to_list()
        pos.append(peaks)
        channel_interval = []
        for i in range(len(peaks)-1):
            channel_interval.append(peaks[i+1] - peaks[i])
        trial_interval.append(channel_interval)
        
    df_peaks = pd.DataFrame(pos) # [num of the R-Peaks of a channel]
    df = pd.DataFrame(trial_interval).T
    med = df.median()
    return df, med, df_peaks

In [16]:
med_intervals = []
na_trails = []
allzero_trials = []
except_trials = []
for record in os.listdir(data_path):
    if record.endswith('.csv'):
        data = pd.read_csv(os.path.join(data_path, record)).values
        if np.all(data == 0):
            allzero_trials.append(record)
            print(f'All zero: {record}')
            continue
        if np.any(np.isnan(data)):
            na_trails.append(record)
            print(f'Nan: {record}')
            continue
        trial = []
        for ch in range(data.shape[1]):
            trial.append(resample(data[:, ch], freq1=500, freq2=250, kind='linear'))
        trial = np.array(trial).T
        trial_norm = normalize(trial)
        try:
            _, med, _ = R_Peaks(trial_norm)
            med_intervals.append(med.to_list())
        except IndexError:
            except_trials.append(record)
            print(f'IndexError: {record}')

All zero: MUSE_20181222_204243_08000.csv
All zero: MUSE_20181222_204157_58000.csv
All zero: MUSE_20181222_204236_34000.csv
All zero: MUSE_20181222_204249_88000.csv
All zero: MUSE_20181222_204222_63000.csv
Nan: MUSE_20180113_180425_75000.csv
All zero: MUSE_20181222_204217_03000.csv
All zero: MUSE_20181222_204154_20000.csv
Nan: MUSE_20180712_151351_36000.csv
All zero: MUSE_20181222_204240_84000.csv
All zero: MUSE_20181222_204248_77000.csv
All zero: MUSE_20181222_204226_00000.csv
All zero: MUSE_20181222_204245_36000.csv
Nan: MUSE_20180712_152022_92000.csv
All zero: MUSE_20181222_204122_52000.csv
All zero: MUSE_20181222_204156_45000.csv
All zero: MUSE_20181222_204118_08000.csv
Nan: MUSE_20180119_174843_24000.csv
All zero: MUSE_20181222_204155_31000.csv
All zero: MUSE_20181222_204140_77000.csv
All zero: MUSE_20181222_204310_31000.csv
All zero: MUSE_20181222_204212_44000.csv
All zero: MUSE_20181222_204314_78000.csv
Nan: MUSE_20180210_130454_71000.csv
All zero: MUSE_20181222_204219_27000.csv


In [17]:
len(na_trails), len(allzero_trials), len(except_trials)

(18, 40, 0)

In [26]:
df_med_intervals = pd.DataFrame(med_intervals).T
df_med_intervals.dropna(axis=1, inplace=True)
df_med_intervals

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,10577,10578,10579,10580,10581,10582,10583,10584,10585,10586
0,182.0,203.0,79.0,147.0,203.0,279.0,207.5,171.0,270.0,274.0,...,243.0,131.0,134.0,141.0,217.0,168.0,239.0,265.0,262.5,147.0
1,182.0,240.0,80.0,147.0,203.0,278.5,207.5,171.5,269.0,273.5,...,243.0,130.0,134.0,141.0,217.0,168.0,240.0,265.0,263.0,148.0
2,182.0,160.5,79.0,149.0,203.0,278.0,213.5,172.5,261.0,276.0,...,243.0,130.0,130.0,141.0,213.5,170.0,238.5,262.5,263.0,147.0
3,181.5,198.0,80.0,147.0,203.0,283.0,204.5,212.0,271.0,279.0,...,243.0,128.0,133.5,141.0,218.0,168.0,240.0,263.5,284.0,164.0
4,181.0,186.0,79.0,149.0,202.0,279.0,207.5,202.0,270.0,274.0,...,243.0,130.0,134.0,141.0,217.0,169.0,238.5,265.0,263.5,147.0
5,180.5,198.5,80.0,148.0,203.0,278.0,207.0,171.5,269.0,273.5,...,243.0,131.0,131.0,141.0,217.0,170.0,240.0,262.5,263.0,148.0
6,182.0,177.0,80.0,147.0,203.0,278.0,208.5,225.0,272.5,273.5,...,243.0,130.0,146.0,140.5,218.0,174.0,240.0,262.0,263.5,350.0
7,182.0,240.0,79.5,147.0,202.0,279.0,207.5,216.0,272.5,273.5,...,243.0,130.0,134.0,141.0,215.0,168.0,240.0,263.0,263.0,466.5
8,181.0,240.0,80.0,147.0,202.0,279.0,207.0,171.0,270.0,273.5,...,243.0,130.0,134.0,141.0,217.0,167.5,240.0,262.0,263.0,150.0
9,182.0,241.0,80.0,147.0,203.0,278.5,207.5,164.5,270.0,273.5,...,243.0,130.0,134.0,141.0,217.0,169.0,241.0,265.5,263.0,147.5


In [27]:
med_all = df_med_intervals.median()
med_all[med_all <= 300].shape, med_all[med_all <= 300].max()

((9709,), 300.0)

In [29]:
new_df_med_intervals = df_med_intervals.loc[:, med_all[med_all <= 300].index]
new_df_med_intervals.columns = np.arange(new_df_med_intervals.shape[1])
new_df_med_intervals

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,9699,9700,9701,9702,9703,9704,9705,9706,9707,9708
0,182.0,203.0,79.0,147.0,203.0,279.0,207.5,171.0,270.0,274.0,...,243.0,131.0,134.0,141.0,217.0,168.0,239.0,265.0,262.5,147.0
1,182.0,240.0,80.0,147.0,203.0,278.5,207.5,171.5,269.0,273.5,...,243.0,130.0,134.0,141.0,217.0,168.0,240.0,265.0,263.0,148.0
2,182.0,160.5,79.0,149.0,203.0,278.0,213.5,172.5,261.0,276.0,...,243.0,130.0,130.0,141.0,213.5,170.0,238.5,262.5,263.0,147.0
3,181.5,198.0,80.0,147.0,203.0,283.0,204.5,212.0,271.0,279.0,...,243.0,128.0,133.5,141.0,218.0,168.0,240.0,263.5,284.0,164.0
4,181.0,186.0,79.0,149.0,202.0,279.0,207.5,202.0,270.0,274.0,...,243.0,130.0,134.0,141.0,217.0,169.0,238.5,265.0,263.5,147.0
5,180.5,198.5,80.0,148.0,203.0,278.0,207.0,171.5,269.0,273.5,...,243.0,131.0,131.0,141.0,217.0,170.0,240.0,262.5,263.0,148.0
6,182.0,177.0,80.0,147.0,203.0,278.0,208.5,225.0,272.5,273.5,...,243.0,130.0,146.0,140.5,218.0,174.0,240.0,262.0,263.5,350.0
7,182.0,240.0,79.5,147.0,202.0,279.0,207.5,216.0,272.5,273.5,...,243.0,130.0,134.0,141.0,215.0,168.0,240.0,263.0,263.0,466.5
8,181.0,240.0,80.0,147.0,202.0,279.0,207.0,171.0,270.0,273.5,...,243.0,130.0,134.0,141.0,217.0,167.5,240.0,262.0,263.0,150.0
9,182.0,241.0,80.0,147.0,203.0,278.5,207.5,164.5,270.0,273.5,...,243.0,130.0,134.0,141.0,217.0,169.0,241.0,265.5,263.0,147.5


In [None]:
# split resampled trial to sample level(single heartbeat)
def trial2sample(data, max_duration=300):
    samples = []
    _, med, df_peaks = R_Peaks(data)
    trial_med = med.median()
    for i in range(df_peaks.shape[1]):
        RP_pos = df_peaks.iloc[:, i].median()
        beat = data[max(0,int(RP_pos)-int(trial_med/2)):min(int(RP_pos)+int(trial_med/2),data.shape[0]), :]
        left_zero_num = int((int(max_duration)-beat.shape[0])/2)
        padding_left = np.zeros([left_zero_num, data.shape[1]])
        padding_right = np.zeros([int(max_duration)-left_zero_num-beat.shape[0], data.shape[1]])
        beat = np.concatenate([padding_left, beat, padding_right], axis=0)
        samples.append(beat)
    return samples 


# concat samples to segmentations
def sample2trial(samples, size=10):
    trials = []
    index = 0
    while index <= len(samples)-size:
        beat = samples[index]
        for i in range(index+1, index+size):
            beat = np.vstack((beat, samples[i]))
        trials.append(beat)
        index += size
    return trials