In [2]:
import os
import pandas as pd
import numpy as np
from utils import normalize, resample, R_Peaks, trial2sample, sample2trial

In [2]:
# select the unziped data path and diagnostics file path
data_path = '../ECGDataDenoised'
meta_path = '../ECGDataDenoised/Diagnostics.xlsx'

In [3]:
meta = pd.read_excel(meta_path)
# meta

In [4]:
records_name = meta['FileName'].values
len(records_name)

10646

In [5]:
meta['Rhythm'].value_counts()

Rhythm
SB       3889
SR       1826
AFIB     1780
ST       1568
SVT       587
AF        445
SA        399
AT        121
AVNRT      16
AVRT        8
SAAWR       7
Name: count, dtype: int64

In [6]:
# group the rhythms into 4 classes
label_dict = {'SB': 0, # SB
              'AFIB': 1, 'AF': 1, # AF
              'ST': 2, 'SVT': 2, 'AT': 2, 'AVNRT': 2, 'AVRT': 2, 'SAAWR': 2, # GSVT
              'SR': 3, 'SA': 3} # SR
labels = meta[['FileName', 'Rhythm']].replace(label_dict)
# labels

  labels = meta[['FileName', 'Rhythm']].replace(label_dict)


In [7]:
med_intervals = []
na_trails = []
allzero_trials = []
except_trials = []
for record in os.listdir(data_path):
    if record.endswith('.csv'):
        data = pd.read_csv(os.path.join(data_path, record), header=None).values
        if np.all(data == 0):
            allzero_trials.append(record)
            print(f'All zero: {record}')
            continue
        if np.any(np.isnan(data)):
            na_trails.append(record)
            print(f'Nan: {record}')
            continue
        trial = []
        for ch in range(data.shape[1]):
            trial.append(resample(data[:, ch], freq1=500, freq2=250, kind='linear'))
        trial = np.array(trial).T
        trial_norm = normalize(trial)
        try:
            _, med, _ = R_Peaks(trial_norm)
            med_intervals.append(med.to_list())
        except IndexError:
            except_trials.append(record)
            print(f'IndexError: {record}')

All zero: MUSE_20181222_204243_08000.csv
All zero: MUSE_20181222_204157_58000.csv
All zero: MUSE_20181222_204236_34000.csv
All zero: MUSE_20181222_204249_88000.csv
All zero: MUSE_20181222_204222_63000.csv
Nan: MUSE_20180113_180425_75000.csv
All zero: MUSE_20181222_204217_03000.csv
All zero: MUSE_20181222_204154_20000.csv
Nan: MUSE_20180712_151351_36000.csv
All zero: MUSE_20181222_204240_84000.csv
All zero: MUSE_20181222_204248_77000.csv
All zero: MUSE_20181222_204226_00000.csv
All zero: MUSE_20181222_204245_36000.csv
Nan: MUSE_20180712_152022_92000.csv
All zero: MUSE_20181222_204122_52000.csv
All zero: MUSE_20181222_204156_45000.csv
All zero: MUSE_20181222_204118_08000.csv
Nan: MUSE_20180119_174843_24000.csv
All zero: MUSE_20181222_204155_31000.csv
All zero: MUSE_20181222_204140_77000.csv
All zero: MUSE_20181222_204310_31000.csv
All zero: MUSE_20181222_204212_44000.csv
All zero: MUSE_20181222_204314_78000.csv
Nan: MUSE_20180210_130454_71000.csv
All zero: MUSE_20181222_204219_27000.csv


In [8]:
len(na_trails), len(allzero_trials), len(except_trials)

(18, 40, 0)

In [9]:
new_records_name = [r for r in records_name if r+'.csv' not in na_trails + allzero_trials + except_trials]
len(new_records_name)

10588

In [10]:
df_med_intervals = pd.DataFrame(med_intervals).T
df_med_intervals.dropna(axis=1, inplace=True)
df_med_intervals

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,10577,10578,10579,10580,10581,10582,10583,10584,10585,10586
0,182.0,202.5,80.0,148.0,203.0,279.0,207.5,171.0,269.0,274.0,...,243.0,131.0,134.0,141.0,217.0,169.0,238.5,265.0,263.0,148.0
1,181.0,241.0,80.0,147.0,203.0,278.5,207.5,171.5,270.0,273.5,...,243.0,131.0,134.0,141.0,217.0,169.0,240.0,265.5,263.0,147.0
2,182.0,160.5,80.0,149.0,203.0,278.0,214.0,172.0,262.0,277.0,...,242.0,131.0,131.0,141.0,213.5,170.0,239.0,262.0,263.0,148.0
3,182.0,198.5,80.0,148.0,203.0,283.0,205.0,217.0,271.0,279.0,...,243.0,128.0,133.5,141.0,217.0,169.0,240.0,263.0,284.0,147.5
4,182.0,186.0,79.0,149.0,202.0,280.0,208.5,202.0,270.0,274.0,...,243.0,130.0,134.0,141.0,217.0,168.0,238.5,265.0,263.0,147.0
5,181.0,198.5,80.0,148.0,203.0,278.5,207.5,172.0,269.0,274.0,...,243.0,131.0,130.0,141.0,218.0,169.0,240.0,262.5,263.0,148.0
6,182.0,175.0,80.0,147.0,203.0,278.5,208.0,225.0,272.5,273.5,...,243.0,130.0,145.0,141.0,217.0,175.0,240.0,262.5,263.0,336.0
7,182.0,214.5,79.5,147.0,203.0,279.0,208.5,217.0,272.5,274.0,...,243.0,130.0,134.0,141.0,216.0,168.0,240.0,262.5,263.0,475.0
8,182.0,240.0,80.0,147.0,202.0,279.0,207.5,171.5,269.0,274.0,...,243.0,131.0,134.0,141.0,217.0,168.5,240.0,262.5,263.0,150.0
9,182.0,241.0,80.0,147.0,202.0,278.5,207.5,164.5,270.0,273.5,...,243.0,130.0,134.0,141.0,218.0,169.0,240.0,265.5,263.0,148.0


In [11]:
# 9713 records suitable for analysis
med_all = df_med_intervals.median()
med_all[med_all <= 300].shape, med_all[med_all <= 300].max()

((9713,), 300.0)

In [12]:
# update the records
new_med_intervals = []
new_except_trials = []
med_nan_trials = []
for record in new_records_name:
    data = pd.read_csv(os.path.join(data_path, record+'.csv'), header=None).values
    trial = []
    for ch in range(data.shape[1]):
        trial.append(resample(data[:, ch], freq1=500, freq2=250, kind='linear'))
    trial = np.array(trial).T
    trial_norm = normalize(trial)
    try:
        _, med, _ = R_Peaks(trial_norm)
        if med.median() <= 300:
            if np.any(np.isnan(med)):
                med_nan_trials.append(record)
                print(f'Med Nan: {record}')
                continue
            new_med_intervals.append(med.to_list())
        else:
            new_except_trials.append(record)
            print(f'Outlider: {record}')
    except IndexError:
        except_trials.append(record)
        print(f'IndexError: {record}')

Med Nan: MUSE_20180712_160448_42000
Outlider: MUSE_20180112_133006_47000
Outlider: MUSE_20180112_072033_34000
Outlider: MUSE_20180114_121113_24000
Outlider: MUSE_20180115_130426_87000
Outlider: MUSE_20180116_130643_35000
Med Nan: MUSE_20180113_135710_28000
Outlider: MUSE_20180111_163357_09000
Outlider: MUSE_20180113_124658_83000
Outlider: MUSE_20180114_133820_34000
Med Nan: MUSE_20180210_131459_59000
Outlider: MUSE_20180113_173554_96000
Outlider: MUSE_20180112_071514_26000
Outlider: MUSE_20180118_131733_19000
Outlider: MUSE_20180113_125103_10000
Outlider: MUSE_20180116_122520_49000
Outlider: MUSE_20180118_125800_75000
Outlider: MUSE_20180118_122906_36000
Outlider: MUSE_20180112_075437_85000
Outlider: MUSE_20180115_124332_88000
Med Nan: MUSE_20180118_132425_03000
Outlider: MUSE_20180112_124231_23000
Outlider: MUSE_20180114_072231_70000
Outlider: MUSE_20180712_153518_42000
Med Nan: MUSE_20180115_123200_57000
Outlider: MUSE_20180113_134104_50000
Outlider: MUSE_20180118_120854_33000
Outlid

In [13]:
len(new_except_trials), len(med_nan_trials)

(446, 429)

In [14]:
pd.DataFrame(new_med_intervals).T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,9703,9704,9705,9706,9707,9708,9709,9710,9711,9712
0,133.0,287.0,223.5,282.5,92.0,266.5,156.0,238.5,259.0,257.5,...,111.0,77.0,86.0,93.5,99.0,88.0,87.0,78.0,148.5,107.0
1,132.5,288.0,224.0,283.0,92.0,266.5,155.0,238.0,260.0,257.5,...,111.0,77.0,86.0,93.5,99.0,88.0,87.0,78.0,149.0,106.0
2,132.5,287.0,224.5,283.0,92.0,266.5,157.0,238.0,259.0,256.0,...,111.0,77.0,86.0,93.0,99.0,88.0,87.0,78.0,224.0,107.0
3,133.0,296.0,223.5,283.5,92.0,264.0,172.0,238.0,259.0,257.5,...,111.0,77.0,86.0,93.0,99.0,88.0,87.0,79.0,149.0,107.0
4,131.5,287.0,224.0,282.5,92.0,266.0,157.0,238.0,259.0,257.0,...,111.0,77.0,86.0,93.0,99.0,89.0,87.0,78.0,148.5,107.0
5,132.5,287.0,224.0,280.0,92.0,266.5,155.0,238.0,260.0,257.0,...,111.0,77.0,86.0,93.5,99.0,88.0,87.0,78.0,149.0,106.0
6,132.5,287.0,225.5,283.0,92.0,266.0,158.0,238.5,259.0,257.5,...,111.0,77.0,86.0,93.5,99.0,88.0,87.0,78.0,149.0,107.0
7,133.0,287.0,225.5,283.0,92.0,266.5,156.0,238.5,260.0,256.0,...,111.0,77.0,86.0,93.5,99.0,88.0,87.0,78.0,149.0,107.0
8,132.0,288.0,224.5,282.5,92.0,266.5,157.0,238.0,259.0,256.0,...,111.0,77.0,86.0,93.0,99.0,89.0,87.0,78.0,149.0,107.0
9,133.0,287.0,223.5,282.5,92.0,266.5,156.0,238.5,259.0,257.5,...,111.0,77.0,86.0,93.5,99.0,88.0,87.0,78.0,149.0,107.0


In [15]:
final_records_name = [r for r in new_records_name if r not in new_except_trials + med_nan_trials]
len(final_records_name)

9713

In [16]:
# main
feature_path = '../data/chapman/feature'
if not os.path.exists(feature_path):
    os.makedirs(feature_path)
    
new_label_dict = {}
id = 1
for record in final_records_name:
    label = labels[labels['FileName']==record]['Rhythm'].values[0]
    new_label_dict[id] = label
    data = pd.read_csv(os.path.join(data_path, record+'.csv'), header=None).values
    trial = []
    for ch in range(data.shape[1]):
        trial.append(resample(data[:, ch], freq1=500, freq2=250, kind='linear'))
    trial = np.array(trial).T
    trial_norm = normalize(trial)
    samples = trial2sample(trial_norm, max_duration=300)
    trials = sample2trial(samples, size=6)
    
    if trials != []:
        trials = np.array(trials)
        print(trials.shape)
        np.save(os.path.join(feature_path, f'feature_{id:05d}'), trials)
        id += 1
        
    else:
        print(f'No sample: {record}')
    
    

(3, 1800, 12)
(1, 1800, 12)
(1, 1800, 12)
(1, 1800, 12)
(4, 1800, 12)
(1, 1800, 12)
(2, 1800, 12)
(1, 1800, 12)
(1, 1800, 12)
(1, 1800, 12)
(1, 1800, 12)
(3, 1800, 12)
(1, 1800, 12)
(1, 1800, 12)
(1, 1800, 12)
(2, 1800, 12)
(1, 1800, 12)
(1, 1800, 12)
(1, 1800, 12)
(1, 1800, 12)
(2, 1800, 12)
(1, 1800, 12)
(2, 1800, 12)
(3, 1800, 12)
(2, 1800, 12)
(1, 1800, 12)
(1, 1800, 12)
(2, 1800, 12)
(2, 1800, 12)
(2, 1800, 12)
(1, 1800, 12)
(2, 1800, 12)
(1, 1800, 12)
(3, 1800, 12)
(2, 1800, 12)
(2, 1800, 12)
(1, 1800, 12)
(3, 1800, 12)
(1, 1800, 12)
(2, 1800, 12)
(2, 1800, 12)
(2, 1800, 12)
(2, 1800, 12)
(1, 1800, 12)
(3, 1800, 12)
(4, 1800, 12)
(2, 1800, 12)
(2, 1800, 12)
(1, 1800, 12)
(2, 1800, 12)
(2, 1800, 12)
(2, 1800, 12)
(2, 1800, 12)
(1, 1800, 12)
(1, 1800, 12)
(1, 1800, 12)
(3, 1800, 12)
(2, 1800, 12)
(2, 1800, 12)
(3, 1800, 12)
(1, 1800, 12)
(1, 1800, 12)
(1, 1800, 12)
(2, 1800, 12)
(1, 1800, 12)
(1, 1800, 12)
(2, 1800, 12)
(1, 1800, 12)
(3, 1800, 12)
(1, 1800, 12)
(2, 1800, 12)
(1, 18

In [17]:
len(new_label_dict)

9713

In [18]:
label_path = '../data/chapman/label'
if not os.path.exists(label_path):
    os.mkdir(label_path)

df_label = pd.DataFrame([new_label_dict]).T
df_label = df_label.reset_index().astype('int64')
labels = df_label[[0, 'index']].values
np.save(os.path.join(label_path, 'label'), labels)