# Overview
    Convert training and testing data (.dat, .apn, etc.) to .pkl files.
    Folder 'raw': Raw data (paired ecg, apn, etc.).
    Folder 'processed': Same data, with problematic minutes (AT THE BEGINNING OF DATA) deleted.

In [1]:
import numpy as np
import load
import pickle
import matplotlib.pyplot as plt
import importlib

# 1. Pre-process training data

In [12]:
# Names of the training entries: a01..a20, then b01..b05, then c01..c10.
train_full = [
    f'{group}{number:02d}'
    for group, last in (('a', 20), ('b', 5), ('c', 10))
    for number in range(1, last + 1)
]
# One-entry list; not referenced by the cells visible in this notebook
# (presumably kept for quick trial runs -- confirm before removing).
train_test = ['a01']
# Prefix prepended to each entry name when loading.
data_folder = 'data/training/'
# Lower / upper ratio bounds handed to load.clean_data.
ratio_lb = 0.6
ratio_ub = 2
# Passed to load.clean_data; when truthy the processing loop shows a plot per
# entry, when falsy it prints a one-line summary instead.
diagPlot = 0

## 1.1 Raw 

In [8]:
# Write one pickle per training entry into 'data/raw/' without trimming.
output_folder = 'data/raw/'
for file in train_full:
    # Only the first and third values returned by get_apn_train are used.
    apn, _, t_apn = load.get_apn_train(data_folder + file)
    # The last element of t_apn is forwarded as the fourth argument
    # (presumably the end time of the recording -- confirm in load).
    t_last = t_apn[-1]
    apn, ecg, r_peaks, atfs = load.single_file_res(data_folder, file, apn, t_last)

    # Save result
    res = dict(ecg=ecg, apn=apn, r_peaks=r_peaks, atfs=atfs)
    with open(f'{output_folder}{file}.pkl', 'wb') as pkl_file:
        pickle.dump(res, pkl_file)

## 1.2 Processed

In [14]:
# Write one pickle per training entry into 'data/processed/', dropping
# everything that precedes the first index flagged valid by load.clean_data.
output_folder = 'data/processed/'
for file in train_full:
    # Only the first and third values returned by get_apn_train are used.
    apn, _, t_apn = load.get_apn_train(data_folder + file)
    t_last = t_apn[-1]
    apn, ecg, r_peaks, atfs = load.single_file_res(data_folder, file, apn, t_last)

    # Delete segments with abnormal s.d.
    idx_valid = load.clean_data(ecg, ratio_lb, ratio_ub, diagPlot)
    if diagPlot:
        plt.title(file)
        plt.show()

    # First position where idx_valid is truthy. Only the leading part is
    # cut; invalid entries after this position are kept.
    # NOTE(review): raises IndexError when idx_valid has no truthy entry.
    valid_positions = np.where(idx_valid)[0]
    s_idx = valid_positions[0]
    keep = slice(s_idx, None)
    ecg, r_peaks, atfs = (arr[keep, :] for arr in (ecg, r_peaks, atfs))
    apn = apn[keep]

    # Save result
    res = dict(ecg=ecg, apn=apn, r_peaks=r_peaks, atfs=atfs)
    with open(f'{output_folder}{file}.pkl', 'wb') as pkl_file:
        pickle.dump(res, pkl_file)

    if not diagPlot:
        print(f'{file}: {s_idx} minutes discarded')


a01: 0 minutes discarded
a02: 0 minutes discarded
a03: 0 minutes discarded
a04: 0 minutes discarded
a05: 0 minutes discarded
a06: 0 minutes discarded
a07: 0 minutes discarded
a08: 0 minutes discarded
a09: 0 minutes discarded
a10: 0 minutes discarded
a11: 0 minutes discarded
a12: 0 minutes discarded
a13: 0 minutes discarded
a14: 0 minutes discarded
a15: 1 minutes discarded
a16: 0 minutes discarded
a17: 0 minutes discarded
a18: 0 minutes discarded
a19: 0 minutes discarded
a20: 0 minutes discarded
b01: 0 minutes discarded
b02: 0 minutes discarded
b03: 0 minutes discarded
b04: 9 minutes discarded
b05: 0 minutes discarded
c01: 10 minutes discarded
c02: 10 minutes discarded
c03: 0 minutes discarded
c04: 20 minutes discarded
c05: 0 minutes discarded
c06: 1 minutes discarded
c07: 0 minutes discarded
c08: 0 minutes discarded
c09: 13 minutes discarded
c10: 11 minutes discarded


In [18]:
# TBC: Figure of problematic signals

# 2. Pre-process testing data

In [15]:
# Names of the testing entries: x01..x35.
test_full = list(map('x{:02d}'.format, range(1, 36)))
# One-entry list; not referenced by the cells visible in this notebook
# (presumably kept for quick trial runs -- confirm before removing).
test_test = ['x01']
# Prefix prepended to each entry name when loading.
data_folder = 'data/testing/'
# Lower / upper ratio bounds handed to load.clean_data.
ratio_lb = 0.6
ratio_ub = 2
# Passed to load.clean_data; when truthy the processing loop shows a plot per
# entry, when falsy it prints a one-line summary instead.
diagPlot = 0

## 2.1 Process event-2-answers

In [58]:
import re
import itertools

# Turn the 'event-2-answers' text file into a list of boolean arrays (one per
# blank-line-separated chunk, True for 'A', False for 'N') and cache that list
# as 'event-2-answers.pkl' in the same folder.
with open(data_folder + 'event-2-answers') as f:
    lines = f.read()

# Chunks are separated by a blank line. Empty chunks are filtered out rather
# than dropping the last item unconditionally, so the final chunk is kept
# even when the file does not end with a blank line.
lines = [chunk for chunk in lines.split(sep='\n\n') if chunk.strip()]
lines = [line[3: ] for line in lines] # Delete leading filename

d_apn = {'N': 0, 'A': 1}
apns = []
for line in lines:
    # Only runs of 'A'/'N' are picked up; every other character in the chunk
    # is ignored by the pattern. The runs are flattened to single letters.
    labels = itertools.chain.from_iterable(re.findall(r'[AN]+', line))
    apns.append(np.array([d_apn[label] for label in labels], dtype=bool))

with open(data_folder + 'event-2-answers.pkl', 'wb') as f:
    pickle.dump(apns, f)

## 2.2 Raw

In [10]:
# Write one pickle per testing entry into 'data/raw/' without trimming, using
# the label arrays cached in 'event-2-answers.pkl'.
output_folder = 'data/raw/'
with open(data_folder + 'event-2-answers.pkl', 'rb') as answers_file:
    apns = pickle.load(answers_file)

# Entry names and label arrays are paired by position; zip stops at the
# shorter of the two sequences.
for file, apn in zip(test_full, apns):
    n_labels = len(apn)
    t_end = 60 * (n_labels - 1)
    apn, ecg, r_peaks, atfs = load.single_file_res(data_folder, file, apn, t_end)

    # Save result
    res = dict(ecg=ecg, apn=apn, r_peaks=r_peaks, atfs=atfs)
    with open(f'{output_folder}{file}.pkl', 'wb') as pkl_file:
        pickle.dump(res, pkl_file)

## 2.3 Processed 

In [16]:
# Write one pickle per testing entry into 'data/processed/', dropping
# everything that precedes the first index flagged valid by load.clean_data.
output_folder = 'data/processed/'
with open(data_folder + 'event-2-answers.pkl', 'rb') as answers_file:
    apns = pickle.load(answers_file)

# Entry names and label arrays are paired by position; zip stops at the
# shorter of the two sequences.
for file, apn in zip(test_full, apns):
    n_labels = len(apn)
    t_end = 60 * (n_labels - 1)
    apn, ecg, r_peaks, atfs = load.single_file_res(data_folder, file, apn, t_end)

    # Delete segments with abnormal s.d.
    idx_valid = load.clean_data(ecg, ratio_lb, ratio_ub, diagPlot)
    if diagPlot:
        plt.title(file)
        plt.show()

    # First position where idx_valid is truthy. Only the leading part is
    # cut; invalid entries after this position are kept.
    # NOTE(review): raises IndexError when idx_valid has no truthy entry.
    valid_positions = np.where(idx_valid)[0]
    s_idx = valid_positions[0]
    keep = slice(s_idx, None)
    ecg, r_peaks, atfs = (arr[keep, :] for arr in (ecg, r_peaks, atfs))
    apn = apn[keep]

    # Save result
    res = dict(ecg=ecg, apn=apn, r_peaks=r_peaks, atfs=atfs)
    with open(f'{output_folder}{file}.pkl', 'wb') as pkl_file:
        pickle.dump(res, pkl_file)

    if not diagPlot:
        print(f'{file}: {s_idx} minutes discarded')

x01: 10 minutes discarded
x02: 0 minutes discarded
x03: 1 minutes discarded
x04: 9 minutes discarded
x05: 0 minutes discarded
x06: 5 minutes discarded
x07: 2 minutes discarded
x08: 0 minutes discarded
x09: 0 minutes discarded
x10: 0 minutes discarded
x11: 0 minutes discarded
x12: 0 minutes discarded
x13: 0 minutes discarded
x14: 0 minutes discarded
x15: 0 minutes discarded
x16: 0 minutes discarded
x17: 0 minutes discarded
x18: 15 minutes discarded
x19: 0 minutes discarded
x20: 1 minutes discarded
x21: 0 minutes discarded
x22: 11 minutes discarded
x23: 0 minutes discarded
x24: 14 minutes discarded
x25: 0 minutes discarded
x26: 1 minutes discarded
x27: 0 minutes discarded
x28: 10 minutes discarded
x29: 18 minutes discarded
x30: 0 minutes discarded
x31: 0 minutes discarded
x32: 0 minutes discarded
x33: 0 minutes discarded
x34: 0 minutes discarded
x35: 11 minutes discarded
