In [1]:
import numpy as np
import os
import matplotlib.pyplot as plt
import wfdb
from random import sample, randint
import pandas as pd
import pickle


# Debug switch: when truthy, the main script plots each clean/noisy snippet pair.
PLOT = 0


def return_noise_snippet(noise_type, bw_noise, ma_noise, em_noise, multi_noise):
    """Return a randomly positioned 1024-sample window of the requested noise.

    Args:
        noise_type: Selector compared with ``==``: 0 -> ``bw_noise``,
            1 -> ``ma_noise``, 2 -> ``em_noise``; any other value selects
            ``multi_noise``.
        bw_noise: Object whose ``p_signal`` attribute is a 2-D array indexed
            as ``[sample, channel]``; channel 0 or 1 is read.
        ma_noise: Same layout as ``bw_noise``.
        em_noise: Same layout as ``bw_noise``.
        multi_noise: 1-D sequence of pre-mixed noise samples.

    Returns:
        A slice of 1024 consecutive samples taken from the selected source.

    Raises:
        ValueError: Propagated from ``randint`` when the selected source has
            fewer than 1025 samples.
    """
    # The channel is drawn first on every call (even when the combined trace,
    # which has no channel axis, ends up being used) so the sequence of random
    # draws is the same regardless of noise_type.
    channel = randint(0, 1)

    for code, record in enumerate((bw_noise, ma_noise, em_noise)):
        if noise_type == code:
            signal = record.p_signal
            # Upper bound len - 1025 guarantees start + 1024 stays in range.
            start = randint(0, len(signal) - 1025)
            return signal[start:start + 1024, channel]

    # Any other selector value: use the single-channel combined noise trace.
    start = randint(0, len(multi_noise) - 1025)
    return multi_noise[start:start + 1024]


if __name__ == '__main__':
    # Build a cache of (noisy, clean) ECG snippet pairs: clean snippets come
    # from the arrhythmia database, noise from the noise stress test database.
    # Both directories are resolved relative to the current working directory.
    fp_nst = 'mit-bih-noise-stress-test-database-1.0.0/'
    fp_arr = 'mit-bih-arrhythmia-database-1.0.0/'

    # 1. Read noise profiles BW (baseline wander), MA (muscle artifact), EM (electrode motion)
    bw_noise = wfdb.rdrecord(fp_nst + 'bw')
    ma_noise = wfdb.rdrecord(fp_nst + 'ma')
    em_noise = wfdb.rdrecord(fp_nst + 'em')
    # Combined noise: equal-weight average of both channels of all three
    # records (six traces, hence 1/6 each).
    w = 1/6  # normalization weight for combined noise signals
    multi_noise = w*bw_noise.p_signal[:, 0] + w*ma_noise.p_signal[:, 0] + w*em_noise.p_signal[:, 0] + \
                  w*bw_noise.p_signal[:, 1] + w*ma_noise.p_signal[:, 1] + w*em_noise.p_signal[:, 1]

    # 2. Read in clean ecg signal from MIT-BIH arrhythmia database
    # Record names are derived by dropping the last four characters of every
    # directory entry; `discard` lists the truncated names to skip
    # (presumably non-record files such as ANNOTATORS / RECORDS -- verify
    # against the directory listing).
    discard = ['ANNOTA', 'REC', 'SHA256SUMS', 'mitd', 'x_m', '102-0']
    recs = os.listdir(fp_arr)
    rec_names = [rec[:-4] for rec in recs]
    rec_names = np.unique(rec_names)
    rec_names = [r for r in rec_names if r not in discard]

    # 3. For each record, grab 200 segments of length 1024
    # check that sampling frequency matches. pick channel that is Lead II.
    # each cached frame: column 'noisy_data' = model input, column 'clean_truth' = target
    df_output = []
    for rec in rec_names:
        print('On rec ' + rec)
        data = wfdb.rdrecord(fp_arr + rec)
        if data.fs != 360:
            print(rec + ' has FS = ' + str(data.fs))
            continue

        ldII = np.where(np.array(data.sig_name) == 'MLII')[0]
        if len(ldII) == 0:
            print(rec + ' does not contain Lead II')
            continue
        ldII = ldII[0]

        # Select 200 distinct start positions for sections of length 1024 samples
        start_indices = sample(range(len(data.p_signal) - 1025), 200)
        for s in start_indices:
            # Slice the MLII channel located above; the lead is not guaranteed
            # to be stored in column 0 of every record.
            data_snippet = data.p_signal[s:s+1024, ldII]
            # Randomly choose noise: 0. bw 1. ma 2. em 3. combination
            noise_type = randint(0, 3)
            noise_snippet = return_noise_snippet(noise_type, bw_noise, ma_noise, em_noise, multi_noise)
            data_snippet_with_noise = data_snippet + noise_snippet

            if PLOT:
                # NOTE(review): no new figure is created per snippet, so all
                # curves are drawn onto the current axes -- confirm intended.
                plt.plot(data_snippet_with_noise, label='Noisy Data')
                plt.plot(data_snippet, label='Clean Data')
                plt.title(rec + ' noise type ' + str(noise_type))
                plt.xlabel('Samples')
                plt.ylabel('mV')
                plt.legend()

            # Data set will return a list of data frames (each data frame is one training or testing instance)
            d = {'noisy_data': data_snippet_with_noise, 'clean_truth': data_snippet}
            df = pd.DataFrame(d)
            df_output.append(df)

    dump_path = 'EECE7397_cache'
    print(len(df_output))
    # Context manager ensures the cache file is flushed and closed.
    with open(dump_path, "wb") as cache_file:
        pickle.dump(df_output, cache_file)


ModuleNotFoundError: No module named 'wfdb'