In [21]:
import numpy as np
import torch
from sklearn.preprocessing import MinMaxScaler
from typing import List
import json
import random


In [9]:
import os
root = '/home/david/Desktop/projects/thesis/downstream_classification/'
data_folder = '/data/mit-bih/processed'

os.listdir(root+data_folder)

['08219_interval_14_labeled_N.npy',
 '08434_interval_4_labeled_N.npy',
 '04908_interval_13_labeled_AFIB.npy',
 '04043_interval_106_labeled_N.npy',
 '04043_interval_16_labeled_N.npy',
 '04043_interval_100_labeled_N.npy',
 '04936_interval_27_labeled_AFIB.npy',
 '08219_interval_16_labeled_N.npy',
 '06426_interval_33_labeled_AFIB.npy',
 '05261_interval_19_labeled_AFIB.npy',
 '04043_interval_73_labeled_AFIB.npy',
 '04043_interval_57_labeled_AFIB.npy',
 '08378_interval_7_labeled_AFIB.npy',
 '08219_interval_36_labeled_N.npy',
 '05121_interval_22_labeled_N.npy',
 '06995_interval_9_labeled_N.npy',
 '04043_interval_78_labeled_N.npy',
 '05121_interval_9_labeled_AFIB.npy',
 '04936_interval_21_labeled_AFIB.npy',
 '04936_interval_53_labeled_AFIB.npy',
 '04126_interval_5_labeled_AFIB.npy',
 '05121_interval_11_labeled_AFIB.npy',
 '04043_interval_46_labeled_N.npy',
 '04043_interval_53_labeled_AFIB.npy',
 '04936_interval_35_labeled_AFIB.npy',
 '06426_interval_36_labeled_N.npy',
 '04043_interval_19_label

In [67]:
class MIT_BIH_DataGen():
    def __init__(self, data_path, readings, batch_size, window_size = 450):
        
        assert (batch_size % 10) == 0, 'batch_size has to be a multiplication of 10'

        # store to self
        self.readings = readings
        self.batch_size = int(batch_size/10)
        self.window_size = window_size
        self.data_path = data_path
        
        # get numpy files
        self.npy_files = [x for x in os.listdir(data_path) if x.endswith('.npy')]

        # get relevant files
        self.N_files = []
        self.AF_files = []
        # self.test_N_files = []
        # self.test_AF_files = []

        for reading_number in self.readings:
            intervals_for_reading_all = [x for x in self.npy_files if x.split('_')[0] == reading_number]

            intervals_for_reading_N = [x for x in intervals_for_reading_all if x.endswith('_N.npy')]
            intervals_for_reading_AF = [x for x in intervals_for_reading_all if x.endswith('_AFIB.npy')]
            
            self.N_files += intervals_for_reading_N
            self.AF_files += intervals_for_reading_AF
        
        # shuffle lists
        random.shuffle(self.N_files)
        random.shuffle(self.AF_files)
        
        print('train_N_files[:5]:',self.N_files[:5])
        print('train_AF_files[:5]:',self.AF_files[:5])
        print('train_N_files Len:',len(self.N_files))
        print('train_AF_files Len:',len(self.AF_files))

        self.N_files = np.array(self.N_files)
        self.AF_files = np.array(self.AF_files)

    def __getitem__(self):
        """
        ASSUMPTIONS:
        - balanced batch each time
        - sample form five files each time, later versions can change that
        """
        batch_files_N  = np.random.choice(self.N_files, size=5, replace=True)
        batch_files_AF = np.random.choice(self.AF_files, size=5, replace=True)

        signals_in_batch = []
        targets_in_batch = []
        for filename in batch_files_N:
            interval = np.load(self.data_path+f'/{filename}')
            interval_length = len(interval)
            
            # draw window starts
            indices_to_draw_from = np.arange(interval_length-self.window_size-1)
            starts_of_signals = np.random.choice(indices_to_draw_from, size=self.batch_size, replace=False)
            
            # save signals
            signals_in_batch += [interval[start:(start+self.window_size)] for start in starts_of_signals]
            targets_in_batch += [0 for x in range(self.batch_size)]
        
        for filename in batch_files_AF:
            interval = np.load(self.data_path+f'/{filename}')
            interval_length = len(interval)
            
            # draw window starts
            indices_to_draw_from = np.arange(interval_length-self.window_size-1)
            starts_of_signals = np.random.choice(indices_to_draw_from, size=self.batch_size, replace=False)
            
            # save signals
            signals_in_batch += [interval[start:(start+self.window_size)] for start in starts_of_signals]
            targets_in_batch += [1 for _x in range(self.batch_size)]
        
        # combind signals with targets
        signals_and_targets_in_batch = [(signal, target) for (signal, target) in zip(signals_in_batch,targets_in_batch)]
        
        # shuffle
        random.shuffle(signals_and_targets_in_batch)

        # convert to np
        X = np.array([x[0] for x in signals_and_targets_in_batch])
        y = np.array([x[1] for x in signals_and_targets_in_batch])
        
        # transpose X
        X = np.transpose(X,(0,2,1))
        return X,y

# get splits
with open(root+data_folder+"/splits.json") as json_file:
    splits = json.load(json_file)

temp = MIT_BIH_DataGen(data_path=root+data_folder, readings=splits['train'], batch_size=10)
X,y = temp.__getitem__()

print('Shapes:', X.shape, y.shape)
print('y:', y)

train_N_files[:5]: ['08219_interval_30_labeled_N.npy', '04936_interval_46_labeled_N.npy', '04043_interval_130_labeled_N.npy', '04043_interval_90_labeled_N.npy', '06426_interval_43_labeled_N.npy']
train_AF_files[:5]: ['08219_interval_31_labeled_AFIB.npy', '04043_interval_117_labeled_AFIB.npy', '04043_interval_9_labeled_AFIB.npy', '05121_interval_7_labeled_AFIB.npy', '04043_interval_115_labeled_AFIB.npy']
train_N_files Len: 255
train_AF_files Len: 260
Shapes: (10, 2, 450) (10,)
y: [1 0 0 1 0 1 0 1 0 1]


# Compare with SHL Data Gen

In [62]:
os.chdir('/home/david/Desktop/projects/thesis/downstream_classification')
from dataloader.DataGenerator import DataGenerator
train_generator = DataGenerator(
    data_folder_path='./data/individual-signals/',
    metadata_file_path='./data/combined_data/metadata_balanced_by_death.csv',
    targets=['AF'],
    sample='train',
    seed=123,
    batch_size=10,
    shuffle = True
)
X_shl, y_shl, signum_shl = train_generator.__getitem__(1)
print('Shapes:', X_shl.shape, y_shl.shape)
print('y:', y_shl)

Shapes: (10, 12, 450) (10, 1)
y: [[0.]
 [1.]
 [0.]
 [0.]
 [0.]
 [1.]
 [0.]
 [0.]
 [0.]
 [0.]]
