In [8]:
import os

# helper function
def exists(path):
    """Check whether ``path`` exists, announcing a cache hit when it does.

    Parameters
    ----------
    path : str or path-like
        File or directory to look for.

    Returns
    -------
    bool
        The result of ``os.path.exists(path)``.

    Side effects
    ------------
    When the path exists, a notice is printed telling the user that the
    existing artefact is reused and has to be deleted by hand to obtain a
    fresh copy. Nothing is printed when the path is missing.
    """
    val = os.path.exists(path)
    if val:
        print(f'{path} already exists. Using cached. Delete it manually to receive it again!')
    return val

In [9]:
# Import
import os
from dataloader import BatchDataloader
import torch
import torch.nn as nn
import numpy as np
from tqdm.notebook import trange, tqdm
import h5py
from torch.utils.data import TensorDataset, random_split, DataLoader
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score, average_precision_score, accuracy_score, f1_score
from utils import pgd_attack, train_loop, eval_loop, train_loop_apgd
import utils
import ecg_plot
from models import ResNet1d, ResNet1dGELU
import ast
%matplotlib inline

In [10]:
# Select the compute device: the GPU when CUDA is available, the CPU otherwise
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Report the choice through tqdm so the message goes through tqdm's own writer
tqdm.write("Use device: {device:}\n".format(device=device))

Use device: cuda



In [11]:
# Number of samples per mini-batch.
# NOTE(review): presumably consumed by the data loaders / training loops in
# later cells — its use is not visible in this part of the notebook.
batch_size = 32

In [12]:
# Load the data
path_to_csv = 'code-15/exams.csv'
path_to_h5 = 'code-15/full.h5'

df = pd.read_csv(path_to_csv, index_col='exam_id')

# Open the source h5 file. It is deliberately left open at the end of this
# cell: `traces_ids` and `traces` are h5py datasets that can only be read
# while the file they belong to is open.
h5_file = h5py.File(path_to_h5, 'r')
traces_ids = h5_file['exam_id']
traces = h5_file['tracings']

# Only keep the rows of the csv whose exam id is present in the h5 file
df = df[df.index.isin(traces_ids)]

# Sort the dataframe in trace order, so that row i of df describes traces[i]
df = df.reindex(traces_ids)

# The boolean masks built below are indexed by trace position, so the
# dataframe and the tracings must have exactly the same length.
assert len(df) == len(traces), (
    f'{len(df)} metadata rows but {len(traces)} tracings in {path_to_h5}'
)

# Divide the data into train and test set (10% / 90% of the patients),
# without overlapping patient ids. The split is done per patient, so the
# share of exams in each set is only approximately 10/90.
patient_ids = df['patient_id'].unique()

np.random.seed(42)
np.random.shuffle(patient_ids)

train_size = int(0.1 * len(patient_ids))
train_patient_ids = patient_ids[:train_size]
test_patient_ids = patient_ids[train_size:]

# Row masks over df (and therefore over the traces). They are computed once
# and reused both for the dataframes and for the h5 export.
train_indices_to_keep = df['patient_id'].isin(train_patient_ids).to_numpy()
test_indices_to_keep = df['patient_id'].isin(test_patient_ids).to_numpy()

train_df = df[train_indices_to_keep]
test_df = df[test_indices_to_keep]

# Len of train and test
no_train = len(train_df)
no_test = len(test_df)

print(f'Len of train: {no_train}')
print(f'Len of test: {no_test}')

# Exam ids belonging to each split, in trace order
train_traces_indeces = traces_ids[train_indices_to_keep]
test_traces_indeces = traces_ids[test_indices_to_keep]

# Shape of a single trace, taken from the dataset metadata so that no trace
# has to be read (and so that an empty source file does not raise).
trace_shape = traces.shape[1:]

# Split the h5 file into train and test h5 files. The context manager closes
# both output files even if the copy loop below is interrupted.
with h5py.File('code-15/train.h5', 'w') as train_h5_file, \
        h5py.File('code-15/test.h5', 'w') as test_h5_file:

    train_h5_file.create_dataset('exam_id', data=train_traces_indeces, dtype='i8')
    test_h5_file.create_dataset('exam_id', data=test_traces_indeces, dtype='i8')

    # Create the destination datasets up front so that every output file
    # contains a 'tracings' dataset, even when its split is empty.
    train_to_save = train_h5_file.create_dataset('tracings', (no_train,) + trace_shape, dtype='f8')
    test_to_save = test_h5_file.create_dataset('tracings', (no_test,) + trace_shape, dtype='f8')

    # Next free row in each destination dataset
    train_ind = 0
    test_ind = 0

    # Stream the traces one at a time so the full dataset never has to be
    # held in memory.
    for i, trace in tqdm(enumerate(traces), total=len(traces)):

        if train_indices_to_keep[i]:
            train_to_save[train_ind] = trace
            train_ind += 1

        if test_indices_to_keep[i]:
            test_to_save[test_ind] = trace
            test_ind += 1


Len of train: 34583
Len of test: 311196


  0%|          | 0/345779 [00:00<?, ?it/s]

In [15]:
# Sanity check: re-open each split file read-only and count its exam ids and
# tracings. The `with` block closes the file even if a dataset is missing.
for split_name in ('train', 'test'):
    with h5py.File(f'code-15/{split_name}.h5', 'r') as split_file:
        n_exam_ids = len(split_file['exam_id'])
        n_tracings = len(split_file['tracings'])

    print(f'Number of exam ids in {split_name}.h5: {n_exam_ids}')
    print(f'Number of tracings in {split_name}.h5: {n_tracings}')

    # Every exam id must come with exactly one tracing
    assert n_exam_ids == n_tracings, (
        f'{split_name}.h5 holds {n_exam_ids} exam ids but {n_tracings} tracings'
    )

Number of exam ids in train.h5: 34583
Number of tracings in train.h5: 34583
Number of exam ids in test.h5: 311196
Number of tracings in test.h5: 311196
