In [1]:
import numpy as np
import h5py
import os
import sys
import inspect

# Add the parent of the working directory to sys.path so the project packages
# (`data`, `constants`) can be imported.
# The working directory is used instead of the current frame's file name: inside
# a notebook the frame's "file" is a synthetic cell name whose location depends on
# the kernel version, whereas the relative '../data/...' paths used below already
# assume the notebook's own folder is the working directory.
current_dir = os.getcwd()
parent_dir = os.path.dirname(current_dir)
# Guard so that re-running this cell does not pile up duplicate entries.
if parent_dir not in sys.path:
    sys.path.insert(0, parent_dir)


from data.data_utils import *
from constants import *

In [3]:
# Directory holding the preprocessed feature files (*.h5) that are read below.
# The default is the original cluster location; set the EEG_OUTPUT_DIR
# environment variable to point somewhere else without editing the notebook.
output_dir = os.environ.get('EEG_OUTPUT_DIR', '/share/pi/rubin/siyitang/eeg/input/nonOverlap')

In [11]:
# Seed NumPy's global random generator with a fixed value so that runs are repeatable.
SEED = 123
np.random.seed(SEED)

# Test preprocessing code

In [7]:
file_dir = '../data/train_seizure_files.txt'
# Product of the sizes of Fs, Ws and Os (names supplied by the star imports);
# used below as the number of DENSE_PARAMS entries applied to every seizure.
TRAIN_INCREASES = len(Fs) * len(Ws) * len(Os)

# Only the first element returned by parseTxtFiles is used here.
train_file_tuples = parseTxtFiles(file_dir, num_folds=None, fold_idx=None, cross_val=False)[0]

# Compute features for train set, keyed by "<edf file name>_<seizure index>"
features = {}

# Smoke test: only the first two entries of the list are processed.
for idx in range(2):
    curr_file_name, seizure_class, seizure_idx = train_file_tuples[idx]

    # NOTE(review): pyedflib is not imported by name in the visible cells; it is
    # presumably made available by one of the star imports — confirm.
    f = pyedflib.EdfReader(curr_file_name)
    try:
        ordered_channels = getOrderedChannels(curr_file_name, False, f.getSignalLabels())
        signals = getEDFsignals(f)

        # The first reported sampling frequency is used for the index conversion.
        frequencies = getSamplingFreq(f, ordered_channels)
        freq = frequencies[0]

        # Pick the [start, end] pair of the requested seizure and convert it to
        # sample indices (presumably the times are in seconds — confirm).
        seizure_times = getSeizureTimes(curr_file_name, file_type="edf")
        seizure_times = seizure_times[seizure_idx]
        print(seizure_times)
        start_t = int(freq * seizure_times[0])
        end_t = int(freq * seizure_times[1])
        curr_signals = signals[:, start_t:end_t]
        print(curr_signals.shape)
    finally:
        # Release the EDF handle even when one of the steps above raises;
        # previously an exception left the reader open.
        f._close()

    # dense features, only for training split: one denseSampling call per
    # DENSE_PARAMS entry, passing its first three elements as parameters.
    dense_feats = [
        denseSampling(
            ordered_channels,
            curr_signals,
            DENSE_PARAMS[sampling_idx][0],
            DENSE_PARAMS[sampling_idx][1],
            DENSE_PARAMS[sampling_idx][2],
        )
        for sampling_idx in range(TRAIN_INCREASES)
    ]

    write_file_name = curr_file_name + '_' + str(seizure_idx)
    print(write_file_name)
    features[write_file_name] = dense_feats

# Persist the computed features: one HDF5 dataset per dictionary key.
# Mode 'w' creates the file, replacing any existing one with the same name.
train_h5_file = 'train_features.h5'
with h5py.File(train_h5_file, 'w') as hf:
    for dataset_name, dense_stack in features.items():
        hf.create_dataset(dataset_name, data=dense_stack)

[843.4102, 926.5781]
(34, 21290)
/share/pi/rubin/eeg/TUH/v1_4_0/train_dev_test/train/03_tcp_ar_a/065/00006514/s010_2010_05_04/00006514_s010_t005.edf_1
[27.832, 60.0]
(26, 8236)
/share/pi/rubin/eeg/TUH/v1_4_0/train_dev_test/train/03_tcp_ar_a/091/00009162/s002_2012_04_09/00009162_s002_t000.edf_0


# Check preprocessed data

In [4]:
# Path of the dev-split feature file inside output_dir.
h5_file = os.path.join(output_dir, 'dev_features.h5')

In [5]:
txt_file = '../data/devSet_seizure_files.txt'

with open(txt_file, 'r') as txt_handle:
    raw_names = txt_handle.read().splitlines()

# Build one HDF5 dataset key per record: "<field 0>_<field 2>" of each
# comma-separated line (presumably the EDF path and the seizure index, matching
# the key format written by the preprocessing cell above — confirm).
# Blank lines (e.g. a trailing empty line) are skipped instead of raising
# IndexError on the missing third field.
file_names = []
for raw_line in raw_names:
    if not raw_line.strip():
        continue
    fields = raw_line.split(',')
    file_names.append(fields[0] + '_' + fields[2])

print(len(file_names))

184


In [6]:
# Read every listed dataset fully into memory, echoing its key before the lookup.
features = []
with h5py.File(h5_file, 'r') as hf:
    for dataset_key in file_names:
        print(dataset_key)
        dataset = hf[dataset_key]
        features.append(dataset[()])

/share/pi/rubin/eeg/TUH/v1_4_0/train_dev_test/train/02_tcp_le/065/00006514/s020_2010_05_17/00006514_s020_t000.edf_0
/share/pi/rubin/eeg/TUH/v1_4_0/train_dev_test/train/02_tcp_le/065/00006514/s020_2010_05_17/00006514_s020_t000.edf_1
/share/pi/rubin/eeg/TUH/v1_4_0/train_dev_test/train/02_tcp_le/065/00006514/s020_2010_05_17/00006514_s020_t000.edf_2
/share/pi/rubin/eeg/TUH/v1_4_0/train_dev_test/train/02_tcp_le/065/00006514/s020_2010_05_17/00006514_s020_t000.edf_3
/share/pi/rubin/eeg/TUH/v1_4_0/train_dev_test/train/02_tcp_le/065/00006514/s020_2010_05_17/00006514_s020_t000.edf_4
/share/pi/rubin/eeg/TUH/v1_4_0/train_dev_test/train/02_tcp_le/065/00006514/s020_2010_05_17/00006514_s020_t000.edf_5
/share/pi/rubin/eeg/TUH/v1_4_0/train_dev_test/train/02_tcp_le/065/00006514/s020_2010_05_17/00006514_s020_t000.edf_6
/share/pi/rubin/eeg/TUH/v1_4_0/train_dev_test/train/02_tcp_le/065/00006514/s020_2010_05_17/00006514_s020_t000.edf_7
/share/pi/rubin/eeg/TUH/v1_4_0/train_dev_test/train/02_tcp_le/065/000065

In [7]:
# Sanity check: number of loaded entries, then the contents and shape of entry 100.
print(len(features))
sample_feature = features[100]
print(sample_feature)
print(sample_feature.shape)

184
[[[ 65.05184538 141.46493845  65.00930394]
  [ 65.02177202  70.93167121  64.88235868]
  [ 65.02863749  64.32903715  65.65459649]
  ...
  [ 65.02611327  87.89205577  65.55816383]
  [ 65.02323661  65.04550816  66.14565675]
  [ 65.05434756 121.81575339  64.04270862]]

 [[ 65.05184538 141.46493845  65.00930394]
  [ 65.02177202  70.93167121  64.88235868]
  [ 65.02863749  64.32903715  65.65459649]
  ...
  [ 65.02611327  87.89205577  65.55816383]
  [ 65.02323661  65.04550816  66.14565675]
  [ 65.05434756 121.81575339  64.04270862]]

 [[ 65.05184538 141.46493845  65.00930394]
  [ 65.02177202  70.93167121  64.88235868]
  [ 65.02863749  64.32903715  65.65459649]
  ...
  [ 65.02611327  87.89205577  65.55816383]
  [ 65.02323661  65.04550816  66.14565675]
  [ 65.05434756 121.81575339  64.04270862]]

 ...

 [[ 65.03998464 133.97474284  68.14731076]
  [ 65.03139436  48.47431655  64.97333101]
  [ 65.02692187  50.55200221  65.96481979]
  ...
  [ 65.02260542  70.35945     66.75518268]
  [ 65.0256541

In [19]:
# Path of the train-split feature file inside output_dir.
h5_file = os.path.join(output_dir, 'train_features.h5')

In [20]:
txt_file = '../data/trainSet_seizure_files.txt'

with open(txt_file, 'r') as txt_handle:
    raw_names = txt_handle.read().splitlines()

# Build one HDF5 dataset key per record: "<field 0>_<field 2>" of each
# comma-separated line (presumably the EDF path and the seizure index — confirm).
# Blank lines (e.g. a trailing empty line) are skipped instead of raising
# IndexError on the missing third field.
file_names = []
for raw_line in raw_names:
    if not raw_line.strip():
        continue
    fields = raw_line.split(',')
    file_names.append(fields[0] + '_' + fields[2])

print(len(file_names))

1141


In [None]:
# Read every listed dataset fully into memory, echoing its key before the lookup.
features = []
with h5py.File(h5_file, 'r') as hf:
    for dataset_key in file_names:
        print(dataset_key)
        dataset = hf[dataset_key]
        features.append(dataset[()])

/share/pi/rubin/eeg/TUH/v1_4_0/train_dev_test/train/02_tcp_le/005/00000529/s003_2003_10_21/00000529_s003_t001.edf_0
/share/pi/rubin/eeg/TUH/v1_4_0/train_dev_test/train/02_tcp_le/005/00000529/s003_2003_10_21/00000529_s003_t001.edf_1
/share/pi/rubin/eeg/TUH/v1_4_0/train_dev_test/train/02_tcp_le/005/00000529/s003_2003_10_21/00000529_s003_t001.edf_2
/share/pi/rubin/eeg/TUH/v1_4_0/train_dev_test/train/01_tcp_ar/092/00009232/s004_2012_10_11/00009232_s004_t010.edf_0
/share/pi/rubin/eeg/TUH/v1_4_0/train_dev_test/train/02_tcp_le/032/00003208/s001_2006_09_08/00003208_s001_t002.edf_0
/share/pi/rubin/eeg/TUH/v1_4_0/train_dev_test/train/02_tcp_le/032/00003208/s001_2006_09_08/00003208_s001_t002.edf_1
/share/pi/rubin/eeg/TUH/v1_4_0/train_dev_test/train/02_tcp_le/032/00003208/s001_2006_09_08/00003208_s001_t002.edf_2
/share/pi/rubin/eeg/TUH/v1_4_0/train_dev_test/train/02_tcp_le/032/00003208/s001_2006_09_08/00003208_s001_t001.edf_0
/share/pi/rubin/eeg/TUH/v1_4_0/train_dev_test/train/01_tcp_ar/080/000080

In [12]:
# Sanity check: number of loaded entries, then the contents and shape of entry 100.
print(len(features))
sample_feature = features[100]
print(sample_feature)
print(sample_feature.shape)

1141
[[[[ 91.86765937  92.12261043  89.48238296]
   [ 91.8546929  125.43317674  93.32283974]
   [ 91.84124241  76.8590473   91.11183931]
   ...
   [ 91.82512987 131.50876885  93.35017298]
   [ 91.85507305 120.55686456  94.99990849]
   [ 91.85744023 147.0714447   93.63291903]]

  [[ 91.86765937  92.12261043  89.48238296]
   [ 91.8546929  125.43317674  93.32283974]
   [ 91.84124241  76.8590473   91.11183931]
   ...
   [ 91.82512987 131.50876885  93.35017298]
   [ 91.85507305 120.55686456  94.99990849]
   [ 91.85744023 147.0714447   93.63291903]]

  [[ 91.86765937  92.12261043  89.48238296]
   [ 91.8546929  125.43317674  93.32283974]
   [ 91.84124241  76.8590473   91.11183931]
   ...
   [ 91.82512987 131.50876885  93.35017298]
   [ 91.85507305 120.55686456  94.99990849]
   [ 91.85744023 147.0714447   93.63291903]]

  ...

  [[ 91.87661332  88.51089211  90.35788541]
   [ 91.85868156 155.73205127  90.46453158]
   [ 91.85136121  83.20034003  88.79655349]
   ...
   [ 91.82684105 140.04037639 

In [13]:
# Path of the test-split feature file inside output_dir.
h5_file = os.path.join(output_dir, 'test_features.h5')

In [14]:
txt_file = '../data/testSet_seizure_files.txt'

with open(txt_file, 'r') as txt_handle:
    raw_names = txt_handle.read().splitlines()

# Build one HDF5 dataset key per record: "<field 0>_<field 2>" of each
# comma-separated line (presumably the EDF path and the seizure index — confirm).
# Blank lines (e.g. a trailing empty line) are skipped instead of raising
# IndexError on the missing third field.
file_names = []
for raw_line in raw_names:
    if not raw_line.strip():
        continue
    fields = raw_line.split(',')
    file_names.append(fields[0] + '_' + fields[2])

print(len(file_names))

684


In [15]:
# Read every listed dataset fully into memory, in the order given by file_names.
features = []
with h5py.File(h5_file, 'r') as hf:
    features.extend(hf[dataset_key][()] for dataset_key in file_names)

In [16]:
# Sanity check: number of loaded entries, then the contents and shape of the first one.
print(len(features))
first_feature = features[0]
print(first_feature)
print(first_feature.shape)

684
[[[ 82.95858613 151.70096     82.86615202]
  [ 82.90950827  66.07365638  82.04279841]
  [ 82.92296041  75.57398823  82.97331795]
  ...
  [ 82.90388931  96.28672107  82.50205332]
  [ 82.9048614   66.7106778   84.2017308 ]
  [ 82.97036345 115.54516984  83.76769289]]

 [[ 82.95858613 151.70096     82.86615202]
  [ 82.90950827  66.07365638  82.04279841]
  [ 82.92296041  75.57398823  82.97331795]
  ...
  [ 82.90388931  96.28672107  82.50205332]
  [ 82.9048614   66.7106778   84.2017308 ]
  [ 82.97036345 115.54516984  83.76769289]]

 [[ 82.95858613 151.70096     82.86615202]
  [ 82.90950827  66.07365638  82.04279841]
  [ 82.92296041  75.57398823  82.97331795]
  ...
  [ 82.90388931  96.28672107  82.50205332]
  [ 82.9048614   66.7106778   84.2017308 ]
  [ 82.97036345 115.54516984  83.76769289]]

 ...

 [[ 82.98385821  90.86761955  83.16947565]
  [ 82.89691095  91.92434619  82.51678197]
  [ 82.90392962  83.0510114   82.67647144]
  ...
  [ 82.90218102  87.14469952  82.2605251 ]
  [ 82.9056200

# Test preprocessed data for cross-validation

In [12]:
# Path of the fold-4 cross-validation training feature file inside output_dir.
h5_file = os.path.join(output_dir, 'cv/fold4_train_features.h5')

In [13]:
txt_file = '../data/fold4_trainSet_seizure_files.txt'

with open(txt_file, 'r') as txt_handle:
    raw_names = txt_handle.read().splitlines()

# Build one HDF5 dataset key per record: "<field 0>_<field 2>" of each
# comma-separated line (presumably the EDF path and the seizure index — confirm).
# Blank lines (e.g. a trailing empty line) are skipped instead of raising
# IndexError on the missing third field.
file_names = []
for raw_line in raw_names:
    if not raw_line.strip():
        continue
    fields = raw_line.split(',')
    file_names.append(fields[0] + '_' + fields[2])

print(len(file_names))

1610


In [14]:
# Read every listed dataset fully into memory, in the order given by file_names.
features = []
with h5py.File(h5_file, 'r') as hf:
    features.extend(hf[dataset_key][()] for dataset_key in file_names)

In [15]:
# Sanity check: number of loaded entries, then the contents and shape of entry 100.
print(len(features))
sample_feature = features[100]
print(sample_feature)
print(sample_feature.shape)

1610
[[[[122.76379739  17.06427974 119.45861067]
   [122.63797966 140.6480516  120.68684545]
   [122.63219273  99.54053544 123.82340431]
   ...
   [122.6240302  125.2705667  123.47154113]
   [122.65729487  61.13298547 122.25554749]
   [122.72312167 123.07766271 126.41050316]]

  [[122.76379739  17.06427974 119.45861067]
   [122.63797966 140.6480516  120.68684545]
   [122.63219273  99.54053544 123.82340431]
   ...
   [122.6240302  125.2705667  123.47154113]
   [122.65729487  61.13298547 122.25554749]
   [122.72312167 123.07766271 126.41050316]]

  [[122.76379739  17.06427974 119.45861067]
   [122.63797966 140.6480516  120.68684545]
   [122.63219273  99.54053544 123.82340431]
   ...
   [122.6240302  125.2705667  123.47154113]
   [122.65729487  61.13298547 122.25554749]
   [122.72312167 123.07766271 126.41050316]]

  ...

  [[122.74625154 124.73931649 122.09038512]
   [122.63710911 147.86498482 121.17449037]
   [122.6375461  111.2283219  123.45539649]
   ...
   [122.62973337 139.0750397  