This notebook is a tutorial to show how to manage the preprocessed data for sleep stage classification

In [37]:
import numpy as np 
import gzip as gz 
from tqdm.notebook import tqdm
import torch as th 
import pickle

In [2]:
datad="/home/allauzen/dev/edf/5-cassette"


In [3]:
fp = gz.open(datad+'/SC4671G0.npz.gz','rb')
data = np.load(fp,allow_pickle=True) 


In [4]:
# To see what it contains 
data.files

['x', 'y', 'fs', 'ch_label', 'header_raw', 'header_annotation']

In [5]:
# The data are stored in 'x' and 'y'
x = data['x']
y = data['y']

In [6]:
print(x.shape, y.shape)

(1968, 600, 4) (1968,)


In [7]:
# The header is the copy of the original one 
data["header_raw"]


array({'local_subject_id': 'X F X Female_87yr', 'local_recording_id': 'Startdate 07-AUG-1991 X X X', 'date_time': '2091-08-07 16:00:00', 'EDF+': False, 'contiguous': True, 'n_records': 2780, 'record_length': 30.0, 'n_channels': 7, 'label': ['EEG Fpz-Cz', 'EEG Pz-Oz', 'EOG horizontal', 'Resp oro-nasal', 'EMG submental', 'Temp rectal', 'Event marker'], 'transducer_type': ['Ag-AgCl electrodes', 'Ag-AgCl electrodes', 'Ag-AgCl electrodes', 'Oral-nasal thermistors', 'Ag-AgCl electrodes', 'Rectal thermistor', 'Marker button'], 'units': ['uV', 'uV', 'uV', '', 'uV', '', ''], 'physical_min': array([ -207.,  -179., -1055., -2048.,    -5.,     0., -2047.]), 'physical_max': array([ 207.,  179., 1055., 2047.,    5.,   30., 2048.]), 'digital_min': array([ -2048.,  -2048.,  -2048.,  -2048.,  -2500., -32768.,  -2047.]), 'digital_max': array([ 2047.,  2047.,  2047.,  2047.,  2500., 32767.,  2048.]), 'prefiltering': ['HP:0.5Hz LP:100Hz [enhanced cassette BW]', 'HP:0.5Hz LP:100Hz [enhanced cassette BW]', 

In [8]:
# The four channels in x are 'EEG Fpz-Cz', 'EEG Pz-Oz', 'EOG horizontal', 'EMG submental'
# You can take more if you modify the preparation script and rerun it. 
# To get a list all the files:
import os 
import glob
fnames = glob.glob(os.path.join(datad, "*npz.gz"))
print(fnames[:10]) # print the first 10

['/home/allauzen/dev/edf/5-cassette/SC4182E0.npz.gz', '/home/allauzen/dev/edf/5-cassette/SC4772G0.npz.gz', '/home/allauzen/dev/edf/5-cassette/SC4822G0.npz.gz', '/home/allauzen/dev/edf/5-cassette/SC4341F0.npz.gz', '/home/allauzen/dev/edf/5-cassette/SC4751E0.npz.gz', '/home/allauzen/dev/edf/5-cassette/SC4811G0.npz.gz', '/home/allauzen/dev/edf/5-cassette/SC4472F0.npz.gz', '/home/allauzen/dev/edf/5-cassette/SC4122E0.npz.gz', '/home/allauzen/dev/edf/5-cassette/SC4121E0.npz.gz', '/home/allauzen/dev/edf/5-cassette/SC4241E0.npz.gz']


In [27]:
devpart = 10
xtrain , xvalid = None , None 
ytrain , yvalid = None , None 
# If you take all the data you dhould end with 
# 
for fn in tqdm(fnames): 
    fp = gz.open(fn,'rb')
    data = np.load(fp,allow_pickle=False) # for now, don't care about headers
    x = data['x'][:,:,2] # Take only the EOG
    y = data['y'] # Take the labels 
    idx = np.arange(x.shape[0])
    np.random.shuffle(idx)
    devlim = x.shape[0]//devpart
    devpart = 10
    idx = np.arange(x.shape[0])
    np.random.shuffle(idx)
    devlim = x.shape[0]//devpart
    if xtrain is None: 
        xtrain = np.zeros((1,x.shape[1]))
        xvalid = np.zeros((1,x.shape[1]))
        ytrain , yvalid = np.zeros(1) , np.zeros(1)
    xvalid = np.concatenate((xvalid,x[idx[:devlim]]), axis=0)
    yvalid = np.concatenate((yvalid,y[idx[:devlim]]), axis=0)
    xtrain = np.concatenate((xtrain,x[idx[devlim:]]), axis=0)
    ytrain = np.concatenate((ytrain,y[idx[devlim:]]), axis=0)
    del x,y

  0%|          | 0/153 [00:00<?, ?it/s]

In [32]:
print(xtrain.shape, xvalid.shape)
print(ytrain.shape, yvalid.shape)


torch.Size([175995, 600]) torch.Size([19484, 600])
torch.Size([175995]) torch.Size([19484])


In [29]:
# clean the first dummy example 
xtrain , xvalid = xtrain[1:] , xvalid[1:] 
ytrain , yvalid = ytrain[1:] , yvalid[1:] 
print(xtrain.shape, xvalid.shape)
print(ytrain.shape, yvalid.shape)


(175995, 600) (19484, 600)
(175995,) (19484,)


In [31]:
# In Torch version 
xtrain, xvalid = th.FloatTensor(xtrain), th.FloatTensor(xvalid)
ytrain, yvalid = th.IntTensor(ytrain), th.IntTensor(yvalid)

In [33]:
outf="./cassette-th-data.pck"
fp = open(outf,"wb")
pickle.dump((xtrain , xvalid , ytrain , yvalid), fp)

In [36]:
!ls -lh ./cassette-th-data.pck

-rw-rw-r-- 1 allauzen allauzen 449M mars  11 12:52 ./cassette-th-data.pck
