In [24]:
import mne
import glob
import os
import pandas as pd
import numpy as np
from pprint import pprint
import re

In [3]:
# Constantes: Dataset
patient = '01'
sample_rate = 256

# Otras constantes
window_seconds = 30

In [4]:
info = f'./data/chb{patient}/chb{patient}-summary.txt'
eegs = glob.glob(f'./chb{patient}/*.edf')
eegs.sort()

In [5]:
with open(info, 'r') as f:
    summary = f.read()

summary_parts = summary.split('\n\n')[:-1]
info_eegs_raw = filter(lambda part: part.startswith('File'), summary_parts)

info_eegs = list()

for file_info in info_eegs_raw:
    info_eeg = dict()
    for row in file_info.split('\n'):
        key, value = row.split(': ')
        value.strip()
        info_eeg[key] = value
    info_eegs.append(info_eeg)

In [6]:
info_eegs[3]

{'File Name': 'chb01_04.edf',
 'File Start Time': '14:43:12',
 'File End Time': '15:43:12',
 'Number of Seizures in File': '1',
 'Seizure Start Time': '1467 seconds',
 'Seizure End Time': '1494 seconds'}

In [7]:
channels = ['F7-T7', 'T7-P7', 'P7-T7', 'P7-O1', 'P8-O2', 'T8-P8-1', 'F8-T8']

i = 3
info = info_eegs[i]
eeg_file = f"./data/chb{patient}/{info['File Name']}"
eeg = mne.io.read_raw_edf(eeg_file)

raw_eeg = eeg.get_data(picks=channels)
print(raw_eeg.shape)

Extracting EDF parameters from /home/heladioac/Documentos/Proyectos/Commercial-EEG/data/chb01/chb01_04.edf...
EDF file detected
Setting channel info structure...
Creating raw.info structure...


  eeg = mne.io.read_raw_edf(eeg_file)


(7, 921600)


In [8]:
if int(info['Number of Seizures in File']) > 0:
    start = info['Seizure Start Time']
    start = int(start.split(' ')[0])
    end = info['Seizure End Time']
    end = int(end.split(' ')[0])
    
print(start, end)

1467 1494


In [9]:
frames = window_seconds * sample_rate
overlay = int(frames/2)
length = raw_eeg.shape[1]

In [10]:
dfs = []

for i in range(length//overlay - 1):
    start = i * overlay
    end = start + frames
    data = raw_eeg[:, start:end].T
    frame = np.arange(start, end).reshape(-1, 1)
    window = np.full((end - start, 1), i)
    columns = channels.copy()
    columns.append('frame')
    columns.append('window')

    df_i = pd.DataFrame(
        data=np.hstack((data, frame, window)),
        columns=columns)
    
    dfs.append(df_i)
    
df = pd.concat(dfs)

In [11]:
df['seizure'] = 0

n_seizures = int(info['Number of Seizures in File'])
if n_seizures > 0:
    seizures_keys = filter(lambda k: k.startswith('Seizure'), info)
    seizures = [[]*n_seizures]
    for i, key in enumerate(seizures_keys):
        value_seconds = int(info[key].split(' ')[0])
        value_frames = value_seconds * sample_rate
        seizures[i//2].append(value_frames)
    
    for s_start, s_end in seizures:
        df.loc[(df['frame'] >= s_start) & (df['frame'] <= s_end), 'seizure'] = 1


In [12]:
df.loc[df['seizure'] == 1].describe()

Unnamed: 0,F7-T7,T7-P7,P7-T7,P7-O1,P8-O2,T8-P8-1,F8-T8,frame,window,seizure
count,13826.0,13826.0,13826.0,13826.0,13826.0,13826.0,13826.0,13826.0,13826.0,13826.0
mean,3.613019e-07,2.481496e-07,1.425708e-07,3.596628e-07,1e-06,2.159136e-06,-1e-06,379008.0,97.722335,1.0
std,7.571068e-05,7.020268e-05,7.020268e-05,4.829287e-05,0.000131,0.0001270557,0.000169,1995.683357,0.803205,0.0
min,-0.0002936264,-0.0002834676,-0.0003166789,-0.0001592186,-0.000389,-0.0005010989,-0.000671,375552.0,96.0,1.0
25%,-4.630037e-05,-4.356532e-05,-4.004884e-05,-3.028083e-05,-7.6e-05,-8.224664e-05,-9.2e-05,377280.0,97.0,1.0
50%,-1.953602e-07,-1.367521e-06,1.758242e-06,5.860806e-07,-8e-06,-5.860806e-07,2e-06,379008.0,98.0,1.0
75%,4.708181e-05,4.043956e-05,4.395604e-05,3.184371e-05,6.3e-05,8.107448e-05,9.2e-05,380736.0,98.0,1.0
max,0.0002940171,0.0003170696,0.0002838584,0.0001928205,0.000504,0.0005968254,0.000561,382464.0,99.0,1.0


In [13]:
df.memory_usage()

Index      14684160
F7-T7      14684160
T7-P7      14684160
P7-T7      14684160
P7-O1      14684160
P8-O2      14684160
T8-P8-1    14684160
F8-T8      14684160
frame      14684160
window     14684160
seizure    14684160
dtype: int64

In [14]:
columns = channels.copy()
columns
types = dict()
for col in columns:
    types[col] = 'Float16'

In [15]:
types.update({'seizure': bool, 'window': 'Int16', 'frame': 'Int32'})

print(types)

{'F7-T7': 'Float16', 'T7-P7': 'Float16', 'P7-T7': 'Float16', 'P7-O1': 'Float16', 'P8-O2': 'Float16', 'T8-P8-1': 'Float16', 'F8-T8': 'Float16', 'seizure': <class 'bool'>, 'window': 'Int16', 'frame': 'Int32'}


In [16]:
df = df.astype(types)
df.memory_usage()

Index      14684160
F7-T7       3671040
T7-P7       3671040
P7-T7       3671040
P7-O1       3671040
P8-O2       3671040
T8-P8-1     3671040
F8-T8       3671040
frame       9177600
window      5506560
seizure     1835520
dtype: int64

In [128]:
with open('Electroencefalografos.txt', 'r') as f:
    eeg_data_txt = f.read()
    
eeg_data = eeg_data_txt.split('\n\n')[1:]
pprint(eeg_data)

['\t1- DSI-7\n'
 '\t\tElectrodos: F3, F4, C3, C4, Pz, P3 y P4.\n'
 '\t\tPares:\n'
 '\t\t\tF3-C3\n'
 '\t\t\tF4-C4\n'
 '\t\t\tC3-P3\n'
 '\t\t\tC4-P4\n'
 '\t\tSampling rate:\n'
 '\t\t\t300 - 600 Hz',
 '\t2- DSI-4\n'
 '\t\tElectrodos: F7, F8, Fp1 y Fp2. \n'
 '\t\tPares:\n'
 '\t\t\tFp1-F7\n'
 '\t\t\tFp2-F8\n'
 '\t\tSampling rate:\n'
 '\t\t\t300 - 600 Hz',
 '\t3- EMOTIV EPOC+\n'
 '\t\tElectrodos: AF3, F7, F3, FC5, T7, P7, O1, O2, P8, T8, FC6, F4, F8 y AF4\n'
 '\t\tPares:\n'
 '\t\t\tF7-T7\n'
 '\t\t\tT7-P7\n'
 '\t\t\tP7-T7\n'
 '\t\t\tP7-O1\n'
 '\t\t\tP8-O2\n'
 '\t\t\tT8-P8-1\n'
 '\t\t\tF8-T8\n'
 '\t\tSampling rate:\n'
 '\t\t\t2048 internal downsampled to 128 SPS or 256 SPS (samples per second)\n'
 '\t\tSampling method: \n'
 '\t\t\tSequential sampling, single ADC?',
 '\t4- EMOTIV INSIGHT\n'
 '\t\tElectrodos: AF3, AF4, T7, T8 y Pz. \n'
 '\t\tPares:\n'
 '\t\tSampling rate:\n'
 '\t\t\t128 samples per second per channel',
 '\t5- Bitbrain Air\n'
 '\t\tElectrodos: Fp1, Fp2, AF7, AF8, PO7, PO8, O1, O2

In [129]:
prog_name = re.compile('^\d- (\S+)')

devices_info = list()

for device_info_txt in eeg_data:
    rows = device_info_txt.split('\n')
    rows = list(map(lambda x: x.strip(), rows))

    match = prog_name.search(rows[0])
    name = match.group(1)
    
    pairs_start = rows.index('Pares:') + 1
    pairs_end = rows.index('Sampling rate:')
    
    pairs = rows[pairs_start: pairs_end]
    
    device_info = {
        'name': name,
        'channels': pairs,
    }
    devices_info.append(device_info)

pprint(devices_info)

[{'channels': ['F3-C3', 'F4-C4', 'C3-P3', 'C4-P4'], 'name': 'DSI-7'},
 {'channels': ['Fp1-F7', 'Fp2-F8'], 'name': 'DSI-4'},
 {'channels': ['F7-T7', 'T7-P7', 'P7-T7', 'P7-O1', 'P8-O2', 'T8-P8-1', 'F8-T8'],
  'name': 'EMOTIV'},
 {'channels': [], 'name': 'EMOTIV'},
 {'channels': ['Fp1-AF7', 'Fp2-AF8'], 'name': 'Bitbrain'}]
