In [1]:
import numpy as np
import pandas as pd
from pathlib import Path
from pyedflib import EdfReader
from tqdm.notebook import tqdm

In [None]:
def get_seizure_period(file_location):
    '''
    Returns start time and length of the seizure stored in a `.seizures`
    annotation file.

    Byte layout follows
    https://www.mathworks.com/matlabcentral/answers/225716-how-i-can-read-chb01_03-edf-seizures-file-from-chb-mit-database-in-matlab-as-i-am-using-this-file-f

    Parameters
    ----------
    file_location : str or path-like
        Path of the `.seizures` file to read.

    Returns
    -------
    (start, length) : tuple of int
        `start` is the 16-bit value whose high byte is at offset 38 and whose
        low byte is at offset 41; `length` is the single byte at offset 49
        (so at most 255). Units are presumably seconds -- confirm against the
        dataset documentation.

    Raises
    ------
    IndexError
        If the file is shorter than 50 bytes.
    '''
    raw = Path(file_location).read_bytes()
    # Combine the two bytes arithmetically. Concatenating bin() strings drops
    # the leading zeros of the low byte, which corrupts the value whenever
    # that byte is below 128 (e.g. 0x0B, 0x04 would yield 92 instead of 2820).
    start = (raw[38] << 8) | raw[41]
    return start, raw[49]

In [None]:
def read_and_store_data(dataset_folder, sample_rate, channels):
    '''
    Read every patient's EDF recordings and stack them into one DataFrame.

    Sub-directories of `dataset_folder` whose name starts with 'chb' are
    treated as patients and visited in sorted order. Inside each one, files
    ending in 'edf' are read (sorted) with `readEdfFile`; when the next
    pending file ending in 'seizures' has a name starting with the EDF file
    name, the samples covered by that seizure get 1.0 written into the last
    column.

    Parameters
    ----------
    dataset_folder : str or path-like
        Root folder containing the patient directories.
    sample_rate : int
        Samples per time unit returned by `get_seizure_period` (presumably
        Hz with times in seconds -- confirm).
    channels : sequence
        Column labels; the last entry is the column that receives the
        seizure flag.

    Returns
    -------
    pandas.DataFrame
        All recordings of all patients concatenated row-wise, with `channels`
        as columns and a fresh 0..n-1 index.

    Notes
    -----
    * The working directory is changed while reading (so `readEdfFile` gets
      bare file names) and is always restored, even on error.
    * Only one seizure interval per EDF file is applied, because
      `get_seizure_period` returns a single (start, length) pair.
    '''
    # Local import: the notebook's visible import cell does not bring `os`
    # (or getcwd/chdir/listdir/path) into scope.
    import os

    label_col = len(channels) - 1
    # Seed with an empty float64 block so the result keeps the float dtype
    # and concatenation still works when no recording is found.
    chunks = [np.empty((0, len(channels)), dtype=np.float64)]

    initial_path = os.getcwd()
    try:
        os.chdir(dataset_folder)
        dataset_root = os.getcwd()

        patients = sorted(
            d for d in os.listdir() if os.path.isdir(d) and d.startswith('chb')
        )
        for p in patients:
            # Absolute target avoids relying on chdir('..') to get back.
            os.chdir(os.path.join(dataset_root, p))
            print('Reading data of patient', p)

            # for each patient specify the edf files and the seizure files
            entries = os.listdir()
            edf = sorted(
                f for f in entries if os.path.isfile(f) and f.endswith('edf')
            )
            seizures = sorted(
                f for f in entries if os.path.isfile(f) and f.endswith('seizures')
            )

            for e in tqdm(edf):
                # Expected to be 2-D with len(channels) columns, since it is
                # stacked under a (0, len(channels)) array.
                sigbufs = np.asarray(readEdfFile(e, channels))
                if seizures and seizures[0].startswith(e):
                    (start, length) = get_seizure_period(seizures[0])
                    first = start * sample_rate
                    # +1 keeps the end sample inclusive; slicing clamps at the
                    # end of the recording instead of raising IndexError.
                    last = (start + length) * sample_rate + 1
                    sigbufs[first:last, label_col] = 1.0
                    seizures.pop(0)
                # Keep EVERY file's samples (appending after the loop would
                # retain only the last file of each patient).
                chunks.append(sigbufs)
    finally:
        os.chdir(initial_path)

    # Single concatenation instead of growing the array file by file.
    arr = np.concatenate(chunks)
    return pd.DataFrame(arr, columns=channels)