In [None]:
import pyedflib

import numpy as np
import pandas as pd

import plotly.graph_objects as go

import glob
import os
import re

from tqdm import tqdm

In [None]:
def get_patient_dict(patient:str, root_path:str) -> dict:
    """Parse a patient's ``<patient>-summary.txt`` into a seizure-annotation dict.

    The summary is read from ``<root_path>/<patient>/<patient>-summary.txt``.

    Args:
        patient: Name of the patient folder, e.g. ``'chb01'``.
        root_path: Directory that contains the patient folders (with or
            without a trailing separator).

    Returns:
        A dict with a ``'channel_list'`` key holding every name found on a
        ``Channel N: <name>`` line (in file order, repeats included), plus one
        key per ``File Name: <name>.edf`` line mapping to
        ``{'seizure_start': [...], 'seizure_end': [...]}`` (integer seconds).

    Raises:
        OSError: If the summary file cannot be opened.
        ValueError: If a seizure line appears before any ``File Name:`` line.
    """
    summary_path = os.path.join(root_path, patient, patient + '-summary.txt')
    patient_dict = {'channel_list': []}
    file = None
    # The context manager guarantees the handle is released even on a parse error.
    with open(summary_path, 'r') as summary_file:
        for line in summary_file:
            # Capture the complete token ending in ".edf". Names that are not
            # strictly "chb<digits>_<digits>.edf" (e.g. "chb02_16+.edf",
            # "chb17a_03.edf") must open their own entry; otherwise their
            # seizure lines would be appended to the previously listed file.
            file_match = re.search(r'File Name:\s*(\S+\.edf)', line)
            if file_match:
                file = file_match.group(1)
                patient_dict[file] = {'seizure_start': [], 'seizure_end': []}
                continue

            channel_match = re.search(r'Channel\s\d+:\s(\S*)', line)
            if channel_match:
                patient_dict['channel_list'].append(channel_match.group(1))
                continue

            # Matches both "Seizure Start Time" and "Seizure <n> Start Time"
            # (and the corresponding "End" variants).
            boundary = re.search(r'Seizure (?:\d+ )?(Start|End) Time', line)
            if boundary is None:
                continue
            if file is None:
                raise ValueError(
                    'Seizure annotation found before any "File Name:" line in '
                    + summary_path
                )
            seconds = int(re.findall(r'(\d+)\sseconds', line)[0])
            key = 'seizure_start' if boundary.group(1) == 'Start' else 'seizure_end'
            patient_dict[file][key].append(seconds)
    return patient_dict

In [None]:
def get_labeled_file(file_path:str, channel_list:list, patient_dict:dict) -> pd.DataFrame:
    """Load one EDF recording and attach a per-sample seizure label.

    Args:
        file_path: Path to the ``.edf`` file.
        channel_list: Column names for the returned signals; one signal is
            read per entry.
        patient_dict: Mapping as produced by ``get_patient_dict``; it must
            contain the file's base name as a key whose value holds the
            ``'seizure_start'`` / ``'seizure_end'`` lists (in seconds).

    Returns:
        A DataFrame with one float32 column per entry of ``channel_list`` and
        an integer ``'seizure'`` column that is 1 for every sample whose
        whole-second index lies in an annotated ``[start, end]`` interval
        (both ends inclusive) and 0 otherwise.

    Raises:
        KeyError: If the file's base name has no entry in ``patient_dict``.
        ValueError: If the start and end lists differ in length.
    """
    # basename copes with the platform's separator, not just "/".
    file_name = os.path.basename(file_path)
    annotations = patient_dict.get(file_name)
    if annotations is None:
        raise KeyError(file_name + ' has no entry in patient_dict')
    seizure_start_list = annotations.get("seizure_start")
    seizure_end_list = annotations.get("seizure_end")
    if len(seizure_start_list) != len(seizure_end_list):
        raise ValueError(
            file_name + ' has unmatched seizure start/end annotations: '
            + str(len(seizure_start_list)) + ' start(s), '
            + str(len(seizure_end_list)) + ' end(s)'
        )

    # The reader keeps the file open until closed; the context manager
    # releases it even if reading a signal fails.
    with pyedflib.EdfReader(file_path) as edf_file:
        n_samples = edf_file.getNSamples()[0]
        sample_rate = edf_file.getSampleFrequencies()[0]
        signal_data = np.zeros((n_samples, len(channel_list)))
        for i, channel in enumerate(channel_list):
            # NOTE(review): channel_list.index(channel) is the position of the
            # name inside channel_list, not inside the EDF header, so signal k
            # of the file ends up under channel_list[k]. That is only correct
            # if channel_list is ordered exactly like the file's signals --
            # confirm against edf_file.getSignalLabels().
            signal_data[:, i] = edf_file.readSignal(channel_list.index(channel))

    dataframe = pd.DataFrame(signal_data, columns=channel_list).astype('float32')

    # Whole-second index of every sample, based on the first signal's rate.
    # Kept as a local array (no narrow integer cast), so long recordings
    # cannot wrap around.
    seconds = np.floor(np.arange(n_samples) / sample_rate)

    dataframe["seizure"] = 0
    for start_second, end_second in zip(seizure_start_list, seizure_end_list):
        in_seizure = (seconds >= start_second) & (seconds <= end_second)
        dataframe.loc[in_seizure, "seizure"] = 1
    return dataframe

In [None]:
def get_complete_patient_data(patient:str, channel_list:list, root_path:str) -> pd.DataFrame:
    """Load, label and concatenate every EDF recording of one patient.

    Args:
        patient: Name of the patient folder, e.g. ``'chb01'``.
        channel_list: Column names passed through to ``get_labeled_file``.
        root_path: Directory that contains the patient folders.

    Returns:
        The labeled recordings stacked in sorted file-name order with a fresh
        integer index, plus a constant ``'patient'`` column and a
        ``'timestamp'`` column that starts at 1970-01-01 00:00:00 and advances
        by 3,906,250 ns per row.

    Raises:
        ValueError: If the patient folder contains no usable ``.edf`` file.
    """
    parent_path = os.path.join(root_path, patient)
    edf_paths = sorted(glob.glob(os.path.join(parent_path, "*.edf")))
    # Recordings with "+" in their file name are skipped. Only the file name
    # is inspected so that a "+" in a directory name cannot drop every file.
    edf_paths = [path for path in edf_paths if "+" not in os.path.basename(path)]
    if not edf_paths:
        raise ValueError('No .edf recordings found in ' + parent_path)

    patient_dict = get_patient_dict(patient=patient, root_path=root_path)
    # Wrapping the iterable lets tqdm advance and close the bar on its own,
    # including when a file fails to load.
    labeled_files = [
        get_labeled_file(file_path=path, channel_list=channel_list, patient_dict=patient_dict)
        for path in tqdm(edf_paths)
    ]

    dataframe = pd.concat(labeled_files, axis=0, ignore_index=True)
    dataframe["patient"] = patient
    # 3,906,250 ns == 1/256 s, i.e. a fixed 256 Hz sample spacing.
    # NOTE(review): the spacing is hard-coded rather than read from the EDF
    # header, and the timestamps run continuously across file boundaries.
    # A Timedelta is used instead of the '3906250N' string because the
    # uppercase 'N' alias is deprecated in recent pandas releases.
    sample_spacing = pd.Timedelta(3906250, unit='ns')
    dataframe["timestamp"] = pd.date_range('1970-01-01 00:00:00', freq=sample_spacing, periods=len(dataframe))
    return dataframe

In [None]:
def scalp_database_to_dataframe(patient_list:list, channel_list:list, root_path:str, output_dir:str='../00_Data/Dataframes/') -> None:
    """Build one labeled DataFrame per patient and pickle it to disk.

    Each patient is written to ``<output_dir>/<patient>.pkl``. Nothing is kept
    in memory after a patient has been saved.

    Args:
        patient_list: Names of the patient folders to process, in order.
        channel_list: Column names passed through to
            ``get_complete_patient_data``.
        root_path: Directory that contains the patient folders.
        output_dir: Directory that receives the pickle files; it is created
            if it does not exist yet. Defaults to ``'../00_Data/Dataframes/'``.

    Returns:
        None. The results are only available as files in ``output_dir``.
    """
    # Create the target up front so the first (slow) patient is not processed
    # only to fail when its pickle is written.
    os.makedirs(output_dir, exist_ok=True)
    for patient in patient_list:
        print("Processing Patient: " + patient)
        patient_df = get_complete_patient_data(patient=patient, channel_list=channel_list, root_path=root_path)
        patient_df.to_pickle(os.path.join(output_dir, patient + '.pkl'))
    return None

In [None]:
# Root folder of the scalp EEG database, relative to this notebook.
root_path = '../00_Data/chb-mit-scalp-eeg-database-1.0.0/'

# Every directory entry whose name starts with "chb" followed by digits,
# in sorted name order.
all_patients = [entry for entry in os.listdir(root_path) if re.match(r'(chb)\d+', entry)]
all_patients.sort()

# Column names handed to the loader for every recording.
channels = [
    'FP1-F7', 'C3-P3', 'C4-P4', 'CZ-PZ', 'F3-C3', 'F4-C4',
    'F7-T7', 'F8-T8', 'FP1-F3', 'FP2-F4', 'FP2-F8', 'FT10-T8',
    'FT9-FT10', 'FZ-CZ', 'P3-O1', 'P4-O2', 'P7-O1', 'P7-T7',
    'P8-O2', 'T7-FT9', 'T7-P7', 'T8-P8-0', 'T8-P8-1',
]

In [None]:
# Process every patient and pickle each result to disk. The function returns
# None, so `df` does not hold any data afterwards.
df = scalp_database_to_dataframe(
    patient_list=all_patients,
    channel_list=channels,
    root_path=root_path,
)

# Data Generation Algorithm:
1. Get path to root folder
2. Get list of all patient folders
3. Iterate over all patients
    3.1 Get Info-Dict of current patient
    3.2 Iterate over individual files
        3.2.1 Load and transform current file
        3.2.2 Add second column
        3.2.3 Look in dict if seizures are present
        3.2.4 If yes, label the data
        3.2.5 Return labeled file data
    3.3 Combine all files into one dataframe
    3.4 Add timestamp for later optional resampling
4. Combine all dataframes into one

# Dict Structure
{
    patient: {
        channel_list: [],
        file_name: {
            seizure_start: [],
            seizure_end: []
        }
    }
}