*Required third-party libraries: numpy, pandas, pyedflib, frozendict*

In [None]:
from config import *

In [1]:
import os
import numpy as np
from pyedflib import EdfReader
from pyedflib.highlevel import read_edf

In [2]:
# pyedflib provides two different API levels for reading channel info; both are
# demonstrated on the same recording. The path is built from `dataset_path`
# (instead of a hard-coded copy of it) so this cell follows the configured
# dataset location like the rest of the notebook.
sample_edf = os.path.join(dataset_path, 'chb12', 'chb12_32.edf')

# High-level API: one call returning the signals, the per-channel headers and the
# file header. digital=True requests the raw digital samples.
content, ch_hdr_lst, edf_hdr = read_edf(sample_edf, digital=True)

# Low-level API: open a reader and query individual properties.
with EdfReader(sample_edf) as f:
    # NOTE: despite its name, `nch` holds what getSignalLabels() returns
    # (the channel labels), not a channel count.
    nch = f.getSignalLabels()


In [3]:
import glob
from frozendict import frozendict
import logging

logging.basicConfig(level=logging.DEBUG, format='%(levelname)s:%(message)s')

# Every EDF recording below the dataset root (searched recursively).
f_lst = glob.glob(os.path.join(dataset_path, '**', '*.edf'), recursive=True)

# Maps an ordered tuple of frozen channel headers -> list of files whose header
# list is exactly that tuple.
ch_hdr_to_fn = {}
for fn in f_lst:
    with EdfReader(fn) as f:
        eeg_list = f.getSignalHeaders()
        eeg_dict = {}     # label -> frozen header last seen under that label
        duplicates = {}   # label -> indices of the channels carrying that label

        for k, eeg in enumerate(eeg_list):
            key = eeg['label']
            # Work on a copy of the header (the label is kept inside it).
            value = eeg.copy()
            if key not in eeg_dict:
                # First channel with this label.
                duplicates[key] = [k]
            elif value == eeg_dict[key]:
                # Same label and identical header: record it as a duplicate.
                duplicates[key].append(k)
            else:
                # Same label but different metadata: stop the whole scan.
                raise Warning('Conflict metadata within the same channel name %s in %s' % (key, fn))
            eeg_dict[key] = frozendict(value)

        # Group the file under the hashable form of its full header list.
        chinfo_key = tuple(map(frozendict, eeg_list))
        ch_hdr_to_fn.setdefault(chinfo_key, []).append(fn)
        
        # Check if duplicates channel share the same content
        # for chname, lst in duplicates.items():
        #     if len(lst) > 1:
        #         n1 = lst[0]
        #         data1 = f.readSignal(n1, digital=True)
        #         for i in range(1, len(lst)):
        #             n2 = lst[i]
        #             data2 = f.readSignal(n2, digital=True)
        #             if np.any(data1 != data2):
        #                 print(f'Different content in channel {n1} and {n2} of {fn}')

        

In [4]:
# Number of distinct channel layouts (ordered tuples of channel headers) found across the scanned EDF files.
len(ch_hdr_to_fn)

15

In [None]:
import pandas as pd
import os.path
from IPython.display import display_html
# For every distinct channel layout: print how many files use it, render the
# layout as an inline HTML table, then list the base names of those files.
for ch_hdr, fn_lst in ch_hdr_to_fn.items():
    # English gloss of the message below:
    # "<N> EDF files share the set of <M> channels shown in the following table".
    print(f'共有\033[1;33m {len(fn_lst):3} \033[0m个EDF文件共享如下表的\033[1;32m {len(ch_hdr)} \033[0m通道集合')
    layout_html = (
        pd.DataFrame(ch_hdr)
        .set_index('label')
        .style.set_table_attributes("style='display:inline'")
        ._repr_html_()
    )
    display_html(layout_html + '<br>', raw=True)
    basenames = sorted(map(os.path.basename, fn_lst))
    print(*basenames, '\n', sep=' ')
    

In [10]:
# Aggregate the maximum common channel subset for each patient and extract that subset
import os
# import pprint
import pandas as pd
import os.path
from IPython.display import display_html
def dropfunc(d):
    """Return the channel header ``d`` unchanged (filtering hook, currently a no-op).

    This hook was written to delete the four scaling fields 'digital_max',
    'digital_min', 'physical_max' and 'physical_min' from a header, on the
    grounds that they no longer matter when comparing channels once the
    samples are converted from integers to floats. That deletion is disabled,
    so headers are currently compared on all of their fields and the very same
    object that was passed in is returned.
    """
    return d

# Maps each patient folder -> [label_lst, edf_shfl_lst] where
#   label_lst    : labels of the channels common to all kept EDF files of the folder
#   edf_shfl_lst : one [file name, channel indices in label_lst order] entry per file
output_dict = {}
for folder, sub_folders, files in os.walk(dataset_path):
    chinfo2nch_lst = []
    # None means "no file seen yet"; an empty set means "the files share no channel".
    common_set = None
    # os.walk yields file names in arbitrary order; sort them so the first file
    # (used below for the patient id and the channel order) is reproducible.
    for special_file in sorted(files):
        if not special_file.endswith('.edf'):
            continue
        if special_file in ignore_lst:
            # Discard these files: their channel information differs too much from
            # the most common layout of the same patient and is of little use for
            # prediction (it may still be of some use for detection tasks).
            continue
        file_path = os.path.join(folder, special_file)
        with EdfReader(file_path) as f:
            eeg_list = f.getSignalHeaders()
            # Frozen channel header -> index of the first channel carrying it.
            chinfo2nch = {}
            for n, chinfo in enumerate(eeg_list):
                chinfo2nch.setdefault(frozendict(dropfunc(chinfo)), n)
            chinfo2nch_lst.append((special_file, chinfo2nch))
            if common_set is None:
                common_set = set(chinfo2nch)
            else:
                # Testing `is None` rather than truthiness matters here: an empty
                # intersection must stay empty instead of being reset to the
                # channels of the next file.
                common_set = common_set.intersection(chinfo2nch)
    if common_set:
        # The patient id is taken from characters 3-4 of the first file name.
        # English gloss: "Patient <id>: the maximum common subset has <N> channels".
        print(f'Patient {chinfo2nch_lst[0][0][3:5]} 最大公共子集有 {len(common_set)} 通道')
        # Set iteration order is arbitrary, so order the common channels by their
        # index in the first file to keep the exported mapping stable across runs.
        first_file_index = chinfo2nch_lst[0][1]
        common_lst = sorted(common_set, key=first_file_index.__getitem__)
        # Not used below; kept as a table of the common headers for inspection.
        df = pd.DataFrame(common_lst).set_index('label')

        label_lst = [ch_dict['label'] for ch_dict in common_lst]
        edf_shfl_lst = [[t[0], [t[1][ch_dict] for ch_dict in common_lst]] for t in chinfo2nch_lst]
        output_dict[folder] = [label_lst, edf_shfl_lst]
        


Patient 01 最大公共子集有 22 通道
Patient 02 最大公共子集有 22 通道
Patient 03 最大公共子集有 22 通道
Patient 04 最大公共子集有 22 通道
Patient 05 最大公共子集有 22 通道
Patient 06 最大公共子集有 22 通道
Patient 07 最大公共子集有 22 通道
Patient 08 最大公共子集有 22 通道
Patient 09 最大公共子集有 23 通道
Patient 10 最大公共子集有 22 通道
Patient 11 最大公共子集有 23 通道
Patient 12 最大公共子集有 23 通道
Patient 13 最大公共子集有 19 通道
Patient 14 最大公共子集有 23 通道
Patient 15 最大公共子集有 32 通道
Patient 16 最大公共子集有 23 通道
Patient 17 最大公共子集有 23 通道
Patient 18 最大公共子集有 23 通道
Patient 19 最大公共子集有 23 通道
Patient 20 最大公共子集有 23 通道
Patient 21 最大公共子集有 23 通道
Patient 22 最大公共子集有 23 通道
Patient 23 最大公共子集有 22 通道
Patient 24 最大公共子集有 22 通道


In [11]:
# Manually add the T8-P8 channel (same label but different header info) to
# patients 12 and 15.
# NOTE: this cell appends in place, so running it more than once appends again.
def _append_channel(patient_entry, label, nch):
    """Append `label` to the entry's label list and `nch` to every file's index list."""
    labels, per_file = patient_entry
    labels.append(label)
    for _, nchs in per_file:
        nchs.append(nch)


pat12 = output_dict[os.path.join(dataset_path, 'chb12')]
_append_channel(pat12, 'T8-P8', 27)

pat15 = output_dict[os.path.join(dataset_path, 'chb15')]
_append_channel(pat15, 'T8-P8', 28)

In [None]:
# Debug: inspect the chb12 entry of output_dict (the label list plus the per-file channel indices).
output_dict[os.path.join(dataset_path, 'chb12')]

In [13]:
import json
# Persist the per-folder channel mapping as JSON in the current working directory.
with open('raw_clean_mapping.json', 'w') as mapping_file:
    json.dump(output_dict, mapping_file)