In [None]:
import os
import numpy as np
import pandas as pd
from scipy import interpolate

In [None]:
# root dir
# Built with os.path.join so the path has no backslash escape ('\d') and
# resolves on every OS, not only on Windows.
root = os.path.join('TDBRAIN', 'derivatives')
# participants file path
ptc_path = os.path.join(root, 'TDBRAIN_participants_V2.tsv')
# the participants table is tab-separated
ptc = pd.read_csv(ptc_path, sep='\t')
ptc

In [None]:
# delete invalid participants and the participants with invalid indication(nan, replication)
# NOTE(review): the 'REPLICATION' comparison runs on the raw label, i.e. before
# upper-casing/stripping, so a differently-cased or padded variant would survive
# this filter -- confirm against the raw file.
valid_rows = (
    ptc['participants_ID'].notna()
    & ptc['indication'].notna()
    & (ptc['indication'] != 'REPLICATION')
)
# Take an explicit copy so the column assignment below writes to an independent
# frame instead of a slice of the raw table (avoids SettingWithCopy problems).
ptc = ptc[valid_rows].copy()
# capitalize the indication uniformly, then remove surrounding whitespace
# placeholders from the disease label
ptc['indication'] = ptc['indication'].str.upper().str.strip()
ptc

In [None]:
# Collect every record whose participants_ID occurs more than once
# (keep=False flags all members of a duplicate group, not just the repeats)
is_repeated_id = ptc['participants_ID'].duplicated(keep=False)
dup_ptc = ptc[is_repeated_id]
dup_ptc

In [None]:
# Test for the indication labels of different records of the same participant
# Expected: same indication for all the records of the same participant
# One grouped pass replaces re-filtering dup_ptc for every single row, and the
# warning is emitted once together with the participants it concerns.
indication_counts = dup_ptc.groupby('participants_ID')['indication'].nunique(dropna=False)
inconsistent_ids = indication_counts[indication_counts != 1].index.to_list()
if inconsistent_ids:
    print('The indication of all the records of the same participant are not the same!')
    print(inconsistent_ids)

In [None]:
# Keep only the first record of each participant
ptc = ptc.drop_duplicates(subset='participants_ID', keep='first')
ptc

In [None]:
# EEG data preprocessing

# Map every indication label to an integer id, numbered in order of first
# appearance in the participants table
ls_ind = list(ptc['indication'].unique())
all_dise_dict = {label: idx for idx, label in enumerate(ls_ind)}
print(all_dise_dict)

# attach the numeric id of each participant's indication
ptc['indication_id'] = ptc['indication'].map(all_dise_dict)
ptc

In [None]:
# filter subjects with PARKINSON and healthy indication
# 25 PARKINSON & 47 healthy
# NOTE(review): these ids are positions in `all_dise_dict`, which is numbered by
# order of first appearance in the participants file -- confirm them against the
# printed mapping whenever the input file changes.
PARKINSON_ID = 20
HEALTHY_ID = 2
# .copy() gives an independent frame so the new column is written reliably
ptc = ptc[ptc['indication_id'].isin([PARKINSON_ID, HEALTHY_ID])].copy()
# Binary target: 1 for PARKINSON, 0 otherwise. Computed in one vectorised step;
# the former chained assignment (ptc[col][mask] = 1) is not guaranteed to write
# back into `ptc`.
ptc['is_parkinson'] = (ptc['indication_id'] == PARKINSON_ID).astype('int64')
ptc

In [None]:
# label.npy
# Number the subjects 1..N by their row position in `ptc`.
# A single vectorised assignment replaces the per-row chained
# `ptc['subject_id'].iloc[i] = ...`, which is not guaranteed to write back
# into `ptc`.
ptc['subject_id'] = np.arange(1, ptc.shape[0] + 1, dtype='int64')

# each saved row is [is_parkinson, subject_id]
df_label = ptc[['is_parkinson', 'subject_id']]
label_path = './Label'
# create the output folder if needed; no error when it already exists
os.makedirs(label_path, exist_ok=True)

np.save(label_path + '/label.npy', df_label.values)

In [None]:
# Reload the saved label file to inspect what was written
np.load('./Label/label.npy')

In [None]:
# EEG preprocessing

# resampling to 256Hz (default target rate)
def resampling(array, freq=500, kind='linear', target_freq=256):
    """Resample a 1-D signal from ``freq`` Hz to ``target_freq`` Hz by interpolation.

    Parameters
    ----------
    array : 1-D array-like
        Samples of the signal recorded at ``freq`` Hz.
    freq : int or float, default 500
        Sampling rate of ``array``.
    kind : str, default 'linear'
        Interpolation kind, forwarded to ``scipy.interpolate.interp1d``.
    target_freq : int or float, default 256
        Sampling rate of the returned signal.

    Returns
    -------
    numpy.ndarray
        The resampled signal with ``int(len(array) / freq * target_freq)``
        samples, spanning the same first-to-last sample range as the input.
    """
    n_samples = len(array)
    # original sample positions 1..n
    t = np.linspace(1, n_samples, n_samples)
    f = interpolate.interp1d(t, array, kind=kind)
    # new positions cover the same range, so no extrapolation is needed
    n_resampled = int(n_samples / freq * target_freq)
    t_new = np.linspace(1, n_samples, n_resampled)
    return f(t_new)

# Non-overlapping segmentation (256 timestamps per window by default),
# anchored at the middle row and extended outwards in whole windows
def segment(df, window_size=256):
    """Cut ``df`` into consecutive row windows of ``window_size`` rows.

    The windows are aligned to the middle row of ``df``: as many complete
    windows as fit are taken on each side of it, and leftover rows at both
    ends are discarded.

    Returns a list of DataFrame slices, each with ``window_size`` rows.
    """
    n_rows = df.shape[0]
    centre = int(n_rows / 2)
    # number of complete windows that fit before / after the centre row
    windows_before = int(centre / window_size)
    windows_after = int((n_rows - centre) / window_size)
    first = centre - windows_before * window_size
    stop = centre + windows_after * window_size
    return [df.iloc[lo: lo + window_size, :] for lo in range(first, stop, window_size)]


def eeg_data(eeg_path, freq=500, window_size=256):
    """Load one EEG CSV file, resample every column and cut it into windows.

    Parameters
    ----------
    eeg_path : str
        Path of the CSV file to read.
    freq : int or float, default 500
        Sampling rate of the recording, passed to ``resampling``.
    window_size : int, default 256
        Number of rows per segment, passed to ``segment``.

    Returns
    -------
    list of pandas.DataFrame
        The segments produced by ``segment``. Columns are renumbered
        0..n_columns-1 because the resampled columns are concatenated with
        ``ignore_index=True``.
    """
    # read .csv file
    df = pd.read_csv(eeg_path)
    # resample each column independently
    resampled_cols = [
        pd.Series(resampling(df.iloc[:, col].values, freq=freq, kind='linear'))
        for col in range(df.shape[1])
    ]
    df_new = pd.concat(resampled_cols, axis=1, ignore_index=True)
    # segmentation
    return segment(df_new, window_size=window_size)

In [None]:
# 911 valid subjects
# Unified sampling rate: 500Hz
# Unified channels (26 EEG electrodes, 7 additional electrodes)
# Initial data downloaded from TDBrain database: All the EEG signals are preprocessed and saved as CSV files
# Data structure: subject-session-task(EC/EO)

feature_path = './Feature'
# create the output folder if needed; no error when it already exists
os.makedirs(feature_path, exist_ok=True)

ls_ptc = ptc['participants_ID'].to_list()
available_subs = set(os.listdir(root))

# Number each subject by its 1-based row position in `ptc` -- the same numbering
# the label cell writes into 'subject_id' -- instead of by os.listdir() order.
# Directory listing order is arbitrary, and a missing folder would otherwise
# shift every following id, silently misaligning features and labels.
for sub_id, sub in enumerate(ls_ptc, start=1):
    if sub not in available_subs:
        print('No EEG folder found for {}, skipped (subject_id {})'.format(sub, sub_id))
        continue
    sub_path = os.path.join(root, sub)
    li_sub = []
    # sorted() makes the segment order inside each file reproducible
    for ses in sorted(os.listdir(sub_path)):
        ses_path = os.path.join(sub_path, ses, 'eeg')
        # ignore entries that are not session folders containing an 'eeg' directory
        if not os.path.isdir(ses_path):
            continue
        for task in sorted(os.listdir(ses_path)):
            # only use EC task
            if 'EC' in task:
                task_path = os.path.join(ses_path, task)
                li_sub.extend(seg.values for seg in eeg_data(task_path))
    # stack all segments of this subject into a single array
    array_sub = np.array(li_sub)
    print(array_sub.shape)
    # :02d zero-pads the id (1 -> 01) to avoid potential sorting issues
    np.save(feature_path + '/feature_{:02d}.npy'.format(sub_id), array_sub)

In [None]:
# Test the saved npy file
# example: load one saved feature file and show only the shape of its array
np.load('./Feature/feature_16.npy').shape