In [31]:
import os
import numpy as np
import h5py
import pandas as pd
from scipy.signal import resample
from sklearn.utils import shuffle
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

In [32]:
import mne
from mne.preprocessing import ICA
try:
    from mne_icalabel import label_components
except Exception:
    label_components = None

In [33]:
# Target sampling frequency in Hz.
SAMPLE_RATE = 200
# Fixed-window segmentation settings, currently disabled:
# SAMPLE_LEN = 1.0   # window length in seconds
# OVERLAPPING = 0.8  # overlap between consecutive windows in seconds

# Sub-folder name derived from the sampling rate, e.g. '200Hz'.
sub_folder_path = f"{SAMPLE_RATE}Hz"
sub_folder_path

'200Hz'

## Load participants.tsv file

In [34]:
# Dataset root directory; participants.tsv is read from directly inside it.
root = "CESCA/"
# Alternative dataset root (disabled):
# root = "Simon-conflict Task/ds004580-download/"
participants_path = os.path.join(root, 'participants.tsv')
# The file is tab-separated; `delimiter` is pandas' alias for `sep`.
participants = pd.read_csv(participants_path, delimiter='\t')
participants

Unnamed: 0,participant_id,Age_Grouped,Gender,Handedness,Highest_Edu_Grouped,Highest_Adult_Edu_Grouped,PassiveAuditoryOddballEEG,PassiveAuditoryOddballEEG_missingness_reason,VisualOddballEEG,VisualOddballEEG_missingness_reason,...,asrs16,asrs17,asrs18,asrs1_binary,asrs2_binary,asrs3_binary,asrs4_binary,asrs5_binary,asrs6_binary,ADHD_binary
0,sub-001,18 to 22,female,right,High School,Some College or Associate Degree,1,,1,,...,sometimes,rarely,rarely,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,sub-002,23 to 26,male,right,Bachelor's Degree or Higher,Some College or Associate Degree,0,Task not administered,1,,...,never,never,never,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,sub-003,18 to 22,female,right,High School,Bachelor's Degree or Higher,0,Task not administered,1,,...,rarely,rarely,sometimes,1.0,0.0,1.0,1.0,0.0,1.0,1.0
3,sub-004,18 to 22,female,right,High School,Bachelor's Degree or Higher,0,Task not administered,1,,...,sometimes,rarely,never,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,sub-005,18 to 22,male,right,Some College or Associate Degree,High School or Less,0,Task not administered,1,,...,rarely,never,never,0.0,0.0,1.0,1.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
122,sub-123,18 to 22,male,right,Some College or Associate Degree,Bachelor's Degree or Higher,1,,1,,...,,,,,,,,,,
123,sub-124,18 to 22,male,right,Some College or Associate Degree,Bachelor's Degree or Higher,1,,1,,...,rarely,never,rarely,1.0,0.0,0.0,1.0,0.0,1.0,0.0
124,sub-125,18 to 22,male,"ambidextrous (i.e., both right and left)",Some College or Associate Degree,High School or Less,1,,1,,...,sometimes,sometimes,sometimes,1.0,1.0,1.0,0.0,0.0,0.0,0.0
125,sub-126,18 to 22,male,right,Some College or Associate Degree,Bachelor's Degree or Higher,1,,0,Participant-related issue,...,sometimes,rarely,rarely,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [35]:
# Distinct categories found in the 'Highest_Adult_Edu_Grouped' column, in order of first appearance.
unique_parent_degree = participants['Highest_Adult_Edu_Grouped'].unique().tolist()
unique_parent_degree

['Some College or Associate Degree',
 "Bachelor's Degree or Higher",
 'High School or Less']

In [36]:
# Ordinal encoding of the education categories: position in this list is the integer label.
_degree_levels = [
    'High School or Less',
    'Some College or Associate Degree',
    "Bachelor's Degree or Higher",
]
degree_map = {level: rank for rank, level in enumerate(_degree_levels)}
degree_map

{'High School or Less': 0,
 'Some College or Associate Degree': 1,
 "Bachelor's Degree or Higher": 2}

In [37]:
# Spot check: the 'Highest_Adult_Edu_Grouped' entry recorded for participant sub-001.
participants.loc[participants['participant_id'] == 'sub-001', 'Highest_Adult_Edu_Grouped']

0    Some College or Associate Degree
Name: Highest_Adult_Edu_Grouped, dtype: object

## Check for bad channels and verify that sampling frequency and data shape are consistent across recordings

In [38]:
# Disabled exploratory check: the code below is wrapped in a string literal, so it is
# displayed as the cell output but never executed.
# When enabled, it reads every auditory oddball recording (swap in the commented-out
# condition for the visual oddball task) and collects, per file, the channels marked
# as bad, the sampling frequency, and the shape of the data array.
"""bad_channel_list, sampling_freq_list, data_shape_list = [], [], []
for sub in os.listdir(root):
    if 'sub-' in sub:
        sub_path = os.path.join(root, sub, 'eeg/')
        for file in os.listdir(sub_path):
            if 'auditoryoddball_eeg.vhdr' in file:
            # if 'visualoddball_eeg.vhdr' in file:
                file_path = os.path.join(sub_path, file)
                print(file_path)
                raw = mne.io.read_raw_brainvision(file_path, preload=True)                
                bad_channel = raw.info['bads']
                bad_channel_list.append(bad_channel)
                # get sampling frequency
                sampling_freq = raw.info['sfreq']
                sampling_freq_list.append(sampling_freq)
                # get eeg data
                data = raw.get_data()
                data_shape = data.shape
                data_shape_list.append(data_shape)"""

"bad_channel_list, sampling_freq_list, data_shape_list = [], [], []\nfor sub in os.listdir(root):\n    if 'sub-' in sub:\n        sub_path = os.path.join(root, sub, 'eeg/')\n        for file in os.listdir(sub_path):\n            if 'auditoryoddball_eeg.vhdr' in file:\n            # if 'visualoddball_eeg.vhdr' in file:\n                file_path = os.path.join(sub_path, file)\n                print(file_path)\n                raw = mne.io.read_raw_brainvision(file_path, preload=True)                \n                bad_channel = raw.info['bads']\n                bad_channel_list.append(bad_channel)\n                # get sampling frequency\n                sampling_freq = raw.info['sfreq']\n                sampling_freq_list.append(sampling_freq)\n                # get eeg data\n                data = raw.get_data()\n                data_shape = data.shape\n                data_shape_list.append(data_shape)"

In [39]:
# Disabled summary: wrapped in a string literal, so it is displayed but never executed.
# When enabled, it prints the collected bad-channel lists, the first data shape, and
# frequency counts of the channel numbers and sampling rates. It relies on
# bad_channel_list, data_shape_list and sampling_freq_list, which are only created
# by the (also disabled) scan in the previous cell.
"""from collections import Counter

print(bad_channel_list)
print(data_shape_list[0])
print("Channel number counter:", Counter(i[0] for i in data_shape_list))
print("Sampling rate counter:", Counter(sampling_freq_list))"""

'from collections import Counter\n\nprint(bad_channel_list)\nprint(data_shape_list[0])\nprint("Channel number counter:", Counter(i[0] for i in data_shape_list))\nprint("Sampling rate counter:", Counter(sampling_freq_list))'

## Pick common channels

In [40]:
# The number of channels is not consistent across recordings, so only the channels
# common to all of them are used.
# Disabled scan: wrapped in a string literal, so it is displayed but never executed.
# When enabled, it intersects the channel names of every auditory oddball recording
# and prints the resulting list and its length (presumably the source of the
# hard-coded `common_channels` list in the next cell; verify before relying on it).

"""common_channels = []
for sub in os.listdir(root):
    if 'sub-' in sub:
        sub_path = os.path.join(root, sub, 'eeg/')
        # print(sub_path)
        for file in os.listdir(sub_path):
            if 'auditoryoddball_eeg.vhdr' in file:
                file_path = os.path.join(sub_path, file)
                raw = mne.io.read_raw_brainvision(file_path, preload=True)
                
                current_channels = set(raw.info['ch_names'])
                if not common_channels:
                    common_channels = current_channels
                else:
                    common_channels &= current_channels
common_channels = list(common_channels)
print(common_channels)
print("Common channels number: ", len(common_channels))"""

'common_channels = []\nfor sub in os.listdir(root):\n    if \'sub-\' in sub:\n        sub_path = os.path.join(root, sub, \'eeg/\')\n        # print(sub_path)\n        for file in os.listdir(sub_path):\n            if \'auditoryoddball_eeg.vhdr\' in file:\n                file_path = os.path.join(sub_path, file)\n                raw = mne.io.read_raw_brainvision(file_path, preload=True)\n                \n                current_channels = set(raw.info[\'ch_names\'])\n                if not common_channels:\n                    common_channels = current_channels\n                else:\n                    common_channels &= current_channels\ncommon_channels = list(common_channels)\nprint(common_channels)\nprint("Common channels number: ", len(common_channels))'

In [41]:
# Hard-coded channel names; the list order is the channel order used when picking/reordering.
common_channels = [
    'F7', 'FC1', 'P8', 'FC5', 'P4', 'CP1', 'CP6', 'C4',
    'LM', 'P7', 'Oz', 'F3', 'FC2', 'CP5', 'Fp2', 'HER',
    'HEL', 'O2', 'Pz', 'T7', 'RM', 'P3', 'FC6', 'F4',
    'Fp1', 'O1', 'C3', 'VER', 'CP2', 'F8', 'Fz',
]

## Data preprocessing and segmentation

In [42]:
# Output layout: Processed/<sub_folder_path>/<dataset>/{Feature,Label}, one dataset folder per task.
Auditory_oddball_feature_path = 'Processed/' + sub_folder_path + '/CESCA-AODD/Feature'
Auditory_oddball_label_path = 'Processed/' + sub_folder_path + '/CESCA-AODD/Label'

Visual_oddball_feature_path = 'Processed/' + sub_folder_path + '/CESCA-VODD/Feature'
Visual_oddball_label_path = 'Processed/' + sub_folder_path + '/CESCA-VODD/Label'

Flanker_feature_path = 'Processed/' + sub_folder_path + '/CESCA-FLANKER/Feature'
Flanker_label_path = 'Processed/' + sub_folder_path + '/CESCA-FLANKER/Label'

# exist_ok=True keeps the cell safely re-runnable and avoids the separate
# "check, then create" step that was repeated for every directory.
for _out_dir in (
    Auditory_oddball_feature_path,
    Auditory_oddball_label_path,
    Visual_oddball_feature_path,
    Visual_oddball_label_path,
    Flanker_feature_path,
    Flanker_label_path,
):
    os.makedirs(_out_dir, exist_ok=True)

In [43]:
def data_preprocessing(
    raw: mne.io.Raw,
    common_channels: list,
    sample_rate: int = 250,
    notch_freq: float = 60.0,
    l_freq: float = 0.5,
    h_freq: float = 40.0,
    do_bad_interp: bool = True,
    verbose: bool = True,
):
    """
    Clean a continuous EEG recording and resample it to ``sample_rate``.

    Preprocessing steps:
      1) keep the channels of ``common_channels`` that exist in ``raw``, in that order
      2) set the 'standard_1020' montage
      3) notch filter at ``notch_freq`` (skipped when ``notch_freq`` is None), before the band-pass
      4) band-pass filter between ``l_freq`` and ``h_freq`` (default 0.5-40 Hz)
      5) interpolate the channels listed in ``raw.info['bads']`` (if ``do_bad_interp`` is True)
      6) re-reference to the average
      7) ICA fitted on a 1 Hz high-passed copy; components that ICLabel labels as an
         artifact class with at least the per-class threshold probability are removed
         (automatic exclusion needs mne-icalabel; without it ICA is fitted but nothing is removed)
      8) resample to ``sample_rate``

    Parameters
    ----------
    raw : mne.io.Raw
        Recording to process. It is modified in place.
    common_channels : list
        Channel names to keep; names missing from ``raw`` are ignored.
    sample_rate : int
        Target sampling frequency in Hz.
    notch_freq : float or None
        Notch frequency in Hz, or None to skip the notch filter.
    l_freq, h_freq : float
        Band-pass edges in Hz.
    do_bad_interp : bool
        Whether to interpolate channels marked as bad.
    verbose : bool
        Whether to print a progress line for each step.

    Returns
    -------
    mne.io.Raw
        The processed recording (the same object as ``raw``).

    Raises
    ------
    ValueError
        If none of ``common_channels`` is present in ``raw``.
    """

    # 1. select common channels and reorder to given order
    keep = [ch for ch in common_channels if ch in raw.ch_names]
    if not keep:
        raise ValueError("None of the requested common channels are present in the recording.")
    raw.pick(keep)
    raw.reorder_channels(keep)
    if verbose:
        print(f"✔ Step 1: Picked common channels ({len(keep)}): {keep}")

    # 2. Set Montage
    # NOTE(review): with MNE's default on_missing behaviour this raises for EEG-typed
    # channels that have no position in the montage — confirm the caller has removed
    # such channels beforehand.
    raw.set_montage(mne.channels.make_standard_montage('standard_1020'))
    if verbose:
        print("✔ Step 2, Montage set: 'standard_1020'.")

    # 3. Notch (power-line frequency)
    if notch_freq is not None:
        raw.notch_filter(freqs=[notch_freq], picks="eeg", verbose=False)
        if verbose:
            print(f"✔ Step 3: Notch @ {notch_freq} Hz")

    # 4. Bandpass Filter (0.5–40 Hz by default)
    raw.filter(l_freq=l_freq, h_freq=h_freq, picks="eeg", verbose=False)
    if verbose:
        print(f"✔ Step 4: Band-pass {l_freq}–{h_freq} Hz")

    # 5. Interpolate bad channels
    # Remember the names first: reset_bads=True empties raw.info['bads'], so reading it
    # after the interpolation would always report an empty list.
    bads = list(raw.info.get("bads") or [])
    if do_bad_interp and bads:
        raw.interpolate_bads(reset_bads=True, verbose=False)
        if verbose:
            print(f"✔ Step 5: Interpolated bads: {bads}")
    else:
        if verbose:
            print("ℹ Step 5: No bads to interpolate (set raw.info['bads'] first if needed)")

    # 6) Re-reference to average
    raw.set_eeg_reference("average", verbose=False)
    if verbose:
        print("✔ Step 6: Average reference")
        print()

    # 7) ICA (fit and label on a 1 Hz high-pass filtered copy, then apply to the original)
    raw_for_ica = raw.copy().filter(l_freq=1.0, h_freq=None, picks="eeg", verbose=False)
    ica = ICA(n_components=None, method="fastica", random_state=97, max_iter="auto")
    ica.fit(raw_for_ica)

    excluded = []
    if label_components is not None:
        try:
            ic_labels = label_components(raw_for_ica, ica, method="iclabel")
            labels = ic_labels["labels"]
            # Probability of the predicted class for each component. np.max below also
            # copes with a per-class row per component, should the layout differ.
            probs = ic_labels["y_pred_proba"]
            # Minimum probability required to drop a component of each artifact class.
            thresholds = {
                "eye blink": 0.7,
                "muscle artifact": 0.6,
                "heart beat": 0.5,
                "line noise": 0.8,
                "channel noise": 0.9,
            }
            for i, lab in enumerate(labels):
                if lab not in thresholds:
                    continue
                p = float(np.max(probs[i])) if probs is not None else 1.0
                if p >= thresholds[lab]:
                    excluded.append(i)
        except Exception as e:
            # Best effort: a labelling failure must not abort the whole preprocessing run.
            if verbose:
                print(f"⚠ ICLabel failed ({e}). Skipping auto exclusion.")
    else:
        if verbose:
            print("ℹ ICLabel not available; fitted ICA but no auto component exclusion.")

    if excluded:
        ica.exclude = sorted(set(excluded))
        # ICA.apply modifies the instance in place and returns it; no extra copy of the
        # data is needed because every earlier step already modified `raw` in place.
        raw = ica.apply(raw)
        if verbose:
            print(f"✔ Step 7: ICA applied. Excluded comps: {ica.exclude}")
    else:
        if verbose:
            print("ℹ Step 7: No ICA components excluded.")

    # 8) resample to the requested rate
    if raw.info["sfreq"] != sample_rate:
        raw.resample(sample_rate, npad="auto", verbose=False)
    if verbose:
        print(f"✔ Step 8: Resampled to {sample_rate} Hz")

    return raw

In [44]:
# Trigger-code -> integer-label lookup tables, one per task.

# Auditory oddball. The original note cites the dataset DOI for these meanings:
# S180 = pre-standard stimulus, S 80 = standard stimulus, S 70 = deviant stimulus.
Auditory_oddball_mapping_dict = dict([
    ("S180", -1),  # Pre-standard stimulus
    ("S 80", 0),   # Standard stimulus
    ("S 70", 1),   # Deviant stimulus
])

# Visual oddball. Stimulus codes are "S <a><b>" with a, b in 1..5:
# a == b is labelled 1 (oddball / target), a != b is labelled 0 (standard / non-target).
# NOTE(review): this target/non-target assignment is marked in the original as coming
# from a GPT explanation — confirm against the dataset documentation.
Visual_oddball_mapping_dict = {
    f"S {a}{b}": int(a == b)
    for a in range(1, 6)
    for b in range(1, 6)
}
# Response events
Visual_oddball_mapping_dict.update({
    "S201": 2,  # Correct response
    "S202": 3,  # Incorrect response
})

# Flanker. The position of each code group below is its label.
Flanker_mapping_dict = {
    code: label
    for label, codes in enumerate((
        ("S 11", "S 12"),                  # 0: compatible
        ("S 21", "S 22"),                  # 1: incompatible
        ("S111", "S121", "S212", "S222"),  # 2: Correct response
        ("S112", "S122", "S211", "S221"),  # 3: Incorrect response
    ))
    for code in codes
}


def epoch_and_make_xy(
    raw: mne.io.Raw,
    events_tsv_path: str,
    tmin: float = -0.2,
    tmax: float = 0.8,
    baseline=(-0.2, 0),               # e.g, (-0.5, 0)   baseline correction from -0.5 to 0 s
    task_id: int = 0,            # assigned task ID: 0 auditory oddball, 1 visual oddball, 2 flanker
    subject_id: int = 1,
    parent_degree: int = 0,      # ['High School or Less', 'Some College or Associate Degree', 'Bachelor's Degree or Higher'] -> (0, 1, 2)
):
    """
    Cut a preprocessed recording into stimulus-locked epochs and build matching labels.

    Events are read from ``events_tsv_path`` (tab-separated, with ``onset`` in seconds
    and ``value`` holding the trigger code). Only events whose trigger maps to label
    0 or 1 in the task's mapping dict are used as time-locks; every other event
    (responses, boundaries, unmapped codes) is ignored.

    Parameters
    ----------
    raw : mne.io.Raw
        Recording to epoch; its current sampling frequency converts onsets to samples.
    events_tsv_path : str
        Path of the events file belonging to ``raw``.
    tmin, tmax : float
        Epoch window in seconds relative to the event onset.
    baseline : tuple or None
        Baseline interval passed to ``mne.Epochs``.
    task_id : int
        0 = auditory oddball, 1 = visual oddball, 2 = flanker.
    subject_id : int
        Numeric subject ID written into every label row.
    parent_degree : int
        Encoded education level written into every label row.

    Returns
    -------
    X : np.ndarray
        Epoch data of shape (n_epochs, n_times, n_channels).
    y : np.ndarray
        Integer array of shape (n_epochs, 4) with the columns
        [task_id, stimulation_type, subject_id, parent_degree]; row i describes X[i].

    Raises
    ------
    ValueError
        If ``task_id`` is unknown or the events file has no usable stimulus event.
    """
    task_mappings = {
        0: Auditory_oddball_mapping_dict,
        1: Visual_oddball_mapping_dict,
        2: Flanker_mapping_dict,
    }
    if task_id not in task_mappings:
        raise ValueError(f"Unknown task_id {task_id}; expected one of {sorted(task_mappings)}")
    mapping_dict = task_mappings[task_id]

    ev = pd.read_csv(events_tsv_path, sep="\t")

    # Map trigger codes to labels; unmapped codes become NaN and are filtered out here
    # rather than crashing a later integer cast. Only labels 0/1 are kept as time-locks.
    codes = ev["value"].map(mapping_dict)
    is_stimulus = codes.isin([0, 1])
    ev = ev.loc[is_stimulus].reset_index(drop=True)
    stim_codes = codes[is_stimulus].to_numpy().astype(int)
    if len(ev) == 0:
        raise ValueError(f"No stimulus events for task_id={task_id} found in {events_tsv_path}")

    # from onset second to timestamp under current sampling frequency
    sfreq = raw.info["sfreq"]
    print(f"Current sampling frequency: {sfreq} Hz")
    # MNE event samples are counted including raw.first_samp, so add it to the
    # file-relative onsets.
    onset_samples = np.round(ev["onset"].to_numpy() * sfreq).astype(int) + raw.first_samp
    events = np.column_stack([
        onset_samples,
        np.zeros_like(onset_samples),
        np.ones_like(onset_samples),
    ])

    # segment epoch
    picks = mne.pick_types(raw.info, eeg=True, eog=False, exclude="bads")
    epochs = mne.Epochs(
        raw, events, event_id=dict(cue=1),
        tmin=tmin, tmax=tmax,
        baseline=baseline, picks=picks,
        proj=False, preload=True, reject=None, verbose=False
    )

    # guarantee a fixed number of time points per epoch; round() instead of int()
    # so floating-point error in (tmax - tmin) cannot cost one sample
    target_len = int(round((tmax - tmin) * sfreq))
    data = epochs.get_data()  # (N, C, T)
    if data.shape[-1] > target_len:
        data = data[..., :target_len]
    elif data.shape[-1] < target_len:
        pad = target_len - data.shape[-1]
        data = np.pad(data, ((0, 0), (0, 0), (0, pad)), mode="edge")

    # convert to (N, target_len, C)
    X = np.transpose(data, (0, 2, 1))

    # generate Y: [task_id, stimulation_type, subject_id, parent_degree]
    # epochs.selection holds the indices (into `events`) of the epochs that survived;
    # index the labels with it exactly once so that y stays aligned with X.
    valid_event_idx = epochs.selection
    stimulation_type = stim_codes[valid_event_idx]
    y = np.column_stack([
            np.full_like(stimulation_type, task_id),
            stimulation_type,
            np.full_like(stimulation_type, subject_id),
            np.full_like(stimulation_type, parent_degree),
    ])

    return X, y

In [45]:
import re
import os
task_id = 0
def get_subject_id(path: str) -> int:
    """Return the numeric subject ID from a path containing ``sub-<digits>``.

    Parameters
    ----------
    path : str
        Any string containing a ``sub-<digits>`` component, e.g. ``"CESCA/sub-001/eeg/"``.

    Returns
    -------
    int
        The digits of the first match as an integer (``"sub-001"`` -> 1).

    Raises
    ------
    ValueError
        If ``path`` contains no ``sub-<digits>`` component.
    """
    # match string like "sub-xxx"
    match = re.search(r"sub-(\d+)", path)
    if match is None:
        raise ValueError(f"No subject_id found in path: {path}")
    # int() already ignores leading zeros; stripping them first turned an all-zero
    # ID such as "sub-000" into int(""), which raises.
    return int(match.group(1))
    
    
# Recording-file suffix -> (task_id, feature output dir, label output dir).
# A single table keeps the file filter and the task assignment in agreement, so a file
# can never be processed with the task settings left over from a previous file.
task_outputs = {
    "task-auditoryoddball_eeg.vhdr": (0, Auditory_oddball_feature_path, Auditory_oddball_label_path),
    "task-visualoddball_eeg.vhdr": (1, Visual_oddball_feature_path, Visual_oddball_label_path),
    "task-flanker_eeg.vhdr": (2, Flanker_feature_path, Flanker_label_path),
}
eeg_suffix = "_eeg.vhdr"

# sorted() gives a reproducible processing order (os.listdir order is arbitrary)
for sub in sorted(os.listdir(root)):
    sub_path = os.path.join(root, sub, 'eeg/')
    # skip entries that are not subject folders or that have no eeg/ directory
    if 'sub-' not in sub or not os.path.isdir(sub_path):
        continue
    subject_id = get_subject_id(sub)

    # the education label is required for y; skip subjects without a usable entry
    degree_values = participants.loc[participants['participant_id'] == sub, 'Highest_Adult_Edu_Grouped']
    if len(degree_values) != 1 or degree_values.iloc[0] not in degree_map:
        print(f"Skipping {sub}: no usable 'Highest_Adult_Edu_Grouped' entry in participants.tsv")
        continue
    parent_degree_int = degree_map[degree_values.iloc[0]]

    for file in sorted(os.listdir(sub_path)):
        # check task: only the three recording types listed in task_outputs are processed
        task_cfg = next((cfg for suffix, cfg in task_outputs.items() if file.endswith(suffix)), None)
        if task_cfg is None:
            continue
        task_id, feature_path, label_path = task_cfg

        # load eeg data and preprocess
        print(sub_path)

        vhdr_path = os.path.join(sub_path, file)
        # "<prefix>_eeg.vhdr" -> "<prefix>_events.tsv"
        events_file = file[:-len(eeg_suffix)] + "_events.tsv"
        events_file_path = os.path.join(sub_path, events_file)
        if not os.path.isfile(events_file_path):
            print(f"Skipping {file}: events file not found ({events_file_path})")
            continue

        print(vhdr_path)
        raw = mne.io.read_raw_brainvision(vhdr_path, preload=True)
        raw.pick_types(eeg=True)   # only keep EEG channels
        raw = data_preprocessing(raw, common_channels, SAMPLE_RATE, notch_freq=60, l_freq=0.5, h_freq=40, verbose=True)
        print()

        # segment and make X, y
        print(f"Start epoching file {file} making X, y...")
        X, y = epoch_and_make_xy(
            raw, events_file_path,
            tmin=-0.2, tmax=0.8, baseline=(-0.2, 0),  #  -200ms - 800ms
            task_id=task_id, subject_id=subject_id, parent_degree=parent_degree_int,
        )
        print(f"File {file} epoch into trial X shape: {X.shape}, y shape: {y.shape}")

        # save X, y to npy files (one file per subject inside the task's folders)
        np.save(os.path.join(feature_path, 'feature_{:03d}.npy'.format(subject_id)), X)
        np.save(os.path.join(label_path, 'label_{:03d}.npy'.format(subject_id)), y)
        print("------------------------------------------------\n")


CESCA/sub-001\eeg/
CESCA/sub-001\eeg/sub-001_task-auditoryoddball_eeg.vhdr
Extracting parameters from CESCA/sub-001\eeg/sub-001_task-auditoryoddball_eeg.vhdr...
Setting channel info structure...
Reading 0 ... 124029  =      0.000 ...   248.058 secs...
✔ Step 1: Picked common channels (26): ['F7', 'FC1', 'P8', 'FC5', 'P4', 'CP1', 'CP6', 'C4', 'P7', 'Oz', 'F3', 'FC2', 'CP5', 'Fp2', 'O2', 'Pz', 'T7', 'P3', 'FC6', 'F4', 'Fp1', 'O1', 'C3', 'CP2', 'F8', 'Fz']
✔ Step 2, Montage set: 'standard_1020'.
✔ Step 3: Notch @ 60 Hz
✔ Step 4: Band-pass 0.5–40 Hz
ℹ Step 5: No bads to interpolate (set raw.info['bads'] first if needed)
✔ Step 6: Average reference

Fitting ICA to data using 26 channels (please be patient, this may take a while)
Selecting by non-zero PCA components: 25 components
Fitting ICA took 12.8s.
Applying ICA to Raw instance
    Transforming to ICA space (25 components)
    Zeroing out 7 ICA components
    Projecting back using 26 PCA components
✔ Step 7: ICA applied. Excluded comps:

## Load and check the processed data

In [46]:
# Sanity check of the saved auditory oddball arrays: every subject's feature file must
# have as many trials as its label file.
feature_path = Auditory_oddball_feature_path
label_path = Auditory_oddball_label_path

print("Loading auditory oddball trials:\n")
total_samples = 0
# os.listdir order is arbitrary; sort both listings so the files pair up by subject number
feature_files = sorted(f for f in os.listdir(feature_path) if f.endswith('.npy'))
label_files = sorted(f for f in os.listdir(label_path) if f.endswith('.npy'))
if len(feature_files) != len(label_files):
    raise ValueError(f"Found {len(feature_files)} feature files but {len(label_files)} label files")

for feature_file, label_file in zip(feature_files, label_files):
    sub_id = int(re.search(r'\d+', feature_file).group())
    label_sub_id = int(re.search(r'\d+', label_file).group())
    if sub_id != label_sub_id:
        raise ValueError(f"Feature file {feature_file} is paired with label file {label_file}")
    X = np.load(os.path.join(feature_path, feature_file))
    y = np.load(os.path.join(label_path, label_file))
    print(f"Subject {sub_id}: X shape: {X.shape}, y shape: {y.shape}")
    if X.shape[0] != y.shape[0]:
        # raising a bare string is a TypeError in Python 3; raise a real exception
        raise ValueError(f"Subject {sub_id} data and label length mismatch: "
                         f"{X.shape[0]} vs {y.shape[0]}")
    total_samples += X.shape[0]
print("\nTotal number of trials:", total_samples)

Loading auditory oddball trials:

Subject 1: X shape: (335, 200, 26), y shape: (335, 4)
Subject 13: X shape: (335, 200, 26), y shape: (335, 4)
Subject 14: X shape: (335, 200, 26), y shape: (335, 4)
Subject 15: X shape: (335, 200, 26), y shape: (335, 4)
Subject 16: X shape: (335, 200, 26), y shape: (335, 4)
Subject 17: X shape: (333, 200, 26), y shape: (333, 4)
Subject 18: X shape: (335, 200, 26), y shape: (335, 4)
Subject 20: X shape: (335, 200, 26), y shape: (335, 4)
Subject 21: X shape: (332, 200, 26), y shape: (332, 4)
Subject 22: X shape: (335, 200, 26), y shape: (335, 4)
Subject 23: X shape: (335, 200, 26), y shape: (335, 4)
Subject 24: X shape: (333, 200, 26), y shape: (333, 4)
Subject 25: X shape: (335, 200, 26), y shape: (335, 4)
Subject 26: X shape: (334, 200, 26), y shape: (334, 4)
Subject 27: X shape: (332, 200, 26), y shape: (332, 4)
Subject 28: X shape: (332, 200, 26), y shape: (332, 4)
Subject 29: X shape: (333, 200, 26), y shape: (333, 4)
Subject 30: X shape: (335, 200, 

In [47]:
# Sanity check of the saved visual oddball arrays: every subject's feature file must
# have as many trials as its label file.
feature_path = Visual_oddball_feature_path
label_path = Visual_oddball_label_path

print("Loading visual oddball trials:\n")
total_samples = 0
# os.listdir order is arbitrary; sort both listings so the files pair up by subject number
feature_files = sorted(f for f in os.listdir(feature_path) if f.endswith('.npy'))
label_files = sorted(f for f in os.listdir(label_path) if f.endswith('.npy'))
if len(feature_files) != len(label_files):
    raise ValueError(f"Found {len(feature_files)} feature files but {len(label_files)} label files")

for feature_file, label_file in zip(feature_files, label_files):
    sub_id = int(re.search(r'\d+', feature_file).group())
    label_sub_id = int(re.search(r'\d+', label_file).group())
    if sub_id != label_sub_id:
        raise ValueError(f"Feature file {feature_file} is paired with label file {label_file}")
    X = np.load(os.path.join(feature_path, feature_file))
    y = np.load(os.path.join(label_path, label_file))
    print(f"Subject {sub_id}: X shape: {X.shape}, y shape: {y.shape}")
    if X.shape[0] != y.shape[0]:
        # raising a bare string is a TypeError in Python 3; raise a real exception
        raise ValueError(f"Subject {sub_id} data and label length mismatch: "
                         f"{X.shape[0]} vs {y.shape[0]}")
    total_samples += X.shape[0]
print("\nTotal number of trials:", total_samples)

Loading visual oddball trials:

Subject 1: X shape: (210, 200, 26), y shape: (210, 4)
Subject 2: X shape: (210, 200, 26), y shape: (210, 4)
Subject 3: X shape: (210, 200, 26), y shape: (210, 4)
Subject 4: X shape: (210, 200, 26), y shape: (210, 4)
Subject 5: X shape: (210, 200, 26), y shape: (210, 4)
Subject 6: X shape: (210, 200, 26), y shape: (210, 4)
Subject 7: X shape: (210, 200, 26), y shape: (210, 4)
Subject 8: X shape: (210, 200, 26), y shape: (210, 4)
Subject 9: X shape: (210, 200, 26), y shape: (210, 4)
Subject 10: X shape: (210, 200, 26), y shape: (210, 4)
Subject 11: X shape: (210, 200, 26), y shape: (210, 4)
Subject 12: X shape: (210, 200, 26), y shape: (210, 4)
Subject 13: X shape: (210, 200, 26), y shape: (210, 4)
Subject 14: X shape: (210, 200, 26), y shape: (210, 4)
Subject 15: X shape: (210, 200, 26), y shape: (210, 4)
Subject 16: X shape: (210, 200, 26), y shape: (210, 4)
Subject 17: X shape: (210, 200, 26), y shape: (210, 4)
Subject 18: X shape: (210, 200, 26), y sha

In [48]:
# Sanity check of the saved flanker arrays: every subject's feature file must
# have as many trials as its label file.
feature_path = Flanker_feature_path
label_path = Flanker_label_path

print("Loading flanker trials:\n")
total_samples = 0
# os.listdir order is arbitrary; sort both listings so the files pair up by subject number
feature_files = sorted(f for f in os.listdir(feature_path) if f.endswith('.npy'))
label_files = sorted(f for f in os.listdir(label_path) if f.endswith('.npy'))
if len(feature_files) != len(label_files):
    raise ValueError(f"Found {len(feature_files)} feature files but {len(label_files)} label files")

for feature_file, label_file in zip(feature_files, label_files):
    sub_id = int(re.search(r'\d+', feature_file).group())
    label_sub_id = int(re.search(r'\d+', label_file).group())
    if sub_id != label_sub_id:
        raise ValueError(f"Feature file {feature_file} is paired with label file {label_file}")
    X = np.load(os.path.join(feature_path, feature_file))
    y = np.load(os.path.join(label_path, label_file))
    print(f"Subject {sub_id}: X shape: {X.shape}, y shape: {y.shape}")
    if X.shape[0] != y.shape[0]:
        # raising a bare string is a TypeError in Python 3; raise a real exception
        raise ValueError(f"Subject {sub_id} data and label length mismatch: "
                         f"{X.shape[0]} vs {y.shape[0]}")
    total_samples += X.shape[0]
print("\nTotal number of trials:", total_samples)

Loading flanker trials:

Subject 1: X shape: (410, 200, 26), y shape: (410, 4)
Subject 2: X shape: (410, 200, 26), y shape: (410, 4)
Subject 3: X shape: (410, 200, 26), y shape: (410, 4)
Subject 4: X shape: (410, 200, 26), y shape: (410, 4)
Subject 5: X shape: (410, 200, 26), y shape: (410, 4)
Subject 6: X shape: (410, 200, 26), y shape: (410, 4)
Subject 7: X shape: (410, 200, 26), y shape: (410, 4)
Subject 8: X shape: (410, 200, 26), y shape: (410, 4)
Subject 9: X shape: (410, 200, 26), y shape: (410, 4)
Subject 10: X shape: (410, 200, 26), y shape: (410, 4)
Subject 11: X shape: (410, 200, 26), y shape: (410, 4)
Subject 12: X shape: (410, 200, 26), y shape: (410, 4)
Subject 13: X shape: (410, 200, 26), y shape: (410, 4)
Subject 14: X shape: (410, 200, 26), y shape: (410, 4)
Subject 15: X shape: (410, 200, 26), y shape: (410, 4)
Subject 16: X shape: (410, 200, 26), y shape: (410, 4)
Subject 17: X shape: (410, 200, 26), y shape: (410, 4)
Subject 18: X shape: (410, 200, 26), y shape: (41