## Load Dataset

In [None]:
# Print the keys of `eeg_data`.
# NOTE(review): `eeg_data` is not defined in this cell; presumably it comes
# from an earlier notebook run — confirm before executing top to bottom.
print(eeg_data.keys())

In [None]:
import h5py
import mne 

# Paths to one subject's processed (ROI) file and raw EEG file.
file_roi = "G:\\共用雲端硬碟\\CNElab_黎承宣&賴璁毅_EEG_ROI\\A.Dataset\\processed_setfile\\processed_0_ICA_DLtrain.set"
file_eeg = "G:\\共用雲端硬碟\\CNElab_黎承宣&賴璁毅_EEG_ROI\\A.Dataset\\setfile\\0_ICA_DLtrain.set"

# The processed file is opened as HDF5 via h5py
# (presumably a MATLAB v7.3 save — TODO confirm).
with h5py.File(file_roi, 'r') as f:
    # ROI is only assigned (and its shape printed) when the file has a 'roi' group.
    if 'roi' in f:
        # [:] copies the dataset into memory so it stays usable after the file closes.
        ROI = f['roi']['source_voxel_data'][:]
        print(ROI.shape)

# The raw file is read with MNE's EEGLAB reader; preload=True loads the samples into memory.
EEG = mne.io.read_raw_eeglab(file_eeg, preload=True)
print(EEG.info)

## Dataset

In [1]:
import os
import numpy as np
import h5py
import mne
import json
import time
from torch.utils.data import Dataset


class EEGROIDataset(Dataset):
    """Paired EEG / ROI windows for the subjects listed in one group.

    On construction every subject of the selected group is loaded from disk
    and cut into fixed, non-overlapping windows that are kept in memory.
    Each item pairs one EEG window (returned as both ``src`` and ``tgt``)
    with the ROI window covering the same time span (``label``).
    """

    # Samples consumed per time unit by the windowing. 256 matches the EEG
    # recordings; 200 is presumably the ROI sampling rate — TODO confirm.
    EEG_SAMPLES_PER_UNIT = 256
    ROI_SAMPLES_PER_UNIT = 200
    # Length of one window, in time units; windows do not overlap.
    WINDOW_UNITS = 2
    # Shapes every subject file is required to have.
    EXPECTED_ROI_LEADING_SHAPE = (3, 5003)
    EXPECTED_EEG_CHANNELS = 30

    def __init__(self, roi_folder, eeg_folder, group_file, group_index, overlap=0.5, window_size=500):
        """
        Args:
            roi_folder (str): Path to the folder containing ROI .set files
                (named ``processed_<subject>_ICA_DLtrain.set``).
            eeg_folder (str): Path to the folder containing EEG .set files
                (named ``<subject>_ICA_DLtrain.set``).
            group_file (str): Path to a JSON file mapping group keys to lists
                of subject identifiers.
            group_index (int | str): Key of the group to load; it is converted
                with ``str()`` before the lookup.
            overlap (float): Stored on the instance but not used by the
                current windowing, which produces non-overlapping windows.
            window_size (int): Stored on the instance but not used by the
                current windowing, which uses the class-level constants.
        """
        self.roi_folder = roi_folder
        self.eeg_folder = eeg_folder
        self.group_file = group_file
        self.group_index = group_index
        self.overlap = overlap
        self.window_size = window_size
        self.subjects = self._get_subject_list()

        # Parallel lists: eeg_data[i] and roi_data[i] cover the same time span.
        self.eeg_data = []
        self.roi_data = []
        self._prepare_dataset()

    def _get_subject_list(self):
        """Return the subject identifiers stored under ``group_index`` in the group file.

        An unknown group key yields an empty list, which results in an empty
        dataset rather than an error.
        """
        with open(self.group_file, 'r') as f:
            groups = json.load(f)

        subject_indices = groups.get(str(self.group_index), [])
        print(subject_indices)
        return subject_indices

    def _prepare_dataset(self):
        """Load, validate and window the data of every subject in the group.

        Raises:
            KeyError: If a processed file has no ``'roi'`` group.
            ValueError: If the ROI or EEG array has an unexpected shape.
        """
        for subject in self.subjects:
            roi_path = os.path.join(self.roi_folder, f"processed_{subject}_ICA_DLtrain.set")
            eeg_path = os.path.join(self.eeg_folder, f"{subject}_ICA_DLtrain.set")

            # Load ROI data. Fail loudly when the group is missing: carrying on
            # would pair this subject's EEG with the previous subject's ROI.
            with h5py.File(roi_path, 'r') as f:
                if 'roi' not in f:
                    raise KeyError(f"No 'roi' group in {roi_path}")
                roi_data = f['roi']['source_voxel_data'][:]

            # Load EEG data
            eeg_data = mne.io.read_raw_eeglab(eeg_path, preload=True).get_data()

            self._check_shapes(roi_data, eeg_data)
            self._process_subject_data(roi_data, eeg_data)

    def _check_shapes(self, roi_data, eeg_data):
        """Raise ``ValueError`` unless both arrays have the expected leading dimensions."""
        # Explicit raises instead of assert so the checks survive ``python -O``.
        if tuple(roi_data.shape[:2]) != self.EXPECTED_ROI_LEADING_SHAPE:
            raise ValueError(f"Unexpected ROI shape: {roi_data.shape}")
        if eeg_data.shape[0] != self.EXPECTED_EEG_CHANNELS:
            raise ValueError(f"Unexpected EEG shape: {eeg_data.shape}")

    def _process_subject_data(self, roi_data, eeg_data):
        """Cut one subject into aligned, non-overlapping EEG / ROI windows.

        Args:
            roi_data: 3-D array; its last axis is windowed and the first two
                axes are flattened into one in each stored window.
            eeg_data: 2-D array; its last axis is windowed.

        Windowing stops at the first ROI window that is shorter than a full
        window, so the stored EEG and ROI lists stay the same length.
        """
        eeg_rate = self.EEG_SAMPLES_PER_UNIT
        roi_rate = self.ROI_SAMPLES_PER_UNIT
        span = self.WINDOW_UNITS
        eeg_window_size = eeg_rate * span
        roi_window_size = roi_rate * span

        # Whole time units available in the EEG, rounded down to a multiple
        # of the window length so every EEG window is complete.
        time_len = (eeg_data.shape[1] // eeg_rate // span) * span

        for start_idx in range(0, time_len, span):
            roi_step = start_idx * roi_rate
            roi_segment = roi_data[:, :, roi_step:roi_step + roi_window_size]
            if roi_segment.shape[2] != roi_window_size:
                # ROI recording ended before the EEG did.
                break

            eeg_step = start_idx * eeg_rate
            self.roi_data.append(roi_segment.reshape(-1, roi_window_size))
            self.eeg_data.append(eeg_data[:, eeg_step:eeg_step + eeg_window_size])

    def __len__(self):
        """Return the number of stored windows across all subjects."""
        return len(self.eeg_data)

    def __getitem__(self, idx):
        """Return window ``idx`` as a dict.

        ``src`` and ``tgt`` are the same EEG window, ``label`` is the matching
        ROI window, and both masks are ``None``.
        """
        return {
            "src": self.eeg_data[idx],
            "tgt": self.eeg_data[idx],
            "src_mask": None,
            "tgt_mask": None,
            "label": self.roi_data[idx]
        }

# Usage example: build two datasets from the same folders and group file.
roi_folder = "G:\\共用雲端硬碟\\CNElab_黎承宣&賴璁毅_EEG_ROI\\A.Dataset\\processed_setfile"
eeg_folder = "G:\\共用雲端硬碟\\CNElab_黎承宣&賴璁毅_EEG_ROI\\A.Dataset\\setfile"
group_file = "subject_groups.json"
group_index = 0

# Create the dataset for the group stored under key "0" (the index is
# converted with str() before the lookup).
train_dataset = EEGROIDataset(roi_folder, eeg_folder, group_file, group_index)
print(f"Total dataset size: {len(train_dataset)}")
# Create a second dataset from the group stored under key "train".
# NOTE(review): the variable is called test_dataset but loads the "train"
# group — confirm the group file has that key and that this is the intended split.
test_dataset = EEGROIDataset(roi_folder, eeg_folder, group_file, "train")
print(f"Total dataset size: {len(test_dataset)}")


['156', '314', '448', '282', '513', '194', '188', '417', '133', '492', '170', '387', '300', '11', '515', '161', '337', '377', '184', '333', '489', '402', '118', '137', '395', '79', '446']
Reading G:\共用雲端硬碟\CNElab_黎承宣&賴璁毅_EEG_ROI\A.Dataset\setfile\156_ICA_DLtrain.fdt
Reading 0 ... 58890  =      0.000 ...   230.039 secs...


  eeg_data = mne.io.read_raw_eeglab(eeg_path, preload=True).get_data()


Reading G:\共用雲端硬碟\CNElab_黎承宣&賴璁毅_EEG_ROI\A.Dataset\setfile\314_ICA_DLtrain.fdt
Reading 0 ... 77475  =      0.000 ...   302.637 secs...
Reading G:\共用雲端硬碟\CNElab_黎承宣&賴璁毅_EEG_ROI\A.Dataset\setfile\448_ICA_DLtrain.fdt
Reading 0 ... 77731  =      0.000 ...   303.637 secs...
Reading G:\共用雲端硬碟\CNElab_黎承宣&賴璁毅_EEG_ROI\A.Dataset\setfile\282_ICA_DLtrain.fdt
Reading 0 ... 68762  =      0.000 ...   268.602 secs...


  eeg_data = mne.io.read_raw_eeglab(eeg_path, preload=True).get_data()


Reading G:\共用雲端硬碟\CNElab_黎承宣&賴璁毅_EEG_ROI\A.Dataset\setfile\513_ICA_DLtrain.fdt
Reading 0 ... 71936  =      0.000 ...   281.000 secs...


  eeg_data = mne.io.read_raw_eeglab(eeg_path, preload=True).get_data()


Reading G:\共用雲端硬碟\CNElab_黎承宣&賴璁毅_EEG_ROI\A.Dataset\setfile\194_ICA_DLtrain.fdt
Reading 0 ... 77260  =      0.000 ...   301.797 secs...
Reading G:\共用雲端硬碟\CNElab_黎承宣&賴璁毅_EEG_ROI\A.Dataset\setfile\188_ICA_DLtrain.fdt
Reading 0 ... 76748  =      0.000 ...   299.797 secs...
Reading G:\共用雲端硬碟\CNElab_黎承宣&賴璁毅_EEG_ROI\A.Dataset\setfile\417_ICA_DLtrain.fdt
Reading 0 ... 76902  =      0.000 ...   300.398 secs...
Reading G:\共用雲端硬碟\CNElab_黎承宣&賴璁毅_EEG_ROI\A.Dataset\setfile\133_ICA_DLtrain.fdt
Reading 0 ... 70964  =      0.000 ...   277.203 secs...


  eeg_data = mne.io.read_raw_eeglab(eeg_path, preload=True).get_data()


Reading G:\共用雲端硬碟\CNElab_黎承宣&賴璁毅_EEG_ROI\A.Dataset\setfile\492_ICA_DLtrain.fdt
Reading 0 ... 59229  =      0.000 ...   231.363 secs...


  eeg_data = mne.io.read_raw_eeglab(eeg_path, preload=True).get_data()


Reading G:\共用雲端硬碟\CNElab_黎承宣&賴璁毅_EEG_ROI\A.Dataset\setfile\170_ICA_DLtrain.fdt
Reading 0 ... 77158  =      0.000 ...   301.398 secs...
Reading G:\共用雲端硬碟\CNElab_黎承宣&賴璁毅_EEG_ROI\A.Dataset\setfile\387_ICA_DLtrain.fdt
Reading 0 ... 77158  =      0.000 ...   301.398 secs...
Reading G:\共用雲端硬碟\CNElab_黎承宣&賴璁毅_EEG_ROI\A.Dataset\setfile\300_ICA_DLtrain.fdt
Reading 0 ... 60611  =      0.000 ...   236.762 secs...


  eeg_data = mne.io.read_raw_eeglab(eeg_path, preload=True).get_data()


Reading G:\共用雲端硬碟\CNElab_黎承宣&賴璁毅_EEG_ROI\A.Dataset\setfile\11_ICA_DLtrain.fdt
Reading 0 ... 77199  =      0.000 ...   301.559 secs...
Reading G:\共用雲端硬碟\CNElab_黎承宣&賴璁毅_EEG_ROI\A.Dataset\setfile\515_ICA_DLtrain.fdt
Reading 0 ... 59782  =      0.000 ...   233.523 secs...


  eeg_data = mne.io.read_raw_eeglab(eeg_path, preload=True).get_data()


Reading G:\共用雲端硬碟\CNElab_黎承宣&賴璁毅_EEG_ROI\A.Dataset\setfile\161_ICA_DLtrain.fdt
Reading 0 ... 80609  =      0.000 ...   314.879 secs...
Reading G:\共用雲端硬碟\CNElab_黎承宣&賴璁毅_EEG_ROI\A.Dataset\setfile\337_ICA_DLtrain.fdt
Reading 0 ... 79667  =      0.000 ...   311.199 secs...
Reading G:\共用雲端硬碟\CNElab_黎承宣&賴璁毅_EEG_ROI\A.Dataset\setfile\377_ICA_DLtrain.fdt
Reading 0 ... 76933  =      0.000 ...   300.520 secs...
Reading G:\共用雲端硬碟\CNElab_黎承宣&賴璁毅_EEG_ROI\A.Dataset\setfile\184_ICA_DLtrain.fdt
Reading 0 ... 80302  =      0.000 ...   313.680 secs...
Reading G:\共用雲端硬碟\CNElab_黎承宣&賴璁毅_EEG_ROI\A.Dataset\setfile\333_ICA_DLtrain.fdt
Reading 0 ... 62311  =      0.000 ...   243.402 secs...


  eeg_data = mne.io.read_raw_eeglab(eeg_path, preload=True).get_data()


Reading G:\共用雲端硬碟\CNElab_黎承宣&賴璁毅_EEG_ROI\A.Dataset\setfile\489_ICA_DLtrain.fdt
Reading 0 ... 87111  =      0.000 ...   340.277 secs...
Reading G:\共用雲端硬碟\CNElab_黎承宣&賴璁毅_EEG_ROI\A.Dataset\setfile\402_ICA_DLtrain.fdt
Reading 0 ... 65301  =      0.000 ...   255.082 secs...


  eeg_data = mne.io.read_raw_eeglab(eeg_path, preload=True).get_data()


Reading G:\共用雲端硬碟\CNElab_黎承宣&賴璁毅_EEG_ROI\A.Dataset\setfile\118_ICA_DLtrain.fdt
Reading 0 ... 78551  =      0.000 ...   306.840 secs...
Reading G:\共用雲端硬碟\CNElab_黎承宣&賴璁毅_EEG_ROI\A.Dataset\setfile\137_ICA_DLtrain.fdt
Reading 0 ... 81233  =      0.000 ...   317.316 secs...
Reading G:\共用雲端硬碟\CNElab_黎承宣&賴璁毅_EEG_ROI\A.Dataset\setfile\395_ICA_DLtrain.fdt
Reading 0 ... 87819  =      0.000 ...   343.043 secs...


  eeg_data = mne.io.read_raw_eeglab(eeg_path, preload=True).get_data()


Reading G:\共用雲端硬碟\CNElab_黎承宣&賴璁毅_EEG_ROI\A.Dataset\setfile\79_ICA_DLtrain.fdt
Reading 0 ... 83445  =      0.000 ...   325.957 secs...
Reading G:\共用雲端硬碟\CNElab_黎承宣&賴璁毅_EEG_ROI\A.Dataset\setfile\446_ICA_DLtrain.fdt
Reading 0 ... 61604  =      0.000 ...   240.641 secs...


  eeg_data = mne.io.read_raw_eeglab(eeg_path, preload=True).get_data()


Total dataset size: 1922
['49', '422', '90', '345', '364', '23']
Reading G:\共用雲端硬碟\CNElab_黎承宣&賴璁毅_EEG_ROI\A.Dataset\setfile\49_ICA_DLtrain.fdt
Reading 0 ... 59760  =      0.000 ...   233.438 secs...


  eeg_data = mne.io.read_raw_eeglab(eeg_path, preload=True).get_data()


Reading G:\共用雲端硬碟\CNElab_黎承宣&賴璁毅_EEG_ROI\A.Dataset\setfile\422_ICA_DLtrain.fdt
Reading 0 ... 76799  =      0.000 ...   299.996 secs...
Reading G:\共用雲端硬碟\CNElab_黎承宣&賴璁毅_EEG_ROI\A.Dataset\setfile\90_ICA_DLtrain.fdt
Reading 0 ... 59464  =      0.000 ...   232.281 secs...


  eeg_data = mne.io.read_raw_eeglab(eeg_path, preload=True).get_data()


Reading G:\共用雲端硬碟\CNElab_黎承宣&賴璁毅_EEG_ROI\A.Dataset\setfile\345_ICA_DLtrain.fdt
Reading 0 ... 80220  =      0.000 ...   313.359 secs...
Reading G:\共用雲端硬碟\CNElab_黎承宣&賴璁毅_EEG_ROI\A.Dataset\setfile\364_ICA_DLtrain.fdt
Reading 0 ... 77168  =      0.000 ...   301.438 secs...
Reading G:\共用雲端硬碟\CNElab_黎承宣&賴璁毅_EEG_ROI\A.Dataset\setfile\23_ICA_DLtrain.fdt
Reading 0 ... 60407  =      0.000 ...   235.965 secs...
Total dataset size: 399


  eeg_data = mne.io.read_raw_eeglab(eeg_path, preload=True).get_data()


In [None]:
import os
import numpy as np
import h5py
import mne
from torch.utils.data import Dataset

roi_folder = "G:\\共用雲端硬碟\\CNElab_黎承宣&賴璁毅_EEG_ROI\\A.Dataset\\processed_setfile"
eeg_folder = "G:\\共用雲端硬碟\\CNElab_黎承宣&賴璁毅_EEG_ROI\\A.Dataset\\setfile"
# Subject identifier is the second '_'-separated field of each processed
# file name (processed_<subject>_ICA_DLtrain.set).
roi_folder_ls = [f.split('_')[1] for f in os.listdir(roi_folder) if f.endswith('.set')]

# One (roi_shape, eeg_shape) tuple per subject that could be loaded.
shape_list = []

for subject in roi_folder_ls:
    roi_path = os.path.join(roi_folder, f"processed_{subject}_ICA_DLtrain.set")
    eeg_path = os.path.join(eeg_folder, f"{subject}_ICA_DLtrain.set")

    # Load ROI data. Skip files without a 'roi' group; otherwise the shape
    # of the previous subject would be recorded for this one.
    with h5py.File(roi_path, 'r') as f:
        if 'roi' not in f:
            print(f"Skipping {subject}: no 'roi' group in {roi_path}")
            continue
        roi_data = f['roi']['source_voxel_data'][:]
    print(roi_data.shape)

    # Load EEG data
    eeg_data = mne.io.read_raw_eeglab(eeg_path, preload=True).get_data()
    print(eeg_data.shape)
    shape_list.append((roi_data.shape, eeg_data.shape))

## Save File Name

In [None]:
import os
import json
import random

def save_filenames_to_file(folder_path, output_file):
    """
    Write the names of the regular files in a folder to a JSON file.

    Sub-directories are ignored; only entries for which
    ``os.path.isfile`` is true are recorded.

    Args:
        folder_path (str): Folder whose direct entries are listed.
        output_file (str): Destination path of the JSON list.
    """
    names = []
    for entry in os.listdir(folder_path):
        if os.path.isfile(os.path.join(folder_path, entry)):
            names.append(entry)

    # Serialize first, then write the finished text in one go.
    payload = json.dumps(names, indent=4)
    with open(output_file, 'w') as handle:
        handle.write(payload)

    print(f"Saved {len(names)} filenames to {output_file}")

def get_common_subjects(folder1, folder2):
    """
    Finds the subjects that have a ``.set`` file in both folders.

    The subject identifier is taken from a different position in each
    folder's file names: the first ``_``-separated field for ``folder1``
    (e.g. ``<subject>_ICA_DLtrain.set``) and the second field for
    ``folder2`` (e.g. ``processed_<subject>_ICA_DLtrain.set``).

    Args:
        folder1 (str): Folder whose file names start with the subject.
        folder2 (str): Folder whose file names carry the subject as the
            second ``_``-separated field.

    Returns:
        list: Common subject identifiers, sorted as strings so the result
        is the same on every run (plain set iteration order is not).
    """
    files1 = {f.split('_')[0] for f in os.listdir(folder1) if f.endswith('.set')}
    files2 = {f.split('_')[1] for f in os.listdir(folder2) if f.endswith('.set')}
    print(len(files1), len(files2))
    return sorted(files1 & files2)

def split_subjects_into_groups(subjects, num_groups):
    """
    Splits subjects into a specified number of random groups.

    The subjects are shuffled and then dealt round-robin, so group sizes
    differ by at most one. The shuffle is done on a copy; the caller's
    list keeps its order.

    Args:
        subjects (list): List of subject identifiers.
        num_groups (int): Number of groups to split into.

    Returns:
        dict: Dictionary where keys are group indices (0 .. num_groups - 1)
        and values are lists of subjects.
    """
    # Copy first: random.shuffle works in place and would otherwise
    # reorder the list owned by the caller.
    shuffled = list(subjects)
    random.shuffle(shuffled)

    groups = {i: [] for i in range(num_groups)}
    for idx, subject in enumerate(shuffled):
        groups[idx % num_groups].append(subject)
    return groups

# Example usage
# folder1 holds the raw files (subject is the first '_' field of the name),
# folder2 the processed files (subject is the second '_' field).
folder2 = "G:\\共用雲端硬碟\\CNElab_黎承宣&賴璁毅_EEG_ROI\\A.Dataset\\processed_setfile"
folder1 = "G:\\共用雲端硬碟\\CNElab_黎承宣&賴璁毅_EEG_ROI\\A.Dataset\\setfile"
output_file = "common_subjects.json"
num_groups = 20

# Find common subjects
common_subjects = get_common_subjects(folder1, folder2)

# Save common subjects to a file
with open(output_file, 'w') as f:
    json.dump(common_subjects, f, indent=4)
print(f"Saved {len(common_subjects)} common subjects to {output_file}")

# Split common subjects into groups
# NOTE(review): no random seed is set here, so each run produces a different split.
groups = split_subjects_into_groups(common_subjects, num_groups)
group_file = "subject_groups.json"
# json.dump writes the integer group indices as string keys ("0", "1", ...),
# which is why EEGROIDataset looks groups up with str(group_index).
with open(group_file, 'w') as f:
    json.dump(groups, f, indent=4)
print(f"Saved subject groups to {group_file}")
