In [14]:
import os
import re
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import mne
from torch.utils.data import Dataset, DataLoader, random_split

from sklearn.metrics import accuracy_score


In [15]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

task_mapping = {
    # 1: "baseline", 2: "baseline",
    3: "task1", 7: "task1", 11: "task1",
    4: "task2",  8: "task2",12: "task2",
    5: "task3",  9: "task3", 13: "task3",
    6: "task4", 10: "task4", 14: "task4"
}

In [16]:
def make_sliding_epochs_with_offset(raw, duration, overlap, offset_sec=0.0):
    raw_offset = raw.copy()
    raw_offset.crop(tmin=offset_sec, tmax=None)
    epochs = mne.make_fixed_length_epochs(
        raw_offset, duration=duration, overlap=overlap, preload=True, verbose=False
    )
    return epochs


def load_eeg_data(edf_file_path, baseline_files=None):
    raw = mne.io.read_raw_edf(edf_file_path, preload=True, verbose=False)
    raw.pick(['Cz..', 'Oz..', 'T7..'])
    raw.filter(1., 40., fir_design='firwin', verbose=False)

    event_times = raw.annotations.onset
    event_labels = raw.annotations.description
    label_mapping = {'T0': 0, 'T1': 1, 'T2': 2}
    event_ids = np.array([label_mapping[label] for label in event_labels])

    events = np.zeros((len(event_times), 3), dtype=int)
    events[:, 0] = (event_times * raw.info['sfreq']).astype(int)
    events[:, -1] = event_ids
    

    event_id = {key: value for key, value in label_mapping.items()}

    epochs = mne.Epochs(
        raw, events, event_id=event_id, tmin=0, tmax=2,
        baseline=None, preload=True, verbose=False
    )

    # if baseline_files:
    #     baseline_data = []
    #     for baseline_file in baseline_files:

    #         baseline_raw = mne.io.read_raw_edf(baseline_file, preload=True, verbose=False)
    #         baseline_raw.pick(['Cz..', 'C3..', 'C4..'])
    #         baseline_data.append(baseline_raw.get_data())

    #     baseline_mean = np.mean(np.concatenate(baseline_data, axis=0), axis=0)
    #     epochs._data -= baseline_mean

    return epochs


In [17]:
root_dir = "./files/"

data_dict = {}


i = 0

for subject in sorted(os.listdir(root_dir)):
    subject_path = os.path.join(root_dir, subject)
    
    if os.path.isdir(subject_path) and re.match(r"S\d{3}", subject):
        # control the number of persons
        # if(i < 100):
        #     i+=1

        # else:
        #     break

        edf_files = sorted([f for f in os.listdir(subject_path) if f.endswith(".edf")])

        baseline_files = [os.path.join(subject_path, f) for f in edf_files if re.match(rf"{subject}R0[12]\.edf", f)]

        for edf_file in edf_files:
            match = re.match(r"(S\d{3})R(\d{2})\.edf", edf_file)
            if match:
                subject_id, session_id = match.groups()
                session_id = int(session_id)

                if session_id in task_mapping:
                    task = task_mapping[session_id]
                    full_path = os.path.join(subject_path, edf_file)

                    if subject_id not in data_dict:
                        data_dict[subject_id] = {task: []}
                    if task not in data_dict[subject_id]:
                        data_dict[subject_id][task] = []
                    
                    data_dict[subject_id][task].append((full_path, baseline_files))
                    # data_dict[subject_id][task].append(full_path)



In [18]:
train_files, test_files = [], []

for subject_id, tasks in data_dict.items():
    for task, file_list in tasks.items():
        if len(file_list) >= 3:
            train_files.extend(file_list[:2])
            test_files.append(file_list[2])

def extract_person_id(file_path):
    match = re.search(r'S(\d+)/S\d+R\d+\.edf$', file_path)
    return int(match.group(1)) if match else None

In [19]:
train_files

[('./files/S001/S001R03.edf',
  ['./files/S001/S001R01.edf', './files/S001/S001R02.edf']),
 ('./files/S001/S001R07.edf',
  ['./files/S001/S001R01.edf', './files/S001/S001R02.edf']),
 ('./files/S001/S001R04.edf',
  ['./files/S001/S001R01.edf', './files/S001/S001R02.edf']),
 ('./files/S001/S001R08.edf',
  ['./files/S001/S001R01.edf', './files/S001/S001R02.edf']),
 ('./files/S001/S001R05.edf',
  ['./files/S001/S001R01.edf', './files/S001/S001R02.edf']),
 ('./files/S001/S001R09.edf',
  ['./files/S001/S001R01.edf', './files/S001/S001R02.edf']),
 ('./files/S001/S001R06.edf',
  ['./files/S001/S001R01.edf', './files/S001/S001R02.edf']),
 ('./files/S001/S001R10.edf',
  ['./files/S001/S001R01.edf', './files/S001/S001R02.edf']),
 ('./files/S002/S002R03.edf',
  ['./files/S002/S002R01.edf', './files/S002/S002R02.edf']),
 ('./files/S002/S002R07.edf',
  ['./files/S002/S002R01.edf', './files/S002/S002R02.edf']),
 ('./files/S002/S002R04.edf',
  ['./files/S002/S002R01.edf', './files/S002/S002R02.edf']),

In [20]:
# class EEGMotorImageryDataset(Dataset):
#     def __init__(self, file_list):
#         self.file_list = file_list
#         self.data = []
#         self.labels = []


#         for file_path, baseline_files in self.file_list:
#             epochs = load_eeg_data(file_path, baseline_files)

#             self.data.append(epochs.get_data())
#             self.labels.append(epochs.events[:, -1])

#         self.data = np.concatenate(self.data, axis=0).astype(np.float32)
#         self.labels = np.concatenate(self.labels, axis=0)

#     def __len__(self):
#         return len(self.data)

#     def __getitem__(self, idx):
#         sample = self.data[idx]
#         label = self.labels[idx]
#         sample = sample[np.newaxis, :, :]
#         return torch.tensor(sample), torch.tensor(label)

# train_dataset = EEGMotorImageryDataset(train_files)
# test_dataset = EEGMotorImageryDataset(test_files)

# batch_size = 8
# train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
# test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)


In [21]:
# import numpy as np

# def convert_dataloader_to_numpy(dataloader):
#     X_list, y_list = [], []

#     for X_batch, y_batch in dataloader:
#         X_list.append(X_batch.numpy())
#         y_list.append(y_batch.numpy())

#     X_array = np.concatenate(X_list, axis=0)
#     y_array = np.concatenate(y_list, axis=0)

#     return X_array, y_array

# X_train, y_train = convert_dataloader_to_numpy(train_loader)
# X_test, y_test = convert_dataloader_to_numpy(test_loader)

# X_train = X_train.reshape(X_train.shape[0], -1)
# X_test = X_test.reshape(X_test.shape[0], -1)

# print(f"Train_shape: {X_train.shape}, Test_shape: {X_test.shape}")


In [22]:
class EEGMotorImageryDataset(Dataset):
    def __init__(self, file_list):
        self.file_list = file_list
        self.data = []
        self.labels = []
        self.person_ids = []  # Store person IDs as a list initially

        for file_path, baseline_files in self.file_list:
            epochs = load_eeg_data(file_path, baseline_files)
            num_samples = epochs.get_data().shape[0]  # Number of samples in this EEG file

            person_id = extract_person_id(file_path)  # Extract person ID from full path

            self.data.append(epochs.get_data())  # Append EEG data
            self.labels.append(epochs.events[:, -1])  # Append labels
            self.person_ids.append(np.full(num_samples, person_id))  # Append person ID array

        # Convert lists to NumPy arrays
        self.data = np.concatenate(self.data, axis=0).astype(np.float32)
        self.labels = np.concatenate(self.labels, axis=0)
        self.person_ids = np.concatenate(self.person_ids, axis=0)  # Now concatenate correctly

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        sample = self.data[idx]
        label = self.labels[idx]
        person_id = self.person_ids[idx]

        sample = sample[np.newaxis, :, :]  # Add channel dimension
        return torch.tensor(sample), torch.tensor(label), torch.tensor(person_id)

train_dataset = EEGMotorImageryDataset(train_files)
test_dataset = EEGMotorImageryDataset(test_files)

  raw = mne.io.read_raw_edf(edf_file_path, preload=True, verbose=False)
  raw = mne.io.read_raw_edf(edf_file_path, preload=True, verbose=False)
  raw = mne.io.read_raw_edf(edf_file_path, preload=True, verbose=False)
  raw = mne.io.read_raw_edf(edf_file_path, preload=True, verbose=False)
  raw = mne.io.read_raw_edf(edf_file_path, preload=True, verbose=False)
  raw = mne.io.read_raw_edf(edf_file_path, preload=True, verbose=False)
  raw = mne.io.read_raw_edf(edf_file_path, preload=True, verbose=False)
  raw = mne.io.read_raw_edf(edf_file_path, preload=True, verbose=False)


ValueError: all the input array dimensions except for the concatenation axis must match exactly, but along dimension 2, the array at index 0 has size 321 and the array at index 696 has size 257

In [None]:
# Convert dataset to NumPy arrays
x_train = np.stack([train_dataset[i][0].numpy() for i in range(len(train_dataset))])  # EEG data
y_label_train = np.array([train_dataset[i][1].item() for i in range(len(train_dataset))])   # Labels
y_personid_train = np.array([train_dataset[i][2].item() for i in range(len(train_dataset))])  # Person IDs
x_train = x_train.reshape(x_train.shape[0], -1)


x_train_with_label = np.column_stack((x_train, y_label_train))
print("New shape of x_train_with_label:", x_train_with_label.shape)

print("x_train shape:", x_train.shape)  # Expected: (num_samples, 1, channels, time)
print("y_label_train shape:", y_label_train.shape)  # Expected: (num_samples,)
print("y_personid_train shape:", y_personid_train.shape)  # Expected: (num_samples,)

x_test = np.stack([test_dataset[i][0].numpy() for i in range(len(test_dataset))])  # EEG data
y_label_test = np.array([test_dataset[i][1].item() for i in range(len(test_dataset))])   # Labels
y_personid_test = np.array([test_dataset[i][2].item() for i in range(len(test_dataset))])  # Person IDs
x_test = x_test.reshape(x_test.shape[0], -1)

x_test_with_label = np.column_stack((x_test, y_label_test))
print("New shape of x_train_with_label:", x_test_with_label.shape)

print("x_test shape:", x_test.shape)  # Expected: (num_samples, 1, channels, time)
print("y_label_test shape:", y_label_test.shape)  # Expected: (num_samples,)
print("y_personid_test shape:", y_personid_test.shape)  # Expected: (num_samples,)

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
x_train_with_label = scaler.fit_transform(x_train_with_label)
x_test_with_label = scaler.transform(x_test_with_label)


from sklearn.preprocessing import LabelEncoder

personid_encoder = LabelEncoder()
y_personid_train = personid_encoder.fit_transform(y_personid_train)
y_personid_test = personid_encoder.transform(y_personid_test)

New shape of x_train_with_label: (25189, 964)
x_train shape: (25189, 963)
y_label_train shape: (25189,)
y_personid_train shape: (25189,)
New shape of x_train_with_label: (12600, 964)
x_test shape: (12600, 963)
y_label_test shape: (12600,)
y_personid_test shape: (12600,)


In [None]:
np.unique(y_personid_train, return_counts=True)

(array([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,
         13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,
         26,  27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,
         39,  40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,
         52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,
         65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,
         78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,
         91,  92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103,
        104]),
 array([240, 240, 240, 240, 240, 240, 240, 240, 240, 240, 240, 240, 240,
        240, 240, 240, 240, 240, 240, 240, 240, 240, 240, 240, 240, 240,
        240, 240, 240, 240, 240, 240, 240, 240, 240, 240, 240, 240, 240,
        240, 240, 240, 240, 240, 240, 240, 240, 240, 240, 240, 240, 240,
        240, 240, 240, 240, 240, 240, 240, 240, 240, 240, 240, 240, 240,
        240, 240, 240, 240, 240, 240

## Using SVM

In [None]:
from sklearn.svm import SVC

model = SVC(kernel='rbf', C=1.0)
model.fit(x_train_with_label, y_personid_train)
y_pred = model.predict(x_test_with_label)
acc = accuracy_score(y_personid_test, y_pred)
print(f"Accuracy: {acc:.2%}")

In [None]:
np.unique(y_pred, return_counts=True)

(array([0, 1, 2]), array([2669,  247,  564]))

## Using XGBoost

In [None]:
import xgboost as xgb

dtrain = xgb.DMatrix(x_train_with_label, label=y_personid_train)
dtest = xgb.DMatrix(x_test_with_label, label=y_personid_test)

params = {
    "objective": "multi:softmax",
    "num_class": len(np.unique(y_personid_train)),
    "eval_metric": "mlogloss",
    "max_depth": 6,
    "eta": 0.1,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "seed": 42
}

num_rounds = 100
bst = xgb.train(params, dtrain, num_rounds)

y_pred = bst.predict(dtest)
accuracy = accuracy_score(y_personid_test, y_pred)
print(f"Accuracy: {accuracy:.2%}")

Accuracy: 19.37%


In [None]:
np.unique(y_pred, return_counts=True)

(array([  0.,   1.,   2.,   3.,   4.,   5.,   6.,   7.,   8.,   9.,  10.,
         11.,  12.,  13.,  14.,  15.,  16.,  17.,  18.,  19.,  20.,  21.,
         22.,  23.,  24.,  25.,  26.,  27.,  28.,  29.,  30.,  31.,  32.,
         33.,  34.,  35.,  36.,  37.,  38.,  39.,  40.,  41.,  42.,  43.,
         44.,  45.,  46.,  47.,  48.,  49.,  50.,  51.,  52.,  53.,  54.,
         55.,  56.,  57.,  58.,  59.,  60.,  61.,  62.,  63.,  64.,  65.,
         66.,  67.,  68.,  69.,  70.,  71.,  72.,  73.,  74.,  75.,  76.,
         77.,  78.,  79.,  80.,  81.,  82.,  83.,  84.,  85.,  86.,  87.,
         88.,  89.,  90.,  91.,  92.,  93.,  94.,  95.,  96.,  97.,  98.,
         99., 100., 101., 102., 103., 104.], dtype=float32),
 array([ 77, 152,  74, 228, 130, 175,  99, 203, 106,  75, 185, 207,  76,
        172,  60, 145,  67, 119, 122, 115, 170,  85,  87, 123,  99, 110,
         65, 120, 209, 145,  71,  98, 147, 142, 208,  55, 114, 157,  84,
        155, 135, 155,  76, 103,  51,  69, 113,  83,  