In [33]:
import numpy as np
import matplotlib.pyplot as plt

# _TODO_

# Import EEG data
Imports the `pre-processed.p` file generated by the [pre-processing notebook](preprocessing.ipynb).

Shapes:
- X: `(subject, trials, components, bands)`
- y: `(subject, trials)`

In [34]:
import pickle

with open('preprocessed.p', 'rb') as f:
    X, y = pickle.load(f)

n_subjects = X.shape[0]
for i, subj in enumerate(X):
    n_trials, n_components, n_bands = subj.shape
    print(f"Subject {i+1}: {n_trials} trials, {n_components} components, {n_bands} bands average power")

Subject 1: 434 trials, 6 components, 2 bands average power
Subject 2: 434 trials, 6 components, 2 bands average power
Subject 3: 434 trials, 6 components, 2 bands average power
Subject 4: 434 trials, 6 components, 2 bands average power
Subject 5: 434 trials, 6 components, 2 bands average power
Subject 6: 434 trials, 6 components, 2 bands average power


In [35]:
# # Dummy data (random matrix with same shape as EEG data)
# rng = np.random.default_rng(42)
# X = rng.standard_normal((6, 434, 6, 2)).astype(np.float32)
# y = rng.integers(0, 2, size=(6, 434), dtype=np.int8)

# n_subjects = X.shape[0]
# for i, subj in enumerate(X):
#     n_trials, n_components, n_bands = subj.shape
#     print(f"Subject {i+1}: {n_trials} trials, {n_components} components, {n_bands} bands' average power")

# SVM

In [36]:
# rows = one per trial and subject, columns = features (average power of two bands for each component)
X_svm = X.reshape(n_subjects * n_trials, n_components * n_bands)  
y_svm = y.reshape(n_subjects * n_trials)
# for each row, which subject it belongs to
subject_ids = np.repeat(np.arange(n_subjects), n_trials)  # shape = (n_subjects * n_trials,)

## Inter-subject (LeaveOneGroupOut)

In [37]:
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.svm import SVC

scores = [] # one accuracy score per subject left out
for train_idx, test_idx in LeaveOneGroupOut().split(X_svm, y_svm, groups=subject_ids):  # number of iterations = n_subjects
    X_train, X_test = X_svm[train_idx], X_svm[test_idx]
    y_train, y_test = y_svm[train_idx], y_svm[test_idx]

    print("\nSubject left out for testing:", subject_ids[test_idx[0]])
    print(f"Training on {X_train.shape[0]} samples, testing on {X_test.shape[0]} samples")
    print("Training set class distribution (0s and 1s):", np.bincount(y_train))
    print("Testing set class distribution (0s and 1s):", np.bincount(y_test))

    clf = SVC(kernel='linear')
    clf.fit(X_train, y_train)
    score = clf.score(X_test, y_test)
    print("Testing set predictions accuracy:", score)
    scores.append(score)

print("\nAverage cross-subject accuracy:", np.mean(scores))


Subject left out for testing: 0
Training on 2170 samples, testing on 434 samples
Training set class distribution (0s and 1s): [1127 1043]
Testing set class distribution (0s and 1s): [217 217]
Testing set predictions accuracy: 0.5

Subject left out for testing: 1
Training on 2170 samples, testing on 434 samples
Training set class distribution (0s and 1s): [1106 1064]
Testing set class distribution (0s and 1s): [238 196]
Testing set predictions accuracy: 0.511520737327189

Subject left out for testing: 2
Training on 2170 samples, testing on 434 samples
Training set class distribution (0s and 1s): [1122 1048]
Testing set class distribution (0s and 1s): [222 212]
Testing set predictions accuracy: 0.5069124423963134

Subject left out for testing: 3
Training on 2170 samples, testing on 434 samples
Training set class distribution (0s and 1s): [1123 1047]
Testing set class distribution (0s and 1s): [221 213]
Testing set predictions accuracy: 0.511520737327189

Subject left out for testing: 4


## Intra-subject (K-fold CV)

In [43]:
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

outer_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
inner_cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

pipe = Pipeline([
    ("scaler", StandardScaler()),
    ("svm", SVC(kernel="linear"))
])

# Parameters that will be tested in the inner cross-validation:
# for each outer fold, the inner CV will select the best C among these values
param_grid = {
    "svm__C": [0.01, 0.1, 1, 10]    # Possible values for the SVM regularization parameter C
}

scores = []     # to store the accuracy scores from each outer fold
best_Cs = []    # to store the best C found in each outer fold
for train_idx, test_idx in outer_cv.split(X_svm, y_svm):
    X_train, X_test = X_svm[train_idx], X_svm[test_idx]
    y_train, y_test = y_svm[train_idx], y_svm[test_idx]

    grid = GridSearchCV(
        pipe,
        param_grid,
        cv=inner_cv,
        scoring="accuracy"
    )

    grid.fit(X_train, y_train)
    scores.append(grid.score(X_test, y_test))
    best_Cs.append(grid.best_params_["svm__C"])

    print(f"\nOuter fold {len(scores)}:")
    print(f"Training on {X_train.shape[0]} samples, testing on {X_test.shape[0]} samples")
    print("Training set class distribution (0s and 1s):", np.bincount(y_train))
    print("Testing set class distribution (0s and 1s):", np.bincount(y_test))
    print("Mean accuracy on test set:", scores[-1])
    print("Best C found in inner CV:", best_Cs[-1])

print("\nOverall average accuracy across outer folds:", np.mean(scores))



Outer fold 1:
Training on 2083 samples, testing on 521 samples
Training set class distribution (0s and 1s): [1075 1008]
Testing set class distribution (0s and 1s): [269 252]
Mean accuracy on test set: 0.5489443378119002
Best C found in inner CV: 0.1

Outer fold 2:
Training on 2083 samples, testing on 521 samples
Training set class distribution (0s and 1s): [1075 1008]
Testing set class distribution (0s and 1s): [269 252]
Mean accuracy on test set: 0.5412667946257198
Best C found in inner CV: 1

Outer fold 3:
Training on 2083 samples, testing on 521 samples
Training set class distribution (0s and 1s): [1075 1008]
Testing set class distribution (0s and 1s): [269 252]
Mean accuracy on test set: 0.5758157389635317
Best C found in inner CV: 0.1

Outer fold 4:
Training on 2083 samples, testing on 521 samples
Training set class distribution (0s and 1s): [1075 1008]
Testing set class distribution (0s and 1s): [269 252]
Mean accuracy on test set: 0.5393474088291746
Best C found in inner CV: 10