# Main analysis

The primary analysis for the thesis, where we train a classifier for the code vs prose task.

In [None]:
# Imports
import re
import logging
from pathlib import Path
from datetime import datetime, timezone
from typing import Dict

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from eegclassify import main, load, clean, features, preprocess

logger = logging.getLogger(__name__)

# Set this to True to run on testing data
# NOTE(review): this only sets the PYTEST_CURRENT_TEST environment variable, and it does
# so after `eegclassify` was imported above. Presumably the package checks the variable
# lazily (at call time) rather than at import time; confirm.
simulate_test = False
if simulate_test:
    import os
    os.environ['PYTEST_CURRENT_TEST'] = "true"
    
# Configuration
# use_bandpass_filter: when True, the bandpass cell overwrites the EEG columns with the output of clean.filter()
# classify_breaks: when True, the gaps between stimuli are added to the markers as extra "relax" rows
use_bandpass_filter = True
classify_breaks = False
    
# Constants
sfreq = 256  # sampling frequency of the Muse S (samples per second; used to convert durations to sample counts)

In [None]:
%%javascript
// Set the document title so that time spent working on this notebook can be tracked with ActivityWatch.
document.title='erb-thesis/Main - Jupyter'

### Loading EEG data

First we need to load the EEG data used during the experiments.

In [None]:
data_dir = Path('../data').resolve()

# Example recording, available in git
file_example = data_dir / 'eeg/muse/subject0000/session001/recording_2021-04-02-14.03.36.csv'

# Private data (only used when present on this machine).
# The directory is resolved relative to the current user's home instead of a hard-coded
# /home/<user> path, matching how the private marker files are located.
files_private = []
path_private = Path('~/.eegnb/data/test/local/museS/subject0001/session001/').expanduser()
if path_private.exists():
    # glob() yields files in arbitrary filesystem order; sort for a deterministic file list
    files_private = sorted(path_private.glob('*.csv'))

files = [
    file_example,
    *files_private,
]

# Load all recordings, then index the samples by their 'timestamp' column in
# chronological order (the epoch cell below slices this index by time range).
eeg = load.load_eeg(files)
eeg = eeg.set_index('timestamp').sort_index()
print(eeg.shape)

### Loading markers

Now we need to load the markers produced during the experiment, so we can annotate the EEG data.

In [None]:
# Marker file for the example session (available in git)
marker_files = [
    data_dir / 'tasks/visual-codeprose/subject0000/session000/subject0_session0_behOutput_2021-04-02-14.28.30.csv',
]

# Private marker files, located under the current user's home directory
marker_files_private = [
    Path('~/.eegnb/data/visual-codeprose/local/none/session000/subject1_session0_behOutput_2021-03-26-14.31.14.csv').expanduser()
]

# Only keep the private files that actually exist on this machine
marker_files.extend(path for path in marker_files_private if path.exists())


def _build_breaks(df):
    starts = df['t_answered'].iloc[:-1].shift()
    starts_utc = df['t_answered_utc'].iloc[:-1].shift()
    stops = df['t_presented'].iloc[1:]
    stops_utc = df['t_presented_utc'].iloc[1:]
    
    breaks = pd.DataFrame({
        "t_presented": starts, 
        "t_answered": stops, 
        "t_presented_utc": starts_utc, 
        "t_answered_utc": stops_utc, 
        "type": "relax", 
        "duration": stops - starts, 
        'subject': df['subject'],
        'image_path': 'none',
        'response': 'up',  # as placeholder
    })
    return breaks

# Load every marker file, derive the epoch duration and tag the rows with the subject id
dfs = []
for file in marker_files:
    df = pd.read_csv(file, index_col=0)
    df['duration'] = df['t_answered'] - df['t_presented']
    # The subject id is taken from the first "subject<digits>" occurrence in the path.
    # Raw string, so that `\d` is a regex digit class and not an invalid string escape.
    match = re.search(r'subject(\d+)', str(file))
    assert match, f"could not find a subject id in {file}"
    df['subject'] = int(match.group(1))

    if classify_breaks:
        breaks = _build_breaks(df)
        # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement
        df = pd.concat([df, breaks])
    dfs.append(df)
df_markers = pd.concat(dfs).sort_values(by=['subject', 't_presented'])

# Filter away rows where the subject spent 5 seconds or less with the task
# NOTE(review): this comment used to say "at least 10 seconds" while the code has
# always used 5; confirm which threshold is intended.
n_prev = len(df_markers)
df_markers = df_markers[df_markers['duration'] > 5]
print(f"Filtered away {n_prev - len(df_markers)} epochs due to short duration")

# Filter away rows where space was clicked (didn't answer/skipped/unsure?)
n_prev = len(df_markers)
# .copy() so that the column assignment below writes to an independent frame
# instead of a filtered view (avoids SettingWithCopyWarning)
df_markers = df_markers[df_markers['response'].isin(['up', 'down'])].copy()
print(f"Filtered away {n_prev - len(df_markers)} epochs due skipped by subject")

# Keep only the file name of each stimulus image
df_markers['img'] = df_markers['image_path'].apply(lambda c: c.split("/")[-1])

# Preview first 5 rows
df_markers.drop(columns=['image_path']).head()

### Preprocessing

Now we need to preprocess the data a bit, gathering the EEG data for each epoch in the experiment.

 - [ ] Better cleaning/rejection of bad epochs/windows/samples

In [None]:
# Show the first `sfreq` samples of the unfiltered signal, one image row per EEG column.
# `plt.matshow` opens a new figure of its own by default, so a preceding
# `plt.figure(figsize=...)` only leaves an empty figure behind and the size is ignored.
# Drawing on explicitly created axes makes the requested size take effect.
fig, ax = plt.subplots(figsize=(15, 3))
# aspect='auto' lets the (few rows x many samples) image fill the wide axes
ax.matshow(eeg.to_numpy()[:sfreq, :].T, aspect='auto')

In [None]:
# Bandpass-filter the signal
# NOTE: this overwrites the columns of `eeg` in place, so re-running this cell without
# re-running the loading cell first filters the already-filtered signal again.
if use_bandpass_filter:
    eeg_clean = clean.filter(eeg)
    # Copy the filtered output back column by column, keeping the timestamp index
    for ch_idx, col in enumerate(eeg.columns):
        eeg[col] = eeg_clean[:, ch_idx]

    # plot the new result
    # (`plt.matshow` would open its own figure and ignore a preceding
    # `plt.figure(figsize=...)`, so draw on explicitly created axes instead)
    fig, ax = plt.subplots(figsize=(15, 3))
    ax.matshow(eeg.to_numpy()[:sfreq, :].T, aspect='auto')
else:
    print("Bandpass filtering was skipped")

In [None]:
# Cut the EEG recording into one epoch per marker row, keeping its label and subject
epochs = []
for _, marker in df_markers.iterrows():
    # Presentation/answer times are stored as UTC unix timestamps
    t_start, t_stop = (
        datetime.fromtimestamp(marker[column], timezone.utc)
        for column in ('t_presented_utc', 't_answered_utc')
    )
    segment = eeg.truncate(t_start, t_stop)

    # Sanity check: the number of samples should match the epoch duration
    n_expected = round(marker['duration'] * sfreq)
    n_found = len(segment)
    if abs(n_expected - n_found) > 5:
        logger.warning(f"Expected {n_expected} samples, found {n_found}")

    epochs.append((segment, marker['type'], marker['subject']))
print(len(epochs))

In [None]:
# Split epochs into windows
WINDOW_SIZE = 512  # window length in samples (rows of the epoch frame)

windows = []
# The label is called `label` rather than `type`: as a top-level loop variable, `type`
# would shadow the builtin for every cell that runs after this one.
for epoch, label, subject in epochs:
    # Non-overlapping windows; the trailing remainder of an epoch that does not fill
    # a whole window is reported and dropped.
    for i in range(0, len(epoch), WINDOW_SIZE):
        window = epoch.iloc[i:i+WINDOW_SIZE]
        if len(window) == WINDOW_SIZE:
            windows.append((window, label, subject))
        else:
            print(f'epoch too small ({len(window)}), skipping')
print(len(windows))

### Constructing our dataset
 
 - [ ] Split into sessions (by date) for LORO CV

In [None]:
# Construct our X and y
# Unzip the (window, label, subject) triples into three parallel tuples
window_frames, y, subj = zip(*windows)

# Transpose every window so that X has the shape (n_windows, n_columns, WINDOW_SIZE)
X = np.array([frame.to_numpy().T for frame in window_frames])
print(X.shape)

In [None]:
from collections import Counter

# Convert the per-window labels (taken from the markers' `type` column) to an array
y = np.array(y)
# Disabled alternative: encode the labels as integers (0 for 'prose', 1 otherwise)
#y = np.array([0 if yy == 'prose' else 1 for yy in y])
print(y.shape)
# Number of windows per label, to check the class balance
print(Counter(y))

In [None]:
# Subject of every window as an array; used as the `groups` argument for leave-one-group-out CV
subj = np.array(subj)

In [None]:
# Show the first `sfreq` samples of the first window, one image row per EEG column.
# `plt.matshow` opens a new figure of its own by default, so a preceding
# `plt.figure(figsize=...)` only leaves an empty figure behind and the size is ignored.
# Drawing on explicitly created axes makes the requested size take effect.
fig, ax = plt.subplots(figsize=(10, 5))
# aspect='auto' lets the (few rows x many samples) image fill the axes
ax.matshow(X[0, :, :sfreq], aspect='auto')

## Training our model

Here we train our model using pyRiemann.

 - [ ] Much of this code is copied from eegclassify/main.py; it should be imported and reused instead of duplicated

In [None]:
import sklearn
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import LeaveOneGroupOut, learning_curve

from pyriemann.estimation import Covariances, ERPCovariances, XdawnCovariances
from pyriemann.spatialfilters import CSP
from pyriemann.tangentspace import TangentSpace


# Fixes non-convergence for binary classification
# The check used to be `set(y) == 2`, which compares the set itself to an int and is
# therefore always False, so the dual formulation was never actually enabled.
dual = len(set(y)) == 2

# scikit-learn implements the dual formulation only for the liblinear solver (with the
# default l2 penalty), so the solver has to be switched together with `dual`; the
# default solver raises an error for dual=True.
logreg_kwargs = {"dual": True, "solver": "liblinear"} if dual else {"dual": False}

# Candidate pipelines, keyed by a human-readable name. Each one estimates covariance
# matrices per window and classifies them in the tangent space.
clfs: Dict[str, Pipeline] = {
    # These four are from https://neurotechx.github.io/eeg-notebooks/auto_examples/visual_ssvep/02r__ssvep_decoding.html
    "CSP + Cov + TS": make_pipeline(
        Covariances(),
        CSP(4, log=False),
        TangentSpace(),
        LogisticRegression(**logreg_kwargs),
    ),
    "Cov + TS": make_pipeline(
        Covariances(), TangentSpace(), LogisticRegression(**logreg_kwargs)
    ),
    # Performs meh
    # "CSP + RegLDA": make_pipeline(
    #     Covariances(), CSP(4), LDA(shrinkage="auto", solver="eigen")
    # ),
    # Performs badly
    # "Cov + MDM": make_pipeline(Covariances(), MDM()),
}
    
def unison_shuffled_copies(a, b, c):
    """Return copies of ``a``, ``b`` and ``c`` reordered by one shared random permutation.

    All three inputs must have the same length (checked with an assertion) and
    support indexing with an integer array. The permutation is drawn from
    NumPy's global random state; the inputs themselves are left untouched.
    """
    n_items = len(a)
    assert n_items == len(b) and len(b) == len(c)
    order = np.random.permutation(n_items)
    return tuple(arr[order] for arr in (a, b, c))
    
cv_method = "LOGO"

# Imported once here instead of on every loop iteration
from eegclassify.main import _performance

# Fixed seed so that the shuffled splits below are reproducible between runs
split_seed = 0

# LORO/LOGO split
# TODO: Is LOGO the same as LORO?
logo = LeaveOneGroupOut()
# x_idx, y_idx = logo.split(X, y, subj)

# Leave-one-group-out needs at least two distinct groups (subjects); with only the
# public example data there is a single subject.
n_subjects = len(np.unique(subj))

for name, clf in clfs.items():
    logger.info(f"===== Training with {name} =====")

    # Shuffled split
    X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
        X, y, test_size=0.3, shuffle=True, random_state=split_seed
    )

    logger.info("Training...")

    clf.fit(X_train, y_train)
    logger.info(f"Test score: {clf.score(X_test, y_test)}")

    y_pred = clf.predict(X_test)
    perf = _performance(y_test, y_pred)
    logger.info(perf)

    # Passing an integer as `cv` gives stratified folds that are NOT shuffled, which
    # contradicts the "shuffled" label of this score; shuffle explicitly instead.
    shuffled_cv = sklearn.model_selection.StratifiedKFold(
        n_splits=3, shuffle=True, random_state=split_seed
    )
    score = sklearn.model_selection.cross_val_score(clf, X, y, cv=shuffled_cv)
    logger.info(f"CV score (shuffled): {score}")

    # LORO
    if n_subjects >= 2:
        score = sklearn.model_selection.cross_val_score(clf, X, y, cv=logo, groups=subj)
        logger.info(f"CV score (LORO):     {score}")
    else:
        logger.warning("Skipping LORO CV: it needs data from at least 2 subjects")

### Learning curves

Now to check the learning curves and see if the train and validation scores converge.

**Note:** Performance under leave-one-subject-out CV is currently terrible, as there isn't enough data for the model to learn to generalize across subjects (this is easily seen by switching to shuffled CV).

A great example of how to plot learning curves is available here: https://scikit-learn.org/stable/auto_examples/model_selection/plot_learning_curve.html

In [None]:
for name, clf in clfs.items():
    logger.info(f"===== Training with {name} =====")

    # We create shuffled versions of the dataset to ensure that all stimuli of the same type aren't in sequence
    # (as is the case for subject 1 which didn't have shuffled stimuli)
    x_l, y_l, subj_l = unison_shuffled_copies(X, y, subj)

    # Compute the learning curve
    # The splitter is created here rather than reusing a variable left over from the
    # training cell's loop, so this cell only depends on names it can see.
    # NOTE: `train_sizes` are absolute sample counts; learning_curve rejects sizes
    # that exceed the number of available training samples.
    train_sizes, train_scores, valid_scores = learning_curve(
        clf, x_l, y_l, groups=subj_l,
        train_sizes=range(50, 500, 50), cv=LeaveOneGroupOut()
    )

    # The score arrays have one column per CV fold. Plotting them directly draws one
    # line per fold with duplicated legend entries, so plot the mean over folds.
    fig, ax = plt.subplots()
    ax.plot(train_sizes, np.mean(train_scores, axis=1), label="train score")
    ax.plot(train_sizes, np.mean(valid_scores, axis=1), label="valid score")
    # Title each figure with the pipeline name so the plots can be told apart
    ax.set(title=name, xlabel="training examples", ylabel="score")
    ax.legend()
    plt.show()

# Braindecode stuff

Here we'll experiment with braindecode (convnets) to compare performance.

In [None]:
from braindecode.datautil import create_from_X_y

# This wants X as a sequence of trials, each with the shape (n_channels, n_samples)
# NOTE: X, y and subj are rebound here to per-epoch data, replacing the per-window
# arrays that the earlier cells built.
epoch_frames, epoch_labels, subj = zip(*epochs)
X = [frame.to_numpy().T for frame in epoch_frames]
print(len(X), X[0].shape)
y = np.array(epoch_labels)
print(y.shape)

# Cut every epoch into windows of 500 samples, advancing 500 samples per window
windows_dataset = create_from_X_y(
    X,
    y,
    sfreq=sfreq,
    ch_names=list(eeg.columns),
    window_size_samples=500,
    window_stride_samples=500,
    drop_last_window=False,
)

In [None]:
# Attach the subject of every epoch to the dataset description, so the dataset can be split by subject
# NOTE(review): this relies on `description` being a stored DataFrame, so that the new
# column persists on the dataset; confirm for the installed braindecode version.
windows_dataset.description['subject'] = subj
# Preview the first 10 rows of the description
windows_dataset.description.head(10)

In [None]:
# Split the dataset by subject: subject 0 is used for training, subject 1 for validation
splitted = windows_dataset.split('subject')

# The data of subject 1 is private and may be absent on this machine; fail with an
# explanation rather than a bare KeyError on the lookup below.
missing_subjects = [key for key in ('0', '1') if key not in splitted]
if missing_subjects:
    raise KeyError(
        f"No data for subject(s) {missing_subjects}; the train/validation split needs "
        f"both subject 0 and subject 1 (available: {list(splitted)})"
    )

train_set = splitted['0']
valid_set = splitted['1']

In [None]:
import torch
from braindecode.util import set_random_seeds
from braindecode.models import ShallowFBCSPNet

# Use the GPU whenever one is available
cuda = torch.cuda.is_available()
if cuda:
    print("CUDA available!")
    torch.backends.cudnn.benchmark = True

# Seed the random number generators so that results can be reproduced
seed = 20200220
set_random_seeds(seed=seed, cuda=cuda)

# Derive the model dimensions from the labels and from the first training window
n_classes = len(set(y))
first_window = train_set[0][0]
n_chans = first_window.shape[0]
input_window_samples = first_window.shape[1]

print(f"classes:   {n_classes}")
print(f"channels:  {n_chans}")
print(f"samples per window:  {input_window_samples}")

model = ShallowFBCSPNet(
    n_chans,
    n_classes,
    final_conv_length='auto',
    input_window_samples=input_window_samples,
)

# Move the model to the GPU when one is used
if cuda:
    model.cuda()

In [None]:
# Inspect the first training example and the shape of its data array
first_example = train_set[0]
print(first_example)
print(first_example[0].shape)

In [None]:
from skorch.callbacks import LRScheduler
from skorch.helper import predefined_split

from braindecode import EEGClassifier

# Hyperparameters found to work well for the shallow network:
lr = 0.0625 * 0.01
weight_decay = 0

# For deep4 they should be:
# lr = 1 * 0.01
# weight_decay = 0.5 * 0.001

batch_size = 64
n_epochs = 4

# Train on the GPU when one is available
device = 'cuda' if cuda else 'cpu'

# Track accuracy and anneal the learning rate over the training epochs
callbacks = [
    "accuracy",
    ("lr_scheduler", LRScheduler('CosineAnnealingLR', T_max=n_epochs - 1)),
]

clf = EEGClassifier(
    model,
    criterion=torch.nn.NLLLoss,
    optimizer=torch.optim.AdamW,
    optimizer__lr=lr,
    optimizer__weight_decay=weight_decay,
    train_split=predefined_split(valid_set),  # validate on valid_set
    batch_size=batch_size,
    callbacks=callbacks,
    device=device,
)

# Train for the configured number of epochs. `y` is None as the labels are already part of the dataset.
# FIXME: Remove try/except when error is resolved
try:
    clf.fit(train_set, y=None, epochs=n_epochs)
except Exception as e:
    # Log the failure (with traceback) and carry on, so the notebook keeps running
    logger.exception(e)