# Main analysis

The primary analysis for the thesis, where we train a classifier for the code vs prose task.

In [None]:
# Imports
from pathlib import Path
from datetime import datetime, timezone
from eegclassify import main, load, clean, features, preprocess

import pandas as pd
import numpy as np

# Toggle: flip to True to run the notebook on the testing data.
# When enabled, PYTEST_CURRENT_TEST is set in the process environment.
# NOTE(review): presumably eegclassify checks this variable to pick the
# test dataset — confirm it is read lazily, since the package is imported above.
simulate_test = False
if simulate_test:
    import os
    os.environ.update({'PYTEST_CURRENT_TEST': "true"})

In [None]:
%%javascript
// Give the browser tab a fixed, recognisable title so that ActivityWatch
// can track the time spent working on this notebook.
document.title='erb-thesis/Main - Jupyter'

In [None]:
# Load the raw EEG recording(s) into `eeg`, indexed and sorted by timestamp
# FIXME: Look for discontinuities in the data

# Absolute path of the data directory (resolved from the current working directory)
data_dir = Path('../data').resolve()
print(data_dir)

# TODO: Load all files from all subjects
#recordings_dir = data_dir / "test/local/museS/subject0001/session001/"
#files = sorted(list(recordings_dir.glob("*.csv")))

# For now only a single, hard-coded recording is loaded
files = [
    data_dir / 'eeg/muse/subject0000/session001/recording_2021-04-02-14.03.36.csv',
]

# Use the 'timestamp' column as the index and order the rows by it,
# so that the data can later be sliced by time
eeg = load.load_eeg(files).set_index('timestamp').sort_index()

In [None]:
# Load the task markers (one row per presented stimulus) into `df_markers`

# TODO: Extract 'relax' sections in between tasks?

marker_file = data_dir / 'tasks/visual-codeprose/subject0000/session000/subject0_session0_behOutput_2021-04-02-14.28.30.csv'
#marker_file = Path('/home/erb/.eegnb/data/visual-codeprose/local/none/session000/subject1_session0_behOutput_2021-03-26-14.31.14.csv')
df_markers = pd.read_csv(marker_file, index_col=0)

# Time between presenting the stimulus and receiving the answer
df_markers['duration'] = df_markers['t_answered'] - df_markers['t_presented']

# Keep only the rows where both of the following hold:
#  - the subject spent more than 10 seconds with the task
#  - the response was 'up' or 'down' (i.e. space was not pressed;
#    space presumably means didn't answer/skipped/unsure)
df_markers = df_markers.loc[
    (df_markers['duration'] > 10)
    & df_markers['response'].isin(['up', 'down'])
]

# Display the remaining markers as the cell output
df_markers

In [None]:
# Preprocess: cut the EEG into one epoch per marker row and sanity-check
# that each epoch holds roughly the number of samples its duration implies.
# TODO: Split using timestamps from the stimuli markers
import logging

# Logger used for the sample-count warning below. It has to be created here:
# no `logger` is defined or imported elsewhere in the notebook, so without it
# the warning branch would raise NameError instead of logging.
logger = logging.getLogger(__name__)

# Sampling frequency used to compute the expected sample count
# NOTE(review): presumably the recording device's rate in Hz — confirm
sfreq = 256

for _, row in df_markers.iterrows():
    # Epoch boundaries as timezone-aware UTC datetimes, built from the
    # marker's POSIX timestamps
    start = datetime.fromtimestamp(row['t_presented_utc'], timezone.utc)
    stop = datetime.fromtimestamp(row['t_answered_utc'], timezone.utc)

    # Rows of `eeg` whose index lies between start and stop
    # NOTE(review): assumes the `eeg` index is comparable with tz-aware datetimes — confirm
    epoch = eeg.truncate(start, stop)

    # Check that sample count aligns with epoch duration
    expected_samples = round(row['duration'] * sfreq)
    actual_samples = len(epoch)
    diff = expected_samples - actual_samples
    # Tolerate a deviation of up to 5 samples before warning
    if abs(diff) > 5:
        logger.warning(f"Expected {expected_samples} samples, found {actual_samples}")

In [None]:
# TODO: Split into sessions (by date) for LORO CV