# Setup

In [27]:
# NOTE(review): absolute, machine-specific path — this cell fails wherever this
# directory does not exist. The relative paths used later (e.g. "./data/v1")
# are resolved against the directory selected here.
%cd  /home/tlm/Work/FYP-22-23/weee-preprocess

/home/tlm/Work/FYP-22-23/weee-preprocess


In [28]:
# %pip install neurokit2 --quiet

In [29]:
import os
import datetime
import numpy as np
import pandas as pd
import mne

In [30]:
# Root directory of the input dataset; it is handed to DatasetVersion1 and is
# also where Study_Information.csv is looked up.
DATASET_DIR = "./data/v1"
# NOTE(review): not referenced by any cell visible in this notebook.
SAMPLING_RATE = 256 # Hz

# Prepare data

In [31]:
from utils.dataset import StudyInfoEncoder, DatasetVersion1
from utils.naming import standardize_column_names
from utils.resample import upsample

# Process

In [32]:
# Preview the MUSE EEG table of participant 1 to inspect the raw columns
# before any processing is applied.
df = DatasetVersion1(DATASET_DIR).get(1, 'MUSE', 'EEG')
df.head()

Unnamed: 0,timestamp,alpha,beta,delta,gamma,theta
0,2021-12-03 16:54:57.575885056,171.648346,802.63739,784.505493,241.355316,684.175842
1,2021-12-03 16:54:57.576206080,1550.073242,825.60437,788.131897,1573.443237,835.677673
2,2021-12-03 16:54:57.576272128,0.0,787.325989,784.505493,0.0,776.446899
3,2021-12-03 16:54:57.576153088,959.377319,798.205139,785.714294,942.454224,763.553101
4,2021-12-03 16:54:57.575933952,89.047623,811.904785,788.131897,72.930405,783.699646


In [33]:
# Encoder built from the study information sheet; process_task() reads this
# module-level object to add the activity column to each participant's table.
activity_encoder = StudyInfoEncoder(os.path.join(DATASET_DIR, 'Study_Information.csv'))

In [34]:
def process_task(p):
    """Load, label and normalise the MUSE EEG table of one participant.

    Parameters
    ----------
    p : int
        Participant number. It is forwarded to ``DatasetVersion1.get`` and to
        the activity encoder, and written into a new ``participant`` column.

    Returns
    -------
    The object returned by ``standardize_column_names`` for the labelled table.

    Notes
    -----
    Reads the module-level ``activity_encoder`` and ``DATASET_DIR``. The return
    value of ``fit_activity_column`` is ignored, so the encoder is presumably
    expected to add the activity column to the frame in place — TODO confirm.
    """
    eeg = DatasetVersion1(DATASET_DIR).get(p, 'MUSE', 'EEG')

    # The participant id goes in at column position 1; the encoder is then
    # asked to place the activity column at position 2.
    eeg.insert(1, 'participant', p)
    activity_encoder.fit_activity_column(
        eeg,
        p,
        timestamp_column='timestamp',
        activity_column_index=2,
    )

    standardized = standardize_column_names(eeg)
    print(f"Done processing participant {p:02d}")
    return standardized

In [35]:
# Concatenate all participants row-wise into one long table with a fresh index.
# Participant ids 1 and 3-17 are loaded; id 2 is skipped.
# NOTE(review): the reason for skipping participant 2 is not stated in this
# notebook — TODO document it.
from itertools import chain
muse_eeg = pd.concat([process_task(p) for p in chain(range(1, 2), range(3, 18))], axis=0, ignore_index=True)

Done processing participant 01
Done processing participant 03
Done processing participant 04
Done processing participant 05
Done processing participant 06
Done processing participant 07
Done processing participant 08
Done processing participant 09
Done processing participant 10
Done processing participant 11
Done processing participant 12
Done processing participant 13
Done processing participant 14
Done processing participant 15
Done processing participant 16
Done processing participant 17


# Windowing

In [36]:
# Number of rows that carry no activity label; such rows do not end up in any
# window produced below.
muse_eeg['activity'].isna().sum()

1520071

In [37]:
# Window lengths to export; each string is used both as the pandas grouping
# frequency and as the suffix of the output CSV file name.
WINDOW_SIZES = ['2s', '4s', '6s', '8s', '10s', '12s']
# Aggregations computed per window for every column other than
# timestamp / activity / participant.
AGG_FUNCS = ['mean', 'std', 'min', 'max', 'median']

In [38]:
def create_non_overlapping_windows(window_size, agg_funcs, ignore_incomplete_windows=True, data=None):
    """Aggregate EEG samples into fixed-length, non-overlapping time windows.

    The input is split per participant and per activity; every such segment is
    cut into consecutive windows of ``window_size`` and each window is reduced
    with ``agg_funcs``.

    Parameters
    ----------
    window_size : str
        Window length as a pandas frequency / timedelta string, e.g. ``'2s'``.
    agg_funcs : list of str
        Aggregations applied to every column other than ``timestamp``,
        ``activity`` and ``participant``.
    ignore_incomplete_windows : bool, default True
        When true, windows are anchored at the first sample of each
        participant/activity segment, the trailing part of the segment that
        does not fill a whole window is discarded, and windows that contain no
        samples are dropped. When false, samples are binned on the default
        (start-of-day aligned) grid and every bin is kept.
    data : pandas.DataFrame, optional
        Table to window. Must contain ``participant``, ``activity`` and a
        datetime ``timestamp`` column. Defaults to the module-level
        ``muse_eeg`` frame.

    Returns
    -------
    pandas.DataFrame
        One row per window with the columns ``participant``, ``activity``,
        ``window`` (running index within the segment), ``timestamp`` (window
        start), ``<column>_<agg>`` for every aggregated column, and
        ``timestamp_first`` / ``timestamp_last``. An empty frame is returned
        when no window is produced.
    """
    source = muse_eeg if data is None else data
    window = pd.Timedelta(window_size)
    key_columns = ('timestamp', 'activity', 'participant')

    dfs = []
    for p, p_df in source.groupby('participant'):
        # groupby() skips NaN keys, so rows without an activity label never
        # reach the inner loop and need no explicit dropna().
        for a, pa_df in p_df.groupby('activity'):
            df = pa_df.sort_values('timestamp')

            if ignore_incomplete_windows:
                # Trim by time, not by row count: the number of rows in a
                # window depends on the sampling rate, so a row-count cut
                # cannot identify a partially filled window.
                start = df['timestamp'].iloc[0]
                full_windows = (df['timestamp'].iloc[-1] - start) // window
                df = df[df['timestamp'] < start + full_windows * window]
                if df.empty:
                    # The segment is shorter than a single window.
                    continue
                # Anchor the bins at the segment start so that the first
                # window is a full one as well.
                grouper = pd.Grouper(key='timestamp', freq=window_size, origin='start')
            else:
                grouper = pd.Grouper(key='timestamp', freq=window_size)

            # create windows and aggregate
            agg_cols = {c: agg_funcs for c in df.columns if c not in key_columns}
            agg_cols['timestamp'] = ['first', 'last']
            windowed = df.groupby(grouper).agg(agg_cols)

            # flatten the (column, aggregation) MultiIndex into "column_agg"
            windowed.columns = ['_'.join(col).strip() for col in windowed.columns.values]
            windowed = windowed.reset_index()

            if ignore_incomplete_windows:
                # Bins that fall into a gap of the recording hold no samples
                # (their first timestamp is NaT) and would be all-NaN rows.
                has_samples = windowed['timestamp_first'].notna()
                windowed = windowed[has_samples].reset_index(drop=True)

            windowed.insert(0, 'participant', p)
            windowed.insert(1, 'activity', a)
            windowed.insert(2, 'window', range(len(windowed)))
            dfs.append(windowed)

    if not dfs:
        # pd.concat() raises on an empty list; return an empty table instead.
        return pd.DataFrame(columns=['participant', 'activity', 'window'])
    return pd.concat(dfs, axis=0, ignore_index=True)

In [39]:
# Output root for the windowed dataset. It is defined before the directory is
# created so that the path is derived from the constant instead of a
# duplicated literal.
DATASET_DIR_V2 = "data/v2"
# Portable equivalent of `mkdir -p`: creates missing parents and does not fail
# when the directory already exists.
os.makedirs(os.path.join(DATASET_DIR_V2, 'MUSE'), exist_ok=True)

In [40]:
# Build one windowed feature table per window size and write it to
# <DATASET_DIR_V2>/MUSE/muse_eeg_<window size>.csv without the index column.
for ws in WINDOW_SIZES:
    muse_eeg_windowed = create_non_overlapping_windows(ws, AGG_FUNCS)
    muse_eeg_windowed.to_csv(os.path.join(DATASET_DIR_V2, 'MUSE', f"muse_eeg_{ws}.csv"), index=False)
    print(f"Done processing window size {ws}")

Done processing window size 2s
Done processing window size 4s
Done processing window size 6s
Done processing window size 8s
Done processing window size 10s
Done processing window size 12s
