# Activity classification

Classification of device activities (tracked with ActivityWatch) from EEG data.

In [None]:
# Imports
import logging
from typing import Dict
from collections import defaultdict
from datetime import date

import matplotlib.pyplot as plt
import pandas as pd

import eegclassify
from eegclassify import main, load, clean, features, preprocess, plot, transform

# Logger for messages emitted by this notebook itself (used when training fails below).
logger = logging.getLogger(__name__)

# Set this to True to run on testing data
simulate_test = False
if simulate_test:
    # NOTE(review): presumably eegclassify inspects PYTEST_CURRENT_TEST to switch
    # to its testing data — confirm against the library.
    import os
    os.environ['PYTEST_CURRENT_TEST'] = "true"
    
# Render figures inline in the notebook output.
%matplotlib inline
# Raise the figure resolution to 300 DPI.
plt.rcParams['figure.dpi'] = 300
plt.rcParams["font.family"] = "serif"  # since we're including the figures in serif-typed tex

In [None]:
%%javascript
// Set the document title to be able to track time spent working on the
// notebook with ActivityWatch.
document.title='erb-thesis/Activity - Jupyter'

In [None]:
# Load the labeled EEG data once and keep it in a dedicated variable that the
# cells below do not overwrite (loading takes a while, so `df` is derived from
# it instead of reloading).
df_loaded = load.load_labeled_eeg2()

In [None]:
# TODO: Split data into sessions to perform out-of-session cross-validation

# Show summary statistics of the freshly loaded data as a quick sanity check.
df_loaded.describe()

In [None]:
# Preprocess

# Start from the cached load so this cell can be re-run without reloading.
df = df_loaded
# NOTE(review): presumably splits every event into rows of 5 s each (the cells
# below treat one row as a 5 s window) — confirm against preprocess.split_rows.
df = preprocess.split_rows(df, min_duration=5)
# Cleaning step is currently disabled.
#df = clean.clean(df)
df

In [None]:
# Plot how the events are distributed over the classes.
# NOTE: This says nothing about the actual number of samples, only the number of events
plot.classdistribution(df)

In [None]:
def df_to_seconds_per_day_and_class(df, sampling_rate: float = 256) -> Dict[date, Dict[str, float]]:
    """Sum the seconds of recorded signal per calendar day and per class.

    Args:
        df: DataFrame with a ``start`` column (values exposing ``.date()``),
            a ``class`` column holding the label, and a ``raw_data`` column
            holding a sized sequence of samples for each row.
        sampling_rate: Number of samples per second, used to convert the
            sample count of each row into seconds. Defaults to 256.

    Returns:
        Nested mapping ``{day: {class: seconds}}``. Both levels are
        ``defaultdict`` instances, so looking up a missing key inserts it.

    Raises:
        ValueError: If ``sampling_rate`` is not positive.
    """
    if sampling_rate <= 0:
        raise ValueError(f"sampling_rate must be positive, got {sampling_rate!r}")

    seconds: Dict[date, Dict[str, float]] = defaultdict(lambda: defaultdict(float))
    # Single pass over the rows: each row contributes to exactly one
    # (day, class) bucket, so there is no need to rescan the frame per day.
    for start, label, raw_data in zip(df['start'], df['class'], df['raw_data']):
        seconds[start.date()][label] += len(raw_data) / sampling_rate
    return seconds

# Aggregate the preprocessed rows into seconds of data per day and per class;
# the cells below build their tables and plots from this mapping.
seconds_per_day_and_class = df_to_seconds_per_day_and_class(df)

In [None]:
# Total seconds of data recorded on each day, summed over all classes.
{day: sum(per_class.values()) for day, per_class in seconds_per_day_and_class.items()}

In [None]:
# Table of seconds of data with one row per date and one column per class.
combined_df = pd.DataFrame(seconds_per_day_and_class).T
# Keep only the dates holding at least 100x 5s windows of data ...
combined_df = combined_df.loc[combined_df.sum(axis=1) > 100 * 5]
# ... and then only the classes holding at least 100x 5s windows of data.
combined_df = combined_df.loc[:, combined_df.sum(axis=0) > 100 * 5]
# Order rows (dates) and columns (classes) deterministically.
combined_df = combined_df.sort_index(axis=0).sort_index(axis=1)
# Restrict to the classes of interest and give them reader-friendly names.
combined_df = combined_df.filter(
    items=['Editing->Code', 'Editing->Prose', 'Twitter', 'YouTube']
)
combined_df = combined_df.rename(
    columns={'Editing->Code': 'Programming', 'Editing->Prose': 'Writing'}
)
combined_df

In [None]:
# Horizontal bar chart of the minutes of data per date and class (seconds / 60).
# The rows are reversed before plotting, presumably so that the first date ends
# up at the top of the chart.
(combined_df[::-1]/60).plot.barh()
# TODO: Label the y-axis with "Date" (note that `plt.label` does not exist; use `plt.ylabel`).
plt.xlabel("Minutes of data");

In [None]:
# Bar chart of the total seconds of data per category, summed over all dates.
# NOTE(review): `stacked=True` probably has no visible effect when plotting a
# single Series — confirm and drop if so.
combined_df.sum().plot.bar(rot=0, stacked=True)
plt.xlabel("Category")
plt.ylabel("Seconds of data");

In [None]:
# Can we do PCA on the signal?

# Only let errors from the transform module through while converting.
logging.getLogger('eegclassify.transform').setLevel(logging.ERROR)
# NOTE(review): presumably X holds the signal arrays and y the matching class
# labels — confirm against transform.signal_ndarray.
X, y = transform.signal_ndarray(df)
print(X.shape)
# PCA plot is currently disabled.
#plot.pca(X, y)

In [None]:
# Datasets to train on: first every class with a decent count, then one
# two-class dataset per pair of classes of interest.
all_dfs = [clean._remove_rare(df, "class", threshold_count=50)]

all_dfs += [
    clean._select_classes(df, "class", class_pair)
    for class_pair in [
        ["Editing->Code", "Editing->Prose"],   # Code vs Prose
        ["Editing->Code", "Twitter"],          # Code vs Twitter
        ["Editing->Code", "YouTube"],          # Code vs YouTube
        ["Editing->Prose", "Twitter"],         # Prose vs Twitter
        ["Editing->Prose", "YouTube"],         # Prose vs YouTube (roughly same class size)
        ["Twitter", "YouTube"],                # Twitter vs YouTube
        # ["GitHub->Issues", "GitHub->Pull request"],  # GitHub PR vs issue (disabled)
    ]
]

In [None]:
# Train
from collections import Counter
import importlib
# Reload the library modules so that edits made to them are picked up without
# restarting the kernel.
importlib.reload(eegclassify.main)
importlib.reload(eegclassify.transform)

for df_train in all_dfs:
    # Number of rows per class in this dataset.
    print(Counter(df_train['class']))
    # Rows are counted as 5 s of data each when converting to hours.
    print(f"Hours of data: {round(len(df_train['class']) * 5 / 60 / 60, 2)}")
    try:
        main._train_raw(df_train, shuffle=True)
    except Exception:
        # TODO: Fix testing data such that it doesn't err
        # Best-effort: keep going with the next dataset, but log the full
        # traceback (same handling as the feature-based training loop) rather
        # than printing only the exception message.
        logger.exception("Error while training")

In [None]:
# Train on extracted features, once per dataset.
for df_train in all_dfs:
    # Number of rows in this dataset.
    print(len(df_train))
    try:
        main._train_features(df_train)
    except Exception:
        # TODO: Fix testing data such that it doesn't err
        # Best-effort: log the traceback and continue with the next dataset.
        logger.exception("Error while training")