# Setup

In [146]:
# NOTE(review): hard-coded absolute path from one machine. The relative paths used
# later ("./data/v1", "data/v2") resolve against this directory, so the notebook
# only runs elsewhere if this line is adjusted to the local checkout.
%cd  /home/tlm/Work/FYP-22-23/weee-preprocess

/home/tlm/Work/FYP-22-23/weee-preprocess


In [147]:
import os
import datetime
import numpy as np
import pandas as pd
import neurokit2 as nk

In [148]:
# Folder holding the version-1 dataset, relative to the working directory.
DATASET_DIR = "./data/v1"
# Base rate in Hz; process_task passes twice this value to the upsampler and to NeuroKit.
SAMPLING_RATE = 4 # Hz

# Prepare data

In [149]:
from utils.dataset import StudyInfoEncoder, DatasetVersion1
from utils.naming import standardize_column_names
from utils.resample import upsample
from utils.time import parse_simple_timedelta

In [150]:
# Dataset accessor; process_task calls dataset.e4_eda(p) on it for each participant.
dataset = DatasetVersion1()
# Encoder built from the study-information CSV; process_task calls its
# fit_activity_column(...) with activity_column='session_type'.
sie = StudyInfoEncoder(os.path.join(DATASET_DIR, 'Study_Information.csv'))

# Process

In [151]:
def process_task(p):
    """Build the processed E4 EDA frame for one participant.

    Steps, in order: load the participant's EDA frame from `dataset`, upsample
    the 'eda_raw' column to twice SAMPLING_RATE, run NeuroKit's EDA pipeline on
    it, attach the participant id and the session_type column produced by
    `sie.fit_activity_column`, and finally standardise the column names.

    Parameters
    ----------
    p : int
        Participant number (formatted as a two-digit integer in the progress message).

    Returns
    -------
    DataFrame returned by `standardize_column_names`; it starts from
    'timestamp', 'user_id' and the NeuroKit signal columns.
    """
    target_rate = SAMPLING_RATE * 2

    # process eda
    eda_frame = upsample(
        dataset.e4_eda(p), upsample_rate=target_rate, columns=['eda_raw'])
    eda_signals, _ = nk.eda_process(eda_frame['eda_raw'], sampling_rate=target_rate)
    combined = pd.concat([eda_frame['timestamp'], eda_signals], axis=1)

    # post touch ups: participant id goes right after the timestamp column
    combined.insert(1, 'user_id', p)
    labelled = sie.fit_activity_column(
        combined,
        p,
        timestamp_column='timestamp',
        activity_column='session_type',
        activity_column_index=2,
    )
    result = standardize_column_names(labelled)

    print(f"Done processing participant {p:02d}")
    return result


In [152]:
# Process participants 1..17 in order and stack their frames row-wise under a fresh index.
E4_EDA = pd.concat(map(process_task, range(1, 18)), axis=0, ignore_index=True)

Done processing participant 01
Done processing participant 02
Done processing participant 03
Done processing participant 04
Done processing participant 05
Done processing participant 06
Done processing participant 07
Done processing participant 08
Done processing participant 09
Done processing participant 10
Done processing participant 11
Done processing participant 12
Done processing participant 13
Done processing participant 14
Done processing participant 15
Done processing participant 16
Done processing participant 17


# Windowing

In [153]:
# Window lengths to export (one CSV per entry); each string is used as the pandas Grouper `freq`.
WINDOW_SIZES = ['2s', '4s', '6s', '8s', '10s', '12s']
# Aggregations applied to every signal column inside each window.
AGG_FUNCS = ['mean', 'std', 'min', 'max', 'median']
# Amount of data dropped from the start of every (user_id, session_type) group
# before windowing; a falsy value disables the skip.
SKIP_FROM_START_SIT_FOR_EACH = '30s'

In [154]:
def _drop_trailing_partial_window(df, window_length):
    """Trim rows that lie after the last window fully covered by `df`.

    Windows are counted from the first timestamp of `df`, which must already be
    sorted by 'timestamp'. The spacing between samples is estimated as the
    median difference of consecutive timestamps, so the final sample is credited
    with the time span it represents.
    """
    if df.empty:
        return df
    timestamps = df['timestamp']
    first_ts = timestamps.iloc[0]
    sample_period = timestamps.diff().median()
    if pd.isna(sample_period):  # a single row has no spacing to estimate
        sample_period = pd.Timedelta(0)
    covered = timestamps.iloc[-1] - first_ts + sample_period
    full_windows = covered // window_length
    return df[timestamps < first_ts + full_windows * window_length]


def create_non_overlapping_windows(window_size, agg_funcs, ignore_incomplete_windows=True):
    """Aggregate E4_EDA into fixed-length, non-overlapping time windows.

    Each (user_id, session_type) group of the module-level E4_EDA frame is
    windowed independently. If SKIP_FROM_START_SIT_FOR_EACH is set, that much
    data is dropped from the start of every group first. Windows are anchored
    at the first remaining sample of the group.

    Parameters
    ----------
    window_size : str
        Window length as a pandas frequency / timedelta string, e.g. '2s'.
    agg_funcs : list
        Aggregations applied to every column except 'timestamp',
        'session_type' and 'user_id'.
    ignore_incomplete_windows : bool, default True
        When true, the trailing window that is not fully covered by data and
        windows that contain no samples at all (gaps) are left out. The amount
        of data is measured in time, not in rows, so the result does not depend
        on the sampling rate.

    Returns
    -------
    DataFrame with 'user_id', 'session_type', 'window_number',
    'start_timestamp', 'end_timestamp' followed by one '<column>_<agg>' column
    per aggregated signal.

    Raises
    ------
    ValueError
        From `pd.concat` when no group yields any window.
    """
    window_length = pd.Timedelta(window_size)
    dfs = []
    for p, p_df in E4_EDA.groupby('user_id'):
        for a, pa_df in p_df.groupby('session_type'):
            # pre touch ups
            df = pa_df.sort_values('timestamp')
            if SKIP_FROM_START_SIT_FOR_EACH:
                lower_bound = df['timestamp'].iloc[0] + parse_simple_timedelta(SKIP_FROM_START_SIT_FOR_EACH)
                df = df[df['timestamp'] >= lower_bound]
            df = df.dropna(subset=['session_type']) # remove rows with nan activity
            if ignore_incomplete_windows:
                df = _drop_trailing_partial_window(df, window_length)
            if df.empty:
                # nothing left to window for this participant/activity
                continue

            # create windows and aggregate
            agg_cols = {c: agg_funcs for c in df.columns if c not in ['timestamp', 'session_type', 'user_id']}
            agg_cols['timestamp'] = ['first', 'last']
            # origin='start' makes the first bin begin at the first sample instead
            # of at a start-of-day-aligned boundary, so the first window is never partial.
            grouper = pd.Grouper(key='timestamp', freq=window_size, origin='start')
            windowed = df.groupby(grouper).agg(agg_cols)

            # post touch ups
            windowed.columns = ['_'.join(col).strip() for col in windowed.columns.values]
            if ignore_incomplete_windows:
                # bins that fall into a gap have no samples (NaT first timestamp)
                windowed = windowed.dropna(subset=['timestamp_first'])
            windowed = windowed.reset_index()
            windowed.insert(0, 'user_id', p)
            windowed.insert(1, 'session_type', a)
            windowed.insert(2, 'window_number', range(len(windowed)))
            windowed.insert(3, 'start_timestamp', windowed.pop('timestamp_first').dt.ceil(freq='s'))
            windowed.insert(4, 'end_timestamp', windowed.pop('timestamp_last').dt.ceil(freq='s'))
            windowed = windowed.drop(columns=['timestamp'])
            dfs.append(windowed)
    return pd.concat(dfs, axis=0, ignore_index=True)

In [155]:
# Output root for the windowed dataset; the E4 exports are written to its 'E4' subfolder.
DATASET_DIR_V2 = "data/v2"
# Create the folder from Python rather than a shell `mkdir -p`, so the path is
# spelled once and the cell does not depend on the host shell.
os.makedirs(os.path.join(DATASET_DIR_V2, 'E4'), exist_ok=True)

In [156]:
# Export one CSV per window length; E4_EDA_WINDOWED keeps the frame of the last size processed.
for ws in WINDOW_SIZES:
    E4_EDA_WINDOWED = create_non_overlapping_windows(window_size=ws, agg_funcs=AGG_FUNCS)
    E4_EDA_WINDOWED.to_csv(
        os.path.join(DATASET_DIR_V2, 'E4', f"e4_eda_{ws}.csv"),
        index=False,
    )
    print(f"Done processing window size {ws}")

Done processing window size 2s
Done processing window size 4s
Done processing window size 6s
Done processing window size 8s
Done processing window size 10s
Done processing window size 12s
