# Setup

In [114]:
# Change into the project root so the relative paths used below ("./data/v1",
# "data/v2", the `utils` package imports) resolve.
# NOTE(review): this is a hard-coded absolute path to one developer's checkout;
# the notebook will not run elsewhere without editing this line.
%cd  /home/tlm/Work/FYP-22-23/weee-preprocess

/home/tlm/Work/FYP-22-23/weee-preprocess


In [115]:
import os
import datetime
import numpy as np
import pandas as pd

In [116]:
# Root directory of the "v1" dataset; Study_Information.csv is read from here.
DATASET_DIR = "./data/v1"
# NOTE(review): not referenced in any of the visible cells — confirm it is still needed.
SAMPLING_RATE = 100 # Hz

# Prepare Data

In [117]:
from utils.dataset import StudyInfoEncoder, DatasetVersion1
from utils.naming import standardize_column_names
from utils.time import parse_simple_timedelta

In [118]:
# Dataset accessor used by process_task to fetch the earbud accelerometer frames.
dataset = DatasetVersion1()

# The study-information CSV sits at the root of the v1 dataset directory; the
# encoder built from it is used later to label and crop sessions.
study_info_csv = os.path.join(DATASET_DIR, "Study_Information.csv")
activity_encoder = StudyInfoEncoder(study_info_csv)

In [127]:
def process_task(p):
    """Build the smoothed earbud accelerometer frame for one participant.

    Steps:
      1. Take the second frame returned by ``dataset.earbud_acc`` (named
         ``right`` here); the first one is not used.
      2. Interleave a 1-second grid of all-NaN rows spanning the recording so
         the output contains a row on every whole second.
      3. Replace ax/ay/az with a centred 60 s rolling mean (``min_periods=1``).
      4. Blank (NaN) every non-timestamp column in the first and last 30 s,
         where the centred window is only partially filled.
      5. Add ``user_id``, let ``activity_encoder`` add ``session_type`` and
         standardise the column names.

    Parameters
    ----------
    p : int
        Participant id; formatted with ``:02d`` in the progress message.

    Returns
    -------
    pandas.DataFrame
        The frame returned by ``standardize_column_names``.
    """
    # Only the second returned frame is needed; discard the first explicitly.
    _, right = dataset.earbud_acc(participant_id=p)

    # pre touch ups
    # 1 s grid of NaN rows between the first and last recorded timestamps.
    second_grid = pd.DataFrame({
        'timestamp': pd.date_range(right['timestamp'].iloc[0], right['timestamp'].iloc[-1], freq="1s", tz='UTC'),
        'ax': np.nan, 'ay': np.nan, 'az': np.nan
    })
    df = pd.concat([right, second_grid], axis=0).sort_values('timestamp').reset_index(drop=True)
    df['timestamp'] = df['timestamp'].dt.tz_convert(None) # remove timezone info from timestamp

    # process
    df = df.rolling('60s', center=True, on="timestamp", min_periods=1).mean()

    # set first and last 30 seconds to nan
    edge = parse_simple_timedelta("30s")
    first_ts = df['timestamp'].iloc[0]
    last_ts = df['timestamp'].iloc[-1]
    in_edge = (df['timestamp'] < first_ts + edge) | (df['timestamp'] > last_ts - edge)
    # .iloc documents boolean *arrays*, not boolean Series, as row selectors,
    # so pass the mask as a NumPy array. Column 0 is 'timestamp' and is kept.
    df.iloc[in_edge.to_numpy(), 1:] = np.nan

    # post touch ups
    df.insert(1, 'user_id', p)
    df = activity_encoder.fit_activity_column(
        df, p,
        timestamp_column='timestamp',
        activity_column='session_type',
        activity_column_index=2
    )
    df = standardize_column_names(df)
    print(f"Done processing participant {p:02d}")

    return df

In [129]:
# NOTE(review): only participant 1 is processed here, so every windowed CSV
# written below contains that single participant.
# Despite the `e4_` prefix, this frame holds the earbud accelerometer data
# produced by process_task (which reads dataset.earbud_acc).
e4_acc = process_task(1)

Done processing participant 01


Unnamed: 0,timestamp,user_id,session_type,ax,ay,az
0,2021-12-03 16:53:50.000000000,1,,,,
1,2021-12-03 16:53:50.000000000,1,,,,
2,2021-12-03 16:53:51.000000000,1,,,,
3,2021-12-03 16:53:52.000000000,1,,,,
4,2021-12-03 16:53:53.000000000,1,,,,
5,2021-12-03 16:53:54.000000000,1,,,,
6,2021-12-03 16:53:55.000000000,1,,,,
7,2021-12-03 16:53:56.000000000,1,,,,
8,2021-12-03 16:53:57.000000000,1,,,,
9,2021-12-03 16:53:58.000000000,1,,,,


# Windowing

In [121]:
# Window lengths to generate; the export loop writes one CSV per entry.
WINDOW_SIZES = ['2s', '4s', '6s', '8s', '10s', '12s']
# Aggregations applied to every signal column inside each window.
AGG_FUNCS = ['mean', 'std', 'min', 'max', 'median']
# Offset handed to activity_encoder.crop_from_start_time before windowing;
# a falsy value (e.g. None or '') disables the cropping step.
SKIP_FROM_START_SIT = '30s'

In [122]:
def create_non_overlapping_windows(data, window_size, agg_funcs, ignore_incomplete_windows=True):
    """Aggregate a per-sample frame into consecutive, non-overlapping windows.

    Windows are built separately for every (``user_id``, ``session_type``)
    group and are anchored at that group's first timestamp
    (``origin='start'``).

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``user_id``, ``session_type`` and ``timestamp`` columns;
        every other column is treated as a signal and aggregated.
    window_size : str
        Window length, used both as the pandas ``Grouper`` frequency and as
        input to ``parse_simple_timedelta`` (e.g. ``'4s'``).
    agg_funcs : list
        Aggregations applied to each signal column; output columns are named
        ``<column>_<func>``.
    ignore_incomplete_windows : bool, default True
        When true, the last window of a group is dropped if the span between
        its (floored) first and (ceiled) last timestamp is shorter than
        ``window_size``. Only the trailing window is checked. When false,
        that window is kept.

    Returns
    -------
    pandas.DataFrame
        One row per window with ``user_id``, ``session_type``,
        ``window_number``, ``start_timestamp``, ``end_timestamp`` followed by
        the aggregated signal columns.

    Raises
    ------
    ValueError
        Raised by ``pd.concat`` when no group produced any window.
    """
    # Loop-invariant values: parse the timedeltas once.
    window_length = parse_simple_timedelta(window_size)
    skip = parse_simple_timedelta(SKIP_FROM_START_SIT) if SKIP_FROM_START_SIT else None
    id_columns = ['timestamp', 'session_type', 'user_id']

    dfs = []
    for p, p_df in data.groupby('user_id'):

        if skip is not None:
            # presumably trims the first `skip` of the participant's recording
            # relative to a start time from the study information — confirm in utils.dataset
            p_df = activity_encoder.crop_from_start_time(p_df, p, offset=skip)

        for a, pa_df in p_df.groupby('session_type'):
            # pre touch ups
            df = pa_df.sort_values('timestamp')

            # create windows and aggregate
            agg_cols = {c: agg_funcs for c in df.columns if c not in id_columns}
            # first/last sample time in each bin become the window bounds
            agg_cols['timestamp'] = ['first', 'last']
            windowed = df.groupby(pd.Grouper(key='timestamp', freq=window_size, origin='start')).agg(agg_cols)

            # post touch ups
            # flatten the (column, func) MultiIndex into "column_func"
            windowed.columns = ['_'.join(col).strip() for col in windowed.columns.values]
            windowed = windowed.reset_index()
            windowed.insert(0, 'user_id', p)
            windowed.insert(1, 'session_type', a)
            windowed.insert(2, 'window_number', range(len(windowed)))
            windowed.insert(3, 'start_timestamp', windowed.pop('timestamp_first').dt.floor(freq='s'))
            windowed.insert(4, 'end_timestamp', windowed.pop('timestamp_last').dt.ceil(freq='s'))
            # the bin label itself is redundant once start/end are present
            windowed = windowed.drop(columns=['timestamp'])

            if ignore_incomplete_windows and len(windowed) > 0:
                last_span = windowed['end_timestamp'].iloc[-1] - windowed['start_timestamp'].iloc[-1]
                if last_span < window_length:
                    # remove last window if it is incomplete
                    windowed = windowed.iloc[:-1]

            dfs.append(windowed)

    return pd.concat(dfs, axis=0, ignore_index=True)

In [123]:
# Root of the generated "v2" dataset.
DATASET_DIR_V2 = "data/v2"
# Create the earbud accelerometer output directory (and any missing parents).
# os.makedirs replaces the shell `mkdir -p` alias so the cell also works on
# platforms where that alias or flag is unavailable, and re-running is a no-op.
os.makedirs(os.path.join(DATASET_DIR_V2, "EARBUDS", "ACC"), exist_ok=True)

In [124]:
# Write one windowed CSV per configured window size.
# NOTE(review): files are written directly under <DATASET_DIR_V2>/EARBUDS,
# not under the EARBUDS/ACC sub-directory — confirm which location is intended.
# After the loop, E4_EDA_WINDOWED holds the frame for the last window size.
for ws in WINDOW_SIZES:
    csv_path = os.path.join(DATASET_DIR_V2, 'EARBUDS', f"earbuds_acc_{ws}.csv")
    E4_EDA_WINDOWED = create_non_overlapping_windows(e4_acc, ws, AGG_FUNCS)
    E4_EDA_WINDOWED.to_csv(csv_path, index=False)
    print(f"Done processing window size {ws}")

Done processing window size 2s
Done processing window size 4s
Done processing window size 6s
Done processing window size 8s
Done processing window size 10s
Done processing window size 12s


In [125]:
# Show the frame left over from the final loop iteration, i.e. the last entry
# of WINDOW_SIZES ('12s'); earlier window sizes exist only as CSV files.
E4_EDA_WINDOWED

Unnamed: 0,user_id,session_type,window_number,start_timestamp,end_timestamp,ax_mean,ax_std,ax_min,ax_max,ax_median,ay_mean,ay_std,ay_min,ay_max,ay_median,az_mean,az_std,az_min,az_max,az_median
0,1,0,0,2021-12-03 16:59:20,2021-12-03 16:59:32,1977.763310,14.060191,1946.447520,2001.398720,1978.661760,16049.602242,6.874164,16041.695520,16061.079840,16046.525520,782.416732,43.374696,710.688160,821.492320,812.701040
1,1,0,1,2021-12-03 16:59:32,2021-12-03 16:59:44,1893.967792,37.594734,1815.904160,1946.447520,1911.695520,16049.779558,3.237669,16044.704960,16057.056960,16049.476160,838.188588,21.986602,810.269120,875.339520,835.839440
2,1,0,2,2021-12-03 16:59:44,2021-12-03 16:59:56,1746.602248,36.149162,1678.318560,1815.904160,1745.699600,16064.846415,3.801220,16057.056960,16072.206720,16064.937360,776.455425,15.509448,748.788320,813.497760,775.047520
3,1,0,3,2021-12-03 16:59:56,2021-12-03 17:00:08,1623.358267,23.598640,1585.822880,1678.318560,1619.618080,16078.183878,2.474251,16072.206720,16081.915680,16078.784880,718.097286,15.092691,683.462880,748.788320,718.234640
4,1,0,4,2021-12-03 17:00:08,2021-12-03 17:00:20,1545.251815,22.566364,1509.141440,1585.822880,1541.002720,16085.804623,2.116779,16081.911520,16089.314560,16086.232480,647.891472,14.844812,629.844800,683.462880,640.274400
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
145,1,5,27,2021-12-03 17:28:24,2021-12-03 17:28:36,2750.165037,23.314862,2699.719840,2780.564480,2754.832880,14975.501465,59.692457,14880.755680,15075.418720,14987.674160,4792.260885,135.104533,4547.316960,4991.686080,4788.078720
146,1,5,28,2021-12-03 17:28:36,2021-12-03 17:28:48,2548.183105,118.406692,2392.138080,2759.522240,2512.976800,15040.886524,57.890273,14908.397120,15122.231040,15065.278080,4482.341682,177.849229,4257.782240,4896.020480,4377.168960
147,1,5,29,2021-12-03 17:28:48,2021-12-03 17:29:00,2234.549232,60.668388,2144.600160,2392.138080,2218.422880,15252.638458,52.557542,15122.231040,15317.383520,15275.113360,3844.647967,231.989120,3577.265440,4257.782240,3755.513440
148,1,5,30,2021-12-03 17:29:00,2021-12-03 17:29:12,2258.814859,23.475661,2221.211409,2286.824555,2272.649129,15239.570314,27.294526,15186.539007,15298.567520,15234.413901,3767.085489,84.392166,3606.689280,3931.228282,3779.373910
