# Setup

In [4]:
# Find the dataset directory: walk up from the current working directory until
# a folder containing "data/v1" is found, and make that folder the kernel's
# working directory so the relative paths used in later cells resolve.
#
# This is done with os.chdir instead of a `%%bash` cell because `%%bash` runs
# in a child process: a `cd` performed there is lost when the cell finishes and
# never moves the notebook kernel.
import os  # needed here already; this cell runs before the main import cell

_start_dir = os.getcwd()
while not os.path.isdir(os.path.join("data", "v1")):
    _current_dir = os.getcwd()
    _parent_dir = os.path.dirname(_current_dir)
    if _parent_dir == _current_dir:  # filesystem root reached, nothing above it
        os.chdir(_start_dir)  # do not leave the kernel stranded at the root
        raise FileNotFoundError(
            f"not found: no 'data/v1' directory in '{_start_dir}' or any of its parents"
        )
    os.chdir(_parent_dir)

In [5]:
# %pip install neurokit2 --quiet

In [6]:
import os
import datetime
import numpy as np
import pandas as pd
import neurokit2 as nk

In [7]:
# Root folder of the raw (v1) dataset, relative to the working directory.
DATASET_DIR = "data/v1"
# Sampling rate handed to load_dataset for the E4 EDA recordings.
SAMPLING_RATE = 4 # Hz

# Prepare data

In [8]:
class StudyInfoEncoder:
    """
        Label samples with the study activity they were recorded in.

    Reads the study information CSV and encodes every activity as its position
    in `self.activities` (e.g. 'Start_Sit' -> 0, 'Start_Stand' -> 1, ...).
    The CSV is expected to have a 'Participant' column holding labels such as
    "P01" and one start-time column per entry of `self.activities`.
    """

    def __init__(self, dataset_dir=None, filename="Study_Information.csv"):
        """
        :param dataset_dir: Folder containing the study information CSV. When
            None, the notebook-level DATASET_DIR is looked up at call time.
        :param filename: Name of the study information CSV inside dataset_dir
        """
        if dataset_dir is None:
            # Resolved here (not as a default argument) so that re-running the
            # configuration cell is picked up without redefining the class.
            dataset_dir = DATASET_DIR
        self.activities = ['Start_Sit', 'Start_Stand', 'Start_Cycle1', 'Start_Cycle2', 'Start_Run1', 'Start_Run2']
        self.encodings = {v: i for i, v in enumerate(self.activities)}
        self.info = pd.read_csv(
            # Honour the caller's dataset_dir instead of always reading the global one.
            os.path.join(dataset_dir, filename),
            parse_dates=list(self.encodings.keys())
        )

    def fit_activity_column(self, df, participant_id, timstamp_column, activity_column='activity', activity_column_index=1):
        """
            Fill `activity_column` of `df` with the encoded activity of each row.

        The frame is modified in place and also returned. Rows whose timestamp
        precedes every activity start keep a missing value (<NA>).

        :param df: DataFrame holding the samples of one participant
        :param participant_id: Participant number; 1 is matched against "P01"
        :param timstamp_column: Name of the datetime column of `df`
        :param activity_column: Name of the column to fill. It is created when
            missing and reset when it already exists, so the call is repeatable.
        :param activity_column_index: Position used when the column has to be
            created; ignored when the column already exists
        :return: `df`, with `activity_column` stored as nullable 'Int8'
        :raises ValueError: If the participant is not listed in the study information
        """
        participant_label = f"P{participant_id:02d}"
        _info = self.info[self.info['Participant'] == participant_label]
        if _info.empty:
            raise ValueError(f"participant {participant_label} not found in study information")

        # DataFrame.insert raises when the column already exists, so only
        # insert when it is missing.
        if activity_column not in df.columns:
            df.insert(activity_column_index, activity_column, np.nan)
        # Start from an all-missing nullable integer column.
        df[activity_column] = pd.array([pd.NA] * len(df), dtype='Int8')

        # Activities are applied in list order: each start time overwrites the
        # label of every row from that moment on.
        # NOTE(review): no end time is used, so all rows after the last start
        # keep the last activity code — confirm this is intended.
        for activity in self.activities:
            activity_start = _info[activity].iloc[0]
            df.loc[df[timstamp_column] >= activity_start, activity_column] = self.encodings[activity]
        return df

In [9]:
def load_dataset(dataset_dir, patient, sampling_rate):
    """
        Load one participant's EDA recording from the E4 dataset.

    :param dataset_dir: Dataset root containing one "PXX" folder per participant
    :param patient: Participant number; 1 is read from "P01/E4/EDA.csv"
    :param sampling_rate: Sampling rate of the recording in Hz
    :return: DataFrame with a 'timestamp' column (timezone-naive, UTC wall
        clock) and an 'eda_raw' column, one row per CSV row after the first
    """
    data = pd.read_csv(os.path.join(dataset_dir, f"P{patient:02d}", "E4", "EDA.csv"), header=None)

    # The first row holds the recording start as a Unix timestamp; it is
    # interpreted as UTC and then stripped of its timezone.
    start_time = datetime.datetime.fromtimestamp(float(data.iloc[0, 0]), tz=datetime.timezone.utc)
    start_time = start_time.replace(tzinfo=None)

    # NOTE(review): every row after the first is treated as a sample. Raw
    # Empatica E4 exports usually carry the sample rate in the second row —
    # confirm that these files do not, otherwise that value ends up in 'eda_raw'.
    samples = data.iloc[1:].values.reshape(-1)

    # Exact sample period. Truncating to whole milliseconds (int(1000/rate))
    # makes timestamps drift for rates that do not divide 1000, e.g. 64 Hz.
    sample_period = pd.Timedelta(seconds=1 / sampling_rate)

    return pd.DataFrame({
        'timestamp': pd.date_range(start=start_time, periods=len(samples), freq=sample_period),
        'eda_raw': samples,
    })

In [10]:
def upsample_eda(eda_data, upsample_rate, timestamp_col='timestamp', eda_col='eda_raw'):
    """
        Resample an EDA recording onto a denser, regular time grid.

    Rows are averaged per grid slot; slots of `eda_col` that received no
    sample are filled by linear interpolation.
    :param eda_data: EDA dataframe with timestamp and eda_raw columns
    :param upsample_rate: Target rate of the returned frame in Hz
    :param timestamp_col: Name of the datetime column used as the time axis
    :param eda_col: Name of the column to interpolate
    :return: New dataframe on the resampled grid; eda_data is left untouched
    """
    grid_step = f"{1000/upsample_rate}ms"
    on_time_axis = eda_data.set_index(timestamp_col)
    averaged = on_time_axis.resample(grid_step).mean()
    resampled = averaged.reset_index()
    filled = resampled[eda_col].interpolate(method='linear')
    return resampled.assign(**{eda_col: filled})

In [11]:
# Convert column names to snake_case
import re
def to_snake_case(name):
    """
        Convert a CamelCase / Mixed_Case identifier to lower snake_case.

    An underscore is placed before each capitalised word and between a
    lowercase letter or digit and a following capital; the result is
    lowercased and runs of underscores are collapsed into one.
    :param name: Identifier to convert, e.g. "SCR_RiseTime"
    :return: Converted identifier, e.g. "scr_rise_time"
    """
    converted = name
    for boundary in ('(.)([A-Z][a-z]+)', '([a-z0-9])([A-Z])'):
        converted = re.sub(boundary, r'\1_\2', converted)
    return re.sub('_+', '_', converted.lower())

def standardize_column_names(df):
    """
        Return a copy of `df` whose column names are in snake_case.

    :param df: DataFrame with string column names
    :return: New DataFrame with every column renamed through to_snake_case
    """
    snake_names = dict(zip(df.columns, map(to_snake_case, df.columns)))
    return df.rename(columns=snake_names)

# Process

In [12]:
sie = StudyInfoEncoder()

def process_task(p):
    """
        Build the processed EDA table of one participant.

    Loads the raw recording, upsamples it to twice SAMPLING_RATE, runs
    nk.eda_process on it and adds the participant and activity columns.
    :param p: Participant number (1 -> "P01")
    :return: DataFrame with snake_case columns: timestamp, participant,
        activity, followed by the columns produced by nk.eda_process
    """
    raw = load_dataset(DATASET_DIR, p, SAMPLING_RATE)

    # process eda
    processing_rate = SAMPLING_RATE * 2
    upsampled = upsample_eda(raw, upsample_rate=processing_rate)
    # The second value returned by eda_process is not needed here.
    signals, _ = nk.eda_process(upsampled['eda_raw'], sampling_rate=processing_rate)
    df = pd.concat([upsampled['timestamp'], signals], axis=1)

    # post touch ups
    df.insert(1, 'participant', p)
    # fit_activity_column creates the 'activity' column itself. Inserting it
    # here as well makes its own insert fail with "already exists", so only
    # the target position (right after 'participant') is passed along.
    df = sie.fit_activity_column(df, p, 'timestamp', activity_column_index=2)
    df = standardize_column_names(df)
    print(f"Done processing participant {p:02d}")
    return df

FileNotFoundError: [Errno 2] No such file or directory: 'data/v1/Study_Information.csv'

In [None]:
# concat all participants row wise
# range(1, 18) covers participant numbers 1..17, i.e. folders P01..P17;
# ignore_index=True gives the combined frame one continuous row index.
E4_EDA = pd.concat([process_task(p) for p in range(1, 18)], axis=0, ignore_index=True)

Done processing participant 01
Done processing participant 02
Done processing participant 03
Done processing participant 04
Done processing participant 05
Done processing participant 06
Done processing participant 07
Done processing participant 08
Done processing participant 09
Done processing participant 10
Done processing participant 11
Done processing participant 12
Done processing participant 13
Done processing participant 14
Done processing participant 15
Done processing participant 16
Done processing participant 17


In [None]:
# nan count per column
# (rows whose timestamp precedes every activity start keep a missing activity)
E4_EDA.isna().sum()

timestamp                 0
participant               0
activity             145760
eda_raw                   0
eda_clean                 0
eda_tonic                 0
eda_phasic                0
scr_onsets                0
scr_peaks                 0
scr_height                0
scr_amplitude             0
scr_rise_time             0
scr_recovery              0
scr_recovery_time       171
dtype: int64

# Windowing

In [None]:
# Window lengths to export; each string is passed as `freq` to pd.Grouper.
WINDOW_SIZES = ['2s', '4s', '6s', '8s', '10s', '12s']
# Aggregations computed for every signal column inside each window.
AGG_FUNCS = ['mean', 'std', 'min', 'max', 'median']

In [None]:
def _rows_in_full_window(timestamps, window_length):
    """Rows a completely filled window holds, inferred from the median spacing of the sorted timestamps (None if unknown)."""
    step = timestamps.diff().median()
    if pd.isna(step) or step <= pd.Timedelta(0):
        return None
    return int(round(window_length / step))


def create_non_overlapping_windows(window_size, agg_funcs, ignore_incomplete_windows=True, data=None):
    """
        Aggregate the signal columns over consecutive, non-overlapping time windows.

    Windows are built separately for every (participant, activity) segment,
    so a window never mixes two activities or two participants.
    :param window_size: Fixed window length as a duration string such as '2s'
    :param agg_funcs: Aggregations applied to every signal column, e.g. ['mean', 'std']
    :param ignore_incomplete_windows: When True, windows holding fewer rows
        than a completely filled window are dropped. This removes the partial
        windows at the start and end of a segment and any empty window.
    :param data: Frame with 'participant', 'activity', 'timestamp' and signal
        columns. Defaults to the notebook-level E4_EDA.
    :return: One row per window with columns participant, activity, window
        (running index inside the segment), timestamp (window start),
        '<column>_<agg>' for each signal column and aggregation, and
        timestamp_first / timestamp_last of the rows in the window
    """
    source = E4_EDA if data is None else data
    window_length = pd.Timedelta(window_size)
    key_cols = ['timestamp', 'activity', 'participant']

    dfs = []
    for p, p_df in source.groupby('participant'):
        # groupby leaves out rows whose activity is missing, so unlabelled
        # samples never reach a window.
        for a, pa_df in p_df.groupby('activity'):
            df = pa_df.sort_values('timestamp')

            # create windows and aggregate
            agg_cols = {c: agg_funcs for c in df.columns if c not in key_cols}
            agg_cols['timestamp'] = ['first', 'last']
            grouped = df.groupby(pd.Grouper(key='timestamp', freq=window_size))
            windowed = grouped.agg(agg_cols)

            if ignore_incomplete_windows:
                # Completeness is judged on the number of rows per window. The
                # window length in seconds is not a row count unless the data
                # happens to be sampled at 1 Hz.
                full_rows = _rows_in_full_window(df['timestamp'], window_length)
                if full_rows is None:
                    # Fewer than two samples: no window can be shown to be full.
                    windowed = windowed.iloc[0:0]
                else:
                    sizes = grouped.size().reindex(windowed.index, fill_value=0)
                    windowed = windowed.loc[(sizes >= full_rows).to_numpy()]

            # post touch ups
            windowed.columns = ['_'.join(col).strip() for col in windowed.columns.values]
            windowed = windowed.reset_index()
            windowed.insert(0, 'participant', p)
            windowed.insert(1, 'activity', a)
            windowed.insert(2, 'window', range(len(windowed)))
            dfs.append(windowed)
    return pd.concat(dfs, axis=0, ignore_index=True)

In [None]:
# Output folder of the windowed (v2) dataset. The folder is created from the
# constant so the path is written in exactly one place, and with os.makedirs
# so the cell does not depend on a shell `mkdir` being available.
DATASET_DIR_V2 = "data/v2"
os.makedirs(DATASET_DIR_V2, exist_ok=True)

In [None]:
# Export one CSV per window size. The 'E4' subfolder is created first because
# to_csv does not create missing parent directories.
e4_output_dir = os.path.join(DATASET_DIR_V2, 'E4')
os.makedirs(e4_output_dir, exist_ok=True)
for ws in WINDOW_SIZES:
    E4_EDA_WINDOWED = create_non_overlapping_windows(ws, AGG_FUNCS)
    E4_EDA_WINDOWED.to_csv(os.path.join(e4_output_dir, f"E4_EDA_{ws}.csv"), index=False)
    print(f"Done processing window size {ws}")

Done processing window size 2s
Done processing window size 4s
Done processing window size 6s
Done processing window size 8s
Done processing window size 10s
Done processing window size 12s
