# Setup

In [15]:
# %pip install neurokit2 --quiet

In [16]:
import os
import datetime
import numpy as np
import pandas as pd
import neurokit2 as nk

In [17]:
# Root folder of the dataset; the study-information CSV and the per-participant
# "PXX/E4/EDA.csv" files are resolved relative to it.
DATASET_DIR = "../../data/v1"
# Rate used to space the raw EDA samples in time; process_task resamples to
# twice this value before running NeuroKit.
SAMPLING_RATE = 4 # Hz

# Prepare data

In [26]:
class StudyInfoEncoder:
    """Label timestamps with the study activity that was running at that time.

    The study-information CSV is read once. It must contain a ``Participant``
    column (values formatted like ``P01``) and one start-time column per
    activity. Activities are encoded by their position in the protocol:
    ``Start_Sit`` = 0, ``Start_Stand`` = 1, ``Start_Cycle1`` = 2,
    ``Start_Cycle2`` = 3, ``Start_Run1`` = 4, ``Start_Run2`` = 5.
    """

    def __init__(self, dataset_dir=None, filename="Study_Information.csv"):
        """Read the study-information table.

        :param dataset_dir: Folder that contains ``filename``. ``None`` (the
            default) means the notebook-level ``DATASET_DIR``, resolved when the
            encoder is created so a re-run of the config cell is picked up.
        :param filename: Name of the CSV file inside ``dataset_dir``.
        """
        if dataset_dir is None:
            dataset_dir = DATASET_DIR
        self.encodings = {name: code for code, name in enumerate([
            'Start_Sit',
            'Start_Stand',
            'Start_Cycle1',
            'Start_Cycle2',
            'Start_Run1',
            'Start_Run2',
        ])}
        self.info = pd.read_csv(
            # Use the directory the caller asked for, not the global constant.
            os.path.join(dataset_dir, filename),
            parse_dates=list(self.encodings),
        )

    def _phase_starts(self, participant_id):
        """Return ``[(start_time, code), ...]`` for one participant, latest phase first.

        :raises IndexError: if the participant has no row in the study table.
        """
        label = f"P{participant_id:02d}"
        rows = self.info[self.info['Participant'] == label]
        if rows.empty:
            raise IndexError(f"participant {label} not found in study information")
        latest_first = sorted(self.encodings.items(), key=lambda item: item[1], reverse=True)
        return [(rows[column].iloc[0], code) for column, code in latest_first]

    @staticmethod
    def _encode(timestamp, phase_starts):
        """Return the code of the latest phase that started at or before ``timestamp``.

        ``phase_starts`` must be ordered latest phase first. ``np.nan`` is
        returned when the timestamp precedes every phase start.
        """
        for start, code in phase_starts:
            if timestamp >= start:
                return code
        return np.nan

    def __apply(self, timestamp, participant_id):
        """Encode a single timestamp for ``participant_id`` (scalar convenience path)."""
        return self._encode(timestamp, self._phase_starts(participant_id))

    def add_activity_column(self, df, participant_id, timstamp_column):
        """Add an ``activity`` column to ``df`` and return ``df``.

        ``df`` is modified in place. Rows whose timestamp is earlier than the
        participant's ``Start_Sit`` get ``NaN``.

        :param df: DataFrame holding the timestamps to label.
        :param participant_id: Participant number (1 -> ``P01``).
        :param timstamp_column: Name of the timestamp column in ``df`` (the
            parameter name keeps its original spelling so keyword callers
            continue to work).
        :raises IndexError: if the participant is not in the study table.
        """
        # Resolve the participant's phase boundaries once instead of
        # re-filtering the study table for every row.
        phase_starts = self._phase_starts(participant_id)
        df['activity'] = df[timstamp_column].apply(
            lambda ts: self._encode(ts, phase_starts)
        )
        return df



In [19]:
def load_dataset(dataset_dir, patient, sampling_rate):
    """
        Load EDA data from the E4 dataset.

    An Empatica E4 ``EDA.csv`` export starts with two header rows: row 0 is the
    session start as a UTC unix timestamp and row 1 is the sample rate in Hz.
    Only the rows after those two are returned as samples.

    :param dataset_dir: Root folder containing one ``PXX`` folder per participant
    :param patient: Participant number (1 -> folder ``P01``)
    :param sampling_rate: Sampling rate in Hz used to space the timestamps
    :return: DataFrame with ``timestamp`` (naive, UTC wall-clock) and ``eda_raw``
    """
    import warnings

    raw = pd.read_csv(os.path.join(dataset_dir, f"P{patient:02d}", "E4", "EDA.csv"), header=None)

    # Row 0: session start (unix seconds, UTC). Timezone info is dropped so the
    # column is a plain datetime64 holding UTC wall-clock time.
    start_time = datetime.datetime.fromtimestamp(float(raw.iloc[0, 0]), tz=datetime.timezone.utc)
    start_time = start_time.replace(tzinfo=None)

    # Row 1: sample rate declared by the device. It is a header, not a sample;
    # flag a mismatch with the rate the caller asked us to assume.
    if len(raw) > 1:
        declared_rate = float(raw.iloc[1, 0])
        if declared_rate != sampling_rate:
            warnings.warn(
                f"EDA.csv for P{patient:02d} declares {declared_rate} Hz "
                f"but sampling_rate={sampling_rate} was requested",
                stacklevel=2,
            )

    samples = raw.iloc[2:]
    # Exact sample period; a Timedelta avoids truncating to whole milliseconds
    # for rates such as 64 Hz (15.625 ms).
    sample_period = pd.Timedelta(seconds=1) / sampling_rate
    return pd.DataFrame({
        'timestamp': pd.date_range(start=start_time, periods=len(samples), freq=sample_period),
        'eda_raw': samples.values.reshape(-1),
    })

In [20]:
def upsample_eda(eda_data, upsample_rate, timestamp_col='timestamp', eda_col='eda_raw'):
    """
        Resample an EDA recording onto a regular grid at ``upsample_rate``.
    :param eda_data: EDA dataframe with timestamp and eda_raw columns
    :param upsample_rate: Upsample rate in Hz
    :param timestamp_col: Name of the datetime column used as the time axis
    :param eda_col: Name of the column whose gaps are linearly interpolated
    :return: New dataframe; ``eda_data`` itself is not modified
    """
    bin_width = f"{1000/upsample_rate}ms"
    time_indexed = eda_data.set_index(timestamp_col)
    # Averaging per bin leaves NaN in every bin that holds no original sample.
    resampled = time_indexed.resample(bin_width).mean().reset_index()
    # Fill those empty bins by drawing a straight line between neighbours.
    filled = resampled[eda_col].interpolate(method='linear')
    resampled[eda_col] = filled
    return resampled

# Process

In [21]:
# Shared encoder; the study-information table is read from DATASET_DIR once here.
sie = StudyInfoEncoder(dataset_dir=DATASET_DIR)

In [22]:
def process_task(p):
    """Load, upsample and decompose the EDA recording of one participant.

    :param p: Participant number (1 -> folder ``P01``)
    :return: DataFrame with the ``timestamp`` column followed by the signal
        columns returned by ``nk.eda_process``
    """
    # One name for the working rate so the resampler and NeuroKit always agree.
    working_rate = SAMPLING_RATE*2
    raw = load_dataset(DATASET_DIR, p, SAMPLING_RATE)
    upsampled = upsample_eda(raw, upsample_rate=working_rate)
    signals, _ = nk.eda_process(upsampled['eda_raw'], sampling_rate=working_rate)
    result = pd.concat([upsampled['timestamp'], signals], axis=1)
    print(f"Done processing participant {p:02d}")
    return result

In [17]:
# One processed frame per participant, keyed by participant number (1..17).
E4_EDA_dfs = dict((p, process_task(p)) for p in range(1, 18))

Using 7 cores
Processing participant 1
Processing participant 2
Processing participant 3
Processing participant 4
Processing participant 5
Processing participant 6
Processing participant 7
2021-11-29 20:49:40 2021-11-29 16:02:00
2021-11-29 20:49:40.125000 2021-11-29 16:02:00
2021-11-29 20:49:40.250000 2021-11-29 16:02:00
2021-11-29 20:49:40.375000 2021-11-29 16:02:00
2021-11-29 20:49:40.500000 2021-11-29 16:02:00
2021-11-29 20:49:40.625000 2021-11-29 16:02:00
2021-11-29 20:49:40.750000 2021-11-29 16:02:00
2021-11-29 20:49:40.875000 2021-11-29 16:02:00
2021-11-29 20:49:41 2021-11-29 16:02:00
2021-11-29 20:49:41.125000 2021-11-29 16:02:00
2021-11-29 20:49:41.250000 2021-11-29 16:02:00
2021-11-29 20:49:41.375000 2021-11-29 16:02:00
2021-11-29 20:49:41.500000 2021-11-29 16:02:00
2021-11-29 20:49:41.625000 2021-11-29 16:02:00
2021-11-29 20:49:41.750000 2021-11-29 16:02:00
2021-11-29 20:49:41.875000 2021-11-29 16:02:00
2021-11-29 20:49:42 2021-11-29 16:02:00
2021-11-29 20:49:42.125000 2021-1

KeyboardInterrupt: 


2021-11-29 11:06:00
2021-12-03 17:23:002021-11-29 14:48:30
 2021-11-26 17:04:052021-11-26 23:18:44.875000 
 2021-11-25 22:27:34.125000 2021-11-25 17:51:002021-11-26 18:45:00
2021-12-03 22:28:14.500000 2021-11-29 10:06:40

2021-11-29 20:51:57.8750002021-11-26 23:18:452021-12-03 17:23:00

2021-11-29 15:56:12.8750002021-11-25 22:27:34.250000 
 2021-11-29 14:48:30.125000 2021-11-26 21:38:42.625000 2021-11-29 16:02:002021-12-03 22:28:14.6250002021-11-26 18:45:00 2021-11-29 11:06:00 2021-11-25 17:51:00
 
2021-11-29 10:06:40
2021-11-26 17:04:05
2021-11-29 20:51:582021-12-03 17:23:002021-11-26 23:18:45.125000
2021-11-29 15:56:13 2021-11-25 22:27:34.375000 


In [29]:
# drop rows with activity == nan
# NOTE(review): E4_EDA and its 'activity' column are not created in any cell
# visible here - presumably assembled from E4_EDA_dfs and
# StudyInfoEncoder.add_activity_column; confirm before a Restart & Run All.
E4_EDA = E4_EDA.dropna(subset=['activity'])

# convert column names to snake case
import re
def to_snake_case(name):
    """Convert a CamelCase / Mixed_Case label to lower snake_case.

    :param name: Original label, e.g. ``"SCR_RecoveryTime"``
    :return: Snake-cased label, e.g. ``"scr_recovery_time"``
    """
    # First split before a capitalised word, then between a lowercase letter
    # or digit and the capital that follows it.
    for boundary in ('(.)([A-Z][a-z]+)', '([a-z0-9])([A-Z])'):
        name = re.sub(boundary, r'\1_\2', name)
    # Labels that already contained "_" now have doubled underscores; collapse them.
    return re.sub('_+', '_', name.lower())
# rename() applies the callable to every column label; rebinding the result
# (instead of inplace=True) keeps the step chainable and leaves any other
# reference to the previous frame untouched.
E4_EDA = E4_EDA.rename(columns=to_snake_case)

In [30]:
# nan count per column - sanity check that only expected gaps remain after
# the unlabeled rows were dropped
E4_EDA.isna().sum()

timestamp              0
patient                0
eda_raw                0
eda_clean              0
eda_tonic              0
eda_phasic             0
scr_onsets             0
scr_peaks              0
scr_height             0
scr_amplitude          0
scr_rise_time          0
scr_recovery           0
scr_recovery_time    171
activity               0
dtype: int64

In [34]:
# small slice (first 2000 rows) for quick experimentation
sample = E4_EDA.head(2000)

In [47]:
# Split the sample into one dataframe per activity.
# Keys of the resulting dict are the activity ids. iter() is required:
# dict() would otherwise pick up the GroupBy object's own `keys` attribute.
d = dict(iter(sample.groupby('activity')))

In [51]:
# activity ids that actually occur in the sample
d.keys()

dict_keys([5])