# Setup

In [1]:
%pip install neurokit2 --quiet

[K     |████████████████████████████████| 1.2 MB 4.9 MB/s eta 0:00:01
[K     |████████████████████████████████| 46 kB 2.3 MB/s eta 0:00:011
[K     |████████████████████████████████| 1.6 MB 9.2 MB/s eta 0:00:01
[?25h

In [2]:
import os
import datetime
import numpy as np
import pandas as pd
import neurokit2 as nk

In [3]:
# Root folder of the dataset: load_dataset() reads P<NN>/E4/EDA.csv below it and
# StudyInfoEncoder reads Study_Information.csv from it.
# NOTE(review): this looks like a Google Drive mount path (Colab) - confirm before running elsewhere.
DATASET_DIR = "/content/gdrive/Shareddrives/GitRepos/weee-preprocess/data/v1"
# Sampling rate of the raw EDA recording; process_task() upsamples to twice this rate.
SAMPLING_RATE = 4 # Hz

# Prepare data

In [4]:
class StudyInfoEncoder:
    """
        Label timestamps with the study activity that had most recently started.

    The per-participant activity start times are read from the study
    information CSV. Each activity is encoded as an integer (see
    ``self.encodings``): Start_Sit=0, Start_Stand=1, Start_Cycle1=2,
    Start_Cycle2=3, Start_Run1=4, Start_Run2=5.
    """

    def __init__(self, dataset_dir=None, filename="Study_Information.csv"):
        """
        :param dataset_dir: Directory containing the study information CSV.
            ``None`` (the default) means the notebook-level ``DATASET_DIR``,
            resolved when the encoder is created rather than when the class
            cell was executed.
        :param filename: Name of the CSV file inside ``dataset_dir``.
        """
        if dataset_dir is None:
            dataset_dir = DATASET_DIR
        self.encodings = {name: code for code, name in enumerate([
            'Start_Sit',
            'Start_Stand',
            'Start_Cycle1',
            'Start_Cycle2',
            'Start_Run1',
            'Start_Run2',
        ])}
        # Honour the caller-supplied directory (previously the global was always used).
        self.info = pd.read_csv(
            os.path.join(dataset_dir, filename),
            parse_dates=list(self.encodings.keys())
        )

    def __participant_starts(self, participant_id):
        """
            Return the study-information row of one participant.
        :param participant_id: Integer participant number (1 -> "P01")
        :raises IndexError: if the participant is not listed in the CSV
        """
        label = f"P{participant_id:02d}"
        rows = self.info[self.info['Participant'] == label]
        if rows.empty:
            # Same exception type the original positional lookup produced,
            # but with a message that names the actual problem.
            raise IndexError(f"Participant {label} not found in study information")
        return rows.iloc[0]

    def __apply(self, timestamp, participant_id):
        """
            Encode a single timestamp for one participant.
        :return: Code of the latest activity whose start is <= timestamp,
            or NaN when the timestamp precedes Start_Sit
        """
        starts = self.__participant_starts(participant_id)
        # Check the activities from last to first so the latest one wins.
        for phase in reversed(list(self.encodings)):
            if timestamp >= starts[phase]:
                return self.encodings[phase]
        return np.nan

    def add_activity_column(self, df, participant_id, timstamp_column):
        """
            Add an ``activity`` column to ``df`` (modified in place).
        :param df: DataFrame holding the timestamps to label
        :param participant_id: Integer participant number (1 -> "P01")
        :param timstamp_column: Name of the timestamp column in ``df``
            (parameter spelling kept for backward compatibility)
        :return: The same ``df`` object; ``activity`` is float64 with NaN
            for rows that precede the participant's Start_Sit
        :raises IndexError: if the participant is not listed in the CSV
        """
        # Look the participant up once instead of once per row.
        starts = self.__participant_starts(participant_id)
        timestamps = df[timstamp_column]
        codes = np.full(len(df), np.nan)
        # Encodings are ordered chronologically, so later activities overwrite
        # earlier ones wherever their start time has already been reached.
        for phase, code in self.encodings.items():
            begin = starts[phase]
            if pd.isna(begin):
                continue  # a missing start time can never be reached
            codes[(timestamps >= begin).to_numpy()] = code
        df['activity'] = codes
        return df



In [5]:
def load_dataset(dataset_dir, patient, sampling_rate):
    """
        Load EDA data from the E4 dataset.

    The E4 ``EDA.csv`` export is a single column: row 1 is the session start
    (unix timestamp), row 2 is the sample rate in Hz, and the remaining rows
    are the EDA samples. Both header rows are excluded from the signal.
    :param dataset_dir: Root directory containing the ``P<NN>`` participant folders
    :param patient: Integer participant number (1 -> "P01")
    :param sampling_rate: Sampling rate in Hz used to space the timestamps
    :return: DataFrame with ``timestamp`` and ``eda_raw`` columns
    :raises ValueError: if ``sampling_rate`` is not positive or the file lacks
        the two header rows
    """
    if sampling_rate <= 0:
        raise ValueError(f"sampling_rate must be positive, got {sampling_rate}")

    data = pd.read_csv(os.path.join(dataset_dir, f"P{patient:02d}", "E4", "EDA.csv"), header=None)
    if len(data) < 2:
        raise ValueError("EDA.csv must start with a timestamp row and a sample-rate row")

    # fromtimestamp() yields a naive datetime in the machine's local timezone.
    # NOTE(review): presumably the study-information times use the same clock - confirm.
    start_time = datetime.datetime.fromtimestamp(float(data.iloc[0, 0])) # timestamp

    # Skip the sample-rate row so it is not mistaken for the first EDA sample.
    samples = data.iloc[2:].values.reshape(-1)

    # Derive every offset from the sample index in nanoseconds; truncating the
    # period to whole milliseconds drifts for rates that do not divide 1000.
    offsets_ns = np.round(np.arange(len(samples)) * (1e9 / sampling_rate)).astype('int64')
    return pd.DataFrame({
        'timestamp': pd.Timestamp(start_time) + pd.to_timedelta(offsets_ns, unit='ns'),
        'eda_raw': samples,
    })

In [6]:
def upsample_eda(eda_data, upsample_rate, timestamp_col='timestamp', eda_col='eda_raw'):
    """
        Resample an EDA recording onto a regular grid at ``upsample_rate``.

    Samples are averaged per grid bin; bins without a sample are then filled
    in ``eda_col`` by linear interpolation.
    :param eda_data: EDA dataframe with timestamp and eda_raw columns
    :param upsample_rate: Upsample rate in Hz
    :param timestamp_col: Name of the timestamp column used as resampling index
    :param eda_col: Name of the EDA column to interpolate
    :return: New dataframe with ``timestamp_col`` restored as a regular column
    """
    bin_width = f"{1000/upsample_rate}ms"
    by_time = eda_data.set_index(timestamp_col)
    regridded = by_time.resample(bin_width).mean().reset_index()
    # Empty bins come out of mean() as NaN; bridge them linearly.
    regridded[eda_col] = regridded[eda_col].interpolate(method='linear')
    return regridded

# Process

In [7]:
from concurrent.futures import ThreadPoolExecutor

In [8]:
# Shared encoder instance, read as a module-level global by process_task().
# Constructing it loads the study information CSV.
sie = StudyInfoEncoder()

In [9]:
def process_task(p):
    """
        Run the EDA pipeline for one participant.

    Loads the raw recording, upsamples it to twice ``SAMPLING_RATE``, runs
    ``nk.eda_process`` on it and labels every row with the activity code.
    :param p: Integer participant number
    :return: DataFrame with ``timestamp``, ``patient``, the columns produced
        by ``nk.eda_process`` and ``activity``
    """
    print(f"Processing participant {p}")
    target_rate = SAMPLING_RATE*2
    raw = load_dataset(DATASET_DIR, p, SAMPLING_RATE)
    upsampled = upsample_eda(raw, upsample_rate=target_rate)
    # Only the signals frame is used; the second return value is discarded.
    signals, _ = nk.eda_process(upsampled['eda_raw'], sampling_rate=target_rate)
    result = pd.concat([upsampled['timestamp'], signals], axis=1)
    result.insert(1, 'patient', p)
    # Adds the 'activity' column to `result` in place.
    sie.add_activity_column(result, p, "timestamp")
    print(f"Done processing participant {p}")
    return result

In [11]:
# Keep one core free; os.cpu_count() may return None when the count is unknown.
cores = max((os.cpu_count() or 1) - 1, 1)
print(f"Using {cores} cores")
with ThreadPoolExecutor(max_workers = cores) as executor:
    # executor.map() returns a one-shot lazy iterator. Materialise it so that
    # worker exceptions surface in this cell and `dfs` can be consumed more
    # than once when later cells are re-run.
    # range(1, 18) covers participants 1 through 17.
    dfs = list(executor.map(process_task, range(1, 18)))

Using 1 cores
Processing participant 1
Done processing participant 6
Processing participant 10
Done processing participant 7
Processing participant 11
Done processing participant 8
Processing participant 12
Done processing participant 9
Processing participant 13
Done processing participant 1
Processing participant 2


In [None]:
# Concatenate all per-participant dataframes row-wise into one table;
# ignore_index=True renumbers the rows instead of keeping each frame's own index.
E4_EDA = pd.concat(dfs, ignore_index=True)

In [None]:
# Spot-check 10 random rows; no random_state is set, so the rows differ on every run.
E4_EDA.sample(10)

Unnamed: 0,timestamp,patient,EDA_Raw,EDA_Clean,EDA_Tonic,EDA_Phasic,SCR_Onsets,SCR_Peaks,SCR_Height,SCR_Amplitude,SCR_RiseTime,SCR_Recovery,SCR_RecoveryTime,activity
254338,2021-11-29 10:35:46.125,6,0.291735,0.291668,0.29382,-0.002152,0,0,0.0,0.0,0.0,0,0.0,
380614,2021-11-30 15:00:47.000,11,5.829238,5.829439,5.850589,-0.02115,0,0,0.0,0.0,0.0,0,0.0,
826557,2021-11-30 11:37:29.875,10,0.134524,0.134519,0.133482,0.001036,0,0,0.0,0.0,0.0,0,0.0,1.0
705134,2021-11-29 09:37:34.125,5,0.410884,0.41092,0.410857,6.3e-05,0,0,0.0,0.0,0.0,0,0.0,
833409,2021-11-30 11:51:46.375,10,0.137727,0.137666,0.138929,-0.001263,0,0,0.0,0.0,0.0,0,0.0,3.0
947064,2021-12-01 15:28:31.250,14,0.411549,0.411602,0.411326,0.000276,0,0,0.0,0.0,0.0,0,0.0,5.0
258827,2021-11-29 10:45:07.250,6,0.262908,0.262902,0.263645,-0.000743,0,0,0.0,0.0,0.0,0,0.0,1.0
580960,2021-12-02 17:35:07.500,17,29.491726,29.4951,29.744495,-0.249395,0,0,0.0,0.0,0.0,0,0.0,5.0
305342,2021-11-29 17:58:32.375,8,0.97339,0.97343,0.971848,0.001582,0,0,0.0,0.0,0.0,0,0.0,
950359,2021-12-01 15:35:23.125,14,0.376956,0.376936,0.377046,-0.00011,0,0,0.0,0.0,0.0,0,0.0,5.0
