# Data Preprocessing: Other E4 Sensors

In this section we will preprocess the remaining sensor modalities (Temperature) of the E4 device.

## Imports and Loading Data

In [19]:
# imports
import os
from envyaml import EnvYAML

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime

%matplotlib inline
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [43]:
# environment configuration: every setting below is read from the project-level env.yaml
VAR_ENV = EnvYAML("../../env.yaml")

# dataset location (resolved relative to this notebook) and the input/output version folders
DATASET_PATH = os.path.join("../..", VAR_ENV["dataset.path"])
DATASET_VERSION, DATASET_OUTPUT_VERSION = VAR_ENV["dataset.version"], VAR_ENV["dataset.output"]

# number of participants; used below to build the P01..Pnn folder names
PARTICIPANTS = VAR_ENV["dataset.participants"]

In [21]:
# one input folder per participant, named P01, P02, ... (zero-padded to two digits)
path_list = [
    os.path.join(DATASET_PATH, DATASET_VERSION, f"P{participant_no:02d}")
    for participant_no in range(1, PARTICIPANTS + 1)
]

### Loading E4 Data

In [22]:
# read each participant's raw E4 temperature file (E4/TEMP.csv inside the participant folder)
temp_e4_dfs = [
    pd.read_csv(os.path.join(participant_dir, "E4", "TEMP.csv"))
    for participant_dir in path_list
]

Now let's separate the starting time and the sampling frequency from the data. Also, we will rebuild each DataFrame from the remaining rows so that its index starts at 0 again.

In [23]:
# the first column label of each raw frame is the recording's unix starting time
STARTING_TIMES = [raw_df.columns[0] for raw_df in temp_e4_dfs]
FREQUENCY = 4.0 # 4 Hz

# drop the first row and rebuild each frame with a single "temp" column (index restarts at 0)
temp_e4_dfs = [
    pd.DataFrame(data=raw_df.iloc[1:].values, columns=["temp"])
    for raw_df in temp_e4_dfs
]

Now let's add the Timestamp to the data.

In [24]:
# add timestamp
def add_timestamp(df, starting_time, frequency=None):
    """Return a copy of ``df`` with a leading ``timestamp`` column.

    The timestamp of a row is ``float(starting_time) + index / frequency``,
    i.e. the row index is treated as the sample number.

    Parameters
    ----------
    df : pandas.DataFrame
        Sensor readings; the index is used as the sample counter.
    starting_time : str or float
        Starting time of the recording; it is converted with ``float()``.
    frequency : float, optional
        Sampling frequency in Hz. Defaults to the notebook-level ``FREQUENCY``.

    Returns
    -------
    pandas.DataFrame
        A new frame with ``timestamp`` first, followed by the other columns.
        The input frame is left untouched, and an existing ``timestamp``
        column is replaced rather than duplicated, so re-running the cell
        that calls this function does not lose the data columns.
    """
    if frequency is None:
        frequency = FREQUENCY

    timestamps = float(starting_time) + df.index / frequency

    # keep every column except a stale timestamp from a previous run
    data_cols = [col for col in df.columns if col != "timestamp"]
    result = df[data_cols].copy()
    result.insert(0, "timestamp", timestamps)
    return result

In [25]:
# pair every participant's frame with the starting time taken from its own file
temp_e4_dfs = [
    add_timestamp(participant_df, STARTING_TIMES[idx])
    for idx, participant_df in enumerate(temp_e4_dfs)
]

### Remove data out of range

In [26]:
# read the study information table (one row per participant with the activity start times)
study_info_df = pd.read_csv(
    os.path.join(DATASET_PATH, DATASET_VERSION, "Study_Information.csv")
)

In [27]:
# preview the first rows of the study information table
study_info_df.head()

Unnamed: 0,Participant,Start_Sit,Start_Stand,Start_Cycle1,Start_Cycle2,Start_Run1,Start_Run2,Cycle_Speed1,Cycle_Speed2,Run_Speed1,Run_Speed2,Comments,MET_Sit,MET_Stand,MET_Cycle1,MET_Cycle2,MET_Run1,MET_Run2
0,P01,2021-12-03 16:58:50,2021-12-03 17:03:00,2021-12-03 17:08:00,2021-12-03 17:13:00,2021-12-03 17:18:00,2021-12-03 17:23:00,14,20,4.0,6.0,,1,1.2,10,16,4.5,10.0
1,P02,2021-11-25 17:17:00,2021-11-25 17:23:00,2021-11-25 17:36:10,2021-11-25 17:41:50,2021-11-25 17:46:50,2021-11-25 17:51:00,14,19,5.0,8.5,Muse headband data lost due to connectivity is...,1,1.2,10,12,8.0,14.0
2,P03,2021-11-26 16:20:20,2021-11-26 16:26:20,2021-11-26 16:53:06,2021-11-26 16:56:18,2021-11-26 17:00:13,2021-11-26 17:04:05,18,22,6.0,7.0,Second part V02 data got losT,1,1.2,12,16,10.0,11.5
3,P04,2021-11-26 18:15:48,2021-11-26 18:21:56,2021-11-26 18:27:00,2021-11-26 18:32:20,2021-11-26 18:37:10,2021-11-26 18:45:00,15,22,6.0,9.0,Interruption at 18:41,1,1.2,10,16,10.0,15.0
4,P05,2021-11-29 09:40:47,2021-11-29 09:45:40,2021-11-29 09:51:41,2021-11-29 09:56:41,2021-11-29 10:02:10,2021-11-29 10:06:40,15,22,5.0,7.0,,1,1.2,10,16,8.0,11.5


In [28]:
# convert start datetime to unix time
def startime_converter(date_time):
    """Convert a study-information datetime to unix time (seconds).

    A naive ``date_time`` is interpreted as UTC wall-clock time, so the
    result no longer depends on the timezone of the machine running the
    notebook. A timezone-aware ``date_time`` is converted as-is.

    NOTE(review): the previous implementation added 5 h 30 min and then called
    ``datetime.timestamp`` on the naive value, which applies the *local*
    timezone. On a UTC+5:30 machine those two steps cancel out and yield
    exactly the value returned here; on any other machine the result was
    shifted. This assumes the notebook was originally run on a UTC+5:30
    host -- confirm against the E4 timestamps.

    Parameters
    ----------
    date_time : datetime.datetime
        Start time parsed from the study information table.

    Returns
    -------
    float
        Seconds since the unix epoch.
    """
    if date_time.tzinfo is None:
        date_time = date_time.replace(tzinfo=datetime.timezone.utc)
    return date_time.timestamp()

In [29]:
# unix start time of the first activity (sitting) for every participant
starting_time_arr = [
    startime_converter(datetime.datetime.strptime(start_sit, "%Y-%m-%d %H:%M:%S"))
    for start_sit in study_info_df["Start_Sit"]
]

In [30]:
# remove rows out of range
def remove_out_of_range(df, starting_time):
    """Drop every row recorded before ``starting_time``.

    Rows whose ``timestamp`` is greater than or equal to ``starting_time``
    are kept, and the index of the returned frame is renumbered from 0.
    The input frame is not modified.
    """
    in_range = df["timestamp"].ge(starting_time)
    return df.loc[in_range].reset_index(drop=True)

In [31]:
# trim each participant's recording to start at that participant's first activity
temp_e4_dfs = [
    remove_out_of_range(participant_df, starting_time_arr[idx])
    for idx, participant_df in enumerate(temp_e4_dfs)
]

In [32]:
# preview the first rows of the first participant's trimmed data
temp_e4_dfs[0].head()

Unnamed: 0,timestamp,temp
0,1638551000.0,32.18
1,1638551000.0,32.18
2,1638551000.0,32.18
3,1638551000.0,32.18
4,1638551000.0,32.18


## Saving Data

Before saving the data let's add the `session ids` to the data.

There are **6** session types as follows in the WEEE dataset.
- Sit
- Stand
- Cycle (Speed 1)
- Cycle (Speed 2)
- Run (Speed 1)
- Run (Speed 2)

We will encode the activity types as follows.
- 0: Sit
- 1: Stand
- 2: Cycle (Speed 1)
- 3: Cycle (Speed 2)
- 4: Run (Speed 1)
- 5: Run (Speed 2)

In [33]:
# start times of each activity, converted to unix time (one list per activity, one entry per participant)
start_stand, start_cycle1, start_cycle2, start_run1, start_run2 = (
    [
        startime_converter(datetime.datetime.strptime(start_value, "%Y-%m-%d %H:%M:%S"))
        for start_value in study_info_df[start_column]
    ]
    for start_column in ("Start_Stand", "Start_Cycle1", "Start_Cycle2", "Start_Run1", "Start_Run2")
)

In [34]:
# add session id
def add_session_id(df, start_stand, start_cycle1, start_cycle2, start_run1, start_run2):
    """Label every row of ``df`` with the session it belongs to.

    A ``session_id`` column is added to ``df`` in place: all rows start as 0,
    and each start time, applied in the order given, overwrites the id of
    every row at or after it with 1 (stand), 2 (cycle 1), 3 (cycle 2),
    4 (run 1) and 5 (run 2). The same (modified) frame is returned.
    """
    session_starts = (start_stand, start_cycle1, start_cycle2, start_run1, start_run2)

    df["session_id"] = 0
    # later boundaries overwrite earlier ones, so a row ends up with the last session it reached
    for session_id, session_start in enumerate(session_starts, start=1):
        df.loc[df["timestamp"] >= session_start, "session_id"] = session_id
    return df

In [35]:
# label each participant's rows using that participant's own activity start times
temp_e4_dfs = [
    add_session_id(
        participant_df,
        start_stand[idx],
        start_cycle1[idx],
        start_cycle2[idx],
        start_run1[idx],
        start_run2[idx],
    )
    for idx, participant_df in enumerate(temp_e4_dfs)
]

Now we will add the user id to each DataFrame.

In [37]:
# add user id: participants are numbered from 1 in list order (modifies each frame in place)
for user_id, df in enumerate(temp_e4_dfs, start=1):
    df["user_id"] = user_id

Finally, we will combine the DataFrames and rearrange the columns before saving everything into a single file.

In [38]:
# stack all participants into one frame; ignore_index=True already renumbers the rows from 0
temp_e4_df = pd.concat(temp_e4_dfs, ignore_index=True)

In [39]:
# put the identifying columns first and keep the remaining columns in their current order
priority_cols = ["user_id", "session_id", "timestamp"]
temp_e4_df = temp_e4_df.loc[
    :,
    priority_cols + [name for name in temp_e4_df.columns if name not in priority_cols],
]

In [42]:
# preview the last rows of the combined frame
temp_e4_df.tail()

Unnamed: 0,user_id,session_id,timestamp,temp
161703,17,5,1638467000.0,21.29
161704,17,5,1638467000.0,21.23
161705,17,5,1638467000.0,21.23
161706,17,5,1638467000.0,21.23
161707,17,5,1638467000.0,21.23


In [44]:
# save DataFrame
# create the output folder first: to_csv does not create missing directories,
# so a fresh checkout would otherwise fail here
output_dir = os.path.join(DATASET_PATH, DATASET_OUTPUT_VERSION, "devices", "E4")
os.makedirs(output_dir, exist_ok=True)
temp_e4_df.to_csv(os.path.join(output_dir, "TEMP.csv"), index=False)