# Data Preprocessing: Zephyr Data

Using the `Summary.csv` files provided in each Zephyr Dataset, we will be extracting the most impactful columns for our study.

## Imports and Data Loading

In [132]:
# imports
import os
import re
from envyaml import EnvYAML

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime

%matplotlib inline
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [155]:
# environment configuration: every dataset setting below is read from ../../env.yaml
VAR_ENV = EnvYAML("../../env.yaml")
# dataset.path is joined onto "../..", the same directory prefix used for env.yaml
DATASET_PATH = os.path.join("../..", VAR_ENV["dataset.path"])
# sub-folder of DATASET_PATH that the participant folders and Study_Information.csv are read from
DATASET_VERSION = VAR_ENV["dataset.version"]
# sub-folder of DATASET_PATH that the combined ZEPHYR.csv is written to
DATASET_OUTPUT_VERSION = VAR_ENV["dataset.output"]
# participant count; used as the upper bound when building the P01..Pnn folder list
PARTICIPANTS = VAR_ENV["dataset.participants"]

In [134]:
# one input folder per participant: P01, P02, ... up to PARTICIPANTS
path_list = [
    os.path.join(DATASET_PATH, DATASET_VERSION, f"P{participant_no:02d}")
    for participant_no in range(1, PARTICIPANTS + 1)
]

### Loading Zephyr Data

Loading Zephyr data stored in `Summary.csv` files of each participant.

In [135]:
# Load exactly one Summary.csv per participant, keeping list order == participant order.
zephyr_dfs = []
for participant_dir in path_list:
    zephyr_dir = os.path.join(participant_dir, "ZEPHYR")
    # os.listdir returns names in arbitrary order; sorting makes the chosen file
    # reproducible if a folder ever holds more than one summary export
    summary_files = sorted(
        file_name for file_name in os.listdir(zephyr_dir) if file_name.endswith("Summary.csv")
    )
    if not summary_files:
        # Later cells pair frames with start times and user ids purely by list position,
        # so silently skipping a participant would mislabel everyone after them.
        raise FileNotFoundError(f"No Summary.csv file found in {zephyr_dir}")
    zephyr_dfs.append(pd.read_csv(os.path.join(zephyr_dir, summary_files[0])))

## Data Cleaning

Before cleaning the data, let's change the column names.

In [136]:
# normalise every header to lower case so later cells can rely on a single spelling
for df in zephyr_dfs:
    df.columns = df.columns.map(str.lower)

In [137]:
# preview the first loaded frame to check the lower-cased column names
zephyr_dfs[0].head()

Unnamed: 0,time,hr,br,skintemp,posture,activity,peakaccel,batteryvolts,batterylevel,bramplitude,...,devicetemp,statusinfo,linkquality,rssi,txpower,coretemp,auxadc1,auxadc2,auxadc3,ext.status
0,03/12/2021 16:44:37.445,65,10.0,-3276.8,-1,0.66,1.73,4.174,95,0.0,...,24.9,688,255,-128,-128,6553.5,416,420,499,49152
1,03/12/2021 16:44:38.445,65,10.0,-3276.8,45,0.8,1.14,4.174,95,0.0,...,24.9,688,255,-128,-128,6553.5,415,422,500,32768
2,03/12/2021 16:44:39.445,65,10.0,-3276.8,68,0.52,0.73,4.174,95,0.0,...,24.9,688,255,-128,-128,6553.5,415,421,500,32768
3,03/12/2021 16:44:40.445,65,10.0,-3276.8,75,0.35,0.86,4.174,95,0.0,...,24.9,688,255,-128,-128,6553.5,415,421,499,32768
4,03/12/2021 16:44:41.445,67,10.0,-3276.8,91,0.57,1.37,4.174,95,0.0,...,24.9,688,255,-128,-128,6553.5,415,421,500,32768


Now let's rename the `time` column to `timestamp` and convert its date-time strings to Unix timestamps (seconds).

In [138]:
# add timestamp
def add_timestamp(df):
    """Return a copy of ``df`` whose ``time`` column is replaced by a Unix ``timestamp`` column.

    Each ``time`` string (``"%d/%m/%Y %H:%M:%S"`` followed by a fractional part
    after a ".") is truncated to whole seconds, shifted forward by 5 h 30 min and
    converted to float seconds with ``datetime.timestamp``.

    The input frame is left untouched, so the caller's data is not changed by the call.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string ``time`` column.

    Returns
    -------
    pandas.DataFrame
        New frame with ``time`` renamed to ``timestamp`` (same column position).

    Notes
    -----
    The parsed datetimes are naive, so ``datetime.timestamp`` interprets them in
    the local timezone of the machine running the notebook.
    NOTE(review): the fixed +5 h 30 min shift looks like a UTC+05:30 offset - confirm.
    """
    def _to_unix(raw_time):
        # drop the fractional seconds, then parse the day-first date format
        parsed = datetime.datetime.strptime(raw_time.split(".")[0], "%d/%m/%Y %H:%M:%S")
        shifted = parsed + datetime.timedelta(hours=5, minutes=30)
        return datetime.datetime.timestamp(shifted)

    # rename without inplace=True returns a new frame, so the caller's frame is not mutated
    converted = df.rename(columns={"time": "timestamp"})
    converted["timestamp"] = converted["timestamp"].apply(_to_unix)
    return converted

In [139]:
# convert the time column of every participant frame
zephyr_dfs = list(map(add_timestamp, zephyr_dfs))

In [140]:
# preview the first frame again: `time` should now appear as a numeric `timestamp` column
zephyr_dfs[0].head()

Unnamed: 0,timestamp,hr,br,skintemp,posture,activity,peakaccel,batteryvolts,batterylevel,bramplitude,...,devicetemp,statusinfo,linkquality,rssi,txpower,coretemp,auxadc1,auxadc2,auxadc3,ext.status
0,1638550000.0,65,10.0,-3276.8,-1,0.66,1.73,4.174,95,0.0,...,24.9,688,255,-128,-128,6553.5,416,420,499,49152
1,1638550000.0,65,10.0,-3276.8,45,0.8,1.14,4.174,95,0.0,...,24.9,688,255,-128,-128,6553.5,415,422,500,32768
2,1638550000.0,65,10.0,-3276.8,68,0.52,0.73,4.174,95,0.0,...,24.9,688,255,-128,-128,6553.5,415,421,500,32768
3,1638550000.0,65,10.0,-3276.8,75,0.35,0.86,4.174,95,0.0,...,24.9,688,255,-128,-128,6553.5,415,421,499,32768
4,1638550000.0,67,10.0,-3276.8,91,0.57,1.37,4.174,95,0.0,...,24.9,688,255,-128,-128,6553.5,415,421,500,32768


Now we will remove the rows recorded before the start of the study period.

In [141]:
# load the Study Information table from the dataset version folder;
# its Start_* columns are parsed into Unix times in later cells
study_info_df = pd.read_csv(os.path.join(DATASET_PATH, DATASET_VERSION, "Study_Information.csv"))

In [142]:
# preview the study information table
study_info_df.head()

Unnamed: 0,Participant,Start_Sit,Start_Stand,Start_Cycle1,Start_Cycle2,Start_Run1,Start_Run2,Cycle_Speed1,Cycle_Speed2,Run_Speed1,Run_Speed2,Comments,MET_Sit,MET_Stand,MET_Cycle1,MET_Cycle2,MET_Run1,MET_Run2
0,P01,2021-12-03 16:58:50,2021-12-03 17:03:00,2021-12-03 17:08:00,2021-12-03 17:13:00,2021-12-03 17:18:00,2021-12-03 17:23:00,14,20,4.0,6.0,,1,1.2,10,16,4.5,10.0
1,P02,2021-11-25 17:17:00,2021-11-25 17:23:00,2021-11-25 17:36:10,2021-11-25 17:41:50,2021-11-25 17:46:50,2021-11-25 17:51:00,14,19,5.0,8.5,Muse headband data lost due to connectivity is...,1,1.2,10,12,8.0,14.0
2,P03,2021-11-26 16:20:20,2021-11-26 16:26:20,2021-11-26 16:53:06,2021-11-26 16:56:18,2021-11-26 17:00:13,2021-11-26 17:04:05,18,22,6.0,7.0,Second part V02 data got losT,1,1.2,12,16,10.0,11.5
3,P04,2021-11-26 18:15:48,2021-11-26 18:21:56,2021-11-26 18:27:00,2021-11-26 18:32:20,2021-11-26 18:37:10,2021-11-26 18:45:00,15,22,6.0,9.0,Interruption at 18:41,1,1.2,10,16,10.0,15.0
4,P05,2021-11-29 09:40:47,2021-11-29 09:45:40,2021-11-29 09:51:41,2021-11-29 09:56:41,2021-11-29 10:02:10,2021-11-29 10:06:40,15,22,5.0,7.0,,1,1.2,10,16,8.0,11.5


In [143]:
# convert start datetime to unix time
def startime_converter(date_time):
    """Shift ``date_time`` forward by 5 h 30 min and return it as a Unix timestamp.

    Parameters
    ----------
    date_time : datetime.datetime
        Start time to convert.

    Returns
    -------
    float
        Seconds since the epoch as produced by ``datetime.datetime.timestamp``.
    """
    offset = datetime.timedelta(hours=5, minutes=30)
    return datetime.datetime.timestamp(date_time + offset)

In [144]:
# Unix start time of the first (Sit) session for each row of the study information table
starting_time_arr = [
    startime_converter(datetime.datetime.strptime(raw_start, "%Y-%m-%d %H:%M:%S"))
    for raw_start in study_info_df["Start_Sit"]
]

In [145]:
# remove out of range rows
def remove_out_of_range(df, starting_time):
    """Keep only the rows recorded at or after ``starting_time``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a numeric ``timestamp`` column.
    starting_time : float
        Lower bound (inclusive) compared against ``timestamp``.

    Returns
    -------
    pandas.DataFrame
        Filtered frame with a fresh 0..n-1 index.
    """
    in_range = df["timestamp"] >= starting_time
    return df.loc[in_range].reset_index(drop=True)

In [146]:
# trim each frame with the start time at the same list position
zephyr_dfs = list(map(remove_out_of_range, zephyr_dfs, starting_time_arr))

## Saving the Data

Before saving the data let's add the `session ids` to the data.

There are **6** session types as follows in the WEEE dataset.
- Sit
- Stand
- Cycle (Speed 1)
- Cycle (Speed 2)
- Run (Speed 1)
- Run (Speed 2)

We will encode the activity types as follows.
- 0: Sit
- 1: Stand
- 2: Cycle (Speed 1)
- 3: Cycle (Speed 2)
- 4: Run (Speed 1)
- 5: Run (Speed 2)

In [147]:
# start times of each activity, one list of Unix times per Start_* column
start_stand, start_cycle1, start_cycle2, start_run1, start_run2 = (
    [
        startime_converter(datetime.datetime.strptime(raw_start, "%Y-%m-%d %H:%M:%S"))
        for raw_start in study_info_df[column]
    ]
    for column in ("Start_Stand", "Start_Cycle1", "Start_Cycle2", "Start_Run1", "Start_Run2")
)

In [148]:
# add session id
def add_session_id(df, start_stand, start_cycle1, start_cycle2, start_run1, start_run2):
    """Return a copy of ``df`` with a ``session_id`` column derived from ``timestamp``.

    Every row starts as session 0. The five start times are then applied in the
    order given, and each one relabels all rows whose ``timestamp`` is at or
    after it (1 for ``start_stand`` up to 5 for ``start_run2``). A later
    boundary therefore overrides an earlier one.

    The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a numeric ``timestamp`` column.
    start_stand, start_cycle1, start_cycle2, start_run1, start_run2 : float
        Start times compared against ``timestamp``.

    Returns
    -------
    pandas.DataFrame
        New frame with an integer ``session_id`` column.
    """
    # assign() builds a new frame, so the caller's frame is left without side effects
    labelled = df.assign(session_id=0)
    boundaries = (start_stand, start_cycle1, start_cycle2, start_run1, start_run2)
    for session_id, boundary in enumerate(boundaries, start=1):
        labelled.loc[labelled["timestamp"] >= boundary, "session_id"] = session_id
    return labelled

In [149]:
# label each frame using the activity start times at the same list position
zephyr_dfs = [
    add_session_id(
        frame,
        start_stand[position],
        start_cycle1[position],
        start_cycle2[position],
        start_run1[position],
        start_run2[position],
    )
    for position, frame in enumerate(zephyr_dfs)
]

Now we will add the user id to each DataFrame.

In [150]:
# add user id: frames are numbered from 1 in list order
for user_id, df in enumerate(zephyr_dfs, start=1):
    df["user_id"] = user_id

Finally, we will combine the DataFrames into a single DataFrame and rearrange its columns.

In [151]:
# combine dataframes; ignore_index=True already yields a fresh 0..n-1 index
zephyr_df = pd.concat(zephyr_dfs, ignore_index=True)

In [153]:
# reorder columns: identifiers first, all remaining columns in their existing order
priority_cols = ["user_id", "session_id", "timestamp"]
zephyr_df = zephyr_df.loc[
    :, priority_cols + [name for name in zephyr_df.columns if name not in priority_cols]
]

In [154]:
# preview the combined frame: user_id, session_id and timestamp should lead the columns
zephyr_df.head()

Unnamed: 0,user_id,session_id,timestamp,hr,br,skintemp,posture,activity,peakaccel,batteryvolts,...,devicetemp,statusinfo,linkquality,rssi,txpower,coretemp,auxadc1,auxadc2,auxadc3,ext.status
0,1,0,1638551000.0,96,20.0,-3276.8,-8,0.01,0.03,4.144,...,31.1,128,254,0,12,37.2,415,420,499,32792
1,1,0,1638551000.0,97,20.0,-3276.8,-8,0.01,0.03,4.144,...,31.1,128,254,0,12,37.2,417,422,500,32792
2,1,0,1638551000.0,97,20.0,-3276.8,-8,0.01,0.03,4.144,...,31.1,128,254,0,12,37.2,415,420,499,32792
3,1,0,1638551000.0,98,20.0,-3276.8,-8,0.01,0.03,4.144,...,31.1,128,254,0,12,37.2,415,420,499,32792
4,1,0,1638551000.0,97,20.0,-3276.8,-8,0.02,0.06,4.144,...,31.1,128,254,0,12,37.2,415,420,499,32792


In [157]:
# save DataFrame
output_dir = os.path.join(DATASET_PATH, DATASET_OUTPUT_VERSION, "devices", "ZEPHYR")
# to_csv does not create missing folders, so make sure the target directory exists first
os.makedirs(output_dir, exist_ok=True)
zephyr_df.to_csv(os.path.join(output_dir, "ZEPHYR.csv"), index=False)