### Load Arduino readings

In [1325]:
import glob
import math
import pandas as pd
import os
import numpy as np
import numpy.ma as ma

In [1326]:
l = [pd.read_csv(filename, names=["Humidity", "Temperature", "TemperatureF", "Light", "eCO2", "TVOC", "DateTime"]) for filename in glob.glob("./data/readings/*.TXT")]
dfreadings = pd.concat(l, axis=0)

In [1327]:
l

[   Humidity  Temperature  TemperatureF  Light  eCO2  TVOC             DateTime
 0      36.9         21.7         71.06  484.0   415     2  2022-12-12T13:02:04
 1      36.8         21.7         71.06  489.0   641    36  2022-12-12T13:03:04
 2      37.6         21.8         71.24  496.0   426     3  2022-12-12T13:04:04,
    Humidity  Temperature  TemperatureF  Light  eCO2  TVOC             DateTime
 0      37.0         21.8         71.24  495.0   400     0  2022-12-12T13:05:31
 1      36.6         21.7         71.06  493.0   432     4  2022-12-12T13:06:31,
     Humidity  Temperature  TemperatureF   Light  eCO2  TVOC  \
 0       35.1         21.7         71.06   841.0   400     0   
 1       35.0         21.5         70.70   868.0   400     0   
 2       35.0         21.5         70.70  1101.0   400     0   
 3       35.2         21.5         70.70  1085.0   400     0   
 4       35.1         21.5         70.70  1048.0   405     0   
 5       35.1         21.4         70.52  1075.0   411

In [1328]:
dfreadings.head(n=100)

Unnamed: 0,Humidity,Temperature,TemperatureF,Light,eCO2,TVOC,DateTime
0,36.9,21.7,71.06,484.0,415,2,2022-12-12T13:02:04
1,36.8,21.7,71.06,489.0,641,36,2022-12-12T13:03:04
2,37.6,21.8,71.24,496.0,426,3,2022-12-12T13:04:04
0,37.0,21.8,71.24,495.0,400,0,2022-12-12T13:05:31
1,36.6,21.7,71.06,493.0,432,4,2022-12-12T13:06:31
...,...,...,...,...,...,...,...
1,35.7,19.7,67.46,495.0,400,0,2022-12-13T09:57:09
2,35.5,19.7,67.46,518.0,400,0,2022-12-13T09:58:09
3,35.5,19.8,67.64,520.0,400,0,2022-12-13T09:59:09
4,35.4,19.9,67.82,522.0,400,0,2022-12-13T10:00:09


### Load HRV data

In [1329]:
def getHRVDataPerUser(txtfolder):

    ## txtfolder = r'./Data/HRV/Niek_Snijders/' #Change to your folder path

    #Find the textfiles
    textfiles = []
    for root, folder, files in os.walk(txtfolder):
        for file in files:
            if file.endswith('.txt'):
                fullname = os.path.join(root, file)
                textfiles.append(fullname)
    textfiles.sort() #Sort the filesnames

    #Read each of them to a dataframe
    for filenum, file in enumerate(textfiles, 1):
        if filenum==1:
            df = pd.read_csv(file, names=['RR'], delim_whitespace=True)
            date = os.path.basename(file).split(".")[0].split(" ")[0]
            time = os.path.basename(file).split(".")[0].split(" ")[1]
            split_date = date.split("-")
            split_time = time.split("-")
            df['DateTime']=split_date[2]+"-"+split_date[1]+"-"+split_date[0]+ " " + split_time[0] + ":" + split_time[1] + ":" + split_time[2]
            df['DateTime'] = pd.to_datetime(df['DateTime'])
        else:
            tempdf = pd.read_csv(file, names=['RR'], delim_whitespace=True)
            date = os.path.basename(file).split(".")[0].split(" ")[0]
            time = os.path.basename(file).split(".")[0].split(" ")[1]
            split_date = date.split("-")
            split_time = time.split("-")
            tempdf['DateTime']=split_date[2]+"-"+split_date[1]+"-"+split_date[0]+ " " + split_time[0] + ":" + split_time[1] + ":" + split_time[2]
            tempdf['DateTime'] = pd.to_datetime(tempdf['DateTime'])
            df = pd.concat([df, tempdf], ignore_index=True)

    return df

In [1330]:
def calculateHRV(df):
    array = df[["RR"]].to_numpy()
    sumOfSuccessiveDifference = 0.0
    array = array[((array <= 2000))]
    for idx, x in enumerate(array):
        if x != array[-1] :
            sumOfSuccessiveDifference += (array[idx] - array[idx + 1])**2
            df['RMSSD'] = math.sqrt(sumOfSuccessiveDifference/len(array))
    return df

#### Niek

In [1331]:
dfHRVNiek = getHRVDataPerUser(r'./Data/HRV/Niek_Snijders')

In [1332]:
dfHRVNiek = dfHRVNiek.groupby(["DateTime"]).apply(lambda x: calculateHRV(x))

In [1333]:
dfHRVNiek["User Name"] = "niek@email.com"

In [1334]:
dfHRVNiek.head()

Unnamed: 0,RR,DateTime,RMSSD,User Name
0,776,2022-11-17 15:17:06,42.056698,niek@email.com
1,758,2022-11-17 15:17:06,42.056698,niek@email.com
2,737,2022-11-17 15:17:06,42.056698,niek@email.com
3,711,2022-11-17 15:17:06,42.056698,niek@email.com
4,738,2022-11-17 15:17:06,42.056698,niek@email.com


In [1335]:
dfHRVNiek = dfHRVNiek.drop(["RR"], axis = 1)

In [1336]:
dfHRVNiek.drop_duplicates(keep='first', inplace=True)

In [1337]:
dfHRVNiek.reset_index(inplace=True)
dfHRVNiek = dfHRVNiek.drop(["index"], axis = 1)

In [1338]:
dfHRVNiek.head(n=100)

Unnamed: 0,DateTime,RMSSD,User Name
0,2022-11-17 15:17:06,42.056698,niek@email.com
1,2022-11-18 11:00:00,42.440603,niek@email.com
2,2022-11-24 12:01:05,45.918517,niek@email.com
3,2022-11-30 11:07:17,36.091961,niek@email.com
4,2022-11-30 11:49:08,52.136928,niek@email.com
5,2022-01-12 14:35:12,92.17936,niek@email.com
6,2022-06-12 15:52:08,63.953953,niek@email.com
7,2022-06-12 15:54:01,78.683995,niek@email.com
8,2022-07-12 10:19:15,45.998991,niek@email.com
9,2022-07-12 12:44:46,59.847481,niek@email.com


#### Job

In [1339]:
dfHRVJob = getHRVDataPerUser(r'./Data/HRV/jobhaast@hotmail.com')

In [1340]:
dfHRVJob = dfHRVJob.groupby(["DateTime"]).apply(lambda x: calculateHRV(x))

In [1341]:
dfHRVJob["User Name"] = "job@email.com"

In [1342]:
dfHRVJob = dfHRVJob.drop(["RR"], axis = 1)

In [1343]:
dfHRVJob.drop_duplicates(keep='first', inplace=True)

In [1344]:
dfHRVJob.reset_index(inplace=True)
dfHRVJob = dfHRVJob.drop(["index"], axis = 1)

In [1345]:
dfHRVJob.head(n=100)

Unnamed: 0,DateTime,RMSSD,User Name
0,2022-06-12 14:38:47,57.882491,job@email.com
1,2022-06-12 14:41:41,82.642453,job@email.com
2,2022-06-12 14:49:55,129.60289,job@email.com
3,2022-07-12 10:30:46,58.791368,job@email.com
4,2022-07-12 13:11:45,67.209265,job@email.com
5,2022-07-12 13:51:22,105.023172,job@email.com
6,2022-07-12 14:01:30,27.250164,job@email.com
7,2022-12-12 13:28:43,103.473443,job@email.com
8,2022-12-12 13:42:22,83.071749,job@email.com
9,2022-12-12 13:55:26,118.254976,job@email.com


#### Stefan

In [1346]:
dfHRVStefan = getHRVDataPerUser(r'./Data/HRV/s.jaspers1997@gmail.com')

In [1347]:
dfHRVStefan = dfHRVStefan.groupby(["DateTime"]).apply(lambda x: calculateHRV(x))

In [1348]:
dfHRVStefan["User Name"] = "stefan@jaspers.nl"

In [1349]:
dfHRVStefan = dfHRVStefan.drop(["RR"], axis = 1)

In [1350]:
dfHRVStefan.drop_duplicates(keep='first', inplace=True)

In [1351]:
dfHRVStefan.reset_index(inplace=True)
dfHRVStefan = dfHRVStefan.drop(["index"], axis = 1)

In [1352]:
dfHRVStefan.head(n=100)

Unnamed: 0,DateTime,RMSSD,User Name
0,2022-07-12 10:25:25,42.801957,stefan@jaspers.nl
1,2022-07-12 13:54:44,61.043251,stefan@jaspers.nl
2,2022-07-12 13:59:15,120.665735,stefan@jaspers.nl
3,2022-12-12 13:24:39,48.239248,stefan@jaspers.nl
4,2022-12-12 13:52:36,56.135213,stefan@jaspers.nl
5,2022-12-12 14:34:22,74.210511,stefan@jaspers.nl
6,2022-12-12 14:47:10,65.346911,stefan@jaspers.nl
7,2022-12-14 11:27:23,154.013215,stefan@jaspers.nl
8,2022-12-14 11:53:47,27.794275,stefan@jaspers.nl
9,2022-12-14 12:10:59,39.124237,stefan@jaspers.nl


#### Noah

In [1353]:
dfHRVNoah = getHRVDataPerUser(r'./Data/HRV/Noah')

In [1354]:
dfHRVNoah = dfHRVNoah.groupby(["DateTime"]).apply(lambda x: calculateHRV(x))

In [1355]:
dfHRVNoah["User Name"] = "Noah@email.com"

In [1356]:
dfHRVNoah = dfHRVNoah.drop(["RR"], axis = 1)

In [1357]:
dfHRVNoah.drop_duplicates(keep='first', inplace=True)

In [1358]:
dfHRVNoah.reset_index(inplace=True)
dfHRVNoah = dfHRVNoah.drop(["index"], axis = 1)

In [1359]:
dfHRVNoah.head(n=100)

Unnamed: 0,DateTime,RMSSD,User Name
0,2022-07-12 14:11:10,53.089728,Noah@email.com
1,2022-07-12 14:14:30,174.320772,Noah@email.com
2,2022-07-12 14:18:06,127.283935,Noah@email.com
3,2022-07-12 14:19:29,142.566617,Noah@email.com
4,2022-12-12 13:28:19,30.287191,Noah@email.com
5,2022-12-12 13:31:19,25.434576,Noah@email.com
6,2022-12-12 13:34:52,26.282239,Noah@email.com
7,2022-12-12 13:39:23,31.8305,Noah@email.com
8,2022-12-12 13:49:53,38.763914,Noah@email.com
9,2022-12-12 13:54:46,37.011822,Noah@email.com


### Load selfreporting data

In [1360]:
import pandas as pd
l = [pd.read_csv(filename) for filename in glob.glob("./data/selfreporting/*.csv")]
dfself = pd.concat(l, axis=0)

In [1361]:
dfself.head(n=100)

Unnamed: 0,Air Qualityall Good,Beverage,Cloth 1,Cloth 2,Cloth 3,Cloth 4,Cloth 5,Cloth 6,Cloth 7,Duration Of Location,...,Location,Mode Of Transport,Mood,Smelly,Stuffy,Suffocating,Thermal Comfort,Thermal Preference,Timestamp,User Name
0,YES,2,NO,YES,YES,NO,NO,NO,NO,1-2 hours,...,LA Explora floor 2,1,3,NO,NO,NO,5,1,2022/12/7 2022-12-07 10:42:04.299,job@email.com
1,YES,1,NO,YES,YES,NO,NO,NO,NO,more than 3 hours,...,LA402,1,3,NO,NO,NO,4,2,2022/12/7 2022-12-07 13:51:00.415,job@email.com
2,YES,2,NO,YES,YES,NO,NO,NO,NO,more than 3 hours,...,LA402,1,4,NO,NO,NO,5,2,2022/12/7 2022-12-07 14:04:39.307,job@email.com
3,YES,2,NO,NO,YES,NO,NO,NO,YES,more than 3 hours,...,LA222,1,4,NO,NO,NO,5,1,2022/12/12 2022-12-12 13:34:15.607,job@email.com
4,YES,2,NO,NO,YES,NO,NO,NO,YES,more than 3 hours,...,LA223,1,4,NO,NO,NO,5,1,2022/12/12 2022-12-12 13:42:56.603,job@email.com
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4,YES,1,NO,NO,YES,NO,NO,YES,NO,1-2 hours,...,LD025,4,5,NO,NO,NO,5,1,2022/12/14 2022-12-14 11:54:46.815,stefan@jaspers.nl
5,YES,2,NO,NO,YES,NO,NO,YES,NO,1-2 hours,...,LD025,4,5,NO,NO,NO,6,1,2022/12/14 2022-12-14 12:11:57.538,stefan@jaspers.nl
6,YES,2,NO,NO,YES,NO,NO,YES,NO,1-2 hours,...,LD025,4,4,NO,NO,NO,3,2,2022/12/14 2022-12-14 12:42:24.715,stefan@jaspers.nl
7,YES,3,NO,NO,YES,NO,NO,YES,NO,more than 3 hours,...,LD025,4,5,NO,NO,NO,2,3,2022/12/14 2022-12-14 13:07:28.476,stefan@jaspers.nl


# Data preprocessing

In [1362]:
dfreadings['DateTime'] = pd.to_datetime(dfreadings['DateTime'])

In [1363]:
datetime = dfself['Timestamp'].str.split(".",n = 1, expand = True)
dfself['Timestamp'] = datetime[0]
datetime = dfself['Timestamp'].str.split(" ",n = 1, expand = True)
datetime
dfself['Timestamp'] = datetime[1]

In [1364]:
dfself['DateTime'] = pd.to_datetime(dfself['Timestamp'])

### Merge data

In [1365]:
## Merge different hrv dataframes
dfHRV = pd.concat([dfHRVNiek, dfHRVJob, dfHRVStefan, dfHRVNoah], axis=0)
dfHRV.head(n=100)

Unnamed: 0,DateTime,RMSSD,User Name
0,2022-11-17 15:17:06,42.056698,niek@email.com
1,2022-11-18 11:00:00,42.440603,niek@email.com
2,2022-11-24 12:01:05,45.918517,niek@email.com
3,2022-11-30 11:07:17,36.091961,niek@email.com
4,2022-11-30 11:49:08,52.136928,niek@email.com
...,...,...,...
10,2022-12-14 13:06:13,44.089555,stefan@jaspers.nl
0,2022-07-12 14:11:10,53.089728,Noah@email.com
1,2022-07-12 14:14:30,174.320772,Noah@email.com
2,2022-07-12 14:18:06,127.283935,Noah@email.com


In [1366]:
newdf = pd.merge_asof(dfHRV.sort_values('DateTime'),dfself.sort_values('DateTime'),on='DateTime', by='User Name' , tolerance=pd.Timedelta('420s'),direction='nearest')

In [1367]:
## Merge df for readings with selfreporting df on DateTime and get the nearest
df=pd.merge_asof(newdf.sort_values('DateTime'),dfreadings.sort_values('DateTime'),on='DateTime', tolerance=pd.Timedelta('120s'),direction='nearest')

In [1368]:
newdf.head(n=100)

Unnamed: 0,DateTime,RMSSD,User Name,Air Qualityall Good,Beverage,Cloth 1,Cloth 2,Cloth 3,Cloth 4,Cloth 5,...,Humid,Location,Mode Of Transport,Mood,Smelly,Stuffy,Suffocating,Thermal Comfort,Thermal Preference,Timestamp
0,2022-01-12 14:35:12,92.179360,niek@email.com,,,,,,,,...,,,,,,,,,,
1,2022-06-12 14:38:47,57.882491,job@email.com,,,,,,,,...,,,,,,,,,,
2,2022-06-12 14:41:41,82.642453,job@email.com,,,,,,,,...,,,,,,,,,,
3,2022-06-12 14:49:55,129.602890,job@email.com,,,,,,,,...,,,,,,,,,,
4,2022-06-12 15:52:08,63.953953,niek@email.com,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,2022-12-14 11:27:23,154.013215,stefan@jaspers.nl,YES,1.0,NO,NO,YES,NO,NO,...,NO,LD025,4.0,4.0,NO,NO,NO,3.0,2.0,2022-12-14 11:28:30
96,2022-12-14 11:40:35,124.155699,job@email.com,YES,2.0,NO,YES,YES,NO,NO,...,NO,LD025,1.0,2.0,NO,NO,NO,5.0,1.0,2022-12-14 11:41:11
97,2022-12-14 11:53:05,148.021125,job@email.com,YES,2.0,NO,YES,YES,NO,NO,...,NO,LD025,1.0,1.0,NO,NO,NO,6.0,1.0,2022-12-14 11:53:51
98,2022-12-14 11:53:25,59.473629,niek@email.com,YES,2.0,NO,NO,YES,NO,NO,...,NO,LD025,1.0,2.0,NO,NO,NO,6.0,1.0,2022-12-14 11:54:04


In [1369]:
df.head(n=200)

Unnamed: 0,DateTime,RMSSD,User Name,Air Qualityall Good,Beverage,Cloth 1,Cloth 2,Cloth 3,Cloth 4,Cloth 5,...,Suffocating,Thermal Comfort,Thermal Preference,Timestamp,Humidity,Temperature,TemperatureF,Light,eCO2,TVOC
0,2022-01-12 14:35:12,92.179360,niek@email.com,,,,,,,,...,,,,,,,,,,
1,2022-06-12 14:38:47,57.882491,job@email.com,,,,,,,,...,,,,,,,,,,
2,2022-06-12 14:41:41,82.642453,job@email.com,,,,,,,,...,,,,,,,,,,
3,2022-06-12 14:49:55,129.602890,job@email.com,,,,,,,,...,,,,,,,,,,
4,2022-06-12 15:52:08,63.953953,niek@email.com,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
128,2022-12-15 13:58:07,35.985431,Noah@email.com,,,,,,,,...,,,,,,,,,,
129,2022-12-15 14:04:54,37.492575,Noah@email.com,,,,,,,,...,,,,,,,,,,
130,2022-12-15 14:13:33,46.790399,Noah@email.com,,,,,,,,...,,,,,,,,,,
131,2022-12-15 14:30:09,53.977299,Noah@email.com,,,,,,,,...,,,,,,,,,,


In [1370]:
def bool_to_num(val):
    if val.lower() == "yes": return 1
    return 0

In [1371]:
columns = ["Air Qualityall Good", "Smelly", "Stuffy", "Suffocating", "Humid"]
for i in columns:
    df[i] = df[i].apply(lambda x: bool_to_num(x))

for i in range(1, 8):
    column = f"Cloth {i}"
    df[column] = df[column].apply(lambda x: bool_to_num(x))

dummies = ["Duration Of Location", "Eat Recent", "Location"]
df = pd.get_dummies(df, columns=dummies)
df

AttributeError: 'float' object has no attribute 'lower'