In [1]:
# Loading of all necessary imports
import pandas as pd
from os import listdir
from os.path import join
from IPython.display import display, Markdown
import numpy as np
import datetime

from sklearn.preprocessing import OneHotEncoder, normalize, StandardScaler

In [2]:
# Local source of data
data_path = "../data/"

In [3]:
survey_path = join(data_path, 'surveys')

demographics_file_one_path = join(survey_path,'raw', 'demographics', 'part_one-demographics.csv.gz')
demographics_file_two_path = join(survey_path,'raw', 'demographics', 'part_two-demographics_timings.csv.gz')

In [4]:
df_demographics_one = pd.read_csv(demographics_file_one_path, compression="gzip")
df_demographics_two = pd.read_csv(demographics_file_two_path, compression="gzip")

In [5]:
df_demographics = df_demographics_one.merge(df_demographics_two, on='participant_id', how='inner')
df_demographics['gender'] = df_demographics['gender'].astype('object')
df_demographics['shift'] = df_demographics['shift'].astype('object')
df_demographics.describe().T.to_latex(longtable=True, buf='../descriptives/dem_long_numerical.tex') 
df_demographics[['age']].describe().T.to_latex(buf='../descriptives/dem_short_numerical.tex', position='h', caption='The descriptives statistics of the age variable. The only numerical variable used.', label='desc:shortDemNum') 
df_demographics.describe(include='object').T.to_latex(longtable=True, buf='../descriptives/dem_long_categorical.tex') 
df_demographics[['participant_id', 'hours', 'gender', 'shift']].describe(include='object').T.to_latex(buf='../descriptives/dem_short_categorical.tex', position='h', caption='The descriptives statistics of the categorical variables included.', label='desc:shortDemCat') 

  df_demographics.describe().T.to_latex(longtable=True, buf='../descriptives/dem_long_numerical.tex')
  df_demographics[['age']].describe().T.to_latex(buf='../descriptives/dem_short_numerical.tex', position='h', caption='The descriptives statistics of the age variable. The only numerical variable used.', label='desc:shortDemNum')
  df_demographics.describe(include='object').T.to_latex(longtable=True, buf='../descriptives/dem_long_categorical.tex')
  df_demographics[['participant_id', 'hours', 'gender', 'shift']].describe(include='object').T.to_latex(buf='../descriptives/dem_short_categorical.tex', position='h', caption='The descriptives statistics of the categorical variables included.', label='desc:shortDemCat')


In [6]:
raw_survey_containing_rand = join(survey_path, "raw/baseline/part_two-demo_rand_swls_pss_mpfi_waaq_uwes_pcq_chss.csv.gz")
survey_part_one_path = join(survey_path, "scored", "baseline", "part_one-abs_vocab_gats_audit_psqi_ipaq_iod_ocb_irb_itp_bfi_pan_stai.csv.gz")
survey_part_two_path = join(survey_path, "scored", "baseline", "part_two-rand_swls_pss_mpfi_waaq_uwes_pcq_chss.csv.gz")

In [7]:
df_surveys_one = pd.read_csv(survey_part_one_path, compression='gzip')
df_surveys_two = pd.read_csv(survey_part_two_path, compression='gzip')

In [8]:
df_surveys = df_surveys_one.merge(df_surveys_two, on='participant_id', how='inner')
rands = [col for col in df_surveys.columns if 'rand' in col]
df_surveys_short = df_surveys[['audit', 'psqi', 'stai', 'pan_PosAffect', 'pan_NegAffect', 'chss_ChallengeStressors', 'chss_HindranceStressors'] + rands]
df_surveys_short.describe().T.to_latex(float_format="%.2f", buf='../descriptives/base_short.tex', position='h', caption='Descriptives of the unprocessed survey data. All the other descriptives can be found in the appendix.', label='desc:shortBase')
df_surveys.describe().T.to_latex(float_format="%.2f", buf='../descriptives/base_num.tex', position='h', caption='Descriptives of the unprocessed numerical survey data.', label='desc:longNumBase')
df_surveys.describe().T.to_latex(float_format="%.2f", buf='../descriptives/base_cat.tex', position='h', caption='Descriptives of the unprocessed categorical survey data.', label='desc:longCatBase')

  df_surveys_short.describe().T.to_latex(float_format="%.2f", buf='../descriptives/base_short.tex', position='h', caption='Descriptives of the unprocessed survey data. All the other descriptives can be found in the appendix.', label='desc:shortBase')
  df_surveys.describe().T.to_latex(float_format="%.2f", buf='../descriptives/base_num.tex', position='h', caption='Descriptives of the unprocessed numerical survey data.', label='desc:longNumBase')
  df_surveys.describe().T.to_latex(float_format="%.2f", buf='../descriptives/base_cat.tex', position='h', caption='Descriptives of the unprocessed categorical survey data.', label='desc:longCatBase')


In [9]:
daily_summary_folder_path = join(data_path, "fitbit", "daily-summary")

In [10]:
# Extracting every .gz file and adding a column with participant id for every participant's daily summary
data_frames = []
daily_summary_file_paths = listdir(daily_summary_folder_path)
for daily_summary_path in daily_summary_file_paths:
    if daily_summary_path[-2:] == 'gz':
        df = pd.read_csv(join(daily_summary_folder_path, daily_summary_path), compression='gzip', parse_dates=['Timestamp'])
        df['participant_id'] = daily_summary_path[:-7]
        data_frames.append(df) 
        
df_daily = pd.concat(data_frames)

In [11]:
df_daily_short = df_daily[['participant_id', 'RestingHeartRate', 'Cardio_minutes', 'Fat Burn_minutes', 'Out of Range_minutes', 'Peak_minutes', 'NumberSteps', 'SleepMinutesAsleep', 'Timestamp']]
with pd.option_context("max_colwidth", 1000):    
    df_daily_short.describe(include='object').T.to_latex(float_format="%.2f", buf='../descriptives/short_daily_cat.tex', position='h', caption='Descriptive statistics of the raw categorical daily summary data (concatenated).', label='desc:dailyShortCat')
    df_daily_short.describe().T.to_latex(float_format="%.2f", buf='../descriptives/short_daily_num.tex', position='h', caption='Descriptive statistics of the raw numerical daily summary data (concatenated). All the other raw descriptives can be examined in the appendix.', label='desc:dailyShortNum')
    df_daily.describe(include='object').T.to_latex(float_format="%.2f", buf='../descriptives/long_daily_cat.tex', position='h', 
                                                   caption='Descriptive statistics of all raw categorical daily summary data (concatenated).', label='desc:dailyLongCat')
    df_daily.describe().T.to_latex(float_format="%.2f", buf='../descriptives/long_daily_num.tex', position='h', 
                                         caption='Descriptive statistics of all raw numerical daily summary data (concatenated).', label='desc:dailyLongNum')

  df_daily_short.describe(include='object').T.to_latex(float_format="%.2f", buf='../descriptives/short_daily_cat.tex', position='h', caption='Descriptive statistics of the raw categorical daily summary data (concatenated).', label='desc:dailyShortCat')
  df_daily_short.describe().T.to_latex(float_format="%.2f", buf='../descriptives/short_daily_num.tex', position='h', caption='Descriptive statistics of the raw numerical daily summary data (concatenated). All the other raw descriptives can be examined in the appendix.', label='desc:dailyShortNum')
  df_daily.describe(include='object').T.to_latex(float_format="%.2f", buf='../descriptives/long_daily_cat.tex', position='h',
  df_daily.describe().T.to_latex(float_format="%.2f", buf='../descriptives/long_daily_num.tex', position='h',


In [12]:
ema_path = join(data_path, "surveys", "scored", "EMAs")
df_exercise = pd.read_csv(join(ema_path, 'ex.csv.gz'), compression="gzip")
df_work = pd.read_csv(join(ema_path, 'work.csv.gz'), compression="gzip")
df_sleep = pd.read_csv(join(ema_path, 'sleepd.csv.gz'), compression="gzip")
df_work['work_status'] = df_work['work_status'].astype('object')
ex_short = df_exercise[['ex_Total']].describe().T
sleep_short = df_sleep[['sleepd']].describe().T
work_short = df_work[['work_status']].describe(include='object').T
df_ema = pd.concat([ex_short, sleep_short, work_short])
df_ema = df_ema.fillna('-')
df_ema.T.to_latex(buf='../descriptives/short_ema.tex', position='h', caption='The descriptives for the EMA raw tables. The full descriptives for every table are in the appendix.', label='desc:emaShort')

df_work.describe(include='object').T.to_latex(buf='../descriptives/long_work_cat.tex', position='h', 
                              caption='Raw descriptives for categorical variables in the work EMA table.', label='desc:workLongCat')

df_exercise.describe().T.to_latex(buf='../descriptives/long_ex_num.tex', position='h', 
                              caption='Raw descriptives for numerical variables in the exercise EMA table.', label='desc:exLongNum')
df_exercise.describe(include='object').T.to_latex(buf='../descriptives/long_ex_cat.tex', position='h', 
                              caption='Raw descriptives for categorical variables in the exercise EMA table.', label='desc:exLongCat')

df_sleep.describe().T.to_latex(buf='../descriptives/long_sleep_num.tex', position='h', 
                              caption='Raw descriptives for numerical variables in the sleepd EMA table.', label='desc:sleepLongNum')
df_sleep.describe(include='object').T.to_latex(buf='../descriptives/long_sleep_cat.tex', position='h', 
                              caption='Raw descriptives for categorical variables in the sleepd EMA table.', label='desc:sleepLongCat')

  df_ema.T.to_latex(buf='../descriptives/short_ema.tex', position='h', caption='The descriptives for the EMA raw tables. The full descriptives for every table are in the appendix.', label='desc:emaShort')
  df_work.describe(include='object').T.to_latex(buf='../descriptives/long_work_cat.tex', position='h',
  df_exercise.describe().T.to_latex(buf='../descriptives/long_ex_num.tex', position='h',
  df_exercise.describe(include='object').T.to_latex(buf='../descriptives/long_ex_cat.tex', position='h',
  df_sleep.describe().T.to_latex(buf='../descriptives/long_sleep_num.tex', position='h',
  df_sleep.describe(include='object').T.to_latex(buf='../descriptives/long_sleep_cat.tex', position='h',


In [13]:
participant_path = join(data_path, "processed", "participant_data.csv.gz")
df_participants = pd.read_csv(participant_path, compression='gzip')

df_participants.set_index("participant_id", inplace=True)
daily_path = join(data_path, "processed", "daily_data.csv.gz")
df_daily = pd.read_csv(daily_path, compression='gzip')
df_daily.set_index('participant_id', inplace=True)

# Joining all the different dataframes together using participant ID as the key 
df = df_daily.merge(df_participants, how="left", on='participant_id', suffixes=(None, '_survey'))
df.reset_index(inplace=True)

In [14]:
df['WearTime'] = (df['Cardio_minutes'] + df['Fat Burn_minutes'] + df['Peak_minutes'] + df['Out of Range_minutes'])
df = df[df['WearTime'] > 720]
df_g = df.groupby('participant_id').mean()
df_g['Count'] = df.groupby('participant_id')['NumberSteps'].count()
df = df_g[df_g['Count'] > 5]
df_cluster = df.copy()

  df_g = df.groupby('participant_id').mean()


In [15]:
cluster_features = ['randPhysical', "swls", "psqi", "RestingHeartRate", 'audit', 'pan_PosAffect', 'stai', 'chss_ChallengeStressors', 'chss_HindranceStressors']
scaler = StandardScaler()
df_cluster[cluster_features] = scaler.fit_transform(df_cluster[cluster_features])

df_cluster[cluster_features].dropna().describe().T.to_latex(float_format="%.2f", buf='../descriptives/processed_cluster.tex', position='h', caption='The statistics for the processed cluster data.', label='desc:clusterProcess')

  df_cluster[cluster_features].dropna().describe().T.to_latex(float_format="%.2f", buf='../descriptives/processed_cluster.tex', position='h', caption='The statistics for the processed cluster data.', label='desc:clusterProcess')


In [16]:
processed = df_participants[['stai', 'psqi', 'gender', 'age', 'shift']].dropna()
processed.describe().T.to_latex(float_format="%.2f", buf='../descriptives/processed_sleepQualityAnxiety.tex', position='h', caption='The descriptive statistics of the cleaned data.', label='desc:processSleepAnxiety')

  processed.describe().T.to_latex(float_format="%.2f", buf='../descriptives/processed_sleepQualityAnxiety.tex', position='h', caption='The descriptive statistics of the cleaned data.', label='desc:processSleepAnxiety')


In [17]:
df_subset = df_daily[['sleepd', 'participant_id', 'NumberSteps', 'Fat Burn_minutes', 'Peak_minutes', 'Cardio_minutes', 'Out of Range_minutes', 'work_status']].reset_index().copy()
df_subset['Active_Minutes'] = df_subset['Cardio_minutes'] + df_subset['Peak_minutes'] + df_subset['Fat Burn_minutes'] 
df_subset['WearTime'] = df_subset['Active_Minutes'] + df_subset['Out of Range_minutes']
df_subset = df_subset[df_subset['WearTime'] >= 720]
df_test = df_subset.merge(df_participants, how='inner', on='participant_id', suffixes=(None, '_survey'))
df_test['work_status'] = df_test['work_status'] == 'yes'
df_test = df_test[['sleepd', 'participant_id', 'NumberSteps', 'Active_Minutes', 'shift', 'gender', 'age', 'hours', 'work_status']].dropna()
desc_table = df_test.describe(include=['object', 'number']).T[['count', 'unique', 'freq', 'mean', 'std', 'min', '25%', '50%', '75%', 'max']].fillna('-')
desc_table = desc_table.rename(columns={'freq': 'max_freq'})

desc_table.to_latex(buf='../descriptives/processed_SleepActivity.tex', position='h', caption='The descriptives of the processed sleep activity data.', label='desc:sleepActivity')

KeyError: "['participant_id'] not in index"

In [None]:
df_daily = pd.read_csv(daily_path, compression='gzip')
df_daily['Wear Time'] = df_daily['Cardio_minutes'] + df_daily['Fat Burn_minutes'] + df_daily['Out of Range_minutes'] + df_daily['Peak_minutes']
df_daily = df_daily[df_daily['Wear Time'] > 960]
full_frame = df_daily.merge(df_daily, on='participant_id', how='inner', suffixes=('_yesterday', '_today'))
full_frame = full_frame[(pd.to_datetime(full_frame['Timestamp_yesterday']) - pd.to_datetime(full_frame['Timestamp_today'])) == pd.Timedelta('-1 day')]

In [None]:
df_desc = full_frame.merge(df_participants, on='participant_id', how='inner', suffixes=(None, '_meta'))
df_desc = df_desc[['participant_id', 'RestingHeartRate_today', 'NumberSteps_yesterday', 'age', 'shift', 'gender']].dropna()
desc_table = df_desc.describe(include=['object', 'number']).T[['count', 'unique', 'freq', 'mean', 'std', 'min', '25%', '50%', '75%', 'max']].fillna('-')
desc_table = desc_table.rename(columns={'freq': 'max_freq'})
desc_table.to_latex(buf='../descriptives/stepsHeartRate.tex', caption='The processed descriptives of the number steps v. heart rate hypothesis.', label='desc:restSteps')

In [None]:
df_daily = pd.read_csv(daily_path, compression='gzip')
df_daily['Wear Time'] = df_daily['Cardio_minutes'] + df_daily['Fat Burn_minutes'] + df_daily['Out of Range_minutes'] + df_daily['Peak_minutes']
df_daily = df_daily[df_daily['Wear Time'] > 240]
df_test = df_daily.merge(df_participants, on='participant_id', how='inner', suffixes=(None, '_survey'))
df_test = df_test[['participant_id', 'ex_Total','age', 'gender', 'shift', 'RestingHeartRate']].copy()
df_test.dropna(inplace=True)
desc_table = df_test.describe(include=['object', 'number']).T[['count', 'unique', 'freq', 'mean', 'std', 'min', '25%', '50%', '75%', 'max']].fillna('-')
desc_table = desc_table.rename(columns={'freq': 'max_freq'})
desc_table.to_latex(buf='../descriptives/activityHeartRate.tex', caption='The processed descriptives for the activity and heart rate hypothesis', label='desc:activityRest')