In [1]:
import json
import os
import ast
import csv
import io
from io import StringIO, BytesIO, TextIOWrapper
import gzip
from datetime import datetime, date
from s3_utils import *
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt
import ast
from datetime import timedelta
from tqdm import tqdm
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score, precision_score
from xgboost.sklearn import XGBClassifier, XGBRegressor
from sklearn.model_selection import cross_val_score, ShuffleSplit, KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
import warnings
import sys
import time
from utils import *
# Suppress every warning category for the rest of the session.
# NOTE(review): this also hides deprecation/future warnings raised by the
# libraries used below — consider narrowing the filter while developing.
warnings.filterwarnings("ignore")

Matplotlib created a temporary config/cache directory at /tmp/matplotlib-0vcwuflv because the default path (/home/ubuntu/.config/matplotlib) is not a writable directory; it is highly recommended to set the MPLCONFIGDIR environment variable to a writable directory, in particular to speed up the import of Matplotlib and to better support multiprocessing.


In [2]:
# ---------------------------------------------------------------------------
# Load the BUMP wave-4 tables from S3
# ---------------------------------------------------------------------------
# Bucket per project:
#   '4youandme-study-data'    -> SinC project
#   'fouryouandme-study-data' -> 4YouandMe, Bump, CamCog or Bodyport project
bucket = 'fouryouandme-study-data'

# List S3 keys.
#   prefix = {study_name} or {study_name}/{source}
#   (use prefix='bump/' for all BUMP data; remove the prefix to view non-BUMP data)
#   sources: app_activities, bodyport, camcog, garmin, oura, redcap, rescuetime
#   note: camcog is not accessible to bodyport (and vice versa)
# NOTE(review): the return value is discarded here, so nothing is shown unless
# the helper prints by itself — confirm whether this call is still needed.
get_matching_s3_keys(bucket, prefix='bump/oura')


def load_s3_table(key, date_column=None):
    """Read one gzip-compressed CSV from ``bucket`` into a DataFrame.

    Parameters
    ----------
    key : str
        S3 object key of the ``.csv.gz`` file.
    date_column : str, optional
        Name of a column holding timestamps. When given, a ``date`` column
        with the calendar date (``datetime.date``) of each value is added.

    Returns
    -------
    The frame returned by ``pandas_from_csv_s3``, with the optional ``date``
    column appended.
    """
    frame = pandas_from_csv_s3(bucket, key=key, compression='gzip')
    if date_column is not None:
        frame['date'] = pd.to_datetime(frame[date_column]).dt.date
    return frame


# Study ids. Some dataframes use 'record_id' instead of 'user_id'; they have to
# be matched through df_studyID, where 'evidation_id' is the 'user_id'.
# NOTE: very few examples of this. Birthing data is the important one.
df_studyID = load_s3_table('bump/redcap/wave_4/study_ids.csv.gz')

# Birthing data
df_birth = load_s3_table(
    'bump/redcap/wave_4/birthing_data_cohort_2_only.csv.gz',
    date_column='birth_date',
)
df_birth = pd.merge(df_birth, df_studyID, on='record_id')
df_birth['user_id'] = df_birth.evidation_id

# There is a missing value in the birthing data; the affected row is removed.
# NOTE(review): this drops the row by its hard-coded index label (50) in the
# merged frame and raises KeyError if that label does not exist. If the input
# file changes, a different row may be dropped — consider selecting the row
# by the missing column instead.
df_birth = df_birth.drop(index=50)

# Bodyport wave 4
df_bodyport = load_s3_table('bump/bodyport/wave_4/bodyport.csv.gz', date_column='event_date')

# Oura wave 4
df_sleep = load_s3_table('bump/oura/wave_4/oura_sleep.csv.gz', date_column='event_date')
df_activity = load_s3_table('bump/oura/wave_4/oura_activity.csv.gz', date_column='event_date')
df_readiness = load_s3_table('bump/oura/wave_4/oura_readiness.csv.gz', date_column='event_date')

# Surveys wave 4 (the survey date is derived from 'updated_at', not 'event_date')
df_survey = load_s3_table('bump/app_activities/wave_4/surveys.csv.gz', date_column='updated_at')
df_sam = load_s3_table('bump/app_activities/wave_4/quick_activities.csv.gz', date_column='event_date')

dfs = [df_sleep, df_bodyport, df_birth, df_activity, df_readiness, df_survey, df_sam]

In [4]:
# Extract the fatigue-related survey questions.
# (presumably get_survey_question keeps the rows of df_survey whose question
#  matches the keyword — confirm in utils)
survey_question_str = 'fatigue'
df_fatigue = get_survey_question(df_survey, survey_question_str)

# Missing values per column. DataFrame.sum() always reduces column-wise,
# whereas np.sum(df) depends on the deprecated axis=None behaviour of
# DataFrame reductions.
df_fatigue.isna().sum()


id               0
user_id          0
survey_id        0
title            0
event_date       0
from             0
to               0
created_at       0
updated_at       0
question_id      0
question_text    0
answer_text      0
date             0
dtype: int64

In [3]:
# Requires df_fatigue from the extraction cell; the recorded NameError comes
# from running this cell before that one.

# Fatigue answers of a single participant. Only the last expression of a cell
# is rendered automatically, so this one is displayed explicitly
# (`display` is provided by the IPython kernel).
display(df_fatigue[df_fatigue.user_id == 2019])

# Rows recorded on the earliest date present in the fatigue data.
df_fatigue[df_fatigue.date == df_fatigue.date.min()]

NameError: name 'df_fatigue' is not defined

In [5]:
# Birth dates of two participants (users 290 and 1429) bound the window.
a = df_birth.loc[df_birth.user_id == 290, 'birth_date'].values[0]
b = df_birth.loc[df_birth.user_id == 1429, 'birth_date'].values[0]

# One entry per day from `a` to `b`, both ends included.
date_range = pd.date_range(start=a, end=b, freq='d')

# Single-column frame holding the days as plain date objects; rendered as the
# cell output.
date_df = pd.DataFrame().assign(date=date_range.date)
date_df

Unnamed: 0,date
0,2022-01-05
1,2022-01-06
2,2022-01-07
3,2022-01-08
4,2022-01-09
...,...
145,2022-05-30
146,2022-05-31
147,2022-06-01
148,2022-06-02


In [None]:
# Collect the column names of every loaded frame, in the order the frames
# appear in `dfs`; names shared by several frames appear once per frame.
names = []
for df in dfs:
    names.extend(df.columns.to_list())
    


In [None]:
# Bodyport columns used as features. None are enabled at the moment.
# Disabled candidates, kept for reference:
#   heart_rate, breath_average, peripheral_fluid,
#   total_body_water_percent, weight_kg
bodyport_features = []

# Oura sleep columns used as features.
# Disabled candidates, kept for reference:
#   hr_lowest, score_deep, temperature_deviation, temperature_trend_deviation,
#   temperature_delta, duration, rem, efficiency, score_alignment, score_rem,
#   light, onset_latency, restless, score_disturbances, score_efficiency,
#   score_latency, score_total
oura_features = ['hr_average', 'rmssd', 'breath_average', 'score']

# Bodyport features first, then Oura features.
feature_names = [*bodyport_features, *oura_features]

# Non-feature columns: the "d" and "date" columns plus the "_x"/"_y" suffixed
# id / timestamp columns.
date_list = [
    "d",
    "id_x", "user_id_x", "identity_id_x",
    "created_at_x", "updated_at_x", "retrieved_at_x",
    "subsource_x", "event_date_x",
    "date",
    "id_y", "user_id_y", "identity_id_y",
    "created_at_y", "updated_at_y", "retrieved_at_y",
    "subsource_y", "creation_date", "event_date_y",
]

In [None]:
# Build one daily-resolution frame per participant:
#   1. a continuous calendar from the participant's first to last fatigue survey,
#   2. left-joined with the fatigue answers (days without a survey stay NaN),
#   3. left-joined with the selected Oura sleep features,
#   4. annotated with the participant's birth date when one is known.
#
# `time` and the `utils` helpers are already imported in the notebook's first
# cell, so they are not re-imported here.
#
# NOTE(review): the recorded output of close_users[0] shows both `user_id_x`
# and `user_id_y` columns in the merged frames. They are left untouched here
# because the exported CSVs may already be consumed with those names.
close_users = []     # merged frame of every retained participant
close_users_id = []  # matching user ids, in the same order as close_users

fatigue_columns = ['answer_text', 'date', 'user_id']
sleep_features = oura_features + ['date', 'user_id']

# Loop-invariant: the participants that have at least one sleep row.
users_with_sleep = set(df_sleep.user_id.unique())

all_survey_user_id = df_fatigue.user_id.unique()
for user_id in tqdm(all_survey_user_id):
    if user_id not in users_with_sleep:
        print("No row in df_sleep, continue")
        continue

    # Filter the fatigue answers of this participant once and reuse the result.
    selected_fatigue_df = df_fatigue.loc[df_fatigue.user_id == user_id, fatigue_columns]

    # Continuous daily calendar between the first and the last survey date.
    min_date = selected_fatigue_df.date.min()
    max_date = selected_fatigue_df.date.max()
    date_range = pd.date_range(min_date, max_date, freq='D')
    date_range_df = pd.DataFrame({'date': date_range.date})
    date_range_df['user_id'] = user_id

    # Keep every calendar day; the join keys are spelled out instead of
    # relying on the implicit "all shared columns" default.
    add_date_fatigue_df = pd.merge(
        date_range_df, selected_fatigue_df, how='left', on=['date', 'user_id']
    )

    merged_df = merge_two_df_by_userid(
        user_id, add_date_fatigue_df, df_sleep[sleep_features], how='left'
    )
    merged_df = merged_df.sort_values(by=['date'], ascending=True)

    # Same birth date on every row, or NaN when the participant has no birth record.
    user_birth_dates = df_birth.loc[df_birth.user_id == user_id, 'birth_date']
    merged_df['birth_date'] = user_birth_dates.values[0] if len(user_birth_dates) else np.nan

    close_users.append(merged_df)
    close_users_id.append(user_id)

# Number of participants retained.
len(close_users_id)

In [9]:
# Inspect the merged daily frame of the first retained participant.
close_users[0]
#df_fatigue[[f for f in ['answer_text', 'date', 'user_id']]]


Unnamed: 0,date,user_id_x,answer_text,hr_average,rmssd,breath_average,score,user_id_y,birth_date
0,2021-10-28,1037,4,,,,,,2022-04-14
1,2021-10-29,1037,,,,,,,2022-04-14
2,2021-10-30,1037,,,,,,,2022-04-14
3,2021-10-31,1037,,,,,,,2022-04-14
4,2021-11-01,1037,3,58.25,75.0,18.125,84.0,1037.0,2022-04-14
...,...,...,...,...,...,...,...,...,...
189,2022-05-03,1037,,50.33,86.0,16.625,61.0,1037.0,2022-04-14
190,2022-05-04,1037,,50.12,67.0,16.375,73.0,1037.0,2022-04-14
191,2022-05-05,1037,,51.96,86.0,16.875,80.0,1037.0,2022-04-14
192,2022-05-06,1037,,52.22,99.0,16.250,66.0,1037.0,2022-04-14


In [None]:

# Sanity check for user 1037: number of distinct dates after merging the
# fatigue answers with the sleep heart-rate column.
len(
    merge_two_df_by_userid(
        1037,
        df_fatigue[['answer_text', 'date', 'user_id']],
        df_sleep[['hr_average', 'date', 'user_id']],
    )
    .date
    .unique()
)


In [None]:
# Sanity check for user 1037: number of distinct dates with a fatigue answer.
len(df_fatigue.loc[df_fatigue.user_id == 1037, 'date'].unique())

In [None]:
# Sanity check for user 1037: number of distinct dates with a sleep record.
len(df_sleep.loc[df_sleep.user_id == 1037, 'date'].unique())

In [10]:
# Store one CSV per participant in a local directory, for use from R.
# (`os` is already imported in the notebook's first cell.)
stored_path = os.path.join('.', 'all_date_combined_df_sleep_' + survey_question_str)

# Create the directory if needed; unlike exists() + mkdir(), this does not
# fail when the directory appears between the check and the creation.
os.makedirs(stored_path, exist_ok=True)

for df, uid in zip(close_users, close_users_id):
    df.to_csv(os.path.join(stored_path, 'user_id_' + str(uid) + '.csv'))

In [11]:
# Print each retained user id next to the row count of its merged frame.
for uid, df in zip(close_users_id, close_users):
    n_rows = len(df)
    print(uid, n_rows)

1037 194
1386 231
1391 219
1703 174
1005 231
1759 139
1712 165
1429 193
1032 276
1038 257
603 277
79 311
1988 137
1966 117
1995 136
1991 19
1747 176
1724 128
989 191
2015 141
1436 220
604 279
991 188
1373 258
1427 179
1378 245
1745 34
1002 133
1435 191
1441 223
1708 169
1044 208
1439 238
1706 187
992 217
1453 182
1366 184
966 218
605 164
1751 72
429 261
975 228
1425 197
1035 224
615 195
1755 146
1452 170
1731 172
1707 102
1976 136
1700 180
1374 125
1369 182
2083 107
1996 134
1989 148
2067 104
1715 175
2001 131
2014 107
2068 113
1400 215
173 411
404 233
1709 162
1992 127
2032 124
1696 190
174 240
225 245
289 256
1444 152
1701 176
1047 196
1999 123
1658 194
2018 131
1442 92
1716 132
1389 154
2006 115
118 190
977 161
406 165
410 339
978 177
976 55
1041 149
192 310
581 327
972 198
987 185
1423 177
409 177
1004 61
997 95
94 219
405 310
1014 76
159 212
750 213
428 183
1443 63
1001 264
127 233
68 287
407 277
984 87
980 166
39 280
185 310
1351 1
122 404
1363 70
595 140
734 173
2062 87
1757 155

In [None]:
# Stationarity test (TODO: placeholder — nothing implemented in this cell yet)


In [None]:
# def get_user(user_id, start=None, end=None):
#     user_sleep = df_sleep[df_sleep.user_id == user_id]#.dropna()
#     user_bp = df_bodyport[df_bodyport.user_id == user_id]#.dropna()
    
#     df2 = pd.merge(user_sleep, user_bp, on="date")

#     if "creation_date" in df2.columns:
#         for i in range(len(df2)):
#             df2["creation_date"][i] = dt.datetime.strptime(df2["creation_date"][i], '%Y-%m-%d %H:%M:%S')
#     df2.set_index(df2["date"], inplace=True)
#     df2.sort_index(inplace=True)
    
#     if start and end:
#         mask = (df2['date'] > np.datetime64(start)) & (df2['date'] <= np.datetime64(end))
#         # mask = pd.to_datetime(df2["date"]).between(start.astype(str)[0], end.astype(str)[0], inclusive=True)
#         df2 = df2[mask]
#     return df2

In [11]:
# #merge all data features
# close_users = []
# close_users_id = []
# start_close_days = 90
# for user_id in tqdm(df_birth[['birth_scheduled', 'birth_date', 'user_id']].dropna().user_id.unique()):
#     birth = df_birth.loc[df_birth.user_id == user_id]
#     birthdate = birth.reset_index()['date'][0]
#     schedule_birth = birth['birth_scheduled'].to_list()[0] #schedule_birth == 1: induced deliveries; schedule_birth == 2: non induced
#     if len(birth) > 0 and pd.isnull(birthdate) == False and schedule_birth == 2:
#         start = birthdate - pd.to_timedelta(start_close_days, unit='d') # pd.to_timedelta(100, unit='d')
#         end = birthdate + pd.to_timedelta(5, unit='d')
#         date_range = pd.date_range(start, end, freq='d')
#         df = get_user(user_id, start, birthdate)
#         dr = pd.DataFrame()
#         dr["d"] = date_range.date
#         dr.set_index(dr.d, inplace=True)
#         df = dr.join(df)
#         df = df[feature_names + date_list]
#         if df.isna().sum().max() < round(start_close_days) / 2: # max null counts in each col is less than 5
#             df.set_index(df.d - df.d.min(), inplace=True)
#             df = df.resample("D").mean()
#             df.set_index(df.index.days, inplace=True)
#             # df = df.interpolate("linear", 0) #linear interpolation on data
#             df = df.fillna(method='ffill')
#             df = df.fillna(df.mean(0))
#             close_users.append(df)
#             close_users_id.append(user_id)
            

1%|          | 1/138 [00:00<00:00, 251.96it/s]


NameError: name 'get_user' is not defined

In [14]:
# Number of user ids collected in close_users_id.
# NOTE(review): the recorded output (27) does not match the 118 ids printed by
# the row-count listing above, so it looks like it came from an earlier run —
# re-run after a kernel restart to confirm.
len(close_users_id)

27

In [14]:
# def exploreDataBirthBA(df, col, user_id):
#     plt.rcParams.update({'figure.max_open_warning': 0})
#     sns.set_theme(style='darkgrid')
#     pdf = df.loc[df.user_id == user_id]    
#     plt.figure(figsize=(12,4))
#     sns.scatterplot(data=pdf, x='date', y=col, ci=None, color='purple')
#     sns.lineplot(data=pdf, x='date', y=col, ci=None)

#     # Plot birthing data if it exists for that user
#     if (len(df_birth.loc[df_birth.user_id == user_id]) != 0):
#         birth = df_birth.loc[df_birth.user_id == user_id].reset_index()
#         plt.axvline(x=birth.date, color = 'y', ls='--')
#         ymin, ymax = plt.gca().get_ylim()
#         xmin, xmax = plt.gca().get_xlim()
#         plt.text(birth.date, ymax, birth['date'][0], fontsize=12, color='y')
        
#         # Dataframe of data before birth
#         after = pdf[~(pdf['date'] < birth.date[0])]
#         before = pdf[~(pdf['date'] > birth.date[0])]
#         before_avg = before[col].mean()
#         after_avg = after[col].mean()
# #         print('Pre-birth Average: ', before_avg)
# #         print('Post-birth Average: ', after_avg)
#         plt.hlines(y=before_avg, xmin=xmin, xmax=birth.date, color='blue', linestyles='dashdot')
#         plt.hlines(y=after_avg, xmin=birth.date, xmax=xmax, color='red', linestyles='dashdot')
# #         sns.lineplot(data=before, x='date', y=col, ci=None, color='r')
# #         sns.lineplot(data=after, x='date', y=col, ci=None, color='r')

#     plt.xlabel(''); plt.ylabel(col)
#     plt.title('User ID: ' + str(user_id))
#     plt.show()
# exploreDataBirthBA(df_sleep, 'hr_average', 30)

AttributeError: 'PathCollection' object has no property 'ci'