## Important library install and connection

In [None]:
!pip install synapseclient



In [None]:
import pandas as pd
import synapseclient

# Create the Synapse client and log in with the auth token stored on the
# first line of token.txt.
syn = synapseclient.Synapse()
with open('token.txt', 'r') as token_file:
  token = token_file.readline().strip()
syn.login(authToken=token)

Welcome, Humphrey Kanyoke!



INFO:synapseclient_default:Welcome, Humphrey Kanyoke!



## Datasets with their access codes
List of tables available in the database. Tables that we use are marked **(o)**, tables that we do not use are marked **(x)**, and tables that might be needed for discussion or for extensions are marked **(~)**.

|Table name         | Access code  | Note | Time Series | # Patients| Avg # entries per patient| Median  # entries per patient |
|-------------------|--------------|------|-------------|-----------|-------|-------|
|Day One Survey **(x)**      |syn16782072   |
|PAR-Q Survey **(x)**       | syn16782071  |
|Daily Check Survey **(o)** |syn16782070   | | Y | 17622 | 7.74 | 4.0 |
|Activity and Sleep Survey **(~)** | syn16782069| for discussion purpose | ? | 23232 | 1.07 | 1.0 |
|Risk Factor Survey **(o)** |syn16782068   | | ? | 13851 | 1.03 | 1.0 |
|Cardio Diet Survey **(x)** | syn16782067  |
|Satisfied Survey   **(x)**| syn16782066 |
|**APH Heart Age Survey (o)**|**syn16782065** | **Contains BP data** | Y | 4759 | 2.26 | 1.0 |
|Six Minute Walk Activity **(o)**| syn16782064 | | Y | 3441 | 1.98 | 1.0 |
|Demographics Survey **(o)** | syn16782063 | Contains wakeup, sleep time| Y | 7565 | 1.64 | 1.0 |
|HealthKit Data **(o)**| syn16782062 | Distance walked/run data from smart device | Y | 4920 | 23.77 | 5.0 |
|HealthKit Sleep **(o)**| syn16782061 | Sleep data from smart device| Y | 626 | 4.22 | 2.0 |
|HealthKit Workout **(o)** | syn16782060 | Workout data (mostly walking) from smart device| Y | 881 | 3.71 | 1.0 |
|Motion Tracker **(x)**| syn16782059 | It's a tmp file |
|Six Minute Walk - Displacement Vectors **(x)**| syn16782058| Unknown file type |


## Data Access and pre-processing


In [None]:
# Fetch every row of table syn16782061 (listed above as "HealthKit Sleep")
# and show it as a DataFrame.
query = syn.tableQuery("SELECT * FROM  	syn16782061")
raw_df = query.asDataFrame()
print(raw_df)

# Each row references a per-record CSV through the 'data.csv' column;
# download those files for the rows returned by the query.
cols_to_download = ['data.csv']
downloaded_files = syn.downloadTableColumns(query, cols_to_download)

                                    recordId               appVersion  \
0_0     45b15d05-e35c-4d20-8372-8bd06953036b   version 1.0.9, build 9   
1_0     77ac3da1-ba61-451f-9589-a52d342158f2   version 1.0.9, build 9   
2_0     12d38b68-b651-43d4-8d9e-d68eb592c600   version 1.0.9, build 9   
3_0     712574b2-a11d-4d62-a2ea-f29a77e9b109   version 1.0.9, build 9   
4_0     4f626368-696a-446a-aff5-5bfa6f1b9f43   version 1.0.9, build 9   
...                                      ...                      ...   
2639_0  85412e5c-d59f-43ba-9bd7-1a57974e0d7c  version 1.0.10, build 1   
2640_0  fb6d5041-ba29-49a9-b4a2-c06e6b9d8c16  version 1.0.10, build 1   
2641_0  bea4c938-d962-4086-97a1-bc69e139f624  version 1.0.10, build 1   
2642_0  d92cf7bb-be17-4024-a47b-ee146841a244  version 1.0.10, build 1   
2643_0  139146be-afaa-45d2-83d1-44cc941e23ad  version 1.0.10, build 1   

              phoneInfo                            healthCode      createdOn  \
0_0       iPhone 6 Plus  383e1eee-cf63-4c4c

INFO:synapseclient_default:Downloading 0 files, 2644 cached locally


**Clean Sleep data**

In [None]:
import pandas as pd

def clean_sleep_record(raw_df, index):
  """Summarise one HealthKit sleep record into one row per calendar day.

  Args:
    raw_df: DataFrame of the sleep table; must have a 'data.csv' column
      holding the file handle ID of each record's CSV.
    index: Positional row index into ``raw_df``.

  Returns:
    A DataFrame with columns ``date``, ``sleep_minutes`` (sum of asleep
    time), ``awake_count`` (number of asleep segments that day) and
    ``bed_time`` (start time of the longest asleep segment), or ``None``
    when the record has no usable asleep rows or cannot be parsed.
  """
  # Get the file handle ID of the CSV file
  file_handle_id = raw_df.iloc[index]['data.csv']

  # Get the path of the file in the cache
  file_path = syn.cache.get(file_handle_id)

  try:
    df = pd.read_csv(file_path)
    # .copy() so the column assignments below write to an independent frame
    # rather than a view of the filtered slice.
    df = df[df['category.value'] == 'HKCategoryValueSleepAnalysisAsleep'].copy()
    if df.empty:
      return None

    df['startTime'] = pd.to_datetime(df['startTime'])
    df['value'] = df['value'] / 60    # Convert from seconds to minutes
    df['date'] = df['startTime'].dt.date
    df['bed_time'] = df['startTime'].dt.time
    df = df.drop_duplicates(subset=['startTime', 'value'])    # Remove duplicate sleep entries

    # Per day: total minutes asleep, number of asleep segments, and the start
    # time of the longest segment (looked up through the row label of the max).
    clean_df = df.groupby('date').agg(
        {'value': ['sum', 'count', lambda x: df.loc[x.idxmax(), 'bed_time']]}
    ).reset_index()
    clean_df.columns = ['date', 'sleep_minutes', 'awake_count', 'bed_time']

    return clean_df

  except Exception as exc:
    # Best-effort: a malformed record is reported and skipped (the caller
    # checks for None) instead of aborting the whole run.
    print(f'Bad date entry (record {index}): {exc}')
    return None

**Create Master sleep dataframe**

In [None]:
from tqdm import tqdm

# Column order of the master sleep table: one row per (healthCode, date)
sleep_columns = ['healthCode', 'date', 'sleep_minutes', 'awake_count', 'bed_time']

# Clean every record, tag its per-day rows with the record's healthCode and
# collect the pieces; concatenating once at the end avoids re-copying the
# growing table on every iteration.
sleep_frames = []
for i in tqdm(range(raw_df.shape[0])):
  sleep_entries = clean_sleep_record(raw_df, i)
  if sleep_entries is not None:
    sleep_entries['healthCode'] = raw_df.iloc[i]['healthCode']
    sleep_frames.append(sleep_entries)

if sleep_frames:
  sleep = pd.concat(sleep_frames, ignore_index=True)[sleep_columns]
else:
  sleep = pd.DataFrame(columns=sleep_columns)

# Adds waking minutes (as a column; `.loc[...]` would have written a row)
# and removes entries with 18 hours of sleep or more
sleep['awake_minutes'] = (24 * 60) - sleep['sleep_minutes']
sleep = sleep[sleep['sleep_minutes'] < (18 * 60)]
sleep.to_csv('sleep.csv', index=False)

100%|██████████| 2644/2644 [00:23<00:00, 114.74it/s]


In [None]:
# Quick look at the assembled master sleep table.
print(sleep)

**Merge BP and Sleep Data**

In [None]:
# Load table syn16782065 (the APH Heart Age Survey, which contains BP data).
query = syn.tableQuery("SELECT * FROM   syn16782065")
bp = query.asDataFrame()

# createdOn is divided by 1000 and parsed as seconds; the calendar date of
# that timestamp becomes the join key.
bp['createdOn'] = pd.to_datetime(bp['createdOn'] / 1000, unit='s')
bp['date'] = bp['createdOn'].dt.date

# Left merge: every BP row is kept, sleep columns are filled where the same
# patient has a sleep entry on the same date.
bp = bp.merge(sleep, how='left', on=['healthCode', 'date'])

In [None]:
# Show only the BP rows that found a same-day sleep entry in the left merge.
# The merged sleep column is 'sleep_minutes'; no 'sleep' column is created
# by the cells above.
print(bp[bp['sleep_minutes'].notna()])

                                   recordId  \
390    e00755a4-6dd0-4090-96a1-1e67d71eb218   
436    659a5771-b9ab-4f15-86d9-ce784e7403b5   
437    659a5771-b9ab-4f15-86d9-ce784e7403b5   
1026   3e868459-8c89-4b8f-a0b8-f490427924a6   
1188   e6def815-422e-4946-9682-18c7ebc67487   
...                                     ...   
10646  7eb1f1aa-d2b0-4a94-be5e-64cceeb9dfe4   
10648  498d8f4e-abbe-4e71-9182-58dab53f40a9   
10656  25a30d51-0f2e-405c-9993-04e7ef368639   
10720  6326eca7-3555-414c-a90b-e15d32b7a315   
10725  64e5fc2d-7372-4372-8291-3b7ed8bf99bc   

                                 healthCode           createdOn  \
390    601b719e-5310-44be-972a-170265931579 2015-03-20 17:30:17   
436    9fcb1379-a2f3-4a56-87f1-ff709cf39edb 2015-03-20 16:09:29   
437    9fcb1379-a2f3-4a56-87f1-ff709cf39edb 2015-03-20 16:09:29   
1026   601b719e-5310-44be-972a-170265931579 2015-03-19 14:31:17   
1188   eabc5b1b-1b42-4abb-808f-22f79046e886 2015-03-20 01:04:05   
...                              

### **Data Download and Pre-processing of HealthKit Data (Activity Data)**


**Setting up Config for download**

In [None]:
import synapseclient
# Fresh Synapse client whose cache root is redirected to a dedicated folder
# before logging in with the token from token.txt.
syn = synapseclient.Synapse()
syn.cache.cache_root_dir = '/content/activity_data'    # Change cache path
with open('token.txt', 'r') as token_file: # fran_token.txt would work just fine
  token = token_file.readline().strip()
syn.login(authToken=token)


Welcome, Humphrey Kanyoke!



INFO:synapseclient_default:Welcome, Humphrey Kanyoke!



**Downloading all individual .csv files**\
*!! Download of 3.62 GBs of data, may take 90 minutes or more*

In [None]:
## Querying the HealthKit Data table

# Patients of interest: every healthCode that has a row in syn16782065
# (the Heart Age survey table used for BP above).
query0 = syn.tableQuery("select healthCode from syn16782065")
patients_list_df = query0.asDataFrame()
# The survey table has one row per submission, so the same healthCode repeats;
# de-duplicate (and drop missing codes) to keep the IN (...) list short.
patients_list = patients_list_df['healthCode'].dropna().astype(str).unique().tolist()
patients = ','.join(f"'{x}'" for x in patients_list)

# Restrict the HealthKit Data table (syn16782062) to those patients
query1 = "select * from syn16782062 where healthCode in (" + patients + ")"
query1 = syn.tableQuery(query1)
raw_df_dat = query1.asDataFrame()

# Specify the columns to download
cols_to_download = ['data.csv']

# Download the CSV file
syn.downloadTableColumns(query1, cols_to_download)

In [None]:

# Disabled helper code kept for reference. The literal is a raw string so the
# Windows path inside it does not produce invalid-escape warnings.
r''' ************** NO NEED TO RUN THIS CODE NOW (but it can be used if needed for other parts of database) **********************
#This section of code should be run only in order to get the list of activities.
#List of activities are fetched and stored in predictor_variables variable already

def getAllActivities(raw_df_dat):
    activities_list = []
    for i in range(0,raw_df_dat.shape[0]):
        fhi = raw_df_dat.iloc[i]['data.csv']
        fp = syn.cache.get(fhi)

        hc_activity = pd.read_csv(fp)
        activities_list.extend(list(set(hc_activity['type'])))
        activities_list = list(set(activities_list))
        print("Scanning", i, " of ", raw_df_dat.shape[0], " completed")

    print("All activities = ", activities_list)
    return activities_list
 #df_act = pd.DataFrame()

all_activities = getAllActivities(raw_df_dat)

def clean_all_activities(all_activities):
    clean_activities = []
    for item in all_activities:
        #print(item, ", ", item.class)
        if(isinstance(item, str)):
            if(item[0:2] == 'HK'):
                clean_activities.append(item)

    with open(r'C:\Research\Hamphrey\all_activities.txt', 'w') as the_file:
        for elem in clean_activities:
            print(elem, file=the_file)
    return clean_activities

final_activities_list = clean_all_activities(all_activities)
print(final_activities_list)
'''

**Creating Master activities dataframe**

In [None]:
# HealthKit quantity types whose samples are added up per day.
activities_for_sum = [
    'HKQuantityTypeIdentifierFlightsClimbed',
    'HKQuantityTypeIdentifierDistanceWalkingRunning',
    'HKQuantityTypeIdentifierStepCount',
    'HKQuantityTypeIdentifierDistanceCycling',
    'HKQuantityTypeIdentifierActiveEnergyBurned',
]

# HealthKit quantity types whose samples are averaged per day.
activities_for_avg = [
    'HKQuantityTypeIdentifierHeartRate',
    'HKQuantityTypeIdentifierBloodPressureDiastolic',
    'HKQuantityTypeIdentifierBloodPressureSystolic',
]

# Column layout of the per-day activity table: the two key columns followed
# by one column per quantity type.
cols_act_df = [
    'healthCode',
    'date',
    'HKQuantityTypeIdentifierFlightsClimbed',
    'HKQuantityTypeIdentifierHeartRate',
    'HKQuantityTypeIdentifierDistanceWalkingRunning',
    'HKQuantityTypeIdentifierBloodPressureDiastolic',
    'HKQuantityTypeIdentifierBloodPressureSystolic',
    'HKQuantityTypeIdentifierStepCount',
    'HKQuantityTypeIdentifierDistanceCycling',
    'HKQuantityTypeIdentifierActiveEnergyBurned',
]

# recordId of every HealthKit record that could not be read or parsed
bad_records = []
def clean_data_records(raw_df_dat, index):
    """Turn one HealthKit data record into one row per calendar day.

    Quantity types listed in ``activities_for_sum`` are summed per day and
    those in ``activities_for_avg`` are averaged per day; each type becomes a
    column, and types with no sample on a given day are filled with 0.

    Args:
        raw_df_dat: DataFrame of the HealthKit data table; needs the columns
            'data.csv', 'healthCode' and 'recordId'.
        index: Positional row index into ``raw_df_dat``.

    Returns:
        A DataFrame with the columns of ``cols_act_df`` and one row per date
        found in the record (empty if the record has no relevant samples), or
        ``None`` when the record cannot be processed; in that case its
        recordId is appended to ``bad_records``.
    """
    record = raw_df_dat.iloc[index]
    file_handle_id = record['data.csv']
    # Earlier variant: file_path = syn.cache.get(file_handle_id)
    # NOTE(review): file_handle_dict is not defined in this cell; presumably it
    # is the {file handle ID: local path} mapping returned by
    # syn.downloadTableColumns -- confirm it exists and that its key type
    # matches the values of the 'data.csv' column.
    file_path = file_handle_dict.get(file_handle_id)

    try:
        df_dat = pd.read_csv(file_path)
        df_dat['startTime'] = pd.to_datetime(df_dat['startTime'])
        df_dat['date'] = df_dat['startTime'].dt.date

        df_dat_for_sum = df_dat[df_dat['type'].isin(activities_for_sum)]
        df_dat_for_avg = df_dat[df_dat['type'].isin(activities_for_avg)]

        df_dat_for_sum = df_dat_for_sum.groupby(['date', 'type'])['value'].sum().reset_index()
        df_dat_for_avg = df_dat_for_avg.groupby(['date', 'type'])['value'].mean().reset_index()

        df_grpby_date_type = pd.concat([df_dat_for_sum, df_dat_for_avg])

        # Pivot the (date, type, value) rows into one row per date with one
        # column per type. All dates are collected before returning; returning
        # from inside the loop kept only a single date per record.
        rows = []
        for day, day_rows in df_grpby_date_type.groupby('date'):
            row = {'healthCode': record['healthCode'], 'date': day}
            row.update(zip(day_rows['type'], day_rows['value']))
            rows.append(row)

        return pd.DataFrame(rows, columns=cols_act_df).fillna(0)

    except Exception as exc:
        # Best-effort: remember the failing record and keep going. The broad
        # catch covers missing files as well as malformed CSV content.
        print(f'record {index} could not be processed: {exc}')
        bad_records.append(record['recordId'])
        return None

# Clean every HealthKit data record and collect its per-day rows; the pieces
# are concatenated once at the end instead of re-copying a growing DataFrame
# on every iteration.
activity_frames = []
for i in range(raw_df_dat.shape[0]):
#for i in range(200): #To run a subset
    data_entries = clean_data_records(raw_df_dat, i)

    if data_entries is not None:
        activity_frames.append(data_entries)
        print(i, " of ", raw_df_dat.shape[0], " processed")

if activity_frames:
    df_activities = pd.concat(activity_frames, ignore_index=True).fillna(0)
else:
    df_activities = pd.DataFrame(columns=cols_act_df)

# NOTE(review): this is a Windows-style absolute path while the cache folders
# above are POSIX paths under /content -- confirm the intended output location.
df_activities.to_csv('C:\\Research\\Hamphrey\\out.csv', index=False)

**Merging BP and activities data**

In [None]:
# Reload table syn16782065 (the APH Heart Age Survey, which contains BP data).
query = syn.tableQuery("SELECT * FROM   syn16782065")
bp = query.asDataFrame()

# createdOn is divided by 1000 and parsed as seconds; its calendar date is
# the join key.
bp['createdOn'] = pd.to_datetime(bp['createdOn'] / 1000, unit='s')
bp['date'] = bp['createdOn'].dt.date

# Left merge keeps every BP row and attaches the same-day activity columns
# of the same patient where they exist.
bp_activities = bp.merge(df_activities, how='left', on=['healthCode', 'date'])

In [None]:
# Display the merged table; BP rows without a matching activity day keep NaN
# in the activity columns because the merge above is a left merge.
bp_activities

Unnamed: 0,recordId,healthCode,createdOn,appVersion,phoneInfo,bloodPressureInstruction,bloodPressureInstruction_unit,heartAgeDataBloodGlucose,heartAgeDataBloodGlucose_unit,heartAgeDataDiabetes,...,heartAgeDataAge,date,HKQuantityTypeIdentifierFlightsClimbed,HKQuantityTypeIdentifierHeartRate,HKQuantityTypeIdentifierDistanceWalkingRunning,HKQuantityTypeIdentifierBloodPressureDiastolic,HKQuantityTypeIdentifierBloodPressureSystolic,HKQuantityTypeIdentifierStepCount,HKQuantityTypeIdentifierDistanceCycling,HKQuantityTypeIdentifierActiveEnergyBurned
0,6f1b8ed2-4509-4d42-a025-131b5d993865,9f936943-acd6-4e44-9d67-21de6cc206ae,2015-03-10 15:57:38,"version 1.0, build 5.1",iPhone 5s (GSM),108,mg/dL,86.0,mg/dL,False,...,49.0,2015-03-10,,,,,,,,
1,09c7cac9-9134-4d47-aef3-6d72281515b3,9f936943-acd6-4e44-9d67-21de6cc206ae,2015-03-12 00:42:33,"version 1.0, build 5.1",iPhone 5s (GSM),102,mg/dL,86.0,mg/dL,False,...,49.0,2015-03-12,,,,,,,,
2,a45869e7-dd3c-4162-bfbf-34629dda6f46,54cde2ca-b0d5-4fe4-a58d-476fdb47b192,2015-03-16 04:20:33,"version 1.0, build 5.1",iPhone 6 Plus,160,mg/dL,0.0,mg/dL,False,...,19.0,2015-03-16,,,,,,,,
3,0f596fab-a546-4b98-98a2-a1571709eef6,211e9177-22a7-485a-96ae-06b447da128a,2015-03-16 04:15:22,"version 1.0, build 5.1",iPhone 6,115,mg/dL,90.0,mg/dL,False,...,43.0,2015-03-16,,,,,,,,
4,defb3bb2-3d1a-4817-9b8b-01706618f2b4,abf07e97-af5c-4ca7-ba76-1458454e9020,2015-03-15 22:17:51,"version 1.0, build 5.1",iPhone 5s (GSM+CDMA),120,mg/dL,0.0,mg/dL,False,...,21.0,2015-03-15,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12099,d802d649-9361-4990-89ed-49918dd18f82,28a6a341-a5ed-465b-aa62-f89571005b51,2015-10-27 16:50:28,"version 1.0.10, build 1",iPhone 6,116,mmHg,0.0,mg/dL,False,...,22.0,2015-10-27,,,,,,,,
12100,5063a915-33e8-405f-834a-cd1cef19c876,19fe1a43-2ad8-458c-ba33-4de5d6d08943,2015-10-27 18:08:38,"version 1.0.10, build 1",iPhone 6 Plus,120,mmHg,114.0,mg/dL,False,...,45.0,2015-10-27,,,,,,,,
12101,4065a67e-f5e0-43cd-a2ed-c8f9990bfdf3,caf19b48-35b4-413e-983b-727c2f975f3c,2015-10-27 13:04:59,"version 1.0.10, build 1","iPhone8,1",90,mmHg,0.0,mg/dL,False,...,25.0,2015-10-27,,,,,,,,
12102,0b1116bf-306b-4138-97a9-3b3bbe399502,55e5c0cf-2a72-4de0-8a51-f4c63752721d,2015-10-27 21:18:23,"version 1.0.10, build 1",iPhone 5s (GSM),130,mmHg,88.0,mg/dL,False,...,86.0,2015-10-27,,,,,,,,


### **Data Download and Pre-processing of HealthKit Workout**


**Setting up config for download**

In [None]:
# Redirect the Synapse client's cache root to a separate folder for the
# workout files.
syn.cache.cache_root_dir = '/content/workout_data'    # Change cache path

**Downloading all individual .csv files**\
*Took 3m 21s to download the data*

In [None]:
# Patients of interest: every healthCode that has a row in syn16782065
# (the Heart Age survey table used for BP above).
query_wo = syn.tableQuery("select healthCode from syn16782065")
patients_list_df = query_wo.asDataFrame()
# The survey table has one row per submission, so the same healthCode repeats;
# de-duplicate (and drop missing codes) to keep the IN (...) list short.
patients_list = patients_list_df['healthCode'].dropna().astype(str).unique().tolist()
patients = ','.join(f"'{x}'" for x in patients_list)

# Restrict the HealthKit Workout table (syn16782060) to those patients
query_wo = "select * from syn16782060 where healthCode in (" + patients + ")"
query_wo = syn.tableQuery(query_wo)
raw_df_wo = query_wo.asDataFrame()

# Specify the columns to download
cols_to_download = ['data.csv']

# Download the CSV file
syn.downloadTableColumns(query_wo, cols_to_download)

Downloading 2178 files, 0 cached locally


INFO:synapseclient_default:Downloading 2178 files, 0 cached locally


OrderedDict([('31304289',
              '/content/workout_data/289/31304289/1d8a3286-0aa8-4ce2-a81d-6bd3dc842d78.csv'),
             ('31304328',
              '/content/workout_data/328/31304328/6e4315c6-3dd6-424d-9f2f-66edb13981f9.csv'),
             ('31304354',
              '/content/workout_data/354/31304354/0f70e97c-85e8-4225-85f7-894c122b2405.csv'),
             ('31304414',
              '/content/workout_data/414/31304414/557d2613-48d1-4eee-b551-ec8151c0c747.csv'),
             ('31304437',
              '/content/workout_data/437/31304437/f5b6519d-7667-4c92-b3e6-c0eca75eda5d.csv'),
             ('31304498',
              '/content/workout_data/498/31304498/cdc09f8d-f7e2-448f-aae1-8c47f88f84b2.csv'),
             ('31304541',
              '/content/workout_data/541/31304541/82b59599-a3df-4037-a9fc-d2ebacd2388a.csv'),
             ('31304574',
              '/content/workout_data/574/31304574/924431dd-7266-4d0d-baa3-cd584fc99f80.csv'),
             ('31304605',
              

**Creating workout dataframe**

In [None]:
# recordId of every workout record that could not be read or parsed
bad_records_wo = []
def clean_data_records(raw_df_wo, index):
    """Load and clean the workout CSV of one HealthKit Workout record.

    Keeps only rows whose ``workoutType`` contains 'HKWorkoutActivityType',
    drops columns that are not needed, removes rows duplicated on
    (startTime, endTime) and rows whose ``energy.consumed`` is not numeric.

    Args:
        raw_df_wo: DataFrame of the workout table; needs the columns
            'data.csv' and 'recordId'.
        index: Positional row index into ``raw_df_wo``.

    Returns:
        The cleaned DataFrame (an independent copy), or ``None`` when the
        record cannot be processed; in that case its recordId is appended to
        ``bad_records_wo``.
    """
    file_handle_id = raw_df_wo.iloc[index]['data.csv']
    file_path = syn.cache.get(file_handle_id)

    try:
        df_dat = pd.read_csv(file_path)
        df_dat['startTime'] = pd.to_datetime(df_dat['startTime'])
        df_dat['endTime'] = pd.to_datetime(df_dat['endTime'])

        # na=False: a row with a missing workoutType is dropped as junk
        # instead of making the boolean mask invalid for the whole file.
        is_workout = df_dat['workoutType'].str.contains('HKWorkoutActivityType', na=False)
        df_dat = df_dat[is_workout]
        df_dat = df_dat.drop(['type', 'total.distance', 'unit', 'source', 'sourceIdentifier', 'metadata'], axis=1) # Dropping unnecessary columns
        df_dat = df_dat.drop_duplicates(subset=['startTime', 'endTime']) # removing redundant rows
        #df_dat_sum = df_dat.groupby('date')['energy.consumed'].sum().reset_index()
        has_numeric_energy = pd.to_numeric(df_dat['energy.consumed'], errors='coerce').notnull()
        # .copy() because the caller adds a 'healthCode' column to the result
        return df_dat[has_numeric_energy].copy()

    except Exception:
        # Best-effort: remember the failing record and keep going.
        bad_records_wo.append(raw_df_wo.iloc[index]['recordId'])
        return None

# Clean every workout record, tag its rows with the record's healthCode and
# collect the pieces; concatenating once at the end avoids re-copying a
# growing DataFrame on every iteration.
workout_frames = []
for i in range(raw_df_wo.shape[0]):
#for i in range(209): #To run a subset
    data_entries = clean_data_records(raw_df_wo, i)
    #print(data_entries)
    if data_entries is not None:
        data_entries['healthCode'] = raw_df_wo.iloc[i]['healthCode']
        workout_frames.append(data_entries)
        print(i, " of ", raw_df_wo.shape[0], " processed")

if workout_frames:
    df_workout = pd.concat(workout_frames, ignore_index=True)
else:
    df_workout = pd.DataFrame()

df_workout.to_csv('workout.csv', index=False)


0  of  2178  processed
1  of  2178  processed
2  of  2178  processed
3  of  2178  processed
4  of  2178  processed
5  of  2178  processed
6  of  2178  processed
7  of  2178  processed
8  of  2178  processed
9  of  2178  processed
10  of  2178  processed
11  of  2178  processed
12  of  2178  processed
13  of  2178  processed
14  of  2178  processed
15  of  2178  processed
16  of  2178  processed
17  of  2178  processed
18  of  2178  processed
19  of  2178  processed
20  of  2178  processed
21  of  2178  processed
22  of  2178  processed
23  of  2178  processed
24  of  2178  processed
25  of  2178  processed
26  of  2178  processed
27  of  2178  processed
28  of  2178  processed
29  of  2178  processed
30  of  2178  processed
31  of  2178  processed
32  of  2178  processed
33  of  2178  processed
34  of  2178  processed
35  of  2178  processed
36  of  2178  processed
37  of  2178  processed
38  of  2178  processed
39  of  2178  processed
40  of  2178  processed
41  of  2178  processed
42

Add rolling K averaging to populate missing values in dataframe

In [None]:
def rolling_k_days(bp, predictor, k, value_col='sleep_minutes'):
  '''
  Attach a k-day rolling average of a predictor to the blood pressure table.

  The predictor is resampled to one row per patient and calendar day (days
  without data become NaN), then a rolling mean over a window of k daily rows
  (the current day and the k-1 days before it, needing at least one
  observation) is computed per patient. Days with no data anywhere in that
  window stay empty.

  Args:
    bp: DataFrame with at least 'healthCode' and 'date' columns. It is not
      modified; the date conversion happens on a copy.
    predictor: DataFrame with 'healthCode', 'date' and ``value_col``.
      Non-numeric columns (e.g. time-of-day objects) are left out because
      they cannot be averaged.
    k: Window length in days (>= 1).
    value_col: Predictor column to average; the result is stored in
      ``'k_' + value_col``. Defaults to 'sleep_minutes'.

  Returns:
    ``bp`` left-merged with the daily predictor columns and the rolling
    average column, with 'date' as datetime64.

  Raises:
    ValueError: if ``k`` is smaller than 1.
    KeyError: if ``value_col`` is not a column of ``predictor``.
  '''
  if k < 1:
    raise ValueError('k must be a positive number of days')
  if value_col not in predictor.columns:
    raise KeyError(value_col)

  keys = ['healthCode', 'date']
  k_col = 'k_' + value_col

  # Work on copies so the caller's frames keep their original 'date' values;
  # this also makes repeated calls with the same bp behave identically.
  bp = bp.copy()
  bp['date'] = pd.to_datetime(bp['date'])

  predictor_df = predictor.copy()
  predictor_df['date'] = pd.to_datetime(predictor_df['date'])
  predictor_df[value_col] = pd.to_numeric(predictor_df[value_col], errors='coerce')

  # Only numeric columns can go through the daily mean below
  value_cols = [
      col for col in predictor_df.columns
      if col not in keys and pd.api.types.is_numeric_dtype(predictor_df[col])
  ]

  # Add the blood pressure dates so each patient's daily range also covers
  # the days on which BP was measured
  predictor_df = pd.merge(
      predictor_df[keys + value_cols],
      bp[keys].drop_duplicates(),
      on=keys,
      how='outer',
  )
  predictor_df = predictor_df.dropna(subset=keys)

  # Resample to daily data (one entry for every day in each patient's date
  # range, NaN where there is no data)
  daily = (
      predictor_df.set_index('date')
      .groupby('healthCode')[value_cols]
      .resample('D')
      .mean()
      .reset_index()
      .sort_values(keys)
  )

  # Rolling average over k daily rows per patient; min_periods=1 lets a
  # single observation inside the window populate the day
  daily[k_col] = daily.groupby('healthCode')[value_col].transform(
      lambda s: s.rolling(window=k, min_periods=1).mean()
  )

  # Merge with BP data
  merged_bp = bp.merge(daily, on=keys, how='left')
  print('Shape of bp df', bp.shape)
  print('shape of predictor df', predictor.shape)
  print(merged_bp[merged_bp[k_col].notna()])
  return merged_bp

In [None]:
# Window length (in days) for the rolling average; pass the variable so that
# changing k here actually changes the call.
k = 10
rolling_k_days(bp, sleep, k)