<a href="https://colab.research.google.com/github/Blistt/bp-recommender/blob/main/BP_Recommendation_system.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

##Important library install and connection

In [2]:
path = 'data/'

Extract range of dates from bp data

In [3]:
'''
the values in the systolic and diastolic columns are mixed up and erroneous i.e., systolic should be always greater than diastolic but due to manual entry
some of the people entered it the other way. Moreover, some people have entered wrong data like 8.0 for diastolic etc.
The code below will swap the diastolic and systolic values where required and remove the entries which falls below the specified range
'''
def fix_sys_dias(bp):
  print('unfiltered shape', bp.shape)

  # Create a mask where diastolic is greater than systolic
  mask = bp['diastolic'] < bp['systolic']

  # Use the mask to swap the values
  bp.loc[mask, ['systolic', 'diastolic']] = bp.loc[mask, ['diastolic', 'systolic']].values
  print('number of sys-dias swaps', mask.sum())

  bp = bp[(bp['systolic'] > 60) | (bp['systolic'] < 260)]     # systolic is out of range
  print('systolic filter', bp.shape)
  bp = bp[(bp['diastolic'] > 20) | (bp['diastolic'] < 150)]     # diastolic is out of range
  print('diastolic filter', bp.shape)

  return bp

In [4]:
from datetime import timedelta
import pandas as pd

# Extracts range from BP data
bp = pd.read_csv(path + 'bp.csv')
bp['createdOn'] = bp['createdOn'] / 1000
bp['createdOn'] = pd.to_datetime(bp['createdOn'], unit='s')
bp['date'] = bp['createdOn'].dt.date
bp = bp.rename(columns={'heartAgeDataSystolicBloodPressure': 'systolic', 'bloodPressureInstruction': 'diastolic'})
bp = fix_sys_dias(bp)
bp = bp[['healthCode', 'date', 'diastolic', 'systolic']]
# bp = bp.fillna(0, inplace=True)
print(bp.head())
bp.fillna(0, inplace=True)
print(type(bp))

min_date = bp['date'].min() - timedelta(days=1)     # Add a padding day to min date to still consider prior day values
max_date = bp['date'].max()

print('min', min_date)
print('max', max_date)

unfiltered shape (10772, 24)
number of sys-dias swaps 157
systolic filter (10771, 24)
diastolic filter (10771, 24)
                             healthCode        date  diastolic  systolic
0  9f936943-acd6-4e44-9d67-21de6cc206ae  2015-03-10        108      77.0
1  9f936943-acd6-4e44-9d67-21de6cc206ae  2015-03-12        102      73.0
2  54cde2ca-b0d5-4fe4-a58d-476fdb47b192  2015-03-16        160     130.0
3  211e9177-22a7-485a-96ae-06b447da128a  2015-03-16        115      75.0
4  abf07e97-af5c-4ca7-ba76-1458454e9020  2015-03-15        120      80.0
<class 'pandas.core.frame.DataFrame'>
min 2015-03-09
max 2015-10-27


Read HealthKit data

In [5]:
hk = pd.read_csv(path + 'healthkit.csv')
hk = hk.rename(columns={'HKQuantityTypeIdentifierHeartRate': 'heart_rate', 'HKQuantityTypeIdentifierDistanceWalkingRunning': 'distance_walking',
                   'HKQuantityTypeIdentifierBloodPressureDiastolic': 'diastolic', 'HKQuantityTypeIdentifierBloodPressureSystolic': 'systolic',
                   'HKQuantityTypeIdentifierStepCount': 'steps', 'HKQuantityTypeIdentifierFlightsClimbed': 'floors',
                   'HKQuantityTypeIdentifierActiveEnergyBurned': 'active_calories', 'HKQuantityTypeIdentifierDistanceCycling': 'distance_cycling'})
hk['date'] = pd.to_datetime(hk['date'])
hk['date'] = hk['date'].dt.date
hk.fillna(0, inplace=True)


hk = hk[(hk['date'] >= min_date) & (hk['date'] <= max_date)]
print(hk)
print(hk[['healthCode', 'date']].drop_duplicates().shape)

                                  healthCode        date  floors heart_rate  \
0       12a38046-1512-409a-b3a1-6046e97e650e  2015-06-22     0.0        0.0   
1       12a38046-1512-409a-b3a1-6046e97e650e  2015-06-23     0.0        0.0   
2       12a38046-1512-409a-b3a1-6046e97e650e  2015-06-21     0.0        0.0   
3       5aa79529-ef84-45e1-a4f3-804aa871fc9a  2015-06-29     7.0    1.08333   
4       2e9fb3e4-a83d-4d89-a607-f8fba4b0cc5c  2015-06-29     0.0        0.0   
...                                      ...         ...     ...        ...   
121465  1c33a77b-8da7-4a98-a6c6-cc4f91d031c5  2015-10-25     0.0        0.0   
121466  1c33a77b-8da7-4a98-a6c6-cc4f91d031c5  2015-10-26     0.0        0.0   
121467  ffe3d273-9da3-4cfa-b8a2-7c8e7c5e39c0  2015-10-27     0.0          0   
121468  4ef2413e-48ae-41e1-9641-e1b47ab450a1  2015-10-27     2.0        0.0   
121469  4ef2413e-48ae-41e1-9641-e1b47ab450a1  2015-10-26     2.0        0.0   

          distance_walking  diastolic  systolic    

Read sleep data

In [6]:
sleep = pd.read_csv(path + 'sleep.csv')
sleep['date'] = pd.to_datetime(sleep['date'])
sleep['date'] = sleep['date'].dt.date
sleep.fillna(0, inplace=True)

sleep = sleep[(sleep['date'] >= min_date) & (sleep['date'] <= max_date)]
print(sleep)

                                healthCode        date  sleep_minutes  \
0     383e1eee-cf63-4c4c-9194-6100ff86e310  2015-06-28     411.000000   
1     a7d67779-d082-4c74-9a06-f0544a37d378  2015-06-27      43.000000   
2     a7d67779-d082-4c74-9a06-f0544a37d378  2015-06-28     184.000000   
3     a7d67779-d082-4c74-9a06-f0544a37d378  2015-06-29     254.250000   
4     6b771aad-7228-41ab-8628-39cf44e79f33  2015-06-12     228.000000   
...                                    ...         ...            ...   
8415  ca1152fe-05d5-498d-92ad-76771338b50e  2015-10-26     442.000000   
8416  ac6e2546-9ec0-47d6-981e-44341f38446d  2015-10-26     277.366667   
8417  a88f3758-0e43-43bd-bc6a-55a0780ec643  2015-10-25       5.000000   
8418  a88f3758-0e43-43bd-bc6a-55a0780ec643  2015-10-26     231.000000   
8419  a88f3758-0e43-43bd-bc6a-55a0780ec643  2015-10-27      11.000000   

      awake_count  bed_time  
0               1  01:42:00  
1               1  02:08:31  
2               1  22:34:12  
3  

Read workout data

In [7]:
wo = pd.read_csv(path + 'workout.csv')
wo = wo.rename(columns={'energy.consumed': 'wo_calories'})

wo['startTime'] = wo['startTime'].str.slice(0,19)   # Remove timezone information
wo['startTime'] = pd.to_datetime(wo['startTime'], format='%Y-%m-%d %H:%M:%S', errors='coerce')
wo['endTime'] = wo['endTime'].str.slice(0,19)   # Remove timezone information
wo['endTime'] = pd.to_datetime(wo['endTime'], format='%Y-%m-%d %H:%M:%S%z', errors='coerce')

# Calculate active time in minutes
wo['active_minutes'] = (wo['endTime'] - wo['startTime']).dt.total_seconds() / 60
wo.fillna(0, inplace=True)

# Group by day
wo['date'] = wo['startTime'].dt.date
wo = wo.groupby(['date', 'healthCode']).agg({'active_minutes': 'sum', 'wo_calories': 'sum'}).reset_index()

print(wo)

            date                            healthCode  active_minutes  \
0     2014-06-02  8ebdc824-bd18-4290-a887-197a6b3ec4bc             0.0   
1     2014-06-15  8ebdc824-bd18-4290-a887-197a6b3ec4bc             0.0   
2     2014-06-17  8ebdc824-bd18-4290-a887-197a6b3ec4bc             0.0   
3     2014-06-22  8ebdc824-bd18-4290-a887-197a6b3ec4bc             0.0   
4     2014-06-26  8ebdc824-bd18-4290-a887-197a6b3ec4bc             0.0   
...          ...                                   ...             ...   
4317  2015-10-26  0a6eb7f8-c05d-4119-bfe0-f0e233d9f747             0.0   
4318  2015-10-26  37633071-388d-4e6e-b6fc-29e37a70a936             0.0   
4319  2015-10-26  56e4f3c3-2c3b-4f65-a598-2908a13353d0             0.0   
4320  2015-10-26  a1461e7f-9a77-411f-9698-6dd6e5bed59b             0.0   
4321  2015-10-26  a88f3758-0e43-43bd-bc6a-55a0780ec643             0.0   

      wo_calories  
0           0.000  
1           0.000  
2           0.000  
3           0.000  
4          

  wo.fillna(0, inplace=True)


**Augmentation**

In [8]:
def rolling_k_days(predictor, k):
  '''
  Populates missing values a time series table with the rolling average of the k prior days
  (not all days will be populated, as any day for which there is no data in the previous k days will
  remain as an empty value)
  '''
  predictor_df = predictor.copy()
  print('original shape', predictor_df.shape)

  # Resample to daily data (create an entry for everyday in the dates range, even if it has empty values)
  predictor_df['date'] = pd.to_datetime(predictor_df['date'])
  predictor_df.set_index('date', inplace=True)
  # predictor_df = predictor_df.groupby('healthCode').resample('D').mean()
  print('post sample shape', predictor_df.shape)

  # Fill in missing days with NaNs
  predictor_df = predictor_df.reset_index().set_index('date').groupby('healthCode', group_keys=False).apply(lambda x: x.asfreq('D')).reset_index()

  predictor_df = predictor_df.sort_values(['healthCode', 'date'])

  # Select variables (columns) to augment
  cols_to_augment = predictor_df.columns[:2]  # augment all but the first two
  cols_to_augment = ['floors']

  # Calculate rolling average of k days to populate as many days with missing data as possible
  predictor_df[cols_to_augment] = predictor_df.groupby('healthCode')[cols_to_augment].rolling(window=k, min_periods=1).mean().reset_index(0, drop=True)

  return predictor_df

In [9]:
def master_merge(predictor_df, bp):
  # print('df 0 shape', bp.shape)
  # print('df 1 shape', predictor_df.shape)
  bp['date'] = pd.to_datetime(bp['date'])
  predictor_df['date'] = predictor_df['date'].astype(bp['date'].dtypes)
  predictor_df = predictor_df.drop_duplicates(subset=['healthCode', 'date'])
  master_df = bp.merge(predictor_df, on=['healthCode', 'date'], how='inner')
  # print('merged shape', master_df.shape)
  return master_df

In [10]:
def get_non_zero(df):
    excluded_cols = ['healthCode', 'date', 'systolic', 'diastolic']
    selected_cols = list(set(df.columns) - set(excluded_cols))
    print('selected cols', selected_cols)
    valid_values = df[selected_cols].ne(0).dropna().sum().sum()
    print('Number of non-zero and not NaN values:', valid_values)
    return valid_values

In [11]:
from ast import Not
# Merge healthkit_df with sleep_df
print('hk shape', hk.shape)
print('hk unique date-user combinations', hk[['healthCode', 'date']].drop_duplicates().shape)
print('sleep shape', sleep.shape)
predictor_df = hk.merge(sleep, on=['healthCode', 'date'], how='outer')
print('merged shape', predictor_df.shape)
get_non_zero(predictor_df)



# Merge workout_df with merged (healthkit_df, sleep_df) --> predictor_df
print('hk_sleep shape', predictor_df.shape)
print('wo shape', wo.shape)
predictor_df = predictor_df.merge(wo, on=['healthCode', 'date'], how='outer')
# print number of unique date and user combinations in predictor
print('predictor unique date-user combinations', predictor_df[['healthCode', 'date']].drop_duplicates().shape)

print('merged shape', predictor_df.shape)
get_non_zero(predictor_df)




# AUGMENT - predictor_df (K-rolling avg, and knn)
# print('AUGMENTATION')
k = 10
# print('df 0 shape', predictor_df.shape)
# augmented_df = rolling_k_days(predictor_df, k)
# print('augmented unique date-user combinations', augmented_df[['healthCode', 'date']].drop_duplicates().shape)
# print('augmented shape', augmented_df.shape, '\n')


# MASTER MERGE - Merge with bp_df
# print('MASTER MERGE - baseline')
baseline = master_merge(predictor_df, bp)
baseline.fillna(0, inplace=True)
print('baseline', baseline.shape, '\n')
print(baseline.head(3))
get_non_zero(baseline)
# print('baseline', baseline[baseline['sleep_minutes'].notna()].shape, '\n')

# print('MASTER MERGE - augmented')
# augmented = master_merge(augmented_df, bp)
# print('augmented', augmented[augmented['sleep_minutes'].notna()].shape)








# ADD HISTORICAL BP - Add BP historical variable to master df (1 period, 2 periods, 3 periods)
'''
Here is where we list the historical BP values for each row in master_df:
select a period of k days. If k is more than 1, aggregate within the period via averaging
then a number of periods from [1, 2, 3]
periods should not overlap with each other
'''



# RUN RF - Implement RF on master_df with: 1) no agumentation, 2) k-roll agumentation, 3) knn_augmentation
'''
Implement one RF first
'''


'''
Implement same RF with bootstraping
'''


'''
Extract feature importances and select top N most important features
'''


'''
Run second RF with top N most important features
'''




hk shape (121324, 10)
hk unique date-user combinations (41687, 2)
sleep shape (8255, 5)
merged shape (128649, 13)
selected cols ['bed_time', 'distance_cycling', 'floors', 'sleep_minutes', 'active_calories', 'awake_count', 'distance_walking', 'heart_rate', 'steps']
Number of non-zero and not NaN values: 766814
hk_sleep shape (128649, 13)
wo shape (4322, 4)
predictor unique date-user combinations (46446, 2)
merged shape (129811, 15)
selected cols ['bed_time', 'distance_cycling', 'floors', 'wo_calories', 'sleep_minutes', 'active_calories', 'awake_count', 'distance_walking', 'active_minutes', 'heart_rate', 'steps']
Number of non-zero and not NaN values: 1016133
baseline (1001, 17) 

                             healthCode       date  diastolic_x  systolic_x  \
0  0d5a3219-bf97-4a33-b24c-73d9c1b72067 2015-03-16          120        84.0   
1  cdda2358-6c36-4f1b-b362-d53fa4103882 2015-03-20          130        80.0   
2  ecae2a86-b881-49db-8385-fcabdb799a77 2015-03-21          120        70.0

'\nRun second RF with top N most important features\n'