<a href="https://colab.research.google.com/github/Blistt/bp-recommender/blob/Romasa/BP_Recommendation_system.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
path = 'data/'

Extract range of dates from bp data

In [2]:
from datetime import timedelta
import pandas as pd
from utils import fix_sys_dias, master_merge, get_non_zero

# Extracts range from BP data
bp = pd.read_csv(path + 'bp.csv')
bp['createdOn'] = bp['createdOn'] / 1000
bp['createdOn'] = pd.to_datetime(bp['createdOn'], unit='s')
bp['date'] = bp['createdOn'].dt.date
bp = bp.rename(columns={'heartAgeDataSystolicBloodPressure': 'systolic', 'bloodPressureInstruction': 'diastolic'})
bp = fix_sys_dias(bp)
bp = bp[['healthCode', 'date', 'diastolic', 'systolic']]


min_date = bp['date'].min() - timedelta(days=1)     # Add a padding day to min date to still consider prior day values
max_date = bp['date'].max()

print('min', min_date)
print('max', max_date)
print(bp.columns)

unfiltered shape (10772, 24)
number of sys-dias swaps 10438
Shape of bp table after removing out of range systolic values (10700, 24)
Shape of bp table after removing out of range diastolic values (10509, 24)
min 2015-03-09
max 2015-10-27
Index(['healthCode', 'date', 'diastolic', 'systolic'], dtype='object')


**Read HealthKit data**

In [3]:
hk = pd.read_csv(path + 'hk.csv')
hk = hk.rename(columns={'HKQuantityTypeIdentifierHeartRate': 'heart_rate', 'HKQuantityTypeIdentifierDistanceWalkingRunning': 'distance_walking',
                   'HKQuantityTypeIdentifierBloodPressureDiastolic': 'diastolic_ex', 'HKQuantityTypeIdentifierBloodPressureSystolic': 'systolic_ex',
                   'HKQuantityTypeIdentifierStepCount': 'steps', 'HKQuantityTypeIdentifierFlightsClimbed': 'floors',
                   'HKQuantityTypeIdentifierActiveEnergyBurned': 'active_calories', 'HKQuantityTypeIdentifierDistanceCycling': 'distance_cycling'})
hk = hk.drop(['systolic_ex', 'diastolic_ex'], axis=1)
hk['date'] = pd.to_datetime(hk['date'])
hk['date'] = hk['date'].dt.date

**Read sleep data**

In [4]:
sleep = pd.read_csv(path + 'sleep.csv')
sleep['date'] = pd.to_datetime(sleep['date'])
sleep['date'] = sleep['date'].dt.date
sleep['bed_time'] = sleep[sleep['bed_time'].notnull()]['bed_time']/ pd.Timedelta(minutes=1) ## Suggested change to convert bed_time from hh:mm:ss to integer


**Read workout data**

In [5]:
wo = pd.read_csv(path + 'workout.csv')
wo = wo.rename(columns={'energy.consumed': 'wo_calories'})

wo['startTime'] = wo['startTime'].str.slice(0,19)   # Remove timezone information
wo['startTime'] = pd.to_datetime(wo['startTime'], format='%Y-%m-%d %H:%M:%S', errors='coerce')
wo['endTime'] = wo['endTime'].str.slice(0,19)   # Remove timezone information
wo['endTime'] = pd.to_datetime(wo['endTime'], format='%Y-%m-%d %H:%M:%S%z', errors='coerce')

# Calculate active time in minutes
wo['active_minutes'] = (wo['endTime'] - wo['startTime']).dt.total_seconds() / 60

# Group by day
wo['date'] = wo['startTime'].dt.date
wo = wo.groupby(['date', 'healthCode']).agg({'active_minutes': 'sum', 'wo_calories': 'sum'}).reset_index()

print(wo)

            date                            healthCode  active_minutes  \
0     2014-06-02  8ebdc824-bd18-4290-a887-197a6b3ec4bc             0.0   
1     2014-06-15  8ebdc824-bd18-4290-a887-197a6b3ec4bc             0.0   
2     2014-06-17  8ebdc824-bd18-4290-a887-197a6b3ec4bc             0.0   
3     2014-06-22  8ebdc824-bd18-4290-a887-197a6b3ec4bc             0.0   
4     2014-06-26  8ebdc824-bd18-4290-a887-197a6b3ec4bc             0.0   
...          ...                                   ...             ...   
4317  2015-10-26  0a6eb7f8-c05d-4119-bfe0-f0e233d9f747             0.0   
4318  2015-10-26  37633071-388d-4e6e-b6fc-29e37a70a936             0.0   
4319  2015-10-26  56e4f3c3-2c3b-4f65-a598-2908a13353d0             0.0   
4320  2015-10-26  a1461e7f-9a77-411f-9698-6dd6e5bed59b             0.0   
4321  2015-10-26  a88f3758-0e43-43bd-bc6a-55a0780ec643             0.0   

      wo_calories  
0           0.000  
1           0.000  
2           0.000  
3           0.000  
4          

**Merge all sub_datasets into master dataset**

In [6]:
# Merge healthkit_df with sleep_df
print('hk shape', hk.shape)
print('sleep shape', sleep.shape)
predictor_df = hk.merge(sleep, on=['healthCode', 'date'], how='outer')
print('hk-sleep merged shape', predictor_df.shape)
get_non_zero(predictor_df)
print('--------------------------------------------------------------------------------')

# Merge workout_df with merged (healthkit_df, sleep_df) --> predictor_df
print('hk-sleep shape', predictor_df.shape)
print('wo shape', wo.shape)
predictor_df = predictor_df.merge(wo, on=['healthCode', 'date'], how='outer')
# print number of unique date and user combinations in predictor
print('hk-sleep-wo shape', predictor_df.shape)
get_non_zero(predictor_df)
print('--------------------------------------------------------------------------------')

# MASTER MERGE - Merge with bp_df
print('MASTER MERGE - baseline')
baseline = master_merge(predictor_df, bp)
print('baseline (hk-sleep-wo-bp merge)', baseline.shape, '\n')
get_non_zero(baseline)
baseline.to_csv('data/baseline.csv', index=False)
print('--------------------------------------------------------------------------------')


hk shape (51123, 8)
sleep shape (8420, 5)
hk-sleep merged shape (55916, 11)
selected cols ['bed_time', 'distance_cycling', 'distance_walking', 'sleep_minutes', 'active_calories', 'floors', 'heart_rate', 'steps', 'awake_count']
Number of non-zero and not NaN values: 151545
Number of rows with at least one non-zero and not NaN value: 55701
--------------------------------------------------------------------------------
hk-sleep shape (55916, 11)
wo shape (4322, 4)
hk-sleep-wo shape (56797, 13)
selected cols ['active_minutes', 'bed_time', 'distance_cycling', 'distance_walking', 'sleep_minutes', 'wo_calories', 'active_calories', 'floors', 'heart_rate', 'steps', 'awake_count']
Number of non-zero and not NaN values: 155733
Number of rows with at least one non-zero and not NaN value: 56537
--------------------------------------------------------------------------------
MASTER MERGE - baseline


baseline (hk-sleep-wo-bp merge) (10509, 15) 

selected cols ['active_minutes', 'bed_time', 'distance_cycling', 'distance_walking', 'sleep_minutes', 'wo_calories', 'active_calories', 'floors', 'heart_rate', 'steps', 'awake_count']
Number of non-zero and not NaN values: 3403
Number of rows with at least one non-zero and not NaN value: 1117
--------------------------------------------------------------------------------


**AUGMENTATIONS** - 
Implements 3 augmentation strategies:
1) k-rolling average: replaces missing values with the rolling average of a k sized window along the temporal dimension for each user
2) KNN intra user imputation: searches for nearest neighbors only within the same user
3) KNN inter user imputation: searches for nearest neighbors accross all users

In [8]:
from augmentations import rolling_k_days, knn_impute_intra_user, knn_impute_inter_user

# AUGMENTATION - rolling k days
k = 3
print('AUGMENTATION - rolling k days')
# Prepare pre_augmentation df with all data with bp and predictor_df
pre_augmented = bp.merge(predictor_df, on=['healthCode', 'date'], how='outer')
# drop systolic and diastolic columns
pre_augmented = pre_augmented.drop(['systolic', 'diastolic'], axis=1)
# Augment the pre_augmented df
pre_augmented = rolling_k_days(pre_augmented, k)
augmented_k = master_merge(pre_augmented, bp)
print('augmented', augmented_k.shape)
print('augmented NON-ZERO values')
get_non_zero(augmented_k)
augmented_k.to_csv('data/augmented_k.csv')
print('--------------------------------------------------------------------------------')

# AUGMENTATION - knn impute intra user
k = 3
print('AUGMENTATION - knn impute intra user')
augmented_intra = knn_impute_intra_user(predictor_df, k)
augmented_intra = master_merge(augmented_intra, bp)
print('imputed', augmented_intra.shape)
print('imputed NON-ZERO values')
get_non_zero(augmented_intra)
augmented_intra.to_csv('data/augmented_intra.csv')
print('--------------------------------------------------------------------------------')

# AUGMENTATION - knn impute inter user
k = 3
print('AUGMENTATION - knn impute inter user')
augmented_inter = knn_impute_inter_user(predictor_df, k)
augmented_inter = master_merge(augmented_inter, bp)
print('imputed', augmented_inter.shape)
print('imputed NON-ZERO values')
get_non_zero(augmented_inter)
augmented_inter.to_csv('data/augmented_inter.csv')

AUGMENTATION - rolling k days
augmented (10509, 15)
augmented NON-ZERO values
selected cols ['active_minutes', 'bed_time', 'distance_cycling', 'distance_walking', 'sleep_minutes', 'wo_calories', 'active_calories', 'floors', 'heart_rate', 'steps', 'awake_count']
Number of non-zero and not NaN values: 4250
Number of rows with at least one non-zero and not NaN value: 1364
--------------------------------------------------------------------------------
AUGMENTATION - knn impute intra user
imputed (10509, 15)
imputed NON-ZERO values
selected cols ['active_minutes', 'bed_time', 'distance_cycling', 'distance_walking', 'sleep_minutes', 'wo_calories', 'active_calories', 'floors', 'heart_rate', 'steps', 'awake_count']
Number of non-zero and not NaN values: 4399
Number of rows with at least one non-zero and not NaN value: 1120
--------------------------------------------------------------------------------
AUGMENTATION - knn impute inter user
imputed (10509, 15)
imputed NON-ZERO values
selected c