<a href="https://colab.research.google.com/github/Blistt/bp-recommender/blob/Romasa/BP_Recommendation_system.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

##Important library install and connection

In [1]:
path = 'data/'

Extract range of dates from bp data

In [2]:
'''
the values in the systolic and diastolic columns are mixed up and erroneous i.e., systolic should be always greater than diastolic but due to manual entry
some of the people entered it the other way. Moreover, some people have entered wrong data like 8.0 for diastolic etc.
The code below will swap the diastolic and systolic values where required and remove the entries which falls below the specified range
'''
def fix_sys_dias(bp):
  print('unfiltered shape', bp.shape)

  # Create a mask where diastolic is greater than systolic
  mask = bp['diastolic'] > bp['systolic']

  # Use the mask to swap the values
  bp.loc[mask, ['systolic', 'diastolic']] = bp.loc[mask, ['diastolic', 'systolic']].values
  print('number of sys-dias swaps', mask.sum())
  bp = bp[(bp['systolic'] >= 40) & (bp['systolic'] <= 340)]     # keeping only within range systolic values
  print('Shape of bp table after removing out of range systolic values', bp.shape)
  bp = bp[(bp['diastolic'] >= 10) & (bp['diastolic'] <= 200)]     # keeping only within range diastolic values
  print('Shape of bp table after removing out of range diastolic values', bp.shape)

  return bp

In [3]:
from datetime import timedelta
import pandas as pd

# Extracts range from BP data
bp = pd.read_csv(path + 'bp.csv')
bp['createdOn'] = bp['createdOn'] / 1000
bp['createdOn'] = pd.to_datetime(bp['createdOn'], unit='s')
bp['date'] = bp['createdOn'].dt.date
bp = bp.rename(columns={'heartAgeDataSystolicBloodPressure': 'systolic', 'bloodPressureInstruction': 'diastolic'})
bp = fix_sys_dias(bp)
bp = bp[['healthCode', 'date', 'diastolic', 'systolic']]

bp.fillna(0, inplace=True)


min_date = bp['date'].min() - timedelta(days=1)     # Add a padding day to min date to still consider prior day values
max_date = bp['date'].max()

print('min', min_date)
print('max', max_date)
print(bp.columns)

unfiltered shape (10772, 24)
number of sys-dias swaps 10438
Shape of bp table after removing out of range systolic values (10700, 24)
Shape of bp table after removing out of range diastolic values (10509, 24)
min 2015-03-09
max 2015-10-27
Index(['healthCode', 'date', 'diastolic', 'systolic'], dtype='object')


Read HealthKit data

In [15]:
hk = pd.read_csv(path + 'hk.csv')
hk = hk.rename(columns={'HKQuantityTypeIdentifierHeartRate': 'heart_rate', 'HKQuantityTypeIdentifierDistanceWalkingRunning': 'distance_walking',
                   'HKQuantityTypeIdentifierBloodPressureDiastolic': 'diastolic_ex', 'HKQuantityTypeIdentifierBloodPressureSystolic': 'systolic_ex',
                   'HKQuantityTypeIdentifierStepCount': 'steps', 'HKQuantityTypeIdentifierFlightsClimbed': 'floors',
                   'HKQuantityTypeIdentifierActiveEnergyBurned': 'active_calories', 'HKQuantityTypeIdentifierDistanceCycling': 'distance_cycling'})
hk = hk.drop(['systolic_ex', 'diastolic_ex'], axis=1)
hk['date'] = pd.to_datetime(hk['date'])
hk['date'] = hk['date'].dt.date
hk.fillna(0, inplace=True)

hk = hk[(hk['date'] >= min_date) & (hk['date'] <= max_date)]
print(hk)

                                 healthCode        date  distance_cycling  \
0      003149e8-dd47-4a71-b434-2d9243773aa2  2015-08-07               0.0   
1      003149e8-dd47-4a71-b434-2d9243773aa2  2015-08-09               0.0   
2      003149e8-dd47-4a71-b434-2d9243773aa2  2015-08-10               0.0   
3      003149e8-dd47-4a71-b434-2d9243773aa2  2015-08-11               0.0   
4      003149e8-dd47-4a71-b434-2d9243773aa2  2015-08-14               0.0   
...                                     ...         ...               ...   
51118  fffddf4f-ebac-4ae2-86bb-50651a59236e  2015-10-16               0.0   
51119  fffddf4f-ebac-4ae2-86bb-50651a59236e  2015-10-21               0.0   
51120  fffddf4f-ebac-4ae2-86bb-50651a59236e  2015-10-22               0.0   
51121  fffddf4f-ebac-4ae2-86bb-50651a59236e  2015-10-23               0.0   
51122  fffddf4f-ebac-4ae2-86bb-50651a59236e  2015-10-24               0.0   

       distance_walking  floors  active_calories    steps  heart_rate  
0  

**Read sleep data**

In [16]:
sleep = pd.read_csv(path + 'sleep.csv')
sleep['date'] = pd.to_datetime(sleep['date'])
sleep['date'] = sleep['date'].dt.date
sleep['bed_time'] = sleep[sleep['bed_time'].notnull()]['bed_time']/ pd.Timedelta(minutes=1) ## Suggested change to convert bed_time from hh:mm:ss to integer
sleep.fillna(0, inplace=True)

sleep = sleep[(sleep['date'] >= min_date) & (sleep['date'] <= max_date)]
print(sleep)

                                healthCode        date  sleep_minutes  \
0     383e1eee-cf63-4c4c-9194-6100ff86e310  2015-06-28     411.000000   
1     a7d67779-d082-4c74-9a06-f0544a37d378  2015-06-27      43.000000   
2     a7d67779-d082-4c74-9a06-f0544a37d378  2015-06-28     184.000000   
3     a7d67779-d082-4c74-9a06-f0544a37d378  2015-06-29     254.250000   
4     6b771aad-7228-41ab-8628-39cf44e79f33  2015-06-12     228.000000   
...                                    ...         ...            ...   
8415  ca1152fe-05d5-498d-92ad-76771338b50e  2015-10-26     442.000000   
8416  ac6e2546-9ec0-47d6-981e-44341f38446d  2015-10-26     277.366667   
8417  a88f3758-0e43-43bd-bc6a-55a0780ec643  2015-10-25       5.000000   
8418  a88f3758-0e43-43bd-bc6a-55a0780ec643  2015-10-26     231.000000   
8419  a88f3758-0e43-43bd-bc6a-55a0780ec643  2015-10-27      11.000000   

      awake_count     bed_time  
0               1   102.000000  
1               1   128.516667  
2               1  1354.

**Read workout data**

In [6]:
wo = pd.read_csv(path + 'workout.csv')
wo = wo.rename(columns={'energy.consumed': 'wo_calories'})

wo['startTime'] = wo['startTime'].str.slice(0,19)   # Remove timezone information
wo['startTime'] = pd.to_datetime(wo['startTime'], format='%Y-%m-%d %H:%M:%S', errors='coerce')
wo['endTime'] = wo['endTime'].str.slice(0,19)   # Remove timezone information
wo['endTime'] = pd.to_datetime(wo['endTime'], format='%Y-%m-%d %H:%M:%S%z', errors='coerce')

# Calculate active time in minutes
wo['active_minutes'] = (wo['endTime'] - wo['startTime']).dt.total_seconds() / 60
wo.fillna(0)

# Group by day
wo['date'] = wo['startTime'].dt.date
wo = wo.groupby(['date', 'healthCode']).agg({'active_minutes': 'sum', 'wo_calories': 'sum'}).reset_index()

print(wo)

            date                            healthCode  active_minutes  \
0     2014-06-02  8ebdc824-bd18-4290-a887-197a6b3ec4bc             0.0   
1     2014-06-15  8ebdc824-bd18-4290-a887-197a6b3ec4bc             0.0   
2     2014-06-17  8ebdc824-bd18-4290-a887-197a6b3ec4bc             0.0   
3     2014-06-22  8ebdc824-bd18-4290-a887-197a6b3ec4bc             0.0   
4     2014-06-26  8ebdc824-bd18-4290-a887-197a6b3ec4bc             0.0   
...          ...                                   ...             ...   
4317  2015-10-26  0a6eb7f8-c05d-4119-bfe0-f0e233d9f747             0.0   
4318  2015-10-26  37633071-388d-4e6e-b6fc-29e37a70a936             0.0   
4319  2015-10-26  56e4f3c3-2c3b-4f65-a598-2908a13353d0             0.0   
4320  2015-10-26  a1461e7f-9a77-411f-9698-6dd6e5bed59b             0.0   
4321  2015-10-26  a88f3758-0e43-43bd-bc6a-55a0780ec643             0.0   

      wo_calories  
0           0.000  
1           0.000  
2           0.000  
3           0.000  
4          

In [7]:
def master_merge(predictor_df, bp):
  bp['date'] = pd.to_datetime(bp['date'])
  predictor_df['date'] = predictor_df['date'].astype(bp['date'].dtypes)
  predictor_df = predictor_df.drop_duplicates(subset=['healthCode', 'date'])
  master_df = bp.merge(predictor_df, on=['healthCode', 'date'], how='left')
  return master_df

In [23]:
def get_non_zero(df):
    excluded_cols = ['healthCode', 'date', 'systolic', 'diastolic']
    selected_cols = list(set(df.columns) - set(excluded_cols))
    print('selected cols', selected_cols)
    valid_values = df[selected_cols].ne(0).sum().sum()
    print('Number of non-zero and not NaN values:', valid_values)
    valid_rows = df[selected_cols].ne(0).any(axis=1).sum()
    print('Number of rows with at least one non-zero and not NaN value:', valid_rows)
    return valid_values

In [29]:
from sklearn.impute import KNNImputer
import numpy as np

def knn_impute(predictor):
    cols = [col for col in predictor.columns if col not in ['healthCode', 'date', 'systoclic', 'diastolic']]

    # Create the imputer
    imputer = KNNImputer(n_neighbors=3)

    # Split the DataFrame by 'id', apply the imputation, and concatenate the results
    predictor_imputed = pd.concat(
        (pd.DataFrame(imputer.fit_transform(sub_df[cols]), columns=cols) 
         for id, sub_df in predictor.groupby('healthCode')),
        ignore_index=True
    )
    # Convert the result back to a DataFrame (if necessary)
    predictor_imputed = pd.DataFrame(predictor_imputed, columns=predictor.columns)

    return predictor_imputed

In [102]:
from sklearn.neighbors import NearestNeighbors

from sklearn.neighbors import NearestNeighbors

from sklearn.neighbors import NearestNeighbors

def get_neighbors(df, index, k=3):
     # Assuming 'df' is your DataFrame and you want to find the neighbors based on all columns except 'healthCode'
    df.fillna(0, inplace=True)
    cols = [col for col in df.columns if col not in ['healthCode', 'date', 'systoclic', 'diastolic']]
    X = df[df['healthCode'] == df['healthCode'].iloc[index]]
    X = X[cols]
    X = X.fillna(0)
    print('entries with selected health')

    # Reset the index and keep the old index
    X.reset_index(inplace=True)

    print(X.shape)

    # Create the estimator
    nbrs = NearestNeighbors(n_neighbors=k+1, algorithm='ball_tree').fit(X[cols])

    # Get the k nearest neighbors of the specified index
    distances, indices = nbrs.kneighbors(X[X['index'] == index][cols].values.reshape(1, -1))

    # Print the indices of the neighbors
    print('ooooo', indices[0])
    print(df.iloc[X.iloc[indices[0]]['index']])

get_neighbors(predictor_df, 5400, 60)

(52837, 13)
(151, 12)
ooooo [ 33  44  31  28  52   7  42  92  36  10  15 106  46   5  17  93  56   3
  40 111  75 105 113  24  53  83  64   2  80  98  84  68  12  21  96  35
 109  71  54  60  34   4 118  25  62  50  79  29  82  76  13 112  22 104
  45 101  39  74 122  73  99]
                                healthCode       date  distance_cycling  \
5400  1e399fc2-7854-4608-b8cc-ddac6c77e005 2015-04-11               0.0   
5411  1e399fc2-7854-4608-b8cc-ddac6c77e005 2015-04-22               0.0   
5398  1e399fc2-7854-4608-b8cc-ddac6c77e005 2015-04-09               0.0   
5395  1e399fc2-7854-4608-b8cc-ddac6c77e005 2015-04-06               0.0   
5419  1e399fc2-7854-4608-b8cc-ddac6c77e005 2015-04-30               0.0   
...                                    ...        ...               ...   
5406  1e399fc2-7854-4608-b8cc-ddac6c77e005 2015-04-17               0.0   
5441  1e399fc2-7854-4608-b8cc-ddac6c77e005 2015-05-22               0.0   
5489  1e399fc2-7854-4608-b8cc-ddac6c77e005 2015-



In [76]:
from ast import Not
from augmentations import rolling_k_days

# Merge healthkit_df with sleep_df
print('hk shape', hk.shape)
print('sleep shape', sleep.shape)
predictor_df = hk.merge(sleep, on=['healthCode', 'date'], how='outer')
print('hk-sleep merged shape', predictor_df.shape)
get_non_zero(predictor_df)
print('--------------------------------------------------------------------------------')


# Merge workout_df with merged (healthkit_df, sleep_df) --> predictor_df
print('hk-sleep shape', predictor_df.shape)
print('wo shape', wo.shape)
predictor_df = predictor_df.merge(wo, on=['healthCode', 'date'], how='outer')
# print number of unique date and user combinations in predictor
print('hk-sleep-wo shape', predictor_df.shape)
get_non_zero(predictor_df)
print('--------------------------------------------------------------------------------')


# MASTER MERGE - Merge with bp_df
print('MASTER MERGE - baseline')
baseline = master_merge(predictor_df, bp)
baseline.fillna(0, inplace=True)
print('baseline (hk-sleep-wo-bp merge)', baseline.shape, '\n')
get_non_zero(baseline)
print('--------------------------------------------------------------------------------')

k = 10
print('AUGMENTATION - rolling k days')
augmented = rolling_k_days(baseline, k)
augmented.fillna(0, inplace=True)
print('augmented', augmented.shape)
print('augmented NON-ZERO values')
get_non_zero(augmented)
print('--------------------------------------------------------------------------------')

print('AUGMENTATION - knn impute')
imputed = knn_impute(augmented)
imputed.fillna(0, inplace=True)
print('imputed', imputed.shape)
print('imputed NON-ZERO values')
get_non_zero(imputed)




# ADD HISTORICAL BP - Add BP historical variable to master df (1 period, 2 periods, 3 periods)
'''
Here is where we list the historical BP values for each row in master_df:
select a period of k days. If k is more than 1, aggregate within the period via averaging
then a number of periods from [1, 2, 3]
periods should not overlap with each other
'''



# RUN RF - Implement RF on master_df with: 1) no agumentation, 2) k-roll agumentation, 3) knn_augmentation
'''
Implement one RF first
'''


'''
Implement same RF with bootstraping
'''


'''
Extract feature importances and select top N most important features
'''


'''
Run second RF with top N most important features
'''




hk shape (47327, 8)
sleep shape (8255, 5)
hk-sleep merged shape (51955, 11)
selected cols ['heart_rate', 'distance_cycling', 'floors', 'steps', 'bed_time', 'active_calories', 'sleep_minutes', 'distance_walking', 'awake_count']
Number of non-zero and not NaN values: 295697
Number of rows with at least one non-zero and not NaN value: 51955
--------------------------------------------------------------------------------
hk-sleep shape (51955, 11)
wo shape (4322, 4)
hk-sleep-wo shape (52837, 13)
selected cols ['wo_calories', 'heart_rate', 'distance_cycling', 'floors', 'steps', 'active_minutes', 'bed_time', 'active_calories', 'sleep_minutes', 'distance_walking', 'awake_count']
Number of non-zero and not NaN values: 404567
Number of rows with at least one non-zero and not NaN value: 52837
--------------------------------------------------------------------------------
MASTER MERGE - baseline
baseline (hk-sleep-wo-bp merge) (10509, 15) 

selected cols ['wo_calories', 'heart_rate', 'distance_c

'\nRun second RF with top N most important features\n'

**Preparing data for training**

In [19]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error

y_columns = ['diastolic', 'systolic']

def data_split(df):
  train, test = train_test_split(df, test_size=0.2)
  y_train = train[y_columns]
  y_test = test[y_columns]

  x_train = train.drop(y_columns, axis=1)
  x_test = test.drop(y_columns, axis=1)

  return (x_train, y_train), (x_test, y_test)

In [20]:
# RUN RF - Implement RF on master_df with: 1) no agumentation, 2) k-roll agumentation, 3) knn_augmentation
'''
Implement one RF first with no augmentation
'''
print(baseline.columns)
angel_baseline_df = baseline.drop(['healthCode', 'date'], axis=1)
angel_baseline_df = angel_baseline_df.dropna()
print('Shape of baseline df with no NaN values: ' , angel_baseline_df.shape)
(x_train, y_train), (x_test, y_test) = data_split(angel_baseline_df)

def rfr_predict(target_col, ntrees=30):
  model = RandomForestRegressor(n_estimators=ntrees)
  model = model.fit(x_train, y_train[target_col])
  pred = model.predict(x_test).round()
  mse = mean_squared_error(pred,y_test[target_col])
  mae = mean_absolute_error(pred,y_test[target_col])
  return (mse, mae)

#Predicting systolic BP using baseline df with non NaN values
mse, mae = rfr_predict('systolic', ntrees=60)
print("\nPerformance scores of systolic prediction")
print('Mean squared error = {:5.3f}'.format(mse))
print('Mean absolute error =  {:5.3f}'.format(mae))

#Predicting diastolic BP using baseline df with non NaN values
mse, mae = rfr_predict('diastolic')
print("\nPerformance scores of diastolic prediction")
print('Mean squared error = {:5.3f}'.format(mse))
print('Mean absolute error =  {:5.3f}'.format(mae))


Index(['healthCode', 'date', 'diastolic', 'systolic', 'distance_cycling',
       'distance_walking', 'floors', 'active_calories', 'steps', 'heart_rate',
       'sleep_minutes', 'awake_count', 'bed_time', 'active_minutes',
       'wo_calories'],
      dtype='object')
Shape of baseline df with no NaN values:  (10509, 13)

Performance scores of systolic prediction
Mean squared error = 191.376
Mean absolute error =  9.918

Performance scores of diastolic prediction
Mean squared error = 124.766
Mean absolute error =  8.049
