**Load data**

<a href="https://colab.research.google.com/github/Blistt/bp-recommender/blob/Romasa/BP_Recommendation_system.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
path = 'data/'
baseline = pd.read_csv(path + 'baseline.csv')
augmented_k = pd.read_csv(path + 'augmented_k.csv')
augmented_inter = pd.read_csv(path + 'augmented_inter.csv')
augmented_intra = pd.read_csv(path + 'augmented_intra.csv')

In [2]:
from bp_predictor import BloodPresurePredictor
from utils import log_exp

def experiment(dataset, model, ntrees, N, key, target, double_run=False, log_path='', bootstrap_iter=0, bootstrap_size=0.8, aug='None'):
    # First run with all features (either bootstrapped or not)
    bp_predictor = BloodPresurePredictor(model, ntrees)
    if bootstrap_iter > 0:
        bp_predictor.bootstrap(dataset, bootstrap_iter, bootstrap_size)
    else:
        bp_predictor.predict(dataset)
    log_exp(log_path, bp_predictor, aug='None', N=N, double=False, bootstrap=(bootstrap_iter>0))     # log results
    
    # Second run with top N features
    top_n = list(bp_predictor.feature_importances.keys())[:N]       # get top N features from dict 
    cols = key + top_n + target                                     # add key and target columns
    bp_predictor.predict(dataset[cols])                             # predict with top N features
    log_exp(log_path, bp_predictor, aug='None', N=N, double=True, bootstrap=(bootstrap_iter>0))   # log results

**Get Top N features as well as error measures with RF model** - without Bootstraping

In [3]:
# Run RF on master_df with no agumentation
from bp_predictor import BloodPresurePredictor
from utils import log_exp


############################################# PARAMETERS #############################################
N = 5                               # Number of most important features to display
model = 'rf'                        # rf or xgb (Random Forest or XGBoost)
ntrees = 60                         # Number of trees in the forest
double_run = False                  # Whether to use a second run with top N features or not
bootstrap = False                   # Whether to use bootstrap samples
bootstrap_iter = 0                # Number of bootstrap iterations
bootstrap_size = 0.8                # Size of bootstrap samples
key = ['healthCode', 'date']        # Columns to use as key
target = ['systolic', 'diastolic']  # Columns to predict
log_path = 'exp_log.csv'            # Path of file to log experiment results


############################################# EXPERIMENTS #############################################
# Predicting systolic BP using baseline with non NaN values
print('BASELINE - NO AUGMENTATION')
dataset = baseline
aug = 'None'
experiment(dataset, model, ntrees, N, key, target, double_run=double_run, log_path=log_path, 
           bootstrap_iter=bootstrap_iter, bootstrap_size=bootstrap_size, aug=aug)
print('--------------------------------------------------------------------------------')

# Predicting systolic BP using k-roll augmentation with non NaN values
print('K-ROLL AUGMENTATION')
dataset = augmented_k
aug = 'None'
experiment(dataset, model, ntrees, N, key, target, double_run=double_run, log_path=log_path, 
           bootstrap_iter=bootstrap_iter, bootstrap_size=bootstrap_size, aug=aug)
print('--------------------------------------------------------------------------------')


# Predicting systolic BP using knn intra augmentation with non NaN values
print('KNN INTRA AUGMENTATION')
dataset = augmented_intra
aug = 'None'
experiment(dataset, model, ntrees, N, key, target, double_run=double_run, log_path=log_path, 
           bootstrap_iter=bootstrap_iter, bootstrap_size=bootstrap_size, aug=aug)
print('--------------------------------------------------------------------------------')


# Predicting systolic BP using knn inter augmentation with non NaN values
print('KNN INTER AUGMENTATION')
dataset = augmented_inter
aug = 'None'
experiment(dataset, model, ntrees, N, key, target, double_run=double_run, log_path=log_path, 
           bootstrap_iter=bootstrap_iter, bootstrap_size=bootstrap_size, aug=aug)
print('--------------------------------------------------------------------------------')

BASELINE - NO AUGMENTATION
aug:, None, dataset size: 17, model: rf, ntrees: 60, sys_mae: 7.0, dias_mae: 8.25, 
          top_n: floors,distance_walking,bed_time,active_calories,wo_calories, double run: False, bootstrap: False
aug:, None, dataset size: 17, model: rf, ntrees: 60, sys_mae: 13.75, dias_mae: 13.25, 
          top_n: distance_walking,wo_calories,active_calories,bed_time,floors, double run: True, bootstrap: False
--------------------------------------------------------------------------------
K-ROLL AUGMENTATION
aug:, None, dataset size: 40, model: rf, ntrees: 60, sys_mae: 11.75, dias_mae: 8.25, 
          top_n: sleep_minutes,distance_walking,active_calories,wo_calories,floors, double run: False, bootstrap: False
aug:, None, dataset size: 40, model: rf, ntrees: 60, sys_mae: 7.625, dias_mae: 5.375, 
          top_n: wo_calories,sleep_minutes,floors,active_calories,distance_walking, double run: True, bootstrap: False
------------------------------------------------------------

In [4]:
# OPTIONAL - ADD HISTORICAL BP - Add BP historical variable to master df (1 period, 2 periods, 3 periods)
'''
Here is where we list the historical BP values for each row in master_df:
select a period of k days. If k is more than 1, aggregate within the period via averaging
then a number of periods from [1, 2, 3]
periods should not overlap with each other
'''

'\nHere is where we list the historical BP values for each row in master_df:\nselect a period of k days. If k is more than 1, aggregate within the period via averaging\nthen a number of periods from [1, 2, 3]\nperiods should not overlap with each other\n'