**Load data**

<a href="https://colab.research.google.com/github/Blistt/bp-recommender/blob/Romasa/BP_Recommendation_system.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
path = 'data/'
baseline = pd.read_csv(path + 'baseline.csv')
augmented_k = pd.read_csv(path + 'augmented_k.csv')
augmented_inter = pd.read_csv(path + 'augmented_inter.csv')
augmented_intra = pd.read_csv(path + 'augmented_intra.csv')

In [2]:
from bp_predictor import BloodPresurePredictor
from utils import log_exp

def experiment(dataset, model, ntrees, N, key, target, double_run=False, log_path='', bootstrap_iter=0, bootstrap_size=0.8, aug='None'):
    # First run with all features (either bootstrapped or not)
    bp_predictor = BloodPresurePredictor(model, ntrees)
    if bootstrap_iter > 0:
        bp_predictor.bootstrap(dataset, bootstrap_iter, bootstrap_size)
    else:
        bp_predictor.predict(dataset)
    log_exp(log_path, bp_predictor, aug='None', N=N, double=False, bootstrap=(bootstrap_iter>0))     # log results
    
    # Second run with top N features
    top_n = list(bp_predictor.feature_importances.keys())[:N]       # get top N features from dict 
    cols = key + top_n + target                                     # add key and target columns
    bp_predictor.predict(dataset[cols])                             # predict with top N features
    log_exp(log_path, bp_predictor, aug='None', N=N, double=True, bootstrap=(bootstrap_iter>0))   # log results

In [7]:
from bp_predictor import BloodPresurePredictor
from utils import log_exp, get_unique_healthCodes

def personalised_experiment(dataset, model, ntrees, N, key, target, double_run=False, log_path='', bootstrap_iter=0, bootstrap_size=0.8, aug='None'):
    # First run with all features (either bootstrapped or not)

    all_healthCodes = get_unique_healthCodes(dataset)
    count=1
    for indv in all_healthCodes:
        print(f'healthCode {indv} ({count} of total {len(all_healthCodes)}) ------------------')
        for m in model:
            ds = dataset[dataset['healthCode'] == indv]
            bp_predictor = BloodPresurePredictor(m, ntrees)
            bp_predictor.predict(ds)
            log_exp(log_path, bp_predictor, aug='None', N=N, double=False, bootstrap=(bootstrap_iter>0))     # log results
        
            top_n = list(bp_predictor.feature_importances.keys())[:N]       # get top N features from dict 
            cols = key + top_n + target                                     # add key and target columns
            bp_predictor.predict(dataset[cols])                             # predict with top N features
            log_exp(log_path, bp_predictor, aug='None', N=N, double=True, bootstrap=(bootstrap_iter>0))   # log results
        count += 1   
        print() 

**Get Top N features as well as error measures with XGB model** - without Bootstraping

In [4]:
############################################# PARAMETERS #############################################
N = 5                               # Number of most important features to display
model = 'xgb'                        # rf or xgb (Random Forest or XGBoost)
ntrees = 60                         # Number of trees in the forest
double_run = False                  # Whether to use a second run with top N features or not
bootstrap = False                   # Whether to use bootstrap samples
bootstrap_iter = 100               # Number of bootstrap iterations
bootstrap_size = 0.8                # Portion of the dataset to sample for bootstrap
key = ['healthCode', 'date']        # Columns to use as key
target = ['systolic', 'diastolic']  # Columns to predict
log_path = 'exp_log.csv'            # Path of file to log experiment results


############################################# EXPERIMENTS #############################################
# Predicting systolic BP using baseline with non NaN values
print('BASELINE - NO AUGMENTATION')
dataset = baseline
aug = 'None'
experiment(dataset, model, ntrees, N, key, target, double_run=double_run, log_path=log_path, 
           bootstrap_iter=bootstrap_iter, bootstrap_size=bootstrap_size, aug=aug)
print('--------------------------------------------------------------------------------')

# Predicting systolic BP using k-roll augmentation with non NaN values
print('K-ROLL AUGMENTATION')
dataset = augmented_k
aug = 'K-roll'
experiment(dataset, model, ntrees, N, key, target, double_run=double_run, log_path=log_path, 
           bootstrap_iter=bootstrap_iter, bootstrap_size=bootstrap_size, aug=aug)
print('--------------------------------------------------------------------------------')


# Predicting systolic BP using knn intra augmentation with non NaN values
print('KNN INTRA AUGMENTATION')
dataset = augmented_intra
aug = 'KNN-intra'
experiment(dataset, model, ntrees, N, key, target, double_run=double_run, log_path=log_path, 
           bootstrap_iter=bootstrap_iter, bootstrap_size=bootstrap_size, aug=aug)
print('--------------------------------------------------------------------------------')


# Predicting systolic BP using knn inter augmentation with non NaN values
print('KNN INTER AUGMENTATION')
dataset = augmented_inter
aug = 'KNN-inter'
experiment(dataset, model, ntrees, N, key, target, double_run=double_run, log_path=log_path, 
           bootstrap_iter=bootstrap_iter, bootstrap_size=bootstrap_size, aug=aug)
print('--------------------------------------------------------------------------------')


BASELINE - NO AUGMENTATION
dataset size: 21, model: xgb, ntrees: 60, sys_mae: 6.529, dias_mae: 4.507, top_n: floors; distance_cycling; sleep_minutes; distance_walking; active_calories, double run: False, bootstrap: True
dataset size: 86, model: xgb, ntrees: 60, sys_mae: 8.333, dias_mae: 4.5, top_n: N/A, double run: True, bootstrap: True
--------------------------------------------------------------------------------
K-ROLL AUGMENTATION
dataset size: 33, model: xgb, ntrees: 60, sys_mae: 6.051, dias_mae: 3.935, top_n: active_calories; steps; bed_time; floors; wo_calories, double run: False, bootstrap: True
dataset size: 40, model: xgb, ntrees: 60, sys_mae: 9.625, dias_mae: 5.375, top_n: N/A, double run: True, bootstrap: True
--------------------------------------------------------------------------------
KNN INTRA AUGMENTATION
dataset size: 127, model: xgb, ntrees: 60, sys_mae: 4.849, dias_mae: 3.956, top_n: steps; floors; wo_calories; active_calories; distance_cycling, double run: False

**Personalised BP Prediction**

In [9]:
############################################# PARAMETERS #############################################
N = 5                               # Number of most important features to display
model = ['rf', 'xgb']                        # rf or xgb (Random Forest or XGBoost)
ntrees = 60                         # Number of trees in the forest
double_run = False                  # Whether to use a second run with top N features or not
bootstrap = False                   # Whether to use bootstrap samples
bootstrap_iter = 100               # Number of bootstrap iterations
bootstrap_size = 0.8                # Portion of the dataset to sample for bootstrap
key = ['healthCode', 'date']        # Columns to use as key
target = ['systolic', 'diastolic']  # Columns to predict
log_path = 'exp_log.csv'            # Path of file to log experiment results


############################################# EXPERIMENTS #############################################
# Predicting BP using baseline with non NaN values
print('BASELINE - NO AUGMENTATION')
aug = 'None'

personalised_experiment(baseline, model, ntrees, N, key, target, double_run=False, log_path=log_path, aug=aug)
print('--------------------------------------------------------------------------------')

# Predicting BP using k-roll augmentation with non NaN values
print('K-ROLL AUGMENTATION')
dataset = augmented_k
aug = 'K-roll'
personalised_experiment(dataset, model, ntrees, N, key, target, double_run=double_run, log_path=log_path, 
           bootstrap_iter=bootstrap_iter, bootstrap_size=bootstrap_size, aug=aug)
print('--------------------------------------------------------------------------------')


# Predicting BP using knn intra augmentation with non NaN values
print('KNN INTRA AUGMENTATION')
dataset = augmented_intra
aug = 'KNN-intra'
personalised_experiment(dataset, model, ntrees, N, key, target, double_run=double_run, log_path=log_path, aug=aug)
print('--------------------------------------------------------------------------------')


# Predicting BP using knn inter augmentation with non NaN values
print('KNN INTER AUGMENTATION')
dataset = augmented_inter
aug = 'KNN-inter'
personalised_experiment(dataset, model, ntrees, N, key, target, double_run=double_run, log_path=log_path, 
           bootstrap_iter=bootstrap_iter, bootstrap_size=bootstrap_size, aug=aug)
print('--------------------------------------------------------------------------------')


BASELINE - NO AUGMENTATION
healthCode a88f3758-0e43-43bd-bc6a-55a0780ec643 (1 of total 1) ------------------
dataset size: 3, model: rf, ntrees: 60, sys_mae: 6.0, dias_mae: 4.0, top_n: floors; wo_calories; active_minutes; distance_walking; active_calories, double run: False, bootstrap: False
dataset size: 96, model: rf, ntrees: 60, sys_mae: 9.35, dias_mae: 7.55, top_n: N/A, double run: True, bootstrap: False
dataset size: 3, model: xgb, ntrees: 60, sys_mae: 21.0, dias_mae: 5.0, top_n: distance_cycling; floors; wo_calories; active_minutes; distance_walking, double run: False, bootstrap: False
dataset size: 96, model: xgb, ntrees: 60, sys_mae: 10.45, dias_mae: 7.8, top_n: N/A, double run: True, bootstrap: False

--------------------------------------------------------------------------------
K-ROLL AUGMENTATION
healthCode a88f3758-0e43-43bd-bc6a-55a0780ec643 (1 of total 1) ------------------
dataset size: 4, model: rf, ntrees: 60, sys_mae: 19.0, dias_mae: 5.0, top_n: floors; steps; wo_ca

In [7]:
# OPTIONAL - ADD HISTORICAL BP - Add BP historical variable to master df (1 period, 2 periods, 3 periods)
'''
Here is where we list the historical BP values for each row in master_df:
select a period of k days. If k is more than 1, aggregate within the period via averaging
then a number of periods from [1, 2, 3]
periods should not overlap with each other

from augmentations import historical_BP

historical = historical_BP(baseline, 3)
historical
'''

TypeError: can only concatenate str (not "int") to str