**Load data**

<a href="https://colab.research.google.com/github/Blistt/bp-recommender/blob/Romasa/BP_Recommendation_system.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
path = 'data/'
baseline = pd.read_csv(path + 'baseline.csv')
augmented_k = pd.read_csv(path + 'augmented_k.csv')
augmented_inter = pd.read_csv(path + 'augmented_inter.csv')
augmented_intra = pd.read_csv(path + 'augmented_intra.csv')

In [2]:
from bp_predictor import BloodPresurePredictor
from utils import log_exp
from utils import data_split

def experiment(dataset, model, ntrees, N, key, target, log_path='', bootstrap=False, 
               bootstrap_size=0.8, aug='None'):
    # Split dataset into train and test sets of features and labels
    (x_train, y_train), (x_test, y_test) = data_split(dataset, y_columns=target, key_cols=key)
    x_train = x_train.drop(key, axis=1)
    x_test = x_test.drop(key, axis=1)

    # First run with all features (either bootstrapped or not)
    bp_predictor = BloodPresurePredictor(model, ntrees)
    bp_predictor.fit(x_train, y_train, bootstrap, bootstrap_size)
    # Evaluate the model
    bp_predictor.evaluate(x_test, y_test)
    
    # log results
    log_exp(log_path, bp_predictor, aug=aug, N=N, second_run=False, bootstrap=bootstrap, test_size=x_test.shape)     
    
    # Second run with top N features
    top_n = list(bp_predictor.feature_importances.keys())[:N]        # get top N features from dict 
    x_train = x_train[top_n]                                         # select top N features
    bp_predictor.fit(x_train[top_n], y_train)                        # predict with top N features
    bp_predictor.evaluate(x_test[top_n], y_test)                     # evaluate with top N features
    # log results
    log_exp(log_path, bp_predictor, aug=aug, N=N, second_run=True, bootstrap=bootstrap, test_size=x_test.shape)

In [44]:
from bp_predictor import BloodPresurePredictor
from utils import log_exp, get_unique_healthCodes, average_dicts
from collections import defaultdict

def personalized_experiment(dataset, model, ntrees, N, key, target, log_path='', bootstrap=False, 
               bootstrap_size=0.8, aug='None', second_run=False, save_path=None):
    
    # Split dataset into train and test sets of features and labels
    (x_train, y_train), (x_test, y_test) = data_split(dataset, y_columns=target, key_cols=key)
    x_train_keys = x_train[key]
    x_test_keys = x_test[key]
    x_train = x_train.drop(key, axis=1)
    x_test = x_test.drop(key, axis=1)

    # First run with all features (either bootstrapped or not)
    bp_predictor = BloodPresurePredictor(model, ntrees)
    bp_predictor.fit(x_train, y_train, bootstrap, bootstrap_size)

    # Get all unique healthCodes
    all_users = get_unique_healthCodes(dataset)

    # Initialize lists to store metrics results
    mae = defaultdict(list)
    mse = defaultdict(list)
    temp_feature_importances = []
    # Personalize the model for each user
    for user in all_users:
        tr_mask = x_train_keys.iloc[:, 0] == user
        test_mask = x_test_keys.iloc[:, 0] == user
        x_train_user, y_train_user = x_train[tr_mask], y_train[tr_mask]

        x_test_user, y_test_user = x_test[test_mask], y_test[test_mask]

        # Skips if there are no samples for the user
        if x_train_user.shape[0] < 1 or x_test_user.shape[0] < 1:
            continue

        else:
            bp_predictor.fine_tune(x_train_user, y_train_user)  # Fit the personalized model
            bp_predictor.evaluate(x_test_user, y_test_user, fine_tuned=True)     # Evaluate the personalized model
            # Performs second run with top N features if specified
            if second_run:
                top_n = list(bp_predictor.feature_importances.keys())[:N]
                bp_predictor.fine_tune(x_train_user[top_n], y_train_user)
                bp_predictor.evaluate(x_test_user[top_n], y_test_user, fine_tuned=True)
            for bp_type in target:
                mae[bp_type].append(bp_predictor.mae[bp_type])
                mse[bp_type].append(bp_predictor.mse[bp_type])
            temp_feature_importances.append(bp_predictor.feature_importances)

            # Saves the model for the user
            if save_path:
                for bp_type in target:
                    bp_predictor.ftmodel[bp_type].save_model(f'{save_path}/{user}_{bp_type}.json')

    if len(mae) == 0:
        print('No testing samples')
        return
    
    # Average metrics for all users
    for bp_type in target:
        bp_predictor.mae[bp_type] = sum(mae[bp_type]) / len(mae[bp_type])
        bp_predictor.mse[bp_type] = sum(mse[bp_type]) / len(mse[bp_type])
    bp_predictor.feature_importances = average_dicts(temp_feature_importances)

    # log results
    log_exp(log_path, bp_predictor, aug=aug, N=N, second_run=second_run, bootstrap=bootstrap, 
            test_size=x_test.shape, personalized=True)
   
        

**Experiments with non-personalized model**

In [45]:
############################################# PARAMETERS #############################################
N = 5                               # Number of most important features to display
model = 'xgb'                       # rf or xgb (Random Forest or XGBoost)
ntrees = 60                         # Number of trees in the forest
double_run = False                  # Whether to use a second run with top N features or not
bootstrap = True                    # Whether to use bootstrap samples
bootstrap_size = 0.8                # Portion of the dataset to sample for bootstrap
key = ['healthCode', 'date']        # Columns to use as key
target = ['systolic', 'diastolic']  # Columns to predict
log_path = 'exp_log.csv'            # Path of file to log experiment results


############################################# EXPERIMENTS #############################################
# Predicting systolic BP using baseline with non NaN values
print('BASELINE - NO AUGMENTATION')
dataset = baseline
aug = 'None'
experiment(dataset, model, ntrees, N, key, target, log_path=log_path, 
           bootstrap=bootstrap, bootstrap_size=bootstrap_size, aug=aug)
print('--------------------------------------------------------------------------------')

# Predicting systolic BP using k-roll augmentation with non NaN values
print('K-ROLL AUGMENTATION')
dataset = augmented_k
aug = 'K-roll'
experiment(dataset, model, ntrees, N, key, target, log_path=log_path, 
           bootstrap=bootstrap, bootstrap_size=bootstrap_size, aug=aug)
print('--------------------------------------------------------------------------------')


# Predicting systolic BP using knn intra augmentation with non NaN values
print('KNN INTRA AUGMENTATION')
dataset = augmented_intra
aug = 'KNN-intra'
experiment(dataset, model, ntrees, N, key, target, log_path=log_path, 
           bootstrap=bootstrap, bootstrap_size=bootstrap_size, aug=aug)
print('--------------------------------------------------------------------------------')


# Predicting systolic BP using knn inter augmentation with non NaN values
print('KNN INTER AUGMENTATION')
dataset = augmented_inter
aug = 'KNN-inter'
experiment(dataset, model, ntrees, N, key, target, log_path=log_path, 
           bootstrap=bootstrap, bootstrap_size=bootstrap_size, aug=aug)
print('--------------------------------------------------------------------------------')


BASELINE - NO AUGMENTATION
dataset size: (4, 11), model: xgb, ntrees: 60, sys_mae: 10.25,
           dias_mae: 10.25, top_n: wo_calories; steps; distance_walking; bed_time; sleep_minutes, second run: False, bootstrap: True
dataset size: (4, 11), model: xgb, ntrees: 60, sys_mae: 18.25,
           dias_mae: 11.75, top_n: N/A, second run: True, bootstrap: True
--------------------------------------------------------------------------------
K-ROLL AUGMENTATION
dataset size: (8, 11), model: xgb, ntrees: 60, sys_mae: 9.75,
           dias_mae: 6.75, top_n: wo_calories; awake_count; sleep_minutes; distance_cycling; floors, second run: False, bootstrap: True
dataset size: (8, 11), model: xgb, ntrees: 60, sys_mae: 8.5,
           dias_mae: 7.625, top_n: N/A, second run: True, bootstrap: True
--------------------------------------------------------------------------------
KNN INTRA AUGMENTATION
dataset size: (31, 11), model: xgb, ntrees: 60, sys_mae: 9.968,
           dias_mae: 9.548, top_n: awa

**Experiments with personalized model**

In [46]:
############################################# PARAMETERS #############################################
N = 5                               # Number of most important features to display
ntrees = 60                         # Number of trees in the forest
second_run = False                  # Whether to use a second run with top N features or not
bootstrap = False                   # Whether to use bootstrap samples
bootstrap_size = 0.8                # Portion of the dataset to sample for bootstrap
key = ['healthCode', 'date']        # Columns to use as key
target = ['systolic', 'diastolic']  # Columns to predict
log_path = 'exp_log.csv'            # Path of file to log experiment results
save_path = 'model_states'          # Path to save personalized models


############################################# EXPERIMENTS #############################################
# Predicting BP using baseline with non NaN values
print('BASELINE - NO AUGMENTATION')
aug = 'None'
dataset = baseline
personalized_experiment(dataset, 'xgb', ntrees, N, key, target, log_path=log_path, bootstrap=bootstrap, 
                        bootstrap_size=bootstrap_size, aug=aug, second_run=second_run, save_path=save_path)
print('--------------------------------------------------------------------------------')

# Predicting BP using k-roll augmentation with non NaN values
print('K-ROLL AUGMENTATION')
dataset = augmented_k
aug = 'K-roll'
personalized_experiment(dataset, 'xgb', ntrees, N, key, target, log_path=log_path, bootstrap=bootstrap, 
                        bootstrap_size=bootstrap_size, aug=aug, second_run=second_run, save_path=save_path)
print('--------------------------------------------------------------------------------')


# Predicting BP using knn intra augmentation with non NaN values
print('KNN INTRA AUGMENTATION')
dataset = augmented_intra
aug = 'KNN-intra'
personalized_experiment(dataset, 'xgb', ntrees, N, key, target, log_path=log_path, bootstrap=bootstrap, 
                        bootstrap_size=bootstrap_size, aug=aug, second_run=second_run, save_path=save_path)
print('--------------------------------------------------------------------------------')


# Predicting BP using knn inter augmentation with non NaN values
print('KNN INTER AUGMENTATION')
dataset = augmented_inter
aug = 'KNN-inter'
personalized_experiment(dataset, 'xgb', ntrees, N, key, target, log_path=log_path, bootstrap=bootstrap, 
                        bootstrap_size=bootstrap_size, aug=aug, second_run=second_run, save_path=save_path)
print('--------------------------------------------------------------------------------')


BASELINE - NO AUGMENTATION


AttributeError: 'BloodPresurePredictor' object has no attribute 'save_model'

In [None]:
def historical_BP(predictor, k):
    data = predictor.copy()
    #Sorting the data by healthCode and date
    predictor.sort_values(by=['healthCode', 'date'], inplace=True)

    for col in ['systolic', 'diastolic']:
        data[col+'_'+str(k)] = predictor.groupby('healthCode')[col].transform(lambda x: x.rolling(window=k, min_periods=1).mean())

    return historical_BP

In [None]:
#ADD HISTORICAL BP


# from augmentations import historical_BP

historical = historical_BP(baseline, 3)