**Load data**

<a href="https://colab.research.google.com/github/Blistt/bp-recommender/blob/Romasa/BP_Recommendation_system.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
path = 'data/'
baseline = pd.read_csv(path + 'baseline.csv')
augmented_k = pd.read_csv(path + 'augmented_k.csv')
augmented_inter = pd.read_csv(path + 'augmented_inter.csv')
augmented_intra = pd.read_csv(path + 'augmented_intra.csv')

In [2]:
from bp_predictor import BloodPresurePredictor
from utils import log_exp, data_split, historical_BP

def experiment(dataset, model, ntrees, N, key, target, log_path='', bootstrap=False, 
               bootstrap_size=0.8, aug='None', historical=True):
    # Add historical blood pressure to the dataset if specified
    if historical:
        dataset = historical_BP(dataset, 3)

    # Split dataset into train and test sets of features and labels
    (x_train, y_train), (x_test, y_test) = data_split(dataset, y_columns=target, key_cols=key)
    x_train = x_train.drop(key, axis=1)
    x_test = x_test.drop(key, axis=1)

    # First run with all features (either bootstrapped or not)
    bp_predictor = BloodPresurePredictor(model, ntrees)
    bp_predictor.fit(x_train, y_train, bootstrap, bootstrap_size)
    # Evaluate the model
    bp_predictor.evaluate(x_test, y_test)
    
    # log results
    log_exp(log_path, bp_predictor, aug=aug, N=N, second_run=False, bootstrap=bootstrap, test_size=x_test.shape)     
    
    # Second run with top N features
    top_n = list(bp_predictor.feature_importances.keys())[:N]        # get top N features from dict 
    x_train = x_train[top_n]                                         # select top N features
    bp_predictor.fit(x_train[top_n], y_train)                        # predict with top N features
    bp_predictor.evaluate(x_test[top_n], y_test)                     # evaluate with top N features
    # log results
    log_exp(log_path, bp_predictor, aug=aug, N=N, second_run=True, bootstrap=bootstrap, test_size=x_test.shape)

In [3]:
from bp_predictor import BloodPresurePredictor
from utils import log_exp, get_unique_healthCodes, average_dicts, strat_data_split
from collections import defaultdict

def personalized_experiment(dataset, model, ntrees, N, key, target, log_path='', bootstrap=False, 
               bootstrap_size=0.8, aug='None', second_run=False, save_path=None, historical=True):
    # Add historical blood pressure to the dataset if specified
    if historical:
        dataset = historical_BP(dataset, 3)
    
    # Split dataset into train and test sets of features and labels
    (x_train, y_train), (x_test, y_test) = strat_data_split(dataset, y_columns=target, key_cols=key)
    x_train_keys = x_train[key]
    x_test_keys = x_test[key]
    x_train = x_train.drop(key, axis=1)
    x_test = x_test.drop(key, axis=1)


    # First run with all features (either bootstrapped or not)
    bp_predictor = BloodPresurePredictor(model, ntrees)
    bp_predictor.fit(x_train, y_train, bootstrap, bootstrap_size)

    # Get all unique healthCodes
    all_users = get_unique_healthCodes(dataset)

    # Initialize lists to store metrics results
    mae = defaultdict(list)
    mse = defaultdict(list)
    temp_feature_importances = []
    # Personalize the model for each user
    for user in all_users:
        tr_mask = x_train_keys.iloc[:, 0] == user
        test_mask = x_test_keys.iloc[:, 0] == user
        x_train_user, y_train_user = x_train[tr_mask], y_train[tr_mask]

        x_test_user, y_test_user = x_test[test_mask], y_test[test_mask]

        # Skips if there are no samples for the user
        if x_train_user.shape[0] < 1 or x_test_user.shape[0] < 1:
            continue

        else:
            bp_predictor.fine_tune(x_train_user, y_train_user)                   # Fit the personalized model
            bp_predictor.evaluate(x_test_user, y_test_user, fine_tuned=True)     # Evaluate the personalized model
            # Performs second run with top N features if specified
            if second_run:
                top_n = list(bp_predictor.feature_importances.keys())[:N]
                bp_predictor.fine_tune(x_train_user[top_n], y_train_user)
                bp_predictor.evaluate(x_test_user[top_n], y_test_user, fine_tuned=True)
            for bp_type in target:
                mae[bp_type].append(bp_predictor.mae[bp_type])
                mse[bp_type].append(bp_predictor.mse[bp_type])
            temp_feature_importances.append(bp_predictor.feature_importances)

            # Saves the model and the feature importances for the user
            if save_path:
                for bp_type in target:
                    bp_predictor.ftmodel[bp_type].save_model(f'{save_path}/model_states/{user}_{bp_type}.json')
                # Save dict of feature importances
                with open(f'{save_path}/feature_importances/{user}.json', 'w') as f:
                    f.write(str(bp_predictor.feature_importances))

    if len(mae) == 0:
        print('No testing samples')
        return
    
    # Average metrics for all users
    for bp_type in target:
        bp_predictor.mae[bp_type] = sum(mae[bp_type]) / len(mae[bp_type])
        bp_predictor.mse[bp_type] = sum(mse[bp_type]) / len(mse[bp_type])
    bp_predictor.feature_importances = average_dicts(temp_feature_importances)

    # log results
    log_exp(log_path, bp_predictor, aug=aug, N=N, second_run=second_run, bootstrap=bootstrap, 
            test_size=x_test.shape, personalized=True)
   
        

**Experiments with non-personalized model**

In [4]:
from utils import historical_BP

############################################# PARAMETERS #############################################
N = 5                               # Number of most important features to display
model = 'xgb'                       # rf or xgb (Random Forest or XGBoost)
ntrees = 60                         # Number of trees in the forest
double_run = False                  # Whether to use a second run with top N features or not
bootstrap = True                    # Whether to use bootstrap samples
bootstrap_size = 0.8                # Portion of the dataset to sample for bootstrap
key = ['healthCode', 'date']        # Columns to use as key
target = ['systolic', 'diastolic']  # Columns to predict
log_path = 'exp_log.csv'            # Path of file to log experiment results
historical = True                  # Whether to use historical BP or not


############################################# EXPERIMENTS #############################################
# Predicting systolic BP using baseline with non NaN values
print('BASELINE - NO AUGMENTATION')
dataset = baseline
aug = 'None'
experiment(dataset, model, ntrees, N, key, target, log_path=log_path, 
           bootstrap=bootstrap, bootstrap_size=bootstrap_size, aug=aug, historical=historical)
print('--------------------------------------------------------------------------------')

# Predicting systolic BP using k-roll augmentation with non NaN values
print('K-ROLL AUGMENTATION')
dataset = augmented_k
aug = 'K-roll'
experiment(dataset, model, ntrees, N, key, target, log_path=log_path, 
           bootstrap=bootstrap, bootstrap_size=bootstrap_size, aug=aug, historical=historical)
print('--------------------------------------------------------------------------------')


# Predicting systolic BP using knn intra augmentation with non NaN values
print('KNN INTRA AUGMENTATION')
dataset = augmented_intra
aug = 'KNN-intra'
experiment(dataset, model, ntrees, N, key, target, log_path=log_path, 
           bootstrap=bootstrap, bootstrap_size=bootstrap_size, aug=aug, historical=historical)
print('--------------------------------------------------------------------------------')


# Predicting systolic BP using knn inter augmentation with non NaN values
print('KNN INTER AUGMENTATION')
dataset = augmented_inter
aug = 'KNN-inter'
experiment(dataset, model, ntrees, N, key, target, log_path=log_path, 
           bootstrap=bootstrap, bootstrap_size=bootstrap_size, aug=aug, historical=historical)
print('--------------------------------------------------------------------------------')


BASELINE - NO AUGMENTATION
dataset size: (2102, 13), model: xgb, ntrees: 60, sys_mae: 8.928,
           dias_mae: 6.974, top_n: steps; bed_time; floors; active_calories; wo_calories, second run: False, bootstrap: True
dataset size: (2102, 13), model: xgb, ntrees: 60, sys_mae: 10.162,
           dias_mae: 8.206, top_n: active_calories; steps; wo_calories; floors; bed_time, second run: True, bootstrap: True
--------------------------------------------------------------------------------
K-ROLL AUGMENTATION
dataset size: (2102, 13), model: xgb, ntrees: 60, sys_mae: 8.968,
           dias_mae: 7.042, top_n: steps; active_calories; bed_time; floors; wo_calories, second run: False, bootstrap: True
dataset size: (2102, 13), model: xgb, ntrees: 60, sys_mae: 10.383,
           dias_mae: 8.317, top_n: steps; floors; wo_calories; active_calories; bed_time, second run: True, bootstrap: True
--------------------------------------------------------------------------------
KNN INTRA AUGMENTATION
data

**Experiments with personalized model**

In [5]:
############################################# PARAMETERS #############################################
N = 5                               # Number of most important features to display
ntrees = 60                         # Number of trees in the forest
bootstrap = True                   # Whether to use bootstrap samples
bootstrap_size = 0.8                # Portion of the dataset to sample for bootstrap
key = ['healthCode', 'date']        # Columns to use as key
target = ['systolic', 'diastolic']  # Columns to predict
log_path = 'exp_log.csv'            # Path of file to log experiment results
save_path = 'user_data'          # Path to save personalized models
historical = False                  # Whether to use historical BP or not


############################################# EXPERIMENTS #############################################
# Predicting BP using baseline with non NaN values
print('BASELINE - NO AUGMENTATION')
aug = 'None'
dataset = baseline
personalized_experiment(dataset, 'xgb', ntrees, N, key, target, log_path=log_path, bootstrap=bootstrap, 
                        bootstrap_size=bootstrap_size, aug=aug, save_path=save_path, historical=historical)
print('--------------------------------------------------------------------------------')

# Predicting BP using k-roll augmentation with non NaN values
print('K-ROLL AUGMENTATION')
dataset = augmented_k
aug = 'K-roll'
personalized_experiment(dataset, 'xgb', ntrees, N, key, target, log_path=log_path, bootstrap=bootstrap, 
                        bootstrap_size=bootstrap_size, aug=aug, save_path=save_path, historical=historical)
print('--------------------------------------------------------------------------------')


# Predicting BP using knn intra augmentation with non NaN values
print('KNN INTRA AUGMENTATION')
dataset = augmented_intra
aug = 'KNN-intra'
personalized_experiment(dataset, 'xgb', ntrees, N, key, target, log_path=log_path, bootstrap=bootstrap, 
                        bootstrap_size=bootstrap_size, aug=aug, save_path=save_path, historical=historical)
print('--------------------------------------------------------------------------------')


# Predicting BP using knn inter augmentation with non NaN values
print('KNN INTER AUGMENTATION')
dataset = augmented_inter
aug = 'KNN-inter'
personalized_experiment(dataset, 'xgb', ntrees, N, key, target, log_path=log_path, bootstrap=bootstrap, 
                        bootstrap_size=bootstrap_size, aug=aug, save_path=save_path, historical=historical)
print('--------------------------------------------------------------------------------')


BASELINE - NO AUGMENTATION
dataset size: (193, 11), model: xgb, ntrees: 60, sys_mae: 21.0,
           dias_mae: 5.0, top_n: N/A, second run: False, bootstrap: True
--------------------------------------------------------------------------------
K-ROLL AUGMENTATION
dataset size: (245, 11), model: xgb, ntrees: 60, sys_mae: 21.0,
           dias_mae: 5.0, top_n: N/A, second run: False, bootstrap: True
--------------------------------------------------------------------------------
KNN INTRA AUGMENTATION
dataset size: (193, 11), model: xgb, ntrees: 60, sys_mae: 7.702,
           dias_mae: 7.619, top_n: N/A, second run: False, bootstrap: True
--------------------------------------------------------------------------------
KNN INTER AUGMENTATION
dataset size: (193, 11), model: xgb, ntrees: 60, sys_mae: 9.601,
           dias_mae: 6.777, top_n: N/A, second run: False, bootstrap: True
--------------------------------------------------------------------------------


In [43]:
from utils import load_user_model, load_json_as_dict
import os
from itertools import islice
import pandas as pd

def get_recommendations(entry, model_path, f_imp_path, dataset_path, key, target, n=5, var_adjust=False):
    model_files = os.listdir(model_path)
    ids = [f.split('_')[0] for f in model_files]
    id = ids[entry]     # Extract the id of the user to make recommendations for

    # Load the models for the user
    model_sys = load_user_model(f'{model_path}/{id}_systolic.json')
    model_dia = load_user_model(f'{model_path}/{id}_diastolic.json')

    # Get the feature importances for the user
    file_path = f'{f_imp_path}/{id}.json'
    feature_importances = load_json_as_dict(file_path)
    top_n = dict(islice(feature_importances.items(), n))


    # Get predictor values for one of the user's test cases
    test_dataset = pd.read_csv(f'{dataset_path}/test.csv')
    test_entry = test_dataset[test_dataset['healthCode'] == id].iloc[[0]]

    # Generate predictions for boths types of bp for the test entry
    expected_sys = 120.0
    expected_dia = 80.0
    x = test_entry.drop(key + target, axis=1)
    # print datatypes of all of x columns
    x = x.apply(pd.to_numeric, errors='coerce')

    sys_prediction = model_sys.predict(x)
    print(f'Predicted value: {sys_prediction}')
    sys_to_correct = sys_prediction - expected_sys
    dia_prediction = model_dia.predict(x)
    print(f'Predicted value: {dia_prediction}')
    dia_to_correct = dia_prediction - expected_dia

    if sys_to_correct < 0:
        sys_to_correct = 0
    if dia_to_correct < 0:
        dia_to_correct = 0

    total = expected_sys + expected_dia
    sys_w = expected_dia / total
    dia_w = expected_sys / total

    # Get weighter combination of systolic and diastolic to correct
    bp_to_correct = (sys_w * sys_to_correct + dia_w * dia_to_correct) / 2
    print('Weighted correction:', bp_to_correct)

    if bp_to_correct <= 0:
        print('No correction needed')
        return
    
    if var_adjust:
        # summ all the top n feature importances values
        var_explained = sum(top_n.values())
        pred_adjustment = bp_to_correct / var_explained
        # Adjust the top n feature importances
        for key in top_n.keys():
            top_n[key] *= pred_adjustment

    # Multiply each top n prediction value by its corresponding feature importance
    recs = {}
    for key in top_n.keys():
        recs[key] = x[key] * top_n[key]
        print(f'key: {key}  -   value: {x[key].item()}   -  f_score: {top_n[key]}  -  rec: {recs[key].item()}')

In [46]:
# Get recommendations for one of the users in the testing set
entry = 33                 # Choose from 0 to 58
n = 5                     # Number of most important features to display
model_path = 'user_data/model_states'
f_imp_path = 'user_data/feature_importances'
dataset_path = 'data/train_test'
key = ['healthCode', 'date']        # Columns to use as key
target = ['systolic', 'diastolic']  # Columns to predict
var_adjust = False

get_recommendations(entry, model_path, f_imp_path, dataset_path, key, target, n=n, var_adjust=var_adjust)

Predicted value: [115.56446]
Predicted value: [80.0675]
Weighted correction: [0.02024918]
key: steps  -   value: 5102.0   -  f_score: 0.13045477867126465  -  rec: 665.5802807807922
key: bed_time  -   value: 955.0231794263708   -  f_score: 0.12088577449321747  -  rec: 115.44871670393182
key: sleep_minutes  -   value: 321.08017579491   -  f_score: 0.11633481085300446  -  rec: 37.35280151975028
key: active_calories  -   value: 0.0   -  f_score: 0.11332559585571289  -  rec: 0.0
key: wo_calories  -   value: 243.72729600887163   -  f_score: 0.11147531121969223  -  rec: 27.169576175323016
