# Multi-Fidelity Deep Gaussian process benchmark

This notebook replicates the benchmark experiments from the paper:

[Deep Gaussian Processes for Multi-fidelity Modeling (Kurt Cutajar, Mark Pullin, Andreas Damianou, Neil Lawrence, Javier González)](https://arxiv.org/abs/1903.07320)

Note that the code for the "Deep Multi-fidelity Gaussian process" is not publicly available.

In [None]:
from prettytable import PrettyTable
import numpy as np
import scipy.stats
from sklearn.metrics import mean_squared_error, r2_score
import emukit.examples.multi_fidelity_dgp

from emukit.examples.multi_fidelity_dgp.baseline_model_wrappers import Ar1Model, Nargp, HfGpOnly

from emukit.core import ContinuousParameter, ParameterSpace
from emukit.experimental_design import LatinDesign, RandomDesign
from emukit.multi_fidelity.models import MultiFidelityDeepGP

from emukit.test_functions.multi_fidelity import (multi_fidelity_borehole_function, multi_fidelity_branin_function,
                                                  multi_fidelity_park_function, multi_fidelity_hartmann_3d,
                                                  multi_fidelity_currin_function)

# Parameters for different benchmark functions

In [None]:
# fcn_names = ['currin', 'borehole', 'park', 'branin', 'hartmann-3d']

fcn_name = 'currin'

# Parameters for different benchmark functions
y_scale_dict = {'borehole': 100, 'branin': 1, 'currin': 1, 'park': 1, 'hartmann-3d': 1}

noise_level_dict = {'borehole': [0.05, 0.1], 'branin': [0, 0, 0], 'currin': [0, 0], 
                    'park': [0, 0], 'hartmann-3d': [0, 0, 0]}

do_x_scaling_dict = {'borehole': True, 'branin': False, 'currin': False, 'park': False, 'hartmann-3d': False}

num_data_dict = {'borehole': [60, 5], 'branin': [80, 30, 10], 'currin': [12, 5], 
                    'park': [30, 5], 'hartmann-3d': [80, 40, 20]}



fcns = {'borehole': multi_fidelity_borehole_function, 'branin': multi_fidelity_branin_function,
       'park': multi_fidelity_park_function, 'currin': multi_fidelity_currin_function, 
       'hartmann-3d': multi_fidelity_hartmann_3d}

In [None]:
# Function to repeat test across different random seeds.

def do_benchmark(fcn_name):
    metrics = dict()
    fcn, space, n_data = get_fcn_params(fcn_name)

    # Some random seeds to use
    seeds = [123, 184, 202, 289, 732]

    for i, seed in enumerate(seeds):
        run_name = str(seed) + str(n_data)
        metrics[run_name] = test_function(fcn, space, n_data, seed)
        print('After ' + str(i+1) + ' runs of ' + fcn_name)
        print_metrics(metrics)

    return metrics

In [None]:
# Print metrics as table 
def print_metrics(metrics):
    model_names = list(list(metrics.values())[0].keys())
    metric_names = ['r2', 'mnll', 'rmse']
    table = PrettyTable(['model'] + metric_names)

    for name in model_names:
        mean = []
        for metric_name in metric_names:
            mean.append(np.mean([metric[name][metric_name] for metric in metrics.values()]))
        table.add_row([name] + mean)

    print(table)

In [None]:
# Get the parameters for a given function
def get_fcn_params(fcn_name):
    handles = fcns[fcn_name]()[0].f
    space = fcns[fcn_name]()[1]
    n_data = num_data_dict[fcn_name]
    
    space._parameters = space._parameters[:-1]
    return handles, space, n_data

In [None]:
def test_function(fcn, space, n_data, seed):
    np.random.seed(seed)

    x_test, y_test, X, Y = generate_data(fcn, n_data, 1000, space)

    mf_dgp_fix_lf_mean = MultiFidelityDeepGP(X, Y, n_iter=5000)
    mf_dgp_fix_lf_mean.name = 'mf_dgp_fix_lf_mean'

    models = [HfGpOnly(X, Y), Ar1Model(X, Y), Nargp(X, Y), mf_dgp_fix_lf_mean]
    return benchmark_models(models, x_test, y_test)

In [None]:
def benchmark_models(models, x_test, y_test):
    metrics = dict()
    for model in models:
        model.optimize()
        y_mean, y_var = model.predict(x_test)
        metrics[model.name] = calculate_metrics(y_test, y_mean, y_var)
        print('+ ######################## +')
        print(model.name, 'r2', metrics[model.name]['r2'])
        print('+ ######################## + ')
    return metrics

In [None]:
def generate_data(fcn, n_data, n_test_points, space):
    """
    Generates train and test data for
    """
    
    do_x_scaling = do_x_scaling_dict[fcn_name]
    
    # Generate training data
    latin = LatinDesign(space)
    X = [latin.get_samples(n) for n in n_data]
    
    # Scale X if required
    if do_x_scaling:
        scalings = X[0].std(axis=0)
    else:
        scalings = np.ones(X[0].shape[1])
        
    for x in X:
        x /= scalings
    
    Y = []
    for i, x in enumerate(X):
        Y.append(fcn[i](x * scalings))
    
    y_scale = y_scale_dict[fcn_name]
    
    # scale y and add noise if required
    noise_levels = noise_level_dict[fcn_name]
    for y, std_noise in zip(Y, noise_levels):
        y /= y_scale + std_noise * np.random.randn(y.shape[0], 1)
    
    # Generate test data
    x_test = latin.get_samples(n_test_points)
    x_test /= scalings
    y_test = fcn[-1](x_test * scalings)
    y_test /= y_scale

    i_highest_fidelity = (len(n_data) - 1) * np.ones((x_test.shape[0], 1))
    x_test = np.concatenate([x_test, i_highest_fidelity], axis=1)
    print(X[1].shape)
    return x_test, y_test, X, Y

In [None]:
def calculate_metrics(y_test, y_mean_prediction, y_var_prediction):
    # R2
    r2 = r2_score(y_test, y_mean_prediction)
    # RMSE
    rmse = np.sqrt(mean_squared_error(y_test, y_mean_prediction))
    # Test log likelihood
    mnll = -np.sum(scipy.stats.norm.logpdf(y_test, loc=y_mean_prediction, scale=np.sqrt(y_var_prediction)))/len(y_test)
    return {'r2': r2, 'rmse': rmse, 'mnll': mnll}

In [None]:
metrics = []
metrics.append(do_benchmark(fcn_name))

In [None]:
for (metric) in zip(metrics):
    print(fcn_name)
    print_metrics(metric[0])