# MFDGP model for malaria data

This notebook fits the MF-DGP model to malaria data. We assume the low fidelity is data from the year 2005 and high fidelity is data from 2015. 

The inputs to the model are the latitude/longitude points and the output is the infection rate of malaria among children, which varies between 0 and 1. 

Where the data is < 0, that means we have no data so these points are discarded. 

We use all the data from the low fidelity, and take a small subsample from the high fidelity then test against the remaining high fidelity data.

In [1]:
import numpy as np
import matplotlib.pyplot as plt

from emukit.multi_fidelity.models.multi_fidelity_deep_gp import DGP_Base, MultiFidelityDeepGP
from gpflow.kernels import RBF, White, Linear
from gpflow.likelihoods import Gaussian
from gpflow.actions import Loop, Action
from gpflow.mean_functions import Zero
from gpflow.training import AdamOptimizer
import gpflow.training.monitor as mon

In [2]:
def make_dgpMF_model(X, Y, Z, minibatch_size=1000):
    """Build a multi-fidelity deep GP with a Gaussian likelihood.

    Args:
        X: per-fidelity inputs, passed straight through to ``MultiFidelityDeepGP``.
        Y: per-fidelity targets, passed straight through.
        Z: per-fidelity inducing point locations, passed straight through.
        minibatch_size: minibatch size forwarded to the model. Defaults to 1000,
            the value that was previously hard-coded here.

    Returns:
        The constructed ``MultiFidelityDeepGP`` instance.
    """
    return MultiFidelityDeepGP(X, Y, Z, Gaussian(), minibatch_size=minibatch_size)

# Load data

To download data go to [https://map.ox.ac.uk/explorer/#/](https://map.ox.ac.uk/explorer/#/) and select the layer `Plasmodium falciparum parasite rate in 2-10 year olds in Africa` and click download. Select the zip file option. You should then have a zip file called `2015_Nature_Africa_PR.2000.zip`.

Unzip the folder and enter the folder location below.

In [None]:
import os
import georaster
import pandas as pd

def get_map_as_df(path):
    """Read a single-band raster file into a flat table with one row per pixel.

    Args:
        path: location of the raster (.tif) file to open.

    Returns:
        pandas.DataFrame with columns ``latitude``, ``longitude`` and ``value``,
        built from the flattened coordinate grids and the flattened first band.
    """
    # load_data=False: the band is read explicitly below via read_single_band
    image = georaster.SingleBandRaster(path, load_data=False)
    # Index 0 of coordinates() is used as longitude and index 1 as latitude
    longitude, latitude = image.coordinates()
    values = image.read_single_band(1)
    return pd.DataFrame(
        data=np.stack([latitude.flatten(), longitude.flatten(), values.flatten()], axis=1),
        columns=['latitude', 'longitude', 'value'])


# os.path.join does not expand '~', so resolve the home directory explicitly
data_folder_location = os.path.expanduser('~/Downloads/2015_Nature_Africa_PR.2000/')

name_lf = '2015_Nature_Africa_PR.2005.tif'
name_hf = '2015_Nature_Africa_PR.2015.tif'

# The loader is defined above its first use so the cell runs on a fresh kernel
lf_data = get_map_as_df(os.path.join(data_folder_location, name_lf))
hf_data = get_map_as_df(os.path.join(data_folder_location, name_hf))

**Change paths to where your data is stored**

In [4]:
import scipy.special

# Seed the global NumPy RNG so the random training subset is reproducible
SEED = 0
# Number of high fidelity points sampled for training
N_HF_TRAIN = 1000
np.random.seed(SEED)

# Discard points where we have no data (negative values). Exact zeros are also
# dropped by the strict comparison; logit(0) would be -inf.
lf_valid = lf_data.value > 0
hf_valid = hf_data.value > 0

# Keep the targets as column vectors, shape (n, 1)
y_lf = lf_data.value.values[lf_valid, None]
y_hf = hf_data.value.values[hf_valid, None]

# Transform data so it lies on real line
y_lf_transformed = scipy.special.logit(y_lf)
y_hf_transformed = scipy.special.logit(y_hf)

# Construct features: one (latitude, longitude) row per valid pixel
x_lf = np.stack([lf_data.latitude.values[lf_valid], lf_data.longitude.values[lf_valid]], axis=1)
x_hf = np.stack([hf_data.latitude.values[hf_valid], hf_data.longitude.values[hf_valid]], axis=1)

# Choose a random subset of high fidelity points for training
i_train = np.random.choice(x_hf.shape[0], N_HF_TRAIN, replace=False)

x_hf_train = x_hf[i_train, :]
y_hf_train = y_hf_transformed[i_train, :]

### Initialize inducing points to a subset of the data

In [5]:
# Inducing inputs for the low fidelity layer: 750 random low fidelity locations
i_z = np.random.choice(x_lf.shape[0], 750, replace=False)
z_low = x_lf[i_z]

# Inducing inputs for the high fidelity layer: 100 random low fidelity locations,
# each extended with its transformed low fidelity target as a third column
i_z_low = np.random.choice(x_lf.shape[0], 100, replace=False)
z_high = np.hstack((x_lf[i_z_low], y_lf_transformed[i_z_low]))

In [6]:
# Inputs, targets and inducing points, each listed low fidelity first
fidelity_inputs = [x_lf, x_hf_train]
fidelity_targets = [y_lf_transformed, y_hf_train]
fidelity_inducing = [z_low, z_high]
dgp = make_dgpMF_model(fidelity_inputs, fidelity_targets, fidelity_inducing)


Data at Fidelity  1
X -  (890223, 2)
Y -  (890223, 1)
Z -  (750, 2)

Data at Fidelity  2
X -  (1000, 2)
Y -  (1000, 1)
Z -  (100, 3)


### Training loop + some printing

In [8]:
class PrintAction(Action):
    """Action that periodically prints training diagnostics for a model.

    Args:
        model: object exposing ``L``, ``KL`` and ``objective`` tensors.
        text: label printed in front of the objective line.
        interval: print whenever the iteration count is a multiple of this
            value. Defaults to 500, the value that was previously hard-coded.
    """

    def __init__(self, model, text, interval=500):
        self.model = model
        self.text = text
        self.interval = interval

    def run(self, ctx):
        """Print the diagnostics if ``ctx.iteration`` falls on the interval."""
        if ctx.iteration % self.interval != 0:
            return

        # Fetch everything in one session.run so the printed numbers come from
        # the same evaluation; separate runs may each use a different minibatch.
        lik_term, kl_term, objective = ctx.session.run(
            [self.model.L, self.model.KL, self.model.objective])

        print('ELBO {:.4f};  KL {:,.4f}'.format(lik_term, kl_term))
        print('{}: iteration {} objective {:,.4f}'.format(self.text, ctx.iteration, objective))


def run_adam(model, lr, iterations, callback=None):
    """Optimise ``model`` with Adam, then anchor it to its session.

    Args:
        model: model to optimise.
        lr: learning rate handed to ``AdamOptimizer``.
        iterations: passed to ``Loop`` as its ``stop`` value.
        callback: optional extra action run in the loop after the Adam step.
    """
    optimise_step = AdamOptimizer(lr).make_optimize_action(model)
    actions = [optimise_step]
    if callback is not None:
        actions.append(callback)

    # Build the loop and run it immediately
    Loop(actions, stop=iterations)()

    # presumably copies the optimised values from the session back into the
    # model's parameters (GPflow 1.x `anchor`) - TODO confirm
    model.anchor(model.enquire_session())

In [None]:
# Optimise the whole model jointly with Adam.
# NOTE(review): the previous comment here said the likelihood variance is kept
# fixed, but the code below marks it trainable - confirm which is intended.
noise = dgp.model.likelihood.likelihood

# Start the noise variance at 1% of the variance of the high fidelity training targets
noise.variance = y_hf_train.var()*.01
noise.variance.trainable = True

progress_printer = PrintAction(dgp.model, 'MF-DGP with Adam')
run_adam(dgp.model, 3e-3, 20000, callback=progress_printer)

In [None]:
import scipy
# Predict over every high fidelity location in fixed-size batches
batch_size = 1000
n_points = x_hf.shape[0]
y_result = np.zeros(n_points)

for i_start in range(0, n_points, batch_size):
    # The final batch may be shorter than batch_size
    i_end = min(i_start + batch_size, n_points)
    # presumably 200 is the number of samples drawn; they are averaged over axis 0 - TODO confirm
    batch_output = dgp.predict_f(x_hf[i_start:i_end, :], 200)[0]
    transformed_predictions = batch_output.mean(axis=0)
    # expit undoes the logit transform, mapping predictions back to (0, 1)
    y_result[i_start:i_end] = scipy.special.expit(transformed_predictions)[:, 0]

In [None]:
# Scatter of predictions against the true values, with the y = x line in red for reference
fig, ax = plt.subplots(figsize=(12, 12))
ax.scatter(y_hf, y_result, alpha=0.1)

min_max = [y_hf.min(), y_hf.max()]
ax.plot(min_max, min_max, color='r')

ax.set_xlabel('Truth')
ax.set_ylabel('Prediction');

In [None]:
from sklearn.metrics import r2_score

# Score only the high fidelity points that were not sampled for training, so the
# metric reflects held-out performance rather than including the training subset.
is_held_out = np.ones(x_hf.shape[0], dtype=bool)
is_held_out[i_train] = False

r2_score(y_hf[is_held_out, 0], y_result[is_held_out])