diff --git a/emukit/core/interfaces/__init__.py b/emukit/core/interfaces/__init__.py
index 4f0537ee..699e62e7 100644
--- a/emukit/core/interfaces/__init__.py
+++ b/emukit/core/interfaces/__init__.py
@@ -8,4 +8,5 @@
     IPriorHyperparameters,  # noqa: F401
     IJointlyDifferentiable,  # noqa: F401
     IModelWithNoise,  # noqa: F401
+    ICrossCovarianceDifferentiable,  # noqa: F401
 )
diff --git a/emukit/core/interfaces/models.py b/emukit/core/interfaces/models.py
index 0926e699..7f813d11 100644
--- a/emukit/core/interfaces/models.py
+++ b/emukit/core/interfaces/models.py
@@ -72,6 +72,35 @@ def get_joint_prediction_gradients(self, X: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
         raise NotImplementedError


+class ICrossCovarianceDifferentiable:
+    def get_covariance_between_points(self, X1: np.ndarray, X2: np.ndarray) -> np.ndarray:
+        """
+        Calculate posterior covariance between two sets of points.
+
+        :param X1: An array of shape n_points1 x n_dimensions. This is the first argument of the
+                   posterior covariance function.
+        :param X2: An array of shape n_points2 x n_dimensions. This is the second argument of the
+                   posterior covariance function.
+        :return: An array of shape n_points1 x n_points2 of posterior covariances between X1 and X2.
+                 Namely, the [i, j]-th entry of the returned array is the posterior covariance
+                 between the i-th point in X1 and the j-th point in X2.
+        """
+        raise NotImplementedError
+
+    def get_covariance_between_points_gradients(self, X1: np.ndarray, X2: np.ndarray) -> np.ndarray:
+        """
+        Compute the derivative of the posterior covariance matrix between predictions at inputs X1 and X2
+        with respect to X1.
+
+        :param X1: Prediction inputs of shape (q1, d)
+        :param X2: Prediction inputs of shape (q2, d)
+        :return: nd array of shape (q1, q2, d) representing the gradient of the posterior covariance
+                 between X1 and X2 with respect to X1. res[i, j, k] is the gradient of Cov(y1[i], y2[j])
+                 with respect to X1[i, k].
+        """
+        raise NotImplementedError
+
+
 class IPriorHyperparameters:
     def generate_hyperparameters_samples(self, n_samples: int, n_burnin: int,
                                          subsample_interval: int, step_size: float, leapfrog_steps: int) -> np.ndarray:
diff --git a/emukit/model_wrappers/gpy_model_wrappers.py b/emukit/model_wrappers/gpy_model_wrappers.py
index c20c6e7b..caebc145 100644
--- a/emukit/model_wrappers/gpy_model_wrappers.py
+++ b/emukit/model_wrappers/gpy_model_wrappers.py
@@ -7,13 +7,21 @@
 import numpy as np
 import GPy

-from ..core.interfaces import IModel, IDifferentiable, IJointlyDifferentiable, IPriorHyperparameters, IModelWithNoise
+from ..core.interfaces import (
+    IModel,
+    IDifferentiable,
+    IJointlyDifferentiable,
+    IPriorHyperparameters,
+    IModelWithNoise,
+    ICrossCovarianceDifferentiable,
+)
 from ..experimental_design.interfaces import ICalculateVarianceReduction
 from ..bayesian_optimization.interfaces import IEntropySearchModel


 class GPyModelWrapper(
-    IModel, IDifferentiable, IJointlyDifferentiable, ICalculateVarianceReduction, IEntropySearchModel, IPriorHyperparameters, IModelWithNoise
+    IModel, IDifferentiable, IJointlyDifferentiable, ICrossCovarianceDifferentiable, ICalculateVarianceReduction,
+    IEntropySearchModel, IPriorHyperparameters, IModelWithNoise,
 ):
     """
     This is a thin wrapper around GPy models to allow users to plug GPy models into Emukit
@@ -118,6 +126,38 @@ def get_covariance_between_points(self, X1: np.ndarray, X2: np.ndarray) -> np.ndarray:
         """
         return self.model.posterior_covariance_between_points(X1, X2, include_likelihood=False)

+    def get_covariance_between_points_gradients(self, X1: np.ndarray, X2: np.ndarray) -> np.ndarray:
+        """
+        Compute the derivative of the posterior covariance matrix between predictions at inputs X1 and X2
+        with respect to X1.
+
+        :param X1: Prediction inputs of shape (q1, d)
+        :param X2: Prediction inputs of shape (q2, d)
+        :return: nd array of shape (q1, q2, d) representing the gradient of the posterior covariance
+                 between X1 and X2 with respect to X1. res[i, j, k] is the gradient of Cov(y1[i], y2[j])
+                 with respect to X1[i, k].
+        """
+        # Get the relevant shapes
+        q1, q2, input_dim, n_train = X1.shape[0], X2.shape[0], X1.shape[1], self.model.X.shape[0]
+        # Instantiate an array to hold gradients of the prior covariance between outputs at X1 and X_train
+        cov_X1_Xtrain_grad = np.zeros((input_dim, q1, n_train))
+        # Instantiate an array to hold gradients of the prior covariance between outputs at X1 and X2
+        cov_X1_X2_grad = np.zeros((input_dim, q1, q2))
+        # Calculate the gradient wrt. X1 of these prior covariances. The GPy API only allows computing
+        # this one dimension at a time, hence we iterate over all input dimensions
+        for i in range(input_dim):
+            # Calculate the gradient wrt. X1 of the prior covariance between X1 and X_train
+            cov_X1_Xtrain_grad[i, :, :] = self.model.kern.dK_dX(X1, self.model.X, i)
+            # Calculate the gradient wrt. X1 of the prior covariance between X1 and X2
+            cov_X1_X2_grad[i, :, :] = self.model.kern.dK_dX(X1, X2, i)
+
+        # Get the prior covariance between outputs at X_train and X2
+        cov_Xtrain_X2 = self.model.kern.K(self.model.X, X2)
+        # Calculate the gradient of the posterior covariance between outputs at X1 and X2
+        cov_grad = cov_X1_X2_grad - cov_X1_Xtrain_grad @ self.model.posterior.woodbury_inv @ cov_Xtrain_X2
+        return cov_grad.transpose((1, 2, 0))
+
+
     @property
     def X(self) -> np.ndarray:
         """
@@ -177,27 +217,36 @@ def dSigma(x_predict: np.ndarray, x_train: np.ndarray, kern: GPy.kern, w_inv: np.ndarray) -> np.ndarray:
     :param x_train: Training inputs of shape (n, d)
     :param kern: Covariance of the GP model
     :param w_inv: Woodbury inverse of the posterior fit of the GP
-    :return: Gradient of the posterior covariance of shape (q, q, q, d)
+    :return: Gradient of the posterior covariance of shape (q, q, q, d). Here, res[i, j, k, l] is the derivative
+             of the [i, j]-th entry of the posterior covariance matrix with respect to x_predict[k, l]
     """
     q, d, n = x_predict.shape[0], x_predict.shape[1], x_train.shape[0]
-    dkxX_dx = np.empty((q, n, d))
-    dkxx_dx = np.empty((q, q, d))
+    # Tensor for the gradients of the (q, n) cross-covariance matrix between x_predict and x_train with
+    # respect to x_predict (of shape (q, d)):
+    d_cross_cov_xpredict_xtrain_dx = np.zeros((d, q * q, n))
+    # Tensor for the gradients of the full covariance matrix at points x_predict (of shape (q, q)) with
+    # respect to x_predict (of shape (q, d)):
+    d_cov_xpredict_dx = np.zeros((d, q * q, q))
     for i in range(d):
-        dkxX_dx[:, :, i] = kern.dK_dX(x_predict, x_train, i)
-        dkxx_dx[:, :, i] = kern.dK_dX(x_predict, x_predict, i)
+        # Fill d_cross_cov_xpredict_xtrain_dx such that after reshaping to (d, q, q, n), entry [i, j] is
+        # the derivative of the cross-covariance between x_predict and x_train (of shape (q, n)) with
+        # respect to the scalar x_predict[j, i]
+        d_cross_cov_xpredict_xtrain_dx[i, ::q + 1, :] = kern.dK_dX(x_predict, x_train, i)
+        # Fill d_cov_xpredict_dx such that after reshaping to (d, q, q, q), entry [i, j] is the derivative
+        # of the prior covariance at x_predict (of shape (q, q)) with respect to the scalar x_predict[j, i]
+        d_cov_xpredict_dx[i, ::q + 1, :] = kern.dK_dX(x_predict, x_predict, i)
+    d_cross_cov_xpredict_xtrain_dx = d_cross_cov_xpredict_xtrain_dx.reshape((d, q, q, n))
+    d_cov_xpredict_dx = d_cov_xpredict_dx.reshape((d, q, q, q))
+    d_cov_xpredict_dx += d_cov_xpredict_dx.transpose((0, 1, 3, 2))
+    d_cov_xpredict_dx.reshape((d, q, -1))[:, :, ::q + 1] = 0.
+
     K = kern.K(x_predict, x_train)
-
-    dsigma = np.zeros((q, q, q, d))
-    for i in range(q):
-        for j in range(d):
-            Ks = np.zeros((q, n))
-            Ks[i, :] = dkxX_dx[i, :, j]
-            dKss_dxi = np.zeros((q, q))
-            dKss_dxi[i, :] = dkxx_dx[i, :, j]
-            dKss_dxi[:, i] = dkxx_dx[i, :, j].T
-            dKss_dxi[i, i] = 0
-            dsigma[:, :, i, j] = dKss_dxi - Ks @ w_inv @ K.T - K @ w_inv @ Ks.T
-    return dsigma
+    dsigma = (
+        d_cov_xpredict_dx
+        - K @ w_inv @ d_cross_cov_xpredict_xtrain_dx.transpose((0, 1, 3, 2))
+        - d_cross_cov_xpredict_xtrain_dx @ w_inv @ K.T
+    )
+    return dsigma.transpose((2, 3, 1, 0))


 def dmean(x_predict: np.ndarray, x_train: np.ndarray, kern: GPy.kern, w_vec: np.ndarray) -> np.ndarray:
@@ -211,12 +260,14 @@ def dmean(x_predict: np.ndarray, x_train: np.ndarray, kern: GPy.kern, w_vec: np.ndarray) -> np.ndarray:
     :return: Gradient of the posterior mean of shape (q, q, d)
     """
     q, d, n = x_predict.shape[0], x_predict.shape[1], x_train.shape[0]
-    dkxX_dx = np.empty((q, n, d))
+    # Tensor with the derivative of the (prior) cross-covariance between x_predict and x_train with
+    # respect to x_predict
+    d_cross_cov_xpredict_xtrain_dx = np.empty((q, n, d))
     dmu = np.zeros((q, q, d))
     for i in range(d):
-        dkxX_dx[:, :, i] = kern.dK_dX(x_predict, x_train, i)
+        d_cross_cov_xpredict_xtrain_dx[:, :, i] = kern.dK_dX(x_predict, x_train, i)
         for j in range(q):
-            dmu[j, j, i] = (dkxX_dx[j, :, i][None, :] @ w_vec[:, None]).flatten()
+            dmu[j, j, i] = (d_cross_cov_xpredict_xtrain_dx[j, :, i][None, :] @ w_vec[:, None]).flatten()
     return dmu


diff --git a/tests/emukit/models/test_gpy_model_wrappers.py b/tests/emukit/models/test_gpy_model_wrappers.py
new file mode 100644
index 00000000..cce5207f
--- /dev/null
+++ b/tests/emukit/models/test_gpy_model_wrappers.py
@@ -0,0 +1,53 @@
+import GPy
+import numpy as np
+import pytest
+
+from emukit.model_wrappers.gpy_model_wrappers import GPyModelWrapper
+
+
+@pytest.fixture
+def test_data(gpy_model):
+    np.random.seed(42)
+    return np.random.randn(5, gpy_model.X.shape[1])
+
+
+@pytest.fixture
+def test_data2(gpy_model):
+    np.random.seed(42)
+    return np.random.randn(4, gpy_model.X.shape[1])
+
+
+def test_joint_prediction_gradients(gpy_model, test_data):
+    epsilon = 1e-5
+    mean, cov = gpy_model.predict_with_full_covariance(test_data)
+    # Get the analytical gradients
+    mean_dx, cov_dx = gpy_model.get_joint_prediction_gradients(test_data)
+
+    for i in range(test_data.shape[0]):  # Iterate over each test point
+        for j in range(test_data.shape[1]):  # Iterate over each input dimension
+            # Approximate the gradient numerically
+            perturbed_input = test_data.copy()
+            perturbed_input[i, j] += epsilon
+            mean_perturbed, cov_perturbed = gpy_model.predict_with_full_covariance(perturbed_input)
+            mean_dx_numerical = (mean_perturbed - mean) / epsilon
+            cov_dx_numerical = (cov_perturbed - cov) / epsilon
+            # Check that the numerical approximation is close to the analytical gradient
+            assert pytest.approx(mean_dx_numerical.ravel(), abs=1e-8, rel=1e-2) == mean_dx[:, i, j]
+            assert pytest.approx(cov_dx_numerical, abs=1e-8, rel=1e-2) == cov_dx[:, :, i, j]
+
+
+def test_get_covariance_between_points_gradients(gpy_model, test_data, test_data2):
+    epsilon = 1e-5
+    cov = gpy_model.get_covariance_between_points(test_data, test_data2)
+    # Get the analytical gradients
+    cov_dx = gpy_model.get_covariance_between_points_gradients(test_data, test_data2)
+
+    for i in range(test_data.shape[0]):  # Iterate over each test point
+        for j in range(test_data.shape[1]):  # Iterate over each input dimension
+            # Approximate the gradient numerically
+            perturbed_input = test_data.copy()
+            perturbed_input[i, j] += epsilon
+            cov_perturbed = gpy_model.get_covariance_between_points(perturbed_input, test_data2)
+            cov_dx_numerical = (cov_perturbed[i] - cov[i]) / epsilon
+            # Check that the numerical approximation is close to the analytical gradient
+            assert pytest.approx(cov_dx_numerical, abs=1e-8, rel=1e-2) == cov_dx[i, :, j]
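A note on the math being implemented (our summary, not text from the patch): `get_covariance_between_points_gradients` differentiates the standard GP posterior cross-covariance, where k is the prior kernel, X the training inputs, and K_w^{-1} the Woodbury inverse held in `self.model.posterior.woodbury_inv`. Only the terms containing x_1 contribute to the derivative:

```latex
\Sigma(x_1, x_2) = k(x_1, x_2) - k(x_1, X)\,K_w^{-1}\,k(X, x_2)
\quad\Longrightarrow\quad
\frac{\partial \Sigma(x_1, x_2)}{\partial x_1}
  = \frac{\partial k(x_1, x_2)}{\partial x_1}
  - \frac{\partial k(x_1, X)}{\partial x_1}\,K_w^{-1}\,k(X, x_2)
```

This is exactly `cov_X1_X2_grad - cov_X1_Xtrain_grad @ woodbury_inv @ cov_Xtrain_X2` in the wrapper; `dSigma` applies the same identity batched over all entries of the (q, q) posterior covariance, with the extra symmetrization step because x_predict appears in both arguments of the kernel.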
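For reviewers, a minimal usage sketch of the new interface (not part of the patch; the RBF kernel, data shapes, and tolerances are illustrative assumptions, and any GPy kernel exposing `dK_dX` should work):

```python
import GPy
import numpy as np

from emukit.model_wrappers.gpy_model_wrappers import GPyModelWrapper

# Fit a small GP regression model on toy data (assumed setup, mirroring the test fixtures)
rng = np.random.RandomState(0)
X_train, Y_train = rng.randn(20, 2), rng.randn(20, 1)
model = GPyModelWrapper(GPy.models.GPRegression(X_train, Y_train, GPy.kern.RBF(input_dim=2)))

# Posterior cross-covariance between two sets of prediction inputs, and its gradient wrt X1
X1, X2 = rng.randn(5, 2), rng.randn(4, 2)
cov = model.get_covariance_between_points(X1, X2)               # shape (5, 4)
cov_dx = model.get_covariance_between_points_gradients(X1, X2)  # shape (5, 4, 2)

# Finite-difference spot check of one entry, mirroring the added unit test
eps = 1e-6
X1_perturbed = X1.copy()
X1_perturbed[0, 0] += eps
cov_perturbed = model.get_covariance_between_points(X1_perturbed, X2)
np.testing.assert_allclose((cov_perturbed[0] - cov[0]) / eps, cov_dx[0, :, 0], rtol=1e-2, atol=1e-6)
```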