From 49f538984cf1addeacf2f3a105cebbde085fb5b4 Mon Sep 17 00:00:00 2001
From: Bruno Mlodozeniec <bkmlodozeniec@gmail.com>
Date: Wed, 17 Feb 2021 13:18:00 +0100
Subject: [PATCH 01/14] Add covariance between points gradient and vectorize
 covariance gradient calculation

---
 emukit/model_wrappers/gpy_model_wrappers.py | 72 ++++++++++++++++-----
 1 file changed, 56 insertions(+), 16 deletions(-)

diff --git a/emukit/model_wrappers/gpy_model_wrappers.py b/emukit/model_wrappers/gpy_model_wrappers.py
index cf891771..f832fd8e 100644
--- a/emukit/model_wrappers/gpy_model_wrappers.py
+++ b/emukit/model_wrappers/gpy_model_wrappers.py
@@ -61,6 +61,19 @@ def get_joint_prediction_gradients(self, X: np.ndarray) -> Tuple[np.ndarray, np.
         dvariance_dx = dSigma(X, self.model.X, self.model.kern, self.model.posterior.woodbury_inv)
         return dmean_dx, dvariance_dx
 
+    def get_covariance_between_points_gradients(self, X1: np.ndarray, X2: np.ndarray) -> np.ndarray:
+        """
+        Computes and returns model gradients of the covariance between outputs at points X1 and X2 with respect
+        to X1.
+
+        :param X1: points to compute gradients at, nd array of shape (q1, d)
+        :param X2: points for the covariance of which to compute the gradient, nd array of shape (q2, d)
+        :return: gradient of the covariance matrix of shape (q1, q2) between outputs at X1 and X2
+                 (return shape is (q1, q2, q1, d)).
+        """
+        dcov_dx1 = dCov(X1, X2, self.model.X, self.model.kern, self.model.posterior.woodbury_inv)
+        return dcov_dx1
+
     def set_data(self, X: np.ndarray, Y: np.ndarray) -> None:
         """
         Sets training data in model
@@ -164,24 +177,23 @@ def dSigma(x_predict: np.ndarray, x_train: np.ndarray, kern: GPy.kern, w_inv: np
     :return: Gradient of the posterior covariance of shape (q, q, q, d)
     """
     q, d, n = x_predict.shape[0], x_predict.shape[1], x_train.shape[0]
-    dkxX_dx = np.empty((q, n, d))
-    dkxx_dx = np.empty((q, q, d))
+    # Tensor for the gradients of (q, n) covariance matrix between x_predict and x_train with respect to
+    # x_predict (of shape (q, d))
+    dkxX_dx = np.zeros((d, q*q, n))
+    # Tensor for the gradients of full covariance matrix at points x_predict (of shape (q, q)) with respect to
+    # x_predict (of shape (q, d))
+    dkxx_dx = np.zeros((d, q*q, q))
     for i in range(d):
-        dkxX_dx[:, :, i] = kern.dK_dX(x_predict, x_train, i)
-        dkxx_dx[:, :, i] = kern.dK_dX(x_predict, x_predict, i)
+        dkxX_dx[i, ::q + 1, :] = kern.dK_dX(x_predict, x_train, i)
+        dkxx_dx[i, ::q + 1, :] = kern.dK_dX(x_predict, x_predict, i)
+    dkxX_dx = dkxX_dx.reshape((d, q, q, n))
+    dkxx_dx = dkxx_dx.reshape((d, q, q, q))
+    dkxx_dx += dkxx_dx.transpose((0, 1, 3, 2))
+    dkxx_dx.reshape((d, q, -1))[:, :, ::q + 1] = 0.
+    
     K = kern.K(x_predict, x_train)
-
-    dsigma = np.zeros((q, q, q, d))
-    for i in range(q):
-        for j in range(d):
-            Ks = np.zeros((q, n))
-            Ks[i, :] = dkxX_dx[i, :, j]
-            dKss_dxi = np.zeros((q, q))
-            dKss_dxi[i, :] = dkxx_dx[i, :, j]
-            dKss_dxi[:, i] = dkxx_dx[i, :, j].T
-            dKss_dxi[i, i] = 0
-            dsigma[:, :, i, j] = dKss_dxi - Ks @ w_inv @ K.T - K @ w_inv @ Ks.T
-    return dsigma
+    dsigma = dkxx_dx - K @ w_inv @ dkxX_dx.transpose((0, 1, 3, 2)) - dkxX_dx @ w_inv @ K.T
+    return dsigma.transpose((2, 3, 1, 0))
 
 
 def dmean(x_predict: np.ndarray, x_train: np.ndarray, kern: GPy.kern, w_vec: np.ndarray) -> np.ndarray:
@@ -203,6 +215,34 @@ def dmean(x_predict: np.ndarray, x_train: np.ndarray, kern: GPy.kern, w_vec: np.
             dmu[j, j, i] = (dkxX_dx[j, :, i][None, :] @ w_vec[:, None]).flatten()
     return dmu
 
+
+def dCov(x1: np.ndarray, x2: np.ndarray, x_train: np.ndarray, kern: GPy.kern, w_inv: np.ndarray) -> np.ndarray:
+    """
+    Compute the derivative of the posterior covariance matrix between prediction inputs x1 and x2
+    (of shape (q1, q2)) with respect to x1
+
+    :param x1: Prediction inputs of shape (q1, d)
+    :param x2: Prediction inputs of shape (q2, d)
+    :param x_train: Training inputs of shape (n, d)
+    :param kern: Covariance of the GP model
+    :param w_inv: Woodbury inverse of the posterior fit of the GP
+    :return: nd array of shape (q1, q2, q1, d) representing the gradient of the posterior covariance between x1 and x2,
+        where res[:, :, i, j] is the gradient of the covariance between outputs at x1 and x2 with respect to x1[i, j]
+    """
+    q1, q2, d, n = x1.shape[0], x2.shape[0], x1.shape[1], x_train.shape[0]
+    dkx1X_dx = np.zeros((d, q1*q1, n))
+    dkx1x2_dx = np.zeros((d, q1*q1, q2))
+    for i in range(d):
+        dkx1X_dx[i, ::q1 + 1, :] = kern.dK_dX(x1, x_train, i)
+        dkx1x2_dx[i, ::q1 + 1, :] = kern.dK_dX(x1, x2, i)
+    dkx1X_dx = dkx1X_dx.reshape((d, q1, q1, n))
+    dkx1x2_dx = dkx1x2_dx.reshape((d, q1, q1, q2))
+    
+    K_Xx2 = kern.K(x_train, x2)
+    dcov = dkx1x2_dx - dkx1X_dx @ w_inv @ K_Xx2
+    return dcov.transpose((2, 3, 1, 0))
+
+
 class GPyMultiOutputWrapper(IModel, IDifferentiable, ICalculateVarianceReduction, IEntropySearchModel):
     """
     A wrapper around GPy multi-output models.

From d0e25c8c786ce2a0f45d18725e15aa804af3b55c Mon Sep 17 00:00:00 2001
From: Bruno Mlodozeniec <bkmlodozeniec@gmail.com>
Date: Wed, 17 Feb 2021 13:54:06 +0100
Subject: [PATCH 02/14] Add tests for the gradients

---
 .../emukit/models/test_gpy_model_wrappers.py  | 53 +++++++++++++++++++
 1 file changed, 53 insertions(+)
 create mode 100644 tests/emukit/models/test_gpy_model_wrappers.py

diff --git a/tests/emukit/models/test_gpy_model_wrappers.py b/tests/emukit/models/test_gpy_model_wrappers.py
new file mode 100644
index 00000000..ca31fd58
--- /dev/null
+++ b/tests/emukit/models/test_gpy_model_wrappers.py
@@ -0,0 +1,53 @@
+import GPy
+import numpy as np
+import pytest
+
+from emukit.model_wrappers.gpy_model_wrappers import GPyModelWrapper
+
+
+@pytest.fixture
+def test_data(gpy_model):
+    np.random.seed(42)
+    return np.random.randn(5, gpy_model.X.shape[1])
+
+
+@pytest.fixture
+def test_data2(gpy_model):
+    np.random.seed(42)
+    return np.random.randn(4, gpy_model.X.shape[1])
+
+
+def test_joint_prediction_gradients(gpy_model, test_data):
+    epsilon = 1e-5
+    mean, cov = gpy_model.predict_with_full_covariance(test_data)
+    # Get the gradients
+    mean_dx, cov_dx = gpy_model.get_joint_prediction_gradients(test_data)
+
+    for i in range(test_data.shape[0]):  # Iterate over each test point
+        for j in range(test_data.shape[1]):  # Iterate over each dimension
+            # Approximate the gradient numerically
+            perturbed_input = test_data.copy()
+            perturbed_input[i, j] += epsilon
+            mean_perturbed, cov_perturbed = gpy_model.predict_with_full_covariance(perturbed_input)
+            mean_dx_numerical = (mean_perturbed - mean) / epsilon
+            cov_dx_numerical = (cov_perturbed - cov) / epsilon
+            # Check that numerical approx. similar to true gradient
+            assert pytest.approx(mean_dx_numerical, abs=1e-8, rel=1e-3) == mean_dx[:, :, i, j]
+            assert pytest.approx(cov_dx_numerical, abs=1e-8, rel=1e-3) == cov_dx[:, :, i, j]
+    
+
+def test_get_covariance_between_points_gradients(gpy_model, test_data, test_data2):
+    epsilon = 1e-5
+    cov = gpy_model.get_covariance_between_points(test_data, test_data2)
+    # Get the gradients
+    cov_dx = gpy_model.get_covariance_between_points(test_data, test_data2)
+
+    for i in range(test_data.shape[0]):  # Iterate over each test point
+        for j in range(test_data.shape[1]):  # Iterate over each dimension
+            # Approximate the gradient numerically
+            perturbed_input = test_data.copy()
+            perturbed_input[i, j] += epsilon
+            cov_perturbed = gpy_model.get_covariance_between_points(perturbed_input, test_data2)
+            cov_dx_numerical = (cov_perturbed - cov) / epsilon
+            # Check that numerical approx. similar to true gradient
+            assert pytest.approx(cov_dx_numerical, abs=1e-8, rel=1e-3) == cov_dx[:, :, i, j]

From bda97ad4fc51852bcc37e364c3f8f09f76bf741d Mon Sep 17 00:00:00 2001
From: Bruno Mlodozeniec <bkmlodozeniec@gmail.com>
Date: Wed, 17 Feb 2021 14:27:36 +0100
Subject: [PATCH 03/14] Fix shapes in gradient tests

---
 tests/emukit/models/test_gpy_model_wrappers.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tests/emukit/models/test_gpy_model_wrappers.py b/tests/emukit/models/test_gpy_model_wrappers.py
index ca31fd58..375e9f08 100644
--- a/tests/emukit/models/test_gpy_model_wrappers.py
+++ b/tests/emukit/models/test_gpy_model_wrappers.py
@@ -32,15 +32,15 @@ def test_joint_prediction_gradients(gpy_model, test_data):
             mean_dx_numerical = (mean_perturbed - mean) / epsilon
             cov_dx_numerical = (cov_perturbed - cov) / epsilon
             # Check that numerical approx. similar to true gradient
-            assert pytest.approx(mean_dx_numerical, abs=1e-8, rel=1e-3) == mean_dx[:, :, i, j]
-            assert pytest.approx(cov_dx_numerical, abs=1e-8, rel=1e-3) == cov_dx[:, :, i, j]
+            assert pytest.approx(mean_dx_numerical.ravel(), abs=1e-8, rel=1e-2) == mean_dx[:, i, j]
+            assert pytest.approx(cov_dx_numerical, abs=1e-8, rel=1e-2) == cov_dx[:, :, i, j]
     
 
 def test_get_covariance_between_points_gradients(gpy_model, test_data, test_data2):
     epsilon = 1e-5
     cov = gpy_model.get_covariance_between_points(test_data, test_data2)
     # Get the gradients
-    cov_dx = gpy_model.get_covariance_between_points(test_data, test_data2)
+    cov_dx = gpy_model.get_covariance_between_points_gradients(test_data, test_data2)
 
     for i in range(test_data.shape[0]):  # Iterate over each test point
         for j in range(test_data.shape[1]):  # Iterate over each dimension
@@ -50,4 +50,4 @@ def test_get_covariance_between_points_gradients(gpy_model, test_data, test_data
             cov_perturbed = gpy_model.get_covariance_between_points(perturbed_input, test_data2)
             cov_dx_numerical = (cov_perturbed - cov) / epsilon
             # Check that numerical approx. similar to true gradient
-            assert pytest.approx(cov_dx_numerical, abs=1e-8, rel=1e-3) == cov_dx[:, :, i, j]
+            assert pytest.approx(cov_dx_numerical, abs=1e-8, rel=1e-2) == cov_dx[:, :, i, j]

From 6ed9094c86522a6f072834989f5f3ca60f41087b Mon Sep 17 00:00:00 2001
From: Bruno Mlodozeniec <bkmlodozeniec@gmail.com>
Date: Mon, 12 Apr 2021 15:10:05 +0100
Subject: [PATCH 04/14] Rewrite the covariance gradient calculation code

---
 emukit/model_wrappers/gpy_model_wrappers.py   | 74 +++++++++----------
 .../emukit/models/test_gpy_model_wrappers.py  |  4 +-
 2 files changed, 36 insertions(+), 42 deletions(-)

diff --git a/emukit/model_wrappers/gpy_model_wrappers.py b/emukit/model_wrappers/gpy_model_wrappers.py
index 80fc2f9c..69588e17 100644
--- a/emukit/model_wrappers/gpy_model_wrappers.py
+++ b/emukit/model_wrappers/gpy_model_wrappers.py
@@ -61,19 +61,6 @@ def get_joint_prediction_gradients(self, X: np.ndarray) -> Tuple[np.ndarray, np.
         dvariance_dx = dSigma(X, self.model.X, self.model.kern, self.model.posterior.woodbury_inv)
         return dmean_dx, dvariance_dx
 
-    def get_covariance_between_points_gradients(self, X1: np.ndarray, X2: np.ndarray) -> np.ndarray:
-        """
-        Computes and returns model gradients of the covariance between outputs at points X1 and X2 with respect
-        to X1.
-
-        :param X1: points to compute gradients at, nd array of shape (q1, d)
-        :param X2: points for the covariance of which to compute the gradient, nd array of shape (q2, d)
-        :return: gradient of the covariance matrix of shape (q1, q2) between outputs at X1 and X2
-                 (return shape is (q1, q2, q1, d)).
-        """
-        dcov_dx1 = dCov(X1, X2, self.model.X, self.model.kern, self.model.posterior.woodbury_inv)
-        return dcov_dx1
-
     def set_data(self, X: np.ndarray, Y: np.ndarray) -> None:
         """
         Sets training data in model
@@ -120,6 +107,40 @@ def get_covariance_between_points(self, X1: np.ndarray, X2: np.ndarray) -> np.nd
         """
         return self.model.posterior_covariance_between_points(X1, X2)
 
+    def get_covariance_between_points_gradients(self, X1: np.ndarray, X2: np.ndarray) -> np.ndarray:
+        """
+        Compute the derivative of the posterior covariance matrix between predictions at inputs X1 and X2
+        with respect to X1.
+
+        The training inputs, the kernel and the Woodbury inverse of the posterior fit are not arguments:
+        they are read from the wrapped GPy model (self.model.X, self.model.kern, self.model.posterior.woodbury_inv).
+
+        :param X1: Prediction inputs of shape (q1, d)
+        :param X2: Prediction inputs of shape (q2, d)
+        :return: nd array of shape (q1, q2, d) representing the gradient of the posterior covariance between X1 and X2
+            with respect to X1. res[i, j, k] is the gradient of Cov(y1[i], y2[j]) with respect to X1[i, k]
+        """
+        # Get the relevant shapes
+        q1, q2, input_dim, n_train = X1.shape[0], X2.shape[0], X1.shape[1], self.model.X.shape[0]
+        # Instantiate an array to hold gradients of prior covariance between outputs at X1 and X_train
+        cov_X1_Xtrain_grad = np.zeros((input_dim, q1, n_train))
+        # Instantiate an array to hold gradients of prior covariance between outputs at X1 and X2
+        cov_X1_X2_grad = np.zeros((input_dim, q1, q2))
+        # Calculate the gradient wrt. X1 of these prior covariances. GPy API allows for doing so
+        # only one dimension at a time, hence need to iterate over all input dimensions
+        for i in range(input_dim):
+            # Calculate the gradient wrt. X1 of the prior covariance between X1 and X_train
+            cov_X1_Xtrain_grad[i, :, :] = self.model.kern.dK_dX(X1, self.model.X, i)
+            # Calculate the gradient wrt. X1 of the prior covariance between X1 and X2
+            cov_X1_X2_grad[i, :, :] = self.model.kern.dK_dX(X1, X2, i)
+        
+        # Get the prior covariance between outputs at x_train and X2
+        cov_Xtrain_X2 = self.model.kern.K(self.model.X, X2)
+        # Calculate the gradient of the posterior covariance between outputs at X1 and X2
+        cov_grad = cov_X1_X2_grad - cov_X1_Xtrain_grad @ self.model.posterior.woodbury_inv @ cov_Xtrain_X2
+        return cov_grad.transpose((1, 2, 0))
+
+
     @property
     def X(self) -> np.ndarray:
         """
@@ -223,33 +244,6 @@ def dmean(x_predict: np.ndarray, x_train: np.ndarray, kern: GPy.kern, w_vec: np.
     return dmu
 
 
-def dCov(x1: np.ndarray, x2: np.ndarray, x_train: np.ndarray, kern: GPy.kern, w_inv: np.ndarray) -> np.ndarray:
-    """
-    Compute the derivative of the posterior covariance matrix between prediction inputs x1 and x2
-    (of shape (q1, q2)) with respect to x1
-
-    :param x1: Prediction inputs of shape (q1, d)
-    :param x2: Prediction inputs of shape (q2, d)
-    :param x_train: Training inputs of shape (n, d)
-    :param kern: Covariance of the GP model
-    :param w_inv: Woodbury inverse of the posterior fit of the GP
-    :return: nd array of shape (q1, q2, q1, d) representing the gradient of the posterior covariance between x1 and x2,
-        where res[:, :, i, j] is the gradient of the covariance between outputs at x1 and x2 with respect to x1[i, j]
-    """
-    q1, q2, d, n = x1.shape[0], x2.shape[0], x1.shape[1], x_train.shape[0]
-    dkx1X_dx = np.zeros((d, q1*q1, n))
-    dkx1x2_dx = np.zeros((d, q1*q1, q2))
-    for i in range(d):
-        dkx1X_dx[i, ::q1 + 1, :] = kern.dK_dX(x1, x_train, i)
-        dkx1x2_dx[i, ::q1 + 1, :] = kern.dK_dX(x1, x2, i)
-    dkx1X_dx = dkx1X_dx.reshape((d, q1, q1, n))
-    dkx1x2_dx = dkx1x2_dx.reshape((d, q1, q1, q2))
-    
-    K_Xx2 = kern.K(x_train, x2)
-    dcov = dkx1x2_dx - dkx1X_dx @ w_inv @ K_Xx2
-    return dcov.transpose((2, 3, 1, 0))
-
-
 class GPyMultiOutputWrapper(IModel, IDifferentiable, ICalculateVarianceReduction, IEntropySearchModel):
     """
     A wrapper around GPy multi-output models.
diff --git a/tests/emukit/models/test_gpy_model_wrappers.py b/tests/emukit/models/test_gpy_model_wrappers.py
index 375e9f08..cce5207f 100644
--- a/tests/emukit/models/test_gpy_model_wrappers.py
+++ b/tests/emukit/models/test_gpy_model_wrappers.py
@@ -48,6 +48,6 @@ def test_get_covariance_between_points_gradients(gpy_model, test_data, test_data
             perturbed_input = test_data.copy()
             perturbed_input[i, j] += epsilon
             cov_perturbed = gpy_model.get_covariance_between_points(perturbed_input, test_data2)
-            cov_dx_numerical = (cov_perturbed - cov) / epsilon
+            cov_dx_numerical = (cov_perturbed[i] - cov[i]) / epsilon
             # Check that numerical approx. similar to true gradient
-            assert pytest.approx(cov_dx_numerical, abs=1e-8, rel=1e-2) == cov_dx[:, :, i, j]
+            assert pytest.approx(cov_dx_numerical, abs=1e-8, rel=1e-2) == cov_dx[i, :, j]

From e7b436be555b9ff1f2d55e8141470ba199295c5b Mon Sep 17 00:00:00 2001
From: Bruno Mlodozeniec <bkmlodozeniec@gmail.com>
Date: Wed, 17 Feb 2021 13:18:00 +0100
Subject: [PATCH 05/14] Add covariance between points gradient and vectorize
 covariance gradient calculation

---
 emukit/model_wrappers/gpy_model_wrappers.py | 72 ++++++++++++++++-----
 1 file changed, 56 insertions(+), 16 deletions(-)

diff --git a/emukit/model_wrappers/gpy_model_wrappers.py b/emukit/model_wrappers/gpy_model_wrappers.py
index 0bb9560e..80fc2f9c 100644
--- a/emukit/model_wrappers/gpy_model_wrappers.py
+++ b/emukit/model_wrappers/gpy_model_wrappers.py
@@ -61,6 +61,19 @@ def get_joint_prediction_gradients(self, X: np.ndarray) -> Tuple[np.ndarray, np.
         dvariance_dx = dSigma(X, self.model.X, self.model.kern, self.model.posterior.woodbury_inv)
         return dmean_dx, dvariance_dx
 
+    def get_covariance_between_points_gradients(self, X1: np.ndarray, X2: np.ndarray) -> np.ndarray:
+        """
+        Computes and returns model gradients of the covariance between outputs at points X1 and X2 with respect
+        to X1.
+
+        :param X1: points to compute gradients at, nd array of shape (q1, d)
+        :param X2: points for the covariance of which to compute the gradient, nd array of shape (q2, d)
+        :return: gradient of the covariance matrix of shape (q1, q2) between outputs at X1 and X2
+                 (return shape is (q1, q2, q1, d)).
+        """
+        dcov_dx1 = dCov(X1, X2, self.model.X, self.model.kern, self.model.posterior.woodbury_inv)
+        return dcov_dx1
+
     def set_data(self, X: np.ndarray, Y: np.ndarray) -> None:
         """
         Sets training data in model
@@ -171,24 +184,23 @@ def dSigma(x_predict: np.ndarray, x_train: np.ndarray, kern: GPy.kern, w_inv: np
     :return: Gradient of the posterior covariance of shape (q, q, q, d)
     """
     q, d, n = x_predict.shape[0], x_predict.shape[1], x_train.shape[0]
-    dkxX_dx = np.empty((q, n, d))
-    dkxx_dx = np.empty((q, q, d))
+    # Tensor for the gradients of (q, n) covariance matrix between x_predict and x_train with respect to
+    # x_predict (of shape (q, d))
+    dkxX_dx = np.zeros((d, q*q, n))
+    # Tensor for the gradients of full covariance matrix at points x_predict (of shape (q, q)) with respect to
+    # x_predict (of shape (q, d))
+    dkxx_dx = np.zeros((d, q*q, q))
     for i in range(d):
-        dkxX_dx[:, :, i] = kern.dK_dX(x_predict, x_train, i)
-        dkxx_dx[:, :, i] = kern.dK_dX(x_predict, x_predict, i)
+        dkxX_dx[i, ::q + 1, :] = kern.dK_dX(x_predict, x_train, i)
+        dkxx_dx[i, ::q + 1, :] = kern.dK_dX(x_predict, x_predict, i)
+    dkxX_dx = dkxX_dx.reshape((d, q, q, n))
+    dkxx_dx = dkxx_dx.reshape((d, q, q, q))
+    dkxx_dx += dkxx_dx.transpose((0, 1, 3, 2))
+    dkxx_dx.reshape((d, q, -1))[:, :, ::q + 1] = 0.
+    
     K = kern.K(x_predict, x_train)
-
-    dsigma = np.zeros((q, q, q, d))
-    for i in range(q):
-        for j in range(d):
-            Ks = np.zeros((q, n))
-            Ks[i, :] = dkxX_dx[i, :, j]
-            dKss_dxi = np.zeros((q, q))
-            dKss_dxi[i, :] = dkxx_dx[i, :, j]
-            dKss_dxi[:, i] = dkxx_dx[i, :, j].T
-            dKss_dxi[i, i] = 0
-            dsigma[:, :, i, j] = dKss_dxi - Ks @ w_inv @ K.T - K @ w_inv @ Ks.T
-    return dsigma
+    dsigma = dkxx_dx - K @ w_inv @ dkxX_dx.transpose((0, 1, 3, 2)) - dkxX_dx @ w_inv @ K.T
+    return dsigma.transpose((2, 3, 1, 0))
 
 
 def dmean(x_predict: np.ndarray, x_train: np.ndarray, kern: GPy.kern, w_vec: np.ndarray) -> np.ndarray:
@@ -210,6 +222,34 @@ def dmean(x_predict: np.ndarray, x_train: np.ndarray, kern: GPy.kern, w_vec: np.
             dmu[j, j, i] = (dkxX_dx[j, :, i][None, :] @ w_vec[:, None]).flatten()
     return dmu
 
+
+def dCov(x1: np.ndarray, x2: np.ndarray, x_train: np.ndarray, kern: GPy.kern, w_inv: np.ndarray) -> np.ndarray:
+    """
+    Compute the derivative of the posterior covariance matrix between prediction inputs x1 and x2
+    (of shape (q1, q2)) with respect to x1
+
+    :param x1: Prediction inputs of shape (q1, d)
+    :param x2: Prediction inputs of shape (q2, d)
+    :param x_train: Training inputs of shape (n, d)
+    :param kern: Covariance of the GP model
+    :param w_inv: Woodbury inverse of the posterior fit of the GP
+    :return: nd array of shape (q1, q2, q1, d) representing the gradient of the posterior covariance between x1 and x2,
+        where res[:, :, i, j] is the gradient of the covariance between outputs at x1 and x2 with respect to x1[i, j]
+    """
+    q1, q2, d, n = x1.shape[0], x2.shape[0], x1.shape[1], x_train.shape[0]
+    dkx1X_dx = np.zeros((d, q1*q1, n))
+    dkx1x2_dx = np.zeros((d, q1*q1, q2))
+    for i in range(d):
+        dkx1X_dx[i, ::q1 + 1, :] = kern.dK_dX(x1, x_train, i)
+        dkx1x2_dx[i, ::q1 + 1, :] = kern.dK_dX(x1, x2, i)
+    dkx1X_dx = dkx1X_dx.reshape((d, q1, q1, n))
+    dkx1x2_dx = dkx1x2_dx.reshape((d, q1, q1, q2))
+    
+    K_Xx2 = kern.K(x_train, x2)
+    dcov = dkx1x2_dx - dkx1X_dx @ w_inv @ K_Xx2
+    return dcov.transpose((2, 3, 1, 0))
+
+
 class GPyMultiOutputWrapper(IModel, IDifferentiable, ICalculateVarianceReduction, IEntropySearchModel):
     """
     A wrapper around GPy multi-output models.

From 8e1699030cc3415c51b12db1e85c89940a2c46d1 Mon Sep 17 00:00:00 2001
From: Bruno Mlodozeniec <bkmlodozeniec@gmail.com>
Date: Wed, 17 Feb 2021 13:54:06 +0100
Subject: [PATCH 06/14] Add tests for the gradients

---
 .../emukit/models/test_gpy_model_wrappers.py  | 53 +++++++++++++++++++
 1 file changed, 53 insertions(+)
 create mode 100644 tests/emukit/models/test_gpy_model_wrappers.py

diff --git a/tests/emukit/models/test_gpy_model_wrappers.py b/tests/emukit/models/test_gpy_model_wrappers.py
new file mode 100644
index 00000000..ca31fd58
--- /dev/null
+++ b/tests/emukit/models/test_gpy_model_wrappers.py
@@ -0,0 +1,53 @@
+import GPy
+import numpy as np
+import pytest
+
+from emukit.model_wrappers.gpy_model_wrappers import GPyModelWrapper
+
+
+@pytest.fixture
+def test_data(gpy_model):
+    np.random.seed(42)
+    return np.random.randn(5, gpy_model.X.shape[1])
+
+
+@pytest.fixture
+def test_data2(gpy_model):
+    np.random.seed(42)
+    return np.random.randn(4, gpy_model.X.shape[1])
+
+
+def test_joint_prediction_gradients(gpy_model, test_data):
+    epsilon = 1e-5
+    mean, cov = gpy_model.predict_with_full_covariance(test_data)
+    # Get the gradients
+    mean_dx, cov_dx = gpy_model.get_joint_prediction_gradients(test_data)
+
+    for i in range(test_data.shape[0]):  # Iterate over each test point
+        for j in range(test_data.shape[1]):  # Iterate over each dimension
+            # Approximate the gradient numerically
+            perturbed_input = test_data.copy()
+            perturbed_input[i, j] += epsilon
+            mean_perturbed, cov_perturbed = gpy_model.predict_with_full_covariance(perturbed_input)
+            mean_dx_numerical = (mean_perturbed - mean) / epsilon
+            cov_dx_numerical = (cov_perturbed - cov) / epsilon
+            # Check that numerical approx. similar to true gradient
+            assert pytest.approx(mean_dx_numerical, abs=1e-8, rel=1e-3) == mean_dx[:, :, i, j]
+            assert pytest.approx(cov_dx_numerical, abs=1e-8, rel=1e-3) == cov_dx[:, :, i, j]
+    
+
+def test_get_covariance_between_points_gradients(gpy_model, test_data, test_data2):
+    epsilon = 1e-5
+    cov = gpy_model.get_covariance_between_points(test_data, test_data2)
+    # Get the gradients
+    cov_dx = gpy_model.get_covariance_between_points(test_data, test_data2)
+
+    for i in range(test_data.shape[0]):  # Iterate over each test point
+        for j in range(test_data.shape[1]):  # Iterate over each dimension
+            # Approximate the gradient numerically
+            perturbed_input = test_data.copy()
+            perturbed_input[i, j] += epsilon
+            cov_perturbed = gpy_model.get_covariance_between_points(perturbed_input, test_data2)
+            cov_dx_numerical = (cov_perturbed - cov) / epsilon
+            # Check that numerical approx. similar to true gradient
+            assert pytest.approx(cov_dx_numerical, abs=1e-8, rel=1e-3) == cov_dx[:, :, i, j]

From 38a0691b9883884b933ee16e6d69be598a263d65 Mon Sep 17 00:00:00 2001
From: Bruno Mlodozeniec <bkmlodozeniec@gmail.com>
Date: Wed, 17 Feb 2021 14:27:36 +0100
Subject: [PATCH 07/14] Fix shapes in gradient tests

---
 tests/emukit/models/test_gpy_model_wrappers.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tests/emukit/models/test_gpy_model_wrappers.py b/tests/emukit/models/test_gpy_model_wrappers.py
index ca31fd58..375e9f08 100644
--- a/tests/emukit/models/test_gpy_model_wrappers.py
+++ b/tests/emukit/models/test_gpy_model_wrappers.py
@@ -32,15 +32,15 @@ def test_joint_prediction_gradients(gpy_model, test_data):
             mean_dx_numerical = (mean_perturbed - mean) / epsilon
             cov_dx_numerical = (cov_perturbed - cov) / epsilon
             # Check that numerical approx. similar to true gradient
-            assert pytest.approx(mean_dx_numerical, abs=1e-8, rel=1e-3) == mean_dx[:, :, i, j]
-            assert pytest.approx(cov_dx_numerical, abs=1e-8, rel=1e-3) == cov_dx[:, :, i, j]
+            assert pytest.approx(mean_dx_numerical.ravel(), abs=1e-8, rel=1e-2) == mean_dx[:, i, j]
+            assert pytest.approx(cov_dx_numerical, abs=1e-8, rel=1e-2) == cov_dx[:, :, i, j]
     
 
 def test_get_covariance_between_points_gradients(gpy_model, test_data, test_data2):
     epsilon = 1e-5
     cov = gpy_model.get_covariance_between_points(test_data, test_data2)
     # Get the gradients
-    cov_dx = gpy_model.get_covariance_between_points(test_data, test_data2)
+    cov_dx = gpy_model.get_covariance_between_points_gradients(test_data, test_data2)
 
     for i in range(test_data.shape[0]):  # Iterate over each test point
         for j in range(test_data.shape[1]):  # Iterate over each dimension
@@ -50,4 +50,4 @@ def test_get_covariance_between_points_gradients(gpy_model, test_data, test_data
             cov_perturbed = gpy_model.get_covariance_between_points(perturbed_input, test_data2)
             cov_dx_numerical = (cov_perturbed - cov) / epsilon
             # Check that numerical approx. similar to true gradient
-            assert pytest.approx(cov_dx_numerical, abs=1e-8, rel=1e-3) == cov_dx[:, :, i, j]
+            assert pytest.approx(cov_dx_numerical, abs=1e-8, rel=1e-2) == cov_dx[:, :, i, j]

From d6d6b0f140267519dcc33508df5c2bf75d01b628 Mon Sep 17 00:00:00 2001
From: Bruno Mlodozeniec <bkmlodozeniec@gmail.com>
Date: Mon, 12 Apr 2021 15:10:05 +0100
Subject: [PATCH 08/14] Rewrite the covariance gradient calculation code

---
 emukit/model_wrappers/gpy_model_wrappers.py   | 74 +++++++++----------
 .../emukit/models/test_gpy_model_wrappers.py  |  4 +-
 2 files changed, 36 insertions(+), 42 deletions(-)

diff --git a/emukit/model_wrappers/gpy_model_wrappers.py b/emukit/model_wrappers/gpy_model_wrappers.py
index 80fc2f9c..69588e17 100644
--- a/emukit/model_wrappers/gpy_model_wrappers.py
+++ b/emukit/model_wrappers/gpy_model_wrappers.py
@@ -61,19 +61,6 @@ def get_joint_prediction_gradients(self, X: np.ndarray) -> Tuple[np.ndarray, np.
         dvariance_dx = dSigma(X, self.model.X, self.model.kern, self.model.posterior.woodbury_inv)
         return dmean_dx, dvariance_dx
 
-    def get_covariance_between_points_gradients(self, X1: np.ndarray, X2: np.ndarray) -> np.ndarray:
-        """
-        Computes and returns model gradients of the covariance between outputs at points X1 and X2 with respect
-        to X1.
-
-        :param X1: points to compute gradients at, nd array of shape (q1, d)
-        :param X2: points for the covariance of which to compute the gradient, nd array of shape (q2, d)
-        :return: gradient of the covariance matrix of shape (q1, q2) between outputs at X1 and X2
-                 (return shape is (q1, q2, q1, d)).
-        """
-        dcov_dx1 = dCov(X1, X2, self.model.X, self.model.kern, self.model.posterior.woodbury_inv)
-        return dcov_dx1
-
     def set_data(self, X: np.ndarray, Y: np.ndarray) -> None:
         """
         Sets training data in model
@@ -120,6 +107,40 @@ def get_covariance_between_points(self, X1: np.ndarray, X2: np.ndarray) -> np.nd
         """
         return self.model.posterior_covariance_between_points(X1, X2)
 
+    def get_covariance_between_points_gradients(self, X1: np.ndarray, X2: np.ndarray) -> np.ndarray:
+        """
+        Compute the derivative of the posterior covariance matrix between predictions at inputs X1 and X2
+        with respect to X1.
+
+        The training inputs, the kernel and the Woodbury inverse of the posterior fit are not arguments:
+        they are read from the wrapped GPy model (self.model.X, self.model.kern, self.model.posterior.woodbury_inv).
+
+        :param X1: Prediction inputs of shape (q1, d)
+        :param X2: Prediction inputs of shape (q2, d)
+        :return: nd array of shape (q1, q2, d) representing the gradient of the posterior covariance between X1 and X2
+            with respect to X1. res[i, j, k] is the gradient of Cov(y1[i], y2[j]) with respect to X1[i, k]
+        """
+        # Get the relevant shapes
+        q1, q2, input_dim, n_train = X1.shape[0], X2.shape[0], X1.shape[1], self.model.X.shape[0]
+        # Instantiate an array to hold gradients of prior covariance between outputs at X1 and X_train
+        cov_X1_Xtrain_grad = np.zeros((input_dim, q1, n_train))
+        # Instantiate an array to hold gradients of prior covariance between outputs at X1 and X2
+        cov_X1_X2_grad = np.zeros((input_dim, q1, q2))
+        # Calculate the gradient wrt. X1 of these prior covariances. GPy API allows for doing so
+        # only one dimension at a time, hence need to iterate over all input dimensions
+        for i in range(input_dim):
+            # Calculate the gradient wrt. X1 of the prior covariance between X1 and X_train
+            cov_X1_Xtrain_grad[i, :, :] = self.model.kern.dK_dX(X1, self.model.X, i)
+            # Calculate the gradient wrt. X1 of the prior covariance between X1 and X2
+            cov_X1_X2_grad[i, :, :] = self.model.kern.dK_dX(X1, X2, i)
+        
+        # Get the prior covariance between outputs at x_train and X2
+        cov_Xtrain_X2 = self.model.kern.K(self.model.X, X2)
+        # Calculate the gradient of the posterior covariance between outputs at X1 and X2
+        cov_grad = cov_X1_X2_grad - cov_X1_Xtrain_grad @ self.model.posterior.woodbury_inv @ cov_Xtrain_X2
+        return cov_grad.transpose((1, 2, 0))
+
+
     @property
     def X(self) -> np.ndarray:
         """
@@ -223,33 +244,6 @@ def dmean(x_predict: np.ndarray, x_train: np.ndarray, kern: GPy.kern, w_vec: np.
     return dmu
 
 
-def dCov(x1: np.ndarray, x2: np.ndarray, x_train: np.ndarray, kern: GPy.kern, w_inv: np.ndarray) -> np.ndarray:
-    """
-    Compute the derivative of the posterior covariance matrix between prediction inputs x1 and x2
-    (of shape (q1, q2)) with respect to x1
-
-    :param x1: Prediction inputs of shape (q1, d)
-    :param x2: Prediction inputs of shape (q2, d)
-    :param x_train: Training inputs of shape (n, d)
-    :param kern: Covariance of the GP model
-    :param w_inv: Woodbury inverse of the posterior fit of the GP
-    :return: nd array of shape (q1, q2, q1, d) representing the gradient of the posterior covariance between x1 and x2,
-        where res[:, :, i, j] is the gradient of the covariance between outputs at x1 and x2 with respect to x1[i, j]
-    """
-    q1, q2, d, n = x1.shape[0], x2.shape[0], x1.shape[1], x_train.shape[0]
-    dkx1X_dx = np.zeros((d, q1*q1, n))
-    dkx1x2_dx = np.zeros((d, q1*q1, q2))
-    for i in range(d):
-        dkx1X_dx[i, ::q1 + 1, :] = kern.dK_dX(x1, x_train, i)
-        dkx1x2_dx[i, ::q1 + 1, :] = kern.dK_dX(x1, x2, i)
-    dkx1X_dx = dkx1X_dx.reshape((d, q1, q1, n))
-    dkx1x2_dx = dkx1x2_dx.reshape((d, q1, q1, q2))
-    
-    K_Xx2 = kern.K(x_train, x2)
-    dcov = dkx1x2_dx - dkx1X_dx @ w_inv @ K_Xx2
-    return dcov.transpose((2, 3, 1, 0))
-
-
 class GPyMultiOutputWrapper(IModel, IDifferentiable, ICalculateVarianceReduction, IEntropySearchModel):
     """
     A wrapper around GPy multi-output models.
diff --git a/tests/emukit/models/test_gpy_model_wrappers.py b/tests/emukit/models/test_gpy_model_wrappers.py
index 375e9f08..cce5207f 100644
--- a/tests/emukit/models/test_gpy_model_wrappers.py
+++ b/tests/emukit/models/test_gpy_model_wrappers.py
@@ -48,6 +48,6 @@ def test_get_covariance_between_points_gradients(gpy_model, test_data, test_data
             perturbed_input = test_data.copy()
             perturbed_input[i, j] += epsilon
             cov_perturbed = gpy_model.get_covariance_between_points(perturbed_input, test_data2)
-            cov_dx_numerical = (cov_perturbed - cov) / epsilon
+            cov_dx_numerical = (cov_perturbed[i] - cov[i]) / epsilon
             # Check that numerical approx. similar to true gradient
-            assert pytest.approx(cov_dx_numerical, abs=1e-8, rel=1e-2) == cov_dx[:, :, i, j]
+            assert pytest.approx(cov_dx_numerical, abs=1e-8, rel=1e-2) == cov_dx[i, :, j]

From bcf4416f0755b779ea7352bef32bec13afb710e1 Mon Sep 17 00:00:00 2001
From: Bruno Mlodozeniec <bkmlodozeniec@gmail.com>
Date: Sat, 1 Jan 2022 13:26:21 +0100
Subject: [PATCH 09/14] Fix typos and remove redundant args in doc-strings

---
 emukit/model_wrappers/gpy_model_wrappers.py | 16 +++++++---------
 1 file changed, 7 insertions(+), 9 deletions(-)

diff --git a/emukit/model_wrappers/gpy_model_wrappers.py b/emukit/model_wrappers/gpy_model_wrappers.py
index 6f7a14e3..dba0de69 100644
--- a/emukit/model_wrappers/gpy_model_wrappers.py
+++ b/emukit/model_wrappers/gpy_model_wrappers.py
@@ -123,15 +123,13 @@ def get_covariance_between_points_gradients(self, X1: np.ndarray, X2: np.ndarray
         Compute the derivative of the posterior covariance matrix between prediction at inputs x1 and x2
         with respect to x1.
 
-        :param x1: Prediction inputs of shape (q1, d)
-        :param x2: Prediction inputs of shape (q2, d)
-        :param x_train: Training inputs of shape (n_train, d)
-        :param kern: Covariance of the GP model
-        :param w_inv: Woodbury inverse of the posterior fit of the GP
-        :return: nd array of shape (q1, q2, d) representing the gradient of the posterior covariance between x1 and x2
-            with respect to x1. res[i, j, k] is the gradient of Cov(y1[i], y2[j]) with respect to x1[i, k]
-        """
-        # Get the relevent shapes
+        :param X1: Prediction inputs of shape (q1, d)
+        :param X2: Prediction inputs of shape (q2, d)
+        :return: nd array of shape (q1, q2, d) representing the gradient of the posterior covariance 
+            between x1 and x2 with respect to x1. res[i, j, k] is the gradient of Cov(y1[i], y2[j])
+            with respect to x1[i, k]
+        """
+        # Get the relevant shapes
         q1, q2, input_dim, n_train = X1.shape[0], X2.shape[0], X1.shape[1], self.model.X.shape[0]
         # Instatiate an array to hold gradients of prior covariance between outputs at X1 and X_train
         cov_X1_Xtrain_grad = np.zeros((input_dim, q1, n_train))

From 26960ee65bedba801af849a52ddd778bce196efa Mon Sep 17 00:00:00 2001
From: Bruno Kacper Mlodozeniec <BKMlodozeniec@gmail.com>
Date: Sat, 1 Jan 2022 13:26:36 +0100
Subject: [PATCH 10/14] Fix typo in emukit/model_wrappers/gpy_model_wrappers.py

Co-authored-by: Andrei Paleyes <apaleyes@users.noreply.github.com>
---
 emukit/model_wrappers/gpy_model_wrappers.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/emukit/model_wrappers/gpy_model_wrappers.py b/emukit/model_wrappers/gpy_model_wrappers.py
index dba0de69..be19750a 100644
--- a/emukit/model_wrappers/gpy_model_wrappers.py
+++ b/emukit/model_wrappers/gpy_model_wrappers.py
@@ -133,7 +133,7 @@ def get_covariance_between_points_gradients(self, X1: np.ndarray, X2: np.ndarray
         q1, q2, input_dim, n_train = X1.shape[0], X2.shape[0], X1.shape[1], self.model.X.shape[0]
         # Instatiate an array to hold gradients of prior covariance between outputs at X1 and X_train
         cov_X1_Xtrain_grad = np.zeros((input_dim, q1, n_train))
-        # Instatiate an array to hold gradients of prior covariance between outputs at X1 and X2
+        # Instantiate an array to hold gradients of prior covariance between outputs at X1 and X2
         cov_X1_X2_grad = np.zeros((input_dim, q1, q2))
         # Calculate the gradient wrt. X1 of these prior covariances. GPy API allows for doing so
         # only one dimension at a time, hence need to iterate over all input dimensions

From 2bb36d21bf900f14dc6aff24bf3b893d7143cd9f Mon Sep 17 00:00:00 2001
From: Bruno Mlodozeniec <bkmlodozeniec@gmail.com>
Date: Sat, 1 Jan 2022 13:39:13 +0100
Subject: [PATCH 11/14] Rename variable names to be more informative and
 verbose in dSigma()

---
 emukit/model_wrappers/gpy_model_wrappers.py | 24 +++++++++++----------
 1 file changed, 13 insertions(+), 11 deletions(-)

diff --git a/emukit/model_wrappers/gpy_model_wrappers.py b/emukit/model_wrappers/gpy_model_wrappers.py
index dba0de69..5d57727b 100644
--- a/emukit/model_wrappers/gpy_model_wrappers.py
+++ b/emukit/model_wrappers/gpy_model_wrappers.py
@@ -212,22 +212,24 @@ def dSigma(x_predict: np.ndarray, x_train: np.ndarray, kern: GPy.kern, w_inv: np
     :return: Gradient of the posterior covariance of shape (q, q, q, d)
     """
     q, d, n = x_predict.shape[0], x_predict.shape[1], x_train.shape[0]
-    # Tensor for the gradients of (q, n) covariance matrix between x_predict and x_train with respect to
-    # x_predict (of shape (q, d))
-    dkxX_dx = np.zeros((d, q*q, n))
+    # Tensor for the gradients of (q, n) cross-covariance matrix between x_predict and x_train with respect to
+    # x_predict (of shape (q, d)):
+    d_cross_cov_xpredict_xtrain_dx = np.zeros((d, q*q, n))
     # Tensor for the gradients of full covariance matrix at points x_predict (of shape (q, q) with respect to
     # x_predict (of shape (q, d))
-    dkxx_dx = np.zeros((d, q*q, q))
+    d_cov_xpredict_dx = np.zeros((d, q*q, q))
     for i in range(d):
-        dkxX_dx[i, ::q + 1, :] = kern.dK_dX(x_predict, x_train, i)
-        dkxx_dx[i, ::q + 1, :] = kern.dK_dX(x_predict, x_predict, i)
-    dkxX_dx = dkxX_dx.reshape((d, q, q, n))
-    dkxx_dx = dkxx_dx.reshape((d, q, q, q))
-    dkxx_dx += dkxx_dx.transpose((0, 1, 3, 2))
-    dkxx_dx.reshape((d, q, -1))[:, :, ::q + 1] = 0.
+        # Fill d_cross_cov_xpredict_xtrain_dx such that entry [i, j] is the derivative of the cross-covariance
+        # between x_predict and x_train (of shape (q, d)) with respect to scalar x_predict[j, i]
+        d_cross_cov_xpredict_xtrain_dx[i, ::q + 1, :] = kern.dK_dX(x_predict, x_train, i)
+        d_cov_xpredict_dx[i, ::q + 1, :] = kern.dK_dX(x_predict, x_predict, i)
+    d_cross_cov_xpredict_xtrain_dx = d_cross_cov_xpredict_xtrain_dx.reshape((d, q, q, n))
+    d_cov_xpredict_dx = d_cov_xpredict_dx.reshape((d, q, q, q))
+    d_cov_xpredict_dx += d_cov_xpredict_dx.transpose((0, 1, 3, 2))
+    d_cov_xpredict_dx.reshape((d, q, -1))[:, :, ::q + 1] = 0.
     
     K = kern.K(x_predict, x_train)
-    dsigma = dkxx_dx - K @ w_inv @ dkxX_dx.transpose((0, 1, 3, 2)) - dkxX_dx @ w_inv @ K.T
+    dsigma = d_cov_xpredict_dx - K @ w_inv @ d_cross_cov_xpredict_xtrain_dx.transpose((0, 1, 3, 2)) - d_cross_cov_xpredict_xtrain_dx @ w_inv @ K.T
     return dsigma.transpose((2, 3, 1, 0))
 
 

From 9d52426408e58defbd7dbc443eee09b7fd1148a3 Mon Sep 17 00:00:00 2001
From: Bruno Mlodozeniec <bkmlodozeniec@gmail.com>
Date: Sat, 1 Jan 2022 21:01:44 +0100
Subject: [PATCH 12/14] Add further documentation to gradients of covariance
 calculations

---
 emukit/model_wrappers/gpy_model_wrappers.py | 24 +++++++++++++++------
 1 file changed, 17 insertions(+), 7 deletions(-)

diff --git a/emukit/model_wrappers/gpy_model_wrappers.py b/emukit/model_wrappers/gpy_model_wrappers.py
index 5d57727b..7151c249 100644
--- a/emukit/model_wrappers/gpy_model_wrappers.py
+++ b/emukit/model_wrappers/gpy_model_wrappers.py
@@ -209,7 +209,8 @@ def dSigma(x_predict: np.ndarray, x_train: np.ndarray, kern: GPy.kern, w_inv: np
     :param x_train: Training inputs of shape (n, d)
     :param kern: Covariance of the GP model
     :param w_inv: Woodbury inverse of the posterior fit of the GP
-    :return: Gradient of the posterior covariance of shape (q, q, q, d)
+    :return: Gradient of the posterior covariance of shape (q, q, q, d). Here, res[i, j, k, l] is the derivative
+        of the [i, j]-th entry of the posterior covariance matrix with respect to x_predict[k, l]
     """
     q, d, n = x_predict.shape[0], x_predict.shape[1], x_train.shape[0]
     # Tensor for the gradients of (q, n) cross-covariance matrix between x_predict and x_train with respect to
@@ -219,9 +220,12 @@ def dSigma(x_predict: np.ndarray, x_train: np.ndarray, kern: GPy.kern, w_inv: np
     # x_predict (of shape (q, d))
     d_cov_xpredict_dx = np.zeros((d, q*q, q))
     for i in range(d):
-        # Fill d_cross_cov_xpredict_xtrain_dx such that entry [i, j] is the derivative of the cross-covariance
-        # between x_predict and x_train (of shape (q, d)) with respect to scalar x_predict[j, i]
+        # Fill d_cross_cov_xpredict_xtrain_dx such that after reshaping to (d, q, q, n), entry [i, j] is 
+        # the derivative of the cross-covariance between x_predict and x_train (of shape (q, n)) with respect 
+        # to scalar x_predict[j, i]
         d_cross_cov_xpredict_xtrain_dx[i, ::q + 1, :] = kern.dK_dX(x_predict, x_train, i)
+        # Fill d_cov_xpredict_dx such that after reshaping to (d, q, q, q), entry [i, j] is the derivative 
+        # of the prior covariance at x_predict (of shape (q, q)) with respect to the scalar x_predict[j, i]
         d_cov_xpredict_dx[i, ::q + 1, :] = kern.dK_dX(x_predict, x_predict, i)
     d_cross_cov_xpredict_xtrain_dx = d_cross_cov_xpredict_xtrain_dx.reshape((d, q, q, n))
     d_cov_xpredict_dx = d_cov_xpredict_dx.reshape((d, q, q, q))
@@ -229,7 +233,11 @@ def dSigma(x_predict: np.ndarray, x_train: np.ndarray, kern: GPy.kern, w_inv: np
     d_cov_xpredict_dx.reshape((d, q, -1))[:, :, ::q + 1] = 0.
     
     K = kern.K(x_predict, x_train)
-    dsigma = d_cov_xpredict_dx - K @ w_inv @ d_cross_cov_xpredict_xtrain_dx.transpose((0, 1, 3, 2)) - d_cross_cov_xpredict_xtrain_dx @ w_inv @ K.T
+    dsigma = (
+        d_cov_xpredict_dx
+        - K @ w_inv @ d_cross_cov_xpredict_xtrain_dx.transpose((0, 1, 3, 2))
+        - d_cross_cov_xpredict_xtrain_dx @ w_inv @ K.T
+    )
     return dsigma.transpose((2, 3, 1, 0))
 
 
@@ -244,12 +252,14 @@ def dmean(x_predict: np.ndarray, x_train: np.ndarray, kern: GPy.kern, w_vec: np.
     :return: Gradient of the posterior mean of shape (q, q, d)
     """
     q, d, n = x_predict.shape[0], x_predict.shape[1], x_train.shape[0]
-    dkxX_dx = np.empty((q, n, d))
+    # Tensor with derivative of the (prior) cross-covariance between x_predict and x_train with respect
+    # to x_predict
+    d_cross_cov_xpredict_xtrain_dx = np.empty((q, n, d))
     dmu = np.zeros((q, q, d))
     for i in range(d):
-        dkxX_dx[:, :, i] = kern.dK_dX(x_predict, x_train, i)
+        d_cross_cov_xpredict_xtrain_dx[:, :, i] = kern.dK_dX(x_predict, x_train, i)
         for j in range(q):
-            dmu[j, j, i] = (dkxX_dx[j, :, i][None, :] @ w_vec[:, None]).flatten()
+            dmu[j, j, i] = (d_cross_cov_xpredict_xtrain_dx[j, :, i][None, :] @ w_vec[:, None]).flatten()
     return dmu
 
 

From c8b9f83da2958ddcb5e2c8c98e69273547f5789f Mon Sep 17 00:00:00 2001
From: Bruno Mlodozeniec <bkmlodozeniec@gmail.com>
Date: Sat, 1 Jan 2022 21:13:50 +0100
Subject: [PATCH 13/14] Add an interface for differentiable cross-covariance
 models

---
 emukit/core/interfaces/models.py | 29 +++++++++++++++++++++++++++++
 1 file changed, 29 insertions(+)

diff --git a/emukit/core/interfaces/models.py b/emukit/core/interfaces/models.py
index 0926e699..7f813d11 100644
--- a/emukit/core/interfaces/models.py
+++ b/emukit/core/interfaces/models.py
@@ -72,6 +72,35 @@ def get_joint_prediction_gradients(self, X: np.ndarray) -> Tuple[np.ndarray, np.
         raise NotImplementedError
 
 
+class ICrossCovarianceDifferentiable:
+    def get_covariance_between_points(self, X1: np.ndarray, X2: np.ndarray) -> np.ndarray:
+        """
+        Calculate posterior covariance between two sets of points.
+
+        :param X1: An array of shape n_points1 x n_dimensions. This is the first argument of the
+                   posterior covariance function.
+        :param X2: An array of shape n_points2 x n_dimensions. This is the second argument of the
+                   posterior covariance function.
+        :return: An array of shape n_points1 x n_points2 of posterior covariances between X1 and X2.
+            Namely, [i, j]-th entry of the returned array will represent the posterior covariance
+            between i-th point in X1 and j-th point in X2.
+        """
+        raise NotImplementedError
+
+    def get_covariance_between_points_gradients(self, X1: np.ndarray, X2: np.ndarray) -> np.ndarray:
+        """
+        Compute the derivative of the posterior covariance matrix between predictions at inputs X1 and X2
+        with respect to X1.
+
+        :param X1: Prediction inputs of shape (q1, d)
+        :param X2: Prediction inputs of shape (q2, d)
+        :return: nd array of shape (q1, q2, d) representing the gradient of the posterior covariance
+            between outputs at X1 and X2 with respect to X1. res[i, j, k] is the gradient of
+            Cov(y1[i], y2[j]) with respect to X1[i, k], where y1, y2 are the outputs at X1, X2
+        """
+        raise NotImplementedError
+
+
 class IPriorHyperparameters:
     def generate_hyperparameters_samples(self, n_samples: int, n_burnin: int,
                                          subsample_interval: int, step_size: float, leapfrog_steps: int) -> np.ndarray:

From f3453428e59dd7f2f66e6398980f59cabe5f4665 Mon Sep 17 00:00:00 2001
From: Bruno Mlodozeniec <bkmlodozeniec@gmail.com>
Date: Sat, 1 Jan 2022 21:17:12 +0100
Subject: [PATCH 14/14] Incorporate interface into GPyModel

---
 emukit/core/interfaces/__init__.py          |  1 +
 emukit/model_wrappers/gpy_model_wrappers.py | 12 ++++++++++--
 2 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/emukit/core/interfaces/__init__.py b/emukit/core/interfaces/__init__.py
index 4f0537ee..699e62e7 100644
--- a/emukit/core/interfaces/__init__.py
+++ b/emukit/core/interfaces/__init__.py
@@ -8,4 +8,5 @@
     IPriorHyperparameters,  # noqa: F401
     IJointlyDifferentiable,  # noqa: F401
     IModelWithNoise,  # noqa: F401
+    ICrossCovarianceDifferentiable,  # noqa: F401
 )
diff --git a/emukit/model_wrappers/gpy_model_wrappers.py b/emukit/model_wrappers/gpy_model_wrappers.py
index 7151c249..bf33cddd 100644
--- a/emukit/model_wrappers/gpy_model_wrappers.py
+++ b/emukit/model_wrappers/gpy_model_wrappers.py
@@ -7,13 +7,21 @@
 import numpy as np
 import GPy
 
-from ..core.interfaces import IModel, IDifferentiable, IJointlyDifferentiable, IPriorHyperparameters, IModelWithNoise
+from ..core.interfaces import (
+    IModel,
+    IDifferentiable,
+    IJointlyDifferentiable,
+    IPriorHyperparameters,
+    IModelWithNoise,
+    ICrossCovarianceDifferentiable,
+)
 from ..experimental_design.interfaces import ICalculateVarianceReduction
 from ..bayesian_optimization.interfaces import IEntropySearchModel
 
 
 class GPyModelWrapper(
-    IModel, IDifferentiable, IJointlyDifferentiable, ICalculateVarianceReduction, IEntropySearchModel, IPriorHyperparameters, IModelWithNoise
+    IModel, IDifferentiable, IJointlyDifferentiable, ICrossCovarianceDifferentiable, ICalculateVarianceReduction,
+    IEntropySearchModel, IPriorHyperparameters, IModelWithNoise,
 ):
     """
     This is a thin wrapper around GPy models to allow users to plug GPy models into Emukit