diff --git a/emukit/core/interfaces/__init__.py b/emukit/core/interfaces/__init__.py
index 4f0537ee..699e62e7 100644
--- a/emukit/core/interfaces/__init__.py
+++ b/emukit/core/interfaces/__init__.py
@@ -8,4 +8,5 @@
     IPriorHyperparameters,  # noqa: F401
     IJointlyDifferentiable,  # noqa: F401
     IModelWithNoise,  # noqa: F401
+    ICrossCovarianceDifferentiable,  # noqa: F401
 )
diff --git a/emukit/core/interfaces/models.py b/emukit/core/interfaces/models.py
index 0926e699..7f813d11 100644
--- a/emukit/core/interfaces/models.py
+++ b/emukit/core/interfaces/models.py
@@ -72,6 +72,35 @@ def get_joint_prediction_gradients(self, X: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
         raise NotImplementedError


+class ICrossCovarianceDifferentiable:
+    def get_covariance_between_points(self, X1: np.ndarray, X2: np.ndarray) -> np.ndarray:
+        """
+        Calculate posterior covariance between two sets of points.
+
+        :param X1: An array of shape n_points1 x n_dimensions. This is the first argument of the
+                   posterior covariance function.
+        :param X2: An array of shape n_points2 x n_dimensions. This is the second argument of the
+                   posterior covariance function.
+        :return: An array of shape n_points1 x n_points2 of posterior covariances between X1 and X2.
+                 Namely, the [i, j]-th entry of the returned array is the posterior covariance
+                 between the i-th point in X1 and the j-th point in X2.
+        """
+        raise NotImplementedError
+
+    def get_covariance_between_points_gradients(self, X1: np.ndarray, X2: np.ndarray) -> np.ndarray:
+        """
+        Compute the derivative of the posterior covariance matrix between predictions at inputs X1 and X2
+        with respect to X1.
+
+        :param X1: Prediction inputs of shape (q1, d)
+        :param X2: Prediction inputs of shape (q2, d)
+        :return: nd array of shape (q1, q2, d) representing the gradient of the posterior covariance
+                 between X1 and X2 with respect to X1. res[i, j, k] is the gradient of Cov(y1[i], y2[j])
+                 with respect to X1[i, k].
+        """
+        raise NotImplementedError
+
+
 class IPriorHyperparameters:
     def generate_hyperparameters_samples(self, n_samples: int, n_burnin: int,
                                          subsample_interval: int, step_size: float, leapfrog_steps: int) -> np.ndarray:
diff --git a/emukit/model_wrappers/gpy_model_wrappers.py b/emukit/model_wrappers/gpy_model_wrappers.py
index c20c6e7b..caebc145 100644
--- a/emukit/model_wrappers/gpy_model_wrappers.py
+++ b/emukit/model_wrappers/gpy_model_wrappers.py
@@ -7,13 +7,21 @@
 import numpy as np
 import GPy

-from ..core.interfaces import IModel, IDifferentiable, IJointlyDifferentiable, IPriorHyperparameters, IModelWithNoise
+from ..core.interfaces import (
+    IModel,
+    IDifferentiable,
+    IJointlyDifferentiable,
+    IPriorHyperparameters,
+    IModelWithNoise,
+    ICrossCovarianceDifferentiable,
+)
 from ..experimental_design.interfaces import ICalculateVarianceReduction
 from ..bayesian_optimization.interfaces import IEntropySearchModel


 class GPyModelWrapper(
-    IModel, IDifferentiable, IJointlyDifferentiable, ICalculateVarianceReduction, IEntropySearchModel, IPriorHyperparameters, IModelWithNoise
+    IModel, IDifferentiable, IJointlyDifferentiable, ICrossCovarianceDifferentiable, ICalculateVarianceReduction,
+    IEntropySearchModel, IPriorHyperparameters, IModelWithNoise,
 ):
     """
     This is a thin wrapper around GPy models to allow users to plug GPy models into Emukit
@@ -118,6 +126,38 @@ def get_covariance_between_points(self, X1: np.ndarray, X2: np.ndarray) -> np.ndarray:
         """
         return self.model.posterior_covariance_between_points(X1, X2, include_likelihood=False)

+    def get_covariance_between_points_gradients(self, X1: np.ndarray, X2: np.ndarray) -> np.ndarray:
+        """
+        Compute the derivative of the posterior covariance matrix between predictions at inputs X1 and X2
+        with respect to X1.
+
+        :param X1: Prediction inputs of shape (q1, d)
+        :param X2: Prediction inputs of shape (q2, d)
+        :return: nd array of shape (q1, q2, d) representing the gradient of the posterior covariance
+                 between X1 and X2 with respect to X1. res[i, j, k] is the gradient of Cov(y1[i], y2[j])
+                 with respect to X1[i, k].
+        """
+        # Get the relevant shapes
+        q1, q2, input_dim, n_train = X1.shape[0], X2.shape[0], X1.shape[1], self.model.X.shape[0]
+        # Instantiate an array to hold gradients of the prior covariance between outputs at X1 and X_train
+        cov_X1_Xtrain_grad = np.zeros((input_dim, q1, n_train))
+        # Instantiate an array to hold gradients of the prior covariance between outputs at X1 and X2
+        cov_X1_X2_grad = np.zeros((input_dim, q1, q2))
+        # Calculate the gradient wrt. X1 of these prior covariances. The GPy API only allows computing
+        # this one dimension at a time, hence we iterate over all input dimensions
+        for i in range(input_dim):
+            # Calculate the gradient wrt. X1 of the prior covariance between X1 and X_train
+            cov_X1_Xtrain_grad[i, :, :] = self.model.kern.dK_dX(X1, self.model.X, i)
+            # Calculate the gradient wrt. X1 of the prior covariance between X1 and X2
+            cov_X1_X2_grad[i, :, :] = self.model.kern.dK_dX(X1, X2, i)
+
+        # Get the prior covariance between outputs at X_train and X2
+        cov_Xtrain_X2 = self.model.kern.K(self.model.X, X2)
+        # Calculate the gradient of the posterior covariance between outputs at X1 and X2
+        cov_grad = cov_X1_X2_grad - cov_X1_Xtrain_grad @ self.model.posterior.woodbury_inv @ cov_Xtrain_X2
+        return cov_grad.transpose((1, 2, 0))
+
+
     @property
     def X(self) -> np.ndarray:
         """
@@ -177,27 +217,36 @@ def dSigma(x_predict: np.ndarray, x_train: np.ndarray, kern: GPy.kern, w_inv: np.ndarray) -> np.ndarray:
     :param x_train: Training inputs of shape (n, d)
     :param kern: Covariance of the GP model
     :param w_inv: Woodbury inverse of the posterior fit of the GP
-    :return: Gradient of the posterior covariance of shape (q, q, q, d)
+    :return: Gradient of the posterior covariance of shape (q, q, q, d). Here, res[i, j, k, l] is the derivative
+             of the [i, j]-th entry of the posterior covariance matrix with respect to x_predict[k, l]
     """
     q, d, n = x_predict.shape[0], x_predict.shape[1], x_train.shape[0]
-    dkxX_dx = np.empty((q, n, d))
-    dkxx_dx = np.empty((q, q, d))
+    # Tensor for the gradients of the (q, n) cross-covariance matrix between x_predict and x_train with
+    # respect to x_predict (of shape (q, d)):
+    d_cross_cov_xpredict_xtrain_dx = np.zeros((d, q * q, n))
+    # Tensor for the gradients of the full covariance matrix at points x_predict (of shape (q, q)) with
+    # respect to x_predict (of shape (q, d)):
+    d_cov_xpredict_dx = np.zeros((d, q * q, q))
     for i in range(d):
-        dkxX_dx[:, :, i] = kern.dK_dX(x_predict, x_train, i)
-        dkxx_dx[:, :, i] = kern.dK_dX(x_predict, x_predict, i)
+        # Fill d_cross_cov_xpredict_xtrain_dx such that after reshaping to (d, q, q, n), entry [i, j] is
+        # the derivative of the cross-covariance between x_predict and x_train (of shape (q, n)) with
+        # respect to the scalar x_predict[j, i]
+        d_cross_cov_xpredict_xtrain_dx[i, ::q + 1, :] = kern.dK_dX(x_predict, x_train, i)
+        # Fill d_cov_xpredict_dx such that after reshaping to (d, q, q, q), entry [i, j] is the derivative
+        # of the prior covariance at x_predict (of shape (q, q)) with respect to the scalar x_predict[j, i]
+        d_cov_xpredict_dx[i, ::q + 1, :] = kern.dK_dX(x_predict, x_predict, i)
+    d_cross_cov_xpredict_xtrain_dx = d_cross_cov_xpredict_xtrain_dx.reshape((d, q, q, n))
+    d_cov_xpredict_dx = d_cov_xpredict_dx.reshape((d, q, q, q))
+    d_cov_xpredict_dx += d_cov_xpredict_dx.transpose((0, 1, 3, 2))
+    d_cov_xpredict_dx.reshape((d, q, -1))[:, :, ::q + 1] = 0.
+
     K = kern.K(x_predict, x_train)
-
-    dsigma = np.zeros((q, q, q, d))
-    for i in range(q):
-        for j in range(d):
-            Ks = np.zeros((q, n))
-            Ks[i, :] = dkxX_dx[i, :, j]
-            dKss_dxi = np.zeros((q, q))
-            dKss_dxi[i, :] = dkxx_dx[i, :, j]
-            dKss_dxi[:, i] = dkxx_dx[i, :, j].T
-            dKss_dxi[i, i] = 0
-            dsigma[:, :, i, j] = dKss_dxi - Ks @ w_inv @ K.T - K @ w_inv @ Ks.T
-    return dsigma
+    dsigma = (
+        d_cov_xpredict_dx
+        - K @ w_inv @ d_cross_cov_xpredict_xtrain_dx.transpose((0, 1, 3, 2))
+        - d_cross_cov_xpredict_xtrain_dx @ w_inv @ K.T
+    )
+    return dsigma.transpose((2, 3, 1, 0))


 def dmean(x_predict: np.ndarray, x_train: np.ndarray, kern: GPy.kern, w_vec: np.ndarray) -> np.ndarray:
@@ -211,12 +260,14 @@ def dmean(x_predict: np.ndarray, x_train: np.ndarray, kern: GPy.kern, w_vec: np.ndarray) -> np.ndarray:
     :return: Gradient of the posterior mean of shape (q, q, d)
     """
     q, d, n = x_predict.shape[0], x_predict.shape[1], x_train.shape[0]
-    dkxX_dx = np.empty((q, n, d))
+    # Tensor with the derivative of the (prior) cross-covariance between x_predict and x_train with
+    # respect to x_predict
+    d_cross_cov_xpredict_xtrain_dx = np.empty((q, n, d))
     dmu = np.zeros((q, q, d))
     for i in range(d):
-        dkxX_dx[:, :, i] = kern.dK_dX(x_predict, x_train, i)
+        d_cross_cov_xpredict_xtrain_dx[:, :, i] = kern.dK_dX(x_predict, x_train, i)
         for j in range(q):
-            dmu[j, j, i] = (dkxX_dx[j, :, i][None, :] @ w_vec[:, None]).flatten()
+            dmu[j, j, i] = (d_cross_cov_xpredict_xtrain_dx[j, :, i][None, :] @ w_vec[:, None]).flatten()
     return dmu


diff --git a/tests/emukit/models/test_gpy_model_wrappers.py b/tests/emukit/models/test_gpy_model_wrappers.py
new file mode 100644
index 00000000..cce5207f
--- /dev/null
+++ b/tests/emukit/models/test_gpy_model_wrappers.py
@@ -0,0 +1,53 @@
+import GPy
+import numpy as np
+import pytest
+
+from emukit.model_wrappers.gpy_model_wrappers import GPyModelWrapper
+
+
+@pytest.fixture
+def test_data(gpy_model):
+    np.random.seed(42)
+    return np.random.randn(5, gpy_model.X.shape[1])
+
+
+@pytest.fixture
+def test_data2(gpy_model):
+    np.random.seed(42)
+    return np.random.randn(4, gpy_model.X.shape[1])
+
+
+def test_joint_prediction_gradients(gpy_model, test_data):
+    epsilon = 1e-5
+    mean, cov = gpy_model.predict_with_full_covariance(test_data)
+    # Get the analytical gradients
+    mean_dx, cov_dx = gpy_model.get_joint_prediction_gradients(test_data)
+
+    for i in range(test_data.shape[0]):  # Iterate over each test point
+        for j in range(test_data.shape[1]):  # Iterate over each input dimension
+            # Approximate the gradient numerically
+            perturbed_input = test_data.copy()
+            perturbed_input[i, j] += epsilon
+            mean_perturbed, cov_perturbed = gpy_model.predict_with_full_covariance(perturbed_input)
+            mean_dx_numerical = (mean_perturbed - mean) / epsilon
+            cov_dx_numerical = (cov_perturbed - cov) / epsilon
+            # Check that the numerical approximation is close to the analytical gradient
+            assert pytest.approx(mean_dx_numerical.ravel(), abs=1e-8, rel=1e-2) == mean_dx[:, i, j]
+            assert pytest.approx(cov_dx_numerical, abs=1e-8, rel=1e-2) == cov_dx[:, :, i, j]
+
+
+def test_get_covariance_between_points_gradients(gpy_model, test_data, test_data2):
+    epsilon = 1e-5
+    cov = gpy_model.get_covariance_between_points(test_data, test_data2)
+    # Get the analytical gradients
+    cov_dx = gpy_model.get_covariance_between_points_gradients(test_data, test_data2)
+
+    for i in range(test_data.shape[0]):  # Iterate over each test point
+        for j in range(test_data.shape[1]):  # Iterate over each input dimension
+            # Approximate the gradient numerically
+            perturbed_input = test_data.copy()
+            perturbed_input[i, j] += epsilon
+            cov_perturbed = gpy_model.get_covariance_between_points(perturbed_input, test_data2)
+            cov_dx_numerical = (cov_perturbed[i] - cov[i]) / epsilon
+            # Check that the numerical approximation is close to the analytical gradient
+            assert pytest.approx(cov_dx_numerical, abs=1e-8, rel=1e-2) == cov_dx[i, :, j]
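A note on the math being implemented (our summary, not text from the patch): `get_covariance_between_points_gradients` differentiates the standard GP posterior cross-covariance, where k is the prior kernel, X the training inputs, and K_w^{-1} the Woodbury inverse held in `self.model.posterior.woodbury_inv`. Only the terms containing x_1 contribute to the derivative:

```latex
\Sigma(x_1, x_2) = k(x_1, x_2) - k(x_1, X)\,K_w^{-1}\,k(X, x_2)
\quad\Longrightarrow\quad
\frac{\partial \Sigma(x_1, x_2)}{\partial x_1}
  = \frac{\partial k(x_1, x_2)}{\partial x_1}
  - \frac{\partial k(x_1, X)}{\partial x_1}\,K_w^{-1}\,k(X, x_2)
```

This is exactly `cov_X1_X2_grad - cov_X1_Xtrain_grad @ woodbury_inv @ cov_Xtrain_X2` in the wrapper; `dSigma` applies the same identity batched over all entries of the (q, q) posterior covariance, with the extra symmetrization step because x_predict appears in both arguments of the kernel.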
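For reviewers, a minimal usage sketch of the new interface (not part of the patch; the RBF kernel, data shapes, and tolerances are illustrative assumptions, and any GPy kernel exposing `dK_dX` should work):

```python
import GPy
import numpy as np

from emukit.model_wrappers.gpy_model_wrappers import GPyModelWrapper

# Fit a small GP regression model on toy data (assumed setup, mirroring the test fixtures)
rng = np.random.RandomState(0)
X_train, Y_train = rng.randn(20, 2), rng.randn(20, 1)
model = GPyModelWrapper(GPy.models.GPRegression(X_train, Y_train, GPy.kern.RBF(input_dim=2)))

# Posterior cross-covariance between two sets of prediction inputs, and its gradient wrt X1
X1, X2 = rng.randn(5, 2), rng.randn(4, 2)
cov = model.get_covariance_between_points(X1, X2)               # shape (5, 4)
cov_dx = model.get_covariance_between_points_gradients(X1, X2)  # shape (5, 4, 2)

# Finite-difference spot check of one entry, mirroring the added unit test
eps = 1e-6
X1_perturbed = X1.copy()
X1_perturbed[0, 0] += eps
cov_perturbed = model.get_covariance_between_points(X1_perturbed, X2)
np.testing.assert_allclose((cov_perturbed[0] - cov[0]) / eps, cov_dx[0, :, 0], rtol=1e-2, atol=1e-6)
```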