Generalize implementation of Gaussian expectations of Gaussian kernels #1607

Open · wants to merge 4 commits into base: develop
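A minimal usage sketch of the code path this PR generalizes (not part of the diff; kernel parameters, shapes, and values are illustrative, and the call mirrors the expectation API exercised in the new test added below): with two different SquaredExponential kernels and two different sets of inducing points, the cross-expectation <K1(Z1, x_n) K2(x_n, Z2)>_p(x_n) can be computed for a full Gaussian p, returning an NxM1xM2 array.

    import numpy as np
    import gpflow
    from gpflow.expectations import expectation
    from gpflow.probability_distributions import Gaussian

    N, D, M1, M2 = 5, 3, 4, 6
    rng = np.random.default_rng(0)

    # Two *different* RBF (SquaredExponential) kernels on the same active dimensions
    k1 = gpflow.kernels.SquaredExponential(variance=0.7, lengthscales=0.9)
    k2 = gpflow.kernels.SquaredExponential(variance=1.3, lengthscales=1.5)

    # Two *different* sets of inducing points
    iv1 = gpflow.inducing_variables.InducingPoints(rng.standard_normal((M1, D)))
    iv2 = gpflow.inducing_variables.InducingPoints(rng.standard_normal((M2, D)))

    # Gaussian input distribution p(x_n) = N(mu_n, S_n) with full NxDxD covariances
    mu = rng.standard_normal((N, D))
    L = rng.standard_normal((N, D, D))
    p = Gaussian(mu, L @ np.transpose(L, (0, 2, 1)))

    eKzxKxz = expectation(p, (k1, iv1), (k2, iv2))  # shape (N, M1, M2)
    print(eKzxKxz.shape)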
158 changes: 105 additions & 53 deletions gpflow/expectations/squared_exponentials.py
@@ -170,65 +170,117 @@ def _E(p, kern1, feat1, kern2, feat2, nghp=None):
"""
Compute the expectation:
expectation[n] = <Ka_{Z1, x_n} Kb_{x_n, Z2}>_p(x_n)
-     - Ka_{.,.}, Kb_{.,.} :: RBF kernels
- Ka and Kb as well as Z1 and Z2 can differ from each other, but this is supported
- only if the Gaussian p is Diagonal (p.cov NxD) and Ka, Kb have disjoint active_dims
- in which case the joint expectations simplify into a product of expectations
+     - Ka_{.,.}, Kb_{.,.} :: RBF kernels

- :return: NxMxM
+ :return: NxM1xM2
"""
- if kern1.on_separate_dims(kern2) and isinstance(
-     p, DiagonalGaussian
- ):  # no joint expectations required
-     eKxz1 = expectation(p, (kern1, feat1))
+ if kern1.on_separate_dims(kern2) and isinstance(p, DiagonalGaussian):
+     eKxz1 = expectation(p, (kern1, feat1))  # No joint expectations required
eKxz2 = expectation(p, (kern2, feat2))
return eKxz1[:, :, None] * eKxz2[:, None, :]

- if feat1 != feat2 or kern1 != kern2:
+ if kern1.on_separate_dims(kern2):
raise NotImplementedError(
-     "The expectation over two kernels has only an "
-     "analytical implementation if both kernels are equal."
+     "The expectation over two kernels only has an "
+     "analytical implementation if both kernels have "
+     "the same active features."
)

- kernel = kern1
- inducing_variable = feat1
-
- # use only active dimensions
- Xcov = kernel.slice_cov(tf.linalg.diag(p.cov) if isinstance(p, DiagonalGaussian) else p.cov)
- Z, Xmu = kernel.slice(inducing_variable.Z, p.mu)
-
- N = tf.shape(Xmu)[0]
- D = tf.shape(Xmu)[1]
-
- squared_lengthscales = kernel.lengthscales ** 2
- if not kernel.ard:
-     zero_lengthscales = tf.zeros((D,), dtype=squared_lengthscales.dtype)
-     squared_lengthscales = squared_lengthscales + zero_lengthscales
-
- sqrt_det_L = tf.reduce_prod(0.5 * squared_lengthscales) ** 0.5
- C = tf.linalg.cholesky(0.5 * tf.linalg.diag(squared_lengthscales) + Xcov)  # NxDxD
- dets = sqrt_det_L / tf.exp(tf.reduce_sum(tf.math.log(tf.linalg.diag_part(C)), axis=1))  # N
-
- C_inv_mu = tf.linalg.triangular_solve(C, tf.expand_dims(Xmu, 2), lower=True)  # NxDx1
- C_inv_z = tf.linalg.triangular_solve(
-     C, tf.tile(tf.expand_dims(0.5 * tf.transpose(Z), 0), [N, 1, 1]), lower=True
- )  # NxDxM
- mu_CC_inv_mu = tf.expand_dims(tf.reduce_sum(tf.square(C_inv_mu), 1), 2)  # Nx1x1
- z_CC_inv_z = tf.reduce_sum(tf.square(C_inv_z), 1)  # NxM
- zm_CC_inv_zn = tf.linalg.matmul(C_inv_z, C_inv_z, transpose_a=True)  # NxMxM
- two_z_CC_inv_mu = 2 * tf.linalg.matmul(C_inv_z, C_inv_mu, transpose_a=True)[:, :, 0]  # NxM
- # NxMxM
- exponent_mahalanobis = (
-     mu_CC_inv_mu
-     + tf.expand_dims(z_CC_inv_z, 1)
-     + tf.expand_dims(z_CC_inv_z, 2)
-     + 2 * zm_CC_inv_zn
-     - tf.expand_dims(two_z_CC_inv_mu, 2)
-     - tf.expand_dims(two_z_CC_inv_mu, 1)
- )
- exponent_mahalanobis = tf.exp(-0.5 * exponent_mahalanobis)  # NxMxM
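# Reference (standard Gaussian identities, stated in the notation of the code below:
# s_i**2 = kern_i.variance, V_i = kern_i.lengthscales**2, p(x) = N(x | mx, Sxx)):
#   E_p[k1(z1, x) k2(x, z2)]
#     = s1**2 * s2**2 * exp(-0.5*(z1 - z2)' (V1 + V2)^{-1} (z1 - z2))    ... `matrix_term`
#       * |V|^{1/2} * |V + Sxx|^{-1/2}                                   ... `determinant`
#       * exp(-0.5*(zbar - mx)' (V + Sxx)^{-1} (zbar - mx))              ... `exp_mahalanobis`
#   where V = (V1^{-1} + V2^{-1})^{-1} and zbar = V (V1^{-1} z1 + V2^{-1} z2).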
is_same_kern = kern1 == kern2 # code branches by case for
is_same_feat = feat1 == feat2 # computational efficiency

mx = kern1.slice(p.mu)[0]
if isinstance(p, DiagonalGaussian):
Sxx = kern1.slice_cov(tf.linalg.diag(p.cov))
else:
Sxx = kern1.slice_cov(p.cov)

N = tf.shape(mx)[0] # num. random inputs $x$
D = tf.shape(mx)[1] # dimensionality of $x$

# First Gaussian kernel $k1(x, z) = exp(-0.5*(x - z) V1^{-1} (x - z))$
V1 = kern1.lengthscales ** 2 # D|1
z1 = kern1.slice(feat1.Z)[0] # M1xD
iV1_z1 = (1 / V1) * z1

# Second Gaussian kernel $k2(x, z) = exp(-0.5*(x - z) V2^{-1} (x - z))$
V2 = V1 if is_same_kern else kern2.lengthscales ** 2 # D|1
z2 = z1 if is_same_feat else kern2.slice(feat2.Z)[0] # M2xD
iV2_z2 = iV1_z1 if (is_same_kern and is_same_feat) else (1 / V2) * z2

# Product of Gaussian kernels is another Gaussian kernel $k = k1 * k2$
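# For reference, the standard product-of-Gaussians identity being used here:
#   exp(-0.5*(x - z1)' V1^{-1} (x - z1)) * exp(-0.5*(x - z2)' V2^{-1} (x - z2))
#     = const(z1, z2) * exp(-0.5*(x - zbar)' V^{-1} (x - zbar)),
#   with V^{-1} = V1^{-1} + V2^{-1} and zbar = V (V1^{-1} z1 + V2^{-1} z2);
#   the x-independent factor const(z1, z2) is handled below as `matrix_term`.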
V = 0.5 * V1 if is_same_kern else (V1 * V2) / (V1 + V2) # D|1
if not (kern1.ard or kern2.ard):
V = tf.fill((D,), V) # D

# Product of Gaussians is an unnormalized Gaussian; compute determinant of
# this new Gaussian (and the Gaussian kernel) in order to normalize
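# For reference, the standard Gaussian integral behind this normalization:
#   int N(x | mx, Sxx) exp(-0.5*(x - z)' V^{-1} (x - z)) dx
#     = |V|^{1/2} |Sxx + V|^{-1/2} exp(-0.5*(z - mx)' (Sxx + V)^{-1} (z - mx)),
# i.e. `determinant` below is |diag(V)|^{1/2} / |S|^{1/2} with S = Sxx + diag(V).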
S = Sxx + tf.linalg.diag(V)
L = tf.linalg.cholesky(S) # NxDxD
half_logdet_L = tf.reduce_sum(tf.math.log(tf.linalg.diag_part(L)), axis=1)
sqrt_det_iL = tf.exp(-half_logdet_L)
sqrt_det_L = tf.sqrt(tf.reduce_prod(V))
determinant = sqrt_det_L * sqrt_det_iL # N

# Solve for linear systems involving $S = LL^{T}$ where $S$
# is the covariance of an (unnormalized) Gaussian distribution
iL_mu = tf.linalg.triangular_solve(L, tf.expand_dims(mx, 2), lower=True) # NxDx1

V_iV1_z1 = tf.expand_dims(tf.transpose(V * iV1_z1), 0)
iL_z1 = tf.linalg.triangular_solve(L, tf.tile(V_iV1_z1, [N, 1, 1]), lower=True) # NxDxM1

z1_iS_z1 = tf.reduce_sum(tf.square(iL_z1), axis=1) # NxM1
z1_iS_mu = tf.squeeze(tf.linalg.matmul(iL_z1, iL_mu, transpose_a=True), 2) # NxM1
if is_same_kern and is_same_feat:
iL_z2 = iL_z1
z2_iS_z2 = z1_iS_z1
z2_iS_mu = z1_iS_mu
else:
V_iV2_z2 = tf.expand_dims(tf.transpose(V * iV2_z2), 0)
iL_z2 = tf.linalg.triangular_solve(L, tf.tile(V_iV2_z2, [N, 1, 1]), lower=True) # NxDxM2

z2_iS_z2 = tf.reduce_sum(tf.square(iL_z2), 1) # NxM2
z2_iS_mu = tf.squeeze(tf.matmul(iL_z2, iL_mu, transpose_a=True), 2) # NxM2

z1_iS_z2 = tf.linalg.matmul(iL_z1, iL_z2, transpose_a=True) # NxM1xM2
mu_iS_mu = tf.expand_dims(tf.reduce_sum(tf.square(iL_mu), 1), 2) # Nx1x1

# Gram matrix from Gaussian integral of Gaussian kernel $k = k1 * k2$
exp_mahalanobis = tf.exp(
-0.5
* (
mu_iS_mu
+ 2 * z1_iS_z2
+ tf.expand_dims(z1_iS_z1 - 2 * z1_iS_mu, axis=-1)
+ tf.expand_dims(z2_iS_z2 - 2 * z2_iS_mu, axis=-2)
)
) # NxM1xM2

# Part of $E_{p(x)}[k1(z1, x) k2(x, z2)]$ that is independent of $x$
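# For reference: when kern1 == kern2 (so V1 == V2 == 2*V), the x-independent factor
#   s1**2 * s2**2 * exp(-0.5*(z1 - z2)' (V1 + V2)^{-1} (z1 - z2))
# reduces to variance**2 * exp(-0.25*(z1 - z2)' V1^{-1} (z1 - z2)), i.e. the
# -0.125 * square_distance(sq_iV * z1, sq_iV * z2) form used in the branch below.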
if is_same_kern:
ampl2 = kern1.variance ** 2
sq_iV = tf.math.rsqrt(V)
if is_same_feat:
matrix_term = ampl2 * tf.exp(-0.125 * square_distance(sq_iV * z1, None))
else:
matrix_term = ampl2 * tf.exp(-0.125 * square_distance(sq_iV * z1, sq_iV * z2))
else:
z1_iV1_z1 = tf.reduce_sum(z1 * iV1_z1, axis=-1) # M1
z2_iV2_z2 = tf.reduce_sum(z2 * iV2_z2, axis=-1) # M2
z1_iV1pV2_z1 = tf.reduce_sum(iV1_z1 * V * iV1_z1, axis=-1)
z2_iV1pV2_z2 = tf.reduce_sum(iV2_z2 * V * iV2_z2, axis=-1)
z1_iV1pV2_z2 = tf.matmul(iV1_z1, V * iV2_z2, transpose_b=True) # M1xM2
matrix_term = (
kern1.variance
* kern2.variance
* tf.exp(
0.5
* (
2 * z1_iV1pV2_z2 # implicit negative
+ tf.expand_dims(z1_iV1pV2_z1 - z1_iV1_z1, axis=-1)
+ tf.expand_dims(z2_iV1pV2_z2 - z2_iV2_z2, axis=-2)
)
)
)

- # Compute sqrt(self(Z)) explicitly to prevent automatic gradient from
- # being NaN sometimes, see pull request #615
- kernel_sqrt = tf.exp(-0.25 * square_distance(Z / kernel.lengthscales, None))
- return kernel.variance ** 2 * kernel_sqrt * tf.reshape(dets, [N, 1, 1]) * exponent_mahalanobis
+ return tf.reshape(determinant, [N, 1, 1]) * matrix_term * exp_mahalanobis
1 change: 0 additions & 1 deletion notebooks

This file was deleted.

16 changes: 16 additions & 0 deletions tests/gpflow/expectations/test_expectations.py
@@ -38,6 +38,7 @@
Xcov = rng.randn(num_data, D_in, D_in)
Xcov = ctt(Xcov @ np.transpose(Xcov, (0, 2, 1)))
Z = rng.randn(num_ind, D_in)
Z2 = rng.randn(num_ind, D_in)


def markov_gauss():
@@ -67,6 +68,7 @@ def markov_gauss():

_kerns = {
"rbf": kernels.SquaredExponential(variance=rng.rand(), lengthscales=rng.rand() + 1.0),
"rbf2": kernels.SquaredExponential(variance=rng.rand(), lengthscales=rng.rand() + 1.0),
"lin": kernels.Linear(variance=rng.rand()),
"matern": kernels.Matern32(variance=rng.rand()),
"rbf_act_dim_0": kernels.SquaredExponential(
@@ -119,6 +121,11 @@ def inducing_variable():
return inducing_variables.InducingPoints(Z)


@pytest.fixture
def inducing_variable2():
return inducing_variables.InducingPoints(Z2)


def _check(params):
analytic = expectation(*params)
quad = quadrature_expectation(*params)
@@ -242,6 +249,15 @@ def test_eKzxKxz_same_vs_different_sum_kernels(distribution, kern1, kern2, induc
assert_allclose(same, different, rtol=RTOL)


@pytest.mark.parametrize("distribution", distrs("gauss", "gauss_diag"))
@pytest.mark.parametrize("kern1", kerns("rbf"))
@pytest.mark.parametrize("kern2", kerns("rbf2"))
def test_RBF_eKzxKxz_differentK_differentZ(
distribution, kern1, kern2, inducing_variable, inducing_variable2
):
_check((distribution, (kern1, inducing_variable), (kern2, inducing_variable2)))


@pytest.mark.parametrize("distribution", distrs("markov_gauss"))
@pytest.mark.parametrize("kernel", kern_args2)
@pytest.mark.parametrize("mean", means("identity"))