Generalize implementation of Gaussian expectations of Gaussian kernels #1607

Open · wants to merge 4 commits into base: develop
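A minimal usage sketch of the code path this PR generalizes (not part of the diff; kernel parameters, shapes, and values are illustrative, and the call mirrors the expectation API exercised in the new test added below): with two different SquaredExponential kernels and two different sets of inducing points, the cross-expectation <K1(Z1, x_n) K2(x_n, Z2)>_p(x_n) can be computed for a full Gaussian p, returning an NxM1xM2 array.

    import numpy as np
    import gpflow
    from gpflow.expectations import expectation
    from gpflow.probability_distributions import Gaussian

    N, D, M1, M2 = 5, 3, 4, 6
    rng = np.random.default_rng(0)

    # Two *different* RBF (SquaredExponential) kernels on the same active dimensions
    k1 = gpflow.kernels.SquaredExponential(variance=0.7, lengthscales=0.9)
    k2 = gpflow.kernels.SquaredExponential(variance=1.3, lengthscales=1.5)

    # Two *different* sets of inducing points
    iv1 = gpflow.inducing_variables.InducingPoints(rng.standard_normal((M1, D)))
    iv2 = gpflow.inducing_variables.InducingPoints(rng.standard_normal((M2, D)))

    # Gaussian input distribution p(x_n) = N(mu_n, S_n) with full NxDxD covariances
    mu = rng.standard_normal((N, D))
    L = rng.standard_normal((N, D, D))
    p = Gaussian(mu, L @ np.transpose(L, (0, 2, 1)))

    eKzxKxz = expectation(p, (k1, iv1), (k2, iv2))  # shape (N, M1, M2)
    print(eKzxKxz.shape)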
158 changes: 105 additions & 53 deletions gpflow/expectations/squared_exponentials.py
@@ -170,65 +170,117 @@ def _E(p, kern1, feat1, kern2, feat2, nghp=None):
"""
Compute the expectation:
expectation[n] = <Ka_{Z1, x_n} Kb_{x_n, Z2}>_p(x_n)
-     - Ka_{.,.}, Kb_{.,.} :: RBF kernels
- Ka and Kb as well as Z1 and Z2 can differ from each other, but this is supported
- only if the Gaussian p is Diagonal (p.cov NxD) and Ka, Kb have disjoint active_dims
- in which case the joint expectations simplify into a product of expectations
+     - Ka_{.,.}, Kb_{.,.} :: RBF kernels

- :return: NxMxM
+ :return: NxM1xM2
"""
- if kern1.on_separate_dims(kern2) and isinstance(
-     p, DiagonalGaussian
- ):  # no joint expectations required
-     eKxz1 = expectation(p, (kern1, feat1))
+ if kern1.on_separate_dims(kern2) and isinstance(p, DiagonalGaussian):
+     eKxz1 = expectation(p, (kern1, feat1))  # No joint expectations required
eKxz2 = expectation(p, (kern2, feat2))
return eKxz1[:, :, None] * eKxz2[:, None, :]

- if feat1 != feat2 or kern1 != kern2:
+ if kern1.on_separate_dims(kern2):
raise NotImplementedError(
-     "The expectation over two kernels has only an "
-     "analytical implementation if both kernels are equal."
+     "The expectation over two kernels only has an "
+     "analytical implementation if both kernels have "
+     "the same active features."
)

- kernel = kern1
- inducing_variable = feat1
-
- # use only active dimensions
- Xcov = kernel.slice_cov(tf.linalg.diag(p.cov) if isinstance(p, DiagonalGaussian) else p.cov)
- Z, Xmu = kernel.slice(inducing_variable.Z, p.mu)
-
- N = tf.shape(Xmu)[0]
- D = tf.shape(Xmu)[1]
-
- squared_lengthscales = kernel.lengthscales ** 2
- if not kernel.ard:
-     zero_lengthscales = tf.zeros((D,), dtype=squared_lengthscales.dtype)
-     squared_lengthscales = squared_lengthscales + zero_lengthscales
-
- sqrt_det_L = tf.reduce_prod(0.5 * squared_lengthscales) ** 0.5
- C = tf.linalg.cholesky(0.5 * tf.linalg.diag(squared_lengthscales) + Xcov)  # NxDxD
- dets = sqrt_det_L / tf.exp(tf.reduce_sum(tf.math.log(tf.linalg.diag_part(C)), axis=1))  # N
-
- C_inv_mu = tf.linalg.triangular_solve(C, tf.expand_dims(Xmu, 2), lower=True)  # NxDx1
- C_inv_z = tf.linalg.triangular_solve(
-     C, tf.tile(tf.expand_dims(0.5 * tf.transpose(Z), 0), [N, 1, 1]), lower=True
- )  # NxDxM
- mu_CC_inv_mu = tf.expand_dims(tf.reduce_sum(tf.square(C_inv_mu), 1), 2)  # Nx1x1
- z_CC_inv_z = tf.reduce_sum(tf.square(C_inv_z), 1)  # NxM
- zm_CC_inv_zn = tf.linalg.matmul(C_inv_z, C_inv_z, transpose_a=True)  # NxMxM
- two_z_CC_inv_mu = 2 * tf.linalg.matmul(C_inv_z, C_inv_mu, transpose_a=True)[:, :, 0]  # NxM
- # NxMxM
- exponent_mahalanobis = (
-     mu_CC_inv_mu
-     + tf.expand_dims(z_CC_inv_z, 1)
-     + tf.expand_dims(z_CC_inv_z, 2)
-     + 2 * zm_CC_inv_zn
-     - tf.expand_dims(two_z_CC_inv_mu, 2)
-     - tf.expand_dims(two_z_CC_inv_mu, 1)
- )
- exponent_mahalanobis = tf.exp(-0.5 * exponent_mahalanobis)  # NxMxM
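# Reference (standard Gaussian identities, stated in the notation of the code below:
# s_i**2 = kern_i.variance, V_i = kern_i.lengthscales**2, p(x) = N(x | mx, Sxx)):
#   E_p[k1(z1, x) k2(x, z2)]
#     = s1**2 * s2**2 * exp(-0.5*(z1 - z2)' (V1 + V2)^{-1} (z1 - z2))    ... `matrix_term`
#       * |V|^{1/2} * |V + Sxx|^{-1/2}                                   ... `determinant`
#       * exp(-0.5*(zbar - mx)' (V + Sxx)^{-1} (zbar - mx))              ... `exp_mahalanobis`
#   where V = (V1^{-1} + V2^{-1})^{-1} and zbar = V (V1^{-1} z1 + V2^{-1} z2).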
is_same_kern = kern1 == kern2 # code branches by case for
is_same_feat = feat1 == feat2 # computational efficiency

mx = kern1.slice(p.mu)[0]
if isinstance(p, DiagonalGaussian):
Sxx = kern1.slice_cov(tf.linalg.diag(p.cov))
else:
Sxx = kern1.slice_cov(p.cov)

N = tf.shape(mx)[0] # num. random inputs $x$
D = tf.shape(mx)[1] # dimensionality of $x$

# First Gaussian kernel $k1(x, z) = exp(-0.5*(x - z) V1^{-1} (x - z))$
V1 = kern1.lengthscales ** 2 # D|1
z1 = kern1.slice(feat1.Z)[0] # M1xD
iV1_z1 = (1 / V1) * z1

# Second Gaussian kernel $k2(x, z) = exp(-0.5*(x - z) V2^{-1} (x - z))$
V2 = V1 if is_same_kern else kern2.lengthscales ** 2 # D|1
z2 = z1 if is_same_feat else kern2.slice(feat2.Z)[0] # M2xD
iV2_z2 = iV1_z1 if (is_same_kern and is_same_feat) else (1 / V2) * z2

# Product of Gaussian kernels is another Gaussian kernel $k = k1 * k2$
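# For reference, the standard product-of-Gaussians identity being used here:
#   exp(-0.5*(x - z1)' V1^{-1} (x - z1)) * exp(-0.5*(x - z2)' V2^{-1} (x - z2))
#     = const(z1, z2) * exp(-0.5*(x - zbar)' V^{-1} (x - zbar)),
#   with V^{-1} = V1^{-1} + V2^{-1} and zbar = V (V1^{-1} z1 + V2^{-1} z2);
#   the x-independent factor const(z1, z2) is handled below as `matrix_term`.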
V = 0.5 * V1 if is_same_kern else (V1 * V2) / (V1 + V2) # D|1
if not (kern1.ard or kern2.ard):
V = tf.fill((D,), V) # D

# Product of Gaussians is an unnormalized Gaussian; compute determinant of
# this new Gaussian (and the Gaussian kernel) in order to normalize
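# For reference, the standard Gaussian integral behind this normalization:
#   int N(x | mx, Sxx) exp(-0.5*(x - z)' V^{-1} (x - z)) dx
#     = |V|^{1/2} |Sxx + V|^{-1/2} exp(-0.5*(z - mx)' (Sxx + V)^{-1} (z - mx)),
# i.e. `determinant` below is |diag(V)|^{1/2} / |S|^{1/2} with S = Sxx + diag(V).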
S = Sxx + tf.linalg.diag(V)
L = tf.linalg.cholesky(S) # NxDxD
half_logdet_L = tf.reduce_sum(tf.math.log(tf.linalg.diag_part(L)), axis=1)
sqrt_det_iL = tf.exp(-half_logdet_L)
sqrt_det_L = tf.sqrt(tf.reduce_prod(V))
determinant = sqrt_det_L * sqrt_det_iL # N

# Solve for linear systems involving $S = LL^{T}$ where $S$
# is the covariance of an (unnormalized) Gaussian distribution
iL_mu = tf.linalg.triangular_solve(L, tf.expand_dims(mx, 2), lower=True) # NxDx1

V_iV1_z1 = tf.expand_dims(tf.transpose(V * iV1_z1), 0)
iL_z1 = tf.linalg.triangular_solve(L, tf.tile(V_iV1_z1, [N, 1, 1]), lower=True) # NxDxM1

z1_iS_z1 = tf.reduce_sum(tf.square(iL_z1), axis=1) # NxM1
z1_iS_mu = tf.squeeze(tf.linalg.matmul(iL_z1, iL_mu, transpose_a=True), 2) # NxM1
if is_same_kern and is_same_feat:
iL_z2 = iL_z1
z2_iS_z2 = z1_iS_z1
z2_iS_mu = z1_iS_mu
else:
V_iV2_z2 = tf.expand_dims(tf.transpose(V * iV2_z2), 0)
iL_z2 = tf.linalg.triangular_solve(L, tf.tile(V_iV2_z2, [N, 1, 1]), lower=True) # NxDxM2

z2_iS_z2 = tf.reduce_sum(tf.square(iL_z2), 1) # NxM2
z2_iS_mu = tf.squeeze(tf.matmul(iL_z2, iL_mu, transpose_a=True), 2) # NxM2

z1_iS_z2 = tf.linalg.matmul(iL_z1, iL_z2, transpose_a=True) # NxM1xM2
mu_iS_mu = tf.expand_dims(tf.reduce_sum(tf.square(iL_mu), 1), 2) # Nx1x1

# Gram matrix from Gaussian integral of Gaussian kernel $k = k1 * k2$
exp_mahalanobis = tf.exp(
-0.5
* (
mu_iS_mu
+ 2 * z1_iS_z2
+ tf.expand_dims(z1_iS_z1 - 2 * z1_iS_mu, axis=-1)
+ tf.expand_dims(z2_iS_z2 - 2 * z2_iS_mu, axis=-2)
)
) # NxM1xM2

# Part of $E_{p(x)}[k1(z1, x) k2(x, z2)]$ that is independent of $x$
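# For reference: when kern1 == kern2 (so V1 == V2 == 2*V), the x-independent factor
#   s1**2 * s2**2 * exp(-0.5*(z1 - z2)' (V1 + V2)^{-1} (z1 - z2))
# reduces to variance**2 * exp(-0.25*(z1 - z2)' V1^{-1} (z1 - z2)), i.e. the
# -0.125 * square_distance(sq_iV * z1, sq_iV * z2) form used in the branch below.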
if is_same_kern:
ampl2 = kern1.variance ** 2
sq_iV = tf.math.rsqrt(V)
if is_same_feat:
matrix_term = ampl2 * tf.exp(-0.125 * square_distance(sq_iV * z1, None))
else:
matrix_term = ampl2 * tf.exp(-0.125 * square_distance(sq_iV * z1, sq_iV * z2))
else:
z1_iV1_z1 = tf.reduce_sum(z1 * iV1_z1, axis=-1) # M1
z2_iV2_z2 = tf.reduce_sum(z2 * iV2_z2, axis=-1) # M2
z1_iV1pV2_z1 = tf.reduce_sum(iV1_z1 * V * iV1_z1, axis=-1)
z2_iV1pV2_z2 = tf.reduce_sum(iV2_z2 * V * iV2_z2, axis=-1)
z1_iV1pV2_z2 = tf.matmul(iV1_z1, V * iV2_z2, transpose_b=True) # M1xM2
matrix_term = (
kern1.variance
* kern2.variance
* tf.exp(
0.5
* (
2 * z1_iV1pV2_z2 # implicit negative
+ tf.expand_dims(z1_iV1pV2_z1 - z1_iV1_z1, axis=-1)
+ tf.expand_dims(z2_iV1pV2_z2 - z2_iV2_z2, axis=-2)
)
)
)

- # Compute sqrt(self(Z)) explicitly to prevent automatic gradient from
- # being NaN sometimes, see pull request #615
- kernel_sqrt = tf.exp(-0.25 * square_distance(Z / kernel.lengthscales, None))
- return kernel.variance ** 2 * kernel_sqrt * tf.reshape(dets, [N, 1, 1]) * exponent_mahalanobis
+ return tf.reshape(determinant, [N, 1, 1]) * matrix_term * exp_mahalanobis
1 change: 0 additions & 1 deletion notebooks

This file was deleted.

16 changes: 16 additions & 0 deletions tests/gpflow/expectations/test_expectations.py
@@ -38,6 +38,7 @@
Xcov = rng.randn(num_data, D_in, D_in)
Xcov = ctt(Xcov @ np.transpose(Xcov, (0, 2, 1)))
Z = rng.randn(num_ind, D_in)
Z2 = rng.randn(num_ind, D_in)


def markov_gauss():
@@ -67,6 +68,7 @@ def markov_gauss():

_kerns = {
"rbf": kernels.SquaredExponential(variance=rng.rand(), lengthscales=rng.rand() + 1.0),
"rbf2": kernels.SquaredExponential(variance=rng.rand(), lengthscales=rng.rand() + 1.0),
"lin": kernels.Linear(variance=rng.rand()),
"matern": kernels.Matern32(variance=rng.rand()),
"rbf_act_dim_0": kernels.SquaredExponential(
@@ -119,6 +121,11 @@ def inducing_variable():
return inducing_variables.InducingPoints(Z)


@pytest.fixture
def inducing_variable2():
return inducing_variables.InducingPoints(Z2)


def _check(params):
analytic = expectation(*params)
quad = quadrature_expectation(*params)
@@ -242,6 +249,15 @@ def test_eKzxKxz_same_vs_different_sum_kernels(distribution, kern1, kern2, induc
assert_allclose(same, different, rtol=RTOL)


@pytest.mark.parametrize("distribution", distrs("gauss", "gauss_diag"))
@pytest.mark.parametrize("kern1", kerns("rbf"))
@pytest.mark.parametrize("kern2", kerns("rbf2"))
def test_RBF_eKzxKxz_differentK_differentZ(
distribution, kern1, kern2, inducing_variable, inducing_variable2
):
_check((distribution, (kern1, inducing_variable), (kern2, inducing_variable2)))


@pytest.mark.parametrize("distribution", distrs("markov_gauss"))
@pytest.mark.parametrize("kernel", kern_args2)
@pytest.mark.parametrize("mean", means("identity"))