fix Cosine kernel for multivariate inputs (#1357)
The previous version, which operated on the Euclidean distance, returned indefinite covariance matrices on multivariate data. The new version, derived from eq. 4.7 of Wilson (2014), is always positive semidefinite.
Closes #1328.

This PR also changes the definition of the cosine kernel slightly, from sigma * cos(|x - x'| / l) to sigma * cos(2 * pi * (x - x') / l). This makes the lengthscale parameter directly interpretable as period length.

It introduces new IsotropicStationary and AnisotropicStationary base classes.
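
As a quick sanity check of the new definition, a sketch like the following (not part of the diff; it assumes GPflow with this patch applied) verifies that the Cosine Gram matrix on 2-D inputs is positive semidefinite:

    import numpy as np
    import gpflow

    # Multivariate inputs of the kind that triggered #1328.
    X = np.random.randn(20, 2)

    kernel = gpflow.kernels.Cosine(lengthscales=[1.0, 2.0])
    K = kernel(X).numpy()

    # k(d) = sigma^2 * cos(2*pi*sum_i d_i) gives a PSD Gram matrix,
    # up to floating-point error in the eigensolver.
    assert np.linalg.eigvalsh(K).min() > -1e-10
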
ilia-kats committed Mar 26, 2020
1 parent c442e87 commit 8d0128c
Showing 9 changed files with 216 additions and 60 deletions.
4 changes: 2 additions & 2 deletions doc/source/notebooks/tailor/kernel_design.pct.py
@@ -33,8 +33,8 @@
# %matplotlib inline

# %% [markdown]
# To make this new kernel class, we inherit from the base class `gpflow.kernels.Kernel` and implement the three functions below. **NOTE:** Depending on the kernel to be implemented, other classes can be more adequate. For example, if the kernel to be implemented is stationary, you can immediately subclass `gpflow.kernels.Stationary` (at which point you
# only have to override `K_r` or `K_r2`; see the `Stationary` class docstring).
# To make this new kernel class, we inherit from the base class `gpflow.kernels.Kernel` and implement the three functions below. **NOTE:** Depending on the kernel to be implemented, other classes may be more suitable. For example, if the kernel to be implemented is isotropic stationary, you can immediately subclass `gpflow.kernels.IsotropicStationary` (at which point you
# only have to override `K_r` or `K_r2`; see the `IsotropicStationary` class docstring). Stationary but anisotropic kernels should subclass `gpflow.kernels.AnisotropicStationary` and override `K_d`.
#
# #### `__init__`
# In this simple example, the constructor takes no argument (though it could, if that was convenient, for example to pass in an initial value for `variance`). It *must* call the constructor of the superclass with appropriate arguments. Brownian motion is only defined in one dimension, and we'll assume that the `active_dims` are `[0]`, for simplicity.
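
For reference, a minimal kernel written against the new base class could look like the sketch below (not from this commit; the class and its form are invented for illustration):

    import gpflow

    class InverseQuadratic(gpflow.kernels.IsotropicStationary):
        """k(r) = sigma^2 / (1 + r^2), with r the scaled Euclidean distance."""

        def K_r2(self, r2):
            # K_r2 receives the squared scaled distance and must act element-wise.
            return self.variance / (1.0 + r2)
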
2 changes: 2 additions & 0 deletions gpflow/kernels/__init__.py
@@ -21,6 +21,8 @@
Matern52,
RationalQuadratic,
Stationary,
IsotropicStationary,
AnisotropicStationary,
)

Bias = Constant
19 changes: 6 additions & 13 deletions gpflow/kernels/periodic.py
@@ -5,8 +5,9 @@

from ..base import Parameter
from ..utilities import positive
from ..utilities.ops import difference_matrix
from .base import Kernel
from .stationaries import Stationary
from .stationaries import Stationary, IsotropicStationary


class Periodic(Kernel):
@@ -37,16 +38,16 @@ class Periodic(Kernel):
the constructor doesn't have it as an argument.
"""

def __init__(self, base_kernel: Stationary, period: Union[float, List[float]] = 1.0):
def __init__(self, base_kernel: IsotropicStationary, period: Union[float, List[float]] = 1.0):
"""
:param base_kernel: the base kernel to make periodic; must inherit from Stationary
Note that `active_dims` should be specified in the base kernel.
:param period: the period; to induce a different period per active dimension
this must be initialized with an array the same length as the number
of active dimensions e.g. [1., 1., 1.]
"""
if not isinstance(base_kernel, Stationary):
raise TypeError("Periodic requires a Stationary kernel as the `base_kernel`")
if not isinstance(base_kernel, IsotropicStationary):
raise TypeError("Periodic requires an IsotropicStationary kernel as the `base_kernel`")

super().__init__()
self.base_kernel = base_kernel
@@ -65,15 +66,7 @@ def K_diag(self, X: tf.Tensor) -> tf.Tensor:
return self.base_kernel.K_diag(X)

def K(self, X: tf.Tensor, X2: Optional[tf.Tensor] = None) -> tf.Tensor:
if X2 is None:
X2 = X

# Introduce dummy dimension so we can use broadcasting
# TODO: does not support kernel broadcasting
f = tf.expand_dims(X, 1) # now [N, 1, D]
f2 = tf.expand_dims(X2, 0) # now [1, M, D]

r = np.pi * (f - f2) / self.period
r = np.pi * (difference_matrix(X, X2)) / self.period
scaled_sine = tf.sin(r) / self.base_kernel.lengthscales
if hasattr(self.base_kernel, "K_r"):
sine_r = tf.reduce_sum(tf.abs(scaled_sine), -1)
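
Construction of the periodic kernel is unchanged by this refactor; a minimal usage sketch (assuming the patched API) looks like:

    import numpy as np
    import gpflow

    # The base kernel must now be IsotropicStationary; SquaredExponential qualifies.
    base = gpflow.kernels.SquaredExponential(lengthscales=0.5)
    kernel = gpflow.kernels.Periodic(base, period=2.0)

    X = np.random.randn(10, 1)
    K = kernel(X)  # [10, 10] covariance matrix, periodic in the input
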
97 changes: 70 additions & 27 deletions gpflow/kernels/stationaries.py
@@ -3,15 +3,15 @@

from ..base import Parameter
from ..utilities import positive
from ..utilities.ops import square_distance
from ..utilities.ops import square_distance, difference_matrix
from .base import Kernel


class Stationary(Kernel):
"""
Base class for kernels that are stationary, that is, they only depend on
r = || x - x' ||
d = x - x'
This class handles 'ard' behaviour, which stands for 'Automatic Relevance
Determination'. This means that the kernel has one lengthscale per
@@ -46,35 +46,76 @@ def ard(self) -> bool:
"""
return self.lengthscales.shape.ndims > 0

def scaled_squared_euclid_dist(self, X, X2=None):
"""
Returns ||(X - X2ᵀ) / ℓ||² i.e. squared L2-norm.
"""
X_scaled = X / self.lengthscales
X2_scaled = X2 / self.lengthscales if X2 is not None else X2
return square_distance(X_scaled, X2_scaled)
def scale(self, X):
X_scaled = X / self.lengthscales if X is not None else X
return X_scaled

def K_diag(self, X):
return tf.fill(tf.shape(X)[:-1], tf.squeeze(self.variance))


class IsotropicStationary(Stationary):
"""
Base class for isotropic stationary kernels, i.e. kernels that only
depend on
r = ‖x - x'‖
Derived classes should implement one of:
K_r2(self, r2): Returns the kernel evaluated on r² (r2), which is the
squared scaled Euclidean distance. Should operate element-wise on r2.
K_r(self, r): Returns the kernel evaluated on r, which is the scaled
Euclidean distance. Should operate element-wise on r.
"""

def K(self, X, X2=None):
r2 = self.scaled_squared_euclid_dist(X, X2)
return self.K_r2(r2)

def K_diag(self, X):
return tf.fill(tf.shape(X)[:-1], tf.squeeze(self.variance))

def K_r2(self, r2):
"""
Returns the kernel evaluated on r² (`r2`), which is the squared scaled Euclidean distance.
Should operate element-wise on r².
"""
if hasattr(self, "K_r"):
# Clipping around the (single) float precision which is ~1e-45.
r = tf.sqrt(tf.maximum(r2, 1e-40))
return self.K_r(r) # pylint: disable=no-member
raise NotImplementedError

def scaled_squared_euclid_dist(self, X, X2=None):
"""
Returns ‖(X - X2ᵀ) / ℓ‖², i.e. the squared L₂-norm.
"""
return square_distance(self.scale(X), self.scale(X2))


class AnisotropicStationary(Stationary):
"""
Base class for anisotropic stationary kernels, i.e. kernels that only
depend on
d = x - x'
Derived classes should implement K_d(self, d): Returns the kernel evaluated
on d, which is the pairwise difference matrix, scaled by the lengthscale
parameter ℓ (i.e. [(X - X2ᵀ) / ℓ]). The last axis corresponds to the
input dimension.
"""

def K(self, X, X2=None):
return self.K_d(self.scaled_difference_matrix(X, X2))

def scaled_difference_matrix(self, X, X2=None):
"""
Returns [(X - X2ᵀ) / ℓ]. If X has shape [..., N, D] and
X2 has shape [..., M, D], the output will have shape [..., N, M, D].
"""
return difference_matrix(self.scale(X), self.scale(X2))

def K_d(self, d):
raise NotImplementedError
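
To illustrate the K_d contract, a hypothetical subclass (not part of this commit) might look like:

    import numpy as np
    import tensorflow as tf
    import gpflow

    class DecayingCosine(gpflow.kernels.AnisotropicStationary):
        """k(d) = sigma^2 * exp(-|s|) * cos(2*pi*s) with s = sum_i d_i; illustrative only."""

        def K_d(self, d):
            # d has shape [..., N, M, D]; collapse the input dimension first.
            s = tf.reduce_sum(d, axis=-1)
            return self.variance * tf.exp(-tf.abs(s)) * tf.cos(2 * np.pi * s)
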


class SquaredExponential(Stationary):
class SquaredExponential(IsotropicStationary):
"""
The radial basis function (RBF) or squared exponential kernel. The kernel equation is
@@ -91,7 +132,7 @@ def K_r2(self, r2):
return self.variance * tf.exp(-0.5 * r2)


class RationalQuadratic(Stationary):
class RationalQuadratic(IsotropicStationary):
"""
Rational Quadratic kernel,
@@ -112,7 +153,7 @@ def K_r2(self, r2):
return self.variance * (1 + 0.5 * r2 / self.alpha) ** (-self.alpha)


class Exponential(Stationary):
class Exponential(IsotropicStationary):
"""
The Exponential kernel. It is equivalent to a Matern12 kernel with doubled lengthscales.
"""
@@ -121,7 +162,7 @@ def K_r(self, r):
return self.variance * tf.exp(-0.5 * r)


class Matern12(Stationary):
class Matern12(IsotropicStationary):
"""
The Matern 1/2 kernel. Functions drawn from a GP with this kernel are not
differentiable anywhere. The kernel equation is
@@ -137,7 +178,7 @@ def K_r(self, r):
return self.variance * tf.exp(-r)


class Matern32(Stationary):
class Matern32(IsotropicStationary):
"""
The Matern 3/2 kernel. Functions drawn from a GP with this kernel are once
differentiable. The kernel equation is
@@ -154,7 +195,7 @@ def K_r(self, r):
return self.variance * (1.0 + sqrt3 * r) * tf.exp(-sqrt3 * r)


class Matern52(Stationary):
class Matern52(IsotropicStationary):
"""
The Matern 5/2 kernel. Functions drawn from a GP with this kernel are twice
differentiable. The kernel equation is
@@ -171,17 +212,19 @@ def K_r(self, r):
return self.variance * (1.0 + sqrt5 * r + 5.0 / 3.0 * tf.square(r)) * tf.exp(-sqrt5 * r)


class Cosine(Stationary):
class Cosine(AnisotropicStationary):
"""
The Cosine kernel. Functions drawn from a GP with this kernel are sinusoids
(with a random phase). The kernel equation is
k(r) = σ² cos{r}
k(d) = σ² cos{2πd}
where:
r is the Euclidean distance between the input points, scaled by the lengthscale parameter ℓ,
d is the sum of the per-dimension differences between the input points, scaled by the
lengthscale parameter ℓ (i.e. Σᵢ [(X - X2ᵀ) / ℓ]ᵢ),
σ² is the variance parameter.
"""

def K_r(self, r):
return self.variance * tf.cos(r)
def K_d(self, d):
d = tf.reduce_sum(d, axis=-1)
return self.variance * tf.cos(2 * np.pi * d)
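
The 2π factor makes the lengthscale directly readable as the period, as the commit message notes; a small sketch (same assumptions as above) confirms this:

    import numpy as np
    import gpflow

    kernel = gpflow.kernels.Cosine(lengthscales=3.0)

    x = np.array([[0.0]])
    x_shifted = np.array([[3.0]])  # shifted by exactly one lengthscale

    # cos(2*pi) == 1, so a shift of one period leaves the covariance unchanged.
    assert np.allclose(kernel(x, x).numpy(), kernel(x, x_shifted).numpy())
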
31 changes: 27 additions & 4 deletions gpflow/utilities/ops.py
@@ -85,10 +85,10 @@ def square_distance(X, X2):
result may actually be very slightly negative for entries very
close to each other.
This function can deal with leading dimensions in X and X2.
In the simplest case, where X and X2 are both 2-dimensional
(X is [N, D] and X2 is [M, D]), a tensor of shape [N, M] is
returned. If X is [N1, S1, D] and X2 is [N2, S2, D],
then the output will be [N1, S1, N2, S2].
"""
if X2 is None:
@@ -103,6 +103,29 @@
return dist


def difference_matrix(X, X2):
"""
Returns (X - X2ᵀ)
This function can deal with leading dimensions in X and X2.
For example, if X has shape [M, D] and X2 has shape [N, D],
the output will have shape [M, N, D]. If X has shape [I, J, M, D]
and X2 has shape [K, L, N, D], the output will have shape
[I, J, M, K, L, N, D].
"""
if X2 is None:
    X2 = X
    diff = X[..., :, tf.newaxis, :] - X2[..., tf.newaxis, :, :]
    return diff
Xshape = tf.shape(X)
X2shape = tf.shape(X2)
X = tf.reshape(X, (-1, Xshape[-1]))
X2 = tf.reshape(X2, (-1, X2shape[-1]))
diff = X[:, tf.newaxis, :] - X2[tf.newaxis, :, :]
diff = tf.reshape(diff, tf.concat((Xshape[:-1], X2shape[:-1], [Xshape[-1]]), 0))
return diff
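
A shape check for the new helper (a sketch; assumes the patched gpflow.utilities.ops):

    import numpy as np
    import tensorflow as tf
    from gpflow.utilities.ops import difference_matrix

    X = tf.constant(np.random.randn(4, 3, 2))  # [I, N, D] = [4, 3, 2]
    X2 = tf.constant(np.random.randn(5, 2))    # [M, D] = [5, 2]

    # X2=None takes the broadcasting fast path: [4, 3, 3, 2]
    print(difference_matrix(X, None).shape)

    # Mixed leading dimensions are flattened and recombined: [4, 3, 5, 2]
    print(difference_matrix(X, X2).shape)
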


def pca_reduce(X: tf.Tensor, latent_dim: tf.Tensor) -> tf.Tensor:
"""
A helpful function for linearly reducing the dimensionality of the input
29 changes: 22 additions & 7 deletions tests/gpflow/kernels/test_broadcasting.py
@@ -44,12 +44,19 @@
# Following kernels do not broadcast: see https://github.com/GPflow/GPflow/issues/1339
pytest.param(kernels.ArcCosine, marks=pytest.mark.xfail), # broadcasting not implemented
pytest.param(kernels.Coregion, marks=pytest.mark.xfail), # broadcasting not implemented
pytest.param(kernels.Periodic, marks=pytest.mark.xfail), # broadcasting not implemented
pytest.param(kernels.ChangePoints, marks=pytest.mark.xfail), # broadcasting not implemented
pytest.param(kernels.Convolutional, marks=pytest.mark.xfail), # broadcasting not implemented
]


def check_broadcasting(kernel):
S, N, M, D = 5, 4, 3, 2
X1 = np.random.randn(S, N, D)
X2 = np.random.randn(M, D)

compare_vs_map(X1, X2, kernel)


@pytest.mark.parametrize("kernel_class", gpflow.ci_utils.subclasses(kernels.Kernel))
def test_no_kernels_missed(kernel_class):
tested_kernel_classes = KERNEL_CLASSES + [kernels.Sum, kernels.Product]
@@ -62,26 +69,34 @@ def test_no_kernels_missed(kernel_class):
gpflow.kernels.base.ReducingCombination,
kernels.Static,
kernels.Stationary,
kernels.IsotropicStationary,
kernels.AnisotropicStationary,
]
needs_constructor_parameters = [kernels.Periodic]
if kernel_class in tested_kernel_classes:
return # tested
return # tested by test_broadcast_no_active_dims
if kernel_class in skipped_kernel_classes:
return # not tested but currently expected to fail
if kernel_class in abstract_base_classes:
return # cannot test abstract base classes
if kernel_class in needs_constructor_parameters:
return # this has a separate test, see test_broadcast_no_active_dims_periodic
if issubclass(kernel_class, kernels.MultioutputKernel):
return # TODO: cannot currently test MultioutputKernels - see https://github.com/GPflow/GPflow/issues/1339
assert False, f"no broadcasting test for kernel class {kernel_class}"


@pytest.mark.parametrize("kernel_class", KERNEL_CLASSES)
def test_broadcast_no_active_dims(kernel_class):
S, N, M, D = 5, 4, 3, 2
X1 = np.random.randn(S, N, D)
X2 = np.random.randn(M, D)
kernel = kernel_class()
check_broadcasting(kernel_class())

compare_vs_map(X1, X2, kernel)

@pytest.mark.parametrize(
"base_class", [kernel for kernel in gpflow.ci_utils.subclasses(kernels.IsotropicStationary)],
)
def test_broadcast_no_active_dims_periodic(base_class):
kernel = gpflow.kernels.Periodic(base_class())
check_broadcasting(kernel)
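
compare_vs_map is defined near the top of this test module; for orientation, a helper with that contract might look like the following sketch (the body here is assumed, not copied from the repository):

    import numpy as np
    import tensorflow as tf

    def compare_vs_map(X1, X2, kernel):
        # Broadcast evaluation over the leading sample axis of X1...
        K_broadcast = kernel(X1, X2)                    # expected shape [S, N, M]
        # ...compared against an explicit per-sample map.
        K_map = tf.map_fn(lambda x: kernel(x, X2), X1)  # [S, N, M]
        np.testing.assert_allclose(K_broadcast.numpy(), K_map.numpy())
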


@pytest.mark.parametrize("kernel_class", [gpflow.kernels.SquaredExponential])
