Support deterministic randomness.
jesnie committed Oct 14, 2022
1 parent 8b6c83c commit 8e32933
Showing 57 changed files with 1,069 additions and 423 deletions.
16 changes: 10 additions & 6 deletions benchmark/run.py
@@ -32,6 +32,7 @@

import numpy as np
import pandas as pd
+import tensorflow_probability as tfp
from tabulate import tabulate

import benchmark.benchmarks # pylint: disable=unused-import # Make sure registry is loaded.
@@ -63,19 +64,22 @@ def _collect_metrics(
test_data = dataset.test

rng = np.random.default_rng(random_seed)
+    mk_tf_seed = tfp.util.SeedStream(seed=rng.integers(1_000_000, size=2), salt="_collect_metrics")
model_fac = MODEL_FACTORIES.get(task.model_name)
model = model_fac.create_model(train_data, rng)

metrics = {}

-    model.predict_y(test_data.X) # Warm-up TF.
+    model.predict_y(test_data.X, seed=mk_tf_seed()) # Warm-up TF.

print("Model before training:")
gpflow.utilities.print_summary(model)

if task.do_optimise:
t_before = perf_counter()
-        loss_fn = gpflow.models.training_loss_closure(model, train_data.XY, compile=task.do_compile)
+        loss_fn = gpflow.models.training_loss_closure(
+            model, train_data.XY, seed=mk_tf_seed(), compile=task.do_compile
+        )
opt_log = gpflow.optimizers.Scipy().minimize(
loss_fn,
variables=model.trainable_variables,
@@ -101,10 +105,10 @@ def _collect_metrics(
metrics[mt.prediction_time] = t_after - t_before

metrics[mt.nlpd] = -np.sum(
-        likelihood.predict_log_density(test_data.X, f_m, f_v, test_data.Y)
+        likelihood.predict_log_density(test_data.X, f_m, f_v, test_data.Y, seed=mk_tf_seed())
)

-    y_m, _y_v = likelihood.predict_mean_and_var(test_data.X, f_m, f_v)
+    y_m, _y_v = likelihood.predict_mean_and_var(test_data.X, f_m, f_v, seed=mk_tf_seed())
error = test_data.Y - y_m
metrics[mt.mae] = np.average(np.abs(error))
metrics[mt.rmse] = np.average(error ** 2) ** 0.5
@@ -122,10 +126,10 @@ def _collect_metrics(
metrics[mt.posterior_prediction_time] = t_after - t_before

metrics[mt.posterior_nlpd] = -np.sum(
-        likelihood.predict_log_density(test_data.X, f_m, f_v, test_data.Y)
+        likelihood.predict_log_density(test_data.X, f_m, f_v, test_data.Y, seed=mk_tf_seed())
)

-    y_m, _y_v = likelihood.predict_mean_and_var(test_data.X, f_m, f_v)
+    y_m, _y_v = likelihood.predict_mean_and_var(test_data.X, f_m, f_v, seed=mk_tf_seed())
error = test_data.Y - y_m
metrics[mt.posterior_mae] = np.average(np.abs(error))
metrics[mt.posterior_rmse] = np.average(error ** 2) ** 0.5
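
The benchmark threads randomness through `tfp.util.SeedStream`: every call to `mk_tf_seed()` yields a fresh seed derived deterministically from the stream's initial seed and salt, so repeated runs with the same `random_seed` reproduce the same sequence. A minimal sketch of that pattern (the names `stream`, `s1` and `s2` are illustrative, not from the commit):

```python
import tensorflow_probability as tfp

# A SeedStream yields a new seed on every call, derived from its
# initial seed, its salt, and an internal call counter.
stream = tfp.util.SeedStream(seed=123, salt="example")
s1, s2 = stream(), stream()
assert s1 != s2  # successive calls give distinct seeds

# Recreating an identical stream reproduces the same sequence.
stream2 = tfp.util.SeedStream(seed=123, salt="example")
assert (stream2(), stream2()) == (s1, s2)
```
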
210 changes: 210 additions & 0 deletions doc/sphinx/notebooks/advanced/random_state.pct.py
@@ -0,0 +1,210 @@
# ---
# jupyter:
# jupytext:
# formats: ipynb,py:percent
# text_representation:
# extension: .py
# format_name: percent
# format_version: '1.3'
# jupytext_version: 1.14.1
# kernelspec:
# display_name: Python 3
# language: python
# name: python3
# ---

# %% [markdown]
# # Managing random state
#
# Many GPflow methods take an optional `seed` parameter. This can take three types of values:
# * `None`.
# * An integer.
# * A tensor of shape `[2]` and `dtype=tf.int32`.
#
# Below we go over how each of these is interpreted, and what that means in practice.

# %%
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
import tensorflow_probability as tfp

import gpflow

# %% [markdown]
# Let us quickly create a model to test on:

# %%
model = gpflow.models.GPR(
(np.zeros((0, 1)), np.zeros((0, 1))),
kernel=gpflow.kernels.SquaredExponential(),
)
Xplot = np.linspace(0.0, 10.0, 100)[:, None]

# %% [markdown]
# ## Seed `None`
#
# When the seed is set to `None`, the randomness depends on the state of the TensorFlow global seed. This is the default behaviour.

# %%
tf.random.set_seed(123)
Yplot = model.predict_f_samples(Xplot, seed=None)
plt.plot(Xplot, Yplot)

tf.random.set_seed(123)
Yplot = model.predict_f_samples(Xplot, seed=None)
plt.plot(Xplot, Yplot, ls=":")

tf.random.set_seed(456)
Yplot = model.predict_f_samples(Xplot, seed=None)
_ = plt.plot(Xplot, Yplot, ls="-.")

# %% [markdown]
# ## Integer seed
# When the seed is set to an integer, the randomness depends on both the TensorFlow global seed, and the `seed` passed to the method.

# %%
tf.random.set_seed(123)
Yplot = model.predict_f_samples(Xplot, seed=1)
plt.plot(Xplot, Yplot)

tf.random.set_seed(123)
Yplot = model.predict_f_samples(Xplot, seed=1)
plt.plot(Xplot, Yplot, ls=":")

tf.random.set_seed(123)
Yplot = model.predict_f_samples(Xplot, seed=2)
plt.plot(Xplot, Yplot, ls="-.")

tf.random.set_seed(456)
Yplot = model.predict_f_samples(Xplot, seed=2)
_ = plt.plot(Xplot, Yplot, ls="--")

# %% [markdown]
# ## Full random state as seed
# If you set the `seed` to a tensor of shape `[2]` and `dtype=tf.int32`, that will completely define the randomness used, and the TensorFlow global seed will be ignored.

# %%
tf.random.set_seed(123)
Yplot = model.predict_f_samples(
Xplot, seed=tf.constant([12, 34], dtype=tf.int32)
)
plt.plot(Xplot, Yplot)

tf.random.set_seed(456)
Yplot = model.predict_f_samples(
Xplot, seed=tf.constant([12, 34], dtype=tf.int32)
)
plt.plot(Xplot, Yplot, ls=":")

tf.random.set_seed(456)
Yplot = model.predict_f_samples(
Xplot, seed=tf.constant([56, 78], dtype=tf.int32)
)
_ = plt.plot(Xplot, Yplot, ls="-.")


# %% [markdown]
# When using the full state as the random seed, be careful about when you do and do not reuse it. If you write a function that takes a seed and makes multiple seeded calls, you should generally pass a different seed to each call. You can use [tfp.random.split_seed](https://www.tensorflow.org/probability/api_docs/python/tfp/random/split_seed) to derive multiple new seeds from one seed. For example:

# %%
def calls_two(seed: tf.Tensor) -> None:
s1, s2 = tfp.random.split_seed(seed)
model.predict_f_samples(Xplot, seed=s1)
model.predict_f_samples(Xplot, seed=s2)


def calls_in_loop(seed: tf.Tensor) -> None:
for _ in range(5):
seed, s = tfp.random.split_seed(seed)
calls_two(s)


calls_in_loop(tf.constant([12, 34], dtype=tf.int32))

# %% [markdown]
# ## Stable randomness in model optimisation
#
# By default the `training_loss` method has `seed = None`, which means that every call to it uses fresh randomness. Some GPflow likelihoods rely on randomness, so the loss function becomes noisy. You can get a deterministic loss function by using `training_loss_closure` instead, which allows you to bind a fixed seed to your loss:

# %%
X = np.array([[1.0], [3.0], [9.0]])
Y = np.array([[0.0], [2.0], [2.0]])
model = gpflow.models.GPR(
(X, Y),
kernel=gpflow.kernels.SquaredExponential(),
)
fixed_seed = tf.constant([12, 34], dtype=tf.int32)
opt = gpflow.optimizers.Scipy()
opt.minimize(
model.training_loss_closure(seed=fixed_seed), model.trainable_variables
)


# %% [markdown]
# ## Stable randomness under changing parameters
#
# Random sampling with a fixed random state is generally stable under changing model parameters, so a fixed-seed sample can meaningfully be compared, or optimised, as the model changes. For example, observe how adding data to a model warps the random sample:

# %%
model1 = gpflow.models.GPR(
(np.zeros((0, 1)), np.zeros((0, 1))),
kernel=gpflow.kernels.SquaredExponential(),
)

X = np.array([[1.0], [3.0], [9.0]])
Y = np.array([[0.0], [2.0], [2.0]])

model2 = gpflow.models.GPR(
(X, Y),
kernel=gpflow.kernels.SquaredExponential(),
)

fixed_seed = tf.constant([12, 34], dtype=tf.int32)

plt.scatter(X, Y, c="C1")

Yplot = model1.predict_f_samples(Xplot, seed=fixed_seed)
plt.plot(Xplot, Yplot)

Yplot = model2.predict_f_samples(Xplot, seed=fixed_seed)
_ = plt.plot(Xplot, Yplot)

# %% [markdown]
# Or, let us try optimising model parameters to fit a random sample to data:

# %%
model = gpflow.models.GPR(
(np.zeros((0, 1)), np.zeros((0, 1))),
kernel=gpflow.kernels.SquaredExponential(),
)

X = np.array([[1.0], [3.0], [9.0]])
Y = np.array([[0.0], [-1.0], [2.0]])
fixed_seed = tf.constant([12, 34], dtype=tf.int32)

plt.scatter(X, Y, c="black")

Yplot = model.predict_f_samples(Xplot, seed=fixed_seed)
plt.plot(Xplot, Yplot)

data_indices = tf.searchsorted(Xplot[:, 0], X[:, 0])


def loss() -> tf.Tensor:
Yplot = model.predict_f_samples(Xplot, seed=fixed_seed)
Ypred = tf.gather(Yplot, data_indices)
delta = Ypred - Y
return tf.reduce_sum(delta ** 2)


opt = gpflow.optimizers.Scipy()
opt.minimize(loss, model.trainable_variables)

Yplot = model.predict_f_samples(Xplot, seed=fixed_seed)
_ = plt.plot(Xplot, Yplot)

# %% [markdown]
# ## Implementation details
#
# Behind the scenes, any `seed` is passed through [tfp.random.sanitize_seed](https://www.tensorflow.org/probability/api_docs/python/tfp/random/sanitize_seed) and then fed to the `tf.random.stateless_*` functions.
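
# %% [markdown]
# As a minimal sketch of that pipeline (relying only on the documented
# `sanitize_seed` behaviour, not on GPflow internals): every accepted form of
# `seed` is normalised to a tensor of shape `[2]` and `dtype=tf.int32` before
# it reaches a stateless sampler.

# %%
for raw_seed in [None, 1, tf.constant([12, 34], dtype=tf.int32)]:
    sanitized = tfp.random.sanitize_seed(raw_seed)  # always shape [2], dtype int32
    sample = tf.random.stateless_normal([2], seed=sanitized)
    print(raw_seed, "->", sanitized.numpy(), sample.numpy())
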
2 changes: 1 addition & 1 deletion doc/sphinx/notebooks/basics/regression.pct.py
@@ -6,7 +6,7 @@
# extension: .py
# format_name: percent
# format_version: '1.3'
-# jupytext_version: 1.4.0
+# jupytext_version: 1.14.1
# kernelspec:
# display_name: Python 3
# language: python
12 changes: 12 additions & 0 deletions gpflow/base.py
@@ -41,6 +41,18 @@
else:
AnyNDArray = Union[np.ndarray] # type: ignore[misc]

+Seed = Union[None, int, tf.Tensor]
+"""
+Type of optional random seeds.
+
+Use a tensor of shape ``[2]`` to use that as a deterministic seed.
+Alternatively use an integer or ``None`` to use the TensorFlow global random seed.
+
+See also:
+
+* https://www.tensorflow.org/probability/api_docs/python/tfp/random/sanitize_seed
+* https://www.tensorflow.org/api_docs/python/tf/random/create_rng_state
+"""

VariableData = Union[List[Any], Tuple[Any], AnyNDArray, int, float] # deprecated
Transform = Union[tfp.bijectors.Bijector]
Prior = Union[tfp.distributions.Distribution]
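
The new `Seed` alias is intended for exactly the signatures shown in the other files of this commit. A hypothetical helper (not part of the commit; `sample_noise` and its salt are made up for illustration) that accepts all three seed forms might look like:

```python
import tensorflow as tf
import tensorflow_probability as tfp

from gpflow.base import Seed


def sample_noise(shape, seed: Seed = None) -> tf.Tensor:
    # Normalise None / int / shape-[2] tensor into a stateless-compatible
    # seed; the salt decorrelates this function from other seed consumers.
    sanitized = tfp.random.sanitize_seed(seed, salt="sample_noise")
    return tf.random.stateless_normal(shape, seed=sanitized)


# Deterministic when given a full random state:
x = sample_noise([3], seed=tf.constant([12, 34], dtype=tf.int32))
```
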
9 changes: 7 additions & 2 deletions gpflow/conditionals/multioutput/sample_conditionals.py
@@ -16,7 +16,7 @@

import tensorflow as tf

-from ...base import SamplesMeanAndVariance
+from ...base import SamplesMeanAndVariance, Seed
from ...experimental.check_shapes import check_shapes
from ...inducing_variables import (
SeparateIndependentInducingVariables,
@@ -50,6 +50,7 @@ def _sample_conditional(
q_sqrt: Optional[tf.Tensor] = None,
white: bool = False,
num_samples: Optional[int] = None,
+    seed: Seed = None,
) -> SamplesMeanAndVariance:
"""
`sample_conditional` will return a sample from the conditional distribution.
@@ -58,6 +59,8 @@
However, for some combinations of Mok and Mof, more efficient sampling routines exist.
The dispatcher will make sure that we use the most efficient one.
+    :param seed: Random seed. Interpreted as by
+        `tfp.random.sanitize_seed <https://www.tensorflow.org/probability/api_docs/python/tfp/random/sanitize_seed>`_\.
:return: samples, mean, cov
"""
if full_cov:
@@ -71,7 +74,9 @@
g_mu, g_var = ind_conditional(
Xnew, inducing_variable, kernel, f, white=white, q_sqrt=q_sqrt
) # [..., N, L], [..., N, L]
-        g_sample = sample_mvn(g_mu, g_var, full_cov, num_samples=num_samples) # [..., (S), N, L]
+        g_sample = sample_mvn(
+            g_mu, g_var, full_cov, num_samples=num_samples, seed=seed
+        ) # [..., (S), N, L]
f_mu, f_var = mix_latent_gp(kernel.W, g_mu, g_var, full_cov, full_output_cov)
f_sample = tf.tensordot(g_sample, kernel.W, [[-1], [-1]]) # [..., N, P]
return f_sample, f_mu, f_var
9 changes: 6 additions & 3 deletions gpflow/conditionals/sample_conditionals.py
@@ -16,7 +16,7 @@

import tensorflow as tf

-from ..base import SamplesMeanAndVariance
+from ..base import SamplesMeanAndVariance, Seed
from ..experimental.check_shapes import check_shapes
from ..inducing_variables import InducingVariables
from ..kernels import Kernel
@@ -48,6 +48,7 @@ def _sample_conditional(
q_sqrt: Optional[tf.Tensor] = None,
white: bool = False,
num_samples: Optional[int] = None,
+    seed: Seed = None,
) -> SamplesMeanAndVariance:
"""
`sample_conditional` will return a sample from the conditional distribution.
@@ -56,6 +57,8 @@
However, for some combinations of Mok and Mof, more efficient sampling routines exist.
The dispatcher will make sure that we use the most efficient one.
+    :param seed: Random seed. Interpreted as by
+        `tfp.random.sanitize_seed <https://www.tensorflow.org/probability/api_docs/python/tfp/random/sanitize_seed>`_\.
:return: samples, mean, cov
"""

@@ -78,14 +81,14 @@
# cov: [..., P, N, N]
mean_for_sample = tf.linalg.adjoint(mean) # [..., P, N]
samples = sample_mvn(
-            mean_for_sample, cov, full_cov=True, num_samples=num_samples
+            mean_for_sample, cov, full_cov=True, num_samples=num_samples, seed=seed
) # [..., (S), P, N]
samples = tf.linalg.adjoint(samples) # [..., (S), N, P]
else:
# mean: [..., N, P]
# cov: [..., N, P] or [..., N, P, P]
samples = sample_mvn(
-            mean, cov, full_cov=full_output_cov, num_samples=num_samples
+            mean, cov, full_cov=full_output_cov, num_samples=num_samples, seed=seed
) # [..., (S), N, P]

return samples, mean, cov
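
With `seed` threaded down to `sample_mvn`, a sampled conditional becomes reproducible when given a full random state. A hypothetical call (shapes and values are made up for illustration; the argument layout follows the dispatcher signature above):

```python
import numpy as np
import tensorflow as tf

import gpflow
from gpflow.conditionals import sample_conditional

Z = np.linspace(0.0, 10.0, 5)[:, None]  # 5 inducing points, [M, D]
inducing = gpflow.inducing_variables.InducingPoints(Z)
kernel = gpflow.kernels.SquaredExponential()
q_mu = np.zeros((5, 1))  # variational mean, [M, L]
q_sqrt = np.eye(5)[None, :, :]  # variational scale, [L, M, M]
Xnew = np.linspace(0.0, 10.0, 50)[:, None]

# The same full random state yields the same sample on every call.
seed = tf.constant([12, 34], dtype=tf.int32)
samples, mean, cov = sample_conditional(
    Xnew, inducing, kernel, q_mu, q_sqrt=q_sqrt, white=True, seed=seed
)
```
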