Migrate flax from using old-style PRNG keys to new-style typed PRNG keys
Functionally, this involves changing uses of jax.random.PRNGKey to jax.random.key. For details on this change and the motivation behind it, see the draft JEP at google/jax#17297, and please feel free to offer comments and feedback!

PiperOrigin-RevId: 565475405
Jake VanderPlas authored and 8bitmp3 committed Oct 9, 2023
1 parent 93d31fd commit e51e71c
Showing 113 changed files with 612 additions and 629 deletions.
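For reference, a minimal sketch of what this migration looks like in user code, before the per-file changes below. This is an illustration rather than part of the diff: it assumes `jax` 0.4.16 or newer (where typed keys and the conversion helpers `jax.random.key_data` / `jax.random.wrap_key_data` are available), and the variable names are purely illustrative.

```python
import jax

# Old style: a PRNG key represented as a raw uint32 array
# (shape (2,) with the default threefry implementation).
old_key = jax.random.PRNGKey(0)

# New style: a scalar, typed key array with a dedicated key dtype.
new_key = jax.random.key(0)

# Both styles are accepted by the usual jax.random APIs.
k1, k2 = jax.random.split(new_key)
x = jax.random.normal(k1, (3, 4))

# Checking for a typed key, and converting between representations if needed.
is_typed = jax.dtypes.issubdtype(new_key.dtype, jax.dtypes.prng_key)
raw = jax.random.key_data(new_key)    # typed key -> raw uint32 array
same = jax.random.wrap_key_data(raw)  # raw uint32 array -> typed key
```

Old-style keys continue to work with the same `jax.random` APIs, so downstream code does not have to migrate immediately.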
6 changes: 5 additions & 1 deletion CHANGELOG.md
@@ -15,7 +15,11 @@ vNext
-
-
-
-
- Use new typed PRNG keys throughout flax: this essentially involved changing
uses of `jax.random.PRNGKey` to `jax.random.key`.
(See [JEP 9263](https://github.com/google/jax/pull/17297) for details).
If you notice dispatch performance regressions after this change, be sure
you update `jax` to version 0.4.16 or newer.
-
-
-
6 changes: 3 additions & 3 deletions README.md
@@ -119,7 +119,7 @@ class MLP(nn.Module):

model = MLP([12, 8, 4])
batch = jnp.ones((32, 10))
- variables = model.init(jax.random.PRNGKey(0), batch)
+ variables = model.init(jax.random.key(0), batch)
output = model.apply(variables, batch)
```

@@ -142,7 +142,7 @@ class CNN(nn.Module):

model = CNN()
batch = jnp.ones((32, 64, 64, 10)) # (N, H, W, C) format
- variables = model.init(jax.random.PRNGKey(0), batch)
+ variables = model.init(jax.random.key(0), batch)
output = model.apply(variables, batch)
```

@@ -174,7 +174,7 @@ model = AutoEncoder(encoder_widths=[20, 10, 5],
decoder_widths=[5, 10, 20],
input_shape=(12,))
batch = jnp.ones((16, 12))
- variables = model.init(jax.random.PRNGKey(0), batch)
+ variables = model.init(jax.random.key(0), batch)
encoded = model.apply(variables, batch, method=model.encode)
decoded = model.apply(variables, encoded, method=model.decode)
```
8 changes: 4 additions & 4 deletions docs/developer_notes/lift.md
@@ -85,7 +85,7 @@ class ManualVmapMLP(nn.Module):
return apply_fn({'params': mlp_params}, xs)

xs = jnp.ones((3, 4))
- variables = ManualVmapMLP().init(random.PRNGKey(0), xs)
+ variables = ManualVmapMLP().init(random.key(0), xs)
print(jax.tree_util.tree_map(jnp.shape, variables['params']))
"""==>
{
@@ -270,7 +270,7 @@ def lift_transpose(fn, target='params', variables=True, rngs=True):
rng_filters=(rngs,))

x = jnp.ones((3, 2))
- y, params = init(lift_transpose(core_nn.dense))(random.PRNGKey(0), x, 4)
+ y, params = init(lift_transpose(core_nn.dense))(random.key(0), x, 4)
```

NOTE that most users should not need to interact with `pack` directly.
@@ -310,7 +310,7 @@ class LinenVmapMLP(nn.Module):
VmapMLP = nn.vmap(MLP, variable_axes={'params': 0}, split_rngs={'params': True}, in_axes=0)
return VmapMLP(name='mlp')(xs)

- variables = LinenVmapMLP().init(random.PRNGKey(0), xs)
+ variables = LinenVmapMLP().init(random.key(0), xs)
print(jax.tree_util.tree_map(jnp.shape, variables['params']))
"""==>
{
@@ -346,7 +346,7 @@ class LinenStatefulVmapMLP(nn.Module):
def __call__(self, xs, *, train):
VmapMLP = nn.vmap(StatefulMLP, variable_axes={'params': 0, 'batch_stats': 0}, split_rngs={'params': True}, in_axes=0)
return VmapMLP(name='mlp')(xs, train=train)
- variables = LinenStatefulVmapMLP().init(random.PRNGKey(0), xs)
+ variables = LinenStatefulVmapMLP().init(random.key(0), xs)
```

All we had to add to `nn.vmap` is `'batch_stats': 0`, indicating that the batch stats are vectorized rather than shared along the first axis.
12 changes: 6 additions & 6 deletions docs/developer_notes/module_lifecycle.rst
@@ -59,7 +59,7 @@ Now we want to construct and use the ``MLP`` Module:

mlp = MLP(hidden_size=5, out_size=3)
x = jax.numpy.ones((1, 2))
- variables = mlp.init(random.PRNGKey(0), x)
+ variables = mlp.init(random.key(0), x)
y = mlp.apply(variables, x)


@@ -70,8 +70,8 @@ Let's take a closer look at initialization. Surprisingly, there actually is no s

.. testcode::

- # equivalent to: variables = mlp.init(random.PRNGKey(0), x)
- _, variables = mlp.apply({}, x, rngs={"params": random.PRNGKey(0)}, mutable=True)
+ # equivalent to: variables = mlp.init(random.key(0), x)
+ _, variables = mlp.apply({}, x, rngs={"params": random.key(0)}, mutable=True)


Thus, ``init`` is nothing more than a wrapper around ``apply`` where:
@@ -155,7 +155,7 @@ Another benefit of defining submodules and/or variables inline is that you can a

mdl = CompactScaledMLP(hidden_size=4, out_size=5)
x = jax.numpy.ones((3, 2))
- vars = mdl.init(random.PRNGKey(0), x)
+ vars = mdl.init(random.key(0), x)
assert vars["params"]["scale"].shape == (2,)

Many of the standard Linen Modules like ``nn.Dense`` use shape inference already to avoid the need to specify input shapes (like the number of input features to a Dense layer).
@@ -207,7 +207,7 @@ The latter is done as follows:
return mdl(z, "decode")

mdl = CorrectModule()
- vars = nn.init(init_fn, mdl)(random.PRNGKey(0))
+ vars = nn.init(init_fn, mdl)(random.key(0))
assert vars["params"]["Dense_0"]["kernel"].shape == (2, 8)
assert vars["params"]["Dense_1"]["kernel"].shape == (8, 4)

@@ -348,7 +348,7 @@ Function closure is the most common way to accidentally hide a JAX array or Line

x = jax.numpy.ones((3, 2))
mdl = Foo()
- vars = mdl.init(random.PRNGKey(0), x)
+ vars = mdl.init(random.key(0), x)
assert vars['params']['Dense_0']['kernel'].shape == (3, 2, 2)


2 changes: 1 addition & 1 deletion docs/flip/1009-optimizer-api.md
@@ -496,7 +496,7 @@ def get_learning_rate(step):


model = Model()
- rng = jax.random.PRNGKey(0)
+ rng = jax.random.key(0)
ds = tfds.load('mnist')['train'].take(160).map(pp).batch(16)
batch = next(iter(ds))
variables = model.init(rng, jnp.array(batch['image'][:1]))
4 changes: 2 additions & 2 deletions docs/flip/2396-rnn.md
@@ -18,7 +18,7 @@ def __call__(self, x):
nn.LSTMCell, variable_broadcast="params", split_rngs={"params": False}
)
carry = LSTM.initialize_carry(
- jax.random.PRNGKey(0), batch_dims=x.shape[:1], size=self.hidden_size
+ jax.random.key(0), batch_dims=x.shape[:1], size=self.hidden_size
)
carry, x = LSTM()(carry, x)
return x
@@ -91,7 +91,7 @@ Where:
* `initial_carry`: the initial carry, if not provided it will be initialized
using the cell's :meth:`RNNCellBase.initialize_carry` method.
* `init_key`: a PRNG key used to initialize the carry, if not provided
- ``jax.random.PRNGKey(0)`` will be used. Most cells will ignore this
+ ``jax.random.key(0)`` will be used. Most cells will ignore this
argument.
* `seq_lengths`: an optional integer array of shape ``(*batch)`` indicating
the length of each sequence, elements whose index in the time dimension
4 changes: 2 additions & 2 deletions docs/getting_started.ipynb
@@ -223,7 +223,7 @@
"import jax.numpy as jnp # JAX NumPy\n",
"\n",
"cnn = CNN()\n",
"print(cnn.tabulate(jax.random.PRNGKey(0), jnp.ones((1, 28, 28, 1))))"
"print(cnn.tabulate(jax.random.key(0), jnp.ones((1, 28, 28, 1))))"
]
},
{
@@ -521,7 +521,7 @@
},
"outputs": [],
"source": [
"init_rng = jax.random.PRNGKey(0)"
"init_rng = jax.random.key(0)"
]
},
{
4 changes: 2 additions & 2 deletions docs/getting_started.md
@@ -131,7 +131,7 @@ import jax
import jax.numpy as jnp # JAX NumPy
cnn = CNN()
- print(cnn.tabulate(jax.random.PRNGKey(0), jnp.ones((1, 28, 28, 1))))
+ print(cnn.tabulate(jax.random.key(0), jnp.ones((1, 28, 28, 1))))
```

+++ {"id": "4b5ac16e"}
@@ -332,7 +332,7 @@ executionInfo:
timestamp: 1673483485436
id: e4f6f4d3
---
- init_rng = jax.random.PRNGKey(0)
+ init_rng = jax.random.key(0)
```

+++ {"id": "80fbb60b"}
4 changes: 2 additions & 2 deletions docs/guides/batch_norm.rst
@@ -81,15 +81,15 @@ The ``batch_stats`` collection must be extracted from the ``variables`` for late

mlp = MLP()
x = jnp.ones((1, 3))
- variables = mlp.init(jax.random.PRNGKey(0), x)
+ variables = mlp.init(jax.random.key(0), x)
params = variables['params']


jax.tree_util.tree_map(jnp.shape, variables)
---
mlp = MLP()
x = jnp.ones((1, 3))
- variables = mlp.init(jax.random.PRNGKey(0), x, train=False) #!
+ variables = mlp.init(jax.random.key(0), x, train=False) #!
params = variables['params']
batch_stats = variables['batch_stats'] #!

10 changes: 5 additions & 5 deletions docs/guides/convert_pytorch_to_flax.rst
@@ -31,7 +31,7 @@ and the Flax kernel has shape [inC, outC]. Transposing the kernel will do the tr
# [outC, inC] -> [inC, outC]
kernel = jnp.transpose(kernel, (1, 0))

- key = random.PRNGKey(0)
+ key = random.key(0)
x = random.normal(key, (1, 3))

variables = {'params': {'kernel': kernel, 'bias': bias}}
@@ -62,7 +62,7 @@ and the Flax kernel has shape [kH, kW, inC, outC]. Transposing the kernel will d
# [outC, inC, kH, kW] -> [kH, kW, inC, outC]
kernel = jnp.transpose(kernel, (2, 3, 1, 0))

- key = random.PRNGKey(0)
+ key = random.key(0)
x = random.normal(key, (1, 6, 6, 3))

variables = {'params': {'kernel': kernel, 'bias': bias}}
@@ -154,7 +154,7 @@ Other than the transpose operation before reshaping, we can convert the weights
variables = {'params': {'conv': {'kernel': conv_kernel, 'bias': conv_bias},
'fc': {'kernel': fc_kernel, 'bias': fc_bias}}}

- key = random.PRNGKey(0)
+ key = random.key(0)
x = random.normal(key, (1, 6, 6, 3))

j_out = j_model.apply(variables, x)
@@ -192,7 +192,7 @@ while Flax multiplies the estimated statistic with ``momentum`` and the new obse
variables = {'params': {'scale': scale, 'bias': bias},
'batch_stats': {'mean': mean, 'var': var}}

- key = random.PRNGKey(0)
+ key = random.key(0)
x = random.normal(key, (1, 6, 6, 3))

j_bn = nn.BatchNorm(momentum=0.9, use_running_average=True)
@@ -241,7 +241,7 @@ operation. ``nn.pool()`` is the core function behind |nn.avg_pool()|_ and |nn.ma
return y


- key = random.PRNGKey(0)
+ key = random.key(0)
x = random.normal(key, (1, 6, 6, 3))

j_out = avg_pool(x, window_shape=(2, 2), strides=(1, 1), padding=((1, 1), (1, 1)))
6 changes: 3 additions & 3 deletions docs/guides/dropout.rst
@@ -27,7 +27,7 @@ desirable properties for neural networks. To learn more, refer to the
`Pseudorandom numbers in JAX tutorial <https://jax.readthedocs.io/en/latest/jax-101/05-random-numbers.html>`__.

**Note:** Recall that JAX has an explicit way of giving you PRNG keys:
- you can fork the main PRNG state (such as ``key = jax.random.PRNGKey(seed=0)``)
+ you can fork the main PRNG state (such as ``key = jax.random.key(seed=0)``)
into multiple new PRNG keys with ``key, subkey = jax.random.split(key)``. You
can refresh your memory in
`🔪 JAX - The Sharp Bits 🔪 Randomness and PRNG keys <https://jax.readthedocs.io/en/latest/notebooks/Common_Gotchas_in_JAX.html#jax-prng>`__.
@@ -41,10 +41,10 @@ into three keys, including one for Flax Linen ``Dropout``.
:title_right: With Dropout
:sync:

- root_key = jax.random.PRNGKey(seed=0)
+ root_key = jax.random.key(seed=0)
main_key, params_key = jax.random.split(key=root_key)
---
- root_key = jax.random.PRNGKey(seed=0)
+ root_key = jax.random.key(seed=0)
main_key, params_key, dropout_key = jax.random.split(key=root_key, num=3) #!

**Note:** In Flax, you provide *PRNG streams* with *names*, so that you can use them later
4 changes: 2 additions & 2 deletions docs/guides/ensembling.rst
@@ -224,7 +224,7 @@ directly.

train_ds, test_ds = get_datasets()
#!
- rng = jax.random.PRNGKey(0)
+ rng = jax.random.key(0)

rng, init_rng = jax.random.split(rng)
state = create_train_state(init_rng, learning_rate, momentum) #!
@@ -246,7 +246,7 @@ directly.
---
train_ds, test_ds = get_datasets()
test_ds = jax_utils.replicate(test_ds) #!
- rng = jax.random.PRNGKey(0)
+ rng = jax.random.key(0)

rng, init_rng = jax.random.split(rng)
state = create_train_state(jax.random.split(init_rng, jax.device_count()), #!
18 changes: 9 additions & 9 deletions docs/guides/extracting_intermediates.rst
@@ -124,7 +124,7 @@ Note that, by default ``sow`` appends values every time it is called:
return output, features

batch = jnp.ones((1,28,28,1))
- variables = init(jax.random.PRNGKey(0), batch)
+ variables = init(jax.random.key(0), batch)
preds, feats = predict(variables, batch)

assert len(feats) == 2 # Tuple with two values since module was called twice.
@@ -180,7 +180,7 @@ avoid using ``nn.compact`` altogether.
return RefactoredCNN().apply({"params": params}, x,
method=lambda module, x: module.features(x))

- params = init(jax.random.PRNGKey(0), batch)
+ params = init(jax.random.key(0), batch)

features(params, batch)

@@ -209,7 +209,7 @@ In the following code example we check if any intermediate activations are non-f
fin = jax.tree_util.tree_map(lambda xs: jnp.all(jnp.isfinite(xs)), intermediates)
return y, fin

- variables = init(jax.random.PRNGKey(0), batch)
+ variables = init(jax.random.key(0), batch)
y, is_finite = predict(variables, batch)
all_finite = all(jax.tree_util.tree_leaves(is_finite))
assert all_finite, "non-finite intermediate detected!"
@@ -250,8 +250,8 @@ non-layer intermediates, but the filter function won't be applied to it.
def predict(params, x):
return Model().apply({"params": params}, x, capture_intermediates=True)

- batch = jax.random.uniform(jax.random.PRNGKey(1), (1,3))
- params = init(jax.random.PRNGKey(0), batch)
+ batch = jax.random.uniform(jax.random.key(1), (1,3))
+ params = init(jax.random.key(0), batch)
preds, feats = predict(params, batch)
feats # intermediate c in Model was not stored because it's not a Flax layer
---
@@ -276,8 +276,8 @@ non-layer intermediates, but the filter function won't be applied to it.
filter_fn = lambda mdl, method_name: isinstance(mdl.name, str) and (mdl.name in {'Dense_0', 'Dense_2'}) #!
return Model().apply({"params": params}, x, capture_intermediates=filter_fn) #!

- batch = jax.random.uniform(jax.random.PRNGKey(1), (1,3))
- params = init(jax.random.PRNGKey(0), batch)
+ batch = jax.random.uniform(jax.random.key(1), (1,3))
+ params = init(jax.random.key(0), batch)
preds, feats = predict(params, batch)
feats # intermediate c in Model is stored and isn't filtered out by the filter function #!

@@ -337,7 +337,7 @@ your model more explicitly.
return Sequential(SeqCNN().layers[0:7]).apply({"params": params}, x)

batch = jnp.ones((1,28,28,1))
- params = init(jax.random.PRNGKey(0), batch)
+ params = init(jax.random.key(0), batch)
features(params, batch)

Extracting gradients of intermediate values
@@ -367,7 +367,7 @@ the model:
y = jnp.empty((1, 2)) # random data

model = Model()
- variables = model.init(jax.random.PRNGKey(1), x)
+ variables = model.init(jax.random.key(1), x)
params, perturbations = variables['params'], variables['perturbations']

Finally compute the gradients of the loss with respect to the perturbations,
Expand Down
(Diffs for the remaining changed files are not shown here.)
