@@ -266,29 +266,44 @@ def _safe_shard(x, pspec):
266266 nnx .update (self .optimizer , optimizer_sharded_state )
267267
def _train_step(self, model, optimizer, inputs):
    """Overrides the main JIT block to natively handle ModelBundle module.

    Uses jax.value_and_grad with explicit split/merge to avoid nesting
    nnx.value_and_grad inside nnx.jit, which causes Flax NNX to assign
    conflicting outer_index values and raises:
      ValueError: The graph structure of a node added to cached_partial was
      mutated inside the transformation.

    Args:
      model: ModelBundle module exposing ``student_model`` and ``teacher_model``.
      optimizer: NNX optimizer updated in place with the student gradients.
      inputs: Raw batch passed through ``self.gen_model_input_fn``.

    Returns:
      ``(loss, aux, grad_norm)`` when the runner expects a gradient norm,
      otherwise ``(loss, aux)``.
    """
    batch = self.gen_model_input_fn(inputs)
    student = model.student_model
    teacher = model.teacher_model

    # Teacher inference runs outside of value_and_grad: the teacher is frozen
    # (stop_gradient), so its output is a constant with respect to the
    # student gradient computation.
    teacher_output = batch.get("teacher_output")
    if teacher_output is None and "teacher_output" not in batch:
        teacher_output = self.strategy.teacher_forward_fn(
            model=teacher,
            input_tokens=batch["input_tokens"],
            positions=batch["positions"],
            attention_mask=batch.get("attention_mask"),
            decoder_segment_ids=batch.get("decoder_segment_ids"),
            decoder_target_tokens=batch.get("targets", None),
            decoder_target_mask=batch.get("targets_segmentation", None),
            cache=None,
        )
    teacher_output = jax.tree.map(jax.lax.stop_gradient, teacher_output)

    # Split the student into differentiable params and the non-differentiable
    # remainder. The graphdef is captured outside jax.value_and_grad so the
    # graph structure stays stable across the transformation.
    student_graphdef, diff_params, rest = nnx.split(student, self.wrt_filter, ...)

    def _pure_loss(diff_params, rest):
        # Rebuild a local student from pure pytrees so the closure stays
        # side-effect free from JAX's point of view.
        local_student = nnx.merge(student_graphdef, diff_params, rest, copy=True)
        # NOTE(review): the two keyword args below marked (*) fall between
        # diff hunks in the source under review; they are reconstructed by
        # symmetry with the teacher call — verify against the full file.
        student_output = self.strategy.student_forward_fn(
            model=local_student,
            input_tokens=batch["input_tokens"],
            positions=batch["positions"],
            attention_mask=batch.get("attention_mask"),
            decoder_segment_ids=batch.get("decoder_segment_ids"),  # (*)
            decoder_target_tokens=batch.get("targets", None),  # (*)
            decoder_target_mask=batch.get("targets_segmentation", None),
            cache=None,
        )
        labels = self.strategy.create_labels(
            batch["targets"],
            targets_segmentation=batch.get("targets_segmentation", None),
        )
        loss, aux = self.strategy.compute_loss(student_output, teacher_output, labels)
        # Carry the updated non-param state (e.g. RNG counters) back out.
        _, _, new_rest = nnx.split(local_student, self.wrt_filter, ...)
        return loss, (aux, new_rest)

    value_and_grad = jax.value_and_grad(_pure_loss, argnums=0, has_aux=True)
    (loss, (aux, new_rest)), grads = value_and_grad(diff_params, rest)

    # Propagate the updated non-param state back into the live student, then
    # apply the gradient step.
    nnx.update(student, new_rest)
    optimizer.update(student, grads)

    if getattr(self, "_tunix_expects_grad_norm", True):
        return loss, aux, optax.global_norm(grads)
    return loss, aux
321333
322334 def _eval_step (self , model , inputs ):
323335 """Evaluation only needs the student."""
0 commit comments