Skip to content

Commit 4f008c5

Browse files
FP8 unit test error fix
1 parent 02b0689 commit 4f008c5

3 files changed

Lines changed: 13 additions & 9 deletions

File tree

src/maxtext/layers/nnx_decoders.py

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,6 @@
3030
from jax.sharding import Mesh
3131

3232
from maxtext.common.common_types import (
33-
EP_AS_CONTEXT,
3433
MODEL_MODE_AUTOREGRESSIVE,
3534
MODEL_MODE_PREFILL,
3635
MODEL_MODE_TRAIN,
@@ -171,8 +170,6 @@ def __call__(
171170

172171
if self.model_mode == MODEL_MODE_PREFILL:
173172
logical_axis_names = ("activation_batch", "prefill_activation_length", "activation_embed")
174-
elif self.config.expert_shard_attention_option == EP_AS_CONTEXT and self.model_mode == MODEL_MODE_TRAIN:
175-
logical_axis_names = ("activation_batch_no_exp", "activation_length", "activation_embed")
176173
else:
177174
logical_axis_names = ("activation_batch", "activation_length_no_exp", "activation_embed")
178175

src/maxtext/layers/nnx_wrappers.py

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -286,13 +286,18 @@ def __call__(
286286
# TODO(cgarciae): ideally we just do an update but currently dictionaries don't allow
287287
# insertion of new keys, we need to enable this in NNX to simplify the code below
288288
# to the simple nnx.update(self, nnx_attrs) above.
289+
def _to_nnx_dict(d):
290+
if isinstance(d, dict):
291+
return nnx.Dict({k: _to_nnx_dict(v) for k, v in d.items()})
292+
return d
293+
289294
for attr_name, value in nnx_attrs.items():
290-
if hasattr(self, attr_name) and isinstance(value, dict):
295+
if hasattr(self, attr_name) and isinstance(value, (dict, nnx.Dict)):
291296
original_value = getattr(self, attr_name)
292297
new_values = _recursive_merge(original_value, value)
293-
setattr(self, attr_name, nnx.data(new_values))
298+
setattr(self, attr_name, _to_nnx_dict(new_values))
294299
else:
295-
setattr(self, attr_name, nnx.data(value))
300+
setattr(self, attr_name, _to_nnx_dict(value))
296301

297302
return out
298303

@@ -466,7 +471,9 @@ def maybe_unbox(x):
466471

467472
warnings.warn(f"Found unknown module paths in incoming state:{paths_str}")
468473

469-
nnx.update(module, new_state)
474+
filtered_state_flat = {path: v for path, v in new_state_flat.items() if path in current_state_flat}
475+
filtered_state = nnx.traversals.unflatten_mapping(filtered_state_flat)
476+
nnx.update(module, filtered_state)
470477

471478
_fix_for_qwix_quantization(module)
472479
method_fn = _get_module_method(module, nnx_method)

src/maxtext/layers/quantizations.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -592,7 +592,7 @@ def _get_quant_config(config):
592592
with open(config.quant_cfg_path, "rt", encoding="utf8") as config_file:
593593
mixed_precision_config = json.load(config_file)
594594
return _get_mixed_precision_quant_config(mixed_precision_config)
595-
if config.quantization == "fp8":
595+
if getattr(config.quantization, "name", str(config.quantization)) in ("FP8", "FP8_GPU"):
596596
return "fp8"
597597
if config.quantization == "nanoo_fp8":
598598
return "nanoo_fp8"
@@ -636,7 +636,7 @@ def configure_quantization(config: Config, quant_mode_str: str = "train"):
636636
bwd_calibration_method=config.bwd_quantization_calibration_method,
637637
)
638638

639-
if config.use_qwix_quantization:
639+
if config.use_qwix_quantization and not getattr(config, "enable_nnx", False):
640640
return None
641641
quant_cfg = _get_quant_config(config)
642642
if quant_cfg:

0 commit comments

Comments (0)