diff --git a/tests/integration/train_tests.py b/tests/integration/train_tests.py index 2e2727d65d..2511fbe463 100644 --- a/tests/integration/train_tests.py +++ b/tests/integration/train_tests.py @@ -505,6 +505,7 @@ def test_tpu_zero1_gradient_accumulation(self): @pytest.mark.integration_test @pytest.mark.gpu_only @pytest.mark.scheduled_only + @pytest.mark.skip(reason="b/489133823. Previously transient in b/462548581.") def test_gpu_zero1_gradient_accumulation(self): os.environ["NVTE_FUSED_ATTN"] = "1" # Enable fused attention zero1_ga = [ # tests Zero-1 optimizer sharding with gradient accumulation