diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml
index 44f8b50dd7..064ef33235 100644
--- a/.buildkite/pipeline.yml
+++ b/.buildkite/pipeline.yml
@@ -48,23 +48,47 @@ steps:
     if: build.message !~ /\[skip tests\]/
     timeout_in_minutes: 120

-  # - label: "AMDGPU Julia v{{matrix.version}}"
+  # - label: ":julia: :linux: AMDGPU Julia v{{matrix.version}} -- {{matrix.group}} -- {{matrix.runtime}}"
   #   matrix:
   #     setup:
   #       version:
   #         - "1.10"
+  #     group:
+  #       - core
+  #       - neural_networks
+  #       - integration
+  #     runtime:
+  #       - "IFRT"
   #   plugins:
   #     - JuliaCI/julia#v1:
   #         version: "{{matrix.version}}"
-  #     - JuliaCI/julia-test#v1:
-  #         test_args: "--gpu"
   #     - JuliaCI/julia-coverage#v1:
   #         codecov: true
   #         dirs:
   #           - src
   #           - ext
+  #           - lib/ReactantCore/src
   #   agents:
   #     queue: "juliagpu"
   #     rocm: "*"
-  #   if: build.message !~ /\[skip tests\]/
-  #   timeout_in_minutes: 60
+  #   commands: |
+  #     touch LocalPreferences.toml
+
+  #     echo "[Reactant]" >> LocalPreferences.toml
+  #     echo "xla_runtime = \"{{matrix.runtime}}\"" >> LocalPreferences.toml
+
+  #     cat LocalPreferences.toml
+
+  #     julia --project=. -e 'println("--- :julia: Instantiating project")
+  #     using Pkg
+  #     Pkg.develop([PackageSpec(path="lib/ReactantCore")])'
+
+  #     julia --project=. -e 'println("--- :julia: Run Tests")
+  #     using Pkg
+  #     Pkg.test(; coverage="user")'
+  #   env:
+  #     REACTANT_TEST_GROUP: "{{matrix.group}}"
+  #     JULIA_DEBUG: "Reactant,Reactant_jll"
+  #     CUDA_VISIBLE_DEVICES: 0
+  #   if: build.message !~ /\[skip tests\]/
+  #   timeout_in_minutes: 120
diff --git a/Project.toml b/Project.toml
index 16a71a180c..7c3055bfa8 100644
--- a/Project.toml
+++ b/Project.toml
@@ -105,7 +105,7 @@ PythonCall = "0.9.25"
 Random = "1.10"
 Random123 = "1.7"
 ReactantCore = "0.1.16"
-Reactant_jll = "0.0.257"
+Reactant_jll = "0.0.258"
 ScopedValues = "1.3.0"
 Scratch = "1.2"
 Sockets = "1.10"
diff --git a/src/Compiler.jl b/src/Compiler.jl
index e0262ac329..bea2f19472 100644
--- a/src/Compiler.jl
+++ b/src/Compiler.jl
@@ -905,10 +905,9 @@ function optimization_passes(
             "self_mul_to_convolution_like($(Int(backend == "tpu")))",
             "subtract_multiply_const_to_add_mul_const",
             "trivial_reduce_window_to_reduce_op",
+            "case_to_if",
             "dot_general_add_distributive_simplify",
             "dot_general_subtract_distributive_simplify",
-            "dus_to_dynamic_pad",
-            "dynamic_pad_to_pad",
             "remove_no_ops_from_while_loop",
             "while_is_copy_simplify",
             "split_variadic_scatter_op",
@@ -960,6 +959,7 @@ function optimization_passes(
                 "dot_general_licm(0)",
                 "reduce_licm(0)",
                 "reduce_window_licm(0)",
+                "reverse_licm(0)",
             ],
         )
     end
@@ -1029,6 +1029,8 @@ function optimization_passes(
             "rotate_pad",
             "concat_multipad",
             "speculate_if_pad_to_select",
+            "dus_to_dynamic_pad",
+            "dynamic_pad_to_pad",
         ],
     )

@@ -1315,7 +1317,7 @@ end

 # TODO we want to be able to run the more advanced passes via transform dialect as an enzyme intermediate
 # However, this errs as we cannot attach the transform with to the funcop itself [as we run a functionpass].
-const enzyme_pass::String = "enzyme{postpasses=\"arith-raise{stablehlo=true},canonicalize,cse,canonicalize,remove-unnecessary-enzyme-ops,enzyme-simplify-math,canonicalize,cse,canonicalize\"}"
+const enzyme_pass::String = "enzyme{postpasses=\"arith-raise{stablehlo=true},canonicalize,cse,canonicalize,remove-unnecessary-enzyme-ops,enzyme-simplify-math,canonicalize,cse,canonicalize,arith-raise{stablehlo=true}\"}"

 function run_pass_pipeline!(mod, pass_pipeline, key=""; enable_verifier=true)
     pm = MLIR.IR.PassManager()