From 5e338f52b572d4b4148df42e497120ac18694146 Mon Sep 17 00:00:00 2001 From: "William S. Moses" Date: Fri, 7 Feb 2025 11:00:45 -0500 Subject: [PATCH 1/5] vendor optimize --- ext/ReactantCUDAExt.jl | 52 +++++++++++++++++++++++++++++++++++++++++- 1 file changed, 51 insertions(+), 1 deletion(-) diff --git a/ext/ReactantCUDAExt.jl b/ext/ReactantCUDAExt.jl index 1b1a470cb3..04301b495e 100644 --- a/ext/ReactantCUDAExt.jl +++ b/ext/ReactantCUDAExt.jl @@ -412,6 +412,56 @@ AddKernelStatePass() = LLVM.NewPMModulePass("AddKernelStatePass", kern_pass) LowerKernelStatePass() = LLVM.NewPMFunctionPass("LowerKernelStatePass", noop_pass) CleanupKernelStatePass() = LLVM.NewPMModulePass("CleanupKernelStatePass", noop_pass) +# From https://github.com/JuliaGPU/GPUCompiler.jl/blob/7b9322faa34685026c4601a5084eecf5a5d7f3fe/src/ptx.jl#L149 +function vendored_optimize_module!(@nospecialize(job), + mod::LLVM.Module, + instcombine::Bool=false + ) + tm = GPUCompiler.llvm_machine(job.config.target) + # TODO: Use the registered target passes (JuliaGPU/GPUCompiler.jl#450) + LLVM.@dispose pb=LLVM.NewPMPassBuilder() begin + LLVM.register!(pb, LLVM.NVVMReflectPass()) + + LLVM.add!(pb, LLVM.NewPMFunctionPassManager()) do fpm + # TODO: need to run this earlier; optimize_module! is called after addOptimizationPasses! + LLVM.add!(fpm, LLVM.NVVMReflectPass()) + + # needed by GemmKernels.jl-like code + LLVM.add!(fpm, LLVM.SpeculativeExecutionPass()) + + # NVPTX's target machine info enables runtime unrolling, + # but Julia's pass sequence only invokes the simple unroller. 
+ LLVM.add!(fpm, LLVM.LoopUnrollPass(; job.config.opt_level)) + if instcombine + LLVM.add!(fpm, LLVM.InstCombinePass()) # clean-up redundancy + else + LLVM.add!(fpm, LLVM.InstSimplifyPass()) # clean-up redundancy + end + LLVM.add!(fpm, LLVM.NewPMLoopPassManager(; use_memory_ssa=true)) do lpm + LLVM.add!(lpm, LICMPass()) # the inner runtime check might be + # outer loop invariant + end + + # the above loop unroll pass might have unrolled regular, non-runtime nested loops. + # that code still needs to be optimized (arguably, multiple unroll passes should be + # scheduled by the Julia optimizer). do so here, instead of re-optimizing entirely. + if job.config.opt_level == 2 + LLVM.add!(fpm, LLVM.GVNPass()) + elseif job.config.opt_level == 1 + LLVM.add!(fpm, LLVM.EarlyCSEPass()) + end + LLVM.add!(fpm, LLVM.DSEPass()) + + LLVM.add!(fpm, LLVM.SimplifyCFGPass()) + end + + # get rid of the internalized functions; now possibly unused + LLVM.add!(pb, LLVM.GlobalDCEPass()) + + LLVM.run!(pb, mod, tm) + end +end + # compile to executable machine code function compile(job) # lower to PTX @@ -452,7 +502,7 @@ function compile(job) end LLVM.run!(pb, mod, tm) end - GPUCompiler.optimize_module!(job, mod) + vendored_optimize_module!(job, mod) LLVM.run!(CUDA.GPUCompiler.DeadArgumentEliminationPass(), mod, tm) for fname in ("gpu_report_exception", "gpu_signal_exception") From 2026b617d471f940fa8386465c742ab82c343598 Mon Sep 17 00:00:00 2001 From: William Moses Date: Fri, 7 Feb 2025 17:01:47 -0600 Subject: [PATCH 2/5] Update ReactantCUDAExt.jl --- ext/ReactantCUDAExt.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ext/ReactantCUDAExt.jl b/ext/ReactantCUDAExt.jl index 04301b495e..39ee6b1000 100644 --- a/ext/ReactantCUDAExt.jl +++ b/ext/ReactantCUDAExt.jl @@ -420,11 +420,11 @@ function vendored_optimize_module!(@nospecialize(job), tm = GPUCompiler.llvm_machine(job.config.target) # TODO: Use the registered target passes (JuliaGPU/GPUCompiler.jl#450) LLVM.@dispose 
pb=LLVM.NewPMPassBuilder() begin - LLVM.register!(pb, LLVM.NVVMReflectPass()) + LLVM.register!(pb, GPUCompiler.NVVMReflectPass()) LLVM.add!(pb, LLVM.NewPMFunctionPassManager()) do fpm # TODO: need to run this earlier; optimize_module! is called after addOptimizationPasses! - LLVM.add!(fpm, LLVM.NVVMReflectPass()) + LLVM.add!(fpm, GPUCompiler.NVVMReflectPass()) # needed by GemmKernels.jl-like code LLVM.add!(fpm, LLVM.SpeculativeExecutionPass()) From 063488e0709a4e8e0dc176830b0952d744b0af2c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mos=C3=A8=20Giordano?= <765740+giordano@users.noreply.github.com> Date: Fri, 7 Feb 2025 23:10:44 +0000 Subject: [PATCH 3/5] Update ext/ReactantCUDAExt.jl --- ext/ReactantCUDAExt.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ext/ReactantCUDAExt.jl b/ext/ReactantCUDAExt.jl index 39ee6b1000..c157dd6b23 100644 --- a/ext/ReactantCUDAExt.jl +++ b/ext/ReactantCUDAExt.jl @@ -438,7 +438,7 @@ function vendored_optimize_module!(@nospecialize(job), LLVM.add!(fpm, LLVM.InstSimplifyPass()) # clean-up redundancy end LLVM.add!(fpm, LLVM.NewPMLoopPassManager(; use_memory_ssa=true)) do lpm - LLVM.add!(lpm, LICMPass()) # the inner runtime check might be + LLVM.add!(lpm, GPUCompiler.LICMPass()) # the inner runtime check might be # outer loop invariant end From 6b4e2c7ddb52f5662a12d769fd1609b8cecf96f4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mos=C3=A8=20Giordano?= <765740+giordano@users.noreply.github.com> Date: Fri, 7 Feb 2025 23:11:42 +0000 Subject: [PATCH 4/5] Update ext/ReactantCUDAExt.jl --- ext/ReactantCUDAExt.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ext/ReactantCUDAExt.jl b/ext/ReactantCUDAExt.jl index c157dd6b23..bb2d9bb7b4 100644 --- a/ext/ReactantCUDAExt.jl +++ b/ext/ReactantCUDAExt.jl @@ -438,7 +438,7 @@ function vendored_optimize_module!(@nospecialize(job), LLVM.add!(fpm, LLVM.InstSimplifyPass()) # clean-up redundancy end LLVM.add!(fpm, LLVM.NewPMLoopPassManager(; use_memory_ssa=true)) do lpm - 
LLVM.add!(lpm, GPUCompiler.LICMPass()) # the inner runtime check might be + LLVM.add!(lpm, LLVM.LICMPass()) # the inner runtime check might be # outer loop invariant end From 921165f02f7ac7fe07432824e24a579a72fc2f40 Mon Sep 17 00:00:00 2001 From: "William S. Moses" Date: Fri, 7 Feb 2025 18:18:02 -0500 Subject: [PATCH 5/5] try forcing random seed for basic test --- test/basic.jl | 2 ++ 1 file changed, 2 insertions(+) diff --git a/test/basic.jl b/test/basic.jl index 48a2002359..0526ae662d 100644 --- a/test/basic.jl +++ b/test/basic.jl @@ -2,6 +2,8 @@ using Reactant using Test using Enzyme using Statistics +using Random +Random.seed!(123) fastmax(x::AbstractArray{T}) where {T} = reduce(max, x; dims=1, init=float(T)(-Inf))