diff --git a/Project.toml b/Project.toml index c99a7a2e44..2129ac0564 100644 --- a/Project.toml +++ b/Project.toml @@ -60,7 +60,7 @@ PythonCall = "0.9" Random = "1.10" Random123 = "1.7" ReactantCore = "0.1.3" -Reactant_jll = "0.0.32" +Reactant_jll = "0.0.33" Scratch = "1.2" Statistics = "1.10" YaoBlocks = "0.13" diff --git a/deps/ReactantExtra/.bazelrc b/deps/ReactantExtra/.bazelrc index a8b84e0f0d..ba56a9d61a 100644 --- a/deps/ReactantExtra/.bazelrc +++ b/deps/ReactantExtra/.bazelrc @@ -18,8 +18,8 @@ build -c opt build:cuda --repo_env TF_NEED_CUDA=1 build:cuda --repo_env TF_NVCC_CLANG=1 build:cuda --repo_env TF_NCCL_USE_STUB=1 -build:cuda --repo_env=HERMETIC_CUDA_VERSION="12.3.2" -build:cuda --repo_env=HERMETIC_CUDNN_VERSION="9.1.1" +build:cuda --repo_env=HERMETIC_CUDA_VERSION="12.6.2" +build:cuda --repo_env=HERMETIC_CUDNN_VERSION="9.6.0" # "sm" means we emit only cubin, which is forward compatible within a GPU generation. # "compute" means we emit both cubin and PTX, which is larger but also forward compatible to future GPU generations. build:cuda --repo_env HERMETIC_CUDA_COMPUTE_CAPABILITIES="sm_50,sm_60,sm_70,sm_80,compute_90" diff --git a/deps/ReactantExtra/BUILD b/deps/ReactantExtra/BUILD index 0a512067a9..7815488090 100644 --- a/deps/ReactantExtra/BUILD +++ b/deps/ReactantExtra/BUILD @@ -54,6 +54,9 @@ cc_toolchain_config( coverage_link_flags = ["--coverage"], cpu = "k8", cxx_builtin_include_directories = [ + "/opt/x86_64-linux-musl/x86_64-linux-musl/include/c++/13.2.0", + "/opt/x86_64-linux-musl/x86_64-linux-musl/include/c++/13.2.0/x86_64-linux-musl", + "/opt/x86_64-linux-musl/x86_64-linux-musl/include/c++/13.2.0/backward", "/opt/x86_64-linux-musl/x86_64-linux-musl/include/c++/10.2.0", "/opt/x86_64-linux-musl/x86_64-linux-musl/include/c++/10.2.0/x86_64-linux-musl", "/opt/x86_64-linux-musl/x86_64-linux-musl/include/c++/10.2.0/backward", @@ -149,14 +152,14 @@ cc_toolchain_config( abi_libc_version = "local", abi_version = "local", cxx_builtin_include_directories = [ - "/opt/BB_TARGET/lib/gcc/BB_TARGET/10.2.0/include", - "/opt/BB_TARGET/lib/gcc/BB_TARGET/10.2.0/include-fixed", + "/opt/BB_TARGET/lib/gcc/BB_TARGET/13.2.0/include", + "/opt/BB_TARGET/lib/gcc/BB_TARGET/13.2.0/include-fixed", "/opt/BB_TARGET/BB_TARGET/include", "/opt/BB_TARGET/BB_TARGET/sys-root/usr/include", - "/opt/BB_TARGET/BB_TARGET/include/c++/10.2.0", - "/opt/BB_TARGET/BB_TARGET/include/c++/10.2.0/BB_TARGET", - "/opt/BB_TARGET/BB_TARGET/include/c++/10.2.0/backward", - "/opt/BB_TARGET/BB_TARGET/include/c++/10.2.0/parallel" + "/opt/BB_TARGET/BB_TARGET/include/c++/13.2.0", + "/opt/BB_TARGET/BB_TARGET/include/c++/13.2.0/BB_TARGET", + "/opt/BB_TARGET/BB_TARGET/include/c++/13.2.0/backward", + "/opt/BB_TARGET/BB_TARGET/include/c++/13.2.0/parallel" ], tool_paths = { "ar": "/opt/bin/BB_FULL_TARGET/ar", @@ -193,14 +196,14 @@ cc_toolchain_config( "-Wno-free-nonheap-object", "-fno-omit-frame-pointer", # TODO cxx_builtin_include_directories doesn't seem to be working, so we add the INCLUDE_PATHs manually - "-isystem /opt/BB_TARGET/lib/gcc/BB_TARGET/10.2.0/include", - "-isystem /opt/BB_TARGET/lib/gcc/BB_TARGET/10.2.0/include-fixed", + "-isystem /opt/BB_TARGET/lib/gcc/BB_TARGET/13.2.0/include", + "-isystem /opt/BB_TARGET/lib/gcc/BB_TARGET/13.2.0/include-fixed", "-isystem /opt/BB_TARGET/BB_TARGET/include", "-isystem /opt/BB_TARGET/BB_TARGET/sys-root/usr/include", - "-isystem /opt/BB_TARGET/BB_TARGET/include/c++/10.2.0", - "-isystem /opt/BB_TARGET/BB_TARGET/include/c++/10.2.0/BB_TARGET", - "-isystem /opt/BB_TARGET/BB_TARGET/include/c++/10.2.0/backward", - "-isystem /opt/BB_TARGET/BB_TARGET/include/c++/10.2.0/parallel", + "-isystem /opt/BB_TARGET/BB_TARGET/include/c++/13.2.0", + "-isystem /opt/BB_TARGET/BB_TARGET/include/c++/13.2.0/BB_TARGET", + "-isystem /opt/BB_TARGET/BB_TARGET/include/c++/13.2.0/backward", + "-isystem /opt/BB_TARGET/BB_TARGET/include/c++/13.2.0/parallel", ], opt_compile_flags = [ "-g0", @@ -361,6 +364,7 @@ cc_library( ) + [ "@enzyme_ad//src/enzyme_ad/jax:RegistryUtils.cpp", + "@enzyme_ad//src/enzyme_ad/jax:gpu.cc", # "@com_google_protobuf//:src/google/protobuf/io/coded_stream.cc", # "@xla//xla:xla.pb.cc", "@xla//xla:xla_data.pb.cc", @@ -429,7 +433,7 @@ cc_library( "-Wl,-exported_symbol,_ifrt_*", "-Wl,-exported_symbol,_RegisterCustomCallTarget", "-Wl,-exported_symbol,_ConvertLLVMToMLIR", -"-Wl,-exported_symbol,_EnzymeGPUCustomCall", +"-Wl,-exported_symbol,_RegisterEnzymeXLAGPUHandler", "-Wl,-exported_symbol,_ReactantThrowError", ]}), deps = [ @@ -469,6 +473,9 @@ cc_library( "@llvm-project//llvm:X86CodeGen", "@enzyme_ad//src/enzyme_ad/jax:TransformOps", "@enzyme_ad//src/enzyme_ad/jax:XLADerivatives", + # "@enzyme_ad//src/enzyme_ad/jax:gpu", + "@xla//xla/ffi/api:ffi", + "@xla//xla/ffi:ffi_api", "@stablehlo//:chlo_ops", "@xla//xla/pjrt:pjrt_api", "@xla//xla/pjrt:pjrt_c_api_client", diff --git a/deps/ReactantExtra/WORKSPACE b/deps/ReactantExtra/WORKSPACE index dc72ecba92..95c2b7bb9e 100644 --- a/deps/ReactantExtra/WORKSPACE +++ b/deps/ReactantExtra/WORKSPACE @@ -9,7 +9,7 @@ http_archive( urls = ["https://github.com/wsmoses/nsync/archive/{commit}.tar.gz".format(commit = NSYNC_COMMIT)], ) -ENZYMEXLA_COMMIT = "b6d6563aa3a3050474a4250bf18322f7ebf0b486" +ENZYMEXLA_COMMIT = "74046d05089c02946058f8fd94ed23efd0bf3ccc" ENZYMEXLA_SHA256 = "" http_archive( diff --git a/deps/build_local.jl b/deps/build_local.jl index 4138d2b6c6..8a0c03e96f 100644 --- a/deps/build_local.jl +++ b/deps/build_local.jl @@ -93,6 +93,8 @@ else run( Cmd( `bazel build $(arg) -c $(build_kind) --action_env=JULIA=$(Base.julia_cmd().exec[1]) + --repo_env=GCC_HOST_COMPILER_PATH=/usr/bin/gcc + --repo_env=CC=/home/wmoses/llvms/llvm16-r/clang+llvm-16.0.2-x86_64-linux-gnu-ubuntu-22.04/bin/clang --repo_env HERMETIC_PYTHON_VERSION="3.10" --check_visibility=false --verbose_failures :libReactantExtra.so`; dir=source_dir, diff --git a/ext/ReactantCUDAExt.jl b/ext/ReactantCUDAExt.jl index ba0765af5f..2d709d05ff 100644 --- a/ext/ReactantCUDAExt.jl +++ b/ext/ReactantCUDAExt.jl @@ -218,11 +218,11 @@ function Adapt.adapt_storage(::CUDA.KernelAdaptor, xs::TracedRArray{T,N}) where return res end -const _kernel_instances = Dict{Any,Any}() - +# Since we cache these objects we cannot cache data containing MLIR operations (e.g. the entry must be a string +# and not the operation itself). struct LLVMFunc{F,tt} f::Union{F,Nothing} - entry::MLIR.IR.Operation + entry::String end const GPUCompiler = CUDA.GPUCompiler @@ -324,9 +324,9 @@ function compile(job) )::MLIR.API.MlirOperation entry = MLIR.IR.Operation(linkRes) - - entry + String(Reactant.TracedUtils.get_attribute_by_name(linkRes, "sym_name")) end + return LLVMFunc{job.source.specTypes.parameters[1],job.source.specTypes}(nothing, entry) end @@ -378,9 +378,7 @@ Reactant.@reactant_overlay @noinline function (func::LLVMFunc{F,tt})( output_operand_aliases = MLIR.IR.Attribute(aliases) - fname = Reactant.TracedUtils.get_attribute_by_name(func.entry, "sym_name") - # Force public for now while we don't have real users - # MLIR.IR.rmattr!(func.entry, "sym_visibility") + fname = func.entry operands = MLIR.IR.Value[] for idx in @@ -460,25 +458,27 @@ Reactant.@reactant_overlay @noinline function CUDA.cufunction( end function __init__() - handle = Reactant.XLA.Libdl.dlopen(CUDA.CUDA_Driver_jll.libcuda; throw_error=false) - if handle === nothing - handle = C_NULL - end - ptr1 = Reactant.XLA.Libdl.dlsym(handle, "cuLaunchKernel"; throw_error=false) - if ptr1 === nothing - ptr1 = C_NULL - end - ptr2 = Reactant.XLA.Libdl.dlsym(handle, "cuModuleLoadData"; throw_error=false) - if ptr2 === nothing - ptr2 = C_NULL - end - ptr3 = Reactant.XLA.Libdl.dlsym(handle, "cuModuleGetFunction"; throw_error=false) - if ptr3 === nothing - ptr3 = C_NULL + if CUDA.CUDA_Driver_jll.libcuda !== nothing + handle = Reactant.XLA.Libdl.dlopen(CUDA.CUDA_Driver_jll.libcuda; throw_error=false) + if handle === nothing + handle = C_NULL + end + ptr1 = Reactant.XLA.Libdl.dlsym(handle, "cuLaunchKernel"; throw_error=false) + if ptr1 === nothing + ptr1 = C_NULL + end + ptr2 = Reactant.XLA.Libdl.dlsym(handle, "cuModuleLoadData"; throw_error=false) + if ptr2 === nothing + ptr2 = C_NULL + end + ptr3 = Reactant.XLA.Libdl.dlsym(handle, "cuModuleGetFunction"; throw_error=false) + if ptr3 === nothing + ptr3 = C_NULL + end + Reactant.Compiler.cuLaunch[] = Base.reinterpret(UInt, ptr1) + Reactant.Compiler.cuModule[] = Base.reinterpret(UInt, ptr2) + Reactant.Compiler.cuFunc[] = Base.reinterpret(UInt, ptr3) end - Reactant.Compiler.cuLaunch[] = Base.reinterpret(UInt, ptr1) - Reactant.Compiler.cuModule[] = Base.reinterpret(UInt, ptr2) - Reactant.Compiler.cuFunc[] = Base.reinterpret(UInt, ptr3) return nothing end diff --git a/src/XLA.jl b/src/XLA.jl index 54b45cd00b..6255737e45 100644 --- a/src/XLA.jl +++ b/src/XLA.jl @@ -144,11 +144,7 @@ function __init__() end end - @ccall MLIR.API.mlir_c.RegisterCustomCallTarget( - "enzymexla_gpu"::Cstring, - cglobal((:EnzymeGPUCustomCall, MLIR.API.mlir_c))::Ptr{Cvoid}, - "CUDA"::Cstring, - )::Cvoid + @ccall MLIR.API.mlir_c.RegisterEnzymeXLAGPUHandler()::Cvoid # This wasn't properly exported on macos, we'll remove the try once macOS JLL # has the fix. diff --git a/src/utils.jl b/src/utils.jl index 56fa7587b4..83c9d51b64 100644 --- a/src/utils.jl +++ b/src/utils.jl @@ -364,7 +364,10 @@ function call_with_reactant_generator( ir, rt = CC.typeinf_ircode(interp, mi, nothing) end - ir, any_changed = rewrite_insts!(ir, interp) + if !is_reactant_method(mi::Core.MethodInstance) + ir, any_changed = rewrite_insts!(ir, interp) + end + src = ccall(:jl_new_code_info_uninit, Ref{CC.CodeInfo}, ()) src.slotnames = fill(:none, length(ir.argtypes) + 1) src.slotflags = fill(zero(UInt8), length(ir.argtypes)) diff --git a/test/cuda.jl b/test/cuda.jl deleted file mode 100644 index 549002e4f1..0000000000 --- a/test/cuda.jl +++ /dev/null @@ -1,36 +0,0 @@ -using Reactant -using Test -using CUDA - -using Reactant_jll -@show Reactant_jll.libReactantExtra_path - -function square_kernel!(x) - #i = threadIdx().x - #x[i] *= x[i] - #@cuprintf("overwrote value of %f was thrown during kernel execution on thread (%d, %d, %d) in block (%d, %d, %d).\n", - # 0.0, threadIdx().x, threadIdx().y, threadIdx().z, blockIdx().x, blockIdx().y, blockIdx().z) - #x[i], threadIdx().x, threadIdx().y, threadIdx().z, blockIdx().x, blockIdx().y, blockIdx().z) - - # sync_threads() - return nothing -end - -# basic squaring on GPU -function square!(x) - @cuda blocks = 1 threads = length(x) square_kernel!(x) - return nothing -end - -@testset "Square Kernel" begin - oA = collect(1:1:64) - A = Reactant.to_rarray(oA) - # @show @code_hlo optimize = false square!(A) - # @show @code_hlo optimize = :before_kernel square!(A) - # @show @code_hlo square!(A) - func! = @compile square!(A) - func!(A) - @show A - @show oA - @test all(Array(A) .≈ (oA .* oA)) -end diff --git a/test/integration/cuda.jl b/test/integration/cuda.jl new file mode 100644 index 0000000000..47bb8c23a7 --- /dev/null +++ b/test/integration/cuda.jl @@ -0,0 +1,29 @@ +using Reactant +using Test +using CUDA + +function square_kernel!(x, y) + i = threadIdx().x + x[i] *= y[i] + sync_threads() + return nothing +end + +# basic squaring on GPU +function square!(x, y) + @cuda blocks = 1 threads = length(x) square_kernel!(x, y) + return nothing +end + +@testset "Square Kernel" begin + oA = collect(1:1:64) + A = Reactant.to_rarray(oA) + B = Reactant.to_rarray(100 .* oA) + if CUDA.functional() + @jit square!(A, B) + @test all(Array(A) .≈ (oA .* oA .* 100)) + @test all(Array(B) .≈ (oA .* 100)) + else + @code_hlo optimize = :before_kernel square!(A, B) + end +end diff --git a/test/runtests.jl b/test/runtests.jl index 2b3238d101..834d9b504d 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -59,6 +59,8 @@ const REACTANT_TEST_GROUP = lowercase(get(ENV, "REACTANT_TEST_GROUP", "all")) end if REACTANT_TEST_GROUP == "all" || REACTANT_TEST_GROUP == "integration" + # Temporarily disabled as minutia are debugged + # @safetestset "CUDA" include("integration/cuda.jl") @safetestset "Linear Algebra" include("integration/linear_algebra.jl") @safetestset "AbstractFFTs" include("integration/fft.jl") @safetestset "Random" include("integration/random.jl")