From e486bb1cbac4a8ccf2bd4ed4da812ab3a362218a Mon Sep 17 00:00:00 2001
From: "enzyme-ci-bot[bot]" <78882869+enzyme-ci-bot[bot]@users.noreply.github.com>
Date: Sat, 27 Sep 2025 04:33:40 +0000
Subject: [PATCH] Regenerate MLIR Bindings

---
 src/mlir/Dialects/Enzyme.jl |  53 +++++
 src/mlir/Dialects/Gpu.jl    |  42 ++++
 src/mlir/Dialects/Llvm.jl   |  22 +-
 src/mlir/Dialects/Nvvm.jl   | 435 ++++++++++++++++++++++++++++++++----
 src/mlir/libMLIR_h.jl       |  35 +++
 5 files changed, 545 insertions(+), 42 deletions(-)

diff --git a/src/mlir/Dialects/Enzyme.jl b/src/mlir/Dialects/Enzyme.jl
index 9d897e3356..405824dc7b 100755
--- a/src/mlir/Dialects/Enzyme.jl
+++ b/src/mlir/Dialects/Enzyme.jl
@@ -175,6 +175,40 @@ function autodiff(
     )
 end
 
+function autodiff_region(
+    inputs::Vector{Value};
+    outputs::Vector{IR.Type},
+    activity,
+    ret_activity,
+    width=nothing,
+    strong_zero=nothing,
+    fn=nothing,
+    body::Region,
+    location=Location(),
+)
+    op_ty_results = IR.Type[outputs...,]
+    operands = Value[inputs...,]
+    owned_regions = Region[body,]
+    successors = Block[]
+    attributes = NamedAttribute[
+        namedattribute("activity", activity), namedattribute("ret_activity", ret_activity)
+    ]
+    !isnothing(width) && push!(attributes, namedattribute("width", width))
+    !isnothing(strong_zero) && push!(attributes, namedattribute("strong_zero", strong_zero))
+    !isnothing(fn) && push!(attributes, namedattribute("fn", fn))
+
+    return create_operation(
+        "enzyme.autodiff_region",
+        location;
+        operands,
+        owned_regions,
+        successors,
+        attributes,
+        results=op_ty_results,
+        result_inference=false,
+    )
+end
+
 function batch(
     inputs::Vector{Value}; outputs::Vector{IR.Type}, fn, batch_shape, location=Location()
 )
@@ -648,4 +682,23 @@ function untracedCall(
     )
 end
 
+function yield(operands::Vector{Value}; location=Location())
+    op_ty_results = IR.Type[]
+    operands = Value[operands...,]
+    owned_regions = Region[]
+    successors = Block[]
+    attributes = NamedAttribute[]
+
+    return create_operation(
+        "enzyme.yield",
+        location;
+        operands,
+        owned_regions,
+        successors,
+        attributes,
+        results=op_ty_results,
+        result_inference=false,
+    )
+end
+
 end # enzyme
diff --git a/src/mlir/Dialects/Gpu.jl b/src/mlir/Dialects/Gpu.jl
index 76e09a20fa..4f5109b17e 100755
--- a/src/mlir/Dialects/Gpu.jl
+++ b/src/mlir/Dialects/Gpu.jl
@@ -2729,6 +2729,48 @@ function spmat_get_size(
     )
 end
 
+"""
+`subgroup_broadcast`
+
+Broadcasts a value from one lane to all active lanes in a subgroup. The
+result is guaranteed to be uniform across the active lanes in the subgroup.
+
+The possible broadcast types are:
+
+* `first_active_lane` - broadcasts the value from the first active lane
+in the subgroup.
+* `specific_lane` - broadcasts from the specified lane. The lane index
+must be uniform and within the subgroup size. The result is poison if the
+lane index is invalid, non-subgroup-uniform, or if the source lane is not
+active.
+"""
+function subgroup_broadcast(
+    src::Value,
+    lane=nothing::Union{Nothing,Value};
+    result=nothing::Union{Nothing,IR.Type},
+    broadcast_type,
+    location=Location(),
+)
+    op_ty_results = IR.Type[]
+    operands = Value[src,]
+    owned_regions = Region[]
+    successors = Block[]
+    attributes = NamedAttribute[namedattribute("broadcast_type", broadcast_type),]
+    !isnothing(lane) && push!(operands, lane)
+    !isnothing(result) && push!(op_ty_results, result)
+
+    return create_operation(
+        "gpu.subgroup_broadcast",
+        location;
+        operands,
+        owned_regions,
+        successors,
+        attributes,
+        results=(length(op_ty_results) == 0 ?
nothing : op_ty_results), + result_inference=(length(op_ty_results) == 0 ? true : false), + ) +end + """ `subgroup_id` diff --git a/src/mlir/Dialects/Llvm.jl b/src/mlir/Dialects/Llvm.jl index f6374c8afd..9afa56daad 100755 --- a/src/mlir/Dialects/Llvm.jl +++ b/src/mlir/Dialects/Llvm.jl @@ -1459,6 +1459,22 @@ Examples: // Alignment is optional llvm.mlir.global private constant @y(dense<1.0> : tensor<8xf32>) { alignment = 32 : i64 } : !llvm.array<8 x f32> ``` + +The `target_specific_attrs` attribute provides a mechanism to preserve +target-specific LLVM IR attributes that are not explicitly modeled in the +LLVM dialect. + +The attribute is an array containing either string attributes or +two-element array attributes of strings. The value of a standalone string +attribute is interpreted as the name of an LLVM IR attribute on the global. +A two-element array is interpreted as a key-value pair. + +# Example + +```mlir +llvm.mlir.global external @example() { + target_specific_attrs = [\"value-less-attr\", [\"int-attr\", \"4\"], [\"string-attr\", \"string\"]]} : f64 +``` """ function mlir_global(; global_type, @@ -1476,6 +1492,7 @@ function mlir_global(; comdat=nothing, dbg_exprs=nothing, visibility_=nothing, + target_specific_attrs=nothing, initializer::Region, location=Location(), ) @@ -1503,6 +1520,8 @@ function mlir_global(; !isnothing(comdat) && push!(attributes, namedattribute("comdat", comdat)) !isnothing(dbg_exprs) && push!(attributes, namedattribute("dbg_exprs", dbg_exprs)) !isnothing(visibility_) && push!(attributes, namedattribute("visibility_", visibility_)) + !isnothing(target_specific_attrs) && + push!(attributes, namedattribute("target_specific_attrs", target_specific_attrs)) return create_operation( "llvm.mlir.global", @@ -1933,7 +1952,6 @@ function func(; unsafe_fp_math=nothing, no_infs_fp_math=nothing, no_nans_fp_math=nothing, - approx_func_fp_math=nothing, no_signed_zeros_fp_math=nothing, denormal_fp_math=nothing, denormal_fp_math_f32=nothing, @@ -2014,8 +2032,6 @@ function func(; push!(attributes, namedattribute("no_infs_fp_math", no_infs_fp_math)) !isnothing(no_nans_fp_math) && push!(attributes, namedattribute("no_nans_fp_math", no_nans_fp_math)) - !isnothing(approx_func_fp_math) && - push!(attributes, namedattribute("approx_func_fp_math", approx_func_fp_math)) !isnothing(no_signed_zeros_fp_math) && push!( attributes, namedattribute("no_signed_zeros_fp_math", no_signed_zeros_fp_math) ) diff --git a/src/mlir/Dialects/Nvvm.jl b/src/mlir/Dialects/Nvvm.jl index 4e7d23e8a0..edac938a1f 100755 --- a/src/mlir/Dialects/Nvvm.jl +++ b/src/mlir/Dialects/Nvvm.jl @@ -13,6 +13,15 @@ import ...IR: import ..Dialects: namedattribute, operandsegmentsizes import ...API +""" +`barrier0` + +The `nvvm.barrier0` operation is a convenience operation that performs barrier +synchronization and communication within a CTA (Cooperative Thread Array) using +barrier ID 0. It is functionally equivalent to `nvvm.barrier` or `nvvm.barrier id=0`. + +[For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-bar) +""" function barrier0(; location=Location()) op_ty_results = IR.Type[] operands = Value[] @@ -65,6 +74,35 @@ function barrier_arrive( ) end +""" +`barrier` + +The `nvvm.barrier` operation performs barrier synchronization and communication +within a CTA (Cooperative Thread Array). It causes executing threads to wait for +all non-exited threads participating in the barrier to arrive. 
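+
+As a rough illustration, the generated Julia builder that follows this
+docstring might be invoked as shown below. This is a hedged sketch: `id` and
+`nthreads` stand for hypothetical SSA `Value`s created elsewhere, and the two
+optional operands are described next.
+
+```julia
+barrier()                              # all CTA threads, barrier resource 0
+barrier(id; numberOfThreads=nthreads)  # barrier `id` with `nthreads` threads
+```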
+
+The operation takes two optional operands:
+
+- `barrierId`: Specifies a logical barrier resource with value 0 through 15.
+  Each CTA instance has sixteen barriers numbered 0..15. Defaults to 0 if not specified.
+- `numberOfThreads`: Specifies the number of threads participating in the barrier.
+  When specified, the value must be a multiple of the warp size. If not specified,
+  all threads in the CTA participate in the barrier.
+
+The barrier operation guarantees that when the barrier completes, prior memory
+accesses requested by participating threads are performed relative to all threads
+participating in the barrier. It also ensures that no new memory access is
+requested by participating threads before the barrier completes.
+
+When a barrier completes, the waiting threads are restarted without delay, and
+the barrier is reinitialized so that it can be immediately reused.
+
+This operation generates an aligned barrier, indicating that all threads in the CTA
+will execute the same barrier instruction. Behavior is undefined if all threads in the
+CTA do not reach this instruction.
+
+[For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-bar)
+"""
 function barrier(
     barrierId=nothing::Union{Nothing,Value};
     numberOfThreads=nothing::Union{Nothing,Value},
     location=Location(),
@@ -1294,11 +1332,25 @@ function cp_async_bulk_tensor_reduce(
     )
 end
 
+"""
+`cp_async_bulk_tensor_global_shared_cta`
+
+Initiates an asynchronous copy of the tensor data from shared::cta
+memory to global memory. This Op supports all the store modes specified in
+`TMAStoreMode`.
+
+The `l2CacheHint` operand is optional, and it is used to specify the cache
+eviction policy that may be used during the memory access.
+
+[For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-cp-async-bulk-tensor)
+"""
 function cp_async_bulk_tensor_global_shared_cta(
     tmaDescriptor::Value,
     srcMem::Value,
     coordinates::Vector{Value},
-    predicate=nothing::Union{Nothing,Value};
+    l2CacheHint=nothing::Union{Nothing,Value};
+    predicate=nothing::Union{Nothing,Value},
+    mode=nothing,
     location=Location(),
 )
     op_ty_results = IR.Type[]
@@ -1306,11 +1358,24 @@ function cp_async_bulk_tensor_global_shared_cta(
     owned_regions = Region[]
     successors = Block[]
     attributes = NamedAttribute[]
+    !isnothing(l2CacheHint) && push!(operands, l2CacheHint)
     !isnothing(predicate) && push!(operands, predicate)
     push!(
         attributes,
-        operandsegmentsizes([1, 1, length(coordinates), (predicate == nothing) ? 0 : 1]),
+        operandsegmentsizes([
+            1,
+            1,
+            length(coordinates),
+            (l2CacheHint == nothing) ? 0 : 1,
+            (predicate == nothing) ? 0 : 1,
+        ]),
     )
+    !isnothing(mode) && push!(attributes, namedattribute("mode", mode))
 
     return create_operation(
         "nvvm.cp.async.bulk.tensor.global.shared.cta",
         location;
         operands,
         owned_regions,
         successors,
         attributes,
         results=op_ty_results,
         result_inference=false,
     )
 end
 
@@ -1381,9 +1446,9 @@ end
 """
 `cp_async_mbarrier_arrive`
 
-The `cp.async.mbarrier.arrive` Op makes the mbarrier object track
+The `cp.async.mbarrier.arrive` Op makes the *mbarrier object* track
 all prior cp.async operations initiated by the executing thread.
-The `addr` operand specifies the address of the mbarrier object
+The `addr` operand specifies the address of the *mbarrier object*
 in generic address space. The `noinc` attr impacts how the
 mbarrier\'s state is updated.
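+
+A hedged Julia sketch of the corresponding builder, assuming it follows the
+usual pattern in this file (one positional `addr` operand plus the optional
+`noinc` attribute); `addr` is a hypothetical generic-address `Value`:
+
+```julia
+cp_async_mbarrier_arrive(addr)              # arrival count is incremented
+cp_async_mbarrier_arrive(addr; noinc=true)  # noinc set: count left unchanged
+```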
@@ -1412,9 +1477,9 @@
 """
 `cp_async_mbarrier_arrive_shared`
 
-The `cp.async.mbarrier.arrive.shared` Op makes the mbarrier object
+The `cp.async.mbarrier.arrive.shared` Op makes the *mbarrier object*
 track all prior cp.async operations initiated by the executing thread.
-The `addr` operand specifies the address of the mbarrier object in
+The `addr` operand specifies the address of the *mbarrier object* in
 shared memory. The `noinc` attr impacts how the mbarrier\'s state
 is updated.
 
@@ -2762,6 +2827,33 @@ function ldmatrix(
     )
 end
 
+"""
+`mbarrier_arrive_expect_tx`
+
+The `nvvm.mbarrier.arrive.expect_tx` operation performs an expect-tx operation
+followed by an arrive-on operation on the *mbarrier object*. Uses the default
+`.release.cta` semantics. This release pattern establishes memory ordering for
+operations occurring in program order before this arrive instruction by making
+operations from the current thread visible to subsequent operations in other
+threads within the CTA. When other threads perform corresponding acquire operations
+(like \'mbarrier.test.wait\'), they synchronize with this release pattern.
+
+This operation first performs an expect-tx operation with the specified transaction
+count, then performs an arrive-on operation with an implicit count of 1. The
+expect-tx operation increases the tx-count of the *mbarrier object* by the specified
+`txcount` value, setting the current phase to expect and tracking the completion
+of additional asynchronous transactions.
+
+The operation takes the following operands:
+- `addr`: A pointer to the memory location of the *mbarrier object*. Uses generic
+  addressing, but the address must still be in the shared memory space.
+- `txcount`: An unsigned integer specifying the expected transaction count
+  for the expect-tx operation. This represents the number of asynchronous transactions
+  expected to complete before the barrier phase completes.
+- `predicate`: Optional predicate for conditional execution.
+
+[For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-arrive)
+"""
 function mbarrier_arrive_expect_tx(
     addr::Value,
     txcount::Value,
@@ -2787,6 +2879,14 @@ end
 
+"""
+`mbarrier_arrive_expect_tx_shared`
+
+This Op is the same as `nvvm.mbarrier.arrive.expect_tx` except that the *mbarrier object*
+should be accessed using a shared-memory pointer instead of a generic-memory pointer.
+
+[For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-arrive)
+"""
 function mbarrier_arrive_expect_tx_shared(
     addr::Value,
     txcount::Value,
@@ -2812,6 +2912,31 @@ end
 
+"""
+`mbarrier_arrive_nocomplete`
+
+The `nvvm.mbarrier.arrive.nocomplete` operation performs an arrive-on operation
+on the *mbarrier object* with the guarantee that it will not cause the barrier to
+complete its current phase. Uses the default `.release.cta` semantics. This release
+pattern establishes memory ordering for operations occurring in program order before
+this arrive instruction by making operations from the current thread visible to
+subsequent operations in other threads within the CTA. When other threads perform
+corresponding acquire operations (like \'mbarrier.test.wait\'), they synchronize with
+this release pattern.
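+
+A hedged sketch of the generated builder (operands are described below;
+`addr`, `count`, and `i64` stand for a pointer `Value`, an integer `Value`,
+and the opaque state type, all created elsewhere):
+
+```julia
+# Returns the created op; its single result is the opaque state value.
+arrive = mbarrier_arrive_nocomplete(addr, count; res=i64)
+```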
+ +This operation causes the executing thread to signal its arrival at the barrier +with a specified count, but ensures that the barrier phase will not complete as +a result of this operation. The operation returns an opaque value that +captures the phase of the *mbarrier object* prior to the arrive-on operation. + +The operation takes the following operands: +- `addr`: A pointer to the memory location of the *mbarrier object*. Uses generic + addressing, but the address must still be in the shared memory space. +- `count`: Integer specifying the count argument to the arrive-on operation. + Must be in the valid range as specified in the *mbarrier object* contents. + +[For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-arrive) +""" function mbarrier_arrive_nocomplete( addr::Value, count::Value; res::IR.Type, location=Location() ) @@ -2833,6 +2958,14 @@ function mbarrier_arrive_nocomplete( ) end +""" +`mbarrier_arrive_nocomplete_shared` + +This Op is the same as `nvvm.mbarrier.arrive.nocomplete` except that the *mbarrier object* +should be accessed using a shared-memory pointer instead of a generic-memory pointer. + +[For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-arrive) +""" function mbarrier_arrive_nocomplete_shared( addr::Value, count::Value; res::IR.Type, location=Location() ) @@ -2854,6 +2987,28 @@ function mbarrier_arrive_nocomplete_shared( ) end +""" +`mbarrier_arrive` + +The `nvvm.mbarrier.arrive` operation performs an arrive-on operation on the +*mbarrier object* at the specified address. Uses the default `.release.cta` semantics. +This release pattern establishes memory ordering for operations occurring in program +order before this arrive instruction by making operations from the current thread +visible to subsequent operations in other threads within the CTA. When other threads +perform corresponding acquire operations (like \'mbarrier.test.wait\'), they synchronize +with this release pattern. + +This operation causes the executing thread to signal its arrival at the barrier. +The operation returns an opaque value that captures the phase of the +*mbarrier object* prior to the arrive-on operation. The contents of this state +value are implementation-specific. + +The operation takes the following operand: +- `addr`: A pointer to the memory location of the *mbarrier object*. Uses generic + addressing, but the address must still be in the shared memory space. + +[For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-arrive) +""" function mbarrier_arrive(addr::Value; res::IR.Type, location=Location()) op_ty_results = IR.Type[res,] operands = Value[addr,] @@ -2873,6 +3028,14 @@ function mbarrier_arrive(addr::Value; res::IR.Type, location=Location()) ) end +""" +`mbarrier_arrive_shared` + +This Op is the same as `nvvm.mbarrier.arrive` except that the *mbarrier object* +should be accessed using a shared-memory pointer instead of a generic-memory pointer. 
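+
+A hedged one-line sketch (`addr` is a hypothetical shared-memory pointer
+`Value` and `i64` the opaque state type):
+
+```julia
+arrive = mbarrier_arrive_shared(addr; res=i64)  # op result carries the state
+```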
+ +[For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-arrive) +""" function mbarrier_arrive_shared(addr::Value; res::IR.Type, location=Location()) op_ty_results = IR.Type[res,] operands = Value[addr,] @@ -2892,6 +3055,27 @@ function mbarrier_arrive_shared(addr::Value; res::IR.Type, location=Location()) ) end +""" +`mbarrier_init` + +The `nvvm.mbarrier.init` operation initializes an *mbarrier object* at the specified +memory location. + +This operation initializes the *mbarrier object* with the following state: +- Current phase: 0 +- Expected arrival count: `count` +- Pending arrival count: `count` +- Transaction count (tx-count): 0 + +The operation takes the following operands: +- `addr`: A pointer to the memory location of the *mbarrier object*. Uses generic + addressing, but the address must still be in the shared memory space. +- `count`: Integer specifying the number of threads that will participate in barrier + synchronization. Must be in the range [1, 2²⁰ - 1]. +- `predicate`: Optional predicate for conditional execution. + +[For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-init) +""" function mbarrier_init( addr::Value, count::Value, predicate=nothing::Union{Nothing,Value}; location=Location() ) @@ -2914,6 +3098,14 @@ function mbarrier_init( ) end +""" +`mbarrier_init_shared` + +This Op is the same as `nvvm.mbarrier.init` except that the *mbarrier object* +should be accessed using a shared-memory pointer instead of a generic-memory pointer. + +[For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-init) +""" function mbarrier_init_shared( addr::Value, count::Value, predicate=nothing::Union{Nothing,Value}; location=Location() ) @@ -2936,6 +3128,22 @@ function mbarrier_init_shared( ) end +""" +`mbarrier_inval` + +The `nvvm.mbarrier.inval` operation invalidates an *mbarrier object* at the +specified memory location. + +This operation marks the *mbarrier object* as invalid, making it safe to repurpose +the memory location for other uses or to reinitialize it as a new *mbarrier object*. +It is undefined behavior if the *mbarrier object* is already invalid. + +The operation takes the following operand: +- `addr`: A pointer to the memory location of the *mbarrier object*. Uses generic + addressing, but the address must still be in the shared memory space. + +[For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-inval) +""" function mbarrier_inval(addr::Value; location=Location()) op_ty_results = IR.Type[] operands = Value[addr,] @@ -2955,6 +3163,14 @@ function mbarrier_inval(addr::Value; location=Location()) ) end +""" +`mbarrier_inval_shared` + +This Op is the same as `nvvm.mbarrier.inval` except that the *mbarrier object* +should be accessed using a shared-memory pointer instead of a generic-memory pointer. 
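+
+For orientation, a hedged sketch of a typical shared-memory lifecycle built
+from the generated builders in this file (all names are hypothetical
+`Value`s and types):
+
+```julia
+mbarrier_init_shared(addr, count)               # initialize the mbarrier
+arrive = mbarrier_arrive_shared(addr; res=i64)  # arrive on it
+mbarrier_inval_shared(addr)                     # invalidate before reuse
+```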
+ +[For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-inval) +""" function mbarrier_inval_shared(addr::Value; location=Location()) op_ty_results = IR.Type[] operands = Value[addr,] @@ -2974,6 +3190,53 @@ function mbarrier_inval_shared(addr::Value; location=Location()) ) end +""" +`mbarrier_test_wait` + +The `nvvm.mbarrier.test.wait` operation performs a non-blocking test for the +completion of a specific phase of an *mbarrier object*. It uses the default +`.acquire.cta` semantics. This acquire pattern establishes memory ordering for +operations occurring in program order after this wait instruction by making +operations from other threads in the CTA visible to subsequent operations in the current +thread. When this wait completes, it synchronizes with the corresponding release +pattern from the `mbarrier.arrive` operation, establishing memory ordering within +the CTA. + +This operation tests whether the mbarrier phase specified by the state operand +has completed. It is a non-blocking instruction that immediately returns the +completion status without suspending the executing thread. + +The operation takes the following operands: +- `addr`: A pointer to the memory location of the *mbarrier object*. Uses generic + addressing, but the address must still be in the shared memory space. +- `state`: An opaque value returned by a previous `mbarrier.arrive` + operation on the same *mbarrier object* during the current or immediately + preceding phase. + +The operation returns a boolean value indicating whether the specified phase +has completed: +- `true`: The immediately preceding phase has completed +- `false`: The phase is still incomplete (current phase) + +**Memory ordering guarantees**: When this wait returns true, the following +ordering guarantees hold: + +1. All memory accesses (except async operations) requested prior to + `mbarrier.arrive` having release semantics by participating CTA threads + are visible to the executing thread. +2. All `cp.async` operations requested prior to `cp.async.mbarrier.arrive` + by participating CTA threads are visible to the executing thread. +3. All `cp.async.bulk` operations using the same *mbarrier object* requested + prior to `mbarrier.arrive` having release semantics by participating CTA + threads are visible to the executing thread. +4. Memory accesses requested after this wait are not visible to memory + accesses performed prior to `mbarrier.arrive` by other participating + threads. +5. No ordering guarantee exists for memory accesses by the same thread + between `mbarrier.arrive` and this wait. + +[For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/#parallel-synchronization-and-communication-instructions-mbarrier-test-wait-try-wait) +""" function mbarrier_test_wait(addr::Value, state::Value; res::IR.Type, location=Location()) op_ty_results = IR.Type[res,] operands = Value[addr, state] @@ -2993,6 +3256,14 @@ function mbarrier_test_wait(addr::Value, state::Value; res::IR.Type, location=Lo ) end +""" +`mbarrier_test_wait_shared` + +This Op is the same as `nvvm.mbarrier.test.wait` except that the *mbarrier object* +should be accessed using a shared-memory pointer instead of a generic-memory pointer. 
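+
+A hedged sketch (`i1` stands for the boolean result type; `addr` and `state`
+are as in `nvvm.mbarrier.test.wait`):
+
+```julia
+done = mbarrier_test_wait_shared(addr, state; res=i1)  # op result is the flag
+```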
+
+[For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/#parallel-synchronization-and-communication-instructions-mbarrier-test-wait-try-wait)
+"""
 function mbarrier_test_wait_shared(
     addr::Value, state::Value; res::IR.Type, location=Location()
 )
@@ -3014,6 +3285,58 @@ end
 
+"""
+`mbarrier_try_wait_parity`
+
+The `nvvm.mbarrier.try_wait.parity` operation performs a potentially-blocking
+test for the completion of a specific phase of an *mbarrier object* using phase
+parity. It uses the default `.acquire.cta` semantics. This acquire pattern
+establishes memory ordering for operations occurring in program order after this
+wait instruction by making operations from other threads in the CTA visible to subsequent
+operations in the current thread. When this wait completes, it synchronizes with
+the corresponding release pattern from the `mbarrier.arrive` operation, establishing
+memory ordering within the CTA.
+
+This operation waits for the completion of the mbarrier phase indicated by the
+phase parity. While it uses the underlying PTX `mbarrier.try_wait.parity`
+instruction, this MLIR operation generates a loop that retries the test until
+it succeeds, ensuring the barrier phase has actually completed rather than
+potentially timing out.
+
+The operation takes the following operands:
+- `addr`: A pointer to the memory location of the *mbarrier object*. Uses generic
+  addressing, but the address must still be in the shared memory space.
+- `phase`: An integer specifying the phase parity (0 or 1). Even phases
+  have parity 0, odd phases have parity 1.
+- `ticks`: An unsigned integer specifying the suspend time hint in
+  nanoseconds. This may be used instead of the system-dependent time limit.
+
+**Memory ordering guarantees**: When this wait completes, the following
+ordering guarantees hold:
+
+1. All memory accesses (except async operations) requested prior to
+   `mbarrier.arrive` having release semantics by participating CTA threads
+   are visible to the executing thread.
+2. All `cp.async` operations requested prior to `cp.async.mbarrier.arrive`
+   by participating CTA threads are visible to the executing thread.
+3. All `cp.async.bulk` operations using the same *mbarrier object* requested
+   prior to `mbarrier.arrive` having release semantics by participating CTA
+   threads are visible to the executing thread.
+4. Memory accesses requested after this wait are not visible to memory
+   accesses performed prior to `mbarrier.arrive` by other participating
+   threads.
+5. No ordering guarantee exists for memory accesses by the same thread
+   between `mbarrier.arrive` and this wait.
+
+**Implementation behavior**:
+This operation generates a PTX loop that repeatedly calls the underlying
+`mbarrier.try_wait.parity` instruction until the barrier phase completes.
+Unlike the raw PTX instruction, which may return without completion after a
+timeout, this MLIR operation guarantees completion by continuing to loop until
+the specified phase is reached.
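+
+A hedged sketch of the generated builder shown below (`addr`, `phase`, and
+`ticks` are hypothetical `Value`s; the op produces no result):
+
+```julia
+mbarrier_try_wait_parity(addr, phase, ticks)
+```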
+ +[For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/#parallel-synchronization-and-communication-instructions-mbarrier-test-wait-try-wait) +""" function mbarrier_try_wait_parity( addr::Value, phase::Value, ticks::Value; location=Location() ) @@ -3035,6 +3358,14 @@ function mbarrier_try_wait_parity( ) end +""" +`mbarrier_try_wait_parity_shared` + +This Op is the same as `nvvm.mbarrier.try_wait.parity` except that the *mbarrier object* +should be accessed using a shared-memory pointer instead of a generic-memory pointer. + +[For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/#parallel-synchronization-and-communication-instructions-mbarrier-test-wait-try-wait) +""" function mbarrier_try_wait_parity_shared( addr::Value, phase::Value, ticks::Value; location=Location() ) @@ -3297,11 +3628,20 @@ end """ `prefetch` -Operand `addr` can be a global, local or generic address pointer. No -operation is performed if `addr` maps to a `shared` memory location. +Prefetches the cache line containing the address given by `addr`. The +operand may be a global, local, or generic pointer. When `tensormap` is +specified, the operand may instead be a constant or generic pointer. If the +address maps to shared memory, the operation has no effect. + +At most one of `cacheLevel` or `tensormap` may be present. The `cacheLevel` +attribute selects the target cache level. When combined with `uniform`, the +prefetch is performed to the uniform cache, in which case `addr` must be a +generic pointer. -The `cacheLevel` attribute specifies the cache level to which the cache line -containing the specified address is brought. +When `tensormap` is used, the line containing `addr` is brought from the +constant or parameter state space for later use by `cp.async.bulk.tensor`. +If `in_param_space` is specified, the generic pointer is interpreted as +referring to the parameter state space. `uniform` can be specified after the `cacheLevel` to indicate that the prefetch is performed to the specified uniform cache level. If `uniform` is @@ -3314,16 +3654,28 @@ priority when `cacheLevel` is L2. 
[For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-prefetch-prefetchu) """ function prefetch( - addr::Value; cacheLevel, uniform=nothing, evictPriority=nothing, location=Location() + addr::Value, + predicate=nothing::Union{Nothing,Value}; + cacheLevel=nothing, + evictPriority=nothing, + tensormap=nothing, + uniform=nothing, + in_param_space=nothing, + location=Location(), ) op_ty_results = IR.Type[] operands = Value[addr,] owned_regions = Region[] successors = Block[] - attributes = NamedAttribute[namedattribute("cacheLevel", cacheLevel),] - !isnothing(uniform) && push!(attributes, namedattribute("uniform", uniform)) + attributes = NamedAttribute[] + !isnothing(predicate) && push!(operands, predicate) + !isnothing(cacheLevel) && push!(attributes, namedattribute("cacheLevel", cacheLevel)) !isnothing(evictPriority) && push!(attributes, namedattribute("evictPriority", evictPriority)) + !isnothing(tensormap) && push!(attributes, namedattribute("tensormap", tensormap)) + !isnothing(uniform) && push!(attributes, namedattribute("uniform", uniform)) + !isnothing(in_param_space) && + push!(attributes, namedattribute("in_param_space", in_param_space)) return create_operation( "nvvm.prefetch", @@ -3337,28 +3689,6 @@ function prefetch( ) end -function prefetch_tensormap( - tmaDescriptor::Value, predicate=nothing::Union{Nothing,Value}; location=Location() -) - op_ty_results = IR.Type[] - operands = Value[tmaDescriptor,] - owned_regions = Region[] - successors = Block[] - attributes = NamedAttribute[] - !isnothing(predicate) && push!(operands, predicate) - - return create_operation( - "nvvm.prefetch.tensormap", - location; - operands, - owned_regions, - successors, - attributes, - results=op_ty_results, - result_inference=false, - ) -end - function rcp_approx_ftz_f(arg::Value; res::IR.Type, location=Location()) op_ty_results = IR.Type[res,] operands = Value[arg,] @@ -3447,7 +3777,7 @@ end The `shfl.sync` Op implements data shuffle within threads of a warp. The `thread_mask` denotes the threads participating in the Op where -the bit position corresponds to a particular thread’s laneid. +the bit position corresponds to a particular thread\'s laneid. The `offset` specifies a source lane or source lane offset (depending on `kind`). The `val` is the input value to be copied from the source. The `mask_and_clamp` contains two packed values specifying @@ -3561,6 +3891,33 @@ function stmatrix( ) end +""" +`bar_warp_sync` + +The `nvvm.bar.warp.sync` operation performs barrier synchronization for threads +within a warp. + +This operation causes the executing thread to wait until all threads corresponding +to the `mask` operand have executed a `bar.warp.sync` with the same mask value +before resuming execution. + +The `mask` operand specifies the threads participating in the barrier, where each +bit position corresponds to the thread\'s lane ID within the warp. Only threads with +their corresponding bit set in the mask participate in the barrier synchronization. + +**Important constraints**: +- The behavior is undefined if the executing thread is not included in the mask + (i.e., the bit corresponding to the thread\'s lane ID is not set) +- For compute capability sm_6x or below, all threads in the mask must execute + the same `bar.warp.sync` instruction in convergence + +This operation also guarantees memory ordering among participating threads. 
+Threads within the warp that wish to communicate via memory can store to memory, +execute `bar.warp.sync`, and then safely read values stored by other threads +in the warp. + +[For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-bar-warp-sync) +""" function bar_warp_sync(mask::Value; location=Location()) op_ty_results = IR.Type[] operands = Value[mask,] @@ -3612,10 +3969,10 @@ end """ `tcgen05_commit` -The `tcgen05.commit` makes the mbarrier object, specified by +The `tcgen05.commit` makes the *mbarrier object*, specified by the operand `addr`, track the completion of all the prior async-tcgen05 operations initiated by the executing thread. -The multicast variants allow signaling on the mbarrier objects +The multicast variants allow signaling on the *mbarrier objects* of multiple CTAs within the cluster. Operand `multicastMask`, when present, specifies the destination CTAs in the cluster such that each bit position in the 16-bit `multicastMask` operand @@ -4146,7 +4503,7 @@ The vote operation kinds are: - `ballot`: In the ballot form, the destination result is a 32 bit integer. In this form, the predicate from each thread in membermask are copied into the corresponding bit position of the result, where the bit position - corresponds to the thread’s lane id. + corresponds to the thread\'s lane id. [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/#parallel-synchronization-and-communication-instructions-vote-sync) """ diff --git a/src/mlir/libMLIR_h.jl b/src/mlir/libMLIR_h.jl index 74885fc07c..e87b136406 100755 --- a/src/mlir/libMLIR_h.jl +++ b/src/mlir/libMLIR_h.jl @@ -996,6 +996,24 @@ function mlirModuleFromOperation(op) @ccall mlir_c.mlirModuleFromOperation(op::MlirOperation)::MlirModule end +""" + mlirModuleEqual(lhs, rhs) + +Checks if two modules are equal. +""" +function mlirModuleEqual(lhs, rhs) + @ccall mlir_c.mlirModuleEqual(lhs::MlirModule, rhs::MlirModule)::Bool +end + +""" + mlirModuleHashValue(mod) + +Compute a hash for the given module. +""" +function mlirModuleHashValue(mod) + @ccall mlir_c.mlirModuleHashValue(mod::MlirModule)::Csize_t +end + """ MlirOperationState @@ -1301,6 +1319,15 @@ function mlirOperationEqual(op, other) @ccall mlir_c.mlirOperationEqual(op::MlirOperation, other::MlirOperation)::Bool end +""" + mlirOperationHashValue(op) + +Compute a hash for the given operation. +""" +function mlirOperationHashValue(op) + @ccall mlir_c.mlirOperationHashValue(op::MlirOperation)::Csize_t +end + """ mlirOperationGetContext(op) @@ -9570,6 +9597,14 @@ function mlirFrozenRewritePatternSetDestroy(op) @ccall mlir_c.mlirFrozenRewritePatternSetDestroy(op::MlirFrozenRewritePatternSet)::Cvoid end +function mlirApplyPatternsAndFoldGreedilyWithOp(op, patterns, arg3) + @ccall mlir_c.mlirApplyPatternsAndFoldGreedilyWithOp( + op::MlirOperation, + patterns::MlirFrozenRewritePatternSet, + arg3::MlirGreedyRewriteDriverConfig, + )::MlirLogicalResult +end + function mlirApplyPatternsAndFoldGreedily(op, patterns, arg3) @ccall mlir_c.mlirApplyPatternsAndFoldGreedily( op::MlirModule,