diff --git a/src/mlir/Dialects/EnzymeXLA.jl b/src/mlir/Dialects/EnzymeXLA.jl index 4de6d8a13f..b02327cbdf 100755 --- a/src/mlir/Dialects/EnzymeXLA.jl +++ b/src/mlir/Dialects/EnzymeXLA.jl @@ -902,11 +902,11 @@ function subindex(source::Value, index::Value; result::IR.Type, location=Locatio end """ -`lapack_symm` +`blas_symm` C := alpha*A*B + beta*C, or C := alpha*B*A + beta*C, where alpha and beta are scalars, A is a symmetric matrix\" """ -function lapack_symm( +function blas_symm( A::Value, B::Value, C::Value, @@ -924,7 +924,7 @@ function lapack_symm( attributes = NamedAttribute[namedattribute("side", side), namedattribute("uplo", uplo)] return create_operation( - "enzymexla.lapack.symm", + "enzymexla.blas.symm", location; operands, owned_regions, diff --git a/src/mlir/Dialects/MosaicGPU.jl b/src/mlir/Dialects/MosaicGPU.jl index 23559557d0..c2bbd49cb7 100644 --- a/src/mlir/Dialects/MosaicGPU.jl +++ b/src/mlir/Dialects/MosaicGPU.jl @@ -32,6 +32,25 @@ function arrive_expect_tx(barrier::Value; expect_tx, location=Location()) ) end +function arrive(barrier::Value; orders_tensor_core, location=Location()) + op_ty_results = IR.Type[] + operands = Value[barrier,] + owned_regions = Region[] + successors = Block[] + attributes = NamedAttribute[namedattribute("orders_tensor_core", orders_tensor_core),] + + return create_operation( + "mosaic_gpu.arrive", + location; + operands, + owned_regions, + successors, + attributes, + results=op_ty_results, + result_inference=false, + ) +end + """ `async_load` @@ -280,6 +299,31 @@ function broadcast_in_dim( ) end +""" +`broadcasted_iota` + +Creates an array that has the specified shape and holds values starting at +zero and incrementing by one along the specified dimension. +""" +function broadcasted_iota(; result_0::IR.Type, dimension, location=Location()) + op_ty_results = IR.Type[result_0,] + operands = Value[] + owned_regions = Region[] + successors = Block[] + attributes = NamedAttribute[namedattribute("dimension", dimension),] + + return create_operation( + "mosaic_gpu.broadcasted_iota", + location; + operands, + owned_regions, + successors, + attributes, + results=op_ty_results, + result_inference=false, + ) +end + """ `custom_primitive` @@ -423,6 +467,25 @@ function optimization_barrier( ) end +function print_layout(value::Value; format, location=Location()) + op_ty_results = IR.Type[] + operands = Value[value,] + owned_regions = Region[] + successors = Block[] + attributes = NamedAttribute[namedattribute("format", format),] + + return create_operation( + "mosaic_gpu.print_layout", + location; + operands, + owned_regions, + successors, + attributes, + results=op_ty_results, + result_inference=false, + ) +end + """ `return_` diff --git a/src/mlir/Dialects/Nvvm.jl b/src/mlir/Dialects/Nvvm.jl index 536aca3100..1db4e7e1fa 100755 --- a/src/mlir/Dialects/Nvvm.jl +++ b/src/mlir/Dialects/Nvvm.jl @@ -1853,8 +1853,10 @@ end The `cp.async.mbarrier.arrive` Op makes the *mbarrier object* track all prior cp.async operations initiated by the executing thread. The `addr` operand specifies the address of the *mbarrier object* -in generic address space. The `noinc` attr impacts how the -mbarrier\'s state is updated. +in generic or shared::cta address space. When it is generic, the +underlying memory should fall within the shared::cta space; +otherwise the behavior is undefined. The `noinc` attr impacts +how the mbarrier\'s state is updated. [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-cp-async-mbarrier-arrive) """ @@ -1878,37 +1880,6 @@ function cp_async_mbarrier_arrive(addr::Value; noinc=nothing, location=Location( ) end -""" -`cp_async_mbarrier_arrive_shared` - -The `cp.async.mbarrier.arrive.shared` Op makes the *mbarrier object* -track all prior cp.async operations initiated by the executing thread. -The `addr` operand specifies the address of the *mbarrier object* in -shared memory. The `noinc` attr impacts how the mbarrier\'s state -is updated. - -[For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-cp-async-mbarrier-arrive) -""" -function cp_async_mbarrier_arrive_shared(addr::Value; noinc=nothing, location=Location()) - op_ty_results = IR.Type[] - operands = Value[addr,] - owned_regions = Region[] - successors = Block[] - attributes = NamedAttribute[] - !isnothing(noinc) && push!(attributes, namedattribute("noinc", noinc)) - - return create_operation( - "nvvm.cp.async.mbarrier.arrive.shared", - location; - operands, - owned_regions, - successors, - attributes, - results=op_ty_results, - result_inference=false, - ) -end - function cp_async_shared_global( dst::Value, src::Value, @@ -3334,8 +3305,10 @@ a result of this operation. The operation returns an opaque value that captures the phase of the *mbarrier object* prior to the arrive-on operation. The operation takes the following operands: -- `addr`: A pointer to the memory location of the *mbarrier object*. Uses generic - addressing, but the address must still be in the shared memory space. +- `addr`: A pointer to the memory location of the *mbarrier object*. The `addr` + must be a pointer to generic or shared::cta memory. When it is generic, the + underlying address must be within the shared::cta memory space; otherwise + the behavior is undefined. - `count`: Integer specifying the count argument to the arrive-on operation. Must be in the valid range as specified in the *mbarrier object* contents. @@ -3362,35 +3335,6 @@ function mbarrier_arrive_nocomplete( ) end -""" -`mbarrier_arrive_nocomplete_shared` - -This Op is the same as `nvvm.mbarrier.arrive.nocomplete` except that the *mbarrier object* -should be accessed using a shared-memory pointer instead of a generic-memory pointer. - -[For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-arrive) -""" -function mbarrier_arrive_nocomplete_shared( - addr::Value, count::Value; res::IR.Type, location=Location() -) - op_ty_results = IR.Type[res,] - operands = Value[addr, count] - owned_regions = Region[] - successors = Block[] - attributes = NamedAttribute[] - - return create_operation( - "nvvm.mbarrier.arrive.nocomplete.shared", - location; - operands, - owned_regions, - successors, - attributes, - results=op_ty_results, - result_inference=false, - ) -end - """ `mbarrier_arrive` @@ -3408,8 +3352,10 @@ The operation returns an opaque value that captures the phase of the value are implementation-specific. The operation takes the following operand: -- `addr`: A pointer to the memory location of the *mbarrier object*. Uses generic - addressing, but the address must still be in the shared memory space. +- `addr`: A pointer to the memory location of the *mbarrier object*. The `addr` + must be a pointer to generic or shared::cta memory. When it is generic, the + underlying address must be within the shared::cta memory space; otherwise + the behavior is undefined. [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-arrive) """ @@ -3432,33 +3378,6 @@ function mbarrier_arrive(addr::Value; res::IR.Type, location=Location()) ) end -""" -`mbarrier_arrive_shared` - -This Op is the same as `nvvm.mbarrier.arrive` except that the *mbarrier object* -should be accessed using a shared-memory pointer instead of a generic-memory pointer. - -[For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-arrive) -""" -function mbarrier_arrive_shared(addr::Value; res::IR.Type, location=Location()) - op_ty_results = IR.Type[res,] - operands = Value[addr,] - owned_regions = Region[] - successors = Block[] - attributes = NamedAttribute[] - - return create_operation( - "nvvm.mbarrier.arrive.shared", - location; - operands, - owned_regions, - successors, - attributes, - results=op_ty_results, - result_inference=false, - ) -end - """ `mbarrier_init` @@ -3607,35 +3526,6 @@ function mbarrier_test_wait(addr::Value, state::Value; res::IR.Type, location=Lo ) end -""" -`mbarrier_test_wait_shared` - -This Op is the same as `nvvm.mbarrier.test.wait` except that the *mbarrier object* -should be accessed using a shared-memory pointer instead of a generic-memory pointer. - -[For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/#parallel-synchronization-and-communication-instructions-mbarrier-test-wait-try-wait) -""" -function mbarrier_test_wait_shared( - addr::Value, state::Value; res::IR.Type, location=Location() -) - op_ty_results = IR.Type[res,] - operands = Value[addr, state] - owned_regions = Region[] - successors = Block[] - attributes = NamedAttribute[] - - return create_operation( - "nvvm.mbarrier.test.wait.shared", - location; - operands, - owned_regions, - successors, - attributes, - results=op_ty_results, - result_inference=false, - ) -end - """ `mbarrier_try_wait_parity` @@ -3793,6 +3683,34 @@ function match_sync(thread_mask::Value, val::Value; res::IR.Type, kind, location ) end +""" +`memory_barrier` + +`membar` operation guarantees that prior memory accesses requested by this +thread are performed at the specified `scope`, before later memory +operations requested by this thread following the membar instruction. + +[For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/#parallel-synchronization-and-communication-instructions-membar) +""" +function memory_barrier(; scope, location=Location()) + op_ty_results = IR.Type[] + operands = Value[] + owned_regions = Region[] + successors = Block[] + attributes = NamedAttribute[namedattribute("scope", scope),] + + return create_operation( + "nvvm.memory.barrier", + location; + operands, + owned_regions, + successors, + attributes, + results=op_ty_results, + result_inference=false, + ) +end + """ `mma_sync` diff --git a/src/mlir/Dialects/TPU.jl b/src/mlir/Dialects/TPU.jl index f0ca75c0eb..20f9c1533d 100755 --- a/src/mlir/Dialects/TPU.jl +++ b/src/mlir/Dialects/TPU.jl @@ -476,12 +476,15 @@ function enqueue_indirect_dma( ) end -function erase_memref_layout(operand::Value; result::IR.Type, location=Location()) - op_ty_results = IR.Type[result,] +function erase_memref_layout( + operand::Value; result=nothing::Union{Nothing,IR.Type}, location=Location() +) + op_ty_results = IR.Type[] operands = Value[operand,] owned_regions = Region[] successors = Block[] attributes = NamedAttribute[] + !isnothing(result) && push!(op_ty_results, result) return create_operation( "tpu.erase_memref_layout", @@ -490,8 +493,8 @@ function erase_memref_layout(operand::Value; result::IR.Type, location=Location( owned_regions, successors, attributes, - results=op_ty_results, - result_inference=false, + results=(length(op_ty_results) == 0 ? nothing : op_ty_results), + result_inference=(length(op_ty_results) == 0 ? true : false), ) end diff --git a/src/mlir/libMLIR_h.jl b/src/mlir/libMLIR_h.jl index 33c37243a7..fa656d4f07 100755 --- a/src/mlir/libMLIR_h.jl +++ b/src/mlir/libMLIR_h.jl @@ -11623,6 +11623,12 @@ function mlirGetDialectHandle__mosaic_gpu__() @ccall mlir_c.mlirGetDialectHandle__mosaic_gpu__()::MlirDialectHandle end +function mlirDialectRegistryInsertMosaicGpuInlinerExtensions(registry) + @ccall mlir_c.mlirDialectRegistryInsertMosaicGpuInlinerExtensions( + registry::MlirDialectRegistry + )::Cvoid +end + function enzymexlaLapackLayoutAttrGet(ctx, col_major) @ccall mlir_c.enzymexlaLapackLayoutAttrGet( ctx::MlirContext, col_major::UInt8