6 changes: 3 additions & 3 deletions src/mlir/Dialects/EnzymeXLA.jl
@@ -902,11 +902,11 @@ function subindex(source::Value, index::Value; result::IR.Type, location=Locatio
end

"""
- `lapack_symm`
+ `blas_symm`

C := alpha*A*B + beta*C, or C := alpha*B*A + beta*C, where alpha and beta are scalars, A is a symmetric matrix, and B and C are m-by-n matrices.
"""
- function lapack_symm(
+ function blas_symm(
A::Value,
B::Value,
C::Value,
@@ -924,7 +924,7 @@ function lapack_symm(
attributes = NamedAttribute[namedattribute("side", side), namedattribute("uplo", uplo)]

return create_operation(
"enzymexla.lapack.symm",
"enzymexla.blas.symm",
location;
operands,
owned_regions,
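The rename from `lapack_symm` to `blas_symm` is apt: SYMM is a BLAS level-3 routine, not a LAPACK one, and only the op name changes here, not its semantics. As a reference for those semantics, here is a minimal pure-Julia sketch of what SYMM computes, with plain arrays and no MLIR involved; the values of `alpha`, `beta`, `A`, `B`, and `C` are illustrative:

```julia
using LinearAlgebra

# SYMM with side = left: C := alpha*A*B + beta*C, where A is symmetric.
alpha, beta = 2.0, 0.5
A = Symmetric([1.0 2.0; 2.0 3.0])  # only one triangle is actually referenced
B = [1.0 0.0; 0.0 1.0]
C = ones(2, 2)
C = alpha * A * B + beta * C       # side = right would compute alpha*B*A + beta*C
```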
63 changes: 63 additions & 0 deletions src/mlir/Dialects/MosaicGPU.jl
@@ -32,6 +32,25 @@ function arrive_expect_tx(barrier::Value; expect_tx, location=Location())
)
end

+ function arrive(barrier::Value; orders_tensor_core, location=Location())
+ op_ty_results = IR.Type[]
+ operands = Value[barrier,]
+ owned_regions = Region[]
+ successors = Block[]
+ attributes = NamedAttribute[namedattribute("orders_tensor_core", orders_tensor_core),]
+
+ return create_operation(
+ "mosaic_gpu.arrive",
+ location;
+ operands,
+ owned_regions,
+ successors,
+ attributes,
+ results=op_ty_results,
+ result_inference=false,
+ )
+ end

"""
`async_load`

@@ -280,6 +299,31 @@ function broadcast_in_dim(
)
end

"""
`broadcasted_iota`

Creates an array that has the specified shape and holds values starting at
zero and incrementing by one along the specified dimension.
"""
function broadcasted_iota(; result_0::IR.Type, dimension, location=Location())
op_ty_results = IR.Type[result_0,]
operands = Value[]
owned_regions = Region[]
successors = Block[]
attributes = NamedAttribute[namedattribute("dimension", dimension),]

return create_operation(
"mosaic_gpu.broadcasted_iota",
location;
operands,
owned_regions,
successors,
attributes,
results=op_ty_results,
result_inference=false,
)
end
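Concretely, every element of the result equals its zero-based coordinate along `dimension`, broadcast across all other dimensions. A pure-Julia sketch of the same semantics, where the shape and dimension values are illustrative:

```julia
# Emulates the result of mosaic_gpu.broadcasted_iota on plain Julia arrays:
# each element equals its zero-based index along dimension `dim`.
iota(shape::Dims, dim::Int) = [Tuple(I)[dim] - 1 for I in CartesianIndices(shape)]

iota((2, 3), 2)
# 2×3 Matrix{Int64}:
#  0  1  2
#  0  1  2
```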

"""
`custom_primitive`

@@ -423,6 +467,25 @@ function optimization_barrier(
)
end

+ function print_layout(value::Value; format, location=Location())
+ op_ty_results = IR.Type[]
+ operands = Value[value,]
+ owned_regions = Region[]
+ successors = Block[]
+ attributes = NamedAttribute[namedattribute("format", format),]
+
+ return create_operation(
+ "mosaic_gpu.print_layout",
+ location;
+ operands,
+ owned_regions,
+ successors,
+ attributes,
+ results=op_ty_results,
+ result_inference=false,
+ )
+ end

"""
`return_`

162 changes: 40 additions & 122 deletions src/mlir/Dialects/Nvvm.jl
@@ -1853,8 +1853,10 @@ end
The `cp.async.mbarrier.arrive` Op makes the *mbarrier object* track
all prior cp.async operations initiated by the executing thread.
The `addr` operand specifies the address of the *mbarrier object*
- in generic address space. The `noinc` attr impacts how the
- mbarrier\'s state is updated.
+ in generic or shared::cta address space. When it is generic, the
+ underlying memory should fall within the shared::cta space;
+ otherwise the behavior is undefined. The `noinc` attr impacts
+ how the mbarrier\'s state is updated.

[For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-cp-async-mbarrier-arrive)
"""
@@ -1878,37 +1880,6 @@ function cp_async_mbarrier_arrive(addr::Value; noinc=nothing, location=Location(
)
end

"""
`cp_async_mbarrier_arrive_shared`

The `cp.async.mbarrier.arrive.shared` Op makes the *mbarrier object*
track all prior cp.async operations initiated by the executing thread.
The `addr` operand specifies the address of the *mbarrier object* in
shared memory. The `noinc` attr impacts how the mbarrier\'s state
is updated.

[For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-cp-async-mbarrier-arrive)
"""
function cp_async_mbarrier_arrive_shared(addr::Value; noinc=nothing, location=Location())
op_ty_results = IR.Type[]
operands = Value[addr,]
owned_regions = Region[]
successors = Block[]
attributes = NamedAttribute[]
!isnothing(noinc) && push!(attributes, namedattribute("noinc", noinc))

return create_operation(
"nvvm.cp.async.mbarrier.arrive.shared",
location;
operands,
owned_regions,
successors,
attributes,
results=op_ty_results,
result_inference=false,
)
end

function cp_async_shared_global(
dst::Value,
src::Value,
@@ -3334,8 +3305,10 @@ a result of this operation. The operation returns an opaque value that
captures the phase of the *mbarrier object* prior to the arrive-on operation.

The operation takes the following operands:
- - `addr`: A pointer to the memory location of the *mbarrier object*. Uses generic
- addressing, but the address must still be in the shared memory space.
+ - `addr`: A pointer to the memory location of the *mbarrier object*. The `addr`
+ must be a pointer to generic or shared::cta memory. When it is generic, the
+ underlying address must be within the shared::cta memory space; otherwise
+ the behavior is undefined.
- `count`: Integer specifying the count argument to the arrive-on operation.
Must be in the valid range as specified in the *mbarrier object* contents.

@@ -3362,35 +3335,6 @@ function mbarrier_arrive_nocomplete(
)
end

"""
`mbarrier_arrive_nocomplete_shared`

This Op is the same as `nvvm.mbarrier.arrive.nocomplete` except that the *mbarrier object*
should be accessed using a shared-memory pointer instead of a generic-memory pointer.

[For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-arrive)
"""
function mbarrier_arrive_nocomplete_shared(
addr::Value, count::Value; res::IR.Type, location=Location()
)
op_ty_results = IR.Type[res,]
operands = Value[addr, count]
owned_regions = Region[]
successors = Block[]
attributes = NamedAttribute[]

return create_operation(
"nvvm.mbarrier.arrive.nocomplete.shared",
location;
operands,
owned_regions,
successors,
attributes,
results=op_ty_results,
result_inference=false,
)
end

"""
`mbarrier_arrive`

@@ -3408,8 +3352,10 @@ The operation returns an opaque value that captures the phase of the
value are implementation-specific.

The operation takes the following operand:
- - `addr`: A pointer to the memory location of the *mbarrier object*. Uses generic
- addressing, but the address must still be in the shared memory space.
+ - `addr`: A pointer to the memory location of the *mbarrier object*. The `addr`
+ must be a pointer to generic or shared::cta memory. When it is generic, the
+ underlying address must be within the shared::cta memory space; otherwise
+ the behavior is undefined.

[For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-arrive)
"""
@@ -3432,33 +3378,6 @@ function mbarrier_arrive(addr::Value; res::IR.Type, location=Location())
)
end

"""
`mbarrier_arrive_shared`

This Op is the same as `nvvm.mbarrier.arrive` except that the *mbarrier object*
should be accessed using a shared-memory pointer instead of a generic-memory pointer.

[For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-arrive)
"""
function mbarrier_arrive_shared(addr::Value; res::IR.Type, location=Location())
op_ty_results = IR.Type[res,]
operands = Value[addr,]
owned_regions = Region[]
successors = Block[]
attributes = NamedAttribute[]

return create_operation(
"nvvm.mbarrier.arrive.shared",
location;
operands,
owned_regions,
successors,
attributes,
results=op_ty_results,
result_inference=false,
)
end

"""
`mbarrier_init`

@@ -3607,35 +3526,6 @@ function mbarrier_test_wait(addr::Value, state::Value; res::IR.Type, location=Lo
)
end

"""
`mbarrier_test_wait_shared`

This Op is the same as `nvvm.mbarrier.test.wait` except that the *mbarrier object*
should be accessed using a shared-memory pointer instead of a generic-memory pointer.

[For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/#parallel-synchronization-and-communication-instructions-mbarrier-test-wait-try-wait)
"""
function mbarrier_test_wait_shared(
addr::Value, state::Value; res::IR.Type, location=Location()
)
op_ty_results = IR.Type[res,]
operands = Value[addr, state]
owned_regions = Region[]
successors = Block[]
attributes = NamedAttribute[]

return create_operation(
"nvvm.mbarrier.test.wait.shared",
location;
operands,
owned_regions,
successors,
attributes,
results=op_ty_results,
result_inference=false,
)
end

"""
`mbarrier_try_wait_parity`

@@ -3793,6 +3683,34 @@ function match_sync(thread_mask::Value, val::Value; res::IR.Type, kind, location
)
end

"""
`memory_barrier`

`membar` operation guarantees that prior memory accesses requested by this
thread are performed at the specified `scope`, before later memory
operations requested by this thread following the membar instruction.

[For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/#parallel-synchronization-and-communication-instructions-membar)
"""
function memory_barrier(; scope, location=Location())
op_ty_results = IR.Type[]
operands = Value[]
owned_regions = Region[]
successors = Block[]
attributes = NamedAttribute[namedattribute("scope", scope),]

return create_operation(
"nvvm.memory.barrier",
location;
operands,
owned_regions,
successors,
attributes,
results=op_ty_results,
result_inference=false,
)
end

"""
`mma_sync`

11 changes: 7 additions & 4 deletions src/mlir/Dialects/TPU.jl
@@ -476,12 +476,15 @@ function enqueue_indirect_dma(
)
end

- function erase_memref_layout(operand::Value; result::IR.Type, location=Location())
- op_ty_results = IR.Type[result,]
+ function erase_memref_layout(
+ operand::Value; result=nothing::Union{Nothing,IR.Type}, location=Location()
+ )
+ op_ty_results = IR.Type[]
operands = Value[operand,]
owned_regions = Region[]
successors = Block[]
attributes = NamedAttribute[]
+ !isnothing(result) && push!(op_ty_results, result)

return create_operation(
"tpu.erase_memref_layout",
@@ -490,8 +493,8 @@ function erase_memref_layout(operand::Value; result::IR.Type, location=Location(
owned_regions,
successors,
attributes,
- results=op_ty_results,
- result_inference=false,
+ results=(length(op_ty_results) == 0 ? nothing : op_ty_results),
+ result_inference=(length(op_ty_results) == 0 ? true : false),
)
end
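The practical effect of this change: `result` is now optional, and when it is omitted the op is created with `result_inference=true`, so MLIR infers the result type; passing an explicit type keeps the old behavior. A sketch of the two call forms, where `operand` (a `Value`) and `unlaid_ty` (a memref `IR.Type`) are hypothetical placeholders assumed to be in scope, so this is not runnable on its own:

```julia
# Old-style call: the caller supplies the result type explicitly.
v1 = erase_memref_layout(operand; result=unlaid_ty)

# New-style call: no result type given, so create_operation runs with
# result_inference=true and MLIR infers the type from the operand.
v2 = erase_memref_layout(operand)
```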

6 changes: 6 additions & 0 deletions src/mlir/libMLIR_h.jl
@@ -11623,6 +11623,12 @@ function mlirGetDialectHandle__mosaic_gpu__()
@ccall mlir_c.mlirGetDialectHandle__mosaic_gpu__()::MlirDialectHandle
end

+ function mlirDialectRegistryInsertMosaicGpuInlinerExtensions(registry)
+ @ccall mlir_c.mlirDialectRegistryInsertMosaicGpuInlinerExtensions(
+ registry::MlirDialectRegistry
+ )::Cvoid
+ end
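A hedged usage sketch: a dialect registry is typically created, populated with extensions, appended to a context, and then destroyed. This assumes the standard MLIR C-API entry points `mlirDialectRegistryCreate`, `mlirContextAppendDialectRegistry`, and `mlirDialectRegistryDestroy` are also bound in libMLIR_h.jl, and that `ctx` is an existing `MlirContext`:

```julia
registry = mlirDialectRegistryCreate()
mlirDialectRegistryInsertMosaicGpuInlinerExtensions(registry)  # the new binding
mlirContextAppendDialectRegistry(ctx, registry)  # ctx: an existing MlirContext
mlirDialectRegistryDestroy(registry)
```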

function enzymexlaLapackLayoutAttrGet(ctx, col_major)
@ccall mlir_c.enzymexlaLapackLayoutAttrGet(
ctx::MlirContext, col_major::UInt8
Expand Down