diff --git a/src/mlir/Dialects/EnzymeXLA.jl b/src/mlir/Dialects/EnzymeXLA.jl
index 4de6d8a13f..b02327cbdf 100755
--- a/src/mlir/Dialects/EnzymeXLA.jl
+++ b/src/mlir/Dialects/EnzymeXLA.jl
@@ -902,11 +902,11 @@ function subindex(source::Value, index::Value; result::IR.Type, location=Locatio
 end
 
 """
-`lapack_symm`
+`blas_symm`
 
 C := alpha*A*B + beta*C, or C := alpha*B*A + beta*C, where alpha and beta are scalars,  A is a symmetric matrix\"
 """
-function lapack_symm(
+function blas_symm(
     A::Value,
     B::Value,
     C::Value,
@@ -924,7 +924,7 @@ function lapack_symm(
     attributes = NamedAttribute[namedattribute("side", side), namedattribute("uplo", uplo)]
 
     return create_operation(
-        "enzymexla.lapack.symm",
+        "enzymexla.blas.symm",
         location;
         operands,
         owned_regions,
diff --git a/src/mlir/Dialects/MosaicGPU.jl b/src/mlir/Dialects/MosaicGPU.jl
index 23559557d0..c2bbd49cb7 100644
--- a/src/mlir/Dialects/MosaicGPU.jl
+++ b/src/mlir/Dialects/MosaicGPU.jl
@@ -32,6 +32,25 @@ function arrive_expect_tx(barrier::Value; expect_tx, location=Location())
     )
 end
 
+function arrive(barrier::Value; orders_tensor_core, location=Location())
+    op_ty_results = IR.Type[]
+    operands = Value[barrier,]
+    owned_regions = Region[]
+    successors = Block[]
+    attributes = NamedAttribute[namedattribute("orders_tensor_core", orders_tensor_core),]
+
+    return create_operation(
+        "mosaic_gpu.arrive",
+        location;
+        operands,
+        owned_regions,
+        successors,
+        attributes,
+        results=op_ty_results,
+        result_inference=false,
+    )
+end
+
 """
 `async_load`
 
@@ -280,6 +299,31 @@ function broadcast_in_dim(
     )
 end
 
+"""
+`broadcasted_iota`
+
+Creates an array that has the specified shape and holds values starting at
+zero and incrementing by one along the specified dimension.
+"""
+function broadcasted_iota(; result_0::IR.Type, dimension, location=Location())
+    op_ty_results = IR.Type[result_0,]
+    operands = Value[]
+    owned_regions = Region[]
+    successors = Block[]
+    attributes = NamedAttribute[namedattribute("dimension", dimension),]
+
+    return create_operation(
+        "mosaic_gpu.broadcasted_iota",
+        location;
+        operands,
+        owned_regions,
+        successors,
+        attributes,
+        results=op_ty_results,
+        result_inference=false,
+    )
+end
+
 """
 `custom_primitive`
 
@@ -423,6 +467,25 @@ function optimization_barrier(
     )
 end
 
+function print_layout(value::Value; format, location=Location())
+    op_ty_results = IR.Type[]
+    operands = Value[value,]
+    owned_regions = Region[]
+    successors = Block[]
+    attributes = NamedAttribute[namedattribute("format", format),]
+
+    return create_operation(
+        "mosaic_gpu.print_layout",
+        location;
+        operands,
+        owned_regions,
+        successors,
+        attributes,
+        results=op_ty_results,
+        result_inference=false,
+    )
+end
+
 """
 `return_`
 
diff --git a/src/mlir/Dialects/Nvvm.jl b/src/mlir/Dialects/Nvvm.jl
index 536aca3100..1db4e7e1fa 100755
--- a/src/mlir/Dialects/Nvvm.jl
+++ b/src/mlir/Dialects/Nvvm.jl
@@ -1853,8 +1853,10 @@ end
 The `cp.async.mbarrier.arrive` Op makes the *mbarrier object* track
 all prior cp.async operations initiated by the executing thread.
 The `addr` operand specifies the address of the *mbarrier object*
-in generic address space. The `noinc` attr impacts how the
-mbarrier\'s state is updated.
+in generic or shared::cta address space. When it is generic, the
+underlying memory should fall within the shared::cta space;
+otherwise the behavior is undefined. The `noinc` attr impacts
+how the mbarrier\'s state is updated.
 
 [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-cp-async-mbarrier-arrive)
 """
@@ -1878,37 +1880,6 @@ function cp_async_mbarrier_arrive(addr::Value; noinc=nothing, location=Location(
     )
 end
 
-"""
-`cp_async_mbarrier_arrive_shared`
-
-The `cp.async.mbarrier.arrive.shared` Op makes the *mbarrier object*
-track all prior cp.async operations initiated by the executing thread.
-The `addr` operand specifies the address of the *mbarrier object* in
-shared memory. The `noinc` attr impacts how the mbarrier\'s state
-is updated. 
-
-[For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-cp-async-mbarrier-arrive)
-"""
-function cp_async_mbarrier_arrive_shared(addr::Value; noinc=nothing, location=Location())
-    op_ty_results = IR.Type[]
-    operands = Value[addr,]
-    owned_regions = Region[]
-    successors = Block[]
-    attributes = NamedAttribute[]
-    !isnothing(noinc) && push!(attributes, namedattribute("noinc", noinc))
-
-    return create_operation(
-        "nvvm.cp.async.mbarrier.arrive.shared",
-        location;
-        operands,
-        owned_regions,
-        successors,
-        attributes,
-        results=op_ty_results,
-        result_inference=false,
-    )
-end
-
 function cp_async_shared_global(
     dst::Value,
     src::Value,
@@ -3334,8 +3305,10 @@ a result of this operation. The operation returns an opaque value that
 captures the phase of the *mbarrier object* prior to the arrive-on operation.
 
 The operation takes the following operands:
-- `addr`: A pointer to the memory location of the *mbarrier object*. Uses generic 
-  addressing, but the address must still be in the shared memory space.
+- `addr`: A pointer to the memory location of the *mbarrier object*. The `addr`
+  must be a pointer to generic or shared::cta memory. When it is generic, the
+  underlying address must be within the shared::cta memory space; otherwise
+  the behavior is undefined.
 - `count`: Integer specifying the count argument to the arrive-on operation. 
   Must be in the valid range as specified in the *mbarrier object* contents.
 
@@ -3362,35 +3335,6 @@ function mbarrier_arrive_nocomplete(
     )
 end
 
-"""
-`mbarrier_arrive_nocomplete_shared`
-
-This Op is the same as `nvvm.mbarrier.arrive.nocomplete` except that the *mbarrier object*
-should be accessed using a shared-memory pointer instead of a generic-memory pointer.
-
-[For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-arrive)
-"""
-function mbarrier_arrive_nocomplete_shared(
-    addr::Value, count::Value; res::IR.Type, location=Location()
-)
-    op_ty_results = IR.Type[res,]
-    operands = Value[addr, count]
-    owned_regions = Region[]
-    successors = Block[]
-    attributes = NamedAttribute[]
-
-    return create_operation(
-        "nvvm.mbarrier.arrive.nocomplete.shared",
-        location;
-        operands,
-        owned_regions,
-        successors,
-        attributes,
-        results=op_ty_results,
-        result_inference=false,
-    )
-end
-
 """
 `mbarrier_arrive`
 
@@ -3408,8 +3352,10 @@ The operation returns an opaque value that captures the phase of the
 value are implementation-specific.
 
 The operation takes the following operand:
-- `addr`: A pointer to the memory location of the *mbarrier object*. Uses generic 
-  addressing, but the address must still be in the shared memory space.
+- `addr`: A pointer to the memory location of the *mbarrier object*. The `addr`
+  must be a pointer to generic or shared::cta memory. When it is generic, the
+  underlying address must be within the shared::cta memory space; otherwise
+  the behavior is undefined.
 
 [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-arrive)
 """
@@ -3432,33 +3378,6 @@ function mbarrier_arrive(addr::Value; res::IR.Type, location=Location())
     )
 end
 
-"""
-`mbarrier_arrive_shared`
-
-This Op is the same as `nvvm.mbarrier.arrive` except that the *mbarrier object*
-should be accessed using a shared-memory pointer instead of a generic-memory pointer.
-
-[For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-arrive)
-"""
-function mbarrier_arrive_shared(addr::Value; res::IR.Type, location=Location())
-    op_ty_results = IR.Type[res,]
-    operands = Value[addr,]
-    owned_regions = Region[]
-    successors = Block[]
-    attributes = NamedAttribute[]
-
-    return create_operation(
-        "nvvm.mbarrier.arrive.shared",
-        location;
-        operands,
-        owned_regions,
-        successors,
-        attributes,
-        results=op_ty_results,
-        result_inference=false,
-    )
-end
-
 """
 `mbarrier_init`
 
@@ -3607,35 +3526,6 @@ function mbarrier_test_wait(addr::Value, state::Value; res::IR.Type, location=Lo
     )
 end
 
-"""
-`mbarrier_test_wait_shared`
-
-This Op is the same as `nvvm.mbarrier.test.wait` except that the *mbarrier object*
-should be accessed using a shared-memory pointer instead of a generic-memory pointer.
-
-[For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/#parallel-synchronization-and-communication-instructions-mbarrier-test-wait-try-wait)
-"""
-function mbarrier_test_wait_shared(
-    addr::Value, state::Value; res::IR.Type, location=Location()
-)
-    op_ty_results = IR.Type[res,]
-    operands = Value[addr, state]
-    owned_regions = Region[]
-    successors = Block[]
-    attributes = NamedAttribute[]
-
-    return create_operation(
-        "nvvm.mbarrier.test.wait.shared",
-        location;
-        operands,
-        owned_regions,
-        successors,
-        attributes,
-        results=op_ty_results,
-        result_inference=false,
-    )
-end
-
 """
 `mbarrier_try_wait_parity`
 
@@ -3793,6 +3683,34 @@ function match_sync(thread_mask::Value, val::Value; res::IR.Type, kind, location
     )
 end
 
+"""
+`memory_barrier`
+
+`membar` operation guarantees that prior memory accesses requested by this
+thread are performed at the specified `scope`, before later memory
+operations requested by this thread following the membar instruction.
+
+[For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/#parallel-synchronization-and-communication-instructions-membar)
+"""
+function memory_barrier(; scope, location=Location())
+    op_ty_results = IR.Type[]
+    operands = Value[]
+    owned_regions = Region[]
+    successors = Block[]
+    attributes = NamedAttribute[namedattribute("scope", scope),]
+
+    return create_operation(
+        "nvvm.memory.barrier",
+        location;
+        operands,
+        owned_regions,
+        successors,
+        attributes,
+        results=op_ty_results,
+        result_inference=false,
+    )
+end
+
 """
 `mma_sync`
 
diff --git a/src/mlir/Dialects/TPU.jl b/src/mlir/Dialects/TPU.jl
index f0ca75c0eb..20f9c1533d 100755
--- a/src/mlir/Dialects/TPU.jl
+++ b/src/mlir/Dialects/TPU.jl
@@ -476,12 +476,15 @@ function enqueue_indirect_dma(
     )
 end
 
-function erase_memref_layout(operand::Value; result::IR.Type, location=Location())
-    op_ty_results = IR.Type[result,]
+function erase_memref_layout(
+    operand::Value; result=nothing::Union{Nothing,IR.Type}, location=Location()
+)
+    op_ty_results = IR.Type[]
     operands = Value[operand,]
     owned_regions = Region[]
     successors = Block[]
     attributes = NamedAttribute[]
+    !isnothing(result) && push!(op_ty_results, result)
 
     return create_operation(
         "tpu.erase_memref_layout",
@@ -490,8 +493,8 @@ function erase_memref_layout(operand::Value; result::IR.Type, location=Location(
         owned_regions,
         successors,
         attributes,
-        results=op_ty_results,
-        result_inference=false,
+        results=(length(op_ty_results) == 0 ? nothing : op_ty_results),
+        result_inference=(length(op_ty_results) == 0 ? true : false),
     )
 end
 
diff --git a/src/mlir/libMLIR_h.jl b/src/mlir/libMLIR_h.jl
index 33c37243a7..fa656d4f07 100755
--- a/src/mlir/libMLIR_h.jl
+++ b/src/mlir/libMLIR_h.jl
@@ -11623,6 +11623,12 @@ function mlirGetDialectHandle__mosaic_gpu__()
     @ccall mlir_c.mlirGetDialectHandle__mosaic_gpu__()::MlirDialectHandle
 end
 
+function mlirDialectRegistryInsertMosaicGpuInlinerExtensions(registry)
+    @ccall mlir_c.mlirDialectRegistryInsertMosaicGpuInlinerExtensions(
+        registry::MlirDialectRegistry
+    )::Cvoid
+end
+
 function enzymexlaLapackLayoutAttrGet(ctx, col_major)
     @ccall mlir_c.enzymexlaLapackLayoutAttrGet(
         ctx::MlirContext, col_major::UInt8