[MLIR][NVVM] Add TMA linear prefetch Op #141211
Merged: durga4github merged 1 commit into llvm:main from durga4github:durgadossr/mlir_tma_prefetch on May 26, 2025
Conversation
@llvm/pr-subscribers-mlir @llvm/pr-subscribers-mlir-llvm

Author: Durgadoss R (durga4github)

Changes: This patch adds an Op for the TMA prefetch (non-tensor) variant. llvm-lit tests are added to verify the lowering to the intrinsics. A short usage sketch follows the diff below.

Full diff: https://github.com/llvm/llvm-project/pull/141211.diff

3 Files Affected:
- mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
- mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp
- mlir/test/Target/LLVMIR/nvvm/tma_prefetch.mlir
diff --git a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
index 0c5c87cfe002f..c6c8f59db8c0d 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
@@ -2344,6 +2344,49 @@ def NVVM_PrefetchTensorMapOp : NVVM_Op<"prefetch.tensormap",
}];
}
+def NVVM_CpAsyncBulkPrefetchOp : NVVM_Op<"cp.async.bulk.prefetch"> {
+ let summary = "Async bulk prefetch from global memory to L2 cache";
+ let description = [{
+ Initiates an asynchronous prefetch of data from the location
+ specified by `srcMem` to the L2 cache.
+
+ The `l2CacheHint` operand is optional, and it is used to specify cache
+ eviction policy that may be used during the memory access.
+
+ Example:
+ ```mlir
+ nvvm.cp.async.bulk.prefetch %src, %size : !llvm.ptr<1>
+
+ // with l2_cache_hint
+ nvvm.cp.async.bulk.prefetch %src, %size l2_cache_hint = %ch : !llvm.ptr<1>
+ ```
+
+ [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-prefetch)
+ }];
+
+ let arguments = (ins
+ LLVM_PointerGlobal:$srcMem,
+ I32:$size,
+ Optional<I64>:$l2CacheHint);
+
+ let assemblyFormat = [{
+ $srcMem `,` $size (`l2_cache_hint` `=` $l2CacheHint^ )?
+ attr-dict `:` type($srcMem)
+ }];
+
+ let extraClassDeclaration = [{
+ static mlir::NVVM::IDArgPair
+ getIntrinsicIDAndArgs(Operation &op, LLVM::ModuleTranslation &mt,
+ llvm::IRBuilderBase& builder);
+ }];
+
+ string llvmBuilder = [{
+ auto [id, args] = NVVM::CpAsyncBulkPrefetchOp::getIntrinsicIDAndArgs(
+ *op, moduleTranslation, builder);
+ createIntrinsicCall(builder, id, args);
+ }];
+}
+
def NVVM_CpAsyncBulkTensorPrefetchOp :
NVVM_Op<"cp.async.bulk.tensor.prefetch", [AttrSizedOperandSegments]> {
let arguments = (ins
diff --git a/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp b/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp
index 9f55fe315106c..ad98dfc59e029 100644
--- a/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp
+++ b/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp
@@ -1254,6 +1254,26 @@ CpAsyncOp::getIntrinsicIDAndArgs(Operation &op, LLVM::ModuleTranslation &mt,
return id;
}
+mlir::NVVM::IDArgPair CpAsyncBulkPrefetchOp::getIntrinsicIDAndArgs(
+ Operation &op, LLVM::ModuleTranslation &mt, llvm::IRBuilderBase &builder) {
+ auto thisOp = cast<NVVM::CpAsyncBulkPrefetchOp>(op);
+ llvm::SmallVector<llvm::Value *> args;
+ llvm::Intrinsic::ID id = llvm::Intrinsic::nvvm_cp_async_bulk_prefetch_L2;
+
+ // Fill the Intrinsic Args
+ args.push_back(mt.lookupValue(thisOp.getSrcMem()));
+ args.push_back(mt.lookupValue(thisOp.getSize()));
+
+ mlir::Value cacheHint = thisOp.getL2CacheHint();
+ const bool hasCacheHint = static_cast<bool>(cacheHint);
+ llvm::Value *i64Unused =
+ llvm::ConstantInt::get(llvm::Type::getInt64Ty(mt.getLLVMContext()), 0);
+ args.push_back(hasCacheHint ? mt.lookupValue(cacheHint) : i64Unused);
+ args.push_back(builder.getInt1(hasCacheHint));
+
+ return {id, std::move(args)};
+}
+
mlir::NVVM::IDArgPair CpAsyncBulkSharedCTAToGlobalOp::getIntrinsicIDAndArgs(
Operation &op, LLVM::ModuleTranslation &mt, llvm::IRBuilderBase &builder) {
auto thisOp = cast<NVVM::CpAsyncBulkSharedCTAToGlobalOp>(op);
diff --git a/mlir/test/Target/LLVMIR/nvvm/tma_prefetch.mlir b/mlir/test/Target/LLVMIR/nvvm/tma_prefetch.mlir
index f1fa3b61f2dd9..bfd952636ffbe 100644
--- a/mlir/test/Target/LLVMIR/nvvm/tma_prefetch.mlir
+++ b/mlir/test/Target/LLVMIR/nvvm/tma_prefetch.mlir
@@ -1,5 +1,14 @@
// RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s
+// CHECK-LABEL: @tma_bulk_prefetch
+llvm.func @tma_bulk_prefetch(%src : !llvm.ptr<1>, %size : i32, %ch : i64) {
+ // CHECK: call void @llvm.nvvm.cp.async.bulk.prefetch.L2(ptr addrspace(1) %{{.*}}, i32 %{{.*}}, i64 0, i1 false)
+ // CHECK: call void @llvm.nvvm.cp.async.bulk.prefetch.L2(ptr addrspace(1) %{{.*}}, i32 %{{.*}}, i64 %{{.*}}, i1 true)
+ nvvm.cp.async.bulk.prefetch %src, %size : !llvm.ptr<1>
+ nvvm.cp.async.bulk.prefetch %src, %size l2_cache_hint = %ch : !llvm.ptr<1>
+ llvm.return
+}
+
// CHECK-LABEL: @tma_prefetch_1d
llvm.func @tma_prefetch_1d(%tma_desc : !llvm.ptr, %d0 : i32, %ch : i64) {
// CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.prefetch.tile.1d(ptr %0, i32 %{{.*}}, i64 0, i1 false)
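For orientation, here is a minimal usage sketch of the new op, based on the op description and the test in the diff above (function and operand names are illustrative). When the optional `l2_cache_hint` operand is absent, `getIntrinsicIDAndArgs` passes a placeholder `i64 0` plus an `i1 false` flag to the intrinsic; when present, the hint value and `i1 true` are passed.

```mlir
llvm.func @bulk_prefetch_sketch(%src : !llvm.ptr<1>, %size : i32, %ch : i64) {
  // No hint: lowers to
  //   call void @llvm.nvvm.cp.async.bulk.prefetch.L2(ptr addrspace(1) %src, i32 %size, i64 0, i1 false)
  nvvm.cp.async.bulk.prefetch %src, %size : !llvm.ptr<1>

  // With a hint: lowers to
  //   call void @llvm.nvvm.cp.async.bulk.prefetch.L2(ptr addrspace(1) %src, i32 %size, i64 %ch, i1 true)
  nvvm.cp.async.bulk.prefetch %src, %size l2_cache_hint = %ch : !llvm.ptr<1>
  llvm.return
}
```

The trailing `i1` flag simply tells the intrinsic whether the hint operand carries a meaningful value, matching what the builder constructs in NVVMDialect.cpp above.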
grypp reviewed on May 23, 2025
This patch adds an Op for the TMA prefetch (non-tensor) variant. Signed-off-by: Durgadoss R <durgadossr@nvidia.com>
490b526 to a25016f (compare)
grypp approved these changes on May 26, 2025
sivan-shani pushed a commit to sivan-shani/llvm-project that referenced this pull request on Jun 3, 2025: This patch adds an Op for the TMA prefetch (non-tensor) variant. llvm-lit tests are added to verify the lowering to the intrinsics. Signed-off-by: Durgadoss R <durgadossr@nvidia.com>
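As the commit message notes, the lowering is verified with llvm-lit via the usual mlir-translate + FileCheck pattern. A compact, self-contained sketch that mirrors the test added in mlir/test/Target/LLVMIR/nvvm/tma_prefetch.mlir (function name is illustrative):

```mlir
// RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s

// CHECK-LABEL: @bulk_prefetch_check
llvm.func @bulk_prefetch_check(%src : !llvm.ptr<1>, %size : i32, %ch : i64) {
  // CHECK: call void @llvm.nvvm.cp.async.bulk.prefetch.L2(ptr addrspace(1) %{{.*}}, i32 %{{.*}}, i64 0, i1 false)
  nvvm.cp.async.bulk.prefetch %src, %size : !llvm.ptr<1>
  // CHECK: call void @llvm.nvvm.cp.async.bulk.prefetch.L2(ptr addrspace(1) %{{.*}}, i32 %{{.*}}, i64 %{{.*}}, i1 true)
  nvvm.cp.async.bulk.prefetch %src, %size l2_cache_hint = %ch : !llvm.ptr<1>
  llvm.return
}
```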