Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[AMDGPU][NPM] Port SIInsertWaitcnts to NPM #130061

Merged
optimisan merged 2 commits into main from users/optimisan/preemit/port-si-insert-waitcnts
Mar 24, 2025

Conversation

optimisan
Copy link
Contributor

No description provided.

@llvmbot
Copy link
Member

llvmbot commented Mar 10, 2025

@llvm/pr-subscribers-backend-amdgpu

Author: Akshat Oke (optimisan)

Changes

Full diff: https://github.com/llvm/llvm-project/pull/130061.diff

7 Files Affected:

  • (modified) llvm/lib/Target/AMDGPU/AMDGPU.h (+8-1)
  • (modified) llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def (+1-1)
  • (modified) llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp (+2-2)
  • (modified) llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp (+61-30)
  • (modified) llvm/test/CodeGen/AMDGPU/call-waw-waitcnt.mir (+1)
  • (modified) llvm/test/CodeGen/AMDGPU/insert-waitcnts-hang.mir (+1)
  • (modified) llvm/test/CodeGen/AMDGPU/vccz-corrupt-bug-workaround.mir (+2)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index 4197a60e77014..4dcfaf9b12b5e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -371,6 +371,13 @@ class SIMemoryLegalizerPass : public PassInfoMixin<SIMemoryLegalizerPass> {
   static bool isRequired() { return true; }
 };
 
+class SIInsertWaitcntsPass : public PassInfoMixin<SIInsertWaitcntsPass> {
+public:
+  PreservedAnalyses run(MachineFunction &MF,
+                        MachineFunctionAnalysisManager &MFAM);
+  static bool isRequired() { return true; }
+};
+
 FunctionPass *createAMDGPUAnnotateUniformValuesLegacy();
 
 ModulePass *createAMDGPUPrintfRuntimeBinding();
@@ -447,7 +454,7 @@ extern char &AMDGPUInsertDelayAluID;
 void initializeSIInsertHardClausesPass(PassRegistry &);
 extern char &SIInsertHardClausesID;
 
-void initializeSIInsertWaitcntsPass(PassRegistry&);
+void initializeSIInsertWaitcntsLegacyPass(PassRegistry &);
 extern char &SIInsertWaitcntsID;
 
 void initializeSIFormMemoryClausesLegacyPass(PassRegistry &);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
index de959f8a2aa62..c4641cba60e53 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
@@ -109,6 +109,7 @@ MACHINE_FUNCTION_PASS("si-fix-vgpr-copies", SIFixVGPRCopiesPass())
 MACHINE_FUNCTION_PASS("si-fold-operands", SIFoldOperandsPass());
 MACHINE_FUNCTION_PASS("si-form-memory-clauses", SIFormMemoryClausesPass())
 MACHINE_FUNCTION_PASS("si-i1-copies", SILowerI1CopiesPass())
+MACHINE_FUNCTION_PASS("si-insert-waitcnts", SIInsertWaitcntsPass())
 MACHINE_FUNCTION_PASS("si-load-store-opt", SILoadStoreOptimizerPass())
 MACHINE_FUNCTION_PASS("si-lower-control-flow", SILowerControlFlowPass())
 MACHINE_FUNCTION_PASS("si-lower-sgpr-spills", SILowerSGPRSpillsPass())
@@ -131,7 +132,6 @@ DUMMY_MACHINE_FUNCTION_PASS("amdgpu-rewrite-partial-reg-uses", GCNRewritePartial
 DUMMY_MACHINE_FUNCTION_PASS("amdgpu-set-wave-priority", AMDGPUSetWavePriorityPass())
 
 DUMMY_MACHINE_FUNCTION_PASS("si-insert-hard-clauses", SIInsertHardClausesPass())
-DUMMY_MACHINE_FUNCTION_PASS("si-insert-waitcnts", SIInsertWaitcntsPass())
 DUMMY_MACHINE_FUNCTION_PASS("si-late-branch-lowering", SILateBranchLoweringPass())
 DUMMY_MACHINE_FUNCTION_PASS("si-pre-emit-peephole", SIPreEmitPeepholePass())
 // TODO: Move amdgpu-preload-kern-arg-prolog to MACHINE_FUNCTION_PASS since it
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index dbe212ad0a216..c3cc1dc6e495b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -535,7 +535,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
   initializeSIAnnotateControlFlowLegacyPass(*PR);
   initializeAMDGPUInsertDelayAluLegacyPass(*PR);
   initializeSIInsertHardClausesPass(*PR);
-  initializeSIInsertWaitcntsPass(*PR);
+  initializeSIInsertWaitcntsLegacyPass(*PR);
   initializeSIModeRegisterLegacyPass(*PR);
   initializeSIWholeQuadModeLegacyPass(*PR);
   initializeSILowerControlFlowLegacyPass(*PR);
@@ -2153,7 +2153,7 @@ void AMDGPUCodeGenPassBuilder::addPreEmitPass(AddMachinePass &addPass) const {
   }
 
   addPass(SIMemoryLegalizerPass());
-  // TODO: addPass(SIInsertWaitcntsPass());
+  addPass(SIInsertWaitcntsPass());
 
   // TODO: addPass(SIModeRegisterPass());
 
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index ee263f58bcaf2..8951a4144bd68 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -33,6 +33,7 @@
 #include "llvm/ADT/Sequence.h"
 #include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachinePassManager.h"
 #include "llvm/CodeGen/MachinePostDominators.h"
 #include "llvm/Support/DebugCounter.h"
 #include "llvm/TargetParser/TargetParser.h"
@@ -594,7 +595,7 @@ class WaitcntGeneratorGFX12Plus : public WaitcntGenerator {
   AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const override;
 };
 
-class SIInsertWaitcnts : public MachineFunctionPass {
+class SIInsertWaitcnts {
 private:
   const GCNSubtarget *ST = nullptr;
   const SIInstrInfo *TII = nullptr;
@@ -633,9 +634,9 @@ class SIInsertWaitcnts : public MachineFunctionPass {
   InstCounterType MaxCounter = NUM_NORMAL_INST_CNTS;
 
 public:
-  static char ID;
-
-  SIInsertWaitcnts() : MachineFunctionPass(ID) {
+  SIInsertWaitcnts(MachineLoopInfo *MLI, MachinePostDominatorTree *PDT,
+                   AliasAnalysis *AA)
+      : MLI(MLI), PDT(PDT), AA(AA) {
     (void)ForceExpCounter;
     (void)ForceLgkmCounter;
     (void)ForceVMCounter;
@@ -645,20 +646,7 @@ class SIInsertWaitcnts : public MachineFunctionPass {
   bool isPreheaderToFlush(MachineBasicBlock &MBB,
                           WaitcntBrackets &ScoreBrackets);
   bool isVMEMOrFlatVMEM(const MachineInstr &MI) const;
-  bool runOnMachineFunction(MachineFunction &MF) override;
-
-  StringRef getPassName() const override {
-    return "SI insert wait instructions";
-  }
-
-  void getAnalysisUsage(AnalysisUsage &AU) const override {
-    AU.setPreservesCFG();
-    AU.addRequired<MachineLoopInfoWrapperPass>();
-    AU.addRequired<MachinePostDominatorTreeWrapperPass>();
-    AU.addUsedIfAvailable<AAResultsWrapperPass>();
-    AU.addPreserved<AAResultsWrapperPass>();
-    MachineFunctionPass::getAnalysisUsage(AU);
-  }
+  bool run(MachineFunction &MF);
 
   bool isForceEmitWaitcnt() const {
     for (auto T : inst_counter_types())
@@ -742,6 +730,36 @@ class SIInsertWaitcnts : public MachineFunctionPass {
                             WaitcntBrackets &ScoreBrackets);
 };
 
+class SIInsertWaitcntsLegacy : public MachineFunctionPass {
+public:
+  static char ID;
+  SIInsertWaitcntsLegacy() : MachineFunctionPass(ID) {}
+
+  bool runOnMachineFunction(MachineFunction &MF) override {
+    auto *MLI = &getAnalysis<MachineLoopInfoWrapperPass>().getLI();
+    auto *PDT =
+        &getAnalysis<MachinePostDominatorTreeWrapperPass>().getPostDomTree();
+    AliasAnalysis *AA = nullptr;
+    if (auto *AAR = getAnalysisIfAvailable<AAResultsWrapperPass>())
+      AA = &AAR->getAAResults();
+
+    return SIInsertWaitcnts(MLI, PDT, AA).run(MF);
+  }
+
+  StringRef getPassName() const override {
+    return "SI insert wait instructions";
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    AU.addRequired<MachineLoopInfoWrapperPass>();
+    AU.addRequired<MachinePostDominatorTreeWrapperPass>();
+    AU.addUsedIfAvailable<AAResultsWrapperPass>();
+    AU.addPreserved<AAResultsWrapperPass>();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+};
+
 } // end anonymous namespace
 
 RegInterval WaitcntBrackets::getRegInterval(const MachineInstr *MI,
@@ -1124,19 +1142,19 @@ bool WaitcntBrackets::counterOutOfOrder(InstCounterType T) const {
   return hasMixedPendingEvents(T);
 }
 
-INITIALIZE_PASS_BEGIN(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,
-                      false)
+INITIALIZE_PASS_BEGIN(SIInsertWaitcntsLegacy, DEBUG_TYPE, "SI Insert Waitcnts",
+                      false, false)
 INITIALIZE_PASS_DEPENDENCY(MachineLoopInfoWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTreeWrapperPass)
-INITIALIZE_PASS_END(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,
-                    false)
+INITIALIZE_PASS_END(SIInsertWaitcntsLegacy, DEBUG_TYPE, "SI Insert Waitcnts",
+                    false, false)
 
-char SIInsertWaitcnts::ID = 0;
+char SIInsertWaitcntsLegacy::ID = 0;
 
-char &llvm::SIInsertWaitcntsID = SIInsertWaitcnts::ID;
+char &llvm::SIInsertWaitcntsID = SIInsertWaitcntsLegacy::ID;
 
 FunctionPass *llvm::createSIInsertWaitcntsPass() {
-  return new SIInsertWaitcnts();
+  return new SIInsertWaitcntsLegacy();
 }
 
 static bool updateOperandIfDifferent(MachineInstr &MI, AMDGPU::OpName OpName,
@@ -2406,16 +2424,29 @@ bool SIInsertWaitcnts::shouldFlushVmCnt(MachineLoop *ML,
   return HasVMemLoad && UsesVgprLoadedOutside && ST->hasVmemWriteVgprInOrder();
 }
 
-bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
+PreservedAnalyses
+SIInsertWaitcntsPass::run(MachineFunction &MF,
+                          MachineFunctionAnalysisManager &MFAM) {
+  auto *MLI = &MFAM.getResult<MachineLoopAnalysis>(MF);
+  auto *PDT = &MFAM.getResult<MachinePostDominatorTreeAnalysis>(MF);
+  auto *AA = MFAM.getResult<FunctionAnalysisManagerMachineFunctionProxy>(MF)
+                 .getManager()
+                 .getCachedResult<AAManager>(MF.getFunction());
+
+  if (!SIInsertWaitcnts(MLI, PDT, AA).run(MF))
+    return PreservedAnalyses::all();
+
+  return getMachineFunctionPassPreservedAnalyses()
+      .preserveSet<CFGAnalyses>()
+      .preserve<AAManager>();
+}
+
+bool SIInsertWaitcnts::run(MachineFunction &MF) {
   ST = &MF.getSubtarget<GCNSubtarget>();
   TII = ST->getInstrInfo();
   TRI = &TII->getRegisterInfo();
   MRI = &MF.getRegInfo();
   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
-  MLI = &getAnalysis<MachineLoopInfoWrapperPass>().getLI();
-  PDT = &getAnalysis<MachinePostDominatorTreeWrapperPass>().getPostDomTree();
-  if (auto *AAR = getAnalysisIfAvailable<AAResultsWrapperPass>())
-    AA = &AAR->getAAResults();
 
   AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(ST->getCPU());
 
diff --git a/llvm/test/CodeGen/AMDGPU/call-waw-waitcnt.mir b/llvm/test/CodeGen/AMDGPU/call-waw-waitcnt.mir
index b6dc75db3edc1..0456d5cc463f1 100644
--- a/llvm/test/CodeGen/AMDGPU/call-waw-waitcnt.mir
+++ b/llvm/test/CodeGen/AMDGPU/call-waw-waitcnt.mir
@@ -1,5 +1,6 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
 # RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs -run-pass=si-insert-waitcnts %s -o - | FileCheck -check-prefix=GCN %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs -passes=si-insert-waitcnts %s -o - | FileCheck -check-prefix=GCN %s
 
 # $sgpr30_sgpr31 will hold the return address. We need a waitcnt before SI_CALL so
 # that the return address is not clobbered in the callee by the outstanding load.
diff --git a/llvm/test/CodeGen/AMDGPU/insert-waitcnts-hang.mir b/llvm/test/CodeGen/AMDGPU/insert-waitcnts-hang.mir
index 28d79efc00b0d..2834ca5fa6858 100644
--- a/llvm/test/CodeGen/AMDGPU/insert-waitcnts-hang.mir
+++ b/llvm/test/CodeGen/AMDGPU/insert-waitcnts-hang.mir
@@ -1,5 +1,6 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4
 # RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -run-pass si-insert-waitcnts %s -o - | FileCheck %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -passes si-insert-waitcnts %s -o - | FileCheck %s
 
 ---
 name: test
diff --git a/llvm/test/CodeGen/AMDGPU/vccz-corrupt-bug-workaround.mir b/llvm/test/CodeGen/AMDGPU/vccz-corrupt-bug-workaround.mir
index 17e3d93ed393b..f5321591a3c88 100644
--- a/llvm/test/CodeGen/AMDGPU/vccz-corrupt-bug-workaround.mir
+++ b/llvm/test/CodeGen/AMDGPU/vccz-corrupt-bug-workaround.mir
@@ -2,6 +2,8 @@
 # RUN: llc -run-pass=si-insert-waitcnts -mtriple=amdgcn -mcpu=gfx900 -o - %s | FileCheck %s -check-prefixes=CHECK,GFX9
 # RUN: llc -run-pass=si-insert-waitcnts -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -o - %s | FileCheck %s
 # RUN: llc -run-pass=si-insert-waitcnts -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -o - %s | FileCheck %s
+
+# RUN: llc -passes=si-insert-waitcnts -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -o - %s | FileCheck %s
 ---
 # CHECK-LABEL: name: vccz_corrupt_workaround
 # CHECK: $vcc = V_CMP_EQ_F32

@optimisan optimisan force-pushed the users/optimisan/preemit/port-si-insert-waitcnts branch from 11b7833 to 5f9af25 Compare March 10, 2025 05:21
@optimisan optimisan force-pushed the users/optimisan/preemit/port-si-memory-legalizer branch from 861804a to f80cce0 Compare March 11, 2025 09:21
@optimisan optimisan force-pushed the users/optimisan/preemit/port-si-insert-waitcnts branch from 5f9af25 to ee1cee2 Compare March 11, 2025 09:28
@optimisan optimisan force-pushed the users/optimisan/preemit/port-si-memory-legalizer branch from f80cce0 to b61adcc Compare March 12, 2025 06:06
Base automatically changed from users/optimisan/preemit/port-si-memory-legalizer to main March 12, 2025 09:00
@optimisan optimisan force-pushed the users/optimisan/preemit/port-si-insert-waitcnts branch from ee1cee2 to caa13cc Compare March 24, 2025 09:46
@optimisan optimisan merged commit f10dc76 into main Mar 24, 2025
11 checks passed
@optimisan optimisan deleted the users/optimisan/preemit/port-si-insert-waitcnts branch March 24, 2025 16:06
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Projects
None yet
Development

Successfully merging this pull request may close these issues.

4 participants