-
Notifications
You must be signed in to change notification settings - Fork 14k
[RISCV][InsertVSETVLI] Remove redundant vsetvli by coalescing blocks bottom-up #141298
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[RISCV][InsertVSETVLI] Remove redundant vsetvli by coalescing blocks bottom-up #141298
Conversation
@llvm/pr-subscribers-backend-risc-v Author: Min-Yih Hsu (mshockwave) Changes: I ran into a relatively rare case in RISCVInsertVSETVLIPass, where right after the `emitVSETVLI` phase but before the `coalesceVSETVLIs` phase, we have two blocks that look like this:
After the `coalesceVSETVLIs` phase, it turns into: bb.first:
- %46:gprnox0 = PseudoVSETIVLI %30:gprnox0, 199 /* e8, mf2, ta, ma */, implicit-def $vl, implicit-def $vtype
+ dead %46:gprnox0 = PseudoVSETIVLI %30:gprnox0, 199 /* e8, mf2, ta, ma */, implicit-def $vl, implicit-def $vtype
%76:gpr = PseudoVSETVLIX0 killed $x0, ..., implicit-def $vl, implicit-def $vtype
$v10m2 = PseudoVMV_V_I_M2 undef renamable $v10m2, 0, -1, 5 /* e32 */, 0 /* tu, mu */, implicit $vl, implicit $vtype
...
bb.second:
- $x0 = PseudoVSETVLI %46, 209 /* e32, m2, ta, ma */, implicit-def $vl, implicit-def $vtype
+ $x0 = PseudoVSETVLI %30, 209 /* e32, m2, ta, ma */, implicit-def $vl, implicit-def $vtype
$v10 = PseudoVMV_S_X undef $v10(tied-def 0), undef %53:gpr, $noreg, 5, implicit $vl, implicit $vtype
- $x0 = PseudoVSETVLI %30, 209 /* e32, m2, ta, ma */, implicit-def $vl, implicit-def $vtype
$v8 = PseudoVREDSUM_VS_M2_E32 undef $v8(tied-def 0), killed $v8m2, killed $v10, $noreg, 5, 0, implicit $vl, implicit $vtype We forwarded `%30` to any use of `%46` and further reduced the number of VSETVLI we need in `bb.second`. But the problem is, if `bb.first` is processed before `bb.second` -- which is the majority of the cases -- then we're not able to remove the vsetvli which defines the now-dead `%46` in `bb.first` after coalescing `bb.second`. This will produce assembly code like this:
This patch fixes this issue by revisiting the related blocks when we're dropping AVL from a vsetvli instruction. Full diff: https://github.com/llvm/llvm-project/pull/141298.diff 2 Files Affected:
diff --git a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
index 2d79ced1cc163..97a415a3b0c6f 100644
--- a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
@@ -26,6 +26,7 @@
#include "RISCV.h"
#include "RISCVSubtarget.h"
+#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/LiveDebugVariables.h"
#include "llvm/CodeGen/LiveIntervals.h"
@@ -895,7 +896,8 @@ class RISCVInsertVSETVLI : public MachineFunctionPass {
bool canMutatePriorConfig(const MachineInstr &PrevMI, const MachineInstr &MI,
const DemandedFields &Used) const;
- void coalesceVSETVLIs(MachineBasicBlock &MBB) const;
+ void coalesceVSETVLIs(SetVector<MachineBasicBlock *> &Worklist,
+ MachineBasicBlock &MBB) const;
VSETVLIInfo getInfoForVSETVLI(const MachineInstr &MI) const;
VSETVLIInfo computeInfoForInstr(const MachineInstr &MI) const;
@@ -1642,7 +1644,8 @@ bool RISCVInsertVSETVLI::canMutatePriorConfig(
return areCompatibleVTYPEs(PriorVType, VType, Used);
}
-void RISCVInsertVSETVLI::coalesceVSETVLIs(MachineBasicBlock &MBB) const {
+void RISCVInsertVSETVLI::coalesceVSETVLIs(
+ SetVector<MachineBasicBlock *> &Worklist, MachineBasicBlock &MBB) const {
MachineInstr *NextMI = nullptr;
// We can have arbitrary code in successors, so VL and VTYPE
// must be considered demanded.
@@ -1661,9 +1664,18 @@ void RISCVInsertVSETVLI::coalesceVSETVLIs(MachineBasicBlock &MBB) const {
LIS->shrinkToUses(&LIS->getInterval(OldVLReg));
MachineInstr *VLOpDef = MRI->getUniqueVRegDef(OldVLReg);
- if (VLOpDef && TII->isAddImmediate(*VLOpDef, OldVLReg) &&
- MRI->use_nodbg_empty(OldVLReg))
- ToDelete.push_back(VLOpDef);
+ if (VLOpDef && MRI->use_nodbg_empty(OldVLReg)) {
+ if (TII->isAddImmediate(*VLOpDef, OldVLReg))
+ ToDelete.push_back(VLOpDef);
+ // If the destination register of a vset* instruction becomes dead because
+ // of this, there might be a chance to eliminate it. Put into the worklist
+ // so that we can revisit it.
+ // Note that since this is a virtual register, the definition instruction
+ // is always placed earlier in the program order. Thus, we avoid
+ // enqueuing blocks in cycle and therefore guarantee to terminate.
+ if (RISCVInstrInfo::isVectorConfigInstr(*VLOpDef))
+ Worklist.insert(VLOpDef->getParent());
+ }
};
for (MachineInstr &MI : make_early_inc_range(reverse(MBB))) {
@@ -1840,8 +1852,14 @@ bool RISCVInsertVSETVLI::runOnMachineFunction(MachineFunction &MF) {
// any cross block analysis within the dataflow. We can't have both
// demanded fields based mutation and non-local analysis in the
// dataflow at the same time without introducing inconsistencies.
- for (MachineBasicBlock &MBB : MF)
- coalesceVSETVLIs(MBB);
+ using BBPtrIterator = pointer_iterator<MachineFunction::iterator>;
+ SetVector<MachineBasicBlock *> Worklist(BBPtrIterator(MF.begin()),
+ BBPtrIterator(MF.end()));
+ while (!Worklist.empty()) {
+ MachineBasicBlock *MBB = Worklist.front();
+ Worklist.erase(Worklist.begin());
+ coalesceVSETVLIs(Worklist, *MBB);
+ }
// Insert PseudoReadVL after VLEFF/VLSEGFF and replace it with the vl output
// of VLEFF/VLSEGFF.
diff --git a/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert-coalesce.mir b/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert-coalesce.mir
new file mode 100644
index 0000000000000..b4e98c8117f75
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert-coalesce.mir
@@ -0,0 +1,266 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=riscv64 -mattr=+v,+m,+b -O3 -start-after=riscv-isel -stop-after=riscv-insert-vsetvli %s -o - | FileCheck %s
+
+--- |
+ target datalayout = "e-m:e-p:64:64-i64:64-i128:128-n32:64-S128"
+ target triple = "riscv64-unknown-linux-gnu"
+
+ @g0 = external global ptr
+ @fooo.g1 = external constant [16 x ptr]
+
+ define fastcc i32 @fooo(i32 signext %s, i64 %conv63.i, i64 %idxprom144.i, <vscale x 4 x i1> %0, i1 %1, i32 %shr.i23, <vscale x 4 x i1> %2) {
+ ret i32 4
+ }
+
+...
+---
+name: fooo
+alignment: 2
+tracksRegLiveness: true
+isSSA: true
+body: |
+ ; CHECK-LABEL: name: fooo
+ ; CHECK: bb.0:
+ ; CHECK-NEXT: successors: %bb.1(0x80000000)
+ ; CHECK-NEXT: liveins: $v0, $x10, $x11, $x12, $x13, $x14
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: dead [[ADDIW:%[0-9]+]]:gpr = ADDIW undef %10:gpr, 0
+ ; CHECK-NEXT: [[PseudoVSETVLI:%[0-9]+]]:gprnox0 = PseudoVSETVLI undef %13:gprnox0, 195 /* e8, m8, ta, ma */, implicit-def $vl, implicit-def $vtype
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.1:
+ ; CHECK-NEXT: successors: %bb.1(0x7c000000), %bb.2(0x04000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: PseudoVSE8_V_M8 undef renamable $v8m8, undef %15:gpr, undef $noreg, 3 /* e8 */, implicit $vl, implicit $vtype :: (store unknown-size, align 1)
+ ; CHECK-NEXT: dead [[ADD:%[0-9]+]]:gpr = ADD undef %15:gpr, [[PseudoVSETVLI]]
+ ; CHECK-NEXT: BNE undef %14:gpr, $x0, %bb.1
+ ; CHECK-NEXT: PseudoBR %bb.2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.2:
+ ; CHECK-NEXT: successors: %bb.3(0x80000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: dead [[SLLI:%[0-9]+]]:gpr = SLLI undef %5:gpr, 2
+ ; CHECK-NEXT: dead [[SUB:%[0-9]+]]:gpr = SUB undef %18:gpr, undef [[SLLI]]
+ ; CHECK-NEXT: dead [[SH2ADD:%[0-9]+]]:gpr = SH2ADD undef %0:gpr, undef %18:gpr
+ ; CHECK-NEXT: dead [[OR:%[0-9]+]]:gpr = OR undef [[SH2ADD]], undef [[SUB]]
+ ; CHECK-NEXT: dead [[MULW:%[0-9]+]]:gpr = MULW undef [[OR]], undef %9:gpr
+ ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $x2, implicit $x2
+ ; CHECK-NEXT: $x10 = COPY undef [[ADDIW]]
+ ; CHECK-NEXT: PseudoCALLIndirect undef %3:gprjalr, csr_ilp32d_lp64d, implicit-def dead $x1, implicit $x10, implicit undef $x11, implicit undef $x13, implicit undef $x14, implicit undef $x15, implicit-def $x2, implicit-def $x10
+ ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $x2, implicit $x2
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr = COPY $x10
+ ; CHECK-NEXT: SW undef %3:gprjalr, undef [[SLLI]], 0 :: (store (s32))
+ ; CHECK-NEXT: dead [[PseudoCCMOVGPR:%[0-9]+]]:gprjalr = PseudoCCMOVGPR undef [[MULW]], undef [[COPY]], 1, undef [[PseudoCCMOVGPR]], undef %7:gpr
+ ; CHECK-NEXT: dead [[PseudoCCMOVGPR1:%[0-9]+]]:gprjalr = PseudoCCMOVGPR undef [[MULW]], [[COPY]], 1, undef [[PseudoCCMOVGPR1]], undef %9:gpr
+ ; CHECK-NEXT: dead [[ADD_UW:%[0-9]+]]:gpr = ADD_UW undef %10:gpr, $x0
+ ; CHECK-NEXT: [[ADD_UW1:%[0-9]+]]:gprnox0 = ADD_UW undef %5:gpr, $x0
+ ; CHECK-NEXT: dead [[SLLI1:%[0-9]+]]:gpr = SLLI undef %32:gpr, 2
+ ; CHECK-NEXT: dead [[OR1:%[0-9]+]]:gpr = OR undef [[SLLI1]], undef [[PseudoCCMOVGPR]]
+ ; CHECK-NEXT: dead [[OR2:%[0-9]+]]:gpr = OR undef [[SLLI1]], undef [[PseudoCCMOVGPR1]]
+ ; CHECK-NEXT: dead [[PseudoMovAddr:%[0-9]+]]:gpr = PseudoMovAddr target-flags(riscv-hi) @fooo.g1, target-flags(riscv-lo) @fooo.g1
+ ; CHECK-NEXT: dead [[LUI:%[0-9]+]]:gpr = LUI target-flags(riscv-hi) @g0
+ ; CHECK-NEXT: dead [[MINU:%[0-9]+]]:gprnox0 = MINU undef [[ADD_UW1]], undef %50:gpr
+ ; CHECK-NEXT: dead $x0 = PseudoVSETIVLI 1, 208 /* e32, m1, ta, ma */, implicit-def $vl, implicit-def $vtype
+ ; CHECK-NEXT: dead [[PseudoVMV_X_S:%[0-9]+]]:gpr = PseudoVMV_X_S undef renamable $v8, 5 /* e32 */, implicit $vtype
+ ; CHECK-NEXT: dead [[SLLI2:%[0-9]+]]:gpr = SLLI undef %6:gpr, 3
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.3:
+ ; CHECK-NEXT: successors: %bb.5(0x04000000), %bb.4(0x7c000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: BEQ undef %32:gpr, $x0, %bb.5
+ ; CHECK-NEXT: PseudoBR %bb.4
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.4:
+ ; CHECK-NEXT: successors: %bb.7(0x80000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: PseudoBR %bb.7
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.5:
+ ; CHECK-NEXT: successors: %bb.6(0x80000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.6:
+ ; CHECK-NEXT: successors: %bb.6(0x80000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: PseudoBR %bb.6
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.7:
+ ; CHECK-NEXT: successors: %bb.8(0x80000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: dead [[DEF:%[0-9]+]]:gpr = IMPLICIT_DEF
+ ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $x2, implicit $x2
+ ; CHECK-NEXT: $x10 = COPY undef [[ADDIW]]
+ ; CHECK-NEXT: $x12 = COPY undef [[OR2]]
+ ; CHECK-NEXT: $x13 = COPY undef [[OR1]]
+ ; CHECK-NEXT: PseudoCALLIndirect undef %3:gprjalr, csr_ilp32d_lp64d, implicit-def dead $x1, implicit $x10, implicit undef $x11, implicit $x12, implicit $x13, implicit-def $x2, implicit-def dead $x10
+ ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $x2, implicit $x2
+ ; CHECK-NEXT: dead $x0 = PseudoVSETIVLI 1, 202 /* e16, m4, ta, ma */, implicit-def $vl, implicit-def $vtype
+ ; CHECK-NEXT: dead renamable $v8m4 = PseudoVLE16_V_M4 undef renamable $v8m4, undef %44, 1, 4 /* e16 */, 2 /* tu, ma */, implicit $vl, implicit $vtype :: (load unknown-size, align 32)
+ ; CHECK-NEXT: [[INIT_UNDEF:%[0-9]+]]:gpr = INIT_UNDEF
+ ; CHECK-NEXT: renamable $v8m4 = INIT_UNDEF
+ ; CHECK-NEXT: [[INIT_UNDEF1:%[0-9]+]]:gprnox0 = INIT_UNDEF
+ ; CHECK-NEXT: dead $x0 = PseudoVSETVLI [[INIT_UNDEF1]], 209 /* e32, m2, ta, ma */, implicit-def $vl, implicit-def $vtype
+ ; CHECK-NEXT: dead early-clobber renamable $v12m2 = PseudoVLUXEI64_V_M4_M2_MASK undef renamable $v12m2, [[INIT_UNDEF]], killed renamable $v8m4, undef $v0, $noreg, 5 /* e32 */, 1 /* ta, mu */, implicit $vl, implicit $vtype :: (load unknown-size, align 4)
+ ; CHECK-NEXT: dead [[PseudoVSETVLIX0_:%[0-9]+]]:gpr = PseudoVSETVLIX0 killed $x0, 209 /* e32, m2, ta, ma */, implicit-def $vl, implicit-def $vtype
+ ; CHECK-NEXT: renamable $v12m2 = PseudoVMV_V_I_M2 undef renamable $v12m2, 1, -1, 5 /* e32 */, 0 /* tu, mu */, implicit $vl, implicit $vtype
+ ; CHECK-NEXT: dead [[SH3ADD:%[0-9]+]]:gpr = SH3ADD undef %40:gpr, undef [[PseudoMovAddr]]
+ ; CHECK-NEXT: dead [[PseudoVSETVLIX0_1:%[0-9]+]]:gpr = PseudoVSETVLIX0 killed $x0, 209 /* e32, m2, ta, ma */, implicit-def $vl, implicit-def $vtype
+ ; CHECK-NEXT: renamable $v10m2 = PseudoVMV_V_I_M2 undef renamable $v10m2, 0, -1, 5 /* e32 */, 0 /* tu, mu */, implicit $vl, implicit $vtype
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.8:
+ ; CHECK-NEXT: successors: %bb.9(0x04000000), %bb.8(0x7c000000)
+ ; CHECK-NEXT: liveins: $v10m2, $v12m2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: dead $x0 = PseudoVSETIVLI 1, 192 /* e8, m1, ta, ma */, implicit-def $vl, implicit-def $vtype
+ ; CHECK-NEXT: renamable $v8m2 = COPY killed renamable $v10m2, implicit $vtype
+ ; CHECK-NEXT: renamable $v10m2 = COPY renamable $v12m2, implicit $vtype
+ ; CHECK-NEXT: BEQ undef %53:gpr, $x0, %bb.8
+ ; CHECK-NEXT: PseudoBR %bb.9
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.9:
+ ; CHECK-NEXT: successors: %bb.7(0x7c000000), %bb.10(0x04000000)
+ ; CHECK-NEXT: liveins: $v8m2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: dead $x0 = PseudoVSETIVLI 0, 145 /* e32, m2, tu, ma */, implicit-def $vl, implicit-def $vtype
+ ; CHECK-NEXT: renamable $v8m2 = PseudoVMV_V_V_M2 killed renamable $v8m2, undef renamable $v12m2, 0, 5 /* e32 */, 0 /* tu, mu */, implicit $vl, implicit $vtype
+ ; CHECK-NEXT: $x0 = PseudoVSETVLI [[ADD_UW1]], 209 /* e32, m2, ta, ma */, implicit-def $vl, implicit-def $vtype
+ ; CHECK-NEXT: renamable $v10 = PseudoVMV_S_X undef renamable $v10, undef %53:gpr, $noreg, 5 /* e32 */, implicit $vl, implicit $vtype
+ ; CHECK-NEXT: dead renamable $v8 = PseudoVREDSUM_VS_M2_E32 undef renamable $v8, killed renamable $v8m2, killed renamable $v10, $noreg, 5 /* e32 */, 0 /* tu, mu */, implicit $vl, implicit $vtype
+ ; CHECK-NEXT: BNE undef [[ADD_UW]], $x0, %bb.7
+ ; CHECK-NEXT: PseudoBR %bb.10
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.10:
+ ; CHECK-NEXT: successors: %bb.3(0x80000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: SW undef [[DEF]], undef [[SLLI2]], 0 :: (store (s32))
+ ; CHECK-NEXT: PseudoBR %bb.3
+ bb.0:
+ successors: %bb.1(0x80000000)
+ liveins: $x10, $x11, $x12, $v0, $x13, $x14
+
+ %111:gpr = IMPLICIT_DEF
+ %112:gprnox0 = IMPLICIT_DEF
+ %113:vrm8 = IMPLICIT_DEF
+ %114:gprjalr = IMPLICIT_DEF
+ %115:vmv0 = IMPLICIT_DEF
+ %30:gpr = IMPLICIT_DEF
+ %36:gpr = COPY undef %30
+ %39:gpr = IMPLICIT_DEF
+ %1:gpr = IMPLICIT_DEF
+ %2:gpr = IMPLICIT_DEF
+ %40:gpr = IMPLICIT_DEF
+ %4:gpr = ADDIW undef %40, 0
+ %5:gpr = COPY undef %2
+
+ bb.1:
+ successors: %bb.1(0x7c000000), %bb.2(0x04000000)
+
+ %9:gprnox0 = PHI undef %5, %bb.0, undef %12, %bb.1
+ %10:gpr = PHI undef %112, %bb.0, undef %11, %bb.1
+ %45:gprnox0 = PseudoVSETVLI undef %9, 195 /* e8, m8, ta, ma */, implicit-def dead $vl, implicit-def dead $vtype
+ PseudoVSE8_V_M8 undef %113, undef %10, undef %45, 3 /* e8 */ :: (store unknown-size, align 1)
+ %11:gpr = ADD %10, %45
+ %12:gpr = IMPLICIT_DEF
+ BNE undef %12, $x0, %bb.1
+ PseudoBR %bb.2
+
+ bb.2:
+ successors: %bb.3(0x80000000)
+
+ %47:gpr = IMPLICIT_DEF
+ %13:gpr = COPY undef %36
+ %48:gpr = SLLI undef %36, 2
+ %49:gpr = SUB undef %47, undef %48
+ %51:gpr = SH2ADD undef %111, undef %47
+ %53:gpr = OR killed undef %51, killed undef %49
+ %54:gpr = MULW killed undef %53, undef %2
+ ADJCALLSTACKDOWN 0, 0, implicit-def dead $x2, implicit $x2
+ %55:gpr = IMPLICIT_DEF
+ $x10 = COPY undef %4
+ $x11 = COPY undef %1
+ $x13 = COPY undef %39
+ $x14 = COPY undef %55
+ $x15 = COPY undef %55
+ PseudoCALLIndirect undef %114, csr_ilp32d_lp64d, implicit-def dead $x1, implicit $x10, implicit $x11, implicit $x13, implicit $x14, implicit $x15, implicit-def $x2, implicit-def $x10
+ ADJCALLSTACKUP 0, 0, implicit-def dead $x2, implicit $x2
+ %57:gpr = COPY $x10
+ SW undef %114, undef %48, 0 :: (store (s32))
+ %14:gpr = PseudoCCMOVGPR undef %54, undef %57, 1, undef %114, undef %39
+ %15:gpr = PseudoCCMOVGPR undef %54, %57, 1, undef %114, undef %5
+ %16:gpr = ADD_UW undef %40, $x0
+ %17:gprnox0 = ADD_UW undef %30, $x0
+ %46:gpr = COPY undef %114
+
+ bb.3:
+ successors: %bb.5(0x04000000), %bb.4(0x7c000000)
+
+ %18:gpr = PHI undef %46, %bb.2, undef %93, %bb.10
+ %60:gpr = SLLI undef %18, 2
+ %19:gpr = OR undef %60, undef %14
+ %20:gpr = OR undef %60, undef %15
+ BEQ killed undef %18, $x0, %bb.5
+ PseudoBR %bb.4
+
+ bb.4:
+ successors: %bb.7(0x80000000)
+
+ %64:gpr = COPY $x0
+ %63:gpr = COPY undef %64
+ %62:gpr = COPY undef %64
+ PseudoBR %bb.7
+
+ bb.5:
+ successors: %bb.6(0x80000000)
+
+ bb.6:
+ successors: %bb.6(0x80000000)
+
+ PseudoBR %bb.6
+
+ bb.7:
+ successors: %bb.8(0x80000000)
+
+ %21:gpr = PHI undef %62, %bb.4, undef %111, %bb.9
+ %22:gpr = PHI undef %63, %bb.4, undef %28, %bb.9
+ ADJCALLSTACKDOWN 0, 0, implicit-def dead $x2, implicit $x2
+ $x10 = COPY undef %4
+ $x11 = COPY undef %1
+ $x12 = COPY undef %20
+ $x13 = COPY undef %19
+ PseudoCALLIndirect undef %114, csr_ilp32d_lp64d, implicit-def dead $x1, implicit $x10, implicit $x11, implicit $x12, implicit $x13, implicit-def $x2, implicit-def $x10
+ ADJCALLSTACKUP 0, 0, implicit-def dead $x2, implicit $x2
+ %68:gpr = PseudoMovAddr target-flags(riscv-hi) @fooo.g1, target-flags(riscv-lo) @fooo.g1
+ %23:gpr = SH3ADD %21, killed undef %68
+ %69:gpr = LUI target-flags(riscv-hi) @g0
+ %25:gprnox0 = PseudoVSETVLI %17, 199 /* e8, mf2, ta, ma */, implicit-def dead $vl, implicit-def dead $vtype
+ %65:vrm2 = PseudoVMV_V_I_M2 undef $noreg, 0, -1, 5 /* e32 */, 0 /* tu, mu */
+
+ bb.8:
+ successors: %bb.9(0x04000000), %bb.8(0x7c000000)
+
+ %26:vrm2nov0 = PHI %65, %bb.7, %70, %bb.8
+ %71:gpr = IMPLICIT_DEF
+ %72:gprnox0 = MINU undef %17, killed undef %71
+ %73:vrm4 = PseudoVLE16_V_M4 undef $noreg, undef %23, 1, 4 /* e16 */, 2 /* tu, ma */ :: (load unknown-size, align 32)
+ %74:gpr = IMPLICIT_DEF
+ early-clobber %84:vrm2nov0 = PseudoVLUXEI64_V_M4_M2_MASK undef $noreg, undef %69, killed undef %73, undef %115, killed undef %72, 5 /* e32 */, 1 /* ta, mu */ :: (load unknown-size, align 4)
+ %86:vrm2nov0 = PseudoVMERGE_VVM_M2 %26, undef %26, killed undef %84, killed undef %115, 0, 5 /* e32 */
+ %70:vrm2 = PseudoVMV_V_I_M2 undef $noreg, 1, -1, 5 /* e32 */, 0 /* tu, mu */
+ %27:vrm2 = COPY undef %86
+ BEQ killed undef %74, $x0, %bb.8
+ PseudoBR %bb.9
+
+ bb.9:
+ successors: %bb.7(0x7c000000), %bb.10(0x04000000)
+
+ %90:vr = PseudoVMV_S_X undef $noreg, undef %74, %25, 5 /* e32 */
+ %91:vr = PseudoVREDSUM_VS_M2_E32 undef $noreg, %27, killed %90, %25, 5 /* e32 */, 0 /* tu, mu */
+ %28:gpr = PseudoVMV_X_S killed undef %91, 5 /* e32 */
+ BNE undef %16, $x0, %bb.7
+ PseudoBR %bb.10
+
+ bb.10:
+ successors: %bb.3(0x80000000)
+
+ %93:gpr = SLLI undef %13, 3
+ SW undef %22, killed undef %93, 0 :: (store (s32))
+ PseudoBR %bb.3
+...
|
for (MachineBasicBlock &MBB : MF) | ||
coalesceVSETVLIs(MBB); | ||
using BBPtrIterator = pointer_iterator<MachineFunction::iterator>; | ||
SetVector<MachineBasicBlock *> Worklist(BBPtrIterator(MF.begin()), |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Since we don't always traverse the blocks in topological order, we have to use a set here to avoid enqueuing blocks that will be visited anyway later.
@@ -26,6 +26,7 @@ | |||
|
|||
#include "RISCV.h" | |||
#include "RISCVSubtarget.h" | |||
#include "llvm/ADT/SetVector.h" |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is this include perhaps unnecessary?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I've just changed the method and we're no longer using SetVector.
@@ -1840,8 +1852,14 @@ bool RISCVInsertVSETVLI::runOnMachineFunction(MachineFunction &MF) { | |||
// any cross block analysis within the dataflow. We can't have both | |||
// demanded fields based mutation and non-local analysis in the | |||
// dataflow at the same time without introducing inconsistencies. | |||
for (MachineBasicBlock &MBB : MF) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Does traversing the blocks in post-order also fix this? E.g. can we do
for (MachineBasicBlock &MBB : post_order(&MF))
coalesceVSETVLIs(*MBB);
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Good point, it indeed solves the problem. The patch is updated now.
%25:gprnox0 = PseudoVSETVLI %17, 199 /* e8, mf2, ta, ma */, implicit-def dead $vl, implicit-def dead $vtype | ||
%65:vrm2 = PseudoVMV_V_I_M2 undef $noreg, 0, -1, 5 /* e32 */, 0 /* tu, mu */ | ||
|
||
bb.8: | ||
successors: %bb.9(0x04000000), %bb.8(0x7c000000) | ||
|
||
%26:vrm2nov0 = PHI %65, %bb.7, %70, %bb.8 | ||
%71:gpr = IMPLICIT_DEF | ||
%72:gprnox0 = MINU undef %17, killed undef %71 | ||
%73:vrm4 = PseudoVLE16_V_M4 undef $noreg, undef %23, 1, 4 /* e16 */, 2 /* tu, ma */ :: (load unknown-size, align 32) | ||
%74:gpr = IMPLICIT_DEF | ||
early-clobber %84:vrm2nov0 = PseudoVLUXEI64_V_M4_M2_MASK undef $noreg, undef %69, killed undef %73, undef %115, killed undef %72, 5 /* e32 */, 1 /* ta, mu */ :: (load unknown-size, align 4) | ||
%86:vrm2nov0 = PseudoVMERGE_VVM_M2 %26, undef %26, killed undef %84, killed undef %115, 0, 5 /* e32 */ | ||
%70:vrm2 = PseudoVMV_V_I_M2 undef $noreg, 1, -1, 5 /* e32 */, 0 /* tu, mu */ | ||
%27:vrm2 = COPY undef %86 | ||
BEQ killed undef %74, $x0, %bb.8 | ||
PseudoBR %bb.9 |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is it possible to trim this test case down to just block 7 and 8? Or is it relying on the other blocks to force a vsetvli toggle somewhere?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Alright, I finally managed to reduce this test into only a couple of lines. Please take a look.
@@ -0,0 +1,266 @@ | |||
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 | |||
# RUN: llc -mtriple=riscv64 -mattr=+v,+m,+b -O3 -start-after=riscv-isel -stop-after=riscv-insert-vsetvli %s -o - | FileCheck %s |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can this be turned into just llc -mtriple=riscv64 -mattr=+v,+m,+b -run-pass=liveintervals,riscv-insert-vsetvli
?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done.
Co-Authored-By: Luke Lau <luke@igalia.com>
c0088c9
to
5f1cfb6
Compare
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM, thanks
alignment: 2 | ||
tracksRegLiveness: true | ||
noPhis: true | ||
tracksDebugUserValues: true |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can this be removed?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done
target datalayout = "e-m:e-p:64:64-i64:64-i128:128-n32:64-S128" | ||
target triple = "riscv64" | ||
|
||
@g0 = external global ptr | ||
@coalesce.g1 = external constant [16 x ptr] | ||
|
||
define fastcc i32 @coalesce() #0 { | ||
ret i32 4 | ||
} | ||
|
||
attributes #0 = { "target-features"="+v,+m,+b" } | ||
... |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can the LLVM IR be removed?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes, we no longer need the global variables. It's fixed now.
... | ||
--- | ||
name: coalesce | ||
alignment: 2 |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is the alignment needed?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Removed.
@@ -0,0 +1,84 @@ | |||
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 | |||
# RUN: llc -mtriple=riscv64 -mattr=+v,+m,+b -run-pass=liveintervals,riscv-insert-vsetvli %s -o - | FileCheck %s |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I presume +m and +b are no longer needed?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Correct, they're gone now.
…from bottom up (llvm#141298) I ran into a relatively rare case in RISCVInsertVSETVLIPass, where right after the `emitVSETVLI` phase but before the `coalesceVSETVLIs` phase, we have two blocks that look like this: ``` bb.first: %46:gprnox0 = PseudoVSETIVLI %30:gprnox0, 199 /* e8, mf2, ta, ma */, implicit-def $vl, implicit-def $vtype %76:gpr = PseudoVSETVLIX0 killed $x0, ..., implicit-def $vl, implicit-def $vtype $v10m2 = PseudoVMV_V_I_M2 undef renamable $v10m2, 0, -1, 5 /* e32 */, 0 /* tu, mu */, implicit $vl, implicit $vtype ... bb.second: $x0 = PseudoVSETVLI %46, 209 /* e32, m2, ta, ma */, implicit-def $vl, implicit-def $vtype $v10 = PseudoVMV_S_X undef $v10(tied-def 0), undef %53:gpr, $noreg, 5, implicit $vl, implicit $vtype $x0 = PseudoVSETVLI %30, 209 /* e32, m2, ta, ma */, implicit-def $vl, implicit-def $vtype $v8 = PseudoVREDSUM_VS_M2_E32 undef $v8(tied-def 0), killed $v8m2, killed $v10, $noreg, 5, 0, implicit $vl, implicit $vtype ``` After the `coalesceVSETVLIs` phase, it turns into: ``` diff bb.first: - %46:gprnox0 = PseudoVSETIVLI %30:gprnox0, 199 /* e8, mf2, ta, ma */, implicit-def $vl, implicit-def $vtype + dead %46:gprnox0 = PseudoVSETIVLI %30:gprnox0, 199 /* e8, mf2, ta, ma */, implicit-def $vl, implicit-def $vtype %76:gpr = PseudoVSETVLIX0 killed $x0, ..., implicit-def $vl, implicit-def $vtype $v10m2 = PseudoVMV_V_I_M2 undef renamable $v10m2, 0, -1, 5 /* e32 */, 0 /* tu, mu */, implicit $vl, implicit $vtype ... 
bb.second: - $x0 = PseudoVSETVLI %46, 209 /* e32, m2, ta, ma */, implicit-def $vl, implicit-def $vtype + $x0 = PseudoVSETVLI %30, 209 /* e32, m2, ta, ma */, implicit-def $vl, implicit-def $vtype $v10 = PseudoVMV_S_X undef $v10(tied-def 0), undef %53:gpr, $noreg, 5, implicit $vl, implicit $vtype - $x0 = PseudoVSETVLI %30, 209 /* e32, m2, ta, ma */, implicit-def $vl, implicit-def $vtype $v8 = PseudoVREDSUM_VS_M2_E32 undef $v8(tied-def 0), killed $v8m2, killed $v10, $noreg, 5, 0, implicit $vl, implicit $vtype ``` We forwarded `%30` to any use of `%46` and further reduced the number of VSETVLI we need in `bb.second`. But the problem is, if `bb.first` is processed before `bb.second` -- which is the majority of the cases -- then we're not able to remove the vsetvli which defines the now-dead `%46` in `bb.first` after coalescing `bb.second`. This will produce assembly code like this: ``` vsetvli zero, s0, e8, mf2, ta, ma vsetvli a0, zero, e32, m2, ta, ma vmv.v.i v10, 0 ``` This patch fixes this issue by coalescing the blocks from bottom up such that we can account for dead VSETVLI in the earlier blocks after its uses are eliminated in later blocks. --------- Co-authored-by: Luke Lau <luke@igalia.com>
I ran into a relatively rare case in RISCVInsertVSETVLIPass, where right after the
emitVSETVLI
phase but before thecoalesceVSETVLIs
phase, we have two blocks that look like this:After the
coalesceVSETVLIs
phase, it turns into:We forwarded
%30
to any use of%46
and further reduced the number of VSETVLI we need inbb.second
. But the problem is, ifbb.first
is processed beforebb.second
-- which is the majority of the cases -- then we're not able to remove the vsetvli which defines the now-dead%46
inbb.first
after coalescingbb.second
.This will produce assembly code like this:
This patch fixes this issue by coalescing the blocks from bottom up such that we can account for dead VSETVLI in the earlier blocks after its uses are eliminated in later blocks.