[RISCV] Lower VP_SELECT constant false to use vmerge.vxm/vmerge.vim #144461
base: main
Conversation
@llvm/pr-subscribers-backend-risc-v

Author: Liao Chunyu (ChunyuLiao)

Changes: Currently, when the false operand of a vp_select is a splat vector, it is lowered to a vmv_v_x/vmv_v_i. The vmv is hoisted out of the loop by MachineLICM, but a whole-register copy is left in the loop body. By inverting the mask register and swapping the true and false operands of the vp_select, we can eliminate some instructions inside the loop.

Current: https://godbolt.org/z/EnGMn3xeM

Full diff: https://github.com/llvm/llvm-project/pull/144461.diff

4 Files Affected:
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 7cfada6c0601c..ab36d0aeffa99 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -8170,11 +8170,17 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op,
return lowerRESET_FPENV(Op, DAG);
case ISD::EH_DWARF_CFA:
return lowerEH_DWARF_CFA(Op, DAG);
+ case ISD::VP_SELECT:
+ if (SDValue Op2 = Op.getOperand(2);
+ Op2.hasOneUse() && (Op2.getOpcode() == ISD::SPLAT_VECTOR ||
+ Op2.getOpcode() == ISD::SPLAT_VECTOR_PARTS))
+ return lowerVPSelectConstantFalse(Op, DAG);
+ else
+ return lowerVPOp(Op, DAG);
case ISD::VP_MERGE:
if (Op.getSimpleValueType().getVectorElementType() == MVT::i1)
return lowerVPMergeMask(Op, DAG);
[[fallthrough]];
- case ISD::VP_SELECT:
case ISD::VP_ADD:
case ISD::VP_SUB:
case ISD::VP_MUL:
@@ -13176,6 +13182,22 @@ SDValue RISCVTargetLowering::lowerVPFPIntConvOp(SDValue Op,
return convertFromScalableVector(VT, Result, DAG, Subtarget);
}
+SDValue RISCVTargetLowering::lowerVPSelectConstantFalse(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDLoc DL(Op);
+ MVT VT = Op.getSimpleValueType();
+ SDValue TrueVal = Op.getOperand(1);
+ SDValue FalseVal = Op.getOperand(2);
+ SDValue VL = Op.getOperand(3);
+
+ MVT MaskVT = VT.changeVectorElementType(MVT::i1);
+ SDValue AllOneMask = DAG.getNode(RISCVISD::VMSET_VL, DL, MaskVT, VL);
+ SDValue NewMask = DAG.getNode(RISCVISD::VMXOR_VL, DL, MaskVT,
+ Op.getOperand(0), AllOneMask, VL);
+ return DAG.getNode(RISCVISD::VMERGE_VL, DL, VT, NewMask, FalseVal, TrueVal,
+ DAG.getUNDEF(VT), VL);
+}
+
SDValue RISCVTargetLowering::lowerVPMergeMask(SDValue Op,
SelectionDAG &DAG) const {
SDLoc DL(Op);
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h
index 417d684a62382..cf04e56f36288 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.h
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h
@@ -477,6 +477,7 @@ class RISCVTargetLowering : public TargetLowering {
SDValue getTLSDescAddr(GlobalAddressSDNode *N, SelectionDAG &DAG) const;
SDValue lowerConstantFP(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerVPSelectConstantFalse(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerBlockAddress(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerConstantPool(SDValue Op, SelectionDAG &DAG) const;
diff --git a/llvm/test/CodeGen/RISCV/rvv/masked-load-int.ll b/llvm/test/CodeGen/RISCV/rvv/masked-load-int.ll
index 75537406f3515..a9ed70b94c90f 100644
--- a/llvm/test/CodeGen/RISCV/rvv/masked-load-int.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/masked-load-int.ll
@@ -34,10 +34,10 @@ define <vscale x 1 x i8> @masked_load_passthru_nxv1i8(ptr %a, <vscale x 1 x i1>
; ZVE32: # %bb.0:
; ZVE32-NEXT: csrr a1, vlenb
; ZVE32-NEXT: srli a1, a1, 3
-; ZVE32-NEXT: vsetvli a2, zero, e8, mf4, ta, ma
-; ZVE32-NEXT: vmv.v.i v8, 0
-; ZVE32-NEXT: vsetvli zero, a1, e8, mf4, ta, mu
+; ZVE32-NEXT: vsetvli zero, a1, e8, mf4, ta, ma
; ZVE32-NEXT: vle8.v v8, (a0), v0.t
+; ZVE32-NEXT: vmnot.m v0, v0
+; ZVE32-NEXT: vmerge.vim v8, v8, 0, v0
; ZVE32-NEXT: ret
%load = call <vscale x 1 x i8> @llvm.masked.load.nxv1i8(ptr %a, i32 1, <vscale x 1 x i1> %mask, <vscale x 1 x i8> zeroinitializer)
ret <vscale x 1 x i8> %load
diff --git a/llvm/test/CodeGen/RISCV/rvv/vselect-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vselect-vp.ll
index 371ec7c790dda..3918a8009fde8 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vselect-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vselect-vp.ll
@@ -470,6 +470,28 @@ define <vscale x 2 x i64> @select_nxv2i64(<vscale x 2 x i1> %a, <vscale x 2 x i6
ret <vscale x 2 x i64> %v
}
+define <vscale x 2 x i64> @select_nxv2i64_constant_true(<vscale x 2 x i1> %a, <vscale x 2 x i64> %b, i32 zeroext %evl) {
+; CHECK-LABEL: select_nxv2i64_constant_true:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma
+; CHECK-NEXT: vmerge.vim v8, v8, -1, v0
+; CHECK-NEXT: ret
+ %v = call <vscale x 2 x i64> @llvm.vp.select.nxv2i64(<vscale x 2 x i1> %a, <vscale x 2 x i64> splat (i64 -1), <vscale x 2 x i64> %b, i32 %evl)
+ ret <vscale x 2 x i64> %v
+}
+
+define <vscale x 2 x i64> @select_nxv2i64_constant_false(<vscale x 2 x i1> %a, <vscale x 2 x i64> %b, i32 zeroext %evl) {
+; CHECK-LABEL: select_nxv2i64_constant_false:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma
+; CHECK-NEXT: vmnot.m v0, v0
+; CHECK-NEXT: li a0, 100
+; CHECK-NEXT: vmerge.vxm v8, v8, a0, v0
+; CHECK-NEXT: ret
+ %v = call <vscale x 2 x i64> @llvm.vp.select.nxv2i64(<vscale x 2 x i1> %a, <vscale x 2 x i64> %b, <vscale x 2 x i64> splat (i64 100), i32 %evl)
+ ret <vscale x 2 x i64> %v
+}
+
declare <vscale x 4 x i64> @llvm.vp.select.nxv4i64(<vscale x 4 x i1>, <vscale x 4 x i64>, <vscale x 4 x i64>, i32)
define <vscale x 4 x i64> @select_nxv4i64(<vscale x 4 x i1> %a, <vscale x 4 x i64> %b, <vscale x 4 x i64> %c, i32 zeroext %evl) {
@@ -702,10 +724,10 @@ define <vscale x 16 x double> @select_nxv16f64(<vscale x 16 x i1> %a, <vscale x
; CHECK-NEXT: and a4, a5, a4
; CHECK-NEXT: vsetvli zero, a4, e64, m8, ta, ma
; CHECK-NEXT: vmerge.vvm v16, v24, v16, v0
-; CHECK-NEXT: bltu a2, a1, .LBB48_2
+; CHECK-NEXT: bltu a2, a1, .LBB50_2
; CHECK-NEXT: # %bb.1:
; CHECK-NEXT: mv a2, a1
-; CHECK-NEXT: .LBB48_2:
+; CHECK-NEXT: .LBB50_2:
; CHECK-NEXT: vmv1r.v v0, v7
; CHECK-NEXT: addi a0, sp, 16
; CHECK-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
✅ With the latest revision this PR passed the C/C++ code formatter.
Force-pushed from c1d7c1b to d5cae13.
; ZVE32-NEXT: vle8.v v8, (a0), v0.t
; ZVE32-NEXT: vmnot.m v0, v0
; ZVE32-NEXT: vmerge.vim v8, v8, 0, v0
I don't think using vmerge is better. Usually vmerge is slower.
Can you help explain why vmerge is slower? According to this example, https://github.com/llvm/llvm-project/blob/main/llvm/test/tools/llvm-mca/RISCV/SiFiveX280/vector-integer-arithmetic.s#L1477
vmv, 4 cycles
vmerge, 4 cycles
vmnot.m, 4 cycles
vsetvl, 3 cycles
There can be some microarchitecture optimizations, like mv/zero idiom elimination; that is my point. And for this test case, it is a regression, since we generate vmnot.m+vmerge.vim instead of a vmv.vi and the latency is doubled.
Thanks. For this case, it also saves one vsetvl instruction, but introduces one additional cycle of latency.
Yeah, this does look like a regression, particularly since the vmv.vi would otherwise be loop invariant and hoisted out, but the vmerge.vxm will not be.
MVT MaskVT = VT.changeVectorElementType(MVT::i1);
SDValue AllOneMask = DAG.getNode(RISCVISD::VMSET_VL, DL, MaskVT, VL);
SDValue NewMask = DAG.getNode(RISCVISD::VMXOR_VL, DL, MaskVT, |
Does the VMXOR get combined with any compare instruction that produces the mask?
I plan to write another patch to combine vmxor with setcc. This vmxor cannot be eliminated now.
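For readers following the thread, here is a rough sketch of what folding the inversion into the compare could look like. It is illustrative only, not the planned follow-up patch; the helper name is made up, and it assumes the mask comes from a plain ISD::SETCC rather than a VP compare.

// Sketch: fold xor(setcc(a, b, cc), all-ones) into setcc(a, b, !cc), so no
// separate vmxor/vmnot is needed for the inverted mask.
static SDValue foldInvertedMaskIntoSetCC(SDValue Mask, SelectionDAG &DAG,
                                         const SDLoc &DL) {
  // Only handle a single-use compare feeding the inversion.
  if (Mask.getOpcode() != ISD::SETCC || !Mask.hasOneUse())
    return SDValue();
  ISD::CondCode CC = cast<CondCodeSDNode>(Mask.getOperand(2))->get();
  ISD::CondCode InvCC =
      ISD::getSetCCInverse(CC, Mask.getOperand(0).getValueType());
  // Rebuild the compare with the inverted predicate; the caller then uses
  // this mask directly instead of xor'ing the original one with all-ones.
  return DAG.getSetCC(DL, Mask.getValueType(), Mask.getOperand(0),
                      Mask.getOperand(1), InvCC);
}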
@@ -8170,11 +8170,17 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op,
return lowerRESET_FPENV(Op, DAG);
case ISD::EH_DWARF_CFA:
return lowerEH_DWARF_CFA(Op, DAG);
case ISD::VP_SELECT:
if (SDValue Op2 = Op.getOperand(2); |
Have you tried doing this as a pre-lowering combine instead? I could see that triggering other interactions, but it would address the missing fold problem Craig notes below.
Indeed, a pre-lowering combine is better: it reuses the existing SDNode optimizations, and I've already made the change.
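A sketch of that pre-lowering direction, for illustration only; the function name, guards, and placement are assumptions, not the updated patch.

// Sketch of a generic-DAG combine: rewrite
//   vp.select(m, t, splat(c))  ->  vp.select(xor(m, all-ones), splat(c), t)
// so the splat ends up in the "true" slot (foldable into vmerge.vim/vxm)
// while the mask inversion stays visible to other combines, such as the
// setcc fold discussed above.
static SDValue performVPSelectSplatFalseCombine(SDNode *N, SelectionDAG &DAG) {
  SDValue Mask = N->getOperand(0);
  SDValue TrueV = N->getOperand(1);
  SDValue FalseV = N->getOperand(2);
  SDValue EVL = N->getOperand(3);

  // Mirror the guard in the lowering hunk above: only a single-use splat in
  // the false slot, and don't ping-pong when both operands are splats.
  if (!FalseV.hasOneUse() || (FalseV.getOpcode() != ISD::SPLAT_VECTOR &&
                              FalseV.getOpcode() != ISD::SPLAT_VECTOR_PARTS))
    return SDValue();
  if (TrueV.getOpcode() == ISD::SPLAT_VECTOR ||
      TrueV.getOpcode() == ISD::SPLAT_VECTOR_PARTS)
    return SDValue();

  SDLoc DL(N);
  // getNOT emits an xor with an all-ones splat for vector types, leaving a
  // generic node that later combines can still see.
  SDValue NotMask = DAG.getNOT(DL, Mask, Mask.getValueType());
  return DAG.getNode(ISD::VP_SELECT, DL, N->getValueType(0),
                     {NotMask, FalseV, TrueV, EVL});
}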
; CHECK-LABEL: select_nxv2i64_constant_false:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma
; CHECK-NEXT: vmnot.m v0, v0 |
I believe the prior codegen here is going to be li; vmv.v.x; vmerge, right? If so, negating the condition would be profitable in this case, though you do have the hoisting interaction noted above.
@@ -34,10 +34,10 @@ define <vscale x 1 x i8> @masked_load_passthru_nxv1i8(ptr %a, <vscale x 1 x i1>
; ZVE32: # %bb.0:
; ZVE32-NEXT: csrr a1, vlenb
; ZVE32-NEXT: srli a1, a1, 3
; ZVE32-NEXT: vsetvli a2, zero, e8, mf4, ta, ma
; ZVE32-NEXT: vmv.v.i v8, 0 |
Was this patch (partially) motivated by https://github.com/llvm/llvm-project/pull/144170/files#r2146240973? If that's the case, could we solve it by changing the VL optimizer? We don't need to splat 0 for VLMAX elements.
Force-pushed from d5cae13 to 3151db1.
Currently, when the false operand of a vp_select is a splat vector, it is lowered to a vmv_v_x/vmv_v_i. The vmv is hoisted out of the loop by MachineLICM, but a whole-register copy is left in the loop body.
By inverting the mask register and swapping the true and false operands of the vp_select, we can eliminate some instructions inside the loop.
Current: https://godbolt.org/z/EnGMn3xeM
Expected similar form: https://godbolt.org/z/nWhGM6Ej5
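For context, a hypothetical source loop of the kind the godbolt links illustrate; this function is an illustration of the pattern only, not taken from the links.

// Hypothetical reproducer: the false arm of the select is a loop-invariant
// constant, so EVL-based vectorization yields a vp.select whose false operand
// is a splat, like the select_nxv2i64_constant_false test added in this patch.
void keep_or_default(long *a, const long *b, int n) {
  for (int i = 0; i < n; ++i)
    a[i] = (b[i] > 0) ? b[i] : 100;
}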