[RISCV] Lower VP_SELECT constant false to use vmerge.vxm/vmerge.vim #144461
base: main
Conversation
@llvm/pr-subscribers-backend-risc-v

Author: Liao Chunyu (ChunyuLiao)

Changes: Currently, when the false operand of a vp_select is a splat vector, it is lowered to a vmv_v_x/vmv_v_i. The vmv is hoisted out of the loop by MachineLICM, but a whole-register copy is left in the loop body. By inverting the mask register and swapping the true and false operands of the vp_select, we can eliminate some instructions inside the loop.

Current: https://godbolt.org/z/EnGMn3xeM

Full diff: https://github.com/llvm/llvm-project/pull/144461.diff

4 Files Affected:
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 7cfada6c0601c..ab36d0aeffa99 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -8170,11 +8170,17 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op,
return lowerRESET_FPENV(Op, DAG);
case ISD::EH_DWARF_CFA:
return lowerEH_DWARF_CFA(Op, DAG);
+ case ISD::VP_SELECT:
+ if (SDValue Op2 = Op.getOperand(2);
+ Op2.hasOneUse() && (Op2.getOpcode() == ISD::SPLAT_VECTOR ||
+ Op2.getOpcode() == ISD::SPLAT_VECTOR_PARTS))
+ return lowerVPSelectConstantFalse(Op, DAG);
+ else
+ return lowerVPOp(Op, DAG);
case ISD::VP_MERGE:
if (Op.getSimpleValueType().getVectorElementType() == MVT::i1)
return lowerVPMergeMask(Op, DAG);
[[fallthrough]];
- case ISD::VP_SELECT:
case ISD::VP_ADD:
case ISD::VP_SUB:
case ISD::VP_MUL:
@@ -13176,6 +13182,22 @@ SDValue RISCVTargetLowering::lowerVPFPIntConvOp(SDValue Op,
return convertFromScalableVector(VT, Result, DAG, Subtarget);
}
+SDValue RISCVTargetLowering::lowerVPSelectConstantFalse(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDLoc DL(Op);
+ MVT VT = Op.getSimpleValueType();
+ SDValue TrueVal = Op.getOperand(1);
+ SDValue FalseVal = Op.getOperand(2);
+ SDValue VL = Op.getOperand(3);
+
+ MVT MaskVT = VT.changeVectorElementType(MVT::i1);
+ SDValue AllOneMask = DAG.getNode(RISCVISD::VMSET_VL, DL, MaskVT, VL);
+ SDValue NewMask = DAG.getNode(RISCVISD::VMXOR_VL, DL, MaskVT,
+ Op.getOperand(0), AllOneMask, VL);
+ return DAG.getNode(RISCVISD::VMERGE_VL, DL, VT, NewMask, FalseVal, TrueVal,
+ DAG.getUNDEF(VT), VL);
+}
+
SDValue RISCVTargetLowering::lowerVPMergeMask(SDValue Op,
SelectionDAG &DAG) const {
SDLoc DL(Op);
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h
index 417d684a62382..cf04e56f36288 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.h
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h
@@ -477,6 +477,7 @@ class RISCVTargetLowering : public TargetLowering {
SDValue getTLSDescAddr(GlobalAddressSDNode *N, SelectionDAG &DAG) const;
SDValue lowerConstantFP(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerVPSelectConstantFalse(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerBlockAddress(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerConstantPool(SDValue Op, SelectionDAG &DAG) const;
diff --git a/llvm/test/CodeGen/RISCV/rvv/masked-load-int.ll b/llvm/test/CodeGen/RISCV/rvv/masked-load-int.ll
index 75537406f3515..a9ed70b94c90f 100644
--- a/llvm/test/CodeGen/RISCV/rvv/masked-load-int.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/masked-load-int.ll
@@ -34,10 +34,10 @@ define <vscale x 1 x i8> @masked_load_passthru_nxv1i8(ptr %a, <vscale x 1 x i1>
; ZVE32: # %bb.0:
; ZVE32-NEXT: csrr a1, vlenb
; ZVE32-NEXT: srli a1, a1, 3
-; ZVE32-NEXT: vsetvli a2, zero, e8, mf4, ta, ma
-; ZVE32-NEXT: vmv.v.i v8, 0
-; ZVE32-NEXT: vsetvli zero, a1, e8, mf4, ta, mu
+; ZVE32-NEXT: vsetvli zero, a1, e8, mf4, ta, ma
; ZVE32-NEXT: vle8.v v8, (a0), v0.t
+; ZVE32-NEXT: vmnot.m v0, v0
+; ZVE32-NEXT: vmerge.vim v8, v8, 0, v0
; ZVE32-NEXT: ret
%load = call <vscale x 1 x i8> @llvm.masked.load.nxv1i8(ptr %a, i32 1, <vscale x 1 x i1> %mask, <vscale x 1 x i8> zeroinitializer)
ret <vscale x 1 x i8> %load
diff --git a/llvm/test/CodeGen/RISCV/rvv/vselect-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vselect-vp.ll
index 371ec7c790dda..3918a8009fde8 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vselect-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vselect-vp.ll
@@ -470,6 +470,28 @@ define <vscale x 2 x i64> @select_nxv2i64(<vscale x 2 x i1> %a, <vscale x 2 x i6
ret <vscale x 2 x i64> %v
}
+define <vscale x 2 x i64> @select_nxv2i64_constant_true(<vscale x 2 x i1> %a, <vscale x 2 x i64> %b, i32 zeroext %evl) {
+; CHECK-LABEL: select_nxv2i64_constant_true:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma
+; CHECK-NEXT: vmerge.vim v8, v8, -1, v0
+; CHECK-NEXT: ret
+ %v = call <vscale x 2 x i64> @llvm.vp.select.nxv2i64(<vscale x 2 x i1> %a, <vscale x 2 x i64> splat (i64 -1), <vscale x 2 x i64> %b, i32 %evl)
+ ret <vscale x 2 x i64> %v
+}
+
+define <vscale x 2 x i64> @select_nxv2i64_constant_false(<vscale x 2 x i1> %a, <vscale x 2 x i64> %b, i32 zeroext %evl) {
+; CHECK-LABEL: select_nxv2i64_constant_false:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma
+; CHECK-NEXT: vmnot.m v0, v0
+; CHECK-NEXT: li a0, 100
+; CHECK-NEXT: vmerge.vxm v8, v8, a0, v0
+; CHECK-NEXT: ret
+ %v = call <vscale x 2 x i64> @llvm.vp.select.nxv2i64(<vscale x 2 x i1> %a, <vscale x 2 x i64> %b, <vscale x 2 x i64> splat (i64 100), i32 %evl)
+ ret <vscale x 2 x i64> %v
+}
+
declare <vscale x 4 x i64> @llvm.vp.select.nxv4i64(<vscale x 4 x i1>, <vscale x 4 x i64>, <vscale x 4 x i64>, i32)
define <vscale x 4 x i64> @select_nxv4i64(<vscale x 4 x i1> %a, <vscale x 4 x i64> %b, <vscale x 4 x i64> %c, i32 zeroext %evl) {
@@ -702,10 +724,10 @@ define <vscale x 16 x double> @select_nxv16f64(<vscale x 16 x i1> %a, <vscale x
; CHECK-NEXT: and a4, a5, a4
; CHECK-NEXT: vsetvli zero, a4, e64, m8, ta, ma
; CHECK-NEXT: vmerge.vvm v16, v24, v16, v0
-; CHECK-NEXT: bltu a2, a1, .LBB48_2
+; CHECK-NEXT: bltu a2, a1, .LBB50_2
; CHECK-NEXT: # %bb.1:
; CHECK-NEXT: mv a2, a1
-; CHECK-NEXT: .LBB48_2:
+; CHECK-NEXT: .LBB50_2:
; CHECK-NEXT: vmv1r.v v0, v7
; CHECK-NEXT: addi a0, sp, 16
; CHECK-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
✅ With the latest revision this PR passed the C/C++ code formatter.
Force-pushed from c1d7c1b to d5cae13.
; ZVE32-NEXT: vle8.v v8, (a0), v0.t
; ZVE32-NEXT: vmnot.m v0, v0
; ZVE32-NEXT: vmerge.vim v8, v8, 0, v0
I don't think using vmerge is better. Usually vmerge is slower.
Can you help explain why vmerge is slower? According to this example, https://github.com/llvm/llvm-project/blob/main/llvm/test/tools/llvm-mca/RISCV/SiFiveX280/vector-integer-arithmetic.s#L1477
vmv, 4 cycles
vmerge, 4 cycles
vmnot.m, 4 cycles
vsetvl, 3 cycles
There can be some microarchitecture optimizations, like mv/zero idiom elimination; that is my point. And for this test case, it is a regression, since we generate vmnot.m+vmerge.vim instead of a vmv.vi and the latency is doubled.
Thanks. For this case, it also saves one vsetvl instruction, but introduces one additional cycle of latency.
Yeah, this does look like a regression, particularly since the vmv.vi would otherwise be loop invariant and hoisted out, but the vmerge.vxm will not be.
MVT MaskVT = VT.changeVectorElementType(MVT::i1);
SDValue AllOneMask = DAG.getNode(RISCVISD::VMSET_VL, DL, MaskVT, VL);
SDValue NewMask = DAG.getNode(RISCVISD::VMXOR_VL, DL, MaskVT, |
Does the VMXOR get combined with any compare instruction that produces the mask?
I plan to write another patch to combine vmxor with setcc. This vmxor cannot be eliminated now.
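For readers following the thread, here is a rough sketch of what folding the inversion into the compare could look like. It is illustrative only, not the planned follow-up patch; the helper name is made up, and it assumes the mask comes from a plain ISD::SETCC rather than a VP compare.

// Sketch: fold xor(setcc(a, b, cc), all-ones) into setcc(a, b, !cc), so no
// separate vmxor/vmnot is needed for the inverted mask.
static SDValue foldInvertedMaskIntoSetCC(SDValue Mask, SelectionDAG &DAG,
                                         const SDLoc &DL) {
  // Only handle a single-use compare feeding the inversion.
  if (Mask.getOpcode() != ISD::SETCC || !Mask.hasOneUse())
    return SDValue();
  ISD::CondCode CC = cast<CondCodeSDNode>(Mask.getOperand(2))->get();
  ISD::CondCode InvCC =
      ISD::getSetCCInverse(CC, Mask.getOperand(0).getValueType());
  // Rebuild the compare with the inverted predicate; the caller then uses
  // this mask directly instead of xor'ing the original one with all-ones.
  return DAG.getSetCC(DL, Mask.getValueType(), Mask.getOperand(0),
                      Mask.getOperand(1), InvCC);
}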
@@ -8170,11 +8170,17 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op,
return lowerRESET_FPENV(Op, DAG);
case ISD::EH_DWARF_CFA:
return lowerEH_DWARF_CFA(Op, DAG);
case ISD::VP_SELECT:
if (SDValue Op2 = Op.getOperand(2); |
Have you tried doing this as a pre-lowering combine instead? I could see that triggering other interactions, but it would address the missing fold problem Craig notes below.
Indeed, a pre-lowering combine is better: it reuses the existing SDNode optimizations, and I've already made the change.
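A sketch of that pre-lowering direction, for illustration only; the function name, guards, and placement are assumptions, not the updated patch.

// Sketch of a generic-DAG combine: rewrite
//   vp.select(m, t, splat(c))  ->  vp.select(xor(m, all-ones), splat(c), t)
// so the splat ends up in the "true" slot (foldable into vmerge.vim/vxm)
// while the mask inversion stays visible to other combines, such as the
// setcc fold discussed above.
static SDValue performVPSelectSplatFalseCombine(SDNode *N, SelectionDAG &DAG) {
  SDValue Mask = N->getOperand(0);
  SDValue TrueV = N->getOperand(1);
  SDValue FalseV = N->getOperand(2);
  SDValue EVL = N->getOperand(3);

  // Mirror the guard in the lowering hunk above: only a single-use splat in
  // the false slot, and don't ping-pong when both operands are splats.
  if (!FalseV.hasOneUse() || (FalseV.getOpcode() != ISD::SPLAT_VECTOR &&
                              FalseV.getOpcode() != ISD::SPLAT_VECTOR_PARTS))
    return SDValue();
  if (TrueV.getOpcode() == ISD::SPLAT_VECTOR ||
      TrueV.getOpcode() == ISD::SPLAT_VECTOR_PARTS)
    return SDValue();

  SDLoc DL(N);
  // getNOT emits an xor with an all-ones splat for vector types, leaving a
  // generic node that later combines can still see.
  SDValue NotMask = DAG.getNOT(DL, Mask, Mask.getValueType());
  return DAG.getNode(ISD::VP_SELECT, DL, N->getValueType(0),
                     {NotMask, FalseV, TrueV, EVL});
}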
; CHECK-LABEL: select_nxv2i64_constant_false:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma
; CHECK-NEXT: vmnot.m v0, v0 |
I believe the prior codegen here is going to be li; vmv.v.x; vmerge, right? If so, negating the condition would be profitable in this case, though you do have the hoisting interaction noted above.
@@ -34,10 +34,10 @@ define <vscale x 1 x i8> @masked_load_passthru_nxv1i8(ptr %a, <vscale x 1 x i1>
; ZVE32: # %bb.0:
; ZVE32-NEXT: csrr a1, vlenb
; ZVE32-NEXT: srli a1, a1, 3
; ZVE32-NEXT: vsetvli a2, zero, e8, mf4, ta, ma
; ZVE32-NEXT: vmv.v.i v8, 0 |
Was this patch (partially) motivated by https://github.com/llvm/llvm-project/pull/144170/files#r2146240973? If that's the case, could we solve it by changing the VL optimizer? We don't need to splat 0 for VLMAX elements.
Force-pushed from d5cae13 to 3151db1.
Currently, when the false operand of a vp_select is a splat vector, it is lowered to a vmv_v_x/vmv_v_i. The vmv is hoisted out of the loop by MachineLICM, but a whole-register copy is left in the loop body.
By inverting the mask register and swapping the true and false operands of the vp_select, we can eliminate some instructions inside the loop.
Current: https://godbolt.org/z/EnGMn3xeM
Expected similar form: https://godbolt.org/z/nWhGM6Ej5
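For context, a hypothetical source loop of the kind the godbolt links illustrate; this function is an illustration of the pattern only, not taken from the links.

// Hypothetical reproducer: the false arm of the select is a loop-invariant
// constant, so EVL-based vectorization yields a vp.select whose false operand
// is a splat, like the select_nxv2i64_constant_false test added in this patch.
void keep_or_default(long *a, const long *b, int n) {
  for (int i = 0; i < n; ++i)
    a[i] = (b[i] > 0) ? b[i] : 100;
}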