[SelectionDAG][RISCV] (add (vscale * C0), (vscale * C1)) to (vscale * (C0 + C1)) in getNode. #144565
Conversation
… (C0 + C1)) in getNode. This is an alternative to the DAGCombine proposed in llvm#144507.
@llvm/pr-subscribers-llvm-selectiondag

Author: Craig Topper (topperc)

Changes

We already have shl/mul vscale related folds in getNode. This is an alternative to the DAGCombine proposed in #144507.

Patch is 127.45 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/144565.diff

14 Files Affected:
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 45a37622a531b..b50d8fe72d5cb 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -7377,6 +7377,12 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
if ((Opcode == ISD::ADD || Opcode == ISD::SUB) &&
VT.getScalarType() == MVT::i1)
return getNode(ISD::XOR, DL, VT, N1, N2);
+ if (Opcode == ISD::ADD && N1.getOpcode() == ISD::VSCALE &&
+ N2.getOpcode() == ISD::VSCALE) {
+ const APInt &C1 = N1->getConstantOperandAPInt(0);
+ const APInt &C2 = N2->getConstantOperandAPInt(0);
+ return getVScale(DL, VT, C1 + C2);
+ }
break;
case ISD::MUL:
assert(VT.isInteger() && "This operator does not apply to FP types!");
diff --git a/llvm/test/CodeGen/RISCV/rvv/extract-subvector.ll b/llvm/test/CodeGen/RISCV/rvv/extract-subvector.ll
index 83637e4a71d45..0d288c0426a5e 100644
--- a/llvm/test/CodeGen/RISCV/rvv/extract-subvector.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/extract-subvector.ll
@@ -490,8 +490,6 @@ define <vscale x 6 x half> @extract_nxv6f16_nxv12f16_6(<vscale x 12 x half> %in)
; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma
; CHECK-NEXT: vslidedown.vx v13, v10, a0
; CHECK-NEXT: vslidedown.vx v12, v9, a0
-; CHECK-NEXT: add a1, a0, a0
-; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
; CHECK-NEXT: vslideup.vx v12, v10, a0
; CHECK-NEXT: vmv2r.v v8, v12
; CHECK-NEXT: ret
@@ -545,8 +543,6 @@ define <vscale x 6 x bfloat> @extract_nxv6bf16_nxv12bf16_6(<vscale x 12 x bfloat
; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma
; CHECK-NEXT: vslidedown.vx v13, v10, a0
; CHECK-NEXT: vslidedown.vx v12, v9, a0
-; CHECK-NEXT: add a1, a0, a0
-; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
; CHECK-NEXT: vslideup.vx v12, v10, a0
; CHECK-NEXT: vmv2r.v v8, v12
; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/insert-subvector.ll b/llvm/test/CodeGen/RISCV/rvv/insert-subvector.ll
index ca9cec921b3cd..e6f3c56956f72 100644
--- a/llvm/test/CodeGen/RISCV/rvv/insert-subvector.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/insert-subvector.ll
@@ -80,10 +80,9 @@ define <vscale x 4 x i8> @insert_nxv1i8_nxv4i8_3(<vscale x 4 x i8> %vec, <vscale
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: srli a0, a0, 3
; CHECK-NEXT: slli a1, a0, 1
-; CHECK-NEXT: add a1, a1, a0
; CHECK-NEXT: add a0, a1, a0
-; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma
-; CHECK-NEXT: vslideup.vx v8, v9, a1
+; CHECK-NEXT: vsetvli a1, zero, e8, mf2, ta, ma
+; CHECK-NEXT: vslideup.vx v8, v9, a0
; CHECK-NEXT: ret
%v = call <vscale x 4 x i8> @llvm.vector.insert.nxv1i8.nxv4i8(<vscale x 4 x i8> %vec, <vscale x 1 x i8> %subvec, i64 3)
ret <vscale x 4 x i8> %v
@@ -246,8 +245,7 @@ define <vscale x 16 x i32> @insert_nxv16i32_nxv1i32_1(<vscale x 16 x i32> %vec,
; CHECK: # %bb.0:
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: srli a0, a0, 3
-; CHECK-NEXT: add a1, a0, a0
-; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma
+; CHECK-NEXT: vsetvli a1, zero, e32, m1, ta, ma
; CHECK-NEXT: vslideup.vx v8, v16, a0
; CHECK-NEXT: ret
%v = call <vscale x 16 x i32> @llvm.vector.insert.nxv1i32.nxv16i32(<vscale x 16 x i32> %vec, <vscale x 1 x i32> %subvec, i64 1)
@@ -282,8 +280,8 @@ define <vscale x 16 x i8> @insert_nxv16i8_nxv1i8_1(<vscale x 16 x i8> %vec, <vsc
; CHECK-LABEL: insert_nxv16i8_nxv1i8_1:
; CHECK: # %bb.0:
; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: srli a1, a0, 2
; CHECK-NEXT: srli a0, a0, 3
-; CHECK-NEXT: add a1, a0, a0
; CHECK-NEXT: vsetvli zero, a1, e8, m1, tu, ma
; CHECK-NEXT: vslideup.vx v8, v10, a0
; CHECK-NEXT: ret
@@ -296,8 +294,9 @@ define <vscale x 16 x i8> @insert_nxv16i8_nxv1i8_2(<vscale x 16 x i8> %vec, <vsc
; CHECK: # %bb.0:
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: srli a1, a0, 3
+; CHECK-NEXT: slli a2, a1, 1
+; CHECK-NEXT: add a1, a2, a1
; CHECK-NEXT: srli a0, a0, 2
-; CHECK-NEXT: add a1, a0, a1
; CHECK-NEXT: vsetvli zero, a1, e8, m1, tu, ma
; CHECK-NEXT: vslideup.vx v8, v10, a0
; CHECK-NEXT: ret
@@ -309,10 +308,10 @@ define <vscale x 16 x i8> @insert_nxv16i8_nxv1i8_3(<vscale x 16 x i8> %vec, <vsc
; CHECK-LABEL: insert_nxv16i8_nxv1i8_3:
; CHECK: # %bb.0:
; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: srli a0, a0, 3
-; CHECK-NEXT: slli a1, a0, 1
-; CHECK-NEXT: add a1, a1, a0
-; CHECK-NEXT: add a0, a1, a0
+; CHECK-NEXT: srli a1, a0, 3
+; CHECK-NEXT: slli a2, a1, 1
+; CHECK-NEXT: add a1, a2, a1
+; CHECK-NEXT: srli a0, a0, 1
; CHECK-NEXT: vsetvli zero, a0, e8, m1, tu, ma
; CHECK-NEXT: vslideup.vx v8, v10, a1
; CHECK-NEXT: ret
@@ -363,8 +362,7 @@ define <vscale x 32 x half> @insert_nxv32f16_nxv2f16_2(<vscale x 32 x half> %vec
; CHECK: # %bb.0:
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: srli a0, a0, 2
-; CHECK-NEXT: add a1, a0, a0
-; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
+; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma
; CHECK-NEXT: vslideup.vx v8, v16, a0
; CHECK-NEXT: ret
%v = call <vscale x 32 x half> @llvm.vector.insert.nxv2f16.nxv32f16(<vscale x 32 x half> %vec, <vscale x 2 x half> %subvec, i64 2)
@@ -376,8 +374,7 @@ define <vscale x 32 x half> @insert_nxv32f16_nxv2f16_26(<vscale x 32 x half> %ve
; CHECK: # %bb.0:
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: srli a0, a0, 2
-; CHECK-NEXT: add a1, a0, a0
-; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
+; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma
; CHECK-NEXT: vslideup.vx v14, v16, a0
; CHECK-NEXT: ret
%v = call <vscale x 32 x half> @llvm.vector.insert.nxv2f16.nxv32f16(<vscale x 32 x half> %vec, <vscale x 2 x half> %subvec, i64 26)
@@ -397,8 +394,9 @@ define <vscale x 32 x half> @insert_nxv32f16_undef_nxv1f16_26(<vscale x 1 x half
; CHECK: # %bb.0:
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: srli a1, a0, 3
+; CHECK-NEXT: slli a2, a1, 1
+; CHECK-NEXT: add a1, a2, a1
; CHECK-NEXT: srli a0, a0, 2
-; CHECK-NEXT: add a1, a0, a1
; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
; CHECK-NEXT: vslideup.vx v14, v8, a0
; CHECK-NEXT: ret
@@ -422,8 +420,8 @@ define <vscale x 32 x i1> @insert_nxv32i1_nxv8i1_8(<vscale x 32 x i1> %v, <vscal
; CHECK-LABEL: insert_nxv32i1_nxv8i1_8:
; CHECK: # %bb.0:
; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: srli a1, a0, 2
; CHECK-NEXT: srli a0, a0, 3
-; CHECK-NEXT: add a1, a0, a0
; CHECK-NEXT: vsetvli zero, a1, e8, mf2, tu, ma
; CHECK-NEXT: vslideup.vx v0, v8, a0
; CHECK-NEXT: ret
@@ -462,10 +460,11 @@ define <vscale x 4 x i1> @insert_nxv4i1_nxv1i1_2(<vscale x 4 x i1> %v, <vscale x
; CHECK-NEXT: vsetvli a1, zero, e8, mf8, ta, ma
; CHECK-NEXT: vmv.v.i v10, 0
; CHECK-NEXT: srli a1, a0, 3
-; CHECK-NEXT: srli a0, a0, 2
-; CHECK-NEXT: add a1, a0, a1
; CHECK-NEXT: vmv1r.v v0, v8
; CHECK-NEXT: vmerge.vim v8, v10, 1, v0
+; CHECK-NEXT: slli a2, a1, 1
+; CHECK-NEXT: add a1, a2, a1
+; CHECK-NEXT: srli a0, a0, 2
; CHECK-NEXT: vsetvli zero, a1, e8, mf2, tu, ma
; CHECK-NEXT: vslideup.vx v9, v8, a0
; CHECK-NEXT: vsetvli a0, zero, e8, mf2, ta, ma
@@ -570,8 +569,7 @@ define <vscale x 32 x bfloat> @insert_nxv32bf16_nxv2bf16_2(<vscale x 32 x bfloat
; CHECK: # %bb.0:
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: srli a0, a0, 2
-; CHECK-NEXT: add a1, a0, a0
-; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
+; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma
; CHECK-NEXT: vslideup.vx v8, v16, a0
; CHECK-NEXT: ret
%v = call <vscale x 32 x bfloat> @llvm.vector.insert.nxv2bf16.nxv32bf16(<vscale x 32 x bfloat> %vec, <vscale x 2 x bfloat> %subvec, i64 2)
@@ -583,8 +581,7 @@ define <vscale x 32 x bfloat> @insert_nxv32bf16_nxv2bf16_26(<vscale x 32 x bfloa
; CHECK: # %bb.0:
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: srli a0, a0, 2
-; CHECK-NEXT: add a1, a0, a0
-; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
+; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma
; CHECK-NEXT: vslideup.vx v14, v16, a0
; CHECK-NEXT: ret
%v = call <vscale x 32 x bfloat> @llvm.vector.insert.nxv2bf16.nxv32bf16(<vscale x 32 x bfloat> %vec, <vscale x 2 x bfloat> %subvec, i64 26)
@@ -604,8 +601,9 @@ define <vscale x 32 x bfloat> @insert_nxv32bf16_undef_nxv1bf16_26(<vscale x 1 x
; CHECK: # %bb.0:
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: srli a1, a0, 3
+; CHECK-NEXT: slli a2, a1, 1
+; CHECK-NEXT: add a1, a2, a1
; CHECK-NEXT: srli a0, a0, 2
-; CHECK-NEXT: add a1, a0, a1
; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
; CHECK-NEXT: vslideup.vx v14, v8, a0
; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/setcc-fp-vp.ll b/llvm/test/CodeGen/RISCV/rvv/setcc-fp-vp.ll
index 28b27bb75f210..9972df97ad9f4 100644
--- a/llvm/test/CodeGen/RISCV/rvv/setcc-fp-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/setcc-fp-vp.ll
@@ -1371,6 +1371,8 @@ define <vscale x 64 x i1> @fcmp_oeq_vv_nxv64bf16(<vscale x 64 x bfloat> %va, <vs
; CHECK-NEXT: slli a1, a1, 1
; CHECK-NEXT: mv a3, a1
; CHECK-NEXT: slli a1, a1, 2
+; CHECK-NEXT: add a3, a3, a1
+; CHECK-NEXT: slli a1, a1, 1
; CHECK-NEXT: add a1, a1, a3
; CHECK-NEXT: add a1, sp, a1
; CHECK-NEXT: addi a1, a1, 16
@@ -1378,9 +1380,8 @@ define <vscale x 64 x i1> @fcmp_oeq_vv_nxv64bf16(<vscale x 64 x bfloat> %va, <vs
; CHECK-NEXT: vsetvli a1, zero, e8, m1, ta, ma
; CHECK-NEXT: vmv8r.v v0, v16
; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: mv a3, a1
; CHECK-NEXT: slli a1, a1, 1
-; CHECK-NEXT: add a3, a3, a1
+; CHECK-NEXT: mv a3, a1
; CHECK-NEXT: slli a1, a1, 3
; CHECK-NEXT: add a1, a1, a3
; CHECK-NEXT: add a1, sp, a1
@@ -1406,6 +1407,8 @@ define <vscale x 64 x i1> @fcmp_oeq_vv_nxv64bf16(<vscale x 64 x bfloat> %va, <vs
; CHECK-NEXT: slli t0, t0, 1
; CHECK-NEXT: mv t1, t0
; CHECK-NEXT: slli t0, t0, 2
+; CHECK-NEXT: add t1, t1, t0
+; CHECK-NEXT: slli t0, t0, 1
; CHECK-NEXT: add t0, t0, t1
; CHECK-NEXT: add t0, sp, t0
; CHECK-NEXT: addi t0, t0, 16
@@ -1413,9 +1416,8 @@ define <vscale x 64 x i1> @fcmp_oeq_vv_nxv64bf16(<vscale x 64 x bfloat> %va, <vs
; CHECK-NEXT: vslidedown.vx v16, v8, a1
; CHECK-NEXT: vl8re16.v v8, (a0)
; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: mv t0, a0
; CHECK-NEXT: slli a0, a0, 1
-; CHECK-NEXT: add t0, t0, a0
+; CHECK-NEXT: mv t0, a0
; CHECK-NEXT: slli a0, a0, 2
; CHECK-NEXT: add a0, a0, t0
; CHECK-NEXT: add a0, sp, a0
@@ -1445,10 +1447,6 @@ define <vscale x 64 x i1> @fcmp_oeq_vv_nxv64bf16(<vscale x 64 x bfloat> %va, <vs
; CHECK-NEXT: vl1r.v v0, (a0) # vscale x 8-byte Folded Reload
; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
; CHECK-NEXT: vmfeq.vv v5, v8, v16, v0.t
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vl1r.v v0, (a0) # vscale x 8-byte Folded Reload
; CHECK-NEXT: vsetvli zero, a6, e16, m4, ta, ma
; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v24
; CHECK-NEXT: csrr a0, vlenb
@@ -1457,85 +1455,95 @@ define <vscale x 64 x i1> @fcmp_oeq_vv_nxv64bf16(<vscale x 64 x bfloat> %va, <vs
; CHECK-NEXT: addi a0, a0, 16
; CHECK-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v8
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: add a0, sp, a0
+; CHECK-NEXT: addi a0, a0, 16
+; CHECK-NEXT: vl1r.v v0, (a0) # vscale x 8-byte Folded Reload
; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT: vmfeq.vv v6, v24, v16, v0.t
-; CHECK-NEXT: add a0, a3, a3
+; CHECK-NEXT: vmfeq.vv v7, v24, v16, v0.t
; CHECK-NEXT: bltu a2, a5, .LBB85_4
; CHECK-NEXT: # %bb.3:
; CHECK-NEXT: mv a2, a5
; CHECK-NEXT: .LBB85_4:
-; CHECK-NEXT: sub a5, a2, a4
-; CHECK-NEXT: csrr a6, vlenb
-; CHECK-NEXT: slli a6, a6, 1
-; CHECK-NEXT: mv a7, a6
-; CHECK-NEXT: slli a6, a6, 2
-; CHECK-NEXT: add a6, a6, a7
-; CHECK-NEXT: add a6, sp, a6
-; CHECK-NEXT: addi a6, a6, 16
-; CHECK-NEXT: vl1r.v v7, (a6) # vscale x 8-byte Folded Reload
-; CHECK-NEXT: vsetvli a6, zero, e8, mf2, ta, ma
-; CHECK-NEXT: vslidedown.vx v0, v7, a3
-; CHECK-NEXT: sltu a6, a2, a5
-; CHECK-NEXT: addi a6, a6, -1
-; CHECK-NEXT: and a5, a6, a5
-; CHECK-NEXT: csrr a6, vlenb
-; CHECK-NEXT: mv a7, a6
-; CHECK-NEXT: slli a6, a6, 1
-; CHECK-NEXT: add a7, a7, a6
-; CHECK-NEXT: slli a6, a6, 3
-; CHECK-NEXT: add a6, a6, a7
-; CHECK-NEXT: add a6, sp, a6
-; CHECK-NEXT: addi a6, a6, 16
-; CHECK-NEXT: vl8r.v v24, (a6) # vscale x 64-byte Folded Reload
-; CHECK-NEXT: vsetvli zero, a5, e16, m4, ta, ma
-; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v28
+; CHECK-NEXT: sub a0, a2, a4
; CHECK-NEXT: csrr a5, vlenb
-; CHECK-NEXT: mv a6, a5
; CHECK-NEXT: slli a5, a5, 1
-; CHECK-NEXT: add a6, a6, a5
+; CHECK-NEXT: mv a6, a5
; CHECK-NEXT: slli a5, a5, 2
+; CHECK-NEXT: add a6, a6, a5
+; CHECK-NEXT: slli a5, a5, 1
+; CHECK-NEXT: add a5, a5, a6
+; CHECK-NEXT: add a5, sp, a5
+; CHECK-NEXT: addi a5, a5, 16
+; CHECK-NEXT: vl1r.v v8, (a5) # vscale x 8-byte Folded Reload
+; CHECK-NEXT: vsetvli a5, zero, e8, mf2, ta, ma
+; CHECK-NEXT: vslidedown.vx v0, v8, a3
+; CHECK-NEXT: sltu a5, a2, a0
+; CHECK-NEXT: addi a5, a5, -1
+; CHECK-NEXT: and a0, a5, a0
+; CHECK-NEXT: csrr a5, vlenb
+; CHECK-NEXT: slli a5, a5, 1
+; CHECK-NEXT: mv a6, a5
+; CHECK-NEXT: slli a5, a5, 3
; CHECK-NEXT: add a5, a5, a6
; CHECK-NEXT: add a5, sp, a5
; CHECK-NEXT: addi a5, a5, 16
-; CHECK-NEXT: vl8r.v v8, (a5) # vscale x 64-byte Folded Reload
+; CHECK-NEXT: vl8r.v v24, (a5) # vscale x 64-byte Folded Reload
+; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v28
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 1
+; CHECK-NEXT: mv a5, a0
+; CHECK-NEXT: slli a0, a0, 2
+; CHECK-NEXT: add a0, a0, a5
+; CHECK-NEXT: add a0, sp, a0
+; CHECK-NEXT: addi a0, a0, 16
+; CHECK-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12
; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT: vmfeq.vv v4, v16, v24, v0.t
-; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma
-; CHECK-NEXT: vslideup.vx v6, v5, a3
+; CHECK-NEXT: vmfeq.vv v10, v16, v24, v0.t
+; CHECK-NEXT: vmv1r.v v9, v7
+; CHECK-NEXT: vsetvli a0, zero, e8, mf2, ta, ma
+; CHECK-NEXT: vslideup.vx v9, v5, a3
; CHECK-NEXT: bltu a2, a4, .LBB85_6
; CHECK-NEXT: # %bb.5:
; CHECK-NEXT: mv a2, a4
; CHECK-NEXT: .LBB85_6:
-; CHECK-NEXT: vmv1r.v v0, v7
-; CHECK-NEXT: csrr a4, vlenb
-; CHECK-NEXT: mv a5, a4
-; CHECK-NEXT: slli a4, a4, 1
-; CHECK-NEXT: add a5, a5, a4
-; CHECK-NEXT: slli a4, a4, 3
-; CHECK-NEXT: add a4, a4, a5
-; CHECK-NEXT: add a4, sp, a4
-; CHECK-NEXT: addi a4, a4, 16
-; CHECK-NEXT: vl8r.v v24, (a4) # vscale x 64-byte Folded Reload
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 1
+; CHECK-NEXT: mv a4, a0
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: add a0, a0, a4
+; CHECK-NEXT: add a0, sp, a0
+; CHECK-NEXT: addi a0, a0, 16
+; CHECK-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, ma
; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v24
-; CHECK-NEXT: csrr a2, vlenb
-; CHECK-NEXT: mv a4, a2
-; CHECK-NEXT: slli a2, a2, 1
-; CHECK-NEXT: add a4, a4, a2
-; CHECK-NEXT: slli a2, a2, 2
-; CHECK-NEXT: add a2, a2, a4
-; CHECK-NEXT: add a2, sp, a2
-; CHECK-NEXT: addi a2, a2, 16
-; CHECK-NEXT: vl8r.v v8, (a2) # vscale x 64-byte Folded Reload
-; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v8
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 1
+; CHECK-NEXT: mv a2, a0
+; CHECK-NEXT: slli a0, a0, 2
+; CHECK-NEXT: add a0, a0, a2
+; CHECK-NEXT: add a0, sp, a0
+; CHECK-NEXT: addi a0, a0, 16
+; CHECK-NEXT: vl8r.v v0, (a0) # vscale x 64-byte Folded Reload
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v0
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 1
+; CHECK-NEXT: mv a2, a0
+; CHECK-NEXT: slli a0, a0, 2
+; CHECK-NEXT: add a2, a2, a0
+; CHECK-NEXT: slli a0, a0, 1
+; CHECK-NEXT: add a0, a0, a2
+; CHECK-NEXT: add a0, sp, a0
+; CHECK-NEXT: addi a0, a0, 16
+; CHECK-NEXT: vl1r.v v0, (a0) # vscale x 8-byte Folded Reload
; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
; CHECK-NEXT: vmfeq.vv v8, v16, v24, v0.t
-; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma
-; CHECK-NEXT: vslideup.vx v8, v4, a3
-; CHECK-NEXT: add a0, a1, a1
-; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma
-; CHECK-NEXT: vslideup.vx v8, v6, a1
+; CHECK-NEXT: vsetvli a0, zero, e8, mf2, ta, ma
+; CHECK-NEXT: vslideup.vx v8, v10, a3
+; CHECK-NEXT: vsetvli a0, zero, e8, m1, ta, ma
+; CHECK-NEXT: vslideup.vx v8, v9, a1
; CHECK-NEXT: vmv.v.v v0, v8
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: mv a1, a0
@@ -3546,8 +3554,7 @@ define <vscale x 64 x i1> @fcmp_oeq_vv_nxv64f16(<vscale x 64 x half> %va, <vscal
; ZVFH-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
; ZVFH-NEXT: vsetvli zero, a2, e16, m8, ta, ma
; ZVFH-NEXT: vmfeq.vv v16, v24, v8, v0.t
-; ZVFH-NEXT: add a0, a1, a1
-; ZVFH-NEXT: vsetvli zero, a0, e8, m1, ta, ma
+; ZVFH-NEXT: vsetvli a0, zero, e8, m1, ta, ma
; ZVFH-NEXT: vslideup.vx v16, v6, a1
; ZVFH-NEXT: vmv.v.v v0, v16
; ZVFH-NEXT: csrr a0, vlenb
@@ -3576,6 +3583,8 @@ define <vscale x 64 x i1> @fcmp_oeq_vv_nxv64f16(<vscale x 64 x half> %va, <vscal
; ZVFHMIN-NEXT: slli a1, a1, 1
; ZVFHMIN-NEXT: mv a3, a1
; ZVFHMIN-NEXT: slli a1, a1, 2
+; ZVFHMIN-NEXT: add a3, a3, a1
+; ZVFHMIN-NEXT: slli a1, a1, 1
; ZVFHMIN-NEXT: add a1, a1, a3
; ZVFHMIN-NEXT: add a1, sp, a1
; ZVFHMIN-NEXT: addi a1, a1, 16
@@ -3583,9 +3592,8 @@ define <vscale x 64 x i1> @fcmp_oeq_vv_nxv64f16(<vscale x 64 x half> %va, <vscal
; ZVFHMIN-NEXT: vsetvli a1, zero, e8, m1, ta, ma
; ZVFHMIN-NEXT: vmv8r.v v0, v16
; ZVFHMIN-NEXT: csrr a1, vlenb
-; ZVFHMIN-NEXT: mv a3, a1
; ZVFHMIN-NEXT: slli a1, a1, 1
-; ZVFHMIN-NEXT: add a3, a3, a1
+; ZVFHMIN-NEXT: mv a3, a1
; ZVFHMIN-NEXT: slli a1, a1, 3
; ZVFHMIN-NEXT: add a1, a1, a3
; ZVFHMIN-NEXT: add a1, sp, a1
@@ -3611,6 +3619,8 @@ define <vscale x 64 x i1> @fcmp_oeq_vv_nxv64f16(<vscale x 64 x half> %va, <vscal
; ZVFHMIN-NEXT: slli t0, t0, 1
; ZVFHMIN-NEXT: mv t1, t0
; ZVFHMIN-NEXT: slli t0, t0, 2
+; ZVFHMIN-NEXT: add t1, t1, t0
+; ZVFHMIN-NEXT: slli t0, t0, 1
; ZVFHMIN-NEXT: add t0, t0, t1
; ZVFHMIN-NEXT: add t0, sp, t0
; ZVFHMIN-NEXT: addi t0, t0, 16
@@ -3618,9 +3628,8 @@ define <vscale x 64 x i1> @fcmp_oeq_vv_nxv64f16(<vscale x 64 x half> %va, <vscal
; ZVFHMIN-NEXT: vslidedown.vx v16, v8, a1
; ZVFHMIN-NEXT: vl8re16.v v8, (a0)
; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: mv t0, a0
; ZVFHMIN-NEXT: slli a0, a0, 1
-; ZVFHMIN-NEXT: add t0, t0, a0
+; ZVFHMIN-NEXT: mv t0, a0
; ZVFHMIN-NEXT: slli a0, a0, 2
; ZVFHMIN-NEXT: add a0, a0, t0
; ZVFHMIN-NEXT: add a0, sp, a0
@@ -3650,10 +3659,6 @@ define <vscale x 64 x i1> @fcmp_oeq_vv_nxv64f16(<vscale x 64 x half> %va, <vscal
; ZVFHMIN-NEXT: vl1r.v v0, (a0) # vscale x 8-byte Folded Reload
; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
; ZVFHMIN-NEXT: vmfeq.vv v5, v8, v16, v0.t
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: add a0, sp, a0
-; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vl1r.v v0, (a0) # vscale x 8-byte Folded Reload
; ZVFHMIN-NEXT: vsetvli zero, a6, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v24
; ZVFHMIN-NEXT: csrr a0, vlenb
@@ -3662,85 +3667,95 @@ define <vscale x 64 x i1> @fcmp_oeq_vv_nxv64f16(<vscale x 64 x half> %va, <vscal
; ZVFHMIN-NEXT: addi a0, a0, 16
; ZVFHMIN-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v8
+; ZVFHMIN-NEXT: csrr a0, vlenb
+; ZVFHMIN-NEXT: add a0, sp, a0
+; ZVFHMIN-NEXT: addi a0, a0, 16
+; ZVFHMIN-NEXT: vl1r.v v0, (a0) # vscale x 8-byte Folded Reload
; ZVFHMIN-NEXT: ...
[truncated]
; CHECK-NEXT: slli a1, a0, 1
; CHECK-NEXT: add a1, a1, a0
; CHECK-NEXT: add a0, a1, a0
; CHECK-NEXT: srli a1, a0, 3
This uses more registers, but is a shorter critical path.
This is a pretty big win; I'm a bit surprised we never caught this before...
@@ -296,8 +294,9 @@ define <vscale x 16 x i8> @insert_nxv16i8_nxv1i8_2(<vscale x 16 x i8> %vec, <vsc
; CHECK: # %bb.0:
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: srli a1, a0, 3
; CHECK-NEXT: slli a2, a1, 1
It looks like maybe we have a missed fold here? Or a wrap behavior difference?
I think it's a missed fold.
I think #144571 will fix it, but that has other effects so I didn't want to make the test diffs in this patch larger. I think we should commit it first and then I'll rebase this.
LGTM
LGTM. To double-check: this is needed to catch vscales generated during op legalization that won't get a chance to go through a DAG combine pass?
Correct.
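To make that rationale concrete, here is a minimal sketch; `buildScalableOffset` is a hypothetical helper standing in for legalization code, not anything from this patch. It shows the shape of node construction the fold targets: both operands are `ISD::VSCALE`, and the combined node is produced directly by `getNode` with no later DAGCombine visit needed.

```cpp
// Hypothetical helper (not code from this patch) mimicking how legalization
// builds a scalable offset as (vscale * 2) + (vscale * 1). Because the fold
// lives in getNode(), the ISD::ADD call below already returns a single
// ISD::VSCALE node with operand 3.
#include "llvm/CodeGen/SelectionDAG.h"

using namespace llvm;

static SDValue buildScalableOffset(SelectionDAG &DAG, const SDLoc &DL, EVT VT) {
  unsigned Bits = VT.getScalarSizeInBits();
  SDValue TwoVScale = DAG.getVScale(DL, VT, APInt(Bits, 2)); // vscale * 2
  SDValue OneVScale = DAG.getVScale(DL, VT, APInt(Bits, 1)); // vscale * 1
  return DAG.getNode(ISD::ADD, DL, VT, TwoVScale, OneVScale); // -> vscale * 3
}
```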
; ZVFHMIN-NEXT: vsetvli a0, zero, e8, mf2, ta, ma
; ZVFHMIN-NEXT: vslideup.vx v8, v10, a3
; ZVFHMIN-NEXT: vsetvli a0, zero, e8, m1, ta, ma
; ZVFHMIN-NEXT: vslideup.vx v8, v9, a1
This seems to be a regression, but I don't understand why. I think this patch only affects the scalar part?
Maybe just some scheduling noise.
We already have shl/mul vscale related folds in getNode.
This is an alternative to the DAGCombine proposed in #144507.
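For comparison with the existing shl/mul vscale folds the description mentions, here is a rough sketch of how the new fold composes with them; `threeTimesVScale` is a hypothetical helper for illustration only, and the exact upstream fold code may differ from this paraphrase.

```cpp
// Hypothetical helper: the pre-existing SHL fold in getNode turns
// (shl (vscale 1), 1) into (vscale 2) at node-creation time, and the new ADD
// fold then turns (add (vscale 2), (vscale 1)) into (vscale 3), so no ADD
// node survives to need a DAGCombine.
#include "llvm/CodeGen/SelectionDAG.h"

using namespace llvm;

static SDValue threeTimesVScale(SelectionDAG &DAG, const SDLoc &DL, EVT VT) {
  SDValue One = DAG.getVScale(DL, VT, APInt(VT.getScalarSizeInBits(), 1));
  SDValue Two = DAG.getNode(ISD::SHL, DL, VT, One,
                            DAG.getConstant(1, DL, VT)); // existing fold: vscale * 2
  return DAG.getNode(ISD::ADD, DL, VT, Two, One);        // new fold: vscale * 3
}
```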