Skip to content

Commit 76cc4e5

Browse files
NickGuy-Arm authored and sivan-shani committed
[AArch64][SelectionDAG] Add type legalization for partial reduce wide adds (llvm#141075)
Based on work initially done by @JamesChesterman.
1 parent a65743b commit 76cc4e5

File tree

6 files changed

+393
-183
lines changed

6 files changed

+393
-183
lines changed

llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp

Lines changed: 4 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -12739,7 +12739,10 @@ SDValue DAGCombiner::foldPartialReduceAdd(SDNode *N) {
1273912739

1274012740
SDValue UnextOp1 = Op1.getOperand(0);
1274112741
EVT UnextOp1VT = UnextOp1.getValueType();
12742-
if (!TLI.isPartialReduceMLALegalOrCustom(N->getValueType(0), UnextOp1VT))
12742+
auto *Context = DAG.getContext();
12743+
if (!TLI.isPartialReduceMLALegalOrCustom(
12744+
TLI.getTypeToTransformTo(*Context, N->getValueType(0)),
12745+
TLI.getTypeToTransformTo(*Context, UnextOp1VT)))
1274312746
return SDValue();
1274412747

1274512748
bool Op1IsSigned = Op1Opcode == ISD::SIGN_EXTEND;

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 7 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1885,6 +1885,13 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
18851885
setPartialReduceMLAAction(MVT::nxv4i32, MVT::nxv16i8, Legal);
18861886

18871887
setPartialReduceMLAAction(MVT::nxv2i64, MVT::nxv16i8, Custom);
1888+
1889+
// Wide add types
1890+
if (Subtarget->hasSVE2() || Subtarget->hasSME()) {
1891+
setPartialReduceMLAAction(MVT::nxv2i64, MVT::nxv4i32, Legal);
1892+
setPartialReduceMLAAction(MVT::nxv4i32, MVT::nxv8i16, Legal);
1893+
setPartialReduceMLAAction(MVT::nxv8i16, MVT::nxv16i8, Legal);
1894+
}
18881895
}
18891896

18901897
// Handle operations that are only available in non-streaming SVE mode.

llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td

Lines changed: 26 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -3813,6 +3813,32 @@ let Predicates = [HasSVE2_or_SME] in {
38133813
defm USUBWB_ZZZ : sve2_wide_int_arith_wide<0b110, "usubwb", int_aarch64_sve_usubwb>;
38143814
defm USUBWT_ZZZ : sve2_wide_int_arith_wide<0b111, "usubwt", int_aarch64_sve_usubwt>;
38153815

3816+
def : Pat<(nxv2i64 (partial_reduce_umla nxv2i64:$Acc, nxv4i32:$Input, (nxv4i32 (splat_vector (i32 1))))),
3817+
(UADDWT_ZZZ_D (UADDWB_ZZZ_D $Acc, $Input), $Input)>;
3818+
def : Pat<(nxv2i64 (partial_reduce_smla nxv2i64:$Acc, nxv4i32:$Input, (nxv4i32 (splat_vector (i32 1))))),
3819+
(SADDWT_ZZZ_D (SADDWB_ZZZ_D $Acc, $Input), $Input)>;
3820+
def : Pat<(nxv4i32 (partial_reduce_umla nxv4i32:$Acc, nxv8i16:$Input, (nxv8i16 (splat_vector (i32 1))))),
3821+
(UADDWT_ZZZ_S (UADDWB_ZZZ_S $Acc, $Input), $Input)>;
3822+
def : Pat<(nxv4i32 (partial_reduce_smla nxv4i32:$Acc, nxv8i16:$Input, (nxv8i16 (splat_vector (i32 1))))),
3823+
(SADDWT_ZZZ_S (SADDWB_ZZZ_S $Acc, $Input), $Input)>;
3824+
def : Pat<(nxv8i16 (partial_reduce_umla nxv8i16:$Acc, nxv16i8:$Input, (nxv16i8 (splat_vector (i32 1))))),
3825+
(UADDWT_ZZZ_H (UADDWB_ZZZ_H $Acc, $Input), $Input)>;
3826+
def : Pat<(nxv8i16 (partial_reduce_smla nxv8i16:$Acc, nxv16i8:$Input, (nxv16i8 (splat_vector (i32 1))))),
3827+
(SADDWT_ZZZ_H (SADDWB_ZZZ_H $Acc, $Input), $Input)>;
3828+
3829+
def : Pat<(nxv2i64 (partial_reduce_umla nxv2i64:$Acc, nxv4i32:$LHS, nxv4i32:$RHS)),
3830+
(UMLALT_ZZZ_D (UMLALB_ZZZ_D $Acc, $LHS, $RHS), $LHS, $RHS)>;
3831+
def : Pat<(nxv2i64 (partial_reduce_smla nxv2i64:$Acc, nxv4i32:$LHS, nxv4i32:$RHS)),
3832+
(SMLALT_ZZZ_D (SMLALB_ZZZ_D $Acc, $LHS, $RHS), $LHS, $RHS)>;
3833+
def : Pat<(nxv4i32 (partial_reduce_umla nxv4i32:$Acc, nxv8i16:$LHS, nxv8i16:$RHS)),
3834+
(UMLALT_ZZZ_S (UMLALB_ZZZ_S $Acc, $LHS, $RHS), $LHS, $RHS)>;
3835+
def : Pat<(nxv4i32 (partial_reduce_smla nxv4i32:$Acc, nxv8i16:$LHS, nxv8i16:$RHS)),
3836+
(SMLALT_ZZZ_S (SMLALB_ZZZ_S $Acc, $LHS, $RHS), $LHS, $RHS)>;
3837+
def : Pat<(nxv8i16 (partial_reduce_umla nxv8i16:$Acc, nxv16i8:$LHS, nxv16i8:$RHS)),
3838+
(UMLALT_ZZZ_H (UMLALB_ZZZ_H $Acc, $LHS, $RHS), $LHS, $RHS)>;
3839+
def : Pat<(nxv8i16 (partial_reduce_smla nxv8i16:$Acc, nxv16i8:$LHS, nxv16i8:$RHS)),
3840+
(SMLALT_ZZZ_H (SMLALB_ZZZ_H $Acc, $LHS, $RHS), $LHS, $RHS)>;
3841+
38163842
// SVE2 integer multiply long
38173843
defm SQDMULLB_ZZZ : sve2_wide_int_arith_long<0b11000, "sqdmullb", int_aarch64_sve_sqdmullb>;
38183844
defm SQDMULLT_ZZZ : sve2_wide_int_arith_long<0b11001, "sqdmullt", int_aarch64_sve_sqdmullt>;

llvm/test/CodeGen/AArch64/neon-partial-reduce-dot-product.ll

Lines changed: 6 additions & 24 deletions
Original file line number | Diff line number | Diff line change
@@ -917,20 +917,11 @@ define <4 x i64> @udot_no_bin_op_8to64(<4 x i64> %acc, <16 x i8> %a){
917917
;
918918
; CHECK-NEWLOWERING-I8MM-LABEL: udot_no_bin_op_8to64:
919919
; CHECK-NEWLOWERING-I8MM: // %bb.0:
920-
; CHECK-NEWLOWERING-I8MM-NEXT: ushll v3.8h, v2.8b, #0
921-
; CHECK-NEWLOWERING-I8MM-NEXT: ushll2 v2.8h, v2.16b, #0
922-
; CHECK-NEWLOWERING-I8MM-NEXT: ushll v4.4s, v3.4h, #0
923-
; CHECK-NEWLOWERING-I8MM-NEXT: ushll v5.4s, v2.4h, #0
924-
; CHECK-NEWLOWERING-I8MM-NEXT: ushll2 v3.4s, v3.8h, #0
925-
; CHECK-NEWLOWERING-I8MM-NEXT: ushll2 v2.4s, v2.8h, #0
926-
; CHECK-NEWLOWERING-I8MM-NEXT: uaddw v1.2d, v1.2d, v5.2s
920+
; CHECK-NEWLOWERING-I8MM-NEXT: movi v3.16b, #1
921+
; CHECK-NEWLOWERING-I8MM-NEXT: movi v4.2d, #0000000000000000
922+
; CHECK-NEWLOWERING-I8MM-NEXT: udot v4.4s, v2.16b, v3.16b
927923
; CHECK-NEWLOWERING-I8MM-NEXT: uaddw v0.2d, v0.2d, v4.2s
928-
; CHECK-NEWLOWERING-I8MM-NEXT: uaddw2 v1.2d, v1.2d, v5.4s
929924
; CHECK-NEWLOWERING-I8MM-NEXT: uaddw2 v0.2d, v0.2d, v4.4s
930-
; CHECK-NEWLOWERING-I8MM-NEXT: uaddw v1.2d, v1.2d, v2.2s
931-
; CHECK-NEWLOWERING-I8MM-NEXT: uaddw v0.2d, v0.2d, v3.2s
932-
; CHECK-NEWLOWERING-I8MM-NEXT: uaddw2 v1.2d, v1.2d, v2.4s
933-
; CHECK-NEWLOWERING-I8MM-NEXT: uaddw2 v0.2d, v0.2d, v3.4s
934925
; CHECK-NEWLOWERING-I8MM-NEXT: ret
935926
%a.wide = zext <16 x i8> %a to <16 x i64>
936927
%partial.reduce = tail call <4 x i64> @llvm.experimental.vector.partial.reduce.add.v4i64.v16i64(<4 x i64> %acc, <16 x i64> %a.wide)
@@ -967,20 +958,11 @@ define <4 x i64> @sdot_no_bin_op_8to64(<4 x i64> %acc, <16 x i8> %a){
967958
;
968959
; CHECK-NEWLOWERING-I8MM-LABEL: sdot_no_bin_op_8to64:
969960
; CHECK-NEWLOWERING-I8MM: // %bb.0:
970-
; CHECK-NEWLOWERING-I8MM-NEXT: sshll v3.8h, v2.8b, #0
971-
; CHECK-NEWLOWERING-I8MM-NEXT: sshll2 v2.8h, v2.16b, #0
972-
; CHECK-NEWLOWERING-I8MM-NEXT: sshll v4.4s, v3.4h, #0
973-
; CHECK-NEWLOWERING-I8MM-NEXT: sshll v5.4s, v2.4h, #0
974-
; CHECK-NEWLOWERING-I8MM-NEXT: sshll2 v3.4s, v3.8h, #0
975-
; CHECK-NEWLOWERING-I8MM-NEXT: sshll2 v2.4s, v2.8h, #0
976-
; CHECK-NEWLOWERING-I8MM-NEXT: saddw v1.2d, v1.2d, v5.2s
961+
; CHECK-NEWLOWERING-I8MM-NEXT: movi v3.16b, #1
962+
; CHECK-NEWLOWERING-I8MM-NEXT: movi v4.2d, #0000000000000000
963+
; CHECK-NEWLOWERING-I8MM-NEXT: sdot v4.4s, v2.16b, v3.16b
977964
; CHECK-NEWLOWERING-I8MM-NEXT: saddw v0.2d, v0.2d, v4.2s
978-
; CHECK-NEWLOWERING-I8MM-NEXT: saddw2 v1.2d, v1.2d, v5.4s
979965
; CHECK-NEWLOWERING-I8MM-NEXT: saddw2 v0.2d, v0.2d, v4.4s
980-
; CHECK-NEWLOWERING-I8MM-NEXT: saddw v1.2d, v1.2d, v2.2s
981-
; CHECK-NEWLOWERING-I8MM-NEXT: saddw v0.2d, v0.2d, v3.2s
982-
; CHECK-NEWLOWERING-I8MM-NEXT: saddw2 v1.2d, v1.2d, v2.4s
983-
; CHECK-NEWLOWERING-I8MM-NEXT: saddw2 v0.2d, v0.2d, v3.4s
984966
; CHECK-NEWLOWERING-I8MM-NEXT: ret
985967
%a.wide = sext <16 x i8> %a to <16 x i64>
986968
%partial.reduce = tail call <4 x i64> @llvm.experimental.vector.partial.reduce.add.v4i64.v16i64(<4 x i64> %acc, <16 x i64> %a.wide)

llvm/test/CodeGen/AArch64/sve-partial-reduce-dot-product.ll

Lines changed: 112 additions & 74 deletions
Original file line number | Diff line number | Diff line change
@@ -561,31 +561,34 @@ define <vscale x 4 x i64> @udot_no_bin_op_8to64(<vscale x 4 x i64> %acc, <vscale
561561
; CHECK-NEXT: add z1.d, z1.d, z3.d
562562
; CHECK-NEXT: ret
563563
;
564-
; CHECK-NEWLOWERING-LABEL: udot_no_bin_op_8to64:
565-
; CHECK-NEWLOWERING: // %bb.0:
566-
; CHECK-NEWLOWERING-NEXT: uunpkhi z3.h, z2.b
567-
; CHECK-NEWLOWERING-NEXT: uunpklo z2.h, z2.b
568-
; CHECK-NEWLOWERING-NEXT: uunpkhi z4.s, z3.h
569-
; CHECK-NEWLOWERING-NEXT: uunpklo z3.s, z3.h
570-
; CHECK-NEWLOWERING-NEXT: uunpklo z5.s, z2.h
571-
; CHECK-NEWLOWERING-NEXT: uunpkhi z2.s, z2.h
572-
; CHECK-NEWLOWERING-NEXT: uunpklo z6.d, z4.s
573-
; CHECK-NEWLOWERING-NEXT: uunpkhi z7.d, z3.s
574-
; CHECK-NEWLOWERING-NEXT: uunpklo z24.d, z5.s
575-
; CHECK-NEWLOWERING-NEXT: uunpklo z3.d, z3.s
576-
; CHECK-NEWLOWERING-NEXT: uunpklo z25.d, z2.s
577-
; CHECK-NEWLOWERING-NEXT: uunpkhi z5.d, z5.s
578-
; CHECK-NEWLOWERING-NEXT: uunpkhi z4.d, z4.s
579-
; CHECK-NEWLOWERING-NEXT: uunpkhi z2.d, z2.s
580-
; CHECK-NEWLOWERING-NEXT: add z0.d, z0.d, z24.d
581-
; CHECK-NEWLOWERING-NEXT: add z5.d, z5.d, z25.d
582-
; CHECK-NEWLOWERING-NEXT: add z1.d, z1.d, z3.d
583-
; CHECK-NEWLOWERING-NEXT: add z3.d, z7.d, z6.d
584-
; CHECK-NEWLOWERING-NEXT: add z0.d, z0.d, z5.d
585-
; CHECK-NEWLOWERING-NEXT: add z1.d, z1.d, z3.d
586-
; CHECK-NEWLOWERING-NEXT: add z0.d, z0.d, z2.d
587-
; CHECK-NEWLOWERING-NEXT: add z1.d, z1.d, z4.d
588-
; CHECK-NEWLOWERING-NEXT: ret
564+
; CHECK-NEWLOWERING-SVE-LABEL: udot_no_bin_op_8to64:
565+
; CHECK-NEWLOWERING-SVE: // %bb.0:
566+
; CHECK-NEWLOWERING-SVE-NEXT: movi v3.2d, #0000000000000000
567+
; CHECK-NEWLOWERING-SVE-NEXT: mov z4.b, #1 // =0x1
568+
; CHECK-NEWLOWERING-SVE-NEXT: udot z3.s, z2.b, z4.b
569+
; CHECK-NEWLOWERING-SVE-NEXT: uunpklo z2.d, z3.s
570+
; CHECK-NEWLOWERING-SVE-NEXT: uunpkhi z3.d, z3.s
571+
; CHECK-NEWLOWERING-SVE-NEXT: add z0.d, z0.d, z2.d
572+
; CHECK-NEWLOWERING-SVE-NEXT: add z0.d, z0.d, z3.d
573+
; CHECK-NEWLOWERING-SVE-NEXT: ret
574+
;
575+
; CHECK-NEWLOWERING-SVE2-LABEL: udot_no_bin_op_8to64:
576+
; CHECK-NEWLOWERING-SVE2: // %bb.0:
577+
; CHECK-NEWLOWERING-SVE2-NEXT: movi v3.2d, #0000000000000000
578+
; CHECK-NEWLOWERING-SVE2-NEXT: mov z4.b, #1 // =0x1
579+
; CHECK-NEWLOWERING-SVE2-NEXT: udot z3.s, z2.b, z4.b
580+
; CHECK-NEWLOWERING-SVE2-NEXT: uaddwb z0.d, z0.d, z3.s
581+
; CHECK-NEWLOWERING-SVE2-NEXT: uaddwt z0.d, z0.d, z3.s
582+
; CHECK-NEWLOWERING-SVE2-NEXT: ret
583+
;
584+
; CHECK-NEWLOWERING-SME-LABEL: udot_no_bin_op_8to64:
585+
; CHECK-NEWLOWERING-SME: // %bb.0:
586+
; CHECK-NEWLOWERING-SME-NEXT: mov z3.b, #1 // =0x1
587+
; CHECK-NEWLOWERING-SME-NEXT: mov z4.s, #0 // =0x0
588+
; CHECK-NEWLOWERING-SME-NEXT: udot z4.s, z2.b, z3.b
589+
; CHECK-NEWLOWERING-SME-NEXT: uaddwb z0.d, z0.d, z4.s
590+
; CHECK-NEWLOWERING-SME-NEXT: uaddwt z0.d, z0.d, z4.s
591+
; CHECK-NEWLOWERING-SME-NEXT: ret
589592
%a.ext = zext <vscale x 16 x i8> %a to <vscale x 16 x i64>
590593
%partial.reduce = tail call <vscale x 4 x i64> @llvm.experimental.vector.partial.reduce.add.nxv4i64.nxv16i64(<vscale x 4 x i64> %acc, <vscale x 16 x i64> %a.ext)
591594
ret <vscale x 4 x i64> %partial.reduce
@@ -603,31 +606,34 @@ define <vscale x 4 x i64> @sdot_no_bin_op_8to64(<vscale x 4 x i64> %acc, <vscale
603606
; CHECK-NEXT: add z1.d, z1.d, z3.d
604607
; CHECK-NEXT: ret
605608
;
606-
; CHECK-NEWLOWERING-LABEL: sdot_no_bin_op_8to64:
607-
; CHECK-NEWLOWERING: // %bb.0:
608-
; CHECK-NEWLOWERING-NEXT: sunpkhi z3.h, z2.b
609-
; CHECK-NEWLOWERING-NEXT: sunpklo z2.h, z2.b
610-
; CHECK-NEWLOWERING-NEXT: sunpkhi z4.s, z3.h
611-
; CHECK-NEWLOWERING-NEXT: sunpklo z3.s, z3.h
612-
; CHECK-NEWLOWERING-NEXT: sunpklo z5.s, z2.h
613-
; CHECK-NEWLOWERING-NEXT: sunpkhi z2.s, z2.h
614-
; CHECK-NEWLOWERING-NEXT: sunpklo z6.d, z4.s
615-
; CHECK-NEWLOWERING-NEXT: sunpkhi z7.d, z3.s
616-
; CHECK-NEWLOWERING-NEXT: sunpklo z24.d, z5.s
617-
; CHECK-NEWLOWERING-NEXT: sunpklo z3.d, z3.s
618-
; CHECK-NEWLOWERING-NEXT: sunpklo z25.d, z2.s
619-
; CHECK-NEWLOWERING-NEXT: sunpkhi z5.d, z5.s
620-
; CHECK-NEWLOWERING-NEXT: sunpkhi z4.d, z4.s
621-
; CHECK-NEWLOWERING-NEXT: sunpkhi z2.d, z2.s
622-
; CHECK-NEWLOWERING-NEXT: add z0.d, z0.d, z24.d
623-
; CHECK-NEWLOWERING-NEXT: add z5.d, z5.d, z25.d
624-
; CHECK-NEWLOWERING-NEXT: add z1.d, z1.d, z3.d
625-
; CHECK-NEWLOWERING-NEXT: add z3.d, z7.d, z6.d
626-
; CHECK-NEWLOWERING-NEXT: add z0.d, z0.d, z5.d
627-
; CHECK-NEWLOWERING-NEXT: add z1.d, z1.d, z3.d
628-
; CHECK-NEWLOWERING-NEXT: add z0.d, z0.d, z2.d
629-
; CHECK-NEWLOWERING-NEXT: add z1.d, z1.d, z4.d
630-
; CHECK-NEWLOWERING-NEXT: ret
609+
; CHECK-NEWLOWERING-SVE-LABEL: sdot_no_bin_op_8to64:
610+
; CHECK-NEWLOWERING-SVE: // %bb.0:
611+
; CHECK-NEWLOWERING-SVE-NEXT: movi v3.2d, #0000000000000000
612+
; CHECK-NEWLOWERING-SVE-NEXT: mov z4.b, #1 // =0x1
613+
; CHECK-NEWLOWERING-SVE-NEXT: sdot z3.s, z2.b, z4.b
614+
; CHECK-NEWLOWERING-SVE-NEXT: sunpklo z2.d, z3.s
615+
; CHECK-NEWLOWERING-SVE-NEXT: sunpkhi z3.d, z3.s
616+
; CHECK-NEWLOWERING-SVE-NEXT: add z0.d, z0.d, z2.d
617+
; CHECK-NEWLOWERING-SVE-NEXT: add z0.d, z0.d, z3.d
618+
; CHECK-NEWLOWERING-SVE-NEXT: ret
619+
;
620+
; CHECK-NEWLOWERING-SVE2-LABEL: sdot_no_bin_op_8to64:
621+
; CHECK-NEWLOWERING-SVE2: // %bb.0:
622+
; CHECK-NEWLOWERING-SVE2-NEXT: movi v3.2d, #0000000000000000
623+
; CHECK-NEWLOWERING-SVE2-NEXT: mov z4.b, #1 // =0x1
624+
; CHECK-NEWLOWERING-SVE2-NEXT: sdot z3.s, z2.b, z4.b
625+
; CHECK-NEWLOWERING-SVE2-NEXT: saddwb z0.d, z0.d, z3.s
626+
; CHECK-NEWLOWERING-SVE2-NEXT: saddwt z0.d, z0.d, z3.s
627+
; CHECK-NEWLOWERING-SVE2-NEXT: ret
628+
;
629+
; CHECK-NEWLOWERING-SME-LABEL: sdot_no_bin_op_8to64:
630+
; CHECK-NEWLOWERING-SME: // %bb.0:
631+
; CHECK-NEWLOWERING-SME-NEXT: mov z3.b, #1 // =0x1
632+
; CHECK-NEWLOWERING-SME-NEXT: mov z4.s, #0 // =0x0
633+
; CHECK-NEWLOWERING-SME-NEXT: sdot z4.s, z2.b, z3.b
634+
; CHECK-NEWLOWERING-SME-NEXT: saddwb z0.d, z0.d, z4.s
635+
; CHECK-NEWLOWERING-SME-NEXT: saddwt z0.d, z0.d, z4.s
636+
; CHECK-NEWLOWERING-SME-NEXT: ret
631637
%a.ext = sext <vscale x 16 x i8> %a to <vscale x 16 x i64>
632638
%partial.reduce = tail call <vscale x 4 x i64> @llvm.experimental.vector.partial.reduce.add.nxv4i64.nxv16i64(<vscale x 4 x i64> %acc, <vscale x 16 x i64> %a.ext)
633639
ret <vscale x 4 x i64> %partial.reduce
@@ -647,18 +653,34 @@ define <vscale x 4 x i32> @not_udot(<vscale x 4 x i32> %acc, <vscale x 8 x i8> %
647653
; CHECK-NEXT: mla z0.s, p0/m, z1.s, z2.s
648654
; CHECK-NEXT: ret
649655
;
650-
; CHECK-NEWLOWERING-LABEL: not_udot:
651-
; CHECK-NEWLOWERING: // %bb.0: // %entry
652-
; CHECK-NEWLOWERING-NEXT: and z1.h, z1.h, #0xff
653-
; CHECK-NEWLOWERING-NEXT: and z2.h, z2.h, #0xff
654-
; CHECK-NEWLOWERING-NEXT: ptrue p0.s
655-
; CHECK-NEWLOWERING-NEXT: uunpklo z3.s, z1.h
656-
; CHECK-NEWLOWERING-NEXT: uunpklo z4.s, z2.h
657-
; CHECK-NEWLOWERING-NEXT: uunpkhi z1.s, z1.h
658-
; CHECK-NEWLOWERING-NEXT: uunpkhi z2.s, z2.h
659-
; CHECK-NEWLOWERING-NEXT: mla z0.s, p0/m, z3.s, z4.s
660-
; CHECK-NEWLOWERING-NEXT: mla z0.s, p0/m, z1.s, z2.s
661-
; CHECK-NEWLOWERING-NEXT: ret
656+
; CHECK-NEWLOWERING-SVE-LABEL: not_udot:
657+
; CHECK-NEWLOWERING-SVE: // %bb.0: // %entry
658+
; CHECK-NEWLOWERING-SVE-NEXT: and z1.h, z1.h, #0xff
659+
; CHECK-NEWLOWERING-SVE-NEXT: and z2.h, z2.h, #0xff
660+
; CHECK-NEWLOWERING-SVE-NEXT: ptrue p0.s
661+
; CHECK-NEWLOWERING-SVE-NEXT: uunpklo z3.s, z1.h
662+
; CHECK-NEWLOWERING-SVE-NEXT: uunpklo z4.s, z2.h
663+
; CHECK-NEWLOWERING-SVE-NEXT: uunpkhi z1.s, z1.h
664+
; CHECK-NEWLOWERING-SVE-NEXT: uunpkhi z2.s, z2.h
665+
; CHECK-NEWLOWERING-SVE-NEXT: mla z0.s, p0/m, z3.s, z4.s
666+
; CHECK-NEWLOWERING-SVE-NEXT: mla z0.s, p0/m, z1.s, z2.s
667+
; CHECK-NEWLOWERING-SVE-NEXT: ret
668+
;
669+
; CHECK-NEWLOWERING-SVE2-LABEL: not_udot:
670+
; CHECK-NEWLOWERING-SVE2: // %bb.0: // %entry
671+
; CHECK-NEWLOWERING-SVE2-NEXT: and z2.h, z2.h, #0xff
672+
; CHECK-NEWLOWERING-SVE2-NEXT: and z1.h, z1.h, #0xff
673+
; CHECK-NEWLOWERING-SVE2-NEXT: umlalb z0.s, z1.h, z2.h
674+
; CHECK-NEWLOWERING-SVE2-NEXT: umlalt z0.s, z1.h, z2.h
675+
; CHECK-NEWLOWERING-SVE2-NEXT: ret
676+
;
677+
; CHECK-NEWLOWERING-SME-LABEL: not_udot:
678+
; CHECK-NEWLOWERING-SME: // %bb.0: // %entry
679+
; CHECK-NEWLOWERING-SME-NEXT: and z2.h, z2.h, #0xff
680+
; CHECK-NEWLOWERING-SME-NEXT: and z1.h, z1.h, #0xff
681+
; CHECK-NEWLOWERING-SME-NEXT: umlalb z0.s, z1.h, z2.h
682+
; CHECK-NEWLOWERING-SME-NEXT: umlalt z0.s, z1.h, z2.h
683+
; CHECK-NEWLOWERING-SME-NEXT: ret
662684
entry:
663685
%a.wide = zext <vscale x 8 x i8> %a to <vscale x 8 x i32>
664686
%b.wide = zext <vscale x 8 x i8> %b to <vscale x 8 x i32>
@@ -681,18 +703,34 @@ define <vscale x 2 x i64> @not_udot_wide(<vscale x 2 x i64> %acc, <vscale x 4 x
681703
; CHECK-NEXT: mla z0.d, p0/m, z1.d, z2.d
682704
; CHECK-NEXT: ret
683705
;
684-
; CHECK-NEWLOWERING-LABEL: not_udot_wide:
685-
; CHECK-NEWLOWERING: // %bb.0: // %entry
686-
; CHECK-NEWLOWERING-NEXT: and z1.s, z1.s, #0xffff
687-
; CHECK-NEWLOWERING-NEXT: and z2.s, z2.s, #0xffff
688-
; CHECK-NEWLOWERING-NEXT: ptrue p0.d
689-
; CHECK-NEWLOWERING-NEXT: uunpklo z3.d, z1.s
690-
; CHECK-NEWLOWERING-NEXT: uunpklo z4.d, z2.s
691-
; CHECK-NEWLOWERING-NEXT: uunpkhi z1.d, z1.s
692-
; CHECK-NEWLOWERING-NEXT: uunpkhi z2.d, z2.s
693-
; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z3.d, z4.d
694-
; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z1.d, z2.d
695-
; CHECK-NEWLOWERING-NEXT: ret
706+
; CHECK-NEWLOWERING-SVE-LABEL: not_udot_wide:
707+
; CHECK-NEWLOWERING-SVE: // %bb.0: // %entry
708+
; CHECK-NEWLOWERING-SVE-NEXT: and z1.s, z1.s, #0xffff
709+
; CHECK-NEWLOWERING-SVE-NEXT: and z2.s, z2.s, #0xffff
710+
; CHECK-NEWLOWERING-SVE-NEXT: ptrue p0.d
711+
; CHECK-NEWLOWERING-SVE-NEXT: uunpklo z3.d, z1.s
712+
; CHECK-NEWLOWERING-SVE-NEXT: uunpklo z4.d, z2.s
713+
; CHECK-NEWLOWERING-SVE-NEXT: uunpkhi z1.d, z1.s
714+
; CHECK-NEWLOWERING-SVE-NEXT: uunpkhi z2.d, z2.s
715+
; CHECK-NEWLOWERING-SVE-NEXT: mla z0.d, p0/m, z3.d, z4.d
716+
; CHECK-NEWLOWERING-SVE-NEXT: mla z0.d, p0/m, z1.d, z2.d
717+
; CHECK-NEWLOWERING-SVE-NEXT: ret
718+
;
719+
; CHECK-NEWLOWERING-SVE2-LABEL: not_udot_wide:
720+
; CHECK-NEWLOWERING-SVE2: // %bb.0: // %entry
721+
; CHECK-NEWLOWERING-SVE2-NEXT: and z2.s, z2.s, #0xffff
722+
; CHECK-NEWLOWERING-SVE2-NEXT: and z1.s, z1.s, #0xffff
723+
; CHECK-NEWLOWERING-SVE2-NEXT: umlalb z0.d, z1.s, z2.s
724+
; CHECK-NEWLOWERING-SVE2-NEXT: umlalt z0.d, z1.s, z2.s
725+
; CHECK-NEWLOWERING-SVE2-NEXT: ret
726+
;
727+
; CHECK-NEWLOWERING-SME-LABEL: not_udot_wide:
728+
; CHECK-NEWLOWERING-SME: // %bb.0: // %entry
729+
; CHECK-NEWLOWERING-SME-NEXT: and z2.s, z2.s, #0xffff
730+
; CHECK-NEWLOWERING-SME-NEXT: and z1.s, z1.s, #0xffff
731+
; CHECK-NEWLOWERING-SME-NEXT: umlalb z0.d, z1.s, z2.s
732+
; CHECK-NEWLOWERING-SME-NEXT: umlalt z0.d, z1.s, z2.s
733+
; CHECK-NEWLOWERING-SME-NEXT: ret
696734
entry:
697735
%a.wide = zext <vscale x 4 x i16> %a to <vscale x 4 x i64>
698736
%b.wide = zext <vscale x 4 x i16> %b to <vscale x 4 x i64>

0 commit comments

Comments
 (0)