Skip to content

[AArch64][SelectionDAG] Add type legalization for partial reduce wide adds #141075

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -12735,7 +12735,10 @@ SDValue DAGCombiner::foldPartialReduceAdd(SDNode *N) {

SDValue UnextOp1 = Op1.getOperand(0);
EVT UnextOp1VT = UnextOp1.getValueType();
if (!TLI.isPartialReduceMLALegalOrCustom(N->getValueType(0), UnextOp1VT))
auto *Context = DAG.getContext();
if (!TLI.isPartialReduceMLALegalOrCustom(
TLI.getTypeToTransformTo(*Context, N->getValueType(0)),
TLI.getTypeToTransformTo(*Context, UnextOp1VT)))
return SDValue();

bool Op1IsSigned = Op1Opcode == ISD::SIGN_EXTEND;
Expand Down
7 changes: 7 additions & 0 deletions llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1885,6 +1885,13 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setPartialReduceMLAAction(MVT::nxv4i32, MVT::nxv16i8, Legal);

setPartialReduceMLAAction(MVT::nxv2i64, MVT::nxv16i8, Custom);

// Wide add types
if (Subtarget->hasSVE2() || Subtarget->hasSME()) {
setPartialReduceMLAAction(MVT::nxv2i64, MVT::nxv4i32, Legal);
setPartialReduceMLAAction(MVT::nxv4i32, MVT::nxv8i16, Legal);
setPartialReduceMLAAction(MVT::nxv8i16, MVT::nxv16i8, Legal);
}
}

// Handle operations that are only available in non-streaming SVE mode.
Expand Down
26 changes: 26 additions & 0 deletions llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
Original file line number Diff line number Diff line change
Expand Up @@ -3813,6 +3813,32 @@ let Predicates = [HasSVE2_or_SME] in {
defm USUBWB_ZZZ : sve2_wide_int_arith_wide<0b110, "usubwb", int_aarch64_sve_usubwb>;
defm USUBWT_ZZZ : sve2_wide_int_arith_wide<0b111, "usubwt", int_aarch64_sve_usubwt>;

// Partial reductions whose multiplier is a splat of 1 are plain widening
// partial adds. Lower each to a pair of SVE2 wide adds: *ADDWB folds in the
// bottom (even-numbered) elements of $Input and *ADDWT the top (odd-numbered)
// elements, so together every input element is accumulated into the
// twice-as-wide lanes of $Acc.
def : Pat<(nxv2i64 (partial_reduce_umla nxv2i64:$Acc, nxv4i32:$Input, (nxv4i32 (splat_vector (i32 1))))),
(UADDWT_ZZZ_D (UADDWB_ZZZ_D $Acc, $Input), $Input)>;
def : Pat<(nxv2i64 (partial_reduce_smla nxv2i64:$Acc, nxv4i32:$Input, (nxv4i32 (splat_vector (i32 1))))),
(SADDWT_ZZZ_D (SADDWB_ZZZ_D $Acc, $Input), $Input)>;
def : Pat<(nxv4i32 (partial_reduce_umla nxv4i32:$Acc, nxv8i16:$Input, (nxv8i16 (splat_vector (i32 1))))),
(UADDWT_ZZZ_S (UADDWB_ZZZ_S $Acc, $Input), $Input)>;
def : Pat<(nxv4i32 (partial_reduce_smla nxv4i32:$Acc, nxv8i16:$Input, (nxv8i16 (splat_vector (i32 1))))),
(SADDWT_ZZZ_S (SADDWB_ZZZ_S $Acc, $Input), $Input)>;
def : Pat<(nxv8i16 (partial_reduce_umla nxv8i16:$Acc, nxv16i8:$Input, (nxv16i8 (splat_vector (i32 1))))),
(UADDWT_ZZZ_H (UADDWB_ZZZ_H $Acc, $Input), $Input)>;
def : Pat<(nxv8i16 (partial_reduce_smla nxv8i16:$Acc, nxv16i8:$Input, (nxv16i8 (splat_vector (i32 1))))),
(SADDWT_ZZZ_H (SADDWB_ZZZ_H $Acc, $Input), $Input)>;

// General partial reduce multiply-accumulate with two vector operands.
// Lower to a pair of SVE2 widening multiply-add-long instructions: *MLALB
// multiplies and accumulates the bottom (even) element pairs of $LHS/$RHS
// and *MLALT the top (odd) pairs, chaining through $Acc so the result is the
// full partial reduction into the wider accumulator lanes.
def : Pat<(nxv2i64 (partial_reduce_umla nxv2i64:$Acc, nxv4i32:$LHS, nxv4i32:$RHS)),
(UMLALT_ZZZ_D (UMLALB_ZZZ_D $Acc, $LHS, $RHS), $LHS, $RHS)>;
def : Pat<(nxv2i64 (partial_reduce_smla nxv2i64:$Acc, nxv4i32:$LHS, nxv4i32:$RHS)),
(SMLALT_ZZZ_D (SMLALB_ZZZ_D $Acc, $LHS, $RHS), $LHS, $RHS)>;
def : Pat<(nxv4i32 (partial_reduce_umla nxv4i32:$Acc, nxv8i16:$LHS, nxv8i16:$RHS)),
(UMLALT_ZZZ_S (UMLALB_ZZZ_S $Acc, $LHS, $RHS), $LHS, $RHS)>;
def : Pat<(nxv4i32 (partial_reduce_smla nxv4i32:$Acc, nxv8i16:$LHS, nxv8i16:$RHS)),
(SMLALT_ZZZ_S (SMLALB_ZZZ_S $Acc, $LHS, $RHS), $LHS, $RHS)>;
def : Pat<(nxv8i16 (partial_reduce_umla nxv8i16:$Acc, nxv16i8:$LHS, nxv16i8:$RHS)),
(UMLALT_ZZZ_H (UMLALB_ZZZ_H $Acc, $LHS, $RHS), $LHS, $RHS)>;
def : Pat<(nxv8i16 (partial_reduce_smla nxv8i16:$Acc, nxv16i8:$LHS, nxv16i8:$RHS)),
(SMLALT_ZZZ_H (SMLALB_ZZZ_H $Acc, $LHS, $RHS), $LHS, $RHS)>;

// SVE2 integer multiply long
defm SQDMULLB_ZZZ : sve2_wide_int_arith_long<0b11000, "sqdmullb", int_aarch64_sve_sqdmullb>;
defm SQDMULLT_ZZZ : sve2_wide_int_arith_long<0b11001, "sqdmullt", int_aarch64_sve_sqdmullt>;
Expand Down
30 changes: 6 additions & 24 deletions llvm/test/CodeGen/AArch64/neon-partial-reduce-dot-product.ll
Original file line number Diff line number Diff line change
Expand Up @@ -917,20 +917,11 @@ define <4 x i64> @udot_no_bin_op_8to64(<4 x i64> %acc, <16 x i8> %a){
;
; CHECK-NEWLOWERING-I8MM-LABEL: udot_no_bin_op_8to64:
; CHECK-NEWLOWERING-I8MM: // %bb.0:
; CHECK-NEWLOWERING-I8MM-NEXT: ushll v3.8h, v2.8b, #0
; CHECK-NEWLOWERING-I8MM-NEXT: ushll2 v2.8h, v2.16b, #0
; CHECK-NEWLOWERING-I8MM-NEXT: ushll v4.4s, v3.4h, #0
; CHECK-NEWLOWERING-I8MM-NEXT: ushll v5.4s, v2.4h, #0
; CHECK-NEWLOWERING-I8MM-NEXT: ushll2 v3.4s, v3.8h, #0
; CHECK-NEWLOWERING-I8MM-NEXT: ushll2 v2.4s, v2.8h, #0
; CHECK-NEWLOWERING-I8MM-NEXT: uaddw v1.2d, v1.2d, v5.2s
; CHECK-NEWLOWERING-I8MM-NEXT: movi v3.16b, #1
; CHECK-NEWLOWERING-I8MM-NEXT: movi v4.2d, #0000000000000000
; CHECK-NEWLOWERING-I8MM-NEXT: udot v4.4s, v2.16b, v3.16b
; CHECK-NEWLOWERING-I8MM-NEXT: uaddw v0.2d, v0.2d, v4.2s
; CHECK-NEWLOWERING-I8MM-NEXT: uaddw2 v1.2d, v1.2d, v5.4s
; CHECK-NEWLOWERING-I8MM-NEXT: uaddw2 v0.2d, v0.2d, v4.4s
; CHECK-NEWLOWERING-I8MM-NEXT: uaddw v1.2d, v1.2d, v2.2s
; CHECK-NEWLOWERING-I8MM-NEXT: uaddw v0.2d, v0.2d, v3.2s
; CHECK-NEWLOWERING-I8MM-NEXT: uaddw2 v1.2d, v1.2d, v2.4s
; CHECK-NEWLOWERING-I8MM-NEXT: uaddw2 v0.2d, v0.2d, v3.4s
; CHECK-NEWLOWERING-I8MM-NEXT: ret
%a.wide = zext <16 x i8> %a to <16 x i64>
%partial.reduce = tail call <4 x i64> @llvm.experimental.vector.partial.reduce.add.v4i64.v16i64(<4 x i64> %acc, <16 x i64> %a.wide)
Expand Down Expand Up @@ -967,20 +958,11 @@ define <4 x i64> @sdot_no_bin_op_8to64(<4 x i64> %acc, <16 x i8> %a){
;
; CHECK-NEWLOWERING-I8MM-LABEL: sdot_no_bin_op_8to64:
; CHECK-NEWLOWERING-I8MM: // %bb.0:
; CHECK-NEWLOWERING-I8MM-NEXT: sshll v3.8h, v2.8b, #0
; CHECK-NEWLOWERING-I8MM-NEXT: sshll2 v2.8h, v2.16b, #0
; CHECK-NEWLOWERING-I8MM-NEXT: sshll v4.4s, v3.4h, #0
; CHECK-NEWLOWERING-I8MM-NEXT: sshll v5.4s, v2.4h, #0
; CHECK-NEWLOWERING-I8MM-NEXT: sshll2 v3.4s, v3.8h, #0
; CHECK-NEWLOWERING-I8MM-NEXT: sshll2 v2.4s, v2.8h, #0
; CHECK-NEWLOWERING-I8MM-NEXT: saddw v1.2d, v1.2d, v5.2s
; CHECK-NEWLOWERING-I8MM-NEXT: movi v3.16b, #1
; CHECK-NEWLOWERING-I8MM-NEXT: movi v4.2d, #0000000000000000
; CHECK-NEWLOWERING-I8MM-NEXT: sdot v4.4s, v2.16b, v3.16b
; CHECK-NEWLOWERING-I8MM-NEXT: saddw v0.2d, v0.2d, v4.2s
; CHECK-NEWLOWERING-I8MM-NEXT: saddw2 v1.2d, v1.2d, v5.4s
; CHECK-NEWLOWERING-I8MM-NEXT: saddw2 v0.2d, v0.2d, v4.4s
; CHECK-NEWLOWERING-I8MM-NEXT: saddw v1.2d, v1.2d, v2.2s
; CHECK-NEWLOWERING-I8MM-NEXT: saddw v0.2d, v0.2d, v3.2s
; CHECK-NEWLOWERING-I8MM-NEXT: saddw2 v1.2d, v1.2d, v2.4s
; CHECK-NEWLOWERING-I8MM-NEXT: saddw2 v0.2d, v0.2d, v3.4s
; CHECK-NEWLOWERING-I8MM-NEXT: ret
%a.wide = sext <16 x i8> %a to <16 x i64>
%partial.reduce = tail call <4 x i64> @llvm.experimental.vector.partial.reduce.add.v4i64.v16i64(<4 x i64> %acc, <16 x i64> %a.wide)
Expand Down
186 changes: 112 additions & 74 deletions llvm/test/CodeGen/AArch64/sve-partial-reduce-dot-product.ll
Original file line number Diff line number Diff line change
Expand Up @@ -561,31 +561,34 @@ define <vscale x 4 x i64> @udot_no_bin_op_8to64(<vscale x 4 x i64> %acc, <vscale
; CHECK-NEXT: add z1.d, z1.d, z3.d
; CHECK-NEXT: ret
;
; CHECK-NEWLOWERING-LABEL: udot_no_bin_op_8to64:
; CHECK-NEWLOWERING: // %bb.0:
; CHECK-NEWLOWERING-NEXT: uunpkhi z3.h, z2.b
; CHECK-NEWLOWERING-NEXT: uunpklo z2.h, z2.b
; CHECK-NEWLOWERING-NEXT: uunpkhi z4.s, z3.h
; CHECK-NEWLOWERING-NEXT: uunpklo z3.s, z3.h
; CHECK-NEWLOWERING-NEXT: uunpklo z5.s, z2.h
; CHECK-NEWLOWERING-NEXT: uunpkhi z2.s, z2.h
; CHECK-NEWLOWERING-NEXT: uunpklo z6.d, z4.s
; CHECK-NEWLOWERING-NEXT: uunpkhi z7.d, z3.s
; CHECK-NEWLOWERING-NEXT: uunpklo z24.d, z5.s
; CHECK-NEWLOWERING-NEXT: uunpklo z3.d, z3.s
; CHECK-NEWLOWERING-NEXT: uunpklo z25.d, z2.s
; CHECK-NEWLOWERING-NEXT: uunpkhi z5.d, z5.s
; CHECK-NEWLOWERING-NEXT: uunpkhi z4.d, z4.s
; CHECK-NEWLOWERING-NEXT: uunpkhi z2.d, z2.s
; CHECK-NEWLOWERING-NEXT: add z0.d, z0.d, z24.d
; CHECK-NEWLOWERING-NEXT: add z5.d, z5.d, z25.d
; CHECK-NEWLOWERING-NEXT: add z1.d, z1.d, z3.d
; CHECK-NEWLOWERING-NEXT: add z3.d, z7.d, z6.d
; CHECK-NEWLOWERING-NEXT: add z0.d, z0.d, z5.d
; CHECK-NEWLOWERING-NEXT: add z1.d, z1.d, z3.d
; CHECK-NEWLOWERING-NEXT: add z0.d, z0.d, z2.d
; CHECK-NEWLOWERING-NEXT: add z1.d, z1.d, z4.d
; CHECK-NEWLOWERING-NEXT: ret
; CHECK-NEWLOWERING-SVE-LABEL: udot_no_bin_op_8to64:
; CHECK-NEWLOWERING-SVE: // %bb.0:
; CHECK-NEWLOWERING-SVE-NEXT: movi v3.2d, #0000000000000000
; CHECK-NEWLOWERING-SVE-NEXT: mov z4.b, #1 // =0x1
; CHECK-NEWLOWERING-SVE-NEXT: udot z3.s, z2.b, z4.b
; CHECK-NEWLOWERING-SVE-NEXT: uunpklo z2.d, z3.s
; CHECK-NEWLOWERING-SVE-NEXT: uunpkhi z3.d, z3.s
; CHECK-NEWLOWERING-SVE-NEXT: add z0.d, z0.d, z2.d
; CHECK-NEWLOWERING-SVE-NEXT: add z0.d, z0.d, z3.d
; CHECK-NEWLOWERING-SVE-NEXT: ret
;
; CHECK-NEWLOWERING-SVE2-LABEL: udot_no_bin_op_8to64:
; CHECK-NEWLOWERING-SVE2: // %bb.0:
; CHECK-NEWLOWERING-SVE2-NEXT: movi v3.2d, #0000000000000000
; CHECK-NEWLOWERING-SVE2-NEXT: mov z4.b, #1 // =0x1
; CHECK-NEWLOWERING-SVE2-NEXT: udot z3.s, z2.b, z4.b
; CHECK-NEWLOWERING-SVE2-NEXT: uaddwb z0.d, z0.d, z3.s
; CHECK-NEWLOWERING-SVE2-NEXT: uaddwt z0.d, z0.d, z3.s
; CHECK-NEWLOWERING-SVE2-NEXT: ret
;
; CHECK-NEWLOWERING-SME-LABEL: udot_no_bin_op_8to64:
; CHECK-NEWLOWERING-SME: // %bb.0:
; CHECK-NEWLOWERING-SME-NEXT: mov z3.b, #1 // =0x1
; CHECK-NEWLOWERING-SME-NEXT: mov z4.s, #0 // =0x0
; CHECK-NEWLOWERING-SME-NEXT: udot z4.s, z2.b, z3.b
; CHECK-NEWLOWERING-SME-NEXT: uaddwb z0.d, z0.d, z4.s
; CHECK-NEWLOWERING-SME-NEXT: uaddwt z0.d, z0.d, z4.s
; CHECK-NEWLOWERING-SME-NEXT: ret
%a.ext = zext <vscale x 16 x i8> %a to <vscale x 16 x i64>
%partial.reduce = tail call <vscale x 4 x i64> @llvm.experimental.vector.partial.reduce.add.nxv4i64.nxv16i64(<vscale x 4 x i64> %acc, <vscale x 16 x i64> %a.ext)
ret <vscale x 4 x i64> %partial.reduce
Expand All @@ -603,31 +606,34 @@ define <vscale x 4 x i64> @sdot_no_bin_op_8to64(<vscale x 4 x i64> %acc, <vscale
; CHECK-NEXT: add z1.d, z1.d, z3.d
; CHECK-NEXT: ret
;
; CHECK-NEWLOWERING-LABEL: sdot_no_bin_op_8to64:
; CHECK-NEWLOWERING: // %bb.0:
; CHECK-NEWLOWERING-NEXT: sunpkhi z3.h, z2.b
; CHECK-NEWLOWERING-NEXT: sunpklo z2.h, z2.b
; CHECK-NEWLOWERING-NEXT: sunpkhi z4.s, z3.h
; CHECK-NEWLOWERING-NEXT: sunpklo z3.s, z3.h
; CHECK-NEWLOWERING-NEXT: sunpklo z5.s, z2.h
; CHECK-NEWLOWERING-NEXT: sunpkhi z2.s, z2.h
; CHECK-NEWLOWERING-NEXT: sunpklo z6.d, z4.s
; CHECK-NEWLOWERING-NEXT: sunpkhi z7.d, z3.s
; CHECK-NEWLOWERING-NEXT: sunpklo z24.d, z5.s
; CHECK-NEWLOWERING-NEXT: sunpklo z3.d, z3.s
; CHECK-NEWLOWERING-NEXT: sunpklo z25.d, z2.s
; CHECK-NEWLOWERING-NEXT: sunpkhi z5.d, z5.s
; CHECK-NEWLOWERING-NEXT: sunpkhi z4.d, z4.s
; CHECK-NEWLOWERING-NEXT: sunpkhi z2.d, z2.s
; CHECK-NEWLOWERING-NEXT: add z0.d, z0.d, z24.d
; CHECK-NEWLOWERING-NEXT: add z5.d, z5.d, z25.d
; CHECK-NEWLOWERING-NEXT: add z1.d, z1.d, z3.d
; CHECK-NEWLOWERING-NEXT: add z3.d, z7.d, z6.d
; CHECK-NEWLOWERING-NEXT: add z0.d, z0.d, z5.d
; CHECK-NEWLOWERING-NEXT: add z1.d, z1.d, z3.d
; CHECK-NEWLOWERING-NEXT: add z0.d, z0.d, z2.d
; CHECK-NEWLOWERING-NEXT: add z1.d, z1.d, z4.d
; CHECK-NEWLOWERING-NEXT: ret
; CHECK-NEWLOWERING-SVE-LABEL: sdot_no_bin_op_8to64:
; CHECK-NEWLOWERING-SVE: // %bb.0:
; CHECK-NEWLOWERING-SVE-NEXT: movi v3.2d, #0000000000000000
; CHECK-NEWLOWERING-SVE-NEXT: mov z4.b, #1 // =0x1
; CHECK-NEWLOWERING-SVE-NEXT: sdot z3.s, z2.b, z4.b
; CHECK-NEWLOWERING-SVE-NEXT: sunpklo z2.d, z3.s
; CHECK-NEWLOWERING-SVE-NEXT: sunpkhi z3.d, z3.s
; CHECK-NEWLOWERING-SVE-NEXT: add z0.d, z0.d, z2.d
; CHECK-NEWLOWERING-SVE-NEXT: add z0.d, z0.d, z3.d
; CHECK-NEWLOWERING-SVE-NEXT: ret
;
; CHECK-NEWLOWERING-SVE2-LABEL: sdot_no_bin_op_8to64:
; CHECK-NEWLOWERING-SVE2: // %bb.0:
; CHECK-NEWLOWERING-SVE2-NEXT: movi v3.2d, #0000000000000000
; CHECK-NEWLOWERING-SVE2-NEXT: mov z4.b, #1 // =0x1
; CHECK-NEWLOWERING-SVE2-NEXT: sdot z3.s, z2.b, z4.b
; CHECK-NEWLOWERING-SVE2-NEXT: saddwb z0.d, z0.d, z3.s
; CHECK-NEWLOWERING-SVE2-NEXT: saddwt z0.d, z0.d, z3.s
; CHECK-NEWLOWERING-SVE2-NEXT: ret
;
; CHECK-NEWLOWERING-SME-LABEL: sdot_no_bin_op_8to64:
; CHECK-NEWLOWERING-SME: // %bb.0:
; CHECK-NEWLOWERING-SME-NEXT: mov z3.b, #1 // =0x1
; CHECK-NEWLOWERING-SME-NEXT: mov z4.s, #0 // =0x0
; CHECK-NEWLOWERING-SME-NEXT: sdot z4.s, z2.b, z3.b
; CHECK-NEWLOWERING-SME-NEXT: saddwb z0.d, z0.d, z4.s
; CHECK-NEWLOWERING-SME-NEXT: saddwt z0.d, z0.d, z4.s
; CHECK-NEWLOWERING-SME-NEXT: ret
%a.ext = sext <vscale x 16 x i8> %a to <vscale x 16 x i64>
%partial.reduce = tail call <vscale x 4 x i64> @llvm.experimental.vector.partial.reduce.add.nxv4i64.nxv16i64(<vscale x 4 x i64> %acc, <vscale x 16 x i64> %a.ext)
ret <vscale x 4 x i64> %partial.reduce
Expand All @@ -647,18 +653,34 @@ define <vscale x 4 x i32> @not_udot(<vscale x 4 x i32> %acc, <vscale x 8 x i8> %
; CHECK-NEXT: mla z0.s, p0/m, z1.s, z2.s
; CHECK-NEXT: ret
;
; CHECK-NEWLOWERING-LABEL: not_udot:
; CHECK-NEWLOWERING: // %bb.0: // %entry
; CHECK-NEWLOWERING-NEXT: and z1.h, z1.h, #0xff
; CHECK-NEWLOWERING-NEXT: and z2.h, z2.h, #0xff
; CHECK-NEWLOWERING-NEXT: ptrue p0.s
; CHECK-NEWLOWERING-NEXT: uunpklo z3.s, z1.h
; CHECK-NEWLOWERING-NEXT: uunpklo z4.s, z2.h
; CHECK-NEWLOWERING-NEXT: uunpkhi z1.s, z1.h
; CHECK-NEWLOWERING-NEXT: uunpkhi z2.s, z2.h
; CHECK-NEWLOWERING-NEXT: mla z0.s, p0/m, z3.s, z4.s
; CHECK-NEWLOWERING-NEXT: mla z0.s, p0/m, z1.s, z2.s
; CHECK-NEWLOWERING-NEXT: ret
; CHECK-NEWLOWERING-SVE-LABEL: not_udot:
; CHECK-NEWLOWERING-SVE: // %bb.0: // %entry
; CHECK-NEWLOWERING-SVE-NEXT: and z1.h, z1.h, #0xff
; CHECK-NEWLOWERING-SVE-NEXT: and z2.h, z2.h, #0xff
; CHECK-NEWLOWERING-SVE-NEXT: ptrue p0.s
; CHECK-NEWLOWERING-SVE-NEXT: uunpklo z3.s, z1.h
; CHECK-NEWLOWERING-SVE-NEXT: uunpklo z4.s, z2.h
; CHECK-NEWLOWERING-SVE-NEXT: uunpkhi z1.s, z1.h
; CHECK-NEWLOWERING-SVE-NEXT: uunpkhi z2.s, z2.h
; CHECK-NEWLOWERING-SVE-NEXT: mla z0.s, p0/m, z3.s, z4.s
; CHECK-NEWLOWERING-SVE-NEXT: mla z0.s, p0/m, z1.s, z2.s
; CHECK-NEWLOWERING-SVE-NEXT: ret
;
; CHECK-NEWLOWERING-SVE2-LABEL: not_udot:
; CHECK-NEWLOWERING-SVE2: // %bb.0: // %entry
; CHECK-NEWLOWERING-SVE2-NEXT: and z2.h, z2.h, #0xff
; CHECK-NEWLOWERING-SVE2-NEXT: and z1.h, z1.h, #0xff
; CHECK-NEWLOWERING-SVE2-NEXT: umlalb z0.s, z1.h, z2.h
; CHECK-NEWLOWERING-SVE2-NEXT: umlalt z0.s, z1.h, z2.h
; CHECK-NEWLOWERING-SVE2-NEXT: ret
;
; CHECK-NEWLOWERING-SME-LABEL: not_udot:
; CHECK-NEWLOWERING-SME: // %bb.0: // %entry
; CHECK-NEWLOWERING-SME-NEXT: and z2.h, z2.h, #0xff
; CHECK-NEWLOWERING-SME-NEXT: and z1.h, z1.h, #0xff
; CHECK-NEWLOWERING-SME-NEXT: umlalb z0.s, z1.h, z2.h
; CHECK-NEWLOWERING-SME-NEXT: umlalt z0.s, z1.h, z2.h
; CHECK-NEWLOWERING-SME-NEXT: ret
entry:
%a.wide = zext <vscale x 8 x i8> %a to <vscale x 8 x i32>
%b.wide = zext <vscale x 8 x i8> %b to <vscale x 8 x i32>
Expand All @@ -681,18 +703,34 @@ define <vscale x 2 x i64> @not_udot_wide(<vscale x 2 x i64> %acc, <vscale x 4 x
; CHECK-NEXT: mla z0.d, p0/m, z1.d, z2.d
; CHECK-NEXT: ret
;
; CHECK-NEWLOWERING-LABEL: not_udot_wide:
; CHECK-NEWLOWERING: // %bb.0: // %entry
; CHECK-NEWLOWERING-NEXT: and z1.s, z1.s, #0xffff
; CHECK-NEWLOWERING-NEXT: and z2.s, z2.s, #0xffff
; CHECK-NEWLOWERING-NEXT: ptrue p0.d
; CHECK-NEWLOWERING-NEXT: uunpklo z3.d, z1.s
; CHECK-NEWLOWERING-NEXT: uunpklo z4.d, z2.s
; CHECK-NEWLOWERING-NEXT: uunpkhi z1.d, z1.s
; CHECK-NEWLOWERING-NEXT: uunpkhi z2.d, z2.s
; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z3.d, z4.d
; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z1.d, z2.d
; CHECK-NEWLOWERING-NEXT: ret
; CHECK-NEWLOWERING-SVE-LABEL: not_udot_wide:
; CHECK-NEWLOWERING-SVE: // %bb.0: // %entry
; CHECK-NEWLOWERING-SVE-NEXT: and z1.s, z1.s, #0xffff
; CHECK-NEWLOWERING-SVE-NEXT: and z2.s, z2.s, #0xffff
; CHECK-NEWLOWERING-SVE-NEXT: ptrue p0.d
; CHECK-NEWLOWERING-SVE-NEXT: uunpklo z3.d, z1.s
; CHECK-NEWLOWERING-SVE-NEXT: uunpklo z4.d, z2.s
; CHECK-NEWLOWERING-SVE-NEXT: uunpkhi z1.d, z1.s
; CHECK-NEWLOWERING-SVE-NEXT: uunpkhi z2.d, z2.s
; CHECK-NEWLOWERING-SVE-NEXT: mla z0.d, p0/m, z3.d, z4.d
; CHECK-NEWLOWERING-SVE-NEXT: mla z0.d, p0/m, z1.d, z2.d
; CHECK-NEWLOWERING-SVE-NEXT: ret
;
; CHECK-NEWLOWERING-SVE2-LABEL: not_udot_wide:
; CHECK-NEWLOWERING-SVE2: // %bb.0: // %entry
; CHECK-NEWLOWERING-SVE2-NEXT: and z2.s, z2.s, #0xffff
; CHECK-NEWLOWERING-SVE2-NEXT: and z1.s, z1.s, #0xffff
; CHECK-NEWLOWERING-SVE2-NEXT: umlalb z0.d, z1.s, z2.s
; CHECK-NEWLOWERING-SVE2-NEXT: umlalt z0.d, z1.s, z2.s
; CHECK-NEWLOWERING-SVE2-NEXT: ret
;
; CHECK-NEWLOWERING-SME-LABEL: not_udot_wide:
; CHECK-NEWLOWERING-SME: // %bb.0: // %entry
; CHECK-NEWLOWERING-SME-NEXT: and z2.s, z2.s, #0xffff
; CHECK-NEWLOWERING-SME-NEXT: and z1.s, z1.s, #0xffff
; CHECK-NEWLOWERING-SME-NEXT: umlalb z0.d, z1.s, z2.s
; CHECK-NEWLOWERING-SME-NEXT: umlalt z0.d, z1.s, z2.s
; CHECK-NEWLOWERING-SME-NEXT: ret
entry:
%a.wide = zext <vscale x 4 x i16> %a to <vscale x 4 x i64>
%b.wide = zext <vscale x 4 x i16> %b to <vscale x 4 x i64>
Expand Down
Loading