[AArch64][GlobalISel] Legalize more CTPOP vector types. #131513
@llvm/pr-subscribers-llvm-globalisel @llvm/pr-subscribers-backend-aarch64

Author: David Green (davemgreen)

Changes

Similar to other operations, s8, s16, s32 and s64 vector elements are clamped to legal vector sizes, odd numbers of elements are widened to the next power of 2, and s128 is scalarized. This helps legalize cttz as well as ctpop.

Patch is 52.69 KiB, truncated to 20.00 KiB below; full version: https://github.com/llvm/llvm-project/pull/131513.diff

4 Files Affected:
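For orientation before the diff, here is a minimal IR sketch (not taken from the patch; the function names are illustrative) of the vector ctpop shapes the new rules target, mirroring tests in ctpop.ll: odd element counts are widened to the next power of 2, oversized vectors are split down to legal register widths, and elements wider than 64 bits are scalarized. Each function can be fed to llc -mtriple=aarch64-none-eabi -global-isel to see GlobalISel legalize it instead of falling back to SelectionDAG.

declare <3 x i16> @llvm.ctpop.v3i16(<3 x i16>)
declare <8 x i32> @llvm.ctpop.v8i32(<8 x i32>)
declare <2 x i128> @llvm.ctpop.v2i128(<2 x i128>)

define <3 x i16> @ctpop_v3i16(<3 x i16> %d) {
  ; Odd element count: widened to the next power-of-2 vector (<4 x i16>).
  %s = call <3 x i16> @llvm.ctpop.v3i16(<3 x i16> %d)
  ret <3 x i16> %s
}

define <8 x i32> @ctpop_v8i32(<8 x i32> %d) {
  ; Too wide for one 128-bit register: clamped to <4 x i32> and handled in two halves.
  %s = call <8 x i32> @llvm.ctpop.v8i32(<8 x i32> %d)
  ret <8 x i32> %s
}

define <2 x i128> @ctpop_v2i128(<2 x i128> %d) {
  ; Elements wider than 64 bits: the vector is scalarized into i128 ctpops.
  %s = call <2 x i128> @llvm.ctpop.v2i128(<2 x i128> %d)
  ret <2 x i128> %s
}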
diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
index ed8bd25698c03..0b84e03e05782 100644
--- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
@@ -6139,6 +6139,7 @@ LegalizerHelper::moreElementsVector(MachineInstr &MI, unsigned TypeIdx,
case TargetOpcode::G_FCANONICALIZE:
case TargetOpcode::G_SEXT_INREG:
case TargetOpcode::G_ABS:
+ case TargetOpcode::G_CTPOP:
if (TypeIdx != 0)
return UnableToLegalize;
Observer.changingInstr(MI);
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
index 0da3c73b6926d..f56c4fc0373b7 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
@@ -323,7 +323,13 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
.clampScalar(0, s32, s128)
.widenScalarToNextPow2(0)
.minScalarEltSameAsIf(always, 1, 0)
- .maxScalarEltSameAsIf(always, 1, 0);
+ .maxScalarEltSameAsIf(always, 1, 0)
+ .clampNumElements(0, v8s8, v16s8)
+ .clampNumElements(0, v4s16, v8s16)
+ .clampNumElements(0, v2s32, v4s32)
+ .clampNumElements(0, v2s64, v2s64)
+ .moreElementsToNextPow2(0)
+ .scalarizeIf(scalarOrEltWiderThan(0, 64), 0);
getActionDefinitionsBuilder(G_CTLZ)
.legalForCartesianProduct(
diff --git a/llvm/test/CodeGen/AArch64/ctpop.ll b/llvm/test/CodeGen/AArch64/ctpop.ll
index 785a447123b5e..465f8f3a9be4c 100644
--- a/llvm/test/CodeGen/AArch64/ctpop.ll
+++ b/llvm/test/CodeGen/AArch64/ctpop.ll
@@ -1,37 +1,35 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mtriple=aarch64-none-eabi -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD
-; RUN: llc -mtriple=aarch64-none-eabi -global-isel -global-isel-abort=2 -verify-machineinstrs %s -o - 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI
+; RUN: llc -mtriple=aarch64-none-eabi -global-isel -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI
-; CHECK-GI: warning: Instruction selection used fallback path for v2i8
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v3i8
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v4i8
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v32i8
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v2i16
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v3i16
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v16i16
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v3i32
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v8i32
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v3i64
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v4i64
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v2i128
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v3i128
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v4i128
define void @v2i8(ptr %p1) {
-; CHECK-LABEL: v2i8:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ldrb w8, [x0]
-; CHECK-NEXT: ldrb w9, [x0, #1]
-; CHECK-NEXT: fmov s0, w8
-; CHECK-NEXT: mov v0.s[1], w9
-; CHECK-NEXT: cnt v0.8b, v0.8b
-; CHECK-NEXT: uaddlp v0.4h, v0.8b
-; CHECK-NEXT: uaddlp v0.2s, v0.4h
-; CHECK-NEXT: mov w8, v0.s[1]
-; CHECK-NEXT: fmov w9, s0
-; CHECK-NEXT: strb w9, [x0]
-; CHECK-NEXT: strb w8, [x0, #1]
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: v2i8:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: ldrb w8, [x0]
+; CHECK-SD-NEXT: ldrb w9, [x0, #1]
+; CHECK-SD-NEXT: fmov s0, w8
+; CHECK-SD-NEXT: mov v0.s[1], w9
+; CHECK-SD-NEXT: cnt v0.8b, v0.8b
+; CHECK-SD-NEXT: uaddlp v0.4h, v0.8b
+; CHECK-SD-NEXT: uaddlp v0.2s, v0.4h
+; CHECK-SD-NEXT: mov w8, v0.s[1]
+; CHECK-SD-NEXT: fmov w9, s0
+; CHECK-SD-NEXT: strb w9, [x0]
+; CHECK-SD-NEXT: strb w8, [x0, #1]
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: v2i8:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: ldr b0, [x0]
+; CHECK-GI-NEXT: ldr b1, [x0, #1]
+; CHECK-GI-NEXT: add x8, x0, #1
+; CHECK-GI-NEXT: mov v0.b[0], v0.b[0]
+; CHECK-GI-NEXT: mov v0.b[1], v1.b[0]
+; CHECK-GI-NEXT: cnt v0.8b, v0.8b
+; CHECK-GI-NEXT: st1 { v0.b }[0], [x0]
+; CHECK-GI-NEXT: st1 { v0.b }[1], [x8]
+; CHECK-GI-NEXT: ret
entry:
%d = load <2 x i8>, ptr %p1
%s = call <2 x i8> @llvm.ctpop(<2 x i8> %d)
@@ -40,23 +38,39 @@ entry:
}
define void @v3i8(ptr %p1) {
-; CHECK-LABEL: v3i8:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: sub sp, sp, #16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: ldr s0, [x0]
-; CHECK-NEXT: zip1 v0.8b, v0.8b, v0.8b
-; CHECK-NEXT: bic v0.4h, #255, lsl #8
-; CHECK-NEXT: cnt v0.8b, v0.8b
-; CHECK-NEXT: uaddlp v0.4h, v0.8b
-; CHECK-NEXT: uzp1 v1.8b, v0.8b, v0.8b
-; CHECK-NEXT: umov w8, v0.h[2]
-; CHECK-NEXT: str s1, [sp, #12]
-; CHECK-NEXT: ldrh w9, [sp, #12]
-; CHECK-NEXT: strb w8, [x0, #2]
-; CHECK-NEXT: strh w9, [x0]
-; CHECK-NEXT: add sp, sp, #16
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: v3i8:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: sub sp, sp, #16
+; CHECK-SD-NEXT: .cfi_def_cfa_offset 16
+; CHECK-SD-NEXT: ldr s0, [x0]
+; CHECK-SD-NEXT: zip1 v0.8b, v0.8b, v0.8b
+; CHECK-SD-NEXT: bic v0.4h, #255, lsl #8
+; CHECK-SD-NEXT: cnt v0.8b, v0.8b
+; CHECK-SD-NEXT: uaddlp v0.4h, v0.8b
+; CHECK-SD-NEXT: uzp1 v1.8b, v0.8b, v0.8b
+; CHECK-SD-NEXT: umov w8, v0.h[2]
+; CHECK-SD-NEXT: str s1, [sp, #12]
+; CHECK-SD-NEXT: ldrh w9, [sp, #12]
+; CHECK-SD-NEXT: strb w8, [x0, #2]
+; CHECK-SD-NEXT: strh w9, [x0]
+; CHECK-SD-NEXT: add sp, sp, #16
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: v3i8:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: ldr b0, [x0]
+; CHECK-GI-NEXT: ldr b1, [x0, #1]
+; CHECK-GI-NEXT: add x8, x0, #1
+; CHECK-GI-NEXT: add x9, x0, #2
+; CHECK-GI-NEXT: mov v0.b[0], v0.b[0]
+; CHECK-GI-NEXT: mov v0.b[1], v1.b[0]
+; CHECK-GI-NEXT: ldr b1, [x0, #2]
+; CHECK-GI-NEXT: mov v0.b[2], v1.b[0]
+; CHECK-GI-NEXT: cnt v0.8b, v0.8b
+; CHECK-GI-NEXT: st1 { v0.b }[0], [x0]
+; CHECK-GI-NEXT: st1 { v0.b }[1], [x8]
+; CHECK-GI-NEXT: st1 { v0.b }[2], [x9]
+; CHECK-GI-NEXT: ret
entry:
%d = load <3 x i8>, ptr %p1
%s = call <3 x i8> @llvm.ctpop(<3 x i8> %d)
@@ -65,15 +79,31 @@ entry:
}
define void @v4i8(ptr %p1) {
-; CHECK-LABEL: v4i8:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ldr s0, [x0]
-; CHECK-NEXT: ushll v0.8h, v0.8b, #0
-; CHECK-NEXT: cnt v0.8b, v0.8b
-; CHECK-NEXT: uaddlp v0.4h, v0.8b
-; CHECK-NEXT: uzp1 v0.8b, v0.8b, v0.8b
-; CHECK-NEXT: str s0, [x0]
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: v4i8:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: ldr s0, [x0]
+; CHECK-SD-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-SD-NEXT: cnt v0.8b, v0.8b
+; CHECK-SD-NEXT: uaddlp v0.4h, v0.8b
+; CHECK-SD-NEXT: uzp1 v0.8b, v0.8b, v0.8b
+; CHECK-SD-NEXT: str s0, [x0]
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: v4i8:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: ldr w8, [x0]
+; CHECK-GI-NEXT: fmov s0, w8
+; CHECK-GI-NEXT: mov b1, v0.b[1]
+; CHECK-GI-NEXT: mov v2.b[0], v0.b[0]
+; CHECK-GI-NEXT: mov b3, v0.b[2]
+; CHECK-GI-NEXT: mov b0, v0.b[3]
+; CHECK-GI-NEXT: mov v2.b[1], v1.b[0]
+; CHECK-GI-NEXT: mov v2.b[2], v3.b[0]
+; CHECK-GI-NEXT: mov v2.b[3], v0.b[0]
+; CHECK-GI-NEXT: cnt v0.8b, v2.8b
+; CHECK-GI-NEXT: fmov w8, s0
+; CHECK-GI-NEXT: str w8, [x0]
+; CHECK-GI-NEXT: ret
entry:
%d = load <4 x i8>, ptr %p1
%s = call <4 x i8> @llvm.ctpop(<4 x i8> %d)
@@ -113,20 +143,31 @@ entry:
}
define void @v2i16(ptr %p1) {
-; CHECK-LABEL: v2i16:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ldrh w8, [x0]
-; CHECK-NEXT: ldrh w9, [x0, #2]
-; CHECK-NEXT: fmov s0, w8
-; CHECK-NEXT: mov v0.s[1], w9
-; CHECK-NEXT: cnt v0.8b, v0.8b
-; CHECK-NEXT: uaddlp v0.4h, v0.8b
-; CHECK-NEXT: uaddlp v0.2s, v0.4h
-; CHECK-NEXT: mov w8, v0.s[1]
-; CHECK-NEXT: fmov w9, s0
-; CHECK-NEXT: strh w9, [x0]
-; CHECK-NEXT: strh w8, [x0, #2]
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: v2i16:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: ldrh w8, [x0]
+; CHECK-SD-NEXT: ldrh w9, [x0, #2]
+; CHECK-SD-NEXT: fmov s0, w8
+; CHECK-SD-NEXT: mov v0.s[1], w9
+; CHECK-SD-NEXT: cnt v0.8b, v0.8b
+; CHECK-SD-NEXT: uaddlp v0.4h, v0.8b
+; CHECK-SD-NEXT: uaddlp v0.2s, v0.4h
+; CHECK-SD-NEXT: mov w8, v0.s[1]
+; CHECK-SD-NEXT: fmov w9, s0
+; CHECK-SD-NEXT: strh w9, [x0]
+; CHECK-SD-NEXT: strh w8, [x0, #2]
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: v2i16:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: ldr h0, [x0]
+; CHECK-GI-NEXT: add x8, x0, #2
+; CHECK-GI-NEXT: ld1 { v0.h }[1], [x8]
+; CHECK-GI-NEXT: cnt v0.8b, v0.8b
+; CHECK-GI-NEXT: uaddlp v0.4h, v0.8b
+; CHECK-GI-NEXT: str h0, [x0]
+; CHECK-GI-NEXT: st1 { v0.h }[1], [x8]
+; CHECK-GI-NEXT: ret
entry:
%d = load <2 x i16>, ptr %p1
%s = call <2 x i16> @llvm.ctpop(<2 x i16> %d)
@@ -135,15 +176,29 @@ entry:
}
define void @v3i16(ptr %p1) {
-; CHECK-LABEL: v3i16:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ldr d0, [x0]
-; CHECK-NEXT: add x8, x0, #4
-; CHECK-NEXT: cnt v0.8b, v0.8b
-; CHECK-NEXT: uaddlp v0.4h, v0.8b
-; CHECK-NEXT: st1 { v0.h }[2], [x8]
-; CHECK-NEXT: str s0, [x0]
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: v3i16:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: ldr d0, [x0]
+; CHECK-SD-NEXT: add x8, x0, #4
+; CHECK-SD-NEXT: cnt v0.8b, v0.8b
+; CHECK-SD-NEXT: uaddlp v0.4h, v0.8b
+; CHECK-SD-NEXT: st1 { v0.h }[2], [x8]
+; CHECK-SD-NEXT: str s0, [x0]
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: v3i16:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: ldr h0, [x0]
+; CHECK-GI-NEXT: add x8, x0, #2
+; CHECK-GI-NEXT: add x9, x0, #4
+; CHECK-GI-NEXT: ld1 { v0.h }[1], [x8]
+; CHECK-GI-NEXT: ld1 { v0.h }[2], [x9]
+; CHECK-GI-NEXT: cnt v0.8b, v0.8b
+; CHECK-GI-NEXT: uaddlp v0.4h, v0.8b
+; CHECK-GI-NEXT: str h0, [x0]
+; CHECK-GI-NEXT: st1 { v0.h }[1], [x8]
+; CHECK-GI-NEXT: st1 { v0.h }[2], [x9]
+; CHECK-GI-NEXT: ret
entry:
%d = load <3 x i16>, ptr %p1
%s = call <3 x i16> @llvm.ctpop(<3 x i16> %d)
@@ -251,25 +306,42 @@ entry:
}
define <3 x i64> @v3i64(<3 x i64> %d) {
-; CHECK-LABEL: v3i64:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
-; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
-; CHECK-NEXT: mov v0.d[1], v1.d[0]
-; CHECK-NEXT: cnt v1.16b, v2.16b
-; CHECK-NEXT: cnt v0.16b, v0.16b
-; CHECK-NEXT: uaddlp v1.8h, v1.16b
-; CHECK-NEXT: uaddlp v2.4s, v1.8h
-; CHECK-NEXT: uaddlp v0.8h, v0.16b
-; CHECK-NEXT: uaddlp v2.2d, v2.4s
-; CHECK-NEXT: uaddlp v0.4s, v0.8h
-; CHECK-NEXT: // kill: def $d2 killed $d2 killed $q2
-; CHECK-NEXT: uaddlp v0.2d, v0.4s
-; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8
-; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
-; CHECK-NEXT: // kill: def $d1 killed $d1 killed $q1
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: v3i64:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NEXT: // kill: def $d1 killed $d1 def $q1
+; CHECK-SD-NEXT: // kill: def $d2 killed $d2 def $q2
+; CHECK-SD-NEXT: mov v0.d[1], v1.d[0]
+; CHECK-SD-NEXT: cnt v1.16b, v2.16b
+; CHECK-SD-NEXT: cnt v0.16b, v0.16b
+; CHECK-SD-NEXT: uaddlp v1.8h, v1.16b
+; CHECK-SD-NEXT: uaddlp v2.4s, v1.8h
+; CHECK-SD-NEXT: uaddlp v0.8h, v0.16b
+; CHECK-SD-NEXT: uaddlp v2.2d, v2.4s
+; CHECK-SD-NEXT: uaddlp v0.4s, v0.8h
+; CHECK-SD-NEXT: // kill: def $d2 killed $d2 killed $q2
+; CHECK-SD-NEXT: uaddlp v0.2d, v0.4s
+; CHECK-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8
+; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-SD-NEXT: // kill: def $d1 killed $d1 killed $q1
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: v3i64:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1
+; CHECK-GI-NEXT: mov v0.d[1], v1.d[0]
+; CHECK-GI-NEXT: cnt v1.8b, v2.8b
+; CHECK-GI-NEXT: cnt v0.16b, v0.16b
+; CHECK-GI-NEXT: uaddlv h1, v1.8b
+; CHECK-GI-NEXT: uaddlp v0.8h, v0.16b
+; CHECK-GI-NEXT: mov w8, v1.s[0]
+; CHECK-GI-NEXT: uaddlp v0.4s, v0.8h
+; CHECK-GI-NEXT: fmov d2, x8
+; CHECK-GI-NEXT: uaddlp v0.2d, v0.4s
+; CHECK-GI-NEXT: mov d1, v0.d[1]
+; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT: ret
entry:
%s = call <3 x i64> @llvm.ctpop(<3 x i64> %d)
ret <3 x i64> %s
@@ -293,85 +365,148 @@ entry:
}
define <2 x i128> @v2i128(<2 x i128> %d) {
-; CHECK-LABEL: v2i128:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: fmov d0, x2
-; CHECK-NEXT: fmov d1, x0
-; CHECK-NEXT: mov v1.d[1], x1
-; CHECK-NEXT: mov v0.d[1], x3
-; CHECK-NEXT: mov x1, xzr
-; CHECK-NEXT: mov x3, xzr
-; CHECK-NEXT: cnt v1.16b, v1.16b
-; CHECK-NEXT: cnt v0.16b, v0.16b
-; CHECK-NEXT: addv b1, v1.16b
-; CHECK-NEXT: addv b0, v0.16b
-; CHECK-NEXT: fmov x0, d1
-; CHECK-NEXT: fmov x2, d0
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: v2i128:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: fmov d0, x2
+; CHECK-SD-NEXT: fmov d1, x0
+; CHECK-SD-NEXT: mov v1.d[1], x1
+; CHECK-SD-NEXT: mov v0.d[1], x3
+; CHECK-SD-NEXT: mov x1, xzr
+; CHECK-SD-NEXT: mov x3, xzr
+; CHECK-SD-NEXT: cnt v1.16b, v1.16b
+; CHECK-SD-NEXT: cnt v0.16b, v0.16b
+; CHECK-SD-NEXT: addv b1, v1.16b
+; CHECK-SD-NEXT: addv b0, v0.16b
+; CHECK-SD-NEXT: fmov x0, d1
+; CHECK-SD-NEXT: fmov x2, d0
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: v2i128:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: mov v0.d[0], x0
+; CHECK-GI-NEXT: mov v1.d[0], x2
+; CHECK-GI-NEXT: mov v0.d[1], x1
+; CHECK-GI-NEXT: mov v1.d[1], x3
+; CHECK-GI-NEXT: mov x1, xzr
+; CHECK-GI-NEXT: mov x3, xzr
+; CHECK-GI-NEXT: cnt v0.16b, v0.16b
+; CHECK-GI-NEXT: cnt v1.16b, v1.16b
+; CHECK-GI-NEXT: uaddlv h0, v0.16b
+; CHECK-GI-NEXT: uaddlv h1, v1.16b
+; CHECK-GI-NEXT: mov w0, v0.s[0]
+; CHECK-GI-NEXT: mov w2, v1.s[0]
+; CHECK-GI-NEXT: ret
entry:
%s = call <2 x i128> @llvm.ctpop(<2 x i128> %d)
ret <2 x i128> %s
}
define <3 x i128> @v3i128(<3 x i128> %d) {
-; CHECK-LABEL: v3i128:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: fmov d0, x4
-; CHECK-NEXT: fmov d1, x2
-; CHECK-NEXT: fmov d2, x0
-; CHECK-NEXT: mov v0.d[1], x5
-; CHECK-NEXT: mov v1.d[1], x3
-; CHECK-NEXT: mov v2.d[1], x1
-; CHECK-NEXT: mov x1, xzr
-; CHECK-NEXT: mov x3, xzr
-; CHECK-NEXT: mov x5, xzr
-; CHECK-NEXT: cnt v0.16b, v0.16b
-; CHECK-NEXT: cnt v1.16b, v1.16b
-; CHECK-NEXT: cnt v2.16b, v2.16b
-; CHECK-NEXT: addv b0, v0.16b
-; CHECK-NEXT: addv b1, v1.16b
-; CHECK-NEXT: addv b2, v2.16b
-; CHECK-NEXT: fmov x0, d2
-; CHECK-NEXT: fmov x2, d1
-; CHECK-NEXT: fmov x4, d0
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: v3i128:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: fmov d0, x4
+; CHECK-SD-NEXT: fmov d1, x2
+; CHECK-SD-NEXT: fmov d2, x0
+; CHECK-SD-NEXT: mov v0.d[1], x5
+; CHECK-SD-NEXT: mov v1.d[1], x3
+; CHECK-SD-NEXT: mov v2.d[1], x1
+; CHECK-SD-NEXT: mov x1, xzr
+; CHECK-SD-NEXT: mov x3, xzr
+; CHECK-SD-NEXT: mov x5, xzr
+; CHECK-SD-NEXT: cnt v0.16b, v0.16b
+; CHECK-SD-NEXT: cnt v1.16b, v1.16b
+; CHECK-SD-NEXT: cnt v2.16b, v2.16b
+; CHECK-SD-NEXT: addv b0, v0.16b
+; CHECK-SD-NEXT: addv b1, v1.16b
+; CHECK-SD-NEXT: addv b2, v2.16b
+; CHECK-SD-NEXT: fmov x0, d2
+; CHECK-SD-NEXT: fmov x2, d1
+; CHECK-SD-NEXT: fmov x4, d0
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: v3i128:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: mov v0.d[0], x0
+; CHECK-GI-NEXT: mov v1.d[0], x2
+; CHECK-GI-NEXT: mov v2.d[0], x4
+; CHECK-GI-NEXT: mov v0.d[1], x1
+; CHECK-GI-NEXT: mov v1.d[1], x3
+; CHECK-GI-NEXT: mov v2.d[1], x5
+; CHECK-GI-NEXT: mov x1, xzr
+; CHECK-GI-NEXT: mov x3, xzr
+; CHECK-GI-NEXT: mov x5, xzr
+; CHECK-GI-NEXT: cnt v0.16b, v0.16b
+; CHECK-GI-NEXT: cnt v1.16b, v1.16b
+; CHECK-GI-NEXT: cnt v2.16b, v2.16b
+; CHECK-GI-NEXT: uaddlv h0, v0.16b
+; CHECK-GI-NEXT: uaddlv h1, v1.16b
+; CHECK-GI-NEXT: uaddlv h2, v2.16b
+; CHECK-GI-NEXT: mov w0, v0.s[0]
+; CHECK-GI-NEXT: mov w2, v1.s[0]
+; CHECK-GI-NEXT: mov w4, v2.s[0]
+; CHECK-GI-NEXT: ret
entry:
%s = call <3 x i128> @llvm.ctpop(<3 x i128> %d)
ret <3 x i128> %s
}
define <4 x i128> @v4i128(<4 x i128> %d) {
-; CHECK-LABEL: v4i128:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: fmov d0, x6
-; CHECK-NEXT: fmov d1, x4
-; CHECK-NEXT: fmov d2, x2
-; CHECK-NEXT: fmov d3, x0
-; CHECK-NEXT: mov v1.d[1], x5
-; CHECK-NEXT: mov v2.d[1], x3
-; CHECK-NEXT: mov v0.d[1], x7
-; CHECK-NEXT: mov v3.d[1], x1
-; CHECK-NEXT: mov x1, xzr
-; CHECK-NEXT: mov x3, xzr
-; CHECK-NEXT: mov x5, xzr
-; CHECK-NEXT: mov x7, xzr
-; CHECK-NEXT: cnt v1.16b, v1.16b
-; CHECK-NEXT: cnt v2.16b, v2.16b
-; CHECK-NEXT: cnt v0.16b, v0.16b
-; CHECK-NEXT: cnt v3.16b, v3.16b
-; CHECK-NEXT: addv b1, v1.16b
-; CHECK-NEXT: addv b2, v2.16b
-; CHECK-NEXT: addv b0, v0.16b
-; CHECK-NEXT: addv b3, v3.16b
-; CHECK-NEXT: fmov x2, d2
-; CHECK-NEXT: fmov x4, d1
-; CHECK-NEXT: fmov x6, d0
-; CHECK-NEXT: fmov x0, d3
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: v4i128:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: fmov d0, x6
+; CHECK-SD-NEXT: fmov d1, x4
+; CHECK-SD-NEXT: fmov d2, x2
+; CHECK-SD-NEXT: fmov d3, x0
+; CHECK-SD-NEXT: mov v1.d[1], x5
+; CHECK-SD-NEXT: mov v2.d[1], x3
+; CHECK-SD-NEXT: mov v0.d[1], x7
+; CHECK-SD-NEXT: mov v3.d[1], x1
+; CHECK-SD-NEXT: mov x1, xzr
+; CHECK-SD-NEXT: mov x3, xzr
+; CHECK-SD-NEXT: mov x5, xzr
+; CHECK-SD-NEXT: mov x7, xzr
+; CHECK-SD-NEXT: cnt v1.16b, v1.16b
+; CHECK-SD-NEXT: cnt v2.16b, v2.16b
+; CHECK-SD-NEXT: cnt v0.16b, v0.16b
+; CHECK-SD-NEXT: cnt v3.16b, v3.16b
+; CHECK-SD-NEXT: addv b1, v1.16b
+; CHECK-SD-NEXT: addv b2, v2.16b
+; CHECK-SD-NEXT: addv b0, v0.16b
+; CHECK-SD-NEXT: addv b3, v3.16b
+; CHECK-SD-NEXT: fmov x2, d2
+; CHECK-SD-NEXT: fmov x4, d1
+; CHECK-SD-NEXT: fmov x6, d0
+; CHECK-SD-NEXT: fmov x0, d3
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: v4i128:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: mov v0.d[0], x0
+; CHECK-GI-NEXT: mov v1.d[0], x2
+; CHECK-GI-NEXT: mov v2.d[0], x4
+; CHECK-GI-NEXT: mov v3.d[0], x6
+; CHECK-GI-NEXT: mov v0.d[1], x1
+; CHECK-GI-NEXT: mov v1.d[1], x3
+; CHECK-GI-NEXT: mov v2.d[1], x5
+; CHECK-GI-NEXT: mov v3.d[1], x7
+; CHECK-GI-NEXT: mov x1, xzr
+; CHECK-GI-NEXT: mov x3, xzr
+; CHECK-GI-NEXT: mov x5, xzr
+; CHECK-GI-NEXT: mov x7, xzr
+; CHECK-GI-NEXT: cnt v0.16b, v0.16b
+; CHECK-GI-NEXT: cnt v1.16b, v1.16b
+; CHECK-GI-NEXT: cnt v2.16b, v2.16b
+; CHECK-GI-NEXT: cnt v3.16b, v3.16b
+; CHECK-GI-NEXT: uaddlv h0, v0.16b
+; CHECK-GI-NEXT: uaddlv h1, v1.16b
+; CHECK-GI-NEXT: uaddlv h2, v2.16b
+; CHECK-GI-NEXT: uaddlv h3, v3.16b
+; CHECK-GI-NEXT: mov w0, v0.s[0]
+; CHECK-GI-NEXT: mov w2, v1.s[0]
+; CHECK-GI-NEXT: mov w4, v2.s[0]
+; CHECK-GI-NEXT: mov w6, v3.s[0]
+; CHECK-GI-NEXT: ret
entry:
%s = call <4 x i128> @llvm.ctpop(<4 x i128> %d)
ret <4 x i128> %s
}
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; CHECK-GI: {{.*}}
-; CHECK-SD: {{.*}}
diff --git a/llvm/test/CodeGen/AArch64/cttz.ll b/llvm/test/CodeGen/AArch64/cttz.ll
index a254df229c127..5d9ca94e0720d 100644
--- a/llvm/test/CodeGen/AArch64/cttz.ll
+++ b/ll...
[truncated]
.clampNumElements(0, v2s32, v4s32)
.clampNumElements(0, v2s64, v2s64)
.moreElementsToNextPow2(0)
.scalarizeIf(scalarOrEltWiderThan(0, 64), 0);
Simplify to just scalarize()?
LLVM Buildbot has detected a new failure on a builder. Full details, including the relevant piece of the build log, are available at: https://lab.llvm.org/buildbot/#/builders/10/builds/1736
LLVM Buildbot has detected a new failure on a builder. Full details, including the relevant piece of the build log, are available at: https://lab.llvm.org/buildbot/#/builders/73/builds/14944