-
Notifications
You must be signed in to change notification settings - Fork 13.1k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[SROA] Vector promote some memsets #133301
base: main
Are you sure you want to change the base?
Conversation
@llvm/pr-subscribers-llvm-transforms @llvm/pr-subscribers-backend-amdgpu Author: None (macurtis-amd) ChangesPatch is 32.26 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/133301.diff 7 Files Affected:
diff --git a/clang/test/CodeGenOpenCL/amdgpu-nullptr.cl b/clang/test/CodeGenOpenCL/amdgpu-nullptr.cl
index a0c106bca83c9..927cb3f38fa9c 100644
--- a/clang/test/CodeGenOpenCL/amdgpu-nullptr.cl
+++ b/clang/test/CodeGenOpenCL/amdgpu-nullptr.cl
@@ -503,21 +503,19 @@ void cast_bool_generic(generic char* p) {
*p = 0;
}
-// Test initialize a struct using memset.
-// For large structures which is mostly zero, clang generats llvm.memset for
-// the zero part and store for non-zero members.
+// Test initialization of a struct with a private member.
typedef struct {
long a, b, c, d;
private char *p;
} StructTy3;
-// CHECK-LABEL: test_memset_private
-// CHECK: call void @llvm.memset.p5.i64(ptr addrspace(5) noundef align 8 {{.*}}, i8 0, i64 32, i1 false)
+// CHECK-LABEL: test_struct_private_member
+// CHECK: store <32 x i8> zeroinitializer, ptr addrspace(5) {{.*}}, align 8
// CHECK: [[GEP:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(5) %ptr, i32 32
// CHECK: store ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)), ptr addrspace(5) [[GEP]]
// CHECK: [[GEP1:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(5) {{.*}}, i32 36
// CHECK: store i32 0, ptr addrspace(5) [[GEP1]], align 4
-void test_memset_private(private StructTy3 *ptr) {
+void test_struct_private_member(private StructTy3 *ptr) {
StructTy3 S3 = {0, 0, 0, 0, 0};
*ptr = S3;
}
diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp
index 86be20c799a68..3ded637a5c63b 100644
--- a/llvm/lib/Transforms/Scalar/SROA.cpp
+++ b/llvm/lib/Transforms/Scalar/SROA.cpp
@@ -1011,6 +1011,26 @@ static Value *foldPHINodeOrSelectInst(Instruction &I) {
return foldSelectInst(cast<SelectInst>(I));
}
+/// Returns a fixed vector type equivalent to the memory set by II or nullptr if
+/// unable to do so.
+static FixedVectorType *getVectorTypeFor(const MemSetInst &II,
+ const DataLayout &DL) {
+ const ConstantInt *Length = dyn_cast<ConstantInt>(II.getLength());
+ if (!Length)
+ return nullptr;
+
+ APInt Val = Length->getValue();
+ if (Val.ugt(std::numeric_limits<unsigned>::max()))
+ return nullptr;
+
+ auto *VTy =
+ FixedVectorType::get(II.getValue()->getType(), Val.getZExtValue());
+ if (DL.getTypeStoreSizeInBits(VTy) != DL.getTypeAllocSizeInBits(VTy))
+ return nullptr;
+
+ return VTy;
+}
+
/// Builder for the alloca slices.
///
/// This class builds a set of alloca slices by recursively visiting the uses
@@ -1099,15 +1119,16 @@ class AllocaSlices::SliceBuilder : public PtrUseVisitor<SliceBuilder> {
return Base::visitGetElementPtrInst(GEPI);
}
+ bool isSplittableMemOp(Type *Ty, bool IsVolatile) {
+ return Ty->isIntegerTy() && !IsVolatile && DL.typeSizeEqualsStoreSize(Ty);
+ }
+
void handleLoadOrStore(Type *Ty, Instruction &I, const APInt &Offset,
uint64_t Size, bool IsVolatile) {
// We allow splitting of non-volatile loads and stores where the type is an
// integer type. These may be used to implement 'memcpy' or other "transfer
// of bits" patterns.
- bool IsSplittable =
- Ty->isIntegerTy() && !IsVolatile && DL.typeSizeEqualsStoreSize(Ty);
-
- insertUse(I, Offset, Size, IsSplittable);
+ insertUse(I, Offset, Size, isSplittableMemOp(Ty, IsVolatile));
}
void visitLoadInst(LoadInst &LI) {
@@ -1170,10 +1191,23 @@ class AllocaSlices::SliceBuilder : public PtrUseVisitor<SliceBuilder> {
if (!IsOffsetKnown)
return PI.setAborted(&II);
+ auto IsSplittable = [&]() {
+ FixedVectorType *VTy = getVectorTypeFor(II, DL);
+ Type *ATy = AS.AI.getAllocatedType();
+
+ if (!Length)
+ return false;
+ if (!VTy)
+ return true;
+ if (DL.getTypeAllocSize(VTy) != DL.getTypeAllocSize(ATy))
+ return true;
+ return isSplittableMemOp(ATy, II.isVolatile());
+ };
+
insertUse(II, Offset,
Length ? Length->getLimitedValue()
: AllocSize - Offset.getLimitedValue(),
- (bool)Length);
+ IsSplittable());
}
void visitMemTransferInst(MemTransferInst &II) {
@@ -2072,8 +2106,20 @@ static bool isVectorPromotionViableForSlice(Partition &P, const Slice &S,
if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(U->getUser())) {
if (MI->isVolatile())
return false;
- if (!S.isSplittable())
- return false; // Skip any unsplittable intrinsics.
+
+ auto *II = dyn_cast<MemSetInst>(U->getUser());
+ if (!II && !S.isSplittable()) {
+ // Skip any non-memset unsplittable intrinsics.
+ return false;
+ }
+ if (II) {
+ // For memset, allow if we have a suitable vector type
+ Type *VTy = getVectorTypeFor(*II, DL);
+ if (!VTy)
+ return false;
+ if (!canConvertValue(DL, SliceTy, VTy))
+ return false;
+ }
} else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(U->getUser())) {
if (!II->isLifetimeStartOrEnd() && !II->isDroppable())
return false;
@@ -2316,12 +2362,15 @@ static VectorType *isVectorPromotionViable(Partition &P, const DataLayout &DL) {
// Put load and store types into a set for de-duplication.
for (const Slice &S : P) {
- Type *Ty;
+ Type *Ty = nullptr;
if (auto *LI = dyn_cast<LoadInst>(S.getUse()->getUser()))
Ty = LI->getType();
else if (auto *SI = dyn_cast<StoreInst>(S.getUse()->getUser()))
Ty = SI->getValueOperand()->getType();
- else
+ else if (auto *II = dyn_cast<MemSetInst>(S.getUse()->getUser()))
+ Ty = getVectorTypeFor(*II, DL);
+
+ if (!Ty)
continue;
auto CandTy = Ty->getScalarType();
diff --git a/llvm/test/DebugInfo/Generic/assignment-tracking/sroa/user-memcpy.ll b/llvm/test/DebugInfo/Generic/assignment-tracking/sroa/user-memcpy.ll
index af3070511e345..a9d0b10586583 100644
--- a/llvm/test/DebugInfo/Generic/assignment-tracking/sroa/user-memcpy.ll
+++ b/llvm/test/DebugInfo/Generic/assignment-tracking/sroa/user-memcpy.ll
@@ -21,8 +21,7 @@
;; Allocas have been promoted - the linked dbg.assigns have been removed.
;; | V3i point = {0, 0, 0};
-; CHECK-NEXT: #dbg_value(i64 0, ![[point:[0-9]+]], !DIExpression(DW_OP_LLVM_fragment, 0, 64),
-; CHECK-NEXT: #dbg_value(i64 0, ![[point]], !DIExpression(DW_OP_LLVM_fragment, 64, 64),
+; CHECK-NEXT: #dbg_value(<16 x i8> zeroinitializer, ![[point:[0-9]+]], !DIExpression(DW_OP_LLVM_fragment, 0, 128),
;; point.z = 5000;
; CHECK-NEXT: #dbg_value(i64 5000, ![[point]], !DIExpression(DW_OP_LLVM_fragment, 128, 64),
@@ -32,17 +31,20 @@
;; local.other.x = global.other.x
;; local.other.y = global.other.y
;; local.other.z = global.other.z
-; CHECK-NEXT: %other.sroa.0.0.copyload = load i64, ptr @__const._Z3funv.other
+; CHECK-NEXT: %other.sroa.0.0.copyload = load <8 x i8>, ptr @__const._Z3funv.other
; CHECK-NEXT: %other.sroa.2.0.copyload = load i64, ptr getelementptr inbounds (i8, ptr @__const._Z3funv.other, i64 8)
; CHECK-NEXT: %other.sroa.3.0.copyload = load i64, ptr getelementptr inbounds (i8, ptr @__const._Z3funv.other, i64 16)
-; CHECK-NEXT: #dbg_value(i64 %other.sroa.0.0.copyload, ![[other:[0-9]+]], !DIExpression(DW_OP_LLVM_fragment, 0, 64),
+; CHECK-NEXT: #dbg_value(<8 x i8> %other.sroa.0.0.copyload, ![[other:[0-9]+]], !DIExpression(DW_OP_LLVM_fragment, 0, 64),
; CHECK-NEXT: #dbg_value(i64 %other.sroa.2.0.copyload, ![[other]], !DIExpression(DW_OP_LLVM_fragment, 64, 64),
; CHECK-NEXT: #dbg_value(i64 %other.sroa.3.0.copyload, ![[other]], !DIExpression(DW_OP_LLVM_fragment, 128, 64),
;; | std::memcpy(&point.y, &other.x, sizeof(long) * 2);
;; other is now 3 scalars:
;; point.y = other.x
-; CHECK-NEXT: #dbg_value(i64 %other.sroa.0.0.copyload, ![[point]], !DIExpression(DW_OP_LLVM_fragment, 64, 64),
+; CHECK-NEXT: %point.sroa.0.sroa.0.8.vec.expand = shufflevector <8 x i8> %other.sroa.0.0.copyload, <8 x i8> poison, <16 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>,
+; CHECK-NEXT: %point.sroa.0.sroa.0.8.vecblend = select <16 x i1> <i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i8> %point.sroa.0.sroa.0.8.vec.expand, <16 x i8> zeroinitializer,
+; CHECK-NEXT: #dbg_value(<16 x i8> %point.sroa.0.sroa.0.8.vecblend, ![[point]], !DIExpression(DW_OP_LLVM_fragment, 64, 64),
+
;;
;; point.z = other.y
; CHECK-NEXT: #dbg_value(i64 %other.sroa.2.0.copyload, ![[point]], !DIExpression(DW_OP_LLVM_fragment, 128, 64),
diff --git a/llvm/test/DebugInfo/X86/sroasplit-5.ll b/llvm/test/DebugInfo/X86/sroasplit-5.ll
index 34aa30f55728e..d2ecc9598e3c4 100644
--- a/llvm/test/DebugInfo/X86/sroasplit-5.ll
+++ b/llvm/test/DebugInfo/X86/sroasplit-5.ll
@@ -21,10 +21,9 @@ target triple = "x86_64-unknown-linux-gnu"
;
; There should be no debug info for the padding.
; CHECK-NOT: DW_OP_LLVM_fragment, 56
-; CHECK: DIExpression(DW_OP_LLVM_fragment, 0, 32)
-; CHECK-NOT: DW_OP_LLVM_fragment, 56
-; CHECK: DIExpression(DW_OP_LLVM_fragment, 32, 24)
+; CHECK: ![[a:[0-9]+]], !DIExpression(),
; CHECK-NOT: DW_OP_LLVM_fragment, 56
+; CHECK: ![[a]] = !DILocalVariable(name: "a",
%struct.prog_src_register = type { i32, i24 }
; Function Attrs: nounwind
diff --git a/llvm/test/Transforms/SROA/basictest.ll b/llvm/test/Transforms/SROA/basictest.ll
index 145da5259fab3..03590bbce146a 100644
--- a/llvm/test/Transforms/SROA/basictest.ll
+++ b/llvm/test/Transforms/SROA/basictest.ll
@@ -529,8 +529,9 @@ entry:
define ptr @test10() {
; CHECK-LABEL: @test10(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = ptrtoint ptr null to i64
-; CHECK-NEXT: ret ptr null
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> zeroinitializer to i64
+; CHECK-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr
+; CHECK-NEXT: ret ptr [[TMP1]]
;
entry:
%a = alloca [8 x i8]
@@ -1075,26 +1076,13 @@ define void @PR14059.1(ptr %d) {
;
; CHECK-LABEL: @PR14059.1(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = bitcast double undef to i64
-; CHECK-NEXT: [[X_SROA_0_I_0_INSERT_MASK:%.*]] = and i64 [[TMP0]], -4294967296
-; CHECK-NEXT: [[X_SROA_0_I_0_INSERT_INSERT:%.*]] = or i64 [[X_SROA_0_I_0_INSERT_MASK]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[X_SROA_0_I_0_INSERT_INSERT]] to double
-; CHECK-NEXT: [[TMP2:%.*]] = bitcast double [[TMP1]] to i64
-; CHECK-NEXT: [[X_SROA_0_I_2_INSERT_MASK:%.*]] = and i64 [[TMP2]], -281474976645121
-; CHECK-NEXT: [[X_SROA_0_I_2_INSERT_INSERT:%.*]] = or i64 [[X_SROA_0_I_2_INSERT_MASK]], 0
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast i64 [[X_SROA_0_I_2_INSERT_INSERT]] to double
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast double [[TMP3]] to i64
-; CHECK-NEXT: [[X_SROA_0_I_4_COPYLOAD:%.*]] = load i32, ptr [[D:%.*]], align 1
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast double 0.000000e+00 to i64
-; CHECK-NEXT: [[X_SROA_0_I_4_INSERT_EXT:%.*]] = zext i32 [[X_SROA_0_I_4_COPYLOAD]] to i64
-; CHECK-NEXT: [[X_SROA_0_I_4_INSERT_SHIFT:%.*]] = shl i64 [[X_SROA_0_I_4_INSERT_EXT]], 32
-; CHECK-NEXT: [[X_SROA_0_I_4_INSERT_MASK3:%.*]] = and i64 [[TMP5]], 4294967295
-; CHECK-NEXT: [[X_SROA_0_I_4_INSERT_INSERT4:%.*]] = or i64 [[X_SROA_0_I_4_INSERT_MASK3]], [[X_SROA_0_I_4_INSERT_SHIFT]]
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast i64 [[X_SROA_0_I_4_INSERT_INSERT4]] to double
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast double [[TMP6]] to i64
-; CHECK-NEXT: [[X_SROA_0_I_4_INSERT_MASK:%.*]] = and i64 [[TMP7]], 4294967295
-; CHECK-NEXT: [[X_SROA_0_I_4_INSERT_INSERT:%.*]] = or i64 [[X_SROA_0_I_4_INSERT_MASK]], 4607182418800017408
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast i64 [[X_SROA_0_I_4_INSERT_INSERT]] to double
+; CHECK-NEXT: [[X_SROA_0_I_SROA_0_0_VECBLEND:%.*]] = select <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 false>, <8 x i8> <i8 0, i8 0, i8 0, i8 0, i8 undef, i8 undef, i8 undef, i8 undef>, <8 x i8> undef
+; CHECK-NEXT: [[X_SROA_0_I_SROA_0_2_VECBLEND:%.*]] = select <8 x i1> <i1 false, i1 false, i1 true, i1 true, i1 true, i1 true, i1 false, i1 false>, <8 x i8> <i8 undef, i8 undef, i8 0, i8 0, i8 0, i8 0, i8 undef, i8 undef>, <8 x i8> [[X_SROA_0_I_SROA_0_0_VECBLEND]]
+; CHECK-NEXT: [[X_SROA_0_I_SROA_0_4_COPYLOAD:%.*]] = load <4 x i8>, ptr [[D:%.*]], align 1
+; CHECK-NEXT: [[X_SROA_0_I_SROA_0_4_VEC_EXPAND:%.*]] = shufflevector <4 x i8> [[X_SROA_0_I_SROA_0_4_COPYLOAD]], <4 x i8> poison, <8 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[X_SROA_0_I_SROA_0_4_VECBLEND2:%.*]] = select <8 x i1> <i1 false, i1 false, i1 false, i1 false, i1 true, i1 true, i1 true, i1 true>, <8 x i8> [[X_SROA_0_I_SROA_0_4_VEC_EXPAND]], <8 x i8> zeroinitializer
+; CHECK-NEXT: [[X_SROA_0_I_SROA_0_4_VECBLEND:%.*]] = select <8 x i1> <i1 false, i1 false, i1 false, i1 false, i1 true, i1 true, i1 true, i1 true>, <8 x i8> <i8 undef, i8 undef, i8 undef, i8 undef, i8 extractelement (<4 x i8> bitcast (<1 x i32> splat (i32 1072693248) to <4 x i8>), i32 0), i8 extractelement (<4 x i8> bitcast (<1 x i32> splat (i32 1072693248) to <4 x i8>), i32 1), i8 extractelement (<4 x i8> bitcast (<1 x i32> splat (i32 1072693248) to <4 x i8>), i32 2), i8 extractelement (<4 x i8> bitcast (<1 x i32> splat (i32 1072693248) to <4 x i8>), i32 3)>, <8 x i8> [[X_SROA_0_I_SROA_0_4_VECBLEND2]]
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[X_SROA_0_I_SROA_0_4_VECBLEND]] to double
; CHECK-NEXT: [[ACCUM_REAL_I:%.*]] = load double, ptr [[D]], align 8
; CHECK-NEXT: [[ADD_R_I:%.*]] = fadd double [[ACCUM_REAL_I]], [[TMP8]]
; CHECK-NEXT: store double [[ADD_R_I]], ptr [[D]], align 8
@@ -1332,10 +1320,10 @@ define void @PR15674(ptr %data, ptr %src, i32 %size) {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP_SROA_0:%.*]] = alloca i32, align 4
; CHECK-NEXT: switch i32 [[SIZE:%.*]], label [[END:%.*]] [
-; CHECK-NEXT: i32 4, label [[BB4:%.*]]
-; CHECK-NEXT: i32 3, label [[BB3:%.*]]
-; CHECK-NEXT: i32 2, label [[BB2:%.*]]
-; CHECK-NEXT: i32 1, label [[BB1:%.*]]
+; CHECK-NEXT: i32 4, label [[BB4:%.*]]
+; CHECK-NEXT: i32 3, label [[BB3:%.*]]
+; CHECK-NEXT: i32 2, label [[BB2:%.*]]
+; CHECK-NEXT: i32 1, label [[BB1:%.*]]
; CHECK-NEXT: ]
; CHECK: bb4:
; CHECK-NEXT: [[SRC_GEP3:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i32 3
diff --git a/llvm/test/Transforms/SROA/slice-width.ll b/llvm/test/Transforms/SROA/slice-width.ll
index eabb6978c9125..63362534ff812 100644
--- a/llvm/test/Transforms/SROA/slice-width.ll
+++ b/llvm/test/Transforms/SROA/slice-width.ll
@@ -68,9 +68,8 @@ define void @memcpy_fp80_padding() {
define void @memset_fp80_padding() {
; CHECK-LABEL: @memset_fp80_padding(
-; CHECK-NEXT: [[X_SROA_0:%.*]] = alloca x86_fp80, align 16
-; CHECK-NEXT: call void @llvm.memset.p0.i32(ptr align 16 [[X_SROA_0]], i8 -1, i32 16, i1 false)
-; CHECK-NEXT: store i64 -1, ptr @i64_sink, align 4
+; CHECK-NEXT: [[X_SROA_0_16_VEC_EXTRACT:%.*]] = extractelement <4 x i64> splat (i64 -1), i32 2
+; CHECK-NEXT: store i64 [[X_SROA_0_16_VEC_EXTRACT]], ptr @i64_sink, align 4
; CHECK-NEXT: ret void
;
%x = alloca %union.Foo
diff --git a/llvm/test/Transforms/SROA/sroa-common-type-fail-promotion.ll b/llvm/test/Transforms/SROA/sroa-common-type-fail-promotion.ll
index 72014912edd20..62df5121215bf 100644
--- a/llvm/test/Transforms/SROA/sroa-common-type-fail-promotion.ll
+++ b/llvm/test/Transforms/SROA/sroa-common-type-fail-promotion.ll
@@ -245,25 +245,31 @@ bb:
define amdgpu_kernel void @test_half_array() #0 {
; CHECK-LABEL: @test_half_array(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[B_BLOCKWISE_COPY_SROA_0:%.*]] = alloca float, align 16
-; CHECK-NEXT: [[B_BLOCKWISE_COPY_SROA_4:%.*]] = alloca float, align 4
-; CHECK-NEXT: call void @llvm.memset.p0.i32(ptr align 16 [[B_BLOCKWISE_COPY_SROA_0]], i8 0, i32 4, i1 false)
-; CHECK-NEXT: call void @llvm.memset.p0.i32(ptr align 4 [[B_BLOCKWISE_COPY_SROA_4]], i8 0, i32 4, i1 false)
-; CHECK-NEXT: [[TMP0:%.*]] = bitcast float undef to i32
-; CHECK-NEXT: [[TMP1:%.*]] = bitcast float undef to i32
; CHECK-NEXT: [[DATA:%.*]] = load [4 x float], ptr undef, align 4
; CHECK-NEXT: [[DATA_FCA_0_EXTRACT:%.*]] = extractvalue [4 x float] [[DATA]], 0
-; CHECK-NEXT: store float [[DATA_FCA_0_EXTRACT]], ptr [[B_BLOCKWISE_COPY_SROA_0]], align 16
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast float [[DATA_FCA_0_EXTRACT]] to <2 x i16>
+; CHECK-NEXT: [[B_BLOCKWISE_COPY_SROA_0_0_VEC_EXPAND:%.*]] = shufflevector <2 x i16> [[TMP0]], <2 x i16> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[B_BLOCKWISE_COPY_SROA_0_0_VECBLEND:%.*]] = select <8 x i1> <i1 true, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false>, <8 x i16> [[B_BLOCKWISE_COPY_SROA_0_0_VEC_EXPAND]], <8 x i16> zeroinitializer
; CHECK-NEXT: [[DATA_FCA_1_EXTRACT:%.*]] = extractvalue [4 x float] [[DATA]], 1
-; CHECK-NEXT: store float [[DATA_FCA_1_EXTRACT]], ptr [[B_BLOCKWISE_COPY_SROA_4]], align 4
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast float [[DATA_FCA_1_EXTRACT]] to <2 x i16>
+; CHECK-NEXT: [[B_BLOCKWISE_COPY_SROA_0_4_VEC_EXPAND:%.*]] = shufflevector <2 x i16> [[TMP1]], <2 x i16> poison, <8 x i32> <i32 poison, i32 poison, i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[B_BLOCKWISE_COPY_SROA_0_4_VECBLEND:%.*]] = select <8 x i1> <i1 false, i1 false, i1 true, i1 true, i1 false, i1 false, i1 false, i1 false>, <8 x i16> [[B_BLOCKWISE_COPY_SROA_0_4_VEC_EXPAND]], <8 x i16> [[B_BLOCKWISE_COPY_SROA_0_0_VECBLEND]]
; CHECK-NEXT: [[DATA_FCA_2_EXTRACT:%.*]] = extractvalue [4 x float] [[DATA]], 2
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast float [[DATA_FCA_2_EXTRACT]] to <2 x i16>
+; CHECK-NEXT: [[B_BLOCKWISE_COPY_SROA_0_8_VEC_EXPAND:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <8 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT: [[B_BLOCKWISE_COPY_SROA_0_8_VECBLEND:%.*]] = select <8 x i1> <i1 false, i1 false, i1 false, i1 false, i1 true, i1 true, i1 false, i1 false>, <8 x i16> [[B_BLOCKWISE_COPY_SROA_0_8_VEC_EXPAND]], <8 x i16> [[B_BLOCKWISE_COPY_SROA_0_4_VECBLEND]]
; CHECK-NEXT: [[DATA_FCA_3_EXTRACT:%.*]] = extractvalue [4 x float] [[DATA]], 3
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast float [[DATA_FCA_3_EXTRACT]] to <2 x i16>
+; CHECK-NEXT: [[B_BLOCKWISE_COPY_SROA_0_12_VEC_EXPAND:%.*]] = shufflevector <2 x i16> [[TMP3]], <2 x i16> poison, <8 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0, i32 1>
+; CHECK-NEXT: [[B_BLOCKWISE_COPY_SROA_0_12_VECBLEND:%.*]] = select <8 x i1> <i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true, i1 true>, <8 x i16> [[B_BLOCKWISE_COPY_SROA_0_12_VEC_EXPAND]], <8 x i16> [[B_BLOCKWISE_COPY_SROA_0_8_VECBLEND]]
; CHECK-NEXT: br label [[BB:%.*]]
; CHECK: bb:
-; CHECK-NEXT: [[B_BLOCKWISE_COPY_SROA_0_0_B_BLOCKWISE_COPY_SROA_0_0_LOAD1:%.*]] = load half, ptr [[B_BLOCKWISE_COPY_SROA_0]], align 16
-; CHECK-NEXT: [[B_BLOCKWISE_COPY_SROA_0_2_PTR2_SROA_IDX1:%.*]] = getelementptr inbounds i8, ptr [[B_BLOCKWISE_COPY_SROA_0]], i64 2
-; CHECK-NEXT: [[B_BLOCKWISE_COPY_SROA_0_2_B_BLOCKWISE_COPY_SROA_0_2_LOAD2:%.*]] = load half, ptr [[B_BLOCKWISE_COPY_SROA_0_2_PTR2_SROA_IDX1]], align 2
-; CHECK-NEXT: [[B_BLOCKWISE_COPY_SROA_4_0_B_BLOCKWISE_COPY_SROA_4_4_LOAD3:%.*]] = load half, ptr [[B_BLOCKWISE_COPY_SROA_4]], align 4
+; CHECK-NEXT: [[B_BLOCKWISE_COPY_SROA_0_0_VEC_EXTRACT:%.*]] = extractelement <8 x i16> [[B_BLOCKWISE_COPY_SROA_0_12_VECBLEND]], i32 0
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[B_BLOCKWISE_COPY_SROA_0_0_VEC_EXTRACT]] to half
+; CHECK-NEXT: [[B_BLOCKWISE_COPY_SROA_0_2_VEC_EXTRACT:%.*]] = extractelement <8 x i16> [[B_BLOCKWISE_COPY_SROA_0_12_VECBLEND]], i32 1
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[B_BLOCKWISE_COPY_SROA_0_2_VEC_EXTRACT]] to half
+; CHECK-NEXT: [[B_BLOCKWISE_COPY_SROA_0_4_VEC_EXTRACT:%.*]] = extractelement <8 x i16> [[B_BLOCKWISE_COPY_SROA_0_12_VECBLEND]], i32 2
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[B_BLOCKWISE_COPY_SROA_0_4_VEC_EXTRACT]] to half
; CHECK-NEXT: ret void
;
entry:
@@ -285,17 +291,17 @@ bb:
define amdgpu_kernel void @test_array_vector() #0 {
; CHECK-LAB...
[truncated]
|
@llvm/pr-subscribers-clang Author: None (macurtis-amd) ChangesPatch is 32.26 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/133301.diff 7 Files Affected:
diff --git a/clang/test/CodeGenOpenCL/amdgpu-nullptr.cl b/clang/test/CodeGenOpenCL/amdgpu-nullptr.cl
index a0c106bca83c9..927cb3f38fa9c 100644
--- a/clang/test/CodeGenOpenCL/amdgpu-nullptr.cl
+++ b/clang/test/CodeGenOpenCL/amdgpu-nullptr.cl
@@ -503,21 +503,19 @@ void cast_bool_generic(generic char* p) {
*p = 0;
}
-// Test initialize a struct using memset.
-// For large structures which is mostly zero, clang generats llvm.memset for
-// the zero part and store for non-zero members.
+// Test initialization of a struct with a private member.
typedef struct {
long a, b, c, d;
private char *p;
} StructTy3;
-// CHECK-LABEL: test_memset_private
-// CHECK: call void @llvm.memset.p5.i64(ptr addrspace(5) noundef align 8 {{.*}}, i8 0, i64 32, i1 false)
+// CHECK-LABEL: test_struct_private_member
+// CHECK: store <32 x i8> zeroinitializer, ptr addrspace(5) {{.*}}, align 8
// CHECK: [[GEP:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(5) %ptr, i32 32
// CHECK: store ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)), ptr addrspace(5) [[GEP]]
// CHECK: [[GEP1:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(5) {{.*}}, i32 36
// CHECK: store i32 0, ptr addrspace(5) [[GEP1]], align 4
-void test_memset_private(private StructTy3 *ptr) {
+void test_struct_private_member(private StructTy3 *ptr) {
StructTy3 S3 = {0, 0, 0, 0, 0};
*ptr = S3;
}
diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp
index 86be20c799a68..3ded637a5c63b 100644
--- a/llvm/lib/Transforms/Scalar/SROA.cpp
+++ b/llvm/lib/Transforms/Scalar/SROA.cpp
@@ -1011,6 +1011,26 @@ static Value *foldPHINodeOrSelectInst(Instruction &I) {
return foldSelectInst(cast<SelectInst>(I));
}
+/// Returns a fixed vector type equivalent to the memory set by II or nullptr if
+/// unable to do so.
+static FixedVectorType *getVectorTypeFor(const MemSetInst &II,
+ const DataLayout &DL) {
+ const ConstantInt *Length = dyn_cast<ConstantInt>(II.getLength());
+ if (!Length)
+ return nullptr;
+
+ APInt Val = Length->getValue();
+ if (Val.ugt(std::numeric_limits<unsigned>::max()))
+ return nullptr;
+
+ auto *VTy =
+ FixedVectorType::get(II.getValue()->getType(), Val.getZExtValue());
+ if (DL.getTypeStoreSizeInBits(VTy) != DL.getTypeAllocSizeInBits(VTy))
+ return nullptr;
+
+ return VTy;
+}
+
/// Builder for the alloca slices.
///
/// This class builds a set of alloca slices by recursively visiting the uses
@@ -1099,15 +1119,16 @@ class AllocaSlices::SliceBuilder : public PtrUseVisitor<SliceBuilder> {
return Base::visitGetElementPtrInst(GEPI);
}
+ bool isSplittableMemOp(Type *Ty, bool IsVolatile) {
+ return Ty->isIntegerTy() && !IsVolatile && DL.typeSizeEqualsStoreSize(Ty);
+ }
+
void handleLoadOrStore(Type *Ty, Instruction &I, const APInt &Offset,
uint64_t Size, bool IsVolatile) {
// We allow splitting of non-volatile loads and stores where the type is an
// integer type. These may be used to implement 'memcpy' or other "transfer
// of bits" patterns.
- bool IsSplittable =
- Ty->isIntegerTy() && !IsVolatile && DL.typeSizeEqualsStoreSize(Ty);
-
- insertUse(I, Offset, Size, IsSplittable);
+ insertUse(I, Offset, Size, isSplittableMemOp(Ty, IsVolatile));
}
void visitLoadInst(LoadInst &LI) {
@@ -1170,10 +1191,23 @@ class AllocaSlices::SliceBuilder : public PtrUseVisitor<SliceBuilder> {
if (!IsOffsetKnown)
return PI.setAborted(&II);
+ auto IsSplittable = [&]() {
+ FixedVectorType *VTy = getVectorTypeFor(II, DL);
+ Type *ATy = AS.AI.getAllocatedType();
+
+ if (!Length)
+ return false;
+ if (!VTy)
+ return true;
+ if (DL.getTypeAllocSize(VTy) != DL.getTypeAllocSize(ATy))
+ return true;
+ return isSplittableMemOp(ATy, II.isVolatile());
+ };
+
insertUse(II, Offset,
Length ? Length->getLimitedValue()
: AllocSize - Offset.getLimitedValue(),
- (bool)Length);
+ IsSplittable());
}
void visitMemTransferInst(MemTransferInst &II) {
@@ -2072,8 +2106,20 @@ static bool isVectorPromotionViableForSlice(Partition &P, const Slice &S,
if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(U->getUser())) {
if (MI->isVolatile())
return false;
- if (!S.isSplittable())
- return false; // Skip any unsplittable intrinsics.
+
+ auto *II = dyn_cast<MemSetInst>(U->getUser());
+ if (!II && !S.isSplittable()) {
+ // Skip any non-memset unsplittable intrinsics.
+ return false;
+ }
+ if (II) {
+ // For memset, allow if we have a suitable vector type
+ Type *VTy = getVectorTypeFor(*II, DL);
+ if (!VTy)
+ return false;
+ if (!canConvertValue(DL, SliceTy, VTy))
+ return false;
+ }
} else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(U->getUser())) {
if (!II->isLifetimeStartOrEnd() && !II->isDroppable())
return false;
@@ -2316,12 +2362,15 @@ static VectorType *isVectorPromotionViable(Partition &P, const DataLayout &DL) {
// Put load and store types into a set for de-duplication.
for (const Slice &S : P) {
- Type *Ty;
+ Type *Ty = nullptr;
if (auto *LI = dyn_cast<LoadInst>(S.getUse()->getUser()))
Ty = LI->getType();
else if (auto *SI = dyn_cast<StoreInst>(S.getUse()->getUser()))
Ty = SI->getValueOperand()->getType();
- else
+ else if (auto *II = dyn_cast<MemSetInst>(S.getUse()->getUser()))
+ Ty = getVectorTypeFor(*II, DL);
+
+ if (!Ty)
continue;
auto CandTy = Ty->getScalarType();
diff --git a/llvm/test/DebugInfo/Generic/assignment-tracking/sroa/user-memcpy.ll b/llvm/test/DebugInfo/Generic/assignment-tracking/sroa/user-memcpy.ll
index af3070511e345..a9d0b10586583 100644
--- a/llvm/test/DebugInfo/Generic/assignment-tracking/sroa/user-memcpy.ll
+++ b/llvm/test/DebugInfo/Generic/assignment-tracking/sroa/user-memcpy.ll
@@ -21,8 +21,7 @@
;; Allocas have been promoted - the linked dbg.assigns have been removed.
;; | V3i point = {0, 0, 0};
-; CHECK-NEXT: #dbg_value(i64 0, ![[point:[0-9]+]], !DIExpression(DW_OP_LLVM_fragment, 0, 64),
-; CHECK-NEXT: #dbg_value(i64 0, ![[point]], !DIExpression(DW_OP_LLVM_fragment, 64, 64),
+; CHECK-NEXT: #dbg_value(<16 x i8> zeroinitializer, ![[point:[0-9]+]], !DIExpression(DW_OP_LLVM_fragment, 0, 128),
;; point.z = 5000;
; CHECK-NEXT: #dbg_value(i64 5000, ![[point]], !DIExpression(DW_OP_LLVM_fragment, 128, 64),
@@ -32,17 +31,20 @@
;; local.other.x = global.other.x
;; local.other.y = global.other.y
;; local.other.z = global.other.z
-; CHECK-NEXT: %other.sroa.0.0.copyload = load i64, ptr @__const._Z3funv.other
+; CHECK-NEXT: %other.sroa.0.0.copyload = load <8 x i8>, ptr @__const._Z3funv.other
; CHECK-NEXT: %other.sroa.2.0.copyload = load i64, ptr getelementptr inbounds (i8, ptr @__const._Z3funv.other, i64 8)
; CHECK-NEXT: %other.sroa.3.0.copyload = load i64, ptr getelementptr inbounds (i8, ptr @__const._Z3funv.other, i64 16)
-; CHECK-NEXT: #dbg_value(i64 %other.sroa.0.0.copyload, ![[other:[0-9]+]], !DIExpression(DW_OP_LLVM_fragment, 0, 64),
+; CHECK-NEXT: #dbg_value(<8 x i8> %other.sroa.0.0.copyload, ![[other:[0-9]+]], !DIExpression(DW_OP_LLVM_fragment, 0, 64),
; CHECK-NEXT: #dbg_value(i64 %other.sroa.2.0.copyload, ![[other]], !DIExpression(DW_OP_LLVM_fragment, 64, 64),
; CHECK-NEXT: #dbg_value(i64 %other.sroa.3.0.copyload, ![[other]], !DIExpression(DW_OP_LLVM_fragment, 128, 64),
;; | std::memcpy(&point.y, &other.x, sizeof(long) * 2);
;; other is now 3 scalars:
;; point.y = other.x
-; CHECK-NEXT: #dbg_value(i64 %other.sroa.0.0.copyload, ![[point]], !DIExpression(DW_OP_LLVM_fragment, 64, 64),
+; CHECK-NEXT: %point.sroa.0.sroa.0.8.vec.expand = shufflevector <8 x i8> %other.sroa.0.0.copyload, <8 x i8> poison, <16 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>,
+; CHECK-NEXT: %point.sroa.0.sroa.0.8.vecblend = select <16 x i1> <i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i8> %point.sroa.0.sroa.0.8.vec.expand, <16 x i8> zeroinitializer,
+; CHECK-NEXT: #dbg_value(<16 x i8> %point.sroa.0.sroa.0.8.vecblend, ![[point]], !DIExpression(DW_OP_LLVM_fragment, 64, 64),
+
;;
;; point.z = other.y
; CHECK-NEXT: #dbg_value(i64 %other.sroa.2.0.copyload, ![[point]], !DIExpression(DW_OP_LLVM_fragment, 128, 64),
diff --git a/llvm/test/DebugInfo/X86/sroasplit-5.ll b/llvm/test/DebugInfo/X86/sroasplit-5.ll
index 34aa30f55728e..d2ecc9598e3c4 100644
--- a/llvm/test/DebugInfo/X86/sroasplit-5.ll
+++ b/llvm/test/DebugInfo/X86/sroasplit-5.ll
@@ -21,10 +21,9 @@ target triple = "x86_64-unknown-linux-gnu"
;
; There should be no debug info for the padding.
; CHECK-NOT: DW_OP_LLVM_fragment, 56
-; CHECK: DIExpression(DW_OP_LLVM_fragment, 0, 32)
-; CHECK-NOT: DW_OP_LLVM_fragment, 56
-; CHECK: DIExpression(DW_OP_LLVM_fragment, 32, 24)
+; CHECK: ![[a:[0-9]+]], !DIExpression(),
; CHECK-NOT: DW_OP_LLVM_fragment, 56
+; CHECK: ![[a]] = !DILocalVariable(name: "a",
%struct.prog_src_register = type { i32, i24 }
; Function Attrs: nounwind
diff --git a/llvm/test/Transforms/SROA/basictest.ll b/llvm/test/Transforms/SROA/basictest.ll
index 145da5259fab3..03590bbce146a 100644
--- a/llvm/test/Transforms/SROA/basictest.ll
+++ b/llvm/test/Transforms/SROA/basictest.ll
@@ -529,8 +529,9 @@ entry:
define ptr @test10() {
; CHECK-LABEL: @test10(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = ptrtoint ptr null to i64
-; CHECK-NEXT: ret ptr null
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> zeroinitializer to i64
+; CHECK-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr
+; CHECK-NEXT: ret ptr [[TMP1]]
;
entry:
%a = alloca [8 x i8]
@@ -1075,26 +1076,13 @@ define void @PR14059.1(ptr %d) {
;
; CHECK-LABEL: @PR14059.1(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = bitcast double undef to i64
-; CHECK-NEXT: [[X_SROA_0_I_0_INSERT_MASK:%.*]] = and i64 [[TMP0]], -4294967296
-; CHECK-NEXT: [[X_SROA_0_I_0_INSERT_INSERT:%.*]] = or i64 [[X_SROA_0_I_0_INSERT_MASK]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[X_SROA_0_I_0_INSERT_INSERT]] to double
-; CHECK-NEXT: [[TMP2:%.*]] = bitcast double [[TMP1]] to i64
-; CHECK-NEXT: [[X_SROA_0_I_2_INSERT_MASK:%.*]] = and i64 [[TMP2]], -281474976645121
-; CHECK-NEXT: [[X_SROA_0_I_2_INSERT_INSERT:%.*]] = or i64 [[X_SROA_0_I_2_INSERT_MASK]], 0
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast i64 [[X_SROA_0_I_2_INSERT_INSERT]] to double
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast double [[TMP3]] to i64
-; CHECK-NEXT: [[X_SROA_0_I_4_COPYLOAD:%.*]] = load i32, ptr [[D:%.*]], align 1
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast double 0.000000e+00 to i64
-; CHECK-NEXT: [[X_SROA_0_I_4_INSERT_EXT:%.*]] = zext i32 [[X_SROA_0_I_4_COPYLOAD]] to i64
-; CHECK-NEXT: [[X_SROA_0_I_4_INSERT_SHIFT:%.*]] = shl i64 [[X_SROA_0_I_4_INSERT_EXT]], 32
-; CHECK-NEXT: [[X_SROA_0_I_4_INSERT_MASK3:%.*]] = and i64 [[TMP5]], 4294967295
-; CHECK-NEXT: [[X_SROA_0_I_4_INSERT_INSERT4:%.*]] = or i64 [[X_SROA_0_I_4_INSERT_MASK3]], [[X_SROA_0_I_4_INSERT_SHIFT]]
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast i64 [[X_SROA_0_I_4_INSERT_INSERT4]] to double
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast double [[TMP6]] to i64
-; CHECK-NEXT: [[X_SROA_0_I_4_INSERT_MASK:%.*]] = and i64 [[TMP7]], 4294967295
-; CHECK-NEXT: [[X_SROA_0_I_4_INSERT_INSERT:%.*]] = or i64 [[X_SROA_0_I_4_INSERT_MASK]], 4607182418800017408
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast i64 [[X_SROA_0_I_4_INSERT_INSERT]] to double
+; CHECK-NEXT: [[X_SROA_0_I_SROA_0_0_VECBLEND:%.*]] = select <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 false>, <8 x i8> <i8 0, i8 0, i8 0, i8 0, i8 undef, i8 undef, i8 undef, i8 undef>, <8 x i8> undef
+; CHECK-NEXT: [[X_SROA_0_I_SROA_0_2_VECBLEND:%.*]] = select <8 x i1> <i1 false, i1 false, i1 true, i1 true, i1 true, i1 true, i1 false, i1 false>, <8 x i8> <i8 undef, i8 undef, i8 0, i8 0, i8 0, i8 0, i8 undef, i8 undef>, <8 x i8> [[X_SROA_0_I_SROA_0_0_VECBLEND]]
+; CHECK-NEXT: [[X_SROA_0_I_SROA_0_4_COPYLOAD:%.*]] = load <4 x i8>, ptr [[D:%.*]], align 1
+; CHECK-NEXT: [[X_SROA_0_I_SROA_0_4_VEC_EXPAND:%.*]] = shufflevector <4 x i8> [[X_SROA_0_I_SROA_0_4_COPYLOAD]], <4 x i8> poison, <8 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[X_SROA_0_I_SROA_0_4_VECBLEND2:%.*]] = select <8 x i1> <i1 false, i1 false, i1 false, i1 false, i1 true, i1 true, i1 true, i1 true>, <8 x i8> [[X_SROA_0_I_SROA_0_4_VEC_EXPAND]], <8 x i8> zeroinitializer
+; CHECK-NEXT: [[X_SROA_0_I_SROA_0_4_VECBLEND:%.*]] = select <8 x i1> <i1 false, i1 false, i1 false, i1 false, i1 true, i1 true, i1 true, i1 true>, <8 x i8> <i8 undef, i8 undef, i8 undef, i8 undef, i8 extractelement (<4 x i8> bitcast (<1 x i32> splat (i32 1072693248) to <4 x i8>), i32 0), i8 extractelement (<4 x i8> bitcast (<1 x i32> splat (i32 1072693248) to <4 x i8>), i32 1), i8 extractelement (<4 x i8> bitcast (<1 x i32> splat (i32 1072693248) to <4 x i8>), i32 2), i8 extractelement (<4 x i8> bitcast (<1 x i32> splat (i32 1072693248) to <4 x i8>), i32 3)>, <8 x i8> [[X_SROA_0_I_SROA_0_4_VECBLEND2]]
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[X_SROA_0_I_SROA_0_4_VECBLEND]] to double
; CHECK-NEXT: [[ACCUM_REAL_I:%.*]] = load double, ptr [[D]], align 8
; CHECK-NEXT: [[ADD_R_I:%.*]] = fadd double [[ACCUM_REAL_I]], [[TMP8]]
; CHECK-NEXT: store double [[ADD_R_I]], ptr [[D]], align 8
@@ -1332,10 +1320,10 @@ define void @PR15674(ptr %data, ptr %src, i32 %size) {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP_SROA_0:%.*]] = alloca i32, align 4
; CHECK-NEXT: switch i32 [[SIZE:%.*]], label [[END:%.*]] [
-; CHECK-NEXT: i32 4, label [[BB4:%.*]]
-; CHECK-NEXT: i32 3, label [[BB3:%.*]]
-; CHECK-NEXT: i32 2, label [[BB2:%.*]]
-; CHECK-NEXT: i32 1, label [[BB1:%.*]]
+; CHECK-NEXT: i32 4, label [[BB4:%.*]]
+; CHECK-NEXT: i32 3, label [[BB3:%.*]]
+; CHECK-NEXT: i32 2, label [[BB2:%.*]]
+; CHECK-NEXT: i32 1, label [[BB1:%.*]]
; CHECK-NEXT: ]
; CHECK: bb4:
; CHECK-NEXT: [[SRC_GEP3:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i32 3
diff --git a/llvm/test/Transforms/SROA/slice-width.ll b/llvm/test/Transforms/SROA/slice-width.ll
index eabb6978c9125..63362534ff812 100644
--- a/llvm/test/Transforms/SROA/slice-width.ll
+++ b/llvm/test/Transforms/SROA/slice-width.ll
@@ -68,9 +68,8 @@ define void @memcpy_fp80_padding() {
define void @memset_fp80_padding() {
; CHECK-LABEL: @memset_fp80_padding(
-; CHECK-NEXT: [[X_SROA_0:%.*]] = alloca x86_fp80, align 16
-; CHECK-NEXT: call void @llvm.memset.p0.i32(ptr align 16 [[X_SROA_0]], i8 -1, i32 16, i1 false)
-; CHECK-NEXT: store i64 -1, ptr @i64_sink, align 4
+; CHECK-NEXT: [[X_SROA_0_16_VEC_EXTRACT:%.*]] = extractelement <4 x i64> splat (i64 -1), i32 2
+; CHECK-NEXT: store i64 [[X_SROA_0_16_VEC_EXTRACT]], ptr @i64_sink, align 4
; CHECK-NEXT: ret void
;
%x = alloca %union.Foo
diff --git a/llvm/test/Transforms/SROA/sroa-common-type-fail-promotion.ll b/llvm/test/Transforms/SROA/sroa-common-type-fail-promotion.ll
index 72014912edd20..62df5121215bf 100644
--- a/llvm/test/Transforms/SROA/sroa-common-type-fail-promotion.ll
+++ b/llvm/test/Transforms/SROA/sroa-common-type-fail-promotion.ll
@@ -245,25 +245,31 @@ bb:
define amdgpu_kernel void @test_half_array() #0 {
; CHECK-LABEL: @test_half_array(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[B_BLOCKWISE_COPY_SROA_0:%.*]] = alloca float, align 16
-; CHECK-NEXT: [[B_BLOCKWISE_COPY_SROA_4:%.*]] = alloca float, align 4
-; CHECK-NEXT: call void @llvm.memset.p0.i32(ptr align 16 [[B_BLOCKWISE_COPY_SROA_0]], i8 0, i32 4, i1 false)
-; CHECK-NEXT: call void @llvm.memset.p0.i32(ptr align 4 [[B_BLOCKWISE_COPY_SROA_4]], i8 0, i32 4, i1 false)
-; CHECK-NEXT: [[TMP0:%.*]] = bitcast float undef to i32
-; CHECK-NEXT: [[TMP1:%.*]] = bitcast float undef to i32
; CHECK-NEXT: [[DATA:%.*]] = load [4 x float], ptr undef, align 4
; CHECK-NEXT: [[DATA_FCA_0_EXTRACT:%.*]] = extractvalue [4 x float] [[DATA]], 0
-; CHECK-NEXT: store float [[DATA_FCA_0_EXTRACT]], ptr [[B_BLOCKWISE_COPY_SROA_0]], align 16
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast float [[DATA_FCA_0_EXTRACT]] to <2 x i16>
+; CHECK-NEXT: [[B_BLOCKWISE_COPY_SROA_0_0_VEC_EXPAND:%.*]] = shufflevector <2 x i16> [[TMP0]], <2 x i16> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[B_BLOCKWISE_COPY_SROA_0_0_VECBLEND:%.*]] = select <8 x i1> <i1 true, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false>, <8 x i16> [[B_BLOCKWISE_COPY_SROA_0_0_VEC_EXPAND]], <8 x i16> zeroinitializer
; CHECK-NEXT: [[DATA_FCA_1_EXTRACT:%.*]] = extractvalue [4 x float] [[DATA]], 1
-; CHECK-NEXT: store float [[DATA_FCA_1_EXTRACT]], ptr [[B_BLOCKWISE_COPY_SROA_4]], align 4
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast float [[DATA_FCA_1_EXTRACT]] to <2 x i16>
+; CHECK-NEXT: [[B_BLOCKWISE_COPY_SROA_0_4_VEC_EXPAND:%.*]] = shufflevector <2 x i16> [[TMP1]], <2 x i16> poison, <8 x i32> <i32 poison, i32 poison, i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[B_BLOCKWISE_COPY_SROA_0_4_VECBLEND:%.*]] = select <8 x i1> <i1 false, i1 false, i1 true, i1 true, i1 false, i1 false, i1 false, i1 false>, <8 x i16> [[B_BLOCKWISE_COPY_SROA_0_4_VEC_EXPAND]], <8 x i16> [[B_BLOCKWISE_COPY_SROA_0_0_VECBLEND]]
; CHECK-NEXT: [[DATA_FCA_2_EXTRACT:%.*]] = extractvalue [4 x float] [[DATA]], 2
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast float [[DATA_FCA_2_EXTRACT]] to <2 x i16>
+; CHECK-NEXT: [[B_BLOCKWISE_COPY_SROA_0_8_VEC_EXPAND:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <8 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT: [[B_BLOCKWISE_COPY_SROA_0_8_VECBLEND:%.*]] = select <8 x i1> <i1 false, i1 false, i1 false, i1 false, i1 true, i1 true, i1 false, i1 false>, <8 x i16> [[B_BLOCKWISE_COPY_SROA_0_8_VEC_EXPAND]], <8 x i16> [[B_BLOCKWISE_COPY_SROA_0_4_VECBLEND]]
; CHECK-NEXT: [[DATA_FCA_3_EXTRACT:%.*]] = extractvalue [4 x float] [[DATA]], 3
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast float [[DATA_FCA_3_EXTRACT]] to <2 x i16>
+; CHECK-NEXT: [[B_BLOCKWISE_COPY_SROA_0_12_VEC_EXPAND:%.*]] = shufflevector <2 x i16> [[TMP3]], <2 x i16> poison, <8 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0, i32 1>
+; CHECK-NEXT: [[B_BLOCKWISE_COPY_SROA_0_12_VECBLEND:%.*]] = select <8 x i1> <i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true, i1 true>, <8 x i16> [[B_BLOCKWISE_COPY_SROA_0_12_VEC_EXPAND]], <8 x i16> [[B_BLOCKWISE_COPY_SROA_0_8_VECBLEND]]
; CHECK-NEXT: br label [[BB:%.*]]
; CHECK: bb:
-; CHECK-NEXT: [[B_BLOCKWISE_COPY_SROA_0_0_B_BLOCKWISE_COPY_SROA_0_0_LOAD1:%.*]] = load half, ptr [[B_BLOCKWISE_COPY_SROA_0]], align 16
-; CHECK-NEXT: [[B_BLOCKWISE_COPY_SROA_0_2_PTR2_SROA_IDX1:%.*]] = getelementptr inbounds i8, ptr [[B_BLOCKWISE_COPY_SROA_0]], i64 2
-; CHECK-NEXT: [[B_BLOCKWISE_COPY_SROA_0_2_B_BLOCKWISE_COPY_SROA_0_2_LOAD2:%.*]] = load half, ptr [[B_BLOCKWISE_COPY_SROA_0_2_PTR2_SROA_IDX1]], align 2
-; CHECK-NEXT: [[B_BLOCKWISE_COPY_SROA_4_0_B_BLOCKWISE_COPY_SROA_4_4_LOAD3:%.*]] = load half, ptr [[B_BLOCKWISE_COPY_SROA_4]], align 4
+; CHECK-NEXT: [[B_BLOCKWISE_COPY_SROA_0_0_VEC_EXTRACT:%.*]] = extractelement <8 x i16> [[B_BLOCKWISE_COPY_SROA_0_12_VECBLEND]], i32 0
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[B_BLOCKWISE_COPY_SROA_0_0_VEC_EXTRACT]] to half
+; CHECK-NEXT: [[B_BLOCKWISE_COPY_SROA_0_2_VEC_EXTRACT:%.*]] = extractelement <8 x i16> [[B_BLOCKWISE_COPY_SROA_0_12_VECBLEND]], i32 1
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[B_BLOCKWISE_COPY_SROA_0_2_VEC_EXTRACT]] to half
+; CHECK-NEXT: [[B_BLOCKWISE_COPY_SROA_0_4_VEC_EXTRACT:%.*]] = extractelement <8 x i16> [[B_BLOCKWISE_COPY_SROA_0_12_VECBLEND]], i32 2
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[B_BLOCKWISE_COPY_SROA_0_4_VEC_EXTRACT]] to half
; CHECK-NEXT: ret void
;
entry:
@@ -285,17 +291,17 @@ bb:
define amdgpu_kernel void @test_array_vector() #0 {
; CHECK-LAB...
[truncated]
|
You can test this locally with the following command:git diff -U0 --pickaxe-regex -S '([^a-zA-Z0-9#_-]undef[^a-zA-Z0-9_-]|UndefValue::get)' 'HEAD~1' HEAD llvm/test/Transforms/SROA/vector-promotion-memset.ll llvm/lib/Transforms/Scalar/SROA.cpp llvm/test/DebugInfo/Generic/assignment-tracking/sroa/user-memcpy.ll llvm/test/DebugInfo/X86/sroasplit-5.ll llvm/test/Transforms/SROA/basictest.ll llvm/test/Transforms/SROA/slice-width.ll llvm/test/Transforms/SROA/sroa-common-type-fail-promotion.ll llvm/test/Transforms/SROA/tbaa-struct3.ll The following files introduce new uses of undef:
Undef is now deprecated and should only be used in the rare cases where no replacement is possible. For example, a load of uninitialized memory yields In tests, avoid using For example, this is considered a bad practice: define void @fn() {
...
br i1 undef, ...
} Please use the following instead: define void @fn(i1 %cond) {
...
br i1 %cond, ...
} Please refer to the Undefined Behavior Manual for more information. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Missing new tests? I'd expect to see a few new targeted tests stressing different vector sizes and alignments, and not just updates of existing tests
llvm/lib/Transforms/Scalar/SROA.cpp
Outdated
|
||
auto *VTy = | ||
FixedVectorType::get(II.getValue()->getType(), Val.getZExtValue()); | ||
if (DL.getTypeStoreSizeInBits(VTy) != DL.getTypeAllocSizeInBits(VTy)) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
DL.getTypeAllocSizeInBits is implemented with getTypeStoreSizeInBits and I find it confusing to combine the two, use explicit alignment checks if you need both
I also do not think you should need to consider the type alignment. The resulting store should have an explicit alignment, which does not need to match the type's natural alignment
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is a workaround.
Latest revision adds a comment to make this explicit and also cleans up the usage of getType*.
llvm/lib/Transforms/Scalar/SROA.cpp
Outdated
if (DL.getTypeAllocSize(VTy) != DL.getTypeAllocSize(ATy)) | ||
return true; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Further duplicated size checks, it's hard to follow the flow
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Cleaned up the code. Hopefully better now.
llvm/lib/Transforms/Scalar/SROA.cpp
Outdated
@@ -1170,10 +1191,23 @@ class AllocaSlices::SliceBuilder : public PtrUseVisitor<SliceBuilder> { | |||
if (!IsOffsetKnown) | |||
return PI.setAborted(&II); | |||
|
|||
auto IsSplittable = [&]() { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can you move this to a separate utility function? The Length capture is slightly confusing
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Cleaned up code removes the helper altogether. Hopefully better now.
llvm/lib/Transforms/Scalar/SROA.cpp
Outdated
|
||
if (!Ty) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Leave as else continue
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Updated
096fde6
to
a9b215b
Compare
Added a new test. Thanks for the review! |
; CHECK-NEXT: [[DOTSROA_0_0_VEC_INSERT:%.*]] = insertelement <32 x i8> zeroinitializer, i8 [[TMP3]], i32 0 | ||
; CHECK-NEXT: ret void | ||
; | ||
%2 = alloca %struct.a, align 32 |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Use named values in tests
if (!Length) | ||
return nullptr; | ||
|
||
APInt Val = Length->getValue(); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
APInt Val = Length->getValue(); | |
const APInt &Val = Length->getValue(); |
if (Val.ugt(std::numeric_limits<unsigned>::max())) | ||
return nullptr; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Don't understand this limit. Is this the maximum number of vector elements? Should avoid hardcoding that
return nullptr; | ||
|
||
uint64_t MemSetLen = Val.getZExtValue(); | ||
auto *VTy = FixedVectorType::get(II.getValue()->getType(), MemSetLen); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The element will always be i8. TODO to support llvm.experimental.memset.pattern?
if (Val.ugt(std::numeric_limits<unsigned>::max())) | ||
return nullptr; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is this just because the maximum number of vector elts? Can you put this value into a helper on FixedVectorType instead of hardcoding unsigned here?
Although we probably shouldn't be trying to promote anything that's anything close to that big.
|
||
ret void | ||
} | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Test with the number of elements equalling and exceeding 32-bit limit case?
No description provided.