
[SROA] Vector promote some memsets #133301

Open
wants to merge 3 commits into main

Conversation

macurtis-amd (Contributor)

No description provided.

@llvmbot added the clang, backend:AMDGPU, debuginfo, and llvm:transforms labels on Mar 27, 2025
@llvmbot (Member) commented Mar 27, 2025

@llvm/pr-subscribers-llvm-transforms
@llvm/pr-subscribers-debuginfo

@llvm/pr-subscribers-backend-amdgpu

Author: None (macurtis-amd)

Changes

Patch is 32.26 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/133301.diff

7 Files Affected:

  • (modified) clang/test/CodeGenOpenCL/amdgpu-nullptr.cl (+4-6)
  • (modified) llvm/lib/Transforms/Scalar/SROA.cpp (+58-9)
  • (modified) llvm/test/DebugInfo/Generic/assignment-tracking/sroa/user-memcpy.ll (+7-5)
  • (modified) llvm/test/DebugInfo/X86/sroasplit-5.ll (+2-3)
  • (modified) llvm/test/Transforms/SROA/basictest.ll (+14-26)
  • (modified) llvm/test/Transforms/SROA/slice-width.ll (+2-3)
  • (modified) llvm/test/Transforms/SROA/sroa-common-type-fail-promotion.ll (+56-48)
diff --git a/clang/test/CodeGenOpenCL/amdgpu-nullptr.cl b/clang/test/CodeGenOpenCL/amdgpu-nullptr.cl
index a0c106bca83c9..927cb3f38fa9c 100644
--- a/clang/test/CodeGenOpenCL/amdgpu-nullptr.cl
+++ b/clang/test/CodeGenOpenCL/amdgpu-nullptr.cl
@@ -503,21 +503,19 @@ void cast_bool_generic(generic char* p) {
     *p = 0;
 }
 
-// Test initialize a struct using memset.
-// For large structures which is mostly zero, clang generats llvm.memset for
-// the zero part and store for non-zero members.
+// Test initialization of a struct with a private member.
 typedef struct {
   long a, b, c, d;
   private char *p;
 } StructTy3;
 
-// CHECK-LABEL: test_memset_private
-// CHECK: call void @llvm.memset.p5.i64(ptr addrspace(5) noundef align 8 {{.*}}, i8 0, i64 32, i1 false)
+// CHECK-LABEL: test_struct_private_member
+// CHECK:  store <32 x i8> zeroinitializer, ptr addrspace(5) {{.*}}, align 8
 // CHECK: [[GEP:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(5) %ptr, i32 32
 // CHECK: store ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)), ptr addrspace(5) [[GEP]]
 // CHECK: [[GEP1:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(5) {{.*}}, i32 36
 // CHECK: store i32 0, ptr addrspace(5) [[GEP1]], align 4
-void test_memset_private(private StructTy3 *ptr) {
+void test_struct_private_member(private StructTy3 *ptr) {
   StructTy3 S3 = {0, 0, 0, 0, 0};
   *ptr = S3;
 }
diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp
index 86be20c799a68..3ded637a5c63b 100644
--- a/llvm/lib/Transforms/Scalar/SROA.cpp
+++ b/llvm/lib/Transforms/Scalar/SROA.cpp
@@ -1011,6 +1011,26 @@ static Value *foldPHINodeOrSelectInst(Instruction &I) {
   return foldSelectInst(cast<SelectInst>(I));
 }
 
+/// Returns a fixed vector type equivalent to the memory set by II or nullptr if
+/// unable to do so.
+static FixedVectorType *getVectorTypeFor(const MemSetInst &II,
+                                         const DataLayout &DL) {
+  const ConstantInt *Length = dyn_cast<ConstantInt>(II.getLength());
+  if (!Length)
+    return nullptr;
+
+  APInt Val = Length->getValue();
+  if (Val.ugt(std::numeric_limits<unsigned>::max()))
+    return nullptr;
+
+  auto *VTy =
+      FixedVectorType::get(II.getValue()->getType(), Val.getZExtValue());
+  if (DL.getTypeStoreSizeInBits(VTy) != DL.getTypeAllocSizeInBits(VTy))
+    return nullptr;
+
+  return VTy;
+}
+
 /// Builder for the alloca slices.
 ///
 /// This class builds a set of alloca slices by recursively visiting the uses
@@ -1099,15 +1119,16 @@ class AllocaSlices::SliceBuilder : public PtrUseVisitor<SliceBuilder> {
     return Base::visitGetElementPtrInst(GEPI);
   }
 
+  bool isSplittableMemOp(Type *Ty, bool IsVolatile) {
+    return Ty->isIntegerTy() && !IsVolatile && DL.typeSizeEqualsStoreSize(Ty);
+  }
+
   void handleLoadOrStore(Type *Ty, Instruction &I, const APInt &Offset,
                          uint64_t Size, bool IsVolatile) {
     // We allow splitting of non-volatile loads and stores where the type is an
     // integer type. These may be used to implement 'memcpy' or other "transfer
     // of bits" patterns.
-    bool IsSplittable =
-        Ty->isIntegerTy() && !IsVolatile && DL.typeSizeEqualsStoreSize(Ty);
-
-    insertUse(I, Offset, Size, IsSplittable);
+    insertUse(I, Offset, Size, isSplittableMemOp(Ty, IsVolatile));
   }
 
   void visitLoadInst(LoadInst &LI) {
@@ -1170,10 +1191,23 @@ class AllocaSlices::SliceBuilder : public PtrUseVisitor<SliceBuilder> {
     if (!IsOffsetKnown)
       return PI.setAborted(&II);
 
+    auto IsSplittable = [&]() {
+      FixedVectorType *VTy = getVectorTypeFor(II, DL);
+      Type *ATy = AS.AI.getAllocatedType();
+
+      if (!Length)
+        return false;
+      if (!VTy)
+        return true;
+      if (DL.getTypeAllocSize(VTy) != DL.getTypeAllocSize(ATy))
+        return true;
+      return isSplittableMemOp(ATy, II.isVolatile());
+    };
+
     insertUse(II, Offset,
               Length ? Length->getLimitedValue()
                      : AllocSize - Offset.getLimitedValue(),
-              (bool)Length);
+              IsSplittable());
   }
 
   void visitMemTransferInst(MemTransferInst &II) {
@@ -2072,8 +2106,20 @@ static bool isVectorPromotionViableForSlice(Partition &P, const Slice &S,
   if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(U->getUser())) {
     if (MI->isVolatile())
       return false;
-    if (!S.isSplittable())
-      return false; // Skip any unsplittable intrinsics.
+
+    auto *II = dyn_cast<MemSetInst>(U->getUser());
+    if (!II && !S.isSplittable()) {
+      // Skip any non-memset unsplittable intrinsics.
+      return false;
+    }
+    if (II) {
+      // For memset, allow if we have a suitable vector type
+      Type *VTy = getVectorTypeFor(*II, DL);
+      if (!VTy)
+        return false;
+      if (!canConvertValue(DL, SliceTy, VTy))
+        return false;
+    }
   } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(U->getUser())) {
     if (!II->isLifetimeStartOrEnd() && !II->isDroppable())
       return false;
@@ -2316,12 +2362,15 @@ static VectorType *isVectorPromotionViable(Partition &P, const DataLayout &DL) {
 
   // Put load and store types into a set for de-duplication.
   for (const Slice &S : P) {
-    Type *Ty;
+    Type *Ty = nullptr;
     if (auto *LI = dyn_cast<LoadInst>(S.getUse()->getUser()))
       Ty = LI->getType();
     else if (auto *SI = dyn_cast<StoreInst>(S.getUse()->getUser()))
       Ty = SI->getValueOperand()->getType();
-    else
+    else if (auto *II = dyn_cast<MemSetInst>(S.getUse()->getUser()))
+      Ty = getVectorTypeFor(*II, DL);
+
+    if (!Ty)
       continue;
 
     auto CandTy = Ty->getScalarType();
diff --git a/llvm/test/DebugInfo/Generic/assignment-tracking/sroa/user-memcpy.ll b/llvm/test/DebugInfo/Generic/assignment-tracking/sroa/user-memcpy.ll
index af3070511e345..a9d0b10586583 100644
--- a/llvm/test/DebugInfo/Generic/assignment-tracking/sroa/user-memcpy.ll
+++ b/llvm/test/DebugInfo/Generic/assignment-tracking/sroa/user-memcpy.ll
@@ -21,8 +21,7 @@
 ;; Allocas have been promoted - the linked dbg.assigns have been removed.
 
 ;; | V3i point = {0, 0, 0};
-; CHECK-NEXT: #dbg_value(i64 0, ![[point:[0-9]+]], !DIExpression(DW_OP_LLVM_fragment, 0, 64),
-; CHECK-NEXT: #dbg_value(i64 0, ![[point]], !DIExpression(DW_OP_LLVM_fragment, 64, 64),
+; CHECK-NEXT: #dbg_value(<16 x i8> zeroinitializer, ![[point:[0-9]+]], !DIExpression(DW_OP_LLVM_fragment, 0, 128),
 
 ;; point.z = 5000;
 ; CHECK-NEXT: #dbg_value(i64 5000, ![[point]], !DIExpression(DW_OP_LLVM_fragment, 128, 64),
@@ -32,17 +31,20 @@
 ;;     local.other.x = global.other.x
 ;;     local.other.y = global.other.y
 ;;     local.other.z = global.other.z
-; CHECK-NEXT: %other.sroa.0.0.copyload = load i64, ptr @__const._Z3funv.other
+; CHECK-NEXT: %other.sroa.0.0.copyload = load <8 x i8>, ptr @__const._Z3funv.other
 ; CHECK-NEXT: %other.sroa.2.0.copyload = load i64, ptr getelementptr inbounds (i8, ptr @__const._Z3funv.other, i64 8)
 ; CHECK-NEXT: %other.sroa.3.0.copyload = load i64, ptr getelementptr inbounds (i8, ptr @__const._Z3funv.other, i64 16)
-; CHECK-NEXT: #dbg_value(i64 %other.sroa.0.0.copyload, ![[other:[0-9]+]], !DIExpression(DW_OP_LLVM_fragment, 0, 64),
+; CHECK-NEXT: #dbg_value(<8 x i8> %other.sroa.0.0.copyload, ![[other:[0-9]+]], !DIExpression(DW_OP_LLVM_fragment, 0, 64),
 ; CHECK-NEXT: #dbg_value(i64 %other.sroa.2.0.copyload, ![[other]], !DIExpression(DW_OP_LLVM_fragment, 64, 64),
 ; CHECK-NEXT: #dbg_value(i64 %other.sroa.3.0.copyload, ![[other]], !DIExpression(DW_OP_LLVM_fragment, 128, 64),
 
 ;; | std::memcpy(&point.y, &other.x, sizeof(long) * 2);
 ;;   other is now 3 scalars:
 ;;     point.y = other.x
-; CHECK-NEXT: #dbg_value(i64 %other.sroa.0.0.copyload, ![[point]], !DIExpression(DW_OP_LLVM_fragment, 64, 64),
+; CHECK-NEXT: %point.sroa.0.sroa.0.8.vec.expand = shufflevector <8 x i8> %other.sroa.0.0.copyload, <8 x i8> poison, <16 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>,
+; CHECK-NEXT: %point.sroa.0.sroa.0.8.vecblend = select <16 x i1> <i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i8> %point.sroa.0.sroa.0.8.vec.expand, <16 x i8> zeroinitializer,
+; CHECK-NEXT: #dbg_value(<16 x i8> %point.sroa.0.sroa.0.8.vecblend, ![[point]], !DIExpression(DW_OP_LLVM_fragment, 64, 64),
+
 ;;
 ;;     point.z = other.y
 ; CHECK-NEXT: #dbg_value(i64 %other.sroa.2.0.copyload, ![[point]], !DIExpression(DW_OP_LLVM_fragment, 128, 64),
diff --git a/llvm/test/DebugInfo/X86/sroasplit-5.ll b/llvm/test/DebugInfo/X86/sroasplit-5.ll
index 34aa30f55728e..d2ecc9598e3c4 100644
--- a/llvm/test/DebugInfo/X86/sroasplit-5.ll
+++ b/llvm/test/DebugInfo/X86/sroasplit-5.ll
@@ -21,10 +21,9 @@ target triple = "x86_64-unknown-linux-gnu"
 ;
 ; There should be no debug info for the padding.
 ; CHECK-NOT: DW_OP_LLVM_fragment, 56
-; CHECK: DIExpression(DW_OP_LLVM_fragment, 0, 32)
-; CHECK-NOT: DW_OP_LLVM_fragment, 56
-; CHECK: DIExpression(DW_OP_LLVM_fragment, 32, 24)
+; CHECK: ![[a:[0-9]+]], !DIExpression(),
 ; CHECK-NOT: DW_OP_LLVM_fragment, 56
+; CHECK: ![[a]] = !DILocalVariable(name: "a",
 %struct.prog_src_register = type { i32, i24 }
 
 ; Function Attrs: nounwind
diff --git a/llvm/test/Transforms/SROA/basictest.ll b/llvm/test/Transforms/SROA/basictest.ll
index 145da5259fab3..03590bbce146a 100644
--- a/llvm/test/Transforms/SROA/basictest.ll
+++ b/llvm/test/Transforms/SROA/basictest.ll
@@ -529,8 +529,9 @@ entry:
 define ptr @test10() {
 ; CHECK-LABEL: @test10(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr null to i64
-; CHECK-NEXT:    ret ptr null
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i8> zeroinitializer to i64
+; CHECK-NEXT:    [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr
+; CHECK-NEXT:    ret ptr [[TMP1]]
 ;
 entry:
   %a = alloca [8 x i8]
@@ -1075,26 +1076,13 @@ define void @PR14059.1(ptr %d) {
 ;
 ; CHECK-LABEL: @PR14059.1(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = bitcast double undef to i64
-; CHECK-NEXT:    [[X_SROA_0_I_0_INSERT_MASK:%.*]] = and i64 [[TMP0]], -4294967296
-; CHECK-NEXT:    [[X_SROA_0_I_0_INSERT_INSERT:%.*]] = or i64 [[X_SROA_0_I_0_INSERT_MASK]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i64 [[X_SROA_0_I_0_INSERT_INSERT]] to double
-; CHECK-NEXT:    [[TMP2:%.*]] = bitcast double [[TMP1]] to i64
-; CHECK-NEXT:    [[X_SROA_0_I_2_INSERT_MASK:%.*]] = and i64 [[TMP2]], -281474976645121
-; CHECK-NEXT:    [[X_SROA_0_I_2_INSERT_INSERT:%.*]] = or i64 [[X_SROA_0_I_2_INSERT_MASK]], 0
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i64 [[X_SROA_0_I_2_INSERT_INSERT]] to double
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast double [[TMP3]] to i64
-; CHECK-NEXT:    [[X_SROA_0_I_4_COPYLOAD:%.*]] = load i32, ptr [[D:%.*]], align 1
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast double 0.000000e+00 to i64
-; CHECK-NEXT:    [[X_SROA_0_I_4_INSERT_EXT:%.*]] = zext i32 [[X_SROA_0_I_4_COPYLOAD]] to i64
-; CHECK-NEXT:    [[X_SROA_0_I_4_INSERT_SHIFT:%.*]] = shl i64 [[X_SROA_0_I_4_INSERT_EXT]], 32
-; CHECK-NEXT:    [[X_SROA_0_I_4_INSERT_MASK3:%.*]] = and i64 [[TMP5]], 4294967295
-; CHECK-NEXT:    [[X_SROA_0_I_4_INSERT_INSERT4:%.*]] = or i64 [[X_SROA_0_I_4_INSERT_MASK3]], [[X_SROA_0_I_4_INSERT_SHIFT]]
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i64 [[X_SROA_0_I_4_INSERT_INSERT4]] to double
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast double [[TMP6]] to i64
-; CHECK-NEXT:    [[X_SROA_0_I_4_INSERT_MASK:%.*]] = and i64 [[TMP7]], 4294967295
-; CHECK-NEXT:    [[X_SROA_0_I_4_INSERT_INSERT:%.*]] = or i64 [[X_SROA_0_I_4_INSERT_MASK]], 4607182418800017408
-; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i64 [[X_SROA_0_I_4_INSERT_INSERT]] to double
+; CHECK-NEXT:    [[X_SROA_0_I_SROA_0_0_VECBLEND:%.*]] = select <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 false>, <8 x i8> <i8 0, i8 0, i8 0, i8 0, i8 undef, i8 undef, i8 undef, i8 undef>, <8 x i8> undef
+; CHECK-NEXT:    [[X_SROA_0_I_SROA_0_2_VECBLEND:%.*]] = select <8 x i1> <i1 false, i1 false, i1 true, i1 true, i1 true, i1 true, i1 false, i1 false>, <8 x i8> <i8 undef, i8 undef, i8 0, i8 0, i8 0, i8 0, i8 undef, i8 undef>, <8 x i8> [[X_SROA_0_I_SROA_0_0_VECBLEND]]
+; CHECK-NEXT:    [[X_SROA_0_I_SROA_0_4_COPYLOAD:%.*]] = load <4 x i8>, ptr [[D:%.*]], align 1
+; CHECK-NEXT:    [[X_SROA_0_I_SROA_0_4_VEC_EXPAND:%.*]] = shufflevector <4 x i8> [[X_SROA_0_I_SROA_0_4_COPYLOAD]], <4 x i8> poison, <8 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[X_SROA_0_I_SROA_0_4_VECBLEND2:%.*]] = select <8 x i1> <i1 false, i1 false, i1 false, i1 false, i1 true, i1 true, i1 true, i1 true>, <8 x i8> [[X_SROA_0_I_SROA_0_4_VEC_EXPAND]], <8 x i8> zeroinitializer
+; CHECK-NEXT:    [[X_SROA_0_I_SROA_0_4_VECBLEND:%.*]] = select <8 x i1> <i1 false, i1 false, i1 false, i1 false, i1 true, i1 true, i1 true, i1 true>, <8 x i8> <i8 undef, i8 undef, i8 undef, i8 undef, i8 extractelement (<4 x i8> bitcast (<1 x i32> splat (i32 1072693248) to <4 x i8>), i32 0), i8 extractelement (<4 x i8> bitcast (<1 x i32> splat (i32 1072693248) to <4 x i8>), i32 1), i8 extractelement (<4 x i8> bitcast (<1 x i32> splat (i32 1072693248) to <4 x i8>), i32 2), i8 extractelement (<4 x i8> bitcast (<1 x i32> splat (i32 1072693248) to <4 x i8>), i32 3)>, <8 x i8> [[X_SROA_0_I_SROA_0_4_VECBLEND2]]
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <8 x i8> [[X_SROA_0_I_SROA_0_4_VECBLEND]] to double
 ; CHECK-NEXT:    [[ACCUM_REAL_I:%.*]] = load double, ptr [[D]], align 8
 ; CHECK-NEXT:    [[ADD_R_I:%.*]] = fadd double [[ACCUM_REAL_I]], [[TMP8]]
 ; CHECK-NEXT:    store double [[ADD_R_I]], ptr [[D]], align 8
@@ -1332,10 +1320,10 @@ define void @PR15674(ptr %data, ptr %src, i32 %size) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP_SROA_0:%.*]] = alloca i32, align 4
 ; CHECK-NEXT:    switch i32 [[SIZE:%.*]], label [[END:%.*]] [
-; CHECK-NEXT:    i32 4, label [[BB4:%.*]]
-; CHECK-NEXT:    i32 3, label [[BB3:%.*]]
-; CHECK-NEXT:    i32 2, label [[BB2:%.*]]
-; CHECK-NEXT:    i32 1, label [[BB1:%.*]]
+; CHECK-NEXT:      i32 4, label [[BB4:%.*]]
+; CHECK-NEXT:      i32 3, label [[BB3:%.*]]
+; CHECK-NEXT:      i32 2, label [[BB2:%.*]]
+; CHECK-NEXT:      i32 1, label [[BB1:%.*]]
 ; CHECK-NEXT:    ]
 ; CHECK:       bb4:
 ; CHECK-NEXT:    [[SRC_GEP3:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i32 3
diff --git a/llvm/test/Transforms/SROA/slice-width.ll b/llvm/test/Transforms/SROA/slice-width.ll
index eabb6978c9125..63362534ff812 100644
--- a/llvm/test/Transforms/SROA/slice-width.ll
+++ b/llvm/test/Transforms/SROA/slice-width.ll
@@ -68,9 +68,8 @@ define void @memcpy_fp80_padding() {
 
 define void @memset_fp80_padding() {
 ; CHECK-LABEL: @memset_fp80_padding(
-; CHECK-NEXT:    [[X_SROA_0:%.*]] = alloca x86_fp80, align 16
-; CHECK-NEXT:    call void @llvm.memset.p0.i32(ptr align 16 [[X_SROA_0]], i8 -1, i32 16, i1 false)
-; CHECK-NEXT:    store i64 -1, ptr @i64_sink, align 4
+; CHECK-NEXT:    [[X_SROA_0_16_VEC_EXTRACT:%.*]] = extractelement <4 x i64> splat (i64 -1), i32 2
+; CHECK-NEXT:    store i64 [[X_SROA_0_16_VEC_EXTRACT]], ptr @i64_sink, align 4
 ; CHECK-NEXT:    ret void
 ;
   %x = alloca %union.Foo
diff --git a/llvm/test/Transforms/SROA/sroa-common-type-fail-promotion.ll b/llvm/test/Transforms/SROA/sroa-common-type-fail-promotion.ll
index 72014912edd20..62df5121215bf 100644
--- a/llvm/test/Transforms/SROA/sroa-common-type-fail-promotion.ll
+++ b/llvm/test/Transforms/SROA/sroa-common-type-fail-promotion.ll
@@ -245,25 +245,31 @@ bb:
 define amdgpu_kernel void @test_half_array() #0 {
 ; CHECK-LABEL: @test_half_array(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_0:%.*]] = alloca float, align 16
-; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_4:%.*]] = alloca float, align 4
-; CHECK-NEXT:    call void @llvm.memset.p0.i32(ptr align 16 [[B_BLOCKWISE_COPY_SROA_0]], i8 0, i32 4, i1 false)
-; CHECK-NEXT:    call void @llvm.memset.p0.i32(ptr align 4 [[B_BLOCKWISE_COPY_SROA_4]], i8 0, i32 4, i1 false)
-; CHECK-NEXT:    [[TMP0:%.*]] = bitcast float undef to i32
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast float undef to i32
 ; CHECK-NEXT:    [[DATA:%.*]] = load [4 x float], ptr undef, align 4
 ; CHECK-NEXT:    [[DATA_FCA_0_EXTRACT:%.*]] = extractvalue [4 x float] [[DATA]], 0
-; CHECK-NEXT:    store float [[DATA_FCA_0_EXTRACT]], ptr [[B_BLOCKWISE_COPY_SROA_0]], align 16
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast float [[DATA_FCA_0_EXTRACT]] to <2 x i16>
+; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_0_0_VEC_EXPAND:%.*]] = shufflevector <2 x i16> [[TMP0]], <2 x i16> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_0_0_VECBLEND:%.*]] = select <8 x i1> <i1 true, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false>, <8 x i16> [[B_BLOCKWISE_COPY_SROA_0_0_VEC_EXPAND]], <8 x i16> zeroinitializer
 ; CHECK-NEXT:    [[DATA_FCA_1_EXTRACT:%.*]] = extractvalue [4 x float] [[DATA]], 1
-; CHECK-NEXT:    store float [[DATA_FCA_1_EXTRACT]], ptr [[B_BLOCKWISE_COPY_SROA_4]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast float [[DATA_FCA_1_EXTRACT]] to <2 x i16>
+; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_0_4_VEC_EXPAND:%.*]] = shufflevector <2 x i16> [[TMP1]], <2 x i16> poison, <8 x i32> <i32 poison, i32 poison, i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_0_4_VECBLEND:%.*]] = select <8 x i1> <i1 false, i1 false, i1 true, i1 true, i1 false, i1 false, i1 false, i1 false>, <8 x i16> [[B_BLOCKWISE_COPY_SROA_0_4_VEC_EXPAND]], <8 x i16> [[B_BLOCKWISE_COPY_SROA_0_0_VECBLEND]]
 ; CHECK-NEXT:    [[DATA_FCA_2_EXTRACT:%.*]] = extractvalue [4 x float] [[DATA]], 2
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast float [[DATA_FCA_2_EXTRACT]] to <2 x i16>
+; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_0_8_VEC_EXPAND:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <8 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_0_8_VECBLEND:%.*]] = select <8 x i1> <i1 false, i1 false, i1 false, i1 false, i1 true, i1 true, i1 false, i1 false>, <8 x i16> [[B_BLOCKWISE_COPY_SROA_0_8_VEC_EXPAND]], <8 x i16> [[B_BLOCKWISE_COPY_SROA_0_4_VECBLEND]]
 ; CHECK-NEXT:    [[DATA_FCA_3_EXTRACT:%.*]] = extractvalue [4 x float] [[DATA]], 3
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast float [[DATA_FCA_3_EXTRACT]] to <2 x i16>
+; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_0_12_VEC_EXPAND:%.*]] = shufflevector <2 x i16> [[TMP3]], <2 x i16> poison, <8 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0, i32 1>
+; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_0_12_VECBLEND:%.*]] = select <8 x i1> <i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true, i1 true>, <8 x i16> [[B_BLOCKWISE_COPY_SROA_0_12_VEC_EXPAND]], <8 x i16> [[B_BLOCKWISE_COPY_SROA_0_8_VECBLEND]]
 ; CHECK-NEXT:    br label [[BB:%.*]]
 ; CHECK:       bb:
-; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_0_0_B_BLOCKWISE_COPY_SROA_0_0_LOAD1:%.*]] = load half, ptr [[B_BLOCKWISE_COPY_SROA_0]], align 16
-; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_0_2_PTR2_SROA_IDX1:%.*]] = getelementptr inbounds i8, ptr [[B_BLOCKWISE_COPY_SROA_0]], i64 2
-; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_0_2_B_BLOCKWISE_COPY_SROA_0_2_LOAD2:%.*]] = load half, ptr [[B_BLOCKWISE_COPY_SROA_0_2_PTR2_SROA_IDX1]], align 2
-; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_4_0_B_BLOCKWISE_COPY_SROA_4_4_LOAD3:%.*]] = load half, ptr [[B_BLOCKWISE_COPY_SROA_4]], align 4
+; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_0_0_VEC_EXTRACT:%.*]] = extractelement <8 x i16> [[B_BLOCKWISE_COPY_SROA_0_12_VECBLEND]], i32 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i16 [[B_BLOCKWISE_COPY_SROA_0_0_VEC_EXTRACT]] to half
+; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_0_2_VEC_EXTRACT:%.*]] = extractelement <8 x i16> [[B_BLOCKWISE_COPY_SROA_0_12_VECBLEND]], i32 1
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i16 [[B_BLOCKWISE_COPY_SROA_0_2_VEC_EXTRACT]] to half
+; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_0_4_VEC_EXTRACT:%.*]] = extractelement <8 x i16> [[B_BLOCKWISE_COPY_SROA_0_12_VECBLEND]], i32 2
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i16 [[B_BLOCKWISE_COPY_SROA_0_4_VEC_EXTRACT]] to half
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -285,17 +291,17 @@ bb:
 define amdgpu_kernel void @test_array_vector() #0 {
 ; CHECK-LAB...
[truncated]

@llvmbot (Member) commented Mar 27, 2025

@llvm/pr-subscribers-clang


macurtis-amd requested a review from jrbyrnes on March 27, 2025 19:08

github-actions bot commented Mar 27, 2025

⚠️ undef deprecator found issues in your code. ⚠️

You can test this locally with the following command:
git diff -U0 --pickaxe-regex -S '([^a-zA-Z0-9#_-]undef[^a-zA-Z0-9_-]|UndefValue::get)' 'HEAD~1' HEAD llvm/test/Transforms/SROA/vector-promotion-memset.ll llvm/lib/Transforms/Scalar/SROA.cpp llvm/test/DebugInfo/Generic/assignment-tracking/sroa/user-memcpy.ll llvm/test/DebugInfo/X86/sroasplit-5.ll llvm/test/Transforms/SROA/basictest.ll llvm/test/Transforms/SROA/slice-width.ll llvm/test/Transforms/SROA/sroa-common-type-fail-promotion.ll llvm/test/Transforms/SROA/tbaa-struct3.ll

The following files introduce new uses of undef:

  • llvm/test/Transforms/SROA/basictest.ll

Undef is now deprecated and should only be used in the rare cases where no replacement is possible. For example, a load of uninitialized memory yields undef. You should use poison values for placeholders instead.

In tests, avoid using undef and having tests that trigger undefined behavior. If you need an operand with some unimportant value, you can add a new argument to the function and use that instead.

For example, this is considered a bad practice:

define void @fn() {
  ...
  br i1 undef, ...
}

Please use the following instead:

define void @fn(i1 %cond) {
  ...
  br i1 %cond, ...
}

Please refer to the Undefined Behavior Manual for more information.

@arsenm (Contributor) left a comment

Missing new tests? I'd expect to see a few new targeted tests stressing different vector sizes and alignments, and not just updates of existing tests


auto *VTy =
FixedVectorType::get(II.getValue()->getType(), Val.getZExtValue());
if (DL.getTypeStoreSizeInBits(VTy) != DL.getTypeAllocSizeInBits(VTy))
Contributor

DL.getTypeAllocSizeInBits is implemented in terms of getTypeStoreSizeInBits, and I find it confusing to combine the two; use explicit alignment checks if you need both.

I also do not think you should need to consider the type alignment. The resulting store should have an explicit alignment, which does not need to match the type's natural alignment
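
For context, a minimal sketch of the relationship behind that remark (the helper name is hypothetical, not part of the patch): in DataLayout, the alloc size of a type is its store size rounded up to the type's ABI alignment, so comparing the two is effectively a tail-padding check.

#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Type.h"

using namespace llvm;

// Hypothetical helper, not part of the patch: "store size != alloc size"
// holds exactly when the type has tail padding up to its ABI alignment,
// because getTypeAllocSizeInBits() is getTypeStoreSizeInBits() rounded up
// to the ABI alignment.
static bool hasTailPadding(Type *Ty, const DataLayout &DL) {
  return DL.getTypeStoreSizeInBits(Ty) != DL.getTypeAllocSizeInBits(Ty);
}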

Contributor Author

This is a workaround.
Latest revision adds a comment to make this explicit and also cleans up the usage of getType*.

Comment on lines 1202 to 1203
if (DL.getTypeAllocSize(VTy) != DL.getTypeAllocSize(ATy))
return true;
Contributor

Further duplicated size checks; it's hard to follow the flow.

Contributor Author

Cleaned up the code. Hopefully better now.

@@ -1170,10 +1191,23 @@ class AllocaSlices::SliceBuilder : public PtrUseVisitor<SliceBuilder> {
if (!IsOffsetKnown)
return PI.setAborted(&II);

auto IsSplittable = [&]() {
Contributor

Can you move this to a separate utility function? The Length capture is slightly confusing
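
For reference, hoisting that lambda into a file-local helper in SROA.cpp, as suggested, might look roughly like the sketch below. This is an assumption-level illustration based on the diff above (getVectorTypeFor is the function the patch adds); the helper name and parameter list are made up, and per the reply below the later revision reportedly drops the lambda entirely.

// Hypothetical sketch only; mirrors the lambda's logic from the diff.
static bool isMemSetSplittable(const MemSetInst &II, const ConstantInt *Length,
                               Type *AllocatedTy, const DataLayout &DL) {
  if (!Length)
    return false;
  FixedVectorType *VTy = getVectorTypeFor(II, DL);
  if (!VTy)
    return true;
  if (DL.getTypeAllocSize(VTy) != DL.getTypeAllocSize(AllocatedTy))
    return true;
  // Same predicate as isSplittableMemOp in the patch.
  return AllocatedTy->isIntegerTy() && !II.isVolatile() &&
         DL.typeSizeEqualsStoreSize(AllocatedTy);
}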

Contributor Author

The cleaned-up code removes the helper altogether. Hopefully better now.


if (!Ty)
Contributor

Leave as else continue

Contributor Author

Updated

macurtis-amd force-pushed the vector-promote-memset branch from 096fde6 to a9b215b on March 29, 2025 10:24
@macurtis-amd (Contributor Author)

> Missing new tests? I'd expect to see a few new targeted tests stressing different vector sizes and alignments, and not just updates of existing tests

Added a new test.

Thanks for the review!

; CHECK-NEXT: [[DOTSROA_0_0_VEC_INSERT:%.*]] = insertelement <32 x i8> zeroinitializer, i8 [[TMP3]], i32 0
; CHECK-NEXT: ret void
;
%2 = alloca %struct.a, align 32
Contributor

Use named values in tests

if (!Length)
return nullptr;

APInt Val = Length->getValue();
Contributor

Suggested change:
-  APInt Val = Length->getValue();
+  const APInt &Val = Length->getValue();

Comment on lines +1023 to +1024
if (Val.ugt(std::numeric_limits<unsigned>::max()))
return nullptr;
Contributor

Don't understand this limit. Is this the maximum number of vector elements? Should avoid hardcoding that

return nullptr;

uint64_t MemSetLen = Val.getZExtValue();
auto *VTy = FixedVectorType::get(II.getValue()->getType(), MemSetLen);
Contributor

The element will always be i8. TODO to support llvm.experimental.memset.pattern?
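
A short sketch to ground this point (the helper is hypothetical, not part of the patch): the fill operand of llvm.memset and llvm.memset.inline is declared as i8 in the intrinsic signature, so getVectorTypeFor currently always yields an <N x i8> type; a wider element would only come from something like llvm.experimental.memset.pattern.

#include "llvm/IR/IntrinsicInst.h"

using namespace llvm;

// Hypothetical illustration: for MemSetInst the value operand is always i8,
// so a vector built from getValue()->getType() is always <N x i8>.
static bool fillValueIsByte(const MemSetInst &II) {
  return II.getValue()->getType()->isIntegerTy(8);
}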

Comment on lines +1023 to +1024
if (Val.ugt(std::numeric_limits<unsigned>::max()))
return nullptr;
Contributor

Is this just because the maximum number of vector elts? Can you put this value into a helper on FixedVectorType instead of hardcoding unsigned here?

Although we probably shouldn't be trying to promote anything that's anything close to that big.
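
For what it's worth, a sketch of why a bound like this exists at all (the helper name is an assumption): FixedVectorType::get takes its element count as unsigned, so a constant memset length that does not fit in that parameter cannot be expressed as a fixed vector type in the first place.

#include "llvm/IR/DerivedTypes.h"
#include <cstdint>
#include <limits>

using namespace llvm;

// Hypothetical sketch: reject lengths that would overflow the 'unsigned'
// element-count parameter of FixedVectorType::get.
static FixedVectorType *tryGetByteVector(LLVMContext &Ctx, uint64_t NumBytes) {
  if (NumBytes > std::numeric_limits<unsigned>::max())
    return nullptr;
  return FixedVectorType::get(Type::getInt8Ty(Ctx),
                              static_cast<unsigned>(NumBytes));
}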


ret void
}

Contributor

Test cases where the number of elements equals and exceeds the 32-bit limit?

Labels
backend:AMDGPU, clang, debuginfo, llvm:transforms
3 participants