Revert "[AMDGPU] Trim zero components from buffer and image stores"

Mateja Marjanovic · Mateja Marjanovic · commit 9c8c31eea439 · 2023-05-18T17:02:01.000+02:00
This reverts commit 3181a6e.
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -872,12 +872,10 @@ defset list<AMDGPUImageDimIntrinsic> AMDGPUImageDimIntrinsics = {
 
   defm int_amdgcn_image_store : AMDGPUImageDimIntrinsicsAll<
               "STORE", [], [AMDGPUArg<llvm_anyfloat_ty, "vdata">],
-              [IntrWriteMem, IntrWillReturn], [SDNPMemOperand]>,
-              AMDGPUImageDMaskIntrinsic;
+              [IntrWriteMem, IntrWillReturn], [SDNPMemOperand]>;
   defm int_amdgcn_image_store_mip : AMDGPUImageDimIntrinsicsNoMsaa<
               "STORE_MIP", [], [AMDGPUArg<llvm_anyfloat_ty, "vdata">],
-              [IntrWriteMem, IntrWillReturn], [SDNPMemOperand], 1>,
-              AMDGPUImageDMaskIntrinsic;
+              [IntrWriteMem, IntrWillReturn], [SDNPMemOperand], 1>;
 
   //////////////////////////////////////////////////////////////////////////
   // MSAA intrinsics
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
@@ -376,36 +376,6 @@ static bool matchFPExtFromF16(Value *Arg, Value *&FPExtSrc) {
   return false;
 }
 
-// Trim all zero components from the end of the vector \p UseV and return
-// an appropriate bitset with known elements.
-static APInt trimTrailingZerosInVector(InstCombiner &IC, Value *UseV,
-                                       Instruction *I) {
-  auto *VTy = cast<FixedVectorType>(UseV->getType());
-  unsigned VWidth = VTy->getNumElements();
-  APInt DemandedElts = APInt::getAllOnes(VWidth);
-
-  for (int i = VWidth - 1; i >= 0; --i) {
-    APInt DemandOneElt = APInt::getOneBitSet(VWidth, i);
-    KnownFPClass KnownFPClass =
-        computeKnownFPClass(UseV, DemandOneElt, IC.getDataLayout(),
-                            /*InterestedClasses=*/fcAllFlags,
-                            /*Depth=*/0, &IC.getTargetLibraryInfo(),
-                            &IC.getAssumptionCache(), I,
-                            &IC.getDominatorTree(),
-                            &IC.getOptimizationRemarkEmitter());
-    if (KnownFPClass.KnownFPClasses != fcPosZero)
-      break;
-    DemandedElts.clearBit(i);
-  }
-  return DemandedElts;
-}
-
-static Value *simplifyAMDGCNMemoryIntrinsicDemanded(InstCombiner &IC,
-                                                    IntrinsicInst &II,
-                                                    APInt DemandedElts,
-                                                    int DMaskIdx = -1,
-                                                    bool IsLoad = true);
-
 std::optional<Instruction *>
 GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
   Intrinsic::ID IID = II.getIntrinsicID();
@@ -1120,65 +1090,26 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
       return IC.replaceInstUsesWith(II, ConstantInt::getFalse(II.getType()));
     break;
   }
-  case Intrinsic::amdgcn_buffer_store:
-  case Intrinsic::amdgcn_buffer_store_format:
-  case Intrinsic::amdgcn_raw_buffer_store:
-  case Intrinsic::amdgcn_raw_buffer_store_format:
-  case Intrinsic::amdgcn_raw_tbuffer_store:
-  case Intrinsic::amdgcn_struct_buffer_store:
-  case Intrinsic::amdgcn_struct_buffer_store_format:
-  case Intrinsic::amdgcn_struct_tbuffer_store:
-  case Intrinsic::amdgcn_tbuffer_store:
-  case Intrinsic::amdgcn_image_store_1d:
-  case Intrinsic::amdgcn_image_store_1darray:
-  case Intrinsic::amdgcn_image_store_2d:
-  case Intrinsic::amdgcn_image_store_2darray:
-  case Intrinsic::amdgcn_image_store_2darraymsaa:
-  case Intrinsic::amdgcn_image_store_2dmsaa:
-  case Intrinsic::amdgcn_image_store_3d:
-  case Intrinsic::amdgcn_image_store_cube:
-  case Intrinsic::amdgcn_image_store_mip_1d:
-  case Intrinsic::amdgcn_image_store_mip_1darray:
-  case Intrinsic::amdgcn_image_store_mip_2d:
-  case Intrinsic::amdgcn_image_store_mip_2darray:
-  case Intrinsic::amdgcn_image_store_mip_3d:
-  case Intrinsic::amdgcn_image_store_mip_cube: {
-    if (!isa<FixedVectorType>(II.getArgOperand(0)->getType()))
-      break;
-
-    APInt DemandedElts =
-        trimTrailingZerosInVector(IC, II.getArgOperand(0), &II);
-
-    int DMaskIdx = getAMDGPUImageDMaskIntrinsic(II.getIntrinsicID()) ? 1 : -1;
-    if (simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts, DMaskIdx,
-                                              false)) {
-      return IC.eraseInstFromFunction(II);
+  default: {
+    if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
+            AMDGPU::getImageDimIntrinsicInfo(II.getIntrinsicID())) {
+      return simplifyAMDGCNImageIntrinsic(ST, ImageDimIntr, II, IC);
     }
-
-    break;
-  }
   }
-  if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
-            AMDGPU::getImageDimIntrinsicInfo(II.getIntrinsicID())) {
-    return simplifyAMDGCNImageIntrinsic(ST, ImageDimIntr, II, IC);
   }
   return std::nullopt;
 }
 
 /// Implement SimplifyDemandedVectorElts for amdgcn buffer and image intrinsics.
 ///
-/// The result of simplifying amdgcn image and buffer store intrinsics is updating
-/// definitions of the intrinsics vector argument, not Uses of the result like
-/// image and buffer loads.
 /// Note: This only supports non-TFE/LWE image intrinsic calls; those have
 ///       struct returns.
 static Value *simplifyAMDGCNMemoryIntrinsicDemanded(InstCombiner &IC,
                                                     IntrinsicInst &II,
                                                     APInt DemandedElts,
-                                                    int DMaskIdx, bool IsLoad) {
+                                                    int DMaskIdx = -1) {
 
-  auto *IIVTy = cast<FixedVectorType>(IsLoad ? II.getType()
-                                             : II.getOperand(0)->getType());
+  auto *IIVTy = cast<FixedVectorType>(II.getType());
   unsigned VWidth = IIVTy->getNumElements();
   if (VWidth == 1)
     return nullptr;
@@ -1249,13 +1180,13 @@ static Value *simplifyAMDGCNMemoryIntrinsicDemanded(InstCombiner &IC,
     DemandedElts &= (1 << llvm::popcount(DMaskVal)) - 1;
 
     unsigned NewDMaskVal = 0;
-    unsigned OrigLdStIdx = 0;
+    unsigned OrigLoadIdx = 0;
     for (unsigned SrcIdx = 0; SrcIdx < 4; ++SrcIdx) {
       const unsigned Bit = 1 << SrcIdx;
       if (!!(DMaskVal & Bit)) {
-        if (!!DemandedElts[OrigLdStIdx])
+        if (!!DemandedElts[OrigLoadIdx])
           NewDMaskVal |= Bit;
-        OrigLdStIdx++;
+        OrigLoadIdx++;
       }
     }
 
@@ -1283,45 +1214,29 @@ static Value *simplifyAMDGCNMemoryIntrinsicDemanded(InstCombiner &IC,
       (NewNumElts == 1) ? EltTy : FixedVectorType::get(EltTy, NewNumElts);
   OverloadTys[0] = NewTy;
 
-  if (!IsLoad) {
-    SmallVector<int, 8> EltMask;
-    for (unsigned OrigStoreIdx = 0; OrigStoreIdx < VWidth; ++OrigStoreIdx)
-      if (DemandedElts[OrigStoreIdx])
-        EltMask.push_back(OrigStoreIdx);
-
-    if (NewNumElts == 1)
-      Args[0] = IC.Builder.CreateExtractElement(II.getOperand(0), EltMask[0]);
-    else
-      Args[0] = IC.Builder.CreateShuffleVector(II.getOperand(0), EltMask);
-  }
-
   Function *NewIntrin = Intrinsic::getDeclaration(
       II.getModule(), II.getIntrinsicID(), OverloadTys);
   CallInst *NewCall = IC.Builder.CreateCall(NewIntrin, Args);
   NewCall->takeName(&II);
   NewCall->copyMetadata(II);
 
-  if (IsLoad) {
-    if (NewNumElts == 1) {
-      return IC.Builder.CreateInsertElement(UndefValue::get(IIVTy), NewCall,
-                                            DemandedElts.countr_zero());
-    }
-
-    SmallVector<int, 8> EltMask;
-    unsigned NewLoadIdx = 0;
-    for (unsigned OrigLoadIdx = 0; OrigLoadIdx < VWidth; ++OrigLoadIdx) {
-      if (!!DemandedElts[OrigLoadIdx])
-        EltMask.push_back(NewLoadIdx++);
-      else
-        EltMask.push_back(NewNumElts);
-    }
-
-    auto *Shuffle = IC.Builder.CreateShuffleVector(NewCall, EltMask);
+  if (NewNumElts == 1) {
+    return IC.Builder.CreateInsertElement(UndefValue::get(IIVTy), NewCall,
+                                          DemandedElts.countr_zero());
+  }
 
-    return Shuffle;
+  SmallVector<int, 8> EltMask;
+  unsigned NewLoadIdx = 0;
+  for (unsigned OrigLoadIdx = 0; OrigLoadIdx < VWidth; ++OrigLoadIdx) {
+    if (!!DemandedElts[OrigLoadIdx])
+      EltMask.push_back(NewLoadIdx++);
+    else
+      EltMask.push_back(NewNumElts);
   }
 
-  return NewCall;
+  Value *Shuffle = IC.Builder.CreateShuffleVector(NewCall, EltMask);
+
+  return Shuffle;
 }
 
 std::optional<Value *> GCNTTIImpl::simplifyDemandedVectorEltsIntrinsic(
diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll
@@ -66,7 +66,7 @@ define double @test_constant_fold_rcp_f64_43() nounwind {
 
 define float @test_constant_fold_rcp_f32_43_strictfp() nounwind strictfp {
 ; CHECK-LABEL: @test_constant_fold_rcp_f32_43_strictfp(
-; CHECK-NEXT:    [[VAL:%.*]] = call float @llvm.amdgcn.rcp.f32(float 4.300000e+01) #[[ATTR13:[0-9]+]]
+; CHECK-NEXT:    [[VAL:%.*]] = call float @llvm.amdgcn.rcp.f32(float 4.300000e+01) #[[ATTR14:[0-9]+]]
 ; CHECK-NEXT:    ret float [[VAL]]
 ;
   %val = call float @llvm.amdgcn.rcp.f32(float 4.300000e+01) strictfp nounwind readnone
@@ -107,7 +107,7 @@ define double @test_constant_fold_sqrt_f64_undef() nounwind {
 
 define half @test_constant_fold_sqrt_f16_0() nounwind {
 ; CHECK-LABEL: @test_constant_fold_sqrt_f16_0(
-; CHECK-NEXT:    [[VAL:%.*]] = call half @llvm.amdgcn.sqrt.f16(half 0xH0000) #[[ATTR14:[0-9]+]]
+; CHECK-NEXT:    [[VAL:%.*]] = call half @llvm.amdgcn.sqrt.f16(half 0xH0000) #[[ATTR15:[0-9]+]]
 ; CHECK-NEXT:    ret half [[VAL]]
 ;
   %val = call half @llvm.amdgcn.sqrt.f16(half 0.0) nounwind readnone
@@ -116,7 +116,7 @@ define half @test_constant_fold_sqrt_f16_0() nounwind {
 
 define float @test_constant_fold_sqrt_f32_0() nounwind {
 ; CHECK-LABEL: @test_constant_fold_sqrt_f32_0(
-; CHECK-NEXT:    [[VAL:%.*]] = call float @llvm.amdgcn.sqrt.f32(float 0.000000e+00) #[[ATTR14]]
+; CHECK-NEXT:    [[VAL:%.*]] = call float @llvm.amdgcn.sqrt.f32(float 0.000000e+00) #[[ATTR15]]
 ; CHECK-NEXT:    ret float [[VAL]]
 ;
   %val = call float @llvm.amdgcn.sqrt.f32(float 0.0) nounwind readnone
@@ -125,7 +125,7 @@ define float @test_constant_fold_sqrt_f32_0() nounwind {
 
 define double @test_constant_fold_sqrt_f64_0() nounwind {
 ; CHECK-LABEL: @test_constant_fold_sqrt_f64_0(
-; CHECK-NEXT:    [[VAL:%.*]] = call double @llvm.amdgcn.sqrt.f64(double 0.000000e+00) #[[ATTR14]]
+; CHECK-NEXT:    [[VAL:%.*]] = call double @llvm.amdgcn.sqrt.f64(double 0.000000e+00) #[[ATTR15]]
 ; CHECK-NEXT:    ret double [[VAL]]
 ;
   %val = call double @llvm.amdgcn.sqrt.f64(double 0.0) nounwind readnone
@@ -134,7 +134,7 @@ define double @test_constant_fold_sqrt_f64_0() nounwind {
 
 define half @test_constant_fold_sqrt_f16_neg0() nounwind {
 ; CHECK-LABEL: @test_constant_fold_sqrt_f16_neg0(
-; CHECK-NEXT:    [[VAL:%.*]] = call half @llvm.amdgcn.sqrt.f16(half 0xH8000) #[[ATTR14]]
+; CHECK-NEXT:    [[VAL:%.*]] = call half @llvm.amdgcn.sqrt.f16(half 0xH8000) #[[ATTR15]]
 ; CHECK-NEXT:    ret half [[VAL]]
 ;
   %val = call half @llvm.amdgcn.sqrt.f16(half -0.0) nounwind readnone
@@ -143,7 +143,7 @@ define half @test_constant_fold_sqrt_f16_neg0() nounwind {
 
 define float @test_constant_fold_sqrt_f32_neg0() nounwind {
 ; CHECK-LABEL: @test_constant_fold_sqrt_f32_neg0(
-; CHECK-NEXT:    [[VAL:%.*]] = call float @llvm.amdgcn.sqrt.f32(float -0.000000e+00) #[[ATTR14]]
+; CHECK-NEXT:    [[VAL:%.*]] = call float @llvm.amdgcn.sqrt.f32(float -0.000000e+00) #[[ATTR15]]
 ; CHECK-NEXT:    ret float [[VAL]]
 ;
   %val = call float @llvm.amdgcn.sqrt.f32(float -0.0) nounwind readnone
@@ -152,7 +152,7 @@ define float @test_constant_fold_sqrt_f32_neg0() nounwind {
 
 define double @test_constant_fold_sqrt_f64_neg0() nounwind {
 ; CHECK-LABEL: @test_constant_fold_sqrt_f64_neg0(
-; CHECK-NEXT:    [[VAL:%.*]] = call double @llvm.amdgcn.sqrt.f64(double -0.000000e+00) #[[ATTR14]]
+; CHECK-NEXT:    [[VAL:%.*]] = call double @llvm.amdgcn.sqrt.f64(double -0.000000e+00) #[[ATTR15]]
 ; CHECK-NEXT:    ret double [[VAL]]
 ;
   %val = call double @llvm.amdgcn.sqrt.f64(double -0.0) nounwind readnone
@@ -644,7 +644,7 @@ define i1 @test_class_isnan_f32(float %x) nounwind {
 
 define i1 @test_class_isnan_f32_strict(float %x) nounwind {
 ; CHECK-LABEL: @test_class_isnan_f32_strict(
-; CHECK-NEXT:    [[VAL:%.*]] = call i1 @llvm.amdgcn.class.f32(float [[X:%.*]], i32 3) #[[ATTR15:[0-9]+]]
+; CHECK-NEXT:    [[VAL:%.*]] = call i1 @llvm.amdgcn.class.f32(float [[X:%.*]], i32 3) #[[ATTR16:[0-9]+]]
 ; CHECK-NEXT:    ret i1 [[VAL]]
 ;
   %val = call i1 @llvm.amdgcn.class.f32(float %x, i32 3) strictfp
@@ -662,7 +662,7 @@ define i1 @test_class_is_p0_n0_f32(float %x) nounwind {
 
 define i1 @test_class_is_p0_n0_f32_strict(float %x) nounwind {
 ; CHECK-LABEL: @test_class_is_p0_n0_f32_strict(
-; CHECK-NEXT:    [[VAL:%.*]] = call i1 @llvm.amdgcn.class.f32(float [[X:%.*]], i32 96) #[[ATTR15]]
+; CHECK-NEXT:    [[VAL:%.*]] = call i1 @llvm.amdgcn.class.f32(float [[X:%.*]], i32 96) #[[ATTR16]]
 ; CHECK-NEXT:    ret i1 [[VAL]]
 ;
   %val = call i1 @llvm.amdgcn.class.f32(float %x, i32 96) strictfp
@@ -1275,8 +1275,8 @@ define i32 @ubfe_offset_0_width_0(i32 %src) {
 
 define i32 @ubfe_offset_0_width_3(i32 %src) {
 ; CHECK-LABEL: @ubfe_offset_0_width_3(
-; CHECK-NEXT:    [[BFE:%.*]] = and i32 [[SRC:%.*]], 7
-; CHECK-NEXT:    ret i32 [[BFE]]
+; CHECK-NEXT:    [[TMP1:%.*]] = and i32 [[SRC:%.*]], 7
+; CHECK-NEXT:    ret i32 [[TMP1]]
 ;
   %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %src, i32 0, i32 3)
   ret i32 %bfe
@@ -1793,7 +1793,7 @@ define i64 @icmp_constant_inputs_false() {
 
 define i64 @icmp_constant_inputs_true() {
 ; CHECK-LABEL: @icmp_constant_inputs_true(
-; CHECK-NEXT:    [[RESULT:%.*]] = call i64 @llvm.read_register.i64(metadata [[META0:![0-9]+]]) #[[ATTR16:[0-9]+]]
+; CHECK-NEXT:    [[RESULT:%.*]] = call i64 @llvm.read_register.i64(metadata [[META0:![0-9]+]]) #[[ATTR17:[0-9]+]]
 ; CHECK-NEXT:    ret i64 [[RESULT]]
 ;
   %result = call i64 @llvm.amdgcn.icmp.i64.i32(i32 9, i32 8, i32 34)
@@ -2500,7 +2500,7 @@ define i64 @fcmp_constant_inputs_false() {
 
 define i64 @fcmp_constant_inputs_true() {
 ; CHECK-LABEL: @fcmp_constant_inputs_true(
-; CHECK-NEXT:    [[RESULT:%.*]] = call i64 @llvm.read_register.i64(metadata [[META0]]) #[[ATTR16]]
+; CHECK-NEXT:    [[RESULT:%.*]] = call i64 @llvm.read_register.i64(metadata [[META0]]) #[[ATTR17]]
 ; CHECK-NEXT:    ret i64 [[RESULT]]
 ;
   %result = call i64 @llvm.amdgcn.fcmp.i64.f32(float 2.0, float 4.0, i32 4)
@@ -2542,7 +2542,7 @@ define i64 @ballot_zero_64() {
 
 define i64 @ballot_one_64() {
 ; CHECK-LABEL: @ballot_one_64(
-; CHECK-NEXT:    [[B:%.*]] = call i64 @llvm.read_register.i64(metadata [[META0]]) #[[ATTR16]]
+; CHECK-NEXT:    [[B:%.*]] = call i64 @llvm.read_register.i64(metadata [[META0]]) #[[ATTR17]]
 ; CHECK-NEXT:    ret i64 [[B]]
 ;
   %b = call i64 @llvm.amdgcn.ballot.i64(i1 1)
@@ -2568,7 +2568,7 @@ define i32 @ballot_zero_32() {
 
 define i32 @ballot_one_32() {
 ; CHECK-LABEL: @ballot_one_32(
-; CHECK-NEXT:    [[B:%.*]] = call i32 @llvm.read_register.i32(metadata [[META1:![0-9]+]]) #[[ATTR16]]
+; CHECK-NEXT:    [[B:%.*]] = call i32 @llvm.read_register.i32(metadata [[META1:![0-9]+]]) #[[ATTR17]]
 ; CHECK-NEXT:    ret i32 [[B]]
 ;
   %b = call i32 @llvm.amdgcn.ballot.i32(i1 1)
@@ -5586,7 +5586,7 @@ define double @trig_preop_constfold() {
 
 define double @trig_preop_constfold_strictfp() {
 ; CHECK-LABEL: @trig_preop_constfold_strictfp(
-; CHECK-NEXT:    [[VAL:%.*]] = call double @llvm.amdgcn.trig.preop.f64(double 3.454350e+02, i32 5) #[[ATTR15]]
+; CHECK-NEXT:    [[VAL:%.*]] = call double @llvm.amdgcn.trig.preop.f64(double 3.454350e+02, i32 5) #[[ATTR16]]
 ; CHECK-NEXT:    ret double [[VAL]]
 ;
   %val = call double @llvm.amdgcn.trig.preop.f64(double 3.454350e+02, i32 5) strictfp
diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-simplify-image-buffer-stores.ll b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-simplify-image-buffer-stores.ll