[SLP] Initial support for (masked) loads + compress and (masked) interleaved #132099

Open
wants to merge 5 commits into main

Conversation

alexey-bataev
Member

Added initial support for (masked) loads + compress and (masked) interleaved loads.

Created using spr 1.3.5
@llvmbot
Member

llvmbot commented Mar 19, 2025

@llvm/pr-subscribers-llvm-transforms

@llvm/pr-subscribers-vectorizers

Author: Alexey Bataev (alexey-bataev)

Changes

Added initial support for (masked) loads + compress and (masked) interleaved loads.


Patch is 109.31 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/132099.diff
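To make the intent concrete, here is a hand-written sketch (not taken from the patch or its test files) of the rewrite this enables: four scalar i32 loads at element offsets 0, 2, 3 and 7 from a common base become one load of the covering <8 x i32> range, masked when the full range is not known to be dereferenceable, followed by a compressing shufflevector.

; Hypothetical before/after illustration, not part of the patch.
; Before: four scalar loads at element offsets 0, 2, 3 and 7.
define <4 x i32> @before(ptr %p) {
  %g2 = getelementptr inbounds i32, ptr %p, i64 2
  %g3 = getelementptr inbounds i32, ptr %p, i64 3
  %g7 = getelementptr inbounds i32, ptr %p, i64 7
  %l0 = load i32, ptr %p, align 4
  %l2 = load i32, ptr %g2, align 4
  %l3 = load i32, ptr %g3, align 4
  %l7 = load i32, ptr %g7, align 4
  %v0 = insertelement <4 x i32> poison, i32 %l0, i32 0
  %v1 = insertelement <4 x i32> %v0, i32 %l2, i32 1
  %v2 = insertelement <4 x i32> %v1, i32 %l3, i32 2
  %v3 = insertelement <4 x i32> %v2, i32 %l7, i32 3
  ret <4 x i32> %v3
}

; After: one masked load covering Diff + 1 = 8 elements, then a
; compress-like shuffle that keeps lanes 0, 2, 3 and 7.
define <4 x i32> @after(ptr %p) {
  %wide = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr %p, i32 4, <8 x i1> <i1 true, i1 false, i1 true, i1 true, i1 false, i1 false, i1 false, i1 true>, <8 x i32> poison)
  %res = shufflevector <8 x i32> %wide, <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 3, i32 7>
  ret <4 x i32> %res
}

declare <8 x i32> @llvm.masked.load.v8i32.p0(ptr, i32, <8 x i1>, <8 x i32>)

When the whole widened range is known to be safe to load unconditionally, a plain (unmasked) wide load is used instead of the intrinsic; the compressing shuffle stays the same.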

14 Files Affected:

  • (modified) llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp (+325-34)
  • (modified) llvm/test/Transforms/SLPVectorizer/X86/entries-shuffled-diff-sizes.ll (+7-10)
  • (modified) llvm/test/Transforms/SLPVectorizer/X86/gep-nodes-with-non-gep-inst.ll (+6-16)
  • (modified) llvm/test/Transforms/SLPVectorizer/X86/pr47623.ll (+6-10)
  • (modified) llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll (+52-110)
  • (modified) llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll (+52-110)
  • (modified) llvm/test/Transforms/SLPVectorizer/X86/remark_gather-load-redux-cost.ll (+5-6)
  • (modified) llvm/test/Transforms/SLPVectorizer/X86/reorder-possible-strided-node.ll (+28-24)
  • (modified) llvm/test/Transforms/SLPVectorizer/X86/reorder-reused-masked-gather.ll (+5-7)
  • (modified) llvm/test/Transforms/SLPVectorizer/X86/reorder-reused-masked-gather2.ll (+4-7)
  • (modified) llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reused-pointer.ll (+4-8)
  • (modified) llvm/test/Transforms/SLPVectorizer/X86/sin-sqrt.ll (+4-4)
  • (modified) llvm/test/Transforms/SLPVectorizer/X86/split-load8_2-unord.ll (+3-8)
  • (modified) llvm/test/Transforms/SLPVectorizer/X86/split-load8_2_unord_geps.ll (+5-6)
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 1d9d80bd69def..f9905cc7c3307 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -38,6 +38,7 @@
 #include "llvm/Analysis/DemandedBits.h"
 #include "llvm/Analysis/GlobalsModRef.h"
 #include "llvm/Analysis/IVDescriptors.h"
+#include "llvm/Analysis/Loads.h"
 #include "llvm/Analysis/LoopAccessAnalysis.h"
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/MemoryLocation.h"
@@ -1378,7 +1379,8 @@ class BoUpSLP {
     Gather,
     Vectorize,
     ScatterVectorize,
-    StridedVectorize
+    StridedVectorize,
+    MaskedLoadCompressVectorize
   };
 
   using ValueList = SmallVector<Value *, 8>;
@@ -3378,6 +3380,7 @@ class BoUpSLP {
       Vectorize,         ///< The node is regularly vectorized.
       ScatterVectorize,  ///< Masked scatter/gather node.
       StridedVectorize,  ///< Strided loads (and stores)
+      MaskedLoadCompressVectorize, ///< Masked load with compress.
       NeedToGather,      ///< Gather/buildvector node.
       CombinedVectorize, ///< Vectorized node, combined with its user into more
                          ///< complex node like select/cmp to minmax, mul/add to
@@ -3604,6 +3607,9 @@ class BoUpSLP {
       case StridedVectorize:
         dbgs() << "StridedVectorize\n";
         break;
+      case MaskedLoadCompressVectorize:
+        dbgs() << "MaskedLoadCompressVectorize\n";
+        break;
       case NeedToGather:
         dbgs() << "NeedToGather\n";
         break;
@@ -4650,7 +4656,8 @@ template <> struct DOTGraphTraits<BoUpSLP *> : public DefaultDOTGraphTraits {
     if (Entry->isGather())
       return "color=red";
     if (Entry->State == TreeEntry::ScatterVectorize ||
-        Entry->State == TreeEntry::StridedVectorize)
+        Entry->State == TreeEntry::StridedVectorize ||
+        Entry->State == TreeEntry::MaskedLoadCompressVectorize)
       return "color=blue";
     return "";
   }
@@ -5214,6 +5221,145 @@ static Value *createExtractVector(IRBuilderBase &Builder, Value *Vec,
   return Builder.CreateShuffleVector(Vec, Mask);
 }
 
+/// Builds compress-like mask for shuffles for the given \p PointerOps, ordered
+/// with \p Order.
+static void buildCompressMask(ArrayRef<Value *> PointerOps,
+                              ArrayRef<unsigned> Order, Type *ScalarTy,
+                              const DataLayout &DL, ScalarEvolution &SE,
+                              SmallVectorImpl<int> &CompressMask) {
+  const unsigned Sz = PointerOps.size();
+  CompressMask.assign(Sz, PoisonMaskElem);
+  // The first element is always set.
+  CompressMask[0] = 0;
+  Value *Ptr0 = Order.empty() ? PointerOps.front() : PointerOps[Order.front()];
+  for (unsigned I : seq<unsigned>(1, Sz)) {
+    Value *Ptr = Order.empty() ? PointerOps[I] : PointerOps[Order[I]];
+    unsigned Pos = *getPointersDiff(ScalarTy, Ptr0, ScalarTy, Ptr, DL, SE);
+    CompressMask[I] = Pos;
+  }
+}
+
+/// Checks if the \p VL can be transformed to a (masked)load + compress or
+/// (masked) interleaved load.
+static bool isMaskedLoadCompress(
+    ArrayRef<Value *> VL, ArrayRef<Value *> PointerOps,
+    ArrayRef<unsigned> Order, const TargetTransformInfo &TTI,
+    const DataLayout &DL, ScalarEvolution &SE, AssumptionCache &AC,
+    const DominatorTree &DT, const TargetLibraryInfo &TLI,
+    const function_ref<bool(Value *)> AreAllUsersVectorized, bool &IsMasked,
+    unsigned &InterleaveFactor, SmallVectorImpl<int> &CompressMask,
+    VectorType *&LoadVecTy) {
+  InterleaveFactor = 0;
+  Type *ScalarTy = VL.front()->getType();
+  const unsigned Sz = VL.size();
+  auto *VecTy = getWidenedType(ScalarTy, Sz);
+  constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+  // Check external uses.
+  for (const auto [I, V] : enumerate(VL)) {
+    if (AreAllUsersVectorized(V))
+      continue;
+    InstructionCost ExtractCost =
+        TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy, CostKind, I);
+    InstructionCost ScalarCost =
+        TTI.getInstructionCost(cast<Instruction>(V), CostKind);
+    if (ExtractCost <= ScalarCost)
+      return false;
+  }
+  Value *Ptr0;
+  Value *PtrN;
+  if (Order.empty()) {
+    Ptr0 = PointerOps.front();
+    PtrN = PointerOps.back();
+  } else {
+    Ptr0 = PointerOps[Order.front()];
+    PtrN = PointerOps[Order.back()];
+  }
+  std::optional<int> Diff =
+      getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, DL, SE);
+  if (!Diff)
+    return false;
+  const unsigned MaxRegSize =
+      TTI.getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
+          .getFixedValue();
+  // Check for very large distances between elements.
+  if (*Diff / Sz >= MaxRegSize / 8)
+    return false;
+  Align CommonAlignment = computeCommonAlignment<LoadInst>(VL);
+  LoadVecTy = getWidenedType(ScalarTy, *Diff + 1);
+  auto *LI = cast<LoadInst>(Order.empty() ? VL.front() : VL[Order.front()]);
+  bool IsNotMasked = isSafeToLoadUnconditionally(
+      Ptr0, LoadVecTy, CommonAlignment, DL,
+      cast<LoadInst>(Order.empty() ? VL.back() : VL[Order.back()]), &AC, &DT,
+      &TLI);
+  // TODO: perform the analysis of each scalar load for better
+  // safe-load-unconditionally analysis.
+  buildCompressMask(PointerOps, Order, ScalarTy, DL, SE, CompressMask);
+  assert(CompressMask.size() >= 2 && "At least two elements are required");
+  IsMasked = !IsNotMasked;
+  auto [ScalarGEPCost, VectorGEPCost] =
+      getGEPCosts(TTI, PointerOps, PointerOps.front(),
+                  Instruction::GetElementPtr, CostKind, ScalarTy, LoadVecTy);
+  // The cost of scalar loads.
+  InstructionCost ScalarLoadsCost =
+      std::accumulate(VL.begin(), VL.end(), InstructionCost(),
+                      [&](InstructionCost C, Value *V) {
+                        return C + TTI.getInstructionCost(cast<Instruction>(V),
+                                                          CostKind);
+                      }) +
+      ScalarGEPCost;
+  APInt DemandedElts = APInt::getAllOnes(Sz);
+  InstructionCost GatherCost =
+      getScalarizationOverhead(TTI, ScalarTy, VecTy, DemandedElts,
+                               /*Insert=*/true,
+                               /*Extract=*/false, CostKind) +
+      ScalarLoadsCost;
+  InstructionCost LoadCost = 0;
+  if (IsNotMasked)
+    LoadCost =
+        TTI.getMemoryOpCost(Instruction::Load, LoadVecTy,
+                            IsNotMasked ? LI->getAlign() : CommonAlignment,
+                            LI->getPointerAddressSpace(), CostKind);
+  else
+    LoadCost =
+        TTI.getMaskedMemoryOpCost(Instruction::Load, LoadVecTy, CommonAlignment,
+                                  LI->getPointerAddressSpace(), CostKind);
+  SmallVector<int> Mask;
+  if (!Order.empty())
+    inversePermutation(Order, Mask);
+  if (int Interval = CompressMask[1] - CompressMask[0];
+      Interval > 0 && all_of(enumerate(CompressMask), [&](const auto &D) {
+        return static_cast<unsigned>(D.value()) == D.index() * Interval;
+      })) {
+    // Check for potential segmented(interleaved) loads.
+    if (TTI.isLegalInterleavedAccessType(
+            LoadVecTy, Interval, IsNotMasked ? LI->getAlign() : CommonAlignment,
+            LI->getPointerAddressSpace())) {
+      InstructionCost InterleavedCost = TTI.getInterleavedMemoryOpCost(
+          Instruction::Load, LoadVecTy, Interval, std::nullopt,
+          IsNotMasked ? LI->getAlign() : CommonAlignment,
+          LI->getPointerAddressSpace(), CostKind, !IsNotMasked);
+      if (!Mask.empty())
+        InterleavedCost += ::getShuffleCost(TTI, TTI::SK_PermuteSingleSrc,
+                                            LoadVecTy, CompressMask, CostKind);
+      if (InterleavedCost < GatherCost) {
+        InterleaveFactor = Interval;
+        return true;
+      }
+    }
+  }
+  if (!Order.empty()) {
+    SmallVector<int> NewMask(Sz, PoisonMaskElem);
+    for (unsigned I : seq<unsigned>(Sz)) {
+      NewMask[I] = CompressMask[Mask[I]];
+    }
+    CompressMask.swap(NewMask);
+  }
+  InstructionCost CompressCost = ::getShuffleCost(
+      TTI, TTI::SK_PermuteSingleSrc, LoadVecTy, CompressMask, CostKind);
+  InstructionCost TotalVecCost = VectorGEPCost + LoadCost + CompressCost;
+  return TotalVecCost < GatherCost;
+}
+
 BoUpSLP::LoadsState
 BoUpSLP::canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
                            SmallVectorImpl<unsigned> &Order,
@@ -5285,9 +5431,6 @@ BoUpSLP::canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
     // Check that the sorted loads are consecutive.
     if (static_cast<unsigned>(*Diff) == Sz - 1)
       return LoadsState::Vectorize;
-    if (!TTI->isLegalMaskedGather(VecTy, CommonAlignment) ||
-        TTI->forceScalarizeMaskedGather(VecTy, CommonAlignment))
-      return LoadsState::Gather;
     // Simple check if not a strided access - clear order.
     bool IsPossibleStrided = *Diff % (Sz - 1) == 0;
     // Try to generate strided load node if:
@@ -5343,7 +5486,22 @@ BoUpSLP::canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
         }
       }
     }
+    [[maybe_unused]] bool IsMasked;
+    [[maybe_unused]] unsigned InterleaveFactor;
+    [[maybe_unused]] SmallVector<int> CompressMask;
+    [[maybe_unused]] VectorType *LoadVecTy;
+    if (isMaskedLoadCompress(
+            VL, PointerOps, Order, *TTI, *DL, *SE, *AC, *DT, *TLI,
+            [&](Value *V) {
+              return areAllUsersVectorized(cast<Instruction>(V),
+                                           UserIgnoreList);
+            },
+            IsMasked, InterleaveFactor, CompressMask, LoadVecTy))
+      return LoadsState::MaskedLoadCompressVectorize;
   }
+  if (!TTI->isLegalMaskedGather(VecTy, CommonAlignment) ||
+      TTI->forceScalarizeMaskedGather(VecTy, CommonAlignment))
+    return LoadsState::Gather;
   // Correctly identify compare the cost of loads + shuffles rather than
   // strided/masked gather loads. Returns true if vectorized + shuffles
   // representation is better than just gather.
@@ -5436,7 +5594,8 @@ BoUpSLP::canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
         }
         // If need the reorder - consider as high-cost masked gather for now.
         if ((LS == LoadsState::Vectorize ||
-             LS == LoadsState::StridedVectorize) &&
+             LS == LoadsState::StridedVectorize ||
+             LS == LoadsState::MaskedLoadCompressVectorize) &&
             !Order.empty() && !isReverseOrder(Order))
           LS = LoadsState::ScatterVectorize;
         States.push_back(LS);
@@ -5501,6 +5660,14 @@ BoUpSLP::canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
                                                   CommonAlignment, CostKind) +
                        VectorGEPCost;
           break;
+        case LoadsState::MaskedLoadCompressVectorize:
+          VecLdCost += TTI.getMaskedMemoryOpCost(
+                           Instruction::Load, SubVecTy, CommonAlignment,
+                           LI0->getPointerAddressSpace(), CostKind) +
+                       VectorGEPCost +
+                       ::getShuffleCost(TTI, TTI::SK_PermuteSingleSrc, SubVecTy,
+                                        {}, CostKind);
+          break;
         case LoadsState::ScatterVectorize:
           VecLdCost += TTI.getGatherScatterOpCost(Instruction::Load, SubVecTy,
                                                   LI0->getPointerOperand(),
@@ -5874,7 +6041,8 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom,
     return std::nullopt;
   if (TE.State == TreeEntry::SplitVectorize ||
       ((TE.State == TreeEntry::Vectorize ||
-        TE.State == TreeEntry::StridedVectorize) &&
+        TE.State == TreeEntry::StridedVectorize ||
+        TE.State == TreeEntry::MaskedLoadCompressVectorize) &&
        (isa<LoadInst, ExtractElementInst, ExtractValueInst>(TE.getMainOp()) ||
         (TopToBottom && isa<StoreInst, InsertElementInst>(TE.getMainOp()))))) {
     assert((TE.State == TreeEntry::SplitVectorize || !TE.isAltShuffle()) &&
@@ -6061,7 +6229,8 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom,
       OrdersType CurrentOrder;
       LoadsState Res = canVectorizeLoads(TE.Scalars, TE.Scalars.front(),
                                          CurrentOrder, PointerOps);
-      if (Res == LoadsState::Vectorize || Res == LoadsState::StridedVectorize)
+      if (Res == LoadsState::Vectorize || Res == LoadsState::StridedVectorize ||
+          Res == LoadsState::MaskedLoadCompressVectorize)
         return std::move(CurrentOrder);
     }
     // FIXME: Remove the non-power-of-two check once findReusedOrderedScalars
@@ -6301,7 +6470,8 @@ void BoUpSLP::reorderTopToBottom() {
       VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
       if (!(TE->State == TreeEntry::Vectorize ||
             TE->State == TreeEntry::StridedVectorize ||
-            TE->State == TreeEntry::SplitVectorize) ||
+            TE->State == TreeEntry::SplitVectorize ||
+            TE->State == TreeEntry::MaskedLoadCompressVectorize) ||
           !TE->ReuseShuffleIndices.empty())
         GathersToOrders.try_emplace(TE.get(), *CurrentOrder);
       if (TE->State == TreeEntry::Vectorize &&
@@ -6478,7 +6648,8 @@ void BoUpSLP::reorderTopToBottom() {
       if ((TE->State == TreeEntry::SplitVectorize &&
            TE->ReuseShuffleIndices.empty()) ||
           ((TE->State == TreeEntry::Vectorize ||
-            TE->State == TreeEntry::StridedVectorize) &&
+            TE->State == TreeEntry::StridedVectorize ||
+            TE->State == TreeEntry::MaskedLoadCompressVectorize) &&
            (isa<ExtractElementInst, ExtractValueInst, LoadInst, StoreInst,
                 InsertElementInst>(TE->getMainOp()) ||
             (SLPReVec && isa<ShuffleVectorInst>(TE->getMainOp()))))) {
@@ -6526,6 +6697,8 @@ bool BoUpSLP::canReorderOperands(
           return OpData.first == I &&
                  (OpData.second->State == TreeEntry::Vectorize ||
                   OpData.second->State == TreeEntry::StridedVectorize ||
+                  OpData.second->State ==
+                      TreeEntry::MaskedLoadCompressVectorize ||
                   OpData.second->State == TreeEntry::SplitVectorize);
         }))
       continue;
@@ -6540,6 +6713,7 @@ bool BoUpSLP::canReorderOperands(
       // node, just reorder reuses mask.
       if (TE->State != TreeEntry::Vectorize &&
           TE->State != TreeEntry::StridedVectorize &&
+          TE->State != TreeEntry::MaskedLoadCompressVectorize &&
           TE->State != TreeEntry::SplitVectorize &&
           TE->ReuseShuffleIndices.empty() && TE->ReorderIndices.empty())
         GatherOps.push_back(TE);
@@ -6550,6 +6724,7 @@ bool BoUpSLP::canReorderOperands(
                  [&Gather, UserTE, I](TreeEntry *TE) {
                    assert(TE->State != TreeEntry::Vectorize &&
                           TE->State != TreeEntry::StridedVectorize &&
+                          TE->State != TreeEntry::MaskedLoadCompressVectorize &&
                           TE->State != TreeEntry::SplitVectorize &&
                           "Only non-vectorized nodes are expected.");
                    if (TE->UserTreeIndex.UserTE == UserTE &&
@@ -6586,6 +6761,7 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
   for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
     if (TE->State != TreeEntry::Vectorize &&
         TE->State != TreeEntry::StridedVectorize &&
+        TE->State != TreeEntry::MaskedLoadCompressVectorize &&
         TE->State != TreeEntry::SplitVectorize)
       NonVectorized.push_back(TE.get());
     if (std::optional<OrdersType> CurrentOrder =
@@ -6593,6 +6769,7 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
       Queue.push(TE.get());
       if (!(TE->State == TreeEntry::Vectorize ||
             TE->State == TreeEntry::StridedVectorize ||
+            TE->State == TreeEntry::MaskedLoadCompressVectorize ||
             TE->State == TreeEntry::SplitVectorize) ||
           !TE->ReuseShuffleIndices.empty())
         GathersToOrders.insert(TE.get());
@@ -6621,6 +6798,7 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
     for (TreeEntry *TE : OrderedOps) {
       if (!(TE->State == TreeEntry::Vectorize ||
             TE->State == TreeEntry::StridedVectorize ||
+            TE->State == TreeEntry::MaskedLoadCompressVectorize ||
             TE->State == TreeEntry::SplitVectorize ||
             (TE->isGather() && GathersToOrders.contains(TE))) ||
           !TE->UserTreeIndex || !TE->ReuseShuffleIndices.empty() ||
@@ -6918,6 +7096,7 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
         // Gathers are processed separately.
         if (TE->State != TreeEntry::Vectorize &&
             TE->State != TreeEntry::StridedVectorize &&
+            TE->State != TreeEntry::MaskedLoadCompressVectorize &&
             TE->State != TreeEntry::SplitVectorize &&
             (TE->State != TreeEntry::ScatterVectorize ||
              TE->ReorderIndices.empty()))
@@ -6950,7 +7129,8 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
         Data.first->reorderOperands(Mask);
       if (!isa<InsertElementInst, StoreInst>(Data.first->getMainOp()) ||
           Data.first->isAltShuffle() ||
-          Data.first->State == TreeEntry::StridedVectorize) {
+          Data.first->State == TreeEntry::StridedVectorize ||
+          Data.first->State == TreeEntry::MaskedLoadCompressVectorize) {
         reorderScalars(Data.first->Scalars, Mask);
         reorderOrder(Data.first->ReorderIndices, MaskOrder,
                      /*BottomOrder=*/true);
@@ -7722,22 +7902,31 @@ void BoUpSLP::tryToVectorizeGatheredLoads(
               // just exit.
               unsigned ConsecutiveNodesSize = 0;
               if (!LoadEntriesToVectorize.empty() && InterleaveFactor == 0 &&
-                  any_of(zip(LoadEntriesToVectorize, LoadSetsToVectorize),
-                         [&, Slice = Slice](const auto &P) {
-                           const auto *It = find_if(Slice, [&](Value *V) {
-                             return std::get<1>(P).contains(V);
-                           });
-                           if (It == Slice.end())
-                             return false;
-                           ArrayRef<Value *> VL =
-                               VectorizableTree[std::get<0>(P)]->Scalars;
-                           ConsecutiveNodesSize += VL.size();
-                           unsigned Start = std::distance(Slice.begin(), It);
-                           unsigned Sz = Slice.size() - Start;
-                           return Sz < VL.size() ||
-                                  Slice.slice(std::distance(Slice.begin(), It),
-                                              VL.size()) != VL;
-                         }))
+                  any_of(
+                      zip(LoadEntriesToVectorize, LoadSetsToVectorize),
+                      [&, Slice = Slice](const auto &P) {
+                        const auto *It = find_if(Slice, [&](Value *V) {
+                          return std::get<1>(P).contains(V);
+                        });
+                        if (It == Slice.end())
+                          return false;
+                        const TreeEntry &TE = *VectorizableTree[std::get<0>(P)];
+                        ArrayRef<Value *> VL = TE.Scalars;
+                        OrdersType Order;
+                        SmallVector<Value *> PointerOps;
+                        LoadsState State =
+                            canVectorizeLoads(VL, VL.front(), Order, PointerOps);
+                        if (State == LoadsState::ScatterVectorize ||
+                            State == LoadsState::MaskedLoadCompressVectorize)
+                          return false;
+                        ConsecutiveNodesSize += VL.size();
+                        unsigned Start = std::distance(Slice.begin(), It);
+                        unsigned Sz = Slice.size() - Start;
+                        return Sz < VL.size() ||
+                               Slice.slice(std::distance(Slice.begin(), It),
+                                           VL.size()) != VL;
+                      }))
                 continue;
               // Try to build long masked gather loads.
               UserMaxVF = bit_ceil(UserMaxVF);
@@ -8216,6 +8405,13 @@ BoUpSLP::TreeEntr...
[truncated]


github-actions bot commented Mar 19, 2025

✅ With the latest revision this PR passed the C/C++ code formatter.

Created using spr 1.3.5
@alexey-bataev
Member Author

Supposed to fix #130872, but currently only fixes it for AVX512. Requires better pointer analysis to avoid masked loads.

/// Builds compress-like mask for shuffles for the given \p PointerOps, ordered
/// with \p Order.
static void buildCompressMask(ArrayRef<Value *> PointerOps,
Collaborator


Worth returning true (or std::optional) here if the compression stride is uniform?

Member Author


Not sure if this is possible at all. Can you share a scenario?
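For readers following the thread, a hedged illustration of the uniform-stride case (hand-written, not from the patch): pointer diffs of 0, 2, 4 and 6 give CompressMask = [0, 2, 4, 6] with a uniform interval of 2, which isMaskedLoadCompress later detects via the Interval check and costs as an interleaved (segmented) load rather than a plain load + compress.

; Hypothetical stride-2 example, not part of the patch, assuming the
; whole widened range (Diff + 1 = 7 elements) is safe to load.
define <4 x i32> @stride2(ptr %p) {
  %wide = load <7 x i32>, ptr %p, align 4
  %res = shufflevector <7 x i32> %wide, <7 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  ret <4 x i32> %res
}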

Created using spr 1.3.5
@alexey-bataev
Member Author

Ping!

Created using spr 1.3.5
Collaborator

@RKSimon left a comment


Better comments describing what's going on?

Created using spr 1.3.5