[SLP]Initial support for (masked)loads + compress and (masked)interleaved #132099
base: main
Conversation
@llvm/pr-subscribers-llvm-transforms @llvm/pr-subscribers-vectorizers

Author: Alexey Bataev (alexey-bataev)

Changes: Added initial support for (masked) loads + compress and (masked) interleaved loads. An illustrative sketch of the kind of load pattern this targets follows the patch excerpt below.

Patch is 109.31 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/132099.diff

14 Files Affected:
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 1d9d80bd69def..f9905cc7c3307 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -38,6 +38,7 @@
#include "llvm/Analysis/DemandedBits.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/IVDescriptors.h"
+#include "llvm/Analysis/Loads.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/MemoryLocation.h"
@@ -1378,7 +1379,8 @@ class BoUpSLP {
Gather,
Vectorize,
ScatterVectorize,
- StridedVectorize
+ StridedVectorize,
+ MaskedLoadCompressVectorize
};
using ValueList = SmallVector<Value *, 8>;
@@ -3378,6 +3380,7 @@ class BoUpSLP {
Vectorize, ///< The node is regularly vectorized.
ScatterVectorize, ///< Masked scatter/gather node.
StridedVectorize, ///< Strided loads (and stores)
+ MaskedLoadCompressVectorize, ///< Masked load with compress.
NeedToGather, ///< Gather/buildvector node.
CombinedVectorize, ///< Vectorized node, combined with its user into more
///< complex node like select/cmp to minmax, mul/add to
@@ -3604,6 +3607,9 @@ class BoUpSLP {
case StridedVectorize:
dbgs() << "StridedVectorize\n";
break;
+ case MaskedLoadCompressVectorize:
+ dbgs() << "MaskedLoadCompressVectorize\n";
+ break;
case NeedToGather:
dbgs() << "NeedToGather\n";
break;
@@ -4650,7 +4656,8 @@ template <> struct DOTGraphTraits<BoUpSLP *> : public DefaultDOTGraphTraits {
if (Entry->isGather())
return "color=red";
if (Entry->State == TreeEntry::ScatterVectorize ||
- Entry->State == TreeEntry::StridedVectorize)
+ Entry->State == TreeEntry::StridedVectorize ||
+ Entry->State == TreeEntry::MaskedLoadCompressVectorize)
return "color=blue";
return "";
}
@@ -5214,6 +5221,145 @@ static Value *createExtractVector(IRBuilderBase &Builder, Value *Vec,
return Builder.CreateShuffleVector(Vec, Mask);
}
+/// Builds compress-like mask for shuffles for the given \p PointerOps, ordered
+/// with \p Order.
+static void buildCompressMask(ArrayRef<Value *> PointerOps,
+ ArrayRef<unsigned> Order, Type *ScalarTy,
+ const DataLayout &DL, ScalarEvolution &SE,
+ SmallVectorImpl<int> &CompressMask) {
+ const unsigned Sz = PointerOps.size();
+ CompressMask.assign(Sz, PoisonMaskElem);
+ // The first element is always set.
+ CompressMask[0] = 0;
+ Value *Ptr0 = Order.empty() ? PointerOps.front() : PointerOps[Order.front()];
+ for (unsigned I : seq<unsigned>(1, Sz)) {
+ Value *Ptr = Order.empty() ? PointerOps[I] : PointerOps[Order[I]];
+ unsigned Pos = *getPointersDiff(ScalarTy, Ptr0, ScalarTy, Ptr, DL, SE);
+ CompressMask[I] = Pos;
+ }
+}
+
+/// Checks if the \p VL can be transformed to a (masked)load + compress or
+/// (masked) interleaved load.
+static bool isMaskedLoadCompress(
+ ArrayRef<Value *> VL, ArrayRef<Value *> PointerOps,
+ ArrayRef<unsigned> Order, const TargetTransformInfo &TTI,
+ const DataLayout &DL, ScalarEvolution &SE, AssumptionCache &AC,
+ const DominatorTree &DT, const TargetLibraryInfo &TLI,
+ const function_ref<bool(Value *)> AreAllUsersVectorized, bool &IsMasked,
+ unsigned &InterleaveFactor, SmallVectorImpl<int> &CompressMask,
+ VectorType *&LoadVecTy) {
+ InterleaveFactor = 0;
+ Type *ScalarTy = VL.front()->getType();
+ const unsigned Sz = VL.size();
+ auto *VecTy = getWidenedType(ScalarTy, Sz);
+ constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+ // Check external uses.
+ for (const auto [I, V] : enumerate(VL)) {
+ if (AreAllUsersVectorized(V))
+ continue;
+ InstructionCost ExtractCost =
+ TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy, CostKind, I);
+ InstructionCost ScalarCost =
+ TTI.getInstructionCost(cast<Instruction>(V), CostKind);
+ if (ExtractCost <= ScalarCost)
+ return false;
+ }
+ Value *Ptr0;
+ Value *PtrN;
+ if (Order.empty()) {
+ Ptr0 = PointerOps.front();
+ PtrN = PointerOps.back();
+ } else {
+ Ptr0 = PointerOps[Order.front()];
+ PtrN = PointerOps[Order.back()];
+ }
+ std::optional<int> Diff =
+ getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, DL, SE);
+ if (!Diff)
+ return false;
+ const unsigned MaxRegSize =
+ TTI.getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
+ .getFixedValue();
+ // Check for very large distances between elements.
+ if (*Diff / Sz >= MaxRegSize / 8)
+ return false;
+ Align CommonAlignment = computeCommonAlignment<LoadInst>(VL);
+ LoadVecTy = getWidenedType(ScalarTy, *Diff + 1);
+ auto *LI = cast<LoadInst>(Order.empty() ? VL.front() : VL[Order.front()]);
+ bool IsNotMasked = isSafeToLoadUnconditionally(
+ Ptr0, LoadVecTy, CommonAlignment, DL,
+ cast<LoadInst>(Order.empty() ? VL.back() : VL[Order.back()]), &AC, &DT,
+ &TLI);
+ // TODO: perform the analysis of each scalar load for better
+ // safe-load-unconditionally analysis.
+ buildCompressMask(PointerOps, Order, ScalarTy, DL, SE, CompressMask);
+ assert(CompressMask.size() >= 2 && "At least two elements are required");
+ IsMasked = !IsNotMasked;
+ auto [ScalarGEPCost, VectorGEPCost] =
+ getGEPCosts(TTI, PointerOps, PointerOps.front(),
+ Instruction::GetElementPtr, CostKind, ScalarTy, LoadVecTy);
+ // The cost of scalar loads.
+ InstructionCost ScalarLoadsCost =
+ std::accumulate(VL.begin(), VL.end(), InstructionCost(),
+ [&](InstructionCost C, Value *V) {
+ return C + TTI.getInstructionCost(cast<Instruction>(V),
+ CostKind);
+ }) +
+ ScalarGEPCost;
+ APInt DemandedElts = APInt::getAllOnes(Sz);
+ InstructionCost GatherCost =
+ getScalarizationOverhead(TTI, ScalarTy, VecTy, DemandedElts,
+ /*Insert=*/true,
+ /*Extract=*/false, CostKind) +
+ ScalarLoadsCost;
+ InstructionCost LoadCost = 0;
+ if (IsNotMasked)
+ LoadCost =
+ TTI.getMemoryOpCost(Instruction::Load, LoadVecTy,
+ IsNotMasked ? LI->getAlign() : CommonAlignment,
+ LI->getPointerAddressSpace(), CostKind);
+ else
+ LoadCost =
+ TTI.getMaskedMemoryOpCost(Instruction::Load, LoadVecTy, CommonAlignment,
+ LI->getPointerAddressSpace(), CostKind);
+ SmallVector<int> Mask;
+ if (!Order.empty())
+ inversePermutation(Order, Mask);
+ if (int Interval = CompressMask[1] - CompressMask[0];
+ Interval > 0 && all_of(enumerate(CompressMask), [&](const auto &D) {
+ return static_cast<unsigned>(D.value()) == D.index() * Interval;
+ })) {
+ // Check for potential segmented(interleaved) loads.
+ if (TTI.isLegalInterleavedAccessType(
+ LoadVecTy, Interval, IsNotMasked ? LI->getAlign() : CommonAlignment,
+ LI->getPointerAddressSpace())) {
+ InstructionCost InterleavedCost = TTI.getInterleavedMemoryOpCost(
+ Instruction::Load, LoadVecTy, Interval, std::nullopt,
+ IsNotMasked ? LI->getAlign() : CommonAlignment,
+ LI->getPointerAddressSpace(), CostKind, !IsNotMasked);
+ if (!Mask.empty())
+ InterleavedCost += ::getShuffleCost(TTI, TTI::SK_PermuteSingleSrc,
+ LoadVecTy, CompressMask, CostKind);
+ if (InterleavedCost < GatherCost) {
+ InterleaveFactor = Interval;
+ return true;
+ }
+ }
+ }
+ if (!Order.empty()) {
+ SmallVector<int> NewMask(Sz, PoisonMaskElem);
+ for (unsigned I : seq<unsigned>(Sz)) {
+ NewMask[I] = CompressMask[Mask[I]];
+ }
+ CompressMask.swap(NewMask);
+ }
+ InstructionCost CompressCost = ::getShuffleCost(
+ TTI, TTI::SK_PermuteSingleSrc, LoadVecTy, CompressMask, CostKind);
+ InstructionCost TotalVecCost = VectorGEPCost + LoadCost + CompressCost;
+ return TotalVecCost < GatherCost;
+}
+
BoUpSLP::LoadsState
BoUpSLP::canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
SmallVectorImpl<unsigned> &Order,
@@ -5285,9 +5431,6 @@ BoUpSLP::canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
// Check that the sorted loads are consecutive.
if (static_cast<unsigned>(*Diff) == Sz - 1)
return LoadsState::Vectorize;
- if (!TTI->isLegalMaskedGather(VecTy, CommonAlignment) ||
- TTI->forceScalarizeMaskedGather(VecTy, CommonAlignment))
- return LoadsState::Gather;
// Simple check if not a strided access - clear order.
bool IsPossibleStrided = *Diff % (Sz - 1) == 0;
// Try to generate strided load node if:
@@ -5343,7 +5486,22 @@ BoUpSLP::canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
}
}
}
+ [[maybe_unused]] bool IsMasked;
+ [[maybe_unused]] unsigned InterleaveFactor;
+ [[maybe_unused]] SmallVector<int> CompressMask;
+ [[maybe_unused]] VectorType *LoadVecTy;
+ if (isMaskedLoadCompress(
+ VL, PointerOps, Order, *TTI, *DL, *SE, *AC, *DT, *TLI,
+ [&](Value *V) {
+ return areAllUsersVectorized(cast<Instruction>(V),
+ UserIgnoreList);
+ },
+ IsMasked, InterleaveFactor, CompressMask, LoadVecTy))
+ return LoadsState::MaskedLoadCompressVectorize;
}
+ if (!TTI->isLegalMaskedGather(VecTy, CommonAlignment) ||
+ TTI->forceScalarizeMaskedGather(VecTy, CommonAlignment))
+ return LoadsState::Gather;
// Correctly identify compare the cost of loads + shuffles rather than
// strided/masked gather loads. Returns true if vectorized + shuffles
// representation is better than just gather.
@@ -5436,7 +5594,8 @@ BoUpSLP::canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
}
// If need the reorder - consider as high-cost masked gather for now.
if ((LS == LoadsState::Vectorize ||
- LS == LoadsState::StridedVectorize) &&
+ LS == LoadsState::StridedVectorize ||
+ LS == LoadsState::MaskedLoadCompressVectorize) &&
!Order.empty() && !isReverseOrder(Order))
LS = LoadsState::ScatterVectorize;
States.push_back(LS);
@@ -5501,6 +5660,14 @@ BoUpSLP::canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
CommonAlignment, CostKind) +
VectorGEPCost;
break;
+ case LoadsState::MaskedLoadCompressVectorize:
+ VecLdCost += TTI.getMaskedMemoryOpCost(
+ Instruction::Load, SubVecTy, CommonAlignment,
+ LI0->getPointerAddressSpace(), CostKind) +
+ VectorGEPCost +
+ ::getShuffleCost(TTI, TTI::SK_PermuteSingleSrc, SubVecTy,
+ {}, CostKind);
+ break;
case LoadsState::ScatterVectorize:
VecLdCost += TTI.getGatherScatterOpCost(Instruction::Load, SubVecTy,
LI0->getPointerOperand(),
@@ -5874,7 +6041,8 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom,
return std::nullopt;
if (TE.State == TreeEntry::SplitVectorize ||
((TE.State == TreeEntry::Vectorize ||
- TE.State == TreeEntry::StridedVectorize) &&
+ TE.State == TreeEntry::StridedVectorize ||
+ TE.State == TreeEntry::MaskedLoadCompressVectorize) &&
(isa<LoadInst, ExtractElementInst, ExtractValueInst>(TE.getMainOp()) ||
(TopToBottom && isa<StoreInst, InsertElementInst>(TE.getMainOp()))))) {
assert((TE.State == TreeEntry::SplitVectorize || !TE.isAltShuffle()) &&
@@ -6061,7 +6229,8 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom,
OrdersType CurrentOrder;
LoadsState Res = canVectorizeLoads(TE.Scalars, TE.Scalars.front(),
CurrentOrder, PointerOps);
- if (Res == LoadsState::Vectorize || Res == LoadsState::StridedVectorize)
+ if (Res == LoadsState::Vectorize || Res == LoadsState::StridedVectorize ||
+ Res == LoadsState::MaskedLoadCompressVectorize)
return std::move(CurrentOrder);
}
// FIXME: Remove the non-power-of-two check once findReusedOrderedScalars
@@ -6301,7 +6470,8 @@ void BoUpSLP::reorderTopToBottom() {
VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
if (!(TE->State == TreeEntry::Vectorize ||
TE->State == TreeEntry::StridedVectorize ||
- TE->State == TreeEntry::SplitVectorize) ||
+ TE->State == TreeEntry::SplitVectorize ||
+ TE->State == TreeEntry::MaskedLoadCompressVectorize) ||
!TE->ReuseShuffleIndices.empty())
GathersToOrders.try_emplace(TE.get(), *CurrentOrder);
if (TE->State == TreeEntry::Vectorize &&
@@ -6478,7 +6648,8 @@ void BoUpSLP::reorderTopToBottom() {
if ((TE->State == TreeEntry::SplitVectorize &&
TE->ReuseShuffleIndices.empty()) ||
((TE->State == TreeEntry::Vectorize ||
- TE->State == TreeEntry::StridedVectorize) &&
+ TE->State == TreeEntry::StridedVectorize ||
+ TE->State == TreeEntry::MaskedLoadCompressVectorize) &&
(isa<ExtractElementInst, ExtractValueInst, LoadInst, StoreInst,
InsertElementInst>(TE->getMainOp()) ||
(SLPReVec && isa<ShuffleVectorInst>(TE->getMainOp()))))) {
@@ -6526,6 +6697,8 @@ bool BoUpSLP::canReorderOperands(
return OpData.first == I &&
(OpData.second->State == TreeEntry::Vectorize ||
OpData.second->State == TreeEntry::StridedVectorize ||
+ OpData.second->State ==
+ TreeEntry::MaskedLoadCompressVectorize ||
OpData.second->State == TreeEntry::SplitVectorize);
}))
continue;
@@ -6540,6 +6713,7 @@ bool BoUpSLP::canReorderOperands(
// node, just reorder reuses mask.
if (TE->State != TreeEntry::Vectorize &&
TE->State != TreeEntry::StridedVectorize &&
+ TE->State != TreeEntry::MaskedLoadCompressVectorize &&
TE->State != TreeEntry::SplitVectorize &&
TE->ReuseShuffleIndices.empty() && TE->ReorderIndices.empty())
GatherOps.push_back(TE);
@@ -6550,6 +6724,7 @@ bool BoUpSLP::canReorderOperands(
[&Gather, UserTE, I](TreeEntry *TE) {
assert(TE->State != TreeEntry::Vectorize &&
TE->State != TreeEntry::StridedVectorize &&
+ TE->State != TreeEntry::MaskedLoadCompressVectorize &&
TE->State != TreeEntry::SplitVectorize &&
"Only non-vectorized nodes are expected.");
if (TE->UserTreeIndex.UserTE == UserTE &&
@@ -6586,6 +6761,7 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
if (TE->State != TreeEntry::Vectorize &&
TE->State != TreeEntry::StridedVectorize &&
+ TE->State != TreeEntry::MaskedLoadCompressVectorize &&
TE->State != TreeEntry::SplitVectorize)
NonVectorized.push_back(TE.get());
if (std::optional<OrdersType> CurrentOrder =
@@ -6593,6 +6769,7 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
Queue.push(TE.get());
if (!(TE->State == TreeEntry::Vectorize ||
TE->State == TreeEntry::StridedVectorize ||
+ TE->State == TreeEntry::MaskedLoadCompressVectorize ||
TE->State == TreeEntry::SplitVectorize) ||
!TE->ReuseShuffleIndices.empty())
GathersToOrders.insert(TE.get());
@@ -6621,6 +6798,7 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
for (TreeEntry *TE : OrderedOps) {
if (!(TE->State == TreeEntry::Vectorize ||
TE->State == TreeEntry::StridedVectorize ||
+ TE->State == TreeEntry::MaskedLoadCompressVectorize ||
TE->State == TreeEntry::SplitVectorize ||
(TE->isGather() && GathersToOrders.contains(TE))) ||
!TE->UserTreeIndex || !TE->ReuseShuffleIndices.empty() ||
@@ -6918,6 +7096,7 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
// Gathers are processed separately.
if (TE->State != TreeEntry::Vectorize &&
TE->State != TreeEntry::StridedVectorize &&
+ TE->State != TreeEntry::MaskedLoadCompressVectorize &&
TE->State != TreeEntry::SplitVectorize &&
(TE->State != TreeEntry::ScatterVectorize ||
TE->ReorderIndices.empty()))
@@ -6950,7 +7129,8 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
Data.first->reorderOperands(Mask);
if (!isa<InsertElementInst, StoreInst>(Data.first->getMainOp()) ||
Data.first->isAltShuffle() ||
- Data.first->State == TreeEntry::StridedVectorize) {
+ Data.first->State == TreeEntry::StridedVectorize ||
+ Data.first->State == TreeEntry::MaskedLoadCompressVectorize) {
reorderScalars(Data.first->Scalars, Mask);
reorderOrder(Data.first->ReorderIndices, MaskOrder,
/*BottomOrder=*/true);
@@ -7722,22 +7902,31 @@ void BoUpSLP::tryToVectorizeGatheredLoads(
// just exit.
unsigned ConsecutiveNodesSize = 0;
if (!LoadEntriesToVectorize.empty() && InterleaveFactor == 0 &&
- any_of(zip(LoadEntriesToVectorize, LoadSetsToVectorize),
- [&, Slice = Slice](const auto &P) {
- const auto *It = find_if(Slice, [&](Value *V) {
- return std::get<1>(P).contains(V);
- });
- if (It == Slice.end())
- return false;
- ArrayRef<Value *> VL =
- VectorizableTree[std::get<0>(P)]->Scalars;
- ConsecutiveNodesSize += VL.size();
- unsigned Start = std::distance(Slice.begin(), It);
- unsigned Sz = Slice.size() - Start;
- return Sz < VL.size() ||
- Slice.slice(std::distance(Slice.begin(), It),
- VL.size()) != VL;
- }))
+ any_of(
+ zip(LoadEntriesToVectorize, LoadSetsToVectorize),
+ [&, Slice = Slice](const auto &P) {
+ const auto *It = find_if(Slice, [&](Value *V) {
+ return std::get<1>(P).contains(V);
+ });
+ if (It == Slice.end())
+ return false;
+ const TreeEntry &TE = *VectorizableTree[std::get<0>(P)];
+ ArrayRef<Value *> VL = TE.Scalars;
+ OrdersType Order;
+ SmallVector<Value *> PointerOps;
+ LoadsState State =
+ canVectorizeLoads(VL, VL.front(), Order,
+ PointerOps);
+ if (State == LoadsState::ScatterVectorize ||
+ State == LoadsState::MaskedLoadCompressVectorize)
+ return false;
+ ConsecutiveNodesSize += VL.size();
+ unsigned Start = std::distance(Slice.begin(), It);
+ unsigned Sz = Slice.size() - Start;
+ return Sz < VL.size() ||
+ Slice.slice(std::distance(Slice.begin(), It),
+ VL.size()) != VL;
+ }))
continue;
// Try to build long masked gather loads.
UserMaxVF = bit_ceil(UserMaxVF);
@@ -8216,6 +8405,13 @@ BoUpSLP::TreeEntr...
[truncated]
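To make the intent concrete, here is a minimal, hypothetical source-level sketch of the pattern the new MaskedLoadCompressVectorize path targets (the function and offsets are illustrative, not taken from the PR's tests): several loads from nearby but non-consecutive elements of the same base pointer.

```c++
// Hypothetical example, not from the PR's test suite.
// Scalar form: loads at element offsets 0, 2, 3 and 5 of the same base pointer.
int sum4(const int *a) {
  return a[0] + a[2] + a[3] + a[5];
}
// Intended vectorized shape (conceptually): a single load of <6 x i32> starting
// at &a[0] (masked if the full 6-element span cannot be proven dereferenceable),
// followed by a shufflevector with mask <0, 2, 3, 5> that compresses the used
// lanes into a <4 x i32> value feeding the vectorized adds, instead of a masked
// gather or full scalarization.
```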
✅ With the latest revision this PR passed the C/C++ code formatter.
Supposed to fix 130872, but currently only fixes it for AVX512. Requires better pointer analysis to avoid masked loads.
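For context, a hedged sketch of why the masked form is needed at all and why better pointer analysis would help (the example below is assumed for illustration, not taken from the linked issue):

```c++
// Illustrative example (assumed, not from the linked issue). The scalars load
// a[0], a[1], a[3] and a[6], so the widened vector load would read the whole
// 7-element span a[0..6]. If isSafeToLoadUnconditionally cannot prove that the
// entire span is dereferenceable at the single insertion point (the in-code
// TODO notes that analyzing each scalar load separately would make this less
// conservative), the wide load has to be masked so untouched lanes are never
// read. Masked loads are cheap on AVX512 but expensive or scalarized on many
// other targets, which is why the fix currently only helps AVX512.
int sum_gappy(const int *a) {
  return a[0] + a[1] + a[3] + a[6];
}
```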
@@ -5214,6 +5221,145 @@ static Value *createExtractVector(IRBuilderBase &Builder, Value *Vec,
  return Builder.CreateShuffleVector(Vec, Mask);
}

/// Builds compress-like mask for shuffles for the given \p PointerOps, ordered
/// with \p Order.
static void buildCompressMask(ArrayRef<Value *> PointerOps,
Worth returning true (or std::optional) here if the compression stride is uniform?
Not sure if this is possible at all. Can you share a scenario?
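For reference, the uniform-stride property under discussion is the same one isMaskedLoadCompress re-derives from the mask when it probes for an interleaved load. A standalone sketch of that check, with an illustrative helper name (only a sketch of what buildCompressMask could additionally report, not part of the patch):

```c++
#include <cassert>
#include "llvm/ADT/ArrayRef.h"
using namespace llvm;

// Returns the uniform stride encoded in a compress mask built by
// buildCompressMask (element I sitting at position I * Stride), or 0 if the
// stride is not uniform. Mirrors the Interval check in isMaskedLoadCompress.
static unsigned getUniformStride(ArrayRef<int> CompressMask) {
  assert(CompressMask.size() >= 2 && "At least two elements are required");
  int Stride = CompressMask[1] - CompressMask[0];
  if (Stride <= 0)
    return 0;
  for (unsigned I = 0, E = CompressMask.size(); I != E; ++I)
    if (CompressMask[I] != int(I) * Stride)
      return 0;
  return static_cast<unsigned>(Stride);
}
```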
Ping!
Better comments describing what's going on?
Added initial support for (masked) loads + compress and (masked) interleaved loads.
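As a worked example of the new node's shape (the offsets are chosen here for illustration rather than taken from the added tests): for scalar loads whose pointers sit at element offsets 0, 3, 4 and 6 from the first pointer, buildCompressMask produces the mask [0, 3, 4, 6]; the emitted code is then a single (masked) load of a 7-element vector starting at the first pointer, followed by a compressing shuffle that moves lanes 0, 3, 4 and 6 into a contiguous 4-element result. When the offsets form a uniform stride, for example 0, 2, 4, 6, the same analysis instead considers an interleaved load with factor 2 and keeps whichever alternative the cost model finds cheaper than gathering.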