Skip to content

Commit cac6094

Browse files
committed
[SLP]Improve shuffles cost estimation where possible.
Improved/fixed cost modeling for shuffles by providing masks; improved the cost model for non-identity insertelements. Differential Revision: https://reviews.llvm.org/D115462
1 parent fa7b4cf commit cac6094

File tree

11 files changed

+283
-303
lines changed

11 files changed

+283
-303
lines changed

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

Lines changed: 69 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -5557,17 +5557,17 @@ computeExtractCost(ArrayRef<Value *> VL, FixedVectorType *VecTy,
55575557
for (auto *V : VL) {
55585558
++Idx;
55595559

5560-
// Need to exclude undefs from analysis.
5561-
if (isa<UndefValue>(V) || Mask[Idx] == UndefMaskElem)
5562-
continue;
5563-
55645560
// Reached the start of a new vector registers.
55655561
if (Idx % EltsPerVector == 0) {
55665562
RegMask.assign(EltsPerVector, UndefMaskElem);
55675563
AllConsecutive = true;
55685564
continue;
55695565
}
55705566

5567+
// Need to exclude undefs from analysis.
5568+
if (isa<UndefValue>(V) || Mask[Idx] == UndefMaskElem)
5569+
continue;
5570+
55715571
// Check all extracts for a vector register on the target directly
55725572
// extract values in order.
55735573
unsigned CurrentIdx = *getExtractIndex(cast<Instruction>(V));
@@ -6012,61 +6012,92 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
60126012
assert(E->ReuseShuffleIndices.empty() &&
60136013
"Unique insertelements only are expected.");
60146014
auto *SrcVecTy = cast<FixedVectorType>(VL0->getType());
6015-
60166015
unsigned const NumElts = SrcVecTy->getNumElements();
60176016
unsigned const NumScalars = VL.size();
6017+
6018+
unsigned NumOfParts = TTI->getNumberOfParts(SrcVecTy);
6019+
6020+
unsigned OffsetBeg = *getInsertIndex(VL.front());
6021+
unsigned OffsetEnd = OffsetBeg;
6022+
for (Value *V : VL.drop_front()) {
6023+
unsigned Idx = *getInsertIndex(V);
6024+
if (OffsetBeg > Idx)
6025+
OffsetBeg = Idx;
6026+
else if (OffsetEnd < Idx)
6027+
OffsetEnd = Idx;
6028+
}
6029+
unsigned VecScalarsSz = PowerOf2Ceil(NumElts);
6030+
if (NumOfParts > 0)
6031+
VecScalarsSz = PowerOf2Ceil((NumElts + NumOfParts - 1) / NumOfParts);
6032+
unsigned VecSz =
6033+
(1 + OffsetEnd / VecScalarsSz - OffsetBeg / VecScalarsSz) *
6034+
VecScalarsSz;
6035+
unsigned Offset = VecScalarsSz * (OffsetBeg / VecScalarsSz);
6036+
unsigned InsertVecSz = std::min<unsigned>(
6037+
PowerOf2Ceil(OffsetEnd - OffsetBeg + 1),
6038+
((OffsetEnd - OffsetBeg + VecScalarsSz) / VecScalarsSz) *
6039+
VecScalarsSz);
6040+
60186041
APInt DemandedElts = APInt::getZero(NumElts);
60196042
// TODO: Add support for Instruction::InsertValue.
60206043
SmallVector<int> Mask;
60216044
if (!E->ReorderIndices.empty()) {
60226045
inversePermutation(E->ReorderIndices, Mask);
6023-
Mask.append(NumElts - NumScalars, UndefMaskElem);
6046+
Mask.append(InsertVecSz - Mask.size(), UndefMaskElem);
60246047
} else {
6025-
Mask.assign(NumElts, UndefMaskElem);
6026-
std::iota(Mask.begin(), std::next(Mask.begin(), NumScalars), 0);
6048+
Mask.assign(VecSz, UndefMaskElem);
6049+
std::iota(Mask.begin(), std::next(Mask.begin(), InsertVecSz), 0);
60276050
}
6028-
unsigned Offset = *getInsertIndex(VL0);
60296051
bool IsIdentity = true;
6030-
SmallVector<int> PrevMask(NumElts, UndefMaskElem);
6052+
SmallVector<int> PrevMask(InsertVecSz, UndefMaskElem);
60316053
Mask.swap(PrevMask);
60326054
for (unsigned I = 0; I < NumScalars; ++I) {
60336055
unsigned InsertIdx = *getInsertIndex(VL[PrevMask[I]]);
60346056
DemandedElts.setBit(InsertIdx);
6035-
IsIdentity &= InsertIdx - Offset == I;
6036-
Mask[InsertIdx - Offset] = I;
6057+
IsIdentity &= InsertIdx - OffsetBeg == I;
6058+
Mask[InsertIdx - OffsetBeg] = I;
60376059
}
60386060
assert(Offset < NumElts && "Failed to find vector index offset");
60396061

60406062
InstructionCost Cost = 0;
60416063
Cost -= TTI->getScalarizationOverhead(SrcVecTy, DemandedElts,
60426064
/*Insert*/ true, /*Extract*/ false);
60436065

6044-
if (IsIdentity && NumElts != NumScalars && Offset % NumScalars != 0) {
6045-
// FIXME: Replace with SK_InsertSubvector once it is properly supported.
6046-
unsigned Sz = PowerOf2Ceil(Offset + NumScalars);
6047-
Cost += TTI->getShuffleCost(
6048-
TargetTransformInfo::SK_PermuteSingleSrc,
6049-
FixedVectorType::get(SrcVecTy->getElementType(), Sz));
6050-
} else if (!IsIdentity) {
6051-
auto *FirstInsert =
6052-
cast<Instruction>(*find_if(E->Scalars, [E](Value *V) {
6053-
return !is_contained(E->Scalars,
6054-
cast<Instruction>(V)->getOperand(0));
6055-
}));
6056-
if (isUndefVector(FirstInsert->getOperand(0))) {
6057-
Cost += TTI->getShuffleCost(TTI::SK_PermuteSingleSrc, SrcVecTy, Mask);
6066+
// First cost - resize to actual vector size if not identity shuffle or
6067+
// need to shift the vector.
6068+
// Do not calculate the cost if the actual size is the register size and
6069+
// we can merge this shuffle with the following SK_Select.
6070+
auto *InsertVecTy =
6071+
FixedVectorType::get(SrcVecTy->getElementType(), InsertVecSz);
6072+
if (!IsIdentity)
6073+
Cost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
6074+
InsertVecTy, Mask);
6075+
auto *FirstInsert = cast<Instruction>(*find_if(E->Scalars, [E](Value *V) {
6076+
return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
6077+
}));
6078+
// Second cost - permutation with subvector, if some elements are from the
6079+
// initial vector or inserting a subvector.
6080+
// TODO: Implement the analysis of the FirstInsert->getOperand(0)
6081+
// subvector of ActualVecTy.
6082+
if (!isUndefVector(FirstInsert->getOperand(0)) && NumScalars != NumElts &&
6083+
(Offset != OffsetBeg || (OffsetEnd + 1) % VecScalarsSz != 0)) {
6084+
if (InsertVecSz != VecSz) {
6085+
auto *ActualVecTy =
6086+
FixedVectorType::get(SrcVecTy->getElementType(), VecSz);
6087+
Cost += TTI->getShuffleCost(TTI::SK_InsertSubvector, ActualVecTy,
6088+
None, OffsetBeg - Offset, InsertVecTy);
60586089
} else {
6059-
SmallVector<int> InsertMask(NumElts);
6060-
std::iota(InsertMask.begin(), InsertMask.end(), 0);
6061-
for (unsigned I = 0; I < NumElts; I++) {
6090+
for (unsigned I = 0, End = OffsetBeg - Offset; I < End; ++I)
6091+
Mask[I] = I;
6092+
for (unsigned I = OffsetBeg - Offset, End = OffsetEnd - Offset;
6093+
I <= End; ++I)
60626094
if (Mask[I] != UndefMaskElem)
6063-
InsertMask[Offset + I] = NumElts + I;
6064-
}
6065-
Cost +=
6066-
TTI->getShuffleCost(TTI::SK_PermuteTwoSrc, SrcVecTy, InsertMask);
6095+
Mask[I] = I + VecSz;
6096+
for (unsigned I = OffsetEnd + 1 - Offset; I < VecSz; ++I)
6097+
Mask[I] = I;
6098+
Cost += TTI->getShuffleCost(TTI::SK_PermuteTwoSrc, InsertVecTy, Mask);
60676099
}
60686100
}
6069-
60706101
return Cost;
60716102
}
60726103
case Instruction::ZExt:
@@ -6519,7 +6550,10 @@ bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const {
65196550
// No need to vectorize inserts of gathered values.
65206551
if (VectorizableTree.size() == 2 &&
65216552
isa<InsertElementInst>(VectorizableTree[0]->Scalars[0]) &&
6522-
VectorizableTree[1]->State == TreeEntry::NeedToGather)
6553+
VectorizableTree[1]->State == TreeEntry::NeedToGather &&
6554+
(VectorizableTree[1]->getVectorFactor() <= 2 ||
6555+
!(isSplat(VectorizableTree[1]->Scalars) ||
6556+
allConstant(VectorizableTree[1]->Scalars))))
65236557
return true;
65246558

65256559
// We can vectorize the tree if its size is greater than or equal to the

0 commit comments

Comments (0)