Skip to content

Commit d3441f7

Browse files
[LV] Change getSmallBestKnownTC to return an ElementCount (NFC) (#141793)
This is prep work for enabling better UF calculations when using vscale-based VFs to vectorise loops with vscale-based trip counts. NOTE: NFC because all uses remain fixed-length until a following PR changes LoopVectorize's version of getSmallConstantTripCount().
1 parent acde20b commit d3441f7

File tree

1 file changed

+30
-20
lines changed

1 file changed

+30
-20
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 30 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -419,6 +419,13 @@ static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
419419
return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
420420
}
421421

422+
/// A version of ScalarEvolution::getSmallConstantTripCount that returns an
423+
/// ElementCount to include loops whose trip count is a function of vscale.
424+
static ElementCount getSmallConstantTripCount(ScalarEvolution *SE,
425+
const Loop *L) {
426+
return ElementCount::getFixed(SE->getSmallConstantTripCount(L));
427+
}
428+
422429
/// Returns "best known" trip count, which is either a valid positive trip count
423430
/// or std::nullopt when an estimate cannot be made (including when the trip
424431
/// count would overflow), for the specified loop \p L as defined by the
@@ -427,24 +434,24 @@ static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
427434
/// 2) Returns expected trip count according to profile data if any.
428435
/// 3) Returns upper bound estimate if known, and if \p CanUseConstantMax.
429436
/// 4) Returns std::nullopt if all of the above failed.
430-
static std::optional<unsigned>
437+
static std::optional<ElementCount>
431438
getSmallBestKnownTC(PredicatedScalarEvolution &PSE, Loop *L,
432439
bool CanUseConstantMax = true) {
433440
// Check if exact trip count is known.
434-
if (unsigned ExpectedTC = PSE.getSE()->getSmallConstantTripCount(L))
441+
if (auto ExpectedTC = getSmallConstantTripCount(PSE.getSE(), L))
435442
return ExpectedTC;
436443

437444
// Check if there is an expected trip count available from profile data.
438445
if (LoopVectorizeWithBlockFrequency)
439446
if (auto EstimatedTC = getLoopEstimatedTripCount(L))
440-
return *EstimatedTC;
447+
return ElementCount::getFixed(*EstimatedTC);
441448

442449
if (!CanUseConstantMax)
443450
return std::nullopt;
444451

445452
// Check if upper bound estimate is known.
446453
if (unsigned ExpectedTC = PSE.getSmallConstantMaxTripCount())
447-
return ExpectedTC;
454+
return ElementCount::getFixed(ExpectedTC);
448455

449456
return std::nullopt;
450457
}
@@ -1960,7 +1967,8 @@ class GeneratedRTChecks {
19601967
// Get the best known TC estimate.
19611968
if (auto EstimatedTC = getSmallBestKnownTC(
19621969
PSE, OuterLoop, /* CanUseConstantMax = */ false))
1963-
BestTripCount = *EstimatedTC;
1970+
if (EstimatedTC->isFixed())
1971+
BestTripCount = EstimatedTC->getFixedValue();
19641972

19651973
InstructionCost NewMemCheckCost = MemCheckCost / BestTripCount;
19661974

@@ -3750,12 +3758,12 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
37503758
}
37513759

37523760
ScalarEvolution *SE = PSE.getSE();
3753-
unsigned TC = SE->getSmallConstantTripCount(TheLoop);
3761+
ElementCount TC = getSmallConstantTripCount(SE, TheLoop);
37543762
unsigned MaxTC = PSE.getSmallConstantMaxTripCount();
37553763
LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
3756-
if (TC != MaxTC)
3764+
if (TC != ElementCount::getFixed(MaxTC))
37573765
LLVM_DEBUG(dbgs() << "LV: Found maximum trip count: " << MaxTC << '\n');
3758-
if (TC == 1) {
3766+
if (TC.isScalar()) {
37593767
reportVectorizationFailure("Single iteration (non) loop",
37603768
"loop trip count is one, irrelevant for vectorization",
37613769
"SingleIterationLoop", ORE, TheLoop);
@@ -3869,7 +3877,9 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
38693877
}
38703878

38713879
auto ExpectedTC = getSmallBestKnownTC(PSE, TheLoop);
3872-
if (ExpectedTC && ExpectedTC <= TTI.getMinTripCountTailFoldingThreshold()) {
3880+
if (ExpectedTC && ExpectedTC->isFixed() &&
3881+
ExpectedTC->getFixedValue() <=
3882+
TTI.getMinTripCountTailFoldingThreshold()) {
38733883
if (MaxPowerOf2RuntimeVF > 0u) {
38743884
// If we have a low-trip-count, and the fixed-width VF is known to divide
38753885
// the trip count but the scalable factor does not, use the fixed-width
@@ -3927,7 +3937,7 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
39273937
return FixedScalableVFPair::getNone();
39283938
}
39293939

3930-
if (TC == 0) {
3940+
if (TC.isZero()) {
39313941
reportVectorizationFailure(
39323942
"unable to calculate the loop count due to complex control flow",
39333943
"UnknownLoopCountComplexCFG", ORE, TheLoop);
@@ -4816,13 +4826,13 @@ LoopVectorizationCostModel::selectInterleaveCount(VPlan &Plan, ElementCount VF,
48164826
// At least one iteration must be scalar when this constraint holds. So the
48174827
// maximum available iterations for interleaving is one less.
48184828
unsigned AvailableTC = requiresScalarEpilogue(VF.isVector())
4819-
? (*BestKnownTC) - 1
4820-
: *BestKnownTC;
4829+
? BestKnownTC->getFixedValue() - 1
4830+
: BestKnownTC->getFixedValue();
48214831

48224832
unsigned InterleaveCountLB = bit_floor(std::max(
48234833
1u, std::min(AvailableTC / (EstimatedVF * 2), MaxInterleaveCount)));
48244834

4825-
if (PSE.getSE()->getSmallConstantTripCount(TheLoop) > 0) {
4835+
if (getSmallConstantTripCount(PSE.getSE(), TheLoop).isNonZero()) {
48264836
// If the best known trip count is exact, we select between two
48274837
// prospective ICs, where
48284838
//
@@ -5182,8 +5192,8 @@ InstructionCost LoopVectorizationCostModel::expectedCost(ElementCount VF) {
51825192
// costs of comparison and induction instructions, as they'll get simplified
51835193
// away.
51845194
SmallPtrSet<Instruction *, 2> ValuesToIgnoreForVF;
5185-
auto TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
5186-
if (VF.isFixed() && TC == VF.getFixedValue() && !foldTailByMasking())
5195+
auto TC = getSmallConstantTripCount(PSE.getSE(), TheLoop);
5196+
if (TC == VF && !foldTailByMasking())
51875197
addFullyUnrolledInstructionsToIgnore(TheLoop, Legal->getInductionVars(),
51885198
ValuesToIgnoreForVF);
51895199

@@ -6878,8 +6888,8 @@ LoopVectorizationPlanner::precomputeCosts(VPlan &Plan, ElementCount VF,
68786888
// simplified away.
68796889
// TODO: Remove this code after stepping away from the legacy cost model and
68806890
// adding code to simplify VPlans before calculating their costs.
6881-
auto TC = PSE.getSE()->getSmallConstantTripCount(OrigLoop);
6882-
if (VF.isFixed() && TC == VF.getFixedValue() && !CM.foldTailByMasking())
6891+
auto TC = getSmallConstantTripCount(PSE.getSE(), OrigLoop);
6892+
if (TC == VF && !CM.foldTailByMasking())
68836893
addFullyUnrolledInstructionsToIgnore(OrigLoop, Legal->getInductionVars(),
68846894
CostCtx.SkipCostComputation);
68856895

@@ -9647,8 +9657,7 @@ static bool isOutsideLoopWorkProfitable(GeneratedRTChecks &Checks,
96479657
// Skip vectorization if the expected trip count is less than the minimum
96489658
// required trip count.
96499659
if (auto ExpectedTC = getSmallBestKnownTC(PSE, L)) {
9650-
if (ElementCount::isKnownLT(ElementCount::getFixed(*ExpectedTC),
9651-
VF.MinProfitableTripCount)) {
9660+
if (ElementCount::isKnownLT(*ExpectedTC, VF.MinProfitableTripCount)) {
96529661
LLVM_DEBUG(dbgs() << "LV: Vectorization is not beneficial: expected "
96539662
"trip count < minimum profitable VF ("
96549663
<< *ExpectedTC << " < " << VF.MinProfitableTripCount
@@ -10018,7 +10027,8 @@ bool LoopVectorizePass::processLoop(Loop *L) {
1001810027
// Check the loop for a trip count threshold: vectorize loops with a tiny trip
1001910028
// count by optimizing for size, to minimize overheads.
1002010029
auto ExpectedTC = getSmallBestKnownTC(PSE, L);
10021-
if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) {
10030+
if (ExpectedTC && ExpectedTC->isFixed() &&
10031+
ExpectedTC->getFixedValue() < TinyTripCountVectorThreshold) {
1002210032
LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
1002310033
<< "This loop is worth vectorizing only if no scalar "
1002410034
<< "iteration overheads are incurred.");

0 commit comments

Comments
 (0)