Skip to content

Commit 8af01ca

Browse files
[NFC][LLVM][LoopVectorize] Change getSmallBestKnownTC to return an ElementCount.
This is prep work for enabling better UF calculations when using vscale-based VFs to vectorise loops with vscale-based trip counts. NOTE: This change is NFC because all uses remain fixed-length until a following PR changes LoopVectorize's version of getSmallConstantTripCount().
1 parent 465e3ce commit 8af01ca

File tree

1 file changed

+29
-20
lines changed

1 file changed

+29
-20
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 29 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -419,6 +419,12 @@ static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
419419
return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
420420
}
421421

422+
/// A version of ScalarEvolution::getSmallConstantTripCount that returns an
423+
/// ElementCount to include loops whose trip count is a function of vscale.
424+
ElementCount getSmallConstantTripCount(ScalarEvolution *SE, const Loop *L) {
425+
return ElementCount::getFixed(SE->getSmallConstantTripCount(L));
426+
}
427+
422428
/// Returns "best known" trip count, which is either a valid positive trip count
423429
/// or std::nullopt when an estimate cannot be made (including when the trip
424430
/// count would overflow), for the specified loop \p L as defined by the
@@ -427,24 +433,24 @@ static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
427433
/// 2) Returns expected trip count according to profile data if any.
428434
/// 3) Returns upper bound estimate if known, and if \p CanUseConstantMax.
429435
/// 4) Returns std::nullopt if all of the above failed.
430-
static std::optional<unsigned>
436+
static std::optional<ElementCount>
431437
getSmallBestKnownTC(PredicatedScalarEvolution &PSE, Loop *L,
432438
bool CanUseConstantMax = true) {
433439
// Check if exact trip count is known.
434-
if (unsigned ExpectedTC = PSE.getSE()->getSmallConstantTripCount(L))
440+
if (auto ExpectedTC = getSmallConstantTripCount(PSE.getSE(), L))
435441
return ExpectedTC;
436442

437443
// Check if there is an expected trip count available from profile data.
438444
if (LoopVectorizeWithBlockFrequency)
439445
if (auto EstimatedTC = getLoopEstimatedTripCount(L))
440-
return *EstimatedTC;
446+
return ElementCount::getFixed(*EstimatedTC);
441447

442448
if (!CanUseConstantMax)
443449
return std::nullopt;
444450

445451
// Check if upper bound estimate is known.
446452
if (unsigned ExpectedTC = PSE.getSmallConstantMaxTripCount())
447-
return ExpectedTC;
453+
return ElementCount::getFixed(ExpectedTC);
448454

449455
return std::nullopt;
450456
}
@@ -1960,7 +1966,8 @@ class GeneratedRTChecks {
19601966
// Get the best known TC estimate.
19611967
if (auto EstimatedTC = getSmallBestKnownTC(
19621968
PSE, OuterLoop, /* CanUseConstantMax = */ false))
1963-
BestTripCount = *EstimatedTC;
1969+
if (EstimatedTC->isFixed())
1970+
BestTripCount = EstimatedTC->getFixedValue();
19641971

19651972
InstructionCost NewMemCheckCost = MemCheckCost / BestTripCount;
19661973

@@ -3751,12 +3758,12 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
37513758
}
37523759

37533760
ScalarEvolution *SE = PSE.getSE();
3754-
unsigned TC = SE->getSmallConstantTripCount(TheLoop);
3761+
ElementCount TC = getSmallConstantTripCount(SE, TheLoop);
37553762
unsigned MaxTC = PSE.getSmallConstantMaxTripCount();
37563763
LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
3757-
if (TC != MaxTC)
3764+
if (TC != ElementCount::getFixed(MaxTC))
37583765
LLVM_DEBUG(dbgs() << "LV: Found maximum trip count: " << MaxTC << '\n');
3759-
if (TC == 1) {
3766+
if (TC.isScalar()) {
37603767
reportVectorizationFailure("Single iteration (non) loop",
37613768
"loop trip count is one, irrelevant for vectorization",
37623769
"SingleIterationLoop", ORE, TheLoop);
@@ -3870,7 +3877,9 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
38703877
}
38713878

38723879
auto ExpectedTC = getSmallBestKnownTC(PSE, TheLoop);
3873-
if (ExpectedTC && ExpectedTC <= TTI.getMinTripCountTailFoldingThreshold()) {
3880+
if (ExpectedTC && ExpectedTC->isFixed() &&
3881+
ExpectedTC->getFixedValue() <=
3882+
TTI.getMinTripCountTailFoldingThreshold()) {
38743883
if (MaxPowerOf2RuntimeVF > 0u) {
38753884
// If we have a low-trip-count, and the fixed-width VF is known to divide
38763885
// the trip count but the scalable factor does not, use the fixed-width
@@ -3928,7 +3937,7 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
39283937
return FixedScalableVFPair::getNone();
39293938
}
39303939

3931-
if (TC == 0) {
3940+
if (TC.isZero()) {
39323941
reportVectorizationFailure(
39333942
"unable to calculate the loop count due to complex control flow",
39343943
"UnknownLoopCountComplexCFG", ORE, TheLoop);
@@ -4817,13 +4826,13 @@ LoopVectorizationCostModel::selectInterleaveCount(VPlan &Plan, ElementCount VF,
48174826
// At least one iteration must be scalar when this constraint holds. So the
48184827
// maximum available iterations for interleaving is one less.
48194828
unsigned AvailableTC = requiresScalarEpilogue(VF.isVector())
4820-
? (*BestKnownTC) - 1
4821-
: *BestKnownTC;
4829+
? BestKnownTC->getFixedValue() - 1
4830+
: BestKnownTC->getFixedValue();
48224831

48234832
unsigned InterleaveCountLB = bit_floor(std::max(
48244833
1u, std::min(AvailableTC / (EstimatedVF * 2), MaxInterleaveCount)));
48254834

4826-
if (PSE.getSE()->getSmallConstantTripCount(TheLoop) > 0) {
4835+
if (getSmallConstantTripCount(PSE.getSE(), TheLoop).isNonZero()) {
48274836
// If the best known trip count is exact, we select between two
48284837
// prospective ICs, where
48294838
//
@@ -5183,8 +5192,8 @@ InstructionCost LoopVectorizationCostModel::expectedCost(ElementCount VF) {
51835192
// costs of comparison and induction instructions, as they'll get simplified
51845193
// away.
51855194
SmallPtrSet<Instruction *, 2> ValuesToIgnoreForVF;
5186-
auto TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
5187-
if (VF.isFixed() && TC == VF.getFixedValue() && !foldTailByMasking())
5195+
auto TC = getSmallConstantTripCount(PSE.getSE(), TheLoop);
5196+
if (TC == VF && !foldTailByMasking())
51885197
addFullyUnrolledInstructionsToIgnore(TheLoop, Legal->getInductionVars(),
51895198
ValuesToIgnoreForVF);
51905199

@@ -6884,8 +6893,8 @@ LoopVectorizationPlanner::precomputeCosts(VPlan &Plan, ElementCount VF,
68846893
// simplified away.
68856894
// TODO: Remove this code after stepping away from the legacy cost model and
68866895
// adding code to simplify VPlans before calculating their costs.
6887-
auto TC = PSE.getSE()->getSmallConstantTripCount(OrigLoop);
6888-
if (VF.isFixed() && TC == VF.getFixedValue() && !CM.foldTailByMasking())
6896+
auto TC = getSmallConstantTripCount(PSE.getSE(), OrigLoop);
6897+
if (TC == VF && !CM.foldTailByMasking())
68896898
addFullyUnrolledInstructionsToIgnore(OrigLoop, Legal->getInductionVars(),
68906899
CostCtx.SkipCostComputation);
68916900

@@ -9641,8 +9650,7 @@ static bool isOutsideLoopWorkProfitable(GeneratedRTChecks &Checks,
96419650
// Skip vectorization if the expected trip count is less than the minimum
96429651
// required trip count.
96439652
if (auto ExpectedTC = getSmallBestKnownTC(PSE, L)) {
9644-
if (ElementCount::isKnownLT(ElementCount::getFixed(*ExpectedTC),
9645-
VF.MinProfitableTripCount)) {
9653+
if (ElementCount::isKnownLT(*ExpectedTC, VF.MinProfitableTripCount)) {
96469654
LLVM_DEBUG(dbgs() << "LV: Vectorization is not beneficial: expected "
96479655
"trip count < minimum profitable VF ("
96489656
<< *ExpectedTC << " < " << VF.MinProfitableTripCount
@@ -10012,7 +10020,8 @@ bool LoopVectorizePass::processLoop(Loop *L) {
1001210020
// Check the loop for a trip count threshold: vectorize loops with a tiny trip
1001310021
// count by optimizing for size, to minimize overheads.
1001410022
auto ExpectedTC = getSmallBestKnownTC(PSE, L);
10015-
if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) {
10023+
if (ExpectedTC && ExpectedTC->isFixed() &&
10024+
ExpectedTC->getFixedValue() < TinyTripCountVectorThreshold) {
1001610025
LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
1001710026
<< "This loop is worth vectorizing only if no scalar "
1001810027
<< "iteration overheads are incurred.");

0 commit comments

Comments
 (0)