@@ -419,6 +419,13 @@ static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
419
419
return DL.getTypeAllocSizeInBits (Ty) != DL.getTypeSizeInBits (Ty);
420
420
}
421
421
422
+ /// A version of ScalarEvolution::getSmallConstantTripCount that returns an
423
+ /// ElementCount to include loops whose trip count is a function of vscale.
424
+ static ElementCount getSmallConstantTripCount (ScalarEvolution *SE,
425
+ const Loop *L) {
426
+ return ElementCount::getFixed (SE->getSmallConstantTripCount (L)); // Wraps the ScalarEvolution result as a fixed count; no scalable count is produced here.
427
+ }
428
+
422
429
/// Returns "best known" trip count, which is either a valid positive trip count
423
430
/// or std::nullopt when an estimate cannot be made (including when the trip
424
431
/// count would overflow), for the specified loop \p L as defined by the
@@ -427,24 +434,24 @@ static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
427
434
/// 2) Returns expected trip count according to profile data if any.
428
435
/// 3) Returns upper bound estimate if known, and if \p CanUseConstantMax.
429
436
/// 4) Returns std::nullopt if all of the above failed.
430
- static std::optional<unsigned >
437
+ static std::optional<ElementCount >
431
438
getSmallBestKnownTC (PredicatedScalarEvolution &PSE, Loop *L,
432
439
bool CanUseConstantMax = true ) {
433
440
// Check if exact trip count is known; a zero result falls through to the
// next source of information.
434
- if (unsigned ExpectedTC = PSE.getSE ()-> getSmallConstantTripCount ( L))
441
+ if (auto ExpectedTC = getSmallConstantTripCount ( PSE.getSE (), L)) // NOTE(review): relies on ElementCount being testable in a condition (presumably non-zero == true) - confirm.
435
442
return ExpectedTC;
436
443
437
444
// Check if there is an expected trip count available from profile data.
438
445
if (LoopVectorizeWithBlockFrequency)
439
446
if (auto EstimatedTC = getLoopEstimatedTripCount (L))
440
- return *EstimatedTC;
447
+ return ElementCount::getFixed ( *EstimatedTC) ; // Profile-based estimate, wrapped as a fixed count.
441
448
442
449
if (!CanUseConstantMax)
443
450
return std::nullopt;
444
451
445
452
// Check if upper bound estimate is known; zero is treated as "not known".
446
453
if (unsigned ExpectedTC = PSE.getSmallConstantMaxTripCount ())
447
- return ExpectedTC;
454
+ return ElementCount::getFixed ( ExpectedTC) ; // Constant maximum trip count, wrapped as a fixed count.
448
455
449
456
return std::nullopt;
450
457
}
@@ -1960,7 +1967,8 @@ class GeneratedRTChecks {
1960
1967
// Get the best known TC estimate.
1961
1968
if (auto EstimatedTC = getSmallBestKnownTC (
1962
1969
PSE, OuterLoop, /* CanUseConstantMax = */ false ))
1963
- BestTripCount = *EstimatedTC;
1970
+ if (EstimatedTC->isFixed ())
1971
+ BestTripCount = EstimatedTC->getFixedValue ();
1964
1972
1965
1973
InstructionCost NewMemCheckCost = MemCheckCost / BestTripCount;
1966
1974
@@ -3750,12 +3758,12 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
3750
3758
}
3751
3759
3752
3760
ScalarEvolution *SE = PSE.getSE ();
3753
- unsigned TC = SE-> getSmallConstantTripCount (TheLoop);
3761
+ ElementCount TC = getSmallConstantTripCount (SE, TheLoop);
3754
3762
unsigned MaxTC = PSE.getSmallConstantMaxTripCount ();
3755
3763
LLVM_DEBUG (dbgs () << " LV: Found trip count: " << TC << ' \n ' );
3756
- if (TC != MaxTC)
3764
+ if (TC != ElementCount::getFixed ( MaxTC) )
3757
3765
LLVM_DEBUG (dbgs () << " LV: Found maximum trip count: " << MaxTC << ' \n ' );
3758
- if (TC == 1 ) {
3766
+ if (TC. isScalar () ) {
3759
3767
reportVectorizationFailure (" Single iteration (non) loop" ,
3760
3768
" loop trip count is one, irrelevant for vectorization" ,
3761
3769
" SingleIterationLoop" , ORE, TheLoop);
@@ -3869,7 +3877,9 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
3869
3877
}
3870
3878
3871
3879
auto ExpectedTC = getSmallBestKnownTC (PSE, TheLoop);
3872
- if (ExpectedTC && ExpectedTC <= TTI.getMinTripCountTailFoldingThreshold ()) {
3880
+ if (ExpectedTC && ExpectedTC->isFixed () &&
3881
+ ExpectedTC->getFixedValue () <=
3882
+ TTI.getMinTripCountTailFoldingThreshold ()) {
3873
3883
if (MaxPowerOf2RuntimeVF > 0u ) {
3874
3884
// If we have a low-trip-count, and the fixed-width VF is known to divide
3875
3885
// the trip count but the scalable factor does not, use the fixed-width
@@ -3927,7 +3937,7 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
3927
3937
return FixedScalableVFPair::getNone ();
3928
3938
}
3929
3939
3930
- if (TC == 0 ) {
3940
+ if (TC. isZero () ) {
3931
3941
reportVectorizationFailure (
3932
3942
" unable to calculate the loop count due to complex control flow" ,
3933
3943
" UnknownLoopCountComplexCFG" , ORE, TheLoop);
@@ -4816,13 +4826,13 @@ LoopVectorizationCostModel::selectInterleaveCount(VPlan &Plan, ElementCount VF,
4816
4826
// At least one iteration must be scalar when this constraint holds. So the
4817
4827
// maximum available iterations for interleaving is one less.
4818
4828
unsigned AvailableTC = requiresScalarEpilogue (VF.isVector ())
4819
- ? (* BestKnownTC) - 1
4820
- : * BestKnownTC;
4829
+ ? BestKnownTC-> getFixedValue ( ) - 1
4830
+ : BestKnownTC-> getFixedValue () ;
4821
4831
4822
4832
unsigned InterleaveCountLB = bit_floor (std::max (
4823
4833
1u , std::min (AvailableTC / (EstimatedVF * 2 ), MaxInterleaveCount)));
4824
4834
4825
- if (PSE.getSE ()-> getSmallConstantTripCount ( TheLoop) > 0 ) {
4835
+ if (getSmallConstantTripCount ( PSE.getSE (), TheLoop). isNonZero () ) {
4826
4836
// If the best known trip count is exact, we select between two
4827
4837
// prospective ICs, where
4828
4838
//
@@ -5182,8 +5192,8 @@ InstructionCost LoopVectorizationCostModel::expectedCost(ElementCount VF) {
5182
5192
// costs of comparison and induction instructions, as they'll get simplified
5183
5193
// away.
5184
5194
SmallPtrSet<Instruction *, 2 > ValuesToIgnoreForVF;
5185
- auto TC = PSE.getSE ()-> getSmallConstantTripCount ( TheLoop);
5186
- if (VF. isFixed () && TC == VF. getFixedValue () && !foldTailByMasking ())
5195
+ auto TC = getSmallConstantTripCount ( PSE.getSE (), TheLoop);
5196
+ if (TC == VF && !foldTailByMasking ())
5187
5197
addFullyUnrolledInstructionsToIgnore (TheLoop, Legal->getInductionVars (),
5188
5198
ValuesToIgnoreForVF);
5189
5199
@@ -6878,8 +6888,8 @@ LoopVectorizationPlanner::precomputeCosts(VPlan &Plan, ElementCount VF,
6878
6888
// simplified away.
6879
6889
// TODO: Remove this code after stepping away from the legacy cost model and
6880
6890
// adding code to simplify VPlans before calculating their costs.
6881
- auto TC = PSE.getSE ()-> getSmallConstantTripCount ( OrigLoop);
6882
- if (VF. isFixed () && TC == VF. getFixedValue () && !CM.foldTailByMasking ())
6891
+ auto TC = getSmallConstantTripCount ( PSE.getSE (), OrigLoop);
6892
+ if (TC == VF && !CM.foldTailByMasking ())
6883
6893
addFullyUnrolledInstructionsToIgnore (OrigLoop, Legal->getInductionVars (),
6884
6894
CostCtx.SkipCostComputation );
6885
6895
@@ -9647,8 +9657,7 @@ static bool isOutsideLoopWorkProfitable(GeneratedRTChecks &Checks,
9647
9657
// Skip vectorization if the expected trip count is less than the minimum
9648
9658
// required trip count.
9649
9659
if (auto ExpectedTC = getSmallBestKnownTC (PSE, L)) {
9650
- if (ElementCount::isKnownLT (ElementCount::getFixed (*ExpectedTC),
9651
- VF.MinProfitableTripCount )) {
9660
+ if (ElementCount::isKnownLT (*ExpectedTC, VF.MinProfitableTripCount )) {
9652
9661
LLVM_DEBUG (dbgs () << " LV: Vectorization is not beneficial: expected "
9653
9662
" trip count < minimum profitable VF ("
9654
9663
<< *ExpectedTC << " < " << VF.MinProfitableTripCount
@@ -10018,7 +10027,8 @@ bool LoopVectorizePass::processLoop(Loop *L) {
10018
10027
// Check the loop for a trip count threshold: vectorize loops with a tiny trip
10019
10028
// count by optimizing for size, to minimize overheads.
10020
10029
auto ExpectedTC = getSmallBestKnownTC (PSE, L);
10021
- if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) {
10030
+ if (ExpectedTC && ExpectedTC->isFixed () &&
10031
+ ExpectedTC->getFixedValue () < TinyTripCountVectorThreshold) {
10022
10032
LLVM_DEBUG (dbgs () << " LV: Found a loop with a very small trip count. "
10023
10033
<< " This loop is worth vectorizing only if no scalar "
10024
10034
<< " iteration overheads are incurred." );
0 commit comments