@@ -419,6 +419,12 @@ static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
419
419
return DL.getTypeAllocSizeInBits (Ty) != DL.getTypeSizeInBits (Ty);
420
420
}
421
421
422
+ // / A version of ScalarEvolution::getSmallConstantTripCount that returns an
423
+ // / ElementCount to include loops whose trip count is a function of vscale.
424
+ ElementCount getSmallConstantTripCount (ScalarEvolution *SE, const Loop *L) {
425
+ return ElementCount::getFixed (SE->getSmallConstantTripCount (L));
426
+ }
427
+
422
428
/// Returns "best known" trip count, which is either a valid positive trip count
423
429
/// or std::nullopt when an estimate cannot be made (including when the trip
424
430
/// count would overflow), for the specified loop \p L as defined by the
@@ -427,24 +433,24 @@ static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
427
433
/// 2) Returns expected trip count according to profile data if any.
428
434
/// 3) Returns upper bound estimate if known, and if \p CanUseConstantMax.
429
435
/// 4) Returns std::nullopt if all of the above failed.
430
- static std::optional<unsigned >
436
+ static std::optional<ElementCount >
431
437
getSmallBestKnownTC (PredicatedScalarEvolution &PSE, Loop *L,
432
438
bool CanUseConstantMax = true ) {
433
439
// Check if exact trip count is known.
434
- if (unsigned ExpectedTC = PSE.getSE ()-> getSmallConstantTripCount ( L))
440
+ if (auto ExpectedTC = getSmallConstantTripCount ( PSE.getSE (), L))
435
441
return ExpectedTC;
436
442
437
443
// Check if there is an expected trip count available from profile data.
438
444
if (LoopVectorizeWithBlockFrequency)
439
445
if (auto EstimatedTC = getLoopEstimatedTripCount (L))
440
- return *EstimatedTC;
446
+ return ElementCount::getFixed ( *EstimatedTC) ;
441
447
442
448
if (!CanUseConstantMax)
443
449
return std::nullopt;
444
450
445
451
// Check if upper bound estimate is known.
446
452
if (unsigned ExpectedTC = PSE.getSmallConstantMaxTripCount ())
447
- return ExpectedTC;
453
+ return ElementCount::getFixed ( ExpectedTC) ;
448
454
449
455
return std::nullopt;
450
456
}
@@ -1960,7 +1966,8 @@ class GeneratedRTChecks {
1960
1966
// Get the best known TC estimate.
1961
1967
if (auto EstimatedTC = getSmallBestKnownTC (
1962
1968
PSE, OuterLoop, /* CanUseConstantMax = */ false ))
1963
- BestTripCount = *EstimatedTC;
1969
+ if (EstimatedTC->isFixed ())
1970
+ BestTripCount = EstimatedTC->getFixedValue ();
1964
1971
1965
1972
InstructionCost NewMemCheckCost = MemCheckCost / BestTripCount;
1966
1973
@@ -3751,12 +3758,12 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
3751
3758
}
3752
3759
3753
3760
ScalarEvolution *SE = PSE.getSE ();
3754
- unsigned TC = SE-> getSmallConstantTripCount (TheLoop);
3761
+ ElementCount TC = getSmallConstantTripCount (SE, TheLoop);
3755
3762
unsigned MaxTC = PSE.getSmallConstantMaxTripCount ();
3756
3763
LLVM_DEBUG (dbgs () << " LV: Found trip count: " << TC << ' \n ' );
3757
- if (TC != MaxTC)
3764
+ if (TC != ElementCount::getFixed ( MaxTC) )
3758
3765
LLVM_DEBUG (dbgs () << " LV: Found maximum trip count: " << MaxTC << ' \n ' );
3759
- if (TC == 1 ) {
3766
+ if (TC. isScalar () ) {
3760
3767
reportVectorizationFailure (" Single iteration (non) loop" ,
3761
3768
" loop trip count is one, irrelevant for vectorization" ,
3762
3769
" SingleIterationLoop" , ORE, TheLoop);
@@ -3870,7 +3877,9 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
3870
3877
}
3871
3878
3872
3879
auto ExpectedTC = getSmallBestKnownTC (PSE, TheLoop);
3873
- if (ExpectedTC && ExpectedTC <= TTI.getMinTripCountTailFoldingThreshold ()) {
3880
+ if (ExpectedTC && ExpectedTC->isFixed () &&
3881
+ ExpectedTC->getFixedValue () <=
3882
+ TTI.getMinTripCountTailFoldingThreshold ()) {
3874
3883
if (MaxPowerOf2RuntimeVF > 0u ) {
3875
3884
// If we have a low-trip-count, and the fixed-width VF is known to divide
3876
3885
// the trip count but the scalable factor does not, use the fixed-width
@@ -3928,7 +3937,7 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
3928
3937
return FixedScalableVFPair::getNone ();
3929
3938
}
3930
3939
3931
- if (TC == 0 ) {
3940
+ if (TC. isZero () ) {
3932
3941
reportVectorizationFailure (
3933
3942
" unable to calculate the loop count due to complex control flow" ,
3934
3943
" UnknownLoopCountComplexCFG" , ORE, TheLoop);
@@ -4817,13 +4826,13 @@ LoopVectorizationCostModel::selectInterleaveCount(VPlan &Plan, ElementCount VF,
4817
4826
// At least one iteration must be scalar when this constraint holds. So the
4818
4827
// maximum available iterations for interleaving is one less.
4819
4828
unsigned AvailableTC = requiresScalarEpilogue (VF.isVector ())
4820
- ? (* BestKnownTC) - 1
4821
- : * BestKnownTC;
4829
+ ? BestKnownTC-> getFixedValue ( ) - 1
4830
+ : BestKnownTC-> getFixedValue () ;
4822
4831
4823
4832
unsigned InterleaveCountLB = bit_floor (std::max (
4824
4833
1u , std::min (AvailableTC / (EstimatedVF * 2 ), MaxInterleaveCount)));
4825
4834
4826
- if (PSE.getSE ()-> getSmallConstantTripCount ( TheLoop) > 0 ) {
4835
+ if (getSmallConstantTripCount ( PSE.getSE (), TheLoop). isNonZero () ) {
4827
4836
// If the best known trip count is exact, we select between two
4828
4837
// prospective ICs, where
4829
4838
//
@@ -5183,8 +5192,8 @@ InstructionCost LoopVectorizationCostModel::expectedCost(ElementCount VF) {
5183
5192
// costs of comparison and induction instructions, as they'll get simplified
5184
5193
// away.
5185
5194
SmallPtrSet<Instruction *, 2 > ValuesToIgnoreForVF;
5186
- auto TC = PSE.getSE ()-> getSmallConstantTripCount ( TheLoop);
5187
- if (VF. isFixed () && TC == VF. getFixedValue () && !foldTailByMasking ())
5195
+ auto TC = getSmallConstantTripCount ( PSE.getSE (), TheLoop);
5196
+ if (TC == VF && !foldTailByMasking ())
5188
5197
addFullyUnrolledInstructionsToIgnore (TheLoop, Legal->getInductionVars (),
5189
5198
ValuesToIgnoreForVF);
5190
5199
@@ -6884,8 +6893,8 @@ LoopVectorizationPlanner::precomputeCosts(VPlan &Plan, ElementCount VF,
6884
6893
// simplified away.
6885
6894
// TODO: Remove this code after stepping away from the legacy cost model and
6886
6895
// adding code to simplify VPlans before calculating their costs.
6887
- auto TC = PSE.getSE ()-> getSmallConstantTripCount ( OrigLoop);
6888
- if (VF. isFixed () && TC == VF. getFixedValue () && !CM.foldTailByMasking ())
6896
+ auto TC = getSmallConstantTripCount ( PSE.getSE (), OrigLoop);
6897
+ if (TC == VF && !CM.foldTailByMasking ())
6889
6898
addFullyUnrolledInstructionsToIgnore (OrigLoop, Legal->getInductionVars (),
6890
6899
CostCtx.SkipCostComputation );
6891
6900
@@ -9641,8 +9650,7 @@ static bool isOutsideLoopWorkProfitable(GeneratedRTChecks &Checks,
9641
9650
// Skip vectorization if the expected trip count is less than the minimum
9642
9651
// required trip count.
9643
9652
if (auto ExpectedTC = getSmallBestKnownTC (PSE, L)) {
9644
- if (ElementCount::isKnownLT (ElementCount::getFixed (*ExpectedTC),
9645
- VF.MinProfitableTripCount )) {
9653
+ if (ElementCount::isKnownLT (*ExpectedTC, VF.MinProfitableTripCount )) {
9646
9654
LLVM_DEBUG (dbgs () << " LV: Vectorization is not beneficial: expected "
9647
9655
" trip count < minimum profitable VF ("
9648
9656
<< *ExpectedTC << " < " << VF.MinProfitableTripCount
@@ -10012,7 +10020,8 @@ bool LoopVectorizePass::processLoop(Loop *L) {
10012
10020
// Check the loop for a trip count threshold: vectorize loops with a tiny trip
10013
10021
// count by optimizing for size, to minimize overheads.
10014
10022
auto ExpectedTC = getSmallBestKnownTC (PSE, L);
10015
- if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) {
10023
+ if (ExpectedTC && ExpectedTC->isFixed () &&
10024
+ ExpectedTC->getFixedValue () < TinyTripCountVectorThreshold) {
10016
10025
LLVM_DEBUG (dbgs () << " LV: Found a loop with a very small trip count. "
10017
10026
<< " This loop is worth vectorizing only if no scalar "
10018
10027
<< " iteration overheads are incurred." );
0 commit comments