@@ -5557,17 +5557,17 @@ computeExtractCost(ArrayRef<Value *> VL, FixedVectorType *VecTy,
   for (auto *V : VL) {
     ++Idx;
 
-    // Need to exclude undefs from analysis.
-    if (isa<UndefValue>(V) || Mask[Idx] == UndefMaskElem)
-      continue;
-
     // Reached the start of a new vector registers.
     if (Idx % EltsPerVector == 0) {
       RegMask.assign(EltsPerVector, UndefMaskElem);
       AllConsecutive = true;
       continue;
     }
 
+    // Need to exclude undefs from analysis.
+    if (isa<UndefValue>(V) || Mask[Idx] == UndefMaskElem)
+      continue;
+
     // Check all extracts for a vector register on the target directly
     // extract values in order.
     unsigned CurrentIdx = *getExtractIndex(cast<Instruction>(V));
@@ -6012,61 +6012,92 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
     assert(E->ReuseShuffleIndices.empty() &&
            "Unique insertelements only are expected.");
     auto *SrcVecTy = cast<FixedVectorType>(VL0->getType());
-
     unsigned const NumElts = SrcVecTy->getNumElements();
     unsigned const NumScalars = VL.size();
+
+    unsigned NumOfParts = TTI->getNumberOfParts(SrcVecTy);
+
+    unsigned OffsetBeg = *getInsertIndex(VL.front());
+    unsigned OffsetEnd = OffsetBeg;
+    for (Value *V : VL.drop_front()) {
+      unsigned Idx = *getInsertIndex(V);
+      if (OffsetBeg > Idx)
+        OffsetBeg = Idx;
+      else if (OffsetEnd < Idx)
+        OffsetEnd = Idx;
+    }
+    unsigned VecScalarsSz = PowerOf2Ceil(NumElts);
+    if (NumOfParts > 0)
+      VecScalarsSz = PowerOf2Ceil((NumElts + NumOfParts - 1) / NumOfParts);
+    unsigned VecSz =
+        (1 + OffsetEnd / VecScalarsSz - OffsetBeg / VecScalarsSz) *
+        VecScalarsSz;
+    unsigned Offset = VecScalarsSz * (OffsetBeg / VecScalarsSz);
+    unsigned InsertVecSz = std::min<unsigned>(
+        PowerOf2Ceil(OffsetEnd - OffsetBeg + 1),
+        ((OffsetEnd - OffsetBeg + VecScalarsSz) / VecScalarsSz) *
+            VecScalarsSz);
+
     APInt DemandedElts = APInt::getZero(NumElts);
     // TODO: Add support for Instruction::InsertValue.
     SmallVector<int> Mask;
     if (!E->ReorderIndices.empty()) {
       inversePermutation(E->ReorderIndices, Mask);
-      Mask.append(NumElts - NumScalars, UndefMaskElem);
+      Mask.append(InsertVecSz - Mask.size(), UndefMaskElem);
     } else {
-      Mask.assign(NumElts, UndefMaskElem);
-      std::iota(Mask.begin(), std::next(Mask.begin(), NumScalars), 0);
+      Mask.assign(VecSz, UndefMaskElem);
+      std::iota(Mask.begin(), std::next(Mask.begin(), InsertVecSz), 0);
     }
-    unsigned Offset = *getInsertIndex(VL0);
     bool IsIdentity = true;
-    SmallVector<int> PrevMask(NumElts, UndefMaskElem);
+    SmallVector<int> PrevMask(InsertVecSz, UndefMaskElem);
     Mask.swap(PrevMask);
     for (unsigned I = 0; I < NumScalars; ++I) {
       unsigned InsertIdx = *getInsertIndex(VL[PrevMask[I]]);
       DemandedElts.setBit(InsertIdx);
-      IsIdentity &= InsertIdx - Offset == I;
-      Mask[InsertIdx - Offset] = I;
+      IsIdentity &= InsertIdx - OffsetBeg == I;
+      Mask[InsertIdx - OffsetBeg] = I;
     }
     assert(Offset < NumElts && "Failed to find vector index offset");
 
     InstructionCost Cost = 0;
     Cost -= TTI->getScalarizationOverhead(SrcVecTy, DemandedElts,
                                           /*Insert*/ true, /*Extract*/ false);
 
-    if (IsIdentity && NumElts != NumScalars && Offset % NumScalars != 0) {
-      // FIXME: Replace with SK_InsertSubvector once it is properly supported.
-      unsigned Sz = PowerOf2Ceil(Offset + NumScalars);
-      Cost += TTI->getShuffleCost(
-          TargetTransformInfo::SK_PermuteSingleSrc,
-          FixedVectorType::get(SrcVecTy->getElementType(), Sz));
-    } else if (!IsIdentity) {
-      auto *FirstInsert =
-          cast<Instruction>(*find_if(E->Scalars, [E](Value *V) {
-            return !is_contained(E->Scalars,
-                                 cast<Instruction>(V)->getOperand(0));
-          }));
-      if (isUndefVector(FirstInsert->getOperand(0))) {
-        Cost += TTI->getShuffleCost(TTI::SK_PermuteSingleSrc, SrcVecTy, Mask);
+    // First cost - resize to actual vector size if not identity shuffle or
+    // need to shift the vector.
+    // Do not calculate the cost if the actual size is the register size and
+    // we can merge this shuffle with the following SK_Select.
+    auto *InsertVecTy =
+        FixedVectorType::get(SrcVecTy->getElementType(), InsertVecSz);
+    if (!IsIdentity)
+      Cost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
                                   InsertVecTy, Mask);
+    auto *FirstInsert = cast<Instruction>(*find_if(E->Scalars, [E](Value *V) {
+      return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
+    }));
+    // Second cost - permutation with subvector, if some elements are from the
+    // initial vector or inserting a subvector.
+    // TODO: Implement the analysis of the FirstInsert->getOperand(0)
+    // subvector of ActualVecTy.
+    if (!isUndefVector(FirstInsert->getOperand(0)) && NumScalars != NumElts &&
+        (Offset != OffsetBeg || (OffsetEnd + 1) % VecScalarsSz != 0)) {
+      if (InsertVecSz != VecSz) {
+        auto *ActualVecTy =
+            FixedVectorType::get(SrcVecTy->getElementType(), VecSz);
+        Cost += TTI->getShuffleCost(TTI::SK_InsertSubvector, ActualVecTy,
                                     None, OffsetBeg - Offset, InsertVecTy);
       } else {
-        SmallVector<int> InsertMask(NumElts);
-        std::iota(InsertMask.begin(), InsertMask.end(), 0);
-        for (unsigned I = 0; I < NumElts; I++) {
+        for (unsigned I = 0, End = OffsetBeg - Offset; I < End; ++I)
+          Mask[I] = I;
+        for (unsigned I = OffsetBeg - Offset, End = OffsetEnd - Offset;
+             I <= End; ++I)
           if (Mask[I] != UndefMaskElem)
-            InsertMask[Offset + I] = NumElts + I;
-        }
-        Cost +=
-            TTI->getShuffleCost(TTI::SK_PermuteTwoSrc, SrcVecTy, InsertMask);
+            Mask[I] = I + VecSz;
+        for (unsigned I = OffsetEnd + 1 - Offset; I < VecSz; ++I)
+          Mask[I] = I;
+        Cost += TTI->getShuffleCost(TTI::SK_PermuteTwoSrc, InsertVecTy, Mask);
       }
     }
-
     return Cost;
   }
   case Instruction::ZExt:
@@ -6519,7 +6550,10 @@ bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const {
   // No need to vectorize inserts of gathered values.
   if (VectorizableTree.size() == 2 &&
       isa<InsertElementInst>(VectorizableTree[0]->Scalars[0]) &&
-      VectorizableTree[1]->State == TreeEntry::NeedToGather)
+      VectorizableTree[1]->State == TreeEntry::NeedToGather &&
+      (VectorizableTree[1]->getVectorFactor() <= 2 ||
+       !(isSplat(VectorizableTree[1]->Scalars) ||
+         allConstant(VectorizableTree[1]->Scalars))))
     return true;
 
   // We can vectorize the tree if its size is greater than or equal to the
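
For readers following the second hunk, the sketch below (not part of the patch) walks through the offset/size arithmetic the new InsertElement cost code performs. It is a minimal standalone program: PowerOf2Ceil is a stand-in for llvm::PowerOf2Ceil, and the concrete inputs (an 8-element destination vector, 2 register parts, inserts at indices 5..7) are hypothetical values chosen only to show how the narrower shuffle sizes fall out.

#include <algorithm>
#include <iostream>
#include <vector>

// Stand-in for llvm::PowerOf2Ceil: smallest power of two >= N (illustrative only).
static unsigned PowerOf2Ceil(unsigned N) {
  unsigned P = 1;
  while (P < N)
    P <<= 1;
  return P;
}

int main() {
  // Hypothetical example: an 8-element destination vector split by the target
  // into 2 register parts, with scalars inserted at indices 5, 6 and 7.
  unsigned NumElts = 8;
  unsigned NumOfParts = 2;
  std::vector<unsigned> InsertIndices = {5, 6, 7};

  // Mirrors the patch: find the smallest and largest insert positions.
  unsigned OffsetBeg = InsertIndices.front(), OffsetEnd = OffsetBeg;
  for (unsigned Idx : InsertIndices) {
    OffsetBeg = std::min(OffsetBeg, Idx);
    OffsetEnd = std::max(OffsetEnd, Idx);
  }

  // Elements per register part, then the sizes/offsets used to price shuffles.
  unsigned VecScalarsSz = PowerOf2Ceil(NumElts);
  if (NumOfParts > 0)
    VecScalarsSz = PowerOf2Ceil((NumElts + NumOfParts - 1) / NumOfParts);
  unsigned VecSz =
      (1 + OffsetEnd / VecScalarsSz - OffsetBeg / VecScalarsSz) * VecScalarsSz;
  unsigned Offset = VecScalarsSz * (OffsetBeg / VecScalarsSz);
  unsigned InsertVecSz = std::min<unsigned>(
      PowerOf2Ceil(OffsetEnd - OffsetBeg + 1),
      ((OffsetEnd - OffsetBeg + VecScalarsSz) / VecScalarsSz) * VecScalarsSz);

  // With these inputs: VecScalarsSz = 4, VecSz = 4, Offset = 4, InsertVecSz = 4,
  // i.e. all inserts land in the second 4-wide register part, so the cost model
  // can price a 4-element shuffle instead of an 8-element one.
  std::cout << VecScalarsSz << ' ' << VecSz << ' ' << Offset << ' '
            << InsertVecSz << '\n';
  return 0;
}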