
Commit f40c630

lukel97 authored and tomtor committed
[LV] Support scalable interleave groups for factors 3,5,6 and 7 (llvm#141865)
Currently the loop vectorizer can only vectorize interleave groups for power-of-2 factors at scalable VFs, by recursively combining [de]interleave2 intrinsics. However, after llvm#124825 and llvm#139893 we now have [de]interleave intrinsics for all factors up to 8, which is enough to support all types of segmented loads and stores on RISC-V.

Now that the interleaved access pass has been taught to lower these in llvm#139373 and llvm#141512, this patch teaches the loop vectorizer to emit these intrinsics for factors up to 8, which enables scalable vectorization for non-power-of-2 factors.

As far as I'm aware, no in-tree target will vectorize a scalable interleave group above factor 8, because the maximum interleave factor is capped at 4 on AArch64 and 8 on RISC-V, and the `-max-interleave-group-factor` CLI option defaults to 8, so the recursive [de]interleaving code has been removed for now.

Factors of 3 with scalable VFs are also turned off on AArch64, since there is no lowering for [de]interleave3 there just yet either.
1 parent c5d4737 commit f40c630
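As an illustration of what this enables (a hypothetical sketch, not taken from this patch's tests; the value names and types are invented here, assuming a factor-3 group of i32 at VF = vscale x 4), the vectorizer can now widen the group with a single deinterleave3/interleave3 pair instead of bailing out on the non-power-of-2 factor:

; load side: one wide load feeds a single deinterleave3, then one extractvalue per member
%wide.vec = load <vscale x 12 x i32>, ptr %src, align 4
%strided.vec = call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave3.nxv12i32(<vscale x 12 x i32> %wide.vec)
%x = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } %strided.vec, 0
%y = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } %strided.vec, 1
%z = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } %strided.vec, 2
; store side: one interleave3 packs the members back into a single wide store
%interleaved.vec = call <vscale x 12 x i32> @llvm.vector.interleave3.nxv12i32(<vscale x 4 x i32> %x, <vscale x 4 x i32> %y, <vscale x 4 x i32> %z)
store <vscale x 12 x i32> %interleaved.vec, ptr %dst, align 4

The interleaved access pass can then lower such calls to the target's segmented accesses (e.g. vlseg3/vsseg3 on RISC-V), per the lowering added in llvm#139373 and llvm#141512.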

File tree

8 files changed: +418 −458 lines

llvm/include/llvm/Analysis/VectorUtils.h

Lines changed: 6 additions & 0 deletions
@@ -176,6 +176,12 @@ LLVM_ABI bool isVectorIntrinsicWithStructReturnOverloadAtField(
 LLVM_ABI Intrinsic::ID
 getVectorIntrinsicIDForCall(const CallInst *CI, const TargetLibraryInfo *TLI);
 
+/// Returns the corresponding llvm.vector.interleaveN intrinsic for factor N.
+LLVM_ABI Intrinsic::ID getInterleaveIntrinsicID(unsigned Factor);
+
+/// Returns the corresponding llvm.vector.deinterleaveN intrinsic for factor N.
+LLVM_ABI Intrinsic::ID getDeinterleaveIntrinsicID(unsigned Factor);
+
 /// Given a vector and an element number, see if the scalar value is
 /// already around as a register, for example if it were inserted then extracted
 /// from the vector.

llvm/lib/Analysis/VectorUtils.cpp

Lines changed: 24 additions & 0 deletions
@@ -240,6 +240,30 @@ Intrinsic::ID llvm::getVectorIntrinsicIDForCall(const CallInst *CI,
   return Intrinsic::not_intrinsic;
 }
 
+struct InterleaveIntrinsic {
+  Intrinsic::ID Interleave, Deinterleave;
+};
+
+static InterleaveIntrinsic InterleaveIntrinsics[] = {
+    {Intrinsic::vector_interleave2, Intrinsic::vector_deinterleave2},
+    {Intrinsic::vector_interleave3, Intrinsic::vector_deinterleave3},
+    {Intrinsic::vector_interleave4, Intrinsic::vector_deinterleave4},
+    {Intrinsic::vector_interleave5, Intrinsic::vector_deinterleave5},
+    {Intrinsic::vector_interleave6, Intrinsic::vector_deinterleave6},
+    {Intrinsic::vector_interleave7, Intrinsic::vector_deinterleave7},
+    {Intrinsic::vector_interleave8, Intrinsic::vector_deinterleave8},
+};
+
+Intrinsic::ID llvm::getInterleaveIntrinsicID(unsigned Factor) {
+  assert(Factor >= 2 && Factor <= 8 && "Unexpected factor");
+  return InterleaveIntrinsics[Factor - 2].Interleave;
+}
+
+Intrinsic::ID llvm::getDeinterleaveIntrinsicID(unsigned Factor) {
+  assert(Factor >= 2 && Factor <= 8 && "Unexpected factor");
+  return InterleaveIntrinsics[Factor - 2].Deinterleave;
+}
+
 /// Given a vector and an element number, see if the scalar value is
 /// already around as a register, for example if it were inserted then extracted
 /// from the vector.
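For reference, the intrinsics these helpers return follow the same type-mangling convention as the factor-2 and factor-4 calls in the tests further down. A hedged sketch of the declarations that getInterleaveIntrinsicID(3) and getDeinterleaveIntrinsicID(3) resolve to, assuming <vscale x 4 x i32> members (the concrete element type and count are illustrative):

declare <vscale x 12 x i32> @llvm.vector.interleave3.nxv12i32(<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>)
declare { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave3.nxv12i32(<vscale x 12 x i32>)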

llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp

Lines changed: 7 additions & 0 deletions
@@ -4583,6 +4583,13 @@ InstructionCost AArch64TTIImpl::getInterleavedMemoryOpCost(
   if (VecTy->isScalableTy() && !ST->hasSVE())
     return InstructionCost::getInvalid();
 
+  // Scalable VFs will emit vector.[de]interleave intrinsics, and currently we
+  // only have lowering for power-of-2 factors.
+  // TODO: Add lowering for vector.[de]interleave3 intrinsics and support in
+  // InterleavedAccessPass for ld3/st3
+  if (VecTy->isScalableTy() && !isPowerOf2_32(Factor))
+    return InstructionCost::getInvalid();
+
   // Vectorization for masked interleaved accesses is only enabled for scalable
   // VF.
   if (!VecTy->isScalableTy() && (UseMaskForCond || UseMaskForGaps))

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 6 additions & 8 deletions
@@ -3166,10 +3166,9 @@ bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(
   if (hasIrregularType(ScalarTy, DL))
     return false;
 
-  // For scalable vectors, the only interleave factor currently supported
-  // must be power of 2 since we require the (de)interleave2 intrinsics
-  // instead of shufflevectors.
-  if (VF.isScalable() && !isPowerOf2_32(InterleaveFactor))
+  // For scalable vectors, the interleave factors must be <= 8 since we require
+  // the (de)interleaveN intrinsics instead of shufflevectors.
+  if (VF.isScalable() && InterleaveFactor > 8)
     return false;
 
   // If the group involves a non-integral pointer, we may not be able to
@@ -8718,10 +8717,9 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
     bool Result = (VF.isVector() && // Query is illegal for VF == 1
                    CM.getWideningDecision(IG->getInsertPos(), VF) ==
                        LoopVectorizationCostModel::CM_Interleave);
-    // For scalable vectors, the only interleave factor currently supported
-    // must be power of 2 since we require the (de)interleave2 intrinsics
-    // instead of shufflevectors.
-    assert((!Result || !VF.isScalable() || isPowerOf2_32(IG->getFactor())) &&
+    // For scalable vectors, the interleave factors must be <= 8 since we
+    // require the (de)interleaveN intrinsics instead of shufflevectors.
+    assert((!Result || !VF.isScalable() || IG->getFactor() <= 8) &&
            "Unsupported interleave factor for scalable vectors");
     return Result;
   };

llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp

Lines changed: 15 additions & 48 deletions
@@ -3296,21 +3296,13 @@ static Value *interleaveVectors(IRBuilderBase &Builder, ArrayRef<Value *> Vals,
   // Scalable vectors cannot use arbitrary shufflevectors (only splats), so
   // must use intrinsics to interleave.
   if (VecTy->isScalableTy()) {
-    assert(isPowerOf2_32(Factor) && "Unsupported interleave factor for "
-                                    "scalable vectors, must be power of 2");
-    SmallVector<Value *> InterleavingValues(Vals);
-    // When interleaving, the number of values will be shrunk until we have the
-    // single final interleaved value.
-    auto *InterleaveTy = cast<VectorType>(InterleavingValues[0]->getType());
-    for (unsigned Midpoint = Factor / 2; Midpoint > 0; Midpoint /= 2) {
-      InterleaveTy = VectorType::getDoubleElementsVectorType(InterleaveTy);
-      for (unsigned I = 0; I < Midpoint; ++I)
-        InterleavingValues[I] = Builder.CreateIntrinsic(
-            InterleaveTy, Intrinsic::vector_interleave2,
-            {InterleavingValues[I], InterleavingValues[Midpoint + I]},
-            /*FMFSource=*/nullptr, Name);
-    }
-    return InterleavingValues[0];
+    assert(Factor <= 8 && "Unsupported interleave factor for scalable vectors");
+    VectorType *InterleaveTy =
+        VectorType::get(VecTy->getElementType(),
+                        VecTy->getElementCount().multiplyCoefficientBy(Factor));
+    return Builder.CreateIntrinsic(InterleaveTy,
+                                   getInterleaveIntrinsicID(Factor), Vals,
+                                   /*FMFSource=*/nullptr, Name);
   }
 
   // Fixed length. Start by concatenating all vectors into a wide vector.
@@ -3396,7 +3388,7 @@ void VPInterleaveRecipe::execute(VPTransformState &State) {
                      &InterleaveFactor](Value *MaskForGaps) -> Value * {
     if (State.VF.isScalable()) {
       assert(!MaskForGaps && "Interleaved groups with gaps are not supported.");
-      assert(isPowerOf2_32(InterleaveFactor) &&
+      assert(InterleaveFactor <= 8 &&
             "Unsupported deinterleave factor for scalable vectors");
       auto *ResBlockInMask = State.get(BlockInMask);
       SmallVector<Value *> Ops(InterleaveFactor, ResBlockInMask);
@@ -3440,43 +3432,18 @@ void VPInterleaveRecipe::execute(VPTransformState &State) {
   ArrayRef<VPValue *> VPDefs = definedValues();
   const DataLayout &DL = State.CFG.PrevBB->getDataLayout();
   if (VecTy->isScalableTy()) {
-    assert(isPowerOf2_32(InterleaveFactor) &&
-           "Unsupported deinterleave factor for scalable vectors");
-
     // Scalable vectors cannot use arbitrary shufflevectors (only splats),
     // so must use intrinsics to deinterleave.
-    SmallVector<Value *> DeinterleavedValues(InterleaveFactor);
-    DeinterleavedValues[0] = NewLoad;
-    // For the case of InterleaveFactor > 2, we will have to do recursive
-    // deinterleaving, because the current available deinterleave intrinsic
-    // supports only Factor of 2, otherwise it will bailout after first
-    // iteration.
-    // When deinterleaving, the number of values will double until we
-    // have "InterleaveFactor".
-    for (unsigned NumVectors = 1; NumVectors < InterleaveFactor;
-         NumVectors *= 2) {
-      // Deinterleave the elements within the vector
-      SmallVector<Value *> TempDeinterleavedValues(NumVectors);
-      for (unsigned I = 0; I < NumVectors; ++I) {
-        auto *DiTy = DeinterleavedValues[I]->getType();
-        TempDeinterleavedValues[I] = State.Builder.CreateIntrinsic(
-            Intrinsic::vector_deinterleave2, DiTy, DeinterleavedValues[I],
-            /*FMFSource=*/nullptr, "strided.vec");
-      }
-      // Extract the deinterleaved values:
-      for (unsigned I = 0; I < 2; ++I)
-        for (unsigned J = 0; J < NumVectors; ++J)
-          DeinterleavedValues[NumVectors * I + J] =
-              State.Builder.CreateExtractValue(TempDeinterleavedValues[J], I);
-    }
+    assert(InterleaveFactor <= 8 &&
+           "Unsupported deinterleave factor for scalable vectors");
+    Value *Deinterleave = State.Builder.CreateIntrinsic(
+        getDeinterleaveIntrinsicID(InterleaveFactor), NewLoad->getType(),
+        NewLoad,
+        /*FMFSource=*/nullptr, "strided.vec");
 
-#ifndef NDEBUG
-    for (Value *Val : DeinterleavedValues)
-      assert(Val && "NULL Deinterleaved Value");
-#endif
     for (unsigned I = 0, J = 0; I < InterleaveFactor; ++I) {
       Instruction *Member = Group->getMember(I);
-      Value *StridedVec = DeinterleavedValues[I];
+      Value *StridedVec = State.Builder.CreateExtractValue(Deinterleave, I);
       if (!Member) {
         // This value is not needed as it's not used
         cast<Instruction>(StridedVec)->eraseFromParent();
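The mask path in VPInterleaveRecipe follows the same pattern: for a predicated group at a scalable VF, the block-in mask is replicated InterleaveFactor times (the SmallVector of Ops in the hunk above) and packed with the same interleave intrinsic, so the wide access sees one lane-interleaved mask. A hypothetical sketch for factor 3 at VF = vscale x 4 (value names invented; the masked-load mangling shown here is an assumption, not taken from this patch):

%interleaved.mask = call <vscale x 12 x i1> @llvm.vector.interleave3.nxv12i1(<vscale x 4 x i1> %block.mask, <vscale x 4 x i1> %block.mask, <vscale x 4 x i1> %block.mask)
%wide.masked.vec = call <vscale x 12 x i32> @llvm.masked.load.nxv12i32.p0(ptr %addr, i32 4, <vscale x 12 x i1> %interleaved.mask, <vscale x 12 x i32> poison)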

llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll

Lines changed: 18 additions & 34 deletions
@@ -375,8 +375,8 @@ define void @test_reversed_load2_store2(ptr noalias nocapture readonly %A, ptr n
 ; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <vscale x 8 x i32>, ptr [[TMP9]], align 4
 ; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_VEC]])
 ; CHECK-NEXT: [[TMP10:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 0
-; CHECK-NEXT: [[TMP11:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 1
 ; CHECK-NEXT: [[REVERSE:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP10]])
+; CHECK-NEXT: [[TMP11:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 1
 ; CHECK-NEXT: [[REVERSE1:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP11]])
 ; CHECK-NEXT: [[TMP12:%.*]] = add nsw <vscale x 4 x i32> [[REVERSE]], [[VEC_IND]]
 ; CHECK-NEXT: [[TMP13:%.*]] = sub nsw <vscale x 4 x i32> [[REVERSE1]], [[VEC_IND]]
@@ -1479,34 +1479,24 @@ define void @interleave_deinterleave(ptr writeonly noalias %dst, ptr readonly %a
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT_XYZT:%.*]], ptr [[A:%.*]], i64 [[INDEX]]
 ; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <vscale x 16 x i32>, ptr [[TMP6]], align 4
-; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 8 x i32>, <vscale x 8 x i32> } @llvm.vector.deinterleave2.nxv16i32(<vscale x 16 x i32> [[WIDE_VEC]])
-; CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } [[STRIDED_VEC]], 0
-; CHECK-NEXT: [[TMP8:%.*]] = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } [[STRIDED_VEC]], 1
-; CHECK-NEXT: [[STRIDED_VEC6:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[TMP7]])
-; CHECK-NEXT: [[STRIDED_VEC7:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[TMP8]])
-; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC6]], 0
-; CHECK-NEXT: [[TMP10:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC7]], 0
-; CHECK-NEXT: [[TMP11:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC6]], 1
-; CHECK-NEXT: [[TMP12:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC7]], 1
+; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave4.nxv16i32(<vscale x 16 x i32> [[WIDE_VEC]])
+; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 0
+; CHECK-NEXT: [[TMP10:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 1
+; CHECK-NEXT: [[TMP11:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 2
+; CHECK-NEXT: [[TMP12:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 3
 ; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds [[STRUCT_XYZT]], ptr [[B:%.*]], i64 [[INDEX]]
 ; CHECK-NEXT: [[WIDE_VEC8:%.*]] = load <vscale x 16 x i32>, ptr [[TMP13]], align 4
-; CHECK-NEXT: [[STRIDED_VEC9:%.*]] = call { <vscale x 8 x i32>, <vscale x 8 x i32> } @llvm.vector.deinterleave2.nxv16i32(<vscale x 16 x i32> [[WIDE_VEC8]])
-; CHECK-NEXT: [[TMP14:%.*]] = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } [[STRIDED_VEC9]], 0
-; CHECK-NEXT: [[TMP15:%.*]] = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } [[STRIDED_VEC9]], 1
-; CHECK-NEXT: [[STRIDED_VEC10:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[TMP14]])
-; CHECK-NEXT: [[STRIDED_VEC11:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[TMP15]])
-; CHECK-NEXT: [[TMP16:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC10]], 0
-; CHECK-NEXT: [[TMP17:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC11]], 0
-; CHECK-NEXT: [[TMP18:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC10]], 1
-; CHECK-NEXT: [[TMP19:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC11]], 1
+; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave4.nxv16i32(<vscale x 16 x i32> [[WIDE_VEC8]])
+; CHECK-NEXT: [[TMP16:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC2]], 0
+; CHECK-NEXT: [[TMP17:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC2]], 1
+; CHECK-NEXT: [[TMP18:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC2]], 2
+; CHECK-NEXT: [[TMP19:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC2]], 3
 ; CHECK-NEXT: [[TMP20:%.*]] = add nsw <vscale x 4 x i32> [[TMP16]], [[TMP9]]
 ; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds [[STRUCT_XYZT]], ptr [[DST:%.*]], i64 [[INDEX]]
 ; CHECK-NEXT: [[TMP22:%.*]] = sub nsw <vscale x 4 x i32> [[TMP10]], [[TMP17]]
 ; CHECK-NEXT: [[TMP23:%.*]] = shl <vscale x 4 x i32> [[TMP11]], [[TMP18]]
 ; CHECK-NEXT: [[TMP24:%.*]] = ashr <vscale x 4 x i32> [[TMP12]], [[TMP19]]
-; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> [[TMP20]], <vscale x 4 x i32> [[TMP23]])
-; CHECK-NEXT: [[INTERLEAVED_VEC12:%.*]] = call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> [[TMP22]], <vscale x 4 x i32> [[TMP24]])
-; CHECK-NEXT: [[INTERLEAVED_VEC13:%.*]] = call <vscale x 16 x i32> @llvm.vector.interleave2.nxv16i32(<vscale x 8 x i32> [[INTERLEAVED_VEC]], <vscale x 8 x i32> [[INTERLEAVED_VEC12]])
+; CHECK-NEXT: [[INTERLEAVED_VEC13:%.*]] = call <vscale x 16 x i32> @llvm.vector.interleave4.nxv16i32(<vscale x 4 x i32> [[TMP20]], <vscale x 4 x i32> [[TMP22]], <vscale x 4 x i32> [[TMP23]], <vscale x 4 x i32> [[TMP24]])
 ; CHECK-NEXT: store <vscale x 16 x i32> [[INTERLEAVED_VEC13]], ptr [[TMP21]], align 4
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
 ; CHECK-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -1595,18 +1585,14 @@ define void @interleave_deinterleave_reverse(ptr noalias nocapture readonly %A,
 ; CHECK-NEXT: [[TMP9:%.*]] = sext i32 [[TMP8]] to i64
 ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i64 [[TMP9]]
 ; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <vscale x 16 x i32>, ptr [[TMP10]], align 4
-; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 8 x i32>, <vscale x 8 x i32> } @llvm.vector.deinterleave2.nxv16i32(<vscale x 16 x i32> [[WIDE_VEC]])
-; CHECK-NEXT: [[TMP11:%.*]] = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } [[STRIDED_VEC]], 0
-; CHECK-NEXT: [[TMP12:%.*]] = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } [[STRIDED_VEC]], 1
-; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[TMP11]])
-; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[TMP12]])
-; CHECK-NEXT: [[TMP13:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC1]], 0
-; CHECK-NEXT: [[TMP14:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC2]], 0
-; CHECK-NEXT: [[TMP15:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC1]], 1
-; CHECK-NEXT: [[TMP16:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC2]], 1
+; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave4.nxv16i32(<vscale x 16 x i32> [[WIDE_VEC]])
+; CHECK-NEXT: [[TMP13:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 0
 ; CHECK-NEXT: [[REVERSE:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP13]])
+; CHECK-NEXT: [[TMP14:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 1
 ; CHECK-NEXT: [[REVERSE3:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP14]])
+; CHECK-NEXT: [[TMP15:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 2
 ; CHECK-NEXT: [[REVERSE4:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP15]])
+; CHECK-NEXT: [[TMP16:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 3
 ; CHECK-NEXT: [[REVERSE5:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP16]])
 ; CHECK-NEXT: [[TMP17:%.*]] = add nsw <vscale x 4 x i32> [[REVERSE]], [[VEC_IND]]
 ; CHECK-NEXT: [[TMP18:%.*]] = sub nsw <vscale x 4 x i32> [[REVERSE3]], [[VEC_IND]]
@@ -1622,9 +1608,7 @@ define void @interleave_deinterleave_reverse(ptr noalias nocapture readonly %A,
 ; CHECK-NEXT: [[REVERSE7:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP18]])
 ; CHECK-NEXT: [[REVERSE8:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP19]])
 ; CHECK-NEXT: [[REVERSE9:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP20]])
-; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> [[REVERSE6]], <vscale x 4 x i32> [[REVERSE8]])
-; CHECK-NEXT: [[INTERLEAVED_VEC10:%.*]] = call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> [[REVERSE7]], <vscale x 4 x i32> [[REVERSE9]])
-; CHECK-NEXT: [[INTERLEAVED_VEC11:%.*]] = call <vscale x 16 x i32> @llvm.vector.interleave2.nxv16i32(<vscale x 8 x i32> [[INTERLEAVED_VEC]], <vscale x 8 x i32> [[INTERLEAVED_VEC10]])
+; CHECK-NEXT: [[INTERLEAVED_VEC11:%.*]] = call <vscale x 16 x i32> @llvm.vector.interleave4.nxv16i32(<vscale x 4 x i32> [[REVERSE6]], <vscale x 4 x i32> [[REVERSE7]], <vscale x 4 x i32> [[REVERSE8]], <vscale x 4 x i32> [[REVERSE9]])
 ; CHECK-NEXT: store <vscale x 16 x i32> [[INTERLEAVED_VEC11]], ptr [[TMP26]], align 4
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP1]]
 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i32> [[VEC_IND]], [[DOTSPLAT]]
