Skip to content

Commit

Permalink
[LV] Consider insts feeding interleave group pointers free.
Browse files Browse the repository at this point in the history
For interleave groups, we only generate a pointer for the start of the
interleave group (the instruction at the insert position). The other
addresses for other members are alreayd considered free, but so are
their operands, if they are only used in address computations for
other interleave group members.
  • Loading branch information
fhahn committed Jun 19, 2024
1 parent 04cd069 commit b9702bb
Show file tree
Hide file tree
Showing 3 changed files with 34 additions and 26 deletions.
30 changes: 19 additions & 11 deletions llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7048,6 +7048,7 @@ void LoopVectorizationCostModel::collectValuesToIgnore() {
// Ignore ephemeral values.
CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore);

SmallSetVector<Value *, 4> DeadInterleavePointerOps;
for (BasicBlock *BB : TheLoop->blocks())
for (Instruction &I : *BB) {
// Find all stores to invariant variables. Since they are going to sink
Expand All @@ -7058,25 +7059,32 @@ void LoopVectorizationCostModel::collectValuesToIgnore() {
ValuesToIgnore.insert(&I);

// For interleave groups, we only create a pointer for the start of the
// interleave group. Mark single-use ops feeding interleave group mem ops
// as free when vectorizing, expect the insert-pos memory op.
// interleave group. Queue up addresses of group members except the insert
// position for further processing.
if (isAccessInterleaved(&I)) {
auto *Group = getInterleavedAccessGroup(&I);
if (Group->getInsertPos() == &I)
continue;
Value *PointerOp = getLoadStorePointerOperand(&I);
SmallSetVector<Value *, 4> Worklist;
Worklist.insert(PointerOp);
for (unsigned I = 0; I != Worklist.size(); ++I) {
auto *Op = dyn_cast<Instruction>(Worklist[I]);
if (!Op || !TheLoop->contains(Op) || !Op->hasOneUse())
continue;
VecValuesToIgnore.insert(Op);
Worklist.insert(Op->op_begin(), Op->op_end());
}
DeadInterleavePointerOps.insert(PointerOp);
}
}

// Mark ops feeding interleave group members as free, if they are only used
// by other dead computations.
for (unsigned I = 0; I != DeadInterleavePointerOps.size(); ++I) {
auto *Op = dyn_cast<Instruction>(DeadInterleavePointerOps[I]);
if (!Op || !TheLoop->contains(Op) || any_of(Op->users(), [this](User *U) {
Instruction *UI = cast<Instruction>(U);
return !VecValuesToIgnore.contains(U) &&
(!isAccessInterleaved(UI) ||
getInterleavedAccessGroup(UI)->getInsertPos() == UI);
}))
continue;
VecValuesToIgnore.insert(Op);
DeadInterleavePointerOps.insert(Op->op_begin(), Op->op_end());
}

// Ignore type-promoting instructions we identified during reduction
// detection.
for (const auto &Reduction : Legal->getReductionVars()) {
Expand Down
28 changes: 14 additions & 14 deletions llvm/test/Transforms/LoopVectorize/X86/interleave-cost.ll
Original file line number Diff line number Diff line change
Expand Up @@ -86,29 +86,29 @@ define void @test_free_instructions_feeding_geps_for_interleave_groups(ptr noali
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP39:%.*]] = add i64 [[INDEX]], 0
; CHECK-NEXT: [[TMP40:%.*]] = load float, ptr [[P_INVAR]], align 4
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[TMP40]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x float> poison, float [[TMP40]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x float> [[BROADCAST_SPLATINSERT]], <2 x float> poison, <2 x i32> zeroinitializer
; CHECK-NEXT: [[TMP41:%.*]] = shl i64 [[TMP39]], 2
; CHECK-NEXT: [[TMP42:%.*]] = load float, ptr [[P_INVAR]], align 4
; CHECK-NEXT: [[BROADCAST_SPLATINSERT33:%.*]] = insertelement <4 x float> poison, float [[TMP42]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT34:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT33]], <4 x float> poison, <4 x i32> zeroinitializer
; CHECK-NEXT: [[BROADCAST_SPLATINSERT27:%.*]] = insertelement <2 x float> poison, float [[TMP42]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT28:%.*]] = shufflevector <2 x float> [[BROADCAST_SPLATINSERT27]], <2 x float> poison, <2 x i32> zeroinitializer
; CHECK-NEXT: [[TMP43:%.*]] = or disjoint i64 [[TMP41]], 3
; CHECK-NEXT: [[TMP44:%.*]] = getelementptr float, ptr [[DST_1]], i64 [[TMP43]]
; CHECK-NEXT: [[TMP45:%.*]] = getelementptr float, ptr [[TMP44]], i32 -3
; CHECK-NEXT: [[TMP46:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLAT]], <4 x float> [[BROADCAST_SPLAT34]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; CHECK-NEXT: [[TMP47:%.*]] = shufflevector <8 x float> [[TMP46]], <8 x float> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <16 x float> [[TMP47]], <16 x float> poison, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
; CHECK-NEXT: store <16 x float> [[INTERLEAVED_VEC]], ptr [[TMP45]], align 4
; CHECK-NEXT: [[TMP46:%.*]] = shufflevector <2 x float> [[BROADCAST_SPLAT]], <2 x float> [[BROADCAST_SPLAT28]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
; CHECK-NEXT: [[TMP47:%.*]] = shufflevector <4 x float> [[TMP46]], <4 x float> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x float> [[TMP47]], <8 x float> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
; CHECK-NEXT: store <8 x float> [[INTERLEAVED_VEC]], ptr [[TMP45]], align 4
; CHECK-NEXT: [[TMP48:%.*]] = load float, ptr [[P_INVAR]], align 4
; CHECK-NEXT: [[BROADCAST_SPLATINSERT35:%.*]] = insertelement <4 x float> poison, float [[TMP48]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT36:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT35]], <4 x float> poison, <4 x i32> zeroinitializer
; CHECK-NEXT: [[BROADCAST_SPLATINSERT29:%.*]] = insertelement <2 x float> poison, float [[TMP48]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT30:%.*]] = shufflevector <2 x float> [[BROADCAST_SPLATINSERT29]], <2 x float> poison, <2 x i32> zeroinitializer
; CHECK-NEXT: [[TMP49:%.*]] = getelementptr float, ptr [[DST_2]], i64 [[TMP43]]
; CHECK-NEXT: [[TMP50:%.*]] = getelementptr float, ptr [[TMP49]], i32 -3
; CHECK-NEXT: [[BROADCAST_SPLAT36:%.*]] = shufflevector <2 x float> [[BROADCAST_SPLAT30]], <2 x float> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
; CHECK-NEXT: [[TMP51:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLAT36]], <4 x float> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; CHECK-NEXT: [[TMP52:%.*]] = shufflevector <8 x float> [[TMP51]], <8 x float> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
; CHECK-NEXT: [[INTERLEAVED_VEC37:%.*]] = shufflevector <16 x float> [[TMP52]], <16 x float> poison, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
; CHECK-NEXT: store <16 x float> [[INTERLEAVED_VEC37]], ptr [[TMP50]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[INTERLEAVED_VEC31:%.*]] = shufflevector <8 x float> [[TMP51]], <8 x float> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
; CHECK-NEXT: store <8 x float> [[INTERLEAVED_VEC31]], ptr [[TMP50]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
; CHECK-NEXT: [[TMP53:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0
; CHECK-NEXT: br i1 [[TMP53]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
Expand Down
2 changes: 1 addition & 1 deletion llvm/test/Transforms/LoopVectorize/X86/pr47437.ll
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
; RUN: opt < %s -S -passes=loop-vectorize -mtriple=x86_64-- -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE41
; RUN: opt < %s -S -passes=loop-vectorize -mtriple=x86_64-- -mattr=+avx | FileCheck %s --check-prefix=AVX1
; RUN: opt < %s -S -passes=loop-vectorize -mtriple=x86_64-- -mattr=+avx2 | FileCheck %s --check-prefix=AVX2
; RUN: opt < %s -S -passes=loop-vectorize -mtriple=x86_64-- -mcpu=slm | FileCheck %s --check-prefix=SSE2
; RUN: opt < %s -S -passes=loop-vectorize -mtriple=x86_64-- -mcpu=slm | FileCheck %s --check-prefix=SSE41

define void @test_muladd(ptr noalias nocapture %d1, ptr noalias nocapture readonly %s1, ptr noalias nocapture readonly %s2, i32 %n) {
; SSE2-LABEL: @test_muladd(
Expand Down

0 comments on commit b9702bb

Please sign in to comment.