[LV] Optimise users of induction variables in early exit blocks #130766
Conversation
@llvm/pr-subscribers-vectorizers @llvm/pr-subscribers-llvm-transforms

Author: David Sherwood (david-arm)

Changes

This is the second of two PRs that attempts to improve the IR generated in the exit blocks of vectorised loops with uncountable early exits. It follows on from PR #128880. In this PR I am improving the generated code for users of induction variables in early exit blocks.

This required adding a new VPInstruction called FirstActiveLane, which calculates the index of the first active predicate in the mask operand.

I have added a new function optimizeEarlyExitInductionUser that is called from optimizeInductionExitUsers when handling users in early exit blocks.

Patch is 96.39 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/130766.diff

6 Files Affected:
- (modified) llvm/lib/Transforms/Vectorize/VPlan.h
- (modified) llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
- (modified) llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
- (modified) llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
- (modified) llvm/test/Transforms/LoopVectorize/AArch64/simple_early_exit.ll
- (modified) llvm/test/Transforms/LoopVectorize/single_early_exit_live_outs.ll
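A minimal before/after sketch distilled from the test updates in this patch (the function and value names below are invented for illustration):

```llvm
declare i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1>, i1)

; Before: the exiting lane is extracted from a widened induction vector,
; which keeps a vector IV phi alive in the loop.
define i64 @early_exit_value_before(<4 x i1> %mask, <4 x i64> %vec.ind) {
  %lane = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> %mask, i1 true)
  %val = extractelement <4 x i64> %vec.ind, i64 %lane
  ret i64 %val
}

; After: the value is recomputed from the scalar canonical IV, so the
; vector IV phi (and its per-iteration update) can be removed entirely.
define i64 @early_exit_value_after(<4 x i1> %mask, i64 %index) {
  %lane = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> %mask, i1 true)
  %iv = add i64 %index, %lane   ; canonical IV + first active lane
  %val = add i64 3, %iv         ; + IV start value (3 in the tests below)
  ret i64 %val
}
```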
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 43dc30c40bb53..979123d1ab0af 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -882,6 +882,8 @@ class VPInstruction : public VPRecipeWithIRFlags,
// Extracts the first active lane of a vector, where the first operand is
// the predicate, and the second operand is the vector to extract.
ExtractFirstActive,
+ // Calculates the first active lane index of the vector predicate operand.
+ FirstActiveLane,
};
private:
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
index 6f6875f0e5e0e..2aea5f6286668 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
@@ -85,6 +85,8 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPInstruction *R) {
return VecTy->getElementType();
return BaseTy;
}
+ case VPInstruction::FirstActiveLane:
+ return IntegerType::get(Ctx, 64);
case VPInstruction::LogicalAnd:
assert(inferScalarType(R->getOperand(0))->isIntegerTy(1) &&
inferScalarType(R->getOperand(1))->isIntegerTy(1) &&
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index a8b304271f0da..2f11ce7e7fe71 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -712,6 +712,11 @@ Value *VPInstruction::generate(VPTransformState &State) {
Builder.getInt64Ty(), Mask, true, "first.active.lane");
return Builder.CreateExtractElement(Vec, Ctz, "early.exit.value");
}
+ case VPInstruction::FirstActiveLane: {
+ Value *Mask = State.get(getOperand(0));
+ return Builder.CreateCountTrailingZeroElems(Builder.getInt64Ty(), Mask,
+ true, "first.active.lane");
+ }
default:
llvm_unreachable("Unsupported opcode for instruction");
}
@@ -754,6 +759,7 @@ InstructionCost VPInstruction::computeCost(ElementCount VF,
bool VPInstruction::isVectorToScalar() const {
return getOpcode() == VPInstruction::ExtractFromEnd ||
getOpcode() == VPInstruction::ExtractFirstActive ||
+ getOpcode() == VPInstruction::FirstActiveLane ||
getOpcode() == VPInstruction::ComputeReductionResult ||
getOpcode() == VPInstruction::AnyOf;
}
@@ -946,6 +952,9 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent,
case VPInstruction::ExtractFirstActive:
O << "extract-first-active";
break;
+ case VPInstruction::FirstActiveLane:
+ O << "first-active-lane";
+ break;
default:
O << Instruction::getOpcodeName(getOpcode());
}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 3b44e95a2471c..ecb5b64461a64 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -737,6 +737,64 @@ static VPWidenInductionRecipe *getOptimizableIVOf(VPValue *VPV) {
return IsWideIVInc() ? WideIV : nullptr;
}
+/// Attempts to optimize the induction variable exit values for users in the
+/// early exit block.
+static VPValue *optimizeEarlyExitInductionUser(VPlan &Plan,
+ VPTypeAnalysis &TypeInfo,
+ VPBlockBase *PredVPBB,
+ VPValue *Op) {
+ using namespace VPlanPatternMatch;
+
+ VPValue *Incoming, *Mask;
+ if (!match(Op, m_VPInstruction<VPInstruction::ExtractFirstActive>(
+ m_VPValue(Incoming), m_VPValue(Mask))))
+ return nullptr;
+
+ auto *WideIV = getOptimizableIVOf(Incoming);
+ if (!WideIV)
+ return nullptr;
+
+ auto *WideIntOrFp = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV);
+ if (WideIntOrFp && WideIntOrFp->getTruncInst())
+ return nullptr;
+
+ // Calculate the final index.
+ VPValue *EndValue = Plan.getCanonicalIV();
+ auto CanonicalIVType = Plan.getCanonicalIV()->getScalarType();
+ VPBuilder B(cast<VPBasicBlock>(PredVPBB));
+
+ DebugLoc DL = cast<VPInstruction>(Op)->getDebugLoc();
+ VPValue *FirstActiveLane =
+ B.createNaryOp(VPInstruction::FirstActiveLane, Mask, DL);
+ if (CanonicalIVType != TypeInfo.inferScalarType(FirstActiveLane)) {
+ Instruction::CastOps CastOp = CanonicalIVType->getScalarSizeInBits() < 64
+ ? Instruction::Trunc
+ : Instruction::ZExt;
+ FirstActiveLane =
+ B.createScalarCast(CastOp, FirstActiveLane, CanonicalIVType, DL);
+ }
+ EndValue = B.createNaryOp(Instruction::Add, {EndValue, FirstActiveLane}, DL);
+
+ // `getOptimizableIVOf()` always returns the pre-incremented IV, so if it
+ // changed it means the exit is using the incremented value, so we need to
+ // add the step.
+ if (Incoming != WideIV) {
+ VPValue *One = Plan.getOrAddLiveIn(ConstantInt::get(CanonicalIVType, 1));
+ EndValue = B.createNaryOp(Instruction::Add, {EndValue, One}, DL);
+ }
+
+ if (!WideIntOrFp || !WideIntOrFp->isCanonical()) {
+ const InductionDescriptor &ID = WideIV->getInductionDescriptor();
+ VPValue *Start = WideIV->getStartValue();
+ VPValue *Step = WideIV->getStepValue();
+ EndValue = B.createDerivedIV(
+ ID.getKind(), dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()),
+ Start, EndValue, Step);
+ }
+
+ return EndValue;
+}
+
/// Attempts to optimize the induction variable exit values for users in the
/// exit block coming from the latch in the original scalar loop.
static VPValue *
@@ -799,12 +857,15 @@ void VPlanTransforms::optimizeInductionExitUsers(
break;
for (auto [Idx, PredVPBB] : enumerate(ExitVPBB->getPredecessors())) {
+ VPValue *Escape = nullptr;
if (PredVPBB == MiddleVPBB)
- if (VPValue *Escape = optimizeLatchExitInductionUser(
- Plan, TypeInfo, PredVPBB, ExitIRI->getOperand(Idx),
- EndValues))
- ExitIRI->setOperand(Idx, Escape);
- // TODO: Optimize early exit induction users in follow-on patch.
+ Escape = optimizeLatchExitInductionUser(
+ Plan, TypeInfo, PredVPBB, ExitIRI->getOperand(Idx), EndValues);
+ else
+ Escape = optimizeEarlyExitInductionUser(Plan, TypeInfo, PredVPBB,
+ ExitIRI->getOperand(Idx));
+ if (Escape)
+ ExitIRI->setOperand(Idx, Escape);
}
}
}
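In optimizeEarlyExitInductionUser above, getOptimizableIVOf() always returns the pre-incremented IV recipe, so the post-incremented case gets one extra step added on top, and non-canonical inductions are then finished with a derived IV (roughly start + index * step for integer IVs). A minimal sketch of the post-increment, canonical-IV case, with invented names:

```llvm
declare i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1>, i1)

; Post-increment sketch: the extra "+ 1" corresponds to the Incoming != WideIV
; case, where the early-exit user reads the already-incremented IV.
define i64 @post_inc_early_exit_value(<4 x i1> %mask, i64 %index) {
  %lane = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> %mask, i1 true)
  %iv = add i64 %index, %lane
  %iv.next = add i64 %iv, 1
  ret i64 %iv.next
}
```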
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/simple_early_exit.ll b/llvm/test/Transforms/LoopVectorize/AArch64/simple_early_exit.ll
index 5f926db1131f6..02a16234b86cb 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/simple_early_exit.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/simple_early_exit.ll
@@ -25,16 +25,9 @@ define i64 @same_exit_block_pre_inc_use1() #1 {
; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 16
; CHECK-NEXT: [[TMP6:%.*]] = add i64 3, [[N_VEC]]
-; CHECK-NEXT: [[TMP7:%.*]] = call <vscale x 16 x i64> @llvm.stepvector.nxv16i64()
-; CHECK-NEXT: [[TMP8:%.*]] = mul <vscale x 16 x i64> [[TMP7]], splat (i64 1)
-; CHECK-NEXT: [[INDUCTION:%.*]] = add <vscale x 16 x i64> splat (i64 3), [[TMP8]]
-; CHECK-NEXT: [[TMP9:%.*]] = mul i64 1, [[TMP5]]
-; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 16 x i64> poison, i64 [[TMP9]], i64 0
-; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 16 x i64> [[DOTSPLATINSERT]], <vscale x 16 x i64> poison, <vscale x 16 x i32> zeroinitializer
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT3:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 16 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX1]]
; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[OFFSET_IDX]], 0
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[TMP10]]
@@ -48,7 +41,6 @@ define i64 @same_exit_block_pre_inc_use1() #1 {
; CHECK-NEXT: [[TMP16:%.*]] = xor <vscale x 16 x i1> [[TMP15]], splat (i1 true)
; CHECK-NEXT: [[TMP17:%.*]] = call i1 @llvm.vector.reduce.or.nxv16i1(<vscale x 16 x i1> [[TMP16]])
; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT3]], [[N_VEC]]
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 16 x i64> [[VEC_IND]], [[DOTSPLAT]]
; CHECK-NEXT: [[TMP19:%.*]] = or i1 [[TMP17]], [[TMP18]]
; CHECK-NEXT: br i1 [[TMP19]], label [[MIDDLE_SPLIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: middle.split:
@@ -58,7 +50,8 @@ define i64 @same_exit_block_pre_inc_use1() #1 {
; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOP_END:%.*]], label [[SCALAR_PH]]
; CHECK: vector.early.exit:
; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> [[TMP16]], i1 true)
-; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = extractelement <vscale x 16 x i64> [[VEC_IND]], i64 [[FIRST_ACTIVE_LANE]]
+; CHECK-NEXT: [[TMP20:%.*]] = add i64 [[INDEX1]], [[FIRST_ACTIVE_LANE]]
+; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = add i64 3, [[TMP20]]
; CHECK-NEXT: br label [[LOOP_END]]
; CHECK: scalar.ph:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP6]], [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ]
@@ -140,7 +133,8 @@ define i64 @same_exit_block_pre_inc_use4() {
; CHECK-NEXT: br i1 true, label [[LOOP_END:%.*]], label [[SCALAR_PH]]
; CHECK: vector.early.exit:
; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v2i1(<2 x i1> [[TMP4]], i1 true)
-; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = extractelement <2 x i64> [[VEC_IND]], i64 [[FIRST_ACTIVE_LANE]]
+; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[INDEX1]], [[FIRST_ACTIVE_LANE]]
+; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = add i64 3, [[TMP8]]
; CHECK-NEXT: br label [[LOOP_END]]
; CHECK: scalar.ph:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 67, [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ]
@@ -197,7 +191,6 @@ define i64 @loop_contains_safe_call() #1 {
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 3, i64 4, i64 5, i64 6>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX1]]
; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[P1]], i64 [[TMP0]]
@@ -209,7 +202,6 @@ define i64 @loop_contains_safe_call() #1 {
; CHECK-NEXT: [[TMP5:%.*]] = xor <4 x i1> [[TMP4]], splat (i1 true)
; CHECK-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP5]])
; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT2]], 64
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
; CHECK-NEXT: [[TMP8:%.*]] = or i1 [[TMP6]], [[TMP7]]
; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_SPLIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; CHECK: middle.split:
@@ -218,7 +210,8 @@ define i64 @loop_contains_safe_call() #1 {
; CHECK-NEXT: br i1 true, label [[LOOP_END:%.*]], label [[SCALAR_PH]]
; CHECK: vector.early.exit:
; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP5]], i1 true)
-; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = extractelement <4 x i64> [[VEC_IND]], i64 [[FIRST_ACTIVE_LANE]]
+; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX1]], [[FIRST_ACTIVE_LANE]]
+; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = add i64 3, [[TMP9]]
; CHECK-NEXT: br label [[LOOP_END]]
; CHECK: scalar.ph:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 67, [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ]
@@ -283,16 +276,9 @@ define i64 @loop_contains_safe_div() #1 {
; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX1]]
-; CHECK-NEXT: [[TMP16:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
-; CHECK-NEXT: [[TMP17:%.*]] = mul <vscale x 4 x i64> [[TMP16]], splat (i64 1)
-; CHECK-NEXT: [[INDUCTION:%.*]] = add <vscale x 4 x i64> splat (i64 3), [[TMP17]]
-; CHECK-NEXT: [[TMP9:%.*]] = mul i64 1, [[TMP5]]
-; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP9]], i64 0
-; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[DOTSPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX2:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[OFFSET_IDX1:%.*]] = add i64 3, [[INDEX2]]
; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[OFFSET_IDX1]], 0
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[P1]], i64 [[TMP0]]
@@ -304,7 +290,6 @@ define i64 @loop_contains_safe_div() #1 {
; CHECK-NEXT: [[TMP15:%.*]] = xor <vscale x 4 x i1> [[TMP14]], splat (i1 true)
; CHECK-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP15]])
; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT2]], [[INDEX1]]
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[DOTSPLAT]]
; CHECK-NEXT: [[TMP8:%.*]] = or i1 [[TMP6]], [[TMP7]]
; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_SPLIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; CHECK: middle.split:
@@ -314,7 +299,8 @@ define i64 @loop_contains_safe_div() #1 {
; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOP_END:%.*]], label [[SCALAR_PH]]
; CHECK: vector.early.exit:
; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv4i1(<vscale x 4 x i1> [[TMP15]], i1 true)
-; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = extractelement <vscale x 4 x i64> [[VEC_IND]], i64 [[FIRST_ACTIVE_LANE]]
+; CHECK-NEXT: [[TMP16:%.*]] = add i64 [[INDEX2]], [[FIRST_ACTIVE_LANE]]
+; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = add i64 3, [[TMP16]]
; CHECK-NEXT: br label [[LOOP_END]]
; CHECK: scalar.ph:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[OFFSET_IDX]], [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ]
@@ -375,7 +361,6 @@ define i64 @loop_contains_load_after_early_exit(ptr dereferenceable(1024) align(
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT3:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 3, i64 4, i64 5, i64 6>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX1]]
; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[P1]], i64 [[TMP0]]
@@ -389,7 +374,6 @@ define i64 @loop_contains_load_after_early_exit(ptr dereferenceable(1024) align(
; CHECK-NEXT: [[TMP6:%.*]] = xor <4 x i1> [[TMP3]], splat (i1 true)
; CHECK-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP6]])
; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT3]], 64
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
; CHECK-NEXT: [[TMP9:%.*]] = or i1 [[TMP7]], [[TMP8]]
; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_SPLIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
; CHECK: middle.split:
@@ -399,7 +383,8 @@ define i64 @loop_contains_load_after_early_exit(ptr dereferenceable(1024) align(
; CHECK-NEXT: br i1 true, label [[LOOP_END:%.*]], label [[SCALAR_PH]]
; CHECK: vector.early.exit:
; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP6]], i1 true)
-; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = extractelement <4 x i64> [[VEC_IND]], i64 [[FIRST_ACTIVE_LANE]]
+; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[INDEX1]], [[FIRST_ACTIVE_LANE]]
+; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = add i64 3, [[TMP11]]
; CHECK-NEXT: br label [[LOOP_END]]
; CHECK: scalar.ph:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 67, [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/single_early_exit_live_outs.ll b/llvm/test/Transforms/LoopVectorize/single_early_exit_live_outs.ll
index 9cb4b6f9d8680..82b908eab4871 100644
--- a/llvm/test/Transforms/LoopVectorize/single_early_exit_live_outs.ll
+++ b/llvm/test/Transforms/LoopVectorize/single_early_exit_live_outs.ll
@@ -15,7 +15,6 @@ define i64 @same_exit_block_pre_inc_use1() {
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT3:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 3, i64 4, i64 5, i64 6>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX1]]
; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[TMP0]]
@@ -29,7 +28,6 @@ define i64 @same_exit_block_pre_inc_use1() {
; CHECK-NEXT: [[TMP6:%.*]] = xor <4 x i1> [[TMP5]], splat (i1 true)
; CHECK-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP6]])
; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT3]], 64
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
; CHECK-NEXT: [[TMP9:%.*]] = or i1 [[TMP7]], [[TMP8]]
; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_SPLIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: middle.split:
@@ -38,7 +36,8 @@ define i64 @same_exit_block_pre_inc_use1() {
; CHECK-NEXT: br i1 true, label [[LOOP_END:%.*]], label [[SCALAR_PH]]
; CHECK: vector.early.exit:
; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP6]], i1 true)
-; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = extractelement <4 x i64> [[VEC_IND]], i64 [[FIRST_ACTIVE_LANE]]
+; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[INDEX1]], [[FIRST_ACTIVE_LANE]]
+; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = add i64 3, [[TMP10]]
; CHECK-NEXT: br label [[LOOP_END]]
; CHECK: scalar.ph:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 67, [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ]
@@ -86,6 +85,427 @@ loop.end:
}
+define i32 @same_exit_block_pre_inc_use1_iv64_endi32_step2() {
+; CHECK-LABEL: define i32 @same_exit_block_pre_inc_use1_iv64_endi32_step2() {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[P1:%.*]] = alloca [1024 x i8], align 1
+; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 1
+; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024)
+; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024)
+; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT3:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX1]]
+; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[TMP0]]
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP2]], align 1
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[TMP0]]
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP4]], align 1
+; CHECK-NEXT: [[TMP5:%.*]] ...
[truncated]
// Calculates the first active lane index of the vector predicate operand.
FirstActiveLane,
Do we need both ExtractFirstActive and FirstActiveLane, or can we use FirstActiveLane with a regular extract VPInstruction?
Yep, I can do that. Split off into #131118.
Refactor the code to extract the first active element of a vector in the early exit block, in preparation for PR llvm#130766. I've replaced the VPInstruction::ExtractFirstActive nodes with a combination of a new VPInstruction::FirstActiveLane node and an Instruction::ExtractElement node.
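Either way the emitted IR has the same shape; a minimal sketch of that cttz.elts-plus-extractelement pair, with invented names:

```llvm
declare i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1>, i1)

; The old extract-first-active node and the new first-active-lane +
; extractelement pair both lower to this pattern of IR.
define i8 @extract_first_active(<4 x i1> %mask, <4 x i8> %vec) {
  %lane = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> %mask, i1 true)
  %val = extractelement <4 x i8> %vec, i64 %lane
  ret i8 %val
}
```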
Gentle ping. :)
@@ -86,6 +85,427 @@ loop.end:
}

define i32 @same_exit_block_pre_inc_use1_iv64_endi32_step2() {
Do we have test coverage for FP/pointer inductions to make sure those cases also work as expected?
Yeah, I've tried to do this with @same_exit_block_pre_inc_use1_iv64_endf32 and @same_exit_block_pre_inc_use1_iv64_endptr. Are you thinking of a different test that I've missed here?
That looks fine, thanks
LGTM, thanks
This is the second of two PRs that attempts to improve the IR generated in the exit blocks of vectorised loops with uncountable early exits. It follows on from PR llvm#128880. In this PR I am improving the generated code for users of induction variables in early exit blocks. This requires using a newly added VPInstruction called FirstActiveLane, which calculates the index of the first active predicate in the mask operand. I have added a new function optimizeEarlyExitInductionUser that is called from optimizeInductionExitUsers when handling users in early exit blocks.
* Regenerated some new tests after rebase.
* Fixed an issue where we were assuming that FirstActiveLane always returned a 64-bit value.
Rebased. Ran make check-all locally and all tests passed, except two clang tests that are unrelated to this PR.
Until now the feature to enable vectorisation of some early exit loops with uncountable exits was controlled under a flag, off by default. Now that we have efficient code generation for vectorising such loops (see PR llvm#130766) and we are still far enough away from the next LLVM release it seems like a good time to enable the feature by default. If any issues arise post-commit it can be easily reverted. Using this patch I built and ran the LLVM test suite successfully, which on neoverse-v1 led to the vectorisation of 114 additional early exit loops. I then performed a bootstrap build of clang, which built cleanly with 64 extra early exit loops vectorising. I also built and ran SPEC2017 successfully with no change in performance for both neoverse-v1 and neoverse-v2.
This is the second of two PRs that attempts to improve the IR
generated in the exit blocks of vectorised loops with uncountable
early exits. It follows on from PR #128880. In this PR I am
improving the generated code for users of induction variables in
early exit blocks.
This required using a newly added VPInstruction called
FirstActiveLane, which calculates the index of the first active
predicate in the mask operand.
I have added a new function optimizeEarlyExitInductionUser that
is called from optimizeInductionExitUsers when handling users in
early exit blocks.
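For context, the loops this targets look roughly like the same_exit_block_pre_inc_use1 tests above: a byte-compare loop whose induction variable is live out of the uncountable early exit. A hand-written sketch (not taken from the test file):

```llvm
define i64 @find_mismatch(ptr %p1, ptr %p2) {
entry:
  br label %loop

loop:
  %iv = phi i64 [ 3, %entry ], [ %iv.next, %loop.inc ]
  %gep1 = getelementptr inbounds i8, ptr %p1, i64 %iv
  %ld1 = load i8, ptr %gep1, align 1
  %gep2 = getelementptr inbounds i8, ptr %p2, i64 %iv
  %ld2 = load i8, ptr %gep2, align 1
  %cmp = icmp eq i8 %ld1, %ld2
  br i1 %cmp, label %loop.inc, label %loop.end    ; uncountable early exit

loop.inc:
  %iv.next = add i64 %iv, 1
  %exitcond = icmp eq i64 %iv.next, 67
  br i1 %exitcond, label %loop.end, label %loop

loop.end:
  %retval = phi i64 [ %iv, %loop ], [ 67, %loop.inc ]   ; IV user in the exit block
  ret i64 %retval
}
```

Previously the vectorised version kept a widened IV vector alive just so the exiting %iv could be extracted; with this patch it is recomputed from the scalar index instead, as the test updates above show.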