Skip to content

Commit efa340f

Browse files
committed
[ARM] Workaround tailpredication min/max costmodel
The min/max intrinsics are not yet canonical, but when they are, the tail-predication analysis will change from treating them like icmp to treating them like intrinsics. Unfortunately, they can currently produce better code by not being tail-predicated, thanks to the vectorizer picking higher VFs and the backend folding to better instructions (especially for saturate patterns). In the long run we will need to improve the vectorizer's cost modelling so that it recognizes the instruction directly, but in the meantime this treats min/max as before to prevent performance regressions.
1 parent 83df940 commit efa340f

File tree

2 files changed

+241
-0
lines changed

2 files changed

+241
-0
lines changed

llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1960,6 +1960,20 @@ static bool canTailPredicateInstruction(Instruction &I, int &ICmpCount) {
19601960
// we simply count the icmps, i.e. there should only be 1 for the backedge.
19611961
if (isa<ICmpInst>(&I) && ++ICmpCount > 1)
19621962
return false;
1963+
// FIXME: This is a workaround for poor cost modelling. Min/Max intrinsics are
1964+
// not currently canonical, but soon will be. Code without them uses icmp, and
1965+
// so is not tail predicated as per the condition above. In order to get the
1966+
// same performance we treat min and max the same as an icmp for tailpred
1967+
// purposes for the moment (we often rely on non-tailpred and higher VFs to
1968+
// pick more optimal instructions like VQDMULH. They need to be recognized
1969+
// directly by the vectorizer).
1970+
if (auto *II = dyn_cast<IntrinsicInst>(&I))
1971+
if ((II->getIntrinsicID() == Intrinsic::smin ||
1972+
II->getIntrinsicID() == Intrinsic::smax ||
1973+
II->getIntrinsicID() == Intrinsic::umin ||
1974+
II->getIntrinsicID() == Intrinsic::umax) &&
1975+
++ICmpCount > 1)
1976+
return false;
19631977

19641978
if (isa<FCmpInst>(&I))
19651979
return false;
Lines changed: 227 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,227 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
2+
; RUN: opt < %s -passes='default<O3>' -S | FileCheck %s
3+
4+
; This test, after a lot of cleanup, should pick a tail-predicated 8x
5+
; vector loop. The 8x is more profitable as it allows picking a VQDMULH.s16.
6+
; FIXME: Tail-predicate too, but not at the expense of 8x vectorization.
7+
8+
target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
9+
target triple = "thumbv8.1m.main-arm-none-eabi"
10+
11+
define void @arm_mult_q15(i16* %pSrcA, i16* %pSrcB, i16 * noalias %pDst, i32 %blockSize) #0 {
12+
; CHECK-LABEL: @arm_mult_q15(
13+
; CHECK-NEXT: entry:
14+
; CHECK-NEXT: [[CMP_NOT3:%.*]] = icmp eq i32 [[BLOCKSIZE:%.*]], 0
15+
; CHECK-NEXT: br i1 [[CMP_NOT3]], label [[WHILE_END:%.*]], label [[WHILE_BODY_PREHEADER:%.*]]
16+
; CHECK: while.body.preheader:
17+
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[BLOCKSIZE]], 8
18+
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[WHILE_BODY_PREHEADER17:%.*]], label [[VECTOR_PH:%.*]]
19+
; CHECK: vector.ph:
20+
; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[BLOCKSIZE]], -8
21+
; CHECK-NEXT: [[IND_END:%.*]] = and i32 [[BLOCKSIZE]], 7
22+
; CHECK-NEXT: [[IND_END9:%.*]] = getelementptr i16, i16* [[PSRCA:%.*]], i32 [[N_VEC]]
23+
; CHECK-NEXT: [[IND_END11:%.*]] = getelementptr i16, i16* [[PDST:%.*]], i32 [[N_VEC]]
24+
; CHECK-NEXT: [[IND_END13:%.*]] = getelementptr i16, i16* [[PSRCB:%.*]], i32 [[N_VEC]]
25+
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
26+
; CHECK: vector.body:
27+
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
28+
; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i16, i16* [[PSRCA]], i32 [[INDEX]]
29+
; CHECK-NEXT: [[NEXT_GEP14:%.*]] = getelementptr i16, i16* [[PDST]], i32 [[INDEX]]
30+
; CHECK-NEXT: [[NEXT_GEP15:%.*]] = getelementptr i16, i16* [[PSRCB]], i32 [[INDEX]]
31+
; CHECK-NEXT: [[TMP0:%.*]] = bitcast i16* [[NEXT_GEP]] to <8 x i16>*
32+
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i16>, <8 x i16>* [[TMP0]], align 2
33+
; CHECK-NEXT: [[TMP1:%.*]] = sext <8 x i16> [[WIDE_LOAD]] to <8 x i32>
34+
; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16* [[NEXT_GEP15]] to <8 x i16>*
35+
; CHECK-NEXT: [[WIDE_LOAD16:%.*]] = load <8 x i16>, <8 x i16>* [[TMP2]], align 2
36+
; CHECK-NEXT: [[TMP3:%.*]] = sext <8 x i16> [[WIDE_LOAD16]] to <8 x i32>
37+
; CHECK-NEXT: [[TMP4:%.*]] = mul nsw <8 x i32> [[TMP3]], [[TMP1]]
38+
; CHECK-NEXT: [[TMP5:%.*]] = ashr <8 x i32> [[TMP4]], <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
39+
; CHECK-NEXT: [[TMP6:%.*]] = icmp slt <8 x i32> [[TMP5]], <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
40+
; CHECK-NEXT: [[TMP7:%.*]] = select <8 x i1> [[TMP6]], <8 x i32> [[TMP5]], <8 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
41+
; CHECK-NEXT: [[TMP8:%.*]] = trunc <8 x i32> [[TMP7]] to <8 x i16>
42+
; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16* [[NEXT_GEP14]] to <8 x i16>*
43+
; CHECK-NEXT: store <8 x i16> [[TMP8]], <8 x i16>* [[TMP9]], align 2
44+
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
45+
; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
46+
; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
47+
; CHECK: middle.block:
48+
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N_VEC]], [[BLOCKSIZE]]
49+
; CHECK-NEXT: br i1 [[CMP_N]], label [[WHILE_END]], label [[WHILE_BODY_PREHEADER17]]
50+
; CHECK: while.body.preheader17:
51+
; CHECK-NEXT: [[BLKCNT_07_PH:%.*]] = phi i32 [ [[BLOCKSIZE]], [[WHILE_BODY_PREHEADER]] ], [ [[IND_END]], [[MIDDLE_BLOCK]] ]
52+
; CHECK-NEXT: [[PSRCA_ADDR_06_PH:%.*]] = phi i16* [ [[PSRCA]], [[WHILE_BODY_PREHEADER]] ], [ [[IND_END9]], [[MIDDLE_BLOCK]] ]
53+
; CHECK-NEXT: [[PDST_ADDR_05_PH:%.*]] = phi i16* [ [[PDST]], [[WHILE_BODY_PREHEADER]] ], [ [[IND_END11]], [[MIDDLE_BLOCK]] ]
54+
; CHECK-NEXT: [[PSRCB_ADDR_04_PH:%.*]] = phi i16* [ [[PSRCB]], [[WHILE_BODY_PREHEADER]] ], [ [[IND_END13]], [[MIDDLE_BLOCK]] ]
55+
; CHECK-NEXT: br label [[WHILE_BODY:%.*]]
56+
; CHECK: while.body:
57+
; CHECK-NEXT: [[BLKCNT_07:%.*]] = phi i32 [ [[DEC:%.*]], [[WHILE_BODY]] ], [ [[BLKCNT_07_PH]], [[WHILE_BODY_PREHEADER17]] ]
58+
; CHECK-NEXT: [[PSRCA_ADDR_06:%.*]] = phi i16* [ [[INCDEC_PTR:%.*]], [[WHILE_BODY]] ], [ [[PSRCA_ADDR_06_PH]], [[WHILE_BODY_PREHEADER17]] ]
59+
; CHECK-NEXT: [[PDST_ADDR_05:%.*]] = phi i16* [ [[INCDEC_PTR4:%.*]], [[WHILE_BODY]] ], [ [[PDST_ADDR_05_PH]], [[WHILE_BODY_PREHEADER17]] ]
60+
; CHECK-NEXT: [[PSRCB_ADDR_04:%.*]] = phi i16* [ [[INCDEC_PTR1:%.*]], [[WHILE_BODY]] ], [ [[PSRCB_ADDR_04_PH]], [[WHILE_BODY_PREHEADER17]] ]
61+
; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i16, i16* [[PSRCA_ADDR_06]], i32 1
62+
; CHECK-NEXT: [[TMP11:%.*]] = load i16, i16* [[PSRCA_ADDR_06]], align 2
63+
; CHECK-NEXT: [[CONV:%.*]] = sext i16 [[TMP11]] to i32
64+
; CHECK-NEXT: [[INCDEC_PTR1]] = getelementptr inbounds i16, i16* [[PSRCB_ADDR_04]], i32 1
65+
; CHECK-NEXT: [[TMP12:%.*]] = load i16, i16* [[PSRCB_ADDR_04]], align 2
66+
; CHECK-NEXT: [[CONV2:%.*]] = sext i16 [[TMP12]] to i32
67+
; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[CONV2]], [[CONV]]
68+
; CHECK-NEXT: [[SHR:%.*]] = ashr i32 [[MUL]], 15
69+
; CHECK-NEXT: [[TMP13:%.*]] = icmp slt i32 [[SHR]], 32767
70+
; CHECK-NEXT: [[RETVAL_1_I:%.*]] = select i1 [[TMP13]], i32 [[SHR]], i32 32767
71+
; CHECK-NEXT: [[CONV3:%.*]] = trunc i32 [[RETVAL_1_I]] to i16
72+
; CHECK-NEXT: [[INCDEC_PTR4]] = getelementptr inbounds i16, i16* [[PDST_ADDR_05]], i32 1
73+
; CHECK-NEXT: store i16 [[CONV3]], i16* [[PDST_ADDR_05]], align 2
74+
; CHECK-NEXT: [[DEC]] = add i32 [[BLKCNT_07]], -1
75+
; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[DEC]], 0
76+
; CHECK-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END]], label [[WHILE_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
77+
; CHECK: while.end:
78+
; CHECK-NEXT: ret void
79+
;
80+
entry:
81+
%pSrcA.addr = alloca i16*, align 4
82+
%pSrcB.addr = alloca i16*, align 4
83+
%pDst.addr = alloca i16*, align 4
84+
%blockSize.addr = alloca i32, align 4
85+
%blkCnt = alloca i32, align 4
86+
store i16* %pSrcA, i16** %pSrcA.addr, align 4
87+
store i16* %pSrcB, i16** %pSrcB.addr, align 4
88+
store i16* %pDst, i16** %pDst.addr, align 4
89+
store i32 %blockSize, i32* %blockSize.addr, align 4
90+
%0 = bitcast i32* %blkCnt to i8*
91+
call void @llvm.lifetime.start.p0i8(i64 4, i8* %0) #3
92+
%1 = load i32, i32* %blockSize.addr, align 4
93+
store i32 %1, i32* %blkCnt, align 4
94+
br label %while.cond
95+
96+
while.cond: ; preds = %while.body, %entry
97+
%2 = load i32, i32* %blkCnt, align 4
98+
%cmp = icmp ugt i32 %2, 0
99+
br i1 %cmp, label %while.body, label %while.end
100+
101+
while.body: ; preds = %while.cond
102+
%3 = load i16*, i16** %pSrcA.addr, align 4
103+
%incdec.ptr = getelementptr inbounds i16, i16* %3, i32 1
104+
store i16* %incdec.ptr, i16** %pSrcA.addr, align 4
105+
%4 = load i16, i16* %3, align 2
106+
%conv = sext i16 %4 to i32
107+
%5 = load i16*, i16** %pSrcB.addr, align 4
108+
%incdec.ptr1 = getelementptr inbounds i16, i16* %5, i32 1
109+
store i16* %incdec.ptr1, i16** %pSrcB.addr, align 4
110+
%6 = load i16, i16* %5, align 2
111+
%conv2 = sext i16 %6 to i32
112+
%mul = mul nsw i32 %conv, %conv2
113+
%shr = ashr i32 %mul, 15
114+
%call = call i32 @__SSAT(i32 %shr, i32 16)
115+
%conv3 = trunc i32 %call to i16
116+
%7 = load i16*, i16** %pDst.addr, align 4
117+
%incdec.ptr4 = getelementptr inbounds i16, i16* %7, i32 1
118+
store i16* %incdec.ptr4, i16** %pDst.addr, align 4
119+
store i16 %conv3, i16* %7, align 2
120+
%8 = load i32, i32* %blkCnt, align 4
121+
%dec = add i32 %8, -1
122+
store i32 %dec, i32* %blkCnt, align 4
123+
br label %while.cond
124+
125+
while.end: ; preds = %while.cond
126+
%9 = bitcast i32* %blkCnt to i8*
127+
call void @llvm.lifetime.end.p0i8(i64 4, i8* %9) #3
128+
ret void
129+
}
130+
131+
declare void @llvm.lifetime.start.p0i8(i64 immarg, i8* nocapture) #1
132+
133+
define internal i32 @__SSAT(i32 %val, i32 %sat) #2 {
134+
entry:
135+
%retval = alloca i32, align 4
136+
%val.addr = alloca i32, align 4
137+
%sat.addr = alloca i32, align 4
138+
%max = alloca i32, align 4
139+
%min = alloca i32, align 4
140+
%cleanup.dest.slot = alloca i32, align 4
141+
store i32 %val, i32* %val.addr, align 4
142+
store i32 %sat, i32* %sat.addr, align 4
143+
%0 = load i32, i32* %sat.addr, align 4
144+
%cmp = icmp uge i32 %0, 1
145+
br i1 %cmp, label %land.lhs.true, label %if.end10
146+
147+
land.lhs.true: ; preds = %entry
148+
%1 = load i32, i32* %sat.addr, align 4
149+
%cmp1 = icmp ule i32 %1, 32
150+
br i1 %cmp1, label %if.then, label %if.end10
151+
152+
if.then: ; preds = %land.lhs.true
153+
%2 = bitcast i32* %max to i8*
154+
call void @llvm.lifetime.start.p0i8(i64 4, i8* %2) #3
155+
%3 = load i32, i32* %sat.addr, align 4
156+
%sub = sub i32 %3, 1
157+
%shl = shl i32 1, %sub
158+
%sub2 = sub i32 %shl, 1
159+
store i32 %sub2, i32* %max, align 4
160+
%4 = bitcast i32* %min to i8*
161+
call void @llvm.lifetime.start.p0i8(i64 4, i8* %4) #3
162+
%5 = load i32, i32* %max, align 4
163+
%sub3 = sub nsw i32 -1, %5
164+
store i32 %sub3, i32* %min, align 4
165+
%6 = load i32, i32* %val.addr, align 4
166+
%7 = load i32, i32* %max, align 4
167+
%cmp4 = icmp sgt i32 %6, %7
168+
br i1 %cmp4, label %if.then5, label %if.else
169+
170+
if.then5: ; preds = %if.then
171+
%8 = load i32, i32* %max, align 4
172+
store i32 %8, i32* %retval, align 4
173+
store i32 1, i32* %cleanup.dest.slot, align 4
174+
br label %cleanup
175+
176+
if.else: ; preds = %if.then
177+
%9 = load i32, i32* %val.addr, align 4
178+
%10 = load i32, i32* %min, align 4
179+
%cmp6 = icmp slt i32 %9, %10
180+
br i1 %cmp6, label %if.then7, label %if.end
181+
182+
if.then7: ; preds = %if.else
183+
%11 = load i32, i32* %min, align 4
184+
store i32 %11, i32* %retval, align 4
185+
store i32 1, i32* %cleanup.dest.slot, align 4
186+
br label %cleanup
187+
188+
if.end: ; preds = %if.else
189+
br label %if.end8
190+
191+
if.end8: ; preds = %if.end
192+
store i32 0, i32* %cleanup.dest.slot, align 4
193+
br label %cleanup
194+
195+
cleanup: ; preds = %if.end8, %if.then7, %if.then5
196+
%12 = bitcast i32* %min to i8*
197+
call void @llvm.lifetime.end.p0i8(i64 4, i8* %12) #3
198+
%13 = bitcast i32* %max to i8*
199+
call void @llvm.lifetime.end.p0i8(i64 4, i8* %13) #3
200+
%cleanup.dest = load i32, i32* %cleanup.dest.slot, align 4
201+
switch i32 %cleanup.dest, label %unreachable [
202+
i32 0, label %cleanup.cont
203+
i32 1, label %return
204+
]
205+
206+
cleanup.cont: ; preds = %cleanup
207+
br label %if.end10
208+
209+
if.end10: ; preds = %cleanup.cont, %land.lhs.true, %entry
210+
%14 = load i32, i32* %val.addr, align 4
211+
store i32 %14, i32* %retval, align 4
212+
br label %return
213+
214+
return: ; preds = %if.end10, %cleanup
215+
%15 = load i32, i32* %retval, align 4
216+
ret i32 %15
217+
218+
unreachable: ; preds = %cleanup
219+
unreachable
220+
}
221+
222+
declare void @llvm.lifetime.end.p0i8(i64 immarg, i8* nocapture) #1
223+
224+
attributes #0 = { nounwind "frame-pointer"="all" "min-legal-vector-width"="0" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="cortex-m55" "target-features"="+armv8.1-m.main,+dsp,+fp-armv8d16,+fp-armv8d16sp,+fp16,+fp64,+fullfp16,+hwdiv,+lob,+mve,+mve.fp,+ras,+strict-align,+thumb-mode,+vfp2,+vfp2sp,+vfp3d16,+vfp3d16sp,+vfp4d16,+vfp4d16sp,-aes,-bf16,-cdecp0,-cdecp1,-cdecp2,-cdecp3,-cdecp4,-cdecp5,-cdecp6,-cdecp7,-crc,-crypto,-d32,-dotprod,-fp-armv8,-fp-armv8sp,-fp16fml,-hwdiv-arm,-i8mm,-neon,-sb,-sha2,-vfp3,-vfp3sp,-vfp4,-vfp4sp" "unsafe-fp-math"="true" }
225+
attributes #1 = { argmemonly nofree nosync nounwind willreturn }
226+
attributes #2 = { alwaysinline nounwind "frame-pointer"="all" "min-legal-vector-width"="0" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="cortex-m55" "target-features"="+armv8.1-m.main,+dsp,+fp-armv8d16,+fp-armv8d16sp,+fp16,+fp64,+fullfp16,+hwdiv,+lob,+mve,+mve.fp,+ras,+strict-align,+thumb-mode,+vfp2,+vfp2sp,+vfp3d16,+vfp3d16sp,+vfp4d16,+vfp4d16sp,-aes,-bf16,-cdecp0,-cdecp1,-cdecp2,-cdecp3,-cdecp4,-cdecp5,-cdecp6,-cdecp7,-crc,-crypto,-d32,-dotprod,-fp-armv8,-fp-armv8sp,-fp16fml,-hwdiv-arm,-i8mm,-neon,-sb,-sha2,-vfp3,-vfp3sp,-vfp4,-vfp4sp" "unsafe-fp-math"="true" }
227+
attributes #3 = { nounwind }

0 commit comments

Comments
 (0)