Skip to content

Commit f52181a

Browse files
harrisonGPUfschlimb
authored and committed
[AMDGPU] Support D16 folding for image.sample with multiple extractelement and fptrunc users (llvm#141758)
Currently, D16 folding is only supported for `image sample` instructions with a single user: an `fptrunc` to half. However, D16 folding can also be supported for image.sample instructions with multiple users, as long as each user follows the pattern of an extractelement followed by an fptrunc to half. For example:

```
%sample = call <4 x float> @llvm.amdgcn.image.sample
%e0 = extractelement <4 x float> %sample, i32 0
%h0 = fptrunc float %e0 to half
%e1 = extractelement <4 x float> %sample, i32 1
%h1 = fptrunc float %e1 to half
%e2 = extractelement <4 x float> %sample, i32 2
%h2 = fptrunc float %e2 to half
```

This change enables D16 folding for such cases and avoids generating `v_cvt_f16_f32_e32` instructions.
1 parent 3810ce6 commit f52181a

File tree

2 files changed

+178
-0
lines changed

2 files changed

+178
-0
lines changed

llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -248,6 +248,66 @@ simplifyAMDGCNImageIntrinsic(const GCNSubtarget *ST,
248248
});
249249
}
250250
}
251+
252+
// Only perform D16 folding if every user of the image sample is
253+
// an ExtractElementInst immediately followed by an FPTrunc to half.
254+
SmallVector<std::pair<ExtractElementInst *, FPTruncInst *>, 4>
255+
ExtractTruncPairs;
256+
bool AllHalfExtracts = true;
257+
258+
for (User *U : II.users()) {
259+
auto *Ext = dyn_cast<ExtractElementInst>(U);
260+
if (!Ext || !Ext->hasOneUse()) {
261+
AllHalfExtracts = false;
262+
break;
263+
}
264+
265+
auto *Tr = dyn_cast<FPTruncInst>(*Ext->user_begin());
266+
if (!Tr || !Tr->getType()->isHalfTy()) {
267+
AllHalfExtracts = false;
268+
break;
269+
}
270+
271+
ExtractTruncPairs.emplace_back(Ext, Tr);
272+
}
273+
274+
if (!ExtractTruncPairs.empty() && AllHalfExtracts) {
275+
auto *VecTy = cast<VectorType>(II.getType());
276+
Type *HalfVecTy =
277+
VecTy->getWithNewType(Type::getHalfTy(II.getContext()));
278+
279+
// Obtain the original image sample intrinsic's signature
280+
// and replace its return type with the half-vector for D16 folding
281+
SmallVector<Type *, 8> SigTys;
282+
Intrinsic::getIntrinsicSignature(II.getCalledFunction(), SigTys);
283+
SigTys[0] = HalfVecTy;
284+
285+
Module *M = II.getModule();
286+
Function *HalfDecl =
287+
Intrinsic::getOrInsertDeclaration(M, ImageDimIntr->Intr, SigTys);
288+
289+
II.mutateType(HalfVecTy);
290+
II.setCalledFunction(HalfDecl);
291+
292+
IRBuilder<> Builder(II.getContext());
293+
for (auto &[Ext, Tr] : ExtractTruncPairs) {
294+
Value *Idx = Ext->getIndexOperand();
295+
296+
Builder.SetInsertPoint(Tr);
297+
298+
Value *HalfExtract = Builder.CreateExtractElement(&II, Idx);
299+
HalfExtract->takeName(Tr);
300+
301+
Tr->replaceAllUsesWith(HalfExtract);
302+
}
303+
304+
for (auto &[Ext, Tr] : ExtractTruncPairs) {
305+
IC.eraseInstFromFunction(*Tr);
306+
IC.eraseInstFromFunction(*Ext);
307+
}
308+
309+
return &II;
310+
}
251311
}
252312
}
253313

llvm/test/Transforms/InstCombine/AMDGPU/image-d16.ll

Lines changed: 118 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
; RUN: opt -mtriple=amdgcn--amdpal -mcpu=gfx810 -S -passes=instcombine %s | FileCheck --check-prefixes=GFX81PLUS %s
44
; RUN: opt -mtriple=amdgcn--amdpal -mcpu=gfx900 -S -passes=instcombine %s | FileCheck --check-prefixes=GFX81PLUS %s
55
; RUN: opt -mtriple=amdgcn--amdpal -mcpu=gfx1010 -S -passes=instcombine %s | FileCheck --check-prefixes=GFX81PLUS %s
6+
; RUN: opt -mtriple=amdgcn--amdpal -mcpu=gfx1100 -S -passes=instcombine %s | FileCheck --check-prefixes=GFX81PLUS %s
67

78
define amdgpu_ps half @image_sample_2d_fptrunc_to_d16(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t) {
89
; GFX7-LABEL: @image_sample_2d_fptrunc_to_d16(
@@ -121,6 +122,123 @@ main_body:
121122
ret half %addf_sum.2
122123
}
123124

125+
define amdgpu_ps half @image_sample_2d_multi_fptrunc_to_d16(<8 x i32> %surf_desc, <4 x i32> %samp, float %u, float %v) {
126+
; GFX7-LABEL: @image_sample_2d_multi_fptrunc_to_d16(
127+
; GFX7-NEXT: main_body:
128+
; GFX7-NEXT: [[SAMPLE:%.*]] = call <3 x float> @llvm.amdgcn.image.sample.lz.2d.v3f32.f32.v8i32.v4i32(i32 7, float [[U:%.*]], float [[V:%.*]], <8 x i32> [[SURF_DESC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
129+
; GFX7-NEXT: [[E0:%.*]] = extractelement <3 x float> [[SAMPLE]], i64 0
130+
; GFX7-NEXT: [[H0:%.*]] = fptrunc float [[E0]] to half
131+
; GFX7-NEXT: [[E1:%.*]] = extractelement <3 x float> [[SAMPLE]], i64 1
132+
; GFX7-NEXT: [[H1:%.*]] = fptrunc float [[E1]] to half
133+
; GFX7-NEXT: [[E2:%.*]] = extractelement <3 x float> [[SAMPLE]], i64 2
134+
; GFX7-NEXT: [[H2:%.*]] = fptrunc float [[E2]] to half
135+
; GFX7-NEXT: [[MUL:%.*]] = fmul half [[H0]], [[H1]]
136+
; GFX7-NEXT: [[RES:%.*]] = fadd half [[MUL]], [[H2]]
137+
; GFX7-NEXT: ret half [[RES]]
138+
;
139+
; GFX81PLUS-LABEL: @image_sample_2d_multi_fptrunc_to_d16(
140+
; GFX81PLUS-NEXT: main_body:
141+
; GFX81PLUS-NEXT: [[SAMPLE:%.*]] = call <3 x half> @llvm.amdgcn.image.sample.lz.2d.v3f16.f32.v8i32.v4i32(i32 7, float [[U:%.*]], float [[V:%.*]], <8 x i32> [[SURF_DESC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
142+
; GFX81PLUS-NEXT: [[H0:%.*]] = extractelement <3 x half> [[SAMPLE]], i64 0
143+
; GFX81PLUS-NEXT: [[H1:%.*]] = extractelement <3 x half> [[SAMPLE]], i64 1
144+
; GFX81PLUS-NEXT: [[H2:%.*]] = extractelement <3 x half> [[SAMPLE]], i64 2
145+
; GFX81PLUS-NEXT: [[MUL:%.*]] = fmul half [[H0]], [[H1]]
146+
; GFX81PLUS-NEXT: [[RES:%.*]] = fadd half [[MUL]], [[H2]]
147+
; GFX81PLUS-NEXT: ret half [[RES]]
148+
;
149+
main_body:
150+
%sample = call <4 x float> @llvm.amdgcn.image.sample.lz.2d.v4f32.f32.v8i32.v4i32(i32 15, float %u, float %v, <8 x i32> %surf_desc, <4 x i32> %samp, i1 false, i32 0, i32 0)
151+
%e0 = extractelement <4 x float> %sample, i32 0
152+
%h0 = fptrunc float %e0 to half
153+
%e1 = extractelement <4 x float> %sample, i32 1
154+
%h1 = fptrunc float %e1 to half
155+
%e2 = extractelement <4 x float> %sample, i32 2
156+
%h2 = fptrunc float %e2 to half
157+
%mul = fmul half %h0, %h1
158+
%res = fadd half %mul, %h2
159+
ret half %res
160+
}
161+
162+
define amdgpu_ps half @image_sample_2d_extractelement_multi_use_no_d16(<8 x i32> %surf_desc, <4 x i32> %sampler_desc, float %u, float %v) {
163+
; GFX7-LABEL: @image_sample_2d_extractelement_multi_use_no_d16(
164+
; GFX7-NEXT: main_body:
165+
; GFX7-NEXT: [[SAMPLE:%.*]] = call <2 x float> @llvm.amdgcn.image.sample.lz.2d.v2f32.f32.v8i32.v4i32(i32 3, float [[U:%.*]], float [[V:%.*]], <8 x i32> [[SURF_DESC:%.*]], <4 x i32> [[SAMPLER_DESC:%.*]], i1 false, i32 0, i32 0)
166+
; GFX7-NEXT: [[E0:%.*]] = extractelement <2 x float> [[SAMPLE]], i64 0
167+
; GFX7-NEXT: [[H0:%.*]] = fptrunc float [[E0]] to half
168+
; GFX7-NEXT: [[USER2:%.*]] = fadd float [[E0]], 1.000000e+00
169+
; GFX7-NEXT: [[HALF:%.*]] = fptrunc float [[USER2]] to half
170+
; GFX7-NEXT: [[E1:%.*]] = extractelement <2 x float> [[SAMPLE]], i64 1
171+
; GFX7-NEXT: [[H1:%.*]] = fptrunc float [[E1]] to half
172+
; GFX7-NEXT: [[MUL:%.*]] = fmul half [[H0]], [[H1]]
173+
; GFX7-NEXT: [[RES:%.*]] = fadd half [[MUL]], [[HALF]]
174+
; GFX7-NEXT: ret half [[RES]]
175+
;
176+
; GFX81PLUS-LABEL: @image_sample_2d_extractelement_multi_use_no_d16(
177+
; GFX81PLUS-NEXT: main_body:
178+
; GFX81PLUS-NEXT: [[SAMPLE:%.*]] = call <2 x float> @llvm.amdgcn.image.sample.lz.2d.v2f32.f32.v8i32.v4i32(i32 3, float [[U:%.*]], float [[V:%.*]], <8 x i32> [[SURF_DESC:%.*]], <4 x i32> [[SAMPLER_DESC:%.*]], i1 false, i32 0, i32 0)
179+
; GFX81PLUS-NEXT: [[E0:%.*]] = extractelement <2 x float> [[SAMPLE]], i64 0
180+
; GFX81PLUS-NEXT: [[H0:%.*]] = fptrunc float [[E0]] to half
181+
; GFX81PLUS-NEXT: [[USER2:%.*]] = fadd float [[E0]], 1.000000e+00
182+
; GFX81PLUS-NEXT: [[HALF:%.*]] = fptrunc float [[USER2]] to half
183+
; GFX81PLUS-NEXT: [[E1:%.*]] = extractelement <2 x float> [[SAMPLE]], i64 1
184+
; GFX81PLUS-NEXT: [[H1:%.*]] = fptrunc float [[E1]] to half
185+
; GFX81PLUS-NEXT: [[MUL:%.*]] = fmul half [[H0]], [[H1]]
186+
; GFX81PLUS-NEXT: [[RES:%.*]] = fadd half [[MUL]], [[HALF]]
187+
; GFX81PLUS-NEXT: ret half [[RES]]
188+
;
189+
main_body:
190+
%sample = call <4 x float> @llvm.amdgcn.image.sample.lz.2d.v4f32.f32.v8i32.v4i32(i32 15, float %u, float %v, <8 x i32> %surf_desc, <4 x i32> %sampler_desc, i1 false, i32 0, i32 0)
191+
%e0 = extractelement <4 x float> %sample, i32 0
192+
%h0 = fptrunc float %e0 to half
193+
%user2 = fadd float %e0, 1.0
194+
%half = fptrunc float %user2 to half
195+
%e1 = extractelement <4 x float> %sample, i32 1
196+
%h1 = fptrunc float %e1 to half
197+
%mul = fmul half %h0, %h1
198+
%res = fadd half %mul, %half
199+
ret half %res
200+
}
201+
202+
define amdgpu_ps bfloat @image_sample_2d_multi_fptrunc_non_half_no_d16(<8 x i32> %surf_desc, <4 x i32> %sampler_desc, float %u, float %v) {
203+
; GFX7-LABEL: @image_sample_2d_multi_fptrunc_non_half_no_d16(
204+
; GFX7-NEXT: main_body:
205+
; GFX7-NEXT: [[SAMPLE:%.*]] = call <3 x float> @llvm.amdgcn.image.sample.lz.2d.v3f32.f32.v8i32.v4i32(i32 7, float [[U:%.*]], float [[V:%.*]], <8 x i32> [[SURF_DESC:%.*]], <4 x i32> [[SAMPLER_DESC:%.*]], i1 false, i32 0, i32 0)
206+
; GFX7-NEXT: [[E0:%.*]] = extractelement <3 x float> [[SAMPLE]], i64 0
207+
; GFX7-NEXT: [[H0:%.*]] = fptrunc float [[E0]] to bfloat
208+
; GFX7-NEXT: [[E1:%.*]] = extractelement <3 x float> [[SAMPLE]], i64 1
209+
; GFX7-NEXT: [[H1:%.*]] = fptrunc float [[E1]] to bfloat
210+
; GFX7-NEXT: [[E2:%.*]] = extractelement <3 x float> [[SAMPLE]], i64 2
211+
; GFX7-NEXT: [[H2:%.*]] = fptrunc float [[E2]] to bfloat
212+
; GFX7-NEXT: [[MUL:%.*]] = fmul bfloat [[H0]], [[H1]]
213+
; GFX7-NEXT: [[RES:%.*]] = fadd bfloat [[MUL]], [[H2]]
214+
; GFX7-NEXT: ret bfloat [[RES]]
215+
;
216+
; GFX81PLUS-LABEL: @image_sample_2d_multi_fptrunc_non_half_no_d16(
217+
; GFX81PLUS-NEXT: main_body:
218+
; GFX81PLUS-NEXT: [[SAMPLE:%.*]] = call <3 x float> @llvm.amdgcn.image.sample.lz.2d.v3f32.f32.v8i32.v4i32(i32 7, float [[U:%.*]], float [[V:%.*]], <8 x i32> [[SURF_DESC:%.*]], <4 x i32> [[SAMPLER_DESC:%.*]], i1 false, i32 0, i32 0)
219+
; GFX81PLUS-NEXT: [[E0:%.*]] = extractelement <3 x float> [[SAMPLE]], i64 0
220+
; GFX81PLUS-NEXT: [[H0:%.*]] = fptrunc float [[E0]] to bfloat
221+
; GFX81PLUS-NEXT: [[E1:%.*]] = extractelement <3 x float> [[SAMPLE]], i64 1
222+
; GFX81PLUS-NEXT: [[H1:%.*]] = fptrunc float [[E1]] to bfloat
223+
; GFX81PLUS-NEXT: [[E2:%.*]] = extractelement <3 x float> [[SAMPLE]], i64 2
224+
; GFX81PLUS-NEXT: [[H2:%.*]] = fptrunc float [[E2]] to bfloat
225+
; GFX81PLUS-NEXT: [[MUL:%.*]] = fmul bfloat [[H0]], [[H1]]
226+
; GFX81PLUS-NEXT: [[RES:%.*]] = fadd bfloat [[MUL]], [[H2]]
227+
; GFX81PLUS-NEXT: ret bfloat [[RES]]
228+
;
229+
main_body:
230+
%sample = call <4 x float> @llvm.amdgcn.image.sample.lz.2d.v4f32.f32.v8i32.v4i32(i32 15, float %u, float %v, <8 x i32> %surf_desc, <4 x i32> %sampler_desc, i1 false, i32 0, i32 0)
231+
%e0 = extractelement <4 x float> %sample, i32 0
232+
%h0 = fptrunc float %e0 to bfloat
233+
%e1 = extractelement <4 x float> %sample, i32 1
234+
%h1 = fptrunc float %e1 to bfloat
235+
%e2 = extractelement <4 x float> %sample, i32 2
236+
%h2 = fptrunc float %e2 to bfloat
237+
%mul = fmul bfloat %h0, %h1
238+
%res = fadd bfloat %mul, %h2
239+
ret bfloat %res
240+
}
241+
124242
define amdgpu_ps half @image_gather4_2d_v4f32(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %t) {
125243
; GFX7-LABEL: @image_gather4_2d_v4f32(
126244
; GFX7-NEXT: main_body:

0 commit comments

Comments
 (0)