-
Notifications
You must be signed in to change notification settings - Fork 14.1k
[AMDGPU] Support D16 folding for image.sample with multiple extractelement and fptrunc users #141758
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
@llvm/pr-subscribers-llvm-transforms @llvm/pr-subscribers-backend-amdgpu Author: Harrison Hao (harrisonGPU) ChangesNow we only support D16 folding for
This change enables D16 folding for such cases and avoids generating Patch is 37.95 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/141758.diff 2 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
index 5f6ab24182d5e..2ff1501e2b977 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
@@ -269,6 +269,68 @@ simplifyAMDGCNImageIntrinsic(const GCNSubtarget *ST,
ArgTys[0] = User->getType();
});
}
+ } else {
+ // Only perform D16 folding if every user of the image sample is
+ // an ExtractElementInst immediately followed by an FPTrunc to half.
+ SmallVector<ExtractElementInst *, 4> Extracts;
+ SmallVector<FPTruncInst *, 4> Truncs;
+ bool AllHalfExtracts = true;
+
+ for (User *U : II.users()) {
+ auto *Ext = dyn_cast<ExtractElementInst>(U);
+ if (!Ext || !Ext->hasOneUse()) {
+ AllHalfExtracts = false;
+ break;
+ }
+ auto *Tr = dyn_cast<FPTruncInst>(*Ext->user_begin());
+ if (!Tr || !Tr->getType()->getScalarType()->isHalfTy()) {
+ AllHalfExtracts = false;
+ break;
+ }
+ Extracts.push_back(Ext);
+ Truncs.push_back(Tr);
+ }
+
+ if (AllHalfExtracts && !Extracts.empty()) {
+ auto *VecTy = cast<VectorType>(II.getType());
+ unsigned NElts = VecTy->getElementCount().getKnownMinValue();
+ Type *HalfVecTy =
+ VectorType::get(Type::getHalfTy(II.getContext()), NElts, false);
+
+ // Obtain the original image sample intrinsic's signature
+ // and replace its return type with the half-vector for D16 folding
+ SmallVector<Type *, 8> SigTys;
+ if (!Intrinsic::getIntrinsicSignature(II.getCalledFunction(), SigTys))
+ return nullptr;
+ SigTys[0] = HalfVecTy;
+
+ Module *M = II.getModule();
+ Function *HalfDecl =
+ Intrinsic::getOrInsertDeclaration(M, ImageDimIntr->Intr, SigTys);
+
+ II.mutateType(HalfVecTy);
+ II.setCalledFunction(HalfDecl);
+
+ IRBuilder<> Builder(&II);
+ for (auto [lane, Ext] : enumerate(Extracts)) {
+ FPTruncInst *Tr = Truncs[lane];
+ Value *Idx = Ext->getIndexOperand();
+
+ Builder.SetInsertPoint(Tr);
+
+ Value *HalfExtract = Builder.CreateExtractElement(&II, Idx);
+ HalfExtract->takeName(Tr);
+
+ Tr->replaceAllUsesWith(HalfExtract);
+ }
+
+ for (auto *T : Truncs)
+ IC.eraseInstFromFunction(*T);
+ for (auto *E : Extracts)
+ IC.eraseInstFromFunction(*E);
+
+ return &II;
+ }
}
}
}
diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/image-d16.ll b/llvm/test/Transforms/InstCombine/AMDGPU/image-d16.ll
index 30431ad724843..d39cceb9b549a 100644
--- a/llvm/test/Transforms/InstCombine/AMDGPU/image-d16.ll
+++ b/llvm/test/Transforms/InstCombine/AMDGPU/image-d16.ll
@@ -1,8 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -mtriple=amdgcn--amdpal -mcpu=gfx700 -S -passes=instcombine %s | FileCheck --check-prefixes=GFX7 %s
; RUN: opt -mtriple=amdgcn--amdpal -mcpu=gfx810 -S -passes=instcombine %s | FileCheck --check-prefixes=GFX81PLUS %s
-; RUN: opt -mtriple=amdgcn--amdpal -mcpu=gfx900 -S -passes=instcombine %s | FileCheck --check-prefixes=GFX81PLUS %s
-; RUN: opt -mtriple=amdgcn--amdpal -mcpu=gfx1010 -S -passes=instcombine %s | FileCheck --check-prefixes=GFX81PLUS %s
+; RUN: opt -mtriple=amdgcn--amdpal -mcpu=gfx900 -S -passes=instcombine %s | FileCheck --check-prefixes=GFX9 %s
+; RUN: opt -mtriple=amdgcn--amdpal -mcpu=gfx1010 -S -passes=instcombine %s | FileCheck --check-prefixes=GFX10PLUS %s
+; RUN: opt -mtriple=amdgcn--amdpal -mcpu=gfx1100 -S -passes=instcombine %s | FileCheck --check-prefixes=GFX11 %s
define amdgpu_ps half @image_sample_2d_fptrunc_to_d16(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t) {
; GFX7-LABEL: @image_sample_2d_fptrunc_to_d16(
@@ -16,6 +17,21 @@ define amdgpu_ps half @image_sample_2d_fptrunc_to_d16(<8 x i32> inreg %rsrc, <4
; GFX81PLUS-NEXT: [[TEX:%.*]] = call half @llvm.amdgcn.image.sample.lz.2d.f16.f32.v8i32.v4i32(i32 1, float [[S:%.*]], float [[T:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
; GFX81PLUS-NEXT: ret half [[TEX]]
;
+; GFX9-LABEL: @image_sample_2d_fptrunc_to_d16(
+; GFX9-NEXT: main_body:
+; GFX9-NEXT: [[TEX:%.*]] = call half @llvm.amdgcn.image.sample.lz.2d.f16.f32.v8i32.v4i32(i32 1, float [[S:%.*]], float [[T:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
+; GFX9-NEXT: ret half [[TEX]]
+;
+; GFX10PLUS-LABEL: @image_sample_2d_fptrunc_to_d16(
+; GFX10PLUS-NEXT: main_body:
+; GFX10PLUS-NEXT: [[TEX:%.*]] = call half @llvm.amdgcn.image.sample.lz.2d.f16.f32.v8i32.v4i32(i32 1, float [[S:%.*]], float [[T:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
+; GFX10PLUS-NEXT: ret half [[TEX]]
+;
+; GFX11-LABEL: @image_sample_2d_fptrunc_to_d16(
+; GFX11-NEXT: main_body:
+; GFX11-NEXT: [[TEX:%.*]] = call half @llvm.amdgcn.image.sample.lz.2d.f16.f32.v8i32.v4i32(i32 1, float [[S:%.*]], float [[T:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
+; GFX11-NEXT: ret half [[TEX]]
+;
main_body:
%tex = call float @llvm.amdgcn.image.sample.lz.2d.f32.f32.v8i32.v4i32(i32 1, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
%tex_half = fptrunc float %tex to half
@@ -40,6 +56,30 @@ define amdgpu_ps half @image_sample_2d_v2f32(<8 x i32> inreg %rsrc, <4 x i32> in
; GFX81PLUS-NEXT: [[ADDF_SUM_0:%.*]] = fadd half [[TEX_HALF_0]], [[TEX_HALF_1]]
; GFX81PLUS-NEXT: ret half [[ADDF_SUM_0]]
;
+; GFX9-LABEL: @image_sample_2d_v2f32(
+; GFX9-NEXT: main_body:
+; GFX9-NEXT: [[TEX:%.*]] = call <2 x half> @llvm.amdgcn.image.sample.lz.2d.v2f16.f32.v8i32.v4i32(i32 3, float [[S:%.*]], float [[T:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
+; GFX9-NEXT: [[TEX_HALF_0:%.*]] = extractelement <2 x half> [[TEX]], i64 0
+; GFX9-NEXT: [[TEX_HALF_1:%.*]] = extractelement <2 x half> [[TEX]], i64 1
+; GFX9-NEXT: [[ADDF_SUM_0:%.*]] = fadd half [[TEX_HALF_0]], [[TEX_HALF_1]]
+; GFX9-NEXT: ret half [[ADDF_SUM_0]]
+;
+; GFX10PLUS-LABEL: @image_sample_2d_v2f32(
+; GFX10PLUS-NEXT: main_body:
+; GFX10PLUS-NEXT: [[TEX:%.*]] = call <2 x half> @llvm.amdgcn.image.sample.lz.2d.v2f16.f32.v8i32.v4i32(i32 3, float [[S:%.*]], float [[T:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
+; GFX10PLUS-NEXT: [[TEX_HALF_0:%.*]] = extractelement <2 x half> [[TEX]], i64 0
+; GFX10PLUS-NEXT: [[TEX_HALF_1:%.*]] = extractelement <2 x half> [[TEX]], i64 1
+; GFX10PLUS-NEXT: [[ADDF_SUM_0:%.*]] = fadd half [[TEX_HALF_0]], [[TEX_HALF_1]]
+; GFX10PLUS-NEXT: ret half [[ADDF_SUM_0]]
+;
+; GFX11-LABEL: @image_sample_2d_v2f32(
+; GFX11-NEXT: main_body:
+; GFX11-NEXT: [[TEX:%.*]] = call <2 x half> @llvm.amdgcn.image.sample.lz.2d.v2f16.f32.v8i32.v4i32(i32 3, float [[S:%.*]], float [[T:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
+; GFX11-NEXT: [[TEX_HALF_0:%.*]] = extractelement <2 x half> [[TEX]], i64 0
+; GFX11-NEXT: [[TEX_HALF_1:%.*]] = extractelement <2 x half> [[TEX]], i64 1
+; GFX11-NEXT: [[ADDF_SUM_0:%.*]] = fadd half [[TEX_HALF_0]], [[TEX_HALF_1]]
+; GFX11-NEXT: ret half [[ADDF_SUM_0]]
+;
main_body:
%tex = call <2 x float> @llvm.amdgcn.image.sample.lz.2d.v2f32.f32.v8i32.v4i32(i32 3, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
%tex_2_half = fptrunc <2 x float> %tex to <2 x half>
@@ -71,6 +111,36 @@ define amdgpu_ps half @image_sample_2d_v3f32(<8 x i32> inreg %rsrc, <4 x i32> in
; GFX81PLUS-NEXT: [[ADDF_SUM_1:%.*]] = fadd half [[ADDF_SUM_0]], [[TEX_HALF_2]]
; GFX81PLUS-NEXT: ret half [[ADDF_SUM_1]]
;
+; GFX9-LABEL: @image_sample_2d_v3f32(
+; GFX9-NEXT: main_body:
+; GFX9-NEXT: [[TEX:%.*]] = call <3 x half> @llvm.amdgcn.image.sample.lz.2d.v3f16.f32.v8i32.v4i32(i32 7, float [[S:%.*]], float [[T:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
+; GFX9-NEXT: [[TEX_HALF_0:%.*]] = extractelement <3 x half> [[TEX]], i64 0
+; GFX9-NEXT: [[TEX_HALF_1:%.*]] = extractelement <3 x half> [[TEX]], i64 1
+; GFX9-NEXT: [[TEX_HALF_2:%.*]] = extractelement <3 x half> [[TEX]], i64 2
+; GFX9-NEXT: [[ADDF_SUM_0:%.*]] = fadd half [[TEX_HALF_0]], [[TEX_HALF_1]]
+; GFX9-NEXT: [[ADDF_SUM_1:%.*]] = fadd half [[ADDF_SUM_0]], [[TEX_HALF_2]]
+; GFX9-NEXT: ret half [[ADDF_SUM_1]]
+;
+; GFX10PLUS-LABEL: @image_sample_2d_v3f32(
+; GFX10PLUS-NEXT: main_body:
+; GFX10PLUS-NEXT: [[TEX:%.*]] = call <3 x half> @llvm.amdgcn.image.sample.lz.2d.v3f16.f32.v8i32.v4i32(i32 7, float [[S:%.*]], float [[T:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
+; GFX10PLUS-NEXT: [[TEX_HALF_0:%.*]] = extractelement <3 x half> [[TEX]], i64 0
+; GFX10PLUS-NEXT: [[TEX_HALF_1:%.*]] = extractelement <3 x half> [[TEX]], i64 1
+; GFX10PLUS-NEXT: [[TEX_HALF_2:%.*]] = extractelement <3 x half> [[TEX]], i64 2
+; GFX10PLUS-NEXT: [[ADDF_SUM_0:%.*]] = fadd half [[TEX_HALF_0]], [[TEX_HALF_1]]
+; GFX10PLUS-NEXT: [[ADDF_SUM_1:%.*]] = fadd half [[ADDF_SUM_0]], [[TEX_HALF_2]]
+; GFX10PLUS-NEXT: ret half [[ADDF_SUM_1]]
+;
+; GFX11-LABEL: @image_sample_2d_v3f32(
+; GFX11-NEXT: main_body:
+; GFX11-NEXT: [[TEX:%.*]] = call <3 x half> @llvm.amdgcn.image.sample.lz.2d.v3f16.f32.v8i32.v4i32(i32 7, float [[S:%.*]], float [[T:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
+; GFX11-NEXT: [[TEX_HALF_0:%.*]] = extractelement <3 x half> [[TEX]], i64 0
+; GFX11-NEXT: [[TEX_HALF_1:%.*]] = extractelement <3 x half> [[TEX]], i64 1
+; GFX11-NEXT: [[TEX_HALF_2:%.*]] = extractelement <3 x half> [[TEX]], i64 2
+; GFX11-NEXT: [[ADDF_SUM_0:%.*]] = fadd half [[TEX_HALF_0]], [[TEX_HALF_1]]
+; GFX11-NEXT: [[ADDF_SUM_1:%.*]] = fadd half [[ADDF_SUM_0]], [[TEX_HALF_2]]
+; GFX11-NEXT: ret half [[ADDF_SUM_1]]
+;
main_body:
%tex = call <3 x float> @llvm.amdgcn.image.sample.lz.2d.v3f32.f32.v8i32.v4i32(i32 7, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
%tex_3_half = fptrunc <3 x float> %tex to <3 x half>
@@ -108,6 +178,42 @@ define amdgpu_ps half @image_sample_2d_v4f32(<8 x i32> inreg %rsrc, <4 x i32> in
; GFX81PLUS-NEXT: [[ADDF_SUM_2:%.*]] = fadd half [[ADDF_SUM_0]], [[ADDF_SUM_1]]
; GFX81PLUS-NEXT: ret half [[ADDF_SUM_2]]
;
+; GFX9-LABEL: @image_sample_2d_v4f32(
+; GFX9-NEXT: main_body:
+; GFX9-NEXT: [[TEX:%.*]] = call <4 x half> @llvm.amdgcn.image.sample.lz.2d.v4f16.f32.v8i32.v4i32(i32 15, float [[S:%.*]], float [[T:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
+; GFX9-NEXT: [[TEX_HALF_0:%.*]] = extractelement <4 x half> [[TEX]], i64 0
+; GFX9-NEXT: [[TEX_HALF_1:%.*]] = extractelement <4 x half> [[TEX]], i64 1
+; GFX9-NEXT: [[TEX_HALF_2:%.*]] = extractelement <4 x half> [[TEX]], i64 2
+; GFX9-NEXT: [[TEX_HALF_3:%.*]] = extractelement <4 x half> [[TEX]], i64 3
+; GFX9-NEXT: [[ADDF_SUM_0:%.*]] = fadd half [[TEX_HALF_0]], [[TEX_HALF_1]]
+; GFX9-NEXT: [[ADDF_SUM_1:%.*]] = fadd half [[TEX_HALF_2]], [[TEX_HALF_3]]
+; GFX9-NEXT: [[ADDF_SUM_2:%.*]] = fadd half [[ADDF_SUM_0]], [[ADDF_SUM_1]]
+; GFX9-NEXT: ret half [[ADDF_SUM_2]]
+;
+; GFX10PLUS-LABEL: @image_sample_2d_v4f32(
+; GFX10PLUS-NEXT: main_body:
+; GFX10PLUS-NEXT: [[TEX:%.*]] = call <4 x half> @llvm.amdgcn.image.sample.lz.2d.v4f16.f32.v8i32.v4i32(i32 15, float [[S:%.*]], float [[T:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
+; GFX10PLUS-NEXT: [[TEX_HALF_0:%.*]] = extractelement <4 x half> [[TEX]], i64 0
+; GFX10PLUS-NEXT: [[TEX_HALF_1:%.*]] = extractelement <4 x half> [[TEX]], i64 1
+; GFX10PLUS-NEXT: [[TEX_HALF_2:%.*]] = extractelement <4 x half> [[TEX]], i64 2
+; GFX10PLUS-NEXT: [[TEX_HALF_3:%.*]] = extractelement <4 x half> [[TEX]], i64 3
+; GFX10PLUS-NEXT: [[ADDF_SUM_0:%.*]] = fadd half [[TEX_HALF_0]], [[TEX_HALF_1]]
+; GFX10PLUS-NEXT: [[ADDF_SUM_1:%.*]] = fadd half [[TEX_HALF_2]], [[TEX_HALF_3]]
+; GFX10PLUS-NEXT: [[ADDF_SUM_2:%.*]] = fadd half [[ADDF_SUM_0]], [[ADDF_SUM_1]]
+; GFX10PLUS-NEXT: ret half [[ADDF_SUM_2]]
+;
+; GFX11-LABEL: @image_sample_2d_v4f32(
+; GFX11-NEXT: main_body:
+; GFX11-NEXT: [[TEX:%.*]] = call <4 x half> @llvm.amdgcn.image.sample.lz.2d.v4f16.f32.v8i32.v4i32(i32 15, float [[S:%.*]], float [[T:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
+; GFX11-NEXT: [[TEX_HALF_0:%.*]] = extractelement <4 x half> [[TEX]], i64 0
+; GFX11-NEXT: [[TEX_HALF_1:%.*]] = extractelement <4 x half> [[TEX]], i64 1
+; GFX11-NEXT: [[TEX_HALF_2:%.*]] = extractelement <4 x half> [[TEX]], i64 2
+; GFX11-NEXT: [[TEX_HALF_3:%.*]] = extractelement <4 x half> [[TEX]], i64 3
+; GFX11-NEXT: [[ADDF_SUM_0:%.*]] = fadd half [[TEX_HALF_0]], [[TEX_HALF_1]]
+; GFX11-NEXT: [[ADDF_SUM_1:%.*]] = fadd half [[TEX_HALF_2]], [[TEX_HALF_3]]
+; GFX11-NEXT: [[ADDF_SUM_2:%.*]] = fadd half [[ADDF_SUM_0]], [[ADDF_SUM_1]]
+; GFX11-NEXT: ret half [[ADDF_SUM_2]]
+;
main_body:
%tex = call <4 x float> @llvm.amdgcn.image.sample.lz.2d.v4f32.f32.v8i32.v4i32(i32 15, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
%tex_4_half = fptrunc <4 x float> %tex to <4 x half>
@@ -121,6 +227,79 @@ main_body:
ret half %addf_sum.2
}
+define void @image_sample_2d_multi_fptrunc_to_d16(<8 x i32> %surf_desc, <4 x i32> %samp, float %u, float %v, ptr addrspace(7) %out) {
+; GFX7-LABEL: @image_sample_2d_multi_fptrunc_to_d16(
+; GFX7-NEXT: main_body:
+; GFX7-NEXT: [[SAMPLE:%.*]] = call <3 x float> @llvm.amdgcn.image.sample.lz.2d.v3f32.f32.v8i32.v4i32(i32 7, float [[U:%.*]], float [[V:%.*]], <8 x i32> [[SURF_DESC:%.*]], <4 x i32> [[SAMPLER_DESC:%.*]], i1 false, i32 0, i32 0)
+; GFX7-NEXT: [[E0:%.*]] = extractelement <3 x float> [[SAMPLE]], i64 0
+; GFX7-NEXT: [[H0:%.*]] = fptrunc float [[E0]] to half
+; GFX7-NEXT: [[E1:%.*]] = extractelement <3 x float> [[SAMPLE]], i64 1
+; GFX7-NEXT: [[H1:%.*]] = fptrunc float [[E1]] to half
+; GFX7-NEXT: [[E2:%.*]] = extractelement <3 x float> [[SAMPLE]], i64 2
+; GFX7-NEXT: [[H2:%.*]] = fptrunc float [[E2]] to half
+; GFX7-NEXT: [[MUL:%.*]] = fmul half [[H0]], [[H1]]
+; GFX7-NEXT: [[RES:%.*]] = fadd half [[MUL]], [[H2]]
+; GFX7-NEXT: store half [[RES]], ptr addrspace(7) [[OUT:%.*]], align 2
+; GFX7-NEXT: ret void
+;
+; GFX81PLUS-LABEL: @image_sample_2d_multi_fptrunc_to_d16(
+; GFX81PLUS-NEXT: main_body:
+; GFX81PLUS-NEXT: [[SAMPLE:%.*]] = call <3 x half> @llvm.amdgcn.image.sample.lz.2d.v3f16.f32.v8i32.v4i32(i32 7, float [[U:%.*]], float [[V:%.*]], <8 x i32> [[SURF_DESC:%.*]], <4 x i32> [[SAMPLER_DESC:%.*]], i1 false, i32 0, i32 0)
+; GFX81PLUS-NEXT: [[H0:%.*]] = extractelement <3 x half> [[SAMPLE]], i64 0
+; GFX81PLUS-NEXT: [[H1:%.*]] = extractelement <3 x half> [[SAMPLE]], i64 1
+; GFX81PLUS-NEXT: [[H2:%.*]] = extractelement <3 x half> [[SAMPLE]], i64 2
+; GFX81PLUS-NEXT: [[MUL:%.*]] = fmul half [[H0]], [[H1]]
+; GFX81PLUS-NEXT: [[RES:%.*]] = fadd half [[MUL]], [[H2]]
+; GFX81PLUS-NEXT: store half [[RES]], ptr addrspace(7) [[OUT:%.*]], align 2
+; GFX81PLUS-NEXT: ret void
+;
+; GFX9-LABEL: @image_sample_2d_multi_fptrunc_to_d16(
+; GFX9-NEXT: main_body:
+; GFX9-NEXT: [[SAMPLE:%.*]] = call <3 x half> @llvm.amdgcn.image.sample.lz.2d.v3f16.f32.v8i32.v4i32(i32 7, float [[U:%.*]], float [[V:%.*]], <8 x i32> [[SURF_DESC:%.*]], <4 x i32> [[SAMPLER_DESC:%.*]], i1 false, i32 0, i32 0)
+; GFX9-NEXT: [[HALF_EXT2:%.*]] = extractelement <3 x half> [[SAMPLE]], i64 0
+; GFX9-NEXT: [[HALF_EXT1:%.*]] = extractelement <3 x half> [[SAMPLE]], i64 1
+; GFX9-NEXT: [[HALF_EXT:%.*]] = extractelement <3 x half> [[SAMPLE]], i64 2
+; GFX9-NEXT: [[MUL:%.*]] = fmul half [[HALF_EXT2]], [[HALF_EXT1]]
+; GFX9-NEXT: [[RES:%.*]] = fadd half [[MUL]], [[HALF_EXT]]
+; GFX9-NEXT: store half [[RES]], ptr addrspace(7) [[OUT:%.*]], align 2
+; GFX9-NEXT: ret void
+;
+; GFX10PLUS-LABEL: @image_sample_2d_multi_fptrunc_to_d16(
+; GFX10PLUS-NEXT: main_body:
+; GFX10PLUS-NEXT: [[SAMPLE:%.*]] = call <3 x half> @llvm.amdgcn.image.sample.lz.2d.v3f16.f32.v8i32.v4i32(i32 7, float [[U:%.*]], float [[V:%.*]], <8 x i32> [[SURF_DESC:%.*]], <4 x i32> [[SAMPLER_DESC:%.*]], i1 false, i32 0, i32 0)
+; GFX10PLUS-NEXT: [[HALF_EXT2:%.*]] = extractelement <3 x half> [[SAMPLE]], i64 0
+; GFX10PLUS-NEXT: [[HALF_EXT1:%.*]] = extractelement <3 x half> [[SAMPLE]], i64 1
+; GFX10PLUS-NEXT: [[HALF_EXT:%.*]] = extractelement <3 x half> [[SAMPLE]], i64 2
+; GFX10PLUS-NEXT: [[MUL:%.*]] = fmul half [[HALF_EXT2]], [[HALF_EXT1]]
+; GFX10PLUS-NEXT: [[RES:%.*]] = fadd half [[MUL]], [[HALF_EXT]]
+; GFX10PLUS-NEXT: store half [[RES]], ptr addrspace(7) [[OUT:%.*]], align 2
+; GFX10PLUS-NEXT: ret void
+;
+; GFX11-LABEL: @image_sample_2d_multi_fptrunc_to_d16(
+; GFX11-NEXT: main_body:
+; GFX11-NEXT: [[SAMPLE:%.*]] = call <3 x half> @llvm.amdgcn.image.sample.lz.2d.v3f16.f32.v8i32.v4i32(i32 7, float [[U:%.*]], float [[V:%.*]], <8 x i32> [[SURF_DESC:%.*]], <4 x i32> [[SAMPLER_DESC:%.*]], i1 false, i32 0, i32 0)
+; GFX11-NEXT: [[HALF_EXT2:%.*]] = extractelement <3 x half> [[SAMPLE]], i64 0
+; GFX11-NEXT: [[HALF_EXT1:%.*]] = extractelement <3 x half> [[SAMPLE]], i64 1
+; GFX11-NEXT: [[HALF_EXT:%.*]] = extractelement <3 x half> [[SAMPLE]], i64 2
+; GFX11-NEXT: [[MUL:%.*]] = fmul half [[HALF_EXT2]], [[HALF_EXT1]]
+; GFX11-NEXT: [[RES:%.*]] = fadd half [[MUL]], [[HALF_EXT]]
+; GFX11-NEXT: store half [[RES]], ptr addrspace(7) [[OUT:%.*]], align 2
+; GFX11-NEXT: ret void
+;
+main_body:
+ %sample = call <4 x float> @llvm.amdgcn.image.sample.lz.2d.v4f32.f32.v8i32.v4i32(i32 15, float %u, float %v, <8 x i32> %surf_desc, <4 x i32> %samp, i1 false, i32 0, i32 0)
+ %e0 = extractelement <4 x float> %sample, i32 0
+ %h0 = fptrunc float %e0 to half
+ %e1 = extractelement <4 x float> %sample, i32 1
+ %h1 = fptrunc float %e1 to half
+ %e2 = extractelement <4 x float> %sample, i32 2
+ %h2 = fptrunc float %e2 to half
+ %mul = fmul half %h0, %h1
+ %res = fadd half %mul, %h2
+ store half %res, ptr addrspace(7) %out, align 2
+ ret void
+}
+
define amdgpu_ps half @image_gather4_2d_v4f32(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %t) {
; GFX7-LABEL: @image_gather4_2d_v4f32(
; GFX7-NEXT: main_body:
@@ -147,6 +326,42 @@ define amdgpu_ps half @image_gather4_2d_v4f32(<8 x i32> inreg %rsrc, <4 x i32> i
; GFX81PLUS-NEXT: [[ADDF_SUM_2:%.*]] = fadd half [[ADDF_SUM_0]], [[ADDF_SUM_1]]
; GFX81PLUS-NEXT: ret half [[ADDF_SUM_2]]
;
+; GFX9-LABEL: @image_gather4_2d_v4f32(
+; GFX9-NEXT: main_body:
+; GFX9-NEXT: [[TEX:%.*]] = call <4 x half> @llvm.amdgcn.image.gather4.2d.v4f16.f16.v8i32.v4i32(i32 1, half [[S:%.*]], half [[T:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
+; GFX9-NEXT: [[TEX_HALF_0:%.*]] = extractelement <4 x half> [[TEX]], i64 0
+; GFX9-NEXT: [[TEX_HALF_1:%.*]] = extractelement <4 x half> [[TEX]], i64 1
+; GFX9-NEXT: [[TEX_HALF_2:%.*]] = extractelement <4 x half> [[TEX]], i64 2
+; GFX9-NEXT: [[TEX_HALF_3:%.*]] = extractelement <4 x half> [[TEX]], i64 3
+; GFX9-NEXT: [[ADDF_SUM_0:%.*]] = fadd half [[TEX_HALF_0]], [[TEX_HALF_1]]
+; GFX9-NEXT: [[ADDF_SUM_1:%.*]] = fadd half [[TEX_HALF_2]], [[TEX_HALF_3]]
+; GFX9-NEXT: [[ADDF_SUM_2:%.*]] = fadd half [[ADDF_SUM_0]], [[ADDF_SUM_1]]
+; GFX9-NEXT: ret half [[ADDF_SUM_2]]
+;
+; GFX10PLUS-LABEL: @image_gather4_2d_v4f32(
+; GFX10PLUS-NEXT: main_body:
+; GFX10PLUS-NEXT: [[TEX:%.*]] = call <4 x half> @llvm.amdgcn.image.gather4.2d.v4f16.f16.v8i32.v4i32(i32 1, half [[S:%.*]], half [[T:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
+; GFX10PLUS-NEXT: [[TEX_HALF_0:%.*]] = extractelement <4 x half> [[TEX]], i64 0
+; GFX10PLUS-NEXT: [[TEX_HALF_1:%.*]] = extractelement <4 x half> [[TEX]], i64 1
+; GFX10PLUS-NEXT: [[TEX_HALF_2:%.*]] = extractelement <4 x half> [[TEX]], i64 2
+; GFX10PLUS-NEXT:...
[truncated]
|
for (auto [lane, Ext] : enumerate(Extracts)) { | ||
FPTruncInst *Tr = Truncs[lane]; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
for (auto [lane, Ext] : enumerate(Extracts)) { | |
FPTruncInst *Tr = Truncs[lane]; | |
for (auto [Ext, Tr] : zip(Extracts, Truncs)) { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Or put them into a vector of std::pair in the first place.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks, I have used :
for (auto [Ext, Tr] : zip(Extracts, Truncs))
|
||
for (User *U : II.users()) { | ||
auto *Ext = dyn_cast<ExtractElementInst>(U); | ||
if (!Ext || !Ext->hasOneUse()) { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Missing negative test for the !hasOneUse?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks, I added a test called image_sample_2d_extractelement_multi_use_no_d16
.
break; | ||
} | ||
auto *Tr = dyn_cast<FPTruncInst>(*Ext->user_begin()); | ||
if (!Tr || !Tr->getType()->getScalarType()->isHalfTy()) { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Missing negative test for fptrunc to a different type?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks, I added a test called image_sample_2d_multi_fptrunc_non_half_no_d16
.
@@ -269,6 +269,68 @@ simplifyAMDGCNImageIntrinsic(const GCNSubtarget *ST, | |||
ArgTys[0] = User->getType(); | |||
}); | |||
} | |||
} else { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think it would make a bit more sense to put all of this inside:
if (II.getType().isVectorTy()) {
new code here
} else {
old code here
}
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I don't think this makes sense, because if we encounter an image sample with a vector type and its only user is a fptrunc
from a float vector to a half vector, we should not fall back to the "old code" path. For example:
%38 = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.l.2d.v4f32.f32.v8i32.v4i32(i32 15, float %36, float %37, float 0.000000e+00, <8 x i32> %34, <4 x i32> %35, i1 false, i32 0, i32 0)
%39 = fptrunc <4 x float> %38 to <4 x half>
In this case, the image sample result is a vector and directly truncated to a half vector, which doesn't fit either the new extract‑element + fptrunc pattern or the old code path that expects scalar values. So simply checking isVectorTy() is not sufficient to distinguish the cases.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Actually the old code path does handle that case.
Here's a new suggestion: just remove the "else" on line 272. So even if II has one use, if the old code path fails, we should still try the new code path. This will handle the case where the only use of II is a single extractelement instruction.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks, I have updated it.
II.mutateType(HalfVecTy); | ||
II.setCalledFunction(HalfDecl); | ||
|
||
IRBuilder<> Builder(&II); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Do you need to use this c'tor since the insert point is updated right away in the loop?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks, I’ve changed it to IRBuilder<> Builder(II.getContext());
to initialize the builder.
cc66281
to
7b416b7
Compare
cce389a
to
c31843d
Compare
c31843d
to
dfed9ec
Compare
Truncs.push_back(Tr); | ||
} | ||
|
||
if (AllHalfExtracts && !Extracts.empty()) { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
bail out early to avoid giant indentation here
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We don’t bail out early, because doing so would change the existing control-flow logic: there are additional optimizations that run after the hasD16Images check (e.g. the A16/G16 path).
dfed9ec
to
887688e
Compare
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM
SmallVector<ExtractElementInst *, 4> Extracts; | ||
SmallVector<FPTruncInst *, 4> Truncs; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Nit: I still thinks this would be neater as a vector-of-std::pairs.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks, I have updated it.
d26b66f
to
86a4102
Compare
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/11/builds/17534 Here is the relevant piece of the build log for the reference
|
…ement and fptrunc users (llvm#141758) Now we only support D16 folding for `image sample` instructions with a single user: a `fptrunc` to half. However, we can actually support D16 folding for image.sample instructions with multiple users, as long as each user follows the pattern of extractelement followed by fptrunc to half. For example: ``` %sample = call <4 x float> @llvm.amdgcn.image.sample %e0 = extractelement <4 x float> %sample, i32 0 %h0 = fptrunc float %e0 to half %e1 = extractelement <4 x float> %sample, i32 1 %h1 = fptrunc float %e1 to half %e2 = extractelement <4 x float> %sample, i32 2 %h2 = fptrunc float %e2 to half ``` This change enables D16 folding for such cases and avoids generating `v_cvt_f16_f32_e32` instructions.
Now we only support D16 folding for
image sample
instructions with a single user: afptrunc
to half.However, we can actually support D16 folding for image.sample instructions with multiple users,
as long as each user follows the pattern of extractelement followed by fptrunc to half.
For example:
This change enables D16 folding for such cases and avoids generating
v_cvt_f16_f32_e32
instructions.