Skip to content

Commit 3e90b3b

Browse files
committed
AMDGPU: Reduce readlane for single demanded vector element
1 parent b474c3f commit 3e90b3b

File tree

2 files changed

+52
-9
lines changed

2 files changed

+52
-9
lines changed

llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp

Lines changed: 16 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -635,7 +635,8 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
635635
break;
636636

637637
auto IID = SrcCI->getIntrinsicID();
638-
// llvm.amdgcn.rcp(llvm.amdgcn.sqrt(x)) -> llvm.amdgcn.rsq(x) if contractable
638+
// llvm.amdgcn.rcp(llvm.amdgcn.sqrt(x)) -> llvm.amdgcn.rsq(x) if
639+
// contractable
639640
//
640641
// llvm.amdgcn.rcp(llvm.sqrt(x)) -> llvm.amdgcn.rsq(x) if contractable and
641642
// relaxed.
@@ -845,13 +846,13 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
845846
break;
846847
}
847848
case Intrinsic::amdgcn_cvt_off_f32_i4: {
848-
Value* Arg = II.getArgOperand(0);
849+
Value *Arg = II.getArgOperand(0);
849850
Type *Ty = II.getType();
850851

851852
if (isa<PoisonValue>(Arg))
852853
return IC.replaceInstUsesWith(II, PoisonValue::get(Ty));
853854

854-
if(IC.getSimplifyQuery().isUndefValue(Arg))
855+
if (IC.getSimplifyQuery().isUndefValue(Arg))
855856
return IC.replaceInstUsesWith(II, Constant::getNullValue(Ty));
856857

857858
ConstantInt *CArg = dyn_cast<ConstantInt>(II.getArgOperand(0));
@@ -1629,18 +1630,18 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
16291630
}
16301631
}
16311632
if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
1632-
AMDGPU::getImageDimIntrinsicInfo(II.getIntrinsicID())) {
1633+
AMDGPU::getImageDimIntrinsicInfo(II.getIntrinsicID())) {
16331634
return simplifyAMDGCNImageIntrinsic(ST, ImageDimIntr, II, IC);
16341635
}
16351636
return std::nullopt;
16361637
}
16371638

16381639
/// Implement SimplifyDemandedVectorElts for amdgcn buffer and image intrinsics.
16391640
///
1640-
/// The result of simplifying amdgcn image and buffer store intrinsics is updating
1641-
/// definitions of the intrinsics vector argument, not Uses of the result like
1642-
/// image and buffer loads.
1643-
/// Note: This only supports non-TFE/LWE image intrinsic calls; those have
1641+
/// The result of simplifying amdgcn image and buffer store intrinsics is
1642+
/// updating definitions of the intrinsics vector argument, not Uses of the
1643+
/// result like image and buffer loads. Note: This only supports non-TFE/LWE
1644+
/// image intrinsic calls; those have
16441645
/// struct returns.
16451646
static Value *simplifyAMDGCNMemoryIntrinsicDemanded(InstCombiner &IC,
16461647
IntrinsicInst &II,
@@ -1837,7 +1838,12 @@ Value *GCNTTIImpl::simplifyAMDGCNLaneIntrinsicDemanded(
18371838
Value *Extract = IC.Builder.CreateExtractElement(Src, FirstElt);
18381839

18391840
// TODO: Preserve callsite attributes?
1840-
CallInst *NewCall = IC.Builder.CreateCall(Remangled, {Extract}, OpBundles);
1841+
SmallVector<Value *> Args{Extract};
1842+
if (II.arg_size() > 1) {
1843+
for (int I = 1; I < II.arg_size(); ++I)
1844+
Args.push_back(II.getArgOperand(1));
1845+
}
1846+
CallInst *NewCall = IC.Builder.CreateCall(Remangled, Args, OpBundles);
18411847

18421848
return IC.Builder.CreateInsertElement(PoisonValue::get(II.getType()),
18431849
NewCall, FirstElt);
@@ -1872,6 +1878,7 @@ std::optional<Value *> GCNTTIImpl::simplifyDemandedVectorEltsIntrinsic(
18721878
SimplifyAndSetOp) const {
18731879
switch (II.getIntrinsicID()) {
18741880
case Intrinsic::amdgcn_readfirstlane:
1881+
case Intrinsic::amdgcn_readlane:
18751882
SimplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
18761883
return simplifyAMDGCNLaneIntrinsicDemanded(IC, II, DemandedElts, UndefElts);
18771884
case Intrinsic::amdgcn_raw_buffer_load:

llvm/test/Transforms/InstCombine/AMDGPU/simplify-demanded-vector-elts-lane-intrinsics.ll

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,42 @@ define i16 @extract_elt0_v1i16_readfirstlane(<1 x i16> %src) {
2525
ret i16 %elt
2626
}
2727

28+
define half @extract_elt0_v2f16_readlane_imm_0(<2 x half> %src) {
29+
; CHECK-LABEL: define half @extract_elt0_v2f16_readlane_imm_0(
30+
; CHECK-SAME: <2 x half> [[SRC:%.*]]) #[[ATTR0]] {
31+
; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x half> [[SRC]], i64 0
32+
; CHECK-NEXT: [[TMP2:%.*]] = call half @llvm.amdgcn.readlane.f16(half [[TMP1]], i32 0)
33+
; CHECK-NEXT: ret half [[TMP2]]
34+
;
35+
%x = call <2 x half> @llvm.amdgcn.readlane.v2f16(<2 x half> %src, i32 0)
36+
%elt = extractelement <2 x half> %x, i32 0
37+
ret half %elt
38+
}
39+
40+
define half @extract_elt0_v2f16_readlane_imm_1(<2 x half> %src) {
41+
; CHECK-LABEL: define half @extract_elt0_v2f16_readlane_imm_1(
42+
; CHECK-SAME: <2 x half> [[SRC:%.*]]) #[[ATTR0]] {
43+
; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x half> [[SRC]], i64 1
44+
; CHECK-NEXT: [[TMP2:%.*]] = call half @llvm.amdgcn.readlane.f16(half [[TMP1]], i32 1)
45+
; CHECK-NEXT: ret half [[TMP2]]
46+
;
47+
%x = call <2 x half> @llvm.amdgcn.readlane.v2f16(<2 x half> %src, i32 1)
48+
%elt = extractelement <2 x half> %x, i32 1
49+
ret half %elt
50+
}
51+
52+
define half @extract_elt0_v2f16_readlane(<2 x half> %src, i32 %idx) {
53+
; CHECK-LABEL: define half @extract_elt0_v2f16_readlane(
54+
; CHECK-SAME: <2 x half> [[SRC:%.*]], i32 [[IDX:%.*]]) #[[ATTR0]] {
55+
; CHECK-NEXT: [[X:%.*]] = call <2 x half> @llvm.amdgcn.readlane.v2f16(<2 x half> [[SRC]], i32 [[IDX]])
56+
; CHECK-NEXT: [[ELT:%.*]] = extractelement <2 x half> [[X]], i32 [[IDX]]
57+
; CHECK-NEXT: ret half [[ELT]]
58+
;
59+
%x = call <2 x half> @llvm.amdgcn.readlane.v2f16(<2 x half> %src, i32 %idx)
60+
%elt = extractelement <2 x half> %x, i32 %idx
61+
ret half %elt
62+
}
63+
2864
define i16 @extract_elt1_v2i16_readfirstlane(<2 x i16> %src) {
2965
; CHECK-LABEL: define i16 @extract_elt1_v2i16_readfirstlane(
3066
; CHECK-SAME: <2 x i16> [[SRC:%.*]]) #[[ATTR0]] {

0 commit comments

Comments
 (0)