Skip to content

Commit a29e762

Browse files
committed
AMDGPU: Push fcanonicalize through partially constant build_vector
This usually avoids some re-packing code, and may help find canonical sources. llvm-svn: 339072
1 parent 0c679b7 commit a29e762

File tree

2 files changed

+210
-1
lines changed

2 files changed

+210
-1
lines changed

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 37 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6879,15 +6879,19 @@ SDValue SITargetLowering::getCanonicalConstantFP(
68796879
return DAG.getConstantFP(C, SL, VT);
68806880
}
68816881

6882+
/// Return true if \p Op is undef or a floating-point constant node
/// (ConstantFPSDNode). The build_vector case in performFCanonicalizeCombine
/// replaces such elements with a constant directly instead of wrapping them
/// in an fcanonicalize node.
static bool vectorEltWillFoldAway(SDValue Op) {
6883+
return Op.isUndef() || isa<ConstantFPSDNode>(Op);
6884+
}
6885+
68826886
SDValue SITargetLowering::performFCanonicalizeCombine(
68836887
SDNode *N,
68846888
DAGCombinerInfo &DCI) const {
68856889
SelectionDAG &DAG = DCI.DAG;
68866890
SDValue N0 = N->getOperand(0);
6891+
EVT VT = N->getValueType(0);
68876892

68886893
// fcanonicalize undef -> qnan
68896894
if (N0.isUndef()) {
6890-
EVT VT = N->getValueType(0);
68916895
APFloat QNaN = APFloat::getQNaN(SelectionDAG::EVTToAPFloatSemantics(VT));
68926896
return DAG.getConstantFP(QNaN, SDLoc(N), VT);
68936897
}
@@ -6897,6 +6901,38 @@ SDValue SITargetLowering::performFCanonicalizeCombine(
68976901
return getCanonicalConstantFP(DAG, SDLoc(N), VT, CFP->getValueAPF());
68986902
}
68996903

6904+
// fcanonicalize (build_vector x, k) -> build_vector (fcanonicalize x),
6905+
// (fcanonicalize k)
6906+
//
6907+
// fcanonicalize (build_vector x, undef) -> build_vector (fcanonicalize x), 0
6908+
6909+
// TODO: This could be better with wider vectors that will be split to v2f16,
6910+
// and to consider uses since there aren't that many packed operations.
6911+
if (N0.getOpcode() == ISD::BUILD_VECTOR && VT == MVT::v2f16) {
6912+
SDLoc SL(N);
6913+
SDValue NewElts[2];
6914+
SDValue Lo = N0.getOperand(0);
6915+
SDValue Hi = N0.getOperand(1);
6916+
if (vectorEltWillFoldAway(Lo) || vectorEltWillFoldAway(Hi)) {
6917+
for (unsigned I = 0; I != 2; ++I) {
6918+
SDValue Op = N0.getOperand(I);
6919+
EVT EltVT = Op.getValueType();
6920+
if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Op)) {
6921+
NewElts[I] = getCanonicalConstantFP(DAG, SL, EltVT,
6922+
CFP->getValueAPF());
6923+
} else if (Op.isUndef()) {
6924+
// This would ordinarily be folded to a qNaN. Since this may be half
6925+
// of a packed operation, it may be cheaper to use a 0.
6926+
NewElts[I] = DAG.getConstantFP(0.0f, SL, EltVT);
6927+
} else {
6928+
NewElts[I] = DAG.getNode(ISD::FCANONICALIZE, SL, EltVT, Op);
6929+
}
6930+
}
6931+
6932+
return DAG.getBuildVector(VT, SL, NewElts);
6933+
}
6934+
}
6935+
69006936
return isCanonicalized(DAG, N0) ? N0 : SDValue();
69016937
}
69026938

llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll

Lines changed: 173 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,14 @@ declare <3 x half> @llvm.canonicalize.v3f16(<3 x half>) #0
1010
declare <4 x half> @llvm.canonicalize.v4f16(<4 x half>) #0
1111
declare i32 @llvm.amdgcn.workitem.id.x() #0
1212

13+
; GCN-LABEL: {{^}}test_fold_canonicalize_undef_value_f16:
14+
; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e00{{$}}
15+
; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
16+
define amdgpu_kernel void @test_fold_canonicalize_undef_value_f16(half addrspace(1)* %out) #1 {
17+
%canonicalized = call half @llvm.canonicalize.f16(half undef)
18+
store half %canonicalized, half addrspace(1)* %out
19+
ret void
20+
}
1321

1422
; GCN-LABEL: {{^}}v_test_canonicalize_var_f16:
1523
; GFX89: v_max_f16_e32 [[REG:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}
@@ -34,6 +42,21 @@ define amdgpu_kernel void @s_test_canonicalize_var_f16(half addrspace(1)* %out,
3442
ret void
3543
}
3644

45+
; GCN-LABEL: {{^}}v_test_canonicalize_build_vector_v2f16:
46+
; GFX9: v_and_b32_e32 v0, 0xffff, v0
47+
; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0
48+
; GFX9-NEXT: v_pk_max_f16 v0, v0, v0
49+
50+
; VI: v_max_f16_sdwa v1, v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
51+
; VI: v_max_f16_e32 v0, v0, v0
52+
; VI: v_or_b32_e32 v0, v0, v1
53+
define <2 x half> @v_test_canonicalize_build_vector_v2f16(half %lo, half %hi) #1 {
54+
%ins0 = insertelement <2 x half> undef, half %lo, i32 0
55+
%ins1 = insertelement <2 x half> %ins0, half %hi, i32 1
56+
%canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %ins1)
57+
ret <2 x half> %canonicalized
58+
}
59+
3760
; GCN-LABEL: {{^}}v_test_canonicalize_fabs_var_f16:
3861
; GFX89: v_max_f16_e64 [[REG:v[0-9]+]], |{{v[0-9]+}}|, |{{v[0-9]+}}|
3962
; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
@@ -515,6 +538,156 @@ define <4 x half> @v_test_canonicalize_var_v4f16(<4 x half> %val) #1 {
515538
ret <4 x half> %canonicalized
516539
}
517540

541+
; GCN-LABEL: {{^}}s_test_canonicalize_undef_v2f16:
542+
; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e007e00
543+
; GFX89: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
544+
define amdgpu_kernel void @s_test_canonicalize_undef_v2f16(<2 x half> addrspace(1)* %out) #1 {
545+
%canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> undef)
546+
store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out
547+
ret void
548+
}
549+
550+
; GCN-LABEL: {{^}}v_test_canonicalize_reg_undef_v2f16:
551+
; GFX9: s_waitcnt
552+
; GFX9-NEXT: v_max_f16_e32 v0, v0, v0
553+
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
554+
; GFX9-NEXT: s_setpc_b64
555+
556+
; High bits known zero
557+
; FIXME: Should also be true on gfx9 by default?
558+
; VI: s_waitcnt
559+
; VI-NEXT: v_max_f16_e32 v0, v0, v0
560+
; VI-NEXT: s_setpc_b64
561+
define <2 x half> @v_test_canonicalize_reg_undef_v2f16(half %val) #1 {
562+
%vec = insertelement <2 x half> undef, half %val, i32 0
563+
%canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %vec)
564+
ret <2 x half> %canonicalized
565+
}
566+
567+
; GCN-LABEL: {{^}}v_test_canonicalize_undef_reg_v2f16:
568+
; GFX9: s_waitcnt
569+
; GFX9-NEXT: v_max_f16_e32 v0, v0, v0
570+
; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, 0
571+
; GFX9-NEXT: s_setpc_b64
572+
573+
; VI: s_waitcnt
574+
; VI-NEXT: v_max_f16_sdwa v0, v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
575+
; VI-NEXT: s_setpc_b64
576+
define <2 x half> @v_test_canonicalize_undef_reg_v2f16(half %val) #1 {
577+
%vec = insertelement <2 x half> undef, half %val, i32 1
578+
%canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %vec)
579+
ret <2 x half> %canonicalized
580+
}
581+
582+
; GCN-LABEL: {{^}}v_test_canonicalize_reg_k_v2f16:
583+
; GFX9: s_waitcnt
584+
; GFX9-DAG: v_max_f16_e32 v0, v0, v0
585+
; GFX9-DAG: s_movk_i32 [[K:s[0-9]+]], 0x4000
586+
; GFX9: v_and_b32_e32 v0, 0xffff, v0
587+
; GFX9: v_lshl_or_b32 v0, [[K]], 16, v0
588+
; GFX9: s_setpc_b64
589+
590+
; VI: s_waitcnt
591+
; VI-NEXT: v_max_f16_e32 v0, v0, v0
592+
; VI-NEXT: v_or_b32_e32 v0, 2.0, v0
593+
; VI-NEXT: s_setpc_b64
594+
define <2 x half> @v_test_canonicalize_reg_k_v2f16(half %val) #1 {
595+
%vec0 = insertelement <2 x half> undef, half %val, i32 0
596+
%vec1 = insertelement <2 x half> %vec0, half 2.0, i32 1
597+
%canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %vec1)
598+
ret <2 x half> %canonicalized
599+
}
600+
601+
; GCN-LABEL: {{^}}v_test_canonicalize_k_reg_v2f16:
602+
; GFX9: v_max_f16_e32 v0, v0, v0
603+
; GFX9: v_mov_b32_e32 [[K:v[0-9]+]], 0x4000
604+
; GFX9: v_lshl_or_b32 v0, v0, 16, [[K]]
605+
; GFX9: s_setpc_b64
606+
607+
; VI: s_waitcnt
608+
; VI-NEXT: v_max_f16_sdwa v0, v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
609+
; VI-NEXT: v_or_b32_e32 v0, 0x4000, v0
610+
; VI-NEXT: s_setpc_b64
611+
define <2 x half> @v_test_canonicalize_k_reg_v2f16(half %val) #1 {
612+
%vec0 = insertelement <2 x half> undef, half 2.0, i32 0
613+
%vec1 = insertelement <2 x half> %vec0, half %val, i32 1
614+
%canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %vec1)
615+
ret <2 x half> %canonicalized
616+
}
617+
618+
; GCN-LABEL: {{^}}s_test_canonicalize_undef_v4f16:
619+
; GCN: v_mov_b32_e32 v0, 0x7e007e00
620+
; GCN: v_mov_b32_e32 v1, v0
621+
define amdgpu_kernel void @s_test_canonicalize_undef_v4f16(<4 x half> addrspace(1)* %out) #1 {
622+
%canonicalized = call <4 x half> @llvm.canonicalize.v4f16(<4 x half> undef)
623+
store <4 x half> %canonicalized, <4 x half> addrspace(1)* %out
624+
ret void
625+
}
626+
627+
; GCN-LABEL: {{^}}v_test_canonicalize_reg_undef_undef_undef_v4f16:
628+
; GFX9: s_waitcnt
629+
; GFX9-NEXT: v_max_f16_e32 v0, v0, v0
630+
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
631+
; GFX9-NEXT: v_mov_b32_e32 v1, 0
632+
; GFX9-NEXT: s_setpc_b64
633+
634+
; VI: s_waitcnt
635+
; VI-NEXT: v_max_f16_e32 v0, v0, v0
636+
; VI-NEXT: v_or_b32_e32 v0, 0x7e000000, v0
637+
; VI-NEXT: v_mov_b32_e32 v1, 0x7e007e00
638+
; VI-NEXT: s_setpc_b64
639+
define <4 x half> @v_test_canonicalize_reg_undef_undef_undef_v4f16(half %val) #1 {
640+
%vec = insertelement <4 x half> undef, half %val, i32 0
641+
%canonicalized = call <4 x half> @llvm.canonicalize.v4f16(<4 x half> %vec)
642+
ret <4 x half> %canonicalized
643+
}
644+
645+
; GCN-LABEL: {{^}}v_test_canonicalize_reg_reg_undef_undef_v4f16:
646+
; GFX9: s_waitcnt
647+
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
648+
; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0
649+
; GFX9-NEXT: v_pk_max_f16 v0, v0, v0
650+
; GFX9-NEXT: v_mov_b32_e32 v1, 0
651+
; GFX9-NEXT: s_setpc_b64
652+
653+
; VI: s_waitcnt
654+
; VI-DAG: v_max_f16_e32 v0, v0, v0
655+
; VI-DAG: v_max_f16_sdwa v1, v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
656+
; VI: v_or_b32_e32 v0, v0, v1
657+
; VI-NEXT: v_mov_b32_e32 v1, 0x7e007e00
658+
; VI-NEXT: s_setpc_b64
659+
define <4 x half> @v_test_canonicalize_reg_reg_undef_undef_v4f16(half %val0, half %val1) #1 {
660+
%vec0 = insertelement <4 x half> undef, half %val0, i32 0
661+
%vec1 = insertelement <4 x half> %vec0, half %val1, i32 1
662+
%canonicalized = call <4 x half> @llvm.canonicalize.v4f16(<4 x half> %vec1)
663+
ret <4 x half> %canonicalized
664+
}
665+
666+
; GCN-LABEL: {{^}}v_test_canonicalize_reg_undef_reg_reg_v4f16:
667+
; GFX9: s_waitcnt
668+
; GFX9-NEXT: v_mov_b32_e32 [[MASK:v[0-9]+]], 0xffff
669+
; GFX9-NEXT: v_and_b32_e32 v1, [[MASK]], v1
670+
; GFX9-NEXT: v_max_f16_e32 v0, v0, v0
671+
; GFX9-NEXT: v_lshl_or_b32 v1, v2, 16, v1
672+
; GFX9-NEXT: v_and_b32_e32 v0, [[MASK]], v0
673+
; GFX9-NEXT: v_pk_max_f16 v1, v1, v1
674+
; GFX9-NEXT: s_setpc_b64
675+
676+
; VI: s_waitcnt
677+
; VI-NEXT: v_max_f16_e32 v0, v0, v0
678+
; VI-NEXT: v_max_f16_e32 v1, v1, v1
679+
; VI-NEXT: v_max_f16_sdwa v2, v2, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
680+
; VI-NEXT: v_or_b32_e32 v0, 0x7e000000, v0
681+
; VI-NEXT: v_or_b32_e32 v1, v1, v2
682+
; VI-NEXT: s_setpc_b64
683+
define <4 x half> @v_test_canonicalize_reg_undef_reg_reg_v4f16(half %val0, half %val1, half %val2) #1 {
684+
%vec0 = insertelement <4 x half> undef, half %val0, i32 0
685+
%vec1 = insertelement <4 x half> %vec0, half %val1, i32 2
686+
%vec2 = insertelement <4 x half> %vec1, half %val2, i32 3
687+
%canonicalized = call <4 x half> @llvm.canonicalize.v4f16(<4 x half> %vec2)
688+
ret <4 x half> %canonicalized
689+
}
690+
518691
attributes #0 = { nounwind readnone }
519692
attributes #1 = { nounwind }
520693
attributes #2 = { nounwind "target-features"="-fp64-fp16-denormals" }

0 commit comments

Comments
 (0)