Skip to content

Commit b91936a

Browse files
authored
AMDGPU: Combine nnan fminimum/fmaximum to fminnum_ieee/fmaxnum_ieee (#142217)
This improves codegen for gfx950, where fminimum/fmaximum are legal through the 3-operand fminimum3/fmaximum3 forms and so may carry an additional encoding cost.
1 parent 5f841a6 commit b91936a

File tree

5 files changed

+203
-420
lines changed

5 files changed

+203
-420
lines changed

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13880,6 +13880,17 @@ SDValue SITargetLowering::performMinMaxCombine(SDNode *N,
1388013880
return Res;
1388113881
}
1388213882

13883+
// Prefer fminnum_ieee over fminimum. For gfx950, minimum/maximum are legal
13884+
// for some types, but at a higher cost since it's implemented with a 3
13885+
// operand form.
13886+
const SDNodeFlags Flags = N->getFlags();
13887+
if ((Opc == ISD::FMINIMUM || Opc == ISD::FMAXIMUM) &&
13888+
!Subtarget->hasIEEEMinMax() && Flags.hasNoNaNs()) {
13889+
unsigned NewOpc =
13890+
Opc == ISD::FMINIMUM ? ISD::FMINNUM_IEEE : ISD::FMAXNUM_IEEE;
13891+
return DAG.getNode(NewOpc, SDLoc(N), VT, Op0, Op1, Flags);
13892+
}
13893+
1388313894
return SDValue();
1388413895
}
1388513896

llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll

Lines changed: 44 additions & 97 deletions
Original file line numberDiff line numberDiff line change
@@ -113,17 +113,11 @@ define half @v_maximum_f16__nnan(half %src0, half %src1) {
113113
; GFX8-NEXT: v_max_f16_e32 v0, v0, v1
114114
; GFX8-NEXT: s_setpc_b64 s[30:31]
115115
;
116-
; GFX900-LABEL: v_maximum_f16__nnan:
117-
; GFX900: ; %bb.0:
118-
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
119-
; GFX900-NEXT: v_max_f16_e32 v0, v0, v1
120-
; GFX900-NEXT: s_setpc_b64 s[30:31]
121-
;
122-
; GFX950-LABEL: v_maximum_f16__nnan:
123-
; GFX950: ; %bb.0:
124-
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
125-
; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v1, v1
126-
; GFX950-NEXT: s_setpc_b64 s[30:31]
116+
; GFX9-LABEL: v_maximum_f16__nnan:
117+
; GFX9: ; %bb.0:
118+
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
119+
; GFX9-NEXT: v_max_f16_e32 v0, v0, v1
120+
; GFX9-NEXT: s_setpc_b64 s[30:31]
127121
;
128122
; GFX10-LABEL: v_maximum_f16__nnan:
129123
; GFX10: ; %bb.0:
@@ -270,17 +264,11 @@ define half @v_maximum_f16__nnan_nsz(half %src0, half %src1) {
270264
; GFX8-NEXT: v_max_f16_e32 v0, v0, v1
271265
; GFX8-NEXT: s_setpc_b64 s[30:31]
272266
;
273-
; GFX900-LABEL: v_maximum_f16__nnan_nsz:
274-
; GFX900: ; %bb.0:
275-
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
276-
; GFX900-NEXT: v_max_f16_e32 v0, v0, v1
277-
; GFX900-NEXT: s_setpc_b64 s[30:31]
278-
;
279-
; GFX950-LABEL: v_maximum_f16__nnan_nsz:
280-
; GFX950: ; %bb.0:
281-
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
282-
; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v1, v1
283-
; GFX950-NEXT: s_setpc_b64 s[30:31]
267+
; GFX9-LABEL: v_maximum_f16__nnan_nsz:
268+
; GFX9: ; %bb.0:
269+
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
270+
; GFX9-NEXT: v_max_f16_e32 v0, v0, v1
271+
; GFX9-NEXT: s_setpc_b64 s[30:31]
284272
;
285273
; GFX10-LABEL: v_maximum_f16__nnan_nsz:
286274
; GFX10: ; %bb.0:
@@ -771,17 +759,11 @@ define <2 x half> @v_maximum_v2f16__nnan(<2 x half> %src0, <2 x half> %src1) {
771759
; GFX8-NEXT: v_or_b32_e32 v0, v0, v2
772760
; GFX8-NEXT: s_setpc_b64 s[30:31]
773761
;
774-
; GFX900-LABEL: v_maximum_v2f16__nnan:
775-
; GFX900: ; %bb.0:
776-
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
777-
; GFX900-NEXT: v_pk_max_f16 v0, v0, v1
778-
; GFX900-NEXT: s_setpc_b64 s[30:31]
779-
;
780-
; GFX950-LABEL: v_maximum_v2f16__nnan:
781-
; GFX950: ; %bb.0:
782-
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
783-
; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v1, v1
784-
; GFX950-NEXT: s_setpc_b64 s[30:31]
762+
; GFX9-LABEL: v_maximum_v2f16__nnan:
763+
; GFX9: ; %bb.0:
764+
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
765+
; GFX9-NEXT: v_pk_max_f16 v0, v0, v1
766+
; GFX9-NEXT: s_setpc_b64 s[30:31]
785767
;
786768
; GFX10-LABEL: v_maximum_v2f16__nnan:
787769
; GFX10: ; %bb.0:
@@ -939,17 +921,11 @@ define <2 x half> @v_maximum_v2f16__nnan_nsz(<2 x half> %src0, <2 x half> %src1)
939921
; GFX8-NEXT: v_or_b32_e32 v0, v0, v2
940922
; GFX8-NEXT: s_setpc_b64 s[30:31]
941923
;
942-
; GFX900-LABEL: v_maximum_v2f16__nnan_nsz:
943-
; GFX900: ; %bb.0:
944-
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
945-
; GFX900-NEXT: v_pk_max_f16 v0, v0, v1
946-
; GFX900-NEXT: s_setpc_b64 s[30:31]
947-
;
948-
; GFX950-LABEL: v_maximum_v2f16__nnan_nsz:
949-
; GFX950: ; %bb.0:
950-
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
951-
; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v1, v1
952-
; GFX950-NEXT: s_setpc_b64 s[30:31]
924+
; GFX9-LABEL: v_maximum_v2f16__nnan_nsz:
925+
; GFX9: ; %bb.0:
926+
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
927+
; GFX9-NEXT: v_pk_max_f16 v0, v0, v1
928+
; GFX9-NEXT: s_setpc_b64 s[30:31]
953929
;
954930
; GFX10-LABEL: v_maximum_v2f16__nnan_nsz:
955931
; GFX10: ; %bb.0:
@@ -1296,19 +1272,12 @@ define <3 x half> @v_maximum_v3f16__nnan(<3 x half> %src0, <3 x half> %src1) {
12961272
; GFX8-NEXT: v_or_b32_e32 v0, v0, v4
12971273
; GFX8-NEXT: s_setpc_b64 s[30:31]
12981274
;
1299-
; GFX900-LABEL: v_maximum_v3f16__nnan:
1300-
; GFX900: ; %bb.0:
1301-
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1302-
; GFX900-NEXT: v_pk_max_f16 v0, v0, v2
1303-
; GFX900-NEXT: v_pk_max_f16 v1, v1, v3
1304-
; GFX900-NEXT: s_setpc_b64 s[30:31]
1305-
;
1306-
; GFX950-LABEL: v_maximum_v3f16__nnan:
1307-
; GFX950: ; %bb.0:
1308-
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1309-
; GFX950-NEXT: v_pk_maximum3_f16 v1, v1, v3, v3
1310-
; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v2, v2
1311-
; GFX950-NEXT: s_setpc_b64 s[30:31]
1275+
; GFX9-LABEL: v_maximum_v3f16__nnan:
1276+
; GFX9: ; %bb.0:
1277+
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1278+
; GFX9-NEXT: v_pk_max_f16 v1, v1, v3
1279+
; GFX9-NEXT: v_pk_max_f16 v0, v0, v2
1280+
; GFX9-NEXT: s_setpc_b64 s[30:31]
13121281
;
13131282
; GFX10-LABEL: v_maximum_v3f16__nnan:
13141283
; GFX10: ; %bb.0:
@@ -1501,19 +1470,12 @@ define <3 x half> @v_maximum_v3f16__nnan_nsz(<3 x half> %src0, <3 x half> %src1)
15011470
; GFX8-NEXT: v_or_b32_e32 v0, v0, v4
15021471
; GFX8-NEXT: s_setpc_b64 s[30:31]
15031472
;
1504-
; GFX900-LABEL: v_maximum_v3f16__nnan_nsz:
1505-
; GFX900: ; %bb.0:
1506-
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1507-
; GFX900-NEXT: v_pk_max_f16 v0, v0, v2
1508-
; GFX900-NEXT: v_pk_max_f16 v1, v1, v3
1509-
; GFX900-NEXT: s_setpc_b64 s[30:31]
1510-
;
1511-
; GFX950-LABEL: v_maximum_v3f16__nnan_nsz:
1512-
; GFX950: ; %bb.0:
1513-
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1514-
; GFX950-NEXT: v_pk_maximum3_f16 v1, v1, v3, v3
1515-
; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v2, v2
1516-
; GFX950-NEXT: s_setpc_b64 s[30:31]
1473+
; GFX9-LABEL: v_maximum_v3f16__nnan_nsz:
1474+
; GFX9: ; %bb.0:
1475+
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1476+
; GFX9-NEXT: v_pk_max_f16 v1, v1, v3
1477+
; GFX9-NEXT: v_pk_max_f16 v0, v0, v2
1478+
; GFX9-NEXT: s_setpc_b64 s[30:31]
15171479
;
15181480
; GFX10-LABEL: v_maximum_v3f16__nnan_nsz:
15191481
; GFX10: ; %bb.0:
@@ -1741,19 +1703,12 @@ define <4 x half> @v_maximum_v4f16__nnan(<4 x half> %src0, <4 x half> %src1) {
17411703
; GFX8-NEXT: v_or_b32_e32 v1, v1, v4
17421704
; GFX8-NEXT: s_setpc_b64 s[30:31]
17431705
;
1744-
; GFX900-LABEL: v_maximum_v4f16__nnan:
1745-
; GFX900: ; %bb.0:
1746-
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1747-
; GFX900-NEXT: v_pk_max_f16 v0, v0, v2
1748-
; GFX900-NEXT: v_pk_max_f16 v1, v1, v3
1749-
; GFX900-NEXT: s_setpc_b64 s[30:31]
1750-
;
1751-
; GFX950-LABEL: v_maximum_v4f16__nnan:
1752-
; GFX950: ; %bb.0:
1753-
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1754-
; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v2, v2
1755-
; GFX950-NEXT: v_pk_maximum3_f16 v1, v1, v3, v3
1756-
; GFX950-NEXT: s_setpc_b64 s[30:31]
1706+
; GFX9-LABEL: v_maximum_v4f16__nnan:
1707+
; GFX9: ; %bb.0:
1708+
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1709+
; GFX9-NEXT: v_pk_max_f16 v0, v0, v2
1710+
; GFX9-NEXT: v_pk_max_f16 v1, v1, v3
1711+
; GFX9-NEXT: s_setpc_b64 s[30:31]
17571712
;
17581713
; GFX10-LABEL: v_maximum_v4f16__nnan:
17591714
; GFX10: ; %bb.0:
@@ -1981,19 +1936,12 @@ define <4 x half> @v_maximum_v4f16__nnan_nsz(<4 x half> %src0, <4 x half> %src1)
19811936
; GFX8-NEXT: v_or_b32_e32 v1, v1, v4
19821937
; GFX8-NEXT: s_setpc_b64 s[30:31]
19831938
;
1984-
; GFX900-LABEL: v_maximum_v4f16__nnan_nsz:
1985-
; GFX900: ; %bb.0:
1986-
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1987-
; GFX900-NEXT: v_pk_max_f16 v0, v0, v2
1988-
; GFX900-NEXT: v_pk_max_f16 v1, v1, v3
1989-
; GFX900-NEXT: s_setpc_b64 s[30:31]
1990-
;
1991-
; GFX950-LABEL: v_maximum_v4f16__nnan_nsz:
1992-
; GFX950: ; %bb.0:
1993-
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1994-
; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v2, v2
1995-
; GFX950-NEXT: v_pk_maximum3_f16 v1, v1, v3, v3
1996-
; GFX950-NEXT: s_setpc_b64 s[30:31]
1939+
; GFX9-LABEL: v_maximum_v4f16__nnan_nsz:
1940+
; GFX9: ; %bb.0:
1941+
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1942+
; GFX9-NEXT: v_pk_max_f16 v0, v0, v2
1943+
; GFX9-NEXT: v_pk_max_f16 v1, v1, v3
1944+
; GFX9-NEXT: s_setpc_b64 s[30:31]
19971945
;
19981946
; GFX10-LABEL: v_maximum_v4f16__nnan_nsz:
19991947
; GFX10: ; %bb.0:
@@ -2788,4 +2736,3 @@ define <16 x half> @v_maximum_v16f16(<16 x half> %src0, <16 x half> %src1) {
27882736
}
27892737
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
27902738
; GCN: {{.*}}
2791-
; GFX9: {{.*}}

0 commit comments

Comments
 (0)