Skip to content

Commit 88a4327

Browse files
changpengrorth
authored and committed
AMDGPU: Custom lower fptrunc vectors for f32 -> f16 (llvm#141883)
The latest ASICs support the v_cvt_pk_f16_f32 instruction. However, the current implementation of vector fptrunc lowering fully scalarizes the vectors, and the scalar conversions may not always be combined to generate the packed one. We made v2f32 -> v2f16 legal in llvm#139956. This work is an extension to handle wider vectors. Instead of fully scalarizing, we split the vector into packs (v2f32 -> v2f16) to ensure the packed conversion can always be generated.
1 parent af83ed2 commit 88a4327

File tree

4 files changed

+295
-14
lines changed

4 files changed

+295
-14
lines changed

llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1061,10 +1061,12 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
10611061
}
10621062

10631063
auto &FPTruncActions = getActionDefinitionsBuilder(G_FPTRUNC);
1064-
if (ST.hasCvtPkF16F32Inst())
1065-
FPTruncActions.legalFor({{S32, S64}, {S16, S32}, {V2S16, V2S32}});
1066-
else
1064+
if (ST.hasCvtPkF16F32Inst()) {
1065+
FPTruncActions.legalFor({{S32, S64}, {S16, S32}, {V2S16, V2S32}})
1066+
.clampMaxNumElements(0, S16, 2);
1067+
} else {
10671068
FPTruncActions.legalFor({{S32, S64}, {S16, S32}});
1069+
}
10681070
FPTruncActions.scalarize(0).lower();
10691071

10701072
getActionDefinitionsBuilder(G_FPEXT)

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 28 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -926,8 +926,11 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
926926
setOperationAction(ISD::BUILD_VECTOR, MVT::v2bf16, Legal);
927927
}
928928

929-
if (Subtarget->hasCvtPkF16F32Inst())
930-
setOperationAction(ISD::FP_ROUND, MVT::v2f16, Custom);
929+
if (Subtarget->hasCvtPkF16F32Inst()) {
930+
setOperationAction(ISD::FP_ROUND,
931+
{MVT::v2f16, MVT::v4f16, MVT::v8f16, MVT::v16f16},
932+
Custom);
933+
}
931934

932935
setTargetDAGCombine({ISD::ADD,
933936
ISD::UADDO_CARRY,
@@ -6896,14 +6899,35 @@ SDValue SITargetLowering::getFPExtOrFPRound(SelectionDAG &DAG, SDValue Op,
68966899
DAG.getTargetConstant(0, DL, MVT::i32));
68976900
}
68986901

6902+
SDValue SITargetLowering::splitFP_ROUNDVectorOp(SDValue Op,
6903+
SelectionDAG &DAG) const {
6904+
EVT DstVT = Op.getValueType();
6905+
unsigned NumElts = DstVT.getVectorNumElements();
6906+
assert(NumElts > 2 && isPowerOf2_32(NumElts));
6907+
6908+
auto [Lo, Hi] = DAG.SplitVectorOperand(Op.getNode(), 0);
6909+
6910+
SDLoc DL(Op);
6911+
unsigned Opc = Op.getOpcode();
6912+
SDValue Flags = Op.getOperand(1);
6913+
EVT HalfDstVT =
6914+
EVT::getVectorVT(*DAG.getContext(), DstVT.getScalarType(), NumElts / 2);
6915+
SDValue OpLo = DAG.getNode(Opc, DL, HalfDstVT, Lo, Flags);
6916+
SDValue OpHi = DAG.getNode(Opc, DL, HalfDstVT, Hi, Flags);
6917+
6918+
return DAG.getNode(ISD::CONCAT_VECTORS, DL, DstVT, OpLo, OpHi);
6919+
}
6920+
68996921
SDValue SITargetLowering::lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
69006922
SDValue Src = Op.getOperand(0);
69016923
EVT SrcVT = Src.getValueType();
69026924
EVT DstVT = Op.getValueType();
69036925

6904-
if (DstVT == MVT::v2f16) {
6926+
if (DstVT.isVector() && DstVT.getScalarType() == MVT::f16) {
69056927
assert(Subtarget->hasCvtPkF16F32Inst() && "support v_cvt_pk_f16_f32");
6906-
return SrcVT == MVT::v2f32 ? Op : SDValue();
6928+
if (SrcVT.getScalarType() != MVT::f32)
6929+
return SDValue();
6930+
return SrcVT == MVT::v2f32 ? Op : splitFP_ROUNDVectorOp(Op, DAG);
69076931
}
69086932

69096933
if (SrcVT.getScalarType() != MVT::f64)

llvm/lib/Target/AMDGPU/SIISelLowering.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -145,6 +145,7 @@ class SITargetLowering final : public AMDGPUTargetLowering {
145145

146146
/// Custom lowering for ISD::FP_ROUND for MVT::f16.
147147
SDValue lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const;
148+
SDValue splitFP_ROUNDVectorOp(SDValue Op, SelectionDAG &DAG) const;
148149
SDValue lowerFMINNUM_FMAXNUM(SDValue Op, SelectionDAG &DAG) const;
149150
SDValue lowerFMINIMUM_FMAXIMUM(SDValue Op, SelectionDAG &DAG) const;
150151
SDValue lowerFLDEXP(SDValue Op, SelectionDAG &DAG) const;

llvm/test/CodeGen/AMDGPU/fptrunc.v2f16.no.fast.math.ll

Lines changed: 261 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -12,18 +12,272 @@ define <2 x half> @v_test_cvt_v2f32_v2f16(<2 x float> %src) {
1212
ret <2 x half> %res
1313
}
1414

15-
define half @fptrunc_v2f32_v2f16_then_extract(<2 x float> %src) {
16-
; GFX950-LABEL: fptrunc_v2f32_v2f16_then_extract:
15+
define <3 x half> @v_test_cvt_v3f32_v3f16(<3 x float> %src) {
16+
; GFX950-LABEL: v_test_cvt_v3f32_v3f16:
1717
; GFX950: ; %bb.0:
1818
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
19+
; GFX950-NEXT: v_cvt_f16_f32_e32 v2, v2
1920
; GFX950-NEXT: v_cvt_pk_f16_f32 v0, v0, v1
20-
; GFX950-NEXT: v_add_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
21+
; GFX950-NEXT: v_mov_b32_e32 v1, v2
22+
; GFX950-NEXT: s_setpc_b64 s[30:31]
23+
%res = fptrunc <3 x float> %src to <3 x half>
24+
ret <3 x half> %res
25+
}
26+
27+
define <4 x half> @v_test_cvt_v4f32_v4f16(<4 x float> %src) {
28+
; GFX950-LABEL: v_test_cvt_v4f32_v4f16:
29+
; GFX950: ; %bb.0:
30+
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
31+
; GFX950-NEXT: v_cvt_pk_f16_f32 v0, v0, v1
32+
; GFX950-NEXT: v_cvt_pk_f16_f32 v1, v2, v3
33+
; GFX950-NEXT: s_setpc_b64 s[30:31]
34+
%res = fptrunc <4 x float> %src to <4 x half>
35+
ret <4 x half> %res
36+
}
37+
38+
define <8 x half> @v_test_cvt_v8f32_v2f16(<8 x float> %src) {
39+
; GFX950-LABEL: v_test_cvt_v8f32_v2f16:
40+
; GFX950: ; %bb.0:
41+
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
42+
; GFX950-NEXT: v_cvt_pk_f16_f32 v0, v0, v1
43+
; GFX950-NEXT: v_cvt_pk_f16_f32 v1, v2, v3
44+
; GFX950-NEXT: v_cvt_pk_f16_f32 v2, v4, v5
45+
; GFX950-NEXT: v_cvt_pk_f16_f32 v3, v6, v7
46+
; GFX950-NEXT: s_setpc_b64 s[30:31]
47+
%res = fptrunc <8 x float> %src to <8 x half>
48+
ret <8 x half> %res
49+
}
50+
51+
define <16 x half> @v_test_cvt_v16f32_v16f16(<16 x float> %src) {
52+
; GFX950-LABEL: v_test_cvt_v16f32_v16f16:
53+
; GFX950: ; %bb.0:
54+
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
55+
; GFX950-NEXT: v_cvt_pk_f16_f32 v0, v0, v1
56+
; GFX950-NEXT: v_cvt_pk_f16_f32 v1, v2, v3
57+
; GFX950-NEXT: v_cvt_pk_f16_f32 v2, v4, v5
58+
; GFX950-NEXT: v_cvt_pk_f16_f32 v3, v6, v7
59+
; GFX950-NEXT: v_cvt_pk_f16_f32 v4, v8, v9
60+
; GFX950-NEXT: v_cvt_pk_f16_f32 v5, v10, v11
61+
; GFX950-NEXT: v_cvt_pk_f16_f32 v6, v12, v13
62+
; GFX950-NEXT: v_cvt_pk_f16_f32 v7, v14, v15
63+
; GFX950-NEXT: s_setpc_b64 s[30:31]
64+
%res = fptrunc <16 x float> %src to <16 x half>
65+
ret <16 x half> %res
66+
}
67+
68+
define half @fptrunc_v2f32_v2f16_extract_uses(<2 x float> %src) {
69+
; GFX950-LABEL: fptrunc_v2f32_v2f16_extract_uses:
70+
; GFX950: ; %bb.0:
71+
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
72+
; GFX950-NEXT: v_cvt_pk_f16_f32 v0, v0, v1
73+
; GFX950-NEXT: v_add_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2174
; GFX950-NEXT: s_setpc_b64 s[30:31]
2275
%vec_half = fptrunc <2 x float> %src to <2 x half>
23-
%first = extractelement <2 x half> %vec_half, i64 1
24-
%second = extractelement <2 x half> %vec_half, i64 0
25-
%res = fadd half %first, %second
26-
ret half %res
76+
%f0 = extractelement <2 x half> %vec_half, i64 0
77+
%f1 = extractelement <2 x half> %vec_half, i64 1
78+
%rslt = fadd half %f0, %f1
79+
ret half %rslt
80+
}
81+
82+
define half @fptrunc_v3f32_v3f16_extract_uses(<3 x float> %vec_float) {
83+
; GFX950-SDAG-LABEL: fptrunc_v3f32_v3f16_extract_uses:
84+
; GFX950-SDAG: ; %bb.0:
85+
; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
86+
; GFX950-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1
87+
; GFX950-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
88+
; GFX950-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2
89+
; GFX950-SDAG-NEXT: v_add_f16_e32 v0, v0, v1
90+
; GFX950-SDAG-NEXT: v_add_f16_e32 v0, v2, v0
91+
; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31]
92+
;
93+
; GFX950-GISEL-LABEL: fptrunc_v3f32_v3f16_extract_uses:
94+
; GFX950-GISEL: ; %bb.0:
95+
; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
96+
; GFX950-GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2
97+
; GFX950-GISEL-NEXT: v_cvt_pk_f16_f32 v0, v0, v1
98+
; GFX950-GISEL-NEXT: v_add_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
99+
; GFX950-GISEL-NEXT: v_add_f16_e32 v0, v2, v0
100+
; GFX950-GISEL-NEXT: s_setpc_b64 s[30:31]
101+
%vec_half = fptrunc <3 x float> %vec_float to <3 x half>
102+
%f0 = extractelement <3 x half> %vec_half, i64 0
103+
%f1 = extractelement <3 x half> %vec_half, i64 1
104+
%f2 = extractelement <3 x half> %vec_half, i64 2
105+
%sum0 = fadd half %f0, %f1
106+
%rslt = fadd half %f2, %sum0
107+
ret half %rslt
108+
}
109+
110+
define half @fptrunc_v4f32_v4f16_extract_uses(<4 x float> %vec_float) {
111+
; GFX950-SDAG-LABEL: fptrunc_v4f32_v4f16_extract_uses:
112+
; GFX950-SDAG: ; %bb.0:
113+
; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
114+
; GFX950-SDAG-NEXT: v_cvt_pk_f16_f32 v2, v2, v3
115+
; GFX950-SDAG-NEXT: v_cvt_pk_f16_f32 v0, v0, v1
116+
; GFX950-SDAG-NEXT: v_add_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
117+
; GFX950-SDAG-NEXT: v_add_f16_sdwa v1, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
118+
; GFX950-SDAG-NEXT: v_add_f16_e32 v0, v0, v1
119+
; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31]
120+
;
121+
; GFX950-GISEL-LABEL: fptrunc_v4f32_v4f16_extract_uses:
122+
; GFX950-GISEL: ; %bb.0:
123+
; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
124+
; GFX950-GISEL-NEXT: v_cvt_pk_f16_f32 v0, v0, v1
125+
; GFX950-GISEL-NEXT: v_cvt_pk_f16_f32 v1, v2, v3
126+
; GFX950-GISEL-NEXT: v_add_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
127+
; GFX950-GISEL-NEXT: v_add_f16_sdwa v1, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
128+
; GFX950-GISEL-NEXT: v_add_f16_e32 v0, v0, v1
129+
; GFX950-GISEL-NEXT: s_setpc_b64 s[30:31]
130+
%vec_half = fptrunc <4 x float> %vec_float to <4 x half>
131+
%f0 = extractelement <4 x half> %vec_half, i64 0
132+
%f1 = extractelement <4 x half> %vec_half, i64 1
133+
%f2 = extractelement <4 x half> %vec_half, i64 2
134+
%f3 = extractelement <4 x half> %vec_half, i64 3
135+
%sum0 = fadd half %f0, %f1
136+
%sum1 = fadd half %f2, %f3
137+
%rslt = fadd half %sum0, %sum1
138+
ret half %rslt
139+
}
140+
141+
define half @fptrunc_v8f32_v8f16_extract_uses(<8 x float> %vec_float) {
142+
; GFX950-SDAG-LABEL: fptrunc_v8f32_v8f16_extract_uses:
143+
; GFX950-SDAG: ; %bb.0:
144+
; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
145+
; GFX950-SDAG-NEXT: v_cvt_pk_f16_f32 v6, v6, v7
146+
; GFX950-SDAG-NEXT: v_cvt_pk_f16_f32 v4, v4, v5
147+
; GFX950-SDAG-NEXT: v_cvt_pk_f16_f32 v2, v2, v3
148+
; GFX950-SDAG-NEXT: v_cvt_pk_f16_f32 v0, v0, v1
149+
; GFX950-SDAG-NEXT: v_add_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
150+
; GFX950-SDAG-NEXT: v_add_f16_sdwa v1, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
151+
; GFX950-SDAG-NEXT: v_add_f16_sdwa v2, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
152+
; GFX950-SDAG-NEXT: v_add_f16_sdwa v3, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
153+
; GFX950-SDAG-NEXT: v_add_f16_e32 v0, v0, v1
154+
; GFX950-SDAG-NEXT: v_add_f16_e32 v1, v2, v3
155+
; GFX950-SDAG-NEXT: v_add_f16_e32 v0, v0, v1
156+
; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31]
157+
;
158+
; GFX950-GISEL-LABEL: fptrunc_v8f32_v8f16_extract_uses:
159+
; GFX950-GISEL: ; %bb.0:
160+
; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
161+
; GFX950-GISEL-NEXT: v_cvt_pk_f16_f32 v0, v0, v1
162+
; GFX950-GISEL-NEXT: v_cvt_pk_f16_f32 v1, v2, v3
163+
; GFX950-GISEL-NEXT: v_cvt_pk_f16_f32 v2, v4, v5
164+
; GFX950-GISEL-NEXT: v_cvt_pk_f16_f32 v3, v6, v7
165+
; GFX950-GISEL-NEXT: v_add_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
166+
; GFX950-GISEL-NEXT: v_add_f16_sdwa v1, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
167+
; GFX950-GISEL-NEXT: v_add_f16_sdwa v2, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
168+
; GFX950-GISEL-NEXT: v_add_f16_sdwa v3, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
169+
; GFX950-GISEL-NEXT: v_add_f16_e32 v0, v0, v1
170+
; GFX950-GISEL-NEXT: v_add_f16_e32 v1, v2, v3
171+
; GFX950-GISEL-NEXT: v_add_f16_e32 v0, v0, v1
172+
; GFX950-GISEL-NEXT: s_setpc_b64 s[30:31]
173+
%vec_half = fptrunc <8 x float> %vec_float to <8 x half>
174+
%f0 = extractelement <8 x half> %vec_half, i64 0
175+
%f1 = extractelement <8 x half> %vec_half, i64 1
176+
%f2 = extractelement <8 x half> %vec_half, i64 2
177+
%f3 = extractelement <8 x half> %vec_half, i64 3
178+
%f4 = extractelement <8 x half> %vec_half, i64 4
179+
%f5 = extractelement <8 x half> %vec_half, i64 5
180+
%f6 = extractelement <8 x half> %vec_half, i64 6
181+
%f7 = extractelement <8 x half> %vec_half, i64 7
182+
%sum0 = fadd half %f0, %f1
183+
%sum1 = fadd half %f2, %f3
184+
%sum2 = fadd half %f4, %f5
185+
%sum3 = fadd half %f6, %f7
186+
%sum4 = fadd half %sum0, %sum1
187+
%sum5 = fadd half %sum2, %sum3
188+
%rslt = fadd half %sum4, %sum5
189+
ret half %rslt
190+
}
191+
192+
define half @fptrunc_v16f32_v16f16_extract_uses(<16 x float> %vec_float) {
193+
; GFX950-SDAG-LABEL: fptrunc_v16f32_v16f16_extract_uses:
194+
; GFX950-SDAG: ; %bb.0:
195+
; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
196+
; GFX950-SDAG-NEXT: v_cvt_pk_f16_f32 v14, v14, v15
197+
; GFX950-SDAG-NEXT: v_cvt_pk_f16_f32 v12, v12, v13
198+
; GFX950-SDAG-NEXT: v_cvt_pk_f16_f32 v10, v10, v11
199+
; GFX950-SDAG-NEXT: v_cvt_pk_f16_f32 v8, v8, v9
200+
; GFX950-SDAG-NEXT: v_cvt_pk_f16_f32 v6, v6, v7
201+
; GFX950-SDAG-NEXT: v_cvt_pk_f16_f32 v4, v4, v5
202+
; GFX950-SDAG-NEXT: v_cvt_pk_f16_f32 v2, v2, v3
203+
; GFX950-SDAG-NEXT: v_cvt_pk_f16_f32 v0, v0, v1
204+
; GFX950-SDAG-NEXT: v_add_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
205+
; GFX950-SDAG-NEXT: v_add_f16_sdwa v1, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
206+
; GFX950-SDAG-NEXT: v_add_f16_sdwa v2, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
207+
; GFX950-SDAG-NEXT: v_add_f16_sdwa v3, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
208+
; GFX950-SDAG-NEXT: v_add_f16_sdwa v4, v8, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
209+
; GFX950-SDAG-NEXT: v_add_f16_sdwa v5, v10, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
210+
; GFX950-SDAG-NEXT: v_add_f16_sdwa v6, v12, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
211+
; GFX950-SDAG-NEXT: v_add_f16_sdwa v7, v14, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
212+
; GFX950-SDAG-NEXT: v_add_f16_e32 v0, v0, v1
213+
; GFX950-SDAG-NEXT: v_add_f16_e32 v1, v2, v3
214+
; GFX950-SDAG-NEXT: v_add_f16_e32 v2, v4, v5
215+
; GFX950-SDAG-NEXT: v_add_f16_e32 v3, v6, v7
216+
; GFX950-SDAG-NEXT: v_add_f16_e32 v0, v0, v1
217+
; GFX950-SDAG-NEXT: v_add_f16_e32 v1, v2, v3
218+
; GFX950-SDAG-NEXT: v_add_f16_e32 v0, v0, v1
219+
; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31]
220+
;
221+
; GFX950-GISEL-LABEL: fptrunc_v16f32_v16f16_extract_uses:
222+
; GFX950-GISEL: ; %bb.0:
223+
; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
224+
; GFX950-GISEL-NEXT: v_cvt_pk_f16_f32 v0, v0, v1
225+
; GFX950-GISEL-NEXT: v_cvt_pk_f16_f32 v1, v2, v3
226+
; GFX950-GISEL-NEXT: v_cvt_pk_f16_f32 v2, v4, v5
227+
; GFX950-GISEL-NEXT: v_cvt_pk_f16_f32 v3, v6, v7
228+
; GFX950-GISEL-NEXT: v_cvt_pk_f16_f32 v4, v8, v9
229+
; GFX950-GISEL-NEXT: v_cvt_pk_f16_f32 v5, v10, v11
230+
; GFX950-GISEL-NEXT: v_cvt_pk_f16_f32 v6, v12, v13
231+
; GFX950-GISEL-NEXT: v_cvt_pk_f16_f32 v7, v14, v15
232+
; GFX950-GISEL-NEXT: v_add_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
233+
; GFX950-GISEL-NEXT: v_add_f16_sdwa v1, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
234+
; GFX950-GISEL-NEXT: v_add_f16_sdwa v2, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
235+
; GFX950-GISEL-NEXT: v_add_f16_sdwa v3, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
236+
; GFX950-GISEL-NEXT: v_add_f16_sdwa v4, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
237+
; GFX950-GISEL-NEXT: v_add_f16_sdwa v5, v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
238+
; GFX950-GISEL-NEXT: v_add_f16_sdwa v6, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
239+
; GFX950-GISEL-NEXT: v_add_f16_sdwa v7, v7, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
240+
; GFX950-GISEL-NEXT: v_add_f16_e32 v0, v0, v1
241+
; GFX950-GISEL-NEXT: v_add_f16_e32 v1, v2, v3
242+
; GFX950-GISEL-NEXT: v_add_f16_e32 v2, v4, v5
243+
; GFX950-GISEL-NEXT: v_add_f16_e32 v3, v6, v7
244+
; GFX950-GISEL-NEXT: v_add_f16_e32 v0, v0, v1
245+
; GFX950-GISEL-NEXT: v_add_f16_e32 v1, v2, v3
246+
; GFX950-GISEL-NEXT: v_add_f16_e32 v0, v0, v1
247+
; GFX950-GISEL-NEXT: s_setpc_b64 s[30:31]
248+
%vec_half = fptrunc <16 x float> %vec_float to <16 x half>
249+
%f0 = extractelement <16 x half> %vec_half, i64 0
250+
%f1 = extractelement <16 x half> %vec_half, i64 1
251+
%f2 = extractelement <16 x half> %vec_half, i64 2
252+
%f3 = extractelement <16 x half> %vec_half, i64 3
253+
%f4 = extractelement <16 x half> %vec_half, i64 4
254+
%f5 = extractelement <16 x half> %vec_half, i64 5
255+
%f6 = extractelement <16 x half> %vec_half, i64 6
256+
%f7 = extractelement <16 x half> %vec_half, i64 7
257+
%f8 = extractelement <16 x half> %vec_half, i64 8
258+
%f9 = extractelement <16 x half> %vec_half, i64 9
259+
%f10 = extractelement <16 x half> %vec_half, i64 10
260+
%f11 = extractelement <16 x half> %vec_half, i64 11
261+
%f12 = extractelement <16 x half> %vec_half, i64 12
262+
%f13 = extractelement <16 x half> %vec_half, i64 13
263+
%f14 = extractelement <16 x half> %vec_half, i64 14
264+
%f15 = extractelement <16 x half> %vec_half, i64 15
265+
%sum0 = fadd half %f0, %f1
266+
%sum1 = fadd half %f2, %f3
267+
%sum2 = fadd half %f4, %f5
268+
%sum3 = fadd half %f6, %f7
269+
%sum4 = fadd half %f8, %f9
270+
%sum5 = fadd half %f10, %f11
271+
%sum6 = fadd half %f12, %f13
272+
%sum7 = fadd half %f14, %f15
273+
%sum8 = fadd half %sum0, %sum1
274+
%sum9 = fadd half %sum2, %sum3
275+
%sum10 = fadd half %sum4, %sum5
276+
%sum11 = fadd half %sum6, %sum7
277+
%sum12 = fadd half %sum8, %sum9
278+
%sum13 = fadd half %sum10, %sum11
279+
%rslt = fadd half %sum12, %sum13
280+
ret half %rslt
27281
}
28282

29283
define <2 x half> @v_test_cvt_v2f64_v2f16(<2 x double> %src) {

0 commit comments

Comments
 (0)