-
Notifications
You must be signed in to change notification settings - Fork 13.1k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[SDAG] Handle insert_subvector in isKnownNeverNaN #131989
base: main
Are you sure you want to change the base?
Conversation
Propagate nnan across insert_subvector.
@llvm/pr-subscribers-backend-amdgpu Author: Jim Lin (tclin914) ChangesPropagate nnan across insert_subvector. Full diff: https://github.com/llvm/llvm-project/pull/131989.diff 5 Files Affected:
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index d1f92c9ef00e9..4b3dd5e0d6975 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -5625,6 +5625,9 @@ bool SelectionDAG::isKnownNeverNaN(SDValue Op, bool SNaN, unsigned Depth) const
(SNaN && !C->getValueAPF().isSignaling());
}
+ if (Op.isUndef())
+ return true;
+
unsigned Opcode = Op.getOpcode();
switch (Opcode) {
case ISD::FADD:
@@ -5727,6 +5730,9 @@ bool SelectionDAG::isKnownNeverNaN(SDValue Op, bool SNaN, unsigned Depth) const
case ISD::EXTRACT_SUBVECTOR: {
return isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1);
}
+ case ISD::INSERT_SUBVECTOR:
+ return isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1) &&
+ isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1);
case ISD::BUILD_VECTOR: {
for (const SDValue &Opnd : Op->ops())
if (!isKnownNeverNaN(Opnd, SNaN, Depth + 1))
diff --git a/llvm/test/CodeGen/AMDGPU/clamp.ll b/llvm/test/CodeGen/AMDGPU/clamp.ll
index 17c84d7371de1..a08228fc919b8 100644
--- a/llvm/test/CodeGen/AMDGPU/clamp.ll
+++ b/llvm/test/CodeGen/AMDGPU/clamp.ll
@@ -2986,14 +2986,14 @@ define amdgpu_kernel void @v_clamp_v2f16_undef_elt(ptr addrspace(1) %out, ptr ad
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
-; GFX6-NEXT: v_mov_b32_e32 v4, 0x7fc00000
; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7]
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v2
-; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GFX6-NEXT: v_max_f32_e32 v2, 0x7fc00000, v2
-; GFX6-NEXT: v_med3_f32 v3, v3, 0, v4
+; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX6-NEXT: v_max_f32_e32 v3, 0, v3
+; GFX6-NEXT: v_max_f32_e32 v2, s0, v2
+; GFX6-NEXT: v_min_f32_e32 v3, s0, v3
; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX6-NEXT: v_min_f32_e32 v2, 1.0, v2
; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
@@ -3006,20 +3006,20 @@ define amdgpu_kernel void @v_clamp_v2f16_undef_elt(ptr addrspace(1) %out, ptr ad
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; GFX8-NEXT: v_mov_b32_e32 v4, 0x7e00
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT: v_mov_b32_e32 v4, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT: v_max_f16_e32 v3, v3, v3
; GFX8-NEXT: v_max_f16_e32 v2, 0, v2
-; GFX8-NEXT: v_max_f16_e32 v3, 0x7e00, v3
+; GFX8-NEXT: v_max_f16_e32 v3, s0, v3
; GFX8-NEXT: v_min_f16_e32 v3, 1.0, v3
; GFX8-NEXT: v_min_f16_sdwa v2, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX8-NEXT: v_or_b32_e32 v2, v3, v2
@@ -3747,16 +3747,16 @@ define amdgpu_kernel void @v_clamp_v2f16_undef_limit_elts0(ptr addrspace(1) %out
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
-; GFX6-NEXT: v_mov_b32_e32 v4, 0x7fc00000
; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7]
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v2
; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3
; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GFX6-NEXT: v_max_f32_e32 v3, 0x7fc00000, v3
+; GFX6-NEXT: v_max_f32_e32 v3, s0, v3
+; GFX6-NEXT: v_max_f32_e32 v2, 0, v2
; GFX6-NEXT: v_min_f32_e32 v3, 1.0, v3
; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX6-NEXT: v_med3_f32 v2, v2, 0, v4
+; GFX6-NEXT: v_min_f32_e32 v2, s0, v2
; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX6-NEXT: v_or_b32_e32 v2, v2, v3
@@ -3779,9 +3779,9 @@ define amdgpu_kernel void @v_clamp_v2f16_undef_limit_elts0(ptr addrspace(1) %out
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT: v_max_f16_e32 v3, v3, v3
-; GFX8-NEXT: v_max_f16_e32 v2, 0x7e00, v2
+; GFX8-NEXT: v_max_f16_e32 v2, s0, v2
; GFX8-NEXT: v_max_f16_e32 v3, 0, v3
-; GFX8-NEXT: v_min_f16_e32 v3, 0x7e00, v3
+; GFX8-NEXT: v_min_f16_e32 v3, s0, v3
; GFX8-NEXT: v_min_f16_sdwa v2, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX8-NEXT: v_or_b32_e32 v2, v3, v2
; GFX8-NEXT: flat_store_dword v[0:1], v2
@@ -3845,14 +3845,14 @@ define amdgpu_kernel void @v_clamp_v2f16_undef_limit_elts1(ptr addrspace(1) %out
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
-; GFX6-NEXT: v_mov_b32_e32 v4, 0x7fc00000
; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7]
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v2
-; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GFX6-NEXT: v_max_f32_e32 v2, 0x7fc00000, v2
-; GFX6-NEXT: v_med3_f32 v3, v3, 0, v4
+; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX6-NEXT: v_max_f32_e32 v3, 0, v3
+; GFX6-NEXT: v_max_f32_e32 v2, s0, v2
+; GFX6-NEXT: v_min_f32_e32 v3, s0, v3
; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX6-NEXT: v_min_f32_e32 v2, 1.0, v2
; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
@@ -3865,20 +3865,20 @@ define amdgpu_kernel void @v_clamp_v2f16_undef_limit_elts1(ptr addrspace(1) %out
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; GFX8-NEXT: v_mov_b32_e32 v4, 0x7e00
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT: v_mov_b32_e32 v4, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT: v_max_f16_e32 v3, v3, v3
; GFX8-NEXT: v_max_f16_e32 v2, 0, v2
-; GFX8-NEXT: v_max_f16_e32 v3, 0x7e00, v3
+; GFX8-NEXT: v_max_f16_e32 v3, s0, v3
; GFX8-NEXT: v_min_f16_e32 v3, 1.0, v3
; GFX8-NEXT: v_min_f16_sdwa v2, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX8-NEXT: v_or_b32_e32 v2, v3, v2
diff --git a/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll b/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll
index 15cb404a3840a..7068ef9f99d73 100644
--- a/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll
+++ b/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll
@@ -1071,55 +1071,51 @@ define <3 x half> @v_mad_mix_v3f32_clamp_postcvt(<3 x half> %src0, <3 x half> %s
; SDAG-GFX1100-TRUE16-LABEL: v_mad_mix_v3f32_clamp_postcvt:
; SDAG-GFX1100-TRUE16: ; %bb.0:
; SDAG-GFX1100-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1]
-; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v1, v1, v3, v5 op_sel_hi:[1,1,1]
+; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v6.l, v3.l
; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l
-; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.l
-; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v2.h, v6.l
-; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.l
-; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v2, v3, v5, v4 op_sel_hi:[1,1,1]
-; SDAG-GFX1100-TRUE16-NEXT: v_pack_b32_f16 v1, v0.l, 0
+; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.l
+; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v8.l, v4.l
+; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v1, v1, v6, v5 op_sel_hi:[1,1,1]
+; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v3, v3, v7, v8 op_sel_hi:[1,1,1] clamp
; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; SDAG-GFX1100-TRUE16-NEXT: v_pk_max_f16 v0, v2, v2 clamp
; SDAG-GFX1100-TRUE16-NEXT: v_pk_max_f16 v1, v1, v1 clamp
+; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixhi_f16 v3, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
+; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; SDAG-GFX1100-TRUE16-NEXT: v_mov_b32_e32 v0, v3
; SDAG-GFX1100-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; SDAG-GFX1100-FAKE16-LABEL: v_mad_mix_v3f32_clamp_postcvt:
; SDAG-GFX1100-FAKE16: ; %bb.0:
; SDAG-GFX1100-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-GFX1100-FAKE16-NEXT: v_fma_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1]
+; SDAG-GFX1100-FAKE16-NEXT: v_fma_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1] clamp
; SDAG-GFX1100-FAKE16-NEXT: v_fma_mixlo_f16 v1, v1, v3, v5 op_sel_hi:[1,1,1]
; SDAG-GFX1100-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; SDAG-GFX1100-FAKE16-NEXT: v_fma_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1]
-; SDAG-GFX1100-FAKE16-NEXT: v_pack_b32_f16 v1, v1, 0
-; SDAG-GFX1100-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; SDAG-GFX1100-FAKE16-NEXT: v_pk_max_f16 v0, v6, v6 clamp
+; SDAG-GFX1100-FAKE16-NEXT: v_fma_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
; SDAG-GFX1100-FAKE16-NEXT: v_pk_max_f16 v1, v1, v1 clamp
+; SDAG-GFX1100-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; SDAG-GFX1100-FAKE16-NEXT: v_mov_b32_e32 v0, v6
; SDAG-GFX1100-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
-; SDAG-GFX900-LABEL: v_mad_mix_v3f32_clamp_postcvt:
-; SDAG-GFX900: ; %bb.0:
-; SDAG-GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-GFX900-NEXT: v_mad_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1]
-; SDAG-GFX900-NEXT: v_mad_mixlo_f16 v1, v1, v3, v5 op_sel_hi:[1,1,1]
-; SDAG-GFX900-NEXT: v_pack_b32_f16 v1, v1, 0
-; SDAG-GFX900-NEXT: v_mad_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1]
-; SDAG-GFX900-NEXT: v_pk_max_f16 v0, v6, v6 clamp
-; SDAG-GFX900-NEXT: v_pk_max_f16 v1, v1, v1 clamp
-; SDAG-GFX900-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_mad_mix_v3f32_clamp_postcvt:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mad_mixlo_f16 v1, v1, v3, v5 op_sel_hi:[1,1,1]
+; GFX900-NEXT: v_mad_mixlo_f16 v3, v0, v2, v4 op_sel_hi:[1,1,1] clamp
+; GFX900-NEXT: v_mad_mixhi_f16 v3, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
+; GFX900-NEXT: v_pk_max_f16 v1, v1, v1 clamp
+; GFX900-NEXT: v_mov_b32_e32 v0, v3
+; GFX900-NEXT: s_setpc_b64 s[30:31]
;
-; SDAG-GFX906-LABEL: v_mad_mix_v3f32_clamp_postcvt:
-; SDAG-GFX906: ; %bb.0:
-; SDAG-GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-GFX906-NEXT: v_fma_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1]
-; SDAG-GFX906-NEXT: v_fma_mixlo_f16 v1, v1, v3, v5 op_sel_hi:[1,1,1]
-; SDAG-GFX906-NEXT: v_pack_b32_f16 v1, v1, 0
-; SDAG-GFX906-NEXT: v_fma_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1]
-; SDAG-GFX906-NEXT: v_pk_max_f16 v0, v6, v6 clamp
-; SDAG-GFX906-NEXT: v_pk_max_f16 v1, v1, v1 clamp
-; SDAG-GFX906-NEXT: s_setpc_b64 s[30:31]
+; GFX906-LABEL: v_mad_mix_v3f32_clamp_postcvt:
+; GFX906: ; %bb.0:
+; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX906-NEXT: v_fma_mixlo_f16 v1, v1, v3, v5 op_sel_hi:[1,1,1]
+; GFX906-NEXT: v_fma_mixlo_f16 v3, v0, v2, v4 op_sel_hi:[1,1,1] clamp
+; GFX906-NEXT: v_fma_mixhi_f16 v3, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
+; GFX906-NEXT: v_pk_max_f16 v1, v1, v1 clamp
+; GFX906-NEXT: v_mov_b32_e32 v0, v3
+; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; SDAG-VI-LABEL: v_mad_mix_v3f32_clamp_postcvt:
; SDAG-VI: ; %bb.0:
@@ -1193,26 +1189,6 @@ define <3 x half> @v_mad_mix_v3f32_clamp_postcvt(<3 x half> %src0, <3 x half> %s
; GISEL-GFX1100-NEXT: v_mov_b32_e32 v0, v6
; GISEL-GFX1100-NEXT: s_setpc_b64 s[30:31]
;
-; GISEL-GFX900-LABEL: v_mad_mix_v3f32_clamp_postcvt:
-; GISEL-GFX900: ; %bb.0:
-; GISEL-GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-GFX900-NEXT: v_mad_mixlo_f16 v1, v1, v3, v5 op_sel_hi:[1,1,1]
-; GISEL-GFX900-NEXT: v_mad_mixlo_f16 v3, v0, v2, v4 op_sel_hi:[1,1,1] clamp
-; GISEL-GFX900-NEXT: v_mad_mixhi_f16 v3, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
-; GISEL-GFX900-NEXT: v_pk_max_f16 v1, v1, v1 clamp
-; GISEL-GFX900-NEXT: v_mov_b32_e32 v0, v3
-; GISEL-GFX900-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-GFX906-LABEL: v_mad_mix_v3f32_clamp_postcvt:
-; GISEL-GFX906: ; %bb.0:
-; GISEL-GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-GFX906-NEXT: v_fma_mixlo_f16 v1, v1, v3, v5 op_sel_hi:[1,1,1]
-; GISEL-GFX906-NEXT: v_fma_mixlo_f16 v3, v0, v2, v4 op_sel_hi:[1,1,1] clamp
-; GISEL-GFX906-NEXT: v_fma_mixhi_f16 v3, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
-; GISEL-GFX906-NEXT: v_pk_max_f16 v1, v1, v1 clamp
-; GISEL-GFX906-NEXT: v_mov_b32_e32 v0, v3
-; GISEL-GFX906-NEXT: s_setpc_b64 s[30:31]
-;
; GISEL-VI-LABEL: v_mad_mix_v3f32_clamp_postcvt:
; GISEL-VI: ; %bb.0:
; GISEL-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fmaximum.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fmaximum.ll
index 04e73ac1ea956..c6cd366497218 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fmaximum.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fmaximum.ll
@@ -357,3 +357,52 @@ define <2 x half> @vfmax_v2f16_vv_nnanb(<2 x half> %a, <2 x half> %b) {
%v = call <2 x half> @llvm.maximum.v2f16(<2 x half> %a, <2 x half> %c)
ret <2 x half> %v
}
+
+declare <4 x half> @llvm.vector.insert.v2f32.v4f32(<4 x half>, <2 x half>, i64)
+
+define <4 x half> @vfmax_v2f16_vv_nnan_insert_subvector(<2 x half> %a, <2 x half> %b, <4 x half> %c) {
+; ZVFH-LABEL: vfmax_v2f16_vv_nnan_insert_subvector:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
+; ZVFH-NEXT: vfadd.vv v8, v8, v8
+; ZVFH-NEXT: vfadd.vv v9, v9, v9
+; ZVFH-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
+; ZVFH-NEXT: vmfeq.vv v0, v10, v10
+; ZVFH-NEXT: vslideup.vi v8, v9, 2
+; ZVFH-NEXT: vmerge.vvm v8, v10, v8, v0
+; ZVFH-NEXT: vfmax.vv v8, v8, v10
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: vfmax_v2f16_vv_nnan_insert_subvector:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v8
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; ZVFHMIN-NEXT: vfadd.vv v9, v11, v11
+; ZVFHMIN-NEXT: vfadd.vv v8, v8, v8
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT: vfncvt.f.f.w v11, v9
+; ZVFHMIN-NEXT: vfncvt.f.f.w v9, v8
+; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
+; ZVFHMIN-NEXT: vslideup.vi v11, v9, 2
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v11
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma
+; ZVFHMIN-NEXT: vmfeq.vv v0, v8, v8
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v10
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma
+; ZVFHMIN-NEXT: vmerge.vvm v10, v8, v9, v0
+; ZVFHMIN-NEXT: vmfeq.vv v0, v9, v9
+; ZVFHMIN-NEXT: vmerge.vvm v8, v9, v8, v0
+; ZVFHMIN-NEXT: vfmax.vv v9, v8, v10
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9
+; ZVFHMIN-NEXT: ret
+ %d = fadd nnan <2 x half> %a, %a
+ %e = fadd nnan <2 x half> %b, %b
+ %f = call <4 x half> @llvm.vector.insert.v2f32.v4f32(<4 x half> undef, <2 x half> %d, i64 0)
+ %g = call <4 x half> @llvm.vector.insert.v2f32.v4f32(<4 x half> %f, <2 x half> %e, i64 2)
+ %v = call <4 x half> @llvm.maximum.v4f16(<4 x half> %g, <4 x half> %c)
+ ret <4 x half> %v
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fminimum.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fminimum.ll
index a0334a9a5d20a..568923db83591 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fminimum.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fminimum.ll
@@ -357,3 +357,52 @@ define <2 x half> @vfmin_v2f16_vv_nnanb(<2 x half> %a, <2 x half> %b) {
%v = call <2 x half> @llvm.minimum.v2f16(<2 x half> %a, <2 x half> %c)
ret <2 x half> %v
}
+
+declare <4 x half> @llvm.vector.insert.v2f32.v4f32(<4 x half>, <2 x half>, i64)
+
+define <4 x half> @vfmin_v2f16_vv_nnan_insert_subvector(<2 x half> %a, <2 x half> %b, <4 x half> %c) {
+; ZVFH-LABEL: vfmin_v2f16_vv_nnan_insert_subvector:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
+; ZVFH-NEXT: vfadd.vv v8, v8, v8
+; ZVFH-NEXT: vfadd.vv v9, v9, v9
+; ZVFH-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
+; ZVFH-NEXT: vmfeq.vv v0, v10, v10
+; ZVFH-NEXT: vslideup.vi v8, v9, 2
+; ZVFH-NEXT: vmerge.vvm v8, v10, v8, v0
+; ZVFH-NEXT: vfmin.vv v8, v8, v10
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: vfmin_v2f16_vv_nnan_insert_subvector:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v8
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; ZVFHMIN-NEXT: vfadd.vv v9, v11, v11
+; ZVFHMIN-NEXT: vfadd.vv v8, v8, v8
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT: vfncvt.f.f.w v11, v9
+; ZVFHMIN-NEXT: vfncvt.f.f.w v9, v8
+; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
+; ZVFHMIN-NEXT: vslideup.vi v11, v9, 2
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v11
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma
+; ZVFHMIN-NEXT: vmfeq.vv v0, v8, v8
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v10
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma
+; ZVFHMIN-NEXT: vmerge.vvm v10, v8, v9, v0
+; ZVFHMIN-NEXT: vmfeq.vv v0, v9, v9
+; ZVFHMIN-NEXT: vmerge.vvm v8, v9, v8, v0
+; ZVFHMIN-NEXT: vfmin.vv v9, v8, v10
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9
+; ZVFHMIN-NEXT: ret
+ %d = fadd nnan <2 x half> %a, %a
+ %e = fadd nnan <2 x half> %b, %b
+ %f = call <4 x half> @llvm.vector.insert.v2f32.v4f32(<4 x half> undef, <2 x half> %d, i64 0)
+ %g = call <4 x half> @llvm.vector.insert.v2f32.v4f32(<4 x half> %f, <2 x half> %e, i64 2)
+ %v = call <4 x half> @llvm.minimum.v4f16(<4 x half> %g, <4 x half> %c)
+ ret <4 x half> %v
+}
|
@llvm/pr-subscribers-llvm-selectiondag Author: Jim Lin (tclin914) ChangesPropagate nnan across insert_subvector. Full diff: https://github.com/llvm/llvm-project/pull/131989.diff 5 Files Affected:
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index d1f92c9ef00e9..4b3dd5e0d6975 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -5625,6 +5625,9 @@ bool SelectionDAG::isKnownNeverNaN(SDValue Op, bool SNaN, unsigned Depth) const
(SNaN && !C->getValueAPF().isSignaling());
}
+ if (Op.isUndef())
+ return true;
+
unsigned Opcode = Op.getOpcode();
switch (Opcode) {
case ISD::FADD:
@@ -5727,6 +5730,9 @@ bool SelectionDAG::isKnownNeverNaN(SDValue Op, bool SNaN, unsigned Depth) const
case ISD::EXTRACT_SUBVECTOR: {
return isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1);
}
+ case ISD::INSERT_SUBVECTOR:
+ return isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1) &&
+ isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1);
case ISD::BUILD_VECTOR: {
for (const SDValue &Opnd : Op->ops())
if (!isKnownNeverNaN(Opnd, SNaN, Depth + 1))
diff --git a/llvm/test/CodeGen/AMDGPU/clamp.ll b/llvm/test/CodeGen/AMDGPU/clamp.ll
index 17c84d7371de1..a08228fc919b8 100644
--- a/llvm/test/CodeGen/AMDGPU/clamp.ll
+++ b/llvm/test/CodeGen/AMDGPU/clamp.ll
@@ -2986,14 +2986,14 @@ define amdgpu_kernel void @v_clamp_v2f16_undef_elt(ptr addrspace(1) %out, ptr ad
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
-; GFX6-NEXT: v_mov_b32_e32 v4, 0x7fc00000
; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7]
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v2
-; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GFX6-NEXT: v_max_f32_e32 v2, 0x7fc00000, v2
-; GFX6-NEXT: v_med3_f32 v3, v3, 0, v4
+; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX6-NEXT: v_max_f32_e32 v3, 0, v3
+; GFX6-NEXT: v_max_f32_e32 v2, s0, v2
+; GFX6-NEXT: v_min_f32_e32 v3, s0, v3
; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX6-NEXT: v_min_f32_e32 v2, 1.0, v2
; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
@@ -3006,20 +3006,20 @@ define amdgpu_kernel void @v_clamp_v2f16_undef_elt(ptr addrspace(1) %out, ptr ad
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; GFX8-NEXT: v_mov_b32_e32 v4, 0x7e00
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT: v_mov_b32_e32 v4, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT: v_max_f16_e32 v3, v3, v3
; GFX8-NEXT: v_max_f16_e32 v2, 0, v2
-; GFX8-NEXT: v_max_f16_e32 v3, 0x7e00, v3
+; GFX8-NEXT: v_max_f16_e32 v3, s0, v3
; GFX8-NEXT: v_min_f16_e32 v3, 1.0, v3
; GFX8-NEXT: v_min_f16_sdwa v2, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX8-NEXT: v_or_b32_e32 v2, v3, v2
@@ -3747,16 +3747,16 @@ define amdgpu_kernel void @v_clamp_v2f16_undef_limit_elts0(ptr addrspace(1) %out
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
-; GFX6-NEXT: v_mov_b32_e32 v4, 0x7fc00000
; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7]
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v2
; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3
; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GFX6-NEXT: v_max_f32_e32 v3, 0x7fc00000, v3
+; GFX6-NEXT: v_max_f32_e32 v3, s0, v3
+; GFX6-NEXT: v_max_f32_e32 v2, 0, v2
; GFX6-NEXT: v_min_f32_e32 v3, 1.0, v3
; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX6-NEXT: v_med3_f32 v2, v2, 0, v4
+; GFX6-NEXT: v_min_f32_e32 v2, s0, v2
; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX6-NEXT: v_or_b32_e32 v2, v2, v3
@@ -3779,9 +3779,9 @@ define amdgpu_kernel void @v_clamp_v2f16_undef_limit_elts0(ptr addrspace(1) %out
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT: v_max_f16_e32 v3, v3, v3
-; GFX8-NEXT: v_max_f16_e32 v2, 0x7e00, v2
+; GFX8-NEXT: v_max_f16_e32 v2, s0, v2
; GFX8-NEXT: v_max_f16_e32 v3, 0, v3
-; GFX8-NEXT: v_min_f16_e32 v3, 0x7e00, v3
+; GFX8-NEXT: v_min_f16_e32 v3, s0, v3
; GFX8-NEXT: v_min_f16_sdwa v2, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX8-NEXT: v_or_b32_e32 v2, v3, v2
; GFX8-NEXT: flat_store_dword v[0:1], v2
@@ -3845,14 +3845,14 @@ define amdgpu_kernel void @v_clamp_v2f16_undef_limit_elts1(ptr addrspace(1) %out
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
-; GFX6-NEXT: v_mov_b32_e32 v4, 0x7fc00000
; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7]
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v2
-; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GFX6-NEXT: v_max_f32_e32 v2, 0x7fc00000, v2
-; GFX6-NEXT: v_med3_f32 v3, v3, 0, v4
+; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX6-NEXT: v_max_f32_e32 v3, 0, v3
+; GFX6-NEXT: v_max_f32_e32 v2, s0, v2
+; GFX6-NEXT: v_min_f32_e32 v3, s0, v3
; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX6-NEXT: v_min_f32_e32 v2, 1.0, v2
; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
@@ -3865,20 +3865,20 @@ define amdgpu_kernel void @v_clamp_v2f16_undef_limit_elts1(ptr addrspace(1) %out
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; GFX8-NEXT: v_mov_b32_e32 v4, 0x7e00
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT: v_mov_b32_e32 v4, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT: v_max_f16_e32 v3, v3, v3
; GFX8-NEXT: v_max_f16_e32 v2, 0, v2
-; GFX8-NEXT: v_max_f16_e32 v3, 0x7e00, v3
+; GFX8-NEXT: v_max_f16_e32 v3, s0, v3
; GFX8-NEXT: v_min_f16_e32 v3, 1.0, v3
; GFX8-NEXT: v_min_f16_sdwa v2, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX8-NEXT: v_or_b32_e32 v2, v3, v2
diff --git a/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll b/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll
index 15cb404a3840a..7068ef9f99d73 100644
--- a/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll
+++ b/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll
@@ -1071,55 +1071,51 @@ define <3 x half> @v_mad_mix_v3f32_clamp_postcvt(<3 x half> %src0, <3 x half> %s
; SDAG-GFX1100-TRUE16-LABEL: v_mad_mix_v3f32_clamp_postcvt:
; SDAG-GFX1100-TRUE16: ; %bb.0:
; SDAG-GFX1100-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1]
-; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v1, v1, v3, v5 op_sel_hi:[1,1,1]
+; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v6.l, v3.l
; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l
-; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.l
-; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v2.h, v6.l
-; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.l
-; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v2, v3, v5, v4 op_sel_hi:[1,1,1]
-; SDAG-GFX1100-TRUE16-NEXT: v_pack_b32_f16 v1, v0.l, 0
+; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.l
+; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v8.l, v4.l
+; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v1, v1, v6, v5 op_sel_hi:[1,1,1]
+; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v3, v3, v7, v8 op_sel_hi:[1,1,1] clamp
; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; SDAG-GFX1100-TRUE16-NEXT: v_pk_max_f16 v0, v2, v2 clamp
; SDAG-GFX1100-TRUE16-NEXT: v_pk_max_f16 v1, v1, v1 clamp
+; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixhi_f16 v3, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
+; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; SDAG-GFX1100-TRUE16-NEXT: v_mov_b32_e32 v0, v3
; SDAG-GFX1100-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; SDAG-GFX1100-FAKE16-LABEL: v_mad_mix_v3f32_clamp_postcvt:
; SDAG-GFX1100-FAKE16: ; %bb.0:
; SDAG-GFX1100-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-GFX1100-FAKE16-NEXT: v_fma_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1]
+; SDAG-GFX1100-FAKE16-NEXT: v_fma_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1] clamp
; SDAG-GFX1100-FAKE16-NEXT: v_fma_mixlo_f16 v1, v1, v3, v5 op_sel_hi:[1,1,1]
; SDAG-GFX1100-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; SDAG-GFX1100-FAKE16-NEXT: v_fma_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1]
-; SDAG-GFX1100-FAKE16-NEXT: v_pack_b32_f16 v1, v1, 0
-; SDAG-GFX1100-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; SDAG-GFX1100-FAKE16-NEXT: v_pk_max_f16 v0, v6, v6 clamp
+; SDAG-GFX1100-FAKE16-NEXT: v_fma_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
; SDAG-GFX1100-FAKE16-NEXT: v_pk_max_f16 v1, v1, v1 clamp
+; SDAG-GFX1100-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; SDAG-GFX1100-FAKE16-NEXT: v_mov_b32_e32 v0, v6
; SDAG-GFX1100-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
-; SDAG-GFX900-LABEL: v_mad_mix_v3f32_clamp_postcvt:
-; SDAG-GFX900: ; %bb.0:
-; SDAG-GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-GFX900-NEXT: v_mad_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1]
-; SDAG-GFX900-NEXT: v_mad_mixlo_f16 v1, v1, v3, v5 op_sel_hi:[1,1,1]
-; SDAG-GFX900-NEXT: v_pack_b32_f16 v1, v1, 0
-; SDAG-GFX900-NEXT: v_mad_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1]
-; SDAG-GFX900-NEXT: v_pk_max_f16 v0, v6, v6 clamp
-; SDAG-GFX900-NEXT: v_pk_max_f16 v1, v1, v1 clamp
-; SDAG-GFX900-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_mad_mix_v3f32_clamp_postcvt:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mad_mixlo_f16 v1, v1, v3, v5 op_sel_hi:[1,1,1]
+; GFX900-NEXT: v_mad_mixlo_f16 v3, v0, v2, v4 op_sel_hi:[1,1,1] clamp
+; GFX900-NEXT: v_mad_mixhi_f16 v3, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
+; GFX900-NEXT: v_pk_max_f16 v1, v1, v1 clamp
+; GFX900-NEXT: v_mov_b32_e32 v0, v3
+; GFX900-NEXT: s_setpc_b64 s[30:31]
;
-; SDAG-GFX906-LABEL: v_mad_mix_v3f32_clamp_postcvt:
-; SDAG-GFX906: ; %bb.0:
-; SDAG-GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-GFX906-NEXT: v_fma_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1]
-; SDAG-GFX906-NEXT: v_fma_mixlo_f16 v1, v1, v3, v5 op_sel_hi:[1,1,1]
-; SDAG-GFX906-NEXT: v_pack_b32_f16 v1, v1, 0
-; SDAG-GFX906-NEXT: v_fma_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1]
-; SDAG-GFX906-NEXT: v_pk_max_f16 v0, v6, v6 clamp
-; SDAG-GFX906-NEXT: v_pk_max_f16 v1, v1, v1 clamp
-; SDAG-GFX906-NEXT: s_setpc_b64 s[30:31]
+; GFX906-LABEL: v_mad_mix_v3f32_clamp_postcvt:
+; GFX906: ; %bb.0:
+; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX906-NEXT: v_fma_mixlo_f16 v1, v1, v3, v5 op_sel_hi:[1,1,1]
+; GFX906-NEXT: v_fma_mixlo_f16 v3, v0, v2, v4 op_sel_hi:[1,1,1] clamp
+; GFX906-NEXT: v_fma_mixhi_f16 v3, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
+; GFX906-NEXT: v_pk_max_f16 v1, v1, v1 clamp
+; GFX906-NEXT: v_mov_b32_e32 v0, v3
+; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; SDAG-VI-LABEL: v_mad_mix_v3f32_clamp_postcvt:
; SDAG-VI: ; %bb.0:
@@ -1193,26 +1189,6 @@ define <3 x half> @v_mad_mix_v3f32_clamp_postcvt(<3 x half> %src0, <3 x half> %s
; GISEL-GFX1100-NEXT: v_mov_b32_e32 v0, v6
; GISEL-GFX1100-NEXT: s_setpc_b64 s[30:31]
;
-; GISEL-GFX900-LABEL: v_mad_mix_v3f32_clamp_postcvt:
-; GISEL-GFX900: ; %bb.0:
-; GISEL-GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-GFX900-NEXT: v_mad_mixlo_f16 v1, v1, v3, v5 op_sel_hi:[1,1,1]
-; GISEL-GFX900-NEXT: v_mad_mixlo_f16 v3, v0, v2, v4 op_sel_hi:[1,1,1] clamp
-; GISEL-GFX900-NEXT: v_mad_mixhi_f16 v3, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
-; GISEL-GFX900-NEXT: v_pk_max_f16 v1, v1, v1 clamp
-; GISEL-GFX900-NEXT: v_mov_b32_e32 v0, v3
-; GISEL-GFX900-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-GFX906-LABEL: v_mad_mix_v3f32_clamp_postcvt:
-; GISEL-GFX906: ; %bb.0:
-; GISEL-GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-GFX906-NEXT: v_fma_mixlo_f16 v1, v1, v3, v5 op_sel_hi:[1,1,1]
-; GISEL-GFX906-NEXT: v_fma_mixlo_f16 v3, v0, v2, v4 op_sel_hi:[1,1,1] clamp
-; GISEL-GFX906-NEXT: v_fma_mixhi_f16 v3, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
-; GISEL-GFX906-NEXT: v_pk_max_f16 v1, v1, v1 clamp
-; GISEL-GFX906-NEXT: v_mov_b32_e32 v0, v3
-; GISEL-GFX906-NEXT: s_setpc_b64 s[30:31]
-;
; GISEL-VI-LABEL: v_mad_mix_v3f32_clamp_postcvt:
; GISEL-VI: ; %bb.0:
; GISEL-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fmaximum.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fmaximum.ll
index 04e73ac1ea956..c6cd366497218 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fmaximum.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fmaximum.ll
@@ -357,3 +357,52 @@ define <2 x half> @vfmax_v2f16_vv_nnanb(<2 x half> %a, <2 x half> %b) {
%v = call <2 x half> @llvm.maximum.v2f16(<2 x half> %a, <2 x half> %c)
ret <2 x half> %v
}
+
+declare <4 x half> @llvm.vector.insert.v2f32.v4f32(<4 x half>, <2 x half>, i64)
+
+define <4 x half> @vfmax_v2f16_vv_nnan_insert_subvector(<2 x half> %a, <2 x half> %b, <4 x half> %c) {
+; ZVFH-LABEL: vfmax_v2f16_vv_nnan_insert_subvector:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
+; ZVFH-NEXT: vfadd.vv v8, v8, v8
+; ZVFH-NEXT: vfadd.vv v9, v9, v9
+; ZVFH-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
+; ZVFH-NEXT: vmfeq.vv v0, v10, v10
+; ZVFH-NEXT: vslideup.vi v8, v9, 2
+; ZVFH-NEXT: vmerge.vvm v8, v10, v8, v0
+; ZVFH-NEXT: vfmax.vv v8, v8, v10
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: vfmax_v2f16_vv_nnan_insert_subvector:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v8
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; ZVFHMIN-NEXT: vfadd.vv v9, v11, v11
+; ZVFHMIN-NEXT: vfadd.vv v8, v8, v8
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT: vfncvt.f.f.w v11, v9
+; ZVFHMIN-NEXT: vfncvt.f.f.w v9, v8
+; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
+; ZVFHMIN-NEXT: vslideup.vi v11, v9, 2
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v11
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma
+; ZVFHMIN-NEXT: vmfeq.vv v0, v8, v8
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v10
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma
+; ZVFHMIN-NEXT: vmerge.vvm v10, v8, v9, v0
+; ZVFHMIN-NEXT: vmfeq.vv v0, v9, v9
+; ZVFHMIN-NEXT: vmerge.vvm v8, v9, v8, v0
+; ZVFHMIN-NEXT: vfmax.vv v9, v8, v10
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9
+; ZVFHMIN-NEXT: ret
+ %d = fadd nnan <2 x half> %a, %a
+ %e = fadd nnan <2 x half> %b, %b
+ %f = call <4 x half> @llvm.vector.insert.v2f32.v4f32(<4 x half> undef, <2 x half> %d, i64 0)
+ %g = call <4 x half> @llvm.vector.insert.v2f32.v4f32(<4 x half> %f, <2 x half> %e, i64 2)
+ %v = call <4 x half> @llvm.maximum.v4f16(<4 x half> %g, <4 x half> %c)
+ ret <4 x half> %v
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fminimum.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fminimum.ll
index a0334a9a5d20a..568923db83591 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fminimum.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fminimum.ll
@@ -357,3 +357,52 @@ define <2 x half> @vfmin_v2f16_vv_nnanb(<2 x half> %a, <2 x half> %b) {
%v = call <2 x half> @llvm.minimum.v2f16(<2 x half> %a, <2 x half> %c)
ret <2 x half> %v
}
+
+declare <4 x half> @llvm.vector.insert.v2f32.v4f32(<4 x half>, <2 x half>, i64)
+
+define <4 x half> @vfmin_v2f16_vv_nnan_insert_subvector(<2 x half> %a, <2 x half> %b, <4 x half> %c) {
+; ZVFH-LABEL: vfmin_v2f16_vv_nnan_insert_subvector:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
+; ZVFH-NEXT: vfadd.vv v8, v8, v8
+; ZVFH-NEXT: vfadd.vv v9, v9, v9
+; ZVFH-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
+; ZVFH-NEXT: vmfeq.vv v0, v10, v10
+; ZVFH-NEXT: vslideup.vi v8, v9, 2
+; ZVFH-NEXT: vmerge.vvm v8, v10, v8, v0
+; ZVFH-NEXT: vfmin.vv v8, v8, v10
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: vfmin_v2f16_vv_nnan_insert_subvector:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v8
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; ZVFHMIN-NEXT: vfadd.vv v9, v11, v11
+; ZVFHMIN-NEXT: vfadd.vv v8, v8, v8
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT: vfncvt.f.f.w v11, v9
+; ZVFHMIN-NEXT: vfncvt.f.f.w v9, v8
+; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
+; ZVFHMIN-NEXT: vslideup.vi v11, v9, 2
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v11
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma
+; ZVFHMIN-NEXT: vmfeq.vv v0, v8, v8
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v10
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma
+; ZVFHMIN-NEXT: vmerge.vvm v10, v8, v9, v0
+; ZVFHMIN-NEXT: vmfeq.vv v0, v9, v9
+; ZVFHMIN-NEXT: vmerge.vvm v8, v9, v8, v0
+; ZVFHMIN-NEXT: vfmin.vv v9, v8, v10
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9
+; ZVFHMIN-NEXT: ret
+ %d = fadd nnan <2 x half> %a, %a
+ %e = fadd nnan <2 x half> %b, %b
+ %f = call <4 x half> @llvm.vector.insert.v2f32.v4f32(<4 x half> undef, <2 x half> %d, i64 0)
+ %g = call <4 x half> @llvm.vector.insert.v2f32.v4f32(<4 x half> %f, <2 x half> %e, i64 2)
+ %v = call <4 x half> @llvm.minimum.v4f16(<4 x half> %g, <4 x half> %c)
+ ret <4 x half> %v
+}
|
You can test this locally with the following command:git diff -U0 --pickaxe-regex -S '([^a-zA-Z0-9#_-]undef[^a-zA-Z0-9_-]|UndefValue::get)' 57288136fecfac35ef161cb8f147d269715d2634 1ece27bec3727f004acf2e5a83753af667f2491b llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp llvm/test/CodeGen/AMDGPU/clamp.ll llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fmaximum.ll llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fminimum.ll The following files introduce new uses of undef:
Undef is now deprecated and should only be used in the rare cases where no replacement is possible. For example, a load of uninitialized memory yields In tests, avoid using For example, this is considered a bad practice: define void @fn() {
...
br i1 undef, ...
} Please use the following instead: define void @fn(i1 %cond) {
...
br i1 %cond, ...
} Please refer to the Undefined Behavior Manual for more information. |
if (Op.isUndef()) | ||
return true; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is incorrect. You could do this for poison, but not undef tttt
@@ -5727,6 +5730,9 @@ bool SelectionDAG::isKnownNeverNaN(SDValue Op, bool SNaN, unsigned Depth) const | |||
case ISD::EXTRACT_SUBVECTOR: { | |||
return isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1); | |||
} | |||
case ISD::INSERT_SUBVECTOR: | |||
return isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1) && | |||
isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
You might need to consider adding DemandedElts handling to isKnownNeverNaN to correctly handle concatenation patterns:
insert_subvector(insert_subvector(undef,lo,0),hi,n/2)
Propagate nnan across insert_subvector.