Skip to content

Commit 45f5e40

Browse files
committed
AMDGPU: Make v2f16/v2bf16 copysign legal
Fixes #141931
1 parent 6a6aec6 commit 45f5e40

File tree

6 files changed

+1108
-1265
lines changed

6 files changed

+1108
-1265
lines changed

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -756,6 +756,9 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
756756
// allows matching fneg (fabs x) patterns)
757757
setOperationAction(ISD::FABS, MVT::v2f16, Legal);
758758

759+
// Can do this in one BFI plus a constant materialize.
760+
setOperationAction(ISD::FCOPYSIGN, {MVT::v2f16, MVT::v2bf16}, Custom);
761+
759762
setOperationAction({ISD::FMAXNUM, ISD::FMINNUM}, MVT::f16, Custom);
760763
setOperationAction({ISD::FMAXNUM_IEEE, ISD::FMINNUM_IEEE}, MVT::f16, Legal);
761764

@@ -6088,6 +6091,8 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
60886091
case ISD::SADDSAT:
60896092
case ISD::SSUBSAT:
60906093
return splitBinaryVectorOp(Op, DAG);
6094+
case ISD::FCOPYSIGN:
6095+
return lowerFCOPYSIGN(Op, DAG);
60916096
case ISD::MUL:
60926097
return lowerMUL(Op, DAG);
60936098
case ISD::SMULO:
@@ -7115,6 +7120,32 @@ SDValue SITargetLowering::promoteUniformOpToI32(SDValue Op,
71157120
return DAG.getZExtOrTrunc(NewVal, DL, OpTy);
71167121
}
71177122

7123+
/// Custom lowering for FCOPYSIGN whose magnitude is a two-element vector of
/// 16-bit floating-point values.
///
/// \param Op  The FCOPYSIGN node; operand 0 is the magnitude, operand 1 is
///            the sign source.
/// \param DAG The SelectionDAG used to build replacement nodes.
/// \returns \p Op unchanged when magnitude and sign already share a type.
///          Otherwise a new FCOPYSIGN whose sign operand has been narrowed
///          to the magnitude type while preserving each lane's sign bit.
SDValue SITargetLowering::lowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const {
  SDValue Mag = Op.getOperand(0);
  SDValue Sign = Op.getOperand(1);

  EVT MagVT = Mag.getValueType();
  EVT SignVT = Sign.getValueType();

  assert(MagVT.isVector());

  // Matching types need no rewriting; hand the node back as-is.
  if (MagVT == SignVT)
    return Op;

  assert(MagVT.getVectorNumElements() == 2);
  // The narrowing below reinterprets the sign operand as v2i32, so the only
  // mismatched sign type it can handle is v2f32.
  assert(SignVT == MVT::v2f32 &&
         "unexpected sign type for packed 16-bit copysign");

  // fcopysign v2f16:mag, v2f32:sign ->
  //   fcopysign v2f16:mag,
  //             bitcast (trunc (srl (bitcast sign to v2i32), 16) to v2i16)
  //
  // The sign of an f32 lives in bit 31. A plain truncate keeps only the low
  // 16 bits of each lane (mantissa bits), so the high half must be shifted
  // down first for bit 31 to land in bit 15 of the 16-bit lane.

  SDLoc SL(Op);
  SDValue SignAsInt32 = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Sign);
  SDValue SignHiHalf =
      DAG.getNode(ISD::SRL, SL, MVT::v2i32, SignAsInt32,
                  DAG.getConstant(16, SL, MVT::v2i32));
  SDValue SignAsInt16 = DAG.getNode(ISD::TRUNCATE, SL, MVT::v2i16, SignHiHalf);

  SDValue SignAsHalf16 = DAG.getNode(ISD::BITCAST, SL, MagVT, SignAsInt16);

  return DAG.getNode(ISD::FCOPYSIGN, SL, MagVT, Mag, SignAsHalf16);
}
7148+
71187149
// Custom lowering for vector multiplications and s_mul_u64.
71197150
SDValue SITargetLowering::lowerMUL(SDValue Op, SelectionDAG &DAG) const {
71207151
EVT VT = Op.getValueType();

llvm/lib/Target/AMDGPU/SIISelLowering.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -149,6 +149,7 @@ class SITargetLowering final : public AMDGPUTargetLowering {
149149
SDValue lowerFMINIMUM_FMAXIMUM(SDValue Op, SelectionDAG &DAG) const;
150150
SDValue lowerFLDEXP(SDValue Op, SelectionDAG &DAG) const;
151151
SDValue promoteUniformOpToI32(SDValue Op, DAGCombinerInfo &DCI) const;
152+
SDValue lowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const;
152153
SDValue lowerMUL(SDValue Op, SelectionDAG &DAG) const;
153154
SDValue lowerXMULO(SDValue Op, SelectionDAG &DAG) const;
154155
SDValue lowerXMUL_LOHI(SDValue Op, SelectionDAG &DAG) const;

llvm/lib/Target/AMDGPU/SIInstructions.td

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2062,6 +2062,16 @@ def : GCNPat <
20622062
>;
20632063
} // End foreach fp16vt = [f16, bf16]
20642064

2065+
2066+
// Select same-typed packed 16-bit fcopysign to a single V_BFI_B32 with a
// materialized mask. 0x7fff7fff sets every bit except bit 15 of each 16-bit
// half, i.e. everything but the two per-lane sign bits.
// NOTE(review): presumably V_BFI_B32 takes the masked bits from $mag and the
// remaining bits from $sign - confirm against the ISA description.
foreach packedVT = [v2f16, v2bf16] in {
def : GCNPat <
  (fcopysign packedVT:$mag, packedVT:$sign),
  (V_BFI_B32_e64 (S_MOV_B32 (i32 0x7fff7fff)), $mag, $sign)
>;
} // End foreach packedVT = [v2f16, v2bf16]
2074+
20652075
/********** ================== **********/
20662076
/********** Immediate Patterns **********/
20672077
/********** ================== **********/

llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -36,17 +36,12 @@ define <2 x half> @test_pown_reduced_fast_v2f16_known_odd(<2 x half> %x, <2 x i3
3636
; GFX9-NEXT: v_cvt_f32_i32_e32 v2, v2
3737
; GFX9-NEXT: v_cvt_f32_i32_e32 v1, v1
3838
; GFX9-NEXT: v_and_b32_e32 v3, 0x7fff7fff, v0
39-
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
39+
; GFX9-NEXT: s_mov_b32 s4, 0x7fff7fff
4040
; GFX9-NEXT: v_cvt_f16_f32_e32 v2, v2
4141
; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v1
4242
; GFX9-NEXT: v_pack_b32_f16 v1, v1, v2
4343
; GFX9-NEXT: v_pk_mul_f16 v1, v3, v1
44-
; GFX9-NEXT: v_bfi_b32 v2, s4, v1, v0
45-
; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
46-
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
4744
; GFX9-NEXT: v_bfi_b32 v0, s4, v1, v0
48-
; GFX9-NEXT: s_mov_b32 s4, 0x5040100
49-
; GFX9-NEXT: v_perm_b32 v0, v0, v2, s4
5045
; GFX9-NEXT: s_setpc_b64 s[30:31]
5146
%y = or <2 x i32> %y.arg, <i32 1, i32 1>
5247
%fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %x)

0 commit comments

Comments
 (0)