Skip to content

Commit f88a9a3

Browse files
authored
[AMDGPU] Extend SRA i64 simplification for shift amounts in range [33:62] (#138913)
Extend the sra i64 simplification to constant shift amounts in the range [33:62]; shift amounts 32 and 63 were already handled. New tests for shift amounts 33 and 62 are added in sra.ll. Changes to other test files adapt their previously expected results to this extension. --------- Signed-off-by: John Lu <John.Lu@amd.com>
1 parent 7a66b28 commit f88a9a3

17 files changed

+2512
-2259
lines changed

llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp

Lines changed: 9 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -4162,22 +4162,17 @@ SDValue AMDGPUTargetLowering::performSraCombine(SDNode *N,
41624162
SDLoc SL(N);
41634163
unsigned RHSVal = RHS->getZExtValue();
41644164

4165-
// (sra i64:x, 32) -> build_pair x, (sra hi_32(x), 31)
4166-
if (RHSVal == 32) {
4165+
// For C >= 32
4166+
// (sra i64:x, C) -> build_pair (sra hi_32(x), C - 32), (sra hi_32(x), 31)
4167+
if (RHSVal >= 32) {
41674168
SDValue Hi = getHiHalf64(N->getOperand(0), DAG);
4168-
SDValue NewShift = DAG.getNode(ISD::SRA, SL, MVT::i32, Hi,
4169-
DAG.getConstant(31, SL, MVT::i32));
4169+
Hi = DAG.getFreeze(Hi);
4170+
SDValue HiShift = DAG.getNode(ISD::SRA, SL, MVT::i32, Hi,
4171+
DAG.getConstant(31, SL, MVT::i32));
4172+
SDValue LoShift = DAG.getNode(ISD::SRA, SL, MVT::i32, Hi,
4173+
DAG.getConstant(RHSVal - 32, SL, MVT::i32));
41704174

4171-
SDValue BuildVec = DAG.getBuildVector(MVT::v2i32, SL, {Hi, NewShift});
4172-
return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildVec);
4173-
}
4174-
4175-
// (sra i64:x, 63) -> build_pair (sra hi_32(x), 31), (sra hi_32(x), 31)
4176-
if (RHSVal == 63) {
4177-
SDValue Hi = getHiHalf64(N->getOperand(0), DAG);
4178-
SDValue NewShift = DAG.getNode(ISD::SRA, SL, MVT::i32, Hi,
4179-
DAG.getConstant(31, SL, MVT::i32));
4180-
SDValue BuildVec = DAG.getBuildVector(MVT::v2i32, SL, {NewShift, NewShift});
4175+
SDValue BuildVec = DAG.getBuildVector(MVT::v2i32, SL, {LoShift, HiShift});
41814176
return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildVec);
41824177
}
41834178

llvm/test/CodeGen/AMDGPU/ashr.v2i16.ll

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -685,16 +685,16 @@ define amdgpu_kernel void @ashr_v_imm_v4i16(ptr addrspace(1) %out, ptr addrspace
685685
; CI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
686686
; CI-NEXT: s_mov_b64 s[2:3], s[6:7]
687687
; CI-NEXT: s_waitcnt vmcnt(0)
688-
; CI-NEXT: v_bfe_i32 v6, v3, 0, 16
689-
; CI-NEXT: v_ashr_i64 v[3:4], v[2:3], 56
690-
; CI-NEXT: v_bfe_i32 v5, v2, 0, 16
688+
; CI-NEXT: v_bfe_i32 v4, v2, 0, 16
689+
; CI-NEXT: v_bfe_i32 v5, v3, 0, 16
690+
; CI-NEXT: v_ashrrev_i32_e32 v3, 24, v3
691691
; CI-NEXT: v_ashrrev_i32_e32 v2, 24, v2
692-
; CI-NEXT: v_bfe_u32 v4, v6, 8, 16
693-
; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
694-
; CI-NEXT: v_bfe_u32 v5, v5, 8, 16
695692
; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
696-
; CI-NEXT: v_or_b32_e32 v3, v4, v3
697-
; CI-NEXT: v_or_b32_e32 v2, v5, v2
693+
; CI-NEXT: v_bfe_u32 v5, v5, 8, 16
694+
; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
695+
; CI-NEXT: v_bfe_u32 v4, v4, 8, 16
696+
; CI-NEXT: v_or_b32_e32 v3, v5, v3
697+
; CI-NEXT: v_or_b32_e32 v2, v4, v2
698698
; CI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
699699
; CI-NEXT: s_endpgm
700700
;

llvm/test/CodeGen/AMDGPU/dagcomb-mullohi.ll

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -150,9 +150,9 @@ define i32 @mul_one_bit_hi_hi_u32_lshr_ashr(i32 %arg, i32 %arg1, ptr %arg2) {
150150
; CHECK-LABEL: mul_one_bit_hi_hi_u32_lshr_ashr:
151151
; CHECK: ; %bb.0: ; %bb
152152
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
153-
; CHECK-NEXT: v_mul_hi_u32 v4, v1, v0
154-
; CHECK-NEXT: v_ashrrev_i64 v[0:1], 33, v[3:4]
155-
; CHECK-NEXT: flat_store_dword v[2:3], v4
153+
; CHECK-NEXT: v_mul_hi_u32 v0, v1, v0
154+
; CHECK-NEXT: flat_store_dword v[2:3], v0
155+
; CHECK-NEXT: v_ashrrev_i32_e32 v0, 1, v0
156156
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
157157
; CHECK-NEXT: s_setpc_b64 s[30:31]
158158
bb:

llvm/test/CodeGen/AMDGPU/div_i128.ll

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4398,9 +4398,10 @@ define i128 @v_sdiv_i128_v_pow2k(i128 %lhs) {
43984398
; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v2, vcc
43994399
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
44004400
; GFX9-NEXT: v_lshlrev_b64 v[0:1], 31, v[2:3]
4401-
; GFX9-NEXT: v_lshrrev_b32_e32 v4, 1, v4
4402-
; GFX9-NEXT: v_ashrrev_i64 v[2:3], 33, v[2:3]
4403-
; GFX9-NEXT: v_or_b32_e32 v0, v4, v0
4401+
; GFX9-NEXT: v_lshrrev_b32_e32 v2, 1, v4
4402+
; GFX9-NEXT: v_or_b32_e32 v0, v2, v0
4403+
; GFX9-NEXT: v_ashrrev_i32_e32 v2, 1, v3
4404+
; GFX9-NEXT: v_ashrrev_i32_e32 v3, 31, v3
44044405
; GFX9-NEXT: s_setpc_b64 s[30:31]
44054406
;
44064407
; GFX9-O0-LABEL: v_sdiv_i128_v_pow2k:

llvm/test/CodeGen/AMDGPU/fptoi.i128.ll

Lines changed: 19 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1433,15 +1433,25 @@ define i128 @fptoui_f32_to_i128(float %x) {
14331433
}
14341434

14351435
define i128 @fptosi_f16_to_i128(half %x) {
1436-
; GCN-LABEL: fptosi_f16_to_i128:
1437-
; GCN: ; %bb.0:
1438-
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1439-
; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
1440-
; GCN-NEXT: v_cvt_i32_f32_e32 v0, v0
1441-
; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0
1442-
; GCN-NEXT: v_mov_b32_e32 v2, v1
1443-
; GCN-NEXT: v_mov_b32_e32 v3, v1
1444-
; GCN-NEXT: s_setpc_b64 s[30:31]
1436+
; SDAG-LABEL: fptosi_f16_to_i128:
1437+
; SDAG: ; %bb.0:
1438+
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1439+
; SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
1440+
; SDAG-NEXT: v_cvt_i32_f32_e32 v0, v0
1441+
; SDAG-NEXT: v_ashrrev_i32_e32 v1, 31, v0
1442+
; SDAG-NEXT: v_ashrrev_i32_e32 v2, 31, v1
1443+
; SDAG-NEXT: v_mov_b32_e32 v3, v2
1444+
; SDAG-NEXT: s_setpc_b64 s[30:31]
1445+
;
1446+
; GISEL-LABEL: fptosi_f16_to_i128:
1447+
; GISEL: ; %bb.0:
1448+
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1449+
; GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
1450+
; GISEL-NEXT: v_cvt_i32_f32_e32 v0, v0
1451+
; GISEL-NEXT: v_ashrrev_i32_e32 v1, 31, v0
1452+
; GISEL-NEXT: v_mov_b32_e32 v2, v1
1453+
; GISEL-NEXT: v_mov_b32_e32 v3, v1
1454+
; GISEL-NEXT: s_setpc_b64 s[30:31]
14451455
%cvt = fptosi half %x to i128
14461456
ret i128 %cvt
14471457
}

0 commit comments

Comments
 (0)