-
Notifications
You must be signed in to change notification settings - Fork 14k
[AMDGPU] Add BFX Formation Combines to RegBankCombiner #141590
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: users/pierre-vh/lower-sbfe-in-rbcomb
Are you sure you want to change the base?
[AMDGPU] Add BFX Formation Combines to RegBankCombiner #141590
Conversation
Warning: This pull request is not mergeable via GitHub because a downstack PR is open. Once all requirements are satisfied, merge this PR as a stack on Graphite.
This stack of pull requests is managed by Graphite. Learn more about stacking. |
@llvm/pr-subscribers-llvm-globalisel @llvm/pr-subscribers-backend-amdgpu Author: Pierre van Houtryve (Pierre-vh) Changes: I believe these combines are relatively safe to use there. The only new registers they may create are the constants for the BFX. For those, borrow the register bank from the source register. Fixes #140040 Patch is 153.51 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/141590.diff 9 Files Affected:
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
index b1e851183de0d..8981b13dac7ed 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -4629,10 +4629,17 @@ bool CombinerHelper::matchBitfieldExtractFromSExtInReg(
if (ShiftImm < 0 || ShiftImm + Width > Ty.getScalarSizeInBits())
return false;
+ const RegisterBank *RB = getRegBank(ShiftSrc);
+
MatchInfo = [=](MachineIRBuilder &B) {
auto Cst1 = B.buildConstant(ExtractTy, ShiftImm);
auto Cst2 = B.buildConstant(ExtractTy, Width);
B.buildSbfx(Dst, ShiftSrc, Cst1, Cst2);
+
+ if (RB) {
+ MRI.setRegBank(Cst1.getReg(0), *RB);
+ MRI.setRegBank(Cst2.getReg(0), *RB);
+ }
};
return true;
}
@@ -4667,10 +4674,18 @@ bool CombinerHelper::matchBitfieldExtractFromAnd(MachineInstr &MI,
return false;
uint64_t Width = APInt(Size, AndImm).countr_one();
+
+ const RegisterBank *RB = getRegBank(ShiftSrc);
+
MatchInfo = [=](MachineIRBuilder &B) {
auto WidthCst = B.buildConstant(ExtractTy, Width);
auto LSBCst = B.buildConstant(ExtractTy, LSBImm);
B.buildInstr(TargetOpcode::G_UBFX, {Dst}, {ShiftSrc, LSBCst, WidthCst});
+
+ if (RB) {
+ MRI.setRegBank(WidthCst.getReg(0), *RB);
+ MRI.setRegBank(LSBCst.getReg(0), *RB);
+ }
};
return true;
}
@@ -4717,10 +4732,17 @@ bool CombinerHelper::matchBitfieldExtractFromShr(
const int64_t Pos = ShrAmt - ShlAmt;
const int64_t Width = Size - ShrAmt;
+ const RegisterBank *RB = getRegBank(ShlSrc);
+
MatchInfo = [=](MachineIRBuilder &B) {
auto WidthCst = B.buildConstant(ExtractTy, Width);
auto PosCst = B.buildConstant(ExtractTy, Pos);
B.buildInstr(ExtrOpcode, {Dst}, {ShlSrc, PosCst, WidthCst});
+
+ if (RB) {
+ MRI.setRegBank(WidthCst.getReg(0), *RB);
+ MRI.setRegBank(PosCst.getReg(0), *RB);
+ }
};
return true;
}
@@ -4775,10 +4797,17 @@ bool CombinerHelper::matchBitfieldExtractFromShrAnd(
if (Opcode == TargetOpcode::G_ASHR && Width + ShrAmt == Size)
return false;
+ const RegisterBank *RB = getRegBank(AndSrc);
+
MatchInfo = [=](MachineIRBuilder &B) {
auto WidthCst = B.buildConstant(ExtractTy, Width);
auto PosCst = B.buildConstant(ExtractTy, Pos);
B.buildInstr(TargetOpcode::G_UBFX, {Dst}, {AndSrc, PosCst, WidthCst});
+
+ if (RB) {
+ MRI.setRegBank(WidthCst.getReg(0), *RB);
+ MRI.setRegBank(PosCst.getReg(0), *RB);
+ }
};
return true;
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
index 94e1175b06b14..96be17c487130 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
@@ -210,5 +210,5 @@ def AMDGPURegBankCombiner : GICombiner<
fp_minmax_to_clamp, fp_minmax_to_med3, fmed3_intrinsic_to_clamp,
identity_combines, redundant_and, constant_fold_cast_op,
cast_of_cast_combines, sext_trunc, zext_of_shift_amount_combines,
- lower_uniform_sbfx, lower_uniform_ubfx]> {
+ lower_uniform_sbfx, lower_uniform_ubfx, form_bitfield_extract]> {
}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll
index ff03cf1231d08..b0a239bef649e 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll
@@ -811,16 +811,15 @@ define amdgpu_ps i32 @s_ashr_v2i16(<2 x i16> inreg %value, <2 x i16> inreg %amou
;
; GFX8-LABEL: s_ashr_v2i16:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_lshr_b32 s2, s0, 16
-; GFX8-NEXT: s_sext_i32_i16 s0, s0
-; GFX8-NEXT: s_lshr_b32 s3, s1, 16
-; GFX8-NEXT: s_ashr_i32 s0, s0, s1
-; GFX8-NEXT: s_sext_i32_i16 s1, s2
-; GFX8-NEXT: s_ashr_i32 s1, s1, s3
-; GFX8-NEXT: s_and_b32 s1, 0xffff, s1
+; GFX8-NEXT: s_lshr_b32 s2, s1, 16
+; GFX8-NEXT: s_sext_i32_i16 s3, s0
+; GFX8-NEXT: s_bfe_i32 s0, s0, 0x100010
+; GFX8-NEXT: s_ashr_i32 s0, s0, s2
+; GFX8-NEXT: s_ashr_i32 s1, s3, s1
; GFX8-NEXT: s_and_b32 s0, 0xffff, s0
-; GFX8-NEXT: s_lshl_b32 s1, s1, 16
-; GFX8-NEXT: s_or_b32 s0, s0, s1
+; GFX8-NEXT: s_and_b32 s1, 0xffff, s1
+; GFX8-NEXT: s_lshl_b32 s0, s0, 16
+; GFX8-NEXT: s_or_b32 s0, s1, s0
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_ashr_v2i16:
@@ -1014,26 +1013,24 @@ define amdgpu_ps <2 x i32> @s_ashr_v4i16(<4 x i16> inreg %value, <4 x i16> inreg
;
; GFX8-LABEL: s_ashr_v4i16:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_lshr_b32 s4, s0, 16
-; GFX8-NEXT: s_sext_i32_i16 s0, s0
-; GFX8-NEXT: s_lshr_b32 s6, s2, 16
-; GFX8-NEXT: s_ashr_i32 s0, s0, s2
-; GFX8-NEXT: s_sext_i32_i16 s2, s4
-; GFX8-NEXT: s_lshr_b32 s5, s1, 16
-; GFX8-NEXT: s_ashr_i32 s2, s2, s6
-; GFX8-NEXT: s_sext_i32_i16 s1, s1
-; GFX8-NEXT: s_lshr_b32 s7, s3, 16
-; GFX8-NEXT: s_ashr_i32 s1, s1, s3
-; GFX8-NEXT: s_sext_i32_i16 s3, s5
-; GFX8-NEXT: s_and_b32 s2, 0xffff, s2
-; GFX8-NEXT: s_ashr_i32 s3, s3, s7
+; GFX8-NEXT: s_lshr_b32 s4, s2, 16
+; GFX8-NEXT: s_sext_i32_i16 s6, s0
+; GFX8-NEXT: s_bfe_i32 s0, s0, 0x100010
+; GFX8-NEXT: s_lshr_b32 s5, s3, 16
+; GFX8-NEXT: s_ashr_i32 s0, s0, s4
+; GFX8-NEXT: s_sext_i32_i16 s4, s1
+; GFX8-NEXT: s_bfe_i32 s1, s1, 0x100010
+; GFX8-NEXT: s_ashr_i32 s2, s6, s2
+; GFX8-NEXT: s_ashr_i32 s1, s1, s5
; GFX8-NEXT: s_and_b32 s0, 0xffff, s0
-; GFX8-NEXT: s_lshl_b32 s2, s2, 16
-; GFX8-NEXT: s_or_b32 s0, s0, s2
-; GFX8-NEXT: s_and_b32 s2, 0xffff, s3
+; GFX8-NEXT: s_ashr_i32 s3, s4, s3
+; GFX8-NEXT: s_and_b32 s2, 0xffff, s2
+; GFX8-NEXT: s_lshl_b32 s0, s0, 16
; GFX8-NEXT: s_and_b32 s1, 0xffff, s1
-; GFX8-NEXT: s_lshl_b32 s2, s2, 16
-; GFX8-NEXT: s_or_b32 s1, s1, s2
+; GFX8-NEXT: s_or_b32 s0, s2, s0
+; GFX8-NEXT: s_and_b32 s2, 0xffff, s3
+; GFX8-NEXT: s_lshl_b32 s1, s1, 16
+; GFX8-NEXT: s_or_b32 s1, s2, s1
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_ashr_v4i16:
@@ -1223,46 +1220,42 @@ define amdgpu_ps <4 x i32> @s_ashr_v8i16(<8 x i16> inreg %value, <8 x i16> inreg
;
; GFX8-LABEL: s_ashr_v8i16:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_lshr_b32 s8, s0, 16
-; GFX8-NEXT: s_sext_i32_i16 s0, s0
-; GFX8-NEXT: s_lshr_b32 s12, s4, 16
-; GFX8-NEXT: s_ashr_i32 s0, s0, s4
-; GFX8-NEXT: s_sext_i32_i16 s4, s8
-; GFX8-NEXT: s_lshr_b32 s9, s1, 16
-; GFX8-NEXT: s_ashr_i32 s4, s4, s12
-; GFX8-NEXT: s_sext_i32_i16 s1, s1
-; GFX8-NEXT: s_lshr_b32 s13, s5, 16
-; GFX8-NEXT: s_ashr_i32 s1, s1, s5
-; GFX8-NEXT: s_sext_i32_i16 s5, s9
-; GFX8-NEXT: s_and_b32 s4, 0xffff, s4
-; GFX8-NEXT: s_lshr_b32 s10, s2, 16
-; GFX8-NEXT: s_ashr_i32 s5, s5, s13
-; GFX8-NEXT: s_sext_i32_i16 s2, s2
+; GFX8-NEXT: s_lshr_b32 s8, s4, 16
+; GFX8-NEXT: s_sext_i32_i16 s12, s0
+; GFX8-NEXT: s_bfe_i32 s0, s0, 0x100010
+; GFX8-NEXT: s_lshr_b32 s9, s5, 16
+; GFX8-NEXT: s_ashr_i32 s0, s0, s8
+; GFX8-NEXT: s_sext_i32_i16 s8, s1
+; GFX8-NEXT: s_bfe_i32 s1, s1, 0x100010
+; GFX8-NEXT: s_lshr_b32 s10, s6, 16
+; GFX8-NEXT: s_ashr_i32 s4, s12, s4
+; GFX8-NEXT: s_ashr_i32 s5, s8, s5
+; GFX8-NEXT: s_ashr_i32 s1, s1, s9
+; GFX8-NEXT: s_sext_i32_i16 s8, s2
+; GFX8-NEXT: s_bfe_i32 s2, s2, 0x100010
; GFX8-NEXT: s_and_b32 s0, 0xffff, s0
-; GFX8-NEXT: s_lshl_b32 s4, s4, 16
-; GFX8-NEXT: s_lshr_b32 s14, s6, 16
-; GFX8-NEXT: s_ashr_i32 s2, s2, s6
-; GFX8-NEXT: s_sext_i32_i16 s6, s10
-; GFX8-NEXT: s_or_b32 s0, s0, s4
-; GFX8-NEXT: s_and_b32 s4, 0xffff, s5
-; GFX8-NEXT: s_lshr_b32 s11, s3, 16
-; GFX8-NEXT: s_ashr_i32 s6, s6, s14
-; GFX8-NEXT: s_sext_i32_i16 s3, s3
+; GFX8-NEXT: s_lshr_b32 s11, s7, 16
+; GFX8-NEXT: s_ashr_i32 s6, s8, s6
+; GFX8-NEXT: s_ashr_i32 s2, s2, s10
+; GFX8-NEXT: s_sext_i32_i16 s8, s3
+; GFX8-NEXT: s_bfe_i32 s3, s3, 0x100010
+; GFX8-NEXT: s_and_b32 s4, 0xffff, s4
+; GFX8-NEXT: s_lshl_b32 s0, s0, 16
; GFX8-NEXT: s_and_b32 s1, 0xffff, s1
-; GFX8-NEXT: s_lshl_b32 s4, s4, 16
-; GFX8-NEXT: s_lshr_b32 s15, s7, 16
-; GFX8-NEXT: s_ashr_i32 s3, s3, s7
-; GFX8-NEXT: s_sext_i32_i16 s7, s11
-; GFX8-NEXT: s_or_b32 s1, s1, s4
-; GFX8-NEXT: s_and_b32 s4, 0xffff, s6
-; GFX8-NEXT: s_ashr_i32 s7, s7, s15
+; GFX8-NEXT: s_ashr_i32 s3, s3, s11
+; GFX8-NEXT: s_or_b32 s0, s4, s0
+; GFX8-NEXT: s_and_b32 s4, 0xffff, s5
+; GFX8-NEXT: s_lshl_b32 s1, s1, 16
; GFX8-NEXT: s_and_b32 s2, 0xffff, s2
-; GFX8-NEXT: s_lshl_b32 s4, s4, 16
-; GFX8-NEXT: s_or_b32 s2, s2, s4
-; GFX8-NEXT: s_and_b32 s4, 0xffff, s7
+; GFX8-NEXT: s_ashr_i32 s7, s8, s7
+; GFX8-NEXT: s_or_b32 s1, s4, s1
+; GFX8-NEXT: s_and_b32 s4, 0xffff, s6
+; GFX8-NEXT: s_lshl_b32 s2, s2, 16
; GFX8-NEXT: s_and_b32 s3, 0xffff, s3
-; GFX8-NEXT: s_lshl_b32 s4, s4, 16
-; GFX8-NEXT: s_or_b32 s3, s3, s4
+; GFX8-NEXT: s_or_b32 s2, s4, s2
+; GFX8-NEXT: s_and_b32 s4, 0xffff, s7
+; GFX8-NEXT: s_lshl_b32 s3, s3, 16
+; GFX8-NEXT: s_or_b32 s3, s4, s3
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_ashr_v8i16:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
index 768a4d039aef9..7077029747c84 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
@@ -40,8 +40,7 @@ define amdgpu_ps i7 @s_fshl_i7(i7 inreg %lhs, i7 inreg %rhs, i7 inreg %amt) {
; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v0, 7
; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX8-NEXT: s_and_b32 s2, s2, 0x7f
-; GFX8-NEXT: s_and_b32 s1, s1, 0x7f
-; GFX8-NEXT: s_lshr_b32 s1, s1, 1
+; GFX8-NEXT: s_bfe_u32 s1, s1, 0x60001
; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0
; GFX8-NEXT: v_mul_lo_u32 v1, v0, -7
@@ -70,8 +69,7 @@ define amdgpu_ps i7 @s_fshl_i7(i7 inreg %lhs, i7 inreg %rhs, i7 inreg %amt) {
; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, 7
; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX9-NEXT: s_and_b32 s2, s2, 0x7f
-; GFX9-NEXT: s_and_b32 s1, s1, 0x7f
-; GFX9-NEXT: s_lshr_b32 s1, s1, 1
+; GFX9-NEXT: s_bfe_u32 s1, s1, 0x60001
; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0
; GFX9-NEXT: v_mul_lo_u32 v1, v0, -7
@@ -99,8 +97,7 @@ define amdgpu_ps i7 @s_fshl_i7(i7 inreg %lhs, i7 inreg %rhs, i7 inreg %amt) {
; GFX10: ; %bb.0:
; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, 7
; GFX10-NEXT: s_and_b32 s2, s2, 0x7f
-; GFX10-NEXT: s_and_b32 s1, s1, 0x7f
-; GFX10-NEXT: s_lshr_b32 s1, s1, 1
+; GFX10-NEXT: s_bfe_u32 s1, s1, 0x60001
; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0
@@ -129,40 +126,38 @@ define amdgpu_ps i7 @s_fshl_i7(i7 inreg %lhs, i7 inreg %rhs, i7 inreg %amt) {
; GFX11: ; %bb.0:
; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v0, 7
; GFX11-NEXT: s_and_b32 s2, s2, 0x7f
-; GFX11-NEXT: s_and_b32 s1, s1, 0x7f
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: s_lshr_b32 s1, s1, 1
+; GFX11-NEXT: s_bfe_u32 s1, s1, 0x60001
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX11-NEXT: s_waitcnt_depctr 0xfff
; GFX11-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GFX11-NEXT: v_mul_lo_u32 v1, v0, -7
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mul_lo_u32 v1, v0, -7
; GFX11-NEXT: v_mul_hi_u32 v1, v0, v1
-; GFX11-NEXT: v_add_nc_u32_e32 v0, v0, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_add_nc_u32_e32 v0, v0, v1
; GFX11-NEXT: v_mul_hi_u32 v0, s2, v0
-; GFX11-NEXT: v_mul_lo_u32 v0, v0, 7
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mul_lo_u32 v0, v0, 7
; GFX11-NEXT: v_sub_nc_u32_e32 v0, s2, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_add_nc_u32_e32 v1, -7, v0
; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 7, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_add_nc_u32_e32 v1, -7, v0
; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 7, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_sub_nc_u16 v1, 6, v0
; GFX11-NEXT: v_and_b32_e32 v0, 0x7f, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_and_b32_e32 v1, 0x7f, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_lshlrev_b16 v0, v0, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshrrev_b16 v1, v1, s1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readfirstlane_b32 s0, v0
; GFX11-NEXT: ; return to shader part epilog
%result = call i7 @llvm.fshl.i7(i7 %lhs, i7 %rhs, i7 %amt)
@@ -345,10 +340,10 @@ define amdgpu_ps i8 @s_fshl_i8(i8 inreg %lhs, i8 inreg %rhs, i8 inreg %amt) {
;
; GFX8-LABEL: s_fshl_i8:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_and_b32 s1, s1, 0xff
+; GFX8-NEXT: s_bfe_u32 s1, s1, 0x70001
; GFX8-NEXT: s_and_b32 s3, s2, 7
-; GFX8-NEXT: s_lshr_b32 s1, s1, 1
; GFX8-NEXT: s_andn2_b32 s2, 7, s2
+; GFX8-NEXT: s_and_b32 s1, 0xffff, s1
; GFX8-NEXT: s_lshl_b32 s0, s0, s3
; GFX8-NEXT: s_lshr_b32 s1, s1, s2
; GFX8-NEXT: s_or_b32 s0, s0, s1
@@ -356,10 +351,10 @@ define amdgpu_ps i8 @s_fshl_i8(i8 inreg %lhs, i8 inreg %rhs, i8 inreg %amt) {
;
; GFX9-LABEL: s_fshl_i8:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_and_b32 s1, s1, 0xff
+; GFX9-NEXT: s_bfe_u32 s1, s1, 0x70001
; GFX9-NEXT: s_and_b32 s3, s2, 7
-; GFX9-NEXT: s_lshr_b32 s1, s1, 1
; GFX9-NEXT: s_andn2_b32 s2, 7, s2
+; GFX9-NEXT: s_and_b32 s1, 0xffff, s1
; GFX9-NEXT: s_lshl_b32 s0, s0, s3
; GFX9-NEXT: s_lshr_b32 s1, s1, s2
; GFX9-NEXT: s_or_b32 s0, s0, s1
@@ -367,10 +362,10 @@ define amdgpu_ps i8 @s_fshl_i8(i8 inreg %lhs, i8 inreg %rhs, i8 inreg %amt) {
;
; GFX10-LABEL: s_fshl_i8:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_and_b32 s1, s1, 0xff
+; GFX10-NEXT: s_bfe_u32 s1, s1, 0x70001
; GFX10-NEXT: s_and_b32 s3, s2, 7
-; GFX10-NEXT: s_lshr_b32 s1, s1, 1
; GFX10-NEXT: s_andn2_b32 s2, 7, s2
+; GFX10-NEXT: s_and_b32 s1, 0xffff, s1
; GFX10-NEXT: s_lshl_b32 s0, s0, s3
; GFX10-NEXT: s_lshr_b32 s1, s1, s2
; GFX10-NEXT: s_or_b32 s0, s0, s1
@@ -378,10 +373,10 @@ define amdgpu_ps i8 @s_fshl_i8(i8 inreg %lhs, i8 inreg %rhs, i8 inreg %amt) {
;
; GFX11-LABEL: s_fshl_i8:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_and_b32 s1, s1, 0xff
+; GFX11-NEXT: s_bfe_u32 s1, s1, 0x70001
; GFX11-NEXT: s_and_b32 s3, s2, 7
-; GFX11-NEXT: s_lshr_b32 s1, s1, 1
; GFX11-NEXT: s_and_not1_b32 s2, 7, s2
+; GFX11-NEXT: s_and_b32 s1, 0xffff, s1
; GFX11-NEXT: s_lshl_b32 s0, s0, s3
; GFX11-NEXT: s_lshr_b32 s1, s1, s2
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
@@ -463,42 +458,17 @@ define i8 @v_fshl_i8(i8 %lhs, i8 %rhs, i8 %amt) {
}
define amdgpu_ps i8 @s_fshl_i8_4(i8 inreg %lhs, i8 inreg %rhs) {
-; GFX6-LABEL: s_fshl_i8_4:
-; GFX6: ; %bb.0:
-; GFX6-NEXT: s_lshl_b32 s0, s0, 4
-; GFX6-NEXT: s_bfe_u32 s1, s1, 0x40004
-; GFX6-NEXT: s_or_b32 s0, s0, s1
-; GFX6-NEXT: ; return to shader part epilog
-;
-; GFX8-LABEL: s_fshl_i8_4:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_and_b32 s1, s1, 0xff
-; GFX8-NEXT: s_lshl_b32 s0, s0, 4
-; GFX8-NEXT: s_lshr_b32 s1, s1, 4
-; GFX8-NEXT: s_or_b32 s0, s0, s1
-; GFX8-NEXT: ; return to shader part epilog
-;
-; GFX9-LABEL: s_fshl_i8_4:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_and_b32 s1, s1, 0xff
-; GFX9-NEXT: s_lshl_b32 s0, s0, 4
-; GFX9-NEXT: s_lshr_b32 s1, s1, 4
-; GFX9-NEXT: s_or_b32 s0, s0, s1
-; GFX9-NEXT: ; return to shader part epilog
-;
-; GFX10-LABEL: s_fshl_i8_4:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_and_b32 s1, s1, 0xff
-; GFX10-NEXT: s_lshl_b32 s0, s0, 4
-; GFX10-NEXT: s_lshr_b32 s1, s1, 4
-; GFX10-NEXT: s_or_b32 s0, s0, s1
-; GFX10-NEXT: ; return to shader part epilog
+; GCN-LABEL: s_fshl_i8_4:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_lshl_b32 s0, s0, 4
+; GCN-NEXT: s_bfe_u32 s1, s1, 0x40004
+; GCN-NEXT: s_or_b32 s0, s0, s1
+; GCN-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: s_fshl_i8_4:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_and_b32 s1, s1, 0xff
; GFX11-NEXT: s_lshl_b32 s0, s0, 4
-; GFX11-NEXT: s_lshr_b32 s1, s1, 4
+; GFX11-NEXT: s_bfe_u32 s1, s1, 0x40004
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_or_b32 s0, s0, s1
; GFX11-NEXT: ; return to shader part epilog
@@ -556,42 +526,17 @@ define i8 @v_fshl_i8_4(i8 %lhs, i8 %rhs) {
}
define amdgpu_ps i8 @s_fshl_i8_5(i8 inreg %lhs, i8 inreg %rhs) {
-; GFX6-LABEL: s_fshl_i8_5:
-; GFX6: ; %bb.0:
-; GFX6-NEXT: s_lshl_b32 s0, s0, 5
-; GFX6-NEXT: s_bfe_u32 s1, s1, 0x50003
-; GFX6-NEXT: s_or_b32 s0, s0, s1
-; GFX6-NEXT: ; return to shader part epilog
-;
-; GFX8-LABEL: s_fshl_i8_5:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_and_b32 s1, s1, 0xff
-; GFX8-NEXT: s_lshl_b32 s0, s0, 5
-; GFX8-NEXT: s_lshr_b32 s1, s1, 3
-; GFX8-NEXT: s_or_b32 s0, s0, s1
-; GFX8-NEXT: ; return to shader part epilog
-;
-; GFX9-LABEL: s_fshl_i8_5:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_and_b32 s1, s1, 0xff
-; GFX9-NEXT: s_lshl_b32 s0, s0, 5
-; GFX9-NEXT: s_lshr_b32 s1, s1, 3
-; GFX9-NEXT: s_or_b32 s0, s0, s1
-; GFX9-NEXT: ; return to shader part epilog
-;
-; GFX10-LABEL: s_fshl_i8_5:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_and_b32 s1, s1, 0xff
-; GFX10-NEXT: s_lshl_b32 s0, s0, 5
-; GFX10-NEXT: s_lshr_b32 s1, s1, 3
-; GFX10-NEXT: s_or_b32 s0, s0, s1
-; GFX10-NEXT: ; return to shader part epilog
+; GCN-LABEL: s_fshl_i8_5:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_lshl_b32 s0, s0, 5
+; GCN-NEXT: s_bfe_u32 s1, s1, 0x50003
+; GCN-NEXT: s_or_b32 s0, s0, s1
+; GCN-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: s_fshl_i8_5:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_and_b32 s1, s1, 0xff
; GFX11-NEXT: s_lshl_b32 s0, s0, 5
-; GFX11-NEXT: s_lshr_b32 s1, s1, 3
+; GFX11-NEXT: s_bfe_u32 s1, s1, 0x50003
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_or_b32 s0, s0, s1
; GFX11-NEXT: ; return to shader part epilog
@@ -674,23 +619,23 @@ define amdgpu_ps i16 @s_fshl_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg, i16 in
;
; GFX8-LABEL: s_fshl_v2i8:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_lshr_b32 s4, s1, 8
-; GFX8-NEXT: s_and_b32 s1, s1, 0xff
-; GFX8-NEXT: s_lshr_b32 s5, s2, 8
-; GFX8-NEXT: s_and_b32 s6, s2, 7
-; GFX8-NEXT: s_lshr_b32 s1, s1, 1
-; GFX8-NEXT: s_andn2_b32 s2, 7, s2
+; GFX8-NEXT: s_and_b32 s5, s2, 7
; GFX8-NEXT: s_lshr_b32 s3, s0, 8
-; GFX8-NEXT: s_lshl_b32 s0, s0, s6
-; GFX8-NEXT: s_lshr_b32 s1, s1, s2
-; GFX8-NEXT: s_or_b32 s0, s0, s1
-; GFX8-NEXT: s_and_b32 s1, s5, 7
-; GFX8-NEXT: s_and_b32 s2, s4, 0xff
-; GFX8-NEXT: s_lshl_b32 s1, s3, s1
-; GFX8-NEXT: s_lshr_b32 s2, s2, 1
-; GFX8-NEXT: s_andn2_b32 s3, 7, s5
-; GFX8-NEXT: s_lshr_b32 s2, s2, s3
-; GFX8-NEXT: s_or_b32 s1, s1, s2
+; GFX8-NEXT: s_lshl_b32 s0, s0, s5
+; GFX8-NEXT: s_bfe_u32 s5, s1, 0x70001
+; GFX8-NEXT: s_lshr_b32 s4, s2, 8
+; GFX8-NEXT: s_andn2_b32 s2, 7, s2
+; GFX8-NEXT: s_and_b32 s5, 0xffff, s5
+; GFX8-NEXT: s_lshr_b32 s2, s5, s2
+; GFX8-NEXT: s_bfe_u32 s1, s1, 0x80008
+; GFX8-NEXT: s_or_b32 s0, s0, s2
+; GFX8-NEXT: s_and_b32 s...
[truncated]
|
6e5a085
to
9b283cd
Compare
e5f2477
to
150fe8c
Compare
9b283cd
to
6c9e836
Compare
150fe8c
to
efa6a12
Compare
6c9e836
to
082f286
Compare
4a0cc51
to
80fdf31
Compare
082f286
to
9731c98
Compare
I believe these combines are relatively safe to use there. The only new registers they may create are the constants for the BFX. For those, borrow the register bank from the source register. Fixes #140040
80fdf31
to
b1dc820
Compare
9731c98
to
8a7e773
Compare
ping |
I believe these combines are relatively safe to use there. The only new
registers they may create are the constants for the BFX. For those, borrow
the register bank from the source register.
Fixes #140040