[AMDGPU] Move S_BFE lowering into RegBankCombiner #141589
base: users/pierre-vh/gi-kb-sbfe
Conversation
Warning: This pull request is not mergeable via GitHub because a downstack PR is open. Once all requirements are satisfied, merge this PR as a stack on Graphite.
This stack of pull requests is managed by Graphite.
@llvm/pr-subscribers-llvm-globalisel
@llvm/pr-subscribers-backend-amdgpu

Author: Pierre van Houtryve (Pierre-vh)

Changes: NFC

Full diff: https://github.com/llvm/llvm-project/pull/141589.diff

3 Files Affected:
- llvm/lib/Target/AMDGPU/AMDGPUCombine.td
- llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp
- llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
index 9587fad1ecd63..94e1175b06b14 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
@@ -151,6 +151,17 @@ def zext_of_shift_amount_combines : GICombineGroup<[
canonicalize_zext_lshr, canonicalize_zext_ashr, canonicalize_zext_shl
]>;
+// Early select of uniform BFX into S_BFE instructions.
+// These instructions encode the offset/width in a way that requires using
+// bitwise operations. Selecting these instructions early allows the combiner
+// to potentially fold these.
+class lower_uniform_bfx<Instruction bfx> : GICombineRule<
+ (defs root:$bfx),
+ (combine (bfx $dst, $src, $o, $w):$bfx, [{ return lowerUniformBFX(*${bfx}); }])>;
+
+def lower_uniform_sbfx : lower_uniform_bfx<G_SBFX>;
+def lower_uniform_ubfx : lower_uniform_bfx<G_UBFX>;
+
let Predicates = [Has16BitInsts, NotHasMed3_16] in {
// For gfx8, expand f16-fmed3-as-f32 into a min/max f16 sequence. This
// saves one instruction compared to the promotion.
@@ -198,5 +209,6 @@ def AMDGPURegBankCombiner : GICombiner<
zext_trunc_fold, int_minmax_to_med3, ptr_add_immed_chain,
fp_minmax_to_clamp, fp_minmax_to_med3, fmed3_intrinsic_to_clamp,
identity_combines, redundant_and, constant_fold_cast_op,
- cast_of_cast_combines, sext_trunc, zext_of_shift_amount_combines]> {
+ cast_of_cast_combines, sext_trunc, zext_of_shift_amount_combines,
+ lower_uniform_sbfx, lower_uniform_ubfx]> {
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp
index ee324a5e93f0f..2100900bb8eb2 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp
@@ -89,6 +89,8 @@ class AMDGPURegBankCombinerImpl : public Combiner {
void applyCanonicalizeZextShiftAmt(MachineInstr &MI, MachineInstr &Ext) const;
+ bool lowerUniformBFX(MachineInstr &MI) const;
+
private:
SIModeRegisterDefaults getMode() const;
bool getIEEE() const;
@@ -392,6 +394,55 @@ void AMDGPURegBankCombinerImpl::applyCanonicalizeZextShiftAmt(
MI.eraseFromParent();
}
+bool AMDGPURegBankCombinerImpl::lowerUniformBFX(MachineInstr &MI) const {
+ assert(MI.getOpcode() == TargetOpcode::G_UBFX ||
+ MI.getOpcode() == TargetOpcode::G_SBFX);
+ const bool Signed = (MI.getOpcode() == TargetOpcode::G_SBFX);
+
+ Register DstReg = MI.getOperand(0).getReg();
+ const RegisterBank *RB = RBI.getRegBank(DstReg, MRI, TRI);
+ assert(RB && "No RB?");
+ if (RB->getID() != AMDGPU::SGPRRegBankID)
+ return false;
+
+ Register SrcReg = MI.getOperand(1).getReg();
+ Register OffsetReg = MI.getOperand(2).getReg();
+ Register WidthReg = MI.getOperand(3).getReg();
+
+ const LLT S32 = LLT::scalar(32);
+ LLT Ty = MRI.getType(DstReg);
+
+ const unsigned Opc = (Ty == S32)
+ ? (Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32)
+ : (Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64);
+
+ // Ensure the high bits are clear to insert the offset.
+ auto OffsetMask = B.buildConstant(S32, maskTrailingOnes<unsigned>(6));
+ auto ClampOffset = B.buildAnd(S32, OffsetReg, OffsetMask);
+
+ // Zeros out the low bits, so don't bother clamping the input value.
+ auto ShiftAmt = B.buildConstant(S32, 16);
+ auto ShiftWidth = B.buildShl(S32, WidthReg, ShiftAmt);
+
+ // Transformation function, pack the offset and width of a BFE into
+ // the format expected by the S_BFE_I32 / S_BFE_U32. In the second
+ // source, bits [5:0] contain the offset and bits [22:16] the width.
+ auto MergedInputs = B.buildOr(S32, ClampOffset, ShiftWidth);
+
+ MRI.setRegBank(OffsetMask.getReg(0), *RB);
+ MRI.setRegBank(ClampOffset.getReg(0), *RB);
+ MRI.setRegBank(ShiftAmt.getReg(0), *RB);
+ MRI.setRegBank(ShiftWidth.getReg(0), *RB);
+ MRI.setRegBank(MergedInputs.getReg(0), *RB);
+
+ auto MIB = B.buildInstr(Opc, {DstReg}, {SrcReg, MergedInputs});
+ if (!constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI))
+ llvm_unreachable("failed to constrain BFE");
+
+ MI.eraseFromParent();
+ return true;
+}
+
SIModeRegisterDefaults AMDGPURegBankCombinerImpl::getMode() const {
return MF.getInfo<SIMachineFunctionInfo>()->getMode();
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index dd7aef8f0c583..0b7d64ee67c34 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -1492,88 +1492,73 @@ bool AMDGPURegisterBankInfo::applyMappingBFE(MachineIRBuilder &B,
Register WidthReg = MI.getOperand(FirstOpnd + 2).getReg();
const RegisterBank *DstBank =
- OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
- if (DstBank == &AMDGPU::VGPRRegBank) {
- if (Ty == S32)
- return true;
-
- // There is no 64-bit vgpr bitfield extract instructions so the operation
- // is expanded to a sequence of instructions that implement the operation.
- ApplyRegBankMapping ApplyBank(B, *this, MRI, &AMDGPU::VGPRRegBank);
+ OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
- const LLT S64 = LLT::scalar(64);
- // Shift the source operand so that extracted bits start at bit 0.
- auto ShiftOffset = Signed ? B.buildAShr(S64, SrcReg, OffsetReg)
- : B.buildLShr(S64, SrcReg, OffsetReg);
- auto UnmergeSOffset = B.buildUnmerge({S32, S32}, ShiftOffset);
-
- // A 64-bit bitfield extract uses the 32-bit bitfield extract instructions
- // if the width is a constant.
- if (auto ConstWidth = getIConstantVRegValWithLookThrough(WidthReg, MRI)) {
- // Use the 32-bit bitfield extract instruction if the width is a constant.
- // Depending on the width size, use either the low or high 32-bits.
- auto Zero = B.buildConstant(S32, 0);
- auto WidthImm = ConstWidth->Value.getZExtValue();
- if (WidthImm <= 32) {
- // Use bitfield extract on the lower 32-bit source, and then sign-extend
- // or clear the upper 32-bits.
- auto Extract =
- Signed ? B.buildSbfx(S32, UnmergeSOffset.getReg(0), Zero, WidthReg)
- : B.buildUbfx(S32, UnmergeSOffset.getReg(0), Zero, WidthReg);
- auto Extend =
- Signed ? B.buildAShr(S32, Extract, B.buildConstant(S32, 31)) : Zero;
- B.buildMergeLikeInstr(DstReg, {Extract, Extend});
- } else {
- // Use bitfield extract on upper 32-bit source, and combine with lower
- // 32-bit source.
- auto UpperWidth = B.buildConstant(S32, WidthImm - 32);
- auto Extract =
- Signed
- ? B.buildSbfx(S32, UnmergeSOffset.getReg(1), Zero, UpperWidth)
- : B.buildUbfx(S32, UnmergeSOffset.getReg(1), Zero, UpperWidth);
- B.buildMergeLikeInstr(DstReg, {UnmergeSOffset.getReg(0), Extract});
- }
- MI.eraseFromParent();
+ if (DstBank != &AMDGPU::VGPRRegBank) {
+ // SGPR: Canonicalize to a G_S/UBFX
+ if (!isa<GIntrinsic>(MI))
return true;
- }
- // Expand to Src >> Offset << (64 - Width) >> (64 - Width) using 64-bit
- // operations.
- auto ExtShift = B.buildSub(S32, B.buildConstant(S32, 64), WidthReg);
- auto SignBit = B.buildShl(S64, ShiftOffset, ExtShift);
+ ApplyRegBankMapping ApplyBank(B, *this, MRI, &AMDGPU::SGPRRegBank);
if (Signed)
- B.buildAShr(S64, SignBit, ExtShift);
+ B.buildSbfx(DstReg, SrcReg, OffsetReg, WidthReg);
else
- B.buildLShr(S64, SignBit, ExtShift);
+ B.buildUbfx(DstReg, SrcReg, OffsetReg, WidthReg);
MI.eraseFromParent();
return true;
}
- // The scalar form packs the offset and width in a single operand.
-
- ApplyRegBankMapping ApplyBank(B, *this, MRI, &AMDGPU::SGPRRegBank);
-
- // Ensure the high bits are clear to insert the offset.
- auto OffsetMask = B.buildConstant(S32, maskTrailingOnes<unsigned>(6));
- auto ClampOffset = B.buildAnd(S32, OffsetReg, OffsetMask);
-
- // Zeros out the low bits, so don't bother clamping the input value.
- auto ShiftWidth = B.buildShl(S32, WidthReg, B.buildConstant(S32, 16));
-
- // Transformation function, pack the offset and width of a BFE into
- // the format expected by the S_BFE_I32 / S_BFE_U32. In the second
- // source, bits [5:0] contain the offset and bits [22:16] the width.
- auto MergedInputs = B.buildOr(S32, ClampOffset, ShiftWidth);
+ // VGPR
+ if (Ty == S32)
+ return true;
- // TODO: It might be worth using a pseudo here to avoid scc clobber and
- // register class constraints.
- unsigned Opc = Ty == S32 ? (Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32) :
- (Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64);
+ // There is no 64-bit vgpr bitfield extract instruction, so the operation
+ // is expanded to a sequence of instructions that implement the operation.
+ ApplyRegBankMapping ApplyBank(B, *this, MRI, &AMDGPU::VGPRRegBank);
- auto MIB = B.buildInstr(Opc, {DstReg}, {SrcReg, MergedInputs});
- if (!constrainSelectedInstRegOperands(*MIB, *TII, *TRI, *this))
- llvm_unreachable("failed to constrain BFE");
+ const LLT S64 = LLT::scalar(64);
+ // Shift the source operand so that extracted bits start at bit 0.
+ auto ShiftOffset = Signed ? B.buildAShr(S64, SrcReg, OffsetReg)
+ : B.buildLShr(S64, SrcReg, OffsetReg);
+ auto UnmergeSOffset = B.buildUnmerge({S32, S32}, ShiftOffset);
+
+ // A 64-bit bitfield extract uses the 32-bit bitfield extract instructions
+ // if the width is a constant.
+ if (auto ConstWidth = getIConstantVRegValWithLookThrough(WidthReg, MRI)) {
+ // Use the 32-bit bitfield extract instruction if the width is a constant.
+ // Depending on the width size, use either the low or high 32-bits.
+ auto Zero = B.buildConstant(S32, 0);
+ auto WidthImm = ConstWidth->Value.getZExtValue();
+ if (WidthImm <= 32) {
+ // Use bitfield extract on the lower 32-bit source, and then sign-extend
+ // or clear the upper 32-bits.
+ auto Extract =
+ Signed ? B.buildSbfx(S32, UnmergeSOffset.getReg(0), Zero, WidthReg)
+ : B.buildUbfx(S32, UnmergeSOffset.getReg(0), Zero, WidthReg);
+ auto Extend =
+ Signed ? B.buildAShr(S32, Extract, B.buildConstant(S32, 31)) : Zero;
+ B.buildMergeLikeInstr(DstReg, {Extract, Extend});
+ } else {
+ // Use bitfield extract on upper 32-bit source, and combine with lower
+ // 32-bit source.
+ auto UpperWidth = B.buildConstant(S32, WidthImm - 32);
+ auto Extract =
+ Signed ? B.buildSbfx(S32, UnmergeSOffset.getReg(1), Zero, UpperWidth)
+ : B.buildUbfx(S32, UnmergeSOffset.getReg(1), Zero, UpperWidth);
+ B.buildMergeLikeInstr(DstReg, {UnmergeSOffset.getReg(0), Extract});
+ }
+ MI.eraseFromParent();
+ return true;
+ }
+ // Expand to Src >> Offset << (64 - Width) >> (64 - Width) using 64-bit
+ // operations.
+ auto ExtShift = B.buildSub(S32, B.buildConstant(S32, 64), WidthReg);
+ auto SignBit = B.buildShl(S64, ShiftOffset, ExtShift);
+ if (Signed)
+ B.buildAShr(S64, SignBit, ExtShift);
+ else
+ B.buildLShr(S64, SignBit, ExtShift);
MI.eraseFromParent();
return true;
}
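For reference, the packing that lowerUniformBFX performs can be modeled in plain C++. This is an illustrative sketch, not part of the patch, and packSBFEOperand is a hypothetical name:

```cpp
#include <cstdint>

// Model of the second source operand of S_BFE_{I,U}{32,64}:
// bits [5:0] hold the bit offset, bits [22:16] hold the field width.
// Mirrors the buildAnd / buildShl / buildOr sequence in the patch.
uint32_t packSBFEOperand(uint32_t Offset, uint32_t Width) {
  uint32_t ClampedOffset = Offset & 0x3f; // keep offset bits [5:0]
  uint32_t ShiftedWidth = Width << 16;    // width goes to bits [22:16]
  return ClampedOffset | ShiftedWidth;
}
// Example: Offset = 4, Width = 8 packs to 0x00080004.
```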
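Likewise, the variable-width 64-bit VGPR fallback, Src >> Offset << (64 - Width) >> (64 - Width), can be sketched as scalar C++. Again illustrative only, with hypothetical names, and assuming 1 <= Width <= 64:

```cpp
#include <cstdint>

// Unsigned form: logical shifts clear the bits above the field.
uint64_t ubfx64(uint64_t Src, unsigned Offset, unsigned Width) {
  uint64_t Shifted = Src >> Offset; // field now starts at bit 0
  unsigned ExtShift = 64 - Width;   // assumes 1 <= Width <= 64
  return (Shifted << ExtShift) >> ExtShift;
}

// Signed form: the pre-shift and the final shift are arithmetic,
// so the top bit of the field is sign-extended into the high bits.
int64_t sbfx64(uint64_t Src, unsigned Offset, unsigned Width) {
  int64_t Shifted = (int64_t)Src >> Offset;
  unsigned ExtShift = 64 - Width;
  return (int64_t)((uint64_t)Shifted << ExtShift) >> ExtShift;
}
```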
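The constant-width 64-bit path instead splits the extract across the two 32-bit halves, as the moved code does with buildUnmerge plus a 32-bit bfx. A rough unsigned-only model (hypothetical helper, illustrative only):

```cpp
#include <cstdint>

// Width <= 32: extract entirely from the low half; high half is zero.
// Width > 32: keep the low half and extract Width - 32 bits from the
// high half (matches the UnmergeSOffset handling in the patch).
uint64_t ubfx64ConstWidth(uint64_t Src, unsigned Offset, unsigned Width) {
  uint64_t Shifted = Src >> Offset;
  uint32_t Lo = (uint32_t)Shifted;
  uint32_t Hi = (uint32_t)(Shifted >> 32);
  if (Width <= 32) {
    uint32_t Mask = Width == 32 ? ~0u : ((1u << Width) - 1);
    return Lo & Mask;
  }
  unsigned UpperWidth = Width - 32;
  uint32_t Mask = UpperWidth == 32 ? ~0u : ((1u << UpperWidth) - 1);
  return ((uint64_t)(Hi & Mask) << 32) | Lo;
}
```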
    const LLT S32 = LLT::scalar(32);
    LLT Ty = MRI.getType(DstReg);

    const unsigned Opc = (Ty == S32)
Suggested change:
-    const unsigned Opc = (Ty == S32)
+    const unsigned Opc = Ty == S32
// Early select of uniform BFX into S_BFE instructions.
// These instructions encode the offset/width in a way that requires using
// bitwise operations. Selecting these instructions early allows the combiner
// to potentially fold these.
class lower_uniform_bfx<Instruction bfx> : GICombineRule<
  (defs root:$bfx),
  (combine (bfx $dst, $src, $o, $w):$bfx, [{ return lowerUniformBFX(*${bfx}); }])>;

def lower_uniform_sbfx : lower_uniform_bfx<G_SBFX>;
def lower_uniform_ubfx : lower_uniform_bfx<G_UBFX>;
This needs more elaboration; needs to be clear that this can't be a mandatory lowering performed in a combiner
> needs to be clear that this can't be a mandatory lowering performed in a combiner

What do you mean? I don't understand.

Do you mean the combine name should avoid "lower" and be named something like `bfx_to_s_bfe` instead?

I just thought about this, but should I also make sure ISel doesn't crash on scalar BFXs by making it select the vector version all the time and inserting copies if the inputs are SGPRs? It's not a good result, but at least our ISel wouldn't crash if that combine is skipped for some reason.
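For context, the rewrite this combine performs looks roughly like the MIR below. This is a sketch with invented register names, not verbatim compiler output:

```mir
; Before (all operands uniform, SGPR bank):
%dst:sgpr(s32) = G_UBFX %src:sgpr(s32), %off:sgpr(s32), %w:sgpr(s32)

; After lowerUniformBFX fires:
%c63:sgpr(s32) = G_CONSTANT i32 63
%clamped:sgpr(s32) = G_AND %off, %c63      ; offset into bits [5:0]
%c16:sgpr(s32) = G_CONSTANT i32 16
%wshift:sgpr(s32) = G_SHL %w, %c16         ; width into bits [22:16]
%packed:sgpr(s32) = G_OR %clamped, %wshift
%dst:sreg_32 = S_BFE_U32 %src, %packed, implicit-def $scc
```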
  Register OffsetReg = MI.getOperand(2).getReg();
  Register WidthReg = MI.getOperand(3).getReg();

  const LLT S32 = LLT::scalar(32);
nit: is it necessary to use `const` on a local scalar variable?
Do you mean why the `const` is necessary? I tend to favor putting `const` whenever a variable won't change, even though we don't always do it.

Or why use an `S32` variable at all? It's to avoid repeating `LLT::scalar(32)` all over the place, as I'm using that type in multiple places here. Note that this is a type (i32) and not a value.