[AMDGPU] SIFoldOperands: Delay foldCopyToVGPROfScalarAddOfFrameIndex #141558
Conversation
foldCopyToVGPROfScalarAddOfFrameIndex transforms an s_add whose result is copied to a vector register into a v_add. We don't want to do that if foldInstOperand (which currently runs later) can fold the sreg->vreg copy away. This patch therefore delays foldCopyToVGPROfScalarAddOfFrameIndex until after foldInstOperand. This avoids unnecessary movs in the flat-scratch-svs.ll test and also avoids regressions in an upcoming patch that enables ISD::PTRADD nodes.
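To make the effect concrete, here is the net change in the GFX942-SDAG checks of flat-scratch-svs.ll (taken from the test diff below; s0 holds the scalar offset):
; before: the scalar operand is first copied into a VGPR
v_mov_b32_e32 v2, s0
v_mad_u32_u24 v0, v0, 2, v2
; after: foldInstOperand folds the copy away and the VALU instruction reads s0 directly
v_mad_u32_u24 v0, v0, 2, s0
With the fold delayed, the scalar add is only rewritten into a vector add when this kind of copy folding is not possible.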
@llvm/pr-subscribers-backend-amdgpu
Author: Fabian Ritter (ritter-x2a)
Full diff: https://github.com/llvm/llvm-project/pull/141558.diff
2 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index 8b12eeba15618..a28c6e2811abe 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -1909,10 +1909,6 @@ bool SIFoldOperandsImpl::tryFoldFoldableCopy(
if (!DstReg.isVirtual())
return false;
- if (OpToFold.isReg() &&
- foldCopyToVGPROfScalarAddOfFrameIndex(DstReg, OpToFold.getReg(), MI))
- return true;
-
// Fold copy to AGPR through reg_sequence
// TODO: Handle with subregister extract
if (OpToFold.isReg() && MI.isCopy() && !MI.getOperand(1).getSubReg()) {
@@ -1947,7 +1943,14 @@ bool SIFoldOperandsImpl::tryFoldFoldableCopy(
Changed = true;
}
- return Changed;
+ if (Changed)
+ return true;
+
+ // Run this after foldInstOperand to avoid turning scalar additions into
+ // vector additions when the resulting scalar value could just be folded into
+ // the user(s).
+ return (OpToFold.isReg() &&
+ foldCopyToVGPROfScalarAddOfFrameIndex(DstReg, OpToFold.getReg(), MI));
}
// Clamp patterns are canonically selected to v_max_* instructions, so only
diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll
index bcd5d1e87954f..a98df5c97293c 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll
@@ -182,8 +182,7 @@ define amdgpu_kernel void @soff1_voff2(i32 %soff) {
; GFX942-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, 1
; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, s0
-; GFX942-SDAG-NEXT: v_mad_u32_u24 v0, v0, 2, v2
+; GFX942-SDAG-NEXT: v_mad_u32_u24 v0, v0, 2, s0
; GFX942-SDAG-NEXT: v_add_u32_e32 v2, 1, v0
; GFX942-SDAG-NEXT: v_add_u32_e32 v3, 2, v0
; GFX942-SDAG-NEXT: scratch_store_byte v2, v1, off sc0 sc1
@@ -356,8 +355,7 @@ define amdgpu_kernel void @soff1_voff4(i32 %soff) {
; GFX942-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, 1
; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, s0
-; GFX942-SDAG-NEXT: v_mad_u32_u24 v0, v0, 4, v2
+; GFX942-SDAG-NEXT: v_mad_u32_u24 v0, v0, 4, s0
; GFX942-SDAG-NEXT: v_add_u32_e32 v2, 1, v0
; GFX942-SDAG-NEXT: v_add_u32_e32 v3, 2, v0
; GFX942-SDAG-NEXT: scratch_store_byte v2, v1, off sc0 sc1
@@ -701,14 +699,13 @@ define amdgpu_kernel void @soff2_voff2(i32 %soff) {
; GFX942-SDAG-NEXT: s_load_dword s0, s[4:5], 0x24
; GFX942-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, 1
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, 2
; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-SDAG-NEXT: s_lshl_b32 s0, s0, 1
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, s0
-; GFX942-SDAG-NEXT: v_mad_u32_u24 v0, v0, 2, v2
+; GFX942-SDAG-NEXT: v_mad_u32_u24 v0, v0, 2, s0
; GFX942-SDAG-NEXT: scratch_store_byte v0, v1, off offset:1 sc0 sc1
; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX942-SDAG-NEXT: v_add_u32_e32 v1, 2, v0
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, 2
; GFX942-SDAG-NEXT: scratch_store_byte v1, v2, off sc0 sc1
; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX942-SDAG-NEXT: v_add_u32_e32 v0, 4, v0
@@ -884,14 +881,13 @@ define amdgpu_kernel void @soff2_voff4(i32 %soff) {
; GFX942-SDAG-NEXT: s_load_dword s0, s[4:5], 0x24
; GFX942-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, 1
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, 2
; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-SDAG-NEXT: s_lshl_b32 s0, s0, 1
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, s0
-; GFX942-SDAG-NEXT: v_mad_u32_u24 v0, v0, 4, v2
+; GFX942-SDAG-NEXT: v_mad_u32_u24 v0, v0, 4, s0
; GFX942-SDAG-NEXT: scratch_store_byte v0, v1, off offset:1 sc0 sc1
; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX942-SDAG-NEXT: v_add_u32_e32 v1, 2, v0
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, 2
; GFX942-SDAG-NEXT: scratch_store_byte v1, v2, off sc0 sc1
; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX942-SDAG-NEXT: v_add_u32_e32 v0, 4, v0
@@ -1239,14 +1235,13 @@ define amdgpu_kernel void @soff4_voff2(i32 %soff) {
; GFX942-SDAG-NEXT: s_load_dword s0, s[4:5], 0x24
; GFX942-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, 1
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, 2
; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-SDAG-NEXT: s_lshl_b32 s0, s0, 2
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, s0
-; GFX942-SDAG-NEXT: v_mad_u32_u24 v0, v0, 2, v2
+; GFX942-SDAG-NEXT: v_mad_u32_u24 v0, v0, 2, s0
; GFX942-SDAG-NEXT: scratch_store_byte v0, v1, off offset:1 sc0 sc1
; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX942-SDAG-NEXT: v_add_u32_e32 v1, 2, v0
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, 2
; GFX942-SDAG-NEXT: scratch_store_byte v1, v2, off sc0 sc1
; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX942-SDAG-NEXT: v_add_u32_e32 v0, 4, v0
@@ -1425,8 +1420,7 @@ define amdgpu_kernel void @soff4_voff4(i32 %soff) {
; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, 2
; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-SDAG-NEXT: s_lshl_b32 s0, s0, 2
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v3, s0
-; GFX942-SDAG-NEXT: v_mad_u32_u24 v0, v0, 4, v3
+; GFX942-SDAG-NEXT: v_mad_u32_u24 v0, v0, 4, s0
; GFX942-SDAG-NEXT: scratch_store_byte v0, v1, off offset:1 sc0 sc1
; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX942-SDAG-NEXT: scratch_store_byte v0, v2, off offset:2 sc0 sc1