-
Notifications
You must be signed in to change notification settings - Fork 14k
[AMDGPU] Treat image_msaa_load as a sampler operation #141726
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
While image_msaa_load does not take a sampler, it can behave as if it does on some hardware. This has implications for wait counting and clausing.
@llvm/pr-subscribers-backend-amdgpu Author: Carl Ritson (perlfu) ChangesWhile image_msaa_load does not take a sampler, it can behave as if it does on some hardware. This has implications for wait counting and clausing. Full diff: https://github.com/llvm/llvm-project/pull/141726.diff 5 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/SIInsertHardClauses.cpp b/llvm/lib/Target/AMDGPU/SIInsertHardClauses.cpp
index 67c4cac7b4c88..eb0977b92d5ab 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertHardClauses.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertHardClauses.cpp
@@ -116,7 +116,7 @@ class SIInsertHardClauses {
AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode);
if (BaseInfo->BVH)
return HARDCLAUSE_BVH;
- if (BaseInfo->Sampler)
+ if (BaseInfo->Sampler || BaseInfo->MSAA)
return HARDCLAUSE_MIMG_SAMPLE;
return MI.mayLoad() ? MI.mayStore() ? HARDCLAUSE_MIMG_ATOMIC
: HARDCLAUSE_MIMG_LOAD
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 99c00e149c140..abaeb24ac0dd5 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -189,9 +189,10 @@ VmemType getVmemType(const MachineInstr &Inst) {
// We have to make an additional check for isVSAMPLE here since some
// instructions don't have a sampler, but are still classified as sampler
// instructions for the purposes of e.g. waitcnt.
- return BaseInfo->BVH ? VMEM_BVH
- : (BaseInfo->Sampler || SIInstrInfo::isVSAMPLE(Inst)) ? VMEM_SAMPLER
- : VMEM_NOSAMPLER;
+ bool HasSampler = BaseInfo->Sampler || BaseInfo->MSAA || SIInstrInfo::isVSAMPLE(Inst);
+ return BaseInfo->BVH ? VMEM_BVH
+ : HasSampler ? VMEM_SAMPLER
+ : VMEM_NOSAMPLER;
}
unsigned &getCounterRef(AMDGPU::Waitcnt &Wait, InstCounterType T) {
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.waitcnt.out.order.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.waitcnt.out.order.ll
index 947c838740d43..ca6bccda124c5 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.waitcnt.out.order.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.waitcnt.out.order.ll
@@ -90,7 +90,6 @@ define amdgpu_ps <3 x float> @sample_load(<8 x i32> inreg %rsrc, <4 x i32> inreg
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, 0
; GFX11-TRUE16-NEXT: image_msaa_load v[0:3], v[2:3], s[12:19] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm a16
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: image_sample_lz v2, [v4, v4], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: ; return to shader part epilog
@@ -100,7 +99,6 @@ define amdgpu_ps <3 x float> @sample_load(<8 x i32> inreg %rsrc, <4 x i32> inreg
; GFX11-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, 0
; GFX11-FAKE16-NEXT: image_msaa_load v[0:3], [v0, v2], s[12:19] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm a16
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: image_sample_lz v2, [v4, v4], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: ; return to shader part epilog
@@ -166,7 +164,6 @@ define amdgpu_ps <3 x float> @load_sample(<8 x i32> inreg %rsrc, <4 x i32> inreg
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, 0
; GFX11-TRUE16-NEXT: image_msaa_load v[0:3], v[2:3], s[12:19] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm a16
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: image_sample_lz v2, [v4, v4], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: ; return to shader part epilog
@@ -176,7 +173,6 @@ define amdgpu_ps <3 x float> @load_sample(<8 x i32> inreg %rsrc, <4 x i32> inreg
; GFX11-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, 0
; GFX11-FAKE16-NEXT: image_msaa_load v[0:3], [v0, v2], s[12:19] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm a16
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: image_sample_lz v2, [v4, v4], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: ; return to shader part epilog
diff --git a/llvm/test/CodeGen/AMDGPU/waitcnt-sample-out-order.mir b/llvm/test/CodeGen/AMDGPU/waitcnt-sample-out-order.mir
index 4bd35138815ed..eea99e7203537 100644
--- a/llvm/test/CodeGen/AMDGPU/waitcnt-sample-out-order.mir
+++ b/llvm/test/CodeGen/AMDGPU/waitcnt-sample-out-order.mir
@@ -57,9 +57,9 @@ body: |
; GFX11: S_WAITCNT 1015
; GFX1150-NEXT: S_WAITCNT 1015
; GFX12-NEXT: S_WAIT_SAMPLECNT 0
- ; GCN-NEXT: renamable $vgpr10_vgpr11_vgpr12_vgpr13 = IMAGE_MSAA_LOAD_V4_V2_gfx11 killed renamable $vgpr4_vgpr5, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 4, 7, -1, 0, 0, -1, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), addrspace 8)
+ ; GCN-NEXT: renamable $vgpr10_vgpr11_vgpr12_vgpr13 = IMAGE_LOAD_V4_V2_gfx11 killed renamable $vgpr4_vgpr5, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 4, 7, -1, 0, 0, -1, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), addrspace 8)
$vgpr13_vgpr14_vgpr15_vgpr16 = IMAGE_SAMPLE_V4_V2 $vgpr20_vgpr21, $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, $sgpr0_sgpr1_sgpr2_sgpr3, 15, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 4, addrspace 4)
- renamable $vgpr10_vgpr11_vgpr12_vgpr13 = IMAGE_MSAA_LOAD_V4_V2_gfx11 killed renamable $vgpr4_vgpr5, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 4, 7, -1, 0, 0, -1, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), addrspace 8)
+ renamable $vgpr10_vgpr11_vgpr12_vgpr13 = IMAGE_LOAD_V4_V2_gfx11 killed renamable $vgpr4_vgpr5, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 4, 7, -1, 0, 0, -1, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), addrspace 8)
S_ENDPGM 0
...
---
@@ -69,12 +69,12 @@ machineFunctionInfo:
body: |
bb.0:
; GCN-LABEL: name: waitcnt-load-sample
- ; GCN: renamable $vgpr10_vgpr11_vgpr12_vgpr13 = IMAGE_MSAA_LOAD_V4_V2_gfx11 killed renamable $vgpr4_vgpr5, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 4, 7, -1, 0, 0, -1, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), addrspace 8)
+ ; GCN: renamable $vgpr10_vgpr11_vgpr12_vgpr13 = IMAGE_LOAD_V4_V2_gfx11 killed renamable $vgpr4_vgpr5, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 4, 7, -1, 0, 0, -1, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), addrspace 8)
; GFX11: S_WAITCNT 1015
; GFX1150-NEXT: S_WAITCNT 1015
; GFX12-NEXT: S_WAIT_LOADCNT 0
; GCN-NEXT: $vgpr13_vgpr14_vgpr15_vgpr16 = IMAGE_SAMPLE_V4_V2 $vgpr20_vgpr21, $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, $sgpr0_sgpr1_sgpr2_sgpr3, 15, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 4, addrspace 4)
- renamable $vgpr10_vgpr11_vgpr12_vgpr13 = IMAGE_MSAA_LOAD_V4_V2_gfx11 killed renamable $vgpr4_vgpr5, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 4, 7, -1, 0, 0, -1, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), addrspace 8)
+ renamable $vgpr10_vgpr11_vgpr12_vgpr13 = IMAGE_LOAD_V4_V2_gfx11 killed renamable $vgpr4_vgpr5, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 4, 7, -1, 0, 0, -1, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), addrspace 8)
$vgpr13_vgpr14_vgpr15_vgpr16 = IMAGE_SAMPLE_V4_V2 $vgpr20_vgpr21, $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, $sgpr0_sgpr1_sgpr2_sgpr3, 15, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 4, addrspace 4)
S_ENDPGM 0
...
diff --git a/llvm/test/CodeGen/AMDGPU/waitcnt-sample-waw.mir b/llvm/test/CodeGen/AMDGPU/waitcnt-sample-waw.mir
index 8eb4be266dd3b..64d553cb24e00 100644
--- a/llvm/test/CodeGen/AMDGPU/waitcnt-sample-waw.mir
+++ b/llvm/test/CodeGen/AMDGPU/waitcnt-sample-waw.mir
@@ -13,7 +13,6 @@ body: |
; GFX11-NEXT: {{ $}}
; GFX11-NEXT: S_WAITCNT 0
; GFX11-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_SAMPLE_V4_V1_gfx11 killed renamable $vgpr0, renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, killed renamable $sgpr8_sgpr9_sgpr10_sgpr11, 15, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), addrspace 8)
- ; GFX11-NEXT: S_WAITCNT 1015
; GFX11-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_MSAA_LOAD_V4_V2_gfx11 killed renamable $vgpr4_vgpr5, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 4, 7, -1, 0, 0, -1, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), addrspace 8)
; GFX11-NEXT: S_WAITCNT 1015
; GFX11-NEXT: SI_RETURN_TO_EPILOG killed $vgpr0, killed $vgpr1, killed $vgpr2, killed $vgpr3
@@ -22,3 +21,25 @@ body: |
SI_RETURN_TO_EPILOG killed $vgpr0, killed $vgpr1, killed $vgpr2, killed $vgpr3
...
+
+---
+name: atomic_msaa
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+
+ ; GFX11-LABEL: name: atomic_msaa
+ ; GFX11: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+ ; GFX11-NEXT: {{ $}}
+ ; GFX11-NEXT: S_WAITCNT 0
+ ; GFX11-NEXT: dead $vgpr0 = IMAGE_ATOMIC_ADD_V1_V2_gfx11 killed $vgpr0, renamable $vgpr4_vgpr5, renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 1, 6, -1, 1, 0, -1, 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), addrspace 8)
+ ; GFX11-NEXT: S_WAITCNT 1015
+ ; GFX11-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_MSAA_LOAD_V4_V2_gfx11 killed renamable $vgpr4_vgpr5, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 4, 7, -1, 0, 0, -1, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), addrspace 8)
+ ; GFX11-NEXT: S_WAITCNT 1015
+ ; GFX11-NEXT: SI_RETURN_TO_EPILOG killed $vgpr0, killed $vgpr1, killed $vgpr2, killed $vgpr3
+ dead $vgpr0 = IMAGE_ATOMIC_ADD_V1_V2_gfx11 killed $vgpr0, renamable $vgpr4_vgpr5, renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 1, 6, -1, 1, 0, -1, 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), addrspace 8)
+ renamable $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_MSAA_LOAD_V4_V2_gfx11 killed renamable $vgpr4_vgpr5, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 4, 7, -1, 0, 0, -1, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), addrspace 8)
+ SI_RETURN_TO_EPILOG killed $vgpr0, killed $vgpr1, killed $vgpr2, killed $vgpr3
+
+...
|
✅ With the latest revision this PR passed the C/C++ code formatter. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Comments are just confirming my assumption.
LGTM
@@ -57,9 +57,9 @@ body: | | |||
; GFX11: S_WAITCNT 1015 | |||
; GFX1150-NEXT: S_WAITCNT 1015 | |||
; GFX12-NEXT: S_WAIT_SAMPLECNT 0 | |||
; GCN-NEXT: renamable $vgpr10_vgpr11_vgpr12_vgpr13 = IMAGE_MSAA_LOAD_V4_V2_gfx11 killed renamable $vgpr4_vgpr5, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 4, 7, -1, 0, 0, -1, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), addrspace 8) | |||
; GCN-NEXT: renamable $vgpr10_vgpr11_vgpr12_vgpr13 = IMAGE_LOAD_V4_V2_gfx11 killed renamable $vgpr4_vgpr5, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 4, 7, -1, 0, 0, -1, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), addrspace 8) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This change is because MSAA is now classed as a sample, so no longer demonstrates the issue being tested?
@@ -22,3 +21,25 @@ body: | | |||
SI_RETURN_TO_EPILOG killed $vgpr0, killed $vgpr1, killed $vgpr2, killed $vgpr3 | |||
|
|||
... | |||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Similar to the previous one - I assume you've left the original as-is (with the waitcnt removed), and added another example which reproduces the original intent?
Your assumptions are correct. |
While image_msaa_load does not take a sampler, it can behave as if it does on some hardware. This has implications for wait counting and clausing.
While image_msaa_load does not take a sampler, it can behave as if it does on some hardware. This has implications for wait counting and clausing.