Vector512.Shuffle does not produce optimal codegen in some cases #115078
Tagging subscribers to this area: @JulieLeeMSFT, @jakobbotsch
Note: you can use Vector512.ShuffleNative:
static Vector512<byte> M1N(Vector512<byte> v)
{
    Vector512<byte> s = Vector512.Create(0x01020304_05060708, 0x090A0B0C_0D0E0F00, 0x11121314_15161718, 0x191A1B1C_1D1E1F10,
                                         0x21222324_25262728, 0x292A2B2C_2D2E2F20, 0x31323334_35363738, 0x393A3B3C_3D3E3F30).AsByte();
    Vector512<byte> l_v = v;
    for (int i = 0; i < 100; i++)
    {
        l_v = Vector512.ShuffleNative(l_v, s);
    }
    return l_v;
}

; Assembly listing for method ConsoleApp1.Program:M1N(System.Runtime.Intrinsics.Vector512`1[ubyte]):System.Runtime.Intrinsics.Vector512`1[ubyte] (Tier1)
; Emitting BLENDED_CODE for X64 with AVX512 - Windows
; Tier1 code
; optimized code
; optimized using Synthesized PGO
; rsp based frame
; fully interruptible
; with Synthesized PGO: fgCalledCount is 70
G_M000_IG01: ;; offset=0x0000
G_M000_IG02: ;; offset=0x0000
vmovups zmm0, zmmword ptr [reloc @RWD00]
vmovups zmm1, zmmword ptr [rdx]
mov eax, 100
align [0 bytes for IG03]
G_M000_IG03: ;; offset=0x0015
vpermb zmm1, zmm0, zmm1
dec eax
jne SHORT G_M000_IG03
G_M000_IG04: ;; offset=0x001F
vmovups zmmword ptr [rcx], zmm1
mov rax, rcx
G_M000_IG05: ;; offset=0x0028
vzeroupper
ret
RWD00 dq 0102030405060708h, 090A0B0C0D0E0F00h, 1112131415161718h, 191A1B1C1D1E1F10h, 2122232425262728h, 292A2B2C2D2E2F20h, 3132333435363738h, 393A3B3C3D3E3F30h
; Total bytes of code 44

No more [...]

====================================================================

As a side note, if the shuffle indices are in the loop but as a local:

static Vector512<byte> M2L(Vector512<byte> v)
{
    Vector512<byte> l_v = v;
    for (int i = 0; i < 100; i++)
    {
        Vector512<byte> s = Vector512.Create(0x01020304_05060708, 0x090A0B0C_0D0E0F00, 0x11121314_15161718, 0x191A1B1C_1D1E1F10,
                                             0x21222324_25262728, 0x292A2B2C_2D2E2F20, 0x31323334_35363738, 0x393A3B3C_3D3E3F30).AsByte();
        l_v = Vector512.Shuffle(l_v, s);
    }
    return l_v;
}

; Assembly listing for method ConsoleApp1.Program:M2L(System.Runtime.Intrinsics.Vector512`1[ubyte]):System.Runtime.Intrinsics.Vector512`1[ubyte] (Tier1)
; Emitting BLENDED_CODE for X64 with AVX512 - Windows
; Tier1 code
; optimized code
; optimized using Synthesized PGO
; rsp based frame
; fully interruptible
; with Synthesized PGO: fgCalledCount is 73
G_M000_IG01: ;; offset=0x0000
G_M000_IG02: ;; offset=0x0000
vmovups zmm0, zmmword ptr [rdx]
mov eax, 100
align [0 bytes for IG03]
G_M000_IG03: ;; offset=0x000B
vpshufb zmm0, zmm0, zmmword ptr [reloc @RWD00]
dec eax
jne SHORT G_M000_IG03
G_M000_IG04: ;; offset=0x0019
vmovups zmmword ptr [rcx], zmm0
mov rax, rcx
G_M000_IG05: ;; offset=0x0022
vzeroupper
ret
RWD00 dq 0102030405060708h, 090A0B0C0D0E0F00h, 1112131415161718h, 191A1B1C1D1E1F10h, 2122232425262728h, 292A2B2C2D2E2F20h, 3132333435363738h, 393A3B3C3D3E3F30h
; Total bytes of code 38

It does use vpshufb.
You have a range of APIs available (from xplat APIs to hardware-specific intrinsics) based on your scenario and how critical the performance actually is to it. Typical scenarios benefit greatly from the 2-64x perf increase from using vectorization, and any minor performance degradation from ensuring deterministic results or handling edge cases may show up in a micro-benchmark but is unlikely to show up in the end-to-end profile of a real-world application. You're likely to lose far more performance simply by running on hardware without AVX512, or due to general machine latency caused by accessing memory, cache misses, branch mispredicts, etc. However, if it is a scenario where that performance is important, then you have access to the various hardware-specific intrinsics.
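For illustration, a minimal sketch of that choice (the helper name ShuffleBytes is hypothetical; the exact check depends on the scenario) could look like:

```csharp
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;

static Vector512<byte> ShuffleBytes(Vector512<byte> value, Vector512<byte> indices)
{
    if (Avx512Vbmi.IsSupported)
    {
        // Hardware-specific path: a single vpermb, but only on CPUs with AVX512_VBMI.
        return Avx512Vbmi.PermuteVar64x8(value, indices);
    }

    // Portable path: the xplat API handles out-of-range indices deterministically
    // (they produce zero) and falls back as needed on other hardware.
    return Vector512.Shuffle(value, indices);
}
```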
I'd guess that [...]
The worst case is when [...]

First,

; Assembly listing for method ConsoleApp1.Program:M1N(System.Runtime.Intrinsics.Vector512`1[ubyte]):System.Runtime.Intrinsics.Vector512`1[ubyte] (Tier1)
; Emitting BLENDED_CODE for X64 with AVX512 - Windows
; Tier1 code
; optimized code
; optimized using Synthesized PGO
; rsp based frame
; fully interruptible
; with Synthesized PGO: fgCalledCount is 74
; 0 inlinees with PGO data; 5 single block inlinees; 1 inlinees without PGO data
G_M000_IG01: ;; offset=0x0000
sub rsp, 248
vxorps xmm4, xmm4, xmm4
vmovdqu32 zmmword ptr [rsp+0x80], zmm4
G_M000_IG02: ;; offset=0x0013
vmovups zmm0, zmmword ptr [reloc @RWD00]
vmovups zmm1, zmmword ptr [rdx]
mov eax, 100
jmp SHORT G_M000_IG04
align [0 bytes for IG05]
G_M000_IG03: ;; offset=0x002A
vmovups zmm1, zmmword ptr [rsp+0x80]
dec eax
je SHORT G_M000_IG08
G_M000_IG04: ;; offset=0x0036
vmovups zmmword ptr [rsp], zmm1
vmovups zmmword ptr [rsp+0x40], zmm0
xor edx, edx
jmp SHORT G_M000_IG07
G_M000_IG05: ;; offset=0x0049
lea r9, [rsp]
mov r8d, r8d
movzx r9, byte ptr [r9+r8]
G_M000_IG06: ;; offset=0x0055
lea r8, [rsp+0x80]
mov byte ptr [r8+r10], r9b
inc edx
cmp edx, 64
jge SHORT G_M000_IG03
G_M000_IG07: ;; offset=0x0068
lea r8, [rsp+0x40]
movsxd r10, edx
movzx r8, byte ptr [r8+r10]
xor r9d, r9d
cmp r8d, 64
jge SHORT G_M000_IG06
jmp SHORT G_M000_IG05
G_M000_IG08: ;; offset=0x0080
vmovups zmmword ptr [rcx], zmm1
mov rax, rcx
G_M000_IG09: ;; offset=0x0089
vzeroupper
add rsp, 248
ret
RWD00 dq 0102030405060708h, 090A0B0C0D0E0F00h, 1112131415161718h, 191A1B1C1D1E1F10h, 2122232425262728h, 292A2B2C2D2E2F20h, 3132333435363738h, 393A3B3C3D3E3F30h
; Total bytes of code 148
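For reference, a rough C# equivalent of what the scalar fallback loop above (IG05-IG07) is doing, with out-of-range indices producing zero, might be:

```csharp
using System;
using System.Runtime.Intrinsics;

// Sketch only: byte-by-byte emulation of Vector512<byte>.Shuffle semantics,
// matching the bounds check (index >= 64 yields zero) seen in the assembly.
static Vector512<byte> SoftwareShuffle(Vector512<byte> value, Vector512<byte> indices)
{
    Span<byte> result = stackalloc byte[64];
    for (int i = 0; i < 64; i++)
    {
        byte idx = indices.GetElement(i);
        result[i] = idx < 64 ? value.GetElement(idx) : (byte)0;
    }
    return Vector512.Create<byte>(result);
}
```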
This won't happen in practice for .NET provided you're correctly checking [...]. We treat [...]. So, outside of a user override, we only report acceleration on [...]
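A minimal sketch of the kind of guard being referred to, assuming the check in question is the usual IsHardwareAccelerated / IsSupported pattern:

```csharp
if (Vector512.IsHardwareAccelerated)
{
    // 512-bit vectorized path; only taken when the runtime reports full acceleration.
}
else if (Vector256.IsHardwareAccelerated)
{
    // 256-bit vectorized path.
}
else
{
    // Scalar fallback.
}
```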
Then why not let [...]? Otherwise, some hypothetical CPU with AVX512F+CD+VL+DQ+BW but without AVX512VBMI (but that is not "1st generation AVX512") is still considered supported for [...]
That's effectively what's happening already. Power users can override this, as they can for many other considerations. We similarly don't mark [...]
It's not really worth designing for unlikely hypotheticals. Such a world is outside the realm of what is being pushed for and explicitly documented in the design of AVX10, APX, X86S, and other features.
It already uses those where it can. When it can't, there's not really a way to emulate the functionality.
It is possible to emulate [...]
There are some fairly convoluted ways you can attempt to emulate it, which require significant complexity in ensuring all edge cases are handled correctly. This is particularly true if the [...]. It ends up being far more trouble than it's worth and doesn't really provide any substantial improvements, as such hardware is unlikely to exist. For the simplest case, you end up with at least having to: [...]
So you're left with this decently expensive 10+ instruction sequence to emulate a variable shuffle, one that has a decently expensive dependency chain and will saturate the ports in a hot loop.
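As a rough illustration (a hypothetical sketch, not the exact sequence described above), an emulation of a full 64-byte variable shuffle using only AVX512F+BW, under the assumption that every index is already in the range [0, 63], could look like:

```csharp
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;

// Hypothetical helper: emulate vpermb (full 64-byte variable shuffle) without AVX512_VBMI.
// Assumes all indices are in [0, 63]; out-of-range handling would add further work.
static Vector512<byte> EmulatePermuteVar64x8(Vector512<byte> value, Vector512<byte> indices)
{
    // Low nibble: position within a 128-bit lane (vpshufb only shuffles within lanes).
    Vector512<byte> withinLane = indices & Vector512.Create((byte)0x0F);
    // Bits 4-5: which of the four 128-bit source lanes the byte comes from.
    Vector512<byte> laneSelect = indices & Vector512.Create((byte)0x30);

    // Replicate each source 128-bit lane across the whole vector (vshufi64x2 with an immediate).
    Vector512<long> v = value.AsInt64();
    Vector512<byte> lane0 = Avx512F.Shuffle4x128(v, v, 0b00_00_00_00).AsByte();
    Vector512<byte> lane1 = Avx512F.Shuffle4x128(v, v, 0b01_01_01_01).AsByte();
    Vector512<byte> lane2 = Avx512F.Shuffle4x128(v, v, 0b10_10_10_10).AsByte();
    Vector512<byte> lane3 = Avx512F.Shuffle4x128(v, v, 0b11_11_11_11).AsByte();

    // Shuffle within lanes (vpshufb), then keep each result only where its lane was selected.
    Vector512<byte> result = Avx512BW.Shuffle(lane0, withinLane);
    result = Vector512.ConditionalSelect(Vector512.Equals(laneSelect, Vector512.Create((byte)0x10)),
                                         Avx512BW.Shuffle(lane1, withinLane), result);
    result = Vector512.ConditionalSelect(Vector512.Equals(laneSelect, Vector512.Create((byte)0x20)),
                                         Avx512BW.Shuffle(lane2, withinLane), result);
    result = Vector512.ConditionalSelect(Vector512.Equals(laneSelect, Vector512.Create((byte)0x30)),
                                         Avx512BW.Shuffle(lane3, withinLane), result);
    return result;
}
```

Even this simplified form needs four lane broadcasts, four vpshufb shuffles, three compares, three blends, and the index masking, i.e. well over ten instructions, consistent with the characterization above.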
Description
Code
Codegen
Note in M1, Vector512.Shuffle() generates the slower and less available (vpermb requires AVX512_VBMI) sequence of [...], but in M2, where s is inlined, or in M3, where the loop is removed, Vector512.Shuffle() generates the intended vpshufb.

As a side note, in the sequence [...] it should just use something like vmovdqu8 zmm1 {k1}{z}, zmm1 to save an instruction.

Configuration
.NET 10.0.100-preview.5.25224.4
Windows 11, x64
Regression?
Not known.
Analysis
Encountered while preparing #115069. Related #72793. Possibly related #76781.