Vector512.Shuffle does not produce optimal codegen in some cases #115078

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
tfenise opened this issue Apr 26, 2025 · 11 comments
Labels
area-CodeGen-coreclr (CLR JIT compiler in src/coreclr/src/jit and related components such as SuperPMI), tenet-performance (Performance related issue)
Milestone

Comments

@tfenise
Contributor

tfenise commented Apr 26, 2025

Description

Code

static Vector512<byte> M1(Vector512<byte> v)
{
    Vector512<byte> s = Vector512.Create(0x01020304_05060708, 0x090A0B0C_0D0E0F00, 0x11121314_15161718, 0x191A1B1C_1D1E1F10,
                                         0x21222324_25262728, 0x292A2B2C_2D2E2F20, 0x31323334_35363738, 0x393A3B3C_3D3E3F30).AsByte();
    Vector512<byte> l_v = v;
    for (int i = 0; i < 100; i++)
    {
        l_v = Vector512.Shuffle(l_v, s);
    }
    return l_v;
}

static Vector512<byte> M2(Vector512<byte> v)
{
    Vector512<byte> l_v = v;
    for (int i = 0; i < 100; i++)
    {
        l_v = Vector512.Shuffle(l_v, Vector512.Create(0x01020304_05060708, 0x090A0B0C_0D0E0F00, 0x11121314_15161718, 0x191A1B1C_1D1E1F10,
                                                      0x21222324_25262728, 0x292A2B2C_2D2E2F20, 0x31323334_35363738, 0x393A3B3C_3D3E3F30).AsByte());
    }
    return l_v;
}

static Vector512<byte> M3(Vector512<byte> v)
{
    Vector512<byte> s = Vector512.Create(0x01020304_05060708, 0x090A0B0C_0D0E0F00, 0x11121314_15161718, 0x191A1B1C_1D1E1F10,
                                         0x21222324_25262728, 0x292A2B2C_2D2E2F20, 0x31323334_35363738, 0x393A3B3C_3D3E3F30).AsByte();
    Vector512<byte> l_v = v;
    l_v = Vector512.Shuffle(l_v, s);
    return l_v;
}

Codegen

; Assembly listing for method ConsoleApp1.Program:M1(System.Runtime.Intrinsics.Vector512`1[ubyte]):System.Runtime.Intrinsics.Vector512`1[ubyte] (Tier1)
; Emitting BLENDED_CODE for X64 with AVX512 - Windows
; Tier1 code
; optimized code
; optimized using Synthesized PGO
; rsp based frame
; fully interruptible
; with Synthesized PGO: fgCalledCount is 69

G_M000_IG01:                ;; offset=0x0000

G_M000_IG02:                ;; offset=0x0000
       vmovups  zmm0, zmmword ptr [reloc @RWD00]
       vmovups  zmm1, zmmword ptr [rdx]
       mov      eax, 100
       align    [0 bytes for IG03]

G_M000_IG03:                ;; offset=0x0015
       vpermb   zmm1, zmm0, zmm1
       vpcmpub  k1, zmm0, zmmword ptr [reloc @RWD64], 1
       vpmovm2b zmm2, k1
       vpandd   zmm1, zmm2, zmm1
       dec      eax
       jne      SHORT G_M000_IG03

G_M000_IG04:                ;; offset=0x0036
       vmovups  zmmword ptr [rcx], zmm1
       mov      rax, rcx

G_M000_IG05:                ;; offset=0x003F
       vzeroupper
       ret

RWD00   dq      0102030405060708h, 090A0B0C0D0E0F00h, 1112131415161718h, 191A1B1C1D1E1F10h, 2122232425262728h, 292A2B2C2D2E2F20h, 3132333435363738h, 393A3B3C3D3E3F30h
RWD64   dq      4040404040404040h, 4040404040404040h, 4040404040404040h, 4040404040404040h, 4040404040404040h, 4040404040404040h, 4040404040404040h, 4040404040404040h

; Total bytes of code 67

; Assembly listing for method ConsoleApp1.Program:M2(System.Runtime.Intrinsics.Vector512`1[ubyte]):System.Runtime.Intrinsics.Vector512`1[ubyte] (Tier1)
; Emitting BLENDED_CODE for X64 with AVX512 - Windows
; Tier1 code
; optimized code
; optimized using Synthesized PGO
; rsp based frame
; fully interruptible
; with Synthesized PGO: fgCalledCount is 72

G_M000_IG01:                ;; offset=0x0000

G_M000_IG02:                ;; offset=0x0000
       vmovups  zmm0, zmmword ptr [rdx]
       vmovups  zmm1, zmmword ptr [reloc @RWD00]
       mov      eax, 100
       align    [0 bytes for IG03]

G_M000_IG03:                ;; offset=0x0015
       vpshufb  zmm0, zmm0, zmm1
       dec      eax
       jne      SHORT G_M000_IG03

G_M000_IG04:                ;; offset=0x001F
       vmovups  zmmword ptr [rcx], zmm0
       mov      rax, rcx

G_M000_IG05:                ;; offset=0x0028
       vzeroupper
       ret

RWD00   dq      0102030405060708h, 090A0B0C0D0E0F00h, 1112131415161718h, 191A1B1C1D1E1F10h, 2122232425262728h, 292A2B2C2D2E2F20h, 3132333435363738h, 393A3B3C3D3E3F30h

; Total bytes of code 44

; Assembly listing for method ConsoleApp1.Program:M3(System.Runtime.Intrinsics.Vector512`1[ubyte]):System.Runtime.Intrinsics.Vector512`1[ubyte] (Tier1)
; Emitting BLENDED_CODE for X64 with AVX512 - Windows
; Tier1 code
; optimized code
; optimized using Synthesized PGO
; rsp based frame
; partially interruptible
; with Synthesized PGO: fgCalledCount is 100
; No PGO data

G_M000_IG01:                ;; offset=0x0000

G_M000_IG02:                ;; offset=0x0000
       vmovups  zmm0, zmmword ptr [rdx]
       vpshufb  zmm0, zmm0, zmmword ptr [reloc @RWD00]
       vmovups  zmmword ptr [rcx], zmm0
       mov      rax, rcx

G_M000_IG03:                ;; offset=0x0019
       vzeroupper
       ret

RWD00   dq      0102030405060708h, 090A0B0C0D0E0F00h, 1112131415161718h, 191A1B1C1D1E1F10h, 2122232425262728h, 292A2B2C2D2E2F20h, 3132333435363738h, 393A3B3C3D3E3F30h

; Total bytes of code 29

Note that in M1, Vector512.Shuffle() generates the slower and less available (vpermb requires AVX512_VBMI) sequence of

       vpermb   zmm1, zmm0, zmm1
       vpcmpub  k1, zmm0, zmmword ptr [reloc @RWD64], 1
       vpmovm2b zmm2, k1
       vpandd   zmm1, zmm2, zmm1

but in M2 where s is inlined, or in M3 where the loop is removed, Vector512.Shuffle() generates the intended vpshufb.

As a side note, in the sequence

       vpmovm2b zmm2, k1
       vpandd   zmm1, zmm2, zmm1

it should just use something like vmovdqu8 zmm1{k1}{z}, zmm1 to save an instruction.

Configuration

.NET 10.0.100-preview.5.25224.4
Windows 11, x64

Regression?

Not known.

Analysis

Encountered while preparing #115069. Related #72793. Possibly related #76781.

@tfenise tfenise added the tenet-performance Performance related issue label Apr 26, 2025
@ghost ghost added the area-CodeGen-coreclr CLR JIT compiler in src/coreclr/src/jit and related components such as SuperPMI label Apr 26, 2025
@dotnet-policy-service dotnet-policy-service bot added the untriaged New issue has not been triaged by the area owner label Apr 26, 2025

Tagging subscribers to this area: @JulieLeeMSFT, @jakobbotsch
See info in area-owners.md if you want to be subscribed.

@huoyaoyuan
Member

Encountered while preparing #115069.

Note: you can use ShuffleNative to work around the JIT limitation if you can prove the behavior is correct for your indices.

@tfenise
Contributor Author

tfenise commented Apr 28, 2025

ShuffleNative:

static Vector512<byte> M1N(Vector512<byte> v)
{
    Vector512<byte> s = Vector512.Create(0x01020304_05060708, 0x090A0B0C_0D0E0F00, 0x11121314_15161718, 0x191A1B1C_1D1E1F10,
                                         0x21222324_25262728, 0x292A2B2C_2D2E2F20, 0x31323334_35363738, 0x393A3B3C_3D3E3F30).AsByte();
    Vector512<byte> l_v = v;
    for (int i = 0; i < 100; i++)
    {
        l_v = Vector512.ShuffleNative(l_v, s);
    }
    return l_v;
}
; Assembly listing for method ConsoleApp1.Program:M1N(System.Runtime.Intrinsics.Vector512`1[ubyte]):System.Runtime.Intrinsics.Vector512`1[ubyte] (Tier1)
; Emitting BLENDED_CODE for X64 with AVX512 - Windows
; Tier1 code
; optimized code
; optimized using Synthesized PGO
; rsp based frame
; fully interruptible
; with Synthesized PGO: fgCalledCount is 70

G_M000_IG01:                ;; offset=0x0000

G_M000_IG02:                ;; offset=0x0000
       vmovups  zmm0, zmmword ptr [reloc @RWD00]
       vmovups  zmm1, zmmword ptr [rdx]
       mov      eax, 100
       align    [0 bytes for IG03]

G_M000_IG03:                ;; offset=0x0015
       vpermb   zmm1, zmm0, zmm1
       dec      eax
       jne      SHORT G_M000_IG03

G_M000_IG04:                ;; offset=0x001F
       vmovups  zmmword ptr [rcx], zmm1
       mov      rax, rcx

G_M000_IG05:                ;; offset=0x0028
       vzeroupper
       ret

RWD00   dq      0102030405060708h, 090A0B0C0D0E0F00h, 1112131415161718h, 191A1B1C1D1E1F10h, 2122232425262728h, 292A2B2C2D2E2F20h, 3132333435363738h, 393A3B3C3D3E3F30h

; Total bytes of code 44

No more vpcmpub/vpmovm2b/vpandd, but it still uses vpermb rather than vpshufb, so ShuffleNative doesn't completely solve the problem.

====================================================================

As a side note, if the shuffle indices are created inside the loop but stored in a local,

static Vector512<byte> M2L(Vector512<byte> v)
{
    Vector512<byte> l_v = v;
    for (int i = 0; i < 100; i++)
    {
        Vector512<byte> s = Vector512.Create(0x01020304_05060708, 0x090A0B0C_0D0E0F00, 0x11121314_15161718, 0x191A1B1C_1D1E1F10,
                                             0x21222324_25262728, 0x292A2B2C_2D2E2F20, 0x31323334_35363738, 0x393A3B3C_3D3E3F30).AsByte();
        l_v = Vector512.Shuffle(l_v, s);
    }
    return l_v;
}
; Assembly listing for method ConsoleApp1.Program:M2L(System.Runtime.Intrinsics.Vector512`1[ubyte]):System.Runtime.Intrinsics.Vector512`1[ubyte] (Tier1)
; Emitting BLENDED_CODE for X64 with AVX512 - Windows
; Tier1 code
; optimized code
; optimized using Synthesized PGO
; rsp based frame
; fully interruptible
; with Synthesized PGO: fgCalledCount is 73

G_M000_IG01:                ;; offset=0x0000

G_M000_IG02:                ;; offset=0x0000
       vmovups  zmm0, zmmword ptr [rdx]
       mov      eax, 100
       align    [0 bytes for IG03]

G_M000_IG03:                ;; offset=0x000B
       vpshufb  zmm0, zmm0, zmmword ptr [reloc @RWD00]
       dec      eax
       jne      SHORT G_M000_IG03

G_M000_IG04:                ;; offset=0x0019
       vmovups  zmmword ptr [rcx], zmm0
       mov      rax, rcx

G_M000_IG05:                ;; offset=0x0022
       vzeroupper
       ret

RWD00   dq      0102030405060708h, 090A0B0C0D0E0F00h, 1112131415161718h, 191A1B1C1D1E1F10h, 2122232425262728h, 292A2B2C2D2E2F20h, 3132333435363738h, 393A3B3C3D3E3F30h

; Total bytes of code 38

It does use vpshufb, but it keeps reloading zmmword ptr [reloc @RWD00] in the loop.

@tannergooding
Member

tannergooding commented Apr 28, 2025

No more vpcmpub/vpmovm2b/vpandd, but it still uses vpermb rather than vpshufb, so ShuffleNative doesn't completely solve the problem.

vpermb gets used because the indices parameter isn't known and therefore it must assume that shuffling across the 4x128-bit lanes may occur. This is a 3 vs 1 cycle latency difference and is unlikely to have a significant impact on the overall performance of the algorithm or application.

You have a range of APIs available (from xplat APIs to hardware-specific intrinsics) based on your scenario and how critical the performance actually is. Typical scenarios benefit greatly from the 2-64x perf increase from using vectorization, and any minor performance degradation from ensuring deterministic results or handling edge cases may show up in a micro-benchmark but is unlikely to show up in the end-to-end profile of the real-world application. You're likely to lose far more performance simply by running on hardware without AVX512, or due to general machine latency caused by accessing memory, cache misses, branch mispredicts, etc.

However, if it is a scenario where that performance is important, then you have access to various *Native APIs that allow more perf in exchange for no longer being deterministic across hardware for certain edge cases. If even that isn't good enough, then you have access to the Isa.* methods that give fine-grained control over the exact instruction, but which require you to write more code to cover all the hardware you need to support. -- Mixing xplat and platform-specific APIs is also possible and works as expected, so you can write generally xplat code and still do opportunistic lightup for any perf-sensitive areas.
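
As a rough illustration of that mixing (a sketch of mine, not from this thread; ShuffleBytes is a hypothetical helper name), the portable call can remain the fallback while the platform-specific intrinsic is used where available:

using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;

static Vector512<byte> ShuffleBytes(Vector512<byte> value, Vector512<byte> indices)
{
    if (Avx512Vbmi.IsSupported)
    {
        // vpermb: full 64-byte variable shuffle; appropriate when the caller
        // guarantees the indices are in range, since no normalization is done here.
        return Avx512Vbmi.PermuteVar64x8(value, indices);
    }

    // Cross-platform path: the JIT emits the best sequence it can for the
    // current hardware, or a software fallback otherwise.
    return Vector512.Shuffle(value, indices);
}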

@tannergooding
Member

M1 is probably getting pessimized a bit because something is missing from optIsProfitableToSubstitute and the Shuffle API isn't being told to propagate the constant into the loop: https://github.com/dotnet/runtime/blob/main/src/coreclr/jit/assertionprop.cpp#L3081

I'd guess that ShouldConstantProp in particular is missing a check here.

@tfenise
Contributor Author

tfenise commented Apr 28, 2025

vpermb gets used because the indices parameter isn't known and therefore it must assume that shuffling across the 4x128-bit lanes may occur. This is a 3 vs 1 cycle latency difference and is unlikely to have a significant impact on the overall performance of the algorithm or application.

The worst case is when AVX512VBMI is unavailable; then the software fallback is used. This may happen on some CPUs (https://en.wikipedia.org/wiki/AVX-512#CPUs_with_AVX-512).

Details

First, set $env:DOTNET_EnableAVX512VBMI="0"

; Assembly listing for method ConsoleApp1.Program:M1N(System.Runtime.Intrinsics.Vector512`1[ubyte]):System.Runtime.Intrinsics.Vector512`1[ubyte] (Tier1)
; Emitting BLENDED_CODE for X64 with AVX512 - Windows
; Tier1 code
; optimized code
; optimized using Synthesized PGO
; rsp based frame
; fully interruptible
; with Synthesized PGO: fgCalledCount is 74
; 0 inlinees with PGO data; 5 single block inlinees; 1 inlinees without PGO data

G_M000_IG01:                ;; offset=0x0000
       sub      rsp, 248
       vxorps   xmm4, xmm4, xmm4
       vmovdqu32 zmmword ptr [rsp+0x80], zmm4

G_M000_IG02:                ;; offset=0x0013
       vmovups  zmm0, zmmword ptr [reloc @RWD00]
       vmovups  zmm1, zmmword ptr [rdx]
       mov      eax, 100
       jmp      SHORT G_M000_IG04
       align    [0 bytes for IG05]

G_M000_IG03:                ;; offset=0x002A
       vmovups  zmm1, zmmword ptr [rsp+0x80]
       dec      eax
       je       SHORT G_M000_IG08

G_M000_IG04:                ;; offset=0x0036
       vmovups  zmmword ptr [rsp], zmm1
       vmovups  zmmword ptr [rsp+0x40], zmm0
       xor      edx, edx
       jmp      SHORT G_M000_IG07

G_M000_IG05:                ;; offset=0x0049
       lea      r9, [rsp]
       mov      r8d, r8d
       movzx    r9, byte  ptr [r9+r8]

G_M000_IG06:                ;; offset=0x0055
       lea      r8, [rsp+0x80]
       mov      byte  ptr [r8+r10], r9b
       inc      edx
       cmp      edx, 64
       jge      SHORT G_M000_IG03

G_M000_IG07:                ;; offset=0x0068
       lea      r8, [rsp+0x40]
       movsxd   r10, edx
       movzx    r8, byte  ptr [r8+r10]
       xor      r9d, r9d
       cmp      r8d, 64
       jge      SHORT G_M000_IG06
       jmp      SHORT G_M000_IG05

G_M000_IG08:                ;; offset=0x0080
       vmovups  zmmword ptr [rcx], zmm1
       mov      rax, rcx

G_M000_IG09:                ;; offset=0x0089
       vzeroupper
       add      rsp, 248
       ret

RWD00   dq      0102030405060708h, 090A0B0C0D0E0F00h, 1112131415161718h, 191A1B1C1D1E1F10h, 2122232425262728h, 292A2B2C2D2E2F20h, 3132333435363738h, 393A3B3C3D3E3F30h

; Total bytes of code 148

@tannergooding
Member

This may happen on some CPUs (https://en.wikipedia.org/wiki/AVX-512#CPUs_with_AVX-512).

This won't happen in practice for .NET provided you're correctly checking Vector512.IsHardwareAccelerated.

We treat Skylake (Server), Cascade Lake, and Cannon Lake as "1st generation AVX512" hardware and so while Avx512F.IsSupported will report true, Vector512.IsHardwareAccelerated will report false. This is due to the extreme frequency throttling that exists with 512-bit vector usage on this hardware.

So, outside of a user override, we only report acceleration on Cannon Lake or later, all of which have VBMI.
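
As a minimal sketch of the guard being described (mine, not from this thread; the kernel bodies are elided):

using System;
using System.Runtime.Intrinsics;

static void Process(ReadOnlySpan<byte> data)
{
    if (Vector512.IsHardwareAccelerated && data.Length >= Vector512<byte>.Count)
    {
        // 512-bit path: only reached on hardware the runtime considers worth
        // running 512-bit code on, which per the comment above implies VBMI
        // in practice (absent a user override).
        // ... Vector512.Shuffle-based kernel ...
    }
    else
    {
        // Narrower-vector or scalar fallback for everything else.
        // ... Vector256/Vector128 or scalar kernel ...
    }
}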

@tfenise
Contributor Author

tfenise commented Apr 28, 2025

Then why not let Vector512.IsHardwareAccelerated report false if AVX512VBMI is unavailable?

Otherwise, if some hypothetical CPU with AVX512F+CD+VL+DQ+BW but without AVX512VBMI, and not "1st generation AVX512", is still considered supported for Vector512, I think the fallback of Shuffle(Vector512<Byte>, Vector512<Byte>) should be improved to use AVX512F+CD+VL+DQ+BW instructions, not just scalar code, similar to Shuffle(Vector256<Byte>, Vector256<Byte>) with AVX2 but without AVX512.

@tannergooding
Member

Then why not let Vector512.IsHardwareAccelerated report false if AVX512VBMI is unavailable?

That's effectively what's happening already. Power users can override this, as they can for many other considerations.

We similarly don't mark Vector128 as not being accelerated just because Ssse3 isn't available.

Otherwise, if some hypothetical CPU with AVX512F+CD+VL+DQ+BW but without AVX512VBMI but not "1st generation AVX512" is still considered supported for Vector512

It's not really worth designing for unlikely hypotheticals. Such a world is outside the realm of what is being pushed for and explicitly documented in the design of AVX10, APX, X86S, and other features.

I think the fallback of Shuffle(Vector512, Vector512) should be improved to use some AVX512F+CD+VL+DQ+BW instructions

It already uses those where it can. When it can't, then there's not really a way to emulate the functionality.

@tfenise
Contributor Author

tfenise commented Apr 28, 2025

It is possible to emulate Shuffle(Vector512<Byte>, Vector512<Byte>) with AVX512F+CD+VL+DQ+BW by first getting the lower byte of each word correct using vpermw and a variable shift, doing the same for the upper byte, and then combining the two, although I guess it would be considered "not really worth designing" for.

@tannergooding
Member

There are some fairly convoluted ways you can attempt to emulate it, which require significant complexity to ensure all edge cases are handled correctly. This is particularly true if the indices parameter is not a known constant.

It ends up being far more trouble than it's worth and doesn't really provide any substantial improvement, as such hardware is unlikely to exist. For the simplest case, you end up at least having to:

  • Split indexes to get the even/odd int8
  • Adjust from int8 index to int16 index
    • This includes ensuring the upper half of the int16 is clear
  • Do two int16 shuffles
  • Adjust each int16 part such that the bits you need are in the right position
    • This involves some kind of variable shift based on the non-adjusted byte index, as some lanes will require a left shift by 8, some a right shift by 8, and some no shift
  • Finally, merge them together selecting the even/odd bytes

So you're left with a decently expensive 10+ instruction sequence to emulate a variable shuffle, one that has a long dependency chain and will saturate the ports in a hot loop.
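
For what it's worth, here is a hedged C# sketch of that kind of sequence (mine, not from this thread; it assumes every index is already in [0, 63], i.e. ShuffleNative-like semantics, and has not been benchmarked or exhaustively tested):

using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;

// Emulate a 64-byte variable shuffle with AVX512BW only (no VBMI), assuming all
// indices are in [0, 63]. Roughly: two vpermw to fetch the word containing each
// wanted byte, two vpsrlvw to move the wanted byte into the low half, then merge.
static Vector512<byte> ShuffleBytesNoVbmi(Vector512<byte> value, Vector512<byte> indices)
{
    Vector512<ushort> src16   = value.AsUInt16();
    Vector512<ushort> idx16   = indices.AsUInt16();
    Vector512<ushort> lowByte = Vector512.Create((ushort)0x00FF);
    Vector512<ushort> one     = Vector512.Create((ushort)1);

    // Byte index feeding the low (even) and high (odd) byte of each 16-bit result lane.
    Vector512<ushort> evenIdx = idx16 & lowByte;
    Vector512<ushort> oddIdx  = Avx512BW.ShiftRightLogical(idx16, 8);

    // vpermw: fetch the 16-bit word that contains each requested byte (word index = byte index / 2).
    Vector512<ushort> evenWords = Avx512BW.PermuteVar32x16(src16, Avx512BW.ShiftRightLogical(evenIdx, 1));
    Vector512<ushort> oddWords  = Avx512BW.PermuteVar32x16(src16, Avx512BW.ShiftRightLogical(oddIdx, 1));

    // vpsrlvw: shift right by 8 when the byte index is odd, so the wanted byte lands in the low half.
    Vector512<ushort> evenBytes = Avx512BW.ShiftRightLogicalVariable(evenWords, Avx512BW.ShiftLeftLogical(evenIdx & one, 3)) & lowByte;
    Vector512<ushort> oddBytes  = Avx512BW.ShiftRightLogicalVariable(oddWords,  Avx512BW.ShiftLeftLogical(oddIdx & one, 3)) & lowByte;

    // Merge: even result bytes stay in the low halves, odd result bytes go into the high halves.
    return (evenBytes | Avx512BW.ShiftLeftLogical(oddBytes, 8)).AsByte();
}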

@JulieLeeMSFT JulieLeeMSFT removed the untriaged New issue has not been triaged by the area owner label May 6, 2025
@JulieLeeMSFT JulieLeeMSFT added this to the Future milestone May 29, 2025