Vector512.Shuffle does not produce optimal codegen in some cases #115078

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
tfenise opened this issue Apr 26, 2025 · 11 comments
Labels
area-CodeGen-coreclr (CLR JIT compiler in src/coreclr/src/jit and related components such as SuperPMI), tenet-performance (Performance related issue)
Milestone

Comments

@tfenise
Contributor

tfenise commented Apr 26, 2025

Description

Code

static Vector512<byte> M1(Vector512<byte> v)
{
    Vector512<byte> s = Vector512.Create(0x01020304_05060708, 0x090A0B0C_0D0E0F00, 0x11121314_15161718, 0x191A1B1C_1D1E1F10,
                                         0x21222324_25262728, 0x292A2B2C_2D2E2F20, 0x31323334_35363738, 0x393A3B3C_3D3E3F30).AsByte();
    Vector512<byte> l_v = v;
    for (int i = 0; i < 100; i++)
    {
        l_v = Vector512.Shuffle(l_v, s);
    }
    return l_v;
}

static Vector512<byte> M2(Vector512<byte> v)
{
    Vector512<byte> l_v = v;
    for (int i = 0; i < 100; i++)
    {
        l_v = Vector512.Shuffle(l_v, Vector512.Create(0x01020304_05060708, 0x090A0B0C_0D0E0F00, 0x11121314_15161718, 0x191A1B1C_1D1E1F10,
                                                      0x21222324_25262728, 0x292A2B2C_2D2E2F20, 0x31323334_35363738, 0x393A3B3C_3D3E3F30).AsByte());
    }
    return l_v;
}

static Vector512<byte> M3(Vector512<byte> v)
{
    Vector512<byte> s = Vector512.Create(0x01020304_05060708, 0x090A0B0C_0D0E0F00, 0x11121314_15161718, 0x191A1B1C_1D1E1F10,
                                         0x21222324_25262728, 0x292A2B2C_2D2E2F20, 0x31323334_35363738, 0x393A3B3C_3D3E3F30).AsByte();
    Vector512<byte> l_v = v;
    l_v = Vector512.Shuffle(l_v, s);
    return l_v;
}

Codegen

; Assembly listing for method ConsoleApp1.Program:M1(System.Runtime.Intrinsics.Vector512`1[ubyte]):System.Runtime.Intrinsics.Vector512`1[ubyte] (Tier1)
; Emitting BLENDED_CODE for X64 with AVX512 - Windows
; Tier1 code
; optimized code
; optimized using Synthesized PGO
; rsp based frame
; fully interruptible
; with Synthesized PGO: fgCalledCount is 69

G_M000_IG01:                ;; offset=0x0000

G_M000_IG02:                ;; offset=0x0000
       vmovups  zmm0, zmmword ptr [reloc @RWD00]
       vmovups  zmm1, zmmword ptr [rdx]
       mov      eax, 100
       align    [0 bytes for IG03]

G_M000_IG03:                ;; offset=0x0015
       vpermb   zmm1, zmm0, zmm1
       vpcmpub  k1, zmm0, zmmword ptr [reloc @RWD64], 1
       vpmovm2b zmm2, k1
       vpandd   zmm1, zmm2, zmm1
       dec      eax
       jne      SHORT G_M000_IG03

G_M000_IG04:                ;; offset=0x0036
       vmovups  zmmword ptr [rcx], zmm1
       mov      rax, rcx

G_M000_IG05:                ;; offset=0x003F
       vzeroupper
       ret

RWD00   dq      0102030405060708h, 090A0B0C0D0E0F00h, 1112131415161718h, 191A1B1C1D1E1F10h, 2122232425262728h, 292A2B2C2D2E2F20h, 3132333435363738h, 393A3B3C3D3E3F30h
RWD64   dq      4040404040404040h, 4040404040404040h, 4040404040404040h, 4040404040404040h, 4040404040404040h, 4040404040404040h, 4040404040404040h, 4040404040404040h

; Total bytes of code 67

; Assembly listing for method ConsoleApp1.Program:M2(System.Runtime.Intrinsics.Vector512`1[ubyte]):System.Runtime.Intrinsics.Vector512`1[ubyte] (Tier1)
; Emitting BLENDED_CODE for X64 with AVX512 - Windows
; Tier1 code
; optimized code
; optimized using Synthesized PGO
; rsp based frame
; fully interruptible
; with Synthesized PGO: fgCalledCount is 72

G_M000_IG01:                ;; offset=0x0000

G_M000_IG02:                ;; offset=0x0000
       vmovups  zmm0, zmmword ptr [rdx]
       vmovups  zmm1, zmmword ptr [reloc @RWD00]
       mov      eax, 100
       align    [0 bytes for IG03]

G_M000_IG03:                ;; offset=0x0015
       vpshufb  zmm0, zmm0, zmm1
       dec      eax
       jne      SHORT G_M000_IG03

G_M000_IG04:                ;; offset=0x001F
       vmovups  zmmword ptr [rcx], zmm0
       mov      rax, rcx

G_M000_IG05:                ;; offset=0x0028
       vzeroupper
       ret

RWD00   dq      0102030405060708h, 090A0B0C0D0E0F00h, 1112131415161718h, 191A1B1C1D1E1F10h, 2122232425262728h, 292A2B2C2D2E2F20h, 3132333435363738h, 393A3B3C3D3E3F30h

; Total bytes of code 44

; Assembly listing for method ConsoleApp1.Program:M3(System.Runtime.Intrinsics.Vector512`1[ubyte]):System.Runtime.Intrinsics.Vector512`1[ubyte] (Tier1)
; Emitting BLENDED_CODE for X64 with AVX512 - Windows
; Tier1 code
; optimized code
; optimized using Synthesized PGO
; rsp based frame
; partially interruptible
; with Synthesized PGO: fgCalledCount is 100
; No PGO data

G_M000_IG01:                ;; offset=0x0000

G_M000_IG02:                ;; offset=0x0000
       vmovups  zmm0, zmmword ptr [rdx]
       vpshufb  zmm0, zmm0, zmmword ptr [reloc @RWD00]
       vmovups  zmmword ptr [rcx], zmm0
       mov      rax, rcx

G_M000_IG03:                ;; offset=0x0019
       vzeroupper
       ret

RWD00   dq      0102030405060708h, 090A0B0C0D0E0F00h, 1112131415161718h, 191A1B1C1D1E1F10h, 2122232425262728h, 292A2B2C2D2E2F20h, 3132333435363738h, 393A3B3C3D3E3F30h

; Total bytes of code 29

Note that in M1, Vector512.Shuffle() generates the slower and less available (vpermb requires AVX512_VBMI) sequence of

       vpermb   zmm1, zmm0, zmm1
       vpcmpub  k1, zmm0, zmmword ptr [reloc @RWD64], 1
       vpmovm2b zmm2, k1
       vpandd   zmm1, zmm2, zmm1

but in M2 where s is inlined, or in M3 where the loop is removed, Vector512.Shuffle() generates the intended vpshufb.

As a side note, in the sequence

       vpmovm2b zmm2, k1
       vpandd   zmm1, zmm2, zmm1

it should just use something like vmovdqu8 zmm1{k1}{z}, zmm1 to save an instruction.

Configuration

.NET 10.0.100-preview.5.25224.4
Windows 11, x64

Regression?

Not known.

Analysis

Encountered while preparing #115069. Related #72793. Possibly related #76781.

@tfenise tfenise added the tenet-performance Performance related issue label Apr 26, 2025
@ghost ghost added the area-CodeGen-coreclr CLR JIT compiler in src/coreclr/src/jit and related components such as SuperPMI label Apr 26, 2025
@dotnet-policy-service dotnet-policy-service bot added the untriaged New issue has not been triaged by the area owner label Apr 26, 2025

Tagging subscribers to this area: @JulieLeeMSFT, @jakobbotsch
See info in area-owners.md if you want to be subscribed.

@huoyaoyuan
Member

Encountered while preparing #115069.

Note: you can use ShuffleNative to work around the JIT limitation if you can prove the behavior is correct for your indices.

@tfenise
Contributor Author

tfenise commented Apr 28, 2025

ShuffleNative:

static Vector512<byte> M1N(Vector512<byte> v)
{
    Vector512<byte> s = Vector512.Create(0x01020304_05060708, 0x090A0B0C_0D0E0F00, 0x11121314_15161718, 0x191A1B1C_1D1E1F10,
                                         0x21222324_25262728, 0x292A2B2C_2D2E2F20, 0x31323334_35363738, 0x393A3B3C_3D3E3F30).AsByte();
    Vector512<byte> l_v = v;
    for (int i = 0; i < 100; i++)
    {
        l_v = Vector512.ShuffleNative(l_v, s);
    }
    return l_v;
}
; Assembly listing for method ConsoleApp1.Program:M1N(System.Runtime.Intrinsics.Vector512`1[ubyte]):System.Runtime.Intrinsics.Vector512`1[ubyte] (Tier1)
; Emitting BLENDED_CODE for X64 with AVX512 - Windows
; Tier1 code
; optimized code
; optimized using Synthesized PGO
; rsp based frame
; fully interruptible
; with Synthesized PGO: fgCalledCount is 70

G_M000_IG01:                ;; offset=0x0000

G_M000_IG02:                ;; offset=0x0000
       vmovups  zmm0, zmmword ptr [reloc @RWD00]
       vmovups  zmm1, zmmword ptr [rdx]
       mov      eax, 100
       align    [0 bytes for IG03]

G_M000_IG03:                ;; offset=0x0015
       vpermb   zmm1, zmm0, zmm1
       dec      eax
       jne      SHORT G_M000_IG03

G_M000_IG04:                ;; offset=0x001F
       vmovups  zmmword ptr [rcx], zmm1
       mov      rax, rcx

G_M000_IG05:                ;; offset=0x0028
       vzeroupper
       ret

RWD00   dq      0102030405060708h, 090A0B0C0D0E0F00h, 1112131415161718h, 191A1B1C1D1E1F10h, 2122232425262728h, 292A2B2C2D2E2F20h, 3132333435363738h, 393A3B3C3D3E3F30h

; Total bytes of code 44

No more vpcmpub/vpmovm2b/vpandd, but it still uses vpermb rather than vpshufb, so ShuffleNative doesn't completely solve the problem.

====================================================================

As a side note, if the shuffle indices are created inside the loop but stored in a local,

static Vector512<byte> M2L(Vector512<byte> v)
{
    Vector512<byte> l_v = v;
    for (int i = 0; i < 100; i++)
    {
        Vector512<byte> s = Vector512.Create(0x01020304_05060708, 0x090A0B0C_0D0E0F00, 0x11121314_15161718, 0x191A1B1C_1D1E1F10,
                                             0x21222324_25262728, 0x292A2B2C_2D2E2F20, 0x31323334_35363738, 0x393A3B3C_3D3E3F30).AsByte();
        l_v = Vector512.Shuffle(l_v, s);
    }
    return l_v;
}
; Assembly listing for method ConsoleApp1.Program:M2L(System.Runtime.Intrinsics.Vector512`1[ubyte]):System.Runtime.Intrinsics.Vector512`1[ubyte] (Tier1)
; Emitting BLENDED_CODE for X64 with AVX512 - Windows
; Tier1 code
; optimized code
; optimized using Synthesized PGO
; rsp based frame
; fully interruptible
; with Synthesized PGO: fgCalledCount is 73

G_M000_IG01:                ;; offset=0x0000

G_M000_IG02:                ;; offset=0x0000
       vmovups  zmm0, zmmword ptr [rdx]
       mov      eax, 100
       align    [0 bytes for IG03]

G_M000_IG03:                ;; offset=0x000B
       vpshufb  zmm0, zmm0, zmmword ptr [reloc @RWD00]
       dec      eax
       jne      SHORT G_M000_IG03

G_M000_IG04:                ;; offset=0x0019
       vmovups  zmmword ptr [rcx], zmm0
       mov      rax, rcx

G_M000_IG05:                ;; offset=0x0022
       vzeroupper
       ret

RWD00   dq      0102030405060708h, 090A0B0C0D0E0F00h, 1112131415161718h, 191A1B1C1D1E1F10h, 2122232425262728h, 292A2B2C2D2E2F20h, 3132333435363738h, 393A3B3C3D3E3F30h

; Total bytes of code 38

It does use vpshufb, but it keeps reloading zmmword ptr [reloc @RWD00] in the loop.

@tannergooding
Member

tannergooding commented Apr 28, 2025

No more vpcmpub/vpmovm2b/vpandd, but it still uses vpermb rather than vpshufb, so ShuffleNative doesn't completely solve the problem.

vpermb gets used because the indices parameter isn't known and therefore it must assume that shuffling across the 4x128-bit lanes may occur. This is a 3 vs 1 cycle latency difference and is unlikely to have a significant impact on the overall performance of the algorithm or application.

You have a range of APIs available (from xplat APIs to hardware-specific intrinsics) based on your scenario and how critical the performance actually is. Typical scenarios benefit greatly from the 2-64x perf increase from using vectorization, and any minor performance degradation from ensuring deterministic results or handling edge cases may show up in a micro-benchmark but is unlikely to show up in the end-to-end profile of the real-world application. You're likely to lose far more performance simply by running on hardware without AVX512, or due to general machine latency caused by accessing memory, cache misses, branch mispredicts, etc.

However, if it is a scenario where that performance is important, then you have access to various *Native APIs that allow more perf in exchange for no longer being deterministic across hardware for certain edge cases. If even that isn't good enough, then you have access to the Isa.* methods that give fine-grained control over the exact instruction, but which require you to write more code to cover all the hardware you need to support. -- Mixing xplat and platform-specific APIs is also possible and works as expected, so you can write generally xplat code and still do opportunistic lightup for any perf-sensitive areas.
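
As a rough illustration of that mixing (a sketch of mine, not from this thread; ShuffleBytes is a hypothetical helper name), the portable call can remain the fallback while the platform-specific intrinsic is used where available:

using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;

static Vector512<byte> ShuffleBytes(Vector512<byte> value, Vector512<byte> indices)
{
    if (Avx512Vbmi.IsSupported)
    {
        // vpermb: full 64-byte variable shuffle; appropriate when the caller
        // guarantees the indices are in range, since no normalization is done here.
        return Avx512Vbmi.PermuteVar64x8(value, indices);
    }

    // Cross-platform path: the JIT emits the best sequence it can for the
    // current hardware, or a software fallback otherwise.
    return Vector512.Shuffle(value, indices);
}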

@tannergooding
Member

M1 is probably getting pessimized a bit because something is missing from optIsProfitableToSubstitute and the Shuffle API isn't being told to propagate the constant into the loop: https://github.com/dotnet/runtime/blob/main/src/coreclr/jit/assertionprop.cpp#L3081

I'd guess that ShouldConstantProp in particular is missing a check here.

@tfenise
Contributor Author

tfenise commented Apr 28, 2025

vpermb gets used because the indices parameter isn't known and therefore it must assume that shuffling across the 4x128-bit lanes may occur. This is a 3 vs 1 cycle latency difference and is unlikely to have a significant impact on the overall performance of the algorithm or application.

The worst case is when AVX512VBMI is unavailable; then the software fallback is used. This may happen on some CPUs (https://en.wikipedia.org/wiki/AVX-512#CPUs_with_AVX-512).

Details

First, set $env:DOTNET_EnableAVX512VBMI="0"

; Assembly listing for method ConsoleApp1.Program:M1N(System.Runtime.Intrinsics.Vector512`1[ubyte]):System.Runtime.Intrinsics.Vector512`1[ubyte] (Tier1)
; Emitting BLENDED_CODE for X64 with AVX512 - Windows
; Tier1 code
; optimized code
; optimized using Synthesized PGO
; rsp based frame
; fully interruptible
; with Synthesized PGO: fgCalledCount is 74
; 0 inlinees with PGO data; 5 single block inlinees; 1 inlinees without PGO data

G_M000_IG01:                ;; offset=0x0000
       sub      rsp, 248
       vxorps   xmm4, xmm4, xmm4
       vmovdqu32 zmmword ptr [rsp+0x80], zmm4

G_M000_IG02:                ;; offset=0x0013
       vmovups  zmm0, zmmword ptr [reloc @RWD00]
       vmovups  zmm1, zmmword ptr [rdx]
       mov      eax, 100
       jmp      SHORT G_M000_IG04
       align    [0 bytes for IG05]

G_M000_IG03:                ;; offset=0x002A
       vmovups  zmm1, zmmword ptr [rsp+0x80]
       dec      eax
       je       SHORT G_M000_IG08

G_M000_IG04:                ;; offset=0x0036
       vmovups  zmmword ptr [rsp], zmm1
       vmovups  zmmword ptr [rsp+0x40], zmm0
       xor      edx, edx
       jmp      SHORT G_M000_IG07

G_M000_IG05:                ;; offset=0x0049
       lea      r9, [rsp]
       mov      r8d, r8d
       movzx    r9, byte  ptr [r9+r8]

G_M000_IG06:                ;; offset=0x0055
       lea      r8, [rsp+0x80]
       mov      byte  ptr [r8+r10], r9b
       inc      edx
       cmp      edx, 64
       jge      SHORT G_M000_IG03

G_M000_IG07:                ;; offset=0x0068
       lea      r8, [rsp+0x40]
       movsxd   r10, edx
       movzx    r8, byte  ptr [r8+r10]
       xor      r9d, r9d
       cmp      r8d, 64
       jge      SHORT G_M000_IG06
       jmp      SHORT G_M000_IG05

G_M000_IG08:                ;; offset=0x0080
       vmovups  zmmword ptr [rcx], zmm1
       mov      rax, rcx

G_M000_IG09:                ;; offset=0x0089
       vzeroupper
       add      rsp, 248
       ret

RWD00   dq      0102030405060708h, 090A0B0C0D0E0F00h, 1112131415161718h, 191A1B1C1D1E1F10h, 2122232425262728h, 292A2B2C2D2E2F20h, 3132333435363738h, 393A3B3C3D3E3F30h

; Total bytes of code 148

@tannergooding
Member

This may happen on some CPUs (https://en.wikipedia.org/wiki/AVX-512#CPUs_with_AVX-512).

This won't happen in practice for .NET provided you're correctly checking Vector512.IsHardwareAccelerated.

We treat Skylake (Server), Cascade Lake, and Cannon Lake as "1st generation AVX512" hardware and so while Avx512F.IsSupported will report true, Vector512.IsHardwareAccelerated will report false. This is due to the extreme frequency throttling that exists with 512-bit vector usage on this hardware.

So, outside of a user override, we only report acceleration on Cannon Lake or later, all of which have VBMI.
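
As a minimal sketch of the guard being described (mine, not from this thread; the kernel bodies are elided):

using System;
using System.Runtime.Intrinsics;

static void Process(ReadOnlySpan<byte> data)
{
    if (Vector512.IsHardwareAccelerated && data.Length >= Vector512<byte>.Count)
    {
        // 512-bit path: only reached on hardware the runtime considers worth
        // running 512-bit code on, which per the comment above implies VBMI
        // in practice (absent a user override).
        // ... Vector512.Shuffle-based kernel ...
    }
    else
    {
        // Narrower-vector or scalar fallback for everything else.
        // ... Vector256/Vector128 or scalar kernel ...
    }
}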

@tfenise
Contributor Author

tfenise commented Apr 28, 2025

Then why not let Vector512.IsHardwareAccelerated report false if AVX512VBMI is unavailable?

Otherwise, if some hypothetical CPU with AVX512F+CD+VL+DQ+BW but without AVX512VBMI, and not "1st generation AVX512", is still considered supported for Vector512, I think the fallback of Shuffle(Vector512<Byte>, Vector512<Byte>) should be improved to use AVX512F+CD+VL+DQ+BW instructions, not just scalar code, similar to Shuffle(Vector256<Byte>, Vector256<Byte>) with AVX2 but without AVX512.

@tannergooding
Member

Then why not let Vector512.IsHardwareAccelerated report false if AVX512VBMI is unavailable?

That's effectively what's happening already. Power users can override this, as they can for many other considerations.

We similarly don't mark Vector128 as not being accelerated just because Ssse3 isn't available.

Otherwise, if some hypothetical CPU with AVX512F+CD+VL+DQ+BW but without AVX512VBMI but not "1st generation AVX512" is still considered supported for Vector512

It's not really worth designing for unlikely hypotheticals. Such a world is outside the realm of what is being pushed for and explicitly documented in the design of AVX10, APX, X86S, and other features.

I think the fallback of Shuffle(Vector512, Vector512) should be improved to use some AVX512F+CD+VL+DQ+BW instructions

It already uses those where it can. When it can't, then there's not really a way to emulate the functionality.

@tfenise
Contributor Author

tfenise commented Apr 28, 2025

It is possible to emulate Shuffle(Vector512<Byte>, Vector512<Byte>) with AVX512F+CD+VL+DQ+BW by first getting the lower byte of each word correct using vpermw and a variable shift, doing the same for the upper byte, and then combining the two, although I guess it would be considered "not really worth designing" for.

@tannergooding
Member

There are some fairly convoluted ways you can attempt to emulate it, which require significant complexity to ensure all edge cases are handled correctly. This is particularly true if the indices parameter is not a known constant.

It ends up being far more trouble than it's worth and doesn't really provide any substantial improvement, as such hardware is unlikely to exist. For the simplest case, you end up at least having to:

  • Split indexes to get the even/odd int8
  • Adjust from int8 index to int16 index
    • This includes ensuring the upper half of the int16 is clear
  • Do two int16 shuffles
  • Adjust each int16 part such that the bits you need are in the right position
    • This involves some kind of variable shift based on the non-adjusted byte index, as some lanes will require a left shift by 8, some a right shift by 8, and some no shift
  • Finally, merge them together selecting the even/odd bytes

So you're left with a decently expensive 10+ instruction sequence to emulate a variable shuffle, one that has a long dependency chain and will saturate the ports in a hot loop.
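
For what it's worth, here is a hedged C# sketch of that kind of sequence (mine, not from this thread; it assumes every index is already in [0, 63], i.e. ShuffleNative-like semantics, and has not been benchmarked or exhaustively tested):

using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;

// Emulate a 64-byte variable shuffle with AVX512BW only (no VBMI), assuming all
// indices are in [0, 63]. Roughly: two vpermw to fetch the word containing each
// wanted byte, two vpsrlvw to move the wanted byte into the low half, then merge.
static Vector512<byte> ShuffleBytesNoVbmi(Vector512<byte> value, Vector512<byte> indices)
{
    Vector512<ushort> src16   = value.AsUInt16();
    Vector512<ushort> idx16   = indices.AsUInt16();
    Vector512<ushort> lowByte = Vector512.Create((ushort)0x00FF);
    Vector512<ushort> one     = Vector512.Create((ushort)1);

    // Byte index feeding the low (even) and high (odd) byte of each 16-bit result lane.
    Vector512<ushort> evenIdx = idx16 & lowByte;
    Vector512<ushort> oddIdx  = Avx512BW.ShiftRightLogical(idx16, 8);

    // vpermw: fetch the 16-bit word that contains each requested byte (word index = byte index / 2).
    Vector512<ushort> evenWords = Avx512BW.PermuteVar32x16(src16, Avx512BW.ShiftRightLogical(evenIdx, 1));
    Vector512<ushort> oddWords  = Avx512BW.PermuteVar32x16(src16, Avx512BW.ShiftRightLogical(oddIdx, 1));

    // vpsrlvw: shift right by 8 when the byte index is odd, so the wanted byte lands in the low half.
    Vector512<ushort> evenBytes = Avx512BW.ShiftRightLogicalVariable(evenWords, Avx512BW.ShiftLeftLogical(evenIdx & one, 3)) & lowByte;
    Vector512<ushort> oddBytes  = Avx512BW.ShiftRightLogicalVariable(oddWords,  Avx512BW.ShiftLeftLogical(oddIdx & one, 3)) & lowByte;

    // Merge: even result bytes stay in the low halves, odd result bytes go into the high halves.
    return (evenBytes | Avx512BW.ShiftLeftLogical(oddBytes, 8)).AsByte();
}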

@JulieLeeMSFT JulieLeeMSFT removed the untriaged New issue has not been triaged by the area owner label May 6, 2025
@JulieLeeMSFT JulieLeeMSFT added this to the Future milestone May 29, 2025