Why is this shuffle inefficient? #2832

MkazemAkhgary · 2024-04-09T21:39:23Z

The following code:

// Reproducer for the shuffle codegen question.
// For each _i in 0..8 (inclusive), lanes whose `i` equals _i take the block
// data[programIndex + programCount * _i] and shuffle it with the per-lane
// index `j`.
// NOTE(review): `_i` is declared without `uniform`, so the `if` below is a
// per-lane condition rather than a uniform branch.
unmasked int32 Test(uniform int32* uniform data, int32 i, int32 j)
{
    // `a` is only assigned in lanes where `i` matches some _i in [0, 8];
    // in any other lane it is returned without ever being written.
    int32 a;
    for(int32 _i = 0; _i <= 8; _i++)
    {
        if(i == _i) a = shuffle(data[programIndex + programCount * _i], j);
    }
    return a;
}

compiles to a bunch of vpinsrd and vpextrd instructions instead of using a single vpermd:

# Compiler output for Test() (Intel syntax, 8 x 32-bit lanes per ymm register).
# Register roles as used below:
#   rdi  = data pointer (base of the masked load)
#   ymm0 = i (compared against the vector loop counter)
#   ymm1 = j on entry; its lanes are extracted to scalars, then ymm1 is reused
#          as the accumulator for the result
#   ymm4 = vector loop counter, ymm5 = mask of lanes still in the loop
#   ebx  = byte offset into data, advanced by 32 per iteration
# The shuffle is emulated by spilling the loaded vector to the stack and
# reloading each lane through a scalar index, rather than one permute.
Test___UM_un_3C_uni_3E_vyivyi:          # @Test___UM_un_3C_uni_3E_vyivyi
        push    rbp
        mov     rbp, rsp
        push    r14
        push    rbx
        # Carve out a 32-byte, 32-byte-aligned stack slot for the spilled vector.
        and     rsp, -32
        sub     rsp, 32
        # Pull the eight lanes of j out into scalar registers:
        # edx/eax/ecx/esi = lanes 0..3, r10d/r8d/r9d/r11d = lanes 4..7.
        vpextrd eax, xmm1, 1
        vpextrd ecx, xmm1, 2
        vmovd   edx, xmm1
        vpextrd esi, xmm1, 3
        vextracti128    xmm1, ymm1, 1
        vpextrd r8d, xmm1, 1
        vpextrd r9d, xmm1, 2
        vmovd   r10d, xmm1
        vpextrd r11d, xmm1, 3
        # ymm4 = loop counter (all lanes 0), ymm2 = all-ones (-1 per lane).
        vpxor   xmm4, xmm4, xmm4
        vpcmpeqd        ymm2, ymm2, ymm2
        xor     ebx, ebx
        vpbroadcastd    ymm3, dword ptr [rip + .LCPI1_0] # ymm3 = [7,7,7,7,7,7,7,7]
        # Wrap every shuffle index into 0..7 so it addresses a dword of the slot.
        and     edx, 7
        and     eax, 7
        and     ecx, 7
        and     esi, 7
        and     r10d, 7
        and     r8d, 7
        and     r9d, 7
        and     r11d, 7
        # ymm5 = all lanes active in the loop initially.
        vpcmpeqd        ymm5, ymm5, ymm5
        jmp     .LBB1_1
# Loop latch: advance the counter and decide whether any lane continues.
.LBB1_2:                                #   in Loop: Header=BB1_1 Depth=1
        # ymm6 = (counter <= 7, unsigned), computed as counter == min(counter, 7).
        vpminud ymm6, ymm4, ymm3
        vpcmpeqd        ymm6, ymm4, ymm6
        # counter -= -1, i.e. counter += 1.
        vpsubd  ymm4, ymm4, ymm2
        # Drop lanes whose pre-increment counter exceeded 7 from the active mask.
        vpand   ymm5, ymm6, ymm5
        vmovmskps       r14d, ymm5
        # Next 8-dword block of data (8 lanes * 4 bytes).
        add     ebx, 32
        test    r14d, r14d
        je      .LBB1_3
# Loop body: handle lanes where i equals the current counter value.
.LBB1_1:                                # =>This Inner Loop Header: Depth=1
        # ymm6 = active lanes with i == counter; skip the body if there are none.
        vpcmpeqd        ymm6, ymm4, ymm0
        vpand   ymm6, ymm6, ymm5
        vmovmskps       r14d, ymm6
        test    r14d, r14d
        je      .LBB1_2
        movsxd  r14, ebx
        # Masked load of the current block, using ymm6 as the lane mask.
        vmaskmovps      ymm7, ymm6, ymmword ptr [rdi + r14]
        # Spill the loaded vector so its lanes can be picked by scalar index.
        vmovaps ymmword ptr [rsp], ymm7
        # Rebuild result lanes 4..7 from the spilled vector (indices r10, r8, r9, r11).
        vmovd   xmm7, dword ptr [rsp + 4*r10]   # xmm7 = mem[0],zero,zero,zero
        vpinsrd xmm7, xmm7, dword ptr [rsp + 4*r8], 1
        vpinsrd xmm7, xmm7, dword ptr [rsp + 4*r9], 2
        vpinsrd xmm7, xmm7, dword ptr [rsp + 4*r11], 3
        # Rebuild result lanes 0..3 (indices rdx, rax, rcx, rsi).
        vmovd   xmm8, dword ptr [rsp + 4*rdx]   # xmm8 = mem[0],zero,zero,zero
        vpinsrd xmm8, xmm8, dword ptr [rsp + 4*rax], 1
        vpinsrd xmm8, xmm8, dword ptr [rsp + 4*rcx], 2
        vpinsrd xmm8, xmm8, dword ptr [rsp + 4*rsi], 3
        # ymm7 = [xmm8 (low half), xmm7 (high half)] = the shuffled vector.
        vinserti128     ymm7, ymm8, xmm7, 1
        # Merge the shuffled lanes into the accumulator only where ymm6 is set.
        vblendvps       ymm1, ymm1, ymm7, ymm6
        jmp     .LBB1_2
# Exit: return the accumulator in ymm0 and unwind the frame.
.LBB1_3:
        vmovaps ymm0, ymm1
        # rbp - 16 is where rbx was pushed (two 8-byte pushes below rbp).
        lea     rsp, [rbp - 16]
        pop     rbx
        pop     r14
        pop     rbp
        ret

The following code more closely resembles what I actually had:

// Variant closer to the reporter's real code: instead of a counted loop over
// 0..8, the body is run via foreach_unique over the values held in `i`.
// As in Test(), each pass shuffles the block
// data[programIndex + programCount * _i] with the per-lane index `j`.
unmasked int32 Test2(uniform int32 * uniform data, int32 i, int32 j)
{
    int32 a;
    foreach_unique(_i in i)
    {
        int32 b;
        // The load and shuffle are wrapped in an `unmasked` block; the result
        // is assigned to `a` outside that block.
        unmasked
        {
            b = shuffle(data[programIndex + programCount * _i], j);
        }
	a = b;
    }
    return a;
}

The text was updated successfully, but these errors were encountered:

pbrubaker added the Performance All issues related to performance/code generation label Apr 14, 2024

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

why this shuffle is inefficient? #2832

why this shuffle is inefficient? #2832

MkazemAkhgary commented Apr 9, 2024 •

edited

why this shuffle is inefficient? #2832

why this shuffle is inefficient? #2832

Comments

MkazemAkhgary commented Apr 9, 2024 • edited

MkazemAkhgary commented Apr 9, 2024 •

edited