Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

why this shuffle is inefficient? #2832

Open
MkazemAkhgary opened this issue Apr 9, 2024 · 0 comments
Open

why this shuffle is inefficient? #2832

MkazemAkhgary opened this issue Apr 9, 2024 · 0 comments
Labels
Performance All issues related to performance/code generation

Comments

@MkazemAkhgary
Copy link

MkazemAkhgary commented Apr 9, 2024

The following code:

// Reproducer: for each row index _i in 0..8 (inclusive, so 9 iterations), lanes
// whose `i` equals _i receive a shuffled copy of that row of `data`.
//   data : uniform pointer; row _i is read at data[programIndex + programCount * _i]
//   i    : per-lane row selector, compared against _i
//   j    : per-lane index passed as the second argument of shuffle()
// Returns `a`. Note that `a` is only assigned inside the `if`, so a lane whose
// `i` matches no _i in 0..8 returns a value that was never assigned.
unmasked int32 Test(uniform int32* uniform data, int32 i, int32 j)
{
    int32 a;
    for(int32 _i = 0; _i <= 8; _i++)
    {
        // The row load does not depend on j; only the shuffle applies j.
        if(i == _i) a = shuffle(data[programIndex + programCount * _i], j);
    }
    return a;
}

compiles to a bunch of vpinsrd and vpextrd instructions instead of using a single vpermd.

# Compiler output for the reproducer (x86-64, Intel syntax, AVX2, 8 x 32-bit lanes).
# Register roles as used in this listing:
#   rdi   base address of the 32-byte row loads (presumably the `data` argument)
#   ymm0  compared against the loop counter every iteration (presumably `i`)
#   ymm1  on entry: source of the eight shuffle indices (presumably `j`);
#         after the indices are extracted it is reused as the result accumulator
#   ymm2  all-ones (-1 per lane), subtracted from the counter to increment it
#   ymm3  7 in every lane (upper bound tested at .LBB1_2)
#   ymm4  loop counter, same value in every lane, starts at 0
#   ymm5  mask of lanes that are still iterating
#   ebx   byte offset of the current row, advanced by 32 per iteration
#   eax, ecx, edx, esi, r8d-r11d  the eight shuffle indices, each masked to 0..7
#   [rsp] 32-byte aligned scratch slot that holds the loaded row
Test___UM_un_3C_uni_3E_vyivyi:          # @Test___UM_un_3C_uni_3E_vyivyi
        push    rbp
        mov     rbp, rsp
        push    r14
        push    rbx
        # Align the stack to 32 bytes and reserve one 32-byte slot; it is written
        # with vmovaps (aligned store) inside the loop.
        and     rsp, -32
        sub     rsp, 32
        # Move every lane of ymm1 into a general-purpose register:
        # lanes 0..3 -> edx, eax, ecx, esi ; lanes 4..7 -> r10d, r8d, r9d, r11d.
        vpextrd eax, xmm1, 1
        vpextrd ecx, xmm1, 2
        vmovd   edx, xmm1
        vpextrd esi, xmm1, 3
        vextracti128    xmm1, ymm1, 1
        vpextrd r8d, xmm1, 1
        vpextrd r9d, xmm1, 2
        vmovd   r10d, xmm1
        vpextrd r11d, xmm1, 3
        # Loop state: counter = 0, ymm2 = -1, byte offset = 0.
        vpxor   xmm4, xmm4, xmm4
        vpcmpeqd        ymm2, ymm2, ymm2
        xor     ebx, ebx
        vpbroadcastd    ymm3, dword ptr [rip + .LCPI1_0] # ymm3 = [7,7,7,7,7,7,7,7]
        # Keep only the low 3 bits of each index so that [rsp + 4*index] below
        # always addresses one of the 8 dwords in the scratch slot.
        and     edx, 7
        and     eax, 7
        and     ecx, 7
        and     esi, 7
        and     r10d, 7
        and     r8d, 7
        and     r9d, 7
        and     r11d, 7
        # All lanes start active.
        vpcmpeqd        ymm5, ymm5, ymm5
        jmp     .LBB1_1
# Loop latch: decide whether another iteration runs, then step counter and offset.
.LBB1_2:                                #   in Loop: Header=BB1_1 Depth=1
        # min(counter, 7) == counter  <=>  counter <= 7 (unsigned), tested on the
        # value before the increment, so the body runs for counter = 0..8.
        vpminud ymm6, ymm4, ymm3
        vpcmpeqd        ymm6, ymm4, ymm6
        # counter - (-1) = counter + 1
        vpsubd  ymm4, ymm4, ymm2
        vpand   ymm5, ymm6, ymm5
        vmovmskps       r14d, ymm5
        add     ebx, 32
        # Leave the loop once no lane is active any more.
        test    r14d, r14d
        je      .LBB1_3
# Loop body: handle the lanes whose ymm0 value equals the current counter.
.LBB1_1:                                # =>This Inner Loop Header: Depth=1
        # ymm6 = (counter == ymm0) restricted to the active lanes.
        vpcmpeqd        ymm6, ymm4, ymm0
        vpand   ymm6, ymm6, ymm5
        vmovmskps       r14d, ymm6
        # No matching lane: skip the load and the shuffle entirely.
        test    r14d, r14d
        je      .LBB1_2
        # Masked load of the current row (only the matching lanes are read),
        # then spill it so single elements can be fetched by scalar index.
        movsxd  r14, ebx
        vmaskmovps      ymm7, ymm6, ymmword ptr [rdi + r14]
        vmovaps ymmword ptr [rsp], ymm7
        # Rebuild result lanes 4..7 one dword at a time from the spilled row.
        vmovd   xmm7, dword ptr [rsp + 4*r10]   # xmm7 = mem[0],zero,zero,zero
        vpinsrd xmm7, xmm7, dword ptr [rsp + 4*r8], 1
        vpinsrd xmm7, xmm7, dword ptr [rsp + 4*r9], 2
        vpinsrd xmm7, xmm7, dword ptr [rsp + 4*r11], 3
        # Rebuild result lanes 0..3 the same way.
        vmovd   xmm8, dword ptr [rsp + 4*rdx]   # xmm8 = mem[0],zero,zero,zero
        vpinsrd xmm8, xmm8, dword ptr [rsp + 4*rax], 1
        vpinsrd xmm8, xmm8, dword ptr [rsp + 4*rcx], 2
        vpinsrd xmm8, xmm8, dword ptr [rsp + 4*rsi], 3
        # Join both halves; matching lanes take the new value, all other lanes
        # keep what ymm1 already held.
        vinserti128     ymm7, ymm8, xmm7, 1
        vblendvps       ymm1, ymm1, ymm7, ymm6
        jmp     .LBB1_2
# Exit: return the accumulator in ymm0 and unwind the frame.
.LBB1_3:
        vmovaps ymm0, ymm1
        # rbp - 16 is where rbx was pushed (two pushes after `mov rbp, rsp`);
        # this discards the aligned scratch area.
        lea     rsp, [rbp - 16]
        pop     rbx
        pop     r14
        pop     rbp
        ret

The following code more closely resembles what I actually had:

// Variant of the reproducer that iterates over the values present in `i`
// with foreach_unique instead of a fixed counted loop.
//   data : uniform pointer; a row is read at data[programIndex + programCount * _i]
//   i    : per-lane row selector that foreach_unique iterates over
//   j    : per-lane index passed as the second argument of shuffle()
// Returns `a`, which is only assigned inside the foreach_unique body.
unmasked int32 Test2(uniform int32 * uniform data, int32 i, int32 j)
{
    int32 a;
    foreach_unique(_i in i)
    {
        int32 b;
        // NOTE(review): per the ISPC docs an `unmasked` block runs with all lanes
        // enabled, so the load and shuffle here presumably ignore the
        // foreach_unique lane mask. Confirm against the language reference.
        unmasked
        {
            b = shuffle(data[programIndex + programCount * _i], j);
        }
        // The copy into `a` sits outside the unmasked block.
	a = b;
    }
    return a;
}
@pbrubaker pbrubaker added the Performance All issues related to performance/code generation label Apr 14, 2024
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
Performance All issues related to performance/code generation
Projects
None yet
Development

No branches or pull requests

2 participants