Skip to content

Commit

Permalink
SPU: add some AVX-512 variants
Browse files Browse the repository at this point in the history
  • Loading branch information
Nekotekina committed Dec 19, 2017
1 parent 4aee4ed commit 6d34dcd
Show file tree
Hide file tree
Showing 5 changed files with 229 additions and 41 deletions.
25 changes: 24 additions & 1 deletion Utilities/sysinfo.cpp
Expand Up @@ -19,13 +19,26 @@ bool utils::has_avx()
return g_value;
}

// Reads extended control register XCR0 with XGETBV.
// XGETBV is only valid once the OS has enabled XSAVE, so callers must confirm
// CPUID.1:ECX bit 27 (OSXSAVE) before calling this.
static unsigned long long read_xcr0()
{
#ifdef _MSC_VER
	return _xgetbv(0);
#else
	unsigned int lo, hi;
	__asm__ volatile("xgetbv" : "=a" (lo), "=d" (hi) : "c" (0));
	return lo | (static_cast<unsigned long long>(hi) << 32);
#endif
}

// Returns true when OSXSAVE is set and every state component in `mask` is enabled in XCR0.
// A CPUID feature bit alone only says the CPU implements an extension; the instructions
// still fault if the OS has not enabled the corresponding register state.
static bool os_enabled_xstate(unsigned long long mask)
{
	const bool osxsave = utils::get_cpuid(0, 0)[0] >= 0x1 && (utils::get_cpuid(1, 0)[2] & 0x08000000) != 0;
	return osxsave && (read_xcr0() & mask) == mask;
}

// Reports whether AVX2 can be used: CPUID.7.0:EBX bit 5, plus SSE and AVX state
// (XCR0 bits 1-2) enabled by the OS. The result is computed once and cached.
bool utils::has_avx2()
{
	static const bool g_value = get_cpuid(0, 0)[0] >= 0x7 && (get_cpuid(7, 0)[1] & 0x20) != 0 && os_enabled_xstate(0x6);
	return g_value;
}

// Reports whether CPUID.7.0:EBX advertises both RTM (bit 11) and MPX (bit 14).
// The result is computed once and cached.
bool utils::has_rtm()
{
	// Check RTM and MPX extensions in order to filter out TSX on Haswell CPUs
	static const bool g_value = get_cpuid(0, 0)[0] >= 0x7 && (get_cpuid(7, 0)[1] & 0x4800) == 0x4800;
	return g_value;
}

// Reports whether the Skylake-X level AVX-512 subset can be used.
// The result is computed once and cached.
bool utils::has_512()
{
	// Check AVX512F, AVX512CD, AVX512DQ, AVX512BW, AVX512VL extensions (Skylake-X level support)
	// and that the OS has enabled SSE, AVX, opmask, ZMM_Hi256 and Hi16_ZMM state (XCR0 bits 1-2 and 5-7).
	// EVEX-encoded instructions require that state even when only XMM registers are used.
	static const bool g_value = get_cpuid(0, 0)[0] >= 0x7 && (get_cpuid(7, 0)[1] & 0xd0030000) == 0xd0030000 && os_enabled_xstate(0xe6);
	return g_value;
}

std::string utils::get_system_info()
{
std::string result;
Expand Down Expand Up @@ -68,7 +81,17 @@ std::string utils::get_system_info()

if (has_avx())
{
result += " | AVX+";
result += " | AVX";

if (has_avx2())
{
result += '+';
}

if (has_512())
{
result += '+';
}
}

if (has_rtm())
Expand Down
10 changes: 7 additions & 3 deletions Utilities/sysinfo.h
Expand Up @@ -8,20 +8,24 @@ namespace utils
// Executes the CPUID instruction for leaf `func` and sub-leaf `subfunc`.
// Returns the resulting register values in the order EAX, EBX, ECX, EDX.
inline std::array<u32, 4> get_cpuid(u32 func, u32 subfunc)
{
	int regs[4];
#ifdef _MSC_VER
	__cpuidex(regs, func, subfunc);
#else
	__asm__ volatile("cpuid" : "=a" (regs[0]), "=b" (regs[1]), "=c" (regs[2]), "=d" (regs[3]) : "a" (func), "c" (subfunc));
#endif
	// 0u + x converts each signed register value to unsigned without a cast
	return {0u+regs[0], 0u+regs[1], 0u+regs[2], 0u+regs[3]};
}

bool has_ssse3();

bool has_avx();

bool has_avx2();

bool has_rtm();

bool has_512();

inline bool transaction_enter()
{
while (true)
Expand Down
2 changes: 1 addition & 1 deletion asmjit
231 changes: 196 additions & 35 deletions rpcs3/Emu/Cell/SPUASMJITRecompiler.cpp
Expand Up @@ -113,6 +113,13 @@ void spu_recompiler::compile(spu_function_t& f)
vec.at(i) = vec_vars.data() + i;
}

compiler.alloc(vec_vars[0], asmjit::x86::xmm0);
compiler.alloc(vec_vars[1], asmjit::x86::xmm1);
compiler.alloc(vec_vars[2], asmjit::x86::xmm2);
compiler.alloc(vec_vars[3], asmjit::x86::xmm3);
compiler.alloc(vec_vars[4], asmjit::x86::xmm4);
compiler.alloc(vec_vars[5], asmjit::x86::xmm5);

// Initialize labels
std::vector<Label> pos_labels{ 0x10000 };
this->labels = pos_labels.data();
Expand Down Expand Up @@ -562,6 +569,16 @@ void spu_recompiler::ABSDB(spu_opcode_t op)

void spu_recompiler::ROT(spu_opcode_t op)
{
if (utils::has_512())
{
const XmmLink& va = XmmGet(op.ra, XmmType::Int);
const XmmLink& vb = XmmGet(op.rb, XmmType::Int);
const XmmLink& vt = XmmAlloc();
c->vprolvd(vt, va, vb);
c->movdqa(SPU_OFF_128(gpr, op.rt), vt);
return;
}

auto body = [](u32* t, const u32* a, const s32* b) noexcept
{
for (u32 i = 0; i < 4; i++)
Expand Down Expand Up @@ -589,6 +606,18 @@ void spu_recompiler::ROT(spu_opcode_t op)

void spu_recompiler::ROTM(spu_opcode_t op)
{
if (utils::has_512())
{
const XmmLink& va = XmmGet(op.ra, XmmType::Int);
const XmmLink& vb = XmmGet(op.rb, XmmType::Int);
const XmmLink& vt = XmmAlloc();
c->psubd(vb, XmmConst(_mm_set1_epi32(1)));
c->pandn(vb, XmmConst(_mm_set1_epi32(0x3f)));
c->vpsrlvd(vt, va, vb);
c->movdqa(SPU_OFF_128(gpr, op.rt), vt);
return;
}

auto body = [](u32* t, const u32* a, const u32* b) noexcept
{
for (u32 i = 0; i < 4; i++)
Expand Down Expand Up @@ -617,6 +646,18 @@ void spu_recompiler::ROTM(spu_opcode_t op)

void spu_recompiler::ROTMA(spu_opcode_t op)
{
if (utils::has_512())
{
const XmmLink& va = XmmGet(op.ra, XmmType::Int);
const XmmLink& vb = XmmGet(op.rb, XmmType::Int);
const XmmLink& vt = XmmAlloc();
c->psubd(vb, XmmConst(_mm_set1_epi32(1)));
c->pandn(vb, XmmConst(_mm_set1_epi32(0x3f)));
c->vpsravd(vt, va, vb);
c->movdqa(SPU_OFF_128(gpr, op.rt), vt);
return;
}

auto body = [](s32* t, const s32* a, const u32* b) noexcept
{
for (u32 i = 0; i < 4; i++)
Expand Down Expand Up @@ -645,6 +686,17 @@ void spu_recompiler::ROTMA(spu_opcode_t op)

void spu_recompiler::SHL(spu_opcode_t op)
{
if (utils::has_512())
{
const XmmLink& va = XmmGet(op.ra, XmmType::Int);
const XmmLink& vb = XmmGet(op.rb, XmmType::Int);
const XmmLink& vt = XmmAlloc();
c->pand(vb, XmmConst(_mm_set1_epi32(0x3f)));
c->vpsllvd(vt, va, vb);
c->movdqa(SPU_OFF_128(gpr, op.rt), vt);
return;
}

auto body = [](u32* t, const u32* a, const u32* b) noexcept
{
for (u32 i = 0; i < 4; i++)
Expand Down Expand Up @@ -672,6 +724,28 @@ void spu_recompiler::SHL(spu_opcode_t op)

void spu_recompiler::ROTH(spu_opcode_t op) //nf
{
if (utils::has_512())
{
// Trying to implement 16-bit rotates using 32-bit rotates and only XMM registers.
// 1) Cannot use YMM/ZMM: transition penalty in mixed code, CPU frequency penalty.
// 2) Cross-lane instructions like VPMOVZX are expensive.
const XmmLink& va = XmmGet(op.ra, XmmType::Int);
const XmmLink& vb = XmmGet(op.rb, XmmType::Int);
const XmmLink& v1 = XmmAlloc();
const XmmLink& v2 = XmmAlloc();
c->vpunpckhwd(v1, va, va);
c->vpunpcklwd(v2, va, va);
c->vpunpckhwd(va, vb, vb);
c->vpunpcklwd(vb, vb, vb);
c->vprolvd(va, v1, va);
c->vprolvd(vb, v2, vb);
c->psrad(va, 16);
c->psrad(vb, 16);
c->packssdw(vb, va);
c->movdqa(SPU_OFF_128(gpr, op.rt), vb);
return;
}

auto body = [](u16* t, const u16* a, const s16* b) noexcept
{
for (u32 i = 0; i < 8; i++)
Expand Down Expand Up @@ -699,6 +773,18 @@ void spu_recompiler::ROTH(spu_opcode_t op) //nf

void spu_recompiler::ROTHM(spu_opcode_t op)
{
if (utils::has_512())
{
const XmmLink& va = XmmGet(op.ra, XmmType::Int);
const XmmLink& vb = XmmGet(op.rb, XmmType::Int);
const XmmLink& vt = XmmAlloc();
c->psubw(vb, XmmConst(_mm_set1_epi16(1)));
c->pandn(vb, XmmConst(_mm_set1_epi16(0x1f)));
c->vpsrlvw(vt, va, vb);
c->movdqa(SPU_OFF_128(gpr, op.rt), vt);
return;
}

auto body = [](u16* t, const u16* a, const u16* b) noexcept
{
for (u32 i = 0; i < 8; i++)
Expand Down Expand Up @@ -727,6 +813,18 @@ void spu_recompiler::ROTHM(spu_opcode_t op)

void spu_recompiler::ROTMAH(spu_opcode_t op)
{
if (utils::has_512())
{
const XmmLink& va = XmmGet(op.ra, XmmType::Int);
const XmmLink& vb = XmmGet(op.rb, XmmType::Int);
const XmmLink& vt = XmmAlloc();
c->psubw(vb, XmmConst(_mm_set1_epi16(1)));
c->pandn(vb, XmmConst(_mm_set1_epi16(0x1f)));
c->vpsravw(vt, va, vb);
c->movdqa(SPU_OFF_128(gpr, op.rt), vt);
return;
}

auto body = [](s16* t, const s16* a, const u16* b) noexcept
{
for (u32 i = 0; i < 8; i++)
Expand Down Expand Up @@ -755,6 +853,17 @@ void spu_recompiler::ROTMAH(spu_opcode_t op)

void spu_recompiler::SHLH(spu_opcode_t op)
{
if (utils::has_512())
{
const XmmLink& va = XmmGet(op.ra, XmmType::Int);
const XmmLink& vb = XmmGet(op.rb, XmmType::Int);
const XmmLink& vt = XmmAlloc();
c->pand(vb, XmmConst(_mm_set1_epi16(0x1f)));
c->vpsllvw(vt, va, vb);
c->movdqa(SPU_OFF_128(gpr, op.rt), vt);
return;
}

auto body = [](u16* t, const u16* a, const u16* b) noexcept
{
for (u32 i = 0; i < 8; i++)
Expand Down Expand Up @@ -1783,6 +1892,15 @@ void spu_recompiler::HGT(spu_opcode_t op)

void spu_recompiler::CLZ(spu_opcode_t op)
{
if (utils::has_512())
{
const XmmLink& va = XmmGet(op.ra, XmmType::Int);
const XmmLink& vt = XmmAlloc();
c->vplzcntd(vt, va);
c->movdqa(SPU_OFF_128(gpr, op.rt), vt);
return;
}

auto body = [](u32* t, const u32* a) noexcept
{
for (u32 i = 0; i < 4; i++)
Expand Down Expand Up @@ -3073,6 +3191,14 @@ void spu_recompiler::SELB(spu_opcode_t op)
{
const XmmLink& vb = XmmGet(op.rb, XmmType::Int);
const XmmLink& vc = XmmGet(op.rc, XmmType::Int);

if (utils::has_512())
{
c->vpternlogd(vc, vb, SPU_OFF_128(gpr, op.ra), 0xca /* A?B:C */);
c->movdqa(SPU_OFF_128(gpr, op.rt4), vc);
return;
}

c->pand(vb, vc);
c->pandn(vc, SPU_OFF_128(gpr, op.ra));
c->por(vb, vc);
Expand All @@ -3081,6 +3207,30 @@ void spu_recompiler::SELB(spu_opcode_t op)

void spu_recompiler::SHUFB(spu_opcode_t op)
{
if (0 && utils::has_512())
{
// Deactivated due to poor performance of mask merge ops.
const XmmLink& va = XmmGet(op.ra, XmmType::Int);
const XmmLink& vb = XmmGet(op.rb, XmmType::Int);
const XmmLink& vc = XmmGet(op.rc, XmmType::Int);
const XmmLink& vt = XmmAlloc();
const XmmLink& vm = XmmAlloc();
c->vpcmpub(asmjit::x86::k1, vc, XmmConst(_mm_set1_epi8(-0x40)), 5 /* GE */);
c->vpxor(vm, vc, XmmConst(_mm_set1_epi8(0xf)));
c->setExtraReg(asmjit::x86::k1);
c->z().vblendmb(vc, vc, XmmConst(_mm_set1_epi8(-1))); // {k1}
c->vpcmpub(asmjit::x86::k2, vm, XmmConst(_mm_set1_epi8(-0x20)), 5 /* GE */);
c->vptestmb(asmjit::x86::k1, vm, XmmConst(_mm_set1_epi8(0x10)));
c->vpshufb(vt, va, vm);
c->setExtraReg(asmjit::x86::k2);
c->z().vblendmb(va, va, XmmConst(_mm_set1_epi8(0x7f))); // {k2}
c->setExtraReg(asmjit::x86::k1);
c->vpshufb(vt, vb, vm); // {k1}
c->vpternlogd(vt, va, vc, 0xf6 /* orAxorBC */);
c->movdqa(SPU_OFF_128(gpr, op.rt4), vt);
return;
}

alignas(16) static thread_local u8 s_lut[256]
{
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
Expand Down Expand Up @@ -3136,41 +3286,52 @@ void spu_recompiler::SHUFB(spu_opcode_t op)
return;
}

const XmmLink& v0 = XmmGet(op.rc, XmmType::Int); // v0 = mask
const XmmLink& v1 = XmmAlloc();
const XmmLink& v2 = XmmAlloc();
const XmmLink& v3 = XmmAlloc();
const XmmLink& v4 = XmmAlloc();
const XmmLink& vFF = XmmAlloc();
c->movdqa(v2, v0); // v2 = mask
// generate specific values:
c->movdqa(v1, XmmConst(_mm_set1_epi8(-0x20))); // v1 = 11100000
c->movdqa(v3, XmmConst(_mm_set1_epi8(-0x80))); // v3 = 10000000
c->pand(v2, v1); // filter mask v2 = mask & 11100000
c->movdqa(vFF, v2); // and copy vFF = mask & 11100000
c->movdqa(v4, XmmConst(_mm_set1_epi8(-0x40))); // v4 = 11000000
c->pcmpeqb(vFF, v4); // gen 0xff vFF = (mask & 11100000 == 11000000) ? 0xff : 0
c->movdqa(v4, v2); // copy again v4 = mask & 11100000
c->pand(v4, v3); // filter mask v4 = mask & 10000000
c->pcmpeqb(v2, v1); // v2 = (mask & 11100000 == 11100000) ? 0xff : 0
c->pcmpeqb(v4, v3); // v4 = (mask & 10000000 == 10000000) ? 0xff : 0
c->pand(v2, v3); // generate 0x80 v2 = (mask & 11100000 == 11100000) ? 0x80 : 0
c->por(vFF, v2); // merge 0xff, 0x80 vFF = (mask & 11100000 == 11000000) ? 0xff : (mask & 11100000 == 11100000) ? 0x80 : 0
c->pandn(v1, v0); // filter mask v1 = mask & 00011111
// select bytes from [op.rb]:
c->movdqa(v2, XmmConst(_mm_set1_epi8(0x0f))); // v2 = 00001111
c->pxor(v1, XmmConst(_mm_set1_epi8(0x10))); // v1 = (mask & 00011111) ^ 00010000
c->psubb(v2, v1); // v2 = 00001111 - ((mask & 00011111) ^ 00010000)
c->movdqa(v1, SPU_OFF_128(gpr, op.rb)); // v1 = op.rb
c->pshufb(v1, v2); // v1 = select(op.rb, 00001111 - ((mask & 00011111) ^ 00010000))
// select bytes from [op.ra]:
c->pxor(v2, XmmConst(_mm_set1_epi8(-0x10))); // v2 = (00001111 - ((mask & 00011111) ^ 00010000)) ^ 11110000
c->movdqa(v3, SPU_OFF_128(gpr, op.ra)); // v3 = op.ra
c->pshufb(v3, v2); // v3 = select(op.ra, (00001111 - ((mask & 00011111) ^ 00010000)) ^ 11110000)
c->por(v1, v3); // v1 = select(op.rb, 00001111 - ((mask & 00011111) ^ 00010000)) | (v3)
c->pandn(v4, v1); // filter result v4 = v1 & ((mask & 10000000 == 10000000) ? 0 : 0xff)
c->por(vFF, v4); // final merge vFF = (mask & 10000000 == 10000000) ? ((mask & 11100000 == 11000000) ? 0xff : (mask & 11100000 == 11100000) ? 0x80 : 0) : (v1)
c->movdqa(SPU_OFF_128(gpr, op.rt4), vFF);
const XmmLink& va = XmmGet(op.ra, XmmType::Int);
const XmmLink& vb = XmmGet(op.rb, XmmType::Int);
const XmmLink& vc = XmmGet(op.rc, XmmType::Int);
const XmmLink& vt = XmmAlloc();
const XmmLink& vm = XmmAlloc();
const XmmLink& v5 = XmmAlloc();
c->movdqa(vm, XmmConst(_mm_set1_epi8(0xc0)));

// Test for (110xxxxx) and (11xxxxxx) bit values
if (utils::has_avx())
{
c->vpand(v5, vc, XmmConst(_mm_set1_epi8(0xe0)));
c->vpand(vt, vc, vm);
}
else
{
c->movdqa(v5, vc);
c->pand(v5, XmmConst(_mm_set1_epi8(0xe0)));
c->movdqa(vt, vc);
c->pand(vt, vm);
}

c->pxor(vc, XmmConst(_mm_set1_epi8(0xf)));
c->pshufb(va, vc);
c->pshufb(vb, vc);
c->pand(vc, XmmConst(_mm_set1_epi8(0x10)));
c->pcmpeqb(v5, vm); // If true, result should become 0xFF
c->pcmpeqb(vt, vm); // If true, result should become either 0xFF or 0x80
c->pavgb(vt, v5); // Generate result constant: AVG(0xff, 0x00) == 0x80
c->pxor(vm, vm);
c->pcmpeqb(vc, vm);

// Select result value from va or vb
if (utils::has_512())
{
c->vpternlogd(vc, va, vb, 0xca /* A?B:C */);
}
else
{
c->pand(va, vc);
c->pandn(vc, vb);
c->por(vc, va);
}

c->por(vt, vc);
c->movdqa(SPU_OFF_128(gpr, op.rt4), vt);
}

void spu_recompiler::MPYA(spu_opcode_t op)
Expand Down

0 comments on commit 6d34dcd

Please sign in to comment.