Commit

Merge pull request #2003 from lioncash/svespill
JITs: Handle spilling/filling 256-bit vectors
Sonicadvance1 committed Sep 24, 2022
2 parents 8f70137 + 707db51 commit 6742e0c
Showing 6 changed files with 90 additions and 42 deletions.
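The diff handles the case where guest vector registers are 256 bits wide: the old spill/fill code only stored and reloaded the 128-bit Q view of each host vector register, so the upper half of a 256-bit value never reached the CpuStateFrame. With this change the AArch64 JIT uses predicated SVE stores and loads, and the x86-64 JIT uses 32-byte vmovups. A minimal sketch of the AArch64 spill, using the vixl calls and the STATE/TMP4/PRED_TMP_32B names that appear in the diff (Reg and i stand for an entry of the SRAFPR array, as in SpillStaticRegs):

// Before: only the low 128 bits (the Q view) of the vector were written out.
str(Reg.Q(), MemOperand(STATE, offsetof(FEXCore::Core::CpuStateFrame, State.xmm.avx.data[i][0])));

// After: the slot's byte offset is materialized in TMP4 and used as the index
// register of an SVE ST1B predicated to the first 32 bytes, so the full
// 256-bit Z register reaches memory.
mov(TMP4, offsetof(Core::CpuStateFrame, State.xmm.avx.data[i][0]));
st1b(Reg.Z().VnB(), PRED_TMP_32B, SVEMemOperand(STATE, TMP4));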
63 changes: 47 additions & 16 deletions External/FEXCore/Source/Interface/Core/ArchHelpers/Arm64Emitter.cpp
@@ -217,7 +217,8 @@ void Arm64Emitter::SpillStaticRegs(bool FPRs, uint32_t GPRSpillMask, uint32_t FP
const auto Reg = SRAFPR[i];

if (((1U << Reg.GetCode()) & FPRSpillMask) != 0) {
str(Reg.Q(), MemOperand(STATE, offsetof(FEXCore::Core::CpuStateFrame, State.xmm.avx.data[i][0])));
mov(TMP4, offsetof(Core::CpuStateFrame, State.xmm.avx.data[i][0]));
st1b(Reg.Z().VnB(), PRED_TMP_32B, SVEMemOperand(STATE, TMP4));
}
}
} else {
@@ -260,11 +261,19 @@ void Arm64Emitter::FillStaticRegs(bool FPRs, uint32_t GPRFillMask, uint32_t FPRF

if (FPRs) {
if (EmitterCTX->HostFeatures.SupportsAVX) {
// Set up predicate registers.
// We don't bother spilling these in SpillStaticRegs,
// since all that matters is we restore them on a fill.
// It's not a concern if they get trounced by something else.
ptrue(PRED_TMP_16B.VnB(), SVE_VL16);
ptrue(PRED_TMP_32B.VnB(), SVE_VL32);

for (size_t i = 0; i < SRAFPR.size(); i++) {
const auto Reg = SRAFPR[i];

if (((1U << Reg.GetCode()) & FPRFillMask) != 0) {
ldr(Reg.Q(), MemOperand(STATE, offsetof(FEXCore::Core::CpuStateFrame, State.xmm.avx.data[i][0])));
mov(TMP4, offsetof(Core::CpuStateFrame, State.xmm.avx.data[i][0]));
ld1b(Reg.Z().VnB(), PRED_TMP_32B.Zeroing(), SVEMemOperand(STATE, TMP4));
}
}
} else {
@@ -289,20 +298,31 @@ }
}
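The comment in FillStaticRegs above notes that the predicate temporaries are simply rebuilt on every fill rather than being saved and restored. A short sketch of what that setup produces and how one 256-bit fill consumes it, assuming the same vixl calls as the diff (Reg and i again stand for the SRAFPR entry being restored):

// ptrue with an explicit VL pattern makes exactly that many lanes active; with
// byte-sized lanes this marks the low 16 or low 32 bytes of a vector register.
ptrue(PRED_TMP_16B.VnB(), SVE_VL16);   // p6: first 16 byte lanes active
ptrue(PRED_TMP_32B.VnB(), SVE_VL32);   // p7: first 32 byte lanes active

// A zeroing, predicated LD1B then reloads the full 32-byte guest vector:
// active lanes are read from memory, inactive lanes are cleared.
mov(TMP4, offsetof(Core::CpuStateFrame, State.xmm.avx.data[i][0]));
ld1b(Reg.Z().VnB(), PRED_TMP_32B.Zeroing(), SVEMemOperand(STATE, TMP4));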

void Arm64Emitter::PushDynamicRegsAndLR() {
uint64_t SPOffset = AlignUp((RA64.size() + 1) * 8 + RAFPR.size() * 16, 16);
const auto CanUseSVE = EmitterCTX->HostFeatures.SupportsAVX;
const auto GPRSize = (RA64.size() + 1) * Core::CPUState::GPR_REG_SIZE;
const auto FPRRegSize = CanUseSVE ? Core::CPUState::XMM_AVX_REG_SIZE
: Core::CPUState::XMM_SSE_REG_SIZE;
const auto FPRSize = RAFPR.size() * FPRRegSize;
const uint64_t SPOffset = AlignUp(GPRSize + FPRSize, 16);

sub(sp, sp, SPOffset);
int i = 0;

for (auto RA : RAFPR)
{
str(RA.Q(), MemOperand(sp, i * 8));
i+=2;
if (CanUseSVE) {
for (const auto& RA : RAFPR) {
mov(TMP4, i * 8);
st1b(RA.Z().VnB(), PRED_TMP_32B, SVEMemOperand(sp, TMP4));
i += 4;
}
} else {
for (const auto& RA : RAFPR) {
str(RA.Q(), MemOperand(sp, i * 8));
i += 2;
}
}

#if 0 // All GPRs should be caller saved
for (auto RA : RA64)
{
for (const auto& RA : RA64) {
str(RA, MemOperand(sp, i * 8));
i++;
}
@@ -312,18 +332,29 @@ void Arm64Emitter::PushDynamicRegsAndLR() {
}

void Arm64Emitter::PopDynamicRegsAndLR() {
uint64_t SPOffset = AlignUp((RA64.size() + 1) * 8 + RAFPR.size() * 16, 16);
const auto CanUseSVE = EmitterCTX->HostFeatures.SupportsAVX;
const auto GPRSize = (RA64.size() + 1) * Core::CPUState::GPR_REG_SIZE;
const auto FPRRegSize = CanUseSVE ? Core::CPUState::XMM_AVX_REG_SIZE
: Core::CPUState::XMM_SSE_REG_SIZE;
const auto FPRSize = RAFPR.size() * FPRRegSize;
const uint64_t SPOffset = AlignUp(GPRSize + FPRSize, 16);
int i = 0;

for (auto RA : RAFPR)
{
ldr(RA.Q(), MemOperand(sp, i * 8));
i+=2;
if (CanUseSVE) {
for (const auto& RA : RAFPR) {
mov(TMP4, i * 8);
ld1b(RA.Z().VnB(), PRED_TMP_32B.Zeroing(), SVEMemOperand(sp, TMP4));
i += 4;
}
} else {
for (const auto& RA : RAFPR) {
ldr(RA.Q(), MemOperand(sp, i * 8));
i += 2;
}
}

#if 0 // All GPRs should be caller saved
for (auto RA : RA64)
{
for (const auto& RA : RA64) {
ldr(RA, MemOperand(sp, i * 8));
i++;
}
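PushDynamicRegsAndLR and PopDynamicRegsAndLR above now derive the stack frame size from the AVX or SSE register width. A standalone recomputation of that formula, assuming GPR_REG_SIZE is 8 bytes and XMM_SSE_REG_SIZE/XMM_AVX_REG_SIZE are 16 and 32 bytes; AlignUp here is a local stand-in for the helper used in the diff, the FPR count of 12 comes from the RAFPR array declared in the header below, and RA64.size() is not shown in this diff, so it stays a parameter:

#include <cstddef>

constexpr std::size_t AlignUp(std::size_t Value, std::size_t Alignment) {
  return (Value + Alignment - 1) & ~(Alignment - 1);
}

constexpr std::size_t FrameSize(std::size_t NumGPRs, std::size_t NumFPRs, bool SupportsAVX) {
  const std::size_t GPRSize = (NumGPRs + 1) * 8;          // +1 slot for LR
  const std::size_t FPRRegSize = SupportsAVX ? 32 : 16;   // bytes per vector register
  return AlignUp(GPRSize + NumFPRs * FPRRegSize, 16);
}

// With 12 allocatable FPRs the vector area grows from 192 to 384 bytes once
// SVE spilling is in use, which is why the store loops advance their 8-byte
// index by 4 instead of 2 per register.
static_assert(FrameSize(0, 12, false) == 208, "12 x 16-byte slots plus LR, aligned");
static_assert(FrameSize(0, 12, true) == 400, "12 x 32-byte slots plus LR, aligned");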
@@ -65,7 +65,7 @@ const std::array<aarch64::VRegister, 12> RAFPR = {
// Contains the address to the currently available CPU state
#define STATE x28

// GPR temporaries (only x2 and x3 can be used across spill boundaries)
// GPR temporaries. Only x3 can be used across spill boundaries
// so if these ever need to change, be very careful about that.
#define TMP1 x0
#define TMP2 x1
@@ -77,6 +77,12 @@ const std::array<aarch64::VRegister, 12> RAFPR = {
#define VTMP2 v2
#define VTMP3 v3

// Predicate register temporaries (used when AVX support is enabled)
// PRED_TMP_16B is a predicate register with only the first 16 bytes set to 1.
// PRED_TMP_32B is a predicate register with only the first 32 bytes set to 1.
#define PRED_TMP_16B p6
#define PRED_TMP_32B p7

// This class contains common emitter utility functions that can
// be used by both Arm64 JIT and ARM64 Dispatcher
class Arm64Emitter : public vixl::aarch64::Assembler {
@@ -86,6 +92,10 @@ class Arm64Emitter : public vixl::aarch64::Assembler {
FEXCore::Context::Context *EmitterCTX;
vixl::aarch64::CPU CPU;
void LoadConstant(vixl::aarch64::Register Reg, uint64_t Constant, bool NOPPad = false);

// NOTE: These functions WILL clobber the register TMP4 if AVX support is enabled
// and FPRs are being spilled or filled. If only GPRs are spilled/filled, then
// TMP4 is left alone.
void SpillStaticRegs(bool FPRs = true, uint32_t GPRSpillMask = ~0U, uint32_t FPRSpillMask = ~0U);
void FillStaticRegs(bool FPRs = true, uint32_t GPRFillMask = ~0U, uint32_t FPRFillMask = ~0U);
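A small illustration of the contract described in the note, with the parameter meaning taken from the declarations above: the default call includes FPRs and may therefore clobber TMP4 on AVX-capable hosts, while passing false restricts the helpers to GPRs and leaves TMP4 untouched.

SpillStaticRegs();        // GPRs + FPRs: TMP4 may be clobbered when AVX is enabled
FillStaticRegs();

SpillStaticRegs(false);   // GPRs only: TMP4 is left alone
FillStaticRegs(false);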

@@ -38,7 +38,7 @@ namespace FEXCore::CPU {
using namespace vixl;
using namespace vixl::aarch64;

constexpr size_t MAX_DISPATCHER_CODE_SIZE = 4096;
constexpr size_t MAX_DISPATCHER_CODE_SIZE = 8192;

Arm64Dispatcher::Arm64Dispatcher(FEXCore::Context::Context *ctx, const DispatcherConfig &config)
: FEXCore::CPU::Dispatcher(ctx, config), Arm64Emitter(ctx, MAX_DISPATCHER_CODE_SIZE)
@@ -437,10 +437,9 @@ Arm64Dispatcher::Arm64Dispatcher(FEXCore::Context::Context *ctx, const Dispatche
LUDIVHandlerAddress = GetCursorAddress<uint64_t>();

PushDynamicRegsAndLR();
SpillStaticRegs();

ldr(x3, STATE_PTR(CpuStateFrame, Pointers.AArch64.LUDIV));

SpillStaticRegs();
#ifdef VIXL_SIMULATOR
GenerateIndirectRuntimeCall<uint64_t, uint64_t, uint64_t, uint64_t>(x3);
#else
@@ -460,10 +459,9 @@ Arm64Dispatcher::Arm64Dispatcher(FEXCore::Context::Context *ctx, const Dispatche
LDIVHandlerAddress = GetCursorAddress<uint64_t>();

PushDynamicRegsAndLR();
SpillStaticRegs();

ldr(x3, STATE_PTR(CpuStateFrame, Pointers.AArch64.LDIV));

SpillStaticRegs();
#ifdef VIXL_SIMULATOR
GenerateIndirectRuntimeCall<uint64_t, uint64_t, uint64_t, uint64_t>(x3);
#else
@@ -483,10 +481,9 @@ Arm64Dispatcher::Arm64Dispatcher(FEXCore::Context::Context *ctx, const Dispatche
LUREMHandlerAddress = GetCursorAddress<uint64_t>();

PushDynamicRegsAndLR();
SpillStaticRegs();

ldr(x3, STATE_PTR(CpuStateFrame, Pointers.AArch64.LUREM));

SpillStaticRegs();
#ifdef VIXL_SIMULATOR
GenerateIndirectRuntimeCall<uint64_t, uint64_t, uint64_t, uint64_t>(x3);
#else
@@ -506,10 +503,9 @@ Arm64Dispatcher::Arm64Dispatcher(FEXCore::Context::Context *ctx, const Dispatche
LREMHandlerAddress = GetCursorAddress<uint64_t>();

PushDynamicRegsAndLR();
SpillStaticRegs();

ldr(x3, STATE_PTR(CpuStateFrame, Pointers.AArch64.LREM));

SpillStaticRegs();
#ifdef VIXL_SIMULATOR
GenerateIndirectRuntimeCall<uint64_t, uint64_t, uint64_t, uint64_t>(x3);
#else
@@ -471,6 +471,7 @@ DEF_OP(CPUID) {
auto Op = IROp->C<IR::IROp_CPUID>();

PushDynamicRegsAndLR();
SpillStaticRegs();

// x0 = CPUID Handler
// x1 = CPUID Function
@@ -479,14 +480,13 @@
ldr(x3, MemOperand(STATE, offsetof(FEXCore::Core::CpuStateFrame, Pointers.Common.CPUIDFunction)));
mov(x1, GetReg<RA_64>(Op->Function.ID()));
mov(x2, GetReg<RA_64>(Op->Leaf.ID()));
SpillStaticRegs();
#ifdef VIXL_SIMULATOR
GenerateIndirectRuntimeCall<__uint128_t, void*, uint64_t, uint64_t>(x3);
#else
blr(x3);
#endif
FillStaticRegs();

FillStaticRegs();
PopDynamicRegsAndLR();

// Results are in x0, x1
5 changes: 3 additions & 2 deletions External/FEXCore/Source/Interface/Core/JIT/Arm64/MiscOps.cpp
@@ -137,6 +137,7 @@ DEF_OP(Print) {
auto Op = IROp->C<IR::IROp_Print>();

PushDynamicRegsAndLR();
SpillStaticRegs();

if (IsGPR(Op->Value.ID())) {
mov(x0, GetReg<RA_64>(Op->Value.ID()));
@@ -148,10 +149,10 @@ DEF_OP(Print) {
fmov(x1, GetSrc(Op->Value.ID()).V1D(), 1);
ldr(x3, MemOperand(STATE, offsetof(FEXCore::Core::CpuStateFrame, Pointers.Common.PrintVectorValue)));
}
SpillStaticRegs();

blr(x3);
FillStaticRegs();

FillStaticRegs();
PopDynamicRegsAndLR();
}

34 changes: 22 additions & 12 deletions External/FEXCore/Source/Interface/Core/JIT/x86_64/JIT.cpp
@@ -60,32 +60,42 @@ static void PrintVectorValue(uint64_t Value, uint64_t ValueUpper) {
namespace FEXCore::CPU {

void X86JITCore::PushRegs() {
sub(rsp, 16 * RAXMM_x.size());
const auto AVXRegSize = Core::CPUState::XMM_AVX_REG_SIZE;

sub(rsp, AVXRegSize * RAXMM_x.size());
for (size_t i = 0; i < RAXMM_x.size(); ++i) {
movaps(ptr[rsp + i * 16], RAXMM_x[i]);
vmovups(ptr[rsp + i * AVXRegSize], ToYMM(RAXMM_x[i]));
}

for (auto &Reg : RA64)
for (const auto &Reg : RA64) {
push(Reg);
}

auto NumPush = RA64.size();
if (NumPush & 1)
sub(rsp, 8); // Align
const auto NumPush = RA64.size();
if ((NumPush & 1) != 0) {
// Align
sub(rsp, 8);
}
}

void X86JITCore::PopRegs() {
auto NumPush = RA64.size();
const auto AVXRegSize = Core::CPUState::XMM_AVX_REG_SIZE;
const auto NumPush = RA64.size();

if (NumPush & 1)
add(rsp, 8); // Align
for (uint32_t i = RA64.size(); i > 0; --i)
if ((NumPush & 1) != 0) {
// Align
add(rsp, 8);
}

for (uint32_t i = RA64.size(); i > 0; --i) {
pop(RA64[i - 1]);
}

for (size_t i = 0; i < RAXMM_x.size(); ++i) {
movaps(RAXMM_x[i], ptr[rsp + i * 16]);
vmovups(ToYMM(RAXMM_x[i]), ptr[rsp + i * AVXRegSize]);
}

add(rsp, 16 * RAXMM_x.size());
add(rsp, AVXRegSize * RAXMM_x.size());
}
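The x86-64 JIT gets the same widening through Xbyak: the spill area is sized at XMM_AVX_REG_SIZE (32) bytes per allocatable vector register and the full YMM form is stored and reloaded. A condensed sketch of the new save loop follows; the remark on alignment is an inference, not something stated in the commit.

// vmovups is the unaligned form: after sub(rsp, 32 * N) the slots are not
// guaranteed to be 32-byte aligned, so an aligned 32-byte store could fault,
// while the old 16-byte movaps only required 16-byte alignment.
sub(rsp, 32 * RAXMM_x.size());
for (size_t i = 0; i < RAXMM_x.size(); ++i) {
  vmovups(ptr[rsp + i * 32], ToYMM(RAXMM_x[i]));   // full 256-bit YMM register
}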

void X86JITCore::Op_Unhandled(IR::IROp_Header *IROp, IR::NodeID Node) {
