Skip to content

Commit

Permalink
Merge pull request #3274 from alyssarosenzweig/opt/pf
Browse files Browse the repository at this point in the history
Arm64Emitter: Dedicate registers for PF/AF
  • Loading branch information
Sonicadvance1 committed Nov 18, 2023
2 parents 1c11509 + 11993da commit 9b64674
Show file tree
Hide file tree
Showing 61 changed files with 17,987 additions and 17,972 deletions.
2 changes: 1 addition & 1 deletion FEXCore/Source/Interface/Context/Context.h
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,7 @@ namespace FEXCore::Context {
void HandleCallback(FEXCore::Core::InternalThreadState *Thread, uint64_t RIP) override;

uint64_t RestoreRIPFromHostPC(FEXCore::Core::InternalThreadState *Thread, uint64_t HostPC) override;
uint32_t ReconstructCompactedEFLAGS(FEXCore::Core::InternalThreadState *Thread) override;
uint32_t ReconstructCompactedEFLAGS(FEXCore::Core::InternalThreadState *Thread, bool WasInJIT, uint64_t *HostGPRs, uint64_t PSTATE) override;
void SetFlagsFromCompactedEFLAGS(FEXCore::Core::InternalThreadState *Thread, uint32_t EFLAGS) override;

/**
Expand Down
51 changes: 39 additions & 12 deletions FEXCore/Source/Interface/Core/ArchHelpers/Arm64Emitter.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
// SPDX-License-Identifier: MIT
#include "Interface/Core/ArchHelpers/Arm64Emitter.h"
#include "FEXCore/Core/X86Enums.h"
#include "FEXCore/Utils/AllocatorHooks.h"
#include "Interface/Core/ArchHelpers/CodeEmitter/Emitter.h"
#include "Interface/Core/ArchHelpers/CodeEmitter/Registers.h"
Expand Down Expand Up @@ -28,31 +29,31 @@ namespace FEXCore::CPU {

namespace x64 {
// All but x19 and x29 are caller saved
constexpr std::array<FEXCore::ARMEmitter::Register, 16> SRA = {
constexpr std::array<FEXCore::ARMEmitter::Register, 18> SRA = {
FEXCore::ARMEmitter::Reg::r4, FEXCore::ARMEmitter::Reg::r5,
FEXCore::ARMEmitter::Reg::r6, FEXCore::ARMEmitter::Reg::r7,
FEXCore::ARMEmitter::Reg::r8, FEXCore::ARMEmitter::Reg::r9,
FEXCore::ARMEmitter::Reg::r10, FEXCore::ARMEmitter::Reg::r11,
FEXCore::ARMEmitter::Reg::r12, FEXCore::ARMEmitter::Reg::r13,
FEXCore::ARMEmitter::Reg::r14, FEXCore::ARMEmitter::Reg::r15,
FEXCore::ARMEmitter::Reg::r16, FEXCore::ARMEmitter::Reg::r17,
FEXCore::ARMEmitter::Reg::r19, FEXCore::ARMEmitter::Reg::r29
FEXCore::ARMEmitter::Reg::r19, FEXCore::ARMEmitter::Reg::r29,
// PF/AF must be last.
REG_PF, REG_AF,
};

constexpr std::array<FEXCore::ARMEmitter::Register, 9> RA = {
constexpr std::array<FEXCore::ARMEmitter::Register, 7> RA = {
// All these callee saved
FEXCore::ARMEmitter::Reg::r20, FEXCore::ARMEmitter::Reg::r21,
FEXCore::ARMEmitter::Reg::r22, FEXCore::ARMEmitter::Reg::r23,
FEXCore::ARMEmitter::Reg::r24, FEXCore::ARMEmitter::Reg::r25,
FEXCore::ARMEmitter::Reg::r26, FEXCore::ARMEmitter::Reg::r27,
FEXCore::ARMEmitter::Reg::r30,
};

constexpr std::array<std::pair<FEXCore::ARMEmitter::Register, FEXCore::ARMEmitter::Register>, 4> RAPair = {{
constexpr std::array<std::pair<FEXCore::ARMEmitter::Register, FEXCore::ARMEmitter::Register>, 3> RAPair = {{
{FEXCore::ARMEmitter::Reg::r20, FEXCore::ARMEmitter::Reg::r21},
{FEXCore::ARMEmitter::Reg::r22, FEXCore::ARMEmitter::Reg::r23},
{FEXCore::ARMEmitter::Reg::r24, FEXCore::ARMEmitter::Reg::r25},
{FEXCore::ARMEmitter::Reg::r26, FEXCore::ARMEmitter::Reg::r27},
}};

// All are caller saved
Expand Down Expand Up @@ -175,19 +176,20 @@ namespace x64 {

namespace x32 {
// All but x19 and x29 are caller saved
constexpr std::array<FEXCore::ARMEmitter::Register, 8> SRA = {
constexpr std::array<FEXCore::ARMEmitter::Register, 10> SRA = {
FEXCore::ARMEmitter::Reg::r4, FEXCore::ARMEmitter::Reg::r5,
FEXCore::ARMEmitter::Reg::r6, FEXCore::ARMEmitter::Reg::r7,
FEXCore::ARMEmitter::Reg::r8, FEXCore::ARMEmitter::Reg::r9,
FEXCore::ARMEmitter::Reg::r10, FEXCore::ARMEmitter::Reg::r11,
// PF/AF must be last.
REG_PF, REG_AF,
};

constexpr std::array<FEXCore::ARMEmitter::Register, 17> RA = {
constexpr std::array<FEXCore::ARMEmitter::Register, 15> RA = {
// All these callee saved
FEXCore::ARMEmitter::Reg::r20, FEXCore::ARMEmitter::Reg::r21,
FEXCore::ARMEmitter::Reg::r22, FEXCore::ARMEmitter::Reg::r23,
FEXCore::ARMEmitter::Reg::r24, FEXCore::ARMEmitter::Reg::r25,
FEXCore::ARMEmitter::Reg::r26, FEXCore::ARMEmitter::Reg::r27,

// Registers only available on 32-bit
// All these are caller saved (except for r19).
Expand All @@ -199,11 +201,10 @@ namespace x32 {
FEXCore::ARMEmitter::Reg::r19,
};

constexpr std::array<std::pair<FEXCore::ARMEmitter::Register, FEXCore::ARMEmitter::Register>, 8> RAPair = {{
constexpr std::array<std::pair<FEXCore::ARMEmitter::Register, FEXCore::ARMEmitter::Register>, 7> RAPair = {{
{FEXCore::ARMEmitter::Reg::r20, FEXCore::ARMEmitter::Reg::r21},
{FEXCore::ARMEmitter::Reg::r22, FEXCore::ARMEmitter::Reg::r23},
{FEXCore::ARMEmitter::Reg::r24, FEXCore::ARMEmitter::Reg::r25},
{FEXCore::ARMEmitter::Reg::r26, FEXCore::ARMEmitter::Reg::r27},

{FEXCore::ARMEmitter::Reg::r12, FEXCore::ARMEmitter::Reg::r13},
{FEXCore::ARMEmitter::Reg::r14, FEXCore::ARMEmitter::Reg::r15},
Expand Down Expand Up @@ -368,7 +369,7 @@ Arm64Emitter::Arm64Emitter(FEXCore::Context::ContextImpl *ctx, void* EmissionPtr
GeneralFPRegisters = x64::RAFPR;
}
else {
ConfiguredDynamicRegisterBase = std::span(x32::RA.begin() + 8, 8);
ConfiguredDynamicRegisterBase = std::span(x32::RA.begin() + 6, 8);

StaticRegisters = x32::SRA;
GeneralRegisters = x32::RA;
Expand Down Expand Up @@ -613,6 +614,11 @@ void Arm64Emitter::SpillStaticRegs(FEXCore::ARMEmitter::Register TmpReg, bool FP
return;
}

// PF/AF are special, remove them from the mask
uint32_t PFAFMask = ((1u << REG_PF.Idx()) | ((1u << REG_AF.Idx())));
unsigned PFAFSpillMask = GPRSpillMask & PFAFMask;
GPRSpillMask &= ~PFAFSpillMask;

for (size_t i = 0; i < StaticRegisters.size(); i+=2) {
auto Reg1 = StaticRegisters[i];
auto Reg2 = StaticRegisters[i+1];
Expand All @@ -628,6 +634,14 @@ void Arm64Emitter::SpillStaticRegs(FEXCore::ARMEmitter::Register TmpReg, bool FP
}
}

// Now handle PF/AF
if (PFAFSpillMask) {
LOGMAN_THROW_A_FMT(PFAFSpillMask == PFAFMask, "PF/AF not spilled together");

str(REG_PF.X(), STATE.R(), offsetof(FEXCore::Core::CpuStateFrame, State.pf_raw));
str(REG_AF.X(), STATE.R(), offsetof(FEXCore::Core::CpuStateFrame, State.af_raw));
}

if (FPRs) {
if (EmitterCTX->HostFeatures.SupportsAVX) {
for (size_t i = 0; i < StaticFPRegisters.size(); i++) {
Expand Down Expand Up @@ -770,6 +784,11 @@ void Arm64Emitter::FillStaticRegs(bool FPRs, uint32_t GPRFillMask, uint32_t FPRF
}
}

// PF/AF are special, remove them from the mask
uint32_t PFAFMask = ((1u << REG_PF.Idx()) | ((1u << REG_AF.Idx())));
uint32_t PFAFFillMask = GPRFillMask & PFAFMask;
GPRFillMask &= ~PFAFMask;

for (size_t i = 0; i < StaticRegisters.size(); i+=2) {
auto Reg1 = StaticRegisters[i];
auto Reg2 = StaticRegisters[i+1];
Expand All @@ -784,6 +803,14 @@ void Arm64Emitter::FillStaticRegs(bool FPRs, uint32_t GPRFillMask, uint32_t FPRF
ldr(Reg2.X(), STATE.R(), offsetof(FEXCore::Core::CpuStateFrame, State.gregs[i+1]));
}
}

// Now handle PF/AF
if (PFAFFillMask) {
LOGMAN_THROW_A_FMT(PFAFFillMask == PFAFMask, "PF/AF not filled together");

ldr(REG_PF.X(), STATE.R(), offsetof(FEXCore::Core::CpuStateFrame, State.pf_raw));
ldr(REG_AF.X(), STATE.R(), offsetof(FEXCore::Core::CpuStateFrame, State.af_raw));
}
}

void Arm64Emitter::PushVectorRegisters(FEXCore::ARMEmitter::Register TmpReg, bool SVERegs, std::span<const FEXCore::ARMEmitter::VRegister> VRegs) {
Expand Down
4 changes: 4 additions & 0 deletions FEXCore/Source/Interface/Core/ArchHelpers/Arm64Emitter.h
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,10 @@ constexpr auto VTMP2 = FEXCore::ARMEmitter::VReg::v1;
constexpr FEXCore::ARMEmitter::PRegister PRED_TMP_16B = FEXCore::ARMEmitter::PReg::p6;
constexpr FEXCore::ARMEmitter::PRegister PRED_TMP_32B = FEXCore::ARMEmitter::PReg::p7;

// PF and AF are pinned to fixed host registers: r26 holds PF and r27 holds AF.
// This register assignment is internal FEX ABI.
constexpr auto REG_PF = FEXCore::ARMEmitter::Reg::r26;
constexpr auto REG_AF = FEXCore::ARMEmitter::Reg::r27;

// This class contains common emitter utility functions that can
// be used by both Arm64 JIT and ARM64 Dispatcher
class Arm64Emitter : public FEXCore::ARMEmitter::Emitter {
Expand Down
10 changes: 10 additions & 0 deletions FEXCore/Source/Interface/Core/ArchHelpers/CodeEmitter/ALUOps.inl
Original file line number Diff line number Diff line change
Expand Up @@ -771,6 +771,16 @@ public:
dc32(Op);
}

// Emits AXFLAG (Arm FEAT_FlagM2): converts the floating-point condition flags
// from Arm format to the external format. The instruction takes no operands,
// so the whole 32-bit encoding is a fixed constant.
void axflag() {
  constexpr uint32_t Encoding = 0xD500'405F;
  dc32(Encoding);
}

// Emits XAFLAG (Arm FEAT_FlagM2): converts the floating-point condition flags
// from the external format back to Arm format. The instruction takes no
// operands, so the whole 32-bit encoding is a fixed constant.
void xaflag() {
  constexpr uint32_t Encoding = 0xD500'403F;
  dc32(Encoding);
}

// Conditional compare - register
void ccmn(FEXCore::ARMEmitter::Size s, FEXCore::ARMEmitter::Register rn, FEXCore::ARMEmitter::Register rm, FEXCore::ARMEmitter::StatusFlags flags, FEXCore::ARMEmitter::Condition Cond) {
constexpr uint32_t Op = 0b0011'1010'010 << 21;
Expand Down
29 changes: 22 additions & 7 deletions FEXCore/Source/Interface/Core/Core.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ desc: Glues Frontend, OpDispatcher and IR Opts & Compilation, LookupCache, Dispa
#include <cstdint>
#include "FEXCore/Utils/DeferredSignalMutex.h"
#include "Interface/Context/Context.h"
#include "Interface/Core/ArchHelpers//Arm64Emitter.h"
#include "Interface/Core/LookupCache.h"
#include "Interface/Core/CPUID.h"
#include "Interface/Core/Frontend.h"
Expand Down Expand Up @@ -220,7 +221,7 @@ namespace FEXCore::Context {
return Frame->State.rip;
}

uint32_t ContextImpl::ReconstructCompactedEFLAGS(FEXCore::Core::InternalThreadState *Thread) {
uint32_t ContextImpl::ReconstructCompactedEFLAGS(FEXCore::Core::InternalThreadState *Thread, bool WasInJIT, uint64_t *HostGPRs, uint64_t PSTATE) {
const auto Frame = Thread->CurrentFrame;
uint32_t EFLAGS{};

Expand All @@ -242,9 +243,23 @@ namespace FEXCore::Context {
}
}

// SF/ZF/CF/OF are packed in a 32-bit value in RFLAG_NZCV_LOC.
uint32_t Packed_NZCV{};
memcpy(&Packed_NZCV, &Frame->State.flags[X86State::RFLAG_NZCV_LOC], sizeof(Packed_NZCV));
if (WasInJIT) {
// If we were in the JIT then NZCV is in the CPU's PSTATE object,
// packed into the same bit locations as RFLAG_NZCV_LOC.
Packed_NZCV = PSTATE;

// If we were in the JIT then PF and AF are in registers.
// Move them to the CPUState frame now.
Frame->State.pf_raw = HostGPRs[CPU::REG_PF.Idx()];
Frame->State.af_raw = HostGPRs[CPU::REG_AF.Idx()];
}
else {
// If we were not in the JIT then the NZCV state is stored in the CPUState RFLAG_NZCV_LOC.
// SF/ZF/CF/OF are packed in a 32-bit value in RFLAG_NZCV_LOC.
memcpy(&Packed_NZCV, &Frame->State.flags[X86State::RFLAG_NZCV_LOC], sizeof(Packed_NZCV));
}

uint32_t OF = (Packed_NZCV >> IR::OpDispatchBuilder::IndexNZCV(X86State::RFLAG_OF_RAW_LOC)) & 1;
uint32_t CF = (Packed_NZCV >> IR::OpDispatchBuilder::IndexNZCV(X86State::RFLAG_CF_RAW_LOC)) & 1;
uint32_t ZF = (Packed_NZCV >> IR::OpDispatchBuilder::IndexNZCV(X86State::RFLAG_ZF_RAW_LOC)) & 1;
Expand All @@ -258,13 +273,13 @@ namespace FEXCore::Context {

// PF calculation is deferred, calculate it now.
// Popcount the 8-bit flag and then extract the lower bit.
uint32_t PFByte = Frame->State.flags[X86State::RFLAG_PF_RAW_LOC];
uint32_t PFByte = Frame->State.pf_raw & 0xff;
uint32_t PF = std::popcount(PFByte ^ 1) & 1;
EFLAGS |= PF << X86State::RFLAG_PF_RAW_LOC;

// AF calculation is deferred, calculate it now.
// XOR with PF byte and extract bit 4.
uint32_t AF = ((Frame->State.flags[X86State::RFLAG_AF_RAW_LOC] ^ PFByte) & (1 << 4)) ? 1 : 0;
uint32_t AF = ((Frame->State.af_raw ^ PFByte) & (1 << 4)) ? 1 : 0;
EFLAGS |= AF << X86State::RFLAG_AF_RAW_LOC;

return EFLAGS;
Expand All @@ -284,11 +299,11 @@ namespace FEXCore::Context {
// AF is stored in bit 4 in our internal representation. It is also
// XORed with bit 4 of the PF byte, but we write that bit as zero here, so
// we don't need any special handling for that.
Frame->State.flags[i] = (EFLAGS & (1U << i)) ? (1 << 4) : 0;
Frame->State.af_raw = (EFLAGS & (1U << i)) ? (1 << 4) : 0;
break;
case X86State::RFLAG_PF_RAW_LOC:
// PF is inverted in our internal representation.
Frame->State.flags[i] = (EFLAGS & (1U << i)) ? 0 : 1;
Frame->State.pf_raw = (EFLAGS & (1U << i)) ? 0 : 1;
break;
default:
Frame->State.flags[i] = (EFLAGS & (1U << i)) ? 1 : 0;
Expand Down
36 changes: 5 additions & 31 deletions FEXCore/Source/Interface/Core/JIT/Arm64/ALUOps.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -212,6 +212,11 @@ DEF_OP(RmifNZCV) {
rmif(GetReg(Op->Src.ID()).X(), Op->Rotate, Op->Mask);
}

// Backend handler for the AXFlag IR op: emits a single AXFLAG instruction.
// The op carries no operands; nothing is read from or written to IR registers here.
DEF_OP(AXFlag) {
// AXFLAG is only encodable on hosts with FlagM2, so assert that the host feature is present.
LOGMAN_THROW_A_FMT(CTX->HostFeatures.SupportsFlagM2, "Unsupported flagm2 op");
axflag();
}

ARMEmitter::Condition MapSelectCC(IR::CondClassType Cond) {
switch (Cond.Val) {
case FEXCore::IR::COND_EQ: return ARMEmitter::Condition::CC_EQ;
Expand Down Expand Up @@ -1531,41 +1536,10 @@ DEF_OP(FCmp) {
auto Op = IROp->C<IR::IROp_FCmp>();
const auto EmitSubSize = Op->ElementSize == 8 ? ARMEmitter::ScalarRegSize::i64Bit : ARMEmitter::ScalarRegSize::i32Bit;

ARMEmitter::Register Dst = GetReg(Node);
ARMEmitter::VRegister Scalar1 = GetVReg(Op->Scalar1.ID());
ARMEmitter::VRegister Scalar2 = GetVReg(Op->Scalar2.ID());

fcmp(EmitSubSize, Scalar1, Scalar2);
bool set = false;

if (Op->Flags & (1 << IR::FCMP_FLAG_EQ)) {
LOGMAN_THROW_AA_FMT(IR::FCMP_FLAG_EQ == 0, "IR::FCMP_FLAG_EQ must equal 0");
// EQ or unordered
cset(ARMEmitter::Size::i64Bit, Dst, ARMEmitter::Condition::CC_EQ); // Z = 1
csinc(ARMEmitter::Size::i64Bit, Dst, Dst, ARMEmitter::Reg::zr, ARMEmitter::Condition::CC_VC); // IF !V ? Z : 1
set = true;
}

if (Op->Flags & (1 << IR::FCMP_FLAG_LT)) {
// LT or unordered
cset(ARMEmitter::Size::i64Bit, TMP2, ARMEmitter::Condition::CC_LT);
if (!set) {
lsl(ARMEmitter::Size::i64Bit, Dst, TMP2, IR::FCMP_FLAG_LT);
set = true;
} else {
bfi(ARMEmitter::Size::i64Bit, Dst, TMP2, IR::FCMP_FLAG_LT, 1);
}
}

if (Op->Flags & (1 << IR::FCMP_FLAG_UNORDERED)) {
cset(ARMEmitter::Size::i64Bit, TMP2, ARMEmitter::Condition::CC_VS);
if (!set) {
lsl(ARMEmitter::Size::i64Bit, Dst, TMP2, IR::FCMP_FLAG_UNORDERED);
set = true;
} else {
bfi(ARMEmitter::Size::i64Bit, Dst, TMP2, IR::FCMP_FLAG_UNORDERED, 1);
}
}
}

#undef DEF_OP
Expand Down
23 changes: 20 additions & 3 deletions FEXCore/Source/Interface/Core/JIT/Arm64/MemoryOps.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ tags: backend|arm64
$end_info$
*/

#include "FEXCore/Core/X86Enums.h"
#include "Interface/Context/Context.h"
#include "Interface/Core/ArchHelpers/CodeEmitter/Emitter.h"
#include "Interface/Core/ArchHelpers/CodeEmitter/Registers.h"
Expand Down Expand Up @@ -295,7 +296,11 @@ DEF_OP(LoadRegisterSRA) {
const auto OpSize = IROp->Size;

if (Op->Class == IR::GPRClass) {
const auto regId = (Op->Offset - offsetof(Core::CpuStateFrame, State.gregs[0])) / Core::CPUState::GPR_REG_SIZE;
const auto regId =
Op->Offset == offsetof(Core::CpuStateFrame, State.pf_raw) ? (StaticRegisters.size() - 2) :
Op->Offset == offsetof(Core::CpuStateFrame, State.af_raw) ? (StaticRegisters.size() - 1) :
(Op->Offset - offsetof(Core::CpuStateFrame, State.gregs[0])) / Core::CPUState::GPR_REG_SIZE;

const auto regOffs = Op->Offset & 7;

LOGMAN_THROW_A_FMT(regId < StaticRegisters.size(), "out of range regId");
Expand Down Expand Up @@ -473,10 +478,14 @@ DEF_OP(StoreRegisterSRA) {
const auto OpSize = IROp->Size;

if (Op->Class == IR::GPRClass) {
const auto regId = (Op->Offset / Core::CPUState::GPR_REG_SIZE) - 1;
const auto regOffs = Op->Offset & 7;

LOGMAN_THROW_A_FMT(regId < StaticFPRegisters.size(), "out of range regId");
const auto regId =
Op->Offset == offsetof(Core::CpuStateFrame, State.pf_raw) ? (StaticRegisters.size() - 2) :
Op->Offset == offsetof(Core::CpuStateFrame, State.af_raw) ? (StaticRegisters.size() - 1) :
(Op->Offset - offsetof(Core::CpuStateFrame, State.gregs[0])) / Core::CPUState::GPR_REG_SIZE;

LOGMAN_THROW_A_FMT(regId < StaticRegisters.size(), "out of range regId");

const auto reg = StaticRegisters[regId];
const auto Src = GetReg(Op->Value.ID());
Expand Down Expand Up @@ -1047,12 +1056,20 @@ DEF_OP(LoadFlag) {
auto Op = IROp->C<IR::IROp_LoadFlag>();
auto Dst = GetReg(Node);

LOGMAN_THROW_A_FMT(Op->Flag != X86State::RFLAG_PF_RAW_LOC &&
Op->Flag != X86State::RFLAG_AF_RAW_LOC,
"PF/AF must be accessed as registers");

ldrb(Dst, STATE, offsetof(FEXCore::Core::CPUState, flags[0]) + Op->Flag);
}

// Backend handler for the StoreFlag IR op: writes one byte from the value's
// register into the flags array of the CPU state, at index Op->Flag.
DEF_OP(StoreFlag) {
  const auto StoreOp = IROp->C<IR::IROp_StoreFlag>();

  // PF and AF are not stored through this path; reject either flag here.
  LOGMAN_THROW_A_FMT(!(StoreOp->Flag == X86State::RFLAG_PF_RAW_LOC ||
                       StoreOp->Flag == X86State::RFLAG_AF_RAW_LOC),
                     "PF/AF must be accessed as registers");

  // Byte store relative to STATE at the slot for this flag.
  const auto Value = GetReg(StoreOp->Value.ID());
  const auto FlagOffset = offsetof(FEXCore::Core::CPUState, flags[0]) + StoreOp->Flag;
  strb(Value, STATE, FlagOffset);
}

Expand Down

0 comments on commit 9b64674

Please sign in to comment.