Commit

Merge pull request #2003 from lioncash/svespill
JITs: Handle spilling/filling 256-bit vectors
Sonicadvance1 committed Sep 24, 2022
2 parents 8f70137 + 707db51 commit 6742e0c
Showing 6 changed files with 90 additions and 42 deletions.
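The diff handles the case where guest vector registers are 256 bits wide: the old spill/fill code only stored and reloaded the 128-bit Q view of each host vector register, so the upper half of a 256-bit value never reached the CpuStateFrame. With this change the AArch64 JIT uses predicated SVE stores and loads, and the x86-64 JIT uses 32-byte vmovups. A minimal sketch of the AArch64 spill, using the vixl calls and the STATE/TMP4/PRED_TMP_32B names that appear in the diff (Reg and i stand for an entry of the SRAFPR array, as in SpillStaticRegs):

// Before: only the low 128 bits (the Q view) of the vector were written out.
str(Reg.Q(), MemOperand(STATE, offsetof(FEXCore::Core::CpuStateFrame, State.xmm.avx.data[i][0])));

// After: the slot's byte offset is materialized in TMP4 and used as the index
// register of an SVE ST1B predicated to the first 32 bytes, so the full
// 256-bit Z register reaches memory.
mov(TMP4, offsetof(Core::CpuStateFrame, State.xmm.avx.data[i][0]));
st1b(Reg.Z().VnB(), PRED_TMP_32B, SVEMemOperand(STATE, TMP4));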
63 changes: 47 additions & 16 deletions External/FEXCore/Source/Interface/Core/ArchHelpers/Arm64Emitter.cpp
@@ -217,7 +217,8 @@ void Arm64Emitter::SpillStaticRegs(bool FPRs, uint32_t GPRSpillMask, uint32_t FP
const auto Reg = SRAFPR[i];

if (((1U << Reg.GetCode()) & FPRSpillMask) != 0) {
str(Reg.Q(), MemOperand(STATE, offsetof(FEXCore::Core::CpuStateFrame, State.xmm.avx.data[i][0])));
mov(TMP4, offsetof(Core::CpuStateFrame, State.xmm.avx.data[i][0]));
st1b(Reg.Z().VnB(), PRED_TMP_32B, SVEMemOperand(STATE, TMP4));
}
}
} else {
@@ -260,11 +261,19 @@ void Arm64Emitter::FillStaticRegs(bool FPRs, uint32_t GPRFillMask, uint32_t FPRF

if (FPRs) {
if (EmitterCTX->HostFeatures.SupportsAVX) {
// Set up predicate registers.
// We don't bother spilling these in SpillStaticRegs,
// since all that matters is we restore them on a fill.
// It's not a concern if they get trounced by something else.
ptrue(PRED_TMP_16B.VnB(), SVE_VL16);
ptrue(PRED_TMP_32B.VnB(), SVE_VL32);

for (size_t i = 0; i < SRAFPR.size(); i++) {
const auto Reg = SRAFPR[i];

if (((1U << Reg.GetCode()) & FPRFillMask) != 0) {
ldr(Reg.Q(), MemOperand(STATE, offsetof(FEXCore::Core::CpuStateFrame, State.xmm.avx.data[i][0])));
mov(TMP4, offsetof(Core::CpuStateFrame, State.xmm.avx.data[i][0]));
ld1b(Reg.Z().VnB(), PRED_TMP_32B.Zeroing(), SVEMemOperand(STATE, TMP4));
}
}
} else {
@@ -289,20 +298,31 @@ }
}
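The comment in FillStaticRegs above notes that the predicate temporaries are simply rebuilt on every fill rather than being saved and restored. A short sketch of what that setup produces and how one 256-bit fill consumes it, assuming the same vixl calls as the diff (Reg and i again stand for the SRAFPR entry being restored):

// ptrue with an explicit VL pattern makes exactly that many lanes active; with
// byte-sized lanes this marks the low 16 or low 32 bytes of a vector register.
ptrue(PRED_TMP_16B.VnB(), SVE_VL16);   // p6: first 16 byte lanes active
ptrue(PRED_TMP_32B.VnB(), SVE_VL32);   // p7: first 32 byte lanes active

// A zeroing, predicated LD1B then reloads the full 32-byte guest vector:
// active lanes are read from memory, inactive lanes are cleared.
mov(TMP4, offsetof(Core::CpuStateFrame, State.xmm.avx.data[i][0]));
ld1b(Reg.Z().VnB(), PRED_TMP_32B.Zeroing(), SVEMemOperand(STATE, TMP4));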

void Arm64Emitter::PushDynamicRegsAndLR() {
uint64_t SPOffset = AlignUp((RA64.size() + 1) * 8 + RAFPR.size() * 16, 16);
const auto CanUseSVE = EmitterCTX->HostFeatures.SupportsAVX;
const auto GPRSize = (RA64.size() + 1) * Core::CPUState::GPR_REG_SIZE;
const auto FPRRegSize = CanUseSVE ? Core::CPUState::XMM_AVX_REG_SIZE
: Core::CPUState::XMM_SSE_REG_SIZE;
const auto FPRSize = RAFPR.size() * FPRRegSize;
const uint64_t SPOffset = AlignUp(GPRSize + FPRSize, 16);

sub(sp, sp, SPOffset);
int i = 0;

for (auto RA : RAFPR)
{
str(RA.Q(), MemOperand(sp, i * 8));
i+=2;
if (CanUseSVE) {
for (const auto& RA : RAFPR) {
mov(TMP4, i * 8);
st1b(RA.Z().VnB(), PRED_TMP_32B, SVEMemOperand(sp, TMP4));
i += 4;
}
} else {
for (const auto& RA : RAFPR) {
str(RA.Q(), MemOperand(sp, i * 8));
i += 2;
}
}

#if 0 // All GPRs should be caller saved
for (auto RA : RA64)
{
for (const auto& RA : RA64) {
str(RA, MemOperand(sp, i * 8));
i++;
}
@@ -312,18 +332,29 @@ void Arm64Emitter::PushDynamicRegsAndLR() {
}

void Arm64Emitter::PopDynamicRegsAndLR() {
uint64_t SPOffset = AlignUp((RA64.size() + 1) * 8 + RAFPR.size() * 16, 16);
const auto CanUseSVE = EmitterCTX->HostFeatures.SupportsAVX;
const auto GPRSize = (RA64.size() + 1) * Core::CPUState::GPR_REG_SIZE;
const auto FPRRegSize = CanUseSVE ? Core::CPUState::XMM_AVX_REG_SIZE
: Core::CPUState::XMM_SSE_REG_SIZE;
const auto FPRSize = RAFPR.size() * FPRRegSize;
const uint64_t SPOffset = AlignUp(GPRSize + FPRSize, 16);
int i = 0;

for (auto RA : RAFPR)
{
ldr(RA.Q(), MemOperand(sp, i * 8));
i+=2;
if (CanUseSVE) {
for (const auto& RA : RAFPR) {
mov(TMP4, i * 8);
ld1b(RA.Z().VnB(), PRED_TMP_32B.Zeroing(), SVEMemOperand(sp, TMP4));
i += 4;
}
} else {
for (const auto& RA : RAFPR) {
ldr(RA.Q(), MemOperand(sp, i * 8));
i += 2;
}
}

#if 0 // All GPRs should be caller saved
for (auto RA : RA64)
{
for (const auto& RA : RA64) {
ldr(RA, MemOperand(sp, i * 8));
i++;
}
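PushDynamicRegsAndLR and PopDynamicRegsAndLR above now derive the stack frame size from the AVX or SSE register width. A standalone recomputation of that formula, assuming GPR_REG_SIZE is 8 bytes and XMM_SSE_REG_SIZE/XMM_AVX_REG_SIZE are 16 and 32 bytes; AlignUp here is a local stand-in for the helper used in the diff, the FPR count of 12 comes from the RAFPR array declared in the header below, and RA64.size() is not shown in this diff, so it stays a parameter:

#include <cstddef>

constexpr std::size_t AlignUp(std::size_t Value, std::size_t Alignment) {
  return (Value + Alignment - 1) & ~(Alignment - 1);
}

constexpr std::size_t FrameSize(std::size_t NumGPRs, std::size_t NumFPRs, bool SupportsAVX) {
  const std::size_t GPRSize = (NumGPRs + 1) * 8;          // +1 slot for LR
  const std::size_t FPRRegSize = SupportsAVX ? 32 : 16;   // bytes per vector register
  return AlignUp(GPRSize + NumFPRs * FPRRegSize, 16);
}

// With 12 allocatable FPRs the vector area grows from 192 to 384 bytes once
// SVE spilling is in use, which is why the store loops advance their 8-byte
// index by 4 instead of 2 per register.
static_assert(FrameSize(0, 12, false) == 208, "12 x 16-byte slots plus LR, aligned");
static_assert(FrameSize(0, 12, true) == 400, "12 x 32-byte slots plus LR, aligned");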
@@ -65,7 +65,7 @@ const std::array<aarch64::VRegister, 12> RAFPR = {
// Contains the address to the currently available CPU state
#define STATE x28

// GPR temporaries (only x2 and x3 can be used across spill boundaries)
// GPR temporaries. Only x3 can be used across spill boundaries
// so if these ever need to change, be very careful about that.
#define TMP1 x0
#define TMP2 x1
@@ -77,6 +77,12 @@ const std::array<aarch64::VRegister, 12> RAFPR = {
#define VTMP2 v2
#define VTMP3 v3

// Predicate register temporaries (used when AVX support is enabled)
// PRED_TMP_16B is a predicate register with only the first 16 bytes set to 1.
// PRED_TMP_32B is a predicate register with only the first 32 bytes set to 1.
#define PRED_TMP_16B p6
#define PRED_TMP_32B p7

// This class contains common emitter utility functions that can
// be used by both Arm64 JIT and ARM64 Dispatcher
class Arm64Emitter : public vixl::aarch64::Assembler {
@@ -86,6 +92,10 @@ class Arm64Emitter : public vixl::aarch64::Assembler {
FEXCore::Context::Context *EmitterCTX;
vixl::aarch64::CPU CPU;
void LoadConstant(vixl::aarch64::Register Reg, uint64_t Constant, bool NOPPad = false);

// NOTE: These functions WILL clobber the register TMP4 if AVX support is enabled
// and FPRs are being spilled or filled. If only GPRs are spilled/filled, then
// TMP4 is left alone.
void SpillStaticRegs(bool FPRs = true, uint32_t GPRSpillMask = ~0U, uint32_t FPRSpillMask = ~0U);
void FillStaticRegs(bool FPRs = true, uint32_t GPRFillMask = ~0U, uint32_t FPRFillMask = ~0U);
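A small illustration of the contract described in the note, with the parameter meaning taken from the declarations above: the default call includes FPRs and may therefore clobber TMP4 on AVX-capable hosts, while passing false restricts the helpers to GPRs and leaves TMP4 untouched.

SpillStaticRegs();        // GPRs + FPRs: TMP4 may be clobbered when AVX is enabled
FillStaticRegs();

SpillStaticRegs(false);   // GPRs only: TMP4 is left alone
FillStaticRegs(false);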

@@ -38,7 +38,7 @@ namespace FEXCore::CPU {
using namespace vixl;
using namespace vixl::aarch64;

constexpr size_t MAX_DISPATCHER_CODE_SIZE = 4096;
constexpr size_t MAX_DISPATCHER_CODE_SIZE = 8192;

Arm64Dispatcher::Arm64Dispatcher(FEXCore::Context::Context *ctx, const DispatcherConfig &config)
: FEXCore::CPU::Dispatcher(ctx, config), Arm64Emitter(ctx, MAX_DISPATCHER_CODE_SIZE)
@@ -437,10 +437,9 @@ Arm64Dispatcher::Arm64Dispatcher(FEXCore::Context::Context *ctx, const Dispatche
LUDIVHandlerAddress = GetCursorAddress<uint64_t>();

PushDynamicRegsAndLR();
SpillStaticRegs();

ldr(x3, STATE_PTR(CpuStateFrame, Pointers.AArch64.LUDIV));

SpillStaticRegs();
#ifdef VIXL_SIMULATOR
GenerateIndirectRuntimeCall<uint64_t, uint64_t, uint64_t, uint64_t>(x3);
#else
@@ -460,10 +459,9 @@ Arm64Dispatcher::Arm64Dispatcher(FEXCore::Context::Context *ctx, const Dispatche
LDIVHandlerAddress = GetCursorAddress<uint64_t>();

PushDynamicRegsAndLR();
SpillStaticRegs();

ldr(x3, STATE_PTR(CpuStateFrame, Pointers.AArch64.LDIV));

SpillStaticRegs();
#ifdef VIXL_SIMULATOR
GenerateIndirectRuntimeCall<uint64_t, uint64_t, uint64_t, uint64_t>(x3);
#else
@@ -483,10 +481,9 @@ Arm64Dispatcher::Arm64Dispatcher(FEXCore::Context::Context *ctx, const Dispatche
LUREMHandlerAddress = GetCursorAddress<uint64_t>();

PushDynamicRegsAndLR();
SpillStaticRegs();

ldr(x3, STATE_PTR(CpuStateFrame, Pointers.AArch64.LUREM));

SpillStaticRegs();
#ifdef VIXL_SIMULATOR
GenerateIndirectRuntimeCall<uint64_t, uint64_t, uint64_t, uint64_t>(x3);
#else
@@ -506,10 +503,9 @@ Arm64Dispatcher::Arm64Dispatcher(FEXCore::Context::Context *ctx, const Dispatche
LREMHandlerAddress = GetCursorAddress<uint64_t>();

PushDynamicRegsAndLR();
SpillStaticRegs();

ldr(x3, STATE_PTR(CpuStateFrame, Pointers.AArch64.LREM));

SpillStaticRegs();
#ifdef VIXL_SIMULATOR
GenerateIndirectRuntimeCall<uint64_t, uint64_t, uint64_t, uint64_t>(x3);
#else
@@ -471,6 +471,7 @@ DEF_OP(CPUID) {
auto Op = IROp->C<IR::IROp_CPUID>();

PushDynamicRegsAndLR();
SpillStaticRegs();

// x0 = CPUID Handler
// x1 = CPUID Function
@@ -479,14 +480,13 @@
ldr(x3, MemOperand(STATE, offsetof(FEXCore::Core::CpuStateFrame, Pointers.Common.CPUIDFunction)));
mov(x1, GetReg<RA_64>(Op->Function.ID()));
mov(x2, GetReg<RA_64>(Op->Leaf.ID()));
SpillStaticRegs();
#ifdef VIXL_SIMULATOR
GenerateIndirectRuntimeCall<__uint128_t, void*, uint64_t, uint64_t>(x3);
#else
blr(x3);
#endif
FillStaticRegs();

FillStaticRegs();
PopDynamicRegsAndLR();

// Results are in x0, x1
5 changes: 3 additions & 2 deletions External/FEXCore/Source/Interface/Core/JIT/Arm64/MiscOps.cpp
@@ -137,6 +137,7 @@ DEF_OP(Print) {
auto Op = IROp->C<IR::IROp_Print>();

PushDynamicRegsAndLR();
SpillStaticRegs();

if (IsGPR(Op->Value.ID())) {
mov(x0, GetReg<RA_64>(Op->Value.ID()));
@@ -148,10 +149,10 @@ DEF_OP(Print) {
fmov(x1, GetSrc(Op->Value.ID()).V1D(), 1);
ldr(x3, MemOperand(STATE, offsetof(FEXCore::Core::CpuStateFrame, Pointers.Common.PrintVectorValue)));
}
SpillStaticRegs();

blr(x3);
FillStaticRegs();

FillStaticRegs();
PopDynamicRegsAndLR();
}

34 changes: 22 additions & 12 deletions External/FEXCore/Source/Interface/Core/JIT/x86_64/JIT.cpp
@@ -60,32 +60,42 @@ static void PrintVectorValue(uint64_t Value, uint64_t ValueUpper) {
namespace FEXCore::CPU {

void X86JITCore::PushRegs() {
sub(rsp, 16 * RAXMM_x.size());
const auto AVXRegSize = Core::CPUState::XMM_AVX_REG_SIZE;

sub(rsp, AVXRegSize * RAXMM_x.size());
for (size_t i = 0; i < RAXMM_x.size(); ++i) {
movaps(ptr[rsp + i * 16], RAXMM_x[i]);
vmovups(ptr[rsp + i * AVXRegSize], ToYMM(RAXMM_x[i]));
}

for (auto &Reg : RA64)
for (const auto &Reg : RA64) {
push(Reg);
}

auto NumPush = RA64.size();
if (NumPush & 1)
sub(rsp, 8); // Align
const auto NumPush = RA64.size();
if ((NumPush & 1) != 0) {
// Align
sub(rsp, 8);
}
}

void X86JITCore::PopRegs() {
auto NumPush = RA64.size();
const auto AVXRegSize = Core::CPUState::XMM_AVX_REG_SIZE;
const auto NumPush = RA64.size();

if (NumPush & 1)
add(rsp, 8); // Align
for (uint32_t i = RA64.size(); i > 0; --i)
if ((NumPush & 1) != 0) {
// Align
add(rsp, 8);
}

for (uint32_t i = RA64.size(); i > 0; --i) {
pop(RA64[i - 1]);
}

for (size_t i = 0; i < RAXMM_x.size(); ++i) {
movaps(RAXMM_x[i], ptr[rsp + i * 16]);
vmovups(ToYMM(RAXMM_x[i]), ptr[rsp + i * AVXRegSize]);
}

add(rsp, 16 * RAXMM_x.size());
add(rsp, AVXRegSize * RAXMM_x.size());
}
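The x86-64 JIT gets the same widening through Xbyak: the spill area is sized at XMM_AVX_REG_SIZE (32) bytes per allocatable vector register and the full YMM form is stored and reloaded. A condensed sketch of the new save loop follows; the remark on alignment is an inference, not something stated in the commit.

// vmovups is the unaligned form: after sub(rsp, 32 * N) the slots are not
// guaranteed to be 32-byte aligned, so an aligned 32-byte store could fault,
// while the old 16-byte movaps only required 16-byte alignment.
sub(rsp, 32 * RAXMM_x.size());
for (size_t i = 0; i < RAXMM_x.size(); ++i) {
  vmovups(ptr[rsp + i * 32], ToYMM(RAXMM_x[i]));   // full 256-bit YMM register
}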

void X86JITCore::Op_Unhandled(IR::IROp_Header *IROp, IR::NodeID Node) {
