Merge pull request #3330 from Sonicadvance1/optimize_sib_addr_calc
OpcodeDispatcher: Optimize SIB addr calculation
lioncash committed Dec 19, 2023
2 parents 5b4e9c6 + f2da70c commit 81c85d7
Showing 5 changed files with 199 additions and 73 deletions.
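
For context: an x86 SIB (scale-index-base) operand addresses memory as base + index * scale + displacement, with the scale restricted to 1, 2, 4, or 8. Because the scale is always a power of two, index * scale equals index << log2(scale), and ARM64 can fold that shift into the add itself via its ADD (shifted register) form. The sketch below is a hypothetical reference model of the address math this commit targets, not FEXCore code; ComputeSIBAddress and its parameters are made up for illustration.

#include <cstdint>

// Hypothetical reference model: base + index * scale + disp, with the scale
// folded into a left shift so the whole sum maps onto a single
// "add Xd, Xbase, Xindex, lsl #shift" on ARM64.
static uint64_t ComputeSIBAddress(uint64_t Base, uint64_t Index, uint32_t Scale, int32_t Displacement) {
  // Scale is 1, 2, 4, or 8, so counting trailing zeros gives log2(Scale).
  const unsigned Shift = static_cast<unsigned>(__builtin_ctz(Scale));
  return Base + (Index << Shift) + static_cast<uint64_t>(static_cast<int64_t>(Displacement));
}
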
10 changes: 10 additions & 0 deletions FEXCore/Source/Interface/Core/JIT/Arm64/ALUOps.cpp
@@ -85,6 +85,16 @@ DEF_OP(Add) {
}
}

DEF_OP(AddShift) {
auto Op = IROp->C<IR::IROp_AddShift>();
const uint8_t OpSize = IROp->Size;

LOGMAN_THROW_AA_FMT(OpSize == 4 || OpSize == 8, "Unsupported {} size: {}", __func__, OpSize);
const auto EmitSize = OpSize == 8 ? ARMEmitter::Size::i64Bit : ARMEmitter::Size::i32Bit;

add(EmitSize, GetReg(Node), GetReg(Op->Src1.ID()), GetReg(Op->Src2.ID()), ConvertIRShiftType(Op->Shift), Op->ShiftAmount);
}

DEF_OP(AddNZCV) {
auto Op = IROp->C<IR::IROp_AddNZCV>();
const auto OpSize = IROp->Size;
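The AddShift op added here lowers directly onto ARM64's ADD (shifted register) instruction, so the shift of the second source operand comes for free. As a rough, assumed reference model of what the emitted instruction computes (not FEXCore code):

#include <cstdint>

// Assumed semantics of "add Xd, Xn, Xm, lsl #Amount" for the 64-bit case.
static uint64_t AddShiftLSL64(uint64_t Src1, uint64_t Src2, uint8_t ShiftAmount) {
  return Src1 + (Src2 << ShiftAmount);
}

// The 32-bit form operates on W registers, so both the shifted operand and
// the sum are truncated to 32 bits.
static uint32_t AddShiftLSL32(uint32_t Src1, uint32_t Src2, uint8_t ShiftAmount) {
  return Src1 + (Src2 << ShiftAmount);
}
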
99 changes: 57 additions & 42 deletions FEXCore/Source/Interface/Core/OpcodeDispatcher.cpp
@@ -4763,39 +4763,46 @@ OrderedNode *OpDispatchBuilder::LoadSource_WithOpSize(RegisterClassType Class, X
const bool IsVSIB = (Op->Flags & X86Tables::DecodeFlags::FLAG_VSIB_BYTE) != 0;
OrderedNode *Tmp{};

// NOTE: VSIB cannot have the index * scale portion calculated ahead of time,
// since the index in this case is a vector. So, we can't just apply the scale
// to it, since this needs to be applied to each element in the index register
// after said element has been sign extended. So, we pass this through for the
// instruction implementation to handle.
//
// What we do handle though, is applying the displacement value to
// the base register (if a base register is provided), since this is a
// part of the address calculation that can be done ahead of time.
if (Operand.Data.SIB.Index != FEXCore::X86State::REG_INVALID && !IsVSIB) {
Tmp = LoadGPRRegister(Operand.Data.SIB.Index, GPRSize);

if (Operand.Data.SIB.Scale != 1) {
auto Constant = _Constant(GPRSize * 8, Operand.Data.SIB.Scale);
Tmp = _Mul(IR::SizeToOpSize(GPRSize), Tmp, Constant);
}
if (Operand.Data.SIB.Index == FEXCore::X86State::REG_RSP && AccessType == MemoryAccessType::DEFAULT) {
AccessType = MemoryAccessType::NONTSO;
}
if (!IsVSIB && Operand.Data.SIB.Index != FEXCore::X86State::REG_INVALID && Operand.Data.SIB.Base != FEXCore::X86State::REG_INVALID) {
auto Base = LoadGPRRegister(Operand.Data.SIB.Base, GPRSize);
auto Index = LoadGPRRegister(Operand.Data.SIB.Index, GPRSize);
Tmp = _AddShift(IR::SizeToOpSize(GPRSize), Base, Index, ShiftType::LSL, FEXCore::ilog2(Operand.Data.SIB.Scale));
}
else {
// NOTE: VSIB cannot have the index * scale portion calculated ahead of time,
// since the index in this case is a vector. So, we can't just apply the scale
// to it, since this needs to be applied to each element in the index register
// after said element has been sign extended. So, we pass this through for the
// instruction implementation to handle.
//
// What we do handle though, is applying the displacement value to
// the base register (if a base register is provided), since this is a
// part of the address calculation that can be done ahead of time.
if (Operand.Data.SIB.Index != FEXCore::X86State::REG_INVALID && !IsVSIB) {
Tmp = LoadGPRRegister(Operand.Data.SIB.Index, GPRSize);

if (Operand.Data.SIB.Scale != 1) {
auto Constant = _Constant(GPRSize * 8, Operand.Data.SIB.Scale);
Tmp = _Mul(IR::SizeToOpSize(GPRSize), Tmp, Constant);
}
if (Operand.Data.SIB.Index == FEXCore::X86State::REG_RSP && AccessType == MemoryAccessType::DEFAULT) {
AccessType = MemoryAccessType::NONTSO;
}
}

if (Operand.Data.SIB.Base != FEXCore::X86State::REG_INVALID) {
auto GPR = LoadGPRRegister(Operand.Data.SIB.Base, GPRSize);
if (Operand.Data.SIB.Base != FEXCore::X86State::REG_INVALID) {
auto GPR = LoadGPRRegister(Operand.Data.SIB.Base, GPRSize);

if (Tmp != nullptr) {
Tmp = _Add(IR::SizeToOpSize(GPRSize), Tmp, GPR);
}
else {
Tmp = GPR;
}
if (Tmp != nullptr) {
Tmp = _Add(IR::SizeToOpSize(GPRSize), Tmp, GPR);
}
else {
Tmp = GPR;
}

if (Operand.Data.SIB.Base == FEXCore::X86State::REG_RSP && AccessType == MemoryAccessType::DEFAULT) {
AccessType = MemoryAccessType::NONTSO;
if (Operand.Data.SIB.Base == FEXCore::X86State::REG_RSP && AccessType == MemoryAccessType::DEFAULT) {
AccessType = MemoryAccessType::NONTSO;
}
}
}

@@ -5009,23 +5016,31 @@ void OpDispatchBuilder::StoreResult_WithOpSize(FEXCore::IR::RegisterClassType Cl
}
else if (Operand.IsSIB()) {
OrderedNode *Tmp {};
if (Operand.Data.SIB.Index != FEXCore::X86State::REG_INVALID) {
Tmp = LoadGPRRegister(Operand.Data.SIB.Index, GPRSize);

if (Operand.Data.SIB.Scale != 1) {
auto Constant = _Constant(GPRSize * 8, Operand.Data.SIB.Scale);
Tmp = _Mul(IR::SizeToOpSize(GPRSize), Tmp, Constant);
}
if (Operand.Data.SIB.Index != FEXCore::X86State::REG_INVALID && Operand.Data.SIB.Base != FEXCore::X86State::REG_INVALID) {
auto Base = LoadGPRRegister(Operand.Data.SIB.Base, GPRSize);
auto Index = LoadGPRRegister(Operand.Data.SIB.Index, GPRSize);
Tmp = _AddShift(IR::SizeToOpSize(GPRSize), Base, Index, ShiftType::LSL, FEXCore::ilog2(Operand.Data.SIB.Scale));
}
else {
if (Operand.Data.SIB.Index != FEXCore::X86State::REG_INVALID) {
Tmp = LoadGPRRegister(Operand.Data.SIB.Index, GPRSize);

if (Operand.Data.SIB.Base != FEXCore::X86State::REG_INVALID) {
auto GPR = LoadGPRRegister(Operand.Data.SIB.Base, GPRSize);

if (Tmp != nullptr) {
Tmp = _Add(IR::SizeToOpSize(GPRSize), Tmp, GPR);
if (Operand.Data.SIB.Scale != 1) {
auto Constant = _Constant(GPRSize * 8, Operand.Data.SIB.Scale);
Tmp = _Mul(IR::SizeToOpSize(GPRSize), Tmp, Constant);
}
}
else {
Tmp = GPR;

if (Operand.Data.SIB.Base != FEXCore::X86State::REG_INVALID) {
auto GPR = LoadGPRRegister(Operand.Data.SIB.Base, GPRSize);

if (Tmp != nullptr) {
Tmp = _Add(IR::SizeToOpSize(GPRSize), Tmp, GPR);
}
else {
Tmp = GPR;
}
}
}

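In both call sites above, the _AddShift fast path is only taken when the operand is not VSIB and both a base and an index register are present; otherwise the previous Mul/Add sequence is kept. The scale is converted to a shift amount with FEXCore::ilog2, so scale 1 becomes a plain register-register add and scales 2, 4, and 8 become lsl #1, #2, and #3 on the index operand. A minimal sketch of the assumed scale-to-shift mapping (the real ilog2 helper lives elsewhere in FEXCore; this local version is only for illustration):

#include <cstdint>

// Local stand-in for the assumed behaviour of FEXCore::ilog2 on SIB scales.
static constexpr uint32_t ilog2_sketch(uint32_t Value) {
  uint32_t Result = 0;
  while (Value >>= 1) {
    ++Result;
  }
  return Result;
}

static_assert(ilog2_sketch(1) == 0, "scale 1 -> plain add, no shift");
static_assert(ilog2_sketch(2) == 1, "scale 2 -> lsl #1");
static_assert(ilog2_sketch(4) == 2, "scale 4 -> lsl #2");
static_assert(ilog2_sketch(8) == 3, "scale 8 -> lsl #3");
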
10 changes: 10 additions & 0 deletions FEXCore/Source/Interface/IR/IR.json
@@ -953,6 +953,16 @@
"Size == FEXCore::IR::OpSize::i32Bit || Size == FEXCore::IR::OpSize::i64Bit"
]
},
"GPR = AddShift OpSize:#Size, GPR:$Src1, GPR:$Src2, ShiftType:$Shift{ShiftType::LSL}, u8:$ShiftAmount{0}": {
"Desc": [ "Integer Add with shifted register",
"Will truncate to 64 or 32bits"
],
"DestSize": "Size",
"EmitValidation": [
"Size == FEXCore::IR::OpSize::i32Bit || Size == FEXCore::IR::OpSize::i64Bit",
"_Shift != ShiftType::ROR"
]
},
"AddNZCV OpSize:#Size, GPR:$Src1, GPR:$Src2": {
"Desc": ["Set NZCV for the sum of two GPRs"],
"HasSideEffects": true,
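The EmitValidation entries restrict AddShift to 32-bit and 64-bit results and forbid ROR, which matches the ARM64 ADD (shifted register) encoding: it accepts LSL, LSR, and ASR shifts but has no ROR form. A minimal restatement of those constraints, using local stand-in enums rather than the real FEXCore types:

#include <cstdint>

// Illustrative only; the real checks are generated from IR.json inside
// FEXCore, and these enums are stand-ins, not the actual definitions.
enum class OpSizeSketch : uint8_t { i32Bit = 4, i64Bit = 8 };
enum class ShiftTypeSketch : uint8_t { LSL, LSR, ASR, ROR };

static bool IsValidAddShift(OpSizeSketch Size, ShiftTypeSketch Shift) {
  const bool ValidSize = Size == OpSizeSketch::i32Bit || Size == OpSizeSketch::i64Bit;
  // ARM64's ADD (shifted register) has no ROR variant.
  const bool ValidShift = Shift != ShiftTypeSketch::ROR;
  return ValidSize && ValidShift;
}
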
4 changes: 2 additions & 2 deletions unittests/InstructionCountCI/FlagM/HotBlocks_32Bit.json
@@ -27,9 +27,9 @@
"cmp esi, ebx"
],
"ExpectedArm64ASM": [
"add w20, w5, w10",
"add w20, w10, w5",
"ldrb w6, [x20]",
"add w20, w11, w10",
"add w20, w10, w11",
"ldrb w5, [x20]",
"orr w20, w6, #0xffff0000",
"mov w6, w20",
149 changes: 120 additions & 29 deletions unittests/InstructionCountCI/Primary.json
@@ -2476,15 +2476,15 @@
"ExpectedInstructionCount": 2,
"Comment": "0x8d",
"ExpectedArm64ASM": [
"add x20, x5, x7",
"add x20, x7, x5",
"bfxil x4, x20, #0, #16"
]
},
"lea eax, [rbx+rcx*1 + 0]": {
"ExpectedInstructionCount": 3,
"Comment": "0x8d",
"ExpectedArm64ASM": [
"add x20, x5, x7",
"add x20, x7, x5",
"mov x20, x20",
"mov w4, w20"
]
@@ -2493,88 +2493,179 @@
"ExpectedInstructionCount": 1,
"Comment": "0x8d",
"ExpectedArm64ASM": [
"add x4, x5, x7"
"add x4, x7, x5"
]
},
"lea ax, [rbx+rcx*2 + 0]": {
"ExpectedInstructionCount": 3,
"ExpectedInstructionCount": 2,
"Comment": "0x8d",
"ExpectedArm64ASM": [
"lsl x20, x5, #1",
"add x20, x20, x7",
"add x20, x7, x5, lsl #1",
"bfxil x4, x20, #0, #16"
]
},
"lea eax, [rbx+rcx*2 + 0]": {
"ExpectedInstructionCount": 4,
"ExpectedInstructionCount": 3,
"Comment": "0x8d",
"ExpectedArm64ASM": [
"lsl x20, x5, #1",
"add x20, x20, x7",
"add x20, x7, x5, lsl #1",
"mov x20, x20",
"mov w4, w20"
]
},
"lea rax, [rbx+rcx*2 + 0]": {
"ExpectedInstructionCount": 2,
"ExpectedInstructionCount": 1,
"Comment": "0x8d",
"ExpectedArm64ASM": [
"lsl x20, x5, #1",
"add x4, x20, x7"
"add x4, x7, x5, lsl #1"
]
},
"lea ax, [rbx+rcx*4 + 0]": {
"ExpectedInstructionCount": 3,
"ExpectedInstructionCount": 2,
"Comment": "0x8d",
"ExpectedArm64ASM": [
"lsl x20, x5, #2",
"add x20, x20, x7",
"add x20, x7, x5, lsl #2",
"bfxil x4, x20, #0, #16"
]
},
"lea eax, [rbx+rcx*4 + 0]": {
"ExpectedInstructionCount": 4,
"ExpectedInstructionCount": 3,
"Comment": "0x8d",
"ExpectedArm64ASM": [
"lsl x20, x5, #2",
"add x20, x20, x7",
"add x20, x7, x5, lsl #2",
"mov x20, x20",
"mov w4, w20"
]
},
"lea rax, [rbx+rcx*4 + 0]": {
"ExpectedInstructionCount": 2,
"ExpectedInstructionCount": 1,
"Comment": "0x8d",
"ExpectedArm64ASM": [
"lsl x20, x5, #2",
"add x4, x20, x7"
"add x4, x7, x5, lsl #2"
]
},
"lea ax, [rbx+rcx*8 + 0]": {
"ExpectedInstructionCount": 3,
"ExpectedInstructionCount": 2,
"Comment": "0x8d",
"ExpectedArm64ASM": [
"lsl x20, x5, #3",
"add x20, x20, x7",
"add x20, x7, x5, lsl #3",
"bfxil x4, x20, #0, #16"
]
},
"lea eax, [rbx+rcx*8 + 0]": {
"ExpectedInstructionCount": 4,
"ExpectedInstructionCount": 3,
"Comment": "0x8d",
"ExpectedArm64ASM": [
"lsl x20, x5, #3",
"add x20, x20, x7",
"add x20, x7, x5, lsl #3",
"mov x20, x20",
"mov w4, w20"
]
},
"lea rax, [rbx+rcx*8 + 0]": {
"ExpectedInstructionCount": 1,
"Comment": "0x8d",
"ExpectedArm64ASM": [
"add x4, x7, x5, lsl #3"
]
},
"lea ax, [ebx+ecx*1 + 0]": {
"ExpectedInstructionCount": 3,
"Comment": "0x8d",
"ExpectedArm64ASM": [
"add x20, x7, x5",
"mov w20, w20",
"bfxil x4, x20, #0, #16"
]
},
"lea eax, [ebx+ecx*1 + 0]": {
"ExpectedInstructionCount": 2,
"Comment": "0x8d",
"ExpectedArm64ASM": [
"add x20, x7, x5",
"mov w4, w20"
]
},
"lea rax, [ebx+ecx*1 + 0]": {
"ExpectedInstructionCount": 2,
"Comment": "0x8d",
"ExpectedArm64ASM": [
"add x20, x7, x5",
"mov w4, w20"
]
},
"lea ax, [ebx+ecx*2 + 0]": {
"ExpectedInstructionCount": 3,
"Comment": "0x8d",
"ExpectedArm64ASM": [
"add x20, x7, x5, lsl #1",
"mov w20, w20",
"bfxil x4, x20, #0, #16"
]
},
"lea eax, [ebx+ecx*2 + 0]": {
"ExpectedInstructionCount": 2,
"Comment": "0x8d",
"ExpectedArm64ASM": [
"add x20, x7, x5, lsl #1",
"mov w4, w20"
]
},
"lea rax, [ebx+ecx*2 + 0]": {
"ExpectedInstructionCount": 2,
"Comment": "0x8d",
"ExpectedArm64ASM": [
"add x20, x7, x5, lsl #1",
"mov w4, w20"
]
},
"lea ax, [ebx+ecx*4 + 0]": {
"ExpectedInstructionCount": 3,
"Comment": "0x8d",
"ExpectedArm64ASM": [
"add x20, x7, x5, lsl #2",
"mov w20, w20",
"bfxil x4, x20, #0, #16"
]
},
"lea eax, [ebx+ecx*4 + 0]": {
"ExpectedInstructionCount": 2,
"Comment": "0x8d",
"ExpectedArm64ASM": [
"add x20, x7, x5, lsl #2",
"mov w4, w20"
]
},
"lea rax, [ebx+ecx*4 + 0]": {
"ExpectedInstructionCount": 2,
"Comment": "0x8d",
"ExpectedArm64ASM": [
"add x20, x7, x5, lsl #2",
"mov w4, w20"
]
},
"lea ax, [ebx+ecx*8 + 0]": {
"ExpectedInstructionCount": 3,
"Comment": "0x8d",
"ExpectedArm64ASM": [
"add x20, x7, x5, lsl #3",
"mov w20, w20",
"bfxil x4, x20, #0, #16"
]
},
"lea eax, [ebx+ecx*8 + 0]": {
"ExpectedInstructionCount": 2,
"Comment": "0x8d",
"ExpectedArm64ASM": [
"lsl x20, x5, #3",
"add x4, x20, x7"
"add x20, x7, x5, lsl #3",
"mov w4, w20"
]
},
"lea rax, [ebx+ecx*8 + 0]": {
"ExpectedInstructionCount": 2,
"Comment": "0x8d",
"ExpectedArm64ASM": [
"add x20, x7, x5, lsl #3",
"mov w4, w20"
]
},
"mov cs, ax": {