Merge pull request #3330 from Sonicadvance1/optimize_sib_addr_calc
OpcodeDispatcher: Optimize SIB addr calculation
lioncash committed Dec 19, 2023
2 parents 5b4e9c6 + f2da70c commit 81c85d7
Showing 5 changed files with 199 additions and 73 deletions.
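
For context: an x86 SIB (scale-index-base) operand addresses memory as base + index * scale + displacement, with the scale restricted to 1, 2, 4, or 8. Because the scale is always a power of two, index * scale equals index << log2(scale), and ARM64 can fold that shift into the add itself via its ADD (shifted register) form. The sketch below is a hypothetical reference model of the address math this commit targets, not FEXCore code; ComputeSIBAddress and its parameters are made up for illustration.

#include <cstdint>

// Hypothetical reference model: base + index * scale + disp, with the scale
// folded into a left shift so the whole sum maps onto a single
// "add Xd, Xbase, Xindex, lsl #shift" on ARM64.
static uint64_t ComputeSIBAddress(uint64_t Base, uint64_t Index, uint32_t Scale, int32_t Displacement) {
  // Scale is 1, 2, 4, or 8, so counting trailing zeros gives log2(Scale).
  const unsigned Shift = static_cast<unsigned>(__builtin_ctz(Scale));
  return Base + (Index << Shift) + static_cast<uint64_t>(static_cast<int64_t>(Displacement));
}
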
10 changes: 10 additions & 0 deletions FEXCore/Source/Interface/Core/JIT/Arm64/ALUOps.cpp
@@ -85,6 +85,16 @@ DEF_OP(Add) {
}
}

DEF_OP(AddShift) {
auto Op = IROp->C<IR::IROp_AddShift>();
const uint8_t OpSize = IROp->Size;

LOGMAN_THROW_AA_FMT(OpSize == 4 || OpSize == 8, "Unsupported {} size: {}", __func__, OpSize);
const auto EmitSize = OpSize == 8 ? ARMEmitter::Size::i64Bit : ARMEmitter::Size::i32Bit;

add(EmitSize, GetReg(Node), GetReg(Op->Src1.ID()), GetReg(Op->Src2.ID()), ConvertIRShiftType(Op->Shift), Op->ShiftAmount);
}

DEF_OP(AddNZCV) {
auto Op = IROp->C<IR::IROp_AddNZCV>();
const auto OpSize = IROp->Size;
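The AddShift op added here lowers directly onto ARM64's ADD (shifted register) instruction, so the shift of the second source operand comes for free. As a rough, assumed reference model of what the emitted instruction computes (not FEXCore code):

#include <cstdint>

// Assumed semantics of "add Xd, Xn, Xm, lsl #Amount" for the 64-bit case.
static uint64_t AddShiftLSL64(uint64_t Src1, uint64_t Src2, uint8_t ShiftAmount) {
  return Src1 + (Src2 << ShiftAmount);
}

// The 32-bit form operates on W registers, so both the shifted operand and
// the sum are truncated to 32 bits.
static uint32_t AddShiftLSL32(uint32_t Src1, uint32_t Src2, uint8_t ShiftAmount) {
  return Src1 + (Src2 << ShiftAmount);
}
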
99 changes: 57 additions & 42 deletions FEXCore/Source/Interface/Core/OpcodeDispatcher.cpp
@@ -4763,39 +4763,46 @@ OrderedNode *OpDispatchBuilder::LoadSource_WithOpSize(RegisterClassType Class, X
const bool IsVSIB = (Op->Flags & X86Tables::DecodeFlags::FLAG_VSIB_BYTE) != 0;
OrderedNode *Tmp{};

// NOTE: VSIB cannot have the index * scale portion calculated ahead of time,
// since the index in this case is a vector. So, we can't just apply the scale
// to it, since this needs to be applied to each element in the index register
// after said element has been sign extended. So, we pass this through for the
// instruction implementation to handle.
//
// What we do handle though, is applying the displacement value to
// the base register (if a base register is provided), since this is a
// part of the address calculation that can be done ahead of time.
if (Operand.Data.SIB.Index != FEXCore::X86State::REG_INVALID && !IsVSIB) {
Tmp = LoadGPRRegister(Operand.Data.SIB.Index, GPRSize);

if (Operand.Data.SIB.Scale != 1) {
auto Constant = _Constant(GPRSize * 8, Operand.Data.SIB.Scale);
Tmp = _Mul(IR::SizeToOpSize(GPRSize), Tmp, Constant);
}
if (Operand.Data.SIB.Index == FEXCore::X86State::REG_RSP && AccessType == MemoryAccessType::DEFAULT) {
AccessType = MemoryAccessType::NONTSO;
}
if (!IsVSIB && Operand.Data.SIB.Index != FEXCore::X86State::REG_INVALID && Operand.Data.SIB.Base != FEXCore::X86State::REG_INVALID) {
auto Base = LoadGPRRegister(Operand.Data.SIB.Base, GPRSize);
auto Index = LoadGPRRegister(Operand.Data.SIB.Index, GPRSize);
Tmp = _AddShift(IR::SizeToOpSize(GPRSize), Base, Index, ShiftType::LSL, FEXCore::ilog2(Operand.Data.SIB.Scale));
}
else {
// NOTE: VSIB cannot have the index * scale portion calculated ahead of time,
// since the index in this case is a vector. So, we can't just apply the scale
// to it, since this needs to be applied to each element in the index register
// after said element has been sign extended. So, we pass this through for the
// instruction implementation to handle.
//
// What we do handle though, is applying the displacement value to
// the base register (if a base register is provided), since this is a
// part of the address calculation that can be done ahead of time.
if (Operand.Data.SIB.Index != FEXCore::X86State::REG_INVALID && !IsVSIB) {
Tmp = LoadGPRRegister(Operand.Data.SIB.Index, GPRSize);

if (Operand.Data.SIB.Scale != 1) {
auto Constant = _Constant(GPRSize * 8, Operand.Data.SIB.Scale);
Tmp = _Mul(IR::SizeToOpSize(GPRSize), Tmp, Constant);
}
if (Operand.Data.SIB.Index == FEXCore::X86State::REG_RSP && AccessType == MemoryAccessType::DEFAULT) {
AccessType = MemoryAccessType::NONTSO;
}
}

if (Operand.Data.SIB.Base != FEXCore::X86State::REG_INVALID) {
auto GPR = LoadGPRRegister(Operand.Data.SIB.Base, GPRSize);
if (Operand.Data.SIB.Base != FEXCore::X86State::REG_INVALID) {
auto GPR = LoadGPRRegister(Operand.Data.SIB.Base, GPRSize);

if (Tmp != nullptr) {
Tmp = _Add(IR::SizeToOpSize(GPRSize), Tmp, GPR);
}
else {
Tmp = GPR;
}
if (Tmp != nullptr) {
Tmp = _Add(IR::SizeToOpSize(GPRSize), Tmp, GPR);
}
else {
Tmp = GPR;
}

if (Operand.Data.SIB.Base == FEXCore::X86State::REG_RSP && AccessType == MemoryAccessType::DEFAULT) {
AccessType = MemoryAccessType::NONTSO;
if (Operand.Data.SIB.Base == FEXCore::X86State::REG_RSP && AccessType == MemoryAccessType::DEFAULT) {
AccessType = MemoryAccessType::NONTSO;
}
}
}

@@ -5009,23 +5016,31 @@ void OpDispatchBuilder::StoreResult_WithOpSize(FEXCore::IR::RegisterClassType Cl
}
else if (Operand.IsSIB()) {
OrderedNode *Tmp {};
if (Operand.Data.SIB.Index != FEXCore::X86State::REG_INVALID) {
Tmp = LoadGPRRegister(Operand.Data.SIB.Index, GPRSize);

if (Operand.Data.SIB.Scale != 1) {
auto Constant = _Constant(GPRSize * 8, Operand.Data.SIB.Scale);
Tmp = _Mul(IR::SizeToOpSize(GPRSize), Tmp, Constant);
}
if (Operand.Data.SIB.Index != FEXCore::X86State::REG_INVALID && Operand.Data.SIB.Base != FEXCore::X86State::REG_INVALID) {
auto Base = LoadGPRRegister(Operand.Data.SIB.Base, GPRSize);
auto Index = LoadGPRRegister(Operand.Data.SIB.Index, GPRSize);
Tmp = _AddShift(IR::SizeToOpSize(GPRSize), Base, Index, ShiftType::LSL, FEXCore::ilog2(Operand.Data.SIB.Scale));
}
else {
if (Operand.Data.SIB.Index != FEXCore::X86State::REG_INVALID) {
Tmp = LoadGPRRegister(Operand.Data.SIB.Index, GPRSize);

if (Operand.Data.SIB.Base != FEXCore::X86State::REG_INVALID) {
auto GPR = LoadGPRRegister(Operand.Data.SIB.Base, GPRSize);

if (Tmp != nullptr) {
Tmp = _Add(IR::SizeToOpSize(GPRSize), Tmp, GPR);
if (Operand.Data.SIB.Scale != 1) {
auto Constant = _Constant(GPRSize * 8, Operand.Data.SIB.Scale);
Tmp = _Mul(IR::SizeToOpSize(GPRSize), Tmp, Constant);
}
}
else {
Tmp = GPR;

if (Operand.Data.SIB.Base != FEXCore::X86State::REG_INVALID) {
auto GPR = LoadGPRRegister(Operand.Data.SIB.Base, GPRSize);

if (Tmp != nullptr) {
Tmp = _Add(IR::SizeToOpSize(GPRSize), Tmp, GPR);
}
else {
Tmp = GPR;
}
}
}

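In both call sites above, the _AddShift fast path is only taken when the operand is not VSIB and both a base and an index register are present; otherwise the previous Mul/Add sequence is kept. The scale is converted to a shift amount with FEXCore::ilog2, so scale 1 becomes a plain register-register add and scales 2, 4, and 8 become lsl #1, #2, and #3 on the index operand. A minimal sketch of the assumed scale-to-shift mapping (the real ilog2 helper lives elsewhere in FEXCore; this local version is only for illustration):

#include <cstdint>

// Local stand-in for the assumed behaviour of FEXCore::ilog2 on SIB scales.
static constexpr uint32_t ilog2_sketch(uint32_t Value) {
  uint32_t Result = 0;
  while (Value >>= 1) {
    ++Result;
  }
  return Result;
}

static_assert(ilog2_sketch(1) == 0, "scale 1 -> plain add, no shift");
static_assert(ilog2_sketch(2) == 1, "scale 2 -> lsl #1");
static_assert(ilog2_sketch(4) == 2, "scale 4 -> lsl #2");
static_assert(ilog2_sketch(8) == 3, "scale 8 -> lsl #3");
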
10 changes: 10 additions & 0 deletions FEXCore/Source/Interface/IR/IR.json
@@ -953,6 +953,16 @@
"Size == FEXCore::IR::OpSize::i32Bit || Size == FEXCore::IR::OpSize::i64Bit"
]
},
"GPR = AddShift OpSize:#Size, GPR:$Src1, GPR:$Src2, ShiftType:$Shift{ShiftType::LSL}, u8:$ShiftAmount{0}": {
"Desc": [ "Integer Add with shifted register",
"Will truncate to 64 or 32bits"
],
"DestSize": "Size",
"EmitValidation": [
"Size == FEXCore::IR::OpSize::i32Bit || Size == FEXCore::IR::OpSize::i64Bit",
"_Shift != ShiftType::ROR"
]
},
"AddNZCV OpSize:#Size, GPR:$Src1, GPR:$Src2": {
"Desc": ["Set NZCV for the sum of two GPRs"],
"HasSideEffects": true,
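The EmitValidation entries restrict AddShift to 32-bit and 64-bit results and forbid ROR, which matches the ARM64 ADD (shifted register) encoding: it accepts LSL, LSR, and ASR shifts but has no ROR form. A minimal restatement of those constraints, using local stand-in enums rather than the real FEXCore types:

#include <cstdint>

// Illustrative only; the real checks are generated from IR.json inside
// FEXCore, and these enums are stand-ins, not the actual definitions.
enum class OpSizeSketch : uint8_t { i32Bit = 4, i64Bit = 8 };
enum class ShiftTypeSketch : uint8_t { LSL, LSR, ASR, ROR };

static bool IsValidAddShift(OpSizeSketch Size, ShiftTypeSketch Shift) {
  const bool ValidSize = Size == OpSizeSketch::i32Bit || Size == OpSizeSketch::i64Bit;
  // ARM64's ADD (shifted register) has no ROR variant.
  const bool ValidShift = Shift != ShiftTypeSketch::ROR;
  return ValidSize && ValidShift;
}
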
4 changes: 2 additions & 2 deletions unittests/InstructionCountCI/FlagM/HotBlocks_32Bit.json
@@ -27,9 +27,9 @@
"cmp esi, ebx"
],
"ExpectedArm64ASM": [
"add w20, w5, w10",
"add w20, w10, w5",
"ldrb w6, [x20]",
"add w20, w11, w10",
"add w20, w10, w11",
"ldrb w5, [x20]",
"orr w20, w6, #0xffff0000",
"mov w6, w20",
149 changes: 120 additions & 29 deletions unittests/InstructionCountCI/Primary.json
@@ -2476,15 +2476,15 @@
"ExpectedInstructionCount": 2,
"Comment": "0x8d",
"ExpectedArm64ASM": [
"add x20, x5, x7",
"add x20, x7, x5",
"bfxil x4, x20, #0, #16"
]
},
"lea eax, [rbx+rcx*1 + 0]": {
"ExpectedInstructionCount": 3,
"Comment": "0x8d",
"ExpectedArm64ASM": [
"add x20, x5, x7",
"add x20, x7, x5",
"mov x20, x20",
"mov w4, w20"
]
@@ -2493,88 +2493,179 @@
"ExpectedInstructionCount": 1,
"Comment": "0x8d",
"ExpectedArm64ASM": [
"add x4, x5, x7"
"add x4, x7, x5"
]
},
"lea ax, [rbx+rcx*2 + 0]": {
"ExpectedInstructionCount": 3,
"ExpectedInstructionCount": 2,
"Comment": "0x8d",
"ExpectedArm64ASM": [
"lsl x20, x5, #1",
"add x20, x20, x7",
"add x20, x7, x5, lsl #1",
"bfxil x4, x20, #0, #16"
]
},
"lea eax, [rbx+rcx*2 + 0]": {
"ExpectedInstructionCount": 4,
"ExpectedInstructionCount": 3,
"Comment": "0x8d",
"ExpectedArm64ASM": [
"lsl x20, x5, #1",
"add x20, x20, x7",
"add x20, x7, x5, lsl #1",
"mov x20, x20",
"mov w4, w20"
]
},
"lea rax, [rbx+rcx*2 + 0]": {
"ExpectedInstructionCount": 2,
"ExpectedInstructionCount": 1,
"Comment": "0x8d",
"ExpectedArm64ASM": [
"lsl x20, x5, #1",
"add x4, x20, x7"
"add x4, x7, x5, lsl #1"
]
},
"lea ax, [rbx+rcx*4 + 0]": {
"ExpectedInstructionCount": 3,
"ExpectedInstructionCount": 2,
"Comment": "0x8d",
"ExpectedArm64ASM": [
"lsl x20, x5, #2",
"add x20, x20, x7",
"add x20, x7, x5, lsl #2",
"bfxil x4, x20, #0, #16"
]
},
"lea eax, [rbx+rcx*4 + 0]": {
"ExpectedInstructionCount": 4,
"ExpectedInstructionCount": 3,
"Comment": "0x8d",
"ExpectedArm64ASM": [
"lsl x20, x5, #2",
"add x20, x20, x7",
"add x20, x7, x5, lsl #2",
"mov x20, x20",
"mov w4, w20"
]
},
"lea rax, [rbx+rcx*4 + 0]": {
"ExpectedInstructionCount": 2,
"ExpectedInstructionCount": 1,
"Comment": "0x8d",
"ExpectedArm64ASM": [
"lsl x20, x5, #2",
"add x4, x20, x7"
"add x4, x7, x5, lsl #2"
]
},
"lea ax, [rbx+rcx*8 + 0]": {
"ExpectedInstructionCount": 3,
"ExpectedInstructionCount": 2,
"Comment": "0x8d",
"ExpectedArm64ASM": [
"lsl x20, x5, #3",
"add x20, x20, x7",
"add x20, x7, x5, lsl #3",
"bfxil x4, x20, #0, #16"
]
},
"lea eax, [rbx+rcx*8 + 0]": {
"ExpectedInstructionCount": 4,
"ExpectedInstructionCount": 3,
"Comment": "0x8d",
"ExpectedArm64ASM": [
"lsl x20, x5, #3",
"add x20, x20, x7",
"add x20, x7, x5, lsl #3",
"mov x20, x20",
"mov w4, w20"
]
},
"lea rax, [rbx+rcx*8 + 0]": {
"ExpectedInstructionCount": 1,
"Comment": "0x8d",
"ExpectedArm64ASM": [
"add x4, x7, x5, lsl #3"
]
},
"lea ax, [ebx+ecx*1 + 0]": {
"ExpectedInstructionCount": 3,
"Comment": "0x8d",
"ExpectedArm64ASM": [
"add x20, x7, x5",
"mov w20, w20",
"bfxil x4, x20, #0, #16"
]
},
"lea eax, [ebx+ecx*1 + 0]": {
"ExpectedInstructionCount": 2,
"Comment": "0x8d",
"ExpectedArm64ASM": [
"add x20, x7, x5",
"mov w4, w20"
]
},
"lea rax, [ebx+ecx*1 + 0]": {
"ExpectedInstructionCount": 2,
"Comment": "0x8d",
"ExpectedArm64ASM": [
"add x20, x7, x5",
"mov w4, w20"
]
},
"lea ax, [ebx+ecx*2 + 0]": {
"ExpectedInstructionCount": 3,
"Comment": "0x8d",
"ExpectedArm64ASM": [
"add x20, x7, x5, lsl #1",
"mov w20, w20",
"bfxil x4, x20, #0, #16"
]
},
"lea eax, [ebx+ecx*2 + 0]": {
"ExpectedInstructionCount": 2,
"Comment": "0x8d",
"ExpectedArm64ASM": [
"add x20, x7, x5, lsl #1",
"mov w4, w20"
]
},
"lea rax, [ebx+ecx*2 + 0]": {
"ExpectedInstructionCount": 2,
"Comment": "0x8d",
"ExpectedArm64ASM": [
"add x20, x7, x5, lsl #1",
"mov w4, w20"
]
},
"lea ax, [ebx+ecx*4 + 0]": {
"ExpectedInstructionCount": 3,
"Comment": "0x8d",
"ExpectedArm64ASM": [
"add x20, x7, x5, lsl #2",
"mov w20, w20",
"bfxil x4, x20, #0, #16"
]
},
"lea eax, [ebx+ecx*4 + 0]": {
"ExpectedInstructionCount": 2,
"Comment": "0x8d",
"ExpectedArm64ASM": [
"add x20, x7, x5, lsl #2",
"mov w4, w20"
]
},
"lea rax, [ebx+ecx*4 + 0]": {
"ExpectedInstructionCount": 2,
"Comment": "0x8d",
"ExpectedArm64ASM": [
"add x20, x7, x5, lsl #2",
"mov w4, w20"
]
},
"lea ax, [ebx+ecx*8 + 0]": {
"ExpectedInstructionCount": 3,
"Comment": "0x8d",
"ExpectedArm64ASM": [
"add x20, x7, x5, lsl #3",
"mov w20, w20",
"bfxil x4, x20, #0, #16"
]
},
"lea eax, [ebx+ecx*8 + 0]": {
"ExpectedInstructionCount": 2,
"Comment": "0x8d",
"ExpectedArm64ASM": [
"lsl x20, x5, #3",
"add x4, x20, x7"
"add x20, x7, x5, lsl #3",
"mov w4, w20"
]
},
"lea rax, [ebx+ecx*8 + 0]": {
"ExpectedInstructionCount": 2,
"Comment": "0x8d",
"ExpectedArm64ASM": [
"add x20, x7, x5, lsl #3",
"mov w4, w20"
]
},
"mov cs, ax": {