Skip to content

Commit

Permalink
Merge pull request #3410 from alyssarosenzweig/opt/nzcv-pass-2
Browse files (browse the repository at this point in the history)
Add NZCV+PF/AF optimization pass
  • Loading branch information
Sonicadvance1 committed Feb 10, 2024
2 parents ba41da7 + de0b690 commit df3d693
Show file tree
Hide file tree
Showing 13 changed files with 799 additions and 277 deletions.
30 changes: 24 additions & 6 deletions FEXCore/Source/Interface/Core/JIT/Arm64/ALUOps.cpp
Expand Up @@ -568,15 +568,33 @@ DEF_OP(AndWithFlags) {
const uint8_t OpSize = IROp->Size;
const auto EmitSize = OpSize == 8 ? ARMEmitter::Size::i64Bit : ARMEmitter::Size::i32Bit;

uint64_t Const;
const auto Dst = GetReg(Node);
const auto Src1 = GetReg(Op->Src1.ID());
auto Src1 = GetReg(Op->Src1.ID());

uint64_t Const;
if (IsInlineConstant(Op->Src2, &Const)) {
ands(EmitSize, Dst, Src1, Const);
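// Sub-32-bit sizes (see TestNZ): do a plain AND that does not set flags, then
// set the flags with a 32-bit cmn of the result shifted left by (32 - bits),
// so the top bit of the 8/16-bit result lands in bit 31.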
if (OpSize < 4) {
if (IsInlineConstant(Op->Src2, &Const)) {
and_(EmitSize, Dst, Src1, Const);
} else {
auto Src2 = GetReg(Op->Src2.ID());

if (Src1 != Src2) {
and_(EmitSize, Dst, Src1, Src2);
} else if (Dst != Src1) {
mov(ARMEmitter::Size::i64Bit, Dst, Src1);
}
}

unsigned Shift = 32 - (OpSize * 8);
cmn(EmitSize, ARMEmitter::Reg::zr, Dst, ARMEmitter::ShiftType::LSL, Shift);
} else {
const auto Src2 = GetReg(Op->Src2.ID());
ands(EmitSize, Dst, Src1, Src2);
if (IsInlineConstant(Op->Src2, &Const)) {
ands(EmitSize, Dst, Src1, Const);
} else {
const auto Src2 = GetReg(Op->Src2.ID());
ands(EmitSize, Dst, Src1, Src2);
}
}
}

Expand Down
67 changes: 46 additions & 21 deletions FEXCore/Source/Interface/Core/OpcodeDispatcher.cpp
Expand Up @@ -354,7 +354,7 @@ void OpDispatchBuilder::SecondaryALUOp(OpcodeArgs) {
case OPD(FEXCore::X86Tables::TYPE_GROUP_1, OpToIndex(0x80), 4):
case OPD(FEXCore::X86Tables::TYPE_GROUP_1, OpToIndex(0x81), 4):
case OPD(FEXCore::X86Tables::TYPE_GROUP_1, OpToIndex(0x83), 4):
IROp = FEXCore::IR::IROps::OP_AND;
IROp = FEXCore::IR::IROps::OP_ANDWITHFLAGS;
break;
case OPD(FEXCore::X86Tables::TYPE_GROUP_1, OpToIndex(0x80), 5):
case OPD(FEXCore::X86Tables::TYPE_GROUP_1, OpToIndex(0x81), 5):
Expand All @@ -373,7 +373,7 @@ void OpDispatchBuilder::SecondaryALUOp(OpcodeArgs) {
};
#undef OPD
// Logical ops can tolerate garbage in the upper bits, so don't mask.
bool AllowUpperGarbage = IROp == FEXCore::IR::IROps::OP_AND ||
bool AllowUpperGarbage = IROp == FEXCore::IR::IROps::OP_ANDWITHFLAGS ||
IROp == FEXCore::IR::IROps::OP_XOR ||
IROp == FEXCore::IR::IROps::OP_OR;

Expand Down Expand Up @@ -403,9 +403,9 @@ void OpDispatchBuilder::SecondaryALUOp(OpcodeArgs) {
Result = _Or(IR::SizeToOpSize(std::max<uint8_t>(4u, std::max(GetOpSize(Dest), GetOpSize(Src)))), Dest, Src);
break;
}
case FEXCore::IR::IROps::OP_AND: {
case FEXCore::IR::IROps::OP_ANDWITHFLAGS: {
Dest = _AtomicFetchAnd(IR::SizeToOpSize(Size), Src, DestMem);
Result = _And(IR::SizeToOpSize(std::max<uint8_t>(4u, std::max(GetOpSize(Dest), GetOpSize(Src)))), Dest, Src);
Result = _AndWithFlags(IR::SizeToOpSize(std::max(GetOpSize(Dest), GetOpSize(Src))), Dest, Src);
break;
}
case FEXCore::IR::IROps::OP_XOR: {
Expand All @@ -420,7 +420,11 @@ void OpDispatchBuilder::SecondaryALUOp(OpcodeArgs) {
}
else {
Dest = LoadSource(GPRClass, Op, Op->Dest, Op->Flags, {.AllowUpperGarbage = AllowUpperGarbage || Size >= 4});
DeriveOp(ALUOp, IROp, _Add(IR::SizeToOpSize(std::max<uint8_t>(4u, Size)), Dest, Src));

if (IROp != FEXCore::IR::IROps::OP_ANDWITHFLAGS)
Size = std::max<uint8_t>(4u, Size);

DeriveOp(ALUOp, IROp, _AndWithFlags(IR::SizeToOpSize(Size), Dest, Src));

Result = ALUOp;

Expand All @@ -436,12 +440,24 @@ void OpDispatchBuilder::SecondaryALUOp(OpcodeArgs) {
case FEXCore::IR::IROps::OP_SUB:
GenerateFlags_SUB(Op, Result, Dest, Src);
break;
case FEXCore::IR::IROps::OP_AND:
case FEXCore::IR::IROps::OP_XOR:
case FEXCore::IR::IROps::OP_OR: {
GenerateFlags_Logical(Op, Result, Dest, Src);
break;
}
case FEXCore::IR::IROps::OP_ANDWITHFLAGS: {
InvalidateDeferredFlags();

// SF/ZF/CF/OF
CachedNZCV = nullptr;
PossiblySetNZCVBits = (1u << 31) | (1u << 30);
NZCVDirty = false;

// PF/AF
CalculatePF(Result);
_InvalidateFlags(1 << X86State::RFLAG_AF_RAW_LOC);
break;
}
default: break;
}
}
Expand Down Expand Up @@ -1290,15 +1306,7 @@ void OpDispatchBuilder::TESTOp(OpcodeArgs) {
InvalidateDeferredFlags();

// SF/ZF/CF/OF
OrderedNode *ALUOp;

if (Size >= 4) {
ALUOp = _AndWithFlags(IR::SizeToOpSize(Size), Dest, Src);
} else {
ALUOp = _And(OpSize::i32Bit, Dest, Src);
_TestNZ(IR::SizeToOpSize(Size), ALUOp, ALUOp);
}

OrderedNode *ALUOp = _AndWithFlags(IR::SizeToOpSize(Size), Dest, Src);
CachedNZCV = nullptr;
PossiblySetNZCVBits = (1u << 31) | (1u << 30);
NZCVDirty = false;
Expand Down Expand Up @@ -5214,10 +5222,15 @@ void OpDispatchBuilder::MOVGPRNTOp(OpcodeArgs) {

void OpDispatchBuilder::ALUOpImpl(OpcodeArgs, FEXCore::IR::IROps ALUIROp, FEXCore::IR::IROps AtomicFetchOp) {
auto Size = GetDstSize(Op);
const auto OpSize = Size == 8 ? OpSize::i64Bit : OpSize::i32Bit;

auto RoundedSize = Size;
if (ALUIROp != FEXCore::IR::IROps::OP_ANDWITHFLAGS)
RoundedSize = std::max<uint8_t>(4u, RoundedSize);

const auto OpSize = IR::SizeToOpSize(RoundedSize);

// Logical ops can tolerate garbage in the upper bits, so don't mask.
bool AllowUpperGarbage = ALUIROp == FEXCore::IR::IROps::OP_AND ||
bool AllowUpperGarbage = ALUIROp == FEXCore::IR::IROps::OP_ANDWITHFLAGS ||
ALUIROp == FEXCore::IR::IROps::OP_XOR ||
ALUIROp == FEXCore::IR::IROps::OP_OR;

Expand All @@ -5236,7 +5249,7 @@ void OpDispatchBuilder::ALUOpImpl(OpcodeArgs, FEXCore::IR::IROps ALUIROp, FEXCor
DeriveOp(FetchOp, AtomicFetchOp, _AtomicFetchAdd(IR::SizeToOpSize(Size), Src, DestMem));
Dest = FetchOp;

DeriveOp(ALUOp, ALUIROp, _Add(OpSize, Dest, Src));
DeriveOp(ALUOp, ALUIROp, _AndWithFlags(OpSize, Dest, Src));
Result = ALUOp;
}
else {
Expand All @@ -5255,7 +5268,7 @@ void OpDispatchBuilder::ALUOpImpl(OpcodeArgs, FEXCore::IR::IROps ALUIROp, FEXCor

Result = _Constant(0);
} else {
DeriveOp(ALUOp, ALUIROp, _Add(OpSize, Dest, Src));
DeriveOp(ALUOp, ALUIROp, _AndWithFlags(OpSize, Dest, Src));
Result = ALUOp;
}

Expand All @@ -5271,12 +5284,24 @@ void OpDispatchBuilder::ALUOpImpl(OpcodeArgs, FEXCore::IR::IROps ALUIROp, FEXCor
case FEXCore::IR::IROps::OP_SUB:
GenerateFlags_SUB(Op, Result, Dest, Src);
break;
case FEXCore::IR::IROps::OP_AND:
case FEXCore::IR::IROps::OP_XOR:
case FEXCore::IR::IROps::OP_OR: {
GenerateFlags_Logical(Op, Result, Dest, Src);
break;
}
case FEXCore::IR::IROps::OP_ANDWITHFLAGS: {
InvalidateDeferredFlags();

// SF/ZF/CF/OF
CachedNZCV = nullptr;
PossiblySetNZCVBits = (1u << 31) | (1u << 30);
NZCVDirty = false;

// PF/AF
CalculatePF(Result);
_InvalidateFlags(1 << X86State::RFLAG_AF_RAW_LOC);
break;
}
default: break;
}
}
Expand Down Expand Up @@ -6083,7 +6108,7 @@ void InstallOpcodeHandlers(Context::OperatingMode Mode) {

{0x18, 6, &OpDispatchBuilder::SBBOp<0, true>},

{0x20, 6, &OpDispatchBuilder::ALUOp<FEXCore::IR::IROps::OP_AND, FEXCore::IR::IROps::OP_ATOMICFETCHAND>},
{0x20, 6, &OpDispatchBuilder::ALUOp<FEXCore::IR::IROps::OP_ANDWITHFLAGS, FEXCore::IR::IROps::OP_ATOMICFETCHAND>},

{0x28, 6, &OpDispatchBuilder::ALUOp<FEXCore::IR::IROps::OP_SUB, FEXCore::IR::IROps::OP_ATOMICFETCHSUB>},

Expand Down
5 changes: 1 addition & 4 deletions FEXCore/Source/Interface/IR/IR.json
Expand Up @@ -1093,10 +1093,7 @@
"Desc": ["Integer binary and"
],
"DestSize": "Size",
"HasSideEffects": true,
"EmitValidation": [
"Size == FEXCore::IR::OpSize::i32Bit || Size == FEXCore::IR::OpSize::i64Bit"
]
"HasSideEffects": true
},
"GPR = Andn OpSize:#Size, GPR:$Src1, GPR:$Src2": {
"Desc": ["Integer binary AND NOT. Performs the equivalent of Src1 & ~Src2"],
Expand Down
2 changes: 1 addition & 1 deletion FEXCore/Source/Interface/IR/PassManager.cpp
Expand Up @@ -82,7 +82,7 @@ void PassManager::AddDefaultPasses(FEXCore::Context::ContextImpl *ctx, bool Inli
InsertPass(CreatePassDeadCodeElimination());
InsertPass(CreateConstProp(InlineConstants, ctx->HostFeatures.SupportsTSOImm9));

////// InsertPass(CreateDeadFlagCalculationEliminination());
InsertPass(CreateDeadFlagCalculationEliminination());

InsertPass(CreateInlineCallOptimization(&ctx->CPUID));
InsertPass(CreatePassDeadCodeElimination());
Expand Down

0 comments on commit df3d693

Please sign in to comment.