Skip to content

Commit

Permalink
Merge pull request #3381 from alyssarosenzweig/opt/masking
Browse files Browse the repository at this point in the history
Allow upper garbage on a bunch of instructions
  • Loading branch information
lioncash committed Jan 30, 2024
2 parents b937885 + 10cca02 commit fa33520
Show file tree
Hide file tree
Showing 8 changed files with 146 additions and 160 deletions.
7 changes: 7 additions & 0 deletions FEXCore/Source/Interface/Core/JIT/Arm64/ALUOps.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1213,6 +1213,11 @@ DEF_OP(FindTrailingZeroes) {
rbit(EmitSize, Dst, Src);

if (OpSize == 2) {
// This orr does two things. First, if the (masked) source is zero, its bits
// reverse to zero in the top, so the orr forces clz to return 16. Second, it
// ensures garbage in the upper bits of the source doesn't affect clz, because
// that garbage is reversed by rbit into the bits below 0x8000, which clz
// ignores. So we handle Src upper garbage without explicitly masking.
orr(EmitSize, Dst, Dst, 0x8000);
}

Expand All @@ -1230,6 +1235,8 @@ DEF_OP(CountLeadingZeroes) {
const auto Src = GetReg(Op->Src.ID());

if (OpSize == 2) {
// Expressing as lsl+orr+clz clears away any garbage in the upper bits
// (alternatively, we could do uxth+clz+sub, at equal total cost).
lsl(EmitSize, Dst, Src, 16);
orr(EmitSize, Dst, Dst, 0x8000);
clz(EmitSize, Dst, Dst);
Expand Down
41 changes: 24 additions & 17 deletions FEXCore/Source/Interface/Core/OpcodeDispatcher.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2113,8 +2113,9 @@ void OpDispatchBuilder::BEXTRBMIOp(OpcodeArgs) {
// Essentially (Src1 >> Start) & ((1 << Length) - 1)
// along with some edge-case handling and flag setting.

auto* Src1 = LoadSource(GPRClass, Op, Op->Src[0], Op->Flags);
auto* Src2 = LoadSource(GPRClass, Op, Op->Src[1], Op->Flags);
LOGMAN_THROW_A_FMT(Op->InstSize >= 4, "No masking needed");
auto* Src1 = LoadSource(GPRClass, Op, Op->Src[0], Op->Flags, {.AllowUpperGarbage = true});
auto* Src2 = LoadSource(GPRClass, Op, Op->Src[1], Op->Flags, {.AllowUpperGarbage = true});

const auto Size = GetSrcSize(Op);
const auto SrcSize = Size * 8;
Expand Down Expand Up @@ -2152,10 +2153,12 @@ void OpDispatchBuilder::BEXTRBMIOp(OpcodeArgs) {

void OpDispatchBuilder::BLSIBMIOp(OpcodeArgs) {
// Equivalent to performing: SRC & -SRC
LOGMAN_THROW_A_FMT(Op->InstSize >= 4, "No masking needed");
auto Size = OpSizeFromSrc(Op);

auto* Src = LoadSource(GPRClass, Op, Op->Src[0], Op->Flags);
auto NegatedSrc = _Neg(OpSizeFromSrc(Op), Src);
auto Result = _And(OpSizeFromSrc(Op), Src, NegatedSrc);
auto* Src = LoadSource(GPRClass, Op, Op->Src[0], Op->Flags, {.AllowUpperGarbage = true});
auto NegatedSrc = _Neg(Size, Src);
auto Result = _And(Size, Src, NegatedSrc);

// ...and we're done. Painless!
StoreResult(GPRClass, Op, Result, -1);
Expand All @@ -2165,22 +2168,23 @@ void OpDispatchBuilder::BLSIBMIOp(OpcodeArgs) {

void OpDispatchBuilder::BLSMSKBMIOp(OpcodeArgs) {
// Equivalent to: (Src - 1) ^ Src
auto One = _Constant(1);
LOGMAN_THROW_A_FMT(Op->InstSize >= 4, "No masking needed");
auto Size = OpSizeFromSrc(Op);

auto* Src = LoadSource(GPRClass, Op, Op->Src[0], Op->Flags);
auto Result = _Xor(OpSize::i64Bit, _Sub(OpSize::i64Bit, Src, One), Src);
auto* Src = LoadSource(GPRClass, Op, Op->Src[0], Op->Flags, {.AllowUpperGarbage = true});
auto Result = _Xor(Size, _Sub(Size, Src, _Constant(1)), Src);

StoreResult(GPRClass, Op, Result, -1);
GenerateFlags_BLSMSK(Op, Src);
}

void OpDispatchBuilder::BLSRBMIOp(OpcodeArgs) {
// Equivalent to: (Src - 1) & Src
auto One = _Constant(1);

auto* Src = LoadSource(GPRClass, Op, Op->Src[0], Op->Flags);
auto Result = _And(OpSize::i64Bit, _Sub(OpSize::i64Bit, Src, One), Src);
LOGMAN_THROW_A_FMT(Op->InstSize >= 4, "No masking needed");
auto* Src = LoadSource(GPRClass, Op, Op->Src[0], Op->Flags, {.AllowUpperGarbage = true});
auto Size = OpSizeFromSrc(Op);

auto Result = _And(Size, _Sub(Size, Src, _Constant(1)), Src);
StoreResult(GPRClass, Op, Result, -1);

GenerateFlags_BLSR(Op, Result, Src);
Expand Down Expand Up @@ -2316,8 +2320,9 @@ void OpDispatchBuilder::PDEP(OpcodeArgs) {
}

void OpDispatchBuilder::PEXT(OpcodeArgs) {
auto* Input = LoadSource(GPRClass, Op, Op->Src[0], Op->Flags);
auto* Mask = LoadSource(GPRClass, Op, Op->Src[1], Op->Flags);
LOGMAN_THROW_A_FMT(Op->InstSize >= 4, "No masking needed");
auto* Input = LoadSource(GPRClass, Op, Op->Src[0], Op->Flags, {.AllowUpperGarbage = true});
auto* Mask = LoadSource(GPRClass, Op, Op->Src[1], Op->Flags, {.AllowUpperGarbage = true});
auto Result = _PExt(OpSizeFromSrc(Op), Input, Mask);

StoreResult(GPRClass, Op, Op->Dest, Result, -1);
Expand Down Expand Up @@ -3180,7 +3185,7 @@ void OpDispatchBuilder::XADDOp(OpcodeArgs) {
}

void OpDispatchBuilder::PopcountOp(OpcodeArgs) {
OrderedNode *Src = LoadSource(GPRClass, Op, Op->Src[0], Op->Flags);
OrderedNode *Src = LoadSource(GPRClass, Op, Op->Src[0], Op->Flags, {.AllowUpperGarbage = GetSrcSize(Op) >= 4});
Src = _Popcount(OpSizeFromSrc(Op), Src);
StoreResult(GPRClass, Op, Src, -1);

Expand Down Expand Up @@ -5367,7 +5372,8 @@ void OpDispatchBuilder::INTOp(OpcodeArgs) {
}

void OpDispatchBuilder::TZCNT(OpcodeArgs) {
OrderedNode *Src = LoadSource(GPRClass, Op, Op->Src[0], Op->Flags);
// _FindTrailingZeroes ignores upper garbage so we don't need to mask
OrderedNode *Src = LoadSource(GPRClass, Op, Op->Src[0], Op->Flags, {.AllowUpperGarbage = true});

Src = _FindTrailingZeroes(OpSizeFromSrc(Op), Src);
StoreResult(GPRClass, Op, Src, -1);
Expand All @@ -5376,7 +5382,8 @@ void OpDispatchBuilder::TZCNT(OpcodeArgs) {
}

void OpDispatchBuilder::LZCNT(OpcodeArgs) {
OrderedNode *Src = LoadSource(GPRClass, Op, Op->Src[0], Op->Flags);
// _CountLeadingZeroes clears upper garbage so we don't need to mask
OrderedNode *Src = LoadSource(GPRClass, Op, Op->Src[0], Op->Flags, {.AllowUpperGarbage = true});

auto Res = _CountLeadingZeroes(OpSizeFromSrc(Op), Src);
StoreResult(GPRClass, Op, Res, -1);
Expand Down
51 changes: 23 additions & 28 deletions unittests/InstructionCountCI/FlagM/Secondary_REP.json
Original file line number Diff line number Diff line change
Expand Up @@ -32,11 +32,10 @@
]
},
"popcnt eax, ebx": {
"ExpectedInstructionCount": 11,
"ExpectedInstructionCount": 10,
"Comment": "0xf3 0x0f 0xb8",
"ExpectedArm64ASM": [
"mov w20, w7",
"fmov s0, w20",
"fmov s0, w7",
"cnt v0.8b, v0.8b",
"addv b0, v0.8b",
"umov w4, v0.b[0]",
Expand Down Expand Up @@ -65,11 +64,10 @@
]
},
"tzcnt ax, bx": {
"ExpectedInstructionCount": 10,
"ExpectedInstructionCount": 9,
"Comment": "0xf3 0x0f 0xbc",
"ExpectedArm64ASM": [
"uxth w20, w7",
"rbit w20, w20",
"rbit w20, w7",
"orr w20, w20, #0x8000",
"clz w20, w20",
"bfxil x4, x20, #0, #16",
Expand All @@ -81,11 +79,10 @@
]
},
"tzcnt eax, ebx": {
"ExpectedInstructionCount": 8,
"ExpectedInstructionCount": 7,
"Comment": "0xf3 0x0f 0xbc",
"ExpectedArm64ASM": [
"mov w20, w7",
"rbit w4, w20",
"rbit w4, w7",
"clz w4, w4",
"cmp x4, #0x0 (0)",
"cset x20, eq",
Expand All @@ -108,32 +105,30 @@
]
},
"lzcnt ax, bx": {
"ExpectedInstructionCount": 10,
"ExpectedInstructionCount": 9,
"Comment": "0xf3 0x0f 0xbd",
"ExpectedArm64ASM": [
"uxth w20, w7",
"lsl w21, w20, #16",
"orr w21, w21, #0x8000",
"clz w21, w21",
"bfxil x4, x21, #0, #16",
"cmp x20, #0x0 (0)",
"cset x21, eq",
"lsl x21, x21, #29",
"msr nzcv, x21",
"rmif x20, #13, #nZcv"
"lsl w20, w7, #16",
"orr w20, w20, #0x8000",
"clz w20, w20",
"bfxil x4, x20, #0, #16",
"cmp x7, #0x0 (0)",
"cset x20, eq",
"lsl x20, x20, #29",
"msr nzcv, x20",
"rmif x7, #13, #nZcv"
]
},
"lzcnt eax, ebx": {
"ExpectedInstructionCount": 7,
"ExpectedInstructionCount": 6,
"Comment": "0xf3 0x0f 0xbd",
"ExpectedArm64ASM": [
"mov w20, w7",
"clz w4, w20",
"cmp x20, #0x0 (0)",
"cset x21, eq",
"lsl x21, x21, #29",
"msr nzcv, x21",
"rmif x20, #29, #nZcv"
"clz w4, w7",
"cmp x7, #0x0 (0)",
"cset x20, eq",
"lsl x20, x20, #29",
"msr nzcv, x20",
"rmif x7, #29, #nZcv"
]
},
"lzcnt rax, rbx": {
Expand Down
26 changes: 12 additions & 14 deletions unittests/InstructionCountCI/FlagM/VEX_map2.json
Original file line number Diff line number Diff line change
Expand Up @@ -442,26 +442,24 @@
]
},
"bextr eax, ebx, ecx": {
"ExpectedInstructionCount": 19,
"ExpectedInstructionCount": 17,
"Comment": [
"Map 2 0b00 0xf7 32-bit"
],
"ExpectedArm64ASM": [
"mov w20, w7",
"mov w21, w5",
"mov w22, #0x1f",
"uxtb w23, w21",
"lsr w20, w20, w23",
"mov w24, #0x0",
"cmp w23, #0x1f (31)",
"csel w20, w20, w24, ls",
"ubfx w21, w21, #8, #8",
"mov w20, #0x1f",
"uxtb w21, w5",
"lsr w22, w7, w21",
"mov w23, #0x0",
"cmp w21, #0x1f (31)",
"csel w21, w21, w22, ls",
"csel w21, w22, w23, ls",
"ubfx w22, w5, #8, #8",
"cmp w22, #0x1f (31)",
"csel w20, w22, w20, ls",
"mov w22, #0x1",
"lsl w21, w22, w21",
"sub w21, w21, #0x1 (1)",
"and w4, w20, w21",
"lsl w20, w22, w20",
"sub w20, w20, #0x1 (1)",
"and w4, w21, w20",
"cmp x4, #0x0 (0)",
"cset x20, eq",
"lsl x20, x20, #30",
Expand Down
47 changes: 21 additions & 26 deletions unittests/InstructionCountCI/FlagM/VEX_map_group.json
Original file line number Diff line number Diff line change
Expand Up @@ -10,20 +10,18 @@
},
"Instructions": {
"blsr eax, ebx": {
"ExpectedInstructionCount": 10,
"ExpectedInstructionCount": 8,
"Comment": [
"Map group 17 0b001 32-bit"
],
"ExpectedArm64ASM": [
"mov w20, w7",
"sub x21, x20, #0x1 (1)",
"and x21, x21, x20",
"mov w4, w21",
"tst w21, w21",
"mrs x21, nzcv",
"cmp x20, #0x0 (0)",
"cset x20, ne",
"orr w20, w21, w20, lsl #29",
"sub w20, w7, #0x1 (1)",
"and w4, w20, w7",
"tst w4, w4",
"mrs x20, nzcv",
"cmp x7, #0x0 (0)",
"cset x21, ne",
"orr w20, w20, w21, lsl #29",
"msr nzcv, x20"
]
},
Expand All @@ -44,22 +42,20 @@
]
},
"blsmsk eax, ebx": {
"ExpectedInstructionCount": 11,
"ExpectedInstructionCount": 9,
"Comment": [
"Map group 17 0b010 32-bit"
],
"ExpectedArm64ASM": [
"mov w20, w7",
"sub x21, x20, #0x1 (1)",
"eor x21, x21, x20",
"mov w4, w21",
"mov w21, #0x50000000",
"mrs x22, nzcv",
"bic x21, x22, x21",
"cmp x20, #0x0 (0)",
"cset x20, ne",
"msr nzcv, x21",
"rmif x20, #63, #nzCv"
"sub w20, w7, #0x1 (1)",
"eor w4, w20, w7",
"mov w20, #0x50000000",
"mrs x21, nzcv",
"bic x20, x21, x20",
"cmp x7, #0x0 (0)",
"cset x21, ne",
"msr nzcv, x20",
"rmif x21, #63, #nzCv"
]
},
"blsmsk rax, rbx": {
Expand All @@ -80,14 +76,13 @@
]
},
"blsi eax, ebx": {
"ExpectedInstructionCount": 9,
"ExpectedInstructionCount": 8,
"Comment": [
"Map group 17 0b011 32-bit"
],
"ExpectedArm64ASM": [
"mov w20, w7",
"neg w21, w20",
"and w4, w20, w21",
"neg w20, w7",
"and w4, w7, w20",
"tst w4, w4",
"mrs x20, nzcv",
"cmp x4, #0x0 (0)",
Expand Down

0 comments on commit fa33520

Please sign in to comment.