Skip to content

Commit

Permalink
Merge pull request #3766 from alyssarosenzweig/opt/f16c-round
Browse files Browse the repository at this point in the history
Optimize vcvtps2ph
  • Loading branch information
Sonicadvance1 committed Jun 26, 2024
2 parents 6226c7f + cf834aa commit 756fa2e
Show file tree
Hide file tree
Showing 4 changed files with 74 additions and 149 deletions.
33 changes: 33 additions & 0 deletions FEXCore/Source/Interface/Core/JIT/Arm64/MiscOps.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,39 @@ DEF_OP(SetRoundingMode) {
msr(ARMEmitter::SystemRegister::FPCR, TMP1);
}

// Emits code that captures the current FPCR into the op's destination register
// and then installs a temporary rounding mode taken from the op's immediate
// RoundMode field. The captured FPCR is what the op returns, so a later
// PopRoundingMode can write it back unchanged.
DEF_OP(PushRoundingMode) {
  // Bit position of the two rounding-mode bits inside FPCR.
  constexpr int FPCR_RMODE_SHIFT = 22;

  auto Op = IROp->C<IR::IROp_PushRoundingMode>();
  auto SavedFPCR = GetReg(Node);

  // Snapshot the FPCR first; this happens even when the override below is skipped.
  mrs(SavedFPCR, ARMEmitter::SystemRegister::FPCR);

  // vixl simulator doesn't support anything beyond ties-to-even rounding,
  // so in that configuration only the snapshot is emitted.
  if (!CTX->Config.DisableVixlIndirectCalls) [[unlikely]] {
    return;
  }

  // Build the new FPCR in TMP1. The requested mode has its two bits reversed
  // relative to the FPCR field, so 0 and 3 map to themselves while 1 and 2
  // swap places.
  switch (Op->RoundMode) {
  case 0:
    // Both mode bits clear: a single mask is enough.
    and_(ARMEmitter::Size::i64Bit, TMP1, SavedFPCR, ~(3 << FPCR_RMODE_SHIFT));
    break;
  case 3:
    // Both mode bits set: a single or is enough.
    orr(ARMEmitter::Size::i64Bit, TMP1, SavedFPCR, 3 << FPCR_RMODE_SHIFT);
    break;
  default: {
    LOGMAN_THROW_AA_FMT(Op->RoundMode == 1 || Op->RoundMode == 2, "expect a valid round mode");

    // Exactly one of the two bits must end up set: clear the bit matching the
    // requested value, then set the opposite one.
    const auto ClearMask = ~(Op->RoundMode << FPCR_RMODE_SHIFT);
    const auto SetBits = (Op->RoundMode == 2 ? 1 : 2) << FPCR_RMODE_SHIFT;

    and_(ARMEmitter::Size::i64Bit, TMP1, SavedFPCR, ClearMask);
    orr(ARMEmitter::Size::i64Bit, TMP1, TMP1, SetBits);
    break;
  }
  }

  // Install the modified FPCR; the untouched copy stays in the destination register.
  msr(ARMEmitter::SystemRegister::FPCR, TMP1);
}

// Emits a single write of the op's FPCR argument back into the FPCR system
// register, undoing the override installed by a preceding PushRoundingMode.
DEF_OP(PopRoundingMode) {
  const auto SavedFPCR = GetReg(IROp->C<IR::IROp_PopRoundingMode>()->FPCR.ID());
  msr(ARMEmitter::SystemRegister::FPCR, SavedFPCR);
}

DEF_OP(Print) {
auto Op = IROp->C<IR::IROp_Print>();

Expand Down
7 changes: 2 additions & 5 deletions FEXCore/Source/Interface/Core/OpcodeDispatcher/Vector.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4493,13 +4493,10 @@ void OpDispatchBuilder::VCVTPS2PHOp(OpcodeArgs) {
// the RM field in the FPCR. And so! We have to do some ugly
// rounding mode shuffling.
const auto NewRMode = Imm8 & 0b11;

Ref OldRMode = _GetRoundingMode();
_SetRoundingMode(_Constant(NewRMode));
Ref SavedFPCR = _PushRoundingMode(NewRMode);

Result = _Vector_FToF(SrcSize, 2, Src, 4);

_SetRoundingMode(OldRMode);
_PopRoundingMode(SavedFPCR);
}

// We need to eliminate upper junk if we're storing into a register with
Expand Down
11 changes: 11 additions & 0 deletions FEXCore/Source/Interface/IR/IR.json
Original file line number Diff line number Diff line change
Expand Up @@ -231,6 +231,17 @@
],
"HasSideEffects": true
},
"GPR = PushRoundingMode u8:$RoundMode": {
"Desc": ["Override the current rounding mode options for the thread, returning old FPCR"
],
"DestSize": "8",
"HasSideEffects": true
},
"PopRoundingMode GPR:$FPCR": {
"Desc": ["Resets rounding mode after PushRoundingMode operation"
],
"HasSideEffects": true
},
"Print SSA:$Value": {
"HasSideEffects": true,
"Desc": ["Debug operation that prints an SSA value to the console",
Expand Down
172 changes: 28 additions & 144 deletions unittests/InstructionCountCI/VEX_map3.json
Original file line number Diff line number Diff line change
Expand Up @@ -3058,119 +3058,61 @@
]
},
"vcvtps2ph xmm0, xmm1, 00000000b": {
"ExpectedInstructionCount": 20,
"ExpectedInstructionCount": 5,
"Comment": [
"nearest rounding",
"Map 3 0b01 0x1D 128-bit"
],
"ExpectedArm64ASM": [
"mrs x20, fpcr",
"ubfx x20, x20, #22, #3",
"rbit w0, w20",
"bfi x20, x0, #30, #2",
"mov w21, #0x0",
"rbit w1, w21",
"lsr w1, w1, #30",
"mrs x0, fpcr",
"bfi x0, x1, #22, #2",
"lsr x1, x21, #2",
"bfi x0, x1, #24, #1",
"and x0, x20, #0xffffffffff3fffff",
"msr fpcr, x0",
"fcvtn v16.4h, v17.4s",
"rbit w1, w20",
"lsr w1, w1, #30",
"mrs x0, fpcr",
"bfi x0, x1, #22, #2",
"lsr x1, x20, #2",
"bfi x0, x1, #24, #1",
"msr fpcr, x0"
"msr fpcr, x20"
]
},
"vcvtps2ph xmm0, xmm1, 00000001b": {
"ExpectedInstructionCount": 20,
"ExpectedInstructionCount": 6,
"Comment": [
"-inf rounding",
"Map 3 0b01 0x1D 128-bit"
],
"ExpectedArm64ASM": [
"mrs x20, fpcr",
"ubfx x20, x20, #22, #3",
"rbit w0, w20",
"bfi x20, x0, #30, #2",
"mov w21, #0x1",
"rbit w1, w21",
"lsr w1, w1, #30",
"mrs x0, fpcr",
"bfi x0, x1, #22, #2",
"lsr x1, x21, #2",
"bfi x0, x1, #24, #1",
"and x0, x20, #0xffffffffffbfffff",
"orr x0, x0, #0x800000",
"msr fpcr, x0",
"fcvtn v16.4h, v17.4s",
"rbit w1, w20",
"lsr w1, w1, #30",
"mrs x0, fpcr",
"bfi x0, x1, #22, #2",
"lsr x1, x20, #2",
"bfi x0, x1, #24, #1",
"msr fpcr, x0"
"msr fpcr, x20"
]
},
"vcvtps2ph xmm0, xmm1, 00000010b": {
"ExpectedInstructionCount": 20,
"ExpectedInstructionCount": 6,
"Comment": [
"+inf rounding",
"Map 3 0b01 0x1D 128-bit"
],
"ExpectedArm64ASM": [
"mrs x20, fpcr",
"ubfx x20, x20, #22, #3",
"rbit w0, w20",
"bfi x20, x0, #30, #2",
"mov w21, #0x2",
"rbit w1, w21",
"lsr w1, w1, #30",
"mrs x0, fpcr",
"bfi x0, x1, #22, #2",
"lsr x1, x21, #2",
"bfi x0, x1, #24, #1",
"and x0, x20, #0xffffffffff7fffff",
"orr x0, x0, #0x400000",
"msr fpcr, x0",
"fcvtn v16.4h, v17.4s",
"rbit w1, w20",
"lsr w1, w1, #30",
"mrs x0, fpcr",
"bfi x0, x1, #22, #2",
"lsr x1, x20, #2",
"bfi x0, x1, #24, #1",
"msr fpcr, x0"
"msr fpcr, x20"
]
},
"vcvtps2ph xmm0, xmm1, 00000011b": {
"ExpectedInstructionCount": 20,
"ExpectedInstructionCount": 5,
"Comment": [
"truncate rounding",
"Map 3 0b01 0x1D 128-bit"
],
"ExpectedArm64ASM": [
"mrs x20, fpcr",
"ubfx x20, x20, #22, #3",
"rbit w0, w20",
"bfi x20, x0, #30, #2",
"mov w21, #0x3",
"rbit w1, w21",
"lsr w1, w1, #30",
"mrs x0, fpcr",
"bfi x0, x1, #22, #2",
"lsr x1, x21, #2",
"bfi x0, x1, #24, #1",
"orr x0, x20, #0xc00000",
"msr fpcr, x0",
"fcvtn v16.4h, v17.4s",
"rbit w1, w20",
"lsr w1, w1, #30",
"mrs x0, fpcr",
"bfi x0, x1, #22, #2",
"lsr x1, x20, #2",
"bfi x0, x1, #24, #1",
"msr fpcr, x0"
"msr fpcr, x20"
]
},
"vcvtps2ph xmm0, xmm1, 00000100b": {
Expand All @@ -3184,126 +3126,68 @@
]
},
"vcvtps2ph xmm0, ymm1, 00000000b": {
"ExpectedInstructionCount": 22,
"ExpectedInstructionCount": 7,
"Comment": [
"nearest rounding",
"Map 3 0b01 0x1D 256-bit"
],
"ExpectedArm64ASM": [
"mrs x20, fpcr",
"ubfx x20, x20, #22, #3",
"rbit w0, w20",
"bfi x20, x0, #30, #2",
"mov w21, #0x0",
"rbit w1, w21",
"lsr w1, w1, #30",
"mrs x0, fpcr",
"bfi x0, x1, #22, #2",
"lsr x1, x21, #2",
"bfi x0, x1, #24, #1",
"and x0, x20, #0xffffffffff3fffff",
"msr fpcr, x0",
"fcvtnt z2.h, p7/m, z17.s",
"uzp2 z2.h, z2.h, z2.h",
"rbit w1, w20",
"lsr w1, w1, #30",
"mrs x0, fpcr",
"bfi x0, x1, #22, #2",
"lsr x1, x20, #2",
"bfi x0, x1, #24, #1",
"msr fpcr, x0",
"msr fpcr, x20",
"mov v16.16b, v2.16b"
]
},
"vcvtps2ph xmm0, ymm1, 00000001b": {
"ExpectedInstructionCount": 22,
"ExpectedInstructionCount": 8,
"Comment": [
"-inf rounding",
"Map 3 0b01 0x1D 256-bit"
],
"ExpectedArm64ASM": [
"mrs x20, fpcr",
"ubfx x20, x20, #22, #3",
"rbit w0, w20",
"bfi x20, x0, #30, #2",
"mov w21, #0x1",
"rbit w1, w21",
"lsr w1, w1, #30",
"mrs x0, fpcr",
"bfi x0, x1, #22, #2",
"lsr x1, x21, #2",
"bfi x0, x1, #24, #1",
"and x0, x20, #0xffffffffffbfffff",
"orr x0, x0, #0x800000",
"msr fpcr, x0",
"fcvtnt z2.h, p7/m, z17.s",
"uzp2 z2.h, z2.h, z2.h",
"rbit w1, w20",
"lsr w1, w1, #30",
"mrs x0, fpcr",
"bfi x0, x1, #22, #2",
"lsr x1, x20, #2",
"bfi x0, x1, #24, #1",
"msr fpcr, x0",
"msr fpcr, x20",
"mov v16.16b, v2.16b"
]
},
"vcvtps2ph xmm0, ymm1, 00000010b": {
"ExpectedInstructionCount": 22,
"ExpectedInstructionCount": 8,
"Comment": [
"+inf rounding",
"Map 3 0b01 0x1D 256-bit"
],
"ExpectedArm64ASM": [
"mrs x20, fpcr",
"ubfx x20, x20, #22, #3",
"rbit w0, w20",
"bfi x20, x0, #30, #2",
"mov w21, #0x2",
"rbit w1, w21",
"lsr w1, w1, #30",
"mrs x0, fpcr",
"bfi x0, x1, #22, #2",
"lsr x1, x21, #2",
"bfi x0, x1, #24, #1",
"and x0, x20, #0xffffffffff7fffff",
"orr x0, x0, #0x400000",
"msr fpcr, x0",
"fcvtnt z2.h, p7/m, z17.s",
"uzp2 z2.h, z2.h, z2.h",
"rbit w1, w20",
"lsr w1, w1, #30",
"mrs x0, fpcr",
"bfi x0, x1, #22, #2",
"lsr x1, x20, #2",
"bfi x0, x1, #24, #1",
"msr fpcr, x0",
"msr fpcr, x20",
"mov v16.16b, v2.16b"
]
},
"vcvtps2ph xmm0, ymm1, 00000011b": {
"ExpectedInstructionCount": 22,
"ExpectedInstructionCount": 7,
"Comment": [
"truncate rounding",
"Map 3 0b01 0x1D 256-bit"
],
"ExpectedArm64ASM": [
"mrs x20, fpcr",
"ubfx x20, x20, #22, #3",
"rbit w0, w20",
"bfi x20, x0, #30, #2",
"mov w21, #0x3",
"rbit w1, w21",
"lsr w1, w1, #30",
"mrs x0, fpcr",
"bfi x0, x1, #22, #2",
"lsr x1, x21, #2",
"bfi x0, x1, #24, #1",
"orr x0, x20, #0xc00000",
"msr fpcr, x0",
"fcvtnt z2.h, p7/m, z17.s",
"uzp2 z2.h, z2.h, z2.h",
"rbit w1, w20",
"lsr w1, w1, #30",
"mrs x0, fpcr",
"bfi x0, x1, #22, #2",
"lsr x1, x20, #2",
"bfi x0, x1, #24, #1",
"msr fpcr, x0",
"msr fpcr, x20",
"mov v16.16b, v2.16b"
]
},
Expand Down

0 comments on commit 756fa2e

Please sign in to comment.