Skip to content

Commit 13c30d4

Browse files
khushal1996Ruihan-Yinkunalspathak
authored
Unmasking EGPRs in register allocator (#114867)
* unmask HWIntrinsic node. * IMUL, GT_INDEX_ADDR * GT_CKFINITE GT_RETURNTRAP GT_SWITCH_TABLE GT_JMPTABLE genFnProlog * Complete BuildHWIntrinsics * Let genFnProlog have EGPR when APX and EVEX are both available. * Indir * clean-up in lsrabuild * Enable egprs in cast nodes * unmask BuildIntrinsic * unmask BuildShiftRotate * Unmask BlockStore * run formatting * Use regMask when looking for srcReg candidate in BlkStore * Use EGPR for NI_System_Math_Abs only and remove EGPR usage from dstReg * Enable asserts for additional 8 reg * Resolve error * Apply suggestions from code review Co-authored-by: Kunal Pathak <Kunal.Pathak@microsoft.com> * Unmask remaining instructions and cache evexIsSupported and canUseApxRegs * revert changes in SetContainsAVXFlags * Add APX bypass Knob * Add Instruction Count reporting in superpmi * remove debug comment * Revert "revert changes in SetContainsAVXFlags" This reverts commit 34c693f. * Revert "Add APX bypass Knob" This reverts commit 4be2048. * revert changes in SetContainsAVXFlags * Revert "Add Instruction Count reporting in superpmi" This reverts commit c0793c3. * Narrow down the cases where EGPRs cannot be alloted * Run formatting and remove stale comments * Move Apx Checks to Evex checks for egpr mask * Mask Shift Rotate nodes when evex is not available * Resolve code style comments --------- Co-authored-by: Ruihan-Yin <ruihan.yin@intel.com> Co-authored-by: Kunal Pathak <Kunal.Pathak@microsoft.com>
1 parent 4fea27c commit 13c30d4

File tree

9 files changed

+365
-242
lines changed

9 files changed

+365
-242
lines changed

src/coreclr/jit/codegencommon.cpp

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5044,8 +5044,11 @@ void CodeGen::genFnProlog()
50445044
bool initRegZeroed = false;
50455045
regMaskTP excludeMask = intRegState.rsCalleeRegArgMaskLiveIn;
50465046
#if defined(TARGET_AMD64)
5047-
// TODO-Xarch-apx : Revert. Excluding eGPR so that it's not used for non REX2 supported movs.
5048-
excludeMask = excludeMask | RBM_HIGHINT;
5047+
// we'd require eEVEX present to enable EGPRs in HWIntrinsics.
5048+
if (!compiler->canUseEvexEncoding())
5049+
{
5050+
excludeMask = excludeMask | RBM_HIGHINT;
5051+
}
50495052
#endif // !defined(TARGET_AMD64)
50505053

50515054
#ifdef TARGET_ARM

src/coreclr/jit/codegenxarch.cpp

Lines changed: 44 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -756,7 +756,30 @@ void CodeGen::genCodeForBswap(GenTree* tree)
756756
}
757757
else
758758
{
759-
GetEmitter()->emitInsBinary(INS_movbe, emitTypeSize(operand), tree, operand);
759+
instruction ins = INS_movbe;
760+
#ifdef TARGET_AMD64
761+
bool needsEvex = false;
762+
763+
if (GetEmitter()->IsExtendedGPReg(tree->GetRegNum()))
764+
{
765+
needsEvex = true;
766+
}
767+
else if (operand->isIndir())
768+
{
769+
GenTreeIndir* indir = operand->AsIndir();
770+
if (indir->HasBase() && GetEmitter()->IsExtendedGPReg(indir->Base()->GetRegNum()))
771+
{
772+
needsEvex = true;
773+
}
774+
else if (indir->HasIndex() && GetEmitter()->IsExtendedGPReg(indir->Index()->GetRegNum()))
775+
{
776+
needsEvex = true;
777+
}
778+
}
779+
780+
ins = needsEvex ? INS_movbe_apx : INS_movbe;
781+
#endif
782+
GetEmitter()->emitInsBinary(ins, emitTypeSize(operand), tree, operand);
760783
}
761784

762785
if (tree->OperIs(GT_BSWAP16) && !genCanOmitNormalizationForBswap16(tree))
@@ -5679,6 +5702,22 @@ void CodeGen::genCodeForStoreInd(GenTreeStoreInd* tree)
56795702
if (data->OperIs(GT_BSWAP, GT_BSWAP16))
56805703
{
56815704
ins = INS_movbe;
5705+
#ifdef TARGET_AMD64
5706+
bool needsEvex = false;
5707+
if (GetEmitter()->IsExtendedGPReg(data->gtGetOp1()->GetRegNum()))
5708+
{
5709+
needsEvex = true;
5710+
}
5711+
else if (tree->HasBase() && GetEmitter()->IsExtendedGPReg(tree->Base()->GetRegNum()))
5712+
{
5713+
needsEvex = true;
5714+
}
5715+
else if (tree->HasIndex() && GetEmitter()->IsExtendedGPReg(tree->Index()->GetRegNum()))
5716+
{
5717+
needsEvex = true;
5718+
}
5719+
ins = needsEvex ? INS_movbe_apx : INS_movbe;
5720+
#endif // TARGET_AMD64
56825721
}
56835722
#if defined(FEATURE_HW_INTRINSICS)
56845723
else if (data->OperIsHWIntrinsic())
@@ -7331,9 +7370,11 @@ void CodeGen::genIntToFloatCast(GenTree* treeNode)
73317370
// addsd xmm0, xmm0
73327371
//.LABEL
73337372
//
7334-
regNumber argReg = treeNode->gtGetOp1()->GetRegNum();
7335-
regNumber tmpReg1 = internalRegisters.Extract(treeNode);
7373+
regNumber argReg = treeNode->gtGetOp1()->GetRegNum();
7374+
// Get the APXIncompatible register first
73367375
regNumber tmpReg2 = internalRegisters.Extract(treeNode);
7376+
// tmpReg1 can be EGPR
7377+
regNumber tmpReg1 = internalRegisters.Extract(treeNode);
73377378

73387379
inst_Mov(TYP_LONG, tmpReg1, argReg, /* canSkip */ false, EA_8BYTE);
73397380
inst_RV_SH(INS_shr, EA_8BYTE, tmpReg1, 1);

src/coreclr/jit/emitinl.h

Lines changed: 18 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ inline bool emitter::instrIs3opImul(instruction ins)
1313
#ifdef TARGET_X86
1414
return ((ins >= INS_imul_AX) && (ins <= INS_imul_DI));
1515
#else // TARGET_AMD64
16-
return ((ins >= INS_imul_AX) && (ins <= INS_imul_15));
16+
return ((ins >= INS_imul_AX) && (ins <= INS_imul_31));
1717
#endif
1818
}
1919

@@ -23,7 +23,7 @@ inline bool emitter::instrIsExtendedReg3opImul(instruction ins)
2323
#ifdef TARGET_X86
2424
return false;
2525
#else // TARGET_AMD64
26-
return ((ins >= INS_imul_08) && (ins <= INS_imul_15));
26+
return ((ins >= INS_imul_08) && (ins <= INS_imul_31));
2727
#endif
2828
}
2929

@@ -55,6 +55,22 @@ inline void emitter::check3opImulValues()
5555
assert(INS_imul_13 - INS_imul_AX == REG_R13);
5656
assert(INS_imul_14 - INS_imul_AX == REG_R14);
5757
assert(INS_imul_15 - INS_imul_AX == REG_R15);
58+
assert(INS_imul_16 - INS_imul_AX == REG_R16);
59+
assert(INS_imul_17 - INS_imul_AX == REG_R17);
60+
assert(INS_imul_18 - INS_imul_AX == REG_R18);
61+
assert(INS_imul_19 - INS_imul_AX == REG_R19);
62+
assert(INS_imul_20 - INS_imul_AX == REG_R20);
63+
assert(INS_imul_21 - INS_imul_AX == REG_R21);
64+
assert(INS_imul_22 - INS_imul_AX == REG_R22);
65+
assert(INS_imul_23 - INS_imul_AX == REG_R23);
66+
assert(INS_imul_24 - INS_imul_AX == REG_R24);
67+
assert(INS_imul_25 - INS_imul_AX == REG_R25);
68+
assert(INS_imul_26 - INS_imul_AX == REG_R26);
69+
assert(INS_imul_27 - INS_imul_AX == REG_R27);
70+
assert(INS_imul_28 - INS_imul_AX == REG_R28);
71+
assert(INS_imul_29 - INS_imul_AX == REG_R29);
72+
assert(INS_imul_30 - INS_imul_AX == REG_R30);
73+
assert(INS_imul_31 - INS_imul_AX == REG_R31);
5874
#endif
5975
}
6076

src/coreclr/jit/emitxarch.cpp

Lines changed: 49 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -402,7 +402,7 @@ bool emitter::IsApxExtendedEvexInstruction(instruction ins) const
402402
return true;
403403
}
404404

405-
if (ins == INS_crc32_apx)
405+
if (ins == INS_crc32_apx || ins == INS_movbe_apx)
406406
{
407407
// With the new opcode, CRC32 is promoted to EVEX with APX.
408408
return true;
@@ -1102,6 +1102,22 @@ bool emitter::emitIsInstrWritingToReg(instrDesc* id, regNumber reg)
11021102
case INS_imul_13:
11031103
case INS_imul_14:
11041104
case INS_imul_15:
1105+
case INS_imul_16:
1106+
case INS_imul_17:
1107+
case INS_imul_18:
1108+
case INS_imul_19:
1109+
case INS_imul_20:
1110+
case INS_imul_21:
1111+
case INS_imul_22:
1112+
case INS_imul_23:
1113+
case INS_imul_24:
1114+
case INS_imul_25:
1115+
case INS_imul_26:
1116+
case INS_imul_27:
1117+
case INS_imul_28:
1118+
case INS_imul_29:
1119+
case INS_imul_30:
1120+
case INS_imul_31:
11051121
#endif // TARGET_AMD64
11061122
if (reg == inst3opImulReg(ins))
11071123
{
@@ -1879,6 +1895,12 @@ bool emitter::TakesRex2Prefix(const instrDesc* id) const
18791895
return true;
18801896
}
18811897

1898+
if (ins >= INS_imul_16 && ins <= INS_imul_31)
1899+
{
1900+
// The instructions have implicit use of EGPRs.
1901+
return true;
1902+
}
1903+
18821904
#if defined(DEBUG)
18831905
if (emitComp->DoJitStressRex2Encoding())
18841906
{
@@ -1931,7 +1953,7 @@ bool emitter::TakesApxExtendedEvexPrefix(const instrDesc* id) const
19311953
return true;
19321954
}
19331955

1934-
if (ins == INS_crc32_apx)
1956+
if (ins == INS_crc32_apx || ins == INS_movbe_apx)
19351957
{
19361958
return true;
19371959
}
@@ -6085,7 +6107,11 @@ void emitter::emitInsStoreInd(instruction ins, emitAttr attr, GenTreeStoreInd* m
60856107

60866108
if (data->OperIs(GT_BSWAP, GT_BSWAP16) && data->isContained())
60876109
{
6110+
#ifdef TARGET_AMD64
6111+
assert(ins == INS_movbe || ins == INS_movbe_apx);
6112+
#else
60886113
assert(ins == INS_movbe);
6114+
#endif
60896115
data = data->gtGetOp1();
60906116
}
60916117

@@ -14701,7 +14727,7 @@ BYTE* emitter::emitOutputAM(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc)
1470114727
break;
1470214728
}
1470314729
#ifdef TARGET_AMD64
14704-
if (ins == INS_crc32_apx)
14730+
if (ins == INS_crc32_apx || ins == INS_movbe_apx)
1470514731
{
1470614732
code |= (insEncodeReg345(id, id->idReg1(), size, &code) << 8);
1470714733
}
@@ -15586,7 +15612,7 @@ BYTE* emitter::emitOutputSV(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc)
1558615612
break;
1558715613
}
1558815614
#ifdef TARGET_AMD64
15589-
if (ins == INS_crc32_apx)
15615+
if (ins == INS_crc32_apx || ins == INS_movbe_apx)
1559015616
{
1559115617
// The promoted CRC32 is in 1-byte opcode, unlike other instructions on this path, the register encoding for
1559215618
// CRC32 need to be done here.
@@ -20427,6 +20453,22 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins
2042720453
case INS_imul_13:
2042820454
case INS_imul_14:
2042920455
case INS_imul_15:
20456+
case INS_imul_16:
20457+
case INS_imul_17:
20458+
case INS_imul_18:
20459+
case INS_imul_19:
20460+
case INS_imul_20:
20461+
case INS_imul_21:
20462+
case INS_imul_22:
20463+
case INS_imul_23:
20464+
case INS_imul_24:
20465+
case INS_imul_25:
20466+
case INS_imul_26:
20467+
case INS_imul_27:
20468+
case INS_imul_28:
20469+
case INS_imul_29:
20470+
case INS_imul_30:
20471+
case INS_imul_31:
2043020472
#endif // TARGET_AMD64
2043120473
case INS_imul:
2043220474
result.insThroughput = PERFSCORE_THROUGHPUT_1C;
@@ -21941,6 +21983,9 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins
2194121983
}
2194221984

2194321985
case INS_movbe:
21986+
#ifdef TARGET_AMD64
21987+
case INS_movbe_apx:
21988+
#endif
2194421989
if (memAccessKind == PERFSCORE_MEMORY_READ)
2194521990
{
2194621991
result.insThroughput = PERFSCORE_THROUGHPUT_2X;

0 commit comments

Comments
 (0)