Skip to content

Commit 97d1fc2

Browse files
authored
Switch to ymm after zmm in genZeroInitFrameUsingBlockInit (#115981)
1 parent 2486c13 commit 97d1fc2

File tree

1 file changed

+31
-15
lines changed

1 file changed

+31
-15
lines changed

src/coreclr/jit/codegenxarch.cpp

Lines changed: 31 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -11253,29 +11253,45 @@ void CodeGen::genZeroInitFrameUsingBlockInit(int untrLclHi, int untrLclLo, regNu
1125311253
// movups xmmword ptr [ebp/esp-OFFS], xmm4
1125411254
// ...
1125511255
// movups xmmword ptr [ebp/esp-OFFS], xmm4
11256-
// mov qword ptr [ebp/esp-OFFS], rax
11257-
//
11256+
1125811257
// NOTE: it implicitly zeroes YMM4 and ZMM4 as well.
1125911258
emit->emitIns_SIMD_R_R_R(INS_xorps, EA_16BYTE, zeroSIMDReg, zeroSIMDReg, zeroSIMDReg, INS_OPTS_NONE);
1126011259

11261-
int i = 0;
11262-
if (maxSimdSize > XMM_REGSIZE_BYTES)
11260+
assert((blkSize % XMM_REGSIZE_BYTES) == 0);
11261+
11262+
int regSize = (int)compiler->roundDownSIMDSize(blkSize);
11263+
int lenRemaining = blkSize;
11264+
while (lenRemaining > 0)
1126311265
{
11264-
for (; i <= blkSize - maxSimdSize; i += maxSimdSize)
11266+
// Overlap with the previously zeroed memory if we can clear the remainder
11267+
// with just a single store. Example: say we have 112 bytes to clear:
11268+
//
11269+
// Option 1 (no overlapping):
11270+
// movups zmmword ptr [+0]
11271+
// movups ymmword ptr [+64]
11272+
// movups xmmword ptr [+96]
11273+
//
11274+
// Option 2 (overlapping):
11275+
// movups zmmword ptr [+0]
11276+
// movups zmmword ptr [+48]
11277+
//
11278+
if ((regSize > lenRemaining) && !isPow2(lenRemaining))
1126511279
{
11266-
// We previously aligned data to 16 bytes which might not be aligned to maxSimdSize
11267-
emit->emitIns_AR_R(simdUnalignedMovIns(), EA_ATTR(maxSimdSize), zeroSIMDReg, frameReg,
11268-
alignedLclLo + i);
11280+
lenRemaining = regSize;
1126911281
}
11270-
// Remainder will be handled by the xmm loop below
11271-
}
1127211282

11273-
for (; i < blkSize; i += XMM_REGSIZE_BYTES)
11274-
{
11275-
emit->emitIns_AR_R(simdMov, EA_ATTR(XMM_REGSIZE_BYTES), zeroSIMDReg, frameReg, alignedLclLo + i);
11276-
}
11283+
// Use the largest SIMD register size that fits in the remaining length
11284+
regSize = (int)compiler->roundDownSIMDSize(lenRemaining);
11285+
assert(regSize >= XMM_REGSIZE_BYTES);
11286+
11287+
// frameReg is not guaranteed to be 32B/64B aligned -> switch to unaligned movs
11288+
instruction ins = regSize > XMM_REGSIZE_BYTES ? simdUnalignedMovIns() : simdMov;
11289+
const int offset = blkSize - lenRemaining;
11290+
emit->emitIns_AR_R(ins, EA_ATTR(regSize), zeroSIMDReg, frameReg, alignedLclLo + offset);
1127711291

11278-
assert(i == blkSize);
11292+
lenRemaining -= regSize;
11293+
}
11294+
assert(lenRemaining == 0);
1127911295
}
1128011296
else
1128111297
{

0 commit comments

Comments
 (0)