@@ -11253,29 +11253,45 @@ void CodeGen::genZeroInitFrameUsingBlockInit(int untrLclHi, int untrLclLo, regNu
11253
11253
// movups xmmword ptr [ebp/esp-OFFS], xmm4
11254
11254
// ...
11255
11255
// movups xmmword ptr [ebp/esp-OFFS], xmm4
11256
- // mov qword ptr [ebp/esp-OFFS], rax
11257
- //
11256
+
11258
11257
// NOTE: it implicitly zeroes YMM4 and ZMM4 as well.
11259
11258
emit->emitIns_SIMD_R_R_R (INS_xorps, EA_16BYTE, zeroSIMDReg, zeroSIMDReg, zeroSIMDReg, INS_OPTS_NONE);
11260
11259
11261
- int i = 0 ;
11262
- if (maxSimdSize > XMM_REGSIZE_BYTES)
11260
+ assert ((blkSize % XMM_REGSIZE_BYTES) == 0 );
11261
+
11262
+ int regSize = (int )compiler->roundDownSIMDSize (blkSize);
11263
+ int lenRemaining = blkSize;
11264
+ while (lenRemaining > 0 )
11263
11265
{
11264
- for (; i <= blkSize - maxSimdSize; i += maxSimdSize)
11266
+ // Overlap with the previously zeroed memory if we can clear the remainder
11267
+ // with just a single store. Example: say we have 112 bytes to clear:
11268
+ //
11269
+ // Option 1 (no overlapping):
11270
+ // movups zmmword ptr [+0]
11271
+ // movups ymmword ptr [+64]
11272
+ // movups xmmword ptr [+96]
11273
+ //
11274
+ // Option 2 (overlapping):
11275
+ // movups zmmword ptr [+0]
11276
+ // movups zmmword ptr [+48]
11277
+ //
11278
+ if ((regSize > lenRemaining) && !isPow2 (lenRemaining))
11265
11279
{
11266
- // We previously aligned data to 16 bytes which might not be aligned to maxSimdSize
11267
- emit->emitIns_AR_R (simdUnalignedMovIns (), EA_ATTR (maxSimdSize), zeroSIMDReg, frameReg,
11268
- alignedLclLo + i);
11280
+ lenRemaining = regSize;
11269
11281
}
11270
- // Remainder will be handled by the xmm loop below
11271
- }
11272
11282
11273
- for (; i < blkSize; i += XMM_REGSIZE_BYTES)
11274
- {
11275
- emit->emitIns_AR_R (simdMov, EA_ATTR (XMM_REGSIZE_BYTES), zeroSIMDReg, frameReg, alignedLclLo + i);
11276
- }
11283
+ // Use the largest SIMD register size that fits in the remaining length
11284
+ regSize = (int )compiler->roundDownSIMDSize (lenRemaining);
11285
+ assert (regSize >= XMM_REGSIZE_BYTES);
11286
+
11287
+ // frameReg is definitely not known to be 32B/64B aligned -> switch to unaligned movs
11288
+ instruction ins = regSize > XMM_REGSIZE_BYTES ? simdUnalignedMovIns () : simdMov;
11289
+ const int offset = blkSize - lenRemaining;
11290
+ emit->emitIns_AR_R (ins, EA_ATTR (regSize), zeroSIMDReg, frameReg, alignedLclLo + offset);
11277
11291
11278
- assert (i == blkSize);
11292
+ lenRemaining -= regSize;
11293
+ }
11294
+ assert (lenRemaining == 0 );
11279
11295
}
11280
11296
else
11281
11297
{
0 commit comments