Skip to content

Improve Math.BigMul on x64 by adding new internal Multiply hardware intrinsic to X86Base #115966

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 20 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions src/coreclr/jit/gentree.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -28520,8 +28520,10 @@ ClassLayout* GenTreeHWIntrinsic::GetLayout(Compiler* compiler) const
{
#ifdef TARGET_XARCH
case NI_X86Base_DivRem:
case NI_X86Base_BigMul:
return compiler->typGetBlkLayout(genTypeSize(GetSimdBaseType()) * 2);
case NI_X86Base_X64_DivRem:
case NI_X86Base_X64_BigMul:
return compiler->typGetBlkLayout(16);
#endif // TARGET_XARCH
#ifdef TARGET_ARM64
Expand Down
2 changes: 2 additions & 0 deletions src/coreclr/jit/hwintrinsic.h
Original file line number Diff line number Diff line change
Expand Up @@ -890,6 +890,8 @@ struct HWIntrinsicInfo
#ifdef TARGET_XARCH
case NI_X86Base_DivRem:
case NI_X86Base_X64_DivRem:
case NI_X86Base_BigMul:
case NI_X86Base_X64_BigMul:
return 2;
#endif // TARGET_XARCH

Expand Down
54 changes: 54 additions & 0 deletions src/coreclr/jit/hwintrinsiccodegenxarch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2386,6 +2386,60 @@ void CodeGen::genX86BaseIntrinsic(GenTreeHWIntrinsic* node, insOpts instOptions)
break;
}

case NI_X86Base_BigMul:
case NI_X86Base_X64_BigMul:
{
    // Widening multiply producing a two-register result (low half at reg index 0, high half at reg index 1).
    // Two encodings are used:
    //   * MULX      - unsigned only; implicit source in EDX/RDX, both destinations are free registers.
    //   * MUL/IMUL  - one-operand form; implicit source in EAX/RAX, result fixed to EDX:EAX / RDX:RAX.
    assert(node->GetOperandCount() == 2);
    assert(instOptions == INS_OPTS_NONE);
    assert(!node->Op(1)->isContained());

    // SIMD base type is from signature and can distinguish signed and unsigned
    var_types   targetType = node->GetSimdBaseType();
    GenTree*    regOp      = node->Op(1);
    GenTree*    rmOp       = node->Op(2);
    instruction ins        = HWIntrinsicInfo::lookupIns(intrinsicId, targetType, compiler);

    emitAttr attr = emitTypeSize(targetType);
    emitter* emit = GetEmitter();

    // Unsigned multiplication can use mulx on BMI2-capable CPUs.
    // This check must match the one register allocation uses when building this node
    // (InstructionSet_AVX2); otherwise codegen could emit MULX for a node whose registers were
    // constrained for the one-operand MUL form.
    if (ins == INS_mulEAX && compiler->compOpportunisticallyDependsOn(InstructionSet_AVX2))
    {
        // op1: EDX, op2: reg/mem (operand 3) => hiRes: (operand 1), lowReg: (operand 2)
        // The multiply is commutative, so if the reg/mem operand already lives in EDX use it as
        // the implicit source instead of overwriting it with the mov below.
        if (rmOp->isUsedFromReg() && rmOp->GetRegNum() == REG_EDX)
        {
            std::swap(rmOp, regOp);
        }

        // mov the first operand into implicit source operand EDX/RDX
        emit->emitIns_Mov(INS_mov, attr, REG_EDX, regOp->GetRegNum(), /* canSkip */ true);

        // emit MULX instruction
        regNumber lowReg = node->GetRegByIndex(0);
        regNumber hiReg  = node->GetRegByIndex(1);
        inst_RV_RV_TT(INS_mulx, attr, hiReg, lowReg, rmOp, /* isRMW */ false, INS_OPTS_NONE);
    }
    else
    {
        // Same commutative swap as above, but for the implicit EAX/RAX source of MUL/IMUL.
        if (rmOp->isUsedFromReg() && rmOp->GetRegNum() == REG_EAX)
        {
            std::swap(rmOp, regOp);
        }

        // op1: EAX, op2: reg/mem
        emit->emitIns_Mov(INS_mov, attr, REG_EAX, regOp->GetRegNum(), /* canSkip */ true);

        // emit the MUL/IMUL instruction
        emit->emitInsBinary(ins, attr, node, rmOp);

        // verify target registers are as expected
        assert(node->GetRegByIndex(0) == REG_EAX);
        assert(node->GetRegByIndex(1) == REG_EDX);
    }

    break;
}

case NI_X86Base_X64_ConvertScalarToVector128Double:
case NI_X86Base_X64_ConvertScalarToVector128Single:
{
Expand Down
4 changes: 3 additions & 1 deletion src/coreclr/jit/hwintrinsiclistxarch.h
Original file line number Diff line number Diff line change
Expand Up @@ -409,6 +409,7 @@ HARDWARE_INTRINSIC(X86Base, AddScalar,
HARDWARE_INTRINSIC(X86Base, And, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_pandd, INS_pandd, INS_pandd, INS_pandd, INS_andps, INS_andpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_NormalizeSmallTypeToInt)
HARDWARE_INTRINSIC(X86Base, AndNot, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_pandnd, INS_pandnd, INS_pandnd, INS_pandnd, INS_andnps, INS_andnpd}, HW_Category_SimpleSIMD, HW_Flag_SpecialImport|HW_Flag_NormalizeSmallTypeToInt)
HARDWARE_INTRINSIC(X86Base, Average, 16, 2, {INS_invalid, INS_pavgb, INS_invalid, INS_pavgw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative)
HARDWARE_INTRINSIC(X86Base, BigMul, 0, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_imulEAX, INS_mulEAX, INS_imulEAX, INS_mulEAX, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_BaseTypeFromSecondArg|HW_Flag_MultiReg|HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_RmwIntrinsic|HW_Flag_Commutative)
HARDWARE_INTRINSIC(X86Base, BitScanForward, 0, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_bsf, INS_bsf, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics)
HARDWARE_INTRINSIC(X86Base, BitScanReverse, 0, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_bsr, INS_bsr, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics)
HARDWARE_INTRINSIC(X86Base, CompareEqual, 16, 2, {INS_pcmpeqb, INS_pcmpeqb, INS_pcmpeqw, INS_pcmpeqw, INS_pcmpeqd, INS_pcmpeqd, INS_invalid, INS_invalid, INS_cmpps, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics)
Expand Down Expand Up @@ -528,7 +529,8 @@ HARDWARE_INTRINSIC(X86Base, Xor,
// {TYP_BYTE, TYP_UBYTE, TYP_SHORT, TYP_USHORT, TYP_INT, TYP_UINT, TYP_LONG, TYP_ULONG, TYP_FLOAT, TYP_DOUBLE}
// ***************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************
// 64-bit only Intrinsics for X86Base, SSE, SSE2
#define FIRST_NI_X86Base_X64 NI_X86Base_X64_BitScanForward
#define FIRST_NI_X86Base_X64 NI_X86Base_X64_BigMul
HARDWARE_INTRINSIC(X86Base_X64, BigMul, 0, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_imulEAX, INS_mulEAX, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_BaseTypeFromSecondArg|HW_Flag_MultiReg|HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_RmwIntrinsic|HW_Flag_Commutative)
HARDWARE_INTRINSIC(X86Base_X64, BitScanForward, 0, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_bsf, INS_bsf, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics)
HARDWARE_INTRINSIC(X86Base_X64, BitScanReverse, 0, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_bsr, INS_bsr, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics)
HARDWARE_INTRINSIC(X86Base_X64, ConvertScalarToVector128Double, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cvtsi2sd64, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_SpecialCodeGen|HW_Flag_BaseTypeFromSecondArg)
Expand Down
20 changes: 20 additions & 0 deletions src/coreclr/jit/hwintrinsicxarch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4210,6 +4210,26 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic,
break;
}

case NI_X86Base_BigMul:
case NI_X86Base_X64_BigMul:
{
assert(sig->numArgs == 2);
assert(HWIntrinsicInfo::IsMultiReg(intrinsic));
assert(retType == TYP_STRUCT);
assert(simdBaseJitType != CORINFO_TYPE_UNDEF);

op2 = impPopStack().val;
op1 = impPopStack().val;

GenTreeHWIntrinsic* multiplyIntrinsic = gtNewScalarHWIntrinsicNode(retType, op1, op2, intrinsic);

// Store the type from signature into SIMD base type for convenience
multiplyIntrinsic->SetSimdBaseJitType(simdBaseJitType);

retNode = impStoreMultiRegValueToVar(multiplyIntrinsic,
sig->retTypeSigClass DEBUGARG(CorInfoCallConvExtension::Managed));
break;
}
case NI_X86Base_CompareScalarGreaterThan:
case NI_X86Base_CompareScalarGreaterThanOrEqual:
case NI_X86Base_CompareScalarNotGreaterThan:
Expand Down
2 changes: 2 additions & 0 deletions src/coreclr/jit/lowerxarch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10496,6 +10496,8 @@ void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node)
break;
}

case NI_X86Base_BigMul:
case NI_X86Base_X64_BigMul:
case NI_AVX2_MultiplyNoFlags:
case NI_AVX2_X64_MultiplyNoFlags:
{
Expand Down
7 changes: 7 additions & 0 deletions src/coreclr/jit/lsrabuild.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -970,6 +970,13 @@ regMaskTP LinearScan::getKillSetForHWIntrinsic(GenTreeHWIntrinsic* node)
killMask = RBM_EDI;
break;

case NI_X86Base_BigMul:
case NI_X86Base_X64_BigMul:
// For the MUL instruction we produce RAX:RDX, effectively killing both; for MULX, RDX is an implicit register.
// See the comment above for the mask.
killMask = RBM_EDX;
break;

default:
// Leave killMask as RBM_NONE
break;
Expand Down
51 changes: 49 additions & 2 deletions src/coreclr/jit/lsraxarch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2481,6 +2481,51 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou
break;
}

case NI_X86Base_BigMul:
case NI_X86Base_X64_BigMul:
{
assert(numArgs == 2);
assert(dstCount == 2);
assert(isRMW);

if ((baseType == TYP_ULONG || baseType == TYP_UINT) &&
compiler->compOpportunisticallyDependsOn(InstructionSet_AVX2))
{
isRMW = false;

SingleTypeRegSet apxAwareRegCandidates =
ForceLowGprForApxIfNeeded(op2, RBM_NONE, canHWIntrinsicUseApxRegs);
// mulx, prefer op1 in EDX
SingleTypeRegSet op1RegCandidates = op2->isContained() ? SRBM_EDX : apxAwareRegCandidates;
srcCount = BuildOperandUses(op1, op1RegCandidates);
srcCount += BuildOperandUses(op2, apxAwareRegCandidates);

// result in any register
SingleTypeRegSet apxAwareDestCandidates =
ForceLowGprForApxIfNeeded(intrinsicTree, RBM_NONE, canHWIntrinsicUseApxRegs);
BuildDef(intrinsicTree, apxAwareDestCandidates, 0);
BuildDef(intrinsicTree, apxAwareDestCandidates, 1);
}
else // Signed multiply or normal unsigned multiply in one operand form
{
SingleTypeRegSet apxAwareRegCandidates =
ForceLowGprForApxIfNeeded(op2, RBM_NONE, canHWIntrinsicUseApxRegs);

// mulEAX always uses EAX as its implicit operand; if op2 is contained, we constrain op1 to EAX.
// Otherwise we do not fix it, since the second operand might be the one that ends up in EAX.
SingleTypeRegSet op1RegCandidates = op2->isContained() ? SRBM_EAX : apxAwareRegCandidates;
srcCount = BuildOperandUses(op1, op1RegCandidates);
srcCount += BuildOperandUses(op2, apxAwareRegCandidates);

// result put in EAX and EDX
BuildDef(intrinsicTree, SRBM_EAX, 0);
BuildDef(intrinsicTree, SRBM_EDX, 1);
}

buildUses = false;
break;
}

case NI_AVX2_MultiplyNoFlags:
case NI_AVX2_X64_MultiplyNoFlags:
{
Expand Down Expand Up @@ -2999,9 +3044,11 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou
}
else
{
// Currently dstCount = 2 is only used for DivRem, which has special constraints and is handled above
// Currently dstCount = 2 is only used for DivRem and BigMul, which have special constraints and are handled
Copy link
Contributor Author

@Daniel-Svensson Daniel-Svensson Jun 4, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Multiply is renamed to BigMul, can update comment later after review

// above
assert((dstCount == 0) ||
((dstCount == 2) && ((intrinsicId == NI_X86Base_DivRem) || (intrinsicId == NI_X86Base_X64_DivRem))));
((dstCount == 2) && ((intrinsicId == NI_X86Base_DivRem) || (intrinsicId == NI_X86Base_X64_DivRem) ||
(intrinsicId == NI_X86Base_BigMul) || (intrinsicId == NI_X86Base_X64_BigMul))));
}

*pDstCount = dstCount;
Expand Down
24 changes: 19 additions & 5 deletions src/libraries/System.Private.CoreLib/src/System/Math.cs
Original file line number Diff line number Diff line change
Expand Up @@ -171,7 +171,7 @@ public static long BigMul(int a, int b)
return ((long)a) * b;
}


#if !(TARGET_ARM64 || (TARGET_AMD64 && !MONO)) // BigMul 64*64 has high performance intrinsics on ARM64 and AMD64 (but not yet on MONO)
/// <summary>
/// Perform multiplication between 64 and 32 bit numbers, returning lower 64 bits in <paramref name="low"/>
/// </summary>
Expand All @@ -180,21 +180,18 @@ public static long BigMul(int a, int b)
[MethodImpl(MethodImplOptions.AggressiveInlining)]
internal static ulong BigMul(ulong a, uint b, out ulong low)
{
#if TARGET_64BIT
return Math.BigMul((ulong)a, (ulong)b, out low);
#else
ulong prodL = ((ulong)(uint)a) * b;
ulong prodH = (prodL >> 32) + (((ulong)(uint)(a >> 32)) * b);

low = ((prodH << 32) | (uint)prodL);
return (prodH >> 32);
#endif
}

/// <inheritdoc cref="BigMul(ulong, uint, out ulong)"/>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
internal static ulong BigMul(uint a, ulong b, out ulong low)
=> BigMul(b, a, out low);
#endif

/// <summary>Produces the full product of two unsigned 64-bit numbers.</summary>
/// <param name="a">The first number to multiply.</param>
Expand All @@ -205,13 +202,23 @@ internal static ulong BigMul(uint a, ulong b, out ulong low)
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static unsafe ulong BigMul(ulong a, ulong b, out ulong low)
{
#if !MONO // X64.BigMul is not yet implemented in MONO
// X86Base.X64.BigMul is more performant than Bmi2.X64.MultiplyNoFlags, which has performance issues (#11782),
// so we don't need a separate BMI2 path
if (X86Base.X64.IsSupported)
{
(low, ulong hi) = X86Base.X64.BigMul(a, b);
return hi;
}
#else
if (Bmi2.X64.IsSupported)
{
ulong tmp;
ulong high = Bmi2.X64.MultiplyNoFlags(a, b, &tmp);
low = tmp;
return high;
}
#endif
else if (ArmBase.Arm64.IsSupported)
{
low = a * b;
Expand Down Expand Up @@ -251,6 +258,13 @@ static ulong SoftwareFallback(ulong a, ulong b, out ulong low)
/// <returns>The high 64-bit of the product of the specified numbers.</returns>
public static long BigMul(long a, long b, out long low)
{
#if !MONO // Multiply is not yet implemented in MONO
if (X86Base.X64.IsSupported)
{
(low, long hi) = X86Base.X64.BigMul(a, b);
return hi;
}
#endif
if (ArmBase.Arm64.IsSupported)
{
low = a * b;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,23 @@ internal X64() { }
/// </summary>
[Experimental(Experimentals.X86BaseDivRemDiagId, UrlFormat = Experimentals.SharedUrlFormat)]
public static (long Quotient, long Remainder) DivRem(ulong lower, long upper, long divisor) { throw new PlatformNotSupportedException(); }

/// <summary>
/// <para>unsigned _umul128(unsigned __int64 Multiplier, unsigned __int64 Multiplicand, unsigned __int64 * HighProduct)</para>
/// <para> MUL reg/m64</para>
/// </summary>
/// <remarks>
/// <para>Its functionality is exposed in the public <see cref="Math" /> class.</para>
/// </remarks>
internal static (ulong Lower, ulong Upper) BigMul(ulong left, ulong right) { throw new PlatformNotSupportedException(); }

/// <summary>
/// <para> IMUL reg/m64</para>
/// </summary>
/// <remarks>
/// <para>Its functionality is exposed in the public <see cref="Math" /> class.</para>
/// </remarks>
internal static (long Lower, long Upper) BigMul(long left, long right) { throw new PlatformNotSupportedException(); }
}

/// <summary>
Expand Down Expand Up @@ -109,6 +126,28 @@ internal X64() { }
[Experimental(Experimentals.X86BaseDivRemDiagId, UrlFormat = Experimentals.SharedUrlFormat)]
public static (nint Quotient, nint Remainder) DivRem(nuint lower, nint upper, nint divisor) { throw new PlatformNotSupportedException(); }

/// <summary>
/// <para> MUL reg/m32</para>
/// </summary>
/// <remarks>
/// <para>Its functionality is exposed in the public <see cref="Math" /> class.</para>
/// </remarks>
internal static (uint Lower, uint Upper) BigMul(uint left, uint right) { throw new PlatformNotSupportedException(); }

/// <summary>
/// <para> IMUL reg/m32</para>
/// </summary>
/// <remarks>
/// <para>Its functionality is exposed in the public <see cref="Math" /> class.</para>
/// </remarks>
internal static (int Lower, int Upper) BigMul(int left, int right) { throw new PlatformNotSupportedException(); }

/// <summary> MUL reg/m</summary>
/// <remarks>
/// <para>Native-sized unsigned overload; declared to return a (Lower, Upper) tuple.</para>
/// <para>NOTE(review): presumably Lower/Upper are the low and high halves of the full product of <paramref name="left" /> and <paramref name="right" /> - confirm against the JIT intrinsic.</para>
/// <para>This definition unconditionally throws <see cref="PlatformNotSupportedException" />.</para>
/// </remarks>
internal static (nuint Lower, nuint Upper) BigMul(nuint left, nuint right) { throw new PlatformNotSupportedException(); }

/// <summary> IMUL reg/m</summary>
/// <remarks>
/// <para>Native-sized signed overload; declared to return a (Lower, Upper) tuple.</para>
/// <para>NOTE(review): presumably Lower/Upper are the low and high halves of the full product of <paramref name="left" /> and <paramref name="right" /> - confirm against the JIT intrinsic.</para>
/// <para>This definition unconditionally throws <see cref="PlatformNotSupportedException" />.</para>
/// </remarks>
internal static (nint Lower, nint Upper) BigMul(nint left, nint right) { throw new PlatformNotSupportedException(); }

/// <summary>
/// <para>void _mm_pause (void);</para>
/// <para> PAUSE</para>
Expand Down
Loading
Loading