
Improve Math.BigMul on x64 by adding new internal Multiply hardware intrinsic to X86Base #115966


Draft · wants to merge 20 commits into main

Conversation

@Daniel-Svensson (Contributor) commented on May 24, 2025:

The biggest improvements are for signed long and for platforms without BMI2.
A nice side effect is that the ReadyToRun code can now emit a simple mul instead of having to fall back to the 32-bit code.

This pull request introduces internal Multiply hardware intrinsics (NI_X86Base_Multiply and NI_X86Base_X64_Multiply) for the x86 and x64 architectures in the JIT compiler and calls them from Math.BigMul.

This improves the machine code for signed BigMul, which should fix #75594, based on the API shape suggested in #58263.
It can also help with implementing IntPtr.BigMul (#114731).
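
For context, a rough sketch of the shape this takes is shown below. The names, signatures and the software fallback here are illustrative assumptions modeled on the existing DivRem intrinsic, not the exact code in this PR (the internal method was later renamed to BigMul, see the comments further down):

```csharp
using System.Runtime.Intrinsics.X86;

// Sketch only: the signature is an assumption modeled on X86Base.X64.DivRem,
// not the final internal API added by this PR.
internal static class BigMulSketch
{
    // Hypothetical tuple-returning 64x64 -> 128-bit multiply that the JIT would
    // lower to a single MUL/IMUL (or MULX when BMI2 is enabled). Backed by the
    // software path here so the sketch runs anywhere.
    internal static (ulong Lower, ulong Upper) Multiply(ulong left, ulong right)
        => (left * right, SoftwareHigh64(left, right));

    // Math.BigMul(ulong, ulong, out ulong) could then dispatch roughly like this:
    public static ulong BigMul(ulong a, ulong b, out ulong low)
    {
        if (X86Base.X64.IsSupported)
        {
            (ulong lo, ulong hi) = Multiply(a, b);
            low = lo;
            return hi;
        }

        // Portable path: the "32-bit code" mentioned above, splitting each operand into halves.
        low = a * b;
        return SoftwareHigh64(a, b);
    }

    // High 64 bits of the full 128-bit product via 32-bit decomposition.
    private static ulong SoftwareHigh64(ulong a, ulong b)
    {
        ulong aLo = (uint)a, aHi = a >> 32;
        ulong bLo = (uint)b, bHi = b >> 32;
        ulong mid = aHi * bLo + ((aLo * bLo) >> 32);   // cross term plus carry from the low product
        ulong mid2 = aLo * bHi + (uint)mid;            // second cross term plus low half of mid
        return aHi * bHi + (mid >> 32) + (mid2 >> 32); // high 64 bits of the product
    }
}
```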

NOTES:

  • The code is heavily based on the DivRem code introduced in Implement DivRem intrinsic for X86 #66551 (I went through the current version of all the files touched there and tried to add similar code for multiply).

  • I did not do Mono; instead I tried to use conditional compilation to exclude the change from Mono (since it does not seem as straightforward there and I do not know how to test the various combinations). Also, it seems like Mono might already have special cases for BigMul.

  • I have not touched the JIT compiler before, so while the code executes and seems to work fine, I might have missed something.

  • Since it uses tuples, it shares some of the downsides of DivRem (especially on Windows), where extra temp variables and stack spills are introduced, so there might be a few scenarios where performance is slightly worse or the same. (There was some discussion in Consume DivRem intrinsics from Math.DivRem #82194.)

  • There might be other, better solutions, including handling Math.BigMul as an intrinsic in itself (or changing it to a pair of MUL/HI_MUL with some extra logic), but that would probably require too many new changes to the JIT for me to take on.

Examples of generated code

[MethodImpl(MethodImplOptions.NoInlining | MethodImplOptions.AggressiveOptimization)]
static void TestBigMul2(ref ulong x, ref ulong y)
{
    x = Math.BigMul(x, y, out y);
}

[MethodImpl(MethodImplOptions.NoInlining | MethodImplOptions.AggressiveOptimization)]
static void TestBigMul1(ref long x, ref long y)
{
    x = Math.BigMul(x, y, out y);
}

Produces the following

; Method Program:<<Main>$>g__TestBigMul2|0_1(byref,byref) (FullOpts)
G_M19919_IG01:  ;; offset=0x0000
						;; size=0 bbWeight=1 PerfScore 0.00

G_M19919_IG02:  ;; offset=0x0000
       mov      rax, qword ptr [rcx]
       mov      bword ptr [rsp+0x10], rdx
       mul      rdx:rax, qword ptr [rdx]
       mov      r8, bword ptr [rsp+0x10]
       mov      qword ptr [r8], rax
       mov      qword ptr [rcx], rdx
						;; size=22 bbWeight=1 PerfScore 12.00

G_M19919_IG03:  ;; offset=0x0016
       ret      
						;; size=1 bbWeight=1 PerfScore 1.00
; Total bytes of code: 23


; Method Program:<<Main>$>g__TestBigMul1|0_2(byref,byref) (FullOpts)
G_M20175_IG01:  ;; offset=0x0000
						;; size=0 bbWeight=1 PerfScore 0.00

G_M20175_IG02:  ;; offset=0x0000
       mov      rax, qword ptr [rcx]
       mov      bword ptr [rsp+0x10], rdx
       imul     rdx:rax, qword ptr [rdx]
       mov      r8, bword ptr [rsp+0x10]
       mov      qword ptr [r8], rax
       mov      qword ptr [rcx], rdx
						;; size=22 bbWeight=1 PerfScore 12.00

G_M20175_IG03:  ;; offset=0x0016
       ret      
						;; size=1 bbWeight=1 PerfScore 1.00
; Total bytes of code: 23

With BMI2 (mulx)

; Method Program:<<Main>$>g__TestBigMul2|0_5(byref,byref) (FullOpts)
G_M36427_IG01:  ;; offset=0x0000
       mov      bword ptr [rsp+0x10], rdx
						;; size=5 bbWeight=1 PerfScore 1.00

G_M36427_IG02:  ;; offset=0x0005
       mov      rdx, qword ptr [rcx]
       mov      rax, bword ptr [rsp+0x10]
       mulx     r8, rdx, qword ptr [rax]
       mov      qword ptr [rax], rdx
       mov      qword ptr [rcx], r8
						;; size=19 bbWeight=1 PerfScore 11.00

G_M36427_IG03:  ;; offset=0x0018
       ret      
						;; size=1 bbWeight=1 PerfScore 1.00
; Total bytes of code: 25
Code before
; Method Program:<<Main>$>g__TestBigMul2|0_5(byref,byref) (FullOpts)
G_M000_IG01:                ;; offset=0x0000
       push     rax
       mov      bword ptr [rsp+0x18], rdx

G_M000_IG02:                ;; offset=0x0006
       mov      rdx, qword ptr [rcx]
       mov      rax, bword ptr [rsp+0x18]
       mov      r8, qword ptr [rax]
       lea      r10, [rsp]
       mulx     rdx, r9, r8
       mov      qword ptr [r10], r9
       mov      r8, qword ptr [rsp]
       mov      qword ptr [rax], r8
       mov      qword ptr [rcx], rdx

G_M000_IG03:                ;; offset=0x0027
       add      rsp, 8
       ret      
; Total bytes of code: 44

; Assembly listing for method Program:<<Main>$>g__TestBigMul1|0_6(byref,byref) (FullOpts)
G_M000_IG01:                ;; offset=0x0000
       push     rax
 
G_M000_IG02:                ;; offset=0x0001
       mov      rax, qword ptr [rcx]
       mov      bword ptr [rsp+0x18], rdx
       mov      r8, qword ptr [rdx]
       lea      r10, [rsp]
       mov      rdx, rax
       mulx     rdx, r9, r8
       mov      qword ptr [r10], r9
       mov      r10, qword ptr [rsp]
       mov      r9, bword ptr [rsp+0x18]
       mov      qword ptr [r9], r10
       mov      r10, rax
       sar      r10, 63
       and      r10, r8
       sub      rdx, r10
       sar      r8, 63
       and      rax, r8
       sub      rdx, rax
       mov      qword ptr [rcx], rdx
 
G_M000_IG03:                ;; offset=0x0041
       add      rsp, 8
       ret      
 
; Total bytes of code 70
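
The sar/and/sub tail in the signed listing above is the standard correction for deriving the signed high half from the unsigned high product. A minimal sketch of the identity, written on top of the existing unsigned Math.BigMul overload (the helper name is mine):

```csharp
// Reinterpreting the operands as unsigned gives the same low half; the signed high
// half equals the unsigned high half minus b when a < 0 and minus a when b < 0,
// which is exactly what the sar/and/sub sequence above computes.
static long SignedBigMulSketch(long a, long b, out long low)
{
    ulong highUnsigned = Math.BigMul((ulong)a, (ulong)b, out ulong lowUnsigned);
    low = (long)lowUnsigned;                        // low 64 bits are identical
    return (long)highUnsigned - ((a >> 63) & b)     // a >> 63 is all-ones iff a < 0
                              - ((b >> 63) & a);    // b >> 63 is all-ones iff b < 0
}
```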
Further code samples with array access
static long TestBigMulArr2(long[] x, ref long y)
{
    return Math.BigMul(y, x[1], out y);
}

				
static void TestBigMulArr12(long[] x, ref long y)
{
    x[1] = Math.BigMul(y, x[1], out y);
}
		
; Method Program:<<Main>$>g__TestBigMulArr2|0_5(long[],byref):long (FullOpts)
G_M60604_IG01:  ;; offset=0x0000
       sub      rsp, 40
						;; size=4 bbWeight=1 PerfScore 0.25

G_M60604_IG02:  ;; offset=0x0004
       mov      bword ptr [rsp+0x38], rdx
       mov      rax, qword ptr [rdx]
       cmp      dword ptr [rcx+0x08], 1
       jbe      SHORT G_M60604_IG04
       imul     rdx:rax, qword ptr [rcx+0x18]
       mov      rcx, bword ptr [rsp+0x38]
       mov      qword ptr [rcx], rax
       mov      rax, rdx
						;; size=29 bbWeight=1 PerfScore 15.25

G_M60604_IG03:  ;; offset=0x0021
       add      rsp, 40
       ret      
						;; size=5 bbWeight=1 PerfScore 1.25
											
; Method Program:<<Main>$>g__TestBigMulArr12|0_6(long[],byref) (FullOpts)
G_M53374_IG01:  ;; offset=0x0000
       sub      rsp, 40
						;; size=4 bbWeight=1 PerfScore 0.25

G_M53374_IG02:  ;; offset=0x0004
       mov      bword ptr [rsp+0x38], rdx
       mov      rax, qword ptr [rdx]
       mov      r8d, dword ptr [rcx+0x08]
       cmp      r8d, 1
       jbe      SHORT G_M53374_IG04
       imul     rdx:rax, qword ptr [rcx+0x18]
       mov      r8, bword ptr [rsp+0x38]
       mov      qword ptr [r8], rax
       mov      qword ptr [rcx+0x18], rdx
						;; size=34 bbWeight=1 PerfScore 15.25

G_M53374_IG03:  ;; offset=0x0026
       add      rsp, 40
       ret      
						;; size=5 bbWeight=1 PerfScore 1.25

G_M53374_IG04:  ;; offset=0x002B
       call     CORINFO_HELP_RNGCHKFAIL
       int3     
						;; size=6 bbWeight=0 PerfScore 0.00
; Total bytes of code: 49

Benchmarks

The full benchmark code is found here

The benchmarks are based on a benchmark suggested for MultiplyNoFlags; the variant below does the following:

        [Benchmark]
        public ulong BenchBigMulUnsigned()
        {
            ulong accLo = TestA;
            ulong accHi = TestB;
            MathBigMulAcc(accLo, accHi, ref accHi, ref accLo);
            MathBigMulAcc(accLo, accHi, ref accHi, ref accLo);
            MathBigMulAcc(accLo, accHi, ref accHi, ref accLo);
            MathBigMulAcc(accLo, accHi, ref accHi, ref accLo);
            return accLo + accHi;
        }
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        private unsafe void MathBigMulAcc(ulong a, ulong b, ref ulong accHi, ref ulong accLo)
        {
            ulong lo;
            ulong hi = Math.BigMul(a, b, out lo);
            accHi += hi;
            accLo += lo;
        }
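The listings below compare codegen with the DOTNET_EnableBMI2 knob toggled. If one wanted both variants in a single BenchmarkDotNet run, a per-job environment variable is one option; a rough sketch, assuming BenchmarkDotNet's ManualConfig/AddJob/WithEnvironmentVariable APIs (the results further below were instead collected against separate corerun toolchains):

```csharp
using BenchmarkDotNet.Configs;
using BenchmarkDotNet.Jobs;

// Sketch: two jobs differing only in the DOTNET_EnableBMI2 knob, so the mul and
// mulx code paths can be compared side by side in one run.
// Attach with [Config(typeof(Bmi2CompareConfig))] on the benchmark class.
public class Bmi2CompareConfig : ManualConfig
{
    public Bmi2CompareConfig()
    {
        AddJob(Job.Default.WithEnvironmentVariable("DOTNET_EnableBMI2", "1").WithId("BMI2=1"));
        AddJob(Job.Default.WithEnvironmentVariable("DOTNET_EnableBMI2", "0").WithId("BMI2=0"));
    }
}
```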
Generated code with DOTNET_EnableBMI2=1
; Method Benchmarks.Scenarios.BigMulTests:BenchBigMulUnsigned():ulong:this (FullOpts)
G_M56495_IG01:  ;; offset=0x0000
						;; size=0 bbWeight=1 PerfScore 0.00

G_M56495_IG02:  ;; offset=0x0000
       mov      rdx, qword ptr [rcx+0x08]
       mov      rax, qword ptr [rcx+0x10]
       mulx     r8, rcx, rax
       add      rax, r8
       add      rdx, rcx
       mulx     r8, rcx, rax
       add      rax, r8
       add      rdx, rcx
       mulx     r8, rcx, rax
       add      rax, r8
       add      rdx, rcx
       mulx     r8, rcx, rax
       add      rcx, rdx
       add      rax, rcx
       add      rax, r8
						;; size=55 bbWeight=1 PerfScore 18.25

G_M56495_IG03:  ;; offset=0x0037
       ret      
						;; size=1 bbWeight=1 PerfScore 1.00
; Total bytes of code: 56
Generated code with DOTNET_EnableBMI2=0

A single push to the stack and several reads/writes since rax is spilled.

; Method Benchmarks.Scenarios.BigMulTests:BenchBigMulUnsigned():ulong:this (FullOpts)
G_M56495_IG01:  ;; offset=0x0000
       push     rax
						;; size=1 bbWeight=1 PerfScore 1.00

G_M56495_IG02:  ;; offset=0x0001
       mov      rax, qword ptr [rcx+0x08]
       mov      rcx, qword ptr [rcx+0x10]
       mov      qword ptr [rsp], rax
       mul      rdx:rax, rcx
       add      rcx, rdx
       mov      rdx, qword ptr [rsp]
       add      rdx, rax
       mov      qword ptr [rsp], rdx
       mov      rax, rdx
       mul      rdx:rax, rcx
       add      rcx, rdx
       mov      rdx, qword ptr [rsp]
       add      rdx, rax
       mov      qword ptr [rsp], rdx
       mov      rax, rdx
       mul      rdx:rax, rcx
       add      rcx, rdx
       mov      rdx, qword ptr [rsp]
       add      rdx, rax
       mov      qword ptr [rsp], rdx
       mov      rax, rdx
       mul      rdx:rax, rcx
       mov      r8, qword ptr [rsp]
       add      rax, r8
       add      rax, rcx
       add      rax, rdx
						;; size=88 bbWeight=1 PerfScore 27.00

G_M56495_IG03:  ;; offset=0x0059
       add      rsp, 8
       ret      
						;; size=5 bbWeight=1 PerfScore 1.25
; Total bytes of code: 94
Baseline: Calling old MultiplyNoFlags
; Method Benchmarks.Scenarios.BigMulTests:BenchMultiplyNoFlags3Ards():ulong:this (FullOpts)
G_M20411_IG01:  ;; offset=0x0000
       sub      rsp, 40
       xor      eax, eax
       mov      qword ptr [rsp+0x08], rax
       vxorps   xmm4, xmm4, xmm4
       vmovdqa  xmmword ptr [rsp+0x10], xmm4
       mov      qword ptr [rsp+0x20], rax
						;; size=26 bbWeight=1 PerfScore 4.83

G_M20411_IG02:  ;; offset=0x001A
       mov      rdx, qword ptr [rcx+0x08]
       mov      rax, qword ptr [rcx+0x10]
       lea      rcx, [rsp+0x20]
       mulx     r10, r8, rax
       mov      qword ptr [rcx], r8
       add      rax, r10
       add      rdx, qword ptr [rsp+0x20]
       lea      rcx, [rsp+0x18]
       mulx     r10, r8, rax
       mov      qword ptr [rcx], r8
       add      rax, r10
       add      rdx, qword ptr [rsp+0x18]
       lea      rcx, [rsp+0x10]
       mulx     r10, r8, rax
       mov      qword ptr [rcx], r8
       add      rax, r10
       add      rdx, qword ptr [rsp+0x10]
       lea      rcx, [rsp+0x08]
       mulx     r10, r8, rax
       mov      qword ptr [rcx], r8
       mov      rcx, rdx
       add      rcx, qword ptr [rsp+0x08]
       add      rax, rcx
       add      rax, r10
						;; size=98 bbWeight=1 PerfScore 31.50

G_M20411_IG03:  ;; offset=0x007C
       add      rsp, 40
       ret      
						;; size=5 bbWeight=1 PerfScore 1.25
; Total bytes of code: 129
Results for Math.BigMul with BMI2

BenchmarkDotNet v0.14.0, Windows 11 (10.0.26100.4202)
AMD Ryzen 7 5800X, 1 CPU, 16 logical and 8 physical cores
.NET SDK 10.0.100-preview.3.25201.16
  [Host]     : .NET 10.0.0 (10.0.25.17105), X64 RyuJIT AVX2
  Job-ZAWDHL : .NET 10.0.0 (42.42.42.42424), X64 RyuJIT AVX2
  Job-RAEHQD : .NET 10.0.0 (42.42.42.42424), X64 RyuJIT AVX2

| Method | Job | Toolchain | TestA | TestB | Mean | Error | StdDev | Ratio |
|---|---|---|---|---|---|---|---|---|
| BenchBigMulUnsigned | Job-ZAWDHL | \net10.0-windows-Release-x64\shared\Microsoft.NETCore.App\10.0.0\corerun.exe | 81985529216486895 | 16045690984833335023 | 0.6420 ns | 0.0065 ns | 0.0061 ns | 0.42 |
| BenchBigMulUnsigned | Job-RAEHQD | \net10.0-windows-Release-x64_main\shared\Microsoft.NETCore.App\10.0.0\corerun.exe | 81985529216486895 | 16045690984833335023 | 1.5354 ns | 0.0141 ns | 0.0125 ns | 1.00 |
| BenchBigMulSigned | Job-ZAWDHL | \net10.0-windows-Release-x64\shared\Microsoft.NETCore.App\10.0.0\corerun.exe | 81985529216486895 | 16045690984833335023 | 1.2853 ns | 0.0070 ns | 0.0065 ns | 0.43 |
| BenchBigMulSigned | Job-RAEHQD | \net10.0-windows-Release-x64_main\shared\Microsoft.NETCore.App\10.0.0\corerun.exe | 81985529216486895 | 16045690984833335023 | 2.9852 ns | 0.0263 ns | 0.0246 ns | 1.00 |
| BenchMultiplyNoFlags3Ards | Job-ZAWDHL | \net10.0-windows-Release-x64\shared\Microsoft.NETCore.App\10.0.0\corerun.exe | 81985529216486895 | 16045690984833335023 | 3.3122 ns | 0.0060 ns | 0.0056 ns | 1.00 |
| BenchMultiplyNoFlags3Ards | Job-RAEHQD | \net10.0-windows-Release-x64_main\shared\Microsoft.NETCore.App\10.0.0\corerun.exe | 81985529216486895 | 16045690984833335023 | 3.3080 ns | 0.0058 ns | 0.0054 ns | 1.00 |
Hardware without BMI2 (~10 times faster)
| Method | Job | Toolchain | TestA | TestB | Mean | Error | StdDev | Ratio | RatioSD |
|---|---|---|---|---|---|---|---|---|---|
| BenchBigMulUnsigned | Job-JCYSGS | \net10.0-windows-Release-x64_MathInstrinct\shared\Microsoft.NETCore.App\10.0.0\corerun.exe | 81985529216486895 | 16045690984833335023 | 1.283 ns | 0.0104 ns | 0.0092 ns | 0.10 | 0.00 |
| BenchBigMulUnsigned | Job-SJSTKO | \net10.0-windows-Release-x64_main\shared\Microsoft.NETCore.App\10.0.0\corerun.exe | 81985529216486895 | 16045690984833335023 | 12.256 ns | 0.0133 ns | 0.0118 ns | 1.00 | 0.00 |
| BenchBigMulSigned | Job-JCYSGS | \net10.0-windows-Release-x64_MathInstrinct\shared\Microsoft.NETCore.App\10.0.0\corerun.exe | 81985529216486895 | 16045690984833335023 | 1.275 ns | 0.0051 ns | 0.0048 ns | 0.12 | 0.00 |
| BenchBigMulSigned | Job-SJSTKO | \net10.0-windows-Release-x64_main\shared\Microsoft.NETCore.App\10.0.0\corerun.exe | 81985529216486895 | 16045690984833335023 | 10.783 ns | 0.0743 ns | 0.0620 ns | 1.00 | 0.00 |

Additional benchmark results

Additional results can be found under https://github.com/Daniel-Svensson/ClrExperiments/tree/7acd61943336356fa363763914a5b963de962065/ClrDecimal/Benchmarks/BenchmarkDotNet.Artifacts/results . I mostly checked that there were no significant regressions to decimal performance, since Math.BigMul has several usages there. There were a few minor improvements, mostly in the composite "InterestBenchmarks", which contains a mix of operations similar to interest calculations.

Copilot Summary


JIT Compiler Enhancements

  • Added support for Multiply intrinsics in the JIT compiler, including updates to ContainCheckHWIntrinsic, BuildHWIntrinsic, and impSpecialIntrinsic to handle the new instructions and their constraints (src/coreclr/jit/lowerxarch.cpp, src/coreclr/jit/lsraxarch.cpp, src/coreclr/jit/hwintrinsicxarch.cpp).
  • Updated HWIntrinsicInfo and GenTreeHWIntrinsic to include the Multiply intrinsics and their associated properties (src/coreclr/jit/hwintrinsic.h, src/coreclr/jit/gentree.cpp).
  • Extended hwintrinsiclistxarch.h to define the Multiply intrinsics and their characteristics, such as instruction mapping and flags (src/coreclr/jit/hwintrinsiclistxarch.h).

Runtime Library Updates

  • Introduced X86Base.Multiply methods for both signed and unsigned multiplication in the runtime intrinsics API, providing platform-specific implementations or fallback behavior (src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/X86/X86Base.cs, src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/X86/X86Base.PlatformNotSupported.cs); see the sketch after this list.
  • Updated the Math class to use the new Multiply intrinsics for optimized BigMul operations, improving performance on supported platforms (src/libraries/System.Private.CoreLib/src/System/Math.cs).
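
The split between X86Base.cs and X86Base.PlatformNotSupported.cs mentioned above typically follows the pattern sketched below. The class name, method name and preprocessor symbols here are illustrative assumptions, not the exact code in this PR:

```csharp
// Illustrative sketch of the usual two-file pattern for CoreLib hardware intrinsics:
// the x86/x64 build declares the method with a recursive body that the JIT replaces
// with the intrinsic expansion, while the PlatformNotSupported variant throws so the
// same surface still compiles for other targets.
internal static partial class X86BaseSketch
{
#if TARGET_X86 || TARGET_AMD64
    // Recognized by the JIT (e.g. as NI_X86Base_X64_Multiply); the recursive call is never emitted.
    internal static (ulong Lower, ulong Upper) BigMul(ulong left, ulong right) => BigMul(left, right);
#else
    internal static (ulong Lower, ulong Upper) BigMul(ulong left, ulong right)
        => throw new System.PlatformNotSupportedException();
#endif
}
```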

Code Cleanup

  • Removed outdated and unused code paths related to older multiplication implementations in the Math class (src/libraries/System.Private.CoreLib/src/System/Math.cs).

These changes collectively enhance the performance and capabilities of multiplication operations in .NET, leveraging hardware acceleration where available.

@Daniel-Svensson (Contributor, Author) commented:

Due to conflicts with new changes in main, I had to rename the X86 method to BigMul.

@Daniel-Svensson (Contributor, Author) commented:

I've decided to push the BMI2/mulx support and have updated the "BigMul" test results.
There are also some nice improvements to XXHash from this commit.

@@ -2989,9 +3030,11 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou
}
else
{
- // Currently dstCount = 2 is only used for DivRem, which has special constraints and is handled above
+ // Currently dstCount = 2 is only used for DivRem and Multiply, which has special constraints and is handled
@Daniel-Svensson (Contributor, Author) commented on this diff, Jun 4, 2025:

Multiply is renamed to BigMul; I can update this comment later, after review.

@Daniel-Svensson (Contributor, Author) commented:

FYI: @dotnet/jit-contrib

  1. JIT: Emit mulx for GT_MULHI and GT_MUL_LONG if BMI2 is available #116198 is related, and you might want to have the same set of eyes review that one.
  2. I am holding off on resolving conflicts until someone has time to review this (don't expect any more updates); there seems to be some churn in the affected files, so more conflicts might come.

* InstructionSet_BMI2 => InstructionSet_AVX2
* More flexible MUL (no fixed rax arg)
Labels: area-CodeGen-coreclr (CLR JIT compiler in src/coreclr/src/jit and related components such as SuperPMI), community-contribution (Indicates that the PR has been added by a community member)

Successfully merging this pull request may close these issues: Suboptimal x64 codegen for signed Math.BigMul