From 76208985cfe8166e71dee02e99aa394fecaa0788 Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 21 May 2026 12:05:42 +0000 Subject: [PATCH 1/2] feat(simd_caps): CPUID 7,1 + AMX-FP16/AVX512-FP16/VP2INTERSECT bits + AMX OS-gate in cpu_ops MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Salvages the detection-only subset of closed PR #190 — three real gaps in the substrate runtime dispatch without inheriting any of PR #190's consumer-facing additions (no SimdProfile enum, no public dispatch-identity API, no cpu-* features). What lands here: 1) CPUID leaf 7,1 read for AMX-FP16 (CPUID.07H.1H:EAX bit 21). Lives on a different subleaf than the existing AMX bits; GraniteRapids is the only silicon advertising it today. Guarded by leaf 7,0 EAX >= 1 so older CPUs that don't expose subleaf 1 stay correct. 2) Three new SimdCaps fields (additive, all default false on non-x86): - avx512fp16 — CPUID.07H.0H:EDX bit 23 — `__m512h` math. Discriminates SPR-class from CascadeLake/ IceLakeSp/SkylakeX for any future FP16 kernel. - avx512vp2intersect — CPUID.07H.0H:EDX bit 8 — TigerLake mobile only; absent from Ice Lake-SP and every later server part. Exposed for completeness. - amx_fp16 — CPUID.07H.1H:EAX bit 21 — Granite Rapids. Plus convenience methods has_avx512_fp16() and has_amx_fp16() (the latter defense-in-depths the amx_tile bit). 3) AMX OS-state gate in cpu_ops() selection. The CPU-reports-AMX path now AND-gates on `simd_amx::amx_available()` which runs the full four-step check (CPUID + OSXSAVE + XCR0[17,18] + arch_prctl(XCOMP_PERM, 18) on Linux 5.19+). This closes the SIGILL hole when a hypervisor masks XCR0 or the OS hasn't honoured the prctl: previously cpu_ops() would route to CPU_OPS_AMX_INT8 and AMX instructions would SIGILL despite the CPUID bit. Now it demotes to CPU_OPS_AVX512_VNNI cleanly. What's deliberately NOT here (rejected from PR #190): - No `SimdProfile` enum — would expose dispatch identity to consumer code and invite `match profile { ... }` arms that defeat the polyfill contract. - No `cpu-*` cargo features — build-time silicon pinning that defeats polyfill at an earlier binding time. - No `simd_profile_probe` example — diagnostic-only, rebuilds the SimdProfile surface this PR doesn't bring. - No public dispatch-identity API at any layer. The new bits are internal substrate detection; consumers continue to use `crate::simd::*` polyfilled types and `crate::simd_runtime::*` per-op trampolines. The new fields slot into existing `cpu_ops()` selection by extension (e.g. a future AMX-FP16 tier would AND-gate on `caps.amx_fp16 && simd_amx::amx_available()` between the AMX-INT8 and AVX-512-VNNI arms). No selection logic uses them yet — they're laying the runway, not consuming it. Tests: - 4 new simd_caps tests: cpuid_extended_bits_smoke, has_amx_fp16_requires_amx_tile, x86_extended_bits_are_false_on_non_x86, plus extended determinism coverage. - All 6 existing cpu_ops tests still pass; the AMX OS-gate change passes through transparently on hosts where amx_available() agrees with CPUID (the typical case). - fmt + clippy clean on `--features runtime-dispatch`. --- src/hpc/simd_caps.rs | 124 ++++++++++++++++++++++++++++++++++++ src/simd_runtime/cpu_ops.rs | 10 ++- 2 files changed, 133 insertions(+), 1 deletion(-) diff --git a/src/hpc/simd_caps.rs b/src/hpc/simd_caps.rs index a35823b5..654140ef 100644 --- a/src/hpc/simd_caps.rs +++ b/src/hpc/simd_caps.rs @@ -71,6 +71,24 @@ pub struct SimdCaps { /// (`is_x86_feature_detected!("avxvnniint8")`). /// Present on Arrow Lake, Lunar Lake, NUC 14 (Meteor Lake-H). pub avxvnniint8: bool, + /// AVX-512 FP16 arithmetic (CPUID.07H.0H:EDX bit 23). Native + /// `__m512h` operations (`_mm512_*_ph`). Present on Sapphire Rapids, + /// Granite Rapids, Zen 4+. Bit is exposed for downstream substrate + /// kernels and dispatch ladders; no consumer-facing dispatch axis + /// is built on top of it. + pub avx512fp16: bool, + /// AVX-512 VP2INTERSECT (CPUID.07H.0H:EDX bit 8). Present only on + /// Tiger Lake mobile silicon; absent from Ice Lake-SP and every + /// later server part. Useful for future intersection-heavy + /// primitives (set ops on bitmaps); exposed for completeness. + pub avx512vp2intersect: bool, + /// AMX-FP16 (CPUID.07H.1H:EAX bit 21). `TDPFP16PS` FP16 tile dot + /// product, present on Granite Rapids only. Lives at CPUID leaf + /// 7,1 (subleaf 1), not leaf 7,0 — separate `__cpuid_count(7, 1)` + /// call required. The leaf 7,1 read is gated on leaf 7,0's EAX + /// max-subleaf field being ≥ 1; on older silicon that field is 0 + /// and we never query leaf 7,1. + pub amx_fp16: bool, // ── aarch64 (ARM) ── /// NEON 128-bit SIMD (mandatory on aarch64, always true). @@ -124,6 +142,9 @@ impl SimdCaps { amx_bf16: false, avx512bf16: false, avxvnniint8: false, + avx512fp16: false, + avx512vp2intersect: false, + amx_fp16: false, neon: false, asimd_dotprod: false, fp16: false, @@ -143,6 +164,18 @@ impl SimdCaps { let amx_tile = (cpuid7.edx >> 24) & 1 == 1; let amx_int8 = (cpuid7.edx >> 25) & 1 == 1; let amx_bf16 = (cpuid7.edx >> 22) & 1 == 1; + let avx512fp16 = (cpuid7.edx >> 23) & 1 == 1; + let avx512vp2intersect = (cpuid7.edx >> 8) & 1 == 1; + + // Leaf 7,1 EAX bit 21 = AMX-FP16. Leaf 7,1 only exists when + // leaf 7,0 EAX (max-subleaf) is at least 1; on older silicon + // this returns 0 and the answer is correctly false. + let amx_fp16 = if cpuid7.eax >= 1 { + let cpuid7_1 = core::arch::x86_64::__cpuid_count(7, 1); + (cpuid7_1.eax >> 21) & 1 == 1 + } else { + false + }; Self { avx2: is_x86_feature_detected!("avx2"), @@ -160,6 +193,9 @@ impl SimdCaps { amx_bf16, avx512bf16: is_x86_feature_detected!("avx512bf16"), avxvnniint8: is_x86_feature_detected!("avxvnniint8"), + avx512fp16, + avx512vp2intersect, + amx_fp16, // ARM fields: all false on x86 neon: false, asimd_dotprod: false, @@ -192,6 +228,9 @@ impl SimdCaps { amx_bf16: false, avx512bf16: false, avxvnniint8: false, + avx512fp16: false, + avx512vp2intersect: false, + amx_fp16: false, // ARM fields: runtime detection neon: true, // mandatory on aarch64 asimd_dotprod: std::arch::is_aarch64_feature_detected!("dotprod"), @@ -221,6 +260,9 @@ impl SimdCaps { amx_bf16: false, avx512bf16: false, avxvnniint8: false, + avx512fp16: false, + avx512vp2intersect: false, + amx_fp16: false, neon: false, asimd_dotprod: false, fp16: false, @@ -275,6 +317,23 @@ impl SimdCaps { self.avxvnniint8 } + /// True if AVX-512 FP16 (`__m512h`) is available. Distinguishes + /// SapphireRapids-class silicon (and Zen 4+) from the CascadeLake / + /// IceLakeSp / SkylakeX baseline that lacks native `__m512h` math. + #[inline(always)] + pub fn has_avx512_fp16(self) -> bool { + self.avx512fp16 + } + + /// True if AMX-FP16 (`TDPFP16PS`) is available. Only Granite Rapids + /// advertises this bit. Requires both the CPUID 7,1 bit AND + /// AMX-TILE (defense-in-depth: a CPU advertising AMX-FP16 without + /// AMX-TILE is contradictory but the check stays cheap). + #[inline(always)] + pub fn has_amx_fp16(self) -> bool { + self.amx_fp16 && self.amx_tile + } + // ── ARM convenience methods ── /// True if running on aarch64 with NEON (always true on aarch64). @@ -511,4 +570,69 @@ mod tests { #[cfg(target_arch = "aarch64")] assert_ne!(profile, ArmProfile::NotArm); } + + /// New CPUID 7,0 EDX bits and the CPUID 7,1 leaf read must surface + /// without crashing on every host. Field values are host-dependent; + /// we just exercise the readers and the convenience methods. + #[test] + fn cpuid_extended_bits_smoke() { + let caps = simd_caps(); + let _ = caps.avx512fp16; + let _ = caps.avx512vp2intersect; + let _ = caps.amx_fp16; + let _ = caps.has_avx512_fp16(); + let _ = caps.has_amx_fp16(); + } + + /// `has_amx_fp16()` defense-in-depth: even if `amx_fp16` were + /// spuriously true without `amx_tile`, the convenience method must + /// require both. Matches the pattern used by `has_amx_bf16` in + /// `simd_amx::amx_available()`. + #[test] + fn has_amx_fp16_requires_amx_tile() { + let synthetic = SimdCaps { + avx2: false, + avx512f: false, + avx512bw: false, + avx512vl: false, + avx512vpopcntdq: false, + sse41: false, + sse2: false, + fma: false, + avx512vnni: false, + avx512vbmi: false, + amx_tile: false, + amx_int8: false, + amx_bf16: false, + avx512bf16: false, + avxvnniint8: false, + avx512fp16: false, + avx512vp2intersect: false, + amx_fp16: true, + neon: false, + asimd_dotprod: false, + fp16: false, + aes: false, + sha2: false, + crc32: false, + }; + assert!( + !synthetic.has_amx_fp16(), + "amx_fp16 without amx_tile must report false" + ); + } + + /// On non-x86 builds the x86 capability bits MUST all read false — + /// the platform-specific zero-defaults must not regress when new + /// fields are added to `SimdCaps`. + #[cfg(not(target_arch = "x86_64"))] + #[test] + fn x86_extended_bits_are_false_on_non_x86() { + let caps = simd_caps(); + assert!(!caps.avx512fp16); + assert!(!caps.avx512vp2intersect); + assert!(!caps.amx_fp16); + assert!(!caps.has_avx512_fp16()); + assert!(!caps.has_amx_fp16()); + } } diff --git a/src/simd_runtime/cpu_ops.rs b/src/simd_runtime/cpu_ops.rs index 67b79713..c567f662 100644 --- a/src/simd_runtime/cpu_ops.rs +++ b/src/simd_runtime/cpu_ops.rs @@ -180,7 +180,15 @@ pub fn cpu_ops() -> &'static CpuOps { #[cfg(target_arch = "x86_64")] { - if _caps.amx_int8 { + // AMX tier selection: CPUID-reports-AMX is necessary but + // not sufficient. A hypervisor may mask XCR0 bits 17/18 + // (the tile XSAVE state) or the OS may not have honoured + // `arch_prctl(XCOMP_PERM, 18)` on Linux 5.19+. In either + // case AMX instructions SIGILL despite the CPUID bit + // being set. `simd_amx::amx_available()` runs the full + // four-step gate (CPUID + OSXSAVE + XCR0 + arch_prctl); + // demote to the AVX-512 path when the OS-check fails. + if _caps.amx_int8 && crate::simd_amx::amx_available() { return &CPU_OPS_AMX_INT8; } if _caps.avx512f && _caps.avx512vnni { From 339d45c2d5cd64d1aeb47e8d5d8e8fdcded75dc9 Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 21 May 2026 12:38:23 +0000 Subject: [PATCH 2/2] style(simd_caps): rustfmt 1.95.0 collapse for synthetic-SimdCaps init Same canonical-fmt collapse as the prior pillar-branch hotfixes. No behavioral change. --- src/hpc/simd_caps.rs | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/hpc/simd_caps.rs b/src/hpc/simd_caps.rs index 654140ef..179b4ac0 100644 --- a/src/hpc/simd_caps.rs +++ b/src/hpc/simd_caps.rs @@ -616,10 +616,7 @@ mod tests { sha2: false, crc32: false, }; - assert!( - !synthetic.has_amx_fp16(), - "amx_fp16 without amx_tile must report false" - ); + assert!(!synthetic.has_amx_fp16(), "amx_fp16 without amx_tile must report false"); } /// On non-x86 builds the x86 capability bits MUST all read false —