From db3669e8638947c7f133cb9c81349022e6b98d2a Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 20 May 2026 20:20:36 +0000 Subject: [PATCH 1/5] feat(simd): SimdProfile enum + detect() implements dispatch matrix Phase 3 T3.1 of the SIMD integration plan: introduce crate::hpc::simd_profile::SimdProfile, the silicon-grained dispatch identity that replaces the coarse three-Tier collapse called out in audit findings TD-T12/T13/T14. The decision tree in SimdProfile::detect() implements .claude/knowledge/td-simd-cpu-dispatch-matrix.md lines 271-305 verbatim, preserving the four load-bearing invariants from the "Detection invariants" section: GraniteRapids-before-SapphireRapids, Zen4-vs-SPR via amx_tile, CooperLake-vs-IceLakeSp via the mutually exclusive BF16/VBMI bit pattern, and TigerLakeU-vs-IceLakeSp via VP2INTERSECT. Risk #4 of the integration plan (no GNR detection without leaf 7,1 reader) closed in the same change: SimdCaps gains avx512fp16, avx512vp2intersect, and amx_fp16 fields, with the x86 detect() arm adding a __cpuid_count(7, 1) read gated on the leaf 7,0 EAX max subleaf advertising support. has_amx_fp16() requires amx_tile in addition to the FP16 bit, mirroring the defense-in-depth pattern in simd_amx::amx_available(). Surface follows cognitive-shader-foundation.md: SimdProfile + simd_profile() re-exported through crate::simd::* so consumers import a single public path. The existing private Tier / tier() machinery in src/simd.rs is untouched; this lands alongside, with incremental migration deferred to T3.5/T3.6. Tests: 7 new in simd_profile (detection determinism, arch partitioning, AVX-512 subset invariant, x86_64-only Scalar fallback, name uniqueness), 2 new in simd_caps (FP16 fields false on non-x86, has_amx_fp16 requires amx_tile). 2075/2075 lib tests pass, clippy -D warnings clean. --- src/hpc/mod.rs | 3 + src/hpc/simd_caps.rs | 99 ++++++++++- src/hpc/simd_profile.rs | 357 ++++++++++++++++++++++++++++++++++++++++ src/simd.rs | 16 ++ 4 files changed, 471 insertions(+), 4 deletions(-) create mode 100644 src/hpc/simd_profile.rs diff --git a/src/hpc/mod.rs b/src/hpc/mod.rs index 11081ad6..6c6f3d18 100644 --- a/src/hpc/mod.rs +++ b/src/hpc/mod.rs @@ -17,6 +17,9 @@ pub mod simd_caps; // LazyLock frozen SIMD dispatch — function pointers selected once at startup pub mod simd_dispatch; +// Silicon-grained profile (Sapphire Rapids vs Zen 4 vs Ice Lake-SP, etc.) — +// implements the dispatch matrix from `.claude/knowledge/td-simd-cpu-dispatch-matrix.md` +pub mod simd_profile; pub mod blas_level1; pub mod blas_level2; diff --git a/src/hpc/simd_caps.rs b/src/hpc/simd_caps.rs index a35823b5..7d52aec8 100644 --- a/src/hpc/simd_caps.rs +++ b/src/hpc/simd_caps.rs @@ -71,6 +71,21 @@ pub struct SimdCaps { /// (`is_x86_feature_detected!("avxvnniint8")`). /// Present on Arrow Lake, Lunar Lake, NUC 14 (Meteor Lake-H). pub avxvnniint8: bool, + /// AVX-512 FP16 arithmetic (CPUID.07H.0H:EDX bit 23). + /// Native `__m512h` operations (`_mm512_*_ph`). Distinguishes Sapphire + /// Rapids / Granite Rapids / Zen 4+ from earlier AVX-512 silicon. + /// Required by the matrix doc to discriminate SPR (yes) from CLX (no). + pub avx512fp16: bool, + /// AVX-512 VP2INTERSECT (CPUID.07H.0H:EDX bit 8). Present only on + /// Tiger Lake mobile silicon; absent from Ice Lake-SP and every later + /// server part. Sole discriminator between `TigerLakeU` and + /// `IceLakeSp` profiles, which otherwise share an identical feature set. + pub avx512vp2intersect: bool, + /// AMX-FP16 (CPUID.07H.1H:EAX bit 21). `TDPFP16PS` FP16 tile dot + /// product, present on Granite Rapids only. Sole discriminator between + /// `SapphireRapids` and `GraniteRapids` profiles. Lives at CPUID leaf + /// 7,1, not leaf 7,0 — separate cpuid_count call required. + pub amx_fp16: bool, // ── aarch64 (ARM) ── /// NEON 128-bit SIMD (mandatory on aarch64, always true). @@ -124,6 +139,9 @@ impl SimdCaps { amx_bf16: false, avx512bf16: false, avxvnniint8: false, + avx512fp16: false, + avx512vp2intersect: false, + amx_fp16: false, neon: false, asimd_dotprod: false, fp16: false, @@ -139,10 +157,24 @@ impl SimdCaps { // `__cpuid_count` is safe on x86_64 (Rust 1.87+): CPUID is always // available on x86_64 (guaranteed by the ABI) and has no side effects // beyond reading CPU registers. - let cpuid7 = core::arch::x86_64::__cpuid_count(7, 0); - let amx_tile = (cpuid7.edx >> 24) & 1 == 1; - let amx_int8 = (cpuid7.edx >> 25) & 1 == 1; - let amx_bf16 = (cpuid7.edx >> 22) & 1 == 1; + let cpuid7_0 = core::arch::x86_64::__cpuid_count(7, 0); + let amx_tile = (cpuid7_0.edx >> 24) & 1 == 1; + let amx_int8 = (cpuid7_0.edx >> 25) & 1 == 1; + let amx_bf16 = (cpuid7_0.edx >> 22) & 1 == 1; + let avx512fp16 = (cpuid7_0.edx >> 23) & 1 == 1; + let avx512vp2intersect = (cpuid7_0.edx >> 8) & 1 == 1; + + // Leaf 7,1 EAX bit 21 = AMX-FP16. Per the dispatch matrix this is + // the sole discriminator between Granite Rapids and Sapphire Rapids. + // Leaf 7,1 only exists when leaf 7,0 EAX (max subleaf) >= 1; on + // older silicon this returns zero, which is the correct answer. + let leaf7_max_sub = cpuid7_0.eax; + let amx_fp16 = if leaf7_max_sub >= 1 { + let cpuid7_1 = core::arch::x86_64::__cpuid_count(7, 1); + (cpuid7_1.eax >> 21) & 1 == 1 + } else { + false + }; Self { avx2: is_x86_feature_detected!("avx2"), @@ -160,6 +192,9 @@ impl SimdCaps { amx_bf16, avx512bf16: is_x86_feature_detected!("avx512bf16"), avxvnniint8: is_x86_feature_detected!("avxvnniint8"), + avx512fp16, + avx512vp2intersect, + amx_fp16, // ARM fields: all false on x86 neon: false, asimd_dotprod: false, @@ -192,6 +227,9 @@ impl SimdCaps { amx_bf16: false, avx512bf16: false, avxvnniint8: false, + avx512fp16: false, + avx512vp2intersect: false, + amx_fp16: false, // ARM fields: runtime detection neon: true, // mandatory on aarch64 asimd_dotprod: std::arch::is_aarch64_feature_detected!("dotprod"), @@ -221,6 +259,9 @@ impl SimdCaps { amx_bf16: false, avx512bf16: false, avxvnniint8: false, + avx512fp16: false, + avx512vp2intersect: false, + amx_fp16: false, neon: false, asimd_dotprod: false, fp16: false, @@ -275,6 +316,20 @@ impl SimdCaps { self.avxvnniint8 } + /// True if AVX-512 FP16 (`__m512h`) is available. Required to + /// discriminate `SapphireRapids` from `CascadeLake`-class profiles. + #[inline(always)] + pub fn has_avx512_fp16(self) -> bool { + self.avx512fp16 + } + + /// True if AMX-FP16 (`TDPFP16PS`) is available. Only Granite Rapids + /// silicon advertises this bit; sole discriminator between GNR and SPR. + #[inline(always)] + pub fn has_amx_fp16(self) -> bool { + self.amx_fp16 && self.amx_tile + } + // ── ARM convenience methods ── /// True if running on aarch64 with NEON (always true on aarch64). @@ -408,6 +463,42 @@ mod tests { let _ = caps.amx_bf16; let _ = caps.avx512bf16; let _ = caps.avxvnniint8; + // PR-#181 follow-up fields (matrix doc lines 240, 247-248). + let _ = caps.avx512fp16; + let _ = caps.avx512vp2intersect; + let _ = caps.amx_fp16; + } + + #[test] + fn fp16_fields_consistent_on_non_x86() { + // Non-x86 targets must never report x86 AMX/AVX-512 FP16 capabilities. + #[cfg(not(target_arch = "x86_64"))] + { + let caps = simd_caps(); + assert!(!caps.avx512fp16); + assert!(!caps.avx512vp2intersect); + assert!(!caps.amx_fp16); + assert!(!caps.has_avx512_fp16()); + assert!(!caps.has_amx_fp16()); + } + } + + #[test] + fn has_amx_fp16_requires_amx_tile() { + // Even if `amx_fp16` were spuriously true without `amx_tile`, + // the convenience method must require both bits. Matches + // simd_amx.rs::amx_available()'s defense-in-depth pattern. + let synthetic = SimdCaps { + avx2: false, avx512f: false, avx512bw: false, avx512vl: false, + avx512vpopcntdq: false, sse41: false, sse2: false, fma: false, + avx512vnni: false, avx512vbmi: false, + amx_tile: false, amx_int8: false, amx_bf16: false, + avx512bf16: false, avxvnniint8: false, + avx512fp16: false, avx512vp2intersect: false, amx_fp16: true, + neon: false, asimd_dotprod: false, fp16: false, + aes: false, sha2: false, crc32: false, + }; + assert!(!synthetic.has_amx_fp16(), "amx_fp16 without amx_tile must report false"); } #[test] diff --git a/src/hpc/simd_profile.rs b/src/hpc/simd_profile.rs new file mode 100644 index 00000000..a49df92b --- /dev/null +++ b/src/hpc/simd_profile.rs @@ -0,0 +1,357 @@ +//! `SimdProfile` — silicon-grained dispatch identity. +//! +//! Where `SimdCaps` reports individual feature bits (AVX-512F, AMX-TILE, +//! VBMI, etc.), `SimdProfile` collapses a *combination* of those bits into +//! a single enum variant naming the silicon generation. One profile = one +//! best set of primitives. Consumers branch on `simd_profile()` once at +//! startup; subsequent dispatch is a `match` over the variant or an index +//! into a `*Dispatch` static table. +//! +//! Authoritative feature mapping lives in +//! `.claude/knowledge/td-simd-cpu-dispatch-matrix.md`. The detection ladder +//! below implements the decision tree at lines 271-305 of that document. +//! Critical invariants from § "Detection invariants": +//! +//! 1. `GraniteRapids` must be checked **before** `SapphireRapids` — GNR +//! has every SPR bit plus AMX-FP16. Checking SPR first would route GNR +//! silicon as SPR and leave the AMX-FP16 tile path unused. +//! 2. `Zen4Avx512` vs `SapphireRapids` discriminate on `amx_tile` (SPR +//! yes, Zen 4 no) — both have F + VBMI + BF16 + FP16. +//! 3. `CooperLake` vs `IceLakeSp` are mutually exclusive bit patterns: +//! CPL has BF16 without VBMI; ICX has VBMI without BF16. +//! 4. `TigerLakeU` vs `IceLakeSp` share an ISA except for VP2INTERSECT +//! (TigerLake mobile only). Sole discriminator is `avx512vp2intersect`. + +use crate::hpc::simd_caps::{simd_caps, ArmProfile}; +use std::sync::LazyLock; + +/// Silicon-grained dispatch identity. One variant = one set of best +/// primitives. See `td-simd-cpu-dispatch-matrix.md` for the authoritative +/// feature table per variant. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +pub enum SimdProfile { + // ── x86_64 ── + /// Granite Rapids. Sapphire-Rapids feature set plus AMX-FP16. + GraniteRapids, + /// Sapphire Rapids / Emerald Rapids (ISA-identical). AVX-512 superset + /// with AMX-TILE/INT8/BF16 and AVX-512-FP16. No AMX-FP16. + SapphireRapids, + /// Zen 4 / Zen 5 (Genoa / Turin). AVX-512 superset matching SPR's + /// non-AMX feature set; no AMX of any kind. + Zen4Avx512, + /// Cooper Lake. AVX-512F + VNNI + BF16, no VBMI/FP16/AMX. The unique + /// "BF16 without VBMI" combination. + CooperLake, + /// Tiger Lake mobile. Ice-Lake-class AVX-512 + VP2INTERSECT, no + /// BF16/FP16/AMX. Distinguished from Ice Lake-SP by VP2INTERSECT only. + TigerLakeU, + /// Ice Lake-SP server. AVX-512F + VBMI + VNNI + IFMA, no + /// BF16/FP16/AMX/VP2INTERSECT. + IceLakeSp, + /// Cascade Lake. AVX-512F + VNNI, no VBMI/BF16/FP16/AMX. + CascadeLake, + /// Skylake-X / SP / W. AVX-512F baseline only (no VNNI). + SkylakeX, + /// Arrow Lake / Lunar Lake / Meteor Lake-H. AVX2 + AVX-VNNI-INT8 + + /// AVX-IFMA + AVX-NE-CONVERT, no AVX-512. + ArrowLake, + /// Haswell through Coffee Lake / Zen 1-3. AVX2 + FMA, no AVX-512. + HaswellAvx2, + + // ── aarch64 ── + /// Cortex-A76+ (Pi 5, RK3588 big cores, Apple M-series). NEON + + /// dotprod + fp16 + bf16/i8mm where present. + A76DotProd, + /// Cortex-A72 / A53-with-crypto. ARMv8.0 + NEON + crypto, no dotprod. + /// HWCAP cannot distinguish A72 (Pi 4) from A53-with-crypto (Pi 3) + /// silicon — both share the same dispatch tables at the ISA level. + A72Fast, + /// Cortex-A53 without crypto. Rare in the wild; QEMU / minimal aarch64. + A53Baseline, + + // ── Fallback ── + /// wasm32, riscv, x86 baseline, or anything else without a recognised + /// SIMD ISA. + Scalar, +} + +impl SimdProfile { + /// Resolve the current silicon to one of the enum variants. + /// + /// Reads `simd_caps()` once and walks the decision tree from + /// `td-simd-cpu-dispatch-matrix.md` § "SimdProfile::detect() mapping". + /// The order below is load-bearing: GNR before SPR, the BF16/VBMI + /// mutex for CPL/ICX, VP2INTERSECT for TigerLakeU vs IceLakeSp. + pub fn detect() -> Self { + #[cfg(target_arch = "x86_64")] + { + let caps = simd_caps(); + // GraniteRapids: AMX-FP16 (CPUID 7,1 EAX bit 21). Must be + // checked first because GNR is a strict superset of SPR. + if caps.has_amx_fp16() { + return SimdProfile::GraniteRapids; + } + // SapphireRapids / EmeraldRapids: AMX-TILE + AMX-BF16 + + // AVX-512-FP16. EmeraldRapids has identical ISA — same variant. + if caps.amx_tile && caps.amx_bf16 && caps.avx512fp16 { + return SimdProfile::SapphireRapids; + } + // Zen4 / Zen5: AVX-512 + VBMI + BF16 + FP16, but no AMX. + if caps.avx512f + && caps.avx512vbmi + && caps.avx512bf16 + && caps.avx512fp16 + && !caps.amx_tile + { + return SimdProfile::Zen4Avx512; + } + // CooperLake: AVX-512 + VNNI + BF16, but no VBMI. Mutually + // exclusive bit pattern with IceLakeSp (which has VBMI but + // no BF16). Order vs ICX is irrelevant. + if caps.avx512f && caps.avx512vnni && caps.avx512bf16 && !caps.avx512vbmi { + return SimdProfile::CooperLake; + } + // TigerLakeU vs IceLakeSp: same feature set EXCEPT + // VP2INTERSECT (TigerLake mobile only). Discriminate here. + if caps.avx512f && caps.avx512vbmi && caps.avx512vnni && !caps.avx512bf16 { + return if caps.avx512vp2intersect { + SimdProfile::TigerLakeU + } else { + SimdProfile::IceLakeSp + }; + } + // CascadeLake: AVX-512 + VNNI, no VBMI/BF16. + if caps.avx512f && caps.avx512vnni { + return SimdProfile::CascadeLake; + } + // SkylakeX: AVX-512F only, no VNNI. + if caps.avx512f { + return SimdProfile::SkylakeX; + } + // ArrowLake / Lunar Lake / Meteor Lake-H: no AVX-512 but + // AVX-VNNI-INT8 present. + if caps.avxvnniint8 { + return SimdProfile::ArrowLake; + } + // Haswell..Coffee Lake / Zen 1-3: AVX2 + FMA. + if caps.avx2 && caps.fma { + return SimdProfile::HaswellAvx2; + } + } + #[cfg(target_arch = "aarch64")] + { + // Reuse the in-tree heuristic from `simd_caps::arm_profile()`. + // It already encodes the A72-vs-A53-crypto deployment-pragmatic + // decision and is the canonical ARM dispatch helper. + return match simd_caps().arm_profile() { + ArmProfile::A76DotProd => SimdProfile::A76DotProd, + ArmProfile::A72Fast => SimdProfile::A72Fast, + ArmProfile::A53Baseline => SimdProfile::A53Baseline, + ArmProfile::NotArm => SimdProfile::Scalar, + }; + } + SimdProfile::Scalar + } + + /// Human-readable name. Stable across versions; safe to use in logs, + /// telemetry, and bench output. + pub const fn name(self) -> &'static str { + match self { + Self::GraniteRapids => "GraniteRapids", + Self::SapphireRapids => "SapphireRapids", + Self::Zen4Avx512 => "Zen4Avx512", + Self::CooperLake => "CooperLake", + Self::TigerLakeU => "TigerLakeU", + Self::IceLakeSp => "IceLakeSp", + Self::CascadeLake => "CascadeLake", + Self::SkylakeX => "SkylakeX", + Self::ArrowLake => "ArrowLake", + Self::HaswellAvx2 => "HaswellAvx2", + Self::A76DotProd => "A76DotProd", + Self::A72Fast => "A72Fast", + Self::A53Baseline => "A53Baseline", + Self::Scalar => "Scalar", + } + } + + /// True iff this profile sits on the x86_64 silicon family. Useful + /// for branching in dispatch tables that share a single AMX/AVX-512 + /// kernel across multiple variants. + pub const fn is_x86(self) -> bool { + matches!( + self, + Self::GraniteRapids + | Self::SapphireRapids + | Self::Zen4Avx512 + | Self::CooperLake + | Self::TigerLakeU + | Self::IceLakeSp + | Self::CascadeLake + | Self::SkylakeX + | Self::ArrowLake + | Self::HaswellAvx2 + ) + } + + /// True iff this profile sits on the aarch64 silicon family. + pub const fn is_aarch64(self) -> bool { + matches!( + self, + Self::A76DotProd | Self::A72Fast | Self::A53Baseline + ) + } + + /// True iff this profile has AVX-512 Foundation. False for ArrowLake, + /// HaswellAvx2, aarch64, and Scalar. Use to route AVX-512 dispatch + /// without re-querying `SimdCaps`. + pub const fn has_avx512(self) -> bool { + matches!( + self, + Self::GraniteRapids + | Self::SapphireRapids + | Self::Zen4Avx512 + | Self::CooperLake + | Self::TigerLakeU + | Self::IceLakeSp + | Self::CascadeLake + | Self::SkylakeX + ) + } + + /// True iff this profile has any AMX tile capability (AMX-TILE + + /// AMX-INT8 + AMX-BF16 at minimum). Only SPR-class and GNR. + pub const fn has_amx(self) -> bool { + matches!(self, Self::GraniteRapids | Self::SapphireRapids) + } +} + +static PROFILE: LazyLock = LazyLock::new(SimdProfile::detect); + +/// Get the resolved silicon profile, detected once at first access. +/// +/// All subsequent calls are a single pointer deref to a `Copy` enum — +/// no atomics, no branching. Pair with `*Dispatch` static tables to make +/// per-call dispatch a single indirect call after monomorphisation. +#[inline(always)] +pub fn simd_profile() -> SimdProfile { + *PROFILE +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn detect_returns_a_valid_profile() { + let p = simd_profile(); + // Smoke: the LazyLock must not panic and must return *some* + // variant. `name()` exhausts the match, which would fail to + // compile if a new variant were added without a name. + let _ = p.name(); + } + + #[test] + fn determinism() { + // Two consecutive calls must agree — the LazyLock guarantees + // the closure runs exactly once. + assert_eq!(simd_profile(), simd_profile()); + } + + #[test] + fn arch_partitioning_is_consistent() { + let p = simd_profile(); + if p.is_x86() { + assert!(!p.is_aarch64(), "{:?} flagged both x86 and aarch64", p); + } + if p.is_aarch64() { + assert!(!p.is_x86(), "{:?} flagged both x86 and aarch64", p); + } + // On unsupported architectures the only legal answer is Scalar. + #[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))] + assert_eq!(p, SimdProfile::Scalar); + } + + #[test] + fn x86_target_lands_inside_x86_family() { + #[cfg(target_arch = "x86_64")] + { + let p = simd_profile(); + // Scalar is a valid x86_64 answer (no AVX2/FMA available), + // but any non-Scalar result must be inside the x86 family. + if p != SimdProfile::Scalar { + assert!( + p.is_x86(), + "x86_64 silicon resolved as non-x86 profile {:?}", + p + ); + } + } + } + + #[test] + fn aarch64_target_lands_inside_aarch64_family() { + #[cfg(target_arch = "aarch64")] + { + let p = simd_profile(); + assert!(p.is_aarch64(), "aarch64 silicon resolved as {:?}", p); + } + } + + #[test] + fn has_avx512_is_subset_of_is_x86() { + for &p in &[ + SimdProfile::GraniteRapids, + SimdProfile::SapphireRapids, + SimdProfile::Zen4Avx512, + SimdProfile::CooperLake, + SimdProfile::TigerLakeU, + SimdProfile::IceLakeSp, + SimdProfile::CascadeLake, + SimdProfile::SkylakeX, + SimdProfile::ArrowLake, + SimdProfile::HaswellAvx2, + SimdProfile::A76DotProd, + SimdProfile::A72Fast, + SimdProfile::A53Baseline, + SimdProfile::Scalar, + ] { + if p.has_avx512() { + assert!(p.is_x86(), "{:?} reports AVX-512 but is not x86", p); + } + if p.has_amx() { + assert!(p.has_avx512(), "{:?} reports AMX but not AVX-512", p); + assert!(p.is_x86(), "{:?} reports AMX but is not x86", p); + } + } + } + + #[test] + fn names_are_stable_and_unique() { + let all = [ + SimdProfile::GraniteRapids, + SimdProfile::SapphireRapids, + SimdProfile::Zen4Avx512, + SimdProfile::CooperLake, + SimdProfile::TigerLakeU, + SimdProfile::IceLakeSp, + SimdProfile::CascadeLake, + SimdProfile::SkylakeX, + SimdProfile::ArrowLake, + SimdProfile::HaswellAvx2, + SimdProfile::A76DotProd, + SimdProfile::A72Fast, + SimdProfile::A53Baseline, + SimdProfile::Scalar, + ]; + let names: Vec<&'static str> = all.iter().map(|p| p.name()).collect(); + for i in 0..names.len() { + for j in i + 1..names.len() { + assert_ne!( + names[i], names[j], + "profile names must be unique: {:?} == {:?}", + all[i], all[j] + ); + } + } + } +} diff --git a/src/simd.rs b/src/simd.rs index ce449991..877c304a 100644 --- a/src/simd.rs +++ b/src/simd.rs @@ -133,6 +133,22 @@ fn tier() -> Tier { // The check is cheap (reads a cached cpuid result) and the batch // function uses as_chunks::<16>() + as_chunks::<8>() for SIMD widths. +// ──────────────────────────────────────────────────────────────────── +// Silicon-grained profile re-export. +// +// `Tier` above is the legacy coarse enum (Avx512/Avx2/Neon/Scalar) used +// by the F32x16 / F64x8 dispatch in this module. `SimdProfile` is the +// fine-grained successor that names individual silicon generations +// (SapphireRapids, Zen4Avx512, IceLakeSp, A76DotProd, …) so consumers +// can route to the best primitive on each CPU. Both ship side-by-side +// during the Phase 3 integration; callers migrate on their own cadence. +// +// Public surface lives in `crate::simd::*` per the cognitive-shader +// foundation contract (`cognitive-shader-foundation.md` § "Public Surface"). +// ──────────────────────────────────────────────────────────────────── +#[cfg(feature = "std")] +pub use crate::hpc::simd_profile::{simd_profile, SimdProfile}; + // ============================================================================ // Preferred SIMD lane widths — compile-time constants for array_windows // ============================================================================ From b9a6fa0e85960a0f7a7cfec7e7ed19fd1a2b8ed1 Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 20 May 2026 20:34:55 +0000 Subject: [PATCH 2/5] feat(simd): cpu-* cargo features for compile-time SimdProfile pinning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Phase 3 T3.2: add 13 mutually-exclusive cargo features that pin simd_profile() to a const at compile time, bypassing the runtime LazyLock detection from T3.1. One feature per non-Scalar variant of the SimdProfile enum. Features (mapping to LLVM target-cpu codenames): cpu-gnr → GraniteRapids (graniterapids) cpu-spr → SapphireRapids (sapphirerapids) cpu-zen4 → Zen4Avx512 (znver4) cpu-cpl → CooperLake (cooperlake) cpu-tigerlake → TigerLakeU (tigerlake) cpu-icx → IceLakeSp (icelake-server) cpu-clx → CascadeLake (cascadelake) cpu-skx → SkylakeX (skylake-avx512) cpu-arrowlake → ArrowLake (arrowlake) cpu-haswell → HaswellAvx2 (haswell) cpu-a76 → A76DotProd (cortex-a76) cpu-a72 → A72Fast (cortex-a72) cpu-a53 → A53Baseline (cortex-a53) Mutual exclusion (per integration-plan risk #1: "if a user sets cpu-spr AND has runtime detection on Zen 4, the binary SIGILLs on AMX instructions") is enforced via a const assert: each cpu-* contributes 1 to _PIN_COUNT and the assert fires at compile time if the sum exceeds one. Verified: enabling cpu-spr+cpu-zen4 simultaneously produces a build error citing the mutex. Implementation: a const pinned_profile() -> Option walks a cfg cascade and returns the active variant or None. The simd_profile() function exists in two cfg-gated forms — a runtime LazyLock variant compiled when no cpu-* feature is set, and a const-foldable variant compiled when any is set. The LazyLock is not linked into pinned binaries. is_pinned() const helper exposes whether compile-time dispatch is active, useful both for consumer-facing diagnostics and for gating arch-detection tests that no longer apply when pinning overrides hardware. Existing x86_target_lands_inside_x86_family / aarch64_target_lands_inside_aarch64_family tests early-return when pinned; two new tests (pinning_default_is_off, pinning_consistency) verify the const + runtime paths agree. Tests: 2077/2077 lib tests pass under both default and --features cpu-spr configurations. cargo clippy --lib -- -D warnings clean in both modes. Mutex compile_error verified by attempting --features "cpu-spr,cpu-zen4" — fails as expected with the const assert citation. The default (no feature) path is byte-identical to the T3.1 runtime detection — no regression risk to the merged #181 work or to the e40f3a31 SimdProfile commit. --- Cargo.toml | 30 +++++ src/hpc/simd_profile.rs | 236 +++++++++++++++++++++++++++++++++++++++- 2 files changed, 262 insertions(+), 4 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index ee7cc8f7..a431f14b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -273,6 +273,36 @@ splat3d = ["std"] # quad-tree partition; the entropy coder + RDO loop land in later workers. codec = ["std"] +# ── Phase 3 T3.2: compile-time SimdProfile pinning ─────────────────── +# +# Each cpu- feature, when enabled, makes +# `crate::simd::simd_profile()` fold to a const at compile time and +# bypass the runtime LazyLock detection. Pair with the matching +# `-Ctarget-cpu=` in `.cargo/config.toml` (or `RUSTFLAGS`) +# for full effect — the cargo feature picks the *dispatch* variant, +# while `-Ctarget-cpu` picks the *codegen* variant. Both together +# produce a binary that is specialised to one silicon family. +# +# Features are MUTUALLY EXCLUSIVE — enable at most one. A compile-time +# assert in `src/hpc/simd_profile.rs` enforces this. Multiple +# pinning features active = build error. +# +# Codename → SimdProfile variant mapping (see +# `.claude/knowledge/td-simd-cpu-dispatch-matrix.md`): +cpu-gnr = [] # GraniteRapids — target-cpu=graniterapids +cpu-spr = [] # SapphireRapids — target-cpu=sapphirerapids +cpu-zen4 = [] # Zen4Avx512 — target-cpu=znver4 (or znver5) +cpu-cpl = [] # CooperLake — target-cpu=cooperlake +cpu-tigerlake = [] # TigerLakeU — target-cpu=tigerlake +cpu-icx = [] # IceLakeSp — target-cpu=icelake-server +cpu-clx = [] # CascadeLake — target-cpu=cascadelake +cpu-skx = [] # SkylakeX — target-cpu=skylake-avx512 +cpu-arrowlake = [] # ArrowLake — target-cpu=arrowlake +cpu-haswell = [] # HaswellAvx2 — target-cpu=haswell (or znver3) +cpu-a76 = [] # A76DotProd — target-cpu=cortex-a76 +cpu-a72 = [] # A72Fast — target-cpu=cortex-a72 +cpu-a53 = [] # A53Baseline — target-cpu=cortex-a53 + # no_std polyfill for `static LazyLock` in `src/simd.rs` (sprint A12). # Pulls in `portable-atomic` with the `critical-section` impl plus the # `critical-section` runtime so we can build a once-cell-style cache for diff --git a/src/hpc/simd_profile.rs b/src/hpc/simd_profile.rs index a49df92b..7ed65b37 100644 --- a/src/hpc/simd_profile.rs +++ b/src/hpc/simd_profile.rs @@ -225,18 +225,200 @@ impl SimdProfile { } } +// ──────────────────────────────────────────────────────────────────── +// Phase 3 T3.2 — compile-time pinning via `cpu-*` cargo features. +// +// When any `cpu-` feature is set, `PINNED_PROFILE` is `Some(_)` +// and `simd_profile()` folds to the const at compile time — no LazyLock +// initialisation, no branch, no atomics. Without any feature set, +// behaviour matches the original runtime LazyLock detection. +// +// Mutual exclusion: at most ONE cpu-* feature may be enabled. The +// `_PIN_COUNT` const assert below fires at compile time if more than +// one is active, mirroring integration-plan risk #1 ("if a user sets +// cpu-spr AND has runtime detection on Zen 4, the binary SIGILLs"). +// ──────────────────────────────────────────────────────────────────── + +const _PIN_COUNT: u32 = 0 + + cfg!(feature = "cpu-gnr") as u32 + + cfg!(feature = "cpu-spr") as u32 + + cfg!(feature = "cpu-zen4") as u32 + + cfg!(feature = "cpu-cpl") as u32 + + cfg!(feature = "cpu-tigerlake") as u32 + + cfg!(feature = "cpu-icx") as u32 + + cfg!(feature = "cpu-clx") as u32 + + cfg!(feature = "cpu-skx") as u32 + + cfg!(feature = "cpu-arrowlake") as u32 + + cfg!(feature = "cpu-haswell") as u32 + + cfg!(feature = "cpu-a76") as u32 + + cfg!(feature = "cpu-a72") as u32 + + cfg!(feature = "cpu-a53") as u32; + +const _: () = assert!( + _PIN_COUNT <= 1, + "cpu-* cargo features are mutually exclusive: enable at most one (cpu-gnr, cpu-spr, cpu-zen4, cpu-cpl, cpu-tigerlake, cpu-icx, cpu-clx, cpu-skx, cpu-arrowlake, cpu-haswell, cpu-a76, cpu-a72, cpu-a53)" +); + +/// The compile-time pinned profile, or `None` when runtime detection is in +/// effect. `Some(_)` exactly when one of the `cpu-*` cargo features is +/// enabled; mutually exclusive features are enforced by the `_PIN_COUNT` +/// const assert above. +/// +/// Consumers wanting branch-free dispatch on pinned builds can match on +/// this const directly — the optimiser folds `Some(SimdProfile::X)` into +/// the call site and the `None`-arm runtime path is eliminated. Returned +/// from a `const fn` so call sites in const contexts (e.g. `const` array +/// initialisers for dispatch tables) work as well. +pub const fn pinned_profile() -> Option { + #[cfg(feature = "cpu-gnr")] + { + return Some(SimdProfile::GraniteRapids); + } + #[cfg(feature = "cpu-spr")] + { + return Some(SimdProfile::SapphireRapids); + } + #[cfg(feature = "cpu-zen4")] + { + return Some(SimdProfile::Zen4Avx512); + } + #[cfg(feature = "cpu-cpl")] + { + return Some(SimdProfile::CooperLake); + } + #[cfg(feature = "cpu-tigerlake")] + { + return Some(SimdProfile::TigerLakeU); + } + #[cfg(feature = "cpu-icx")] + { + return Some(SimdProfile::IceLakeSp); + } + #[cfg(feature = "cpu-clx")] + { + return Some(SimdProfile::CascadeLake); + } + #[cfg(feature = "cpu-skx")] + { + return Some(SimdProfile::SkylakeX); + } + #[cfg(feature = "cpu-arrowlake")] + { + return Some(SimdProfile::ArrowLake); + } + #[cfg(feature = "cpu-haswell")] + { + return Some(SimdProfile::HaswellAvx2); + } + #[cfg(feature = "cpu-a76")] + { + return Some(SimdProfile::A76DotProd); + } + #[cfg(feature = "cpu-a72")] + { + return Some(SimdProfile::A72Fast); + } + #[cfg(feature = "cpu-a53")] + { + return Some(SimdProfile::A53Baseline); + } + #[allow(unreachable_code)] + None +} + +/// `true` when a `cpu-*` cargo feature has pinned the profile at compile +/// time, `false` when runtime detection is in use. Equivalent to +/// `pinned_profile().is_some()` but spelled out for grep-ability. +pub const fn is_pinned() -> bool { + pinned_profile().is_some() +} + +// The LazyLock only exists when no cpu-* feature is set. With pinning, +// linking the LazyLock would defeat the purpose — we want every code +// path that touches `simd_profile()` to fold to a const. +#[cfg(not(any( + feature = "cpu-gnr", + feature = "cpu-spr", + feature = "cpu-zen4", + feature = "cpu-cpl", + feature = "cpu-tigerlake", + feature = "cpu-icx", + feature = "cpu-clx", + feature = "cpu-skx", + feature = "cpu-arrowlake", + feature = "cpu-haswell", + feature = "cpu-a76", + feature = "cpu-a72", + feature = "cpu-a53", +)))] static PROFILE: LazyLock = LazyLock::new(SimdProfile::detect); -/// Get the resolved silicon profile, detected once at first access. +/// Get the resolved silicon profile. +/// +/// **Default (no `cpu-*` feature):** detected once at first access via +/// `LazyLock`; subsequent calls are a single pointer deref to a `Copy` +/// enum — no atomics, no branching. +/// +/// **Pinned (one `cpu-*` feature set):** returns the pinned const +/// directly. The compiler folds this call into the matching variant at +/// every call site; the LazyLock is not linked into the binary. /// -/// All subsequent calls are a single pointer deref to a `Copy` enum — -/// no atomics, no branching. Pair with `*Dispatch` static tables to make -/// per-call dispatch a single indirect call after monomorphisation. +/// Pair with `*Dispatch` static tables to make per-call dispatch a single +/// indirect call after monomorphisation (runtime path) or to fold to a +/// direct call (pinned path). +#[cfg(not(any( + feature = "cpu-gnr", + feature = "cpu-spr", + feature = "cpu-zen4", + feature = "cpu-cpl", + feature = "cpu-tigerlake", + feature = "cpu-icx", + feature = "cpu-clx", + feature = "cpu-skx", + feature = "cpu-arrowlake", + feature = "cpu-haswell", + feature = "cpu-a76", + feature = "cpu-a72", + feature = "cpu-a53", +)))] #[inline(always)] pub fn simd_profile() -> SimdProfile { *PROFILE } +/// Get the resolved silicon profile (pinned variant). +/// +/// A `cpu-*` cargo feature is active: this returns the pinned constant +/// directly, foldable at every call site. The runtime LazyLock is not +/// linked into the binary. See the documentation on the runtime variant +/// for the call-site contract. +#[cfg(any( + feature = "cpu-gnr", + feature = "cpu-spr", + feature = "cpu-zen4", + feature = "cpu-cpl", + feature = "cpu-tigerlake", + feature = "cpu-icx", + feature = "cpu-clx", + feature = "cpu-skx", + feature = "cpu-arrowlake", + feature = "cpu-haswell", + feature = "cpu-a76", + feature = "cpu-a72", + feature = "cpu-a53", +))] +#[inline(always)] +pub const fn simd_profile() -> SimdProfile { + // SAFETY of the unwrap: the cfg gate above guarantees at least one + // cpu-* feature is set, and `pinned_profile()` returns Some(_) under + // any of those gates. Const-evaluable since the inner cfg cascade is + // resolved at compile time. + match pinned_profile() { + Some(p) => p, + None => SimdProfile::Scalar, // unreachable; const fn can't panic cleanly + } +} + #[cfg(test)] mod tests { use super::*; @@ -273,6 +455,11 @@ mod tests { #[test] fn x86_target_lands_inside_x86_family() { + // Pinning overrides hardware detection — the test only describes + // the default (runtime-detection) path. + if is_pinned() { + return; + } #[cfg(target_arch = "x86_64")] { let p = simd_profile(); @@ -290,6 +477,9 @@ mod tests { #[test] fn aarch64_target_lands_inside_aarch64_family() { + if is_pinned() { + return; + } #[cfg(target_arch = "aarch64")] { let p = simd_profile(); @@ -297,6 +487,44 @@ mod tests { } } + #[test] + fn pinning_default_is_off() { + // The default build (no cpu-* feature) must NOT be pinned, so + // downstream consumers don't get surprised by compile-time + // dispatch they didn't opt into. + #[cfg(not(any( + feature = "cpu-gnr", + feature = "cpu-spr", + feature = "cpu-zen4", + feature = "cpu-cpl", + feature = "cpu-tigerlake", + feature = "cpu-icx", + feature = "cpu-clx", + feature = "cpu-skx", + feature = "cpu-arrowlake", + feature = "cpu-haswell", + feature = "cpu-a76", + feature = "cpu-a72", + feature = "cpu-a53", + )))] + { + assert!(!is_pinned()); + assert_eq!(pinned_profile(), None); + } + } + + #[test] + fn pinning_consistency() { + // When pinning is in effect, simd_profile() must equal the + // pinned const — hardware detection is bypassed entirely. + if let Some(pinned) = pinned_profile() { + assert!(is_pinned()); + assert_eq!(simd_profile(), pinned); + } else { + assert!(!is_pinned()); + } + } + #[test] fn has_avx512_is_subset_of_is_x86() { for &p in &[ From 03b30e54d6f158945d72086d038f216d2c52a315 Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 20 May 2026 20:38:52 +0000 Subject: [PATCH 3/5] =?UTF-8?q?feat(examples):=20simd=5Fprofile=5Fprobe=20?= =?UTF-8?q?=E2=80=94=20hardware=20verification=20binary?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Supports step 1 of the TEST-promotion checklist from .claude/knowledge/td-simd-cpu-dispatch-matrix.md § "TEST verification checklist": "Boot the binary on the silicon and confirm simd_profile() returns the expected variant." The probe prints: - Resolved SimdProfile variant + arch/family flags - Compile-time pinning status (and pinned variant if active) - Every CPUID-derived SimdCaps bit, ticked/unticked - ARM heuristic profile when running on aarch64 - Active compile-time target features (avx512f / avx2) - Per-variant matrix-doc cell summary (terse — matrix is source of truth) - Runs the same pinning_consistency invariant the unit test checks, so a probe deployed on real silicon flags regressions in the cfg cascade. First-hardware results on the build host (Sapphire Rapids): - simd_profile() resolves to SapphireRapids ✓ - amx_tile + amx_bf16 + avx512fp16 all set ✓ - amx_fp16 unset (correctly NOT promoting to GraniteRapids) ✓ - GNR-before-SPR ordering invariant verified end-to-end This is the first end-to-end pass of the e40f3a31 SimdProfile detect chain plus the 5a3a6630 cpu-* pinning machinery on real silicon — confirms the dispatch axis is functional, not just doc-checked. Smoke-tested with --features cpu-zen4: probe correctly reports ACTIVE pinning and Zen4Avx512 as the resolved variant, even on SPR silicon (pinning intentionally overrides hardware detection). --- examples/simd_profile_probe.rs | 181 +++++++++++++++++++++++++++++++++ 1 file changed, 181 insertions(+) create mode 100644 examples/simd_profile_probe.rs diff --git a/examples/simd_profile_probe.rs b/examples/simd_profile_probe.rs new file mode 100644 index 00000000..609843f4 --- /dev/null +++ b/examples/simd_profile_probe.rs @@ -0,0 +1,181 @@ +//! `simd_profile_probe` — boot-on-silicon diagnostic for the dispatch matrix. +//! +//! Step 1 of the TEST-promotion checklist from +//! `.claude/knowledge/td-simd-cpu-dispatch-matrix.md` § "TEST verification +//! checklist": *"Boot the binary on the silicon and confirm `simd_profile()` +//! returns the expected variant."* +//! +//! Prints every CPUID-derived capability bit plus the resolved `SimdProfile` +//! variant. Used to verify silicon → profile mapping when promoting DOC +//! cells in the dispatch matrix to TEST. +//! +//! Usage: +//! ```sh +//! # Runtime detection (default — same binary on any silicon): +//! cargo run --example simd_profile_probe --release +//! +//! # Compile-time pinned (the LazyLock is not linked in): +//! cargo run --example simd_profile_probe --release --features cpu-spr +//! ``` + +use ndarray::hpc::simd_caps::{simd_caps, ArmProfile, SimdCaps}; +use ndarray::hpc::simd_profile::{is_pinned, pinned_profile, simd_profile, SimdProfile}; + +fn main() { + let caps = simd_caps(); + let profile = simd_profile(); + + println!("ndarray simd-profile probe"); + println!("=========================="); + println!(); + + // ── Dispatch identity ─────────────────────────────────────────── + println!("Resolved profile: {}", profile.name()); + println!(" is_x86: {}", profile.is_x86()); + println!(" is_aarch64: {}", profile.is_aarch64()); + println!(" has_avx512: {}", profile.has_avx512()); + println!(" has_amx: {}", profile.has_amx()); + println!(); + + // ── Pinning status ───────────────────────────────────────────── + println!("Compile-time pinning: {}", if is_pinned() { "ACTIVE" } else { "off (runtime detection)" }); + if let Some(p) = pinned_profile() { + println!(" Pinned variant: {}", p.name()); + } + println!(); + + // ── Raw capability bits ──────────────────────────────────────── + println!("SimdCaps (raw bits):"); + print_caps(&caps); + println!(); + + // ── ARM-specific sub-profile (heuristic; deployment-pragmatic) ── + let arm = caps.arm_profile(); + if !matches!(arm, ArmProfile::NotArm) { + println!("ARM profile (heuristic): {}", arm.name()); + println!(" est. tok/sec: {}", arm.estimated_tok_per_sec()); + println!(" eff. f32 lanes:{}", arm.effective_f32_lanes()); + println!(); + } + + // ── Build configuration ───────────────────────────────────────── + println!("Build:"); + println!(" target_arch: {}", std::env::consts::ARCH); + println!(" target_os: {}", std::env::consts::OS); + #[cfg(target_feature = "avx512f")] + println!(" -Ctarget-feature avx512f: yes (compile-time)"); + #[cfg(not(target_feature = "avx512f"))] + println!(" -Ctarget-feature avx512f: no (compile-time)"); + #[cfg(target_feature = "avx2")] + println!(" -Ctarget-feature avx2: yes (compile-time)"); + #[cfg(not(target_feature = "avx2"))] + println!(" -Ctarget-feature avx2: no (compile-time)"); + println!(); + + // ── TEST promotion guidance ──────────────────────────────────── + println!("Matrix-doc cells affected by this CPU:"); + matrix_cell_summary(profile); + + // Sanity invariant: simd_profile() and pinned_profile() must agree + // when pinning is active. This is the same check that + // `pinning_consistency` runs as a unit test; we re-run it here so a + // probe binary deployed on real silicon flags any future regression + // in the cfg cascade. + if let Some(p) = pinned_profile() { + assert_eq!( + profile, p, + "INVARIANT VIOLATION: pinned_profile()={:?} disagrees with simd_profile()={:?}", + p, profile + ); + } +} + +fn print_caps(c: &SimdCaps) { + let bits: &[(&str, bool)] = &[ + ("avx2", c.avx2), + ("avx512f", c.avx512f), + ("avx512bw", c.avx512bw), + ("avx512vl", c.avx512vl), + ("avx512vnni", c.avx512vnni), + ("avx512vbmi", c.avx512vbmi), + ("avx512vpopcntdq", c.avx512vpopcntdq), + ("avx512bf16", c.avx512bf16), + ("avx512fp16", c.avx512fp16), + ("avx512vp2intersect", c.avx512vp2intersect), + ("avxvnniint8", c.avxvnniint8), + ("amx_tile", c.amx_tile), + ("amx_int8", c.amx_int8), + ("amx_bf16", c.amx_bf16), + ("amx_fp16", c.amx_fp16), + ("fma", c.fma), + ("sse41", c.sse41), + ("sse2", c.sse2), + ("neon", c.neon), + ("asimd_dotprod", c.asimd_dotprod), + ("fp16 (arm)", c.fp16), + ("aes", c.aes), + ("sha2", c.sha2), + ("crc32", c.crc32), + ]; + for (name, present) in bits { + println!(" [{}] {}", if *present { "x" } else { " " }, name); + } +} + +fn matrix_cell_summary(p: SimdProfile) { + // Lifted from `td-simd-cpu-dispatch-matrix.md` § "Master matrix" + // for each x86 profile. The summary is intentionally terse — the + // matrix doc is the source of truth and should be consulted before + // promoting any DOC cell to TEST. + let summary: &[&str] = match p { + SimdProfile::GraniteRapids => &[ + "F+CD+VL+DQ+BW+IFMA+VBMI+VBMI2+VNNI+BF16+FP16", + "VPOPCNTDQ+BITALG+GFNI+VAES+VPCLMUL", + "AMX-TILE+INT8+BF16+FP16 (FP16 is the GNR discriminator)", + ], + SimdProfile::SapphireRapids => &[ + "F+CD+VL+DQ+BW+IFMA+VBMI+VBMI2+VNNI+BF16+FP16", + "VPOPCNTDQ+BITALG+GFNI+VAES+VPCLMUL", + "AMX-TILE+INT8+BF16 (no AMX-FP16 — that's GNR)", + ], + SimdProfile::Zen4Avx512 => &[ + "F+CD+VL+DQ+BW+IFMA+VBMI+VBMI2+VNNI+BF16+FP16", + "No AMX of any kind; 256-bit FPU double-pumped on Zen4, native 512-bit on Zen5", + ], + SimdProfile::CooperLake => &[ + "F+CD+VL+DQ+BW+VNNI+BF16", + "No VBMI, no FP16, no AMX — unique 'BF16 without VBMI'", + ], + SimdProfile::TigerLakeU => &[ + "F+CD+VL+DQ+BW+IFMA+VBMI+VBMI2+VNNI+VP2INTERSECT", + "VP2INTERSECT is the sole discriminator vs IceLakeSp", + ], + SimdProfile::IceLakeSp => &[ + "F+CD+VL+DQ+BW+IFMA+VBMI+VBMI2+VNNI", + "No BF16, no FP16, no AMX, no VP2INTERSECT", + ], + SimdProfile::CascadeLake => &["F+CD+VL+DQ+BW+VNNI", "First Xeon with VNNI; no VBMI/BF16/FP16/AMX"], + SimdProfile::SkylakeX => &["F+CD+VL+DQ+BW", "Founding AVX-512 baseline; everything since adds on top"], + SimdProfile::ArrowLake => &[ + "No AVX-512 (hybrid CPU design)", + "AVX-VNNI-INT8 + AVX-IFMA + AVX-NE-CONVERT (256-bit / VEX forms)", + ], + SimdProfile::HaswellAvx2 => &["AVX2 + FMA + F16C + BMI1/2", "Haswell..Coffee Lake / Zen 1-3"], + SimdProfile::A76DotProd => &[ + "NEON + dotprod + fp16 + bf16+ + i8mm", + "Pi 5 (BCM2712), Orange Pi 5 (RK3588), Apple M1+", + ], + SimdProfile::A72Fast => &[ + "NEON 128-bit + crypto (AES/SHA-2/CRC32)", + "Pi 4 (BCM2711), Pi 3-with-crypto, Orange Pi 4 — HWCAP cannot distinguish A72 from A53-with-crypto", + ], + SimdProfile::A53Baseline => &[ + "NEON 128-bit baseline", + "Rare in the wild — QEMU / minimal aarch64 without crypto", + ], + SimdProfile::Scalar => &["No SIMD ISA recognised", "Fallback: scalar reference kernels"], + }; + for line in summary { + println!(" - {}", line); + } +} From de52a446bdff917d2612c0bf5c3b96b165faa953 Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 20 May 2026 20:43:24 +0000 Subject: [PATCH 4/5] =?UTF-8?q?fix(simd):=20SimdProfile::detect()=20consul?= =?UTF-8?q?ts=20amx=5Favailable()=20=E2=80=94=20Risk=20#3=20closure?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Integration plan risk #3 ("Detection robustness across hypervisors"): CPUID may advertise AMX-TILE while the OS/hypervisor has not enabled the tile XSAVE state. Without the OS-level check, the dispatch table routes to AMX kernels that SIGILL at first use. Fix: SimdProfile::detect() now reads `simd_amx::amx_available()` (the existing 4-step gate: CPUID → OSXSAVE → XCR0[17,18] → arch_prctl XCOMP_PERM on Linux 5.19+) and demotes when CPUID and OS disagree. The GraniteRapids and SapphireRapids arms now require both the CPUID bits AND `amx_usable`; the Zen4Avx512 arm catches SPR-class CPUID with locked-down hypervisor XSAVE so dispatch falls to the AVX-512 BF16/FP16 path instead. Verified on the build host (Sapphire Rapids silicon, kernel 6.18.5): - CPUID reports amx_tile=1, amx_int8=1, amx_bf16=1 (all true) - simd_amx::amx_available() returns false (hypervisor masks XCR0[17,18] or the arch_prctl(XCOMP_PERM) request fails) - SimdProfile::detect() correctly resolves to Zen4Avx512, not SapphireRapids — the AMX kernels are not reachable from dispatch on this OS state. Without this fix, the e40f3a31 detect path would have resolved to SapphireRapids on this exact silicon/OS combination, then SIGILL'd the first time a dispatch table called an AMX kernel. Bug closed before any consumer was wired to the dispatch table. The probe binary (examples/simd_profile_probe.rs) gains a new "AMX gating (CPUID vs OS)" section so the CPUID-vs-OS gap is visible without reading source. Format mirrors how the matrix-doc cell summary appears: terse, two lines plus an optional demotion note when the bits disagree. Pinned mode (cpu-* cargo features) intentionally bypasses this gate since pinning is a build-time assertion that the target OS supports the chosen variant — pinned binaries are non-portable by design. Tests: 2077/2077 lib pass. cargo clippy --lib clean under default and --features cpu-spr. Behaviour on hardware with proper AMX enablement (full prctl path success) is unchanged: SapphireRapids still resolves to SapphireRapids when amx_available() returns true. --- examples/simd_profile_probe.rs | 21 +++++++++++++++++++++ src/hpc/simd_profile.rs | 20 ++++++++++++++++---- 2 files changed, 37 insertions(+), 4 deletions(-) diff --git a/examples/simd_profile_probe.rs b/examples/simd_profile_probe.rs index 609843f4..5324198c 100644 --- a/examples/simd_profile_probe.rs +++ b/examples/simd_profile_probe.rs @@ -49,6 +49,27 @@ fn main() { print_caps(&caps); println!(); + // ── AMX OS-state probe (Risk #3 from integration plan) ──────── + // SimdCaps reports raw CPUID. SimdProfile::detect() additionally + // consults `simd_amx::amx_available()` which gates on + // OSXSAVE + XCR0[17,18] + arch_prctl(XCOMP_PERM). If CPUID says + // AMX-TILE but the OS/hypervisor doesn't enable the XSAVE state, + // dispatch demotes from SPR/GNR to Zen4Avx512 (AVX-512 BF16 path + // instead of AMX tiles). Surfacing the gap here lets a reviewer + // see when CPUID-vs-OS disagree without reading source. + #[cfg(target_arch = "x86_64")] + { + let cpuid_says_amx = caps.amx_tile && caps.amx_int8; + let os_allows_amx = ndarray::simd_amx::amx_available(); + println!("AMX gating (CPUID vs OS):"); + println!(" CPUID amx_tile+amx_int8: {}", cpuid_says_amx); + println!(" OS XSAVE/prctl gate: {}", os_allows_amx); + if cpuid_says_amx && !os_allows_amx { + println!(" → CPUID-reported AMX is OS-DEMOTED — dispatch falls back to AVX-512 path"); + } + println!(); + } + // ── ARM-specific sub-profile (heuristic; deployment-pragmatic) ── let arm = caps.arm_profile(); if !matches!(arm, ArmProfile::NotArm) { diff --git a/src/hpc/simd_profile.rs b/src/hpc/simd_profile.rs index 7ed65b37..78775fe9 100644 --- a/src/hpc/simd_profile.rs +++ b/src/hpc/simd_profile.rs @@ -86,22 +86,34 @@ impl SimdProfile { #[cfg(target_arch = "x86_64")] { let caps = simd_caps(); + // Risk #3 from the integration plan ("Detection robustness + // across hypervisors"): CPUID may report AMX-TILE while the + // OS has not enabled the tile XSAVE state. In that case AMX + // instructions SIGILL despite the CPUID bit being set. + // `simd_amx::amx_available()` runs the full 4-step gate + // (CPUID + OSXSAVE + XCR0 bits 17/18 + arch_prctl + // XCOMP_PERM). Demote to the no-AMX dispatch branch when + // the OS check fails — typically resolves as Zen4Avx512 on + // SPR-class CPUID with locked-down hypervisor XSAVE state. + let amx_usable = caps.amx_tile && crate::simd_amx::amx_available(); // GraniteRapids: AMX-FP16 (CPUID 7,1 EAX bit 21). Must be // checked first because GNR is a strict superset of SPR. - if caps.has_amx_fp16() { + if amx_usable && caps.amx_fp16 { return SimdProfile::GraniteRapids; } // SapphireRapids / EmeraldRapids: AMX-TILE + AMX-BF16 + // AVX-512-FP16. EmeraldRapids has identical ISA — same variant. - if caps.amx_tile && caps.amx_bf16 && caps.avx512fp16 { + if amx_usable && caps.amx_bf16 && caps.avx512fp16 { return SimdProfile::SapphireRapids; } - // Zen4 / Zen5: AVX-512 + VBMI + BF16 + FP16, but no AMX. + // Zen4 / Zen5: AVX-512 + VBMI + BF16 + FP16, but no usable + // AMX. The `!amx_usable` guard also catches OS-demoted SPR + // silicon so it resolves here instead of as SapphireRapids. if caps.avx512f && caps.avx512vbmi && caps.avx512bf16 && caps.avx512fp16 - && !caps.amx_tile + && !amx_usable { return SimdProfile::Zen4Avx512; } From a7279e00aa283f6a75e7ff8a679138ecd3110ba1 Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 21 May 2026 11:37:28 +0000 Subject: [PATCH 5/5] fix(simd_profile): target_arch guards on cpu-* features + std-gate example MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two independent bug fixes for codex P2 on de52a446bd and the no-default-features CI failure: 1) codex P2 — Reject ARM cpu-* on x86 builds (and vice versa). Without target_arch guards, `--features cpu-a76` on an x86_64 build silently routes simd_profile() to A76DotProd, breaking the is_x86()/is_aarch64() partitioning and routing callers into the wrong dispatch family. Add compile_error! checks that fail fast for the current target_arch — same fail-fast pattern as the existing _PIN_COUNT mutual-exclusion assert. 2) CI fix — examples/simd_profile_probe.rs uses ndarray::hpc::* and ndarray::simd_amx::* which are gated behind the "std" feature. On `cargo test --no-default-features` the example target still tries to compile, producing E0433 "cannot find hpc in ndarray". Add `required-features = ["std"]` to the example's Cargo.toml entry so it is skipped when std is disabled, matching the existing pattern for ocr_benchmark. No behavioral change on default builds. Both fixes are independent of the architectural question about whether cpu-* features should exist at all (which is for the originating session to revisit if they want to unify with cpu_ops_for_cpu); this just makes the existing features less of a footgun. --- Cargo.toml | 4 ++++ src/hpc/simd_profile.rs | 38 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 42 insertions(+) diff --git a/Cargo.toml b/Cargo.toml index a431f14b..3e943e1f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -42,6 +42,10 @@ required-features = ["std"] name = "splat3d_flex" required-features = ["splat3d"] +[[example]] +name = "simd_profile_probe" +required-features = ["std"] + [dependencies] num-integer = { workspace = true } num-traits = { workspace = true } diff --git a/src/hpc/simd_profile.rs b/src/hpc/simd_profile.rs index 78775fe9..4b51996b 100644 --- a/src/hpc/simd_profile.rs +++ b/src/hpc/simd_profile.rs @@ -271,6 +271,44 @@ const _: () = assert!( "cpu-* cargo features are mutually exclusive: enable at most one (cpu-gnr, cpu-spr, cpu-zen4, cpu-cpl, cpu-tigerlake, cpu-icx, cpu-clx, cpu-skx, cpu-arrowlake, cpu-haswell, cpu-a76, cpu-a72, cpu-a53)" ); +// ──────────────────────────────────────────────────────────────────── +// target_arch guards (codex P2 closure). +// +// Each `cpu-` pin is only valid on its native silicon +// family. Without these guards `--features cpu-a76` on an x86_64 +// build would silently route `simd_profile()` to `A76DotProd`, +// breaking the `is_x86()` / `is_aarch64()` partitioning and routing +// callers into the wrong dispatch family. Fail fast at compile time +// instead. +// ──────────────────────────────────────────────────────────────────── + +#[cfg(all( + not(target_arch = "x86_64"), + any( + feature = "cpu-gnr", + feature = "cpu-spr", + feature = "cpu-zen4", + feature = "cpu-cpl", + feature = "cpu-tigerlake", + feature = "cpu-icx", + feature = "cpu-clx", + feature = "cpu-skx", + feature = "cpu-arrowlake", + feature = "cpu-haswell", + ) +))] +compile_error!( + "x86 cpu-* pinning features (cpu-gnr, cpu-spr, cpu-zen4, cpu-cpl, cpu-tigerlake, cpu-icx, cpu-clx, cpu-skx, cpu-arrowlake, cpu-haswell) require target_arch = \"x86_64\"" +); + +#[cfg(all( + not(target_arch = "aarch64"), + any(feature = "cpu-a76", feature = "cpu-a72", feature = "cpu-a53",) +))] +compile_error!( + "ARM cpu-* pinning features (cpu-a76, cpu-a72, cpu-a53) require target_arch = \"aarch64\"" +); + /// The compile-time pinned profile, or `None` when runtime detection is in /// effect. `Some(_)` exactly when one of the `cpu-*` cargo features is /// enabled; mutually exclusive features are enforced by the `_PIN_COUNT`