diff --git a/Cargo.toml b/Cargo.toml index ee7cc8f7..3e943e1f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -42,6 +42,10 @@ required-features = ["std"] name = "splat3d_flex" required-features = ["splat3d"] +[[example]] +name = "simd_profile_probe" +required-features = ["std"] + [dependencies] num-integer = { workspace = true } num-traits = { workspace = true } @@ -273,6 +277,36 @@ splat3d = ["std"] # quad-tree partition; the entropy coder + RDO loop land in later workers. codec = ["std"] +# ── Phase 3 T3.2: compile-time SimdProfile pinning ─────────────────── +# +# Each cpu- feature, when enabled, makes +# `crate::simd::simd_profile()` fold to a const at compile time and +# bypass the runtime LazyLock detection. Pair with the matching +# `-Ctarget-cpu=` in `.cargo/config.toml` (or `RUSTFLAGS`) +# for full effect — the cargo feature picks the *dispatch* variant, +# while `-Ctarget-cpu` picks the *codegen* variant. Both together +# produce a binary that is specialised to one silicon family. +# +# Features are MUTUALLY EXCLUSIVE — enable at most one. A compile-time +# assert in `src/hpc/simd_profile.rs` enforces this. Multiple +# pinning features active = build error. +# +# Codename → SimdProfile variant mapping (see +# `.claude/knowledge/td-simd-cpu-dispatch-matrix.md`): +cpu-gnr = [] # GraniteRapids — target-cpu=graniterapids +cpu-spr = [] # SapphireRapids — target-cpu=sapphirerapids +cpu-zen4 = [] # Zen4Avx512 — target-cpu=znver4 (or znver5) +cpu-cpl = [] # CooperLake — target-cpu=cooperlake +cpu-tigerlake = [] # TigerLakeU — target-cpu=tigerlake +cpu-icx = [] # IceLakeSp — target-cpu=icelake-server +cpu-clx = [] # CascadeLake — target-cpu=cascadelake +cpu-skx = [] # SkylakeX — target-cpu=skylake-avx512 +cpu-arrowlake = [] # ArrowLake — target-cpu=arrowlake +cpu-haswell = [] # HaswellAvx2 — target-cpu=haswell (or znver3) +cpu-a76 = [] # A76DotProd — target-cpu=cortex-a76 +cpu-a72 = [] # A72Fast — target-cpu=cortex-a72 +cpu-a53 = [] # A53Baseline — target-cpu=cortex-a53 + # no_std polyfill for `static LazyLock` in `src/simd.rs` (sprint A12). # Pulls in `portable-atomic` with the `critical-section` impl plus the # `critical-section` runtime so we can build a once-cell-style cache for diff --git a/examples/simd_profile_probe.rs b/examples/simd_profile_probe.rs new file mode 100644 index 00000000..5324198c --- /dev/null +++ b/examples/simd_profile_probe.rs @@ -0,0 +1,202 @@ +//! `simd_profile_probe` — boot-on-silicon diagnostic for the dispatch matrix. +//! +//! Step 1 of the TEST-promotion checklist from +//! `.claude/knowledge/td-simd-cpu-dispatch-matrix.md` § "TEST verification +//! checklist": *"Boot the binary on the silicon and confirm `simd_profile()` +//! returns the expected variant."* +//! +//! Prints every CPUID-derived capability bit plus the resolved `SimdProfile` +//! variant. Used to verify silicon → profile mapping when promoting DOC +//! cells in the dispatch matrix to TEST. +//! +//! Usage: +//! ```sh +//! # Runtime detection (default — same binary on any silicon): +//! cargo run --example simd_profile_probe --release +//! +//! # Compile-time pinned (the LazyLock is not linked in): +//! cargo run --example simd_profile_probe --release --features cpu-spr +//! ``` + +use ndarray::hpc::simd_caps::{simd_caps, ArmProfile, SimdCaps}; +use ndarray::hpc::simd_profile::{is_pinned, pinned_profile, simd_profile, SimdProfile}; + +fn main() { + let caps = simd_caps(); + let profile = simd_profile(); + + println!("ndarray simd-profile probe"); + println!("=========================="); + println!(); + + // ── Dispatch identity ─────────────────────────────────────────── + println!("Resolved profile: {}", profile.name()); + println!(" is_x86: {}", profile.is_x86()); + println!(" is_aarch64: {}", profile.is_aarch64()); + println!(" has_avx512: {}", profile.has_avx512()); + println!(" has_amx: {}", profile.has_amx()); + println!(); + + // ── Pinning status ───────────────────────────────────────────── + println!("Compile-time pinning: {}", if is_pinned() { "ACTIVE" } else { "off (runtime detection)" }); + if let Some(p) = pinned_profile() { + println!(" Pinned variant: {}", p.name()); + } + println!(); + + // ── Raw capability bits ──────────────────────────────────────── + println!("SimdCaps (raw bits):"); + print_caps(&caps); + println!(); + + // ── AMX OS-state probe (Risk #3 from integration plan) ──────── + // SimdCaps reports raw CPUID. SimdProfile::detect() additionally + // consults `simd_amx::amx_available()` which gates on + // OSXSAVE + XCR0[17,18] + arch_prctl(XCOMP_PERM). If CPUID says + // AMX-TILE but the OS/hypervisor doesn't enable the XSAVE state, + // dispatch demotes from SPR/GNR to Zen4Avx512 (AVX-512 BF16 path + // instead of AMX tiles). Surfacing the gap here lets a reviewer + // see when CPUID-vs-OS disagree without reading source. + #[cfg(target_arch = "x86_64")] + { + let cpuid_says_amx = caps.amx_tile && caps.amx_int8; + let os_allows_amx = ndarray::simd_amx::amx_available(); + println!("AMX gating (CPUID vs OS):"); + println!(" CPUID amx_tile+amx_int8: {}", cpuid_says_amx); + println!(" OS XSAVE/prctl gate: {}", os_allows_amx); + if cpuid_says_amx && !os_allows_amx { + println!(" → CPUID-reported AMX is OS-DEMOTED — dispatch falls back to AVX-512 path"); + } + println!(); + } + + // ── ARM-specific sub-profile (heuristic; deployment-pragmatic) ── + let arm = caps.arm_profile(); + if !matches!(arm, ArmProfile::NotArm) { + println!("ARM profile (heuristic): {}", arm.name()); + println!(" est. tok/sec: {}", arm.estimated_tok_per_sec()); + println!(" eff. f32 lanes:{}", arm.effective_f32_lanes()); + println!(); + } + + // ── Build configuration ───────────────────────────────────────── + println!("Build:"); + println!(" target_arch: {}", std::env::consts::ARCH); + println!(" target_os: {}", std::env::consts::OS); + #[cfg(target_feature = "avx512f")] + println!(" -Ctarget-feature avx512f: yes (compile-time)"); + #[cfg(not(target_feature = "avx512f"))] + println!(" -Ctarget-feature avx512f: no (compile-time)"); + #[cfg(target_feature = "avx2")] + println!(" -Ctarget-feature avx2: yes (compile-time)"); + #[cfg(not(target_feature = "avx2"))] + println!(" -Ctarget-feature avx2: no (compile-time)"); + println!(); + + // ── TEST promotion guidance ──────────────────────────────────── + println!("Matrix-doc cells affected by this CPU:"); + matrix_cell_summary(profile); + + // Sanity invariant: simd_profile() and pinned_profile() must agree + // when pinning is active. This is the same check that + // `pinning_consistency` runs as a unit test; we re-run it here so a + // probe binary deployed on real silicon flags any future regression + // in the cfg cascade. + if let Some(p) = pinned_profile() { + assert_eq!( + profile, p, + "INVARIANT VIOLATION: pinned_profile()={:?} disagrees with simd_profile()={:?}", + p, profile + ); + } +} + +fn print_caps(c: &SimdCaps) { + let bits: &[(&str, bool)] = &[ + ("avx2", c.avx2), + ("avx512f", c.avx512f), + ("avx512bw", c.avx512bw), + ("avx512vl", c.avx512vl), + ("avx512vnni", c.avx512vnni), + ("avx512vbmi", c.avx512vbmi), + ("avx512vpopcntdq", c.avx512vpopcntdq), + ("avx512bf16", c.avx512bf16), + ("avx512fp16", c.avx512fp16), + ("avx512vp2intersect", c.avx512vp2intersect), + ("avxvnniint8", c.avxvnniint8), + ("amx_tile", c.amx_tile), + ("amx_int8", c.amx_int8), + ("amx_bf16", c.amx_bf16), + ("amx_fp16", c.amx_fp16), + ("fma", c.fma), + ("sse41", c.sse41), + ("sse2", c.sse2), + ("neon", c.neon), + ("asimd_dotprod", c.asimd_dotprod), + ("fp16 (arm)", c.fp16), + ("aes", c.aes), + ("sha2", c.sha2), + ("crc32", c.crc32), + ]; + for (name, present) in bits { + println!(" [{}] {}", if *present { "x" } else { " " }, name); + } +} + +fn matrix_cell_summary(p: SimdProfile) { + // Lifted from `td-simd-cpu-dispatch-matrix.md` § "Master matrix" + // for each x86 profile. The summary is intentionally terse — the + // matrix doc is the source of truth and should be consulted before + // promoting any DOC cell to TEST. + let summary: &[&str] = match p { + SimdProfile::GraniteRapids => &[ + "F+CD+VL+DQ+BW+IFMA+VBMI+VBMI2+VNNI+BF16+FP16", + "VPOPCNTDQ+BITALG+GFNI+VAES+VPCLMUL", + "AMX-TILE+INT8+BF16+FP16 (FP16 is the GNR discriminator)", + ], + SimdProfile::SapphireRapids => &[ + "F+CD+VL+DQ+BW+IFMA+VBMI+VBMI2+VNNI+BF16+FP16", + "VPOPCNTDQ+BITALG+GFNI+VAES+VPCLMUL", + "AMX-TILE+INT8+BF16 (no AMX-FP16 — that's GNR)", + ], + SimdProfile::Zen4Avx512 => &[ + "F+CD+VL+DQ+BW+IFMA+VBMI+VBMI2+VNNI+BF16+FP16", + "No AMX of any kind; 256-bit FPU double-pumped on Zen4, native 512-bit on Zen5", + ], + SimdProfile::CooperLake => &[ + "F+CD+VL+DQ+BW+VNNI+BF16", + "No VBMI, no FP16, no AMX — unique 'BF16 without VBMI'", + ], + SimdProfile::TigerLakeU => &[ + "F+CD+VL+DQ+BW+IFMA+VBMI+VBMI2+VNNI+VP2INTERSECT", + "VP2INTERSECT is the sole discriminator vs IceLakeSp", + ], + SimdProfile::IceLakeSp => &[ + "F+CD+VL+DQ+BW+IFMA+VBMI+VBMI2+VNNI", + "No BF16, no FP16, no AMX, no VP2INTERSECT", + ], + SimdProfile::CascadeLake => &["F+CD+VL+DQ+BW+VNNI", "First Xeon with VNNI; no VBMI/BF16/FP16/AMX"], + SimdProfile::SkylakeX => &["F+CD+VL+DQ+BW", "Founding AVX-512 baseline; everything since adds on top"], + SimdProfile::ArrowLake => &[ + "No AVX-512 (hybrid CPU design)", + "AVX-VNNI-INT8 + AVX-IFMA + AVX-NE-CONVERT (256-bit / VEX forms)", + ], + SimdProfile::HaswellAvx2 => &["AVX2 + FMA + F16C + BMI1/2", "Haswell..Coffee Lake / Zen 1-3"], + SimdProfile::A76DotProd => &[ + "NEON + dotprod + fp16 + bf16+ + i8mm", + "Pi 5 (BCM2712), Orange Pi 5 (RK3588), Apple M1+", + ], + SimdProfile::A72Fast => &[ + "NEON 128-bit + crypto (AES/SHA-2/CRC32)", + "Pi 4 (BCM2711), Pi 3-with-crypto, Orange Pi 4 — HWCAP cannot distinguish A72 from A53-with-crypto", + ], + SimdProfile::A53Baseline => &[ + "NEON 128-bit baseline", + "Rare in the wild — QEMU / minimal aarch64 without crypto", + ], + SimdProfile::Scalar => &["No SIMD ISA recognised", "Fallback: scalar reference kernels"], + }; + for line in summary { + println!(" - {}", line); + } +} diff --git a/src/hpc/mod.rs b/src/hpc/mod.rs index 11081ad6..6c6f3d18 100644 --- a/src/hpc/mod.rs +++ b/src/hpc/mod.rs @@ -17,6 +17,9 @@ pub mod simd_caps; // LazyLock frozen SIMD dispatch — function pointers selected once at startup pub mod simd_dispatch; +// Silicon-grained profile (Sapphire Rapids vs Zen 4 vs Ice Lake-SP, etc.) — +// implements the dispatch matrix from `.claude/knowledge/td-simd-cpu-dispatch-matrix.md` +pub mod simd_profile; pub mod blas_level1; pub mod blas_level2; diff --git a/src/hpc/simd_caps.rs b/src/hpc/simd_caps.rs index a35823b5..7d52aec8 100644 --- a/src/hpc/simd_caps.rs +++ b/src/hpc/simd_caps.rs @@ -71,6 +71,21 @@ pub struct SimdCaps { /// (`is_x86_feature_detected!("avxvnniint8")`). /// Present on Arrow Lake, Lunar Lake, NUC 14 (Meteor Lake-H). pub avxvnniint8: bool, + /// AVX-512 FP16 arithmetic (CPUID.07H.0H:EDX bit 23). + /// Native `__m512h` operations (`_mm512_*_ph`). Distinguishes Sapphire + /// Rapids / Granite Rapids / Zen 4+ from earlier AVX-512 silicon. + /// Required by the matrix doc to discriminate SPR (yes) from CLX (no). + pub avx512fp16: bool, + /// AVX-512 VP2INTERSECT (CPUID.07H.0H:EDX bit 8). Present only on + /// Tiger Lake mobile silicon; absent from Ice Lake-SP and every later + /// server part. Sole discriminator between `TigerLakeU` and + /// `IceLakeSp` profiles, which otherwise share an identical feature set. + pub avx512vp2intersect: bool, + /// AMX-FP16 (CPUID.07H.1H:EAX bit 21). `TDPFP16PS` FP16 tile dot + /// product, present on Granite Rapids only. Sole discriminator between + /// `SapphireRapids` and `GraniteRapids` profiles. Lives at CPUID leaf + /// 7,1, not leaf 7,0 — separate cpuid_count call required. + pub amx_fp16: bool, // ── aarch64 (ARM) ── /// NEON 128-bit SIMD (mandatory on aarch64, always true). @@ -124,6 +139,9 @@ impl SimdCaps { amx_bf16: false, avx512bf16: false, avxvnniint8: false, + avx512fp16: false, + avx512vp2intersect: false, + amx_fp16: false, neon: false, asimd_dotprod: false, fp16: false, @@ -139,10 +157,24 @@ impl SimdCaps { // `__cpuid_count` is safe on x86_64 (Rust 1.87+): CPUID is always // available on x86_64 (guaranteed by the ABI) and has no side effects // beyond reading CPU registers. - let cpuid7 = core::arch::x86_64::__cpuid_count(7, 0); - let amx_tile = (cpuid7.edx >> 24) & 1 == 1; - let amx_int8 = (cpuid7.edx >> 25) & 1 == 1; - let amx_bf16 = (cpuid7.edx >> 22) & 1 == 1; + let cpuid7_0 = core::arch::x86_64::__cpuid_count(7, 0); + let amx_tile = (cpuid7_0.edx >> 24) & 1 == 1; + let amx_int8 = (cpuid7_0.edx >> 25) & 1 == 1; + let amx_bf16 = (cpuid7_0.edx >> 22) & 1 == 1; + let avx512fp16 = (cpuid7_0.edx >> 23) & 1 == 1; + let avx512vp2intersect = (cpuid7_0.edx >> 8) & 1 == 1; + + // Leaf 7,1 EAX bit 21 = AMX-FP16. Per the dispatch matrix this is + // the sole discriminator between Granite Rapids and Sapphire Rapids. + // Leaf 7,1 only exists when leaf 7,0 EAX (max subleaf) >= 1; on + // older silicon this returns zero, which is the correct answer. + let leaf7_max_sub = cpuid7_0.eax; + let amx_fp16 = if leaf7_max_sub >= 1 { + let cpuid7_1 = core::arch::x86_64::__cpuid_count(7, 1); + (cpuid7_1.eax >> 21) & 1 == 1 + } else { + false + }; Self { avx2: is_x86_feature_detected!("avx2"), @@ -160,6 +192,9 @@ impl SimdCaps { amx_bf16, avx512bf16: is_x86_feature_detected!("avx512bf16"), avxvnniint8: is_x86_feature_detected!("avxvnniint8"), + avx512fp16, + avx512vp2intersect, + amx_fp16, // ARM fields: all false on x86 neon: false, asimd_dotprod: false, @@ -192,6 +227,9 @@ impl SimdCaps { amx_bf16: false, avx512bf16: false, avxvnniint8: false, + avx512fp16: false, + avx512vp2intersect: false, + amx_fp16: false, // ARM fields: runtime detection neon: true, // mandatory on aarch64 asimd_dotprod: std::arch::is_aarch64_feature_detected!("dotprod"), @@ -221,6 +259,9 @@ impl SimdCaps { amx_bf16: false, avx512bf16: false, avxvnniint8: false, + avx512fp16: false, + avx512vp2intersect: false, + amx_fp16: false, neon: false, asimd_dotprod: false, fp16: false, @@ -275,6 +316,20 @@ impl SimdCaps { self.avxvnniint8 } + /// True if AVX-512 FP16 (`__m512h`) is available. Required to + /// discriminate `SapphireRapids` from `CascadeLake`-class profiles. + #[inline(always)] + pub fn has_avx512_fp16(self) -> bool { + self.avx512fp16 + } + + /// True if AMX-FP16 (`TDPFP16PS`) is available. Only Granite Rapids + /// silicon advertises this bit; sole discriminator between GNR and SPR. + #[inline(always)] + pub fn has_amx_fp16(self) -> bool { + self.amx_fp16 && self.amx_tile + } + // ── ARM convenience methods ── /// True if running on aarch64 with NEON (always true on aarch64). @@ -408,6 +463,42 @@ mod tests { let _ = caps.amx_bf16; let _ = caps.avx512bf16; let _ = caps.avxvnniint8; + // PR-#181 follow-up fields (matrix doc lines 240, 247-248). + let _ = caps.avx512fp16; + let _ = caps.avx512vp2intersect; + let _ = caps.amx_fp16; + } + + #[test] + fn fp16_fields_consistent_on_non_x86() { + // Non-x86 targets must never report x86 AMX/AVX-512 FP16 capabilities. + #[cfg(not(target_arch = "x86_64"))] + { + let caps = simd_caps(); + assert!(!caps.avx512fp16); + assert!(!caps.avx512vp2intersect); + assert!(!caps.amx_fp16); + assert!(!caps.has_avx512_fp16()); + assert!(!caps.has_amx_fp16()); + } + } + + #[test] + fn has_amx_fp16_requires_amx_tile() { + // Even if `amx_fp16` were spuriously true without `amx_tile`, + // the convenience method must require both bits. Matches + // simd_amx.rs::amx_available()'s defense-in-depth pattern. + let synthetic = SimdCaps { + avx2: false, avx512f: false, avx512bw: false, avx512vl: false, + avx512vpopcntdq: false, sse41: false, sse2: false, fma: false, + avx512vnni: false, avx512vbmi: false, + amx_tile: false, amx_int8: false, amx_bf16: false, + avx512bf16: false, avxvnniint8: false, + avx512fp16: false, avx512vp2intersect: false, amx_fp16: true, + neon: false, asimd_dotprod: false, fp16: false, + aes: false, sha2: false, crc32: false, + }; + assert!(!synthetic.has_amx_fp16(), "amx_fp16 without amx_tile must report false"); } #[test] diff --git a/src/hpc/simd_profile.rs b/src/hpc/simd_profile.rs new file mode 100644 index 00000000..4b51996b --- /dev/null +++ b/src/hpc/simd_profile.rs @@ -0,0 +1,635 @@ +//! `SimdProfile` — silicon-grained dispatch identity. +//! +//! Where `SimdCaps` reports individual feature bits (AVX-512F, AMX-TILE, +//! VBMI, etc.), `SimdProfile` collapses a *combination* of those bits into +//! a single enum variant naming the silicon generation. One profile = one +//! best set of primitives. Consumers branch on `simd_profile()` once at +//! startup; subsequent dispatch is a `match` over the variant or an index +//! into a `*Dispatch` static table. +//! +//! Authoritative feature mapping lives in +//! `.claude/knowledge/td-simd-cpu-dispatch-matrix.md`. The detection ladder +//! below implements the decision tree at lines 271-305 of that document. +//! Critical invariants from § "Detection invariants": +//! +//! 1. `GraniteRapids` must be checked **before** `SapphireRapids` — GNR +//! has every SPR bit plus AMX-FP16. Checking SPR first would route GNR +//! silicon as SPR and leave the AMX-FP16 tile path unused. +//! 2. `Zen4Avx512` vs `SapphireRapids` discriminate on `amx_tile` (SPR +//! yes, Zen 4 no) — both have F + VBMI + BF16 + FP16. +//! 3. `CooperLake` vs `IceLakeSp` are mutually exclusive bit patterns: +//! CPL has BF16 without VBMI; ICX has VBMI without BF16. +//! 4. `TigerLakeU` vs `IceLakeSp` share an ISA except for VP2INTERSECT +//! (TigerLake mobile only). Sole discriminator is `avx512vp2intersect`. + +use crate::hpc::simd_caps::{simd_caps, ArmProfile}; +use std::sync::LazyLock; + +/// Silicon-grained dispatch identity. One variant = one set of best +/// primitives. See `td-simd-cpu-dispatch-matrix.md` for the authoritative +/// feature table per variant. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +pub enum SimdProfile { + // ── x86_64 ── + /// Granite Rapids. Sapphire-Rapids feature set plus AMX-FP16. + GraniteRapids, + /// Sapphire Rapids / Emerald Rapids (ISA-identical). AVX-512 superset + /// with AMX-TILE/INT8/BF16 and AVX-512-FP16. No AMX-FP16. + SapphireRapids, + /// Zen 4 / Zen 5 (Genoa / Turin). AVX-512 superset matching SPR's + /// non-AMX feature set; no AMX of any kind. + Zen4Avx512, + /// Cooper Lake. AVX-512F + VNNI + BF16, no VBMI/FP16/AMX. The unique + /// "BF16 without VBMI" combination. + CooperLake, + /// Tiger Lake mobile. Ice-Lake-class AVX-512 + VP2INTERSECT, no + /// BF16/FP16/AMX. Distinguished from Ice Lake-SP by VP2INTERSECT only. + TigerLakeU, + /// Ice Lake-SP server. AVX-512F + VBMI + VNNI + IFMA, no + /// BF16/FP16/AMX/VP2INTERSECT. + IceLakeSp, + /// Cascade Lake. AVX-512F + VNNI, no VBMI/BF16/FP16/AMX. + CascadeLake, + /// Skylake-X / SP / W. AVX-512F baseline only (no VNNI). + SkylakeX, + /// Arrow Lake / Lunar Lake / Meteor Lake-H. AVX2 + AVX-VNNI-INT8 + + /// AVX-IFMA + AVX-NE-CONVERT, no AVX-512. + ArrowLake, + /// Haswell through Coffee Lake / Zen 1-3. AVX2 + FMA, no AVX-512. + HaswellAvx2, + + // ── aarch64 ── + /// Cortex-A76+ (Pi 5, RK3588 big cores, Apple M-series). NEON + + /// dotprod + fp16 + bf16/i8mm where present. + A76DotProd, + /// Cortex-A72 / A53-with-crypto. ARMv8.0 + NEON + crypto, no dotprod. + /// HWCAP cannot distinguish A72 (Pi 4) from A53-with-crypto (Pi 3) + /// silicon — both share the same dispatch tables at the ISA level. + A72Fast, + /// Cortex-A53 without crypto. Rare in the wild; QEMU / minimal aarch64. + A53Baseline, + + // ── Fallback ── + /// wasm32, riscv, x86 baseline, or anything else without a recognised + /// SIMD ISA. + Scalar, +} + +impl SimdProfile { + /// Resolve the current silicon to one of the enum variants. + /// + /// Reads `simd_caps()` once and walks the decision tree from + /// `td-simd-cpu-dispatch-matrix.md` § "SimdProfile::detect() mapping". + /// The order below is load-bearing: GNR before SPR, the BF16/VBMI + /// mutex for CPL/ICX, VP2INTERSECT for TigerLakeU vs IceLakeSp. + pub fn detect() -> Self { + #[cfg(target_arch = "x86_64")] + { + let caps = simd_caps(); + // Risk #3 from the integration plan ("Detection robustness + // across hypervisors"): CPUID may report AMX-TILE while the + // OS has not enabled the tile XSAVE state. In that case AMX + // instructions SIGILL despite the CPUID bit being set. + // `simd_amx::amx_available()` runs the full 4-step gate + // (CPUID + OSXSAVE + XCR0 bits 17/18 + arch_prctl + // XCOMP_PERM). Demote to the no-AMX dispatch branch when + // the OS check fails — typically resolves as Zen4Avx512 on + // SPR-class CPUID with locked-down hypervisor XSAVE state. + let amx_usable = caps.amx_tile && crate::simd_amx::amx_available(); + // GraniteRapids: AMX-FP16 (CPUID 7,1 EAX bit 21). Must be + // checked first because GNR is a strict superset of SPR. + if amx_usable && caps.amx_fp16 { + return SimdProfile::GraniteRapids; + } + // SapphireRapids / EmeraldRapids: AMX-TILE + AMX-BF16 + + // AVX-512-FP16. EmeraldRapids has identical ISA — same variant. + if amx_usable && caps.amx_bf16 && caps.avx512fp16 { + return SimdProfile::SapphireRapids; + } + // Zen4 / Zen5: AVX-512 + VBMI + BF16 + FP16, but no usable + // AMX. The `!amx_usable` guard also catches OS-demoted SPR + // silicon so it resolves here instead of as SapphireRapids. + if caps.avx512f + && caps.avx512vbmi + && caps.avx512bf16 + && caps.avx512fp16 + && !amx_usable + { + return SimdProfile::Zen4Avx512; + } + // CooperLake: AVX-512 + VNNI + BF16, but no VBMI. Mutually + // exclusive bit pattern with IceLakeSp (which has VBMI but + // no BF16). Order vs ICX is irrelevant. + if caps.avx512f && caps.avx512vnni && caps.avx512bf16 && !caps.avx512vbmi { + return SimdProfile::CooperLake; + } + // TigerLakeU vs IceLakeSp: same feature set EXCEPT + // VP2INTERSECT (TigerLake mobile only). Discriminate here. + if caps.avx512f && caps.avx512vbmi && caps.avx512vnni && !caps.avx512bf16 { + return if caps.avx512vp2intersect { + SimdProfile::TigerLakeU + } else { + SimdProfile::IceLakeSp + }; + } + // CascadeLake: AVX-512 + VNNI, no VBMI/BF16. + if caps.avx512f && caps.avx512vnni { + return SimdProfile::CascadeLake; + } + // SkylakeX: AVX-512F only, no VNNI. + if caps.avx512f { + return SimdProfile::SkylakeX; + } + // ArrowLake / Lunar Lake / Meteor Lake-H: no AVX-512 but + // AVX-VNNI-INT8 present. + if caps.avxvnniint8 { + return SimdProfile::ArrowLake; + } + // Haswell..Coffee Lake / Zen 1-3: AVX2 + FMA. + if caps.avx2 && caps.fma { + return SimdProfile::HaswellAvx2; + } + } + #[cfg(target_arch = "aarch64")] + { + // Reuse the in-tree heuristic from `simd_caps::arm_profile()`. + // It already encodes the A72-vs-A53-crypto deployment-pragmatic + // decision and is the canonical ARM dispatch helper. + return match simd_caps().arm_profile() { + ArmProfile::A76DotProd => SimdProfile::A76DotProd, + ArmProfile::A72Fast => SimdProfile::A72Fast, + ArmProfile::A53Baseline => SimdProfile::A53Baseline, + ArmProfile::NotArm => SimdProfile::Scalar, + }; + } + SimdProfile::Scalar + } + + /// Human-readable name. Stable across versions; safe to use in logs, + /// telemetry, and bench output. + pub const fn name(self) -> &'static str { + match self { + Self::GraniteRapids => "GraniteRapids", + Self::SapphireRapids => "SapphireRapids", + Self::Zen4Avx512 => "Zen4Avx512", + Self::CooperLake => "CooperLake", + Self::TigerLakeU => "TigerLakeU", + Self::IceLakeSp => "IceLakeSp", + Self::CascadeLake => "CascadeLake", + Self::SkylakeX => "SkylakeX", + Self::ArrowLake => "ArrowLake", + Self::HaswellAvx2 => "HaswellAvx2", + Self::A76DotProd => "A76DotProd", + Self::A72Fast => "A72Fast", + Self::A53Baseline => "A53Baseline", + Self::Scalar => "Scalar", + } + } + + /// True iff this profile sits on the x86_64 silicon family. Useful + /// for branching in dispatch tables that share a single AMX/AVX-512 + /// kernel across multiple variants. + pub const fn is_x86(self) -> bool { + matches!( + self, + Self::GraniteRapids + | Self::SapphireRapids + | Self::Zen4Avx512 + | Self::CooperLake + | Self::TigerLakeU + | Self::IceLakeSp + | Self::CascadeLake + | Self::SkylakeX + | Self::ArrowLake + | Self::HaswellAvx2 + ) + } + + /// True iff this profile sits on the aarch64 silicon family. + pub const fn is_aarch64(self) -> bool { + matches!( + self, + Self::A76DotProd | Self::A72Fast | Self::A53Baseline + ) + } + + /// True iff this profile has AVX-512 Foundation. False for ArrowLake, + /// HaswellAvx2, aarch64, and Scalar. Use to route AVX-512 dispatch + /// without re-querying `SimdCaps`. + pub const fn has_avx512(self) -> bool { + matches!( + self, + Self::GraniteRapids + | Self::SapphireRapids + | Self::Zen4Avx512 + | Self::CooperLake + | Self::TigerLakeU + | Self::IceLakeSp + | Self::CascadeLake + | Self::SkylakeX + ) + } + + /// True iff this profile has any AMX tile capability (AMX-TILE + + /// AMX-INT8 + AMX-BF16 at minimum). Only SPR-class and GNR. + pub const fn has_amx(self) -> bool { + matches!(self, Self::GraniteRapids | Self::SapphireRapids) + } +} + +// ──────────────────────────────────────────────────────────────────── +// Phase 3 T3.2 — compile-time pinning via `cpu-*` cargo features. +// +// When any `cpu-` feature is set, `PINNED_PROFILE` is `Some(_)` +// and `simd_profile()` folds to the const at compile time — no LazyLock +// initialisation, no branch, no atomics. Without any feature set, +// behaviour matches the original runtime LazyLock detection. +// +// Mutual exclusion: at most ONE cpu-* feature may be enabled. The +// `_PIN_COUNT` const assert below fires at compile time if more than +// one is active, mirroring integration-plan risk #1 ("if a user sets +// cpu-spr AND has runtime detection on Zen 4, the binary SIGILLs"). +// ──────────────────────────────────────────────────────────────────── + +const _PIN_COUNT: u32 = 0 + + cfg!(feature = "cpu-gnr") as u32 + + cfg!(feature = "cpu-spr") as u32 + + cfg!(feature = "cpu-zen4") as u32 + + cfg!(feature = "cpu-cpl") as u32 + + cfg!(feature = "cpu-tigerlake") as u32 + + cfg!(feature = "cpu-icx") as u32 + + cfg!(feature = "cpu-clx") as u32 + + cfg!(feature = "cpu-skx") as u32 + + cfg!(feature = "cpu-arrowlake") as u32 + + cfg!(feature = "cpu-haswell") as u32 + + cfg!(feature = "cpu-a76") as u32 + + cfg!(feature = "cpu-a72") as u32 + + cfg!(feature = "cpu-a53") as u32; + +const _: () = assert!( + _PIN_COUNT <= 1, + "cpu-* cargo features are mutually exclusive: enable at most one (cpu-gnr, cpu-spr, cpu-zen4, cpu-cpl, cpu-tigerlake, cpu-icx, cpu-clx, cpu-skx, cpu-arrowlake, cpu-haswell, cpu-a76, cpu-a72, cpu-a53)" +); + +// ──────────────────────────────────────────────────────────────────── +// target_arch guards (codex P2 closure). +// +// Each `cpu-` pin is only valid on its native silicon +// family. Without these guards `--features cpu-a76` on an x86_64 +// build would silently route `simd_profile()` to `A76DotProd`, +// breaking the `is_x86()` / `is_aarch64()` partitioning and routing +// callers into the wrong dispatch family. Fail fast at compile time +// instead. +// ──────────────────────────────────────────────────────────────────── + +#[cfg(all( + not(target_arch = "x86_64"), + any( + feature = "cpu-gnr", + feature = "cpu-spr", + feature = "cpu-zen4", + feature = "cpu-cpl", + feature = "cpu-tigerlake", + feature = "cpu-icx", + feature = "cpu-clx", + feature = "cpu-skx", + feature = "cpu-arrowlake", + feature = "cpu-haswell", + ) +))] +compile_error!( + "x86 cpu-* pinning features (cpu-gnr, cpu-spr, cpu-zen4, cpu-cpl, cpu-tigerlake, cpu-icx, cpu-clx, cpu-skx, cpu-arrowlake, cpu-haswell) require target_arch = \"x86_64\"" +); + +#[cfg(all( + not(target_arch = "aarch64"), + any(feature = "cpu-a76", feature = "cpu-a72", feature = "cpu-a53",) +))] +compile_error!( + "ARM cpu-* pinning features (cpu-a76, cpu-a72, cpu-a53) require target_arch = \"aarch64\"" +); + +/// The compile-time pinned profile, or `None` when runtime detection is in +/// effect. `Some(_)` exactly when one of the `cpu-*` cargo features is +/// enabled; mutually exclusive features are enforced by the `_PIN_COUNT` +/// const assert above. +/// +/// Consumers wanting branch-free dispatch on pinned builds can match on +/// this const directly — the optimiser folds `Some(SimdProfile::X)` into +/// the call site and the `None`-arm runtime path is eliminated. Returned +/// from a `const fn` so call sites in const contexts (e.g. `const` array +/// initialisers for dispatch tables) work as well. +pub const fn pinned_profile() -> Option { + #[cfg(feature = "cpu-gnr")] + { + return Some(SimdProfile::GraniteRapids); + } + #[cfg(feature = "cpu-spr")] + { + return Some(SimdProfile::SapphireRapids); + } + #[cfg(feature = "cpu-zen4")] + { + return Some(SimdProfile::Zen4Avx512); + } + #[cfg(feature = "cpu-cpl")] + { + return Some(SimdProfile::CooperLake); + } + #[cfg(feature = "cpu-tigerlake")] + { + return Some(SimdProfile::TigerLakeU); + } + #[cfg(feature = "cpu-icx")] + { + return Some(SimdProfile::IceLakeSp); + } + #[cfg(feature = "cpu-clx")] + { + return Some(SimdProfile::CascadeLake); + } + #[cfg(feature = "cpu-skx")] + { + return Some(SimdProfile::SkylakeX); + } + #[cfg(feature = "cpu-arrowlake")] + { + return Some(SimdProfile::ArrowLake); + } + #[cfg(feature = "cpu-haswell")] + { + return Some(SimdProfile::HaswellAvx2); + } + #[cfg(feature = "cpu-a76")] + { + return Some(SimdProfile::A76DotProd); + } + #[cfg(feature = "cpu-a72")] + { + return Some(SimdProfile::A72Fast); + } + #[cfg(feature = "cpu-a53")] + { + return Some(SimdProfile::A53Baseline); + } + #[allow(unreachable_code)] + None +} + +/// `true` when a `cpu-*` cargo feature has pinned the profile at compile +/// time, `false` when runtime detection is in use. Equivalent to +/// `pinned_profile().is_some()` but spelled out for grep-ability. +pub const fn is_pinned() -> bool { + pinned_profile().is_some() +} + +// The LazyLock only exists when no cpu-* feature is set. With pinning, +// linking the LazyLock would defeat the purpose — we want every code +// path that touches `simd_profile()` to fold to a const. +#[cfg(not(any( + feature = "cpu-gnr", + feature = "cpu-spr", + feature = "cpu-zen4", + feature = "cpu-cpl", + feature = "cpu-tigerlake", + feature = "cpu-icx", + feature = "cpu-clx", + feature = "cpu-skx", + feature = "cpu-arrowlake", + feature = "cpu-haswell", + feature = "cpu-a76", + feature = "cpu-a72", + feature = "cpu-a53", +)))] +static PROFILE: LazyLock = LazyLock::new(SimdProfile::detect); + +/// Get the resolved silicon profile. +/// +/// **Default (no `cpu-*` feature):** detected once at first access via +/// `LazyLock`; subsequent calls are a single pointer deref to a `Copy` +/// enum — no atomics, no branching. +/// +/// **Pinned (one `cpu-*` feature set):** returns the pinned const +/// directly. The compiler folds this call into the matching variant at +/// every call site; the LazyLock is not linked into the binary. +/// +/// Pair with `*Dispatch` static tables to make per-call dispatch a single +/// indirect call after monomorphisation (runtime path) or to fold to a +/// direct call (pinned path). +#[cfg(not(any( + feature = "cpu-gnr", + feature = "cpu-spr", + feature = "cpu-zen4", + feature = "cpu-cpl", + feature = "cpu-tigerlake", + feature = "cpu-icx", + feature = "cpu-clx", + feature = "cpu-skx", + feature = "cpu-arrowlake", + feature = "cpu-haswell", + feature = "cpu-a76", + feature = "cpu-a72", + feature = "cpu-a53", +)))] +#[inline(always)] +pub fn simd_profile() -> SimdProfile { + *PROFILE +} + +/// Get the resolved silicon profile (pinned variant). +/// +/// A `cpu-*` cargo feature is active: this returns the pinned constant +/// directly, foldable at every call site. The runtime LazyLock is not +/// linked into the binary. See the documentation on the runtime variant +/// for the call-site contract. +#[cfg(any( + feature = "cpu-gnr", + feature = "cpu-spr", + feature = "cpu-zen4", + feature = "cpu-cpl", + feature = "cpu-tigerlake", + feature = "cpu-icx", + feature = "cpu-clx", + feature = "cpu-skx", + feature = "cpu-arrowlake", + feature = "cpu-haswell", + feature = "cpu-a76", + feature = "cpu-a72", + feature = "cpu-a53", +))] +#[inline(always)] +pub const fn simd_profile() -> SimdProfile { + // SAFETY of the unwrap: the cfg gate above guarantees at least one + // cpu-* feature is set, and `pinned_profile()` returns Some(_) under + // any of those gates. Const-evaluable since the inner cfg cascade is + // resolved at compile time. + match pinned_profile() { + Some(p) => p, + None => SimdProfile::Scalar, // unreachable; const fn can't panic cleanly + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn detect_returns_a_valid_profile() { + let p = simd_profile(); + // Smoke: the LazyLock must not panic and must return *some* + // variant. `name()` exhausts the match, which would fail to + // compile if a new variant were added without a name. + let _ = p.name(); + } + + #[test] + fn determinism() { + // Two consecutive calls must agree — the LazyLock guarantees + // the closure runs exactly once. + assert_eq!(simd_profile(), simd_profile()); + } + + #[test] + fn arch_partitioning_is_consistent() { + let p = simd_profile(); + if p.is_x86() { + assert!(!p.is_aarch64(), "{:?} flagged both x86 and aarch64", p); + } + if p.is_aarch64() { + assert!(!p.is_x86(), "{:?} flagged both x86 and aarch64", p); + } + // On unsupported architectures the only legal answer is Scalar. + #[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))] + assert_eq!(p, SimdProfile::Scalar); + } + + #[test] + fn x86_target_lands_inside_x86_family() { + // Pinning overrides hardware detection — the test only describes + // the default (runtime-detection) path. + if is_pinned() { + return; + } + #[cfg(target_arch = "x86_64")] + { + let p = simd_profile(); + // Scalar is a valid x86_64 answer (no AVX2/FMA available), + // but any non-Scalar result must be inside the x86 family. + if p != SimdProfile::Scalar { + assert!( + p.is_x86(), + "x86_64 silicon resolved as non-x86 profile {:?}", + p + ); + } + } + } + + #[test] + fn aarch64_target_lands_inside_aarch64_family() { + if is_pinned() { + return; + } + #[cfg(target_arch = "aarch64")] + { + let p = simd_profile(); + assert!(p.is_aarch64(), "aarch64 silicon resolved as {:?}", p); + } + } + + #[test] + fn pinning_default_is_off() { + // The default build (no cpu-* feature) must NOT be pinned, so + // downstream consumers don't get surprised by compile-time + // dispatch they didn't opt into. + #[cfg(not(any( + feature = "cpu-gnr", + feature = "cpu-spr", + feature = "cpu-zen4", + feature = "cpu-cpl", + feature = "cpu-tigerlake", + feature = "cpu-icx", + feature = "cpu-clx", + feature = "cpu-skx", + feature = "cpu-arrowlake", + feature = "cpu-haswell", + feature = "cpu-a76", + feature = "cpu-a72", + feature = "cpu-a53", + )))] + { + assert!(!is_pinned()); + assert_eq!(pinned_profile(), None); + } + } + + #[test] + fn pinning_consistency() { + // When pinning is in effect, simd_profile() must equal the + // pinned const — hardware detection is bypassed entirely. + if let Some(pinned) = pinned_profile() { + assert!(is_pinned()); + assert_eq!(simd_profile(), pinned); + } else { + assert!(!is_pinned()); + } + } + + #[test] + fn has_avx512_is_subset_of_is_x86() { + for &p in &[ + SimdProfile::GraniteRapids, + SimdProfile::SapphireRapids, + SimdProfile::Zen4Avx512, + SimdProfile::CooperLake, + SimdProfile::TigerLakeU, + SimdProfile::IceLakeSp, + SimdProfile::CascadeLake, + SimdProfile::SkylakeX, + SimdProfile::ArrowLake, + SimdProfile::HaswellAvx2, + SimdProfile::A76DotProd, + SimdProfile::A72Fast, + SimdProfile::A53Baseline, + SimdProfile::Scalar, + ] { + if p.has_avx512() { + assert!(p.is_x86(), "{:?} reports AVX-512 but is not x86", p); + } + if p.has_amx() { + assert!(p.has_avx512(), "{:?} reports AMX but not AVX-512", p); + assert!(p.is_x86(), "{:?} reports AMX but is not x86", p); + } + } + } + + #[test] + fn names_are_stable_and_unique() { + let all = [ + SimdProfile::GraniteRapids, + SimdProfile::SapphireRapids, + SimdProfile::Zen4Avx512, + SimdProfile::CooperLake, + SimdProfile::TigerLakeU, + SimdProfile::IceLakeSp, + SimdProfile::CascadeLake, + SimdProfile::SkylakeX, + SimdProfile::ArrowLake, + SimdProfile::HaswellAvx2, + SimdProfile::A76DotProd, + SimdProfile::A72Fast, + SimdProfile::A53Baseline, + SimdProfile::Scalar, + ]; + let names: Vec<&'static str> = all.iter().map(|p| p.name()).collect(); + for i in 0..names.len() { + for j in i + 1..names.len() { + assert_ne!( + names[i], names[j], + "profile names must be unique: {:?} == {:?}", + all[i], all[j] + ); + } + } + } +} diff --git a/src/simd.rs b/src/simd.rs index ce449991..877c304a 100644 --- a/src/simd.rs +++ b/src/simd.rs @@ -133,6 +133,22 @@ fn tier() -> Tier { // The check is cheap (reads a cached cpuid result) and the batch // function uses as_chunks::<16>() + as_chunks::<8>() for SIMD widths. +// ──────────────────────────────────────────────────────────────────── +// Silicon-grained profile re-export. +// +// `Tier` above is the legacy coarse enum (Avx512/Avx2/Neon/Scalar) used +// by the F32x16 / F64x8 dispatch in this module. `SimdProfile` is the +// fine-grained successor that names individual silicon generations +// (SapphireRapids, Zen4Avx512, IceLakeSp, A76DotProd, …) so consumers +// can route to the best primitive on each CPU. Both ship side-by-side +// during the Phase 3 integration; callers migrate on their own cadence. +// +// Public surface lives in `crate::simd::*` per the cognitive-shader +// foundation contract (`cognitive-shader-foundation.md` § "Public Surface"). +// ──────────────────────────────────────────────────────────────────── +#[cfg(feature = "std")] +pub use crate::hpc::simd_profile::{simd_profile, SimdProfile}; + // ============================================================================ // Preferred SIMD lane widths — compile-time constants for array_windows // ============================================================================