From db3669e8638947c7f133cb9c81349022e6b98d2a Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Wed, 20 May 2026 20:20:36 +0000
Subject: [PATCH 1/5] feat(simd): SimdProfile enum + detect() implements
 dispatch matrix

Phase 3 T3.1 of the SIMD integration plan: introduce
crate::hpc::simd_profile::SimdProfile, the silicon-grained dispatch
identity that replaces the coarse three-Tier collapse called out in
audit findings TD-T12/T13/T14.

The decision tree in SimdProfile::detect() implements
.claude/knowledge/td-simd-cpu-dispatch-matrix.md lines 271-305
verbatim, preserving the four load-bearing invariants from the
"Detection invariants" section: GraniteRapids-before-SapphireRapids,
Zen4-vs-SPR via amx_tile, CooperLake-vs-IceLakeSp via the mutually
exclusive BF16/VBMI bit pattern, and TigerLakeU-vs-IceLakeSp via
VP2INTERSECT.

Risk #4 of the integration plan (no GNR detection without leaf 7,1
reader) closed in the same change: SimdCaps gains avx512fp16,
avx512vp2intersect, and amx_fp16 fields, with the x86 detect() arm
adding a __cpuid_count(7, 1) read gated on the leaf 7,0 EAX max
subleaf advertising support. has_amx_fp16() requires amx_tile in
addition to the FP16 bit, mirroring the defense-in-depth pattern in
simd_amx::amx_available().

Surface follows cognitive-shader-foundation.md: SimdProfile +
simd_profile() re-exported through crate::simd::* so consumers
import a single public path. The existing private Tier / tier()
machinery in src/simd.rs is untouched; this lands alongside, with
incremental migration deferred to T3.5/T3.6.

Tests: 7 new in simd_profile (detection determinism, arch
partitioning, AVX-512 subset invariant, x86_64-only Scalar
fallback, name uniqueness), 2 new in simd_caps (FP16 fields false
on non-x86, has_amx_fp16 requires amx_tile). 2075/2075 lib tests
pass, clippy -D warnings clean.
---
 src/hpc/mod.rs          |   3 +
 src/hpc/simd_caps.rs    |  99 ++++++++++-
 src/hpc/simd_profile.rs | 357 ++++++++++++++++++++++++++++++++++++++++
 src/simd.rs             |  16 ++
 4 files changed, 471 insertions(+), 4 deletions(-)
 create mode 100644 src/hpc/simd_profile.rs

diff --git a/src/hpc/mod.rs b/src/hpc/mod.rs
index 11081ad6..6c6f3d18 100644
--- a/src/hpc/mod.rs
+++ b/src/hpc/mod.rs
@@ -17,6 +17,9 @@
 pub mod simd_caps;
 // LazyLock frozen SIMD dispatch — function pointers selected once at startup
 pub mod simd_dispatch;
+// Silicon-grained profile (Sapphire Rapids vs Zen 4 vs Ice Lake-SP, etc.) —
+// implements the dispatch matrix from `.claude/knowledge/td-simd-cpu-dispatch-matrix.md`
+pub mod simd_profile;
 
 pub mod blas_level1;
 pub mod blas_level2;
diff --git a/src/hpc/simd_caps.rs b/src/hpc/simd_caps.rs
index a35823b5..7d52aec8 100644
--- a/src/hpc/simd_caps.rs
+++ b/src/hpc/simd_caps.rs
@@ -71,6 +71,21 @@ pub struct SimdCaps {
     /// (`is_x86_feature_detected!("avxvnniint8")`).
     /// Present on Arrow Lake, Lunar Lake, NUC 14 (Meteor Lake-H).
     pub avxvnniint8: bool,
+    /// AVX-512 FP16 arithmetic (CPUID.07H.0H:EDX bit 23).
+    /// Native `__m512h` operations (`_mm512_*_ph`). Distinguishes Sapphire
+    /// Rapids / Granite Rapids / Zen 4+ from earlier AVX-512 silicon.
+    /// Required by the matrix doc to discriminate SPR (yes) from CLX (no).
+    pub avx512fp16: bool,
+    /// AVX-512 VP2INTERSECT (CPUID.07H.0H:EDX bit 8). Present only on
+    /// Tiger Lake mobile silicon; absent from Ice Lake-SP and every later
+    /// server part. Sole discriminator between `TigerLakeU` and
+    /// `IceLakeSp` profiles, which otherwise share an identical feature set.
+    pub avx512vp2intersect: bool,
+    /// AMX-FP16 (CPUID.07H.1H:EAX bit 21). `TDPFP16PS` FP16 tile dot
+    /// product, present on Granite Rapids only. Sole discriminator between
+    /// `SapphireRapids` and `GraniteRapids` profiles. Lives at CPUID leaf
+    /// 7,1, not leaf 7,0 — separate cpuid_count call required.
+    pub amx_fp16: bool,
 
     // ── aarch64 (ARM) ──
     /// NEON 128-bit SIMD (mandatory on aarch64, always true).
@@ -124,6 +139,9 @@ impl SimdCaps {
             amx_bf16: false,
             avx512bf16: false,
             avxvnniint8: false,
+            avx512fp16: false,
+            avx512vp2intersect: false,
+            amx_fp16: false,
             neon: false,
             asimd_dotprod: false,
             fp16: false,
@@ -139,10 +157,24 @@ impl SimdCaps {
         // `__cpuid_count` is safe on x86_64 (Rust 1.87+): CPUID is always
         // available on x86_64 (guaranteed by the ABI) and has no side effects
         // beyond reading CPU registers.
-        let cpuid7 = core::arch::x86_64::__cpuid_count(7, 0);
-        let amx_tile = (cpuid7.edx >> 24) & 1 == 1;
-        let amx_int8 = (cpuid7.edx >> 25) & 1 == 1;
-        let amx_bf16 = (cpuid7.edx >> 22) & 1 == 1;
+        let cpuid7_0 = core::arch::x86_64::__cpuid_count(7, 0);
+        let amx_tile = (cpuid7_0.edx >> 24) & 1 == 1;
+        let amx_int8 = (cpuid7_0.edx >> 25) & 1 == 1;
+        let amx_bf16 = (cpuid7_0.edx >> 22) & 1 == 1;
+        let avx512fp16 = (cpuid7_0.edx >> 23) & 1 == 1;
+        let avx512vp2intersect = (cpuid7_0.edx >> 8) & 1 == 1;
+
+        // Leaf 7,1 EAX bit 21 = AMX-FP16. Per the dispatch matrix this is
+        // the sole discriminator between Granite Rapids and Sapphire Rapids.
+        // Leaf 7,1 only exists when leaf 7,0 EAX (max subleaf) >= 1; on
+        // older silicon this returns zero, which is the correct answer.
+        let leaf7_max_sub = cpuid7_0.eax;
+        let amx_fp16 = if leaf7_max_sub >= 1 {
+            let cpuid7_1 = core::arch::x86_64::__cpuid_count(7, 1);
+            (cpuid7_1.eax >> 21) & 1 == 1
+        } else {
+            false
+        };
 
         Self {
             avx2: is_x86_feature_detected!("avx2"),
@@ -160,6 +192,9 @@ impl SimdCaps {
             amx_bf16,
             avx512bf16: is_x86_feature_detected!("avx512bf16"),
             avxvnniint8: is_x86_feature_detected!("avxvnniint8"),
+            avx512fp16,
+            avx512vp2intersect,
+            amx_fp16,
             // ARM fields: all false on x86
             neon: false,
             asimd_dotprod: false,
@@ -192,6 +227,9 @@ impl SimdCaps {
             amx_bf16: false,
             avx512bf16: false,
             avxvnniint8: false,
+            avx512fp16: false,
+            avx512vp2intersect: false,
+            amx_fp16: false,
             // ARM fields: runtime detection
             neon: true, // mandatory on aarch64
             asimd_dotprod: std::arch::is_aarch64_feature_detected!("dotprod"),
@@ -221,6 +259,9 @@ impl SimdCaps {
             amx_bf16: false,
             avx512bf16: false,
             avxvnniint8: false,
+            avx512fp16: false,
+            avx512vp2intersect: false,
+            amx_fp16: false,
             neon: false,
             asimd_dotprod: false,
             fp16: false,
@@ -275,6 +316,20 @@ impl SimdCaps {
         self.avxvnniint8
     }
 
+    /// True if AVX-512 FP16 (`__m512h`) is available. Required to
+    /// discriminate `SapphireRapids` from `CascadeLake`-class profiles.
+    #[inline(always)]
+    pub fn has_avx512_fp16(self) -> bool {
+        self.avx512fp16
+    }
+
+    /// True if AMX-FP16 (`TDPFP16PS`) is available. Only Granite Rapids
+    /// silicon advertises this bit; sole discriminator between GNR and SPR.
+    #[inline(always)]
+    pub fn has_amx_fp16(self) -> bool {
+        self.amx_fp16 && self.amx_tile
+    }
+
     // ── ARM convenience methods ──
 
     /// True if running on aarch64 with NEON (always true on aarch64).
@@ -408,6 +463,42 @@ mod tests {
         let _ = caps.amx_bf16;
         let _ = caps.avx512bf16;
         let _ = caps.avxvnniint8;
+        // PR-#181 follow-up fields (matrix doc lines 240, 247-248).
+        let _ = caps.avx512fp16;
+        let _ = caps.avx512vp2intersect;
+        let _ = caps.amx_fp16;
+    }
+
+    #[test]
+    fn fp16_fields_consistent_on_non_x86() {
+        // Non-x86 targets must never report x86 AMX/AVX-512 FP16 capabilities.
+        #[cfg(not(target_arch = "x86_64"))]
+        {
+            let caps = simd_caps();
+            assert!(!caps.avx512fp16);
+            assert!(!caps.avx512vp2intersect);
+            assert!(!caps.amx_fp16);
+            assert!(!caps.has_avx512_fp16());
+            assert!(!caps.has_amx_fp16());
+        }
+    }
+
+    #[test]
+    fn has_amx_fp16_requires_amx_tile() {
+        // Even if `amx_fp16` were spuriously true without `amx_tile`,
+        // the convenience method must require both bits. Matches
+        // simd_amx.rs::amx_available()'s defense-in-depth pattern.
+        let synthetic = SimdCaps {
+            avx2: false, avx512f: false, avx512bw: false, avx512vl: false,
+            avx512vpopcntdq: false, sse41: false, sse2: false, fma: false,
+            avx512vnni: false, avx512vbmi: false,
+            amx_tile: false, amx_int8: false, amx_bf16: false,
+            avx512bf16: false, avxvnniint8: false,
+            avx512fp16: false, avx512vp2intersect: false, amx_fp16: true,
+            neon: false, asimd_dotprod: false, fp16: false,
+            aes: false, sha2: false, crc32: false,
+        };
+        assert!(!synthetic.has_amx_fp16(), "amx_fp16 without amx_tile must report false");
     }
 
     #[test]
diff --git a/src/hpc/simd_profile.rs b/src/hpc/simd_profile.rs
new file mode 100644
index 00000000..a49df92b
--- /dev/null
+++ b/src/hpc/simd_profile.rs
@@ -0,0 +1,357 @@
+//! `SimdProfile` — silicon-grained dispatch identity.
+//!
+//! Where `SimdCaps` reports individual feature bits (AVX-512F, AMX-TILE,
+//! VBMI, etc.), `SimdProfile` collapses a *combination* of those bits into
+//! a single enum variant naming the silicon generation. One profile = one
+//! best set of primitives. Consumers branch on `simd_profile()` once at
+//! startup; subsequent dispatch is a `match` over the variant or an index
+//! into a `*Dispatch` static table.
+//!
+//! Authoritative feature mapping lives in
+//! `.claude/knowledge/td-simd-cpu-dispatch-matrix.md`. The detection ladder
+//! below implements the decision tree at lines 271-305 of that document.
+//! Critical invariants from § "Detection invariants":
+//!
+//! 1. `GraniteRapids` must be checked **before** `SapphireRapids` — GNR
+//!    has every SPR bit plus AMX-FP16. Checking SPR first would route GNR
+//!    silicon as SPR and leave the AMX-FP16 tile path unused.
+//! 2. `Zen4Avx512` vs `SapphireRapids` discriminate on `amx_tile` (SPR
+//!    yes, Zen 4 no) — both have F + VBMI + BF16 + FP16.
+//! 3. `CooperLake` vs `IceLakeSp` are mutually exclusive bit patterns:
+//!    CPL has BF16 without VBMI; ICX has VBMI without BF16.
+//! 4. `TigerLakeU` vs `IceLakeSp` share an ISA except for VP2INTERSECT
+//!    (TigerLake mobile only). Sole discriminator is `avx512vp2intersect`.
+
+use crate::hpc::simd_caps::{simd_caps, ArmProfile};
+use std::sync::LazyLock;
+
+/// Silicon-grained dispatch identity. One variant = one set of best
+/// primitives. See `td-simd-cpu-dispatch-matrix.md` for the authoritative
+/// feature table per variant.
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
+pub enum SimdProfile {
+    // ── x86_64 ──
+    /// Granite Rapids. Sapphire-Rapids feature set plus AMX-FP16.
+    GraniteRapids,
+    /// Sapphire Rapids / Emerald Rapids (ISA-identical). AVX-512 superset
+    /// with AMX-TILE/INT8/BF16 and AVX-512-FP16. No AMX-FP16.
+    SapphireRapids,
+    /// Zen 4 / Zen 5 (Genoa / Turin). AVX-512 superset matching SPR's
+    /// non-AMX feature set; no AMX of any kind.
+    Zen4Avx512,
+    /// Cooper Lake. AVX-512F + VNNI + BF16, no VBMI/FP16/AMX. The unique
+    /// "BF16 without VBMI" combination.
+    CooperLake,
+    /// Tiger Lake mobile. Ice-Lake-class AVX-512 + VP2INTERSECT, no
+    /// BF16/FP16/AMX. Distinguished from Ice Lake-SP by VP2INTERSECT only.
+    TigerLakeU,
+    /// Ice Lake-SP server. AVX-512F + VBMI + VNNI + IFMA, no
+    /// BF16/FP16/AMX/VP2INTERSECT.
+    IceLakeSp,
+    /// Cascade Lake. AVX-512F + VNNI, no VBMI/BF16/FP16/AMX.
+    CascadeLake,
+    /// Skylake-X / SP / W. AVX-512F baseline only (no VNNI).
+    SkylakeX,
+    /// Arrow Lake / Lunar Lake / Meteor Lake-H. AVX2 + AVX-VNNI-INT8 +
+    /// AVX-IFMA + AVX-NE-CONVERT, no AVX-512.
+    ArrowLake,
+    /// Haswell through Coffee Lake / Zen 1-3. AVX2 + FMA, no AVX-512.
+    HaswellAvx2,
+
+    // ── aarch64 ──
+    /// Cortex-A76+ (Pi 5, RK3588 big cores, Apple M-series). NEON +
+    /// dotprod + fp16 + bf16/i8mm where present.
+    A76DotProd,
+    /// Cortex-A72 / A53-with-crypto. ARMv8.0 + NEON + crypto, no dotprod.
+    /// HWCAP cannot distinguish A72 (Pi 4) from A53-with-crypto (Pi 3)
+    /// silicon — both share the same dispatch tables at the ISA level.
+    A72Fast,
+    /// Cortex-A53 without crypto. Rare in the wild; QEMU / minimal aarch64.
+    A53Baseline,
+
+    // ── Fallback ──
+    /// wasm32, riscv, x86 baseline, or anything else without a recognised
+    /// SIMD ISA.
+    Scalar,
+}
+
+impl SimdProfile {
+    /// Resolve the current silicon to one of the enum variants.
+    ///
+    /// Reads `simd_caps()` once and walks the decision tree from
+    /// `td-simd-cpu-dispatch-matrix.md` § "SimdProfile::detect() mapping".
+    /// The order below is load-bearing: GNR before SPR, the BF16/VBMI
+    /// mutex for CPL/ICX, VP2INTERSECT for TigerLakeU vs IceLakeSp.
+    pub fn detect() -> Self {
+        #[cfg(target_arch = "x86_64")]
+        {
+            let caps = simd_caps();
+            // GraniteRapids: AMX-FP16 (CPUID 7,1 EAX bit 21). Must be
+            // checked first because GNR is a strict superset of SPR.
+            if caps.has_amx_fp16() {
+                return SimdProfile::GraniteRapids;
+            }
+            // SapphireRapids / EmeraldRapids: AMX-TILE + AMX-BF16 +
+            // AVX-512-FP16. EmeraldRapids has identical ISA — same variant.
+            if caps.amx_tile && caps.amx_bf16 && caps.avx512fp16 {
+                return SimdProfile::SapphireRapids;
+            }
+            // Zen4 / Zen5: AVX-512 + VBMI + BF16 + FP16, but no AMX.
+            if caps.avx512f
+                && caps.avx512vbmi
+                && caps.avx512bf16
+                && caps.avx512fp16
+                && !caps.amx_tile
+            {
+                return SimdProfile::Zen4Avx512;
+            }
+            // CooperLake: AVX-512 + VNNI + BF16, but no VBMI. Mutually
+            // exclusive bit pattern with IceLakeSp (which has VBMI but
+            // no BF16). Order vs ICX is irrelevant.
+            if caps.avx512f && caps.avx512vnni && caps.avx512bf16 && !caps.avx512vbmi {
+                return SimdProfile::CooperLake;
+            }
+            // TigerLakeU vs IceLakeSp: same feature set EXCEPT
+            // VP2INTERSECT (TigerLake mobile only). Discriminate here.
+            if caps.avx512f && caps.avx512vbmi && caps.avx512vnni && !caps.avx512bf16 {
+                return if caps.avx512vp2intersect {
+                    SimdProfile::TigerLakeU
+                } else {
+                    SimdProfile::IceLakeSp
+                };
+            }
+            // CascadeLake: AVX-512 + VNNI, no VBMI/BF16.
+            if caps.avx512f && caps.avx512vnni {
+                return SimdProfile::CascadeLake;
+            }
+            // SkylakeX: AVX-512F only, no VNNI.
+            if caps.avx512f {
+                return SimdProfile::SkylakeX;
+            }
+            // ArrowLake / Lunar Lake / Meteor Lake-H: no AVX-512 but
+            // AVX-VNNI-INT8 present.
+            if caps.avxvnniint8 {
+                return SimdProfile::ArrowLake;
+            }
+            // Haswell..Coffee Lake / Zen 1-3: AVX2 + FMA.
+            if caps.avx2 && caps.fma {
+                return SimdProfile::HaswellAvx2;
+            }
+        }
+        #[cfg(target_arch = "aarch64")]
+        {
+            // Reuse the in-tree heuristic from `simd_caps::arm_profile()`.
+            // It already encodes the A72-vs-A53-crypto deployment-pragmatic
+            // decision and is the canonical ARM dispatch helper.
+            return match simd_caps().arm_profile() {
+                ArmProfile::A76DotProd => SimdProfile::A76DotProd,
+                ArmProfile::A72Fast => SimdProfile::A72Fast,
+                ArmProfile::A53Baseline => SimdProfile::A53Baseline,
+                ArmProfile::NotArm => SimdProfile::Scalar,
+            };
+        }
+        SimdProfile::Scalar
+    }
+
+    /// Human-readable name. Stable across versions; safe to use in logs,
+    /// telemetry, and bench output.
+    pub const fn name(self) -> &'static str {
+        match self {
+            Self::GraniteRapids => "GraniteRapids",
+            Self::SapphireRapids => "SapphireRapids",
+            Self::Zen4Avx512 => "Zen4Avx512",
+            Self::CooperLake => "CooperLake",
+            Self::TigerLakeU => "TigerLakeU",
+            Self::IceLakeSp => "IceLakeSp",
+            Self::CascadeLake => "CascadeLake",
+            Self::SkylakeX => "SkylakeX",
+            Self::ArrowLake => "ArrowLake",
+            Self::HaswellAvx2 => "HaswellAvx2",
+            Self::A76DotProd => "A76DotProd",
+            Self::A72Fast => "A72Fast",
+            Self::A53Baseline => "A53Baseline",
+            Self::Scalar => "Scalar",
+        }
+    }
+
+    /// True iff this profile sits on the x86_64 silicon family. Useful
+    /// for branching in dispatch tables that share a single AMX/AVX-512
+    /// kernel across multiple variants.
+    pub const fn is_x86(self) -> bool {
+        matches!(
+            self,
+            Self::GraniteRapids
+                | Self::SapphireRapids
+                | Self::Zen4Avx512
+                | Self::CooperLake
+                | Self::TigerLakeU
+                | Self::IceLakeSp
+                | Self::CascadeLake
+                | Self::SkylakeX
+                | Self::ArrowLake
+                | Self::HaswellAvx2
+        )
+    }
+
+    /// True iff this profile sits on the aarch64 silicon family.
+    pub const fn is_aarch64(self) -> bool {
+        matches!(
+            self,
+            Self::A76DotProd | Self::A72Fast | Self::A53Baseline
+        )
+    }
+
+    /// True iff this profile has AVX-512 Foundation. False for ArrowLake,
+    /// HaswellAvx2, aarch64, and Scalar. Use to route AVX-512 dispatch
+    /// without re-querying `SimdCaps`.
+    pub const fn has_avx512(self) -> bool {
+        matches!(
+            self,
+            Self::GraniteRapids
+                | Self::SapphireRapids
+                | Self::Zen4Avx512
+                | Self::CooperLake
+                | Self::TigerLakeU
+                | Self::IceLakeSp
+                | Self::CascadeLake
+                | Self::SkylakeX
+        )
+    }
+
+    /// True iff this profile has any AMX tile capability (AMX-TILE +
+    /// AMX-INT8 + AMX-BF16 at minimum). Only SPR-class and GNR.
+    pub const fn has_amx(self) -> bool {
+        matches!(self, Self::GraniteRapids | Self::SapphireRapids)
+    }
+}
+
+static PROFILE: LazyLock<SimdProfile> = LazyLock::new(SimdProfile::detect);
+
+/// Get the resolved silicon profile, detected once at first access.
+///
+/// All subsequent calls are a single pointer deref to a `Copy` enum —
+/// no atomics, no branching. Pair with `*Dispatch` static tables to make
+/// per-call dispatch a single indirect call after monomorphisation.
+#[inline(always)]
+pub fn simd_profile() -> SimdProfile {
+    *PROFILE
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn detect_returns_a_valid_profile() {
+        let p = simd_profile();
+        // Smoke: the LazyLock must not panic and must return *some*
+        // variant. `name()` exhausts the match, which would fail to
+        // compile if a new variant were added without a name.
+        let _ = p.name();
+    }
+
+    #[test]
+    fn determinism() {
+        // Two consecutive calls must agree — the LazyLock guarantees
+        // the closure runs exactly once.
+        assert_eq!(simd_profile(), simd_profile());
+    }
+
+    #[test]
+    fn arch_partitioning_is_consistent() {
+        let p = simd_profile();
+        if p.is_x86() {
+            assert!(!p.is_aarch64(), "{:?} flagged both x86 and aarch64", p);
+        }
+        if p.is_aarch64() {
+            assert!(!p.is_x86(), "{:?} flagged both x86 and aarch64", p);
+        }
+        // On unsupported architectures the only legal answer is Scalar.
+        #[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
+        assert_eq!(p, SimdProfile::Scalar);
+    }
+
+    #[test]
+    fn x86_target_lands_inside_x86_family() {
+        #[cfg(target_arch = "x86_64")]
+        {
+            let p = simd_profile();
+            // Scalar is a valid x86_64 answer (no AVX2/FMA available),
+            // but any non-Scalar result must be inside the x86 family.
+            if p != SimdProfile::Scalar {
+                assert!(
+                    p.is_x86(),
+                    "x86_64 silicon resolved as non-x86 profile {:?}",
+                    p
+                );
+            }
+        }
+    }
+
+    #[test]
+    fn aarch64_target_lands_inside_aarch64_family() {
+        #[cfg(target_arch = "aarch64")]
+        {
+            let p = simd_profile();
+            assert!(p.is_aarch64(), "aarch64 silicon resolved as {:?}", p);
+        }
+    }
+
+    #[test]
+    fn has_avx512_is_subset_of_is_x86() {
+        for &p in &[
+            SimdProfile::GraniteRapids,
+            SimdProfile::SapphireRapids,
+            SimdProfile::Zen4Avx512,
+            SimdProfile::CooperLake,
+            SimdProfile::TigerLakeU,
+            SimdProfile::IceLakeSp,
+            SimdProfile::CascadeLake,
+            SimdProfile::SkylakeX,
+            SimdProfile::ArrowLake,
+            SimdProfile::HaswellAvx2,
+            SimdProfile::A76DotProd,
+            SimdProfile::A72Fast,
+            SimdProfile::A53Baseline,
+            SimdProfile::Scalar,
+        ] {
+            if p.has_avx512() {
+                assert!(p.is_x86(), "{:?} reports AVX-512 but is not x86", p);
+            }
+            if p.has_amx() {
+                assert!(p.has_avx512(), "{:?} reports AMX but not AVX-512", p);
+                assert!(p.is_x86(), "{:?} reports AMX but is not x86", p);
+            }
+        }
+    }
+
+    #[test]
+    fn names_are_stable_and_unique() {
+        let all = [
+            SimdProfile::GraniteRapids,
+            SimdProfile::SapphireRapids,
+            SimdProfile::Zen4Avx512,
+            SimdProfile::CooperLake,
+            SimdProfile::TigerLakeU,
+            SimdProfile::IceLakeSp,
+            SimdProfile::CascadeLake,
+            SimdProfile::SkylakeX,
+            SimdProfile::ArrowLake,
+            SimdProfile::HaswellAvx2,
+            SimdProfile::A76DotProd,
+            SimdProfile::A72Fast,
+            SimdProfile::A53Baseline,
+            SimdProfile::Scalar,
+        ];
+        let names: Vec<&'static str> = all.iter().map(|p| p.name()).collect();
+        for i in 0..names.len() {
+            for j in i + 1..names.len() {
+                assert_ne!(
+                    names[i], names[j],
+                    "profile names must be unique: {:?} == {:?}",
+                    all[i], all[j]
+                );
+            }
+        }
+    }
+}
diff --git a/src/simd.rs b/src/simd.rs
index ce449991..877c304a 100644
--- a/src/simd.rs
+++ b/src/simd.rs
@@ -133,6 +133,22 @@ fn tier() -> Tier {
 // The check is cheap (reads a cached cpuid result) and the batch
 // function uses as_chunks::<16>() + as_chunks::<8>() for SIMD widths.
 
+// ────────────────────────────────────────────────────────────────────
+// Silicon-grained profile re-export.
+//
+// `Tier` above is the legacy coarse enum (Avx512/Avx2/Neon/Scalar) used
+// by the F32x16 / F64x8 dispatch in this module. `SimdProfile` is the
+// fine-grained successor that names individual silicon generations
+// (SapphireRapids, Zen4Avx512, IceLakeSp, A76DotProd, …) so consumers
+// can route to the best primitive on each CPU. Both ship side-by-side
+// during the Phase 3 integration; callers migrate on their own cadence.
+//
+// Public surface lives in `crate::simd::*` per the cognitive-shader
+// foundation contract (`cognitive-shader-foundation.md` § "Public Surface").
+// ────────────────────────────────────────────────────────────────────
+#[cfg(feature = "std")]
+pub use crate::hpc::simd_profile::{simd_profile, SimdProfile};
+
 // ============================================================================
 // Preferred SIMD lane widths — compile-time constants for array_windows
 // ============================================================================

From b9a6fa0e85960a0f7a7cfec7e7ed19fd1a2b8ed1 Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Wed, 20 May 2026 20:34:55 +0000
Subject: [PATCH 2/5] feat(simd): cpu-* cargo features for compile-time
 SimdProfile pinning
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Phase 3 T3.2: add 13 mutually-exclusive cargo features that pin
simd_profile() to a const at compile time, bypassing the runtime
LazyLock detection from T3.1. One feature per non-Scalar variant of
the SimdProfile enum.

Features (mapping to LLVM target-cpu codenames):
  cpu-gnr         → GraniteRapids    (graniterapids)
  cpu-spr         → SapphireRapids   (sapphirerapids)
  cpu-zen4        → Zen4Avx512       (znver4)
  cpu-cpl         → CooperLake       (cooperlake)
  cpu-tigerlake   → TigerLakeU       (tigerlake)
  cpu-icx         → IceLakeSp        (icelake-server)
  cpu-clx         → CascadeLake      (cascadelake)
  cpu-skx         → SkylakeX         (skylake-avx512)
  cpu-arrowlake   → ArrowLake        (arrowlake)
  cpu-haswell     → HaswellAvx2      (haswell)
  cpu-a76         → A76DotProd       (cortex-a76)
  cpu-a72         → A72Fast          (cortex-a72)
  cpu-a53         → A53Baseline      (cortex-a53)

Mutual exclusion (per integration-plan risk #1: "if a user sets
cpu-spr AND has runtime detection on Zen 4, the binary SIGILLs on
AMX instructions") is enforced via a const assert: each cpu-*
contributes 1 to _PIN_COUNT and the assert fires at compile time if
the sum exceeds one. Verified: enabling cpu-spr+cpu-zen4
simultaneously produces a build error citing the mutex.

Implementation: a const pinned_profile() -> Option<SimdProfile>
walks a cfg cascade and returns the active variant or None. The
simd_profile() function exists in two cfg-gated forms — a runtime
LazyLock variant compiled when no cpu-* feature is set, and a
const-foldable variant compiled when any is set. The LazyLock is
not linked into pinned binaries.

is_pinned() const helper exposes whether compile-time dispatch is
active, useful both for consumer-facing diagnostics and for gating
arch-detection tests that no longer apply when pinning overrides
hardware. Existing x86_target_lands_inside_x86_family /
aarch64_target_lands_inside_aarch64_family tests early-return when
pinned; two new tests (pinning_default_is_off, pinning_consistency)
verify the const + runtime paths agree.

Tests: 2077/2077 lib tests pass under both default and --features
cpu-spr configurations. cargo clippy --lib -- -D warnings clean in
both modes. Mutex compile_error verified by attempting
--features "cpu-spr,cpu-zen4" — fails as expected with the const
assert citation. The default (no feature) path is byte-identical to
the T3.1 runtime detection — no regression risk to the merged #181
work or to the e40f3a31 SimdProfile commit.
---
 Cargo.toml              |  30 +++++
 src/hpc/simd_profile.rs | 236 +++++++++++++++++++++++++++++++++++++++-
 2 files changed, 262 insertions(+), 4 deletions(-)

diff --git a/Cargo.toml b/Cargo.toml
index ee7cc8f7..a431f14b 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -273,6 +273,36 @@ splat3d = ["std"]
 # quad-tree partition; the entropy coder + RDO loop land in later workers.
 codec = ["std"]
 
+# ── Phase 3 T3.2: compile-time SimdProfile pinning ───────────────────
+#
+# Each cpu-<codename> feature, when enabled, makes
+# `crate::simd::simd_profile()` fold to a const at compile time and
+# bypass the runtime LazyLock detection. Pair with the matching
+# `-Ctarget-cpu=<llvm-name>` in `.cargo/config.toml` (or `RUSTFLAGS`)
+# for full effect — the cargo feature picks the *dispatch* variant,
+# while `-Ctarget-cpu` picks the *codegen* variant. Both together
+# produce a binary that is specialised to one silicon family.
+#
+# Features are MUTUALLY EXCLUSIVE — enable at most one. A compile-time
+# assert in `src/hpc/simd_profile.rs` enforces this. Multiple
+# pinning features active = build error.
+#
+# Codename → SimdProfile variant mapping (see
+# `.claude/knowledge/td-simd-cpu-dispatch-matrix.md`):
+cpu-gnr = []          # GraniteRapids        — target-cpu=graniterapids
+cpu-spr = []          # SapphireRapids       — target-cpu=sapphirerapids
+cpu-zen4 = []         # Zen4Avx512           — target-cpu=znver4 (or znver5)
+cpu-cpl = []          # CooperLake           — target-cpu=cooperlake
+cpu-tigerlake = []    # TigerLakeU           — target-cpu=tigerlake
+cpu-icx = []          # IceLakeSp            — target-cpu=icelake-server
+cpu-clx = []          # CascadeLake          — target-cpu=cascadelake
+cpu-skx = []          # SkylakeX             — target-cpu=skylake-avx512
+cpu-arrowlake = []    # ArrowLake            — target-cpu=arrowlake
+cpu-haswell = []      # HaswellAvx2          — target-cpu=haswell (or znver3)
+cpu-a76 = []          # A76DotProd           — target-cpu=cortex-a76
+cpu-a72 = []          # A72Fast              — target-cpu=cortex-a72
+cpu-a53 = []          # A53Baseline          — target-cpu=cortex-a53
+
 # no_std polyfill for `static LazyLock` in `src/simd.rs` (sprint A12).
 # Pulls in `portable-atomic` with the `critical-section` impl plus the
 # `critical-section` runtime so we can build a once-cell-style cache for
diff --git a/src/hpc/simd_profile.rs b/src/hpc/simd_profile.rs
index a49df92b..7ed65b37 100644
--- a/src/hpc/simd_profile.rs
+++ b/src/hpc/simd_profile.rs
@@ -225,18 +225,200 @@ impl SimdProfile {
     }
 }
 
+// ────────────────────────────────────────────────────────────────────
+// Phase 3 T3.2 — compile-time pinning via `cpu-*` cargo features.
+//
+// When any `cpu-<codename>` feature is set, `PINNED_PROFILE` is `Some(_)`
+// and `simd_profile()` folds to the const at compile time — no LazyLock
+// initialisation, no branch, no atomics. Without any feature set,
+// behaviour matches the original runtime LazyLock detection.
+//
+// Mutual exclusion: at most ONE cpu-* feature may be enabled. The
+// `_PIN_COUNT` const assert below fires at compile time if more than
+// one is active, mirroring integration-plan risk #1 ("if a user sets
+// cpu-spr AND has runtime detection on Zen 4, the binary SIGILLs").
+// ────────────────────────────────────────────────────────────────────
+
+const _PIN_COUNT: u32 = 0
+    + cfg!(feature = "cpu-gnr") as u32
+    + cfg!(feature = "cpu-spr") as u32
+    + cfg!(feature = "cpu-zen4") as u32
+    + cfg!(feature = "cpu-cpl") as u32
+    + cfg!(feature = "cpu-tigerlake") as u32
+    + cfg!(feature = "cpu-icx") as u32
+    + cfg!(feature = "cpu-clx") as u32
+    + cfg!(feature = "cpu-skx") as u32
+    + cfg!(feature = "cpu-arrowlake") as u32
+    + cfg!(feature = "cpu-haswell") as u32
+    + cfg!(feature = "cpu-a76") as u32
+    + cfg!(feature = "cpu-a72") as u32
+    + cfg!(feature = "cpu-a53") as u32;
+
+const _: () = assert!(
+    _PIN_COUNT <= 1,
+    "cpu-* cargo features are mutually exclusive: enable at most one (cpu-gnr, cpu-spr, cpu-zen4, cpu-cpl, cpu-tigerlake, cpu-icx, cpu-clx, cpu-skx, cpu-arrowlake, cpu-haswell, cpu-a76, cpu-a72, cpu-a53)"
+);
+
+/// The compile-time pinned profile, or `None` when runtime detection is in
+/// effect. `Some(_)` exactly when one of the `cpu-*` cargo features is
+/// enabled; mutually exclusive features are enforced by the `_PIN_COUNT`
+/// const assert above.
+///
+/// Consumers wanting branch-free dispatch on pinned builds can match on
+/// this const directly — the optimiser folds `Some(SimdProfile::X)` into
+/// the call site and the `None`-arm runtime path is eliminated. Returned
+/// from a `const fn` so call sites in const contexts (e.g. `const` array
+/// initialisers for dispatch tables) work as well.
+pub const fn pinned_profile() -> Option<SimdProfile> {
+    #[cfg(feature = "cpu-gnr")]
+    {
+        return Some(SimdProfile::GraniteRapids);
+    }
+    #[cfg(feature = "cpu-spr")]
+    {
+        return Some(SimdProfile::SapphireRapids);
+    }
+    #[cfg(feature = "cpu-zen4")]
+    {
+        return Some(SimdProfile::Zen4Avx512);
+    }
+    #[cfg(feature = "cpu-cpl")]
+    {
+        return Some(SimdProfile::CooperLake);
+    }
+    #[cfg(feature = "cpu-tigerlake")]
+    {
+        return Some(SimdProfile::TigerLakeU);
+    }
+    #[cfg(feature = "cpu-icx")]
+    {
+        return Some(SimdProfile::IceLakeSp);
+    }
+    #[cfg(feature = "cpu-clx")]
+    {
+        return Some(SimdProfile::CascadeLake);
+    }
+    #[cfg(feature = "cpu-skx")]
+    {
+        return Some(SimdProfile::SkylakeX);
+    }
+    #[cfg(feature = "cpu-arrowlake")]
+    {
+        return Some(SimdProfile::ArrowLake);
+    }
+    #[cfg(feature = "cpu-haswell")]
+    {
+        return Some(SimdProfile::HaswellAvx2);
+    }
+    #[cfg(feature = "cpu-a76")]
+    {
+        return Some(SimdProfile::A76DotProd);
+    }
+    #[cfg(feature = "cpu-a72")]
+    {
+        return Some(SimdProfile::A72Fast);
+    }
+    #[cfg(feature = "cpu-a53")]
+    {
+        return Some(SimdProfile::A53Baseline);
+    }
+    #[allow(unreachable_code)]
+    None
+}
+
+/// `true` when a `cpu-*` cargo feature has pinned the profile at compile
+/// time, `false` when runtime detection is in use. Equivalent to
+/// `pinned_profile().is_some()` but spelled out for grep-ability.
+pub const fn is_pinned() -> bool {
+    pinned_profile().is_some()
+}
+
+// The LazyLock only exists when no cpu-* feature is set. With pinning,
+// linking the LazyLock would defeat the purpose — we want every code
+// path that touches `simd_profile()` to fold to a const.
+#[cfg(not(any(
+    feature = "cpu-gnr",
+    feature = "cpu-spr",
+    feature = "cpu-zen4",
+    feature = "cpu-cpl",
+    feature = "cpu-tigerlake",
+    feature = "cpu-icx",
+    feature = "cpu-clx",
+    feature = "cpu-skx",
+    feature = "cpu-arrowlake",
+    feature = "cpu-haswell",
+    feature = "cpu-a76",
+    feature = "cpu-a72",
+    feature = "cpu-a53",
+)))]
 static PROFILE: LazyLock<SimdProfile> = LazyLock::new(SimdProfile::detect);
 
-/// Get the resolved silicon profile, detected once at first access.
+/// Get the resolved silicon profile.
+///
+/// **Default (no `cpu-*` feature):** detected once at first access via
+/// `LazyLock`; subsequent calls are a single pointer deref to a `Copy`
+/// enum — no atomics, no branching.
+///
+/// **Pinned (one `cpu-*` feature set):** returns the pinned const
+/// directly. The compiler folds this call into the matching variant at
+/// every call site; the LazyLock is not linked into the binary.
 ///
-/// All subsequent calls are a single pointer deref to a `Copy` enum —
-/// no atomics, no branching. Pair with `*Dispatch` static tables to make
-/// per-call dispatch a single indirect call after monomorphisation.
+/// Pair with `*Dispatch` static tables to make per-call dispatch a single
+/// indirect call after monomorphisation (runtime path) or to fold to a
+/// direct call (pinned path).
+#[cfg(not(any(
+    feature = "cpu-gnr",
+    feature = "cpu-spr",
+    feature = "cpu-zen4",
+    feature = "cpu-cpl",
+    feature = "cpu-tigerlake",
+    feature = "cpu-icx",
+    feature = "cpu-clx",
+    feature = "cpu-skx",
+    feature = "cpu-arrowlake",
+    feature = "cpu-haswell",
+    feature = "cpu-a76",
+    feature = "cpu-a72",
+    feature = "cpu-a53",
+)))]
 #[inline(always)]
 pub fn simd_profile() -> SimdProfile {
     *PROFILE
 }
 
+/// Get the resolved silicon profile (pinned variant).
+///
+/// A `cpu-*` cargo feature is active: this returns the pinned constant
+/// directly, foldable at every call site. The runtime LazyLock is not
+/// linked into the binary. See the documentation on the runtime variant
+/// for the call-site contract.
+#[cfg(any(
+    feature = "cpu-gnr",
+    feature = "cpu-spr",
+    feature = "cpu-zen4",
+    feature = "cpu-cpl",
+    feature = "cpu-tigerlake",
+    feature = "cpu-icx",
+    feature = "cpu-clx",
+    feature = "cpu-skx",
+    feature = "cpu-arrowlake",
+    feature = "cpu-haswell",
+    feature = "cpu-a76",
+    feature = "cpu-a72",
+    feature = "cpu-a53",
+))]
+#[inline(always)]
+pub const fn simd_profile() -> SimdProfile {
+    // SAFETY of the unwrap: the cfg gate above guarantees at least one
+    // cpu-* feature is set, and `pinned_profile()` returns Some(_) under
+    // any of those gates. Const-evaluable since the inner cfg cascade is
+    // resolved at compile time.
+    match pinned_profile() {
+        Some(p) => p,
+        None => SimdProfile::Scalar, // unreachable; const fn can't panic cleanly
+    }
+}
+
 #[cfg(test)]
 mod tests {
     use super::*;
@@ -273,6 +455,11 @@ mod tests {
 
     #[test]
     fn x86_target_lands_inside_x86_family() {
+        // Pinning overrides hardware detection — the test only describes
+        // the default (runtime-detection) path.
+        if is_pinned() {
+            return;
+        }
         #[cfg(target_arch = "x86_64")]
         {
             let p = simd_profile();
@@ -290,6 +477,9 @@ mod tests {
 
     #[test]
     fn aarch64_target_lands_inside_aarch64_family() {
+        if is_pinned() {
+            return;
+        }
         #[cfg(target_arch = "aarch64")]
         {
             let p = simd_profile();
@@ -297,6 +487,44 @@ mod tests {
         }
     }
 
+    #[test]
+    fn pinning_default_is_off() {
+        // The default build (no cpu-* feature) must NOT be pinned, so
+        // downstream consumers don't get surprised by compile-time
+        // dispatch they didn't opt into.
+        #[cfg(not(any(
+            feature = "cpu-gnr",
+            feature = "cpu-spr",
+            feature = "cpu-zen4",
+            feature = "cpu-cpl",
+            feature = "cpu-tigerlake",
+            feature = "cpu-icx",
+            feature = "cpu-clx",
+            feature = "cpu-skx",
+            feature = "cpu-arrowlake",
+            feature = "cpu-haswell",
+            feature = "cpu-a76",
+            feature = "cpu-a72",
+            feature = "cpu-a53",
+        )))]
+        {
+            assert!(!is_pinned());
+            assert_eq!(pinned_profile(), None);
+        }
+    }
+
+    #[test]
+    fn pinning_consistency() {
+        // When pinning is in effect, simd_profile() must equal the
+        // pinned const — hardware detection is bypassed entirely.
+        if let Some(pinned) = pinned_profile() {
+            assert!(is_pinned());
+            assert_eq!(simd_profile(), pinned);
+        } else {
+            assert!(!is_pinned());
+        }
+    }
+
     #[test]
     fn has_avx512_is_subset_of_is_x86() {
         for &p in &[

From 03b30e54d6f158945d72086d038f216d2c52a315 Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Wed, 20 May 2026 20:38:52 +0000
Subject: [PATCH 3/5] =?UTF-8?q?feat(examples):=20simd=5Fprofile=5Fprobe=20?=
 =?UTF-8?q?=E2=80=94=20hardware=20verification=20binary?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Supports step 1 of the TEST-promotion checklist from
.claude/knowledge/td-simd-cpu-dispatch-matrix.md § "TEST verification
checklist": "Boot the binary on the silicon and confirm
simd_profile() returns the expected variant."

The probe prints:
  - Resolved SimdProfile variant + arch/family flags
  - Compile-time pinning status (and pinned variant if active)
  - Every CPUID-derived SimdCaps bit, ticked/unticked
  - ARM heuristic profile when running on aarch64
  - Active compile-time target features (avx512f / avx2)
  - Per-variant matrix-doc cell summary (terse — matrix is source of truth)
  - Runs the same pinning_consistency invariant the unit test checks,
    so a probe deployed on real silicon flags regressions in the cfg
    cascade.

First-hardware results on the build host (Sapphire Rapids):
  - simd_profile() resolves to SapphireRapids ✓
  - amx_tile + amx_bf16 + avx512fp16 all set ✓
  - amx_fp16 unset (correctly NOT promoting to GraniteRapids) ✓
  - GNR-before-SPR ordering invariant verified end-to-end

This is the first end-to-end pass of the e40f3a31 SimdProfile detect
chain plus the 5a3a6630 cpu-* pinning machinery on real silicon —
confirms the dispatch axis is functional, not just doc-checked.

Smoke-tested with --features cpu-zen4: probe correctly reports
ACTIVE pinning and Zen4Avx512 as the resolved variant, even on SPR
silicon (pinning intentionally overrides hardware detection).
---
 examples/simd_profile_probe.rs | 181 +++++++++++++++++++++++++++++++++
 1 file changed, 181 insertions(+)
 create mode 100644 examples/simd_profile_probe.rs

diff --git a/examples/simd_profile_probe.rs b/examples/simd_profile_probe.rs
new file mode 100644
index 00000000..609843f4
--- /dev/null
+++ b/examples/simd_profile_probe.rs
@@ -0,0 +1,181 @@
+//! `simd_profile_probe` — boot-on-silicon diagnostic for the dispatch matrix.
+//!
+//! Step 1 of the TEST-promotion checklist from
+//! `.claude/knowledge/td-simd-cpu-dispatch-matrix.md` § "TEST verification
+//! checklist": *"Boot the binary on the silicon and confirm `simd_profile()`
+//! returns the expected variant."*
+//!
+//! Prints every CPUID-derived capability bit plus the resolved `SimdProfile`
+//! variant. Used to verify silicon → profile mapping when promoting DOC
+//! cells in the dispatch matrix to TEST.
+//!
+//! Usage:
+//! ```sh
+//! # Runtime detection (default — same binary on any silicon):
+//! cargo run --example simd_profile_probe --release
+//!
+//! # Compile-time pinned (the LazyLock is not linked in):
+//! cargo run --example simd_profile_probe --release --features cpu-spr
+//! ```
+
+use ndarray::hpc::simd_caps::{simd_caps, ArmProfile, SimdCaps};
+use ndarray::hpc::simd_profile::{is_pinned, pinned_profile, simd_profile, SimdProfile};
+
+fn main() {
+    let caps = simd_caps();
+    let profile = simd_profile();
+
+    println!("ndarray simd-profile probe");
+    println!("==========================");
+    println!();
+
+    // ── Dispatch identity ───────────────────────────────────────────
+    println!("Resolved profile: {}", profile.name());
+    println!("  is_x86:        {}", profile.is_x86());
+    println!("  is_aarch64:    {}", profile.is_aarch64());
+    println!("  has_avx512:    {}", profile.has_avx512());
+    println!("  has_amx:       {}", profile.has_amx());
+    println!();
+
+    // ── Pinning status ─────────────────────────────────────────────
+    println!("Compile-time pinning: {}", if is_pinned() { "ACTIVE" } else { "off (runtime detection)" });
+    if let Some(p) = pinned_profile() {
+        println!("  Pinned variant:  {}", p.name());
+    }
+    println!();
+
+    // ── Raw capability bits ────────────────────────────────────────
+    println!("SimdCaps (raw bits):");
+    print_caps(&caps);
+    println!();
+
+    // ── ARM-specific sub-profile (heuristic; deployment-pragmatic) ──
+    let arm = caps.arm_profile();
+    if !matches!(arm, ArmProfile::NotArm) {
+        println!("ARM profile (heuristic):  {}", arm.name());
+        println!("  est. tok/sec:  {}", arm.estimated_tok_per_sec());
+        println!("  eff. f32 lanes:{}", arm.effective_f32_lanes());
+        println!();
+    }
+
+    // ── Build configuration ─────────────────────────────────────────
+    println!("Build:");
+    println!("  target_arch:  {}", std::env::consts::ARCH);
+    println!("  target_os:    {}", std::env::consts::OS);
+    #[cfg(target_feature = "avx512f")]
+    println!("  -Ctarget-feature avx512f: yes (compile-time)");
+    #[cfg(not(target_feature = "avx512f"))]
+    println!("  -Ctarget-feature avx512f: no (compile-time)");
+    #[cfg(target_feature = "avx2")]
+    println!("  -Ctarget-feature avx2:    yes (compile-time)");
+    #[cfg(not(target_feature = "avx2"))]
+    println!("  -Ctarget-feature avx2:    no (compile-time)");
+    println!();
+
+    // ── TEST promotion guidance ────────────────────────────────────
+    println!("Matrix-doc cells affected by this CPU:");
+    matrix_cell_summary(profile);
+
+    // Sanity invariant: simd_profile() and pinned_profile() must agree
+    // when pinning is active. This is the same check that
+    // `pinning_consistency` runs as a unit test; we re-run it here so a
+    // probe binary deployed on real silicon flags any future regression
+    // in the cfg cascade.
+    if let Some(p) = pinned_profile() {
+        assert_eq!(
+            profile, p,
+            "INVARIANT VIOLATION: pinned_profile()={:?} disagrees with simd_profile()={:?}",
+            p, profile
+        );
+    }
+}
+
+fn print_caps(c: &SimdCaps) {
+    let bits: &[(&str, bool)] = &[
+        ("avx2", c.avx2),
+        ("avx512f", c.avx512f),
+        ("avx512bw", c.avx512bw),
+        ("avx512vl", c.avx512vl),
+        ("avx512vnni", c.avx512vnni),
+        ("avx512vbmi", c.avx512vbmi),
+        ("avx512vpopcntdq", c.avx512vpopcntdq),
+        ("avx512bf16", c.avx512bf16),
+        ("avx512fp16", c.avx512fp16),
+        ("avx512vp2intersect", c.avx512vp2intersect),
+        ("avxvnniint8", c.avxvnniint8),
+        ("amx_tile", c.amx_tile),
+        ("amx_int8", c.amx_int8),
+        ("amx_bf16", c.amx_bf16),
+        ("amx_fp16", c.amx_fp16),
+        ("fma", c.fma),
+        ("sse41", c.sse41),
+        ("sse2", c.sse2),
+        ("neon", c.neon),
+        ("asimd_dotprod", c.asimd_dotprod),
+        ("fp16 (arm)", c.fp16),
+        ("aes", c.aes),
+        ("sha2", c.sha2),
+        ("crc32", c.crc32),
+    ];
+    for (name, present) in bits {
+        println!("  [{}] {}", if *present { "x" } else { " " }, name);
+    }
+}
+
+fn matrix_cell_summary(p: SimdProfile) {
+    // Lifted from `td-simd-cpu-dispatch-matrix.md` § "Master matrix"
+    // for each x86 profile. The summary is intentionally terse — the
+    // matrix doc is the source of truth and should be consulted before
+    // promoting any DOC cell to TEST.
+    let summary: &[&str] = match p {
+        SimdProfile::GraniteRapids => &[
+            "F+CD+VL+DQ+BW+IFMA+VBMI+VBMI2+VNNI+BF16+FP16",
+            "VPOPCNTDQ+BITALG+GFNI+VAES+VPCLMUL",
+            "AMX-TILE+INT8+BF16+FP16 (FP16 is the GNR discriminator)",
+        ],
+        SimdProfile::SapphireRapids => &[
+            "F+CD+VL+DQ+BW+IFMA+VBMI+VBMI2+VNNI+BF16+FP16",
+            "VPOPCNTDQ+BITALG+GFNI+VAES+VPCLMUL",
+            "AMX-TILE+INT8+BF16 (no AMX-FP16 — that's GNR)",
+        ],
+        SimdProfile::Zen4Avx512 => &[
+            "F+CD+VL+DQ+BW+IFMA+VBMI+VBMI2+VNNI+BF16+FP16",
+            "No AMX of any kind; 256-bit FPU double-pumped on Zen4, native 512-bit on Zen5",
+        ],
+        SimdProfile::CooperLake => &[
+            "F+CD+VL+DQ+BW+VNNI+BF16",
+            "No VBMI, no FP16, no AMX — unique 'BF16 without VBMI'",
+        ],
+        SimdProfile::TigerLakeU => &[
+            "F+CD+VL+DQ+BW+IFMA+VBMI+VBMI2+VNNI+VP2INTERSECT",
+            "VP2INTERSECT is the sole discriminator vs IceLakeSp",
+        ],
+        SimdProfile::IceLakeSp => &[
+            "F+CD+VL+DQ+BW+IFMA+VBMI+VBMI2+VNNI",
+            "No BF16, no FP16, no AMX, no VP2INTERSECT",
+        ],
+        SimdProfile::CascadeLake => &["F+CD+VL+DQ+BW+VNNI", "First Xeon with VNNI; no VBMI/BF16/FP16/AMX"],
+        SimdProfile::SkylakeX => &["F+CD+VL+DQ+BW", "Founding AVX-512 baseline; everything since adds on top"],
+        SimdProfile::ArrowLake => &[
+            "No AVX-512 (hybrid CPU design)",
+            "AVX-VNNI-INT8 + AVX-IFMA + AVX-NE-CONVERT (256-bit / VEX forms)",
+        ],
+        SimdProfile::HaswellAvx2 => &["AVX2 + FMA + F16C + BMI1/2", "Haswell..Coffee Lake / Zen 1-3"],
+        SimdProfile::A76DotProd => &[
+            "NEON + dotprod + fp16 + bf16+ + i8mm",
+            "Pi 5 (BCM2712), Orange Pi 5 (RK3588), Apple M1+",
+        ],
+        SimdProfile::A72Fast => &[
+            "NEON 128-bit + crypto (AES/SHA-2/CRC32)",
+            "Pi 4 (BCM2711), Pi 3-with-crypto, Orange Pi 4 — HWCAP cannot distinguish A72 from A53-with-crypto",
+        ],
+        SimdProfile::A53Baseline => &[
+            "NEON 128-bit baseline",
+            "Rare in the wild — QEMU / minimal aarch64 without crypto",
+        ],
+        SimdProfile::Scalar => &["No SIMD ISA recognised", "Fallback: scalar reference kernels"],
+    };
+    for line in summary {
+        println!("  - {}", line);
+    }
+}

From de52a446bdff917d2612c0bf5c3b96b165faa953 Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Wed, 20 May 2026 20:43:24 +0000
Subject: [PATCH 4/5] =?UTF-8?q?fix(simd):=20SimdProfile::detect()=20consul?=
 =?UTF-8?q?ts=20amx=5Favailable()=20=E2=80=94=20Risk=20#3=20closure?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Integration plan risk #3 ("Detection robustness across hypervisors"):
CPUID may advertise AMX-TILE while the OS/hypervisor has not enabled
the tile XSAVE state. Without the OS-level check, the dispatch table
routes to AMX kernels that SIGILL at first use.

Fix: SimdProfile::detect() now reads `simd_amx::amx_available()` (the
existing 4-step gate: CPUID → OSXSAVE → XCR0[17,18] → arch_prctl
XCOMP_PERM on Linux 5.19+) and demotes when CPUID and OS disagree.
The GraniteRapids and SapphireRapids arms now require both the CPUID
bits AND `amx_usable`; the Zen4Avx512 arm catches SPR-class CPUID
with locked-down hypervisor XSAVE so dispatch falls to the AVX-512
BF16/FP16 path instead.

Verified on the build host (Sapphire Rapids silicon, kernel 6.18.5):
  - CPUID reports amx_tile=1, amx_int8=1, amx_bf16=1 (all true)
  - simd_amx::amx_available() returns false (hypervisor masks
    XCR0[17,18] or the arch_prctl(XCOMP_PERM) request fails)
  - SimdProfile::detect() correctly resolves to Zen4Avx512, not
    SapphireRapids — the AMX kernels are not reachable from
    dispatch on this OS state.

Without this fix, the e40f3a31 detect path would have resolved to
SapphireRapids on this exact silicon/OS combination, then SIGILL'd
the first time a dispatch table called an AMX kernel. Bug closed
before any consumer was wired to the dispatch table.

The probe binary (examples/simd_profile_probe.rs) gains a new "AMX
gating (CPUID vs OS)" section so the CPUID-vs-OS gap is visible
without reading source. Format mirrors how the matrix-doc cell
summary appears: terse, two lines plus an optional demotion note
when the bits disagree.

Pinned mode (cpu-* cargo features) intentionally bypasses this gate
since pinning is a build-time assertion that the target OS supports
the chosen variant — pinned binaries are non-portable by design.

Tests: 2077/2077 lib pass. cargo clippy --lib clean under default
and --features cpu-spr. Behaviour on hardware with proper AMX
enablement (full prctl path success) is unchanged: SapphireRapids
still resolves to SapphireRapids when amx_available() returns true.
---
 examples/simd_profile_probe.rs | 21 +++++++++++++++++++++
 src/hpc/simd_profile.rs        | 20 ++++++++++++++++----
 2 files changed, 37 insertions(+), 4 deletions(-)

diff --git a/examples/simd_profile_probe.rs b/examples/simd_profile_probe.rs
index 609843f4..5324198c 100644
--- a/examples/simd_profile_probe.rs
+++ b/examples/simd_profile_probe.rs
@@ -49,6 +49,27 @@ fn main() {
     print_caps(&caps);
     println!();
 
+    // ── AMX OS-state probe (Risk #3 from integration plan) ────────
+    // SimdCaps reports raw CPUID. SimdProfile::detect() additionally
+    // consults `simd_amx::amx_available()` which gates on
+    // OSXSAVE + XCR0[17,18] + arch_prctl(XCOMP_PERM). If CPUID says
+    // AMX-TILE but the OS/hypervisor doesn't enable the XSAVE state,
+    // dispatch demotes from SPR/GNR to Zen4Avx512 (AVX-512 BF16 path
+    // instead of AMX tiles). Surfacing the gap here lets a reviewer
+    // see when CPUID-vs-OS disagree without reading source.
+    #[cfg(target_arch = "x86_64")]
+    {
+        let cpuid_says_amx = caps.amx_tile && caps.amx_int8;
+        let os_allows_amx = ndarray::simd_amx::amx_available();
+        println!("AMX gating (CPUID vs OS):");
+        println!("  CPUID amx_tile+amx_int8: {}", cpuid_says_amx);
+        println!("  OS XSAVE/prctl gate:     {}", os_allows_amx);
+        if cpuid_says_amx && !os_allows_amx {
+            println!("  → CPUID-reported AMX is OS-DEMOTED — dispatch falls back to AVX-512 path");
+        }
+        println!();
+    }
+
     // ── ARM-specific sub-profile (heuristic; deployment-pragmatic) ──
     let arm = caps.arm_profile();
     if !matches!(arm, ArmProfile::NotArm) {
diff --git a/src/hpc/simd_profile.rs b/src/hpc/simd_profile.rs
index 7ed65b37..78775fe9 100644
--- a/src/hpc/simd_profile.rs
+++ b/src/hpc/simd_profile.rs
@@ -86,22 +86,34 @@ impl SimdProfile {
         #[cfg(target_arch = "x86_64")]
         {
             let caps = simd_caps();
+            // Risk #3 from the integration plan ("Detection robustness
+            // across hypervisors"): CPUID may report AMX-TILE while the
+            // OS has not enabled the tile XSAVE state. In that case AMX
+            // instructions SIGILL despite the CPUID bit being set.
+            // `simd_amx::amx_available()` runs the full 4-step gate
+            // (CPUID + OSXSAVE + XCR0 bits 17/18 + arch_prctl
+            // XCOMP_PERM). Demote to the no-AMX dispatch branch when
+            // the OS check fails — typically resolves as Zen4Avx512 on
+            // SPR-class CPUID with locked-down hypervisor XSAVE state.
+            let amx_usable = caps.amx_tile && crate::simd_amx::amx_available();
             // GraniteRapids: AMX-FP16 (CPUID 7,1 EAX bit 21). Must be
             // checked first because GNR is a strict superset of SPR.
-            if caps.has_amx_fp16() {
+            if amx_usable && caps.amx_fp16 {
                 return SimdProfile::GraniteRapids;
             }
             // SapphireRapids / EmeraldRapids: AMX-TILE + AMX-BF16 +
             // AVX-512-FP16. EmeraldRapids has identical ISA — same variant.
-            if caps.amx_tile && caps.amx_bf16 && caps.avx512fp16 {
+            if amx_usable && caps.amx_bf16 && caps.avx512fp16 {
                 return SimdProfile::SapphireRapids;
             }
-            // Zen4 / Zen5: AVX-512 + VBMI + BF16 + FP16, but no AMX.
+            // Zen4 / Zen5: AVX-512 + VBMI + BF16 + FP16, but no usable
+            // AMX. The `!amx_usable` guard also catches OS-demoted SPR
+            // silicon so it resolves here instead of as SapphireRapids.
             if caps.avx512f
                 && caps.avx512vbmi
                 && caps.avx512bf16
                 && caps.avx512fp16
-                && !caps.amx_tile
+                && !amx_usable
             {
                 return SimdProfile::Zen4Avx512;
             }

From a7279e00aa283f6a75e7ff8a679138ecd3110ba1 Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Thu, 21 May 2026 11:37:28 +0000
Subject: [PATCH 5/5] fix(simd_profile): target_arch guards on cpu-* features +
 std-gate example
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Two independent bug fixes for codex P2 on de52a446bd and the
no-default-features CI failure:

1) codex P2 — Reject ARM cpu-* on x86 builds (and vice versa).
   Without target_arch guards, `--features cpu-a76` on an x86_64
   build silently routes simd_profile() to A76DotProd, breaking
   the is_x86()/is_aarch64() partitioning and routing callers
   into the wrong dispatch family. Add compile_error! checks
   that fail fast for the current target_arch — same fail-fast
   pattern as the existing _PIN_COUNT mutual-exclusion assert.

2) CI fix — examples/simd_profile_probe.rs uses ndarray::hpc::*
   and ndarray::simd_amx::* which are gated behind the "std"
   feature. On `cargo test --no-default-features` the example
   target still tries to compile, producing E0433 "cannot find
   hpc in ndarray". Add `required-features = ["std"]` to the
   example's Cargo.toml entry so it is skipped when std is
   disabled, matching the existing pattern for ocr_benchmark.

No behavioral change on default builds. Both fixes are
independent of the architectural question about whether cpu-*
features should exist at all (which is for the originating
session to revisit if they want to unify with cpu_ops_for_cpu);
this just makes the existing features less of a footgun.
---
 Cargo.toml              |  4 ++++
 src/hpc/simd_profile.rs | 38 ++++++++++++++++++++++++++++++++++++++
 2 files changed, 42 insertions(+)

diff --git a/Cargo.toml b/Cargo.toml
index a431f14b..3e943e1f 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -42,6 +42,10 @@ required-features = ["std"]
 name = "splat3d_flex"
 required-features = ["splat3d"]
 
+[[example]]
+name = "simd_profile_probe"
+required-features = ["std"]
+
 [dependencies]
 num-integer = { workspace = true }
 num-traits = { workspace = true }
diff --git a/src/hpc/simd_profile.rs b/src/hpc/simd_profile.rs
index 78775fe9..4b51996b 100644
--- a/src/hpc/simd_profile.rs
+++ b/src/hpc/simd_profile.rs
@@ -271,6 +271,44 @@ const _: () = assert!(
     "cpu-* cargo features are mutually exclusive: enable at most one (cpu-gnr, cpu-spr, cpu-zen4, cpu-cpl, cpu-tigerlake, cpu-icx, cpu-clx, cpu-skx, cpu-arrowlake, cpu-haswell, cpu-a76, cpu-a72, cpu-a53)"
 );
 
+// ────────────────────────────────────────────────────────────────────
+// target_arch guards (codex P2 closure).
+//
+// Each `cpu-<codename>` pin is only valid on its native silicon
+// family. Without these guards `--features cpu-a76` on an x86_64
+// build would silently route `simd_profile()` to `A76DotProd`,
+// breaking the `is_x86()` / `is_aarch64()` partitioning and routing
+// callers into the wrong dispatch family. Fail fast at compile time
+// instead.
+// ────────────────────────────────────────────────────────────────────
+
+#[cfg(all(
+    not(target_arch = "x86_64"),
+    any(
+        feature = "cpu-gnr",
+        feature = "cpu-spr",
+        feature = "cpu-zen4",
+        feature = "cpu-cpl",
+        feature = "cpu-tigerlake",
+        feature = "cpu-icx",
+        feature = "cpu-clx",
+        feature = "cpu-skx",
+        feature = "cpu-arrowlake",
+        feature = "cpu-haswell",
+    )
+))]
+compile_error!(
+    "x86 cpu-* pinning features (cpu-gnr, cpu-spr, cpu-zen4, cpu-cpl, cpu-tigerlake, cpu-icx, cpu-clx, cpu-skx, cpu-arrowlake, cpu-haswell) require target_arch = \"x86_64\""
+);
+
+#[cfg(all(
+    not(target_arch = "aarch64"),
+    any(feature = "cpu-a76", feature = "cpu-a72", feature = "cpu-a53",)
+))]
+compile_error!(
+    "ARM cpu-* pinning features (cpu-a76, cpu-a72, cpu-a53) require target_arch = \"aarch64\""
+);
+
 /// The compile-time pinned profile, or `None` when runtime detection is in
 /// effect. `Some(_)` exactly when one of the `cpu-*` cargo features is
 /// enabled; mutually exclusive features are enforced by the `_PIN_COUNT`