diff --git a/.cargo/config-apple-m2.toml b/.cargo/config-apple-m2.toml
new file mode 100644
index 00000000..b86a8f56
--- /dev/null
+++ b/.cargo/config-apple-m2.toml
@@ -0,0 +1,20 @@
+[build]
+# Apple M2 / M3 / M4 — ARMv8.6-A+ with BF16, dotprod, fp16, i8mm.
+# Use with:
+#   cargo --config .cargo/config-apple-m2.toml build --target=aarch64-apple-darwin
+#
+# Targets the BF16 tier — see `src/simd_neon_bf16.rs` for the silicon
+# table, runtime detection (`sysctl hw.optional.arm.FEAT_BF16`), the
+# BFMMLA / BFDOT intrinsic family, and the asm-byte fallback path that
+# stable Rust 1.95 must use until `vbfdotq_f32` stabilizes (issue
+# #117222).
+#
+# Also works on:
+#   - Apple M3 (target-cpu=apple-m3) — same ARMv8.6-A baseline
+#   - Apple M4 — adds SVE2, can override with -Ctarget-cpu=apple-m4
+#   - Snapdragon X Elite / X Plus on macOS-like targets (use cortex-x4)
+#
+# DOES NOT target Apple M1 — M1 is ARMv8.5-A and lacks BF16. M1 should
+# use the dotprod tier (config-pi5.toml-shaped, target-cpu=apple-m1).
+[target.aarch64-apple-darwin]
+rustflags = ["-Ctarget-cpu=apple-m2", "-Ctarget-feature=+bf16,+dotprod,+fp16,+i8mm"]
diff --git a/.cargo/config-graviton.toml b/.cargo/config-graviton.toml
new file mode 100644
index 00000000..4b8a7e25
--- /dev/null
+++ b/.cargo/config-graviton.toml
@@ -0,0 +1,19 @@
+[build]
+# AWS Graviton 3 / 3E / 4 (Neoverse V1 / V2) — ARMv8.4-A+ with BF16
+# (V1: optional, V2: mandatory) + SVE / SVE2.
+# Use with:
+#   cargo --config .cargo/config-graviton.toml build --target=aarch64-unknown-linux-gnu
+#
+# Targets the BF16 tier — see `src/simd_neon_bf16.rs`. Graviton 3 (V1)
+# also adds SVE 256-bit; Graviton 4 (V2) adds SVE2 + BFMMLA + i8mm.
+#
+# Also works on:
+#   - Cortex-X3 / X4 / X925 generic Linux servers
+#   - Ampere Altra (V1-class — same baseline)
+#   - NVIDIA Grace (V2 — same as Graviton 4)
+#
+# For ARMv9 cores with SVE2 you may want a separate config-sve2.toml
+# later that adds `+sve2` and routes through a future
+# `src/simd_neon_sve2.rs` (not in Phase 3 scope).
+[target.aarch64-unknown-linux-gnu]
+rustflags = ["-Ctarget-cpu=neoverse-v2", "-Ctarget-feature=+bf16,+dotprod,+fp16,+i8mm"]
diff --git a/.cargo/config-pi5.toml b/.cargo/config-pi5.toml
new file mode 100644
index 00000000..b092b315
--- /dev/null
+++ b/.cargo/config-pi5.toml
@@ -0,0 +1,15 @@
+[build]
+# Raspberry Pi 5 (BCM2712, Cortex-A76) — ARMv8.2-A with dotprod + fp16.
+# Use with:
+#   cargo --config .cargo/config-pi5.toml build --target=aarch64-unknown-linux-gnu
+#
+# Targets the dotprod/fp16 tier — see `src/simd_neon_dotprod.rs` for the
+# silicon table, runtime detection, and stub map. Also works on:
+#   - Orange Pi 5 (Rockchip RK3588, Cortex-A76)
+#   - Anything reporting `Features: ... asimddp asimdhp ...` in
+#     /proc/cpuinfo without `bf16`.
+#
+# For Apple M2+ / Snapdragon X / Graviton 4, use config-apple-m2.toml
+# (BF16 tier — see src/simd_neon_bf16.rs).
+[target.aarch64-unknown-linux-gnu]
+rustflags = ["-Ctarget-cpu=cortex-a76", "-Ctarget-feature=+dotprod,+fp16"]
diff --git a/src/lib.rs b/src/lib.rs
index 9dffe54a..426bdae9 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -266,6 +266,24 @@ pub mod simd_amx;
 #[allow(clippy::all, missing_docs, dead_code, unused_variables, unused_imports)]
 pub mod simd_neon;
 
+// NEON tier scaffolds — Phase 3 of the SIMD integration plan
+// (.claude/knowledge/simd-dispatch-architecture.md § 6).
+//
+// Each file documents the silicon, the runtime + compile-time detection
+// path, and stubs out the F16 / BF16 wrappers with intrinsic maps for
+// future implementation. Current state: scaffolds only — the actual
+// NEON code still lives in `simd_neon.rs::aarch64_simd` and gets
+// migrated tier-by-tier as the Phase 3 sprints land.
+#[cfg(all(target_arch = "aarch64", feature = "std"))]
+#[allow(clippy::all, missing_docs, dead_code, unused_variables, unused_imports)]
+pub mod simd_neon_baseline;
+#[cfg(all(target_arch = "aarch64", feature = "std"))]
+#[allow(clippy::all, missing_docs, dead_code, unused_variables, unused_imports)]
+pub mod simd_neon_dotprod;
+#[cfg(all(target_arch = "aarch64", feature = "std"))]
+#[allow(clippy::all, missing_docs, dead_code, unused_variables, unused_imports)]
+pub mod simd_neon_bf16;
+
 #[cfg(feature = "std")]
 #[allow(clippy::all, missing_docs, dead_code, unused_variables, unused_imports)]
 pub mod simd_wasm;
diff --git a/src/simd_neon_baseline.rs b/src/simd_neon_baseline.rs
new file mode 100644
index 00000000..cf7d2437
--- /dev/null
+++ b/src/simd_neon_baseline.rs
@@ -0,0 +1,81 @@
+//! NEON baseline tier — ARMv8.0-A `+neon` only.
+//!
+//! # Silicon
+//!
+//! Every aarch64 CPU since ARMv8.0-A ships NEON unconditionally. This
+//! tier is the lowest common denominator — the floor every other tier
+//! builds on. Concretely:
+//!
+//! - **Raspberry Pi 3** (BCM2837, Cortex-A53) — ARMv8.0-A
+//! - **Raspberry Pi 4** (BCM2711, Cortex-A72) — ARMv8.0-A
+//! - **Pi CM3 / CM4 (4 GB)** — same A72 silicon
+//! - Anything reporting `Features: ... asimd ...` in `/proc/cpuinfo`
+//!   without `asimddp`, `asimdfhm`, `asimdhp`, or `bf16`.
+//!
+//! # What you get
+//!
+//! Native 128-bit lanes: `float32x4_t`, `float64x2_t`, `int8x16_t`,
+//! `uint8x16_t`, `int16x8_t`, `uint16x8_t`, `int32x4_t`, `int64x2_t`,
+//! `uint32x4_t`, `uint64x2_t`. Standard NEON arithmetic — `vaddq_*`,
+//! `vsubq_*`, `vmulq_*`, `vfmaq_*`, `vminq_*`, `vmaxq_*`, gather /
+//! scatter via `vld1q_*` / `vst1q_*`, lane select, reduce.
+//!
+//! # What you do NOT get
+//!
+//! - **dotprod** (SDOT/UDOT) → see `simd_neon_dotprod.rs`
+//! - **fp16 arithmetic** (`vfmlaq_f16`, `vaddq_f16`) → see
+//!   `simd_neon_dotprod.rs`
+//! - **bf16** (`vbfdotq_f32`, `vbfmlalbq_f32`) → see `simd_neon_bf16.rs`
+//! - **SVE2** (variable-length vectors) → not in any current tier file.
+//!
+//! # 512-bit composed wrappers
+//!
+//! `crate::simd::F32x16` / `U8x64` etc. on aarch64 compose 4× 128-bit
+//! NEON registers into one logical wrapper, e.g.
+//! `pub struct F32x16(pub [float32x4_t; 4])`. The four loads/stores
+//! pipeline well on dual-issue cores (A72, A76, M-series).
+//!
+//! # Cargo config
+//!
+//! No special flags needed. `aarch64-unknown-linux-gnu` / `aarch64-
+//! apple-darwin` already enable NEON. Pi 3/4 cross-builds:
+//! `cargo build --target=aarch64-unknown-linux-gnu` from any host.
+//!
+//! # Status
+//!
+//! Scaffold only — placeholder for Phase 3 implementation. The actual
+//! 128-bit native wrappers (I8x16, I16x8, U8x16, U16x8, U32x4, U64x2,
+//! I32x4, I64x2) currently live in `src/simd_neon.rs::aarch64_simd`.
+//! That code moves here once the tier split lands.
+//!
+//! Composed 512-bit wrappers (`F32x16` = `[float32x4_t; 4]` etc.) for
+//! the 8 missing int types (U8x64, I8x64, I16x32, I32x16, I64x8,
+//! U16x32, U32x16, U64x8) are TODO — currently dispatched to
+//! `simd_scalar.rs` via the `scalar::*` fallback at `simd.rs:1593-95`.
+
+#![cfg(all(target_arch = "aarch64", feature = "std"))]
+
+// TODO(Phase-3): move the existing `pub mod aarch64_simd` block from
+// `src/simd_neon.rs` (lines 463-1126 of master @ 3c20392f) into this
+// file. Then re-export from `simd_neon.rs` for backwards compatibility
+// during the migration window. Same pattern as Phase 4 used to extract
+// `simd_scalar.rs` via `#[path]` declaration.
+
+// TODO(Phase-3): add the 8 missing 512-bit composed wrappers as
+// `[neon_native; 4]`. Apply the `avx2_int_type!`-equivalent macro
+// pattern to generate them mechanically — name it `neon_int_type!` and
+// keep the API surface identical to the AVX-512 / AVX2 / nightly arms.
+//
+//   neon_int_type!(U8x64,  u8,  64, uint8x16_t, vaddq_u8,  vsubq_u8);
+//   neon_int_type!(I8x64,  i8,  64, int8x16_t,  vaddq_s8,  vsubq_s8);
+//   neon_int_type!(U16x32, u16, 32, uint16x8_t, vaddq_u16, vsubq_u16);
+//   neon_int_type!(I16x32, i16, 32, int16x8_t,  vaddq_s16, vsubq_s16);
+//   neon_int_type!(U32x16, u32, 16, uint32x4_t, vaddq_u32, vsubq_u32);
+//   neon_int_type!(I32x16, i32, 16, int32x4_t,  vaddq_s32, vsubq_s32);
+//   neon_int_type!(U64x8,  u64, 8,  uint64x2_t, vaddq_u64, vsubq_u64);
+//   neon_int_type!(I64x8,  i64, 8,  int64x2_t,  vaddq_s64, vsubq_s64);
+
+// TODO(Phase-3): copy the existing F32x16 / F64x8 paired-load impls
+// from `src/simd_neon.rs::aarch64_simd::{F32x16, F64x8, F32Mask16,
+// F64Mask8}` here. They already use the composed `[float32x4_t; 4]` /
+// `[float64x2_t; 4]` layout this tier expects.
diff --git a/src/simd_neon_bf16.rs b/src/simd_neon_bf16.rs
new file mode 100644
index 00000000..e046b2d3
--- /dev/null
+++ b/src/simd_neon_bf16.rs
@@ -0,0 +1,204 @@
+//! NEON + BF16 tier — ARMv8.6-A `+bf16` (or ARMv8.2-A + optional `+bf16`).
+//!
+//! Builds on `simd_neon_dotprod.rs`. Adds the BF16 instruction family:
+//! BFDOT, BFMMLA, BFMLALB, BFMLALT, BFCVT. These are the bf16 cousins
+//! of dotprod — same 4× int8 throughput shape, but for the half-the-
+//! width bfloat16 type that LLM inference standardized on.
+//!
+//! # Silicon
+//!
+//! - **Apple M2 / M3 / M4** (Avalanche/Blizzard, Everest/Sawtooth,
+//!   Tupai/Donan) — ARMv8.6-A+. BF16 always on. `sysctl
+//!   hw.optional.arm.FEAT_BF16` returns 1. M1 does NOT have BF16 — it's
+//!   ARMv8.5-A.
+//! - **Snapdragon X Elite / X Plus** (Cortex-X4/X3 cores, Oryon
+//!   prime) — ARMv8.7-A. BF16 always on.
+//! - **Cortex-A510 / A520 / A710 / A720 / X2 / X3 / X4 / X925** —
+//!   ARMv9.0-A+. BF16 always on.
+//! - **NVIDIA Grace** (Neoverse V2) — ARMv9-A. BF16 on.
+//! - **AWS Graviton 3 / 3E / 4** (Neoverse V1/V2) — V1 added BF16 as
+//!   optional ARMv8.4-A extension; V2 makes it mandatory.
+//! - **Ampere One (M-series)** — ARMv8.6-A+. BF16 on.
+//!
+//! # NOT in this tier
+//!
+//! - Apple M1 (ARMv8.5-A, no BF16) — falls back to `simd_neon_dotprod.rs`
+//! - Raspberry Pi 5 (Cortex-A76, ARMv8.2-A, no BF16) — `simd_neon_dotprod.rs`
+//! - Any Pi 3/4 / Cortex-A53/A72 — `simd_neon_baseline.rs`
+//!
+//! # How to detect at runtime
+//!
+//! - **Linux**: `/proc/cpuinfo` Features line should show `bf16`.
+//!   `getauxval(AT_HWCAP2) & HWCAP2_BF16` (bit 14).
+//!   `std::arch::is_aarch64_feature_detected!("bf16")` — recommended.
+//! - **macOS**: `sysctl hw.optional.arm.FEAT_BF16` → `1` means yes.
+//!   On M2+ it's always 1; on M1 it's 0.
+//! - **Windows ARM64**: `IsProcessorFeaturePresent(PF_ARM_V83_BF16)`
+//!   (constant added in Win11 24H2 SDK).
+//!
+//! # How to detect at compile time
+//!
+//! Cargo config flags:
+//! - `-Ctarget-feature=+bf16` — enables BF16 intrinsics + cfg gate.
+//! - `-Ctarget-cpu=apple-m2` — implies bf16 + everything else.
+//! - `-Ctarget-cpu=neoverse-v2` — Graviton 4 baseline.
+//! - `-Ctarget-cpu=cortex-x4` — Snapdragon X Elite / Cortex-X4 cores.
+//!
+//! Inside Rust:
+//!
+//! ```ignore
+//! #[cfg(all(target_arch = "aarch64", target_feature = "bf16"))]
+//! pub use crate::simd_neon_bf16::{BF16x8, BF16x16, bfdot, bfmmla};
+//! ```
+//!
+//! # What you get
+//!
+//! ## BF16 dot-product / matrix-multiply
+//!
+//! - `vbfdotq_f32(acc, a, b)` — 2×(2×bf16·2×bf16) → 2×f32, accumulated
+//!   into 4×f32 register. The bf16 analogue of `vdotq_s32`.
+//! - `vbfmmlaq_f32(acc, a, b)` — 2×2 outer product BFMMLA. The crown
+//!   jewel for transformer GEMM — accumulates a full 2×2 f32 tile per
+//!   instruction. 8 bf16 mults + 4 f32 adds per cycle on M2.
+//! - `vbfmlalbq_f32` / `vbfmlaltq_f32` — bottom / top half multiply-
+//!   accumulate, lane-by-lane variant of BFDOT.
+//! - `vbfmlalbq_laned_f32` — broadcast one lane across all bf16
+//!   multiplications. Useful for matvec.
+//!
+//! ## BF16 conversion
+//!
+//! - `vcvt_bf16_f32` / `vcvtq_low_bf16_f32` / `vcvtq_high_bf16_f32` —
+//!   pack 4×f32 → 4×bf16. Hardware rounding (no manual RNE needed
+//!   like the AVX-512BF16 `_mm512_cvtne2ps_pbh` path in
+//!   `simd_avx512.rs`).
+//! - Scalar f32 ↔ bf16: trivial high-16-bit slice (the scalar paths in
+//!   `src/simd.rs:1604-1626` work everywhere, including this tier).
+//!
+//! # Composed wrapper shapes
+//!
+//! - `BF16x8` = `bfloat16x8_t` — native 128-bit register, 8 bf16 lanes.
+//!   Matches AVX-512BF16 `BF16x8 = __m128bh` in shape.
+//! - `BF16x16` = `[bfloat16x8_t; 2]` — two 128-bit registers, 16 bf16
+//!   lanes. Matches AVX-512BF16 `BF16x16 = __m256bh` in shape.
+//!
+//! # Cargo configs
+//!
+//! ```toml
+//! # .cargo/config-apple-m2.toml — Apple M2/M3/M4
+//! [build]
+//! target = "aarch64-apple-darwin"
+//! [target.aarch64-apple-darwin]
+//! rustflags = ["-Ctarget-cpu=apple-m2", "-Ctarget-feature=+bf16,+dotprod,+fp16"]
+//! ```
+//!
+//! ```toml
+//! # .cargo/config-graviton.toml — AWS Graviton 3/4
+//! [build]
+//! target = "aarch64-unknown-linux-gnu"
+//! [target.aarch64-unknown-linux-gnu]
+//! rustflags = ["-Ctarget-cpu=neoverse-v2", "-Ctarget-feature=+bf16"]
+//! ```
+//!
+//! ```toml
+//! # .cargo/config-snapdragon-x.toml — Snapdragon X Elite (Win/Linux)
+//! [build]
+//! target = "aarch64-pc-windows-msvc"   # or aarch64-unknown-linux-gnu
+//! rustflags = ["-Ctarget-cpu=cortex-x4", "-Ctarget-feature=+bf16,+i8mm"]
+//! ```
+//!
+//! # Stable-Rust constraint
+//!
+//! Same as the FP16 tier: `bfloat16x8_t` exists in `core::arch::aarch64`
+//! on stable, but the intrinsics (`vbfdotq_f32`, `vbfmmlaq_f32`, ...)
+//! are nightly-only (issue #117222). Two paths on stable 1.95:
+//!
+//!   1. **asm! byte encoding** — same pattern as `src/simd_amx.rs`
+//!      uses for AMX. Example:
+//!      ```ignore
+//!      // BFDOT v0.4s, v1.8h, v2.8h
+//!      asm!(".inst 0x4e41ec00", inout("v0") acc, in("v1") a, in("v2") b);
+//!      // BFMMLA v0.4s, v1.8h, v2.8h
+//!      asm!(".inst 0x6e42ec01", inout("v0") acc, in("v1") a, in("v2") b);
+//!      ```
+//!      Verify the encoding with `aarch64-linux-gnu-objdump --disassemble`
+//!      on a reference compile.
+//!   2. **Round-trip through f32** — convert bf16 → f32 (scalar bit-
+//!      shift), use the existing `vfmaq_f32` from baseline NEON, convert
+//!      back. Loses the 4× throughput; only as a correctness anchor for
+//!      the asm path.
+//!
+//! Path (1) is the only one worth shipping. The asm-byte fallback IS
+//! how `simd_amx.rs` ships AMX on stable Rust today — same pattern.
+
+#![cfg(all(target_arch = "aarch64", feature = "std"))]
+
+// ─── BF16 stubs ──────────────────────────────────────────────────────
+
+/// Placeholder for the BF16 8-lane native wrapper.
+///
+/// Real implementation: `pub struct BF16x8(pub bfloat16x8_t)`. API
+/// surface mirrors `simd_avx512::BF16x8`:
+/// - `splat(bits: u16) -> Self` (broadcast bf16 bit pattern across 8 lanes)
+/// - `from_slice(s: &[u16]) -> Self` (load 8 raw bf16 bits as u16s)
+/// - `to_array(self) -> [u16; 8]`
+/// - `dot_f32(self, other: Self, acc: F32x4) -> F32x4` — wraps BFDOT
+/// - `cvt_to_f32_lo(self) -> F32x4`, `cvt_to_f32_hi(self) -> F32x4`
+///
+/// Without `target_feature = "bf16"`, this falls back to round-trip
+/// through f32 (slow). With the feature on, it uses asm-byte BFDOT.
+pub struct BF16x8Stub;
+
+/// Placeholder for the BF16 16-lane composed wrapper.
+///
+/// Real implementation: `pub struct BF16x16(pub [bfloat16x8_t; 2])`.
+/// API mirror of `simd_avx512::BF16x16`. The 16-lane variant is the
+/// natural width for matmul tile rows in transformer attention.
+pub struct BF16x16Stub;
+
+impl BF16x8Stub {
+    pub fn unimplemented() -> ! {
+        unimplemented!(
+            "BF16x8 NEON bf16-tier implementation TODO. See \
+             src/simd_neon_bf16.rs module docs for the BFDOT / BFMMLA \
+             asm-byte encoding (stable Rust 1.95 can't reach the \
+             nightly-only vbfdotq_f32 intrinsic). Reference: \
+             src/simd_amx.rs's `.byte` pattern."
+        )
+    }
+}
+
+impl BF16x16Stub {
+    pub fn unimplemented() -> ! {
+        unimplemented!(
+            "BF16x16 NEON bf16-tier implementation TODO. Two-half \
+             composed wrapper [bfloat16x8_t; 2] — see module docs."
+        )
+    }
+}
+
+// ─── BFMMLA: the prize intrinsic ─────────────────────────────────────
+//
+// BFMMLA is the most important instruction this tier unlocks. It
+// computes a 2×2 outer-product matrix multiply of bf16 inputs,
+// accumulating into a 2×2 f32 tile. One instruction = 8 bf16 mults +
+// 4 f32 adds. On Apple M2 the throughput is ~32 GFLOP/s per core in
+// bf16-matmul-bound kernels.
+//
+// Encoding for `BFMMLA Vd.4s, Vn.8h, Vm.8h`: 0x6e40_ec00 | (Vm << 16)
+// | (Vn << 5) | Vd. Use a `bfmmla!` macro to emit the asm-byte for any
+// (acc, a, b) v-register triple.
+//
+// TODO(Phase-3): implement `bfmmla(acc: F32x4, a: BF16x8, b: BF16x8)
+// -> F32x4` as the primary export. The rest of the BF16 API builds on
+// it (BFDOT is BFMMLA's diagonal, BFMLALB/T are its half-slices).
+
+// ─── BFDOT: same shape as DotProd, but bf16 ──────────────────────────
+//
+// Where `vdotq_s32(acc, a, b)` does 4×(4×i8·4×i8) → 4×i32, BFDOT does
+// 2×(2×bf16·2×bf16) → 2×f32 accumulated into 4×f32. The bf16 analogue
+// is HALF the lane count per output (2 vs 4) because bf16 is twice as
+// wide as i8.
+//
+// TODO(Phase-3): implement `bfdot(acc: F32x4, a: BF16x8, b: BF16x8)
+// -> F32x4`. Asm-byte for `BFDOT Vd.4s, Vn.8h, Vm.8h`:
+//   0x4e40_ec00 | (Vm << 16) | (Vn << 5) | Vd
diff --git a/src/simd_neon_dotprod.rs b/src/simd_neon_dotprod.rs
new file mode 100644
index 00000000..d3bc3fcd
--- /dev/null
+++ b/src/simd_neon_dotprod.rs
@@ -0,0 +1,156 @@
+//! NEON + dotprod + FP16 tier — ARMv8.2-A `+dotprod,+fp16`.
+//!
+//! Builds on `simd_neon_baseline.rs`. This tier adds two ISA features
+//! that landed together in ARMv8.2-A and ship as a pair on every
+//! consumer aarch64 chip from ~2019 onward.
+//!
+//! # Silicon
+//!
+//! - **Raspberry Pi 5** (BCM2712, Cortex-A76) — ARMv8.2-A. The first
+//!   Pi with dotprod + fp16. 4× int8 dot-product throughput vs Pi 4.
+//! - **Cortex-A55r2+** — ARMv8.2-A revision 2 silicon. Mid-range
+//!   Android, low-power servers (Ampere Altra little cores).
+//! - **Cortex-A75 / A76 / A77 / A78 / A78AE / X1 / X2 / X3** — all
+//!   ARMv8.2-A or later, all have dotprod + fp16.
+//! - **Apple A11 / A12 / A13 / M1** — ARMv8.3-A or later. (Note: Apple
+//!   exposes `bf16` from M2 onward; see `simd_neon_bf16.rs`.)
+//! - **Snapdragon 8 Gen 1+** — ARMv8.4-A+. dotprod + fp16 always on.
+//!
+//! # How to detect at runtime
+//!
+//! - **Linux**: read `/proc/cpuinfo`'s `Features:` line. dotprod is
+//!   reported as `asimddp`; fp16 as `asimdhp` (asimd half-precision)
+//!   and `fphp` (scalar fp16). Both should be present on this tier.
+//!   Or use `std::arch::is_aarch64_feature_detected!("dotprod")` /
+//!   `("fp16")` — already wired into `simd.rs::detect_tier()` at
+//!   line 63: `if std::arch::is_aarch64_feature_detected!("dotprod")
+//!   { return Tier::NeonDotProd; }`.
+//! - **macOS** (Apple silicon): all M-series chips have dotprod + fp16.
+//!   `sysctl hw.optional.arm.FEAT_DotProd` returns 1.
+//! - **Android NDK**: `getauxval(AT_HWCAP) & HWCAP_ASIMDDP` (bit 20).
+//!
+//! # How to detect at compile time
+//!
+//! Cargo config flags `-Ctarget-feature=+dotprod,+fp16` (or
+//! `-Ctarget-cpu=cortex-a76` which includes both as defaults). Inside
+//! Rust:
+//!
+//! ```ignore
+//! #[cfg(all(target_arch = "aarch64", target_feature = "dotprod"))]
+//! pub use crate::simd_neon_dotprod::dot_i8x16;
+//!
+//! #[cfg(all(target_arch = "aarch64", target_feature = "fp16"))]
+//! pub use crate::simd_neon_dotprod::F16x16;
+//! ```
+//!
+//! # What you get
+//!
+//! ## DotProd (already wired in `simd_neon.rs:190-203`)
+//!
+//! - `vdotq_s32(acc, a, b)` — 4×(4×i8·4×i8) → 4×i32 in ONE instruction.
+//! - `vdotq_u32` — unsigned variant.
+//! - `vdotq_laned_s32` — broadcast lane of `b` across all 4 dotproducts.
+//! - Already exposed: `dot_i8x16_neon`, `codebook_gather_i8_dotprod`.
+//!
+//! ## FP16 (TODO — stubs below)
+//!
+//! - `vaddq_f16`, `vsubq_f16`, `vmulq_f16`, `vfmaq_f16` — 8×fp16 per
+//!   `float16x8_t` register. 2× the throughput of fp32 for inference.
+//! - `vcvt_f32_f16` / `vcvt_f16_f32` — pack-into / unpack-from fp32.
+//!   Already implemented as scalar bit-twiddle in `simd_neon.rs:324-420`
+//!   (`f16_to_f32_scalar`, `f32_to_f16_scalar`); the SIMD versions
+//!   need the `target_feature = "fp16"` gate.
+//!
+//! # Composed wrapper shape
+//!
+//! `F16x16` = `[float16x8_t; 2]` — two 128-bit FP16 registers compose
+//! 16 lanes, matching the AVX-512 / nightly width.
+//!
+//! # Cargo config
+//!
+//! ```toml
+//! # .cargo/config-pi5.toml
+//! [build]
+//! target = "aarch64-unknown-linux-gnu"
+//! [target.aarch64-unknown-linux-gnu]
+//! rustflags = ["-Ctarget-cpu=cortex-a76", "-Ctarget-feature=+dotprod,+fp16"]
+//! ```
+//!
+//! Cross-build for Pi 5 from x86_64:
+//! `cargo --config .cargo/config-pi5.toml build`.
+//!
+//! Apple build (Mac M1+ runs this natively under `aarch64-apple-darwin`):
+//! ```toml
+//! # .cargo/config-apple.toml
+//! [build]
+//! target = "aarch64-apple-darwin"
+//! [target.aarch64-apple-darwin]
+//! rustflags = ["-Ctarget-cpu=apple-m1", "-Ctarget-feature=+dotprod,+fp16"]
+//! ```
+
+#![cfg(all(target_arch = "aarch64", feature = "std"))]
+
+// ─── F16 stubs ────────────────────────────────────────────────────────
+//
+// Target intrinsic family: `float16x8_t` arithmetic, gated on
+// `target_feature = "fp16"`. Rust stable 1.95 has `float16x8_t` in
+// `core::arch::aarch64` but the intrinsics like `vaddq_f16` are nightly
+// only (issue #112800). On stable we have three options:
+//
+//   1. Use `vfmaq_f16` etc. via `unsafe asm!` with the literal
+//      ARM64 instruction (e.g. `asm!("fmla v0.8h, v1.8h, v2.8h",
+//      in("v1") a, in("v2") b, inout("v0") acc)`). This is the AMX
+//      precedent — see `src/simd_amx.rs::amx_available()` for the
+//      pattern of byte-encoded asm on stable Rust.
+//   2. Round-trip through f32 using the existing scalar conversions
+//      in `simd_neon.rs:324-420`. Loses throughput; only useful as a
+//      correctness baseline.
+//   3. Wait for `core::arch::aarch64::vaddq_f16` to stabilize.
+//
+// Phase 3 implementation should pick (1) — it's the only path that
+// realizes the 2× throughput claim for inference. The asm-byte
+// encoding for `fmla v0.8h, v1.8h, v2.8h` is `0x0e40cc20` (verify with
+// `aarch64-linux-gnu-objdump`).
+
+/// Placeholder for the FP16 16-lane composed wrapper.
+///
+/// Real implementation: `pub struct F16x16(pub [float16x8_t; 2])`.
+/// Add methods mirroring `F32x16` (splat, from_slice, add, sub, mul,
+/// mul_add, sqrt, reduce_sum, simd_min/max/eq/lt/le/gt/ge/ne).
+///
+/// Intrinsics map:
+/// - `splat` → `vdupq_n_f16`
+/// - `from_slice` → `vld1q_f16`
+/// - `to_array` → `vst1q_f16`
+/// - `Add` → `vaddq_f16`
+/// - `Sub` → `vsubq_f16`
+/// - `Mul` → `vmulq_f16`
+/// - `mul_add` → `vfmaq_f16` (fused multiply-add)
+/// - `sqrt` → `vsqrtq_f16`
+/// - `reduce_sum` → `vaddvq_f16`
+/// - mask compare → `vcgtq_f16`, `vceqq_f16`, ... yielding `uint16x8_t`
+///
+/// Until implemented, consumers reach `crate::simd::F16x16` via
+/// `simd_avx2::F16Scaler` (scalar polyfill) or `simd_nightly::F16x16`
+/// (`core::simd::f16x16`).
+pub struct F16x16Stub;
+
+impl F16x16Stub {
+    /// Placeholder — real impl needs `target_feature = "fp16"` + the
+    /// asm-byte / intrinsic path documented above.
+    pub fn unimplemented() -> ! {
+        unimplemented!(
+            "F16x16 NEON dotprod-tier implementation TODO. See \
+             src/simd_neon_dotprod.rs module docs for the intrinsic \
+             map and stable-Rust asm-byte path."
+        )
+    }
+}
+
+// ─── DotProd: already implemented in src/simd_neon.rs ────────────────
+//
+// TODO(Phase-3): move `dot_i8x16_neon` and `codebook_gather_i8_dotprod`
+// from `src/simd_neon.rs:191-237` to this file. Then re-export from
+// `simd_neon.rs` for backwards compatibility during migration. Both
+// already carry `#[target_feature(enable = "dotprod")]` so the gating
+// is correct — the move is purely structural.