diff --git a/.cargo/config-apple-m2.toml b/.cargo/config-apple-m2.toml new file mode 100644 index 00000000..b86a8f56 --- /dev/null +++ b/.cargo/config-apple-m2.toml @@ -0,0 +1,20 @@ +[build] +# Apple M2 / M3 / M4 — ARMv8.6-A+ with BF16, dotprod, fp16, i8mm. +# Use with: +# cargo --config .cargo/config-apple-m2.toml build --target=aarch64-apple-darwin +# +# Targets the BF16 tier — see `src/simd_neon_bf16.rs` for the silicon +# table, runtime detection (`sysctl hw.optional.arm.FEAT_BF16`), the +# BFMMLA / BFDOT intrinsic family, and the asm-byte fallback path that +# stable Rust 1.95 must use until `vbfdotq_f32` stabilizes (issue +# #117222). +# +# Also works on: +# - Apple M3 (target-cpu=apple-m3) — same ARMv8.6-A baseline +# - Apple M4 — adds SVE2, can override with -Ctarget-cpu=apple-m4 +# - Snapdragon X Elite / X Plus on macOS-like targets (use cortex-x4) +# +# DOES NOT target Apple M1 — M1 is ARMv8.5-A and lacks BF16. M1 should +# use the dotprod tier (config-pi5.toml-shaped, target-cpu=apple-m1). +[target.aarch64-apple-darwin] +rustflags = ["-Ctarget-cpu=apple-m2", "-Ctarget-feature=+bf16,+dotprod,+fp16,+i8mm"] diff --git a/.cargo/config-graviton.toml b/.cargo/config-graviton.toml new file mode 100644 index 00000000..4b8a7e25 --- /dev/null +++ b/.cargo/config-graviton.toml @@ -0,0 +1,19 @@ +[build] +# AWS Graviton 3 / 3E / 4 (Neoverse V1 / V2) — ARMv8.4-A+ with BF16 +# (V1: optional, V2: mandatory) + SVE / SVE2. +# Use with: +# cargo --config .cargo/config-graviton.toml build --target=aarch64-unknown-linux-gnu +# +# Targets the BF16 tier — see `src/simd_neon_bf16.rs`. Graviton 3 (V1) +# also adds SVE 256-bit; Graviton 4 (V2) adds SVE2 + BFMMLA + i8mm. +# +# Also works on: +# - Cortex-X3 / X4 / X925 generic Linux servers +# - Ampere Altra (V1-class — same baseline) +# - NVIDIA Grace (V2 — same as Graviton 4) +# +# For ARMv9 cores with SVE2 you may want a separate config-sve2.toml +# later that adds `+sve2` and routes through a future +# `src/simd_neon_sve2.rs` (not in Phase 3 scope). +[target.aarch64-unknown-linux-gnu] +rustflags = ["-Ctarget-cpu=neoverse-v2", "-Ctarget-feature=+bf16,+dotprod,+fp16,+i8mm"] diff --git a/.cargo/config-pi5.toml b/.cargo/config-pi5.toml new file mode 100644 index 00000000..b092b315 --- /dev/null +++ b/.cargo/config-pi5.toml @@ -0,0 +1,15 @@ +[build] +# Raspberry Pi 5 (BCM2712, Cortex-A76) — ARMv8.2-A with dotprod + fp16. +# Use with: +# cargo --config .cargo/config-pi5.toml build --target=aarch64-unknown-linux-gnu +# +# Targets the dotprod/fp16 tier — see `src/simd_neon_dotprod.rs` for the +# silicon table, runtime detection, and stub map. Also works on: +# - Orange Pi 5 (Rockchip RK3588, Cortex-A76) +# - Anything reporting `Features: ... asimddp asimdhp ...` in +# /proc/cpuinfo without `bf16`. +# +# For Apple M2+ / Snapdragon X / Graviton 4, use config-apple-m2.toml +# (BF16 tier — see src/simd_neon_bf16.rs). +[target.aarch64-unknown-linux-gnu] +rustflags = ["-Ctarget-cpu=cortex-a76", "-Ctarget-feature=+dotprod,+fp16"] diff --git a/src/lib.rs b/src/lib.rs index 9dffe54a..426bdae9 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -266,6 +266,24 @@ pub mod simd_amx; #[allow(clippy::all, missing_docs, dead_code, unused_variables, unused_imports)] pub mod simd_neon; +// NEON tier scaffolds — Phase 3 of the SIMD integration plan +// (.claude/knowledge/simd-dispatch-architecture.md § 6). +// +// Each file documents the silicon, the runtime + compile-time detection +// path, and stubs out the F16 / BF16 wrappers with intrinsic maps for +// future implementation. Current state: scaffolds only — the actual +// NEON code still lives in `simd_neon.rs::aarch64_simd` and gets +// migrated tier-by-tier as the Phase 3 sprints land. +#[cfg(all(target_arch = "aarch64", feature = "std"))] +#[allow(clippy::all, missing_docs, dead_code, unused_variables, unused_imports)] +pub mod simd_neon_baseline; +#[cfg(all(target_arch = "aarch64", feature = "std"))] +#[allow(clippy::all, missing_docs, dead_code, unused_variables, unused_imports)] +pub mod simd_neon_dotprod; +#[cfg(all(target_arch = "aarch64", feature = "std"))] +#[allow(clippy::all, missing_docs, dead_code, unused_variables, unused_imports)] +pub mod simd_neon_bf16; + #[cfg(feature = "std")] #[allow(clippy::all, missing_docs, dead_code, unused_variables, unused_imports)] pub mod simd_wasm; diff --git a/src/simd_neon_baseline.rs b/src/simd_neon_baseline.rs new file mode 100644 index 00000000..cf7d2437 --- /dev/null +++ b/src/simd_neon_baseline.rs @@ -0,0 +1,81 @@ +//! NEON baseline tier — ARMv8.0-A `+neon` only. +//! +//! # Silicon +//! +//! Every aarch64 CPU since ARMv8.0-A ships NEON unconditionally. This +//! tier is the lowest common denominator — the floor every other tier +//! builds on. Concretely: +//! +//! - **Raspberry Pi 3** (BCM2837, Cortex-A53) — ARMv8.0-A +//! - **Raspberry Pi 4** (BCM2711, Cortex-A72) — ARMv8.0-A +//! - **Pi CM3 / CM4 (4 GB)** — same A72 silicon +//! - Anything reporting `Features: ... asimd ...` in `/proc/cpuinfo` +//! without `asimddp`, `asimdfhm`, `asimdhp`, or `bf16`. +//! +//! # What you get +//! +//! Native 128-bit lanes: `float32x4_t`, `float64x2_t`, `int8x16_t`, +//! `uint8x16_t`, `int16x8_t`, `uint16x8_t`, `int32x4_t`, `int64x2_t`, +//! `uint32x4_t`, `uint64x2_t`. Standard NEON arithmetic — `vaddq_*`, +//! `vsubq_*`, `vmulq_*`, `vfmaq_*`, `vminq_*`, `vmaxq_*`, gather / +//! scatter via `vld1q_*` / `vst1q_*`, lane select, reduce. +//! +//! # What you do NOT get +//! +//! - **dotprod** (SDOT/UDOT) → see `simd_neon_dotprod.rs` +//! - **fp16 arithmetic** (`vfmlaq_f16`, `vaddq_f16`) → see +//! `simd_neon_dotprod.rs` +//! - **bf16** (`vbfdotq_f32`, `vbfmlalbq_f32`) → see `simd_neon_bf16.rs` +//! - **SVE2** (variable-length vectors) → not in any current tier file. +//! +//! # 512-bit composed wrappers +//! +//! `crate::simd::F32x16` / `U8x64` etc. on aarch64 compose 4× 128-bit +//! NEON registers into one logical wrapper, e.g. +//! `pub struct F32x16(pub [float32x4_t; 4])`. The four loads/stores +//! pipeline well on dual-issue cores (A72, A76, M-series). +//! +//! # Cargo config +//! +//! No special flags needed. `aarch64-unknown-linux-gnu` / `aarch64- +//! apple-darwin` already enable NEON. Pi 3/4 cross-builds: +//! `cargo build --target=aarch64-unknown-linux-gnu` from any host. +//! +//! # Status +//! +//! Scaffold only — placeholder for Phase 3 implementation. The actual +//! 128-bit native wrappers (I8x16, I16x8, U8x16, U16x8, U32x4, U64x2, +//! I32x4, I64x2) currently live in `src/simd_neon.rs::aarch64_simd`. +//! That code moves here once the tier split lands. +//! +//! Composed 512-bit wrappers (`F32x16` = `[float32x4_t; 4]` etc.) for +//! the 8 missing int types (U8x64, I8x64, I16x32, I32x16, I64x8, +//! U16x32, U32x16, U64x8) are TODO — currently dispatched to +//! `simd_scalar.rs` via the `scalar::*` fallback at `simd.rs:1593-95`. + +#![cfg(all(target_arch = "aarch64", feature = "std"))] + +// TODO(Phase-3): move the existing `pub mod aarch64_simd` block from +// `src/simd_neon.rs` (lines 463-1126 of master @ 3c20392f) into this +// file. Then re-export from `simd_neon.rs` for backwards compatibility +// during the migration window. Same pattern as Phase 4 used to extract +// `simd_scalar.rs` via `#[path]` declaration. + +// TODO(Phase-3): add the 8 missing 512-bit composed wrappers as +// `[neon_native; 4]`. Apply the `avx2_int_type!`-equivalent macro +// pattern to generate them mechanically — name it `neon_int_type!` and +// keep the API surface identical to the AVX-512 / AVX2 / nightly arms. +// +// neon_int_type!(U8x64, u8, 64, uint8x16_t, vaddq_u8, vsubq_u8); +// neon_int_type!(I8x64, i8, 64, int8x16_t, vaddq_s8, vsubq_s8); +// neon_int_type!(U16x32, u16, 32, uint16x8_t, vaddq_u16, vsubq_u16); +// neon_int_type!(I16x32, i16, 32, int16x8_t, vaddq_s16, vsubq_s16); +// neon_int_type!(U32x16, u32, 16, uint32x4_t, vaddq_u32, vsubq_u32); +// neon_int_type!(I32x16, i32, 16, int32x4_t, vaddq_s32, vsubq_s32); +// neon_int_type!(U64x8, u64, 8, uint64x2_t, vaddq_u64, vsubq_u64); +// neon_int_type!(I64x8, i64, 8, int64x2_t, vaddq_s64, vsubq_s64); + +// TODO(Phase-3): copy the existing F32x16 / F64x8 paired-load impls +// from `src/simd_neon.rs::aarch64_simd::{F32x16, F64x8, F32Mask16, +// F64Mask8}` here. They already use the composed `[float32x4_t; 4]` / +// `[float64x2_t; 4]` layout this tier expects. diff --git a/src/simd_neon_bf16.rs b/src/simd_neon_bf16.rs new file mode 100644 index 00000000..e046b2d3 --- /dev/null +++ b/src/simd_neon_bf16.rs @@ -0,0 +1,204 @@ +//! NEON + BF16 tier — ARMv8.6-A `+bf16` (or ARMv8.2-A + optional `+bf16`). +//! +//! Builds on `simd_neon_dotprod.rs`. Adds the BF16 instruction family: +//! BFDOT, BFMMLA, BFMLALB, BFMLALT, BFCVT. These are the bf16 cousins +//! of dotprod — same 4× int8 throughput shape, but for the half-the- +//! width bfloat16 type that LLM inference standardized on. +//! +//! # Silicon +//! +//! - **Apple M2 / M3 / M4** (Avalanche/Blizzard, Everest/Sawtooth, +//! Tupai/Donan) — ARMv8.6-A+. BF16 always on. `sysctl +//! hw.optional.arm.FEAT_BF16` returns 1. M1 does NOT have BF16 — it's +//! ARMv8.5-A. +//! - **Snapdragon X Elite / X Plus** (Cortex-X4/X3 cores, Oryon +//! prime) — ARMv8.7-A. BF16 always on. +//! - **Cortex-A510 / A520 / A710 / A720 / X2 / X3 / X4 / X925** — +//! ARMv9.0-A+. BF16 always on. +//! - **NVIDIA Grace** (Neoverse V2) — ARMv9-A. BF16 on. +//! - **AWS Graviton 3 / 3E / 4** (Neoverse V1/V2) — V1 added BF16 as +//! optional ARMv8.4-A extension; V2 makes it mandatory. +//! - **Ampere One (M-series)** — ARMv8.6-A+. BF16 on. +//! +//! # NOT in this tier +//! +//! - Apple M1 (ARMv8.5-A, no BF16) — falls back to `simd_neon_dotprod.rs` +//! - Raspberry Pi 5 (Cortex-A76, ARMv8.2-A, no BF16) — `simd_neon_dotprod.rs` +//! - Any Pi 3/4 / Cortex-A53/A72 — `simd_neon_baseline.rs` +//! +//! # How to detect at runtime +//! +//! - **Linux**: `/proc/cpuinfo` Features line should show `bf16`. +//! `getauxval(AT_HWCAP2) & HWCAP2_BF16` (bit 14). +//! `std::arch::is_aarch64_feature_detected!("bf16")` — recommended. +//! - **macOS**: `sysctl hw.optional.arm.FEAT_BF16` → `1` means yes. +//! On M2+ it's always 1; on M1 it's 0. +//! - **Windows ARM64**: `IsProcessorFeaturePresent(PF_ARM_V83_BF16)` +//! (constant added in Win11 24H2 SDK). +//! +//! # How to detect at compile time +//! +//! Cargo config flags: +//! - `-Ctarget-feature=+bf16` — enables BF16 intrinsics + cfg gate. +//! - `-Ctarget-cpu=apple-m2` — implies bf16 + everything else. +//! - `-Ctarget-cpu=neoverse-v2` — Graviton 4 baseline. +//! - `-Ctarget-cpu=cortex-x4` — Snapdragon X Elite / Cortex-X4 cores. +//! +//! Inside Rust: +//! +//! ```ignore +//! #[cfg(all(target_arch = "aarch64", target_feature = "bf16"))] +//! pub use crate::simd_neon_bf16::{BF16x8, BF16x16, bfdot, bfmmla}; +//! ``` +//! +//! # What you get +//! +//! ## BF16 dot-product / matrix-multiply +//! +//! - `vbfdotq_f32(acc, a, b)` — 2×(2×bf16·2×bf16) → 2×f32, accumulated +//! into 4×f32 register. The bf16 analogue of `vdotq_s32`. +//! - `vbfmmlaq_f32(acc, a, b)` — 2×2 outer product BFMMLA. The crown +//! jewel for transformer GEMM — accumulates a full 2×2 f32 tile per +//! instruction. 8 bf16 mults + 4 f32 adds per cycle on M2. +//! - `vbfmlalbq_f32` / `vbfmlaltq_f32` — bottom / top half multiply- +//! accumulate, lane-by-lane variant of BFDOT. +//! - `vbfmlalbq_laned_f32` — broadcast one lane across all bf16 +//! multiplications. Useful for matvec. +//! +//! ## BF16 conversion +//! +//! - `vcvt_bf16_f32` / `vcvtq_low_bf16_f32` / `vcvtq_high_bf16_f32` — +//! pack 4×f32 → 4×bf16. Hardware rounding (no manual RNE needed +//! like the AVX-512BF16 `_mm512_cvtne2ps_pbh` path in +//! `simd_avx512.rs`). +//! - Scalar f32 ↔ bf16: trivial high-16-bit slice (the scalar paths in +//! `src/simd.rs:1604-1626` work everywhere, including this tier). +//! +//! # Composed wrapper shapes +//! +//! - `BF16x8` = `bfloat16x8_t` — native 128-bit register, 8 bf16 lanes. +//! Matches AVX-512BF16 `BF16x8 = __m128bh` in shape. +//! - `BF16x16` = `[bfloat16x8_t; 2]` — two 128-bit registers, 16 bf16 +//! lanes. Matches AVX-512BF16 `BF16x16 = __m256bh` in shape. +//! +//! # Cargo configs +//! +//! ```toml +//! # .cargo/config-apple-m2.toml — Apple M2/M3/M4 +//! [build] +//! target = "aarch64-apple-darwin" +//! [target.aarch64-apple-darwin] +//! rustflags = ["-Ctarget-cpu=apple-m2", "-Ctarget-feature=+bf16,+dotprod,+fp16"] +//! ``` +//! +//! ```toml +//! # .cargo/config-graviton.toml — AWS Graviton 3/4 +//! [build] +//! target = "aarch64-unknown-linux-gnu" +//! [target.aarch64-unknown-linux-gnu] +//! rustflags = ["-Ctarget-cpu=neoverse-v2", "-Ctarget-feature=+bf16"] +//! ``` +//! +//! ```toml +//! # .cargo/config-snapdragon-x.toml — Snapdragon X Elite (Win/Linux) +//! [build] +//! target = "aarch64-pc-windows-msvc" # or aarch64-unknown-linux-gnu +//! rustflags = ["-Ctarget-cpu=cortex-x4", "-Ctarget-feature=+bf16,+i8mm"] +//! ``` +//! +//! # Stable-Rust constraint +//! +//! Same as the FP16 tier: `bfloat16x8_t` exists in `core::arch::aarch64` +//! on stable, but the intrinsics (`vbfdotq_f32`, `vbfmmlaq_f32`, ...) +//! are nightly-only (issue #117222). Two paths on stable 1.95: +//! +//! 1. **asm! byte encoding** — same pattern as `src/simd_amx.rs` +//! uses for AMX. Example: +//! ```ignore +//! // BFDOT v0.4s, v1.8h, v2.8h +//! asm!(".inst 0x4e41ec00", inout("v0") acc, in("v1") a, in("v2") b); +//! // BFMMLA v0.4s, v1.8h, v2.8h +//! asm!(".inst 0x6e42ec01", inout("v0") acc, in("v1") a, in("v2") b); +//! ``` +//! Verify the encoding with `aarch64-linux-gnu-objdump --disassemble` +//! on a reference compile. +//! 2. **Round-trip through f32** — convert bf16 → f32 (scalar bit- +//! shift), use the existing `vfmaq_f32` from baseline NEON, convert +//! back. Loses the 4× throughput; only as a correctness anchor for +//! the asm path. +//! +//! Path (1) is the only one worth shipping. The asm-byte fallback IS +//! how `simd_amx.rs` ships AMX on stable Rust today — same pattern. + +#![cfg(all(target_arch = "aarch64", feature = "std"))] + +// ─── BF16 stubs ────────────────────────────────────────────────────── + +/// Placeholder for the BF16 8-lane native wrapper. +/// +/// Real implementation: `pub struct BF16x8(pub bfloat16x8_t)`. API +/// surface mirrors `simd_avx512::BF16x8`: +/// - `splat(bits: u16) -> Self` (broadcast bf16 bit pattern across 8 lanes) +/// - `from_slice(s: &[u16]) -> Self` (load 8 raw bf16 bits as u16s) +/// - `to_array(self) -> [u16; 8]` +/// - `dot_f32(self, other: Self, acc: F32x4) -> F32x4` — wraps BFDOT +/// - `cvt_to_f32_lo(self) -> F32x4`, `cvt_to_f32_hi(self) -> F32x4` +/// +/// Without `target_feature = "bf16"`, this falls back to round-trip +/// through f32 (slow). With the feature on, it uses asm-byte BFDOT. +pub struct BF16x8Stub; + +/// Placeholder for the BF16 16-lane composed wrapper. +/// +/// Real implementation: `pub struct BF16x16(pub [bfloat16x8_t; 2])`. +/// API mirror of `simd_avx512::BF16x16`. The 16-lane variant is the +/// natural width for matmul tile rows in transformer attention. +pub struct BF16x16Stub; + +impl BF16x8Stub { + pub fn unimplemented() -> ! { + unimplemented!( + "BF16x8 NEON bf16-tier implementation TODO. See \ + src/simd_neon_bf16.rs module docs for the BFDOT / BFMMLA \ + asm-byte encoding (stable Rust 1.95 can't reach the \ + nightly-only vbfdotq_f32 intrinsic). Reference: \ + src/simd_amx.rs's `.byte` pattern." + ) + } +} + +impl BF16x16Stub { + pub fn unimplemented() -> ! { + unimplemented!( + "BF16x16 NEON bf16-tier implementation TODO. Two-half \ + composed wrapper [bfloat16x8_t; 2] — see module docs." + ) + } +} + +// ─── BFMMLA: the prize intrinsic ───────────────────────────────────── +// +// BFMMLA is the most important instruction this tier unlocks. It +// computes a 2×2 outer-product matrix multiply of bf16 inputs, +// accumulating into a 2×2 f32 tile. One instruction = 8 bf16 mults + +// 4 f32 adds. On Apple M2 the throughput is ~32 GFLOP/s per core in +// bf16-matmul-bound kernels. +// +// Encoding for `BFMMLA Vd.4s, Vn.8h, Vm.8h`: 0x6e40_ec00 | (Vm << 16) +// | (Vn << 5) | Vd. Use a `bfmmla!` macro to emit the asm-byte for any +// (acc, a, b) v-register triple. +// +// TODO(Phase-3): implement `bfmmla(acc: F32x4, a: BF16x8, b: BF16x8) +// -> F32x4` as the primary export. The rest of the BF16 API builds on +// it (BFDOT is BFMMLA's diagonal, BFMLALB/T are its half-slices). + +// ─── BFDOT: same shape as DotProd, but bf16 ────────────────────────── +// +// Where `vdotq_s32(acc, a, b)` does 4×(4×i8·4×i8) → 4×i32, BFDOT does +// 2×(2×bf16·2×bf16) → 2×f32 accumulated into 4×f32. The bf16 analogue +// is HALF the lane count per output (2 vs 4) because bf16 is twice as +// wide as i8. +// +// TODO(Phase-3): implement `bfdot(acc: F32x4, a: BF16x8, b: BF16x8) +// -> F32x4`. Asm-byte for `BFDOT Vd.4s, Vn.8h, Vm.8h`: +// 0x4e40_ec00 | (Vm << 16) | (Vn << 5) | Vd diff --git a/src/simd_neon_dotprod.rs b/src/simd_neon_dotprod.rs new file mode 100644 index 00000000..d3bc3fcd --- /dev/null +++ b/src/simd_neon_dotprod.rs @@ -0,0 +1,156 @@ +//! NEON + dotprod + FP16 tier — ARMv8.2-A `+dotprod,+fp16`. +//! +//! Builds on `simd_neon_baseline.rs`. This tier adds two ISA features +//! that landed together in ARMv8.2-A and ship as a pair on every +//! consumer aarch64 chip from ~2019 onward. +//! +//! # Silicon +//! +//! - **Raspberry Pi 5** (BCM2712, Cortex-A76) — ARMv8.2-A. The first +//! Pi with dotprod + fp16. 4× int8 dot-product throughput vs Pi 4. +//! - **Cortex-A55r2+** — ARMv8.2-A revision 2 silicon. Mid-range +//! Android, low-power servers (Ampere Altra little cores). +//! - **Cortex-A75 / A76 / A77 / A78 / A78AE / X1 / X2 / X3** — all +//! ARMv8.2-A or later, all have dotprod + fp16. +//! - **Apple A11 / A12 / A13 / M1** — ARMv8.3-A or later. (Note: Apple +//! exposes `bf16` from M2 onward; see `simd_neon_bf16.rs`.) +//! - **Snapdragon 8 Gen 1+** — ARMv8.4-A+. dotprod + fp16 always on. +//! +//! # How to detect at runtime +//! +//! - **Linux**: read `/proc/cpuinfo`'s `Features:` line. dotprod is +//! reported as `asimddp`; fp16 as `asimdhp` (asimd half-precision) +//! and `fphp` (scalar fp16). Both should be present on this tier. +//! Or use `std::arch::is_aarch64_feature_detected!("dotprod")` / +//! `("fp16")` — already wired into `simd.rs::detect_tier()` at +//! line 63: `if std::arch::is_aarch64_feature_detected!("dotprod") +//! { return Tier::NeonDotProd; }`. +//! - **macOS** (Apple silicon): all M-series chips have dotprod + fp16. +//! `sysctl hw.optional.arm.FEAT_DotProd` returns 1. +//! - **Android NDK**: `getauxval(AT_HWCAP) & HWCAP_ASIMDDP` (bit 20). +//! +//! # How to detect at compile time +//! +//! Cargo config flags `-Ctarget-feature=+dotprod,+fp16` (or +//! `-Ctarget-cpu=cortex-a76` which includes both as defaults). Inside +//! Rust: +//! +//! ```ignore +//! #[cfg(all(target_arch = "aarch64", target_feature = "dotprod"))] +//! pub use crate::simd_neon_dotprod::dot_i8x16; +//! +//! #[cfg(all(target_arch = "aarch64", target_feature = "fp16"))] +//! pub use crate::simd_neon_dotprod::F16x16; +//! ``` +//! +//! # What you get +//! +//! ## DotProd (already wired in `simd_neon.rs:190-203`) +//! +//! - `vdotq_s32(acc, a, b)` — 4×(4×i8·4×i8) → 4×i32 in ONE instruction. +//! - `vdotq_u32` — unsigned variant. +//! - `vdotq_laned_s32` — broadcast lane of `b` across all 4 dotproducts. +//! - Already exposed: `dot_i8x16_neon`, `codebook_gather_i8_dotprod`. +//! +//! ## FP16 (TODO — stubs below) +//! +//! - `vaddq_f16`, `vsubq_f16`, `vmulq_f16`, `vfmaq_f16` — 8×fp16 per +//! `float16x8_t` register. 2× the throughput of fp32 for inference. +//! - `vcvt_f32_f16` / `vcvt_f16_f32` — pack-into / unpack-from fp32. +//! Already implemented as scalar bit-twiddle in `simd_neon.rs:324-420` +//! (`f16_to_f32_scalar`, `f32_to_f16_scalar`); the SIMD versions +//! need the `target_feature = "fp16"` gate. +//! +//! # Composed wrapper shape +//! +//! `F16x16` = `[float16x8_t; 2]` — two 128-bit FP16 registers compose +//! 16 lanes, matching the AVX-512 / nightly width. +//! +//! # Cargo config +//! +//! ```toml +//! # .cargo/config-pi5.toml +//! [build] +//! target = "aarch64-unknown-linux-gnu" +//! [target.aarch64-unknown-linux-gnu] +//! rustflags = ["-Ctarget-cpu=cortex-a76", "-Ctarget-feature=+dotprod,+fp16"] +//! ``` +//! +//! Cross-build for Pi 5 from x86_64: +//! `cargo --config .cargo/config-pi5.toml build`. +//! +//! Apple build (Mac M1+ runs this natively under `aarch64-apple-darwin`): +//! ```toml +//! # .cargo/config-apple.toml +//! [build] +//! target = "aarch64-apple-darwin" +//! [target.aarch64-apple-darwin] +//! rustflags = ["-Ctarget-cpu=apple-m1", "-Ctarget-feature=+dotprod,+fp16"] +//! ``` + +#![cfg(all(target_arch = "aarch64", feature = "std"))] + +// ─── F16 stubs ──────────────────────────────────────────────────────── +// +// Target intrinsic family: `float16x8_t` arithmetic, gated on +// `target_feature = "fp16"`. Rust stable 1.95 has `float16x8_t` in +// `core::arch::aarch64` but the intrinsics like `vaddq_f16` are nightly +// only (issue #112800). On stable we have three options: +// +// 1. Use `vfmaq_f16` etc. via `unsafe asm!` with the literal +// ARM64 instruction (e.g. `asm!("fmla v0.8h, v1.8h, v2.8h", +// in("v1") a, in("v2") b, inout("v0") acc)`). This is the AMX +// precedent — see `src/simd_amx.rs::amx_available()` for the +// pattern of byte-encoded asm on stable Rust. +// 2. Round-trip through f32 using the existing scalar conversions +// in `simd_neon.rs:324-420`. Loses throughput; only useful as a +// correctness baseline. +// 3. Wait for `core::arch::aarch64::vaddq_f16` to stabilize. +// +// Phase 3 implementation should pick (1) — it's the only path that +// realizes the 2× throughput claim for inference. The asm-byte +// encoding for `fmla v0.8h, v1.8h, v2.8h` is `0x0e40cc20` (verify with +// `aarch64-linux-gnu-objdump`). + +/// Placeholder for the FP16 16-lane composed wrapper. +/// +/// Real implementation: `pub struct F16x16(pub [float16x8_t; 2])`. +/// Add methods mirroring `F32x16` (splat, from_slice, add, sub, mul, +/// mul_add, sqrt, reduce_sum, simd_min/max/eq/lt/le/gt/ge/ne). +/// +/// Intrinsics map: +/// - `splat` → `vdupq_n_f16` +/// - `from_slice` → `vld1q_f16` +/// - `to_array` → `vst1q_f16` +/// - `Add` → `vaddq_f16` +/// - `Sub` → `vsubq_f16` +/// - `Mul` → `vmulq_f16` +/// - `mul_add` → `vfmaq_f16` (fused multiply-add) +/// - `sqrt` → `vsqrtq_f16` +/// - `reduce_sum` → `vaddvq_f16` +/// - mask compare → `vcgtq_f16`, `vceqq_f16`, ... yielding `uint16x8_t` +/// +/// Until implemented, consumers reach `crate::simd::F16x16` via +/// `simd_avx2::F16Scaler` (scalar polyfill) or `simd_nightly::F16x16` +/// (`core::simd::f16x16`). +pub struct F16x16Stub; + +impl F16x16Stub { + /// Placeholder — real impl needs `target_feature = "fp16"` + the + /// asm-byte / intrinsic path documented above. + pub fn unimplemented() -> ! { + unimplemented!( + "F16x16 NEON dotprod-tier implementation TODO. See \ + src/simd_neon_dotprod.rs module docs for the intrinsic \ + map and stable-Rust asm-byte path." + ) + } +} + +// ─── DotProd: already implemented in src/simd_neon.rs ──────────────── +// +// TODO(Phase-3): move `dot_i8x16_neon` and `codebook_gather_i8_dotprod` +// from `src/simd_neon.rs:191-237` to this file. Then re-export from +// `simd_neon.rs` for backwards compatibility during migration. Both +// already carry `#[target_feature(enable = "dotprod")]` so the gating +// is correct — the move is purely structural.