diff --git a/.cargo/config-avx512.toml b/.cargo/config-avx512.toml new file mode 100644 index 00000000..a4349ab9 --- /dev/null +++ b/.cargo/config-avx512.toml @@ -0,0 +1,16 @@ +[build] +# Explicit AVX-512 config — `x86-64-v4`. Use with: +# cargo --config .cargo/config-avx512.toml build +# cargo --config .cargo/config-avx512.toml test +# +# Compiles `target_feature = "avx512f"` on, so `src/simd.rs` selects the +# `simd_avx512` backend with native `__m512` / `__m512d` / `__m512i` +# storage. Required for the Sapphire Rapids / Granite Rapids hot paths +# (`f32_to_bf16_batch_rne`, the AVX-512BF16 BF16 lanes, the AMX tiles). +# +# Binary produced here will SIGILL on AVX2-only silicon — only use on +# hosts that report `avx512f` in `/proc/cpuinfo`. For shipping a single +# release artifact that adapts at process start, see the LazyLock runtime +# dispatch path in § 7.1 of the architecture doc instead. +[target.'cfg(target_arch = "x86_64")'] +rustflags = ["-Ctarget-cpu=x86-64-v4"] diff --git a/.cargo/config-native.toml b/.cargo/config-native.toml new file mode 100644 index 00000000..b7cca895 --- /dev/null +++ b/.cargo/config-native.toml @@ -0,0 +1,13 @@ +[build] +# Native build config — `target-cpu = "native"`. Use with: +# cargo --config .cargo/config-native.toml build +# cargo --config .cargo/config-native.toml test +# +# rustc resolves the build host's CPUID at invocation and enables every +# `target_feature` the host CPU advertises. `simd.rs` then picks the +# matching backend (typically `simd_avx512` on modern dev machines). +# +# Produces a binary tuned for the developer's exact silicon. The result +# is NOT portable: do not distribute artifacts built with this config. +[target.'cfg(target_arch = "x86_64")'] +rustflags = ["-Ctarget-cpu=native"] diff --git a/.cargo/config.toml b/.cargo/config.toml index 92467f26..ba6378ad 100644 --- a/.cargo/config.toml +++ b/.cargo/config.toml @@ -1,4 +1,26 @@ [build] -# No global target-cpu. 
Each kernel uses #[target_feature(enable = "avx512f")] -# per-function, with LazyLock runtime detection. One binary, all ISAs. -# Railway (AVX-512) and GitHub CI (AVX2) use the same binary. +# Default cargo config — x86-64-v3 (AVX2) baseline. Portable across most +# x86_64 silicon shipping since ~2013 (Haswell+). This is what GitHub CI +# runs against and what `cargo build` produces for general distribution. +# +# Why v3 and not "no target-cpu": +# `src/simd_avx2.rs` composes `F32x16` as two `__m256` halves (AVX +# intrinsics), and the `simd_avx2_*` op funcs use `__m256i` (AVX2). +# Without a global v3 baseline, rustc compiles to x86-64 generic (SSE2) +# and those intrinsics emit instructions the CPU cannot execute → +# SIGILL at run time, exactly the PR #170 CI failure mode. +# +# AVX-512 builds: use `--config .cargo/config-avx512.toml` (or +# `CARGO_BUILD_RUSTFLAGS='-Ctarget-cpu=x86-64-v4'`). The simd.rs dispatch +# arms key off `target_feature = "avx512f"`; under v4 they pick the +# `simd_avx512` backend (native `__m512` / `__m512d` / `__m512i`). +# +# Build-machine-tuned binaries: use `--config .cargo/config-native.toml` +# (`target-cpu = "native"`); rustc resolves the host CPUID at compile time. +# +# Runtime LazyLock dispatch (one release binary, heterogeneous deployment +# silicon) is a fifth opt-in mode — see § 7.1 of +# .claude/knowledge/simd-dispatch-architecture.md. Reserved for the +# release-binary distribution path; never the dev / CI default. +[target.'cfg(target_arch = "x86_64")'] +rustflags = ["-Ctarget-cpu=x86-64-v3"] diff --git a/src/simd.rs b/src/simd.rs index b0e3ade0..3c2ebdab 100644 --- a/src/simd.rs +++ b/src/simd.rs @@ -198,10 +198,17 @@ pub const PREFERRED_I16_LANES: usize = 16; // x86_64: re-export based on tier // ============================================================================ -// Compile-time AVX-512 dispatch via target_feature.
-// With target-cpu=x86-64-v4 (.cargo/config.toml), avx512f is enabled -// at compile time → all types use native __m512/__m512d/__m512i. -// The 256-bit types (F32x8, F64x4) also live in simd_avx512 (__m256). +// Compile-time SIMD dispatch via target_feature. The cargo config +// chosen at build (.cargo/config.toml = v3 default / config-avx512.toml +// = v4 / config-native.toml = native) sets the `target_feature` flags +// that select exactly one arm below. +// * v3 default, or x86-64 baseline (e.g. GitHub CI, whose `RUSTFLAGS` +// env overrides config.toml) → no `target_feature = "avx512f"` → +// simd_avx2 backend (F32x16 = two-half (f32x8, f32x8), int wrappers +// are scalar polyfills via the `avx2_int_type!` macro). +// * v4 (or native on AVX-512 host) → `target_feature = "avx512f"` → +// simd_avx512 backend with native __m512 / __m512d / __m512i. +// * aarch64 → simd_neon backend. +// * everything else (wasm32, riscv, etc.) → scalar fallback. // Note on the `nightly-simd` feature: it adds the `crate::simd_nightly` // module (a portable-simd backend wrapping `core::simd`) but does NOT @@ -272,6 +279,17 @@ pub use crate::simd_avx512::{f32_to_bf16_batch_rne, f32_to_bf16_scalar_rne}; #[cfg(all(target_arch = "x86_64", target_feature = "avx512bf16"))] pub use crate::simd_avx512::{BF16x16, BF16x8}; +// AVX2 baseline arm — selected by the `x86-64-v3` cargo default. The +// predicate is `not(avx512f)` rather than `avx2 + not(avx512f)`: the +// inner intrinsics in `simd_avx2.rs` use per-function `#[target_feature +// (enable = "avx,avx2,fma")]` annotations, so the OPERATIONS gate +// themselves at the symbol level even when the consumer build target +// is x86-64 baseline. The struct-field types (`__m256` / `__m256i`) +// are core::arch declarations and don't require AVX/AVX2 at the type +// level — only execution does. Keeps GitHub CI green (it runs with +// `RUSTFLAGS="-D warnings"` env, which overrides our v3 config.toml, +// landing on x86-64 baseline → the previous tighter `avx2` predicate +// left no matching arm).
#[cfg(all(target_arch = "x86_64", not(target_feature = "avx512f")))] pub use crate::simd_avx512::{f32x8, f64x4, i16x16, i8x32, F32x8, F64x4, I16x16, I8x32};