diff --git a/.cargo/config_ndarray_simd.toml b/.cargo/config_ndarray_simd.toml
new file mode 100644
index 0000000000000..9dbd90ceb5760
--- /dev/null
+++ b/.cargo/config_ndarray_simd.toml
@@ -0,0 +1,45 @@
+# Copy this file to `.cargo/config.toml` (or merge into your existing one)
+# to enable AVX-512 builds of the AdaWorldAPI/ndarray SIMD polyfill from
+# this Bevy fork.
+#
+# ## What it does
+#
+# Two build profiles for the ndarray polyfill — chosen at compile time
+# via `target-cpu`, never at runtime:
+#
+# - **Default** (`cargo build`): `target-cpu=x86-64-v3`. AVX2 baseline.
+#   Works on every GitHub Actions runner. `crate::simd::F32x16` picks
+#   the 8-lane AVX2 path; `U8x64` ditto.
+#
+# - **Opt-in AVX-512** (`cargo build-avx512`): `target-cpu=x86-64-v4`.
+#   Polyfill picks the 16-lane AVX-512 path. Required for the
+#   `ndarray_simd_smoke` / `ndarray_graph_plugin` examples to exercise
+#   `__m512` / `permute_bytes` / `pairwise_avg` (PR #112's rasterizer
+#   intrinsics). Only run on hardware with AVX-512F (Sapphire Rapids,
+#   Ice Lake-SP, Zen 4 with AVX-512, etc.). CI runners WILL SIGILL.
+#
+# ## Why two profiles
+#
+# GitHub Actions stock runners support x86-64-v3 (AVX2) but NOT
+# x86-64-v4 (AVX-512). Unconditionally setting `target-cpu=x86-64-v4`
+# would break CI. Project convention is: default build = CI-safe
+# baseline; AVX-512 = explicit opt-in via cargo alias.
+#
+# ## Runtime sanity
+#
+# Whichever profile you build with, the ndarray smoke test prints
+# `simd_caps()` at startup (CPUID-detected at runtime via the LazyLock
+# singleton). The smoke test catches the mismatch between
+# runtime-detected `avx512f=true` and a compile-time x86-64-v3 build
+# (`PREFERRED_F32_LANES=8`) — that's the asymmetry to watch for.
+
+[build]
+rustflags = ["-C", "target-cpu=x86-64-v3"]
+
+[alias]
+# AVX-512 variants — for AdaWorldAPI dev boxes (Sapphire Rapids+).
+# Do NOT run these binaries on a non-AVX-512 host.
+build-avx512 = ["build", "--config", "build.rustflags=['-C','target-cpu=x86-64-v4']"]
+run-avx512   = ["run",   "--config", "build.rustflags=['-C','target-cpu=x86-64-v4']"]
+test-avx512  = ["test",  "--config", "build.rustflags=['-C','target-cpu=x86-64-v4']"]
+check-avx512 = ["check", "--config", "build.rustflags=['-C','target-cpu=x86-64-v4']"]
diff --git a/.github/workflows/ndarray-smoke.yml b/.github/workflows/ndarray-smoke.yml
new file mode 100644
index 0000000000000..0177084eeb430
--- /dev/null
+++ b/.github/workflows/ndarray-smoke.yml
@@ -0,0 +1,44 @@
+name: ndarray-smoke
+on:
+  push:
+    branches: ["claude/**"]
+  pull_request:
+    branches: ["claude/**", "main", "master"]
+
+# Minimum permission set per CodeQL "Workflow does not contain permissions"
+# rule on PR #1. The job only checks out + builds; no write needs.
+permissions:
+  contents: read
+
+jobs:
+  build:
+    runs-on: ubuntu-latest
+    steps:
+      # Pinned to commit SHA per zizmor unpinned-action rule on PR #1.
+      # v4.1.7 corresponds to commit 692973e3d937129bcbf40652eb9f2f61becf3332.
+      - uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
+
+      - name: Install Bevy system deps
+        run: sudo apt-get update -y && sudo apt-get install -y libwayland-dev libasound2-dev libudev-dev
+
+      # Pinned to commit SHA per zizmor unpinned-action rule on PR #1.
+      # The action treats "1.95.0" as a toolchain version, but the action ref
+      # itself must be a commit SHA. Commit f04cf2e09f5b6448b46c0aa9893a76ee36ed64c2
+      # corresponds to the stable tag.
+      - uses: dtolnay/rust-toolchain@f04cf2e09f5b6448b46c0aa9893a76ee36ed64c2
+        with:
+          toolchain: "1.95.0"
+
+      # ndarray is now a git dev-dep in Cargo.toml (codex P1 fix on PR #1),
+      # so the workflow no longer needs to clone ../ndarray. The
+      # ndarray-examples feature must be enabled because the [[example]]
+      # entries require it (so upstream Bevy CI doesn't try to build them
+      # on macOS / Windows where ndarray's AMX path doesn't compile).
+      - name: cargo check --example ndarray_simd_smoke
+        run: cargo check --example ndarray_simd_smoke --features ndarray-examples
+
+      - name: cargo check --example ndarray_graph_plugin
+        run: cargo check --example ndarray_graph_plugin --features ndarray-examples
+
+      - name: cargo check --example ndarray_graph_plugin_tests
+        run: cargo check --example ndarray_graph_plugin_tests --features ndarray-examples
diff --git a/Cargo.toml b/Cargo.toml
index d60f7b1884307..65092dd499b7d 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -133,6 +133,15 @@ unused_qualifications = "warn"
 [features]
 default = ["2d", "3d", "ui", "audio"]
 
+# PROFILE: Examples that depend on AdaWorldAPI/ndarray fork (Linux x86_64 only).
+# Marker feature: enabling it lets `cargo build --examples` pick up the
+# ndarray_* examples. Upstream Bevy CI does NOT enable this, so its
+# multi-platform build matrix doesn't try to compile these examples on
+# macOS / Windows / non-x86_64 (where ndarray's AMX + Linux prctl don't
+# build). Local dev / our own CI workflow enables this explicitly via
+# `cargo run --example ndarray_graph_plugin --features ndarray-examples`.
+ndarray-examples = []
+
 # PROFILE: The default 2D Bevy experience. This includes the core Bevy framework, 2D functionality, scenes and picking.
 2d = ["default_app", "default_platform", "2d_bevy_render", "scene", "picking"]
 
@@ -742,6 +751,21 @@ chacha20 = { version = "0.10.0", default-features = false, features = ["rng"] }
 ron = "0.12"
 flate2 = "1.0"
 serde = { version = "1", features = ["derive"] }
+
+# AdaWorldAPI/ndarray fork: HPC + SIMD polyfill — Linux x86_64 ONLY.
+# ndarray uses AMX inline asm + a Linux-x86_64 prctl syscall for AMX tile
+# permission grants, neither of which is available on macOS / Windows /
+# aarch64. The CI matrix on the bevy fork runs all three OS × ISA combos,
+# so without this target cfg gate the macOS and Windows runners try (and
+# fail) to fetch + build ndarray. Gating the dep AND stubbing the example
+# mains for non-Linux-x86_64 keeps upstream cargo check / build passing.
+# Git dep (not path) so `cargo metadata` works without a sibling checkout
+# (codex P1 on PR #1: `path = "../ndarray"` broke every cargo command).
+# FIXME(review): this header sits mid-`[dev-dependencies]`, so the keys after
+# the `ndarray` line (serde_json, bytemuck, ...) join the target-gated table;
+# move the header + `ndarray` line below the end of `[dev-dependencies]`.
+[target.'cfg(all(target_os = "linux", target_arch = "x86_64"))'.dev-dependencies]
+ndarray = { git = "https://github.com/AdaWorldAPI/ndarray.git", branch = "master", features = ["rayon"] }
 serde_json = "1.0.140"
 bytemuck = "1"
 # The following explicit dependencies are needed for proc macros to work inside of examples as they are part of the bevy crate itself.
@@ -797,6 +821,33 @@ doc-scrape-examples = true
 [package.metadata.example.hello_world]
 hidden = true
 
+[[example]]
+name = "ndarray_simd_smoke"
+path = "examples/ndarray_simd_smoke.rs"
+doc-scrape-examples = false
+required-features = ["ndarray-examples"]
+
+[package.metadata.example.ndarray_simd_smoke]
+hidden = true
+
+[[example]]
+name = "ndarray_graph_plugin"
+path = "examples/ndarray_graph_plugin.rs"
+doc-scrape-examples = false
+required-features = ["ndarray-examples"]
+
+[package.metadata.example.ndarray_graph_plugin]
+hidden = true
+
+[[example]]
+name = "ndarray_graph_plugin_tests"
+path = "examples/ndarray_graph_plugin_tests.rs"
+doc-scrape-examples = false
+required-features = ["ndarray-examples"]
+
+[package.metadata.example.ndarray_graph_plugin_tests]
+hidden = true
+
 # 2D Rendering
 [[example]]
 name = "bloom_2d"
diff --git a/examples/README_NDARRAY_PLUGIN.md b/examples/README_NDARRAY_PLUGIN.md
new file mode 100644
index 0000000000000..664c7749371dd
--- /dev/null
+++ b/examples/README_NDARRAY_PLUGIN.md
@@ -0,0 +1,219 @@
+# ndarray Graph Plugin for Bevy
+
+## What this is
+
+`ndarray_graph_plugin` is a Bevy example that shows how to wire the
+AdaWorldAPI/ndarray SIMD polyfill (`crate::simd::F32x16`, `Framebuffer`,
+`compose_neo4j`, `GLOBAL_RENDERER`) directly into a Bevy `App` as a
+first-class `Plugin`. Each Bevy `Update` tick advances a 64-node /
+80-edge force-directed graph through `ndarray::hpc::renderer`'s
+double-buffer integrator, rasterizes the result into a 512x512 palette-indexed
+`Framebuffer` using `compose_neo4j`, converts the palette indices to RGBA via a
+compile-time LUT, uploads the result as a `bevy::asset::Image`, and displays it
+on a `Sprite`. The SIMD path (`F32x16::mul_add`, `U8x64::pairwise_avg`) is
+selected at compile time from the `target-cpu` flag and confirmed at runtime
+via `simd_caps()`.
+
+---
+
+## Build
+
+### Prerequisites
+
+**Rust toolchain**
+
+```
+rustup toolchain install 1.95.0
+rustup override set 1.95.0
+```
+
+**System libraries** (Debian/Ubuntu)
+
+```
+sudo apt-get update -y
+sudo apt-get install -y libwayland-dev libasound2-dev libudev-dev
+```
+
+**ndarray dependency**
+
+The Bevy `Cargo.toml` pulls ndarray in as a git dev-dependency
+(`https://github.com/AdaWorldAPI/ndarray.git`, branch `master`), gated to
+Linux x86_64. Cargo fetches it on the first build, so no sibling checkout
+is required. To pre-fetch it explicitly:
+
+```
+cargo fetch
+```
+
+The ndarray examples are additionally gated behind the `ndarray-examples`
+feature; pass `--features ndarray-examples` when checking or running them.
+
+---
+
+## Run
+
+### CI-safe build (x86-64-v3, AVX2 baseline)
+
+This is the default. It works on every GitHub Actions runner. The ndarray
+polyfill picks the 8-lane AVX2 path; `PREFERRED_F32_LANES` is 8.
+
+```
+cargo run --example ndarray_graph_plugin --features ndarray-examples
+```
+
+### AVX-512 build (x86-64-v4, Sapphire Rapids / Ice Lake-SP / Zen 4+)
+
+The `run-avx512` alias is defined in `.cargo/config_ndarray_simd.toml`.
+Copy or merge that file into `.cargo/config.toml` before using it.
+This build will SIGILL on any host without AVX-512F; do not run it in CI
+on stock GitHub Actions runners.
+
+```
+cargo run-avx512 --example ndarray_graph_plugin --features ndarray-examples
+```
+
+---
+
+## What it shows
+
+On startup the plugin seeds `GLOBAL_RENDERER` with 64 nodes arranged in a
+circle and 80 directed edges forming a random sparse graph. Each `Update`
+tick:
+
+1. `GLOBAL_RENDERER.tick(dt, damping)` integrates node positions via
+   `integrate_simd` — `F32x16::mul_add` fused multiply-add over the
+   position/velocity SoA buffers, one AVX-512 (or AVX2) pass per 16
+   floats.
+
+2. `compose_neo4j(&mut fb, frame, &edges, scale, offset, node_color, edge_color)`
+   rasterizes the front buffer into a 512x512 `Framebuffer`:
+   - Edges drawn as Bresenham lines with palette index `edge_color`.
+   - Nodes drawn as dot sprites with palette index `node_color`.
+   - Pixel values are u8 palette indices (0–15 for AVX-512 tier, 0–7
+     for AVX2 tier, 0–3 for NEON/scalar tier).
+
+3. A compile-time RGBA lookup table (`ndarray_graph_palette.rs`) maps
+   each palette index to a 4-byte RGBA value. The 512x512 pixel array is
+   expanded to a 1048576-byte RGBA buffer suitable for `bevy::asset::Image`.
+
+4. The `Image` is uploaded to the Bevy asset server and bound to a `Sprite`
+   component, which Bevy's 2D renderer displays in the window.
+
+The window title shows the current tick count, SIMD tier, and frame time
+so the polyfill path is visible at a glance.
+
+---
+
+## Architecture
+
+```
+Bevy App
+  └── NdarrayGraphPlugin
+        ├── Resource<Renderer>        (wraps GLOBAL_RENDERER or a local instance)
+        │     └── ndarray::hpc::renderer::GLOBAL_RENDERER
+        │           ├── RenderFrame (front)  ← readers here
+        │           └── RenderFrame (back)   ← integrate_simd writes here
+        │
+        ├── System: tick_renderer
+        │     calls Renderer::tick(dt, damping)
+        │     → F32x16::mul_add via crate::simd polyfill
+        │
+        ├── System: rasterize_to_framebuffer
+        │     calls compose_neo4j(&mut fb, frame, edges, ...)
+        │     → Framebuffer { pixels: Vec<u8> }  (palette indices)
+        │
+        ├── System: palette_blit
+        │     expands palette indices → RGBA bytes via LUT
+        │     → bevy::asset::Image (Rgba8UnormSrgb, 512×512)
+        │
+        └── Sprite  ← displays the Image in the 2D world
+```
+
+Data flows in one direction: `Renderer` produces a `RenderFrame`, which
+`compose_neo4j` reads to fill a `Framebuffer`, which the palette LUT
+converts to an `Image`, which Bevy renders. No `&mut self` during any
+compute step; all mutation is via the renderer's internal `RwLock`
+double-buffer and Bevy's `ResMut`.
+
+---
+
+## Compile-time vs runtime tier
+
+The polyfill exposes two orthogonal tier signals that can disagree:
+
+| Signal | Where | Value on AVX2 build | Value on AVX-512 build |
+|--------|-------|---------------------|------------------------|
+| `PREFERRED_F32_LANES` | compile-time const (`crate::simd`) | `8` | `16` |
+| `simd_caps().avx512f` | runtime CPUID (`LazyLock`) | `true` (if Sapphire Rapids) | `true` |
+
+The smoke test caught exactly this mismatch: building with
+`target-cpu=x86-64-v3` (the CI default) on a Sapphire Rapids host
+produces `PREFERRED_F32_LANES=8` but `simd_caps().avx512f=true`. The two
+signals are not automatically reconciled.
+
+**What controls which path runs:**
+
+- `target-cpu=x86-64-v3` (the default in `.cargo/config.toml`): the
+  compiler emits AVX2 code; `cfg(target_feature = "avx512f")` is false
+  at compile time; `F32x16::mul_add` compiles to 8-lane AVX2 FMA;
+  `PREFERRED_F32_LANES = 8`. The runtime tier reported by `simd_caps()`
+  is informational only — no code path switches based on it.
+
+- `target-cpu=x86-64-v4` (via `cargo run-avx512` alias): the compiler
+  emits AVX-512 code; `cfg(target_feature = "avx512f")` is true at
+  compile time; `F32x16::mul_add` compiles to 16-lane `_mm512_fmadd_ps`;
+  `PREFERRED_F32_LANES = 16`. The runtime `simd_caps()` tier now agrees
+  with compile time.
+
+The plugin prints both values at startup:
+
+```
+[ndarray_graph_plugin] compile-time: PREFERRED_F32_LANES=8
+[ndarray_graph_plugin] runtime:      avx512f=true avx2=true
+```
+
+A mismatch is not an error — it is expected on Sapphire Rapids with a
+CI-safe x86-64-v3 binary — but it means you are leaving AVX-512 throughput
+on the table. Pass `-C target-cpu=x86-64-v4` (via the `run-avx512` alias)
+to close the gap.
+
+---
+
+## Companion files
+
+The full plugin is split across four files generated by the round-2 CCA2A
+fleet:
+
+| File | Agent | Contents |
+|------|-------|----------|
+| `bevy/examples/ndarray_graph_plugin.rs` | agent #1 plugin-core | `NdarrayGraphPlugin` struct and impl, Bevy systems (`tick_renderer`, `rasterize_to_framebuffer`, `palette_blit`), `Cargo.toml` `[[example]]` entry |
+| `bevy/examples/ndarray_graph_palette.rs` | agent #2 plugin-palette | Compile-time RGBA LUT, `palette_to_rgba` expansion function, tier-keyed color definitions for nodes / edges / background |
+| `bevy/.github/workflows/ndarray-smoke.yml` | agent #3 plugin-ci | GitHub Actions workflow: installs system deps, sets Rust 1.95.0, runs `cargo check --features ndarray-examples` on the `ndarray_simd_smoke`, `ndarray_graph_plugin` and `ndarray_graph_plugin_tests` examples on every push to `claude/**` branches and every PR targeting `claude/**`, `main` or `master` |
+| `bevy/examples/README_NDARRAY_PLUGIN.md` | agent #4 plugin-readme | This file |
+
+The existing smoke test at `bevy/examples/ndarray_simd_smoke.rs` remains
+the canonical end-to-end correctness check. The graph plugin builds on the
+same ndarray API surface that the smoke test exercises; see the smoke test's
+assertion 5 (`compose_neo4j`) and assertions 3–4 (`integrate_simd`,
+`integrate_simd_par`) for the tested contracts.
+
+---
+
+## Known limitations
+
+- `integrate_simd_par` (rayon) is deliberately not used in the per-frame
+  tick at 64 nodes. The documented crossover is 65536 floats; at 64 nodes
+  (192 floats) rayon overhead dominates. Use `integrate_simd` for scenes
+  under ~5000 nodes and switch to `integrate_simd_par` only when profiling
+  confirms the crossover is reached.
+
+- `PaletteTier::detect()` currently proxies off `PREFERRED_F32_LANES` (a
+  f32 lane count) to select u8 palette depth. On an AVX2 build
+  (`PREFERRED_F32_LANES=8`) the framebuffer uses `Mid8` (8 colors) even
+  though AVX2 has 32 u8 lanes. This is a known issue in `framebuffer.rs`;
+  the plugin uses whichever tier `PaletteTier::detect()` returns.
+
+- The `GLOBAL_RENDERER` singleton is initialized once per process at 4096
+  node capacity. It cannot be resized at runtime. For larger scenes,
+  construct a local `Renderer::with_capacity(n)` and store it as a Bevy
+  `Resource` instead of using `GLOBAL_RENDERER`.
diff --git a/examples/ndarray_graph_palette.rs b/examples/ndarray_graph_palette.rs
new file mode 100644
index 0000000000000..85d6d277bfb53
--- /dev/null
+++ b/examples/ndarray_graph_palette.rs
@@ -0,0 +1,156 @@
+//! Palette-index → RGBA conversion helper for the ndarray graph plugin.
+//!
+//! This module is a standalone library with no Bevy or ndarray dependencies.
+//! It is imported by `ndarray_graph_plugin.rs` which uses the
+//! `ndarray::simd::PaletteTier::Full16` tier (16-color palette).
+//!
+//! The [`PALETTE_LUT`] maps each of the 16 `PaletteTier::Full16` palette
+//! indices to an RGBA byte quad.  The palette is Neo4j/Palantir-inspired:
+//! index 0 is a dark-navy background, indices 1–11 graduate through
+//! deep-blue → cyan → white, and indices 12–15 are warm accent colours
+//! (pale amber → warm amber → hot-orange → crimson-red).
+//!
+//! # Usage from the plugin
+//! ```rust,ignore
+//! // In ndarray_graph_plugin.rs:
+//! mod ndarray_graph_palette;
+//! use ndarray_graph_palette::blit_u8_palette_to_rgba;
+//!
+//! let mut rgba = vec![0u8; palette_pixels.len() * 4];
+//! blit_u8_palette_to_rgba(&palette_pixels, &mut rgba);
+//! ```
+
+/// 16-entry RGBA look-up table for the `PaletteTier::Full16` palette.
+///
+/// Each entry is `[R, G, B, A]`, A always 255 (opaque); 16 rows, so `idx & 0x0F` is in bounds.
+///
+/// Palette rationale (Neo4j/Palantir graph aesthetic):
+/// - Index  0 — dark navy background  (#0D1B2A)
+/// - Index  1 — deep navy             (#1A2D45)
+/// - Index  2 — cobalt blue           (#1E3A5F)
+/// - Index  3 — medium blue           (#1B4F8A)
+/// - Index  4 — royal blue            (#1A6BB5)
+/// - Index  5 — sky blue              (#2389DA)
+/// - Index  6 — steel blue            (#41A9E0)
+/// - Index  7 — light cyan            (#6DC8E8)
+/// - Index  8 — pale cyan             (#9DE0EF)
+/// - Index  9 — ice blue              (#C2EEF7)
+/// - Index 10 — near-white blue       (#E0F6FD)
+/// - Index 11 — pure white            (#FFFFFF)
+/// - Index 12 — pale amber            (#FFE08A)
+/// - Index 13 — warm amber            (#FFC33B)
+/// - Index 14 — hot orange            (#FF7A00)
+/// - Index 15 — crimson accent        (#E8001A)
+pub const PALETTE_LUT: [[u8; 4]; 16] = [
+    [0x0D, 0x1B, 0x2A, 0xFF], // 0  dark navy background
+    [0x1A, 0x2D, 0x45, 0xFF], // 1  deep navy
+    [0x1E, 0x3A, 0x5F, 0xFF], // 2  cobalt blue
+    [0x1B, 0x4F, 0x8A, 0xFF], // 3  medium blue
+    [0x1A, 0x6B, 0xB5, 0xFF], // 4  royal blue
+    [0x23, 0x89, 0xDA, 0xFF], // 5  sky blue
+    [0x41, 0xA9, 0xE0, 0xFF], // 6  steel blue
+    [0x6D, 0xC8, 0xE8, 0xFF], // 7  light cyan
+    [0x9D, 0xE0, 0xEF, 0xFF], // 8  pale cyan
+    [0xC2, 0xEE, 0xF7, 0xFF], // 9  ice blue
+    [0xE0, 0xF6, 0xFD, 0xFF], // 10 near-white blue
+    [0xFF, 0xFF, 0xFF, 0xFF], // 11 pure white
+    [0xFF, 0xE0, 0x8A, 0xFF], // 12 pale amber
+    [0xFF, 0xC3, 0x3B, 0xFF], // 13 warm amber
+    [0xFF, 0x7A, 0x00, 0xFF], // 14 hot orange
+    [0xE8, 0x00, 0x1A, 0xFF], // 15 crimson accent
+];
+
+/// Expand a palette-indexed byte buffer into a 32-bit RGBA buffer.
+///
+/// Each byte in `palette_pixels` is treated as a 4-bit palette index
+/// (bits 3:0; the upper nibble is masked off via `& 0x0F`).  The
+/// corresponding [`PALETTE_LUT`] entry is copied into four consecutive
+/// bytes of `rgba_out`; bytes past `palette_pixels.len() * 4` are untouched.
+///
+/// # Panics
+/// Panics in every build profile, before any byte is written, if
+/// `rgba_out.len() < palette_pixels.len() * 4`.  The caller is
+/// responsible for pre-allocating an output buffer of the correct size.
+///
+/// # Note on SIMD acceleration
+/// The per-byte LUT lookup pattern (`permute_bytes`-style) is directly
+/// supported by `crate::simd::U8x64::permute_bytes` in ndarray's SIMD
+/// polyfill, which maps to `_mm512_permutexvar_epi8` on VBMI hardware.
+/// For Round-2 scope the implementation uses a scalar `for` loop; a
+/// vectorised path can be added once `permute_bytes` carries the
+/// `#[target_feature(enable = "avx512vbmi")]` gate required by the
+/// round-1 fleet review.
+///
+/// # Examples
+/// ```
+/// use ndarray_graph_palette::{blit_u8_palette_to_rgba, PALETTE_LUT};
+///
+/// let palette = [0u8, 15u8, 11u8];
+/// let mut rgba = [0u8; 12];
+/// blit_u8_palette_to_rgba(&palette, &mut rgba);
+/// assert_eq!(&rgba[0..4],  &PALETTE_LUT[0]);   // index 0 → dark navy
+/// assert_eq!(&rgba[4..8],  &PALETTE_LUT[15]);  // index 15 → crimson
+/// assert_eq!(&rgba[8..12], &PALETTE_LUT[11]);  // index 11 → white
+/// ```
+#[inline]
+pub fn blit_u8_palette_to_rgba(palette_pixels: &[u8], rgba_out: &mut [u8]) {
+    assert!(
+        rgba_out.len() >= palette_pixels.len() * 4,
+        "rgba_out too short: need {} bytes, got {}",
+        palette_pixels.len() * 4,
+        rgba_out.len(),
+    );
+    for (texel, &p) in rgba_out.chunks_exact_mut(4).zip(palette_pixels) {
+        texel.copy_from_slice(&PALETTE_LUT[usize::from(p & 0x0F)]);
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    /// Verify that a 64-byte palette buffer expands to 256 RGBA bytes,
+    /// and that the first and last pixels map to the expected LUT entries.
+    #[test]
+    fn palette_lut_roundtrip() {
+        // Build a 64-byte input that repeats the ramp 0..=15 four times, so
+        // position 0 holds index 0 and position 63 holds index 15.
+        let mut palette_pixels = [0u8; 64];
+        for (i, byte) in palette_pixels.iter_mut().enumerate() {
+            *byte = (i & 0x0F) as u8;
+        }
+        // position 0 → index 0, position 63 → index 15 (63 & 0x0F = 15)
+
+        let mut rgba_out = [0u8; 256]; // 64 * 4
+        blit_u8_palette_to_rgba(&palette_pixels, &mut rgba_out);
+
+        // 64 palette bytes expand to 64 * 4 = 256 RGBA bytes (always true for this array).
+        assert_eq!(rgba_out.len(), 256);
+
+        // First pixel must match palette index 0 (dark navy background).
+        assert_eq!(
+            &rgba_out[0..4],
+            &PALETTE_LUT[0],
+            "pixel 0 should be index 0 (dark navy)"
+        );
+
+        // Last pixel must match palette index 15 (crimson accent).
+        assert_eq!(
+            &rgba_out[252..256],
+            &PALETTE_LUT[15],
+            "pixel 63 should be index 15 (crimson accent)"
+        );
+
+        // Spot-check: position 11 → index 11 (white).
+        assert_eq!(
+            &rgba_out[11 * 4..11 * 4 + 4],
+            &PALETTE_LUT[11],
+            "pixel 11 should be index 11 (white)"
+        );
+
+        // Alpha channel is always 255 for every entry.
+        for chunk in rgba_out.chunks_exact(4) {
+            assert_eq!(chunk[3], 0xFF, "alpha must be 255");
+        }
+    }
+}
diff --git a/examples/ndarray_graph_plugin.rs b/examples/ndarray_graph_plugin.rs
new file mode 100644
index 0000000000000..ad7b4ed3e1811
--- /dev/null
+++ b/examples/ndarray_graph_plugin.rs
@@ -0,0 +1,248 @@
+//! # NdarrayGraphPlugin — Bevy plugin for SIMD-accelerated graph rendering
+//!
+//! Visualises a force-directed graph using `ndarray::hpc::renderer::Renderer`
+//! (double-buffered, SIMD-integrated) and `ndarray::hpc::framebuffer::Framebuffer`
+//! (palette-indexed rasteriser). Each frame:
+//!
+//! 1. `tick_renderer` — advances physics via `Renderer::tick(dt, 0.98)`.
+//! 2. `render_to_framebuffer` — rasterises via `compose_neo4j` into a
+//!    long-lived 512×512 `Framebuffer`, expands palette→RGBA8 via the
+//!    shared `ndarray_graph_palette::PALETTE_LUT`, and blits into a
+//!    long-lived Bevy `Image`.
+//!
+//! Run headless (no window required for compile checks):
+//! ```
+//! cargo check --example ndarray_graph_plugin --features ndarray-examples
+//! ```
+
+use std::f32::consts::TAU;
+
+use bevy::{
+    asset::RenderAssetUsages,
+    prelude::*,
+    render::render_resource::{Extent3d, TextureDimension, TextureFormat},
+};
+use ndarray::hpc::framebuffer::{compose_neo4j, Framebuffer};
+use ndarray::hpc::renderer::{Renderer, DT_60};
+
+// Share the canonical 16-entry RGBA8 palette with the smoke / tests examples.
+#[path = "ndarray_graph_palette.rs"]
+mod palette;
+use palette::{blit_u8_palette_to_rgba, PALETTE_LUT};
+
+// ── Constants ────────────────────────────────────────────────────────────────
+
+/// Side length of the square off-screen framebuffer (and Bevy `Image`) in pixels.
+const FB_SIZE: u32 = 512;
+/// Number of seed nodes placed in the circle layout on startup.
+const NODE_COUNT: usize = 64;
+/// Radius of the circle layout in logical units.
+const LAYOUT_RADIUS: f32 = 20.0;
+/// Node renderer capacity (must be ≥ NODE_COUNT, padded to SIMD lanes).
+const RENDERER_CAPACITY: usize = 1024;
+
+/// Palette index used for node dot sprites.
+const NODE_COLOR: u8 = 15;
+/// Palette index used for Bresenham edge lines.
+const EDGE_COLOR: u8 = 8;
+/// Scale factor: logical units → framebuffer pixels.
+const SCALE: f32 = 8.0;
+/// Offset mapping the graph origin to the framebuffer centre; keep at `FB_SIZE / 2` per axis.
+const OFFSET: (f32, f32) = (256.0, 256.0);
+/// Physics damping applied each tick (≈ 2 % velocity bleed per frame at 60 Hz).
+const DAMPING: f32 = 0.98;
+
+// ── Resources ────────────────────────────────────────────────────────────────
+
+/// Bevy `Resource` wrapping the double-buffered SIMD renderer.
+///
+/// Heap-allocated via `Box` so the `RwLock`-guarded frames don't move.
+#[derive(Resource)]
+pub struct GraphRenderer {
+    renderer: Box<Renderer>,
+    /// `(from, to)` node-index pairs built by `seed_graph`, read by `render_to_framebuffer`.
+    edges: Vec<(usize, usize)>,
+}
+
+/// Render targets created once in `setup_render_surface` and reused by every `Update` tick.
+#[derive(Resource)]
+struct RenderSurface {
+    /// 512×512 palette-indexed framebuffer (re-cleared each tick by `compose_neo4j`).
+    framebuffer: Framebuffer,
+    /// Handle to the Bevy `Image` asset that receives the RGBA8-expanded pixels.
+    image_handle: Handle<Image>,
+}
+
+// ── Plugin ───────────────────────────────────────────────────────────────────
+
+/// Bevy plugin: SIMD-accelerated force-directed graph → Sprite display.
+/// Systems in each schedule are chained, so they run in the order listed.
+///
+/// # Systems
+///
+/// | Schedule  | System                  | Purpose                             |
+/// |-----------|-------------------------|-------------------------------------|
+/// | `Startup` | `setup_camera`          | Spawn the 2-D camera                |
+/// | `Startup` | `setup_render_surface`  | Allocate framebuffer, image, sprite |
+/// | `Startup` | `seed_graph`            | Place nodes + edges, swap once      |
+/// | `Update`  | `tick_renderer`         | Physics step (SIMD)                 |
+/// | `Update`  | `render_to_framebuffer` | Rasterise + blit into the `Image`   |
+pub struct NdarrayGraphPlugin;
+
+impl Plugin for NdarrayGraphPlugin {
+    fn build(&self, app: &mut App) {
+        app.add_systems(Startup, (setup_camera, setup_render_surface, seed_graph).chain())
+            .add_systems(Update, (tick_renderer, render_to_framebuffer).chain());
+    }
+}
+
+// ── Startup systems ───────────────────────────────────────────────────────────
+
+/// Spawn the `Camera2d` entity through which the graph sprite is viewed.
+fn setup_camera(mut commands: Commands) {
+    commands.spawn(Camera2d);
+}
+
+/// Allocate the long-lived `Framebuffer` and the Bevy `Image` (sRGB, since
+/// `PALETTE_LUT` holds sRGB hex colours), then spawn the `Sprite` showing it.
+fn setup_render_surface(
+    mut commands: Commands,
+    mut images: ResMut<Assets<Image>>,
+) {
+    // FB_SIZE × FB_SIZE RGBA8 image pre-filled with palette index 0 (dark navy).
+    let rgba = PALETTE_LUT[0];
+    let image = Image::new_fill(
+        Extent3d {
+            width: FB_SIZE,
+            height: FB_SIZE,
+            depth_or_array_layers: 1,
+        },
+        TextureDimension::D2,
+        &rgba,
+        TextureFormat::Rgba8UnormSrgb,
+        RenderAssetUsages::MAIN_WORLD | RenderAssetUsages::RENDER_WORLD,
+    );
+    let image_handle = images.add(image);
+
+    // Spawn the sprite that displays the image.
+    commands.spawn(Sprite::from_image(image_handle.clone()));
+
+    commands.insert_resource(RenderSurface {
+        framebuffer: Framebuffer::new(FB_SIZE as usize, FB_SIZE as usize),
+        image_handle,
+    });
+}
+
+/// Seed `NODE_COUNT` nodes on a circle plus 80 deterministic edges, write the
+/// node state into the back frame, then swap so the front frame holds it.
+fn seed_graph(mut commands: Commands) {
+    let renderer = Box::new(Renderer::with_capacity(RENDERER_CAPACITY));
+
+    // Fill the back frame; the guard is scoped so it is released before `swap()`.
+    {
+        let mut back = renderer.write_back();
+        back.len = NODE_COUNT;
+        for i in 0..NODE_COUNT {
+            let angle = TAU * (i as f32) / (NODE_COUNT as f32);
+            let x = LAYOUT_RADIUS * angle.cos();
+            let y = LAYOUT_RADIUS * angle.sin();
+            back.positions[i * 3] = x;
+            back.positions[i * 3 + 1] = y;
+            back.positions[i * 3 + 2] = 0.0;
+            // Small tangential velocity (x/y only; the z slot is not written here).
+            back.velocities[i * 3] = -angle.sin() * 0.5;
+            back.velocities[i * 3 + 1] = angle.cos() * 0.5;
+            // Every node gets the same charge value.
+            back.charges[i] = 1.0;
+        }
+    }
+    // Swap so the front frame (read by `render_to_framebuffer`) is populated.
+    renderer.swap();
+
+    // Build 80 edges: 64 ring edges + 16 cross-links (capacity 96 leaves headroom).
+    let mut edges: Vec<(usize, usize)> = Vec::with_capacity(96);
+    // Ring edges: node i → node i + 1, wrapping at NODE_COUNT.
+    for i in 0..NODE_COUNT {
+        edges.push((i, (i + 1) % NODE_COUNT));
+    }
+    // Cross-links a → a + N/2. NOTE(review): i >= 8 repeats link i - 8 with ends swapped.
+    for i in 0..16 {
+        let a = (i * 4) % NODE_COUNT;
+        let b = (i * 4 + NODE_COUNT / 2) % NODE_COUNT;
+        if a != b {
+            edges.push((a, b));
+        }
+    }
+
+    commands.insert_resource(GraphRenderer {
+        renderer,
+        edges,
+    });
+}
+
+// ── Update systems ────────────────────────────────────────────────────────────
+
+/// Advance the physics simulation by one frame via `Renderer::tick(dt, damping)`.
+///
+/// `tick` is callable through a shared reference (the renderer synchronises
+/// internally), so the system asks for `Res` rather than exclusive `ResMut`.
+fn tick_renderer(graph: Res<GraphRenderer>, time: Res<Time>) {
+    // Real frame delta, clamped: the floor avoids a zero step, the cap a huge one after a stall.
+    let dt = time.delta_secs().clamp(0.001, DT_60 * 4.0);
+    graph.renderer.tick(dt, DAMPING);
+}
+
+/// Rasterise the current front frame into the `Framebuffer`, expand to RGBA8
+/// via the palette LUT, and upload into the Bevy `Image`.
+///
+/// Neither the `Framebuffer` nor the `Image` buffer is reallocated — only
+/// the pixel data is overwritten.
+fn render_to_framebuffer(
+    graph: Res<GraphRenderer>,
+    mut surface: ResMut<RenderSurface>,
+    mut images: ResMut<Assets<Image>>,
+) {
+    // Hold the front-frame read guard only while `compose_neo4j` rasterises from it.
+    let front = graph.renderer.read_front();
+    compose_neo4j(
+        &mut surface.framebuffer,
+        &front,
+        &graph.edges,
+        SCALE,
+        OFFSET,
+        NODE_COLOR,
+        EDGE_COLOR,
+    );
+    drop(front); // release read-lock before the blit
+
+    // Fetch the image's CPU-side buffer; skip this frame if the asset or its data is missing.
+    let Some(mut image) = images.get_mut(&surface.image_handle) else {
+        return;
+    };
+    let Some(data) = image.data.as_mut() else {
+        return;
+    };
+
+    let pixels = &surface.framebuffer.pixels;
+    debug_assert_eq!(
+        data.len(),
+        pixels.len() * 4,
+        "Image data length mismatch: expected {} bytes for {} palette pixels",
+        pixels.len() * 4,
+        pixels.len()
+    );
+
+    // Shared palette expander from `ndarray_graph_palette.rs`: each palette
+    // byte becomes one RGBA quad from `PALETTE_LUT`, written straight into the
+    // image's CPU-side buffer.
+    blit_u8_palette_to_rgba(pixels, data);
+}
+
+// ── Entry point ───────────────────────────────────────────────────────────────
+
+fn main() {
+    // Default Bevy stack first, then the graph plugin layered on top.
+    let mut app = App::new();
+    app.add_plugins((DefaultPlugins, NdarrayGraphPlugin));
+    app.run();
+}
diff --git a/examples/ndarray_graph_plugin_tests.rs b/examples/ndarray_graph_plugin_tests.rs
new file mode 100644
index 0000000000000..e21a67bc88179
--- /dev/null
+++ b/examples/ndarray_graph_plugin_tests.rs
@@ -0,0 +1,312 @@
+//! Headless integration tests for the ndarray graph plugin.
+//!
+//! # Design choice
+//!
+//! Tests are written as **both** a `fn main()` that panics on failure
+//! (CI-runnable via `cargo run --example ndarray_graph_plugin_tests`)
+//! AND a `#[cfg(test)] mod tests` block so that
+//! `cargo test --example ndarray_graph_plugin_tests` also works.
+//!
+//! `NdarrayGraphPlugin` and `GraphRenderer` are defined inline here so
+//! the file is self-contained. Agent #1 (plugin-core) should produce an
+//! `ndarray_graph_plugin.rs` whose types match this contract; at that
+//! point the inline definitions here can be replaced with an import.
+//!
+//! # Running
+//!
+//! ```sh
+//! # Panic-on-failure run (CI):
+//! cargo run --example ndarray_graph_plugin_tests
+//!
+//! # Cargo test runner (alternative):
+//! cargo test --example ndarray_graph_plugin_tests
+//! ```
+
+use bevy::prelude::*;
+use ndarray::hpc::framebuffer::{compose_neo4j, Framebuffer};
+use ndarray::hpc::renderer::{DT_60, GLOBAL_RENDERER, RenderFrame, Renderer};
+use ndarray::hpc::simd_caps::simd_caps;
+use ndarray::simd::PREFERRED_F32_LANES;
+
+// ─────────────────────────────────────────────────────────────────────────────
+// Minimal plugin definition (matches the contract agent #1 will deliver).
+//
+// `GraphRenderer` wraps the ndarray `Renderer` as a Bevy `Resource`.
+// `NdarrayGraphPlugin` inserts it and seeds a minimal graph on `Startup`.
+// ─────────────────────────────────────────────────────────────────────────────
+
+/// Bevy resource bundling the ndarray double-buffered `Renderer` with its edge list.
+#[derive(Resource)]
+pub struct GraphRenderer {
+    /// Renderer whose frames `seed_graph` fills and `tick_graph` advances.
+    pub renderer: Renderer,
+    /// Edges as `(src_node_idx, dst_node_idx)` pairs, handed to `compose_neo4j` by the tests.
+    pub edges: Vec<(usize, usize)>,
+}
+
+impl Default for GraphRenderer {
+    fn default() -> Self {
+        // Capacity 16 goes straight to `Renderer::with_capacity`; no edges yet.
+        let renderer = Renderer::with_capacity(16);
+        let edges = Vec::new();
+        Self { renderer, edges }
+    }
+}
+
+/// Plugin that registers `GraphRenderer` plus its seeding and ticking systems.
+pub struct NdarrayGraphPlugin;
+
+impl Plugin for NdarrayGraphPlugin {
+    fn build(&self, app: &mut App) {
+        app.insert_resource(GraphRenderer::default());
+        app.add_systems(Startup, seed_graph);
+        app.add_systems(Update, tick_graph);
+    }
+}
+
+/// Writes the fixed two-node test graph into `frame`.
+///
+/// Buffers are indexed as flat xyz triples; slots not listed are left untouched.
+fn seed_frame(frame: &mut RenderFrame) {
+    frame.len = 2;
+    // Node 0 at (10, 10, 0); node 1 at (50, 50, 0).
+    let positions = [(0, 10.0), (1, 10.0), (2, 0.0), (3, 50.0), (4, 50.0), (5, 0.0)];
+    for (slot, value) in positions {
+        frame.positions[slot] = value;
+    }
+    // Node 0 moves along +x, node 1 along +y, both at unit speed.
+    frame.velocities[0] = 1.0;
+    frame.velocities[4] = 1.0;
+}
+
+/// Startup system: seeds the two-node test graph and its single edge.
+///
+/// Frames 0 and 1 both receive the same seed, so the first `tick_graph`
+/// starts from these values whichever frame it reads. Panics if either
+/// frame lock is poisoned.
+fn seed_graph(mut gr: ResMut<GraphRenderer>) {
+    // Each write guard is a statement temporary, so the two frame locks are
+    // never held at the same time.
+    let frames = &gr.renderer.frames;
+    seed_frame(&mut frames[0].write().expect("frame 0 lock poisoned"));
+    seed_frame(&mut frames[1].write().expect("frame 1 lock poisoned"));
+    gr.edges.push((0, 1));
+}
+
+/// Update system: one fixed `DT_60` step, damping 0.99; `tick` takes `&self`, so `Res` suffices.
+fn tick_graph(gr: Res<GraphRenderer>) {
+    gr.renderer.tick(DT_60, 0.99);
+}
+
+// ─────────────────────────────────────────────────────────────────────────────
+// Test helpers
+// ─────────────────────────────────────────────────────────────────────────────
+
+/// Headless `App` with `MinimalPlugins` and `NdarrayGraphPlugin` added; no update has run yet.
+fn make_app() -> App {
+    let mut headless = App::new();
+    headless.add_plugins(MinimalPlugins);
+    headless.add_plugins(NdarrayGraphPlugin);
+    headless
+}
+
+/// Writes `AppExit::Success` on every run so an `App::run()` loop stops after its first update.
+fn exit_on_first_update(mut exit_writer: MessageWriter<AppExit>) {
+    exit_writer.write(AppExit::Success);
+}
+
+// ─────────────────────────────────────────────────────────────────────────────
+// Test bodies (callable from both fn main and #[test]).
+// ─────────────────────────────────────────────────────────────────────────────
+
+/// Test 1: `build()` alone inserts `GraphRenderer`; `GLOBAL_RENDERER` is still at tick 0.
+fn test_plugin_initializes_global_renderer_resource() {
+    // No `app.update()` on purpose: `build()` already called `insert_resource`.
+    let app = make_app();
+    let resource_present = app.world().contains_resource::<GraphRenderer>();
+    assert!(
+        resource_present,
+        "GraphRenderer resource not found after add_plugins(NdarrayGraphPlugin)"
+    );
+
+    // The process-global renderer starts at tick zero.
+    assert_eq!(
+        GLOBAL_RENDERER.tick_count(),
+        0,
+        "GLOBAL_RENDERER.tick_count() should be 0 before any tick"
+    );
+    println!("[test 1] PASS: GraphRenderer resource present, GLOBAL_RENDERER.tick_count()=0");
+}
+
+/// Test 2: after one `App::update()`, the front frame has nodes and the edge list is non-empty.
+fn test_startup_seeds_nodes_and_edges() {
+    let mut app = make_app();
+    app.update(); // Runs Startup + Update systems once.
+
+    let gr = app
+        .world()
+        .get_resource::<GraphRenderer>()
+        .expect("GraphRenderer missing after update");
+
+    let front = gr.renderer.read_front();
+    assert!(
+        front.len > 0,
+        "front frame len should be > 0 after seed_graph, got {}",
+        front.len
+    );
+    assert!(
+        !gr.edges.is_empty(),
+        "edges list should be non-empty after seed_graph, got {}",
+        gr.edges.len()
+    );
+    println!(
+        "[test 2] PASS: front.len={} edges.len={}",
+        front.len,
+        gr.edges.len()
+    );
+}
+
+/// Test 3: one `tick_graph` step moves node 0's x-position by `velocity * dt`.
+///
+/// NOTE(review): the expected value uses the undamped seed velocity, so this
+/// presumes `tick` integrates positions before damping applies — confirm.
+fn test_tick_advances_position_via_integrate_simd() {
+    let mut app = make_app();
+    app.update(); // First update: Startup (`seed_graph`), then Update (`tick_graph`).
+
+    let gr = app
+        .world()
+        .get_resource::<GraphRenderer>()
+        .expect("GraphRenderer missing");
+
+    // `seed_graph` wrote the same seed into both frames (it does not swap),
+    // then `tick_graph` ran once with DT_60 and damping 0.99.
+    // Node 0 x-position started at 10.0 with x-velocity 1.0.
+    // Expected: position_x = vel_x * DT_60 + pos_x = 1.0 * DT_60 + 10.0.
+    let front = gr.renderer.read_front();
+    let pos_x = front.positions[0];
+    let expected = 1.0_f32.mul_add(DT_60, 10.0);
+
+    assert!(
+        (pos_x - expected).abs() < 1e-5,
+        "Node 0 x-position after one tick: got {pos_x:.6}, expected {expected:.6} (vel*dt+pos)"
+    );
+    println!(
+        "[test 3] PASS: position[0] advanced from 10.0 to {pos_x:.6} (expected {expected:.6})"
+    );
+}
+
+/// Test 4: rasterising the front frame after one update leaves visible pixels.
+///
+/// Composes the front frame and edge list into a fresh 128x128 framebuffer
+/// and requires at least `NONZERO_THRESHOLD` non-zero palette bytes.
+fn test_compose_neo4j_emits_pixels_to_framebuffer() {
+    const NONZERO_THRESHOLD: usize = 50;
+
+    let mut app = make_app();
+    app.update();
+
+    let graph = app
+        .world()
+        .get_resource::<GraphRenderer>()
+        .expect("GraphRenderer missing");
+
+    let mut canvas = Framebuffer::new(128, 128);
+    let frame = graph.renderer.read_front();
+    compose_neo4j(&mut canvas, &frame, &graph.edges, 1.0, (0.0, 0.0), 5, 2);
+
+    let nonzero_count = canvas.pixels.iter().filter(|px| **px != 0).count();
+    assert!(
+        nonzero_count >= NONZERO_THRESHOLD,
+        "compose_neo4j wrote only {nonzero_count} non-zero pixels (threshold={NONZERO_THRESHOLD})"
+    );
+    println!(
+        "[test 4] PASS: compose_neo4j emitted {nonzero_count} non-zero pixels (threshold={NONZERO_THRESHOLD})"
+    );
+}
+
+/// Test 5: print the runtime SIMD caps and sanity-check the x86_64 tier.
+///
+/// On x86_64 the assertion requires at least one of `avx512f` / `avx2`
+/// (not exactly one: the check is a plain `||`). Other targets only print.
+/// `PREFERRED_F32_LANES` is printed for CI log visibility, not asserted on.
+fn test_polyfill_runtime_tier_matches_expectation() {
+    let caps = simd_caps();
+    println!(
+        "[test 5] simd_caps: avx512f={} avx2={} fma={} neon={}  \
+         PREFERRED_F32_LANES={}",
+        caps.avx512f, caps.avx2, caps.fma, caps.neon, PREFERRED_F32_LANES
+    );
+
+    #[cfg(target_arch = "x86_64")]
+    {
+        assert!(
+            caps.avx512f || caps.avx2,
+            "Expected avx512f or avx2 to be true on x86_64, got caps={caps:?}"
+        );
+        println!("[test 5] PASS: x86_64 has avx512f={} or avx2={}", caps.avx512f, caps.avx2);
+    }
+
+    #[cfg(not(target_arch = "x86_64"))]
+    {
+        // On non-x86 (aarch64, WASM, etc.) just print — no mandatory assertion.
+        println!("[test 5] PASS: non-x86_64 platform, caps printed above");
+    }
+}
+
+// ─────────────────────────────────────────────────────────────────────────────
+// fn main — CI entry point (panic on assertion failure → non-zero exit)
+// ─────────────────────────────────────────────────────────────────────────────
+
+fn main() {
+    println!("=== ndarray_graph_plugin_tests (headless) ===");
+
+    test_plugin_initializes_global_renderer_resource();
+    test_startup_seeds_nodes_and_edges();
+    test_tick_advances_position_via_integrate_simd();
+    test_compose_neo4j_emits_pixels_to_framebuffer();
+    test_polyfill_runtime_tier_matches_expectation();
+
+    println!("=== ALL TESTS PASSED ===");
+
+    // Final headless run: the app loop must start and stop after one update.
+    let mut smoke_app = App::new();
+    smoke_app.add_plugins(MinimalPlugins);
+    smoke_app.add_plugins(NdarrayGraphPlugin);
+    smoke_app.add_systems(Update, exit_on_first_update);
+    smoke_app.run();
+}
+
+// ─────────────────────────────────────────────────────────────────────────────
+// #[cfg(test)] block — for `cargo test --example ndarray_graph_plugin_tests`
+// ─────────────────────────────────────────────────────────────────────────────
+
+#[cfg(test)]
+mod tests {
+    use super::*; // each wrapper below delegates to a shared test body in the parent module
+
+    #[test]
+    fn plugin_initializes_global_renderer_resource() {
+        test_plugin_initializes_global_renderer_resource();
+    }
+
+    #[test]
+    fn startup_seeds_nodes_and_edges() {
+        test_startup_seeds_nodes_and_edges();
+    }
+
+    #[test]
+    fn tick_advances_position_via_integrate_simd() {
+        test_tick_advances_position_via_integrate_simd();
+    }
+
+    #[test]
+    fn compose_neo4j_emits_pixels_to_framebuffer() {
+        test_compose_neo4j_emits_pixels_to_framebuffer();
+    }
+
+    #[test]
+    fn polyfill_runtime_tier_matches_expectation() {
+        test_polyfill_runtime_tier_matches_expectation();
+    }
+}
diff --git a/examples/ndarray_simd_smoke.rs b/examples/ndarray_simd_smoke.rs
new file mode 100644
index 0000000000000..3fcca45fe306b
--- /dev/null
+++ b/examples/ndarray_simd_smoke.rs
@@ -0,0 +1,144 @@
+//! Smoke test: ndarray `crate::simd` polyfill + rayon parallel integrate
+//! reachable from a Bevy downstream crate.
+//!
+//! Run: `cargo run --release --example ndarray_simd_smoke`
+//!
+//! Asserts:
+//!   1. `simd_caps()` LazyLock initializes and reports the live CPU tier.
+//!   2. `F32x16::mul_add` matches scalar `f32::mul_add` to within 1e-6.
+//!   3. `integrate_simd` advances positions by `v * dt` (to within 1e-6).
+//!   4. `integrate_simd_par` (rayon × SIMD) matches sequential bit-exactly.
+//!   5. `compose_neo4j` emits both node and edge pixels.
+//!
+//! What this *proves* end-to-end:
+//!   - `target-cpu` propagates from Bevy → ndarray (the `cfg(target_feature
+//!     = "avx512f")` in ndarray/src/simd.rs:206-239 picks the right path).
+//!   - `LazyLock` runtime detect agrees with compile-time cfg.
+//!   - The Pumpkin-derived palette/rasterizer is reachable as a library.
+//!   - rayon `par_chunks_mut` composes with `F32x16::mul_add` without
+//!     divergence (FMA is deterministic at one dispatch tier).
+
+use bevy::prelude::*;
+use ndarray::hpc::framebuffer::{compose_neo4j, Framebuffer, PaletteTier};
+use ndarray::hpc::renderer::{
+    cached_splat, integrate_simd, integrate_simd_par, RenderFrame, BLOCK_FLOATS, DT_60,
+};
+use ndarray::hpc::simd_caps::simd_caps;
+use ndarray::simd::{F32x16, PREFERRED_F32_LANES};
+
+fn main() {
+    // 1. Tier print — proves LazyLock<SimdCaps> initialized.
+    let caps = simd_caps();
+    println!(
+        "[smoke] caps: avx512f={} avx512vnni={} avx2={} fma={} neon={}",
+        caps.avx512f, caps.avx512vnni, caps.avx2, caps.fma, caps.neon
+    );
+    println!(
+        "[smoke] compile-time: PREFERRED_F32_LANES={} PaletteTier::detect()={:?}",
+        PREFERRED_F32_LANES,
+        PaletteTier::detect()
+    );
+
+    // 2. F32x16 FMA check against scalar `f32::mul_add`, every lane within 1e-6.
+    let dt = DT_60;
+    let dt_v = cached_splat(dt);
+    let v = F32x16::splat(0.5);
+    let p = F32x16::splat(1.0);
+    let out = v.mul_add(dt_v, p);
+    let mut out_arr = [0.0f32; 16];
+    out.copy_to_slice(&mut out_arr);
+    let expected = 0.5_f32.mul_add(dt, 1.0);
+    for x in out_arr {
+        assert!(
+            (x - expected).abs() < 1e-6,
+            "F32x16::mul_add lane mismatch: got {}, expected {}",
+            x,
+            expected
+        );
+    }
+    println!("[smoke] F32x16::mul_add ok (expected={})", expected);
+
+    // 3. integrate_simd contract: x[i] += v[i] * dt (damping 1.0, checked on node 1's x).
+    let n_nodes = 64;
+    let mut frame = RenderFrame::with_capacity(n_nodes);
+    frame.len = n_nodes;
+    for i in 0..n_nodes {
+        frame.positions[i * 3] = i as f32;
+        frame.velocities[i * 3] = 1.0;
+    }
+    let p_before = frame.positions[3];
+    integrate_simd(&mut frame.positions, &mut frame.velocities, dt, 1.0);
+    let p_after = frame.positions[3];
+    assert!(
+        (p_after - (p_before + dt)).abs() < 1e-6,
+        "integrate_simd did not advance: {} -> {}",
+        p_before,
+        p_after
+    );
+    println!("[smoke] integrate_simd advanced by {} (expected {})", p_after - p_before, dt);
+
+    // 4. rayon × SIMD: integrate_simd_par positions must match integrate_simd bit-exactly.
+    //    Buffer is 4 × BLOCK_FLOATS so rayon actually parallelizes.
+    let n = 4 * BLOCK_FLOATS;
+    let mut p_seq = (0..n).map(|i| (i as f32) * 0.001).collect::<Vec<_>>();
+    let mut v_seq = (0..n).map(|i| (i as f32).sin() * 0.1).collect::<Vec<_>>();
+    let mut p_par = p_seq.clone();
+    let mut v_par = v_seq.clone();
+
+    let t0 = std::time::Instant::now();
+    integrate_simd(&mut p_seq, &mut v_seq, dt, 0.98);
+    let seq = t0.elapsed();
+
+    let t0 = std::time::Instant::now();
+    integrate_simd_par(&mut p_par, &mut v_par, dt, 0.98);
+    let par = t0.elapsed();
+
+    for (i, (sequential, parallel)) in p_seq.iter().zip(&p_par).enumerate() {
+        assert_eq!(
+            sequential.to_bits(),
+            parallel.to_bits(),
+            "rayon vs sequential diverged at i={}",
+            i
+        );
+    }
+    println!(
+        "[smoke] integrate_simd_par bit-exact vs sequential ({} floats: seq={:?} par={:?})",
+        n, seq, par
+    );
+
+    // 5. Rasterize: compose_neo4j on a tiny frame with one edge.
+    let mut frame2 = RenderFrame::with_capacity(2);
+    frame2.len = 2;
+    frame2.positions[0] = 10.0;
+    frame2.positions[1] = 10.0;
+    frame2.positions[3] = 50.0;
+    frame2.positions[4] = 50.0;
+    let edges = vec![(0usize, 1usize)];
+    let mut fb = Framebuffer::new(64, 64);
+    compose_neo4j(&mut fb, &frame2, &edges, 1.0, (0.0, 0.0), 5, 2);
+    let edge_pixels = fb.pixels.iter().filter(|&&p| p == 2).count();
+    let node_pixels = fb.pixels.iter().filter(|&&p| p == 5).count();
+    assert!(
+        edge_pixels > 0 && node_pixels > 0,
+        "rasterizer empty: edge={} node={}",
+        edge_pixels,
+        node_pixels
+    );
+    println!(
+        "[smoke] compose_neo4j emitted {} node pixels + {} edge pixels",
+        node_pixels, edge_pixels
+    );
+
+    println!("[smoke] ALL OK — ndarray::simd polyfill + rayon reachable from bevy");
+
+    // Headless App spin-up — proves the example links against the full Bevy
+    // crate. MinimalPlugins runs once and exits via exit_on_first_update.
+    App::new()
+        .add_plugins(MinimalPlugins)
+        .add_systems(Update, exit_on_first_update)
+        .run();
+}
+
+fn exit_on_first_update(mut exit_writer: MessageWriter<AppExit>) {
+    exit_writer.write(AppExit::Success); // stop the `App::run()` loop
+}