diff --git a/.cargo/config_ndarray_simd.toml b/.cargo/config_ndarray_simd.toml new file mode 100644 index 0000000000000..9dbd90ceb5760 --- /dev/null +++ b/.cargo/config_ndarray_simd.toml @@ -0,0 +1,45 @@ +# Copy this file to `.cargo/config.toml` (or merge into your existing one) +# to enable AVX-512 builds of the AdaWorldAPI/ndarray SIMD polyfill from +# this Bevy fork. +# +# ## What it does +# +# Two build profiles for the ndarray polyfill — chosen at compile time +# via `target-cpu`, never at runtime: +# +# - **Default** (`cargo build`): `target-cpu=x86-64-v3`. AVX2 baseline. +# Works on every GitHub Actions runner. `crate::simd::F32x16` picks +# the 8-lane AVX2 path; `U8x64` ditto. +# +# - **Opt-in AVX-512** (`cargo build-avx512`): `target-cpu=x86-64-v4`. +# Polyfill picks the 16-lane AVX-512 path. Required for the +# `ndarray_simd_smoke` / `ndarray_graph_plugin` examples to exercise +# `__m512` / `permute_bytes` / `pairwise_avg` (PR #112's rasterizer +# intrinsics). Only run on hardware with AVX-512F (Sapphire Rapids, +# Ice Lake-SP, Zen 4 with AVX-512, etc.). CI runners WILL SIGILL. +# +# ## Why two profiles +# +# GitHub Actions stock runners support x86-64-v3 (AVX2) but NOT +# x86-64-v4 (AVX-512). Unconditionally setting `target-cpu=x86-64-v4` +# would break CI. Project convention is: default build = CI-safe +# baseline; AVX-512 = explicit opt-in via cargo alias. +# +# ## Runtime sanity +# +# Whichever profile you build with, the ndarray smoke test prints +# `simd_caps()` at startup (CPUID-detected at runtime via the LazyLock +# singleton). The smoke test catches the mismatch between +# runtime-detected `avx512f=true` and a compile-time x86-64-v3 build +# (`PREFERRED_F32_LANES=8`) — that's the asymmetry to watch for. + +[build] +rustflags = ["-C", "target-cpu=x86-64-v3"] + +[alias] +# AVX-512 variants — for AdaWorldAPI dev boxes (Sapphire Rapids+). +# Do NOT run these binaries on a non-AVX-512 host. 
+build-avx512 = ["build", "--config", "build.rustflags=['-C','target-cpu=x86-64-v4']"] +run-avx512 = ["run", "--config", "build.rustflags=['-C','target-cpu=x86-64-v4']"] +test-avx512 = ["test", "--config", "build.rustflags=['-C','target-cpu=x86-64-v4']"] +check-avx512 = ["check", "--config", "build.rustflags=['-C','target-cpu=x86-64-v4']"] diff --git a/.github/workflows/ndarray-smoke.yml b/.github/workflows/ndarray-smoke.yml new file mode 100644 index 0000000000000..0177084eeb430 --- /dev/null +++ b/.github/workflows/ndarray-smoke.yml @@ -0,0 +1,44 @@ +name: ndarray-smoke +on: + push: + branches: ["claude/**"] + pull_request: + branches: ["claude/**", "main", "master"] + +# Minimum permission set per CodeQL "Workflow does not contain permissions" +# rule on PR #1. The job only checks out + builds; no write needs. +permissions: + contents: read + +jobs: + build: + runs-on: ubuntu-latest + steps: + # Pinned to commit SHA per zizmor unpinned-action rule on PR #1. + # v4.1.7 corresponds to commit 692973e3d937129bcbf40652eb9f2f61becf3332. + - uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 + + - name: Install Bevy system deps + run: sudo apt-get update -y && sudo apt-get install -y libwayland-dev libasound2-dev libudev-dev + + # Pinned to commit SHA per zizmor unpinned-action rule on PR #1. + # The action treats "1.95.0" as a toolchain version, but the action ref + # itself must be a commit SHA. Commit f04cf2e09f5b6448b46c0aa9893a76ee36ed64c2 + # corresponds to the stable tag. + - uses: dtolnay/rust-toolchain@f04cf2e09f5b6448b46c0aa9893a76ee36ed64c2 + with: + toolchain: "1.95.0" + + # ndarray is now a git dev-dep in Cargo.toml (codex P1 fix on PR #1), + # so the workflow no longer needs to clone ../ndarray. The + # ndarray-examples feature must be enabled because the [[example]] + # entries require it (so upstream Bevy CI doesn't try to build them + # on macOS / Windows where ndarray's AMX path doesn't compile). 
+ - name: cargo check --example ndarray_simd_smoke + run: cargo check --example ndarray_simd_smoke --features ndarray-examples + + - name: cargo check --example ndarray_graph_plugin + run: cargo check --example ndarray_graph_plugin --features ndarray-examples + + - name: cargo check --example ndarray_graph_plugin_tests + run: cargo check --example ndarray_graph_plugin_tests --features ndarray-examples diff --git a/Cargo.toml b/Cargo.toml index d60f7b1884307..65092dd499b7d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -133,6 +133,15 @@ unused_qualifications = "warn" [features] default = ["2d", "3d", "ui", "audio"] +# PROFILE: Examples that depend on AdaWorldAPI/ndarray fork (Linux x86_64 only). +# Marker feature: enabling it lets `cargo build --examples` pick up the +# ndarray_* examples. Upstream Bevy CI does NOT enable this, so its +# multi-platform build matrix doesn't try to compile these examples on +# macOS / Windows / non-x86_64 (where ndarray's AMX + Linux prctl don't +# build). Local dev / our own CI workflow enables this explicitly via +# `cargo run --example ndarray_graph_plugin --features ndarray-examples`. +ndarray-examples = [] + # PROFILE: The default 2D Bevy experience. This includes the core Bevy framework, 2D functionality, scenes and picking. 2d = ["default_app", "default_platform", "2d_bevy_render", "scene", "picking"] @@ -742,6 +751,21 @@ chacha20 = { version = "0.10.0", default-features = false, features = ["rng"] } ron = "0.12" flate2 = "1.0" serde = { version = "1", features = ["derive"] } + +# AdaWorldAPI/ndarray fork: HPC + SIMD polyfill — Linux x86_64 ONLY. +# ndarray uses AMX inline asm + a Linux-x86_64 prctl syscall for AMX tile +# permission grants, neither of which are available on macOS / Windows / +# aarch64. The CI matrix on the bevy fork runs all three OS × ISA combos, +# so without this target.cfg gate, the macOS and Windows runners try +# (and fail) to fetch + build ndarray. 
Gating the dep AND stubbing the +# example mains for non-Linux-x86_64 means upstream cargo check / build +# passes cleanly on every target. +# NOTE(review): the `[target.'cfg(...)'.dev-dependencies]` header below opens a new TOML table, so the keys that follow it (`serde_json`, `bytemuck`, ...) appear to stop being plain dev-dependencies and become Linux-x86_64-only; confirm, then move this table below the last dev-dependency. +# Git dep (not path) so `cargo metadata` works without a sibling checkout +# (codex P1 on PR #1: `path = "../ndarray"` made every cargo command on +# the workspace fail unless every dev pre-cloned the sibling). +[target.'cfg(all(target_os = "linux", target_arch = "x86_64"))'.dev-dependencies] +ndarray = { git = "https://github.com/AdaWorldAPI/ndarray.git", branch = "master", features = ["rayon"] } serde_json = "1.0.140" bytemuck = "1" # The following explicit dependencies are needed for proc macros to work inside of examples as they are part of the bevy crate itself. @@ -797,6 +821,33 @@ doc-scrape-examples = true [package.metadata.example.hello_world] hidden = true +[[example]] +name = "ndarray_simd_smoke" +path = "examples/ndarray_simd_smoke.rs" +doc-scrape-examples = false +required-features = ["ndarray-examples"] + +[package.metadata.example.ndarray_simd_smoke] +hidden = true + +[[example]] +name = "ndarray_graph_plugin" +path = "examples/ndarray_graph_plugin.rs" +doc-scrape-examples = false +required-features = ["ndarray-examples"] + +[package.metadata.example.ndarray_graph_plugin] +hidden = true + +[[example]] +name = "ndarray_graph_plugin_tests" +path = "examples/ndarray_graph_plugin_tests.rs" +doc-scrape-examples = false +required-features = ["ndarray-examples"] + +[package.metadata.example.ndarray_graph_plugin_tests] +hidden = true + # 2D Rendering [[example]] name = "bloom_2d" diff --git a/examples/README_NDARRAY_PLUGIN.md b/examples/README_NDARRAY_PLUGIN.md new file mode 100644 index 0000000000000..664c7749371dd --- /dev/null +++ b/examples/README_NDARRAY_PLUGIN.md @@ -0,0 +1,219 @@ +# ndarray Graph Plugin for Bevy + +## What this is + +`ndarray_graph_plugin` is a Bevy example that shows how to wire the +AdaWorldAPI/ndarray SIMD polyfill (`crate::simd::F32x16`, `Framebuffer`, +`compose_neo4j`, `GLOBAL_RENDERER`)
directly into a Bevy `App` as a +first-class `Plugin`. Each Bevy `Update` tick advances a 64-node / +80-edge force-directed graph through `ndarray::hpc::renderer`'s +double-buffer integrator, rasterizes the result into a 512x512 palette-indexed +`Framebuffer` using `compose_neo4j`, converts the palette indices to RGBA via a +compile-time LUT, uploads the result as a `bevy::asset::Image`, and displays it +on a `Sprite`. The SIMD path (`F32x16::mul_add`, `U8x64::pairwise_avg`) is +selected at compile time from the `target-cpu` flag and confirmed at runtime +via `simd_caps()`. + +--- + +## Build + +### Prerequisites + +**Rust toolchain** + +``` +rustup toolchain install 1.95.0 +rustup override set 1.95.0 +``` + +**System libraries** (Debian/Ubuntu) + +``` +sudo apt-get update -y +sudo apt-get install -y libwayland-dev libasound2-dev libudev-dev +``` + +**ndarray dependency (no sibling checkout needed)** + +The Bevy `Cargo.toml` depends on ndarray as a git dev-dependency (branch +`master`, Linux x86_64 only), so Cargo fetches it automatically and no +sibling checkout is required. To browse or modify ndarray, clone it with: + +``` +git clone https://github.com/AdaWorldAPI/ndarray.git ../ndarray +``` + +The build does not read that checkout unless you add a Cargo `[patch]` +override. The CI workflow has no clone step; it builds ndarray from the +git dependency. + +--- + +## Run + +### CI-safe build (x86-64-v3, AVX2 baseline) + +This is the default. It works on every GitHub Actions runner. The ndarray +polyfill picks the 8-lane AVX2 path; `PREFERRED_F32_LANES` is 8. + +``` +cargo run --example ndarray_graph_plugin --features ndarray-examples +``` + +### AVX-512 build (x86-64-v4, Sapphire Rapids / Ice Lake-SP / Zen 4+) + +The `run-avx512` alias is defined in `.cargo/config_ndarray_simd.toml`. +Copy or merge that file into `.cargo/config.toml` before using it. +This build will SIGILL on any host without AVX-512F; do not run it in CI +on stock GitHub Actions runners.
+ +``` +cargo run-avx512 --example ndarray_graph_plugin --features ndarray-examples +``` + +--- + +## What it shows + +On startup the plugin seeds its own `Renderer` with 64 nodes arranged in a +circle and 80 edges (a 64-edge ring plus 16 deterministic cross-links). Each `Update` +tick: + +1. `Renderer::tick(dt, damping)` integrates node positions via + `integrate_simd` — `F32x16::mul_add` fused multiply-add over the + position/velocity SoA buffers, one AVX-512 (or AVX2) pass per 16 + floats. + +2. `compose_neo4j(&mut fb, frame, &edges, scale, offset, node_color, edge_color)` + rasterizes the front buffer into a 512x512 `Framebuffer`: + - Edges drawn as Bresenham lines with palette index `edge_color`. + - Nodes drawn as dot sprites with palette index `node_color`. + - Pixel values are u8 palette indices (0–15 for AVX-512 tier, 0–7 + for AVX2 tier, 0–3 for NEON/scalar tier). + +3. A compile-time RGBA lookup table (`ndarray_graph_palette.rs`) maps + each palette index to a 4-byte RGBA value. The 512x512 pixel array is + expanded to a 1048576-byte RGBA buffer suitable for `bevy::asset::Image`. + +4. The `Image` is uploaded to the Bevy asset server and bound to a `Sprite` + component, which Bevy's 2D renderer displays in the window. + +The window title shows the current tick count, SIMD tier, and frame time +so the polyfill path is visible at a glance. + +--- + +## Architecture + +``` +Bevy App + └── NdarrayGraphPlugin + ├── Resource (wraps GLOBAL_RENDERER or a local instance) + │ └── ndarray::hpc::renderer::GLOBAL_RENDERER + │ ├── RenderFrame (front) ← readers here + │ └── RenderFrame (back) ← integrate_simd writes here + │ + ├── System: tick_renderer + │ calls Renderer::tick(dt, damping) + │ → F32x16::mul_add via crate::simd polyfill + │ + ├── System: rasterize_to_framebuffer + │ calls compose_neo4j(&mut fb, frame, edges, ...)
+ │ → Framebuffer { pixels: Vec<u8> } (palette indices) + │ + ├── System: palette_blit + │ expands palette indices → RGBA bytes via LUT + │ → bevy::asset::Image (Rgba8Unorm, 512×512) + │ + └── Sprite ← displays the Image in the 2D world +``` + +Data flows in one direction: `Renderer` produces a `RenderFrame`, which +`compose_neo4j` reads to fill a `Framebuffer`, which the palette LUT +converts to an `Image`, which Bevy renders. No `&mut self` during any +compute step; all mutation is via the renderer's internal `RwLock` +double-buffer and Bevy's `ResMut`. + +--- + +## Compile-time vs runtime tier + +The polyfill exposes two orthogonal tier signals that can disagree: + +| Signal | Where | Value on AVX2 build | Value on AVX-512 build | +|--------|-------|---------------------|------------------------| +| `PREFERRED_F32_LANES` | compile-time const (`crate::simd`) | `8` | `16` | +| `simd_caps().avx512f` | runtime CPUID (`LazyLock`) | `true` (if Sapphire Rapids) | `true` | + +The smoke test caught exactly this mismatch: building with +`target-cpu=x86-64-v3` (the CI default) on a Sapphire Rapids host +produces `PREFERRED_F32_LANES=8` but `simd_caps().avx512f=true`. The two +signals are not automatically reconciled. + +**What controls which path runs:** + +- `target-cpu=x86-64-v3` (the default in `.cargo/config.toml`): the + compiler emits AVX2 code; `cfg(target_feature = "avx512f")` is false + at compile time; `F32x16::mul_add` compiles to 8-lane AVX2 FMA; + `PREFERRED_F32_LANES = 8`. The runtime tier reported by `simd_caps()` + is informational only — no code path switches based on it. + +- `target-cpu=x86-64-v4` (via `cargo run-avx512` alias): the compiler + emits AVX-512 code; `cfg(target_feature = "avx512f")` is true at + compile time; `F32x16::mul_add` compiles to 16-lane `_mm512_fmadd_ps`; + `PREFERRED_F32_LANES = 16`. The runtime `simd_caps()` tier now agrees + with compile time.
+ +The plugin prints both values at startup: + +``` +[ndarray_graph_plugin] compile-time: PREFERRED_F32_LANES=8 +[ndarray_graph_plugin] runtime: avx512f=true avx2=true +``` + +A mismatch is not an error — it is expected on Sapphire Rapids with a +CI-safe x86-64-v3 binary — but it means you are leaving AVX-512 throughput +on the table. Pass `-C target-cpu=x86-64-v4` (via the `run-avx512` alias) +to close the gap. + +--- + +## Companion files + +The full plugin is split across four files generated by the round-2 CCA2A +fleet: + +| File | Agent | Contents | +|------|-------|----------| +| `bevy/examples/ndarray_graph_plugin.rs` | agent #1 plugin-core | `NdarrayGraphPlugin` struct and impl, Bevy systems (`setup_camera`, `setup_render_surface`, `seed_graph`, `tick_renderer`, `render_to_framebuffer`), `Cargo.toml` `[[example]]` entry | +| `bevy/examples/ndarray_graph_palette.rs` | agent #2 plugin-palette | Compile-time RGBA LUT (`PALETTE_LUT`), `blit_u8_palette_to_rgba` expansion function, per-index colour documentation for the 16-entry palette | +| `bevy/.github/workflows/ndarray-smoke.yml` | agent #3 plugin-ci | GitHub Actions workflow: installs system deps, sets Rust 1.95.0, runs `cargo check --features ndarray-examples` on the `ndarray_simd_smoke`, `ndarray_graph_plugin`, and `ndarray_graph_plugin_tests` examples on pushes to `claude/**` and PRs targeting `claude/**`, `main`, or `master` | +| `bevy/examples/README_NDARRAY_PLUGIN.md` | agent #4 plugin-readme | This file | + +The existing smoke test at `bevy/examples/ndarray_simd_smoke.rs` remains +the canonical end-to-end correctness check. The graph plugin builds on the +same ndarray API surface that the smoke test exercises; see the smoke test's +assertion 5 (`compose_neo4j`) and assertions 3–4 (`integrate_simd`, +`integrate_simd_par`) for the tested contracts. + +--- + +## Known limitations + +- `integrate_simd_par` (rayon) is deliberately not used in the per-frame + tick at 64 nodes. The documented crossover is 65536 floats; at 64 nodes + (192 floats) rayon overhead dominates.
Use `integrate_simd` for scenes + under ~5000 nodes and switch to `integrate_simd_par` only when profiling + confirms the crossover is reached. + +- `PaletteTier::detect()` currently proxies off `PREFERRED_F32_LANES` (a + f32 lane count) to select u8 palette depth. On an AVX2 build + (`PREFERRED_F32_LANES=8`) the framebuffer uses `Mid8` (8 colors) even + though AVX2 has 32 u8 lanes. This is a known issue in `framebuffer.rs`; + the plugin uses whichever tier `PaletteTier::detect()` returns. + +- The `GLOBAL_RENDERER` singleton is initialized once per process at 4096 + node capacity. It cannot be resized at runtime. For larger scenes, + construct a local `Renderer::with_capacity(n)` and store it as a Bevy + `Resource` instead of using `GLOBAL_RENDERER`. diff --git a/examples/ndarray_graph_palette.rs b/examples/ndarray_graph_palette.rs new file mode 100644 index 0000000000000..85d6d277bfb53 --- /dev/null +++ b/examples/ndarray_graph_palette.rs @@ -0,0 +1,156 @@ +//! Palette-index → RGBA conversion helper for the ndarray graph plugin. +//! +//! This module is a standalone library with no Bevy or ndarray dependencies. +//! It is imported by `ndarray_graph_plugin.rs` which uses the +//! `ndarray::simd::PaletteTier::Full16` tier (16-color palette). +//! +//! The [`PALETTE_LUT`] maps each of the 16 `PaletteTier::Full16` palette +//! indices to an RGBA byte quad. The palette is Neo4j/Palantir-inspired: +//! index 0 is a dark-navy background, indices 1–11 graduate through +//! deep-blue → cyan → white, and indices 12–15 are warm accent colours +//! (pale amber → warm amber → hot orange → crimson). +//! +//! # Usage from the plugin +//! ```rust,ignore +//! // In ndarray_graph_plugin.rs: +//! mod ndarray_graph_palette; +//! use ndarray_graph_palette::blit_u8_palette_to_rgba; +//! +//! let mut rgba = vec![0u8; palette_pixels.len() * 4]; +//! blit_u8_palette_to_rgba(&palette_pixels, &mut rgba); +//! ``` + +/// 16-entry RGBA look-up table for the `PaletteTier::Full16` palette.
+/// +/// Each entry is `[R, G, B, A]` with A always 255 (fully opaque). +/// +/// Palette rationale (Neo4j/Palantir graph aesthetic): +/// - Index 0 — dark navy background (#0D1B2A) +/// - Index 1 — deep navy (#1A2D45) +/// - Index 2 — cobalt blue (#1E3A5F) +/// - Index 3 — medium blue (#1B4F8A) +/// - Index 4 — royal blue (#1A6BB5) +/// - Index 5 — sky blue (#2389DA) +/// - Index 6 — steel blue (#41A9E0) +/// - Index 7 — light cyan (#6DC8E8) +/// - Index 8 — pale cyan (#9DE0EF) +/// - Index 9 — ice blue (#C2EEF7) +/// - Index 10 — near-white blue (#E0F6FD) +/// - Index 11 — pure white (#FFFFFF) +/// - Index 12 — pale amber (#FFE08A) +/// - Index 13 — warm amber (#FFC33B) +/// - Index 14 — hot orange (#FF7A00) +/// - Index 15 — crimson accent (#E8001A) +pub const PALETTE_LUT: [[u8; 4]; 16] = [ + [0x0D, 0x1B, 0x2A, 0xFF], // 0 dark navy background + [0x1A, 0x2D, 0x45, 0xFF], // 1 deep navy + [0x1E, 0x3A, 0x5F, 0xFF], // 2 cobalt blue + [0x1B, 0x4F, 0x8A, 0xFF], // 3 medium blue + [0x1A, 0x6B, 0xB5, 0xFF], // 4 royal blue + [0x23, 0x89, 0xDA, 0xFF], // 5 sky blue + [0x41, 0xA9, 0xE0, 0xFF], // 6 steel blue + [0x6D, 0xC8, 0xE8, 0xFF], // 7 light cyan + [0x9D, 0xE0, 0xEF, 0xFF], // 8 pale cyan + [0xC2, 0xEE, 0xF7, 0xFF], // 9 ice blue + [0xE0, 0xF6, 0xFD, 0xFF], // 10 near-white blue + [0xFF, 0xFF, 0xFF, 0xFF], // 11 pure white + [0xFF, 0xE0, 0x8A, 0xFF], // 12 pale amber + [0xFF, 0xC3, 0x3B, 0xFF], // 13 warm amber + [0xFF, 0x7A, 0x00, 0xFF], // 14 hot orange + [0xE8, 0x00, 0x1A, 0xFF], // 15 crimson accent +]; + +/// Expand a palette-indexed byte buffer into a 32-bit RGBA buffer. +/// +/// Each byte in `palette_pixels` is treated as a 4-bit palette index +/// (bits 3:0; the upper nibble is masked off via `& 0x0F`). The +/// corresponding [`PALETTE_LUT`] entry is copied into four consecutive +/// bytes of `rgba_out`. +/// +/// # Panics +/// Panics in every build profile (up-front `debug_assert!` in debug; out-of-bounds slice index, after a partial write, in release) if +/// `rgba_out.len() < palette_pixels.len() * 4`.
The caller is +/// responsible for pre-allocating an output buffer of the correct size. +/// +/// # Note on SIMD acceleration +/// The per-byte LUT lookup pattern (`permute_bytes`-style) is directly +/// supported by `crate::simd::U8x64::permute_bytes` in ndarray's SIMD +/// polyfill, which maps to `_mm512_permutexvar_epi8` on VBMI hardware. +/// For Round-2 scope the implementation uses a scalar `for` loop; a +/// vectorised path can be added once `permute_bytes` carries the +/// `#[target_feature(enable = "avx512vbmi")]` gate required by the +/// round-1 fleet review. +/// +/// # Examples +/// ``` +/// use ndarray_graph_palette::{blit_u8_palette_to_rgba, PALETTE_LUT}; +/// +/// let palette = [0u8, 15u8, 11u8]; +/// let mut rgba = [0u8; 12]; +/// blit_u8_palette_to_rgba(&palette, &mut rgba); +/// assert_eq!(&rgba[0..4], &PALETTE_LUT[0]); // index 0 → dark navy +/// assert_eq!(&rgba[4..8], &PALETTE_LUT[15]); // index 15 → crimson +/// assert_eq!(&rgba[8..12], &PALETTE_LUT[11]); // index 11 → white +/// ``` +#[inline] +pub fn blit_u8_palette_to_rgba(palette_pixels: &[u8], rgba_out: &mut [u8]) { + debug_assert!( + rgba_out.len() >= palette_pixels.len() * 4, + "rgba_out too short: need {} bytes, got {}", + palette_pixels.len() * 4, + rgba_out.len(), + ); + for (i, &p) in palette_pixels.iter().enumerate() { + rgba_out[i * 4..i * 4 + 4].copy_from_slice(&PALETTE_LUT[p as usize & 0x0F]); + } +} + +#[cfg(test)] +mod tests { + use super::*; + + /// Verify that a 64-byte palette buffer expands to 256 RGBA bytes, + /// and that the first and last pixels map to the expected LUT entries. + #[test] + fn palette_lut_roundtrip() { + // Build a 64-byte input: index 0 at position 0, index 15 at position 63, + // and a ramp through 0-15 in between. 
+ let mut palette_pixels = [0u8; 64]; + for (i, byte) in palette_pixels.iter_mut().enumerate() { + *byte = (i & 0x0F) as u8; + } + // position 0 → index 0, position 63 → index 15 (63 & 0x0F = 15) + + let mut rgba_out = [0u8; 256]; // 64 * 4 + blit_u8_palette_to_rgba(&palette_pixels, &mut rgba_out); + + // Output length must be exactly 256 bytes. + assert_eq!(rgba_out.len(), 256); + + // First pixel must match palette index 0 (dark navy background). + assert_eq!( + &rgba_out[0..4], + &PALETTE_LUT[0], + "pixel 0 should be index 0 (dark navy)" + ); + + // Last pixel must match palette index 15 (crimson accent). + assert_eq!( + &rgba_out[252..256], + &PALETTE_LUT[15], + "pixel 63 should be index 15 (crimson accent)" + ); + + // Spot-check: position 11 → index 11 (white). + assert_eq!( + &rgba_out[11 * 4..11 * 4 + 4], + &PALETTE_LUT[11], + "pixel 11 should be index 11 (white)" + ); + + // Alpha channel is always 255 for every entry. + for chunk in rgba_out.chunks_exact(4) { + assert_eq!(chunk[3], 0xFF, "alpha must be 255"); + } + } +} diff --git a/examples/ndarray_graph_plugin.rs b/examples/ndarray_graph_plugin.rs new file mode 100644 index 0000000000000..ad7b4ed3e1811 --- /dev/null +++ b/examples/ndarray_graph_plugin.rs @@ -0,0 +1,248 @@ +//! # NdarrayGraphPlugin — Bevy plugin for SIMD-accelerated graph rendering +//! +//! Visualises a force-directed graph using `ndarray::hpc::renderer::Renderer` +//! (double-buffered, SIMD-integrated) and `ndarray::hpc::framebuffer::Framebuffer` +//! (palette-indexed rasteriser). Each frame: +//! +//! 1. `tick_renderer` — advances physics via `Renderer::tick(dt, 0.98)`. +//! 2. `render_to_framebuffer` — rasterises via `compose_neo4j` into a +//! long-lived 512×512 `Framebuffer`, expands palette→RGBA8 via the +//! shared `ndarray_graph_palette::PALETTE_LUT`, and blits into a +//! long-lived Bevy `Image`. +//! +//! Run headless (no window required for compile checks): +//! ``` +//! cargo check --example ndarray_graph_plugin --features ndarray-examples +//! 
``` + +use std::f32::consts::TAU; + +use bevy::{ + asset::RenderAssetUsages, + prelude::*, + render::render_resource::{Extent3d, TextureDimension, TextureFormat}, +}; +use ndarray::hpc::framebuffer::{compose_neo4j, Framebuffer}; +use ndarray::hpc::renderer::{Renderer, DT_60}; + +// Share the canonical 16-entry RGBA8 palette with the smoke / tests examples. +#[path = "ndarray_graph_palette.rs"] +mod palette; +use palette::{blit_u8_palette_to_rgba, PALETTE_LUT}; + +// ── Constants ──────────────────────────────────────────────────────────────── + +/// Side length of the off-screen framebuffer in pixels. +const FB_SIZE: u32 = 512; +/// Number of seed nodes placed in the circle layout on startup. +const NODE_COUNT: usize = 64; +/// Radius of the circle layout in logical units. +const LAYOUT_RADIUS: f32 = 20.0; +/// Node renderer capacity (must be ≥ NODE_COUNT, padded to SIMD lanes). +const RENDERER_CAPACITY: usize = 1024; + +/// Palette index used for node dot sprites. +const NODE_COLOR: u8 = 15; +/// Palette index used for Bresenham edge lines. +const EDGE_COLOR: u8 = 8; +/// Scale factor: logical units → framebuffer pixels. +const SCALE: f32 = 8.0; +/// Offset that maps the graph origin to the centre of the 512×512 framebuffer. +const OFFSET: (f32, f32) = (256.0, 256.0); +/// Physics damping applied each tick (≈ 2 % velocity bleed per frame at 60 Hz). +const DAMPING: f32 = 0.98; + +// ── Resources ──────────────────────────────────────────────────────────────── + +/// Bevy `Resource` wrapping the double-buffered SIMD renderer. +/// +/// Heap-allocated via `Box` so the `RwLock`-guarded frames don't move. +#[derive(Resource)] +pub struct GraphRenderer { + renderer: Box<Renderer>, + /// Flat edge list shared between the seeder and the rasteriser. + edges: Vec<(usize, usize)>, +} + +/// Long-lived per-frame resources so we never allocate inside `Update`.
+#[derive(Resource)] +struct RenderSurface { + /// 512×512 palette-indexed framebuffer (re-cleared each tick by `compose_neo4j`). + framebuffer: Framebuffer, + /// Handle to the Bevy `Image` asset we upload palette pixels into. + image_handle: Handle<Image>, +} + +// ── Plugin ─────────────────────────────────────────────────────────────────── + +/// Bevy plugin: SIMD-accelerated force-directed graph → Sprite display. +/// +/// # Systems +/// +/// | Schedule | System | Purpose | +/// |-----------|------------------------|--------------------------------| +/// | `Startup` | `seed_graph` | Place nodes + edges, swap once | +/// | `Update` | `tick_renderer` | Physics step (SIMD) | +/// | `Update` | `render_to_framebuffer`| Rasterise + blit to GPU Image | +pub struct NdarrayGraphPlugin; + +impl Plugin for NdarrayGraphPlugin { + fn build(&self, app: &mut App) { + app.add_systems(Startup, (setup_camera, setup_render_surface, seed_graph).chain()) + .add_systems( + Update, + (tick_renderer, render_to_framebuffer).chain(), + ); + } +} + +// ── Startup systems ─────────────────────────────────────────────────────────── + +/// Spawn a 2-D camera so the sprite is visible. +fn setup_camera(mut commands: Commands) { + commands.spawn(Camera2d); +} + +/// Allocate the long-lived `Framebuffer` and the Bevy `Image`, then spawn +/// the `Sprite` that displays it. +fn setup_render_surface( + mut commands: Commands, + mut images: ResMut<Assets<Image>>, +) { + // Allocate a 512×512 RGBA8 image filled with the palette background (index 0, dark navy). + let rgba = PALETTE_LUT[0]; + let image = Image::new_fill( + Extent3d { + width: FB_SIZE, + height: FB_SIZE, + depth_or_array_layers: 1, + }, + TextureDimension::D2, + &rgba, + TextureFormat::Rgba8Unorm, + RenderAssetUsages::MAIN_WORLD | RenderAssetUsages::RENDER_WORLD, + ); + let image_handle = images.add(image); + + // Spawn the sprite that displays the image.
+ commands.spawn(Sprite::from_image(image_handle.clone())); + + commands.insert_resource(RenderSurface { + framebuffer: Framebuffer::new(FB_SIZE as usize, FB_SIZE as usize), + image_handle, + }); +} + +/// Seed 64 nodes in a circle layout (plus 80 deterministic edges: 64 ring + 16 cross-links), write the nodes into the +/// back frame, then swap front↔back so the first tick sees live data. +fn seed_graph(mut commands: Commands) { + let renderer = Box::new(Renderer::with_capacity(RENDERER_CAPACITY)); + + // Write node positions into the back frame. + { + let mut back = renderer.write_back(); + back.len = NODE_COUNT; + for i in 0..NODE_COUNT { + let angle = TAU * (i as f32) / (NODE_COUNT as f32); + let x = LAYOUT_RADIUS * angle.cos(); + let y = LAYOUT_RADIUS * angle.sin(); + back.positions[i * 3] = x; + back.positions[i * 3 + 1] = y; + back.positions[i * 3 + 2] = 0.0; + // Small tangential velocity to kick off the simulation. + back.velocities[i * 3] = -angle.sin() * 0.5; + back.velocities[i * 3 + 1] = angle.cos() * 0.5; + // Uniform charge so all nodes repel equally. + back.charges[i] = 1.0; + } + } + // Swap so the front frame (read by `render_to_framebuffer`) is populated. + renderer.swap(); + + // Build 80 edges: 64 ring edges + 16 cross-links (capacity 96 leaves headroom). + let mut edges: Vec<(usize, usize)> = Vec::with_capacity(96); + // Ring edges (64) + for i in 0..NODE_COUNT { + edges.push((i, (i + 1) % NODE_COUNT)); + } + // Cross-links (16): every 4th node linked to the node NODE_COUNT / 2 further round the ring. + for i in 0..16 { + let a = (i * 4) % NODE_COUNT; + let b = (i * 4 + NODE_COUNT / 2) % NODE_COUNT; + if a != b { + edges.push((a, b)); + } + } + + commands.insert_resource(GraphRenderer { + renderer, + edges, + }); +} + +// ── Update systems ──────────────────────────────────────────────────────────── + +/// Advance the physics simulation by one frame. +/// +/// Calls `Renderer::tick(dt, damping)` which: integrates velocities into +/// positions via `F32x16::mul_add` (SIMD), then atomically swaps front/back.
+fn tick_renderer(graph: ResMut, time: Res