Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 5 additions & 5 deletions .claude/knowledge/simd-dispatch-architecture.md
Original file line number Diff line number Diff line change
Expand Up @@ -150,19 +150,19 @@ tracked as TD-SIMD-3.)
| `U8x64` | βœ… `__m512i` | 🟠 `[u8; 64]` polyfill | ❌ | πŸ”΅ | βœ… |
| `U8x32` | βœ… `__m256i` | βœ… `__m256i` | ❌ | πŸ”΅ | βœ… |
| `U16x32` | βœ… `__m512i` | 🟠 `[u16; 32]` polyfill | ❌ | πŸ”΅ | βœ… |
| `U16x16` | ❌ | ❌ | ❌ | ❌ | ❌ |
| `U16x16` | 🟠 (via `simd_avx2`) | 🟠 `[u16; 16]` polyfill | ❌ | πŸ”΅ `core::simd::u16x16` | 🟠 |
| `U32x16` | βœ… `__m512i` | 🟠 `[u32; 16]` polyfill | ❌ | πŸ”΅ | βœ… |
| `U32x8` | ❌ | ❌ | ❌ | πŸ”΅ `core::simd::u32x8` | ❌ |
| `U32x8` | 🟠 (via `simd_avx2`) | 🟠 `[u32; 8]` polyfill | ❌ | πŸ”΅ `core::simd::u32x8` | 🟠 |
| `U64x8` | βœ… `__m512i` | 🟠 `[u64; 8]` polyfill | ❌ | πŸ”΅ | βœ… |
| `U64x4` | ❌ | ❌ | ❌ | πŸ”΅ `core::simd::u64x4` | ❌ |
| `U64x4` | 🟠 (via `simd_avx2`) | 🟠 `[u64; 4]` polyfill | ❌ | πŸ”΅ `core::simd::u64x4` | 🟠 |
| `I8x32` | βœ… `__m256i` | βœ… `__m256i` (in `simd_avx512`) | ❌ | πŸ”΅ | βœ… |
| `I8x64` | βœ… `__m512i` | 🟠 `[i8; 64]` polyfill | ❌ | πŸ”΅ | βœ… |
| `I16x16` | βœ… `__m256i` | βœ… `__m256i` (in `simd_avx512`) | ❌ | πŸ”΅ | βœ… |
| `I16x32` | βœ… `__m512i` | 🟠 `[i16; 32]` polyfill | ❌ | πŸ”΅ | βœ… |
| `I32x16` | βœ… `__m512i` | 🟠 `[i32; 16]` polyfill | ❌ | πŸ”΅ | βœ… |
| `I32x8` | ❌ | ❌ | ❌ | ❌ | ❌ |
| `I32x8` | 🟠 (via `simd_avx2`) | 🟠 `[i32; 8]` polyfill | ❌ | πŸ”΅ `core::simd::i32x8` | 🟠 |
| `I64x8` | βœ… `__m512i` | 🟠 `[i64; 8]` polyfill | ❌ | πŸ”΅ | βœ… |
| `I64x4` | ❌ | ❌ | ❌ | ❌ | ❌ |
| `I64x4` | 🟠 (via `simd_avx2`) | 🟠 `[i64; 4]` polyfill | ❌ | πŸ”΅ `core::simd::i64x4` | 🟠 |
| `BF16x8` | βœ… `__m128bh` | ❌ | ❌ | πŸ”΅ | βœ… |
| `BF16x16` | βœ… `__m256bh` | ❌ | ❌ | πŸ”΅ | βœ… |
| `F16x16` | ❌ | 🟠 `[u16; 16]` polyfill (`simd_half`) | 🟠 `[u16; 16]` polyfill (`simd_half`) | πŸ”΅ | βœ… |
Expand Down
32 changes: 24 additions & 8 deletions src/simd.rs
Original file line number Diff line number Diff line change
Expand Up @@ -220,9 +220,10 @@ pub const PREFERRED_I16_LANES: usize = 16;
// as soon as `nightly-simd` is on.
#[cfg(feature = "nightly-simd")]
pub use crate::simd_nightly::{
f32x16, f32x8, f64x4, f64x8, i16x16, i16x32, i32x16, i64x8, i8x32, i8x64, u16x32, u32x16, u32x8, u64x4, u64x8,
u8x32, u8x64, BF16x16, BF16x8, F16x16, F32Mask16, F32Mask8, F32x16, F32x8, F64Mask4, F64Mask8, F64x4, F64x8,
I16x16, I16x32, I32x16, I64x8, I8x32, I8x64, U16x32, U32x16, U32x8, U64x4, U64x8, U8x32, U8x64,
f32x16, f32x8, f64x4, f64x8, i16x16, i16x32, i32x16, i32x8, i64x4, i64x8, i8x32, i8x64, u16x16, u16x32, u32x16,
u32x8, u64x4, u64x8, u8x32, u8x64, BF16x16, BF16x8, F16x16, F32Mask16, F32Mask8, F32x16, F32x8, F64Mask4, F64Mask8,
F64x4, F64x8, I16x16, I16x32, I32x16, I32x8, I64x4, I64x8, I8x32, I8x64, U16x16, U16x32, U32x16, U32x8, U64x4,
Comment on lines +223 to +225
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 Badge Add nightly arithmetic/bitwise ops for newly exported int lanes

This change publicly exposes U16x16, I32x8, and I64x4 through crate::simd::* when nightly-simd is enabled, but src/simd_nightly/ops.rs still only implements impl_int_ops!/impl_int_neg! for the older set (U16x32, U32x16, U32x8, U64x8, U64x4, I16x16, I16x32, I32x16, I64x8). As a result, code that uses operators like +, -, &, |, ^ (and unary - for signed types) on these new lanes will fail to compile only under the nightly backend, creating backend-specific API breakage.

Useful? React with πŸ‘Β / πŸ‘Ž.

U64x8, U8x32, U8x64,
};

#[cfg(all(target_arch = "x86_64", target_feature = "avx512f", not(feature = "nightly-simd")))]
Expand All @@ -234,10 +235,15 @@ pub use crate::simd_avx512::{
i16x16,
i16x32,
i32x16,
i32x8,
i64x4,
i64x8,
i8x32,
i8x64,
u16x16,
u32x16,
u32x8,
u64x4,
u64x8,
u8x64,
F32Mask16,
Expand All @@ -251,11 +257,18 @@ pub use crate::simd_avx512::{
I16x16,
I16x32,
I32x16,
// 256-bit int polyfills surfaced 2026-05-20 (re-exported from
// `simd_avx2` via `simd_avx512`'s re-export at line ~2260).
I32x8,
I64x4,
I64x8,
I8x32,
I8x64,
U16x16,
U16x32,
U32x16,
U32x8,
U64x4,
U64x8,
U8x64,
};
Expand Down Expand Up @@ -302,8 +315,9 @@ pub use crate::simd_avx512::{f32x8, f64x4, i16x16, i8x32, F32x8, F64x4, I16x16,
not(feature = "nightly-simd")
))]
pub use crate::simd_avx2::{
f32x16, f64x8, i16x32, i32x16, i64x8, i8x64, u32x16, u64x8, u8x64, F32Mask16, F32x16, F64Mask8, F64x8, I16x32,
I32x16, I64x8, I8x64, U16x32, U32x16, U64x8, U8x64,
f32x16, f64x8, i16x32, i32x16, i32x8, i64x4, i64x8, i8x64, u16x16, u32x16, u32x8, u64x4, u64x8, u8x64, F32Mask16,
F32x16, F64Mask8, F64x8, I16x32, I32x16, I32x8, I64x4, I64x8, I8x64, U16x16, U16x32, U32x16, U32x8, U64x4, U64x8,
U8x64,
};

// U8x32 β€” native AVX2 byte width (one __m256i = 32 bytes). Available on
Expand Down Expand Up @@ -335,7 +349,8 @@ pub(crate) mod scalar;
pub use crate::simd_neon::aarch64_simd::{f32x16, f64x8, F32Mask16, F32x16, F64Mask8, F64x8};
#[cfg(all(target_arch = "aarch64", not(feature = "nightly-simd")))]
pub use scalar::{
f32x8, f64x4, i32x16, i64x8, u32x16, u64x8, u8x64, F32x8, F64x4, I32x16, I64x8, U16x32, U32x16, U64x8, U8x64,
f32x8, f64x4, i32x16, i32x8, i64x4, i64x8, u16x16, u32x16, u32x8, u64x4, u64x8, u8x64, F32x8, F64x4, I32x16, I32x8,
I64x4, I64x8, U16x16, U16x32, U32x16, U32x8, U64x4, U64x8, U8x64,
};

// Other non-x86 targets (wasm, riscv, etc.): full scalar fallback.
Expand All @@ -345,8 +360,9 @@ pub use scalar::{
not(feature = "nightly-simd")
))]
pub use scalar::{
f32x16, f32x8, f64x4, f64x8, i16x16, i16x32, i32x16, i64x8, i8x32, i8x64, u32x16, u64x8, u8x64, F32Mask16, F32x16,
F32x8, F64Mask8, F64x4, F64x8, I16x16, I16x32, I32x16, I64x8, I8x32, I8x64, U16x32, U32x16, U64x8, U8x64,
f32x16, f32x8, f64x4, f64x8, i16x16, i16x32, i32x16, i32x8, i64x4, i64x8, i8x32, i8x64, u16x16, u32x16, u32x8,
u64x4, u64x8, u8x64, F32Mask16, F32x16, F32x8, F64Mask8, F64x4, F64x8, I16x16, I16x32, I32x16, I32x8, I64x4, I64x8,
I8x32, I8x64, U16x16, U16x32, U32x16, U32x8, U64x4, U64x8, U8x64,
};

// Scalar BF16 conversion β€” always available on all platforms
Expand Down
26 changes: 26 additions & 0 deletions src/simd_avx2.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1542,6 +1542,19 @@ avx2_int_type!(U16x32, u16, 32, 0u16);
avx2_int_type!(U32x16, u32, 16, 0u32);
avx2_int_type!(U64x8, u64, 8, 0u64);

// 256-bit int lanes β€” scalar polyfills filling the gap surfaced by the
// 2026-05-20 matrix audit. None of these had wrappers anywhere except
// for `U32x8` / `U64x4` in `simd_nightly`. Adding `U16x16`, `U32x8`,
// `U64x4`, `I32x8`, `I64x4` here mirrors the existing 512-bit polyfill
// pattern (`[$elem; $lanes]` storage, align 64). Native AVX2 `__m256i`
// upgrades for these are TD-SIMD-3 (the same fold-into-real-SIMD task
// already tracked for the 512-bit polyfills above).
avx2_int_type!(U16x16, u16, 16, 0u16);
avx2_int_type!(U32x8, u32, 8, 0u32);
avx2_int_type!(U64x4, u64, 4, 0u64);
avx2_int_type!(I32x8, i32, 8, 0i32);
avx2_int_type!(I64x4, i64, 4, 0i64);

// Extra methods for U16x32 (widen/narrow, shift, multiply) β€” AVX2 scalar fallback.
impl U16x32 {
#[inline(always)]
Expand Down Expand Up @@ -2266,6 +2279,19 @@ pub type i8x64 = I8x64;
#[allow(non_camel_case_types)]
pub type i16x32 = I16x32;

// Lowercase aliases for the 256-bit polyfills added in the 2026-05-20
// missing-lanes sweep.
#[allow(non_camel_case_types)]
pub type u16x16 = U16x16;
#[allow(non_camel_case_types)]
pub type u32x8 = U32x8;
#[allow(non_camel_case_types)]
pub type u64x4 = U64x4;
#[allow(non_camel_case_types)]
pub type i32x8 = I32x8;
#[allow(non_camel_case_types)]
pub type i64x4 = I64x4;

#[cfg(test)]
mod tests {
use super::*;
Expand Down
8 changes: 8 additions & 0 deletions src/simd_avx512.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2256,6 +2256,14 @@ pub type i16x32 = I16x32;
#[allow(non_camel_case_types)]
pub type i16x16 = I16x16;

// 256-bit int lanes β€” added 2026-05-20 missing-lanes sweep. These types
// don't have native `__m256i` wrappers in this module yet; re-exported
// from `simd_avx2.rs` (where they live as scalar-storage polyfills via
// the `avx2_int_type!` macro) so the v4 dispatch arm in `simd.rs` can
// surface them through `crate::simd::*` with the same names the v3 arm
// uses. Native AVX2 `__m256i` upgrades for these are TD-SIMD-3.
pub use crate::simd_avx2::{i32x8, i64x4, u16x16, u32x8, u64x4, I32x8, I64x4, U16x16, U32x8, U64x4};

// ============================================================================
// BF16 conversion wrappers β€” AVX-512 BF16 hardware instructions
// ============================================================================
Expand Down
176 changes: 175 additions & 1 deletion src/simd_nightly/i_word_types.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

use core::simd::cmp::{SimdOrd, SimdPartialEq, SimdPartialOrd};
use core::simd::num::SimdInt;
use core::simd::{i16x16, i16x32, i32x16, i64x8};
use core::simd::{i16x16, i16x32, i32x16, i32x8, i64x4, i64x8};

// ════════════════════════════════════════════════════════════════════
// I16x16 β€” 16-lane signed 16-bit integer
Expand Down Expand Up @@ -428,3 +428,177 @@ impl core::fmt::Display for I64x8 {
write!(f, "I64x8({:?})", &self.to_array()[..])
}
}

// ════════════════════════════════════════════════════════════════════
// I32x8 β€” 8-lane i32 (256-bit, added 2026-05-20 missing-lanes sweep)
// ════════════════════════════════════════════════════════════════════

/// 8-lane `i32` SIMD vector backed by `core::simd::i32x8`.
///
/// API mirrors `simd_avx512::I32x16` at half-width. Miri-executable.
#[derive(Copy, Clone, Debug)]
#[repr(transparent)]
pub struct I32x8(pub i32x8);

impl I32x8 {
pub const LANES: usize = 8;

#[inline(always)]
pub fn splat(v: i32) -> Self {
Self(i32x8::splat(v))
}

#[inline(always)]
pub fn from_slice(s: &[i32]) -> Self {
assert!(s.len() >= 8, "I32x8::from_slice needs >=8 elements");
Self(i32x8::from_slice(s))
}

#[inline(always)]
pub fn from_array(arr: [i32; 8]) -> Self {
Self(i32x8::from_array(arr))
}

#[inline(always)]
pub fn to_array(self) -> [i32; 8] {
self.0.to_array()
}

#[inline(always)]
pub fn copy_to_slice(self, s: &mut [i32]) {
assert!(s.len() >= 8, "I32x8::copy_to_slice needs >=8 elements");
self.0.copy_to_slice(s);
}

#[inline(always)]
pub fn reduce_sum(self) -> i32 {
self.0.reduce_sum()
}
#[inline(always)]
pub fn reduce_min(self) -> i32 {
self.0.reduce_min()
}
#[inline(always)]
pub fn reduce_max(self) -> i32 {
self.0.reduce_max()
}

#[inline(always)]
pub fn simd_min(self, other: Self) -> Self {
Self(self.0.simd_min(other.0))
}
#[inline(always)]
pub fn simd_max(self, other: Self) -> Self {
Self(self.0.simd_max(other.0))
}

#[inline(always)]
pub fn cmpeq_mask(self, other: Self) -> u8 {
self.0.simd_eq(other.0).to_bitmask() as u8
}
#[inline(always)]
pub fn cmpgt_mask(self, other: Self) -> u8 {
self.0.simd_gt(other.0).to_bitmask() as u8
}
}

impl Default for I32x8 {
#[inline(always)]
fn default() -> Self {
Self::splat(0)
}
}

impl PartialEq for I32x8 {
#[inline(always)]
fn eq(&self, other: &Self) -> bool {
self.to_array() == other.to_array()
}
}

// ════════════════════════════════════════════════════════════════════
// I64x4 β€” 4-lane i64 (256-bit, added 2026-05-20 missing-lanes sweep)
// ════════════════════════════════════════════════════════════════════

/// 4-lane `i64` SIMD vector backed by `core::simd::i64x4`.
///
/// API mirrors `simd_avx512::I64x8` at half-width. Miri-executable.
#[derive(Copy, Clone, Debug)]
#[repr(transparent)]
pub struct I64x4(pub i64x4);

impl I64x4 {
pub const LANES: usize = 4;

#[inline(always)]
pub fn splat(v: i64) -> Self {
Self(i64x4::splat(v))
}

#[inline(always)]
pub fn from_slice(s: &[i64]) -> Self {
assert!(s.len() >= 4, "I64x4::from_slice needs >=4 elements");
Self(i64x4::from_slice(s))
}

#[inline(always)]
pub fn from_array(arr: [i64; 4]) -> Self {
Self(i64x4::from_array(arr))
}

#[inline(always)]
pub fn to_array(self) -> [i64; 4] {
self.0.to_array()
}

#[inline(always)]
pub fn copy_to_slice(self, s: &mut [i64]) {
assert!(s.len() >= 4, "I64x4::copy_to_slice needs >=4 elements");
self.0.copy_to_slice(s);
}

#[inline(always)]
pub fn reduce_sum(self) -> i64 {
self.0.reduce_sum()
}
#[inline(always)]
pub fn reduce_min(self) -> i64 {
self.0.reduce_min()
}
#[inline(always)]
pub fn reduce_max(self) -> i64 {
self.0.reduce_max()
}

#[inline(always)]
pub fn simd_min(self, other: Self) -> Self {
Self(self.0.simd_min(other.0))
}
#[inline(always)]
pub fn simd_max(self, other: Self) -> Self {
Self(self.0.simd_max(other.0))
}

#[inline(always)]
pub fn cmpeq_mask(self, other: Self) -> u8 {
self.0.simd_eq(other.0).to_bitmask() as u8
}
#[inline(always)]
pub fn cmpgt_mask(self, other: Self) -> u8 {
self.0.simd_gt(other.0).to_bitmask() as u8
}
}

impl Default for I64x4 {
#[inline(always)]
fn default() -> Self {
Self::splat(0)
}
}

impl PartialEq for I64x4 {
#[inline(always)]
fn eq(&self, other: &Self) -> bool {
self.to_array() == other.to_array()
}
}
11 changes: 9 additions & 2 deletions src/simd_nightly/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -39,10 +39,10 @@ pub use f16_types::F16x16;
pub use f32_types::{F32x16, F32x8};
pub use f64_types::{F64x4, F64x8};
pub use i8_types::{I8x32, I8x64};
pub use i_word_types::{I16x16, I16x32, I32x16, I64x8};
pub use i_word_types::{I16x16, I16x32, I32x16, I32x8, I64x4, I64x8};
pub use masks::{F32Mask16, F32Mask8, F64Mask4, F64Mask8};
pub use u8_types::{U8x32, U8x64};
pub use u_word_types::{U16x32, U32x16, U32x8, U64x4, U64x8};
pub use u_word_types::{U16x16, U16x32, U32x16, U32x8, U64x4, U64x8};

// Lowercase aliases β€” match the std::simd convention used by
// `simd_avx2.rs`, `simd_avx512.rs`, and the scalar fallback in
Expand Down Expand Up @@ -83,3 +83,10 @@ pub type i16x16 = I16x16;
pub type i32x16 = I32x16;
#[allow(non_camel_case_types)]
pub type i64x8 = I64x8;
// 256-bit aliases for the missing-lanes sweep (2026-05-20).
#[allow(non_camel_case_types)]
pub type u16x16 = U16x16;
#[allow(non_camel_case_types)]
pub type i32x8 = I32x8;
#[allow(non_camel_case_types)]
pub type i64x4 = I64x4;
Loading
Loading