From b85ca35cbed5179b4aaab9fdce204016c5aa830d Mon Sep 17 00:00:00 2001 From: Claude Date: Fri, 3 Apr 2026 17:54:04 +0000 Subject: [PATCH] =?UTF-8?q?feat:=20I32x16=20Base17=20ops=20=E2=80=94=20fro?= =?UTF-8?q?m=5Fi16=5Fslice,=20abs,=20to=5Fi16=5Farray,=20cmpge=5Fzero=5Fma?= =?UTF-8?q?sk?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Added to all 3 tiers (AVX-512 / AVX2 / scalar): from_i16_slice(&[i16]) → I32x16 — load 16×i16, sign-extend to 16×i32 abs() → I32x16 — absolute value per lane to_i16_array() → [i16; 16] — narrow 16×i32 back to 16×i16 cmpge_zero_mask() → u16 — bit mask where lane >= 0 These are the primitives bgz17_bridge.rs needs to replace its 92 raw intrinsics with crate::simd::I32x16 calls. Fixed duplicate abs() in AVX-512 I32x16. 19 bgz17_bridge tests pass. https://claude.ai/code/session_01ChLvBfpJS8dQhHxRD4pYNp --- src/simd.rs | 19 +++++++++++++++++++ src/simd_avx2.rs | 25 +++++++++++++++++++++++++ src/simd_avx512.rs | 39 ++++++++++++++++++++++++++++++++++----- 3 files changed, 78 insertions(+), 5 deletions(-) diff --git a/src/simd.rs b/src/simd.rs index a6654b74..fc2e56ff 100644 --- a/src/simd.rs +++ b/src/simd.rs @@ -546,6 +546,25 @@ mod scalar { for i in 0..16 { out[i] = self.0[i].abs(); } Self(out) } + #[inline(always)] + pub fn from_i16_slice(s: &[i16]) -> Self { + assert!(s.len() >= 16); + let mut o = [0i32; 16]; + for i in 0..16 { o[i] = s[i] as i32; } + Self(o) + } + #[inline(always)] + pub fn to_i16_array(self) -> [i16; 16] { + let mut o = [0i16; 16]; + for i in 0..16 { o[i] = self.0[i] as i16; } + o + } + #[inline(always)] + pub fn cmpge_zero_mask(self) -> u16 { + let mut mask = 0u16; + for i in 0..16 { if self.0[i] >= 0 { mask |= 1 << i; } } + mask + } } impl Mul for I32x16 { diff --git a/src/simd_avx2.rs b/src/simd_avx2.rs index b8f9ad84..bf3726b6 100644 --- a/src/simd_avx2.rs +++ b/src/simd_avx2.rs @@ -843,6 +843,31 @@ impl I32x16 { #[inline(always)] pub fn simd_max(self, other: Self) -> Self { let mut o = [0i32; 16]; for i in 0..16 { o[i] = self.0[i].max(other.0[i]); } Self(o) } #[inline(always)] pub fn cast_f32(self) -> F32x16 { let mut o = [0.0f32; 16]; for i in 0..16 { o[i] = self.0[i] as f32; } F32x16::from_array(o) } #[inline(always)] pub fn abs(self) -> Self { let mut o = [0i32; 16]; for i in 0..16 { o[i] = self.0[i].abs(); } Self(o) } + + /// Load 16 × i16, sign-extend to 16 × i32. + #[inline(always)] + pub fn from_i16_slice(s: &[i16]) -> Self { + assert!(s.len() >= 16); + let mut o = [0i32; 16]; + for i in 0..16 { o[i] = s[i] as i32; } + Self(o) + } + + /// Narrow 16 × i32 to 16 × i16 (truncation). + #[inline(always)] + pub fn to_i16_array(self) -> [i16; 16] { + let mut o = [0i16; 16]; + for i in 0..16 { o[i] = self.0[i] as i16; } + o + } + + /// Mask: bit i set where lane i >= 0. + #[inline(always)] + pub fn cmpge_zero_mask(self) -> u16 { + let mut mask = 0u16; + for i in 0..16 { if self.0[i] >= 0 { mask |= 1 << i; } } + mask + } } impl Mul for I32x16 { type Output = Self; #[inline(always)] fn mul(self, r: Self) -> Self { let mut o = [0i32; 16]; for i in 0..16 { o[i] = self.0[i].wrapping_mul(r.0[i]); } Self(o) } } impl MulAssign for I32x16 { #[inline(always)] fn mul_assign(&mut self, r: Self) { *self = *self * r; } } diff --git a/src/simd_avx512.rs b/src/simd_avx512.rs index ad249d3d..99592963 100644 --- a/src/simd_avx512.rs +++ b/src/simd_avx512.rs @@ -758,6 +758,40 @@ impl I32x16 { unsafe { _mm512_reduce_max_epi32(self.0) } } + // ── Base17 i16[17] operations: load-widen, abs, narrow ────────────── + // Used by bgz17_bridge.rs for L1 distance, weighted L1, sign agreement, xor_bind. + + /// Load 16 × i16 from slice, sign-extend to 16 × i32. + /// This is the first step of every Base17 kernel: i16 → i32 to avoid overflow. + #[inline(always)] + pub fn from_i16_slice(s: &[i16]) -> Self { + assert!(s.len() >= 16); + Self(unsafe { _mm512_cvtepi16_epi32(_mm256_loadu_si256(s.as_ptr() as *const __m256i)) }) + } + + /// Absolute value per lane. + #[inline(always)] + pub fn abs(self) -> Self { + Self(unsafe { _mm512_abs_epi32(self.0) }) + } + + /// Narrow 16 × i32 back to 16 × i16 (truncation, no saturation). + #[inline(always)] + pub fn to_i16_array(self) -> [i16; 16] { + unsafe { + let packed = _mm512_cvtepi32_epi16(self.0); + let mut arr = [0i16; 16]; + _mm256_storeu_si256(arr.as_mut_ptr() as *mut __m256i, packed); + arr + } + } + + /// Compare >= 0: returns 16-bit mask. Bit i set where lane i >= 0. + #[inline(always)] + pub fn cmpge_zero_mask(self) -> u16 { + unsafe { _mm512_cmpge_epi32_mask(self.0, _mm512_setzero_si512()) } + } + #[inline(always)] pub fn simd_min(self, other: Self) -> Self { Self(unsafe { _mm512_min_epi32(self.0, other.0) }) @@ -773,11 +807,6 @@ impl I32x16 { pub fn cast_f32(self) -> F32x16 { F32x16(unsafe { _mm512_cvtepi32_ps(self.0) }) } - - #[inline(always)] - pub fn abs(self) -> Self { - Self(unsafe { _mm512_abs_epi32(self.0) }) - } } impl_bin_op!(I32x16, Add, add, _mm512_add_epi32);