From b85ca35cbed5179b4aaab9fdce204016c5aa830d Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Fri, 3 Apr 2026 17:54:04 +0000
Subject: [PATCH] =?UTF-8?q?feat:=20I32x16=20Base17=20ops=20=E2=80=94=20fro?=
 =?UTF-8?q?m=5Fi16=5Fslice,=20abs,=20to=5Fi16=5Farray,=20cmpge=5Fzero=5Fma?=
 =?UTF-8?q?sk?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Added to all 3 tiers (AVX-512 / AVX2 / scalar):
  from_i16_slice(&[i16]) → I32x16  — load 16×i16, sign-extend to 16×i32
  abs() → I32x16                    — absolute value per lane
  to_i16_array() → [i16; 16]       — narrow 16×i32 back to 16×i16
  cmpge_zero_mask() → u16          — bit mask where lane >= 0

These are the primitives bgz17_bridge.rs needs to replace its 92 raw
intrinsics with crate::simd::I32x16 calls.

Fixed duplicate abs() in AVX-512 I32x16.
19 bgz17_bridge tests pass.

https://claude.ai/code/session_01ChLvBfpJS8dQhHxRD4pYNp
---
 src/simd.rs        | 19 +++++++++++++++++++
 src/simd_avx2.rs   | 25 +++++++++++++++++++++++++
 src/simd_avx512.rs | 39 ++++++++++++++++++++++++++++++++++-----
 3 files changed, 78 insertions(+), 5 deletions(-)

diff --git a/src/simd.rs b/src/simd.rs
index a6654b74..fc2e56ff 100644
--- a/src/simd.rs
+++ b/src/simd.rs
@@ -546,6 +546,25 @@ mod scalar {
             for i in 0..16 { out[i] = self.0[i].abs(); }
             Self(out)
         }
+        #[inline(always)]
+        pub fn from_i16_slice(s: &[i16]) -> Self {
+            assert!(s.len() >= 16);
+            let mut o = [0i32; 16];
+            for i in 0..16 { o[i] = s[i] as i32; }
+            Self(o)
+        }
+        #[inline(always)]
+        pub fn to_i16_array(self) -> [i16; 16] {
+            let mut o = [0i16; 16];
+            for i in 0..16 { o[i] = self.0[i] as i16; }
+            o
+        }
+        #[inline(always)]
+        pub fn cmpge_zero_mask(self) -> u16 {
+            let mut mask = 0u16;
+            for i in 0..16 { if self.0[i] >= 0 { mask |= 1 << i; } }
+            mask
+        }
     }
 
     impl Mul for I32x16 {
diff --git a/src/simd_avx2.rs b/src/simd_avx2.rs
index b8f9ad84..bf3726b6 100644
--- a/src/simd_avx2.rs
+++ b/src/simd_avx2.rs
@@ -843,6 +843,31 @@ impl I32x16 {
     #[inline(always)] pub fn simd_max(self, other: Self) -> Self { let mut o = [0i32; 16]; for i in 0..16 { o[i] = self.0[i].max(other.0[i]); } Self(o) }
     #[inline(always)] pub fn cast_f32(self) -> F32x16 { let mut o = [0.0f32; 16]; for i in 0..16 { o[i] = self.0[i] as f32; } F32x16::from_array(o) }
     #[inline(always)] pub fn abs(self) -> Self { let mut o = [0i32; 16]; for i in 0..16 { o[i] = self.0[i].abs(); } Self(o) }
+
+    /// Load 16 × i16, sign-extend to 16 × i32.
+    #[inline(always)]
+    pub fn from_i16_slice(s: &[i16]) -> Self {
+        assert!(s.len() >= 16);
+        let mut o = [0i32; 16];
+        for i in 0..16 { o[i] = s[i] as i32; }
+        Self(o)
+    }
+
+    /// Narrow 16 × i32 to 16 × i16 (truncation).
+    #[inline(always)]
+    pub fn to_i16_array(self) -> [i16; 16] {
+        let mut o = [0i16; 16];
+        for i in 0..16 { o[i] = self.0[i] as i16; }
+        o
+    }
+
+    /// Mask: bit i set where lane i >= 0.
+    #[inline(always)]
+    pub fn cmpge_zero_mask(self) -> u16 {
+        let mut mask = 0u16;
+        for i in 0..16 { if self.0[i] >= 0 { mask |= 1 << i; } }
+        mask
+    }
 }
 impl Mul for I32x16 { type Output = Self; #[inline(always)] fn mul(self, r: Self) -> Self { let mut o = [0i32; 16]; for i in 0..16 { o[i] = self.0[i].wrapping_mul(r.0[i]); } Self(o) } }
 impl MulAssign for I32x16 { #[inline(always)] fn mul_assign(&mut self, r: Self) { *self = *self * r; } }
diff --git a/src/simd_avx512.rs b/src/simd_avx512.rs
index ad249d3d..99592963 100644
--- a/src/simd_avx512.rs
+++ b/src/simd_avx512.rs
@@ -758,6 +758,40 @@ impl I32x16 {
         unsafe { _mm512_reduce_max_epi32(self.0) }
     }
 
+    // ── Base17 i16[17] operations: load-widen, abs, narrow ──────────────
+    // Used by bgz17_bridge.rs for L1 distance, weighted L1, sign agreement, xor_bind.
+
+    /// Load 16 × i16 from slice, sign-extend to 16 × i32.
+    /// This is the first step of every Base17 kernel: i16 → i32 to avoid overflow.
+    #[inline(always)]
+    pub fn from_i16_slice(s: &[i16]) -> Self {
+        assert!(s.len() >= 16);
+        Self(unsafe { _mm512_cvtepi16_epi32(_mm256_loadu_si256(s.as_ptr() as *const __m256i)) })
+    }
+
+    /// Absolute value per lane.
+    #[inline(always)]
+    pub fn abs(self) -> Self {
+        Self(unsafe { _mm512_abs_epi32(self.0) })
+    }
+
+    /// Narrow 16 × i32 back to 16 × i16 (truncation, no saturation).
+    #[inline(always)]
+    pub fn to_i16_array(self) -> [i16; 16] {
+        unsafe {
+            let packed = _mm512_cvtepi32_epi16(self.0);
+            let mut arr = [0i16; 16];
+            _mm256_storeu_si256(arr.as_mut_ptr() as *mut __m256i, packed);
+            arr
+        }
+    }
+
+    /// Compare >= 0: returns 16-bit mask. Bit i set where lane i >= 0.
+    #[inline(always)]
+    pub fn cmpge_zero_mask(self) -> u16 {
+        unsafe { _mm512_cmpge_epi32_mask(self.0, _mm512_setzero_si512()) }
+    }
+
     #[inline(always)]
     pub fn simd_min(self, other: Self) -> Self {
         Self(unsafe { _mm512_min_epi32(self.0, other.0) })
@@ -773,11 +807,6 @@ impl I32x16 {
     pub fn cast_f32(self) -> F32x16 {
         F32x16(unsafe { _mm512_cvtepi32_ps(self.0) })
     }
-
-    #[inline(always)]
-    pub fn abs(self) -> Self {
-        Self(unsafe { _mm512_abs_epi32(self.0) })
-    }
 }
 
 impl_bin_op!(I32x16, Add, add, _mm512_add_epi32);