Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .claude/agents/l3-strategist.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ description: >
mapping rustynum capabilities to ndarray's trait system, identifying
architectural gaps, or planning multi-phase implementation roadmaps.
tools: Read, Glob, Grep, Bash
model: sonnet
model: opus
---

You are the L3_STRATEGIST for Project NDARRAY Expansion.
Expand Down
2 changes: 1 addition & 1 deletion .claude/agents/migration-tracker.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ description: >
Updates blackboard. Identifies gaps. Prevents duplication.
READ ONLY — never writes code.
tools: Read, Glob, Grep, Bash
model: sonnet
model: opus
---

# Migration Tracker
Expand Down
2 changes: 1 addition & 1 deletion .claude/agents/product-engineer.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ description: >
finalizing API surface, writing doc comments, designing error types,
managing feature flags, or ensuring the crate is publishable.
tools: Read, Glob, Grep, Bash, Edit, Write
model: sonnet
model: opus
---

You are the PRODUCT_ENGINEER for Project NDARRAY Expansion.
Expand Down
2 changes: 1 addition & 1 deletion .claude/agents/vector-synthesis.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ description: >
similarity search kernels, ndarray↔vector store bridges,
distance metrics (cosine, L2, dot product), or batch vector operations.
tools: Read, Glob, Grep, Bash, Edit
model: sonnet
model: opus
---

You are the VECTOR_SYNTHESIS_EXPERT for Project NDARRAY Expansion.
Expand Down
4 changes: 2 additions & 2 deletions src/hpc/cam_pq.rs
Original file line number Diff line number Diff line change
Expand Up @@ -460,7 +460,7 @@ pub fn train_hybrid(
/// For 16D subvectors (CAM-PQ subspace dimension), this is one F32x16
/// load-subtract-multiply-reduce. Consumer never sees hardware details.
#[inline(always)]
fn squared_l2(a: &[f32], b: &[f32]) -> f32 {
pub fn squared_l2(a: &[f32], b: &[f32]) -> f32 {
debug_assert_eq!(a.len(), b.len());
let n = a.len();

Expand Down Expand Up @@ -518,7 +518,7 @@ fn jaccard_similarity(a: &[String], b: &[String]) -> f32 {
/// Simple k-means clustering.
///
/// Returns `k` centroid vectors of length `dim`.
fn kmeans(data: &[Vec<f32>], k: usize, dim: usize, iterations: usize) -> Vec<Vec<f32>> {
pub fn kmeans(data: &[Vec<f32>], k: usize, dim: usize, iterations: usize) -> Vec<Vec<f32>> {
let n = data.len();
if n == 0 || k == 0 {
return vec![vec![0.0; dim]; k];
Expand Down
135 changes: 135 additions & 0 deletions src/hpc/fft.rs
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,103 @@ pub fn rfft_f32(input: &[f32]) -> Vec<f32> {
complex[..2 * out_len].to_vec()
}

// ── Walsh-Hadamard Transform ──────────────────────────────────────
//
// The WHT is to quantization codecs what FFT is to signal processing:
// an O(n log n) orthogonal rotation that spreads energy uniformly
// across all coefficients. Unlike SVD (data-adaptive, O(n²k) training),
// the Hadamard rotation is deterministic, free, and self-inverse.
//
// Used by the HadCascade codec (bgz-tensor) for residual rotation
// before i4/i2 quantization. ICC 1.0000 on real model weights.

/// In-place Walsh-Hadamard Transform (normalized, self-inverse).
///
/// `data` length must be a power of 2; an empty slice is a no-op. After
/// the transform, `||WHT(x)|| = ||x||` (energy-preserving). Applying WHT
/// twice returns the original vector.
///
/// SIMD: uses F32x16 butterfly for blocks ≥ 16 elements.
///
/// # Panics
///
/// Panics if `data.len()` is non-zero and not a power of 2.
///
/// # Example
///
/// ```
/// use ndarray::hpc::fft::wht_f32;
///
/// let mut x = vec![1.0f32, 0.0, 0.0, 0.0];
/// wht_f32(&mut x);
/// assert!((x[0] - 0.5).abs() < 1e-6); // 1/sqrt(4) * 1 = 0.5
///
/// // Self-inverse: WHT(WHT(x)) = x
/// wht_f32(&mut x);
/// assert!((x[0] - 1.0).abs() < 1e-5);
/// ```
pub fn wht_f32(data: &mut [f32]) {
    use crate::simd::F32x16;

    let n = data.len();
    // Robustness fix: 0 is not a power of two, so an empty slice used to
    // trip the assert below with a misleading message. An empty transform
    // is well-defined (do nothing), so return early instead.
    if n == 0 {
        return;
    }
    assert!(n.is_power_of_two(), "WHT length must be a power of 2");

    // Iterative butterfly: at level `h`, each pair (x, y) spaced h apart
    // becomes (x + y, x - y). log2(n) levels in total.
    let mut h = 1;
    while h < n {
        if h >= 16 {
            // Wide enough for the 16-lane SIMD kernel.
            wht_butterfly_simd(data, n, h);
        } else {
            for i in (0..n).step_by(h * 2) {
                for j in i..i + h {
                    let x = data[j];
                    let y = data[j + h];
                    data[j] = x + y;
                    data[j + h] = x - y;
                }
            }
        }
        h *= 2;
    }

    // Scale by 1/sqrt(n) so the transform is orthonormal — this is what
    // makes it both energy-preserving and self-inverse (the unnormalized
    // butterflies applied twice would yield n * x).
    let norm = 1.0 / (n as f32).sqrt();
    let mut i = 0;
    while i + 16 <= n {
        let v = F32x16::from_slice(&data[i..]);
        let scaled = v * F32x16::splat(norm);
        scaled.copy_to_slice(&mut data[i..i + 16]);
        i += 16;
    }
    while i < n {
        data[i] *= norm;
        i += 1;
    }
}

/// WHT butterfly for one level, SIMD-accelerated for h ≥ 16.
///
/// Performs the level-`h` pass of the iterative Walsh-Hadamard transform
/// over `data` (length `n`): within each block of `2h` elements starting
/// at `i`, every pair `(data[i+j], data[i+j+h])` is replaced by
/// `(x + y, x - y)`.
fn wht_butterfly_simd(data: &mut [f32], n: usize, h: usize) {
    use crate::simd::F32x16;
    // Walk blocks of 2h elements; the first half of each block pairs with
    // the second half at stride h.
    for i in (0..n).step_by(h * 2) {
        let mut j = 0;
        // 16-lane SIMD path. NOTE(review): assumes F32x16::from_slice
        // loads exactly 16 elements from the start of the slice and
        // copy_to_slice writes exactly 16 — confirm against the simd module.
        while j + 16 <= h {
            let a = F32x16::from_slice(&data[i + j..]);
            let b = F32x16::from_slice(&data[i + j + h..]);
            let sum = a + b;
            let diff = a - b;
            sum.copy_to_slice(&mut data[i + j..i + j + 16]);
            diff.copy_to_slice(&mut data[i + j + h..i + j + h + 16]);
            j += 16;
        }
        // Scalar tail. Since the caller only invokes this with h ≥ 16 and
        // h a power of two (h doubles from 1 in wht_f32), h is a multiple
        // of 16 and this loop does not run in practice; kept for safety.
        while j < h {
            let x = data[i + j];
            let y = data[i + j + h];
            data[i + j] = x + y;
            data[i + j + h] = x - y;
            j += 1;
        }
    }
}

/// Convenience wrapper: returns the WHT of `input` as a fresh vector,
/// leaving the input untouched.
pub fn wht_f32_new(input: &[f32]) -> Vec<f32> {
    let mut transformed: Vec<f32> = input.to_owned();
    wht_f32(&mut transformed);
    transformed
}

// ── Helpers ────────────────────────────────────────────────────────

fn bit_reverse_f32(data: &mut [f32], n: usize) {
Expand Down Expand Up @@ -197,6 +294,44 @@ mod tests {
}
}

#[test]
fn test_wht_self_inverse() {
    // WHT applied twice must reproduce the input (orthonormal transform).
    let input = vec![1.0f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0];
    let mut roundtrip = input.clone();
    wht_f32(&mut roundtrip);
    wht_f32(&mut roundtrip);
    for (expected, got) in input.iter().zip(&roundtrip) {
        assert!(
            (expected - got).abs() < 1e-5,
            "self-inverse: {} vs {}",
            expected,
            got
        );
    }
}

#[test]
fn test_wht_energy_preservation() {
    // L2 norm must be unchanged by the (normalized) transform.
    fn l2(v: &[f32]) -> f32 {
        v.iter().map(|x| x * x).sum::<f32>().sqrt()
    }
    let mut signal = vec![1.0f32, -2.0, 3.0, -4.0];
    let before = l2(&signal);
    wht_f32(&mut signal);
    let after = l2(&signal);
    assert!(
        (before - after).abs() < 1e-4,
        "energy: {} vs {}",
        before,
        after
    );
}

#[test]
fn test_wht_large_simd() {
    // 1024 elements exercises the F32x16 butterfly path (h ≥ 16).
    let original: Vec<f32> = (0..1024).map(|i| (i as f32 * 0.618).sin()).collect();
    let mut data = original.clone();
    wht_f32(&mut data);

    // Norm preservation at 1024-d.
    let l2 = |v: &[f32]| v.iter().map(|x| x * x).sum::<f32>().sqrt();
    let n_orig = l2(&original);
    let n_wht = l2(&data);
    assert!(
        (n_orig - n_wht).abs() / n_orig < 1e-4,
        "SIMD WHT norm: {} vs {}",
        n_orig,
        n_wht
    );

    // Self-inverse: a second application must restore the signal.
    wht_f32(&mut data);
    let max_err = original
        .iter()
        .zip(&data)
        .map(|(a, b)| (a - b).abs())
        .fold(0.0f32, f32::max);
    assert!(max_err < 1e-3, "SIMD self-inverse max_err: {}", max_err);
}

#[test]
fn test_rfft() {
let input = vec![1.0f32, 2.0, 3.0, 4.0];
Expand Down
43 changes: 43 additions & 0 deletions src/hpc/quantized.rs
Original file line number Diff line number Diff line change
Expand Up @@ -374,6 +374,49 @@ pub fn dequantize_i4_to_f32(packed: &[u8], params: &QuantParams, len: usize) ->
result
}

/// Quantize f32 to i2 (packed: four i2 values per byte, signed ±1).
///
/// Each value is clamped to {-1, 0, +1} after scaling by abs_max.
/// Packing: 4 crumbs per byte, low bits first.
/// Symmetric quantization with zero_point = 0.
pub fn quantize_f32_to_i2(data: &[f32]) -> (Vec<u8>, QuantParams) {
    // Symmetric range: scale by the largest magnitude (1.0 for all-zero input
    // to avoid dividing by zero).
    let abs_max = data.iter().fold(0.0f32, |acc, &v| acc.max(v.abs()));
    let scale = if abs_max > 0.0 { abs_max } else { 1.0 };

    // Four 2-bit crumbs per byte, rounded up.
    let mut packed = vec![0u8; (data.len() + 3) / 4];
    for (idx, &value) in data.iter().enumerate() {
        // Ternary code: round(v / scale) clamped to {-1, 0, 1},
        // biased by +1 so it fits an unsigned crumb {0, 1, 2}.
        let q = (value / scale).round().clamp(-1.0, 1.0) as i8;
        let crumb = (q + 1) as u8;
        packed[idx / 4] |= (crumb & 0x03) << ((idx % 4) * 2);
    }

    let params = QuantParams {
        scale,
        zero_point: 0,
        min_val: -abs_max,
        max_val: abs_max,
    };
    (packed, params)
}

/// Dequantize i2 (packed) to f32.
///
/// Reads `len` 2-bit crumbs from `packed` (low bits first within each
/// byte), undoes the +1 bias ({0,1,2} → {-1,0,+1}), and rescales by
/// `params.scale`.
pub fn dequantize_i2_to_f32(packed: &[u8], params: &QuantParams, len: usize) -> Vec<f32> {
    (0..len)
        .map(|idx| {
            let crumb = (packed[idx / 4] >> ((idx % 4) * 2)) & 0x03;
            (crumb as i8 - 1) as f32 * params.scale
        })
        .collect()
}

#[cfg(test)]
mod tests {
use super::*;
Expand Down
Loading