Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .claude/agents/l3-strategist.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ description: >
mapping rustynum capabilities to ndarray's trait system, identifying
architectural gaps, or planning multi-phase implementation roadmaps.
tools: Read, Glob, Grep, Bash
model: sonnet
model: opus
---

You are the L3_STRATEGIST for Project NDARRAY Expansion.
Expand Down
2 changes: 1 addition & 1 deletion .claude/agents/migration-tracker.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ description: >
Updates blackboard. Identifies gaps. Prevents duplication.
READ ONLY — never writes code.
tools: Read, Glob, Grep, Bash
model: sonnet
model: opus
---

# Migration Tracker
Expand Down
2 changes: 1 addition & 1 deletion .claude/agents/product-engineer.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ description: >
finalizing API surface, writing doc comments, designing error types,
managing feature flags, or ensuring the crate is publishable.
tools: Read, Glob, Grep, Bash, Edit, Write
model: sonnet
model: opus
---

You are the PRODUCT_ENGINEER for Project NDARRAY Expansion.
Expand Down
2 changes: 1 addition & 1 deletion .claude/agents/vector-synthesis.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ description: >
similarity search kernels, ndarray↔vector store bridges,
distance metrics (cosine, L2, dot product), or batch vector operations.
tools: Read, Glob, Grep, Bash, Edit
model: sonnet
model: opus
---

You are the VECTOR_SYNTHESIS_EXPERT for Project NDARRAY Expansion.
Expand Down
4 changes: 2 additions & 2 deletions src/hpc/cam_pq.rs
Original file line number Diff line number Diff line change
Expand Up @@ -460,7 +460,7 @@ pub fn train_hybrid(
/// For 16D subvectors (CAM-PQ subspace dimension), this is one F32x16
/// load-subtract-multiply-reduce. Consumer never sees hardware details.
#[inline(always)]
fn squared_l2(a: &[f32], b: &[f32]) -> f32 {
pub fn squared_l2(a: &[f32], b: &[f32]) -> f32 {
debug_assert_eq!(a.len(), b.len());
let n = a.len();

Expand Down Expand Up @@ -518,7 +518,7 @@ fn jaccard_similarity(a: &[String], b: &[String]) -> f32 {
/// Simple k-means clustering.
///
/// Returns `k` centroid vectors of length `dim`.
fn kmeans(data: &[Vec<f32>], k: usize, dim: usize, iterations: usize) -> Vec<Vec<f32>> {
pub fn kmeans(data: &[Vec<f32>], k: usize, dim: usize, iterations: usize) -> Vec<Vec<f32>> {
let n = data.len();
if n == 0 || k == 0 {
return vec![vec![0.0; dim]; k];
Expand Down
135 changes: 135 additions & 0 deletions src/hpc/fft.rs
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,103 @@ pub fn rfft_f32(input: &[f32]) -> Vec<f32> {
complex[..2 * out_len].to_vec()
}

// ── Walsh-Hadamard Transform ──────────────────────────────────────
//
// The WHT is to quantization codecs what FFT is to signal processing:
// an O(n log n) orthogonal rotation that spreads energy uniformly
// across all coefficients. Unlike SVD (data-adaptive, O(n²k) training),
// the Hadamard rotation is deterministic, free, and self-inverse.
//
// Used by the HadCascade codec (bgz-tensor) for residual rotation
// before i4/i2 quantization. ICC 1.0000 on real model weights.

/// In-place Walsh-Hadamard Transform (normalized, self-inverse).
///
/// `data` length must be a power of 2; an empty slice is a no-op. After
/// the transform, `||WHT(x)|| = ||x||` (energy-preserving). Applying WHT
/// twice returns the original vector.
///
/// SIMD: uses F32x16 butterfly for blocks ≥ 16 elements.
///
/// # Panics
///
/// Panics if `data.len()` is non-zero and not a power of 2.
///
/// # Example
///
/// ```
/// use ndarray::hpc::fft::wht_f32;
///
/// let mut x = vec![1.0f32, 0.0, 0.0, 0.0];
/// wht_f32(&mut x);
/// assert!((x[0] - 0.5).abs() < 1e-6); // 1/sqrt(4) * 1 = 0.5
///
/// // Self-inverse: WHT(WHT(x)) = x
/// wht_f32(&mut x);
/// assert!((x[0] - 1.0).abs() < 1e-5);
/// ```
pub fn wht_f32(data: &mut [f32]) {
    use crate::simd::F32x16;

    let n = data.len();
    // Robustness fix: 0 is not a power of two, so an empty slice used to
    // trip the assert below with a misleading message. An empty transform
    // is well-defined (do nothing), so return early instead.
    if n == 0 {
        return;
    }
    assert!(n.is_power_of_two(), "WHT length must be a power of 2");

    // Iterative butterfly: at level `h`, each pair (x, y) spaced h apart
    // becomes (x + y, x - y). log2(n) levels in total.
    let mut h = 1;
    while h < n {
        if h >= 16 {
            // Wide enough for the 16-lane SIMD kernel.
            wht_butterfly_simd(data, n, h);
        } else {
            for i in (0..n).step_by(h * 2) {
                for j in i..i + h {
                    let x = data[j];
                    let y = data[j + h];
                    data[j] = x + y;
                    data[j + h] = x - y;
                }
            }
        }
        h *= 2;
    }

    // Scale by 1/sqrt(n) so the transform is orthonormal — this is what
    // makes it both energy-preserving and self-inverse (the unnormalized
    // butterflies applied twice would yield n * x).
    let norm = 1.0 / (n as f32).sqrt();
    let mut i = 0;
    while i + 16 <= n {
        let v = F32x16::from_slice(&data[i..]);
        let scaled = v * F32x16::splat(norm);
        scaled.copy_to_slice(&mut data[i..i + 16]);
        i += 16;
    }
    while i < n {
        data[i] *= norm;
        i += 1;
    }
}

/// WHT butterfly for one level, SIMD-accelerated for h ≥ 16.
///
/// Performs the level-`h` pass of the iterative Walsh-Hadamard transform
/// over `data` (length `n`): within each block of `2h` elements starting
/// at `i`, every pair `(data[i+j], data[i+j+h])` is replaced by
/// `(x + y, x - y)`.
fn wht_butterfly_simd(data: &mut [f32], n: usize, h: usize) {
    use crate::simd::F32x16;
    // Walk blocks of 2h elements; the first half of each block pairs with
    // the second half at stride h.
    for i in (0..n).step_by(h * 2) {
        let mut j = 0;
        // 16-lane SIMD path. NOTE(review): assumes F32x16::from_slice
        // loads exactly 16 elements from the start of the slice and
        // copy_to_slice writes exactly 16 — confirm against the simd module.
        while j + 16 <= h {
            let a = F32x16::from_slice(&data[i + j..]);
            let b = F32x16::from_slice(&data[i + j + h..]);
            let sum = a + b;
            let diff = a - b;
            sum.copy_to_slice(&mut data[i + j..i + j + 16]);
            diff.copy_to_slice(&mut data[i + j + h..i + j + h + 16]);
            j += 16;
        }
        // Scalar tail. Since the caller only invokes this with h ≥ 16 and
        // h a power of two (h doubles from 1 in wht_f32), h is a multiple
        // of 16 and this loop does not run in practice; kept for safety.
        while j < h {
            let x = data[i + j];
            let y = data[i + j + h];
            data[i + j] = x + y;
            data[i + j + h] = x - y;
            j += 1;
        }
    }
}

/// Convenience wrapper: returns the WHT of `input` as a fresh vector,
/// leaving the input untouched.
pub fn wht_f32_new(input: &[f32]) -> Vec<f32> {
    let mut transformed: Vec<f32> = input.to_owned();
    wht_f32(&mut transformed);
    transformed
}

// ── Helpers ────────────────────────────────────────────────────────

fn bit_reverse_f32(data: &mut [f32], n: usize) {
Expand Down Expand Up @@ -197,6 +294,44 @@ mod tests {
}
}

#[test]
fn test_wht_self_inverse() {
    // WHT applied twice must reproduce the input (orthonormal transform).
    let input = vec![1.0f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0];
    let mut roundtrip = input.clone();
    wht_f32(&mut roundtrip);
    wht_f32(&mut roundtrip);
    for (expected, got) in input.iter().zip(&roundtrip) {
        assert!(
            (expected - got).abs() < 1e-5,
            "self-inverse: {} vs {}",
            expected,
            got
        );
    }
}

#[test]
fn test_wht_energy_preservation() {
    // L2 norm must be unchanged by the (normalized) transform.
    fn l2(v: &[f32]) -> f32 {
        v.iter().map(|x| x * x).sum::<f32>().sqrt()
    }
    let mut signal = vec![1.0f32, -2.0, 3.0, -4.0];
    let before = l2(&signal);
    wht_f32(&mut signal);
    let after = l2(&signal);
    assert!(
        (before - after).abs() < 1e-4,
        "energy: {} vs {}",
        before,
        after
    );
}

#[test]
fn test_wht_large_simd() {
    // 1024 elements exercises the F32x16 butterfly path (h ≥ 16).
    let original: Vec<f32> = (0..1024).map(|i| (i as f32 * 0.618).sin()).collect();
    let mut data = original.clone();
    wht_f32(&mut data);

    // Norm preservation at 1024-d.
    let l2 = |v: &[f32]| v.iter().map(|x| x * x).sum::<f32>().sqrt();
    let n_orig = l2(&original);
    let n_wht = l2(&data);
    assert!(
        (n_orig - n_wht).abs() / n_orig < 1e-4,
        "SIMD WHT norm: {} vs {}",
        n_orig,
        n_wht
    );

    // Self-inverse: a second application must restore the signal.
    wht_f32(&mut data);
    let max_err = original
        .iter()
        .zip(&data)
        .map(|(a, b)| (a - b).abs())
        .fold(0.0f32, f32::max);
    assert!(max_err < 1e-3, "SIMD self-inverse max_err: {}", max_err);
}

#[test]
fn test_rfft() {
let input = vec![1.0f32, 2.0, 3.0, 4.0];
Expand Down
43 changes: 43 additions & 0 deletions src/hpc/quantized.rs
Original file line number Diff line number Diff line change
Expand Up @@ -374,6 +374,49 @@ pub fn dequantize_i4_to_f32(packed: &[u8], params: &QuantParams, len: usize) ->
result
}

/// Quantize f32 to i2 (packed: four i2 values per byte, signed ±1).
///
/// Each value is clamped to {-1, 0, +1} after scaling by abs_max.
/// Packing: 4 crumbs per byte, low bits first.
/// Symmetric quantization with zero_point = 0.
pub fn quantize_f32_to_i2(data: &[f32]) -> (Vec<u8>, QuantParams) {
    // Symmetric range: scale by the largest magnitude (1.0 for all-zero input
    // to avoid dividing by zero).
    let abs_max = data.iter().fold(0.0f32, |acc, &v| acc.max(v.abs()));
    let scale = if abs_max > 0.0 { abs_max } else { 1.0 };

    // Four 2-bit crumbs per byte, rounded up.
    let mut packed = vec![0u8; (data.len() + 3) / 4];
    for (idx, &value) in data.iter().enumerate() {
        // Ternary code: round(v / scale) clamped to {-1, 0, 1},
        // biased by +1 so it fits an unsigned crumb {0, 1, 2}.
        let q = (value / scale).round().clamp(-1.0, 1.0) as i8;
        let crumb = (q + 1) as u8;
        packed[idx / 4] |= (crumb & 0x03) << ((idx % 4) * 2);
    }

    let params = QuantParams {
        scale,
        zero_point: 0,
        min_val: -abs_max,
        max_val: abs_max,
    };
    (packed, params)
}

/// Dequantize i2 (packed) to f32.
///
/// Reads `len` 2-bit crumbs from `packed` (low bits first within each
/// byte), undoes the +1 bias ({0,1,2} → {-1,0,+1}), and rescales by
/// `params.scale`.
pub fn dequantize_i2_to_f32(packed: &[u8], params: &QuantParams, len: usize) -> Vec<f32> {
    (0..len)
        .map(|idx| {
            let crumb = (packed[idx / 4] >> ((idx % 4) * 2)) & 0x03;
            (crumb as i8 - 1) as f32 * params.scale
        })
        .collect()
}

#[cfg(test)]
mod tests {
use super::*;
Expand Down
Loading