Findit-AI · uqio · May 7, 2026 · May 7, 2026 · May 7, 2026
diff --git a/src/row/arch/neon/packed_rgb_float.rs b/src/row/arch/neon/packed_rgb_float.rs
diff --git a/src/row/arch/neon/tests/packed_rgb_float.rs b/src/row/arch/neon/tests/packed_rgb_float.rs
@@ -34,9 +34,9 @@ fn rgbf32_to_rgb_neon_matches_scalar_widths() {
     let input = pseudo_random_rgbf32(w);
     let mut out_scalar = std::vec![0u8; w * 3];
     let mut out_neon = std::vec![0u8; w * 3];
-    scalar::rgbf32_to_rgb_row(&input, &mut out_scalar, w);
+    scalar::rgbf32_to_rgb_row::<false>(&input, &mut out_scalar, w);
     unsafe {
-      rgbf32_to_rgb_row(&input, &mut out_neon, w);
+      rgbf32_to_rgb_row::<false>(&input, &mut out_neon, w);
     }
     assert_eq!(out_scalar, out_neon, "width {w}");
   }
@@ -49,9 +49,9 @@ fn rgbf32_to_rgba_neon_matches_scalar_widths() {
     let input = pseudo_random_rgbf32(w);
     let mut out_scalar = std::vec![0u8; w * 4];
     let mut out_neon = std::vec![0u8; w * 4];
-    scalar::rgbf32_to_rgba_row(&input, &mut out_scalar, w);
+    scalar::rgbf32_to_rgba_row::<false>(&input, &mut out_scalar, w);
     unsafe {
-      rgbf32_to_rgba_row(&input, &mut out_neon, w);
+      rgbf32_to_rgba_row::<false>(&input, &mut out_neon, w);
     }
     assert_eq!(out_scalar, out_neon, "width {w}");
   }
@@ -64,9 +64,9 @@ fn rgbf32_to_rgb_u16_neon_matches_scalar_widths() {
     let input = pseudo_random_rgbf32(w);
     let mut out_scalar = std::vec![0u16; w * 3];
     let mut out_neon = std::vec![0u16; w * 3];
-    scalar::rgbf32_to_rgb_u16_row(&input, &mut out_scalar, w);
+    scalar::rgbf32_to_rgb_u16_row::<false>(&input, &mut out_scalar, w);
     unsafe {
-      rgbf32_to_rgb_u16_row(&input, &mut out_neon, w);
+      rgbf32_to_rgb_u16_row::<false>(&input, &mut out_neon, w);
     }
     assert_eq!(out_scalar, out_neon, "width {w}");
   }
@@ -79,9 +79,9 @@ fn rgbf32_to_rgba_u16_neon_matches_scalar_widths() {
     let input = pseudo_random_rgbf32(w);
     let mut out_scalar = std::vec![0u16; w * 4];
     let mut out_neon = std::vec![0u16; w * 4];
-    scalar::rgbf32_to_rgba_u16_row(&input, &mut out_scalar, w);
+    scalar::rgbf32_to_rgba_u16_row::<false>(&input, &mut out_scalar, w);
     unsafe {
-      rgbf32_to_rgba_u16_row(&input, &mut out_neon, w);
+      rgbf32_to_rgba_u16_row::<false>(&input, &mut out_neon, w);
     }
     assert_eq!(out_scalar, out_neon, "width {w}");
   }
@@ -94,9 +94,9 @@ fn rgbf32_to_rgb_f32_neon_matches_scalar_widths() {
     let input = pseudo_random_rgbf32(w);
     let mut out_scalar = std::vec![0.0f32; w * 3];
     let mut out_neon = std::vec![0.0f32; w * 3];
-    scalar::rgbf32_to_rgb_f32_row(&input, &mut out_scalar, w);
+    scalar::rgbf32_to_rgb_f32_row::<false>(&input, &mut out_scalar, w);
     unsafe {
-      rgbf32_to_rgb_f32_row(&input, &mut out_neon, w);
+      rgbf32_to_rgb_f32_row::<false>(&input, &mut out_neon, w);
     }
     assert_eq!(out_scalar, out_neon, "width {w}");
     // Lossless: output should equal input bit-exact.
@@ -131,9 +131,9 @@ fn neon_rgbf16_to_rgb_matches_scalar() {
     let input = pseudo_random_rgbf16(w);
     let mut out_scalar = std::vec![0u8; w * 3];
     let mut out_neon = std::vec![0u8; w * 3];
-    scalar::rgbf16_to_rgb_row(&input, &mut out_scalar, w);
+    scalar::rgbf16_to_rgb_row::<false>(&input, &mut out_scalar, w);
     unsafe {
-      rgbf16_to_rgb_row(&input, &mut out_neon, w);
+      rgbf16_to_rgb_row::<false>(&input, &mut out_neon, w);
     }
     assert_eq!(out_scalar, out_neon, "width {w}");
   }
@@ -152,9 +152,9 @@ fn neon_rgbf16_to_rgba_matches_scalar() {
     let input = pseudo_random_rgbf16(w);
     let mut out_scalar = std::vec![0u8; w * 4];
     let mut out_neon = std::vec![0u8; w * 4];
-    scalar::rgbf16_to_rgba_row(&input, &mut out_scalar, w);
+    scalar::rgbf16_to_rgba_row::<false>(&input, &mut out_scalar, w);
     unsafe {
-      rgbf16_to_rgba_row(&input, &mut out_neon, w);
+      rgbf16_to_rgba_row::<false>(&input, &mut out_neon, w);
     }
     assert_eq!(out_scalar, out_neon, "width {w}");
   }
@@ -173,9 +173,9 @@ fn neon_rgbf16_to_rgb_u16_matches_scalar() {
     let input = pseudo_random_rgbf16(w);
     let mut out_scalar = std::vec![0u16; w * 3];
     let mut out_neon = std::vec![0u16; w * 3];
-    scalar::rgbf16_to_rgb_u16_row(&input, &mut out_scalar, w);
+    scalar::rgbf16_to_rgb_u16_row::<false>(&input, &mut out_scalar, w);
     unsafe {
-      rgbf16_to_rgb_u16_row(&input, &mut out_neon, w);
+      rgbf16_to_rgb_u16_row::<false>(&input, &mut out_neon, w);
     }
     assert_eq!(out_scalar, out_neon, "width {w}");
   }
@@ -194,9 +194,9 @@ fn neon_rgbf16_to_rgba_u16_matches_scalar() {
     let input = pseudo_random_rgbf16(w);
     let mut out_scalar = std::vec![0u16; w * 4];
     let mut out_neon = std::vec![0u16; w * 4];
-    scalar::rgbf16_to_rgba_u16_row(&input, &mut out_scalar, w);
+    scalar::rgbf16_to_rgba_u16_row::<false>(&input, &mut out_scalar, w);
     unsafe {
-      rgbf16_to_rgba_u16_row(&input, &mut out_neon, w);
+      rgbf16_to_rgba_u16_row::<false>(&input, &mut out_neon, w);
     }
     assert_eq!(out_scalar, out_neon, "width {w}");
   }
@@ -215,9 +215,9 @@ fn neon_rgbf16_to_rgb_f32_matches_scalar() {
     let input = pseudo_random_rgbf16(w);
     let mut out_scalar = std::vec![0.0f32; w * 3];
     let mut out_neon = std::vec![0.0f32; w * 3];
-    scalar::rgbf16_to_rgb_f32_row(&input, &mut out_scalar, w);
+    scalar::rgbf16_to_rgb_f32_row::<false>(&input, &mut out_scalar, w);
     unsafe {
-      rgbf16_to_rgb_f32_row(&input, &mut out_neon, w);
+      rgbf16_to_rgb_f32_row::<false>(&input, &mut out_neon, w);
     }
     assert_eq!(out_scalar, out_neon, "width {w}");
   }
@@ -233,12 +233,250 @@ fn neon_rgbf16_to_rgb_f16_matches_scalar() {
     let input = pseudo_random_rgbf16(w);
     let mut out_scalar = std::vec![half::f16::ZERO; w * 3];
     let mut out_neon = std::vec![half::f16::ZERO; w * 3];
-    scalar::rgbf16_to_rgb_f16_row(&input, &mut out_scalar, w);
+    scalar::rgbf16_to_rgb_f16_row::<false>(&input, &mut out_scalar, w);
     unsafe {
-      rgbf16_to_rgb_f16_row(&input, &mut out_neon, w);
+      rgbf16_to_rgb_f16_row::<false>(&input, &mut out_neon, w);
     }
     assert_eq!(out_scalar, out_neon, "width {w}");
     // Lossless: output should equal input bit-exact.
     assert_eq!(out_neon, input[..w * 3], "lossless width {w}");
   }
 }
+
+// ---- BE parity tests — Rgbf32 -----------------------------------------------
+//
+// For each kernel: byte-swap the LE f32 inputs into a BE buffer, call the
+// kernel with `BE=true`, and assert the output matches the LE run (`BE=false`).
+
+/// Build a BE-encoded f32 slice by byte-swapping every 32-bit element.
+fn be_rgbf32(le: &[f32]) -> std::vec::Vec<f32> {
+  le.iter()
+    .map(|v| f32::from_bits(v.to_bits().swap_bytes()))
+    .collect()
+}
+
+/// Build a BE-encoded f16 slice by byte-swapping every 16-bit element.
+fn be_rgbf16(le: &[half::f16]) -> std::vec::Vec<half::f16> {
+  le.iter()
+    .map(|v| half::f16::from_bits(v.to_bits().swap_bytes()))
+    .collect()
+}
+
+#[test]
+#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")]
+fn neon_rgbf32_to_rgb_be_matches_le() {
+  for w in [1usize, 4, 7, 16, 33, 1920, 1921] {
+    let le_in = pseudo_random_rgbf32(w);
+    let be_in = be_rgbf32(&le_in);
+    let mut out_le = std::vec![0u8; w * 3];
+    let mut out_be = std::vec![0u8; w * 3];
+    unsafe {
+      rgbf32_to_rgb_row::<false>(&le_in, &mut out_le, w);
+      rgbf32_to_rgb_row::<true>(&be_in, &mut out_be, w);
+    }
+    assert_eq!(out_le, out_be, "NEON rgbf32_to_rgb BE parity width {w}");
+  }
+}
+
+#[test]
+#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")]
+fn neon_rgbf32_to_rgba_be_matches_le() {
+  for w in [1usize, 4, 7, 16, 33, 1920, 1921] {
+    let le_in = pseudo_random_rgbf32(w);
+    let be_in = be_rgbf32(&le_in);
+    let mut out_le = std::vec![0u8; w * 4];
+    let mut out_be = std::vec![0u8; w * 4];
+    unsafe {
+      rgbf32_to_rgba_row::<false>(&le_in, &mut out_le, w);
+      rgbf32_to_rgba_row::<true>(&be_in, &mut out_be, w);
+    }
+    assert_eq!(out_le, out_be, "NEON rgbf32_to_rgba BE parity width {w}");
+  }
+}
+
+#[test]
+#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")]
+fn neon_rgbf32_to_rgb_u16_be_matches_le() {
+  for w in [1usize, 4, 7, 16, 33, 1920, 1921] {
+    let le_in = pseudo_random_rgbf32(w);
+    let be_in = be_rgbf32(&le_in);
+    let mut out_le = std::vec![0u16; w * 3];
+    let mut out_be = std::vec![0u16; w * 3];
+    unsafe {
+      rgbf32_to_rgb_u16_row::<false>(&le_in, &mut out_le, w);
+      rgbf32_to_rgb_u16_row::<true>(&be_in, &mut out_be, w);
+    }
+    assert_eq!(out_le, out_be, "NEON rgbf32_to_rgb_u16 BE parity width {w}");
+  }
+}
+
+#[test]
+#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")]
+fn neon_rgbf32_to_rgba_u16_be_matches_le() {
+  for w in [1usize, 4, 7, 16, 33, 1920, 1921] {
+    let le_in = pseudo_random_rgbf32(w);
+    let be_in = be_rgbf32(&le_in);
+    let mut out_le = std::vec![0u16; w * 4];
+    let mut out_be = std::vec![0u16; w * 4];
+    unsafe {
+      rgbf32_to_rgba_u16_row::<false>(&le_in, &mut out_le, w);
+      rgbf32_to_rgba_u16_row::<true>(&be_in, &mut out_be, w);
+    }
+    assert_eq!(
+      out_le, out_be,
+      "NEON rgbf32_to_rgba_u16 BE parity width {w}"
+    );
+  }
+}
+
+#[test]
+#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")]
+fn neon_rgbf32_to_rgb_f32_be_is_byteswap() {
+  for w in [1usize, 4, 7, 16, 33, 1920, 1921] {
+    let le_in = pseudo_random_rgbf32(w);
+    let be_in = be_rgbf32(&le_in);
+    let mut out_le = std::vec![0.0f32; w * 3];
+    let mut out_be = std::vec![0.0f32; w * 3];
+    unsafe {
+      rgbf32_to_rgb_f32_row::<false>(&le_in, &mut out_le, w);
+      rgbf32_to_rgb_f32_row::<true>(&be_in, &mut out_be, w);
+    }
+    // BE path byte-swaps each f32, producing host-native = same as LE.
+    assert_eq!(out_le, out_be, "NEON rgbf32_to_rgb_f32 BE parity width {w}");
+  }
+}
+
+// ---- BE parity tests — Rgbf16 -----------------------------------------------
+
+#[test]
+#[cfg_attr(
+  miri,
+  ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri"
+)]
+fn neon_rgbf16_to_rgb_be_matches_le() {
+  if !std::arch::is_aarch64_feature_detected!("fp16") {
+    return;
+  }
+  for w in [1usize, 4, 7, 16, 33, 1920, 1921] {
+    let le_in = pseudo_random_rgbf16(w);
+    let be_in = be_rgbf16(&le_in);
+    let mut out_le = std::vec![0u8; w * 3];
+    let mut out_be = std::vec![0u8; w * 3];
+    unsafe {
+      rgbf16_to_rgb_row::<false>(&le_in, &mut out_le, w);
+      rgbf16_to_rgb_row::<true>(&be_in, &mut out_be, w);
+    }
+    assert_eq!(out_le, out_be, "NEON rgbf16_to_rgb BE parity width {w}");
+  }
+}
+
+#[test]
+#[cfg_attr(
+  miri,
+  ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri"
+)]
+fn neon_rgbf16_to_rgba_be_matches_le() {
+  if !std::arch::is_aarch64_feature_detected!("fp16") {
+    return;
+  }
+  for w in [1usize, 4, 7, 16, 33, 1920, 1921] {
+    let le_in = pseudo_random_rgbf16(w);
+    let be_in = be_rgbf16(&le_in);
+    let mut out_le = std::vec![0u8; w * 4];
+    let mut out_be = std::vec![0u8; w * 4];
+    unsafe {
+      rgbf16_to_rgba_row::<false>(&le_in, &mut out_le, w);
+      rgbf16_to_rgba_row::<true>(&be_in, &mut out_be, w);
+    }
+    assert_eq!(out_le, out_be, "NEON rgbf16_to_rgba BE parity width {w}");
+  }
+}
+
+#[test]
+#[cfg_attr(
+  miri,
+  ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri"
+)]
+fn neon_rgbf16_to_rgb_u16_be_matches_le() {
+  if !std::arch::is_aarch64_feature_detected!("fp16") {
+    return;
+  }
+  for w in [1usize, 4, 7, 16, 33, 1920, 1921] {
+    let le_in = pseudo_random_rgbf16(w);
+    let be_in = be_rgbf16(&le_in);
+    let mut out_le = std::vec![0u16; w * 3];
+    let mut out_be = std::vec![0u16; w * 3];
+    unsafe {
+      rgbf16_to_rgb_u16_row::<false>(&le_in, &mut out_le, w);
+      rgbf16_to_rgb_u16_row::<true>(&be_in, &mut out_be, w);
+    }
+    assert_eq!(out_le, out_be, "NEON rgbf16_to_rgb_u16 BE parity width {w}");
+  }
+}
+
+#[test]
+#[cfg_attr(
+  miri,
+  ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri"
+)]
+fn neon_rgbf16_to_rgba_u16_be_matches_le() {
+  if !std::arch::is_aarch64_feature_detected!("fp16") {
+    return;
+  }
+  for w in [1usize, 4, 7, 16, 33, 1920, 1921] {
+    let le_in = pseudo_random_rgbf16(w);
+    let be_in = be_rgbf16(&le_in);
+    let mut out_le = std::vec![0u16; w * 4];
+    let mut out_be = std::vec![0u16; w * 4];
+    unsafe {
+      rgbf16_to_rgba_u16_row::<false>(&le_in, &mut out_le, w);
+      rgbf16_to_rgba_u16_row::<true>(&be_in, &mut out_be, w);
+    }
+    assert_eq!(
+      out_le, out_be,
+      "NEON rgbf16_to_rgba_u16 BE parity width {w}"
+    );
+  }
+}
+
+#[test]
+#[cfg_attr(
+  miri,
+  ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri"
+)]
+fn neon_rgbf16_to_rgb_f32_be_matches_le() {
+  if !std::arch::is_aarch64_feature_detected!("fp16") {
+    return;
+  }
+  for w in [1usize, 4, 7, 16, 33, 1920, 1921] {
+    let le_in = pseudo_random_rgbf16(w);
+    let be_in = be_rgbf16(&le_in);
+    let mut out_le = std::vec![0.0f32; w * 3];
+    let mut out_be = std::vec![0.0f32; w * 3];
+    unsafe {
+      rgbf16_to_rgb_f32_row::<false>(&le_in, &mut out_le, w);
+      rgbf16_to_rgb_f32_row::<true>(&be_in, &mut out_be, w);
+    }
+    assert_eq!(out_le, out_be, "NEON rgbf16_to_rgb_f32 BE parity width {w}");
+  }
+}
+
+#[test]
+#[cfg_attr(
+  miri,
+  ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri"
+)]
+fn neon_rgbf16_to_rgb_f16_be_is_byteswap() {
+  for w in [1usize, 4, 7, 16, 33, 1920, 1921] {
+    let le_in = pseudo_random_rgbf16(w);
+    let be_in = be_rgbf16(&le_in);
+    let mut out_le = std::vec![half::f16::ZERO; w * 3];
+    let mut out_be = std::vec![half::f16::ZERO; w * 3];
+    unsafe {
+      rgbf16_to_rgb_f16_row::<false>(&le_in, &mut out_le, w);
+      rgbf16_to_rgb_f16_row::<true>(&be_in, &mut out_be, w);
+    }
+    // BE byte-swap should reconstruct original LE output bit-exact.
+    assert_eq!(out_le, out_be, "NEON rgbf16_to_rgb_f16 BE parity width {w}");
+  }
+}