diff --git a/src/row/arch/neon/gray.rs b/src/row/arch/neon/gray.rs
index a4d24371..15e2bf7c 100644
--- a/src/row/arch/neon/gray.rs
+++ b/src/row/arch/neon/gray.rs
@@ -15,7 +15,10 @@
 use core::arch::aarch64::*;
 
-use crate::row::scalar::{bits_mask, gray as scalar};
+use crate::row::{
+    arch::neon::endian::{load_endian_u16x8, load_endian_u32x4},
+    scalar::{bits_mask, gray as scalar},
+};
 
 // ---- helpers -----------------------------------------------------------------
 
@@ -179,7 +182,7 @@ pub(crate) unsafe fn gray8_to_hsv_row(
 /// NEON must be available. Slices sized correctly for `width`.
 #[inline]
 #[target_feature(enable = "neon")]
-pub(crate) unsafe fn gray_n_to_rgb_row<const BITS: usize>(
+pub(crate) unsafe fn gray_n_to_rgb_row<const BITS: usize, const BE: bool>(
     y_plane: &[u16],
     out: &mut [u8],
     width: usize,
@@ -188,7 +191,7 @@ pub(crate) unsafe fn gray_n_to_rgb_row<const BITS: usize>(
     debug_assert!(y_plane.len() >= width);
     debug_assert!(out.len() >= width * 3);
     if !full_range {
-        return scalar::gray_n_to_rgb_row::<BITS>(y_plane, out, width, full_range);
+        return scalar::gray_n_to_rgb_row::<BITS, BE>(y_plane, out, width, full_range);
     }
     let shift = (BITS - 8) as i32;
     let mask = bits_mask::<BITS>();
@@ -196,7 +199,7 @@ pub(crate) unsafe fn gray_n_to_rgb_row<const BITS: usize>(
     unsafe {
         let mask_v = vdupq_n_u16(mask);
         while x + 8 <= width {
-            let raw = vld1q_u16(y_plane.as_ptr().add(x));
+            let raw = load_endian_u16x8::<BE>(y_plane.as_ptr().cast::<u8>().add(x * 2));
             let masked = vandq_u16(raw, mask_v);
             let shifted = vshlq_u16(masked, vdupq_n_s16(-(shift as i16)));
             // narrow u16x8 → u8x8
@@ -208,7 +211,7 @@ pub(crate) unsafe fn gray_n_to_rgb_row<const BITS: usize>(
         }
     }
     if x < width {
-        scalar::gray_n_to_rgb_row::<BITS>(
+        scalar::gray_n_to_rgb_row::<BITS, BE>(
             &y_plane[x..width],
             &mut out[x * 3..width * 3],
             width - x,
@@ -225,7 +228,7 @@ pub(crate) unsafe fn gray_n_to_rgb_row<const BITS: usize>(
 /// NEON must be available. Slices sized correctly for `width`.
 #[inline]
 #[target_feature(enable = "neon")]
-pub(crate) unsafe fn gray_n_to_rgba_row<const BITS: usize>(
+pub(crate) unsafe fn gray_n_to_rgba_row<const BITS: usize, const BE: bool>(
     y_plane: &[u16],
     out: &mut [u8],
     width: usize,
@@ -234,7 +237,7 @@ pub(crate) unsafe fn gray_n_to_rgba_row<const BITS: usize>(
     debug_assert!(y_plane.len() >= width);
     debug_assert!(out.len() >= width * 4);
     if !full_range {
-        return scalar::gray_n_to_rgba_row::<BITS>(y_plane, out, width, full_range);
+        return scalar::gray_n_to_rgba_row::<BITS, BE>(y_plane, out, width, full_range);
     }
     let shift = (BITS - 8) as i32;
     let mask = bits_mask::<BITS>();
@@ -243,7 +246,7 @@ pub(crate) unsafe fn gray_n_to_rgba_row<const BITS: usize>(
         let mask_v = vdupq_n_u16(mask);
         let alpha = vdup_n_u8(0xFF);
         while x + 8 <= width {
-            let raw = vld1q_u16(y_plane.as_ptr().add(x));
+            let raw = load_endian_u16x8::<BE>(y_plane.as_ptr().cast::<u8>().add(x * 2));
             let masked = vandq_u16(raw, mask_v);
             let shifted = vshlq_u16(masked, vdupq_n_s16(-(shift as i16)));
             let narrow = vmovn_u16(shifted);
@@ -253,7 +256,7 @@ pub(crate) unsafe fn gray_n_to_rgba_row<const BITS: usize>(
         }
     }
     if x < width {
-        scalar::gray_n_to_rgba_row::<BITS>(
+        scalar::gray_n_to_rgba_row::<BITS, BE>(
             &y_plane[x..width],
             &mut out[x * 4..width * 4],
             width - x,
@@ -270,7 +273,7 @@ pub(crate) unsafe fn gray_n_to_rgba_row<const BITS: usize>(
 /// NEON must be available.
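Note: the `arch::neon::endian` module pulled in by the new import (`load_endian_u16x8`, `load_endian_u32x4`) is not included in this diff. A minimal sketch of what such helpers might look like, assuming each takes a byte pointer to 16 bytes of plane data and reverses per-lane byte order when `BE` is true; the real implementation may differ:

```rust
// Hypothetical sketch of the endian helpers imported above (module not shown in
// this diff). Assumes an unaligned 16-byte load from a *byte* pointer, then a
// per-lane byte swap that only exists in the BE = true instantiation.
use core::arch::aarch64::*;

#[inline]
#[target_feature(enable = "neon")]
pub(crate) unsafe fn load_endian_u16x8<const BE: bool>(ptr: *const u8) -> uint16x8_t {
    let bytes = vld1q_u8(ptr);
    // vrev16q_u8 reverses the two bytes inside every 16-bit lane.
    let bytes = if BE { vrev16q_u8(bytes) } else { bytes };
    vreinterpretq_u16_u8(bytes)
}

#[inline]
#[target_feature(enable = "neon")]
pub(crate) unsafe fn load_endian_u32x4<const BE: bool>(ptr: *const u8) -> uint32x4_t {
    let bytes = vld1q_u8(ptr);
    // vrev32q_u8 reverses the four bytes inside every 32-bit lane.
    let bytes = if BE { vrev32q_u8(bytes) } else { bytes };
    vreinterpretq_u32_u8(bytes)
}
```

With `BE` as a const generic, the `BE = false` instantiation keeps a plain 16-byte load and the swap disappears at compile time, which is why the kernels below can move from `vld1q_u16(ptr.add(x))` to `load_endian_u16x8::<BE>(ptr.cast::<u8>().add(x * 2))` with no extra work expected on the little-endian path.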
#[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn gray_n_to_rgb_u16_row( +pub(crate) unsafe fn gray_n_to_rgb_u16_row( y_plane: &[u16], out: &mut [u16], width: usize, @@ -279,14 +282,14 @@ pub(crate) unsafe fn gray_n_to_rgb_u16_row( debug_assert!(y_plane.len() >= width); debug_assert!(out.len() >= width * 3); if !full_range { - return scalar::gray_n_to_rgb_u16_row::(y_plane, out, width, full_range); + return scalar::gray_n_to_rgb_u16_row::(y_plane, out, width, full_range); } let mask = bits_mask::(); let mut x = 0usize; unsafe { let mask_v = vdupq_n_u16(mask); while x + 8 <= width { - let raw = vld1q_u16(y_plane.as_ptr().add(x)); + let raw = load_endian_u16x8::(y_plane.as_ptr().cast::().add(x * 2)); let y = vandq_u16(raw, mask_v); let rgb = uint16x8x3_t(y, y, y); vst3q_u16(out.as_mut_ptr().add(x * 3), rgb); @@ -294,7 +297,7 @@ pub(crate) unsafe fn gray_n_to_rgb_u16_row( } } if x < width { - scalar::gray_n_to_rgb_u16_row::( + scalar::gray_n_to_rgb_u16_row::( &y_plane[x..width], &mut out[x * 3..width * 3], width - x, @@ -311,7 +314,7 @@ pub(crate) unsafe fn gray_n_to_rgb_u16_row( /// NEON must be available. #[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn gray_n_to_rgba_u16_row( +pub(crate) unsafe fn gray_n_to_rgba_u16_row( y_plane: &[u16], out: &mut [u16], width: usize, @@ -320,7 +323,7 @@ pub(crate) unsafe fn gray_n_to_rgba_u16_row( debug_assert!(y_plane.len() >= width); debug_assert!(out.len() >= width * 4); if !full_range { - return scalar::gray_n_to_rgba_u16_row::(y_plane, out, width, full_range); + return scalar::gray_n_to_rgba_u16_row::(y_plane, out, width, full_range); } let mask = bits_mask::(); let mut x = 0usize; @@ -328,7 +331,7 @@ pub(crate) unsafe fn gray_n_to_rgba_u16_row( let mask_v = vdupq_n_u16(mask); let alpha_v = vdupq_n_u16(mask); // full-range max for BITS while x + 8 <= width { - let raw = vld1q_u16(y_plane.as_ptr().add(x)); + let raw = load_endian_u16x8::(y_plane.as_ptr().cast::().add(x * 2)); let y = vandq_u16(raw, mask_v); let rgba = uint16x8x4_t(y, y, y, alpha_v); vst4q_u16(out.as_mut_ptr().add(x * 4), rgba); @@ -336,7 +339,7 @@ pub(crate) unsafe fn gray_n_to_rgba_u16_row( } } if x < width { - scalar::gray_n_to_rgba_u16_row::( + scalar::gray_n_to_rgba_u16_row::( &y_plane[x..width], &mut out[x * 4..width * 4], width - x, @@ -353,7 +356,7 @@ pub(crate) unsafe fn gray_n_to_rgba_u16_row( /// NEON must be available. #[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn gray_n_to_luma_row( +pub(crate) unsafe fn gray_n_to_luma_row( y_plane: &[u16], out: &mut [u8], width: usize, @@ -366,7 +369,7 @@ pub(crate) unsafe fn gray_n_to_luma_row( unsafe { let mask_v = vdupq_n_u16(mask); while x + 8 <= width { - let raw = vld1q_u16(y_plane.as_ptr().add(x)); + let raw = load_endian_u16x8::(y_plane.as_ptr().cast::().add(x * 2)); let masked = vandq_u16(raw, mask_v); let shifted = vshlq_u16(masked, vdupq_n_s16(-(shift as i16))); let narrow = vmovn_u16(shifted); @@ -375,7 +378,7 @@ pub(crate) unsafe fn gray_n_to_luma_row( } } if x < width { - scalar::gray_n_to_luma_row::(&y_plane[x..width], &mut out[x..width], width - x); + scalar::gray_n_to_luma_row::(&y_plane[x..width], &mut out[x..width], width - x); } } @@ -387,7 +390,7 @@ pub(crate) unsafe fn gray_n_to_luma_row( /// NEON must be available. 
#[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn gray_n_to_luma_u16_row( +pub(crate) unsafe fn gray_n_to_luma_u16_row( y_plane: &[u16], out: &mut [u16], width: usize, @@ -399,14 +402,14 @@ pub(crate) unsafe fn gray_n_to_luma_u16_row( unsafe { let mask_v = vdupq_n_u16(mask); while x + 8 <= width { - let raw = vld1q_u16(y_plane.as_ptr().add(x)); + let raw = load_endian_u16x8::(y_plane.as_ptr().cast::().add(x * 2)); let masked = vandq_u16(raw, mask_v); vst1q_u16(out.as_mut_ptr().add(x), masked); x += 8; } } if x < width { - scalar::gray_n_to_luma_u16_row::(&y_plane[x..width], &mut out[x..width], width - x); + scalar::gray_n_to_luma_u16_row::(&y_plane[x..width], &mut out[x..width], width - x); } } @@ -419,7 +422,7 @@ pub(crate) unsafe fn gray_n_to_luma_u16_row( /// NEON must be available. #[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn gray_n_to_hsv_row( +pub(crate) unsafe fn gray_n_to_hsv_row( y_plane: &[u16], h_out: &mut [u8], s_out: &mut [u8], @@ -432,7 +435,7 @@ pub(crate) unsafe fn gray_n_to_hsv_row( debug_assert!(s_out.len() >= width); debug_assert!(v_out.len() >= width); if !full_range { - return scalar::gray_n_to_hsv_row::(y_plane, h_out, s_out, v_out, width, full_range); + return scalar::gray_n_to_hsv_row::(y_plane, h_out, s_out, v_out, width, full_range); } let shift = (BITS - 8) as i32; let mask = bits_mask::(); @@ -441,7 +444,7 @@ pub(crate) unsafe fn gray_n_to_hsv_row( let mask_v = vdupq_n_u16(mask); let zero = vdup_n_u8(0); while x + 8 <= width { - let raw = vld1q_u16(y_plane.as_ptr().add(x)); + let raw = load_endian_u16x8::(y_plane.as_ptr().cast::().add(x * 2)); let masked = vandq_u16(raw, mask_v); let shifted = vshlq_u16(masked, vdupq_n_s16(-(shift as i16))); let narrow = vmovn_u16(shifted); @@ -452,7 +455,7 @@ pub(crate) unsafe fn gray_n_to_hsv_row( } } if x < width { - scalar::gray_n_to_hsv_row::( + scalar::gray_n_to_hsv_row::( &y_plane[x..width], &mut h_out[x..width], &mut s_out[x..width], @@ -473,7 +476,7 @@ pub(crate) unsafe fn gray_n_to_hsv_row( /// NEON must be available. #[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn gray16_to_rgb_row( +pub(crate) unsafe fn gray16_to_rgb_row( y_plane: &[u16], out: &mut [u8], width: usize, @@ -482,12 +485,12 @@ pub(crate) unsafe fn gray16_to_rgb_row( debug_assert!(y_plane.len() >= width); debug_assert!(out.len() >= width * 3); if !full_range { - return scalar::gray16_to_rgb_row(y_plane, out, width, full_range); + return scalar::gray16_to_rgb_row::(y_plane, out, width, full_range); } let mut x = 0usize; unsafe { while x + 8 <= width { - let raw = vld1q_u16(y_plane.as_ptr().add(x)); + let raw = load_endian_u16x8::(y_plane.as_ptr().cast::().add(x * 2)); let y8 = vshrn_n_u16::<8>(raw); let rgb = uint8x8x3_t(y8, y8, y8); vst3_u8(out.as_mut_ptr().add(x * 3), rgb); @@ -495,7 +498,7 @@ pub(crate) unsafe fn gray16_to_rgb_row( } } if x < width { - scalar::gray16_to_rgb_row( + scalar::gray16_to_rgb_row::( &y_plane[x..width], &mut out[x * 3..width * 3], width - x, @@ -512,7 +515,7 @@ pub(crate) unsafe fn gray16_to_rgb_row( /// NEON must be available. 
#[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn gray16_to_rgba_row( +pub(crate) unsafe fn gray16_to_rgba_row( y_plane: &[u16], out: &mut [u8], width: usize, @@ -521,13 +524,13 @@ pub(crate) unsafe fn gray16_to_rgba_row( debug_assert!(y_plane.len() >= width); debug_assert!(out.len() >= width * 4); if !full_range { - return scalar::gray16_to_rgba_row(y_plane, out, width, full_range); + return scalar::gray16_to_rgba_row::(y_plane, out, width, full_range); } let mut x = 0usize; unsafe { let alpha = vdup_n_u8(0xFF); while x + 8 <= width { - let raw = vld1q_u16(y_plane.as_ptr().add(x)); + let raw = load_endian_u16x8::(y_plane.as_ptr().cast::().add(x * 2)); let y8 = vshrn_n_u16::<8>(raw); let rgba = uint8x8x4_t(y8, y8, y8, alpha); vst4_u8(out.as_mut_ptr().add(x * 4), rgba); @@ -535,7 +538,7 @@ pub(crate) unsafe fn gray16_to_rgba_row( } } if x < width { - scalar::gray16_to_rgba_row( + scalar::gray16_to_rgba_row::( &y_plane[x..width], &mut out[x * 4..width * 4], width - x, @@ -552,7 +555,7 @@ pub(crate) unsafe fn gray16_to_rgba_row( /// NEON must be available. #[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn gray16_to_rgb_u16_row( +pub(crate) unsafe fn gray16_to_rgb_u16_row( y_plane: &[u16], out: &mut [u16], width: usize, @@ -561,19 +564,19 @@ pub(crate) unsafe fn gray16_to_rgb_u16_row( debug_assert!(y_plane.len() >= width); debug_assert!(out.len() >= width * 3); if !full_range { - return scalar::gray16_to_rgb_u16_row(y_plane, out, width, full_range); + return scalar::gray16_to_rgb_u16_row::(y_plane, out, width, full_range); } let mut x = 0usize; unsafe { while x + 8 <= width { - let y = vld1q_u16(y_plane.as_ptr().add(x)); + let y = load_endian_u16x8::(y_plane.as_ptr().cast::().add(x * 2)); let rgb = uint16x8x3_t(y, y, y); vst3q_u16(out.as_mut_ptr().add(x * 3), rgb); x += 8; } } if x < width { - scalar::gray16_to_rgb_u16_row( + scalar::gray16_to_rgb_u16_row::( &y_plane[x..width], &mut out[x * 3..width * 3], width - x, @@ -590,7 +593,7 @@ pub(crate) unsafe fn gray16_to_rgb_u16_row( /// NEON must be available. #[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn gray16_to_rgba_u16_row( +pub(crate) unsafe fn gray16_to_rgba_u16_row( y_plane: &[u16], out: &mut [u16], width: usize, @@ -599,20 +602,20 @@ pub(crate) unsafe fn gray16_to_rgba_u16_row( debug_assert!(y_plane.len() >= width); debug_assert!(out.len() >= width * 4); if !full_range { - return scalar::gray16_to_rgba_u16_row(y_plane, out, width, full_range); + return scalar::gray16_to_rgba_u16_row::(y_plane, out, width, full_range); } let mut x = 0usize; unsafe { let alpha = vdupq_n_u16(0xFFFF); while x + 8 <= width { - let y = vld1q_u16(y_plane.as_ptr().add(x)); + let y = load_endian_u16x8::(y_plane.as_ptr().cast::().add(x * 2)); let rgba = uint16x8x4_t(y, y, y, alpha); vst4q_u16(out.as_mut_ptr().add(x * 4), rgba); x += 8; } } if x < width { - scalar::gray16_to_rgba_u16_row( + scalar::gray16_to_rgba_u16_row::( &y_plane[x..width], &mut out[x * 4..width * 4], width - x, @@ -629,20 +632,24 @@ pub(crate) unsafe fn gray16_to_rgba_u16_row( /// NEON must be available. 
#[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn gray16_to_luma_row(y_plane: &[u16], out: &mut [u8], width: usize) { +pub(crate) unsafe fn gray16_to_luma_row( + y_plane: &[u16], + out: &mut [u8], + width: usize, +) { debug_assert!(y_plane.len() >= width); debug_assert!(out.len() >= width); let mut x = 0usize; unsafe { while x + 8 <= width { - let raw = vld1q_u16(y_plane.as_ptr().add(x)); + let raw = load_endian_u16x8::(y_plane.as_ptr().cast::().add(x * 2)); let y8 = vshrn_n_u16::<8>(raw); vst1_u8(out.as_mut_ptr().add(x), y8); x += 8; } } if x < width { - scalar::gray16_to_luma_row(&y_plane[x..width], &mut out[x..width], width - x); + scalar::gray16_to_luma_row::(&y_plane[x..width], &mut out[x..width], width - x); } } @@ -654,19 +661,23 @@ pub(crate) unsafe fn gray16_to_luma_row(y_plane: &[u16], out: &mut [u8], width: /// NEON must be available. #[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn gray16_to_luma_u16_row(y_plane: &[u16], out: &mut [u16], width: usize) { +pub(crate) unsafe fn gray16_to_luma_u16_row( + y_plane: &[u16], + out: &mut [u16], + width: usize, +) { debug_assert!(y_plane.len() >= width); debug_assert!(out.len() >= width); let mut x = 0usize; unsafe { while x + 8 <= width { - let y = vld1q_u16(y_plane.as_ptr().add(x)); + let y = load_endian_u16x8::(y_plane.as_ptr().cast::().add(x * 2)); vst1q_u16(out.as_mut_ptr().add(x), y); x += 8; } } if x < width { - scalar::gray16_to_luma_u16_row(&y_plane[x..width], &mut out[x..width], width - x); + scalar::gray16_to_luma_u16_row::(&y_plane[x..width], &mut out[x..width], width - x); } } @@ -679,7 +690,7 @@ pub(crate) unsafe fn gray16_to_luma_u16_row(y_plane: &[u16], out: &mut [u16], wi /// NEON must be available. #[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn gray16_to_hsv_row( +pub(crate) unsafe fn gray16_to_hsv_row( y_plane: &[u16], h_out: &mut [u8], s_out: &mut [u8], @@ -692,13 +703,13 @@ pub(crate) unsafe fn gray16_to_hsv_row( debug_assert!(s_out.len() >= width); debug_assert!(v_out.len() >= width); if !full_range { - return scalar::gray16_to_hsv_row(y_plane, h_out, s_out, v_out, width, full_range); + return scalar::gray16_to_hsv_row::(y_plane, h_out, s_out, v_out, width, full_range); } let mut x = 0usize; unsafe { let zero = vdup_n_u8(0); while x + 8 <= width { - let raw = vld1q_u16(y_plane.as_ptr().add(x)); + let raw = load_endian_u16x8::(y_plane.as_ptr().cast::().add(x * 2)); let y8 = vshrn_n_u16::<8>(raw); vst1_u8(h_out.as_mut_ptr().add(x), zero); vst1_u8(s_out.as_mut_ptr().add(x), zero); @@ -707,7 +718,7 @@ pub(crate) unsafe fn gray16_to_hsv_row( } } if x < width { - scalar::gray16_to_hsv_row( + scalar::gray16_to_hsv_row::( &y_plane[x..width], &mut h_out[x..width], &mut s_out[x..width], @@ -728,7 +739,11 @@ pub(crate) unsafe fn gray16_to_hsv_row( /// NEON must be available. `y_plane.len() >= width`. `out.len() >= width * 3`. 
#[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn grayf32_to_rgb_row(y_plane: &[f32], out: &mut [u8], width: usize) { +pub(crate) unsafe fn grayf32_to_rgb_row( + y_plane: &[f32], + out: &mut [u8], + width: usize, +) { use crate::row::scalar::grayf32 as scalar; debug_assert!(y_plane.len() >= width); debug_assert!(out.len() >= width * 3); @@ -738,8 +753,12 @@ pub(crate) unsafe fn grayf32_to_rgb_row(y_plane: &[f32], out: &mut [u8], width: let mut x = 0usize; unsafe { while x + 8 <= width { - let y0 = vld1q_f32(y_plane.as_ptr().add(x)); - let y1 = vld1q_f32(y_plane.as_ptr().add(x + 4)); + let y0 = vreinterpretq_f32_u32(load_endian_u32x4::( + y_plane.as_ptr().cast::().add(x * 4), + )); + let y1 = vreinterpretq_f32_u32(load_endian_u32x4::( + y_plane.as_ptr().cast::().add((x + 4) * 4), + )); let c0 = vmulq_f32(vmaxq_f32(vminq_f32(y0, one), zero), scale); let c1 = vmulq_f32(vmaxq_f32(vminq_f32(y1, one), zero), scale); // vcvtaq_u32_f32: round-to-nearest-even, no FPCR manipulation needed. @@ -754,7 +773,7 @@ pub(crate) unsafe fn grayf32_to_rgb_row(y_plane: &[f32], out: &mut [u8], width: } } if x < width { - scalar::grayf32_to_rgb_row(&y_plane[x..width], &mut out[x * 3..width * 3], width - x); + scalar::grayf32_to_rgb_row::(&y_plane[x..width], &mut out[x * 3..width * 3], width - x); } } @@ -764,7 +783,11 @@ pub(crate) unsafe fn grayf32_to_rgb_row(y_plane: &[f32], out: &mut [u8], width: /// NEON must be available. `y_plane.len() >= width`. `out.len() >= width * 4`. #[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn grayf32_to_rgba_row(y_plane: &[f32], out: &mut [u8], width: usize) { +pub(crate) unsafe fn grayf32_to_rgba_row( + y_plane: &[f32], + out: &mut [u8], + width: usize, +) { use crate::row::scalar::grayf32 as scalar; debug_assert!(y_plane.len() >= width); debug_assert!(out.len() >= width * 4); @@ -775,8 +798,12 @@ pub(crate) unsafe fn grayf32_to_rgba_row(y_plane: &[f32], out: &mut [u8], width: let mut x = 0usize; unsafe { while x + 8 <= width { - let y0 = vld1q_f32(y_plane.as_ptr().add(x)); - let y1 = vld1q_f32(y_plane.as_ptr().add(x + 4)); + let y0 = vreinterpretq_f32_u32(load_endian_u32x4::( + y_plane.as_ptr().cast::().add(x * 4), + )); + let y1 = vreinterpretq_f32_u32(load_endian_u32x4::( + y_plane.as_ptr().cast::().add((x + 4) * 4), + )); let c0 = vmulq_f32(vmaxq_f32(vminq_f32(y0, one), zero), scale); let c1 = vmulq_f32(vmaxq_f32(vminq_f32(y1, one), zero), scale); let u0 = vcvtaq_u32_f32(c0); @@ -788,7 +815,7 @@ pub(crate) unsafe fn grayf32_to_rgba_row(y_plane: &[f32], out: &mut [u8], width: } } if x < width { - scalar::grayf32_to_rgba_row(&y_plane[x..width], &mut out[x * 4..width * 4], width - x); + scalar::grayf32_to_rgba_row::(&y_plane[x..width], &mut out[x * 4..width * 4], width - x); } } @@ -798,7 +825,11 @@ pub(crate) unsafe fn grayf32_to_rgba_row(y_plane: &[f32], out: &mut [u8], width: /// NEON must be available. `y_plane.len() >= width`. `out.len() >= width * 3`. 
#[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn grayf32_to_rgb_u16_row(y_plane: &[f32], out: &mut [u16], width: usize) { +pub(crate) unsafe fn grayf32_to_rgb_u16_row( + y_plane: &[f32], + out: &mut [u16], + width: usize, +) { use crate::row::scalar::grayf32 as scalar; debug_assert!(y_plane.len() >= width); debug_assert!(out.len() >= width * 3); @@ -808,7 +839,9 @@ pub(crate) unsafe fn grayf32_to_rgb_u16_row(y_plane: &[f32], out: &mut [u16], wi let mut x = 0usize; unsafe { while x + 4 <= width { - let y = vld1q_f32(y_plane.as_ptr().add(x)); + let y = vreinterpretq_f32_u32(load_endian_u32x4::( + y_plane.as_ptr().cast::().add(x * 4), + )); let c = vmulq_f32(vmaxq_f32(vminq_f32(y, one), zero), scale); let u32v = vcvtaq_u32_f32(c); let u16v = vqmovn_u32(u32v); // saturating narrow to u16 @@ -818,7 +851,7 @@ pub(crate) unsafe fn grayf32_to_rgb_u16_row(y_plane: &[f32], out: &mut [u16], wi } } if x < width { - scalar::grayf32_to_rgb_u16_row(&y_plane[x..width], &mut out[x * 3..width * 3], width - x); + scalar::grayf32_to_rgb_u16_row::(&y_plane[x..width], &mut out[x * 3..width * 3], width - x); } } @@ -828,7 +861,11 @@ pub(crate) unsafe fn grayf32_to_rgb_u16_row(y_plane: &[f32], out: &mut [u16], wi /// NEON must be available. `y_plane.len() >= width`. `out.len() >= width * 4`. #[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn grayf32_to_rgba_u16_row(y_plane: &[f32], out: &mut [u16], width: usize) { +pub(crate) unsafe fn grayf32_to_rgba_u16_row( + y_plane: &[f32], + out: &mut [u16], + width: usize, +) { use crate::row::scalar::grayf32 as scalar; debug_assert!(y_plane.len() >= width); debug_assert!(out.len() >= width * 4); @@ -839,7 +876,9 @@ pub(crate) unsafe fn grayf32_to_rgba_u16_row(y_plane: &[f32], out: &mut [u16], w let mut x = 0usize; unsafe { while x + 4 <= width { - let y = vld1q_f32(y_plane.as_ptr().add(x)); + let y = vreinterpretq_f32_u32(load_endian_u32x4::( + y_plane.as_ptr().cast::().add(x * 4), + )); let c = vmulq_f32(vmaxq_f32(vminq_f32(y, one), zero), scale); let u16v = vqmovn_u32(vcvtaq_u32_f32(c)); let rgba = uint16x4x4_t(u16v, u16v, u16v, alpha); @@ -848,7 +887,11 @@ pub(crate) unsafe fn grayf32_to_rgba_u16_row(y_plane: &[f32], out: &mut [u16], w } } if x < width { - scalar::grayf32_to_rgba_u16_row(&y_plane[x..width], &mut out[x * 4..width * 4], width - x); + scalar::grayf32_to_rgba_u16_row::( + &y_plane[x..width], + &mut out[x * 4..width * 4], + width - x, + ); } } @@ -859,21 +902,27 @@ pub(crate) unsafe fn grayf32_to_rgba_u16_row(y_plane: &[f32], out: &mut [u16], w #[allow(dead_code)] // dispatcher uses scalar directly for lossless f32 paths #[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn grayf32_to_rgb_f32_row(y_plane: &[f32], out: &mut [f32], width: usize) { +pub(crate) unsafe fn grayf32_to_rgb_f32_row( + y_plane: &[f32], + out: &mut [f32], + width: usize, +) { use crate::row::scalar::grayf32 as scalar; debug_assert!(y_plane.len() >= width); debug_assert!(out.len() >= width * 3); let mut x = 0usize; unsafe { while x + 4 <= width { - let y = vld1q_f32(y_plane.as_ptr().add(x)); + let y = vreinterpretq_f32_u32(load_endian_u32x4::( + y_plane.as_ptr().cast::().add(x * 4), + )); let rgb = float32x4x3_t(y, y, y); vst3q_f32(out.as_mut_ptr().add(x * 3), rgb); x += 4; } } if x < width { - scalar::grayf32_to_rgb_f32_row(&y_plane[x..width], &mut out[x * 3..width * 3], width - x); + scalar::grayf32_to_rgb_f32_row::(&y_plane[x..width], &mut out[x * 3..width * 3], width - x); } } @@ -883,7 +932,11 @@ pub(crate) unsafe fn 
grayf32_to_rgb_f32_row(y_plane: &[f32], out: &mut [f32], wi /// NEON must be available. #[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn grayf32_to_luma_row(y_plane: &[f32], out: &mut [u8], width: usize) { +pub(crate) unsafe fn grayf32_to_luma_row( + y_plane: &[f32], + out: &mut [u8], + width: usize, +) { use crate::row::scalar::grayf32 as scalar; debug_assert!(y_plane.len() >= width); debug_assert!(out.len() >= width); @@ -893,8 +946,12 @@ pub(crate) unsafe fn grayf32_to_luma_row(y_plane: &[f32], out: &mut [u8], width: let mut x = 0usize; unsafe { while x + 8 <= width { - let y0 = vld1q_f32(y_plane.as_ptr().add(x)); - let y1 = vld1q_f32(y_plane.as_ptr().add(x + 4)); + let y0 = vreinterpretq_f32_u32(load_endian_u32x4::( + y_plane.as_ptr().cast::().add(x * 4), + )); + let y1 = vreinterpretq_f32_u32(load_endian_u32x4::( + y_plane.as_ptr().cast::().add((x + 4) * 4), + )); let c0 = vmulq_f32(vmaxq_f32(vminq_f32(y0, one), zero), scale); let c1 = vmulq_f32(vmaxq_f32(vminq_f32(y1, one), zero), scale); let n8 = vmovn_u16(vcombine_u16( @@ -906,7 +963,7 @@ pub(crate) unsafe fn grayf32_to_luma_row(y_plane: &[f32], out: &mut [u8], width: } } if x < width { - scalar::grayf32_to_luma_row(&y_plane[x..width], &mut out[x..width], width - x); + scalar::grayf32_to_luma_row::(&y_plane[x..width], &mut out[x..width], width - x); } } @@ -916,7 +973,11 @@ pub(crate) unsafe fn grayf32_to_luma_row(y_plane: &[f32], out: &mut [u8], width: /// NEON must be available. #[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn grayf32_to_luma_u16_row(y_plane: &[f32], out: &mut [u16], width: usize) { +pub(crate) unsafe fn grayf32_to_luma_u16_row( + y_plane: &[f32], + out: &mut [u16], + width: usize, +) { use crate::row::scalar::grayf32 as scalar; debug_assert!(y_plane.len() >= width); debug_assert!(out.len() >= width); @@ -926,7 +987,9 @@ pub(crate) unsafe fn grayf32_to_luma_u16_row(y_plane: &[f32], out: &mut [u16], w let mut x = 0usize; unsafe { while x + 4 <= width { - let y = vld1q_f32(y_plane.as_ptr().add(x)); + let y = vreinterpretq_f32_u32(load_endian_u32x4::( + y_plane.as_ptr().cast::().add(x * 4), + )); let c = vmulq_f32(vmaxq_f32(vminq_f32(y, one), zero), scale); let u16v = vqmovn_u32(vcvtaq_u32_f32(c)); vst1_u16(out.as_mut_ptr().add(x), u16v); @@ -934,7 +997,7 @@ pub(crate) unsafe fn grayf32_to_luma_u16_row(y_plane: &[f32], out: &mut [u16], w } } if x < width { - scalar::grayf32_to_luma_u16_row(&y_plane[x..width], &mut out[x..width], width - x); + scalar::grayf32_to_luma_u16_row::(&y_plane[x..width], &mut out[x..width], width - x); } } @@ -945,20 +1008,26 @@ pub(crate) unsafe fn grayf32_to_luma_u16_row(y_plane: &[f32], out: &mut [u16], w #[allow(dead_code)] // dispatcher uses scalar directly for lossless f32 paths #[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn grayf32_to_luma_f32_row(y_plane: &[f32], out: &mut [f32], width: usize) { +pub(crate) unsafe fn grayf32_to_luma_f32_row( + y_plane: &[f32], + out: &mut [f32], + width: usize, +) { use crate::row::scalar::grayf32 as scalar; debug_assert!(y_plane.len() >= width); debug_assert!(out.len() >= width); let mut x = 0usize; unsafe { while x + 4 <= width { - let y = vld1q_f32(y_plane.as_ptr().add(x)); + let y = vreinterpretq_f32_u32(load_endian_u32x4::( + y_plane.as_ptr().cast::().add(x * 4), + )); vst1q_f32(out.as_mut_ptr().add(x), y); x += 4; } } if x < width { - scalar::grayf32_to_luma_f32_row(&y_plane[x..width], &mut out[x..width], width - x); + scalar::grayf32_to_luma_f32_row::(&y_plane[x..width], &mut 
out[x..width], width - x); } } @@ -968,7 +1037,7 @@ pub(crate) unsafe fn grayf32_to_luma_f32_row(y_plane: &[f32], out: &mut [f32], w /// NEON must be available. #[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn grayf32_to_hsv_row( +pub(crate) unsafe fn grayf32_to_hsv_row( y_plane: &[f32], h_out: &mut [u8], s_out: &mut [u8], @@ -984,8 +1053,12 @@ pub(crate) unsafe fn grayf32_to_hsv_row( let mut x = 0usize; unsafe { while x + 8 <= width { - let y0 = vld1q_f32(y_plane.as_ptr().add(x)); - let y1 = vld1q_f32(y_plane.as_ptr().add(x + 4)); + let y0 = vreinterpretq_f32_u32(load_endian_u32x4::( + y_plane.as_ptr().cast::().add(x * 4), + )); + let y1 = vreinterpretq_f32_u32(load_endian_u32x4::( + y_plane.as_ptr().cast::().add((x + 4) * 4), + )); let c0 = vmulq_f32(vmaxq_f32(vminq_f32(y0, one), zero_f), scale); let c1 = vmulq_f32(vmaxq_f32(vminq_f32(y1, one), zero_f), scale); let v8 = vmovn_u16(vcombine_u16( @@ -999,7 +1072,7 @@ pub(crate) unsafe fn grayf32_to_hsv_row( } } if x < width { - scalar::grayf32_to_hsv_row( + scalar::grayf32_to_hsv_row::( &y_plane[x..width], &mut h_out[x..width], &mut s_out[x..width], @@ -1238,10 +1311,13 @@ pub(crate) unsafe fn ya8_to_hsv_row( /// NEON must be available. `packed.len() >= width * 2`. `out.len() >= width * 3`. #[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn ya16_to_rgb_row(packed: &[u16], out: &mut [u8], width: usize) { +pub(crate) unsafe fn ya16_to_rgb_row(packed: &[u16], out: &mut [u8], width: usize) { use crate::row::scalar::ya16 as scalar; debug_assert!(packed.len() >= width * 2); debug_assert!(out.len() >= width * 3); + if BE { + return scalar::ya16_to_rgb_row::(packed, out, width); + } let mut x = 0usize; unsafe { while x + 8 <= width { @@ -1253,7 +1329,7 @@ pub(crate) unsafe fn ya16_to_rgb_row(packed: &[u16], out: &mut [u8], width: usiz } } if x < width { - scalar::ya16_to_rgb_row( + scalar::ya16_to_rgb_row::( &packed[x * 2..width * 2], &mut out[x * 3..width * 3], width - x, @@ -1267,10 +1343,17 @@ pub(crate) unsafe fn ya16_to_rgb_row(packed: &[u16], out: &mut [u8], width: usiz /// NEON must be available. #[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn ya16_to_rgba_row(packed: &[u16], out: &mut [u8], width: usize) { +pub(crate) unsafe fn ya16_to_rgba_row( + packed: &[u16], + out: &mut [u8], + width: usize, +) { use crate::row::scalar::ya16 as scalar; debug_assert!(packed.len() >= width * 2); debug_assert!(out.len() >= width * 4); + if BE { + return scalar::ya16_to_rgba_row::(packed, out, width); + } let mut x = 0usize; unsafe { while x + 8 <= width { @@ -1283,7 +1366,7 @@ pub(crate) unsafe fn ya16_to_rgba_row(packed: &[u16], out: &mut [u8], width: usi } } if x < width { - scalar::ya16_to_rgba_row( + scalar::ya16_to_rgba_row::( &packed[x * 2..width * 2], &mut out[x * 4..width * 4], width - x, @@ -1297,10 +1380,17 @@ pub(crate) unsafe fn ya16_to_rgba_row(packed: &[u16], out: &mut [u8], width: usi /// NEON must be available. 
#[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn ya16_to_rgb_u16_row(packed: &[u16], out: &mut [u16], width: usize) { +pub(crate) unsafe fn ya16_to_rgb_u16_row( + packed: &[u16], + out: &mut [u16], + width: usize, +) { use crate::row::scalar::ya16 as scalar; debug_assert!(packed.len() >= width * 2); debug_assert!(out.len() >= width * 3); + if BE { + return scalar::ya16_to_rgb_u16_row::(packed, out, width); + } let mut x = 0usize; unsafe { while x + 8 <= width { @@ -1315,7 +1405,7 @@ pub(crate) unsafe fn ya16_to_rgb_u16_row(packed: &[u16], out: &mut [u16], width: } } if x < width { - scalar::ya16_to_rgb_u16_row( + scalar::ya16_to_rgb_u16_row::( &packed[x * 2..width * 2], &mut out[x * 3..width * 3], width - x, @@ -1329,10 +1419,17 @@ pub(crate) unsafe fn ya16_to_rgb_u16_row(packed: &[u16], out: &mut [u16], width: /// NEON must be available. #[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn ya16_to_rgba_u16_row(packed: &[u16], out: &mut [u16], width: usize) { +pub(crate) unsafe fn ya16_to_rgba_u16_row( + packed: &[u16], + out: &mut [u16], + width: usize, +) { use crate::row::scalar::ya16 as scalar; debug_assert!(packed.len() >= width * 2); debug_assert!(out.len() >= width * 4); + if BE { + return scalar::ya16_to_rgba_u16_row::(packed, out, width); + } let mut x = 0usize; unsafe { while x + 8 <= width { @@ -1349,7 +1446,7 @@ pub(crate) unsafe fn ya16_to_rgba_u16_row(packed: &[u16], out: &mut [u16], width } } if x < width { - scalar::ya16_to_rgba_u16_row( + scalar::ya16_to_rgba_u16_row::( &packed[x * 2..width * 2], &mut out[x * 4..width * 4], width - x, @@ -1363,10 +1460,17 @@ pub(crate) unsafe fn ya16_to_rgba_u16_row(packed: &[u16], out: &mut [u16], width /// NEON must be available. #[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn ya16_to_luma_row(packed: &[u16], out: &mut [u8], width: usize) { +pub(crate) unsafe fn ya16_to_luma_row( + packed: &[u16], + out: &mut [u8], + width: usize, +) { use crate::row::scalar::ya16 as scalar; debug_assert!(packed.len() >= width * 2); debug_assert!(out.len() >= width); + if BE { + return scalar::ya16_to_luma_row::(packed, out, width); + } let mut x = 0usize; unsafe { while x + 8 <= width { @@ -1377,7 +1481,7 @@ pub(crate) unsafe fn ya16_to_luma_row(packed: &[u16], out: &mut [u8], width: usi } } if x < width { - scalar::ya16_to_luma_row(&packed[x * 2..width * 2], &mut out[x..width], width - x); + scalar::ya16_to_luma_row::(&packed[x * 2..width * 2], &mut out[x..width], width - x); } } @@ -1387,10 +1491,17 @@ pub(crate) unsafe fn ya16_to_luma_row(packed: &[u16], out: &mut [u8], width: usi /// NEON must be available. 
#[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn ya16_to_luma_u16_row(packed: &[u16], out: &mut [u16], width: usize) { +pub(crate) unsafe fn ya16_to_luma_u16_row( + packed: &[u16], + out: &mut [u16], + width: usize, +) { use crate::row::scalar::ya16 as scalar; debug_assert!(packed.len() >= width * 2); debug_assert!(out.len() >= width); + if BE { + return scalar::ya16_to_luma_u16_row::(packed, out, width); + } let mut x = 0usize; unsafe { while x + 8 <= width { @@ -1400,7 +1511,7 @@ pub(crate) unsafe fn ya16_to_luma_u16_row(packed: &[u16], out: &mut [u16], width } } if x < width { - scalar::ya16_to_luma_u16_row(&packed[x * 2..width * 2], &mut out[x..width], width - x); + scalar::ya16_to_luma_u16_row::(&packed[x * 2..width * 2], &mut out[x..width], width - x); } } @@ -1410,7 +1521,7 @@ pub(crate) unsafe fn ya16_to_luma_u16_row(packed: &[u16], out: &mut [u16], width /// NEON must be available. #[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn ya16_to_hsv_row( +pub(crate) unsafe fn ya16_to_hsv_row( packed: &[u16], h_out: &mut [u8], s_out: &mut [u8], @@ -1419,6 +1530,9 @@ pub(crate) unsafe fn ya16_to_hsv_row( ) { use crate::row::scalar::ya16 as scalar; debug_assert!(packed.len() >= width * 2); + if BE { + return scalar::ya16_to_hsv_row::(packed, h_out, s_out, v_out, width); + } let mut x = 0usize; unsafe { let zero = vdup_n_u8(0); @@ -1432,7 +1546,7 @@ pub(crate) unsafe fn ya16_to_hsv_row( } } if x < width { - scalar::ya16_to_hsv_row( + scalar::ya16_to_hsv_row::( &packed[x * 2..width * 2], &mut h_out[x..width], &mut s_out[x..width], @@ -1519,8 +1633,8 @@ mod tests { prng16(&mut plane, 0xABCD_1234); let mut simd = std::vec![0u8; w * 3]; let mut scal = std::vec![0u8; w * 3]; - unsafe { super::gray_n_to_rgb_row::<10>(&plane, &mut simd, w, true) }; - scalar::gray_n_to_rgb_row::<10>(&plane, &mut scal, w, true); + unsafe { super::gray_n_to_rgb_row::<10, false>(&plane, &mut simd, w, true) }; + scalar::gray_n_to_rgb_row::<10, false>(&plane, &mut scal, w, true); assert_eq!(simd, scal, "width={w}"); } } @@ -1533,8 +1647,8 @@ mod tests { prng16(&mut plane, 0xDEAD_BEEF); let mut simd = std::vec![0u8; w * 3]; let mut scal = std::vec![0u8; w * 3]; - unsafe { super::gray16_to_rgb_row(&plane, &mut simd, w, true) }; - scalar::gray16_to_rgb_row(&plane, &mut scal, w, true); + unsafe { super::gray16_to_rgb_row::(&plane, &mut simd, w, true) }; + scalar::gray16_to_rgb_row::(&plane, &mut scal, w, true); assert_eq!(simd, scal, "width={w}"); } } @@ -1563,8 +1677,8 @@ mod tests { prng16(&mut plane, 0x1234_5678); let mut simd = std::vec![0u8; w * 3]; let mut scal = std::vec![0u8; w * 3]; - unsafe { super::gray16_to_rgb_row(&plane, &mut simd, w, false) }; - scalar::gray16_to_rgb_row(&plane, &mut scal, w, false); + unsafe { super::gray16_to_rgb_row::(&plane, &mut simd, w, false) }; + scalar::gray16_to_rgb_row::(&plane, &mut scal, w, false); assert_eq!(simd, scal, "width={w} limited-range"); } } @@ -1589,8 +1703,8 @@ mod tests { prng_f32(&mut plane, 0xF32A_0001); let mut simd = std::vec![0u8; w * 3]; let mut scal = std::vec![0u8; w * 3]; - unsafe { super::grayf32_to_rgb_row(&plane, &mut simd, w) }; - sf::grayf32_to_rgb_row(&plane, &mut scal, w); + unsafe { super::grayf32_to_rgb_row::(&plane, &mut simd, w) }; + sf::grayf32_to_rgb_row::(&plane, &mut scal, w); assert_eq!(simd, scal, "width={w}"); } } @@ -1604,8 +1718,8 @@ mod tests { prng_f32(&mut plane, 0xF32A_0002); let mut simd = std::vec![0u8; w * 4]; let mut scal = std::vec![0u8; w * 4]; - unsafe { super::grayf32_to_rgba_row(&plane, &mut 
simd, w) }; - sf::grayf32_to_rgba_row(&plane, &mut scal, w); + unsafe { super::grayf32_to_rgba_row::(&plane, &mut simd, w) }; + sf::grayf32_to_rgba_row::(&plane, &mut scal, w); assert_eq!(simd, scal, "width={w}"); } } @@ -1619,8 +1733,8 @@ mod tests { prng_f32(&mut plane, 0xF32A_0003); let mut simd = std::vec![0u16; w * 3]; let mut scal = std::vec![0u16; w * 3]; - unsafe { super::grayf32_to_rgb_u16_row(&plane, &mut simd, w) }; - sf::grayf32_to_rgb_u16_row(&plane, &mut scal, w); + unsafe { super::grayf32_to_rgb_u16_row::(&plane, &mut simd, w) }; + sf::grayf32_to_rgb_u16_row::(&plane, &mut scal, w); assert_eq!(simd, scal, "width={w}"); } } @@ -1634,8 +1748,8 @@ mod tests { prng_f32(&mut plane, 0xF32A_0004); let mut simd = std::vec![0u16; w * 4]; let mut scal = std::vec![0u16; w * 4]; - unsafe { super::grayf32_to_rgba_u16_row(&plane, &mut simd, w) }; - sf::grayf32_to_rgba_u16_row(&plane, &mut scal, w); + unsafe { super::grayf32_to_rgba_u16_row::(&plane, &mut simd, w) }; + sf::grayf32_to_rgba_u16_row::(&plane, &mut scal, w); assert_eq!(simd, scal, "width={w}"); } } @@ -1649,8 +1763,8 @@ mod tests { prng_f32(&mut plane, 0xF32A_0005); let mut simd = std::vec![0.0f32; w * 3]; let mut scal = std::vec![0.0f32; w * 3]; - unsafe { super::grayf32_to_rgb_f32_row(&plane, &mut simd, w) }; - sf::grayf32_to_rgb_f32_row(&plane, &mut scal, w); + unsafe { super::grayf32_to_rgb_f32_row::(&plane, &mut simd, w) }; + sf::grayf32_to_rgb_f32_row::(&plane, &mut scal, w); assert_eq!(simd, scal, "width={w}"); } } @@ -1664,8 +1778,8 @@ mod tests { prng_f32(&mut plane, 0xF32A_0006); let mut simd = std::vec![0u8; w]; let mut scal = std::vec![0u8; w]; - unsafe { super::grayf32_to_luma_row(&plane, &mut simd, w) }; - sf::grayf32_to_luma_row(&plane, &mut scal, w); + unsafe { super::grayf32_to_luma_row::(&plane, &mut simd, w) }; + sf::grayf32_to_luma_row::(&plane, &mut scal, w); assert_eq!(simd, scal, "width={w}"); } } @@ -1679,8 +1793,8 @@ mod tests { prng_f32(&mut plane, 0xF32A_0007); let mut simd = std::vec![0u16; w]; let mut scal = std::vec![0u16; w]; - unsafe { super::grayf32_to_luma_u16_row(&plane, &mut simd, w) }; - sf::grayf32_to_luma_u16_row(&plane, &mut scal, w); + unsafe { super::grayf32_to_luma_u16_row::(&plane, &mut simd, w) }; + sf::grayf32_to_luma_u16_row::(&plane, &mut scal, w); assert_eq!(simd, scal, "width={w}"); } } @@ -1694,8 +1808,8 @@ mod tests { prng_f32(&mut plane, 0xF32A_0008); let mut simd = std::vec![0.0f32; w]; let mut scal = std::vec![0.0f32; w]; - unsafe { super::grayf32_to_luma_f32_row(&plane, &mut simd, w) }; - sf::grayf32_to_luma_f32_row(&plane, &mut scal, w); + unsafe { super::grayf32_to_luma_f32_row::(&plane, &mut simd, w) }; + sf::grayf32_to_luma_f32_row::(&plane, &mut scal, w); assert_eq!(simd, scal, "width={w}"); } } @@ -1713,8 +1827,8 @@ mod tests { let mut rh = std::vec![0u8; w]; let mut rs = std::vec![0u8; w]; let mut rv = std::vec![0u8; w]; - unsafe { super::grayf32_to_hsv_row(&plane, &mut sh, &mut ss, &mut sv, w) }; - sf::grayf32_to_hsv_row(&plane, &mut rh, &mut rs, &mut rv, w); + unsafe { super::grayf32_to_hsv_row::(&plane, &mut sh, &mut ss, &mut sv, w) }; + sf::grayf32_to_hsv_row::(&plane, &mut rh, &mut rs, &mut rv, w); assert_eq!(sh, rh, "H width={w}"); assert_eq!(ss, rs, "S width={w}"); assert_eq!(sv, rv, "V width={w}"); @@ -1861,8 +1975,8 @@ mod tests { prng_ya16(&mut packed, 0xA160_0001); let mut simd = std::vec![0u8; w * 3]; let mut scal = std::vec![0u8; w * 3]; - unsafe { super::ya16_to_rgb_row(&packed, &mut simd, w) }; - sy::ya16_to_rgb_row(&packed, &mut scal, w); + unsafe { 
super::ya16_to_rgb_row::<false>(&packed, &mut simd, w) };
+            sy::ya16_to_rgb_row::<false>(&packed, &mut scal, w);
             assert_eq!(simd, scal, "width={w}");
         }
     }
@@ -1876,8 +1990,8 @@ mod tests {
             prng_ya16(&mut packed, 0xA160_0002);
             let mut simd = std::vec![0u8; w * 4];
             let mut scal = std::vec![0u8; w * 4];
-            unsafe { super::ya16_to_rgba_row(&packed, &mut simd, w) };
-            sy::ya16_to_rgba_row(&packed, &mut scal, w);
+            unsafe { super::ya16_to_rgba_row::<false>(&packed, &mut simd, w) };
+            sy::ya16_to_rgba_row::<false>(&packed, &mut scal, w);
             assert_eq!(simd, scal, "width={w}");
         }
     }
@@ -1891,8 +2005,8 @@ mod tests {
             prng_ya16(&mut packed, 0xA160_0003);
             let mut simd = std::vec![0u16; w * 3];
             let mut scal = std::vec![0u16; w * 3];
-            unsafe { super::ya16_to_rgb_u16_row(&packed, &mut simd, w) };
-            sy::ya16_to_rgb_u16_row(&packed, &mut scal, w);
+            unsafe { super::ya16_to_rgb_u16_row::<false>(&packed, &mut simd, w) };
+            sy::ya16_to_rgb_u16_row::<false>(&packed, &mut scal, w);
             assert_eq!(simd, scal, "width={w}");
         }
     }
@@ -1906,8 +2020,8 @@ mod tests {
             prng_ya16(&mut packed, 0xA160_0004);
             let mut simd = std::vec![0u16; w * 4];
             let mut scal = std::vec![0u16; w * 4];
-            unsafe { super::ya16_to_rgba_u16_row(&packed, &mut simd, w) };
-            sy::ya16_to_rgba_u16_row(&packed, &mut scal, w);
+            unsafe { super::ya16_to_rgba_u16_row::<false>(&packed, &mut simd, w) };
+            sy::ya16_to_rgba_u16_row::<false>(&packed, &mut scal, w);
             assert_eq!(simd, scal, "width={w}");
         }
     }
@@ -1921,8 +2035,8 @@ mod tests {
             prng_ya16(&mut packed, 0xA160_0005);
             let mut simd = std::vec![0u8; w];
             let mut scal = std::vec![0u8; w];
-            unsafe { super::ya16_to_luma_row(&packed, &mut simd, w) };
-            sy::ya16_to_luma_row(&packed, &mut scal, w);
+            unsafe { super::ya16_to_luma_row::<false>(&packed, &mut simd, w) };
+            sy::ya16_to_luma_row::<false>(&packed, &mut scal, w);
             assert_eq!(simd, scal, "width={w}");
         }
     }
@@ -1936,8 +2050,8 @@ mod tests {
             prng_ya16(&mut packed, 0xA160_0006);
             let mut simd = std::vec![0u16; w];
             let mut scal = std::vec![0u16; w];
-            unsafe { super::ya16_to_luma_u16_row(&packed, &mut simd, w) };
-            sy::ya16_to_luma_u16_row(&packed, &mut scal, w);
+            unsafe { super::ya16_to_luma_u16_row::<false>(&packed, &mut simd, w) };
+            sy::ya16_to_luma_u16_row::<false>(&packed, &mut scal, w);
             assert_eq!(simd, scal, "width={w}");
         }
     }
@@ -1955,11 +2069,78 @@ mod tests {
             let mut rh = std::vec![0u8; w];
             let mut rs = std::vec![0u8; w];
             let mut rv = std::vec![0u8; w];
-            unsafe { super::ya16_to_hsv_row(&packed, &mut sh, &mut ss, &mut sv, w) };
-            sy::ya16_to_hsv_row(&packed, &mut rh, &mut rs, &mut rv, w);
+            unsafe { super::ya16_to_hsv_row::<false>(&packed, &mut sh, &mut ss, &mut sv, w) };
+            sy::ya16_to_hsv_row::<false>(&packed, &mut rh, &mut rs, &mut rv, w);
             assert_eq!(sh, rh, "H width={w}");
             assert_eq!(ss, rs, "S width={w}");
             assert_eq!(sv, rv, "V width={w}");
         }
     }
+
+    // ---- BE parity tests: NEON BE kernel == scalar LE kernel on byte-swapped input ----
+
+    #[test]
+    #[cfg_attr(miri, ignore = "SIMD intrinsics unsupported by Miri")]
+    fn neon_gray10_be_parity_rgb() {
+        for &w in WIDTHS {
+            let mut le = std::vec![0u16; w];
+            prng16(&mut le, 0xBE10_0001);
+            let be: std::vec::Vec<u16> = le.iter().map(|v| v.swap_bytes()).collect();
+            let mut simd_be = std::vec![0u8; w * 3];
+            let mut scal_le = std::vec![0u8; w * 3];
+            unsafe { super::gray_n_to_rgb_row::<10, true>(&be, &mut simd_be, w, true) };
+            scalar::gray_n_to_rgb_row::<10, false>(&le, &mut scal_le, w, true);
+            assert_eq!(simd_be, scal_le, "width={w}");
+        }
+    }
+
+    #[test]
+    #[cfg_attr(miri, ignore = "SIMD intrinsics unsupported by Miri")]
+    fn neon_gray16_be_parity_luma() {
+        for &w in WIDTHS {
+            let mut le = std::vec![0u16; w];
+            prng16(&mut le, 0xBE16_0002);
+            let be: std::vec::Vec<u16> = le.iter().map(|v| v.swap_bytes()).collect();
+            let mut simd_be = std::vec![0u8; w];
+            let mut scal_le = std::vec![0u8; w];
+            unsafe { super::gray16_to_luma_row::<true>(&be, &mut simd_be, w) };
+            scalar::gray16_to_luma_row::<false>(&le, &mut scal_le, w);
+            assert_eq!(simd_be, scal_le, "width={w}");
+        }
+    }
+
+    #[test]
+    #[cfg_attr(miri, ignore = "SIMD intrinsics unsupported by Miri")]
+    fn neon_grayf32_be_parity_luma() {
+        use crate::row::scalar::grayf32 as sf;
+        for &w in WIDTHS {
+            let mut le = std::vec![0.0f32; w];
+            prng_f32(&mut le, 0xBEF3_0003);
+            let be: std::vec::Vec<f32> = le
+                .iter()
+                .map(|v| f32::from_bits(v.to_bits().swap_bytes()))
+                .collect();
+            let mut simd_be = std::vec![0u8; w];
+            let mut scal_le = std::vec![0u8; w];
+            unsafe { super::grayf32_to_luma_row::<true>(&be, &mut simd_be, w) };
+            sf::grayf32_to_luma_row::<false>(&le, &mut scal_le, w);
+            assert_eq!(simd_be, scal_le, "width={w}");
+        }
+    }
+
+    #[test]
+    #[cfg_attr(miri, ignore = "SIMD intrinsics unsupported by Miri")]
+    fn neon_ya16_be_parity_luma() {
+        use crate::row::scalar::ya16 as sy;
+        for &w in WIDTHS {
+            let mut le = std::vec![0u16; w * 2];
+            prng_ya16(&mut le, 0xBEA1_0004);
+            let be: std::vec::Vec<u16> = le.iter().map(|v| v.swap_bytes()).collect();
+            let mut simd_be = std::vec![0u8; w];
+            let mut scal_le = std::vec![0u8; w];
+            unsafe { super::ya16_to_luma_row::<true>(&be, &mut simd_be, w) };
+            sy::ya16_to_luma_row::<false>(&le, &mut scal_le, w);
+            assert_eq!(simd_be, scal_le, "width={w}");
+        }
+    }
 }
diff --git a/src/row/arch/wasm_simd128/gray.rs b/src/row/arch/wasm_simd128/gray.rs
index 39f26ca9..fdb667c7 100644
--- a/src/row/arch/wasm_simd128/gray.rs
+++ b/src/row/arch/wasm_simd128/gray.rs
@@ -15,7 +15,10 @@
 use core::arch::wasm32::*;
 
-use crate::row::scalar::{bits_mask, gray as scalar, grayf32, ya8, ya16};
+use crate::row::{
+    arch::wasm_simd128::endian::{load_endian_u16x8, load_endian_u32x4},
+    scalar::{bits_mask, gray as scalar, grayf32, ya8, ya16},
+};
 
 // ---- Gray8 ------------------------------------------------------------------
 
@@ -112,7 +115,7 @@ pub(crate) unsafe fn gray8_to_hsv_row(
 /// simd128 must be enabled.
 #[inline]
 #[target_feature(enable = "simd128")]
-pub(crate) unsafe fn gray_n_to_rgb_row<const BITS: usize>(
+pub(crate) unsafe fn gray_n_to_rgb_row<const BITS: usize, const BE: bool>(
     y_plane: &[u16],
     out: &mut [u8],
     width: usize,
@@ -120,7 +123,7 @@ pub(crate) unsafe fn gray_n_to_rgb_row<const BITS: usize>(
 ) {
     debug_assert!(y_plane.len() >= width);
     debug_assert!(out.len() >= width * 3);
-    scalar::gray_n_to_rgb_row::<BITS>(y_plane, out, width, full_range);
+    scalar::gray_n_to_rgb_row::<BITS, BE>(y_plane, out, width, full_range);
 }
 
 /// wasm-simd128 `gray_n_to_rgba_row`.
@@ -131,7 +134,7 @@ pub(crate) unsafe fn gray_n_to_rgb_row<const BITS: usize>(
 /// simd128 must be enabled.
 #[inline]
 #[target_feature(enable = "simd128")]
-pub(crate) unsafe fn gray_n_to_rgba_row<const BITS: usize>(
+pub(crate) unsafe fn gray_n_to_rgba_row<const BITS: usize, const BE: bool>(
     y_plane: &[u16],
     out: &mut [u8],
     width: usize,
@@ -139,7 +142,7 @@ pub(crate) unsafe fn gray_n_to_rgba_row<const BITS: usize>(
 ) {
     debug_assert!(y_plane.len() >= width);
     debug_assert!(out.len() >= width * 4);
-    scalar::gray_n_to_rgba_row::<BITS>(y_plane, out, width, full_range);
+    scalar::gray_n_to_rgba_row::<BITS, BE>(y_plane, out, width, full_range);
 }
 
 /// wasm-simd128 `gray_n_to_rgb_u16_row`.
@@ -150,7 +153,7 @@ pub(crate) unsafe fn gray_n_to_rgba_row<const BITS: usize>(
 /// simd128 must be enabled.
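Note: the wasm counterpart `arch::wasm_simd128::endian` imported above is likewise not part of this diff. A sketch under the same assumption, a 16-byte `v128_load` followed by a compile-time-selected per-lane byte shuffle; the real helpers may differ:

```rust
// Hypothetical sketch of the wasm_simd128 endian helpers (module not shown in
// this diff): load 16 bytes, then swap byte order within each lane when BE.
use core::arch::wasm32::*;

#[inline]
#[target_feature(enable = "simd128")]
pub(crate) unsafe fn load_endian_u16x8<const BE: bool>(ptr: *const u8) -> v128 {
    let v = v128_load(ptr.cast());
    if BE {
        // Swap the two bytes of every 16-bit lane.
        u8x16_shuffle::<1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14>(v, v)
    } else {
        v
    }
}

#[inline]
#[target_feature(enable = "simd128")]
pub(crate) unsafe fn load_endian_u32x4<const BE: bool>(ptr: *const u8) -> v128 {
    let v = v128_load(ptr.cast());
    if BE {
        // Swap the four bytes of every 32-bit lane.
        u8x16_shuffle::<3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12>(v, v)
    } else {
        v
    }
}
```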
#[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn gray_n_to_rgb_u16_row( +pub(crate) unsafe fn gray_n_to_rgb_u16_row( y_plane: &[u16], out: &mut [u16], width: usize, @@ -158,7 +161,7 @@ pub(crate) unsafe fn gray_n_to_rgb_u16_row( ) { debug_assert!(y_plane.len() >= width); debug_assert!(out.len() >= width * 3); - scalar::gray_n_to_rgb_u16_row::(y_plane, out, width, full_range); + scalar::gray_n_to_rgb_u16_row::(y_plane, out, width, full_range); } /// wasm-simd128 `gray_n_to_rgba_u16_row`. @@ -169,7 +172,7 @@ pub(crate) unsafe fn gray_n_to_rgb_u16_row( /// simd128 must be enabled. #[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn gray_n_to_rgba_u16_row( +pub(crate) unsafe fn gray_n_to_rgba_u16_row( y_plane: &[u16], out: &mut [u16], width: usize, @@ -177,7 +180,7 @@ pub(crate) unsafe fn gray_n_to_rgba_u16_row( ) { debug_assert!(y_plane.len() >= width); debug_assert!(out.len() >= width * 4); - scalar::gray_n_to_rgba_u16_row::(y_plane, out, width, full_range); + scalar::gray_n_to_rgba_u16_row::(y_plane, out, width, full_range); } /// wasm-simd128 `gray_n_to_luma_row`: mask + shift → u8. 8 pixels/iter. @@ -188,7 +191,7 @@ pub(crate) unsafe fn gray_n_to_rgba_u16_row( /// simd128 must be enabled. `y_plane.len() >= width`. `out.len() >= width`. #[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn gray_n_to_luma_row( +pub(crate) unsafe fn gray_n_to_luma_row( y_plane: &[u16], out: &mut [u8], width: usize, @@ -201,7 +204,7 @@ pub(crate) unsafe fn gray_n_to_luma_row( unsafe { let mask_v = u16x8_splat(mask); while x + 8 <= width { - let raw = v128_load(y_plane.as_ptr().add(x).cast()); + let raw = load_endian_u16x8::(y_plane.as_ptr().cast::().add(x * 2)); let masked = v128_and(raw, mask_v); let shifted = u16x8_shr(masked, shift); // Narrow u16x8 → u8x8 via u8x16_narrow_i16x8 (saturation, but values @@ -215,7 +218,7 @@ pub(crate) unsafe fn gray_n_to_luma_row( } } if x < width { - scalar::gray_n_to_luma_row::(&y_plane[x..width], &mut out[x..width], width - x); + scalar::gray_n_to_luma_row::(&y_plane[x..width], &mut out[x..width], width - x); } } @@ -227,7 +230,7 @@ pub(crate) unsafe fn gray_n_to_luma_row( /// simd128 must be enabled. `y_plane.len() >= width`. `out.len() >= width`. #[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn gray_n_to_luma_u16_row( +pub(crate) unsafe fn gray_n_to_luma_u16_row( y_plane: &[u16], out: &mut [u16], width: usize, @@ -239,14 +242,14 @@ pub(crate) unsafe fn gray_n_to_luma_u16_row( unsafe { let mask_v = u16x8_splat(mask); while x + 8 <= width { - let raw = v128_load(y_plane.as_ptr().add(x).cast()); + let raw = load_endian_u16x8::(y_plane.as_ptr().cast::().add(x * 2)); let masked = v128_and(raw, mask_v); v128_store(out.as_mut_ptr().add(x).cast(), masked); x += 8; } } if x < width { - scalar::gray_n_to_luma_u16_row::(&y_plane[x..width], &mut out[x..width], width - x); + scalar::gray_n_to_luma_u16_row::(&y_plane[x..width], &mut out[x..width], width - x); } } @@ -260,7 +263,7 @@ pub(crate) unsafe fn gray_n_to_luma_u16_row( /// simd128 must be enabled. All slices have length >= width. 
#[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn gray_n_to_hsv_row( +pub(crate) unsafe fn gray_n_to_hsv_row( y_plane: &[u16], h_out: &mut [u8], s_out: &mut [u8], @@ -270,7 +273,7 @@ pub(crate) unsafe fn gray_n_to_hsv_row( ) { debug_assert!(y_plane.len() >= width); if !full_range { - return scalar::gray_n_to_hsv_row::(y_plane, h_out, s_out, v_out, width, full_range); + return scalar::gray_n_to_hsv_row::(y_plane, h_out, s_out, v_out, width, full_range); } let mask = bits_mask::(); let shift = BITS - 8; @@ -279,7 +282,7 @@ pub(crate) unsafe fn gray_n_to_hsv_row( let mask_v = u16x8_splat(mask); let zero = i64x2(0, 0); while x + 8 <= width { - let raw = v128_load(y_plane.as_ptr().add(x).cast()); + let raw = load_endian_u16x8::(y_plane.as_ptr().cast::().add(x * 2)); let masked = v128_and(raw, mask_v); let shifted = u16x8_shr(masked, shift); let narrowed = u8x16_narrow_i16x8(shifted, zero); @@ -292,7 +295,7 @@ pub(crate) unsafe fn gray_n_to_hsv_row( } } if x < width { - scalar::gray_n_to_hsv_row::( + scalar::gray_n_to_hsv_row::( &y_plane[x..width], &mut h_out[x..width], &mut s_out[x..width], @@ -313,7 +316,7 @@ pub(crate) unsafe fn gray_n_to_hsv_row( /// simd128 must be enabled. #[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn gray16_to_rgb_row( +pub(crate) unsafe fn gray16_to_rgb_row( y_plane: &[u16], out: &mut [u8], width: usize, @@ -321,7 +324,7 @@ pub(crate) unsafe fn gray16_to_rgb_row( ) { debug_assert!(y_plane.len() >= width); debug_assert!(out.len() >= width * 3); - scalar::gray16_to_rgb_row(y_plane, out, width, full_range); + scalar::gray16_to_rgb_row::(y_plane, out, width, full_range); } /// wasm-simd128 `gray16_to_rgba_row`. @@ -332,7 +335,7 @@ pub(crate) unsafe fn gray16_to_rgb_row( /// simd128 must be enabled. #[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn gray16_to_rgba_row( +pub(crate) unsafe fn gray16_to_rgba_row( y_plane: &[u16], out: &mut [u8], width: usize, @@ -340,7 +343,7 @@ pub(crate) unsafe fn gray16_to_rgba_row( ) { debug_assert!(y_plane.len() >= width); debug_assert!(out.len() >= width * 4); - scalar::gray16_to_rgba_row(y_plane, out, width, full_range); + scalar::gray16_to_rgba_row::(y_plane, out, width, full_range); } /// wasm-simd128 `gray16_to_rgb_u16_row`. @@ -351,7 +354,7 @@ pub(crate) unsafe fn gray16_to_rgba_row( /// simd128 must be enabled. #[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn gray16_to_rgb_u16_row( +pub(crate) unsafe fn gray16_to_rgb_u16_row( y_plane: &[u16], out: &mut [u16], width: usize, @@ -359,7 +362,7 @@ pub(crate) unsafe fn gray16_to_rgb_u16_row( ) { debug_assert!(y_plane.len() >= width); debug_assert!(out.len() >= width * 3); - scalar::gray16_to_rgb_u16_row(y_plane, out, width, full_range); + scalar::gray16_to_rgb_u16_row::(y_plane, out, width, full_range); } /// wasm-simd128 `gray16_to_rgba_u16_row`. @@ -371,7 +374,7 @@ pub(crate) unsafe fn gray16_to_rgb_u16_row( #[allow(dead_code)] #[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn gray16_to_rgba_u16_row( +pub(crate) unsafe fn gray16_to_rgba_u16_row( y_plane: &[u16], out: &mut [u16], width: usize, @@ -379,7 +382,7 @@ pub(crate) unsafe fn gray16_to_rgba_u16_row( ) { debug_assert!(y_plane.len() >= width); debug_assert!(out.len() >= width * 4); - scalar::gray16_to_rgba_u16_row(y_plane, out, width, full_range); + scalar::gray16_to_rgba_u16_row::(y_plane, out, width, full_range); } /// wasm-simd128 `gray16_to_luma_row`: `>> 8` → u8. 8 pixels/iter. 
@@ -390,14 +393,18 @@ pub(crate) unsafe fn gray16_to_rgba_u16_row( /// simd128 must be enabled. `y_plane.len() >= width`. `out.len() >= width`. #[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn gray16_to_luma_row(y_plane: &[u16], out: &mut [u8], width: usize) { +pub(crate) unsafe fn gray16_to_luma_row( + y_plane: &[u16], + out: &mut [u8], + width: usize, +) { debug_assert!(y_plane.len() >= width); debug_assert!(out.len() >= width); let mut x = 0usize; unsafe { let zero = i64x2(0, 0); while x + 8 <= width { - let raw = v128_load(y_plane.as_ptr().add(x).cast()); + let raw = load_endian_u16x8::(y_plane.as_ptr().cast::().add(x * 2)); let shifted = u16x8_shr(raw, 8); let narrowed = u8x16_narrow_i16x8(shifted, zero); let val = i64x2_extract_lane::<0>(narrowed) as u64; @@ -406,7 +413,7 @@ pub(crate) unsafe fn gray16_to_luma_row(y_plane: &[u16], out: &mut [u8], width: } } if x < width { - scalar::gray16_to_luma_row(&y_plane[x..width], &mut out[x..width], width - x); + scalar::gray16_to_luma_row::(&y_plane[x..width], &mut out[x..width], width - x); } } @@ -418,19 +425,23 @@ pub(crate) unsafe fn gray16_to_luma_row(y_plane: &[u16], out: &mut [u8], width: /// simd128 must be enabled. `y_plane.len() >= width`. `out.len() >= width`. #[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn gray16_to_luma_u16_row(y_plane: &[u16], out: &mut [u16], width: usize) { +pub(crate) unsafe fn gray16_to_luma_u16_row( + y_plane: &[u16], + out: &mut [u16], + width: usize, +) { debug_assert!(y_plane.len() >= width); debug_assert!(out.len() >= width); let mut x = 0usize; unsafe { while x + 8 <= width { - let y = v128_load(y_plane.as_ptr().add(x).cast()); + let y = load_endian_u16x8::(y_plane.as_ptr().cast::().add(x * 2)); v128_store(out.as_mut_ptr().add(x).cast(), y); x += 8; } } if x < width { - scalar::gray16_to_luma_u16_row(&y_plane[x..width], &mut out[x..width], width - x); + scalar::gray16_to_luma_u16_row::(&y_plane[x..width], &mut out[x..width], width - x); } } @@ -443,7 +454,7 @@ pub(crate) unsafe fn gray16_to_luma_u16_row(y_plane: &[u16], out: &mut [u16], wi /// simd128 must be enabled. All slices have length >= width. #[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn gray16_to_hsv_row( +pub(crate) unsafe fn gray16_to_hsv_row( y_plane: &[u16], h_out: &mut [u8], s_out: &mut [u8], @@ -453,13 +464,13 @@ pub(crate) unsafe fn gray16_to_hsv_row( ) { debug_assert!(y_plane.len() >= width); if !full_range { - return scalar::gray16_to_hsv_row(y_plane, h_out, s_out, v_out, width, full_range); + return scalar::gray16_to_hsv_row::(y_plane, h_out, s_out, v_out, width, full_range); } let mut x = 0usize; unsafe { let zero = i64x2(0, 0); while x + 8 <= width { - let raw = v128_load(y_plane.as_ptr().add(x).cast()); + let raw = load_endian_u16x8::(y_plane.as_ptr().cast::().add(x * 2)); let shifted = u16x8_shr(raw, 8); let narrowed = u8x16_narrow_i16x8(shifted, zero); let val = i64x2_extract_lane::<0>(narrowed) as u64; @@ -471,7 +482,7 @@ pub(crate) unsafe fn gray16_to_hsv_row( } } if x < width { - scalar::gray16_to_hsv_row( + scalar::gray16_to_hsv_row::( &y_plane[x..width], &mut h_out[x..width], &mut s_out[x..width], @@ -490,10 +501,14 @@ pub(crate) unsafe fn gray16_to_hsv_row( /// simd128 must be enabled. 
#[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn grayf32_to_rgb_row(y_plane: &[f32], out: &mut [u8], width: usize) { +pub(crate) unsafe fn grayf32_to_rgb_row( + y_plane: &[f32], + out: &mut [u8], + width: usize, +) { debug_assert!(y_plane.len() >= width); debug_assert!(out.len() >= width * 3); - grayf32::grayf32_to_rgb_row(y_plane, out, width); + grayf32::grayf32_to_rgb_row::(y_plane, out, width); } /// wasm-simd128 `grayf32_to_rgba_row`: delegates to scalar. @@ -502,10 +517,14 @@ pub(crate) unsafe fn grayf32_to_rgb_row(y_plane: &[f32], out: &mut [u8], width: /// simd128 must be enabled. #[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn grayf32_to_rgba_row(y_plane: &[f32], out: &mut [u8], width: usize) { +pub(crate) unsafe fn grayf32_to_rgba_row( + y_plane: &[f32], + out: &mut [u8], + width: usize, +) { debug_assert!(y_plane.len() >= width); debug_assert!(out.len() >= width * 4); - grayf32::grayf32_to_rgba_row(y_plane, out, width); + grayf32::grayf32_to_rgba_row::(y_plane, out, width); } /// wasm-simd128 `grayf32_to_rgb_u16_row`: delegates to scalar. @@ -514,10 +533,14 @@ pub(crate) unsafe fn grayf32_to_rgba_row(y_plane: &[f32], out: &mut [u8], width: /// simd128 must be enabled. #[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn grayf32_to_rgb_u16_row(y_plane: &[f32], out: &mut [u16], width: usize) { +pub(crate) unsafe fn grayf32_to_rgb_u16_row( + y_plane: &[f32], + out: &mut [u16], + width: usize, +) { debug_assert!(y_plane.len() >= width); debug_assert!(out.len() >= width * 3); - grayf32::grayf32_to_rgb_u16_row(y_plane, out, width); + grayf32::grayf32_to_rgb_u16_row::(y_plane, out, width); } /// wasm-simd128 `grayf32_to_rgba_u16_row`: delegates to scalar. @@ -526,10 +549,14 @@ pub(crate) unsafe fn grayf32_to_rgb_u16_row(y_plane: &[f32], out: &mut [u16], wi /// simd128 must be enabled. #[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn grayf32_to_rgba_u16_row(y_plane: &[f32], out: &mut [u16], width: usize) { +pub(crate) unsafe fn grayf32_to_rgba_u16_row( + y_plane: &[f32], + out: &mut [u16], + width: usize, +) { debug_assert!(y_plane.len() >= width); debug_assert!(out.len() >= width * 4); - grayf32::grayf32_to_rgba_u16_row(y_plane, out, width); + grayf32::grayf32_to_rgba_u16_row::(y_plane, out, width); } /// wasm-simd128 `grayf32_to_rgb_f32_row`: delegates to scalar. @@ -539,10 +566,14 @@ pub(crate) unsafe fn grayf32_to_rgba_u16_row(y_plane: &[f32], out: &mut [u16], w #[inline] #[target_feature(enable = "simd128")] #[allow(dead_code)] // dispatcher always uses scalar; function is exercised by tests only -pub(crate) unsafe fn grayf32_to_rgb_f32_row(y_plane: &[f32], out: &mut [f32], width: usize) { +pub(crate) unsafe fn grayf32_to_rgb_f32_row( + y_plane: &[f32], + out: &mut [f32], + width: usize, +) { debug_assert!(y_plane.len() >= width); debug_assert!(out.len() >= width * 3); - grayf32::grayf32_to_rgb_f32_row(y_plane, out, width); + grayf32::grayf32_to_rgb_f32_row::(y_plane, out, width); } /// wasm-simd128 `grayf32_to_luma_row`: clamp→scale→round→u8. 4 pixels/iter. @@ -554,7 +585,11 @@ pub(crate) unsafe fn grayf32_to_rgb_f32_row(y_plane: &[f32], out: &mut [f32], wi /// simd128 must be enabled. `y_plane.len() >= width`. `out.len() >= width`. 
#[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn grayf32_to_luma_row(y_plane: &[f32], out: &mut [u8], width: usize) { +pub(crate) unsafe fn grayf32_to_luma_row( + y_plane: &[f32], + out: &mut [u8], + width: usize, +) { debug_assert!(y_plane.len() >= width); debug_assert!(out.len() >= width); let scale = f32x4_splat(255.0); @@ -565,7 +600,7 @@ pub(crate) unsafe fn grayf32_to_luma_row(y_plane: &[f32], out: &mut [u8], width: let mut x = 0usize; unsafe { while x + 4 <= width { - let y = v128_load(y_plane.as_ptr().add(x).cast()); + let y = load_endian_u32x4::(y_plane.as_ptr().cast::().add(x * 4)); let clamped = f32x4_min(f32x4_max(y, zero4), one4); let scaled = f32x4_mul(clamped, scale); let rounded = i32x4_trunc_sat_f32x4(f32x4_add(scaled, half)); @@ -579,7 +614,7 @@ pub(crate) unsafe fn grayf32_to_luma_row(y_plane: &[f32], out: &mut [u8], width: } } if x < width { - grayf32::grayf32_to_luma_row(&y_plane[x..width], &mut out[x..width], width - x); + grayf32::grayf32_to_luma_row::(&y_plane[x..width], &mut out[x..width], width - x); } } @@ -589,7 +624,11 @@ pub(crate) unsafe fn grayf32_to_luma_row(y_plane: &[f32], out: &mut [u8], width: /// simd128 must be enabled. `y_plane.len() >= width`. `out.len() >= width`. #[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn grayf32_to_luma_u16_row(y_plane: &[f32], out: &mut [u16], width: usize) { +pub(crate) unsafe fn grayf32_to_luma_u16_row( + y_plane: &[f32], + out: &mut [u16], + width: usize, +) { debug_assert!(y_plane.len() >= width); debug_assert!(out.len() >= width); let scale = f32x4_splat(65535.0); @@ -600,7 +639,7 @@ pub(crate) unsafe fn grayf32_to_luma_u16_row(y_plane: &[f32], out: &mut [u16], w let mut x = 0usize; unsafe { while x + 4 <= width { - let y = v128_load(y_plane.as_ptr().add(x).cast()); + let y = load_endian_u32x4::(y_plane.as_ptr().cast::().add(x * 4)); let clamped = f32x4_min(f32x4_max(y, zero4), one4); let scaled = f32x4_mul(clamped, scale); let rounded = i32x4_trunc_sat_f32x4(f32x4_add(scaled, half)); @@ -614,7 +653,7 @@ pub(crate) unsafe fn grayf32_to_luma_u16_row(y_plane: &[f32], out: &mut [u16], w } } if x < width { - grayf32::grayf32_to_luma_u16_row(&y_plane[x..width], &mut out[x..width], width - x); + grayf32::grayf32_to_luma_u16_row::(&y_plane[x..width], &mut out[x..width], width - x); } } @@ -625,19 +664,23 @@ pub(crate) unsafe fn grayf32_to_luma_u16_row(y_plane: &[f32], out: &mut [u16], w #[inline] #[target_feature(enable = "simd128")] #[allow(dead_code)] // dispatcher always uses scalar; function is exercised by tests only -pub(crate) unsafe fn grayf32_to_luma_f32_row(y_plane: &[f32], out: &mut [f32], width: usize) { +pub(crate) unsafe fn grayf32_to_luma_f32_row( + y_plane: &[f32], + out: &mut [f32], + width: usize, +) { debug_assert!(y_plane.len() >= width); debug_assert!(out.len() >= width); let mut x = 0usize; unsafe { while x + 4 <= width { - let y = v128_load(y_plane.as_ptr().add(x).cast()); + let y = load_endian_u32x4::(y_plane.as_ptr().cast::().add(x * 4)); v128_store(out.as_mut_ptr().add(x).cast(), y); x += 4; } } if x < width { - grayf32::grayf32_to_luma_f32_row(&y_plane[x..width], &mut out[x..width], width - x); + grayf32::grayf32_to_luma_f32_row::(&y_plane[x..width], &mut out[x..width], width - x); } } @@ -647,7 +690,7 @@ pub(crate) unsafe fn grayf32_to_luma_f32_row(y_plane: &[f32], out: &mut [f32], w /// simd128 must be enabled. All slices have length >= width. 
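Note: the vector body of `grayf32_to_luma_row` below (min/max against 0 and 1, multiply by the 255 splat, add 0.5, saturating truncate, narrow) computes the same per-sample mapping as this scalar form, shown only as a reference for the arithmetic, not as the crate's scalar path:

// Scalar equivalent of the clamp → scale → round steps in the SIMD loop.
fn luma_u8_from_f32(y: f32) -> u8 {
    let clamped = y.clamp(0.0, 1.0);
    // `as` truncates after the +0.5 rounding bias, matching trunc_sat + narrow.
    (clamped * 255.0 + 0.5) as u8
}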
#[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn grayf32_to_hsv_row( +pub(crate) unsafe fn grayf32_to_hsv_row( y_plane: &[f32], h_out: &mut [u8], s_out: &mut [u8], @@ -663,7 +706,7 @@ pub(crate) unsafe fn grayf32_to_hsv_row( let mut x = 0usize; unsafe { while x + 4 <= width { - let y = v128_load(y_plane.as_ptr().add(x).cast()); + let y = load_endian_u32x4::(y_plane.as_ptr().cast::().add(x * 4)); let clamped = f32x4_min(f32x4_max(y, zero4), one4); let scaled = f32x4_mul(clamped, scale); let rounded = i32x4_trunc_sat_f32x4(f32x4_add(scaled, half)); @@ -678,7 +721,7 @@ pub(crate) unsafe fn grayf32_to_hsv_row( } } if x < width { - grayf32::grayf32_to_hsv_row( + grayf32::grayf32_to_hsv_row::( &y_plane[x..width], &mut h_out[x..width], &mut s_out[x..width], @@ -840,10 +883,10 @@ pub(crate) unsafe fn ya8_to_hsv_row( /// simd128 must be enabled. #[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn ya16_to_rgb_row(packed: &[u16], out: &mut [u8], width: usize) { +pub(crate) unsafe fn ya16_to_rgb_row(packed: &[u16], out: &mut [u8], width: usize) { debug_assert!(packed.len() >= width * 2); debug_assert!(out.len() >= width * 3); - ya16::ya16_to_rgb_row(packed, out, width); + ya16::ya16_to_rgb_row::(packed, out, width); } /// wasm-simd128 `ya16_to_rgba_row`: delegates to scalar. @@ -852,10 +895,14 @@ pub(crate) unsafe fn ya16_to_rgb_row(packed: &[u16], out: &mut [u8], width: usiz /// simd128 must be enabled. #[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn ya16_to_rgba_row(packed: &[u16], out: &mut [u8], width: usize) { +pub(crate) unsafe fn ya16_to_rgba_row( + packed: &[u16], + out: &mut [u8], + width: usize, +) { debug_assert!(packed.len() >= width * 2); debug_assert!(out.len() >= width * 4); - ya16::ya16_to_rgba_row(packed, out, width); + ya16::ya16_to_rgba_row::(packed, out, width); } /// wasm-simd128 `ya16_to_rgb_u16_row`: delegates to scalar. @@ -864,10 +911,14 @@ pub(crate) unsafe fn ya16_to_rgba_row(packed: &[u16], out: &mut [u8], width: usi /// simd128 must be enabled. #[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn ya16_to_rgb_u16_row(packed: &[u16], out: &mut [u16], width: usize) { +pub(crate) unsafe fn ya16_to_rgb_u16_row( + packed: &[u16], + out: &mut [u16], + width: usize, +) { debug_assert!(packed.len() >= width * 2); debug_assert!(out.len() >= width * 3); - ya16::ya16_to_rgb_u16_row(packed, out, width); + ya16::ya16_to_rgb_u16_row::(packed, out, width); } /// wasm-simd128 `ya16_to_rgba_u16_row`: delegates to scalar. @@ -876,10 +927,14 @@ pub(crate) unsafe fn ya16_to_rgb_u16_row(packed: &[u16], out: &mut [u16], width: /// simd128 must be enabled. #[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn ya16_to_rgba_u16_row(packed: &[u16], out: &mut [u16], width: usize) { +pub(crate) unsafe fn ya16_to_rgba_u16_row( + packed: &[u16], + out: &mut [u16], + width: usize, +) { debug_assert!(packed.len() >= width * 2); debug_assert!(out.len() >= width * 4); - ya16::ya16_to_rgba_u16_row(packed, out, width); + ya16::ya16_to_rgba_u16_row::(packed, out, width); } /// wasm-simd128 `ya16_to_luma_row`: deinterleave Y u16 → `>> 8` → u8. @@ -892,7 +947,11 @@ pub(crate) unsafe fn ya16_to_rgba_u16_row(packed: &[u16], out: &mut [u16], width /// simd128 must be enabled. `packed.len() >= width * 2`. `out.len() >= width`. 
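Note: per the doc comment above, the YA16 luma kernels take the Y word of each (Y, A) pair and keep its high byte; the shuffle masks in the function that follows gather those Y words eight at a time. The per-pixel arithmetic, as an illustration only (how the new big-endian flag normalizes the word before the shift is not shown in this hunk and would be handled in the endian helpers):

// Per-pixel view of ya16 → luma: drop alpha, keep the high byte of Y.
fn ya16_luma_pixel(pair: [u16; 2]) -> u8 {
    let y = pair[0]; // pair[1] is alpha and is discarded
    (y >> 8) as u8
}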
#[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn ya16_to_luma_row(packed: &[u16], out: &mut [u8], width: usize) { +pub(crate) unsafe fn ya16_to_luma_row( + packed: &[u16], + out: &mut [u8], + width: usize, +) { debug_assert!(packed.len() >= width * 2); debug_assert!(out.len() >= width); // Shuffle mask: gather words at indices 0,2,4,6 (byte offsets 0-1,4-5,8-9,12-13) @@ -926,7 +985,7 @@ pub(crate) unsafe fn ya16_to_luma_row(packed: &[u16], out: &mut [u8], width: usi } } if x < width { - ya16::ya16_to_luma_row(&packed[x * 2..width * 2], &mut out[x..width], width - x); + ya16::ya16_to_luma_row::(&packed[x * 2..width * 2], &mut out[x..width], width - x); } } @@ -937,7 +996,11 @@ pub(crate) unsafe fn ya16_to_luma_row(packed: &[u16], out: &mut [u8], width: usi /// simd128 must be enabled. `packed.len() >= width * 2`. `out.len() >= width`. #[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn ya16_to_luma_u16_row(packed: &[u16], out: &mut [u16], width: usize) { +pub(crate) unsafe fn ya16_to_luma_u16_row( + packed: &[u16], + out: &mut [u16], + width: usize, +) { debug_assert!(packed.len() >= width * 2); debug_assert!(out.len() >= width); let shuf_lo = i8x16(0, 1, 4, 5, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1); @@ -960,7 +1023,7 @@ pub(crate) unsafe fn ya16_to_luma_u16_row(packed: &[u16], out: &mut [u16], width } } if x < width { - ya16::ya16_to_luma_u16_row(&packed[x * 2..width * 2], &mut out[x..width], width - x); + ya16::ya16_to_luma_u16_row::(&packed[x * 2..width * 2], &mut out[x..width], width - x); } } @@ -970,7 +1033,7 @@ pub(crate) unsafe fn ya16_to_luma_u16_row(packed: &[u16], out: &mut [u16], width /// simd128 must be enabled. All slices have length >= width. #[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn ya16_to_hsv_row( +pub(crate) unsafe fn ya16_to_hsv_row( packed: &[u16], h_out: &mut [u8], s_out: &mut [u8], @@ -1005,7 +1068,7 @@ pub(crate) unsafe fn ya16_to_hsv_row( } } if x < width { - ya16::ya16_to_hsv_row( + ya16::ya16_to_hsv_row::( &packed[x * 2..width * 2], &mut h_out[x..width], &mut s_out[x..width], @@ -1055,8 +1118,8 @@ mod tests { prng_f32(&mut plane, 0xF800_0001); let mut simd = std::vec![0u8; w * 3]; let mut scal = std::vec![0u8; w * 3]; - unsafe { super::grayf32_to_rgb_row(&plane, &mut simd, w) }; - sf::grayf32_to_rgb_row(&plane, &mut scal, w); + unsafe { super::grayf32_to_rgb_row::(&plane, &mut simd, w) }; + sf::grayf32_to_rgb_row::(&plane, &mut scal, w); assert_eq!(simd, scal, "width={w}"); } } @@ -1070,8 +1133,8 @@ mod tests { prng_f32(&mut plane, 0xF800_0002); let mut simd = std::vec![0u8; w * 4]; let mut scal = std::vec![0u8; w * 4]; - unsafe { super::grayf32_to_rgba_row(&plane, &mut simd, w) }; - sf::grayf32_to_rgba_row(&plane, &mut scal, w); + unsafe { super::grayf32_to_rgba_row::(&plane, &mut simd, w) }; + sf::grayf32_to_rgba_row::(&plane, &mut scal, w); assert_eq!(simd, scal, "width={w}"); } } @@ -1085,8 +1148,8 @@ mod tests { prng_f32(&mut plane, 0xF800_0003); let mut simd = std::vec![0u16; w * 3]; let mut scal = std::vec![0u16; w * 3]; - unsafe { super::grayf32_to_rgb_u16_row(&plane, &mut simd, w) }; - sf::grayf32_to_rgb_u16_row(&plane, &mut scal, w); + unsafe { super::grayf32_to_rgb_u16_row::(&plane, &mut simd, w) }; + sf::grayf32_to_rgb_u16_row::(&plane, &mut scal, w); assert_eq!(simd, scal, "width={w}"); } } @@ -1100,8 +1163,8 @@ mod tests { prng_f32(&mut plane, 0xF800_0004); let mut simd = std::vec![0u16; w * 4]; let mut scal = std::vec![0u16; w * 4]; - unsafe { 
super::grayf32_to_rgba_u16_row(&plane, &mut simd, w) }; - sf::grayf32_to_rgba_u16_row(&plane, &mut scal, w); + unsafe { super::grayf32_to_rgba_u16_row::(&plane, &mut simd, w) }; + sf::grayf32_to_rgba_u16_row::(&plane, &mut scal, w); assert_eq!(simd, scal, "width={w}"); } } @@ -1115,8 +1178,8 @@ mod tests { prng_f32(&mut plane, 0xF800_0005); let mut simd = std::vec![0.0f32; w * 3]; let mut scal = std::vec![0.0f32; w * 3]; - unsafe { super::grayf32_to_rgb_f32_row(&plane, &mut simd, w) }; - sf::grayf32_to_rgb_f32_row(&plane, &mut scal, w); + unsafe { super::grayf32_to_rgb_f32_row::(&plane, &mut simd, w) }; + sf::grayf32_to_rgb_f32_row::(&plane, &mut scal, w); assert_eq!(simd, scal, "width={w}"); } } @@ -1130,8 +1193,8 @@ mod tests { prng_f32(&mut plane, 0xF800_0006); let mut simd = std::vec![0u8; w]; let mut scal = std::vec![0u8; w]; - unsafe { super::grayf32_to_luma_row(&plane, &mut simd, w) }; - sf::grayf32_to_luma_row(&plane, &mut scal, w); + unsafe { super::grayf32_to_luma_row::(&plane, &mut simd, w) }; + sf::grayf32_to_luma_row::(&plane, &mut scal, w); assert_eq!(simd, scal, "width={w}"); } } @@ -1145,8 +1208,8 @@ mod tests { prng_f32(&mut plane, 0xF800_0007); let mut simd = std::vec![0u16; w]; let mut scal = std::vec![0u16; w]; - unsafe { super::grayf32_to_luma_u16_row(&plane, &mut simd, w) }; - sf::grayf32_to_luma_u16_row(&plane, &mut scal, w); + unsafe { super::grayf32_to_luma_u16_row::(&plane, &mut simd, w) }; + sf::grayf32_to_luma_u16_row::(&plane, &mut scal, w); assert_eq!(simd, scal, "width={w}"); } } @@ -1160,8 +1223,8 @@ mod tests { prng_f32(&mut plane, 0xF800_0008); let mut simd = std::vec![0.0f32; w]; let mut scal = std::vec![0.0f32; w]; - unsafe { super::grayf32_to_luma_f32_row(&plane, &mut simd, w) }; - sf::grayf32_to_luma_f32_row(&plane, &mut scal, w); + unsafe { super::grayf32_to_luma_f32_row::(&plane, &mut simd, w) }; + sf::grayf32_to_luma_f32_row::(&plane, &mut scal, w); assert_eq!(simd, scal, "width={w}"); } } @@ -1179,8 +1242,8 @@ mod tests { let mut rh = std::vec![0u8; w]; let mut rs = std::vec![0u8; w]; let mut rv = std::vec![0u8; w]; - unsafe { super::grayf32_to_hsv_row(&plane, &mut sh, &mut ss, &mut sv, w) }; - sf::grayf32_to_hsv_row(&plane, &mut rh, &mut rs, &mut rv, w); + unsafe { super::grayf32_to_hsv_row::(&plane, &mut sh, &mut ss, &mut sv, w) }; + sf::grayf32_to_hsv_row::(&plane, &mut rh, &mut rs, &mut rv, w); assert_eq!(sh, rh, "H width={w}"); assert_eq!(ss, rs, "S width={w}"); assert_eq!(sv, rv, "V width={w}"); @@ -1327,8 +1390,8 @@ mod tests { prng_ya16(&mut packed, 0xA862_0001); let mut simd = std::vec![0u8; w * 3]; let mut scal = std::vec![0u8; w * 3]; - unsafe { super::ya16_to_rgb_row(&packed, &mut simd, w) }; - sy::ya16_to_rgb_row(&packed, &mut scal, w); + unsafe { super::ya16_to_rgb_row::(&packed, &mut simd, w) }; + sy::ya16_to_rgb_row::(&packed, &mut scal, w); assert_eq!(simd, scal, "width={w}"); } } @@ -1342,8 +1405,8 @@ mod tests { prng_ya16(&mut packed, 0xA862_0002); let mut simd = std::vec![0u8; w * 4]; let mut scal = std::vec![0u8; w * 4]; - unsafe { super::ya16_to_rgba_row(&packed, &mut simd, w) }; - sy::ya16_to_rgba_row(&packed, &mut scal, w); + unsafe { super::ya16_to_rgba_row::(&packed, &mut simd, w) }; + sy::ya16_to_rgba_row::(&packed, &mut scal, w); assert_eq!(simd, scal, "width={w}"); } } @@ -1357,8 +1420,8 @@ mod tests { prng_ya16(&mut packed, 0xA862_0003); let mut simd = std::vec![0u16; w * 3]; let mut scal = std::vec![0u16; w * 3]; - unsafe { super::ya16_to_rgb_u16_row(&packed, &mut simd, w) }; - sy::ya16_to_rgb_u16_row(&packed, &mut 
scal, w); + unsafe { super::ya16_to_rgb_u16_row::(&packed, &mut simd, w) }; + sy::ya16_to_rgb_u16_row::(&packed, &mut scal, w); assert_eq!(simd, scal, "width={w}"); } } @@ -1372,8 +1435,8 @@ mod tests { prng_ya16(&mut packed, 0xA862_0004); let mut simd = std::vec![0u16; w * 4]; let mut scal = std::vec![0u16; w * 4]; - unsafe { super::ya16_to_rgba_u16_row(&packed, &mut simd, w) }; - sy::ya16_to_rgba_u16_row(&packed, &mut scal, w); + unsafe { super::ya16_to_rgba_u16_row::(&packed, &mut simd, w) }; + sy::ya16_to_rgba_u16_row::(&packed, &mut scal, w); assert_eq!(simd, scal, "width={w}"); } } @@ -1387,8 +1450,8 @@ mod tests { prng_ya16(&mut packed, 0xA862_0005); let mut simd = std::vec![0u8; w]; let mut scal = std::vec![0u8; w]; - unsafe { super::ya16_to_luma_row(&packed, &mut simd, w) }; - sy::ya16_to_luma_row(&packed, &mut scal, w); + unsafe { super::ya16_to_luma_row::(&packed, &mut simd, w) }; + sy::ya16_to_luma_row::(&packed, &mut scal, w); assert_eq!(simd, scal, "width={w}"); } } @@ -1402,8 +1465,8 @@ mod tests { prng_ya16(&mut packed, 0xA862_0006); let mut simd = std::vec![0u16; w]; let mut scal = std::vec![0u16; w]; - unsafe { super::ya16_to_luma_u16_row(&packed, &mut simd, w) }; - sy::ya16_to_luma_u16_row(&packed, &mut scal, w); + unsafe { super::ya16_to_luma_u16_row::(&packed, &mut simd, w) }; + sy::ya16_to_luma_u16_row::(&packed, &mut scal, w); assert_eq!(simd, scal, "width={w}"); } } @@ -1421,8 +1484,8 @@ mod tests { let mut rh = std::vec![0u8; w]; let mut rs = std::vec![0u8; w]; let mut rv = std::vec![0u8; w]; - unsafe { super::ya16_to_hsv_row(&packed, &mut sh, &mut ss, &mut sv, w) }; - sy::ya16_to_hsv_row(&packed, &mut rh, &mut rs, &mut rv, w); + unsafe { super::ya16_to_hsv_row::(&packed, &mut sh, &mut ss, &mut sv, w) }; + sy::ya16_to_hsv_row::(&packed, &mut rh, &mut rs, &mut rv, w); assert_eq!(sh, rh, "H width={w}"); assert_eq!(ss, rs, "S width={w}"); assert_eq!(sv, rv, "V width={w}"); @@ -1457,8 +1520,8 @@ mod tests { prng16(&mut plane, 0xCAFE_BABE); let mut simd = std::vec![0u16; w]; let mut scal = std::vec![0u16; w]; - unsafe { super::gray_n_to_luma_u16_row::<10>(&plane, &mut simd, w) }; - scalar::gray_n_to_luma_u16_row::<10>(&plane, &mut scal, w); + unsafe { super::gray_n_to_luma_u16_row::<10, false>(&plane, &mut simd, w) }; + scalar::gray_n_to_luma_u16_row::<10, false>(&plane, &mut scal, w); assert_eq!(simd, scal, "width={w}"); } } @@ -1471,9 +1534,41 @@ mod tests { prng16(&mut plane, 0xDEAD_BEEF); let mut simd = std::vec![0u16; w]; let mut scal = std::vec![0u16; w]; - unsafe { super::gray16_to_luma_u16_row(&plane, &mut simd, w) }; - scalar::gray16_to_luma_u16_row(&plane, &mut scal, w); + unsafe { super::gray16_to_luma_u16_row::(&plane, &mut simd, w) }; + scalar::gray16_to_luma_u16_row::(&plane, &mut scal, w); assert_eq!(simd, scal, "width={w}"); } } + + // ---- BE parity tests -------------------------------------------------------- + + #[test] + #[cfg_attr(miri, ignore = "SIMD intrinsics unsupported by Miri")] + fn wasm_gray10_be_parity_luma() { + for &w in WIDTHS { + let mut le = std::vec![0u16; w]; + prng16(&mut le, 0xBE10_0001); + let be: std::vec::Vec = le.iter().map(|v| v.swap_bytes()).collect(); + let mut simd_be = std::vec![0u8; w]; + let mut scal_le = std::vec![0u8; w]; + unsafe { super::gray_n_to_luma_row::<10, true>(&be, &mut simd_be, w) }; + scalar::gray_n_to_luma_row::<10, false>(&le, &mut scal_le, w); + assert_eq!(simd_be, scal_le, "width={w}"); + } + } + + #[test] + #[cfg_attr(miri, ignore = "SIMD intrinsics unsupported by Miri")] + fn 
wasm_gray16_be_parity_luma() { + for &w in WIDTHS { + let mut le = std::vec![0u16; w]; + prng16(&mut le, 0xBE16_0002); + let be: std::vec::Vec = le.iter().map(|v| v.swap_bytes()).collect(); + let mut simd_be = std::vec![0u8; w]; + let mut scal_le = std::vec![0u8; w]; + unsafe { super::gray16_to_luma_row::(&be, &mut simd_be, w) }; + scalar::gray16_to_luma_row::(&le, &mut scal_le, w); + assert_eq!(simd_be, scal_le, "width={w}"); + } + } } diff --git a/src/row/arch/x86_avx2/gray.rs b/src/row/arch/x86_avx2/gray.rs index a0977b02..48e973b8 100644 --- a/src/row/arch/x86_avx2/gray.rs +++ b/src/row/arch/x86_avx2/gray.rs @@ -16,7 +16,10 @@ use core::arch::x86_64::*; -use crate::row::scalar::{bits_mask, gray as scalar}; +use crate::row::{ + arch::x86_avx2::endian::{load_endian_u16x16, load_endian_u32x8}, + scalar::{bits_mask, gray as scalar}, +}; // ---- Gray8 ------------------------------------------------------------------ @@ -120,7 +123,7 @@ pub(crate) unsafe fn gray8_to_hsv_row( #[allow(dead_code)] #[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn gray_n_to_rgb_row( +pub(crate) unsafe fn gray_n_to_rgb_row( y_plane: &[u16], out: &mut [u8], width: usize, @@ -128,7 +131,7 @@ pub(crate) unsafe fn gray_n_to_rgb_row( ) { debug_assert!(y_plane.len() >= width); debug_assert!(out.len() >= width * 3); - scalar::gray_n_to_rgb_row::(y_plane, out, width, full_range); + scalar::gray_n_to_rgb_row::(y_plane, out, width, full_range); } /// AVX2 `gray_n_to_rgba_row`. @@ -140,7 +143,7 @@ pub(crate) unsafe fn gray_n_to_rgb_row( #[allow(dead_code)] #[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn gray_n_to_rgba_row( +pub(crate) unsafe fn gray_n_to_rgba_row( y_plane: &[u16], out: &mut [u8], width: usize, @@ -148,7 +151,7 @@ pub(crate) unsafe fn gray_n_to_rgba_row( ) { debug_assert!(y_plane.len() >= width); debug_assert!(out.len() >= width * 4); - scalar::gray_n_to_rgba_row::(y_plane, out, width, full_range); + scalar::gray_n_to_rgba_row::(y_plane, out, width, full_range); } /// AVX2 `gray_n_to_rgb_u16_row`. @@ -160,7 +163,7 @@ pub(crate) unsafe fn gray_n_to_rgba_row( #[allow(dead_code)] #[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn gray_n_to_rgb_u16_row( +pub(crate) unsafe fn gray_n_to_rgb_u16_row( y_plane: &[u16], out: &mut [u16], width: usize, @@ -168,7 +171,7 @@ pub(crate) unsafe fn gray_n_to_rgb_u16_row( ) { debug_assert!(y_plane.len() >= width); debug_assert!(out.len() >= width * 3); - scalar::gray_n_to_rgb_u16_row::(y_plane, out, width, full_range); + scalar::gray_n_to_rgb_u16_row::(y_plane, out, width, full_range); } /// AVX2 `gray_n_to_rgba_u16_row`. @@ -180,7 +183,7 @@ pub(crate) unsafe fn gray_n_to_rgb_u16_row( #[allow(dead_code)] #[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn gray_n_to_rgba_u16_row( +pub(crate) unsafe fn gray_n_to_rgba_u16_row( y_plane: &[u16], out: &mut [u16], width: usize, @@ -188,7 +191,7 @@ pub(crate) unsafe fn gray_n_to_rgba_u16_row( ) { debug_assert!(y_plane.len() >= width); debug_assert!(out.len() >= width * 4); - scalar::gray_n_to_rgba_u16_row::(y_plane, out, width, full_range); + scalar::gray_n_to_rgba_u16_row::(y_plane, out, width, full_range); } /// AVX2 `gray_n_to_luma_row`: mask + shift to u8. 16 pixels/iter. 
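Note: the AVX2 file now imports `load_endian_u16x16` and `load_endian_u32x8` from its endian module, which this diff does not show. A plausible sketch of the u16 variant, assuming the same big-endian const flag as elsewhere (the `BE` name and the body are assumptions, not the crate's code):

#[cfg(target_arch = "x86_64")]
use core::arch::x86_64::*;

// Hypothetical sketch: unaligned 16 x u16 load with an optional in-lane
// byte swap for big-endian sources.
#[cfg(target_arch = "x86_64")]
#[inline]
#[target_feature(enable = "avx2")]
unsafe fn load_endian_u16x16<const BE: bool>(ptr: *const u8) -> __m256i {
    let raw = _mm256_loadu_si256(ptr.cast());
    if BE {
        // pshufb mask that swaps the bytes of every 16-bit lane; the same
        // 16-byte pattern is used in both 128-bit halves.
        let swap = _mm256_setr_epi8(
            1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14,
            1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14,
        );
        _mm256_shuffle_epi8(raw, swap)
    } else {
        raw
    }
}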
@@ -200,7 +203,7 @@ pub(crate) unsafe fn gray_n_to_rgba_u16_row( #[allow(dead_code)] #[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn gray_n_to_luma_row( +pub(crate) unsafe fn gray_n_to_luma_row( y_plane: &[u16], out: &mut [u8], width: usize, @@ -215,7 +218,7 @@ pub(crate) unsafe fn gray_n_to_luma_row( // requires a literal const generic shift not expressible as `BITS - 8`. let shr = _mm_cvtsi32_si128((BITS - 8) as i32); while x + 16 <= width { - let raw = _mm256_loadu_si256(y_plane.as_ptr().add(x).cast()); + let raw = load_endian_u16x16::(y_plane.as_ptr().cast::().add(x * 2)); let masked = _mm256_and_si256(raw, mask_v); let shifted = _mm256_srl_epi16(masked, shr); // Pack u16x16 → u8x16 (with lane-cross fixup via permute4x64) @@ -231,7 +234,7 @@ pub(crate) unsafe fn gray_n_to_luma_row( } } if x < width { - scalar::gray_n_to_luma_row::(&y_plane[x..width], &mut out[x..width], width - x); + scalar::gray_n_to_luma_row::(&y_plane[x..width], &mut out[x..width], width - x); } } @@ -244,7 +247,7 @@ pub(crate) unsafe fn gray_n_to_luma_row( #[allow(dead_code)] #[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn gray_n_to_luma_u16_row( +pub(crate) unsafe fn gray_n_to_luma_u16_row( y_plane: &[u16], out: &mut [u16], width: usize, @@ -256,14 +259,14 @@ pub(crate) unsafe fn gray_n_to_luma_u16_row( unsafe { let mask_v = _mm256_set1_epi16(mask as i16); while x + 16 <= width { - let raw = _mm256_loadu_si256(y_plane.as_ptr().add(x).cast()); + let raw = load_endian_u16x16::(y_plane.as_ptr().cast::().add(x * 2)); let masked = _mm256_and_si256(raw, mask_v); _mm256_storeu_si256(out.as_mut_ptr().add(x).cast(), masked); x += 16; } } if x < width { - scalar::gray_n_to_luma_u16_row::(&y_plane[x..width], &mut out[x..width], width - x); + scalar::gray_n_to_luma_u16_row::(&y_plane[x..width], &mut out[x..width], width - x); } } @@ -277,7 +280,7 @@ pub(crate) unsafe fn gray_n_to_luma_u16_row( #[allow(dead_code)] #[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn gray_n_to_hsv_row( +pub(crate) unsafe fn gray_n_to_hsv_row( y_plane: &[u16], h_out: &mut [u8], s_out: &mut [u8], @@ -287,7 +290,7 @@ pub(crate) unsafe fn gray_n_to_hsv_row( ) { debug_assert!(y_plane.len() >= width); if !full_range { - return scalar::gray_n_to_hsv_row::(y_plane, h_out, s_out, v_out, width, full_range); + return scalar::gray_n_to_hsv_row::(y_plane, h_out, s_out, v_out, width, full_range); } let mask = bits_mask::(); let mut x = 0usize; @@ -296,7 +299,7 @@ pub(crate) unsafe fn gray_n_to_hsv_row( let shr = _mm_cvtsi32_si128((BITS - 8) as i32); let zero256 = _mm256_setzero_si256(); while x + 16 <= width { - let raw = _mm256_loadu_si256(y_plane.as_ptr().add(x).cast()); + let raw = load_endian_u16x16::(y_plane.as_ptr().cast::().add(x * 2)); let masked = _mm256_and_si256(raw, mask_v); let shifted = _mm256_srl_epi16(masked, shr); let packed = _mm256_permute4x64_epi64::<0xD8>(_mm256_packus_epi16(shifted, zero256)); @@ -315,7 +318,7 @@ pub(crate) unsafe fn gray_n_to_hsv_row( } } if x < width { - scalar::gray_n_to_hsv_row::( + scalar::gray_n_to_hsv_row::( &y_plane[x..width], &mut h_out[x..width], &mut s_out[x..width], @@ -337,7 +340,7 @@ pub(crate) unsafe fn gray_n_to_hsv_row( #[allow(dead_code)] #[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn gray16_to_rgb_row( +pub(crate) unsafe fn gray16_to_rgb_row( y_plane: &[u16], out: &mut [u8], width: usize, @@ -345,7 +348,7 @@ pub(crate) unsafe fn gray16_to_rgb_row( ) { debug_assert!(y_plane.len() >= width); debug_assert!(out.len() >= width * 3); - 
scalar::gray16_to_rgb_row(y_plane, out, width, full_range); + scalar::gray16_to_rgb_row::(y_plane, out, width, full_range); } /// AVX2 `gray16_to_rgba_row`. @@ -357,7 +360,7 @@ pub(crate) unsafe fn gray16_to_rgb_row( #[allow(dead_code)] #[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn gray16_to_rgba_row( +pub(crate) unsafe fn gray16_to_rgba_row( y_plane: &[u16], out: &mut [u8], width: usize, @@ -365,7 +368,7 @@ pub(crate) unsafe fn gray16_to_rgba_row( ) { debug_assert!(y_plane.len() >= width); debug_assert!(out.len() >= width * 4); - scalar::gray16_to_rgba_row(y_plane, out, width, full_range); + scalar::gray16_to_rgba_row::(y_plane, out, width, full_range); } /// AVX2 `gray16_to_rgb_u16_row`. @@ -377,7 +380,7 @@ pub(crate) unsafe fn gray16_to_rgba_row( #[allow(dead_code)] #[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn gray16_to_rgb_u16_row( +pub(crate) unsafe fn gray16_to_rgb_u16_row( y_plane: &[u16], out: &mut [u16], width: usize, @@ -385,7 +388,7 @@ pub(crate) unsafe fn gray16_to_rgb_u16_row( ) { debug_assert!(y_plane.len() >= width); debug_assert!(out.len() >= width * 3); - scalar::gray16_to_rgb_u16_row(y_plane, out, width, full_range); + scalar::gray16_to_rgb_u16_row::(y_plane, out, width, full_range); } /// AVX2 `gray16_to_rgba_u16_row`. @@ -397,7 +400,7 @@ pub(crate) unsafe fn gray16_to_rgb_u16_row( #[allow(dead_code)] #[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn gray16_to_rgba_u16_row( +pub(crate) unsafe fn gray16_to_rgba_u16_row( y_plane: &[u16], out: &mut [u16], width: usize, @@ -405,7 +408,7 @@ pub(crate) unsafe fn gray16_to_rgba_u16_row( ) { debug_assert!(y_plane.len() >= width); debug_assert!(out.len() >= width * 4); - scalar::gray16_to_rgba_u16_row(y_plane, out, width, full_range); + scalar::gray16_to_rgba_u16_row::(y_plane, out, width, full_range); } /// AVX2 `gray16_to_luma_row`: `>> 8`, pack, store. 16 pixels/iter. @@ -417,14 +420,18 @@ pub(crate) unsafe fn gray16_to_rgba_u16_row( #[allow(dead_code)] #[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn gray16_to_luma_row(y_plane: &[u16], out: &mut [u8], width: usize) { +pub(crate) unsafe fn gray16_to_luma_row( + y_plane: &[u16], + out: &mut [u8], + width: usize, +) { debug_assert!(y_plane.len() >= width); debug_assert!(out.len() >= width); let mut x = 0usize; unsafe { let zero = _mm256_setzero_si256(); while x + 16 <= width { - let raw = _mm256_loadu_si256(y_plane.as_ptr().add(x).cast()); + let raw = load_endian_u16x16::(y_plane.as_ptr().cast::().add(x * 2)); let shifted = _mm256_srli_epi16(raw, 8); // Pack u16x16 → u8x16 with lane-cross fixup. 
let packed = _mm256_permute4x64_epi64::<0xD8>(_mm256_packus_epi16(shifted, zero)); @@ -434,7 +441,7 @@ pub(crate) unsafe fn gray16_to_luma_row(y_plane: &[u16], out: &mut [u8], width: } } if x < width { - scalar::gray16_to_luma_row(&y_plane[x..width], &mut out[x..width], width - x); + scalar::gray16_to_luma_row::(&y_plane[x..width], &mut out[x..width], width - x); } } @@ -447,19 +454,23 @@ pub(crate) unsafe fn gray16_to_luma_row(y_plane: &[u16], out: &mut [u8], width: #[allow(dead_code)] #[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn gray16_to_luma_u16_row(y_plane: &[u16], out: &mut [u16], width: usize) { +pub(crate) unsafe fn gray16_to_luma_u16_row( + y_plane: &[u16], + out: &mut [u16], + width: usize, +) { debug_assert!(y_plane.len() >= width); debug_assert!(out.len() >= width); let mut x = 0usize; unsafe { while x + 16 <= width { - let y = _mm256_loadu_si256(y_plane.as_ptr().add(x).cast()); + let y = load_endian_u16x16::(y_plane.as_ptr().cast::().add(x * 2)); _mm256_storeu_si256(out.as_mut_ptr().add(x).cast(), y); x += 16; } } if x < width { - scalar::gray16_to_luma_u16_row(&y_plane[x..width], &mut out[x..width], width - x); + scalar::gray16_to_luma_u16_row::(&y_plane[x..width], &mut out[x..width], width - x); } } @@ -473,7 +484,7 @@ pub(crate) unsafe fn gray16_to_luma_u16_row(y_plane: &[u16], out: &mut [u16], wi #[allow(dead_code)] #[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn gray16_to_hsv_row( +pub(crate) unsafe fn gray16_to_hsv_row( y_plane: &[u16], h_out: &mut [u8], s_out: &mut [u8], @@ -483,13 +494,13 @@ pub(crate) unsafe fn gray16_to_hsv_row( ) { debug_assert!(y_plane.len() >= width); if !full_range { - return scalar::gray16_to_hsv_row(y_plane, h_out, s_out, v_out, width, full_range); + return scalar::gray16_to_hsv_row::(y_plane, h_out, s_out, v_out, width, full_range); } let mut x = 0usize; unsafe { let zero = _mm256_setzero_si256(); while x + 16 <= width { - let raw = _mm256_loadu_si256(y_plane.as_ptr().add(x).cast()); + let raw = load_endian_u16x16::(y_plane.as_ptr().cast::().add(x * 2)); let shifted = _mm256_srli_epi16(raw, 8); let packed = _mm256_permute4x64_epi64::<0xD8>(_mm256_packus_epi16(shifted, zero)); let lo = _mm256_castsi256_si128(packed); @@ -506,7 +517,7 @@ pub(crate) unsafe fn gray16_to_hsv_row( } } if x < width { - scalar::gray16_to_hsv_row( + scalar::gray16_to_hsv_row::( &y_plane[x..width], &mut h_out[x..width], &mut s_out[x..width], @@ -529,7 +540,11 @@ pub(crate) unsafe fn gray16_to_hsv_row( /// AVX2 must be available. 
#[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn grayf32_to_rgb_row(y_plane: &[f32], out: &mut [u8], width: usize) { +pub(crate) unsafe fn grayf32_to_rgb_row( + y_plane: &[f32], + out: &mut [u8], + width: usize, +) { use crate::row::scalar::grayf32 as scalar; debug_assert!(y_plane.len() >= width); debug_assert!(out.len() >= width * 3); @@ -539,7 +554,9 @@ pub(crate) unsafe fn grayf32_to_rgb_row(y_plane: &[f32], out: &mut [u8], width: let mut x = 0usize; unsafe { while x + 8 <= width { - let y = _mm256_loadu_ps(y_plane.as_ptr().add(x)); + let y = _mm256_castsi256_ps(load_endian_u32x8::( + y_plane.as_ptr().cast::().add(x * 4), + )); let clamped = _mm256_min_ps(_mm256_max_ps(y, zero), one); let scaled = _mm256_mul_ps(clamped, scale); let int32 = _mm256_cvttps_epi32(_mm256_add_ps(scaled, _mm256_set1_ps(0.5))); @@ -561,7 +578,7 @@ pub(crate) unsafe fn grayf32_to_rgb_row(y_plane: &[f32], out: &mut [u8], width: } } if x < width { - scalar::grayf32_to_rgb_row(&y_plane[x..width], &mut out[x * 3..width * 3], width - x); + scalar::grayf32_to_rgb_row::(&y_plane[x..width], &mut out[x * 3..width * 3], width - x); } } @@ -571,7 +588,11 @@ pub(crate) unsafe fn grayf32_to_rgb_row(y_plane: &[f32], out: &mut [u8], width: /// AVX2 must be available. #[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn grayf32_to_rgba_row(y_plane: &[f32], out: &mut [u8], width: usize) { +pub(crate) unsafe fn grayf32_to_rgba_row( + y_plane: &[f32], + out: &mut [u8], + width: usize, +) { use crate::row::scalar::grayf32 as scalar; debug_assert!(y_plane.len() >= width); debug_assert!(out.len() >= width * 4); @@ -581,7 +602,9 @@ pub(crate) unsafe fn grayf32_to_rgba_row(y_plane: &[f32], out: &mut [u8], width: let mut x = 0usize; unsafe { while x + 8 <= width { - let y = _mm256_loadu_ps(y_plane.as_ptr().add(x)); + let y = _mm256_castsi256_ps(load_endian_u32x8::( + y_plane.as_ptr().cast::().add(x * 4), + )); let clamped = _mm256_min_ps(_mm256_max_ps(y, zero), one); let scaled = _mm256_mul_ps(clamped, scale); let int32 = _mm256_cvttps_epi32(_mm256_add_ps(scaled, _mm256_set1_ps(0.5))); @@ -602,7 +625,7 @@ pub(crate) unsafe fn grayf32_to_rgba_row(y_plane: &[f32], out: &mut [u8], width: } } if x < width { - scalar::grayf32_to_rgba_row(&y_plane[x..width], &mut out[x * 4..width * 4], width - x); + scalar::grayf32_to_rgba_row::(&y_plane[x..width], &mut out[x * 4..width * 4], width - x); } } @@ -612,7 +635,11 @@ pub(crate) unsafe fn grayf32_to_rgba_row(y_plane: &[f32], out: &mut [u8], width: /// AVX2 must be available. 
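Note: in the grayf32 paths above, the f32 plane is loaded through the integer helper and then bit-cast with `_mm256_castsi256_ps`. The point of that ordering is that a big-endian f32 has to be byte-swapped as raw bits before it is reinterpreted as a float. A scalar picture of the same idea (the `BE` parameter name is a stand-in):

// Read one f32 sample from raw bytes, swapping when the source is big-endian,
// then reinterpret the bits; mirrors load_endian_u32x8 + castsi256_ps.
fn read_f32_sample<const BE: bool>(bytes: [u8; 4]) -> f32 {
    let bits = u32::from_le_bytes(bytes);
    f32::from_bits(if BE { bits.swap_bytes() } else { bits })
}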
#[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn grayf32_to_rgb_u16_row(y_plane: &[f32], out: &mut [u16], width: usize) { +pub(crate) unsafe fn grayf32_to_rgb_u16_row( + y_plane: &[f32], + out: &mut [u16], + width: usize, +) { use crate::row::scalar::grayf32 as scalar; debug_assert!(y_plane.len() >= width); debug_assert!(out.len() >= width * 3); @@ -622,7 +649,9 @@ pub(crate) unsafe fn grayf32_to_rgb_u16_row(y_plane: &[f32], out: &mut [u16], wi let mut x = 0usize; unsafe { while x + 8 <= width { - let y = _mm256_loadu_ps(y_plane.as_ptr().add(x)); + let y = _mm256_castsi256_ps(load_endian_u32x8::( + y_plane.as_ptr().cast::().add(x * 4), + )); let clamped = _mm256_min_ps(_mm256_max_ps(y, zero), one); let scaled = _mm256_mul_ps(clamped, scale); let int32 = _mm256_cvttps_epi32(_mm256_add_ps(scaled, _mm256_set1_ps(0.5))); @@ -668,7 +697,7 @@ pub(crate) unsafe fn grayf32_to_rgb_u16_row(y_plane: &[f32], out: &mut [u16], wi } } if x < width { - scalar::grayf32_to_rgb_u16_row(&y_plane[x..width], &mut out[x * 3..width * 3], width - x); + scalar::grayf32_to_rgb_u16_row::(&y_plane[x..width], &mut out[x * 3..width * 3], width - x); } } @@ -678,7 +707,11 @@ pub(crate) unsafe fn grayf32_to_rgb_u16_row(y_plane: &[f32], out: &mut [u16], wi /// AVX2 must be available. #[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn grayf32_to_rgba_u16_row(y_plane: &[f32], out: &mut [u16], width: usize) { +pub(crate) unsafe fn grayf32_to_rgba_u16_row( + y_plane: &[f32], + out: &mut [u16], + width: usize, +) { use crate::row::scalar::grayf32 as scalar; debug_assert!(y_plane.len() >= width); debug_assert!(out.len() >= width * 4); @@ -688,7 +721,9 @@ pub(crate) unsafe fn grayf32_to_rgba_u16_row(y_plane: &[f32], out: &mut [u16], w let mut x = 0usize; unsafe { while x + 8 <= width { - let y = _mm256_loadu_ps(y_plane.as_ptr().add(x)); + let y = _mm256_castsi256_ps(load_endian_u32x8::( + y_plane.as_ptr().cast::().add(x * 4), + )); let clamped = _mm256_min_ps(_mm256_max_ps(y, zero), one); let scaled = _mm256_mul_ps(clamped, scale); let int32 = _mm256_cvttps_epi32(_mm256_add_ps(scaled, _mm256_set1_ps(0.5))); @@ -740,7 +775,11 @@ pub(crate) unsafe fn grayf32_to_rgba_u16_row(y_plane: &[f32], out: &mut [u16], w } } if x < width { - scalar::grayf32_to_rgba_u16_row(&y_plane[x..width], &mut out[x * 4..width * 4], width - x); + scalar::grayf32_to_rgba_u16_row::( + &y_plane[x..width], + &mut out[x * 4..width * 4], + width - x, + ); } } @@ -751,11 +790,15 @@ pub(crate) unsafe fn grayf32_to_rgba_u16_row(y_plane: &[f32], out: &mut [u16], w #[allow(dead_code)] // dispatcher uses scalar directly for lossless f32 paths #[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn grayf32_to_rgb_f32_row(y_plane: &[f32], out: &mut [f32], width: usize) { +pub(crate) unsafe fn grayf32_to_rgb_f32_row( + y_plane: &[f32], + out: &mut [f32], + width: usize, +) { use crate::row::scalar::grayf32 as scalar; debug_assert!(y_plane.len() >= width); debug_assert!(out.len() >= width * 3); - scalar::grayf32_to_rgb_f32_row(y_plane, out, width); + scalar::grayf32_to_rgb_f32_row::(y_plane, out, width); } /// AVX2 `grayf32_to_luma_row`: clamp [0,1] × 255 → u8. 8 px/iter. @@ -764,7 +807,11 @@ pub(crate) unsafe fn grayf32_to_rgb_f32_row(y_plane: &[f32], out: &mut [f32], wi /// AVX2 must be available. 
#[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn grayf32_to_luma_row(y_plane: &[f32], out: &mut [u8], width: usize) { +pub(crate) unsafe fn grayf32_to_luma_row( + y_plane: &[f32], + out: &mut [u8], + width: usize, +) { use crate::row::scalar::grayf32 as scalar; debug_assert!(y_plane.len() >= width); debug_assert!(out.len() >= width); @@ -774,7 +821,9 @@ pub(crate) unsafe fn grayf32_to_luma_row(y_plane: &[f32], out: &mut [u8], width: let mut x = 0usize; unsafe { while x + 8 <= width { - let y = _mm256_loadu_ps(y_plane.as_ptr().add(x)); + let y = _mm256_castsi256_ps(load_endian_u32x8::( + y_plane.as_ptr().cast::().add(x * 4), + )); let clamped = _mm256_min_ps(_mm256_max_ps(y, zero), one); let scaled = _mm256_mul_ps(clamped, scale); let int32 = _mm256_cvttps_epi32(_mm256_add_ps(scaled, _mm256_set1_ps(0.5))); @@ -788,7 +837,7 @@ pub(crate) unsafe fn grayf32_to_luma_row(y_plane: &[f32], out: &mut [u8], width: } } if x < width { - scalar::grayf32_to_luma_row(&y_plane[x..width], &mut out[x..width], width - x); + scalar::grayf32_to_luma_row::(&y_plane[x..width], &mut out[x..width], width - x); } } @@ -798,7 +847,11 @@ pub(crate) unsafe fn grayf32_to_luma_row(y_plane: &[f32], out: &mut [u8], width: /// AVX2 must be available. #[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn grayf32_to_luma_u16_row(y_plane: &[f32], out: &mut [u16], width: usize) { +pub(crate) unsafe fn grayf32_to_luma_u16_row( + y_plane: &[f32], + out: &mut [u16], + width: usize, +) { use crate::row::scalar::grayf32 as scalar; debug_assert!(y_plane.len() >= width); debug_assert!(out.len() >= width); @@ -808,7 +861,9 @@ pub(crate) unsafe fn grayf32_to_luma_u16_row(y_plane: &[f32], out: &mut [u16], w let mut x = 0usize; unsafe { while x + 8 <= width { - let y = _mm256_loadu_ps(y_plane.as_ptr().add(x)); + let y = _mm256_castsi256_ps(load_endian_u32x8::( + y_plane.as_ptr().cast::().add(x * 4), + )); let clamped = _mm256_min_ps(_mm256_max_ps(y, zero), one); let scaled = _mm256_mul_ps(clamped, scale); let int32 = _mm256_cvttps_epi32(_mm256_add_ps(scaled, _mm256_set1_ps(0.5))); @@ -820,7 +875,7 @@ pub(crate) unsafe fn grayf32_to_luma_u16_row(y_plane: &[f32], out: &mut [u16], w } } if x < width { - scalar::grayf32_to_luma_u16_row(&y_plane[x..width], &mut out[x..width], width - x); + scalar::grayf32_to_luma_u16_row::(&y_plane[x..width], &mut out[x..width], width - x); } } @@ -831,11 +886,15 @@ pub(crate) unsafe fn grayf32_to_luma_u16_row(y_plane: &[f32], out: &mut [u16], w #[allow(dead_code)] // dispatcher uses scalar directly for lossless f32 paths #[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn grayf32_to_luma_f32_row(y_plane: &[f32], out: &mut [f32], width: usize) { +pub(crate) unsafe fn grayf32_to_luma_f32_row( + y_plane: &[f32], + out: &mut [f32], + width: usize, +) { use crate::row::scalar::grayf32 as scalar; debug_assert!(y_plane.len() >= width); debug_assert!(out.len() >= width); - scalar::grayf32_to_luma_f32_row(y_plane, out, width); + scalar::grayf32_to_luma_f32_row::(y_plane, out, width); } /// AVX2 `grayf32_to_hsv_row`: H=0, S=0, V = clamp(Y,0,1)×255. 8 px/iter. @@ -844,7 +903,7 @@ pub(crate) unsafe fn grayf32_to_luma_f32_row(y_plane: &[f32], out: &mut [f32], w /// AVX2 must be available. 
#[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn grayf32_to_hsv_row( +pub(crate) unsafe fn grayf32_to_hsv_row( y_plane: &[f32], h_out: &mut [u8], s_out: &mut [u8], @@ -859,7 +918,9 @@ pub(crate) unsafe fn grayf32_to_hsv_row( let mut x = 0usize; unsafe { while x + 8 <= width { - let y = _mm256_loadu_ps(y_plane.as_ptr().add(x)); + let y = _mm256_castsi256_ps(load_endian_u32x8::( + y_plane.as_ptr().cast::().add(x * 4), + )); let clamped = _mm256_min_ps(_mm256_max_ps(y, zero), one); let scaled = _mm256_mul_ps(clamped, scale); let int32 = _mm256_cvttps_epi32(_mm256_add_ps(scaled, _mm256_set1_ps(0.5))); @@ -876,7 +937,7 @@ pub(crate) unsafe fn grayf32_to_hsv_row( } } if x < width { - scalar::grayf32_to_hsv_row( + scalar::grayf32_to_hsv_row::( &y_plane[x..width], &mut h_out[x..width], &mut s_out[x..width], @@ -1093,7 +1154,7 @@ pub(crate) unsafe fn ya8_to_hsv_row( /// AVX2 must be available. #[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn ya16_to_rgb_row(packed: &[u16], out: &mut [u8], width: usize) { +pub(crate) unsafe fn ya16_to_rgb_row(packed: &[u16], out: &mut [u8], width: usize) { use crate::row::scalar::ya16 as scalar; debug_assert!(packed.len() >= width * 2); debug_assert!(out.len() >= width * 3); @@ -1119,7 +1180,7 @@ pub(crate) unsafe fn ya16_to_rgb_row(packed: &[u16], out: &mut [u8], width: usiz } } if x < width { - scalar::ya16_to_rgb_row( + scalar::ya16_to_rgb_row::( &packed[x * 2..width * 2], &mut out[x * 3..width * 3], width - x, @@ -1133,7 +1194,11 @@ pub(crate) unsafe fn ya16_to_rgb_row(packed: &[u16], out: &mut [u8], width: usiz /// AVX2 must be available. #[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn ya16_to_rgba_row(packed: &[u16], out: &mut [u8], width: usize) { +pub(crate) unsafe fn ya16_to_rgba_row( + packed: &[u16], + out: &mut [u8], + width: usize, +) { use crate::row::scalar::ya16 as scalar; debug_assert!(packed.len() >= width * 2); debug_assert!(out.len() >= width * 4); @@ -1169,7 +1234,7 @@ pub(crate) unsafe fn ya16_to_rgba_row(packed: &[u16], out: &mut [u8], width: usi } } if x < width { - scalar::ya16_to_rgba_row( + scalar::ya16_to_rgba_row::( &packed[x * 2..width * 2], &mut out[x * 4..width * 4], width - x, @@ -1183,11 +1248,15 @@ pub(crate) unsafe fn ya16_to_rgba_row(packed: &[u16], out: &mut [u8], width: usi /// AVX2 must be available. #[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn ya16_to_rgb_u16_row(packed: &[u16], out: &mut [u16], width: usize) { +pub(crate) unsafe fn ya16_to_rgb_u16_row( + packed: &[u16], + out: &mut [u16], + width: usize, +) { use crate::row::scalar::ya16 as scalar; debug_assert!(packed.len() >= width * 2); debug_assert!(out.len() >= width * 3); - scalar::ya16_to_rgb_u16_row(packed, out, width); + scalar::ya16_to_rgb_u16_row::(packed, out, width); } /// AVX2 `ya16_to_rgba_u16_row`: native Y and A u16. @@ -1196,11 +1265,15 @@ pub(crate) unsafe fn ya16_to_rgb_u16_row(packed: &[u16], out: &mut [u16], width: /// AVX2 must be available. #[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn ya16_to_rgba_u16_row(packed: &[u16], out: &mut [u16], width: usize) { +pub(crate) unsafe fn ya16_to_rgba_u16_row( + packed: &[u16], + out: &mut [u16], + width: usize, +) { use crate::row::scalar::ya16 as scalar; debug_assert!(packed.len() >= width * 2); debug_assert!(out.len() >= width * 4); - scalar::ya16_to_rgba_u16_row(packed, out, width); + scalar::ya16_to_rgba_u16_row::(packed, out, width); } /// AVX2 `ya16_to_luma_row`: Y `>> 8` → u8. 4 px/iter. 
@@ -1209,7 +1282,11 @@ pub(crate) unsafe fn ya16_to_rgba_u16_row(packed: &[u16], out: &mut [u16], width /// AVX2 must be available. #[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn ya16_to_luma_row(packed: &[u16], out: &mut [u8], width: usize) { +pub(crate) unsafe fn ya16_to_luma_row( + packed: &[u16], + out: &mut [u8], + width: usize, +) { use crate::row::scalar::ya16 as scalar; debug_assert!(packed.len() >= width * 2); debug_assert!(out.len() >= width); @@ -1229,7 +1306,7 @@ pub(crate) unsafe fn ya16_to_luma_row(packed: &[u16], out: &mut [u8], width: usi } } if x < width { - scalar::ya16_to_luma_row(&packed[x * 2..width * 2], &mut out[x..width], width - x); + scalar::ya16_to_luma_row::(&packed[x * 2..width * 2], &mut out[x..width], width - x); } } @@ -1239,11 +1316,15 @@ pub(crate) unsafe fn ya16_to_luma_row(packed: &[u16], out: &mut [u8], width: usi /// AVX2 must be available. #[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn ya16_to_luma_u16_row(packed: &[u16], out: &mut [u16], width: usize) { +pub(crate) unsafe fn ya16_to_luma_u16_row( + packed: &[u16], + out: &mut [u16], + width: usize, +) { use crate::row::scalar::ya16 as scalar; debug_assert!(packed.len() >= width * 2); debug_assert!(out.len() >= width); - scalar::ya16_to_luma_u16_row(packed, out, width); + scalar::ya16_to_luma_u16_row::(packed, out, width); } /// AVX2 `ya16_to_hsv_row`: H=0, S=0, V = Y `>> 8`. α dropped. @@ -1252,7 +1333,7 @@ pub(crate) unsafe fn ya16_to_luma_u16_row(packed: &[u16], out: &mut [u16], width /// AVX2 must be available. #[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn ya16_to_hsv_row( +pub(crate) unsafe fn ya16_to_hsv_row( packed: &[u16], h_out: &mut [u8], s_out: &mut [u8], @@ -1280,7 +1361,7 @@ pub(crate) unsafe fn ya16_to_hsv_row( } } if x < width { - scalar::ya16_to_hsv_row( + scalar::ya16_to_hsv_row::( &packed[x * 2..width * 2], &mut h_out[x..width], &mut s_out[x..width], @@ -1332,8 +1413,8 @@ mod tests { prng_f32(&mut plane, 0xF200_0001); let mut simd = std::vec![0u8; w * 3]; let mut scal = std::vec![0u8; w * 3]; - unsafe { super::grayf32_to_rgb_row(&plane, &mut simd, w) }; - sf::grayf32_to_rgb_row(&plane, &mut scal, w); + unsafe { super::grayf32_to_rgb_row::(&plane, &mut simd, w) }; + sf::grayf32_to_rgb_row::(&plane, &mut scal, w); assert_eq!(simd, scal, "width={w}"); } } @@ -1350,8 +1431,8 @@ mod tests { prng_f32(&mut plane, 0xF200_0002); let mut simd = std::vec![0u8; w * 4]; let mut scal = std::vec![0u8; w * 4]; - unsafe { super::grayf32_to_rgba_row(&plane, &mut simd, w) }; - sf::grayf32_to_rgba_row(&plane, &mut scal, w); + unsafe { super::grayf32_to_rgba_row::(&plane, &mut simd, w) }; + sf::grayf32_to_rgba_row::(&plane, &mut scal, w); assert_eq!(simd, scal, "width={w}"); } } @@ -1368,8 +1449,8 @@ mod tests { prng_f32(&mut plane, 0xF200_0003); let mut simd = std::vec![0u16; w * 3]; let mut scal = std::vec![0u16; w * 3]; - unsafe { super::grayf32_to_rgb_u16_row(&plane, &mut simd, w) }; - sf::grayf32_to_rgb_u16_row(&plane, &mut scal, w); + unsafe { super::grayf32_to_rgb_u16_row::(&plane, &mut simd, w) }; + sf::grayf32_to_rgb_u16_row::(&plane, &mut scal, w); assert_eq!(simd, scal, "width={w}"); } } @@ -1386,8 +1467,8 @@ mod tests { prng_f32(&mut plane, 0xF200_0004); let mut simd = std::vec![0u16; w * 4]; let mut scal = std::vec![0u16; w * 4]; - unsafe { super::grayf32_to_rgba_u16_row(&plane, &mut simd, w) }; - sf::grayf32_to_rgba_u16_row(&plane, &mut scal, w); + unsafe { super::grayf32_to_rgba_u16_row::(&plane, &mut simd, w) }; + 
sf::grayf32_to_rgba_u16_row::(&plane, &mut scal, w); assert_eq!(simd, scal, "width={w}"); } } @@ -1404,8 +1485,8 @@ mod tests { prng_f32(&mut plane, 0xF200_0006); let mut simd = std::vec![0u8; w]; let mut scal = std::vec![0u8; w]; - unsafe { super::grayf32_to_luma_row(&plane, &mut simd, w) }; - sf::grayf32_to_luma_row(&plane, &mut scal, w); + unsafe { super::grayf32_to_luma_row::(&plane, &mut simd, w) }; + sf::grayf32_to_luma_row::(&plane, &mut scal, w); assert_eq!(simd, scal, "width={w}"); } } @@ -1422,8 +1503,8 @@ mod tests { prng_f32(&mut plane, 0xF200_0007); let mut simd = std::vec![0u16; w]; let mut scal = std::vec![0u16; w]; - unsafe { super::grayf32_to_luma_u16_row(&plane, &mut simd, w) }; - sf::grayf32_to_luma_u16_row(&plane, &mut scal, w); + unsafe { super::grayf32_to_luma_u16_row::(&plane, &mut simd, w) }; + sf::grayf32_to_luma_u16_row::(&plane, &mut scal, w); assert_eq!(simd, scal, "width={w}"); } } @@ -1444,8 +1525,8 @@ mod tests { let mut rh = std::vec![0u8; w]; let mut rs = std::vec![0u8; w]; let mut rv = std::vec![0u8; w]; - unsafe { super::grayf32_to_hsv_row(&plane, &mut sh, &mut ss, &mut sv, w) }; - sf::grayf32_to_hsv_row(&plane, &mut rh, &mut rs, &mut rv, w); + unsafe { super::grayf32_to_hsv_row::(&plane, &mut sh, &mut ss, &mut sv, w) }; + sf::grayf32_to_hsv_row::(&plane, &mut rh, &mut rs, &mut rv, w); assert_eq!(sh, rh, "H width={w}"); assert_eq!(ss, rs, "S width={w}"); assert_eq!(sv, rv, "V width={w}"); @@ -1464,8 +1545,8 @@ mod tests { prng_f32(&mut plane, 0xF200_0005); let mut simd = std::vec![0.0f32; w * 3]; let mut scal = std::vec![0.0f32; w * 3]; - unsafe { super::grayf32_to_rgb_f32_row(&plane, &mut simd, w) }; - sf::grayf32_to_rgb_f32_row(&plane, &mut scal, w); + unsafe { super::grayf32_to_rgb_f32_row::(&plane, &mut simd, w) }; + sf::grayf32_to_rgb_f32_row::(&plane, &mut scal, w); assert_eq!(simd, scal, "width={w}"); } } @@ -1482,8 +1563,8 @@ mod tests { prng_f32(&mut plane, 0xF200_0008); let mut simd = std::vec![0.0f32; w]; let mut scal = std::vec![0.0f32; w]; - unsafe { super::grayf32_to_luma_f32_row(&plane, &mut simd, w) }; - sf::grayf32_to_luma_f32_row(&plane, &mut scal, w); + unsafe { super::grayf32_to_luma_f32_row::(&plane, &mut simd, w) }; + sf::grayf32_to_luma_f32_row::(&plane, &mut scal, w); assert_eq!(simd, scal, "width={w}"); } } @@ -1632,8 +1713,8 @@ mod tests { prng_ya16(&mut packed, 0xA260_0001); let mut simd = std::vec![0u8; w * 3]; let mut scal = std::vec![0u8; w * 3]; - unsafe { super::ya16_to_rgb_row(&packed, &mut simd, w) }; - sy::ya16_to_rgb_row(&packed, &mut scal, w); + unsafe { super::ya16_to_rgb_row::(&packed, &mut simd, w) }; + sy::ya16_to_rgb_row::(&packed, &mut scal, w); assert_eq!(simd, scal, "width={w}"); } } @@ -1650,8 +1731,8 @@ mod tests { prng_ya16(&mut packed, 0xA260_0002); let mut simd = std::vec![0u8; w * 4]; let mut scal = std::vec![0u8; w * 4]; - unsafe { super::ya16_to_rgba_row(&packed, &mut simd, w) }; - sy::ya16_to_rgba_row(&packed, &mut scal, w); + unsafe { super::ya16_to_rgba_row::(&packed, &mut simd, w) }; + sy::ya16_to_rgba_row::(&packed, &mut scal, w); assert_eq!(simd, scal, "width={w}"); } } @@ -1668,8 +1749,8 @@ mod tests { prng_ya16(&mut packed, 0xA260_0003); let mut simd = std::vec![0u16; w * 3]; let mut scal = std::vec![0u16; w * 3]; - unsafe { super::ya16_to_rgb_u16_row(&packed, &mut simd, w) }; - sy::ya16_to_rgb_u16_row(&packed, &mut scal, w); + unsafe { super::ya16_to_rgb_u16_row::(&packed, &mut simd, w) }; + sy::ya16_to_rgb_u16_row::(&packed, &mut scal, w); assert_eq!(simd, scal, "width={w}"); } } @@ -1686,8 
+1767,8 @@ mod tests { prng_ya16(&mut packed, 0xA260_0004); let mut simd = std::vec![0u16; w * 4]; let mut scal = std::vec![0u16; w * 4]; - unsafe { super::ya16_to_rgba_u16_row(&packed, &mut simd, w) }; - sy::ya16_to_rgba_u16_row(&packed, &mut scal, w); + unsafe { super::ya16_to_rgba_u16_row::(&packed, &mut simd, w) }; + sy::ya16_to_rgba_u16_row::(&packed, &mut scal, w); assert_eq!(simd, scal, "width={w}"); } } @@ -1704,8 +1785,8 @@ mod tests { prng_ya16(&mut packed, 0xA260_0005); let mut simd = std::vec![0u8; w]; let mut scal = std::vec![0u8; w]; - unsafe { super::ya16_to_luma_row(&packed, &mut simd, w) }; - sy::ya16_to_luma_row(&packed, &mut scal, w); + unsafe { super::ya16_to_luma_row::(&packed, &mut simd, w) }; + sy::ya16_to_luma_row::(&packed, &mut scal, w); assert_eq!(simd, scal, "width={w}"); } } @@ -1722,8 +1803,8 @@ mod tests { prng_ya16(&mut packed, 0xA260_0006); let mut simd = std::vec![0u16; w]; let mut scal = std::vec![0u16; w]; - unsafe { super::ya16_to_luma_u16_row(&packed, &mut simd, w) }; - sy::ya16_to_luma_u16_row(&packed, &mut scal, w); + unsafe { super::ya16_to_luma_u16_row::(&packed, &mut simd, w) }; + sy::ya16_to_luma_u16_row::(&packed, &mut scal, w); assert_eq!(simd, scal, "width={w}"); } } @@ -1744,8 +1825,8 @@ mod tests { let mut rh = std::vec![0u8; w]; let mut rs = std::vec![0u8; w]; let mut rv = std::vec![0u8; w]; - unsafe { super::ya16_to_hsv_row(&packed, &mut sh, &mut ss, &mut sv, w) }; - sy::ya16_to_hsv_row(&packed, &mut rh, &mut rs, &mut rv, w); + unsafe { super::ya16_to_hsv_row::(&packed, &mut sh, &mut ss, &mut sv, w) }; + sy::ya16_to_hsv_row::(&packed, &mut rh, &mut rs, &mut rv, w); assert_eq!(sh, rh, "H width={w}"); assert_eq!(ss, rs, "S width={w}"); assert_eq!(sv, rv, "V width={w}"); @@ -1799,8 +1880,8 @@ mod tests { prng16(&mut plane, 0xABCD_1234); let mut simd = std::vec![0u8; w]; let mut scal = std::vec![0u8; w]; - unsafe { super::gray_n_to_luma_row::<10>(&plane, &mut simd, w) }; - scalar::gray_n_to_luma_row::<10>(&plane, &mut scal, w); + unsafe { super::gray_n_to_luma_row::<10, false>(&plane, &mut simd, w) }; + scalar::gray_n_to_luma_row::<10, false>(&plane, &mut scal, w); assert_eq!(simd, scal, "width={w}"); } } @@ -1815,8 +1896,8 @@ mod tests { prng16(&mut plane, 0xDEAD_CAFE); let mut simd = std::vec![0u16; w]; let mut scal = std::vec![0u16; w]; - unsafe { super::gray_n_to_luma_u16_row::<12>(&plane, &mut simd, w) }; - scalar::gray_n_to_luma_u16_row::<12>(&plane, &mut scal, w); + unsafe { super::gray_n_to_luma_u16_row::<12, false>(&plane, &mut simd, w) }; + scalar::gray_n_to_luma_u16_row::<12, false>(&plane, &mut scal, w); assert_eq!(simd, scal, "width={w}"); } } @@ -1831,8 +1912,8 @@ mod tests { prng16(&mut plane, 0xBEEF_CAFE); let mut simd = std::vec![0u8; w]; let mut scal = std::vec![0u8; w]; - unsafe { super::gray16_to_luma_row(&plane, &mut simd, w) }; - scalar::gray16_to_luma_row(&plane, &mut scal, w); + unsafe { super::gray16_to_luma_row::(&plane, &mut simd, w) }; + scalar::gray16_to_luma_row::(&plane, &mut scal, w); assert_eq!(simd, scal, "width={w}"); } } @@ -1847,8 +1928,8 @@ mod tests { prng16(&mut plane, 0x1234_5678); let mut simd = std::vec![0u16; w]; let mut scal = std::vec![0u16; w]; - unsafe { super::gray16_to_luma_u16_row(&plane, &mut simd, w) }; - scalar::gray16_to_luma_u16_row(&plane, &mut scal, w); + unsafe { super::gray16_to_luma_u16_row::(&plane, &mut simd, w) }; + scalar::gray16_to_luma_u16_row::(&plane, &mut scal, w); assert_eq!(simd, scal, "width={w}"); } } @@ -1879,9 +1960,45 @@ mod tests { prng16(&mut plane, 0x1234_5678); 
let mut simd = std::vec![0u8; w * 3]; let mut scal = std::vec![0u8; w * 3]; - unsafe { super::gray16_to_rgb_row(&plane, &mut simd, w, false) }; - scalar::gray16_to_rgb_row(&plane, &mut scal, w, false); + unsafe { super::gray16_to_rgb_row::(&plane, &mut simd, w, false) }; + scalar::gray16_to_rgb_row::(&plane, &mut scal, w, false); assert_eq!(simd, scal, "width={w} limited-range"); } } + + // ---- BE parity tests -------------------------------------------------------- + + #[test] + fn avx2_gray10_be_parity_luma() { + if !is_x86_feature_detected!("avx2") { + return; + } + for &w in WIDTHS { + let mut le = std::vec![0u16; w]; + prng16(&mut le, 0xBE10_0001); + let be: std::vec::Vec = le.iter().map(|v| v.swap_bytes()).collect(); + let mut simd_be = std::vec![0u8; w]; + let mut scal_le = std::vec![0u8; w]; + unsafe { super::gray_n_to_luma_row::<10, true>(&be, &mut simd_be, w) }; + scalar::gray_n_to_luma_row::<10, false>(&le, &mut scal_le, w); + assert_eq!(simd_be, scal_le, "width={w}"); + } + } + + #[test] + fn avx2_gray16_be_parity_luma() { + if !is_x86_feature_detected!("avx2") { + return; + } + for &w in WIDTHS { + let mut le = std::vec![0u16; w]; + prng16(&mut le, 0xBE16_0002); + let be: std::vec::Vec = le.iter().map(|v| v.swap_bytes()).collect(); + let mut simd_be = std::vec![0u8; w]; + let mut scal_le = std::vec![0u8; w]; + unsafe { super::gray16_to_luma_row::(&be, &mut simd_be, w) }; + scalar::gray16_to_luma_row::(&le, &mut scal_le, w); + assert_eq!(simd_be, scal_le, "width={w}"); + } + } } diff --git a/src/row/arch/x86_avx512/gray.rs b/src/row/arch/x86_avx512/gray.rs index 3b43606f..30b730db 100644 --- a/src/row/arch/x86_avx512/gray.rs +++ b/src/row/arch/x86_avx512/gray.rs @@ -16,7 +16,10 @@ use core::arch::x86_64::*; -use crate::row::scalar::{bits_mask, gray as scalar}; +use crate::row::{ + arch::x86_avx512::endian::{load_endian_u16x32, load_endian_u32x16}, + scalar::{bits_mask, gray as scalar}, +}; // ---- Gray8 ------------------------------------------------------------------ @@ -116,7 +119,7 @@ pub(crate) unsafe fn gray8_to_hsv_row( #[allow(dead_code)] #[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn gray_n_to_rgb_row( +pub(crate) unsafe fn gray_n_to_rgb_row( y_plane: &[u16], out: &mut [u8], width: usize, @@ -124,7 +127,7 @@ pub(crate) unsafe fn gray_n_to_rgb_row( ) { debug_assert!(y_plane.len() >= width); debug_assert!(out.len() >= width * 3); - scalar::gray_n_to_rgb_row::(y_plane, out, width, full_range); + scalar::gray_n_to_rgb_row::(y_plane, out, width, full_range); } /// AVX-512 `gray_n_to_rgba_row`. @@ -136,7 +139,7 @@ pub(crate) unsafe fn gray_n_to_rgb_row( #[allow(dead_code)] #[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn gray_n_to_rgba_row( +pub(crate) unsafe fn gray_n_to_rgba_row( y_plane: &[u16], out: &mut [u8], width: usize, @@ -144,7 +147,7 @@ pub(crate) unsafe fn gray_n_to_rgba_row( ) { debug_assert!(y_plane.len() >= width); debug_assert!(out.len() >= width * 4); - scalar::gray_n_to_rgba_row::(y_plane, out, width, full_range); + scalar::gray_n_to_rgba_row::(y_plane, out, width, full_range); } /// AVX-512 `gray_n_to_rgb_u16_row`. 
@@ -156,7 +159,7 @@ pub(crate) unsafe fn gray_n_to_rgba_row( #[allow(dead_code)] #[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn gray_n_to_rgb_u16_row( +pub(crate) unsafe fn gray_n_to_rgb_u16_row( y_plane: &[u16], out: &mut [u16], width: usize, @@ -164,7 +167,7 @@ pub(crate) unsafe fn gray_n_to_rgb_u16_row( ) { debug_assert!(y_plane.len() >= width); debug_assert!(out.len() >= width * 3); - scalar::gray_n_to_rgb_u16_row::(y_plane, out, width, full_range); + scalar::gray_n_to_rgb_u16_row::(y_plane, out, width, full_range); } /// AVX-512 `gray_n_to_rgba_u16_row`. @@ -176,7 +179,7 @@ pub(crate) unsafe fn gray_n_to_rgb_u16_row( #[allow(dead_code)] #[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn gray_n_to_rgba_u16_row( +pub(crate) unsafe fn gray_n_to_rgba_u16_row( y_plane: &[u16], out: &mut [u16], width: usize, @@ -184,7 +187,7 @@ pub(crate) unsafe fn gray_n_to_rgba_u16_row( ) { debug_assert!(y_plane.len() >= width); debug_assert!(out.len() >= width * 4); - scalar::gray_n_to_rgba_u16_row::(y_plane, out, width, full_range); + scalar::gray_n_to_rgba_u16_row::(y_plane, out, width, full_range); } /// AVX-512 `gray_n_to_luma_row`: mask + shift → u8. 32 pixels/iter. @@ -196,7 +199,7 @@ pub(crate) unsafe fn gray_n_to_rgba_u16_row( #[allow(dead_code)] #[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn gray_n_to_luma_row( +pub(crate) unsafe fn gray_n_to_luma_row( y_plane: &[u16], out: &mut [u8], width: usize, @@ -211,7 +214,7 @@ pub(crate) unsafe fn gray_n_to_luma_row( // requires a literal const generic shift not expressible as `BITS - 8`. let shr = _mm_cvtsi32_si128((BITS - 8) as i32); while x + 32 <= width { - let raw = _mm512_loadu_si512(y_plane.as_ptr().add(x).cast()); + let raw = load_endian_u16x32::(y_plane.as_ptr().cast::().add(x * 2)); let masked = _mm512_and_si512(raw, mask_v); // Shift right by (BITS - 8) to get u8-range value in u16 let shifted = _mm512_srl_epi16(masked, shr); @@ -222,7 +225,7 @@ pub(crate) unsafe fn gray_n_to_luma_row( } } if x < width { - scalar::gray_n_to_luma_row::(&y_plane[x..width], &mut out[x..width], width - x); + scalar::gray_n_to_luma_row::(&y_plane[x..width], &mut out[x..width], width - x); } } @@ -235,7 +238,7 @@ pub(crate) unsafe fn gray_n_to_luma_row( #[allow(dead_code)] #[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn gray_n_to_luma_u16_row( +pub(crate) unsafe fn gray_n_to_luma_u16_row( y_plane: &[u16], out: &mut [u16], width: usize, @@ -247,14 +250,14 @@ pub(crate) unsafe fn gray_n_to_luma_u16_row( unsafe { let mask_v = _mm512_set1_epi16(mask as i16); while x + 32 <= width { - let raw = _mm512_loadu_si512(y_plane.as_ptr().add(x).cast()); + let raw = load_endian_u16x32::(y_plane.as_ptr().cast::().add(x * 2)); let masked = _mm512_and_si512(raw, mask_v); _mm512_storeu_si512(out.as_mut_ptr().add(x).cast(), masked); x += 32; } } if x < width { - scalar::gray_n_to_luma_u16_row::(&y_plane[x..width], &mut out[x..width], width - x); + scalar::gray_n_to_luma_u16_row::(&y_plane[x..width], &mut out[x..width], width - x); } } @@ -268,7 +271,7 @@ pub(crate) unsafe fn gray_n_to_luma_u16_row( #[allow(dead_code)] #[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn gray_n_to_hsv_row( +pub(crate) unsafe fn gray_n_to_hsv_row( y_plane: &[u16], h_out: &mut [u8], s_out: &mut [u8], @@ -278,7 +281,7 @@ pub(crate) unsafe fn gray_n_to_hsv_row( ) { debug_assert!(y_plane.len() >= width); if !full_range { - return 
scalar::gray_n_to_hsv_row::(y_plane, h_out, s_out, v_out, width, full_range); + return scalar::gray_n_to_hsv_row::(y_plane, h_out, s_out, v_out, width, full_range); } let mask = bits_mask::(); let mut x = 0usize; @@ -287,7 +290,7 @@ pub(crate) unsafe fn gray_n_to_hsv_row( let shr = _mm_cvtsi32_si128((BITS - 8) as i32); let zero256 = _mm256_setzero_si256(); while x + 32 <= width { - let raw = _mm512_loadu_si512(y_plane.as_ptr().add(x).cast()); + let raw = load_endian_u16x32::(y_plane.as_ptr().cast::().add(x * 2)); let masked = _mm512_and_si512(raw, mask_v); let shifted = _mm512_srl_epi16(masked, shr); let packed = _mm512_cvtepi16_epi8(shifted); @@ -299,7 +302,7 @@ pub(crate) unsafe fn gray_n_to_hsv_row( } } if x < width { - scalar::gray_n_to_hsv_row::( + scalar::gray_n_to_hsv_row::( &y_plane[x..width], &mut h_out[x..width], &mut s_out[x..width], @@ -321,7 +324,7 @@ pub(crate) unsafe fn gray_n_to_hsv_row( #[allow(dead_code)] #[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn gray16_to_rgb_row( +pub(crate) unsafe fn gray16_to_rgb_row( y_plane: &[u16], out: &mut [u8], width: usize, @@ -329,7 +332,7 @@ pub(crate) unsafe fn gray16_to_rgb_row( ) { debug_assert!(y_plane.len() >= width); debug_assert!(out.len() >= width * 3); - scalar::gray16_to_rgb_row(y_plane, out, width, full_range); + scalar::gray16_to_rgb_row::(y_plane, out, width, full_range); } /// AVX-512 `gray16_to_rgba_row`. @@ -341,7 +344,7 @@ pub(crate) unsafe fn gray16_to_rgb_row( #[allow(dead_code)] #[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn gray16_to_rgba_row( +pub(crate) unsafe fn gray16_to_rgba_row( y_plane: &[u16], out: &mut [u8], width: usize, @@ -349,7 +352,7 @@ pub(crate) unsafe fn gray16_to_rgba_row( ) { debug_assert!(y_plane.len() >= width); debug_assert!(out.len() >= width * 4); - scalar::gray16_to_rgba_row(y_plane, out, width, full_range); + scalar::gray16_to_rgba_row::(y_plane, out, width, full_range); } /// AVX-512 `gray16_to_rgb_u16_row`. @@ -361,7 +364,7 @@ pub(crate) unsafe fn gray16_to_rgba_row( #[allow(dead_code)] #[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn gray16_to_rgb_u16_row( +pub(crate) unsafe fn gray16_to_rgb_u16_row( y_plane: &[u16], out: &mut [u16], width: usize, @@ -369,7 +372,7 @@ pub(crate) unsafe fn gray16_to_rgb_u16_row( ) { debug_assert!(y_plane.len() >= width); debug_assert!(out.len() >= width * 3); - scalar::gray16_to_rgb_u16_row(y_plane, out, width, full_range); + scalar::gray16_to_rgb_u16_row::(y_plane, out, width, full_range); } /// AVX-512 `gray16_to_rgba_u16_row`. @@ -381,7 +384,7 @@ pub(crate) unsafe fn gray16_to_rgb_u16_row( #[allow(dead_code)] #[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn gray16_to_rgba_u16_row( +pub(crate) unsafe fn gray16_to_rgba_u16_row( y_plane: &[u16], out: &mut [u16], width: usize, @@ -389,7 +392,7 @@ pub(crate) unsafe fn gray16_to_rgba_u16_row( ) { debug_assert!(y_plane.len() >= width); debug_assert!(out.len() >= width * 4); - scalar::gray16_to_rgba_u16_row(y_plane, out, width, full_range); + scalar::gray16_to_rgba_u16_row::(y_plane, out, width, full_range); } /// AVX-512 `gray16_to_luma_row`: `>> 8`, pack to u8. 32 pixels/iter. 
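For the full-range `gray_n` kernels (9–15 significant bits stored in u16), the rule both the SIMD and scalar paths follow is the one spelled out in the comments above: mask to the low `BITS` bits, then shift right by `BITS - 8` to land in u8 range. A scalar restatement for reference, with the mask written out explicitly (the crate's `bits_mask::<BITS>()` is the real source of that constant):

```rust
/// Illustrative scalar form of the full-range gray_n → u8 conversion mirrored by
/// the kernels above. Valid for 9 <= BITS <= 15, where the mask fits in a u16.
fn gray_n_to_u8<const BITS: usize>(v: u16) -> u8 {
    let mask = (1u16 << BITS) - 1;    // same value as bits_mask::<BITS>()
    ((v & mask) >> (BITS - 8)) as u8  // keep the top 8 of the BITS significant bits
}

// e.g. a full-scale 10-bit sample maps to 255: gray_n_to_u8::<10>(0x03FF) == 0xFF
```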
@@ -401,13 +404,17 @@ pub(crate) unsafe fn gray16_to_rgba_u16_row( #[allow(dead_code)] #[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn gray16_to_luma_row(y_plane: &[u16], out: &mut [u8], width: usize) { +pub(crate) unsafe fn gray16_to_luma_row( + y_plane: &[u16], + out: &mut [u8], + width: usize, +) { debug_assert!(y_plane.len() >= width); debug_assert!(out.len() >= width); let mut x = 0usize; unsafe { while x + 32 <= width { - let raw = _mm512_loadu_si512(y_plane.as_ptr().add(x).cast()); + let raw = load_endian_u16x32::(y_plane.as_ptr().cast::().add(x * 2)); let shifted = _mm512_srli_epi16(raw, 8); let packed = _mm512_cvtepi16_epi8(shifted); _mm256_storeu_si256(out.as_mut_ptr().add(x).cast(), packed); @@ -415,7 +422,7 @@ pub(crate) unsafe fn gray16_to_luma_row(y_plane: &[u16], out: &mut [u8], width: } } if x < width { - scalar::gray16_to_luma_row(&y_plane[x..width], &mut out[x..width], width - x); + scalar::gray16_to_luma_row::(&y_plane[x..width], &mut out[x..width], width - x); } } @@ -428,19 +435,23 @@ pub(crate) unsafe fn gray16_to_luma_row(y_plane: &[u16], out: &mut [u8], width: #[allow(dead_code)] #[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn gray16_to_luma_u16_row(y_plane: &[u16], out: &mut [u16], width: usize) { +pub(crate) unsafe fn gray16_to_luma_u16_row( + y_plane: &[u16], + out: &mut [u16], + width: usize, +) { debug_assert!(y_plane.len() >= width); debug_assert!(out.len() >= width); let mut x = 0usize; unsafe { while x + 32 <= width { - let y = _mm512_loadu_si512(y_plane.as_ptr().add(x).cast()); + let y = load_endian_u16x32::(y_plane.as_ptr().cast::().add(x * 2)); _mm512_storeu_si512(out.as_mut_ptr().add(x).cast(), y); x += 32; } } if x < width { - scalar::gray16_to_luma_u16_row(&y_plane[x..width], &mut out[x..width], width - x); + scalar::gray16_to_luma_u16_row::(&y_plane[x..width], &mut out[x..width], width - x); } } @@ -454,7 +465,7 @@ pub(crate) unsafe fn gray16_to_luma_u16_row(y_plane: &[u16], out: &mut [u16], wi #[allow(dead_code)] #[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn gray16_to_hsv_row( +pub(crate) unsafe fn gray16_to_hsv_row( y_plane: &[u16], h_out: &mut [u8], s_out: &mut [u8], @@ -464,13 +475,13 @@ pub(crate) unsafe fn gray16_to_hsv_row( ) { debug_assert!(y_plane.len() >= width); if !full_range { - return scalar::gray16_to_hsv_row(y_plane, h_out, s_out, v_out, width, full_range); + return scalar::gray16_to_hsv_row::(y_plane, h_out, s_out, v_out, width, full_range); } let mut x = 0usize; unsafe { let zero256 = _mm256_setzero_si256(); while x + 32 <= width { - let raw = _mm512_loadu_si512(y_plane.as_ptr().add(x).cast()); + let raw = load_endian_u16x32::(y_plane.as_ptr().cast::().add(x * 2)); let shifted = _mm512_srli_epi16(raw, 8); let packed = _mm512_cvtepi16_epi8(shifted); _mm256_storeu_si256(h_out.as_mut_ptr().add(x).cast(), zero256); @@ -480,7 +491,7 @@ pub(crate) unsafe fn gray16_to_hsv_row( } } if x < width { - scalar::gray16_to_hsv_row( + scalar::gray16_to_hsv_row::( &y_plane[x..width], &mut h_out[x..width], &mut s_out[x..width], @@ -502,7 +513,11 @@ pub(crate) unsafe fn gray16_to_hsv_row( /// AVX-512F must be available. 
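The f32 kernels that follow no longer load with `_mm512_loadu_ps`; they do an endian-aware integer load via `load_endian_u32x16` and bitcast the result with `_mm512_castsi512_ps`. Once the bytes of each 32-bit lane are in native order the bitcast is exact, so only the load has to care about endianness. A sketch of what such a helper could look like, under the same assumptions as the 16-bit sketch earlier (illustrative, not the `endian` module's actual code):

```rust
use core::arch::x86_64::*;

/// Illustrative only: load 16 u32 lanes, byte-swapping each lane when `BE` is true.
/// bswap32 == swap the bytes inside each 16-bit half, then swap the two halves.
#[target_feature(enable = "avx512f,avx512bw")]
unsafe fn load_endian_u32x16_sketch<const BE: bool>(ptr: *const u8) -> __m512i {
    let raw = _mm512_loadu_si512(ptr.cast());
    if !BE {
        return raw;
    }
    let c8 = _mm_cvtsi32_si128(8);
    let c16 = _mm_cvtsi32_si128(16);
    // Swap bytes within each 16-bit half, then swap the halves of every 32-bit lane.
    let halves = _mm512_or_si512(_mm512_sll_epi16(raw, c8), _mm512_srl_epi16(raw, c8));
    _mm512_or_si512(_mm512_sll_epi32(halves, c16), _mm512_srl_epi32(halves, c16))
}
```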
#[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn grayf32_to_rgb_row(y_plane: &[f32], out: &mut [u8], width: usize) { +pub(crate) unsafe fn grayf32_to_rgb_row( + y_plane: &[f32], + out: &mut [u8], + width: usize, +) { use crate::row::scalar::grayf32 as scalar; debug_assert!(y_plane.len() >= width); debug_assert!(out.len() >= width * 3); @@ -510,7 +525,9 @@ pub(crate) unsafe fn grayf32_to_rgb_row(y_plane: &[f32], out: &mut [u8], width: let mut x = 0usize; unsafe { while x + 16 <= width { - let y = _mm512_loadu_ps(y_plane.as_ptr().add(x)); + let y = _mm512_castsi512_ps(load_endian_u32x16::( + y_plane.as_ptr().cast::().add(x * 4), + )); let clamped = _mm512_min_ps(_mm512_max_ps(y, _mm512_setzero_ps()), _mm512_set1_ps(1.0)); // Round-half-up: + 0.5 then truncate (matches scalar). let int32 = _mm512_cvttps_epi32(_mm512_add_ps( @@ -532,7 +549,7 @@ pub(crate) unsafe fn grayf32_to_rgb_row(y_plane: &[f32], out: &mut [u8], width: } } if x < width { - scalar::grayf32_to_rgb_row(&y_plane[x..width], &mut out[x * 3..width * 3], width - x); + scalar::grayf32_to_rgb_row::(&y_plane[x..width], &mut out[x * 3..width * 3], width - x); } } @@ -542,7 +559,11 @@ pub(crate) unsafe fn grayf32_to_rgb_row(y_plane: &[f32], out: &mut [u8], width: /// AVX-512F must be available. #[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn grayf32_to_rgba_row(y_plane: &[f32], out: &mut [u8], width: usize) { +pub(crate) unsafe fn grayf32_to_rgba_row( + y_plane: &[f32], + out: &mut [u8], + width: usize, +) { use crate::row::scalar::grayf32 as scalar; debug_assert!(y_plane.len() >= width); debug_assert!(out.len() >= width * 4); @@ -550,7 +571,9 @@ pub(crate) unsafe fn grayf32_to_rgba_row(y_plane: &[f32], out: &mut [u8], width: let mut x = 0usize; unsafe { while x + 16 <= width { - let y = _mm512_loadu_ps(y_plane.as_ptr().add(x)); + let y = _mm512_castsi512_ps(load_endian_u32x16::( + y_plane.as_ptr().cast::().add(x * 4), + )); let clamped = _mm512_min_ps(_mm512_max_ps(y, _mm512_setzero_ps()), _mm512_set1_ps(1.0)); let int32 = _mm512_cvttps_epi32(_mm512_add_ps( _mm512_mul_ps(clamped, scale), @@ -570,7 +593,7 @@ pub(crate) unsafe fn grayf32_to_rgba_row(y_plane: &[f32], out: &mut [u8], width: } } if x < width { - scalar::grayf32_to_rgba_row(&y_plane[x..width], &mut out[x * 4..width * 4], width - x); + scalar::grayf32_to_rgba_row::(&y_plane[x..width], &mut out[x * 4..width * 4], width - x); } } @@ -580,7 +603,11 @@ pub(crate) unsafe fn grayf32_to_rgba_row(y_plane: &[f32], out: &mut [u8], width: /// AVX-512F must be available. #[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn grayf32_to_rgb_u16_row(y_plane: &[f32], out: &mut [u16], width: usize) { +pub(crate) unsafe fn grayf32_to_rgb_u16_row( + y_plane: &[f32], + out: &mut [u16], + width: usize, +) { use crate::row::scalar::grayf32 as scalar; debug_assert!(y_plane.len() >= width); debug_assert!(out.len() >= width * 3); @@ -588,7 +615,9 @@ pub(crate) unsafe fn grayf32_to_rgb_u16_row(y_plane: &[f32], out: &mut [u16], wi let mut x = 0usize; unsafe { while x + 16 <= width { - let y = _mm512_loadu_ps(y_plane.as_ptr().add(x)); + let y = _mm512_castsi512_ps(load_endian_u32x16::( + y_plane.as_ptr().cast::().add(x * 4), + )); let clamped = _mm512_min_ps(_mm512_max_ps(y, _mm512_setzero_ps()), _mm512_set1_ps(1.0)); // Round-to-nearest with embedded rounding. 
let int32 = _mm512_cvttps_epi32(_mm512_add_ps( @@ -610,7 +639,7 @@ pub(crate) unsafe fn grayf32_to_rgb_u16_row(y_plane: &[f32], out: &mut [u16], wi } } if x < width { - scalar::grayf32_to_rgb_u16_row(&y_plane[x..width], &mut out[x * 3..width * 3], width - x); + scalar::grayf32_to_rgb_u16_row::(&y_plane[x..width], &mut out[x * 3..width * 3], width - x); } } @@ -620,7 +649,11 @@ pub(crate) unsafe fn grayf32_to_rgb_u16_row(y_plane: &[f32], out: &mut [u16], wi /// AVX-512F must be available. #[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn grayf32_to_rgba_u16_row(y_plane: &[f32], out: &mut [u16], width: usize) { +pub(crate) unsafe fn grayf32_to_rgba_u16_row( + y_plane: &[f32], + out: &mut [u16], + width: usize, +) { use crate::row::scalar::grayf32 as scalar; debug_assert!(y_plane.len() >= width); debug_assert!(out.len() >= width * 4); @@ -628,7 +661,9 @@ pub(crate) unsafe fn grayf32_to_rgba_u16_row(y_plane: &[f32], out: &mut [u16], w let mut x = 0usize; unsafe { while x + 16 <= width { - let y = _mm512_loadu_ps(y_plane.as_ptr().add(x)); + let y = _mm512_castsi512_ps(load_endian_u32x16::( + y_plane.as_ptr().cast::().add(x * 4), + )); let clamped = _mm512_min_ps(_mm512_max_ps(y, _mm512_setzero_ps()), _mm512_set1_ps(1.0)); let int32 = _mm512_cvttps_epi32(_mm512_add_ps( _mm512_mul_ps(clamped, scale), @@ -648,7 +683,11 @@ pub(crate) unsafe fn grayf32_to_rgba_u16_row(y_plane: &[f32], out: &mut [u16], w } } if x < width { - scalar::grayf32_to_rgba_u16_row(&y_plane[x..width], &mut out[x * 4..width * 4], width - x); + scalar::grayf32_to_rgba_u16_row::( + &y_plane[x..width], + &mut out[x * 4..width * 4], + width - x, + ); } } @@ -659,11 +698,15 @@ pub(crate) unsafe fn grayf32_to_rgba_u16_row(y_plane: &[f32], out: &mut [u16], w #[allow(dead_code)] // dispatcher uses scalar directly for lossless f32 paths #[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn grayf32_to_rgb_f32_row(y_plane: &[f32], out: &mut [f32], width: usize) { +pub(crate) unsafe fn grayf32_to_rgb_f32_row( + y_plane: &[f32], + out: &mut [f32], + width: usize, +) { use crate::row::scalar::grayf32 as scalar; debug_assert!(y_plane.len() >= width); debug_assert!(out.len() >= width * 3); - scalar::grayf32_to_rgb_f32_row(y_plane, out, width); + scalar::grayf32_to_rgb_f32_row::(y_plane, out, width); } /// AVX-512 `grayf32_to_luma_row`: clamp [0,1] × 255 → u8. 16 px/iter. @@ -672,7 +715,11 @@ pub(crate) unsafe fn grayf32_to_rgb_f32_row(y_plane: &[f32], out: &mut [f32], wi /// AVX-512F must be available. 
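All of the lossy f32 kernels in this file quantize the same way, as the "Round-half-up: + 0.5 then truncate (matches scalar)" comment above states: clamp to [0, 1], scale by the target maximum, add 0.5, truncate. A scalar restatement of that rule (the function name is illustrative):

```rust
/// Illustrative scalar form of the quantization shared by the lossy f32 kernels.
fn quantize_unit_f32(y: f32, max: f32) -> u32 {
    let clamped = y.clamp(0.0, 1.0);
    (clamped * max + 0.5) as u32 // `as` truncates toward zero, giving round-half-up
}

// e.g. quantize_unit_f32(0.5, 255.0) == 128, quantize_unit_f32(1.0, 65535.0) == 65535
```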
#[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn grayf32_to_luma_row(y_plane: &[f32], out: &mut [u8], width: usize) { +pub(crate) unsafe fn grayf32_to_luma_row( + y_plane: &[f32], + out: &mut [u8], + width: usize, +) { use crate::row::scalar::grayf32 as scalar; debug_assert!(y_plane.len() >= width); debug_assert!(out.len() >= width); @@ -680,7 +727,9 @@ pub(crate) unsafe fn grayf32_to_luma_row(y_plane: &[f32], out: &mut [u8], width: let mut x = 0usize; unsafe { while x + 16 <= width { - let y = _mm512_loadu_ps(y_plane.as_ptr().add(x)); + let y = _mm512_castsi512_ps(load_endian_u32x16::( + y_plane.as_ptr().cast::().add(x * 4), + )); let clamped = _mm512_min_ps(_mm512_max_ps(y, _mm512_setzero_ps()), _mm512_set1_ps(1.0)); let int32 = _mm512_cvttps_epi32(_mm512_add_ps( _mm512_mul_ps(clamped, scale), @@ -692,7 +741,7 @@ pub(crate) unsafe fn grayf32_to_luma_row(y_plane: &[f32], out: &mut [u8], width: } } if x < width { - scalar::grayf32_to_luma_row(&y_plane[x..width], &mut out[x..width], width - x); + scalar::grayf32_to_luma_row::(&y_plane[x..width], &mut out[x..width], width - x); } } @@ -702,7 +751,11 @@ pub(crate) unsafe fn grayf32_to_luma_row(y_plane: &[f32], out: &mut [u8], width: /// AVX-512F must be available. #[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn grayf32_to_luma_u16_row(y_plane: &[f32], out: &mut [u16], width: usize) { +pub(crate) unsafe fn grayf32_to_luma_u16_row( + y_plane: &[f32], + out: &mut [u16], + width: usize, +) { use crate::row::scalar::grayf32 as scalar; debug_assert!(y_plane.len() >= width); debug_assert!(out.len() >= width); @@ -710,7 +763,9 @@ pub(crate) unsafe fn grayf32_to_luma_u16_row(y_plane: &[f32], out: &mut [u16], w let mut x = 0usize; unsafe { while x + 16 <= width { - let y = _mm512_loadu_ps(y_plane.as_ptr().add(x)); + let y = _mm512_castsi512_ps(load_endian_u32x16::( + y_plane.as_ptr().cast::().add(x * 4), + )); let clamped = _mm512_min_ps(_mm512_max_ps(y, _mm512_setzero_ps()), _mm512_set1_ps(1.0)); let int32 = _mm512_cvttps_epi32(_mm512_add_ps( _mm512_mul_ps(clamped, scale), @@ -722,7 +777,7 @@ pub(crate) unsafe fn grayf32_to_luma_u16_row(y_plane: &[f32], out: &mut [u16], w } } if x < width { - scalar::grayf32_to_luma_u16_row(&y_plane[x..width], &mut out[x..width], width - x); + scalar::grayf32_to_luma_u16_row::(&y_plane[x..width], &mut out[x..width], width - x); } } @@ -733,11 +788,15 @@ pub(crate) unsafe fn grayf32_to_luma_u16_row(y_plane: &[f32], out: &mut [u16], w #[allow(dead_code)] // dispatcher uses scalar directly for lossless f32 paths #[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn grayf32_to_luma_f32_row(y_plane: &[f32], out: &mut [f32], width: usize) { +pub(crate) unsafe fn grayf32_to_luma_f32_row( + y_plane: &[f32], + out: &mut [f32], + width: usize, +) { use crate::row::scalar::grayf32 as scalar; debug_assert!(y_plane.len() >= width); debug_assert!(out.len() >= width); - scalar::grayf32_to_luma_f32_row(y_plane, out, width); + scalar::grayf32_to_luma_f32_row::(y_plane, out, width); } /// AVX-512 `grayf32_to_hsv_row`: H=0, S=0, V = clamp(Y,0,1)×255. 16 px/iter. @@ -746,7 +805,7 @@ pub(crate) unsafe fn grayf32_to_luma_f32_row(y_plane: &[f32], out: &mut [f32], w /// AVX-512F must be available. 
#[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn grayf32_to_hsv_row( +pub(crate) unsafe fn grayf32_to_hsv_row( y_plane: &[f32], h_out: &mut [u8], s_out: &mut [u8], @@ -759,7 +818,9 @@ pub(crate) unsafe fn grayf32_to_hsv_row( let mut x = 0usize; unsafe { while x + 16 <= width { - let y = _mm512_loadu_ps(y_plane.as_ptr().add(x)); + let y = _mm512_castsi512_ps(load_endian_u32x16::( + y_plane.as_ptr().cast::().add(x * 4), + )); let clamped = _mm512_min_ps(_mm512_max_ps(y, _mm512_setzero_ps()), _mm512_set1_ps(1.0)); let int32 = _mm512_cvttps_epi32(_mm512_add_ps( _mm512_mul_ps(clamped, scale), @@ -774,7 +835,7 @@ pub(crate) unsafe fn grayf32_to_hsv_row( } } if x < width { - scalar::grayf32_to_hsv_row( + scalar::grayf32_to_hsv_row::( &y_plane[x..width], &mut h_out[x..width], &mut s_out[x..width], @@ -986,7 +1047,7 @@ pub(crate) unsafe fn ya8_to_hsv_row( /// AVX-512F+BW must be available. #[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn ya16_to_rgb_row(packed: &[u16], out: &mut [u8], width: usize) { +pub(crate) unsafe fn ya16_to_rgb_row(packed: &[u16], out: &mut [u8], width: usize) { use crate::row::scalar::ya16 as scalar; debug_assert!(packed.len() >= width * 2); debug_assert!(out.len() >= width * 3); @@ -1012,7 +1073,7 @@ pub(crate) unsafe fn ya16_to_rgb_row(packed: &[u16], out: &mut [u8], width: usiz } } if x < width { - scalar::ya16_to_rgb_row( + scalar::ya16_to_rgb_row::( &packed[x * 2..width * 2], &mut out[x * 3..width * 3], width - x, @@ -1026,7 +1087,11 @@ pub(crate) unsafe fn ya16_to_rgb_row(packed: &[u16], out: &mut [u8], width: usiz /// AVX-512F+BW must be available. #[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn ya16_to_rgba_row(packed: &[u16], out: &mut [u8], width: usize) { +pub(crate) unsafe fn ya16_to_rgba_row( + packed: &[u16], + out: &mut [u8], + width: usize, +) { use crate::row::scalar::ya16 as scalar; debug_assert!(packed.len() >= width * 2); debug_assert!(out.len() >= width * 4); @@ -1062,7 +1127,7 @@ pub(crate) unsafe fn ya16_to_rgba_row(packed: &[u16], out: &mut [u8], width: usi } } if x < width { - scalar::ya16_to_rgba_row( + scalar::ya16_to_rgba_row::( &packed[x * 2..width * 2], &mut out[x * 4..width * 4], width - x, @@ -1076,11 +1141,15 @@ pub(crate) unsafe fn ya16_to_rgba_row(packed: &[u16], out: &mut [u8], width: usi /// AVX-512F+BW must be available. #[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn ya16_to_rgb_u16_row(packed: &[u16], out: &mut [u16], width: usize) { +pub(crate) unsafe fn ya16_to_rgb_u16_row( + packed: &[u16], + out: &mut [u16], + width: usize, +) { use crate::row::scalar::ya16 as scalar; debug_assert!(packed.len() >= width * 2); debug_assert!(out.len() >= width * 3); - scalar::ya16_to_rgb_u16_row(packed, out, width); + scalar::ya16_to_rgb_u16_row::(packed, out, width); } /// AVX-512 `ya16_to_rgba_u16_row`: native Y and A u16. @@ -1089,11 +1158,15 @@ pub(crate) unsafe fn ya16_to_rgb_u16_row(packed: &[u16], out: &mut [u16], width: /// AVX-512F+BW must be available. 
#[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn ya16_to_rgba_u16_row(packed: &[u16], out: &mut [u16], width: usize) { +pub(crate) unsafe fn ya16_to_rgba_u16_row( + packed: &[u16], + out: &mut [u16], + width: usize, +) { use crate::row::scalar::ya16 as scalar; debug_assert!(packed.len() >= width * 2); debug_assert!(out.len() >= width * 4); - scalar::ya16_to_rgba_u16_row(packed, out, width); + scalar::ya16_to_rgba_u16_row::(packed, out, width); } /// AVX-512 `ya16_to_luma_row`: Y `>> 8` → u8. 4 px/iter. @@ -1102,7 +1175,11 @@ pub(crate) unsafe fn ya16_to_rgba_u16_row(packed: &[u16], out: &mut [u16], width /// AVX-512F+BW must be available. #[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn ya16_to_luma_row(packed: &[u16], out: &mut [u8], width: usize) { +pub(crate) unsafe fn ya16_to_luma_row( + packed: &[u16], + out: &mut [u8], + width: usize, +) { use crate::row::scalar::ya16 as scalar; debug_assert!(packed.len() >= width * 2); debug_assert!(out.len() >= width); @@ -1122,7 +1199,7 @@ pub(crate) unsafe fn ya16_to_luma_row(packed: &[u16], out: &mut [u8], width: usi } } if x < width { - scalar::ya16_to_luma_row(&packed[x * 2..width * 2], &mut out[x..width], width - x); + scalar::ya16_to_luma_row::(&packed[x * 2..width * 2], &mut out[x..width], width - x); } } @@ -1132,11 +1209,15 @@ pub(crate) unsafe fn ya16_to_luma_row(packed: &[u16], out: &mut [u8], width: usi /// AVX-512F+BW must be available. #[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn ya16_to_luma_u16_row(packed: &[u16], out: &mut [u16], width: usize) { +pub(crate) unsafe fn ya16_to_luma_u16_row( + packed: &[u16], + out: &mut [u16], + width: usize, +) { use crate::row::scalar::ya16 as scalar; debug_assert!(packed.len() >= width * 2); debug_assert!(out.len() >= width); - scalar::ya16_to_luma_u16_row(packed, out, width); + scalar::ya16_to_luma_u16_row::(packed, out, width); } /// AVX-512 `ya16_to_hsv_row`: H=0, S=0, V = Y `>> 8`. α dropped. @@ -1145,7 +1226,7 @@ pub(crate) unsafe fn ya16_to_luma_u16_row(packed: &[u16], out: &mut [u16], width /// AVX-512F+BW must be available. 
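For the YA16 kernels, `packed` is interleaved `[Y, A]` u16 pairs (hence the `width * 2` asserts); the documented rule for 8-bit outputs is Y `>> 8`, with alpha dropped on the luma and HSV paths. A per-pixel scalar restatement; treating alpha with the same high-byte reduction is an assumption here, only the Y handling is stated above:

```rust
/// Illustrative only: one YA16 pixel (interleaved Y then A) reduced to 8-bit values.
fn ya16_pixel_to_8bit(pair: [u16; 2]) -> (u8, u8) {
    let y = (pair[0] >> 8) as u8; // luma / HSV "V" channel
    let a = (pair[1] >> 8) as u8; // alpha: dropped by luma/HSV (8-bit RGBA assumed analogous)
    (y, a)
}
```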
#[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn ya16_to_hsv_row( +pub(crate) unsafe fn ya16_to_hsv_row( packed: &[u16], h_out: &mut [u8], s_out: &mut [u8], @@ -1173,7 +1254,7 @@ pub(crate) unsafe fn ya16_to_hsv_row( } } if x < width { - scalar::ya16_to_hsv_row( + scalar::ya16_to_hsv_row::( &packed[x * 2..width * 2], &mut h_out[x..width], &mut s_out[x..width], @@ -1226,8 +1307,8 @@ mod tests { prng_f32(&mut plane, 0xF512_0001); let mut simd = std::vec![0u8; w * 3]; let mut scal = std::vec![0u8; w * 3]; - unsafe { super::grayf32_to_rgb_row(&plane, &mut simd, w) }; - sf::grayf32_to_rgb_row(&plane, &mut scal, w); + unsafe { super::grayf32_to_rgb_row::(&plane, &mut simd, w) }; + sf::grayf32_to_rgb_row::(&plane, &mut scal, w); assert_eq!(simd, scal, "width={w}"); } } @@ -1244,8 +1325,8 @@ mod tests { prng_f32(&mut plane, 0xF512_0002); let mut simd = std::vec![0u8; w * 4]; let mut scal = std::vec![0u8; w * 4]; - unsafe { super::grayf32_to_rgba_row(&plane, &mut simd, w) }; - sf::grayf32_to_rgba_row(&plane, &mut scal, w); + unsafe { super::grayf32_to_rgba_row::(&plane, &mut simd, w) }; + sf::grayf32_to_rgba_row::(&plane, &mut scal, w); assert_eq!(simd, scal, "width={w}"); } } @@ -1262,8 +1343,8 @@ mod tests { prng_f32(&mut plane, 0xF512_0003); let mut simd = std::vec![0u16; w * 3]; let mut scal = std::vec![0u16; w * 3]; - unsafe { super::grayf32_to_rgb_u16_row(&plane, &mut simd, w) }; - sf::grayf32_to_rgb_u16_row(&plane, &mut scal, w); + unsafe { super::grayf32_to_rgb_u16_row::(&plane, &mut simd, w) }; + sf::grayf32_to_rgb_u16_row::(&plane, &mut scal, w); assert_eq!(simd, scal, "width={w}"); } } @@ -1280,8 +1361,8 @@ mod tests { prng_f32(&mut plane, 0xF512_0004); let mut simd = std::vec![0u16; w * 4]; let mut scal = std::vec![0u16; w * 4]; - unsafe { super::grayf32_to_rgba_u16_row(&plane, &mut simd, w) }; - sf::grayf32_to_rgba_u16_row(&plane, &mut scal, w); + unsafe { super::grayf32_to_rgba_u16_row::(&plane, &mut simd, w) }; + sf::grayf32_to_rgba_u16_row::(&plane, &mut scal, w); assert_eq!(simd, scal, "width={w}"); } } @@ -1298,8 +1379,8 @@ mod tests { prng_f32(&mut plane, 0xF512_0005); let mut simd = std::vec![0.0f32; w * 3]; let mut scal = std::vec![0.0f32; w * 3]; - unsafe { super::grayf32_to_rgb_f32_row(&plane, &mut simd, w) }; - sf::grayf32_to_rgb_f32_row(&plane, &mut scal, w); + unsafe { super::grayf32_to_rgb_f32_row::(&plane, &mut simd, w) }; + sf::grayf32_to_rgb_f32_row::(&plane, &mut scal, w); assert_eq!(simd, scal, "width={w}"); } } @@ -1316,8 +1397,8 @@ mod tests { prng_f32(&mut plane, 0xF512_0006); let mut simd = std::vec![0u8; w]; let mut scal = std::vec![0u8; w]; - unsafe { super::grayf32_to_luma_row(&plane, &mut simd, w) }; - sf::grayf32_to_luma_row(&plane, &mut scal, w); + unsafe { super::grayf32_to_luma_row::(&plane, &mut simd, w) }; + sf::grayf32_to_luma_row::(&plane, &mut scal, w); assert_eq!(simd, scal, "width={w}"); } } @@ -1334,8 +1415,8 @@ mod tests { prng_f32(&mut plane, 0xF512_0007); let mut simd = std::vec![0u16; w]; let mut scal = std::vec![0u16; w]; - unsafe { super::grayf32_to_luma_u16_row(&plane, &mut simd, w) }; - sf::grayf32_to_luma_u16_row(&plane, &mut scal, w); + unsafe { super::grayf32_to_luma_u16_row::(&plane, &mut simd, w) }; + sf::grayf32_to_luma_u16_row::(&plane, &mut scal, w); assert_eq!(simd, scal, "width={w}"); } } @@ -1352,8 +1433,8 @@ mod tests { prng_f32(&mut plane, 0xF512_0008); let mut simd = std::vec![0.0f32; w]; let mut scal = std::vec![0.0f32; w]; - unsafe { super::grayf32_to_luma_f32_row(&plane, &mut simd, w) }; - 
sf::grayf32_to_luma_f32_row(&plane, &mut scal, w); + unsafe { super::grayf32_to_luma_f32_row::(&plane, &mut simd, w) }; + sf::grayf32_to_luma_f32_row::(&plane, &mut scal, w); assert_eq!(simd, scal, "width={w}"); } } @@ -1374,8 +1455,8 @@ mod tests { let mut rh = std::vec![0u8; w]; let mut rs = std::vec![0u8; w]; let mut rv = std::vec![0u8; w]; - unsafe { super::grayf32_to_hsv_row(&plane, &mut sh, &mut ss, &mut sv, w) }; - sf::grayf32_to_hsv_row(&plane, &mut rh, &mut rs, &mut rv, w); + unsafe { super::grayf32_to_hsv_row::(&plane, &mut sh, &mut ss, &mut sv, w) }; + sf::grayf32_to_hsv_row::(&plane, &mut rh, &mut rs, &mut rv, w); assert_eq!(sh, rh, "H width={w}"); assert_eq!(ss, rs, "S width={w}"); assert_eq!(sv, rv, "V width={w}"); @@ -1546,8 +1627,8 @@ mod tests { prng_ya16(&mut packed, 0xA562_0001); let mut simd = std::vec![0u8; w * 3]; let mut scal = std::vec![0u8; w * 3]; - unsafe { super::ya16_to_rgb_row(&packed, &mut simd, w) }; - sy::ya16_to_rgb_row(&packed, &mut scal, w); + unsafe { super::ya16_to_rgb_row::(&packed, &mut simd, w) }; + sy::ya16_to_rgb_row::(&packed, &mut scal, w); assert_eq!(simd, scal, "width={w}"); } } @@ -1564,8 +1645,8 @@ mod tests { prng_ya16(&mut packed, 0xA562_0002); let mut simd = std::vec![0u8; w * 4]; let mut scal = std::vec![0u8; w * 4]; - unsafe { super::ya16_to_rgba_row(&packed, &mut simd, w) }; - sy::ya16_to_rgba_row(&packed, &mut scal, w); + unsafe { super::ya16_to_rgba_row::(&packed, &mut simd, w) }; + sy::ya16_to_rgba_row::(&packed, &mut scal, w); assert_eq!(simd, scal, "width={w}"); } } @@ -1582,8 +1663,8 @@ mod tests { prng_ya16(&mut packed, 0xA562_0003); let mut simd = std::vec![0u16; w * 3]; let mut scal = std::vec![0u16; w * 3]; - unsafe { super::ya16_to_rgb_u16_row(&packed, &mut simd, w) }; - sy::ya16_to_rgb_u16_row(&packed, &mut scal, w); + unsafe { super::ya16_to_rgb_u16_row::(&packed, &mut simd, w) }; + sy::ya16_to_rgb_u16_row::(&packed, &mut scal, w); assert_eq!(simd, scal, "width={w}"); } } @@ -1600,8 +1681,8 @@ mod tests { prng_ya16(&mut packed, 0xA562_0004); let mut simd = std::vec![0u16; w * 4]; let mut scal = std::vec![0u16; w * 4]; - unsafe { super::ya16_to_rgba_u16_row(&packed, &mut simd, w) }; - sy::ya16_to_rgba_u16_row(&packed, &mut scal, w); + unsafe { super::ya16_to_rgba_u16_row::(&packed, &mut simd, w) }; + sy::ya16_to_rgba_u16_row::(&packed, &mut scal, w); assert_eq!(simd, scal, "width={w}"); } } @@ -1618,8 +1699,8 @@ mod tests { prng_ya16(&mut packed, 0xA562_0005); let mut simd = std::vec![0u8; w]; let mut scal = std::vec![0u8; w]; - unsafe { super::ya16_to_luma_row(&packed, &mut simd, w) }; - sy::ya16_to_luma_row(&packed, &mut scal, w); + unsafe { super::ya16_to_luma_row::(&packed, &mut simd, w) }; + sy::ya16_to_luma_row::(&packed, &mut scal, w); assert_eq!(simd, scal, "width={w}"); } } @@ -1636,8 +1717,8 @@ mod tests { prng_ya16(&mut packed, 0xA562_0006); let mut simd = std::vec![0u16; w]; let mut scal = std::vec![0u16; w]; - unsafe { super::ya16_to_luma_u16_row(&packed, &mut simd, w) }; - sy::ya16_to_luma_u16_row(&packed, &mut scal, w); + unsafe { super::ya16_to_luma_u16_row::(&packed, &mut simd, w) }; + sy::ya16_to_luma_u16_row::(&packed, &mut scal, w); assert_eq!(simd, scal, "width={w}"); } } @@ -1658,8 +1739,8 @@ mod tests { let mut rh = std::vec![0u8; w]; let mut rs = std::vec![0u8; w]; let mut rv = std::vec![0u8; w]; - unsafe { super::ya16_to_hsv_row(&packed, &mut sh, &mut ss, &mut sv, w) }; - sy::ya16_to_hsv_row(&packed, &mut rh, &mut rs, &mut rv, w); + unsafe { super::ya16_to_hsv_row::(&packed, &mut sh, &mut ss, &mut 
sv, w) }; + sy::ya16_to_hsv_row::(&packed, &mut rh, &mut rs, &mut rv, w); assert_eq!(sh, rh, "H width={w}"); assert_eq!(ss, rs, "S width={w}"); assert_eq!(sv, rv, "V width={w}"); @@ -1698,8 +1779,8 @@ mod tests { prng16(&mut plane, 0x1234_ABCD); let mut simd = std::vec![0u16; w]; let mut scal = std::vec![0u16; w]; - unsafe { super::gray_n_to_luma_u16_row::<10>(&plane, &mut simd, w) }; - scalar::gray_n_to_luma_u16_row::<10>(&plane, &mut scal, w); + unsafe { super::gray_n_to_luma_u16_row::<10, false>(&plane, &mut simd, w) }; + scalar::gray_n_to_luma_u16_row::<10, false>(&plane, &mut scal, w); assert_eq!(simd, scal, "width={w}"); } } @@ -1714,8 +1795,8 @@ mod tests { prng16(&mut plane, 0xCAFE_BABE); let mut simd = std::vec![0u8; w]; let mut scal = std::vec![0u8; w]; - unsafe { super::gray16_to_luma_row(&plane, &mut simd, w) }; - scalar::gray16_to_luma_row(&plane, &mut scal, w); + unsafe { super::gray16_to_luma_row::(&plane, &mut simd, w) }; + scalar::gray16_to_luma_row::(&plane, &mut scal, w); assert_eq!(simd, scal, "width={w}"); } } @@ -1730,8 +1811,8 @@ mod tests { prng16(&mut plane, 0xDEAD_BEEF); let mut simd = std::vec![0u16; w]; let mut scal = std::vec![0u16; w]; - unsafe { super::gray16_to_luma_u16_row(&plane, &mut simd, w) }; - scalar::gray16_to_luma_u16_row(&plane, &mut scal, w); + unsafe { super::gray16_to_luma_u16_row::(&plane, &mut simd, w) }; + scalar::gray16_to_luma_u16_row::(&plane, &mut scal, w); assert_eq!(simd, scal, "width={w}"); } } @@ -1762,9 +1843,45 @@ mod tests { prng16(&mut plane, 0x1234_5678); let mut simd = std::vec![0u8; w * 3]; let mut scal = std::vec![0u8; w * 3]; - unsafe { super::gray16_to_rgb_row(&plane, &mut simd, w, false) }; - scalar::gray16_to_rgb_row(&plane, &mut scal, w, false); + unsafe { super::gray16_to_rgb_row::(&plane, &mut simd, w, false) }; + scalar::gray16_to_rgb_row::(&plane, &mut scal, w, false); assert_eq!(simd, scal, "width={w} limited-range"); } } + + // ---- BE parity tests -------------------------------------------------------- + + #[test] + fn avx512_gray10_be_parity_luma() { + if !is_x86_feature_detected!("avx512bw") { + return; + } + for &w in WIDTHS { + let mut le = std::vec![0u16; w]; + prng16(&mut le, 0xBE10_0001); + let be: std::vec::Vec = le.iter().map(|v| v.swap_bytes()).collect(); + let mut simd_be = std::vec![0u8; w]; + let mut scal_le = std::vec![0u8; w]; + unsafe { super::gray_n_to_luma_row::<10, true>(&be, &mut simd_be, w) }; + scalar::gray_n_to_luma_row::<10, false>(&le, &mut scal_le, w); + assert_eq!(simd_be, scal_le, "width={w}"); + } + } + + #[test] + fn avx512_gray16_be_parity_luma() { + if !is_x86_feature_detected!("avx512bw") { + return; + } + for &w in WIDTHS { + let mut le = std::vec![0u16; w]; + prng16(&mut le, 0xBE16_0002); + let be: std::vec::Vec = le.iter().map(|v| v.swap_bytes()).collect(); + let mut simd_be = std::vec![0u8; w]; + let mut scal_le = std::vec![0u8; w]; + unsafe { super::gray16_to_luma_row::(&be, &mut simd_be, w) }; + scalar::gray16_to_luma_row::(&le, &mut scal_le, w); + assert_eq!(simd_be, scal_le, "width={w}"); + } + } } diff --git a/src/row/arch/x86_sse41/gray.rs b/src/row/arch/x86_sse41/gray.rs index 4f101bfe..52f77b5c 100644 --- a/src/row/arch/x86_sse41/gray.rs +++ b/src/row/arch/x86_sse41/gray.rs @@ -17,7 +17,10 @@ use core::arch::x86_64::*; -use crate::row::scalar::{bits_mask, gray as scalar}; +use crate::row::{ + arch::x86_sse41::endian::{load_endian_u16x8, load_endian_u32x4}, + scalar::{bits_mask, gray as scalar}, +}; // ---- Gray8 
------------------------------------------------------------------ @@ -125,7 +128,7 @@ pub(crate) unsafe fn gray8_to_hsv_row( #[allow(dead_code)] #[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn gray_n_to_rgb_row( +pub(crate) unsafe fn gray_n_to_rgb_row( y_plane: &[u16], out: &mut [u8], width: usize, @@ -133,7 +136,7 @@ pub(crate) unsafe fn gray_n_to_rgb_row( ) { debug_assert!(y_plane.len() >= width); debug_assert!(out.len() >= width * 3); - scalar::gray_n_to_rgb_row::(y_plane, out, width, full_range); + scalar::gray_n_to_rgb_row::(y_plane, out, width, full_range); } /// SSE4.1 `gray_n_to_rgba_row`. @@ -145,7 +148,7 @@ pub(crate) unsafe fn gray_n_to_rgb_row( #[allow(dead_code)] #[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn gray_n_to_rgba_row( +pub(crate) unsafe fn gray_n_to_rgba_row( y_plane: &[u16], out: &mut [u8], width: usize, @@ -155,7 +158,7 @@ pub(crate) unsafe fn gray_n_to_rgba_row( debug_assert!(out.len() >= width * 4); // SSE4.1 4-channel interleave without SSSE3 shuffle tables is complex; // delegate to scalar (which auto-vectorizes well at -O3). - scalar::gray_n_to_rgba_row::(y_plane, out, width, full_range); + scalar::gray_n_to_rgba_row::(y_plane, out, width, full_range); } /// SSE4.1 `gray_n_to_rgb_u16_row`. @@ -167,7 +170,7 @@ pub(crate) unsafe fn gray_n_to_rgba_row( #[allow(dead_code)] #[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn gray_n_to_rgb_u16_row( +pub(crate) unsafe fn gray_n_to_rgb_u16_row( y_plane: &[u16], out: &mut [u16], width: usize, @@ -175,7 +178,7 @@ pub(crate) unsafe fn gray_n_to_rgb_u16_row( ) { debug_assert!(y_plane.len() >= width); debug_assert!(out.len() >= width * 3); - scalar::gray_n_to_rgb_u16_row::(y_plane, out, width, full_range); + scalar::gray_n_to_rgb_u16_row::(y_plane, out, width, full_range); } /// SSE4.1 `gray_n_to_rgba_u16_row`. @@ -187,7 +190,7 @@ pub(crate) unsafe fn gray_n_to_rgb_u16_row( #[allow(dead_code)] #[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn gray_n_to_rgba_u16_row( +pub(crate) unsafe fn gray_n_to_rgba_u16_row( y_plane: &[u16], out: &mut [u16], width: usize, @@ -195,7 +198,7 @@ pub(crate) unsafe fn gray_n_to_rgba_u16_row( ) { debug_assert!(y_plane.len() >= width); debug_assert!(out.len() >= width * 4); - scalar::gray_n_to_rgba_u16_row::(y_plane, out, width, full_range); + scalar::gray_n_to_rgba_u16_row::(y_plane, out, width, full_range); } /// SSE4.1 `gray_n_to_luma_row`: mask, shift, pack, store. @@ -207,7 +210,7 @@ pub(crate) unsafe fn gray_n_to_rgba_u16_row( #[allow(dead_code)] #[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn gray_n_to_luma_row( +pub(crate) unsafe fn gray_n_to_luma_row( y_plane: &[u16], out: &mut [u8], width: usize, @@ -223,7 +226,7 @@ pub(crate) unsafe fn gray_n_to_luma_row( // variant `_mm_srl_epi16` with a count vector built from the shift amount. 
let shr = _mm_cvtsi32_si128((BITS - 8) as i32); while x + 8 <= width { - let raw = _mm_loadu_si128(y_plane.as_ptr().add(x).cast()); + let raw = load_endian_u16x8::(y_plane.as_ptr().cast::().add(x * 2)); let masked = _mm_and_si128(raw, mask_v); let shifted = _mm_srl_epi16(masked, shr); let zero = _mm_setzero_si128(); @@ -236,7 +239,7 @@ pub(crate) unsafe fn gray_n_to_luma_row( } } if x < width { - scalar::gray_n_to_luma_row::(&y_plane[x..width], &mut out[x..width], width - x); + scalar::gray_n_to_luma_row::(&y_plane[x..width], &mut out[x..width], width - x); } } @@ -249,7 +252,7 @@ pub(crate) unsafe fn gray_n_to_luma_row( #[allow(dead_code)] #[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn gray_n_to_luma_u16_row( +pub(crate) unsafe fn gray_n_to_luma_u16_row( y_plane: &[u16], out: &mut [u16], width: usize, @@ -261,14 +264,14 @@ pub(crate) unsafe fn gray_n_to_luma_u16_row( unsafe { let mask_v = _mm_set1_epi16(mask as i16); while x + 8 <= width { - let raw = _mm_loadu_si128(y_plane.as_ptr().add(x).cast()); + let raw = load_endian_u16x8::(y_plane.as_ptr().cast::().add(x * 2)); let masked = _mm_and_si128(raw, mask_v); _mm_storeu_si128(out.as_mut_ptr().add(x).cast(), masked); x += 8; } } if x < width { - scalar::gray_n_to_luma_u16_row::(&y_plane[x..width], &mut out[x..width], width - x); + scalar::gray_n_to_luma_u16_row::(&y_plane[x..width], &mut out[x..width], width - x); } } @@ -282,7 +285,7 @@ pub(crate) unsafe fn gray_n_to_luma_u16_row( #[allow(dead_code)] #[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn gray_n_to_hsv_row( +pub(crate) unsafe fn gray_n_to_hsv_row( y_plane: &[u16], h_out: &mut [u8], s_out: &mut [u8], @@ -292,7 +295,7 @@ pub(crate) unsafe fn gray_n_to_hsv_row( ) { debug_assert!(y_plane.len() >= width); if !full_range { - return scalar::gray_n_to_hsv_row::(y_plane, h_out, s_out, v_out, width, full_range); + return scalar::gray_n_to_hsv_row::(y_plane, h_out, s_out, v_out, width, full_range); } let mask = bits_mask::(); let mut x = 0usize; @@ -301,7 +304,7 @@ pub(crate) unsafe fn gray_n_to_hsv_row( let shr = _mm_cvtsi32_si128((BITS - 8) as i32); let zero = _mm_setzero_si128(); while x + 8 <= width { - let raw = _mm_loadu_si128(y_plane.as_ptr().add(x).cast()); + let raw = load_endian_u16x8::(y_plane.as_ptr().cast::().add(x * 2)); let masked = _mm_and_si128(raw, mask_v); let shifted = _mm_srl_epi16(masked, shr); let packed = _mm_packus_epi16(shifted, zero); @@ -315,7 +318,7 @@ pub(crate) unsafe fn gray_n_to_hsv_row( } } if x < width { - scalar::gray_n_to_hsv_row::( + scalar::gray_n_to_hsv_row::( &y_plane[x..width], &mut h_out[x..width], &mut s_out[x..width], @@ -337,7 +340,7 @@ pub(crate) unsafe fn gray_n_to_hsv_row( #[allow(dead_code)] #[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn gray16_to_rgb_row( +pub(crate) unsafe fn gray16_to_rgb_row( y_plane: &[u16], out: &mut [u8], width: usize, @@ -345,7 +348,7 @@ pub(crate) unsafe fn gray16_to_rgb_row( ) { debug_assert!(y_plane.len() >= width); debug_assert!(out.len() >= width * 3); - scalar::gray16_to_rgb_row(y_plane, out, width, full_range); + scalar::gray16_to_rgb_row::(y_plane, out, width, full_range); } /// SSE4.1 `gray16_to_rgba_row`: `>> 8` → RGBA u8. 
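The SSE4.1 file follows the same pattern as the AVX-512 one: every u16 load above goes through `load_endian_u16x8` from the new `arch::x86_sse41::endian` module (not shown in this diff), and the f32 kernels further down pair `load_endian_u32x4` with `_mm_castsi128_ps`. A minimal sketch of the 8-lane helper, again assuming a shift-based byte swap (names and implementation are illustrative):

```rust
use core::arch::x86_64::*;

/// Illustrative only: load 8 u16 lanes and byte-swap each lane when `BE` is true.
#[target_feature(enable = "sse4.1")]
unsafe fn load_endian_u16x8_sketch<const BE: bool>(ptr: *const u8) -> __m128i {
    let raw = _mm_loadu_si128(ptr.cast());
    if !BE {
        return raw;
    }
    // (v << 8) | (v >> 8) per 16-bit lane, using SSE2 shifts only.
    let c8 = _mm_cvtsi32_si128(8);
    _mm_or_si128(_mm_sll_epi16(raw, c8), _mm_srl_epi16(raw, c8))
}
```

A 4-lane u32 variant for the f32 paths can add one more swap of the 16-bit halves within each 32-bit lane, as in the AVX-512 sketch earlier.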
@@ -357,7 +360,7 @@ pub(crate) unsafe fn gray16_to_rgb_row( #[allow(dead_code)] #[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn gray16_to_rgba_row( +pub(crate) unsafe fn gray16_to_rgba_row( y_plane: &[u16], out: &mut [u8], width: usize, @@ -365,7 +368,7 @@ pub(crate) unsafe fn gray16_to_rgba_row( ) { debug_assert!(y_plane.len() >= width); debug_assert!(out.len() >= width * 4); - scalar::gray16_to_rgba_row(y_plane, out, width, full_range); + scalar::gray16_to_rgba_row::(y_plane, out, width, full_range); } /// SSE4.1 `gray16_to_rgb_u16_row`. @@ -377,7 +380,7 @@ pub(crate) unsafe fn gray16_to_rgba_row( #[allow(dead_code)] #[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn gray16_to_rgb_u16_row( +pub(crate) unsafe fn gray16_to_rgb_u16_row( y_plane: &[u16], out: &mut [u16], width: usize, @@ -385,7 +388,7 @@ pub(crate) unsafe fn gray16_to_rgb_u16_row( ) { debug_assert!(y_plane.len() >= width); debug_assert!(out.len() >= width * 3); - scalar::gray16_to_rgb_u16_row(y_plane, out, width, full_range); + scalar::gray16_to_rgb_u16_row::(y_plane, out, width, full_range); } /// SSE4.1 `gray16_to_rgba_u16_row`. @@ -397,7 +400,7 @@ pub(crate) unsafe fn gray16_to_rgb_u16_row( #[allow(dead_code)] #[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn gray16_to_rgba_u16_row( +pub(crate) unsafe fn gray16_to_rgba_u16_row( y_plane: &[u16], out: &mut [u16], width: usize, @@ -405,7 +408,7 @@ pub(crate) unsafe fn gray16_to_rgba_u16_row( ) { debug_assert!(y_plane.len() >= width); debug_assert!(out.len() >= width * 4); - scalar::gray16_to_rgba_u16_row(y_plane, out, width, full_range); + scalar::gray16_to_rgba_u16_row::(y_plane, out, width, full_range); } /// SSE4.1 `gray16_to_luma_row`: `>> 8`, pack, store. @@ -417,14 +420,18 @@ pub(crate) unsafe fn gray16_to_rgba_u16_row( #[allow(dead_code)] #[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn gray16_to_luma_row(y_plane: &[u16], out: &mut [u8], width: usize) { +pub(crate) unsafe fn gray16_to_luma_row( + y_plane: &[u16], + out: &mut [u8], + width: usize, +) { debug_assert!(y_plane.len() >= width); debug_assert!(out.len() >= width); let mut x = 0usize; unsafe { let zero = _mm_setzero_si128(); while x + 8 <= width { - let raw = _mm_loadu_si128(y_plane.as_ptr().add(x).cast()); + let raw = load_endian_u16x8::(y_plane.as_ptr().cast::().add(x * 2)); let shifted = _mm_srli_epi16(raw, 8); let packed = _mm_packus_epi16(shifted, zero); let val = _mm_cvtsi128_si64(packed) as u64; @@ -433,7 +440,7 @@ pub(crate) unsafe fn gray16_to_luma_row(y_plane: &[u16], out: &mut [u8], width: } } if x < width { - scalar::gray16_to_luma_row(&y_plane[x..width], &mut out[x..width], width - x); + scalar::gray16_to_luma_row::(&y_plane[x..width], &mut out[x..width], width - x); } } @@ -446,19 +453,23 @@ pub(crate) unsafe fn gray16_to_luma_row(y_plane: &[u16], out: &mut [u8], width: #[allow(dead_code)] #[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn gray16_to_luma_u16_row(y_plane: &[u16], out: &mut [u16], width: usize) { +pub(crate) unsafe fn gray16_to_luma_u16_row( + y_plane: &[u16], + out: &mut [u16], + width: usize, +) { debug_assert!(y_plane.len() >= width); debug_assert!(out.len() >= width); let mut x = 0usize; unsafe { while x + 8 <= width { - let y = _mm_loadu_si128(y_plane.as_ptr().add(x).cast()); + let y = load_endian_u16x8::(y_plane.as_ptr().cast::().add(x * 2)); _mm_storeu_si128(out.as_mut_ptr().add(x).cast(), y); x += 8; } } if x < width { - scalar::gray16_to_luma_u16_row(&y_plane[x..width], &mut 
out[x..width], width - x); + scalar::gray16_to_luma_u16_row::(&y_plane[x..width], &mut out[x..width], width - x); } } @@ -472,7 +483,7 @@ pub(crate) unsafe fn gray16_to_luma_u16_row(y_plane: &[u16], out: &mut [u16], wi #[allow(dead_code)] #[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn gray16_to_hsv_row( +pub(crate) unsafe fn gray16_to_hsv_row( y_plane: &[u16], h_out: &mut [u8], s_out: &mut [u8], @@ -482,13 +493,13 @@ pub(crate) unsafe fn gray16_to_hsv_row( ) { debug_assert!(y_plane.len() >= width); if !full_range { - return scalar::gray16_to_hsv_row(y_plane, h_out, s_out, v_out, width, full_range); + return scalar::gray16_to_hsv_row::(y_plane, h_out, s_out, v_out, width, full_range); } let mut x = 0usize; unsafe { let zero16 = _mm_setzero_si128(); while x + 8 <= width { - let raw = _mm_loadu_si128(y_plane.as_ptr().add(x).cast()); + let raw = load_endian_u16x8::(y_plane.as_ptr().cast::().add(x * 2)); let shifted = _mm_srli_epi16(raw, 8); let packed = _mm_packus_epi16(shifted, zero16); let val = _mm_cvtsi128_si64(packed) as u64; @@ -499,7 +510,7 @@ pub(crate) unsafe fn gray16_to_hsv_row( } } if x < width { - scalar::gray16_to_hsv_row( + scalar::gray16_to_hsv_row::( &y_plane[x..width], &mut h_out[x..width], &mut s_out[x..width], @@ -522,7 +533,11 @@ pub(crate) unsafe fn gray16_to_hsv_row( /// SSE4.1 must be available. #[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn grayf32_to_rgb_row(y_plane: &[f32], out: &mut [u8], width: usize) { +pub(crate) unsafe fn grayf32_to_rgb_row( + y_plane: &[f32], + out: &mut [u8], + width: usize, +) { use crate::row::scalar::grayf32 as scalar; debug_assert!(y_plane.len() >= width); debug_assert!(out.len() >= width * 3); @@ -532,7 +547,9 @@ pub(crate) unsafe fn grayf32_to_rgb_row(y_plane: &[f32], out: &mut [u8], width: let mut x = 0usize; unsafe { while x + 4 <= width { - let y = _mm_loadu_ps(y_plane.as_ptr().add(x)); + let y = _mm_castsi128_ps(load_endian_u32x4::( + y_plane.as_ptr().cast::().add(x * 4), + )); let clamped = _mm_min_ps(_mm_max_ps(y, zero), one); let scaled = _mm_mul_ps(clamped, scale); let int32 = _mm_cvttps_epi32(_mm_add_ps(scaled, _mm_set1_ps(0.5))); @@ -560,7 +577,7 @@ pub(crate) unsafe fn grayf32_to_rgb_row(y_plane: &[f32], out: &mut [u8], width: } } if x < width { - scalar::grayf32_to_rgb_row(&y_plane[x..width], &mut out[x * 3..width * 3], width - x); + scalar::grayf32_to_rgb_row::(&y_plane[x..width], &mut out[x * 3..width * 3], width - x); } } @@ -570,7 +587,11 @@ pub(crate) unsafe fn grayf32_to_rgb_row(y_plane: &[f32], out: &mut [u8], width: /// SSE4.1 must be available. 
#[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn grayf32_to_rgba_row(y_plane: &[f32], out: &mut [u8], width: usize) { +pub(crate) unsafe fn grayf32_to_rgba_row( + y_plane: &[f32], + out: &mut [u8], + width: usize, +) { use crate::row::scalar::grayf32 as scalar; debug_assert!(y_plane.len() >= width); debug_assert!(out.len() >= width * 4); @@ -580,7 +601,9 @@ pub(crate) unsafe fn grayf32_to_rgba_row(y_plane: &[f32], out: &mut [u8], width: let mut x = 0usize; unsafe { while x + 4 <= width { - let y = _mm_loadu_ps(y_plane.as_ptr().add(x)); + let y = _mm_castsi128_ps(load_endian_u32x4::( + y_plane.as_ptr().cast::().add(x * 4), + )); let clamped = _mm_min_ps(_mm_max_ps(y, zero), one); let scaled = _mm_mul_ps(clamped, scale); let int32 = _mm_cvttps_epi32(_mm_add_ps(scaled, _mm_set1_ps(0.5))); @@ -611,7 +634,7 @@ pub(crate) unsafe fn grayf32_to_rgba_row(y_plane: &[f32], out: &mut [u8], width: } } if x < width { - scalar::grayf32_to_rgba_row(&y_plane[x..width], &mut out[x * 4..width * 4], width - x); + scalar::grayf32_to_rgba_row::(&y_plane[x..width], &mut out[x * 4..width * 4], width - x); } } @@ -621,7 +644,11 @@ pub(crate) unsafe fn grayf32_to_rgba_row(y_plane: &[f32], out: &mut [u8], width: /// SSE4.1 must be available. #[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn grayf32_to_rgb_u16_row(y_plane: &[f32], out: &mut [u16], width: usize) { +pub(crate) unsafe fn grayf32_to_rgb_u16_row( + y_plane: &[f32], + out: &mut [u16], + width: usize, +) { use crate::row::scalar::grayf32 as scalar; debug_assert!(y_plane.len() >= width); debug_assert!(out.len() >= width * 3); @@ -631,7 +658,9 @@ pub(crate) unsafe fn grayf32_to_rgb_u16_row(y_plane: &[f32], out: &mut [u16], wi let mut x = 0usize; unsafe { while x + 4 <= width { - let y = _mm_loadu_ps(y_plane.as_ptr().add(x)); + let y = _mm_castsi128_ps(load_endian_u32x4::( + y_plane.as_ptr().cast::().add(x * 4), + )); let clamped = _mm_min_ps(_mm_max_ps(y, zero), one); let scaled = _mm_mul_ps(clamped, scale); let int32 = _mm_cvttps_epi32(_mm_add_ps(scaled, _mm_set1_ps(0.5))); @@ -657,7 +686,7 @@ pub(crate) unsafe fn grayf32_to_rgb_u16_row(y_plane: &[f32], out: &mut [u16], wi } } if x < width { - scalar::grayf32_to_rgb_u16_row(&y_plane[x..width], &mut out[x * 3..width * 3], width - x); + scalar::grayf32_to_rgb_u16_row::(&y_plane[x..width], &mut out[x * 3..width * 3], width - x); } } @@ -667,7 +696,11 @@ pub(crate) unsafe fn grayf32_to_rgb_u16_row(y_plane: &[f32], out: &mut [u16], wi /// SSE4.1 must be available. 
#[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn grayf32_to_rgba_u16_row(y_plane: &[f32], out: &mut [u16], width: usize) { +pub(crate) unsafe fn grayf32_to_rgba_u16_row( + y_plane: &[f32], + out: &mut [u16], + width: usize, +) { use crate::row::scalar::grayf32 as scalar; debug_assert!(y_plane.len() >= width); debug_assert!(out.len() >= width * 4); @@ -677,7 +710,9 @@ pub(crate) unsafe fn grayf32_to_rgba_u16_row(y_plane: &[f32], out: &mut [u16], w let mut x = 0usize; unsafe { while x + 4 <= width { - let y = _mm_loadu_ps(y_plane.as_ptr().add(x)); + let y = _mm_castsi128_ps(load_endian_u32x4::( + y_plane.as_ptr().cast::().add(x * 4), + )); let clamped = _mm_min_ps(_mm_max_ps(y, zero), one); let scaled = _mm_mul_ps(clamped, scale); let int32 = _mm_cvttps_epi32(_mm_add_ps(scaled, _mm_set1_ps(0.5))); @@ -706,7 +741,11 @@ pub(crate) unsafe fn grayf32_to_rgba_u16_row(y_plane: &[f32], out: &mut [u16], w } } if x < width { - scalar::grayf32_to_rgba_u16_row(&y_plane[x..width], &mut out[x * 4..width * 4], width - x); + scalar::grayf32_to_rgba_u16_row::( + &y_plane[x..width], + &mut out[x * 4..width * 4], + width - x, + ); } } @@ -717,12 +756,16 @@ pub(crate) unsafe fn grayf32_to_rgba_u16_row(y_plane: &[f32], out: &mut [u16], w #[allow(dead_code)] // dispatcher uses scalar directly for lossless f32 paths #[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn grayf32_to_rgb_f32_row(y_plane: &[f32], out: &mut [f32], width: usize) { +pub(crate) unsafe fn grayf32_to_rgb_f32_row( + y_plane: &[f32], + out: &mut [f32], + width: usize, +) { use crate::row::scalar::grayf32 as scalar; debug_assert!(y_plane.len() >= width); debug_assert!(out.len() >= width * 3); // f32 triplet broadcast: scalar is already optimal here. - scalar::grayf32_to_rgb_f32_row(y_plane, out, width); + scalar::grayf32_to_rgb_f32_row::(y_plane, out, width); } /// SSE4.1 `grayf32_to_luma_row`: clamp [0,1] × 255 → u8. @@ -731,7 +774,11 @@ pub(crate) unsafe fn grayf32_to_rgb_f32_row(y_plane: &[f32], out: &mut [f32], wi /// SSE4.1 must be available. #[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn grayf32_to_luma_row(y_plane: &[f32], out: &mut [u8], width: usize) { +pub(crate) unsafe fn grayf32_to_luma_row( + y_plane: &[f32], + out: &mut [u8], + width: usize, +) { use crate::row::scalar::grayf32 as scalar; debug_assert!(y_plane.len() >= width); debug_assert!(out.len() >= width); @@ -741,7 +788,9 @@ pub(crate) unsafe fn grayf32_to_luma_row(y_plane: &[f32], out: &mut [u8], width: let mut x = 0usize; unsafe { while x + 4 <= width { - let y = _mm_loadu_ps(y_plane.as_ptr().add(x)); + let y = _mm_castsi128_ps(load_endian_u32x4::( + y_plane.as_ptr().cast::().add(x * 4), + )); let clamped = _mm_min_ps(_mm_max_ps(y, zero), one); let scaled = _mm_mul_ps(clamped, scale); let int32 = _mm_cvttps_epi32(_mm_add_ps(scaled, _mm_set1_ps(0.5))); @@ -754,7 +803,7 @@ pub(crate) unsafe fn grayf32_to_luma_row(y_plane: &[f32], out: &mut [u8], width: } } if x < width { - scalar::grayf32_to_luma_row(&y_plane[x..width], &mut out[x..width], width - x); + scalar::grayf32_to_luma_row::(&y_plane[x..width], &mut out[x..width], width - x); } } @@ -764,7 +813,11 @@ pub(crate) unsafe fn grayf32_to_luma_row(y_plane: &[f32], out: &mut [u8], width: /// SSE4.1 must be available. 
#[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn grayf32_to_luma_u16_row(y_plane: &[f32], out: &mut [u16], width: usize) { +pub(crate) unsafe fn grayf32_to_luma_u16_row( + y_plane: &[f32], + out: &mut [u16], + width: usize, +) { use crate::row::scalar::grayf32 as scalar; debug_assert!(y_plane.len() >= width); debug_assert!(out.len() >= width); @@ -774,7 +827,9 @@ pub(crate) unsafe fn grayf32_to_luma_u16_row(y_plane: &[f32], out: &mut [u16], w let mut x = 0usize; unsafe { while x + 4 <= width { - let y = _mm_loadu_ps(y_plane.as_ptr().add(x)); + let y = _mm_castsi128_ps(load_endian_u32x4::( + y_plane.as_ptr().cast::().add(x * 4), + )); let clamped = _mm_min_ps(_mm_max_ps(y, zero), one); let scaled = _mm_mul_ps(clamped, scale); let int32 = _mm_cvttps_epi32(_mm_add_ps(scaled, _mm_set1_ps(0.5))); @@ -787,7 +842,7 @@ pub(crate) unsafe fn grayf32_to_luma_u16_row(y_plane: &[f32], out: &mut [u16], w } } if x < width { - scalar::grayf32_to_luma_u16_row(&y_plane[x..width], &mut out[x..width], width - x); + scalar::grayf32_to_luma_u16_row::(&y_plane[x..width], &mut out[x..width], width - x); } } @@ -798,11 +853,15 @@ pub(crate) unsafe fn grayf32_to_luma_u16_row(y_plane: &[f32], out: &mut [u16], w #[allow(dead_code)] // dispatcher uses scalar directly for lossless f32 paths #[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn grayf32_to_luma_f32_row(y_plane: &[f32], out: &mut [f32], width: usize) { +pub(crate) unsafe fn grayf32_to_luma_f32_row( + y_plane: &[f32], + out: &mut [f32], + width: usize, +) { use crate::row::scalar::grayf32 as scalar; debug_assert!(y_plane.len() >= width); debug_assert!(out.len() >= width); - scalar::grayf32_to_luma_f32_row(y_plane, out, width); + scalar::grayf32_to_luma_f32_row::(y_plane, out, width); } /// SSE4.1 `grayf32_to_hsv_row`: H=0, S=0, V = clamp(Y,0,1)×255. @@ -811,7 +870,7 @@ pub(crate) unsafe fn grayf32_to_luma_f32_row(y_plane: &[f32], out: &mut [f32], w /// SSE4.1 must be available. #[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn grayf32_to_hsv_row( +pub(crate) unsafe fn grayf32_to_hsv_row( y_plane: &[f32], h_out: &mut [u8], s_out: &mut [u8], @@ -827,7 +886,9 @@ pub(crate) unsafe fn grayf32_to_hsv_row( let mut x = 0usize; unsafe { while x + 4 <= width { - let y = _mm_loadu_ps(y_plane.as_ptr().add(x)); + let y = _mm_castsi128_ps(load_endian_u32x4::( + y_plane.as_ptr().cast::().add(x * 4), + )); let clamped = _mm_min_ps(_mm_max_ps(y, zero), one); let scaled = _mm_mul_ps(clamped, scale); let int32 = _mm_cvttps_epi32(_mm_add_ps(scaled, _mm_set1_ps(0.5))); @@ -844,7 +905,7 @@ pub(crate) unsafe fn grayf32_to_hsv_row( } } if x < width { - scalar::grayf32_to_hsv_row( + scalar::grayf32_to_hsv_row::( &y_plane[x..width], &mut h_out[x..width], &mut s_out[x..width], @@ -1063,7 +1124,7 @@ pub(crate) unsafe fn ya8_to_hsv_row( /// SSE4.1 must be available. 
#[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn ya16_to_rgb_row(packed: &[u16], out: &mut [u8], width: usize) { +pub(crate) unsafe fn ya16_to_rgb_row(packed: &[u16], out: &mut [u8], width: usize) { use crate::row::scalar::ya16 as scalar; debug_assert!(packed.len() >= width * 2); debug_assert!(out.len() >= width * 3); @@ -1091,7 +1152,7 @@ pub(crate) unsafe fn ya16_to_rgb_row(packed: &[u16], out: &mut [u8], width: usiz } } if x < width { - scalar::ya16_to_rgb_row( + scalar::ya16_to_rgb_row::( &packed[x * 2..width * 2], &mut out[x * 3..width * 3], width - x, @@ -1105,7 +1166,11 @@ pub(crate) unsafe fn ya16_to_rgb_row(packed: &[u16], out: &mut [u8], width: usiz /// SSE4.1 must be available. #[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn ya16_to_rgba_row(packed: &[u16], out: &mut [u8], width: usize) { +pub(crate) unsafe fn ya16_to_rgba_row( + packed: &[u16], + out: &mut [u8], + width: usize, +) { use crate::row::scalar::ya16 as scalar; debug_assert!(packed.len() >= width * 2); debug_assert!(out.len() >= width * 4); @@ -1141,7 +1206,7 @@ pub(crate) unsafe fn ya16_to_rgba_row(packed: &[u16], out: &mut [u8], width: usi } } if x < width { - scalar::ya16_to_rgba_row( + scalar::ya16_to_rgba_row::( &packed[x * 2..width * 2], &mut out[x * 4..width * 4], width - x, @@ -1155,11 +1220,15 @@ pub(crate) unsafe fn ya16_to_rgba_row(packed: &[u16], out: &mut [u8], width: usi /// SSE4.1 must be available. #[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn ya16_to_rgb_u16_row(packed: &[u16], out: &mut [u16], width: usize) { +pub(crate) unsafe fn ya16_to_rgb_u16_row( + packed: &[u16], + out: &mut [u16], + width: usize, +) { use crate::row::scalar::ya16 as scalar; debug_assert!(packed.len() >= width * 2); debug_assert!(out.len() >= width * 3); - scalar::ya16_to_rgb_u16_row(packed, out, width); + scalar::ya16_to_rgb_u16_row::(packed, out, width); } /// SSE4.1 `ya16_to_rgba_u16_row`: native Y and A u16. @@ -1168,11 +1237,15 @@ pub(crate) unsafe fn ya16_to_rgb_u16_row(packed: &[u16], out: &mut [u16], width: /// SSE4.1 must be available. #[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn ya16_to_rgba_u16_row(packed: &[u16], out: &mut [u16], width: usize) { +pub(crate) unsafe fn ya16_to_rgba_u16_row( + packed: &[u16], + out: &mut [u16], + width: usize, +) { use crate::row::scalar::ya16 as scalar; debug_assert!(packed.len() >= width * 2); debug_assert!(out.len() >= width * 4); - scalar::ya16_to_rgba_u16_row(packed, out, width); + scalar::ya16_to_rgba_u16_row::(packed, out, width); } /// SSE4.1 `ya16_to_luma_row`: Y `>> 8` → u8. @@ -1181,7 +1254,11 @@ pub(crate) unsafe fn ya16_to_rgba_u16_row(packed: &[u16], out: &mut [u16], width /// SSE4.1 must be available. #[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn ya16_to_luma_row(packed: &[u16], out: &mut [u8], width: usize) { +pub(crate) unsafe fn ya16_to_luma_row( + packed: &[u16], + out: &mut [u8], + width: usize, +) { use crate::row::scalar::ya16 as scalar; debug_assert!(packed.len() >= width * 2); debug_assert!(out.len() >= width); @@ -1201,7 +1278,7 @@ pub(crate) unsafe fn ya16_to_luma_row(packed: &[u16], out: &mut [u8], width: usi } } if x < width { - scalar::ya16_to_luma_row(&packed[x * 2..width * 2], &mut out[x..width], width - x); + scalar::ya16_to_luma_row::(&packed[x * 2..width * 2], &mut out[x..width], width - x); } } @@ -1211,11 +1288,15 @@ pub(crate) unsafe fn ya16_to_luma_row(packed: &[u16], out: &mut [u8], width: usi /// SSE4.1 must be available. 
#[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn ya16_to_luma_u16_row(packed: &[u16], out: &mut [u16], width: usize) { +pub(crate) unsafe fn ya16_to_luma_u16_row( + packed: &[u16], + out: &mut [u16], + width: usize, +) { use crate::row::scalar::ya16 as scalar; debug_assert!(packed.len() >= width * 2); debug_assert!(out.len() >= width); - scalar::ya16_to_luma_u16_row(packed, out, width); + scalar::ya16_to_luma_u16_row::(packed, out, width); } /// SSE4.1 `ya16_to_hsv_row`: H=0, S=0, V = Y `>> 8`. α dropped. @@ -1224,7 +1305,7 @@ pub(crate) unsafe fn ya16_to_luma_u16_row(packed: &[u16], out: &mut [u16], width /// SSE4.1 must be available. #[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn ya16_to_hsv_row( +pub(crate) unsafe fn ya16_to_hsv_row( packed: &[u16], h_out: &mut [u8], s_out: &mut [u8], @@ -1252,7 +1333,7 @@ pub(crate) unsafe fn ya16_to_hsv_row( } } if x < width { - scalar::ya16_to_hsv_row( + scalar::ya16_to_hsv_row::( &packed[x * 2..width * 2], &mut h_out[x..width], &mut s_out[x..width], @@ -1331,8 +1412,8 @@ mod tests { prng16(&mut plane, 0xCAFE_BABE); let mut simd = std::vec![0u16; w]; let mut scal = std::vec![0u16; w]; - unsafe { super::gray_n_to_luma_u16_row::<10>(&plane, &mut simd, w) }; - scalar::gray_n_to_luma_u16_row::<10>(&plane, &mut scal, w); + unsafe { super::gray_n_to_luma_u16_row::<10, false>(&plane, &mut simd, w) }; + scalar::gray_n_to_luma_u16_row::<10, false>(&plane, &mut scal, w); assert_eq!(simd, scal, "width={w}"); } } @@ -1347,8 +1428,8 @@ mod tests { prng16(&mut plane, 0xDEAD_BEEF); let mut simd = std::vec![0u16; w]; let mut scal = std::vec![0u16; w]; - unsafe { super::gray16_to_luma_u16_row(&plane, &mut simd, w) }; - scalar::gray16_to_luma_u16_row(&plane, &mut scal, w); + unsafe { super::gray16_to_luma_u16_row::(&plane, &mut simd, w) }; + scalar::gray16_to_luma_u16_row::(&plane, &mut scal, w); assert_eq!(simd, scal, "width={w}"); } } @@ -1363,8 +1444,8 @@ mod tests { prng16(&mut plane, 0x1234_5678); let mut simd = std::vec![0u8; w]; let mut scal = std::vec![0u8; w]; - unsafe { super::gray16_to_luma_row(&plane, &mut simd, w) }; - scalar::gray16_to_luma_row(&plane, &mut scal, w); + unsafe { super::gray16_to_luma_row::(&plane, &mut simd, w) }; + scalar::gray16_to_luma_row::(&plane, &mut scal, w); assert_eq!(simd, scal, "width={w}"); } } @@ -1395,8 +1476,8 @@ mod tests { prng16(&mut plane, 0x1234_5678); let mut simd = std::vec![0u8; w * 3]; let mut scal = std::vec![0u8; w * 3]; - unsafe { super::gray16_to_rgb_row(&plane, &mut simd, w, false) }; - scalar::gray16_to_rgb_row(&plane, &mut scal, w, false); + unsafe { super::gray16_to_rgb_row::(&plane, &mut simd, w, false) }; + scalar::gray16_to_rgb_row::(&plane, &mut scal, w, false); assert_eq!(simd, scal, "width={w} limited-range"); } } @@ -1423,8 +1504,8 @@ mod tests { prng_f32(&mut plane, 0xF320_0001); let mut simd = std::vec![0u8; w * 3]; let mut scal = std::vec![0u8; w * 3]; - unsafe { super::grayf32_to_rgb_row(&plane, &mut simd, w) }; - sf::grayf32_to_rgb_row(&plane, &mut scal, w); + unsafe { super::grayf32_to_rgb_row::(&plane, &mut simd, w) }; + sf::grayf32_to_rgb_row::(&plane, &mut scal, w); assert_eq!(simd, scal, "width={w}"); } } @@ -1441,8 +1522,8 @@ mod tests { prng_f32(&mut plane, 0xF320_0002); let mut simd = std::vec![0u8; w * 4]; let mut scal = std::vec![0u8; w * 4]; - unsafe { super::grayf32_to_rgba_row(&plane, &mut simd, w) }; - sf::grayf32_to_rgba_row(&plane, &mut scal, w); + unsafe { super::grayf32_to_rgba_row::(&plane, &mut simd, w) }; + 
sf::grayf32_to_rgba_row::(&plane, &mut scal, w); assert_eq!(simd, scal, "width={w}"); } } @@ -1459,8 +1540,8 @@ mod tests { prng_f32(&mut plane, 0xF320_0003); let mut simd = std::vec![0u16; w * 3]; let mut scal = std::vec![0u16; w * 3]; - unsafe { super::grayf32_to_rgb_u16_row(&plane, &mut simd, w) }; - sf::grayf32_to_rgb_u16_row(&plane, &mut scal, w); + unsafe { super::grayf32_to_rgb_u16_row::(&plane, &mut simd, w) }; + sf::grayf32_to_rgb_u16_row::(&plane, &mut scal, w); assert_eq!(simd, scal, "width={w}"); } } @@ -1477,8 +1558,8 @@ mod tests { prng_f32(&mut plane, 0xF320_0004); let mut simd = std::vec![0u16; w * 4]; let mut scal = std::vec![0u16; w * 4]; - unsafe { super::grayf32_to_rgba_u16_row(&plane, &mut simd, w) }; - sf::grayf32_to_rgba_u16_row(&plane, &mut scal, w); + unsafe { super::grayf32_to_rgba_u16_row::(&plane, &mut simd, w) }; + sf::grayf32_to_rgba_u16_row::(&plane, &mut scal, w); assert_eq!(simd, scal, "width={w}"); } } @@ -1495,8 +1576,8 @@ mod tests { prng_f32(&mut plane, 0xF320_0005); let mut simd = std::vec![0.0f32; w * 3]; let mut scal = std::vec![0.0f32; w * 3]; - unsafe { super::grayf32_to_rgb_f32_row(&plane, &mut simd, w) }; - sf::grayf32_to_rgb_f32_row(&plane, &mut scal, w); + unsafe { super::grayf32_to_rgb_f32_row::(&plane, &mut simd, w) }; + sf::grayf32_to_rgb_f32_row::(&plane, &mut scal, w); assert_eq!(simd, scal, "width={w}"); } } @@ -1513,8 +1594,8 @@ mod tests { prng_f32(&mut plane, 0xF320_0006); let mut simd = std::vec![0u8; w]; let mut scal = std::vec![0u8; w]; - unsafe { super::grayf32_to_luma_row(&plane, &mut simd, w) }; - sf::grayf32_to_luma_row(&plane, &mut scal, w); + unsafe { super::grayf32_to_luma_row::(&plane, &mut simd, w) }; + sf::grayf32_to_luma_row::(&plane, &mut scal, w); assert_eq!(simd, scal, "width={w}"); } } @@ -1531,8 +1612,8 @@ mod tests { prng_f32(&mut plane, 0xF320_0007); let mut simd = std::vec![0u16; w]; let mut scal = std::vec![0u16; w]; - unsafe { super::grayf32_to_luma_u16_row(&plane, &mut simd, w) }; - sf::grayf32_to_luma_u16_row(&plane, &mut scal, w); + unsafe { super::grayf32_to_luma_u16_row::(&plane, &mut simd, w) }; + sf::grayf32_to_luma_u16_row::(&plane, &mut scal, w); assert_eq!(simd, scal, "width={w}"); } } @@ -1549,8 +1630,8 @@ mod tests { prng_f32(&mut plane, 0xF320_0008); let mut simd = std::vec![0.0f32; w]; let mut scal = std::vec![0.0f32; w]; - unsafe { super::grayf32_to_luma_f32_row(&plane, &mut simd, w) }; - sf::grayf32_to_luma_f32_row(&plane, &mut scal, w); + unsafe { super::grayf32_to_luma_f32_row::(&plane, &mut simd, w) }; + sf::grayf32_to_luma_f32_row::(&plane, &mut scal, w); assert_eq!(simd, scal, "width={w}"); } } @@ -1571,8 +1652,8 @@ mod tests { let mut rh = std::vec![0u8; w]; let mut rs = std::vec![0u8; w]; let mut rv = std::vec![0u8; w]; - unsafe { super::grayf32_to_hsv_row(&plane, &mut sh, &mut ss, &mut sv, w) }; - sf::grayf32_to_hsv_row(&plane, &mut rh, &mut rs, &mut rv, w); + unsafe { super::grayf32_to_hsv_row::(&plane, &mut sh, &mut ss, &mut sv, w) }; + sf::grayf32_to_hsv_row::(&plane, &mut rh, &mut rs, &mut rv, w); assert_eq!(sh, rh, "H width={w}"); assert_eq!(ss, rs, "S width={w}"); assert_eq!(sv, rv, "V width={w}"); @@ -1743,8 +1824,8 @@ mod tests { prng_ya16(&mut packed, 0xA160_0001); let mut simd = std::vec![0u8; w * 3]; let mut scal = std::vec![0u8; w * 3]; - unsafe { super::ya16_to_rgb_row(&packed, &mut simd, w) }; - sy::ya16_to_rgb_row(&packed, &mut scal, w); + unsafe { super::ya16_to_rgb_row::(&packed, &mut simd, w) }; + sy::ya16_to_rgb_row::(&packed, &mut scal, w); assert_eq!(simd, scal, 
"width={w}"); } } @@ -1761,8 +1842,8 @@ mod tests { prng_ya16(&mut packed, 0xA160_0002); let mut simd = std::vec![0u8; w * 4]; let mut scal = std::vec![0u8; w * 4]; - unsafe { super::ya16_to_rgba_row(&packed, &mut simd, w) }; - sy::ya16_to_rgba_row(&packed, &mut scal, w); + unsafe { super::ya16_to_rgba_row::(&packed, &mut simd, w) }; + sy::ya16_to_rgba_row::(&packed, &mut scal, w); assert_eq!(simd, scal, "width={w}"); } } @@ -1779,8 +1860,8 @@ mod tests { prng_ya16(&mut packed, 0xA160_0003); let mut simd = std::vec![0u16; w * 3]; let mut scal = std::vec![0u16; w * 3]; - unsafe { super::ya16_to_rgb_u16_row(&packed, &mut simd, w) }; - sy::ya16_to_rgb_u16_row(&packed, &mut scal, w); + unsafe { super::ya16_to_rgb_u16_row::(&packed, &mut simd, w) }; + sy::ya16_to_rgb_u16_row::(&packed, &mut scal, w); assert_eq!(simd, scal, "width={w}"); } } @@ -1797,8 +1878,8 @@ mod tests { prng_ya16(&mut packed, 0xA160_0004); let mut simd = std::vec![0u16; w * 4]; let mut scal = std::vec![0u16; w * 4]; - unsafe { super::ya16_to_rgba_u16_row(&packed, &mut simd, w) }; - sy::ya16_to_rgba_u16_row(&packed, &mut scal, w); + unsafe { super::ya16_to_rgba_u16_row::(&packed, &mut simd, w) }; + sy::ya16_to_rgba_u16_row::(&packed, &mut scal, w); assert_eq!(simd, scal, "width={w}"); } } @@ -1815,8 +1896,8 @@ mod tests { prng_ya16(&mut packed, 0xA160_0005); let mut simd = std::vec![0u8; w]; let mut scal = std::vec![0u8; w]; - unsafe { super::ya16_to_luma_row(&packed, &mut simd, w) }; - sy::ya16_to_luma_row(&packed, &mut scal, w); + unsafe { super::ya16_to_luma_row::(&packed, &mut simd, w) }; + sy::ya16_to_luma_row::(&packed, &mut scal, w); assert_eq!(simd, scal, "width={w}"); } } @@ -1833,8 +1914,8 @@ mod tests { prng_ya16(&mut packed, 0xA160_0006); let mut simd = std::vec![0u16; w]; let mut scal = std::vec![0u16; w]; - unsafe { super::ya16_to_luma_u16_row(&packed, &mut simd, w) }; - sy::ya16_to_luma_u16_row(&packed, &mut scal, w); + unsafe { super::ya16_to_luma_u16_row::(&packed, &mut simd, w) }; + sy::ya16_to_luma_u16_row::(&packed, &mut scal, w); assert_eq!(simd, scal, "width={w}"); } } @@ -1855,11 +1936,75 @@ mod tests { let mut rh = std::vec![0u8; w]; let mut rs = std::vec![0u8; w]; let mut rv = std::vec![0u8; w]; - unsafe { super::ya16_to_hsv_row(&packed, &mut sh, &mut ss, &mut sv, w) }; - sy::ya16_to_hsv_row(&packed, &mut rh, &mut rs, &mut rv, w); + unsafe { super::ya16_to_hsv_row::(&packed, &mut sh, &mut ss, &mut sv, w) }; + sy::ya16_to_hsv_row::(&packed, &mut rh, &mut rs, &mut rv, w); assert_eq!(sh, rh, "H width={w}"); assert_eq!(ss, rs, "S width={w}"); assert_eq!(sv, rv, "V width={w}"); } } + + // ---- BE parity tests -------------------------------------------------------- + + #[test] + fn sse41_gray10_be_parity_luma() { + if !is_x86_feature_detected!("sse4.1") { + return; + } + for &w in WIDTHS { + let mut le = std::vec![0u16; w]; + prng16(&mut le, 0xBE10_0001); + let be: std::vec::Vec = le.iter().map(|v| v.swap_bytes()).collect(); + let mut simd_be = std::vec![0u8; w]; + let mut scal_le = std::vec![0u8; w]; + unsafe { super::gray_n_to_luma_row::<10, true>(&be, &mut simd_be, w) }; + scalar::gray_n_to_luma_row::<10, false>(&le, &mut scal_le, w); + assert_eq!(simd_be, scal_le, "width={w}"); + } + } + + #[test] + fn sse41_gray16_be_parity_luma() { + if !is_x86_feature_detected!("sse4.1") { + return; + } + for &w in WIDTHS { + let mut le = std::vec![0u16; w]; + prng16(&mut le, 0xBE16_0002); + let be: std::vec::Vec = le.iter().map(|v| v.swap_bytes()).collect(); + let mut simd_be = std::vec![0u8; w]; + let mut scal_le 
= std::vec![0u8; w]; + unsafe { super::gray16_to_luma_row::(&be, &mut simd_be, w) }; + scalar::gray16_to_luma_row::(&le, &mut scal_le, w); + assert_eq!(simd_be, scal_le, "width={w}"); + } + } + + #[test] + fn sse41_grayf32_be_parity_luma() { + use crate::row::scalar::grayf32 as sf; + if !is_x86_feature_detected!("sse4.1") { + return; + } + fn prng_f32(out: &mut [f32], seed: u32) { + let mut s = seed; + for v in out.iter_mut() { + s = s.wrapping_mul(1664525).wrapping_add(1013904223); + *v = ((s >> 8) as f32) / (u32::MAX as f32) * 1.3 - 0.1; + } + } + for &w in WIDTHS { + let mut le = std::vec![0.0f32; w]; + prng_f32(&mut le, 0xBEF3_0003); + let be: std::vec::Vec = le + .iter() + .map(|v| f32::from_bits(v.to_bits().swap_bytes())) + .collect(); + let mut simd_be = std::vec![0u8; w]; + let mut scal_le = std::vec![0u8; w]; + unsafe { super::grayf32_to_luma_row::(&be, &mut simd_be, w) }; + sf::grayf32_to_luma_row::(&le, &mut scal_le, w); + assert_eq!(simd_be, scal_le, "width={w}"); + } + } } diff --git a/src/row/dispatch/gray.rs b/src/row/dispatch/gray.rs index a8215abb..b6eaf204 100644 --- a/src/row/dispatch/gray.rs +++ b/src/row/dispatch/gray.rs @@ -177,7 +177,7 @@ pub(crate) fn gray8_to_hsv_row( /// Dispatch `gray_n_to_rgb_row`. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn gray_n_to_rgb_row( +pub(crate) fn gray_n_to_rgb_row( y_plane: &[u16], out: &mut [u8], width: usize, @@ -188,43 +188,43 @@ pub(crate) fn gray_n_to_rgb_row( assert!(y_plane.len() >= width, "y_plane too short"); assert!(out.len() >= out_min, "out too short"); if !use_simd { - return scalar::gray_n_to_rgb_row::(y_plane, out, width, full_range); + return scalar::gray_n_to_rgb_row::(y_plane, out, width, full_range); } cfg_select! { target_arch = "aarch64" => { if neon_available() { - unsafe { arch::neon::gray_n_to_rgb_row::(y_plane, out, width, full_range); } + unsafe { arch::neon::gray_n_to_rgb_row::(y_plane, out, width, full_range); } return; } }, target_arch = "x86_64" => { if avx512_available() { - unsafe { arch::x86_avx512::gray_n_to_rgb_row::(y_plane, out, width, full_range); } + unsafe { arch::x86_avx512::gray_n_to_rgb_row::(y_plane, out, width, full_range); } return; } if avx2_available() { - unsafe { arch::x86_avx2::gray_n_to_rgb_row::(y_plane, out, width, full_range); } + unsafe { arch::x86_avx2::gray_n_to_rgb_row::(y_plane, out, width, full_range); } return; } if sse41_available() { - unsafe { arch::x86_sse41::gray_n_to_rgb_row::(y_plane, out, width, full_range); } + unsafe { arch::x86_sse41::gray_n_to_rgb_row::(y_plane, out, width, full_range); } return; } }, target_arch = "wasm32" => { if simd128_available() { - unsafe { arch::wasm_simd128::gray_n_to_rgb_row::(y_plane, out, width, full_range); } + unsafe { arch::wasm_simd128::gray_n_to_rgb_row::(y_plane, out, width, full_range); } return; } }, _ => {} } - scalar::gray_n_to_rgb_row::(y_plane, out, width, full_range); + scalar::gray_n_to_rgb_row::(y_plane, out, width, full_range); } /// Dispatch `gray_n_to_rgba_row`. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn gray_n_to_rgba_row( +pub(crate) fn gray_n_to_rgba_row( y_plane: &[u16], out: &mut [u8], width: usize, @@ -235,43 +235,43 @@ pub(crate) fn gray_n_to_rgba_row( assert!(y_plane.len() >= width, "y_plane too short"); assert!(out.len() >= out_min, "out too short"); if !use_simd { - return scalar::gray_n_to_rgba_row::(y_plane, out, width, full_range); + return scalar::gray_n_to_rgba_row::(y_plane, out, width, full_range); } cfg_select! 
{ target_arch = "aarch64" => { if neon_available() { - unsafe { arch::neon::gray_n_to_rgba_row::(y_plane, out, width, full_range); } + unsafe { arch::neon::gray_n_to_rgba_row::(y_plane, out, width, full_range); } return; } }, target_arch = "x86_64" => { if avx512_available() { - unsafe { arch::x86_avx512::gray_n_to_rgba_row::(y_plane, out, width, full_range); } + unsafe { arch::x86_avx512::gray_n_to_rgba_row::(y_plane, out, width, full_range); } return; } if avx2_available() { - unsafe { arch::x86_avx2::gray_n_to_rgba_row::(y_plane, out, width, full_range); } + unsafe { arch::x86_avx2::gray_n_to_rgba_row::(y_plane, out, width, full_range); } return; } if sse41_available() { - unsafe { arch::x86_sse41::gray_n_to_rgba_row::(y_plane, out, width, full_range); } + unsafe { arch::x86_sse41::gray_n_to_rgba_row::(y_plane, out, width, full_range); } return; } }, target_arch = "wasm32" => { if simd128_available() { - unsafe { arch::wasm_simd128::gray_n_to_rgba_row::(y_plane, out, width, full_range); } + unsafe { arch::wasm_simd128::gray_n_to_rgba_row::(y_plane, out, width, full_range); } return; } }, _ => {} } - scalar::gray_n_to_rgba_row::(y_plane, out, width, full_range); + scalar::gray_n_to_rgba_row::(y_plane, out, width, full_range); } /// Dispatch `gray_n_to_rgb_u16_row`. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn gray_n_to_rgb_u16_row( +pub(crate) fn gray_n_to_rgb_u16_row( y_plane: &[u16], out: &mut [u16], width: usize, @@ -282,45 +282,45 @@ pub(crate) fn gray_n_to_rgb_u16_row( assert!(y_plane.len() >= width, "y_plane too short"); assert!(out.len() >= out_min, "out too short"); if !use_simd { - return scalar::gray_n_to_rgb_u16_row::(y_plane, out, width, full_range); + return scalar::gray_n_to_rgb_u16_row::(y_plane, out, width, full_range); } cfg_select! { target_arch = "aarch64" => { if neon_available() { - unsafe { arch::neon::gray_n_to_rgb_u16_row::(y_plane, out, width, full_range); } + unsafe { arch::neon::gray_n_to_rgb_u16_row::(y_plane, out, width, full_range); } return; } }, target_arch = "x86_64" => { if avx512_available() { - unsafe { arch::x86_avx512::gray_n_to_rgb_u16_row::(y_plane, out, width, full_range); } + unsafe { arch::x86_avx512::gray_n_to_rgb_u16_row::(y_plane, out, width, full_range); } return; } if avx2_available() { - unsafe { arch::x86_avx2::gray_n_to_rgb_u16_row::(y_plane, out, width, full_range); } + unsafe { arch::x86_avx2::gray_n_to_rgb_u16_row::(y_plane, out, width, full_range); } return; } if sse41_available() { - unsafe { arch::x86_sse41::gray_n_to_rgb_u16_row::(y_plane, out, width, full_range); } + unsafe { arch::x86_sse41::gray_n_to_rgb_u16_row::(y_plane, out, width, full_range); } return; } }, target_arch = "wasm32" => { if simd128_available() { unsafe { - arch::wasm_simd128::gray_n_to_rgb_u16_row::(y_plane, out, width, full_range); + arch::wasm_simd128::gray_n_to_rgb_u16_row::(y_plane, out, width, full_range); } return; } }, _ => {} } - scalar::gray_n_to_rgb_u16_row::(y_plane, out, width, full_range); + scalar::gray_n_to_rgb_u16_row::(y_plane, out, width, full_range); } /// Dispatch `gray_n_to_rgba_u16_row`. 
#[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn gray_n_to_rgba_u16_row( +pub(crate) fn gray_n_to_rgba_u16_row( y_plane: &[u16], out: &mut [u16], width: usize, @@ -331,47 +331,47 @@ pub(crate) fn gray_n_to_rgba_u16_row( assert!(y_plane.len() >= width, "y_plane too short"); assert!(out.len() >= out_min, "out too short"); if !use_simd { - return scalar::gray_n_to_rgba_u16_row::(y_plane, out, width, full_range); + return scalar::gray_n_to_rgba_u16_row::(y_plane, out, width, full_range); } cfg_select! { target_arch = "aarch64" => { if neon_available() { - unsafe { arch::neon::gray_n_to_rgba_u16_row::(y_plane, out, width, full_range); } + unsafe { arch::neon::gray_n_to_rgba_u16_row::(y_plane, out, width, full_range); } return; } }, target_arch = "x86_64" => { if avx512_available() { unsafe { - arch::x86_avx512::gray_n_to_rgba_u16_row::(y_plane, out, width, full_range); + arch::x86_avx512::gray_n_to_rgba_u16_row::(y_plane, out, width, full_range); } return; } if avx2_available() { - unsafe { arch::x86_avx2::gray_n_to_rgba_u16_row::(y_plane, out, width, full_range); } + unsafe { arch::x86_avx2::gray_n_to_rgba_u16_row::(y_plane, out, width, full_range); } return; } if sse41_available() { - unsafe { arch::x86_sse41::gray_n_to_rgba_u16_row::(y_plane, out, width, full_range); } + unsafe { arch::x86_sse41::gray_n_to_rgba_u16_row::(y_plane, out, width, full_range); } return; } }, target_arch = "wasm32" => { if simd128_available() { unsafe { - arch::wasm_simd128::gray_n_to_rgba_u16_row::(y_plane, out, width, full_range); + arch::wasm_simd128::gray_n_to_rgba_u16_row::(y_plane, out, width, full_range); } return; } }, _ => {} } - scalar::gray_n_to_rgba_u16_row::(y_plane, out, width, full_range); + scalar::gray_n_to_rgba_u16_row::(y_plane, out, width, full_range); } /// Dispatch `gray_n_to_luma_row`. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn gray_n_to_luma_row( +pub(crate) fn gray_n_to_luma_row( y_plane: &[u16], out: &mut [u8], width: usize, @@ -380,43 +380,43 @@ pub(crate) fn gray_n_to_luma_row( assert!(y_plane.len() >= width, "y_plane too short"); assert!(out.len() >= width, "out too short"); if !use_simd { - return scalar::gray_n_to_luma_row::(y_plane, out, width); + return scalar::gray_n_to_luma_row::(y_plane, out, width); } cfg_select! { target_arch = "aarch64" => { if neon_available() { - unsafe { arch::neon::gray_n_to_luma_row::(y_plane, out, width); } + unsafe { arch::neon::gray_n_to_luma_row::(y_plane, out, width); } return; } }, target_arch = "x86_64" => { if avx512_available() { - unsafe { arch::x86_avx512::gray_n_to_luma_row::(y_plane, out, width); } + unsafe { arch::x86_avx512::gray_n_to_luma_row::(y_plane, out, width); } return; } if avx2_available() { - unsafe { arch::x86_avx2::gray_n_to_luma_row::(y_plane, out, width); } + unsafe { arch::x86_avx2::gray_n_to_luma_row::(y_plane, out, width); } return; } if sse41_available() { - unsafe { arch::x86_sse41::gray_n_to_luma_row::(y_plane, out, width); } + unsafe { arch::x86_sse41::gray_n_to_luma_row::(y_plane, out, width); } return; } }, target_arch = "wasm32" => { if simd128_available() { - unsafe { arch::wasm_simd128::gray_n_to_luma_row::(y_plane, out, width); } + unsafe { arch::wasm_simd128::gray_n_to_luma_row::(y_plane, out, width); } return; } }, _ => {} } - scalar::gray_n_to_luma_row::(y_plane, out, width); + scalar::gray_n_to_luma_row::(y_plane, out, width); } /// Dispatch `gray_n_to_luma_u16_row`. 
#[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn gray_n_to_luma_u16_row( +pub(crate) fn gray_n_to_luma_u16_row( y_plane: &[u16], out: &mut [u16], width: usize, @@ -425,43 +425,43 @@ pub(crate) fn gray_n_to_luma_u16_row( assert!(y_plane.len() >= width, "y_plane too short"); assert!(out.len() >= width, "out too short"); if !use_simd { - return scalar::gray_n_to_luma_u16_row::(y_plane, out, width); + return scalar::gray_n_to_luma_u16_row::(y_plane, out, width); } cfg_select! { target_arch = "aarch64" => { if neon_available() { - unsafe { arch::neon::gray_n_to_luma_u16_row::(y_plane, out, width); } + unsafe { arch::neon::gray_n_to_luma_u16_row::(y_plane, out, width); } return; } }, target_arch = "x86_64" => { if avx512_available() { - unsafe { arch::x86_avx512::gray_n_to_luma_u16_row::(y_plane, out, width); } + unsafe { arch::x86_avx512::gray_n_to_luma_u16_row::(y_plane, out, width); } return; } if avx2_available() { - unsafe { arch::x86_avx2::gray_n_to_luma_u16_row::(y_plane, out, width); } + unsafe { arch::x86_avx2::gray_n_to_luma_u16_row::(y_plane, out, width); } return; } if sse41_available() { - unsafe { arch::x86_sse41::gray_n_to_luma_u16_row::(y_plane, out, width); } + unsafe { arch::x86_sse41::gray_n_to_luma_u16_row::(y_plane, out, width); } return; } }, target_arch = "wasm32" => { if simd128_available() { - unsafe { arch::wasm_simd128::gray_n_to_luma_u16_row::(y_plane, out, width); } + unsafe { arch::wasm_simd128::gray_n_to_luma_u16_row::(y_plane, out, width); } return; } }, _ => {} } - scalar::gray_n_to_luma_u16_row::(y_plane, out, width); + scalar::gray_n_to_luma_u16_row::(y_plane, out, width); } /// Dispatch `gray_n_to_hsv_row`. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn gray_n_to_hsv_row( +pub(crate) fn gray_n_to_hsv_row( y_plane: &[u16], h_out: &mut [u8], s_out: &mut [u8], @@ -475,13 +475,13 @@ pub(crate) fn gray_n_to_hsv_row( assert!(s_out.len() >= width, "S out too short"); assert!(v_out.len() >= width, "V out too short"); if !use_simd { - return scalar::gray_n_to_hsv_row::(y_plane, h_out, s_out, v_out, width, full_range); + return scalar::gray_n_to_hsv_row::(y_plane, h_out, s_out, v_out, width, full_range); } cfg_select! 
{ target_arch = "aarch64" => { if neon_available() { unsafe { - arch::neon::gray_n_to_hsv_row::(y_plane, h_out, s_out, v_out, width, full_range); + arch::neon::gray_n_to_hsv_row::(y_plane, h_out, s_out, v_out, width, full_range); } return; } @@ -489,7 +489,7 @@ pub(crate) fn gray_n_to_hsv_row( target_arch = "x86_64" => { if avx512_available() { unsafe { - arch::x86_avx512::gray_n_to_hsv_row::( + arch::x86_avx512::gray_n_to_hsv_row::( y_plane, h_out, s_out, v_out, width, full_range, ); } @@ -497,7 +497,7 @@ pub(crate) fn gray_n_to_hsv_row( } if avx2_available() { unsafe { - arch::x86_avx2::gray_n_to_hsv_row::( + arch::x86_avx2::gray_n_to_hsv_row::( y_plane, h_out, s_out, v_out, width, full_range, ); } @@ -505,7 +505,7 @@ pub(crate) fn gray_n_to_hsv_row( } if sse41_available() { unsafe { - arch::x86_sse41::gray_n_to_hsv_row::( + arch::x86_sse41::gray_n_to_hsv_row::( y_plane, h_out, s_out, v_out, width, full_range, ); } @@ -515,7 +515,7 @@ pub(crate) fn gray_n_to_hsv_row( target_arch = "wasm32" => { if simd128_available() { unsafe { - arch::wasm_simd128::gray_n_to_hsv_row::( + arch::wasm_simd128::gray_n_to_hsv_row::( y_plane, h_out, s_out, v_out, width, full_range, ); } @@ -524,14 +524,14 @@ pub(crate) fn gray_n_to_hsv_row( }, _ => {} } - scalar::gray_n_to_hsv_row::(y_plane, h_out, s_out, v_out, width, full_range); + scalar::gray_n_to_hsv_row::(y_plane, h_out, s_out, v_out, width, full_range); } // ---- Gray16 ---------------------------------------------------------------- /// Dispatch `gray16_to_rgb_row`. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn gray16_to_rgb_row( +pub(crate) fn gray16_to_rgb_row( y_plane: &[u16], out: &mut [u8], width: usize, @@ -542,43 +542,43 @@ pub(crate) fn gray16_to_rgb_row( assert!(y_plane.len() >= width, "y_plane too short"); assert!(out.len() >= out_min, "out too short"); if !use_simd { - return scalar::gray16_to_rgb_row(y_plane, out, width, full_range); + return scalar::gray16_to_rgb_row::(y_plane, out, width, full_range); } cfg_select! { target_arch = "aarch64" => { if neon_available() { - unsafe { arch::neon::gray16_to_rgb_row(y_plane, out, width, full_range); } + unsafe { arch::neon::gray16_to_rgb_row::(y_plane, out, width, full_range); } return; } }, target_arch = "x86_64" => { if avx512_available() { - unsafe { arch::x86_avx512::gray16_to_rgb_row(y_plane, out, width, full_range); } + unsafe { arch::x86_avx512::gray16_to_rgb_row::(y_plane, out, width, full_range); } return; } if avx2_available() { - unsafe { arch::x86_avx2::gray16_to_rgb_row(y_plane, out, width, full_range); } + unsafe { arch::x86_avx2::gray16_to_rgb_row::(y_plane, out, width, full_range); } return; } if sse41_available() { - unsafe { arch::x86_sse41::gray16_to_rgb_row(y_plane, out, width, full_range); } + unsafe { arch::x86_sse41::gray16_to_rgb_row::(y_plane, out, width, full_range); } return; } }, target_arch = "wasm32" => { if simd128_available() { - unsafe { arch::wasm_simd128::gray16_to_rgb_row(y_plane, out, width, full_range); } + unsafe { arch::wasm_simd128::gray16_to_rgb_row::(y_plane, out, width, full_range); } return; } }, _ => {} } - scalar::gray16_to_rgb_row(y_plane, out, width, full_range); + scalar::gray16_to_rgb_row::(y_plane, out, width, full_range); } /// Dispatch `gray16_to_rgba_row`. 
#[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn gray16_to_rgba_row( +pub(crate) fn gray16_to_rgba_row( y_plane: &[u16], out: &mut [u8], width: usize, @@ -589,43 +589,43 @@ pub(crate) fn gray16_to_rgba_row( assert!(y_plane.len() >= width, "y_plane too short"); assert!(out.len() >= out_min, "out too short"); if !use_simd { - return scalar::gray16_to_rgba_row(y_plane, out, width, full_range); + return scalar::gray16_to_rgba_row::(y_plane, out, width, full_range); } cfg_select! { target_arch = "aarch64" => { if neon_available() { - unsafe { arch::neon::gray16_to_rgba_row(y_plane, out, width, full_range); } + unsafe { arch::neon::gray16_to_rgba_row::(y_plane, out, width, full_range); } return; } }, target_arch = "x86_64" => { if avx512_available() { - unsafe { arch::x86_avx512::gray16_to_rgba_row(y_plane, out, width, full_range); } + unsafe { arch::x86_avx512::gray16_to_rgba_row::(y_plane, out, width, full_range); } return; } if avx2_available() { - unsafe { arch::x86_avx2::gray16_to_rgba_row(y_plane, out, width, full_range); } + unsafe { arch::x86_avx2::gray16_to_rgba_row::(y_plane, out, width, full_range); } return; } if sse41_available() { - unsafe { arch::x86_sse41::gray16_to_rgba_row(y_plane, out, width, full_range); } + unsafe { arch::x86_sse41::gray16_to_rgba_row::(y_plane, out, width, full_range); } return; } }, target_arch = "wasm32" => { if simd128_available() { - unsafe { arch::wasm_simd128::gray16_to_rgba_row(y_plane, out, width, full_range); } + unsafe { arch::wasm_simd128::gray16_to_rgba_row::(y_plane, out, width, full_range); } return; } }, _ => {} } - scalar::gray16_to_rgba_row(y_plane, out, width, full_range); + scalar::gray16_to_rgba_row::(y_plane, out, width, full_range); } /// Dispatch `gray16_to_rgb_u16_row`. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn gray16_to_rgb_u16_row( +pub(crate) fn gray16_to_rgb_u16_row( y_plane: &[u16], out: &mut [u16], width: usize, @@ -636,43 +636,43 @@ pub(crate) fn gray16_to_rgb_u16_row( assert!(y_plane.len() >= width, "y_plane too short"); assert!(out.len() >= out_min, "out too short"); if !use_simd { - return scalar::gray16_to_rgb_u16_row(y_plane, out, width, full_range); + return scalar::gray16_to_rgb_u16_row::(y_plane, out, width, full_range); } cfg_select! 
{ target_arch = "aarch64" => { if neon_available() { - unsafe { arch::neon::gray16_to_rgb_u16_row(y_plane, out, width, full_range); } + unsafe { arch::neon::gray16_to_rgb_u16_row::(y_plane, out, width, full_range); } return; } }, target_arch = "x86_64" => { if avx512_available() { - unsafe { arch::x86_avx512::gray16_to_rgb_u16_row(y_plane, out, width, full_range); } + unsafe { arch::x86_avx512::gray16_to_rgb_u16_row::(y_plane, out, width, full_range); } return; } if avx2_available() { - unsafe { arch::x86_avx2::gray16_to_rgb_u16_row(y_plane, out, width, full_range); } + unsafe { arch::x86_avx2::gray16_to_rgb_u16_row::(y_plane, out, width, full_range); } return; } if sse41_available() { - unsafe { arch::x86_sse41::gray16_to_rgb_u16_row(y_plane, out, width, full_range); } + unsafe { arch::x86_sse41::gray16_to_rgb_u16_row::(y_plane, out, width, full_range); } return; } }, target_arch = "wasm32" => { if simd128_available() { - unsafe { arch::wasm_simd128::gray16_to_rgb_u16_row(y_plane, out, width, full_range); } + unsafe { arch::wasm_simd128::gray16_to_rgb_u16_row::(y_plane, out, width, full_range); } return; } }, _ => {} } - scalar::gray16_to_rgb_u16_row(y_plane, out, width, full_range); + scalar::gray16_to_rgb_u16_row::(y_plane, out, width, full_range); } /// Dispatch `gray16_to_rgba_u16_row`. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn gray16_to_rgba_u16_row( +pub(crate) fn gray16_to_rgba_u16_row( y_plane: &[u16], out: &mut [u16], width: usize, @@ -683,83 +683,88 @@ pub(crate) fn gray16_to_rgba_u16_row( assert!(y_plane.len() >= width, "y_plane too short"); assert!(out.len() >= out_min, "out too short"); if !use_simd { - return scalar::gray16_to_rgba_u16_row(y_plane, out, width, full_range); + return scalar::gray16_to_rgba_u16_row::(y_plane, out, width, full_range); } cfg_select! { target_arch = "aarch64" => { if neon_available() { - unsafe { arch::neon::gray16_to_rgba_u16_row(y_plane, out, width, full_range); } + unsafe { arch::neon::gray16_to_rgba_u16_row::(y_plane, out, width, full_range); } return; } }, target_arch = "x86_64" => { if avx512_available() { - unsafe { arch::x86_avx512::gray16_to_rgba_u16_row(y_plane, out, width, full_range); } + unsafe { arch::x86_avx512::gray16_to_rgba_u16_row::(y_plane, out, width, full_range); } return; } if avx2_available() { - unsafe { arch::x86_avx2::gray16_to_rgba_u16_row(y_plane, out, width, full_range); } + unsafe { arch::x86_avx2::gray16_to_rgba_u16_row::(y_plane, out, width, full_range); } return; } if sse41_available() { - unsafe { arch::x86_sse41::gray16_to_rgba_u16_row(y_plane, out, width, full_range); } + unsafe { arch::x86_sse41::gray16_to_rgba_u16_row::(y_plane, out, width, full_range); } return; } }, target_arch = "wasm32" => { if simd128_available() { - unsafe { arch::wasm_simd128::gray16_to_rgba_u16_row(y_plane, out, width, full_range); } + unsafe { arch::wasm_simd128::gray16_to_rgba_u16_row::(y_plane, out, width, full_range); } return; } }, _ => {} } - scalar::gray16_to_rgba_u16_row(y_plane, out, width, full_range); + scalar::gray16_to_rgba_u16_row::(y_plane, out, width, full_range); } /// Dispatch `gray16_to_luma_row`. 
#[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn gray16_to_luma_row(y_plane: &[u16], out: &mut [u8], width: usize, use_simd: bool) { +pub(crate) fn gray16_to_luma_row( + y_plane: &[u16], + out: &mut [u8], + width: usize, + use_simd: bool, +) { assert!(y_plane.len() >= width, "y_plane too short"); assert!(out.len() >= width, "out too short"); if !use_simd { - return scalar::gray16_to_luma_row(y_plane, out, width); + return scalar::gray16_to_luma_row::(y_plane, out, width); } cfg_select! { target_arch = "aarch64" => { if neon_available() { - unsafe { arch::neon::gray16_to_luma_row(y_plane, out, width); } + unsafe { arch::neon::gray16_to_luma_row::(y_plane, out, width); } return; } }, target_arch = "x86_64" => { if avx512_available() { - unsafe { arch::x86_avx512::gray16_to_luma_row(y_plane, out, width); } + unsafe { arch::x86_avx512::gray16_to_luma_row::(y_plane, out, width); } return; } if avx2_available() { - unsafe { arch::x86_avx2::gray16_to_luma_row(y_plane, out, width); } + unsafe { arch::x86_avx2::gray16_to_luma_row::(y_plane, out, width); } return; } if sse41_available() { - unsafe { arch::x86_sse41::gray16_to_luma_row(y_plane, out, width); } + unsafe { arch::x86_sse41::gray16_to_luma_row::(y_plane, out, width); } return; } }, target_arch = "wasm32" => { if simd128_available() { - unsafe { arch::wasm_simd128::gray16_to_luma_row(y_plane, out, width); } + unsafe { arch::wasm_simd128::gray16_to_luma_row::(y_plane, out, width); } return; } }, _ => {} } - scalar::gray16_to_luma_row(y_plane, out, width); + scalar::gray16_to_luma_row::(y_plane, out, width); } /// Dispatch `gray16_to_luma_u16_row`. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn gray16_to_luma_u16_row( +pub(crate) fn gray16_to_luma_u16_row( y_plane: &[u16], out: &mut [u16], width: usize, @@ -768,43 +773,43 @@ pub(crate) fn gray16_to_luma_u16_row( assert!(y_plane.len() >= width, "y_plane too short"); assert!(out.len() >= width, "out too short"); if !use_simd { - return scalar::gray16_to_luma_u16_row(y_plane, out, width); + return scalar::gray16_to_luma_u16_row::(y_plane, out, width); } cfg_select! { target_arch = "aarch64" => { if neon_available() { - unsafe { arch::neon::gray16_to_luma_u16_row(y_plane, out, width); } + unsafe { arch::neon::gray16_to_luma_u16_row::(y_plane, out, width); } return; } }, target_arch = "x86_64" => { if avx512_available() { - unsafe { arch::x86_avx512::gray16_to_luma_u16_row(y_plane, out, width); } + unsafe { arch::x86_avx512::gray16_to_luma_u16_row::(y_plane, out, width); } return; } if avx2_available() { - unsafe { arch::x86_avx2::gray16_to_luma_u16_row(y_plane, out, width); } + unsafe { arch::x86_avx2::gray16_to_luma_u16_row::(y_plane, out, width); } return; } if sse41_available() { - unsafe { arch::x86_sse41::gray16_to_luma_u16_row(y_plane, out, width); } + unsafe { arch::x86_sse41::gray16_to_luma_u16_row::(y_plane, out, width); } return; } }, target_arch = "wasm32" => { if simd128_available() { - unsafe { arch::wasm_simd128::gray16_to_luma_u16_row(y_plane, out, width); } + unsafe { arch::wasm_simd128::gray16_to_luma_u16_row::(y_plane, out, width); } return; } }, _ => {} } - scalar::gray16_to_luma_u16_row(y_plane, out, width); + scalar::gray16_to_luma_u16_row::(y_plane, out, width); } /// Dispatch `gray16_to_hsv_row`. 
#[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn gray16_to_hsv_row( +pub(crate) fn gray16_to_hsv_row( y_plane: &[u16], h_out: &mut [u8], s_out: &mut [u8], @@ -818,31 +823,31 @@ pub(crate) fn gray16_to_hsv_row( assert!(s_out.len() >= width, "S out too short"); assert!(v_out.len() >= width, "V out too short"); if !use_simd { - return scalar::gray16_to_hsv_row(y_plane, h_out, s_out, v_out, width, full_range); + return scalar::gray16_to_hsv_row::(y_plane, h_out, s_out, v_out, width, full_range); } cfg_select! { target_arch = "aarch64" => { if neon_available() { - unsafe { arch::neon::gray16_to_hsv_row(y_plane, h_out, s_out, v_out, width, full_range); } + unsafe { arch::neon::gray16_to_hsv_row::(y_plane, h_out, s_out, v_out, width, full_range); } return; } }, target_arch = "x86_64" => { if avx512_available() { unsafe { - arch::x86_avx512::gray16_to_hsv_row(y_plane, h_out, s_out, v_out, width, full_range); + arch::x86_avx512::gray16_to_hsv_row::(y_plane, h_out, s_out, v_out, width, full_range); } return; } if avx2_available() { unsafe { - arch::x86_avx2::gray16_to_hsv_row(y_plane, h_out, s_out, v_out, width, full_range); + arch::x86_avx2::gray16_to_hsv_row::(y_plane, h_out, s_out, v_out, width, full_range); } return; } if sse41_available() { unsafe { - arch::x86_sse41::gray16_to_hsv_row(y_plane, h_out, s_out, v_out, width, full_range); + arch::x86_sse41::gray16_to_hsv_row::(y_plane, h_out, s_out, v_out, width, full_range); } return; } @@ -850,12 +855,12 @@ pub(crate) fn gray16_to_hsv_row( target_arch = "wasm32" => { if simd128_available() { unsafe { - arch::wasm_simd128::gray16_to_hsv_row(y_plane, h_out, s_out, v_out, width, full_range); + arch::wasm_simd128::gray16_to_hsv_row::(y_plane, h_out, s_out, v_out, width, full_range); } return; } }, _ => {} } - scalar::gray16_to_hsv_row(y_plane, h_out, s_out, v_out, width, full_range); + scalar::gray16_to_hsv_row::(y_plane, h_out, s_out, v_out, width, full_range); } diff --git a/src/row/dispatch/grayf32.rs b/src/row/dispatch/grayf32.rs index 6169c03e..02321edd 100644 --- a/src/row/dispatch/grayf32.rs +++ b/src/row/dispatch/grayf32.rs @@ -25,133 +25,148 @@ use crate::row::{ /// Dispatch `grayf32_to_rgb_row`. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn grayf32_to_rgb_row(plane: &[f32], out: &mut [u8], width: usize, use_simd: bool) { +pub(crate) fn grayf32_to_rgb_row( + plane: &[f32], + out: &mut [u8], + width: usize, + use_simd: bool, +) { assert!(plane.len() >= width, "plane too short"); assert!(out.len() >= rgb_row_bytes(width), "out too short"); if !use_simd { - return scalar::grayf32_to_rgb_row(plane, out, width); + return scalar::grayf32_to_rgb_row::(plane, out, width); } cfg_select! 
{ target_arch = "aarch64" => { if neon_available() { - unsafe { arch::neon::grayf32_to_rgb_row(plane, out, width); } + unsafe { arch::neon::grayf32_to_rgb_row::(plane, out, width); } return; } }, target_arch = "x86_64" => { if avx512_available() { - unsafe { arch::x86_avx512::grayf32_to_rgb_row(plane, out, width); } + unsafe { arch::x86_avx512::grayf32_to_rgb_row::(plane, out, width); } return; } if avx2_available() { - unsafe { arch::x86_avx2::grayf32_to_rgb_row(plane, out, width); } + unsafe { arch::x86_avx2::grayf32_to_rgb_row::(plane, out, width); } return; } if sse41_available() { - unsafe { arch::x86_sse41::grayf32_to_rgb_row(plane, out, width); } + unsafe { arch::x86_sse41::grayf32_to_rgb_row::(plane, out, width); } return; } }, target_arch = "wasm32" => { if simd128_available() { - unsafe { arch::wasm_simd128::grayf32_to_rgb_row(plane, out, width); } + unsafe { arch::wasm_simd128::grayf32_to_rgb_row::(plane, out, width); } return; } }, _ => {} } - scalar::grayf32_to_rgb_row(plane, out, width); + scalar::grayf32_to_rgb_row::(plane, out, width); } // ---- grayf32_to_rgba_row ------------------------------------------------------ /// Dispatch `grayf32_to_rgba_row`. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn grayf32_to_rgba_row(plane: &[f32], out: &mut [u8], width: usize, use_simd: bool) { +pub(crate) fn grayf32_to_rgba_row( + plane: &[f32], + out: &mut [u8], + width: usize, + use_simd: bool, +) { assert!(plane.len() >= width, "plane too short"); assert!(out.len() >= rgba_row_bytes(width), "out too short"); if !use_simd { - return scalar::grayf32_to_rgba_row(plane, out, width); + return scalar::grayf32_to_rgba_row::(plane, out, width); } cfg_select! { target_arch = "aarch64" => { if neon_available() { - unsafe { arch::neon::grayf32_to_rgba_row(plane, out, width); } + unsafe { arch::neon::grayf32_to_rgba_row::(plane, out, width); } return; } }, target_arch = "x86_64" => { if avx512_available() { - unsafe { arch::x86_avx512::grayf32_to_rgba_row(plane, out, width); } + unsafe { arch::x86_avx512::grayf32_to_rgba_row::(plane, out, width); } return; } if avx2_available() { - unsafe { arch::x86_avx2::grayf32_to_rgba_row(plane, out, width); } + unsafe { arch::x86_avx2::grayf32_to_rgba_row::(plane, out, width); } return; } if sse41_available() { - unsafe { arch::x86_sse41::grayf32_to_rgba_row(plane, out, width); } + unsafe { arch::x86_sse41::grayf32_to_rgba_row::(plane, out, width); } return; } }, target_arch = "wasm32" => { if simd128_available() { - unsafe { arch::wasm_simd128::grayf32_to_rgba_row(plane, out, width); } + unsafe { arch::wasm_simd128::grayf32_to_rgba_row::(plane, out, width); } return; } }, _ => {} } - scalar::grayf32_to_rgba_row(plane, out, width); + scalar::grayf32_to_rgba_row::(plane, out, width); } // ---- grayf32_to_rgb_u16_row --------------------------------------------------- /// Dispatch `grayf32_to_rgb_u16_row`. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn grayf32_to_rgb_u16_row(plane: &[f32], out: &mut [u16], width: usize, use_simd: bool) { +pub(crate) fn grayf32_to_rgb_u16_row( + plane: &[f32], + out: &mut [u16], + width: usize, + use_simd: bool, +) { assert!(plane.len() >= width, "plane too short"); assert!(out.len() >= rgb_row_elems(width), "out too short"); if !use_simd { - return scalar::grayf32_to_rgb_u16_row(plane, out, width); + return scalar::grayf32_to_rgb_u16_row::(plane, out, width); } cfg_select! 
{ target_arch = "aarch64" => { if neon_available() { - unsafe { arch::neon::grayf32_to_rgb_u16_row(plane, out, width); } + unsafe { arch::neon::grayf32_to_rgb_u16_row::(plane, out, width); } return; } }, target_arch = "x86_64" => { if avx512_available() { - unsafe { arch::x86_avx512::grayf32_to_rgb_u16_row(plane, out, width); } + unsafe { arch::x86_avx512::grayf32_to_rgb_u16_row::(plane, out, width); } return; } if avx2_available() { - unsafe { arch::x86_avx2::grayf32_to_rgb_u16_row(plane, out, width); } + unsafe { arch::x86_avx2::grayf32_to_rgb_u16_row::(plane, out, width); } return; } if sse41_available() { - unsafe { arch::x86_sse41::grayf32_to_rgb_u16_row(plane, out, width); } + unsafe { arch::x86_sse41::grayf32_to_rgb_u16_row::(plane, out, width); } return; } }, target_arch = "wasm32" => { if simd128_available() { - unsafe { arch::wasm_simd128::grayf32_to_rgb_u16_row(plane, out, width); } + unsafe { arch::wasm_simd128::grayf32_to_rgb_u16_row::(plane, out, width); } return; } }, _ => {} } - scalar::grayf32_to_rgb_u16_row(plane, out, width); + scalar::grayf32_to_rgb_u16_row::(plane, out, width); } // ---- grayf32_to_rgba_u16_row -------------------------------------------------- /// Dispatch `grayf32_to_rgba_u16_row`. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn grayf32_to_rgba_u16_row( +pub(crate) fn grayf32_to_rgba_u16_row( plane: &[f32], out: &mut [u16], width: usize, @@ -160,45 +175,45 @@ pub(crate) fn grayf32_to_rgba_u16_row( assert!(plane.len() >= width, "plane too short"); assert!(out.len() >= rgba_row_elems(width), "out too short"); if !use_simd { - return scalar::grayf32_to_rgba_u16_row(plane, out, width); + return scalar::grayf32_to_rgba_u16_row::(plane, out, width); } cfg_select! { target_arch = "aarch64" => { if neon_available() { - unsafe { arch::neon::grayf32_to_rgba_u16_row(plane, out, width); } + unsafe { arch::neon::grayf32_to_rgba_u16_row::(plane, out, width); } return; } }, target_arch = "x86_64" => { if avx512_available() { - unsafe { arch::x86_avx512::grayf32_to_rgba_u16_row(plane, out, width); } + unsafe { arch::x86_avx512::grayf32_to_rgba_u16_row::(plane, out, width); } return; } if avx2_available() { - unsafe { arch::x86_avx2::grayf32_to_rgba_u16_row(plane, out, width); } + unsafe { arch::x86_avx2::grayf32_to_rgba_u16_row::(plane, out, width); } return; } if sse41_available() { - unsafe { arch::x86_sse41::grayf32_to_rgba_u16_row(plane, out, width); } + unsafe { arch::x86_sse41::grayf32_to_rgba_u16_row::(plane, out, width); } return; } }, target_arch = "wasm32" => { if simd128_available() { - unsafe { arch::wasm_simd128::grayf32_to_rgba_u16_row(plane, out, width); } + unsafe { arch::wasm_simd128::grayf32_to_rgba_u16_row::(plane, out, width); } return; } }, _ => {} } - scalar::grayf32_to_rgba_u16_row(plane, out, width); + scalar::grayf32_to_rgba_u16_row::(plane, out, width); } // ---- grayf32_to_rgb_f32_row --------------------------------------------------- /// Dispatch `grayf32_to_rgb_f32_row` (lossless replicate, all backends delegate to scalar). 
#[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn grayf32_to_rgb_f32_row( +pub(crate) fn grayf32_to_rgb_f32_row( plane: &[f32], out: &mut [f32], width: usize, @@ -206,56 +221,61 @@ pub(crate) fn grayf32_to_rgb_f32_row( ) { assert!(plane.len() >= width, "plane too short"); assert!(out.len() >= rgb_row_elems(width), "out too short"); - scalar::grayf32_to_rgb_f32_row(plane, out, width); + scalar::grayf32_to_rgb_f32_row::(plane, out, width); } // ---- grayf32_to_luma_row ------------------------------------------------------ /// Dispatch `grayf32_to_luma_row`. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn grayf32_to_luma_row(plane: &[f32], out: &mut [u8], width: usize, use_simd: bool) { +pub(crate) fn grayf32_to_luma_row( + plane: &[f32], + out: &mut [u8], + width: usize, + use_simd: bool, +) { assert!(plane.len() >= width, "plane too short"); assert!(out.len() >= width, "out too short"); if !use_simd { - return scalar::grayf32_to_luma_row(plane, out, width); + return scalar::grayf32_to_luma_row::(plane, out, width); } cfg_select! { target_arch = "aarch64" => { if neon_available() { - unsafe { arch::neon::grayf32_to_luma_row(plane, out, width); } + unsafe { arch::neon::grayf32_to_luma_row::(plane, out, width); } return; } }, target_arch = "x86_64" => { if avx512_available() { - unsafe { arch::x86_avx512::grayf32_to_luma_row(plane, out, width); } + unsafe { arch::x86_avx512::grayf32_to_luma_row::(plane, out, width); } return; } if avx2_available() { - unsafe { arch::x86_avx2::grayf32_to_luma_row(plane, out, width); } + unsafe { arch::x86_avx2::grayf32_to_luma_row::(plane, out, width); } return; } if sse41_available() { - unsafe { arch::x86_sse41::grayf32_to_luma_row(plane, out, width); } + unsafe { arch::x86_sse41::grayf32_to_luma_row::(plane, out, width); } return; } }, target_arch = "wasm32" => { if simd128_available() { - unsafe { arch::wasm_simd128::grayf32_to_luma_row(plane, out, width); } + unsafe { arch::wasm_simd128::grayf32_to_luma_row::(plane, out, width); } return; } }, _ => {} } - scalar::grayf32_to_luma_row(plane, out, width); + scalar::grayf32_to_luma_row::(plane, out, width); } // ---- grayf32_to_luma_u16_row -------------------------------------------------- /// Dispatch `grayf32_to_luma_u16_row`. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn grayf32_to_luma_u16_row( +pub(crate) fn grayf32_to_luma_u16_row( plane: &[f32], out: &mut [u16], width: usize, @@ -264,45 +284,45 @@ pub(crate) fn grayf32_to_luma_u16_row( assert!(plane.len() >= width, "plane too short"); assert!(out.len() >= width, "out too short"); if !use_simd { - return scalar::grayf32_to_luma_u16_row(plane, out, width); + return scalar::grayf32_to_luma_u16_row::(plane, out, width); } cfg_select! 
{ target_arch = "aarch64" => { if neon_available() { - unsafe { arch::neon::grayf32_to_luma_u16_row(plane, out, width); } + unsafe { arch::neon::grayf32_to_luma_u16_row::(plane, out, width); } return; } }, target_arch = "x86_64" => { if avx512_available() { - unsafe { arch::x86_avx512::grayf32_to_luma_u16_row(plane, out, width); } + unsafe { arch::x86_avx512::grayf32_to_luma_u16_row::(plane, out, width); } return; } if avx2_available() { - unsafe { arch::x86_avx2::grayf32_to_luma_u16_row(plane, out, width); } + unsafe { arch::x86_avx2::grayf32_to_luma_u16_row::(plane, out, width); } return; } if sse41_available() { - unsafe { arch::x86_sse41::grayf32_to_luma_u16_row(plane, out, width); } + unsafe { arch::x86_sse41::grayf32_to_luma_u16_row::(plane, out, width); } return; } }, target_arch = "wasm32" => { if simd128_available() { - unsafe { arch::wasm_simd128::grayf32_to_luma_u16_row(plane, out, width); } + unsafe { arch::wasm_simd128::grayf32_to_luma_u16_row::(plane, out, width); } return; } }, _ => {} } - scalar::grayf32_to_luma_u16_row(plane, out, width); + scalar::grayf32_to_luma_u16_row::(plane, out, width); } // ---- grayf32_to_luma_f32_row -------------------------------------------------- /// Dispatch `grayf32_to_luma_f32_row` (lossless memcpy, no SIMD needed). #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn grayf32_to_luma_f32_row( +pub(crate) fn grayf32_to_luma_f32_row( plane: &[f32], out: &mut [f32], width: usize, @@ -310,14 +330,14 @@ pub(crate) fn grayf32_to_luma_f32_row( ) { assert!(plane.len() >= width, "plane too short"); assert!(out.len() >= width, "out too short"); - scalar::grayf32_to_luma_f32_row(plane, out, width); + scalar::grayf32_to_luma_f32_row::(plane, out, width); } // ---- grayf32_to_hsv_row ------------------------------------------------------- /// Dispatch `grayf32_to_hsv_row`. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn grayf32_to_hsv_row( +pub(crate) fn grayf32_to_hsv_row( plane: &[f32], h_out: &mut [u8], s_out: &mut [u8], @@ -330,36 +350,36 @@ pub(crate) fn grayf32_to_hsv_row( assert!(s_out.len() >= width, "S out too short"); assert!(v_out.len() >= width, "V out too short"); if !use_simd { - return scalar::grayf32_to_hsv_row(plane, h_out, s_out, v_out, width); + return scalar::grayf32_to_hsv_row::(plane, h_out, s_out, v_out, width); } cfg_select! 
{ target_arch = "aarch64" => { if neon_available() { - unsafe { arch::neon::grayf32_to_hsv_row(plane, h_out, s_out, v_out, width); } + unsafe { arch::neon::grayf32_to_hsv_row::(plane, h_out, s_out, v_out, width); } return; } }, target_arch = "x86_64" => { if avx512_available() { - unsafe { arch::x86_avx512::grayf32_to_hsv_row(plane, h_out, s_out, v_out, width); } + unsafe { arch::x86_avx512::grayf32_to_hsv_row::(plane, h_out, s_out, v_out, width); } return; } if avx2_available() { - unsafe { arch::x86_avx2::grayf32_to_hsv_row(plane, h_out, s_out, v_out, width); } + unsafe { arch::x86_avx2::grayf32_to_hsv_row::(plane, h_out, s_out, v_out, width); } return; } if sse41_available() { - unsafe { arch::x86_sse41::grayf32_to_hsv_row(plane, h_out, s_out, v_out, width); } + unsafe { arch::x86_sse41::grayf32_to_hsv_row::(plane, h_out, s_out, v_out, width); } return; } }, target_arch = "wasm32" => { if simd128_available() { - unsafe { arch::wasm_simd128::grayf32_to_hsv_row(plane, h_out, s_out, v_out, width); } + unsafe { arch::wasm_simd128::grayf32_to_hsv_row::(plane, h_out, s_out, v_out, width); } return; } }, _ => {} } - scalar::grayf32_to_hsv_row(plane, h_out, s_out, v_out, width); + scalar::grayf32_to_hsv_row::(plane, h_out, s_out, v_out, width); } diff --git a/src/row/dispatch/ya16.rs b/src/row/dispatch/ya16.rs index 38dab9c0..41e0c486 100644 --- a/src/row/dispatch/ya16.rs +++ b/src/row/dispatch/ya16.rs @@ -27,259 +27,289 @@ use crate::row::{ /// Dispatch `ya16_to_rgb_row`. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn ya16_to_rgb_row(packed: &[u16], out: &mut [u8], width: usize, use_simd: bool) { +pub(crate) fn ya16_to_rgb_row( + packed: &[u16], + out: &mut [u8], + width: usize, + use_simd: bool, +) { assert!(packed.len() >= ya_row_elems(width), "packed too short"); assert!(out.len() >= rgb_row_bytes(width), "out too short"); if !use_simd { - return scalar::ya16_to_rgb_row(packed, out, width); + return scalar::ya16_to_rgb_row::(packed, out, width); } cfg_select! { target_arch = "aarch64" => { if neon_available() { - unsafe { arch::neon::ya16_to_rgb_row(packed, out, width); } + unsafe { arch::neon::ya16_to_rgb_row::(packed, out, width); } return; } }, target_arch = "x86_64" => { if avx512_available() { - unsafe { arch::x86_avx512::ya16_to_rgb_row(packed, out, width); } + unsafe { arch::x86_avx512::ya16_to_rgb_row::(packed, out, width); } return; } if avx2_available() { - unsafe { arch::x86_avx2::ya16_to_rgb_row(packed, out, width); } + unsafe { arch::x86_avx2::ya16_to_rgb_row::(packed, out, width); } return; } if sse41_available() { - unsafe { arch::x86_sse41::ya16_to_rgb_row(packed, out, width); } + unsafe { arch::x86_sse41::ya16_to_rgb_row::(packed, out, width); } return; } }, target_arch = "wasm32" => { if simd128_available() { - unsafe { arch::wasm_simd128::ya16_to_rgb_row(packed, out, width); } + unsafe { arch::wasm_simd128::ya16_to_rgb_row::(packed, out, width); } return; } }, _ => {} } - scalar::ya16_to_rgb_row(packed, out, width); + scalar::ya16_to_rgb_row::(packed, out, width); } // ---- ya16_to_rgba_row --------------------------------------------------------- /// Dispatch `ya16_to_rgba_row`. 
#[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn ya16_to_rgba_row(packed: &[u16], out: &mut [u8], width: usize, use_simd: bool) { +pub(crate) fn ya16_to_rgba_row( + packed: &[u16], + out: &mut [u8], + width: usize, + use_simd: bool, +) { assert!(packed.len() >= ya_row_elems(width), "packed too short"); assert!(out.len() >= rgba_row_bytes(width), "out too short"); if !use_simd { - return scalar::ya16_to_rgba_row(packed, out, width); + return scalar::ya16_to_rgba_row::(packed, out, width); } cfg_select! { target_arch = "aarch64" => { if neon_available() { - unsafe { arch::neon::ya16_to_rgba_row(packed, out, width); } + unsafe { arch::neon::ya16_to_rgba_row::(packed, out, width); } return; } }, target_arch = "x86_64" => { if avx512_available() { - unsafe { arch::x86_avx512::ya16_to_rgba_row(packed, out, width); } + unsafe { arch::x86_avx512::ya16_to_rgba_row::(packed, out, width); } return; } if avx2_available() { - unsafe { arch::x86_avx2::ya16_to_rgba_row(packed, out, width); } + unsafe { arch::x86_avx2::ya16_to_rgba_row::(packed, out, width); } return; } if sse41_available() { - unsafe { arch::x86_sse41::ya16_to_rgba_row(packed, out, width); } + unsafe { arch::x86_sse41::ya16_to_rgba_row::(packed, out, width); } return; } }, target_arch = "wasm32" => { if simd128_available() { - unsafe { arch::wasm_simd128::ya16_to_rgba_row(packed, out, width); } + unsafe { arch::wasm_simd128::ya16_to_rgba_row::(packed, out, width); } return; } }, _ => {} } - scalar::ya16_to_rgba_row(packed, out, width); + scalar::ya16_to_rgba_row::(packed, out, width); } // ---- ya16_to_rgb_u16_row ------------------------------------------------------ /// Dispatch `ya16_to_rgb_u16_row`. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn ya16_to_rgb_u16_row(packed: &[u16], out: &mut [u16], width: usize, use_simd: bool) { +pub(crate) fn ya16_to_rgb_u16_row( + packed: &[u16], + out: &mut [u16], + width: usize, + use_simd: bool, +) { assert!(packed.len() >= ya_row_elems(width), "packed too short"); assert!(out.len() >= rgb_row_elems(width), "out too short"); if !use_simd { - return scalar::ya16_to_rgb_u16_row(packed, out, width); + return scalar::ya16_to_rgb_u16_row::(packed, out, width); } cfg_select! { target_arch = "aarch64" => { if neon_available() { - unsafe { arch::neon::ya16_to_rgb_u16_row(packed, out, width); } + unsafe { arch::neon::ya16_to_rgb_u16_row::(packed, out, width); } return; } }, target_arch = "x86_64" => { if avx512_available() { - unsafe { arch::x86_avx512::ya16_to_rgb_u16_row(packed, out, width); } + unsafe { arch::x86_avx512::ya16_to_rgb_u16_row::(packed, out, width); } return; } if avx2_available() { - unsafe { arch::x86_avx2::ya16_to_rgb_u16_row(packed, out, width); } + unsafe { arch::x86_avx2::ya16_to_rgb_u16_row::(packed, out, width); } return; } if sse41_available() { - unsafe { arch::x86_sse41::ya16_to_rgb_u16_row(packed, out, width); } + unsafe { arch::x86_sse41::ya16_to_rgb_u16_row::(packed, out, width); } return; } }, target_arch = "wasm32" => { if simd128_available() { - unsafe { arch::wasm_simd128::ya16_to_rgb_u16_row(packed, out, width); } + unsafe { arch::wasm_simd128::ya16_to_rgb_u16_row::(packed, out, width); } return; } }, _ => {} } - scalar::ya16_to_rgb_u16_row(packed, out, width); + scalar::ya16_to_rgb_u16_row::(packed, out, width); } // ---- ya16_to_rgba_u16_row ----------------------------------------------------- /// Dispatch `ya16_to_rgba_u16_row`. 
#[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn ya16_to_rgba_u16_row(packed: &[u16], out: &mut [u16], width: usize, use_simd: bool) { +pub(crate) fn ya16_to_rgba_u16_row( + packed: &[u16], + out: &mut [u16], + width: usize, + use_simd: bool, +) { assert!(packed.len() >= ya_row_elems(width), "packed too short"); assert!(out.len() >= rgba_row_elems(width), "out too short"); if !use_simd { - return scalar::ya16_to_rgba_u16_row(packed, out, width); + return scalar::ya16_to_rgba_u16_row::(packed, out, width); } cfg_select! { target_arch = "aarch64" => { if neon_available() { - unsafe { arch::neon::ya16_to_rgba_u16_row(packed, out, width); } + unsafe { arch::neon::ya16_to_rgba_u16_row::(packed, out, width); } return; } }, target_arch = "x86_64" => { if avx512_available() { - unsafe { arch::x86_avx512::ya16_to_rgba_u16_row(packed, out, width); } + unsafe { arch::x86_avx512::ya16_to_rgba_u16_row::(packed, out, width); } return; } if avx2_available() { - unsafe { arch::x86_avx2::ya16_to_rgba_u16_row(packed, out, width); } + unsafe { arch::x86_avx2::ya16_to_rgba_u16_row::(packed, out, width); } return; } if sse41_available() { - unsafe { arch::x86_sse41::ya16_to_rgba_u16_row(packed, out, width); } + unsafe { arch::x86_sse41::ya16_to_rgba_u16_row::(packed, out, width); } return; } }, target_arch = "wasm32" => { if simd128_available() { - unsafe { arch::wasm_simd128::ya16_to_rgba_u16_row(packed, out, width); } + unsafe { arch::wasm_simd128::ya16_to_rgba_u16_row::(packed, out, width); } return; } }, _ => {} } - scalar::ya16_to_rgba_u16_row(packed, out, width); + scalar::ya16_to_rgba_u16_row::(packed, out, width); } // ---- ya16_to_luma_row --------------------------------------------------------- /// Dispatch `ya16_to_luma_row`. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn ya16_to_luma_row(packed: &[u16], out: &mut [u8], width: usize, use_simd: bool) { +pub(crate) fn ya16_to_luma_row( + packed: &[u16], + out: &mut [u8], + width: usize, + use_simd: bool, +) { assert!(packed.len() >= ya_row_elems(width), "packed too short"); assert!(out.len() >= width, "out too short"); if !use_simd { - return scalar::ya16_to_luma_row(packed, out, width); + return scalar::ya16_to_luma_row::(packed, out, width); } cfg_select! { target_arch = "aarch64" => { if neon_available() { - unsafe { arch::neon::ya16_to_luma_row(packed, out, width); } + unsafe { arch::neon::ya16_to_luma_row::(packed, out, width); } return; } }, target_arch = "x86_64" => { if avx512_available() { - unsafe { arch::x86_avx512::ya16_to_luma_row(packed, out, width); } + unsafe { arch::x86_avx512::ya16_to_luma_row::(packed, out, width); } return; } if avx2_available() { - unsafe { arch::x86_avx2::ya16_to_luma_row(packed, out, width); } + unsafe { arch::x86_avx2::ya16_to_luma_row::(packed, out, width); } return; } if sse41_available() { - unsafe { arch::x86_sse41::ya16_to_luma_row(packed, out, width); } + unsafe { arch::x86_sse41::ya16_to_luma_row::(packed, out, width); } return; } }, target_arch = "wasm32" => { if simd128_available() { - unsafe { arch::wasm_simd128::ya16_to_luma_row(packed, out, width); } + unsafe { arch::wasm_simd128::ya16_to_luma_row::(packed, out, width); } return; } }, _ => {} } - scalar::ya16_to_luma_row(packed, out, width); + scalar::ya16_to_luma_row::(packed, out, width); } // ---- ya16_to_luma_u16_row ----------------------------------------------------- /// Dispatch `ya16_to_luma_u16_row`. 
#[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn ya16_to_luma_u16_row(packed: &[u16], out: &mut [u16], width: usize, use_simd: bool) { +pub(crate) fn ya16_to_luma_u16_row( + packed: &[u16], + out: &mut [u16], + width: usize, + use_simd: bool, +) { assert!(packed.len() >= ya_row_elems(width), "packed too short"); assert!(out.len() >= width, "out too short"); if !use_simd { - return scalar::ya16_to_luma_u16_row(packed, out, width); + return scalar::ya16_to_luma_u16_row::(packed, out, width); } cfg_select! { target_arch = "aarch64" => { if neon_available() { - unsafe { arch::neon::ya16_to_luma_u16_row(packed, out, width); } + unsafe { arch::neon::ya16_to_luma_u16_row::(packed, out, width); } return; } }, target_arch = "x86_64" => { if avx512_available() { - unsafe { arch::x86_avx512::ya16_to_luma_u16_row(packed, out, width); } + unsafe { arch::x86_avx512::ya16_to_luma_u16_row::(packed, out, width); } return; } if avx2_available() { - unsafe { arch::x86_avx2::ya16_to_luma_u16_row(packed, out, width); } + unsafe { arch::x86_avx2::ya16_to_luma_u16_row::(packed, out, width); } return; } if sse41_available() { - unsafe { arch::x86_sse41::ya16_to_luma_u16_row(packed, out, width); } + unsafe { arch::x86_sse41::ya16_to_luma_u16_row::(packed, out, width); } return; } }, target_arch = "wasm32" => { if simd128_available() { - unsafe { arch::wasm_simd128::ya16_to_luma_u16_row(packed, out, width); } + unsafe { arch::wasm_simd128::ya16_to_luma_u16_row::(packed, out, width); } return; } }, _ => {} } - scalar::ya16_to_luma_u16_row(packed, out, width); + scalar::ya16_to_luma_u16_row::(packed, out, width); } // ---- ya16_to_hsv_row ---------------------------------------------------------- /// Dispatch `ya16_to_hsv_row`. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn ya16_to_hsv_row( +pub(crate) fn ya16_to_hsv_row( packed: &[u16], h_out: &mut [u8], s_out: &mut [u8], @@ -292,36 +322,36 @@ pub(crate) fn ya16_to_hsv_row( assert!(s_out.len() >= width, "S out too short"); assert!(v_out.len() >= width, "V out too short"); if !use_simd { - return scalar::ya16_to_hsv_row(packed, h_out, s_out, v_out, width); + return scalar::ya16_to_hsv_row::(packed, h_out, s_out, v_out, width); } cfg_select! 
{ target_arch = "aarch64" => { if neon_available() { - unsafe { arch::neon::ya16_to_hsv_row(packed, h_out, s_out, v_out, width); } + unsafe { arch::neon::ya16_to_hsv_row::(packed, h_out, s_out, v_out, width); } return; } }, target_arch = "x86_64" => { if avx512_available() { - unsafe { arch::x86_avx512::ya16_to_hsv_row(packed, h_out, s_out, v_out, width); } + unsafe { arch::x86_avx512::ya16_to_hsv_row::(packed, h_out, s_out, v_out, width); } return; } if avx2_available() { - unsafe { arch::x86_avx2::ya16_to_hsv_row(packed, h_out, s_out, v_out, width); } + unsafe { arch::x86_avx2::ya16_to_hsv_row::(packed, h_out, s_out, v_out, width); } return; } if sse41_available() { - unsafe { arch::x86_sse41::ya16_to_hsv_row(packed, h_out, s_out, v_out, width); } + unsafe { arch::x86_sse41::ya16_to_hsv_row::(packed, h_out, s_out, v_out, width); } return; } }, target_arch = "wasm32" => { if simd128_available() { - unsafe { arch::wasm_simd128::ya16_to_hsv_row(packed, h_out, s_out, v_out, width); } + unsafe { arch::wasm_simd128::ya16_to_hsv_row::(packed, h_out, s_out, v_out, width); } return; } }, _ => {} } - scalar::ya16_to_hsv_row(packed, h_out, s_out, v_out, width); + scalar::ya16_to_hsv_row::(packed, h_out, s_out, v_out, width); } diff --git a/src/row/mod.rs b/src/row/mod.rs index 297f1c3c..920856a4 100644 --- a/src/row/mod.rs +++ b/src/row/mod.rs @@ -710,7 +710,7 @@ mod overflow_tests { fn gray_n_to_rgb_dispatcher_rejects_width_times_3_overflow() { let y: [u16; 0] = []; let mut rgb: [u8; 0] = []; - gray_n_to_rgb_row::<10>(&y, &mut rgb, OVERFLOW_WIDTH, false, true); + gray_n_to_rgb_row::<10, false>(&y, &mut rgb, OVERFLOW_WIDTH, false, true); } #[cfg(target_pointer_width = "32")] @@ -719,7 +719,7 @@ mod overflow_tests { fn gray_n_to_rgba_dispatcher_rejects_width_times_4_overflow() { let y: [u16; 0] = []; let mut rgba: [u8; 0] = []; - gray_n_to_rgba_row::<10>(&y, &mut rgba, OVERFLOW_WIDTH, false, true); + gray_n_to_rgba_row::<10, false>(&y, &mut rgba, OVERFLOW_WIDTH, false, true); } #[cfg(target_pointer_width = "32")] @@ -728,7 +728,7 @@ mod overflow_tests { fn gray_n_to_rgb_u16_dispatcher_rejects_width_times_3_overflow() { let y: [u16; 0] = []; let mut rgb: [u16; 0] = []; - gray_n_to_rgb_u16_row::<10>(&y, &mut rgb, OVERFLOW_WIDTH, false, true); + gray_n_to_rgb_u16_row::<10, false>(&y, &mut rgb, OVERFLOW_WIDTH, false, true); } #[cfg(target_pointer_width = "32")] @@ -737,7 +737,7 @@ mod overflow_tests { fn gray_n_to_rgba_u16_dispatcher_rejects_width_times_4_overflow() { let y: [u16; 0] = []; let mut rgba: [u16; 0] = []; - gray_n_to_rgba_u16_row::<10>(&y, &mut rgba, OVERFLOW_WIDTH, false, true); + gray_n_to_rgba_u16_row::<10, false>(&y, &mut rgba, OVERFLOW_WIDTH, false, true); } #[cfg(target_pointer_width = "32")] @@ -746,7 +746,7 @@ mod overflow_tests { fn gray16_to_rgb_dispatcher_rejects_width_times_3_overflow() { let y: [u16; 0] = []; let mut rgb: [u8; 0] = []; - gray16_to_rgb_row(&y, &mut rgb, OVERFLOW_WIDTH, false, true); + gray16_to_rgb_row::(&y, &mut rgb, OVERFLOW_WIDTH, false, true); } #[cfg(target_pointer_width = "32")] @@ -755,7 +755,7 @@ mod overflow_tests { fn gray16_to_rgba_dispatcher_rejects_width_times_4_overflow() { let y: [u16; 0] = []; let mut rgba: [u8; 0] = []; - gray16_to_rgba_row(&y, &mut rgba, OVERFLOW_WIDTH, false, true); + gray16_to_rgba_row::(&y, &mut rgba, OVERFLOW_WIDTH, false, true); } #[cfg(target_pointer_width = "32")] @@ -764,7 +764,7 @@ mod overflow_tests { fn gray16_to_rgb_u16_dispatcher_rejects_width_times_3_overflow() { let y: [u16; 0] = []; let mut rgb: [u16; 0] = []; - 
gray16_to_rgb_u16_row(&y, &mut rgb, OVERFLOW_WIDTH, false, true); + gray16_to_rgb_u16_row::(&y, &mut rgb, OVERFLOW_WIDTH, false, true); } #[cfg(target_pointer_width = "32")] @@ -773,7 +773,7 @@ mod overflow_tests { fn gray16_to_rgba_u16_dispatcher_rejects_width_times_4_overflow() { let y: [u16; 0] = []; let mut rgba: [u16; 0] = []; - gray16_to_rgba_u16_row(&y, &mut rgba, OVERFLOW_WIDTH, false, true); + gray16_to_rgba_u16_row::(&y, &mut rgba, OVERFLOW_WIDTH, false, true); } #[cfg(target_pointer_width = "32")] diff --git a/src/row/scalar/gray.rs b/src/row/scalar/gray.rs index 8211f084..8f591e30 100644 --- a/src/row/scalar/gray.rs +++ b/src/row/scalar/gray.rs @@ -213,10 +213,11 @@ pub(crate) fn gray8_to_hsv_row( /// GrayN → packed RGB u8. Masks to BITS bits, downshifts `BITS - 8` to u8, /// broadcasts. /// +/// When `BE = true`, each u16 sample is byte-swapped before processing. /// When `full_range = false`, limited-range Y is rescaled to [0, 255] /// before broadcast. Luma outputs always pass Y through without rescaling. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn gray_n_to_rgb_row( +pub(crate) fn gray_n_to_rgb_row( y_plane: &[u16], out: &mut [u8], width: usize, @@ -227,6 +228,11 @@ pub(crate) fn gray_n_to_rgb_row( let mask = bits_mask::(); let shift = BITS - 8; for (x, &raw) in y_plane[..width].iter().enumerate() { + let raw = if BE { + u16::from_be(raw) + } else { + u16::from_le(raw) + }; let masked = raw & mask; let y8 = if full_range { (masked >> shift) as u8 @@ -240,9 +246,10 @@ pub(crate) fn gray_n_to_rgb_row( /// GrayN → packed RGBA u8. Masks to BITS bits, downshifts to u8, broadcasts, /// α = 0xFF. /// +/// When `BE = true`, each u16 sample is byte-swapped before processing. /// When `full_range = false`, limited-range Y is rescaled to [0, 255]. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn gray_n_to_rgba_row( +pub(crate) fn gray_n_to_rgba_row( y_plane: &[u16], out: &mut [u8], width: usize, @@ -253,6 +260,11 @@ pub(crate) fn gray_n_to_rgba_row( let mask = bits_mask::(); let shift = BITS - 8; for (x, &raw) in y_plane[..width].iter().enumerate() { + let raw = if BE { + u16::from_be(raw) + } else { + u16::from_le(raw) + }; let masked = raw & mask; let y8 = if full_range { (masked >> shift) as u8 @@ -265,10 +277,11 @@ pub(crate) fn gray_n_to_rgba_row( /// GrayN → packed u16 RGB. Masks to BITS bits, broadcasts at native depth. /// +/// When `BE = true`, each u16 sample is byte-swapped before processing. /// When `full_range = false`, limited-range Y is rescaled to full native range /// [0, (1<( +pub(crate) fn gray_n_to_rgb_u16_row( y_plane: &[u16], out: &mut [u16], width: usize, @@ -278,6 +291,11 @@ pub(crate) fn gray_n_to_rgb_u16_row( debug_assert!(out.len() >= width * 3, "out too short"); let mask = bits_mask::(); for (x, &raw) in y_plane[..width].iter().enumerate() { + let raw = if BE { + u16::from_be(raw) + } else { + u16::from_le(raw) + }; let masked = raw & mask; let y_out = if full_range { masked @@ -290,9 +308,10 @@ pub(crate) fn gray_n_to_rgb_u16_row( /// GrayN → packed u16 RGBA. Masks to BITS bits, broadcasts, α = `(1 << BITS) - 1`. /// +/// When `BE = true`, each u16 sample is byte-swapped before processing. /// When `full_range = false`, limited-range Y is rescaled to full native range. 
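Both the mask applied to every sample and the full-range alpha used by the u16 RGBA path come from `bits_mask::<BITS>()`. Its presumed shape is the low-BITS ones mask; the sketch below is consistent with the `(1 << BITS) - 1` note above and with the 10-bit masking test further down (0xFFFF & 0x03FF == 1023), but the crate's helper may be written differently.

// Presumed shape of bits_mask::<BITS>() (illustrative, not the crate's code).
const fn bits_mask_sketch<const BITS: u32>() -> u16 {
    ((1u32 << BITS) - 1) as u16
}

fn main() {
    assert_eq!(bits_mask_sketch::<10>(), 0x03FF); // strips the upper padding bits of a 10-bit sample
    assert_eq!(0xFFFFu16 & bits_mask_sketch::<10>(), 1023); // the masking test below checks exactly this
    assert_eq!(bits_mask_sketch::<12>(), 0x0FFF); // also the full-range alpha for the 12-bit RGBA u16 path
}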
#[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn gray_n_to_rgba_u16_row( +pub(crate) fn gray_n_to_rgba_u16_row( y_plane: &[u16], out: &mut [u16], width: usize, @@ -303,6 +322,11 @@ pub(crate) fn gray_n_to_rgba_u16_row( let mask = bits_mask::(); let alpha = mask; // full-range max for BITS for (x, &raw) in y_plane[..width].iter().enumerate() { + let raw = if BE { + u16::from_be(raw) + } else { + u16::from_le(raw) + }; let masked = raw & mask; let y_out = if full_range { masked @@ -315,25 +339,36 @@ pub(crate) fn gray_n_to_rgba_u16_row( /// GrayN → luma u8. Masks to BITS bits, downshifts `BITS - 8`. /// +/// When `BE = true`, each u16 sample is byte-swapped before processing. /// Always passes raw Y through without `full_range` rescaling — /// the caller is explicitly requesting the source luma plane as-is. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn gray_n_to_luma_row(y_plane: &[u16], out: &mut [u8], width: usize) { +pub(crate) fn gray_n_to_luma_row( + y_plane: &[u16], + out: &mut [u8], + width: usize, +) { debug_assert!(y_plane.len() >= width, "y_plane too short"); debug_assert!(out.len() >= width, "out too short"); let mask = bits_mask::(); let shift = BITS - 8; for (out_byte, &raw) in out[..width].iter_mut().zip(y_plane[..width].iter()) { + let raw = if BE { + u16::from_be(raw) + } else { + u16::from_le(raw) + }; *out_byte = ((raw & mask) >> shift) as u8; } } /// GrayN → luma u16. Masks to BITS bits, identity copy. /// +/// When `BE = true`, each u16 sample is byte-swapped before processing. /// Always passes raw Y through without `full_range` rescaling — /// the caller is explicitly requesting the source luma plane as-is. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn gray_n_to_luma_u16_row( +pub(crate) fn gray_n_to_luma_u16_row( y_plane: &[u16], out: &mut [u16], width: usize, @@ -342,16 +377,22 @@ pub(crate) fn gray_n_to_luma_u16_row( debug_assert!(out.len() >= width, "out too short"); let mask = bits_mask::(); for (out_el, &raw) in out[..width].iter_mut().zip(y_plane[..width].iter()) { + let raw = if BE { + u16::from_be(raw) + } else { + u16::from_le(raw) + }; *out_el = raw & mask; } } /// GrayN → HSV u8. Masks to BITS bits, downshifts to u8, H=0 S=0 V=Y8. /// +/// When `BE = true`, each u16 sample is byte-swapped before processing. /// When `full_range = false`, the V channel uses the rescaled luma value. /// See [`gray8_to_hsv_row`] for the S=0 convention. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn gray_n_to_hsv_row( +pub(crate) fn gray_n_to_hsv_row( y_plane: &[u16], h_out: &mut [u8], s_out: &mut [u8], @@ -366,6 +407,11 @@ pub(crate) fn gray_n_to_hsv_row( let mask = bits_mask::(); let shift = BITS - 8; for (x, &raw) in y_plane[..width].iter().enumerate() { + let raw = if BE { + u16::from_be(raw) + } else { + u16::from_le(raw) + }; let masked = raw & mask; h_out[x] = 0; s_out[x] = 0; @@ -381,13 +427,24 @@ pub(crate) fn gray_n_to_hsv_row( /// Gray16 → packed RGB u8. Downshifts `>> 8` to u8, broadcasts. /// +/// When `BE = true`, each u16 sample is byte-swapped before processing. /// When `full_range = false`, limited-range Y (black=4096, white=56064+4096) /// is rescaled to [0, 255] before broadcast. 
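Every kernel in this file now opens with the same two-arm normalization, shown standalone below. `normalize_u16` is an illustrative name; `u16::from_le` is the identity on little-endian hosts and a byte swap on big-endian ones, with `u16::from_be` the mirror image, so the kernel body always sees a host-order sample.

fn normalize_u16<const BE: bool>(raw: u16) -> u16 {
    if BE { u16::from_be(raw) } else { u16::from_le(raw) }
}

fn main() {
    let sample = 512u16;               // a 10-bit mid-grey sample
    let swapped = sample.swap_bytes(); // the same sample with its two bytes exchanged
    // Regardless of the host's byte order, the BE view of the swapped sample equals
    // the LE view of the original; this is the invariant the parity tests below lean on.
    assert_eq!(normalize_u16::<true>(swapped), normalize_u16::<false>(sample));
}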
#[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn gray16_to_rgb_row(y_plane: &[u16], out: &mut [u8], width: usize, full_range: bool) { +pub(crate) fn gray16_to_rgb_row( + y_plane: &[u16], + out: &mut [u8], + width: usize, + full_range: bool, +) { debug_assert!(y_plane.len() >= width, "y_plane too short"); debug_assert!(out.len() >= width * 3, "out too short"); for (x, &raw) in y_plane[..width].iter().enumerate() { + let raw = if BE { + u16::from_be(raw) + } else { + u16::from_le(raw) + }; let y8 = if full_range { (raw >> 8) as u8 } else { @@ -399,12 +456,23 @@ pub(crate) fn gray16_to_rgb_row(y_plane: &[u16], out: &mut [u8], width: usize, f /// Gray16 → packed RGBA u8. Downshifts `>> 8`, broadcasts, α = 0xFF. /// +/// When `BE = true`, each u16 sample is byte-swapped before processing. /// When `full_range = false`, limited-range Y is rescaled to [0, 255]. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn gray16_to_rgba_row(y_plane: &[u16], out: &mut [u8], width: usize, full_range: bool) { +pub(crate) fn gray16_to_rgba_row( + y_plane: &[u16], + out: &mut [u8], + width: usize, + full_range: bool, +) { debug_assert!(y_plane.len() >= width, "y_plane too short"); debug_assert!(out.len() >= width * 4, "out too short"); for (x, &raw) in y_plane[..width].iter().enumerate() { + let raw = if BE { + u16::from_be(raw) + } else { + u16::from_le(raw) + }; let y8 = if full_range { (raw >> 8) as u8 } else { @@ -416,9 +484,10 @@ pub(crate) fn gray16_to_rgba_row(y_plane: &[u16], out: &mut [u8], width: usize, /// Gray16 → packed u16 RGB. Identity broadcast, native 16-bit depth. /// +/// When `BE = true`, each u16 sample is byte-swapped before processing. /// When `full_range = false`, limited-range Y is rescaled to [0, 65535]. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn gray16_to_rgb_u16_row( +pub(crate) fn gray16_to_rgb_u16_row( y_plane: &[u16], out: &mut [u16], width: usize, @@ -427,6 +496,11 @@ pub(crate) fn gray16_to_rgb_u16_row( debug_assert!(y_plane.len() >= width, "y_plane too short"); debug_assert!(out.len() >= width * 3, "out too short"); for (x, &raw) in y_plane[..width].iter().enumerate() { + let raw = if BE { + u16::from_be(raw) + } else { + u16::from_le(raw) + }; let y_out = if full_range { raw } else { @@ -438,9 +512,10 @@ pub(crate) fn gray16_to_rgb_u16_row( /// Gray16 → packed u16 RGBA. Identity broadcast, α = 0xFFFF. /// +/// When `BE = true`, each u16 sample is byte-swapped before processing. /// When `full_range = false`, limited-range Y is rescaled to [0, 65535]. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn gray16_to_rgba_u16_row( +pub(crate) fn gray16_to_rgba_u16_row( y_plane: &[u16], out: &mut [u16], width: usize, @@ -449,6 +524,11 @@ pub(crate) fn gray16_to_rgba_u16_row( debug_assert!(y_plane.len() >= width, "y_plane too short"); debug_assert!(out.len() >= width * 4, "out too short"); for (x, &raw) in y_plane[..width].iter().enumerate() { + let raw = if BE { + u16::from_be(raw) + } else { + u16::from_le(raw) + }; let y_out = if full_range { raw } else { @@ -460,34 +540,52 @@ pub(crate) fn gray16_to_rgba_u16_row( /// Gray16 → luma u8. Downshifts `>> 8`. /// +/// When `BE = true`, each u16 sample is byte-swapped before processing. /// Always passes raw Y through without `full_range` rescaling — /// the caller is explicitly requesting the source luma plane as-is. 
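For the `full_range = false` branches elided in the hunks above, an illustrative rescale is sketched below. The anchors (16-bit black = 16 << 8 = 4096, white = 235 << 8 = 60160) and the over-white clamp match the unit tests later in this file; the crate's exact arithmetic may differ, so treat this as a model of the behaviour rather than the implementation.

fn rescale_limited16_to_u8(y: u16) -> u8 {
    const BLACK: u32 = 16 << 8;         // 4096, limited-range black
    const RANGE: u32 = (235 - 16) << 8; // 56064, black-to-white span
    let above_black = (y as u32).saturating_sub(BLACK);
    (above_black * 255 / RANGE).min(255) as u8
}

fn main() {
    assert_eq!(rescale_limited16_to_u8(4096), 0);    // limited-range black
    assert_eq!(rescale_limited16_to_u8(60160), 255); // limited-range white
    assert_eq!(rescale_limited16_to_u8(65535), 255); // over-white clamps to max
}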
#[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn gray16_to_luma_row(y_plane: &[u16], out: &mut [u8], width: usize) { +pub(crate) fn gray16_to_luma_row(y_plane: &[u16], out: &mut [u8], width: usize) { debug_assert!(y_plane.len() >= width, "y_plane too short"); debug_assert!(out.len() >= width, "out too short"); for (out_byte, &raw) in out[..width].iter_mut().zip(y_plane[..width].iter()) { + let raw = if BE { + u16::from_be(raw) + } else { + u16::from_le(raw) + }; *out_byte = (raw >> 8) as u8; } } -/// Gray16 → luma u16. Identity copy. +/// Gray16 → luma u16. Identity copy (or byte-swap copy for BE). /// +/// When `BE = true`, each u16 sample is byte-swapped before output. /// Always passes raw Y through without `full_range` rescaling — /// the caller is explicitly requesting the source luma plane as-is. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn gray16_to_luma_u16_row(y_plane: &[u16], out: &mut [u16], width: usize) { +pub(crate) fn gray16_to_luma_u16_row( + y_plane: &[u16], + out: &mut [u16], + width: usize, +) { debug_assert!(y_plane.len() >= width, "y_plane too short"); debug_assert!(out.len() >= width, "out too short"); - out[..width].copy_from_slice(&y_plane[..width]); + for (o, &raw) in out[..width].iter_mut().zip(y_plane[..width].iter()) { + *o = if BE { + u16::from_be(raw) + } else { + u16::from_le(raw) + }; + } } /// Gray16 → HSV u8. `>> 8` to u8, H=0 S=0 V=Y8. /// +/// When `BE = true`, each u16 sample is byte-swapped before processing. /// When `full_range = false`, the V channel uses the rescaled luma value. /// See [`gray8_to_hsv_row`] for the S=0 convention. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn gray16_to_hsv_row( +pub(crate) fn gray16_to_hsv_row( y_plane: &[u16], h_out: &mut [u8], s_out: &mut [u8], @@ -500,6 +598,11 @@ pub(crate) fn gray16_to_hsv_row( debug_assert!(s_out.len() >= width, "S out too short"); debug_assert!(v_out.len() >= width, "V out too short"); for (x, &raw) in y_plane[..width].iter().enumerate() { + let raw = if BE { + u16::from_be(raw) + } else { + u16::from_le(raw) + }; h_out[x] = 0; s_out[x] = 0; v_out[x] = if full_range { @@ -609,7 +712,7 @@ mod tests { // 10-bit black = 16 << 2 = 64 let y: std::vec::Vec = std::vec![64u16]; let mut out = std::vec![0u8; 3]; - gray_n_to_rgb_row::<10>(&y, &mut out, 1, false); + gray_n_to_rgb_row::<10, false>(&y, &mut out, 1, false); assert_eq!(&out[0..3], &[0, 0, 0]); } @@ -618,7 +721,7 @@ mod tests { // 10-bit white = 235 << 2 = 940 let y: std::vec::Vec = std::vec![940u16]; let mut out = std::vec![0u8; 3]; - gray_n_to_rgb_row::<10>(&y, &mut out, 1, false); + gray_n_to_rgb_row::<10, false>(&y, &mut out, 1, false); assert_eq!(&out[0..3], &[255, 255, 255]); } @@ -627,7 +730,7 @@ mod tests { // 10-bit mid: 125 << 2 = 500 → approx 127 let y: std::vec::Vec = std::vec![500u16]; let mut out = std::vec![0u8; 3]; - gray_n_to_rgb_row::<10>(&y, &mut out, 1, false); + gray_n_to_rgb_row::<10, false>(&y, &mut out, 1, false); assert!( out[0] >= 126 && out[0] <= 128, "expected ~127 got {}", @@ -640,7 +743,7 @@ mod tests { // 10-bit full range: value 512 >> 2 = 128 let y: std::vec::Vec = std::vec![512u16]; let mut out = std::vec![0u8; 3]; - gray_n_to_rgb_row::<10>(&y, &mut out, 1, true); + gray_n_to_rgb_row::<10, false>(&y, &mut out, 1, true); assert_eq!(&out[0..3], &[128, 128, 128]); } @@ -651,7 +754,7 @@ mod tests { // 12-bit black = 16 << 4 = 256 let y: std::vec::Vec = std::vec![256u16]; let mut out = std::vec![0u8; 3]; - gray_n_to_rgb_row::<12>(&y, &mut out, 1, false); + 
gray_n_to_rgb_row::<12, false>(&y, &mut out, 1, false); assert_eq!(&out[0..3], &[0, 0, 0]); } @@ -660,7 +763,7 @@ mod tests { // 12-bit white = 235 << 4 = 3760 let y: std::vec::Vec = std::vec![3760u16]; let mut out = std::vec![0u8; 3]; - gray_n_to_rgb_row::<12>(&y, &mut out, 1, false); + gray_n_to_rgb_row::<12, false>(&y, &mut out, 1, false); assert_eq!(&out[0..3], &[255, 255, 255]); } @@ -671,7 +774,7 @@ mod tests { // 14-bit black = 16 << 6 = 1024 let y: std::vec::Vec = std::vec![1024u16]; let mut out = std::vec![0u8; 3]; - gray_n_to_rgb_row::<14>(&y, &mut out, 1, false); + gray_n_to_rgb_row::<14, false>(&y, &mut out, 1, false); assert_eq!(&out[0..3], &[0, 0, 0]); } @@ -680,7 +783,7 @@ mod tests { // 14-bit white = 235 << 6 = 15040 let y: std::vec::Vec = std::vec![15040u16]; let mut out = std::vec![0u8; 3]; - gray_n_to_rgb_row::<14>(&y, &mut out, 1, false); + gray_n_to_rgb_row::<14, false>(&y, &mut out, 1, false); assert_eq!(&out[0..3], &[255, 255, 255]); } @@ -691,7 +794,7 @@ mod tests { // 16-bit black = 16 << 8 = 4096 let y: std::vec::Vec = std::vec![4096u16]; let mut out = std::vec![0u8; 3]; - gray16_to_rgb_row(&y, &mut out, 1, false); + gray16_to_rgb_row::(&y, &mut out, 1, false); assert_eq!(&out[0..3], &[0, 0, 0]); } @@ -700,7 +803,7 @@ mod tests { // 16-bit white = 235 << 8 = 60160 let y: std::vec::Vec = std::vec![60160u16]; let mut out = std::vec![0u8; 3]; - gray16_to_rgb_row(&y, &mut out, 1, false); + gray16_to_rgb_row::(&y, &mut out, 1, false); assert_eq!(&out[0..3], &[255, 255, 255]); } @@ -709,7 +812,7 @@ mod tests { // 16-bit mid: 125 << 8 = 32000 → approx 127 let y: std::vec::Vec = std::vec![32000u16]; let mut out = std::vec![0u8; 3]; - gray16_to_rgb_row(&y, &mut out, 1, false); + gray16_to_rgb_row::(&y, &mut out, 1, false); assert!( out[0] >= 126 && out[0] <= 128, "expected ~127 got {}", @@ -722,7 +825,7 @@ mod tests { // 16-bit full range: 0x8000 >> 8 = 128 let y: std::vec::Vec = std::vec![0x8000u16]; let mut out = std::vec![0u8; 3]; - gray16_to_rgb_row(&y, &mut out, 1, true); + gray16_to_rgb_row::(&y, &mut out, 1, true); assert_eq!(&out[0..3], &[128, 128, 128]); } @@ -737,7 +840,7 @@ mod tests { fn gray16_to_rgb_u16_limited_range_black() { let y: std::vec::Vec = std::vec![4096u16]; // limited-range black let mut out = std::vec![0u16; 3]; - gray16_to_rgb_u16_row(&y, &mut out, 1, false); + gray16_to_rgb_u16_row::(&y, &mut out, 1, false); assert_eq!(&out[0..3], &[0, 0, 0]); } @@ -745,7 +848,7 @@ mod tests { fn gray16_to_rgb_u16_limited_range_white() { let y: std::vec::Vec = std::vec![60160u16]; // limited-range white let mut out = std::vec![0u16; 3]; - gray16_to_rgb_u16_row(&y, &mut out, 1, false); + gray16_to_rgb_u16_row::(&y, &mut out, 1, false); assert_eq!(&out[0..3], &[65535, 65535, 65535]); } @@ -754,7 +857,7 @@ mod tests { // Over-white (Y > 60160) is clamped to max_native=65535. 
let y: std::vec::Vec = std::vec![65535u16]; let mut out = std::vec![0u16; 3]; - gray16_to_rgb_u16_row(&y, &mut out, 1, false); + gray16_to_rgb_u16_row::(&y, &mut out, 1, false); assert_eq!(&out[0..3], &[65535, 65535, 65535]); } @@ -762,7 +865,7 @@ mod tests { fn gray16_to_rgba_u16_limited_range_black_and_white() { let y: std::vec::Vec = std::vec![4096u16, 60160u16]; let mut out = std::vec![0u16; 8]; - gray16_to_rgba_u16_row(&y, &mut out, 2, false); + gray16_to_rgba_u16_row::(&y, &mut out, 2, false); assert_eq!(&out[0..3], &[0, 0, 0]); assert_eq!(out[3], 0xFFFF); assert_eq!(&out[4..7], &[65535, 65535, 65535]); @@ -776,7 +879,7 @@ mod tests { // 10-bit: 1023 >> 2 = 255; 0 >> 2 = 0; 512 >> 2 = 128 let y: std::vec::Vec = std::vec![0, 512, 1023]; let mut out = std::vec![0u8; 9]; - gray_n_to_rgb_row::<10>(&y, &mut out, 3, true); + gray_n_to_rgb_row::<10, false>(&y, &mut out, 3, true); assert_eq!(&out[0..3], &[0, 0, 0]); assert_eq!(&out[3..6], &[128, 128, 128]); assert_eq!(&out[6..9], &[255, 255, 255]); @@ -787,7 +890,7 @@ mod tests { // Upper bits should be masked out: 0xFFFF & 0x03FF = 0x03FF = 1023 let y: std::vec::Vec = std::vec![0xFFFF, 512, 0]; let mut out = std::vec![0u16; 9]; - gray_n_to_rgb_u16_row::<10>(&y, &mut out, 3, true); + gray_n_to_rgb_u16_row::<10, false>(&y, &mut out, 3, true); assert_eq!(&out[0..3], &[1023, 1023, 1023]); assert_eq!(&out[3..6], &[512, 512, 512]); assert_eq!(&out[6..9], &[0, 0, 0]); @@ -799,7 +902,7 @@ mod tests { let mut h = std::vec![0xFFu8; 1]; let mut s = std::vec![0xFFu8; 1]; let mut v = std::vec![0u8; 1]; - gray_n_to_hsv_row::<10>(&y, &mut h, &mut s, &mut v, 1, true); + gray_n_to_hsv_row::<10, false>(&y, &mut h, &mut s, &mut v, 1, true); assert_eq!(h[0], 0); assert_eq!(s[0], 0); assert_eq!(v[0], 128); @@ -809,7 +912,7 @@ mod tests { fn gray16_to_rgb_downshifts_8() { let y: std::vec::Vec = std::vec![0, 0x8000, 0xFFFF]; let mut out = std::vec![0u8; 9]; - gray16_to_rgb_row(&y, &mut out, 3, true); + gray16_to_rgb_row::(&y, &mut out, 3, true); assert_eq!(&out[0..3], &[0, 0, 0]); assert_eq!(&out[3..6], &[0x80, 0x80, 0x80]); assert_eq!(&out[6..9], &[0xFF, 0xFF, 0xFF]); @@ -819,7 +922,7 @@ mod tests { fn gray16_to_luma_u16_identity() { let y: std::vec::Vec = std::vec![0, 1000, 65535]; let mut out = std::vec![0u16; 3]; - gray16_to_luma_u16_row(&y, &mut out, 3); + gray16_to_luma_u16_row::(&y, &mut out, 3); assert_eq!(out.as_slice(), &[0, 1000, 65535]); } @@ -827,7 +930,7 @@ mod tests { fn gray16_to_rgba_u16_opaque() { let y: std::vec::Vec = std::vec![12345u16]; let mut out = std::vec![0u16; 4]; - gray16_to_rgba_u16_row(&y, &mut out, 1, true); + gray16_to_rgba_u16_row::(&y, &mut out, 1, true); assert_eq!(&out[0..4], &[12345, 12345, 12345, 0xFFFF]); } @@ -835,7 +938,7 @@ mod tests { fn gray_n_to_luma_u16_10bit_masks() { let y: std::vec::Vec = std::vec![0xFFFF]; // should mask to 1023 let mut out = std::vec![0u16; 1]; - gray_n_to_luma_u16_row::<10>(&y, &mut out, 1); + gray_n_to_luma_u16_row::<10, false>(&y, &mut out, 1); assert_eq!(out[0], 1023); } @@ -846,7 +949,7 @@ mod tests { // 9-bit black = 16 << 1 = 32 let y: std::vec::Vec = std::vec![32u16]; let mut out = std::vec![0u8; 3]; - gray_n_to_rgb_row::<9>(&y, &mut out, 1, false); + gray_n_to_rgb_row::<9, false>(&y, &mut out, 1, false); assert_eq!(&out[0..3], &[0, 0, 0]); } @@ -855,7 +958,7 @@ mod tests { // 9-bit white = 235 << 1 = 470 let y: std::vec::Vec = std::vec![470u16]; let mut out = std::vec![0u8; 3]; - gray_n_to_rgb_row::<9>(&y, &mut out, 1, false); + gray_n_to_rgb_row::<9, false>(&y, &mut out, 1, false); 
assert_eq!(&out[0..3], &[255, 255, 255]); } @@ -864,7 +967,113 @@ mod tests { // 9-bit full range: value 256 >> 1 = 128 let y: std::vec::Vec = std::vec![256u16]; let mut out = std::vec![0u8; 3]; - gray_n_to_rgb_row::<9>(&y, &mut out, 1, true); + gray_n_to_rgb_row::<9, false>(&y, &mut out, 1, true); assert_eq!(&out[0..3], &[128, 128, 128]); } + + // ---- BE parity tests: gray_n (Gray9-14) ----------------------------------- + // Pattern: construct LE input, byte-swap to produce BE input, call with + // BE=true, assert output equals LE-input run output. + + #[test] + fn gray10_be_parity_rgb() { + // LE value 512 >> 2 = 128. BE encoding: 512 = 0x0200, BE bytes = [0x02, 0x00]. + let le: std::vec::Vec = std::vec![512u16]; + let be: std::vec::Vec = le.iter().map(|v| v.swap_bytes()).collect(); + let mut out_le = std::vec![0u8; 3]; + let mut out_be = std::vec![0u8; 3]; + gray_n_to_rgb_row::<10, false>(&le, &mut out_le, 1, true); + gray_n_to_rgb_row::<10, true>(&be, &mut out_be, 1, true); + assert_eq!(out_le, out_be, "BE and LE gray10 rgb outputs must match"); + } + + #[test] + fn gray10_be_parity_rgba() { + let le: std::vec::Vec = std::vec![768u16]; // 768 >> 2 = 192 + let be: std::vec::Vec = le.iter().map(|v| v.swap_bytes()).collect(); + let mut out_le = std::vec![0u8; 4]; + let mut out_be = std::vec![0u8; 4]; + gray_n_to_rgba_row::<10, false>(&le, &mut out_le, 1, true); + gray_n_to_rgba_row::<10, true>(&be, &mut out_be, 1, true); + assert_eq!(out_le, out_be, "BE and LE gray10 rgba outputs must match"); + } + + #[test] + fn gray10_be_parity_luma() { + let le: std::vec::Vec = std::vec![256u16]; // 256 >> 2 = 64 + let be: std::vec::Vec = le.iter().map(|v| v.swap_bytes()).collect(); + let mut out_le = std::vec![0u8; 1]; + let mut out_be = std::vec![0u8; 1]; + gray_n_to_luma_row::<10, false>(&le, &mut out_le, 1); + gray_n_to_luma_row::<10, true>(&be, &mut out_be, 1); + assert_eq!(out_le, out_be, "BE and LE gray10 luma outputs must match"); + } + + #[test] + fn gray10_be_parity_luma_u16() { + let le: std::vec::Vec = std::vec![512u16]; + let be: std::vec::Vec = le.iter().map(|v| v.swap_bytes()).collect(); + let mut out_le = std::vec![0u16; 1]; + let mut out_be = std::vec![0u16; 1]; + gray_n_to_luma_u16_row::<10, false>(&le, &mut out_le, 1); + gray_n_to_luma_u16_row::<10, true>(&be, &mut out_be, 1); + assert_eq!( + out_le, out_be, + "BE and LE gray10 luma_u16 outputs must match" + ); + } + + // ---- BE parity tests: gray16 ----------------------------------------------- + + #[test] + fn gray16_be_parity_rgb() { + // LE value 0x8000 >> 8 = 128. 
+ let le: std::vec::Vec = std::vec![0x8000u16]; + let be: std::vec::Vec = le.iter().map(|v| v.swap_bytes()).collect(); + let mut out_le = std::vec![0u8; 3]; + let mut out_be = std::vec![0u8; 3]; + gray16_to_rgb_row::(&le, &mut out_le, 1, true); + gray16_to_rgb_row::(&be, &mut out_be, 1, true); + assert_eq!(out_le, out_be, "BE and LE gray16 rgb outputs must match"); + } + + #[test] + fn gray16_be_parity_rgba() { + let le: std::vec::Vec = std::vec![0xC000u16]; // 0xC0 = 192 + let be: std::vec::Vec = le.iter().map(|v| v.swap_bytes()).collect(); + let mut out_le = std::vec![0u8; 4]; + let mut out_be = std::vec![0u8; 4]; + gray16_to_rgba_row::(&le, &mut out_le, 1, true); + gray16_to_rgba_row::(&be, &mut out_be, 1, true); + assert_eq!(out_le, out_be, "BE and LE gray16 rgba outputs must match"); + } + + #[test] + fn gray16_be_parity_luma() { + let le: std::vec::Vec = std::vec![0x4000u16]; // 0x40 = 64 + let be: std::vec::Vec = le.iter().map(|v| v.swap_bytes()).collect(); + let mut out_le = std::vec![0u8; 1]; + let mut out_be = std::vec![0u8; 1]; + gray16_to_luma_row::(&le, &mut out_le, 1); + gray16_to_luma_row::(&be, &mut out_be, 1); + assert_eq!(out_le, out_be, "BE and LE gray16 luma outputs must match"); + } + + #[test] + fn gray16_be_parity_luma_u16() { + // For gray16_to_luma_u16_row with BE=true, swap_bytes is applied. + // LE: 0x1234. BE encoding of that value: swap bytes → 0x3412. + // After BE kernel processes 0x3412 with swap_bytes → 0x1234. Output = 0x1234. + let le_val: u16 = 0x1234; + let le: std::vec::Vec = std::vec![le_val]; + let be: std::vec::Vec = le.iter().map(|v| v.swap_bytes()).collect(); + let mut out_le = std::vec![0u16; 1]; + let mut out_be = std::vec![0u16; 1]; + gray16_to_luma_u16_row::(&le, &mut out_le, 1); + gray16_to_luma_u16_row::(&be, &mut out_be, 1); + assert_eq!( + out_le, out_be, + "BE and LE gray16 luma_u16 outputs must match" + ); + } } diff --git a/src/row/scalar/grayf32.rs b/src/row/scalar/grayf32.rs index b762edef..f7a4d6db 100644 --- a/src/row/scalar/grayf32.rs +++ b/src/row/scalar/grayf32.rs @@ -42,11 +42,18 @@ fn f32_to_u16(y: f32) -> u16 { // ---- kernel implementations ------------------------------------------------- /// Grayf32 → packed u8 RGB. Clamp [0,1] × 255 → u8, broadcast R=G=B=Y. +/// +/// When `BE = true`, each f32 element is loaded via byte-swapped u32 bits. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn grayf32_to_rgb_row(plane: &[f32], rgb_out: &mut [u8], width: usize) { +pub(crate) fn grayf32_to_rgb_row(plane: &[f32], rgb_out: &mut [u8], width: usize) { debug_assert!(plane.len() >= width, "plane too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out too short"); - for (x, &y) in plane[..width].iter().enumerate() { + for (x, &raw) in plane[..width].iter().enumerate() { + let y = if BE { + f32::from_bits(u32::from_be(raw.to_bits())) + } else { + f32::from_bits(u32::from_le(raw.to_bits())) + }; let v = f32_to_u8(y); let i = x * 3; rgb_out[i] = v; @@ -56,11 +63,22 @@ pub(crate) fn grayf32_to_rgb_row(plane: &[f32], rgb_out: &mut [u8], width: usize } /// Grayf32 → packed u8 RGBA. Same broadcast as rgb; α = 0xFF. +/// +/// When `BE = true`, each f32 element is loaded via byte-swapped u32 bits. 
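The BE parity tests added above, and the grayf32 and ya16 ones below, all repeat one recipe: run the LE kernel on the original samples and the BE kernel on a byte-swapped copy, then require identical output. The helper below captures that recipe; it is hypothetical and not part of the crate, and `luma8` is a toy kernel standing in for the real ones.

fn assert_u16_parity<O: PartialEq + std::fmt::Debug>(
    input: &[u16],
    run_le: impl Fn(&[u16]) -> O,
    run_be: impl Fn(&[u16]) -> O,
) {
    // Byte-swap every element to produce the "other endian" copy of the row.
    let swapped: Vec<u16> = input.iter().map(|v| v.swap_bytes()).collect();
    assert_eq!(run_le(input), run_be(&swapped), "BE and LE outputs must match");
}

fn main() {
    fn luma8<const BE: bool>(src: &[u16]) -> Vec<u8> {
        src.iter()
            .map(|&v| ((if BE { u16::from_be(v) } else { u16::from_le(v) }) >> 8) as u8)
            .collect()
    }
    assert_u16_parity(&[4096, 60160, 0xFFFF], luma8::<false>, luma8::<true>);
}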
#[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn grayf32_to_rgba_row(plane: &[f32], rgba_out: &mut [u8], width: usize) { +pub(crate) fn grayf32_to_rgba_row( + plane: &[f32], + rgba_out: &mut [u8], + width: usize, +) { debug_assert!(plane.len() >= width, "plane too short"); debug_assert!(rgba_out.len() >= width * 4, "rgba_out too short"); - for (x, &y) in plane[..width].iter().enumerate() { + for (x, &raw) in plane[..width].iter().enumerate() { + let y = if BE { + f32::from_bits(u32::from_be(raw.to_bits())) + } else { + f32::from_bits(u32::from_le(raw.to_bits())) + }; let v = f32_to_u8(y); let i = x * 4; rgba_out[i] = v; @@ -71,11 +89,22 @@ pub(crate) fn grayf32_to_rgba_row(plane: &[f32], rgba_out: &mut [u8], width: usi } /// Grayf32 → packed u16 RGB. Clamp [0,1] × 65535 → u16, broadcast R=G=B=Y. +/// +/// When `BE = true`, each f32 element is loaded via byte-swapped u32 bits. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn grayf32_to_rgb_u16_row(plane: &[f32], rgb_u16_out: &mut [u16], width: usize) { +pub(crate) fn grayf32_to_rgb_u16_row( + plane: &[f32], + rgb_u16_out: &mut [u16], + width: usize, +) { debug_assert!(plane.len() >= width, "plane too short"); debug_assert!(rgb_u16_out.len() >= width * 3, "rgb_u16_out too short"); - for (x, &y) in plane[..width].iter().enumerate() { + for (x, &raw) in plane[..width].iter().enumerate() { + let y = if BE { + f32::from_bits(u32::from_be(raw.to_bits())) + } else { + f32::from_bits(u32::from_le(raw.to_bits())) + }; let v = f32_to_u16(y); let i = x * 3; rgb_u16_out[i] = v; @@ -85,11 +114,22 @@ pub(crate) fn grayf32_to_rgb_u16_row(plane: &[f32], rgb_u16_out: &mut [u16], wid } /// Grayf32 → packed u16 RGBA. Same broadcast; α = 0xFFFF. +/// +/// When `BE = true`, each f32 element is loaded via byte-swapped u32 bits. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn grayf32_to_rgba_u16_row(plane: &[f32], rgba_u16_out: &mut [u16], width: usize) { +pub(crate) fn grayf32_to_rgba_u16_row( + plane: &[f32], + rgba_u16_out: &mut [u16], + width: usize, +) { debug_assert!(plane.len() >= width, "plane too short"); debug_assert!(rgba_u16_out.len() >= width * 4, "rgba_u16_out too short"); - for (x, &y) in plane[..width].iter().enumerate() { + for (x, &raw) in plane[..width].iter().enumerate() { + let y = if BE { + f32::from_bits(u32::from_be(raw.to_bits())) + } else { + f32::from_bits(u32::from_le(raw.to_bits())) + }; let v = f32_to_u16(y); let i = x * 4; rgba_u16_out[i] = v; @@ -100,11 +140,23 @@ pub(crate) fn grayf32_to_rgba_u16_row(plane: &[f32], rgba_u16_out: &mut [u16], w } /// Grayf32 → packed f32 RGB. Lossless: replicate Y → R=G=B (no clamp, no round). +/// +/// When `BE = true`, each f32 element is byte-swapped (treats stored bits as +/// BE-encoded IEEE 754 and converts to host-native before replication). 
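The grayf32 kernels normalize through the bit pattern rather than swapping bytes in memory, which keeps every signature on `&[f32]`. A standalone sketch of that idiom (`normalize_f32` is an illustrative name):

fn normalize_f32<const BE: bool>(raw: f32) -> f32 {
    // Reinterpret as u32, convert from the stored byte order to host order, reinterpret back.
    let bits = raw.to_bits();
    f32::from_bits(if BE { u32::from_be(bits) } else { u32::from_le(bits) })
}

fn main() {
    let sample = 0.5f32;
    let swapped = f32::from_bits(sample.to_bits().swap_bytes());
    // Swapping the u32 bits is the same as reversing the element's four bytes in memory.
    assert_eq!(swapped.to_ne_bytes(), {
        let mut b = sample.to_ne_bytes();
        b.reverse();
        b
    });
    // The parity invariant used by the grayf32 tests below holds on any host;
    // comparing bit patterns sidesteps float equality edge cases.
    assert_eq!(
        normalize_f32::<true>(swapped).to_bits(),
        normalize_f32::<false>(sample).to_bits()
    );
}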
#[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn grayf32_to_rgb_f32_row(plane: &[f32], rgb_f32_out: &mut [f32], width: usize) { +pub(crate) fn grayf32_to_rgb_f32_row( + plane: &[f32], + rgb_f32_out: &mut [f32], + width: usize, +) { debug_assert!(plane.len() >= width, "plane too short"); debug_assert!(rgb_f32_out.len() >= width * 3, "rgb_f32_out too short"); - for (x, &y) in plane[..width].iter().enumerate() { + for (x, &raw) in plane[..width].iter().enumerate() { + let y = if BE { + f32::from_bits(u32::from_be(raw.to_bits())) + } else { + f32::from_bits(u32::from_le(raw.to_bits())) + }; let i = x * 3; rgb_f32_out[i] = y; rgb_f32_out[i + 1] = y; @@ -113,39 +165,74 @@ pub(crate) fn grayf32_to_rgb_f32_row(plane: &[f32], rgb_f32_out: &mut [f32], wid } /// Grayf32 → luma u8. Clamp [0,1] × 255 → u8. +/// +/// When `BE = true`, each f32 element is loaded via byte-swapped u32 bits. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn grayf32_to_luma_row(plane: &[f32], luma_out: &mut [u8], width: usize) { +pub(crate) fn grayf32_to_luma_row( + plane: &[f32], + luma_out: &mut [u8], + width: usize, +) { debug_assert!(plane.len() >= width, "plane too short"); debug_assert!(luma_out.len() >= width, "luma_out too short"); - for (out, &y) in luma_out[..width].iter_mut().zip(plane[..width].iter()) { + for (out, &raw) in luma_out[..width].iter_mut().zip(plane[..width].iter()) { + let y = if BE { + f32::from_bits(u32::from_be(raw.to_bits())) + } else { + f32::from_bits(u32::from_le(raw.to_bits())) + }; *out = f32_to_u8(y); } } /// Grayf32 → luma u16. Clamp [0,1] × 65535 → u16. +/// +/// When `BE = true`, each f32 element is loaded via byte-swapped u32 bits. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn grayf32_to_luma_u16_row(plane: &[f32], luma_u16_out: &mut [u16], width: usize) { +pub(crate) fn grayf32_to_luma_u16_row( + plane: &[f32], + luma_u16_out: &mut [u16], + width: usize, +) { debug_assert!(plane.len() >= width, "plane too short"); debug_assert!(luma_u16_out.len() >= width, "luma_u16_out too short"); - for (out, &y) in luma_u16_out[..width].iter_mut().zip(plane[..width].iter()) { + for (out, &raw) in luma_u16_out[..width].iter_mut().zip(plane[..width].iter()) { + let y = if BE { + f32::from_bits(u32::from_be(raw.to_bits())) + } else { + f32::from_bits(u32::from_le(raw.to_bits())) + }; *out = f32_to_u16(y); } } -/// Grayf32 → luma f32. Lossless pass-through (memcpy-equivalent). +/// Grayf32 → luma f32. Lossless pass-through (or byte-swap copy for BE). +/// +/// When `BE = true`, each f32 element is byte-swapped before output. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn grayf32_to_luma_f32_row(plane: &[f32], luma_f32_out: &mut [f32], width: usize) { +pub(crate) fn grayf32_to_luma_f32_row( + plane: &[f32], + luma_f32_out: &mut [f32], + width: usize, +) { debug_assert!(plane.len() >= width, "plane too short"); debug_assert!(luma_f32_out.len() >= width, "luma_f32_out too short"); - luma_f32_out[..width].copy_from_slice(&plane[..width]); + for (out, &raw) in luma_f32_out[..width].iter_mut().zip(plane[..width].iter()) { + *out = if BE { + f32::from_bits(u32::from_be(raw.to_bits())) + } else { + f32::from_bits(u32::from_le(raw.to_bits())) + }; + } } /// Grayf32 → HSV u8. Gray fast-path: H=0, S=0, V = clamp(Y, 0, 1) × 255. /// +/// When `BE = true`, each f32 element is loaded via byte-swapped u32 bits. /// Gray sources are achromatic (saturation = 0 identically). H is fixed to 0 /// to match OpenCV's `cv2.COLOR_GRAY2HSV` convention. 
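Both the u8 paths above and the HSV V channel below funnel through a clamp-and-scale conversion. One formulation consistent with the unit tests (0.5 maps to 128, out-of-range values saturate) is sketched here; `f32_to_u8_sketch` is illustrative, and the crate's actual `f32_to_u8` helper may be written differently.

fn f32_to_u8_sketch(y: f32) -> u8 {
    // Clamp to [0, 1], scale to [0, 255], round half up, truncate.
    (y.clamp(0.0, 1.0) * 255.0 + 0.5) as u8
}

fn main() {
    assert_eq!(f32_to_u8_sketch(0.0), 0);
    assert_eq!(f32_to_u8_sketch(0.5), 128); // 127.5 rounds up
    assert_eq!(f32_to_u8_sketch(1.0), 255);
    assert_eq!(f32_to_u8_sketch(1.5), 255); // clamped high
    assert_eq!(f32_to_u8_sketch(-0.1), 0);  // clamped low
}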
#[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn grayf32_to_hsv_row( +pub(crate) fn grayf32_to_hsv_row( plane: &[f32], h_out: &mut [u8], s_out: &mut [u8], @@ -156,7 +243,12 @@ pub(crate) fn grayf32_to_hsv_row( debug_assert!(h_out.len() >= width, "h_out too short"); debug_assert!(s_out.len() >= width, "s_out too short"); debug_assert!(v_out.len() >= width, "v_out too short"); - for (x, &y) in plane[..width].iter().enumerate() { + for (x, &raw) in plane[..width].iter().enumerate() { + let y = if BE { + f32::from_bits(u32::from_be(raw.to_bits())) + } else { + f32::from_bits(u32::from_le(raw.to_bits())) + }; h_out[x] = 0; s_out[x] = 0; v_out[x] = f32_to_u8(y); @@ -173,7 +265,7 @@ mod tests { fn grayf32_to_rgb_zero() { let plane = [0.0f32]; let mut out = [0xFFu8; 3]; - grayf32_to_rgb_row(&plane, &mut out, 1); + grayf32_to_rgb_row::(&plane, &mut out, 1); assert_eq!(out, [0, 0, 0]); } @@ -181,7 +273,7 @@ mod tests { fn grayf32_to_rgb_max() { let plane = [1.0f32]; let mut out = [0u8; 3]; - grayf32_to_rgb_row(&plane, &mut out, 1); + grayf32_to_rgb_row::(&plane, &mut out, 1); assert_eq!(out, [255, 255, 255]); } @@ -195,7 +287,7 @@ mod tests { // truncate` is the contract this crate uses across scalar + SIMD. let plane = [0.5f32]; let mut out = [0u8; 3]; - grayf32_to_rgb_row(&plane, &mut out, 1); + grayf32_to_rgb_row::(&plane, &mut out, 1); assert_eq!(out, [128, 128, 128]); } @@ -203,7 +295,7 @@ mod tests { fn grayf32_to_rgb_saturates_high() { let plane = [1.5f32]; let mut out = [0u8; 3]; - grayf32_to_rgb_row(&plane, &mut out, 1); + grayf32_to_rgb_row::(&plane, &mut out, 1); assert_eq!(out, [255, 255, 255]); } @@ -211,7 +303,7 @@ mod tests { fn grayf32_to_rgb_saturates_low() { let plane = [-0.1f32]; let mut out = [0xFFu8; 3]; - grayf32_to_rgb_row(&plane, &mut out, 1); + grayf32_to_rgb_row::(&plane, &mut out, 1); assert_eq!(out, [0, 0, 0]); } @@ -221,7 +313,7 @@ mod tests { fn grayf32_to_rgba_zero_alpha_opaque() { let plane = [0.0f32]; let mut out = [0u8; 4]; - grayf32_to_rgba_row(&plane, &mut out, 1); + grayf32_to_rgba_row::(&plane, &mut out, 1); assert_eq!(out, [0, 0, 0, 0xFF]); } @@ -229,7 +321,7 @@ mod tests { fn grayf32_to_rgba_max_alpha_opaque() { let plane = [1.0f32]; let mut out = [0u8; 4]; - grayf32_to_rgba_row(&plane, &mut out, 1); + grayf32_to_rgba_row::(&plane, &mut out, 1); assert_eq!(out, [255, 255, 255, 0xFF]); } @@ -239,7 +331,7 @@ mod tests { fn grayf32_to_rgb_u16_zero() { let plane = [0.0f32]; let mut out = [0xFFFFu16; 3]; - grayf32_to_rgb_u16_row(&plane, &mut out, 1); + grayf32_to_rgb_u16_row::(&plane, &mut out, 1); assert_eq!(out, [0, 0, 0]); } @@ -247,7 +339,7 @@ mod tests { fn grayf32_to_rgb_u16_max() { let plane = [1.0f32]; let mut out = [0u16; 3]; - grayf32_to_rgb_u16_row(&plane, &mut out, 1); + grayf32_to_rgb_u16_row::(&plane, &mut out, 1); assert_eq!(out, [65535, 65535, 65535]); } @@ -255,7 +347,7 @@ mod tests { fn grayf32_to_rgb_u16_saturates_high() { let plane = [2.0f32]; let mut out = [0u16; 3]; - grayf32_to_rgb_u16_row(&plane, &mut out, 1); + grayf32_to_rgb_u16_row::(&plane, &mut out, 1); assert_eq!(out, [65535, 65535, 65535]); } @@ -265,7 +357,7 @@ mod tests { fn grayf32_to_rgba_u16_opaque() { let plane = [1.0f32]; let mut out = [0u16; 4]; - grayf32_to_rgba_u16_row(&plane, &mut out, 1); + grayf32_to_rgba_u16_row::(&plane, &mut out, 1); assert_eq!(out, [65535, 65535, 65535, 0xFFFF]); } @@ -276,7 +368,7 @@ mod tests { // Non-clamped value preserved exactly. 
let plane = [1.5f32]; let mut out = [0.0f32; 3]; - grayf32_to_rgb_f32_row(&plane, &mut out, 1); + grayf32_to_rgb_f32_row::(&plane, &mut out, 1); assert_eq!(out, [1.5, 1.5, 1.5]); } @@ -284,7 +376,7 @@ mod tests { fn grayf32_to_rgb_f32_negative_preserved() { let plane = [-0.5f32]; let mut out = [0.0f32; 3]; - grayf32_to_rgb_f32_row(&plane, &mut out, 1); + grayf32_to_rgb_f32_row::(&plane, &mut out, 1); assert_eq!(out, [-0.5, -0.5, -0.5]); } @@ -294,7 +386,7 @@ mod tests { fn grayf32_to_luma_zero() { let plane = [0.0f32]; let mut out = [0xFFu8; 1]; - grayf32_to_luma_row(&plane, &mut out, 1); + grayf32_to_luma_row::(&plane, &mut out, 1); assert_eq!(out, [0]); } @@ -302,7 +394,7 @@ mod tests { fn grayf32_to_luma_max() { let plane = [1.0f32]; let mut out = [0u8; 1]; - grayf32_to_luma_row(&plane, &mut out, 1); + grayf32_to_luma_row::(&plane, &mut out, 1); assert_eq!(out, [255]); } @@ -312,7 +404,7 @@ mod tests { fn grayf32_to_luma_u16_max() { let plane = [1.0f32]; let mut out = [0u16; 1]; - grayf32_to_luma_u16_row(&plane, &mut out, 1); + grayf32_to_luma_u16_row::(&plane, &mut out, 1); assert_eq!(out, [65535]); } @@ -322,7 +414,7 @@ mod tests { fn grayf32_to_luma_f32_identity() { let plane = [0.0f32, 0.5, 1.0, 1.5, -0.1]; let mut out = [99.0f32; 5]; - grayf32_to_luma_f32_row(&plane, &mut out, 5); + grayf32_to_luma_f32_row::(&plane, &mut out, 5); // Lossless pass-through — exact bit equality. assert_eq!(out, [0.0, 0.5, 1.0, 1.5, -0.1]); } @@ -335,7 +427,7 @@ mod tests { let mut h = [0xFFu8; 1]; let mut s = [0xFFu8; 1]; let mut v = [0u8; 1]; - grayf32_to_hsv_row(&plane, &mut h, &mut s, &mut v, 1); + grayf32_to_hsv_row::(&plane, &mut h, &mut s, &mut v, 1); assert_eq!(h[0], 0, "H must be 0 for achromatic source"); assert_eq!(s[0], 0, "S must be 0 for achromatic source"); assert_eq!(v[0], 0); @@ -347,7 +439,7 @@ mod tests { let mut h = [0u8; 1]; let mut s = [0u8; 1]; let mut v = [0u8; 1]; - grayf32_to_hsv_row(&plane, &mut h, &mut s, &mut v, 1); + grayf32_to_hsv_row::(&plane, &mut h, &mut s, &mut v, 1); assert_eq!(h[0], 0); assert_eq!(s[0], 0); assert_eq!(v[0], 255); @@ -360,7 +452,7 @@ mod tests { let mut h = [0u8; 1]; let mut s = [0u8; 1]; let mut v = [0u8; 1]; - grayf32_to_hsv_row(&plane, &mut h, &mut s, &mut v, 1); + grayf32_to_hsv_row::(&plane, &mut h, &mut s, &mut v, 1); assert_eq!(h[0], 0); assert_eq!(s[0], 0); assert_eq!(v[0], 128); @@ -373,7 +465,7 @@ mod tests { let mut h = [0u8; 1]; let mut s = [0u8; 1]; let mut v = [0u8; 1]; - grayf32_to_hsv_row(&plane, &mut h, &mut s, &mut v, 1); + grayf32_to_hsv_row::(&plane, &mut h, &mut s, &mut v, 1); assert_eq!(v[0], 255); } @@ -381,9 +473,43 @@ mod tests { fn grayf32_to_rgb_multi_pixel() { let plane = [0.0f32, 1.0, 0.5]; let mut out = [0u8; 9]; - grayf32_to_rgb_row(&plane, &mut out, 3); + grayf32_to_rgb_row::(&plane, &mut out, 3); assert_eq!(&out[0..3], &[0, 0, 0]); assert_eq!(&out[3..6], &[255, 255, 255]); assert_eq!(&out[6..9], &[128, 128, 128]); // 0.5 → 128 } + + // ---- BE parity tests: grayf32 --------------------------------------------- + // Pattern: construct LE f32 input, reinterpret bytes as BE-encoded f32 + // (i.e. byte-swap the u32 bits), call BE kernel, assert output matches LE run. + + /// Helper: produce a BE-encoded copy of an f32 slice (swap u32 bits of each element). 
+ fn f32_to_be_bytes(src: &[f32]) -> std::vec::Vec { + src + .iter() + .map(|&v| f32::from_bits(v.to_bits().swap_bytes())) + .collect() + } + + #[test] + fn grayf32_be_parity_rgb() { + let le = [0.5f32]; + let be = f32_to_be_bytes(&le); + let mut out_le = [0u8; 3]; + let mut out_be = [0u8; 3]; + grayf32_to_rgb_row::(&le, &mut out_le, 1); + grayf32_to_rgb_row::(&be, &mut out_be, 1); + assert_eq!(out_le, out_be, "BE and LE grayf32 rgb outputs must match"); + } + + #[test] + fn grayf32_be_parity_luma() { + let le = [0.25f32]; + let be = f32_to_be_bytes(&le); + let mut out_le = [0u8; 1]; + let mut out_be = [0u8; 1]; + grayf32_to_luma_row::(&le, &mut out_le, 1); + grayf32_to_luma_row::(&be, &mut out_be, 1); + assert_eq!(out_le, out_be, "BE and LE grayf32 luma outputs must match"); + } } diff --git a/src/row/scalar/ya16.rs b/src/row/scalar/ya16.rs index b2d8f831..8ca3fe5a 100644 --- a/src/row/scalar/ya16.rs +++ b/src/row/scalar/ya16.rs @@ -19,12 +19,19 @@ //! α is dropped for HSV output. /// Ya16 → packed u8 RGB. Y `>> 8`, broadcast R=G=B; α dropped. +/// +/// When `BE = true`, each u16 element is byte-swapped before processing. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn ya16_to_rgb_row(packed: &[u16], rgb_out: &mut [u8], width: usize) { +pub(crate) fn ya16_to_rgb_row(packed: &[u16], rgb_out: &mut [u8], width: usize) { debug_assert!(packed.len() >= width * 2, "packed too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out too short"); for x in 0..width { - let y8 = (packed[x * 2] >> 8) as u8; + let y_raw = if BE { + u16::from_be(packed[x * 2]) + } else { + u16::from_le(packed[x * 2]) + }; + let y8 = (y_raw >> 8) as u8; let i = x * 3; rgb_out[i] = y8; rgb_out[i + 1] = y8; @@ -33,13 +40,25 @@ pub(crate) fn ya16_to_rgb_row(packed: &[u16], rgb_out: &mut [u8], width: usize) } /// Ya16 → packed u8 RGBA. Y `>> 8`, broadcast R=G=B; A `>> 8` from source slot 1. +/// +/// When `BE = true`, each u16 element is byte-swapped before processing. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn ya16_to_rgba_row(packed: &[u16], rgba_out: &mut [u8], width: usize) { +pub(crate) fn ya16_to_rgba_row(packed: &[u16], rgba_out: &mut [u8], width: usize) { debug_assert!(packed.len() >= width * 2, "packed too short"); debug_assert!(rgba_out.len() >= width * 4, "rgba_out too short"); for x in 0..width { - let y8 = (packed[x * 2] >> 8) as u8; - let a8 = (packed[x * 2 + 1] >> 8) as u8; + let y_raw = if BE { + u16::from_be(packed[x * 2]) + } else { + u16::from_le(packed[x * 2]) + }; + let a_raw = if BE { + u16::from_be(packed[x * 2 + 1]) + } else { + u16::from_le(packed[x * 2 + 1]) + }; + let y8 = (y_raw >> 8) as u8; + let a8 = (a_raw >> 8) as u8; let i = x * 4; rgba_out[i] = y8; rgba_out[i + 1] = y8; @@ -49,12 +68,22 @@ pub(crate) fn ya16_to_rgba_row(packed: &[u16], rgba_out: &mut [u8], width: usize } /// Ya16 → packed u16 RGB. Y native u16, broadcast R=G=B=Y; α dropped. +/// +/// When `BE = true`, each u16 element is byte-swapped before processing. 
#[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn ya16_to_rgb_u16_row(packed: &[u16], rgb_u16_out: &mut [u16], width: usize) { +pub(crate) fn ya16_to_rgb_u16_row( + packed: &[u16], + rgb_u16_out: &mut [u16], + width: usize, +) { debug_assert!(packed.len() >= width * 2, "packed too short"); debug_assert!(rgb_u16_out.len() >= width * 3, "rgb_u16_out too short"); for x in 0..width { - let y = packed[x * 2]; + let y = if BE { + u16::from_be(packed[x * 2]) + } else { + u16::from_le(packed[x * 2]) + }; let i = x * 3; rgb_u16_out[i] = y; rgb_u16_out[i + 1] = y; @@ -63,13 +92,27 @@ pub(crate) fn ya16_to_rgb_u16_row(packed: &[u16], rgb_u16_out: &mut [u16], width } /// Ya16 → packed u16 RGBA. Y native u16, broadcast; A native u16 from source slot 1. +/// +/// When `BE = true`, each u16 element is byte-swapped before processing. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn ya16_to_rgba_u16_row(packed: &[u16], rgba_u16_out: &mut [u16], width: usize) { +pub(crate) fn ya16_to_rgba_u16_row( + packed: &[u16], + rgba_u16_out: &mut [u16], + width: usize, +) { debug_assert!(packed.len() >= width * 2, "packed too short"); debug_assert!(rgba_u16_out.len() >= width * 4, "rgba_u16_out too short"); for x in 0..width { - let y = packed[x * 2]; - let a = packed[x * 2 + 1]; + let y = if BE { + u16::from_be(packed[x * 2]) + } else { + u16::from_le(packed[x * 2]) + }; + let a = if BE { + u16::from_be(packed[x * 2 + 1]) + } else { + u16::from_le(packed[x * 2 + 1]) + }; let i = x * 4; rgba_u16_out[i] = y; rgba_u16_out[i + 1] = y; @@ -79,30 +122,48 @@ pub(crate) fn ya16_to_rgba_u16_row(packed: &[u16], rgba_u16_out: &mut [u16], wid } /// Ya16 → luma u8. Y `>> 8`. +/// +/// When `BE = true`, each u16 element is byte-swapped before processing. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn ya16_to_luma_row(packed: &[u16], luma_out: &mut [u8], width: usize) { +pub(crate) fn ya16_to_luma_row(packed: &[u16], luma_out: &mut [u8], width: usize) { debug_assert!(packed.len() >= width * 2, "packed too short"); debug_assert!(luma_out.len() >= width, "luma_out too short"); for x in 0..width { - luma_out[x] = (packed[x * 2] >> 8) as u8; + let y = if BE { + u16::from_be(packed[x * 2]) + } else { + u16::from_le(packed[x * 2]) + }; + luma_out[x] = (y >> 8) as u8; } } -/// Ya16 → luma u16. Y native u16 pass-through. +/// Ya16 → luma u16. Y native u16 pass-through (or byte-swap for BE). +/// +/// When `BE = true`, each u16 element is byte-swapped before output. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn ya16_to_luma_u16_row(packed: &[u16], luma_u16_out: &mut [u16], width: usize) { +pub(crate) fn ya16_to_luma_u16_row( + packed: &[u16], + luma_u16_out: &mut [u16], + width: usize, +) { debug_assert!(packed.len() >= width * 2, "packed too short"); debug_assert!(luma_u16_out.len() >= width, "luma_u16_out too short"); for x in 0..width { - luma_u16_out[x] = packed[x * 2]; + luma_u16_out[x] = if BE { + u16::from_be(packed[x * 2]) + } else { + u16::from_le(packed[x * 2]) + }; } } /// Ya16 → HSV u8. Gray fast-path: H=0, S=0, V = Y `>> 8`. α dropped. /// +/// When `BE = true`, each u16 element is byte-swapped before processing. /// See [`super::gray::gray8_to_hsv_row`] for the S=0 convention. 
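All of the Ya16 kernels index the same interleaved layout: element `2*x` is the luma sample for pixel `x` and element `2*x + 1` is its alpha. A tiny sketch of that addressing plus the `>> 8` downshift used by the 8-bit paths (`ya16_pixel` is an illustrative name):

fn ya16_pixel(packed: &[u16], x: usize) -> (u16, u16) {
    (packed[x * 2], packed[x * 2 + 1]) // (Y, A) for pixel x
}

fn main() {
    let packed = [0x8000u16, 0x4000]; // one pixel: Y = 0x8000, A = 0x4000
    let (y, a) = ya16_pixel(&packed, 0);
    assert_eq!(((y >> 8) as u8, (a >> 8) as u8), (0x80, 0x40)); // what the u8 RGBA path emits
}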
#[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn ya16_to_hsv_row( +pub(crate) fn ya16_to_hsv_row( packed: &[u16], h_out: &mut [u8], s_out: &mut [u8], @@ -114,9 +175,14 @@ pub(crate) fn ya16_to_hsv_row( debug_assert!(s_out.len() >= width, "s_out too short"); debug_assert!(v_out.len() >= width, "v_out too short"); for x in 0..width { + let y = if BE { + u16::from_be(packed[x * 2]) + } else { + u16::from_le(packed[x * 2]) + }; h_out[x] = 0; s_out[x] = 0; - v_out[x] = (packed[x * 2] >> 8) as u8; + v_out[x] = (y >> 8) as u8; } } @@ -136,7 +202,7 @@ mod tests { // Y=0x8000, A=0x4000 → rgb [0x80, 0x80, 0x80] let p = packed_ya(&[(0x8000, 0x4000)]); let mut out = [0u8; 3]; - ya16_to_rgb_row(&p, &mut out, 1); + ya16_to_rgb_row::(&p, &mut out, 1); assert_eq!(out, [0x80, 0x80, 0x80]); } @@ -144,7 +210,7 @@ mod tests { fn ya16_to_rgb_zero_pixel() { let p = packed_ya(&[(0, 0)]); let mut out = [0xFFu8; 3]; - ya16_to_rgb_row(&p, &mut out, 1); + ya16_to_rgb_row::(&p, &mut out, 1); assert_eq!(out, [0, 0, 0]); } @@ -152,7 +218,7 @@ mod tests { fn ya16_to_rgb_max_y() { let p = packed_ya(&[(0xFFFF, 0)]); let mut out = [0u8; 3]; - ya16_to_rgb_row(&p, &mut out, 1); + ya16_to_rgb_row::(&p, &mut out, 1); assert_eq!(out, [0xFF, 0xFF, 0xFF]); } @@ -163,7 +229,7 @@ mod tests { // Y=0x8000, A=0x4000 → rgba [0x80, 0x80, 0x80, 0x40] let p = packed_ya(&[(0x8000, 0x4000)]); let mut out = [0u8; 4]; - ya16_to_rgba_row(&p, &mut out, 1); + ya16_to_rgba_row::(&p, &mut out, 1); assert_eq!(out, [0x80, 0x80, 0x80, 0x40]); } @@ -171,7 +237,7 @@ mod tests { fn ya16_to_rgba_two_pixels() { let p = packed_ya(&[(0x8000, 0x4000), (0x1000, 0x0800)]); let mut out = [0u8; 8]; - ya16_to_rgba_row(&p, &mut out, 2); + ya16_to_rgba_row::(&p, &mut out, 2); assert_eq!(&out[0..4], &[0x80, 0x80, 0x80, 0x40]); assert_eq!(&out[4..8], &[0x10, 0x10, 0x10, 0x08]); } @@ -183,7 +249,7 @@ mod tests { // Y=0x8000 native, broadcast let p = packed_ya(&[(0x8000, 0x4000)]); let mut out = [0u16; 3]; - ya16_to_rgb_u16_row(&p, &mut out, 1); + ya16_to_rgb_u16_row::(&p, &mut out, 1); assert_eq!(out, [0x8000, 0x8000, 0x8000]); } @@ -191,7 +257,7 @@ mod tests { fn ya16_to_rgb_u16_zero() { let p = packed_ya(&[(0, 0)]); let mut out = [0xFFFFu16; 3]; - ya16_to_rgb_u16_row(&p, &mut out, 1); + ya16_to_rgb_u16_row::(&p, &mut out, 1); assert_eq!(out, [0, 0, 0]); } @@ -202,7 +268,7 @@ mod tests { // Y=0x8000, A=0x4000 → rgba_u16 [0x8000, 0x8000, 0x8000, 0x4000] let p = packed_ya(&[(0x8000, 0x4000)]); let mut out = [0u16; 4]; - ya16_to_rgba_u16_row(&p, &mut out, 1); + ya16_to_rgba_u16_row::(&p, &mut out, 1); assert_eq!(out, [0x8000, 0x8000, 0x8000, 0x4000]); } @@ -212,7 +278,7 @@ mod tests { fn ya16_to_luma_downshifts() { let p = packed_ya(&[(0x8000, 0x4000), (0x0000, 0xFFFF)]); let mut out = [0u8; 2]; - ya16_to_luma_row(&p, &mut out, 2); + ya16_to_luma_row::(&p, &mut out, 2); assert_eq!(out, [0x80, 0x00]); } @@ -222,7 +288,7 @@ mod tests { fn ya16_to_luma_u16_native_passthrough() { let p = packed_ya(&[(0x8000, 0x0000)]); let mut out = [0u16; 1]; - ya16_to_luma_u16_row(&p, &mut out, 1); + ya16_to_luma_u16_row::(&p, &mut out, 1); assert_eq!(out[0], 0x8000); } @@ -235,7 +301,7 @@ mod tests { let mut h = [0xFFu8; 1]; let mut s = [0xFFu8; 1]; let mut v = [0u8; 1]; - ya16_to_hsv_row(&p, &mut h, &mut s, &mut v, 1); + ya16_to_hsv_row::(&p, &mut h, &mut s, &mut v, 1); assert_eq!(h[0], 0); assert_eq!(s[0], 0); assert_eq!(v[0], 0x80); @@ -247,7 +313,7 @@ mod tests { let mut h = [0u8; 1]; let mut s = [0u8; 1]; let mut v = [0xFFu8; 1]; - ya16_to_hsv_row(&p, &mut h, &mut s, &mut 
v, 1); + ya16_to_hsv_row::(&p, &mut h, &mut s, &mut v, 1); assert_eq!(v[0], 0); } @@ -257,7 +323,46 @@ mod tests { let mut h = [0u8; 1]; let mut s = [0u8; 1]; let mut v = [0u8; 1]; - ya16_to_hsv_row(&p, &mut h, &mut s, &mut v, 1); + ya16_to_hsv_row::(&p, &mut h, &mut s, &mut v, 1); assert_eq!(v[0], 0xFF); } + + // ---- BE parity tests: ya16 ------------------------------------------------- + // Pattern: construct LE packed input, byte-swap each u16 element to produce + // BE input, call BE kernel, assert output equals LE-input run output. + + #[test] + fn ya16_be_parity_rgb() { + // Y=0x8000, A=0x4000 LE → RGB [0x80, 0x80, 0x80] + let le = packed_ya(&[(0x8000, 0x4000)]); + let be: std::vec::Vec = le.iter().map(|v| v.swap_bytes()).collect(); + let mut out_le = [0u8; 3]; + let mut out_be = [0u8; 3]; + ya16_to_rgb_row::(&le, &mut out_le, 1); + ya16_to_rgb_row::(&be, &mut out_be, 1); + assert_eq!(out_le, out_be, "BE and LE ya16 rgb outputs must match"); + } + + #[test] + fn ya16_be_parity_rgba() { + // Y=0x8000, A=0x4000 LE → RGBA [0x80, 0x80, 0x80, 0x40] + let le = packed_ya(&[(0x8000, 0x4000)]); + let be: std::vec::Vec = le.iter().map(|v| v.swap_bytes()).collect(); + let mut out_le = [0u8; 4]; + let mut out_be = [0u8; 4]; + ya16_to_rgba_row::(&le, &mut out_le, 1); + ya16_to_rgba_row::(&be, &mut out_be, 1); + assert_eq!(out_le, out_be, "BE and LE ya16 rgba outputs must match"); + } + + #[test] + fn ya16_be_parity_luma() { + let le = packed_ya(&[(0xC000, 0x0000)]); + let be: std::vec::Vec = le.iter().map(|v| v.swap_bytes()).collect(); + let mut out_le = [0u8; 1]; + let mut out_be = [0u8; 1]; + ya16_to_luma_row::(&le, &mut out_le, 1); + ya16_to_luma_row::(&be, &mut out_be, 1); + assert_eq!(out_le, out_be, "BE and LE ya16 luma outputs must match"); + } } diff --git a/src/sinker/mixed/gray.rs b/src/sinker/mixed/gray.rs index 592befd1..b588e324 100644 --- a/src/sinker/mixed/gray.rs +++ b/src/sinker/mixed/gray.rs @@ -268,7 +268,7 @@ fn process_gray_n<'a, const BITS: u32>( // Luma u8 — always passes raw Y through, no full_range rescaling. if let Some(buf) = luma.as_deref_mut() { - gray_n_to_luma_row::( + gray_n_to_luma_row::( y_plane, &mut buf[one_plane_start..one_plane_end], w, @@ -278,7 +278,7 @@ fn process_gray_n<'a, const BITS: u32>( // Luma u16 — always passes raw Y through, no full_range rescaling. 
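The sinker call sites in this file pin the endianness parameter at compile time, so each source format gets exactly one monomorphized kernel. Purely as a hypothetical illustration (none of these names are crate code), a caller holding a runtime byte-order flag would resolve it once per plane like this:

fn to_luma<const BE: bool>(y_plane: &[u16], out: &mut [u8]) {
    for (o, &v) in out.iter_mut().zip(y_plane) {
        let v = if BE { u16::from_be(v) } else { u16::from_le(v) };
        *o = (v >> 8) as u8;
    }
}

fn to_luma_dyn(y_plane: &[u16], out: &mut [u8], big_endian: bool) {
    // One branch per plane; inside the loop the `if BE` is resolved at compile time.
    if big_endian {
        to_luma::<true>(y_plane, out)
    } else {
        to_luma::<false>(y_plane, out)
    }
}

fn main() {
    let y = [0x8000u16.to_le(), 0x1234u16.to_le()]; // samples stored little-endian
    let mut out = [0u8; 2];
    to_luma_dyn(&y, &mut out, false);
    assert_eq!(out, [0x80, 0x12]);
}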
     if let Some(buf) = luma_u16.as_deref_mut() {
-        gray_n_to_luma_u16_row::(
+        gray_n_to_luma_u16_row::(
             y_plane,
             &mut buf[one_plane_start..one_plane_end],
             w,
@@ -294,7 +294,7 @@ fn process_gray_n<'a, const BITS: u32>(
         let rgba_u16_buf = rgba_u16.as_deref_mut().unwrap();
         let rgba_u16_row =
             rgba_u16_plane_row_slice(rgba_u16_buf, one_plane_start, one_plane_end, w, h)?;
-        gray_n_to_rgba_u16_row::(y_plane, rgba_u16_row, w, use_simd, full_range);
+        gray_n_to_rgba_u16_row::(y_plane, rgba_u16_row, w, use_simd, full_range);
     } else if want_rgb_u16 {
         let rgb_u16_buf = rgb_u16.as_deref_mut().unwrap();
         let rgb_plane_start = one_plane_start * 3;
@@ -306,7 +306,7 @@ fn process_gray_n<'a, const BITS: u32>(
             channels: 3,
         })?;
         let rgb_u16_row = &mut rgb_u16_buf[rgb_plane_start..rgb_plane_end];
-        gray_n_to_rgb_u16_row::(y_plane, rgb_u16_row, w, use_simd, full_range);
+        gray_n_to_rgb_u16_row::(y_plane, rgb_u16_row, w, use_simd, full_range);
         if want_rgba_u16 {
             let rgba_u16_buf = rgba_u16.as_deref_mut().unwrap();
             let rgba_u16_row =
@@ -324,7 +324,7 @@ fn process_gray_n<'a, const BITS: u32>(
     if want_rgba && !want_rgb && !want_hsv {
         let rgba_buf = rgba.as_deref_mut().unwrap();
         let rgba_row = rgba_plane_row_slice(rgba_buf, one_plane_start, one_plane_end, w, h)?;
-        gray_n_to_rgba_row::(y_plane, rgba_row, w, use_simd, full_range);
+        gray_n_to_rgba_row::(y_plane, rgba_row, w, use_simd, full_range);
         return Ok(());
     }
 
@@ -332,7 +332,7 @@ fn process_gray_n<'a, const BITS: u32>(
     // (rescaled if limited-range).
     if want_hsv && !want_rgb && !want_rgba {
         let hsv = hsv.as_mut().unwrap();
-        gray_n_to_hsv_row::(
+        gray_n_to_hsv_row::(
             y_plane,
             &mut hsv.h[one_plane_start..one_plane_end],
             &mut hsv.s[one_plane_start..one_plane_end],
@@ -356,7 +356,7 @@ fn process_gray_n<'a, const BITS: u32>(
         w,
         h,
     )?;
-    gray_n_to_rgb_row::(y_plane, rgb_row, w, use_simd, full_range);
+    gray_n_to_rgb_row::(y_plane, rgb_row, w, use_simd, full_range);
 
     if let Some(hsv) = hsv.as_mut() {
         rgb_to_hsv_row(
@@ -690,7 +690,7 @@ impl PixelSink for MixedSinker<'_, Gray16> {
 
         // Luma u8 — shift >> 8.
         if let Some(buf) = luma.as_deref_mut() {
-            gray16_to_luma_row(
+            gray16_to_luma_row::(
                 y_plane,
                 &mut buf[one_plane_start..one_plane_end],
                 w,
@@ -700,7 +700,7 @@ impl PixelSink for MixedSinker<'_, Gray16> {
 
         // Luma u16 — identity copy.
         if let Some(buf) = luma_u16.as_deref_mut() {
-            gray16_to_luma_u16_row(
+            gray16_to_luma_u16_row::(
                 y_plane,
                 &mut buf[one_plane_start..one_plane_end],
                 w,
@@ -716,7 +716,7 @@ impl PixelSink for MixedSinker<'_, Gray16> {
             let rgba_u16_buf = rgba_u16.as_deref_mut().unwrap();
             let rgba_u16_row =
                 rgba_u16_plane_row_slice(rgba_u16_buf, one_plane_start, one_plane_end, w, h)?;
-            gray16_to_rgba_u16_row(y_plane, rgba_u16_row, w, use_simd, full_range);
+            gray16_to_rgba_u16_row::(y_plane, rgba_u16_row, w, use_simd, full_range);
         } else if want_rgb_u16 {
             let rgb_u16_buf = rgb_u16.as_deref_mut().unwrap();
             let rgb_plane_start = one_plane_start * 3;
@@ -729,7 +729,7 @@ impl PixelSink for MixedSinker<'_, Gray16> {
                 channels: 3,
             })?;
             let rgb_u16_row = &mut rgb_u16_buf[rgb_plane_start..rgb_plane_end];
-            gray16_to_rgb_u16_row(y_plane, rgb_u16_row, w, use_simd, full_range);
+            gray16_to_rgb_u16_row::(y_plane, rgb_u16_row, w, use_simd, full_range);
             if want_rgba_u16 {
                 let rgba_u16_buf = rgba_u16.as_deref_mut().unwrap();
                 let rgba_u16_row =
@@ -750,7 +750,7 @@ impl PixelSink for MixedSinker<'_, Gray16> {
         if want_rgba && !need_rgb_kernel && !want_hsv {
             let rgba_buf = rgba.as_deref_mut().unwrap();
             let rgba_row = rgba_plane_row_slice(rgba_buf, one_plane_start, one_plane_end, w, h)?;
-            gray16_to_rgba_row(y_plane, rgba_row, w, use_simd, full_range);
+            gray16_to_rgba_row::(y_plane, rgba_row, w, use_simd, full_range);
             return Ok(());
         }
 
@@ -758,7 +758,7 @@ impl PixelSink for MixedSinker<'_, Gray16> {
         // Skip RGB scratch entirely when only HSV (and optionally RGBA) is needed.
         if want_hsv && !want_rgb {
             let hsv = hsv.as_mut().unwrap();
-            gray16_to_hsv_row(
+            gray16_to_hsv_row::(
                 y_plane,
                 &mut hsv.h[one_plane_start..one_plane_end],
                 &mut hsv.s[one_plane_start..one_plane_end],
@@ -769,7 +769,7 @@ impl PixelSink for MixedSinker<'_, Gray16> {
             );
             if let Some(buf) = rgba.as_deref_mut() {
                 let rgba_row = rgba_plane_row_slice(buf, one_plane_start, one_plane_end, w, h)?;
-                gray16_to_rgba_row(y_plane, rgba_row, w, use_simd, full_range);
+                gray16_to_rgba_row::(y_plane, rgba_row, w, use_simd, full_range);
             }
             return Ok(());
         }
@@ -786,7 +786,7 @@ impl PixelSink for MixedSinker<'_, Gray16> {
             w,
             h,
         )?;
-        gray16_to_rgb_row(y_plane, rgb_row, w, use_simd, full_range);
+        gray16_to_rgb_row::(y_plane, rgb_row, w, use_simd, full_range);
 
         if let Some(hsv) = hsv.as_mut() {
             rgb_to_hsv_row(
@@ -970,7 +970,7 @@ impl PixelSink for MixedSinker<'_, Grayf32> {
 
         // luma f32 pass-through — highest priority (no clamp, no round).
         if let Some(buf) = self.luma_f32.as_deref_mut() {
-            grayf32_to_luma_f32_row(
+            grayf32_to_luma_f32_row::(
                 y_plane,
                 &mut buf[one_plane_start..one_plane_end],
                 w,
@@ -988,12 +988,12 @@ impl PixelSink for MixedSinker<'_, Grayf32> {
                 height: h,
                 channels: 3,
             })?;
-            grayf32_to_rgb_f32_row(y_plane, &mut buf[rgb_f32_start..rgb_f32_end], w, use_simd);
+            grayf32_to_rgb_f32_row::(y_plane, &mut buf[rgb_f32_start..rgb_f32_end], w, use_simd);
         }
 
         // luma u8.
         if let Some(buf) = self.luma.as_deref_mut() {
-            grayf32_to_luma_row(
+            grayf32_to_luma_row::(
                 y_plane,
                 &mut buf[one_plane_start..one_plane_end],
                 w,
@@ -1003,7 +1003,7 @@ impl PixelSink for MixedSinker<'_, Grayf32> {
 
         // luma u16.
         if let Some(buf) = self.luma_u16.as_deref_mut() {
-            grayf32_to_luma_u16_row(
+            grayf32_to_luma_u16_row::(
                 y_plane,
                 &mut buf[one_plane_start..one_plane_end],
                 w,
@@ -1019,7 +1019,7 @@ impl PixelSink for MixedSinker<'_, Grayf32> {
             let rgba_u16_buf = self.rgba_u16.as_deref_mut().unwrap();
             let rgba_u16_row =
                 rgba_u16_plane_row_slice(rgba_u16_buf, one_plane_start, one_plane_end, w, h)?;
-            grayf32_to_rgba_u16_row(y_plane, rgba_u16_row, w, use_simd);
+            grayf32_to_rgba_u16_row::(y_plane, rgba_u16_row, w, use_simd);
         } else if want_rgb_u16 {
             let rgb_u16_buf = self.rgb_u16.as_deref_mut().unwrap();
             let rgb_plane_start = one_plane_start * 3;
@@ -1032,7 +1032,7 @@ impl PixelSink for MixedSinker<'_, Grayf32> {
                 channels: 3,
             })?;
             let rgb_u16_row = &mut rgb_u16_buf[rgb_plane_start..rgb_plane_end];
-            grayf32_to_rgb_u16_row(y_plane, rgb_u16_row, w, use_simd);
+            grayf32_to_rgb_u16_row::(y_plane, rgb_u16_row, w, use_simd);
             if want_rgba_u16 {
                 let rgba_u16_buf = self.rgba_u16.as_deref_mut().unwrap();
                 let rgba_u16_row =
@@ -1050,14 +1050,14 @@ impl PixelSink for MixedSinker<'_, Grayf32> {
         if want_rgba && !want_rgb && !want_hsv {
             let rgba_buf = self.rgba.as_deref_mut().unwrap();
             let rgba_row = rgba_plane_row_slice(rgba_buf, one_plane_start, one_plane_end, w, h)?;
-            grayf32_to_rgba_row(y_plane, rgba_row, w, use_simd);
+            grayf32_to_rgba_row::(y_plane, rgba_row, w, use_simd);
             return Ok(());
         }
 
         // Standalone HSV fast path — Grayf32 always has H=0, S=0, V=clamp(Y)×255.
         if want_hsv && !want_rgb {
             let hsv = self.hsv.as_mut().unwrap();
-            grayf32_to_hsv_row(
+            grayf32_to_hsv_row::(
                 y_plane,
                 &mut hsv.h[one_plane_start..one_plane_end],
                 &mut hsv.s[one_plane_start..one_plane_end],
@@ -1067,7 +1067,7 @@ impl PixelSink for MixedSinker<'_, Grayf32> {
             );
             if let Some(buf) = self.rgba.as_deref_mut() {
                 let rgba_row = rgba_plane_row_slice(buf, one_plane_start, one_plane_end, w, h)?;
-                grayf32_to_rgba_row(y_plane, rgba_row, w, use_simd);
+                grayf32_to_rgba_row::(y_plane, rgba_row, w, use_simd);
             }
             return Ok(());
         }
@@ -1084,7 +1084,7 @@ impl PixelSink for MixedSinker<'_, Grayf32> {
             w,
             h,
        )?;
-        grayf32_to_rgb_row(y_plane, rgb_row, w, use_simd);
+        grayf32_to_rgb_row::(y_plane, rgb_row, w, use_simd);
 
         if let Some(hsv) = self.hsv.as_mut() {
             rgb_to_hsv_row(
@@ -1454,7 +1454,7 @@ impl PixelSink for MixedSinker<'_, Ya16> {
 
         // luma u8 — `Y >> 8`.
         if let Some(buf) = self.luma.as_deref_mut() {
-            ya16_to_luma_row(
+            ya16_to_luma_row::(
                 packed,
                 &mut buf[one_plane_start..one_plane_end],
                 w,
@@ -1464,7 +1464,7 @@ impl PixelSink for MixedSinker<'_, Ya16> {
 
         // luma u16 — native pass-through.
         if let Some(buf) = self.luma_u16.as_deref_mut() {
-            ya16_to_luma_u16_row(
+            ya16_to_luma_u16_row::(
                 packed,
                 &mut buf[one_plane_start..one_plane_end],
                 w,
@@ -1480,7 +1480,7 @@ impl PixelSink for MixedSinker<'_, Ya16> {
             let rgba_u16_buf = self.rgba_u16.as_deref_mut().unwrap();
             let rgba_u16_row =
                 rgba_u16_plane_row_slice(rgba_u16_buf, one_plane_start, one_plane_end, w, h)?;
-            ya16_to_rgba_u16_row(packed, rgba_u16_row, w, use_simd);
+            ya16_to_rgba_u16_row::(packed, rgba_u16_row, w, use_simd);
         } else if want_rgb_u16 {
             let rgb_u16_buf = self.rgb_u16.as_deref_mut().unwrap();
             let rgb_plane_start = one_plane_start * 3;
@@ -1493,7 +1493,7 @@ impl PixelSink for MixedSinker<'_, Ya16> {
                 channels: 3,
             })?;
             let rgb_u16_row = &mut rgb_u16_buf[rgb_plane_start..rgb_plane_end];
-            ya16_to_rgb_u16_row(packed, rgb_u16_row, w, use_simd);
+            ya16_to_rgb_u16_row::(packed, rgb_u16_row, w, use_simd);
             if want_rgba_u16 {
                 let rgba_u16_buf = self.rgba_u16.as_deref_mut().unwrap();
                 let rgba_u16_row =
@@ -1513,14 +1513,14 @@ impl PixelSink for MixedSinker<'_, Ya16> {
         if want_rgba && !want_rgb && !want_hsv {
             let rgba_buf = self.rgba.as_deref_mut().unwrap();
             let rgba_row = rgba_plane_row_slice(rgba_buf, one_plane_start, one_plane_end, w, h)?;
-            ya16_to_rgba_row(packed, rgba_row, w, use_simd);
+            ya16_to_rgba_row::(packed, rgba_row, w, use_simd);
             return Ok(());
         }
 
         // Standalone HSV fast path.
         if want_hsv && !want_rgb && !want_rgba {
             let hsv = self.hsv.as_mut().unwrap();
-            ya16_to_hsv_row(
+            ya16_to_hsv_row::(
                 packed,
                 &mut hsv.h[one_plane_start..one_plane_end],
                 &mut hsv.s[one_plane_start..one_plane_end],
@@ -1544,7 +1544,7 @@ impl PixelSink for MixedSinker<'_, Ya16> {
             w,
             h,
         )?;
-        ya16_to_rgb_row(packed, rgb_row, w, use_simd);
+        ya16_to_rgb_row::(packed, rgb_row, w, use_simd);
 
         if let Some(hsv) = self.hsv.as_mut() {
             rgb_to_hsv_row(
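The hunks above adjust how the per-row grayscale kernels are invoked: their const-generic parameters are passed explicitly at the call site via turbofish rather than left to inference. The sketch below is illustrative only; gray_n_to_rgb_row_sketch, its argument list, and the main driver are hypothetical stand-ins, not this crate's real signatures. It shows why the turbofish is needed: a const BITS: u32 parameter never appears in the argument types, so the compiler cannot infer it and the caller has to name it.

// Hypothetical, simplified stand-in for a per-row kernel: expand an N-bit
// grayscale plane to 8-bit RGB. `BITS` is a const generic, so it cannot be
// inferred from the slice arguments and must be supplied via turbofish.
fn gray_n_to_rgb_row_sketch<const BITS: u32>(y_plane: &[u16], out: &mut [u8], width: usize) {
    let shift = BITS - 8;                   // e.g. 10-bit to 8-bit drops the low 2 bits
    let mask = ((1u32 << BITS) - 1) as u16; // keep only the valid low BITS bits
    for (y, px) in y_plane[..width].iter().zip(out.chunks_exact_mut(3)) {
        let v = ((y & mask) >> shift) as u8; // replicate luma into R, G, B
        px[0] = v;
        px[1] = v;
        px[2] = v;
    }
}

fn main() {
    let y = [512u16; 8];                    // mid-gray in a 10-bit plane
    let mut rgb = [0u8; 8 * 3];
    // The const generic must be named explicitly at the call site, which is the
    // same pattern the hunks above apply to the crate's row kernels.
    gray_n_to_rgb_row_sketch::<10>(&y, &mut rgb, 8);
    assert!(rgb.iter().all(|&c| c == 128));
}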