diff --git a/src/row/arch/neon/gray.rs b/src/row/arch/neon/gray.rs
index a4d24371..15e2bf7c 100644
--- a/src/row/arch/neon/gray.rs
+++ b/src/row/arch/neon/gray.rs
@@ -15,7 +15,10 @@
 use core::arch::aarch64::*;
 
-use crate::row::scalar::{bits_mask, gray as scalar};
+use crate::row::{
+    arch::neon::endian::{load_endian_u16x8, load_endian_u32x4},
+    scalar::{bits_mask, gray as scalar},
+};
 
 // ---- helpers -----------------------------------------------------------------
 
@@ -179,7 +182,7 @@ pub(crate) unsafe fn gray8_to_hsv_row(
 /// NEON must be available. Slices sized correctly for `width`.
 #[inline]
 #[target_feature(enable = "neon")]
-pub(crate) unsafe fn gray_n_to_rgb_row<const BITS: usize>(
+pub(crate) unsafe fn gray_n_to_rgb_row<const BITS: usize, const BE: bool>(
     y_plane: &[u16],
     out: &mut [u8],
     width: usize,
@@ -188,7 +191,7 @@ pub(crate) unsafe fn gray_n_to_rgb_row<const BITS: usize>(
     debug_assert!(y_plane.len() >= width);
     debug_assert!(out.len() >= width * 3);
     if !full_range {
-        return scalar::gray_n_to_rgb_row::<BITS>(y_plane, out, width, full_range);
+        return scalar::gray_n_to_rgb_row::<BITS, BE>(y_plane, out, width, full_range);
     }
     let shift = (BITS - 8) as i32;
     let mask = bits_mask::<BITS>();
@@ -196,7 +199,7 @@ pub(crate) unsafe fn gray_n_to_rgb_row<const BITS: usize>(
     unsafe {
         let mask_v = vdupq_n_u16(mask);
         while x + 8 <= width {
-            let raw = vld1q_u16(y_plane.as_ptr().add(x));
+            let raw = load_endian_u16x8::<BE>(y_plane.as_ptr().cast::<u8>().add(x * 2));
             let masked = vandq_u16(raw, mask_v);
             let shifted = vshlq_u16(masked, vdupq_n_s16(-(shift as i16)));
             // narrow u16x8 → u8x8
@@ -208,7 +211,7 @@ pub(crate) unsafe fn gray_n_to_rgb_row<const BITS: usize>(
         }
     }
     if x < width {
-        scalar::gray_n_to_rgb_row::<BITS>(
+        scalar::gray_n_to_rgb_row::<BITS, BE>(
             &y_plane[x..width],
             &mut out[x * 3..width * 3],
             width - x,
@@ -225,7 +228,7 @@ pub(crate) unsafe fn gray_n_to_rgb_row<const BITS: usize>(
 /// NEON must be available. Slices sized correctly for `width`.
 #[inline]
 #[target_feature(enable = "neon")]
-pub(crate) unsafe fn gray_n_to_rgba_row<const BITS: usize>(
+pub(crate) unsafe fn gray_n_to_rgba_row<const BITS: usize, const BE: bool>(
     y_plane: &[u16],
     out: &mut [u8],
     width: usize,
@@ -234,7 +237,7 @@ pub(crate) unsafe fn gray_n_to_rgba_row<const BITS: usize>(
     debug_assert!(y_plane.len() >= width);
     debug_assert!(out.len() >= width * 4);
     if !full_range {
-        return scalar::gray_n_to_rgba_row::<BITS>(y_plane, out, width, full_range);
+        return scalar::gray_n_to_rgba_row::<BITS, BE>(y_plane, out, width, full_range);
     }
     let shift = (BITS - 8) as i32;
     let mask = bits_mask::<BITS>();
@@ -243,7 +246,7 @@ pub(crate) unsafe fn gray_n_to_rgba_row<const BITS: usize>(
         let mask_v = vdupq_n_u16(mask);
         let alpha = vdup_n_u8(0xFF);
         while x + 8 <= width {
-            let raw = vld1q_u16(y_plane.as_ptr().add(x));
+            let raw = load_endian_u16x8::<BE>(y_plane.as_ptr().cast::<u8>().add(x * 2));
             let masked = vandq_u16(raw, mask_v);
             let shifted = vshlq_u16(masked, vdupq_n_s16(-(shift as i16)));
             let narrow = vmovn_u16(shifted);
@@ -253,7 +256,7 @@ pub(crate) unsafe fn gray_n_to_rgba_row<const BITS: usize>(
         }
     }
     if x < width {
-        scalar::gray_n_to_rgba_row::<BITS>(
+        scalar::gray_n_to_rgba_row::<BITS, BE>(
             &y_plane[x..width],
             &mut out[x * 4..width * 4],
             width - x,
@@ -270,7 +273,7 @@ pub(crate) unsafe fn gray_n_to_rgba_row<const BITS: usize>(
 /// NEON must be available.
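Note: the `arch::neon::endian` module pulled in by the new import (`load_endian_u16x8`, `load_endian_u32x4`) is not included in this diff. A minimal sketch of what such helpers might look like, assuming each takes a byte pointer to 16 bytes of plane data and reverses per-lane byte order when `BE` is true; the real implementation may differ:

```rust
// Hypothetical sketch of the endian helpers imported above (module not shown in
// this diff). Assumes an unaligned 16-byte load from a *byte* pointer, then a
// per-lane byte swap that only exists in the BE = true instantiation.
use core::arch::aarch64::*;

#[inline]
#[target_feature(enable = "neon")]
pub(crate) unsafe fn load_endian_u16x8<const BE: bool>(ptr: *const u8) -> uint16x8_t {
    let bytes = vld1q_u8(ptr);
    // vrev16q_u8 reverses the two bytes inside every 16-bit lane.
    let bytes = if BE { vrev16q_u8(bytes) } else { bytes };
    vreinterpretq_u16_u8(bytes)
}

#[inline]
#[target_feature(enable = "neon")]
pub(crate) unsafe fn load_endian_u32x4<const BE: bool>(ptr: *const u8) -> uint32x4_t {
    let bytes = vld1q_u8(ptr);
    // vrev32q_u8 reverses the four bytes inside every 32-bit lane.
    let bytes = if BE { vrev32q_u8(bytes) } else { bytes };
    vreinterpretq_u32_u8(bytes)
}
```

With `BE` as a const generic, the `BE = false` instantiation keeps a plain 16-byte load and the swap disappears at compile time, which is why the kernels below can move from `vld1q_u16(ptr.add(x))` to `load_endian_u16x8::<BE>(ptr.cast::<u8>().add(x * 2))` with no extra work expected on the little-endian path.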
#[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn gray_n_to_rgb_u16_row( +pub(crate) unsafe fn gray_n_to_rgb_u16_row( y_plane: &[u16], out: &mut [u16], width: usize, @@ -279,14 +282,14 @@ pub(crate) unsafe fn gray_n_to_rgb_u16_row( debug_assert!(y_plane.len() >= width); debug_assert!(out.len() >= width * 3); if !full_range { - return scalar::gray_n_to_rgb_u16_row::(y_plane, out, width, full_range); + return scalar::gray_n_to_rgb_u16_row::(y_plane, out, width, full_range); } let mask = bits_mask::(); let mut x = 0usize; unsafe { let mask_v = vdupq_n_u16(mask); while x + 8 <= width { - let raw = vld1q_u16(y_plane.as_ptr().add(x)); + let raw = load_endian_u16x8::(y_plane.as_ptr().cast::().add(x * 2)); let y = vandq_u16(raw, mask_v); let rgb = uint16x8x3_t(y, y, y); vst3q_u16(out.as_mut_ptr().add(x * 3), rgb); @@ -294,7 +297,7 @@ pub(crate) unsafe fn gray_n_to_rgb_u16_row( } } if x < width { - scalar::gray_n_to_rgb_u16_row::( + scalar::gray_n_to_rgb_u16_row::( &y_plane[x..width], &mut out[x * 3..width * 3], width - x, @@ -311,7 +314,7 @@ pub(crate) unsafe fn gray_n_to_rgb_u16_row( /// NEON must be available. #[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn gray_n_to_rgba_u16_row( +pub(crate) unsafe fn gray_n_to_rgba_u16_row( y_plane: &[u16], out: &mut [u16], width: usize, @@ -320,7 +323,7 @@ pub(crate) unsafe fn gray_n_to_rgba_u16_row( debug_assert!(y_plane.len() >= width); debug_assert!(out.len() >= width * 4); if !full_range { - return scalar::gray_n_to_rgba_u16_row::(y_plane, out, width, full_range); + return scalar::gray_n_to_rgba_u16_row::(y_plane, out, width, full_range); } let mask = bits_mask::(); let mut x = 0usize; @@ -328,7 +331,7 @@ pub(crate) unsafe fn gray_n_to_rgba_u16_row( let mask_v = vdupq_n_u16(mask); let alpha_v = vdupq_n_u16(mask); // full-range max for BITS while x + 8 <= width { - let raw = vld1q_u16(y_plane.as_ptr().add(x)); + let raw = load_endian_u16x8::(y_plane.as_ptr().cast::().add(x * 2)); let y = vandq_u16(raw, mask_v); let rgba = uint16x8x4_t(y, y, y, alpha_v); vst4q_u16(out.as_mut_ptr().add(x * 4), rgba); @@ -336,7 +339,7 @@ pub(crate) unsafe fn gray_n_to_rgba_u16_row( } } if x < width { - scalar::gray_n_to_rgba_u16_row::( + scalar::gray_n_to_rgba_u16_row::( &y_plane[x..width], &mut out[x * 4..width * 4], width - x, @@ -353,7 +356,7 @@ pub(crate) unsafe fn gray_n_to_rgba_u16_row( /// NEON must be available. #[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn gray_n_to_luma_row( +pub(crate) unsafe fn gray_n_to_luma_row( y_plane: &[u16], out: &mut [u8], width: usize, @@ -366,7 +369,7 @@ pub(crate) unsafe fn gray_n_to_luma_row( unsafe { let mask_v = vdupq_n_u16(mask); while x + 8 <= width { - let raw = vld1q_u16(y_plane.as_ptr().add(x)); + let raw = load_endian_u16x8::(y_plane.as_ptr().cast::().add(x * 2)); let masked = vandq_u16(raw, mask_v); let shifted = vshlq_u16(masked, vdupq_n_s16(-(shift as i16))); let narrow = vmovn_u16(shifted); @@ -375,7 +378,7 @@ pub(crate) unsafe fn gray_n_to_luma_row( } } if x < width { - scalar::gray_n_to_luma_row::(&y_plane[x..width], &mut out[x..width], width - x); + scalar::gray_n_to_luma_row::(&y_plane[x..width], &mut out[x..width], width - x); } } @@ -387,7 +390,7 @@ pub(crate) unsafe fn gray_n_to_luma_row( /// NEON must be available. 
#[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn gray_n_to_luma_u16_row( +pub(crate) unsafe fn gray_n_to_luma_u16_row( y_plane: &[u16], out: &mut [u16], width: usize, @@ -399,14 +402,14 @@ pub(crate) unsafe fn gray_n_to_luma_u16_row( unsafe { let mask_v = vdupq_n_u16(mask); while x + 8 <= width { - let raw = vld1q_u16(y_plane.as_ptr().add(x)); + let raw = load_endian_u16x8::(y_plane.as_ptr().cast::().add(x * 2)); let masked = vandq_u16(raw, mask_v); vst1q_u16(out.as_mut_ptr().add(x), masked); x += 8; } } if x < width { - scalar::gray_n_to_luma_u16_row::(&y_plane[x..width], &mut out[x..width], width - x); + scalar::gray_n_to_luma_u16_row::(&y_plane[x..width], &mut out[x..width], width - x); } } @@ -419,7 +422,7 @@ pub(crate) unsafe fn gray_n_to_luma_u16_row( /// NEON must be available. #[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn gray_n_to_hsv_row( +pub(crate) unsafe fn gray_n_to_hsv_row( y_plane: &[u16], h_out: &mut [u8], s_out: &mut [u8], @@ -432,7 +435,7 @@ pub(crate) unsafe fn gray_n_to_hsv_row( debug_assert!(s_out.len() >= width); debug_assert!(v_out.len() >= width); if !full_range { - return scalar::gray_n_to_hsv_row::(y_plane, h_out, s_out, v_out, width, full_range); + return scalar::gray_n_to_hsv_row::(y_plane, h_out, s_out, v_out, width, full_range); } let shift = (BITS - 8) as i32; let mask = bits_mask::(); @@ -441,7 +444,7 @@ pub(crate) unsafe fn gray_n_to_hsv_row( let mask_v = vdupq_n_u16(mask); let zero = vdup_n_u8(0); while x + 8 <= width { - let raw = vld1q_u16(y_plane.as_ptr().add(x)); + let raw = load_endian_u16x8::(y_plane.as_ptr().cast::().add(x * 2)); let masked = vandq_u16(raw, mask_v); let shifted = vshlq_u16(masked, vdupq_n_s16(-(shift as i16))); let narrow = vmovn_u16(shifted); @@ -452,7 +455,7 @@ pub(crate) unsafe fn gray_n_to_hsv_row( } } if x < width { - scalar::gray_n_to_hsv_row::( + scalar::gray_n_to_hsv_row::( &y_plane[x..width], &mut h_out[x..width], &mut s_out[x..width], @@ -473,7 +476,7 @@ pub(crate) unsafe fn gray_n_to_hsv_row( /// NEON must be available. #[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn gray16_to_rgb_row( +pub(crate) unsafe fn gray16_to_rgb_row( y_plane: &[u16], out: &mut [u8], width: usize, @@ -482,12 +485,12 @@ pub(crate) unsafe fn gray16_to_rgb_row( debug_assert!(y_plane.len() >= width); debug_assert!(out.len() >= width * 3); if !full_range { - return scalar::gray16_to_rgb_row(y_plane, out, width, full_range); + return scalar::gray16_to_rgb_row::(y_plane, out, width, full_range); } let mut x = 0usize; unsafe { while x + 8 <= width { - let raw = vld1q_u16(y_plane.as_ptr().add(x)); + let raw = load_endian_u16x8::(y_plane.as_ptr().cast::().add(x * 2)); let y8 = vshrn_n_u16::<8>(raw); let rgb = uint8x8x3_t(y8, y8, y8); vst3_u8(out.as_mut_ptr().add(x * 3), rgb); @@ -495,7 +498,7 @@ pub(crate) unsafe fn gray16_to_rgb_row( } } if x < width { - scalar::gray16_to_rgb_row( + scalar::gray16_to_rgb_row::( &y_plane[x..width], &mut out[x * 3..width * 3], width - x, @@ -512,7 +515,7 @@ pub(crate) unsafe fn gray16_to_rgb_row( /// NEON must be available. 
#[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn gray16_to_rgba_row( +pub(crate) unsafe fn gray16_to_rgba_row( y_plane: &[u16], out: &mut [u8], width: usize, @@ -521,13 +524,13 @@ pub(crate) unsafe fn gray16_to_rgba_row( debug_assert!(y_plane.len() >= width); debug_assert!(out.len() >= width * 4); if !full_range { - return scalar::gray16_to_rgba_row(y_plane, out, width, full_range); + return scalar::gray16_to_rgba_row::(y_plane, out, width, full_range); } let mut x = 0usize; unsafe { let alpha = vdup_n_u8(0xFF); while x + 8 <= width { - let raw = vld1q_u16(y_plane.as_ptr().add(x)); + let raw = load_endian_u16x8::(y_plane.as_ptr().cast::().add(x * 2)); let y8 = vshrn_n_u16::<8>(raw); let rgba = uint8x8x4_t(y8, y8, y8, alpha); vst4_u8(out.as_mut_ptr().add(x * 4), rgba); @@ -535,7 +538,7 @@ pub(crate) unsafe fn gray16_to_rgba_row( } } if x < width { - scalar::gray16_to_rgba_row( + scalar::gray16_to_rgba_row::( &y_plane[x..width], &mut out[x * 4..width * 4], width - x, @@ -552,7 +555,7 @@ pub(crate) unsafe fn gray16_to_rgba_row( /// NEON must be available. #[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn gray16_to_rgb_u16_row( +pub(crate) unsafe fn gray16_to_rgb_u16_row( y_plane: &[u16], out: &mut [u16], width: usize, @@ -561,19 +564,19 @@ pub(crate) unsafe fn gray16_to_rgb_u16_row( debug_assert!(y_plane.len() >= width); debug_assert!(out.len() >= width * 3); if !full_range { - return scalar::gray16_to_rgb_u16_row(y_plane, out, width, full_range); + return scalar::gray16_to_rgb_u16_row::(y_plane, out, width, full_range); } let mut x = 0usize; unsafe { while x + 8 <= width { - let y = vld1q_u16(y_plane.as_ptr().add(x)); + let y = load_endian_u16x8::(y_plane.as_ptr().cast::().add(x * 2)); let rgb = uint16x8x3_t(y, y, y); vst3q_u16(out.as_mut_ptr().add(x * 3), rgb); x += 8; } } if x < width { - scalar::gray16_to_rgb_u16_row( + scalar::gray16_to_rgb_u16_row::( &y_plane[x..width], &mut out[x * 3..width * 3], width - x, @@ -590,7 +593,7 @@ pub(crate) unsafe fn gray16_to_rgb_u16_row( /// NEON must be available. #[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn gray16_to_rgba_u16_row( +pub(crate) unsafe fn gray16_to_rgba_u16_row( y_plane: &[u16], out: &mut [u16], width: usize, @@ -599,20 +602,20 @@ pub(crate) unsafe fn gray16_to_rgba_u16_row( debug_assert!(y_plane.len() >= width); debug_assert!(out.len() >= width * 4); if !full_range { - return scalar::gray16_to_rgba_u16_row(y_plane, out, width, full_range); + return scalar::gray16_to_rgba_u16_row::(y_plane, out, width, full_range); } let mut x = 0usize; unsafe { let alpha = vdupq_n_u16(0xFFFF); while x + 8 <= width { - let y = vld1q_u16(y_plane.as_ptr().add(x)); + let y = load_endian_u16x8::(y_plane.as_ptr().cast::().add(x * 2)); let rgba = uint16x8x4_t(y, y, y, alpha); vst4q_u16(out.as_mut_ptr().add(x * 4), rgba); x += 8; } } if x < width { - scalar::gray16_to_rgba_u16_row( + scalar::gray16_to_rgba_u16_row::( &y_plane[x..width], &mut out[x * 4..width * 4], width - x, @@ -629,20 +632,24 @@ pub(crate) unsafe fn gray16_to_rgba_u16_row( /// NEON must be available. 
#[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn gray16_to_luma_row(y_plane: &[u16], out: &mut [u8], width: usize) { +pub(crate) unsafe fn gray16_to_luma_row( + y_plane: &[u16], + out: &mut [u8], + width: usize, +) { debug_assert!(y_plane.len() >= width); debug_assert!(out.len() >= width); let mut x = 0usize; unsafe { while x + 8 <= width { - let raw = vld1q_u16(y_plane.as_ptr().add(x)); + let raw = load_endian_u16x8::(y_plane.as_ptr().cast::().add(x * 2)); let y8 = vshrn_n_u16::<8>(raw); vst1_u8(out.as_mut_ptr().add(x), y8); x += 8; } } if x < width { - scalar::gray16_to_luma_row(&y_plane[x..width], &mut out[x..width], width - x); + scalar::gray16_to_luma_row::(&y_plane[x..width], &mut out[x..width], width - x); } } @@ -654,19 +661,23 @@ pub(crate) unsafe fn gray16_to_luma_row(y_plane: &[u16], out: &mut [u8], width: /// NEON must be available. #[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn gray16_to_luma_u16_row(y_plane: &[u16], out: &mut [u16], width: usize) { +pub(crate) unsafe fn gray16_to_luma_u16_row( + y_plane: &[u16], + out: &mut [u16], + width: usize, +) { debug_assert!(y_plane.len() >= width); debug_assert!(out.len() >= width); let mut x = 0usize; unsafe { while x + 8 <= width { - let y = vld1q_u16(y_plane.as_ptr().add(x)); + let y = load_endian_u16x8::(y_plane.as_ptr().cast::().add(x * 2)); vst1q_u16(out.as_mut_ptr().add(x), y); x += 8; } } if x < width { - scalar::gray16_to_luma_u16_row(&y_plane[x..width], &mut out[x..width], width - x); + scalar::gray16_to_luma_u16_row::(&y_plane[x..width], &mut out[x..width], width - x); } } @@ -679,7 +690,7 @@ pub(crate) unsafe fn gray16_to_luma_u16_row(y_plane: &[u16], out: &mut [u16], wi /// NEON must be available. #[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn gray16_to_hsv_row( +pub(crate) unsafe fn gray16_to_hsv_row( y_plane: &[u16], h_out: &mut [u8], s_out: &mut [u8], @@ -692,13 +703,13 @@ pub(crate) unsafe fn gray16_to_hsv_row( debug_assert!(s_out.len() >= width); debug_assert!(v_out.len() >= width); if !full_range { - return scalar::gray16_to_hsv_row(y_plane, h_out, s_out, v_out, width, full_range); + return scalar::gray16_to_hsv_row::(y_plane, h_out, s_out, v_out, width, full_range); } let mut x = 0usize; unsafe { let zero = vdup_n_u8(0); while x + 8 <= width { - let raw = vld1q_u16(y_plane.as_ptr().add(x)); + let raw = load_endian_u16x8::(y_plane.as_ptr().cast::().add(x * 2)); let y8 = vshrn_n_u16::<8>(raw); vst1_u8(h_out.as_mut_ptr().add(x), zero); vst1_u8(s_out.as_mut_ptr().add(x), zero); @@ -707,7 +718,7 @@ pub(crate) unsafe fn gray16_to_hsv_row( } } if x < width { - scalar::gray16_to_hsv_row( + scalar::gray16_to_hsv_row::( &y_plane[x..width], &mut h_out[x..width], &mut s_out[x..width], @@ -728,7 +739,11 @@ pub(crate) unsafe fn gray16_to_hsv_row( /// NEON must be available. `y_plane.len() >= width`. `out.len() >= width * 3`. 
#[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn grayf32_to_rgb_row(y_plane: &[f32], out: &mut [u8], width: usize) { +pub(crate) unsafe fn grayf32_to_rgb_row( + y_plane: &[f32], + out: &mut [u8], + width: usize, +) { use crate::row::scalar::grayf32 as scalar; debug_assert!(y_plane.len() >= width); debug_assert!(out.len() >= width * 3); @@ -738,8 +753,12 @@ pub(crate) unsafe fn grayf32_to_rgb_row(y_plane: &[f32], out: &mut [u8], width: let mut x = 0usize; unsafe { while x + 8 <= width { - let y0 = vld1q_f32(y_plane.as_ptr().add(x)); - let y1 = vld1q_f32(y_plane.as_ptr().add(x + 4)); + let y0 = vreinterpretq_f32_u32(load_endian_u32x4::( + y_plane.as_ptr().cast::().add(x * 4), + )); + let y1 = vreinterpretq_f32_u32(load_endian_u32x4::( + y_plane.as_ptr().cast::().add((x + 4) * 4), + )); let c0 = vmulq_f32(vmaxq_f32(vminq_f32(y0, one), zero), scale); let c1 = vmulq_f32(vmaxq_f32(vminq_f32(y1, one), zero), scale); // vcvtaq_u32_f32: round-to-nearest-even, no FPCR manipulation needed. @@ -754,7 +773,7 @@ pub(crate) unsafe fn grayf32_to_rgb_row(y_plane: &[f32], out: &mut [u8], width: } } if x < width { - scalar::grayf32_to_rgb_row(&y_plane[x..width], &mut out[x * 3..width * 3], width - x); + scalar::grayf32_to_rgb_row::(&y_plane[x..width], &mut out[x * 3..width * 3], width - x); } } @@ -764,7 +783,11 @@ pub(crate) unsafe fn grayf32_to_rgb_row(y_plane: &[f32], out: &mut [u8], width: /// NEON must be available. `y_plane.len() >= width`. `out.len() >= width * 4`. #[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn grayf32_to_rgba_row(y_plane: &[f32], out: &mut [u8], width: usize) { +pub(crate) unsafe fn grayf32_to_rgba_row( + y_plane: &[f32], + out: &mut [u8], + width: usize, +) { use crate::row::scalar::grayf32 as scalar; debug_assert!(y_plane.len() >= width); debug_assert!(out.len() >= width * 4); @@ -775,8 +798,12 @@ pub(crate) unsafe fn grayf32_to_rgba_row(y_plane: &[f32], out: &mut [u8], width: let mut x = 0usize; unsafe { while x + 8 <= width { - let y0 = vld1q_f32(y_plane.as_ptr().add(x)); - let y1 = vld1q_f32(y_plane.as_ptr().add(x + 4)); + let y0 = vreinterpretq_f32_u32(load_endian_u32x4::( + y_plane.as_ptr().cast::().add(x * 4), + )); + let y1 = vreinterpretq_f32_u32(load_endian_u32x4::( + y_plane.as_ptr().cast::().add((x + 4) * 4), + )); let c0 = vmulq_f32(vmaxq_f32(vminq_f32(y0, one), zero), scale); let c1 = vmulq_f32(vmaxq_f32(vminq_f32(y1, one), zero), scale); let u0 = vcvtaq_u32_f32(c0); @@ -788,7 +815,7 @@ pub(crate) unsafe fn grayf32_to_rgba_row(y_plane: &[f32], out: &mut [u8], width: } } if x < width { - scalar::grayf32_to_rgba_row(&y_plane[x..width], &mut out[x * 4..width * 4], width - x); + scalar::grayf32_to_rgba_row::(&y_plane[x..width], &mut out[x * 4..width * 4], width - x); } } @@ -798,7 +825,11 @@ pub(crate) unsafe fn grayf32_to_rgba_row(y_plane: &[f32], out: &mut [u8], width: /// NEON must be available. `y_plane.len() >= width`. `out.len() >= width * 3`. 
#[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn grayf32_to_rgb_u16_row(y_plane: &[f32], out: &mut [u16], width: usize) { +pub(crate) unsafe fn grayf32_to_rgb_u16_row( + y_plane: &[f32], + out: &mut [u16], + width: usize, +) { use crate::row::scalar::grayf32 as scalar; debug_assert!(y_plane.len() >= width); debug_assert!(out.len() >= width * 3); @@ -808,7 +839,9 @@ pub(crate) unsafe fn grayf32_to_rgb_u16_row(y_plane: &[f32], out: &mut [u16], wi let mut x = 0usize; unsafe { while x + 4 <= width { - let y = vld1q_f32(y_plane.as_ptr().add(x)); + let y = vreinterpretq_f32_u32(load_endian_u32x4::( + y_plane.as_ptr().cast::().add(x * 4), + )); let c = vmulq_f32(vmaxq_f32(vminq_f32(y, one), zero), scale); let u32v = vcvtaq_u32_f32(c); let u16v = vqmovn_u32(u32v); // saturating narrow to u16 @@ -818,7 +851,7 @@ pub(crate) unsafe fn grayf32_to_rgb_u16_row(y_plane: &[f32], out: &mut [u16], wi } } if x < width { - scalar::grayf32_to_rgb_u16_row(&y_plane[x..width], &mut out[x * 3..width * 3], width - x); + scalar::grayf32_to_rgb_u16_row::(&y_plane[x..width], &mut out[x * 3..width * 3], width - x); } } @@ -828,7 +861,11 @@ pub(crate) unsafe fn grayf32_to_rgb_u16_row(y_plane: &[f32], out: &mut [u16], wi /// NEON must be available. `y_plane.len() >= width`. `out.len() >= width * 4`. #[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn grayf32_to_rgba_u16_row(y_plane: &[f32], out: &mut [u16], width: usize) { +pub(crate) unsafe fn grayf32_to_rgba_u16_row( + y_plane: &[f32], + out: &mut [u16], + width: usize, +) { use crate::row::scalar::grayf32 as scalar; debug_assert!(y_plane.len() >= width); debug_assert!(out.len() >= width * 4); @@ -839,7 +876,9 @@ pub(crate) unsafe fn grayf32_to_rgba_u16_row(y_plane: &[f32], out: &mut [u16], w let mut x = 0usize; unsafe { while x + 4 <= width { - let y = vld1q_f32(y_plane.as_ptr().add(x)); + let y = vreinterpretq_f32_u32(load_endian_u32x4::( + y_plane.as_ptr().cast::().add(x * 4), + )); let c = vmulq_f32(vmaxq_f32(vminq_f32(y, one), zero), scale); let u16v = vqmovn_u32(vcvtaq_u32_f32(c)); let rgba = uint16x4x4_t(u16v, u16v, u16v, alpha); @@ -848,7 +887,11 @@ pub(crate) unsafe fn grayf32_to_rgba_u16_row(y_plane: &[f32], out: &mut [u16], w } } if x < width { - scalar::grayf32_to_rgba_u16_row(&y_plane[x..width], &mut out[x * 4..width * 4], width - x); + scalar::grayf32_to_rgba_u16_row::( + &y_plane[x..width], + &mut out[x * 4..width * 4], + width - x, + ); } } @@ -859,21 +902,27 @@ pub(crate) unsafe fn grayf32_to_rgba_u16_row(y_plane: &[f32], out: &mut [u16], w #[allow(dead_code)] // dispatcher uses scalar directly for lossless f32 paths #[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn grayf32_to_rgb_f32_row(y_plane: &[f32], out: &mut [f32], width: usize) { +pub(crate) unsafe fn grayf32_to_rgb_f32_row( + y_plane: &[f32], + out: &mut [f32], + width: usize, +) { use crate::row::scalar::grayf32 as scalar; debug_assert!(y_plane.len() >= width); debug_assert!(out.len() >= width * 3); let mut x = 0usize; unsafe { while x + 4 <= width { - let y = vld1q_f32(y_plane.as_ptr().add(x)); + let y = vreinterpretq_f32_u32(load_endian_u32x4::( + y_plane.as_ptr().cast::().add(x * 4), + )); let rgb = float32x4x3_t(y, y, y); vst3q_f32(out.as_mut_ptr().add(x * 3), rgb); x += 4; } } if x < width { - scalar::grayf32_to_rgb_f32_row(&y_plane[x..width], &mut out[x * 3..width * 3], width - x); + scalar::grayf32_to_rgb_f32_row::(&y_plane[x..width], &mut out[x * 3..width * 3], width - x); } } @@ -883,7 +932,11 @@ pub(crate) unsafe fn 
grayf32_to_rgb_f32_row(y_plane: &[f32], out: &mut [f32], wi /// NEON must be available. #[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn grayf32_to_luma_row(y_plane: &[f32], out: &mut [u8], width: usize) { +pub(crate) unsafe fn grayf32_to_luma_row( + y_plane: &[f32], + out: &mut [u8], + width: usize, +) { use crate::row::scalar::grayf32 as scalar; debug_assert!(y_plane.len() >= width); debug_assert!(out.len() >= width); @@ -893,8 +946,12 @@ pub(crate) unsafe fn grayf32_to_luma_row(y_plane: &[f32], out: &mut [u8], width: let mut x = 0usize; unsafe { while x + 8 <= width { - let y0 = vld1q_f32(y_plane.as_ptr().add(x)); - let y1 = vld1q_f32(y_plane.as_ptr().add(x + 4)); + let y0 = vreinterpretq_f32_u32(load_endian_u32x4::( + y_plane.as_ptr().cast::().add(x * 4), + )); + let y1 = vreinterpretq_f32_u32(load_endian_u32x4::( + y_plane.as_ptr().cast::().add((x + 4) * 4), + )); let c0 = vmulq_f32(vmaxq_f32(vminq_f32(y0, one), zero), scale); let c1 = vmulq_f32(vmaxq_f32(vminq_f32(y1, one), zero), scale); let n8 = vmovn_u16(vcombine_u16( @@ -906,7 +963,7 @@ pub(crate) unsafe fn grayf32_to_luma_row(y_plane: &[f32], out: &mut [u8], width: } } if x < width { - scalar::grayf32_to_luma_row(&y_plane[x..width], &mut out[x..width], width - x); + scalar::grayf32_to_luma_row::(&y_plane[x..width], &mut out[x..width], width - x); } } @@ -916,7 +973,11 @@ pub(crate) unsafe fn grayf32_to_luma_row(y_plane: &[f32], out: &mut [u8], width: /// NEON must be available. #[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn grayf32_to_luma_u16_row(y_plane: &[f32], out: &mut [u16], width: usize) { +pub(crate) unsafe fn grayf32_to_luma_u16_row( + y_plane: &[f32], + out: &mut [u16], + width: usize, +) { use crate::row::scalar::grayf32 as scalar; debug_assert!(y_plane.len() >= width); debug_assert!(out.len() >= width); @@ -926,7 +987,9 @@ pub(crate) unsafe fn grayf32_to_luma_u16_row(y_plane: &[f32], out: &mut [u16], w let mut x = 0usize; unsafe { while x + 4 <= width { - let y = vld1q_f32(y_plane.as_ptr().add(x)); + let y = vreinterpretq_f32_u32(load_endian_u32x4::( + y_plane.as_ptr().cast::().add(x * 4), + )); let c = vmulq_f32(vmaxq_f32(vminq_f32(y, one), zero), scale); let u16v = vqmovn_u32(vcvtaq_u32_f32(c)); vst1_u16(out.as_mut_ptr().add(x), u16v); @@ -934,7 +997,7 @@ pub(crate) unsafe fn grayf32_to_luma_u16_row(y_plane: &[f32], out: &mut [u16], w } } if x < width { - scalar::grayf32_to_luma_u16_row(&y_plane[x..width], &mut out[x..width], width - x); + scalar::grayf32_to_luma_u16_row::(&y_plane[x..width], &mut out[x..width], width - x); } } @@ -945,20 +1008,26 @@ pub(crate) unsafe fn grayf32_to_luma_u16_row(y_plane: &[f32], out: &mut [u16], w #[allow(dead_code)] // dispatcher uses scalar directly for lossless f32 paths #[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn grayf32_to_luma_f32_row(y_plane: &[f32], out: &mut [f32], width: usize) { +pub(crate) unsafe fn grayf32_to_luma_f32_row( + y_plane: &[f32], + out: &mut [f32], + width: usize, +) { use crate::row::scalar::grayf32 as scalar; debug_assert!(y_plane.len() >= width); debug_assert!(out.len() >= width); let mut x = 0usize; unsafe { while x + 4 <= width { - let y = vld1q_f32(y_plane.as_ptr().add(x)); + let y = vreinterpretq_f32_u32(load_endian_u32x4::( + y_plane.as_ptr().cast::().add(x * 4), + )); vst1q_f32(out.as_mut_ptr().add(x), y); x += 4; } } if x < width { - scalar::grayf32_to_luma_f32_row(&y_plane[x..width], &mut out[x..width], width - x); + scalar::grayf32_to_luma_f32_row::(&y_plane[x..width], &mut 
out[x..width], width - x); } } @@ -968,7 +1037,7 @@ pub(crate) unsafe fn grayf32_to_luma_f32_row(y_plane: &[f32], out: &mut [f32], w /// NEON must be available. #[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn grayf32_to_hsv_row( +pub(crate) unsafe fn grayf32_to_hsv_row( y_plane: &[f32], h_out: &mut [u8], s_out: &mut [u8], @@ -984,8 +1053,12 @@ pub(crate) unsafe fn grayf32_to_hsv_row( let mut x = 0usize; unsafe { while x + 8 <= width { - let y0 = vld1q_f32(y_plane.as_ptr().add(x)); - let y1 = vld1q_f32(y_plane.as_ptr().add(x + 4)); + let y0 = vreinterpretq_f32_u32(load_endian_u32x4::( + y_plane.as_ptr().cast::().add(x * 4), + )); + let y1 = vreinterpretq_f32_u32(load_endian_u32x4::( + y_plane.as_ptr().cast::().add((x + 4) * 4), + )); let c0 = vmulq_f32(vmaxq_f32(vminq_f32(y0, one), zero_f), scale); let c1 = vmulq_f32(vmaxq_f32(vminq_f32(y1, one), zero_f), scale); let v8 = vmovn_u16(vcombine_u16( @@ -999,7 +1072,7 @@ pub(crate) unsafe fn grayf32_to_hsv_row( } } if x < width { - scalar::grayf32_to_hsv_row( + scalar::grayf32_to_hsv_row::( &y_plane[x..width], &mut h_out[x..width], &mut s_out[x..width], @@ -1238,10 +1311,13 @@ pub(crate) unsafe fn ya8_to_hsv_row( /// NEON must be available. `packed.len() >= width * 2`. `out.len() >= width * 3`. #[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn ya16_to_rgb_row(packed: &[u16], out: &mut [u8], width: usize) { +pub(crate) unsafe fn ya16_to_rgb_row(packed: &[u16], out: &mut [u8], width: usize) { use crate::row::scalar::ya16 as scalar; debug_assert!(packed.len() >= width * 2); debug_assert!(out.len() >= width * 3); + if BE { + return scalar::ya16_to_rgb_row::(packed, out, width); + } let mut x = 0usize; unsafe { while x + 8 <= width { @@ -1253,7 +1329,7 @@ pub(crate) unsafe fn ya16_to_rgb_row(packed: &[u16], out: &mut [u8], width: usiz } } if x < width { - scalar::ya16_to_rgb_row( + scalar::ya16_to_rgb_row::( &packed[x * 2..width * 2], &mut out[x * 3..width * 3], width - x, @@ -1267,10 +1343,17 @@ pub(crate) unsafe fn ya16_to_rgb_row(packed: &[u16], out: &mut [u8], width: usiz /// NEON must be available. #[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn ya16_to_rgba_row(packed: &[u16], out: &mut [u8], width: usize) { +pub(crate) unsafe fn ya16_to_rgba_row( + packed: &[u16], + out: &mut [u8], + width: usize, +) { use crate::row::scalar::ya16 as scalar; debug_assert!(packed.len() >= width * 2); debug_assert!(out.len() >= width * 4); + if BE { + return scalar::ya16_to_rgba_row::(packed, out, width); + } let mut x = 0usize; unsafe { while x + 8 <= width { @@ -1283,7 +1366,7 @@ pub(crate) unsafe fn ya16_to_rgba_row(packed: &[u16], out: &mut [u8], width: usi } } if x < width { - scalar::ya16_to_rgba_row( + scalar::ya16_to_rgba_row::( &packed[x * 2..width * 2], &mut out[x * 4..width * 4], width - x, @@ -1297,10 +1380,17 @@ pub(crate) unsafe fn ya16_to_rgba_row(packed: &[u16], out: &mut [u8], width: usi /// NEON must be available. 
#[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn ya16_to_rgb_u16_row(packed: &[u16], out: &mut [u16], width: usize) { +pub(crate) unsafe fn ya16_to_rgb_u16_row( + packed: &[u16], + out: &mut [u16], + width: usize, +) { use crate::row::scalar::ya16 as scalar; debug_assert!(packed.len() >= width * 2); debug_assert!(out.len() >= width * 3); + if BE { + return scalar::ya16_to_rgb_u16_row::(packed, out, width); + } let mut x = 0usize; unsafe { while x + 8 <= width { @@ -1315,7 +1405,7 @@ pub(crate) unsafe fn ya16_to_rgb_u16_row(packed: &[u16], out: &mut [u16], width: } } if x < width { - scalar::ya16_to_rgb_u16_row( + scalar::ya16_to_rgb_u16_row::( &packed[x * 2..width * 2], &mut out[x * 3..width * 3], width - x, @@ -1329,10 +1419,17 @@ pub(crate) unsafe fn ya16_to_rgb_u16_row(packed: &[u16], out: &mut [u16], width: /// NEON must be available. #[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn ya16_to_rgba_u16_row(packed: &[u16], out: &mut [u16], width: usize) { +pub(crate) unsafe fn ya16_to_rgba_u16_row( + packed: &[u16], + out: &mut [u16], + width: usize, +) { use crate::row::scalar::ya16 as scalar; debug_assert!(packed.len() >= width * 2); debug_assert!(out.len() >= width * 4); + if BE { + return scalar::ya16_to_rgba_u16_row::(packed, out, width); + } let mut x = 0usize; unsafe { while x + 8 <= width { @@ -1349,7 +1446,7 @@ pub(crate) unsafe fn ya16_to_rgba_u16_row(packed: &[u16], out: &mut [u16], width } } if x < width { - scalar::ya16_to_rgba_u16_row( + scalar::ya16_to_rgba_u16_row::( &packed[x * 2..width * 2], &mut out[x * 4..width * 4], width - x, @@ -1363,10 +1460,17 @@ pub(crate) unsafe fn ya16_to_rgba_u16_row(packed: &[u16], out: &mut [u16], width /// NEON must be available. #[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn ya16_to_luma_row(packed: &[u16], out: &mut [u8], width: usize) { +pub(crate) unsafe fn ya16_to_luma_row( + packed: &[u16], + out: &mut [u8], + width: usize, +) { use crate::row::scalar::ya16 as scalar; debug_assert!(packed.len() >= width * 2); debug_assert!(out.len() >= width); + if BE { + return scalar::ya16_to_luma_row::(packed, out, width); + } let mut x = 0usize; unsafe { while x + 8 <= width { @@ -1377,7 +1481,7 @@ pub(crate) unsafe fn ya16_to_luma_row(packed: &[u16], out: &mut [u8], width: usi } } if x < width { - scalar::ya16_to_luma_row(&packed[x * 2..width * 2], &mut out[x..width], width - x); + scalar::ya16_to_luma_row::(&packed[x * 2..width * 2], &mut out[x..width], width - x); } } @@ -1387,10 +1491,17 @@ pub(crate) unsafe fn ya16_to_luma_row(packed: &[u16], out: &mut [u8], width: usi /// NEON must be available. 
#[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn ya16_to_luma_u16_row(packed: &[u16], out: &mut [u16], width: usize) { +pub(crate) unsafe fn ya16_to_luma_u16_row( + packed: &[u16], + out: &mut [u16], + width: usize, +) { use crate::row::scalar::ya16 as scalar; debug_assert!(packed.len() >= width * 2); debug_assert!(out.len() >= width); + if BE { + return scalar::ya16_to_luma_u16_row::(packed, out, width); + } let mut x = 0usize; unsafe { while x + 8 <= width { @@ -1400,7 +1511,7 @@ pub(crate) unsafe fn ya16_to_luma_u16_row(packed: &[u16], out: &mut [u16], width } } if x < width { - scalar::ya16_to_luma_u16_row(&packed[x * 2..width * 2], &mut out[x..width], width - x); + scalar::ya16_to_luma_u16_row::(&packed[x * 2..width * 2], &mut out[x..width], width - x); } } @@ -1410,7 +1521,7 @@ pub(crate) unsafe fn ya16_to_luma_u16_row(packed: &[u16], out: &mut [u16], width /// NEON must be available. #[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn ya16_to_hsv_row( +pub(crate) unsafe fn ya16_to_hsv_row( packed: &[u16], h_out: &mut [u8], s_out: &mut [u8], @@ -1419,6 +1530,9 @@ pub(crate) unsafe fn ya16_to_hsv_row( ) { use crate::row::scalar::ya16 as scalar; debug_assert!(packed.len() >= width * 2); + if BE { + return scalar::ya16_to_hsv_row::(packed, h_out, s_out, v_out, width); + } let mut x = 0usize; unsafe { let zero = vdup_n_u8(0); @@ -1432,7 +1546,7 @@ pub(crate) unsafe fn ya16_to_hsv_row( } } if x < width { - scalar::ya16_to_hsv_row( + scalar::ya16_to_hsv_row::( &packed[x * 2..width * 2], &mut h_out[x..width], &mut s_out[x..width], @@ -1519,8 +1633,8 @@ mod tests { prng16(&mut plane, 0xABCD_1234); let mut simd = std::vec![0u8; w * 3]; let mut scal = std::vec![0u8; w * 3]; - unsafe { super::gray_n_to_rgb_row::<10>(&plane, &mut simd, w, true) }; - scalar::gray_n_to_rgb_row::<10>(&plane, &mut scal, w, true); + unsafe { super::gray_n_to_rgb_row::<10, false>(&plane, &mut simd, w, true) }; + scalar::gray_n_to_rgb_row::<10, false>(&plane, &mut scal, w, true); assert_eq!(simd, scal, "width={w}"); } } @@ -1533,8 +1647,8 @@ mod tests { prng16(&mut plane, 0xDEAD_BEEF); let mut simd = std::vec![0u8; w * 3]; let mut scal = std::vec![0u8; w * 3]; - unsafe { super::gray16_to_rgb_row(&plane, &mut simd, w, true) }; - scalar::gray16_to_rgb_row(&plane, &mut scal, w, true); + unsafe { super::gray16_to_rgb_row::(&plane, &mut simd, w, true) }; + scalar::gray16_to_rgb_row::(&plane, &mut scal, w, true); assert_eq!(simd, scal, "width={w}"); } } @@ -1563,8 +1677,8 @@ mod tests { prng16(&mut plane, 0x1234_5678); let mut simd = std::vec![0u8; w * 3]; let mut scal = std::vec![0u8; w * 3]; - unsafe { super::gray16_to_rgb_row(&plane, &mut simd, w, false) }; - scalar::gray16_to_rgb_row(&plane, &mut scal, w, false); + unsafe { super::gray16_to_rgb_row::(&plane, &mut simd, w, false) }; + scalar::gray16_to_rgb_row::(&plane, &mut scal, w, false); assert_eq!(simd, scal, "width={w} limited-range"); } } @@ -1589,8 +1703,8 @@ mod tests { prng_f32(&mut plane, 0xF32A_0001); let mut simd = std::vec![0u8; w * 3]; let mut scal = std::vec![0u8; w * 3]; - unsafe { super::grayf32_to_rgb_row(&plane, &mut simd, w) }; - sf::grayf32_to_rgb_row(&plane, &mut scal, w); + unsafe { super::grayf32_to_rgb_row::(&plane, &mut simd, w) }; + sf::grayf32_to_rgb_row::(&plane, &mut scal, w); assert_eq!(simd, scal, "width={w}"); } } @@ -1604,8 +1718,8 @@ mod tests { prng_f32(&mut plane, 0xF32A_0002); let mut simd = std::vec![0u8; w * 4]; let mut scal = std::vec![0u8; w * 4]; - unsafe { super::grayf32_to_rgba_row(&plane, &mut 
simd, w) }; - sf::grayf32_to_rgba_row(&plane, &mut scal, w); + unsafe { super::grayf32_to_rgba_row::(&plane, &mut simd, w) }; + sf::grayf32_to_rgba_row::(&plane, &mut scal, w); assert_eq!(simd, scal, "width={w}"); } } @@ -1619,8 +1733,8 @@ mod tests { prng_f32(&mut plane, 0xF32A_0003); let mut simd = std::vec![0u16; w * 3]; let mut scal = std::vec![0u16; w * 3]; - unsafe { super::grayf32_to_rgb_u16_row(&plane, &mut simd, w) }; - sf::grayf32_to_rgb_u16_row(&plane, &mut scal, w); + unsafe { super::grayf32_to_rgb_u16_row::(&plane, &mut simd, w) }; + sf::grayf32_to_rgb_u16_row::(&plane, &mut scal, w); assert_eq!(simd, scal, "width={w}"); } } @@ -1634,8 +1748,8 @@ mod tests { prng_f32(&mut plane, 0xF32A_0004); let mut simd = std::vec![0u16; w * 4]; let mut scal = std::vec![0u16; w * 4]; - unsafe { super::grayf32_to_rgba_u16_row(&plane, &mut simd, w) }; - sf::grayf32_to_rgba_u16_row(&plane, &mut scal, w); + unsafe { super::grayf32_to_rgba_u16_row::(&plane, &mut simd, w) }; + sf::grayf32_to_rgba_u16_row::(&plane, &mut scal, w); assert_eq!(simd, scal, "width={w}"); } } @@ -1649,8 +1763,8 @@ mod tests { prng_f32(&mut plane, 0xF32A_0005); let mut simd = std::vec![0.0f32; w * 3]; let mut scal = std::vec![0.0f32; w * 3]; - unsafe { super::grayf32_to_rgb_f32_row(&plane, &mut simd, w) }; - sf::grayf32_to_rgb_f32_row(&plane, &mut scal, w); + unsafe { super::grayf32_to_rgb_f32_row::(&plane, &mut simd, w) }; + sf::grayf32_to_rgb_f32_row::(&plane, &mut scal, w); assert_eq!(simd, scal, "width={w}"); } } @@ -1664,8 +1778,8 @@ mod tests { prng_f32(&mut plane, 0xF32A_0006); let mut simd = std::vec![0u8; w]; let mut scal = std::vec![0u8; w]; - unsafe { super::grayf32_to_luma_row(&plane, &mut simd, w) }; - sf::grayf32_to_luma_row(&plane, &mut scal, w); + unsafe { super::grayf32_to_luma_row::(&plane, &mut simd, w) }; + sf::grayf32_to_luma_row::(&plane, &mut scal, w); assert_eq!(simd, scal, "width={w}"); } } @@ -1679,8 +1793,8 @@ mod tests { prng_f32(&mut plane, 0xF32A_0007); let mut simd = std::vec![0u16; w]; let mut scal = std::vec![0u16; w]; - unsafe { super::grayf32_to_luma_u16_row(&plane, &mut simd, w) }; - sf::grayf32_to_luma_u16_row(&plane, &mut scal, w); + unsafe { super::grayf32_to_luma_u16_row::(&plane, &mut simd, w) }; + sf::grayf32_to_luma_u16_row::(&plane, &mut scal, w); assert_eq!(simd, scal, "width={w}"); } } @@ -1694,8 +1808,8 @@ mod tests { prng_f32(&mut plane, 0xF32A_0008); let mut simd = std::vec![0.0f32; w]; let mut scal = std::vec![0.0f32; w]; - unsafe { super::grayf32_to_luma_f32_row(&plane, &mut simd, w) }; - sf::grayf32_to_luma_f32_row(&plane, &mut scal, w); + unsafe { super::grayf32_to_luma_f32_row::(&plane, &mut simd, w) }; + sf::grayf32_to_luma_f32_row::(&plane, &mut scal, w); assert_eq!(simd, scal, "width={w}"); } } @@ -1713,8 +1827,8 @@ mod tests { let mut rh = std::vec![0u8; w]; let mut rs = std::vec![0u8; w]; let mut rv = std::vec![0u8; w]; - unsafe { super::grayf32_to_hsv_row(&plane, &mut sh, &mut ss, &mut sv, w) }; - sf::grayf32_to_hsv_row(&plane, &mut rh, &mut rs, &mut rv, w); + unsafe { super::grayf32_to_hsv_row::(&plane, &mut sh, &mut ss, &mut sv, w) }; + sf::grayf32_to_hsv_row::(&plane, &mut rh, &mut rs, &mut rv, w); assert_eq!(sh, rh, "H width={w}"); assert_eq!(ss, rs, "S width={w}"); assert_eq!(sv, rv, "V width={w}"); @@ -1861,8 +1975,8 @@ mod tests { prng_ya16(&mut packed, 0xA160_0001); let mut simd = std::vec![0u8; w * 3]; let mut scal = std::vec![0u8; w * 3]; - unsafe { super::ya16_to_rgb_row(&packed, &mut simd, w) }; - sy::ya16_to_rgb_row(&packed, &mut scal, w); + unsafe { 
super::ya16_to_rgb_row::<false>(&packed, &mut simd, w) };
+            sy::ya16_to_rgb_row::<false>(&packed, &mut scal, w);
             assert_eq!(simd, scal, "width={w}");
         }
     }
@@ -1876,8 +1990,8 @@ mod tests {
             prng_ya16(&mut packed, 0xA160_0002);
             let mut simd = std::vec![0u8; w * 4];
             let mut scal = std::vec![0u8; w * 4];
-            unsafe { super::ya16_to_rgba_row(&packed, &mut simd, w) };
-            sy::ya16_to_rgba_row(&packed, &mut scal, w);
+            unsafe { super::ya16_to_rgba_row::<false>(&packed, &mut simd, w) };
+            sy::ya16_to_rgba_row::<false>(&packed, &mut scal, w);
             assert_eq!(simd, scal, "width={w}");
         }
     }
@@ -1891,8 +2005,8 @@ mod tests {
             prng_ya16(&mut packed, 0xA160_0003);
             let mut simd = std::vec![0u16; w * 3];
             let mut scal = std::vec![0u16; w * 3];
-            unsafe { super::ya16_to_rgb_u16_row(&packed, &mut simd, w) };
-            sy::ya16_to_rgb_u16_row(&packed, &mut scal, w);
+            unsafe { super::ya16_to_rgb_u16_row::<false>(&packed, &mut simd, w) };
+            sy::ya16_to_rgb_u16_row::<false>(&packed, &mut scal, w);
             assert_eq!(simd, scal, "width={w}");
         }
     }
@@ -1906,8 +2020,8 @@ mod tests {
             prng_ya16(&mut packed, 0xA160_0004);
             let mut simd = std::vec![0u16; w * 4];
             let mut scal = std::vec![0u16; w * 4];
-            unsafe { super::ya16_to_rgba_u16_row(&packed, &mut simd, w) };
-            sy::ya16_to_rgba_u16_row(&packed, &mut scal, w);
+            unsafe { super::ya16_to_rgba_u16_row::<false>(&packed, &mut simd, w) };
+            sy::ya16_to_rgba_u16_row::<false>(&packed, &mut scal, w);
             assert_eq!(simd, scal, "width={w}");
         }
     }
@@ -1921,8 +2035,8 @@ mod tests {
             prng_ya16(&mut packed, 0xA160_0005);
             let mut simd = std::vec![0u8; w];
             let mut scal = std::vec![0u8; w];
-            unsafe { super::ya16_to_luma_row(&packed, &mut simd, w) };
-            sy::ya16_to_luma_row(&packed, &mut scal, w);
+            unsafe { super::ya16_to_luma_row::<false>(&packed, &mut simd, w) };
+            sy::ya16_to_luma_row::<false>(&packed, &mut scal, w);
             assert_eq!(simd, scal, "width={w}");
         }
     }
@@ -1936,8 +2050,8 @@ mod tests {
             prng_ya16(&mut packed, 0xA160_0006);
             let mut simd = std::vec![0u16; w];
             let mut scal = std::vec![0u16; w];
-            unsafe { super::ya16_to_luma_u16_row(&packed, &mut simd, w) };
-            sy::ya16_to_luma_u16_row(&packed, &mut scal, w);
+            unsafe { super::ya16_to_luma_u16_row::<false>(&packed, &mut simd, w) };
+            sy::ya16_to_luma_u16_row::<false>(&packed, &mut scal, w);
             assert_eq!(simd, scal, "width={w}");
         }
     }
@@ -1955,11 +2069,78 @@ mod tests {
             let mut rh = std::vec![0u8; w];
             let mut rs = std::vec![0u8; w];
             let mut rv = std::vec![0u8; w];
-            unsafe { super::ya16_to_hsv_row(&packed, &mut sh, &mut ss, &mut sv, w) };
-            sy::ya16_to_hsv_row(&packed, &mut rh, &mut rs, &mut rv, w);
+            unsafe { super::ya16_to_hsv_row::<false>(&packed, &mut sh, &mut ss, &mut sv, w) };
+            sy::ya16_to_hsv_row::<false>(&packed, &mut rh, &mut rs, &mut rv, w);
             assert_eq!(sh, rh, "H width={w}");
             assert_eq!(ss, rs, "S width={w}");
             assert_eq!(sv, rv, "V width={w}");
         }
     }
+
+    // ---- BE parity tests: NEON BE kernel == scalar LE kernel on byte-swapped input ----
+
+    #[test]
+    #[cfg_attr(miri, ignore = "SIMD intrinsics unsupported by Miri")]
+    fn neon_gray10_be_parity_rgb() {
+        for &w in WIDTHS {
+            let mut le = std::vec![0u16; w];
+            prng16(&mut le, 0xBE10_0001);
+            let be: std::vec::Vec<u16> = le.iter().map(|v| v.swap_bytes()).collect();
+            let mut simd_be = std::vec![0u8; w * 3];
+            let mut scal_le = std::vec![0u8; w * 3];
+            unsafe { super::gray_n_to_rgb_row::<10, true>(&be, &mut simd_be, w, true) };
+            scalar::gray_n_to_rgb_row::<10, false>(&le, &mut scal_le, w, true);
+            assert_eq!(simd_be, scal_le, "width={w}");
+        }
+    }
+
+    #[test]
+    #[cfg_attr(miri, ignore = "SIMD intrinsics unsupported by Miri")]
+    fn neon_gray16_be_parity_luma() {
+        for &w in WIDTHS {
+            let mut le = std::vec![0u16; w];
+            prng16(&mut le, 0xBE16_0002);
+            let be: std::vec::Vec<u16> = le.iter().map(|v| v.swap_bytes()).collect();
+            let mut simd_be = std::vec![0u8; w];
+            let mut scal_le = std::vec![0u8; w];
+            unsafe { super::gray16_to_luma_row::<true>(&be, &mut simd_be, w) };
+            scalar::gray16_to_luma_row::<false>(&le, &mut scal_le, w);
+            assert_eq!(simd_be, scal_le, "width={w}");
+        }
+    }
+
+    #[test]
+    #[cfg_attr(miri, ignore = "SIMD intrinsics unsupported by Miri")]
+    fn neon_grayf32_be_parity_luma() {
+        use crate::row::scalar::grayf32 as sf;
+        for &w in WIDTHS {
+            let mut le = std::vec![0.0f32; w];
+            prng_f32(&mut le, 0xBEF3_0003);
+            let be: std::vec::Vec<f32> = le
+                .iter()
+                .map(|v| f32::from_bits(v.to_bits().swap_bytes()))
+                .collect();
+            let mut simd_be = std::vec![0u8; w];
+            let mut scal_le = std::vec![0u8; w];
+            unsafe { super::grayf32_to_luma_row::<true>(&be, &mut simd_be, w) };
+            sf::grayf32_to_luma_row::<false>(&le, &mut scal_le, w);
+            assert_eq!(simd_be, scal_le, "width={w}");
+        }
+    }
+
+    #[test]
+    #[cfg_attr(miri, ignore = "SIMD intrinsics unsupported by Miri")]
+    fn neon_ya16_be_parity_luma() {
+        use crate::row::scalar::ya16 as sy;
+        for &w in WIDTHS {
+            let mut le = std::vec![0u16; w * 2];
+            prng_ya16(&mut le, 0xBEA1_0004);
+            let be: std::vec::Vec<u16> = le.iter().map(|v| v.swap_bytes()).collect();
+            let mut simd_be = std::vec![0u8; w];
+            let mut scal_le = std::vec![0u8; w];
+            unsafe { super::ya16_to_luma_row::<true>(&be, &mut simd_be, w) };
+            sy::ya16_to_luma_row::<false>(&le, &mut scal_le, w);
+            assert_eq!(simd_be, scal_le, "width={w}");
+        }
+    }
 }
diff --git a/src/row/arch/wasm_simd128/gray.rs b/src/row/arch/wasm_simd128/gray.rs
index 39f26ca9..fdb667c7 100644
--- a/src/row/arch/wasm_simd128/gray.rs
+++ b/src/row/arch/wasm_simd128/gray.rs
@@ -15,7 +15,10 @@
 use core::arch::wasm32::*;
 
-use crate::row::scalar::{bits_mask, gray as scalar, grayf32, ya8, ya16};
+use crate::row::{
+    arch::wasm_simd128::endian::{load_endian_u16x8, load_endian_u32x4},
+    scalar::{bits_mask, gray as scalar, grayf32, ya8, ya16},
+};
 
 // ---- Gray8 ------------------------------------------------------------------
 
@@ -112,7 +115,7 @@ pub(crate) unsafe fn gray8_to_hsv_row(
 /// simd128 must be enabled.
 #[inline]
 #[target_feature(enable = "simd128")]
-pub(crate) unsafe fn gray_n_to_rgb_row<const BITS: usize>(
+pub(crate) unsafe fn gray_n_to_rgb_row<const BITS: usize, const BE: bool>(
     y_plane: &[u16],
     out: &mut [u8],
     width: usize,
@@ -120,7 +123,7 @@ pub(crate) unsafe fn gray_n_to_rgb_row<const BITS: usize>(
 ) {
     debug_assert!(y_plane.len() >= width);
     debug_assert!(out.len() >= width * 3);
-    scalar::gray_n_to_rgb_row::<BITS>(y_plane, out, width, full_range);
+    scalar::gray_n_to_rgb_row::<BITS, BE>(y_plane, out, width, full_range);
 }
 
 /// wasm-simd128 `gray_n_to_rgba_row`.
@@ -131,7 +134,7 @@ pub(crate) unsafe fn gray_n_to_rgb_row<const BITS: usize>(
 /// simd128 must be enabled.
 #[inline]
 #[target_feature(enable = "simd128")]
-pub(crate) unsafe fn gray_n_to_rgba_row<const BITS: usize>(
+pub(crate) unsafe fn gray_n_to_rgba_row<const BITS: usize, const BE: bool>(
     y_plane: &[u16],
     out: &mut [u8],
     width: usize,
@@ -139,7 +142,7 @@ pub(crate) unsafe fn gray_n_to_rgba_row<const BITS: usize>(
 ) {
     debug_assert!(y_plane.len() >= width);
     debug_assert!(out.len() >= width * 4);
-    scalar::gray_n_to_rgba_row::<BITS>(y_plane, out, width, full_range);
+    scalar::gray_n_to_rgba_row::<BITS, BE>(y_plane, out, width, full_range);
 }
 
 /// wasm-simd128 `gray_n_to_rgb_u16_row`.
@@ -150,7 +153,7 @@ pub(crate) unsafe fn gray_n_to_rgba_row<const BITS: usize>(
 /// simd128 must be enabled.
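Note: the wasm counterpart `arch::wasm_simd128::endian` imported above is likewise not part of this diff. A sketch under the same assumption, a 16-byte `v128_load` followed by a compile-time-selected per-lane byte shuffle; the real helpers may differ:

```rust
// Hypothetical sketch of the wasm_simd128 endian helpers (module not shown in
// this diff): load 16 bytes, then swap byte order within each lane when BE.
use core::arch::wasm32::*;

#[inline]
#[target_feature(enable = "simd128")]
pub(crate) unsafe fn load_endian_u16x8<const BE: bool>(ptr: *const u8) -> v128 {
    let v = v128_load(ptr.cast());
    if BE {
        // Swap the two bytes of every 16-bit lane.
        u8x16_shuffle::<1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14>(v, v)
    } else {
        v
    }
}

#[inline]
#[target_feature(enable = "simd128")]
pub(crate) unsafe fn load_endian_u32x4<const BE: bool>(ptr: *const u8) -> v128 {
    let v = v128_load(ptr.cast());
    if BE {
        // Swap the four bytes of every 32-bit lane.
        u8x16_shuffle::<3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12>(v, v)
    } else {
        v
    }
}
```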
#[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn gray_n_to_rgb_u16_row( +pub(crate) unsafe fn gray_n_to_rgb_u16_row( y_plane: &[u16], out: &mut [u16], width: usize, @@ -158,7 +161,7 @@ pub(crate) unsafe fn gray_n_to_rgb_u16_row( ) { debug_assert!(y_plane.len() >= width); debug_assert!(out.len() >= width * 3); - scalar::gray_n_to_rgb_u16_row::(y_plane, out, width, full_range); + scalar::gray_n_to_rgb_u16_row::(y_plane, out, width, full_range); } /// wasm-simd128 `gray_n_to_rgba_u16_row`. @@ -169,7 +172,7 @@ pub(crate) unsafe fn gray_n_to_rgb_u16_row( /// simd128 must be enabled. #[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn gray_n_to_rgba_u16_row( +pub(crate) unsafe fn gray_n_to_rgba_u16_row( y_plane: &[u16], out: &mut [u16], width: usize, @@ -177,7 +180,7 @@ pub(crate) unsafe fn gray_n_to_rgba_u16_row( ) { debug_assert!(y_plane.len() >= width); debug_assert!(out.len() >= width * 4); - scalar::gray_n_to_rgba_u16_row::(y_plane, out, width, full_range); + scalar::gray_n_to_rgba_u16_row::(y_plane, out, width, full_range); } /// wasm-simd128 `gray_n_to_luma_row`: mask + shift → u8. 8 pixels/iter. @@ -188,7 +191,7 @@ pub(crate) unsafe fn gray_n_to_rgba_u16_row( /// simd128 must be enabled. `y_plane.len() >= width`. `out.len() >= width`. #[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn gray_n_to_luma_row( +pub(crate) unsafe fn gray_n_to_luma_row( y_plane: &[u16], out: &mut [u8], width: usize, @@ -201,7 +204,7 @@ pub(crate) unsafe fn gray_n_to_luma_row( unsafe { let mask_v = u16x8_splat(mask); while x + 8 <= width { - let raw = v128_load(y_plane.as_ptr().add(x).cast()); + let raw = load_endian_u16x8::(y_plane.as_ptr().cast::().add(x * 2)); let masked = v128_and(raw, mask_v); let shifted = u16x8_shr(masked, shift); // Narrow u16x8 → u8x8 via u8x16_narrow_i16x8 (saturation, but values @@ -215,7 +218,7 @@ pub(crate) unsafe fn gray_n_to_luma_row( } } if x < width { - scalar::gray_n_to_luma_row::(&y_plane[x..width], &mut out[x..width], width - x); + scalar::gray_n_to_luma_row::(&y_plane[x..width], &mut out[x..width], width - x); } } @@ -227,7 +230,7 @@ pub(crate) unsafe fn gray_n_to_luma_row( /// simd128 must be enabled. `y_plane.len() >= width`. `out.len() >= width`. #[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn gray_n_to_luma_u16_row( +pub(crate) unsafe fn gray_n_to_luma_u16_row( y_plane: &[u16], out: &mut [u16], width: usize, @@ -239,14 +242,14 @@ pub(crate) unsafe fn gray_n_to_luma_u16_row( unsafe { let mask_v = u16x8_splat(mask); while x + 8 <= width { - let raw = v128_load(y_plane.as_ptr().add(x).cast()); + let raw = load_endian_u16x8::(y_plane.as_ptr().cast::().add(x * 2)); let masked = v128_and(raw, mask_v); v128_store(out.as_mut_ptr().add(x).cast(), masked); x += 8; } } if x < width { - scalar::gray_n_to_luma_u16_row::(&y_plane[x..width], &mut out[x..width], width - x); + scalar::gray_n_to_luma_u16_row::(&y_plane[x..width], &mut out[x..width], width - x); } } @@ -260,7 +263,7 @@ pub(crate) unsafe fn gray_n_to_luma_u16_row( /// simd128 must be enabled. All slices have length >= width. 
#[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn gray_n_to_hsv_row( +pub(crate) unsafe fn gray_n_to_hsv_row( y_plane: &[u16], h_out: &mut [u8], s_out: &mut [u8], @@ -270,7 +273,7 @@ pub(crate) unsafe fn gray_n_to_hsv_row( ) { debug_assert!(y_plane.len() >= width); if !full_range { - return scalar::gray_n_to_hsv_row::(y_plane, h_out, s_out, v_out, width, full_range); + return scalar::gray_n_to_hsv_row::(y_plane, h_out, s_out, v_out, width, full_range); } let mask = bits_mask::(); let shift = BITS - 8; @@ -279,7 +282,7 @@ pub(crate) unsafe fn gray_n_to_hsv_row( let mask_v = u16x8_splat(mask); let zero = i64x2(0, 0); while x + 8 <= width { - let raw = v128_load(y_plane.as_ptr().add(x).cast()); + let raw = load_endian_u16x8::(y_plane.as_ptr().cast::().add(x * 2)); let masked = v128_and(raw, mask_v); let shifted = u16x8_shr(masked, shift); let narrowed = u8x16_narrow_i16x8(shifted, zero); @@ -292,7 +295,7 @@ pub(crate) unsafe fn gray_n_to_hsv_row( } } if x < width { - scalar::gray_n_to_hsv_row::( + scalar::gray_n_to_hsv_row::( &y_plane[x..width], &mut h_out[x..width], &mut s_out[x..width], @@ -313,7 +316,7 @@ pub(crate) unsafe fn gray_n_to_hsv_row( /// simd128 must be enabled. #[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn gray16_to_rgb_row( +pub(crate) unsafe fn gray16_to_rgb_row( y_plane: &[u16], out: &mut [u8], width: usize, @@ -321,7 +324,7 @@ pub(crate) unsafe fn gray16_to_rgb_row( ) { debug_assert!(y_plane.len() >= width); debug_assert!(out.len() >= width * 3); - scalar::gray16_to_rgb_row(y_plane, out, width, full_range); + scalar::gray16_to_rgb_row::(y_plane, out, width, full_range); } /// wasm-simd128 `gray16_to_rgba_row`. @@ -332,7 +335,7 @@ pub(crate) unsafe fn gray16_to_rgb_row( /// simd128 must be enabled. #[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn gray16_to_rgba_row( +pub(crate) unsafe fn gray16_to_rgba_row( y_plane: &[u16], out: &mut [u8], width: usize, @@ -340,7 +343,7 @@ pub(crate) unsafe fn gray16_to_rgba_row( ) { debug_assert!(y_plane.len() >= width); debug_assert!(out.len() >= width * 4); - scalar::gray16_to_rgba_row(y_plane, out, width, full_range); + scalar::gray16_to_rgba_row::(y_plane, out, width, full_range); } /// wasm-simd128 `gray16_to_rgb_u16_row`. @@ -351,7 +354,7 @@ pub(crate) unsafe fn gray16_to_rgba_row( /// simd128 must be enabled. #[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn gray16_to_rgb_u16_row( +pub(crate) unsafe fn gray16_to_rgb_u16_row( y_plane: &[u16], out: &mut [u16], width: usize, @@ -359,7 +362,7 @@ pub(crate) unsafe fn gray16_to_rgb_u16_row( ) { debug_assert!(y_plane.len() >= width); debug_assert!(out.len() >= width * 3); - scalar::gray16_to_rgb_u16_row(y_plane, out, width, full_range); + scalar::gray16_to_rgb_u16_row::(y_plane, out, width, full_range); } /// wasm-simd128 `gray16_to_rgba_u16_row`. @@ -371,7 +374,7 @@ pub(crate) unsafe fn gray16_to_rgb_u16_row( #[allow(dead_code)] #[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn gray16_to_rgba_u16_row( +pub(crate) unsafe fn gray16_to_rgba_u16_row( y_plane: &[u16], out: &mut [u16], width: usize, @@ -379,7 +382,7 @@ pub(crate) unsafe fn gray16_to_rgba_u16_row( ) { debug_assert!(y_plane.len() >= width); debug_assert!(out.len() >= width * 4); - scalar::gray16_to_rgba_u16_row(y_plane, out, width, full_range); + scalar::gray16_to_rgba_u16_row::(y_plane, out, width, full_range); } /// wasm-simd128 `gray16_to_luma_row`: `>> 8` → u8. 8 pixels/iter. 
@@ -390,14 +393,18 @@ pub(crate) unsafe fn gray16_to_rgba_u16_row( /// simd128 must be enabled. `y_plane.len() >= width`. `out.len() >= width`. #[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn gray16_to_luma_row(y_plane: &[u16], out: &mut [u8], width: usize) { +pub(crate) unsafe fn gray16_to_luma_row( + y_plane: &[u16], + out: &mut [u8], + width: usize, +) { debug_assert!(y_plane.len() >= width); debug_assert!(out.len() >= width); let mut x = 0usize; unsafe { let zero = i64x2(0, 0); while x + 8 <= width { - let raw = v128_load(y_plane.as_ptr().add(x).cast()); + let raw = load_endian_u16x8::(y_plane.as_ptr().cast::().add(x * 2)); let shifted = u16x8_shr(raw, 8); let narrowed = u8x16_narrow_i16x8(shifted, zero); let val = i64x2_extract_lane::<0>(narrowed) as u64; @@ -406,7 +413,7 @@ pub(crate) unsafe fn gray16_to_luma_row(y_plane: &[u16], out: &mut [u8], width: } } if x < width { - scalar::gray16_to_luma_row(&y_plane[x..width], &mut out[x..width], width - x); + scalar::gray16_to_luma_row::(&y_plane[x..width], &mut out[x..width], width - x); } } @@ -418,19 +425,23 @@ pub(crate) unsafe fn gray16_to_luma_row(y_plane: &[u16], out: &mut [u8], width: /// simd128 must be enabled. `y_plane.len() >= width`. `out.len() >= width`. #[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn gray16_to_luma_u16_row(y_plane: &[u16], out: &mut [u16], width: usize) { +pub(crate) unsafe fn gray16_to_luma_u16_row( + y_plane: &[u16], + out: &mut [u16], + width: usize, +) { debug_assert!(y_plane.len() >= width); debug_assert!(out.len() >= width); let mut x = 0usize; unsafe { while x + 8 <= width { - let y = v128_load(y_plane.as_ptr().add(x).cast()); + let y = load_endian_u16x8::(y_plane.as_ptr().cast::().add(x * 2)); v128_store(out.as_mut_ptr().add(x).cast(), y); x += 8; } } if x < width { - scalar::gray16_to_luma_u16_row(&y_plane[x..width], &mut out[x..width], width - x); + scalar::gray16_to_luma_u16_row::(&y_plane[x..width], &mut out[x..width], width - x); } } @@ -443,7 +454,7 @@ pub(crate) unsafe fn gray16_to_luma_u16_row(y_plane: &[u16], out: &mut [u16], wi /// simd128 must be enabled. All slices have length >= width. #[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn gray16_to_hsv_row( +pub(crate) unsafe fn gray16_to_hsv_row( y_plane: &[u16], h_out: &mut [u8], s_out: &mut [u8], @@ -453,13 +464,13 @@ pub(crate) unsafe fn gray16_to_hsv_row( ) { debug_assert!(y_plane.len() >= width); if !full_range { - return scalar::gray16_to_hsv_row(y_plane, h_out, s_out, v_out, width, full_range); + return scalar::gray16_to_hsv_row::(y_plane, h_out, s_out, v_out, width, full_range); } let mut x = 0usize; unsafe { let zero = i64x2(0, 0); while x + 8 <= width { - let raw = v128_load(y_plane.as_ptr().add(x).cast()); + let raw = load_endian_u16x8::(y_plane.as_ptr().cast::().add(x * 2)); let shifted = u16x8_shr(raw, 8); let narrowed = u8x16_narrow_i16x8(shifted, zero); let val = i64x2_extract_lane::<0>(narrowed) as u64; @@ -471,7 +482,7 @@ pub(crate) unsafe fn gray16_to_hsv_row( } } if x < width { - scalar::gray16_to_hsv_row( + scalar::gray16_to_hsv_row::( &y_plane[x..width], &mut h_out[x..width], &mut s_out[x..width], @@ -490,10 +501,14 @@ pub(crate) unsafe fn gray16_to_hsv_row( /// simd128 must be enabled. 
#[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn grayf32_to_rgb_row(y_plane: &[f32], out: &mut [u8], width: usize) { +pub(crate) unsafe fn grayf32_to_rgb_row( + y_plane: &[f32], + out: &mut [u8], + width: usize, +) { debug_assert!(y_plane.len() >= width); debug_assert!(out.len() >= width * 3); - grayf32::grayf32_to_rgb_row(y_plane, out, width); + grayf32::grayf32_to_rgb_row::(y_plane, out, width); } /// wasm-simd128 `grayf32_to_rgba_row`: delegates to scalar. @@ -502,10 +517,14 @@ pub(crate) unsafe fn grayf32_to_rgb_row(y_plane: &[f32], out: &mut [u8], width: /// simd128 must be enabled. #[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn grayf32_to_rgba_row(y_plane: &[f32], out: &mut [u8], width: usize) { +pub(crate) unsafe fn grayf32_to_rgba_row( + y_plane: &[f32], + out: &mut [u8], + width: usize, +) { debug_assert!(y_plane.len() >= width); debug_assert!(out.len() >= width * 4); - grayf32::grayf32_to_rgba_row(y_plane, out, width); + grayf32::grayf32_to_rgba_row::(y_plane, out, width); } /// wasm-simd128 `grayf32_to_rgb_u16_row`: delegates to scalar. @@ -514,10 +533,14 @@ pub(crate) unsafe fn grayf32_to_rgba_row(y_plane: &[f32], out: &mut [u8], width: /// simd128 must be enabled. #[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn grayf32_to_rgb_u16_row(y_plane: &[f32], out: &mut [u16], width: usize) { +pub(crate) unsafe fn grayf32_to_rgb_u16_row( + y_plane: &[f32], + out: &mut [u16], + width: usize, +) { debug_assert!(y_plane.len() >= width); debug_assert!(out.len() >= width * 3); - grayf32::grayf32_to_rgb_u16_row(y_plane, out, width); + grayf32::grayf32_to_rgb_u16_row::(y_plane, out, width); } /// wasm-simd128 `grayf32_to_rgba_u16_row`: delegates to scalar. @@ -526,10 +549,14 @@ pub(crate) unsafe fn grayf32_to_rgb_u16_row(y_plane: &[f32], out: &mut [u16], wi /// simd128 must be enabled. #[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn grayf32_to_rgba_u16_row(y_plane: &[f32], out: &mut [u16], width: usize) { +pub(crate) unsafe fn grayf32_to_rgba_u16_row( + y_plane: &[f32], + out: &mut [u16], + width: usize, +) { debug_assert!(y_plane.len() >= width); debug_assert!(out.len() >= width * 4); - grayf32::grayf32_to_rgba_u16_row(y_plane, out, width); + grayf32::grayf32_to_rgba_u16_row::(y_plane, out, width); } /// wasm-simd128 `grayf32_to_rgb_f32_row`: delegates to scalar. @@ -539,10 +566,14 @@ pub(crate) unsafe fn grayf32_to_rgba_u16_row(y_plane: &[f32], out: &mut [u16], w #[inline] #[target_feature(enable = "simd128")] #[allow(dead_code)] // dispatcher always uses scalar; function is exercised by tests only -pub(crate) unsafe fn grayf32_to_rgb_f32_row(y_plane: &[f32], out: &mut [f32], width: usize) { +pub(crate) unsafe fn grayf32_to_rgb_f32_row( + y_plane: &[f32], + out: &mut [f32], + width: usize, +) { debug_assert!(y_plane.len() >= width); debug_assert!(out.len() >= width * 3); - grayf32::grayf32_to_rgb_f32_row(y_plane, out, width); + grayf32::grayf32_to_rgb_f32_row::(y_plane, out, width); } /// wasm-simd128 `grayf32_to_luma_row`: clamp→scale→round→u8. 4 pixels/iter. @@ -554,7 +585,11 @@ pub(crate) unsafe fn grayf32_to_rgb_f32_row(y_plane: &[f32], out: &mut [f32], wi /// simd128 must be enabled. `y_plane.len() >= width`. `out.len() >= width`. 
#[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn grayf32_to_luma_row(y_plane: &[f32], out: &mut [u8], width: usize) { +pub(crate) unsafe fn grayf32_to_luma_row( + y_plane: &[f32], + out: &mut [u8], + width: usize, +) { debug_assert!(y_plane.len() >= width); debug_assert!(out.len() >= width); let scale = f32x4_splat(255.0); @@ -565,7 +600,7 @@ pub(crate) unsafe fn grayf32_to_luma_row(y_plane: &[f32], out: &mut [u8], width: let mut x = 0usize; unsafe { while x + 4 <= width { - let y = v128_load(y_plane.as_ptr().add(x).cast()); + let y = load_endian_u32x4::(y_plane.as_ptr().cast::().add(x * 4)); let clamped = f32x4_min(f32x4_max(y, zero4), one4); let scaled = f32x4_mul(clamped, scale); let rounded = i32x4_trunc_sat_f32x4(f32x4_add(scaled, half)); @@ -579,7 +614,7 @@ pub(crate) unsafe fn grayf32_to_luma_row(y_plane: &[f32], out: &mut [u8], width: } } if x < width { - grayf32::grayf32_to_luma_row(&y_plane[x..width], &mut out[x..width], width - x); + grayf32::grayf32_to_luma_row::(&y_plane[x..width], &mut out[x..width], width - x); } } @@ -589,7 +624,11 @@ pub(crate) unsafe fn grayf32_to_luma_row(y_plane: &[f32], out: &mut [u8], width: /// simd128 must be enabled. `y_plane.len() >= width`. `out.len() >= width`. #[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn grayf32_to_luma_u16_row(y_plane: &[f32], out: &mut [u16], width: usize) { +pub(crate) unsafe fn grayf32_to_luma_u16_row( + y_plane: &[f32], + out: &mut [u16], + width: usize, +) { debug_assert!(y_plane.len() >= width); debug_assert!(out.len() >= width); let scale = f32x4_splat(65535.0); @@ -600,7 +639,7 @@ pub(crate) unsafe fn grayf32_to_luma_u16_row(y_plane: &[f32], out: &mut [u16], w let mut x = 0usize; unsafe { while x + 4 <= width { - let y = v128_load(y_plane.as_ptr().add(x).cast()); + let y = load_endian_u32x4::(y_plane.as_ptr().cast::().add(x * 4)); let clamped = f32x4_min(f32x4_max(y, zero4), one4); let scaled = f32x4_mul(clamped, scale); let rounded = i32x4_trunc_sat_f32x4(f32x4_add(scaled, half)); @@ -614,7 +653,7 @@ pub(crate) unsafe fn grayf32_to_luma_u16_row(y_plane: &[f32], out: &mut [u16], w } } if x < width { - grayf32::grayf32_to_luma_u16_row(&y_plane[x..width], &mut out[x..width], width - x); + grayf32::grayf32_to_luma_u16_row::(&y_plane[x..width], &mut out[x..width], width - x); } } @@ -625,19 +664,23 @@ pub(crate) unsafe fn grayf32_to_luma_u16_row(y_plane: &[f32], out: &mut [u16], w #[inline] #[target_feature(enable = "simd128")] #[allow(dead_code)] // dispatcher always uses scalar; function is exercised by tests only -pub(crate) unsafe fn grayf32_to_luma_f32_row(y_plane: &[f32], out: &mut [f32], width: usize) { +pub(crate) unsafe fn grayf32_to_luma_f32_row( + y_plane: &[f32], + out: &mut [f32], + width: usize, +) { debug_assert!(y_plane.len() >= width); debug_assert!(out.len() >= width); let mut x = 0usize; unsafe { while x + 4 <= width { - let y = v128_load(y_plane.as_ptr().add(x).cast()); + let y = load_endian_u32x4::(y_plane.as_ptr().cast::().add(x * 4)); v128_store(out.as_mut_ptr().add(x).cast(), y); x += 4; } } if x < width { - grayf32::grayf32_to_luma_f32_row(&y_plane[x..width], &mut out[x..width], width - x); + grayf32::grayf32_to_luma_f32_row::(&y_plane[x..width], &mut out[x..width], width - x); } } @@ -647,7 +690,7 @@ pub(crate) unsafe fn grayf32_to_luma_f32_row(y_plane: &[f32], out: &mut [f32], w /// simd128 must be enabled. All slices have length >= width. 
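Note: the vector body of `grayf32_to_luma_row` below (min/max against 0 and 1, multiply by the 255 splat, add 0.5, saturating truncate, narrow) computes the same per-sample mapping as this scalar form, shown only as a reference for the arithmetic, not as the crate's scalar path:

// Scalar equivalent of the clamp → scale → round steps in the SIMD loop.
fn luma_u8_from_f32(y: f32) -> u8 {
    let clamped = y.clamp(0.0, 1.0);
    // `as` truncates after the +0.5 rounding bias, matching trunc_sat + narrow.
    (clamped * 255.0 + 0.5) as u8
}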
#[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn grayf32_to_hsv_row( +pub(crate) unsafe fn grayf32_to_hsv_row( y_plane: &[f32], h_out: &mut [u8], s_out: &mut [u8], @@ -663,7 +706,7 @@ pub(crate) unsafe fn grayf32_to_hsv_row( let mut x = 0usize; unsafe { while x + 4 <= width { - let y = v128_load(y_plane.as_ptr().add(x).cast()); + let y = load_endian_u32x4::(y_plane.as_ptr().cast::().add(x * 4)); let clamped = f32x4_min(f32x4_max(y, zero4), one4); let scaled = f32x4_mul(clamped, scale); let rounded = i32x4_trunc_sat_f32x4(f32x4_add(scaled, half)); @@ -678,7 +721,7 @@ pub(crate) unsafe fn grayf32_to_hsv_row( } } if x < width { - grayf32::grayf32_to_hsv_row( + grayf32::grayf32_to_hsv_row::( &y_plane[x..width], &mut h_out[x..width], &mut s_out[x..width], @@ -840,10 +883,10 @@ pub(crate) unsafe fn ya8_to_hsv_row( /// simd128 must be enabled. #[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn ya16_to_rgb_row(packed: &[u16], out: &mut [u8], width: usize) { +pub(crate) unsafe fn ya16_to_rgb_row(packed: &[u16], out: &mut [u8], width: usize) { debug_assert!(packed.len() >= width * 2); debug_assert!(out.len() >= width * 3); - ya16::ya16_to_rgb_row(packed, out, width); + ya16::ya16_to_rgb_row::(packed, out, width); } /// wasm-simd128 `ya16_to_rgba_row`: delegates to scalar. @@ -852,10 +895,14 @@ pub(crate) unsafe fn ya16_to_rgb_row(packed: &[u16], out: &mut [u8], width: usiz /// simd128 must be enabled. #[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn ya16_to_rgba_row(packed: &[u16], out: &mut [u8], width: usize) { +pub(crate) unsafe fn ya16_to_rgba_row( + packed: &[u16], + out: &mut [u8], + width: usize, +) { debug_assert!(packed.len() >= width * 2); debug_assert!(out.len() >= width * 4); - ya16::ya16_to_rgba_row(packed, out, width); + ya16::ya16_to_rgba_row::(packed, out, width); } /// wasm-simd128 `ya16_to_rgb_u16_row`: delegates to scalar. @@ -864,10 +911,14 @@ pub(crate) unsafe fn ya16_to_rgba_row(packed: &[u16], out: &mut [u8], width: usi /// simd128 must be enabled. #[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn ya16_to_rgb_u16_row(packed: &[u16], out: &mut [u16], width: usize) { +pub(crate) unsafe fn ya16_to_rgb_u16_row( + packed: &[u16], + out: &mut [u16], + width: usize, +) { debug_assert!(packed.len() >= width * 2); debug_assert!(out.len() >= width * 3); - ya16::ya16_to_rgb_u16_row(packed, out, width); + ya16::ya16_to_rgb_u16_row::(packed, out, width); } /// wasm-simd128 `ya16_to_rgba_u16_row`: delegates to scalar. @@ -876,10 +927,14 @@ pub(crate) unsafe fn ya16_to_rgb_u16_row(packed: &[u16], out: &mut [u16], width: /// simd128 must be enabled. #[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn ya16_to_rgba_u16_row(packed: &[u16], out: &mut [u16], width: usize) { +pub(crate) unsafe fn ya16_to_rgba_u16_row( + packed: &[u16], + out: &mut [u16], + width: usize, +) { debug_assert!(packed.len() >= width * 2); debug_assert!(out.len() >= width * 4); - ya16::ya16_to_rgba_u16_row(packed, out, width); + ya16::ya16_to_rgba_u16_row::(packed, out, width); } /// wasm-simd128 `ya16_to_luma_row`: deinterleave Y u16 → `>> 8` → u8. @@ -892,7 +947,11 @@ pub(crate) unsafe fn ya16_to_rgba_u16_row(packed: &[u16], out: &mut [u16], width /// simd128 must be enabled. `packed.len() >= width * 2`. `out.len() >= width`. 
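Note: per the doc comment above, the YA16 luma kernels take the Y word of each (Y, A) pair and keep its high byte; the shuffle masks in the function that follows gather those Y words eight at a time. The per-pixel arithmetic, as an illustration only (how the new big-endian flag normalizes the word before the shift is not shown in this hunk and would be handled in the endian helpers):

// Per-pixel view of ya16 → luma: drop alpha, keep the high byte of Y.
fn ya16_luma_pixel(pair: [u16; 2]) -> u8 {
    let y = pair[0]; // pair[1] is alpha and is discarded
    (y >> 8) as u8
}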
#[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn ya16_to_luma_row(packed: &[u16], out: &mut [u8], width: usize) { +pub(crate) unsafe fn ya16_to_luma_row( + packed: &[u16], + out: &mut [u8], + width: usize, +) { debug_assert!(packed.len() >= width * 2); debug_assert!(out.len() >= width); // Shuffle mask: gather words at indices 0,2,4,6 (byte offsets 0-1,4-5,8-9,12-13) @@ -926,7 +985,7 @@ pub(crate) unsafe fn ya16_to_luma_row(packed: &[u16], out: &mut [u8], width: usi } } if x < width { - ya16::ya16_to_luma_row(&packed[x * 2..width * 2], &mut out[x..width], width - x); + ya16::ya16_to_luma_row::(&packed[x * 2..width * 2], &mut out[x..width], width - x); } } @@ -937,7 +996,11 @@ pub(crate) unsafe fn ya16_to_luma_row(packed: &[u16], out: &mut [u8], width: usi /// simd128 must be enabled. `packed.len() >= width * 2`. `out.len() >= width`. #[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn ya16_to_luma_u16_row(packed: &[u16], out: &mut [u16], width: usize) { +pub(crate) unsafe fn ya16_to_luma_u16_row( + packed: &[u16], + out: &mut [u16], + width: usize, +) { debug_assert!(packed.len() >= width * 2); debug_assert!(out.len() >= width); let shuf_lo = i8x16(0, 1, 4, 5, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1); @@ -960,7 +1023,7 @@ pub(crate) unsafe fn ya16_to_luma_u16_row(packed: &[u16], out: &mut [u16], width } } if x < width { - ya16::ya16_to_luma_u16_row(&packed[x * 2..width * 2], &mut out[x..width], width - x); + ya16::ya16_to_luma_u16_row::(&packed[x * 2..width * 2], &mut out[x..width], width - x); } } @@ -970,7 +1033,7 @@ pub(crate) unsafe fn ya16_to_luma_u16_row(packed: &[u16], out: &mut [u16], width /// simd128 must be enabled. All slices have length >= width. #[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn ya16_to_hsv_row( +pub(crate) unsafe fn ya16_to_hsv_row( packed: &[u16], h_out: &mut [u8], s_out: &mut [u8], @@ -1005,7 +1068,7 @@ pub(crate) unsafe fn ya16_to_hsv_row( } } if x < width { - ya16::ya16_to_hsv_row( + ya16::ya16_to_hsv_row::( &packed[x * 2..width * 2], &mut h_out[x..width], &mut s_out[x..width], @@ -1055,8 +1118,8 @@ mod tests { prng_f32(&mut plane, 0xF800_0001); let mut simd = std::vec![0u8; w * 3]; let mut scal = std::vec![0u8; w * 3]; - unsafe { super::grayf32_to_rgb_row(&plane, &mut simd, w) }; - sf::grayf32_to_rgb_row(&plane, &mut scal, w); + unsafe { super::grayf32_to_rgb_row::(&plane, &mut simd, w) }; + sf::grayf32_to_rgb_row::(&plane, &mut scal, w); assert_eq!(simd, scal, "width={w}"); } } @@ -1070,8 +1133,8 @@ mod tests { prng_f32(&mut plane, 0xF800_0002); let mut simd = std::vec![0u8; w * 4]; let mut scal = std::vec![0u8; w * 4]; - unsafe { super::grayf32_to_rgba_row(&plane, &mut simd, w) }; - sf::grayf32_to_rgba_row(&plane, &mut scal, w); + unsafe { super::grayf32_to_rgba_row::(&plane, &mut simd, w) }; + sf::grayf32_to_rgba_row::(&plane, &mut scal, w); assert_eq!(simd, scal, "width={w}"); } } @@ -1085,8 +1148,8 @@ mod tests { prng_f32(&mut plane, 0xF800_0003); let mut simd = std::vec![0u16; w * 3]; let mut scal = std::vec![0u16; w * 3]; - unsafe { super::grayf32_to_rgb_u16_row(&plane, &mut simd, w) }; - sf::grayf32_to_rgb_u16_row(&plane, &mut scal, w); + unsafe { super::grayf32_to_rgb_u16_row::(&plane, &mut simd, w) }; + sf::grayf32_to_rgb_u16_row::(&plane, &mut scal, w); assert_eq!(simd, scal, "width={w}"); } } @@ -1100,8 +1163,8 @@ mod tests { prng_f32(&mut plane, 0xF800_0004); let mut simd = std::vec![0u16; w * 4]; let mut scal = std::vec![0u16; w * 4]; - unsafe { 
super::grayf32_to_rgba_u16_row(&plane, &mut simd, w) }; - sf::grayf32_to_rgba_u16_row(&plane, &mut scal, w); + unsafe { super::grayf32_to_rgba_u16_row::(&plane, &mut simd, w) }; + sf::grayf32_to_rgba_u16_row::(&plane, &mut scal, w); assert_eq!(simd, scal, "width={w}"); } } @@ -1115,8 +1178,8 @@ mod tests { prng_f32(&mut plane, 0xF800_0005); let mut simd = std::vec![0.0f32; w * 3]; let mut scal = std::vec![0.0f32; w * 3]; - unsafe { super::grayf32_to_rgb_f32_row(&plane, &mut simd, w) }; - sf::grayf32_to_rgb_f32_row(&plane, &mut scal, w); + unsafe { super::grayf32_to_rgb_f32_row::(&plane, &mut simd, w) }; + sf::grayf32_to_rgb_f32_row::(&plane, &mut scal, w); assert_eq!(simd, scal, "width={w}"); } } @@ -1130,8 +1193,8 @@ mod tests { prng_f32(&mut plane, 0xF800_0006); let mut simd = std::vec![0u8; w]; let mut scal = std::vec![0u8; w]; - unsafe { super::grayf32_to_luma_row(&plane, &mut simd, w) }; - sf::grayf32_to_luma_row(&plane, &mut scal, w); + unsafe { super::grayf32_to_luma_row::(&plane, &mut simd, w) }; + sf::grayf32_to_luma_row::(&plane, &mut scal, w); assert_eq!(simd, scal, "width={w}"); } } @@ -1145,8 +1208,8 @@ mod tests { prng_f32(&mut plane, 0xF800_0007); let mut simd = std::vec![0u16; w]; let mut scal = std::vec![0u16; w]; - unsafe { super::grayf32_to_luma_u16_row(&plane, &mut simd, w) }; - sf::grayf32_to_luma_u16_row(&plane, &mut scal, w); + unsafe { super::grayf32_to_luma_u16_row::(&plane, &mut simd, w) }; + sf::grayf32_to_luma_u16_row::(&plane, &mut scal, w); assert_eq!(simd, scal, "width={w}"); } } @@ -1160,8 +1223,8 @@ mod tests { prng_f32(&mut plane, 0xF800_0008); let mut simd = std::vec![0.0f32; w]; let mut scal = std::vec![0.0f32; w]; - unsafe { super::grayf32_to_luma_f32_row(&plane, &mut simd, w) }; - sf::grayf32_to_luma_f32_row(&plane, &mut scal, w); + unsafe { super::grayf32_to_luma_f32_row::(&plane, &mut simd, w) }; + sf::grayf32_to_luma_f32_row::(&plane, &mut scal, w); assert_eq!(simd, scal, "width={w}"); } } @@ -1179,8 +1242,8 @@ mod tests { let mut rh = std::vec![0u8; w]; let mut rs = std::vec![0u8; w]; let mut rv = std::vec![0u8; w]; - unsafe { super::grayf32_to_hsv_row(&plane, &mut sh, &mut ss, &mut sv, w) }; - sf::grayf32_to_hsv_row(&plane, &mut rh, &mut rs, &mut rv, w); + unsafe { super::grayf32_to_hsv_row::(&plane, &mut sh, &mut ss, &mut sv, w) }; + sf::grayf32_to_hsv_row::(&plane, &mut rh, &mut rs, &mut rv, w); assert_eq!(sh, rh, "H width={w}"); assert_eq!(ss, rs, "S width={w}"); assert_eq!(sv, rv, "V width={w}"); @@ -1327,8 +1390,8 @@ mod tests { prng_ya16(&mut packed, 0xA862_0001); let mut simd = std::vec![0u8; w * 3]; let mut scal = std::vec![0u8; w * 3]; - unsafe { super::ya16_to_rgb_row(&packed, &mut simd, w) }; - sy::ya16_to_rgb_row(&packed, &mut scal, w); + unsafe { super::ya16_to_rgb_row::(&packed, &mut simd, w) }; + sy::ya16_to_rgb_row::(&packed, &mut scal, w); assert_eq!(simd, scal, "width={w}"); } } @@ -1342,8 +1405,8 @@ mod tests { prng_ya16(&mut packed, 0xA862_0002); let mut simd = std::vec![0u8; w * 4]; let mut scal = std::vec![0u8; w * 4]; - unsafe { super::ya16_to_rgba_row(&packed, &mut simd, w) }; - sy::ya16_to_rgba_row(&packed, &mut scal, w); + unsafe { super::ya16_to_rgba_row::(&packed, &mut simd, w) }; + sy::ya16_to_rgba_row::(&packed, &mut scal, w); assert_eq!(simd, scal, "width={w}"); } } @@ -1357,8 +1420,8 @@ mod tests { prng_ya16(&mut packed, 0xA862_0003); let mut simd = std::vec![0u16; w * 3]; let mut scal = std::vec![0u16; w * 3]; - unsafe { super::ya16_to_rgb_u16_row(&packed, &mut simd, w) }; - sy::ya16_to_rgb_u16_row(&packed, &mut 
scal, w); + unsafe { super::ya16_to_rgb_u16_row::(&packed, &mut simd, w) }; + sy::ya16_to_rgb_u16_row::(&packed, &mut scal, w); assert_eq!(simd, scal, "width={w}"); } } @@ -1372,8 +1435,8 @@ mod tests { prng_ya16(&mut packed, 0xA862_0004); let mut simd = std::vec![0u16; w * 4]; let mut scal = std::vec![0u16; w * 4]; - unsafe { super::ya16_to_rgba_u16_row(&packed, &mut simd, w) }; - sy::ya16_to_rgba_u16_row(&packed, &mut scal, w); + unsafe { super::ya16_to_rgba_u16_row::(&packed, &mut simd, w) }; + sy::ya16_to_rgba_u16_row::(&packed, &mut scal, w); assert_eq!(simd, scal, "width={w}"); } } @@ -1387,8 +1450,8 @@ mod tests { prng_ya16(&mut packed, 0xA862_0005); let mut simd = std::vec![0u8; w]; let mut scal = std::vec![0u8; w]; - unsafe { super::ya16_to_luma_row(&packed, &mut simd, w) }; - sy::ya16_to_luma_row(&packed, &mut scal, w); + unsafe { super::ya16_to_luma_row::(&packed, &mut simd, w) }; + sy::ya16_to_luma_row::(&packed, &mut scal, w); assert_eq!(simd, scal, "width={w}"); } } @@ -1402,8 +1465,8 @@ mod tests { prng_ya16(&mut packed, 0xA862_0006); let mut simd = std::vec![0u16; w]; let mut scal = std::vec![0u16; w]; - unsafe { super::ya16_to_luma_u16_row(&packed, &mut simd, w) }; - sy::ya16_to_luma_u16_row(&packed, &mut scal, w); + unsafe { super::ya16_to_luma_u16_row::(&packed, &mut simd, w) }; + sy::ya16_to_luma_u16_row::(&packed, &mut scal, w); assert_eq!(simd, scal, "width={w}"); } } @@ -1421,8 +1484,8 @@ mod tests { let mut rh = std::vec![0u8; w]; let mut rs = std::vec![0u8; w]; let mut rv = std::vec![0u8; w]; - unsafe { super::ya16_to_hsv_row(&packed, &mut sh, &mut ss, &mut sv, w) }; - sy::ya16_to_hsv_row(&packed, &mut rh, &mut rs, &mut rv, w); + unsafe { super::ya16_to_hsv_row::(&packed, &mut sh, &mut ss, &mut sv, w) }; + sy::ya16_to_hsv_row::(&packed, &mut rh, &mut rs, &mut rv, w); assert_eq!(sh, rh, "H width={w}"); assert_eq!(ss, rs, "S width={w}"); assert_eq!(sv, rv, "V width={w}"); @@ -1457,8 +1520,8 @@ mod tests { prng16(&mut plane, 0xCAFE_BABE); let mut simd = std::vec![0u16; w]; let mut scal = std::vec![0u16; w]; - unsafe { super::gray_n_to_luma_u16_row::<10>(&plane, &mut simd, w) }; - scalar::gray_n_to_luma_u16_row::<10>(&plane, &mut scal, w); + unsafe { super::gray_n_to_luma_u16_row::<10, false>(&plane, &mut simd, w) }; + scalar::gray_n_to_luma_u16_row::<10, false>(&plane, &mut scal, w); assert_eq!(simd, scal, "width={w}"); } } @@ -1471,9 +1534,41 @@ mod tests { prng16(&mut plane, 0xDEAD_BEEF); let mut simd = std::vec![0u16; w]; let mut scal = std::vec![0u16; w]; - unsafe { super::gray16_to_luma_u16_row(&plane, &mut simd, w) }; - scalar::gray16_to_luma_u16_row(&plane, &mut scal, w); + unsafe { super::gray16_to_luma_u16_row::(&plane, &mut simd, w) }; + scalar::gray16_to_luma_u16_row::(&plane, &mut scal, w); assert_eq!(simd, scal, "width={w}"); } } + + // ---- BE parity tests -------------------------------------------------------- + + #[test] + #[cfg_attr(miri, ignore = "SIMD intrinsics unsupported by Miri")] + fn wasm_gray10_be_parity_luma() { + for &w in WIDTHS { + let mut le = std::vec![0u16; w]; + prng16(&mut le, 0xBE10_0001); + let be: std::vec::Vec = le.iter().map(|v| v.swap_bytes()).collect(); + let mut simd_be = std::vec![0u8; w]; + let mut scal_le = std::vec![0u8; w]; + unsafe { super::gray_n_to_luma_row::<10, true>(&be, &mut simd_be, w) }; + scalar::gray_n_to_luma_row::<10, false>(&le, &mut scal_le, w); + assert_eq!(simd_be, scal_le, "width={w}"); + } + } + + #[test] + #[cfg_attr(miri, ignore = "SIMD intrinsics unsupported by Miri")] + fn 
wasm_gray16_be_parity_luma() { + for &w in WIDTHS { + let mut le = std::vec![0u16; w]; + prng16(&mut le, 0xBE16_0002); + let be: std::vec::Vec = le.iter().map(|v| v.swap_bytes()).collect(); + let mut simd_be = std::vec![0u8; w]; + let mut scal_le = std::vec![0u8; w]; + unsafe { super::gray16_to_luma_row::(&be, &mut simd_be, w) }; + scalar::gray16_to_luma_row::(&le, &mut scal_le, w); + assert_eq!(simd_be, scal_le, "width={w}"); + } + } } diff --git a/src/row/arch/x86_avx2/gray.rs b/src/row/arch/x86_avx2/gray.rs index a0977b02..48e973b8 100644 --- a/src/row/arch/x86_avx2/gray.rs +++ b/src/row/arch/x86_avx2/gray.rs @@ -16,7 +16,10 @@ use core::arch::x86_64::*; -use crate::row::scalar::{bits_mask, gray as scalar}; +use crate::row::{ + arch::x86_avx2::endian::{load_endian_u16x16, load_endian_u32x8}, + scalar::{bits_mask, gray as scalar}, +}; // ---- Gray8 ------------------------------------------------------------------ @@ -120,7 +123,7 @@ pub(crate) unsafe fn gray8_to_hsv_row( #[allow(dead_code)] #[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn gray_n_to_rgb_row( +pub(crate) unsafe fn gray_n_to_rgb_row( y_plane: &[u16], out: &mut [u8], width: usize, @@ -128,7 +131,7 @@ pub(crate) unsafe fn gray_n_to_rgb_row( ) { debug_assert!(y_plane.len() >= width); debug_assert!(out.len() >= width * 3); - scalar::gray_n_to_rgb_row::(y_plane, out, width, full_range); + scalar::gray_n_to_rgb_row::(y_plane, out, width, full_range); } /// AVX2 `gray_n_to_rgba_row`. @@ -140,7 +143,7 @@ pub(crate) unsafe fn gray_n_to_rgb_row( #[allow(dead_code)] #[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn gray_n_to_rgba_row( +pub(crate) unsafe fn gray_n_to_rgba_row( y_plane: &[u16], out: &mut [u8], width: usize, @@ -148,7 +151,7 @@ pub(crate) unsafe fn gray_n_to_rgba_row( ) { debug_assert!(y_plane.len() >= width); debug_assert!(out.len() >= width * 4); - scalar::gray_n_to_rgba_row::(y_plane, out, width, full_range); + scalar::gray_n_to_rgba_row::(y_plane, out, width, full_range); } /// AVX2 `gray_n_to_rgb_u16_row`. @@ -160,7 +163,7 @@ pub(crate) unsafe fn gray_n_to_rgba_row( #[allow(dead_code)] #[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn gray_n_to_rgb_u16_row( +pub(crate) unsafe fn gray_n_to_rgb_u16_row( y_plane: &[u16], out: &mut [u16], width: usize, @@ -168,7 +171,7 @@ pub(crate) unsafe fn gray_n_to_rgb_u16_row( ) { debug_assert!(y_plane.len() >= width); debug_assert!(out.len() >= width * 3); - scalar::gray_n_to_rgb_u16_row::(y_plane, out, width, full_range); + scalar::gray_n_to_rgb_u16_row::(y_plane, out, width, full_range); } /// AVX2 `gray_n_to_rgba_u16_row`. @@ -180,7 +183,7 @@ pub(crate) unsafe fn gray_n_to_rgb_u16_row( #[allow(dead_code)] #[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn gray_n_to_rgba_u16_row( +pub(crate) unsafe fn gray_n_to_rgba_u16_row( y_plane: &[u16], out: &mut [u16], width: usize, @@ -188,7 +191,7 @@ pub(crate) unsafe fn gray_n_to_rgba_u16_row( ) { debug_assert!(y_plane.len() >= width); debug_assert!(out.len() >= width * 4); - scalar::gray_n_to_rgba_u16_row::(y_plane, out, width, full_range); + scalar::gray_n_to_rgba_u16_row::(y_plane, out, width, full_range); } /// AVX2 `gray_n_to_luma_row`: mask + shift to u8. 16 pixels/iter. 
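Note: the AVX2 file now imports `load_endian_u16x16` and `load_endian_u32x8` from its endian module, which this diff does not show. A plausible sketch of the u16 variant, assuming the same big-endian const flag as elsewhere (the `BE` name and the body are assumptions, not the crate's code):

#[cfg(target_arch = "x86_64")]
use core::arch::x86_64::*;

// Hypothetical sketch: unaligned 16 x u16 load with an optional in-lane
// byte swap for big-endian sources.
#[cfg(target_arch = "x86_64")]
#[inline]
#[target_feature(enable = "avx2")]
unsafe fn load_endian_u16x16<const BE: bool>(ptr: *const u8) -> __m256i {
    let raw = _mm256_loadu_si256(ptr.cast());
    if BE {
        // pshufb mask that swaps the bytes of every 16-bit lane; the same
        // 16-byte pattern is used in both 128-bit halves.
        let swap = _mm256_setr_epi8(
            1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14,
            1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14,
        );
        _mm256_shuffle_epi8(raw, swap)
    } else {
        raw
    }
}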
@@ -200,7 +203,7 @@ pub(crate) unsafe fn gray_n_to_rgba_u16_row( #[allow(dead_code)] #[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn gray_n_to_luma_row( +pub(crate) unsafe fn gray_n_to_luma_row( y_plane: &[u16], out: &mut [u8], width: usize, @@ -215,7 +218,7 @@ pub(crate) unsafe fn gray_n_to_luma_row( // requires a literal const generic shift not expressible as `BITS - 8`. let shr = _mm_cvtsi32_si128((BITS - 8) as i32); while x + 16 <= width { - let raw = _mm256_loadu_si256(y_plane.as_ptr().add(x).cast()); + let raw = load_endian_u16x16::(y_plane.as_ptr().cast::().add(x * 2)); let masked = _mm256_and_si256(raw, mask_v); let shifted = _mm256_srl_epi16(masked, shr); // Pack u16x16 → u8x16 (with lane-cross fixup via permute4x64) @@ -231,7 +234,7 @@ pub(crate) unsafe fn gray_n_to_luma_row( } } if x < width { - scalar::gray_n_to_luma_row::(&y_plane[x..width], &mut out[x..width], width - x); + scalar::gray_n_to_luma_row::(&y_plane[x..width], &mut out[x..width], width - x); } } @@ -244,7 +247,7 @@ pub(crate) unsafe fn gray_n_to_luma_row( #[allow(dead_code)] #[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn gray_n_to_luma_u16_row( +pub(crate) unsafe fn gray_n_to_luma_u16_row( y_plane: &[u16], out: &mut [u16], width: usize, @@ -256,14 +259,14 @@ pub(crate) unsafe fn gray_n_to_luma_u16_row( unsafe { let mask_v = _mm256_set1_epi16(mask as i16); while x + 16 <= width { - let raw = _mm256_loadu_si256(y_plane.as_ptr().add(x).cast()); + let raw = load_endian_u16x16::(y_plane.as_ptr().cast::().add(x * 2)); let masked = _mm256_and_si256(raw, mask_v); _mm256_storeu_si256(out.as_mut_ptr().add(x).cast(), masked); x += 16; } } if x < width { - scalar::gray_n_to_luma_u16_row::(&y_plane[x..width], &mut out[x..width], width - x); + scalar::gray_n_to_luma_u16_row::(&y_plane[x..width], &mut out[x..width], width - x); } } @@ -277,7 +280,7 @@ pub(crate) unsafe fn gray_n_to_luma_u16_row( #[allow(dead_code)] #[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn gray_n_to_hsv_row( +pub(crate) unsafe fn gray_n_to_hsv_row( y_plane: &[u16], h_out: &mut [u8], s_out: &mut [u8], @@ -287,7 +290,7 @@ pub(crate) unsafe fn gray_n_to_hsv_row( ) { debug_assert!(y_plane.len() >= width); if !full_range { - return scalar::gray_n_to_hsv_row::(y_plane, h_out, s_out, v_out, width, full_range); + return scalar::gray_n_to_hsv_row::(y_plane, h_out, s_out, v_out, width, full_range); } let mask = bits_mask::(); let mut x = 0usize; @@ -296,7 +299,7 @@ pub(crate) unsafe fn gray_n_to_hsv_row( let shr = _mm_cvtsi32_si128((BITS - 8) as i32); let zero256 = _mm256_setzero_si256(); while x + 16 <= width { - let raw = _mm256_loadu_si256(y_plane.as_ptr().add(x).cast()); + let raw = load_endian_u16x16::(y_plane.as_ptr().cast::().add(x * 2)); let masked = _mm256_and_si256(raw, mask_v); let shifted = _mm256_srl_epi16(masked, shr); let packed = _mm256_permute4x64_epi64::<0xD8>(_mm256_packus_epi16(shifted, zero256)); @@ -315,7 +318,7 @@ pub(crate) unsafe fn gray_n_to_hsv_row( } } if x < width { - scalar::gray_n_to_hsv_row::( + scalar::gray_n_to_hsv_row::( &y_plane[x..width], &mut h_out[x..width], &mut s_out[x..width], @@ -337,7 +340,7 @@ pub(crate) unsafe fn gray_n_to_hsv_row( #[allow(dead_code)] #[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn gray16_to_rgb_row( +pub(crate) unsafe fn gray16_to_rgb_row( y_plane: &[u16], out: &mut [u8], width: usize, @@ -345,7 +348,7 @@ pub(crate) unsafe fn gray16_to_rgb_row( ) { debug_assert!(y_plane.len() >= width); debug_assert!(out.len() >= width * 3); - 
scalar::gray16_to_rgb_row(y_plane, out, width, full_range); + scalar::gray16_to_rgb_row::(y_plane, out, width, full_range); } /// AVX2 `gray16_to_rgba_row`. @@ -357,7 +360,7 @@ pub(crate) unsafe fn gray16_to_rgb_row( #[allow(dead_code)] #[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn gray16_to_rgba_row( +pub(crate) unsafe fn gray16_to_rgba_row( y_plane: &[u16], out: &mut [u8], width: usize, @@ -365,7 +368,7 @@ pub(crate) unsafe fn gray16_to_rgba_row( ) { debug_assert!(y_plane.len() >= width); debug_assert!(out.len() >= width * 4); - scalar::gray16_to_rgba_row(y_plane, out, width, full_range); + scalar::gray16_to_rgba_row::(y_plane, out, width, full_range); } /// AVX2 `gray16_to_rgb_u16_row`. @@ -377,7 +380,7 @@ pub(crate) unsafe fn gray16_to_rgba_row( #[allow(dead_code)] #[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn gray16_to_rgb_u16_row( +pub(crate) unsafe fn gray16_to_rgb_u16_row( y_plane: &[u16], out: &mut [u16], width: usize, @@ -385,7 +388,7 @@ pub(crate) unsafe fn gray16_to_rgb_u16_row( ) { debug_assert!(y_plane.len() >= width); debug_assert!(out.len() >= width * 3); - scalar::gray16_to_rgb_u16_row(y_plane, out, width, full_range); + scalar::gray16_to_rgb_u16_row::(y_plane, out, width, full_range); } /// AVX2 `gray16_to_rgba_u16_row`. @@ -397,7 +400,7 @@ pub(crate) unsafe fn gray16_to_rgb_u16_row( #[allow(dead_code)] #[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn gray16_to_rgba_u16_row( +pub(crate) unsafe fn gray16_to_rgba_u16_row( y_plane: &[u16], out: &mut [u16], width: usize, @@ -405,7 +408,7 @@ pub(crate) unsafe fn gray16_to_rgba_u16_row( ) { debug_assert!(y_plane.len() >= width); debug_assert!(out.len() >= width * 4); - scalar::gray16_to_rgba_u16_row(y_plane, out, width, full_range); + scalar::gray16_to_rgba_u16_row::(y_plane, out, width, full_range); } /// AVX2 `gray16_to_luma_row`: `>> 8`, pack, store. 16 pixels/iter. @@ -417,14 +420,18 @@ pub(crate) unsafe fn gray16_to_rgba_u16_row( #[allow(dead_code)] #[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn gray16_to_luma_row(y_plane: &[u16], out: &mut [u8], width: usize) { +pub(crate) unsafe fn gray16_to_luma_row( + y_plane: &[u16], + out: &mut [u8], + width: usize, +) { debug_assert!(y_plane.len() >= width); debug_assert!(out.len() >= width); let mut x = 0usize; unsafe { let zero = _mm256_setzero_si256(); while x + 16 <= width { - let raw = _mm256_loadu_si256(y_plane.as_ptr().add(x).cast()); + let raw = load_endian_u16x16::(y_plane.as_ptr().cast::().add(x * 2)); let shifted = _mm256_srli_epi16(raw, 8); // Pack u16x16 → u8x16 with lane-cross fixup. 
let packed = _mm256_permute4x64_epi64::<0xD8>(_mm256_packus_epi16(shifted, zero)); @@ -434,7 +441,7 @@ pub(crate) unsafe fn gray16_to_luma_row(y_plane: &[u16], out: &mut [u8], width: } } if x < width { - scalar::gray16_to_luma_row(&y_plane[x..width], &mut out[x..width], width - x); + scalar::gray16_to_luma_row::(&y_plane[x..width], &mut out[x..width], width - x); } } @@ -447,19 +454,23 @@ pub(crate) unsafe fn gray16_to_luma_row(y_plane: &[u16], out: &mut [u8], width: #[allow(dead_code)] #[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn gray16_to_luma_u16_row(y_plane: &[u16], out: &mut [u16], width: usize) { +pub(crate) unsafe fn gray16_to_luma_u16_row( + y_plane: &[u16], + out: &mut [u16], + width: usize, +) { debug_assert!(y_plane.len() >= width); debug_assert!(out.len() >= width); let mut x = 0usize; unsafe { while x + 16 <= width { - let y = _mm256_loadu_si256(y_plane.as_ptr().add(x).cast()); + let y = load_endian_u16x16::(y_plane.as_ptr().cast::().add(x * 2)); _mm256_storeu_si256(out.as_mut_ptr().add(x).cast(), y); x += 16; } } if x < width { - scalar::gray16_to_luma_u16_row(&y_plane[x..width], &mut out[x..width], width - x); + scalar::gray16_to_luma_u16_row::(&y_plane[x..width], &mut out[x..width], width - x); } } @@ -473,7 +484,7 @@ pub(crate) unsafe fn gray16_to_luma_u16_row(y_plane: &[u16], out: &mut [u16], wi #[allow(dead_code)] #[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn gray16_to_hsv_row( +pub(crate) unsafe fn gray16_to_hsv_row( y_plane: &[u16], h_out: &mut [u8], s_out: &mut [u8], @@ -483,13 +494,13 @@ pub(crate) unsafe fn gray16_to_hsv_row( ) { debug_assert!(y_plane.len() >= width); if !full_range { - return scalar::gray16_to_hsv_row(y_plane, h_out, s_out, v_out, width, full_range); + return scalar::gray16_to_hsv_row::(y_plane, h_out, s_out, v_out, width, full_range); } let mut x = 0usize; unsafe { let zero = _mm256_setzero_si256(); while x + 16 <= width { - let raw = _mm256_loadu_si256(y_plane.as_ptr().add(x).cast()); + let raw = load_endian_u16x16::(y_plane.as_ptr().cast::().add(x * 2)); let shifted = _mm256_srli_epi16(raw, 8); let packed = _mm256_permute4x64_epi64::<0xD8>(_mm256_packus_epi16(shifted, zero)); let lo = _mm256_castsi256_si128(packed); @@ -506,7 +517,7 @@ pub(crate) unsafe fn gray16_to_hsv_row( } } if x < width { - scalar::gray16_to_hsv_row( + scalar::gray16_to_hsv_row::( &y_plane[x..width], &mut h_out[x..width], &mut s_out[x..width], @@ -529,7 +540,11 @@ pub(crate) unsafe fn gray16_to_hsv_row( /// AVX2 must be available. 
#[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn grayf32_to_rgb_row(y_plane: &[f32], out: &mut [u8], width: usize) { +pub(crate) unsafe fn grayf32_to_rgb_row( + y_plane: &[f32], + out: &mut [u8], + width: usize, +) { use crate::row::scalar::grayf32 as scalar; debug_assert!(y_plane.len() >= width); debug_assert!(out.len() >= width * 3); @@ -539,7 +554,9 @@ pub(crate) unsafe fn grayf32_to_rgb_row(y_plane: &[f32], out: &mut [u8], width: let mut x = 0usize; unsafe { while x + 8 <= width { - let y = _mm256_loadu_ps(y_plane.as_ptr().add(x)); + let y = _mm256_castsi256_ps(load_endian_u32x8::( + y_plane.as_ptr().cast::().add(x * 4), + )); let clamped = _mm256_min_ps(_mm256_max_ps(y, zero), one); let scaled = _mm256_mul_ps(clamped, scale); let int32 = _mm256_cvttps_epi32(_mm256_add_ps(scaled, _mm256_set1_ps(0.5))); @@ -561,7 +578,7 @@ pub(crate) unsafe fn grayf32_to_rgb_row(y_plane: &[f32], out: &mut [u8], width: } } if x < width { - scalar::grayf32_to_rgb_row(&y_plane[x..width], &mut out[x * 3..width * 3], width - x); + scalar::grayf32_to_rgb_row::(&y_plane[x..width], &mut out[x * 3..width * 3], width - x); } } @@ -571,7 +588,11 @@ pub(crate) unsafe fn grayf32_to_rgb_row(y_plane: &[f32], out: &mut [u8], width: /// AVX2 must be available. #[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn grayf32_to_rgba_row(y_plane: &[f32], out: &mut [u8], width: usize) { +pub(crate) unsafe fn grayf32_to_rgba_row( + y_plane: &[f32], + out: &mut [u8], + width: usize, +) { use crate::row::scalar::grayf32 as scalar; debug_assert!(y_plane.len() >= width); debug_assert!(out.len() >= width * 4); @@ -581,7 +602,9 @@ pub(crate) unsafe fn grayf32_to_rgba_row(y_plane: &[f32], out: &mut [u8], width: let mut x = 0usize; unsafe { while x + 8 <= width { - let y = _mm256_loadu_ps(y_plane.as_ptr().add(x)); + let y = _mm256_castsi256_ps(load_endian_u32x8::( + y_plane.as_ptr().cast::().add(x * 4), + )); let clamped = _mm256_min_ps(_mm256_max_ps(y, zero), one); let scaled = _mm256_mul_ps(clamped, scale); let int32 = _mm256_cvttps_epi32(_mm256_add_ps(scaled, _mm256_set1_ps(0.5))); @@ -602,7 +625,7 @@ pub(crate) unsafe fn grayf32_to_rgba_row(y_plane: &[f32], out: &mut [u8], width: } } if x < width { - scalar::grayf32_to_rgba_row(&y_plane[x..width], &mut out[x * 4..width * 4], width - x); + scalar::grayf32_to_rgba_row::(&y_plane[x..width], &mut out[x * 4..width * 4], width - x); } } @@ -612,7 +635,11 @@ pub(crate) unsafe fn grayf32_to_rgba_row(y_plane: &[f32], out: &mut [u8], width: /// AVX2 must be available. 
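Note: in the grayf32 paths above, the f32 plane is loaded through the integer helper and then bit-cast with `_mm256_castsi256_ps`. The point of that ordering is that a big-endian f32 has to be byte-swapped as raw bits before it is reinterpreted as a float. A scalar picture of the same idea (the `BE` parameter name is a stand-in):

// Read one f32 sample from raw bytes, swapping when the source is big-endian,
// then reinterpret the bits; mirrors load_endian_u32x8 + castsi256_ps.
fn read_f32_sample<const BE: bool>(bytes: [u8; 4]) -> f32 {
    let bits = u32::from_le_bytes(bytes);
    f32::from_bits(if BE { bits.swap_bytes() } else { bits })
}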
#[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn grayf32_to_rgb_u16_row(y_plane: &[f32], out: &mut [u16], width: usize) { +pub(crate) unsafe fn grayf32_to_rgb_u16_row( + y_plane: &[f32], + out: &mut [u16], + width: usize, +) { use crate::row::scalar::grayf32 as scalar; debug_assert!(y_plane.len() >= width); debug_assert!(out.len() >= width * 3); @@ -622,7 +649,9 @@ pub(crate) unsafe fn grayf32_to_rgb_u16_row(y_plane: &[f32], out: &mut [u16], wi let mut x = 0usize; unsafe { while x + 8 <= width { - let y = _mm256_loadu_ps(y_plane.as_ptr().add(x)); + let y = _mm256_castsi256_ps(load_endian_u32x8::( + y_plane.as_ptr().cast::().add(x * 4), + )); let clamped = _mm256_min_ps(_mm256_max_ps(y, zero), one); let scaled = _mm256_mul_ps(clamped, scale); let int32 = _mm256_cvttps_epi32(_mm256_add_ps(scaled, _mm256_set1_ps(0.5))); @@ -668,7 +697,7 @@ pub(crate) unsafe fn grayf32_to_rgb_u16_row(y_plane: &[f32], out: &mut [u16], wi } } if x < width { - scalar::grayf32_to_rgb_u16_row(&y_plane[x..width], &mut out[x * 3..width * 3], width - x); + scalar::grayf32_to_rgb_u16_row::(&y_plane[x..width], &mut out[x * 3..width * 3], width - x); } } @@ -678,7 +707,11 @@ pub(crate) unsafe fn grayf32_to_rgb_u16_row(y_plane: &[f32], out: &mut [u16], wi /// AVX2 must be available. #[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn grayf32_to_rgba_u16_row(y_plane: &[f32], out: &mut [u16], width: usize) { +pub(crate) unsafe fn grayf32_to_rgba_u16_row( + y_plane: &[f32], + out: &mut [u16], + width: usize, +) { use crate::row::scalar::grayf32 as scalar; debug_assert!(y_plane.len() >= width); debug_assert!(out.len() >= width * 4); @@ -688,7 +721,9 @@ pub(crate) unsafe fn grayf32_to_rgba_u16_row(y_plane: &[f32], out: &mut [u16], w let mut x = 0usize; unsafe { while x + 8 <= width { - let y = _mm256_loadu_ps(y_plane.as_ptr().add(x)); + let y = _mm256_castsi256_ps(load_endian_u32x8::( + y_plane.as_ptr().cast::().add(x * 4), + )); let clamped = _mm256_min_ps(_mm256_max_ps(y, zero), one); let scaled = _mm256_mul_ps(clamped, scale); let int32 = _mm256_cvttps_epi32(_mm256_add_ps(scaled, _mm256_set1_ps(0.5))); @@ -740,7 +775,11 @@ pub(crate) unsafe fn grayf32_to_rgba_u16_row(y_plane: &[f32], out: &mut [u16], w } } if x < width { - scalar::grayf32_to_rgba_u16_row(&y_plane[x..width], &mut out[x * 4..width * 4], width - x); + scalar::grayf32_to_rgba_u16_row::( + &y_plane[x..width], + &mut out[x * 4..width * 4], + width - x, + ); } } @@ -751,11 +790,15 @@ pub(crate) unsafe fn grayf32_to_rgba_u16_row(y_plane: &[f32], out: &mut [u16], w #[allow(dead_code)] // dispatcher uses scalar directly for lossless f32 paths #[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn grayf32_to_rgb_f32_row(y_plane: &[f32], out: &mut [f32], width: usize) { +pub(crate) unsafe fn grayf32_to_rgb_f32_row( + y_plane: &[f32], + out: &mut [f32], + width: usize, +) { use crate::row::scalar::grayf32 as scalar; debug_assert!(y_plane.len() >= width); debug_assert!(out.len() >= width * 3); - scalar::grayf32_to_rgb_f32_row(y_plane, out, width); + scalar::grayf32_to_rgb_f32_row::(y_plane, out, width); } /// AVX2 `grayf32_to_luma_row`: clamp [0,1] × 255 → u8. 8 px/iter. @@ -764,7 +807,11 @@ pub(crate) unsafe fn grayf32_to_rgb_f32_row(y_plane: &[f32], out: &mut [f32], wi /// AVX2 must be available. 
#[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn grayf32_to_luma_row(y_plane: &[f32], out: &mut [u8], width: usize) { +pub(crate) unsafe fn grayf32_to_luma_row( + y_plane: &[f32], + out: &mut [u8], + width: usize, +) { use crate::row::scalar::grayf32 as scalar; debug_assert!(y_plane.len() >= width); debug_assert!(out.len() >= width); @@ -774,7 +821,9 @@ pub(crate) unsafe fn grayf32_to_luma_row(y_plane: &[f32], out: &mut [u8], width: let mut x = 0usize; unsafe { while x + 8 <= width { - let y = _mm256_loadu_ps(y_plane.as_ptr().add(x)); + let y = _mm256_castsi256_ps(load_endian_u32x8::( + y_plane.as_ptr().cast::().add(x * 4), + )); let clamped = _mm256_min_ps(_mm256_max_ps(y, zero), one); let scaled = _mm256_mul_ps(clamped, scale); let int32 = _mm256_cvttps_epi32(_mm256_add_ps(scaled, _mm256_set1_ps(0.5))); @@ -788,7 +837,7 @@ pub(crate) unsafe fn grayf32_to_luma_row(y_plane: &[f32], out: &mut [u8], width: } } if x < width { - scalar::grayf32_to_luma_row(&y_plane[x..width], &mut out[x..width], width - x); + scalar::grayf32_to_luma_row::(&y_plane[x..width], &mut out[x..width], width - x); } } @@ -798,7 +847,11 @@ pub(crate) unsafe fn grayf32_to_luma_row(y_plane: &[f32], out: &mut [u8], width: /// AVX2 must be available. #[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn grayf32_to_luma_u16_row(y_plane: &[f32], out: &mut [u16], width: usize) { +pub(crate) unsafe fn grayf32_to_luma_u16_row( + y_plane: &[f32], + out: &mut [u16], + width: usize, +) { use crate::row::scalar::grayf32 as scalar; debug_assert!(y_plane.len() >= width); debug_assert!(out.len() >= width); @@ -808,7 +861,9 @@ pub(crate) unsafe fn grayf32_to_luma_u16_row(y_plane: &[f32], out: &mut [u16], w let mut x = 0usize; unsafe { while x + 8 <= width { - let y = _mm256_loadu_ps(y_plane.as_ptr().add(x)); + let y = _mm256_castsi256_ps(load_endian_u32x8::( + y_plane.as_ptr().cast::().add(x * 4), + )); let clamped = _mm256_min_ps(_mm256_max_ps(y, zero), one); let scaled = _mm256_mul_ps(clamped, scale); let int32 = _mm256_cvttps_epi32(_mm256_add_ps(scaled, _mm256_set1_ps(0.5))); @@ -820,7 +875,7 @@ pub(crate) unsafe fn grayf32_to_luma_u16_row(y_plane: &[f32], out: &mut [u16], w } } if x < width { - scalar::grayf32_to_luma_u16_row(&y_plane[x..width], &mut out[x..width], width - x); + scalar::grayf32_to_luma_u16_row::(&y_plane[x..width], &mut out[x..width], width - x); } } @@ -831,11 +886,15 @@ pub(crate) unsafe fn grayf32_to_luma_u16_row(y_plane: &[f32], out: &mut [u16], w #[allow(dead_code)] // dispatcher uses scalar directly for lossless f32 paths #[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn grayf32_to_luma_f32_row(y_plane: &[f32], out: &mut [f32], width: usize) { +pub(crate) unsafe fn grayf32_to_luma_f32_row( + y_plane: &[f32], + out: &mut [f32], + width: usize, +) { use crate::row::scalar::grayf32 as scalar; debug_assert!(y_plane.len() >= width); debug_assert!(out.len() >= width); - scalar::grayf32_to_luma_f32_row(y_plane, out, width); + scalar::grayf32_to_luma_f32_row::(y_plane, out, width); } /// AVX2 `grayf32_to_hsv_row`: H=0, S=0, V = clamp(Y,0,1)×255. 8 px/iter. @@ -844,7 +903,7 @@ pub(crate) unsafe fn grayf32_to_luma_f32_row(y_plane: &[f32], out: &mut [f32], w /// AVX2 must be available. 
#[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn grayf32_to_hsv_row( +pub(crate) unsafe fn grayf32_to_hsv_row( y_plane: &[f32], h_out: &mut [u8], s_out: &mut [u8], @@ -859,7 +918,9 @@ pub(crate) unsafe fn grayf32_to_hsv_row( let mut x = 0usize; unsafe { while x + 8 <= width { - let y = _mm256_loadu_ps(y_plane.as_ptr().add(x)); + let y = _mm256_castsi256_ps(load_endian_u32x8::( + y_plane.as_ptr().cast::().add(x * 4), + )); let clamped = _mm256_min_ps(_mm256_max_ps(y, zero), one); let scaled = _mm256_mul_ps(clamped, scale); let int32 = _mm256_cvttps_epi32(_mm256_add_ps(scaled, _mm256_set1_ps(0.5))); @@ -876,7 +937,7 @@ pub(crate) unsafe fn grayf32_to_hsv_row( } } if x < width { - scalar::grayf32_to_hsv_row( + scalar::grayf32_to_hsv_row::( &y_plane[x..width], &mut h_out[x..width], &mut s_out[x..width], @@ -1093,7 +1154,7 @@ pub(crate) unsafe fn ya8_to_hsv_row( /// AVX2 must be available. #[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn ya16_to_rgb_row(packed: &[u16], out: &mut [u8], width: usize) { +pub(crate) unsafe fn ya16_to_rgb_row(packed: &[u16], out: &mut [u8], width: usize) { use crate::row::scalar::ya16 as scalar; debug_assert!(packed.len() >= width * 2); debug_assert!(out.len() >= width * 3); @@ -1119,7 +1180,7 @@ pub(crate) unsafe fn ya16_to_rgb_row(packed: &[u16], out: &mut [u8], width: usiz } } if x < width { - scalar::ya16_to_rgb_row( + scalar::ya16_to_rgb_row::( &packed[x * 2..width * 2], &mut out[x * 3..width * 3], width - x, @@ -1133,7 +1194,11 @@ pub(crate) unsafe fn ya16_to_rgb_row(packed: &[u16], out: &mut [u8], width: usiz /// AVX2 must be available. #[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn ya16_to_rgba_row(packed: &[u16], out: &mut [u8], width: usize) { +pub(crate) unsafe fn ya16_to_rgba_row( + packed: &[u16], + out: &mut [u8], + width: usize, +) { use crate::row::scalar::ya16 as scalar; debug_assert!(packed.len() >= width * 2); debug_assert!(out.len() >= width * 4); @@ -1169,7 +1234,7 @@ pub(crate) unsafe fn ya16_to_rgba_row(packed: &[u16], out: &mut [u8], width: usi } } if x < width { - scalar::ya16_to_rgba_row( + scalar::ya16_to_rgba_row::( &packed[x * 2..width * 2], &mut out[x * 4..width * 4], width - x, @@ -1183,11 +1248,15 @@ pub(crate) unsafe fn ya16_to_rgba_row(packed: &[u16], out: &mut [u8], width: usi /// AVX2 must be available. #[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn ya16_to_rgb_u16_row(packed: &[u16], out: &mut [u16], width: usize) { +pub(crate) unsafe fn ya16_to_rgb_u16_row( + packed: &[u16], + out: &mut [u16], + width: usize, +) { use crate::row::scalar::ya16 as scalar; debug_assert!(packed.len() >= width * 2); debug_assert!(out.len() >= width * 3); - scalar::ya16_to_rgb_u16_row(packed, out, width); + scalar::ya16_to_rgb_u16_row::(packed, out, width); } /// AVX2 `ya16_to_rgba_u16_row`: native Y and A u16. @@ -1196,11 +1265,15 @@ pub(crate) unsafe fn ya16_to_rgb_u16_row(packed: &[u16], out: &mut [u16], width: /// AVX2 must be available. #[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn ya16_to_rgba_u16_row(packed: &[u16], out: &mut [u16], width: usize) { +pub(crate) unsafe fn ya16_to_rgba_u16_row( + packed: &[u16], + out: &mut [u16], + width: usize, +) { use crate::row::scalar::ya16 as scalar; debug_assert!(packed.len() >= width * 2); debug_assert!(out.len() >= width * 4); - scalar::ya16_to_rgba_u16_row(packed, out, width); + scalar::ya16_to_rgba_u16_row::(packed, out, width); } /// AVX2 `ya16_to_luma_row`: Y `>> 8` → u8. 4 px/iter. 
@@ -1209,7 +1282,11 @@ pub(crate) unsafe fn ya16_to_rgba_u16_row(packed: &[u16], out: &mut [u16], width /// AVX2 must be available. #[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn ya16_to_luma_row(packed: &[u16], out: &mut [u8], width: usize) { +pub(crate) unsafe fn ya16_to_luma_row( + packed: &[u16], + out: &mut [u8], + width: usize, +) { use crate::row::scalar::ya16 as scalar; debug_assert!(packed.len() >= width * 2); debug_assert!(out.len() >= width); @@ -1229,7 +1306,7 @@ pub(crate) unsafe fn ya16_to_luma_row(packed: &[u16], out: &mut [u8], width: usi } } if x < width { - scalar::ya16_to_luma_row(&packed[x * 2..width * 2], &mut out[x..width], width - x); + scalar::ya16_to_luma_row::(&packed[x * 2..width * 2], &mut out[x..width], width - x); } } @@ -1239,11 +1316,15 @@ pub(crate) unsafe fn ya16_to_luma_row(packed: &[u16], out: &mut [u8], width: usi /// AVX2 must be available. #[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn ya16_to_luma_u16_row(packed: &[u16], out: &mut [u16], width: usize) { +pub(crate) unsafe fn ya16_to_luma_u16_row( + packed: &[u16], + out: &mut [u16], + width: usize, +) { use crate::row::scalar::ya16 as scalar; debug_assert!(packed.len() >= width * 2); debug_assert!(out.len() >= width); - scalar::ya16_to_luma_u16_row(packed, out, width); + scalar::ya16_to_luma_u16_row::(packed, out, width); } /// AVX2 `ya16_to_hsv_row`: H=0, S=0, V = Y `>> 8`. α dropped. @@ -1252,7 +1333,7 @@ pub(crate) unsafe fn ya16_to_luma_u16_row(packed: &[u16], out: &mut [u16], width /// AVX2 must be available. #[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn ya16_to_hsv_row( +pub(crate) unsafe fn ya16_to_hsv_row( packed: &[u16], h_out: &mut [u8], s_out: &mut [u8], @@ -1280,7 +1361,7 @@ pub(crate) unsafe fn ya16_to_hsv_row( } } if x < width { - scalar::ya16_to_hsv_row( + scalar::ya16_to_hsv_row::( &packed[x * 2..width * 2], &mut h_out[x..width], &mut s_out[x..width], @@ -1332,8 +1413,8 @@ mod tests { prng_f32(&mut plane, 0xF200_0001); let mut simd = std::vec![0u8; w * 3]; let mut scal = std::vec![0u8; w * 3]; - unsafe { super::grayf32_to_rgb_row(&plane, &mut simd, w) }; - sf::grayf32_to_rgb_row(&plane, &mut scal, w); + unsafe { super::grayf32_to_rgb_row::(&plane, &mut simd, w) }; + sf::grayf32_to_rgb_row::(&plane, &mut scal, w); assert_eq!(simd, scal, "width={w}"); } } @@ -1350,8 +1431,8 @@ mod tests { prng_f32(&mut plane, 0xF200_0002); let mut simd = std::vec![0u8; w * 4]; let mut scal = std::vec![0u8; w * 4]; - unsafe { super::grayf32_to_rgba_row(&plane, &mut simd, w) }; - sf::grayf32_to_rgba_row(&plane, &mut scal, w); + unsafe { super::grayf32_to_rgba_row::(&plane, &mut simd, w) }; + sf::grayf32_to_rgba_row::(&plane, &mut scal, w); assert_eq!(simd, scal, "width={w}"); } } @@ -1368,8 +1449,8 @@ mod tests { prng_f32(&mut plane, 0xF200_0003); let mut simd = std::vec![0u16; w * 3]; let mut scal = std::vec![0u16; w * 3]; - unsafe { super::grayf32_to_rgb_u16_row(&plane, &mut simd, w) }; - sf::grayf32_to_rgb_u16_row(&plane, &mut scal, w); + unsafe { super::grayf32_to_rgb_u16_row::(&plane, &mut simd, w) }; + sf::grayf32_to_rgb_u16_row::(&plane, &mut scal, w); assert_eq!(simd, scal, "width={w}"); } } @@ -1386,8 +1467,8 @@ mod tests { prng_f32(&mut plane, 0xF200_0004); let mut simd = std::vec![0u16; w * 4]; let mut scal = std::vec![0u16; w * 4]; - unsafe { super::grayf32_to_rgba_u16_row(&plane, &mut simd, w) }; - sf::grayf32_to_rgba_u16_row(&plane, &mut scal, w); + unsafe { super::grayf32_to_rgba_u16_row::(&plane, &mut simd, w) }; + 
sf::grayf32_to_rgba_u16_row::(&plane, &mut scal, w); assert_eq!(simd, scal, "width={w}"); } } @@ -1404,8 +1485,8 @@ mod tests { prng_f32(&mut plane, 0xF200_0006); let mut simd = std::vec![0u8; w]; let mut scal = std::vec![0u8; w]; - unsafe { super::grayf32_to_luma_row(&plane, &mut simd, w) }; - sf::grayf32_to_luma_row(&plane, &mut scal, w); + unsafe { super::grayf32_to_luma_row::(&plane, &mut simd, w) }; + sf::grayf32_to_luma_row::(&plane, &mut scal, w); assert_eq!(simd, scal, "width={w}"); } } @@ -1422,8 +1503,8 @@ mod tests { prng_f32(&mut plane, 0xF200_0007); let mut simd = std::vec![0u16; w]; let mut scal = std::vec![0u16; w]; - unsafe { super::grayf32_to_luma_u16_row(&plane, &mut simd, w) }; - sf::grayf32_to_luma_u16_row(&plane, &mut scal, w); + unsafe { super::grayf32_to_luma_u16_row::(&plane, &mut simd, w) }; + sf::grayf32_to_luma_u16_row::(&plane, &mut scal, w); assert_eq!(simd, scal, "width={w}"); } } @@ -1444,8 +1525,8 @@ mod tests { let mut rh = std::vec![0u8; w]; let mut rs = std::vec![0u8; w]; let mut rv = std::vec![0u8; w]; - unsafe { super::grayf32_to_hsv_row(&plane, &mut sh, &mut ss, &mut sv, w) }; - sf::grayf32_to_hsv_row(&plane, &mut rh, &mut rs, &mut rv, w); + unsafe { super::grayf32_to_hsv_row::(&plane, &mut sh, &mut ss, &mut sv, w) }; + sf::grayf32_to_hsv_row::(&plane, &mut rh, &mut rs, &mut rv, w); assert_eq!(sh, rh, "H width={w}"); assert_eq!(ss, rs, "S width={w}"); assert_eq!(sv, rv, "V width={w}"); @@ -1464,8 +1545,8 @@ mod tests { prng_f32(&mut plane, 0xF200_0005); let mut simd = std::vec![0.0f32; w * 3]; let mut scal = std::vec![0.0f32; w * 3]; - unsafe { super::grayf32_to_rgb_f32_row(&plane, &mut simd, w) }; - sf::grayf32_to_rgb_f32_row(&plane, &mut scal, w); + unsafe { super::grayf32_to_rgb_f32_row::(&plane, &mut simd, w) }; + sf::grayf32_to_rgb_f32_row::(&plane, &mut scal, w); assert_eq!(simd, scal, "width={w}"); } } @@ -1482,8 +1563,8 @@ mod tests { prng_f32(&mut plane, 0xF200_0008); let mut simd = std::vec![0.0f32; w]; let mut scal = std::vec![0.0f32; w]; - unsafe { super::grayf32_to_luma_f32_row(&plane, &mut simd, w) }; - sf::grayf32_to_luma_f32_row(&plane, &mut scal, w); + unsafe { super::grayf32_to_luma_f32_row::(&plane, &mut simd, w) }; + sf::grayf32_to_luma_f32_row::(&plane, &mut scal, w); assert_eq!(simd, scal, "width={w}"); } } @@ -1632,8 +1713,8 @@ mod tests { prng_ya16(&mut packed, 0xA260_0001); let mut simd = std::vec![0u8; w * 3]; let mut scal = std::vec![0u8; w * 3]; - unsafe { super::ya16_to_rgb_row(&packed, &mut simd, w) }; - sy::ya16_to_rgb_row(&packed, &mut scal, w); + unsafe { super::ya16_to_rgb_row::(&packed, &mut simd, w) }; + sy::ya16_to_rgb_row::(&packed, &mut scal, w); assert_eq!(simd, scal, "width={w}"); } } @@ -1650,8 +1731,8 @@ mod tests { prng_ya16(&mut packed, 0xA260_0002); let mut simd = std::vec![0u8; w * 4]; let mut scal = std::vec![0u8; w * 4]; - unsafe { super::ya16_to_rgba_row(&packed, &mut simd, w) }; - sy::ya16_to_rgba_row(&packed, &mut scal, w); + unsafe { super::ya16_to_rgba_row::(&packed, &mut simd, w) }; + sy::ya16_to_rgba_row::(&packed, &mut scal, w); assert_eq!(simd, scal, "width={w}"); } } @@ -1668,8 +1749,8 @@ mod tests { prng_ya16(&mut packed, 0xA260_0003); let mut simd = std::vec![0u16; w * 3]; let mut scal = std::vec![0u16; w * 3]; - unsafe { super::ya16_to_rgb_u16_row(&packed, &mut simd, w) }; - sy::ya16_to_rgb_u16_row(&packed, &mut scal, w); + unsafe { super::ya16_to_rgb_u16_row::(&packed, &mut simd, w) }; + sy::ya16_to_rgb_u16_row::(&packed, &mut scal, w); assert_eq!(simd, scal, "width={w}"); } } @@ -1686,8 
+1767,8 @@ mod tests { prng_ya16(&mut packed, 0xA260_0004); let mut simd = std::vec![0u16; w * 4]; let mut scal = std::vec![0u16; w * 4]; - unsafe { super::ya16_to_rgba_u16_row(&packed, &mut simd, w) }; - sy::ya16_to_rgba_u16_row(&packed, &mut scal, w); + unsafe { super::ya16_to_rgba_u16_row::(&packed, &mut simd, w) }; + sy::ya16_to_rgba_u16_row::(&packed, &mut scal, w); assert_eq!(simd, scal, "width={w}"); } } @@ -1704,8 +1785,8 @@ mod tests { prng_ya16(&mut packed, 0xA260_0005); let mut simd = std::vec![0u8; w]; let mut scal = std::vec![0u8; w]; - unsafe { super::ya16_to_luma_row(&packed, &mut simd, w) }; - sy::ya16_to_luma_row(&packed, &mut scal, w); + unsafe { super::ya16_to_luma_row::(&packed, &mut simd, w) }; + sy::ya16_to_luma_row::(&packed, &mut scal, w); assert_eq!(simd, scal, "width={w}"); } } @@ -1722,8 +1803,8 @@ mod tests { prng_ya16(&mut packed, 0xA260_0006); let mut simd = std::vec![0u16; w]; let mut scal = std::vec![0u16; w]; - unsafe { super::ya16_to_luma_u16_row(&packed, &mut simd, w) }; - sy::ya16_to_luma_u16_row(&packed, &mut scal, w); + unsafe { super::ya16_to_luma_u16_row::(&packed, &mut simd, w) }; + sy::ya16_to_luma_u16_row::(&packed, &mut scal, w); assert_eq!(simd, scal, "width={w}"); } } @@ -1744,8 +1825,8 @@ mod tests { let mut rh = std::vec![0u8; w]; let mut rs = std::vec![0u8; w]; let mut rv = std::vec![0u8; w]; - unsafe { super::ya16_to_hsv_row(&packed, &mut sh, &mut ss, &mut sv, w) }; - sy::ya16_to_hsv_row(&packed, &mut rh, &mut rs, &mut rv, w); + unsafe { super::ya16_to_hsv_row::(&packed, &mut sh, &mut ss, &mut sv, w) }; + sy::ya16_to_hsv_row::(&packed, &mut rh, &mut rs, &mut rv, w); assert_eq!(sh, rh, "H width={w}"); assert_eq!(ss, rs, "S width={w}"); assert_eq!(sv, rv, "V width={w}"); @@ -1799,8 +1880,8 @@ mod tests { prng16(&mut plane, 0xABCD_1234); let mut simd = std::vec![0u8; w]; let mut scal = std::vec![0u8; w]; - unsafe { super::gray_n_to_luma_row::<10>(&plane, &mut simd, w) }; - scalar::gray_n_to_luma_row::<10>(&plane, &mut scal, w); + unsafe { super::gray_n_to_luma_row::<10, false>(&plane, &mut simd, w) }; + scalar::gray_n_to_luma_row::<10, false>(&plane, &mut scal, w); assert_eq!(simd, scal, "width={w}"); } } @@ -1815,8 +1896,8 @@ mod tests { prng16(&mut plane, 0xDEAD_CAFE); let mut simd = std::vec![0u16; w]; let mut scal = std::vec![0u16; w]; - unsafe { super::gray_n_to_luma_u16_row::<12>(&plane, &mut simd, w) }; - scalar::gray_n_to_luma_u16_row::<12>(&plane, &mut scal, w); + unsafe { super::gray_n_to_luma_u16_row::<12, false>(&plane, &mut simd, w) }; + scalar::gray_n_to_luma_u16_row::<12, false>(&plane, &mut scal, w); assert_eq!(simd, scal, "width={w}"); } } @@ -1831,8 +1912,8 @@ mod tests { prng16(&mut plane, 0xBEEF_CAFE); let mut simd = std::vec![0u8; w]; let mut scal = std::vec![0u8; w]; - unsafe { super::gray16_to_luma_row(&plane, &mut simd, w) }; - scalar::gray16_to_luma_row(&plane, &mut scal, w); + unsafe { super::gray16_to_luma_row::(&plane, &mut simd, w) }; + scalar::gray16_to_luma_row::(&plane, &mut scal, w); assert_eq!(simd, scal, "width={w}"); } } @@ -1847,8 +1928,8 @@ mod tests { prng16(&mut plane, 0x1234_5678); let mut simd = std::vec![0u16; w]; let mut scal = std::vec![0u16; w]; - unsafe { super::gray16_to_luma_u16_row(&plane, &mut simd, w) }; - scalar::gray16_to_luma_u16_row(&plane, &mut scal, w); + unsafe { super::gray16_to_luma_u16_row::(&plane, &mut simd, w) }; + scalar::gray16_to_luma_u16_row::(&plane, &mut scal, w); assert_eq!(simd, scal, "width={w}"); } } @@ -1879,9 +1960,45 @@ mod tests { prng16(&mut plane, 0x1234_5678); 
let mut simd = std::vec![0u8; w * 3]; let mut scal = std::vec![0u8; w * 3]; - unsafe { super::gray16_to_rgb_row(&plane, &mut simd, w, false) }; - scalar::gray16_to_rgb_row(&plane, &mut scal, w, false); + unsafe { super::gray16_to_rgb_row::(&plane, &mut simd, w, false) }; + scalar::gray16_to_rgb_row::(&plane, &mut scal, w, false); assert_eq!(simd, scal, "width={w} limited-range"); } } + + // ---- BE parity tests -------------------------------------------------------- + + #[test] + fn avx2_gray10_be_parity_luma() { + if !is_x86_feature_detected!("avx2") { + return; + } + for &w in WIDTHS { + let mut le = std::vec![0u16; w]; + prng16(&mut le, 0xBE10_0001); + let be: std::vec::Vec = le.iter().map(|v| v.swap_bytes()).collect(); + let mut simd_be = std::vec![0u8; w]; + let mut scal_le = std::vec![0u8; w]; + unsafe { super::gray_n_to_luma_row::<10, true>(&be, &mut simd_be, w) }; + scalar::gray_n_to_luma_row::<10, false>(&le, &mut scal_le, w); + assert_eq!(simd_be, scal_le, "width={w}"); + } + } + + #[test] + fn avx2_gray16_be_parity_luma() { + if !is_x86_feature_detected!("avx2") { + return; + } + for &w in WIDTHS { + let mut le = std::vec![0u16; w]; + prng16(&mut le, 0xBE16_0002); + let be: std::vec::Vec = le.iter().map(|v| v.swap_bytes()).collect(); + let mut simd_be = std::vec![0u8; w]; + let mut scal_le = std::vec![0u8; w]; + unsafe { super::gray16_to_luma_row::(&be, &mut simd_be, w) }; + scalar::gray16_to_luma_row::(&le, &mut scal_le, w); + assert_eq!(simd_be, scal_le, "width={w}"); + } + } } diff --git a/src/row/arch/x86_avx512/gray.rs b/src/row/arch/x86_avx512/gray.rs index 3b43606f..30b730db 100644 --- a/src/row/arch/x86_avx512/gray.rs +++ b/src/row/arch/x86_avx512/gray.rs @@ -16,7 +16,10 @@ use core::arch::x86_64::*; -use crate::row::scalar::{bits_mask, gray as scalar}; +use crate::row::{ + arch::x86_avx512::endian::{load_endian_u16x32, load_endian_u32x16}, + scalar::{bits_mask, gray as scalar}, +}; // ---- Gray8 ------------------------------------------------------------------ @@ -116,7 +119,7 @@ pub(crate) unsafe fn gray8_to_hsv_row( #[allow(dead_code)] #[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn gray_n_to_rgb_row( +pub(crate) unsafe fn gray_n_to_rgb_row( y_plane: &[u16], out: &mut [u8], width: usize, @@ -124,7 +127,7 @@ pub(crate) unsafe fn gray_n_to_rgb_row( ) { debug_assert!(y_plane.len() >= width); debug_assert!(out.len() >= width * 3); - scalar::gray_n_to_rgb_row::(y_plane, out, width, full_range); + scalar::gray_n_to_rgb_row::(y_plane, out, width, full_range); } /// AVX-512 `gray_n_to_rgba_row`. @@ -136,7 +139,7 @@ pub(crate) unsafe fn gray_n_to_rgb_row( #[allow(dead_code)] #[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn gray_n_to_rgba_row( +pub(crate) unsafe fn gray_n_to_rgba_row( y_plane: &[u16], out: &mut [u8], width: usize, @@ -144,7 +147,7 @@ pub(crate) unsafe fn gray_n_to_rgba_row( ) { debug_assert!(y_plane.len() >= width); debug_assert!(out.len() >= width * 4); - scalar::gray_n_to_rgba_row::(y_plane, out, width, full_range); + scalar::gray_n_to_rgba_row::(y_plane, out, width, full_range); } /// AVX-512 `gray_n_to_rgb_u16_row`. 
@@ -156,7 +159,7 @@ pub(crate) unsafe fn gray_n_to_rgba_row( #[allow(dead_code)] #[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn gray_n_to_rgb_u16_row( +pub(crate) unsafe fn gray_n_to_rgb_u16_row( y_plane: &[u16], out: &mut [u16], width: usize, @@ -164,7 +167,7 @@ pub(crate) unsafe fn gray_n_to_rgb_u16_row( ) { debug_assert!(y_plane.len() >= width); debug_assert!(out.len() >= width * 3); - scalar::gray_n_to_rgb_u16_row::(y_plane, out, width, full_range); + scalar::gray_n_to_rgb_u16_row::(y_plane, out, width, full_range); } /// AVX-512 `gray_n_to_rgba_u16_row`. @@ -176,7 +179,7 @@ pub(crate) unsafe fn gray_n_to_rgb_u16_row( #[allow(dead_code)] #[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn gray_n_to_rgba_u16_row( +pub(crate) unsafe fn gray_n_to_rgba_u16_row( y_plane: &[u16], out: &mut [u16], width: usize, @@ -184,7 +187,7 @@ pub(crate) unsafe fn gray_n_to_rgba_u16_row( ) { debug_assert!(y_plane.len() >= width); debug_assert!(out.len() >= width * 4); - scalar::gray_n_to_rgba_u16_row::(y_plane, out, width, full_range); + scalar::gray_n_to_rgba_u16_row::(y_plane, out, width, full_range); } /// AVX-512 `gray_n_to_luma_row`: mask + shift → u8. 32 pixels/iter. @@ -196,7 +199,7 @@ pub(crate) unsafe fn gray_n_to_rgba_u16_row( #[allow(dead_code)] #[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn gray_n_to_luma_row( +pub(crate) unsafe fn gray_n_to_luma_row( y_plane: &[u16], out: &mut [u8], width: usize, @@ -211,7 +214,7 @@ pub(crate) unsafe fn gray_n_to_luma_row( // requires a literal const generic shift not expressible as `BITS - 8`. let shr = _mm_cvtsi32_si128((BITS - 8) as i32); while x + 32 <= width { - let raw = _mm512_loadu_si512(y_plane.as_ptr().add(x).cast()); + let raw = load_endian_u16x32::(y_plane.as_ptr().cast::().add(x * 2)); let masked = _mm512_and_si512(raw, mask_v); // Shift right by (BITS - 8) to get u8-range value in u16 let shifted = _mm512_srl_epi16(masked, shr); @@ -222,7 +225,7 @@ pub(crate) unsafe fn gray_n_to_luma_row( } } if x < width { - scalar::gray_n_to_luma_row::(&y_plane[x..width], &mut out[x..width], width - x); + scalar::gray_n_to_luma_row::(&y_plane[x..width], &mut out[x..width], width - x); } } @@ -235,7 +238,7 @@ pub(crate) unsafe fn gray_n_to_luma_row( #[allow(dead_code)] #[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn gray_n_to_luma_u16_row( +pub(crate) unsafe fn gray_n_to_luma_u16_row( y_plane: &[u16], out: &mut [u16], width: usize, @@ -247,14 +250,14 @@ pub(crate) unsafe fn gray_n_to_luma_u16_row( unsafe { let mask_v = _mm512_set1_epi16(mask as i16); while x + 32 <= width { - let raw = _mm512_loadu_si512(y_plane.as_ptr().add(x).cast()); + let raw = load_endian_u16x32::(y_plane.as_ptr().cast::().add(x * 2)); let masked = _mm512_and_si512(raw, mask_v); _mm512_storeu_si512(out.as_mut_ptr().add(x).cast(), masked); x += 32; } } if x < width { - scalar::gray_n_to_luma_u16_row::(&y_plane[x..width], &mut out[x..width], width - x); + scalar::gray_n_to_luma_u16_row::(&y_plane[x..width], &mut out[x..width], width - x); } } @@ -268,7 +271,7 @@ pub(crate) unsafe fn gray_n_to_luma_u16_row( #[allow(dead_code)] #[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn gray_n_to_hsv_row( +pub(crate) unsafe fn gray_n_to_hsv_row( y_plane: &[u16], h_out: &mut [u8], s_out: &mut [u8], @@ -278,7 +281,7 @@ pub(crate) unsafe fn gray_n_to_hsv_row( ) { debug_assert!(y_plane.len() >= width); if !full_range { - return 
scalar::gray_n_to_hsv_row::(y_plane, h_out, s_out, v_out, width, full_range); + return scalar::gray_n_to_hsv_row::(y_plane, h_out, s_out, v_out, width, full_range); } let mask = bits_mask::(); let mut x = 0usize; @@ -287,7 +290,7 @@ pub(crate) unsafe fn gray_n_to_hsv_row( let shr = _mm_cvtsi32_si128((BITS - 8) as i32); let zero256 = _mm256_setzero_si256(); while x + 32 <= width { - let raw = _mm512_loadu_si512(y_plane.as_ptr().add(x).cast()); + let raw = load_endian_u16x32::(y_plane.as_ptr().cast::().add(x * 2)); let masked = _mm512_and_si512(raw, mask_v); let shifted = _mm512_srl_epi16(masked, shr); let packed = _mm512_cvtepi16_epi8(shifted); @@ -299,7 +302,7 @@ pub(crate) unsafe fn gray_n_to_hsv_row( } } if x < width { - scalar::gray_n_to_hsv_row::( + scalar::gray_n_to_hsv_row::( &y_plane[x..width], &mut h_out[x..width], &mut s_out[x..width], @@ -321,7 +324,7 @@ pub(crate) unsafe fn gray_n_to_hsv_row( #[allow(dead_code)] #[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn gray16_to_rgb_row( +pub(crate) unsafe fn gray16_to_rgb_row( y_plane: &[u16], out: &mut [u8], width: usize, @@ -329,7 +332,7 @@ pub(crate) unsafe fn gray16_to_rgb_row( ) { debug_assert!(y_plane.len() >= width); debug_assert!(out.len() >= width * 3); - scalar::gray16_to_rgb_row(y_plane, out, width, full_range); + scalar::gray16_to_rgb_row::(y_plane, out, width, full_range); } /// AVX-512 `gray16_to_rgba_row`. @@ -341,7 +344,7 @@ pub(crate) unsafe fn gray16_to_rgb_row( #[allow(dead_code)] #[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn gray16_to_rgba_row( +pub(crate) unsafe fn gray16_to_rgba_row( y_plane: &[u16], out: &mut [u8], width: usize, @@ -349,7 +352,7 @@ pub(crate) unsafe fn gray16_to_rgba_row( ) { debug_assert!(y_plane.len() >= width); debug_assert!(out.len() >= width * 4); - scalar::gray16_to_rgba_row(y_plane, out, width, full_range); + scalar::gray16_to_rgba_row::(y_plane, out, width, full_range); } /// AVX-512 `gray16_to_rgb_u16_row`. @@ -361,7 +364,7 @@ pub(crate) unsafe fn gray16_to_rgba_row( #[allow(dead_code)] #[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn gray16_to_rgb_u16_row( +pub(crate) unsafe fn gray16_to_rgb_u16_row( y_plane: &[u16], out: &mut [u16], width: usize, @@ -369,7 +372,7 @@ pub(crate) unsafe fn gray16_to_rgb_u16_row( ) { debug_assert!(y_plane.len() >= width); debug_assert!(out.len() >= width * 3); - scalar::gray16_to_rgb_u16_row(y_plane, out, width, full_range); + scalar::gray16_to_rgb_u16_row::(y_plane, out, width, full_range); } /// AVX-512 `gray16_to_rgba_u16_row`. @@ -381,7 +384,7 @@ pub(crate) unsafe fn gray16_to_rgb_u16_row( #[allow(dead_code)] #[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn gray16_to_rgba_u16_row( +pub(crate) unsafe fn gray16_to_rgba_u16_row( y_plane: &[u16], out: &mut [u16], width: usize, @@ -389,7 +392,7 @@ pub(crate) unsafe fn gray16_to_rgba_u16_row( ) { debug_assert!(y_plane.len() >= width); debug_assert!(out.len() >= width * 4); - scalar::gray16_to_rgba_u16_row(y_plane, out, width, full_range); + scalar::gray16_to_rgba_u16_row::(y_plane, out, width, full_range); } /// AVX-512 `gray16_to_luma_row`: `>> 8`, pack to u8. 32 pixels/iter. 
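For the full-range `gray_n` kernels (9–15 significant bits stored in u16), the rule both the SIMD and scalar paths follow is the one spelled out in the comments above: mask to the low `BITS` bits, then shift right by `BITS - 8` to land in u8 range. A scalar restatement for reference, with the mask written out explicitly (the crate's `bits_mask::<BITS>()` is the real source of that constant):

```rust
/// Illustrative scalar form of the full-range gray_n → u8 conversion mirrored by
/// the kernels above. Valid for 9 <= BITS <= 15, where the mask fits in a u16.
fn gray_n_to_u8<const BITS: usize>(v: u16) -> u8 {
    let mask = (1u16 << BITS) - 1;    // same value as bits_mask::<BITS>()
    ((v & mask) >> (BITS - 8)) as u8  // keep the top 8 of the BITS significant bits
}

// e.g. a full-scale 10-bit sample maps to 255: gray_n_to_u8::<10>(0x03FF) == 0xFF
```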
@@ -401,13 +404,17 @@ pub(crate) unsafe fn gray16_to_rgba_u16_row( #[allow(dead_code)] #[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn gray16_to_luma_row(y_plane: &[u16], out: &mut [u8], width: usize) { +pub(crate) unsafe fn gray16_to_luma_row( + y_plane: &[u16], + out: &mut [u8], + width: usize, +) { debug_assert!(y_plane.len() >= width); debug_assert!(out.len() >= width); let mut x = 0usize; unsafe { while x + 32 <= width { - let raw = _mm512_loadu_si512(y_plane.as_ptr().add(x).cast()); + let raw = load_endian_u16x32::(y_plane.as_ptr().cast::().add(x * 2)); let shifted = _mm512_srli_epi16(raw, 8); let packed = _mm512_cvtepi16_epi8(shifted); _mm256_storeu_si256(out.as_mut_ptr().add(x).cast(), packed); @@ -415,7 +422,7 @@ pub(crate) unsafe fn gray16_to_luma_row(y_plane: &[u16], out: &mut [u8], width: } } if x < width { - scalar::gray16_to_luma_row(&y_plane[x..width], &mut out[x..width], width - x); + scalar::gray16_to_luma_row::(&y_plane[x..width], &mut out[x..width], width - x); } } @@ -428,19 +435,23 @@ pub(crate) unsafe fn gray16_to_luma_row(y_plane: &[u16], out: &mut [u8], width: #[allow(dead_code)] #[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn gray16_to_luma_u16_row(y_plane: &[u16], out: &mut [u16], width: usize) { +pub(crate) unsafe fn gray16_to_luma_u16_row( + y_plane: &[u16], + out: &mut [u16], + width: usize, +) { debug_assert!(y_plane.len() >= width); debug_assert!(out.len() >= width); let mut x = 0usize; unsafe { while x + 32 <= width { - let y = _mm512_loadu_si512(y_plane.as_ptr().add(x).cast()); + let y = load_endian_u16x32::(y_plane.as_ptr().cast::().add(x * 2)); _mm512_storeu_si512(out.as_mut_ptr().add(x).cast(), y); x += 32; } } if x < width { - scalar::gray16_to_luma_u16_row(&y_plane[x..width], &mut out[x..width], width - x); + scalar::gray16_to_luma_u16_row::(&y_plane[x..width], &mut out[x..width], width - x); } } @@ -454,7 +465,7 @@ pub(crate) unsafe fn gray16_to_luma_u16_row(y_plane: &[u16], out: &mut [u16], wi #[allow(dead_code)] #[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn gray16_to_hsv_row( +pub(crate) unsafe fn gray16_to_hsv_row( y_plane: &[u16], h_out: &mut [u8], s_out: &mut [u8], @@ -464,13 +475,13 @@ pub(crate) unsafe fn gray16_to_hsv_row( ) { debug_assert!(y_plane.len() >= width); if !full_range { - return scalar::gray16_to_hsv_row(y_plane, h_out, s_out, v_out, width, full_range); + return scalar::gray16_to_hsv_row::(y_plane, h_out, s_out, v_out, width, full_range); } let mut x = 0usize; unsafe { let zero256 = _mm256_setzero_si256(); while x + 32 <= width { - let raw = _mm512_loadu_si512(y_plane.as_ptr().add(x).cast()); + let raw = load_endian_u16x32::(y_plane.as_ptr().cast::().add(x * 2)); let shifted = _mm512_srli_epi16(raw, 8); let packed = _mm512_cvtepi16_epi8(shifted); _mm256_storeu_si256(h_out.as_mut_ptr().add(x).cast(), zero256); @@ -480,7 +491,7 @@ pub(crate) unsafe fn gray16_to_hsv_row( } } if x < width { - scalar::gray16_to_hsv_row( + scalar::gray16_to_hsv_row::( &y_plane[x..width], &mut h_out[x..width], &mut s_out[x..width], @@ -502,7 +513,11 @@ pub(crate) unsafe fn gray16_to_hsv_row( /// AVX-512F must be available. 
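The f32 kernels that follow no longer load with `_mm512_loadu_ps`; they do an endian-aware integer load via `load_endian_u32x16` and bitcast the result with `_mm512_castsi512_ps`. Once the bytes of each 32-bit lane are in native order the bitcast is exact, so only the load has to care about endianness. A sketch of what such a helper could look like, under the same assumptions as the 16-bit sketch earlier (illustrative, not the `endian` module's actual code):

```rust
use core::arch::x86_64::*;

/// Illustrative only: load 16 u32 lanes, byte-swapping each lane when `BE` is true.
/// bswap32 == swap the bytes inside each 16-bit half, then swap the two halves.
#[target_feature(enable = "avx512f,avx512bw")]
unsafe fn load_endian_u32x16_sketch<const BE: bool>(ptr: *const u8) -> __m512i {
    let raw = _mm512_loadu_si512(ptr.cast());
    if !BE {
        return raw;
    }
    let c8 = _mm_cvtsi32_si128(8);
    let c16 = _mm_cvtsi32_si128(16);
    // Swap bytes within each 16-bit half, then swap the halves of every 32-bit lane.
    let halves = _mm512_or_si512(_mm512_sll_epi16(raw, c8), _mm512_srl_epi16(raw, c8));
    _mm512_or_si512(_mm512_sll_epi32(halves, c16), _mm512_srl_epi32(halves, c16))
}
```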
#[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn grayf32_to_rgb_row(y_plane: &[f32], out: &mut [u8], width: usize) { +pub(crate) unsafe fn grayf32_to_rgb_row( + y_plane: &[f32], + out: &mut [u8], + width: usize, +) { use crate::row::scalar::grayf32 as scalar; debug_assert!(y_plane.len() >= width); debug_assert!(out.len() >= width * 3); @@ -510,7 +525,9 @@ pub(crate) unsafe fn grayf32_to_rgb_row(y_plane: &[f32], out: &mut [u8], width: let mut x = 0usize; unsafe { while x + 16 <= width { - let y = _mm512_loadu_ps(y_plane.as_ptr().add(x)); + let y = _mm512_castsi512_ps(load_endian_u32x16::( + y_plane.as_ptr().cast::().add(x * 4), + )); let clamped = _mm512_min_ps(_mm512_max_ps(y, _mm512_setzero_ps()), _mm512_set1_ps(1.0)); // Round-half-up: + 0.5 then truncate (matches scalar). let int32 = _mm512_cvttps_epi32(_mm512_add_ps( @@ -532,7 +549,7 @@ pub(crate) unsafe fn grayf32_to_rgb_row(y_plane: &[f32], out: &mut [u8], width: } } if x < width { - scalar::grayf32_to_rgb_row(&y_plane[x..width], &mut out[x * 3..width * 3], width - x); + scalar::grayf32_to_rgb_row::(&y_plane[x..width], &mut out[x * 3..width * 3], width - x); } } @@ -542,7 +559,11 @@ pub(crate) unsafe fn grayf32_to_rgb_row(y_plane: &[f32], out: &mut [u8], width: /// AVX-512F must be available. #[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn grayf32_to_rgba_row(y_plane: &[f32], out: &mut [u8], width: usize) { +pub(crate) unsafe fn grayf32_to_rgba_row( + y_plane: &[f32], + out: &mut [u8], + width: usize, +) { use crate::row::scalar::grayf32 as scalar; debug_assert!(y_plane.len() >= width); debug_assert!(out.len() >= width * 4); @@ -550,7 +571,9 @@ pub(crate) unsafe fn grayf32_to_rgba_row(y_plane: &[f32], out: &mut [u8], width: let mut x = 0usize; unsafe { while x + 16 <= width { - let y = _mm512_loadu_ps(y_plane.as_ptr().add(x)); + let y = _mm512_castsi512_ps(load_endian_u32x16::( + y_plane.as_ptr().cast::().add(x * 4), + )); let clamped = _mm512_min_ps(_mm512_max_ps(y, _mm512_setzero_ps()), _mm512_set1_ps(1.0)); let int32 = _mm512_cvttps_epi32(_mm512_add_ps( _mm512_mul_ps(clamped, scale), @@ -570,7 +593,7 @@ pub(crate) unsafe fn grayf32_to_rgba_row(y_plane: &[f32], out: &mut [u8], width: } } if x < width { - scalar::grayf32_to_rgba_row(&y_plane[x..width], &mut out[x * 4..width * 4], width - x); + scalar::grayf32_to_rgba_row::(&y_plane[x..width], &mut out[x * 4..width * 4], width - x); } } @@ -580,7 +603,11 @@ pub(crate) unsafe fn grayf32_to_rgba_row(y_plane: &[f32], out: &mut [u8], width: /// AVX-512F must be available. #[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn grayf32_to_rgb_u16_row(y_plane: &[f32], out: &mut [u16], width: usize) { +pub(crate) unsafe fn grayf32_to_rgb_u16_row( + y_plane: &[f32], + out: &mut [u16], + width: usize, +) { use crate::row::scalar::grayf32 as scalar; debug_assert!(y_plane.len() >= width); debug_assert!(out.len() >= width * 3); @@ -588,7 +615,9 @@ pub(crate) unsafe fn grayf32_to_rgb_u16_row(y_plane: &[f32], out: &mut [u16], wi let mut x = 0usize; unsafe { while x + 16 <= width { - let y = _mm512_loadu_ps(y_plane.as_ptr().add(x)); + let y = _mm512_castsi512_ps(load_endian_u32x16::( + y_plane.as_ptr().cast::().add(x * 4), + )); let clamped = _mm512_min_ps(_mm512_max_ps(y, _mm512_setzero_ps()), _mm512_set1_ps(1.0)); // Round-to-nearest with embedded rounding. 
let int32 = _mm512_cvttps_epi32(_mm512_add_ps( @@ -610,7 +639,7 @@ pub(crate) unsafe fn grayf32_to_rgb_u16_row(y_plane: &[f32], out: &mut [u16], wi } } if x < width { - scalar::grayf32_to_rgb_u16_row(&y_plane[x..width], &mut out[x * 3..width * 3], width - x); + scalar::grayf32_to_rgb_u16_row::(&y_plane[x..width], &mut out[x * 3..width * 3], width - x); } } @@ -620,7 +649,11 @@ pub(crate) unsafe fn grayf32_to_rgb_u16_row(y_plane: &[f32], out: &mut [u16], wi /// AVX-512F must be available. #[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn grayf32_to_rgba_u16_row(y_plane: &[f32], out: &mut [u16], width: usize) { +pub(crate) unsafe fn grayf32_to_rgba_u16_row( + y_plane: &[f32], + out: &mut [u16], + width: usize, +) { use crate::row::scalar::grayf32 as scalar; debug_assert!(y_plane.len() >= width); debug_assert!(out.len() >= width * 4); @@ -628,7 +661,9 @@ pub(crate) unsafe fn grayf32_to_rgba_u16_row(y_plane: &[f32], out: &mut [u16], w let mut x = 0usize; unsafe { while x + 16 <= width { - let y = _mm512_loadu_ps(y_plane.as_ptr().add(x)); + let y = _mm512_castsi512_ps(load_endian_u32x16::( + y_plane.as_ptr().cast::().add(x * 4), + )); let clamped = _mm512_min_ps(_mm512_max_ps(y, _mm512_setzero_ps()), _mm512_set1_ps(1.0)); let int32 = _mm512_cvttps_epi32(_mm512_add_ps( _mm512_mul_ps(clamped, scale), @@ -648,7 +683,11 @@ pub(crate) unsafe fn grayf32_to_rgba_u16_row(y_plane: &[f32], out: &mut [u16], w } } if x < width { - scalar::grayf32_to_rgba_u16_row(&y_plane[x..width], &mut out[x * 4..width * 4], width - x); + scalar::grayf32_to_rgba_u16_row::( + &y_plane[x..width], + &mut out[x * 4..width * 4], + width - x, + ); } } @@ -659,11 +698,15 @@ pub(crate) unsafe fn grayf32_to_rgba_u16_row(y_plane: &[f32], out: &mut [u16], w #[allow(dead_code)] // dispatcher uses scalar directly for lossless f32 paths #[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn grayf32_to_rgb_f32_row(y_plane: &[f32], out: &mut [f32], width: usize) { +pub(crate) unsafe fn grayf32_to_rgb_f32_row( + y_plane: &[f32], + out: &mut [f32], + width: usize, +) { use crate::row::scalar::grayf32 as scalar; debug_assert!(y_plane.len() >= width); debug_assert!(out.len() >= width * 3); - scalar::grayf32_to_rgb_f32_row(y_plane, out, width); + scalar::grayf32_to_rgb_f32_row::(y_plane, out, width); } /// AVX-512 `grayf32_to_luma_row`: clamp [0,1] × 255 → u8. 16 px/iter. @@ -672,7 +715,11 @@ pub(crate) unsafe fn grayf32_to_rgb_f32_row(y_plane: &[f32], out: &mut [f32], wi /// AVX-512F must be available. 
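All of the lossy f32 kernels in this file quantize the same way, as the "Round-half-up: + 0.5 then truncate (matches scalar)" comment above states: clamp to [0, 1], scale by the target maximum, add 0.5, truncate. A scalar restatement of that rule (the function name is illustrative):

```rust
/// Illustrative scalar form of the quantization shared by the lossy f32 kernels.
fn quantize_unit_f32(y: f32, max: f32) -> u32 {
    let clamped = y.clamp(0.0, 1.0);
    (clamped * max + 0.5) as u32 // `as` truncates toward zero, giving round-half-up
}

// e.g. quantize_unit_f32(0.5, 255.0) == 128, quantize_unit_f32(1.0, 65535.0) == 65535
```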
#[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn grayf32_to_luma_row(y_plane: &[f32], out: &mut [u8], width: usize) { +pub(crate) unsafe fn grayf32_to_luma_row( + y_plane: &[f32], + out: &mut [u8], + width: usize, +) { use crate::row::scalar::grayf32 as scalar; debug_assert!(y_plane.len() >= width); debug_assert!(out.len() >= width); @@ -680,7 +727,9 @@ pub(crate) unsafe fn grayf32_to_luma_row(y_plane: &[f32], out: &mut [u8], width: let mut x = 0usize; unsafe { while x + 16 <= width { - let y = _mm512_loadu_ps(y_plane.as_ptr().add(x)); + let y = _mm512_castsi512_ps(load_endian_u32x16::( + y_plane.as_ptr().cast::().add(x * 4), + )); let clamped = _mm512_min_ps(_mm512_max_ps(y, _mm512_setzero_ps()), _mm512_set1_ps(1.0)); let int32 = _mm512_cvttps_epi32(_mm512_add_ps( _mm512_mul_ps(clamped, scale), @@ -692,7 +741,7 @@ pub(crate) unsafe fn grayf32_to_luma_row(y_plane: &[f32], out: &mut [u8], width: } } if x < width { - scalar::grayf32_to_luma_row(&y_plane[x..width], &mut out[x..width], width - x); + scalar::grayf32_to_luma_row::(&y_plane[x..width], &mut out[x..width], width - x); } } @@ -702,7 +751,11 @@ pub(crate) unsafe fn grayf32_to_luma_row(y_plane: &[f32], out: &mut [u8], width: /// AVX-512F must be available. #[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn grayf32_to_luma_u16_row(y_plane: &[f32], out: &mut [u16], width: usize) { +pub(crate) unsafe fn grayf32_to_luma_u16_row( + y_plane: &[f32], + out: &mut [u16], + width: usize, +) { use crate::row::scalar::grayf32 as scalar; debug_assert!(y_plane.len() >= width); debug_assert!(out.len() >= width); @@ -710,7 +763,9 @@ pub(crate) unsafe fn grayf32_to_luma_u16_row(y_plane: &[f32], out: &mut [u16], w let mut x = 0usize; unsafe { while x + 16 <= width { - let y = _mm512_loadu_ps(y_plane.as_ptr().add(x)); + let y = _mm512_castsi512_ps(load_endian_u32x16::( + y_plane.as_ptr().cast::().add(x * 4), + )); let clamped = _mm512_min_ps(_mm512_max_ps(y, _mm512_setzero_ps()), _mm512_set1_ps(1.0)); let int32 = _mm512_cvttps_epi32(_mm512_add_ps( _mm512_mul_ps(clamped, scale), @@ -722,7 +777,7 @@ pub(crate) unsafe fn grayf32_to_luma_u16_row(y_plane: &[f32], out: &mut [u16], w } } if x < width { - scalar::grayf32_to_luma_u16_row(&y_plane[x..width], &mut out[x..width], width - x); + scalar::grayf32_to_luma_u16_row::(&y_plane[x..width], &mut out[x..width], width - x); } } @@ -733,11 +788,15 @@ pub(crate) unsafe fn grayf32_to_luma_u16_row(y_plane: &[f32], out: &mut [u16], w #[allow(dead_code)] // dispatcher uses scalar directly for lossless f32 paths #[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn grayf32_to_luma_f32_row(y_plane: &[f32], out: &mut [f32], width: usize) { +pub(crate) unsafe fn grayf32_to_luma_f32_row( + y_plane: &[f32], + out: &mut [f32], + width: usize, +) { use crate::row::scalar::grayf32 as scalar; debug_assert!(y_plane.len() >= width); debug_assert!(out.len() >= width); - scalar::grayf32_to_luma_f32_row(y_plane, out, width); + scalar::grayf32_to_luma_f32_row::(y_plane, out, width); } /// AVX-512 `grayf32_to_hsv_row`: H=0, S=0, V = clamp(Y,0,1)×255. 16 px/iter. @@ -746,7 +805,7 @@ pub(crate) unsafe fn grayf32_to_luma_f32_row(y_plane: &[f32], out: &mut [f32], w /// AVX-512F must be available. 
#[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn grayf32_to_hsv_row( +pub(crate) unsafe fn grayf32_to_hsv_row( y_plane: &[f32], h_out: &mut [u8], s_out: &mut [u8], @@ -759,7 +818,9 @@ pub(crate) unsafe fn grayf32_to_hsv_row( let mut x = 0usize; unsafe { while x + 16 <= width { - let y = _mm512_loadu_ps(y_plane.as_ptr().add(x)); + let y = _mm512_castsi512_ps(load_endian_u32x16::( + y_plane.as_ptr().cast::().add(x * 4), + )); let clamped = _mm512_min_ps(_mm512_max_ps(y, _mm512_setzero_ps()), _mm512_set1_ps(1.0)); let int32 = _mm512_cvttps_epi32(_mm512_add_ps( _mm512_mul_ps(clamped, scale), @@ -774,7 +835,7 @@ pub(crate) unsafe fn grayf32_to_hsv_row( } } if x < width { - scalar::grayf32_to_hsv_row( + scalar::grayf32_to_hsv_row::( &y_plane[x..width], &mut h_out[x..width], &mut s_out[x..width], @@ -986,7 +1047,7 @@ pub(crate) unsafe fn ya8_to_hsv_row( /// AVX-512F+BW must be available. #[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn ya16_to_rgb_row(packed: &[u16], out: &mut [u8], width: usize) { +pub(crate) unsafe fn ya16_to_rgb_row(packed: &[u16], out: &mut [u8], width: usize) { use crate::row::scalar::ya16 as scalar; debug_assert!(packed.len() >= width * 2); debug_assert!(out.len() >= width * 3); @@ -1012,7 +1073,7 @@ pub(crate) unsafe fn ya16_to_rgb_row(packed: &[u16], out: &mut [u8], width: usiz } } if x < width { - scalar::ya16_to_rgb_row( + scalar::ya16_to_rgb_row::( &packed[x * 2..width * 2], &mut out[x * 3..width * 3], width - x, @@ -1026,7 +1087,11 @@ pub(crate) unsafe fn ya16_to_rgb_row(packed: &[u16], out: &mut [u8], width: usiz /// AVX-512F+BW must be available. #[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn ya16_to_rgba_row(packed: &[u16], out: &mut [u8], width: usize) { +pub(crate) unsafe fn ya16_to_rgba_row( + packed: &[u16], + out: &mut [u8], + width: usize, +) { use crate::row::scalar::ya16 as scalar; debug_assert!(packed.len() >= width * 2); debug_assert!(out.len() >= width * 4); @@ -1062,7 +1127,7 @@ pub(crate) unsafe fn ya16_to_rgba_row(packed: &[u16], out: &mut [u8], width: usi } } if x < width { - scalar::ya16_to_rgba_row( + scalar::ya16_to_rgba_row::( &packed[x * 2..width * 2], &mut out[x * 4..width * 4], width - x, @@ -1076,11 +1141,15 @@ pub(crate) unsafe fn ya16_to_rgba_row(packed: &[u16], out: &mut [u8], width: usi /// AVX-512F+BW must be available. #[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn ya16_to_rgb_u16_row(packed: &[u16], out: &mut [u16], width: usize) { +pub(crate) unsafe fn ya16_to_rgb_u16_row( + packed: &[u16], + out: &mut [u16], + width: usize, +) { use crate::row::scalar::ya16 as scalar; debug_assert!(packed.len() >= width * 2); debug_assert!(out.len() >= width * 3); - scalar::ya16_to_rgb_u16_row(packed, out, width); + scalar::ya16_to_rgb_u16_row::(packed, out, width); } /// AVX-512 `ya16_to_rgba_u16_row`: native Y and A u16. @@ -1089,11 +1158,15 @@ pub(crate) unsafe fn ya16_to_rgb_u16_row(packed: &[u16], out: &mut [u16], width: /// AVX-512F+BW must be available. 
#[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn ya16_to_rgba_u16_row(packed: &[u16], out: &mut [u16], width: usize) { +pub(crate) unsafe fn ya16_to_rgba_u16_row( + packed: &[u16], + out: &mut [u16], + width: usize, +) { use crate::row::scalar::ya16 as scalar; debug_assert!(packed.len() >= width * 2); debug_assert!(out.len() >= width * 4); - scalar::ya16_to_rgba_u16_row(packed, out, width); + scalar::ya16_to_rgba_u16_row::(packed, out, width); } /// AVX-512 `ya16_to_luma_row`: Y `>> 8` → u8. 4 px/iter. @@ -1102,7 +1175,11 @@ pub(crate) unsafe fn ya16_to_rgba_u16_row(packed: &[u16], out: &mut [u16], width /// AVX-512F+BW must be available. #[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn ya16_to_luma_row(packed: &[u16], out: &mut [u8], width: usize) { +pub(crate) unsafe fn ya16_to_luma_row( + packed: &[u16], + out: &mut [u8], + width: usize, +) { use crate::row::scalar::ya16 as scalar; debug_assert!(packed.len() >= width * 2); debug_assert!(out.len() >= width); @@ -1122,7 +1199,7 @@ pub(crate) unsafe fn ya16_to_luma_row(packed: &[u16], out: &mut [u8], width: usi } } if x < width { - scalar::ya16_to_luma_row(&packed[x * 2..width * 2], &mut out[x..width], width - x); + scalar::ya16_to_luma_row::(&packed[x * 2..width * 2], &mut out[x..width], width - x); } } @@ -1132,11 +1209,15 @@ pub(crate) unsafe fn ya16_to_luma_row(packed: &[u16], out: &mut [u8], width: usi /// AVX-512F+BW must be available. #[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn ya16_to_luma_u16_row(packed: &[u16], out: &mut [u16], width: usize) { +pub(crate) unsafe fn ya16_to_luma_u16_row( + packed: &[u16], + out: &mut [u16], + width: usize, +) { use crate::row::scalar::ya16 as scalar; debug_assert!(packed.len() >= width * 2); debug_assert!(out.len() >= width); - scalar::ya16_to_luma_u16_row(packed, out, width); + scalar::ya16_to_luma_u16_row::(packed, out, width); } /// AVX-512 `ya16_to_hsv_row`: H=0, S=0, V = Y `>> 8`. α dropped. @@ -1145,7 +1226,7 @@ pub(crate) unsafe fn ya16_to_luma_u16_row(packed: &[u16], out: &mut [u16], width /// AVX-512F+BW must be available. 
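For the YA16 kernels, `packed` is interleaved `[Y, A]` u16 pairs (hence the `width * 2` asserts); the documented rule for 8-bit outputs is Y `>> 8`, with alpha dropped on the luma and HSV paths. A per-pixel scalar restatement; treating alpha with the same high-byte reduction is an assumption here, only the Y handling is stated above:

```rust
/// Illustrative only: one YA16 pixel (interleaved Y then A) reduced to 8-bit values.
fn ya16_pixel_to_8bit(pair: [u16; 2]) -> (u8, u8) {
    let y = (pair[0] >> 8) as u8; // luma / HSV "V" channel
    let a = (pair[1] >> 8) as u8; // alpha: dropped by luma/HSV (8-bit RGBA assumed analogous)
    (y, a)
}
```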
#[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn ya16_to_hsv_row( +pub(crate) unsafe fn ya16_to_hsv_row( packed: &[u16], h_out: &mut [u8], s_out: &mut [u8], @@ -1173,7 +1254,7 @@ pub(crate) unsafe fn ya16_to_hsv_row( } } if x < width { - scalar::ya16_to_hsv_row( + scalar::ya16_to_hsv_row::( &packed[x * 2..width * 2], &mut h_out[x..width], &mut s_out[x..width], @@ -1226,8 +1307,8 @@ mod tests { prng_f32(&mut plane, 0xF512_0001); let mut simd = std::vec![0u8; w * 3]; let mut scal = std::vec![0u8; w * 3]; - unsafe { super::grayf32_to_rgb_row(&plane, &mut simd, w) }; - sf::grayf32_to_rgb_row(&plane, &mut scal, w); + unsafe { super::grayf32_to_rgb_row::(&plane, &mut simd, w) }; + sf::grayf32_to_rgb_row::(&plane, &mut scal, w); assert_eq!(simd, scal, "width={w}"); } } @@ -1244,8 +1325,8 @@ mod tests { prng_f32(&mut plane, 0xF512_0002); let mut simd = std::vec![0u8; w * 4]; let mut scal = std::vec![0u8; w * 4]; - unsafe { super::grayf32_to_rgba_row(&plane, &mut simd, w) }; - sf::grayf32_to_rgba_row(&plane, &mut scal, w); + unsafe { super::grayf32_to_rgba_row::(&plane, &mut simd, w) }; + sf::grayf32_to_rgba_row::(&plane, &mut scal, w); assert_eq!(simd, scal, "width={w}"); } } @@ -1262,8 +1343,8 @@ mod tests { prng_f32(&mut plane, 0xF512_0003); let mut simd = std::vec![0u16; w * 3]; let mut scal = std::vec![0u16; w * 3]; - unsafe { super::grayf32_to_rgb_u16_row(&plane, &mut simd, w) }; - sf::grayf32_to_rgb_u16_row(&plane, &mut scal, w); + unsafe { super::grayf32_to_rgb_u16_row::(&plane, &mut simd, w) }; + sf::grayf32_to_rgb_u16_row::(&plane, &mut scal, w); assert_eq!(simd, scal, "width={w}"); } } @@ -1280,8 +1361,8 @@ mod tests { prng_f32(&mut plane, 0xF512_0004); let mut simd = std::vec![0u16; w * 4]; let mut scal = std::vec![0u16; w * 4]; - unsafe { super::grayf32_to_rgba_u16_row(&plane, &mut simd, w) }; - sf::grayf32_to_rgba_u16_row(&plane, &mut scal, w); + unsafe { super::grayf32_to_rgba_u16_row::(&plane, &mut simd, w) }; + sf::grayf32_to_rgba_u16_row::(&plane, &mut scal, w); assert_eq!(simd, scal, "width={w}"); } } @@ -1298,8 +1379,8 @@ mod tests { prng_f32(&mut plane, 0xF512_0005); let mut simd = std::vec![0.0f32; w * 3]; let mut scal = std::vec![0.0f32; w * 3]; - unsafe { super::grayf32_to_rgb_f32_row(&plane, &mut simd, w) }; - sf::grayf32_to_rgb_f32_row(&plane, &mut scal, w); + unsafe { super::grayf32_to_rgb_f32_row::(&plane, &mut simd, w) }; + sf::grayf32_to_rgb_f32_row::(&plane, &mut scal, w); assert_eq!(simd, scal, "width={w}"); } } @@ -1316,8 +1397,8 @@ mod tests { prng_f32(&mut plane, 0xF512_0006); let mut simd = std::vec![0u8; w]; let mut scal = std::vec![0u8; w]; - unsafe { super::grayf32_to_luma_row(&plane, &mut simd, w) }; - sf::grayf32_to_luma_row(&plane, &mut scal, w); + unsafe { super::grayf32_to_luma_row::(&plane, &mut simd, w) }; + sf::grayf32_to_luma_row::(&plane, &mut scal, w); assert_eq!(simd, scal, "width={w}"); } } @@ -1334,8 +1415,8 @@ mod tests { prng_f32(&mut plane, 0xF512_0007); let mut simd = std::vec![0u16; w]; let mut scal = std::vec![0u16; w]; - unsafe { super::grayf32_to_luma_u16_row(&plane, &mut simd, w) }; - sf::grayf32_to_luma_u16_row(&plane, &mut scal, w); + unsafe { super::grayf32_to_luma_u16_row::(&plane, &mut simd, w) }; + sf::grayf32_to_luma_u16_row::(&plane, &mut scal, w); assert_eq!(simd, scal, "width={w}"); } } @@ -1352,8 +1433,8 @@ mod tests { prng_f32(&mut plane, 0xF512_0008); let mut simd = std::vec![0.0f32; w]; let mut scal = std::vec![0.0f32; w]; - unsafe { super::grayf32_to_luma_f32_row(&plane, &mut simd, w) }; - 
sf::grayf32_to_luma_f32_row(&plane, &mut scal, w); + unsafe { super::grayf32_to_luma_f32_row::(&plane, &mut simd, w) }; + sf::grayf32_to_luma_f32_row::(&plane, &mut scal, w); assert_eq!(simd, scal, "width={w}"); } } @@ -1374,8 +1455,8 @@ mod tests { let mut rh = std::vec![0u8; w]; let mut rs = std::vec![0u8; w]; let mut rv = std::vec![0u8; w]; - unsafe { super::grayf32_to_hsv_row(&plane, &mut sh, &mut ss, &mut sv, w) }; - sf::grayf32_to_hsv_row(&plane, &mut rh, &mut rs, &mut rv, w); + unsafe { super::grayf32_to_hsv_row::(&plane, &mut sh, &mut ss, &mut sv, w) }; + sf::grayf32_to_hsv_row::(&plane, &mut rh, &mut rs, &mut rv, w); assert_eq!(sh, rh, "H width={w}"); assert_eq!(ss, rs, "S width={w}"); assert_eq!(sv, rv, "V width={w}"); @@ -1546,8 +1627,8 @@ mod tests { prng_ya16(&mut packed, 0xA562_0001); let mut simd = std::vec![0u8; w * 3]; let mut scal = std::vec![0u8; w * 3]; - unsafe { super::ya16_to_rgb_row(&packed, &mut simd, w) }; - sy::ya16_to_rgb_row(&packed, &mut scal, w); + unsafe { super::ya16_to_rgb_row::(&packed, &mut simd, w) }; + sy::ya16_to_rgb_row::(&packed, &mut scal, w); assert_eq!(simd, scal, "width={w}"); } } @@ -1564,8 +1645,8 @@ mod tests { prng_ya16(&mut packed, 0xA562_0002); let mut simd = std::vec![0u8; w * 4]; let mut scal = std::vec![0u8; w * 4]; - unsafe { super::ya16_to_rgba_row(&packed, &mut simd, w) }; - sy::ya16_to_rgba_row(&packed, &mut scal, w); + unsafe { super::ya16_to_rgba_row::(&packed, &mut simd, w) }; + sy::ya16_to_rgba_row::(&packed, &mut scal, w); assert_eq!(simd, scal, "width={w}"); } } @@ -1582,8 +1663,8 @@ mod tests { prng_ya16(&mut packed, 0xA562_0003); let mut simd = std::vec![0u16; w * 3]; let mut scal = std::vec![0u16; w * 3]; - unsafe { super::ya16_to_rgb_u16_row(&packed, &mut simd, w) }; - sy::ya16_to_rgb_u16_row(&packed, &mut scal, w); + unsafe { super::ya16_to_rgb_u16_row::(&packed, &mut simd, w) }; + sy::ya16_to_rgb_u16_row::(&packed, &mut scal, w); assert_eq!(simd, scal, "width={w}"); } } @@ -1600,8 +1681,8 @@ mod tests { prng_ya16(&mut packed, 0xA562_0004); let mut simd = std::vec![0u16; w * 4]; let mut scal = std::vec![0u16; w * 4]; - unsafe { super::ya16_to_rgba_u16_row(&packed, &mut simd, w) }; - sy::ya16_to_rgba_u16_row(&packed, &mut scal, w); + unsafe { super::ya16_to_rgba_u16_row::(&packed, &mut simd, w) }; + sy::ya16_to_rgba_u16_row::(&packed, &mut scal, w); assert_eq!(simd, scal, "width={w}"); } } @@ -1618,8 +1699,8 @@ mod tests { prng_ya16(&mut packed, 0xA562_0005); let mut simd = std::vec![0u8; w]; let mut scal = std::vec![0u8; w]; - unsafe { super::ya16_to_luma_row(&packed, &mut simd, w) }; - sy::ya16_to_luma_row(&packed, &mut scal, w); + unsafe { super::ya16_to_luma_row::(&packed, &mut simd, w) }; + sy::ya16_to_luma_row::(&packed, &mut scal, w); assert_eq!(simd, scal, "width={w}"); } } @@ -1636,8 +1717,8 @@ mod tests { prng_ya16(&mut packed, 0xA562_0006); let mut simd = std::vec![0u16; w]; let mut scal = std::vec![0u16; w]; - unsafe { super::ya16_to_luma_u16_row(&packed, &mut simd, w) }; - sy::ya16_to_luma_u16_row(&packed, &mut scal, w); + unsafe { super::ya16_to_luma_u16_row::(&packed, &mut simd, w) }; + sy::ya16_to_luma_u16_row::(&packed, &mut scal, w); assert_eq!(simd, scal, "width={w}"); } } @@ -1658,8 +1739,8 @@ mod tests { let mut rh = std::vec![0u8; w]; let mut rs = std::vec![0u8; w]; let mut rv = std::vec![0u8; w]; - unsafe { super::ya16_to_hsv_row(&packed, &mut sh, &mut ss, &mut sv, w) }; - sy::ya16_to_hsv_row(&packed, &mut rh, &mut rs, &mut rv, w); + unsafe { super::ya16_to_hsv_row::(&packed, &mut sh, &mut ss, &mut 
sv, w) }; + sy::ya16_to_hsv_row::(&packed, &mut rh, &mut rs, &mut rv, w); assert_eq!(sh, rh, "H width={w}"); assert_eq!(ss, rs, "S width={w}"); assert_eq!(sv, rv, "V width={w}"); @@ -1698,8 +1779,8 @@ mod tests { prng16(&mut plane, 0x1234_ABCD); let mut simd = std::vec![0u16; w]; let mut scal = std::vec![0u16; w]; - unsafe { super::gray_n_to_luma_u16_row::<10>(&plane, &mut simd, w) }; - scalar::gray_n_to_luma_u16_row::<10>(&plane, &mut scal, w); + unsafe { super::gray_n_to_luma_u16_row::<10, false>(&plane, &mut simd, w) }; + scalar::gray_n_to_luma_u16_row::<10, false>(&plane, &mut scal, w); assert_eq!(simd, scal, "width={w}"); } } @@ -1714,8 +1795,8 @@ mod tests { prng16(&mut plane, 0xCAFE_BABE); let mut simd = std::vec![0u8; w]; let mut scal = std::vec![0u8; w]; - unsafe { super::gray16_to_luma_row(&plane, &mut simd, w) }; - scalar::gray16_to_luma_row(&plane, &mut scal, w); + unsafe { super::gray16_to_luma_row::(&plane, &mut simd, w) }; + scalar::gray16_to_luma_row::(&plane, &mut scal, w); assert_eq!(simd, scal, "width={w}"); } } @@ -1730,8 +1811,8 @@ mod tests { prng16(&mut plane, 0xDEAD_BEEF); let mut simd = std::vec![0u16; w]; let mut scal = std::vec![0u16; w]; - unsafe { super::gray16_to_luma_u16_row(&plane, &mut simd, w) }; - scalar::gray16_to_luma_u16_row(&plane, &mut scal, w); + unsafe { super::gray16_to_luma_u16_row::(&plane, &mut simd, w) }; + scalar::gray16_to_luma_u16_row::(&plane, &mut scal, w); assert_eq!(simd, scal, "width={w}"); } } @@ -1762,9 +1843,45 @@ mod tests { prng16(&mut plane, 0x1234_5678); let mut simd = std::vec![0u8; w * 3]; let mut scal = std::vec![0u8; w * 3]; - unsafe { super::gray16_to_rgb_row(&plane, &mut simd, w, false) }; - scalar::gray16_to_rgb_row(&plane, &mut scal, w, false); + unsafe { super::gray16_to_rgb_row::(&plane, &mut simd, w, false) }; + scalar::gray16_to_rgb_row::(&plane, &mut scal, w, false); assert_eq!(simd, scal, "width={w} limited-range"); } } + + // ---- BE parity tests -------------------------------------------------------- + + #[test] + fn avx512_gray10_be_parity_luma() { + if !is_x86_feature_detected!("avx512bw") { + return; + } + for &w in WIDTHS { + let mut le = std::vec![0u16; w]; + prng16(&mut le, 0xBE10_0001); + let be: std::vec::Vec = le.iter().map(|v| v.swap_bytes()).collect(); + let mut simd_be = std::vec![0u8; w]; + let mut scal_le = std::vec![0u8; w]; + unsafe { super::gray_n_to_luma_row::<10, true>(&be, &mut simd_be, w) }; + scalar::gray_n_to_luma_row::<10, false>(&le, &mut scal_le, w); + assert_eq!(simd_be, scal_le, "width={w}"); + } + } + + #[test] + fn avx512_gray16_be_parity_luma() { + if !is_x86_feature_detected!("avx512bw") { + return; + } + for &w in WIDTHS { + let mut le = std::vec![0u16; w]; + prng16(&mut le, 0xBE16_0002); + let be: std::vec::Vec = le.iter().map(|v| v.swap_bytes()).collect(); + let mut simd_be = std::vec![0u8; w]; + let mut scal_le = std::vec![0u8; w]; + unsafe { super::gray16_to_luma_row::(&be, &mut simd_be, w) }; + scalar::gray16_to_luma_row::(&le, &mut scal_le, w); + assert_eq!(simd_be, scal_le, "width={w}"); + } + } } diff --git a/src/row/arch/x86_sse41/gray.rs b/src/row/arch/x86_sse41/gray.rs index 4f101bfe..52f77b5c 100644 --- a/src/row/arch/x86_sse41/gray.rs +++ b/src/row/arch/x86_sse41/gray.rs @@ -17,7 +17,10 @@ use core::arch::x86_64::*; -use crate::row::scalar::{bits_mask, gray as scalar}; +use crate::row::{ + arch::x86_sse41::endian::{load_endian_u16x8, load_endian_u32x4}, + scalar::{bits_mask, gray as scalar}, +}; // ---- Gray8 
------------------------------------------------------------------ @@ -125,7 +128,7 @@ pub(crate) unsafe fn gray8_to_hsv_row( #[allow(dead_code)] #[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn gray_n_to_rgb_row( +pub(crate) unsafe fn gray_n_to_rgb_row( y_plane: &[u16], out: &mut [u8], width: usize, @@ -133,7 +136,7 @@ pub(crate) unsafe fn gray_n_to_rgb_row( ) { debug_assert!(y_plane.len() >= width); debug_assert!(out.len() >= width * 3); - scalar::gray_n_to_rgb_row::(y_plane, out, width, full_range); + scalar::gray_n_to_rgb_row::(y_plane, out, width, full_range); } /// SSE4.1 `gray_n_to_rgba_row`. @@ -145,7 +148,7 @@ pub(crate) unsafe fn gray_n_to_rgb_row( #[allow(dead_code)] #[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn gray_n_to_rgba_row( +pub(crate) unsafe fn gray_n_to_rgba_row( y_plane: &[u16], out: &mut [u8], width: usize, @@ -155,7 +158,7 @@ pub(crate) unsafe fn gray_n_to_rgba_row( debug_assert!(out.len() >= width * 4); // SSE4.1 4-channel interleave without SSSE3 shuffle tables is complex; // delegate to scalar (which auto-vectorizes well at -O3). - scalar::gray_n_to_rgba_row::(y_plane, out, width, full_range); + scalar::gray_n_to_rgba_row::(y_plane, out, width, full_range); } /// SSE4.1 `gray_n_to_rgb_u16_row`. @@ -167,7 +170,7 @@ pub(crate) unsafe fn gray_n_to_rgba_row( #[allow(dead_code)] #[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn gray_n_to_rgb_u16_row( +pub(crate) unsafe fn gray_n_to_rgb_u16_row( y_plane: &[u16], out: &mut [u16], width: usize, @@ -175,7 +178,7 @@ pub(crate) unsafe fn gray_n_to_rgb_u16_row( ) { debug_assert!(y_plane.len() >= width); debug_assert!(out.len() >= width * 3); - scalar::gray_n_to_rgb_u16_row::(y_plane, out, width, full_range); + scalar::gray_n_to_rgb_u16_row::(y_plane, out, width, full_range); } /// SSE4.1 `gray_n_to_rgba_u16_row`. @@ -187,7 +190,7 @@ pub(crate) unsafe fn gray_n_to_rgb_u16_row( #[allow(dead_code)] #[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn gray_n_to_rgba_u16_row( +pub(crate) unsafe fn gray_n_to_rgba_u16_row( y_plane: &[u16], out: &mut [u16], width: usize, @@ -195,7 +198,7 @@ pub(crate) unsafe fn gray_n_to_rgba_u16_row( ) { debug_assert!(y_plane.len() >= width); debug_assert!(out.len() >= width * 4); - scalar::gray_n_to_rgba_u16_row::(y_plane, out, width, full_range); + scalar::gray_n_to_rgba_u16_row::(y_plane, out, width, full_range); } /// SSE4.1 `gray_n_to_luma_row`: mask, shift, pack, store. @@ -207,7 +210,7 @@ pub(crate) unsafe fn gray_n_to_rgba_u16_row( #[allow(dead_code)] #[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn gray_n_to_luma_row( +pub(crate) unsafe fn gray_n_to_luma_row( y_plane: &[u16], out: &mut [u8], width: usize, @@ -223,7 +226,7 @@ pub(crate) unsafe fn gray_n_to_luma_row( // variant `_mm_srl_epi16` with a count vector built from the shift amount. 
let shr = _mm_cvtsi32_si128((BITS - 8) as i32); while x + 8 <= width { - let raw = _mm_loadu_si128(y_plane.as_ptr().add(x).cast()); + let raw = load_endian_u16x8::(y_plane.as_ptr().cast::().add(x * 2)); let masked = _mm_and_si128(raw, mask_v); let shifted = _mm_srl_epi16(masked, shr); let zero = _mm_setzero_si128(); @@ -236,7 +239,7 @@ pub(crate) unsafe fn gray_n_to_luma_row( } } if x < width { - scalar::gray_n_to_luma_row::(&y_plane[x..width], &mut out[x..width], width - x); + scalar::gray_n_to_luma_row::(&y_plane[x..width], &mut out[x..width], width - x); } } @@ -249,7 +252,7 @@ pub(crate) unsafe fn gray_n_to_luma_row( #[allow(dead_code)] #[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn gray_n_to_luma_u16_row( +pub(crate) unsafe fn gray_n_to_luma_u16_row( y_plane: &[u16], out: &mut [u16], width: usize, @@ -261,14 +264,14 @@ pub(crate) unsafe fn gray_n_to_luma_u16_row( unsafe { let mask_v = _mm_set1_epi16(mask as i16); while x + 8 <= width { - let raw = _mm_loadu_si128(y_plane.as_ptr().add(x).cast()); + let raw = load_endian_u16x8::(y_plane.as_ptr().cast::().add(x * 2)); let masked = _mm_and_si128(raw, mask_v); _mm_storeu_si128(out.as_mut_ptr().add(x).cast(), masked); x += 8; } } if x < width { - scalar::gray_n_to_luma_u16_row::(&y_plane[x..width], &mut out[x..width], width - x); + scalar::gray_n_to_luma_u16_row::(&y_plane[x..width], &mut out[x..width], width - x); } } @@ -282,7 +285,7 @@ pub(crate) unsafe fn gray_n_to_luma_u16_row( #[allow(dead_code)] #[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn gray_n_to_hsv_row( +pub(crate) unsafe fn gray_n_to_hsv_row( y_plane: &[u16], h_out: &mut [u8], s_out: &mut [u8], @@ -292,7 +295,7 @@ pub(crate) unsafe fn gray_n_to_hsv_row( ) { debug_assert!(y_plane.len() >= width); if !full_range { - return scalar::gray_n_to_hsv_row::(y_plane, h_out, s_out, v_out, width, full_range); + return scalar::gray_n_to_hsv_row::(y_plane, h_out, s_out, v_out, width, full_range); } let mask = bits_mask::(); let mut x = 0usize; @@ -301,7 +304,7 @@ pub(crate) unsafe fn gray_n_to_hsv_row( let shr = _mm_cvtsi32_si128((BITS - 8) as i32); let zero = _mm_setzero_si128(); while x + 8 <= width { - let raw = _mm_loadu_si128(y_plane.as_ptr().add(x).cast()); + let raw = load_endian_u16x8::(y_plane.as_ptr().cast::().add(x * 2)); let masked = _mm_and_si128(raw, mask_v); let shifted = _mm_srl_epi16(masked, shr); let packed = _mm_packus_epi16(shifted, zero); @@ -315,7 +318,7 @@ pub(crate) unsafe fn gray_n_to_hsv_row( } } if x < width { - scalar::gray_n_to_hsv_row::( + scalar::gray_n_to_hsv_row::( &y_plane[x..width], &mut h_out[x..width], &mut s_out[x..width], @@ -337,7 +340,7 @@ pub(crate) unsafe fn gray_n_to_hsv_row( #[allow(dead_code)] #[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn gray16_to_rgb_row( +pub(crate) unsafe fn gray16_to_rgb_row( y_plane: &[u16], out: &mut [u8], width: usize, @@ -345,7 +348,7 @@ pub(crate) unsafe fn gray16_to_rgb_row( ) { debug_assert!(y_plane.len() >= width); debug_assert!(out.len() >= width * 3); - scalar::gray16_to_rgb_row(y_plane, out, width, full_range); + scalar::gray16_to_rgb_row::(y_plane, out, width, full_range); } /// SSE4.1 `gray16_to_rgba_row`: `>> 8` → RGBA u8. 
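The SSE4.1 file follows the same pattern as the AVX-512 one: every u16 load above goes through `load_endian_u16x8` from the new `arch::x86_sse41::endian` module (not shown in this diff), and the f32 kernels further down pair `load_endian_u32x4` with `_mm_castsi128_ps`. A minimal sketch of the 8-lane helper, again assuming a shift-based byte swap (names and implementation are illustrative):

```rust
use core::arch::x86_64::*;

/// Illustrative only: load 8 u16 lanes and byte-swap each lane when `BE` is true.
#[target_feature(enable = "sse4.1")]
unsafe fn load_endian_u16x8_sketch<const BE: bool>(ptr: *const u8) -> __m128i {
    let raw = _mm_loadu_si128(ptr.cast());
    if !BE {
        return raw;
    }
    // (v << 8) | (v >> 8) per 16-bit lane, using SSE2 shifts only.
    let c8 = _mm_cvtsi32_si128(8);
    _mm_or_si128(_mm_sll_epi16(raw, c8), _mm_srl_epi16(raw, c8))
}
```

A 4-lane u32 variant for the f32 paths can add one more swap of the 16-bit halves within each 32-bit lane, as in the AVX-512 sketch earlier.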
@@ -357,7 +360,7 @@ pub(crate) unsafe fn gray16_to_rgb_row( #[allow(dead_code)] #[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn gray16_to_rgba_row( +pub(crate) unsafe fn gray16_to_rgba_row( y_plane: &[u16], out: &mut [u8], width: usize, @@ -365,7 +368,7 @@ pub(crate) unsafe fn gray16_to_rgba_row( ) { debug_assert!(y_plane.len() >= width); debug_assert!(out.len() >= width * 4); - scalar::gray16_to_rgba_row(y_plane, out, width, full_range); + scalar::gray16_to_rgba_row::(y_plane, out, width, full_range); } /// SSE4.1 `gray16_to_rgb_u16_row`. @@ -377,7 +380,7 @@ pub(crate) unsafe fn gray16_to_rgba_row( #[allow(dead_code)] #[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn gray16_to_rgb_u16_row( +pub(crate) unsafe fn gray16_to_rgb_u16_row( y_plane: &[u16], out: &mut [u16], width: usize, @@ -385,7 +388,7 @@ pub(crate) unsafe fn gray16_to_rgb_u16_row( ) { debug_assert!(y_plane.len() >= width); debug_assert!(out.len() >= width * 3); - scalar::gray16_to_rgb_u16_row(y_plane, out, width, full_range); + scalar::gray16_to_rgb_u16_row::(y_plane, out, width, full_range); } /// SSE4.1 `gray16_to_rgba_u16_row`. @@ -397,7 +400,7 @@ pub(crate) unsafe fn gray16_to_rgb_u16_row( #[allow(dead_code)] #[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn gray16_to_rgba_u16_row( +pub(crate) unsafe fn gray16_to_rgba_u16_row( y_plane: &[u16], out: &mut [u16], width: usize, @@ -405,7 +408,7 @@ pub(crate) unsafe fn gray16_to_rgba_u16_row( ) { debug_assert!(y_plane.len() >= width); debug_assert!(out.len() >= width * 4); - scalar::gray16_to_rgba_u16_row(y_plane, out, width, full_range); + scalar::gray16_to_rgba_u16_row::(y_plane, out, width, full_range); } /// SSE4.1 `gray16_to_luma_row`: `>> 8`, pack, store. @@ -417,14 +420,18 @@ pub(crate) unsafe fn gray16_to_rgba_u16_row( #[allow(dead_code)] #[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn gray16_to_luma_row(y_plane: &[u16], out: &mut [u8], width: usize) { +pub(crate) unsafe fn gray16_to_luma_row( + y_plane: &[u16], + out: &mut [u8], + width: usize, +) { debug_assert!(y_plane.len() >= width); debug_assert!(out.len() >= width); let mut x = 0usize; unsafe { let zero = _mm_setzero_si128(); while x + 8 <= width { - let raw = _mm_loadu_si128(y_plane.as_ptr().add(x).cast()); + let raw = load_endian_u16x8::(y_plane.as_ptr().cast::().add(x * 2)); let shifted = _mm_srli_epi16(raw, 8); let packed = _mm_packus_epi16(shifted, zero); let val = _mm_cvtsi128_si64(packed) as u64; @@ -433,7 +440,7 @@ pub(crate) unsafe fn gray16_to_luma_row(y_plane: &[u16], out: &mut [u8], width: } } if x < width { - scalar::gray16_to_luma_row(&y_plane[x..width], &mut out[x..width], width - x); + scalar::gray16_to_luma_row::(&y_plane[x..width], &mut out[x..width], width - x); } } @@ -446,19 +453,23 @@ pub(crate) unsafe fn gray16_to_luma_row(y_plane: &[u16], out: &mut [u8], width: #[allow(dead_code)] #[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn gray16_to_luma_u16_row(y_plane: &[u16], out: &mut [u16], width: usize) { +pub(crate) unsafe fn gray16_to_luma_u16_row( + y_plane: &[u16], + out: &mut [u16], + width: usize, +) { debug_assert!(y_plane.len() >= width); debug_assert!(out.len() >= width); let mut x = 0usize; unsafe { while x + 8 <= width { - let y = _mm_loadu_si128(y_plane.as_ptr().add(x).cast()); + let y = load_endian_u16x8::(y_plane.as_ptr().cast::().add(x * 2)); _mm_storeu_si128(out.as_mut_ptr().add(x).cast(), y); x += 8; } } if x < width { - scalar::gray16_to_luma_u16_row(&y_plane[x..width], &mut 
out[x..width], width - x); + scalar::gray16_to_luma_u16_row::(&y_plane[x..width], &mut out[x..width], width - x); } } @@ -472,7 +483,7 @@ pub(crate) unsafe fn gray16_to_luma_u16_row(y_plane: &[u16], out: &mut [u16], wi #[allow(dead_code)] #[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn gray16_to_hsv_row( +pub(crate) unsafe fn gray16_to_hsv_row( y_plane: &[u16], h_out: &mut [u8], s_out: &mut [u8], @@ -482,13 +493,13 @@ pub(crate) unsafe fn gray16_to_hsv_row( ) { debug_assert!(y_plane.len() >= width); if !full_range { - return scalar::gray16_to_hsv_row(y_plane, h_out, s_out, v_out, width, full_range); + return scalar::gray16_to_hsv_row::(y_plane, h_out, s_out, v_out, width, full_range); } let mut x = 0usize; unsafe { let zero16 = _mm_setzero_si128(); while x + 8 <= width { - let raw = _mm_loadu_si128(y_plane.as_ptr().add(x).cast()); + let raw = load_endian_u16x8::(y_plane.as_ptr().cast::().add(x * 2)); let shifted = _mm_srli_epi16(raw, 8); let packed = _mm_packus_epi16(shifted, zero16); let val = _mm_cvtsi128_si64(packed) as u64; @@ -499,7 +510,7 @@ pub(crate) unsafe fn gray16_to_hsv_row( } } if x < width { - scalar::gray16_to_hsv_row( + scalar::gray16_to_hsv_row::( &y_plane[x..width], &mut h_out[x..width], &mut s_out[x..width], @@ -522,7 +533,11 @@ pub(crate) unsafe fn gray16_to_hsv_row( /// SSE4.1 must be available. #[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn grayf32_to_rgb_row(y_plane: &[f32], out: &mut [u8], width: usize) { +pub(crate) unsafe fn grayf32_to_rgb_row( + y_plane: &[f32], + out: &mut [u8], + width: usize, +) { use crate::row::scalar::grayf32 as scalar; debug_assert!(y_plane.len() >= width); debug_assert!(out.len() >= width * 3); @@ -532,7 +547,9 @@ pub(crate) unsafe fn grayf32_to_rgb_row(y_plane: &[f32], out: &mut [u8], width: let mut x = 0usize; unsafe { while x + 4 <= width { - let y = _mm_loadu_ps(y_plane.as_ptr().add(x)); + let y = _mm_castsi128_ps(load_endian_u32x4::( + y_plane.as_ptr().cast::().add(x * 4), + )); let clamped = _mm_min_ps(_mm_max_ps(y, zero), one); let scaled = _mm_mul_ps(clamped, scale); let int32 = _mm_cvttps_epi32(_mm_add_ps(scaled, _mm_set1_ps(0.5))); @@ -560,7 +577,7 @@ pub(crate) unsafe fn grayf32_to_rgb_row(y_plane: &[f32], out: &mut [u8], width: } } if x < width { - scalar::grayf32_to_rgb_row(&y_plane[x..width], &mut out[x * 3..width * 3], width - x); + scalar::grayf32_to_rgb_row::(&y_plane[x..width], &mut out[x * 3..width * 3], width - x); } } @@ -570,7 +587,11 @@ pub(crate) unsafe fn grayf32_to_rgb_row(y_plane: &[f32], out: &mut [u8], width: /// SSE4.1 must be available. 
#[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn grayf32_to_rgba_row(y_plane: &[f32], out: &mut [u8], width: usize) { +pub(crate) unsafe fn grayf32_to_rgba_row( + y_plane: &[f32], + out: &mut [u8], + width: usize, +) { use crate::row::scalar::grayf32 as scalar; debug_assert!(y_plane.len() >= width); debug_assert!(out.len() >= width * 4); @@ -580,7 +601,9 @@ pub(crate) unsafe fn grayf32_to_rgba_row(y_plane: &[f32], out: &mut [u8], width: let mut x = 0usize; unsafe { while x + 4 <= width { - let y = _mm_loadu_ps(y_plane.as_ptr().add(x)); + let y = _mm_castsi128_ps(load_endian_u32x4::( + y_plane.as_ptr().cast::().add(x * 4), + )); let clamped = _mm_min_ps(_mm_max_ps(y, zero), one); let scaled = _mm_mul_ps(clamped, scale); let int32 = _mm_cvttps_epi32(_mm_add_ps(scaled, _mm_set1_ps(0.5))); @@ -611,7 +634,7 @@ pub(crate) unsafe fn grayf32_to_rgba_row(y_plane: &[f32], out: &mut [u8], width: } } if x < width { - scalar::grayf32_to_rgba_row(&y_plane[x..width], &mut out[x * 4..width * 4], width - x); + scalar::grayf32_to_rgba_row::(&y_plane[x..width], &mut out[x * 4..width * 4], width - x); } } @@ -621,7 +644,11 @@ pub(crate) unsafe fn grayf32_to_rgba_row(y_plane: &[f32], out: &mut [u8], width: /// SSE4.1 must be available. #[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn grayf32_to_rgb_u16_row(y_plane: &[f32], out: &mut [u16], width: usize) { +pub(crate) unsafe fn grayf32_to_rgb_u16_row( + y_plane: &[f32], + out: &mut [u16], + width: usize, +) { use crate::row::scalar::grayf32 as scalar; debug_assert!(y_plane.len() >= width); debug_assert!(out.len() >= width * 3); @@ -631,7 +658,9 @@ pub(crate) unsafe fn grayf32_to_rgb_u16_row(y_plane: &[f32], out: &mut [u16], wi let mut x = 0usize; unsafe { while x + 4 <= width { - let y = _mm_loadu_ps(y_plane.as_ptr().add(x)); + let y = _mm_castsi128_ps(load_endian_u32x4::( + y_plane.as_ptr().cast::().add(x * 4), + )); let clamped = _mm_min_ps(_mm_max_ps(y, zero), one); let scaled = _mm_mul_ps(clamped, scale); let int32 = _mm_cvttps_epi32(_mm_add_ps(scaled, _mm_set1_ps(0.5))); @@ -657,7 +686,7 @@ pub(crate) unsafe fn grayf32_to_rgb_u16_row(y_plane: &[f32], out: &mut [u16], wi } } if x < width { - scalar::grayf32_to_rgb_u16_row(&y_plane[x..width], &mut out[x * 3..width * 3], width - x); + scalar::grayf32_to_rgb_u16_row::(&y_plane[x..width], &mut out[x * 3..width * 3], width - x); } } @@ -667,7 +696,11 @@ pub(crate) unsafe fn grayf32_to_rgb_u16_row(y_plane: &[f32], out: &mut [u16], wi /// SSE4.1 must be available. 
#[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn grayf32_to_rgba_u16_row(y_plane: &[f32], out: &mut [u16], width: usize) { +pub(crate) unsafe fn grayf32_to_rgba_u16_row( + y_plane: &[f32], + out: &mut [u16], + width: usize, +) { use crate::row::scalar::grayf32 as scalar; debug_assert!(y_plane.len() >= width); debug_assert!(out.len() >= width * 4); @@ -677,7 +710,9 @@ pub(crate) unsafe fn grayf32_to_rgba_u16_row(y_plane: &[f32], out: &mut [u16], w let mut x = 0usize; unsafe { while x + 4 <= width { - let y = _mm_loadu_ps(y_plane.as_ptr().add(x)); + let y = _mm_castsi128_ps(load_endian_u32x4::( + y_plane.as_ptr().cast::().add(x * 4), + )); let clamped = _mm_min_ps(_mm_max_ps(y, zero), one); let scaled = _mm_mul_ps(clamped, scale); let int32 = _mm_cvttps_epi32(_mm_add_ps(scaled, _mm_set1_ps(0.5))); @@ -706,7 +741,11 @@ pub(crate) unsafe fn grayf32_to_rgba_u16_row(y_plane: &[f32], out: &mut [u16], w } } if x < width { - scalar::grayf32_to_rgba_u16_row(&y_plane[x..width], &mut out[x * 4..width * 4], width - x); + scalar::grayf32_to_rgba_u16_row::( + &y_plane[x..width], + &mut out[x * 4..width * 4], + width - x, + ); } } @@ -717,12 +756,16 @@ pub(crate) unsafe fn grayf32_to_rgba_u16_row(y_plane: &[f32], out: &mut [u16], w #[allow(dead_code)] // dispatcher uses scalar directly for lossless f32 paths #[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn grayf32_to_rgb_f32_row(y_plane: &[f32], out: &mut [f32], width: usize) { +pub(crate) unsafe fn grayf32_to_rgb_f32_row( + y_plane: &[f32], + out: &mut [f32], + width: usize, +) { use crate::row::scalar::grayf32 as scalar; debug_assert!(y_plane.len() >= width); debug_assert!(out.len() >= width * 3); // f32 triplet broadcast: scalar is already optimal here. - scalar::grayf32_to_rgb_f32_row(y_plane, out, width); + scalar::grayf32_to_rgb_f32_row::(y_plane, out, width); } /// SSE4.1 `grayf32_to_luma_row`: clamp [0,1] × 255 → u8. @@ -731,7 +774,11 @@ pub(crate) unsafe fn grayf32_to_rgb_f32_row(y_plane: &[f32], out: &mut [f32], wi /// SSE4.1 must be available. #[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn grayf32_to_luma_row(y_plane: &[f32], out: &mut [u8], width: usize) { +pub(crate) unsafe fn grayf32_to_luma_row( + y_plane: &[f32], + out: &mut [u8], + width: usize, +) { use crate::row::scalar::grayf32 as scalar; debug_assert!(y_plane.len() >= width); debug_assert!(out.len() >= width); @@ -741,7 +788,9 @@ pub(crate) unsafe fn grayf32_to_luma_row(y_plane: &[f32], out: &mut [u8], width: let mut x = 0usize; unsafe { while x + 4 <= width { - let y = _mm_loadu_ps(y_plane.as_ptr().add(x)); + let y = _mm_castsi128_ps(load_endian_u32x4::( + y_plane.as_ptr().cast::().add(x * 4), + )); let clamped = _mm_min_ps(_mm_max_ps(y, zero), one); let scaled = _mm_mul_ps(clamped, scale); let int32 = _mm_cvttps_epi32(_mm_add_ps(scaled, _mm_set1_ps(0.5))); @@ -754,7 +803,7 @@ pub(crate) unsafe fn grayf32_to_luma_row(y_plane: &[f32], out: &mut [u8], width: } } if x < width { - scalar::grayf32_to_luma_row(&y_plane[x..width], &mut out[x..width], width - x); + scalar::grayf32_to_luma_row::(&y_plane[x..width], &mut out[x..width], width - x); } } @@ -764,7 +813,11 @@ pub(crate) unsafe fn grayf32_to_luma_row(y_plane: &[f32], out: &mut [u8], width: /// SSE4.1 must be available. 
#[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn grayf32_to_luma_u16_row(y_plane: &[f32], out: &mut [u16], width: usize) { +pub(crate) unsafe fn grayf32_to_luma_u16_row( + y_plane: &[f32], + out: &mut [u16], + width: usize, +) { use crate::row::scalar::grayf32 as scalar; debug_assert!(y_plane.len() >= width); debug_assert!(out.len() >= width); @@ -774,7 +827,9 @@ pub(crate) unsafe fn grayf32_to_luma_u16_row(y_plane: &[f32], out: &mut [u16], w let mut x = 0usize; unsafe { while x + 4 <= width { - let y = _mm_loadu_ps(y_plane.as_ptr().add(x)); + let y = _mm_castsi128_ps(load_endian_u32x4::( + y_plane.as_ptr().cast::().add(x * 4), + )); let clamped = _mm_min_ps(_mm_max_ps(y, zero), one); let scaled = _mm_mul_ps(clamped, scale); let int32 = _mm_cvttps_epi32(_mm_add_ps(scaled, _mm_set1_ps(0.5))); @@ -787,7 +842,7 @@ pub(crate) unsafe fn grayf32_to_luma_u16_row(y_plane: &[f32], out: &mut [u16], w } } if x < width { - scalar::grayf32_to_luma_u16_row(&y_plane[x..width], &mut out[x..width], width - x); + scalar::grayf32_to_luma_u16_row::(&y_plane[x..width], &mut out[x..width], width - x); } } @@ -798,11 +853,15 @@ pub(crate) unsafe fn grayf32_to_luma_u16_row(y_plane: &[f32], out: &mut [u16], w #[allow(dead_code)] // dispatcher uses scalar directly for lossless f32 paths #[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn grayf32_to_luma_f32_row(y_plane: &[f32], out: &mut [f32], width: usize) { +pub(crate) unsafe fn grayf32_to_luma_f32_row( + y_plane: &[f32], + out: &mut [f32], + width: usize, +) { use crate::row::scalar::grayf32 as scalar; debug_assert!(y_plane.len() >= width); debug_assert!(out.len() >= width); - scalar::grayf32_to_luma_f32_row(y_plane, out, width); + scalar::grayf32_to_luma_f32_row::(y_plane, out, width); } /// SSE4.1 `grayf32_to_hsv_row`: H=0, S=0, V = clamp(Y,0,1)×255. @@ -811,7 +870,7 @@ pub(crate) unsafe fn grayf32_to_luma_f32_row(y_plane: &[f32], out: &mut [f32], w /// SSE4.1 must be available. #[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn grayf32_to_hsv_row( +pub(crate) unsafe fn grayf32_to_hsv_row( y_plane: &[f32], h_out: &mut [u8], s_out: &mut [u8], @@ -827,7 +886,9 @@ pub(crate) unsafe fn grayf32_to_hsv_row( let mut x = 0usize; unsafe { while x + 4 <= width { - let y = _mm_loadu_ps(y_plane.as_ptr().add(x)); + let y = _mm_castsi128_ps(load_endian_u32x4::( + y_plane.as_ptr().cast::().add(x * 4), + )); let clamped = _mm_min_ps(_mm_max_ps(y, zero), one); let scaled = _mm_mul_ps(clamped, scale); let int32 = _mm_cvttps_epi32(_mm_add_ps(scaled, _mm_set1_ps(0.5))); @@ -844,7 +905,7 @@ pub(crate) unsafe fn grayf32_to_hsv_row( } } if x < width { - scalar::grayf32_to_hsv_row( + scalar::grayf32_to_hsv_row::( &y_plane[x..width], &mut h_out[x..width], &mut s_out[x..width], @@ -1063,7 +1124,7 @@ pub(crate) unsafe fn ya8_to_hsv_row( /// SSE4.1 must be available. 
#[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn ya16_to_rgb_row(packed: &[u16], out: &mut [u8], width: usize) { +pub(crate) unsafe fn ya16_to_rgb_row(packed: &[u16], out: &mut [u8], width: usize) { use crate::row::scalar::ya16 as scalar; debug_assert!(packed.len() >= width * 2); debug_assert!(out.len() >= width * 3); @@ -1091,7 +1152,7 @@ pub(crate) unsafe fn ya16_to_rgb_row(packed: &[u16], out: &mut [u8], width: usiz } } if x < width { - scalar::ya16_to_rgb_row( + scalar::ya16_to_rgb_row::( &packed[x * 2..width * 2], &mut out[x * 3..width * 3], width - x, @@ -1105,7 +1166,11 @@ pub(crate) unsafe fn ya16_to_rgb_row(packed: &[u16], out: &mut [u8], width: usiz /// SSE4.1 must be available. #[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn ya16_to_rgba_row(packed: &[u16], out: &mut [u8], width: usize) { +pub(crate) unsafe fn ya16_to_rgba_row( + packed: &[u16], + out: &mut [u8], + width: usize, +) { use crate::row::scalar::ya16 as scalar; debug_assert!(packed.len() >= width * 2); debug_assert!(out.len() >= width * 4); @@ -1141,7 +1206,7 @@ pub(crate) unsafe fn ya16_to_rgba_row(packed: &[u16], out: &mut [u8], width: usi } } if x < width { - scalar::ya16_to_rgba_row( + scalar::ya16_to_rgba_row::( &packed[x * 2..width * 2], &mut out[x * 4..width * 4], width - x, @@ -1155,11 +1220,15 @@ pub(crate) unsafe fn ya16_to_rgba_row(packed: &[u16], out: &mut [u8], width: usi /// SSE4.1 must be available. #[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn ya16_to_rgb_u16_row(packed: &[u16], out: &mut [u16], width: usize) { +pub(crate) unsafe fn ya16_to_rgb_u16_row( + packed: &[u16], + out: &mut [u16], + width: usize, +) { use crate::row::scalar::ya16 as scalar; debug_assert!(packed.len() >= width * 2); debug_assert!(out.len() >= width * 3); - scalar::ya16_to_rgb_u16_row(packed, out, width); + scalar::ya16_to_rgb_u16_row::(packed, out, width); } /// SSE4.1 `ya16_to_rgba_u16_row`: native Y and A u16. @@ -1168,11 +1237,15 @@ pub(crate) unsafe fn ya16_to_rgb_u16_row(packed: &[u16], out: &mut [u16], width: /// SSE4.1 must be available. #[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn ya16_to_rgba_u16_row(packed: &[u16], out: &mut [u16], width: usize) { +pub(crate) unsafe fn ya16_to_rgba_u16_row( + packed: &[u16], + out: &mut [u16], + width: usize, +) { use crate::row::scalar::ya16 as scalar; debug_assert!(packed.len() >= width * 2); debug_assert!(out.len() >= width * 4); - scalar::ya16_to_rgba_u16_row(packed, out, width); + scalar::ya16_to_rgba_u16_row::(packed, out, width); } /// SSE4.1 `ya16_to_luma_row`: Y `>> 8` → u8. @@ -1181,7 +1254,11 @@ pub(crate) unsafe fn ya16_to_rgba_u16_row(packed: &[u16], out: &mut [u16], width /// SSE4.1 must be available. #[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn ya16_to_luma_row(packed: &[u16], out: &mut [u8], width: usize) { +pub(crate) unsafe fn ya16_to_luma_row( + packed: &[u16], + out: &mut [u8], + width: usize, +) { use crate::row::scalar::ya16 as scalar; debug_assert!(packed.len() >= width * 2); debug_assert!(out.len() >= width); @@ -1201,7 +1278,7 @@ pub(crate) unsafe fn ya16_to_luma_row(packed: &[u16], out: &mut [u8], width: usi } } if x < width { - scalar::ya16_to_luma_row(&packed[x * 2..width * 2], &mut out[x..width], width - x); + scalar::ya16_to_luma_row::(&packed[x * 2..width * 2], &mut out[x..width], width - x); } } @@ -1211,11 +1288,15 @@ pub(crate) unsafe fn ya16_to_luma_row(packed: &[u16], out: &mut [u8], width: usi /// SSE4.1 must be available. 
#[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn ya16_to_luma_u16_row(packed: &[u16], out: &mut [u16], width: usize) { +pub(crate) unsafe fn ya16_to_luma_u16_row( + packed: &[u16], + out: &mut [u16], + width: usize, +) { use crate::row::scalar::ya16 as scalar; debug_assert!(packed.len() >= width * 2); debug_assert!(out.len() >= width); - scalar::ya16_to_luma_u16_row(packed, out, width); + scalar::ya16_to_luma_u16_row::(packed, out, width); } /// SSE4.1 `ya16_to_hsv_row`: H=0, S=0, V = Y `>> 8`. α dropped. @@ -1224,7 +1305,7 @@ pub(crate) unsafe fn ya16_to_luma_u16_row(packed: &[u16], out: &mut [u16], width /// SSE4.1 must be available. #[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn ya16_to_hsv_row( +pub(crate) unsafe fn ya16_to_hsv_row( packed: &[u16], h_out: &mut [u8], s_out: &mut [u8], @@ -1252,7 +1333,7 @@ pub(crate) unsafe fn ya16_to_hsv_row( } } if x < width { - scalar::ya16_to_hsv_row( + scalar::ya16_to_hsv_row::( &packed[x * 2..width * 2], &mut h_out[x..width], &mut s_out[x..width], @@ -1331,8 +1412,8 @@ mod tests { prng16(&mut plane, 0xCAFE_BABE); let mut simd = std::vec![0u16; w]; let mut scal = std::vec![0u16; w]; - unsafe { super::gray_n_to_luma_u16_row::<10>(&plane, &mut simd, w) }; - scalar::gray_n_to_luma_u16_row::<10>(&plane, &mut scal, w); + unsafe { super::gray_n_to_luma_u16_row::<10, false>(&plane, &mut simd, w) }; + scalar::gray_n_to_luma_u16_row::<10, false>(&plane, &mut scal, w); assert_eq!(simd, scal, "width={w}"); } } @@ -1347,8 +1428,8 @@ mod tests { prng16(&mut plane, 0xDEAD_BEEF); let mut simd = std::vec![0u16; w]; let mut scal = std::vec![0u16; w]; - unsafe { super::gray16_to_luma_u16_row(&plane, &mut simd, w) }; - scalar::gray16_to_luma_u16_row(&plane, &mut scal, w); + unsafe { super::gray16_to_luma_u16_row::(&plane, &mut simd, w) }; + scalar::gray16_to_luma_u16_row::(&plane, &mut scal, w); assert_eq!(simd, scal, "width={w}"); } } @@ -1363,8 +1444,8 @@ mod tests { prng16(&mut plane, 0x1234_5678); let mut simd = std::vec![0u8; w]; let mut scal = std::vec![0u8; w]; - unsafe { super::gray16_to_luma_row(&plane, &mut simd, w) }; - scalar::gray16_to_luma_row(&plane, &mut scal, w); + unsafe { super::gray16_to_luma_row::(&plane, &mut simd, w) }; + scalar::gray16_to_luma_row::(&plane, &mut scal, w); assert_eq!(simd, scal, "width={w}"); } } @@ -1395,8 +1476,8 @@ mod tests { prng16(&mut plane, 0x1234_5678); let mut simd = std::vec![0u8; w * 3]; let mut scal = std::vec![0u8; w * 3]; - unsafe { super::gray16_to_rgb_row(&plane, &mut simd, w, false) }; - scalar::gray16_to_rgb_row(&plane, &mut scal, w, false); + unsafe { super::gray16_to_rgb_row::(&plane, &mut simd, w, false) }; + scalar::gray16_to_rgb_row::(&plane, &mut scal, w, false); assert_eq!(simd, scal, "width={w} limited-range"); } } @@ -1423,8 +1504,8 @@ mod tests { prng_f32(&mut plane, 0xF320_0001); let mut simd = std::vec![0u8; w * 3]; let mut scal = std::vec![0u8; w * 3]; - unsafe { super::grayf32_to_rgb_row(&plane, &mut simd, w) }; - sf::grayf32_to_rgb_row(&plane, &mut scal, w); + unsafe { super::grayf32_to_rgb_row::(&plane, &mut simd, w) }; + sf::grayf32_to_rgb_row::(&plane, &mut scal, w); assert_eq!(simd, scal, "width={w}"); } } @@ -1441,8 +1522,8 @@ mod tests { prng_f32(&mut plane, 0xF320_0002); let mut simd = std::vec![0u8; w * 4]; let mut scal = std::vec![0u8; w * 4]; - unsafe { super::grayf32_to_rgba_row(&plane, &mut simd, w) }; - sf::grayf32_to_rgba_row(&plane, &mut scal, w); + unsafe { super::grayf32_to_rgba_row::(&plane, &mut simd, w) }; + 
sf::grayf32_to_rgba_row::(&plane, &mut scal, w); assert_eq!(simd, scal, "width={w}"); } } @@ -1459,8 +1540,8 @@ mod tests { prng_f32(&mut plane, 0xF320_0003); let mut simd = std::vec![0u16; w * 3]; let mut scal = std::vec![0u16; w * 3]; - unsafe { super::grayf32_to_rgb_u16_row(&plane, &mut simd, w) }; - sf::grayf32_to_rgb_u16_row(&plane, &mut scal, w); + unsafe { super::grayf32_to_rgb_u16_row::(&plane, &mut simd, w) }; + sf::grayf32_to_rgb_u16_row::(&plane, &mut scal, w); assert_eq!(simd, scal, "width={w}"); } } @@ -1477,8 +1558,8 @@ mod tests { prng_f32(&mut plane, 0xF320_0004); let mut simd = std::vec![0u16; w * 4]; let mut scal = std::vec![0u16; w * 4]; - unsafe { super::grayf32_to_rgba_u16_row(&plane, &mut simd, w) }; - sf::grayf32_to_rgba_u16_row(&plane, &mut scal, w); + unsafe { super::grayf32_to_rgba_u16_row::(&plane, &mut simd, w) }; + sf::grayf32_to_rgba_u16_row::(&plane, &mut scal, w); assert_eq!(simd, scal, "width={w}"); } } @@ -1495,8 +1576,8 @@ mod tests { prng_f32(&mut plane, 0xF320_0005); let mut simd = std::vec![0.0f32; w * 3]; let mut scal = std::vec![0.0f32; w * 3]; - unsafe { super::grayf32_to_rgb_f32_row(&plane, &mut simd, w) }; - sf::grayf32_to_rgb_f32_row(&plane, &mut scal, w); + unsafe { super::grayf32_to_rgb_f32_row::(&plane, &mut simd, w) }; + sf::grayf32_to_rgb_f32_row::(&plane, &mut scal, w); assert_eq!(simd, scal, "width={w}"); } } @@ -1513,8 +1594,8 @@ mod tests { prng_f32(&mut plane, 0xF320_0006); let mut simd = std::vec![0u8; w]; let mut scal = std::vec![0u8; w]; - unsafe { super::grayf32_to_luma_row(&plane, &mut simd, w) }; - sf::grayf32_to_luma_row(&plane, &mut scal, w); + unsafe { super::grayf32_to_luma_row::(&plane, &mut simd, w) }; + sf::grayf32_to_luma_row::(&plane, &mut scal, w); assert_eq!(simd, scal, "width={w}"); } } @@ -1531,8 +1612,8 @@ mod tests { prng_f32(&mut plane, 0xF320_0007); let mut simd = std::vec![0u16; w]; let mut scal = std::vec![0u16; w]; - unsafe { super::grayf32_to_luma_u16_row(&plane, &mut simd, w) }; - sf::grayf32_to_luma_u16_row(&plane, &mut scal, w); + unsafe { super::grayf32_to_luma_u16_row::(&plane, &mut simd, w) }; + sf::grayf32_to_luma_u16_row::(&plane, &mut scal, w); assert_eq!(simd, scal, "width={w}"); } } @@ -1549,8 +1630,8 @@ mod tests { prng_f32(&mut plane, 0xF320_0008); let mut simd = std::vec![0.0f32; w]; let mut scal = std::vec![0.0f32; w]; - unsafe { super::grayf32_to_luma_f32_row(&plane, &mut simd, w) }; - sf::grayf32_to_luma_f32_row(&plane, &mut scal, w); + unsafe { super::grayf32_to_luma_f32_row::(&plane, &mut simd, w) }; + sf::grayf32_to_luma_f32_row::(&plane, &mut scal, w); assert_eq!(simd, scal, "width={w}"); } } @@ -1571,8 +1652,8 @@ mod tests { let mut rh = std::vec![0u8; w]; let mut rs = std::vec![0u8; w]; let mut rv = std::vec![0u8; w]; - unsafe { super::grayf32_to_hsv_row(&plane, &mut sh, &mut ss, &mut sv, w) }; - sf::grayf32_to_hsv_row(&plane, &mut rh, &mut rs, &mut rv, w); + unsafe { super::grayf32_to_hsv_row::(&plane, &mut sh, &mut ss, &mut sv, w) }; + sf::grayf32_to_hsv_row::(&plane, &mut rh, &mut rs, &mut rv, w); assert_eq!(sh, rh, "H width={w}"); assert_eq!(ss, rs, "S width={w}"); assert_eq!(sv, rv, "V width={w}"); @@ -1743,8 +1824,8 @@ mod tests { prng_ya16(&mut packed, 0xA160_0001); let mut simd = std::vec![0u8; w * 3]; let mut scal = std::vec![0u8; w * 3]; - unsafe { super::ya16_to_rgb_row(&packed, &mut simd, w) }; - sy::ya16_to_rgb_row(&packed, &mut scal, w); + unsafe { super::ya16_to_rgb_row::(&packed, &mut simd, w) }; + sy::ya16_to_rgb_row::(&packed, &mut scal, w); assert_eq!(simd, scal, 
"width={w}"); } } @@ -1761,8 +1842,8 @@ mod tests { prng_ya16(&mut packed, 0xA160_0002); let mut simd = std::vec![0u8; w * 4]; let mut scal = std::vec![0u8; w * 4]; - unsafe { super::ya16_to_rgba_row(&packed, &mut simd, w) }; - sy::ya16_to_rgba_row(&packed, &mut scal, w); + unsafe { super::ya16_to_rgba_row::(&packed, &mut simd, w) }; + sy::ya16_to_rgba_row::(&packed, &mut scal, w); assert_eq!(simd, scal, "width={w}"); } } @@ -1779,8 +1860,8 @@ mod tests { prng_ya16(&mut packed, 0xA160_0003); let mut simd = std::vec![0u16; w * 3]; let mut scal = std::vec![0u16; w * 3]; - unsafe { super::ya16_to_rgb_u16_row(&packed, &mut simd, w) }; - sy::ya16_to_rgb_u16_row(&packed, &mut scal, w); + unsafe { super::ya16_to_rgb_u16_row::(&packed, &mut simd, w) }; + sy::ya16_to_rgb_u16_row::(&packed, &mut scal, w); assert_eq!(simd, scal, "width={w}"); } } @@ -1797,8 +1878,8 @@ mod tests { prng_ya16(&mut packed, 0xA160_0004); let mut simd = std::vec![0u16; w * 4]; let mut scal = std::vec![0u16; w * 4]; - unsafe { super::ya16_to_rgba_u16_row(&packed, &mut simd, w) }; - sy::ya16_to_rgba_u16_row(&packed, &mut scal, w); + unsafe { super::ya16_to_rgba_u16_row::(&packed, &mut simd, w) }; + sy::ya16_to_rgba_u16_row::(&packed, &mut scal, w); assert_eq!(simd, scal, "width={w}"); } } @@ -1815,8 +1896,8 @@ mod tests { prng_ya16(&mut packed, 0xA160_0005); let mut simd = std::vec![0u8; w]; let mut scal = std::vec![0u8; w]; - unsafe { super::ya16_to_luma_row(&packed, &mut simd, w) }; - sy::ya16_to_luma_row(&packed, &mut scal, w); + unsafe { super::ya16_to_luma_row::(&packed, &mut simd, w) }; + sy::ya16_to_luma_row::(&packed, &mut scal, w); assert_eq!(simd, scal, "width={w}"); } } @@ -1833,8 +1914,8 @@ mod tests { prng_ya16(&mut packed, 0xA160_0006); let mut simd = std::vec![0u16; w]; let mut scal = std::vec![0u16; w]; - unsafe { super::ya16_to_luma_u16_row(&packed, &mut simd, w) }; - sy::ya16_to_luma_u16_row(&packed, &mut scal, w); + unsafe { super::ya16_to_luma_u16_row::(&packed, &mut simd, w) }; + sy::ya16_to_luma_u16_row::(&packed, &mut scal, w); assert_eq!(simd, scal, "width={w}"); } } @@ -1855,11 +1936,75 @@ mod tests { let mut rh = std::vec![0u8; w]; let mut rs = std::vec![0u8; w]; let mut rv = std::vec![0u8; w]; - unsafe { super::ya16_to_hsv_row(&packed, &mut sh, &mut ss, &mut sv, w) }; - sy::ya16_to_hsv_row(&packed, &mut rh, &mut rs, &mut rv, w); + unsafe { super::ya16_to_hsv_row::(&packed, &mut sh, &mut ss, &mut sv, w) }; + sy::ya16_to_hsv_row::(&packed, &mut rh, &mut rs, &mut rv, w); assert_eq!(sh, rh, "H width={w}"); assert_eq!(ss, rs, "S width={w}"); assert_eq!(sv, rv, "V width={w}"); } } + + // ---- BE parity tests -------------------------------------------------------- + + #[test] + fn sse41_gray10_be_parity_luma() { + if !is_x86_feature_detected!("sse4.1") { + return; + } + for &w in WIDTHS { + let mut le = std::vec![0u16; w]; + prng16(&mut le, 0xBE10_0001); + let be: std::vec::Vec = le.iter().map(|v| v.swap_bytes()).collect(); + let mut simd_be = std::vec![0u8; w]; + let mut scal_le = std::vec![0u8; w]; + unsafe { super::gray_n_to_luma_row::<10, true>(&be, &mut simd_be, w) }; + scalar::gray_n_to_luma_row::<10, false>(&le, &mut scal_le, w); + assert_eq!(simd_be, scal_le, "width={w}"); + } + } + + #[test] + fn sse41_gray16_be_parity_luma() { + if !is_x86_feature_detected!("sse4.1") { + return; + } + for &w in WIDTHS { + let mut le = std::vec![0u16; w]; + prng16(&mut le, 0xBE16_0002); + let be: std::vec::Vec = le.iter().map(|v| v.swap_bytes()).collect(); + let mut simd_be = std::vec![0u8; w]; + let mut scal_le 
= std::vec![0u8; w]; + unsafe { super::gray16_to_luma_row::(&be, &mut simd_be, w) }; + scalar::gray16_to_luma_row::(&le, &mut scal_le, w); + assert_eq!(simd_be, scal_le, "width={w}"); + } + } + + #[test] + fn sse41_grayf32_be_parity_luma() { + use crate::row::scalar::grayf32 as sf; + if !is_x86_feature_detected!("sse4.1") { + return; + } + fn prng_f32(out: &mut [f32], seed: u32) { + let mut s = seed; + for v in out.iter_mut() { + s = s.wrapping_mul(1664525).wrapping_add(1013904223); + *v = ((s >> 8) as f32) / (u32::MAX as f32) * 1.3 - 0.1; + } + } + for &w in WIDTHS { + let mut le = std::vec![0.0f32; w]; + prng_f32(&mut le, 0xBEF3_0003); + let be: std::vec::Vec = le + .iter() + .map(|v| f32::from_bits(v.to_bits().swap_bytes())) + .collect(); + let mut simd_be = std::vec![0u8; w]; + let mut scal_le = std::vec![0u8; w]; + unsafe { super::grayf32_to_luma_row::(&be, &mut simd_be, w) }; + sf::grayf32_to_luma_row::(&le, &mut scal_le, w); + assert_eq!(simd_be, scal_le, "width={w}"); + } + } } diff --git a/src/row/dispatch/gray.rs b/src/row/dispatch/gray.rs index a8215abb..b6eaf204 100644 --- a/src/row/dispatch/gray.rs +++ b/src/row/dispatch/gray.rs @@ -177,7 +177,7 @@ pub(crate) fn gray8_to_hsv_row( /// Dispatch `gray_n_to_rgb_row`. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn gray_n_to_rgb_row( +pub(crate) fn gray_n_to_rgb_row( y_plane: &[u16], out: &mut [u8], width: usize, @@ -188,43 +188,43 @@ pub(crate) fn gray_n_to_rgb_row( assert!(y_plane.len() >= width, "y_plane too short"); assert!(out.len() >= out_min, "out too short"); if !use_simd { - return scalar::gray_n_to_rgb_row::(y_plane, out, width, full_range); + return scalar::gray_n_to_rgb_row::(y_plane, out, width, full_range); } cfg_select! { target_arch = "aarch64" => { if neon_available() { - unsafe { arch::neon::gray_n_to_rgb_row::(y_plane, out, width, full_range); } + unsafe { arch::neon::gray_n_to_rgb_row::(y_plane, out, width, full_range); } return; } }, target_arch = "x86_64" => { if avx512_available() { - unsafe { arch::x86_avx512::gray_n_to_rgb_row::(y_plane, out, width, full_range); } + unsafe { arch::x86_avx512::gray_n_to_rgb_row::(y_plane, out, width, full_range); } return; } if avx2_available() { - unsafe { arch::x86_avx2::gray_n_to_rgb_row::(y_plane, out, width, full_range); } + unsafe { arch::x86_avx2::gray_n_to_rgb_row::(y_plane, out, width, full_range); } return; } if sse41_available() { - unsafe { arch::x86_sse41::gray_n_to_rgb_row::(y_plane, out, width, full_range); } + unsafe { arch::x86_sse41::gray_n_to_rgb_row::(y_plane, out, width, full_range); } return; } }, target_arch = "wasm32" => { if simd128_available() { - unsafe { arch::wasm_simd128::gray_n_to_rgb_row::(y_plane, out, width, full_range); } + unsafe { arch::wasm_simd128::gray_n_to_rgb_row::(y_plane, out, width, full_range); } return; } }, _ => {} } - scalar::gray_n_to_rgb_row::(y_plane, out, width, full_range); + scalar::gray_n_to_rgb_row::(y_plane, out, width, full_range); } /// Dispatch `gray_n_to_rgba_row`. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn gray_n_to_rgba_row( +pub(crate) fn gray_n_to_rgba_row( y_plane: &[u16], out: &mut [u8], width: usize, @@ -235,43 +235,43 @@ pub(crate) fn gray_n_to_rgba_row( assert!(y_plane.len() >= width, "y_plane too short"); assert!(out.len() >= out_min, "out too short"); if !use_simd { - return scalar::gray_n_to_rgba_row::(y_plane, out, width, full_range); + return scalar::gray_n_to_rgba_row::(y_plane, out, width, full_range); } cfg_select! 
{ target_arch = "aarch64" => { if neon_available() { - unsafe { arch::neon::gray_n_to_rgba_row::(y_plane, out, width, full_range); } + unsafe { arch::neon::gray_n_to_rgba_row::(y_plane, out, width, full_range); } return; } }, target_arch = "x86_64" => { if avx512_available() { - unsafe { arch::x86_avx512::gray_n_to_rgba_row::(y_plane, out, width, full_range); } + unsafe { arch::x86_avx512::gray_n_to_rgba_row::(y_plane, out, width, full_range); } return; } if avx2_available() { - unsafe { arch::x86_avx2::gray_n_to_rgba_row::(y_plane, out, width, full_range); } + unsafe { arch::x86_avx2::gray_n_to_rgba_row::(y_plane, out, width, full_range); } return; } if sse41_available() { - unsafe { arch::x86_sse41::gray_n_to_rgba_row::(y_plane, out, width, full_range); } + unsafe { arch::x86_sse41::gray_n_to_rgba_row::(y_plane, out, width, full_range); } return; } }, target_arch = "wasm32" => { if simd128_available() { - unsafe { arch::wasm_simd128::gray_n_to_rgba_row::(y_plane, out, width, full_range); } + unsafe { arch::wasm_simd128::gray_n_to_rgba_row::(y_plane, out, width, full_range); } return; } }, _ => {} } - scalar::gray_n_to_rgba_row::(y_plane, out, width, full_range); + scalar::gray_n_to_rgba_row::(y_plane, out, width, full_range); } /// Dispatch `gray_n_to_rgb_u16_row`. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn gray_n_to_rgb_u16_row( +pub(crate) fn gray_n_to_rgb_u16_row( y_plane: &[u16], out: &mut [u16], width: usize, @@ -282,45 +282,45 @@ pub(crate) fn gray_n_to_rgb_u16_row( assert!(y_plane.len() >= width, "y_plane too short"); assert!(out.len() >= out_min, "out too short"); if !use_simd { - return scalar::gray_n_to_rgb_u16_row::(y_plane, out, width, full_range); + return scalar::gray_n_to_rgb_u16_row::(y_plane, out, width, full_range); } cfg_select! { target_arch = "aarch64" => { if neon_available() { - unsafe { arch::neon::gray_n_to_rgb_u16_row::(y_plane, out, width, full_range); } + unsafe { arch::neon::gray_n_to_rgb_u16_row::(y_plane, out, width, full_range); } return; } }, target_arch = "x86_64" => { if avx512_available() { - unsafe { arch::x86_avx512::gray_n_to_rgb_u16_row::(y_plane, out, width, full_range); } + unsafe { arch::x86_avx512::gray_n_to_rgb_u16_row::(y_plane, out, width, full_range); } return; } if avx2_available() { - unsafe { arch::x86_avx2::gray_n_to_rgb_u16_row::(y_plane, out, width, full_range); } + unsafe { arch::x86_avx2::gray_n_to_rgb_u16_row::(y_plane, out, width, full_range); } return; } if sse41_available() { - unsafe { arch::x86_sse41::gray_n_to_rgb_u16_row::(y_plane, out, width, full_range); } + unsafe { arch::x86_sse41::gray_n_to_rgb_u16_row::(y_plane, out, width, full_range); } return; } }, target_arch = "wasm32" => { if simd128_available() { unsafe { - arch::wasm_simd128::gray_n_to_rgb_u16_row::(y_plane, out, width, full_range); + arch::wasm_simd128::gray_n_to_rgb_u16_row::(y_plane, out, width, full_range); } return; } }, _ => {} } - scalar::gray_n_to_rgb_u16_row::(y_plane, out, width, full_range); + scalar::gray_n_to_rgb_u16_row::(y_plane, out, width, full_range); } /// Dispatch `gray_n_to_rgba_u16_row`. 
#[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn gray_n_to_rgba_u16_row( +pub(crate) fn gray_n_to_rgba_u16_row( y_plane: &[u16], out: &mut [u16], width: usize, @@ -331,47 +331,47 @@ pub(crate) fn gray_n_to_rgba_u16_row( assert!(y_plane.len() >= width, "y_plane too short"); assert!(out.len() >= out_min, "out too short"); if !use_simd { - return scalar::gray_n_to_rgba_u16_row::(y_plane, out, width, full_range); + return scalar::gray_n_to_rgba_u16_row::(y_plane, out, width, full_range); } cfg_select! { target_arch = "aarch64" => { if neon_available() { - unsafe { arch::neon::gray_n_to_rgba_u16_row::(y_plane, out, width, full_range); } + unsafe { arch::neon::gray_n_to_rgba_u16_row::(y_plane, out, width, full_range); } return; } }, target_arch = "x86_64" => { if avx512_available() { unsafe { - arch::x86_avx512::gray_n_to_rgba_u16_row::(y_plane, out, width, full_range); + arch::x86_avx512::gray_n_to_rgba_u16_row::(y_plane, out, width, full_range); } return; } if avx2_available() { - unsafe { arch::x86_avx2::gray_n_to_rgba_u16_row::(y_plane, out, width, full_range); } + unsafe { arch::x86_avx2::gray_n_to_rgba_u16_row::(y_plane, out, width, full_range); } return; } if sse41_available() { - unsafe { arch::x86_sse41::gray_n_to_rgba_u16_row::(y_plane, out, width, full_range); } + unsafe { arch::x86_sse41::gray_n_to_rgba_u16_row::(y_plane, out, width, full_range); } return; } }, target_arch = "wasm32" => { if simd128_available() { unsafe { - arch::wasm_simd128::gray_n_to_rgba_u16_row::(y_plane, out, width, full_range); + arch::wasm_simd128::gray_n_to_rgba_u16_row::(y_plane, out, width, full_range); } return; } }, _ => {} } - scalar::gray_n_to_rgba_u16_row::(y_plane, out, width, full_range); + scalar::gray_n_to_rgba_u16_row::(y_plane, out, width, full_range); } /// Dispatch `gray_n_to_luma_row`. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn gray_n_to_luma_row( +pub(crate) fn gray_n_to_luma_row( y_plane: &[u16], out: &mut [u8], width: usize, @@ -380,43 +380,43 @@ pub(crate) fn gray_n_to_luma_row( assert!(y_plane.len() >= width, "y_plane too short"); assert!(out.len() >= width, "out too short"); if !use_simd { - return scalar::gray_n_to_luma_row::(y_plane, out, width); + return scalar::gray_n_to_luma_row::(y_plane, out, width); } cfg_select! { target_arch = "aarch64" => { if neon_available() { - unsafe { arch::neon::gray_n_to_luma_row::(y_plane, out, width); } + unsafe { arch::neon::gray_n_to_luma_row::(y_plane, out, width); } return; } }, target_arch = "x86_64" => { if avx512_available() { - unsafe { arch::x86_avx512::gray_n_to_luma_row::(y_plane, out, width); } + unsafe { arch::x86_avx512::gray_n_to_luma_row::(y_plane, out, width); } return; } if avx2_available() { - unsafe { arch::x86_avx2::gray_n_to_luma_row::(y_plane, out, width); } + unsafe { arch::x86_avx2::gray_n_to_luma_row::(y_plane, out, width); } return; } if sse41_available() { - unsafe { arch::x86_sse41::gray_n_to_luma_row::(y_plane, out, width); } + unsafe { arch::x86_sse41::gray_n_to_luma_row::(y_plane, out, width); } return; } }, target_arch = "wasm32" => { if simd128_available() { - unsafe { arch::wasm_simd128::gray_n_to_luma_row::(y_plane, out, width); } + unsafe { arch::wasm_simd128::gray_n_to_luma_row::(y_plane, out, width); } return; } }, _ => {} } - scalar::gray_n_to_luma_row::(y_plane, out, width); + scalar::gray_n_to_luma_row::(y_plane, out, width); } /// Dispatch `gray_n_to_luma_u16_row`. 
#[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn gray_n_to_luma_u16_row( +pub(crate) fn gray_n_to_luma_u16_row( y_plane: &[u16], out: &mut [u16], width: usize, @@ -425,43 +425,43 @@ pub(crate) fn gray_n_to_luma_u16_row( assert!(y_plane.len() >= width, "y_plane too short"); assert!(out.len() >= width, "out too short"); if !use_simd { - return scalar::gray_n_to_luma_u16_row::(y_plane, out, width); + return scalar::gray_n_to_luma_u16_row::(y_plane, out, width); } cfg_select! { target_arch = "aarch64" => { if neon_available() { - unsafe { arch::neon::gray_n_to_luma_u16_row::(y_plane, out, width); } + unsafe { arch::neon::gray_n_to_luma_u16_row::(y_plane, out, width); } return; } }, target_arch = "x86_64" => { if avx512_available() { - unsafe { arch::x86_avx512::gray_n_to_luma_u16_row::(y_plane, out, width); } + unsafe { arch::x86_avx512::gray_n_to_luma_u16_row::(y_plane, out, width); } return; } if avx2_available() { - unsafe { arch::x86_avx2::gray_n_to_luma_u16_row::(y_plane, out, width); } + unsafe { arch::x86_avx2::gray_n_to_luma_u16_row::(y_plane, out, width); } return; } if sse41_available() { - unsafe { arch::x86_sse41::gray_n_to_luma_u16_row::(y_plane, out, width); } + unsafe { arch::x86_sse41::gray_n_to_luma_u16_row::(y_plane, out, width); } return; } }, target_arch = "wasm32" => { if simd128_available() { - unsafe { arch::wasm_simd128::gray_n_to_luma_u16_row::(y_plane, out, width); } + unsafe { arch::wasm_simd128::gray_n_to_luma_u16_row::(y_plane, out, width); } return; } }, _ => {} } - scalar::gray_n_to_luma_u16_row::(y_plane, out, width); + scalar::gray_n_to_luma_u16_row::(y_plane, out, width); } /// Dispatch `gray_n_to_hsv_row`. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn gray_n_to_hsv_row( +pub(crate) fn gray_n_to_hsv_row( y_plane: &[u16], h_out: &mut [u8], s_out: &mut [u8], @@ -475,13 +475,13 @@ pub(crate) fn gray_n_to_hsv_row( assert!(s_out.len() >= width, "S out too short"); assert!(v_out.len() >= width, "V out too short"); if !use_simd { - return scalar::gray_n_to_hsv_row::(y_plane, h_out, s_out, v_out, width, full_range); + return scalar::gray_n_to_hsv_row::(y_plane, h_out, s_out, v_out, width, full_range); } cfg_select! 
{ target_arch = "aarch64" => { if neon_available() { unsafe { - arch::neon::gray_n_to_hsv_row::(y_plane, h_out, s_out, v_out, width, full_range); + arch::neon::gray_n_to_hsv_row::(y_plane, h_out, s_out, v_out, width, full_range); } return; } @@ -489,7 +489,7 @@ pub(crate) fn gray_n_to_hsv_row( target_arch = "x86_64" => { if avx512_available() { unsafe { - arch::x86_avx512::gray_n_to_hsv_row::( + arch::x86_avx512::gray_n_to_hsv_row::( y_plane, h_out, s_out, v_out, width, full_range, ); } @@ -497,7 +497,7 @@ pub(crate) fn gray_n_to_hsv_row( } if avx2_available() { unsafe { - arch::x86_avx2::gray_n_to_hsv_row::( + arch::x86_avx2::gray_n_to_hsv_row::( y_plane, h_out, s_out, v_out, width, full_range, ); } @@ -505,7 +505,7 @@ pub(crate) fn gray_n_to_hsv_row( } if sse41_available() { unsafe { - arch::x86_sse41::gray_n_to_hsv_row::( + arch::x86_sse41::gray_n_to_hsv_row::( y_plane, h_out, s_out, v_out, width, full_range, ); } @@ -515,7 +515,7 @@ pub(crate) fn gray_n_to_hsv_row( target_arch = "wasm32" => { if simd128_available() { unsafe { - arch::wasm_simd128::gray_n_to_hsv_row::( + arch::wasm_simd128::gray_n_to_hsv_row::( y_plane, h_out, s_out, v_out, width, full_range, ); } @@ -524,14 +524,14 @@ pub(crate) fn gray_n_to_hsv_row( }, _ => {} } - scalar::gray_n_to_hsv_row::(y_plane, h_out, s_out, v_out, width, full_range); + scalar::gray_n_to_hsv_row::(y_plane, h_out, s_out, v_out, width, full_range); } // ---- Gray16 ---------------------------------------------------------------- /// Dispatch `gray16_to_rgb_row`. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn gray16_to_rgb_row( +pub(crate) fn gray16_to_rgb_row( y_plane: &[u16], out: &mut [u8], width: usize, @@ -542,43 +542,43 @@ pub(crate) fn gray16_to_rgb_row( assert!(y_plane.len() >= width, "y_plane too short"); assert!(out.len() >= out_min, "out too short"); if !use_simd { - return scalar::gray16_to_rgb_row(y_plane, out, width, full_range); + return scalar::gray16_to_rgb_row::(y_plane, out, width, full_range); } cfg_select! { target_arch = "aarch64" => { if neon_available() { - unsafe { arch::neon::gray16_to_rgb_row(y_plane, out, width, full_range); } + unsafe { arch::neon::gray16_to_rgb_row::(y_plane, out, width, full_range); } return; } }, target_arch = "x86_64" => { if avx512_available() { - unsafe { arch::x86_avx512::gray16_to_rgb_row(y_plane, out, width, full_range); } + unsafe { arch::x86_avx512::gray16_to_rgb_row::(y_plane, out, width, full_range); } return; } if avx2_available() { - unsafe { arch::x86_avx2::gray16_to_rgb_row(y_plane, out, width, full_range); } + unsafe { arch::x86_avx2::gray16_to_rgb_row::(y_plane, out, width, full_range); } return; } if sse41_available() { - unsafe { arch::x86_sse41::gray16_to_rgb_row(y_plane, out, width, full_range); } + unsafe { arch::x86_sse41::gray16_to_rgb_row::(y_plane, out, width, full_range); } return; } }, target_arch = "wasm32" => { if simd128_available() { - unsafe { arch::wasm_simd128::gray16_to_rgb_row(y_plane, out, width, full_range); } + unsafe { arch::wasm_simd128::gray16_to_rgb_row::(y_plane, out, width, full_range); } return; } }, _ => {} } - scalar::gray16_to_rgb_row(y_plane, out, width, full_range); + scalar::gray16_to_rgb_row::(y_plane, out, width, full_range); } /// Dispatch `gray16_to_rgba_row`. 
#[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn gray16_to_rgba_row( +pub(crate) fn gray16_to_rgba_row( y_plane: &[u16], out: &mut [u8], width: usize, @@ -589,43 +589,43 @@ pub(crate) fn gray16_to_rgba_row( assert!(y_plane.len() >= width, "y_plane too short"); assert!(out.len() >= out_min, "out too short"); if !use_simd { - return scalar::gray16_to_rgba_row(y_plane, out, width, full_range); + return scalar::gray16_to_rgba_row::(y_plane, out, width, full_range); } cfg_select! { target_arch = "aarch64" => { if neon_available() { - unsafe { arch::neon::gray16_to_rgba_row(y_plane, out, width, full_range); } + unsafe { arch::neon::gray16_to_rgba_row::(y_plane, out, width, full_range); } return; } }, target_arch = "x86_64" => { if avx512_available() { - unsafe { arch::x86_avx512::gray16_to_rgba_row(y_plane, out, width, full_range); } + unsafe { arch::x86_avx512::gray16_to_rgba_row::(y_plane, out, width, full_range); } return; } if avx2_available() { - unsafe { arch::x86_avx2::gray16_to_rgba_row(y_plane, out, width, full_range); } + unsafe { arch::x86_avx2::gray16_to_rgba_row::(y_plane, out, width, full_range); } return; } if sse41_available() { - unsafe { arch::x86_sse41::gray16_to_rgba_row(y_plane, out, width, full_range); } + unsafe { arch::x86_sse41::gray16_to_rgba_row::(y_plane, out, width, full_range); } return; } }, target_arch = "wasm32" => { if simd128_available() { - unsafe { arch::wasm_simd128::gray16_to_rgba_row(y_plane, out, width, full_range); } + unsafe { arch::wasm_simd128::gray16_to_rgba_row::(y_plane, out, width, full_range); } return; } }, _ => {} } - scalar::gray16_to_rgba_row(y_plane, out, width, full_range); + scalar::gray16_to_rgba_row::(y_plane, out, width, full_range); } /// Dispatch `gray16_to_rgb_u16_row`. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn gray16_to_rgb_u16_row( +pub(crate) fn gray16_to_rgb_u16_row( y_plane: &[u16], out: &mut [u16], width: usize, @@ -636,43 +636,43 @@ pub(crate) fn gray16_to_rgb_u16_row( assert!(y_plane.len() >= width, "y_plane too short"); assert!(out.len() >= out_min, "out too short"); if !use_simd { - return scalar::gray16_to_rgb_u16_row(y_plane, out, width, full_range); + return scalar::gray16_to_rgb_u16_row::(y_plane, out, width, full_range); } cfg_select! 
{ target_arch = "aarch64" => { if neon_available() { - unsafe { arch::neon::gray16_to_rgb_u16_row(y_plane, out, width, full_range); } + unsafe { arch::neon::gray16_to_rgb_u16_row::(y_plane, out, width, full_range); } return; } }, target_arch = "x86_64" => { if avx512_available() { - unsafe { arch::x86_avx512::gray16_to_rgb_u16_row(y_plane, out, width, full_range); } + unsafe { arch::x86_avx512::gray16_to_rgb_u16_row::(y_plane, out, width, full_range); } return; } if avx2_available() { - unsafe { arch::x86_avx2::gray16_to_rgb_u16_row(y_plane, out, width, full_range); } + unsafe { arch::x86_avx2::gray16_to_rgb_u16_row::(y_plane, out, width, full_range); } return; } if sse41_available() { - unsafe { arch::x86_sse41::gray16_to_rgb_u16_row(y_plane, out, width, full_range); } + unsafe { arch::x86_sse41::gray16_to_rgb_u16_row::(y_plane, out, width, full_range); } return; } }, target_arch = "wasm32" => { if simd128_available() { - unsafe { arch::wasm_simd128::gray16_to_rgb_u16_row(y_plane, out, width, full_range); } + unsafe { arch::wasm_simd128::gray16_to_rgb_u16_row::(y_plane, out, width, full_range); } return; } }, _ => {} } - scalar::gray16_to_rgb_u16_row(y_plane, out, width, full_range); + scalar::gray16_to_rgb_u16_row::(y_plane, out, width, full_range); } /// Dispatch `gray16_to_rgba_u16_row`. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn gray16_to_rgba_u16_row( +pub(crate) fn gray16_to_rgba_u16_row( y_plane: &[u16], out: &mut [u16], width: usize, @@ -683,83 +683,88 @@ pub(crate) fn gray16_to_rgba_u16_row( assert!(y_plane.len() >= width, "y_plane too short"); assert!(out.len() >= out_min, "out too short"); if !use_simd { - return scalar::gray16_to_rgba_u16_row(y_plane, out, width, full_range); + return scalar::gray16_to_rgba_u16_row::(y_plane, out, width, full_range); } cfg_select! { target_arch = "aarch64" => { if neon_available() { - unsafe { arch::neon::gray16_to_rgba_u16_row(y_plane, out, width, full_range); } + unsafe { arch::neon::gray16_to_rgba_u16_row::(y_plane, out, width, full_range); } return; } }, target_arch = "x86_64" => { if avx512_available() { - unsafe { arch::x86_avx512::gray16_to_rgba_u16_row(y_plane, out, width, full_range); } + unsafe { arch::x86_avx512::gray16_to_rgba_u16_row::(y_plane, out, width, full_range); } return; } if avx2_available() { - unsafe { arch::x86_avx2::gray16_to_rgba_u16_row(y_plane, out, width, full_range); } + unsafe { arch::x86_avx2::gray16_to_rgba_u16_row::(y_plane, out, width, full_range); } return; } if sse41_available() { - unsafe { arch::x86_sse41::gray16_to_rgba_u16_row(y_plane, out, width, full_range); } + unsafe { arch::x86_sse41::gray16_to_rgba_u16_row::(y_plane, out, width, full_range); } return; } }, target_arch = "wasm32" => { if simd128_available() { - unsafe { arch::wasm_simd128::gray16_to_rgba_u16_row(y_plane, out, width, full_range); } + unsafe { arch::wasm_simd128::gray16_to_rgba_u16_row::(y_plane, out, width, full_range); } return; } }, _ => {} } - scalar::gray16_to_rgba_u16_row(y_plane, out, width, full_range); + scalar::gray16_to_rgba_u16_row::(y_plane, out, width, full_range); } /// Dispatch `gray16_to_luma_row`. 
#[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn gray16_to_luma_row(y_plane: &[u16], out: &mut [u8], width: usize, use_simd: bool) { +pub(crate) fn gray16_to_luma_row( + y_plane: &[u16], + out: &mut [u8], + width: usize, + use_simd: bool, +) { assert!(y_plane.len() >= width, "y_plane too short"); assert!(out.len() >= width, "out too short"); if !use_simd { - return scalar::gray16_to_luma_row(y_plane, out, width); + return scalar::gray16_to_luma_row::(y_plane, out, width); } cfg_select! { target_arch = "aarch64" => { if neon_available() { - unsafe { arch::neon::gray16_to_luma_row(y_plane, out, width); } + unsafe { arch::neon::gray16_to_luma_row::(y_plane, out, width); } return; } }, target_arch = "x86_64" => { if avx512_available() { - unsafe { arch::x86_avx512::gray16_to_luma_row(y_plane, out, width); } + unsafe { arch::x86_avx512::gray16_to_luma_row::(y_plane, out, width); } return; } if avx2_available() { - unsafe { arch::x86_avx2::gray16_to_luma_row(y_plane, out, width); } + unsafe { arch::x86_avx2::gray16_to_luma_row::(y_plane, out, width); } return; } if sse41_available() { - unsafe { arch::x86_sse41::gray16_to_luma_row(y_plane, out, width); } + unsafe { arch::x86_sse41::gray16_to_luma_row::(y_plane, out, width); } return; } }, target_arch = "wasm32" => { if simd128_available() { - unsafe { arch::wasm_simd128::gray16_to_luma_row(y_plane, out, width); } + unsafe { arch::wasm_simd128::gray16_to_luma_row::(y_plane, out, width); } return; } }, _ => {} } - scalar::gray16_to_luma_row(y_plane, out, width); + scalar::gray16_to_luma_row::(y_plane, out, width); } /// Dispatch `gray16_to_luma_u16_row`. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn gray16_to_luma_u16_row( +pub(crate) fn gray16_to_luma_u16_row( y_plane: &[u16], out: &mut [u16], width: usize, @@ -768,43 +773,43 @@ pub(crate) fn gray16_to_luma_u16_row( assert!(y_plane.len() >= width, "y_plane too short"); assert!(out.len() >= width, "out too short"); if !use_simd { - return scalar::gray16_to_luma_u16_row(y_plane, out, width); + return scalar::gray16_to_luma_u16_row::(y_plane, out, width); } cfg_select! { target_arch = "aarch64" => { if neon_available() { - unsafe { arch::neon::gray16_to_luma_u16_row(y_plane, out, width); } + unsafe { arch::neon::gray16_to_luma_u16_row::(y_plane, out, width); } return; } }, target_arch = "x86_64" => { if avx512_available() { - unsafe { arch::x86_avx512::gray16_to_luma_u16_row(y_plane, out, width); } + unsafe { arch::x86_avx512::gray16_to_luma_u16_row::(y_plane, out, width); } return; } if avx2_available() { - unsafe { arch::x86_avx2::gray16_to_luma_u16_row(y_plane, out, width); } + unsafe { arch::x86_avx2::gray16_to_luma_u16_row::(y_plane, out, width); } return; } if sse41_available() { - unsafe { arch::x86_sse41::gray16_to_luma_u16_row(y_plane, out, width); } + unsafe { arch::x86_sse41::gray16_to_luma_u16_row::(y_plane, out, width); } return; } }, target_arch = "wasm32" => { if simd128_available() { - unsafe { arch::wasm_simd128::gray16_to_luma_u16_row(y_plane, out, width); } + unsafe { arch::wasm_simd128::gray16_to_luma_u16_row::(y_plane, out, width); } return; } }, _ => {} } - scalar::gray16_to_luma_u16_row(y_plane, out, width); + scalar::gray16_to_luma_u16_row::(y_plane, out, width); } /// Dispatch `gray16_to_hsv_row`. 
#[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn gray16_to_hsv_row( +pub(crate) fn gray16_to_hsv_row( y_plane: &[u16], h_out: &mut [u8], s_out: &mut [u8], @@ -818,31 +823,31 @@ pub(crate) fn gray16_to_hsv_row( assert!(s_out.len() >= width, "S out too short"); assert!(v_out.len() >= width, "V out too short"); if !use_simd { - return scalar::gray16_to_hsv_row(y_plane, h_out, s_out, v_out, width, full_range); + return scalar::gray16_to_hsv_row::(y_plane, h_out, s_out, v_out, width, full_range); } cfg_select! { target_arch = "aarch64" => { if neon_available() { - unsafe { arch::neon::gray16_to_hsv_row(y_plane, h_out, s_out, v_out, width, full_range); } + unsafe { arch::neon::gray16_to_hsv_row::(y_plane, h_out, s_out, v_out, width, full_range); } return; } }, target_arch = "x86_64" => { if avx512_available() { unsafe { - arch::x86_avx512::gray16_to_hsv_row(y_plane, h_out, s_out, v_out, width, full_range); + arch::x86_avx512::gray16_to_hsv_row::(y_plane, h_out, s_out, v_out, width, full_range); } return; } if avx2_available() { unsafe { - arch::x86_avx2::gray16_to_hsv_row(y_plane, h_out, s_out, v_out, width, full_range); + arch::x86_avx2::gray16_to_hsv_row::(y_plane, h_out, s_out, v_out, width, full_range); } return; } if sse41_available() { unsafe { - arch::x86_sse41::gray16_to_hsv_row(y_plane, h_out, s_out, v_out, width, full_range); + arch::x86_sse41::gray16_to_hsv_row::(y_plane, h_out, s_out, v_out, width, full_range); } return; } @@ -850,12 +855,12 @@ pub(crate) fn gray16_to_hsv_row( target_arch = "wasm32" => { if simd128_available() { unsafe { - arch::wasm_simd128::gray16_to_hsv_row(y_plane, h_out, s_out, v_out, width, full_range); + arch::wasm_simd128::gray16_to_hsv_row::(y_plane, h_out, s_out, v_out, width, full_range); } return; } }, _ => {} } - scalar::gray16_to_hsv_row(y_plane, h_out, s_out, v_out, width, full_range); + scalar::gray16_to_hsv_row::(y_plane, h_out, s_out, v_out, width, full_range); } diff --git a/src/row/dispatch/grayf32.rs b/src/row/dispatch/grayf32.rs index 6169c03e..02321edd 100644 --- a/src/row/dispatch/grayf32.rs +++ b/src/row/dispatch/grayf32.rs @@ -25,133 +25,148 @@ use crate::row::{ /// Dispatch `grayf32_to_rgb_row`. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn grayf32_to_rgb_row(plane: &[f32], out: &mut [u8], width: usize, use_simd: bool) { +pub(crate) fn grayf32_to_rgb_row( + plane: &[f32], + out: &mut [u8], + width: usize, + use_simd: bool, +) { assert!(plane.len() >= width, "plane too short"); assert!(out.len() >= rgb_row_bytes(width), "out too short"); if !use_simd { - return scalar::grayf32_to_rgb_row(plane, out, width); + return scalar::grayf32_to_rgb_row::(plane, out, width); } cfg_select! 
{ target_arch = "aarch64" => { if neon_available() { - unsafe { arch::neon::grayf32_to_rgb_row(plane, out, width); } + unsafe { arch::neon::grayf32_to_rgb_row::(plane, out, width); } return; } }, target_arch = "x86_64" => { if avx512_available() { - unsafe { arch::x86_avx512::grayf32_to_rgb_row(plane, out, width); } + unsafe { arch::x86_avx512::grayf32_to_rgb_row::(plane, out, width); } return; } if avx2_available() { - unsafe { arch::x86_avx2::grayf32_to_rgb_row(plane, out, width); } + unsafe { arch::x86_avx2::grayf32_to_rgb_row::(plane, out, width); } return; } if sse41_available() { - unsafe { arch::x86_sse41::grayf32_to_rgb_row(plane, out, width); } + unsafe { arch::x86_sse41::grayf32_to_rgb_row::(plane, out, width); } return; } }, target_arch = "wasm32" => { if simd128_available() { - unsafe { arch::wasm_simd128::grayf32_to_rgb_row(plane, out, width); } + unsafe { arch::wasm_simd128::grayf32_to_rgb_row::(plane, out, width); } return; } }, _ => {} } - scalar::grayf32_to_rgb_row(plane, out, width); + scalar::grayf32_to_rgb_row::(plane, out, width); } // ---- grayf32_to_rgba_row ------------------------------------------------------ /// Dispatch `grayf32_to_rgba_row`. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn grayf32_to_rgba_row(plane: &[f32], out: &mut [u8], width: usize, use_simd: bool) { +pub(crate) fn grayf32_to_rgba_row( + plane: &[f32], + out: &mut [u8], + width: usize, + use_simd: bool, +) { assert!(plane.len() >= width, "plane too short"); assert!(out.len() >= rgba_row_bytes(width), "out too short"); if !use_simd { - return scalar::grayf32_to_rgba_row(plane, out, width); + return scalar::grayf32_to_rgba_row::(plane, out, width); } cfg_select! { target_arch = "aarch64" => { if neon_available() { - unsafe { arch::neon::grayf32_to_rgba_row(plane, out, width); } + unsafe { arch::neon::grayf32_to_rgba_row::(plane, out, width); } return; } }, target_arch = "x86_64" => { if avx512_available() { - unsafe { arch::x86_avx512::grayf32_to_rgba_row(plane, out, width); } + unsafe { arch::x86_avx512::grayf32_to_rgba_row::(plane, out, width); } return; } if avx2_available() { - unsafe { arch::x86_avx2::grayf32_to_rgba_row(plane, out, width); } + unsafe { arch::x86_avx2::grayf32_to_rgba_row::(plane, out, width); } return; } if sse41_available() { - unsafe { arch::x86_sse41::grayf32_to_rgba_row(plane, out, width); } + unsafe { arch::x86_sse41::grayf32_to_rgba_row::(plane, out, width); } return; } }, target_arch = "wasm32" => { if simd128_available() { - unsafe { arch::wasm_simd128::grayf32_to_rgba_row(plane, out, width); } + unsafe { arch::wasm_simd128::grayf32_to_rgba_row::(plane, out, width); } return; } }, _ => {} } - scalar::grayf32_to_rgba_row(plane, out, width); + scalar::grayf32_to_rgba_row::(plane, out, width); } // ---- grayf32_to_rgb_u16_row --------------------------------------------------- /// Dispatch `grayf32_to_rgb_u16_row`. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn grayf32_to_rgb_u16_row(plane: &[f32], out: &mut [u16], width: usize, use_simd: bool) { +pub(crate) fn grayf32_to_rgb_u16_row( + plane: &[f32], + out: &mut [u16], + width: usize, + use_simd: bool, +) { assert!(plane.len() >= width, "plane too short"); assert!(out.len() >= rgb_row_elems(width), "out too short"); if !use_simd { - return scalar::grayf32_to_rgb_u16_row(plane, out, width); + return scalar::grayf32_to_rgb_u16_row::(plane, out, width); } cfg_select! 
{ target_arch = "aarch64" => { if neon_available() { - unsafe { arch::neon::grayf32_to_rgb_u16_row(plane, out, width); } + unsafe { arch::neon::grayf32_to_rgb_u16_row::(plane, out, width); } return; } }, target_arch = "x86_64" => { if avx512_available() { - unsafe { arch::x86_avx512::grayf32_to_rgb_u16_row(plane, out, width); } + unsafe { arch::x86_avx512::grayf32_to_rgb_u16_row::(plane, out, width); } return; } if avx2_available() { - unsafe { arch::x86_avx2::grayf32_to_rgb_u16_row(plane, out, width); } + unsafe { arch::x86_avx2::grayf32_to_rgb_u16_row::(plane, out, width); } return; } if sse41_available() { - unsafe { arch::x86_sse41::grayf32_to_rgb_u16_row(plane, out, width); } + unsafe { arch::x86_sse41::grayf32_to_rgb_u16_row::(plane, out, width); } return; } }, target_arch = "wasm32" => { if simd128_available() { - unsafe { arch::wasm_simd128::grayf32_to_rgb_u16_row(plane, out, width); } + unsafe { arch::wasm_simd128::grayf32_to_rgb_u16_row::(plane, out, width); } return; } }, _ => {} } - scalar::grayf32_to_rgb_u16_row(plane, out, width); + scalar::grayf32_to_rgb_u16_row::(plane, out, width); } // ---- grayf32_to_rgba_u16_row -------------------------------------------------- /// Dispatch `grayf32_to_rgba_u16_row`. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn grayf32_to_rgba_u16_row( +pub(crate) fn grayf32_to_rgba_u16_row( plane: &[f32], out: &mut [u16], width: usize, @@ -160,45 +175,45 @@ pub(crate) fn grayf32_to_rgba_u16_row( assert!(plane.len() >= width, "plane too short"); assert!(out.len() >= rgba_row_elems(width), "out too short"); if !use_simd { - return scalar::grayf32_to_rgba_u16_row(plane, out, width); + return scalar::grayf32_to_rgba_u16_row::(plane, out, width); } cfg_select! { target_arch = "aarch64" => { if neon_available() { - unsafe { arch::neon::grayf32_to_rgba_u16_row(plane, out, width); } + unsafe { arch::neon::grayf32_to_rgba_u16_row::(plane, out, width); } return; } }, target_arch = "x86_64" => { if avx512_available() { - unsafe { arch::x86_avx512::grayf32_to_rgba_u16_row(plane, out, width); } + unsafe { arch::x86_avx512::grayf32_to_rgba_u16_row::(plane, out, width); } return; } if avx2_available() { - unsafe { arch::x86_avx2::grayf32_to_rgba_u16_row(plane, out, width); } + unsafe { arch::x86_avx2::grayf32_to_rgba_u16_row::(plane, out, width); } return; } if sse41_available() { - unsafe { arch::x86_sse41::grayf32_to_rgba_u16_row(plane, out, width); } + unsafe { arch::x86_sse41::grayf32_to_rgba_u16_row::(plane, out, width); } return; } }, target_arch = "wasm32" => { if simd128_available() { - unsafe { arch::wasm_simd128::grayf32_to_rgba_u16_row(plane, out, width); } + unsafe { arch::wasm_simd128::grayf32_to_rgba_u16_row::(plane, out, width); } return; } }, _ => {} } - scalar::grayf32_to_rgba_u16_row(plane, out, width); + scalar::grayf32_to_rgba_u16_row::(plane, out, width); } // ---- grayf32_to_rgb_f32_row --------------------------------------------------- /// Dispatch `grayf32_to_rgb_f32_row` (lossless replicate, all backends delegate to scalar). 
#[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn grayf32_to_rgb_f32_row( +pub(crate) fn grayf32_to_rgb_f32_row( plane: &[f32], out: &mut [f32], width: usize, @@ -206,56 +221,61 @@ pub(crate) fn grayf32_to_rgb_f32_row( ) { assert!(plane.len() >= width, "plane too short"); assert!(out.len() >= rgb_row_elems(width), "out too short"); - scalar::grayf32_to_rgb_f32_row(plane, out, width); + scalar::grayf32_to_rgb_f32_row::(plane, out, width); } // ---- grayf32_to_luma_row ------------------------------------------------------ /// Dispatch `grayf32_to_luma_row`. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn grayf32_to_luma_row(plane: &[f32], out: &mut [u8], width: usize, use_simd: bool) { +pub(crate) fn grayf32_to_luma_row( + plane: &[f32], + out: &mut [u8], + width: usize, + use_simd: bool, +) { assert!(plane.len() >= width, "plane too short"); assert!(out.len() >= width, "out too short"); if !use_simd { - return scalar::grayf32_to_luma_row(plane, out, width); + return scalar::grayf32_to_luma_row::(plane, out, width); } cfg_select! { target_arch = "aarch64" => { if neon_available() { - unsafe { arch::neon::grayf32_to_luma_row(plane, out, width); } + unsafe { arch::neon::grayf32_to_luma_row::(plane, out, width); } return; } }, target_arch = "x86_64" => { if avx512_available() { - unsafe { arch::x86_avx512::grayf32_to_luma_row(plane, out, width); } + unsafe { arch::x86_avx512::grayf32_to_luma_row::(plane, out, width); } return; } if avx2_available() { - unsafe { arch::x86_avx2::grayf32_to_luma_row(plane, out, width); } + unsafe { arch::x86_avx2::grayf32_to_luma_row::(plane, out, width); } return; } if sse41_available() { - unsafe { arch::x86_sse41::grayf32_to_luma_row(plane, out, width); } + unsafe { arch::x86_sse41::grayf32_to_luma_row::(plane, out, width); } return; } }, target_arch = "wasm32" => { if simd128_available() { - unsafe { arch::wasm_simd128::grayf32_to_luma_row(plane, out, width); } + unsafe { arch::wasm_simd128::grayf32_to_luma_row::(plane, out, width); } return; } }, _ => {} } - scalar::grayf32_to_luma_row(plane, out, width); + scalar::grayf32_to_luma_row::(plane, out, width); } // ---- grayf32_to_luma_u16_row -------------------------------------------------- /// Dispatch `grayf32_to_luma_u16_row`. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn grayf32_to_luma_u16_row( +pub(crate) fn grayf32_to_luma_u16_row( plane: &[f32], out: &mut [u16], width: usize, @@ -264,45 +284,45 @@ pub(crate) fn grayf32_to_luma_u16_row( assert!(plane.len() >= width, "plane too short"); assert!(out.len() >= width, "out too short"); if !use_simd { - return scalar::grayf32_to_luma_u16_row(plane, out, width); + return scalar::grayf32_to_luma_u16_row::(plane, out, width); } cfg_select! 
{ target_arch = "aarch64" => { if neon_available() { - unsafe { arch::neon::grayf32_to_luma_u16_row(plane, out, width); } + unsafe { arch::neon::grayf32_to_luma_u16_row::(plane, out, width); } return; } }, target_arch = "x86_64" => { if avx512_available() { - unsafe { arch::x86_avx512::grayf32_to_luma_u16_row(plane, out, width); } + unsafe { arch::x86_avx512::grayf32_to_luma_u16_row::(plane, out, width); } return; } if avx2_available() { - unsafe { arch::x86_avx2::grayf32_to_luma_u16_row(plane, out, width); } + unsafe { arch::x86_avx2::grayf32_to_luma_u16_row::(plane, out, width); } return; } if sse41_available() { - unsafe { arch::x86_sse41::grayf32_to_luma_u16_row(plane, out, width); } + unsafe { arch::x86_sse41::grayf32_to_luma_u16_row::(plane, out, width); } return; } }, target_arch = "wasm32" => { if simd128_available() { - unsafe { arch::wasm_simd128::grayf32_to_luma_u16_row(plane, out, width); } + unsafe { arch::wasm_simd128::grayf32_to_luma_u16_row::(plane, out, width); } return; } }, _ => {} } - scalar::grayf32_to_luma_u16_row(plane, out, width); + scalar::grayf32_to_luma_u16_row::(plane, out, width); } // ---- grayf32_to_luma_f32_row -------------------------------------------------- /// Dispatch `grayf32_to_luma_f32_row` (lossless memcpy, no SIMD needed). #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn grayf32_to_luma_f32_row( +pub(crate) fn grayf32_to_luma_f32_row( plane: &[f32], out: &mut [f32], width: usize, @@ -310,14 +330,14 @@ pub(crate) fn grayf32_to_luma_f32_row( ) { assert!(plane.len() >= width, "plane too short"); assert!(out.len() >= width, "out too short"); - scalar::grayf32_to_luma_f32_row(plane, out, width); + scalar::grayf32_to_luma_f32_row::(plane, out, width); } // ---- grayf32_to_hsv_row ------------------------------------------------------- /// Dispatch `grayf32_to_hsv_row`. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn grayf32_to_hsv_row( +pub(crate) fn grayf32_to_hsv_row( plane: &[f32], h_out: &mut [u8], s_out: &mut [u8], @@ -330,36 +350,36 @@ pub(crate) fn grayf32_to_hsv_row( assert!(s_out.len() >= width, "S out too short"); assert!(v_out.len() >= width, "V out too short"); if !use_simd { - return scalar::grayf32_to_hsv_row(plane, h_out, s_out, v_out, width); + return scalar::grayf32_to_hsv_row::(plane, h_out, s_out, v_out, width); } cfg_select! 
{ target_arch = "aarch64" => { if neon_available() { - unsafe { arch::neon::grayf32_to_hsv_row(plane, h_out, s_out, v_out, width); } + unsafe { arch::neon::grayf32_to_hsv_row::(plane, h_out, s_out, v_out, width); } return; } }, target_arch = "x86_64" => { if avx512_available() { - unsafe { arch::x86_avx512::grayf32_to_hsv_row(plane, h_out, s_out, v_out, width); } + unsafe { arch::x86_avx512::grayf32_to_hsv_row::(plane, h_out, s_out, v_out, width); } return; } if avx2_available() { - unsafe { arch::x86_avx2::grayf32_to_hsv_row(plane, h_out, s_out, v_out, width); } + unsafe { arch::x86_avx2::grayf32_to_hsv_row::(plane, h_out, s_out, v_out, width); } return; } if sse41_available() { - unsafe { arch::x86_sse41::grayf32_to_hsv_row(plane, h_out, s_out, v_out, width); } + unsafe { arch::x86_sse41::grayf32_to_hsv_row::(plane, h_out, s_out, v_out, width); } return; } }, target_arch = "wasm32" => { if simd128_available() { - unsafe { arch::wasm_simd128::grayf32_to_hsv_row(plane, h_out, s_out, v_out, width); } + unsafe { arch::wasm_simd128::grayf32_to_hsv_row::(plane, h_out, s_out, v_out, width); } return; } }, _ => {} } - scalar::grayf32_to_hsv_row(plane, h_out, s_out, v_out, width); + scalar::grayf32_to_hsv_row::(plane, h_out, s_out, v_out, width); } diff --git a/src/row/dispatch/ya16.rs b/src/row/dispatch/ya16.rs index 38dab9c0..41e0c486 100644 --- a/src/row/dispatch/ya16.rs +++ b/src/row/dispatch/ya16.rs @@ -27,259 +27,289 @@ use crate::row::{ /// Dispatch `ya16_to_rgb_row`. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn ya16_to_rgb_row(packed: &[u16], out: &mut [u8], width: usize, use_simd: bool) { +pub(crate) fn ya16_to_rgb_row( + packed: &[u16], + out: &mut [u8], + width: usize, + use_simd: bool, +) { assert!(packed.len() >= ya_row_elems(width), "packed too short"); assert!(out.len() >= rgb_row_bytes(width), "out too short"); if !use_simd { - return scalar::ya16_to_rgb_row(packed, out, width); + return scalar::ya16_to_rgb_row::(packed, out, width); } cfg_select! { target_arch = "aarch64" => { if neon_available() { - unsafe { arch::neon::ya16_to_rgb_row(packed, out, width); } + unsafe { arch::neon::ya16_to_rgb_row::(packed, out, width); } return; } }, target_arch = "x86_64" => { if avx512_available() { - unsafe { arch::x86_avx512::ya16_to_rgb_row(packed, out, width); } + unsafe { arch::x86_avx512::ya16_to_rgb_row::(packed, out, width); } return; } if avx2_available() { - unsafe { arch::x86_avx2::ya16_to_rgb_row(packed, out, width); } + unsafe { arch::x86_avx2::ya16_to_rgb_row::(packed, out, width); } return; } if sse41_available() { - unsafe { arch::x86_sse41::ya16_to_rgb_row(packed, out, width); } + unsafe { arch::x86_sse41::ya16_to_rgb_row::(packed, out, width); } return; } }, target_arch = "wasm32" => { if simd128_available() { - unsafe { arch::wasm_simd128::ya16_to_rgb_row(packed, out, width); } + unsafe { arch::wasm_simd128::ya16_to_rgb_row::(packed, out, width); } return; } }, _ => {} } - scalar::ya16_to_rgb_row(packed, out, width); + scalar::ya16_to_rgb_row::(packed, out, width); } // ---- ya16_to_rgba_row --------------------------------------------------------- /// Dispatch `ya16_to_rgba_row`. 
#[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn ya16_to_rgba_row(packed: &[u16], out: &mut [u8], width: usize, use_simd: bool) { +pub(crate) fn ya16_to_rgba_row( + packed: &[u16], + out: &mut [u8], + width: usize, + use_simd: bool, +) { assert!(packed.len() >= ya_row_elems(width), "packed too short"); assert!(out.len() >= rgba_row_bytes(width), "out too short"); if !use_simd { - return scalar::ya16_to_rgba_row(packed, out, width); + return scalar::ya16_to_rgba_row::(packed, out, width); } cfg_select! { target_arch = "aarch64" => { if neon_available() { - unsafe { arch::neon::ya16_to_rgba_row(packed, out, width); } + unsafe { arch::neon::ya16_to_rgba_row::(packed, out, width); } return; } }, target_arch = "x86_64" => { if avx512_available() { - unsafe { arch::x86_avx512::ya16_to_rgba_row(packed, out, width); } + unsafe { arch::x86_avx512::ya16_to_rgba_row::(packed, out, width); } return; } if avx2_available() { - unsafe { arch::x86_avx2::ya16_to_rgba_row(packed, out, width); } + unsafe { arch::x86_avx2::ya16_to_rgba_row::(packed, out, width); } return; } if sse41_available() { - unsafe { arch::x86_sse41::ya16_to_rgba_row(packed, out, width); } + unsafe { arch::x86_sse41::ya16_to_rgba_row::(packed, out, width); } return; } }, target_arch = "wasm32" => { if simd128_available() { - unsafe { arch::wasm_simd128::ya16_to_rgba_row(packed, out, width); } + unsafe { arch::wasm_simd128::ya16_to_rgba_row::(packed, out, width); } return; } }, _ => {} } - scalar::ya16_to_rgba_row(packed, out, width); + scalar::ya16_to_rgba_row::(packed, out, width); } // ---- ya16_to_rgb_u16_row ------------------------------------------------------ /// Dispatch `ya16_to_rgb_u16_row`. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn ya16_to_rgb_u16_row(packed: &[u16], out: &mut [u16], width: usize, use_simd: bool) { +pub(crate) fn ya16_to_rgb_u16_row( + packed: &[u16], + out: &mut [u16], + width: usize, + use_simd: bool, +) { assert!(packed.len() >= ya_row_elems(width), "packed too short"); assert!(out.len() >= rgb_row_elems(width), "out too short"); if !use_simd { - return scalar::ya16_to_rgb_u16_row(packed, out, width); + return scalar::ya16_to_rgb_u16_row::(packed, out, width); } cfg_select! { target_arch = "aarch64" => { if neon_available() { - unsafe { arch::neon::ya16_to_rgb_u16_row(packed, out, width); } + unsafe { arch::neon::ya16_to_rgb_u16_row::(packed, out, width); } return; } }, target_arch = "x86_64" => { if avx512_available() { - unsafe { arch::x86_avx512::ya16_to_rgb_u16_row(packed, out, width); } + unsafe { arch::x86_avx512::ya16_to_rgb_u16_row::(packed, out, width); } return; } if avx2_available() { - unsafe { arch::x86_avx2::ya16_to_rgb_u16_row(packed, out, width); } + unsafe { arch::x86_avx2::ya16_to_rgb_u16_row::(packed, out, width); } return; } if sse41_available() { - unsafe { arch::x86_sse41::ya16_to_rgb_u16_row(packed, out, width); } + unsafe { arch::x86_sse41::ya16_to_rgb_u16_row::(packed, out, width); } return; } }, target_arch = "wasm32" => { if simd128_available() { - unsafe { arch::wasm_simd128::ya16_to_rgb_u16_row(packed, out, width); } + unsafe { arch::wasm_simd128::ya16_to_rgb_u16_row::(packed, out, width); } return; } }, _ => {} } - scalar::ya16_to_rgb_u16_row(packed, out, width); + scalar::ya16_to_rgb_u16_row::(packed, out, width); } // ---- ya16_to_rgba_u16_row ----------------------------------------------------- /// Dispatch `ya16_to_rgba_u16_row`. 
#[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn ya16_to_rgba_u16_row(packed: &[u16], out: &mut [u16], width: usize, use_simd: bool) { +pub(crate) fn ya16_to_rgba_u16_row( + packed: &[u16], + out: &mut [u16], + width: usize, + use_simd: bool, +) { assert!(packed.len() >= ya_row_elems(width), "packed too short"); assert!(out.len() >= rgba_row_elems(width), "out too short"); if !use_simd { - return scalar::ya16_to_rgba_u16_row(packed, out, width); + return scalar::ya16_to_rgba_u16_row::(packed, out, width); } cfg_select! { target_arch = "aarch64" => { if neon_available() { - unsafe { arch::neon::ya16_to_rgba_u16_row(packed, out, width); } + unsafe { arch::neon::ya16_to_rgba_u16_row::(packed, out, width); } return; } }, target_arch = "x86_64" => { if avx512_available() { - unsafe { arch::x86_avx512::ya16_to_rgba_u16_row(packed, out, width); } + unsafe { arch::x86_avx512::ya16_to_rgba_u16_row::(packed, out, width); } return; } if avx2_available() { - unsafe { arch::x86_avx2::ya16_to_rgba_u16_row(packed, out, width); } + unsafe { arch::x86_avx2::ya16_to_rgba_u16_row::(packed, out, width); } return; } if sse41_available() { - unsafe { arch::x86_sse41::ya16_to_rgba_u16_row(packed, out, width); } + unsafe { arch::x86_sse41::ya16_to_rgba_u16_row::(packed, out, width); } return; } }, target_arch = "wasm32" => { if simd128_available() { - unsafe { arch::wasm_simd128::ya16_to_rgba_u16_row(packed, out, width); } + unsafe { arch::wasm_simd128::ya16_to_rgba_u16_row::(packed, out, width); } return; } }, _ => {} } - scalar::ya16_to_rgba_u16_row(packed, out, width); + scalar::ya16_to_rgba_u16_row::(packed, out, width); } // ---- ya16_to_luma_row --------------------------------------------------------- /// Dispatch `ya16_to_luma_row`. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn ya16_to_luma_row(packed: &[u16], out: &mut [u8], width: usize, use_simd: bool) { +pub(crate) fn ya16_to_luma_row( + packed: &[u16], + out: &mut [u8], + width: usize, + use_simd: bool, +) { assert!(packed.len() >= ya_row_elems(width), "packed too short"); assert!(out.len() >= width, "out too short"); if !use_simd { - return scalar::ya16_to_luma_row(packed, out, width); + return scalar::ya16_to_luma_row::(packed, out, width); } cfg_select! { target_arch = "aarch64" => { if neon_available() { - unsafe { arch::neon::ya16_to_luma_row(packed, out, width); } + unsafe { arch::neon::ya16_to_luma_row::(packed, out, width); } return; } }, target_arch = "x86_64" => { if avx512_available() { - unsafe { arch::x86_avx512::ya16_to_luma_row(packed, out, width); } + unsafe { arch::x86_avx512::ya16_to_luma_row::(packed, out, width); } return; } if avx2_available() { - unsafe { arch::x86_avx2::ya16_to_luma_row(packed, out, width); } + unsafe { arch::x86_avx2::ya16_to_luma_row::(packed, out, width); } return; } if sse41_available() { - unsafe { arch::x86_sse41::ya16_to_luma_row(packed, out, width); } + unsafe { arch::x86_sse41::ya16_to_luma_row::(packed, out, width); } return; } }, target_arch = "wasm32" => { if simd128_available() { - unsafe { arch::wasm_simd128::ya16_to_luma_row(packed, out, width); } + unsafe { arch::wasm_simd128::ya16_to_luma_row::(packed, out, width); } return; } }, _ => {} } - scalar::ya16_to_luma_row(packed, out, width); + scalar::ya16_to_luma_row::(packed, out, width); } // ---- ya16_to_luma_u16_row ----------------------------------------------------- /// Dispatch `ya16_to_luma_u16_row`. 
#[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn ya16_to_luma_u16_row(packed: &[u16], out: &mut [u16], width: usize, use_simd: bool) { +pub(crate) fn ya16_to_luma_u16_row( + packed: &[u16], + out: &mut [u16], + width: usize, + use_simd: bool, +) { assert!(packed.len() >= ya_row_elems(width), "packed too short"); assert!(out.len() >= width, "out too short"); if !use_simd { - return scalar::ya16_to_luma_u16_row(packed, out, width); + return scalar::ya16_to_luma_u16_row::(packed, out, width); } cfg_select! { target_arch = "aarch64" => { if neon_available() { - unsafe { arch::neon::ya16_to_luma_u16_row(packed, out, width); } + unsafe { arch::neon::ya16_to_luma_u16_row::(packed, out, width); } return; } }, target_arch = "x86_64" => { if avx512_available() { - unsafe { arch::x86_avx512::ya16_to_luma_u16_row(packed, out, width); } + unsafe { arch::x86_avx512::ya16_to_luma_u16_row::(packed, out, width); } return; } if avx2_available() { - unsafe { arch::x86_avx2::ya16_to_luma_u16_row(packed, out, width); } + unsafe { arch::x86_avx2::ya16_to_luma_u16_row::(packed, out, width); } return; } if sse41_available() { - unsafe { arch::x86_sse41::ya16_to_luma_u16_row(packed, out, width); } + unsafe { arch::x86_sse41::ya16_to_luma_u16_row::(packed, out, width); } return; } }, target_arch = "wasm32" => { if simd128_available() { - unsafe { arch::wasm_simd128::ya16_to_luma_u16_row(packed, out, width); } + unsafe { arch::wasm_simd128::ya16_to_luma_u16_row::(packed, out, width); } return; } }, _ => {} } - scalar::ya16_to_luma_u16_row(packed, out, width); + scalar::ya16_to_luma_u16_row::(packed, out, width); } // ---- ya16_to_hsv_row ---------------------------------------------------------- /// Dispatch `ya16_to_hsv_row`. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn ya16_to_hsv_row( +pub(crate) fn ya16_to_hsv_row( packed: &[u16], h_out: &mut [u8], s_out: &mut [u8], @@ -292,36 +322,36 @@ pub(crate) fn ya16_to_hsv_row( assert!(s_out.len() >= width, "S out too short"); assert!(v_out.len() >= width, "V out too short"); if !use_simd { - return scalar::ya16_to_hsv_row(packed, h_out, s_out, v_out, width); + return scalar::ya16_to_hsv_row::(packed, h_out, s_out, v_out, width); } cfg_select! 
{ target_arch = "aarch64" => { if neon_available() { - unsafe { arch::neon::ya16_to_hsv_row(packed, h_out, s_out, v_out, width); } + unsafe { arch::neon::ya16_to_hsv_row::(packed, h_out, s_out, v_out, width); } return; } }, target_arch = "x86_64" => { if avx512_available() { - unsafe { arch::x86_avx512::ya16_to_hsv_row(packed, h_out, s_out, v_out, width); } + unsafe { arch::x86_avx512::ya16_to_hsv_row::(packed, h_out, s_out, v_out, width); } return; } if avx2_available() { - unsafe { arch::x86_avx2::ya16_to_hsv_row(packed, h_out, s_out, v_out, width); } + unsafe { arch::x86_avx2::ya16_to_hsv_row::(packed, h_out, s_out, v_out, width); } return; } if sse41_available() { - unsafe { arch::x86_sse41::ya16_to_hsv_row(packed, h_out, s_out, v_out, width); } + unsafe { arch::x86_sse41::ya16_to_hsv_row::(packed, h_out, s_out, v_out, width); } return; } }, target_arch = "wasm32" => { if simd128_available() { - unsafe { arch::wasm_simd128::ya16_to_hsv_row(packed, h_out, s_out, v_out, width); } + unsafe { arch::wasm_simd128::ya16_to_hsv_row::(packed, h_out, s_out, v_out, width); } return; } }, _ => {} } - scalar::ya16_to_hsv_row(packed, h_out, s_out, v_out, width); + scalar::ya16_to_hsv_row::(packed, h_out, s_out, v_out, width); } diff --git a/src/row/mod.rs b/src/row/mod.rs index 297f1c3c..920856a4 100644 --- a/src/row/mod.rs +++ b/src/row/mod.rs @@ -710,7 +710,7 @@ mod overflow_tests { fn gray_n_to_rgb_dispatcher_rejects_width_times_3_overflow() { let y: [u16; 0] = []; let mut rgb: [u8; 0] = []; - gray_n_to_rgb_row::<10>(&y, &mut rgb, OVERFLOW_WIDTH, false, true); + gray_n_to_rgb_row::<10, false>(&y, &mut rgb, OVERFLOW_WIDTH, false, true); } #[cfg(target_pointer_width = "32")] @@ -719,7 +719,7 @@ mod overflow_tests { fn gray_n_to_rgba_dispatcher_rejects_width_times_4_overflow() { let y: [u16; 0] = []; let mut rgba: [u8; 0] = []; - gray_n_to_rgba_row::<10>(&y, &mut rgba, OVERFLOW_WIDTH, false, true); + gray_n_to_rgba_row::<10, false>(&y, &mut rgba, OVERFLOW_WIDTH, false, true); } #[cfg(target_pointer_width = "32")] @@ -728,7 +728,7 @@ mod overflow_tests { fn gray_n_to_rgb_u16_dispatcher_rejects_width_times_3_overflow() { let y: [u16; 0] = []; let mut rgb: [u16; 0] = []; - gray_n_to_rgb_u16_row::<10>(&y, &mut rgb, OVERFLOW_WIDTH, false, true); + gray_n_to_rgb_u16_row::<10, false>(&y, &mut rgb, OVERFLOW_WIDTH, false, true); } #[cfg(target_pointer_width = "32")] @@ -737,7 +737,7 @@ mod overflow_tests { fn gray_n_to_rgba_u16_dispatcher_rejects_width_times_4_overflow() { let y: [u16; 0] = []; let mut rgba: [u16; 0] = []; - gray_n_to_rgba_u16_row::<10>(&y, &mut rgba, OVERFLOW_WIDTH, false, true); + gray_n_to_rgba_u16_row::<10, false>(&y, &mut rgba, OVERFLOW_WIDTH, false, true); } #[cfg(target_pointer_width = "32")] @@ -746,7 +746,7 @@ mod overflow_tests { fn gray16_to_rgb_dispatcher_rejects_width_times_3_overflow() { let y: [u16; 0] = []; let mut rgb: [u8; 0] = []; - gray16_to_rgb_row(&y, &mut rgb, OVERFLOW_WIDTH, false, true); + gray16_to_rgb_row::(&y, &mut rgb, OVERFLOW_WIDTH, false, true); } #[cfg(target_pointer_width = "32")] @@ -755,7 +755,7 @@ mod overflow_tests { fn gray16_to_rgba_dispatcher_rejects_width_times_4_overflow() { let y: [u16; 0] = []; let mut rgba: [u8; 0] = []; - gray16_to_rgba_row(&y, &mut rgba, OVERFLOW_WIDTH, false, true); + gray16_to_rgba_row::(&y, &mut rgba, OVERFLOW_WIDTH, false, true); } #[cfg(target_pointer_width = "32")] @@ -764,7 +764,7 @@ mod overflow_tests { fn gray16_to_rgb_u16_dispatcher_rejects_width_times_3_overflow() { let y: [u16; 0] = []; let mut rgb: [u16; 0] = []; - 
gray16_to_rgb_u16_row(&y, &mut rgb, OVERFLOW_WIDTH, false, true); + gray16_to_rgb_u16_row::(&y, &mut rgb, OVERFLOW_WIDTH, false, true); } #[cfg(target_pointer_width = "32")] @@ -773,7 +773,7 @@ mod overflow_tests { fn gray16_to_rgba_u16_dispatcher_rejects_width_times_4_overflow() { let y: [u16; 0] = []; let mut rgba: [u16; 0] = []; - gray16_to_rgba_u16_row(&y, &mut rgba, OVERFLOW_WIDTH, false, true); + gray16_to_rgba_u16_row::(&y, &mut rgba, OVERFLOW_WIDTH, false, true); } #[cfg(target_pointer_width = "32")] diff --git a/src/row/scalar/gray.rs b/src/row/scalar/gray.rs index 8211f084..8f591e30 100644 --- a/src/row/scalar/gray.rs +++ b/src/row/scalar/gray.rs @@ -213,10 +213,11 @@ pub(crate) fn gray8_to_hsv_row( /// GrayN → packed RGB u8. Masks to BITS bits, downshifts `BITS - 8` to u8, /// broadcasts. /// +/// When `BE = true`, each u16 sample is byte-swapped before processing. /// When `full_range = false`, limited-range Y is rescaled to [0, 255] /// before broadcast. Luma outputs always pass Y through without rescaling. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn gray_n_to_rgb_row( +pub(crate) fn gray_n_to_rgb_row( y_plane: &[u16], out: &mut [u8], width: usize, @@ -227,6 +228,11 @@ pub(crate) fn gray_n_to_rgb_row( let mask = bits_mask::(); let shift = BITS - 8; for (x, &raw) in y_plane[..width].iter().enumerate() { + let raw = if BE { + u16::from_be(raw) + } else { + u16::from_le(raw) + }; let masked = raw & mask; let y8 = if full_range { (masked >> shift) as u8 @@ -240,9 +246,10 @@ pub(crate) fn gray_n_to_rgb_row( /// GrayN → packed RGBA u8. Masks to BITS bits, downshifts to u8, broadcasts, /// α = 0xFF. /// +/// When `BE = true`, each u16 sample is byte-swapped before processing. /// When `full_range = false`, limited-range Y is rescaled to [0, 255]. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn gray_n_to_rgba_row( +pub(crate) fn gray_n_to_rgba_row( y_plane: &[u16], out: &mut [u8], width: usize, @@ -253,6 +260,11 @@ pub(crate) fn gray_n_to_rgba_row( let mask = bits_mask::(); let shift = BITS - 8; for (x, &raw) in y_plane[..width].iter().enumerate() { + let raw = if BE { + u16::from_be(raw) + } else { + u16::from_le(raw) + }; let masked = raw & mask; let y8 = if full_range { (masked >> shift) as u8 @@ -265,10 +277,11 @@ pub(crate) fn gray_n_to_rgba_row( /// GrayN → packed u16 RGB. Masks to BITS bits, broadcasts at native depth. /// +/// When `BE = true`, each u16 sample is byte-swapped before processing. /// When `full_range = false`, limited-range Y is rescaled to full native range /// [0, (1<( +pub(crate) fn gray_n_to_rgb_u16_row( y_plane: &[u16], out: &mut [u16], width: usize, @@ -278,6 +291,11 @@ pub(crate) fn gray_n_to_rgb_u16_row( debug_assert!(out.len() >= width * 3, "out too short"); let mask = bits_mask::(); for (x, &raw) in y_plane[..width].iter().enumerate() { + let raw = if BE { + u16::from_be(raw) + } else { + u16::from_le(raw) + }; let masked = raw & mask; let y_out = if full_range { masked @@ -290,9 +308,10 @@ pub(crate) fn gray_n_to_rgb_u16_row( /// GrayN → packed u16 RGBA. Masks to BITS bits, broadcasts, α = `(1 << BITS) - 1`. /// +/// When `BE = true`, each u16 sample is byte-swapped before processing. /// When `full_range = false`, limited-range Y is rescaled to full native range. 
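Both the mask applied to every sample and the full-range alpha used by the u16 RGBA path come from `bits_mask::<BITS>()`. Its presumed shape is the low-BITS ones mask; the sketch below is consistent with the `(1 << BITS) - 1` note above and with the 10-bit masking test further down (0xFFFF & 0x03FF == 1023), but the crate's helper may be written differently.

// Presumed shape of bits_mask::<BITS>() (illustrative, not the crate's code).
const fn bits_mask_sketch<const BITS: u32>() -> u16 {
    ((1u32 << BITS) - 1) as u16
}

fn main() {
    assert_eq!(bits_mask_sketch::<10>(), 0x03FF); // strips the upper padding bits of a 10-bit sample
    assert_eq!(0xFFFFu16 & bits_mask_sketch::<10>(), 1023); // the masking test below checks exactly this
    assert_eq!(bits_mask_sketch::<12>(), 0x0FFF); // also the full-range alpha for the 12-bit RGBA u16 path
}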
#[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn gray_n_to_rgba_u16_row( +pub(crate) fn gray_n_to_rgba_u16_row( y_plane: &[u16], out: &mut [u16], width: usize, @@ -303,6 +322,11 @@ pub(crate) fn gray_n_to_rgba_u16_row( let mask = bits_mask::(); let alpha = mask; // full-range max for BITS for (x, &raw) in y_plane[..width].iter().enumerate() { + let raw = if BE { + u16::from_be(raw) + } else { + u16::from_le(raw) + }; let masked = raw & mask; let y_out = if full_range { masked @@ -315,25 +339,36 @@ pub(crate) fn gray_n_to_rgba_u16_row( /// GrayN → luma u8. Masks to BITS bits, downshifts `BITS - 8`. /// +/// When `BE = true`, each u16 sample is byte-swapped before processing. /// Always passes raw Y through without `full_range` rescaling — /// the caller is explicitly requesting the source luma plane as-is. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn gray_n_to_luma_row(y_plane: &[u16], out: &mut [u8], width: usize) { +pub(crate) fn gray_n_to_luma_row( + y_plane: &[u16], + out: &mut [u8], + width: usize, +) { debug_assert!(y_plane.len() >= width, "y_plane too short"); debug_assert!(out.len() >= width, "out too short"); let mask = bits_mask::(); let shift = BITS - 8; for (out_byte, &raw) in out[..width].iter_mut().zip(y_plane[..width].iter()) { + let raw = if BE { + u16::from_be(raw) + } else { + u16::from_le(raw) + }; *out_byte = ((raw & mask) >> shift) as u8; } } /// GrayN → luma u16. Masks to BITS bits, identity copy. /// +/// When `BE = true`, each u16 sample is byte-swapped before processing. /// Always passes raw Y through without `full_range` rescaling — /// the caller is explicitly requesting the source luma plane as-is. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn gray_n_to_luma_u16_row( +pub(crate) fn gray_n_to_luma_u16_row( y_plane: &[u16], out: &mut [u16], width: usize, @@ -342,16 +377,22 @@ pub(crate) fn gray_n_to_luma_u16_row( debug_assert!(out.len() >= width, "out too short"); let mask = bits_mask::(); for (out_el, &raw) in out[..width].iter_mut().zip(y_plane[..width].iter()) { + let raw = if BE { + u16::from_be(raw) + } else { + u16::from_le(raw) + }; *out_el = raw & mask; } } /// GrayN → HSV u8. Masks to BITS bits, downshifts to u8, H=0 S=0 V=Y8. /// +/// When `BE = true`, each u16 sample is byte-swapped before processing. /// When `full_range = false`, the V channel uses the rescaled luma value. /// See [`gray8_to_hsv_row`] for the S=0 convention. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn gray_n_to_hsv_row( +pub(crate) fn gray_n_to_hsv_row( y_plane: &[u16], h_out: &mut [u8], s_out: &mut [u8], @@ -366,6 +407,11 @@ pub(crate) fn gray_n_to_hsv_row( let mask = bits_mask::(); let shift = BITS - 8; for (x, &raw) in y_plane[..width].iter().enumerate() { + let raw = if BE { + u16::from_be(raw) + } else { + u16::from_le(raw) + }; let masked = raw & mask; h_out[x] = 0; s_out[x] = 0; @@ -381,13 +427,24 @@ pub(crate) fn gray_n_to_hsv_row( /// Gray16 → packed RGB u8. Downshifts `>> 8` to u8, broadcasts. /// +/// When `BE = true`, each u16 sample is byte-swapped before processing. /// When `full_range = false`, limited-range Y (black=4096, white=56064+4096) /// is rescaled to [0, 255] before broadcast. 
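Every kernel in this file now opens with the same two-arm normalization, shown standalone below. `normalize_u16` is an illustrative name; `u16::from_le` is the identity on little-endian hosts and a byte swap on big-endian ones, with `u16::from_be` the mirror image, so the kernel body always sees a host-order sample.

fn normalize_u16<const BE: bool>(raw: u16) -> u16 {
    if BE { u16::from_be(raw) } else { u16::from_le(raw) }
}

fn main() {
    let sample = 512u16;               // a 10-bit mid-grey sample
    let swapped = sample.swap_bytes(); // the same sample with its two bytes exchanged
    // Regardless of the host's byte order, the BE view of the swapped sample equals
    // the LE view of the original; this is the invariant the parity tests below lean on.
    assert_eq!(normalize_u16::<true>(swapped), normalize_u16::<false>(sample));
}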
#[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn gray16_to_rgb_row(y_plane: &[u16], out: &mut [u8], width: usize, full_range: bool) { +pub(crate) fn gray16_to_rgb_row( + y_plane: &[u16], + out: &mut [u8], + width: usize, + full_range: bool, +) { debug_assert!(y_plane.len() >= width, "y_plane too short"); debug_assert!(out.len() >= width * 3, "out too short"); for (x, &raw) in y_plane[..width].iter().enumerate() { + let raw = if BE { + u16::from_be(raw) + } else { + u16::from_le(raw) + }; let y8 = if full_range { (raw >> 8) as u8 } else { @@ -399,12 +456,23 @@ pub(crate) fn gray16_to_rgb_row(y_plane: &[u16], out: &mut [u8], width: usize, f /// Gray16 → packed RGBA u8. Downshifts `>> 8`, broadcasts, α = 0xFF. /// +/// When `BE = true`, each u16 sample is byte-swapped before processing. /// When `full_range = false`, limited-range Y is rescaled to [0, 255]. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn gray16_to_rgba_row(y_plane: &[u16], out: &mut [u8], width: usize, full_range: bool) { +pub(crate) fn gray16_to_rgba_row( + y_plane: &[u16], + out: &mut [u8], + width: usize, + full_range: bool, +) { debug_assert!(y_plane.len() >= width, "y_plane too short"); debug_assert!(out.len() >= width * 4, "out too short"); for (x, &raw) in y_plane[..width].iter().enumerate() { + let raw = if BE { + u16::from_be(raw) + } else { + u16::from_le(raw) + }; let y8 = if full_range { (raw >> 8) as u8 } else { @@ -416,9 +484,10 @@ pub(crate) fn gray16_to_rgba_row(y_plane: &[u16], out: &mut [u8], width: usize, /// Gray16 → packed u16 RGB. Identity broadcast, native 16-bit depth. /// +/// When `BE = true`, each u16 sample is byte-swapped before processing. /// When `full_range = false`, limited-range Y is rescaled to [0, 65535]. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn gray16_to_rgb_u16_row( +pub(crate) fn gray16_to_rgb_u16_row( y_plane: &[u16], out: &mut [u16], width: usize, @@ -427,6 +496,11 @@ pub(crate) fn gray16_to_rgb_u16_row( debug_assert!(y_plane.len() >= width, "y_plane too short"); debug_assert!(out.len() >= width * 3, "out too short"); for (x, &raw) in y_plane[..width].iter().enumerate() { + let raw = if BE { + u16::from_be(raw) + } else { + u16::from_le(raw) + }; let y_out = if full_range { raw } else { @@ -438,9 +512,10 @@ pub(crate) fn gray16_to_rgb_u16_row( /// Gray16 → packed u16 RGBA. Identity broadcast, α = 0xFFFF. /// +/// When `BE = true`, each u16 sample is byte-swapped before processing. /// When `full_range = false`, limited-range Y is rescaled to [0, 65535]. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn gray16_to_rgba_u16_row( +pub(crate) fn gray16_to_rgba_u16_row( y_plane: &[u16], out: &mut [u16], width: usize, @@ -449,6 +524,11 @@ pub(crate) fn gray16_to_rgba_u16_row( debug_assert!(y_plane.len() >= width, "y_plane too short"); debug_assert!(out.len() >= width * 4, "out too short"); for (x, &raw) in y_plane[..width].iter().enumerate() { + let raw = if BE { + u16::from_be(raw) + } else { + u16::from_le(raw) + }; let y_out = if full_range { raw } else { @@ -460,34 +540,52 @@ pub(crate) fn gray16_to_rgba_u16_row( /// Gray16 → luma u8. Downshifts `>> 8`. /// +/// When `BE = true`, each u16 sample is byte-swapped before processing. /// Always passes raw Y through without `full_range` rescaling — /// the caller is explicitly requesting the source luma plane as-is. 
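For the `full_range = false` branches elided in the hunks above, an illustrative rescale is sketched below. The anchors (16-bit black = 16 << 8 = 4096, white = 235 << 8 = 60160) and the over-white clamp match the unit tests later in this file; the crate's exact arithmetic may differ, so treat this as a model of the behaviour rather than the implementation.

fn rescale_limited16_to_u8(y: u16) -> u8 {
    const BLACK: u32 = 16 << 8;         // 4096, limited-range black
    const RANGE: u32 = (235 - 16) << 8; // 56064, black-to-white span
    let above_black = (y as u32).saturating_sub(BLACK);
    (above_black * 255 / RANGE).min(255) as u8
}

fn main() {
    assert_eq!(rescale_limited16_to_u8(4096), 0);    // limited-range black
    assert_eq!(rescale_limited16_to_u8(60160), 255); // limited-range white
    assert_eq!(rescale_limited16_to_u8(65535), 255); // over-white clamps to max
}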
#[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn gray16_to_luma_row(y_plane: &[u16], out: &mut [u8], width: usize) { +pub(crate) fn gray16_to_luma_row(y_plane: &[u16], out: &mut [u8], width: usize) { debug_assert!(y_plane.len() >= width, "y_plane too short"); debug_assert!(out.len() >= width, "out too short"); for (out_byte, &raw) in out[..width].iter_mut().zip(y_plane[..width].iter()) { + let raw = if BE { + u16::from_be(raw) + } else { + u16::from_le(raw) + }; *out_byte = (raw >> 8) as u8; } } -/// Gray16 → luma u16. Identity copy. +/// Gray16 → luma u16. Identity copy (or byte-swap copy for BE). /// +/// When `BE = true`, each u16 sample is byte-swapped before output. /// Always passes raw Y through without `full_range` rescaling — /// the caller is explicitly requesting the source luma plane as-is. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn gray16_to_luma_u16_row(y_plane: &[u16], out: &mut [u16], width: usize) { +pub(crate) fn gray16_to_luma_u16_row( + y_plane: &[u16], + out: &mut [u16], + width: usize, +) { debug_assert!(y_plane.len() >= width, "y_plane too short"); debug_assert!(out.len() >= width, "out too short"); - out[..width].copy_from_slice(&y_plane[..width]); + for (o, &raw) in out[..width].iter_mut().zip(y_plane[..width].iter()) { + *o = if BE { + u16::from_be(raw) + } else { + u16::from_le(raw) + }; + } } /// Gray16 → HSV u8. `>> 8` to u8, H=0 S=0 V=Y8. /// +/// When `BE = true`, each u16 sample is byte-swapped before processing. /// When `full_range = false`, the V channel uses the rescaled luma value. /// See [`gray8_to_hsv_row`] for the S=0 convention. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn gray16_to_hsv_row( +pub(crate) fn gray16_to_hsv_row( y_plane: &[u16], h_out: &mut [u8], s_out: &mut [u8], @@ -500,6 +598,11 @@ pub(crate) fn gray16_to_hsv_row( debug_assert!(s_out.len() >= width, "S out too short"); debug_assert!(v_out.len() >= width, "V out too short"); for (x, &raw) in y_plane[..width].iter().enumerate() { + let raw = if BE { + u16::from_be(raw) + } else { + u16::from_le(raw) + }; h_out[x] = 0; s_out[x] = 0; v_out[x] = if full_range { @@ -609,7 +712,7 @@ mod tests { // 10-bit black = 16 << 2 = 64 let y: std::vec::Vec = std::vec![64u16]; let mut out = std::vec![0u8; 3]; - gray_n_to_rgb_row::<10>(&y, &mut out, 1, false); + gray_n_to_rgb_row::<10, false>(&y, &mut out, 1, false); assert_eq!(&out[0..3], &[0, 0, 0]); } @@ -618,7 +721,7 @@ mod tests { // 10-bit white = 235 << 2 = 940 let y: std::vec::Vec = std::vec![940u16]; let mut out = std::vec![0u8; 3]; - gray_n_to_rgb_row::<10>(&y, &mut out, 1, false); + gray_n_to_rgb_row::<10, false>(&y, &mut out, 1, false); assert_eq!(&out[0..3], &[255, 255, 255]); } @@ -627,7 +730,7 @@ mod tests { // 10-bit mid: 125 << 2 = 500 → approx 127 let y: std::vec::Vec = std::vec![500u16]; let mut out = std::vec![0u8; 3]; - gray_n_to_rgb_row::<10>(&y, &mut out, 1, false); + gray_n_to_rgb_row::<10, false>(&y, &mut out, 1, false); assert!( out[0] >= 126 && out[0] <= 128, "expected ~127 got {}", @@ -640,7 +743,7 @@ mod tests { // 10-bit full range: value 512 >> 2 = 128 let y: std::vec::Vec = std::vec![512u16]; let mut out = std::vec![0u8; 3]; - gray_n_to_rgb_row::<10>(&y, &mut out, 1, true); + gray_n_to_rgb_row::<10, false>(&y, &mut out, 1, true); assert_eq!(&out[0..3], &[128, 128, 128]); } @@ -651,7 +754,7 @@ mod tests { // 12-bit black = 16 << 4 = 256 let y: std::vec::Vec = std::vec![256u16]; let mut out = std::vec![0u8; 3]; - gray_n_to_rgb_row::<12>(&y, &mut out, 1, false); + 
gray_n_to_rgb_row::<12, false>(&y, &mut out, 1, false); assert_eq!(&out[0..3], &[0, 0, 0]); } @@ -660,7 +763,7 @@ mod tests { // 12-bit white = 235 << 4 = 3760 let y: std::vec::Vec = std::vec![3760u16]; let mut out = std::vec![0u8; 3]; - gray_n_to_rgb_row::<12>(&y, &mut out, 1, false); + gray_n_to_rgb_row::<12, false>(&y, &mut out, 1, false); assert_eq!(&out[0..3], &[255, 255, 255]); } @@ -671,7 +774,7 @@ mod tests { // 14-bit black = 16 << 6 = 1024 let y: std::vec::Vec = std::vec![1024u16]; let mut out = std::vec![0u8; 3]; - gray_n_to_rgb_row::<14>(&y, &mut out, 1, false); + gray_n_to_rgb_row::<14, false>(&y, &mut out, 1, false); assert_eq!(&out[0..3], &[0, 0, 0]); } @@ -680,7 +783,7 @@ mod tests { // 14-bit white = 235 << 6 = 15040 let y: std::vec::Vec = std::vec![15040u16]; let mut out = std::vec![0u8; 3]; - gray_n_to_rgb_row::<14>(&y, &mut out, 1, false); + gray_n_to_rgb_row::<14, false>(&y, &mut out, 1, false); assert_eq!(&out[0..3], &[255, 255, 255]); } @@ -691,7 +794,7 @@ mod tests { // 16-bit black = 16 << 8 = 4096 let y: std::vec::Vec = std::vec![4096u16]; let mut out = std::vec![0u8; 3]; - gray16_to_rgb_row(&y, &mut out, 1, false); + gray16_to_rgb_row::(&y, &mut out, 1, false); assert_eq!(&out[0..3], &[0, 0, 0]); } @@ -700,7 +803,7 @@ mod tests { // 16-bit white = 235 << 8 = 60160 let y: std::vec::Vec = std::vec![60160u16]; let mut out = std::vec![0u8; 3]; - gray16_to_rgb_row(&y, &mut out, 1, false); + gray16_to_rgb_row::(&y, &mut out, 1, false); assert_eq!(&out[0..3], &[255, 255, 255]); } @@ -709,7 +812,7 @@ mod tests { // 16-bit mid: 125 << 8 = 32000 → approx 127 let y: std::vec::Vec = std::vec![32000u16]; let mut out = std::vec![0u8; 3]; - gray16_to_rgb_row(&y, &mut out, 1, false); + gray16_to_rgb_row::(&y, &mut out, 1, false); assert!( out[0] >= 126 && out[0] <= 128, "expected ~127 got {}", @@ -722,7 +825,7 @@ mod tests { // 16-bit full range: 0x8000 >> 8 = 128 let y: std::vec::Vec = std::vec![0x8000u16]; let mut out = std::vec![0u8; 3]; - gray16_to_rgb_row(&y, &mut out, 1, true); + gray16_to_rgb_row::(&y, &mut out, 1, true); assert_eq!(&out[0..3], &[128, 128, 128]); } @@ -737,7 +840,7 @@ mod tests { fn gray16_to_rgb_u16_limited_range_black() { let y: std::vec::Vec = std::vec![4096u16]; // limited-range black let mut out = std::vec![0u16; 3]; - gray16_to_rgb_u16_row(&y, &mut out, 1, false); + gray16_to_rgb_u16_row::(&y, &mut out, 1, false); assert_eq!(&out[0..3], &[0, 0, 0]); } @@ -745,7 +848,7 @@ mod tests { fn gray16_to_rgb_u16_limited_range_white() { let y: std::vec::Vec = std::vec![60160u16]; // limited-range white let mut out = std::vec![0u16; 3]; - gray16_to_rgb_u16_row(&y, &mut out, 1, false); + gray16_to_rgb_u16_row::(&y, &mut out, 1, false); assert_eq!(&out[0..3], &[65535, 65535, 65535]); } @@ -754,7 +857,7 @@ mod tests { // Over-white (Y > 60160) is clamped to max_native=65535. 
let y: std::vec::Vec = std::vec![65535u16]; let mut out = std::vec![0u16; 3]; - gray16_to_rgb_u16_row(&y, &mut out, 1, false); + gray16_to_rgb_u16_row::(&y, &mut out, 1, false); assert_eq!(&out[0..3], &[65535, 65535, 65535]); } @@ -762,7 +865,7 @@ mod tests { fn gray16_to_rgba_u16_limited_range_black_and_white() { let y: std::vec::Vec = std::vec![4096u16, 60160u16]; let mut out = std::vec![0u16; 8]; - gray16_to_rgba_u16_row(&y, &mut out, 2, false); + gray16_to_rgba_u16_row::(&y, &mut out, 2, false); assert_eq!(&out[0..3], &[0, 0, 0]); assert_eq!(out[3], 0xFFFF); assert_eq!(&out[4..7], &[65535, 65535, 65535]); @@ -776,7 +879,7 @@ mod tests { // 10-bit: 1023 >> 2 = 255; 0 >> 2 = 0; 512 >> 2 = 128 let y: std::vec::Vec = std::vec![0, 512, 1023]; let mut out = std::vec![0u8; 9]; - gray_n_to_rgb_row::<10>(&y, &mut out, 3, true); + gray_n_to_rgb_row::<10, false>(&y, &mut out, 3, true); assert_eq!(&out[0..3], &[0, 0, 0]); assert_eq!(&out[3..6], &[128, 128, 128]); assert_eq!(&out[6..9], &[255, 255, 255]); @@ -787,7 +890,7 @@ mod tests { // Upper bits should be masked out: 0xFFFF & 0x03FF = 0x03FF = 1023 let y: std::vec::Vec = std::vec![0xFFFF, 512, 0]; let mut out = std::vec![0u16; 9]; - gray_n_to_rgb_u16_row::<10>(&y, &mut out, 3, true); + gray_n_to_rgb_u16_row::<10, false>(&y, &mut out, 3, true); assert_eq!(&out[0..3], &[1023, 1023, 1023]); assert_eq!(&out[3..6], &[512, 512, 512]); assert_eq!(&out[6..9], &[0, 0, 0]); @@ -799,7 +902,7 @@ mod tests { let mut h = std::vec![0xFFu8; 1]; let mut s = std::vec![0xFFu8; 1]; let mut v = std::vec![0u8; 1]; - gray_n_to_hsv_row::<10>(&y, &mut h, &mut s, &mut v, 1, true); + gray_n_to_hsv_row::<10, false>(&y, &mut h, &mut s, &mut v, 1, true); assert_eq!(h[0], 0); assert_eq!(s[0], 0); assert_eq!(v[0], 128); @@ -809,7 +912,7 @@ mod tests { fn gray16_to_rgb_downshifts_8() { let y: std::vec::Vec = std::vec![0, 0x8000, 0xFFFF]; let mut out = std::vec![0u8; 9]; - gray16_to_rgb_row(&y, &mut out, 3, true); + gray16_to_rgb_row::(&y, &mut out, 3, true); assert_eq!(&out[0..3], &[0, 0, 0]); assert_eq!(&out[3..6], &[0x80, 0x80, 0x80]); assert_eq!(&out[6..9], &[0xFF, 0xFF, 0xFF]); @@ -819,7 +922,7 @@ mod tests { fn gray16_to_luma_u16_identity() { let y: std::vec::Vec = std::vec![0, 1000, 65535]; let mut out = std::vec![0u16; 3]; - gray16_to_luma_u16_row(&y, &mut out, 3); + gray16_to_luma_u16_row::(&y, &mut out, 3); assert_eq!(out.as_slice(), &[0, 1000, 65535]); } @@ -827,7 +930,7 @@ mod tests { fn gray16_to_rgba_u16_opaque() { let y: std::vec::Vec = std::vec![12345u16]; let mut out = std::vec![0u16; 4]; - gray16_to_rgba_u16_row(&y, &mut out, 1, true); + gray16_to_rgba_u16_row::(&y, &mut out, 1, true); assert_eq!(&out[0..4], &[12345, 12345, 12345, 0xFFFF]); } @@ -835,7 +938,7 @@ mod tests { fn gray_n_to_luma_u16_10bit_masks() { let y: std::vec::Vec = std::vec![0xFFFF]; // should mask to 1023 let mut out = std::vec![0u16; 1]; - gray_n_to_luma_u16_row::<10>(&y, &mut out, 1); + gray_n_to_luma_u16_row::<10, false>(&y, &mut out, 1); assert_eq!(out[0], 1023); } @@ -846,7 +949,7 @@ mod tests { // 9-bit black = 16 << 1 = 32 let y: std::vec::Vec = std::vec![32u16]; let mut out = std::vec![0u8; 3]; - gray_n_to_rgb_row::<9>(&y, &mut out, 1, false); + gray_n_to_rgb_row::<9, false>(&y, &mut out, 1, false); assert_eq!(&out[0..3], &[0, 0, 0]); } @@ -855,7 +958,7 @@ mod tests { // 9-bit white = 235 << 1 = 470 let y: std::vec::Vec = std::vec![470u16]; let mut out = std::vec![0u8; 3]; - gray_n_to_rgb_row::<9>(&y, &mut out, 1, false); + gray_n_to_rgb_row::<9, false>(&y, &mut out, 1, false); 
assert_eq!(&out[0..3], &[255, 255, 255]); } @@ -864,7 +967,113 @@ mod tests { // 9-bit full range: value 256 >> 1 = 128 let y: std::vec::Vec = std::vec![256u16]; let mut out = std::vec![0u8; 3]; - gray_n_to_rgb_row::<9>(&y, &mut out, 1, true); + gray_n_to_rgb_row::<9, false>(&y, &mut out, 1, true); assert_eq!(&out[0..3], &[128, 128, 128]); } + + // ---- BE parity tests: gray_n (Gray9-14) ----------------------------------- + // Pattern: construct LE input, byte-swap to produce BE input, call with + // BE=true, assert output equals LE-input run output. + + #[test] + fn gray10_be_parity_rgb() { + // LE value 512 >> 2 = 128. BE encoding: 512 = 0x0200, BE bytes = [0x02, 0x00]. + let le: std::vec::Vec = std::vec![512u16]; + let be: std::vec::Vec = le.iter().map(|v| v.swap_bytes()).collect(); + let mut out_le = std::vec![0u8; 3]; + let mut out_be = std::vec![0u8; 3]; + gray_n_to_rgb_row::<10, false>(&le, &mut out_le, 1, true); + gray_n_to_rgb_row::<10, true>(&be, &mut out_be, 1, true); + assert_eq!(out_le, out_be, "BE and LE gray10 rgb outputs must match"); + } + + #[test] + fn gray10_be_parity_rgba() { + let le: std::vec::Vec = std::vec![768u16]; // 768 >> 2 = 192 + let be: std::vec::Vec = le.iter().map(|v| v.swap_bytes()).collect(); + let mut out_le = std::vec![0u8; 4]; + let mut out_be = std::vec![0u8; 4]; + gray_n_to_rgba_row::<10, false>(&le, &mut out_le, 1, true); + gray_n_to_rgba_row::<10, true>(&be, &mut out_be, 1, true); + assert_eq!(out_le, out_be, "BE and LE gray10 rgba outputs must match"); + } + + #[test] + fn gray10_be_parity_luma() { + let le: std::vec::Vec = std::vec![256u16]; // 256 >> 2 = 64 + let be: std::vec::Vec = le.iter().map(|v| v.swap_bytes()).collect(); + let mut out_le = std::vec![0u8; 1]; + let mut out_be = std::vec![0u8; 1]; + gray_n_to_luma_row::<10, false>(&le, &mut out_le, 1); + gray_n_to_luma_row::<10, true>(&be, &mut out_be, 1); + assert_eq!(out_le, out_be, "BE and LE gray10 luma outputs must match"); + } + + #[test] + fn gray10_be_parity_luma_u16() { + let le: std::vec::Vec = std::vec![512u16]; + let be: std::vec::Vec = le.iter().map(|v| v.swap_bytes()).collect(); + let mut out_le = std::vec![0u16; 1]; + let mut out_be = std::vec![0u16; 1]; + gray_n_to_luma_u16_row::<10, false>(&le, &mut out_le, 1); + gray_n_to_luma_u16_row::<10, true>(&be, &mut out_be, 1); + assert_eq!( + out_le, out_be, + "BE and LE gray10 luma_u16 outputs must match" + ); + } + + // ---- BE parity tests: gray16 ----------------------------------------------- + + #[test] + fn gray16_be_parity_rgb() { + // LE value 0x8000 >> 8 = 128. 
+ let le: std::vec::Vec = std::vec![0x8000u16]; + let be: std::vec::Vec = le.iter().map(|v| v.swap_bytes()).collect(); + let mut out_le = std::vec![0u8; 3]; + let mut out_be = std::vec![0u8; 3]; + gray16_to_rgb_row::(&le, &mut out_le, 1, true); + gray16_to_rgb_row::(&be, &mut out_be, 1, true); + assert_eq!(out_le, out_be, "BE and LE gray16 rgb outputs must match"); + } + + #[test] + fn gray16_be_parity_rgba() { + let le: std::vec::Vec = std::vec![0xC000u16]; // 0xC0 = 192 + let be: std::vec::Vec = le.iter().map(|v| v.swap_bytes()).collect(); + let mut out_le = std::vec![0u8; 4]; + let mut out_be = std::vec![0u8; 4]; + gray16_to_rgba_row::(&le, &mut out_le, 1, true); + gray16_to_rgba_row::(&be, &mut out_be, 1, true); + assert_eq!(out_le, out_be, "BE and LE gray16 rgba outputs must match"); + } + + #[test] + fn gray16_be_parity_luma() { + let le: std::vec::Vec = std::vec![0x4000u16]; // 0x40 = 64 + let be: std::vec::Vec = le.iter().map(|v| v.swap_bytes()).collect(); + let mut out_le = std::vec![0u8; 1]; + let mut out_be = std::vec![0u8; 1]; + gray16_to_luma_row::(&le, &mut out_le, 1); + gray16_to_luma_row::(&be, &mut out_be, 1); + assert_eq!(out_le, out_be, "BE and LE gray16 luma outputs must match"); + } + + #[test] + fn gray16_be_parity_luma_u16() { + // For gray16_to_luma_u16_row with BE=true, swap_bytes is applied. + // LE: 0x1234. BE encoding of that value: swap bytes → 0x3412. + // After BE kernel processes 0x3412 with swap_bytes → 0x1234. Output = 0x1234. + let le_val: u16 = 0x1234; + let le: std::vec::Vec = std::vec![le_val]; + let be: std::vec::Vec = le.iter().map(|v| v.swap_bytes()).collect(); + let mut out_le = std::vec![0u16; 1]; + let mut out_be = std::vec![0u16; 1]; + gray16_to_luma_u16_row::(&le, &mut out_le, 1); + gray16_to_luma_u16_row::(&be, &mut out_be, 1); + assert_eq!( + out_le, out_be, + "BE and LE gray16 luma_u16 outputs must match" + ); + } } diff --git a/src/row/scalar/grayf32.rs b/src/row/scalar/grayf32.rs index b762edef..f7a4d6db 100644 --- a/src/row/scalar/grayf32.rs +++ b/src/row/scalar/grayf32.rs @@ -42,11 +42,18 @@ fn f32_to_u16(y: f32) -> u16 { // ---- kernel implementations ------------------------------------------------- /// Grayf32 → packed u8 RGB. Clamp [0,1] × 255 → u8, broadcast R=G=B=Y. +/// +/// When `BE = true`, each f32 element is loaded via byte-swapped u32 bits. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn grayf32_to_rgb_row(plane: &[f32], rgb_out: &mut [u8], width: usize) { +pub(crate) fn grayf32_to_rgb_row(plane: &[f32], rgb_out: &mut [u8], width: usize) { debug_assert!(plane.len() >= width, "plane too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out too short"); - for (x, &y) in plane[..width].iter().enumerate() { + for (x, &raw) in plane[..width].iter().enumerate() { + let y = if BE { + f32::from_bits(u32::from_be(raw.to_bits())) + } else { + f32::from_bits(u32::from_le(raw.to_bits())) + }; let v = f32_to_u8(y); let i = x * 3; rgb_out[i] = v; @@ -56,11 +63,22 @@ pub(crate) fn grayf32_to_rgb_row(plane: &[f32], rgb_out: &mut [u8], width: usize } /// Grayf32 → packed u8 RGBA. Same broadcast as rgb; α = 0xFF. +/// +/// When `BE = true`, each f32 element is loaded via byte-swapped u32 bits. 
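The BE parity tests added above, and the grayf32 and ya16 ones below, all repeat one recipe: run the LE kernel on the original samples and the BE kernel on a byte-swapped copy, then require identical output. The helper below captures that recipe; it is hypothetical and not part of the crate, and `luma8` is a toy kernel standing in for the real ones.

fn assert_u16_parity<O: PartialEq + std::fmt::Debug>(
    input: &[u16],
    run_le: impl Fn(&[u16]) -> O,
    run_be: impl Fn(&[u16]) -> O,
) {
    // Byte-swap every element to produce the "other endian" copy of the row.
    let swapped: Vec<u16> = input.iter().map(|v| v.swap_bytes()).collect();
    assert_eq!(run_le(input), run_be(&swapped), "BE and LE outputs must match");
}

fn main() {
    fn luma8<const BE: bool>(src: &[u16]) -> Vec<u8> {
        src.iter()
            .map(|&v| ((if BE { u16::from_be(v) } else { u16::from_le(v) }) >> 8) as u8)
            .collect()
    }
    assert_u16_parity(&[4096, 60160, 0xFFFF], luma8::<false>, luma8::<true>);
}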
#[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn grayf32_to_rgba_row(plane: &[f32], rgba_out: &mut [u8], width: usize) { +pub(crate) fn grayf32_to_rgba_row( + plane: &[f32], + rgba_out: &mut [u8], + width: usize, +) { debug_assert!(plane.len() >= width, "plane too short"); debug_assert!(rgba_out.len() >= width * 4, "rgba_out too short"); - for (x, &y) in plane[..width].iter().enumerate() { + for (x, &raw) in plane[..width].iter().enumerate() { + let y = if BE { + f32::from_bits(u32::from_be(raw.to_bits())) + } else { + f32::from_bits(u32::from_le(raw.to_bits())) + }; let v = f32_to_u8(y); let i = x * 4; rgba_out[i] = v; @@ -71,11 +89,22 @@ pub(crate) fn grayf32_to_rgba_row(plane: &[f32], rgba_out: &mut [u8], width: usi } /// Grayf32 → packed u16 RGB. Clamp [0,1] × 65535 → u16, broadcast R=G=B=Y. +/// +/// When `BE = true`, each f32 element is loaded via byte-swapped u32 bits. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn grayf32_to_rgb_u16_row(plane: &[f32], rgb_u16_out: &mut [u16], width: usize) { +pub(crate) fn grayf32_to_rgb_u16_row( + plane: &[f32], + rgb_u16_out: &mut [u16], + width: usize, +) { debug_assert!(plane.len() >= width, "plane too short"); debug_assert!(rgb_u16_out.len() >= width * 3, "rgb_u16_out too short"); - for (x, &y) in plane[..width].iter().enumerate() { + for (x, &raw) in plane[..width].iter().enumerate() { + let y = if BE { + f32::from_bits(u32::from_be(raw.to_bits())) + } else { + f32::from_bits(u32::from_le(raw.to_bits())) + }; let v = f32_to_u16(y); let i = x * 3; rgb_u16_out[i] = v; @@ -85,11 +114,22 @@ pub(crate) fn grayf32_to_rgb_u16_row(plane: &[f32], rgb_u16_out: &mut [u16], wid } /// Grayf32 → packed u16 RGBA. Same broadcast; α = 0xFFFF. +/// +/// When `BE = true`, each f32 element is loaded via byte-swapped u32 bits. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn grayf32_to_rgba_u16_row(plane: &[f32], rgba_u16_out: &mut [u16], width: usize) { +pub(crate) fn grayf32_to_rgba_u16_row( + plane: &[f32], + rgba_u16_out: &mut [u16], + width: usize, +) { debug_assert!(plane.len() >= width, "plane too short"); debug_assert!(rgba_u16_out.len() >= width * 4, "rgba_u16_out too short"); - for (x, &y) in plane[..width].iter().enumerate() { + for (x, &raw) in plane[..width].iter().enumerate() { + let y = if BE { + f32::from_bits(u32::from_be(raw.to_bits())) + } else { + f32::from_bits(u32::from_le(raw.to_bits())) + }; let v = f32_to_u16(y); let i = x * 4; rgba_u16_out[i] = v; @@ -100,11 +140,23 @@ pub(crate) fn grayf32_to_rgba_u16_row(plane: &[f32], rgba_u16_out: &mut [u16], w } /// Grayf32 → packed f32 RGB. Lossless: replicate Y → R=G=B (no clamp, no round). +/// +/// When `BE = true`, each f32 element is byte-swapped (treats stored bits as +/// BE-encoded IEEE 754 and converts to host-native before replication). 
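The grayf32 kernels normalize through the bit pattern rather than swapping bytes in memory, which keeps every signature on `&[f32]`. A standalone sketch of that idiom (`normalize_f32` is an illustrative name):

fn normalize_f32<const BE: bool>(raw: f32) -> f32 {
    // Reinterpret as u32, convert from the stored byte order to host order, reinterpret back.
    let bits = raw.to_bits();
    f32::from_bits(if BE { u32::from_be(bits) } else { u32::from_le(bits) })
}

fn main() {
    let sample = 0.5f32;
    let swapped = f32::from_bits(sample.to_bits().swap_bytes());
    // Swapping the u32 bits is the same as reversing the element's four bytes in memory.
    assert_eq!(swapped.to_ne_bytes(), {
        let mut b = sample.to_ne_bytes();
        b.reverse();
        b
    });
    // The parity invariant used by the grayf32 tests below holds on any host;
    // comparing bit patterns sidesteps float equality edge cases.
    assert_eq!(
        normalize_f32::<true>(swapped).to_bits(),
        normalize_f32::<false>(sample).to_bits()
    );
}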
#[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn grayf32_to_rgb_f32_row(plane: &[f32], rgb_f32_out: &mut [f32], width: usize) { +pub(crate) fn grayf32_to_rgb_f32_row( + plane: &[f32], + rgb_f32_out: &mut [f32], + width: usize, +) { debug_assert!(plane.len() >= width, "plane too short"); debug_assert!(rgb_f32_out.len() >= width * 3, "rgb_f32_out too short"); - for (x, &y) in plane[..width].iter().enumerate() { + for (x, &raw) in plane[..width].iter().enumerate() { + let y = if BE { + f32::from_bits(u32::from_be(raw.to_bits())) + } else { + f32::from_bits(u32::from_le(raw.to_bits())) + }; let i = x * 3; rgb_f32_out[i] = y; rgb_f32_out[i + 1] = y; @@ -113,39 +165,74 @@ pub(crate) fn grayf32_to_rgb_f32_row(plane: &[f32], rgb_f32_out: &mut [f32], wid } /// Grayf32 → luma u8. Clamp [0,1] × 255 → u8. +/// +/// When `BE = true`, each f32 element is loaded via byte-swapped u32 bits. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn grayf32_to_luma_row(plane: &[f32], luma_out: &mut [u8], width: usize) { +pub(crate) fn grayf32_to_luma_row( + plane: &[f32], + luma_out: &mut [u8], + width: usize, +) { debug_assert!(plane.len() >= width, "plane too short"); debug_assert!(luma_out.len() >= width, "luma_out too short"); - for (out, &y) in luma_out[..width].iter_mut().zip(plane[..width].iter()) { + for (out, &raw) in luma_out[..width].iter_mut().zip(plane[..width].iter()) { + let y = if BE { + f32::from_bits(u32::from_be(raw.to_bits())) + } else { + f32::from_bits(u32::from_le(raw.to_bits())) + }; *out = f32_to_u8(y); } } /// Grayf32 → luma u16. Clamp [0,1] × 65535 → u16. +/// +/// When `BE = true`, each f32 element is loaded via byte-swapped u32 bits. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn grayf32_to_luma_u16_row(plane: &[f32], luma_u16_out: &mut [u16], width: usize) { +pub(crate) fn grayf32_to_luma_u16_row( + plane: &[f32], + luma_u16_out: &mut [u16], + width: usize, +) { debug_assert!(plane.len() >= width, "plane too short"); debug_assert!(luma_u16_out.len() >= width, "luma_u16_out too short"); - for (out, &y) in luma_u16_out[..width].iter_mut().zip(plane[..width].iter()) { + for (out, &raw) in luma_u16_out[..width].iter_mut().zip(plane[..width].iter()) { + let y = if BE { + f32::from_bits(u32::from_be(raw.to_bits())) + } else { + f32::from_bits(u32::from_le(raw.to_bits())) + }; *out = f32_to_u16(y); } } -/// Grayf32 → luma f32. Lossless pass-through (memcpy-equivalent). +/// Grayf32 → luma f32. Lossless pass-through (or byte-swap copy for BE). +/// +/// When `BE = true`, each f32 element is byte-swapped before output. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn grayf32_to_luma_f32_row(plane: &[f32], luma_f32_out: &mut [f32], width: usize) { +pub(crate) fn grayf32_to_luma_f32_row( + plane: &[f32], + luma_f32_out: &mut [f32], + width: usize, +) { debug_assert!(plane.len() >= width, "plane too short"); debug_assert!(luma_f32_out.len() >= width, "luma_f32_out too short"); - luma_f32_out[..width].copy_from_slice(&plane[..width]); + for (out, &raw) in luma_f32_out[..width].iter_mut().zip(plane[..width].iter()) { + *out = if BE { + f32::from_bits(u32::from_be(raw.to_bits())) + } else { + f32::from_bits(u32::from_le(raw.to_bits())) + }; + } } /// Grayf32 → HSV u8. Gray fast-path: H=0, S=0, V = clamp(Y, 0, 1) × 255. /// +/// When `BE = true`, each f32 element is loaded via byte-swapped u32 bits. /// Gray sources are achromatic (saturation = 0 identically). H is fixed to 0 /// to match OpenCV's `cv2.COLOR_GRAY2HSV` convention. 
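Both the u8 paths above and the HSV V channel below funnel through a clamp-and-scale conversion. One formulation consistent with the unit tests (0.5 maps to 128, out-of-range values saturate) is sketched here; `f32_to_u8_sketch` is illustrative, and the crate's actual `f32_to_u8` helper may be written differently.

fn f32_to_u8_sketch(y: f32) -> u8 {
    // Clamp to [0, 1], scale to [0, 255], round half up, truncate.
    (y.clamp(0.0, 1.0) * 255.0 + 0.5) as u8
}

fn main() {
    assert_eq!(f32_to_u8_sketch(0.0), 0);
    assert_eq!(f32_to_u8_sketch(0.5), 128); // 127.5 rounds up
    assert_eq!(f32_to_u8_sketch(1.0), 255);
    assert_eq!(f32_to_u8_sketch(1.5), 255); // clamped high
    assert_eq!(f32_to_u8_sketch(-0.1), 0);  // clamped low
}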
#[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn grayf32_to_hsv_row( +pub(crate) fn grayf32_to_hsv_row( plane: &[f32], h_out: &mut [u8], s_out: &mut [u8], @@ -156,7 +243,12 @@ pub(crate) fn grayf32_to_hsv_row( debug_assert!(h_out.len() >= width, "h_out too short"); debug_assert!(s_out.len() >= width, "s_out too short"); debug_assert!(v_out.len() >= width, "v_out too short"); - for (x, &y) in plane[..width].iter().enumerate() { + for (x, &raw) in plane[..width].iter().enumerate() { + let y = if BE { + f32::from_bits(u32::from_be(raw.to_bits())) + } else { + f32::from_bits(u32::from_le(raw.to_bits())) + }; h_out[x] = 0; s_out[x] = 0; v_out[x] = f32_to_u8(y); @@ -173,7 +265,7 @@ mod tests { fn grayf32_to_rgb_zero() { let plane = [0.0f32]; let mut out = [0xFFu8; 3]; - grayf32_to_rgb_row(&plane, &mut out, 1); + grayf32_to_rgb_row::(&plane, &mut out, 1); assert_eq!(out, [0, 0, 0]); } @@ -181,7 +273,7 @@ mod tests { fn grayf32_to_rgb_max() { let plane = [1.0f32]; let mut out = [0u8; 3]; - grayf32_to_rgb_row(&plane, &mut out, 1); + grayf32_to_rgb_row::(&plane, &mut out, 1); assert_eq!(out, [255, 255, 255]); } @@ -195,7 +287,7 @@ mod tests { // truncate` is the contract this crate uses across scalar + SIMD. let plane = [0.5f32]; let mut out = [0u8; 3]; - grayf32_to_rgb_row(&plane, &mut out, 1); + grayf32_to_rgb_row::(&plane, &mut out, 1); assert_eq!(out, [128, 128, 128]); } @@ -203,7 +295,7 @@ mod tests { fn grayf32_to_rgb_saturates_high() { let plane = [1.5f32]; let mut out = [0u8; 3]; - grayf32_to_rgb_row(&plane, &mut out, 1); + grayf32_to_rgb_row::(&plane, &mut out, 1); assert_eq!(out, [255, 255, 255]); } @@ -211,7 +303,7 @@ mod tests { fn grayf32_to_rgb_saturates_low() { let plane = [-0.1f32]; let mut out = [0xFFu8; 3]; - grayf32_to_rgb_row(&plane, &mut out, 1); + grayf32_to_rgb_row::(&plane, &mut out, 1); assert_eq!(out, [0, 0, 0]); } @@ -221,7 +313,7 @@ mod tests { fn grayf32_to_rgba_zero_alpha_opaque() { let plane = [0.0f32]; let mut out = [0u8; 4]; - grayf32_to_rgba_row(&plane, &mut out, 1); + grayf32_to_rgba_row::(&plane, &mut out, 1); assert_eq!(out, [0, 0, 0, 0xFF]); } @@ -229,7 +321,7 @@ mod tests { fn grayf32_to_rgba_max_alpha_opaque() { let plane = [1.0f32]; let mut out = [0u8; 4]; - grayf32_to_rgba_row(&plane, &mut out, 1); + grayf32_to_rgba_row::(&plane, &mut out, 1); assert_eq!(out, [255, 255, 255, 0xFF]); } @@ -239,7 +331,7 @@ mod tests { fn grayf32_to_rgb_u16_zero() { let plane = [0.0f32]; let mut out = [0xFFFFu16; 3]; - grayf32_to_rgb_u16_row(&plane, &mut out, 1); + grayf32_to_rgb_u16_row::(&plane, &mut out, 1); assert_eq!(out, [0, 0, 0]); } @@ -247,7 +339,7 @@ mod tests { fn grayf32_to_rgb_u16_max() { let plane = [1.0f32]; let mut out = [0u16; 3]; - grayf32_to_rgb_u16_row(&plane, &mut out, 1); + grayf32_to_rgb_u16_row::(&plane, &mut out, 1); assert_eq!(out, [65535, 65535, 65535]); } @@ -255,7 +347,7 @@ mod tests { fn grayf32_to_rgb_u16_saturates_high() { let plane = [2.0f32]; let mut out = [0u16; 3]; - grayf32_to_rgb_u16_row(&plane, &mut out, 1); + grayf32_to_rgb_u16_row::(&plane, &mut out, 1); assert_eq!(out, [65535, 65535, 65535]); } @@ -265,7 +357,7 @@ mod tests { fn grayf32_to_rgba_u16_opaque() { let plane = [1.0f32]; let mut out = [0u16; 4]; - grayf32_to_rgba_u16_row(&plane, &mut out, 1); + grayf32_to_rgba_u16_row::(&plane, &mut out, 1); assert_eq!(out, [65535, 65535, 65535, 0xFFFF]); } @@ -276,7 +368,7 @@ mod tests { // Non-clamped value preserved exactly. 
let plane = [1.5f32]; let mut out = [0.0f32; 3]; - grayf32_to_rgb_f32_row(&plane, &mut out, 1); + grayf32_to_rgb_f32_row::(&plane, &mut out, 1); assert_eq!(out, [1.5, 1.5, 1.5]); } @@ -284,7 +376,7 @@ mod tests { fn grayf32_to_rgb_f32_negative_preserved() { let plane = [-0.5f32]; let mut out = [0.0f32; 3]; - grayf32_to_rgb_f32_row(&plane, &mut out, 1); + grayf32_to_rgb_f32_row::(&plane, &mut out, 1); assert_eq!(out, [-0.5, -0.5, -0.5]); } @@ -294,7 +386,7 @@ mod tests { fn grayf32_to_luma_zero() { let plane = [0.0f32]; let mut out = [0xFFu8; 1]; - grayf32_to_luma_row(&plane, &mut out, 1); + grayf32_to_luma_row::(&plane, &mut out, 1); assert_eq!(out, [0]); } @@ -302,7 +394,7 @@ mod tests { fn grayf32_to_luma_max() { let plane = [1.0f32]; let mut out = [0u8; 1]; - grayf32_to_luma_row(&plane, &mut out, 1); + grayf32_to_luma_row::(&plane, &mut out, 1); assert_eq!(out, [255]); } @@ -312,7 +404,7 @@ mod tests { fn grayf32_to_luma_u16_max() { let plane = [1.0f32]; let mut out = [0u16; 1]; - grayf32_to_luma_u16_row(&plane, &mut out, 1); + grayf32_to_luma_u16_row::(&plane, &mut out, 1); assert_eq!(out, [65535]); } @@ -322,7 +414,7 @@ mod tests { fn grayf32_to_luma_f32_identity() { let plane = [0.0f32, 0.5, 1.0, 1.5, -0.1]; let mut out = [99.0f32; 5]; - grayf32_to_luma_f32_row(&plane, &mut out, 5); + grayf32_to_luma_f32_row::(&plane, &mut out, 5); // Lossless pass-through — exact bit equality. assert_eq!(out, [0.0, 0.5, 1.0, 1.5, -0.1]); } @@ -335,7 +427,7 @@ mod tests { let mut h = [0xFFu8; 1]; let mut s = [0xFFu8; 1]; let mut v = [0u8; 1]; - grayf32_to_hsv_row(&plane, &mut h, &mut s, &mut v, 1); + grayf32_to_hsv_row::(&plane, &mut h, &mut s, &mut v, 1); assert_eq!(h[0], 0, "H must be 0 for achromatic source"); assert_eq!(s[0], 0, "S must be 0 for achromatic source"); assert_eq!(v[0], 0); @@ -347,7 +439,7 @@ mod tests { let mut h = [0u8; 1]; let mut s = [0u8; 1]; let mut v = [0u8; 1]; - grayf32_to_hsv_row(&plane, &mut h, &mut s, &mut v, 1); + grayf32_to_hsv_row::(&plane, &mut h, &mut s, &mut v, 1); assert_eq!(h[0], 0); assert_eq!(s[0], 0); assert_eq!(v[0], 255); @@ -360,7 +452,7 @@ mod tests { let mut h = [0u8; 1]; let mut s = [0u8; 1]; let mut v = [0u8; 1]; - grayf32_to_hsv_row(&plane, &mut h, &mut s, &mut v, 1); + grayf32_to_hsv_row::(&plane, &mut h, &mut s, &mut v, 1); assert_eq!(h[0], 0); assert_eq!(s[0], 0); assert_eq!(v[0], 128); @@ -373,7 +465,7 @@ mod tests { let mut h = [0u8; 1]; let mut s = [0u8; 1]; let mut v = [0u8; 1]; - grayf32_to_hsv_row(&plane, &mut h, &mut s, &mut v, 1); + grayf32_to_hsv_row::(&plane, &mut h, &mut s, &mut v, 1); assert_eq!(v[0], 255); } @@ -381,9 +473,43 @@ mod tests { fn grayf32_to_rgb_multi_pixel() { let plane = [0.0f32, 1.0, 0.5]; let mut out = [0u8; 9]; - grayf32_to_rgb_row(&plane, &mut out, 3); + grayf32_to_rgb_row::(&plane, &mut out, 3); assert_eq!(&out[0..3], &[0, 0, 0]); assert_eq!(&out[3..6], &[255, 255, 255]); assert_eq!(&out[6..9], &[128, 128, 128]); // 0.5 → 128 } + + // ---- BE parity tests: grayf32 --------------------------------------------- + // Pattern: construct LE f32 input, reinterpret bytes as BE-encoded f32 + // (i.e. byte-swap the u32 bits), call BE kernel, assert output matches LE run. + + /// Helper: produce a BE-encoded copy of an f32 slice (swap u32 bits of each element). 
+ fn f32_to_be_bytes(src: &[f32]) -> std::vec::Vec { + src + .iter() + .map(|&v| f32::from_bits(v.to_bits().swap_bytes())) + .collect() + } + + #[test] + fn grayf32_be_parity_rgb() { + let le = [0.5f32]; + let be = f32_to_be_bytes(&le); + let mut out_le = [0u8; 3]; + let mut out_be = [0u8; 3]; + grayf32_to_rgb_row::(&le, &mut out_le, 1); + grayf32_to_rgb_row::(&be, &mut out_be, 1); + assert_eq!(out_le, out_be, "BE and LE grayf32 rgb outputs must match"); + } + + #[test] + fn grayf32_be_parity_luma() { + let le = [0.25f32]; + let be = f32_to_be_bytes(&le); + let mut out_le = [0u8; 1]; + let mut out_be = [0u8; 1]; + grayf32_to_luma_row::(&le, &mut out_le, 1); + grayf32_to_luma_row::(&be, &mut out_be, 1); + assert_eq!(out_le, out_be, "BE and LE grayf32 luma outputs must match"); + } } diff --git a/src/row/scalar/ya16.rs b/src/row/scalar/ya16.rs index b2d8f831..8ca3fe5a 100644 --- a/src/row/scalar/ya16.rs +++ b/src/row/scalar/ya16.rs @@ -19,12 +19,19 @@ //! α is dropped for HSV output. /// Ya16 → packed u8 RGB. Y `>> 8`, broadcast R=G=B; α dropped. +/// +/// When `BE = true`, each u16 element is byte-swapped before processing. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn ya16_to_rgb_row(packed: &[u16], rgb_out: &mut [u8], width: usize) { +pub(crate) fn ya16_to_rgb_row(packed: &[u16], rgb_out: &mut [u8], width: usize) { debug_assert!(packed.len() >= width * 2, "packed too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out too short"); for x in 0..width { - let y8 = (packed[x * 2] >> 8) as u8; + let y_raw = if BE { + u16::from_be(packed[x * 2]) + } else { + u16::from_le(packed[x * 2]) + }; + let y8 = (y_raw >> 8) as u8; let i = x * 3; rgb_out[i] = y8; rgb_out[i + 1] = y8; @@ -33,13 +40,25 @@ pub(crate) fn ya16_to_rgb_row(packed: &[u16], rgb_out: &mut [u8], width: usize) } /// Ya16 → packed u8 RGBA. Y `>> 8`, broadcast R=G=B; A `>> 8` from source slot 1. +/// +/// When `BE = true`, each u16 element is byte-swapped before processing. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn ya16_to_rgba_row(packed: &[u16], rgba_out: &mut [u8], width: usize) { +pub(crate) fn ya16_to_rgba_row(packed: &[u16], rgba_out: &mut [u8], width: usize) { debug_assert!(packed.len() >= width * 2, "packed too short"); debug_assert!(rgba_out.len() >= width * 4, "rgba_out too short"); for x in 0..width { - let y8 = (packed[x * 2] >> 8) as u8; - let a8 = (packed[x * 2 + 1] >> 8) as u8; + let y_raw = if BE { + u16::from_be(packed[x * 2]) + } else { + u16::from_le(packed[x * 2]) + }; + let a_raw = if BE { + u16::from_be(packed[x * 2 + 1]) + } else { + u16::from_le(packed[x * 2 + 1]) + }; + let y8 = (y_raw >> 8) as u8; + let a8 = (a_raw >> 8) as u8; let i = x * 4; rgba_out[i] = y8; rgba_out[i + 1] = y8; @@ -49,12 +68,22 @@ pub(crate) fn ya16_to_rgba_row(packed: &[u16], rgba_out: &mut [u8], width: usize } /// Ya16 → packed u16 RGB. Y native u16, broadcast R=G=B=Y; α dropped. +/// +/// When `BE = true`, each u16 element is byte-swapped before processing. 
#[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn ya16_to_rgb_u16_row(packed: &[u16], rgb_u16_out: &mut [u16], width: usize) { +pub(crate) fn ya16_to_rgb_u16_row( + packed: &[u16], + rgb_u16_out: &mut [u16], + width: usize, +) { debug_assert!(packed.len() >= width * 2, "packed too short"); debug_assert!(rgb_u16_out.len() >= width * 3, "rgb_u16_out too short"); for x in 0..width { - let y = packed[x * 2]; + let y = if BE { + u16::from_be(packed[x * 2]) + } else { + u16::from_le(packed[x * 2]) + }; let i = x * 3; rgb_u16_out[i] = y; rgb_u16_out[i + 1] = y; @@ -63,13 +92,27 @@ pub(crate) fn ya16_to_rgb_u16_row(packed: &[u16], rgb_u16_out: &mut [u16], width } /// Ya16 → packed u16 RGBA. Y native u16, broadcast; A native u16 from source slot 1. +/// +/// When `BE = true`, each u16 element is byte-swapped before processing. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn ya16_to_rgba_u16_row(packed: &[u16], rgba_u16_out: &mut [u16], width: usize) { +pub(crate) fn ya16_to_rgba_u16_row( + packed: &[u16], + rgba_u16_out: &mut [u16], + width: usize, +) { debug_assert!(packed.len() >= width * 2, "packed too short"); debug_assert!(rgba_u16_out.len() >= width * 4, "rgba_u16_out too short"); for x in 0..width { - let y = packed[x * 2]; - let a = packed[x * 2 + 1]; + let y = if BE { + u16::from_be(packed[x * 2]) + } else { + u16::from_le(packed[x * 2]) + }; + let a = if BE { + u16::from_be(packed[x * 2 + 1]) + } else { + u16::from_le(packed[x * 2 + 1]) + }; let i = x * 4; rgba_u16_out[i] = y; rgba_u16_out[i + 1] = y; @@ -79,30 +122,48 @@ pub(crate) fn ya16_to_rgba_u16_row(packed: &[u16], rgba_u16_out: &mut [u16], wid } /// Ya16 → luma u8. Y `>> 8`. +/// +/// When `BE = true`, each u16 element is byte-swapped before processing. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn ya16_to_luma_row(packed: &[u16], luma_out: &mut [u8], width: usize) { +pub(crate) fn ya16_to_luma_row(packed: &[u16], luma_out: &mut [u8], width: usize) { debug_assert!(packed.len() >= width * 2, "packed too short"); debug_assert!(luma_out.len() >= width, "luma_out too short"); for x in 0..width { - luma_out[x] = (packed[x * 2] >> 8) as u8; + let y = if BE { + u16::from_be(packed[x * 2]) + } else { + u16::from_le(packed[x * 2]) + }; + luma_out[x] = (y >> 8) as u8; } } -/// Ya16 → luma u16. Y native u16 pass-through. +/// Ya16 → luma u16. Y native u16 pass-through (or byte-swap for BE). +/// +/// When `BE = true`, each u16 element is byte-swapped before output. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn ya16_to_luma_u16_row(packed: &[u16], luma_u16_out: &mut [u16], width: usize) { +pub(crate) fn ya16_to_luma_u16_row( + packed: &[u16], + luma_u16_out: &mut [u16], + width: usize, +) { debug_assert!(packed.len() >= width * 2, "packed too short"); debug_assert!(luma_u16_out.len() >= width, "luma_u16_out too short"); for x in 0..width { - luma_u16_out[x] = packed[x * 2]; + luma_u16_out[x] = if BE { + u16::from_be(packed[x * 2]) + } else { + u16::from_le(packed[x * 2]) + }; } } /// Ya16 → HSV u8. Gray fast-path: H=0, S=0, V = Y `>> 8`. α dropped. /// +/// When `BE = true`, each u16 element is byte-swapped before processing. /// See [`super::gray::gray8_to_hsv_row`] for the S=0 convention. 
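All of the Ya16 kernels index the same interleaved layout: element `2*x` is the luma sample for pixel `x` and element `2*x + 1` is its alpha. A tiny sketch of that addressing plus the `>> 8` downshift used by the 8-bit paths (`ya16_pixel` is an illustrative name):

fn ya16_pixel(packed: &[u16], x: usize) -> (u16, u16) {
    (packed[x * 2], packed[x * 2 + 1]) // (Y, A) for pixel x
}

fn main() {
    let packed = [0x8000u16, 0x4000]; // one pixel: Y = 0x8000, A = 0x4000
    let (y, a) = ya16_pixel(&packed, 0);
    assert_eq!(((y >> 8) as u8, (a >> 8) as u8), (0x80, 0x40)); // what the u8 RGBA path emits
}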
#[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn ya16_to_hsv_row( +pub(crate) fn ya16_to_hsv_row( packed: &[u16], h_out: &mut [u8], s_out: &mut [u8], @@ -114,9 +175,14 @@ pub(crate) fn ya16_to_hsv_row( debug_assert!(s_out.len() >= width, "s_out too short"); debug_assert!(v_out.len() >= width, "v_out too short"); for x in 0..width { + let y = if BE { + u16::from_be(packed[x * 2]) + } else { + u16::from_le(packed[x * 2]) + }; h_out[x] = 0; s_out[x] = 0; - v_out[x] = (packed[x * 2] >> 8) as u8; + v_out[x] = (y >> 8) as u8; } } @@ -136,7 +202,7 @@ mod tests { // Y=0x8000, A=0x4000 → rgb [0x80, 0x80, 0x80] let p = packed_ya(&[(0x8000, 0x4000)]); let mut out = [0u8; 3]; - ya16_to_rgb_row(&p, &mut out, 1); + ya16_to_rgb_row::(&p, &mut out, 1); assert_eq!(out, [0x80, 0x80, 0x80]); } @@ -144,7 +210,7 @@ mod tests { fn ya16_to_rgb_zero_pixel() { let p = packed_ya(&[(0, 0)]); let mut out = [0xFFu8; 3]; - ya16_to_rgb_row(&p, &mut out, 1); + ya16_to_rgb_row::(&p, &mut out, 1); assert_eq!(out, [0, 0, 0]); } @@ -152,7 +218,7 @@ mod tests { fn ya16_to_rgb_max_y() { let p = packed_ya(&[(0xFFFF, 0)]); let mut out = [0u8; 3]; - ya16_to_rgb_row(&p, &mut out, 1); + ya16_to_rgb_row::(&p, &mut out, 1); assert_eq!(out, [0xFF, 0xFF, 0xFF]); } @@ -163,7 +229,7 @@ mod tests { // Y=0x8000, A=0x4000 → rgba [0x80, 0x80, 0x80, 0x40] let p = packed_ya(&[(0x8000, 0x4000)]); let mut out = [0u8; 4]; - ya16_to_rgba_row(&p, &mut out, 1); + ya16_to_rgba_row::(&p, &mut out, 1); assert_eq!(out, [0x80, 0x80, 0x80, 0x40]); } @@ -171,7 +237,7 @@ mod tests { fn ya16_to_rgba_two_pixels() { let p = packed_ya(&[(0x8000, 0x4000), (0x1000, 0x0800)]); let mut out = [0u8; 8]; - ya16_to_rgba_row(&p, &mut out, 2); + ya16_to_rgba_row::(&p, &mut out, 2); assert_eq!(&out[0..4], &[0x80, 0x80, 0x80, 0x40]); assert_eq!(&out[4..8], &[0x10, 0x10, 0x10, 0x08]); } @@ -183,7 +249,7 @@ mod tests { // Y=0x8000 native, broadcast let p = packed_ya(&[(0x8000, 0x4000)]); let mut out = [0u16; 3]; - ya16_to_rgb_u16_row(&p, &mut out, 1); + ya16_to_rgb_u16_row::(&p, &mut out, 1); assert_eq!(out, [0x8000, 0x8000, 0x8000]); } @@ -191,7 +257,7 @@ mod tests { fn ya16_to_rgb_u16_zero() { let p = packed_ya(&[(0, 0)]); let mut out = [0xFFFFu16; 3]; - ya16_to_rgb_u16_row(&p, &mut out, 1); + ya16_to_rgb_u16_row::(&p, &mut out, 1); assert_eq!(out, [0, 0, 0]); } @@ -202,7 +268,7 @@ mod tests { // Y=0x8000, A=0x4000 → rgba_u16 [0x8000, 0x8000, 0x8000, 0x4000] let p = packed_ya(&[(0x8000, 0x4000)]); let mut out = [0u16; 4]; - ya16_to_rgba_u16_row(&p, &mut out, 1); + ya16_to_rgba_u16_row::(&p, &mut out, 1); assert_eq!(out, [0x8000, 0x8000, 0x8000, 0x4000]); } @@ -212,7 +278,7 @@ mod tests { fn ya16_to_luma_downshifts() { let p = packed_ya(&[(0x8000, 0x4000), (0x0000, 0xFFFF)]); let mut out = [0u8; 2]; - ya16_to_luma_row(&p, &mut out, 2); + ya16_to_luma_row::(&p, &mut out, 2); assert_eq!(out, [0x80, 0x00]); } @@ -222,7 +288,7 @@ mod tests { fn ya16_to_luma_u16_native_passthrough() { let p = packed_ya(&[(0x8000, 0x0000)]); let mut out = [0u16; 1]; - ya16_to_luma_u16_row(&p, &mut out, 1); + ya16_to_luma_u16_row::(&p, &mut out, 1); assert_eq!(out[0], 0x8000); } @@ -235,7 +301,7 @@ mod tests { let mut h = [0xFFu8; 1]; let mut s = [0xFFu8; 1]; let mut v = [0u8; 1]; - ya16_to_hsv_row(&p, &mut h, &mut s, &mut v, 1); + ya16_to_hsv_row::(&p, &mut h, &mut s, &mut v, 1); assert_eq!(h[0], 0); assert_eq!(s[0], 0); assert_eq!(v[0], 0x80); @@ -247,7 +313,7 @@ mod tests { let mut h = [0u8; 1]; let mut s = [0u8; 1]; let mut v = [0xFFu8; 1]; - ya16_to_hsv_row(&p, &mut h, &mut s, &mut 
v, 1); + ya16_to_hsv_row::(&p, &mut h, &mut s, &mut v, 1); assert_eq!(v[0], 0); } @@ -257,7 +323,46 @@ mod tests { let mut h = [0u8; 1]; let mut s = [0u8; 1]; let mut v = [0u8; 1]; - ya16_to_hsv_row(&p, &mut h, &mut s, &mut v, 1); + ya16_to_hsv_row::(&p, &mut h, &mut s, &mut v, 1); assert_eq!(v[0], 0xFF); } + + // ---- BE parity tests: ya16 ------------------------------------------------- + // Pattern: construct LE packed input, byte-swap each u16 element to produce + // BE input, call BE kernel, assert output equals LE-input run output. + + #[test] + fn ya16_be_parity_rgb() { + // Y=0x8000, A=0x4000 LE → RGB [0x80, 0x80, 0x80] + let le = packed_ya(&[(0x8000, 0x4000)]); + let be: std::vec::Vec = le.iter().map(|v| v.swap_bytes()).collect(); + let mut out_le = [0u8; 3]; + let mut out_be = [0u8; 3]; + ya16_to_rgb_row::(&le, &mut out_le, 1); + ya16_to_rgb_row::(&be, &mut out_be, 1); + assert_eq!(out_le, out_be, "BE and LE ya16 rgb outputs must match"); + } + + #[test] + fn ya16_be_parity_rgba() { + // Y=0x8000, A=0x4000 LE → RGBA [0x80, 0x80, 0x80, 0x40] + let le = packed_ya(&[(0x8000, 0x4000)]); + let be: std::vec::Vec = le.iter().map(|v| v.swap_bytes()).collect(); + let mut out_le = [0u8; 4]; + let mut out_be = [0u8; 4]; + ya16_to_rgba_row::(&le, &mut out_le, 1); + ya16_to_rgba_row::(&be, &mut out_be, 1); + assert_eq!(out_le, out_be, "BE and LE ya16 rgba outputs must match"); + } + + #[test] + fn ya16_be_parity_luma() { + let le = packed_ya(&[(0xC000, 0x0000)]); + let be: std::vec::Vec = le.iter().map(|v| v.swap_bytes()).collect(); + let mut out_le = [0u8; 1]; + let mut out_be = [0u8; 1]; + ya16_to_luma_row::(&le, &mut out_le, 1); + ya16_to_luma_row::(&be, &mut out_be, 1); + assert_eq!(out_le, out_be, "BE and LE ya16 luma outputs must match"); + } } diff --git a/src/sinker/mixed/gray.rs b/src/sinker/mixed/gray.rs index 592befd1..b588e324 100644 --- a/src/sinker/mixed/gray.rs +++ b/src/sinker/mixed/gray.rs @@ -268,7 +268,7 @@ fn process_gray_n<'a, const BITS: u32>( // Luma u8 — always passes raw Y through, no full_range rescaling. if let Some(buf) = luma.as_deref_mut() { - gray_n_to_luma_row::( + gray_n_to_luma_row::( y_plane, &mut buf[one_plane_start..one_plane_end], w, @@ -278,7 +278,7 @@ fn process_gray_n<'a, const BITS: u32>( // Luma u16 — always passes raw Y through, no full_range rescaling. 
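The sinker call sites in this file pin the endianness parameter at compile time, so each source format gets exactly one monomorphized kernel. Purely as a hypothetical illustration (none of these names are crate code), a caller holding a runtime byte-order flag would resolve it once per plane like this:

fn to_luma<const BE: bool>(y_plane: &[u16], out: &mut [u8]) {
    for (o, &v) in out.iter_mut().zip(y_plane) {
        let v = if BE { u16::from_be(v) } else { u16::from_le(v) };
        *o = (v >> 8) as u8;
    }
}

fn to_luma_dyn(y_plane: &[u16], out: &mut [u8], big_endian: bool) {
    // One branch per plane; inside the loop the `if BE` is resolved at compile time.
    if big_endian {
        to_luma::<true>(y_plane, out)
    } else {
        to_luma::<false>(y_plane, out)
    }
}

fn main() {
    let y = [0x8000u16.to_le(), 0x1234u16.to_le()]; // samples stored little-endian
    let mut out = [0u8; 2];
    to_luma_dyn(&y, &mut out, false);
    assert_eq!(out, [0x80, 0x12]);
}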
     if let Some(buf) = luma_u16.as_deref_mut() {
-        gray_n_to_luma_u16_row::(
+        gray_n_to_luma_u16_row::(
             y_plane,
             &mut buf[one_plane_start..one_plane_end],
             w,
@@ -294,7 +294,7 @@ fn process_gray_n<'a, const BITS: u32>(
         let rgba_u16_buf = rgba_u16.as_deref_mut().unwrap();
         let rgba_u16_row =
             rgba_u16_plane_row_slice(rgba_u16_buf, one_plane_start, one_plane_end, w, h)?;
-        gray_n_to_rgba_u16_row::(y_plane, rgba_u16_row, w, use_simd, full_range);
+        gray_n_to_rgba_u16_row::(y_plane, rgba_u16_row, w, use_simd, full_range);
     } else if want_rgb_u16 {
         let rgb_u16_buf = rgb_u16.as_deref_mut().unwrap();
         let rgb_plane_start = one_plane_start * 3;
@@ -306,7 +306,7 @@ fn process_gray_n<'a, const BITS: u32>(
             channels: 3,
         })?;
         let rgb_u16_row = &mut rgb_u16_buf[rgb_plane_start..rgb_plane_end];
-        gray_n_to_rgb_u16_row::(y_plane, rgb_u16_row, w, use_simd, full_range);
+        gray_n_to_rgb_u16_row::(y_plane, rgb_u16_row, w, use_simd, full_range);
         if want_rgba_u16 {
             let rgba_u16_buf = rgba_u16.as_deref_mut().unwrap();
             let rgba_u16_row =
@@ -324,7 +324,7 @@ fn process_gray_n<'a, const BITS: u32>(
     if want_rgba && !want_rgb && !want_hsv {
         let rgba_buf = rgba.as_deref_mut().unwrap();
         let rgba_row = rgba_plane_row_slice(rgba_buf, one_plane_start, one_plane_end, w, h)?;
-        gray_n_to_rgba_row::(y_plane, rgba_row, w, use_simd, full_range);
+        gray_n_to_rgba_row::(y_plane, rgba_row, w, use_simd, full_range);
         return Ok(());
     }
 
@@ -332,7 +332,7 @@ fn process_gray_n<'a, const BITS: u32>(
     // (rescaled if limited-range).
     if want_hsv && !want_rgb && !want_rgba {
         let hsv = hsv.as_mut().unwrap();
-        gray_n_to_hsv_row::(
+        gray_n_to_hsv_row::(
             y_plane,
             &mut hsv.h[one_plane_start..one_plane_end],
             &mut hsv.s[one_plane_start..one_plane_end],
@@ -356,7 +356,7 @@ fn process_gray_n<'a, const BITS: u32>(
         w,
         h,
     )?;
-    gray_n_to_rgb_row::(y_plane, rgb_row, w, use_simd, full_range);
+    gray_n_to_rgb_row::(y_plane, rgb_row, w, use_simd, full_range);
 
     if let Some(hsv) = hsv.as_mut() {
         rgb_to_hsv_row(
@@ -690,7 +690,7 @@ impl PixelSink for MixedSinker<'_, Gray16> {
 
         // Luma u8 — shift >> 8.
         if let Some(buf) = luma.as_deref_mut() {
-            gray16_to_luma_row(
+            gray16_to_luma_row::(
                 y_plane,
                 &mut buf[one_plane_start..one_plane_end],
                 w,
@@ -700,7 +700,7 @@ impl PixelSink for MixedSinker<'_, Gray16> {
 
         // Luma u16 — identity copy.
         if let Some(buf) = luma_u16.as_deref_mut() {
-            gray16_to_luma_u16_row(
+            gray16_to_luma_u16_row::(
                 y_plane,
                 &mut buf[one_plane_start..one_plane_end],
                 w,
@@ -716,7 +716,7 @@ impl PixelSink for MixedSinker<'_, Gray16> {
             let rgba_u16_buf = rgba_u16.as_deref_mut().unwrap();
             let rgba_u16_row =
                 rgba_u16_plane_row_slice(rgba_u16_buf, one_plane_start, one_plane_end, w, h)?;
-            gray16_to_rgba_u16_row(y_plane, rgba_u16_row, w, use_simd, full_range);
+            gray16_to_rgba_u16_row::(y_plane, rgba_u16_row, w, use_simd, full_range);
         } else if want_rgb_u16 {
             let rgb_u16_buf = rgb_u16.as_deref_mut().unwrap();
             let rgb_plane_start = one_plane_start * 3;
@@ -729,7 +729,7 @@ impl PixelSink for MixedSinker<'_, Gray16> {
                 channels: 3,
             })?;
             let rgb_u16_row = &mut rgb_u16_buf[rgb_plane_start..rgb_plane_end];
-            gray16_to_rgb_u16_row(y_plane, rgb_u16_row, w, use_simd, full_range);
+            gray16_to_rgb_u16_row::(y_plane, rgb_u16_row, w, use_simd, full_range);
             if want_rgba_u16 {
                 let rgba_u16_buf = rgba_u16.as_deref_mut().unwrap();
                 let rgba_u16_row =
@@ -750,7 +750,7 @@ impl PixelSink for MixedSinker<'_, Gray16> {
         if want_rgba && !need_rgb_kernel && !want_hsv {
             let rgba_buf = rgba.as_deref_mut().unwrap();
             let rgba_row = rgba_plane_row_slice(rgba_buf, one_plane_start, one_plane_end, w, h)?;
-            gray16_to_rgba_row(y_plane, rgba_row, w, use_simd, full_range);
+            gray16_to_rgba_row::(y_plane, rgba_row, w, use_simd, full_range);
             return Ok(());
         }
 
@@ -758,7 +758,7 @@ impl PixelSink for MixedSinker<'_, Gray16> {
         // Skip RGB scratch entirely when only HSV (and optionally RGBA) is needed.
         if want_hsv && !want_rgb {
             let hsv = hsv.as_mut().unwrap();
-            gray16_to_hsv_row(
+            gray16_to_hsv_row::(
                 y_plane,
                 &mut hsv.h[one_plane_start..one_plane_end],
                 &mut hsv.s[one_plane_start..one_plane_end],
@@ -769,7 +769,7 @@ impl PixelSink for MixedSinker<'_, Gray16> {
             );
             if let Some(buf) = rgba.as_deref_mut() {
                 let rgba_row = rgba_plane_row_slice(buf, one_plane_start, one_plane_end, w, h)?;
-                gray16_to_rgba_row(y_plane, rgba_row, w, use_simd, full_range);
+                gray16_to_rgba_row::(y_plane, rgba_row, w, use_simd, full_range);
             }
             return Ok(());
         }
@@ -786,7 +786,7 @@ impl PixelSink for MixedSinker<'_, Gray16> {
             w,
             h,
         )?;
-        gray16_to_rgb_row(y_plane, rgb_row, w, use_simd, full_range);
+        gray16_to_rgb_row::(y_plane, rgb_row, w, use_simd, full_range);
 
         if let Some(hsv) = hsv.as_mut() {
             rgb_to_hsv_row(
@@ -970,7 +970,7 @@ impl PixelSink for MixedSinker<'_, Grayf32> {
 
         // luma f32 pass-through — highest priority (no clamp, no round).
         if let Some(buf) = self.luma_f32.as_deref_mut() {
-            grayf32_to_luma_f32_row(
+            grayf32_to_luma_f32_row::(
                 y_plane,
                 &mut buf[one_plane_start..one_plane_end],
                 w,
@@ -988,12 +988,12 @@ impl PixelSink for MixedSinker<'_, Grayf32> {
                 height: h,
                 channels: 3,
             })?;
-            grayf32_to_rgb_f32_row(y_plane, &mut buf[rgb_f32_start..rgb_f32_end], w, use_simd);
+            grayf32_to_rgb_f32_row::(y_plane, &mut buf[rgb_f32_start..rgb_f32_end], w, use_simd);
         }
 
         // luma u8.
         if let Some(buf) = self.luma.as_deref_mut() {
-            grayf32_to_luma_row(
+            grayf32_to_luma_row::(
                 y_plane,
                 &mut buf[one_plane_start..one_plane_end],
                 w,
@@ -1003,7 +1003,7 @@ impl PixelSink for MixedSinker<'_, Grayf32> {
 
         // luma u16.
         if let Some(buf) = self.luma_u16.as_deref_mut() {
-            grayf32_to_luma_u16_row(
+            grayf32_to_luma_u16_row::(
                 y_plane,
                 &mut buf[one_plane_start..one_plane_end],
                 w,
@@ -1019,7 +1019,7 @@ impl PixelSink for MixedSinker<'_, Grayf32> {
             let rgba_u16_buf = self.rgba_u16.as_deref_mut().unwrap();
             let rgba_u16_row =
                 rgba_u16_plane_row_slice(rgba_u16_buf, one_plane_start, one_plane_end, w, h)?;
-            grayf32_to_rgba_u16_row(y_plane, rgba_u16_row, w, use_simd);
+            grayf32_to_rgba_u16_row::(y_plane, rgba_u16_row, w, use_simd);
         } else if want_rgb_u16 {
             let rgb_u16_buf = self.rgb_u16.as_deref_mut().unwrap();
             let rgb_plane_start = one_plane_start * 3;
@@ -1032,7 +1032,7 @@ impl PixelSink for MixedSinker<'_, Grayf32> {
                 channels: 3,
             })?;
             let rgb_u16_row = &mut rgb_u16_buf[rgb_plane_start..rgb_plane_end];
-            grayf32_to_rgb_u16_row(y_plane, rgb_u16_row, w, use_simd);
+            grayf32_to_rgb_u16_row::(y_plane, rgb_u16_row, w, use_simd);
             if want_rgba_u16 {
                 let rgba_u16_buf = self.rgba_u16.as_deref_mut().unwrap();
                 let rgba_u16_row =
@@ -1050,14 +1050,14 @@ impl PixelSink for MixedSinker<'_, Grayf32> {
         if want_rgba && !want_rgb && !want_hsv {
             let rgba_buf = self.rgba.as_deref_mut().unwrap();
             let rgba_row = rgba_plane_row_slice(rgba_buf, one_plane_start, one_plane_end, w, h)?;
-            grayf32_to_rgba_row(y_plane, rgba_row, w, use_simd);
+            grayf32_to_rgba_row::(y_plane, rgba_row, w, use_simd);
             return Ok(());
         }
 
         // Standalone HSV fast path — Grayf32 always has H=0, S=0, V=clamp(Y)×255.
         if want_hsv && !want_rgb {
             let hsv = self.hsv.as_mut().unwrap();
-            grayf32_to_hsv_row(
+            grayf32_to_hsv_row::(
                 y_plane,
                 &mut hsv.h[one_plane_start..one_plane_end],
                 &mut hsv.s[one_plane_start..one_plane_end],
@@ -1067,7 +1067,7 @@ impl PixelSink for MixedSinker<'_, Grayf32> {
             );
             if let Some(buf) = self.rgba.as_deref_mut() {
                 let rgba_row = rgba_plane_row_slice(buf, one_plane_start, one_plane_end, w, h)?;
-                grayf32_to_rgba_row(y_plane, rgba_row, w, use_simd);
+                grayf32_to_rgba_row::(y_plane, rgba_row, w, use_simd);
             }
             return Ok(());
         }
@@ -1084,7 +1084,7 @@ impl PixelSink for MixedSinker<'_, Grayf32> {
             w,
             h,
        )?;
-        grayf32_to_rgb_row(y_plane, rgb_row, w, use_simd);
+        grayf32_to_rgb_row::(y_plane, rgb_row, w, use_simd);
 
         if let Some(hsv) = self.hsv.as_mut() {
             rgb_to_hsv_row(
@@ -1454,7 +1454,7 @@ impl PixelSink for MixedSinker<'_, Ya16> {
 
         // luma u8 — `Y >> 8`.
         if let Some(buf) = self.luma.as_deref_mut() {
-            ya16_to_luma_row(
+            ya16_to_luma_row::(
                 packed,
                 &mut buf[one_plane_start..one_plane_end],
                 w,
@@ -1464,7 +1464,7 @@ impl PixelSink for MixedSinker<'_, Ya16> {
 
         // luma u16 — native pass-through.
         if let Some(buf) = self.luma_u16.as_deref_mut() {
-            ya16_to_luma_u16_row(
+            ya16_to_luma_u16_row::(
                 packed,
                 &mut buf[one_plane_start..one_plane_end],
                 w,
@@ -1480,7 +1480,7 @@ impl PixelSink for MixedSinker<'_, Ya16> {
             let rgba_u16_buf = self.rgba_u16.as_deref_mut().unwrap();
             let rgba_u16_row =
                 rgba_u16_plane_row_slice(rgba_u16_buf, one_plane_start, one_plane_end, w, h)?;
-            ya16_to_rgba_u16_row(packed, rgba_u16_row, w, use_simd);
+            ya16_to_rgba_u16_row::(packed, rgba_u16_row, w, use_simd);
         } else if want_rgb_u16 {
             let rgb_u16_buf = self.rgb_u16.as_deref_mut().unwrap();
             let rgb_plane_start = one_plane_start * 3;
@@ -1493,7 +1493,7 @@ impl PixelSink for MixedSinker<'_, Ya16> {
                 channels: 3,
             })?;
             let rgb_u16_row = &mut rgb_u16_buf[rgb_plane_start..rgb_plane_end];
-            ya16_to_rgb_u16_row(packed, rgb_u16_row, w, use_simd);
+            ya16_to_rgb_u16_row::(packed, rgb_u16_row, w, use_simd);
             if want_rgba_u16 {
                 let rgba_u16_buf = self.rgba_u16.as_deref_mut().unwrap();
                 let rgba_u16_row =
@@ -1513,14 +1513,14 @@ impl PixelSink for MixedSinker<'_, Ya16> {
         if want_rgba && !want_rgb && !want_hsv {
             let rgba_buf = self.rgba.as_deref_mut().unwrap();
             let rgba_row = rgba_plane_row_slice(rgba_buf, one_plane_start, one_plane_end, w, h)?;
-            ya16_to_rgba_row(packed, rgba_row, w, use_simd);
+            ya16_to_rgba_row::(packed, rgba_row, w, use_simd);
             return Ok(());
         }
 
         // Standalone HSV fast path.
         if want_hsv && !want_rgb && !want_rgba {
             let hsv = self.hsv.as_mut().unwrap();
-            ya16_to_hsv_row(
+            ya16_to_hsv_row::(
                 packed,
                 &mut hsv.h[one_plane_start..one_plane_end],
                 &mut hsv.s[one_plane_start..one_plane_end],
@@ -1544,7 +1544,7 @@ impl PixelSink for MixedSinker<'_, Ya16> {
             w,
             h,
         )?;
-        ya16_to_rgb_row(packed, rgb_row, w, use_simd);
+        ya16_to_rgb_row::(packed, rgb_row, w, use_simd);
 
         if let Some(hsv) = self.hsv.as_mut() {
             rgb_to_hsv_row(
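The hunks above adjust how the per-row grayscale kernels are invoked: their const-generic parameters are passed explicitly at the call site via turbofish rather than left to inference. The sketch below is illustrative only; gray_n_to_rgb_row_sketch, its argument list, and the main driver are hypothetical stand-ins, not this crate's real signatures. It shows why the turbofish is needed: a const BITS: u32 parameter never appears in the argument types, so the compiler cannot infer it and the caller has to name it.

// Hypothetical, simplified stand-in for a per-row kernel: expand an N-bit
// grayscale plane to 8-bit RGB. `BITS` is a const generic, so it cannot be
// inferred from the slice arguments and must be supplied via turbofish.
fn gray_n_to_rgb_row_sketch<const BITS: u32>(y_plane: &[u16], out: &mut [u8], width: usize) {
    let shift = BITS - 8;                   // e.g. 10-bit to 8-bit drops the low 2 bits
    let mask = ((1u32 << BITS) - 1) as u16; // keep only the valid low BITS bits
    for (y, px) in y_plane[..width].iter().zip(out.chunks_exact_mut(3)) {
        let v = ((y & mask) >> shift) as u8; // replicate luma into R, G, B
        px[0] = v;
        px[1] = v;
        px[2] = v;
    }
}

fn main() {
    let y = [512u16; 8];                    // mid-gray in a 10-bit plane
    let mut rgb = [0u8; 8 * 3];
    // The const generic must be named explicitly at the call site, which is the
    // same pattern the hunks above apply to the crate's row kernels.
    gray_n_to_rgb_row_sketch::<10>(&y, &mut rgb, 8);
    assert!(rgb.iter().all(|&c| c == 128));
}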