diff --git a/src/row/arch/neon/packed_rgb.rs b/src/row/arch/neon/packed_rgb.rs index ccf26eb2..17954389 100644 --- a/src/row/arch/neon/packed_rgb.rs +++ b/src/row/arch/neon/packed_rgb.rs @@ -517,55 +517,61 @@ unsafe fn x2_extract_10bit_u16_lane(pix: uint32x4_t, shift: i32) -> uint16x4_t { /// 3. `x2rgb10` / `rgb_out` must not alias. #[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn x2rgb10_to_rgb_row(x2rgb10: &[u8], rgb_out: &mut [u8], width: usize) { +pub(crate) unsafe fn x2rgb10_to_rgb_row( + x2rgb10: &[u8], + rgb_out: &mut [u8], + width: usize, +) { debug_assert!(x2rgb10.len() >= width * 4, "x2rgb10 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); unsafe { let mut x = 0usize; - while x + 16 <= width { - let p0 = x2_load_le_u32x4(x2rgb10.as_ptr().add(x * 4)); - let p1 = x2_load_le_u32x4(x2rgb10.as_ptr().add(x * 4 + 16)); - let p2 = x2_load_le_u32x4(x2rgb10.as_ptr().add(x * 4 + 32)); - let p3 = x2_load_le_u32x4(x2rgb10.as_ptr().add(x * 4 + 48)); - - // X2RGB10: R at >>22, G at >>12, B at >>2 (top 8 of 10-bit). 
- let r_lo = vcombine_u16( - x2_extract_10bit_u8_lane(p0, 22), - x2_extract_10bit_u8_lane(p1, 22), - ); - let r_hi = vcombine_u16( - x2_extract_10bit_u8_lane(p2, 22), - x2_extract_10bit_u8_lane(p3, 22), - ); - let g_lo = vcombine_u16( - x2_extract_10bit_u8_lane(p0, 12), - x2_extract_10bit_u8_lane(p1, 12), - ); - let g_hi = vcombine_u16( - x2_extract_10bit_u8_lane(p2, 12), - x2_extract_10bit_u8_lane(p3, 12), - ); - let b_lo = vcombine_u16( - x2_extract_10bit_u8_lane(p0, 2), - x2_extract_10bit_u8_lane(p1, 2), - ); - let b_hi = vcombine_u16( - x2_extract_10bit_u8_lane(p2, 2), - x2_extract_10bit_u8_lane(p3, 2), - ); - - let r = vcombine_u8(vqmovn_u16(r_lo), vqmovn_u16(r_hi)); - let g = vcombine_u8(vqmovn_u16(g_lo), vqmovn_u16(g_hi)); - let b = vcombine_u8(vqmovn_u16(b_lo), vqmovn_u16(b_hi)); - - let rgb = uint8x16x3_t(r, g, b); - vst3q_u8(rgb_out.as_mut_ptr().add(x * 3), rgb); - - x += 16; + if !BE { + while x + 16 <= width { + let p0 = x2_load_le_u32x4(x2rgb10.as_ptr().add(x * 4)); + let p1 = x2_load_le_u32x4(x2rgb10.as_ptr().add(x * 4 + 16)); + let p2 = x2_load_le_u32x4(x2rgb10.as_ptr().add(x * 4 + 32)); + let p3 = x2_load_le_u32x4(x2rgb10.as_ptr().add(x * 4 + 48)); + + // X2RGB10: R at >>22, G at >>12, B at >>2 (top 8 of 10-bit). 
+ let r_lo = vcombine_u16( + x2_extract_10bit_u8_lane(p0, 22), + x2_extract_10bit_u8_lane(p1, 22), + ); + let r_hi = vcombine_u16( + x2_extract_10bit_u8_lane(p2, 22), + x2_extract_10bit_u8_lane(p3, 22), + ); + let g_lo = vcombine_u16( + x2_extract_10bit_u8_lane(p0, 12), + x2_extract_10bit_u8_lane(p1, 12), + ); + let g_hi = vcombine_u16( + x2_extract_10bit_u8_lane(p2, 12), + x2_extract_10bit_u8_lane(p3, 12), + ); + let b_lo = vcombine_u16( + x2_extract_10bit_u8_lane(p0, 2), + x2_extract_10bit_u8_lane(p1, 2), + ); + let b_hi = vcombine_u16( + x2_extract_10bit_u8_lane(p2, 2), + x2_extract_10bit_u8_lane(p3, 2), + ); + + let r = vcombine_u8(vqmovn_u16(r_lo), vqmovn_u16(r_hi)); + let g = vcombine_u8(vqmovn_u16(g_lo), vqmovn_u16(g_hi)); + let b = vcombine_u8(vqmovn_u16(b_lo), vqmovn_u16(b_hi)); + + let rgb = uint8x16x3_t(r, g, b); + vst3q_u8(rgb_out.as_mut_ptr().add(x * 3), rgb); + + x += 16; + } } if x < width { - scalar::x2rgb10_to_rgb_row( + scalar::x2rgb10_to_rgb_row::( &x2rgb10[x * 4..width * 4], &mut rgb_out[x * 3..width * 3], width - x, @@ -584,55 +590,61 @@ pub(crate) unsafe fn x2rgb10_to_rgb_row(x2rgb10: &[u8], rgb_out: &mut [u8], widt /// 3. `x2rgb10` / `rgba_out` must not alias. 
#[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn x2rgb10_to_rgba_row(x2rgb10: &[u8], rgba_out: &mut [u8], width: usize) { +pub(crate) unsafe fn x2rgb10_to_rgba_row( + x2rgb10: &[u8], + rgba_out: &mut [u8], + width: usize, +) { debug_assert!(x2rgb10.len() >= width * 4, "x2rgb10 row too short"); debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short"); unsafe { let alpha = vdupq_n_u8(0xFF); let mut x = 0usize; - while x + 16 <= width { - let p0 = x2_load_le_u32x4(x2rgb10.as_ptr().add(x * 4)); - let p1 = x2_load_le_u32x4(x2rgb10.as_ptr().add(x * 4 + 16)); - let p2 = x2_load_le_u32x4(x2rgb10.as_ptr().add(x * 4 + 32)); - let p3 = x2_load_le_u32x4(x2rgb10.as_ptr().add(x * 4 + 48)); - - let r_lo = vcombine_u16( - x2_extract_10bit_u8_lane(p0, 22), - x2_extract_10bit_u8_lane(p1, 22), - ); - let r_hi = vcombine_u16( - x2_extract_10bit_u8_lane(p2, 22), - x2_extract_10bit_u8_lane(p3, 22), - ); - let g_lo = vcombine_u16( - x2_extract_10bit_u8_lane(p0, 12), - x2_extract_10bit_u8_lane(p1, 12), - ); - let g_hi = vcombine_u16( - x2_extract_10bit_u8_lane(p2, 12), - x2_extract_10bit_u8_lane(p3, 12), - ); - let b_lo = vcombine_u16( - x2_extract_10bit_u8_lane(p0, 2), - x2_extract_10bit_u8_lane(p1, 2), - ); - let b_hi = vcombine_u16( - x2_extract_10bit_u8_lane(p2, 2), - x2_extract_10bit_u8_lane(p3, 2), - ); - - let r = vcombine_u8(vqmovn_u16(r_lo), vqmovn_u16(r_hi)); - let g = vcombine_u8(vqmovn_u16(g_lo), vqmovn_u16(g_hi)); - let b = vcombine_u8(vqmovn_u16(b_lo), vqmovn_u16(b_hi)); - - let rgba = uint8x16x4_t(r, g, b, alpha); - vst4q_u8(rgba_out.as_mut_ptr().add(x * 4), rgba); - - x += 16; + if !BE { + while x + 16 <= width { + let p0 = x2_load_le_u32x4(x2rgb10.as_ptr().add(x * 4)); + let p1 = x2_load_le_u32x4(x2rgb10.as_ptr().add(x * 4 + 16)); + let p2 = x2_load_le_u32x4(x2rgb10.as_ptr().add(x * 4 + 32)); + let p3 = x2_load_le_u32x4(x2rgb10.as_ptr().add(x * 4 + 48)); + + let r_lo = vcombine_u16( + x2_extract_10bit_u8_lane(p0, 22), + 
x2_extract_10bit_u8_lane(p1, 22), + ); + let r_hi = vcombine_u16( + x2_extract_10bit_u8_lane(p2, 22), + x2_extract_10bit_u8_lane(p3, 22), + ); + let g_lo = vcombine_u16( + x2_extract_10bit_u8_lane(p0, 12), + x2_extract_10bit_u8_lane(p1, 12), + ); + let g_hi = vcombine_u16( + x2_extract_10bit_u8_lane(p2, 12), + x2_extract_10bit_u8_lane(p3, 12), + ); + let b_lo = vcombine_u16( + x2_extract_10bit_u8_lane(p0, 2), + x2_extract_10bit_u8_lane(p1, 2), + ); + let b_hi = vcombine_u16( + x2_extract_10bit_u8_lane(p2, 2), + x2_extract_10bit_u8_lane(p3, 2), + ); + + let r = vcombine_u8(vqmovn_u16(r_lo), vqmovn_u16(r_hi)); + let g = vcombine_u8(vqmovn_u16(g_lo), vqmovn_u16(g_hi)); + let b = vcombine_u8(vqmovn_u16(b_lo), vqmovn_u16(b_hi)); + + let rgba = uint8x16x4_t(r, g, b, alpha); + vst4q_u8(rgba_out.as_mut_ptr().add(x * 4), rgba); + + x += 16; + } } if x < width { - scalar::x2rgb10_to_rgba_row( + scalar::x2rgb10_to_rgba_row::( &x2rgb10[x * 4..width * 4], &mut rgba_out[x * 4..width * 4], width - x, @@ -651,37 +663,43 @@ pub(crate) unsafe fn x2rgb10_to_rgba_row(x2rgb10: &[u8], rgba_out: &mut [u8], wi /// 3. `x2rgb10` / `rgb_out` must not alias. #[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn x2rgb10_to_rgb_u16_row(x2rgb10: &[u8], rgb_out: &mut [u16], width: usize) { +pub(crate) unsafe fn x2rgb10_to_rgb_u16_row( + x2rgb10: &[u8], + rgb_out: &mut [u16], + width: usize, +) { debug_assert!(x2rgb10.len() >= width * 4, "x2rgb10 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); unsafe { let mut x = 0usize; - while x + 8 <= width { - let p0 = x2_load_le_u32x4(x2rgb10.as_ptr().add(x * 4)); - let p1 = x2_load_le_u32x4(x2rgb10.as_ptr().add(x * 4 + 16)); - - // Channel low bit positions: R at 20, G at 10, B at 0. 
- let r = vcombine_u16( - x2_extract_10bit_u16_lane(p0, 20), - x2_extract_10bit_u16_lane(p1, 20), - ); - let g = vcombine_u16( - x2_extract_10bit_u16_lane(p0, 10), - x2_extract_10bit_u16_lane(p1, 10), - ); - let b = vcombine_u16( - x2_extract_10bit_u16_lane(p0, 0), - x2_extract_10bit_u16_lane(p1, 0), - ); - - let rgb = uint16x8x3_t(r, g, b); - vst3q_u16(rgb_out.as_mut_ptr().add(x * 3), rgb); - - x += 8; + if !BE { + while x + 8 <= width { + let p0 = x2_load_le_u32x4(x2rgb10.as_ptr().add(x * 4)); + let p1 = x2_load_le_u32x4(x2rgb10.as_ptr().add(x * 4 + 16)); + + // Channel low bit positions: R at 20, G at 10, B at 0. + let r = vcombine_u16( + x2_extract_10bit_u16_lane(p0, 20), + x2_extract_10bit_u16_lane(p1, 20), + ); + let g = vcombine_u16( + x2_extract_10bit_u16_lane(p0, 10), + x2_extract_10bit_u16_lane(p1, 10), + ); + let b = vcombine_u16( + x2_extract_10bit_u16_lane(p0, 0), + x2_extract_10bit_u16_lane(p1, 0), + ); + + let rgb = uint16x8x3_t(r, g, b); + vst3q_u16(rgb_out.as_mut_ptr().add(x * 3), rgb); + + x += 8; + } } if x < width { - scalar::x2rgb10_to_rgb_u16_row( + scalar::x2rgb10_to_rgb_u16_row::( &x2rgb10[x * 4..width * 4], &mut rgb_out[x * 3..width * 3], width - x, @@ -695,54 +713,60 @@ pub(crate) unsafe fn x2rgb10_to_rgb_u16_row(x2rgb10: &[u8], rgb_out: &mut [u16], /// B at >>22). 
#[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn x2bgr10_to_rgb_row(x2bgr10: &[u8], rgb_out: &mut [u8], width: usize) { +pub(crate) unsafe fn x2bgr10_to_rgb_row( + x2bgr10: &[u8], + rgb_out: &mut [u8], + width: usize, +) { debug_assert!(x2bgr10.len() >= width * 4, "x2bgr10 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); unsafe { let mut x = 0usize; - while x + 16 <= width { - let p0 = x2_load_le_u32x4(x2bgr10.as_ptr().add(x * 4)); - let p1 = x2_load_le_u32x4(x2bgr10.as_ptr().add(x * 4 + 16)); - let p2 = x2_load_le_u32x4(x2bgr10.as_ptr().add(x * 4 + 32)); - let p3 = x2_load_le_u32x4(x2bgr10.as_ptr().add(x * 4 + 48)); - - let r_lo = vcombine_u16( - x2_extract_10bit_u8_lane(p0, 2), - x2_extract_10bit_u8_lane(p1, 2), - ); - let r_hi = vcombine_u16( - x2_extract_10bit_u8_lane(p2, 2), - x2_extract_10bit_u8_lane(p3, 2), - ); - let g_lo = vcombine_u16( - x2_extract_10bit_u8_lane(p0, 12), - x2_extract_10bit_u8_lane(p1, 12), - ); - let g_hi = vcombine_u16( - x2_extract_10bit_u8_lane(p2, 12), - x2_extract_10bit_u8_lane(p3, 12), - ); - let b_lo = vcombine_u16( - x2_extract_10bit_u8_lane(p0, 22), - x2_extract_10bit_u8_lane(p1, 22), - ); - let b_hi = vcombine_u16( - x2_extract_10bit_u8_lane(p2, 22), - x2_extract_10bit_u8_lane(p3, 22), - ); - - let r = vcombine_u8(vqmovn_u16(r_lo), vqmovn_u16(r_hi)); - let g = vcombine_u8(vqmovn_u16(g_lo), vqmovn_u16(g_hi)); - let b = vcombine_u8(vqmovn_u16(b_lo), vqmovn_u16(b_hi)); - - let rgb = uint8x16x3_t(r, g, b); - vst3q_u8(rgb_out.as_mut_ptr().add(x * 3), rgb); - - x += 16; + if !BE { + while x + 16 <= width { + let p0 = x2_load_le_u32x4(x2bgr10.as_ptr().add(x * 4)); + let p1 = x2_load_le_u32x4(x2bgr10.as_ptr().add(x * 4 + 16)); + let p2 = x2_load_le_u32x4(x2bgr10.as_ptr().add(x * 4 + 32)); + let p3 = x2_load_le_u32x4(x2bgr10.as_ptr().add(x * 4 + 48)); + + let r_lo = vcombine_u16( + x2_extract_10bit_u8_lane(p0, 2), + x2_extract_10bit_u8_lane(p1, 2), + ); + let r_hi = vcombine_u16( + 
x2_extract_10bit_u8_lane(p2, 2), + x2_extract_10bit_u8_lane(p3, 2), + ); + let g_lo = vcombine_u16( + x2_extract_10bit_u8_lane(p0, 12), + x2_extract_10bit_u8_lane(p1, 12), + ); + let g_hi = vcombine_u16( + x2_extract_10bit_u8_lane(p2, 12), + x2_extract_10bit_u8_lane(p3, 12), + ); + let b_lo = vcombine_u16( + x2_extract_10bit_u8_lane(p0, 22), + x2_extract_10bit_u8_lane(p1, 22), + ); + let b_hi = vcombine_u16( + x2_extract_10bit_u8_lane(p2, 22), + x2_extract_10bit_u8_lane(p3, 22), + ); + + let r = vcombine_u8(vqmovn_u16(r_lo), vqmovn_u16(r_hi)); + let g = vcombine_u8(vqmovn_u16(g_lo), vqmovn_u16(g_hi)); + let b = vcombine_u8(vqmovn_u16(b_lo), vqmovn_u16(b_hi)); + + let rgb = uint8x16x3_t(r, g, b); + vst3q_u8(rgb_out.as_mut_ptr().add(x * 3), rgb); + + x += 16; + } } if x < width { - scalar::x2bgr10_to_rgb_row( + scalar::x2bgr10_to_rgb_row::( &x2bgr10[x * 4..width * 4], &mut rgb_out[x * 3..width * 3], width - x, @@ -754,55 +778,61 @@ pub(crate) unsafe fn x2bgr10_to_rgb_row(x2bgr10: &[u8], rgb_out: &mut [u8], widt /// NEON X2BGR10→RGBA. 16 pixels per iteration. 
#[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn x2bgr10_to_rgba_row(x2bgr10: &[u8], rgba_out: &mut [u8], width: usize) { +pub(crate) unsafe fn x2bgr10_to_rgba_row( + x2bgr10: &[u8], + rgba_out: &mut [u8], + width: usize, +) { debug_assert!(x2bgr10.len() >= width * 4, "x2bgr10 row too short"); debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short"); unsafe { let alpha = vdupq_n_u8(0xFF); let mut x = 0usize; - while x + 16 <= width { - let p0 = x2_load_le_u32x4(x2bgr10.as_ptr().add(x * 4)); - let p1 = x2_load_le_u32x4(x2bgr10.as_ptr().add(x * 4 + 16)); - let p2 = x2_load_le_u32x4(x2bgr10.as_ptr().add(x * 4 + 32)); - let p3 = x2_load_le_u32x4(x2bgr10.as_ptr().add(x * 4 + 48)); - - let r_lo = vcombine_u16( - x2_extract_10bit_u8_lane(p0, 2), - x2_extract_10bit_u8_lane(p1, 2), - ); - let r_hi = vcombine_u16( - x2_extract_10bit_u8_lane(p2, 2), - x2_extract_10bit_u8_lane(p3, 2), - ); - let g_lo = vcombine_u16( - x2_extract_10bit_u8_lane(p0, 12), - x2_extract_10bit_u8_lane(p1, 12), - ); - let g_hi = vcombine_u16( - x2_extract_10bit_u8_lane(p2, 12), - x2_extract_10bit_u8_lane(p3, 12), - ); - let b_lo = vcombine_u16( - x2_extract_10bit_u8_lane(p0, 22), - x2_extract_10bit_u8_lane(p1, 22), - ); - let b_hi = vcombine_u16( - x2_extract_10bit_u8_lane(p2, 22), - x2_extract_10bit_u8_lane(p3, 22), - ); - - let r = vcombine_u8(vqmovn_u16(r_lo), vqmovn_u16(r_hi)); - let g = vcombine_u8(vqmovn_u16(g_lo), vqmovn_u16(g_hi)); - let b = vcombine_u8(vqmovn_u16(b_lo), vqmovn_u16(b_hi)); - - let rgba = uint8x16x4_t(r, g, b, alpha); - vst4q_u8(rgba_out.as_mut_ptr().add(x * 4), rgba); - - x += 16; + if !BE { + while x + 16 <= width { + let p0 = x2_load_le_u32x4(x2bgr10.as_ptr().add(x * 4)); + let p1 = x2_load_le_u32x4(x2bgr10.as_ptr().add(x * 4 + 16)); + let p2 = x2_load_le_u32x4(x2bgr10.as_ptr().add(x * 4 + 32)); + let p3 = x2_load_le_u32x4(x2bgr10.as_ptr().add(x * 4 + 48)); + + let r_lo = vcombine_u16( + x2_extract_10bit_u8_lane(p0, 2), + 
x2_extract_10bit_u8_lane(p1, 2), + ); + let r_hi = vcombine_u16( + x2_extract_10bit_u8_lane(p2, 2), + x2_extract_10bit_u8_lane(p3, 2), + ); + let g_lo = vcombine_u16( + x2_extract_10bit_u8_lane(p0, 12), + x2_extract_10bit_u8_lane(p1, 12), + ); + let g_hi = vcombine_u16( + x2_extract_10bit_u8_lane(p2, 12), + x2_extract_10bit_u8_lane(p3, 12), + ); + let b_lo = vcombine_u16( + x2_extract_10bit_u8_lane(p0, 22), + x2_extract_10bit_u8_lane(p1, 22), + ); + let b_hi = vcombine_u16( + x2_extract_10bit_u8_lane(p2, 22), + x2_extract_10bit_u8_lane(p3, 22), + ); + + let r = vcombine_u8(vqmovn_u16(r_lo), vqmovn_u16(r_hi)); + let g = vcombine_u8(vqmovn_u16(g_lo), vqmovn_u16(g_hi)); + let b = vcombine_u8(vqmovn_u16(b_lo), vqmovn_u16(b_hi)); + + let rgba = uint8x16x4_t(r, g, b, alpha); + vst4q_u8(rgba_out.as_mut_ptr().add(x * 4), rgba); + + x += 16; + } } if x < width { - scalar::x2bgr10_to_rgba_row( + scalar::x2bgr10_to_rgba_row::( &x2bgr10[x * 4..width * 4], &mut rgba_out[x * 4..width * 4], width - x, @@ -814,37 +844,43 @@ pub(crate) unsafe fn x2bgr10_to_rgba_row(x2bgr10: &[u8], rgba_out: &mut [u8], wi /// NEON X2BGR10→u16 RGB native. 8 pixels per iteration. #[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn x2bgr10_to_rgb_u16_row(x2bgr10: &[u8], rgb_out: &mut [u16], width: usize) { +pub(crate) unsafe fn x2bgr10_to_rgb_u16_row( + x2bgr10: &[u8], + rgb_out: &mut [u16], + width: usize, +) { debug_assert!(x2bgr10.len() >= width * 4, "x2bgr10 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); unsafe { let mut x = 0usize; - while x + 8 <= width { - let p0 = x2_load_le_u32x4(x2bgr10.as_ptr().add(x * 4)); - let p1 = x2_load_le_u32x4(x2bgr10.as_ptr().add(x * 4 + 16)); - - // X2BGR10: R at low 10 bits, G at 10..19, B at 20..29. 
- let r = vcombine_u16( - x2_extract_10bit_u16_lane(p0, 0), - x2_extract_10bit_u16_lane(p1, 0), - ); - let g = vcombine_u16( - x2_extract_10bit_u16_lane(p0, 10), - x2_extract_10bit_u16_lane(p1, 10), - ); - let b = vcombine_u16( - x2_extract_10bit_u16_lane(p0, 20), - x2_extract_10bit_u16_lane(p1, 20), - ); - - let rgb = uint16x8x3_t(r, g, b); - vst3q_u16(rgb_out.as_mut_ptr().add(x * 3), rgb); - - x += 8; + if !BE { + while x + 8 <= width { + let p0 = x2_load_le_u32x4(x2bgr10.as_ptr().add(x * 4)); + let p1 = x2_load_le_u32x4(x2bgr10.as_ptr().add(x * 4 + 16)); + + // X2BGR10: R at low 10 bits, G at 10..19, B at 20..29. + let r = vcombine_u16( + x2_extract_10bit_u16_lane(p0, 0), + x2_extract_10bit_u16_lane(p1, 0), + ); + let g = vcombine_u16( + x2_extract_10bit_u16_lane(p0, 10), + x2_extract_10bit_u16_lane(p1, 10), + ); + let b = vcombine_u16( + x2_extract_10bit_u16_lane(p0, 20), + x2_extract_10bit_u16_lane(p1, 20), + ); + + let rgb = uint16x8x3_t(r, g, b); + vst3q_u16(rgb_out.as_mut_ptr().add(x * 3), rgb); + + x += 8; + } } if x < width { - scalar::x2bgr10_to_rgb_u16_row( + scalar::x2bgr10_to_rgb_u16_row::( &x2bgr10[x * 4..width * 4], &mut rgb_out[x * 3..width * 3], width - x, diff --git a/src/row/arch/neon/packed_rgb_16bit.rs b/src/row/arch/neon/packed_rgb_16bit.rs index 3370c8a9..b2a35d01 100644 --- a/src/row/arch/neon/packed_rgb_16bit.rs +++ b/src/row/arch/neon/packed_rgb_16bit.rs @@ -21,6 +21,14 @@ //! - **Rgba64 / Bgra64:** `vld4q_u16(src_ptr)` → `uint16x8x4_t(ch0, ch1, ch2, ch3)`. //! For Bgra64, `ch0` = B and `ch2` = R (swapped on store). //! +//! ## Big-endian support +//! +//! Every public kernel accepts ``. When `BE = true`, each +//! per-channel `uint16x8_t` vector produced by `vld3q_u16`/`vld4q_u16` is +//! byte-swapped via `byteswap_u16x8::` before any channel math. On LE +//! targets (all current AArch64 hardware) the helper is a no-op and emits +//! zero extra instructions. +//! //! ## Depth conversion //! //! 
- **u16 → u8:** `vshrn_n_u16::<8>(v)` — high-byte extraction, matching @@ -35,6 +43,25 @@ use core::arch::aarch64::*; use crate::row::scalar; +// ---- endian byte-swap helper ------------------------------------------------ + +/// Byte-swap every u16 lane in `v` when `BE = true`; no-op otherwise. +/// +/// Implemented as `vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(v)))`, +/// the same transform used inside `load_be_u16x8` in the NEON endian module. +/// +/// # Safety +/// +/// Caller must have NEON enabled. +#[inline(always)] +unsafe fn byteswap_u16x8(v: uint16x8_t) -> uint16x8_t { + if BE { + unsafe { vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(v))) } + } else { + v + } +} + // ============================================================================= // Rgb48 (R, G, B — 3 u16 elements per pixel) // ============================================================================= @@ -44,6 +71,8 @@ use crate::row::scalar; /// `vld3q_u16` deinterleaves into `(R, G, B)` u16x8; `vshrn_n_u16::<8>` /// narrows each channel; `vst3_u8` interleaves back. /// +/// When `BE = true` each channel vector is byte-swapped before narrowing. +/// /// # Safety /// /// 1. NEON must be available. @@ -51,7 +80,11 @@ use crate::row::scalar; /// 3. `rgb_out.len() >= width * 3`. 
#[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn neon_rgb48_to_rgb_row(rgb48: &[u16], rgb_out: &mut [u8], width: usize) { +pub(crate) unsafe fn neon_rgb48_to_rgb_row( + rgb48: &[u16], + rgb_out: &mut [u8], + width: usize, +) { debug_assert!(rgb48.len() >= width * 3, "rgb48 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); @@ -59,20 +92,22 @@ pub(crate) unsafe fn neon_rgb48_to_rgb_row(rgb48: &[u16], rgb_out: &mut [u8], wi let mut x = 0usize; while x + 8 <= width { let px: uint16x8x3_t = vld3q_u16(rgb48.as_ptr().add(x * 3)); - let r8 = vshrn_n_u16::<8>(px.0); - let g8 = vshrn_n_u16::<8>(px.1); - let b8 = vshrn_n_u16::<8>(px.2); + let r8 = vshrn_n_u16::<8>(byteswap_u16x8::(px.0)); + let g8 = vshrn_n_u16::<8>(byteswap_u16x8::(px.1)); + let b8 = vshrn_n_u16::<8>(byteswap_u16x8::(px.2)); vst3_u8(rgb_out.as_mut_ptr().add(x * 3), uint8x8x3_t(r8, g8, b8)); x += 8; } if x < width { - scalar::rgb48_to_rgb_row(&rgb48[x * 3..], &mut rgb_out[x * 3..], width - x); + scalar::rgb48_to_rgb_row::(&rgb48[x * 3..], &mut rgb_out[x * 3..], width - x); } } } /// NEON Rgb48 → packed u8 RGBA. 8 pixels per SIMD iteration. Alpha forced to 0xFF. /// +/// When `BE = true` each channel vector is byte-swapped before narrowing. +/// /// # Safety /// /// 1. NEON must be available. @@ -80,7 +115,11 @@ pub(crate) unsafe fn neon_rgb48_to_rgb_row(rgb48: &[u16], rgb_out: &mut [u8], wi /// 3. `rgba_out.len() >= width * 4`. 
#[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn neon_rgb48_to_rgba_row(rgb48: &[u16], rgba_out: &mut [u8], width: usize) { +pub(crate) unsafe fn neon_rgb48_to_rgba_row( + rgb48: &[u16], + rgba_out: &mut [u8], + width: usize, +) { debug_assert!(rgb48.len() >= width * 3, "rgb48 row too short"); debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short"); @@ -89,9 +128,9 @@ pub(crate) unsafe fn neon_rgb48_to_rgba_row(rgb48: &[u16], rgba_out: &mut [u8], let mut x = 0usize; while x + 8 <= width { let px: uint16x8x3_t = vld3q_u16(rgb48.as_ptr().add(x * 3)); - let r8 = vshrn_n_u16::<8>(px.0); - let g8 = vshrn_n_u16::<8>(px.1); - let b8 = vshrn_n_u16::<8>(px.2); + let r8 = vshrn_n_u16::<8>(byteswap_u16x8::(px.0)); + let g8 = vshrn_n_u16::<8>(byteswap_u16x8::(px.1)); + let b8 = vshrn_n_u16::<8>(byteswap_u16x8::(px.2)); vst4_u8( rgba_out.as_mut_ptr().add(x * 4), uint8x8x4_t(r8, g8, b8, alpha), @@ -99,14 +138,15 @@ pub(crate) unsafe fn neon_rgb48_to_rgba_row(rgb48: &[u16], rgba_out: &mut [u8], x += 8; } if x < width { - scalar::rgb48_to_rgba_row(&rgb48[x * 3..], &mut rgba_out[x * 4..], width - x); + scalar::rgb48_to_rgba_row::(&rgb48[x * 3..], &mut rgba_out[x * 4..], width - x); } } } -/// NEON Rgb48 → native-depth u16 RGB (identity copy). 8 pixels per SIMD iteration. +/// NEON Rgb48 → native-depth u16 RGB. 8 pixels per SIMD iteration. /// -/// `vld3q_u16` deinterleaves, `vst3q_u16` reinterleaves — no narrowing. +/// `vld3q_u16` deinterleaves, `vst3q_u16` reinterleaves. +/// When `BE = true` each channel is byte-swapped to host-native order before storing. /// /// # Safety /// @@ -115,7 +155,11 @@ pub(crate) unsafe fn neon_rgb48_to_rgba_row(rgb48: &[u16], rgba_out: &mut [u8], /// 3. `rgb_out.len() >= width * 3`. 
#[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn neon_rgb48_to_rgb_u16_row(rgb48: &[u16], rgb_out: &mut [u16], width: usize) { +pub(crate) unsafe fn neon_rgb48_to_rgb_u16_row( + rgb48: &[u16], + rgb_out: &mut [u16], + width: usize, +) { debug_assert!(rgb48.len() >= width * 3, "rgb48 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); @@ -125,18 +169,24 @@ pub(crate) unsafe fn neon_rgb48_to_rgb_u16_row(rgb48: &[u16], rgb_out: &mut [u16 let px: uint16x8x3_t = vld3q_u16(rgb48.as_ptr().add(x * 3)); vst3q_u16( rgb_out.as_mut_ptr().add(x * 3), - uint16x8x3_t(px.0, px.1, px.2), + uint16x8x3_t( + byteswap_u16x8::(px.0), + byteswap_u16x8::(px.1), + byteswap_u16x8::(px.2), + ), ); x += 8; } if x < width { - scalar::rgb48_to_rgb_u16_row(&rgb48[x * 3..], &mut rgb_out[x * 3..], width - x); + scalar::rgb48_to_rgb_u16_row::(&rgb48[x * 3..], &mut rgb_out[x * 3..], width - x); } } } /// NEON Rgb48 → native-depth u16 RGBA. 8 pixels per SIMD iteration. Alpha forced to 0xFFFF. /// +/// When `BE = true` each channel is byte-swapped to host-native order before storing. +/// /// # Safety /// /// 1. NEON must be available. @@ -144,7 +194,11 @@ pub(crate) unsafe fn neon_rgb48_to_rgb_u16_row(rgb48: &[u16], rgb_out: &mut [u16 /// 3. `rgba_out.len() >= width * 4`. 
#[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn neon_rgb48_to_rgba_u16_row(rgb48: &[u16], rgba_out: &mut [u16], width: usize) { +pub(crate) unsafe fn neon_rgb48_to_rgba_u16_row( + rgb48: &[u16], + rgba_out: &mut [u16], + width: usize, +) { debug_assert!(rgb48.len() >= width * 3, "rgb48 row too short"); debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short"); @@ -155,12 +209,17 @@ pub(crate) unsafe fn neon_rgb48_to_rgba_u16_row(rgb48: &[u16], rgba_out: &mut [u let px: uint16x8x3_t = vld3q_u16(rgb48.as_ptr().add(x * 3)); vst4q_u16( rgba_out.as_mut_ptr().add(x * 4), - uint16x8x4_t(px.0, px.1, px.2, alpha), + uint16x8x4_t( + byteswap_u16x8::(px.0), + byteswap_u16x8::(px.1), + byteswap_u16x8::(px.2), + alpha, + ), ); x += 8; } if x < width { - scalar::rgb48_to_rgba_u16_row(&rgb48[x * 3..], &mut rgba_out[x * 4..], width - x); + scalar::rgb48_to_rgba_u16_row::(&rgb48[x * 3..], &mut rgba_out[x * 4..], width - x); } } } @@ -173,6 +232,7 @@ pub(crate) unsafe fn neon_rgb48_to_rgba_u16_row(rgb48: &[u16], rgba_out: &mut [u /// /// `vld3q_u16` deinterleaves into `(B, G, R)` u16x8; channels are swapped /// (`px.2` = R, `px.0` = B) in the `vst3_u8` call to produce R-first output. +/// When `BE = true` each channel is byte-swapped before narrowing. /// /// # Safety /// @@ -181,7 +241,11 @@ pub(crate) unsafe fn neon_rgb48_to_rgba_u16_row(rgb48: &[u16], rgba_out: &mut [u /// 3. `rgb_out.len() >= width * 3`. 
#[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn neon_bgr48_to_rgb_row(bgr48: &[u16], rgb_out: &mut [u8], width: usize) { +pub(crate) unsafe fn neon_bgr48_to_rgb_row( + bgr48: &[u16], + rgb_out: &mut [u8], + width: usize, +) { debug_assert!(bgr48.len() >= width * 3, "bgr48 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); @@ -190,20 +254,21 @@ pub(crate) unsafe fn neon_bgr48_to_rgb_row(bgr48: &[u16], rgb_out: &mut [u8], wi while x + 8 <= width { // px.0 = B, px.1 = G, px.2 = R (source BGR order) let px: uint16x8x3_t = vld3q_u16(bgr48.as_ptr().add(x * 3)); - let r8 = vshrn_n_u16::<8>(px.2); // R (was at position 2) - let g8 = vshrn_n_u16::<8>(px.1); // G (unchanged) - let b8 = vshrn_n_u16::<8>(px.0); // B (was at position 0) + let r8 = vshrn_n_u16::<8>(byteswap_u16x8::(px.2)); // R (was at position 2) + let g8 = vshrn_n_u16::<8>(byteswap_u16x8::(px.1)); // G (unchanged) + let b8 = vshrn_n_u16::<8>(byteswap_u16x8::(px.0)); // B (was at position 0) vst3_u8(rgb_out.as_mut_ptr().add(x * 3), uint8x8x3_t(r8, g8, b8)); x += 8; } if x < width { - scalar::bgr48_to_rgb_row(&bgr48[x * 3..], &mut rgb_out[x * 3..], width - x); + scalar::bgr48_to_rgb_row::(&bgr48[x * 3..], &mut rgb_out[x * 3..], width - x); } } } /// NEON Bgr48 → packed u8 RGBA. 8 pixels per SIMD iteration. /// B↔R swap on output; alpha forced to 0xFF. +/// When `BE = true` each channel is byte-swapped before narrowing. /// /// # Safety /// @@ -212,7 +277,11 @@ pub(crate) unsafe fn neon_bgr48_to_rgb_row(bgr48: &[u16], rgb_out: &mut [u8], wi /// 3. `rgba_out.len() >= width * 4`. 
#[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn neon_bgr48_to_rgba_row(bgr48: &[u16], rgba_out: &mut [u8], width: usize) { +pub(crate) unsafe fn neon_bgr48_to_rgba_row( + bgr48: &[u16], + rgba_out: &mut [u8], + width: usize, +) { debug_assert!(bgr48.len() >= width * 3, "bgr48 row too short"); debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short"); @@ -221,9 +290,9 @@ pub(crate) unsafe fn neon_bgr48_to_rgba_row(bgr48: &[u16], rgba_out: &mut [u8], let mut x = 0usize; while x + 8 <= width { let px: uint16x8x3_t = vld3q_u16(bgr48.as_ptr().add(x * 3)); - let r8 = vshrn_n_u16::<8>(px.2); - let g8 = vshrn_n_u16::<8>(px.1); - let b8 = vshrn_n_u16::<8>(px.0); + let r8 = vshrn_n_u16::<8>(byteswap_u16x8::(px.2)); + let g8 = vshrn_n_u16::<8>(byteswap_u16x8::(px.1)); + let b8 = vshrn_n_u16::<8>(byteswap_u16x8::(px.0)); vst4_u8( rgba_out.as_mut_ptr().add(x * 4), uint8x8x4_t(r8, g8, b8, alpha), @@ -231,13 +300,14 @@ pub(crate) unsafe fn neon_bgr48_to_rgba_row(bgr48: &[u16], rgba_out: &mut [u8], x += 8; } if x < width { - scalar::bgr48_to_rgba_row(&bgr48[x * 3..], &mut rgba_out[x * 4..], width - x); + scalar::bgr48_to_rgba_row::(&bgr48[x * 3..], &mut rgba_out[x * 4..], width - x); } } } /// NEON Bgr48 → native-depth u16 RGB. 8 pixels per SIMD iteration. /// B↔R swap: `px.2` → position 0 (R), `px.0` → position 2 (B). +/// When `BE = true` each channel is byte-swapped to host-native order. /// /// # Safety /// @@ -246,7 +316,11 @@ pub(crate) unsafe fn neon_bgr48_to_rgba_row(bgr48: &[u16], rgba_out: &mut [u8], /// 3. `rgb_out.len() >= width * 3`. 
#[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn neon_bgr48_to_rgb_u16_row(bgr48: &[u16], rgb_out: &mut [u16], width: usize) { +pub(crate) unsafe fn neon_bgr48_to_rgb_u16_row( + bgr48: &[u16], + rgb_out: &mut [u16], + width: usize, +) { debug_assert!(bgr48.len() >= width * 3, "bgr48 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); @@ -257,18 +331,23 @@ pub(crate) unsafe fn neon_bgr48_to_rgb_u16_row(bgr48: &[u16], rgb_out: &mut [u16 // Swap B↔R: store (R=px.2, G=px.1, B=px.0) vst3q_u16( rgb_out.as_mut_ptr().add(x * 3), - uint16x8x3_t(px.2, px.1, px.0), + uint16x8x3_t( + byteswap_u16x8::(px.2), + byteswap_u16x8::(px.1), + byteswap_u16x8::(px.0), + ), ); x += 8; } if x < width { - scalar::bgr48_to_rgb_u16_row(&bgr48[x * 3..], &mut rgb_out[x * 3..], width - x); + scalar::bgr48_to_rgb_u16_row::(&bgr48[x * 3..], &mut rgb_out[x * 3..], width - x); } } } /// NEON Bgr48 → native-depth u16 RGBA. 8 pixels per SIMD iteration. /// B↔R swap; alpha forced to 0xFFFF. +/// When `BE = true` each channel is byte-swapped to host-native order. /// /// # Safety /// @@ -277,7 +356,11 @@ pub(crate) unsafe fn neon_bgr48_to_rgb_u16_row(bgr48: &[u16], rgb_out: &mut [u16 /// 3. `rgba_out.len() >= width * 4`. 
#[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn neon_bgr48_to_rgba_u16_row(bgr48: &[u16], rgba_out: &mut [u16], width: usize) { +pub(crate) unsafe fn neon_bgr48_to_rgba_u16_row( + bgr48: &[u16], + rgba_out: &mut [u16], + width: usize, +) { debug_assert!(bgr48.len() >= width * 3, "bgr48 row too short"); debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short"); @@ -289,12 +372,17 @@ pub(crate) unsafe fn neon_bgr48_to_rgba_u16_row(bgr48: &[u16], rgba_out: &mut [u // Store (R=px.2, G=px.1, B=px.0, A=0xFFFF) vst4q_u16( rgba_out.as_mut_ptr().add(x * 4), - uint16x8x4_t(px.2, px.1, px.0, alpha), + uint16x8x4_t( + byteswap_u16x8::(px.2), + byteswap_u16x8::(px.1), + byteswap_u16x8::(px.0), + alpha, + ), ); x += 8; } if x < width { - scalar::bgr48_to_rgba_u16_row(&bgr48[x * 3..], &mut rgba_out[x * 4..], width - x); + scalar::bgr48_to_rgba_u16_row::(&bgr48[x * 3..], &mut rgba_out[x * 4..], width - x); } } } @@ -307,6 +395,7 @@ pub(crate) unsafe fn neon_bgr48_to_rgba_u16_row(bgr48: &[u16], rgba_out: &mut [u /// /// `vld4q_u16` deinterleaves into `(R, G, B, A)` u16x8; R/G/B narrowed; /// `vst3_u8` writes only 3 channels. +/// When `BE = true` each channel is byte-swapped before narrowing. /// /// # Safety /// @@ -315,7 +404,11 @@ pub(crate) unsafe fn neon_bgr48_to_rgba_u16_row(bgr48: &[u16], rgba_out: &mut [u /// 3. `rgb_out.len() >= width * 3`. 
#[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn neon_rgba64_to_rgb_row(rgba64: &[u16], rgb_out: &mut [u8], width: usize) { +pub(crate) unsafe fn neon_rgba64_to_rgb_row( + rgba64: &[u16], + rgb_out: &mut [u8], + width: usize, +) { debug_assert!(rgba64.len() >= width * 4, "rgba64 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); @@ -323,15 +416,15 @@ pub(crate) unsafe fn neon_rgba64_to_rgb_row(rgba64: &[u16], rgb_out: &mut [u8], let mut x = 0usize; while x + 8 <= width { let px: uint16x8x4_t = vld4q_u16(rgba64.as_ptr().add(x * 4)); - let r8 = vshrn_n_u16::<8>(px.0); - let g8 = vshrn_n_u16::<8>(px.1); - let b8 = vshrn_n_u16::<8>(px.2); + let r8 = vshrn_n_u16::<8>(byteswap_u16x8::(px.0)); + let g8 = vshrn_n_u16::<8>(byteswap_u16x8::(px.1)); + let b8 = vshrn_n_u16::<8>(byteswap_u16x8::(px.2)); // Alpha (px.3) discarded. vst3_u8(rgb_out.as_mut_ptr().add(x * 3), uint8x8x3_t(r8, g8, b8)); x += 8; } if x < width { - scalar::rgba64_to_rgb_row(&rgba64[x * 4..], &mut rgb_out[x * 3..], width - x); + scalar::rgba64_to_rgb_row::(&rgba64[x * 4..], &mut rgb_out[x * 3..], width - x); } } } @@ -339,6 +432,7 @@ pub(crate) unsafe fn neon_rgba64_to_rgb_row(rgba64: &[u16], rgb_out: &mut [u8], /// NEON Rgba64 → packed u8 RGBA. 8 pixels per SIMD iteration. Source alpha passes through. /// /// All 4 channels narrowed via `vshrn_n_u16::<8>`. +/// When `BE = true` each channel is byte-swapped before narrowing. /// /// # Safety /// @@ -347,7 +441,11 @@ pub(crate) unsafe fn neon_rgba64_to_rgb_row(rgba64: &[u16], rgb_out: &mut [u8], /// 3. `rgba_out.len() >= width * 4`. 
#[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn neon_rgba64_to_rgba_row(rgba64: &[u16], rgba_out: &mut [u8], width: usize) { +pub(crate) unsafe fn neon_rgba64_to_rgba_row( + rgba64: &[u16], + rgba_out: &mut [u8], + width: usize, +) { debug_assert!(rgba64.len() >= width * 4, "rgba64 row too short"); debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short"); @@ -355,10 +453,10 @@ pub(crate) unsafe fn neon_rgba64_to_rgba_row(rgba64: &[u16], rgba_out: &mut [u8] let mut x = 0usize; while x + 8 <= width { let px: uint16x8x4_t = vld4q_u16(rgba64.as_ptr().add(x * 4)); - let r8 = vshrn_n_u16::<8>(px.0); - let g8 = vshrn_n_u16::<8>(px.1); - let b8 = vshrn_n_u16::<8>(px.2); - let a8 = vshrn_n_u16::<8>(px.3); // source alpha depth-converted + let r8 = vshrn_n_u16::<8>(byteswap_u16x8::(px.0)); + let g8 = vshrn_n_u16::<8>(byteswap_u16x8::(px.1)); + let b8 = vshrn_n_u16::<8>(byteswap_u16x8::(px.2)); + let a8 = vshrn_n_u16::<8>(byteswap_u16x8::(px.3)); // source alpha depth-converted vst4_u8( rgba_out.as_mut_ptr().add(x * 4), uint8x8x4_t(r8, g8, b8, a8), @@ -366,7 +464,7 @@ pub(crate) unsafe fn neon_rgba64_to_rgba_row(rgba64: &[u16], rgba_out: &mut [u8] x += 8; } if x < width { - scalar::rgba64_to_rgba_row(&rgba64[x * 4..], &mut rgba_out[x * 4..], width - x); + scalar::rgba64_to_rgba_row::(&rgba64[x * 4..], &mut rgba_out[x * 4..], width - x); } } } @@ -374,6 +472,7 @@ pub(crate) unsafe fn neon_rgba64_to_rgba_row(rgba64: &[u16], rgba_out: &mut [u8] /// NEON Rgba64 → native-depth u16 RGB. 8 pixels per SIMD iteration. Alpha discarded. /// /// `vld4q_u16` deinterleaves; `vst3q_u16` writes R, G, B channels only. +/// When `BE = true` each channel is byte-swapped to host-native order. /// /// # Safety /// @@ -382,7 +481,11 @@ pub(crate) unsafe fn neon_rgba64_to_rgba_row(rgba64: &[u16], rgba_out: &mut [u8] /// 3. `rgb_out.len() >= width * 3`. 
#[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn neon_rgba64_to_rgb_u16_row(rgba64: &[u16], rgb_out: &mut [u16], width: usize) { +pub(crate) unsafe fn neon_rgba64_to_rgb_u16_row( + rgba64: &[u16], + rgb_out: &mut [u16], + width: usize, +) { debug_assert!(rgba64.len() >= width * 4, "rgba64 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); @@ -393,19 +496,24 @@ pub(crate) unsafe fn neon_rgba64_to_rgb_u16_row(rgba64: &[u16], rgb_out: &mut [u // Alpha (px.3) discarded. vst3q_u16( rgb_out.as_mut_ptr().add(x * 3), - uint16x8x3_t(px.0, px.1, px.2), + uint16x8x3_t( + byteswap_u16x8::(px.0), + byteswap_u16x8::(px.1), + byteswap_u16x8::(px.2), + ), ); x += 8; } if x < width { - scalar::rgba64_to_rgb_u16_row(&rgba64[x * 4..], &mut rgb_out[x * 3..], width - x); + scalar::rgba64_to_rgb_u16_row::(&rgba64[x * 4..], &mut rgb_out[x * 3..], width - x); } } } -/// NEON Rgba64 → native-depth u16 RGBA (identity copy). 8 pixels per SIMD iteration. +/// NEON Rgba64 → native-depth u16 RGBA. 8 pixels per SIMD iteration. /// /// `vld4q_u16` deinterleaves; `vst4q_u16` reinterleaves — source alpha preserved. +/// When `BE = true` each channel is byte-swapped to host-native order. /// /// # Safety /// @@ -414,7 +522,7 @@ pub(crate) unsafe fn neon_rgba64_to_rgb_u16_row(rgba64: &[u16], rgb_out: &mut [u /// 3. `rgba_out.len() >= width * 4`. 
#[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn neon_rgba64_to_rgba_u16_row( +pub(crate) unsafe fn neon_rgba64_to_rgba_u16_row( rgba64: &[u16], rgba_out: &mut [u16], width: usize, @@ -428,12 +536,17 @@ pub(crate) unsafe fn neon_rgba64_to_rgba_u16_row( let px: uint16x8x4_t = vld4q_u16(rgba64.as_ptr().add(x * 4)); vst4q_u16( rgba_out.as_mut_ptr().add(x * 4), - uint16x8x4_t(px.0, px.1, px.2, px.3), + uint16x8x4_t( + byteswap_u16x8::(px.0), + byteswap_u16x8::(px.1), + byteswap_u16x8::(px.2), + byteswap_u16x8::(px.3), + ), ); x += 8; } if x < width { - scalar::rgba64_to_rgba_u16_row(&rgba64[x * 4..], &mut rgba_out[x * 4..], width - x); + scalar::rgba64_to_rgba_u16_row::(&rgba64[x * 4..], &mut rgba_out[x * 4..], width - x); } } } @@ -446,6 +559,7 @@ pub(crate) unsafe fn neon_rgba64_to_rgba_u16_row( /// B↔R swap; alpha discarded. /// /// `vld4q_u16` gives `(B, G, R, A)` → store `(R=px.2, G=px.1, B=px.0)`. +/// When `BE = true` each channel is byte-swapped before narrowing. /// /// # Safety /// @@ -454,7 +568,11 @@ pub(crate) unsafe fn neon_rgba64_to_rgba_u16_row( /// 3. `rgb_out.len() >= width * 3`. 
#[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn neon_bgra64_to_rgb_row(bgra64: &[u16], rgb_out: &mut [u8], width: usize) { +pub(crate) unsafe fn neon_bgra64_to_rgb_row( + bgra64: &[u16], + rgb_out: &mut [u8], + width: usize, +) { debug_assert!(bgra64.len() >= width * 4, "bgra64 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); @@ -463,21 +581,22 @@ pub(crate) unsafe fn neon_bgra64_to_rgb_row(bgra64: &[u16], rgb_out: &mut [u8], while x + 8 <= width { // px.0 = B, px.1 = G, px.2 = R, px.3 = A let px: uint16x8x4_t = vld4q_u16(bgra64.as_ptr().add(x * 4)); - let r8 = vshrn_n_u16::<8>(px.2); // R (from position 2) - let g8 = vshrn_n_u16::<8>(px.1); // G (unchanged) - let b8 = vshrn_n_u16::<8>(px.0); // B (from position 0) + let r8 = vshrn_n_u16::<8>(byteswap_u16x8::(px.2)); // R (from position 2) + let g8 = vshrn_n_u16::<8>(byteswap_u16x8::(px.1)); // G (unchanged) + let b8 = vshrn_n_u16::<8>(byteswap_u16x8::(px.0)); // B (from position 0) // Alpha (px.3) discarded. vst3_u8(rgb_out.as_mut_ptr().add(x * 3), uint8x8x3_t(r8, g8, b8)); x += 8; } if x < width { - scalar::bgra64_to_rgb_row(&bgra64[x * 4..], &mut rgb_out[x * 3..], width - x); + scalar::bgra64_to_rgb_row::(&bgra64[x * 4..], &mut rgb_out[x * 3..], width - x); } } } /// NEON Bgra64 → packed u8 RGBA. 8 pixels per SIMD iteration. /// B↔R swap; source alpha passes through (narrowed via `>> 8`). +/// When `BE = true` each channel is byte-swapped before narrowing. /// /// # Safety /// @@ -486,7 +605,11 @@ pub(crate) unsafe fn neon_bgra64_to_rgb_row(bgra64: &[u16], rgb_out: &mut [u8], /// 3. `rgba_out.len() >= width * 4`. 
#[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn neon_bgra64_to_rgba_row(bgra64: &[u16], rgba_out: &mut [u8], width: usize) { +pub(crate) unsafe fn neon_bgra64_to_rgba_row( + bgra64: &[u16], + rgba_out: &mut [u8], + width: usize, +) { debug_assert!(bgra64.len() >= width * 4, "bgra64 row too short"); debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short"); @@ -494,10 +617,10 @@ pub(crate) unsafe fn neon_bgra64_to_rgba_row(bgra64: &[u16], rgba_out: &mut [u8] let mut x = 0usize; while x + 8 <= width { let px: uint16x8x4_t = vld4q_u16(bgra64.as_ptr().add(x * 4)); - let r8 = vshrn_n_u16::<8>(px.2); - let g8 = vshrn_n_u16::<8>(px.1); - let b8 = vshrn_n_u16::<8>(px.0); - let a8 = vshrn_n_u16::<8>(px.3); // source alpha depth-converted + let r8 = vshrn_n_u16::<8>(byteswap_u16x8::(px.2)); + let g8 = vshrn_n_u16::<8>(byteswap_u16x8::(px.1)); + let b8 = vshrn_n_u16::<8>(byteswap_u16x8::(px.0)); + let a8 = vshrn_n_u16::<8>(byteswap_u16x8::(px.3)); // source alpha depth-converted vst4_u8( rgba_out.as_mut_ptr().add(x * 4), uint8x8x4_t(r8, g8, b8, a8), @@ -505,13 +628,14 @@ pub(crate) unsafe fn neon_bgra64_to_rgba_row(bgra64: &[u16], rgba_out: &mut [u8] x += 8; } if x < width { - scalar::bgra64_to_rgba_row(&bgra64[x * 4..], &mut rgba_out[x * 4..], width - x); + scalar::bgra64_to_rgba_row::(&bgra64[x * 4..], &mut rgba_out[x * 4..], width - x); } } } /// NEON Bgra64 → native-depth u16 RGB. 8 pixels per SIMD iteration. /// B↔R swap; alpha discarded. `vld4q_u16` → `vst3q_u16(R, G, B)`. +/// When `BE = true` each channel is byte-swapped to host-native order. /// /// # Safety /// @@ -520,7 +644,11 @@ pub(crate) unsafe fn neon_bgra64_to_rgba_row(bgra64: &[u16], rgba_out: &mut [u8] /// 3. `rgb_out.len() >= width * 3`. 
#[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn neon_bgra64_to_rgb_u16_row(bgra64: &[u16], rgb_out: &mut [u16], width: usize) { +pub(crate) unsafe fn neon_bgra64_to_rgb_u16_row( + bgra64: &[u16], + rgb_out: &mut [u16], + width: usize, +) { debug_assert!(bgra64.len() >= width * 4, "bgra64 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); @@ -531,12 +659,16 @@ pub(crate) unsafe fn neon_bgra64_to_rgb_u16_row(bgra64: &[u16], rgb_out: &mut [u // Swap B↔R, drop alpha: store (R=px.2, G=px.1, B=px.0) vst3q_u16( rgb_out.as_mut_ptr().add(x * 3), - uint16x8x3_t(px.2, px.1, px.0), + uint16x8x3_t( + byteswap_u16x8::(px.2), + byteswap_u16x8::(px.1), + byteswap_u16x8::(px.0), + ), ); x += 8; } if x < width { - scalar::bgra64_to_rgb_u16_row(&bgra64[x * 4..], &mut rgb_out[x * 3..], width - x); + scalar::bgra64_to_rgb_u16_row::(&bgra64[x * 4..], &mut rgb_out[x * 3..], width - x); } } } @@ -545,6 +677,7 @@ pub(crate) unsafe fn neon_bgra64_to_rgb_u16_row(bgra64: &[u16], rgb_out: &mut [u /// B↔R swap; source alpha preserved at position 3. /// /// `vld4q_u16` gives `(B, G, R, A)` → `vst4q_u16(R=px.2, G=px.1, B=px.0, A=px.3)`. +/// When `BE = true` each channel is byte-swapped to host-native order. /// /// # Safety /// @@ -553,7 +686,7 @@ pub(crate) unsafe fn neon_bgra64_to_rgb_u16_row(bgra64: &[u16], rgb_out: &mut [u /// 3. `rgba_out.len() >= width * 4`. 
#[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn neon_bgra64_to_rgba_u16_row( +pub(crate) unsafe fn neon_bgra64_to_rgba_u16_row( bgra64: &[u16], rgba_out: &mut [u16], width: usize, @@ -568,12 +701,17 @@ pub(crate) unsafe fn neon_bgra64_to_rgba_u16_row( // Swap B↔R, preserve A: store (R=px.2, G=px.1, B=px.0, A=px.3) vst4q_u16( rgba_out.as_mut_ptr().add(x * 4), - uint16x8x4_t(px.2, px.1, px.0, px.3), + uint16x8x4_t( + byteswap_u16x8::(px.2), + byteswap_u16x8::(px.1), + byteswap_u16x8::(px.0), + byteswap_u16x8::(px.3), + ), ); x += 8; } if x < width { - scalar::bgra64_to_rgba_u16_row(&bgra64[x * 4..], &mut rgba_out[x * 4..], width - x); + scalar::bgra64_to_rgba_u16_row::(&bgra64[x * 4..], &mut rgba_out[x * 4..], width - x); } } } diff --git a/src/row/arch/neon/tests/packed_rgb.rs b/src/row/arch/neon/tests/packed_rgb.rs index 7e5ace29..8f597259 100644 --- a/src/row/arch/neon/tests/packed_rgb.rs +++ b/src/row/arch/neon/tests/packed_rgb.rs @@ -261,9 +261,9 @@ fn x2rgb10_to_rgb_neon_matches_scalar_widths() { let input = pseudo_random_rgba(w); let mut out_scalar = std::vec![0u8; w * 3]; let mut out_neon = std::vec![0u8; w * 3]; - scalar::x2rgb10_to_rgb_row(&input, &mut out_scalar, w); + scalar::x2rgb10_to_rgb_row::(&input, &mut out_scalar, w); unsafe { - x2rgb10_to_rgb_row(&input, &mut out_neon, w); + x2rgb10_to_rgb_row::(&input, &mut out_neon, w); } assert_eq!(out_scalar, out_neon, "width {w}"); } @@ -276,9 +276,9 @@ fn x2rgb10_to_rgba_neon_matches_scalar_widths() { let input = pseudo_random_rgba(w); let mut out_scalar = std::vec![0u8; w * 4]; let mut out_neon = std::vec![0u8; w * 4]; - scalar::x2rgb10_to_rgba_row(&input, &mut out_scalar, w); + scalar::x2rgb10_to_rgba_row::(&input, &mut out_scalar, w); unsafe { - x2rgb10_to_rgba_row(&input, &mut out_neon, w); + x2rgb10_to_rgba_row::(&input, &mut out_neon, w); } assert_eq!(out_scalar, out_neon, "width {w}"); } @@ -291,9 +291,9 @@ fn x2rgb10_to_rgb_u16_neon_matches_scalar_widths() { let input = 
pseudo_random_rgba(w); let mut out_scalar = std::vec![0u16; w * 3]; let mut out_neon = std::vec![0u16; w * 3]; - scalar::x2rgb10_to_rgb_u16_row(&input, &mut out_scalar, w); + scalar::x2rgb10_to_rgb_u16_row::(&input, &mut out_scalar, w); unsafe { - x2rgb10_to_rgb_u16_row(&input, &mut out_neon, w); + x2rgb10_to_rgb_u16_row::(&input, &mut out_neon, w); } assert_eq!(out_scalar, out_neon, "width {w}"); } @@ -306,9 +306,9 @@ fn x2bgr10_to_rgb_neon_matches_scalar_widths() { let input = pseudo_random_rgba(w); let mut out_scalar = std::vec![0u8; w * 3]; let mut out_neon = std::vec![0u8; w * 3]; - scalar::x2bgr10_to_rgb_row(&input, &mut out_scalar, w); + scalar::x2bgr10_to_rgb_row::(&input, &mut out_scalar, w); unsafe { - x2bgr10_to_rgb_row(&input, &mut out_neon, w); + x2bgr10_to_rgb_row::(&input, &mut out_neon, w); } assert_eq!(out_scalar, out_neon, "width {w}"); } @@ -321,9 +321,9 @@ fn x2bgr10_to_rgba_neon_matches_scalar_widths() { let input = pseudo_random_rgba(w); let mut out_scalar = std::vec![0u8; w * 4]; let mut out_neon = std::vec![0u8; w * 4]; - scalar::x2bgr10_to_rgba_row(&input, &mut out_scalar, w); + scalar::x2bgr10_to_rgba_row::(&input, &mut out_scalar, w); unsafe { - x2bgr10_to_rgba_row(&input, &mut out_neon, w); + x2bgr10_to_rgba_row::(&input, &mut out_neon, w); } assert_eq!(out_scalar, out_neon, "width {w}"); } @@ -336,9 +336,9 @@ fn x2bgr10_to_rgb_u16_neon_matches_scalar_widths() { let input = pseudo_random_rgba(w); let mut out_scalar = std::vec![0u16; w * 3]; let mut out_neon = std::vec![0u16; w * 3]; - scalar::x2bgr10_to_rgb_u16_row(&input, &mut out_scalar, w); + scalar::x2bgr10_to_rgb_u16_row::(&input, &mut out_scalar, w); unsafe { - x2bgr10_to_rgb_u16_row(&input, &mut out_neon, w); + x2bgr10_to_rgb_u16_row::(&input, &mut out_neon, w); } assert_eq!(out_scalar, out_neon, "width {w}"); } diff --git a/src/row/arch/neon/tests/packed_rgb_16bit.rs b/src/row/arch/neon/tests/packed_rgb_16bit.rs index ad38e7b1..c71131d7 100644 --- 
a/src/row/arch/neon/tests/packed_rgb_16bit.rs +++ b/src/row/arch/neon/tests/packed_rgb_16bit.rs @@ -33,8 +33,8 @@ fn neon_rgb48_to_rgb_matches_scalar_width17() { let src = make_rgb48_src(17, 0x0101); let mut simd_out = std::vec![0u8; 17 * 3]; let mut scalar_out = std::vec![0u8; 17 * 3]; - unsafe { neon_rgb48_to_rgb_row(&src, &mut simd_out, 17) }; - scalar::rgb48_to_rgb_row(&src, &mut scalar_out, 17); + unsafe { neon_rgb48_to_rgb_row::(&src, &mut simd_out, 17) }; + scalar::rgb48_to_rgb_row::(&src, &mut scalar_out, 17); assert_eq!(simd_out, scalar_out, "rgb48→rgb: SIMD vs scalar mismatch"); } @@ -45,8 +45,8 @@ fn neon_rgb48_to_rgba_matches_scalar_width17() { let src = make_rgb48_src(17, 0x0303); let mut simd_out = std::vec![0u8; 17 * 4]; let mut scalar_out = std::vec![0u8; 17 * 4]; - unsafe { neon_rgb48_to_rgba_row(&src, &mut simd_out, 17) }; - scalar::rgb48_to_rgba_row(&src, &mut scalar_out, 17); + unsafe { neon_rgb48_to_rgba_row::(&src, &mut simd_out, 17) }; + scalar::rgb48_to_rgba_row::(&src, &mut scalar_out, 17); assert_eq!(simd_out, scalar_out, "rgb48→rgba: SIMD vs scalar mismatch"); } @@ -57,8 +57,8 @@ fn neon_rgb48_to_rgb_u16_matches_scalar_width17() { let src = make_rgb48_src(17, 0x0505); let mut simd_out = std::vec![0u16; 17 * 3]; let mut scalar_out = std::vec![0u16; 17 * 3]; - unsafe { neon_rgb48_to_rgb_u16_row(&src, &mut simd_out, 17) }; - scalar::rgb48_to_rgb_u16_row(&src, &mut scalar_out, 17); + unsafe { neon_rgb48_to_rgb_u16_row::(&src, &mut simd_out, 17) }; + scalar::rgb48_to_rgb_u16_row::(&src, &mut scalar_out, 17); assert_eq!( simd_out, scalar_out, "rgb48→rgb_u16: SIMD vs scalar mismatch" @@ -72,8 +72,8 @@ fn neon_rgb48_to_rgba_u16_matches_scalar_width17() { let src = make_rgb48_src(17, 0x0707); let mut simd_out = std::vec![0u16; 17 * 4]; let mut scalar_out = std::vec![0u16; 17 * 4]; - unsafe { neon_rgb48_to_rgba_u16_row(&src, &mut simd_out, 17) }; - scalar::rgb48_to_rgba_u16_row(&src, &mut scalar_out, 17); + unsafe { 
neon_rgb48_to_rgba_u16_row::(&src, &mut simd_out, 17) }; + scalar::rgb48_to_rgba_u16_row::(&src, &mut scalar_out, 17); assert_eq!( simd_out, scalar_out, "rgb48→rgba_u16: SIMD vs scalar mismatch" @@ -91,8 +91,8 @@ fn neon_bgr48_to_rgb_matches_scalar_width17() { let src = make_rgb48_src(17, 0x1111); let mut simd_out = std::vec![0u8; 17 * 3]; let mut scalar_out = std::vec![0u8; 17 * 3]; - unsafe { neon_bgr48_to_rgb_row(&src, &mut simd_out, 17) }; - scalar::bgr48_to_rgb_row(&src, &mut scalar_out, 17); + unsafe { neon_bgr48_to_rgb_row::(&src, &mut simd_out, 17) }; + scalar::bgr48_to_rgb_row::(&src, &mut scalar_out, 17); assert_eq!(simd_out, scalar_out, "bgr48→rgb: SIMD vs scalar mismatch"); } @@ -103,8 +103,8 @@ fn neon_bgr48_to_rgba_matches_scalar_width17() { let src = make_rgb48_src(17, 0x2222); let mut simd_out = std::vec![0u8; 17 * 4]; let mut scalar_out = std::vec![0u8; 17 * 4]; - unsafe { neon_bgr48_to_rgba_row(&src, &mut simd_out, 17) }; - scalar::bgr48_to_rgba_row(&src, &mut scalar_out, 17); + unsafe { neon_bgr48_to_rgba_row::(&src, &mut simd_out, 17) }; + scalar::bgr48_to_rgba_row::(&src, &mut scalar_out, 17); assert_eq!(simd_out, scalar_out, "bgr48→rgba: SIMD vs scalar mismatch"); } @@ -115,8 +115,8 @@ fn neon_bgr48_to_rgb_u16_matches_scalar_width17() { let src = make_rgb48_src(17, 0x3333); let mut simd_out = std::vec![0u16; 17 * 3]; let mut scalar_out = std::vec![0u16; 17 * 3]; - unsafe { neon_bgr48_to_rgb_u16_row(&src, &mut simd_out, 17) }; - scalar::bgr48_to_rgb_u16_row(&src, &mut scalar_out, 17); + unsafe { neon_bgr48_to_rgb_u16_row::(&src, &mut simd_out, 17) }; + scalar::bgr48_to_rgb_u16_row::(&src, &mut scalar_out, 17); assert_eq!( simd_out, scalar_out, "bgr48→rgb_u16: SIMD vs scalar mismatch" @@ -130,8 +130,8 @@ fn neon_bgr48_to_rgba_u16_matches_scalar_width17() { let src = make_rgb48_src(17, 0x4444); let mut simd_out = std::vec![0u16; 17 * 4]; let mut scalar_out = std::vec![0u16; 17 * 4]; - unsafe { neon_bgr48_to_rgba_u16_row(&src, &mut simd_out, 17) 
}; - scalar::bgr48_to_rgba_u16_row(&src, &mut scalar_out, 17); + unsafe { neon_bgr48_to_rgba_u16_row::(&src, &mut simd_out, 17) }; + scalar::bgr48_to_rgba_u16_row::(&src, &mut scalar_out, 17); assert_eq!( simd_out, scalar_out, "bgr48→rgba_u16: SIMD vs scalar mismatch" @@ -149,8 +149,8 @@ fn neon_rgba64_to_rgb_matches_scalar_width17() { let src = make_rgba64_src(17, 0xAAAA); let mut simd_out = std::vec![0u8; 17 * 3]; let mut scalar_out = std::vec![0u8; 17 * 3]; - unsafe { neon_rgba64_to_rgb_row(&src, &mut simd_out, 17) }; - scalar::rgba64_to_rgb_row(&src, &mut scalar_out, 17); + unsafe { neon_rgba64_to_rgb_row::(&src, &mut simd_out, 17) }; + scalar::rgba64_to_rgb_row::(&src, &mut scalar_out, 17); assert_eq!(simd_out, scalar_out, "rgba64→rgb: SIMD vs scalar mismatch"); } @@ -161,8 +161,8 @@ fn neon_rgba64_to_rgba_matches_scalar_width17() { let src = make_rgba64_src(17, 0xBBBB); let mut simd_out = std::vec![0u8; 17 * 4]; let mut scalar_out = std::vec![0u8; 17 * 4]; - unsafe { neon_rgba64_to_rgba_row(&src, &mut simd_out, 17) }; - scalar::rgba64_to_rgba_row(&src, &mut scalar_out, 17); + unsafe { neon_rgba64_to_rgba_row::(&src, &mut simd_out, 17) }; + scalar::rgba64_to_rgba_row::(&src, &mut scalar_out, 17); assert_eq!(simd_out, scalar_out, "rgba64→rgba: SIMD vs scalar mismatch"); } @@ -173,8 +173,8 @@ fn neon_rgba64_to_rgb_u16_matches_scalar_width17() { let src = make_rgba64_src(17, 0xCCCC); let mut simd_out = std::vec![0u16; 17 * 3]; let mut scalar_out = std::vec![0u16; 17 * 3]; - unsafe { neon_rgba64_to_rgb_u16_row(&src, &mut simd_out, 17) }; - scalar::rgba64_to_rgb_u16_row(&src, &mut scalar_out, 17); + unsafe { neon_rgba64_to_rgb_u16_row::(&src, &mut simd_out, 17) }; + scalar::rgba64_to_rgb_u16_row::(&src, &mut scalar_out, 17); assert_eq!( simd_out, scalar_out, "rgba64→rgb_u16: SIMD vs scalar mismatch" @@ -188,8 +188,8 @@ fn neon_rgba64_to_rgba_u16_matches_scalar_width17() { let src = make_rgba64_src(17, 0xDDDD); let mut simd_out = std::vec![0u16; 17 * 4]; let mut 
scalar_out = std::vec![0u16; 17 * 4]; - unsafe { neon_rgba64_to_rgba_u16_row(&src, &mut simd_out, 17) }; - scalar::rgba64_to_rgba_u16_row(&src, &mut scalar_out, 17); + unsafe { neon_rgba64_to_rgba_u16_row::(&src, &mut simd_out, 17) }; + scalar::rgba64_to_rgba_u16_row::(&src, &mut scalar_out, 17); assert_eq!( simd_out, scalar_out, "rgba64→rgba_u16: SIMD vs scalar mismatch" @@ -207,8 +207,8 @@ fn neon_bgra64_to_rgb_matches_scalar_width17() { let src = make_rgba64_src(17, 0x1234); let mut simd_out = std::vec![0u8; 17 * 3]; let mut scalar_out = std::vec![0u8; 17 * 3]; - unsafe { neon_bgra64_to_rgb_row(&src, &mut simd_out, 17) }; - scalar::bgra64_to_rgb_row(&src, &mut scalar_out, 17); + unsafe { neon_bgra64_to_rgb_row::(&src, &mut simd_out, 17) }; + scalar::bgra64_to_rgb_row::(&src, &mut scalar_out, 17); assert_eq!(simd_out, scalar_out, "bgra64→rgb: SIMD vs scalar mismatch"); } @@ -219,8 +219,8 @@ fn neon_bgra64_to_rgba_matches_scalar_width17() { let src = make_rgba64_src(17, 0x5678); let mut simd_out = std::vec![0u8; 17 * 4]; let mut scalar_out = std::vec![0u8; 17 * 4]; - unsafe { neon_bgra64_to_rgba_row(&src, &mut simd_out, 17) }; - scalar::bgra64_to_rgba_row(&src, &mut scalar_out, 17); + unsafe { neon_bgra64_to_rgba_row::(&src, &mut simd_out, 17) }; + scalar::bgra64_to_rgba_row::(&src, &mut scalar_out, 17); assert_eq!(simd_out, scalar_out, "bgra64→rgba: SIMD vs scalar mismatch"); } @@ -231,8 +231,8 @@ fn neon_bgra64_to_rgb_u16_matches_scalar_width17() { let src = make_rgba64_src(17, 0x9ABC); let mut simd_out = std::vec![0u16; 17 * 3]; let mut scalar_out = std::vec![0u16; 17 * 3]; - unsafe { neon_bgra64_to_rgb_u16_row(&src, &mut simd_out, 17) }; - scalar::bgra64_to_rgb_u16_row(&src, &mut scalar_out, 17); + unsafe { neon_bgra64_to_rgb_u16_row::(&src, &mut simd_out, 17) }; + scalar::bgra64_to_rgb_u16_row::(&src, &mut scalar_out, 17); assert_eq!( simd_out, scalar_out, "bgra64→rgb_u16: SIMD vs scalar mismatch" @@ -246,8 +246,8 @@ fn 
neon_bgra64_to_rgba_u16_matches_scalar_width17() { let src = make_rgba64_src(17, 0xDEF0); let mut simd_out = std::vec![0u16; 17 * 4]; let mut scalar_out = std::vec![0u16; 17 * 4]; - unsafe { neon_bgra64_to_rgba_u16_row(&src, &mut simd_out, 17) }; - scalar::bgra64_to_rgba_u16_row(&src, &mut scalar_out, 17); + unsafe { neon_bgra64_to_rgba_u16_row::(&src, &mut simd_out, 17) }; + scalar::bgra64_to_rgba_u16_row::(&src, &mut scalar_out, 17); assert_eq!( simd_out, scalar_out, "bgra64→rgba_u16: SIMD vs scalar mismatch" @@ -265,8 +265,8 @@ fn neon_rgb48_to_rgb_exact8_matches_scalar() { let src = make_rgb48_src(8, 0xF0F0); let mut simd_out = std::vec![0u8; 8 * 3]; let mut scalar_out = std::vec![0u8; 8 * 3]; - unsafe { neon_rgb48_to_rgb_row(&src, &mut simd_out, 8) }; - scalar::rgb48_to_rgb_row(&src, &mut scalar_out, 8); + unsafe { neon_rgb48_to_rgb_row::(&src, &mut simd_out, 8) }; + scalar::rgb48_to_rgb_row::(&src, &mut scalar_out, 8); assert_eq!( simd_out, scalar_out, "rgb48→rgb exact-8: SIMD vs scalar mismatch" @@ -280,8 +280,8 @@ fn neon_rgba64_to_rgba_exact8_matches_scalar() { let src = make_rgba64_src(8, 0x0F0F); let mut simd_out = std::vec![0u8; 8 * 4]; let mut scalar_out = std::vec![0u8; 8 * 4]; - unsafe { neon_rgba64_to_rgba_row(&src, &mut simd_out, 8) }; - scalar::rgba64_to_rgba_row(&src, &mut scalar_out, 8); + unsafe { neon_rgba64_to_rgba_row::(&src, &mut simd_out, 8) }; + scalar::rgba64_to_rgba_row::(&src, &mut scalar_out, 8); assert_eq!( simd_out, scalar_out, "rgba64→rgba exact-8: SIMD vs scalar mismatch" @@ -299,8 +299,8 @@ fn neon_rgb48_to_rgb_width1_scalar_tail_only() { let src = [0x1234u16, 0x5678, 0x9ABC]; let mut simd_out = [0u8; 3]; let mut scalar_out = [0u8; 3]; - unsafe { neon_rgb48_to_rgb_row(&src, &mut simd_out, 1) }; - scalar::rgb48_to_rgb_row(&src, &mut scalar_out, 1); + unsafe { neon_rgb48_to_rgb_row::(&src, &mut simd_out, 1) }; + scalar::rgb48_to_rgb_row::(&src, &mut scalar_out, 1); assert_eq!( simd_out, scalar_out, "rgb48→rgb width=1: tail-only 
mismatch" @@ -314,8 +314,8 @@ fn neon_bgra64_to_rgba_u16_width1_scalar_tail_only() { let src = [0x1111u16, 0x2222, 0x3333, 0x4444]; // B, G, R, A let mut simd_out = [0u16; 4]; let mut scalar_out = [0u16; 4]; - unsafe { neon_bgra64_to_rgba_u16_row(&src, &mut simd_out, 1) }; - scalar::bgra64_to_rgba_u16_row(&src, &mut scalar_out, 1); + unsafe { neon_bgra64_to_rgba_u16_row::(&src, &mut simd_out, 1) }; + scalar::bgra64_to_rgba_u16_row::(&src, &mut scalar_out, 1); assert_eq!( simd_out, scalar_out, "bgra64→rgba_u16 width=1: tail-only mismatch" diff --git a/src/row/arch/wasm_simd128/packed_rgb.rs b/src/row/arch/wasm_simd128/packed_rgb.rs index 49d1edb9..53644ab0 100644 --- a/src/row/arch/wasm_simd128/packed_rgb.rs +++ b/src/row/arch/wasm_simd128/packed_rgb.rs @@ -623,99 +623,105 @@ pub(crate) unsafe fn bgrx_to_rgba_row(bgrx: &[u8], rgba_out: &mut [u8], width: u /// 3. `x2rgb10` / `rgb_out` must not alias. #[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn x2rgb10_to_rgb_row(x2rgb10: &[u8], rgb_out: &mut [u8], width: usize) { +pub(crate) unsafe fn x2rgb10_to_rgb_row( + x2rgb10: &[u8], + rgb_out: &mut [u8], + width: usize, +) { debug_assert!(x2rgb10.len() >= width * 4, "x2rgb10 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); unsafe { let mask_3ff = u32x4_splat(0x3FF); let mut x = 0usize; - while x + 16 <= width { - let p0 = v128_load(x2rgb10.as_ptr().add(x * 4).cast()); - let p1 = v128_load(x2rgb10.as_ptr().add(x * 4 + 16).cast()); - let p2 = v128_load(x2rgb10.as_ptr().add(x * 4 + 32).cast()); - let p3 = v128_load(x2rgb10.as_ptr().add(x * 4 + 48).cast()); - - // Extract 10-bit channels as u32x4 (low 10 bits set per lane). - // X2RGB10: R at >>20, G at >>10, B at >>0. 
- let r0 = v128_and(u32x4_shr(p0, 20), mask_3ff); - let r1 = v128_and(u32x4_shr(p1, 20), mask_3ff); - let r2 = v128_and(u32x4_shr(p2, 20), mask_3ff); - let r3 = v128_and(u32x4_shr(p3, 20), mask_3ff); - let g0 = v128_and(u32x4_shr(p0, 10), mask_3ff); - let g1 = v128_and(u32x4_shr(p1, 10), mask_3ff); - let g2 = v128_and(u32x4_shr(p2, 10), mask_3ff); - let g3 = v128_and(u32x4_shr(p3, 10), mask_3ff); - let b0 = v128_and(p0, mask_3ff); - let b1 = v128_and(p1, mask_3ff); - let b2 = v128_and(p2, mask_3ff); - let b3 = v128_and(p3, mask_3ff); - - // Down-shift 10-bit → 8-bit. - let r0_u8 = u32x4_shr(r0, 2); - let r1_u8 = u32x4_shr(r1, 2); - let r2_u8 = u32x4_shr(r2, 2); - let r3_u8 = u32x4_shr(r3, 2); - let g0_u8 = u32x4_shr(g0, 2); - let g1_u8 = u32x4_shr(g1, 2); - let g2_u8 = u32x4_shr(g2, 2); - let g3_u8 = u32x4_shr(g3, 2); - let b0_u8 = u32x4_shr(b0, 2); - let b1_u8 = u32x4_shr(b1, 2); - let b2_u8 = u32x4_shr(b2, 2); - let b3_u8 = u32x4_shr(b3, 2); - - // u32x4 → u16x8 (saturating narrow). - let r_lo = u16x8_narrow_i32x4(r0_u8, r1_u8); - let r_hi = u16x8_narrow_i32x4(r2_u8, r3_u8); - let g_lo = u16x8_narrow_i32x4(g0_u8, g1_u8); - let g_hi = u16x8_narrow_i32x4(g2_u8, g3_u8); - let b_lo = u16x8_narrow_i32x4(b0_u8, b1_u8); - let b_hi = u16x8_narrow_i32x4(b2_u8, b3_u8); - - // u16x8 → u8x16. - let r_u8 = u8x16_narrow_i16x8(r_lo, r_hi); - let g_u8 = u8x16_narrow_i16x8(g_lo, g_hi); - let b_u8 = u8x16_narrow_i16x8(b_lo, b_hi); - - // Interleave (R, G, B) into 48 packed bytes via the same - // 9-shuffle pattern used by the YUV→RGB kernels. 
- let r_mask0 = i8x16(0, -1, -1, 1, -1, -1, 2, -1, -1, 3, -1, -1, 4, -1, -1, 5); - let g_mask0 = i8x16(-1, 0, -1, -1, 1, -1, -1, 2, -1, -1, 3, -1, -1, 4, -1, -1); - let b_mask0 = i8x16(-1, -1, 0, -1, -1, 1, -1, -1, 2, -1, -1, 3, -1, -1, 4, -1); - let out0 = v128_or( - v128_or(u8x16_swizzle(r_u8, r_mask0), u8x16_swizzle(g_u8, g_mask0)), - u8x16_swizzle(b_u8, b_mask0), - ); - let r_mask1 = i8x16(-1, -1, 6, -1, -1, 7, -1, -1, 8, -1, -1, 9, -1, -1, 10, -1); - let g_mask1 = i8x16(5, -1, -1, 6, -1, -1, 7, -1, -1, 8, -1, -1, 9, -1, -1, 10); - let b_mask1 = i8x16(-1, 5, -1, -1, 6, -1, -1, 7, -1, -1, 8, -1, -1, 9, -1, -1); - let out1 = v128_or( - v128_or(u8x16_swizzle(r_u8, r_mask1), u8x16_swizzle(g_u8, g_mask1)), - u8x16_swizzle(b_u8, b_mask1), - ); - let r_mask2 = i8x16( - -1, 11, -1, -1, 12, -1, -1, 13, -1, -1, 14, -1, -1, 15, -1, -1, - ); - let g_mask2 = i8x16( - -1, -1, 11, -1, -1, 12, -1, -1, 13, -1, -1, 14, -1, -1, 15, -1, - ); - let b_mask2 = i8x16( - 10, -1, -1, 11, -1, -1, 12, -1, -1, 13, -1, -1, 14, -1, -1, 15, - ); - let out2 = v128_or( - v128_or(u8x16_swizzle(r_u8, r_mask2), u8x16_swizzle(g_u8, g_mask2)), - u8x16_swizzle(b_u8, b_mask2), - ); - - v128_store(rgb_out.as_mut_ptr().add(x * 3).cast(), out0); - v128_store(rgb_out.as_mut_ptr().add(x * 3 + 16).cast(), out1); - v128_store(rgb_out.as_mut_ptr().add(x * 3 + 32).cast(), out2); - - x += 16; - } + if !BE { + while x + 16 <= width { + let p0 = v128_load(x2rgb10.as_ptr().add(x * 4).cast()); + let p1 = v128_load(x2rgb10.as_ptr().add(x * 4 + 16).cast()); + let p2 = v128_load(x2rgb10.as_ptr().add(x * 4 + 32).cast()); + let p3 = v128_load(x2rgb10.as_ptr().add(x * 4 + 48).cast()); + + // Extract 10-bit channels as u32x4 (low 10 bits set per lane). + // X2RGB10: R at >>20, G at >>10, B at >>0. 
+ let r0 = v128_and(u32x4_shr(p0, 20), mask_3ff); + let r1 = v128_and(u32x4_shr(p1, 20), mask_3ff); + let r2 = v128_and(u32x4_shr(p2, 20), mask_3ff); + let r3 = v128_and(u32x4_shr(p3, 20), mask_3ff); + let g0 = v128_and(u32x4_shr(p0, 10), mask_3ff); + let g1 = v128_and(u32x4_shr(p1, 10), mask_3ff); + let g2 = v128_and(u32x4_shr(p2, 10), mask_3ff); + let g3 = v128_and(u32x4_shr(p3, 10), mask_3ff); + let b0 = v128_and(p0, mask_3ff); + let b1 = v128_and(p1, mask_3ff); + let b2 = v128_and(p2, mask_3ff); + let b3 = v128_and(p3, mask_3ff); + + // Down-shift 10-bit → 8-bit. + let r0_u8 = u32x4_shr(r0, 2); + let r1_u8 = u32x4_shr(r1, 2); + let r2_u8 = u32x4_shr(r2, 2); + let r3_u8 = u32x4_shr(r3, 2); + let g0_u8 = u32x4_shr(g0, 2); + let g1_u8 = u32x4_shr(g1, 2); + let g2_u8 = u32x4_shr(g2, 2); + let g3_u8 = u32x4_shr(g3, 2); + let b0_u8 = u32x4_shr(b0, 2); + let b1_u8 = u32x4_shr(b1, 2); + let b2_u8 = u32x4_shr(b2, 2); + let b3_u8 = u32x4_shr(b3, 2); + + // u32x4 → u16x8 (saturating narrow). + let r_lo = u16x8_narrow_i32x4(r0_u8, r1_u8); + let r_hi = u16x8_narrow_i32x4(r2_u8, r3_u8); + let g_lo = u16x8_narrow_i32x4(g0_u8, g1_u8); + let g_hi = u16x8_narrow_i32x4(g2_u8, g3_u8); + let b_lo = u16x8_narrow_i32x4(b0_u8, b1_u8); + let b_hi = u16x8_narrow_i32x4(b2_u8, b3_u8); + + // u16x8 → u8x16. + let r_u8 = u8x16_narrow_i16x8(r_lo, r_hi); + let g_u8 = u8x16_narrow_i16x8(g_lo, g_hi); + let b_u8 = u8x16_narrow_i16x8(b_lo, b_hi); + + // Interleave (R, G, B) into 48 packed bytes via the same + // 9-shuffle pattern used by the YUV→RGB kernels. 
+ let r_mask0 = i8x16(0, -1, -1, 1, -1, -1, 2, -1, -1, 3, -1, -1, 4, -1, -1, 5); + let g_mask0 = i8x16(-1, 0, -1, -1, 1, -1, -1, 2, -1, -1, 3, -1, -1, 4, -1, -1); + let b_mask0 = i8x16(-1, -1, 0, -1, -1, 1, -1, -1, 2, -1, -1, 3, -1, -1, 4, -1); + let out0 = v128_or( + v128_or(u8x16_swizzle(r_u8, r_mask0), u8x16_swizzle(g_u8, g_mask0)), + u8x16_swizzle(b_u8, b_mask0), + ); + let r_mask1 = i8x16(-1, -1, 6, -1, -1, 7, -1, -1, 8, -1, -1, 9, -1, -1, 10, -1); + let g_mask1 = i8x16(5, -1, -1, 6, -1, -1, 7, -1, -1, 8, -1, -1, 9, -1, -1, 10); + let b_mask1 = i8x16(-1, 5, -1, -1, 6, -1, -1, 7, -1, -1, 8, -1, -1, 9, -1, -1); + let out1 = v128_or( + v128_or(u8x16_swizzle(r_u8, r_mask1), u8x16_swizzle(g_u8, g_mask1)), + u8x16_swizzle(b_u8, b_mask1), + ); + let r_mask2 = i8x16( + -1, 11, -1, -1, 12, -1, -1, 13, -1, -1, 14, -1, -1, 15, -1, -1, + ); + let g_mask2 = i8x16( + -1, -1, 11, -1, -1, 12, -1, -1, 13, -1, -1, 14, -1, -1, 15, -1, + ); + let b_mask2 = i8x16( + 10, -1, -1, 11, -1, -1, 12, -1, -1, 13, -1, -1, 14, -1, -1, 15, + ); + let out2 = v128_or( + v128_or(u8x16_swizzle(r_u8, r_mask2), u8x16_swizzle(g_u8, g_mask2)), + u8x16_swizzle(b_u8, b_mask2), + ); + + v128_store(rgb_out.as_mut_ptr().add(x * 3).cast(), out0); + v128_store(rgb_out.as_mut_ptr().add(x * 3 + 16).cast(), out1); + v128_store(rgb_out.as_mut_ptr().add(x * 3 + 32).cast(), out2); + + x += 16; + } + } // end if !BE if x < width { - scalar::x2rgb10_to_rgb_row( + scalar::x2rgb10_to_rgb_row::( &x2rgb10[x * 4..width * 4], &mut rgb_out[x * 3..width * 3], width - x, @@ -728,7 +734,11 @@ pub(crate) unsafe fn x2rgb10_to_rgb_row(x2rgb10: &[u8], rgb_out: &mut [u8], widt /// to `0xFF`. 
#[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn x2rgb10_to_rgba_row(x2rgb10: &[u8], rgba_out: &mut [u8], width: usize) { +pub(crate) unsafe fn x2rgb10_to_rgba_row( + x2rgb10: &[u8], + rgba_out: &mut [u8], + width: usize, +) { debug_assert!(x2rgb10.len() >= width * 4, "x2rgb10 row too short"); debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short"); @@ -736,36 +746,38 @@ pub(crate) unsafe fn x2rgb10_to_rgba_row(x2rgb10: &[u8], rgba_out: &mut [u8], wi let mask_3ff = u32x4_splat(0x3FF); let alpha_const = u32x4_splat(0xFF00_0000); let mut x = 0usize; - while x + 4 <= width { - let pix = v128_load(x2rgb10.as_ptr().add(x * 4).cast()); - - // Extract 10-bit channels into u32 lanes, down-shift to u8. - let r = v128_and(u32x4_shr(pix, 20), mask_3ff); - let g = v128_and(u32x4_shr(pix, 10), mask_3ff); - let b = v128_and(pix, mask_3ff); - let r = u32x4_shr(r, 2); - let g = u32x4_shr(g, 2); - let b = u32x4_shr(b, 2); - - // Pack (R, G, B, 0xFF) bytes per pixel. - // Each channel value is in low byte of its u32 lane. - // Shuffle to byte positions: R→[0,4,8,12], G→[1,5,9,13], B→[2,6,10,14], A→[3,7,11,15]. - let r_mask = i8x16(0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1); - let g_mask = i8x16(-1, 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1); - let b_mask = i8x16(-1, -1, 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1); - let out = v128_or( - v128_or( - v128_or(u8x16_swizzle(r, r_mask), u8x16_swizzle(g, g_mask)), - u8x16_swizzle(b, b_mask), - ), - alpha_const, - ); - - v128_store(rgba_out.as_mut_ptr().add(x * 4).cast(), out); - x += 4; + if !BE { + while x + 4 <= width { + let pix = v128_load(x2rgb10.as_ptr().add(x * 4).cast()); + + // Extract 10-bit channels into u32 lanes, down-shift to u8. 
+ let r = v128_and(u32x4_shr(pix, 20), mask_3ff); + let g = v128_and(u32x4_shr(pix, 10), mask_3ff); + let b = v128_and(pix, mask_3ff); + let r = u32x4_shr(r, 2); + let g = u32x4_shr(g, 2); + let b = u32x4_shr(b, 2); + + // Pack (R, G, B, 0xFF) bytes per pixel. + // Each channel value is in low byte of its u32 lane. + // Shuffle to byte positions: R→[0,4,8,12], G→[1,5,9,13], B→[2,6,10,14], A→[3,7,11,15]. + let r_mask = i8x16(0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1); + let g_mask = i8x16(-1, 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1); + let b_mask = i8x16(-1, -1, 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1); + let out = v128_or( + v128_or( + v128_or(u8x16_swizzle(r, r_mask), u8x16_swizzle(g, g_mask)), + u8x16_swizzle(b, b_mask), + ), + alpha_const, + ); + + v128_store(rgba_out.as_mut_ptr().add(x * 4).cast(), out); + x += 4; + } } if x < width { - scalar::x2rgb10_to_rgba_row( + scalar::x2rgb10_to_rgba_row::( &x2rgb10[x * 4..width * 4], &mut rgba_out[x * 4..width * 4], width - x, @@ -777,72 +789,78 @@ pub(crate) unsafe fn x2rgb10_to_rgba_row(x2rgb10: &[u8], rgba_out: &mut [u8], wi /// WASM simd128 X2RGB10→u16 RGB native. 8 pixels per iteration. 
#[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn x2rgb10_to_rgb_u16_row(x2rgb10: &[u8], rgb_out: &mut [u16], width: usize) { +pub(crate) unsafe fn x2rgb10_to_rgb_u16_row( + x2rgb10: &[u8], + rgb_out: &mut [u16], + width: usize, +) { debug_assert!(x2rgb10.len() >= width * 4, "x2rgb10 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); unsafe { let mask_3ff = u32x4_splat(0x3FF); let mut x = 0usize; - while x + 8 <= width { - let p0 = v128_load(x2rgb10.as_ptr().add(x * 4).cast()); - let p1 = v128_load(x2rgb10.as_ptr().add(x * 4 + 16).cast()); - - let r0 = v128_and(u32x4_shr(p0, 20), mask_3ff); - let r1 = v128_and(u32x4_shr(p1, 20), mask_3ff); - let g0 = v128_and(u32x4_shr(p0, 10), mask_3ff); - let g1 = v128_and(u32x4_shr(p1, 10), mask_3ff); - let b0 = v128_and(p0, mask_3ff); - let b1 = v128_and(p1, mask_3ff); - - let r = u16x8_narrow_i32x4(r0, r1); - let g = u16x8_narrow_i32x4(g0, g1); - let b = u16x8_narrow_i32x4(b0, b1); - - // Interleave (R, G, B) u16x8 into 24 u16 elements. - // Element granularity is u16 (2 bytes); shuffle masks below - // index by byte. For u16-per-element interleave, byte mask - // pulls 2 consecutive bytes per element. - let r_mask0 = i8x16(0, 1, -1, -1, -1, -1, 2, 3, -1, -1, -1, -1, 4, 5, -1, -1); - let g_mask0 = i8x16(-1, -1, 0, 1, -1, -1, -1, -1, 2, 3, -1, -1, -1, -1, 4, 5); - let b_mask0 = i8x16(-1, -1, -1, -1, 0, 1, -1, -1, -1, -1, 2, 3, -1, -1, -1, -1); - let out0 = v128_or( - v128_or(u8x16_swizzle(r, r_mask0), u8x16_swizzle(g, g_mask0)), - u8x16_swizzle(b, b_mask0), - ); - // Block 1 (output u16s 8..15 = [B2, R3, G3, B3, R4, G4, B4, R5]). - // Each u16 takes 2 bytes; the channel vectors hold element `i` at - // byte indices `(2*i, 2*i+1)`. 
- let r_mask1 = i8x16(-1, -1, 6, 7, -1, -1, -1, -1, 8, 9, -1, -1, -1, -1, 10, 11); - let g_mask1 = i8x16(-1, -1, -1, -1, 6, 7, -1, -1, -1, -1, 8, 9, -1, -1, -1, -1); - let b_mask1 = i8x16(4, 5, -1, -1, -1, -1, 6, 7, -1, -1, -1, -1, 8, 9, -1, -1); - let out1 = v128_or( - v128_or(u8x16_swizzle(r, r_mask1), u8x16_swizzle(g, g_mask1)), - u8x16_swizzle(b, b_mask1), - ); - // Block 2 (output u16s 16..23 = [G5, B5, R6, G6, B6, R7, G7, B7]). - let r_mask2 = i8x16( - -1, -1, -1, -1, 12, 13, -1, -1, -1, -1, 14, 15, -1, -1, -1, -1, - ); - let g_mask2 = i8x16( - 10, 11, -1, -1, -1, -1, 12, 13, -1, -1, -1, -1, 14, 15, -1, -1, - ); - let b_mask2 = i8x16( - -1, -1, 10, 11, -1, -1, -1, -1, 12, 13, -1, -1, -1, -1, 14, 15, - ); - let out2 = v128_or( - v128_or(u8x16_swizzle(r, r_mask2), u8x16_swizzle(g, g_mask2)), - u8x16_swizzle(b, b_mask2), - ); - - v128_store(rgb_out.as_mut_ptr().add(x * 3).cast(), out0); - v128_store(rgb_out.as_mut_ptr().add(x * 3 + 8).cast(), out1); - v128_store(rgb_out.as_mut_ptr().add(x * 3 + 16).cast(), out2); - - x += 8; - } + if !BE { + while x + 8 <= width { + let p0 = v128_load(x2rgb10.as_ptr().add(x * 4).cast()); + let p1 = v128_load(x2rgb10.as_ptr().add(x * 4 + 16).cast()); + + let r0 = v128_and(u32x4_shr(p0, 20), mask_3ff); + let r1 = v128_and(u32x4_shr(p1, 20), mask_3ff); + let g0 = v128_and(u32x4_shr(p0, 10), mask_3ff); + let g1 = v128_and(u32x4_shr(p1, 10), mask_3ff); + let b0 = v128_and(p0, mask_3ff); + let b1 = v128_and(p1, mask_3ff); + + let r = u16x8_narrow_i32x4(r0, r1); + let g = u16x8_narrow_i32x4(g0, g1); + let b = u16x8_narrow_i32x4(b0, b1); + + // Interleave (R, G, B) u16x8 into 24 u16 elements. + // Element granularity is u16 (2 bytes); shuffle masks below + // index by byte. For u16-per-element interleave, byte mask + // pulls 2 consecutive bytes per element. 
+ let r_mask0 = i8x16(0, 1, -1, -1, -1, -1, 2, 3, -1, -1, -1, -1, 4, 5, -1, -1); + let g_mask0 = i8x16(-1, -1, 0, 1, -1, -1, -1, -1, 2, 3, -1, -1, -1, -1, 4, 5); + let b_mask0 = i8x16(-1, -1, -1, -1, 0, 1, -1, -1, -1, -1, 2, 3, -1, -1, -1, -1); + let out0 = v128_or( + v128_or(u8x16_swizzle(r, r_mask0), u8x16_swizzle(g, g_mask0)), + u8x16_swizzle(b, b_mask0), + ); + // Block 1 (output u16s 8..15 = [B2, R3, G3, B3, R4, G4, B4, R5]). + // Each u16 takes 2 bytes; the channel vectors hold element `i` at + // byte indices `(2*i, 2*i+1)`. + let r_mask1 = i8x16(-1, -1, 6, 7, -1, -1, -1, -1, 8, 9, -1, -1, -1, -1, 10, 11); + let g_mask1 = i8x16(-1, -1, -1, -1, 6, 7, -1, -1, -1, -1, 8, 9, -1, -1, -1, -1); + let b_mask1 = i8x16(4, 5, -1, -1, -1, -1, 6, 7, -1, -1, -1, -1, 8, 9, -1, -1); + let out1 = v128_or( + v128_or(u8x16_swizzle(r, r_mask1), u8x16_swizzle(g, g_mask1)), + u8x16_swizzle(b, b_mask1), + ); + // Block 2 (output u16s 16..23 = [G5, B5, R6, G6, B6, R7, G7, B7]). + let r_mask2 = i8x16( + -1, -1, -1, -1, 12, 13, -1, -1, -1, -1, 14, 15, -1, -1, -1, -1, + ); + let g_mask2 = i8x16( + 10, 11, -1, -1, -1, -1, 12, 13, -1, -1, -1, -1, 14, 15, -1, -1, + ); + let b_mask2 = i8x16( + -1, -1, 10, 11, -1, -1, -1, -1, 12, 13, -1, -1, -1, -1, 14, 15, + ); + let out2 = v128_or( + v128_or(u8x16_swizzle(r, r_mask2), u8x16_swizzle(g, g_mask2)), + u8x16_swizzle(b, b_mask2), + ); + + v128_store(rgb_out.as_mut_ptr().add(x * 3).cast(), out0); + v128_store(rgb_out.as_mut_ptr().add(x * 3 + 8).cast(), out1); + v128_store(rgb_out.as_mut_ptr().add(x * 3 + 16).cast(), out2); + + x += 8; + } + } // end if !BE if x < width { - scalar::x2rgb10_to_rgb_u16_row( + scalar::x2rgb10_to_rgb_u16_row::( &x2rgb10[x * 4..width * 4], &mut rgb_out[x * 3..width * 3], width - x, @@ -855,80 +873,86 @@ pub(crate) unsafe fn x2rgb10_to_rgb_u16_row(x2rgb10: &[u8], rgb_out: &mut [u16], /// extracts R from low bits and B from high bits. 
#[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn x2bgr10_to_rgb_row(x2bgr10: &[u8], rgb_out: &mut [u8], width: usize) { +pub(crate) unsafe fn x2bgr10_to_rgb_row( + x2bgr10: &[u8], + rgb_out: &mut [u8], + width: usize, +) { debug_assert!(x2bgr10.len() >= width * 4, "x2bgr10 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); unsafe { let mask_3ff = u32x4_splat(0x3FF); let mut x = 0usize; - while x + 16 <= width { - let p0 = v128_load(x2bgr10.as_ptr().add(x * 4).cast()); - let p1 = v128_load(x2bgr10.as_ptr().add(x * 4 + 16).cast()); - let p2 = v128_load(x2bgr10.as_ptr().add(x * 4 + 32).cast()); - let p3 = v128_load(x2bgr10.as_ptr().add(x * 4 + 48).cast()); - - // X2BGR10: R at low 10, G at >>10, B at >>20. - let r0 = u32x4_shr(v128_and(p0, mask_3ff), 2); - let r1 = u32x4_shr(v128_and(p1, mask_3ff), 2); - let r2 = u32x4_shr(v128_and(p2, mask_3ff), 2); - let r3 = u32x4_shr(v128_and(p3, mask_3ff), 2); - let g0 = u32x4_shr(v128_and(u32x4_shr(p0, 10), mask_3ff), 2); - let g1 = u32x4_shr(v128_and(u32x4_shr(p1, 10), mask_3ff), 2); - let g2 = u32x4_shr(v128_and(u32x4_shr(p2, 10), mask_3ff), 2); - let g3 = u32x4_shr(v128_and(u32x4_shr(p3, 10), mask_3ff), 2); - let b0 = u32x4_shr(v128_and(u32x4_shr(p0, 20), mask_3ff), 2); - let b1 = u32x4_shr(v128_and(u32x4_shr(p1, 20), mask_3ff), 2); - let b2 = u32x4_shr(v128_and(u32x4_shr(p2, 20), mask_3ff), 2); - let b3 = u32x4_shr(v128_and(u32x4_shr(p3, 20), mask_3ff), 2); - - let r_lo = u16x8_narrow_i32x4(r0, r1); - let r_hi = u16x8_narrow_i32x4(r2, r3); - let g_lo = u16x8_narrow_i32x4(g0, g1); - let g_hi = u16x8_narrow_i32x4(g2, g3); - let b_lo = u16x8_narrow_i32x4(b0, b1); - let b_hi = u16x8_narrow_i32x4(b2, b3); - - let r_u8 = u8x16_narrow_i16x8(r_lo, r_hi); - let g_u8 = u8x16_narrow_i16x8(g_lo, g_hi); - let b_u8 = u8x16_narrow_i16x8(b_lo, b_hi); - - let r_mask0 = i8x16(0, -1, -1, 1, -1, -1, 2, -1, -1, 3, -1, -1, 4, -1, -1, 5); - let g_mask0 = i8x16(-1, 0, -1, -1, 1, -1, -1, 2, -1, 
-1, 3, -1, -1, 4, -1, -1); - let b_mask0 = i8x16(-1, -1, 0, -1, -1, 1, -1, -1, 2, -1, -1, 3, -1, -1, 4, -1); - let out0 = v128_or( - v128_or(u8x16_swizzle(r_u8, r_mask0), u8x16_swizzle(g_u8, g_mask0)), - u8x16_swizzle(b_u8, b_mask0), - ); - let r_mask1 = i8x16(-1, -1, 6, -1, -1, 7, -1, -1, 8, -1, -1, 9, -1, -1, 10, -1); - let g_mask1 = i8x16(5, -1, -1, 6, -1, -1, 7, -1, -1, 8, -1, -1, 9, -1, -1, 10); - let b_mask1 = i8x16(-1, 5, -1, -1, 6, -1, -1, 7, -1, -1, 8, -1, -1, 9, -1, -1); - let out1 = v128_or( - v128_or(u8x16_swizzle(r_u8, r_mask1), u8x16_swizzle(g_u8, g_mask1)), - u8x16_swizzle(b_u8, b_mask1), - ); - let r_mask2 = i8x16( - -1, 11, -1, -1, 12, -1, -1, 13, -1, -1, 14, -1, -1, 15, -1, -1, - ); - let g_mask2 = i8x16( - -1, -1, 11, -1, -1, 12, -1, -1, 13, -1, -1, 14, -1, -1, 15, -1, - ); - let b_mask2 = i8x16( - 10, -1, -1, 11, -1, -1, 12, -1, -1, 13, -1, -1, 14, -1, -1, 15, - ); - let out2 = v128_or( - v128_or(u8x16_swizzle(r_u8, r_mask2), u8x16_swizzle(g_u8, g_mask2)), - u8x16_swizzle(b_u8, b_mask2), - ); - - v128_store(rgb_out.as_mut_ptr().add(x * 3).cast(), out0); - v128_store(rgb_out.as_mut_ptr().add(x * 3 + 16).cast(), out1); - v128_store(rgb_out.as_mut_ptr().add(x * 3 + 32).cast(), out2); - - x += 16; - } + if !BE { + while x + 16 <= width { + let p0 = v128_load(x2bgr10.as_ptr().add(x * 4).cast()); + let p1 = v128_load(x2bgr10.as_ptr().add(x * 4 + 16).cast()); + let p2 = v128_load(x2bgr10.as_ptr().add(x * 4 + 32).cast()); + let p3 = v128_load(x2bgr10.as_ptr().add(x * 4 + 48).cast()); + + // X2BGR10: R at low 10, G at >>10, B at >>20. 
+ let r0 = u32x4_shr(v128_and(p0, mask_3ff), 2); + let r1 = u32x4_shr(v128_and(p1, mask_3ff), 2); + let r2 = u32x4_shr(v128_and(p2, mask_3ff), 2); + let r3 = u32x4_shr(v128_and(p3, mask_3ff), 2); + let g0 = u32x4_shr(v128_and(u32x4_shr(p0, 10), mask_3ff), 2); + let g1 = u32x4_shr(v128_and(u32x4_shr(p1, 10), mask_3ff), 2); + let g2 = u32x4_shr(v128_and(u32x4_shr(p2, 10), mask_3ff), 2); + let g3 = u32x4_shr(v128_and(u32x4_shr(p3, 10), mask_3ff), 2); + let b0 = u32x4_shr(v128_and(u32x4_shr(p0, 20), mask_3ff), 2); + let b1 = u32x4_shr(v128_and(u32x4_shr(p1, 20), mask_3ff), 2); + let b2 = u32x4_shr(v128_and(u32x4_shr(p2, 20), mask_3ff), 2); + let b3 = u32x4_shr(v128_and(u32x4_shr(p3, 20), mask_3ff), 2); + + let r_lo = u16x8_narrow_i32x4(r0, r1); + let r_hi = u16x8_narrow_i32x4(r2, r3); + let g_lo = u16x8_narrow_i32x4(g0, g1); + let g_hi = u16x8_narrow_i32x4(g2, g3); + let b_lo = u16x8_narrow_i32x4(b0, b1); + let b_hi = u16x8_narrow_i32x4(b2, b3); + + let r_u8 = u8x16_narrow_i16x8(r_lo, r_hi); + let g_u8 = u8x16_narrow_i16x8(g_lo, g_hi); + let b_u8 = u8x16_narrow_i16x8(b_lo, b_hi); + + let r_mask0 = i8x16(0, -1, -1, 1, -1, -1, 2, -1, -1, 3, -1, -1, 4, -1, -1, 5); + let g_mask0 = i8x16(-1, 0, -1, -1, 1, -1, -1, 2, -1, -1, 3, -1, -1, 4, -1, -1); + let b_mask0 = i8x16(-1, -1, 0, -1, -1, 1, -1, -1, 2, -1, -1, 3, -1, -1, 4, -1); + let out0 = v128_or( + v128_or(u8x16_swizzle(r_u8, r_mask0), u8x16_swizzle(g_u8, g_mask0)), + u8x16_swizzle(b_u8, b_mask0), + ); + let r_mask1 = i8x16(-1, -1, 6, -1, -1, 7, -1, -1, 8, -1, -1, 9, -1, -1, 10, -1); + let g_mask1 = i8x16(5, -1, -1, 6, -1, -1, 7, -1, -1, 8, -1, -1, 9, -1, -1, 10); + let b_mask1 = i8x16(-1, 5, -1, -1, 6, -1, -1, 7, -1, -1, 8, -1, -1, 9, -1, -1); + let out1 = v128_or( + v128_or(u8x16_swizzle(r_u8, r_mask1), u8x16_swizzle(g_u8, g_mask1)), + u8x16_swizzle(b_u8, b_mask1), + ); + let r_mask2 = i8x16( + -1, 11, -1, -1, 12, -1, -1, 13, -1, -1, 14, -1, -1, 15, -1, -1, + ); + let g_mask2 = i8x16( + -1, -1, 11, -1, -1, 12, -1, -1, 
13, -1, -1, 14, -1, -1, 15, -1, + ); + let b_mask2 = i8x16( + 10, -1, -1, 11, -1, -1, 12, -1, -1, 13, -1, -1, 14, -1, -1, 15, + ); + let out2 = v128_or( + v128_or(u8x16_swizzle(r_u8, r_mask2), u8x16_swizzle(g_u8, g_mask2)), + u8x16_swizzle(b_u8, b_mask2), + ); + + v128_store(rgb_out.as_mut_ptr().add(x * 3).cast(), out0); + v128_store(rgb_out.as_mut_ptr().add(x * 3 + 16).cast(), out1); + v128_store(rgb_out.as_mut_ptr().add(x * 3 + 32).cast(), out2); + + x += 16; + } + } // end if !BE if x < width { - scalar::x2bgr10_to_rgb_row( + scalar::x2bgr10_to_rgb_row::( &x2bgr10[x * 4..width * 4], &mut rgb_out[x * 3..width * 3], width - x, @@ -941,7 +965,11 @@ pub(crate) unsafe fn x2bgr10_to_rgb_row(x2bgr10: &[u8], rgb_out: &mut [u8], widt /// holds 4 RGBA pixels). #[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn x2bgr10_to_rgba_row(x2bgr10: &[u8], rgba_out: &mut [u8], width: usize) { +pub(crate) unsafe fn x2bgr10_to_rgba_row( + x2bgr10: &[u8], + rgba_out: &mut [u8], + width: usize, +) { debug_assert!(x2bgr10.len() >= width * 4, "x2bgr10 row too short"); debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short"); @@ -949,30 +977,32 @@ pub(crate) unsafe fn x2bgr10_to_rgba_row(x2bgr10: &[u8], rgba_out: &mut [u8], wi let mask_3ff = u32x4_splat(0x3FF); let alpha_const = u32x4_splat(0xFF00_0000); let mut x = 0usize; - while x + 4 <= width { - let pix = v128_load(x2bgr10.as_ptr().add(x * 4).cast()); - - // X2BGR10 channel positions: R at low, G mid, B high. 
- let r = u32x4_shr(v128_and(pix, mask_3ff), 2); - let g = u32x4_shr(v128_and(u32x4_shr(pix, 10), mask_3ff), 2); - let b = u32x4_shr(v128_and(u32x4_shr(pix, 20), mask_3ff), 2); - - let r_mask = i8x16(0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1); - let g_mask = i8x16(-1, 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1); - let b_mask = i8x16(-1, -1, 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1); - let out = v128_or( - v128_or( - v128_or(u8x16_swizzle(r, r_mask), u8x16_swizzle(g, g_mask)), - u8x16_swizzle(b, b_mask), - ), - alpha_const, - ); - - v128_store(rgba_out.as_mut_ptr().add(x * 4).cast(), out); - x += 4; + if !BE { + while x + 4 <= width { + let pix = v128_load(x2bgr10.as_ptr().add(x * 4).cast()); + + // X2BGR10 channel positions: R at low, G mid, B high. + let r = u32x4_shr(v128_and(pix, mask_3ff), 2); + let g = u32x4_shr(v128_and(u32x4_shr(pix, 10), mask_3ff), 2); + let b = u32x4_shr(v128_and(u32x4_shr(pix, 20), mask_3ff), 2); + + let r_mask = i8x16(0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1); + let g_mask = i8x16(-1, 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1); + let b_mask = i8x16(-1, -1, 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1); + let out = v128_or( + v128_or( + v128_or(u8x16_swizzle(r, r_mask), u8x16_swizzle(g, g_mask)), + u8x16_swizzle(b, b_mask), + ), + alpha_const, + ); + + v128_store(rgba_out.as_mut_ptr().add(x * 4).cast(), out); + x += 4; + } } if x < width { - scalar::x2bgr10_to_rgba_row( + scalar::x2bgr10_to_rgba_row::( &x2bgr10[x * 4..width * 4], &mut rgba_out[x * 4..width * 4], width - x, @@ -984,68 +1014,74 @@ pub(crate) unsafe fn x2bgr10_to_rgba_row(x2bgr10: &[u8], rgba_out: &mut [u8], wi /// WASM simd128 X2BGR10→u16 RGB native. 8 pixels per iteration. 
#[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn x2bgr10_to_rgb_u16_row(x2bgr10: &[u8], rgb_out: &mut [u16], width: usize) { +pub(crate) unsafe fn x2bgr10_to_rgb_u16_row( + x2bgr10: &[u8], + rgb_out: &mut [u16], + width: usize, +) { debug_assert!(x2bgr10.len() >= width * 4, "x2bgr10 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); unsafe { let mask_3ff = u32x4_splat(0x3FF); let mut x = 0usize; - while x + 8 <= width { - let p0 = v128_load(x2bgr10.as_ptr().add(x * 4).cast()); - let p1 = v128_load(x2bgr10.as_ptr().add(x * 4 + 16).cast()); - - let r0 = v128_and(p0, mask_3ff); - let r1 = v128_and(p1, mask_3ff); - let g0 = v128_and(u32x4_shr(p0, 10), mask_3ff); - let g1 = v128_and(u32x4_shr(p1, 10), mask_3ff); - let b0 = v128_and(u32x4_shr(p0, 20), mask_3ff); - let b1 = v128_and(u32x4_shr(p1, 20), mask_3ff); - - let r = u16x8_narrow_i32x4(r0, r1); - let g = u16x8_narrow_i32x4(g0, g1); - let b = u16x8_narrow_i32x4(b0, b1); - - let r_mask0 = i8x16(0, 1, -1, -1, -1, -1, 2, 3, -1, -1, -1, -1, 4, 5, -1, -1); - let g_mask0 = i8x16(-1, -1, 0, 1, -1, -1, -1, -1, 2, 3, -1, -1, -1, -1, 4, 5); - let b_mask0 = i8x16(-1, -1, -1, -1, 0, 1, -1, -1, -1, -1, 2, 3, -1, -1, -1, -1); - let out0 = v128_or( - v128_or(u8x16_swizzle(r, r_mask0), u8x16_swizzle(g, g_mask0)), - u8x16_swizzle(b, b_mask0), - ); - // Block 1 (output u16s 8..15 = [B2, R3, G3, B3, R4, G4, B4, R5]). - // Each u16 takes 2 bytes; the channel vectors hold element `i` at - // byte indices `(2*i, 2*i+1)`. - let r_mask1 = i8x16(-1, -1, 6, 7, -1, -1, -1, -1, 8, 9, -1, -1, -1, -1, 10, 11); - let g_mask1 = i8x16(-1, -1, -1, -1, 6, 7, -1, -1, -1, -1, 8, 9, -1, -1, -1, -1); - let b_mask1 = i8x16(4, 5, -1, -1, -1, -1, 6, 7, -1, -1, -1, -1, 8, 9, -1, -1); - let out1 = v128_or( - v128_or(u8x16_swizzle(r, r_mask1), u8x16_swizzle(g, g_mask1)), - u8x16_swizzle(b, b_mask1), - ); - // Block 2 (output u16s 16..23 = [G5, B5, R6, G6, B6, R7, G7, B7]). 
- let r_mask2 = i8x16( - -1, -1, -1, -1, 12, 13, -1, -1, -1, -1, 14, 15, -1, -1, -1, -1, - ); - let g_mask2 = i8x16( - 10, 11, -1, -1, -1, -1, 12, 13, -1, -1, -1, -1, 14, 15, -1, -1, - ); - let b_mask2 = i8x16( - -1, -1, 10, 11, -1, -1, -1, -1, 12, 13, -1, -1, -1, -1, 14, 15, - ); - let out2 = v128_or( - v128_or(u8x16_swizzle(r, r_mask2), u8x16_swizzle(g, g_mask2)), - u8x16_swizzle(b, b_mask2), - ); - - v128_store(rgb_out.as_mut_ptr().add(x * 3).cast(), out0); - v128_store(rgb_out.as_mut_ptr().add(x * 3 + 8).cast(), out1); - v128_store(rgb_out.as_mut_ptr().add(x * 3 + 16).cast(), out2); - - x += 8; - } + if !BE { + while x + 8 <= width { + let p0 = v128_load(x2bgr10.as_ptr().add(x * 4).cast()); + let p1 = v128_load(x2bgr10.as_ptr().add(x * 4 + 16).cast()); + + let r0 = v128_and(p0, mask_3ff); + let r1 = v128_and(p1, mask_3ff); + let g0 = v128_and(u32x4_shr(p0, 10), mask_3ff); + let g1 = v128_and(u32x4_shr(p1, 10), mask_3ff); + let b0 = v128_and(u32x4_shr(p0, 20), mask_3ff); + let b1 = v128_and(u32x4_shr(p1, 20), mask_3ff); + + let r = u16x8_narrow_i32x4(r0, r1); + let g = u16x8_narrow_i32x4(g0, g1); + let b = u16x8_narrow_i32x4(b0, b1); + + let r_mask0 = i8x16(0, 1, -1, -1, -1, -1, 2, 3, -1, -1, -1, -1, 4, 5, -1, -1); + let g_mask0 = i8x16(-1, -1, 0, 1, -1, -1, -1, -1, 2, 3, -1, -1, -1, -1, 4, 5); + let b_mask0 = i8x16(-1, -1, -1, -1, 0, 1, -1, -1, -1, -1, 2, 3, -1, -1, -1, -1); + let out0 = v128_or( + v128_or(u8x16_swizzle(r, r_mask0), u8x16_swizzle(g, g_mask0)), + u8x16_swizzle(b, b_mask0), + ); + // Block 1 (output u16s 8..15 = [B2, R3, G3, B3, R4, G4, B4, R5]). + // Each u16 takes 2 bytes; the channel vectors hold element `i` at + // byte indices `(2*i, 2*i+1)`. 
+ let r_mask1 = i8x16(-1, -1, 6, 7, -1, -1, -1, -1, 8, 9, -1, -1, -1, -1, 10, 11); + let g_mask1 = i8x16(-1, -1, -1, -1, 6, 7, -1, -1, -1, -1, 8, 9, -1, -1, -1, -1); + let b_mask1 = i8x16(4, 5, -1, -1, -1, -1, 6, 7, -1, -1, -1, -1, 8, 9, -1, -1); + let out1 = v128_or( + v128_or(u8x16_swizzle(r, r_mask1), u8x16_swizzle(g, g_mask1)), + u8x16_swizzle(b, b_mask1), + ); + // Block 2 (output u16s 16..23 = [G5, B5, R6, G6, B6, R7, G7, B7]). + let r_mask2 = i8x16( + -1, -1, -1, -1, 12, 13, -1, -1, -1, -1, 14, 15, -1, -1, -1, -1, + ); + let g_mask2 = i8x16( + 10, 11, -1, -1, -1, -1, 12, 13, -1, -1, -1, -1, 14, 15, -1, -1, + ); + let b_mask2 = i8x16( + -1, -1, 10, 11, -1, -1, -1, -1, 12, 13, -1, -1, -1, -1, 14, 15, + ); + let out2 = v128_or( + v128_or(u8x16_swizzle(r, r_mask2), u8x16_swizzle(g, g_mask2)), + u8x16_swizzle(b, b_mask2), + ); + + v128_store(rgb_out.as_mut_ptr().add(x * 3).cast(), out0); + v128_store(rgb_out.as_mut_ptr().add(x * 3 + 8).cast(), out1); + v128_store(rgb_out.as_mut_ptr().add(x * 3 + 16).cast(), out2); + + x += 8; + } + } // end if !BE if x < width { - scalar::x2bgr10_to_rgb_u16_row( + scalar::x2bgr10_to_rgb_u16_row::( &x2bgr10[x * 4..width * 4], &mut rgb_out[x * 3..width * 3], width - x, diff --git a/src/row/arch/wasm_simd128/packed_rgb_16bit.rs b/src/row/arch/wasm_simd128/packed_rgb_16bit.rs index 087eb8f3..4fa3fed5 100644 --- a/src/row/arch/wasm_simd128/packed_rgb_16bit.rs +++ b/src/row/arch/wasm_simd128/packed_rgb_16bit.rs @@ -217,6 +217,24 @@ unsafe fn narrow_u16x8_to_u8x8(v: v128) -> v128 { u8x16_narrow_i16x8(shr, zero) } +// ---- endian byte-swap helper ------------------------------------------------- + +/// Byte-swap every u16 lane in `v` when `BE = true`; no-op otherwise. +/// +/// Uses `u8x16_swizzle` with a compile-time mask. 
+#[inline(always)] +unsafe fn byteswap_if_be(v: v128) -> v128 { + if BE { + // Swap bytes within each u16 lane: [1,0, 3,2, 5,4, 7,6, 9,8, 11,10, 13,12, 15,14] + u8x16_swizzle( + v, + i8x16(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14), + ) + } else { + v + } +} + // ============================================================================= // Rgb48 (R, G, B — 3 u16 elements per pixel) // ============================================================================= @@ -234,7 +252,11 @@ unsafe fn narrow_u16x8_to_u8x8(v: v128) -> v128 { /// 3. `rgb_out.len() >= width * 3`. #[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn wasm_rgb48_to_rgb_row(rgb48: &[u16], rgb_out: &mut [u8], width: usize) { +pub(crate) unsafe fn wasm_rgb48_to_rgb_row( + rgb48: &[u16], + rgb_out: &mut [u8], + width: usize, +) { debug_assert!(rgb48.len() >= width * 3, "rgb48 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); @@ -242,9 +264,9 @@ pub(crate) unsafe fn wasm_rgb48_to_rgb_row(rgb48: &[u16], rgb_out: &mut [u8], wi let mut x = 0usize; while x + 8 <= width { let ptr = rgb48.as_ptr().add(x * 3); - let v0 = v128_load(ptr.cast()); - let v1 = v128_load(ptr.add(8).cast()); - let v2 = v128_load(ptr.add(16).cast()); + let v0 = byteswap_if_be::(v128_load(ptr.cast())); + let v1 = byteswap_if_be::(v128_load(ptr.add(8).cast())); + let v2 = byteswap_if_be::(v128_load(ptr.add(16).cast())); let (r, g, b) = deinterleave_rgb48_8px(v0, v1, v2); let r_u8 = narrow_u16x8_to_u8x8(r); let g_u8 = narrow_u16x8_to_u8x8(g); @@ -257,7 +279,7 @@ pub(crate) unsafe fn wasm_rgb48_to_rgb_row(rgb48: &[u16], rgb_out: &mut [u8], wi x += 8; } if x < width { - scalar::rgb48_to_rgb_row(&rgb48[x * 3..], &mut rgb_out[x * 3..], width - x); + scalar::rgb48_to_rgb_row::(&rgb48[x * 3..], &mut rgb_out[x * 3..], width - x); } } } @@ -271,7 +293,11 @@ pub(crate) unsafe fn wasm_rgb48_to_rgb_row(rgb48: &[u16], rgb_out: &mut [u8], wi /// 3. `rgba_out.len() >= width * 4`. 
#[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn wasm_rgb48_to_rgba_row(rgb48: &[u16], rgba_out: &mut [u8], width: usize) { +pub(crate) unsafe fn wasm_rgb48_to_rgba_row( + rgb48: &[u16], + rgba_out: &mut [u8], + width: usize, +) { debug_assert!(rgb48.len() >= width * 3, "rgb48 row too short"); debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short"); @@ -280,9 +306,9 @@ pub(crate) unsafe fn wasm_rgb48_to_rgba_row(rgb48: &[u16], rgba_out: &mut [u8], let mut x = 0usize; while x + 8 <= width { let ptr = rgb48.as_ptr().add(x * 3); - let v0 = v128_load(ptr.cast()); - let v1 = v128_load(ptr.add(8).cast()); - let v2 = v128_load(ptr.add(16).cast()); + let v0 = byteswap_if_be::(v128_load(ptr.cast())); + let v1 = byteswap_if_be::(v128_load(ptr.add(8).cast())); + let v2 = byteswap_if_be::(v128_load(ptr.add(16).cast())); let (r, g, b) = deinterleave_rgb48_8px(v0, v1, v2); let r_u8 = narrow_u16x8_to_u8x8(r); let g_u8 = narrow_u16x8_to_u8x8(g); @@ -294,7 +320,7 @@ pub(crate) unsafe fn wasm_rgb48_to_rgba_row(rgb48: &[u16], rgba_out: &mut [u8], x += 8; } if x < width { - scalar::rgb48_to_rgba_row(&rgb48[x * 3..], &mut rgba_out[x * 4..], width - x); + scalar::rgb48_to_rgba_row::(&rgb48[x * 3..], &mut rgba_out[x * 4..], width - x); } } } @@ -308,7 +334,11 @@ pub(crate) unsafe fn wasm_rgb48_to_rgba_row(rgb48: &[u16], rgba_out: &mut [u8], /// 3. `rgb_out.len() >= width * 3`. 
#[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn wasm_rgb48_to_rgb_u16_row(rgb48: &[u16], rgb_out: &mut [u16], width: usize) { +pub(crate) unsafe fn wasm_rgb48_to_rgb_u16_row( + rgb48: &[u16], + rgb_out: &mut [u16], + width: usize, +) { debug_assert!(rgb48.len() >= width * 3, "rgb48 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); @@ -316,15 +346,15 @@ pub(crate) unsafe fn wasm_rgb48_to_rgb_u16_row(rgb48: &[u16], rgb_out: &mut [u16 let mut x = 0usize; while x + 8 <= width { let ptr = rgb48.as_ptr().add(x * 3); - let v0 = v128_load(ptr.cast()); - let v1 = v128_load(ptr.add(8).cast()); - let v2 = v128_load(ptr.add(16).cast()); + let v0 = byteswap_if_be::(v128_load(ptr.cast())); + let v1 = byteswap_if_be::(v128_load(ptr.add(8).cast())); + let v2 = byteswap_if_be::(v128_load(ptr.add(16).cast())); let (r, g, b) = deinterleave_rgb48_8px(v0, v1, v2); write_rgb_u16_8(r, g, b, rgb_out.as_mut_ptr().add(x * 3)); x += 8; } if x < width { - scalar::rgb48_to_rgb_u16_row(&rgb48[x * 3..], &mut rgb_out[x * 3..], width - x); + scalar::rgb48_to_rgb_u16_row::(&rgb48[x * 3..], &mut rgb_out[x * 3..], width - x); } } } @@ -338,7 +368,11 @@ pub(crate) unsafe fn wasm_rgb48_to_rgb_u16_row(rgb48: &[u16], rgb_out: &mut [u16 /// 3. `rgba_out.len() >= width * 4`. 
#[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn wasm_rgb48_to_rgba_u16_row(rgb48: &[u16], rgba_out: &mut [u16], width: usize) { +pub(crate) unsafe fn wasm_rgb48_to_rgba_u16_row( + rgb48: &[u16], + rgba_out: &mut [u16], + width: usize, +) { debug_assert!(rgb48.len() >= width * 3, "rgb48 row too short"); debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short"); @@ -347,15 +381,15 @@ pub(crate) unsafe fn wasm_rgb48_to_rgba_u16_row(rgb48: &[u16], rgba_out: &mut [u let mut x = 0usize; while x + 8 <= width { let ptr = rgb48.as_ptr().add(x * 3); - let v0 = v128_load(ptr.cast()); - let v1 = v128_load(ptr.add(8).cast()); - let v2 = v128_load(ptr.add(16).cast()); + let v0 = byteswap_if_be::(v128_load(ptr.cast())); + let v1 = byteswap_if_be::(v128_load(ptr.add(8).cast())); + let v2 = byteswap_if_be::(v128_load(ptr.add(16).cast())); let (r, g, b) = deinterleave_rgb48_8px(v0, v1, v2); write_rgba_u16_8(r, g, b, opaque, rgba_out.as_mut_ptr().add(x * 4)); x += 8; } if x < width { - scalar::rgb48_to_rgba_u16_row(&rgb48[x * 3..], &mut rgba_out[x * 4..], width - x); + scalar::rgb48_to_rgba_u16_row::(&rgb48[x * 3..], &mut rgba_out[x * 4..], width - x); } } } @@ -374,7 +408,11 @@ pub(crate) unsafe fn wasm_rgb48_to_rgba_u16_row(rgb48: &[u16], rgba_out: &mut [u /// 3. `rgb_out.len() >= width * 3`. 
#[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn wasm_bgr48_to_rgb_row(bgr48: &[u16], rgb_out: &mut [u8], width: usize) { +pub(crate) unsafe fn wasm_bgr48_to_rgb_row( + bgr48: &[u16], + rgb_out: &mut [u8], + width: usize, +) { debug_assert!(bgr48.len() >= width * 3, "bgr48 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); @@ -382,9 +420,9 @@ pub(crate) unsafe fn wasm_bgr48_to_rgb_row(bgr48: &[u16], rgb_out: &mut [u8], wi let mut x = 0usize; while x + 8 <= width { let ptr = bgr48.as_ptr().add(x * 3); - let v0 = v128_load(ptr.cast()); - let v1 = v128_load(ptr.add(8).cast()); - let v2 = v128_load(ptr.add(16).cast()); + let v0 = byteswap_if_be::(v128_load(ptr.cast())); + let v1 = byteswap_if_be::(v128_load(ptr.add(8).cast())); + let v2 = byteswap_if_be::(v128_load(ptr.add(16).cast())); // ch0=B, ch1=G, ch2=R let (b, g, r) = deinterleave_rgb48_8px(v0, v1, v2); let r_u8 = narrow_u16x8_to_u8x8(r); @@ -396,7 +434,7 @@ pub(crate) unsafe fn wasm_bgr48_to_rgb_row(bgr48: &[u16], rgb_out: &mut [u8], wi x += 8; } if x < width { - scalar::bgr48_to_rgb_row(&bgr48[x * 3..], &mut rgb_out[x * 3..], width - x); + scalar::bgr48_to_rgb_row::(&bgr48[x * 3..], &mut rgb_out[x * 3..], width - x); } } } @@ -411,7 +449,11 @@ pub(crate) unsafe fn wasm_bgr48_to_rgb_row(bgr48: &[u16], rgb_out: &mut [u8], wi /// 3. `rgba_out.len() >= width * 4`. 
#[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn wasm_bgr48_to_rgba_row(bgr48: &[u16], rgba_out: &mut [u8], width: usize) { +pub(crate) unsafe fn wasm_bgr48_to_rgba_row( + bgr48: &[u16], + rgba_out: &mut [u8], + width: usize, +) { debug_assert!(bgr48.len() >= width * 3, "bgr48 row too short"); debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short"); @@ -420,9 +462,9 @@ pub(crate) unsafe fn wasm_bgr48_to_rgba_row(bgr48: &[u16], rgba_out: &mut [u8], let mut x = 0usize; while x + 8 <= width { let ptr = bgr48.as_ptr().add(x * 3); - let v0 = v128_load(ptr.cast()); - let v1 = v128_load(ptr.add(8).cast()); - let v2 = v128_load(ptr.add(16).cast()); + let v0 = byteswap_if_be::(v128_load(ptr.cast())); + let v1 = byteswap_if_be::(v128_load(ptr.add(8).cast())); + let v2 = byteswap_if_be::(v128_load(ptr.add(16).cast())); let (b, g, r) = deinterleave_rgb48_8px(v0, v1, v2); let r_u8 = narrow_u16x8_to_u8x8(r); let g_u8 = narrow_u16x8_to_u8x8(g); @@ -433,7 +475,7 @@ pub(crate) unsafe fn wasm_bgr48_to_rgba_row(bgr48: &[u16], rgba_out: &mut [u8], x += 8; } if x < width { - scalar::bgr48_to_rgba_row(&bgr48[x * 3..], &mut rgba_out[x * 4..], width - x); + scalar::bgr48_to_rgba_row::(&bgr48[x * 3..], &mut rgba_out[x * 4..], width - x); } } } @@ -448,7 +490,11 @@ pub(crate) unsafe fn wasm_bgr48_to_rgba_row(bgr48: &[u16], rgba_out: &mut [u8], /// 3. `rgb_out.len() >= width * 3`. 
#[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn wasm_bgr48_to_rgb_u16_row(bgr48: &[u16], rgb_out: &mut [u16], width: usize) { +pub(crate) unsafe fn wasm_bgr48_to_rgb_u16_row( + bgr48: &[u16], + rgb_out: &mut [u16], + width: usize, +) { debug_assert!(bgr48.len() >= width * 3, "bgr48 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); @@ -456,16 +502,16 @@ pub(crate) unsafe fn wasm_bgr48_to_rgb_u16_row(bgr48: &[u16], rgb_out: &mut [u16 let mut x = 0usize; while x + 8 <= width { let ptr = bgr48.as_ptr().add(x * 3); - let v0 = v128_load(ptr.cast()); - let v1 = v128_load(ptr.add(8).cast()); - let v2 = v128_load(ptr.add(16).cast()); + let v0 = byteswap_if_be::(v128_load(ptr.cast())); + let v1 = byteswap_if_be::(v128_load(ptr.add(8).cast())); + let v2 = byteswap_if_be::(v128_load(ptr.add(16).cast())); let (b, g, r) = deinterleave_rgb48_8px(v0, v1, v2); // Output R, G, B order write_rgb_u16_8(r, g, b, rgb_out.as_mut_ptr().add(x * 3)); x += 8; } if x < width { - scalar::bgr48_to_rgb_u16_row(&bgr48[x * 3..], &mut rgb_out[x * 3..], width - x); + scalar::bgr48_to_rgb_u16_row::(&bgr48[x * 3..], &mut rgb_out[x * 3..], width - x); } } } @@ -480,7 +526,11 @@ pub(crate) unsafe fn wasm_bgr48_to_rgb_u16_row(bgr48: &[u16], rgb_out: &mut [u16 /// 3. `rgba_out.len() >= width * 4`. 
#[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn wasm_bgr48_to_rgba_u16_row(bgr48: &[u16], rgba_out: &mut [u16], width: usize) { +pub(crate) unsafe fn wasm_bgr48_to_rgba_u16_row( + bgr48: &[u16], + rgba_out: &mut [u16], + width: usize, +) { debug_assert!(bgr48.len() >= width * 3, "bgr48 row too short"); debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short"); @@ -489,16 +539,16 @@ pub(crate) unsafe fn wasm_bgr48_to_rgba_u16_row(bgr48: &[u16], rgba_out: &mut [u let mut x = 0usize; while x + 8 <= width { let ptr = bgr48.as_ptr().add(x * 3); - let v0 = v128_load(ptr.cast()); - let v1 = v128_load(ptr.add(8).cast()); - let v2 = v128_load(ptr.add(16).cast()); + let v0 = byteswap_if_be::(v128_load(ptr.cast())); + let v1 = byteswap_if_be::(v128_load(ptr.add(8).cast())); + let v2 = byteswap_if_be::(v128_load(ptr.add(16).cast())); let (b, g, r) = deinterleave_rgb48_8px(v0, v1, v2); // Output R, G, B, A order write_rgba_u16_8(r, g, b, opaque, rgba_out.as_mut_ptr().add(x * 4)); x += 8; } if x < width { - scalar::bgr48_to_rgba_u16_row(&bgr48[x * 3..], &mut rgba_out[x * 4..], width - x); + scalar::bgr48_to_rgba_u16_row::(&bgr48[x * 3..], &mut rgba_out[x * 4..], width - x); } } } @@ -517,7 +567,11 @@ pub(crate) unsafe fn wasm_bgr48_to_rgba_u16_row(bgr48: &[u16], rgba_out: &mut [u /// 3. `rgb_out.len() >= width * 3`. 
#[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn wasm_rgba64_to_rgb_row(rgba64: &[u16], rgb_out: &mut [u8], width: usize) { +pub(crate) unsafe fn wasm_rgba64_to_rgb_row( + rgba64: &[u16], + rgb_out: &mut [u8], + width: usize, +) { debug_assert!(rgba64.len() >= width * 4, "rgba64 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); @@ -525,10 +579,10 @@ pub(crate) unsafe fn wasm_rgba64_to_rgb_row(rgba64: &[u16], rgb_out: &mut [u8], let mut x = 0usize; while x + 8 <= width { let ptr = rgba64.as_ptr().add(x * 4); - let raw0 = v128_load(ptr.cast()); - let raw1 = v128_load(ptr.add(8).cast()); - let raw2 = v128_load(ptr.add(16).cast()); - let raw3 = v128_load(ptr.add(24).cast()); + let raw0 = byteswap_if_be::(v128_load(ptr.cast())); + let raw1 = byteswap_if_be::(v128_load(ptr.add(8).cast())); + let raw2 = byteswap_if_be::(v128_load(ptr.add(16).cast())); + let raw3 = byteswap_if_be::(v128_load(ptr.add(24).cast())); let (r, g, b, _a) = deinterleave_rgba64_8px(raw0, raw1, raw2, raw3); let r_u8 = narrow_u16x8_to_u8x8(r); let g_u8 = narrow_u16x8_to_u8x8(g); @@ -539,7 +593,7 @@ pub(crate) unsafe fn wasm_rgba64_to_rgb_row(rgba64: &[u16], rgb_out: &mut [u8], x += 8; } if x < width { - scalar::rgba64_to_rgb_row(&rgba64[x * 4..], &mut rgb_out[x * 3..], width - x); + scalar::rgba64_to_rgb_row::(&rgba64[x * 4..], &mut rgb_out[x * 3..], width - x); } } } @@ -554,7 +608,11 @@ pub(crate) unsafe fn wasm_rgba64_to_rgb_row(rgba64: &[u16], rgb_out: &mut [u8], /// 3. `rgba_out.len() >= width * 4`. 
#[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn wasm_rgba64_to_rgba_row(rgba64: &[u16], rgba_out: &mut [u8], width: usize) { +pub(crate) unsafe fn wasm_rgba64_to_rgba_row( + rgba64: &[u16], + rgba_out: &mut [u8], + width: usize, +) { debug_assert!(rgba64.len() >= width * 4, "rgba64 row too short"); debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short"); @@ -562,10 +620,10 @@ pub(crate) unsafe fn wasm_rgba64_to_rgba_row(rgba64: &[u16], rgba_out: &mut [u8] let mut x = 0usize; while x + 8 <= width { let ptr = rgba64.as_ptr().add(x * 4); - let raw0 = v128_load(ptr.cast()); - let raw1 = v128_load(ptr.add(8).cast()); - let raw2 = v128_load(ptr.add(16).cast()); - let raw3 = v128_load(ptr.add(24).cast()); + let raw0 = byteswap_if_be::(v128_load(ptr.cast())); + let raw1 = byteswap_if_be::(v128_load(ptr.add(8).cast())); + let raw2 = byteswap_if_be::(v128_load(ptr.add(16).cast())); + let raw3 = byteswap_if_be::(v128_load(ptr.add(24).cast())); let (r, g, b, a) = deinterleave_rgba64_8px(raw0, raw1, raw2, raw3); let r_u8 = narrow_u16x8_to_u8x8(r); let g_u8 = narrow_u16x8_to_u8x8(g); @@ -577,7 +635,7 @@ pub(crate) unsafe fn wasm_rgba64_to_rgba_row(rgba64: &[u16], rgba_out: &mut [u8] x += 8; } if x < width { - scalar::rgba64_to_rgba_row(&rgba64[x * 4..], &mut rgba_out[x * 4..], width - x); + scalar::rgba64_to_rgba_row::(&rgba64[x * 4..], &mut rgba_out[x * 4..], width - x); } } } @@ -592,7 +650,11 @@ pub(crate) unsafe fn wasm_rgba64_to_rgba_row(rgba64: &[u16], rgba_out: &mut [u8] /// 3. `rgb_out.len() >= width * 3`. 
#[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn wasm_rgba64_to_rgb_u16_row(rgba64: &[u16], rgb_out: &mut [u16], width: usize) { +pub(crate) unsafe fn wasm_rgba64_to_rgb_u16_row( + rgba64: &[u16], + rgb_out: &mut [u16], + width: usize, +) { debug_assert!(rgba64.len() >= width * 4, "rgba64 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); @@ -600,16 +662,16 @@ pub(crate) unsafe fn wasm_rgba64_to_rgb_u16_row(rgba64: &[u16], rgb_out: &mut [u let mut x = 0usize; while x + 8 <= width { let ptr = rgba64.as_ptr().add(x * 4); - let raw0 = v128_load(ptr.cast()); - let raw1 = v128_load(ptr.add(8).cast()); - let raw2 = v128_load(ptr.add(16).cast()); - let raw3 = v128_load(ptr.add(24).cast()); + let raw0 = byteswap_if_be::(v128_load(ptr.cast())); + let raw1 = byteswap_if_be::(v128_load(ptr.add(8).cast())); + let raw2 = byteswap_if_be::(v128_load(ptr.add(16).cast())); + let raw3 = byteswap_if_be::(v128_load(ptr.add(24).cast())); let (r, g, b, _a) = deinterleave_rgba64_8px(raw0, raw1, raw2, raw3); write_rgb_u16_8(r, g, b, rgb_out.as_mut_ptr().add(x * 3)); x += 8; } if x < width { - scalar::rgba64_to_rgb_u16_row(&rgba64[x * 4..], &mut rgb_out[x * 3..], width - x); + scalar::rgba64_to_rgb_u16_row::(&rgba64[x * 4..], &mut rgb_out[x * 3..], width - x); } } } @@ -624,7 +686,7 @@ pub(crate) unsafe fn wasm_rgba64_to_rgb_u16_row(rgba64: &[u16], rgb_out: &mut [u /// 3. `rgba_out.len() >= width * 4`. 
#[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn wasm_rgba64_to_rgba_u16_row( +pub(crate) unsafe fn wasm_rgba64_to_rgba_u16_row( rgba64: &[u16], rgba_out: &mut [u16], width: usize, @@ -636,16 +698,16 @@ pub(crate) unsafe fn wasm_rgba64_to_rgba_u16_row( let mut x = 0usize; while x + 8 <= width { let ptr = rgba64.as_ptr().add(x * 4); - let raw0 = v128_load(ptr.cast()); - let raw1 = v128_load(ptr.add(8).cast()); - let raw2 = v128_load(ptr.add(16).cast()); - let raw3 = v128_load(ptr.add(24).cast()); + let raw0 = byteswap_if_be::(v128_load(ptr.cast())); + let raw1 = byteswap_if_be::(v128_load(ptr.add(8).cast())); + let raw2 = byteswap_if_be::(v128_load(ptr.add(16).cast())); + let raw3 = byteswap_if_be::(v128_load(ptr.add(24).cast())); let (r, g, b, a) = deinterleave_rgba64_8px(raw0, raw1, raw2, raw3); write_rgba_u16_8(r, g, b, a, rgba_out.as_mut_ptr().add(x * 4)); x += 8; } if x < width { - scalar::rgba64_to_rgba_u16_row(&rgba64[x * 4..], &mut rgba_out[x * 4..], width - x); + scalar::rgba64_to_rgba_u16_row::(&rgba64[x * 4..], &mut rgba_out[x * 4..], width - x); } } } @@ -666,7 +728,11 @@ pub(crate) unsafe fn wasm_rgba64_to_rgba_u16_row( /// 3. `rgb_out.len() >= width * 3`. 
#[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn wasm_bgra64_to_rgb_row(bgra64: &[u16], rgb_out: &mut [u8], width: usize) { +pub(crate) unsafe fn wasm_bgra64_to_rgb_row( + bgra64: &[u16], + rgb_out: &mut [u8], + width: usize, +) { debug_assert!(bgra64.len() >= width * 4, "bgra64 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); @@ -674,10 +740,10 @@ pub(crate) unsafe fn wasm_bgra64_to_rgb_row(bgra64: &[u16], rgb_out: &mut [u8], let mut x = 0usize; while x + 8 <= width { let ptr = bgra64.as_ptr().add(x * 4); - let raw0 = v128_load(ptr.cast()); - let raw1 = v128_load(ptr.add(8).cast()); - let raw2 = v128_load(ptr.add(16).cast()); - let raw3 = v128_load(ptr.add(24).cast()); + let raw0 = byteswap_if_be::(v128_load(ptr.cast())); + let raw1 = byteswap_if_be::(v128_load(ptr.add(8).cast())); + let raw2 = byteswap_if_be::(v128_load(ptr.add(16).cast())); + let raw3 = byteswap_if_be::(v128_load(ptr.add(24).cast())); // ch0=B, ch1=G, ch2=R, ch3=A let (b, g, r, _a) = deinterleave_rgba64_8px(raw0, raw1, raw2, raw3); let r_u8 = narrow_u16x8_to_u8x8(r); @@ -689,7 +755,7 @@ pub(crate) unsafe fn wasm_bgra64_to_rgb_row(bgra64: &[u16], rgb_out: &mut [u8], x += 8; } if x < width { - scalar::bgra64_to_rgb_row(&bgra64[x * 4..], &mut rgb_out[x * 3..], width - x); + scalar::bgra64_to_rgb_row::(&bgra64[x * 4..], &mut rgb_out[x * 3..], width - x); } } } @@ -704,7 +770,11 @@ pub(crate) unsafe fn wasm_bgra64_to_rgb_row(bgra64: &[u16], rgb_out: &mut [u8], /// 3. `rgba_out.len() >= width * 4`. 
#[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn wasm_bgra64_to_rgba_row(bgra64: &[u16], rgba_out: &mut [u8], width: usize) { +pub(crate) unsafe fn wasm_bgra64_to_rgba_row( + bgra64: &[u16], + rgba_out: &mut [u8], + width: usize, +) { debug_assert!(bgra64.len() >= width * 4, "bgra64 row too short"); debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short"); @@ -712,10 +782,10 @@ pub(crate) unsafe fn wasm_bgra64_to_rgba_row(bgra64: &[u16], rgba_out: &mut [u8] let mut x = 0usize; while x + 8 <= width { let ptr = bgra64.as_ptr().add(x * 4); - let raw0 = v128_load(ptr.cast()); - let raw1 = v128_load(ptr.add(8).cast()); - let raw2 = v128_load(ptr.add(16).cast()); - let raw3 = v128_load(ptr.add(24).cast()); + let raw0 = byteswap_if_be::(v128_load(ptr.cast())); + let raw1 = byteswap_if_be::(v128_load(ptr.add(8).cast())); + let raw2 = byteswap_if_be::(v128_load(ptr.add(16).cast())); + let raw3 = byteswap_if_be::(v128_load(ptr.add(24).cast())); let (b, g, r, a) = deinterleave_rgba64_8px(raw0, raw1, raw2, raw3); let r_u8 = narrow_u16x8_to_u8x8(r); let g_u8 = narrow_u16x8_to_u8x8(g); @@ -727,7 +797,7 @@ pub(crate) unsafe fn wasm_bgra64_to_rgba_row(bgra64: &[u16], rgba_out: &mut [u8] x += 8; } if x < width { - scalar::bgra64_to_rgba_row(&bgra64[x * 4..], &mut rgba_out[x * 4..], width - x); + scalar::bgra64_to_rgba_row::(&bgra64[x * 4..], &mut rgba_out[x * 4..], width - x); } } } @@ -742,7 +812,11 @@ pub(crate) unsafe fn wasm_bgra64_to_rgba_row(bgra64: &[u16], rgba_out: &mut [u8] /// 3. `rgb_out.len() >= width * 3`. 
#[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn wasm_bgra64_to_rgb_u16_row(bgra64: &[u16], rgb_out: &mut [u16], width: usize) { +pub(crate) unsafe fn wasm_bgra64_to_rgb_u16_row( + bgra64: &[u16], + rgb_out: &mut [u16], + width: usize, +) { debug_assert!(bgra64.len() >= width * 4, "bgra64 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); @@ -750,17 +824,17 @@ pub(crate) unsafe fn wasm_bgra64_to_rgb_u16_row(bgra64: &[u16], rgb_out: &mut [u let mut x = 0usize; while x + 8 <= width { let ptr = bgra64.as_ptr().add(x * 4); - let raw0 = v128_load(ptr.cast()); - let raw1 = v128_load(ptr.add(8).cast()); - let raw2 = v128_load(ptr.add(16).cast()); - let raw3 = v128_load(ptr.add(24).cast()); + let raw0 = byteswap_if_be::(v128_load(ptr.cast())); + let raw1 = byteswap_if_be::(v128_load(ptr.add(8).cast())); + let raw2 = byteswap_if_be::(v128_load(ptr.add(16).cast())); + let raw3 = byteswap_if_be::(v128_load(ptr.add(24).cast())); // Swap B↔R: output (R=ch2, G=ch1, B=ch0) let (b, g, r, _a) = deinterleave_rgba64_8px(raw0, raw1, raw2, raw3); write_rgb_u16_8(r, g, b, rgb_out.as_mut_ptr().add(x * 3)); x += 8; } if x < width { - scalar::bgra64_to_rgb_u16_row(&bgra64[x * 4..], &mut rgb_out[x * 3..], width - x); + scalar::bgra64_to_rgb_u16_row::(&bgra64[x * 4..], &mut rgb_out[x * 3..], width - x); } } } @@ -775,7 +849,7 @@ pub(crate) unsafe fn wasm_bgra64_to_rgb_u16_row(bgra64: &[u16], rgb_out: &mut [u /// 3. `rgba_out.len() >= width * 4`. 
#[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn wasm_bgra64_to_rgba_u16_row( +pub(crate) unsafe fn wasm_bgra64_to_rgba_u16_row( bgra64: &[u16], rgba_out: &mut [u16], width: usize, @@ -787,17 +861,17 @@ pub(crate) unsafe fn wasm_bgra64_to_rgba_u16_row( let mut x = 0usize; while x + 8 <= width { let ptr = bgra64.as_ptr().add(x * 4); - let raw0 = v128_load(ptr.cast()); - let raw1 = v128_load(ptr.add(8).cast()); - let raw2 = v128_load(ptr.add(16).cast()); - let raw3 = v128_load(ptr.add(24).cast()); + let raw0 = byteswap_if_be::(v128_load(ptr.cast())); + let raw1 = byteswap_if_be::(v128_load(ptr.add(8).cast())); + let raw2 = byteswap_if_be::(v128_load(ptr.add(16).cast())); + let raw3 = byteswap_if_be::(v128_load(ptr.add(24).cast())); // Swap B↔R: output (R=ch2, G=ch1, B=ch0, A=ch3) let (b, g, r, a) = deinterleave_rgba64_8px(raw0, raw1, raw2, raw3); write_rgba_u16_8(r, g, b, a, rgba_out.as_mut_ptr().add(x * 4)); x += 8; } if x < width { - scalar::bgra64_to_rgba_u16_row(&bgra64[x * 4..], &mut rgba_out[x * 4..], width - x); + scalar::bgra64_to_rgba_u16_row::(&bgra64[x * 4..], &mut rgba_out[x * 4..], width - x); } } } diff --git a/src/row/arch/wasm_simd128/tests/packed_rgb.rs b/src/row/arch/wasm_simd128/tests/packed_rgb.rs index dbd979af..6e99d430 100644 --- a/src/row/arch/wasm_simd128/tests/packed_rgb.rs +++ b/src/row/arch/wasm_simd128/tests/packed_rgb.rs @@ -207,9 +207,9 @@ fn simd128_x2rgb10_to_rgb_matches_scalar() { let input = pseudo_random_rgba(w); let mut out_scalar = std::vec![0u8; w * 3]; let mut out_wasm = std::vec![0u8; w * 3]; - scalar::x2rgb10_to_rgb_row(&input, &mut out_scalar, w); + scalar::x2rgb10_to_rgb_row::(&input, &mut out_scalar, w); unsafe { - x2rgb10_to_rgb_row(&input, &mut out_wasm, w); + x2rgb10_to_rgb_row::(&input, &mut out_wasm, w); } assert_eq!( out_scalar, out_wasm, @@ -224,9 +224,9 @@ fn simd128_x2rgb10_to_rgba_matches_scalar() { let input = pseudo_random_rgba(w); let mut out_scalar = std::vec![0u8; w * 4]; let mut 
out_wasm = std::vec![0u8; w * 4]; - scalar::x2rgb10_to_rgba_row(&input, &mut out_scalar, w); + scalar::x2rgb10_to_rgba_row::(&input, &mut out_scalar, w); unsafe { - x2rgb10_to_rgba_row(&input, &mut out_wasm, w); + x2rgb10_to_rgba_row::(&input, &mut out_wasm, w); } assert_eq!( out_scalar, out_wasm, @@ -241,9 +241,9 @@ fn simd128_x2rgb10_to_rgb_u16_matches_scalar() { let input = pseudo_random_rgba(w); let mut out_scalar = std::vec![0u16; w * 3]; let mut out_wasm = std::vec![0u16; w * 3]; - scalar::x2rgb10_to_rgb_u16_row(&input, &mut out_scalar, w); + scalar::x2rgb10_to_rgb_u16_row::(&input, &mut out_scalar, w); unsafe { - x2rgb10_to_rgb_u16_row(&input, &mut out_wasm, w); + x2rgb10_to_rgb_u16_row::(&input, &mut out_wasm, w); } assert_eq!( out_scalar, out_wasm, @@ -258,9 +258,9 @@ fn simd128_x2bgr10_to_rgb_matches_scalar() { let input = pseudo_random_rgba(w); let mut out_scalar = std::vec![0u8; w * 3]; let mut out_wasm = std::vec![0u8; w * 3]; - scalar::x2bgr10_to_rgb_row(&input, &mut out_scalar, w); + scalar::x2bgr10_to_rgb_row::(&input, &mut out_scalar, w); unsafe { - x2bgr10_to_rgb_row(&input, &mut out_wasm, w); + x2bgr10_to_rgb_row::(&input, &mut out_wasm, w); } assert_eq!( out_scalar, out_wasm, @@ -275,9 +275,9 @@ fn simd128_x2bgr10_to_rgba_matches_scalar() { let input = pseudo_random_rgba(w); let mut out_scalar = std::vec![0u8; w * 4]; let mut out_wasm = std::vec![0u8; w * 4]; - scalar::x2bgr10_to_rgba_row(&input, &mut out_scalar, w); + scalar::x2bgr10_to_rgba_row::(&input, &mut out_scalar, w); unsafe { - x2bgr10_to_rgba_row(&input, &mut out_wasm, w); + x2bgr10_to_rgba_row::(&input, &mut out_wasm, w); } assert_eq!( out_scalar, out_wasm, @@ -292,9 +292,9 @@ fn simd128_x2bgr10_to_rgb_u16_matches_scalar() { let input = pseudo_random_rgba(w); let mut out_scalar = std::vec![0u16; w * 3]; let mut out_wasm = std::vec![0u16; w * 3]; - scalar::x2bgr10_to_rgb_u16_row(&input, &mut out_scalar, w); + scalar::x2bgr10_to_rgb_u16_row::(&input, &mut out_scalar, w); unsafe { - 
x2bgr10_to_rgb_u16_row(&input, &mut out_wasm, w); + x2bgr10_to_rgb_u16_row::(&input, &mut out_wasm, w); } assert_eq!( out_scalar, out_wasm, diff --git a/src/row/arch/wasm_simd128/tests/packed_rgb_16bit.rs b/src/row/arch/wasm_simd128/tests/packed_rgb_16bit.rs index e2ab1f78..40b1e770 100644 --- a/src/row/arch/wasm_simd128/tests/packed_rgb_16bit.rs +++ b/src/row/arch/wasm_simd128/tests/packed_rgb_16bit.rs @@ -36,8 +36,8 @@ fn wasm_rgb48_to_rgb_matches_scalar() { let src = pseudo_random_u16(w * 3, 0xDEAD_BEEF_1234_5678); let mut scalar_out = std::vec![0u8; w * 3]; let mut simd_out = std::vec![0u8; w * 3]; - scalar::rgb48_to_rgb_row(&src, &mut scalar_out, w); - unsafe { wasm_rgb48_to_rgb_row(&src, &mut simd_out, w) }; + scalar::rgb48_to_rgb_row::(&src, &mut scalar_out, w); + unsafe { wasm_rgb48_to_rgb_row::(&src, &mut simd_out, w) }; assert_eq!(scalar_out, simd_out, "rgb48→rgb diverges (width={w})"); } } @@ -49,8 +49,8 @@ fn wasm_rgb48_to_rgba_matches_scalar() { let src = pseudo_random_u16(w * 3, 0xCAFE_BABE_DEAD_1234); let mut scalar_out = std::vec![0u8; w * 4]; let mut simd_out = std::vec![0u8; w * 4]; - scalar::rgb48_to_rgba_row(&src, &mut scalar_out, w); - unsafe { wasm_rgb48_to_rgba_row(&src, &mut simd_out, w) }; + scalar::rgb48_to_rgba_row::(&src, &mut scalar_out, w); + unsafe { wasm_rgb48_to_rgba_row::(&src, &mut simd_out, w) }; assert_eq!(scalar_out, simd_out, "rgb48→rgba diverges (width={w})"); } } @@ -62,8 +62,8 @@ fn wasm_rgb48_to_rgb_u16_matches_scalar() { let src = pseudo_random_u16(w * 3, 0xFEED_FACE_ABCD_EF01); let mut scalar_out = std::vec![0u16; w * 3]; let mut simd_out = std::vec![0u16; w * 3]; - scalar::rgb48_to_rgb_u16_row(&src, &mut scalar_out, w); - unsafe { wasm_rgb48_to_rgb_u16_row(&src, &mut simd_out, w) }; + scalar::rgb48_to_rgb_u16_row::(&src, &mut scalar_out, w); + unsafe { wasm_rgb48_to_rgb_u16_row::(&src, &mut simd_out, w) }; assert_eq!(scalar_out, simd_out, "rgb48→rgb_u16 diverges (width={w})"); } } @@ -75,8 +75,8 @@ fn 
wasm_rgb48_to_rgba_u16_matches_scalar() { let src = pseudo_random_u16(w * 3, 0x1234_5678_9ABC_DEF0); let mut scalar_out = std::vec![0u16; w * 4]; let mut simd_out = std::vec![0u16; w * 4]; - scalar::rgb48_to_rgba_u16_row(&src, &mut scalar_out, w); - unsafe { wasm_rgb48_to_rgba_u16_row(&src, &mut simd_out, w) }; + scalar::rgb48_to_rgba_u16_row::(&src, &mut scalar_out, w); + unsafe { wasm_rgb48_to_rgba_u16_row::(&src, &mut simd_out, w) }; assert_eq!(scalar_out, simd_out, "rgb48→rgba_u16 diverges (width={w})"); } } @@ -92,8 +92,8 @@ fn wasm_bgr48_to_rgb_matches_scalar() { let src = pseudo_random_u16(w * 3, 0xABCD_EF01_2345_6789); let mut scalar_out = std::vec![0u8; w * 3]; let mut simd_out = std::vec![0u8; w * 3]; - scalar::bgr48_to_rgb_row(&src, &mut scalar_out, w); - unsafe { wasm_bgr48_to_rgb_row(&src, &mut simd_out, w) }; + scalar::bgr48_to_rgb_row::(&src, &mut scalar_out, w); + unsafe { wasm_bgr48_to_rgb_row::(&src, &mut simd_out, w) }; assert_eq!(scalar_out, simd_out, "bgr48→rgb diverges (width={w})"); } } @@ -105,8 +105,8 @@ fn wasm_bgr48_to_rgba_matches_scalar() { let src = pseudo_random_u16(w * 3, 0x9876_5432_10FE_DCBA); let mut scalar_out = std::vec![0u8; w * 4]; let mut simd_out = std::vec![0u8; w * 4]; - scalar::bgr48_to_rgba_row(&src, &mut scalar_out, w); - unsafe { wasm_bgr48_to_rgba_row(&src, &mut simd_out, w) }; + scalar::bgr48_to_rgba_row::(&src, &mut scalar_out, w); + unsafe { wasm_bgr48_to_rgba_row::(&src, &mut simd_out, w) }; assert_eq!(scalar_out, simd_out, "bgr48→rgba diverges (width={w})"); } } @@ -118,8 +118,8 @@ fn wasm_bgr48_to_rgb_u16_matches_scalar() { let src = pseudo_random_u16(w * 3, 0x0011_2233_4455_6677); let mut scalar_out = std::vec![0u16; w * 3]; let mut simd_out = std::vec![0u16; w * 3]; - scalar::bgr48_to_rgb_u16_row(&src, &mut scalar_out, w); - unsafe { wasm_bgr48_to_rgb_u16_row(&src, &mut simd_out, w) }; + scalar::bgr48_to_rgb_u16_row::(&src, &mut scalar_out, w); + unsafe { wasm_bgr48_to_rgb_u16_row::(&src, &mut simd_out, w) }; 
assert_eq!(scalar_out, simd_out, "bgr48→rgb_u16 diverges (width={w})"); } } @@ -131,8 +131,8 @@ fn wasm_bgr48_to_rgba_u16_matches_scalar() { let src = pseudo_random_u16(w * 3, 0x8899_AABB_CCDD_EEFF); let mut scalar_out = std::vec![0u16; w * 4]; let mut simd_out = std::vec![0u16; w * 4]; - scalar::bgr48_to_rgba_u16_row(&src, &mut scalar_out, w); - unsafe { wasm_bgr48_to_rgba_u16_row(&src, &mut simd_out, w) }; + scalar::bgr48_to_rgba_u16_row::(&src, &mut scalar_out, w); + unsafe { wasm_bgr48_to_rgba_u16_row::(&src, &mut simd_out, w) }; assert_eq!(scalar_out, simd_out, "bgr48→rgba_u16 diverges (width={w})"); } } @@ -148,8 +148,8 @@ fn wasm_rgba64_to_rgb_matches_scalar() { let src = pseudo_random_u16(w * 4, 0xF0F0_F0F0_0F0F_0F0F); let mut scalar_out = std::vec![0u8; w * 3]; let mut simd_out = std::vec![0u8; w * 3]; - scalar::rgba64_to_rgb_row(&src, &mut scalar_out, w); - unsafe { wasm_rgba64_to_rgb_row(&src, &mut simd_out, w) }; + scalar::rgba64_to_rgb_row::(&src, &mut scalar_out, w); + unsafe { wasm_rgba64_to_rgb_row::(&src, &mut simd_out, w) }; assert_eq!(scalar_out, simd_out, "rgba64→rgb diverges (width={w})"); } } @@ -161,8 +161,8 @@ fn wasm_rgba64_to_rgba_matches_scalar() { let src = pseudo_random_u16(w * 4, 0x1357_9BDF_2468_ACE0); let mut scalar_out = std::vec![0u8; w * 4]; let mut simd_out = std::vec![0u8; w * 4]; - scalar::rgba64_to_rgba_row(&src, &mut scalar_out, w); - unsafe { wasm_rgba64_to_rgba_row(&src, &mut simd_out, w) }; + scalar::rgba64_to_rgba_row::(&src, &mut scalar_out, w); + unsafe { wasm_rgba64_to_rgba_row::(&src, &mut simd_out, w) }; assert_eq!(scalar_out, simd_out, "rgba64→rgba diverges (width={w})"); } } @@ -174,8 +174,8 @@ fn wasm_rgba64_to_rgb_u16_matches_scalar() { let src = pseudo_random_u16(w * 4, 0x2468_ACE0_1357_9BDF); let mut scalar_out = std::vec![0u16; w * 3]; let mut simd_out = std::vec![0u16; w * 3]; - scalar::rgba64_to_rgb_u16_row(&src, &mut scalar_out, w); - unsafe { wasm_rgba64_to_rgb_u16_row(&src, &mut simd_out, w) }; + 
scalar::rgba64_to_rgb_u16_row::(&src, &mut scalar_out, w); + unsafe { wasm_rgba64_to_rgb_u16_row::(&src, &mut simd_out, w) }; assert_eq!(scalar_out, simd_out, "rgba64→rgb_u16 diverges (width={w})"); } } @@ -187,8 +187,8 @@ fn wasm_rgba64_to_rgba_u16_matches_scalar() { let src = pseudo_random_u16(w * 4, 0x3C3C_C3C3_5A5A_A5A5); let mut scalar_out = std::vec![0u16; w * 4]; let mut simd_out = std::vec![0u16; w * 4]; - scalar::rgba64_to_rgba_u16_row(&src, &mut scalar_out, w); - unsafe { wasm_rgba64_to_rgba_u16_row(&src, &mut simd_out, w) }; + scalar::rgba64_to_rgba_u16_row::(&src, &mut scalar_out, w); + unsafe { wasm_rgba64_to_rgba_u16_row::(&src, &mut simd_out, w) }; assert_eq!(scalar_out, simd_out, "rgba64→rgba_u16 diverges (width={w})"); } } @@ -204,8 +204,8 @@ fn wasm_bgra64_to_rgb_matches_scalar() { let src = pseudo_random_u16(w * 4, 0x7654_3210_FEDC_BA98); let mut scalar_out = std::vec![0u8; w * 3]; let mut simd_out = std::vec![0u8; w * 3]; - scalar::bgra64_to_rgb_row(&src, &mut scalar_out, w); - unsafe { wasm_bgra64_to_rgb_row(&src, &mut simd_out, w) }; + scalar::bgra64_to_rgb_row::(&src, &mut scalar_out, w); + unsafe { wasm_bgra64_to_rgb_row::(&src, &mut simd_out, w) }; assert_eq!(scalar_out, simd_out, "bgra64→rgb diverges (width={w})"); } } @@ -217,8 +217,8 @@ fn wasm_bgra64_to_rgba_matches_scalar() { let src = pseudo_random_u16(w * 4, 0xAABB_CCDD_EEFF_0011); let mut scalar_out = std::vec![0u8; w * 4]; let mut simd_out = std::vec![0u8; w * 4]; - scalar::bgra64_to_rgba_row(&src, &mut scalar_out, w); - unsafe { wasm_bgra64_to_rgba_row(&src, &mut simd_out, w) }; + scalar::bgra64_to_rgba_row::(&src, &mut scalar_out, w); + unsafe { wasm_bgra64_to_rgba_row::(&src, &mut simd_out, w) }; assert_eq!(scalar_out, simd_out, "bgra64→rgba diverges (width={w})"); } } @@ -230,8 +230,8 @@ fn wasm_bgra64_to_rgb_u16_matches_scalar() { let src = pseudo_random_u16(w * 4, 0x5566_7788_99AA_BBCC); let mut scalar_out = std::vec![0u16; w * 3]; let mut simd_out = std::vec![0u16; w * 3]; - 
scalar::bgra64_to_rgb_u16_row(&src, &mut scalar_out, w); - unsafe { wasm_bgra64_to_rgb_u16_row(&src, &mut simd_out, w) }; + scalar::bgra64_to_rgb_u16_row::(&src, &mut scalar_out, w); + unsafe { wasm_bgra64_to_rgb_u16_row::(&src, &mut simd_out, w) }; assert_eq!(scalar_out, simd_out, "bgra64→rgb_u16 diverges (width={w})"); } } @@ -243,8 +243,8 @@ fn wasm_bgra64_to_rgba_u16_matches_scalar() { let src = pseudo_random_u16(w * 4, 0xDDEE_FF00_1122_3344); let mut scalar_out = std::vec![0u16; w * 4]; let mut simd_out = std::vec![0u16; w * 4]; - scalar::bgra64_to_rgba_u16_row(&src, &mut scalar_out, w); - unsafe { wasm_bgra64_to_rgba_u16_row(&src, &mut simd_out, w) }; + scalar::bgra64_to_rgba_u16_row::(&src, &mut scalar_out, w); + unsafe { wasm_bgra64_to_rgba_u16_row::(&src, &mut simd_out, w) }; assert_eq!(scalar_out, simd_out, "bgra64→rgba_u16 diverges (width={w})"); } } diff --git a/src/row/arch/x86_avx2/packed_rgb.rs b/src/row/arch/x86_avx2/packed_rgb.rs index bfae38a0..b90174f8 100644 --- a/src/row/arch/x86_avx2/packed_rgb.rs +++ b/src/row/arch/x86_avx2/packed_rgb.rs @@ -445,21 +445,27 @@ pub(crate) unsafe fn bgrx_to_rgba_row(bgrx: &[u8], rgba_out: &mut [u8], width: u /// [`super::x86_common::x2rgb10_to_rgb_16_pixels`]. 
#[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn x2rgb10_to_rgb_row(x2rgb10: &[u8], rgb_out: &mut [u8], width: usize) { +pub(crate) unsafe fn x2rgb10_to_rgb_row( + x2rgb10: &[u8], + rgb_out: &mut [u8], + width: usize, +) { debug_assert!(x2rgb10.len() >= width * 4, "x2rgb10 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); unsafe { let mut x = 0usize; - while x + 32 <= width { - let base_in = x2rgb10.as_ptr().add(x * 4); - let base_out = rgb_out.as_mut_ptr().add(x * 3); - x2rgb10_to_rgb_16_pixels(base_in, base_out); - x2rgb10_to_rgb_16_pixels(base_in.add(64), base_out.add(48)); - x += 32; + if !BE { + while x + 32 <= width { + let base_in = x2rgb10.as_ptr().add(x * 4); + let base_out = rgb_out.as_mut_ptr().add(x * 3); + x2rgb10_to_rgb_16_pixels(base_in, base_out); + x2rgb10_to_rgb_16_pixels(base_in.add(64), base_out.add(48)); + x += 32; + } } if x < width { - scalar::x2rgb10_to_rgb_row( + scalar::x2rgb10_to_rgb_row::( &x2rgb10[x * 4..width * 4], &mut rgb_out[x * 3..width * 3], width - x, @@ -471,21 +477,27 @@ pub(crate) unsafe fn x2rgb10_to_rgb_row(x2rgb10: &[u8], rgb_out: &mut [u8], widt /// AVX2 X2RGB10→RGBA. 32 pixels per iteration. 
#[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn x2rgb10_to_rgba_row(x2rgb10: &[u8], rgba_out: &mut [u8], width: usize) { +pub(crate) unsafe fn x2rgb10_to_rgba_row( + x2rgb10: &[u8], + rgba_out: &mut [u8], + width: usize, +) { debug_assert!(x2rgb10.len() >= width * 4, "x2rgb10 row too short"); debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short"); unsafe { let mut x = 0usize; - while x + 32 <= width { - let base_in = x2rgb10.as_ptr().add(x * 4); - let base_out = rgba_out.as_mut_ptr().add(x * 4); - x2rgb10_to_rgba_16_pixels(base_in, base_out); - x2rgb10_to_rgba_16_pixels(base_in.add(64), base_out.add(64)); - x += 32; + if !BE { + while x + 32 <= width { + let base_in = x2rgb10.as_ptr().add(x * 4); + let base_out = rgba_out.as_mut_ptr().add(x * 4); + x2rgb10_to_rgba_16_pixels(base_in, base_out); + x2rgb10_to_rgba_16_pixels(base_in.add(64), base_out.add(64)); + x += 32; + } } if x < width { - scalar::x2rgb10_to_rgba_row( + scalar::x2rgb10_to_rgba_row::( &x2rgb10[x * 4..width * 4], &mut rgba_out[x * 4..width * 4], width - x, @@ -497,21 +509,27 @@ pub(crate) unsafe fn x2rgb10_to_rgba_row(x2rgb10: &[u8], rgba_out: &mut [u8], wi /// AVX2 X2RGB10→u16 RGB native. 16 pixels per iteration. 
#[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn x2rgb10_to_rgb_u16_row(x2rgb10: &[u8], rgb_out: &mut [u16], width: usize) { +pub(crate) unsafe fn x2rgb10_to_rgb_u16_row( + x2rgb10: &[u8], + rgb_out: &mut [u16], + width: usize, +) { debug_assert!(x2rgb10.len() >= width * 4, "x2rgb10 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); unsafe { let mut x = 0usize; - while x + 16 <= width { - let base_in = x2rgb10.as_ptr().add(x * 4); - let base_out = rgb_out.as_mut_ptr().add(x * 3).cast::(); - x2rgb10_to_rgb_u16_8_pixels(base_in, base_out); - x2rgb10_to_rgb_u16_8_pixels(base_in.add(32), base_out.add(48)); - x += 16; + if !BE { + while x + 16 <= width { + let base_in = x2rgb10.as_ptr().add(x * 4); + let base_out = rgb_out.as_mut_ptr().add(x * 3).cast::(); + x2rgb10_to_rgb_u16_8_pixels(base_in, base_out); + x2rgb10_to_rgb_u16_8_pixels(base_in.add(32), base_out.add(48)); + x += 16; + } } if x < width { - scalar::x2rgb10_to_rgb_u16_row( + scalar::x2rgb10_to_rgb_u16_row::( &x2rgb10[x * 4..width * 4], &mut rgb_out[x * 3..width * 3], width - x, @@ -523,21 +541,27 @@ pub(crate) unsafe fn x2rgb10_to_rgb_u16_row(x2rgb10: &[u8], rgb_out: &mut [u16], /// AVX2 X2BGR10→RGB. 32 pixels per iteration. 
#[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn x2bgr10_to_rgb_row(x2bgr10: &[u8], rgb_out: &mut [u8], width: usize) { +pub(crate) unsafe fn x2bgr10_to_rgb_row( + x2bgr10: &[u8], + rgb_out: &mut [u8], + width: usize, +) { debug_assert!(x2bgr10.len() >= width * 4, "x2bgr10 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); unsafe { let mut x = 0usize; - while x + 32 <= width { - let base_in = x2bgr10.as_ptr().add(x * 4); - let base_out = rgb_out.as_mut_ptr().add(x * 3); - x2bgr10_to_rgb_16_pixels(base_in, base_out); - x2bgr10_to_rgb_16_pixels(base_in.add(64), base_out.add(48)); - x += 32; + if !BE { + while x + 32 <= width { + let base_in = x2bgr10.as_ptr().add(x * 4); + let base_out = rgb_out.as_mut_ptr().add(x * 3); + x2bgr10_to_rgb_16_pixels(base_in, base_out); + x2bgr10_to_rgb_16_pixels(base_in.add(64), base_out.add(48)); + x += 32; + } } if x < width { - scalar::x2bgr10_to_rgb_row( + scalar::x2bgr10_to_rgb_row::( &x2bgr10[x * 4..width * 4], &mut rgb_out[x * 3..width * 3], width - x, @@ -549,21 +573,27 @@ pub(crate) unsafe fn x2bgr10_to_rgb_row(x2bgr10: &[u8], rgb_out: &mut [u8], widt /// AVX2 X2BGR10→RGBA. 32 pixels per iteration. 
#[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn x2bgr10_to_rgba_row(x2bgr10: &[u8], rgba_out: &mut [u8], width: usize) { +pub(crate) unsafe fn x2bgr10_to_rgba_row( + x2bgr10: &[u8], + rgba_out: &mut [u8], + width: usize, +) { debug_assert!(x2bgr10.len() >= width * 4, "x2bgr10 row too short"); debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short"); unsafe { let mut x = 0usize; - while x + 32 <= width { - let base_in = x2bgr10.as_ptr().add(x * 4); - let base_out = rgba_out.as_mut_ptr().add(x * 4); - x2bgr10_to_rgba_16_pixels(base_in, base_out); - x2bgr10_to_rgba_16_pixels(base_in.add(64), base_out.add(64)); - x += 32; + if !BE { + while x + 32 <= width { + let base_in = x2bgr10.as_ptr().add(x * 4); + let base_out = rgba_out.as_mut_ptr().add(x * 4); + x2bgr10_to_rgba_16_pixels(base_in, base_out); + x2bgr10_to_rgba_16_pixels(base_in.add(64), base_out.add(64)); + x += 32; + } } if x < width { - scalar::x2bgr10_to_rgba_row( + scalar::x2bgr10_to_rgba_row::( &x2bgr10[x * 4..width * 4], &mut rgba_out[x * 4..width * 4], width - x, @@ -575,21 +605,27 @@ pub(crate) unsafe fn x2bgr10_to_rgba_row(x2bgr10: &[u8], rgba_out: &mut [u8], wi /// AVX2 X2BGR10→u16 RGB native. 16 pixels per iteration. 
#[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn x2bgr10_to_rgb_u16_row(x2bgr10: &[u8], rgb_out: &mut [u16], width: usize) { +pub(crate) unsafe fn x2bgr10_to_rgb_u16_row( + x2bgr10: &[u8], + rgb_out: &mut [u16], + width: usize, +) { debug_assert!(x2bgr10.len() >= width * 4, "x2bgr10 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); unsafe { let mut x = 0usize; - while x + 16 <= width { - let base_in = x2bgr10.as_ptr().add(x * 4); - let base_out = rgb_out.as_mut_ptr().add(x * 3).cast::(); - x2bgr10_to_rgb_u16_8_pixels(base_in, base_out); - x2bgr10_to_rgb_u16_8_pixels(base_in.add(32), base_out.add(48)); - x += 16; + if !BE { + while x + 16 <= width { + let base_in = x2bgr10.as_ptr().add(x * 4); + let base_out = rgb_out.as_mut_ptr().add(x * 3).cast::(); + x2bgr10_to_rgb_u16_8_pixels(base_in, base_out); + x2bgr10_to_rgb_u16_8_pixels(base_in.add(32), base_out.add(48)); + x += 16; + } } if x < width { - scalar::x2bgr10_to_rgb_u16_row( + scalar::x2bgr10_to_rgb_u16_row::( &x2bgr10[x * 4..width * 4], &mut rgb_out[x * 3..width * 3], width - x, diff --git a/src/row/arch/x86_avx2/packed_rgb_16bit.rs b/src/row/arch/x86_avx2/packed_rgb_16bit.rs index 086a689f..db9343e2 100644 --- a/src/row/arch/x86_avx2/packed_rgb_16bit.rs +++ b/src/row/arch/x86_avx2/packed_rgb_16bit.rs @@ -297,6 +297,41 @@ unsafe fn narrow_u16x16_to_u8x16(v: __m256i, zero: __m256i) -> __m128i { } } +// ---- endian byte-swap helpers ----------------------------------------------- + +/// Byte-swap every u16 lane in a `__m128i` when `BE = true`; no-op otherwise. +/// +/// Uses `_mm_shuffle_epi8` (SSSE3 subset of AVX2). +#[inline(always)] +unsafe fn byteswap128_if_be(v: __m128i) -> __m128i { + if BE { + const MASK: __m128i = + unsafe { core::mem::transmute([1u8, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14]) }; + unsafe { _mm_shuffle_epi8(v, MASK) } + } else { + v + } +} + +/// Byte-swap every u16 lane in a `__m256i` when `BE = true`; no-op otherwise. 
+/// +/// Uses `_mm256_shuffle_epi8` (AVX2). +#[inline(always)] +unsafe fn byteswap256_if_be(v: __m256i) -> __m256i { + if BE { + // Same u16-lane byte-swap mask, broadcast to both 128-bit lanes. + const MASK: __m256i = unsafe { + core::mem::transmute([ + 1u8, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, + 10, 13, 12, 15, 14, + ]) + }; + unsafe { _mm256_shuffle_epi8(v, MASK) } + } else { + v + } +} + // ============================================================================= // Rgb48 (R, G, B — 3 u16 elements per pixel) // ============================================================================= @@ -307,6 +342,7 @@ unsafe fn narrow_u16x16_to_u8x16(v: __m256i, zero: __m256i) -> __m128i { /// target_feature, exploiting that SSE4.1/SSSE3 are AVX2 subsets. Each half /// deinterleaves with shuffle masks, narrows via `>> 8`, writes 8 pixels /// (24 bytes). 16 pixels are produced per outer loop iteration. +/// When `BE = true` each loaded register is byte-swapped before deinterleaving. /// /// # Safety /// @@ -315,7 +351,11 @@ unsafe fn narrow_u16x16_to_u8x16(v: __m256i, zero: __m256i) -> __m128i { /// 3. `rgb_out.len() >= width * 3`. 
#[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn avx2_rgb48_to_rgb_row(rgb48: &[u16], rgb_out: &mut [u8], width: usize) { +pub(crate) unsafe fn avx2_rgb48_to_rgb_row( + rgb48: &[u16], + rgb_out: &mut [u8], + width: usize, +) { debug_assert!(rgb48.len() >= width * 3, "rgb48 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); @@ -327,9 +367,9 @@ pub(crate) unsafe fn avx2_rgb48_to_rgb_row(rgb48: &[u16], rgb_out: &mut [u8], wi let ptr = rgb48.as_ptr().add(x * 3); // First half: pixels x..x+7 - let v0 = _mm_loadu_si128(ptr.cast()); - let v1 = _mm_loadu_si128(ptr.add(8).cast()); - let v2 = _mm_loadu_si128(ptr.add(16).cast()); + let v0 = byteswap128_if_be::(_mm_loadu_si128(ptr.cast())); + let v1 = byteswap128_if_be::(_mm_loadu_si128(ptr.add(8).cast())); + let v2 = byteswap128_if_be::(_mm_loadu_si128(ptr.add(16).cast())); let (r0, g0, b0) = deinterleave_rgb48_8px(v0, v1, v2); let r0u8 = narrow_u16x8_to_u8x8(r0, zero); let g0u8 = narrow_u16x8_to_u8x8(g0, zero); @@ -340,9 +380,9 @@ pub(crate) unsafe fn avx2_rgb48_to_rgb_row(rgb48: &[u16], rgb_out: &mut [u8], wi // Second half: pixels x+8..x+15 let ptr8 = ptr.add(24); // 24 u16 ahead = 8 pixels × 3 channels - let v3 = _mm_loadu_si128(ptr8.cast()); - let v4 = _mm_loadu_si128(ptr8.add(8).cast()); - let v5 = _mm_loadu_si128(ptr8.add(16).cast()); + let v3 = byteswap128_if_be::(_mm_loadu_si128(ptr8.cast())); + let v4 = byteswap128_if_be::(_mm_loadu_si128(ptr8.add(8).cast())); + let v5 = byteswap128_if_be::(_mm_loadu_si128(ptr8.add(16).cast())); let (r1, g1, b1) = deinterleave_rgb48_8px(v3, v4, v5); let r1u8 = narrow_u16x8_to_u8x8(r1, zero); let g1u8 = narrow_u16x8_to_u8x8(g1, zero); @@ -355,13 +395,15 @@ pub(crate) unsafe fn avx2_rgb48_to_rgb_row(rgb48: &[u16], rgb_out: &mut [u8], wi } // Handle remaining pixels (< 16) via scalar fallback. 
if x < width { - scalar::rgb48_to_rgb_row(&rgb48[x * 3..], &mut rgb_out[x * 3..], width - x); + scalar::rgb48_to_rgb_row::(&rgb48[x * 3..], &mut rgb_out[x * 3..], width - x); } } } /// AVX2 Rgb48 → packed u8 RGBA. 16 pixels per outer iteration. Alpha forced to 0xFF. /// +/// When `BE = true` each loaded register is byte-swapped before deinterleaving. +/// /// # Safety /// /// 1. AVX2 must be available. @@ -369,7 +411,11 @@ pub(crate) unsafe fn avx2_rgb48_to_rgb_row(rgb48: &[u16], rgb_out: &mut [u8], wi /// 3. `rgba_out.len() >= width * 4`. #[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn avx2_rgb48_to_rgba_row(rgb48: &[u16], rgba_out: &mut [u8], width: usize) { +pub(crate) unsafe fn avx2_rgb48_to_rgba_row( + rgb48: &[u16], + rgba_out: &mut [u8], + width: usize, +) { debug_assert!(rgb48.len() >= width * 3, "rgb48 row too short"); debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short"); @@ -381,9 +427,9 @@ pub(crate) unsafe fn avx2_rgb48_to_rgba_row(rgb48: &[u16], rgba_out: &mut [u8], while x + 16 <= width { let ptr = rgb48.as_ptr().add(x * 3); - let v0 = _mm_loadu_si128(ptr.cast()); - let v1 = _mm_loadu_si128(ptr.add(8).cast()); - let v2 = _mm_loadu_si128(ptr.add(16).cast()); + let v0 = byteswap128_if_be::(_mm_loadu_si128(ptr.cast())); + let v1 = byteswap128_if_be::(_mm_loadu_si128(ptr.add(8).cast())); + let v2 = byteswap128_if_be::(_mm_loadu_si128(ptr.add(16).cast())); let (r0, g0, b0) = deinterleave_rgb48_8px(v0, v1, v2); let r0u8 = narrow_u16x8_to_u8x8(r0, zero); let g0u8 = narrow_u16x8_to_u8x8(g0, zero); @@ -393,9 +439,9 @@ pub(crate) unsafe fn avx2_rgb48_to_rgba_row(rgb48: &[u16], rgba_out: &mut [u8], core::ptr::copy_nonoverlapping(tmp0.as_ptr(), rgba_out.as_mut_ptr().add(x * 4), 32); let ptr8 = ptr.add(24); - let v3 = _mm_loadu_si128(ptr8.cast()); - let v4 = _mm_loadu_si128(ptr8.add(8).cast()); - let v5 = _mm_loadu_si128(ptr8.add(16).cast()); + let v3 = byteswap128_if_be::(_mm_loadu_si128(ptr8.cast())); + let v4 = 
byteswap128_if_be::(_mm_loadu_si128(ptr8.add(8).cast())); + let v5 = byteswap128_if_be::(_mm_loadu_si128(ptr8.add(16).cast())); let (r1, g1, b1) = deinterleave_rgb48_8px(v3, v4, v5); let r1u8 = narrow_u16x8_to_u8x8(r1, zero); let g1u8 = narrow_u16x8_to_u8x8(g1, zero); @@ -407,13 +453,15 @@ pub(crate) unsafe fn avx2_rgb48_to_rgba_row(rgb48: &[u16], rgba_out: &mut [u8], x += 16; } if x < width { - scalar::rgb48_to_rgba_row(&rgb48[x * 3..], &mut rgba_out[x * 4..], width - x); + scalar::rgb48_to_rgba_row::(&rgb48[x * 3..], &mut rgba_out[x * 4..], width - x); } } } /// AVX2 Rgb48 → native-depth u16 RGB (identity repack). 16 pixels per iteration. /// +/// When `BE = true` each loaded register is byte-swapped before deinterleaving. +/// /// # Safety /// /// 1. AVX2 must be available. @@ -421,7 +469,11 @@ pub(crate) unsafe fn avx2_rgb48_to_rgba_row(rgb48: &[u16], rgba_out: &mut [u8], /// 3. `rgb_out.len() >= width * 3`. #[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn avx2_rgb48_to_rgb_u16_row(rgb48: &[u16], rgb_out: &mut [u16], width: usize) { +pub(crate) unsafe fn avx2_rgb48_to_rgb_u16_row( + rgb48: &[u16], + rgb_out: &mut [u16], + width: usize, +) { debug_assert!(rgb48.len() >= width * 3, "rgb48 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); @@ -430,29 +482,31 @@ pub(crate) unsafe fn avx2_rgb48_to_rgb_u16_row(rgb48: &[u16], rgb_out: &mut [u16 while x + 16 <= width { let ptr = rgb48.as_ptr().add(x * 3); - let v0 = _mm_loadu_si128(ptr.cast()); - let v1 = _mm_loadu_si128(ptr.add(8).cast()); - let v2 = _mm_loadu_si128(ptr.add(16).cast()); + let v0 = byteswap128_if_be::(_mm_loadu_si128(ptr.cast())); + let v1 = byteswap128_if_be::(_mm_loadu_si128(ptr.add(8).cast())); + let v2 = byteswap128_if_be::(_mm_loadu_si128(ptr.add(16).cast())); let (r0, g0, b0) = deinterleave_rgb48_8px(v0, v1, v2); write_rgb_u16_8(r0, g0, b0, rgb_out.as_mut_ptr().add(x * 3)); let ptr8 = ptr.add(24); - let v3 = _mm_loadu_si128(ptr8.cast()); - let 
v4 = _mm_loadu_si128(ptr8.add(8).cast()); - let v5 = _mm_loadu_si128(ptr8.add(16).cast()); + let v3 = byteswap128_if_be::(_mm_loadu_si128(ptr8.cast())); + let v4 = byteswap128_if_be::(_mm_loadu_si128(ptr8.add(8).cast())); + let v5 = byteswap128_if_be::(_mm_loadu_si128(ptr8.add(16).cast())); let (r1, g1, b1) = deinterleave_rgb48_8px(v3, v4, v5); write_rgb_u16_8(r1, g1, b1, rgb_out.as_mut_ptr().add((x + 8) * 3)); x += 16; } if x < width { - scalar::rgb48_to_rgb_u16_row(&rgb48[x * 3..], &mut rgb_out[x * 3..], width - x); + scalar::rgb48_to_rgb_u16_row::(&rgb48[x * 3..], &mut rgb_out[x * 3..], width - x); } } } /// AVX2 Rgb48 → native-depth u16 RGBA. 16 pixels per iteration. Alpha forced to 0xFFFF. /// +/// When `BE = true` each loaded register is byte-swapped before deinterleaving. +/// /// # Safety /// /// 1. AVX2 must be available. @@ -460,7 +514,11 @@ pub(crate) unsafe fn avx2_rgb48_to_rgb_u16_row(rgb48: &[u16], rgb_out: &mut [u16 /// 3. `rgba_out.len() >= width * 4`. #[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn avx2_rgb48_to_rgba_u16_row(rgb48: &[u16], rgba_out: &mut [u16], width: usize) { +pub(crate) unsafe fn avx2_rgb48_to_rgba_u16_row( + rgb48: &[u16], + rgba_out: &mut [u16], + width: usize, +) { debug_assert!(rgb48.len() >= width * 3, "rgb48 row too short"); debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short"); @@ -470,23 +528,23 @@ pub(crate) unsafe fn avx2_rgb48_to_rgba_u16_row(rgb48: &[u16], rgba_out: &mut [u while x + 16 <= width { let ptr = rgb48.as_ptr().add(x * 3); - let v0 = _mm_loadu_si128(ptr.cast()); - let v1 = _mm_loadu_si128(ptr.add(8).cast()); - let v2 = _mm_loadu_si128(ptr.add(16).cast()); + let v0 = byteswap128_if_be::(_mm_loadu_si128(ptr.cast())); + let v1 = byteswap128_if_be::(_mm_loadu_si128(ptr.add(8).cast())); + let v2 = byteswap128_if_be::(_mm_loadu_si128(ptr.add(16).cast())); let (r0, g0, b0) = deinterleave_rgb48_8px(v0, v1, v2); write_rgba_u16_8(r0, g0, b0, opaque, rgba_out.as_mut_ptr().add(x * 4)); 
let ptr8 = ptr.add(24); - let v3 = _mm_loadu_si128(ptr8.cast()); - let v4 = _mm_loadu_si128(ptr8.add(8).cast()); - let v5 = _mm_loadu_si128(ptr8.add(16).cast()); + let v3 = byteswap128_if_be::(_mm_loadu_si128(ptr8.cast())); + let v4 = byteswap128_if_be::(_mm_loadu_si128(ptr8.add(8).cast())); + let v5 = byteswap128_if_be::(_mm_loadu_si128(ptr8.add(16).cast())); let (r1, g1, b1) = deinterleave_rgb48_8px(v3, v4, v5); write_rgba_u16_8(r1, g1, b1, opaque, rgba_out.as_mut_ptr().add((x + 8) * 4)); x += 16; } if x < width { - scalar::rgb48_to_rgba_u16_row(&rgb48[x * 3..], &mut rgba_out[x * 4..], width - x); + scalar::rgb48_to_rgba_u16_row::(&rgb48[x * 3..], &mut rgba_out[x * 4..], width - x); } } } @@ -499,6 +557,7 @@ pub(crate) unsafe fn avx2_rgb48_to_rgba_u16_row(rgb48: &[u16], rgba_out: &mut [u /// /// `deinterleave_rgb48_8px` yields `(B, G, R)` in source memory order; /// the B↔R swap is applied by passing them as `(R=ch2, G=ch1, B=ch0)`. +/// When `BE = true` each loaded register is byte-swapped before deinterleaving. /// /// # Safety /// @@ -507,7 +566,11 @@ pub(crate) unsafe fn avx2_rgb48_to_rgba_u16_row(rgb48: &[u16], rgba_out: &mut [u /// 3. `rgb_out.len() >= width * 3`. 
#[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn avx2_bgr48_to_rgb_row(bgr48: &[u16], rgb_out: &mut [u8], width: usize) { +pub(crate) unsafe fn avx2_bgr48_to_rgb_row( + bgr48: &[u16], + rgb_out: &mut [u8], + width: usize, +) { debug_assert!(bgr48.len() >= width * 3, "bgr48 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); @@ -517,9 +580,9 @@ pub(crate) unsafe fn avx2_bgr48_to_rgb_row(bgr48: &[u16], rgb_out: &mut [u8], wi while x + 16 <= width { let ptr = bgr48.as_ptr().add(x * 3); - let v0 = _mm_loadu_si128(ptr.cast()); - let v1 = _mm_loadu_si128(ptr.add(8).cast()); - let v2 = _mm_loadu_si128(ptr.add(16).cast()); + let v0 = byteswap128_if_be::(_mm_loadu_si128(ptr.cast())); + let v1 = byteswap128_if_be::(_mm_loadu_si128(ptr.add(8).cast())); + let v2 = byteswap128_if_be::(_mm_loadu_si128(ptr.add(16).cast())); let (b0, g0, r0) = deinterleave_rgb48_8px(v0, v1, v2); let r0u8 = narrow_u16x8_to_u8x8(r0, zero); let g0u8 = narrow_u16x8_to_u8x8(g0, zero); @@ -529,9 +592,9 @@ pub(crate) unsafe fn avx2_bgr48_to_rgb_row(bgr48: &[u16], rgb_out: &mut [u8], wi core::ptr::copy_nonoverlapping(tmp0.as_ptr(), rgb_out.as_mut_ptr().add(x * 3), 24); let ptr8 = ptr.add(24); - let v3 = _mm_loadu_si128(ptr8.cast()); - let v4 = _mm_loadu_si128(ptr8.add(8).cast()); - let v5 = _mm_loadu_si128(ptr8.add(16).cast()); + let v3 = byteswap128_if_be::(_mm_loadu_si128(ptr8.cast())); + let v4 = byteswap128_if_be::(_mm_loadu_si128(ptr8.add(8).cast())); + let v5 = byteswap128_if_be::(_mm_loadu_si128(ptr8.add(16).cast())); let (b1, g1, r1) = deinterleave_rgb48_8px(v3, v4, v5); let r1u8 = narrow_u16x8_to_u8x8(r1, zero); let g1u8 = narrow_u16x8_to_u8x8(g1, zero); @@ -543,13 +606,14 @@ pub(crate) unsafe fn avx2_bgr48_to_rgb_row(bgr48: &[u16], rgb_out: &mut [u8], wi x += 16; } if x < width { - scalar::bgr48_to_rgb_row(&bgr48[x * 3..], &mut rgb_out[x * 3..], width - x); + scalar::bgr48_to_rgb_row::(&bgr48[x * 3..], &mut rgb_out[x * 3..], width - x); } } } 
/// AVX2 Bgr48 → packed u8 RGBA. 16 pixels per outer iteration. /// B↔R swap; alpha forced to 0xFF. +/// When `BE = true` each loaded register is byte-swapped before deinterleaving. /// /// # Safety /// @@ -558,7 +622,11 @@ pub(crate) unsafe fn avx2_bgr48_to_rgb_row(bgr48: &[u16], rgb_out: &mut [u8], wi /// 3. `rgba_out.len() >= width * 4`. #[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn avx2_bgr48_to_rgba_row(bgr48: &[u16], rgba_out: &mut [u8], width: usize) { +pub(crate) unsafe fn avx2_bgr48_to_rgba_row( + bgr48: &[u16], + rgba_out: &mut [u8], + width: usize, +) { debug_assert!(bgr48.len() >= width * 3, "bgr48 row too short"); debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short"); @@ -570,9 +638,9 @@ pub(crate) unsafe fn avx2_bgr48_to_rgba_row(bgr48: &[u16], rgba_out: &mut [u8], while x + 16 <= width { let ptr = bgr48.as_ptr().add(x * 3); - let v0 = _mm_loadu_si128(ptr.cast()); - let v1 = _mm_loadu_si128(ptr.add(8).cast()); - let v2 = _mm_loadu_si128(ptr.add(16).cast()); + let v0 = byteswap128_if_be::(_mm_loadu_si128(ptr.cast())); + let v1 = byteswap128_if_be::(_mm_loadu_si128(ptr.add(8).cast())); + let v2 = byteswap128_if_be::(_mm_loadu_si128(ptr.add(16).cast())); let (b0, g0, r0) = deinterleave_rgb48_8px(v0, v1, v2); let r0u8 = narrow_u16x8_to_u8x8(r0, zero); let g0u8 = narrow_u16x8_to_u8x8(g0, zero); @@ -582,9 +650,9 @@ pub(crate) unsafe fn avx2_bgr48_to_rgba_row(bgr48: &[u16], rgba_out: &mut [u8], core::ptr::copy_nonoverlapping(tmp0.as_ptr(), rgba_out.as_mut_ptr().add(x * 4), 32); let ptr8 = ptr.add(24); - let v3 = _mm_loadu_si128(ptr8.cast()); - let v4 = _mm_loadu_si128(ptr8.add(8).cast()); - let v5 = _mm_loadu_si128(ptr8.add(16).cast()); + let v3 = byteswap128_if_be::(_mm_loadu_si128(ptr8.cast())); + let v4 = byteswap128_if_be::(_mm_loadu_si128(ptr8.add(8).cast())); + let v5 = byteswap128_if_be::(_mm_loadu_si128(ptr8.add(16).cast())); let (b1, g1, r1) = deinterleave_rgb48_8px(v3, v4, v5); let r1u8 = narrow_u16x8_to_u8x8(r1, 
zero); let g1u8 = narrow_u16x8_to_u8x8(g1, zero); @@ -596,13 +664,14 @@ pub(crate) unsafe fn avx2_bgr48_to_rgba_row(bgr48: &[u16], rgba_out: &mut [u8], x += 16; } if x < width { - scalar::bgr48_to_rgba_row(&bgr48[x * 3..], &mut rgba_out[x * 4..], width - x); + scalar::bgr48_to_rgba_row::(&bgr48[x * 3..], &mut rgba_out[x * 4..], width - x); } } } /// AVX2 Bgr48 → native-depth u16 RGB. 16 pixels per outer iteration. /// B↔R swap; values unchanged. +/// When `BE = true` each loaded register is byte-swapped before deinterleaving. /// /// # Safety /// @@ -611,7 +680,11 @@ pub(crate) unsafe fn avx2_bgr48_to_rgba_row(bgr48: &[u16], rgba_out: &mut [u8], /// 3. `rgb_out.len() >= width * 3`. #[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn avx2_bgr48_to_rgb_u16_row(bgr48: &[u16], rgb_out: &mut [u16], width: usize) { +pub(crate) unsafe fn avx2_bgr48_to_rgb_u16_row( + bgr48: &[u16], + rgb_out: &mut [u16], + width: usize, +) { debug_assert!(bgr48.len() >= width * 3, "bgr48 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); @@ -620,29 +693,30 @@ pub(crate) unsafe fn avx2_bgr48_to_rgb_u16_row(bgr48: &[u16], rgb_out: &mut [u16 while x + 16 <= width { let ptr = bgr48.as_ptr().add(x * 3); - let v0 = _mm_loadu_si128(ptr.cast()); - let v1 = _mm_loadu_si128(ptr.add(8).cast()); - let v2 = _mm_loadu_si128(ptr.add(16).cast()); + let v0 = byteswap128_if_be::(_mm_loadu_si128(ptr.cast())); + let v1 = byteswap128_if_be::(_mm_loadu_si128(ptr.add(8).cast())); + let v2 = byteswap128_if_be::(_mm_loadu_si128(ptr.add(16).cast())); let (b0, g0, r0) = deinterleave_rgb48_8px(v0, v1, v2); write_rgb_u16_8(r0, g0, b0, rgb_out.as_mut_ptr().add(x * 3)); let ptr8 = ptr.add(24); - let v3 = _mm_loadu_si128(ptr8.cast()); - let v4 = _mm_loadu_si128(ptr8.add(8).cast()); - let v5 = _mm_loadu_si128(ptr8.add(16).cast()); + let v3 = byteswap128_if_be::(_mm_loadu_si128(ptr8.cast())); + let v4 = byteswap128_if_be::(_mm_loadu_si128(ptr8.add(8).cast())); + let v5 = 
byteswap128_if_be::(_mm_loadu_si128(ptr8.add(16).cast())); let (b1, g1, r1) = deinterleave_rgb48_8px(v3, v4, v5); write_rgb_u16_8(r1, g1, b1, rgb_out.as_mut_ptr().add((x + 8) * 3)); x += 16; } if x < width { - scalar::bgr48_to_rgb_u16_row(&bgr48[x * 3..], &mut rgb_out[x * 3..], width - x); + scalar::bgr48_to_rgb_u16_row::(&bgr48[x * 3..], &mut rgb_out[x * 3..], width - x); } } } /// AVX2 Bgr48 → native-depth u16 RGBA. 16 pixels per outer iteration. /// B↔R swap; alpha forced to 0xFFFF. +/// When `BE = true` each loaded register is byte-swapped before deinterleaving. /// /// # Safety /// @@ -651,7 +725,11 @@ pub(crate) unsafe fn avx2_bgr48_to_rgb_u16_row(bgr48: &[u16], rgb_out: &mut [u16 /// 3. `rgba_out.len() >= width * 4`. #[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn avx2_bgr48_to_rgba_u16_row(bgr48: &[u16], rgba_out: &mut [u16], width: usize) { +pub(crate) unsafe fn avx2_bgr48_to_rgba_u16_row( + bgr48: &[u16], + rgba_out: &mut [u16], + width: usize, +) { debug_assert!(bgr48.len() >= width * 3, "bgr48 row too short"); debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short"); @@ -661,23 +739,23 @@ pub(crate) unsafe fn avx2_bgr48_to_rgba_u16_row(bgr48: &[u16], rgba_out: &mut [u while x + 16 <= width { let ptr = bgr48.as_ptr().add(x * 3); - let v0 = _mm_loadu_si128(ptr.cast()); - let v1 = _mm_loadu_si128(ptr.add(8).cast()); - let v2 = _mm_loadu_si128(ptr.add(16).cast()); + let v0 = byteswap128_if_be::(_mm_loadu_si128(ptr.cast())); + let v1 = byteswap128_if_be::(_mm_loadu_si128(ptr.add(8).cast())); + let v2 = byteswap128_if_be::(_mm_loadu_si128(ptr.add(16).cast())); let (b0, g0, r0) = deinterleave_rgb48_8px(v0, v1, v2); write_rgba_u16_8(r0, g0, b0, opaque, rgba_out.as_mut_ptr().add(x * 4)); let ptr8 = ptr.add(24); - let v3 = _mm_loadu_si128(ptr8.cast()); - let v4 = _mm_loadu_si128(ptr8.add(8).cast()); - let v5 = _mm_loadu_si128(ptr8.add(16).cast()); + let v3 = byteswap128_if_be::(_mm_loadu_si128(ptr8.cast())); + let v4 = 
byteswap128_if_be::(_mm_loadu_si128(ptr8.add(8).cast())); + let v5 = byteswap128_if_be::(_mm_loadu_si128(ptr8.add(16).cast())); let (b1, g1, r1) = deinterleave_rgb48_8px(v3, v4, v5); write_rgba_u16_8(r1, g1, b1, opaque, rgba_out.as_mut_ptr().add((x + 8) * 4)); x += 16; } if x < width { - scalar::bgr48_to_rgba_u16_row(&bgr48[x * 3..], &mut rgba_out[x * 4..], width - x); + scalar::bgr48_to_rgba_u16_row::(&bgr48[x * 3..], &mut rgba_out[x * 4..], width - x); } } } @@ -691,6 +769,7 @@ pub(crate) unsafe fn avx2_bgr48_to_rgba_u16_row(bgr48: &[u16], rgba_out: &mut [u /// Loads 4 × `__m256i` (64 u16 = 16 pixels), deinterleaves via the /// cascade helper, narrows via `>> 8` + `packus_epi16` + lane fix, writes /// 16 pixels (48 bytes) via `write_rgb_16` on the low 128 bits. +/// When `BE = true` each loaded register is byte-swapped before deinterleaving. /// /// # Safety /// @@ -699,7 +778,11 @@ pub(crate) unsafe fn avx2_bgr48_to_rgba_u16_row(bgr48: &[u16], rgba_out: &mut [u /// 3. `rgb_out.len() >= width * 3`. 
#[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn avx2_rgba64_to_rgb_row(rgba64: &[u16], rgb_out: &mut [u8], width: usize) { +pub(crate) unsafe fn avx2_rgba64_to_rgb_row( + rgba64: &[u16], + rgb_out: &mut [u8], + width: usize, +) { debug_assert!(rgba64.len() >= width * 4, "rgba64 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); @@ -708,10 +791,10 @@ pub(crate) unsafe fn avx2_rgba64_to_rgb_row(rgba64: &[u16], rgb_out: &mut [u8], let mut x = 0usize; while x + 16 <= width { let ptr = rgba64.as_ptr().add(x * 4); - let raw0 = _mm256_loadu_si256(ptr.cast()); - let raw1 = _mm256_loadu_si256(ptr.add(16).cast()); - let raw2 = _mm256_loadu_si256(ptr.add(32).cast()); - let raw3 = _mm256_loadu_si256(ptr.add(48).cast()); + let raw0 = byteswap256_if_be::(_mm256_loadu_si256(ptr.cast())); + let raw1 = byteswap256_if_be::(_mm256_loadu_si256(ptr.add(16).cast())); + let raw2 = byteswap256_if_be::(_mm256_loadu_si256(ptr.add(32).cast())); + let raw3 = byteswap256_if_be::(_mm256_loadu_si256(ptr.add(48).cast())); let (r_u16, g_u16, b_u16, _a) = deinterleave_rgba64_16px(raw0, raw1, raw2, raw3); let r_u8 = narrow_u16x16_to_u8x16(r_u16, zero256); let g_u8 = narrow_u16x16_to_u8x16(g_u16, zero256); @@ -720,13 +803,14 @@ pub(crate) unsafe fn avx2_rgba64_to_rgb_row(rgba64: &[u16], rgb_out: &mut [u8], x += 16; } if x < width { - scalar::rgba64_to_rgb_row(&rgba64[x * 4..], &mut rgb_out[x * 3..], width - x); + scalar::rgba64_to_rgb_row::(&rgba64[x * 4..], &mut rgb_out[x * 3..], width - x); } } } /// AVX2 Rgba64 → packed u8 RGBA. 16 pixels per SIMD iteration. /// Source alpha passes through (narrowed via `>> 8`). +/// When `BE = true` each loaded register is byte-swapped before deinterleaving. /// /// # Safety /// @@ -735,7 +819,11 @@ pub(crate) unsafe fn avx2_rgba64_to_rgb_row(rgba64: &[u16], rgb_out: &mut [u8], /// 3. `rgba_out.len() >= width * 4`. 
#[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn avx2_rgba64_to_rgba_row(rgba64: &[u16], rgba_out: &mut [u8], width: usize) { +pub(crate) unsafe fn avx2_rgba64_to_rgba_row( + rgba64: &[u16], + rgba_out: &mut [u8], + width: usize, +) { debug_assert!(rgba64.len() >= width * 4, "rgba64 row too short"); debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short"); @@ -744,10 +832,10 @@ pub(crate) unsafe fn avx2_rgba64_to_rgba_row(rgba64: &[u16], rgba_out: &mut [u8] let mut x = 0usize; while x + 16 <= width { let ptr = rgba64.as_ptr().add(x * 4); - let raw0 = _mm256_loadu_si256(ptr.cast()); - let raw1 = _mm256_loadu_si256(ptr.add(16).cast()); - let raw2 = _mm256_loadu_si256(ptr.add(32).cast()); - let raw3 = _mm256_loadu_si256(ptr.add(48).cast()); + let raw0 = byteswap256_if_be::(_mm256_loadu_si256(ptr.cast())); + let raw1 = byteswap256_if_be::(_mm256_loadu_si256(ptr.add(16).cast())); + let raw2 = byteswap256_if_be::(_mm256_loadu_si256(ptr.add(32).cast())); + let raw3 = byteswap256_if_be::(_mm256_loadu_si256(ptr.add(48).cast())); let (r_u16, g_u16, b_u16, a_u16) = deinterleave_rgba64_16px(raw0, raw1, raw2, raw3); let r_u8 = narrow_u16x16_to_u8x16(r_u16, zero256); let g_u8 = narrow_u16x16_to_u8x16(g_u16, zero256); @@ -757,13 +845,15 @@ pub(crate) unsafe fn avx2_rgba64_to_rgba_row(rgba64: &[u16], rgba_out: &mut [u8] x += 16; } if x < width { - scalar::rgba64_to_rgba_row(&rgba64[x * 4..], &mut rgba_out[x * 4..], width - x); + scalar::rgba64_to_rgba_row::(&rgba64[x * 4..], &mut rgba_out[x * 4..], width - x); } } } /// AVX2 Rgba64 → native-depth u16 RGB. 16 pixels per SIMD iteration. Alpha discarded. /// +/// When `BE = true` each loaded register is byte-swapped before deinterleaving. +/// /// # Safety /// /// 1. AVX2 must be available. @@ -771,7 +861,11 @@ pub(crate) unsafe fn avx2_rgba64_to_rgba_row(rgba64: &[u16], rgba_out: &mut [u8] /// 3. `rgb_out.len() >= width * 3`. 
#[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn avx2_rgba64_to_rgb_u16_row(rgba64: &[u16], rgb_out: &mut [u16], width: usize) { +pub(crate) unsafe fn avx2_rgba64_to_rgb_u16_row( + rgba64: &[u16], + rgb_out: &mut [u16], + width: usize, +) { debug_assert!(rgba64.len() >= width * 4, "rgba64 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); @@ -779,10 +873,10 @@ pub(crate) unsafe fn avx2_rgba64_to_rgb_u16_row(rgba64: &[u16], rgb_out: &mut [u let mut x = 0usize; while x + 16 <= width { let ptr = rgba64.as_ptr().add(x * 4); - let raw0 = _mm256_loadu_si256(ptr.cast()); - let raw1 = _mm256_loadu_si256(ptr.add(16).cast()); - let raw2 = _mm256_loadu_si256(ptr.add(32).cast()); - let raw3 = _mm256_loadu_si256(ptr.add(48).cast()); + let raw0 = byteswap256_if_be::(_mm256_loadu_si256(ptr.cast())); + let raw1 = byteswap256_if_be::(_mm256_loadu_si256(ptr.add(16).cast())); + let raw2 = byteswap256_if_be::(_mm256_loadu_si256(ptr.add(32).cast())); + let raw3 = byteswap256_if_be::(_mm256_loadu_si256(ptr.add(48).cast())); let (r_u16, g_u16, b_u16, _a) = deinterleave_rgba64_16px(raw0, raw1, raw2, raw3); // Write in two 8-pixel halves using the existing 128-bit helper. write_rgb_u16_8( @@ -800,13 +894,14 @@ pub(crate) unsafe fn avx2_rgba64_to_rgb_u16_row(rgba64: &[u16], rgb_out: &mut [u x += 16; } if x < width { - scalar::rgba64_to_rgb_u16_row(&rgba64[x * 4..], &mut rgb_out[x * 3..], width - x); + scalar::rgba64_to_rgb_u16_row::(&rgba64[x * 4..], &mut rgb_out[x * 3..], width - x); } } } /// AVX2 Rgba64 → native-depth u16 RGBA (identity copy). 16 pixels per iteration. /// Source alpha preserved. +/// When `BE = true` each loaded register is byte-swapped before deinterleaving. /// /// # Safety /// @@ -815,7 +910,7 @@ pub(crate) unsafe fn avx2_rgba64_to_rgb_u16_row(rgba64: &[u16], rgb_out: &mut [u /// 3. `rgba_out.len() >= width * 4`. 
#[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn avx2_rgba64_to_rgba_u16_row( +pub(crate) unsafe fn avx2_rgba64_to_rgba_u16_row( rgba64: &[u16], rgba_out: &mut [u16], width: usize, @@ -827,10 +922,10 @@ pub(crate) unsafe fn avx2_rgba64_to_rgba_u16_row( let mut x = 0usize; while x + 16 <= width { let ptr = rgba64.as_ptr().add(x * 4); - let raw0 = _mm256_loadu_si256(ptr.cast()); - let raw1 = _mm256_loadu_si256(ptr.add(16).cast()); - let raw2 = _mm256_loadu_si256(ptr.add(32).cast()); - let raw3 = _mm256_loadu_si256(ptr.add(48).cast()); + let raw0 = byteswap256_if_be::(_mm256_loadu_si256(ptr.cast())); + let raw1 = byteswap256_if_be::(_mm256_loadu_si256(ptr.add(16).cast())); + let raw2 = byteswap256_if_be::(_mm256_loadu_si256(ptr.add(32).cast())); + let raw3 = byteswap256_if_be::(_mm256_loadu_si256(ptr.add(48).cast())); let (r_u16, g_u16, b_u16, a_u16) = deinterleave_rgba64_16px(raw0, raw1, raw2, raw3); write_rgba_u16_8( _mm256_castsi256_si128(r_u16), @@ -849,7 +944,7 @@ pub(crate) unsafe fn avx2_rgba64_to_rgba_u16_row( x += 16; } if x < width { - scalar::rgba64_to_rgba_u16_row(&rgba64[x * 4..], &mut rgba_out[x * 4..], width - x); + scalar::rgba64_to_rgba_u16_row::(&rgba64[x * 4..], &mut rgba_out[x * 4..], width - x); } } } @@ -862,6 +957,7 @@ pub(crate) unsafe fn avx2_rgba64_to_rgba_u16_row( /// B↔R swap; alpha discarded. /// /// `deinterleave_rgba64_16px` yields `(B, G, R, A)` in source memory order. +/// When `BE = true` each loaded register is byte-swapped before deinterleaving. /// /// # Safety /// @@ -870,7 +966,11 @@ pub(crate) unsafe fn avx2_rgba64_to_rgba_u16_row( /// 3. `rgb_out.len() >= width * 3`. 
#[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn avx2_bgra64_to_rgb_row(bgra64: &[u16], rgb_out: &mut [u8], width: usize) { +pub(crate) unsafe fn avx2_bgra64_to_rgb_row( + bgra64: &[u16], + rgb_out: &mut [u8], + width: usize, +) { debug_assert!(bgra64.len() >= width * 4, "bgra64 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); @@ -879,10 +979,10 @@ pub(crate) unsafe fn avx2_bgra64_to_rgb_row(bgra64: &[u16], rgb_out: &mut [u8], let mut x = 0usize; while x + 16 <= width { let ptr = bgra64.as_ptr().add(x * 4); - let raw0 = _mm256_loadu_si256(ptr.cast()); - let raw1 = _mm256_loadu_si256(ptr.add(16).cast()); - let raw2 = _mm256_loadu_si256(ptr.add(32).cast()); - let raw3 = _mm256_loadu_si256(ptr.add(48).cast()); + let raw0 = byteswap256_if_be::(_mm256_loadu_si256(ptr.cast())); + let raw1 = byteswap256_if_be::(_mm256_loadu_si256(ptr.add(16).cast())); + let raw2 = byteswap256_if_be::(_mm256_loadu_si256(ptr.add(32).cast())); + let raw3 = byteswap256_if_be::(_mm256_loadu_si256(ptr.add(48).cast())); // ch0=B, ch1=G, ch2=R, ch3=A (source BGRA order) let (b_u16, g_u16, r_u16, _a) = deinterleave_rgba64_16px(raw0, raw1, raw2, raw3); let r_u8 = narrow_u16x16_to_u8x16(r_u16, zero256); @@ -892,13 +992,14 @@ pub(crate) unsafe fn avx2_bgra64_to_rgb_row(bgra64: &[u16], rgb_out: &mut [u8], x += 16; } if x < width { - scalar::bgra64_to_rgb_row(&bgra64[x * 4..], &mut rgb_out[x * 3..], width - x); + scalar::bgra64_to_rgb_row::(&bgra64[x * 4..], &mut rgb_out[x * 3..], width - x); } } } /// AVX2 Bgra64 → packed u8 RGBA. 16 pixels per SIMD iteration. /// B↔R swap; source alpha passes through (narrowed via `>> 8`). +/// When `BE = true` each loaded register is byte-swapped before deinterleaving. /// /// # Safety /// @@ -907,7 +1008,11 @@ pub(crate) unsafe fn avx2_bgra64_to_rgb_row(bgra64: &[u16], rgb_out: &mut [u8], /// 3. `rgba_out.len() >= width * 4`. 
#[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn avx2_bgra64_to_rgba_row(bgra64: &[u16], rgba_out: &mut [u8], width: usize) { +pub(crate) unsafe fn avx2_bgra64_to_rgba_row( + bgra64: &[u16], + rgba_out: &mut [u8], + width: usize, +) { debug_assert!(bgra64.len() >= width * 4, "bgra64 row too short"); debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short"); @@ -916,10 +1021,10 @@ pub(crate) unsafe fn avx2_bgra64_to_rgba_row(bgra64: &[u16], rgba_out: &mut [u8] let mut x = 0usize; while x + 16 <= width { let ptr = bgra64.as_ptr().add(x * 4); - let raw0 = _mm256_loadu_si256(ptr.cast()); - let raw1 = _mm256_loadu_si256(ptr.add(16).cast()); - let raw2 = _mm256_loadu_si256(ptr.add(32).cast()); - let raw3 = _mm256_loadu_si256(ptr.add(48).cast()); + let raw0 = byteswap256_if_be::(_mm256_loadu_si256(ptr.cast())); + let raw1 = byteswap256_if_be::(_mm256_loadu_si256(ptr.add(16).cast())); + let raw2 = byteswap256_if_be::(_mm256_loadu_si256(ptr.add(32).cast())); + let raw3 = byteswap256_if_be::(_mm256_loadu_si256(ptr.add(48).cast())); let (b_u16, g_u16, r_u16, a_u16) = deinterleave_rgba64_16px(raw0, raw1, raw2, raw3); let r_u8 = narrow_u16x16_to_u8x16(r_u16, zero256); let g_u8 = narrow_u16x16_to_u8x16(g_u16, zero256); @@ -929,13 +1034,14 @@ pub(crate) unsafe fn avx2_bgra64_to_rgba_row(bgra64: &[u16], rgba_out: &mut [u8] x += 16; } if x < width { - scalar::bgra64_to_rgba_row(&bgra64[x * 4..], &mut rgba_out[x * 4..], width - x); + scalar::bgra64_to_rgba_row::(&bgra64[x * 4..], &mut rgba_out[x * 4..], width - x); } } } /// AVX2 Bgra64 → native-depth u16 RGB. 16 pixels per SIMD iteration. /// B↔R swap; alpha discarded. +/// When `BE = true` each loaded register is byte-swapped before deinterleaving. /// /// # Safety /// @@ -944,7 +1050,11 @@ pub(crate) unsafe fn avx2_bgra64_to_rgba_row(bgra64: &[u16], rgba_out: &mut [u8] /// 3. `rgb_out.len() >= width * 3`. 
#[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn avx2_bgra64_to_rgb_u16_row(bgra64: &[u16], rgb_out: &mut [u16], width: usize) { +pub(crate) unsafe fn avx2_bgra64_to_rgb_u16_row( + bgra64: &[u16], + rgb_out: &mut [u16], + width: usize, +) { debug_assert!(bgra64.len() >= width * 4, "bgra64 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); @@ -952,10 +1062,10 @@ pub(crate) unsafe fn avx2_bgra64_to_rgb_u16_row(bgra64: &[u16], rgb_out: &mut [u let mut x = 0usize; while x + 16 <= width { let ptr = bgra64.as_ptr().add(x * 4); - let raw0 = _mm256_loadu_si256(ptr.cast()); - let raw1 = _mm256_loadu_si256(ptr.add(16).cast()); - let raw2 = _mm256_loadu_si256(ptr.add(32).cast()); - let raw3 = _mm256_loadu_si256(ptr.add(48).cast()); + let raw0 = byteswap256_if_be::(_mm256_loadu_si256(ptr.cast())); + let raw1 = byteswap256_if_be::(_mm256_loadu_si256(ptr.add(16).cast())); + let raw2 = byteswap256_if_be::(_mm256_loadu_si256(ptr.add(32).cast())); + let raw3 = byteswap256_if_be::(_mm256_loadu_si256(ptr.add(48).cast())); let (b_u16, g_u16, r_u16, _a) = deinterleave_rgba64_16px(raw0, raw1, raw2, raw3); // Swap B↔R: store (R, G, B) write_rgb_u16_8( @@ -973,13 +1083,14 @@ pub(crate) unsafe fn avx2_bgra64_to_rgb_u16_row(bgra64: &[u16], rgb_out: &mut [u x += 16; } if x < width { - scalar::bgra64_to_rgb_u16_row(&bgra64[x * 4..], &mut rgb_out[x * 3..], width - x); + scalar::bgra64_to_rgb_u16_row::(&bgra64[x * 4..], &mut rgb_out[x * 3..], width - x); } } } /// AVX2 Bgra64 → native-depth u16 RGBA. 16 pixels per SIMD iteration. /// B↔R swap; source alpha preserved at position 3. +/// When `BE = true` each loaded register is byte-swapped before deinterleaving. /// /// # Safety /// @@ -988,7 +1099,7 @@ pub(crate) unsafe fn avx2_bgra64_to_rgb_u16_row(bgra64: &[u16], rgb_out: &mut [u /// 3. `rgba_out.len() >= width * 4`. 
#[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn avx2_bgra64_to_rgba_u16_row( +pub(crate) unsafe fn avx2_bgra64_to_rgba_u16_row( bgra64: &[u16], rgba_out: &mut [u16], width: usize, @@ -1000,10 +1111,10 @@ pub(crate) unsafe fn avx2_bgra64_to_rgba_u16_row( let mut x = 0usize; while x + 16 <= width { let ptr = bgra64.as_ptr().add(x * 4); - let raw0 = _mm256_loadu_si256(ptr.cast()); - let raw1 = _mm256_loadu_si256(ptr.add(16).cast()); - let raw2 = _mm256_loadu_si256(ptr.add(32).cast()); - let raw3 = _mm256_loadu_si256(ptr.add(48).cast()); + let raw0 = byteswap256_if_be::(_mm256_loadu_si256(ptr.cast())); + let raw1 = byteswap256_if_be::(_mm256_loadu_si256(ptr.add(16).cast())); + let raw2 = byteswap256_if_be::(_mm256_loadu_si256(ptr.add(32).cast())); + let raw3 = byteswap256_if_be::(_mm256_loadu_si256(ptr.add(48).cast())); // Swap B↔R: (R=ch2, G=ch1, B=ch0, A=ch3) let (b_u16, g_u16, r_u16, a_u16) = deinterleave_rgba64_16px(raw0, raw1, raw2, raw3); write_rgba_u16_8( @@ -1023,7 +1134,7 @@ pub(crate) unsafe fn avx2_bgra64_to_rgba_u16_row( x += 16; } if x < width { - scalar::bgra64_to_rgba_u16_row(&bgra64[x * 4..], &mut rgba_out[x * 4..], width - x); + scalar::bgra64_to_rgba_u16_row::(&bgra64[x * 4..], &mut rgba_out[x * 4..], width - x); } } } diff --git a/src/row/arch/x86_avx2/tests/packed_rgb.rs b/src/row/arch/x86_avx2/tests/packed_rgb.rs index 981c50e0..16ea736c 100644 --- a/src/row/arch/x86_avx2/tests/packed_rgb.rs +++ b/src/row/arch/x86_avx2/tests/packed_rgb.rs @@ -231,9 +231,9 @@ fn avx2_x2rgb10_to_rgb_matches_scalar() { let input = pseudo_random_rgba(w); let mut out_scalar = std::vec![0u8; w * 3]; let mut out_avx = std::vec![0u8; w * 3]; - scalar::x2rgb10_to_rgb_row(&input, &mut out_scalar, w); + scalar::x2rgb10_to_rgb_row::(&input, &mut out_scalar, w); unsafe { - x2rgb10_to_rgb_row(&input, &mut out_avx, w); + x2rgb10_to_rgb_row::(&input, &mut out_avx, w); } assert_eq!( out_scalar, out_avx, @@ -251,9 +251,9 @@ fn avx2_x2rgb10_to_rgba_matches_scalar() 
{ let input = pseudo_random_rgba(w); let mut out_scalar = std::vec![0u8; w * 4]; let mut out_avx = std::vec![0u8; w * 4]; - scalar::x2rgb10_to_rgba_row(&input, &mut out_scalar, w); + scalar::x2rgb10_to_rgba_row::(&input, &mut out_scalar, w); unsafe { - x2rgb10_to_rgba_row(&input, &mut out_avx, w); + x2rgb10_to_rgba_row::(&input, &mut out_avx, w); } assert_eq!( out_scalar, out_avx, @@ -271,9 +271,9 @@ fn avx2_x2rgb10_to_rgb_u16_matches_scalar() { let input = pseudo_random_rgba(w); let mut out_scalar = std::vec![0u16; w * 3]; let mut out_avx = std::vec![0u16; w * 3]; - scalar::x2rgb10_to_rgb_u16_row(&input, &mut out_scalar, w); + scalar::x2rgb10_to_rgb_u16_row::(&input, &mut out_scalar, w); unsafe { - x2rgb10_to_rgb_u16_row(&input, &mut out_avx, w); + x2rgb10_to_rgb_u16_row::(&input, &mut out_avx, w); } assert_eq!( out_scalar, out_avx, @@ -291,9 +291,9 @@ fn avx2_x2bgr10_to_rgb_matches_scalar() { let input = pseudo_random_rgba(w); let mut out_scalar = std::vec![0u8; w * 3]; let mut out_avx = std::vec![0u8; w * 3]; - scalar::x2bgr10_to_rgb_row(&input, &mut out_scalar, w); + scalar::x2bgr10_to_rgb_row::(&input, &mut out_scalar, w); unsafe { - x2bgr10_to_rgb_row(&input, &mut out_avx, w); + x2bgr10_to_rgb_row::(&input, &mut out_avx, w); } assert_eq!( out_scalar, out_avx, @@ -311,9 +311,9 @@ fn avx2_x2bgr10_to_rgba_matches_scalar() { let input = pseudo_random_rgba(w); let mut out_scalar = std::vec![0u8; w * 4]; let mut out_avx = std::vec![0u8; w * 4]; - scalar::x2bgr10_to_rgba_row(&input, &mut out_scalar, w); + scalar::x2bgr10_to_rgba_row::(&input, &mut out_scalar, w); unsafe { - x2bgr10_to_rgba_row(&input, &mut out_avx, w); + x2bgr10_to_rgba_row::(&input, &mut out_avx, w); } assert_eq!( out_scalar, out_avx, @@ -331,9 +331,9 @@ fn avx2_x2bgr10_to_rgb_u16_matches_scalar() { let input = pseudo_random_rgba(w); let mut out_scalar = std::vec![0u16; w * 3]; let mut out_avx = std::vec![0u16; w * 3]; - scalar::x2bgr10_to_rgb_u16_row(&input, &mut out_scalar, w); + 
scalar::x2bgr10_to_rgb_u16_row::(&input, &mut out_scalar, w); unsafe { - x2bgr10_to_rgb_u16_row(&input, &mut out_avx, w); + x2bgr10_to_rgb_u16_row::(&input, &mut out_avx, w); } assert_eq!( out_scalar, out_avx, diff --git a/src/row/arch/x86_avx2/tests/packed_rgb_16bit.rs b/src/row/arch/x86_avx2/tests/packed_rgb_16bit.rs index 1490d6e1..9dceec81 100644 --- a/src/row/arch/x86_avx2/tests/packed_rgb_16bit.rs +++ b/src/row/arch/x86_avx2/tests/packed_rgb_16bit.rs @@ -63,8 +63,8 @@ fn avx2_rgb48_to_rgb_matches_scalar_width17() { let src = make_rgb48_src(17, 0x0101); let mut simd_out = std::vec![0u8; 17 * 3]; let mut scalar_out = std::vec![0u8; 17 * 3]; - unsafe { avx2_rgb48_to_rgb_row(&src, &mut simd_out, 17) }; - scalar::rgb48_to_rgb_row(&src, &mut scalar_out, 17); + unsafe { avx2_rgb48_to_rgb_row::(&src, &mut simd_out, 17) }; + scalar::rgb48_to_rgb_row::(&src, &mut scalar_out, 17); assert_eq!( simd_out, scalar_out, "rgb48→rgb width=17: SIMD vs scalar mismatch" @@ -80,8 +80,8 @@ fn avx2_rgb48_to_rgb_exact16_matches_scalar() { let src = make_rgb48_src(16, 0xF0F0); let mut simd_out = std::vec![0u8; 16 * 3]; let mut scalar_out = std::vec![0u8; 16 * 3]; - unsafe { avx2_rgb48_to_rgb_row(&src, &mut simd_out, 16) }; - scalar::rgb48_to_rgb_row(&src, &mut scalar_out, 16); + unsafe { avx2_rgb48_to_rgb_row::(&src, &mut simd_out, 16) }; + scalar::rgb48_to_rgb_row::(&src, &mut scalar_out, 16); assert_eq!( simd_out, scalar_out, "rgb48→rgb exact-16: SIMD vs scalar mismatch" @@ -97,8 +97,8 @@ fn avx2_rgb48_to_rgb_width1_tail_only() { let src = [0x1234u16, 0x5678, 0x9ABC]; let mut simd_out = [0u8; 3]; let mut scalar_out = [0u8; 3]; - unsafe { avx2_rgb48_to_rgb_row(&src, &mut simd_out, 1) }; - scalar::rgb48_to_rgb_row(&src, &mut scalar_out, 1); + unsafe { avx2_rgb48_to_rgb_row::(&src, &mut simd_out, 1) }; + scalar::rgb48_to_rgb_row::(&src, &mut scalar_out, 1); assert_eq!( simd_out, scalar_out, "rgb48→rgb width=1: tail-only mismatch" @@ -115,8 +115,8 @@ fn 
avx2_rgb48_to_rgb_lane_order_regression() { let src = make_rgb48_asymmetric(17); let mut simd_out = std::vec![0u8; 17 * 3]; let mut scalar_out = std::vec![0u8; 17 * 3]; - unsafe { avx2_rgb48_to_rgb_row(&src, &mut simd_out, 17) }; - scalar::rgb48_to_rgb_row(&src, &mut scalar_out, 17); + unsafe { avx2_rgb48_to_rgb_row::(&src, &mut simd_out, 17) }; + scalar::rgb48_to_rgb_row::(&src, &mut scalar_out, 17); assert_eq!( simd_out, scalar_out, "rgb48→rgb lane order: SIMD vs scalar mismatch (channel swap?)" @@ -136,8 +136,8 @@ fn avx2_rgb48_to_rgba_matches_scalar_width17() { let src = make_rgb48_src(17, 0x0303); let mut simd_out = std::vec![0u8; 17 * 4]; let mut scalar_out = std::vec![0u8; 17 * 4]; - unsafe { avx2_rgb48_to_rgba_row(&src, &mut simd_out, 17) }; - scalar::rgb48_to_rgba_row(&src, &mut scalar_out, 17); + unsafe { avx2_rgb48_to_rgba_row::(&src, &mut simd_out, 17) }; + scalar::rgb48_to_rgba_row::(&src, &mut scalar_out, 17); assert_eq!( simd_out, scalar_out, "rgb48→rgba width=17: SIMD vs scalar mismatch" @@ -157,8 +157,8 @@ fn avx2_rgb48_to_rgb_u16_matches_scalar_width17() { let src = make_rgb48_src(17, 0x0505); let mut simd_out = std::vec![0u16; 17 * 3]; let mut scalar_out = std::vec![0u16; 17 * 3]; - unsafe { avx2_rgb48_to_rgb_u16_row(&src, &mut simd_out, 17) }; - scalar::rgb48_to_rgb_u16_row(&src, &mut scalar_out, 17); + unsafe { avx2_rgb48_to_rgb_u16_row::(&src, &mut simd_out, 17) }; + scalar::rgb48_to_rgb_u16_row::(&src, &mut scalar_out, 17); assert_eq!( simd_out, scalar_out, "rgb48→rgb_u16 width=17: SIMD vs scalar mismatch" @@ -175,8 +175,8 @@ fn avx2_rgb48_to_rgb_u16_lane_order_regression() { let src = make_rgb48_asymmetric(17); let mut simd_out = std::vec![0u16; 17 * 3]; let mut scalar_out = std::vec![0u16; 17 * 3]; - unsafe { avx2_rgb48_to_rgb_u16_row(&src, &mut simd_out, 17) }; - scalar::rgb48_to_rgb_u16_row(&src, &mut scalar_out, 17); + unsafe { avx2_rgb48_to_rgb_u16_row::(&src, &mut simd_out, 17) }; + scalar::rgb48_to_rgb_u16_row::(&src, &mut scalar_out, 
17); assert_eq!( simd_out, scalar_out, "rgb48→rgb_u16 lane order: SIMD vs scalar mismatch (channel swap?)" @@ -196,8 +196,8 @@ fn avx2_rgb48_to_rgba_u16_matches_scalar_width17() { let src = make_rgb48_src(17, 0x0707); let mut simd_out = std::vec![0u16; 17 * 4]; let mut scalar_out = std::vec![0u16; 17 * 4]; - unsafe { avx2_rgb48_to_rgba_u16_row(&src, &mut simd_out, 17) }; - scalar::rgb48_to_rgba_u16_row(&src, &mut scalar_out, 17); + unsafe { avx2_rgb48_to_rgba_u16_row::(&src, &mut simd_out, 17) }; + scalar::rgb48_to_rgba_u16_row::(&src, &mut scalar_out, 17); assert_eq!( simd_out, scalar_out, "rgb48→rgba_u16 width=17: SIMD vs scalar mismatch" @@ -217,8 +217,8 @@ fn avx2_bgr48_to_rgb_matches_scalar_width17() { let src = make_rgb48_src(17, 0x1111); let mut simd_out = std::vec![0u8; 17 * 3]; let mut scalar_out = std::vec![0u8; 17 * 3]; - unsafe { avx2_bgr48_to_rgb_row(&src, &mut simd_out, 17) }; - scalar::bgr48_to_rgb_row(&src, &mut scalar_out, 17); + unsafe { avx2_bgr48_to_rgb_row::(&src, &mut simd_out, 17) }; + scalar::bgr48_to_rgb_row::(&src, &mut scalar_out, 17); assert_eq!( simd_out, scalar_out, "bgr48→rgb width=17: SIMD vs scalar mismatch" @@ -234,8 +234,8 @@ fn avx2_bgr48_to_rgb_exact16_matches_scalar() { let src = make_rgb48_src(16, 0xA1A1); let mut simd_out = std::vec![0u8; 16 * 3]; let mut scalar_out = std::vec![0u8; 16 * 3]; - unsafe { avx2_bgr48_to_rgb_row(&src, &mut simd_out, 16) }; - scalar::bgr48_to_rgb_row(&src, &mut scalar_out, 16); + unsafe { avx2_bgr48_to_rgb_row::(&src, &mut simd_out, 16) }; + scalar::bgr48_to_rgb_row::(&src, &mut scalar_out, 16); assert_eq!( simd_out, scalar_out, "bgr48→rgb exact-16: SIMD vs scalar mismatch" @@ -254,8 +254,8 @@ fn avx2_bgr48_to_rgb_lane_order_regression() { let src = make_rgb48_asymmetric(17); // reuse helper (ch0 treated as B) let mut simd_out = std::vec![0u8; 17 * 3]; let mut scalar_out = std::vec![0u8; 17 * 3]; - unsafe { avx2_bgr48_to_rgb_row(&src, &mut simd_out, 17) }; - scalar::bgr48_to_rgb_row(&src, &mut 
scalar_out, 17); + unsafe { avx2_bgr48_to_rgb_row::(&src, &mut simd_out, 17) }; + scalar::bgr48_to_rgb_row::(&src, &mut scalar_out, 17); assert_eq!( simd_out, scalar_out, "bgr48→rgb lane order (B↔R swap): SIMD vs scalar mismatch" @@ -275,8 +275,8 @@ fn avx2_bgr48_to_rgba_matches_scalar_width17() { let src = make_rgb48_src(17, 0x2222); let mut simd_out = std::vec![0u8; 17 * 4]; let mut scalar_out = std::vec![0u8; 17 * 4]; - unsafe { avx2_bgr48_to_rgba_row(&src, &mut simd_out, 17) }; - scalar::bgr48_to_rgba_row(&src, &mut scalar_out, 17); + unsafe { avx2_bgr48_to_rgba_row::(&src, &mut simd_out, 17) }; + scalar::bgr48_to_rgba_row::(&src, &mut scalar_out, 17); assert_eq!( simd_out, scalar_out, "bgr48→rgba width=17: SIMD vs scalar mismatch" @@ -296,8 +296,8 @@ fn avx2_bgr48_to_rgb_u16_matches_scalar_width17() { let src = make_rgb48_src(17, 0x3333); let mut simd_out = std::vec![0u16; 17 * 3]; let mut scalar_out = std::vec![0u16; 17 * 3]; - unsafe { avx2_bgr48_to_rgb_u16_row(&src, &mut simd_out, 17) }; - scalar::bgr48_to_rgb_u16_row(&src, &mut scalar_out, 17); + unsafe { avx2_bgr48_to_rgb_u16_row::(&src, &mut simd_out, 17) }; + scalar::bgr48_to_rgb_u16_row::(&src, &mut scalar_out, 17); assert_eq!( simd_out, scalar_out, "bgr48→rgb_u16 width=17: SIMD vs scalar mismatch" @@ -317,8 +317,8 @@ fn avx2_bgr48_to_rgba_u16_matches_scalar_width17() { let src = make_rgb48_src(17, 0x4444); let mut simd_out = std::vec![0u16; 17 * 4]; let mut scalar_out = std::vec![0u16; 17 * 4]; - unsafe { avx2_bgr48_to_rgba_u16_row(&src, &mut simd_out, 17) }; - scalar::bgr48_to_rgba_u16_row(&src, &mut scalar_out, 17); + unsafe { avx2_bgr48_to_rgba_u16_row::(&src, &mut simd_out, 17) }; + scalar::bgr48_to_rgba_u16_row::(&src, &mut scalar_out, 17); assert_eq!( simd_out, scalar_out, "bgr48→rgba_u16 width=17: SIMD vs scalar mismatch" @@ -338,8 +338,8 @@ fn avx2_rgba64_to_rgb_matches_scalar_width17() { let src = make_rgba64_src(17, 0xAAAA); let mut simd_out = std::vec![0u8; 17 * 3]; let mut scalar_out = 
std::vec![0u8; 17 * 3]; - unsafe { avx2_rgba64_to_rgb_row(&src, &mut simd_out, 17) }; - scalar::rgba64_to_rgb_row(&src, &mut scalar_out, 17); + unsafe { avx2_rgba64_to_rgb_row::(&src, &mut simd_out, 17) }; + scalar::rgba64_to_rgb_row::(&src, &mut scalar_out, 17); assert_eq!( simd_out, scalar_out, "rgba64→rgb width=17: SIMD vs scalar mismatch" @@ -355,8 +355,8 @@ fn avx2_rgba64_to_rgb_exact16_matches_scalar() { let src = make_rgba64_src(16, 0x0F0F); let mut simd_out = std::vec![0u8; 16 * 3]; let mut scalar_out = std::vec![0u8; 16 * 3]; - unsafe { avx2_rgba64_to_rgb_row(&src, &mut simd_out, 16) }; - scalar::rgba64_to_rgb_row(&src, &mut scalar_out, 16); + unsafe { avx2_rgba64_to_rgb_row::(&src, &mut simd_out, 16) }; + scalar::rgba64_to_rgb_row::(&src, &mut scalar_out, 16); assert_eq!( simd_out, scalar_out, "rgba64→rgb exact-16: SIMD vs scalar mismatch" @@ -373,8 +373,8 @@ fn avx2_rgba64_to_rgb_lane_order_regression() { let src = make_rgba64_asymmetric(17); let mut simd_out = std::vec![0u8; 17 * 3]; let mut scalar_out = std::vec![0u8; 17 * 3]; - unsafe { avx2_rgba64_to_rgb_row(&src, &mut simd_out, 17) }; - scalar::rgba64_to_rgb_row(&src, &mut scalar_out, 17); + unsafe { avx2_rgba64_to_rgb_row::(&src, &mut simd_out, 17) }; + scalar::rgba64_to_rgb_row::(&src, &mut scalar_out, 17); assert_eq!( simd_out, scalar_out, "rgba64→rgb lane order: SIMD vs scalar mismatch" @@ -394,8 +394,8 @@ fn avx2_rgba64_to_rgba_matches_scalar_width17() { let src = make_rgba64_src(17, 0xBBBB); let mut simd_out = std::vec![0u8; 17 * 4]; let mut scalar_out = std::vec![0u8; 17 * 4]; - unsafe { avx2_rgba64_to_rgba_row(&src, &mut simd_out, 17) }; - scalar::rgba64_to_rgba_row(&src, &mut scalar_out, 17); + unsafe { avx2_rgba64_to_rgba_row::(&src, &mut simd_out, 17) }; + scalar::rgba64_to_rgba_row::(&src, &mut scalar_out, 17); assert_eq!( simd_out, scalar_out, "rgba64→rgba width=17: SIMD vs scalar mismatch" @@ -412,8 +412,8 @@ fn avx2_rgba64_to_rgba_lane_order_regression() { let src = 
make_rgba64_asymmetric(17); let mut simd_out = std::vec![0u8; 17 * 4]; let mut scalar_out = std::vec![0u8; 17 * 4]; - unsafe { avx2_rgba64_to_rgba_row(&src, &mut simd_out, 17) }; - scalar::rgba64_to_rgba_row(&src, &mut scalar_out, 17); + unsafe { avx2_rgba64_to_rgba_row::(&src, &mut simd_out, 17) }; + scalar::rgba64_to_rgba_row::(&src, &mut scalar_out, 17); assert_eq!( simd_out, scalar_out, "rgba64→rgba lane order (alpha passthrough): SIMD vs scalar mismatch" @@ -433,8 +433,8 @@ fn avx2_rgba64_to_rgb_u16_matches_scalar_width17() { let src = make_rgba64_src(17, 0xCCCC); let mut simd_out = std::vec![0u16; 17 * 3]; let mut scalar_out = std::vec![0u16; 17 * 3]; - unsafe { avx2_rgba64_to_rgb_u16_row(&src, &mut simd_out, 17) }; - scalar::rgba64_to_rgb_u16_row(&src, &mut scalar_out, 17); + unsafe { avx2_rgba64_to_rgb_u16_row::(&src, &mut simd_out, 17) }; + scalar::rgba64_to_rgb_u16_row::(&src, &mut scalar_out, 17); assert_eq!( simd_out, scalar_out, "rgba64→rgb_u16 width=17: SIMD vs scalar mismatch" @@ -451,8 +451,8 @@ fn avx2_rgba64_to_rgb_u16_lane_order_regression() { let src = make_rgba64_asymmetric(17); let mut simd_out = std::vec![0u16; 17 * 3]; let mut scalar_out = std::vec![0u16; 17 * 3]; - unsafe { avx2_rgba64_to_rgb_u16_row(&src, &mut simd_out, 17) }; - scalar::rgba64_to_rgb_u16_row(&src, &mut scalar_out, 17); + unsafe { avx2_rgba64_to_rgb_u16_row::(&src, &mut simd_out, 17) }; + scalar::rgba64_to_rgb_u16_row::(&src, &mut scalar_out, 17); assert_eq!( simd_out, scalar_out, "rgba64→rgb_u16 lane order: SIMD vs scalar mismatch" @@ -472,8 +472,8 @@ fn avx2_rgba64_to_rgba_u16_matches_scalar_width17() { let src = make_rgba64_src(17, 0xDDDD); let mut simd_out = std::vec![0u16; 17 * 4]; let mut scalar_out = std::vec![0u16; 17 * 4]; - unsafe { avx2_rgba64_to_rgba_u16_row(&src, &mut simd_out, 17) }; - scalar::rgba64_to_rgba_u16_row(&src, &mut scalar_out, 17); + unsafe { avx2_rgba64_to_rgba_u16_row::(&src, &mut simd_out, 17) }; + scalar::rgba64_to_rgba_u16_row::(&src, &mut 
scalar_out, 17); assert_eq!( simd_out, scalar_out, "rgba64→rgba_u16 width=17: SIMD vs scalar mismatch" @@ -489,8 +489,8 @@ fn avx2_rgba64_to_rgba_u16_width1_tail_only() { let src = [0x1234u16, 0x5678, 0x9ABC, 0xDEF0]; // R, G, B, A let mut simd_out = [0u16; 4]; let mut scalar_out = [0u16; 4]; - unsafe { avx2_rgba64_to_rgba_u16_row(&src, &mut simd_out, 1) }; - scalar::rgba64_to_rgba_u16_row(&src, &mut scalar_out, 1); + unsafe { avx2_rgba64_to_rgba_u16_row::(&src, &mut simd_out, 1) }; + scalar::rgba64_to_rgba_u16_row::(&src, &mut scalar_out, 1); assert_eq!( simd_out, scalar_out, "rgba64→rgba_u16 width=1: tail-only mismatch" @@ -507,8 +507,8 @@ fn avx2_rgba64_to_rgba_u16_lane_order_regression() { let src = make_rgba64_asymmetric(17); let mut simd_out = std::vec![0u16; 17 * 4]; let mut scalar_out = std::vec![0u16; 17 * 4]; - unsafe { avx2_rgba64_to_rgba_u16_row(&src, &mut simd_out, 17) }; - scalar::rgba64_to_rgba_u16_row(&src, &mut scalar_out, 17); + unsafe { avx2_rgba64_to_rgba_u16_row::(&src, &mut simd_out, 17) }; + scalar::rgba64_to_rgba_u16_row::(&src, &mut scalar_out, 17); assert_eq!( simd_out, scalar_out, "rgba64→rgba_u16 lane order (identity copy): SIMD vs scalar mismatch" @@ -528,8 +528,8 @@ fn avx2_bgra64_to_rgb_matches_scalar_width17() { let src = make_rgba64_src(17, 0x1234); let mut simd_out = std::vec![0u8; 17 * 3]; let mut scalar_out = std::vec![0u8; 17 * 3]; - unsafe { avx2_bgra64_to_rgb_row(&src, &mut simd_out, 17) }; - scalar::bgra64_to_rgb_row(&src, &mut scalar_out, 17); + unsafe { avx2_bgra64_to_rgb_row::(&src, &mut simd_out, 17) }; + scalar::bgra64_to_rgb_row::(&src, &mut scalar_out, 17); assert_eq!( simd_out, scalar_out, "bgra64→rgb width=17: SIMD vs scalar mismatch" @@ -546,8 +546,8 @@ fn avx2_bgra64_to_rgb_lane_order_regression() { let src = make_rgba64_asymmetric(17); let mut simd_out = std::vec![0u8; 17 * 3]; let mut scalar_out = std::vec![0u8; 17 * 3]; - unsafe { avx2_bgra64_to_rgb_row(&src, &mut simd_out, 17) }; - 
scalar::bgra64_to_rgb_row(&src, &mut scalar_out, 17); + unsafe { avx2_bgra64_to_rgb_row::(&src, &mut simd_out, 17) }; + scalar::bgra64_to_rgb_row::(&src, &mut scalar_out, 17); assert_eq!( simd_out, scalar_out, "bgra64→rgb lane order (B↔R swap): SIMD vs scalar mismatch" @@ -567,8 +567,8 @@ fn avx2_bgra64_to_rgba_matches_scalar_width17() { let src = make_rgba64_src(17, 0x5678); let mut simd_out = std::vec![0u8; 17 * 4]; let mut scalar_out = std::vec![0u8; 17 * 4]; - unsafe { avx2_bgra64_to_rgba_row(&src, &mut simd_out, 17) }; - scalar::bgra64_to_rgba_row(&src, &mut scalar_out, 17); + unsafe { avx2_bgra64_to_rgba_row::(&src, &mut simd_out, 17) }; + scalar::bgra64_to_rgba_row::(&src, &mut scalar_out, 17); assert_eq!( simd_out, scalar_out, "bgra64→rgba width=17: SIMD vs scalar mismatch" @@ -585,8 +585,8 @@ fn avx2_bgra64_to_rgba_lane_order_regression() { let src = make_rgba64_asymmetric(17); let mut simd_out = std::vec![0u8; 17 * 4]; let mut scalar_out = std::vec![0u8; 17 * 4]; - unsafe { avx2_bgra64_to_rgba_row(&src, &mut simd_out, 17) }; - scalar::bgra64_to_rgba_row(&src, &mut scalar_out, 17); + unsafe { avx2_bgra64_to_rgba_row::(&src, &mut simd_out, 17) }; + scalar::bgra64_to_rgba_row::(&src, &mut scalar_out, 17); assert_eq!( simd_out, scalar_out, "bgra64→rgba lane order (B↔R swap + alpha): SIMD vs scalar mismatch" @@ -606,8 +606,8 @@ fn avx2_bgra64_to_rgb_u16_matches_scalar_width17() { let src = make_rgba64_src(17, 0x9ABC); let mut simd_out = std::vec![0u16; 17 * 3]; let mut scalar_out = std::vec![0u16; 17 * 3]; - unsafe { avx2_bgra64_to_rgb_u16_row(&src, &mut simd_out, 17) }; - scalar::bgra64_to_rgb_u16_row(&src, &mut scalar_out, 17); + unsafe { avx2_bgra64_to_rgb_u16_row::(&src, &mut simd_out, 17) }; + scalar::bgra64_to_rgb_u16_row::(&src, &mut scalar_out, 17); assert_eq!( simd_out, scalar_out, "bgra64→rgb_u16 width=17: SIMD vs scalar mismatch" @@ -627,8 +627,8 @@ fn avx2_bgra64_to_rgba_u16_matches_scalar_width17() { let src = make_rgba64_src(17, 0xDEF0); let mut 
simd_out = std::vec![0u16; 17 * 4]; let mut scalar_out = std::vec![0u16; 17 * 4]; - unsafe { avx2_bgra64_to_rgba_u16_row(&src, &mut simd_out, 17) }; - scalar::bgra64_to_rgba_u16_row(&src, &mut scalar_out, 17); + unsafe { avx2_bgra64_to_rgba_u16_row::(&src, &mut simd_out, 17) }; + scalar::bgra64_to_rgba_u16_row::(&src, &mut scalar_out, 17); assert_eq!( simd_out, scalar_out, "bgra64→rgba_u16 width=17: SIMD vs scalar mismatch" @@ -644,8 +644,8 @@ fn avx2_bgra64_to_rgba_u16_width1_tail_only() { let src = [0x1111u16, 0x2222, 0x3333, 0x4444]; // B, G, R, A let mut simd_out = [0u16; 4]; let mut scalar_out = [0u16; 4]; - unsafe { avx2_bgra64_to_rgba_u16_row(&src, &mut simd_out, 1) }; - scalar::bgra64_to_rgba_u16_row(&src, &mut scalar_out, 1); + unsafe { avx2_bgra64_to_rgba_u16_row::(&src, &mut simd_out, 1) }; + scalar::bgra64_to_rgba_u16_row::(&src, &mut scalar_out, 1); assert_eq!( simd_out, scalar_out, "bgra64→rgba_u16 width=1: tail-only mismatch" @@ -662,8 +662,8 @@ fn avx2_bgra64_to_rgba_u16_lane_order_regression() { let src = make_rgba64_asymmetric(17); let mut simd_out = std::vec![0u16; 17 * 4]; let mut scalar_out = std::vec![0u16; 17 * 4]; - unsafe { avx2_bgra64_to_rgba_u16_row(&src, &mut simd_out, 17) }; - scalar::bgra64_to_rgba_u16_row(&src, &mut scalar_out, 17); + unsafe { avx2_bgra64_to_rgba_u16_row::(&src, &mut simd_out, 17) }; + scalar::bgra64_to_rgba_u16_row::(&src, &mut scalar_out, 17); assert_eq!( simd_out, scalar_out, "bgra64→rgba_u16 lane order (B↔R swap + alpha preserve): SIMD vs scalar mismatch" @@ -722,7 +722,7 @@ fn avx2_rgba64_to_rgba_u16_lane_order_handcheck() { } let src = make_rgba64_lane_order(17); let mut simd_out = std::vec![0u16; 17 * 4]; - unsafe { avx2_rgba64_to_rgba_u16_row(&src, &mut simd_out, 17) }; + unsafe { avx2_rgba64_to_rgba_u16_row::(&src, &mut simd_out, 17) }; for n in 0..17 { assert_eq!(simd_out[n * 4], (n as u16) + 1, "R at pixel {n}"); assert_eq!(simd_out[n * 4 + 1], (n as u16) + 100, "G at pixel {n}"); @@ -739,7 +739,7 @@ fn 
avx2_rgba64_to_rgb_u16_lane_order_handcheck() { } let src = make_rgba64_lane_order(17); let mut simd_out = std::vec![0u16; 17 * 3]; - unsafe { avx2_rgba64_to_rgb_u16_row(&src, &mut simd_out, 17) }; + unsafe { avx2_rgba64_to_rgb_u16_row::(&src, &mut simd_out, 17) }; for n in 0..17 { assert_eq!(simd_out[n * 3], (n as u16) + 1, "R at pixel {n}"); assert_eq!(simd_out[n * 3 + 1], (n as u16) + 100, "G at pixel {n}"); @@ -755,7 +755,7 @@ fn avx2_bgra64_to_rgba_u16_lane_order_handcheck() { } let src = make_bgra64_lane_order(17); let mut simd_out = std::vec![0u16; 17 * 4]; - unsafe { avx2_bgra64_to_rgba_u16_row(&src, &mut simd_out, 17) }; + unsafe { avx2_bgra64_to_rgba_u16_row::(&src, &mut simd_out, 17) }; // Output is RGBA: R=n+1, G=100+n, B=200+n, A=50+n per pixel n // (B↔R swap from source memory order). for n in 0..17 { @@ -774,7 +774,7 @@ fn avx2_bgra64_to_rgb_u16_lane_order_handcheck() { } let src = make_bgra64_lane_order(17); let mut simd_out = std::vec![0u16; 17 * 3]; - unsafe { avx2_bgra64_to_rgb_u16_row(&src, &mut simd_out, 17) }; + unsafe { avx2_bgra64_to_rgb_u16_row::(&src, &mut simd_out, 17) }; for n in 0..17 { assert_eq!(simd_out[n * 3], (n as u16) + 1, "R at pixel {n}"); assert_eq!(simd_out[n * 3 + 1], (n as u16) + 100, "G at pixel {n}"); diff --git a/src/row/arch/x86_avx512/packed_rgb.rs b/src/row/arch/x86_avx512/packed_rgb.rs index 164804d6..84d6e8c6 100644 --- a/src/row/arch/x86_avx512/packed_rgb.rs +++ b/src/row/arch/x86_avx512/packed_rgb.rs @@ -446,23 +446,29 @@ pub(crate) unsafe fn bgrx_to_rgba_row(bgrx: &[u8], rgba_out: &mut [u8], width: u /// [`super::x86_common::x2rgb10_to_rgb_16_pixels`]. 
#[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn x2rgb10_to_rgb_row(x2rgb10: &[u8], rgb_out: &mut [u8], width: usize) { +pub(crate) unsafe fn x2rgb10_to_rgb_row( + x2rgb10: &[u8], + rgb_out: &mut [u8], + width: usize, +) { debug_assert!(x2rgb10.len() >= width * 4, "x2rgb10 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); unsafe { let mut x = 0usize; - while x + 64 <= width { - let base_in = x2rgb10.as_ptr().add(x * 4); - let base_out = rgb_out.as_mut_ptr().add(x * 3); - x2rgb10_to_rgb_16_pixels(base_in, base_out); - x2rgb10_to_rgb_16_pixels(base_in.add(64), base_out.add(48)); - x2rgb10_to_rgb_16_pixels(base_in.add(128), base_out.add(96)); - x2rgb10_to_rgb_16_pixels(base_in.add(192), base_out.add(144)); - x += 64; + if !BE { + while x + 64 <= width { + let base_in = x2rgb10.as_ptr().add(x * 4); + let base_out = rgb_out.as_mut_ptr().add(x * 3); + x2rgb10_to_rgb_16_pixels(base_in, base_out); + x2rgb10_to_rgb_16_pixels(base_in.add(64), base_out.add(48)); + x2rgb10_to_rgb_16_pixels(base_in.add(128), base_out.add(96)); + x2rgb10_to_rgb_16_pixels(base_in.add(192), base_out.add(144)); + x += 64; + } } if x < width { - scalar::x2rgb10_to_rgb_row( + scalar::x2rgb10_to_rgb_row::( &x2rgb10[x * 4..width * 4], &mut rgb_out[x * 3..width * 3], width - x, @@ -474,23 +480,29 @@ pub(crate) unsafe fn x2rgb10_to_rgb_row(x2rgb10: &[u8], rgb_out: &mut [u8], widt /// AVX-512 X2RGB10→RGBA. 64 pixels per iteration. 
#[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn x2rgb10_to_rgba_row(x2rgb10: &[u8], rgba_out: &mut [u8], width: usize) { +pub(crate) unsafe fn x2rgb10_to_rgba_row( + x2rgb10: &[u8], + rgba_out: &mut [u8], + width: usize, +) { debug_assert!(x2rgb10.len() >= width * 4, "x2rgb10 row too short"); debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short"); unsafe { let mut x = 0usize; - while x + 64 <= width { - let base_in = x2rgb10.as_ptr().add(x * 4); - let base_out = rgba_out.as_mut_ptr().add(x * 4); - x2rgb10_to_rgba_16_pixels(base_in, base_out); - x2rgb10_to_rgba_16_pixels(base_in.add(64), base_out.add(64)); - x2rgb10_to_rgba_16_pixels(base_in.add(128), base_out.add(128)); - x2rgb10_to_rgba_16_pixels(base_in.add(192), base_out.add(192)); - x += 64; + if !BE { + while x + 64 <= width { + let base_in = x2rgb10.as_ptr().add(x * 4); + let base_out = rgba_out.as_mut_ptr().add(x * 4); + x2rgb10_to_rgba_16_pixels(base_in, base_out); + x2rgb10_to_rgba_16_pixels(base_in.add(64), base_out.add(64)); + x2rgb10_to_rgba_16_pixels(base_in.add(128), base_out.add(128)); + x2rgb10_to_rgba_16_pixels(base_in.add(192), base_out.add(192)); + x += 64; + } } if x < width { - scalar::x2rgb10_to_rgba_row( + scalar::x2rgb10_to_rgba_row::( &x2rgb10[x * 4..width * 4], &mut rgba_out[x * 4..width * 4], width - x, @@ -502,23 +514,29 @@ pub(crate) unsafe fn x2rgb10_to_rgba_row(x2rgb10: &[u8], rgba_out: &mut [u8], wi /// AVX-512 X2RGB10→u16 RGB native. 32 pixels per iteration. 
#[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn x2rgb10_to_rgb_u16_row(x2rgb10: &[u8], rgb_out: &mut [u16], width: usize) { +pub(crate) unsafe fn x2rgb10_to_rgb_u16_row( + x2rgb10: &[u8], + rgb_out: &mut [u16], + width: usize, +) { debug_assert!(x2rgb10.len() >= width * 4, "x2rgb10 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); unsafe { let mut x = 0usize; - while x + 32 <= width { - let base_in = x2rgb10.as_ptr().add(x * 4); - let base_out = rgb_out.as_mut_ptr().add(x * 3).cast::(); - x2rgb10_to_rgb_u16_8_pixels(base_in, base_out); - x2rgb10_to_rgb_u16_8_pixels(base_in.add(32), base_out.add(48)); - x2rgb10_to_rgb_u16_8_pixels(base_in.add(64), base_out.add(96)); - x2rgb10_to_rgb_u16_8_pixels(base_in.add(96), base_out.add(144)); - x += 32; + if !BE { + while x + 32 <= width { + let base_in = x2rgb10.as_ptr().add(x * 4); + let base_out = rgb_out.as_mut_ptr().add(x * 3).cast::(); + x2rgb10_to_rgb_u16_8_pixels(base_in, base_out); + x2rgb10_to_rgb_u16_8_pixels(base_in.add(32), base_out.add(48)); + x2rgb10_to_rgb_u16_8_pixels(base_in.add(64), base_out.add(96)); + x2rgb10_to_rgb_u16_8_pixels(base_in.add(96), base_out.add(144)); + x += 32; + } } if x < width { - scalar::x2rgb10_to_rgb_u16_row( + scalar::x2rgb10_to_rgb_u16_row::( &x2rgb10[x * 4..width * 4], &mut rgb_out[x * 3..width * 3], width - x, @@ -530,23 +548,29 @@ pub(crate) unsafe fn x2rgb10_to_rgb_u16_row(x2rgb10: &[u8], rgb_out: &mut [u16], /// AVX-512 X2BGR10→RGB. 64 pixels per iteration. 
#[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn x2bgr10_to_rgb_row(x2bgr10: &[u8], rgb_out: &mut [u8], width: usize) { +pub(crate) unsafe fn x2bgr10_to_rgb_row( + x2bgr10: &[u8], + rgb_out: &mut [u8], + width: usize, +) { debug_assert!(x2bgr10.len() >= width * 4, "x2bgr10 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); unsafe { let mut x = 0usize; - while x + 64 <= width { - let base_in = x2bgr10.as_ptr().add(x * 4); - let base_out = rgb_out.as_mut_ptr().add(x * 3); - x2bgr10_to_rgb_16_pixels(base_in, base_out); - x2bgr10_to_rgb_16_pixels(base_in.add(64), base_out.add(48)); - x2bgr10_to_rgb_16_pixels(base_in.add(128), base_out.add(96)); - x2bgr10_to_rgb_16_pixels(base_in.add(192), base_out.add(144)); - x += 64; + if !BE { + while x + 64 <= width { + let base_in = x2bgr10.as_ptr().add(x * 4); + let base_out = rgb_out.as_mut_ptr().add(x * 3); + x2bgr10_to_rgb_16_pixels(base_in, base_out); + x2bgr10_to_rgb_16_pixels(base_in.add(64), base_out.add(48)); + x2bgr10_to_rgb_16_pixels(base_in.add(128), base_out.add(96)); + x2bgr10_to_rgb_16_pixels(base_in.add(192), base_out.add(144)); + x += 64; + } } if x < width { - scalar::x2bgr10_to_rgb_row( + scalar::x2bgr10_to_rgb_row::( &x2bgr10[x * 4..width * 4], &mut rgb_out[x * 3..width * 3], width - x, @@ -558,23 +582,29 @@ pub(crate) unsafe fn x2bgr10_to_rgb_row(x2bgr10: &[u8], rgb_out: &mut [u8], widt /// AVX-512 X2BGR10→RGBA. 64 pixels per iteration. 
#[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn x2bgr10_to_rgba_row(x2bgr10: &[u8], rgba_out: &mut [u8], width: usize) { +pub(crate) unsafe fn x2bgr10_to_rgba_row( + x2bgr10: &[u8], + rgba_out: &mut [u8], + width: usize, +) { debug_assert!(x2bgr10.len() >= width * 4, "x2bgr10 row too short"); debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short"); unsafe { let mut x = 0usize; - while x + 64 <= width { - let base_in = x2bgr10.as_ptr().add(x * 4); - let base_out = rgba_out.as_mut_ptr().add(x * 4); - x2bgr10_to_rgba_16_pixels(base_in, base_out); - x2bgr10_to_rgba_16_pixels(base_in.add(64), base_out.add(64)); - x2bgr10_to_rgba_16_pixels(base_in.add(128), base_out.add(128)); - x2bgr10_to_rgba_16_pixels(base_in.add(192), base_out.add(192)); - x += 64; + if !BE { + while x + 64 <= width { + let base_in = x2bgr10.as_ptr().add(x * 4); + let base_out = rgba_out.as_mut_ptr().add(x * 4); + x2bgr10_to_rgba_16_pixels(base_in, base_out); + x2bgr10_to_rgba_16_pixels(base_in.add(64), base_out.add(64)); + x2bgr10_to_rgba_16_pixels(base_in.add(128), base_out.add(128)); + x2bgr10_to_rgba_16_pixels(base_in.add(192), base_out.add(192)); + x += 64; + } } if x < width { - scalar::x2bgr10_to_rgba_row( + scalar::x2bgr10_to_rgba_row::( &x2bgr10[x * 4..width * 4], &mut rgba_out[x * 4..width * 4], width - x, @@ -586,23 +616,29 @@ pub(crate) unsafe fn x2bgr10_to_rgba_row(x2bgr10: &[u8], rgba_out: &mut [u8], wi /// AVX-512 X2BGR10→u16 RGB native. 32 pixels per iteration. 
#[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn x2bgr10_to_rgb_u16_row(x2bgr10: &[u8], rgb_out: &mut [u16], width: usize) { +pub(crate) unsafe fn x2bgr10_to_rgb_u16_row( + x2bgr10: &[u8], + rgb_out: &mut [u16], + width: usize, +) { debug_assert!(x2bgr10.len() >= width * 4, "x2bgr10 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); unsafe { let mut x = 0usize; - while x + 32 <= width { - let base_in = x2bgr10.as_ptr().add(x * 4); - let base_out = rgb_out.as_mut_ptr().add(x * 3).cast::(); - x2bgr10_to_rgb_u16_8_pixels(base_in, base_out); - x2bgr10_to_rgb_u16_8_pixels(base_in.add(32), base_out.add(48)); - x2bgr10_to_rgb_u16_8_pixels(base_in.add(64), base_out.add(96)); - x2bgr10_to_rgb_u16_8_pixels(base_in.add(96), base_out.add(144)); - x += 32; + if !BE { + while x + 32 <= width { + let base_in = x2bgr10.as_ptr().add(x * 4); + let base_out = rgb_out.as_mut_ptr().add(x * 3).cast::(); + x2bgr10_to_rgb_u16_8_pixels(base_in, base_out); + x2bgr10_to_rgb_u16_8_pixels(base_in.add(32), base_out.add(48)); + x2bgr10_to_rgb_u16_8_pixels(base_in.add(64), base_out.add(96)); + x2bgr10_to_rgb_u16_8_pixels(base_in.add(96), base_out.add(144)); + x += 32; + } } if x < width { - scalar::x2bgr10_to_rgb_u16_row( + scalar::x2bgr10_to_rgb_u16_row::( &x2bgr10[x * 4..width * 4], &mut rgb_out[x * 3..width * 3], width - x, diff --git a/src/row/arch/x86_avx512/packed_rgb_16bit.rs b/src/row/arch/x86_avx512/packed_rgb_16bit.rs index 243fff83..3b000cbb 100644 --- a/src/row/arch/x86_avx512/packed_rgb_16bit.rs +++ b/src/row/arch/x86_avx512/packed_rgb_16bit.rs @@ -240,6 +240,42 @@ unsafe fn narrow_u16x32_to_u8x32(v: __m512i) -> __m256i { unsafe { _mm512_cvtusepi16_epi8(_mm512_srli_epi16::<8>(v)) } } +// ---- endian byte-swap helpers ----------------------------------------------- + +/// Byte-swap every u16 lane in a `__m128i` when `BE = true`; no-op otherwise. +/// +/// Uses `_mm_shuffle_epi8` (SSSE3, a subset of AVX-512). 
+#[inline(always)] +unsafe fn byteswap128_if_be(v: __m128i) -> __m128i { + if BE { + const MASK: __m128i = + unsafe { core::mem::transmute([1u8, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14]) }; + unsafe { _mm_shuffle_epi8(v, MASK) } + } else { + v + } +} + +/// Byte-swap every u16 lane in a `__m512i` when `BE = true`; no-op otherwise. +/// +/// Uses `_mm512_shuffle_epi8` (AVX-512BW). +#[inline(always)] +unsafe fn byteswap512_if_be(v: __m512i) -> __m512i { + if BE { + // Same u16-lane byte-swap mask, broadcast across all 64 bytes. + const MASK: __m512i = unsafe { + core::mem::transmute([ + 1u8, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, + 10, 13, 12, 15, 14, 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 1, 0, 3, 2, 5, 4, + 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, + ]) + }; + unsafe { _mm512_shuffle_epi8(v, MASK) } + } else { + v + } +} + // ============================================================================= // Rgb48 (R, G, B — 3 u16 elements per pixel) // ============================================================================= @@ -249,6 +285,7 @@ unsafe fn narrow_u16x32_to_u8x32(v: __m512i) -> __m256i { /// Processes four 8-pixel halves (3 × 128-bit loads each) under the /// AVX-512 target_feature context (SSE4.1/SSSE3 are subsets). Narrows /// each channel via `>> 8` and writes 8 pixels (24 bytes) per half. +/// When `BE = true` each loaded register is byte-swapped before deinterleaving. /// /// # Safety /// @@ -257,7 +294,11 @@ unsafe fn narrow_u16x32_to_u8x32(v: __m512i) -> __m256i { /// 3. `rgb_out.len() >= width * 3`. 
#[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn avx512_rgb48_to_rgb_row(rgb48: &[u16], rgb_out: &mut [u8], width: usize) { +pub(crate) unsafe fn avx512_rgb48_to_rgb_row( + rgb48: &[u16], + rgb_out: &mut [u8], + width: usize, +) { debug_assert!(rgb48.len() >= width * 3, "rgb48 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); @@ -268,9 +309,9 @@ pub(crate) unsafe fn avx512_rgb48_to_rgb_row(rgb48: &[u16], rgb_out: &mut [u8], while x + 32 <= width { let ptr = rgb48.as_ptr().add(x * 3); // Half 0: pixels x..x+7 - let v0 = _mm_loadu_si128(ptr.cast()); - let v1 = _mm_loadu_si128(ptr.add(8).cast()); - let v2 = _mm_loadu_si128(ptr.add(16).cast()); + let v0 = byteswap128_if_be::(_mm_loadu_si128(ptr.cast())); + let v1 = byteswap128_if_be::(_mm_loadu_si128(ptr.add(8).cast())); + let v2 = byteswap128_if_be::(_mm_loadu_si128(ptr.add(16).cast())); let (r0, g0, b0) = deinterleave_rgb48_8px(v0, v1, v2); let r0u8 = narrow_u16x8_to_u8x8(r0, zero); let g0u8 = narrow_u16x8_to_u8x8(g0, zero); @@ -281,9 +322,9 @@ pub(crate) unsafe fn avx512_rgb48_to_rgb_row(rgb48: &[u16], rgb_out: &mut [u8], // Half 1: pixels x+8..x+15 let ptr8 = ptr.add(24); - let v3 = _mm_loadu_si128(ptr8.cast()); - let v4 = _mm_loadu_si128(ptr8.add(8).cast()); - let v5 = _mm_loadu_si128(ptr8.add(16).cast()); + let v3 = byteswap128_if_be::(_mm_loadu_si128(ptr8.cast())); + let v4 = byteswap128_if_be::(_mm_loadu_si128(ptr8.add(8).cast())); + let v5 = byteswap128_if_be::(_mm_loadu_si128(ptr8.add(16).cast())); let (r1, g1, b1) = deinterleave_rgb48_8px(v3, v4, v5); let r1u8 = narrow_u16x8_to_u8x8(r1, zero); let g1u8 = narrow_u16x8_to_u8x8(g1, zero); @@ -294,9 +335,9 @@ pub(crate) unsafe fn avx512_rgb48_to_rgb_row(rgb48: &[u16], rgb_out: &mut [u8], // Half 2: pixels x+16..x+23 let ptr16 = ptr.add(48); - let v6 = _mm_loadu_si128(ptr16.cast()); - let v7 = _mm_loadu_si128(ptr16.add(8).cast()); - let v8 = _mm_loadu_si128(ptr16.add(16).cast()); + let v6 = 
byteswap128_if_be::(_mm_loadu_si128(ptr16.cast())); + let v7 = byteswap128_if_be::(_mm_loadu_si128(ptr16.add(8).cast())); + let v8 = byteswap128_if_be::(_mm_loadu_si128(ptr16.add(16).cast())); let (r2, g2, b2) = deinterleave_rgb48_8px(v6, v7, v8); let r2u8 = narrow_u16x8_to_u8x8(r2, zero); let g2u8 = narrow_u16x8_to_u8x8(g2, zero); @@ -307,9 +348,9 @@ pub(crate) unsafe fn avx512_rgb48_to_rgb_row(rgb48: &[u16], rgb_out: &mut [u8], // Half 3: pixels x+24..x+31 let ptr24 = ptr.add(72); - let v9 = _mm_loadu_si128(ptr24.cast()); - let v10 = _mm_loadu_si128(ptr24.add(8).cast()); - let v11 = _mm_loadu_si128(ptr24.add(16).cast()); + let v9 = byteswap128_if_be::(_mm_loadu_si128(ptr24.cast())); + let v10 = byteswap128_if_be::(_mm_loadu_si128(ptr24.add(8).cast())); + let v11 = byteswap128_if_be::(_mm_loadu_si128(ptr24.add(16).cast())); let (r3, g3, b3) = deinterleave_rgb48_8px(v9, v10, v11); let r3u8 = narrow_u16x8_to_u8x8(r3, zero); let g3u8 = narrow_u16x8_to_u8x8(g3, zero); @@ -322,13 +363,14 @@ pub(crate) unsafe fn avx512_rgb48_to_rgb_row(rgb48: &[u16], rgb_out: &mut [u8], } // Scalar tail: remaining < 32 pixels. if x < width { - scalar::rgb48_to_rgb_row(&rgb48[x * 3..], &mut rgb_out[x * 3..], width - x); + scalar::rgb48_to_rgb_row::(&rgb48[x * 3..], &mut rgb_out[x * 3..], width - x); } } } /// AVX-512 Rgb48 → packed u8 RGBA. 32 pixels per outer iteration. Alpha /// forced to 0xFF. +/// When `BE = true` each loaded register is byte-swapped before deinterleaving. /// /// # Safety /// @@ -337,7 +379,11 @@ pub(crate) unsafe fn avx512_rgb48_to_rgb_row(rgb48: &[u16], rgb_out: &mut [u8], /// 3. `rgba_out.len() >= width * 4`. 
#[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn avx512_rgb48_to_rgba_row(rgb48: &[u16], rgba_out: &mut [u8], width: usize) { +pub(crate) unsafe fn avx512_rgb48_to_rgba_row( + rgb48: &[u16], + rgba_out: &mut [u8], + width: usize, +) { debug_assert!(rgb48.len() >= width * 3, "rgb48 row too short"); debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short"); @@ -351,9 +397,9 @@ pub(crate) unsafe fn avx512_rgb48_to_rgba_row(rgb48: &[u16], rgba_out: &mut [u8] macro_rules! process_half { ($ptr:expr, $out_off:expr) => {{ - let v0 = _mm_loadu_si128($ptr.cast()); - let v1 = _mm_loadu_si128($ptr.add(8).cast()); - let v2 = _mm_loadu_si128($ptr.add(16).cast()); + let v0 = byteswap128_if_be::(_mm_loadu_si128($ptr.cast())); + let v1 = byteswap128_if_be::(_mm_loadu_si128($ptr.add(8).cast())); + let v2 = byteswap128_if_be::(_mm_loadu_si128($ptr.add(16).cast())); let (r, g, b) = deinterleave_rgb48_8px(v0, v1, v2); let ru8 = narrow_u16x8_to_u8x8(r, zero); let gu8 = narrow_u16x8_to_u8x8(g, zero); @@ -372,13 +418,15 @@ pub(crate) unsafe fn avx512_rgb48_to_rgba_row(rgb48: &[u16], rgba_out: &mut [u8] x += 32; } if x < width { - scalar::rgb48_to_rgba_row(&rgb48[x * 3..], &mut rgba_out[x * 4..], width - x); + scalar::rgb48_to_rgba_row::(&rgb48[x * 3..], &mut rgba_out[x * 4..], width - x); } } } /// AVX-512 Rgb48 → native-depth u16 RGB (identity repack). 32 pixels per iter. /// +/// When `BE = true` each loaded register is byte-swapped before deinterleaving. +/// /// # Safety /// /// 1. AVX-512F + AVX-512BW must be available. @@ -386,7 +434,11 @@ pub(crate) unsafe fn avx512_rgb48_to_rgba_row(rgb48: &[u16], rgba_out: &mut [u8] /// 3. `rgb_out.len() >= width * 3`. 
#[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn avx512_rgb48_to_rgb_u16_row(rgb48: &[u16], rgb_out: &mut [u16], width: usize) { +pub(crate) unsafe fn avx512_rgb48_to_rgb_u16_row( + rgb48: &[u16], + rgb_out: &mut [u16], + width: usize, +) { debug_assert!(rgb48.len() >= width * 3, "rgb48 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); @@ -397,9 +449,9 @@ pub(crate) unsafe fn avx512_rgb48_to_rgb_u16_row(rgb48: &[u16], rgb_out: &mut [u macro_rules! process_half_u16 { ($ptr:expr, $out_off:expr) => {{ - let v0 = _mm_loadu_si128($ptr.cast()); - let v1 = _mm_loadu_si128($ptr.add(8).cast()); - let v2 = _mm_loadu_si128($ptr.add(16).cast()); + let v0 = byteswap128_if_be::(_mm_loadu_si128($ptr.cast())); + let v1 = byteswap128_if_be::(_mm_loadu_si128($ptr.add(8).cast())); + let v2 = byteswap128_if_be::(_mm_loadu_si128($ptr.add(16).cast())); let (r, g, b) = deinterleave_rgb48_8px(v0, v1, v2); write_rgb_u16_8(r, g, b, rgb_out.as_mut_ptr().add($out_off)); }}; @@ -413,13 +465,14 @@ pub(crate) unsafe fn avx512_rgb48_to_rgb_u16_row(rgb48: &[u16], rgb_out: &mut [u x += 32; } if x < width { - scalar::rgb48_to_rgb_u16_row(&rgb48[x * 3..], &mut rgb_out[x * 3..], width - x); + scalar::rgb48_to_rgb_u16_row::(&rgb48[x * 3..], &mut rgb_out[x * 3..], width - x); } } } /// AVX-512 Rgb48 → native-depth u16 RGBA. 32 pixels per iter. Alpha forced to /// 0xFFFF. +/// When `BE = true` each loaded register is byte-swapped before deinterleaving. /// /// # Safety /// @@ -428,7 +481,7 @@ pub(crate) unsafe fn avx512_rgb48_to_rgb_u16_row(rgb48: &[u16], rgb_out: &mut [u /// 3. `rgba_out.len() >= width * 4`. #[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn avx512_rgb48_to_rgba_u16_row( +pub(crate) unsafe fn avx512_rgb48_to_rgba_u16_row( rgb48: &[u16], rgba_out: &mut [u16], width: usize, @@ -444,9 +497,9 @@ pub(crate) unsafe fn avx512_rgb48_to_rgba_u16_row( macro_rules! 
process_half_rgba_u16 { ($ptr:expr, $out_off:expr) => {{ - let v0 = _mm_loadu_si128($ptr.cast()); - let v1 = _mm_loadu_si128($ptr.add(8).cast()); - let v2 = _mm_loadu_si128($ptr.add(16).cast()); + let v0 = byteswap128_if_be::(_mm_loadu_si128($ptr.cast())); + let v1 = byteswap128_if_be::(_mm_loadu_si128($ptr.add(8).cast())); + let v2 = byteswap128_if_be::(_mm_loadu_si128($ptr.add(16).cast())); let (r, g, b) = deinterleave_rgb48_8px(v0, v1, v2); write_rgba_u16_8(r, g, b, opaque, rgba_out.as_mut_ptr().add($out_off)); }}; @@ -460,7 +513,7 @@ pub(crate) unsafe fn avx512_rgb48_to_rgba_u16_row( x += 32; } if x < width { - scalar::rgb48_to_rgba_u16_row(&rgb48[x * 3..], &mut rgba_out[x * 4..], width - x); + scalar::rgb48_to_rgba_u16_row::(&rgb48[x * 3..], &mut rgba_out[x * 4..], width - x); } } } @@ -471,6 +524,7 @@ pub(crate) unsafe fn avx512_rgb48_to_rgba_u16_row( /// AVX-512 Bgr48 → packed u8 RGB. 32 pixels per outer iteration. /// B↔R swap via passing `(ch2, ch1, ch0)` to write helpers. +/// When `BE = true` each loaded register is byte-swapped before deinterleaving. /// /// # Safety /// @@ -479,7 +533,11 @@ pub(crate) unsafe fn avx512_rgb48_to_rgba_u16_row( /// 3. `rgb_out.len() >= width * 3`. #[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn avx512_bgr48_to_rgb_row(bgr48: &[u16], rgb_out: &mut [u8], width: usize) { +pub(crate) unsafe fn avx512_bgr48_to_rgb_row( + bgr48: &[u16], + rgb_out: &mut [u8], + width: usize, +) { debug_assert!(bgr48.len() >= width * 3, "bgr48 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); @@ -491,9 +549,9 @@ pub(crate) unsafe fn avx512_bgr48_to_rgb_row(bgr48: &[u16], rgb_out: &mut [u8], macro_rules! 
process_half_bgr { ($ptr:expr, $out_off:expr) => {{ - let v0 = _mm_loadu_si128($ptr.cast()); - let v1 = _mm_loadu_si128($ptr.add(8).cast()); - let v2 = _mm_loadu_si128($ptr.add(16).cast()); + let v0 = byteswap128_if_be::(_mm_loadu_si128($ptr.cast())); + let v1 = byteswap128_if_be::(_mm_loadu_si128($ptr.add(8).cast())); + let v2 = byteswap128_if_be::(_mm_loadu_si128($ptr.add(16).cast())); let (b, g, r) = deinterleave_rgb48_8px(v0, v1, v2); let ru8 = narrow_u16x8_to_u8x8(r, zero); let gu8 = narrow_u16x8_to_u8x8(g, zero); @@ -512,13 +570,14 @@ pub(crate) unsafe fn avx512_bgr48_to_rgb_row(bgr48: &[u16], rgb_out: &mut [u8], x += 32; } if x < width { - scalar::bgr48_to_rgb_row(&bgr48[x * 3..], &mut rgb_out[x * 3..], width - x); + scalar::bgr48_to_rgb_row::(&bgr48[x * 3..], &mut rgb_out[x * 3..], width - x); } } } /// AVX-512 Bgr48 → packed u8 RGBA. 32 pixels per iter. /// B↔R swap; alpha forced to 0xFF. +/// When `BE = true` each loaded register is byte-swapped before deinterleaving. /// /// # Safety /// @@ -527,7 +586,11 @@ pub(crate) unsafe fn avx512_bgr48_to_rgb_row(bgr48: &[u16], rgb_out: &mut [u8], /// 3. `rgba_out.len() >= width * 4`. #[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn avx512_bgr48_to_rgba_row(bgr48: &[u16], rgba_out: &mut [u8], width: usize) { +pub(crate) unsafe fn avx512_bgr48_to_rgba_row( + bgr48: &[u16], + rgba_out: &mut [u8], + width: usize, +) { debug_assert!(bgr48.len() >= width * 3, "bgr48 row too short"); debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short"); @@ -541,9 +604,9 @@ pub(crate) unsafe fn avx512_bgr48_to_rgba_row(bgr48: &[u16], rgba_out: &mut [u8] macro_rules! 
process_half_bgr_rgba { ($ptr:expr, $out_off:expr) => {{ - let v0 = _mm_loadu_si128($ptr.cast()); - let v1 = _mm_loadu_si128($ptr.add(8).cast()); - let v2 = _mm_loadu_si128($ptr.add(16).cast()); + let v0 = byteswap128_if_be::(_mm_loadu_si128($ptr.cast())); + let v1 = byteswap128_if_be::(_mm_loadu_si128($ptr.add(8).cast())); + let v2 = byteswap128_if_be::(_mm_loadu_si128($ptr.add(16).cast())); let (b, g, r) = deinterleave_rgb48_8px(v0, v1, v2); let ru8 = narrow_u16x8_to_u8x8(r, zero); let gu8 = narrow_u16x8_to_u8x8(g, zero); @@ -562,13 +625,14 @@ pub(crate) unsafe fn avx512_bgr48_to_rgba_row(bgr48: &[u16], rgba_out: &mut [u8] x += 32; } if x < width { - scalar::bgr48_to_rgba_row(&bgr48[x * 3..], &mut rgba_out[x * 4..], width - x); + scalar::bgr48_to_rgba_row::(&bgr48[x * 3..], &mut rgba_out[x * 4..], width - x); } } } /// AVX-512 Bgr48 → native-depth u16 RGB. 32 pixels per iter. /// B↔R swap; values unchanged. +/// When `BE = true` each loaded register is byte-swapped before deinterleaving. /// /// # Safety /// @@ -577,7 +641,11 @@ pub(crate) unsafe fn avx512_bgr48_to_rgba_row(bgr48: &[u16], rgba_out: &mut [u8] /// 3. `rgb_out.len() >= width * 3`. #[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn avx512_bgr48_to_rgb_u16_row(bgr48: &[u16], rgb_out: &mut [u16], width: usize) { +pub(crate) unsafe fn avx512_bgr48_to_rgb_u16_row( + bgr48: &[u16], + rgb_out: &mut [u16], + width: usize, +) { debug_assert!(bgr48.len() >= width * 3, "bgr48 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); @@ -588,9 +656,9 @@ pub(crate) unsafe fn avx512_bgr48_to_rgb_u16_row(bgr48: &[u16], rgb_out: &mut [u macro_rules! 
process_half_bgr_u16 { ($ptr:expr, $out_off:expr) => {{ - let v0 = _mm_loadu_si128($ptr.cast()); - let v1 = _mm_loadu_si128($ptr.add(8).cast()); - let v2 = _mm_loadu_si128($ptr.add(16).cast()); + let v0 = byteswap128_if_be::(_mm_loadu_si128($ptr.cast())); + let v1 = byteswap128_if_be::(_mm_loadu_si128($ptr.add(8).cast())); + let v2 = byteswap128_if_be::(_mm_loadu_si128($ptr.add(16).cast())); let (b, g, r) = deinterleave_rgb48_8px(v0, v1, v2); write_rgb_u16_8(r, g, b, rgb_out.as_mut_ptr().add($out_off)); }}; @@ -604,13 +672,14 @@ pub(crate) unsafe fn avx512_bgr48_to_rgb_u16_row(bgr48: &[u16], rgb_out: &mut [u x += 32; } if x < width { - scalar::bgr48_to_rgb_u16_row(&bgr48[x * 3..], &mut rgb_out[x * 3..], width - x); + scalar::bgr48_to_rgb_u16_row::(&bgr48[x * 3..], &mut rgb_out[x * 3..], width - x); } } } /// AVX-512 Bgr48 → native-depth u16 RGBA. 32 pixels per iter. /// B↔R swap; alpha forced to 0xFFFF. +/// When `BE = true` each loaded register is byte-swapped before deinterleaving. /// /// # Safety /// @@ -619,7 +688,7 @@ pub(crate) unsafe fn avx512_bgr48_to_rgb_u16_row(bgr48: &[u16], rgb_out: &mut [u /// 3. `rgba_out.len() >= width * 4`. #[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn avx512_bgr48_to_rgba_u16_row( +pub(crate) unsafe fn avx512_bgr48_to_rgba_u16_row( bgr48: &[u16], rgba_out: &mut [u16], width: usize, @@ -635,9 +704,9 @@ pub(crate) unsafe fn avx512_bgr48_to_rgba_u16_row( macro_rules! 
process_half_bgr_rgba_u16 { ($ptr:expr, $out_off:expr) => {{ - let v0 = _mm_loadu_si128($ptr.cast()); - let v1 = _mm_loadu_si128($ptr.add(8).cast()); - let v2 = _mm_loadu_si128($ptr.add(16).cast()); + let v0 = byteswap128_if_be::(_mm_loadu_si128($ptr.cast())); + let v1 = byteswap128_if_be::(_mm_loadu_si128($ptr.add(8).cast())); + let v2 = byteswap128_if_be::(_mm_loadu_si128($ptr.add(16).cast())); let (b, g, r) = deinterleave_rgb48_8px(v0, v1, v2); write_rgba_u16_8(r, g, b, opaque, rgba_out.as_mut_ptr().add($out_off)); }}; @@ -651,7 +720,7 @@ pub(crate) unsafe fn avx512_bgr48_to_rgba_u16_row( x += 32; } if x < width { - scalar::bgr48_to_rgba_u16_row(&bgr48[x * 3..], &mut rgba_out[x * 4..], width - x); + scalar::bgr48_to_rgba_u16_row::(&bgr48[x * 3..], &mut rgba_out[x * 4..], width - x); } } } @@ -666,6 +735,7 @@ pub(crate) unsafe fn avx512_bgr48_to_rgba_u16_row( /// 32 pixels (96 bytes) via `write_rgb_16` on 128-bit quarters. /// /// Alpha discarded. +/// When `BE = true` each loaded register is byte-swapped before deinterleaving. /// /// # Safety /// @@ -674,7 +744,11 @@ pub(crate) unsafe fn avx512_bgr48_to_rgba_u16_row( /// 3. `rgb_out.len() >= width * 3`. 
#[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn avx512_rgba64_to_rgb_row(rgba64: &[u16], rgb_out: &mut [u8], width: usize) { +pub(crate) unsafe fn avx512_rgba64_to_rgb_row( + rgba64: &[u16], + rgb_out: &mut [u8], + width: usize, +) { debug_assert!(rgba64.len() >= width * 4, "rgba64 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); @@ -682,10 +756,10 @@ pub(crate) unsafe fn avx512_rgba64_to_rgb_row(rgba64: &[u16], rgb_out: &mut [u8] let mut x = 0usize; while x + 32 <= width { let ptr = rgba64.as_ptr().add(x * 4); - let raw0 = _mm512_loadu_si512(ptr.cast()); - let raw1 = _mm512_loadu_si512(ptr.add(32).cast()); - let raw2 = _mm512_loadu_si512(ptr.add(64).cast()); - let raw3 = _mm512_loadu_si512(ptr.add(96).cast()); + let raw0 = byteswap512_if_be::(_mm512_loadu_si512(ptr.cast())); + let raw1 = byteswap512_if_be::(_mm512_loadu_si512(ptr.add(32).cast())); + let raw2 = byteswap512_if_be::(_mm512_loadu_si512(ptr.add(64).cast())); + let raw3 = byteswap512_if_be::(_mm512_loadu_si512(ptr.add(96).cast())); let (r_u16, g_u16, b_u16, _a) = deinterleave_rgba64_32px(raw0, raw1, raw2, raw3); let r_u8 = narrow_u16x32_to_u8x32(r_u16); let g_u8 = narrow_u16x32_to_u8x32(g_u16); @@ -707,13 +781,14 @@ pub(crate) unsafe fn avx512_rgba64_to_rgb_row(rgba64: &[u16], rgb_out: &mut [u8] x += 32; } if x < width { - scalar::rgba64_to_rgb_row(&rgba64[x * 4..], &mut rgb_out[x * 3..], width - x); + scalar::rgba64_to_rgb_row::(&rgba64[x * 4..], &mut rgb_out[x * 3..], width - x); } } } /// AVX-512 Rgba64 → packed u8 RGBA. 32 pixels per SIMD iteration. /// Source alpha passes through (narrowed via `>> 8`). +/// When `BE = true` each loaded register is byte-swapped before deinterleaving. /// /// # Safety /// @@ -722,7 +797,11 @@ pub(crate) unsafe fn avx512_rgba64_to_rgb_row(rgba64: &[u16], rgb_out: &mut [u8] /// 3. `rgba_out.len() >= width * 4`. 
#[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn avx512_rgba64_to_rgba_row(rgba64: &[u16], rgba_out: &mut [u8], width: usize) { +pub(crate) unsafe fn avx512_rgba64_to_rgba_row( + rgba64: &[u16], + rgba_out: &mut [u8], + width: usize, +) { debug_assert!(rgba64.len() >= width * 4, "rgba64 row too short"); debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short"); @@ -730,10 +809,10 @@ pub(crate) unsafe fn avx512_rgba64_to_rgba_row(rgba64: &[u16], rgba_out: &mut [u let mut x = 0usize; while x + 32 <= width { let ptr = rgba64.as_ptr().add(x * 4); - let raw0 = _mm512_loadu_si512(ptr.cast()); - let raw1 = _mm512_loadu_si512(ptr.add(32).cast()); - let raw2 = _mm512_loadu_si512(ptr.add(64).cast()); - let raw3 = _mm512_loadu_si512(ptr.add(96).cast()); + let raw0 = byteswap512_if_be::(_mm512_loadu_si512(ptr.cast())); + let raw1 = byteswap512_if_be::(_mm512_loadu_si512(ptr.add(32).cast())); + let raw2 = byteswap512_if_be::(_mm512_loadu_si512(ptr.add(64).cast())); + let raw3 = byteswap512_if_be::(_mm512_loadu_si512(ptr.add(96).cast())); let (r_u16, g_u16, b_u16, a_u16) = deinterleave_rgba64_32px(raw0, raw1, raw2, raw3); let r_u8 = narrow_u16x32_to_u8x32(r_u16); let g_u8 = narrow_u16x32_to_u8x32(g_u16); @@ -757,13 +836,14 @@ pub(crate) unsafe fn avx512_rgba64_to_rgba_row(rgba64: &[u16], rgba_out: &mut [u x += 32; } if x < width { - scalar::rgba64_to_rgba_row(&rgba64[x * 4..], &mut rgba_out[x * 4..], width - x); + scalar::rgba64_to_rgba_row::(&rgba64[x * 4..], &mut rgba_out[x * 4..], width - x); } } } /// AVX-512 Rgba64 → native-depth u16 RGB. 32 pixels per SIMD iteration. /// Alpha discarded. +/// When `BE = true` each loaded register is byte-swapped before deinterleaving. /// /// # Safety /// @@ -772,7 +852,7 @@ pub(crate) unsafe fn avx512_rgba64_to_rgba_row(rgba64: &[u16], rgba_out: &mut [u /// 3. `rgb_out.len() >= width * 3`. 
#[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn avx512_rgba64_to_rgb_u16_row( +pub(crate) unsafe fn avx512_rgba64_to_rgb_u16_row( rgba64: &[u16], rgb_out: &mut [u16], width: usize, @@ -784,23 +864,24 @@ pub(crate) unsafe fn avx512_rgba64_to_rgb_u16_row( let mut x = 0usize; while x + 32 <= width { let ptr = rgba64.as_ptr().add(x * 4); - let raw0 = _mm512_loadu_si512(ptr.cast()); - let raw1 = _mm512_loadu_si512(ptr.add(32).cast()); - let raw2 = _mm512_loadu_si512(ptr.add(64).cast()); - let raw3 = _mm512_loadu_si512(ptr.add(96).cast()); + let raw0 = byteswap512_if_be::(_mm512_loadu_si512(ptr.cast())); + let raw1 = byteswap512_if_be::(_mm512_loadu_si512(ptr.add(32).cast())); + let raw2 = byteswap512_if_be::(_mm512_loadu_si512(ptr.add(64).cast())); + let raw3 = byteswap512_if_be::(_mm512_loadu_si512(ptr.add(96).cast())); let (r_u16, g_u16, b_u16, _a) = deinterleave_rgba64_32px(raw0, raw1, raw2, raw3); // Use the shared write_rgb_u16_32 helper (writes 32 px = 4 × 8-px chunks). write_rgb_u16_32(r_u16, g_u16, b_u16, rgb_out.as_mut_ptr().add(x * 3)); x += 32; } if x < width { - scalar::rgba64_to_rgb_u16_row(&rgba64[x * 4..], &mut rgb_out[x * 3..], width - x); + scalar::rgba64_to_rgb_u16_row::(&rgba64[x * 4..], &mut rgb_out[x * 3..], width - x); } } } /// AVX-512 Rgba64 → native-depth u16 RGBA (identity copy). 32 pixels per iter. /// Source alpha preserved. +/// When `BE = true` each loaded register is byte-swapped before deinterleaving. /// /// # Safety /// @@ -809,7 +890,7 @@ pub(crate) unsafe fn avx512_rgba64_to_rgb_u16_row( /// 3. `rgba_out.len() >= width * 4`. 
#[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn avx512_rgba64_to_rgba_u16_row( +pub(crate) unsafe fn avx512_rgba64_to_rgba_u16_row( rgba64: &[u16], rgba_out: &mut [u16], width: usize, @@ -821,10 +902,10 @@ pub(crate) unsafe fn avx512_rgba64_to_rgba_u16_row( let mut x = 0usize; while x + 32 <= width { let ptr = rgba64.as_ptr().add(x * 4); - let raw0 = _mm512_loadu_si512(ptr.cast()); - let raw1 = _mm512_loadu_si512(ptr.add(32).cast()); - let raw2 = _mm512_loadu_si512(ptr.add(64).cast()); - let raw3 = _mm512_loadu_si512(ptr.add(96).cast()); + let raw0 = byteswap512_if_be::(_mm512_loadu_si512(ptr.cast())); + let raw1 = byteswap512_if_be::(_mm512_loadu_si512(ptr.add(32).cast())); + let raw2 = byteswap512_if_be::(_mm512_loadu_si512(ptr.add(64).cast())); + let raw3 = byteswap512_if_be::(_mm512_loadu_si512(ptr.add(96).cast())); let (r_u16, g_u16, b_u16, a_u16) = deinterleave_rgba64_32px(raw0, raw1, raw2, raw3); let opaque = _mm_set1_epi16(-1i16); // 0xFFFF placeholder — not used; a_u16 has real alpha let out_ptr = rgba_out.as_mut_ptr().add(x * 4); @@ -862,7 +943,7 @@ pub(crate) unsafe fn avx512_rgba64_to_rgba_u16_row( x += 32; } if x < width { - scalar::rgba64_to_rgba_u16_row(&rgba64[x * 4..], &mut rgba_out[x * 4..], width - x); + scalar::rgba64_to_rgba_u16_row::(&rgba64[x * 4..], &mut rgba_out[x * 4..], width - x); } } } @@ -875,6 +956,7 @@ pub(crate) unsafe fn avx512_rgba64_to_rgba_u16_row( /// B↔R swap; alpha discarded. /// /// `deinterleave_rgba64_32px` yields `(B, G, R, A)` in source memory order. +/// When `BE = true` each loaded register is byte-swapped before deinterleaving. /// /// # Safety /// @@ -883,7 +965,11 @@ pub(crate) unsafe fn avx512_rgba64_to_rgba_u16_row( /// 3. `rgb_out.len() >= width * 3`. 
#[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn avx512_bgra64_to_rgb_row(bgra64: &[u16], rgb_out: &mut [u8], width: usize) { +pub(crate) unsafe fn avx512_bgra64_to_rgb_row( + bgra64: &[u16], + rgb_out: &mut [u8], + width: usize, +) { debug_assert!(bgra64.len() >= width * 4, "bgra64 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); @@ -891,10 +977,10 @@ pub(crate) unsafe fn avx512_bgra64_to_rgb_row(bgra64: &[u16], rgb_out: &mut [u8] let mut x = 0usize; while x + 32 <= width { let ptr = bgra64.as_ptr().add(x * 4); - let raw0 = _mm512_loadu_si512(ptr.cast()); - let raw1 = _mm512_loadu_si512(ptr.add(32).cast()); - let raw2 = _mm512_loadu_si512(ptr.add(64).cast()); - let raw3 = _mm512_loadu_si512(ptr.add(96).cast()); + let raw0 = byteswap512_if_be::(_mm512_loadu_si512(ptr.cast())); + let raw1 = byteswap512_if_be::(_mm512_loadu_si512(ptr.add(32).cast())); + let raw2 = byteswap512_if_be::(_mm512_loadu_si512(ptr.add(64).cast())); + let raw3 = byteswap512_if_be::(_mm512_loadu_si512(ptr.add(96).cast())); // ch0=B, ch1=G, ch2=R, ch3=A (source BGRA order) let (b_u16, g_u16, r_u16, _a) = deinterleave_rgba64_32px(raw0, raw1, raw2, raw3); let r_u8 = narrow_u16x32_to_u8x32(r_u16); @@ -916,13 +1002,14 @@ pub(crate) unsafe fn avx512_bgra64_to_rgb_row(bgra64: &[u16], rgb_out: &mut [u8] x += 32; } if x < width { - scalar::bgra64_to_rgb_row(&bgra64[x * 4..], &mut rgb_out[x * 3..], width - x); + scalar::bgra64_to_rgb_row::(&bgra64[x * 4..], &mut rgb_out[x * 3..], width - x); } } } /// AVX-512 Bgra64 → packed u8 RGBA. 32 pixels per SIMD iteration. /// B↔R swap; source alpha passes through (narrowed via `>> 8`). +/// When `BE = true` each loaded register is byte-swapped before deinterleaving. /// /// # Safety /// @@ -931,7 +1018,11 @@ pub(crate) unsafe fn avx512_bgra64_to_rgb_row(bgra64: &[u16], rgb_out: &mut [u8] /// 3. `rgba_out.len() >= width * 4`. 
#[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn avx512_bgra64_to_rgba_row(bgra64: &[u16], rgba_out: &mut [u8], width: usize) { +pub(crate) unsafe fn avx512_bgra64_to_rgba_row( + bgra64: &[u16], + rgba_out: &mut [u8], + width: usize, +) { debug_assert!(bgra64.len() >= width * 4, "bgra64 row too short"); debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short"); @@ -939,10 +1030,10 @@ pub(crate) unsafe fn avx512_bgra64_to_rgba_row(bgra64: &[u16], rgba_out: &mut [u let mut x = 0usize; while x + 32 <= width { let ptr = bgra64.as_ptr().add(x * 4); - let raw0 = _mm512_loadu_si512(ptr.cast()); - let raw1 = _mm512_loadu_si512(ptr.add(32).cast()); - let raw2 = _mm512_loadu_si512(ptr.add(64).cast()); - let raw3 = _mm512_loadu_si512(ptr.add(96).cast()); + let raw0 = byteswap512_if_be::(_mm512_loadu_si512(ptr.cast())); + let raw1 = byteswap512_if_be::(_mm512_loadu_si512(ptr.add(32).cast())); + let raw2 = byteswap512_if_be::(_mm512_loadu_si512(ptr.add(64).cast())); + let raw3 = byteswap512_if_be::(_mm512_loadu_si512(ptr.add(96).cast())); let (b_u16, g_u16, r_u16, a_u16) = deinterleave_rgba64_32px(raw0, raw1, raw2, raw3); let r_u8 = narrow_u16x32_to_u8x32(r_u16); let g_u8 = narrow_u16x32_to_u8x32(g_u16); @@ -966,13 +1057,14 @@ pub(crate) unsafe fn avx512_bgra64_to_rgba_row(bgra64: &[u16], rgba_out: &mut [u x += 32; } if x < width { - scalar::bgra64_to_rgba_row(&bgra64[x * 4..], &mut rgba_out[x * 4..], width - x); + scalar::bgra64_to_rgba_row::(&bgra64[x * 4..], &mut rgba_out[x * 4..], width - x); } } } /// AVX-512 Bgra64 → native-depth u16 RGB. 32 pixels per SIMD iteration. /// B↔R swap; alpha discarded. +/// When `BE = true` each loaded register is byte-swapped before deinterleaving. /// /// # Safety /// @@ -981,7 +1073,7 @@ pub(crate) unsafe fn avx512_bgra64_to_rgba_row(bgra64: &[u16], rgba_out: &mut [u /// 3. `rgb_out.len() >= width * 3`. 
#[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn avx512_bgra64_to_rgb_u16_row( +pub(crate) unsafe fn avx512_bgra64_to_rgb_u16_row( bgra64: &[u16], rgb_out: &mut [u16], width: usize, @@ -993,23 +1085,24 @@ pub(crate) unsafe fn avx512_bgra64_to_rgb_u16_row( let mut x = 0usize; while x + 32 <= width { let ptr = bgra64.as_ptr().add(x * 4); - let raw0 = _mm512_loadu_si512(ptr.cast()); - let raw1 = _mm512_loadu_si512(ptr.add(32).cast()); - let raw2 = _mm512_loadu_si512(ptr.add(64).cast()); - let raw3 = _mm512_loadu_si512(ptr.add(96).cast()); + let raw0 = byteswap512_if_be::(_mm512_loadu_si512(ptr.cast())); + let raw1 = byteswap512_if_be::(_mm512_loadu_si512(ptr.add(32).cast())); + let raw2 = byteswap512_if_be::(_mm512_loadu_si512(ptr.add(64).cast())); + let raw3 = byteswap512_if_be::(_mm512_loadu_si512(ptr.add(96).cast())); // Swap B↔R: store (R=ch2, G=ch1, B=ch0) let (b_u16, g_u16, r_u16, _a) = deinterleave_rgba64_32px(raw0, raw1, raw2, raw3); write_rgb_u16_32(r_u16, g_u16, b_u16, rgb_out.as_mut_ptr().add(x * 3)); x += 32; } if x < width { - scalar::bgra64_to_rgb_u16_row(&bgra64[x * 4..], &mut rgb_out[x * 3..], width - x); + scalar::bgra64_to_rgb_u16_row::(&bgra64[x * 4..], &mut rgb_out[x * 3..], width - x); } } } /// AVX-512 Bgra64 → native-depth u16 RGBA. 32 pixels per SIMD iteration. /// B↔R swap; source alpha preserved at position 3. +/// When `BE = true` each loaded register is byte-swapped before deinterleaving. /// /// # Safety /// @@ -1018,7 +1111,7 @@ pub(crate) unsafe fn avx512_bgra64_to_rgb_u16_row( /// 3. `rgba_out.len() >= width * 4`. 
#[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn avx512_bgra64_to_rgba_u16_row( +pub(crate) unsafe fn avx512_bgra64_to_rgba_u16_row( bgra64: &[u16], rgba_out: &mut [u16], width: usize, @@ -1030,10 +1123,10 @@ pub(crate) unsafe fn avx512_bgra64_to_rgba_u16_row( let mut x = 0usize; while x + 32 <= width { let ptr = bgra64.as_ptr().add(x * 4); - let raw0 = _mm512_loadu_si512(ptr.cast()); - let raw1 = _mm512_loadu_si512(ptr.add(32).cast()); - let raw2 = _mm512_loadu_si512(ptr.add(64).cast()); - let raw3 = _mm512_loadu_si512(ptr.add(96).cast()); + let raw0 = byteswap512_if_be::(_mm512_loadu_si512(ptr.cast())); + let raw1 = byteswap512_if_be::(_mm512_loadu_si512(ptr.add(32).cast())); + let raw2 = byteswap512_if_be::(_mm512_loadu_si512(ptr.add(64).cast())); + let raw3 = byteswap512_if_be::(_mm512_loadu_si512(ptr.add(96).cast())); // Swap B↔R: (R=ch2, G=ch1, B=ch0, A=ch3) let (b_u16, g_u16, r_u16, a_u16) = deinterleave_rgba64_32px(raw0, raw1, raw2, raw3); let out_ptr = rgba_out.as_mut_ptr().add(x * 4); @@ -1068,7 +1161,7 @@ pub(crate) unsafe fn avx512_bgra64_to_rgba_u16_row( x += 32; } if x < width { - scalar::bgra64_to_rgba_u16_row(&bgra64[x * 4..], &mut rgba_out[x * 4..], width - x); + scalar::bgra64_to_rgba_u16_row::(&bgra64[x * 4..], &mut rgba_out[x * 4..], width - x); } } } diff --git a/src/row/arch/x86_avx512/tests/packed_rgb.rs b/src/row/arch/x86_avx512/tests/packed_rgb.rs index 1cb18dbb..4fb00aa2 100644 --- a/src/row/arch/x86_avx512/tests/packed_rgb.rs +++ b/src/row/arch/x86_avx512/tests/packed_rgb.rs @@ -243,9 +243,9 @@ fn avx512_x2rgb10_to_rgb_matches_scalar() { let input = pseudo_random_rgba(w); let mut out_scalar = std::vec![0u8; w * 3]; let mut out_avx = std::vec![0u8; w * 3]; - scalar::x2rgb10_to_rgb_row(&input, &mut out_scalar, w); + scalar::x2rgb10_to_rgb_row::(&input, &mut out_scalar, w); unsafe { - x2rgb10_to_rgb_row(&input, &mut out_avx, w); + x2rgb10_to_rgb_row::(&input, &mut out_avx, w); } assert_eq!( out_scalar, out_avx, 
@@ -263,9 +263,9 @@ fn avx512_x2rgb10_to_rgba_matches_scalar() { let input = pseudo_random_rgba(w); let mut out_scalar = std::vec![0u8; w * 4]; let mut out_avx = std::vec![0u8; w * 4]; - scalar::x2rgb10_to_rgba_row(&input, &mut out_scalar, w); + scalar::x2rgb10_to_rgba_row::(&input, &mut out_scalar, w); unsafe { - x2rgb10_to_rgba_row(&input, &mut out_avx, w); + x2rgb10_to_rgba_row::(&input, &mut out_avx, w); } assert_eq!( out_scalar, out_avx, @@ -283,9 +283,9 @@ fn avx512_x2rgb10_to_rgb_u16_matches_scalar() { let input = pseudo_random_rgba(w); let mut out_scalar = std::vec![0u16; w * 3]; let mut out_avx = std::vec![0u16; w * 3]; - scalar::x2rgb10_to_rgb_u16_row(&input, &mut out_scalar, w); + scalar::x2rgb10_to_rgb_u16_row::(&input, &mut out_scalar, w); unsafe { - x2rgb10_to_rgb_u16_row(&input, &mut out_avx, w); + x2rgb10_to_rgb_u16_row::(&input, &mut out_avx, w); } assert_eq!( out_scalar, out_avx, @@ -303,9 +303,9 @@ fn avx512_x2bgr10_to_rgb_matches_scalar() { let input = pseudo_random_rgba(w); let mut out_scalar = std::vec![0u8; w * 3]; let mut out_avx = std::vec![0u8; w * 3]; - scalar::x2bgr10_to_rgb_row(&input, &mut out_scalar, w); + scalar::x2bgr10_to_rgb_row::(&input, &mut out_scalar, w); unsafe { - x2bgr10_to_rgb_row(&input, &mut out_avx, w); + x2bgr10_to_rgb_row::(&input, &mut out_avx, w); } assert_eq!( out_scalar, out_avx, @@ -323,9 +323,9 @@ fn avx512_x2bgr10_to_rgba_matches_scalar() { let input = pseudo_random_rgba(w); let mut out_scalar = std::vec![0u8; w * 4]; let mut out_avx = std::vec![0u8; w * 4]; - scalar::x2bgr10_to_rgba_row(&input, &mut out_scalar, w); + scalar::x2bgr10_to_rgba_row::(&input, &mut out_scalar, w); unsafe { - x2bgr10_to_rgba_row(&input, &mut out_avx, w); + x2bgr10_to_rgba_row::(&input, &mut out_avx, w); } assert_eq!( out_scalar, out_avx, @@ -343,9 +343,9 @@ fn avx512_x2bgr10_to_rgb_u16_matches_scalar() { let input = pseudo_random_rgba(w); let mut out_scalar = std::vec![0u16; w * 3]; let mut out_avx = std::vec![0u16; w * 3]; - 
scalar::x2bgr10_to_rgb_u16_row(&input, &mut out_scalar, w); + scalar::x2bgr10_to_rgb_u16_row::(&input, &mut out_scalar, w); unsafe { - x2bgr10_to_rgb_u16_row(&input, &mut out_avx, w); + x2bgr10_to_rgb_u16_row::(&input, &mut out_avx, w); } assert_eq!( out_scalar, out_avx, diff --git a/src/row/arch/x86_avx512/tests/packed_rgb_16bit.rs b/src/row/arch/x86_avx512/tests/packed_rgb_16bit.rs index fe4c2536..4ae0709b 100644 --- a/src/row/arch/x86_avx512/tests/packed_rgb_16bit.rs +++ b/src/row/arch/x86_avx512/tests/packed_rgb_16bit.rs @@ -65,8 +65,8 @@ fn avx512_rgb48_to_rgb_matches_scalar_width33() { let src = make_rgb48_src(33, 0x0101); let mut simd_out = std::vec![0u8; 33 * 3]; let mut scalar_out = std::vec![0u8; 33 * 3]; - unsafe { avx512_rgb48_to_rgb_row(&src, &mut simd_out, 33) }; - scalar::rgb48_to_rgb_row(&src, &mut scalar_out, 33); + unsafe { avx512_rgb48_to_rgb_row::(&src, &mut simd_out, 33) }; + scalar::rgb48_to_rgb_row::(&src, &mut scalar_out, 33); assert_eq!( simd_out, scalar_out, "rgb48→rgb width=33: SIMD vs scalar mismatch" @@ -82,8 +82,8 @@ fn avx512_rgb48_to_rgb_exact32_matches_scalar() { let src = make_rgb48_src(32, 0xF0F0); let mut simd_out = std::vec![0u8; 32 * 3]; let mut scalar_out = std::vec![0u8; 32 * 3]; - unsafe { avx512_rgb48_to_rgb_row(&src, &mut simd_out, 32) }; - scalar::rgb48_to_rgb_row(&src, &mut scalar_out, 32); + unsafe { avx512_rgb48_to_rgb_row::(&src, &mut simd_out, 32) }; + scalar::rgb48_to_rgb_row::(&src, &mut scalar_out, 32); assert_eq!( simd_out, scalar_out, "rgb48→rgb exact-32: SIMD vs scalar mismatch" @@ -99,8 +99,8 @@ fn avx512_rgb48_to_rgb_width1_tail_only() { let src = [0x1234u16, 0x5678, 0x9ABC]; let mut simd_out = [0u8; 3]; let mut scalar_out = [0u8; 3]; - unsafe { avx512_rgb48_to_rgb_row(&src, &mut simd_out, 1) }; - scalar::rgb48_to_rgb_row(&src, &mut scalar_out, 1); + unsafe { avx512_rgb48_to_rgb_row::(&src, &mut simd_out, 1) }; + scalar::rgb48_to_rgb_row::(&src, &mut scalar_out, 1); assert_eq!( simd_out, scalar_out, 
"rgb48→rgb width=1: tail-only mismatch" @@ -117,8 +117,8 @@ fn avx512_rgb48_to_rgb_lane_order_regression() { let src = make_rgb48_asymmetric(33); let mut simd_out = std::vec![0u8; 33 * 3]; let mut scalar_out = std::vec![0u8; 33 * 3]; - unsafe { avx512_rgb48_to_rgb_row(&src, &mut simd_out, 33) }; - scalar::rgb48_to_rgb_row(&src, &mut scalar_out, 33); + unsafe { avx512_rgb48_to_rgb_row::(&src, &mut simd_out, 33) }; + scalar::rgb48_to_rgb_row::(&src, &mut scalar_out, 33); assert_eq!( simd_out, scalar_out, "rgb48→rgb lane order: SIMD vs scalar mismatch (channel swap?)" @@ -138,8 +138,8 @@ fn avx512_rgb48_to_rgba_matches_scalar_width33() { let src = make_rgb48_src(33, 0x0303); let mut simd_out = std::vec![0u8; 33 * 4]; let mut scalar_out = std::vec![0u8; 33 * 4]; - unsafe { avx512_rgb48_to_rgba_row(&src, &mut simd_out, 33) }; - scalar::rgb48_to_rgba_row(&src, &mut scalar_out, 33); + unsafe { avx512_rgb48_to_rgba_row::(&src, &mut simd_out, 33) }; + scalar::rgb48_to_rgba_row::(&src, &mut scalar_out, 33); assert_eq!( simd_out, scalar_out, "rgb48→rgba width=33: SIMD vs scalar mismatch" @@ -159,8 +159,8 @@ fn avx512_rgb48_to_rgb_u16_matches_scalar_width33() { let src = make_rgb48_src(33, 0x0505); let mut simd_out = std::vec![0u16; 33 * 3]; let mut scalar_out = std::vec![0u16; 33 * 3]; - unsafe { avx512_rgb48_to_rgb_u16_row(&src, &mut simd_out, 33) }; - scalar::rgb48_to_rgb_u16_row(&src, &mut scalar_out, 33); + unsafe { avx512_rgb48_to_rgb_u16_row::(&src, &mut simd_out, 33) }; + scalar::rgb48_to_rgb_u16_row::(&src, &mut scalar_out, 33); assert_eq!( simd_out, scalar_out, "rgb48→rgb_u16 width=33: SIMD vs scalar mismatch" @@ -177,8 +177,8 @@ fn avx512_rgb48_to_rgb_u16_lane_order_regression() { let src = make_rgb48_asymmetric(33); let mut simd_out = std::vec![0u16; 33 * 3]; let mut scalar_out = std::vec![0u16; 33 * 3]; - unsafe { avx512_rgb48_to_rgb_u16_row(&src, &mut simd_out, 33) }; - scalar::rgb48_to_rgb_u16_row(&src, &mut scalar_out, 33); + unsafe { 
avx512_rgb48_to_rgb_u16_row::(&src, &mut simd_out, 33) }; + scalar::rgb48_to_rgb_u16_row::(&src, &mut scalar_out, 33); assert_eq!( simd_out, scalar_out, "rgb48→rgb_u16 lane order: SIMD vs scalar mismatch (channel swap?)" @@ -198,8 +198,8 @@ fn avx512_rgb48_to_rgba_u16_matches_scalar_width33() { let src = make_rgb48_src(33, 0x0707); let mut simd_out = std::vec![0u16; 33 * 4]; let mut scalar_out = std::vec![0u16; 33 * 4]; - unsafe { avx512_rgb48_to_rgba_u16_row(&src, &mut simd_out, 33) }; - scalar::rgb48_to_rgba_u16_row(&src, &mut scalar_out, 33); + unsafe { avx512_rgb48_to_rgba_u16_row::(&src, &mut simd_out, 33) }; + scalar::rgb48_to_rgba_u16_row::(&src, &mut scalar_out, 33); assert_eq!( simd_out, scalar_out, "rgb48→rgba_u16 width=33: SIMD vs scalar mismatch" @@ -219,8 +219,8 @@ fn avx512_bgr48_to_rgb_matches_scalar_width33() { let src = make_rgb48_src(33, 0x1111); let mut simd_out = std::vec![0u8; 33 * 3]; let mut scalar_out = std::vec![0u8; 33 * 3]; - unsafe { avx512_bgr48_to_rgb_row(&src, &mut simd_out, 33) }; - scalar::bgr48_to_rgb_row(&src, &mut scalar_out, 33); + unsafe { avx512_bgr48_to_rgb_row::(&src, &mut simd_out, 33) }; + scalar::bgr48_to_rgb_row::(&src, &mut scalar_out, 33); assert_eq!( simd_out, scalar_out, "bgr48→rgb width=33: SIMD vs scalar mismatch" @@ -236,8 +236,8 @@ fn avx512_bgr48_to_rgb_exact32_matches_scalar() { let src = make_rgb48_src(32, 0xA1A1); let mut simd_out = std::vec![0u8; 32 * 3]; let mut scalar_out = std::vec![0u8; 32 * 3]; - unsafe { avx512_bgr48_to_rgb_row(&src, &mut simd_out, 32) }; - scalar::bgr48_to_rgb_row(&src, &mut scalar_out, 32); + unsafe { avx512_bgr48_to_rgb_row::(&src, &mut simd_out, 32) }; + scalar::bgr48_to_rgb_row::(&src, &mut scalar_out, 32); assert_eq!( simd_out, scalar_out, "bgr48→rgb exact-32: SIMD vs scalar mismatch" @@ -254,8 +254,8 @@ fn avx512_bgr48_to_rgb_lane_order_regression() { let src = make_rgb48_asymmetric(33); let mut simd_out = std::vec![0u8; 33 * 3]; let mut scalar_out = std::vec![0u8; 33 * 3]; - 
unsafe { avx512_bgr48_to_rgb_row(&src, &mut simd_out, 33) }; - scalar::bgr48_to_rgb_row(&src, &mut scalar_out, 33); + unsafe { avx512_bgr48_to_rgb_row::(&src, &mut simd_out, 33) }; + scalar::bgr48_to_rgb_row::(&src, &mut scalar_out, 33); assert_eq!( simd_out, scalar_out, "bgr48→rgb lane order (B↔R swap): SIMD vs scalar mismatch" @@ -275,8 +275,8 @@ fn avx512_bgr48_to_rgba_matches_scalar_width33() { let src = make_rgb48_src(33, 0x2222); let mut simd_out = std::vec![0u8; 33 * 4]; let mut scalar_out = std::vec![0u8; 33 * 4]; - unsafe { avx512_bgr48_to_rgba_row(&src, &mut simd_out, 33) }; - scalar::bgr48_to_rgba_row(&src, &mut scalar_out, 33); + unsafe { avx512_bgr48_to_rgba_row::(&src, &mut simd_out, 33) }; + scalar::bgr48_to_rgba_row::(&src, &mut scalar_out, 33); assert_eq!( simd_out, scalar_out, "bgr48→rgba width=33: SIMD vs scalar mismatch" @@ -296,8 +296,8 @@ fn avx512_bgr48_to_rgb_u16_matches_scalar_width33() { let src = make_rgb48_src(33, 0x3333); let mut simd_out = std::vec![0u16; 33 * 3]; let mut scalar_out = std::vec![0u16; 33 * 3]; - unsafe { avx512_bgr48_to_rgb_u16_row(&src, &mut simd_out, 33) }; - scalar::bgr48_to_rgb_u16_row(&src, &mut scalar_out, 33); + unsafe { avx512_bgr48_to_rgb_u16_row::(&src, &mut simd_out, 33) }; + scalar::bgr48_to_rgb_u16_row::(&src, &mut scalar_out, 33); assert_eq!( simd_out, scalar_out, "bgr48→rgb_u16 width=33: SIMD vs scalar mismatch" @@ -317,8 +317,8 @@ fn avx512_bgr48_to_rgba_u16_matches_scalar_width33() { let src = make_rgb48_src(33, 0x4444); let mut simd_out = std::vec![0u16; 33 * 4]; let mut scalar_out = std::vec![0u16; 33 * 4]; - unsafe { avx512_bgr48_to_rgba_u16_row(&src, &mut simd_out, 33) }; - scalar::bgr48_to_rgba_u16_row(&src, &mut scalar_out, 33); + unsafe { avx512_bgr48_to_rgba_u16_row::(&src, &mut simd_out, 33) }; + scalar::bgr48_to_rgba_u16_row::(&src, &mut scalar_out, 33); assert_eq!( simd_out, scalar_out, "bgr48→rgba_u16 width=33: SIMD vs scalar mismatch" @@ -338,8 +338,8 @@ fn 
avx512_rgba64_to_rgb_matches_scalar_width33() { let src = make_rgba64_src(33, 0xAAAA); let mut simd_out = std::vec![0u8; 33 * 3]; let mut scalar_out = std::vec![0u8; 33 * 3]; - unsafe { avx512_rgba64_to_rgb_row(&src, &mut simd_out, 33) }; - scalar::rgba64_to_rgb_row(&src, &mut scalar_out, 33); + unsafe { avx512_rgba64_to_rgb_row::(&src, &mut simd_out, 33) }; + scalar::rgba64_to_rgb_row::(&src, &mut scalar_out, 33); assert_eq!( simd_out, scalar_out, "rgba64→rgb width=33: SIMD vs scalar mismatch" @@ -355,8 +355,8 @@ fn avx512_rgba64_to_rgb_exact32_matches_scalar() { let src = make_rgba64_src(32, 0x0F0F); let mut simd_out = std::vec![0u8; 32 * 3]; let mut scalar_out = std::vec![0u8; 32 * 3]; - unsafe { avx512_rgba64_to_rgb_row(&src, &mut simd_out, 32) }; - scalar::rgba64_to_rgb_row(&src, &mut scalar_out, 32); + unsafe { avx512_rgba64_to_rgb_row::(&src, &mut simd_out, 32) }; + scalar::rgba64_to_rgb_row::(&src, &mut scalar_out, 32); assert_eq!( simd_out, scalar_out, "rgba64→rgb exact-32: SIMD vs scalar mismatch" @@ -373,8 +373,8 @@ fn avx512_rgba64_to_rgb_lane_order_regression() { let src = make_rgba64_asymmetric(33); let mut simd_out = std::vec![0u8; 33 * 3]; let mut scalar_out = std::vec![0u8; 33 * 3]; - unsafe { avx512_rgba64_to_rgb_row(&src, &mut simd_out, 33) }; - scalar::rgba64_to_rgb_row(&src, &mut scalar_out, 33); + unsafe { avx512_rgba64_to_rgb_row::(&src, &mut simd_out, 33) }; + scalar::rgba64_to_rgb_row::(&src, &mut scalar_out, 33); assert_eq!( simd_out, scalar_out, "rgba64→rgb lane order: SIMD vs scalar mismatch" @@ -394,8 +394,8 @@ fn avx512_rgba64_to_rgba_matches_scalar_width33() { let src = make_rgba64_src(33, 0xBBBB); let mut simd_out = std::vec![0u8; 33 * 4]; let mut scalar_out = std::vec![0u8; 33 * 4]; - unsafe { avx512_rgba64_to_rgba_row(&src, &mut simd_out, 33) }; - scalar::rgba64_to_rgba_row(&src, &mut scalar_out, 33); + unsafe { avx512_rgba64_to_rgba_row::(&src, &mut simd_out, 33) }; + scalar::rgba64_to_rgba_row::(&src, &mut scalar_out, 33); 
assert_eq!( simd_out, scalar_out, "rgba64→rgba width=33: SIMD vs scalar mismatch" @@ -412,8 +412,8 @@ fn avx512_rgba64_to_rgba_lane_order_regression() { let src = make_rgba64_asymmetric(33); let mut simd_out = std::vec![0u8; 33 * 4]; let mut scalar_out = std::vec![0u8; 33 * 4]; - unsafe { avx512_rgba64_to_rgba_row(&src, &mut simd_out, 33) }; - scalar::rgba64_to_rgba_row(&src, &mut scalar_out, 33); + unsafe { avx512_rgba64_to_rgba_row::(&src, &mut simd_out, 33) }; + scalar::rgba64_to_rgba_row::(&src, &mut scalar_out, 33); assert_eq!( simd_out, scalar_out, "rgba64→rgba lane order (alpha passthrough): SIMD vs scalar mismatch" @@ -433,8 +433,8 @@ fn avx512_rgba64_to_rgb_u16_matches_scalar_width33() { let src = make_rgba64_src(33, 0xCCCC); let mut simd_out = std::vec![0u16; 33 * 3]; let mut scalar_out = std::vec![0u16; 33 * 3]; - unsafe { avx512_rgba64_to_rgb_u16_row(&src, &mut simd_out, 33) }; - scalar::rgba64_to_rgb_u16_row(&src, &mut scalar_out, 33); + unsafe { avx512_rgba64_to_rgb_u16_row::(&src, &mut simd_out, 33) }; + scalar::rgba64_to_rgb_u16_row::(&src, &mut scalar_out, 33); assert_eq!( simd_out, scalar_out, "rgba64→rgb_u16 width=33: SIMD vs scalar mismatch" @@ -451,8 +451,8 @@ fn avx512_rgba64_to_rgb_u16_lane_order_regression() { let src = make_rgba64_asymmetric(33); let mut simd_out = std::vec![0u16; 33 * 3]; let mut scalar_out = std::vec![0u16; 33 * 3]; - unsafe { avx512_rgba64_to_rgb_u16_row(&src, &mut simd_out, 33) }; - scalar::rgba64_to_rgb_u16_row(&src, &mut scalar_out, 33); + unsafe { avx512_rgba64_to_rgb_u16_row::(&src, &mut simd_out, 33) }; + scalar::rgba64_to_rgb_u16_row::(&src, &mut scalar_out, 33); assert_eq!( simd_out, scalar_out, "rgba64→rgb_u16 lane order: SIMD vs scalar mismatch" @@ -472,8 +472,8 @@ fn avx512_rgba64_to_rgba_u16_matches_scalar_width33() { let src = make_rgba64_src(33, 0xDDDD); let mut simd_out = std::vec![0u16; 33 * 4]; let mut scalar_out = std::vec![0u16; 33 * 4]; - unsafe { avx512_rgba64_to_rgba_u16_row(&src, &mut simd_out, 33) 
}; - scalar::rgba64_to_rgba_u16_row(&src, &mut scalar_out, 33); + unsafe { avx512_rgba64_to_rgba_u16_row::(&src, &mut simd_out, 33) }; + scalar::rgba64_to_rgba_u16_row::(&src, &mut scalar_out, 33); assert_eq!( simd_out, scalar_out, "rgba64→rgba_u16 width=33: SIMD vs scalar mismatch" @@ -489,8 +489,8 @@ fn avx512_rgba64_to_rgba_u16_width1_tail_only() { let src = [0x1234u16, 0x5678, 0x9ABC, 0xDEF0]; // R, G, B, A let mut simd_out = [0u16; 4]; let mut scalar_out = [0u16; 4]; - unsafe { avx512_rgba64_to_rgba_u16_row(&src, &mut simd_out, 1) }; - scalar::rgba64_to_rgba_u16_row(&src, &mut scalar_out, 1); + unsafe { avx512_rgba64_to_rgba_u16_row::(&src, &mut simd_out, 1) }; + scalar::rgba64_to_rgba_u16_row::(&src, &mut scalar_out, 1); assert_eq!( simd_out, scalar_out, "rgba64→rgba_u16 width=1: tail-only mismatch" @@ -507,8 +507,8 @@ fn avx512_rgba64_to_rgba_u16_lane_order_regression() { let src = make_rgba64_asymmetric(33); let mut simd_out = std::vec![0u16; 33 * 4]; let mut scalar_out = std::vec![0u16; 33 * 4]; - unsafe { avx512_rgba64_to_rgba_u16_row(&src, &mut simd_out, 33) }; - scalar::rgba64_to_rgba_u16_row(&src, &mut scalar_out, 33); + unsafe { avx512_rgba64_to_rgba_u16_row::(&src, &mut simd_out, 33) }; + scalar::rgba64_to_rgba_u16_row::(&src, &mut scalar_out, 33); assert_eq!( simd_out, scalar_out, "rgba64→rgba_u16 lane order (identity copy): SIMD vs scalar mismatch" @@ -528,8 +528,8 @@ fn avx512_bgra64_to_rgb_matches_scalar_width33() { let src = make_rgba64_src(33, 0x1234); let mut simd_out = std::vec![0u8; 33 * 3]; let mut scalar_out = std::vec![0u8; 33 * 3]; - unsafe { avx512_bgra64_to_rgb_row(&src, &mut simd_out, 33) }; - scalar::bgra64_to_rgb_row(&src, &mut scalar_out, 33); + unsafe { avx512_bgra64_to_rgb_row::(&src, &mut simd_out, 33) }; + scalar::bgra64_to_rgb_row::(&src, &mut scalar_out, 33); assert_eq!( simd_out, scalar_out, "bgra64→rgb width=33: SIMD vs scalar mismatch" @@ -546,8 +546,8 @@ fn avx512_bgra64_to_rgb_lane_order_regression() { let src = 
make_rgba64_asymmetric(33); let mut simd_out = std::vec![0u8; 33 * 3]; let mut scalar_out = std::vec![0u8; 33 * 3]; - unsafe { avx512_bgra64_to_rgb_row(&src, &mut simd_out, 33) }; - scalar::bgra64_to_rgb_row(&src, &mut scalar_out, 33); + unsafe { avx512_bgra64_to_rgb_row::(&src, &mut simd_out, 33) }; + scalar::bgra64_to_rgb_row::(&src, &mut scalar_out, 33); assert_eq!( simd_out, scalar_out, "bgra64→rgb lane order (B↔R swap): SIMD vs scalar mismatch" @@ -567,8 +567,8 @@ fn avx512_bgra64_to_rgba_matches_scalar_width33() { let src = make_rgba64_src(33, 0x5678); let mut simd_out = std::vec![0u8; 33 * 4]; let mut scalar_out = std::vec![0u8; 33 * 4]; - unsafe { avx512_bgra64_to_rgba_row(&src, &mut simd_out, 33) }; - scalar::bgra64_to_rgba_row(&src, &mut scalar_out, 33); + unsafe { avx512_bgra64_to_rgba_row::(&src, &mut simd_out, 33) }; + scalar::bgra64_to_rgba_row::(&src, &mut scalar_out, 33); assert_eq!( simd_out, scalar_out, "bgra64→rgba width=33: SIMD vs scalar mismatch" @@ -585,8 +585,8 @@ fn avx512_bgra64_to_rgba_lane_order_regression() { let src = make_rgba64_asymmetric(33); let mut simd_out = std::vec![0u8; 33 * 4]; let mut scalar_out = std::vec![0u8; 33 * 4]; - unsafe { avx512_bgra64_to_rgba_row(&src, &mut simd_out, 33) }; - scalar::bgra64_to_rgba_row(&src, &mut scalar_out, 33); + unsafe { avx512_bgra64_to_rgba_row::(&src, &mut simd_out, 33) }; + scalar::bgra64_to_rgba_row::(&src, &mut scalar_out, 33); assert_eq!( simd_out, scalar_out, "bgra64→rgba lane order (B↔R swap + alpha): SIMD vs scalar mismatch" @@ -606,8 +606,8 @@ fn avx512_bgra64_to_rgb_u16_matches_scalar_width33() { let src = make_rgba64_src(33, 0x9ABC); let mut simd_out = std::vec![0u16; 33 * 3]; let mut scalar_out = std::vec![0u16; 33 * 3]; - unsafe { avx512_bgra64_to_rgb_u16_row(&src, &mut simd_out, 33) }; - scalar::bgra64_to_rgb_u16_row(&src, &mut scalar_out, 33); + unsafe { avx512_bgra64_to_rgb_u16_row::(&src, &mut simd_out, 33) }; + scalar::bgra64_to_rgb_u16_row::(&src, &mut scalar_out, 33); 
assert_eq!( simd_out, scalar_out, "bgra64→rgb_u16 width=33: SIMD vs scalar mismatch" @@ -627,8 +627,8 @@ fn avx512_bgra64_to_rgba_u16_matches_scalar_width33() { let src = make_rgba64_src(33, 0xDEF0); let mut simd_out = std::vec![0u16; 33 * 4]; let mut scalar_out = std::vec![0u16; 33 * 4]; - unsafe { avx512_bgra64_to_rgba_u16_row(&src, &mut simd_out, 33) }; - scalar::bgra64_to_rgba_u16_row(&src, &mut scalar_out, 33); + unsafe { avx512_bgra64_to_rgba_u16_row::(&src, &mut simd_out, 33) }; + scalar::bgra64_to_rgba_u16_row::(&src, &mut scalar_out, 33); assert_eq!( simd_out, scalar_out, "bgra64→rgba_u16 width=33: SIMD vs scalar mismatch" @@ -644,8 +644,8 @@ fn avx512_bgra64_to_rgba_u16_width1_tail_only() { let src = [0x1111u16, 0x2222, 0x3333, 0x4444]; // B, G, R, A let mut simd_out = [0u16; 4]; let mut scalar_out = [0u16; 4]; - unsafe { avx512_bgra64_to_rgba_u16_row(&src, &mut simd_out, 1) }; - scalar::bgra64_to_rgba_u16_row(&src, &mut scalar_out, 1); + unsafe { avx512_bgra64_to_rgba_u16_row::(&src, &mut simd_out, 1) }; + scalar::bgra64_to_rgba_u16_row::(&src, &mut scalar_out, 1); assert_eq!( simd_out, scalar_out, "bgra64→rgba_u16 width=1: tail-only mismatch" @@ -662,8 +662,8 @@ fn avx512_bgra64_to_rgba_u16_lane_order_regression() { let src = make_rgba64_asymmetric(33); let mut simd_out = std::vec![0u16; 33 * 4]; let mut scalar_out = std::vec![0u16; 33 * 4]; - unsafe { avx512_bgra64_to_rgba_u16_row(&src, &mut simd_out, 33) }; - scalar::bgra64_to_rgba_u16_row(&src, &mut scalar_out, 33); + unsafe { avx512_bgra64_to_rgba_u16_row::(&src, &mut simd_out, 33) }; + scalar::bgra64_to_rgba_u16_row::(&src, &mut scalar_out, 33); assert_eq!( simd_out, scalar_out, "bgra64→rgba_u16 lane order (B↔R swap + alpha preserve): SIMD vs scalar mismatch" @@ -720,7 +720,7 @@ fn avx512_rgba64_to_rgba_u16_lane_order_handcheck() { } let src = make_rgba64_lane_order(33); let mut simd_out = std::vec![0u16; 33 * 4]; - unsafe { avx512_rgba64_to_rgba_u16_row(&src, &mut simd_out, 33) }; + unsafe { 
avx512_rgba64_to_rgba_u16_row::(&src, &mut simd_out, 33) }; for n in 0..33 { assert_eq!(simd_out[n * 4], (n as u16) + 1, "R at pixel {n}"); assert_eq!(simd_out[n * 4 + 1], (n as u16) + 100, "G at pixel {n}"); @@ -737,7 +737,7 @@ fn avx512_rgba64_to_rgb_u16_lane_order_handcheck() { } let src = make_rgba64_lane_order(33); let mut simd_out = std::vec![0u16; 33 * 3]; - unsafe { avx512_rgba64_to_rgb_u16_row(&src, &mut simd_out, 33) }; + unsafe { avx512_rgba64_to_rgb_u16_row::(&src, &mut simd_out, 33) }; for n in 0..33 { assert_eq!(simd_out[n * 3], (n as u16) + 1, "R at pixel {n}"); assert_eq!(simd_out[n * 3 + 1], (n as u16) + 100, "G at pixel {n}"); @@ -753,7 +753,7 @@ fn avx512_bgra64_to_rgba_u16_lane_order_handcheck() { } let src = make_bgra64_lane_order(33); let mut simd_out = std::vec![0u16; 33 * 4]; - unsafe { avx512_bgra64_to_rgba_u16_row(&src, &mut simd_out, 33) }; + unsafe { avx512_bgra64_to_rgba_u16_row::(&src, &mut simd_out, 33) }; // Output is RGBA: R=n+1, G=100+n, B=200+n, A=50+n per pixel n // (B↔R swap from source memory order). for n in 0..33 { @@ -772,7 +772,7 @@ fn avx512_bgra64_to_rgb_u16_lane_order_handcheck() { } let src = make_bgra64_lane_order(33); let mut simd_out = std::vec![0u16; 33 * 3]; - unsafe { avx512_bgra64_to_rgb_u16_row(&src, &mut simd_out, 33) }; + unsafe { avx512_bgra64_to_rgb_u16_row::(&src, &mut simd_out, 33) }; for n in 0..33 { assert_eq!(simd_out[n * 3], (n as u16) + 1, "R at pixel {n}"); assert_eq!(simd_out[n * 3 + 1], (n as u16) + 100, "G at pixel {n}"); diff --git a/src/row/arch/x86_sse41/packed_rgb.rs b/src/row/arch/x86_sse41/packed_rgb.rs index e5bb35e8..12dccd40 100644 --- a/src/row/arch/x86_sse41/packed_rgb.rs +++ b/src/row/arch/x86_sse41/packed_rgb.rs @@ -426,18 +426,24 @@ pub(crate) unsafe fn bgrx_to_rgba_row(bgrx: &[u8], rgba_out: &mut [u8], width: u /// 3. `x2rgb10` / `rgb_out` must not alias. 
#[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn x2rgb10_to_rgb_row(x2rgb10: &[u8], rgb_out: &mut [u8], width: usize) { +pub(crate) unsafe fn x2rgb10_to_rgb_row( + x2rgb10: &[u8], + rgb_out: &mut [u8], + width: usize, +) { debug_assert!(x2rgb10.len() >= width * 4, "x2rgb10 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); unsafe { let mut x = 0usize; - while x + 16 <= width { - x2rgb10_to_rgb_16_pixels(x2rgb10.as_ptr().add(x * 4), rgb_out.as_mut_ptr().add(x * 3)); - x += 16; + if !BE { + while x + 16 <= width { + x2rgb10_to_rgb_16_pixels(x2rgb10.as_ptr().add(x * 4), rgb_out.as_mut_ptr().add(x * 3)); + x += 16; + } } if x < width { - scalar::x2rgb10_to_rgb_row( + scalar::x2rgb10_to_rgb_row::( &x2rgb10[x * 4..width * 4], &mut rgb_out[x * 3..width * 3], width - x, @@ -449,21 +455,27 @@ pub(crate) unsafe fn x2rgb10_to_rgb_row(x2rgb10: &[u8], rgb_out: &mut [u8], widt /// SSE4.1 X2RGB10→RGBA. 16 pixels per iteration. #[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn x2rgb10_to_rgba_row(x2rgb10: &[u8], rgba_out: &mut [u8], width: usize) { +pub(crate) unsafe fn x2rgb10_to_rgba_row( + x2rgb10: &[u8], + rgba_out: &mut [u8], + width: usize, +) { debug_assert!(x2rgb10.len() >= width * 4, "x2rgb10 row too short"); debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short"); unsafe { let mut x = 0usize; - while x + 16 <= width { - x2rgb10_to_rgba_16_pixels( - x2rgb10.as_ptr().add(x * 4), - rgba_out.as_mut_ptr().add(x * 4), - ); - x += 16; + if !BE { + while x + 16 <= width { + x2rgb10_to_rgba_16_pixels( + x2rgb10.as_ptr().add(x * 4), + rgba_out.as_mut_ptr().add(x * 4), + ); + x += 16; + } } if x < width { - scalar::x2rgb10_to_rgba_row( + scalar::x2rgb10_to_rgba_row::( &x2rgb10[x * 4..width * 4], &mut rgba_out[x * 4..width * 4], width - x, @@ -476,21 +488,27 @@ pub(crate) unsafe fn x2rgb10_to_rgba_row(x2rgb10: &[u8], rgba_out: &mut [u8], wi /// `u16`, max value `1023`). 8 pixels per iteration. 
#[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn x2rgb10_to_rgb_u16_row(x2rgb10: &[u8], rgb_out: &mut [u16], width: usize) { +pub(crate) unsafe fn x2rgb10_to_rgb_u16_row( + x2rgb10: &[u8], + rgb_out: &mut [u16], + width: usize, +) { debug_assert!(x2rgb10.len() >= width * 4, "x2rgb10 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); unsafe { let mut x = 0usize; - while x + 8 <= width { - x2rgb10_to_rgb_u16_8_pixels( - x2rgb10.as_ptr().add(x * 4), - rgb_out.as_mut_ptr().add(x * 3).cast::(), - ); - x += 8; + if !BE { + while x + 8 <= width { + x2rgb10_to_rgb_u16_8_pixels( + x2rgb10.as_ptr().add(x * 4), + rgb_out.as_mut_ptr().add(x * 3).cast::(), + ); + x += 8; + } } if x < width { - scalar::x2rgb10_to_rgb_u16_row( + scalar::x2rgb10_to_rgb_u16_row::( &x2rgb10[x * 4..width * 4], &mut rgb_out[x * 3..width * 3], width - x, @@ -502,18 +520,24 @@ pub(crate) unsafe fn x2rgb10_to_rgb_u16_row(x2rgb10: &[u8], rgb_out: &mut [u16], /// SSE4.1 X2BGR10→RGB. 16 pixels per iteration. #[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn x2bgr10_to_rgb_row(x2bgr10: &[u8], rgb_out: &mut [u8], width: usize) { +pub(crate) unsafe fn x2bgr10_to_rgb_row( + x2bgr10: &[u8], + rgb_out: &mut [u8], + width: usize, +) { debug_assert!(x2bgr10.len() >= width * 4, "x2bgr10 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); unsafe { let mut x = 0usize; - while x + 16 <= width { - x2bgr10_to_rgb_16_pixels(x2bgr10.as_ptr().add(x * 4), rgb_out.as_mut_ptr().add(x * 3)); - x += 16; + if !BE { + while x + 16 <= width { + x2bgr10_to_rgb_16_pixels(x2bgr10.as_ptr().add(x * 4), rgb_out.as_mut_ptr().add(x * 3)); + x += 16; + } } if x < width { - scalar::x2bgr10_to_rgb_row( + scalar::x2bgr10_to_rgb_row::( &x2bgr10[x * 4..width * 4], &mut rgb_out[x * 3..width * 3], width - x, @@ -525,21 +549,27 @@ pub(crate) unsafe fn x2bgr10_to_rgb_row(x2bgr10: &[u8], rgb_out: &mut [u8], widt /// SSE4.1 X2BGR10→RGBA. 
16 pixels per iteration. #[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn x2bgr10_to_rgba_row(x2bgr10: &[u8], rgba_out: &mut [u8], width: usize) { +pub(crate) unsafe fn x2bgr10_to_rgba_row( + x2bgr10: &[u8], + rgba_out: &mut [u8], + width: usize, +) { debug_assert!(x2bgr10.len() >= width * 4, "x2bgr10 row too short"); debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short"); unsafe { let mut x = 0usize; - while x + 16 <= width { - x2bgr10_to_rgba_16_pixels( - x2bgr10.as_ptr().add(x * 4), - rgba_out.as_mut_ptr().add(x * 4), - ); - x += 16; + if !BE { + while x + 16 <= width { + x2bgr10_to_rgba_16_pixels( + x2bgr10.as_ptr().add(x * 4), + rgba_out.as_mut_ptr().add(x * 4), + ); + x += 16; + } } if x < width { - scalar::x2bgr10_to_rgba_row( + scalar::x2bgr10_to_rgba_row::( &x2bgr10[x * 4..width * 4], &mut rgba_out[x * 4..width * 4], width - x, @@ -551,21 +581,27 @@ pub(crate) unsafe fn x2bgr10_to_rgba_row(x2bgr10: &[u8], rgba_out: &mut [u8], wi /// SSE4.1 X2BGR10→u16 RGB native. 8 pixels per iteration. 
#[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn x2bgr10_to_rgb_u16_row(x2bgr10: &[u8], rgb_out: &mut [u16], width: usize) { +pub(crate) unsafe fn x2bgr10_to_rgb_u16_row( + x2bgr10: &[u8], + rgb_out: &mut [u16], + width: usize, +) { debug_assert!(x2bgr10.len() >= width * 4, "x2bgr10 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); unsafe { let mut x = 0usize; - while x + 8 <= width { - x2bgr10_to_rgb_u16_8_pixels( - x2bgr10.as_ptr().add(x * 4), - rgb_out.as_mut_ptr().add(x * 3).cast::(), - ); - x += 8; + if !BE { + while x + 8 <= width { + x2bgr10_to_rgb_u16_8_pixels( + x2bgr10.as_ptr().add(x * 4), + rgb_out.as_mut_ptr().add(x * 3).cast::(), + ); + x += 8; + } } if x < width { - scalar::x2bgr10_to_rgb_u16_row( + scalar::x2bgr10_to_rgb_u16_row::( &x2bgr10[x * 4..width * 4], &mut rgb_out[x * 3..width * 3], width - x, diff --git a/src/row/arch/x86_sse41/packed_rgb_16bit.rs b/src/row/arch/x86_sse41/packed_rgb_16bit.rs index b9dc50fa..c9a8bff4 100644 --- a/src/row/arch/x86_sse41/packed_rgb_16bit.rs +++ b/src/row/arch/x86_sse41/packed_rgb_16bit.rs @@ -306,6 +306,24 @@ unsafe fn narrow_u16x8_to_u8x8(v: __m128i, zero: __m128i) -> __m128i { unsafe { _mm_packus_epi16(_mm_srli_epi16::<8>(v), zero) } } +// ---- endian byte-swap helper ------------------------------------------------ + +/// Byte-swap every u16 lane in `v` when `BE = true`; no-op otherwise. +/// +/// Uses `_mm_shuffle_epi8` (SSSE3, a subset of SSE4.1) with the same mask as +/// `endian::BYTESWAP_MASK_U16`. 
+#[inline(always)] +unsafe fn byteswap_if_be(v: __m128i) -> __m128i { + if BE { + // Swap bytes within each u16 lane: [1,0, 3,2, 5,4, 7,6, 9,8, 11,10, 13,12, 15,14] + const MASK: __m128i = + unsafe { core::mem::transmute([1u8, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14]) }; + unsafe { _mm_shuffle_epi8(v, MASK) } + } else { + v + } +} + // ============================================================================= // Rgb48 (R, G, B — 3 u16 elements per pixel) // ============================================================================= @@ -314,6 +332,7 @@ unsafe fn narrow_u16x8_to_u8x8(v: __m128i, zero: __m128i) -> __m128i { /// /// Loads 3 × 128-bit chunks (24 u16), deinterleaves with shuffle masks, /// narrows via `>> 8`, writes 8 pixels (24 bytes) of interleaved RGB. +/// When `BE = true` each loaded register is byte-swapped before deinterleaving. /// /// # Safety /// @@ -322,7 +341,11 @@ unsafe fn narrow_u16x8_to_u8x8(v: __m128i, zero: __m128i) -> __m128i { /// 3. `rgb_out.len() >= width * 3`. 
#[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn sse41_rgb48_to_rgb_row(rgb48: &[u16], rgb_out: &mut [u8], width: usize) { +pub(crate) unsafe fn sse41_rgb48_to_rgb_row( + rgb48: &[u16], + rgb_out: &mut [u8], + width: usize, +) { debug_assert!(rgb48.len() >= width * 3, "rgb48 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); @@ -331,9 +354,9 @@ pub(crate) unsafe fn sse41_rgb48_to_rgb_row(rgb48: &[u16], rgb_out: &mut [u8], w let mut x = 0usize; while x + 8 <= width { let ptr = rgb48.as_ptr().add(x * 3); - let v0 = _mm_loadu_si128(ptr.cast()); - let v1 = _mm_loadu_si128(ptr.add(8).cast()); - let v2 = _mm_loadu_si128(ptr.add(16).cast()); + let v0 = byteswap_if_be::(_mm_loadu_si128(ptr.cast())); + let v1 = byteswap_if_be::(_mm_loadu_si128(ptr.add(8).cast())); + let v2 = byteswap_if_be::(_mm_loadu_si128(ptr.add(16).cast())); let (r_u16, g_u16, b_u16) = deinterleave_rgb48_8px(v0, v1, v2); let r_u8 = narrow_u16x8_to_u8x8(r_u16, zero); let g_u8 = narrow_u16x8_to_u8x8(g_u16, zero); @@ -345,13 +368,15 @@ pub(crate) unsafe fn sse41_rgb48_to_rgb_row(rgb48: &[u16], rgb_out: &mut [u8], w x += 8; } if x < width { - scalar::rgb48_to_rgb_row(&rgb48[x * 3..], &mut rgb_out[x * 3..], width - x); + scalar::rgb48_to_rgb_row::(&rgb48[x * 3..], &mut rgb_out[x * 3..], width - x); } } } /// SSE4.1 Rgb48 → packed u8 RGBA. 8 pixels per SIMD iteration. Alpha forced to 0xFF. /// +/// When `BE = true` each loaded register is byte-swapped before deinterleaving. +/// /// # Safety /// /// 1. SSE4.1 must be available. @@ -359,7 +384,11 @@ pub(crate) unsafe fn sse41_rgb48_to_rgb_row(rgb48: &[u16], rgb_out: &mut [u8], w /// 3. `rgba_out.len() >= width * 4`. 
#[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn sse41_rgb48_to_rgba_row(rgb48: &[u16], rgba_out: &mut [u8], width: usize) { +pub(crate) unsafe fn sse41_rgb48_to_rgba_row( + rgb48: &[u16], + rgba_out: &mut [u8], + width: usize, +) { debug_assert!(rgb48.len() >= width * 3, "rgb48 row too short"); debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short"); @@ -370,9 +399,9 @@ pub(crate) unsafe fn sse41_rgb48_to_rgba_row(rgb48: &[u16], rgba_out: &mut [u8], let mut x = 0usize; while x + 8 <= width { let ptr = rgb48.as_ptr().add(x * 3); - let v0 = _mm_loadu_si128(ptr.cast()); - let v1 = _mm_loadu_si128(ptr.add(8).cast()); - let v2 = _mm_loadu_si128(ptr.add(16).cast()); + let v0 = byteswap_if_be::(_mm_loadu_si128(ptr.cast())); + let v1 = byteswap_if_be::(_mm_loadu_si128(ptr.add(8).cast())); + let v2 = byteswap_if_be::(_mm_loadu_si128(ptr.add(16).cast())); let (r_u16, g_u16, b_u16) = deinterleave_rgb48_8px(v0, v1, v2); let r_u8 = narrow_u16x8_to_u8x8(r_u16, zero); let g_u8 = narrow_u16x8_to_u8x8(g_u16, zero); @@ -383,14 +412,15 @@ pub(crate) unsafe fn sse41_rgb48_to_rgba_row(rgb48: &[u16], rgba_out: &mut [u8], x += 8; } if x < width { - scalar::rgb48_to_rgba_row(&rgb48[x * 3..], &mut rgba_out[x * 4..], width - x); + scalar::rgb48_to_rgba_row::(&rgb48[x * 3..], &mut rgba_out[x * 4..], width - x); } } } -/// SSE4.1 Rgb48 → native-depth u16 RGB (identity repack). 8 pixels per iteration. +/// SSE4.1 Rgb48 → native-depth u16 RGB. 8 pixels per iteration. /// /// Deinterleaves with shuffle masks, writes 8 pixels via `write_rgb_u16_8`. +/// When `BE = true` each loaded register is byte-swapped before deinterleaving. /// /// # Safety /// @@ -399,7 +429,11 @@ pub(crate) unsafe fn sse41_rgb48_to_rgba_row(rgb48: &[u16], rgba_out: &mut [u8], /// 3. `rgb_out.len() >= width * 3`. 
#[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn sse41_rgb48_to_rgb_u16_row(rgb48: &[u16], rgb_out: &mut [u16], width: usize) { +pub(crate) unsafe fn sse41_rgb48_to_rgb_u16_row( + rgb48: &[u16], + rgb_out: &mut [u16], + width: usize, +) { debug_assert!(rgb48.len() >= width * 3, "rgb48 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); @@ -407,21 +441,23 @@ pub(crate) unsafe fn sse41_rgb48_to_rgb_u16_row(rgb48: &[u16], rgb_out: &mut [u1 let mut x = 0usize; while x + 8 <= width { let ptr = rgb48.as_ptr().add(x * 3); - let v0 = _mm_loadu_si128(ptr.cast()); - let v1 = _mm_loadu_si128(ptr.add(8).cast()); - let v2 = _mm_loadu_si128(ptr.add(16).cast()); + let v0 = byteswap_if_be::(_mm_loadu_si128(ptr.cast())); + let v1 = byteswap_if_be::(_mm_loadu_si128(ptr.add(8).cast())); + let v2 = byteswap_if_be::(_mm_loadu_si128(ptr.add(16).cast())); let (r_u16, g_u16, b_u16) = deinterleave_rgb48_8px(v0, v1, v2); write_rgb_u16_8(r_u16, g_u16, b_u16, rgb_out.as_mut_ptr().add(x * 3)); x += 8; } if x < width { - scalar::rgb48_to_rgb_u16_row(&rgb48[x * 3..], &mut rgb_out[x * 3..], width - x); + scalar::rgb48_to_rgb_u16_row::(&rgb48[x * 3..], &mut rgb_out[x * 3..], width - x); } } } /// SSE4.1 Rgb48 → native-depth u16 RGBA. 8 pixels per iteration. Alpha forced to 0xFFFF. /// +/// When `BE = true` each loaded register is byte-swapped before deinterleaving. +/// /// # Safety /// /// 1. SSE4.1 must be available. @@ -429,7 +465,7 @@ pub(crate) unsafe fn sse41_rgb48_to_rgb_u16_row(rgb48: &[u16], rgb_out: &mut [u1 /// 3. `rgba_out.len() >= width * 4`. 
#[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn sse41_rgb48_to_rgba_u16_row( +pub(crate) unsafe fn sse41_rgb48_to_rgba_u16_row( rgb48: &[u16], rgba_out: &mut [u16], width: usize, @@ -442,9 +478,9 @@ pub(crate) unsafe fn sse41_rgb48_to_rgba_u16_row( let mut x = 0usize; while x + 8 <= width { let ptr = rgb48.as_ptr().add(x * 3); - let v0 = _mm_loadu_si128(ptr.cast()); - let v1 = _mm_loadu_si128(ptr.add(8).cast()); - let v2 = _mm_loadu_si128(ptr.add(16).cast()); + let v0 = byteswap_if_be::(_mm_loadu_si128(ptr.cast())); + let v1 = byteswap_if_be::(_mm_loadu_si128(ptr.add(8).cast())); + let v2 = byteswap_if_be::(_mm_loadu_si128(ptr.add(16).cast())); let (r_u16, g_u16, b_u16) = deinterleave_rgb48_8px(v0, v1, v2); write_rgba_u16_8( r_u16, @@ -456,7 +492,7 @@ pub(crate) unsafe fn sse41_rgb48_to_rgba_u16_row( x += 8; } if x < width { - scalar::rgb48_to_rgba_u16_row(&rgb48[x * 3..], &mut rgba_out[x * 4..], width - x); + scalar::rgb48_to_rgba_u16_row::(&rgb48[x * 3..], &mut rgba_out[x * 4..], width - x); } } } @@ -469,6 +505,7 @@ pub(crate) unsafe fn sse41_rgb48_to_rgba_u16_row( /// /// `deinterleave_rgb48_8px` yields `(B, G, R)` in source memory order; /// the B↔R swap is applied by passing them as `(R=ch2, G=ch1, B=ch0)`. +/// When `BE = true` each loaded register is byte-swapped before deinterleaving. /// /// # Safety /// @@ -477,7 +514,11 @@ pub(crate) unsafe fn sse41_rgb48_to_rgba_u16_row( /// 3. `rgb_out.len() >= width * 3`. 
#[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn sse41_bgr48_to_rgb_row(bgr48: &[u16], rgb_out: &mut [u8], width: usize) { +pub(crate) unsafe fn sse41_bgr48_to_rgb_row( + bgr48: &[u16], + rgb_out: &mut [u8], + width: usize, +) { debug_assert!(bgr48.len() >= width * 3, "bgr48 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); @@ -486,9 +527,9 @@ pub(crate) unsafe fn sse41_bgr48_to_rgb_row(bgr48: &[u16], rgb_out: &mut [u8], w let mut x = 0usize; while x + 8 <= width { let ptr = bgr48.as_ptr().add(x * 3); - let v0 = _mm_loadu_si128(ptr.cast()); - let v1 = _mm_loadu_si128(ptr.add(8).cast()); - let v2 = _mm_loadu_si128(ptr.add(16).cast()); + let v0 = byteswap_if_be::(_mm_loadu_si128(ptr.cast())); + let v1 = byteswap_if_be::(_mm_loadu_si128(ptr.add(8).cast())); + let v2 = byteswap_if_be::(_mm_loadu_si128(ptr.add(16).cast())); // ch0=B, ch1=G, ch2=R (source BGR order) let (b_u16, g_u16, r_u16) = deinterleave_rgb48_8px(v0, v1, v2); let r_u8 = narrow_u16x8_to_u8x8(r_u16, zero); @@ -500,13 +541,14 @@ pub(crate) unsafe fn sse41_bgr48_to_rgb_row(bgr48: &[u16], rgb_out: &mut [u8], w x += 8; } if x < width { - scalar::bgr48_to_rgb_row(&bgr48[x * 3..], &mut rgb_out[x * 3..], width - x); + scalar::bgr48_to_rgb_row::(&bgr48[x * 3..], &mut rgb_out[x * 3..], width - x); } } } /// SSE4.1 Bgr48 → packed u8 RGBA. 8 pixels per SIMD iteration. /// B↔R swap; alpha forced to 0xFF. +/// When `BE = true` each loaded register is byte-swapped before deinterleaving. /// /// # Safety /// @@ -515,7 +557,11 @@ pub(crate) unsafe fn sse41_bgr48_to_rgb_row(bgr48: &[u16], rgb_out: &mut [u8], w /// 3. `rgba_out.len() >= width * 4`. 
#[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn sse41_bgr48_to_rgba_row(bgr48: &[u16], rgba_out: &mut [u8], width: usize) { +pub(crate) unsafe fn sse41_bgr48_to_rgba_row( + bgr48: &[u16], + rgba_out: &mut [u8], + width: usize, +) { debug_assert!(bgr48.len() >= width * 3, "bgr48 row too short"); debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short"); @@ -526,9 +572,9 @@ pub(crate) unsafe fn sse41_bgr48_to_rgba_row(bgr48: &[u16], rgba_out: &mut [u8], let mut x = 0usize; while x + 8 <= width { let ptr = bgr48.as_ptr().add(x * 3); - let v0 = _mm_loadu_si128(ptr.cast()); - let v1 = _mm_loadu_si128(ptr.add(8).cast()); - let v2 = _mm_loadu_si128(ptr.add(16).cast()); + let v0 = byteswap_if_be::(_mm_loadu_si128(ptr.cast())); + let v1 = byteswap_if_be::(_mm_loadu_si128(ptr.add(8).cast())); + let v2 = byteswap_if_be::(_mm_loadu_si128(ptr.add(16).cast())); let (b_u16, g_u16, r_u16) = deinterleave_rgb48_8px(v0, v1, v2); let r_u8 = narrow_u16x8_to_u8x8(r_u16, zero); let g_u8 = narrow_u16x8_to_u8x8(g_u16, zero); @@ -539,13 +585,14 @@ pub(crate) unsafe fn sse41_bgr48_to_rgba_row(bgr48: &[u16], rgba_out: &mut [u8], x += 8; } if x < width { - scalar::bgr48_to_rgba_row(&bgr48[x * 3..], &mut rgba_out[x * 4..], width - x); + scalar::bgr48_to_rgba_row::(&bgr48[x * 3..], &mut rgba_out[x * 4..], width - x); } } } /// SSE4.1 Bgr48 → native-depth u16 RGB. 8 pixels per SIMD iteration. /// B↔R swap; values unchanged. +/// When `BE = true` each loaded register is byte-swapped before deinterleaving. /// /// # Safety /// @@ -554,7 +601,11 @@ pub(crate) unsafe fn sse41_bgr48_to_rgba_row(bgr48: &[u16], rgba_out: &mut [u8], /// 3. `rgb_out.len() >= width * 3`. 
#[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn sse41_bgr48_to_rgb_u16_row(bgr48: &[u16], rgb_out: &mut [u16], width: usize) { +pub(crate) unsafe fn sse41_bgr48_to_rgb_u16_row( + bgr48: &[u16], + rgb_out: &mut [u16], + width: usize, +) { debug_assert!(bgr48.len() >= width * 3, "bgr48 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); @@ -562,22 +613,23 @@ pub(crate) unsafe fn sse41_bgr48_to_rgb_u16_row(bgr48: &[u16], rgb_out: &mut [u1 let mut x = 0usize; while x + 8 <= width { let ptr = bgr48.as_ptr().add(x * 3); - let v0 = _mm_loadu_si128(ptr.cast()); - let v1 = _mm_loadu_si128(ptr.add(8).cast()); - let v2 = _mm_loadu_si128(ptr.add(16).cast()); + let v0 = byteswap_if_be::(_mm_loadu_si128(ptr.cast())); + let v1 = byteswap_if_be::(_mm_loadu_si128(ptr.add(8).cast())); + let v2 = byteswap_if_be::(_mm_loadu_si128(ptr.add(16).cast())); let (b_u16, g_u16, r_u16) = deinterleave_rgb48_8px(v0, v1, v2); // Store as R, G, B (swap applied by argument order) write_rgb_u16_8(r_u16, g_u16, b_u16, rgb_out.as_mut_ptr().add(x * 3)); x += 8; } if x < width { - scalar::bgr48_to_rgb_u16_row(&bgr48[x * 3..], &mut rgb_out[x * 3..], width - x); + scalar::bgr48_to_rgb_u16_row::(&bgr48[x * 3..], &mut rgb_out[x * 3..], width - x); } } } /// SSE4.1 Bgr48 → native-depth u16 RGBA. 8 pixels per SIMD iteration. /// B↔R swap; alpha forced to 0xFFFF. +/// When `BE = true` each loaded register is byte-swapped before deinterleaving. /// /// # Safety /// @@ -586,7 +638,7 @@ pub(crate) unsafe fn sse41_bgr48_to_rgb_u16_row(bgr48: &[u16], rgb_out: &mut [u1 /// 3. `rgba_out.len() >= width * 4`. 
#[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn sse41_bgr48_to_rgba_u16_row( +pub(crate) unsafe fn sse41_bgr48_to_rgba_u16_row( bgr48: &[u16], rgba_out: &mut [u16], width: usize, @@ -599,9 +651,9 @@ pub(crate) unsafe fn sse41_bgr48_to_rgba_u16_row( let mut x = 0usize; while x + 8 <= width { let ptr = bgr48.as_ptr().add(x * 3); - let v0 = _mm_loadu_si128(ptr.cast()); - let v1 = _mm_loadu_si128(ptr.add(8).cast()); - let v2 = _mm_loadu_si128(ptr.add(16).cast()); + let v0 = byteswap_if_be::(_mm_loadu_si128(ptr.cast())); + let v1 = byteswap_if_be::(_mm_loadu_si128(ptr.add(8).cast())); + let v2 = byteswap_if_be::(_mm_loadu_si128(ptr.add(16).cast())); let (b_u16, g_u16, r_u16) = deinterleave_rgb48_8px(v0, v1, v2); write_rgba_u16_8( r_u16, @@ -613,7 +665,7 @@ pub(crate) unsafe fn sse41_bgr48_to_rgba_u16_row( x += 8; } if x < width { - scalar::bgr48_to_rgba_u16_row(&bgr48[x * 3..], &mut rgba_out[x * 4..], width - x); + scalar::bgr48_to_rgba_u16_row::(&bgr48[x * 3..], &mut rgba_out[x * 4..], width - x); } } } @@ -624,6 +676,8 @@ pub(crate) unsafe fn sse41_bgr48_to_rgba_u16_row( /// SSE4.1 Rgba64 → packed u8 RGB. 8 pixels per SIMD iteration. Alpha discarded. /// +/// When `BE = true` each loaded register is byte-swapped before deinterleaving. +/// /// # Safety /// /// 1. SSE4.1 must be available. @@ -631,7 +685,11 @@ pub(crate) unsafe fn sse41_bgr48_to_rgba_u16_row( /// 3. `rgb_out.len() >= width * 3`. 
#[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn sse41_rgba64_to_rgb_row(rgba64: &[u16], rgb_out: &mut [u8], width: usize) { +pub(crate) unsafe fn sse41_rgba64_to_rgb_row( + rgba64: &[u16], + rgb_out: &mut [u8], + width: usize, +) { debug_assert!(rgba64.len() >= width * 4, "rgba64 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); @@ -640,10 +698,10 @@ pub(crate) unsafe fn sse41_rgba64_to_rgb_row(rgba64: &[u16], rgb_out: &mut [u8], let mut x = 0usize; while x + 8 <= width { let ptr = rgba64.as_ptr().add(x * 4); - let v0 = _mm_loadu_si128(ptr.cast()); - let v1 = _mm_loadu_si128(ptr.add(8).cast()); - let v2 = _mm_loadu_si128(ptr.add(16).cast()); - let v3 = _mm_loadu_si128(ptr.add(24).cast()); + let v0 = byteswap_if_be::(_mm_loadu_si128(ptr.cast())); + let v1 = byteswap_if_be::(_mm_loadu_si128(ptr.add(8).cast())); + let v2 = byteswap_if_be::(_mm_loadu_si128(ptr.add(16).cast())); + let v3 = byteswap_if_be::(_mm_loadu_si128(ptr.add(24).cast())); let (r_u16, g_u16, b_u16, _a) = deinterleave_rgba64_8px(v0, v1, v2, v3); let r_u8 = narrow_u16x8_to_u8x8(r_u16, zero); let g_u8 = narrow_u16x8_to_u8x8(g_u16, zero); @@ -654,7 +712,7 @@ pub(crate) unsafe fn sse41_rgba64_to_rgb_row(rgba64: &[u16], rgb_out: &mut [u8], x += 8; } if x < width { - scalar::rgba64_to_rgb_row(&rgba64[x * 4..], &mut rgb_out[x * 3..], width - x); + scalar::rgba64_to_rgb_row::(&rgba64[x * 4..], &mut rgb_out[x * 3..], width - x); } } } @@ -662,6 +720,7 @@ pub(crate) unsafe fn sse41_rgba64_to_rgb_row(rgba64: &[u16], rgb_out: &mut [u8], /// SSE4.1 Rgba64 → packed u8 RGBA. 8 pixels per SIMD iteration. Source alpha passes through. /// /// All 4 channels narrowed via `>> 8`. +/// When `BE = true` each loaded register is byte-swapped before deinterleaving. /// /// # Safety /// @@ -670,7 +729,11 @@ pub(crate) unsafe fn sse41_rgba64_to_rgb_row(rgba64: &[u16], rgb_out: &mut [u8], /// 3. `rgba_out.len() >= width * 4`. 
#[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn sse41_rgba64_to_rgba_row(rgba64: &[u16], rgba_out: &mut [u8], width: usize) { +pub(crate) unsafe fn sse41_rgba64_to_rgba_row( + rgba64: &[u16], + rgba_out: &mut [u8], + width: usize, +) { debug_assert!(rgba64.len() >= width * 4, "rgba64 row too short"); debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short"); @@ -679,10 +742,10 @@ pub(crate) unsafe fn sse41_rgba64_to_rgba_row(rgba64: &[u16], rgba_out: &mut [u8 let mut x = 0usize; while x + 8 <= width { let ptr = rgba64.as_ptr().add(x * 4); - let v0 = _mm_loadu_si128(ptr.cast()); - let v1 = _mm_loadu_si128(ptr.add(8).cast()); - let v2 = _mm_loadu_si128(ptr.add(16).cast()); - let v3 = _mm_loadu_si128(ptr.add(24).cast()); + let v0 = byteswap_if_be::(_mm_loadu_si128(ptr.cast())); + let v1 = byteswap_if_be::(_mm_loadu_si128(ptr.add(8).cast())); + let v2 = byteswap_if_be::(_mm_loadu_si128(ptr.add(16).cast())); + let v3 = byteswap_if_be::(_mm_loadu_si128(ptr.add(24).cast())); let (r_u16, g_u16, b_u16, a_u16) = deinterleave_rgba64_8px(v0, v1, v2, v3); let r_u8 = narrow_u16x8_to_u8x8(r_u16, zero); let g_u8 = narrow_u16x8_to_u8x8(g_u16, zero); @@ -694,13 +757,15 @@ pub(crate) unsafe fn sse41_rgba64_to_rgba_row(rgba64: &[u16], rgba_out: &mut [u8 x += 8; } if x < width { - scalar::rgba64_to_rgba_row(&rgba64[x * 4..], &mut rgba_out[x * 4..], width - x); + scalar::rgba64_to_rgba_row::(&rgba64[x * 4..], &mut rgba_out[x * 4..], width - x); } } } /// SSE4.1 Rgba64 → native-depth u16 RGB. 8 pixels per SIMD iteration. Alpha discarded. /// +/// When `BE = true` each loaded register is byte-swapped before deinterleaving. +/// /// # Safety /// /// 1. SSE4.1 must be available. @@ -708,7 +773,7 @@ pub(crate) unsafe fn sse41_rgba64_to_rgba_row(rgba64: &[u16], rgba_out: &mut [u8 /// 3. `rgb_out.len() >= width * 3`. 
#[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn sse41_rgba64_to_rgb_u16_row( +pub(crate) unsafe fn sse41_rgba64_to_rgb_u16_row( rgba64: &[u16], rgb_out: &mut [u16], width: usize, @@ -720,16 +785,16 @@ pub(crate) unsafe fn sse41_rgba64_to_rgb_u16_row( let mut x = 0usize; while x + 8 <= width { let ptr = rgba64.as_ptr().add(x * 4); - let v0 = _mm_loadu_si128(ptr.cast()); - let v1 = _mm_loadu_si128(ptr.add(8).cast()); - let v2 = _mm_loadu_si128(ptr.add(16).cast()); - let v3 = _mm_loadu_si128(ptr.add(24).cast()); + let v0 = byteswap_if_be::(_mm_loadu_si128(ptr.cast())); + let v1 = byteswap_if_be::(_mm_loadu_si128(ptr.add(8).cast())); + let v2 = byteswap_if_be::(_mm_loadu_si128(ptr.add(16).cast())); + let v3 = byteswap_if_be::(_mm_loadu_si128(ptr.add(24).cast())); let (r_u16, g_u16, b_u16, _a) = deinterleave_rgba64_8px(v0, v1, v2, v3); write_rgb_u16_8(r_u16, g_u16, b_u16, rgb_out.as_mut_ptr().add(x * 3)); x += 8; } if x < width { - scalar::rgba64_to_rgb_u16_row(&rgba64[x * 4..], &mut rgb_out[x * 3..], width - x); + scalar::rgba64_to_rgb_u16_row::(&rgba64[x * 4..], &mut rgb_out[x * 3..], width - x); } } } @@ -737,6 +802,7 @@ pub(crate) unsafe fn sse41_rgba64_to_rgb_u16_row( /// SSE4.1 Rgba64 → native-depth u16 RGBA (identity copy). 8 pixels per iteration. /// /// All 4 channels passed through at native depth; source alpha preserved. +/// When `BE = true` each loaded register is byte-swapped before deinterleaving. /// /// # Safety /// @@ -745,7 +811,7 @@ pub(crate) unsafe fn sse41_rgba64_to_rgb_u16_row( /// 3. `rgba_out.len() >= width * 4`. 
#[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn sse41_rgba64_to_rgba_u16_row( +pub(crate) unsafe fn sse41_rgba64_to_rgba_u16_row( rgba64: &[u16], rgba_out: &mut [u16], width: usize, @@ -757,16 +823,16 @@ pub(crate) unsafe fn sse41_rgba64_to_rgba_u16_row( let mut x = 0usize; while x + 8 <= width { let ptr = rgba64.as_ptr().add(x * 4); - let v0 = _mm_loadu_si128(ptr.cast()); - let v1 = _mm_loadu_si128(ptr.add(8).cast()); - let v2 = _mm_loadu_si128(ptr.add(16).cast()); - let v3 = _mm_loadu_si128(ptr.add(24).cast()); + let v0 = byteswap_if_be::(_mm_loadu_si128(ptr.cast())); + let v1 = byteswap_if_be::(_mm_loadu_si128(ptr.add(8).cast())); + let v2 = byteswap_if_be::(_mm_loadu_si128(ptr.add(16).cast())); + let v3 = byteswap_if_be::(_mm_loadu_si128(ptr.add(24).cast())); let (r_u16, g_u16, b_u16, a_u16) = deinterleave_rgba64_8px(v0, v1, v2, v3); write_rgba_u16_8(r_u16, g_u16, b_u16, a_u16, rgba_out.as_mut_ptr().add(x * 4)); x += 8; } if x < width { - scalar::rgba64_to_rgba_u16_row(&rgba64[x * 4..], &mut rgba_out[x * 4..], width - x); + scalar::rgba64_to_rgba_u16_row::(&rgba64[x * 4..], &mut rgba_out[x * 4..], width - x); } } } @@ -779,6 +845,7 @@ pub(crate) unsafe fn sse41_rgba64_to_rgba_u16_row( /// B↔R swap; alpha discarded. /// /// `deinterleave_rgba64_8px` yields `(B, G, R, A)` in source memory order. +/// When `BE = true` each loaded register is byte-swapped before deinterleaving. /// /// # Safety /// @@ -787,7 +854,11 @@ pub(crate) unsafe fn sse41_rgba64_to_rgba_u16_row( /// 3. `rgb_out.len() >= width * 3`. 
#[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn sse41_bgra64_to_rgb_row(bgra64: &[u16], rgb_out: &mut [u8], width: usize) { +pub(crate) unsafe fn sse41_bgra64_to_rgb_row( + bgra64: &[u16], + rgb_out: &mut [u8], + width: usize, +) { debug_assert!(bgra64.len() >= width * 4, "bgra64 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); @@ -796,10 +867,10 @@ pub(crate) unsafe fn sse41_bgra64_to_rgb_row(bgra64: &[u16], rgb_out: &mut [u8], let mut x = 0usize; while x + 8 <= width { let ptr = bgra64.as_ptr().add(x * 4); - let v0 = _mm_loadu_si128(ptr.cast()); - let v1 = _mm_loadu_si128(ptr.add(8).cast()); - let v2 = _mm_loadu_si128(ptr.add(16).cast()); - let v3 = _mm_loadu_si128(ptr.add(24).cast()); + let v0 = byteswap_if_be::(_mm_loadu_si128(ptr.cast())); + let v1 = byteswap_if_be::(_mm_loadu_si128(ptr.add(8).cast())); + let v2 = byteswap_if_be::(_mm_loadu_si128(ptr.add(16).cast())); + let v3 = byteswap_if_be::(_mm_loadu_si128(ptr.add(24).cast())); // ch0=B, ch1=G, ch2=R, ch3=A (source BGRA order) let (b_u16, g_u16, r_u16, _a) = deinterleave_rgba64_8px(v0, v1, v2, v3); let r_u8 = narrow_u16x8_to_u8x8(r_u16, zero); @@ -811,13 +882,14 @@ pub(crate) unsafe fn sse41_bgra64_to_rgb_row(bgra64: &[u16], rgb_out: &mut [u8], x += 8; } if x < width { - scalar::bgra64_to_rgb_row(&bgra64[x * 4..], &mut rgb_out[x * 3..], width - x); + scalar::bgra64_to_rgb_row::(&bgra64[x * 4..], &mut rgb_out[x * 3..], width - x); } } } /// SSE4.1 Bgra64 → packed u8 RGBA. 8 pixels per SIMD iteration. /// B↔R swap; source alpha passes through (narrowed via `>> 8`). +/// When `BE = true` each loaded register is byte-swapped before deinterleaving. /// /// # Safety /// @@ -826,7 +898,11 @@ pub(crate) unsafe fn sse41_bgra64_to_rgb_row(bgra64: &[u16], rgb_out: &mut [u8], /// 3. `rgba_out.len() >= width * 4`. 
#[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn sse41_bgra64_to_rgba_row(bgra64: &[u16], rgba_out: &mut [u8], width: usize) { +pub(crate) unsafe fn sse41_bgra64_to_rgba_row( + bgra64: &[u16], + rgba_out: &mut [u8], + width: usize, +) { debug_assert!(bgra64.len() >= width * 4, "bgra64 row too short"); debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short"); @@ -835,10 +911,10 @@ pub(crate) unsafe fn sse41_bgra64_to_rgba_row(bgra64: &[u16], rgba_out: &mut [u8 let mut x = 0usize; while x + 8 <= width { let ptr = bgra64.as_ptr().add(x * 4); - let v0 = _mm_loadu_si128(ptr.cast()); - let v1 = _mm_loadu_si128(ptr.add(8).cast()); - let v2 = _mm_loadu_si128(ptr.add(16).cast()); - let v3 = _mm_loadu_si128(ptr.add(24).cast()); + let v0 = byteswap_if_be::(_mm_loadu_si128(ptr.cast())); + let v1 = byteswap_if_be::(_mm_loadu_si128(ptr.add(8).cast())); + let v2 = byteswap_if_be::(_mm_loadu_si128(ptr.add(16).cast())); + let v3 = byteswap_if_be::(_mm_loadu_si128(ptr.add(24).cast())); let (b_u16, g_u16, r_u16, a_u16) = deinterleave_rgba64_8px(v0, v1, v2, v3); let r_u8 = narrow_u16x8_to_u8x8(r_u16, zero); let g_u8 = narrow_u16x8_to_u8x8(g_u16, zero); @@ -850,13 +926,14 @@ pub(crate) unsafe fn sse41_bgra64_to_rgba_row(bgra64: &[u16], rgba_out: &mut [u8 x += 8; } if x < width { - scalar::bgra64_to_rgba_row(&bgra64[x * 4..], &mut rgba_out[x * 4..], width - x); + scalar::bgra64_to_rgba_row::(&bgra64[x * 4..], &mut rgba_out[x * 4..], width - x); } } } /// SSE4.1 Bgra64 → native-depth u16 RGB. 8 pixels per SIMD iteration. /// B↔R swap; alpha discarded. +/// When `BE = true` each loaded register is byte-swapped before deinterleaving. /// /// # Safety /// @@ -865,7 +942,7 @@ pub(crate) unsafe fn sse41_bgra64_to_rgba_row(bgra64: &[u16], rgba_out: &mut [u8 /// 3. `rgb_out.len() >= width * 3`. 
#[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn sse41_bgra64_to_rgb_u16_row( +pub(crate) unsafe fn sse41_bgra64_to_rgb_u16_row( bgra64: &[u16], rgb_out: &mut [u16], width: usize, @@ -877,23 +954,24 @@ pub(crate) unsafe fn sse41_bgra64_to_rgb_u16_row( let mut x = 0usize; while x + 8 <= width { let ptr = bgra64.as_ptr().add(x * 4); - let v0 = _mm_loadu_si128(ptr.cast()); - let v1 = _mm_loadu_si128(ptr.add(8).cast()); - let v2 = _mm_loadu_si128(ptr.add(16).cast()); - let v3 = _mm_loadu_si128(ptr.add(24).cast()); + let v0 = byteswap_if_be::(_mm_loadu_si128(ptr.cast())); + let v1 = byteswap_if_be::(_mm_loadu_si128(ptr.add(8).cast())); + let v2 = byteswap_if_be::(_mm_loadu_si128(ptr.add(16).cast())); + let v3 = byteswap_if_be::(_mm_loadu_si128(ptr.add(24).cast())); let (b_u16, g_u16, r_u16, _a) = deinterleave_rgba64_8px(v0, v1, v2, v3); // Swap B↔R: store (R, G, B) write_rgb_u16_8(r_u16, g_u16, b_u16, rgb_out.as_mut_ptr().add(x * 3)); x += 8; } if x < width { - scalar::bgra64_to_rgb_u16_row(&bgra64[x * 4..], &mut rgb_out[x * 3..], width - x); + scalar::bgra64_to_rgb_u16_row::(&bgra64[x * 4..], &mut rgb_out[x * 3..], width - x); } } } /// SSE4.1 Bgra64 → native-depth u16 RGBA. 8 pixels per SIMD iteration. /// B↔R swap; source alpha preserved at position 3. +/// When `BE = true` each loaded register is byte-swapped before deinterleaving. /// /// # Safety /// @@ -902,7 +980,7 @@ pub(crate) unsafe fn sse41_bgra64_to_rgb_u16_row( /// 3. `rgba_out.len() >= width * 4`. 
#[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn sse41_bgra64_to_rgba_u16_row( +pub(crate) unsafe fn sse41_bgra64_to_rgba_u16_row( bgra64: &[u16], rgba_out: &mut [u16], width: usize, @@ -914,17 +992,17 @@ pub(crate) unsafe fn sse41_bgra64_to_rgba_u16_row( let mut x = 0usize; while x + 8 <= width { let ptr = bgra64.as_ptr().add(x * 4); - let v0 = _mm_loadu_si128(ptr.cast()); - let v1 = _mm_loadu_si128(ptr.add(8).cast()); - let v2 = _mm_loadu_si128(ptr.add(16).cast()); - let v3 = _mm_loadu_si128(ptr.add(24).cast()); + let v0 = byteswap_if_be::(_mm_loadu_si128(ptr.cast())); + let v1 = byteswap_if_be::(_mm_loadu_si128(ptr.add(8).cast())); + let v2 = byteswap_if_be::(_mm_loadu_si128(ptr.add(16).cast())); + let v3 = byteswap_if_be::(_mm_loadu_si128(ptr.add(24).cast())); // Swap B↔R: store (R=ch2, G=ch1, B=ch0, A=ch3) let (b_u16, g_u16, r_u16, a_u16) = deinterleave_rgba64_8px(v0, v1, v2, v3); write_rgba_u16_8(r_u16, g_u16, b_u16, a_u16, rgba_out.as_mut_ptr().add(x * 4)); x += 8; } if x < width { - scalar::bgra64_to_rgba_u16_row(&bgra64[x * 4..], &mut rgba_out[x * 4..], width - x); + scalar::bgra64_to_rgba_u16_row::(&bgra64[x * 4..], &mut rgba_out[x * 4..], width - x); } } } diff --git a/src/row/arch/x86_sse41/tests/packed_rgb.rs b/src/row/arch/x86_sse41/tests/packed_rgb.rs index 10f81926..e64aeba0 100644 --- a/src/row/arch/x86_sse41/tests/packed_rgb.rs +++ b/src/row/arch/x86_sse41/tests/packed_rgb.rs @@ -243,9 +243,9 @@ fn sse41_x2rgb10_to_rgb_matches_scalar() { let input = pseudo_random_rgba(w); let mut out_scalar = std::vec![0u8; w * 3]; let mut out_sse = std::vec![0u8; w * 3]; - scalar::x2rgb10_to_rgb_row(&input, &mut out_scalar, w); + scalar::x2rgb10_to_rgb_row::(&input, &mut out_scalar, w); unsafe { - x2rgb10_to_rgb_row(&input, &mut out_sse, w); + x2rgb10_to_rgb_row::(&input, &mut out_sse, w); } assert_eq!( out_scalar, out_sse, @@ -263,9 +263,9 @@ fn sse41_x2rgb10_to_rgba_matches_scalar() { let input = pseudo_random_rgba(w); let mut out_scalar = 
std::vec![0u8; w * 4]; let mut out_sse = std::vec![0u8; w * 4]; - scalar::x2rgb10_to_rgba_row(&input, &mut out_scalar, w); + scalar::x2rgb10_to_rgba_row::(&input, &mut out_scalar, w); unsafe { - x2rgb10_to_rgba_row(&input, &mut out_sse, w); + x2rgb10_to_rgba_row::(&input, &mut out_sse, w); } assert_eq!( out_scalar, out_sse, @@ -283,9 +283,9 @@ fn sse41_x2rgb10_to_rgb_u16_matches_scalar() { let input = pseudo_random_rgba(w); let mut out_scalar = std::vec![0u16; w * 3]; let mut out_sse = std::vec![0u16; w * 3]; - scalar::x2rgb10_to_rgb_u16_row(&input, &mut out_scalar, w); + scalar::x2rgb10_to_rgb_u16_row::(&input, &mut out_scalar, w); unsafe { - x2rgb10_to_rgb_u16_row(&input, &mut out_sse, w); + x2rgb10_to_rgb_u16_row::(&input, &mut out_sse, w); } assert_eq!( out_scalar, out_sse, @@ -303,9 +303,9 @@ fn sse41_x2bgr10_to_rgb_matches_scalar() { let input = pseudo_random_rgba(w); let mut out_scalar = std::vec![0u8; w * 3]; let mut out_sse = std::vec![0u8; w * 3]; - scalar::x2bgr10_to_rgb_row(&input, &mut out_scalar, w); + scalar::x2bgr10_to_rgb_row::(&input, &mut out_scalar, w); unsafe { - x2bgr10_to_rgb_row(&input, &mut out_sse, w); + x2bgr10_to_rgb_row::(&input, &mut out_sse, w); } assert_eq!( out_scalar, out_sse, @@ -323,9 +323,9 @@ fn sse41_x2bgr10_to_rgba_matches_scalar() { let input = pseudo_random_rgba(w); let mut out_scalar = std::vec![0u8; w * 4]; let mut out_sse = std::vec![0u8; w * 4]; - scalar::x2bgr10_to_rgba_row(&input, &mut out_scalar, w); + scalar::x2bgr10_to_rgba_row::(&input, &mut out_scalar, w); unsafe { - x2bgr10_to_rgba_row(&input, &mut out_sse, w); + x2bgr10_to_rgba_row::(&input, &mut out_sse, w); } assert_eq!( out_scalar, out_sse, @@ -343,9 +343,9 @@ fn sse41_x2bgr10_to_rgb_u16_matches_scalar() { let input = pseudo_random_rgba(w); let mut out_scalar = std::vec![0u16; w * 3]; let mut out_sse = std::vec![0u16; w * 3]; - scalar::x2bgr10_to_rgb_u16_row(&input, &mut out_scalar, w); + scalar::x2bgr10_to_rgb_u16_row::(&input, &mut out_scalar, w); unsafe { 
- x2bgr10_to_rgb_u16_row(&input, &mut out_sse, w); + x2bgr10_to_rgb_u16_row::(&input, &mut out_sse, w); } assert_eq!( out_scalar, out_sse, diff --git a/src/row/arch/x86_sse41/tests/packed_rgb_16bit.rs b/src/row/arch/x86_sse41/tests/packed_rgb_16bit.rs index 319ee5f9..57c5c8b6 100644 --- a/src/row/arch/x86_sse41/tests/packed_rgb_16bit.rs +++ b/src/row/arch/x86_sse41/tests/packed_rgb_16bit.rs @@ -36,8 +36,8 @@ fn sse41_rgb48_to_rgb_matches_scalar_width17() { let src = make_rgb48_src(17, 0x0101); let mut simd_out = std::vec![0u8; 17 * 3]; let mut scalar_out = std::vec![0u8; 17 * 3]; - unsafe { sse41_rgb48_to_rgb_row(&src, &mut simd_out, 17) }; - scalar::rgb48_to_rgb_row(&src, &mut scalar_out, 17); + unsafe { sse41_rgb48_to_rgb_row::(&src, &mut simd_out, 17) }; + scalar::rgb48_to_rgb_row::(&src, &mut scalar_out, 17); assert_eq!( simd_out, scalar_out, "rgb48→rgb width=17: SIMD vs scalar mismatch" @@ -53,8 +53,8 @@ fn sse41_rgb48_to_rgb_exact8_matches_scalar() { let src = make_rgb48_src(8, 0xF0F0); let mut simd_out = std::vec![0u8; 8 * 3]; let mut scalar_out = std::vec![0u8; 8 * 3]; - unsafe { sse41_rgb48_to_rgb_row(&src, &mut simd_out, 8) }; - scalar::rgb48_to_rgb_row(&src, &mut scalar_out, 8); + unsafe { sse41_rgb48_to_rgb_row::(&src, &mut simd_out, 8) }; + scalar::rgb48_to_rgb_row::(&src, &mut scalar_out, 8); assert_eq!( simd_out, scalar_out, "rgb48→rgb exact-8: SIMD vs scalar mismatch" @@ -70,8 +70,8 @@ fn sse41_rgb48_to_rgb_width1_tail_only() { let src = [0x1234u16, 0x5678, 0x9ABC]; let mut simd_out = [0u8; 3]; let mut scalar_out = [0u8; 3]; - unsafe { sse41_rgb48_to_rgb_row(&src, &mut simd_out, 1) }; - scalar::rgb48_to_rgb_row(&src, &mut scalar_out, 1); + unsafe { sse41_rgb48_to_rgb_row::(&src, &mut simd_out, 1) }; + scalar::rgb48_to_rgb_row::(&src, &mut scalar_out, 1); assert_eq!( simd_out, scalar_out, "rgb48→rgb width=1: tail-only mismatch" @@ -91,8 +91,8 @@ fn sse41_rgb48_to_rgba_matches_scalar_width17() { let src = make_rgb48_src(17, 0x0303); let mut simd_out = 
std::vec![0u8; 17 * 4]; let mut scalar_out = std::vec![0u8; 17 * 4]; - unsafe { sse41_rgb48_to_rgba_row(&src, &mut simd_out, 17) }; - scalar::rgb48_to_rgba_row(&src, &mut scalar_out, 17); + unsafe { sse41_rgb48_to_rgba_row::(&src, &mut simd_out, 17) }; + scalar::rgb48_to_rgba_row::(&src, &mut scalar_out, 17); assert_eq!( simd_out, scalar_out, "rgb48→rgba width=17: SIMD vs scalar mismatch" @@ -112,8 +112,8 @@ fn sse41_rgb48_to_rgb_u16_matches_scalar_width17() { let src = make_rgb48_src(17, 0x0505); let mut simd_out = std::vec![0u16; 17 * 3]; let mut scalar_out = std::vec![0u16; 17 * 3]; - unsafe { sse41_rgb48_to_rgb_u16_row(&src, &mut simd_out, 17) }; - scalar::rgb48_to_rgb_u16_row(&src, &mut scalar_out, 17); + unsafe { sse41_rgb48_to_rgb_u16_row::(&src, &mut simd_out, 17) }; + scalar::rgb48_to_rgb_u16_row::(&src, &mut scalar_out, 17); assert_eq!( simd_out, scalar_out, "rgb48→rgb_u16 width=17: SIMD vs scalar mismatch" @@ -133,8 +133,8 @@ fn sse41_rgb48_to_rgba_u16_matches_scalar_width17() { let src = make_rgb48_src(17, 0x0707); let mut simd_out = std::vec![0u16; 17 * 4]; let mut scalar_out = std::vec![0u16; 17 * 4]; - unsafe { sse41_rgb48_to_rgba_u16_row(&src, &mut simd_out, 17) }; - scalar::rgb48_to_rgba_u16_row(&src, &mut scalar_out, 17); + unsafe { sse41_rgb48_to_rgba_u16_row::(&src, &mut simd_out, 17) }; + scalar::rgb48_to_rgba_u16_row::(&src, &mut scalar_out, 17); assert_eq!( simd_out, scalar_out, "rgb48→rgba_u16 width=17: SIMD vs scalar mismatch" @@ -154,8 +154,8 @@ fn sse41_bgr48_to_rgb_matches_scalar_width17() { let src = make_rgb48_src(17, 0x1111); let mut simd_out = std::vec![0u8; 17 * 3]; let mut scalar_out = std::vec![0u8; 17 * 3]; - unsafe { sse41_bgr48_to_rgb_row(&src, &mut simd_out, 17) }; - scalar::bgr48_to_rgb_row(&src, &mut scalar_out, 17); + unsafe { sse41_bgr48_to_rgb_row::(&src, &mut simd_out, 17) }; + scalar::bgr48_to_rgb_row::(&src, &mut scalar_out, 17); assert_eq!( simd_out, scalar_out, "bgr48→rgb width=17: SIMD vs scalar mismatch" @@ -171,8 
+171,8 @@ fn sse41_bgr48_to_rgb_exact8_matches_scalar() { let src = make_rgb48_src(8, 0xA1A1); let mut simd_out = std::vec![0u8; 8 * 3]; let mut scalar_out = std::vec![0u8; 8 * 3]; - unsafe { sse41_bgr48_to_rgb_row(&src, &mut simd_out, 8) }; - scalar::bgr48_to_rgb_row(&src, &mut scalar_out, 8); + unsafe { sse41_bgr48_to_rgb_row::(&src, &mut simd_out, 8) }; + scalar::bgr48_to_rgb_row::(&src, &mut scalar_out, 8); assert_eq!( simd_out, scalar_out, "bgr48→rgb exact-8: SIMD vs scalar mismatch" @@ -192,8 +192,8 @@ fn sse41_bgr48_to_rgba_matches_scalar_width17() { let src = make_rgb48_src(17, 0x2222); let mut simd_out = std::vec![0u8; 17 * 4]; let mut scalar_out = std::vec![0u8; 17 * 4]; - unsafe { sse41_bgr48_to_rgba_row(&src, &mut simd_out, 17) }; - scalar::bgr48_to_rgba_row(&src, &mut scalar_out, 17); + unsafe { sse41_bgr48_to_rgba_row::(&src, &mut simd_out, 17) }; + scalar::bgr48_to_rgba_row::(&src, &mut scalar_out, 17); assert_eq!( simd_out, scalar_out, "bgr48→rgba width=17: SIMD vs scalar mismatch" @@ -213,8 +213,8 @@ fn sse41_bgr48_to_rgb_u16_matches_scalar_width17() { let src = make_rgb48_src(17, 0x3333); let mut simd_out = std::vec![0u16; 17 * 3]; let mut scalar_out = std::vec![0u16; 17 * 3]; - unsafe { sse41_bgr48_to_rgb_u16_row(&src, &mut simd_out, 17) }; - scalar::bgr48_to_rgb_u16_row(&src, &mut scalar_out, 17); + unsafe { sse41_bgr48_to_rgb_u16_row::(&src, &mut simd_out, 17) }; + scalar::bgr48_to_rgb_u16_row::(&src, &mut scalar_out, 17); assert_eq!( simd_out, scalar_out, "bgr48→rgb_u16 width=17: SIMD vs scalar mismatch" @@ -234,8 +234,8 @@ fn sse41_bgr48_to_rgba_u16_matches_scalar_width17() { let src = make_rgb48_src(17, 0x4444); let mut simd_out = std::vec![0u16; 17 * 4]; let mut scalar_out = std::vec![0u16; 17 * 4]; - unsafe { sse41_bgr48_to_rgba_u16_row(&src, &mut simd_out, 17) }; - scalar::bgr48_to_rgba_u16_row(&src, &mut scalar_out, 17); + unsafe { sse41_bgr48_to_rgba_u16_row::(&src, &mut simd_out, 17) }; + scalar::bgr48_to_rgba_u16_row::(&src, &mut 
scalar_out, 17); assert_eq!( simd_out, scalar_out, "bgr48→rgba_u16 width=17: SIMD vs scalar mismatch" @@ -255,8 +255,8 @@ fn sse41_rgba64_to_rgb_matches_scalar_width17() { let src = make_rgba64_src(17, 0xAAAA); let mut simd_out = std::vec![0u8; 17 * 3]; let mut scalar_out = std::vec![0u8; 17 * 3]; - unsafe { sse41_rgba64_to_rgb_row(&src, &mut simd_out, 17) }; - scalar::rgba64_to_rgb_row(&src, &mut scalar_out, 17); + unsafe { sse41_rgba64_to_rgb_row::(&src, &mut simd_out, 17) }; + scalar::rgba64_to_rgb_row::(&src, &mut scalar_out, 17); assert_eq!( simd_out, scalar_out, "rgba64→rgb width=17: SIMD vs scalar mismatch" @@ -272,8 +272,8 @@ fn sse41_rgba64_to_rgb_exact8_matches_scalar() { let src = make_rgba64_src(8, 0x0F0F); let mut simd_out = std::vec![0u8; 8 * 3]; let mut scalar_out = std::vec![0u8; 8 * 3]; - unsafe { sse41_rgba64_to_rgb_row(&src, &mut simd_out, 8) }; - scalar::rgba64_to_rgb_row(&src, &mut scalar_out, 8); + unsafe { sse41_rgba64_to_rgb_row::(&src, &mut simd_out, 8) }; + scalar::rgba64_to_rgb_row::(&src, &mut scalar_out, 8); assert_eq!( simd_out, scalar_out, "rgba64→rgb exact-8: SIMD vs scalar mismatch" @@ -293,8 +293,8 @@ fn sse41_rgba64_to_rgba_matches_scalar_width17() { let src = make_rgba64_src(17, 0xBBBB); let mut simd_out = std::vec![0u8; 17 * 4]; let mut scalar_out = std::vec![0u8; 17 * 4]; - unsafe { sse41_rgba64_to_rgba_row(&src, &mut simd_out, 17) }; - scalar::rgba64_to_rgba_row(&src, &mut scalar_out, 17); + unsafe { sse41_rgba64_to_rgba_row::(&src, &mut simd_out, 17) }; + scalar::rgba64_to_rgba_row::(&src, &mut scalar_out, 17); assert_eq!( simd_out, scalar_out, "rgba64→rgba width=17: SIMD vs scalar mismatch" @@ -314,8 +314,8 @@ fn sse41_rgba64_to_rgb_u16_matches_scalar_width17() { let src = make_rgba64_src(17, 0xCCCC); let mut simd_out = std::vec![0u16; 17 * 3]; let mut scalar_out = std::vec![0u16; 17 * 3]; - unsafe { sse41_rgba64_to_rgb_u16_row(&src, &mut simd_out, 17) }; - scalar::rgba64_to_rgb_u16_row(&src, &mut scalar_out, 17); + unsafe { 
sse41_rgba64_to_rgb_u16_row::(&src, &mut simd_out, 17) }; + scalar::rgba64_to_rgb_u16_row::(&src, &mut scalar_out, 17); assert_eq!( simd_out, scalar_out, "rgba64→rgb_u16 width=17: SIMD vs scalar mismatch" @@ -335,8 +335,8 @@ fn sse41_rgba64_to_rgba_u16_matches_scalar_width17() { let src = make_rgba64_src(17, 0xDDDD); let mut simd_out = std::vec![0u16; 17 * 4]; let mut scalar_out = std::vec![0u16; 17 * 4]; - unsafe { sse41_rgba64_to_rgba_u16_row(&src, &mut simd_out, 17) }; - scalar::rgba64_to_rgba_u16_row(&src, &mut scalar_out, 17); + unsafe { sse41_rgba64_to_rgba_u16_row::(&src, &mut simd_out, 17) }; + scalar::rgba64_to_rgba_u16_row::(&src, &mut scalar_out, 17); assert_eq!( simd_out, scalar_out, "rgba64→rgba_u16 width=17: SIMD vs scalar mismatch" @@ -352,8 +352,8 @@ fn sse41_rgba64_to_rgba_u16_width1_tail_only() { let src = [0x1234u16, 0x5678, 0x9ABC, 0xDEF0]; // R, G, B, A let mut simd_out = [0u16; 4]; let mut scalar_out = [0u16; 4]; - unsafe { sse41_rgba64_to_rgba_u16_row(&src, &mut simd_out, 1) }; - scalar::rgba64_to_rgba_u16_row(&src, &mut scalar_out, 1); + unsafe { sse41_rgba64_to_rgba_u16_row::(&src, &mut simd_out, 1) }; + scalar::rgba64_to_rgba_u16_row::(&src, &mut scalar_out, 1); assert_eq!( simd_out, scalar_out, "rgba64→rgba_u16 width=1: tail-only mismatch" @@ -373,8 +373,8 @@ fn sse41_bgra64_to_rgb_matches_scalar_width17() { let src = make_rgba64_src(17, 0x1234); let mut simd_out = std::vec![0u8; 17 * 3]; let mut scalar_out = std::vec![0u8; 17 * 3]; - unsafe { sse41_bgra64_to_rgb_row(&src, &mut simd_out, 17) }; - scalar::bgra64_to_rgb_row(&src, &mut scalar_out, 17); + unsafe { sse41_bgra64_to_rgb_row::(&src, &mut simd_out, 17) }; + scalar::bgra64_to_rgb_row::(&src, &mut scalar_out, 17); assert_eq!( simd_out, scalar_out, "bgra64→rgb width=17: SIMD vs scalar mismatch" @@ -394,8 +394,8 @@ fn sse41_bgra64_to_rgba_matches_scalar_width17() { let src = make_rgba64_src(17, 0x5678); let mut simd_out = std::vec![0u8; 17 * 4]; let mut scalar_out = std::vec![0u8; 17 
* 4]; - unsafe { sse41_bgra64_to_rgba_row(&src, &mut simd_out, 17) }; - scalar::bgra64_to_rgba_row(&src, &mut scalar_out, 17); + unsafe { sse41_bgra64_to_rgba_row::(&src, &mut simd_out, 17) }; + scalar::bgra64_to_rgba_row::(&src, &mut scalar_out, 17); assert_eq!( simd_out, scalar_out, "bgra64→rgba width=17: SIMD vs scalar mismatch" @@ -415,8 +415,8 @@ fn sse41_bgra64_to_rgb_u16_matches_scalar_width17() { let src = make_rgba64_src(17, 0x9ABC); let mut simd_out = std::vec![0u16; 17 * 3]; let mut scalar_out = std::vec![0u16; 17 * 3]; - unsafe { sse41_bgra64_to_rgb_u16_row(&src, &mut simd_out, 17) }; - scalar::bgra64_to_rgb_u16_row(&src, &mut scalar_out, 17); + unsafe { sse41_bgra64_to_rgb_u16_row::(&src, &mut simd_out, 17) }; + scalar::bgra64_to_rgb_u16_row::(&src, &mut scalar_out, 17); assert_eq!( simd_out, scalar_out, "bgra64→rgb_u16 width=17: SIMD vs scalar mismatch" @@ -436,8 +436,8 @@ fn sse41_bgra64_to_rgba_u16_matches_scalar_width17() { let src = make_rgba64_src(17, 0xDEF0); let mut simd_out = std::vec![0u16; 17 * 4]; let mut scalar_out = std::vec![0u16; 17 * 4]; - unsafe { sse41_bgra64_to_rgba_u16_row(&src, &mut simd_out, 17) }; - scalar::bgra64_to_rgba_u16_row(&src, &mut scalar_out, 17); + unsafe { sse41_bgra64_to_rgba_u16_row::(&src, &mut simd_out, 17) }; + scalar::bgra64_to_rgba_u16_row::(&src, &mut scalar_out, 17); assert_eq!( simd_out, scalar_out, "bgra64→rgba_u16 width=17: SIMD vs scalar mismatch" @@ -453,8 +453,8 @@ fn sse41_bgra64_to_rgba_u16_width1_tail_only() { let src = [0x1111u16, 0x2222, 0x3333, 0x4444]; // B, G, R, A let mut simd_out = [0u16; 4]; let mut scalar_out = [0u16; 4]; - unsafe { sse41_bgra64_to_rgba_u16_row(&src, &mut simd_out, 1) }; - scalar::bgra64_to_rgba_u16_row(&src, &mut scalar_out, 1); + unsafe { sse41_bgra64_to_rgba_u16_row::(&src, &mut simd_out, 1) }; + scalar::bgra64_to_rgba_u16_row::(&src, &mut scalar_out, 1); assert_eq!( simd_out, scalar_out, "bgra64→rgba_u16 width=1: tail-only mismatch" diff --git 
a/src/row/dispatch/packed_rgb_16bit.rs b/src/row/dispatch/packed_rgb_16bit.rs index 6e317177..6ceb854a 100644 --- a/src/row/dispatch/packed_rgb_16bit.rs +++ b/src/row/dispatch/packed_rgb_16bit.rs @@ -72,7 +72,12 @@ fn rgba64_packed_elems(width: usize) -> usize { /// Converts one row of `Rgb48` to packed u8 RGB. Each 16-bit channel is /// narrowed via `>> 8`. `use_simd = false` forces the scalar path. #[cfg_attr(not(tarpaulin), inline(always))] -pub fn rgb48_to_rgb_row(rgb48: &[u16], rgb_out: &mut [u8], width: usize, use_simd: bool) { +pub fn rgb48_to_rgb_row( + rgb48: &[u16], + rgb_out: &mut [u8], + width: usize, + use_simd: bool, +) { let in_min = rgb48_packed_elems(width); let out_min = rgb_row_bytes(width); assert!(rgb48.len() >= in_min, "rgb48 row too short"); @@ -81,38 +86,43 @@ pub fn rgb48_to_rgb_row(rgb48: &[u16], rgb_out: &mut [u8], width: usize, use_sim cfg_select! { target_arch = "aarch64" => { if neon_available() { - unsafe { arch::neon::neon_rgb48_to_rgb_row(rgb48, rgb_out, width); } + unsafe { arch::neon::neon_rgb48_to_rgb_row::(rgb48, rgb_out, width); } return; } }, target_arch = "x86_64" => { if avx512_available() { - unsafe { arch::x86_avx512::avx512_rgb48_to_rgb_row(rgb48, rgb_out, width); } + unsafe { arch::x86_avx512::avx512_rgb48_to_rgb_row::(rgb48, rgb_out, width); } return; } if avx2_available() { - unsafe { arch::x86_avx2::avx2_rgb48_to_rgb_row(rgb48, rgb_out, width); } + unsafe { arch::x86_avx2::avx2_rgb48_to_rgb_row::(rgb48, rgb_out, width); } return; } if sse41_available() { - unsafe { arch::x86_sse41::sse41_rgb48_to_rgb_row(rgb48, rgb_out, width); } + unsafe { arch::x86_sse41::sse41_rgb48_to_rgb_row::(rgb48, rgb_out, width); } return; } }, all(target_arch = "wasm32", target_feature = "simd128") => { - unsafe { arch::wasm_simd128::wasm_rgb48_to_rgb_row(rgb48, rgb_out, width); } + unsafe { arch::wasm_simd128::wasm_rgb48_to_rgb_row::(rgb48, rgb_out, width); } return; }, _ => {} } } - scalar::rgb48_to_rgb_row(rgb48, rgb_out, width); + 
scalar::rgb48_to_rgb_row::(rgb48, rgb_out, width); } /// Converts one row of `Rgb48` to packed u8 RGBA. Alpha forced to `0xFF`. /// `use_simd = false` forces the scalar path. #[cfg_attr(not(tarpaulin), inline(always))] -pub fn rgb48_to_rgba_row(rgb48: &[u16], rgba_out: &mut [u8], width: usize, use_simd: bool) { +pub fn rgb48_to_rgba_row( + rgb48: &[u16], + rgba_out: &mut [u8], + width: usize, + use_simd: bool, +) { let in_min = rgb48_packed_elems(width); let out_min = rgba_row_bytes(width); assert!(rgb48.len() >= in_min, "rgb48 row too short"); @@ -121,38 +131,43 @@ pub fn rgb48_to_rgba_row(rgb48: &[u16], rgba_out: &mut [u8], width: usize, use_s cfg_select! { target_arch = "aarch64" => { if neon_available() { - unsafe { arch::neon::neon_rgb48_to_rgba_row(rgb48, rgba_out, width); } + unsafe { arch::neon::neon_rgb48_to_rgba_row::(rgb48, rgba_out, width); } return; } }, target_arch = "x86_64" => { if avx512_available() { - unsafe { arch::x86_avx512::avx512_rgb48_to_rgba_row(rgb48, rgba_out, width); } + unsafe { arch::x86_avx512::avx512_rgb48_to_rgba_row::(rgb48, rgba_out, width); } return; } if avx2_available() { - unsafe { arch::x86_avx2::avx2_rgb48_to_rgba_row(rgb48, rgba_out, width); } + unsafe { arch::x86_avx2::avx2_rgb48_to_rgba_row::(rgb48, rgba_out, width); } return; } if sse41_available() { - unsafe { arch::x86_sse41::sse41_rgb48_to_rgba_row(rgb48, rgba_out, width); } + unsafe { arch::x86_sse41::sse41_rgb48_to_rgba_row::(rgb48, rgba_out, width); } return; } }, all(target_arch = "wasm32", target_feature = "simd128") => { - unsafe { arch::wasm_simd128::wasm_rgb48_to_rgba_row(rgb48, rgba_out, width); } + unsafe { arch::wasm_simd128::wasm_rgb48_to_rgba_row::(rgb48, rgba_out, width); } return; }, _ => {} } } - scalar::rgb48_to_rgba_row(rgb48, rgba_out, width); + scalar::rgb48_to_rgba_row::(rgb48, rgba_out, width); } /// Converts one row of `Rgb48` to native-depth u16 RGB (identity copy). /// `use_simd = false` forces the scalar path. 
#[cfg_attr(not(tarpaulin), inline(always))] -pub fn rgb48_to_rgb_u16_row(rgb48: &[u16], rgb_out: &mut [u16], width: usize, use_simd: bool) { +pub fn rgb48_to_rgb_u16_row( + rgb48: &[u16], + rgb_out: &mut [u16], + width: usize, + use_simd: bool, +) { let in_min = rgb48_packed_elems(width); let out_min = rgb_row_elems(width); assert!(rgb48.len() >= in_min, "rgb48 row too short"); @@ -161,38 +176,43 @@ pub fn rgb48_to_rgb_u16_row(rgb48: &[u16], rgb_out: &mut [u16], width: usize, us cfg_select! { target_arch = "aarch64" => { if neon_available() { - unsafe { arch::neon::neon_rgb48_to_rgb_u16_row(rgb48, rgb_out, width); } + unsafe { arch::neon::neon_rgb48_to_rgb_u16_row::(rgb48, rgb_out, width); } return; } }, target_arch = "x86_64" => { if avx512_available() { - unsafe { arch::x86_avx512::avx512_rgb48_to_rgb_u16_row(rgb48, rgb_out, width); } + unsafe { arch::x86_avx512::avx512_rgb48_to_rgb_u16_row::(rgb48, rgb_out, width); } return; } if avx2_available() { - unsafe { arch::x86_avx2::avx2_rgb48_to_rgb_u16_row(rgb48, rgb_out, width); } + unsafe { arch::x86_avx2::avx2_rgb48_to_rgb_u16_row::(rgb48, rgb_out, width); } return; } if sse41_available() { - unsafe { arch::x86_sse41::sse41_rgb48_to_rgb_u16_row(rgb48, rgb_out, width); } + unsafe { arch::x86_sse41::sse41_rgb48_to_rgb_u16_row::(rgb48, rgb_out, width); } return; } }, all(target_arch = "wasm32", target_feature = "simd128") => { - unsafe { arch::wasm_simd128::wasm_rgb48_to_rgb_u16_row(rgb48, rgb_out, width); } + unsafe { arch::wasm_simd128::wasm_rgb48_to_rgb_u16_row::(rgb48, rgb_out, width); } return; }, _ => {} } } - scalar::rgb48_to_rgb_u16_row(rgb48, rgb_out, width); + scalar::rgb48_to_rgb_u16_row::(rgb48, rgb_out, width); } /// Converts one row of `Rgb48` to native-depth u16 RGBA. Alpha forced to /// `0xFFFF`. `use_simd = false` forces the scalar path. 
#[cfg_attr(not(tarpaulin), inline(always))] -pub fn rgb48_to_rgba_u16_row(rgb48: &[u16], rgba_out: &mut [u16], width: usize, use_simd: bool) { +pub fn rgb48_to_rgba_u16_row( + rgb48: &[u16], + rgba_out: &mut [u16], + width: usize, + use_simd: bool, +) { let in_min = rgb48_packed_elems(width); let out_min = rgba_row_elems(width); assert!(rgb48.len() >= in_min, "rgb48 row too short"); @@ -201,32 +221,32 @@ pub fn rgb48_to_rgba_u16_row(rgb48: &[u16], rgba_out: &mut [u16], width: usize, cfg_select! { target_arch = "aarch64" => { if neon_available() { - unsafe { arch::neon::neon_rgb48_to_rgba_u16_row(rgb48, rgba_out, width); } + unsafe { arch::neon::neon_rgb48_to_rgba_u16_row::(rgb48, rgba_out, width); } return; } }, target_arch = "x86_64" => { if avx512_available() { - unsafe { arch::x86_avx512::avx512_rgb48_to_rgba_u16_row(rgb48, rgba_out, width); } + unsafe { arch::x86_avx512::avx512_rgb48_to_rgba_u16_row::(rgb48, rgba_out, width); } return; } if avx2_available() { - unsafe { arch::x86_avx2::avx2_rgb48_to_rgba_u16_row(rgb48, rgba_out, width); } + unsafe { arch::x86_avx2::avx2_rgb48_to_rgba_u16_row::(rgb48, rgba_out, width); } return; } if sse41_available() { - unsafe { arch::x86_sse41::sse41_rgb48_to_rgba_u16_row(rgb48, rgba_out, width); } + unsafe { arch::x86_sse41::sse41_rgb48_to_rgba_u16_row::(rgb48, rgba_out, width); } return; } }, all(target_arch = "wasm32", target_feature = "simd128") => { - unsafe { arch::wasm_simd128::wasm_rgb48_to_rgba_u16_row(rgb48, rgba_out, width); } + unsafe { arch::wasm_simd128::wasm_rgb48_to_rgba_u16_row::(rgb48, rgba_out, width); } return; }, _ => {} } } - scalar::rgb48_to_rgba_u16_row(rgb48, rgba_out, width); + scalar::rgb48_to_rgba_u16_row::(rgb48, rgba_out, width); } /// Derives 8-bit luma from one row of `Rgb48` source. Narrows to u8 RGB via @@ -234,7 +254,7 @@ pub fn rgb48_to_rgba_u16_row(rgb48: &[u16], rgba_out: &mut [u16], width: usize, /// `rgb_to_luma_row`. `use_simd = false` forces the scalar path for both steps. 
#[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] -pub fn rgb48_to_luma_row( +pub fn rgb48_to_luma_row( rgb48: &[u16], luma_out: &mut [u8], rgb_scratch: &mut [u8], @@ -248,7 +268,7 @@ pub fn rgb48_to_luma_row( assert!(rgb48.len() >= in_min, "rgb48 row too short"); assert!(rgb_scratch.len() >= scratch_min, "rgb_scratch too short"); assert!(luma_out.len() >= width, "luma_out row too short"); - rgb48_to_rgb_row(rgb48, rgb_scratch, width, use_simd); + rgb48_to_rgb_row::(rgb48, rgb_scratch, width, use_simd); scalar::rgb_to_luma_row(rgb_scratch, luma_out, width, matrix, full_range); } @@ -258,7 +278,7 @@ pub fn rgb48_to_luma_row( /// the scalar path for both steps. #[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] -pub fn rgb48_to_luma_u16_row( +pub fn rgb48_to_luma_u16_row( rgb48: &[u16], luma_out: &mut [u16], rgb_scratch: &mut [u8], @@ -272,7 +292,7 @@ pub fn rgb48_to_luma_u16_row( assert!(rgb48.len() >= in_min, "rgb48 row too short"); assert!(rgb_scratch.len() >= scratch_min, "rgb_scratch too short"); assert!(luma_out.len() >= width, "luma_out row too short"); - rgb48_to_rgb_row(rgb48, rgb_scratch, width, use_simd); + rgb48_to_rgb_row::(rgb48, rgb_scratch, width, use_simd); scalar::rgb_to_luma_u16_row(rgb_scratch, luma_out, width, matrix, full_range); } @@ -281,7 +301,7 @@ pub fn rgb48_to_luma_u16_row( /// `rgb_to_hsv_row`. `use_simd = false` forces the scalar path for both steps. 
#[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] -pub fn rgb48_to_hsv_row( +pub fn rgb48_to_hsv_row( rgb48: &[u16], h_out: &mut [u8], s_out: &mut [u8], @@ -297,7 +317,7 @@ pub fn rgb48_to_hsv_row( assert!(h_out.len() >= width, "h_out row too short"); assert!(s_out.len() >= width, "s_out row too short"); assert!(v_out.len() >= width, "v_out row too short"); - rgb48_to_rgb_row(rgb48, rgb_scratch, width, use_simd); + rgb48_to_rgb_row::(rgb48, rgb_scratch, width, use_simd); scalar::rgb_to_hsv_row(rgb_scratch, h_out, s_out, v_out, width); } @@ -308,7 +328,12 @@ pub fn rgb48_to_hsv_row( /// Converts one row of `Bgr48` to packed u8 RGB (B↔R swap, narrow via `>> 8`). /// `use_simd = false` forces the scalar path. #[cfg_attr(not(tarpaulin), inline(always))] -pub fn bgr48_to_rgb_row(bgr48: &[u16], rgb_out: &mut [u8], width: usize, use_simd: bool) { +pub fn bgr48_to_rgb_row( + bgr48: &[u16], + rgb_out: &mut [u8], + width: usize, + use_simd: bool, +) { let in_min = rgb48_packed_elems(width); let out_min = rgb_row_bytes(width); assert!(bgr48.len() >= in_min, "bgr48 row too short"); @@ -317,38 +342,43 @@ pub fn bgr48_to_rgb_row(bgr48: &[u16], rgb_out: &mut [u8], width: usize, use_sim cfg_select! 
{ target_arch = "aarch64" => { if neon_available() { - unsafe { arch::neon::neon_bgr48_to_rgb_row(bgr48, rgb_out, width); } + unsafe { arch::neon::neon_bgr48_to_rgb_row::(bgr48, rgb_out, width); } return; } }, target_arch = "x86_64" => { if avx512_available() { - unsafe { arch::x86_avx512::avx512_bgr48_to_rgb_row(bgr48, rgb_out, width); } + unsafe { arch::x86_avx512::avx512_bgr48_to_rgb_row::(bgr48, rgb_out, width); } return; } if avx2_available() { - unsafe { arch::x86_avx2::avx2_bgr48_to_rgb_row(bgr48, rgb_out, width); } + unsafe { arch::x86_avx2::avx2_bgr48_to_rgb_row::(bgr48, rgb_out, width); } return; } if sse41_available() { - unsafe { arch::x86_sse41::sse41_bgr48_to_rgb_row(bgr48, rgb_out, width); } + unsafe { arch::x86_sse41::sse41_bgr48_to_rgb_row::(bgr48, rgb_out, width); } return; } }, all(target_arch = "wasm32", target_feature = "simd128") => { - unsafe { arch::wasm_simd128::wasm_bgr48_to_rgb_row(bgr48, rgb_out, width); } + unsafe { arch::wasm_simd128::wasm_bgr48_to_rgb_row::(bgr48, rgb_out, width); } return; }, _ => {} } } - scalar::bgr48_to_rgb_row(bgr48, rgb_out, width); + scalar::bgr48_to_rgb_row::(bgr48, rgb_out, width); } /// Converts one row of `Bgr48` to packed u8 RGBA (B↔R swap, alpha forced to /// `0xFF`). `use_simd = false` forces the scalar path. #[cfg_attr(not(tarpaulin), inline(always))] -pub fn bgr48_to_rgba_row(bgr48: &[u16], rgba_out: &mut [u8], width: usize, use_simd: bool) { +pub fn bgr48_to_rgba_row( + bgr48: &[u16], + rgba_out: &mut [u8], + width: usize, + use_simd: bool, +) { let in_min = rgb48_packed_elems(width); let out_min = rgba_row_bytes(width); assert!(bgr48.len() >= in_min, "bgr48 row too short"); @@ -357,38 +387,43 @@ pub fn bgr48_to_rgba_row(bgr48: &[u16], rgba_out: &mut [u8], width: usize, use_s cfg_select! 
{ target_arch = "aarch64" => { if neon_available() { - unsafe { arch::neon::neon_bgr48_to_rgba_row(bgr48, rgba_out, width); } + unsafe { arch::neon::neon_bgr48_to_rgba_row::(bgr48, rgba_out, width); } return; } }, target_arch = "x86_64" => { if avx512_available() { - unsafe { arch::x86_avx512::avx512_bgr48_to_rgba_row(bgr48, rgba_out, width); } + unsafe { arch::x86_avx512::avx512_bgr48_to_rgba_row::(bgr48, rgba_out, width); } return; } if avx2_available() { - unsafe { arch::x86_avx2::avx2_bgr48_to_rgba_row(bgr48, rgba_out, width); } + unsafe { arch::x86_avx2::avx2_bgr48_to_rgba_row::(bgr48, rgba_out, width); } return; } if sse41_available() { - unsafe { arch::x86_sse41::sse41_bgr48_to_rgba_row(bgr48, rgba_out, width); } + unsafe { arch::x86_sse41::sse41_bgr48_to_rgba_row::(bgr48, rgba_out, width); } return; } }, all(target_arch = "wasm32", target_feature = "simd128") => { - unsafe { arch::wasm_simd128::wasm_bgr48_to_rgba_row(bgr48, rgba_out, width); } + unsafe { arch::wasm_simd128::wasm_bgr48_to_rgba_row::(bgr48, rgba_out, width); } return; }, _ => {} } } - scalar::bgr48_to_rgba_row(bgr48, rgba_out, width); + scalar::bgr48_to_rgba_row::(bgr48, rgba_out, width); } /// Converts one row of `Bgr48` to native-depth u16 RGB (B↔R swap, values /// unchanged). `use_simd = false` forces the scalar path. #[cfg_attr(not(tarpaulin), inline(always))] -pub fn bgr48_to_rgb_u16_row(bgr48: &[u16], rgb_out: &mut [u16], width: usize, use_simd: bool) { +pub fn bgr48_to_rgb_u16_row( + bgr48: &[u16], + rgb_out: &mut [u16], + width: usize, + use_simd: bool, +) { let in_min = rgb48_packed_elems(width); let out_min = rgb_row_elems(width); assert!(bgr48.len() >= in_min, "bgr48 row too short"); @@ -397,38 +432,43 @@ pub fn bgr48_to_rgb_u16_row(bgr48: &[u16], rgb_out: &mut [u16], width: usize, us cfg_select! 
{ target_arch = "aarch64" => { if neon_available() { - unsafe { arch::neon::neon_bgr48_to_rgb_u16_row(bgr48, rgb_out, width); } + unsafe { arch::neon::neon_bgr48_to_rgb_u16_row::(bgr48, rgb_out, width); } return; } }, target_arch = "x86_64" => { if avx512_available() { - unsafe { arch::x86_avx512::avx512_bgr48_to_rgb_u16_row(bgr48, rgb_out, width); } + unsafe { arch::x86_avx512::avx512_bgr48_to_rgb_u16_row::(bgr48, rgb_out, width); } return; } if avx2_available() { - unsafe { arch::x86_avx2::avx2_bgr48_to_rgb_u16_row(bgr48, rgb_out, width); } + unsafe { arch::x86_avx2::avx2_bgr48_to_rgb_u16_row::(bgr48, rgb_out, width); } return; } if sse41_available() { - unsafe { arch::x86_sse41::sse41_bgr48_to_rgb_u16_row(bgr48, rgb_out, width); } + unsafe { arch::x86_sse41::sse41_bgr48_to_rgb_u16_row::(bgr48, rgb_out, width); } return; } }, all(target_arch = "wasm32", target_feature = "simd128") => { - unsafe { arch::wasm_simd128::wasm_bgr48_to_rgb_u16_row(bgr48, rgb_out, width); } + unsafe { arch::wasm_simd128::wasm_bgr48_to_rgb_u16_row::(bgr48, rgb_out, width); } return; }, _ => {} } } - scalar::bgr48_to_rgb_u16_row(bgr48, rgb_out, width); + scalar::bgr48_to_rgb_u16_row::(bgr48, rgb_out, width); } /// Converts one row of `Bgr48` to native-depth u16 RGBA (B↔R swap, alpha /// forced to `0xFFFF`). `use_simd = false` forces the scalar path. #[cfg_attr(not(tarpaulin), inline(always))] -pub fn bgr48_to_rgba_u16_row(bgr48: &[u16], rgba_out: &mut [u16], width: usize, use_simd: bool) { +pub fn bgr48_to_rgba_u16_row( + bgr48: &[u16], + rgba_out: &mut [u16], + width: usize, + use_simd: bool, +) { let in_min = rgb48_packed_elems(width); let out_min = rgba_row_elems(width); assert!(bgr48.len() >= in_min, "bgr48 row too short"); @@ -437,39 +477,39 @@ pub fn bgr48_to_rgba_u16_row(bgr48: &[u16], rgba_out: &mut [u16], width: usize, cfg_select! 
{ target_arch = "aarch64" => { if neon_available() { - unsafe { arch::neon::neon_bgr48_to_rgba_u16_row(bgr48, rgba_out, width); } + unsafe { arch::neon::neon_bgr48_to_rgba_u16_row::(bgr48, rgba_out, width); } return; } }, target_arch = "x86_64" => { if avx512_available() { - unsafe { arch::x86_avx512::avx512_bgr48_to_rgba_u16_row(bgr48, rgba_out, width); } + unsafe { arch::x86_avx512::avx512_bgr48_to_rgba_u16_row::(bgr48, rgba_out, width); } return; } if avx2_available() { - unsafe { arch::x86_avx2::avx2_bgr48_to_rgba_u16_row(bgr48, rgba_out, width); } + unsafe { arch::x86_avx2::avx2_bgr48_to_rgba_u16_row::(bgr48, rgba_out, width); } return; } if sse41_available() { - unsafe { arch::x86_sse41::sse41_bgr48_to_rgba_u16_row(bgr48, rgba_out, width); } + unsafe { arch::x86_sse41::sse41_bgr48_to_rgba_u16_row::(bgr48, rgba_out, width); } return; } }, all(target_arch = "wasm32", target_feature = "simd128") => { - unsafe { arch::wasm_simd128::wasm_bgr48_to_rgba_u16_row(bgr48, rgba_out, width); } + unsafe { arch::wasm_simd128::wasm_bgr48_to_rgba_u16_row::(bgr48, rgba_out, width); } return; }, _ => {} } } - scalar::bgr48_to_rgba_u16_row(bgr48, rgba_out, width); + scalar::bgr48_to_rgba_u16_row::(bgr48, rgba_out, width); } /// Derives 8-bit luma from one row of `Bgr48` source. Narrows to u8 RGB via /// `bgr48_to_rgb_row` into `rgb_scratch`, then applies `rgb_to_luma_row`. 
#[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] -pub fn bgr48_to_luma_row( +pub fn bgr48_to_luma_row( bgr48: &[u16], luma_out: &mut [u8], rgb_scratch: &mut [u8], @@ -483,7 +523,7 @@ pub fn bgr48_to_luma_row( assert!(bgr48.len() >= in_min, "bgr48 row too short"); assert!(rgb_scratch.len() >= scratch_min, "rgb_scratch too short"); assert!(luma_out.len() >= width, "luma_out row too short"); - bgr48_to_rgb_row(bgr48, rgb_scratch, width, use_simd); + bgr48_to_rgb_row::(bgr48, rgb_scratch, width, use_simd); scalar::rgb_to_luma_row(rgb_scratch, luma_out, width, matrix, full_range); } @@ -491,7 +531,7 @@ pub fn bgr48_to_luma_row( /// `bgr48_to_rgb_row` into `rgb_scratch`, then applies `rgb_to_luma_u16_row`. #[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] -pub fn bgr48_to_luma_u16_row( +pub fn bgr48_to_luma_u16_row( bgr48: &[u16], luma_out: &mut [u16], rgb_scratch: &mut [u8], @@ -505,7 +545,7 @@ pub fn bgr48_to_luma_u16_row( assert!(bgr48.len() >= in_min, "bgr48 row too short"); assert!(rgb_scratch.len() >= scratch_min, "rgb_scratch too short"); assert!(luma_out.len() >= width, "luma_out row too short"); - bgr48_to_rgb_row(bgr48, rgb_scratch, width, use_simd); + bgr48_to_rgb_row::(bgr48, rgb_scratch, width, use_simd); scalar::rgb_to_luma_u16_row(rgb_scratch, luma_out, width, matrix, full_range); } @@ -513,7 +553,7 @@ pub fn bgr48_to_luma_u16_row( /// `bgr48_to_rgb_row` into `rgb_scratch`, then applies `rgb_to_hsv_row`. 
#[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] -pub fn bgr48_to_hsv_row( +pub fn bgr48_to_hsv_row( bgr48: &[u16], h_out: &mut [u8], s_out: &mut [u8], @@ -529,7 +569,7 @@ pub fn bgr48_to_hsv_row( assert!(h_out.len() >= width, "h_out row too short"); assert!(s_out.len() >= width, "s_out row too short"); assert!(v_out.len() >= width, "v_out row too short"); - bgr48_to_rgb_row(bgr48, rgb_scratch, width, use_simd); + bgr48_to_rgb_row::(bgr48, rgb_scratch, width, use_simd); scalar::rgb_to_hsv_row(rgb_scratch, h_out, s_out, v_out, width); } @@ -540,7 +580,12 @@ pub fn bgr48_to_hsv_row( /// Converts one row of `Rgba64` to packed u8 RGB. Source alpha is discarded; /// R/G/B narrowed via `>> 8`. `use_simd = false` forces the scalar path. #[cfg_attr(not(tarpaulin), inline(always))] -pub fn rgba64_to_rgb_row(rgba64: &[u16], rgb_out: &mut [u8], width: usize, use_simd: bool) { +pub fn rgba64_to_rgb_row( + rgba64: &[u16], + rgb_out: &mut [u8], + width: usize, + use_simd: bool, +) { let in_min = rgba64_packed_elems(width); let out_min = rgb_row_bytes(width); assert!(rgba64.len() >= in_min, "rgba64 row too short"); @@ -549,38 +594,43 @@ pub fn rgba64_to_rgb_row(rgba64: &[u16], rgb_out: &mut [u8], width: usize, use_s cfg_select! 
{ target_arch = "aarch64" => { if neon_available() { - unsafe { arch::neon::neon_rgba64_to_rgb_row(rgba64, rgb_out, width); } + unsafe { arch::neon::neon_rgba64_to_rgb_row::(rgba64, rgb_out, width); } return; } }, target_arch = "x86_64" => { if avx512_available() { - unsafe { arch::x86_avx512::avx512_rgba64_to_rgb_row(rgba64, rgb_out, width); } + unsafe { arch::x86_avx512::avx512_rgba64_to_rgb_row::(rgba64, rgb_out, width); } return; } if avx2_available() { - unsafe { arch::x86_avx2::avx2_rgba64_to_rgb_row(rgba64, rgb_out, width); } + unsafe { arch::x86_avx2::avx2_rgba64_to_rgb_row::(rgba64, rgb_out, width); } return; } if sse41_available() { - unsafe { arch::x86_sse41::sse41_rgba64_to_rgb_row(rgba64, rgb_out, width); } + unsafe { arch::x86_sse41::sse41_rgba64_to_rgb_row::(rgba64, rgb_out, width); } return; } }, all(target_arch = "wasm32", target_feature = "simd128") => { - unsafe { arch::wasm_simd128::wasm_rgba64_to_rgb_row(rgba64, rgb_out, width); } + unsafe { arch::wasm_simd128::wasm_rgba64_to_rgb_row::(rgba64, rgb_out, width); } return; }, _ => {} } } - scalar::rgba64_to_rgb_row(rgba64, rgb_out, width); + scalar::rgba64_to_rgb_row::(rgba64, rgb_out, width); } /// Converts one row of `Rgba64` to packed u8 RGBA. All 4 channels narrowed via /// `>> 8`; source alpha passes through. `use_simd = false` forces the scalar path. #[cfg_attr(not(tarpaulin), inline(always))] -pub fn rgba64_to_rgba_row(rgba64: &[u16], rgba_out: &mut [u8], width: usize, use_simd: bool) { +pub fn rgba64_to_rgba_row( + rgba64: &[u16], + rgba_out: &mut [u8], + width: usize, + use_simd: bool, +) { let in_min = rgba64_packed_elems(width); let out_min = rgba_row_bytes(width); assert!(rgba64.len() >= in_min, "rgba64 row too short"); @@ -589,38 +639,43 @@ pub fn rgba64_to_rgba_row(rgba64: &[u16], rgba_out: &mut [u8], width: usize, use cfg_select! 
{ target_arch = "aarch64" => { if neon_available() { - unsafe { arch::neon::neon_rgba64_to_rgba_row(rgba64, rgba_out, width); } + unsafe { arch::neon::neon_rgba64_to_rgba_row::(rgba64, rgba_out, width); } return; } }, target_arch = "x86_64" => { if avx512_available() { - unsafe { arch::x86_avx512::avx512_rgba64_to_rgba_row(rgba64, rgba_out, width); } + unsafe { arch::x86_avx512::avx512_rgba64_to_rgba_row::(rgba64, rgba_out, width); } return; } if avx2_available() { - unsafe { arch::x86_avx2::avx2_rgba64_to_rgba_row(rgba64, rgba_out, width); } + unsafe { arch::x86_avx2::avx2_rgba64_to_rgba_row::(rgba64, rgba_out, width); } return; } if sse41_available() { - unsafe { arch::x86_sse41::sse41_rgba64_to_rgba_row(rgba64, rgba_out, width); } + unsafe { arch::x86_sse41::sse41_rgba64_to_rgba_row::(rgba64, rgba_out, width); } return; } }, all(target_arch = "wasm32", target_feature = "simd128") => { - unsafe { arch::wasm_simd128::wasm_rgba64_to_rgba_row(rgba64, rgba_out, width); } + unsafe { arch::wasm_simd128::wasm_rgba64_to_rgba_row::(rgba64, rgba_out, width); } return; }, _ => {} } } - scalar::rgba64_to_rgba_row(rgba64, rgba_out, width); + scalar::rgba64_to_rgba_row::(rgba64, rgba_out, width); } /// Converts one row of `Rgba64` to native-depth u16 RGB. Source alpha /// discarded; R/G/B copied as-is. `use_simd = false` forces the scalar path. #[cfg_attr(not(tarpaulin), inline(always))] -pub fn rgba64_to_rgb_u16_row(rgba64: &[u16], rgb_out: &mut [u16], width: usize, use_simd: bool) { +pub fn rgba64_to_rgb_u16_row( + rgba64: &[u16], + rgb_out: &mut [u16], + width: usize, + use_simd: bool, +) { let in_min = rgba64_packed_elems(width); let out_min = rgb_row_elems(width); assert!(rgba64.len() >= in_min, "rgba64 row too short"); @@ -629,38 +684,43 @@ pub fn rgba64_to_rgb_u16_row(rgba64: &[u16], rgb_out: &mut [u16], width: usize, cfg_select! 
{ target_arch = "aarch64" => { if neon_available() { - unsafe { arch::neon::neon_rgba64_to_rgb_u16_row(rgba64, rgb_out, width); } + unsafe { arch::neon::neon_rgba64_to_rgb_u16_row::(rgba64, rgb_out, width); } return; } }, target_arch = "x86_64" => { if avx512_available() { - unsafe { arch::x86_avx512::avx512_rgba64_to_rgb_u16_row(rgba64, rgb_out, width); } + unsafe { arch::x86_avx512::avx512_rgba64_to_rgb_u16_row::(rgba64, rgb_out, width); } return; } if avx2_available() { - unsafe { arch::x86_avx2::avx2_rgba64_to_rgb_u16_row(rgba64, rgb_out, width); } + unsafe { arch::x86_avx2::avx2_rgba64_to_rgb_u16_row::(rgba64, rgb_out, width); } return; } if sse41_available() { - unsafe { arch::x86_sse41::sse41_rgba64_to_rgb_u16_row(rgba64, rgb_out, width); } + unsafe { arch::x86_sse41::sse41_rgba64_to_rgb_u16_row::(rgba64, rgb_out, width); } return; } }, all(target_arch = "wasm32", target_feature = "simd128") => { - unsafe { arch::wasm_simd128::wasm_rgba64_to_rgb_u16_row(rgba64, rgb_out, width); } + unsafe { arch::wasm_simd128::wasm_rgba64_to_rgb_u16_row::(rgba64, rgb_out, width); } return; }, _ => {} } } - scalar::rgba64_to_rgb_u16_row(rgba64, rgb_out, width); + scalar::rgba64_to_rgb_u16_row::(rgba64, rgb_out, width); } /// Converts one row of `Rgba64` to native-depth u16 RGBA (identity copy of all /// 4 channels; source alpha preserved). `use_simd = false` forces the scalar path. #[cfg_attr(not(tarpaulin), inline(always))] -pub fn rgba64_to_rgba_u16_row(rgba64: &[u16], rgba_out: &mut [u16], width: usize, use_simd: bool) { +pub fn rgba64_to_rgba_u16_row( + rgba64: &[u16], + rgba_out: &mut [u16], + width: usize, + use_simd: bool, +) { let in_min = rgba64_packed_elems(width); let out_min = rgba_row_elems(width); assert!(rgba64.len() >= in_min, "rgba64 row too short"); @@ -669,32 +729,32 @@ pub fn rgba64_to_rgba_u16_row(rgba64: &[u16], rgba_out: &mut [u16], width: usize cfg_select! 
{ target_arch = "aarch64" => { if neon_available() { - unsafe { arch::neon::neon_rgba64_to_rgba_u16_row(rgba64, rgba_out, width); } + unsafe { arch::neon::neon_rgba64_to_rgba_u16_row::(rgba64, rgba_out, width); } return; } }, target_arch = "x86_64" => { if avx512_available() { - unsafe { arch::x86_avx512::avx512_rgba64_to_rgba_u16_row(rgba64, rgba_out, width); } + unsafe { arch::x86_avx512::avx512_rgba64_to_rgba_u16_row::(rgba64, rgba_out, width); } return; } if avx2_available() { - unsafe { arch::x86_avx2::avx2_rgba64_to_rgba_u16_row(rgba64, rgba_out, width); } + unsafe { arch::x86_avx2::avx2_rgba64_to_rgba_u16_row::(rgba64, rgba_out, width); } return; } if sse41_available() { - unsafe { arch::x86_sse41::sse41_rgba64_to_rgba_u16_row(rgba64, rgba_out, width); } + unsafe { arch::x86_sse41::sse41_rgba64_to_rgba_u16_row::(rgba64, rgba_out, width); } return; } }, all(target_arch = "wasm32", target_feature = "simd128") => { - unsafe { arch::wasm_simd128::wasm_rgba64_to_rgba_u16_row(rgba64, rgba_out, width); } + unsafe { arch::wasm_simd128::wasm_rgba64_to_rgba_u16_row::(rgba64, rgba_out, width); } return; }, _ => {} } } - scalar::rgba64_to_rgba_u16_row(rgba64, rgba_out, width); + scalar::rgba64_to_rgba_u16_row::(rgba64, rgba_out, width); } /// Derives 8-bit luma from one row of `Rgba64` source. Narrows to u8 RGB via @@ -702,7 +762,7 @@ pub fn rgba64_to_rgba_u16_row(rgba64: &[u16], rgba_out: &mut [u16], width: usize /// Source alpha is discarded. 
#[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] -pub fn rgba64_to_luma_row( +pub fn rgba64_to_luma_row( rgba64: &[u16], luma_out: &mut [u8], rgb_scratch: &mut [u8], @@ -716,7 +776,7 @@ pub fn rgba64_to_luma_row( assert!(rgba64.len() >= in_min, "rgba64 row too short"); assert!(rgb_scratch.len() >= scratch_min, "rgb_scratch too short"); assert!(luma_out.len() >= width, "luma_out row too short"); - rgba64_to_rgb_row(rgba64, rgb_scratch, width, use_simd); + rgba64_to_rgb_row::(rgba64, rgb_scratch, width, use_simd); scalar::rgb_to_luma_row(rgb_scratch, luma_out, width, matrix, full_range); } @@ -725,7 +785,7 @@ pub fn rgba64_to_luma_row( /// Source alpha is discarded. #[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] -pub fn rgba64_to_luma_u16_row( +pub fn rgba64_to_luma_u16_row( rgba64: &[u16], luma_out: &mut [u16], rgb_scratch: &mut [u8], @@ -739,7 +799,7 @@ pub fn rgba64_to_luma_u16_row( assert!(rgba64.len() >= in_min, "rgba64 row too short"); assert!(rgb_scratch.len() >= scratch_min, "rgb_scratch too short"); assert!(luma_out.len() >= width, "luma_out row too short"); - rgba64_to_rgb_row(rgba64, rgb_scratch, width, use_simd); + rgba64_to_rgb_row::(rgba64, rgb_scratch, width, use_simd); scalar::rgb_to_luma_u16_row(rgb_scratch, luma_out, width, matrix, full_range); } @@ -748,7 +808,7 @@ pub fn rgba64_to_luma_u16_row( /// Source alpha is discarded. 
#[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] -pub fn rgba64_to_hsv_row( +pub fn rgba64_to_hsv_row( rgba64: &[u16], h_out: &mut [u8], s_out: &mut [u8], @@ -764,7 +824,7 @@ pub fn rgba64_to_hsv_row( assert!(h_out.len() >= width, "h_out row too short"); assert!(s_out.len() >= width, "s_out row too short"); assert!(v_out.len() >= width, "v_out row too short"); - rgba64_to_rgb_row(rgba64, rgb_scratch, width, use_simd); + rgba64_to_rgb_row::(rgba64, rgb_scratch, width, use_simd); scalar::rgb_to_hsv_row(rgb_scratch, h_out, s_out, v_out, width); } @@ -775,7 +835,12 @@ pub fn rgba64_to_hsv_row( /// Converts one row of `Bgra64` to packed u8 RGB (B↔R swap, drop alpha, /// narrow via `>> 8`). `use_simd = false` forces the scalar path. #[cfg_attr(not(tarpaulin), inline(always))] -pub fn bgra64_to_rgb_row(bgra64: &[u16], rgb_out: &mut [u8], width: usize, use_simd: bool) { +pub fn bgra64_to_rgb_row( + bgra64: &[u16], + rgb_out: &mut [u8], + width: usize, + use_simd: bool, +) { let in_min = rgba64_packed_elems(width); let out_min = rgb_row_bytes(width); assert!(bgra64.len() >= in_min, "bgra64 row too short"); @@ -784,39 +849,44 @@ pub fn bgra64_to_rgb_row(bgra64: &[u16], rgb_out: &mut [u8], width: usize, use_s cfg_select! 
{ target_arch = "aarch64" => { if neon_available() { - unsafe { arch::neon::neon_bgra64_to_rgb_row(bgra64, rgb_out, width); } + unsafe { arch::neon::neon_bgra64_to_rgb_row::(bgra64, rgb_out, width); } return; } }, target_arch = "x86_64" => { if avx512_available() { - unsafe { arch::x86_avx512::avx512_bgra64_to_rgb_row(bgra64, rgb_out, width); } + unsafe { arch::x86_avx512::avx512_bgra64_to_rgb_row::(bgra64, rgb_out, width); } return; } if avx2_available() { - unsafe { arch::x86_avx2::avx2_bgra64_to_rgb_row(bgra64, rgb_out, width); } + unsafe { arch::x86_avx2::avx2_bgra64_to_rgb_row::(bgra64, rgb_out, width); } return; } if sse41_available() { - unsafe { arch::x86_sse41::sse41_bgra64_to_rgb_row(bgra64, rgb_out, width); } + unsafe { arch::x86_sse41::sse41_bgra64_to_rgb_row::(bgra64, rgb_out, width); } return; } }, all(target_arch = "wasm32", target_feature = "simd128") => { - unsafe { arch::wasm_simd128::wasm_bgra64_to_rgb_row(bgra64, rgb_out, width); } + unsafe { arch::wasm_simd128::wasm_bgra64_to_rgb_row::(bgra64, rgb_out, width); } return; }, _ => {} } } - scalar::bgra64_to_rgb_row(bgra64, rgb_out, width); + scalar::bgra64_to_rgb_row::(bgra64, rgb_out, width); } /// Converts one row of `Bgra64` to packed u8 RGBA (B↔R swap, all 4 channels /// narrowed via `>> 8`; source alpha passes through). `use_simd = false` forces /// the scalar path. #[cfg_attr(not(tarpaulin), inline(always))] -pub fn bgra64_to_rgba_row(bgra64: &[u16], rgba_out: &mut [u8], width: usize, use_simd: bool) { +pub fn bgra64_to_rgba_row( + bgra64: &[u16], + rgba_out: &mut [u8], + width: usize, + use_simd: bool, +) { let in_min = rgba64_packed_elems(width); let out_min = rgba_row_bytes(width); assert!(bgra64.len() >= in_min, "bgra64 row too short"); @@ -825,38 +895,43 @@ pub fn bgra64_to_rgba_row(bgra64: &[u16], rgba_out: &mut [u8], width: usize, use cfg_select! 
{ target_arch = "aarch64" => { if neon_available() { - unsafe { arch::neon::neon_bgra64_to_rgba_row(bgra64, rgba_out, width); } + unsafe { arch::neon::neon_bgra64_to_rgba_row::(bgra64, rgba_out, width); } return; } }, target_arch = "x86_64" => { if avx512_available() { - unsafe { arch::x86_avx512::avx512_bgra64_to_rgba_row(bgra64, rgba_out, width); } + unsafe { arch::x86_avx512::avx512_bgra64_to_rgba_row::(bgra64, rgba_out, width); } return; } if avx2_available() { - unsafe { arch::x86_avx2::avx2_bgra64_to_rgba_row(bgra64, rgba_out, width); } + unsafe { arch::x86_avx2::avx2_bgra64_to_rgba_row::(bgra64, rgba_out, width); } return; } if sse41_available() { - unsafe { arch::x86_sse41::sse41_bgra64_to_rgba_row(bgra64, rgba_out, width); } + unsafe { arch::x86_sse41::sse41_bgra64_to_rgba_row::(bgra64, rgba_out, width); } return; } }, all(target_arch = "wasm32", target_feature = "simd128") => { - unsafe { arch::wasm_simd128::wasm_bgra64_to_rgba_row(bgra64, rgba_out, width); } + unsafe { arch::wasm_simd128::wasm_bgra64_to_rgba_row::(bgra64, rgba_out, width); } return; }, _ => {} } } - scalar::bgra64_to_rgba_row(bgra64, rgba_out, width); + scalar::bgra64_to_rgba_row::(bgra64, rgba_out, width); } /// Converts one row of `Bgra64` to native-depth u16 RGB (B↔R swap, drop alpha, /// values copied as-is). `use_simd = false` forces the scalar path. #[cfg_attr(not(tarpaulin), inline(always))] -pub fn bgra64_to_rgb_u16_row(bgra64: &[u16], rgb_out: &mut [u16], width: usize, use_simd: bool) { +pub fn bgra64_to_rgb_u16_row( + bgra64: &[u16], + rgb_out: &mut [u16], + width: usize, + use_simd: bool, +) { let in_min = rgba64_packed_elems(width); let out_min = rgb_row_elems(width); assert!(bgra64.len() >= in_min, "bgra64 row too short"); @@ -865,38 +940,43 @@ pub fn bgra64_to_rgb_u16_row(bgra64: &[u16], rgb_out: &mut [u16], width: usize, cfg_select! 
{ target_arch = "aarch64" => { if neon_available() { - unsafe { arch::neon::neon_bgra64_to_rgb_u16_row(bgra64, rgb_out, width); } + unsafe { arch::neon::neon_bgra64_to_rgb_u16_row::(bgra64, rgb_out, width); } return; } }, target_arch = "x86_64" => { if avx512_available() { - unsafe { arch::x86_avx512::avx512_bgra64_to_rgb_u16_row(bgra64, rgb_out, width); } + unsafe { arch::x86_avx512::avx512_bgra64_to_rgb_u16_row::(bgra64, rgb_out, width); } return; } if avx2_available() { - unsafe { arch::x86_avx2::avx2_bgra64_to_rgb_u16_row(bgra64, rgb_out, width); } + unsafe { arch::x86_avx2::avx2_bgra64_to_rgb_u16_row::(bgra64, rgb_out, width); } return; } if sse41_available() { - unsafe { arch::x86_sse41::sse41_bgra64_to_rgb_u16_row(bgra64, rgb_out, width); } + unsafe { arch::x86_sse41::sse41_bgra64_to_rgb_u16_row::(bgra64, rgb_out, width); } return; } }, all(target_arch = "wasm32", target_feature = "simd128") => { - unsafe { arch::wasm_simd128::wasm_bgra64_to_rgb_u16_row(bgra64, rgb_out, width); } + unsafe { arch::wasm_simd128::wasm_bgra64_to_rgb_u16_row::(bgra64, rgb_out, width); } return; }, _ => {} } } - scalar::bgra64_to_rgb_u16_row(bgra64, rgb_out, width); + scalar::bgra64_to_rgb_u16_row::(bgra64, rgb_out, width); } /// Converts one row of `Bgra64` to native-depth u16 RGBA (B↔R swap; source /// alpha preserved at position 3). `use_simd = false` forces the scalar path. #[cfg_attr(not(tarpaulin), inline(always))] -pub fn bgra64_to_rgba_u16_row(bgra64: &[u16], rgba_out: &mut [u16], width: usize, use_simd: bool) { +pub fn bgra64_to_rgba_u16_row( + bgra64: &[u16], + rgba_out: &mut [u16], + width: usize, + use_simd: bool, +) { let in_min = rgba64_packed_elems(width); let out_min = rgba_row_elems(width); assert!(bgra64.len() >= in_min, "bgra64 row too short"); @@ -905,32 +985,32 @@ pub fn bgra64_to_rgba_u16_row(bgra64: &[u16], rgba_out: &mut [u16], width: usize cfg_select! 
{ target_arch = "aarch64" => { if neon_available() { - unsafe { arch::neon::neon_bgra64_to_rgba_u16_row(bgra64, rgba_out, width); } + unsafe { arch::neon::neon_bgra64_to_rgba_u16_row::(bgra64, rgba_out, width); } return; } }, target_arch = "x86_64" => { if avx512_available() { - unsafe { arch::x86_avx512::avx512_bgra64_to_rgba_u16_row(bgra64, rgba_out, width); } + unsafe { arch::x86_avx512::avx512_bgra64_to_rgba_u16_row::(bgra64, rgba_out, width); } return; } if avx2_available() { - unsafe { arch::x86_avx2::avx2_bgra64_to_rgba_u16_row(bgra64, rgba_out, width); } + unsafe { arch::x86_avx2::avx2_bgra64_to_rgba_u16_row::(bgra64, rgba_out, width); } return; } if sse41_available() { - unsafe { arch::x86_sse41::sse41_bgra64_to_rgba_u16_row(bgra64, rgba_out, width); } + unsafe { arch::x86_sse41::sse41_bgra64_to_rgba_u16_row::(bgra64, rgba_out, width); } return; } }, all(target_arch = "wasm32", target_feature = "simd128") => { - unsafe { arch::wasm_simd128::wasm_bgra64_to_rgba_u16_row(bgra64, rgba_out, width); } + unsafe { arch::wasm_simd128::wasm_bgra64_to_rgba_u16_row::(bgra64, rgba_out, width); } return; }, _ => {} } } - scalar::bgra64_to_rgba_u16_row(bgra64, rgba_out, width); + scalar::bgra64_to_rgba_u16_row::(bgra64, rgba_out, width); } /// Derives 8-bit luma from one row of `Bgra64` source. Narrows to u8 RGB via @@ -938,7 +1018,7 @@ pub fn bgra64_to_rgba_u16_row(bgra64: &[u16], rgba_out: &mut [u16], width: usize /// Source alpha is discarded. 
#[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] -pub fn bgra64_to_luma_row( +pub fn bgra64_to_luma_row( bgra64: &[u16], luma_out: &mut [u8], rgb_scratch: &mut [u8], @@ -952,7 +1032,7 @@ pub fn bgra64_to_luma_row( assert!(bgra64.len() >= in_min, "bgra64 row too short"); assert!(rgb_scratch.len() >= scratch_min, "rgb_scratch too short"); assert!(luma_out.len() >= width, "luma_out row too short"); - bgra64_to_rgb_row(bgra64, rgb_scratch, width, use_simd); + bgra64_to_rgb_row::(bgra64, rgb_scratch, width, use_simd); scalar::rgb_to_luma_row(rgb_scratch, luma_out, width, matrix, full_range); } @@ -961,7 +1041,7 @@ pub fn bgra64_to_luma_row( /// Source alpha is discarded. #[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] -pub fn bgra64_to_luma_u16_row( +pub fn bgra64_to_luma_u16_row( bgra64: &[u16], luma_out: &mut [u16], rgb_scratch: &mut [u8], @@ -975,7 +1055,7 @@ pub fn bgra64_to_luma_u16_row( assert!(bgra64.len() >= in_min, "bgra64 row too short"); assert!(rgb_scratch.len() >= scratch_min, "rgb_scratch too short"); assert!(luma_out.len() >= width, "luma_out row too short"); - bgra64_to_rgb_row(bgra64, rgb_scratch, width, use_simd); + bgra64_to_rgb_row::(bgra64, rgb_scratch, width, use_simd); scalar::rgb_to_luma_u16_row(rgb_scratch, luma_out, width, matrix, full_range); } @@ -984,7 +1064,7 @@ pub fn bgra64_to_luma_u16_row( /// Source alpha is discarded. 
#[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] -pub fn bgra64_to_hsv_row( +pub fn bgra64_to_hsv_row( bgra64: &[u16], h_out: &mut [u8], s_out: &mut [u8], @@ -1000,7 +1080,7 @@ pub fn bgra64_to_hsv_row( assert!(h_out.len() >= width, "h_out row too short"); assert!(s_out.len() >= width, "s_out row too short"); assert!(v_out.len() >= width, "v_out row too short"); - bgra64_to_rgb_row(bgra64, rgb_scratch, width, use_simd); + bgra64_to_rgb_row::(bgra64, rgb_scratch, width, use_simd); scalar::rgb_to_hsv_row(rgb_scratch, h_out, s_out, v_out, width); } @@ -1035,7 +1115,7 @@ mod tests { // All-white Rgb48: each u16 channel = 0xFFFF; narrowed >> 8 = 0xFF. let src = solid_rgb48(4, 0xFFFF); let mut rgb = std::vec![0u8; 4 * 3]; - rgb48_to_rgb_row(&src, &mut rgb, 4, false); + rgb48_to_rgb_row::(&src, &mut rgb, 4, false); assert!( rgb.iter().all(|&v| v == 0xFF), "expected all 0xFF, got {rgb:?}" @@ -1046,7 +1126,7 @@ mod tests { fn rgb48_dispatcher_to_rgba_scalar_path() { let src = solid_rgb48(4, 0x1200); let mut rgba = std::vec![0u8; 4 * 4]; - rgb48_to_rgba_row(&src, &mut rgba, 4, false); + rgb48_to_rgba_row::(&src, &mut rgba, 4, false); for px in rgba.chunks(4) { assert_eq!(px[0], 0x12, "R channel"); assert_eq!(px[3], 0xFF, "alpha forced to 0xFF"); @@ -1057,7 +1137,7 @@ mod tests { fn rgb48_dispatcher_to_rgb_u16_scalar_path() { let src = solid_rgb48(4, 0xABCD); let mut rgb_u16 = std::vec![0u16; 4 * 3]; - rgb48_to_rgb_u16_row(&src, &mut rgb_u16, 4, false); + rgb48_to_rgb_u16_row::(&src, &mut rgb_u16, 4, false); assert!( rgb_u16.iter().all(|&v| v == 0xABCD), "expected identity copy" @@ -1068,7 +1148,7 @@ mod tests { fn rgb48_dispatcher_to_rgba_u16_scalar_path() { let src = solid_rgb48(4, 0x1234); let mut rgba_u16 = std::vec![0u16; 4 * 4]; - rgb48_to_rgba_u16_row(&src, &mut rgba_u16, 4, false); + rgb48_to_rgba_u16_row::(&src, &mut rgba_u16, 4, false); for px in rgba_u16.chunks(4) { assert_eq!(px[0], 0x1234, "R channel"); assert_eq!(px[3], 0xFFFF, 
"alpha forced to 0xFFFF"); @@ -1081,7 +1161,7 @@ mod tests { let src = solid_rgb48(4, 0xFF00); let mut scratch = std::vec![0u8; 4 * 3]; let mut luma = std::vec![0u8; 4]; - rgb48_to_luma_row( + rgb48_to_luma_row::( &src, &mut luma, &mut scratch, @@ -1100,7 +1180,7 @@ mod tests { let src = solid_rgb48(4, 0xFF00); let mut scratch = std::vec![0u8; 4 * 3]; let mut luma = std::vec![0u16; 4]; - rgb48_to_luma_u16_row( + rgb48_to_luma_u16_row::( &src, &mut luma, &mut scratch, @@ -1125,7 +1205,7 @@ mod tests { let mut h = std::vec![0u8; 1]; let mut s = std::vec![0u8; 1]; let mut v = std::vec![0u8; 1]; - rgb48_to_hsv_row(&src, &mut h, &mut s, &mut v, &mut scratch, 1, false); + rgb48_to_hsv_row::(&src, &mut h, &mut s, &mut v, &mut scratch, 1, false); assert_eq!(h[0], 0, "H for pure red must be 0"); assert_eq!(s[0], 255, "S for pure red must be 255"); assert!(v[0] >= 254, "V for pure red must be near 255, got {}", v[0]); @@ -1138,7 +1218,7 @@ mod tests { // Bgr48 pixel [B=0x1100, G=0x2200, R=0x3300] → rgb [R=0x33, G=0x22, B=0x11]. 
let src = [0x1100u16, 0x2200, 0x3300]; let mut rgb = [0u8; 3]; - bgr48_to_rgb_row(&src, &mut rgb, 1, false); + bgr48_to_rgb_row::(&src, &mut rgb, 1, false); assert_eq!(rgb[0], 0x33, "R"); assert_eq!(rgb[1], 0x22, "G"); assert_eq!(rgb[2], 0x11, "B"); @@ -1148,7 +1228,7 @@ mod tests { fn bgr48_dispatcher_to_rgba_scalar_path() { let src = [0x1100u16, 0x2200, 0x3300]; let mut rgba = [0u8; 4]; - bgr48_to_rgba_row(&src, &mut rgba, 1, false); + bgr48_to_rgba_row::(&src, &mut rgba, 1, false); assert_eq!(rgba[0], 0x33, "R"); assert_eq!(rgba[3], 0xFF, "alpha forced to 0xFF"); } @@ -1157,7 +1237,7 @@ mod tests { fn bgr48_dispatcher_to_rgb_u16_scalar_path() { let src = [0x1111u16, 0x2222, 0x3333]; // B, G, R let mut rgb_u16 = [0u16; 3]; - bgr48_to_rgb_u16_row(&src, &mut rgb_u16, 1, false); + bgr48_to_rgb_u16_row::(&src, &mut rgb_u16, 1, false); assert_eq!(rgb_u16[0], 0x3333, "R (from position 2)"); assert_eq!(rgb_u16[1], 0x2222, "G"); assert_eq!(rgb_u16[2], 0x1111, "B (from position 0)"); @@ -1167,7 +1247,7 @@ mod tests { fn bgr48_dispatcher_to_rgba_u16_scalar_path() { let src = [0x1111u16, 0x2222, 0x3333]; // B, G, R let mut rgba_u16 = [0u16; 4]; - bgr48_to_rgba_u16_row(&src, &mut rgba_u16, 1, false); + bgr48_to_rgba_u16_row::(&src, &mut rgba_u16, 1, false); assert_eq!(rgba_u16[0], 0x3333, "R"); assert_eq!(rgba_u16[3], 0xFFFF, "alpha forced to 0xFFFF"); } @@ -1177,7 +1257,7 @@ mod tests { let src = solid_rgb48(4, 0xFF00); // all channels = 0xFF00 let mut scratch = std::vec![0u8; 4 * 3]; let mut luma = std::vec![0u8; 4]; - bgr48_to_luma_row( + bgr48_to_luma_row::( &src, &mut luma, &mut scratch, @@ -1196,7 +1276,7 @@ mod tests { let src = solid_rgb48(4, 0xFF00); let mut scratch = std::vec![0u8; 4 * 3]; let mut luma = std::vec![0u16; 4]; - bgr48_to_luma_u16_row( + bgr48_to_luma_u16_row::( &src, &mut luma, &mut scratch, @@ -1219,7 +1299,7 @@ mod tests { let mut h = std::vec![0u8; 1]; let mut s = std::vec![0u8; 1]; let mut v = std::vec![0u8; 1]; - bgr48_to_hsv_row(&src, &mut h, 
&mut s, &mut v, &mut scratch, 1, false); + bgr48_to_hsv_row::(&src, &mut h, &mut s, &mut v, &mut scratch, 1, false); assert_eq!(h[0], 120, "H for pure blue must be 120 in OpenCV encoding"); assert_eq!(s[0], 255, "S for pure blue must be 255"); assert!( @@ -1236,7 +1316,7 @@ mod tests { // Source alpha should be dropped; R/G/B narrowed. let src = [0x1100u16, 0x2200, 0x3300, 0xDEAD]; // R, G, B, A let mut rgb = [0u8; 3]; - rgba64_to_rgb_row(&src, &mut rgb, 1, false); + rgba64_to_rgb_row::(&src, &mut rgb, 1, false); assert_eq!(rgb[0], 0x11, "R"); assert_eq!(rgb[1], 0x22, "G"); assert_eq!(rgb[2], 0x33, "B"); @@ -1247,7 +1327,7 @@ mod tests { // Source alpha 0xABCD → 0xAB after >> 8. let src = [0x1100u16, 0x2200, 0x3300, 0xABCD]; let mut rgba = [0u8; 4]; - rgba64_to_rgba_row(&src, &mut rgba, 1, false); + rgba64_to_rgba_row::(&src, &mut rgba, 1, false); assert_eq!(rgba[3], 0xAB, "source alpha depth-converted >> 8"); } @@ -1255,7 +1335,7 @@ mod tests { fn rgba64_dispatcher_to_rgb_u16_scalar_path() { let src = [0x1111u16, 0x2222, 0x3333, 0xDEAD]; let mut rgb_u16 = [0u16; 3]; - rgba64_to_rgb_u16_row(&src, &mut rgb_u16, 1, false); + rgba64_to_rgb_u16_row::(&src, &mut rgb_u16, 1, false); assert_eq!(rgb_u16[0], 0x1111, "R"); assert_eq!(rgb_u16[1], 0x2222, "G"); assert_eq!(rgb_u16[2], 0x3333, "B"); @@ -1266,7 +1346,7 @@ mod tests { // Identity copy; source alpha preserved. 
let src = [0x1111u16, 0x2222, 0x3333, 0xABCD]; let mut rgba_u16 = [0u16; 4]; - rgba64_to_rgba_u16_row(&src, &mut rgba_u16, 1, false); + rgba64_to_rgba_u16_row::(&src, &mut rgba_u16, 1, false); assert_eq!(rgba_u16[0], 0x1111, "R"); assert_eq!(rgba_u16[3], 0xABCD, "source alpha preserved"); } @@ -1277,7 +1357,7 @@ mod tests { let src = solid_rgba64(4, 0xFF00); let mut scratch = std::vec![0u8; 4 * 3]; let mut luma = std::vec![0u8; 4]; - rgba64_to_luma_row( + rgba64_to_luma_row::( &src, &mut luma, &mut scratch, @@ -1296,7 +1376,7 @@ mod tests { let src = solid_rgba64(4, 0xFF00); let mut scratch = std::vec![0u8; 4 * 3]; let mut luma = std::vec![0u16; 4]; - rgba64_to_luma_u16_row( + rgba64_to_luma_u16_row::( &src, &mut luma, &mut scratch, @@ -1321,7 +1401,7 @@ mod tests { let mut h = std::vec![0u8; 1]; let mut s = std::vec![0u8; 1]; let mut v = std::vec![0u8; 1]; - rgba64_to_hsv_row(&src, &mut h, &mut s, &mut v, &mut scratch, 1, false); + rgba64_to_hsv_row::(&src, &mut h, &mut s, &mut v, &mut scratch, 1, false); assert_eq!(h[0], 60, "H for pure green must be 60 in OpenCV encoding"); assert_eq!(s[0], 255, "S for pure green must be 255"); assert!( @@ -1338,7 +1418,7 @@ mod tests { // Bgra64: B=0x1100, G=0x2200, R=0x3300, A=0xDEAD → RGB [R=0x33, G=0x22, B=0x11]. let src = [0x1100u16, 0x2200, 0x3300, 0xDEAD]; let mut rgb = [0u8; 3]; - bgra64_to_rgb_row(&src, &mut rgb, 1, false); + bgra64_to_rgb_row::(&src, &mut rgb, 1, false); assert_eq!(rgb[0], 0x33, "R"); assert_eq!(rgb[1], 0x22, "G"); assert_eq!(rgb[2], 0x11, "B"); @@ -1349,7 +1429,7 @@ mod tests { // Source alpha 0xABCD → 0xAB after >> 8; channels swapped. 
let src = [0x1100u16, 0x2200, 0x3300, 0xABCD]; let mut rgba = [0u8; 4]; - bgra64_to_rgba_row(&src, &mut rgba, 1, false); + bgra64_to_rgba_row::(&src, &mut rgba, 1, false); assert_eq!(rgba[0], 0x33, "R (from position 2)"); assert_eq!(rgba[3], 0xAB, "source alpha depth-converted >> 8"); } @@ -1358,7 +1438,7 @@ mod tests { fn bgra64_dispatcher_to_rgb_u16_scalar_path() { let src = [0x1111u16, 0x2222, 0x3333, 0xDEAD]; // B, G, R, A let mut rgb_u16 = [0u16; 3]; - bgra64_to_rgb_u16_row(&src, &mut rgb_u16, 1, false); + bgra64_to_rgb_u16_row::(&src, &mut rgb_u16, 1, false); assert_eq!(rgb_u16[0], 0x3333, "R (from position 2)"); assert_eq!(rgb_u16[1], 0x2222, "G"); assert_eq!(rgb_u16[2], 0x1111, "B (from position 0)"); @@ -1368,7 +1448,7 @@ mod tests { fn bgra64_dispatcher_to_rgba_u16_scalar_path() { let src = [0x1111u16, 0x2222, 0x3333, 0xABCD]; // B, G, R, A let mut rgba_u16 = [0u16; 4]; - bgra64_to_rgba_u16_row(&src, &mut rgba_u16, 1, false); + bgra64_to_rgba_u16_row::(&src, &mut rgba_u16, 1, false); assert_eq!(rgba_u16[0], 0x3333, "R (from position 2)"); assert_eq!(rgba_u16[3], 0xABCD, "source alpha preserved"); } @@ -1378,7 +1458,7 @@ mod tests { let src = solid_rgba64(4, 0xFF00); let mut scratch = std::vec![0u8; 4 * 3]; let mut luma = std::vec![0u8; 4]; - bgra64_to_luma_row( + bgra64_to_luma_row::( &src, &mut luma, &mut scratch, @@ -1397,7 +1477,7 @@ mod tests { let src = solid_rgba64(4, 0xFF00); let mut scratch = std::vec![0u8; 4 * 3]; let mut luma = std::vec![0u16; 4]; - bgra64_to_luma_u16_row( + bgra64_to_luma_u16_row::( &src, &mut luma, &mut scratch, @@ -1423,7 +1503,7 @@ mod tests { let mut h = std::vec![0u8; 1]; let mut s = std::vec![0u8; 1]; let mut v = std::vec![0u8; 1]; - bgra64_to_hsv_row(&src, &mut h, &mut s, &mut v, &mut scratch, 1, false); + bgra64_to_hsv_row::(&src, &mut h, &mut s, &mut v, &mut scratch, 1, false); assert_eq!(h[0], 120, "H for pure blue must be 120 in OpenCV encoding"); assert_eq!(s[0], 255, "S for pure blue must be 255"); assert!( @@ 
-1440,7 +1520,7 @@ mod tests { fn rgb48_to_rgb_row_rejects_short_input() { let src = [0u16; 2]; // needs 3 for width=1 let mut out = [0u8; 3]; - rgb48_to_rgb_row(&src, &mut out, 1, false); + rgb48_to_rgb_row::(&src, &mut out, 1, false); } #[test] @@ -1448,7 +1528,7 @@ mod tests { fn rgb48_to_rgb_row_rejects_short_output() { let src = [0u16; 3]; let mut out = [0u8; 2]; // needs 3 - rgb48_to_rgb_row(&src, &mut out, 1, false); + rgb48_to_rgb_row::(&src, &mut out, 1, false); } #[test] @@ -1456,7 +1536,7 @@ mod tests { fn rgba64_to_rgb_row_rejects_short_input() { let src = [0u16; 3]; // needs 4 for width=1 let mut out = [0u8; 3]; - rgba64_to_rgb_row(&src, &mut out, 1, false); + rgba64_to_rgb_row::(&src, &mut out, 1, false); } #[test] @@ -1464,7 +1544,7 @@ mod tests { fn rgba64_to_rgba_row_rejects_short_output() { let src = [0u16; 4]; let mut out = [0u8; 3]; // needs 4 - rgba64_to_rgba_row(&src, &mut out, 1, false); + rgba64_to_rgba_row::(&src, &mut out, 1, false); } #[test] @@ -1473,7 +1553,7 @@ mod tests { let src = [0u16; 3]; let mut scratch = [0u8; 3]; let mut luma: [u8; 0] = []; - rgb48_to_luma_row( + rgb48_to_luma_row::( &src, &mut luma, &mut scratch, @@ -1490,7 +1570,7 @@ mod tests { let src = [0u16; 3]; let mut scratch = [0u8; 2]; // needs 3 let mut luma = [0u8; 1]; - rgb48_to_luma_row( + rgb48_to_luma_row::( &src, &mut luma, &mut scratch, @@ -1521,7 +1601,7 @@ mod tests { fn rgb48_dispatcher_rejects_width_times_3_overflow() { let p: [u16; 0] = []; let mut out: [u8; 0] = []; - rgb48_to_rgb_row(&p, &mut out, OVERFLOW_WIDTH_TIMES_3, false); + rgb48_to_rgb_row::(&p, &mut out, OVERFLOW_WIDTH_TIMES_3, false); } #[cfg(target_pointer_width = "32")] @@ -1530,7 +1610,7 @@ mod tests { fn bgr48_dispatcher_rejects_width_times_3_overflow() { let p: [u16; 0] = []; let mut out: [u8; 0] = []; - bgr48_to_rgb_row(&p, &mut out, OVERFLOW_WIDTH_TIMES_3, false); + bgr48_to_rgb_row::(&p, &mut out, OVERFLOW_WIDTH_TIMES_3, false); } #[cfg(target_pointer_width = "32")] @@ -1539,7 +1619,7 
@@ mod tests { fn rgba64_dispatcher_rejects_width_times_4_overflow() { let p: [u16; 0] = []; let mut out: [u8; 0] = []; - rgba64_to_rgb_row(&p, &mut out, OVERFLOW_WIDTH_TIMES_4, false); + rgba64_to_rgb_row::(&p, &mut out, OVERFLOW_WIDTH_TIMES_4, false); } #[cfg(target_pointer_width = "32")] @@ -1548,6 +1628,6 @@ mod tests { fn bgra64_dispatcher_rejects_width_times_4_overflow() { let p: [u16; 0] = []; let mut out: [u8; 0] = []; - bgra64_to_rgb_row(&p, &mut out, OVERFLOW_WIDTH_TIMES_4, false); + bgra64_to_rgb_row::(&p, &mut out, OVERFLOW_WIDTH_TIMES_4, false); } } diff --git a/src/row/dispatch/rgb_ops.rs b/src/row/dispatch/rgb_ops.rs index 9b93a405..eedf8b2a 100644 --- a/src/row/dispatch/rgb_ops.rs +++ b/src/row/dispatch/rgb_ops.rs @@ -948,7 +948,12 @@ pub fn bgrx_to_rgba_row(bgrx: &[u8], rgba_out: &mut [u8], width: usize, use_simd /// /// `use_simd = false` forces the scalar reference path. #[cfg_attr(not(tarpaulin), inline(always))] -pub fn x2rgb10_to_rgb_row(x2rgb10: &[u8], rgb_out: &mut [u8], width: usize, use_simd: bool) { +pub fn x2rgb10_to_rgb_row( + x2rgb10: &[u8], + rgb_out: &mut [u8], + width: usize, + use_simd: bool, +) { let in_min = rgba_row_bytes(width); let rgb_min = rgb_row_bytes(width); assert!(x2rgb10.len() >= in_min, "x2rgb10 row too short"); @@ -958,34 +963,34 @@ pub fn x2rgb10_to_rgb_row(x2rgb10: &[u8], rgb_out: &mut [u8], width: usize, use_ cfg_select! 
{ target_arch = "aarch64" => { if neon_available() { - unsafe { arch::neon::x2rgb10_to_rgb_row(x2rgb10, rgb_out, width); } + unsafe { arch::neon::x2rgb10_to_rgb_row::(x2rgb10, rgb_out, width); } return; } }, target_arch = "x86_64" => { if avx512_available() { - unsafe { arch::x86_avx512::x2rgb10_to_rgb_row(x2rgb10, rgb_out, width); } + unsafe { arch::x86_avx512::x2rgb10_to_rgb_row::(x2rgb10, rgb_out, width); } return; } if avx2_available() { - unsafe { arch::x86_avx2::x2rgb10_to_rgb_row(x2rgb10, rgb_out, width); } + unsafe { arch::x86_avx2::x2rgb10_to_rgb_row::(x2rgb10, rgb_out, width); } return; } if sse41_available() { - unsafe { arch::x86_sse41::x2rgb10_to_rgb_row(x2rgb10, rgb_out, width); } + unsafe { arch::x86_sse41::x2rgb10_to_rgb_row::(x2rgb10, rgb_out, width); } return; } }, target_arch = "wasm32" => { if simd128_available() { - unsafe { arch::wasm_simd128::x2rgb10_to_rgb_row(x2rgb10, rgb_out, width); } + unsafe { arch::wasm_simd128::x2rgb10_to_rgb_row::(x2rgb10, rgb_out, width); } return; } }, _ => {} } } - scalar::x2rgb10_to_rgb_row(x2rgb10, rgb_out, width); + scalar::x2rgb10_to_rgb_row::(x2rgb10, rgb_out, width); } /// Drops the 2-bit padding, down-shifts to 8 bits, and forces alpha @@ -993,7 +998,12 @@ pub fn x2rgb10_to_rgb_row(x2rgb10: &[u8], rgb_out: &mut [u8], width: usize, use_ /// /// `use_simd = false` forces the scalar reference path. #[cfg_attr(not(tarpaulin), inline(always))] -pub fn x2rgb10_to_rgba_row(x2rgb10: &[u8], rgba_out: &mut [u8], width: usize, use_simd: bool) { +pub fn x2rgb10_to_rgba_row( + x2rgb10: &[u8], + rgba_out: &mut [u8], + width: usize, + use_simd: bool, +) { let rgba_min = rgba_row_bytes(width); assert!(x2rgb10.len() >= rgba_min, "x2rgb10 row too short"); assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); @@ -1002,34 +1012,34 @@ pub fn x2rgb10_to_rgba_row(x2rgb10: &[u8], rgba_out: &mut [u8], width: usize, us cfg_select! 
{ target_arch = "aarch64" => { if neon_available() { - unsafe { arch::neon::x2rgb10_to_rgba_row(x2rgb10, rgba_out, width); } + unsafe { arch::neon::x2rgb10_to_rgba_row::(x2rgb10, rgba_out, width); } return; } }, target_arch = "x86_64" => { if avx512_available() { - unsafe { arch::x86_avx512::x2rgb10_to_rgba_row(x2rgb10, rgba_out, width); } + unsafe { arch::x86_avx512::x2rgb10_to_rgba_row::(x2rgb10, rgba_out, width); } return; } if avx2_available() { - unsafe { arch::x86_avx2::x2rgb10_to_rgba_row(x2rgb10, rgba_out, width); } + unsafe { arch::x86_avx2::x2rgb10_to_rgba_row::(x2rgb10, rgba_out, width); } return; } if sse41_available() { - unsafe { arch::x86_sse41::x2rgb10_to_rgba_row(x2rgb10, rgba_out, width); } + unsafe { arch::x86_sse41::x2rgb10_to_rgba_row::(x2rgb10, rgba_out, width); } return; } }, target_arch = "wasm32" => { if simd128_available() { - unsafe { arch::wasm_simd128::x2rgb10_to_rgba_row(x2rgb10, rgba_out, width); } + unsafe { arch::wasm_simd128::x2rgb10_to_rgba_row::(x2rgb10, rgba_out, width); } return; } }, _ => {} } } - scalar::x2rgb10_to_rgba_row(x2rgb10, rgba_out, width); + scalar::x2rgb10_to_rgba_row::(x2rgb10, rgba_out, width); } /// Extracts each 10-bit channel into native-depth `u16` (low-bit @@ -1038,7 +1048,12 @@ pub fn x2rgb10_to_rgba_row(x2rgb10: &[u8], rgba_out: &mut [u8], width: usize, us /// /// `use_simd = false` forces the scalar reference path. #[cfg_attr(not(tarpaulin), inline(always))] -pub fn x2rgb10_to_rgb_u16_row(x2rgb10: &[u8], rgb_out: &mut [u16], width: usize, use_simd: bool) { +pub fn x2rgb10_to_rgb_u16_row( + x2rgb10: &[u8], + rgb_out: &mut [u16], + width: usize, + use_simd: bool, +) { let in_min = rgba_row_bytes(width); // u16 RGB output is sized in `u16` *elements*, not bytes — match // the rest of the high-bit-depth dispatchers. @@ -1050,41 +1065,46 @@ pub fn x2rgb10_to_rgb_u16_row(x2rgb10: &[u8], rgb_out: &mut [u16], width: usize, cfg_select! 
{ target_arch = "aarch64" => { if neon_available() { - unsafe { arch::neon::x2rgb10_to_rgb_u16_row(x2rgb10, rgb_out, width); } + unsafe { arch::neon::x2rgb10_to_rgb_u16_row::(x2rgb10, rgb_out, width); } return; } }, target_arch = "x86_64" => { if avx512_available() { - unsafe { arch::x86_avx512::x2rgb10_to_rgb_u16_row(x2rgb10, rgb_out, width); } + unsafe { arch::x86_avx512::x2rgb10_to_rgb_u16_row::(x2rgb10, rgb_out, width); } return; } if avx2_available() { - unsafe { arch::x86_avx2::x2rgb10_to_rgb_u16_row(x2rgb10, rgb_out, width); } + unsafe { arch::x86_avx2::x2rgb10_to_rgb_u16_row::(x2rgb10, rgb_out, width); } return; } if sse41_available() { - unsafe { arch::x86_sse41::x2rgb10_to_rgb_u16_row(x2rgb10, rgb_out, width); } + unsafe { arch::x86_sse41::x2rgb10_to_rgb_u16_row::(x2rgb10, rgb_out, width); } return; } }, target_arch = "wasm32" => { if simd128_available() { - unsafe { arch::wasm_simd128::x2rgb10_to_rgb_u16_row(x2rgb10, rgb_out, width); } + unsafe { arch::wasm_simd128::x2rgb10_to_rgb_u16_row::(x2rgb10, rgb_out, width); } return; } }, _ => {} } } - scalar::x2rgb10_to_rgb_u16_row(x2rgb10, rgb_out, width); + scalar::x2rgb10_to_rgb_u16_row::(x2rgb10, rgb_out, width); } /// `X2BGR10` LE counterpart of [`x2rgb10_to_rgb_row`]. Channel /// positions in the source `u32` are reversed; output is still /// `R, G, B`. #[cfg_attr(not(tarpaulin), inline(always))] -pub fn x2bgr10_to_rgb_row(x2bgr10: &[u8], rgb_out: &mut [u8], width: usize, use_simd: bool) { +pub fn x2bgr10_to_rgb_row( + x2bgr10: &[u8], + rgb_out: &mut [u8], + width: usize, + use_simd: bool, +) { let in_min = rgba_row_bytes(width); let rgb_min = rgb_row_bytes(width); assert!(x2bgr10.len() >= in_min, "x2bgr10 row too short"); @@ -1094,39 +1114,44 @@ pub fn x2bgr10_to_rgb_row(x2bgr10: &[u8], rgb_out: &mut [u8], width: usize, use_ cfg_select! 
{ target_arch = "aarch64" => { if neon_available() { - unsafe { arch::neon::x2bgr10_to_rgb_row(x2bgr10, rgb_out, width); } + unsafe { arch::neon::x2bgr10_to_rgb_row::(x2bgr10, rgb_out, width); } return; } }, target_arch = "x86_64" => { if avx512_available() { - unsafe { arch::x86_avx512::x2bgr10_to_rgb_row(x2bgr10, rgb_out, width); } + unsafe { arch::x86_avx512::x2bgr10_to_rgb_row::(x2bgr10, rgb_out, width); } return; } if avx2_available() { - unsafe { arch::x86_avx2::x2bgr10_to_rgb_row(x2bgr10, rgb_out, width); } + unsafe { arch::x86_avx2::x2bgr10_to_rgb_row::(x2bgr10, rgb_out, width); } return; } if sse41_available() { - unsafe { arch::x86_sse41::x2bgr10_to_rgb_row(x2bgr10, rgb_out, width); } + unsafe { arch::x86_sse41::x2bgr10_to_rgb_row::(x2bgr10, rgb_out, width); } return; } }, target_arch = "wasm32" => { if simd128_available() { - unsafe { arch::wasm_simd128::x2bgr10_to_rgb_row(x2bgr10, rgb_out, width); } + unsafe { arch::wasm_simd128::x2bgr10_to_rgb_row::(x2bgr10, rgb_out, width); } return; } }, _ => {} } } - scalar::x2bgr10_to_rgb_row(x2bgr10, rgb_out, width); + scalar::x2bgr10_to_rgb_row::(x2bgr10, rgb_out, width); } /// `X2BGR10` LE counterpart of [`x2rgb10_to_rgba_row`]. #[cfg_attr(not(tarpaulin), inline(always))] -pub fn x2bgr10_to_rgba_row(x2bgr10: &[u8], rgba_out: &mut [u8], width: usize, use_simd: bool) { +pub fn x2bgr10_to_rgba_row( + x2bgr10: &[u8], + rgba_out: &mut [u8], + width: usize, + use_simd: bool, +) { let rgba_min = rgba_row_bytes(width); assert!(x2bgr10.len() >= rgba_min, "x2bgr10 row too short"); assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); @@ -1135,39 +1160,44 @@ pub fn x2bgr10_to_rgba_row(x2bgr10: &[u8], rgba_out: &mut [u8], width: usize, us cfg_select! 
{ target_arch = "aarch64" => { if neon_available() { - unsafe { arch::neon::x2bgr10_to_rgba_row(x2bgr10, rgba_out, width); } + unsafe { arch::neon::x2bgr10_to_rgba_row::(x2bgr10, rgba_out, width); } return; } }, target_arch = "x86_64" => { if avx512_available() { - unsafe { arch::x86_avx512::x2bgr10_to_rgba_row(x2bgr10, rgba_out, width); } + unsafe { arch::x86_avx512::x2bgr10_to_rgba_row::(x2bgr10, rgba_out, width); } return; } if avx2_available() { - unsafe { arch::x86_avx2::x2bgr10_to_rgba_row(x2bgr10, rgba_out, width); } + unsafe { arch::x86_avx2::x2bgr10_to_rgba_row::(x2bgr10, rgba_out, width); } return; } if sse41_available() { - unsafe { arch::x86_sse41::x2bgr10_to_rgba_row(x2bgr10, rgba_out, width); } + unsafe { arch::x86_sse41::x2bgr10_to_rgba_row::(x2bgr10, rgba_out, width); } return; } }, target_arch = "wasm32" => { if simd128_available() { - unsafe { arch::wasm_simd128::x2bgr10_to_rgba_row(x2bgr10, rgba_out, width); } + unsafe { arch::wasm_simd128::x2bgr10_to_rgba_row::(x2bgr10, rgba_out, width); } return; } }, _ => {} } } - scalar::x2bgr10_to_rgba_row(x2bgr10, rgba_out, width); + scalar::x2bgr10_to_rgba_row::(x2bgr10, rgba_out, width); } /// `X2BGR10` LE counterpart of [`x2rgb10_to_rgb_u16_row`]. #[cfg_attr(not(tarpaulin), inline(always))] -pub fn x2bgr10_to_rgb_u16_row(x2bgr10: &[u8], rgb_out: &mut [u16], width: usize, use_simd: bool) { +pub fn x2bgr10_to_rgb_u16_row( + x2bgr10: &[u8], + rgb_out: &mut [u16], + width: usize, + use_simd: bool, +) { let in_min = rgba_row_bytes(width); // u16 RGB output is sized in `u16` *elements*, not bytes. let rgb_min = rgb_row_elems(width); @@ -1178,32 +1208,32 @@ pub fn x2bgr10_to_rgb_u16_row(x2bgr10: &[u8], rgb_out: &mut [u16], width: usize, cfg_select! 
{ target_arch = "aarch64" => { if neon_available() { - unsafe { arch::neon::x2bgr10_to_rgb_u16_row(x2bgr10, rgb_out, width); } + unsafe { arch::neon::x2bgr10_to_rgb_u16_row::(x2bgr10, rgb_out, width); } return; } }, target_arch = "x86_64" => { if avx512_available() { - unsafe { arch::x86_avx512::x2bgr10_to_rgb_u16_row(x2bgr10, rgb_out, width); } + unsafe { arch::x86_avx512::x2bgr10_to_rgb_u16_row::(x2bgr10, rgb_out, width); } return; } if avx2_available() { - unsafe { arch::x86_avx2::x2bgr10_to_rgb_u16_row(x2bgr10, rgb_out, width); } + unsafe { arch::x86_avx2::x2bgr10_to_rgb_u16_row::(x2bgr10, rgb_out, width); } return; } if sse41_available() { - unsafe { arch::x86_sse41::x2bgr10_to_rgb_u16_row(x2bgr10, rgb_out, width); } + unsafe { arch::x86_sse41::x2bgr10_to_rgb_u16_row::(x2bgr10, rgb_out, width); } return; } }, target_arch = "wasm32" => { if simd128_available() { - unsafe { arch::wasm_simd128::x2bgr10_to_rgb_u16_row(x2bgr10, rgb_out, width); } + unsafe { arch::wasm_simd128::x2bgr10_to_rgb_u16_row::(x2bgr10, rgb_out, width); } return; } }, _ => {} } } - scalar::x2bgr10_to_rgb_u16_row(x2bgr10, rgb_out, width); + scalar::x2bgr10_to_rgb_u16_row::(x2bgr10, rgb_out, width); } diff --git a/src/row/scalar/packed_rgb.rs b/src/row/scalar/packed_rgb.rs index f1c2862d..0f4091f6 100644 --- a/src/row/scalar/packed_rgb.rs +++ b/src/row/scalar/packed_rgb.rs @@ -306,12 +306,17 @@ pub(crate) fn bgrx_to_rgba_row(bgrx: &[u8], rgba_out: &mut [u8], width: usize) { /// Panics (any build profile) if `x2rgb10.len() < 4 * width` or /// `rgb_out.len() < 3 * width`. 
#[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn x2rgb10_to_rgb_row(x2rgb10: &[u8], rgb_out: &mut [u8], width: usize) { +pub(crate) fn x2rgb10_to_rgb_row<const BE: bool>(x2rgb10: &[u8], rgb_out: &mut [u8], width: usize) { debug_assert!(x2rgb10.len() >= width * 4, "x2rgb10 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); for x in 0..width { let i = x * 4; - let pix = u32::from_le_bytes([x2rgb10[i], x2rgb10[i + 1], x2rgb10[i + 2], x2rgb10[i + 3]]); + let bytes = [x2rgb10[i], x2rgb10[i + 1], x2rgb10[i + 2], x2rgb10[i + 3]]; + let pix = if BE { + u32::from_be_bytes(bytes) + } else { + u32::from_le_bytes(bytes) + }; let r10 = (pix >> 20) & 0x3FF; let g10 = (pix >> 10) & 0x3FF; let b10 = pix & 0x3FF; @@ -330,12 +335,21 @@ pub(crate) fn x2rgb10_to_rgb_row(x2rgb10: &[u8], rgb_out: &mut [u8], width: usiz /// Panics (any build profile) if `x2rgb10.len() < 4 * width` or /// `rgba_out.len() < 4 * width`. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn x2rgb10_to_rgba_row(x2rgb10: &[u8], rgba_out: &mut [u8], width: usize) { +pub(crate) fn x2rgb10_to_rgba_row<const BE: bool>( + x2rgb10: &[u8], + rgba_out: &mut [u8], + width: usize, +) { debug_assert!(x2rgb10.len() >= width * 4, "x2rgb10 row too short"); debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short"); for x in 0..width { let i = x * 4; - let pix = u32::from_le_bytes([x2rgb10[i], x2rgb10[i + 1], x2rgb10[i + 2], x2rgb10[i + 3]]); + let bytes = [x2rgb10[i], x2rgb10[i + 1], x2rgb10[i + 2], x2rgb10[i + 3]]; + let pix = if BE { + u32::from_be_bytes(bytes) + } else { + u32::from_le_bytes(bytes) + }; let r10 = (pix >> 20) & 0x3FF; let g10 = (pix >> 10) & 0x3FF; let b10 = pix & 0x3FF; @@ -355,12 +369,21 @@ pub(crate) fn x2rgb10_to_rgba_row(x2rgb10: &[u8], rgba_out: &mut [u8], width: us /// Panics (any build profile) if `x2rgb10.len() < 4 * width` or /// `rgb_out.len() < 3 * width`.
#[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn x2rgb10_to_rgb_u16_row(x2rgb10: &[u8], rgb_out: &mut [u16], width: usize) { +pub(crate) fn x2rgb10_to_rgb_u16_row<const BE: bool>( + x2rgb10: &[u8], + rgb_out: &mut [u16], + width: usize, +) { debug_assert!(x2rgb10.len() >= width * 4, "x2rgb10 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); for x in 0..width { let i = x * 4; - let pix = u32::from_le_bytes([x2rgb10[i], x2rgb10[i + 1], x2rgb10[i + 2], x2rgb10[i + 3]]); + let bytes = [x2rgb10[i], x2rgb10[i + 1], x2rgb10[i + 2], x2rgb10[i + 3]]; + let pix = if BE { + u32::from_be_bytes(bytes) + } else { + u32::from_le_bytes(bytes) + }; let dst = x * 3; rgb_out[dst] = ((pix >> 20) & 0x3FF) as u16; rgb_out[dst + 1] = ((pix >> 10) & 0x3FF) as u16; @@ -377,12 +400,17 @@ pub(crate) fn x2rgb10_to_rgb_u16_row(x2rgb10: &[u8], rgb_out: &mut [u16], width: /// Panics (any build profile) if `x2bgr10.len() < 4 * width` or /// `rgb_out.len() < 3 * width`. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn x2bgr10_to_rgb_row(x2bgr10: &[u8], rgb_out: &mut [u8], width: usize) { +pub(crate) fn x2bgr10_to_rgb_row<const BE: bool>(x2bgr10: &[u8], rgb_out: &mut [u8], width: usize) { debug_assert!(x2bgr10.len() >= width * 4, "x2bgr10 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); for x in 0..width { let i = x * 4; - let pix = u32::from_le_bytes([x2bgr10[i], x2bgr10[i + 1], x2bgr10[i + 2], x2bgr10[i + 3]]); + let bytes = [x2bgr10[i], x2bgr10[i + 1], x2bgr10[i + 2], x2bgr10[i + 3]]; + let pix = if BE { + u32::from_be_bytes(bytes) + } else { + u32::from_le_bytes(bytes) + }; let r10 = pix & 0x3FF; let g10 = (pix >> 10) & 0x3FF; let b10 = (pix >> 20) & 0x3FF; @@ -400,12 +428,21 @@ pub(crate) fn x2bgr10_to_rgb_row(x2bgr10: &[u8], rgb_out: &mut [u8], width: usiz /// Panics (any build profile) if `x2bgr10.len() < 4 * width` or /// `rgba_out.len() < 4 * width`.
#[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn x2bgr10_to_rgba_row(x2bgr10: &[u8], rgba_out: &mut [u8], width: usize) { +pub(crate) fn x2bgr10_to_rgba_row<const BE: bool>( + x2bgr10: &[u8], + rgba_out: &mut [u8], + width: usize, +) { debug_assert!(x2bgr10.len() >= width * 4, "x2bgr10 row too short"); debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short"); for x in 0..width { let i = x * 4; - let pix = u32::from_le_bytes([x2bgr10[i], x2bgr10[i + 1], x2bgr10[i + 2], x2bgr10[i + 3]]); + let bytes = [x2bgr10[i], x2bgr10[i + 1], x2bgr10[i + 2], x2bgr10[i + 3]]; + let pix = if BE { + u32::from_be_bytes(bytes) + } else { + u32::from_le_bytes(bytes) + }; let r10 = pix & 0x3FF; let g10 = (pix >> 10) & 0x3FF; let b10 = (pix >> 20) & 0x3FF; @@ -423,12 +460,21 @@ pub(crate) fn x2bgr10_to_rgba_row(x2bgr10: &[u8], rgba_out: &mut [u8], width: us /// Panics (any build profile) if `x2bgr10.len() < 4 * width` or /// `rgb_out.len() < 3 * width`. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn x2bgr10_to_rgb_u16_row(x2bgr10: &[u8], rgb_out: &mut [u16], width: usize) { +pub(crate) fn x2bgr10_to_rgb_u16_row<const BE: bool>( + x2bgr10: &[u8], + rgb_out: &mut [u16], + width: usize, +) { debug_assert!(x2bgr10.len() >= width * 4, "x2bgr10 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); for x in 0..width { let i = x * 4; - let pix = u32::from_le_bytes([x2bgr10[i], x2bgr10[i + 1], x2bgr10[i + 2], x2bgr10[i + 3]]); + let bytes = [x2bgr10[i], x2bgr10[i + 1], x2bgr10[i + 2], x2bgr10[i + 3]]; + let pix = if BE { + u32::from_be_bytes(bytes) + } else { + u32::from_le_bytes(bytes) + }; let dst = x * 3; rgb_out[dst] = (pix & 0x3FF) as u16; rgb_out[dst + 1] = ((pix >> 10) & 0x3FF) as u16; diff --git a/src/row/scalar/packed_rgb_16bit.rs b/src/row/scalar/packed_rgb_16bit.rs index d530eaa2..e568c19e 100644 --- a/src/row/scalar/packed_rgb_16bit.rs +++ b/src/row/scalar/packed_rgb_16bit.rs @@ -1,8 +1,13 @@ //!
Scalar reference kernels for 16-bit packed RGB sources (Tier 8 finish). //! -//! Input planes are `&[u16]`. Each u16 sample is the native channel value -//! (range [0, 65535]). No endian conversion — caller deserialises LE bytes -//! to `&[u16]` before constructing the frame. +//! Input planes are `&[u16]`. Each u16 sample is either LE- or BE-encoded on +//! disk/wire; the `BE` const-generic parameter selects the +//! interpretation. When `BE = false` the input is LE-encoded; when `BE = true` +//! the input is BE-encoded. In both cases each element is converted to +//! host-native byte order on load via `u16::from_le` / `u16::from_be`, which +//! are no-ops when the source byte order already matches the host. This +//! mirrors the SIMD `load_endian_u16x*` helpers and keeps the scalar reference +//! correct on big-endian hosts (s390x). //! //! # Format layouts //! @@ -18,56 +23,101 @@ //! - u16 → u8: `(v >> 8) as u8` (high-byte extraction, matching Y216 / Ship 11d). //! - u16 → u16: identity copy (no scaling). +// ---- Endian load helper ------------------------------------------------------ + +/// Load one u16 element from a source whose byte order is selected by `BE`, +/// returning the value in host-native byte order. +/// +/// `u16::from_be` / `u16::from_le` are target-endian aware: each is a no-op +/// when the source byte order matches the host, and a `swap_bytes` otherwise. +/// This matches the SIMD `load_endian_u16x*` helpers and keeps the scalar +/// reference correct on big-endian hosts (s390x). +/// +/// The `if BE` branch is evaluated at compile time (monomorphization), so the +/// unused branch is entirely eliminated from the generated binary. +#[inline(always)] +fn load_u16<const BE: bool>(v: u16) -> u16 { + if BE { u16::from_be(v) } else { u16::from_le(v) } +} + // ---- Rgb48 family (3 u16 elements per pixel: R, G, B) ---------------------- /// Rgb48 → packed u8 RGB: narrow each 16-bit channel via `>> 8`.
/// +/// When `BE = true` each u16 element is byte-swapped on load so the channel +/// value is in host-native order before narrowing. +/// /// Input stride: `width * 3` u16 elements, output: `width * 3` bytes. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn rgb48_to_rgb_row(rgb48: &[u16], rgb_out: &mut [u8], width: usize) { +pub(crate) fn rgb48_to_rgb_row<const BE: bool>(rgb48: &[u16], rgb_out: &mut [u8], width: usize) { debug_assert!(rgb48.len() >= width * 3, "rgb48 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); for x in 0..width { let src = x * 3; let dst = x * 3; - rgb_out[dst] = (rgb48[src] >> 8) as u8; - rgb_out[dst + 1] = (rgb48[src + 1] >> 8) as u8; - rgb_out[dst + 2] = (rgb48[src + 2] >> 8) as u8; + rgb_out[dst] = (load_u16::<BE>(rgb48[src]) >> 8) as u8; + rgb_out[dst + 1] = (load_u16::<BE>(rgb48[src + 1]) >> 8) as u8; + rgb_out[dst + 2] = (load_u16::<BE>(rgb48[src + 2]) >> 8) as u8; } } -/// Rgb48 → packed u16 RGB: identity copy (already R, G, B order). +/// Rgb48 → packed u16 RGB: copy with optional byte-swap (already R, G, B order). +/// +/// When `BE = true` each element is byte-swapped so the output contains +/// host-native u16 values. /// /// Input and output stride: `width * 3` u16 elements. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn rgb48_to_rgb_u16_row(rgb48: &[u16], rgb_u16_out: &mut [u16], width: usize) { +pub(crate) fn rgb48_to_rgb_u16_row<const BE: bool>( + rgb48: &[u16], + rgb_u16_out: &mut [u16], + width: usize, +) { debug_assert!(rgb48.len() >= width * 3, "rgb48 row too short"); debug_assert!(rgb_u16_out.len() >= width * 3, "rgb_u16_out row too short"); - rgb_u16_out[..width * 3].copy_from_slice(&rgb48[..width * 3]); + if BE { + for i in 0..width * 3 { + rgb_u16_out[i] = u16::from_be(rgb48[i]); + } + } else { + // LE source: use the target-endian-aware load on each element so big-endian + // hosts also receive host-native u16 output.
+ for i in 0..width * 3 { + rgb_u16_out[i] = u16::from_le(rgb48[i]); + } + } } /// Rgb48 → packed u8 RGBA: narrow each 16-bit channel via `>> 8`, force alpha = 0xFF. /// +/// When `BE = true` each u16 element is byte-swapped on load. +/// /// Input stride: `width * 3` u16 elements, output: `width * 4` bytes. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn rgb48_to_rgba_row(rgb48: &[u16], rgba_out: &mut [u8], width: usize) { +pub(crate) fn rgb48_to_rgba_row(rgb48: &[u16], rgba_out: &mut [u8], width: usize) { debug_assert!(rgb48.len() >= width * 3, "rgb48 row too short"); debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short"); for x in 0..width { let src = x * 3; let dst = x * 4; - rgba_out[dst] = (rgb48[src] >> 8) as u8; - rgba_out[dst + 1] = (rgb48[src + 1] >> 8) as u8; - rgba_out[dst + 2] = (rgb48[src + 2] >> 8) as u8; + rgba_out[dst] = (load_u16::(rgb48[src]) >> 8) as u8; + rgba_out[dst + 1] = (load_u16::(rgb48[src + 1]) >> 8) as u8; + rgba_out[dst + 2] = (load_u16::(rgb48[src + 2]) >> 8) as u8; rgba_out[dst + 3] = 0xFF; } } -/// Rgb48 → packed u16 RGBA: copy R/G/B as-is, force alpha = 0xFFFF. +/// Rgb48 → packed u16 RGBA: copy R/G/B (with optional byte-swap), force alpha = 0xFFFF. +/// +/// When `BE = true` each element is byte-swapped to produce host-native output. /// /// Input stride: `width * 3` u16 elements, output: `width * 4` u16 elements. 
#[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn rgb48_to_rgba_u16_row(rgb48: &[u16], rgba_u16_out: &mut [u16], width: usize) { +pub(crate) fn rgb48_to_rgba_u16_row( + rgb48: &[u16], + rgba_u16_out: &mut [u16], + width: usize, +) { debug_assert!(rgb48.len() >= width * 3, "rgb48 row too short"); debug_assert!( rgba_u16_out.len() >= width * 4, @@ -76,9 +126,9 @@ pub(crate) fn rgb48_to_rgba_u16_row(rgb48: &[u16], rgba_u16_out: &mut [u16], wid for x in 0..width { let src = x * 3; let dst = x * 4; - rgba_u16_out[dst] = rgb48[src]; - rgba_u16_out[dst + 1] = rgb48[src + 1]; - rgba_u16_out[dst + 2] = rgb48[src + 2]; + rgba_u16_out[dst] = load_u16::(rgb48[src]); + rgba_u16_out[dst + 1] = load_u16::(rgb48[src + 1]); + rgba_u16_out[dst + 2] = load_u16::(rgb48[src + 2]); rgba_u16_out[dst + 3] = 0xFFFF; } } @@ -87,54 +137,70 @@ pub(crate) fn rgb48_to_rgba_u16_row(rgb48: &[u16], rgba_u16_out: &mut [u16], wid /// Bgr48 → packed u8 RGB: narrow via `>> 8`, swap B↔R on output. /// +/// When `BE = true` each u16 element is byte-swapped on load. +/// /// Source layout `[B, G, R]` → output layout `[R, G, B]`. 
#[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn bgr48_to_rgb_row(bgr48: &[u16], rgb_out: &mut [u8], width: usize) { +pub(crate) fn bgr48_to_rgb_row(bgr48: &[u16], rgb_out: &mut [u8], width: usize) { debug_assert!(bgr48.len() >= width * 3, "bgr48 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); for x in 0..width { let src = x * 3; let dst = x * 3; - rgb_out[dst] = (bgr48[src + 2] >> 8) as u8; // R (from B-G-R position 2) - rgb_out[dst + 1] = (bgr48[src + 1] >> 8) as u8; // G (unchanged) - rgb_out[dst + 2] = (bgr48[src] >> 8) as u8; // B (from B-G-R position 0) + rgb_out[dst] = (load_u16::(bgr48[src + 2]) >> 8) as u8; // R (from B-G-R position 2) + rgb_out[dst + 1] = (load_u16::(bgr48[src + 1]) >> 8) as u8; // G (unchanged) + rgb_out[dst + 2] = (load_u16::(bgr48[src]) >> 8) as u8; // B (from B-G-R position 0) } } -/// Bgr48 → packed u16 RGB: copy with B↔R swap. +/// Bgr48 → packed u16 RGB: copy with B↔R swap (and optional byte-swap). +/// +/// When `BE = true` each element is byte-swapped to produce host-native output. /// /// Source layout `[B, G, R]` → output layout `[R, G, B]`. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn bgr48_to_rgb_u16_row(bgr48: &[u16], rgb_u16_out: &mut [u16], width: usize) { +pub(crate) fn bgr48_to_rgb_u16_row( + bgr48: &[u16], + rgb_u16_out: &mut [u16], + width: usize, +) { debug_assert!(bgr48.len() >= width * 3, "bgr48 row too short"); debug_assert!(rgb_u16_out.len() >= width * 3, "rgb_u16_out row too short"); for x in 0..width { let src = x * 3; let dst = x * 3; - rgb_u16_out[dst] = bgr48[src + 2]; // R - rgb_u16_out[dst + 1] = bgr48[src + 1]; // G - rgb_u16_out[dst + 2] = bgr48[src]; // B + rgb_u16_out[dst] = load_u16::(bgr48[src + 2]); // R + rgb_u16_out[dst + 1] = load_u16::(bgr48[src + 1]); // G + rgb_u16_out[dst + 2] = load_u16::(bgr48[src]); // B } } /// Bgr48 → packed u8 RGBA: narrow + B↔R swap + force alpha = 0xFF. 
+/// +/// When `BE = true` each u16 element is byte-swapped on load. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn bgr48_to_rgba_row(bgr48: &[u16], rgba_out: &mut [u8], width: usize) { +pub(crate) fn bgr48_to_rgba_row(bgr48: &[u16], rgba_out: &mut [u8], width: usize) { debug_assert!(bgr48.len() >= width * 3, "bgr48 row too short"); debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short"); for x in 0..width { let src = x * 3; let dst = x * 4; - rgba_out[dst] = (bgr48[src + 2] >> 8) as u8; // R - rgba_out[dst + 1] = (bgr48[src + 1] >> 8) as u8; // G - rgba_out[dst + 2] = (bgr48[src] >> 8) as u8; // B + rgba_out[dst] = (load_u16::(bgr48[src + 2]) >> 8) as u8; // R + rgba_out[dst + 1] = (load_u16::(bgr48[src + 1]) >> 8) as u8; // G + rgba_out[dst + 2] = (load_u16::(bgr48[src]) >> 8) as u8; // B rgba_out[dst + 3] = 0xFF; } } -/// Bgr48 → packed u16 RGBA: B↔R swap + force alpha = 0xFFFF. +/// Bgr48 → packed u16 RGBA: B↔R swap (+ optional byte-swap) + force alpha = 0xFFFF. +/// +/// When `BE = true` each element is byte-swapped to produce host-native output. 
#[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn bgr48_to_rgba_u16_row(bgr48: &[u16], rgba_u16_out: &mut [u16], width: usize) { +pub(crate) fn bgr48_to_rgba_u16_row( + bgr48: &[u16], + rgba_u16_out: &mut [u16], + width: usize, +) { debug_assert!(bgr48.len() >= width * 3, "bgr48 row too short"); debug_assert!( rgba_u16_out.len() >= width * 4, @@ -143,9 +209,9 @@ pub(crate) fn bgr48_to_rgba_u16_row(bgr48: &[u16], rgba_u16_out: &mut [u16], wid for x in 0..width { let src = x * 3; let dst = x * 4; - rgba_u16_out[dst] = bgr48[src + 2]; // R - rgba_u16_out[dst + 1] = bgr48[src + 1]; // G - rgba_u16_out[dst + 2] = bgr48[src]; // B + rgba_u16_out[dst] = load_u16::(bgr48[src + 2]); // R + rgba_u16_out[dst + 1] = load_u16::(bgr48[src + 1]); // G + rgba_u16_out[dst + 2] = load_u16::(bgr48[src]); // B rgba_u16_out[dst + 3] = 0xFFFF; } } @@ -154,121 +220,171 @@ pub(crate) fn bgr48_to_rgba_u16_row(bgr48: &[u16], rgba_u16_out: &mut [u16], wid /// Rgba64 → packed u8 RGB: drop alpha, narrow R/G/B via `>> 8`. /// +/// When `BE = true` each u16 element is byte-swapped on load. +/// /// Input stride: `width * 4` u16 elements, output: `width * 3` bytes. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn rgba64_to_rgb_row(rgba64: &[u16], rgb_out: &mut [u8], width: usize) { +pub(crate) fn rgba64_to_rgb_row(rgba64: &[u16], rgb_out: &mut [u8], width: usize) { debug_assert!(rgba64.len() >= width * 4, "rgba64 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); for x in 0..width { let src = x * 4; let dst = x * 3; - rgb_out[dst] = (rgba64[src] >> 8) as u8; - rgb_out[dst + 1] = (rgba64[src + 1] >> 8) as u8; - rgb_out[dst + 2] = (rgba64[src + 2] >> 8) as u8; + rgb_out[dst] = (load_u16::(rgba64[src]) >> 8) as u8; + rgb_out[dst + 1] = (load_u16::(rgba64[src + 1]) >> 8) as u8; + rgb_out[dst + 2] = (load_u16::(rgba64[src + 2]) >> 8) as u8; } } -/// Rgba64 → packed u16 RGB: drop alpha, copy R/G/B as-is. 
+/// Rgba64 → packed u16 RGB: drop alpha, copy R/G/B (with optional byte-swap). +/// +/// When `BE = true` each element is byte-swapped to produce host-native output. /// /// Input stride: `width * 4` u16 elements, output: `width * 3` u16 elements. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn rgba64_to_rgb_u16_row(rgba64: &[u16], rgb_u16_out: &mut [u16], width: usize) { +pub(crate) fn rgba64_to_rgb_u16_row( + rgba64: &[u16], + rgb_u16_out: &mut [u16], + width: usize, +) { debug_assert!(rgba64.len() >= width * 4, "rgba64 row too short"); debug_assert!(rgb_u16_out.len() >= width * 3, "rgb_u16_out row too short"); for x in 0..width { let src = x * 4; let dst = x * 3; - rgb_u16_out[dst] = rgba64[src]; - rgb_u16_out[dst + 1] = rgba64[src + 1]; - rgb_u16_out[dst + 2] = rgba64[src + 2]; + rgb_u16_out[dst] = load_u16::(rgba64[src]); + rgb_u16_out[dst + 1] = load_u16::(rgba64[src + 1]); + rgb_u16_out[dst + 2] = load_u16::(rgba64[src + 2]); } } /// Rgba64 → packed u8 RGBA: narrow all 4 channels via `>> 8` (source alpha passes through). /// +/// When `BE = true` each u16 element is byte-swapped on load. +/// /// Input and output stride: `width * 4` elements. 
#[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn rgba64_to_rgba_row(rgba64: &[u16], rgba_out: &mut [u8], width: usize) { +pub(crate) fn rgba64_to_rgba_row( + rgba64: &[u16], + rgba_out: &mut [u8], + width: usize, +) { debug_assert!(rgba64.len() >= width * 4, "rgba64 row too short"); debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short"); for x in 0..width { let i = x * 4; - rgba_out[i] = (rgba64[i] >> 8) as u8; - rgba_out[i + 1] = (rgba64[i + 1] >> 8) as u8; - rgba_out[i + 2] = (rgba64[i + 2] >> 8) as u8; - rgba_out[i + 3] = (rgba64[i + 3] >> 8) as u8; + rgba_out[i] = (load_u16::(rgba64[i]) >> 8) as u8; + rgba_out[i + 1] = (load_u16::(rgba64[i + 1]) >> 8) as u8; + rgba_out[i + 2] = (load_u16::(rgba64[i + 2]) >> 8) as u8; + rgba_out[i + 3] = (load_u16::(rgba64[i + 3]) >> 8) as u8; } } -/// Rgba64 → packed u16 RGBA: identity copy of all 4 channels. +/// Rgba64 → packed u16 RGBA: copy all 4 channels (with optional byte-swap). +/// +/// When `BE = true` each element is byte-swapped to produce host-native output. /// /// Input and output stride: `width * 4` u16 elements. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn rgba64_to_rgba_u16_row(rgba64: &[u16], rgba_u16_out: &mut [u16], width: usize) { +pub(crate) fn rgba64_to_rgba_u16_row( + rgba64: &[u16], + rgba_u16_out: &mut [u16], + width: usize, +) { debug_assert!(rgba64.len() >= width * 4, "rgba64 row too short"); debug_assert!( rgba_u16_out.len() >= width * 4, "rgba_u16_out row too short" ); - rgba_u16_out[..width * 4].copy_from_slice(&rgba64[..width * 4]); + if BE { + for i in 0..width * 4 { + rgba_u16_out[i] = u16::from_be(rgba64[i]); + } + } else { + // LE source: use the target-endian-aware load on each element so big-endian + // hosts also receive host-native u16 output. 
+ for i in 0..width * 4 { + rgba_u16_out[i] = u16::from_le(rgba64[i]); + } + } } // ---- Bgra64 family (4 u16 elements per pixel: B, G, R, A) ------------------ /// Bgra64 → packed u8 RGB: drop alpha, narrow via `>> 8`, swap B↔R on output. /// +/// When `BE = true` each u16 element is byte-swapped on load. +/// /// Source layout `[B, G, R, A]` → output layout `[R, G, B]`. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn bgra64_to_rgb_row(bgra64: &[u16], rgb_out: &mut [u8], width: usize) { +pub(crate) fn bgra64_to_rgb_row(bgra64: &[u16], rgb_out: &mut [u8], width: usize) { debug_assert!(bgra64.len() >= width * 4, "bgra64 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); for x in 0..width { let src = x * 4; let dst = x * 3; - rgb_out[dst] = (bgra64[src + 2] >> 8) as u8; // R (from position 2) - rgb_out[dst + 1] = (bgra64[src + 1] >> 8) as u8; // G (unchanged) - rgb_out[dst + 2] = (bgra64[src] >> 8) as u8; // B (from position 0) + rgb_out[dst] = (load_u16::(bgra64[src + 2]) >> 8) as u8; // R (from position 2) + rgb_out[dst + 1] = (load_u16::(bgra64[src + 1]) >> 8) as u8; // G (unchanged) + rgb_out[dst + 2] = (load_u16::(bgra64[src]) >> 8) as u8; // B (from position 0) } } -/// Bgra64 → packed u16 RGB: drop alpha, B↔R swap. +/// Bgra64 → packed u16 RGB: drop alpha, B↔R swap (+ optional byte-swap). +/// +/// When `BE = true` each element is byte-swapped to produce host-native output. /// /// Source layout `[B, G, R, A]` → output layout `[R, G, B]`. 
#[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn bgra64_to_rgb_u16_row(bgra64: &[u16], rgb_u16_out: &mut [u16], width: usize) { +pub(crate) fn bgra64_to_rgb_u16_row( + bgra64: &[u16], + rgb_u16_out: &mut [u16], + width: usize, +) { debug_assert!(bgra64.len() >= width * 4, "bgra64 row too short"); debug_assert!(rgb_u16_out.len() >= width * 3, "rgb_u16_out row too short"); for x in 0..width { let src = x * 4; let dst = x * 3; - rgb_u16_out[dst] = bgra64[src + 2]; // R - rgb_u16_out[dst + 1] = bgra64[src + 1]; // G - rgb_u16_out[dst + 2] = bgra64[src]; // B + rgb_u16_out[dst] = load_u16::(bgra64[src + 2]); // R + rgb_u16_out[dst + 1] = load_u16::(bgra64[src + 1]); // G + rgb_u16_out[dst + 2] = load_u16::(bgra64[src]); // B } } /// Bgra64 → packed u8 RGBA: narrow via `>> 8`, swap B↔R, pass through source alpha. /// +/// When `BE = true` each u16 element is byte-swapped on load. +/// /// Source layout `[B, G, R, A]` → output layout `[R, G, B, A]` (all narrowed `>> 8`). #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn bgra64_to_rgba_row(bgra64: &[u16], rgba_out: &mut [u8], width: usize) { +pub(crate) fn bgra64_to_rgba_row( + bgra64: &[u16], + rgba_out: &mut [u8], + width: usize, +) { debug_assert!(bgra64.len() >= width * 4, "bgra64 row too short"); debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short"); for x in 0..width { let src = x * 4; let dst = x * 4; - rgba_out[dst] = (bgra64[src + 2] >> 8) as u8; // R - rgba_out[dst + 1] = (bgra64[src + 1] >> 8) as u8; // G - rgba_out[dst + 2] = (bgra64[src] >> 8) as u8; // B - rgba_out[dst + 3] = (bgra64[src + 3] >> 8) as u8; // A + rgba_out[dst] = (load_u16::(bgra64[src + 2]) >> 8) as u8; // R + rgba_out[dst + 1] = (load_u16::(bgra64[src + 1]) >> 8) as u8; // G + rgba_out[dst + 2] = (load_u16::(bgra64[src]) >> 8) as u8; // B + rgba_out[dst + 3] = (load_u16::(bgra64[src + 3]) >> 8) as u8; // A } } -/// Bgra64 → packed u16 RGBA: B↔R swap, pass through source alpha unchanged. 
+/// Bgra64 → packed u16 RGBA: B↔R swap (+ optional byte-swap), pass through source alpha. +/// +/// When `BE = true` each element is byte-swapped to produce host-native output. /// /// Source layout `[B, G, R, A]` → output layout `[R, G, B, A]` (all native u16). #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn bgra64_to_rgba_u16_row(bgra64: &[u16], rgba_u16_out: &mut [u16], width: usize) { +pub(crate) fn bgra64_to_rgba_u16_row( + bgra64: &[u16], + rgba_u16_out: &mut [u16], + width: usize, +) { debug_assert!(bgra64.len() >= width * 4, "bgra64 row too short"); debug_assert!( rgba_u16_out.len() >= width * 4, @@ -277,10 +393,10 @@ pub(crate) fn bgra64_to_rgba_u16_row(bgra64: &[u16], rgba_u16_out: &mut [u16], w for x in 0..width { let src = x * 4; let dst = x * 4; - rgba_u16_out[dst] = bgra64[src + 2]; // R - rgba_u16_out[dst + 1] = bgra64[src + 1]; // G - rgba_u16_out[dst + 2] = bgra64[src]; // B - rgba_u16_out[dst + 3] = bgra64[src + 3]; // A (unchanged) + rgba_u16_out[dst] = load_u16::(bgra64[src + 2]); // R + rgba_u16_out[dst + 1] = load_u16::(bgra64[src + 1]); // G + rgba_u16_out[dst + 2] = load_u16::(bgra64[src]); // B + rgba_u16_out[dst + 3] = load_u16::(bgra64[src + 3]); // A (byte-order corrected) } } @@ -297,7 +413,7 @@ mod tests { fn rgb48_to_rgb_u16_all_white_passthrough() { let src = std::vec![0xFFFFu16; 3 * 4]; let mut out = std::vec![0u16; 3 * 4]; - rgb48_to_rgb_u16_row(&src, &mut out, 4); + rgb48_to_rgb_u16_row::(&src, &mut out, 4); assert!( out.iter().all(|&v| v == 0xFFFF), "expected all 0xFFFF, got {out:?}" @@ -309,7 +425,7 @@ mod tests { fn rgb48_to_rgb_all_white_narrow() { let src = std::vec![0xFFFFu16; 3 * 4]; let mut out = std::vec![0u8; 3 * 4]; - rgb48_to_rgb_row(&src, &mut out, 4); + rgb48_to_rgb_row::(&src, &mut out, 4); assert!( out.iter().all(|&v| v == 0xFF), "expected all 0xFF, got {out:?}" @@ -321,7 +437,7 @@ mod tests { fn rgb48_to_rgb_narrow_known_value() { let src = [0x1234u16, 0x5678, 0x9ABC]; let mut out = [0u8; 3]; - 
rgb48_to_rgb_row(&src, &mut out, 1); + rgb48_to_rgb_row::(&src, &mut out, 1); assert_eq!(out[0], 0x12, "R channel"); assert_eq!(out[1], 0x56, "G channel"); assert_eq!(out[2], 0x9A, "B channel"); @@ -332,7 +448,7 @@ mod tests { fn rgb48_to_rgba_forces_alpha_0xff() { let src = [0xAAAAu16, 0xBBBB, 0xCCCC]; let mut out = [0u8; 4]; - rgb48_to_rgba_row(&src, &mut out, 1); + rgb48_to_rgba_row::(&src, &mut out, 1); assert_eq!(out[3], 0xFF, "alpha must be 0xFF"); assert_eq!(out[0], 0xAA, "R"); assert_eq!(out[1], 0xBB, "G"); @@ -344,7 +460,7 @@ mod tests { fn rgb48_to_rgba_u16_forces_alpha_0xffff() { let src = [0xAAAAu16, 0xBBBB, 0xCCCC]; let mut out = [0u16; 4]; - rgb48_to_rgba_u16_row(&src, &mut out, 1); + rgb48_to_rgba_u16_row::(&src, &mut out, 1); assert_eq!(out[0], 0xAAAA, "R"); assert_eq!(out[1], 0xBBBB, "G"); assert_eq!(out[2], 0xCCCC, "B"); @@ -358,7 +474,7 @@ mod tests { fn bgr48_to_rgb_u16_all_white_passthrough() { let src = std::vec![0xFFFFu16; 3 * 3]; let mut out = std::vec![0u16; 3 * 3]; - bgr48_to_rgb_u16_row(&src, &mut out, 3); + bgr48_to_rgb_u16_row::(&src, &mut out, 3); assert!(out.iter().all(|&v| v == 0xFFFF), "expected all 0xFFFF"); } @@ -367,7 +483,7 @@ mod tests { fn bgr48_to_rgb_all_white_narrow() { let src = std::vec![0xFFFFu16; 3 * 3]; let mut out = std::vec![0u8; 3 * 3]; - bgr48_to_rgb_row(&src, &mut out, 3); + bgr48_to_rgb_row::(&src, &mut out, 3); assert!(out.iter().all(|&v| v == 0xFF), "expected all 0xFF"); } @@ -378,7 +494,7 @@ mod tests { // Source pixel in BGR order: B=0x1234, G=0x5678, R=0x9ABC let src = [0x1234u16, 0x5678, 0x9ABC]; let mut out = [0u16; 3]; - bgr48_to_rgb_u16_row(&src, &mut out, 1); + bgr48_to_rgb_u16_row::(&src, &mut out, 1); assert_eq!(out[0], 0x9ABC, "R (was at src[2])"); assert_eq!(out[1], 0x5678, "G (unchanged)"); assert_eq!(out[2], 0x1234, "B (was at src[0])"); @@ -389,7 +505,7 @@ mod tests { fn bgr48_to_rgb_channel_order_and_narrow() { let src = [0x1200u16, 0x5600, 0x9A00]; let mut out = [0u8; 3]; - 
bgr48_to_rgb_row(&src, &mut out, 1); + bgr48_to_rgb_row::(&src, &mut out, 1); assert_eq!(out[0], 0x9A, "R"); assert_eq!(out[1], 0x56, "G"); assert_eq!(out[2], 0x12, "B"); @@ -400,7 +516,7 @@ mod tests { fn bgr48_to_rgba_channel_order_and_alpha() { let src = [0x1100u16, 0x2200, 0x3300]; let mut out = [0u8; 4]; - bgr48_to_rgba_row(&src, &mut out, 1); + bgr48_to_rgba_row::(&src, &mut out, 1); assert_eq!(out[0], 0x33, "R"); assert_eq!(out[1], 0x22, "G"); assert_eq!(out[2], 0x11, "B"); @@ -412,7 +528,7 @@ mod tests { fn bgr48_to_rgba_u16_channel_order_and_alpha() { let src = [0x1111u16, 0x2222, 0x3333]; let mut out = [0u16; 4]; - bgr48_to_rgba_u16_row(&src, &mut out, 1); + bgr48_to_rgba_u16_row::(&src, &mut out, 1); assert_eq!(out[0], 0x3333, "R"); assert_eq!(out[1], 0x2222, "G"); assert_eq!(out[2], 0x1111, "B"); @@ -426,7 +542,7 @@ mod tests { fn rgba64_to_rgba_u16_all_white_passthrough() { let src = std::vec![0xFFFFu16; 4 * 3]; let mut out = std::vec![0u16; 4 * 3]; - rgba64_to_rgba_u16_row(&src, &mut out, 3); + rgba64_to_rgba_u16_row::(&src, &mut out, 3); assert!(out.iter().all(|&v| v == 0xFFFF), "expected all 0xFFFF"); } @@ -435,7 +551,7 @@ mod tests { fn rgba64_to_rgba_all_white_narrow() { let src = std::vec![0xFFFFu16; 4 * 3]; let mut out = std::vec![0u8; 4 * 3]; - rgba64_to_rgba_row(&src, &mut out, 3); + rgba64_to_rgba_row::(&src, &mut out, 3); assert!(out.iter().all(|&v| v == 0xFF), "expected all 0xFF"); } @@ -445,7 +561,7 @@ mod tests { // R=0x1111, G=0x2222, B=0x3333, A=0xABCD let src = [0x1111u16, 0x2222, 0x3333, 0xABCD]; let mut out = [0u16; 4]; - rgba64_to_rgba_u16_row(&src, &mut out, 1); + rgba64_to_rgba_u16_row::(&src, &mut out, 1); assert_eq!(out[0], 0x1111, "R"); assert_eq!(out[1], 0x2222, "G"); assert_eq!(out[2], 0x3333, "B"); @@ -457,7 +573,7 @@ mod tests { fn rgba64_to_rgba_source_alpha_depth_converted() { let src = [0x1100u16, 0x2200, 0x3300, 0xABCD]; let mut out = [0u8; 4]; - rgba64_to_rgba_row(&src, &mut out, 1); + rgba64_to_rgba_row::(&src, &mut 
out, 1); assert_eq!(out[0], 0x11, "R"); assert_eq!(out[1], 0x22, "G"); assert_eq!(out[2], 0x33, "B"); @@ -469,7 +585,7 @@ mod tests { fn rgba64_to_rgb_drops_alpha() { let src = [0x1100u16, 0x2200, 0x3300, 0xDEAD]; let mut out = [0u8; 3]; - rgba64_to_rgb_row(&src, &mut out, 1); + rgba64_to_rgb_row::(&src, &mut out, 1); assert_eq!(out[0], 0x11, "R"); assert_eq!(out[1], 0x22, "G"); assert_eq!(out[2], 0x33, "B"); @@ -480,7 +596,7 @@ mod tests { fn rgba64_to_rgb_u16_drops_alpha() { let src = [0x1111u16, 0x2222, 0x3333, 0xDEAD]; let mut out = [0u16; 3]; - rgba64_to_rgb_u16_row(&src, &mut out, 1); + rgba64_to_rgb_u16_row::(&src, &mut out, 1); assert_eq!(out[0], 0x1111, "R"); assert_eq!(out[1], 0x2222, "G"); assert_eq!(out[2], 0x3333, "B"); @@ -493,7 +609,7 @@ mod tests { fn bgra64_to_rgba_u16_all_white_passthrough() { let src = std::vec![0xFFFFu16; 4 * 2]; let mut out = std::vec![0u16; 4 * 2]; - bgra64_to_rgba_u16_row(&src, &mut out, 2); + bgra64_to_rgba_u16_row::(&src, &mut out, 2); assert!(out.iter().all(|&v| v == 0xFFFF), "expected all 0xFFFF"); } @@ -502,7 +618,7 @@ mod tests { fn bgra64_to_rgba_all_white_narrow() { let src = std::vec![0xFFFFu16; 4 * 2]; let mut out = std::vec![0u8; 4 * 2]; - bgra64_to_rgba_row(&src, &mut out, 2); + bgra64_to_rgba_row::(&src, &mut out, 2); assert!(out.iter().all(|&v| v == 0xFF), "expected all 0xFF"); } @@ -512,7 +628,7 @@ mod tests { // Source in BGRA order: B=0x1111, G=0x2222, R=0x3333, A=0x4444 let src = [0x1111u16, 0x2222, 0x3333, 0x4444]; let mut out = [0u16; 4]; - bgra64_to_rgba_u16_row(&src, &mut out, 1); + bgra64_to_rgba_u16_row::(&src, &mut out, 1); assert_eq!(out[0], 0x3333, "R (from src[2])"); assert_eq!(out[1], 0x2222, "G (unchanged)"); assert_eq!(out[2], 0x1111, "B (from src[0])"); @@ -524,7 +640,7 @@ mod tests { fn bgra64_to_rgba_channel_order_and_alpha_narrowed() { let src = [0x1100u16, 0x2200, 0x3300, 0xAB00]; let mut out = [0u8; 4]; - bgra64_to_rgba_row(&src, &mut out, 1); + bgra64_to_rgba_row::(&src, &mut out, 1); 
assert_eq!(out[0], 0x33, "R"); assert_eq!(out[1], 0x22, "G"); assert_eq!(out[2], 0x11, "B"); @@ -536,7 +652,7 @@ mod tests { fn bgra64_to_rgb_drops_alpha_and_swaps() { let src = [0x1100u16, 0x2200, 0x3300, 0xDEAD]; let mut out = [0u8; 3]; - bgra64_to_rgb_row(&src, &mut out, 1); + bgra64_to_rgb_row::(&src, &mut out, 1); assert_eq!(out[0], 0x33, "R"); assert_eq!(out[1], 0x22, "G"); assert_eq!(out[2], 0x11, "B"); @@ -547,7 +663,7 @@ mod tests { fn bgra64_to_rgb_u16_drops_alpha_and_swaps() { let src = [0x1111u16, 0x2222, 0x3333, 0xDEAD]; let mut out = [0u16; 3]; - bgra64_to_rgb_u16_row(&src, &mut out, 1); + bgra64_to_rgb_u16_row::(&src, &mut out, 1); assert_eq!(out[0], 0x3333, "R"); assert_eq!(out[1], 0x2222, "G"); assert_eq!(out[2], 0x1111, "B"); @@ -564,7 +680,7 @@ mod tests { 0x1100u16, 0x2200, 0x3300, 0x4400, 0x5500, 0x6600, 0x7700, 0x8800, 0x9900, ]; let mut out = [0u8; 9]; - rgb48_to_rgb_row(&src, &mut out, 3); + rgb48_to_rgb_row::(&src, &mut out, 3); assert_eq!(out[0], 0x11); assert_eq!(out[1], 0x22); assert_eq!(out[2], 0x33); @@ -584,7 +700,7 @@ mod tests { 0x5555, 0x6666, 0x7777, 0x8888, // pixel 1 ]; let mut out = [0u16; 8]; - rgba64_to_rgba_u16_row(&src, &mut out, 2); + rgba64_to_rgba_u16_row::(&src, &mut out, 2); assert_eq!(&out, &src, "identity copy must be byte-exact"); } @@ -598,8 +714,8 @@ mod tests { let mut rgb48_out = [0u8; 3]; let mut bgr48_out = [0u8; 3]; - rgb48_to_rgb_row(&rgb48_src, &mut rgb48_out, 1); - bgr48_to_rgb_row(&bgr48_src, &mut bgr48_out, 1); + rgb48_to_rgb_row::(&rgb48_src, &mut rgb48_out, 1); + bgr48_to_rgb_row::(&bgr48_src, &mut bgr48_out, 1); assert_eq!( rgb48_out, bgr48_out, diff --git a/src/sinker/mixed/packed_rgb_10bit.rs b/src/sinker/mixed/packed_rgb_10bit.rs index 4470663d..ea0a62f5 100644 --- a/src/sinker/mixed/packed_rgb_10bit.rs +++ b/src/sinker/mixed/packed_rgb_10bit.rs @@ -149,7 +149,7 @@ impl PixelSink for MixedSinker<'_, X2Rgb10> { w, h, )?; - x2rgb10_to_rgb_row(x2rgb10_in, rgb_row, w, use_simd); + 
x2rgb10_to_rgb_row::(x2rgb10_in, rgb_row, w, use_simd); if let Some(luma) = luma.as_deref_mut() { rgb_to_luma_row( @@ -177,7 +177,7 @@ impl PixelSink for MixedSinker<'_, X2Rgb10> { // u8 RGBA output (single-pass, dedicated kernel forces alpha). if let Some(buf) = rgba.as_deref_mut() { let rgba_row = rgba_plane_row_slice(buf, one_plane_start, one_plane_end, w, h)?; - x2rgb10_to_rgba_row(x2rgb10_in, rgba_row, w, use_simd); + x2rgb10_to_rgba_row::(x2rgb10_in, rgba_row, w, use_simd); } // u16 native RGB output (10-bit precision preserved). @@ -193,7 +193,7 @@ impl PixelSink for MixedSinker<'_, X2Rgb10> { })?; let rgb_plane_start = one_plane_start * 3; let rgb_u16_row = &mut rgb_u16_buf[rgb_plane_start..rgb_plane_end]; - x2rgb10_to_rgb_u16_row(x2rgb10_in, rgb_u16_row, w, use_simd); + x2rgb10_to_rgb_u16_row::(x2rgb10_in, rgb_u16_row, w, use_simd); } Ok(()) @@ -307,7 +307,7 @@ impl PixelSink for MixedSinker<'_, X2Bgr10> { w, h, )?; - x2bgr10_to_rgb_row(x2bgr10_in, rgb_row, w, use_simd); + x2bgr10_to_rgb_row::(x2bgr10_in, rgb_row, w, use_simd); if let Some(luma) = luma.as_deref_mut() { rgb_to_luma_row( @@ -334,7 +334,7 @@ impl PixelSink for MixedSinker<'_, X2Bgr10> { if let Some(buf) = rgba.as_deref_mut() { let rgba_row = rgba_plane_row_slice(buf, one_plane_start, one_plane_end, w, h)?; - x2bgr10_to_rgba_row(x2bgr10_in, rgba_row, w, use_simd); + x2bgr10_to_rgba_row::(x2bgr10_in, rgba_row, w, use_simd); } if want_rgb_u16 { @@ -349,7 +349,7 @@ impl PixelSink for MixedSinker<'_, X2Bgr10> { })?; let rgb_plane_start = one_plane_start * 3; let rgb_u16_row = &mut rgb_u16_buf[rgb_plane_start..rgb_plane_end]; - x2bgr10_to_rgb_u16_row(x2bgr10_in, rgb_u16_row, w, use_simd); + x2bgr10_to_rgb_u16_row::(x2bgr10_in, rgb_u16_row, w, use_simd); } Ok(()) diff --git a/src/sinker/mixed/packed_rgb_16bit.rs b/src/sinker/mixed/packed_rgb_16bit.rs index 837b8246..f09315c6 100644 --- a/src/sinker/mixed/packed_rgb_16bit.rs +++ b/src/sinker/mixed/packed_rgb_16bit.rs @@ -206,7 +206,7 @@ impl 
PixelSink for MixedSinker<'_, Rgb48> { // with_luma_u16, or with_hsv is attached. if need_u8_rgb { let rgb_row = rgb_row_buf_or_scratch(rgb.as_deref_mut(), rgb_scratch, ps, pe, w, h)?; - rgb48_to_rgb_row(in48, rgb_row, w, use_simd); + rgb48_to_rgb_row::(in48, rgb_row, w, use_simd); if let Some(luma_buf) = luma.as_deref_mut() { rgb_to_luma_row( @@ -245,7 +245,7 @@ impl PixelSink for MixedSinker<'_, Rgb48> { // u8 RGBA — single-pass kernel, alpha forced to 0xFF. if let Some(buf) = rgba.as_deref_mut() { let rgba_row = rgba_plane_row_slice(buf, ps, pe, w, h)?; - rgb48_to_rgba_row(in48, rgba_row, w, use_simd); + rgb48_to_rgba_row::(in48, rgba_row, w, use_simd); } // u16 RGB — native passthrough. @@ -257,13 +257,13 @@ impl PixelSink for MixedSinker<'_, Rgb48> { height: h, channels: 3, })?; - rgb48_to_rgb_u16_row(in48, &mut buf[ps * 3..end], w, use_simd); + rgb48_to_rgb_u16_row::(in48, &mut buf[ps * 3..end], w, use_simd); } // u16 RGBA — native passthrough, alpha forced to 0xFFFF. if let Some(buf) = rgba_u16.as_deref_mut() { let rgba_u16_row = rgba_u16_plane_row_slice(buf, ps, pe, w, h)?; - rgb48_to_rgba_u16_row(in48, rgba_u16_row, w, use_simd); + rgb48_to_rgba_u16_row::(in48, rgba_u16_row, w, use_simd); } Ok(()) @@ -426,7 +426,7 @@ impl PixelSink for MixedSinker<'_, Bgr48> { if need_u8_rgb { let rgb_row = rgb_row_buf_or_scratch(rgb.as_deref_mut(), rgb_scratch, ps, pe, w, h)?; - bgr48_to_rgb_row(in48, rgb_row, w, use_simd); + bgr48_to_rgb_row::(in48, rgb_row, w, use_simd); if let Some(luma_buf) = luma.as_deref_mut() { rgb_to_luma_row( @@ -464,7 +464,7 @@ impl PixelSink for MixedSinker<'_, Bgr48> { if let Some(buf) = rgba.as_deref_mut() { let rgba_row = rgba_plane_row_slice(buf, ps, pe, w, h)?; - bgr48_to_rgba_row(in48, rgba_row, w, use_simd); + bgr48_to_rgba_row::(in48, rgba_row, w, use_simd); } if let Some(buf) = rgb_u16.as_deref_mut() { @@ -475,12 +475,12 @@ impl PixelSink for MixedSinker<'_, Bgr48> { height: h, channels: 3, })?; - bgr48_to_rgb_u16_row(in48, &mut buf[ps 
* 3..end], w, use_simd); + bgr48_to_rgb_u16_row::(in48, &mut buf[ps * 3..end], w, use_simd); } if let Some(buf) = rgba_u16.as_deref_mut() { let rgba_u16_row = rgba_u16_plane_row_slice(buf, ps, pe, w, h)?; - bgr48_to_rgba_u16_row(in48, rgba_u16_row, w, use_simd); + bgr48_to_rgba_u16_row::(in48, rgba_u16_row, w, use_simd); } Ok(()) @@ -667,7 +667,7 @@ impl PixelSink for MixedSinker<'_, Rgba64> { if want_rgba && !need_u8_rgb && !want_rgb_u16 && !want_rgba_u16 { let rgba_buf = rgba.as_deref_mut().unwrap(); let rgba_row = rgba_plane_row_slice(rgba_buf, ps, pe, w, h)?; - rgba64_to_rgba_row(in64, rgba_row, w, use_simd); + rgba64_to_rgba_row::(in64, rgba_row, w, use_simd); return Ok(()); } @@ -675,7 +675,7 @@ impl PixelSink for MixedSinker<'_, Rgba64> { if want_rgba_u16 && !want_rgb_u16 && !need_u8_rgb && !want_rgba { let rgba_u16_buf = rgba_u16.as_deref_mut().unwrap(); let rgba_u16_row = rgba_u16_plane_row_slice(rgba_u16_buf, ps, pe, w, h)?; - rgba64_to_rgba_u16_row(in64, rgba_u16_row, w, use_simd); + rgba64_to_rgba_u16_row::(in64, rgba_u16_row, w, use_simd); return Ok(()); } @@ -683,7 +683,7 @@ impl PixelSink for MixedSinker<'_, Rgba64> { // and Strategy A+ RGBA fan-out. 
if need_u8_rgb { let rgb_row = rgb_row_buf_or_scratch(rgb.as_deref_mut(), rgb_scratch, ps, pe, w, h)?; - rgba64_to_rgb_row(in64, rgb_row, w, use_simd); + rgba64_to_rgb_row::(in64, rgb_row, w, use_simd); if let Some(luma_buf) = luma.as_deref_mut() { rgb_to_luma_row( @@ -735,7 +735,7 @@ impl PixelSink for MixedSinker<'_, Rgba64> { if want_rgba && !need_u8_rgb { let rgba_buf = rgba.as_deref_mut().unwrap(); let rgba_row = rgba_plane_row_slice(rgba_buf, ps, pe, w, h)?; - rgba64_to_rgba_row(in64, rgba_row, w, use_simd); + rgba64_to_rgba_row::(in64, rgba_row, w, use_simd); } // ===== u16 path ===== @@ -750,7 +750,7 @@ impl PixelSink for MixedSinker<'_, Rgba64> { channels: 3, })?; let rgb_u16_row = &mut rgb_u16_buf[ps * 3..end]; - rgba64_to_rgb_u16_row(in64, rgb_u16_row, w, use_simd); + rgba64_to_rgb_u16_row::(in64, rgb_u16_row, w, use_simd); // Strategy A+ u16: RGBA u16 also attached — derive from the // just-computed u16 RGB row (writes α=0xFFFF), then overwrite α @@ -768,7 +768,7 @@ impl PixelSink for MixedSinker<'_, Rgba64> { if want_rgba_u16 && !want_rgb_u16 { let rgba_u16_buf = rgba_u16.as_deref_mut().unwrap(); let rgba_u16_row = rgba_u16_plane_row_slice(rgba_u16_buf, ps, pe, w, h)?; - rgba64_to_rgba_u16_row(in64, rgba_u16_row, w, use_simd); + rgba64_to_rgba_u16_row::(in64, rgba_u16_row, w, use_simd); } Ok(()) @@ -940,7 +940,7 @@ impl PixelSink for MixedSinker<'_, Bgra64> { if want_rgba && !need_u8_rgb && !want_rgb_u16 && !want_rgba_u16 { let rgba_buf = rgba.as_deref_mut().unwrap(); let rgba_row = rgba_plane_row_slice(rgba_buf, ps, pe, w, h)?; - bgra64_to_rgba_row(in64, rgba_row, w, use_simd); + bgra64_to_rgba_row::(in64, rgba_row, w, use_simd); return Ok(()); } @@ -948,14 +948,14 @@ impl PixelSink for MixedSinker<'_, Bgra64> { if want_rgba_u16 && !want_rgb_u16 && !need_u8_rgb && !want_rgba { let rgba_u16_buf = rgba_u16.as_deref_mut().unwrap(); let rgba_u16_row = rgba_u16_plane_row_slice(rgba_u16_buf, ps, pe, w, h)?; - bgra64_to_rgba_u16_row(in64, rgba_u16_row, w, 
use_simd); + bgra64_to_rgba_u16_row::(in64, rgba_u16_row, w, use_simd); return Ok(()); } // u8 RGB staging path. if need_u8_rgb { let rgb_row = rgb_row_buf_or_scratch(rgb.as_deref_mut(), rgb_scratch, ps, pe, w, h)?; - bgra64_to_rgb_row(in64, rgb_row, w, use_simd); + bgra64_to_rgb_row::(in64, rgb_row, w, use_simd); if let Some(luma_buf) = luma.as_deref_mut() { rgb_to_luma_row( @@ -1003,7 +1003,7 @@ impl PixelSink for MixedSinker<'_, Bgra64> { if want_rgba && !need_u8_rgb { let rgba_buf = rgba.as_deref_mut().unwrap(); let rgba_row = rgba_plane_row_slice(rgba_buf, ps, pe, w, h)?; - bgra64_to_rgba_row(in64, rgba_row, w, use_simd); + bgra64_to_rgba_row::(in64, rgba_row, w, use_simd); } // u16 RGB path. @@ -1017,7 +1017,7 @@ impl PixelSink for MixedSinker<'_, Bgra64> { channels: 3, })?; let rgb_u16_row = &mut rgb_u16_buf[ps * 3..end]; - bgra64_to_rgb_u16_row(in64, rgb_u16_row, w, use_simd); + bgra64_to_rgb_u16_row::(in64, rgb_u16_row, w, use_simd); // Strategy A+ u16: RGBA u16 also attached. if want_rgba_u16 { @@ -1032,7 +1032,7 @@ impl PixelSink for MixedSinker<'_, Bgra64> { if want_rgba_u16 && !want_rgb_u16 { let rgba_u16_buf = rgba_u16.as_deref_mut().unwrap(); let rgba_u16_row = rgba_u16_plane_row_slice(rgba_u16_buf, ps, pe, w, h)?; - bgra64_to_rgba_u16_row(in64, rgba_u16_row, w, use_simd); + bgra64_to_rgba_u16_row::(in64, rgba_u16_row, w, use_simd); } Ok(())