From 6f6e21321fac1fb47f3a46a64f86c268e057fb42 Mon Sep 17 00:00:00 2001 From: uqio <276879906+uqio@users.noreply.github.com> Date: Sun, 26 Apr 2026 21:51:24 +1200 Subject: [PATCH 1/5] update --- src/row/scalar.rs | 336 ++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 298 insertions(+), 38 deletions(-) diff --git a/src/row/scalar.rs b/src/row/scalar.rs index 71d1018..c25a9e7 100644 --- a/src/row/scalar.rs +++ b/src/row/scalar.rs @@ -534,6 +534,43 @@ pub(crate) fn expand_rgb_to_rgba_row(rgb: &[u8], rgba_out: &mut [u8], width: usi } } +/// `u16` analogue of [`expand_rgb_to_rgba_row`]: copy each `u16` RGB +/// triple into a `u16` RGBA quadruple, with the alpha element set to +/// `(1 << BITS) - 1` (opaque maximum at the input bit depth). Used by +/// `MixedSinker` Strategy A on the **u16** path when both +/// `with_rgb_u16` and `with_rgba_u16` are attached — runs the YUV→RGB +/// math once into the u16 RGB buffer, then this helper fans out to the +/// u16 RGBA buffer with no second per-pixel kernel call. +/// +/// `BITS` is a `const` parameter so the alpha constant resolves at +/// compile time per format (10 / 12 / 16 etc.); the compiler folds the +/// `(1 << BITS) - 1` expression to a literal in each monomorphization. +/// +/// # Panics (debug builds) +/// +/// - `rgb.len() >= 3 * width` (`u16` elements) +/// - `rgba_out.len() >= 4 * width` (`u16` elements) +#[cfg(any(feature = "std", feature = "alloc"))] +#[cfg_attr(not(tarpaulin), inline(always))] +pub(crate) fn expand_rgb_u16_to_rgba_u16_row( + rgb: &[u16], + rgba_out: &mut [u16], + width: usize, +) { + debug_assert!(rgb.len() >= width * 3, "rgb row too short"); + debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short"); + let alpha_max: u16 = ((1u32 << BITS) - 1) as u16; + for (rgb_px, rgba_px) in rgb[..width * 3] + .chunks_exact(3) + .zip(rgba_out[..width * 4].chunks_exact_mut(4)) + { + rgba_px[0] = rgb_px[0]; + rgba_px[1] = rgb_px[1]; + rgba_px[2] = rgb_px[2]; + rgba_px[3] = alpha_max; + } +} + // ---- High-bit-depth YUV 4:2:0 → RGB (BITS ∈ {10, 12, 14}) ------------- /// Converts one row of high-bit-depth 4:2:0 YUV (`u16` samples in the @@ -551,6 +588,8 @@ pub(crate) fn expand_rgb_to_rgba_row(rgb: &[u8], rgba_out: &mut [u8], width: usi /// source precision inline rather than converting first at `BITS` and /// then downshifting. This keeps the fast path a single Q15 shift. /// +/// Thin wrapper over [`yuv_420p_n_to_rgb_or_rgba_row`] with `ALPHA = false`. +/// /// # Panics (debug builds) /// /// - `width` must be even. @@ -565,18 +604,74 @@ pub(crate) fn yuv_420p_n_to_rgb_row( width: usize, matrix: ColorMatrix, full_range: bool, +) { + yuv_420p_n_to_rgb_or_rgba_row::( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); +} + +/// Converts one row of high‑bit‑depth 4:2:0 YUV (`u16` samples in the +/// low `BITS` bits) directly to **8-bit** packed **RGBA**. Same numerical +/// contract as [`yuv_420p_n_to_rgb_row`]; the only differences are the +/// per-pixel stride (4 vs 3) and the alpha byte (`0xFF`, opaque). +/// +/// Thin wrapper over [`yuv_420p_n_to_rgb_or_rgba_row`] with `ALPHA = true`. +/// +/// # Panics (debug builds) +/// +/// - `width` must be even. +/// - `rgba_out.len() >= 4 * width` (other slices: same as RGB variant). +#[cfg_attr(not(tarpaulin), inline(always))] +pub(crate) fn yuv_420p_n_to_rgba_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + yuv_420p_n_to_rgb_or_rgba_row::( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); +} + +/// Shared kernel for [`yuv_420p_n_to_rgb_row`] (`ALPHA = false`, +/// 3 bpp store) and [`yuv_420p_n_to_rgba_row`] (`ALPHA = true`, +/// 4 bpp store with constant `0xFF` alpha). +/// +/// The compiler monomorphizes into two separate functions; the +/// `if ALPHA` branches are DCE'd at each call site. +/// +/// # Panics (debug builds) +/// +/// - `width` must be even. +/// - `y.len() >= width`, `u_half.len() >= width / 2`, +/// `v_half.len() >= width / 2`, +/// `out.len() >= width * if ALPHA { 4 } else { 3 }`. +#[cfg_attr(not(tarpaulin), inline(always))] +pub(crate) fn yuv_420p_n_to_rgb_or_rgba_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, ) { // Compile-time guard — fails monomorphization for any BITS outside // {9, 10, 12, 14}. 16 would overflow the Q15 chroma sum (16-bit lives // in `yuv_420p16_to_rgb_row`'s i64 chroma family); 8 belongs to the - // non-const-generic `yuv_420_to_rgb_row`. Without this guard a release - // build instantiating ::<16> would silently produce wrong output. + // non-const-generic `yuv_420_to_rgb_or_rgba_row`. Without this guard a + // release build instantiating ::<16, _> would silently produce wrong + // output. const { assert!(BITS == 9 || BITS == 10 || BITS == 12 || BITS == 14) }; + let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); debug_assert!(y.len() >= width, "y row too short"); debug_assert!(u_half.len() >= width / 2, "u_half row too short"); debug_assert!(v_half.len() >= width / 2, "v_half row too short"); - debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); + debug_assert!(out.len() >= width * bpp, "out row too short"); let coeffs = Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = range_params_n::(full_range); @@ -603,14 +698,20 @@ pub(crate) fn yuv_420p_n_to_rgb_row( let b_chroma = q15_chroma(coeffs.b_u(), u_d, coeffs.b_v(), v_d); let y0 = q15_scale((y[x] & mask) as i32 - y_off, y_scale); - rgb_out[x * 3] = clamp_u8(y0 + r_chroma); - rgb_out[x * 3 + 1] = clamp_u8(y0 + g_chroma); - rgb_out[x * 3 + 2] = clamp_u8(y0 + b_chroma); + out[x * bpp] = clamp_u8(y0 + r_chroma); + out[x * bpp + 1] = clamp_u8(y0 + g_chroma); + out[x * bpp + 2] = clamp_u8(y0 + b_chroma); + if ALPHA { + out[x * bpp + 3] = 0xFF; + } let y1 = q15_scale((y[x + 1] & mask) as i32 - y_off, y_scale); - rgb_out[(x + 1) * 3] = clamp_u8(y1 + r_chroma); - rgb_out[(x + 1) * 3 + 1] = clamp_u8(y1 + g_chroma); - rgb_out[(x + 1) * 3 + 2] = clamp_u8(y1 + b_chroma); + out[(x + 1) * bpp] = clamp_u8(y1 + r_chroma); + out[(x + 1) * bpp + 1] = clamp_u8(y1 + g_chroma); + out[(x + 1) * bpp + 2] = clamp_u8(y1 + b_chroma); + if ALPHA { + out[(x + 1) * bpp + 3] = 0xFF; + } x += 2; } @@ -650,6 +751,8 @@ fn q15_chroma(c_u: i32, u_d: i32, c_v: i32, v_d: i32) -> i32 { /// downstream work. Callers who only need 8‑bit output should prefer /// [`yuv_420p_n_to_rgb_row`], which is ~2× faster. /// +/// Thin wrapper over [`yuv_420p_n_to_rgb_or_rgba_u16_row`] with `ALPHA = false`. +/// /// # Panics (debug builds) /// /// - `width` must be even. @@ -665,25 +768,79 @@ pub(crate) fn yuv_420p_n_to_rgb_u16_row( matrix: ColorMatrix, full_range: bool, ) { - // Compile-time guard — see note on `yuv_420p_n_to_rgb_row`. The - // 16-bit u16-output path is `yuv_420p16_to_rgb_u16_row` (i64 chroma - // family). + yuv_420p_n_to_rgb_or_rgba_u16_row::( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); +} + +/// Converts one row of high‑bit‑depth 4:2:0 YUV → **native‑depth `u16` +/// packed RGBA**. Same numerical contract as +/// [`yuv_420p_n_to_rgb_u16_row`]; the only differences are the +/// per-pixel stride (4 vs 3 `u16` elements) and the alpha element, +/// `(1 << BITS) - 1` (opaque maximum at the input bit depth). +/// +/// Thin wrapper over [`yuv_420p_n_to_rgb_or_rgba_u16_row`] with `ALPHA = true`. +/// +/// # Panics (debug builds) +/// +/// - `width` must be even. +/// - `rgba_out.len() >= 4 * width` (other slices: same as RGB variant). +#[cfg_attr(not(tarpaulin), inline(always))] +pub(crate) fn yuv_420p_n_to_rgba_u16_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + yuv_420p_n_to_rgb_or_rgba_u16_row::( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); +} + +/// Shared kernel for [`yuv_420p_n_to_rgb_u16_row`] (`ALPHA = false`, +/// 3 bpp store) and [`yuv_420p_n_to_rgba_u16_row`] (`ALPHA = true`, +/// 4 bpp store with opaque alpha = `(1 << BITS) - 1`). +/// +/// # Panics (debug builds) +/// +/// - `width` must be even. +/// - `y.len() >= width`, `u_half.len() >= width / 2`, +/// `v_half.len() >= width / 2`, +/// `out.len() >= width * if ALPHA { 4 } else { 3 }` (`u16` elements). +#[cfg_attr(not(tarpaulin), inline(always))] +pub(crate) fn yuv_420p_n_to_rgb_or_rgba_u16_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + // Compile-time guard — see note on `yuv_420p_n_to_rgb_or_rgba_row`. + // The 16-bit u16-output path is `yuv_420p16_to_rgb_or_rgba_u16_row` + // (i64 chroma family). const { assert!(BITS == 9 || BITS == 10 || BITS == 12 || BITS == 14) }; + let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); debug_assert!(y.len() >= width, "y row too short"); debug_assert!(u_half.len() >= width / 2, "u_half row too short"); debug_assert!(v_half.len() >= width / 2, "v_half row too short"); - debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); + debug_assert!(out.len() >= width * bpp, "out row too short"); let coeffs = Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = range_params_n::(full_range); let bias = chroma_bias::(); let out_max: i32 = (1i32 << BITS) - 1; let mask = bits_mask::(); + let alpha_max: u16 = out_max as u16; // Every sample AND‑masked to the low `BITS` bits — see matching - // comment in [`yuv_420p_n_to_rgb_row`]. Critical for the native‑ - // depth u16 output path: `range_params_n::<10, 10>` uses + // comment in [`yuv_420p_n_to_rgb_or_rgba_row`]. Critical for the + // native‑depth u16 output path: `range_params_n::<10, 10>` uses // `y_scale = c_scale = 32768` (unit Q15 for BITS==OUT_BITS full // range), so an unmasked out‑of‑range sample would push `u_d` / // `v_d` to ±32256 and the subsequent `coeff * v_d` exceeds i16 @@ -700,14 +857,20 @@ pub(crate) fn yuv_420p_n_to_rgb_u16_row( let b_chroma = q15_chroma(coeffs.b_u(), u_d, coeffs.b_v(), v_d); let y0 = q15_scale((y[x] & mask) as i32 - y_off, y_scale); - rgb_out[x * 3] = (y0 + r_chroma).clamp(0, out_max) as u16; - rgb_out[x * 3 + 1] = (y0 + g_chroma).clamp(0, out_max) as u16; - rgb_out[x * 3 + 2] = (y0 + b_chroma).clamp(0, out_max) as u16; + out[x * bpp] = (y0 + r_chroma).clamp(0, out_max) as u16; + out[x * bpp + 1] = (y0 + g_chroma).clamp(0, out_max) as u16; + out[x * bpp + 2] = (y0 + b_chroma).clamp(0, out_max) as u16; + if ALPHA { + out[x * bpp + 3] = alpha_max; + } let y1 = q15_scale((y[x + 1] & mask) as i32 - y_off, y_scale); - rgb_out[(x + 1) * 3] = (y1 + r_chroma).clamp(0, out_max) as u16; - rgb_out[(x + 1) * 3 + 1] = (y1 + g_chroma).clamp(0, out_max) as u16; - rgb_out[(x + 1) * 3 + 2] = (y1 + b_chroma).clamp(0, out_max) as u16; + out[(x + 1) * bpp] = (y1 + r_chroma).clamp(0, out_max) as u16; + out[(x + 1) * bpp + 1] = (y1 + g_chroma).clamp(0, out_max) as u16; + out[(x + 1) * bpp + 2] = (y1 + b_chroma).clamp(0, out_max) as u16; + if ALPHA { + out[(x + 1) * bpp + 3] = alpha_max; + } x += 2; } @@ -1163,6 +1326,8 @@ pub(crate) fn p16_to_rgb_u16_row( /// kernel — has its active low bits discarded (producing near‑black /// output), matching every SIMD backend. /// +/// Thin wrapper over [`p_n_to_rgb_or_rgba_row`] with `ALPHA = false`. +/// /// # Panics (debug builds) /// /// - `width` must be even. @@ -1176,6 +1341,45 @@ pub(crate) fn p_n_to_rgb_row( width: usize, matrix: ColorMatrix, full_range: bool, +) { + p_n_to_rgb_or_rgba_row::(y, uv_half, rgb_out, width, matrix, full_range); +} + +/// Converts one row of high‑bit‑packed semi‑planar 4:2:0 (P010/P012) +/// to **8‑bit** packed **RGBA**. Same numerical contract as +/// [`p_n_to_rgb_row`]; the only differences are the per-pixel stride +/// (4 vs 3) and the alpha byte (`0xFF`, opaque). +/// +/// Thin wrapper over [`p_n_to_rgb_or_rgba_row`] with `ALPHA = true`. +#[cfg_attr(not(tarpaulin), inline(always))] +pub(crate) fn p_n_to_rgba_row( + y: &[u16], + uv_half: &[u16], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + p_n_to_rgb_or_rgba_row::(y, uv_half, rgba_out, width, matrix, full_range); +} + +/// Shared kernel for [`p_n_to_rgb_row`] (`ALPHA = false`, 3 bpp store) +/// and [`p_n_to_rgba_row`] (`ALPHA = true`, 4 bpp store with constant +/// `0xFF` alpha). +/// +/// # Panics (debug builds) +/// +/// - `width` must be even. +/// - `y.len() >= width`, `uv_half.len() >= width`, +/// `out.len() >= width * if ALPHA { 4 } else { 3 }`. +#[cfg_attr(not(tarpaulin), inline(always))] +pub(crate) fn p_n_to_rgb_or_rgba_row( + y: &[u16], + uv_half: &[u16], + out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, ) { // High-bit-packed Pn kernels are only defined for BITS in {10, 12}. // Outside that set, `16 - BITS` could under/overflow and the Q15 @@ -1183,12 +1387,13 @@ pub(crate) fn p_n_to_rgb_row( // the SIMD dispatcher hands control to unsafe code. debug_assert!( BITS == 10 || BITS == 12, - "p_n_to_rgb_row only supports BITS in {{10, 12}}" + "p_n_to_rgb_or_rgba_row only supports BITS in {{10, 12}}" ); + let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert_eq!(width & 1, 0, "semi-planar high-bit requires even width"); debug_assert!(y.len() >= width, "y row too short"); debug_assert!(uv_half.len() >= width, "uv row too short"); - debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); + debug_assert!(out.len() >= width * bpp, "out row too short"); let coeffs = Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = range_params_n::(full_range); @@ -1215,14 +1420,20 @@ pub(crate) fn p_n_to_rgb_row( let b_chroma = q15_chroma(coeffs.b_u(), u_d, coeffs.b_v(), v_d); let y0 = q15_scale((y[x] >> shift) as i32 - y_off, y_scale); - rgb_out[x * 3] = clamp_u8(y0 + r_chroma); - rgb_out[x * 3 + 1] = clamp_u8(y0 + g_chroma); - rgb_out[x * 3 + 2] = clamp_u8(y0 + b_chroma); + out[x * bpp] = clamp_u8(y0 + r_chroma); + out[x * bpp + 1] = clamp_u8(y0 + g_chroma); + out[x * bpp + 2] = clamp_u8(y0 + b_chroma); + if ALPHA { + out[x * bpp + 3] = 0xFF; + } let y1 = q15_scale((y[x + 1] >> shift) as i32 - y_off, y_scale); - rgb_out[(x + 1) * 3] = clamp_u8(y1 + r_chroma); - rgb_out[(x + 1) * 3 + 1] = clamp_u8(y1 + g_chroma); - rgb_out[(x + 1) * 3 + 2] = clamp_u8(y1 + b_chroma); + out[(x + 1) * bpp] = clamp_u8(y1 + r_chroma); + out[(x + 1) * bpp + 1] = clamp_u8(y1 + g_chroma); + out[(x + 1) * bpp + 2] = clamp_u8(y1 + b_chroma); + if ALPHA { + out[(x + 1) * bpp + 3] = 0xFF; + } x += 2; } @@ -1241,6 +1452,8 @@ pub(crate) fn p_n_to_rgb_row( /// extract the `BITS`-bit value from the high-bit packing) and the /// interleaved UV layout. /// +/// Thin wrapper over [`p_n_to_rgb_or_rgba_u16_row`] with `ALPHA = false`. +/// /// # Panics (debug builds) /// /// - `width` must be even. @@ -1255,22 +1468,63 @@ pub(crate) fn p_n_to_rgb_u16_row( matrix: ColorMatrix, full_range: bool, ) { - // See `p_n_to_rgb_row` for the BITS range rationale. Duplicated + p_n_to_rgb_or_rgba_u16_row::(y, uv_half, rgb_out, width, matrix, full_range); +} + +/// Converts one row of high‑bit‑packed semi‑planar 4:2:0 (P010/P012) +/// to **native‑depth `u16`** packed **RGBA** — output is low‑bit‑packed +/// to match [`p_n_to_rgb_u16_row`]. Alpha is `(1 << BITS) - 1` (opaque +/// maximum at the input bit depth). +/// +/// Thin wrapper over [`p_n_to_rgb_or_rgba_u16_row`] with `ALPHA = true`. +#[cfg_attr(not(tarpaulin), inline(always))] +pub(crate) fn p_n_to_rgba_u16_row( + y: &[u16], + uv_half: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + p_n_to_rgb_or_rgba_u16_row::(y, uv_half, rgba_out, width, matrix, full_range); +} + +/// Shared kernel for [`p_n_to_rgb_u16_row`] (`ALPHA = false`, 3 bpp +/// store) and [`p_n_to_rgba_u16_row`] (`ALPHA = true`, 4 bpp store +/// with opaque alpha = `(1 << BITS) - 1`). +/// +/// # Panics (debug builds) +/// +/// - `width` must be even. +/// - `y.len() >= width`, `uv_half.len() >= width`, +/// `out.len() >= width * if ALPHA { 4 } else { 3 }` (`u16` elements). +#[cfg_attr(not(tarpaulin), inline(always))] +pub(crate) fn p_n_to_rgb_or_rgba_u16_row( + y: &[u16], + uv_half: &[u16], + out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + // See `p_n_to_rgb_or_rgba_row` for the BITS range rationale. Duplicated // here so either entry point catches misuse on its own. debug_assert!( BITS == 10 || BITS == 12, - "p_n_to_rgb_u16_row only supports BITS in {{10, 12}}" + "p_n_to_rgb_or_rgba_u16_row only supports BITS in {{10, 12}}" ); + let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert_eq!(width & 1, 0, "semi-planar high-bit requires even width"); debug_assert!(y.len() >= width, "y row too short"); debug_assert!(uv_half.len() >= width, "uv row too short"); - debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); + debug_assert!(out.len() >= width * bpp, "out row too short"); let coeffs = Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = range_params_n::(full_range); let bias = chroma_bias::(); let out_max: i32 = (1i32 << BITS) - 1; let shift = 16 - BITS; + let alpha_max: u16 = out_max as u16; let mut x = 0; while x < width { @@ -1285,14 +1539,20 @@ pub(crate) fn p_n_to_rgb_u16_row( let b_chroma = q15_chroma(coeffs.b_u(), u_d, coeffs.b_v(), v_d); let y0 = q15_scale((y[x] >> shift) as i32 - y_off, y_scale); - rgb_out[x * 3] = (y0 + r_chroma).clamp(0, out_max) as u16; - rgb_out[x * 3 + 1] = (y0 + g_chroma).clamp(0, out_max) as u16; - rgb_out[x * 3 + 2] = (y0 + b_chroma).clamp(0, out_max) as u16; + out[x * bpp] = (y0 + r_chroma).clamp(0, out_max) as u16; + out[x * bpp + 1] = (y0 + g_chroma).clamp(0, out_max) as u16; + out[x * bpp + 2] = (y0 + b_chroma).clamp(0, out_max) as u16; + if ALPHA { + out[x * bpp + 3] = alpha_max; + } let y1 = q15_scale((y[x + 1] >> shift) as i32 - y_off, y_scale); - rgb_out[(x + 1) * 3] = (y1 + r_chroma).clamp(0, out_max) as u16; - rgb_out[(x + 1) * 3 + 1] = (y1 + g_chroma).clamp(0, out_max) as u16; - rgb_out[(x + 1) * 3 + 2] = (y1 + b_chroma).clamp(0, out_max) as u16; + out[(x + 1) * bpp] = (y1 + r_chroma).clamp(0, out_max) as u16; + out[(x + 1) * bpp + 1] = (y1 + g_chroma).clamp(0, out_max) as u16; + out[(x + 1) * bpp + 2] = (y1 + b_chroma).clamp(0, out_max) as u16; + if ALPHA { + out[(x + 1) * bpp + 3] = alpha_max; + } x += 2; } From 80fbcda987d12f2f40786af62f4b4ed05d945e5a Mon Sep 17 00:00:00 2001 From: uqio <276879906+uqio@users.noreply.github.com> Date: Sun, 26 Apr 2026 22:04:38 +1200 Subject: [PATCH 2/5] update --- src/row/scalar.rs | 281 +++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 253 insertions(+), 28 deletions(-) diff --git a/src/row/scalar.rs b/src/row/scalar.rs index c25a9e7..3bdf51c 100644 --- a/src/row/scalar.rs +++ b/src/row/scalar.rs @@ -550,7 +550,13 @@ pub(crate) fn expand_rgb_to_rgba_row(rgb: &[u8], rgba_out: &mut [u8], width: usi /// /// - `rgb.len() >= 3 * width` (`u16` elements) /// - `rgba_out.len() >= 4 * width` (`u16` elements) +// +// Scalar prep for Ship 8 Tranche 5: the consumer (MixedSinker Strategy A +// on the u16 path) lands in the follow-up Tranche 5b PR. `dead_code` +// allow lets this prep PR ship the foundation without the eventual call +// site. #[cfg(any(feature = "std", feature = "alloc"))] +#[allow(dead_code)] #[cfg_attr(not(tarpaulin), inline(always))] pub(crate) fn expand_rgb_u16_to_rgba_u16_row( rgb: &[u16], @@ -621,6 +627,12 @@ pub(crate) fn yuv_420p_n_to_rgb_row( /// /// - `width` must be even. /// - `rgba_out.len() >= 4 * width` (other slices: same as RGB variant). +// +// Scalar prep for Ship 8 Tranche 5a: the public dispatcher +// `row::yuv420p10_to_rgba_row` (and its u16 sibling) lands in the +// follow-up SIMD/dispatcher PR. Until then this thin wrapper has no +// caller. +#[allow(dead_code)] #[cfg_attr(not(tarpaulin), inline(always))] pub(crate) fn yuv_420p_n_to_rgba_row( y: &[u16], @@ -785,6 +797,11 @@ pub(crate) fn yuv_420p_n_to_rgb_u16_row( /// /// - `width` must be even. /// - `rgba_out.len() >= 4 * width` (other slices: same as RGB variant). +// +// Scalar prep for Ship 8 Tranche 5b: the public dispatcher +// `row::yuv420p10_to_rgba_u16_row` lands in the follow-up SIMD/dispatcher +// PR. Until then this thin wrapper has no caller. +#[allow(dead_code)] #[cfg_attr(not(tarpaulin), inline(always))] pub(crate) fn yuv_420p_n_to_rgba_u16_row( y: &[u16], @@ -1023,6 +1040,8 @@ fn q15_scale64(sample: i32, scale_q15: i32) -> i32 { /// `BITS = 16`, just without the AND-mask (no upper-bit-zero /// guarantee to enforce at 16 bits). /// +/// Thin wrapper over [`yuv_420p16_to_rgb_or_rgba_row`] with `ALPHA = false`. +/// /// # Panics (debug builds) /// /// - `width` must be even. @@ -1038,11 +1057,56 @@ pub(crate) fn yuv_420p16_to_rgb_row( matrix: ColorMatrix, full_range: bool, ) { + yuv_420p16_to_rgb_or_rgba_row::(y, u_half, v_half, rgb_out, width, matrix, full_range); +} + +/// Converts one row of **16-bit** YUV 4:2:0 to **8-bit** packed +/// **RGBA**. Same numerical contract as [`yuv_420p16_to_rgb_row`]; +/// the only differences are the per-pixel stride (4 vs 3) and the +/// alpha byte (`0xFF`, opaque). +/// +/// Thin wrapper over [`yuv_420p16_to_rgb_or_rgba_row`] with `ALPHA = true`. +// +// Scalar prep for Ship 8 Tranche 5a: the public dispatcher +// `row::yuv420p16_to_rgba_row` lands in the follow-up SIMD/dispatcher +// PR. Until then this thin wrapper has no caller. +#[allow(dead_code)] +#[cfg_attr(not(tarpaulin), inline(always))] +pub(crate) fn yuv_420p16_to_rgba_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + yuv_420p16_to_rgb_or_rgba_row::(y, u_half, v_half, rgba_out, width, matrix, full_range); +} + +/// Shared 16-bit YUV 4:2:0 → 8-bit RGB / RGBA kernel. `ALPHA = false` +/// emits 3 bpp; `ALPHA = true` emits 4 bpp with constant `0xFF` alpha. +/// +/// 16-bit input has no AND-mask (every `u16` is a valid sample) and +/// uses i32 chroma — output-target scaling keeps `u_d * coeff` inside +/// i32 for u8 output (the i64 chroma family lives in +/// [`yuv_420p16_to_rgb_or_rgba_u16_row`]). +#[cfg_attr(not(tarpaulin), inline(always))] +pub(crate) fn yuv_420p16_to_rgb_or_rgba_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); debug_assert!(y.len() >= width, "y row too short"); debug_assert!(u_half.len() >= width / 2, "u_half row too short"); debug_assert!(v_half.len() >= width / 2, "v_half row too short"); - debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); + debug_assert!(out.len() >= width * bpp, "out row too short"); let coeffs = Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = range_params_n::<16, 8>(full_range); @@ -1062,14 +1126,20 @@ pub(crate) fn yuv_420p16_to_rgb_row( let b_chroma = q15_chroma(coeffs.b_u(), u_d, coeffs.b_v(), v_d); let y0 = q15_scale(y[x] as i32 - y_off, y_scale); - rgb_out[x * 3] = clamp_u8(y0 + r_chroma); - rgb_out[x * 3 + 1] = clamp_u8(y0 + g_chroma); - rgb_out[x * 3 + 2] = clamp_u8(y0 + b_chroma); + out[x * bpp] = clamp_u8(y0 + r_chroma); + out[x * bpp + 1] = clamp_u8(y0 + g_chroma); + out[x * bpp + 2] = clamp_u8(y0 + b_chroma); + if ALPHA { + out[x * bpp + 3] = 0xFF; + } let y1 = q15_scale(y[x + 1] as i32 - y_off, y_scale); - rgb_out[(x + 1) * 3] = clamp_u8(y1 + r_chroma); - rgb_out[(x + 1) * 3 + 1] = clamp_u8(y1 + g_chroma); - rgb_out[(x + 1) * 3 + 2] = clamp_u8(y1 + b_chroma); + out[(x + 1) * bpp] = clamp_u8(y1 + r_chroma); + out[(x + 1) * bpp + 1] = clamp_u8(y1 + g_chroma); + out[(x + 1) * bpp + 2] = clamp_u8(y1 + b_chroma); + if ALPHA { + out[(x + 1) * bpp + 3] = 0xFF; + } x += 2; } @@ -1080,6 +1150,8 @@ pub(crate) fn yuv_420p16_to_rgb_row( /// chroma matrix multiply in i64** to accommodate the wider /// `coeff × u_d` product at 16 → 16-bit scaling. /// +/// Thin wrapper over [`yuv_420p16_to_rgb_or_rgba_u16_row`] with `ALPHA = false`. +/// /// # Panics (debug builds) /// /// Same contract as [`yuv_420p16_to_rgb_row`] plus `rgb_out` is @@ -1094,11 +1166,54 @@ pub(crate) fn yuv_420p16_to_rgb_u16_row( matrix: ColorMatrix, full_range: bool, ) { + yuv_420p16_to_rgb_or_rgba_u16_row::(y, u_half, v_half, rgb_out, width, matrix, full_range); +} + +/// Converts one row of **16-bit** YUV 4:2:0 to **native-depth `u16`** +/// packed **RGBA** — alpha element is `0xFFFF` (opaque maximum at +/// 16-bit). +/// +/// Thin wrapper over [`yuv_420p16_to_rgb_or_rgba_u16_row`] with `ALPHA = true`. +// +// Scalar prep for Ship 8 Tranche 5b: the public dispatcher +// `row::yuv420p16_to_rgba_u16_row` lands in the follow-up SIMD/dispatcher +// PR. Until then this thin wrapper has no caller. +#[allow(dead_code)] +#[cfg_attr(not(tarpaulin), inline(always))] +pub(crate) fn yuv_420p16_to_rgba_u16_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + yuv_420p16_to_rgb_or_rgba_u16_row::(y, u_half, v_half, rgba_out, width, matrix, full_range); +} + +/// Shared 16-bit YUV 4:2:0 → native-depth `u16` RGB / RGBA kernel. +/// `ALPHA = false` emits 3 bpp; `ALPHA = true` emits 4 bpp with +/// constant `0xFFFF` alpha. +/// +/// Uses i64 chroma multiply (same rationale as +/// [`yuv_420p16_to_rgb_u16_row`]). +#[cfg_attr(not(tarpaulin), inline(always))] +pub(crate) fn yuv_420p16_to_rgb_or_rgba_u16_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); debug_assert!(y.len() >= width, "y row too short"); debug_assert!(u_half.len() >= width / 2, "u_half row too short"); debug_assert!(v_half.len() >= width / 2, "v_half row too short"); - debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); + debug_assert!(out.len() >= width * bpp, "out row too short"); let coeffs = Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = range_params_n::<16, 16>(full_range); @@ -1116,14 +1231,20 @@ pub(crate) fn yuv_420p16_to_rgb_u16_row( let b_chroma = q15_chroma64(coeffs.b_u(), u_d, coeffs.b_v(), v_d); let y0 = q15_scale64(y[x] as i32 - y_off, y_scale); - rgb_out[x * 3] = (y0 + r_chroma).clamp(0, out_max) as u16; - rgb_out[x * 3 + 1] = (y0 + g_chroma).clamp(0, out_max) as u16; - rgb_out[x * 3 + 2] = (y0 + b_chroma).clamp(0, out_max) as u16; + out[x * bpp] = (y0 + r_chroma).clamp(0, out_max) as u16; + out[x * bpp + 1] = (y0 + g_chroma).clamp(0, out_max) as u16; + out[x * bpp + 2] = (y0 + b_chroma).clamp(0, out_max) as u16; + if ALPHA { + out[x * bpp + 3] = 0xFFFF; + } let y1 = q15_scale64(y[x + 1] as i32 - y_off, y_scale); - rgb_out[(x + 1) * 3] = (y1 + r_chroma).clamp(0, out_max) as u16; - rgb_out[(x + 1) * 3 + 1] = (y1 + g_chroma).clamp(0, out_max) as u16; - rgb_out[(x + 1) * 3 + 2] = (y1 + b_chroma).clamp(0, out_max) as u16; + out[(x + 1) * bpp] = (y1 + r_chroma).clamp(0, out_max) as u16; + out[(x + 1) * bpp + 1] = (y1 + g_chroma).clamp(0, out_max) as u16; + out[(x + 1) * bpp + 2] = (y1 + b_chroma).clamp(0, out_max) as u16; + if ALPHA { + out[(x + 1) * bpp + 3] = 0xFFFF; + } x += 2; } @@ -1219,6 +1340,8 @@ pub(crate) fn yuv_444p16_to_rgb_u16_row( /// - `width` must be even. /// - `y.len() >= width`, `uv_half.len() >= width`, /// `rgb_out.len() >= 3 * width`. +/// +/// Thin wrapper over [`p16_to_rgb_or_rgba_row`] with `ALPHA = false`. #[cfg_attr(not(tarpaulin), inline(always))] pub(crate) fn p16_to_rgb_row( y: &[u16], @@ -1228,10 +1351,47 @@ pub(crate) fn p16_to_rgb_row( matrix: ColorMatrix, full_range: bool, ) { + p16_to_rgb_or_rgba_row::(y, uv_half, rgb_out, width, matrix, full_range); +} + +/// Converts one row of **P016** to **8-bit** packed **RGBA**. Same +/// numerical contract as [`p16_to_rgb_row`] except for the per-pixel +/// stride (4 vs 3) and the alpha byte (`0xFF`, opaque). +/// +/// Thin wrapper over [`p16_to_rgb_or_rgba_row`] with `ALPHA = true`. +// +// Scalar prep for Ship 8 Tranche 5a: the public dispatcher +// `row::p016_to_rgba_row` lands in the follow-up SIMD/dispatcher PR. +// Until then this thin wrapper has no caller. +#[allow(dead_code)] +#[cfg_attr(not(tarpaulin), inline(always))] +pub(crate) fn p16_to_rgba_row( + y: &[u16], + uv_half: &[u16], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + p16_to_rgb_or_rgba_row::(y, uv_half, rgba_out, width, matrix, full_range); +} + +/// Shared P016 → 8-bit RGB / RGBA kernel. `ALPHA = false` emits 3 bpp; +/// `ALPHA = true` emits 4 bpp with constant `0xFF` alpha. +#[cfg_attr(not(tarpaulin), inline(always))] +pub(crate) fn p16_to_rgb_or_rgba_row( + y: &[u16], + uv_half: &[u16], + out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert_eq!(width & 1, 0, "semi-planar 4:2:0 requires even width"); debug_assert!(y.len() >= width, "y row too short"); debug_assert!(uv_half.len() >= width, "uv row too short"); - debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); + debug_assert!(out.len() >= width * bpp, "out row too short"); let coeffs = Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = range_params_n::<16, 8>(full_range); @@ -1250,14 +1410,20 @@ pub(crate) fn p16_to_rgb_row( let b_chroma = q15_chroma(coeffs.b_u(), u_d, coeffs.b_v(), v_d); let y0 = q15_scale(y[x] as i32 - y_off, y_scale); - rgb_out[x * 3] = clamp_u8(y0 + r_chroma); - rgb_out[x * 3 + 1] = clamp_u8(y0 + g_chroma); - rgb_out[x * 3 + 2] = clamp_u8(y0 + b_chroma); + out[x * bpp] = clamp_u8(y0 + r_chroma); + out[x * bpp + 1] = clamp_u8(y0 + g_chroma); + out[x * bpp + 2] = clamp_u8(y0 + b_chroma); + if ALPHA { + out[x * bpp + 3] = 0xFF; + } let y1 = q15_scale(y[x + 1] as i32 - y_off, y_scale); - rgb_out[(x + 1) * 3] = clamp_u8(y1 + r_chroma); - rgb_out[(x + 1) * 3 + 1] = clamp_u8(y1 + g_chroma); - rgb_out[(x + 1) * 3 + 2] = clamp_u8(y1 + b_chroma); + out[(x + 1) * bpp] = clamp_u8(y1 + r_chroma); + out[(x + 1) * bpp + 1] = clamp_u8(y1 + g_chroma); + out[(x + 1) * bpp + 2] = clamp_u8(y1 + b_chroma); + if ALPHA { + out[(x + 1) * bpp + 3] = 0xFF; + } x += 2; } @@ -1266,6 +1432,8 @@ pub(crate) fn p16_to_rgb_row( /// Converts one row of **P016** to **native-depth `u16`** packed /// RGB — full-range output in `[0, 65535]`. Chroma matrix multiply /// runs in i64 (same reasoning as [`yuv_420p16_to_rgb_u16_row`]). +/// +/// Thin wrapper over [`p16_to_rgb_or_rgba_u16_row`] with `ALPHA = false`. #[cfg_attr(not(tarpaulin), inline(always))] pub(crate) fn p16_to_rgb_u16_row( y: &[u16], @@ -1275,10 +1443,49 @@ pub(crate) fn p16_to_rgb_u16_row( matrix: ColorMatrix, full_range: bool, ) { + p16_to_rgb_or_rgba_u16_row::(y, uv_half, rgb_out, width, matrix, full_range); +} + +/// Converts one row of **P016** to **native-depth `u16`** packed +/// **RGBA** — alpha element is `0xFFFF` (opaque maximum at 16-bit). +/// +/// Thin wrapper over [`p16_to_rgb_or_rgba_u16_row`] with `ALPHA = true`. +// +// Scalar prep for Ship 8 Tranche 5b: the public dispatcher +// `row::p016_to_rgba_u16_row` lands in the follow-up SIMD/dispatcher +// PR. Until then this thin wrapper has no caller. +#[allow(dead_code)] +#[cfg_attr(not(tarpaulin), inline(always))] +pub(crate) fn p16_to_rgba_u16_row( + y: &[u16], + uv_half: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + p16_to_rgb_or_rgba_u16_row::(y, uv_half, rgba_out, width, matrix, full_range); +} + +/// Shared P016 → native-depth `u16` RGB / RGBA kernel. `ALPHA = false` +/// emits 3 bpp; `ALPHA = true` emits 4 bpp with constant `0xFFFF` +/// alpha. +/// +/// Uses i64 chroma multiply (same rationale as [`yuv_420p16_to_rgb_or_rgba_u16_row`]). +#[cfg_attr(not(tarpaulin), inline(always))] +pub(crate) fn p16_to_rgb_or_rgba_u16_row( + y: &[u16], + uv_half: &[u16], + out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert_eq!(width & 1, 0, "semi-planar 4:2:0 requires even width"); debug_assert!(y.len() >= width, "y row too short"); debug_assert!(uv_half.len() >= width, "uv row too short"); - debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); + debug_assert!(out.len() >= width * bpp, "out row too short"); let coeffs = Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = range_params_n::<16, 16>(full_range); @@ -1298,14 +1505,20 @@ pub(crate) fn p16_to_rgb_u16_row( let b_chroma = q15_chroma64(coeffs.b_u(), u_d, coeffs.b_v(), v_d); let y0 = q15_scale64(y[x] as i32 - y_off, y_scale); - rgb_out[x * 3] = (y0 + r_chroma).clamp(0, out_max) as u16; - rgb_out[x * 3 + 1] = (y0 + g_chroma).clamp(0, out_max) as u16; - rgb_out[x * 3 + 2] = (y0 + b_chroma).clamp(0, out_max) as u16; + out[x * bpp] = (y0 + r_chroma).clamp(0, out_max) as u16; + out[x * bpp + 1] = (y0 + g_chroma).clamp(0, out_max) as u16; + out[x * bpp + 2] = (y0 + b_chroma).clamp(0, out_max) as u16; + if ALPHA { + out[x * bpp + 3] = 0xFFFF; + } let y1 = q15_scale64(y[x + 1] as i32 - y_off, y_scale); - rgb_out[(x + 1) * 3] = (y1 + r_chroma).clamp(0, out_max) as u16; - rgb_out[(x + 1) * 3 + 1] = (y1 + g_chroma).clamp(0, out_max) as u16; - rgb_out[(x + 1) * 3 + 2] = (y1 + b_chroma).clamp(0, out_max) as u16; + out[(x + 1) * bpp] = (y1 + r_chroma).clamp(0, out_max) as u16; + out[(x + 1) * bpp + 1] = (y1 + g_chroma).clamp(0, out_max) as u16; + out[(x + 1) * bpp + 2] = (y1 + b_chroma).clamp(0, out_max) as u16; + if ALPHA { + out[(x + 1) * bpp + 3] = 0xFFFF; + } x += 2; } @@ -1351,6 +1564,12 @@ pub(crate) fn p_n_to_rgb_row( /// (4 vs 3) and the alpha byte (`0xFF`, opaque). /// /// Thin wrapper over [`p_n_to_rgb_or_rgba_row`] with `ALPHA = true`. +// +// Scalar prep for Ship 8 Tranche 5a: the public dispatcher +// `row::p010_to_rgba_row` (and P012/P016 siblings) lands in the +// follow-up SIMD/dispatcher PR. Until then this thin wrapper has no +// caller. +#[allow(dead_code)] #[cfg_attr(not(tarpaulin), inline(always))] pub(crate) fn p_n_to_rgba_row( y: &[u16], @@ -1477,6 +1696,12 @@ pub(crate) fn p_n_to_rgb_u16_row( /// maximum at the input bit depth). /// /// Thin wrapper over [`p_n_to_rgb_or_rgba_u16_row`] with `ALPHA = true`. +// +// Scalar prep for Ship 8 Tranche 5b: the public dispatcher +// `row::p010_to_rgba_u16_row` (and P012/P016 siblings) lands in the +// follow-up SIMD/dispatcher PR. Until then this thin wrapper has no +// caller. +#[allow(dead_code)] #[cfg_attr(not(tarpaulin), inline(always))] pub(crate) fn p_n_to_rgba_u16_row( y: &[u16], From 809a0f922f6252c655c56e8143bf7007d965006b Mon Sep 17 00:00:00 2001 From: uqio <276879906+uqio@users.noreply.github.com> Date: Sun, 26 Apr 2026 22:13:52 +1200 Subject: [PATCH 3/5] update --- src/row/scalar.rs | 41 ++++++++++++++++++++++------------------- 1 file changed, 22 insertions(+), 19 deletions(-) diff --git a/src/row/scalar.rs b/src/row/scalar.rs index 3bdf51c..8b5da89 100644 --- a/src/row/scalar.rs +++ b/src/row/scalar.rs @@ -1565,10 +1565,11 @@ pub(crate) fn p_n_to_rgb_row( /// /// Thin wrapper over [`p_n_to_rgb_or_rgba_row`] with `ALPHA = true`. // -// Scalar prep for Ship 8 Tranche 5a: the public dispatcher -// `row::p010_to_rgba_row` (and P012/P016 siblings) lands in the +// Scalar prep for Ship 8 Tranche 5a: the public dispatchers +// `row::p010_to_rgba_row` and `row::p012_to_rgba_row` land in the // follow-up SIMD/dispatcher PR. Until then this thin wrapper has no -// caller. +// caller. P016 has its own kernel family +// ([`p16_to_rgb_or_rgba_row`]) — never routed here. #[allow(dead_code)] #[cfg_attr(not(tarpaulin), inline(always))] pub(crate) fn p_n_to_rgba_row( @@ -1602,12 +1603,13 @@ pub(crate) fn p_n_to_rgb_or_rgba_row( ) { // High-bit-packed Pn kernels are only defined for BITS in {10, 12}. // Outside that set, `16 - BITS` could under/overflow and the Q15 - // coefficient table has no corresponding entry. Caught here before - // the SIMD dispatcher hands control to unsafe code. - debug_assert!( - BITS == 10 || BITS == 12, - "p_n_to_rgb_or_rgba_row only supports BITS in {{10, 12}}" - ); + // coefficient table has no corresponding entry. P016 (BITS=16) has + // its own dedicated kernel family with i64 chroma multiply — using + // this i32 path at BITS=16 would silently overflow on high chroma + // values. The compile-time assertion fails monomorphization for any + // BITS outside {10, 12}, eliminating that release-build corruption + // trap. + const { assert!(BITS == 10 || BITS == 12) }; let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert_eq!(width & 1, 0, "semi-planar high-bit requires even width"); debug_assert!(y.len() >= width, "y row too short"); @@ -1697,10 +1699,11 @@ pub(crate) fn p_n_to_rgb_u16_row( /// /// Thin wrapper over [`p_n_to_rgb_or_rgba_u16_row`] with `ALPHA = true`. // -// Scalar prep for Ship 8 Tranche 5b: the public dispatcher -// `row::p010_to_rgba_u16_row` (and P012/P016 siblings) lands in the -// follow-up SIMD/dispatcher PR. Until then this thin wrapper has no -// caller. +// Scalar prep for Ship 8 Tranche 5b: the public dispatchers +// `row::p010_to_rgba_u16_row` and `row::p012_to_rgba_u16_row` land in +// the follow-up SIMD/dispatcher PR. Until then this thin wrapper has +// no caller. P016 has its own u16 kernel family +// ([`p16_to_rgb_or_rgba_u16_row`]) — never routed here. #[allow(dead_code)] #[cfg_attr(not(tarpaulin), inline(always))] pub(crate) fn p_n_to_rgba_u16_row( @@ -1732,12 +1735,12 @@ pub(crate) fn p_n_to_rgb_or_rgba_u16_row( matrix: ColorMatrix, full_range: bool, ) { - // See `p_n_to_rgb_or_rgba_row` for the BITS range rationale. Duplicated - // here so either entry point catches misuse on its own. - debug_assert!( - BITS == 10 || BITS == 12, - "p_n_to_rgb_or_rgba_u16_row only supports BITS in {{10, 12}}" - ); + // See `p_n_to_rgb_or_rgba_row` for the BITS range rationale. The + // P016 u16 path lives in [`p16_to_rgb_or_rgba_u16_row`] (i64 chroma + // multiply); this i32 path would overflow before clamp at 16-bit + // chroma. Compile-time assertion eliminates the release-build + // corruption trap. + const { assert!(BITS == 10 || BITS == 12) }; let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert_eq!(width & 1, 0, "semi-planar high-bit requires even width"); debug_assert!(y.len() >= width, "y row too short"); From 5fc4ab53d004c22bfdbdd27e0b64ae136d83ebd1 Mon Sep 17 00:00:00 2001 From: uqio <276879906+uqio@users.noreply.github.com> Date: Sun, 26 Apr 2026 22:26:23 +1200 Subject: [PATCH 4/5] update --- src/row/mod.rs | 515 ++++++++++++++++++++++++++++++++++++++++++++++ src/row/scalar.rs | 8 - 2 files changed, 515 insertions(+), 8 deletions(-) diff --git a/src/row/mod.rs b/src/row/mod.rs index 5974753..15f8c10 100644 --- a/src/row/mod.rs +++ b/src/row/mod.rs @@ -2659,6 +2659,509 @@ pub fn p016_to_rgb_u16_row( scalar::p16_to_rgb_u16_row(y, uv_half, rgb_out, width, matrix, full_range); } +// ---- High-bit 4:2:0 RGBA dispatchers (Ship 8 Tranche 5 prep) ---------- +// +// Scalar prep: dispatchers route through the new RGBA scalar kernels +// (`scalar::*_to_rgba*_row`). The `use_simd` parameter is held in the +// signature so the follow-up SIMD/backend PRs (Ship 8 Tranche 5a/5b) +// can fill in per-arch branches without breaking callers. + +/// Converts one row of **9-bit** YUV 4:2:0 to packed **8-bit** +/// **RGBA** (`R, G, B, 0xFF`; alpha defaults to opaque since the +/// source has no alpha plane). +/// +/// Same numerical contract as [`yuv420p9_to_rgb_row`] except +/// for the per-pixel stride (4 vs 3) and the constant alpha byte. See +/// `scalar::yuv_420p_n_to_rgba_row` for the reference. +/// +/// `use_simd = false` forces scalar. SIMD per-arch routes land in the +/// follow-up Ship 8 Tranche 5a PR — for now this dispatcher always +/// runs the scalar reference regardless of `use_simd`. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv420p9_to_rgba_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); + let rgba_min = rgba_row_bytes(width); + assert!(y.len() >= width, "y row too short"); + assert!(u_half.len() >= width / 2, "u_half row too short"); + assert!(v_half.len() >= width / 2, "v_half row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + let _ = use_simd; // SIMD per-arch routes land in Ship 8 Tranche 5a. + scalar::yuv_420p_n_to_rgba_row::<9>(y, u_half, v_half, rgba_out, width, matrix, full_range); +} + +/// Converts one row of **9-bit** YUV 4:2:0 to **native-depth `u16`** +/// packed **RGBA** — output is low-bit-packed (`[0, (1 << 9) - 1]` +/// in the low bits of each `u16`); alpha element is `(1 << 9) - 1` +/// (opaque maximum at the input bit depth). +/// +/// See `scalar::yuv_420p_n_to_rgba_u16_row` for the reference. SIMD +/// per-arch routes land in the follow-up Ship 8 Tranche 5b PR — for +/// now this dispatcher always runs the scalar reference regardless of +/// `use_simd`. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv420p9_to_rgba_u16_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); + let rgba_min = rgba_row_elems(width); + assert!(y.len() >= width, "y row too short"); + assert!(u_half.len() >= width / 2, "u_half row too short"); + assert!(v_half.len() >= width / 2, "v_half row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + let _ = use_simd; // SIMD per-arch routes land in Ship 8 Tranche 5b. + scalar::yuv_420p_n_to_rgba_u16_row::<9>(y, u_half, v_half, rgba_out, width, matrix, full_range); +} + +/// Converts one row of **10-bit** YUV 4:2:0 to packed **8-bit** +/// **RGBA** (`R, G, B, 0xFF`; alpha defaults to opaque since the +/// source has no alpha plane). +/// +/// Same numerical contract as [`yuv420p10_to_rgb_row`] except +/// for the per-pixel stride (4 vs 3) and the constant alpha byte. See +/// `scalar::yuv_420p_n_to_rgba_row` for the reference. +/// +/// `use_simd = false` forces scalar. SIMD per-arch routes land in the +/// follow-up Ship 8 Tranche 5a PR — for now this dispatcher always +/// runs the scalar reference regardless of `use_simd`. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv420p10_to_rgba_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); + let rgba_min = rgba_row_bytes(width); + assert!(y.len() >= width, "y row too short"); + assert!(u_half.len() >= width / 2, "u_half row too short"); + assert!(v_half.len() >= width / 2, "v_half row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + let _ = use_simd; // SIMD per-arch routes land in Ship 8 Tranche 5a. + scalar::yuv_420p_n_to_rgba_row::<10>(y, u_half, v_half, rgba_out, width, matrix, full_range); +} + +/// Converts one row of **10-bit** YUV 4:2:0 to **native-depth `u16`** +/// packed **RGBA** — output is low-bit-packed (`[0, (1 << 10) - 1]` +/// in the low bits of each `u16`); alpha element is `(1 << 10) - 1` +/// (opaque maximum at the input bit depth). +/// +/// See `scalar::yuv_420p_n_to_rgba_u16_row` for the reference. SIMD +/// per-arch routes land in the follow-up Ship 8 Tranche 5b PR — for +/// now this dispatcher always runs the scalar reference regardless of +/// `use_simd`. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv420p10_to_rgba_u16_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); + let rgba_min = rgba_row_elems(width); + assert!(y.len() >= width, "y row too short"); + assert!(u_half.len() >= width / 2, "u_half row too short"); + assert!(v_half.len() >= width / 2, "v_half row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + let _ = use_simd; // SIMD per-arch routes land in Ship 8 Tranche 5b. + scalar::yuv_420p_n_to_rgba_u16_row::<10>(y, u_half, v_half, rgba_out, width, matrix, full_range); +} + +/// Converts one row of **P010** (semi-planar 4:2:0, 10-bit, +/// high-bit-packed) to packed **8-bit** **RGBA**. Alpha defaults to +/// `0xFF` (opaque). +/// +/// See `scalar::p_n_to_rgba_row::<10>` for the reference. SIMD +/// per-arch routes land in the follow-up Ship 8 Tranche 5a PR — for +/// now this dispatcher always runs the scalar reference regardless of +/// `use_simd`. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn p010_to_rgba_row( + y: &[u16], + uv_half: &[u16], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + assert_eq!(width & 1, 0, "semi-planar 4:2:0 requires even width"); + let rgba_min = rgba_row_bytes(width); + assert!(y.len() >= width, "y row too short"); + assert!(uv_half.len() >= width, "uv_half row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + let _ = use_simd; // SIMD per-arch routes land in Ship 8 Tranche 5a. + scalar::p_n_to_rgba_row::<10>(y, uv_half, rgba_out, width, matrix, full_range); +} + +/// Converts one row of **P010** (semi-planar 4:2:0, 10-bit, +/// high-bit-packed) to **native-depth `u16`** packed **RGBA** — output +/// is low-bit-packed; alpha element is `(1 << 10) - 1`. +/// +/// See `scalar::p_n_to_rgba_u16_row::<10>` for the reference. SIMD +/// per-arch routes land in the follow-up Ship 8 Tranche 5b PR — for +/// now this dispatcher always runs the scalar reference regardless of +/// `use_simd`. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn p010_to_rgba_u16_row( + y: &[u16], + uv_half: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + assert_eq!(width & 1, 0, "semi-planar 4:2:0 requires even width"); + let rgba_min = rgba_row_elems(width); + assert!(y.len() >= width, "y row too short"); + assert!(uv_half.len() >= width, "uv_half row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + let _ = use_simd; // SIMD per-arch routes land in Ship 8 Tranche 5b. + scalar::p_n_to_rgba_u16_row::<10>(y, uv_half, rgba_out, width, matrix, full_range); +} + +/// Converts one row of **12-bit** YUV 4:2:0 to packed **8-bit** +/// **RGBA** (`R, G, B, 0xFF`; alpha defaults to opaque since the +/// source has no alpha plane). +/// +/// Same numerical contract as [`yuv420p12_to_rgb_row`] except +/// for the per-pixel stride (4 vs 3) and the constant alpha byte. See +/// `scalar::yuv_420p_n_to_rgba_row` for the reference. +/// +/// `use_simd = false` forces scalar. SIMD per-arch routes land in the +/// follow-up Ship 8 Tranche 5a PR — for now this dispatcher always +/// runs the scalar reference regardless of `use_simd`. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv420p12_to_rgba_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); + let rgba_min = rgba_row_bytes(width); + assert!(y.len() >= width, "y row too short"); + assert!(u_half.len() >= width / 2, "u_half row too short"); + assert!(v_half.len() >= width / 2, "v_half row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + let _ = use_simd; // SIMD per-arch routes land in Ship 8 Tranche 5a. + scalar::yuv_420p_n_to_rgba_row::<12>(y, u_half, v_half, rgba_out, width, matrix, full_range); +} + +/// Converts one row of **12-bit** YUV 4:2:0 to **native-depth `u16`** +/// packed **RGBA** — output is low-bit-packed (`[0, (1 << 12) - 1]` +/// in the low bits of each `u16`); alpha element is `(1 << 12) - 1` +/// (opaque maximum at the input bit depth). +/// +/// See `scalar::yuv_420p_n_to_rgba_u16_row` for the reference. SIMD +/// per-arch routes land in the follow-up Ship 8 Tranche 5b PR — for +/// now this dispatcher always runs the scalar reference regardless of +/// `use_simd`. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv420p12_to_rgba_u16_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); + let rgba_min = rgba_row_elems(width); + assert!(y.len() >= width, "y row too short"); + assert!(u_half.len() >= width / 2, "u_half row too short"); + assert!(v_half.len() >= width / 2, "v_half row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + let _ = use_simd; // SIMD per-arch routes land in Ship 8 Tranche 5b. + scalar::yuv_420p_n_to_rgba_u16_row::<12>(y, u_half, v_half, rgba_out, width, matrix, full_range); +} + +/// Converts one row of **14-bit** YUV 4:2:0 to packed **8-bit** +/// **RGBA** (`R, G, B, 0xFF`; alpha defaults to opaque since the +/// source has no alpha plane). +/// +/// Same numerical contract as [`yuv420p14_to_rgb_row`] except +/// for the per-pixel stride (4 vs 3) and the constant alpha byte. See +/// `scalar::yuv_420p_n_to_rgba_row` for the reference. +/// +/// `use_simd = false` forces scalar. SIMD per-arch routes land in the +/// follow-up Ship 8 Tranche 5a PR — for now this dispatcher always +/// runs the scalar reference regardless of `use_simd`. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv420p14_to_rgba_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); + let rgba_min = rgba_row_bytes(width); + assert!(y.len() >= width, "y row too short"); + assert!(u_half.len() >= width / 2, "u_half row too short"); + assert!(v_half.len() >= width / 2, "v_half row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + let _ = use_simd; // SIMD per-arch routes land in Ship 8 Tranche 5a. + scalar::yuv_420p_n_to_rgba_row::<14>(y, u_half, v_half, rgba_out, width, matrix, full_range); +} + +/// Converts one row of **14-bit** YUV 4:2:0 to **native-depth `u16`** +/// packed **RGBA** — output is low-bit-packed (`[0, (1 << 14) - 1]` +/// in the low bits of each `u16`); alpha element is `(1 << 14) - 1` +/// (opaque maximum at the input bit depth). +/// +/// See `scalar::yuv_420p_n_to_rgba_u16_row` for the reference. SIMD +/// per-arch routes land in the follow-up Ship 8 Tranche 5b PR — for +/// now this dispatcher always runs the scalar reference regardless of +/// `use_simd`. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv420p14_to_rgba_u16_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); + let rgba_min = rgba_row_elems(width); + assert!(y.len() >= width, "y row too short"); + assert!(u_half.len() >= width / 2, "u_half row too short"); + assert!(v_half.len() >= width / 2, "v_half row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + let _ = use_simd; // SIMD per-arch routes land in Ship 8 Tranche 5b. + scalar::yuv_420p_n_to_rgba_u16_row::<14>(y, u_half, v_half, rgba_out, width, matrix, full_range); +} + +/// Converts one row of **P012** (semi-planar 4:2:0, 12-bit, +/// high-bit-packed) to packed **8-bit** **RGBA**. Alpha defaults to +/// `0xFF` (opaque). +/// +/// See `scalar::p_n_to_rgba_row::<12>` for the reference. SIMD +/// per-arch routes land in the follow-up Ship 8 Tranche 5a PR — for +/// now this dispatcher always runs the scalar reference regardless of +/// `use_simd`. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn p012_to_rgba_row( + y: &[u16], + uv_half: &[u16], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + assert_eq!(width & 1, 0, "semi-planar 4:2:0 requires even width"); + let rgba_min = rgba_row_bytes(width); + assert!(y.len() >= width, "y row too short"); + assert!(uv_half.len() >= width, "uv_half row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + let _ = use_simd; // SIMD per-arch routes land in Ship 8 Tranche 5a. + scalar::p_n_to_rgba_row::<12>(y, uv_half, rgba_out, width, matrix, full_range); +} + +/// Converts one row of **P012** (semi-planar 4:2:0, 12-bit, +/// high-bit-packed) to **native-depth `u16`** packed **RGBA** — output +/// is low-bit-packed; alpha element is `(1 << 12) - 1`. +/// +/// See `scalar::p_n_to_rgba_u16_row::<12>` for the reference. SIMD +/// per-arch routes land in the follow-up Ship 8 Tranche 5b PR — for +/// now this dispatcher always runs the scalar reference regardless of +/// `use_simd`. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn p012_to_rgba_u16_row( + y: &[u16], + uv_half: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + assert_eq!(width & 1, 0, "semi-planar 4:2:0 requires even width"); + let rgba_min = rgba_row_elems(width); + assert!(y.len() >= width, "y row too short"); + assert!(uv_half.len() >= width, "uv_half row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + let _ = use_simd; // SIMD per-arch routes land in Ship 8 Tranche 5b. + scalar::p_n_to_rgba_u16_row::<12>(y, uv_half, rgba_out, width, matrix, full_range); +} + +/// Converts one row of **16-bit** YUV 4:2:0 to packed **8-bit** +/// **RGBA** (`R, G, B, 0xFF`). +/// +/// Routes through the dedicated 16-bit scalar kernel +/// (`scalar::yuv_420p16_to_rgba_row`) — i32 chroma family is sufficient +/// for u8 output even at 16-bit input. SIMD per-arch routes land in +/// the follow-up Ship 8 Tranche 5a PR. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv420p16_to_rgba_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); + let rgba_min = rgba_row_bytes(width); + assert!(y.len() >= width, "y row too short"); + assert!(u_half.len() >= width / 2, "u_half row too short"); + assert!(v_half.len() >= width / 2, "v_half row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + let _ = use_simd; // SIMD per-arch routes land in Ship 8 Tranche 5a. + scalar::yuv_420p16_to_rgba_row(y, u_half, v_half, rgba_out, width, matrix, full_range); +} + +/// Converts one row of **16-bit** YUV 4:2:0 to **native-depth `u16`** +/// packed **RGBA** — full-range output `[0, 65535]`; alpha element +/// is `0xFFFF` (opaque maximum at 16-bit). +/// +/// Routes through the dedicated 16-bit u16-output scalar kernel +/// (`scalar::yuv_420p16_to_rgba_u16_row`) — uses i64 chroma multiply +/// for the wider `coeff × u_d` product at 16 → 16-bit scaling. SIMD +/// per-arch routes land in the follow-up Ship 8 Tranche 5b PR. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv420p16_to_rgba_u16_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); + let rgba_min = rgba_row_elems(width); + assert!(y.len() >= width, "y row too short"); + assert!(u_half.len() >= width / 2, "u_half row too short"); + assert!(v_half.len() >= width / 2, "v_half row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + let _ = use_simd; // SIMD per-arch routes land in Ship 8 Tranche 5b. + scalar::yuv_420p16_to_rgba_u16_row(y, u_half, v_half, rgba_out, width, matrix, full_range); +} + +/// Converts one row of **P016** (semi-planar 4:2:0, full 16-bit +/// samples) to packed **8-bit** **RGBA**. Alpha defaults to `0xFF`. +/// +/// Routes through the dedicated 16-bit P016 scalar kernel +/// (`scalar::p16_to_rgba_row`). SIMD per-arch routes land in the +/// follow-up Ship 8 Tranche 5a PR. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn p016_to_rgba_row( + y: &[u16], + uv_half: &[u16], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + assert_eq!(width & 1, 0, "semi-planar 4:2:0 requires even width"); + let rgba_min = rgba_row_bytes(width); + assert!(y.len() >= width, "y row too short"); + assert!(uv_half.len() >= width, "uv_half row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + let _ = use_simd; // SIMD per-arch routes land in Ship 8 Tranche 5a. + scalar::p16_to_rgba_row(y, uv_half, rgba_out, width, matrix, full_range); +} + +/// Converts one row of **P016** to **native-depth `u16`** packed +/// **RGBA** — full-range output `[0, 65535]`; alpha element is +/// `0xFFFF`. +/// +/// Routes through the dedicated 16-bit u16-output P016 scalar kernel +/// (`scalar::p16_to_rgba_u16_row`) — i64 chroma multiply. SIMD +/// per-arch routes land in the follow-up Ship 8 Tranche 5b PR. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn p016_to_rgba_u16_row( + y: &[u16], + uv_half: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + assert_eq!(width & 1, 0, "semi-planar 4:2:0 requires even width"); + let rgba_min = rgba_row_elems(width); + assert!(y.len() >= width, "y row too short"); + assert!(uv_half.len() >= width, "uv_half row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + let _ = use_simd; // SIMD per-arch routes land in Ship 8 Tranche 5b. + scalar::p16_to_rgba_u16_row(y, uv_half, rgba_out, width, matrix, full_range); +} + // ---- Pn semi-planar 4:4:4 (P410 / P412 / P416) → RGB -------------------- // // Same shape as the 4:2:0 / 4:2:2 P-family kernels but with full-width @@ -3205,6 +3708,18 @@ fn rgb_row_elems(width: usize) -> usize { } } +/// Element count of one packed `u16`-RGBA row (`width × 4`). Identical +/// math to [`rgba_row_bytes`] — the returned value is in `u16` +/// elements, not bytes. Callers use it to size `&mut [u16]` buffers +/// for the high-bit-depth `u16` RGBA output path. +#[cfg_attr(not(tarpaulin), inline(always))] +fn rgba_row_elems(width: usize) -> usize { + match width.checked_mul(4) { + Some(n) => n, + None => panic!("width ({width}) × 4 overflows usize"), + } +} + /// Maximum permitted magnitude of any element of a fused color /// transform handed to a Bayer row dispatcher. /// diff --git a/src/row/scalar.rs b/src/row/scalar.rs index 8b5da89..4188d93 100644 --- a/src/row/scalar.rs +++ b/src/row/scalar.rs @@ -632,7 +632,6 @@ pub(crate) fn yuv_420p_n_to_rgb_row( // `row::yuv420p10_to_rgba_row` (and its u16 sibling) lands in the // follow-up SIMD/dispatcher PR. Until then this thin wrapper has no // caller. -#[allow(dead_code)] #[cfg_attr(not(tarpaulin), inline(always))] pub(crate) fn yuv_420p_n_to_rgba_row( y: &[u16], @@ -801,7 +800,6 @@ pub(crate) fn yuv_420p_n_to_rgb_u16_row( // Scalar prep for Ship 8 Tranche 5b: the public dispatcher // `row::yuv420p10_to_rgba_u16_row` lands in the follow-up SIMD/dispatcher // PR. Until then this thin wrapper has no caller. -#[allow(dead_code)] #[cfg_attr(not(tarpaulin), inline(always))] pub(crate) fn yuv_420p_n_to_rgba_u16_row( y: &[u16], @@ -1070,7 +1068,6 @@ pub(crate) fn yuv_420p16_to_rgb_row( // Scalar prep for Ship 8 Tranche 5a: the public dispatcher // `row::yuv420p16_to_rgba_row` lands in the follow-up SIMD/dispatcher // PR. Until then this thin wrapper has no caller. -#[allow(dead_code)] #[cfg_attr(not(tarpaulin), inline(always))] pub(crate) fn yuv_420p16_to_rgba_row( y: &[u16], @@ -1178,7 +1175,6 @@ pub(crate) fn yuv_420p16_to_rgb_u16_row( // Scalar prep for Ship 8 Tranche 5b: the public dispatcher // `row::yuv420p16_to_rgba_u16_row` lands in the follow-up SIMD/dispatcher // PR. Until then this thin wrapper has no caller. -#[allow(dead_code)] #[cfg_attr(not(tarpaulin), inline(always))] pub(crate) fn yuv_420p16_to_rgba_u16_row( y: &[u16], @@ -1363,7 +1359,6 @@ pub(crate) fn p16_to_rgb_row( // Scalar prep for Ship 8 Tranche 5a: the public dispatcher // `row::p016_to_rgba_row` lands in the follow-up SIMD/dispatcher PR. // Until then this thin wrapper has no caller. -#[allow(dead_code)] #[cfg_attr(not(tarpaulin), inline(always))] pub(crate) fn p16_to_rgba_row( y: &[u16], @@ -1454,7 +1449,6 @@ pub(crate) fn p16_to_rgb_u16_row( // Scalar prep for Ship 8 Tranche 5b: the public dispatcher // `row::p016_to_rgba_u16_row` lands in the follow-up SIMD/dispatcher // PR. Until then this thin wrapper has no caller. -#[allow(dead_code)] #[cfg_attr(not(tarpaulin), inline(always))] pub(crate) fn p16_to_rgba_u16_row( y: &[u16], @@ -1570,7 +1564,6 @@ pub(crate) fn p_n_to_rgb_row( // follow-up SIMD/dispatcher PR. Until then this thin wrapper has no // caller. P016 has its own kernel family // ([`p16_to_rgb_or_rgba_row`]) — never routed here. -#[allow(dead_code)] #[cfg_attr(not(tarpaulin), inline(always))] pub(crate) fn p_n_to_rgba_row( y: &[u16], @@ -1704,7 +1697,6 @@ pub(crate) fn p_n_to_rgb_u16_row( // the follow-up SIMD/dispatcher PR. Until then this thin wrapper has // no caller. P016 has its own u16 kernel family // ([`p16_to_rgb_or_rgba_u16_row`]) — never routed here. -#[allow(dead_code)] #[cfg_attr(not(tarpaulin), inline(always))] pub(crate) fn p_n_to_rgba_u16_row( y: &[u16], From ae2a47126aaa8c81d4ff5694bf46805e018d3d0e Mon Sep 17 00:00:00 2001 From: uqio <276879906+uqio@users.noreply.github.com> Date: Sun, 26 Apr 2026 22:39:13 +1200 Subject: [PATCH 5/5] Update src/row/scalar.rs Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- src/row/scalar.rs | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/src/row/scalar.rs b/src/row/scalar.rs index 4188d93..5390a49 100644 --- a/src/row/scalar.rs +++ b/src/row/scalar.rs @@ -563,12 +563,20 @@ pub(crate) fn expand_rgb_u16_to_rgba_u16_row( rgba_out: &mut [u16], width: usize, ) { - debug_assert!(rgb.len() >= width * 3, "rgb row too short"); - debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short"); + const { + assert!(BITS > 0 && BITS <= 16); + } + + let rgb_len = width.checked_mul(3).expect("rgb row length overflow"); + let rgba_len = width.checked_mul(4).expect("rgba row length overflow"); + + debug_assert!(rgb.len() >= rgb_len, "rgb row too short"); + debug_assert!(rgba_out.len() >= rgba_len, "rgba_out row too short"); + let alpha_max: u16 = ((1u32 << BITS) - 1) as u16; - for (rgb_px, rgba_px) in rgb[..width * 3] + for (rgb_px, rgba_px) in rgb[..rgb_len] .chunks_exact(3) - .zip(rgba_out[..width * 4].chunks_exact_mut(4)) + .zip(rgba_out[..rgba_len].chunks_exact_mut(4)) { rgba_px[0] = rgb_px[0]; rgba_px[1] = rgb_px[1];