From 6f6e21321fac1fb47f3a46a64f86c268e057fb42 Mon Sep 17 00:00:00 2001
From: uqio <276879906+uqio@users.noreply.github.com>
Date: Sun, 26 Apr 2026 21:51:24 +1200
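Subject: [PATCH 1/5] row/scalar: add RGBA variants of high-bit 4:2:0 kernels

Turn yuv_420p_n_to_rgb_row, yuv_420p_n_to_rgb_u16_row, p_n_to_rgb_row
and p_n_to_rgb_u16_row into thin wrappers over shared
`*_to_rgb_or_rgba_*` kernels that are generic over `const ALPHA: bool`,
and add the matching `*_to_rgba_*` wrappers. The u8 kernels store a
constant 0xFF alpha; the u16 kernels store (1 << BITS) - 1.

Also add expand_rgb_u16_to_rgba_u16_row::<BITS>, the u16 analogue of
expand_rgb_to_rgba_row.
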
---
 src/row/scalar.rs | 336 ++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 298 insertions(+), 38 deletions(-)
diff --git a/src/row/scalar.rs b/src/row/scalar.rs
index 71d1018..c25a9e7 100644
--- a/src/row/scalar.rs
+++ b/src/row/scalar.rs
@@ -534,6 +534,43 @@ pub(crate) fn expand_rgb_to_rgba_row(rgb: &[u8], rgba_out: &mut [u8], width: usi
   }
 }
 
+/// `u16` analogue of [`expand_rgb_to_rgba_row`]: copy each `u16` RGB
+/// triple into a `u16` RGBA quadruple, with the alpha element set to
+/// `(1 << BITS) - 1` (opaque maximum at the input bit depth). Used by
+/// `MixedSinker` Strategy A on the **u16** path when both
+/// `with_rgb_u16` and `with_rgba_u16` are attached — runs the YUV→RGB
+/// math once into the u16 RGB buffer, then this helper fans out to the
+/// u16 RGBA buffer with no second per-pixel kernel call.
+///
+/// `BITS` is a `const` parameter so the alpha constant resolves at
+/// compile time per format (10 / 12 / 16 etc.); the compiler folds the
+/// `(1 << BITS) - 1` expression to a literal in each monomorphization.
+///
+/// # Panics (debug builds)
+///
+/// - `rgb.len() >= 3 * width` (`u16` elements)
+/// - `rgba_out.len() >= 4 * width` (`u16` elements)
+#[cfg(any(feature = "std", feature = "alloc"))]
+#[cfg_attr(not(tarpaulin), inline(always))]
+pub(crate) fn expand_rgb_u16_to_rgba_u16_row<const BITS: u32>(
+  rgb: &[u16],
+  rgba_out: &mut [u16],
+  width: usize,
+) {
+  debug_assert!(rgb.len() >= width * 3, "rgb row too short");
+  debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short");
+  let alpha_max: u16 = ((1u32 << BITS) - 1) as u16;
+  for (rgb_px, rgba_px) in rgb[..width * 3]
+    .chunks_exact(3)
+    .zip(rgba_out[..width * 4].chunks_exact_mut(4))
+  {
+    rgba_px[0] = rgb_px[0];
+    rgba_px[1] = rgb_px[1];
+    rgba_px[2] = rgb_px[2];
+    rgba_px[3] = alpha_max;
+  }
+}
+
 // ---- High-bit-depth YUV 4:2:0 → RGB (BITS ∈ {10, 12, 14}) -------------
 
 /// Converts one row of high-bit-depth 4:2:0 YUV (`u16` samples in the
@@ -551,6 +588,8 @@ pub(crate) fn expand_rgb_to_rgba_row(rgb: &[u8], rgba_out: &mut [u8], width: usi
 /// source precision inline rather than converting first at `BITS` and
 /// then downshifting. This keeps the fast path a single Q15 shift.
 ///
+/// Thin wrapper over [`yuv_420p_n_to_rgb_or_rgba_row`] with `ALPHA = false`.
+///
 /// # Panics (debug builds)
 ///
 /// - `width` must be even.
@@ -565,18 +604,74 @@ pub(crate) fn yuv_420p_n_to_rgb_row<const BITS: u32>(
   width: usize,
   matrix: ColorMatrix,
   full_range: bool,
+) {
+  yuv_420p_n_to_rgb_or_rgba_row::<BITS, false>(
+    y, u_half, v_half, rgb_out, width, matrix, full_range,
+  );
+}
+
+/// Converts one row of high‑bit‑depth 4:2:0 YUV (`u16` samples in the
+/// low `BITS` bits) directly to **8-bit** packed **RGBA**. Same numerical
+/// contract as [`yuv_420p_n_to_rgb_row`]; the only differences are the
+/// per-pixel stride (4 vs 3) and the alpha byte (`0xFF`, opaque).
+///
+/// Thin wrapper over [`yuv_420p_n_to_rgb_or_rgba_row`] with `ALPHA = true`.
+///
+/// # Panics (debug builds)
+///
+/// - `width` must be even.
+/// - `rgba_out.len() >= 4 * width` (other slices: same as RGB variant).
+#[cfg_attr(not(tarpaulin), inline(always))]
+pub(crate) fn yuv_420p_n_to_rgba_row<const BITS: u32>(
+  y: &[u16],
+  u_half: &[u16],
+  v_half: &[u16],
+  rgba_out: &mut [u8],
+  width: usize,
+  matrix: ColorMatrix,
+  full_range: bool,
+) {
+  yuv_420p_n_to_rgb_or_rgba_row::<BITS, true>(
+    y, u_half, v_half, rgba_out, width, matrix, full_range,
+  );
+}
+
+/// Shared kernel for [`yuv_420p_n_to_rgb_row`] (`ALPHA = false`,
+/// 3 bpp store) and [`yuv_420p_n_to_rgba_row`] (`ALPHA = true`,
+/// 4 bpp store with constant `0xFF` alpha).
+///
+/// The compiler monomorphizes into two separate functions; the
+/// `if ALPHA` branches are DCE'd at each call site.
+///
+/// # Panics (debug builds)
+///
+/// - `width` must be even.
+/// - `y.len() >= width`, `u_half.len() >= width / 2`,
+///   `v_half.len() >= width / 2`,
+///   `out.len() >= width * if ALPHA { 4 } else { 3 }`.
+#[cfg_attr(not(tarpaulin), inline(always))]
+pub(crate) fn yuv_420p_n_to_rgb_or_rgba_row<const BITS: u32, const ALPHA: bool>(
+  y: &[u16],
+  u_half: &[u16],
+  v_half: &[u16],
+  out: &mut [u8],
+  width: usize,
+  matrix: ColorMatrix,
+  full_range: bool,
 ) {
   // Compile-time guard — fails monomorphization for any BITS outside
   // {9, 10, 12, 14}. 16 would overflow the Q15 chroma sum (16-bit lives
   // in `yuv_420p16_to_rgb_row`'s i64 chroma family); 8 belongs to the
-  // non-const-generic `yuv_420_to_rgb_row`. Without this guard a release
-  // build instantiating ::<16> would silently produce wrong output.
+  // non-const-generic `yuv_420_to_rgb_or_rgba_row`. Without this guard a
+  // release build instantiating ::<16, _> would silently produce wrong
+  // output.
   const { assert!(BITS == 9 || BITS == 10 || BITS == 12 || BITS == 14) };
+  let bpp: usize = if ALPHA { 4 } else { 3 };
   debug_assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width");
   debug_assert!(y.len() >= width, "y row too short");
   debug_assert!(u_half.len() >= width / 2, "u_half row too short");
   debug_assert!(v_half.len() >= width / 2, "v_half row too short");
-  debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short");
+  debug_assert!(out.len() >= width * bpp, "out row too short");
 
   let coeffs = Coefficients::for_matrix(matrix);
   let (y_off, y_scale, c_scale) = range_params_n::<BITS, 8>(full_range);
@@ -603,14 +698,20 @@ pub(crate) fn yuv_420p_n_to_rgb_row<const BITS: u32>(
     let b_chroma = q15_chroma(coeffs.b_u(), u_d, coeffs.b_v(), v_d);
 
     let y0 = q15_scale((y[x] & mask) as i32 - y_off, y_scale);
-    rgb_out[x * 3] = clamp_u8(y0 + r_chroma);
-    rgb_out[x * 3 + 1] = clamp_u8(y0 + g_chroma);
-    rgb_out[x * 3 + 2] = clamp_u8(y0 + b_chroma);
+    out[x * bpp] = clamp_u8(y0 + r_chroma);
+    out[x * bpp + 1] = clamp_u8(y0 + g_chroma);
+    out[x * bpp + 2] = clamp_u8(y0 + b_chroma);
+    if ALPHA {
+      out[x * bpp + 3] = 0xFF;
+    }
 
     let y1 = q15_scale((y[x + 1] & mask) as i32 - y_off, y_scale);
-    rgb_out[(x + 1) * 3] = clamp_u8(y1 + r_chroma);
-    rgb_out[(x + 1) * 3 + 1] = clamp_u8(y1 + g_chroma);
-    rgb_out[(x + 1) * 3 + 2] = clamp_u8(y1 + b_chroma);
+    out[(x + 1) * bpp] = clamp_u8(y1 + r_chroma);
+    out[(x + 1) * bpp + 1] = clamp_u8(y1 + g_chroma);
+    out[(x + 1) * bpp + 2] = clamp_u8(y1 + b_chroma);
+    if ALPHA {
+      out[(x + 1) * bpp + 3] = 0xFF;
+    }
 
     x += 2;
   }
@@ -650,6 +751,8 @@ fn q15_chroma(c_u: i32, u_d: i32, c_v: i32, v_d: i32) -> i32 {
 /// downstream work. Callers who only need 8‑bit output should prefer
 /// [`yuv_420p_n_to_rgb_row`], which is ~2× faster.
 ///
+/// Thin wrapper over [`yuv_420p_n_to_rgb_or_rgba_u16_row`] with `ALPHA = false`.
+///
 /// # Panics (debug builds)
 ///
 /// - `width` must be even.
@@ -665,25 +768,79 @@ pub(crate) fn yuv_420p_n_to_rgb_u16_row<const BITS: u32>(
   matrix: ColorMatrix,
   full_range: bool,
 ) {
-  // Compile-time guard — see note on `yuv_420p_n_to_rgb_row`. The
-  // 16-bit u16-output path is `yuv_420p16_to_rgb_u16_row` (i64 chroma
-  // family).
+  yuv_420p_n_to_rgb_or_rgba_u16_row::<BITS, false>(
+    y, u_half, v_half, rgb_out, width, matrix, full_range,
+  );
+}
+
+/// Converts one row of high‑bit‑depth 4:2:0 YUV → **native‑depth `u16`
+/// packed RGBA**. Same numerical contract as
+/// [`yuv_420p_n_to_rgb_u16_row`]; the only differences are the
+/// per-pixel stride (4 vs 3 `u16` elements) and the alpha element,
+/// `(1 << BITS) - 1` (opaque maximum at the input bit depth).
+///
+/// Thin wrapper over [`yuv_420p_n_to_rgb_or_rgba_u16_row`] with `ALPHA = true`.
+///
+/// # Panics (debug builds)
+///
+/// - `width` must be even.
+/// - `rgba_out.len() >= 4 * width` (other slices: same as RGB variant).
+#[cfg_attr(not(tarpaulin), inline(always))]
+pub(crate) fn yuv_420p_n_to_rgba_u16_row<const BITS: u32>(
+  y: &[u16],
+  u_half: &[u16],
+  v_half: &[u16],
+  rgba_out: &mut [u16],
+  width: usize,
+  matrix: ColorMatrix,
+  full_range: bool,
+) {
+  yuv_420p_n_to_rgb_or_rgba_u16_row::<BITS, true>(
+    y, u_half, v_half, rgba_out, width, matrix, full_range,
+  );
+}
+
+/// Shared kernel for [`yuv_420p_n_to_rgb_u16_row`] (`ALPHA = false`,
+/// 3 bpp store) and [`yuv_420p_n_to_rgba_u16_row`] (`ALPHA = true`,
+/// 4 bpp store with opaque alpha = `(1 << BITS) - 1`).
+///
+/// # Panics (debug builds)
+///
+/// - `width` must be even.
+/// - `y.len() >= width`, `u_half.len() >= width / 2`,
+///   `v_half.len() >= width / 2`,
+///   `out.len() >= width * if ALPHA { 4 } else { 3 }` (`u16` elements).
+#[cfg_attr(not(tarpaulin), inline(always))]
+pub(crate) fn yuv_420p_n_to_rgb_or_rgba_u16_row<const BITS: u32, const ALPHA: bool>(
+  y: &[u16],
+  u_half: &[u16],
+  v_half: &[u16],
+  out: &mut [u16],
+  width: usize,
+  matrix: ColorMatrix,
+  full_range: bool,
+) {
+  // Compile-time guard — see note on `yuv_420p_n_to_rgb_or_rgba_row`.
+  // The 16-bit u16-output path is `yuv_420p16_to_rgb_or_rgba_u16_row`
+  // (i64 chroma family).
   const { assert!(BITS == 9 || BITS == 10 || BITS == 12 || BITS == 14) };
+  let bpp: usize = if ALPHA { 4 } else { 3 };
   debug_assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width");
   debug_assert!(y.len() >= width, "y row too short");
   debug_assert!(u_half.len() >= width / 2, "u_half row too short");
   debug_assert!(v_half.len() >= width / 2, "v_half row too short");
-  debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short");
+  debug_assert!(out.len() >= width * bpp, "out row too short");
 
   let coeffs = Coefficients::for_matrix(matrix);
   let (y_off, y_scale, c_scale) = range_params_n::<BITS, BITS>(full_range);
   let bias = chroma_bias::<BITS>();
   let out_max: i32 = (1i32 << BITS) - 1;
   let mask = bits_mask::<BITS>();
+  let alpha_max: u16 = out_max as u16;
 
   // Every sample AND‑masked to the low `BITS` bits — see matching
-  // comment in [`yuv_420p_n_to_rgb_row`]. Critical for the native‑
-  // depth u16 output path: `range_params_n::<10, 10>` uses
+  // comment in [`yuv_420p_n_to_rgb_or_rgba_row`]. Critical for the
+  // native‑depth u16 output path: `range_params_n::<10, 10>` uses
   // `y_scale = c_scale = 32768` (unit Q15 for BITS==OUT_BITS full
   // range), so an unmasked out‑of‑range sample would push `u_d` /
   // `v_d` to ±32256 and the subsequent `coeff * v_d` exceeds i16
@@ -700,14 +857,20 @@ pub(crate) fn yuv_420p_n_to_rgb_u16_row<const BITS: u32>(
     let b_chroma = q15_chroma(coeffs.b_u(), u_d, coeffs.b_v(), v_d);
 
     let y0 = q15_scale((y[x] & mask) as i32 - y_off, y_scale);
-    rgb_out[x * 3] = (y0 + r_chroma).clamp(0, out_max) as u16;
-    rgb_out[x * 3 + 1] = (y0 + g_chroma).clamp(0, out_max) as u16;
-    rgb_out[x * 3 + 2] = (y0 + b_chroma).clamp(0, out_max) as u16;
+    out[x * bpp] = (y0 + r_chroma).clamp(0, out_max) as u16;
+    out[x * bpp + 1] = (y0 + g_chroma).clamp(0, out_max) as u16;
+    out[x * bpp + 2] = (y0 + b_chroma).clamp(0, out_max) as u16;
+    if ALPHA {
+      out[x * bpp + 3] = alpha_max;
+    }
 
     let y1 = q15_scale((y[x + 1] & mask) as i32 - y_off, y_scale);
-    rgb_out[(x + 1) * 3] = (y1 + r_chroma).clamp(0, out_max) as u16;
-    rgb_out[(x + 1) * 3 + 1] = (y1 + g_chroma).clamp(0, out_max) as u16;
-    rgb_out[(x + 1) * 3 + 2] = (y1 + b_chroma).clamp(0, out_max) as u16;
+    out[(x + 1) * bpp] = (y1 + r_chroma).clamp(0, out_max) as u16;
+    out[(x + 1) * bpp + 1] = (y1 + g_chroma).clamp(0, out_max) as u16;
+    out[(x + 1) * bpp + 2] = (y1 + b_chroma).clamp(0, out_max) as u16;
+    if ALPHA {
+      out[(x + 1) * bpp + 3] = alpha_max;
+    }
 
     x += 2;
   }
@@ -1163,6 +1326,8 @@ pub(crate) fn p16_to_rgb_u16_row(
 /// kernel — has its active low bits discarded (producing near‑black
 /// output), matching every SIMD backend.
 ///
+/// Thin wrapper over [`p_n_to_rgb_or_rgba_row`] with `ALPHA = false`.
+///
 /// # Panics (debug builds)
 ///
 /// - `width` must be even.
@@ -1176,6 +1341,45 @@ pub(crate) fn p_n_to_rgb_row<const BITS: u32>(
   width: usize,
   matrix: ColorMatrix,
   full_range: bool,
+) {
+  p_n_to_rgb_or_rgba_row::<BITS, false>(y, uv_half, rgb_out, width, matrix, full_range);
+}
+
+/// Converts one row of high‑bit‑packed semi‑planar 4:2:0 (P010/P012)
+/// to **8‑bit** packed **RGBA**. Same numerical contract as
+/// [`p_n_to_rgb_row`]; the only differences are the per-pixel stride
+/// (4 vs 3) and the alpha byte (`0xFF`, opaque).
+///
+/// Thin wrapper over [`p_n_to_rgb_or_rgba_row`] with `ALPHA = true`.
+#[cfg_attr(not(tarpaulin), inline(always))]
+pub(crate) fn p_n_to_rgba_row<const BITS: u32>(
+  y: &[u16],
+  uv_half: &[u16],
+  rgba_out: &mut [u8],
+  width: usize,
+  matrix: ColorMatrix,
+  full_range: bool,
+) {
+  p_n_to_rgb_or_rgba_row::<BITS, true>(y, uv_half, rgba_out, width, matrix, full_range);
+}
+
+/// Shared kernel for [`p_n_to_rgb_row`] (`ALPHA = false`, 3 bpp store)
+/// and [`p_n_to_rgba_row`] (`ALPHA = true`, 4 bpp store with constant
+/// `0xFF` alpha).
+///
+/// # Panics (debug builds)
+///
+/// - `width` must be even.
+/// - `y.len() >= width`, `uv_half.len() >= width`,
+///   `out.len() >= width * if ALPHA { 4 } else { 3 }`.
+#[cfg_attr(not(tarpaulin), inline(always))]
+pub(crate) fn p_n_to_rgb_or_rgba_row<const BITS: u32, const ALPHA: bool>(
+  y: &[u16],
+  uv_half: &[u16],
+  out: &mut [u8],
+  width: usize,
+  matrix: ColorMatrix,
+  full_range: bool,
 ) {
   // High-bit-packed Pn kernels are only defined for BITS in {10, 12}.
   // Outside that set, `16 - BITS` could under/overflow and the Q15
@@ -1183,12 +1387,13 @@ pub(crate) fn p_n_to_rgb_row<const BITS: u32>(
   // the SIMD dispatcher hands control to unsafe code.
   debug_assert!(
     BITS == 10 || BITS == 12,
-    "p_n_to_rgb_row only supports BITS in {{10, 12}}"
+    "p_n_to_rgb_or_rgba_row only supports BITS in {{10, 12}}"
   );
+  let bpp: usize = if ALPHA { 4 } else { 3 };
   debug_assert_eq!(width & 1, 0, "semi-planar high-bit requires even width");
   debug_assert!(y.len() >= width, "y row too short");
   debug_assert!(uv_half.len() >= width, "uv row too short");
-  debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short");
+  debug_assert!(out.len() >= width * bpp, "out row too short");
 
   let coeffs = Coefficients::for_matrix(matrix);
   let (y_off, y_scale, c_scale) = range_params_n::<BITS, 8>(full_range);
@@ -1215,14 +1420,20 @@ pub(crate) fn p_n_to_rgb_row<const BITS: u32>(
     let b_chroma = q15_chroma(coeffs.b_u(), u_d, coeffs.b_v(), v_d);
 
     let y0 = q15_scale((y[x] >> shift) as i32 - y_off, y_scale);
-    rgb_out[x * 3] = clamp_u8(y0 + r_chroma);
-    rgb_out[x * 3 + 1] = clamp_u8(y0 + g_chroma);
-    rgb_out[x * 3 + 2] = clamp_u8(y0 + b_chroma);
+    out[x * bpp] = clamp_u8(y0 + r_chroma);
+    out[x * bpp + 1] = clamp_u8(y0 + g_chroma);
+    out[x * bpp + 2] = clamp_u8(y0 + b_chroma);
+    if ALPHA {
+      out[x * bpp + 3] = 0xFF;
+    }
 
     let y1 = q15_scale((y[x + 1] >> shift) as i32 - y_off, y_scale);
-    rgb_out[(x + 1) * 3] = clamp_u8(y1 + r_chroma);
-    rgb_out[(x + 1) * 3 + 1] = clamp_u8(y1 + g_chroma);
-    rgb_out[(x + 1) * 3 + 2] = clamp_u8(y1 + b_chroma);
+    out[(x + 1) * bpp] = clamp_u8(y1 + r_chroma);
+    out[(x + 1) * bpp + 1] = clamp_u8(y1 + g_chroma);
+    out[(x + 1) * bpp + 2] = clamp_u8(y1 + b_chroma);
+    if ALPHA {
+      out[(x + 1) * bpp + 3] = 0xFF;
+    }
 
     x += 2;
   }
@@ -1241,6 +1452,8 @@ pub(crate) fn p_n_to_rgb_row<const BITS: u32>(
 /// extract the `BITS`-bit value from the high-bit packing) and the
 /// interleaved UV layout.
 ///
+/// Thin wrapper over [`p_n_to_rgb_or_rgba_u16_row`] with `ALPHA = false`.
+///
 /// # Panics (debug builds)
 ///
 /// - `width` must be even.
@@ -1255,22 +1468,63 @@ pub(crate) fn p_n_to_rgb_u16_row<const BITS: u32>(
   matrix: ColorMatrix,
   full_range: bool,
 ) {
-  // See `p_n_to_rgb_row` for the BITS range rationale. Duplicated
+  p_n_to_rgb_or_rgba_u16_row::<BITS, false>(y, uv_half, rgb_out, width, matrix, full_range);
+}
+
+/// Converts one row of high‑bit‑packed semi‑planar 4:2:0 (P010/P012)
+/// to **native‑depth `u16`** packed **RGBA** — output is low‑bit‑packed
+/// to match [`p_n_to_rgb_u16_row`]. Alpha is `(1 << BITS) - 1` (opaque
+/// maximum at the input bit depth).
+///
+/// Thin wrapper over [`p_n_to_rgb_or_rgba_u16_row`] with `ALPHA = true`.
+#[cfg_attr(not(tarpaulin), inline(always))]
+pub(crate) fn p_n_to_rgba_u16_row<const BITS: u32>(
+  y: &[u16],
+  uv_half: &[u16],
+  rgba_out: &mut [u16],
+  width: usize,
+  matrix: ColorMatrix,
+  full_range: bool,
+) {
+  p_n_to_rgb_or_rgba_u16_row::<BITS, true>(y, uv_half, rgba_out, width, matrix, full_range);
+}
+
+/// Shared kernel for [`p_n_to_rgb_u16_row`] (`ALPHA = false`, 3 bpp
+/// store) and [`p_n_to_rgba_u16_row`] (`ALPHA = true`, 4 bpp store
+/// with opaque alpha = `(1 << BITS) - 1`).
+///
+/// # Panics (debug builds)
+///
+/// - `width` must be even.
+/// - `y.len() >= width`, `uv_half.len() >= width`,
+///   `out.len() >= width * if ALPHA { 4 } else { 3 }` (`u16` elements).
+#[cfg_attr(not(tarpaulin), inline(always))]
+pub(crate) fn p_n_to_rgb_or_rgba_u16_row<const BITS: u32, const ALPHA: bool>(
+  y: &[u16],
+  uv_half: &[u16],
+  out: &mut [u16],
+  width: usize,
+  matrix: ColorMatrix,
+  full_range: bool,
+) {
+  // See `p_n_to_rgb_or_rgba_row` for the BITS range rationale. Duplicated
   // here so either entry point catches misuse on its own.
   debug_assert!(
     BITS == 10 || BITS == 12,
-    "p_n_to_rgb_u16_row only supports BITS in {{10, 12}}"
+    "p_n_to_rgb_or_rgba_u16_row only supports BITS in {{10, 12}}"
   );
+  let bpp: usize = if ALPHA { 4 } else { 3 };
   debug_assert_eq!(width & 1, 0, "semi-planar high-bit requires even width");
   debug_assert!(y.len() >= width, "y row too short");
   debug_assert!(uv_half.len() >= width, "uv row too short");
-  debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short");
+  debug_assert!(out.len() >= width * bpp, "out row too short");
 
   let coeffs = Coefficients::for_matrix(matrix);
   let (y_off, y_scale, c_scale) = range_params_n::<BITS, BITS>(full_range);
   let bias = chroma_bias::<BITS>();
   let out_max: i32 = (1i32 << BITS) - 1;
   let shift = 16 - BITS;
+  let alpha_max: u16 = out_max as u16;
 
   let mut x = 0;
   while x < width {
@@ -1285,14 +1539,20 @@ pub(crate) fn p_n_to_rgb_u16_row<const BITS: u32>(
     let b_chroma = q15_chroma(coeffs.b_u(), u_d, coeffs.b_v(), v_d);
 
     let y0 = q15_scale((y[x] >> shift) as i32 - y_off, y_scale);
-    rgb_out[x * 3] = (y0 + r_chroma).clamp(0, out_max) as u16;
-    rgb_out[x * 3 + 1] = (y0 + g_chroma).clamp(0, out_max) as u16;
-    rgb_out[x * 3 + 2] = (y0 + b_chroma).clamp(0, out_max) as u16;
+    out[x * bpp] = (y0 + r_chroma).clamp(0, out_max) as u16;
+    out[x * bpp + 1] = (y0 + g_chroma).clamp(0, out_max) as u16;
+    out[x * bpp + 2] = (y0 + b_chroma).clamp(0, out_max) as u16;
+    if ALPHA {
+      out[x * bpp + 3] = alpha_max;
+    }
 
     let y1 = q15_scale((y[x + 1] >> shift) as i32 - y_off, y_scale);
-    rgb_out[(x + 1) * 3] = (y1 + r_chroma).clamp(0, out_max) as u16;
-    rgb_out[(x + 1) * 3 + 1] = (y1 + g_chroma).clamp(0, out_max) as u16;
-    rgb_out[(x + 1) * 3 + 2] = (y1 + b_chroma).clamp(0, out_max) as u16;
+    out[(x + 1) * bpp] = (y1 + r_chroma).clamp(0, out_max) as u16;
+    out[(x + 1) * bpp + 1] = (y1 + g_chroma).clamp(0, out_max) as u16;
+    out[(x + 1) * bpp + 2] = (y1 + b_chroma).clamp(0, out_max) as u16;
+    if ALPHA {
+      out[(x + 1) * bpp + 3] = alpha_max;
+    }
 
     x += 2;
   }

From 80fbcda987d12f2f40786af62f4b4ed05d945e5a Mon Sep 17 00:00:00 2001
From: uqio <276879906+uqio@users.noreply.github.com>
Date: Sun, 26 Apr 2026 22:04:38 +1200
Subject: [PATCH 2/5] row/scalar: add 16-bit RGBA kernels, allow dead_code

---
 src/row/scalar.rs | 281 +++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 253 insertions(+), 28 deletions(-)

diff --git a/src/row/scalar.rs b/src/row/scalar.rs
index c25a9e7..3bdf51c 100644
--- a/src/row/scalar.rs
+++ b/src/row/scalar.rs
@@ -550,7 +550,13 @@ pub(crate) fn expand_rgb_to_rgba_row(rgb: &[u8], rgba_out: &mut [u8], width: usi
 ///
 /// - `rgb.len() >= 3 * width` (`u16` elements)
 /// - `rgba_out.len() >= 4 * width` (`u16` elements)
+//
+// Scalar prep for Ship 8 Tranche 5: the consumer (MixedSinker Strategy A
+// on the u16 path) lands in the follow-up Tranche 5b PR. `dead_code`
+// allow lets this prep PR ship the foundation without the eventual call
+// site.
 #[cfg(any(feature = "std", feature = "alloc"))]
+#[allow(dead_code)]
 #[cfg_attr(not(tarpaulin), inline(always))]
 pub(crate) fn expand_rgb_u16_to_rgba_u16_row<const BITS: u32>(
   rgb: &[u16],
@@ -621,6 +627,12 @@ pub(crate) fn yuv_420p_n_to_rgb_row<const BITS: u32>(
 ///
 /// - `width` must be even.
 /// - `rgba_out.len() >= 4 * width` (other slices: same as RGB variant).
+//
+// Scalar prep for Ship 8 Tranche 5a: the public dispatcher
+// `row::yuv420p10_to_rgba_row` (and its u16 sibling) lands in the
+// follow-up SIMD/dispatcher PR. Until then this thin wrapper has no
+// caller.
+#[allow(dead_code)]
 #[cfg_attr(not(tarpaulin), inline(always))]
 pub(crate) fn yuv_420p_n_to_rgba_row<const BITS: u32>(
   y: &[u16],
@@ -785,6 +797,11 @@ pub(crate) fn yuv_420p_n_to_rgb_u16_row<const BITS: u32>(
 ///
 /// - `width` must be even.
 /// - `rgba_out.len() >= 4 * width` (other slices: same as RGB variant).
+//
+// Scalar prep for Ship 8 Tranche 5b: the public dispatcher
+// `row::yuv420p10_to_rgba_u16_row` lands in the follow-up SIMD/dispatcher
+// PR. Until then this thin wrapper has no caller.
+#[allow(dead_code)]
 #[cfg_attr(not(tarpaulin), inline(always))]
 pub(crate) fn yuv_420p_n_to_rgba_u16_row<const BITS: u32>(
   y: &[u16],
@@ -1023,6 +1040,8 @@ fn q15_scale64(sample: i32, scale_q15: i32) -> i32 {
 /// `BITS = 16`, just without the AND-mask (no upper-bit-zero
 /// guarantee to enforce at 16 bits).
 ///
+/// Thin wrapper over [`yuv_420p16_to_rgb_or_rgba_row`] with `ALPHA = false`.
+///
 /// # Panics (debug builds)
 ///
 /// - `width` must be even.
@@ -1038,11 +1057,56 @@ pub(crate) fn yuv_420p16_to_rgb_row(
   matrix: ColorMatrix,
   full_range: bool,
 ) {
+  yuv_420p16_to_rgb_or_rgba_row::<false>(y, u_half, v_half, rgb_out, width, matrix, full_range);
+}
+
+/// Converts one row of **16-bit** YUV 4:2:0 to **8-bit** packed
+/// **RGBA**. Same numerical contract as [`yuv_420p16_to_rgb_row`];
+/// the only differences are the per-pixel stride (4 vs 3) and the
+/// alpha byte (`0xFF`, opaque).
+///
+/// Thin wrapper over [`yuv_420p16_to_rgb_or_rgba_row`] with `ALPHA = true`.
+//
+// Scalar prep for Ship 8 Tranche 5a: the public dispatcher
+// `row::yuv420p16_to_rgba_row` lands in the follow-up SIMD/dispatcher
+// PR. Until then this thin wrapper has no caller.
+#[allow(dead_code)]
+#[cfg_attr(not(tarpaulin), inline(always))]
+pub(crate) fn yuv_420p16_to_rgba_row(
+  y: &[u16],
+  u_half: &[u16],
+  v_half: &[u16],
+  rgba_out: &mut [u8],
+  width: usize,
+  matrix: ColorMatrix,
+  full_range: bool,
+) {
+  yuv_420p16_to_rgb_or_rgba_row::<true>(y, u_half, v_half, rgba_out, width, matrix, full_range);
+}
+
+/// Shared 16-bit YUV 4:2:0 → 8-bit RGB / RGBA kernel. `ALPHA = false`
+/// emits 3 bpp; `ALPHA = true` emits 4 bpp with constant `0xFF` alpha.
+///
+/// 16-bit input has no AND-mask (every `u16` is a valid sample) and
+/// uses i32 chroma — output-target scaling keeps `u_d * coeff` inside
+/// i32 for u8 output (the i64 chroma family lives in
+/// [`yuv_420p16_to_rgb_or_rgba_u16_row`]).
+#[cfg_attr(not(tarpaulin), inline(always))]
+pub(crate) fn yuv_420p16_to_rgb_or_rgba_row<const ALPHA: bool>(
+  y: &[u16],
+  u_half: &[u16],
+  v_half: &[u16],
+  out: &mut [u8],
+  width: usize,
+  matrix: ColorMatrix,
+  full_range: bool,
+) {
+  let bpp: usize = if ALPHA { 4 } else { 3 };
   debug_assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width");
   debug_assert!(y.len() >= width, "y row too short");
   debug_assert!(u_half.len() >= width / 2, "u_half row too short");
   debug_assert!(v_half.len() >= width / 2, "v_half row too short");
-  debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short");
+  debug_assert!(out.len() >= width * bpp, "out row too short");
 
   let coeffs = Coefficients::for_matrix(matrix);
   let (y_off, y_scale, c_scale) = range_params_n::<16, 8>(full_range);
@@ -1062,14 +1126,20 @@ pub(crate) fn yuv_420p16_to_rgb_row(
     let b_chroma = q15_chroma(coeffs.b_u(), u_d, coeffs.b_v(), v_d);
 
     let y0 = q15_scale(y[x] as i32 - y_off, y_scale);
-    rgb_out[x * 3] = clamp_u8(y0 + r_chroma);
-    rgb_out[x * 3 + 1] = clamp_u8(y0 + g_chroma);
-    rgb_out[x * 3 + 2] = clamp_u8(y0 + b_chroma);
+    out[x * bpp] = clamp_u8(y0 + r_chroma);
+    out[x * bpp + 1] = clamp_u8(y0 + g_chroma);
+    out[x * bpp + 2] = clamp_u8(y0 + b_chroma);
+    if ALPHA {
+      out[x * bpp + 3] = 0xFF;
+    }
 
     let y1 = q15_scale(y[x + 1] as i32 - y_off, y_scale);
-    rgb_out[(x + 1) * 3] = clamp_u8(y1 + r_chroma);
-    rgb_out[(x + 1) * 3 + 1] = clamp_u8(y1 + g_chroma);
-    rgb_out[(x + 1) * 3 + 2] = clamp_u8(y1 + b_chroma);
+    out[(x + 1) * bpp] = clamp_u8(y1 + r_chroma);
+    out[(x + 1) * bpp + 1] = clamp_u8(y1 + g_chroma);
+    out[(x + 1) * bpp + 2] = clamp_u8(y1 + b_chroma);
+    if ALPHA {
+      out[(x + 1) * bpp + 3] = 0xFF;
+    }
 
     x += 2;
   }
@@ -1080,6 +1150,8 @@ pub(crate) fn yuv_420p16_to_rgb_row(
 /// chroma matrix multiply in i64** to accommodate the wider
 /// `coeff × u_d` product at 16 → 16-bit scaling.
 ///
+/// Thin wrapper over [`yuv_420p16_to_rgb_or_rgba_u16_row`] with `ALPHA = false`.
+///
 /// # Panics (debug builds)
 ///
 /// Same contract as [`yuv_420p16_to_rgb_row`] plus `rgb_out` is
@@ -1094,11 +1166,54 @@ pub(crate) fn yuv_420p16_to_rgb_u16_row(
   matrix: ColorMatrix,
   full_range: bool,
 ) {
+  yuv_420p16_to_rgb_or_rgba_u16_row::<false>(y, u_half, v_half, rgb_out, width, matrix, full_range);
+}
+
+/// Converts one row of **16-bit** YUV 4:2:0 to **native-depth `u16`**
+/// packed **RGBA** — alpha element is `0xFFFF` (opaque maximum at
+/// 16-bit).
+///
+/// Thin wrapper over [`yuv_420p16_to_rgb_or_rgba_u16_row`] with `ALPHA = true`.
+//
+// Scalar prep for Ship 8 Tranche 5b: the public dispatcher
+// `row::yuv420p16_to_rgba_u16_row` lands in the follow-up SIMD/dispatcher
+// PR. Until then this thin wrapper has no caller.
+#[allow(dead_code)]
+#[cfg_attr(not(tarpaulin), inline(always))]
+pub(crate) fn yuv_420p16_to_rgba_u16_row(
+  y: &[u16],
+  u_half: &[u16],
+  v_half: &[u16],
+  rgba_out: &mut [u16],
+  width: usize,
+  matrix: ColorMatrix,
+  full_range: bool,
+) {
+  yuv_420p16_to_rgb_or_rgba_u16_row::<true>(y, u_half, v_half, rgba_out, width, matrix, full_range);
+}
+
+/// Shared 16-bit YUV 4:2:0 → native-depth `u16` RGB / RGBA kernel.
+/// `ALPHA = false` emits 3 bpp; `ALPHA = true` emits 4 bpp with
+/// constant `0xFFFF` alpha.
+///
+/// Uses i64 chroma multiply (same rationale as
+/// [`yuv_420p16_to_rgb_u16_row`]).
+#[cfg_attr(not(tarpaulin), inline(always))]
+pub(crate) fn yuv_420p16_to_rgb_or_rgba_u16_row<const ALPHA: bool>(
+  y: &[u16],
+  u_half: &[u16],
+  v_half: &[u16],
+  out: &mut [u16],
+  width: usize,
+  matrix: ColorMatrix,
+  full_range: bool,
+) {
+  let bpp: usize = if ALPHA { 4 } else { 3 };
   debug_assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width");
   debug_assert!(y.len() >= width, "y row too short");
   debug_assert!(u_half.len() >= width / 2, "u_half row too short");
   debug_assert!(v_half.len() >= width / 2, "v_half row too short");
-  debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short");
+  debug_assert!(out.len() >= width * bpp, "out row too short");
 
   let coeffs = Coefficients::for_matrix(matrix);
   let (y_off, y_scale, c_scale) = range_params_n::<16, 16>(full_range);
@@ -1116,14 +1231,20 @@ pub(crate) fn yuv_420p16_to_rgb_u16_row(
     let b_chroma = q15_chroma64(coeffs.b_u(), u_d, coeffs.b_v(), v_d);
 
     let y0 = q15_scale64(y[x] as i32 - y_off, y_scale);
-    rgb_out[x * 3] = (y0 + r_chroma).clamp(0, out_max) as u16;
-    rgb_out[x * 3 + 1] = (y0 + g_chroma).clamp(0, out_max) as u16;
-    rgb_out[x * 3 + 2] = (y0 + b_chroma).clamp(0, out_max) as u16;
+    out[x * bpp] = (y0 + r_chroma).clamp(0, out_max) as u16;
+    out[x * bpp + 1] = (y0 + g_chroma).clamp(0, out_max) as u16;
+    out[x * bpp + 2] = (y0 + b_chroma).clamp(0, out_max) as u16;
+    if ALPHA {
+      out[x * bpp + 3] = 0xFFFF;
+    }
 
     let y1 = q15_scale64(y[x + 1] as i32 - y_off, y_scale);
-    rgb_out[(x + 1) * 3] = (y1 + r_chroma).clamp(0, out_max) as u16;
-    rgb_out[(x + 1) * 3 + 1] = (y1 + g_chroma).clamp(0, out_max) as u16;
-    rgb_out[(x + 1) * 3 + 2] = (y1 + b_chroma).clamp(0, out_max) as u16;
+    out[(x + 1) * bpp] = (y1 + r_chroma).clamp(0, out_max) as u16;
+    out[(x + 1) * bpp + 1] = (y1 + g_chroma).clamp(0, out_max) as u16;
+    out[(x + 1) * bpp + 2] = (y1 + b_chroma).clamp(0, out_max) as u16;
+    if ALPHA {
+      out[(x + 1) * bpp + 3] = 0xFFFF;
+    }
 
     x += 2;
   }
@@ -1219,6 +1340,8 @@ pub(crate) fn yuv_444p16_to_rgb_u16_row(
 /// - `width` must be even.
 /// - `y.len() >= width`, `uv_half.len() >= width`,
 ///   `rgb_out.len() >= 3 * width`.
+///
+/// Thin wrapper over [`p16_to_rgb_or_rgba_row`] with `ALPHA = false`.
 #[cfg_attr(not(tarpaulin), inline(always))]
 pub(crate) fn p16_to_rgb_row(
   y: &[u16],
@@ -1228,10 +1351,47 @@ pub(crate) fn p16_to_rgb_row(
   matrix: ColorMatrix,
   full_range: bool,
 ) {
+  p16_to_rgb_or_rgba_row::<false>(y, uv_half, rgb_out, width, matrix, full_range);
+}
+
+/// Converts one row of **P016** to **8-bit** packed **RGBA**. Same
+/// numerical contract as [`p16_to_rgb_row`] except for the per-pixel
+/// stride (4 vs 3) and the alpha byte (`0xFF`, opaque).
+///
+/// Thin wrapper over [`p16_to_rgb_or_rgba_row`] with `ALPHA = true`.
+//
+// Scalar prep for Ship 8 Tranche 5a: the public dispatcher
+// `row::p016_to_rgba_row` lands in the follow-up SIMD/dispatcher PR.
+// Until then this thin wrapper has no caller.
+#[allow(dead_code)]
+#[cfg_attr(not(tarpaulin), inline(always))]
+pub(crate) fn p16_to_rgba_row(
+  y: &[u16],
+  uv_half: &[u16],
+  rgba_out: &mut [u8],
+  width: usize,
+  matrix: ColorMatrix,
+  full_range: bool,
+) {
+  p16_to_rgb_or_rgba_row::<true>(y, uv_half, rgba_out, width, matrix, full_range);
+}
+
+/// Shared P016 → 8-bit RGB / RGBA kernel. `ALPHA = false` emits 3 bpp;
+/// `ALPHA = true` emits 4 bpp with constant `0xFF` alpha.
+#[cfg_attr(not(tarpaulin), inline(always))]
+pub(crate) fn p16_to_rgb_or_rgba_row<const ALPHA: bool>(
+  y: &[u16],
+  uv_half: &[u16],
+  out: &mut [u8],
+  width: usize,
+  matrix: ColorMatrix,
+  full_range: bool,
+) {
+  let bpp: usize = if ALPHA { 4 } else { 3 };
   debug_assert_eq!(width & 1, 0, "semi-planar 4:2:0 requires even width");
   debug_assert!(y.len() >= width, "y row too short");
   debug_assert!(uv_half.len() >= width, "uv row too short");
-  debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short");
+  debug_assert!(out.len() >= width * bpp, "out row too short");
 
   let coeffs = Coefficients::for_matrix(matrix);
   let (y_off, y_scale, c_scale) = range_params_n::<16, 8>(full_range);
@@ -1250,14 +1410,20 @@ pub(crate) fn p16_to_rgb_row(
     let b_chroma = q15_chroma(coeffs.b_u(), u_d, coeffs.b_v(), v_d);
 
     let y0 = q15_scale(y[x] as i32 - y_off, y_scale);
-    rgb_out[x * 3] = clamp_u8(y0 + r_chroma);
-    rgb_out[x * 3 + 1] = clamp_u8(y0 + g_chroma);
-    rgb_out[x * 3 + 2] = clamp_u8(y0 + b_chroma);
+    out[x * bpp] = clamp_u8(y0 + r_chroma);
+    out[x * bpp + 1] = clamp_u8(y0 + g_chroma);
+    out[x * bpp + 2] = clamp_u8(y0 + b_chroma);
+    if ALPHA {
+      out[x * bpp + 3] = 0xFF;
+    }
 
     let y1 = q15_scale(y[x + 1] as i32 - y_off, y_scale);
-    rgb_out[(x + 1) * 3] = clamp_u8(y1 + r_chroma);
-    rgb_out[(x + 1) * 3 + 1] = clamp_u8(y1 + g_chroma);
-    rgb_out[(x + 1) * 3 + 2] = clamp_u8(y1 + b_chroma);
+    out[(x + 1) * bpp] = clamp_u8(y1 + r_chroma);
+    out[(x + 1) * bpp + 1] = clamp_u8(y1 + g_chroma);
+    out[(x + 1) * bpp + 2] = clamp_u8(y1 + b_chroma);
+    if ALPHA {
+      out[(x + 1) * bpp + 3] = 0xFF;
+    }
 
     x += 2;
   }
@@ -1266,6 +1432,8 @@ pub(crate) fn p16_to_rgb_row(
 /// Converts one row of **P016** to **native-depth `u16`** packed
 /// RGB — full-range output in `[0, 65535]`. Chroma matrix multiply
 /// runs in i64 (same reasoning as [`yuv_420p16_to_rgb_u16_row`]).
+///
+/// Thin wrapper over [`p16_to_rgb_or_rgba_u16_row`] with `ALPHA = false`.
 #[cfg_attr(not(tarpaulin), inline(always))]
 pub(crate) fn p16_to_rgb_u16_row(
   y: &[u16],
@@ -1275,10 +1443,49 @@ pub(crate) fn p16_to_rgb_u16_row(
   matrix: ColorMatrix,
   full_range: bool,
 ) {
+  p16_to_rgb_or_rgba_u16_row::<false>(y, uv_half, rgb_out, width, matrix, full_range);
+}
+
+/// Converts one row of **P016** to **native-depth `u16`** packed
+/// **RGBA** — alpha element is `0xFFFF` (opaque maximum at 16-bit).
+///
+/// Thin wrapper over [`p16_to_rgb_or_rgba_u16_row`] with `ALPHA = true`.
+//
+// Scalar prep for Ship 8 Tranche 5b: the public dispatcher
+// `row::p016_to_rgba_u16_row` lands in the follow-up SIMD/dispatcher
+// PR. Until then this thin wrapper has no caller.
+#[allow(dead_code)]
+#[cfg_attr(not(tarpaulin), inline(always))]
+pub(crate) fn p16_to_rgba_u16_row(
+  y: &[u16],
+  uv_half: &[u16],
+  rgba_out: &mut [u16],
+  width: usize,
+  matrix: ColorMatrix,
+  full_range: bool,
+) {
+  p16_to_rgb_or_rgba_u16_row::<true>(y, uv_half, rgba_out, width, matrix, full_range);
+}
+
+/// Shared P016 → native-depth `u16` RGB / RGBA kernel. `ALPHA = false`
+/// emits 3 bpp; `ALPHA = true` emits 4 bpp with constant `0xFFFF`
+/// alpha.
+///
+/// Uses i64 chroma multiply (same rationale as [`yuv_420p16_to_rgb_or_rgba_u16_row`]).
+#[cfg_attr(not(tarpaulin), inline(always))]
+pub(crate) fn p16_to_rgb_or_rgba_u16_row<const ALPHA: bool>(
+  y: &[u16],
+  uv_half: &[u16],
+  out: &mut [u16],
+  width: usize,
+  matrix: ColorMatrix,
+  full_range: bool,
+) {
+  let bpp: usize = if ALPHA { 4 } else { 3 };
   debug_assert_eq!(width & 1, 0, "semi-planar 4:2:0 requires even width");
   debug_assert!(y.len() >= width, "y row too short");
   debug_assert!(uv_half.len() >= width, "uv row too short");
-  debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short");
+  debug_assert!(out.len() >= width * bpp, "out row too short");
 
   let coeffs = Coefficients::for_matrix(matrix);
   let (y_off, y_scale, c_scale) = range_params_n::<16, 16>(full_range);
@@ -1298,14 +1505,20 @@ pub(crate) fn p16_to_rgb_u16_row(
     let b_chroma = q15_chroma64(coeffs.b_u(), u_d, coeffs.b_v(), v_d);
 
     let y0 = q15_scale64(y[x] as i32 - y_off, y_scale);
-    rgb_out[x * 3] = (y0 + r_chroma).clamp(0, out_max) as u16;
-    rgb_out[x * 3 + 1] = (y0 + g_chroma).clamp(0, out_max) as u16;
-    rgb_out[x * 3 + 2] = (y0 + b_chroma).clamp(0, out_max) as u16;
+    out[x * bpp] = (y0 + r_chroma).clamp(0, out_max) as u16;
+    out[x * bpp + 1] = (y0 + g_chroma).clamp(0, out_max) as u16;
+    out[x * bpp + 2] = (y0 + b_chroma).clamp(0, out_max) as u16;
+    if ALPHA {
+      out[x * bpp + 3] = 0xFFFF;
+    }
 
     let y1 = q15_scale64(y[x + 1] as i32 - y_off, y_scale);
-    rgb_out[(x + 1) * 3] = (y1 + r_chroma).clamp(0, out_max) as u16;
-    rgb_out[(x + 1) * 3 + 1] = (y1 + g_chroma).clamp(0, out_max) as u16;
-    rgb_out[(x + 1) * 3 + 2] = (y1 + b_chroma).clamp(0, out_max) as u16;
+    out[(x + 1) * bpp] = (y1 + r_chroma).clamp(0, out_max) as u16;
+    out[(x + 1) * bpp + 1] = (y1 + g_chroma).clamp(0, out_max) as u16;
+    out[(x + 1) * bpp + 2] = (y1 + b_chroma).clamp(0, out_max) as u16;
+    if ALPHA {
+      out[(x + 1) * bpp + 3] = 0xFFFF;
+    }
 
     x += 2;
   }
@@ -1351,6 +1564,12 @@ pub(crate) fn p_n_to_rgb_row<const BITS: u32>(
 /// (4 vs 3) and the alpha byte (`0xFF`, opaque).
 ///
 /// Thin wrapper over [`p_n_to_rgb_or_rgba_row`] with `ALPHA = true`.
+//
+// Scalar prep for Ship 8 Tranche 5a: the public dispatcher
+// `row::p010_to_rgba_row` (and P012/P016 siblings) lands in the
+// follow-up SIMD/dispatcher PR. Until then this thin wrapper has no
+// caller.
+#[allow(dead_code)]
 #[cfg_attr(not(tarpaulin), inline(always))]
 pub(crate) fn p_n_to_rgba_row<const BITS: u32>(
   y: &[u16],
@@ -1477,6 +1696,12 @@ pub(crate) fn p_n_to_rgb_u16_row<const BITS: u32>(
 /// maximum at the input bit depth).
 ///
 /// Thin wrapper over [`p_n_to_rgb_or_rgba_u16_row`] with `ALPHA = true`.
+//
+// Scalar prep for Ship 8 Tranche 5b: the public dispatcher
+// `row::p010_to_rgba_u16_row` (and P012/P016 siblings) lands in the
+// follow-up SIMD/dispatcher PR. Until then this thin wrapper has no
+// caller.
+#[allow(dead_code)]
 #[cfg_attr(not(tarpaulin), inline(always))]
 pub(crate) fn p_n_to_rgba_u16_row<const BITS: u32>(
   y: &[u16],

From 809a0f922f6252c655c56e8143bf7007d965006b Mon Sep 17 00:00:00 2001
From: uqio <276879906+uqio@users.noreply.github.com>
Date: Sun, 26 Apr 2026 22:13:52 +1200
Subject: [PATCH 3/5] update

---
 src/row/scalar.rs | 41 ++++++++++++++++++++++-------------------
 1 file changed, 22 insertions(+), 19 deletions(-)

diff --git a/src/row/scalar.rs b/src/row/scalar.rs
index 3bdf51c..8b5da89 100644
--- a/src/row/scalar.rs
+++ b/src/row/scalar.rs
@@ -1565,10 +1565,11 @@ pub(crate) fn p_n_to_rgb_row<const BITS: u32>(
 ///
 /// Thin wrapper over [`p_n_to_rgb_or_rgba_row`] with `ALPHA = true`.
 //
-// Scalar prep for Ship 8 Tranche 5a: the public dispatcher
-// `row::p010_to_rgba_row` (and P012/P016 siblings) lands in the
+// Scalar prep for Ship 8 Tranche 5a: the public dispatchers
+// `row::p010_to_rgba_row` and `row::p012_to_rgba_row` land in the
 // follow-up SIMD/dispatcher PR. Until then this thin wrapper has no
-// caller.
+// caller. P016 has its own kernel family
+// ([`p16_to_rgb_or_rgba_row`]) — never routed here.
 #[allow(dead_code)]
 #[cfg_attr(not(tarpaulin), inline(always))]
 pub(crate) fn p_n_to_rgba_row<const BITS: u32>(
@@ -1602,12 +1603,13 @@ pub(crate) fn p_n_to_rgb_or_rgba_row<const BITS: u32, const ALPHA: bool>(
 ) {
   // High-bit-packed Pn kernels are only defined for BITS in {10, 12}.
   // Outside that set, `16 - BITS` could under/overflow and the Q15
-  // coefficient table has no corresponding entry. Caught here before
-  // the SIMD dispatcher hands control to unsafe code.
-  debug_assert!(
-    BITS == 10 || BITS == 12,
-    "p_n_to_rgb_or_rgba_row only supports BITS in {{10, 12}}"
-  );
+  // coefficient table has no corresponding entry. P016 (BITS=16) has
+  // its own dedicated kernel family with i64 chroma multiply — using
+  // this i32 path at BITS=16 would silently overflow on high chroma
+  // values. The compile-time assertion fails monomorphization for any
+  // BITS outside {10, 12}, eliminating that release-build corruption
+  // trap.
+  const { assert!(BITS == 10 || BITS == 12) };
   let bpp: usize = if ALPHA { 4 } else { 3 };
   debug_assert_eq!(width & 1, 0, "semi-planar high-bit requires even width");
   debug_assert!(y.len() >= width, "y row too short");
@@ -1697,10 +1699,11 @@ pub(crate) fn p_n_to_rgb_u16_row<const BITS: u32>(
 ///
 /// Thin wrapper over [`p_n_to_rgb_or_rgba_u16_row`] with `ALPHA = true`.
 //
-// Scalar prep for Ship 8 Tranche 5b: the public dispatcher
-// `row::p010_to_rgba_u16_row` (and P012/P016 siblings) lands in the
-// follow-up SIMD/dispatcher PR. Until then this thin wrapper has no
-// caller.
+// Scalar prep for Ship 8 Tranche 5b: the public dispatchers
+// `row::p010_to_rgba_u16_row` and `row::p012_to_rgba_u16_row` land in
+// the follow-up SIMD/dispatcher PR. Until then this thin wrapper has
+// no caller. P016 has its own u16 kernel family
+// ([`p16_to_rgb_or_rgba_u16_row`]) — never routed here.
 #[allow(dead_code)]
 #[cfg_attr(not(tarpaulin), inline(always))]
 pub(crate) fn p_n_to_rgba_u16_row<const BITS: u32>(
@@ -1732,12 +1735,12 @@ pub(crate) fn p_n_to_rgb_or_rgba_u16_row<const BITS: u32, const ALPHA: bool>(
   matrix: ColorMatrix,
   full_range: bool,
 ) {
-  // See `p_n_to_rgb_or_rgba_row` for the BITS range rationale. Duplicated
-  // here so either entry point catches misuse on its own.
-  debug_assert!(
-    BITS == 10 || BITS == 12,
-    "p_n_to_rgb_or_rgba_u16_row only supports BITS in {{10, 12}}"
-  );
+  // See `p_n_to_rgb_or_rgba_row` for the BITS range rationale. The
+  // P016 u16 path lives in [`p16_to_rgb_or_rgba_u16_row`] (i64 chroma
+  // multiply); this i32 path would overflow before clamp at 16-bit
+  // chroma. Compile-time assertion eliminates the release-build
+  // corruption trap.
+  const { assert!(BITS == 10 || BITS == 12) };
   let bpp: usize = if ALPHA { 4 } else { 3 };
   debug_assert_eq!(width & 1, 0, "semi-planar high-bit requires even width");
   debug_assert!(y.len() >= width, "y row too short");

From 5fc4ab53d004c22bfdbdd27e0b64ae136d83ebd1 Mon Sep 17 00:00:00 2001
From: uqio <276879906+uqio@users.noreply.github.com>
Date: Sun, 26 Apr 2026 22:26:23 +1200
Subject: [PATCH 4/5] update

---
 src/row/mod.rs    | 515 ++++++++++++++++++++++++++++++++++++++++++++++
 src/row/scalar.rs |   8 -
 2 files changed, 515 insertions(+), 8 deletions(-)

diff --git a/src/row/mod.rs b/src/row/mod.rs
index 5974753..15f8c10 100644
--- a/src/row/mod.rs
+++ b/src/row/mod.rs
@@ -2659,6 +2659,509 @@ pub fn p016_to_rgb_u16_row(
   scalar::p16_to_rgb_u16_row(y, uv_half, rgb_out, width, matrix, full_range);
 }
 
+// ---- High-bit 4:2:0 RGBA dispatchers (Ship 8 Tranche 5 prep) ----------
+//
+// Scalar prep: dispatchers route through the new RGBA scalar kernels
+// (`scalar::*_to_rgba*_row`). The `use_simd` parameter is held in the
+// signature so the follow-up SIMD/backend PRs (Ship 8 Tranche 5a/5b)
+// can fill in per-arch branches without breaking callers.
+
+/// Converts one row of **9-bit** YUV 4:2:0 to packed **8-bit**
+/// **RGBA** (`R, G, B, 0xFF`; alpha defaults to opaque since the
+/// source has no alpha plane). Same numerical contract as
+/// [`yuv420p9_to_rgb_row`] except for the per-pixel stride (4 vs 3)
+/// and the constant alpha byte. Delegates to
+/// `scalar::yuv_420p_n_to_rgba_row::<9>`; `use_simd` is unused until
+/// the SIMD per-arch routes land in the Ship 8 Tranche 5a PR.
+///
+/// # Panics
+/// If `width` is odd, `y.len() < width`, `u_half` / `v_half` hold fewer
+/// than `width / 2` elements, or `rgba_out.len() < rgba_row_bytes(width)`.
+#[cfg_attr(not(tarpaulin), inline(always))]
+#[allow(clippy::too_many_arguments)]
+pub fn yuv420p9_to_rgba_row(
+  y: &[u16],
+  u_half: &[u16],
+  v_half: &[u16],
+  rgba_out: &mut [u8],
+  width: usize,
+  matrix: ColorMatrix,
+  full_range: bool,
+  use_simd: bool,
+) {
+  assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width");
+  let rgba_min = rgba_row_bytes(width);
+  assert!(y.len() >= width, "y row too short");
+  assert!(u_half.len() >= width / 2, "u_half row too short");
+  assert!(v_half.len() >= width / 2, "v_half row too short");
+  assert!(rgba_out.len() >= rgba_min, "rgba_out row too short");
+
+  let _ = use_simd; // SIMD per-arch routes land in Ship 8 Tranche 5a.
+  scalar::yuv_420p_n_to_rgba_row::<9>(y, u_half, v_half, rgba_out, width, matrix, full_range);
+}
+
+/// Converts one row of **9-bit** YUV 4:2:0 to **native-depth `u16`**
+/// packed **RGBA** — output is low-bit-packed (`[0, (1 << 9) - 1]` in
+/// the low bits of each `u16`); alpha element is `(1 << 9) - 1`.
+/// Delegates to `scalar::yuv_420p_n_to_rgba_u16_row::<9>`; `use_simd`
+/// is unused until the Ship 8 Tranche 5b SIMD PR.
+///
+/// # Panics
+/// If `width` is odd, `y.len() < width`, `u_half` / `v_half` hold fewer
+/// than `width / 2` elements, or `rgba_out.len() < rgba_row_elems(width)`.
+#[cfg_attr(not(tarpaulin), inline(always))]
+#[allow(clippy::too_many_arguments)]
+pub fn yuv420p9_to_rgba_u16_row(
+  y: &[u16],
+  u_half: &[u16],
+  v_half: &[u16],
+  rgba_out: &mut [u16],
+  width: usize,
+  matrix: ColorMatrix,
+  full_range: bool,
+  use_simd: bool,
+) {
+  assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width");
+  let rgba_min = rgba_row_elems(width);
+  assert!(y.len() >= width, "y row too short");
+  assert!(u_half.len() >= width / 2, "u_half row too short");
+  assert!(v_half.len() >= width / 2, "v_half row too short");
+  assert!(rgba_out.len() >= rgba_min, "rgba_out row too short");
+
+  let _ = use_simd; // SIMD per-arch routes land in Ship 8 Tranche 5b.
+  scalar::yuv_420p_n_to_rgba_u16_row::<9>(y, u_half, v_half, rgba_out, width, matrix, full_range);
+}
+
+/// Converts one row of **10-bit** YUV 4:2:0 to packed **8-bit**
+/// **RGBA** (`R, G, B, 0xFF`; alpha defaults to opaque since the
+/// source has no alpha plane). Same numerical contract as
+/// [`yuv420p10_to_rgb_row`] except for the per-pixel stride (4 vs 3)
+/// and the constant alpha byte. Delegates to
+/// `scalar::yuv_420p_n_to_rgba_row::<10>`; `use_simd` is unused until
+/// the SIMD per-arch routes land in the Ship 8 Tranche 5a PR.
+///
+/// # Panics
+/// If `width` is odd, `y.len() < width`, `u_half` / `v_half` hold fewer
+/// than `width / 2` elements, or `rgba_out.len() < rgba_row_bytes(width)`.
+#[cfg_attr(not(tarpaulin), inline(always))]
+#[allow(clippy::too_many_arguments)]
+pub fn yuv420p10_to_rgba_row(
+  y: &[u16],
+  u_half: &[u16],
+  v_half: &[u16],
+  rgba_out: &mut [u8],
+  width: usize,
+  matrix: ColorMatrix,
+  full_range: bool,
+  use_simd: bool,
+) {
+  assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width");
+  let rgba_min = rgba_row_bytes(width);
+  assert!(y.len() >= width, "y row too short");
+  assert!(u_half.len() >= width / 2, "u_half row too short");
+  assert!(v_half.len() >= width / 2, "v_half row too short");
+  assert!(rgba_out.len() >= rgba_min, "rgba_out row too short");
+
+  let _ = use_simd; // SIMD per-arch routes land in Ship 8 Tranche 5a.
+  scalar::yuv_420p_n_to_rgba_row::<10>(y, u_half, v_half, rgba_out, width, matrix, full_range);
+}
+
+/// Converts one row of **10-bit** YUV 4:2:0 to **native-depth `u16`**
+/// packed **RGBA** — output is low-bit-packed (`[0, (1 << 10) - 1]` in
+/// the low bits of each `u16`); alpha element is `(1 << 10) - 1`.
+/// Delegates to `scalar::yuv_420p_n_to_rgba_u16_row::<10>`; `use_simd`
+/// is unused until the Ship 8 Tranche 5b SIMD PR.
+///
+/// # Panics
+/// If `width` is odd, `y.len() < width`, `u_half` / `v_half` hold fewer
+/// than `width / 2` elements, or `rgba_out.len() < rgba_row_elems(width)`.
+#[cfg_attr(not(tarpaulin), inline(always))]
+#[allow(clippy::too_many_arguments)]
+pub fn yuv420p10_to_rgba_u16_row(
+  y: &[u16],
+  u_half: &[u16],
+  v_half: &[u16],
+  rgba_out: &mut [u16],
+  width: usize,
+  matrix: ColorMatrix,
+  full_range: bool,
+  use_simd: bool,
+) {
+  assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width");
+  let rgba_min = rgba_row_elems(width);
+  assert!(y.len() >= width, "y row too short");
+  assert!(u_half.len() >= width / 2, "u_half row too short");
+  assert!(v_half.len() >= width / 2, "v_half row too short");
+  assert!(rgba_out.len() >= rgba_min, "rgba_out row too short");
+
+  let _ = use_simd; // SIMD per-arch routes land in Ship 8 Tranche 5b.
+  scalar::yuv_420p_n_to_rgba_u16_row::<10>(y, u_half, v_half, rgba_out, width, matrix, full_range);
+}
+
+/// Converts one row of **P010** (semi-planar 4:2:0, 10-bit,
+/// high-bit-packed) to packed **8-bit** **RGBA**; alpha is `0xFF`
+/// (opaque). Delegates to `scalar::p_n_to_rgba_row::<10>`; `use_simd`
+/// is unused until the Ship 8 Tranche 5a SIMD PR.
+///
+/// # Panics
+/// If `width` is odd, `y.len() < width`, `uv_half.len() < width`, or
+/// `rgba_out.len() < rgba_row_bytes(width)`.
+#[cfg_attr(not(tarpaulin), inline(always))]
+#[allow(clippy::too_many_arguments)]
+pub fn p010_to_rgba_row(
+  y: &[u16],
+  uv_half: &[u16],
+  rgba_out: &mut [u8],
+  width: usize,
+  matrix: ColorMatrix,
+  full_range: bool,
+  use_simd: bool,
+) {
+  assert_eq!(width & 1, 0, "semi-planar 4:2:0 requires even width");
+  let rgba_min = rgba_row_bytes(width);
+  assert!(y.len() >= width, "y row too short");
+  assert!(uv_half.len() >= width, "uv_half row too short");
+  assert!(rgba_out.len() >= rgba_min, "rgba_out row too short");
+
+  let _ = use_simd; // SIMD per-arch routes land in Ship 8 Tranche 5a.
+  scalar::p_n_to_rgba_row::<10>(y, uv_half, rgba_out, width, matrix, full_range);
+}
+
+/// Converts one row of **P010** (semi-planar 4:2:0, 10-bit,
+/// high-bit-packed) to **native-depth `u16`** packed **RGBA** — output
+/// is low-bit-packed; alpha element is `(1 << 10) - 1`. Delegates to
+/// `scalar::p_n_to_rgba_u16_row::<10>`; `use_simd` is unused for now.
+///
+/// # Panics
+/// If `width` is odd, `y.len() < width`, `uv_half.len() < width`, or
+/// `rgba_out.len() < rgba_row_elems(width)`.
+#[cfg_attr(not(tarpaulin), inline(always))]
+#[allow(clippy::too_many_arguments)]
+pub fn p010_to_rgba_u16_row(
+  y: &[u16],
+  uv_half: &[u16],
+  rgba_out: &mut [u16],
+  width: usize,
+  matrix: ColorMatrix,
+  full_range: bool,
+  use_simd: bool,
+) {
+  assert_eq!(width & 1, 0, "semi-planar 4:2:0 requires even width");
+  let rgba_min = rgba_row_elems(width);
+  assert!(y.len() >= width, "y row too short");
+  assert!(uv_half.len() >= width, "uv_half row too short");
+  assert!(rgba_out.len() >= rgba_min, "rgba_out row too short");
+
+  let _ = use_simd; // SIMD per-arch routes land in Ship 8 Tranche 5b.
+  scalar::p_n_to_rgba_u16_row::<10>(y, uv_half, rgba_out, width, matrix, full_range);
+}
+
+/// Converts one row of **12-bit** YUV 4:2:0 to packed **8-bit**
+/// **RGBA** (`R, G, B, 0xFF`; alpha defaults to opaque since the
+/// source has no alpha plane). Same numerical contract as
+/// [`yuv420p12_to_rgb_row`] except for the per-pixel stride (4 vs 3)
+/// and the constant alpha byte. Delegates to
+/// `scalar::yuv_420p_n_to_rgba_row::<12>`; `use_simd` is unused until
+/// the SIMD per-arch routes land in the Ship 8 Tranche 5a PR.
+///
+/// # Panics
+/// If `width` is odd, `y.len() < width`, `u_half` / `v_half` hold fewer
+/// than `width / 2` elements, or `rgba_out.len() < rgba_row_bytes(width)`.
+#[cfg_attr(not(tarpaulin), inline(always))]
+#[allow(clippy::too_many_arguments)]
+pub fn yuv420p12_to_rgba_row(
+  y: &[u16],
+  u_half: &[u16],
+  v_half: &[u16],
+  rgba_out: &mut [u8],
+  width: usize,
+  matrix: ColorMatrix,
+  full_range: bool,
+  use_simd: bool,
+) {
+  assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width");
+  let rgba_min = rgba_row_bytes(width);
+  assert!(y.len() >= width, "y row too short");
+  assert!(u_half.len() >= width / 2, "u_half row too short");
+  assert!(v_half.len() >= width / 2, "v_half row too short");
+  assert!(rgba_out.len() >= rgba_min, "rgba_out row too short");
+
+  let _ = use_simd; // SIMD per-arch routes land in Ship 8 Tranche 5a.
+  scalar::yuv_420p_n_to_rgba_row::<12>(y, u_half, v_half, rgba_out, width, matrix, full_range);
+}
+
+/// Converts one row of **12-bit** YUV 4:2:0 to **native-depth `u16`**
+/// packed **RGBA** — output is low-bit-packed (`[0, (1 << 12) - 1]` in
+/// the low bits of each `u16`); alpha element is `(1 << 12) - 1`.
+/// Delegates to `scalar::yuv_420p_n_to_rgba_u16_row::<12>`; `use_simd`
+/// is unused until the Ship 8 Tranche 5b SIMD PR.
+///
+/// # Panics
+/// If `width` is odd, `y.len() < width`, `u_half` / `v_half` hold fewer
+/// than `width / 2` elements, or `rgba_out.len() < rgba_row_elems(width)`.
+#[cfg_attr(not(tarpaulin), inline(always))]
+#[allow(clippy::too_many_arguments)]
+pub fn yuv420p12_to_rgba_u16_row(
+  y: &[u16],
+  u_half: &[u16],
+  v_half: &[u16],
+  rgba_out: &mut [u16],
+  width: usize,
+  matrix: ColorMatrix,
+  full_range: bool,
+  use_simd: bool,
+) {
+  assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width");
+  let rgba_min = rgba_row_elems(width);
+  assert!(y.len() >= width, "y row too short");
+  assert!(u_half.len() >= width / 2, "u_half row too short");
+  assert!(v_half.len() >= width / 2, "v_half row too short");
+  assert!(rgba_out.len() >= rgba_min, "rgba_out row too short");
+
+  let _ = use_simd; // SIMD per-arch routes land in Ship 8 Tranche 5b.
+  scalar::yuv_420p_n_to_rgba_u16_row::<12>(y, u_half, v_half, rgba_out, width, matrix, full_range);
+}
+
+/// Converts one row of **14-bit** YUV 4:2:0 to packed **8-bit**
+/// **RGBA** (`R, G, B, 0xFF`; alpha defaults to opaque since the
+/// source has no alpha plane). Same numerical contract as
+/// [`yuv420p14_to_rgb_row`] except for the per-pixel stride (4 vs 3)
+/// and the constant alpha byte. Delegates to
+/// `scalar::yuv_420p_n_to_rgba_row::<14>`; `use_simd` is unused until
+/// the SIMD per-arch routes land in the Ship 8 Tranche 5a PR.
+///
+/// # Panics
+/// If `width` is odd, `y.len() < width`, `u_half` / `v_half` hold fewer
+/// than `width / 2` elements, or `rgba_out.len() < rgba_row_bytes(width)`.
+#[cfg_attr(not(tarpaulin), inline(always))]
+#[allow(clippy::too_many_arguments)]
+pub fn yuv420p14_to_rgba_row(
+  y: &[u16],
+  u_half: &[u16],
+  v_half: &[u16],
+  rgba_out: &mut [u8],
+  width: usize,
+  matrix: ColorMatrix,
+  full_range: bool,
+  use_simd: bool,
+) {
+  assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width");
+  let rgba_min = rgba_row_bytes(width);
+  assert!(y.len() >= width, "y row too short");
+  assert!(u_half.len() >= width / 2, "u_half row too short");
+  assert!(v_half.len() >= width / 2, "v_half row too short");
+  assert!(rgba_out.len() >= rgba_min, "rgba_out row too short");
+
+  let _ = use_simd; // SIMD per-arch routes land in Ship 8 Tranche 5a.
+  scalar::yuv_420p_n_to_rgba_row::<14>(y, u_half, v_half, rgba_out, width, matrix, full_range);
+}
+
+/// Converts one row of **14-bit** YUV 4:2:0 to **native-depth `u16`**
+/// packed **RGBA** — output is low-bit-packed (`[0, (1 << 14) - 1]` in
+/// the low bits of each `u16`); alpha element is `(1 << 14) - 1`.
+/// Delegates to `scalar::yuv_420p_n_to_rgba_u16_row::<14>`; `use_simd`
+/// is unused until the Ship 8 Tranche 5b SIMD PR.
+///
+/// # Panics
+/// If `width` is odd, `y.len() < width`, `u_half` / `v_half` hold fewer
+/// than `width / 2` elements, or `rgba_out.len() < rgba_row_elems(width)`.
+#[cfg_attr(not(tarpaulin), inline(always))]
+#[allow(clippy::too_many_arguments)]
+pub fn yuv420p14_to_rgba_u16_row(
+  y: &[u16],
+  u_half: &[u16],
+  v_half: &[u16],
+  rgba_out: &mut [u16],
+  width: usize,
+  matrix: ColorMatrix,
+  full_range: bool,
+  use_simd: bool,
+) {
+  assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width");
+  let rgba_min = rgba_row_elems(width);
+  assert!(y.len() >= width, "y row too short");
+  assert!(u_half.len() >= width / 2, "u_half row too short");
+  assert!(v_half.len() >= width / 2, "v_half row too short");
+  assert!(rgba_out.len() >= rgba_min, "rgba_out row too short");
+
+  let _ = use_simd; // SIMD per-arch routes land in Ship 8 Tranche 5b.
+  scalar::yuv_420p_n_to_rgba_u16_row::<14>(y, u_half, v_half, rgba_out, width, matrix, full_range);
+}
+
+/// Converts one row of **P012** (semi-planar 4:2:0, 12-bit,
+/// high-bit-packed) to packed **8-bit** **RGBA**; alpha is `0xFF`
+/// (opaque). Delegates to `scalar::p_n_to_rgba_row::<12>`; `use_simd`
+/// is unused until the Ship 8 Tranche 5a SIMD PR.
+///
+/// # Panics
+/// If `width` is odd, `y.len() < width`, `uv_half.len() < width`, or
+/// `rgba_out.len() < rgba_row_bytes(width)`.
+#[cfg_attr(not(tarpaulin), inline(always))]
+#[allow(clippy::too_many_arguments)]
+pub fn p012_to_rgba_row(
+  y: &[u16],
+  uv_half: &[u16],
+  rgba_out: &mut [u8],
+  width: usize,
+  matrix: ColorMatrix,
+  full_range: bool,
+  use_simd: bool,
+) {
+  assert_eq!(width & 1, 0, "semi-planar 4:2:0 requires even width");
+  let rgba_min = rgba_row_bytes(width);
+  assert!(y.len() >= width, "y row too short");
+  assert!(uv_half.len() >= width, "uv_half row too short");
+  assert!(rgba_out.len() >= rgba_min, "rgba_out row too short");
+
+  let _ = use_simd; // SIMD per-arch routes land in Ship 8 Tranche 5a.
+  scalar::p_n_to_rgba_row::<12>(y, uv_half, rgba_out, width, matrix, full_range);
+}
+
+/// Converts one row of **P012** (semi-planar 4:2:0, 12-bit,
+/// high-bit-packed) to **native-depth `u16`** packed **RGBA** — output
+/// is low-bit-packed; alpha element is `(1 << 12) - 1`. Delegates to
+/// `scalar::p_n_to_rgba_u16_row::<12>`; `use_simd` is unused for now.
+///
+/// # Panics
+/// If `width` is odd, `y.len() < width`, `uv_half.len() < width`, or
+/// `rgba_out.len() < rgba_row_elems(width)`.
+#[cfg_attr(not(tarpaulin), inline(always))]
+#[allow(clippy::too_many_arguments)]
+pub fn p012_to_rgba_u16_row(
+  y: &[u16],
+  uv_half: &[u16],
+  rgba_out: &mut [u16],
+  width: usize,
+  matrix: ColorMatrix,
+  full_range: bool,
+  use_simd: bool,
+) {
+  assert_eq!(width & 1, 0, "semi-planar 4:2:0 requires even width");
+  let rgba_min = rgba_row_elems(width);
+  assert!(y.len() >= width, "y row too short");
+  assert!(uv_half.len() >= width, "uv_half row too short");
+  assert!(rgba_out.len() >= rgba_min, "rgba_out row too short");
+
+  let _ = use_simd; // SIMD per-arch routes land in Ship 8 Tranche 5b.
+  scalar::p_n_to_rgba_u16_row::<12>(y, uv_half, rgba_out, width, matrix, full_range);
+}
+
+/// Converts one row of **16-bit** YUV 4:2:0 to packed **8-bit**
+/// **RGBA** (`R, G, B, 0xFF`) via `scalar::yuv_420p16_to_rgba_row`
+/// (i32 chroma suffices for u8 output). `use_simd` is unused for now.
+///
+/// # Panics
+/// If `width` is odd, `y.len() < width`, `u_half` / `v_half` hold fewer
+/// than `width / 2` elements, or `rgba_out.len() < rgba_row_bytes(width)`.
+#[cfg_attr(not(tarpaulin), inline(always))]
+#[allow(clippy::too_many_arguments)]
+pub fn yuv420p16_to_rgba_row(
+  y: &[u16],
+  u_half: &[u16],
+  v_half: &[u16],
+  rgba_out: &mut [u8],
+  width: usize,
+  matrix: ColorMatrix,
+  full_range: bool,
+  use_simd: bool,
+) {
+  assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width");
+  let rgba_min = rgba_row_bytes(width);
+  assert!(y.len() >= width, "y row too short");
+  assert!(u_half.len() >= width / 2, "u_half row too short");
+  assert!(v_half.len() >= width / 2, "v_half row too short");
+  assert!(rgba_out.len() >= rgba_min, "rgba_out row too short");
+
+  let _ = use_simd; // SIMD per-arch routes land in Ship 8 Tranche 5a.
+  scalar::yuv_420p16_to_rgba_row(y, u_half, v_half, rgba_out, width, matrix, full_range);
+}
+
+/// Converts one row of **16-bit** YUV 4:2:0 to **native-depth `u16`**
+/// packed **RGBA** — full-range output `[0, 65535]`; alpha element
+/// is `0xFFFF`. Uses `scalar::yuv_420p16_to_rgba_u16_row` (i64 chroma
+/// multiply); `use_simd` is unused until the Tranche 5b SIMD PR.
+///
+/// # Panics
+/// If `width` is odd, `y.len() < width`, `u_half` / `v_half` hold fewer
+/// than `width / 2` elements, or `rgba_out.len() < rgba_row_elems(width)`.
+#[cfg_attr(not(tarpaulin), inline(always))]
+#[allow(clippy::too_many_arguments)]
+pub fn yuv420p16_to_rgba_u16_row(
+  y: &[u16],
+  u_half: &[u16],
+  v_half: &[u16],
+  rgba_out: &mut [u16],
+  width: usize,
+  matrix: ColorMatrix,
+  full_range: bool,
+  use_simd: bool,
+) {
+  assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width");
+  let rgba_min = rgba_row_elems(width);
+  assert!(y.len() >= width, "y row too short");
+  assert!(u_half.len() >= width / 2, "u_half row too short");
+  assert!(v_half.len() >= width / 2, "v_half row too short");
+  assert!(rgba_out.len() >= rgba_min, "rgba_out row too short");
+
+  let _ = use_simd; // SIMD per-arch routes land in Ship 8 Tranche 5b.
+  scalar::yuv_420p16_to_rgba_u16_row(y, u_half, v_half, rgba_out, width, matrix, full_range);
+}
+
+/// Converts one **P016** row (semi-planar 4:2:0, 16-bit) to 8-bit RGBA
+/// (alpha `0xFF`) via `scalar::p16_to_rgba_row`; `use_simd` is unused.
+///
+/// # Panics
+/// If `width` is odd, `y.len() < width`, `uv_half.len() < width`, or
+/// `rgba_out.len() < rgba_row_bytes(width)`.
+#[cfg_attr(not(tarpaulin), inline(always))]
+#[allow(clippy::too_many_arguments)]
+pub fn p016_to_rgba_row(
+  y: &[u16],
+  uv_half: &[u16],
+  rgba_out: &mut [u8],
+  width: usize,
+  matrix: ColorMatrix,
+  full_range: bool,
+  use_simd: bool,
+) {
+  assert_eq!(width & 1, 0, "semi-planar 4:2:0 requires even width");
+  let rgba_min = rgba_row_bytes(width);
+  assert!(y.len() >= width, "y row too short");
+  assert!(uv_half.len() >= width, "uv_half row too short");
+  assert!(rgba_out.len() >= rgba_min, "rgba_out row too short");
+
+  let _ = use_simd; // SIMD per-arch routes land in Ship 8 Tranche 5a.
+  scalar::p16_to_rgba_row(y, uv_half, rgba_out, width, matrix, full_range);
+}
+
+/// Converts one row of **P016** to **native-depth `u16`** packed
+/// **RGBA** (full-range `[0, 65535]`, alpha `0xFFFF`) through
+/// `scalar::p16_to_rgba_u16_row` (i64 chroma); `use_simd` is unused.
+///
+/// # Panics
+/// If `width` is odd, `y.len() < width`, `uv_half.len() < width`, or
+/// `rgba_out.len() < rgba_row_elems(width)`.
+#[cfg_attr(not(tarpaulin), inline(always))]
+#[allow(clippy::too_many_arguments)]
+pub fn p016_to_rgba_u16_row(
+  y: &[u16],
+  uv_half: &[u16],
+  rgba_out: &mut [u16],
+  width: usize,
+  matrix: ColorMatrix,
+  full_range: bool,
+  use_simd: bool,
+) {
+  assert_eq!(width & 1, 0, "semi-planar 4:2:0 requires even width");
+  let rgba_min = rgba_row_elems(width);
+  assert!(y.len() >= width, "y row too short");
+  assert!(uv_half.len() >= width, "uv_half row too short");
+  assert!(rgba_out.len() >= rgba_min, "rgba_out row too short");
+
+  let _ = use_simd; // SIMD per-arch routes land in Ship 8 Tranche 5b.
+  scalar::p16_to_rgba_u16_row(y, uv_half, rgba_out, width, matrix, full_range);
+}
+
 // ---- Pn semi-planar 4:4:4 (P410 / P412 / P416) → RGB --------------------
 //
 // Same shape as the 4:2:0 / 4:2:2 P-family kernels but with full-width
@@ -3205,6 +3708,18 @@ fn rgb_row_elems(width: usize) -> usize {
   }
 }
 
+/// Element count of one packed `u16`-RGBA row (`width × 4`), in `u16`
+/// elements rather than bytes; same math as [`rgba_row_bytes`]. Used
+/// to validate `&mut [u16]` RGBA output buffers in the dispatchers.
+/// Panics if `width × 4` overflows `usize`.
+#[cfg_attr(not(tarpaulin), inline(always))]
+fn rgba_row_elems(width: usize) -> usize {
+  match width.checked_mul(4) {
+    Some(n) => n,
+    None => panic!("width ({width}) × 4 overflows usize"),
+  }
+}
+
 /// Maximum permitted magnitude of any element of a fused color
 /// transform handed to a Bayer row dispatcher.
 ///
diff --git a/src/row/scalar.rs b/src/row/scalar.rs
index 8b5da89..4188d93 100644
--- a/src/row/scalar.rs
+++ b/src/row/scalar.rs
@@ -632,7 +632,6 @@ pub(crate) fn yuv_420p_n_to_rgb_row<const BITS: u32>(
 // `row::yuv420p10_to_rgba_row` (and its u16 sibling) lands in the
 // follow-up SIMD/dispatcher PR. Until then this thin wrapper has no
 // caller.
-#[allow(dead_code)]
 #[cfg_attr(not(tarpaulin), inline(always))]
 pub(crate) fn yuv_420p_n_to_rgba_row<const BITS: u32>(
   y: &[u16],
@@ -801,7 +800,6 @@ pub(crate) fn yuv_420p_n_to_rgb_u16_row<const BITS: u32>(
 // Scalar prep for Ship 8 Tranche 5b: the public dispatcher
 // `row::yuv420p10_to_rgba_u16_row` lands in the follow-up SIMD/dispatcher
 // PR. Until then this thin wrapper has no caller.
-#[allow(dead_code)]
 #[cfg_attr(not(tarpaulin), inline(always))]
 pub(crate) fn yuv_420p_n_to_rgba_u16_row<const BITS: u32>(
   y: &[u16],
@@ -1070,7 +1068,6 @@ pub(crate) fn yuv_420p16_to_rgb_row(
 // Scalar prep for Ship 8 Tranche 5a: the public dispatcher
 // `row::yuv420p16_to_rgba_row` lands in the follow-up SIMD/dispatcher
 // PR. Until then this thin wrapper has no caller.
-#[allow(dead_code)]
 #[cfg_attr(not(tarpaulin), inline(always))]
 pub(crate) fn yuv_420p16_to_rgba_row(
   y: &[u16],
@@ -1178,7 +1175,6 @@ pub(crate) fn yuv_420p16_to_rgb_u16_row(
 // Scalar prep for Ship 8 Tranche 5b: the public dispatcher
 // `row::yuv420p16_to_rgba_u16_row` lands in the follow-up SIMD/dispatcher
 // PR. Until then this thin wrapper has no caller.
-#[allow(dead_code)]
 #[cfg_attr(not(tarpaulin), inline(always))]
 pub(crate) fn yuv_420p16_to_rgba_u16_row(
   y: &[u16],
@@ -1363,7 +1359,6 @@ pub(crate) fn p16_to_rgb_row(
 // Scalar prep for Ship 8 Tranche 5a: the public dispatcher
 // `row::p016_to_rgba_row` lands in the follow-up SIMD/dispatcher PR.
 // Until then this thin wrapper has no caller.
-#[allow(dead_code)]
 #[cfg_attr(not(tarpaulin), inline(always))]
 pub(crate) fn p16_to_rgba_row(
   y: &[u16],
@@ -1454,7 +1449,6 @@ pub(crate) fn p16_to_rgb_u16_row(
 // Scalar prep for Ship 8 Tranche 5b: the public dispatcher
 // `row::p016_to_rgba_u16_row` lands in the follow-up SIMD/dispatcher
 // PR. Until then this thin wrapper has no caller.
-#[allow(dead_code)]
 #[cfg_attr(not(tarpaulin), inline(always))]
 pub(crate) fn p16_to_rgba_u16_row(
   y: &[u16],
@@ -1570,7 +1564,6 @@ pub(crate) fn p_n_to_rgb_row<const BITS: u32>(
 // follow-up SIMD/dispatcher PR. Until then this thin wrapper has no
 // caller. P016 has its own kernel family
 // ([`p16_to_rgb_or_rgba_row`]) — never routed here.
-#[allow(dead_code)]
 #[cfg_attr(not(tarpaulin), inline(always))]
 pub(crate) fn p_n_to_rgba_row<const BITS: u32>(
   y: &[u16],
@@ -1704,7 +1697,6 @@ pub(crate) fn p_n_to_rgb_u16_row<const BITS: u32>(
 // the follow-up SIMD/dispatcher PR. Until then this thin wrapper has
 // no caller. P016 has its own u16 kernel family
 // ([`p16_to_rgb_or_rgba_u16_row`]) — never routed here.
-#[allow(dead_code)]
 #[cfg_attr(not(tarpaulin), inline(always))]
 pub(crate) fn p_n_to_rgba_u16_row<const BITS: u32>(
   y: &[u16],

From ae2a47126aaa8c81d4ff5694bf46805e018d3d0e Mon Sep 17 00:00:00 2001
From: uqio <276879906+uqio@users.noreply.github.com>
Date: Sun, 26 Apr 2026 22:39:13 +1200
Subject: [PATCH 5/5] Update src/row/scalar.rs

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
---
 src/row/scalar.rs | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/src/row/scalar.rs b/src/row/scalar.rs
index 4188d93..5390a49 100644
--- a/src/row/scalar.rs
+++ b/src/row/scalar.rs
@@ -563,12 +563,20 @@ pub(crate) fn expand_rgb_u16_to_rgba_u16_row<const BITS: u32>(
   rgba_out: &mut [u16],
   width: usize,
 ) {
-  debug_assert!(rgb.len() >= width * 3, "rgb row too short");
-  debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short");
+  const {
+    assert!(BITS > 0 && BITS <= 16);
+  }
+
+  let rgb_len = width.checked_mul(3).expect("rgb row length overflow");
+  let rgba_len = width.checked_mul(4).expect("rgba row length overflow");
+
+  debug_assert!(rgb.len() >= rgb_len, "rgb row too short");
+  debug_assert!(rgba_out.len() >= rgba_len, "rgba_out row too short");
+
   let alpha_max: u16 = ((1u32 << BITS) - 1) as u16;
-  for (rgb_px, rgba_px) in rgb[..width * 3]
+  for (rgb_px, rgba_px) in rgb[..rgb_len]
     .chunks_exact(3)
-    .zip(rgba_out[..width * 4].chunks_exact_mut(4))
+    .zip(rgba_out[..rgba_len].chunks_exact_mut(4))
   {
     rgba_px[0] = rgb_px[0];
     rgba_px[1] = rgb_px[1];