Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 13 additions & 11 deletions crates/recording/src/sources/audio_mixer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -114,10 +114,7 @@ impl AudioMixerBuilder {
let mut amix = filter_graph.add(
&ffmpeg::filter::find("amix").expect("Failed to find amix filter"),
"amix",
&format!(
"inputs={}:duration=first:dropout_transition=0",
abuffers.len()
),
&format!("inputs={}:duration=longest", abuffers.len()),
)?;

let aformat_args = format!(
Expand Down Expand Up @@ -388,16 +385,21 @@ impl AudioMixer {
let elapsed = Duration::from_secs_f64(self.samples_out as f64 / output_rate);
let timestamp = start.instant() + start_timestamp.duration_since(start) + elapsed;

self.samples_out += filtered.samples();
let frame_samples = filtered.samples();
let mut frame = AudioFrame::new(filtered, Timestamp::Instant(timestamp));

if self
.output
.try_send(AudioFrame::new(filtered, Timestamp::Instant(timestamp)))
.is_err()
{
return Err(());
loop {
match self.output.try_send(frame) {
Ok(()) => break,
Err(err) if err.is_full() => {
frame = err.into_inner();
std::thread::sleep(Duration::from_millis(1));
}
Err(_) => return Err(()),
}
}

self.samples_out += frame_samples;
filtered = ffmpeg::frame::Audio::empty();
}

Expand Down
185 changes: 181 additions & 4 deletions crates/recording/src/sources/microphone.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,11 @@ use crate::{
};
use anyhow::anyhow;
use cap_media_info::AudioInfo;
use cpal::SampleFormat;
use futures::{SinkExt, channel::mpsc};
use std::sync::Arc;
use std::{borrow::Cow, sync::Arc};

const MICROPHONE_TARGET_CHANNELS: u16 = 1;

pub struct Microphone {
info: AudioInfo,
Expand All @@ -25,7 +28,10 @@ impl AudioSource for Microphone {
Self: Sized,
{
async move {
let audio_info = feed_lock.audio_info();
let source_info = feed_lock.audio_info();
let audio_info = source_info.with_max_channels(MICROPHONE_TARGET_CHANNELS);
let source_channels = source_info.channels;
let target_channels = audio_info.channels;
let (tx, rx) = flume::bounded(8);

feed_lock
Expand All @@ -35,17 +41,24 @@ impl AudioSource for Microphone {

tokio::spawn(async move {
while let Ok(frame) = rx.recv_async().await {
let packed = maybe_downmix_channels(
&frame.data,
frame.format,
source_channels,
target_channels,
);

let _ = audio_tx
.send(AudioFrame::new(
audio_info.wrap_frame_with_max_channels(&frame.data, 2),
audio_info.wrap_frame(packed.as_ref()),
frame.timestamp,
))
.await;
}
});

Ok(Self {
info: audio_info.with_max_channels(2),
info: audio_info,
_lock: feed_lock,
})
}
Expand All @@ -55,3 +68,167 @@ impl AudioSource for Microphone {
self.info
}
}

/// Reduces interleaved PCM `data` from `source_channels` to `target_channels`
/// when a reduction is both needed and supported.
///
/// The input is handed back borrowed and unchanged when:
/// - either channel count is zero,
/// - `target_channels` is not smaller than `source_channels`,
/// - `target_channels` is anything other than 1 (only a mono downmix exists), or
/// - `downmix_to_mono` returns `None`.
///
/// A freshly allocated buffer is returned only when the mono downmix succeeds.
///
/// NOTE(review): in the last two fallback cases the returned bytes still have
/// the *source* channel layout even though a narrower target was requested —
/// confirm that callers tolerate this.
fn maybe_downmix_channels<'a>(
    data: &'a [u8],
    format: SampleFormat,
    source_channels: usize,
    target_channels: usize,
) -> Cow<'a, [u8]> {
    let reduction_requested =
        source_channels != 0 && target_channels != 0 && target_channels < source_channels;

    // Mono is the only reduction implemented; everything else passes through.
    if !reduction_requested || target_channels != 1 {
        return Cow::Borrowed(data);
    }

    match downmix_to_mono(data, format, source_channels) {
        Some(mono) => Cow::Owned(mono),
        None => Cow::Borrowed(data),
    }
}
Comment on lines +82 to +91
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🔴 Critical

🧩 Analysis chain

Potential data/metadata mismatch when downmix fails.

If downmix_to_mono returns None (line 83), the code falls back to the original multi-channel data (line 86). However, the AudioInfo at line 53 indicates mono channels. This mismatch could cause encoder errors or corrupted audio downstream.

Consider either:

  1. Logging the failure and using a zeroed mono frame, or
  2. Propagating the error to halt the stream

Run the following to check how AudioInfo mismatches are handled downstream:


🏁 Script executed:

#!/bin/bash
# Search for how wrapped frames are processed in the encoder
rg -nP --type=rust -C3 'AudioFrame.*wrap_frame|fn.*wrap_frame'

Length of output: 1273



---

🏁 Script executed:

```shell
#!/bin/bash
# Get full implementation of wrap_frame_with_max_channels
sed -n '137,190p' crates/media-info/src/lib.rs
```

Length of output: 2253


🏁 Script executed:

#!/bin/bash
# Check how maybe_downmix_channels is called and if AudioInfo is updated
rg -nP --type=rust -C5 'maybe_downmix_channels|downmix_to_mono'

Length of output: 5057


🏁 Script executed:

#!/bin/bash
# Search for AudioInfo modification after downmix attempt
rg -nP --type=rust -C3 'AudioInfo.*target_channels|channels.*=' crates/recording/src/

Length of output: 2191



---

🏁 Script executed:

```shell
#!/bin/bash
# Check how frames are actually processed after wrap_frame
rg -nP --type=rust -C3 'wrap_frame.*packed|packed.*wrap_frame' crates/recording/src/sources/microphone.rs
```

Length of output: 277


🏁 Script executed:

#!/bin/bash
# Check what with_max_channels does
rg -nP --type=rust -B2 -A8 'fn with_max_channels'

Length of output: 580


Fix critical audio data/metadata mismatch when downmix fails.

When downmix_to_mono returns None (line 83), the code returns the original multi-channel data unchanged. However, audio_info at line 32 already has channels set to the target (mono via with_max_channels). This causes wrap_frame at line 53 to interpret multi-channel data with mono metadata:

  • wrap_frame_with_max_channels line 147 calculates packed_sample_size = sample_size * self.channels (1)
  • But the actual data has stereo stride (sample_size * 2)
  • This misalignment corrupts frame count and chunk reading, causing silent audio data corruption

Required fix (lines 82–91):

Either propagate the downmix failure to halt the stream, or create a valid mono frame from the source data (e.g., downsampled or silence). Do not return misaligned data.

🤖 Prompt for AI Agents
In crates/recording/src/sources/microphone.rs around lines 82 to 91, when
downmix_to_mono(data, format, source_channels) returns None the code currently
returns the original multi‑channel data while audio_info (line 32) already
reports mono, causing a data/metadata mismatch and corrupted frames; fix by not
returning misaligned data — either propagate the failure (return an Err or None
so the stream terminates upstream) or synthesize a valid mono buffer (for
example generate a mono buffer by averaging/combining the source channels into a
newly owned Vec<u8> or, if combining is impossible, fill a zeroed mono buffer of
the correct length and sample size) and return Cow::Owned of that buffer so
audio_info channels and the data layout always match.


/// Collapses interleaved multi-channel PCM bytes into a single channel by
/// averaging the samples of every frame.
///
/// Returns `None` when the sample format has no known size, when the frame
/// size overflows or is zero, when `data` is not a whole number of frames, or
/// when any frame fails to average. On success the result holds one sample per
/// input frame, in the same sample format.
fn downmix_to_mono(data: &[u8], format: SampleFormat, source_channels: usize) -> Option<Vec<u8>> {
    let bytes_per_sample = sample_format_size(format)?;
    let bytes_per_frame = bytes_per_sample.checked_mul(source_channels)?;

    // A zero-sized frame or a buffer that ends mid-frame cannot be downmixed.
    if bytes_per_frame == 0 || !data.len().is_multiple_of(bytes_per_frame) {
        return None;
    }

    let mut mono = vec![0u8; data.len() / bytes_per_frame * bytes_per_sample];

    // Walk input frames and output sample slots in lockstep.
    let frames = data.chunks_exact(bytes_per_frame);
    let slots = mono.chunks_exact_mut(bytes_per_sample);
    for (frame, slot) in frames.zip(slots) {
        let averaged = average_frame_sample(format, frame, bytes_per_sample, source_channels)?;
        write_sample_from_f64(format, averaged, slot);
    }

    Some(mono)
}

/// Width in bytes of one sample of `format`, or `None` for any format this
/// module does not handle.
fn sample_format_size(format: SampleFormat) -> Option<usize> {
    match format {
        SampleFormat::U8 => Some(1),
        SampleFormat::I16 => Some(2),
        SampleFormat::I32 | SampleFormat::F32 => Some(4),
        SampleFormat::I64 | SampleFormat::F64 => Some(8),
        _ => None,
    }
}

/// Arithmetic mean of the first `channels` samples stored back to back in
/// `frame`, each `sample_size` bytes wide.
///
/// Returns `None` as soon as one sample cannot be decoded. Panics if `frame`
/// is shorter than `channels * sample_size` bytes, because samples are taken
/// by direct slicing.
fn average_frame_sample(
    format: SampleFormat,
    frame: &[u8],
    sample_size: usize,
    channels: usize,
) -> Option<f64> {
    // Accumulate in channel order so the floating-point sum is deterministic.
    let total = (0..channels).try_fold(0.0_f64, |acc, channel| {
        let offset = channel * sample_size;
        sample_to_f64(format, &frame[offset..offset + sample_size]).map(|sample| acc + sample)
    })?;

    Some(total / channels as f64)
}

fn sample_to_f64(format: SampleFormat, bytes: &[u8]) -> Option<f64> {
match format {
SampleFormat::U8 => bytes.first().copied().map(|v| v as f64),
SampleFormat::I16 => {
let mut buf = [0u8; 2];
buf.copy_from_slice(bytes);
Some(i16::from_ne_bytes(buf) as f64)
}
SampleFormat::I32 => {
let mut buf = [0u8; 4];
buf.copy_from_slice(bytes);
Some(i32::from_ne_bytes(buf) as f64)
}
SampleFormat::I64 => {
let mut buf = [0u8; 8];
buf.copy_from_slice(bytes);
Some(i64::from_ne_bytes(buf) as f64)
}
SampleFormat::F32 => {
let mut buf = [0u8; 4];
buf.copy_from_slice(bytes);
Some(f32::from_ne_bytes(buf) as f64)
}
SampleFormat::F64 => {
let mut buf = [0u8; 8];
buf.copy_from_slice(bytes);
Some(f64::from_ne_bytes(buf))
}
_ => None,
}
}

/// Encodes `value` as one native-endian sample of `format` into `out`.
///
/// Integer formats round to the nearest whole number and clamp to the range of
/// the target type before casting. `F32` narrows with a plain cast and `F64`
/// is written as-is. Unsupported formats leave `out` untouched.
///
/// Panics if `out` is empty for `U8`, or if its length differs from the sample
/// width for every other supported format.
fn write_sample_from_f64(format: SampleFormat, value: f64, out: &mut [u8]) {
    // Shared round-then-clamp step used by all integer formats.
    let quantize = |lowest: f64, highest: f64| value.round().clamp(lowest, highest);

    match format {
        SampleFormat::U8 => out[0] = quantize(u8::MIN as f64, u8::MAX as f64) as u8,
        SampleFormat::I16 => {
            let encoded = quantize(i16::MIN as f64, i16::MAX as f64) as i16;
            out.copy_from_slice(&encoded.to_ne_bytes());
        }
        SampleFormat::I32 => {
            let encoded = quantize(i32::MIN as f64, i32::MAX as f64) as i32;
            out.copy_from_slice(&encoded.to_ne_bytes());
        }
        SampleFormat::I64 => {
            let encoded = quantize(i64::MIN as f64, i64::MAX as f64) as i64;
            out.copy_from_slice(&encoded.to_ne_bytes());
        }
        SampleFormat::F32 => out.copy_from_slice(&(value as f32).to_ne_bytes()),
        SampleFormat::F64 => out.copy_from_slice(&value.to_ne_bytes()),
        _ => {}
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Two stereo f32 frames must collapse to two mono samples, each the mean
    /// of its left/right pair.
    #[test]
    fn downmixes_stereo_f32_to_mono() {
        let frames = [(0.5f32, -0.25f32), (1.0f32, 1.0f32)];

        // Interleave as L, R, L, R in native byte order.
        let data: Vec<u8> = frames
            .iter()
            .flat_map(|(left, right)| [left.to_ne_bytes(), right.to_ne_bytes()])
            .flatten()
            .collect();

        let owned = maybe_downmix_channels(&data, SampleFormat::F32, 2, 1).into_owned();
        assert_eq!(owned.len(), frames.len() * std::mem::size_of::<f32>());

        let decoded: Vec<f32> = owned
            .chunks_exact(std::mem::size_of::<f32>())
            .map(|chunk| f32::from_ne_bytes(chunk.try_into().unwrap()))
            .collect();

        assert!((decoded[0] - 0.125).abs() < f32::EPSILON);
        assert!((decoded[1] - 1.0).abs() < f32::EPSILON);
    }

    /// When source and target are both mono, the input must be returned
    /// borrowed rather than copied.
    #[test]
    fn leaves_mono_buffers_untouched() {
        let data = 0.75f32.to_ne_bytes().to_vec();

        let result = maybe_downmix_channels(&data, SampleFormat::F32, 1, 1);

        let is_borrowed = matches!(result, Cow::Borrowed(_));
        assert!(is_borrowed);
    }
}
Loading