From f8c48ce4cc5be79f5a7df1fdceff781db363f3b5 Mon Sep 17 00:00:00 2001 From: Soulter <905617992@qq.com> Date: Mon, 27 Apr 2026 12:07:39 +0800 Subject: [PATCH] fix: handle Tencent QQ Silk/AMR voice format in ensure_wav QQ platform sends voice messages in Silk format (or special AMR variants) that ffmpeg cannot decode, causing STT to fail. - ensure_wav(): detect Silk format and convert via tencent_silk_to_wav() - ensure_wav(): detect AMR, try ffmpeg first, fallback to convert_to_pcm_wav() - _get_audio_magic_type(): broaden Silk detection to handle \x02 prefix and SILK bytes anywhere in the first 16 bytes of the header --- astrbot/core/utils/media_utils.py | 51 +++++++++++++++++++++++++++++-- 1 file changed, 49 insertions(+), 2 deletions(-) diff --git a/astrbot/core/utils/media_utils.py b/astrbot/core/utils/media_utils.py index 03d7912cb6..59a6f15e6c 100644 --- a/astrbot/core/utils/media_utils.py +++ b/astrbot/core/utils/media_utils.py @@ -295,14 +295,57 @@ async def ensure_wav(audio_path: str, output_path: str | None = None) -> str: """Ensure the audio path points to wav format by extension/guess and convert when needed. If the file appears to already be wav, return it directly to avoid extra conversion. + Handles Tencent QQ special formats (silk / amr) that ffmpeg cannot decode. """ if not audio_path: return audio_path - if _get_audio_magic_type(audio_path) == "wav": + audio_type = _get_audio_magic_type(audio_path) + + if audio_type == "wav": return audio_path + if audio_type in ("silk",): + # Tencent Silk format (commonly used by QQ). ffmpeg cannot decode it. + from astrbot.core.utils.tencent_record_helper import tencent_silk_to_wav + + if not output_path: + from pathlib import Path + from uuid import uuid4 + + from astrbot.core.utils.astrbot_path import get_astrbot_temp_path + + temp_dir = Path(get_astrbot_temp_path()) + temp_dir.mkdir(parents=True, exist_ok=True) + output_path = str(temp_dir / f"media_audio_{uuid4().hex}.wav") + + logger.info(f"Detected Silk audio format, converting to wav: {audio_path}") + return await tencent_silk_to_wav(audio_path, output_path) + + if audio_type in ("amr",): + # AMR from Tencent platforms may also be a variant that ffmpeg misdetects. + # Try ffmpeg first as it handles standard AMR correctly. + try: + return await convert_audio_to_wav(audio_path, output_path) + except Exception as e: + logger.warning( + f"ffmpeg failed to convert amr file, trying pyffmpeg fallback: {e}" + ) + from astrbot.core.utils.tencent_record_helper import convert_to_pcm_wav + + if not output_path: + from pathlib import Path + from uuid import uuid4 + + from astrbot.core.utils.astrbot_path import get_astrbot_temp_path + + temp_dir = Path(get_astrbot_temp_path()) + temp_dir.mkdir(parents=True, exist_ok=True) + output_path = str(temp_dir / f"media_audio_{uuid4().hex}.wav") + + return await convert_to_pcm_wav(audio_path, output_path) + return await convert_audio_to_wav(audio_path, output_path) @@ -341,7 +384,11 @@ def _get_audio_magic_type(audio_path: str) -> str: if header[:4] == b"ftyp" and b"mp4" in header[:8]: return "mp4" - if header[:8] == b"#!SILK_V3": + if ( + header[:8] == b"#!SILK_V3" + or header[1:9] == b"#!SILK_V3" + or b"SILK" in header[:16] + ): return "silk" return ""