From 539f8c832b1ea8c30e3baa152de7779f365344d9 Mon Sep 17 00:00:00 2001 From: beastoin Date: Tue, 31 Mar 2026 16:38:15 +0000 Subject: [PATCH 01/11] fix(desktop): add max batch buffer size to prevent HTTP 413 VADGateService accumulates unbounded audio during continuous speech, producing 3.2MB+ chunks that exceed backend body size limits. Add maxBatchBytes=1.5MB (~23.4s stereo) cap with auto-emit: when the buffer exceeds the cap during SPEECH or HANGOVER state, emit the current buffer and start fresh accumulation with correct timestamp advancement. Fixes #6195 Co-Authored-By: Claude Opus 4.6 --- desktop/Desktop/Sources/VADGateService.swift | 34 ++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/desktop/Desktop/Sources/VADGateService.swift b/desktop/Desktop/Sources/VADGateService.swift index 3151b1ceb98..31ca0a7349d 100644 --- a/desktop/Desktop/Sources/VADGateService.swift +++ b/desktop/Desktop/Sources/VADGateService.swift @@ -205,6 +205,9 @@ final class VADGateService { private let preRollMs: Double = 500 private let hangoverMs: Double = 4000 // Streaming mode: controls finalize timing private let batchHangoverMs: Double = 2000 // Batch mode: controls chunk boundary (user-visible latency) + /// Maximum batch buffer size before auto-emit (~23.4s of stereo 16kHz Int16 PCM). + /// Prevents HTTP 413 from backend/proxy body size limits. + static let maxBatchBytes = 1_500_000 private let keepaliveSec: Double = 20 private let vadWindowSamples = 512 private let sampleRate = 16000 @@ -644,6 +647,11 @@ final class VADGateService { case .speech: batchAudioBuffer.append(stereoData) + // Auto-emit if buffer exceeds max size (prevents HTTP 413) + if batchAudioBuffer.count >= VADGateService.maxBatchBytes { + return autoEmitBatchBuffer(nextChunkMs: chunkMs, nextChunkData: stereoData) + } + if !isSpeech { // SPEECH -> HANGOVER batchState = .hangover @@ -653,6 +661,12 @@ final class VADGateService { case .hangover: batchAudioBuffer.append(stereoData) + + // Auto-emit if buffer exceeds max size (prevents HTTP 413) + if batchAudioBuffer.count >= VADGateService.maxBatchBytes { + return autoEmitBatchBuffer(nextChunkMs: chunkMs, nextChunkData: stereoData) + } + let timeSinceSpeechMs = batchAudioCursorMs - batchLastSpeechMs if isSpeech { @@ -683,6 +697,26 @@ final class VADGateService { } } + /// Auto-emit the current batch buffer when it exceeds maxBatchBytes. + /// Stays in .speech state so the next audio continues accumulating into a fresh buffer. + /// Called under lock. + private func autoEmitBatchBuffer(nextChunkMs: Double, nextChunkData: Data) -> BatchGateOutput { + let bytesPerFrame = 4 + let completedBuffer = batchAudioBuffer + let startTime = batchSpeechStartWallTime + + // Advance start time for the next buffer: emitted duration in seconds + let emittedDurationSec = Double(completedBuffer.count / bytesPerFrame) / Double(sampleRate) + batchSpeechStartWallTime = startTime + emittedDurationSec + + // Start fresh accumulation (stay in current state — speech or hangover) + batchAudioBuffer = Data() + + log("VADGate [batch]: Auto-emit (max size) — \(completedBuffer.count) bytes (\(String(format: "%.1f", emittedDurationSec))s)") + + return BatchGateOutput(audioBuffer: completedBuffer, speechStartWallTime: startTime, isComplete: true) + } + /// Flush remaining batch audio buffer (call when recording stops). func flushBatchBuffer() -> BatchGateOutput? { lock.lock() From 99267e87258dec148588dd5d93029b422806ba02 Mon Sep 17 00:00:00 2001 From: beastoin Date: Tue, 31 Mar 2026 16:38:23 +0000 Subject: [PATCH 02/11] fix(desktop): add split-and-retry for large batch transcription payloads Add defense-in-depth for HTTP 413: batchTranscribeWithSplitting() proactively splits audio exceeding maxBatchPayloadBytes at midpoint with 1s overlap, transcribes each half, and merges word-level results per channel with timestamp offset and overlap deduplication. Also retries with splitting on 413 response. Add payloadTooLarge error case to distinguish 413 from other HTTP errors. Co-Authored-By: Claude Opus 4.6 --- .../Sources/TranscriptionService.swift | 152 ++++++++++++++++++ 1 file changed, 152 insertions(+) diff --git a/desktop/Desktop/Sources/TranscriptionService.swift b/desktop/Desktop/Sources/TranscriptionService.swift index 58300e1a8a9..30d653a3616 100644 --- a/desktop/Desktop/Sources/TranscriptionService.swift +++ b/desktop/Desktop/Sources/TranscriptionService.swift @@ -42,6 +42,7 @@ class TranscriptionService: NSObject, URLSessionWebSocketDelegate { case missingAPIKey case connectionFailed(Error) case invalidResponse + case payloadTooLarge(statusCode: Int, body: String) case webSocketError(String) var errorDescription: String? { @@ -52,6 +53,8 @@ class TranscriptionService: NSObject, URLSessionWebSocketDelegate { return "Connection failed: \(error.localizedDescription)" case .invalidResponse: return "Invalid response from DeepGram" + case .payloadTooLarge(let statusCode, _): + return "Payload too large (HTTP \(statusCode))" case .webSocketError(let message): return "WebSocket error: \(message)" } @@ -952,6 +955,9 @@ extension TranscriptionService { let statusCode = (response as? HTTPURLResponse)?.statusCode ?? -1 let body = String(data: data, encoding: .utf8) ?? "no body" logError("TranscriptionService: Batch full transcription failed with status \(statusCode): \(body)", error: nil) + if statusCode == 413 { + throw TranscriptionError.payloadTooLarge(statusCode: statusCode, body: body) + } throw TranscriptionError.invalidResponse } @@ -989,6 +995,152 @@ extension TranscriptionService { return segments } + + // MARK: - Batch Transcription with Splitting + + /// Maximum audio payload size for a single batch transcription request. + /// Matches VADGateService.maxBatchBytes. Audio larger than this is proactively split. + static let maxBatchPayloadBytes = VADGateService.maxBatchBytes + + /// Bytes per second for stereo 16kHz Int16 PCM audio. + static let stereoBytesPerSecond = 64_000 + + /// Transcribe audio with automatic splitting for large payloads. + /// Proactively splits audio exceeding maxBatchPayloadBytes, and retries with splitting on 413. + static func batchTranscribeWithSplitting( + audioData: Data, + language: String = "en", + vocabulary: [String] = [] + ) async throws -> [TranscriptSegment] { + // Proactive split if audio exceeds max payload + if audioData.count > maxBatchPayloadBytes { + log("TranscriptionService: Audio \(audioData.count) bytes exceeds \(maxBatchPayloadBytes) — splitting") + return try await splitAndTranscribe(audioData: audioData, language: language, vocabulary: vocabulary) + } + + // Try direct transcription, retry with split on 413 + do { + return try await batchTranscribeFull(audioData: audioData, language: language, vocabulary: vocabulary) + } catch TranscriptionError.payloadTooLarge { + log("TranscriptionService: Got 413, retrying with split") + return try await splitAndTranscribe(audioData: audioData, language: language, vocabulary: vocabulary) + } + } + + /// Split audio at midpoint with 1s overlap, transcribe each half, merge results. + /// Only one level of splitting — halves are sent directly via batchTranscribeFull. + static func splitAndTranscribe( + audioData: Data, + language: String, + vocabulary: [String] + ) async throws -> [TranscriptSegment] { + let overlapBytes = stereoBytesPerSecond // 1 second overlap + let bytesPerFrame = 4 // Stereo Int16: 2 channels * 2 bytes + + // Align midpoint to frame boundary + let rawMid = audioData.count / 2 + let mid = (rawMid / bytesPerFrame) * bytesPerFrame + + // First half: [0, mid + overlap/2) + let firstEnd = min(mid + overlapBytes / 2, audioData.count) + let alignedFirstEnd = (firstEnd / bytesPerFrame) * bytesPerFrame + let firstHalf = audioData.prefix(alignedFirstEnd) + + // Second half: [mid - overlap/2, end) + let secondStart = max(mid - overlapBytes / 2, 0) + let alignedSecondStart = (secondStart / bytesPerFrame) * bytesPerFrame + let secondHalf = audioData.suffix(from: alignedSecondStart) + + let splitStartSec = Double(alignedSecondStart) / Double(stereoBytesPerSecond) + + log("TranscriptionService: Split — first=\(firstHalf.count) bytes, second=\(secondHalf.count) bytes, offset=\(String(format: "%.1f", splitStartSec))s") + + // Transcribe both halves (sequentially to avoid doubling concurrent load) + let firstSegments = try await batchTranscribeFull( + audioData: Data(firstHalf), language: language, vocabulary: vocabulary + ) + let secondSegments = try await batchTranscribeFull( + audioData: Data(secondHalf), language: language, vocabulary: vocabulary + ) + + // Merge per channel: offset second-half timestamps, dedupe overlap + return mergeSegments(first: firstSegments, second: secondSegments, secondOffsetSec: splitStartSec) + } + + /// Merge segments from two halves per channel. + /// Second-half word timestamps are offset by secondOffsetSec. + /// Words in the overlap window are deduped by matching text and timestamp proximity. + static func mergeSegments( + first: [TranscriptSegment], + second: [TranscriptSegment], + secondOffsetSec: Double + ) -> [TranscriptSegment] { + // Group by channel + var firstByChannel: [Int: TranscriptSegment] = [:] + for seg in first { firstByChannel[seg.channelIndex] = seg } + + var secondByChannel: [Int: TranscriptSegment] = [:] + for seg in second { secondByChannel[seg.channelIndex] = seg } + + let allChannels = Set(firstByChannel.keys).union(secondByChannel.keys) + var merged: [TranscriptSegment] = [] + + for ch in allChannels.sorted() { + let firstWords = firstByChannel[ch]?.words ?? [] + let secondWords = (secondByChannel[ch]?.words ?? []).map { word in + TranscriptSegment.Word( + word: word.word, + start: word.start + secondOffsetSec, + end: word.end + secondOffsetSec, + confidence: word.confidence, + speaker: word.speaker, + punctuatedWord: word.punctuatedWord + ) + } + + // Dedupe: find where first-half ends and second-half begins + let deduped = dedupeOverlapWords(first: firstWords, second: secondWords) + + let combinedText = deduped.map { $0.punctuatedWord }.joined(separator: " ") + let avgConfidence = deduped.isEmpty ? 0.0 : deduped.reduce(0.0) { $0 + $1.confidence } / Double(deduped.count) + + merged.append(TranscriptSegment( + text: combinedText, + isFinal: true, + speechFinal: true, + confidence: avgConfidence, + words: deduped, + channelIndex: ch + )) + } + + return merged + } + + /// Deduplicate words in the overlap window between first and second halves. + /// Words from the second half that match a first-half word (same text, within 0.5s) are dropped. + static func dedupeOverlapWords( + first: [TranscriptSegment.Word], + second: [TranscriptSegment.Word] + ) -> [TranscriptSegment.Word] { + guard let lastFirstWord = first.last else { return second } + let overlapEnd = lastFirstWord.end + + var result = first + for word in second { + // Skip words that fall within the overlap window and match a first-half word + if word.start <= overlapEnd + 0.5 { + let isDuplicate = first.contains { firstWord in + firstWord.word.lowercased() == word.word.lowercased() && + abs(firstWord.start - word.start) < 0.5 + } + if isDuplicate { continue } + } + result.append(word) + } + + return result + } } // MARK: - Reconnect Audio Ring Buffer From 340140ed52a88569f105835573aaa905063365e2 Mon Sep 17 00:00:00 2001 From: beastoin Date: Tue, 31 Mar 2026 16:38:28 +0000 Subject: [PATCH 03/11] fix(desktop): use splitting-aware batch transcription in AppState Switch batchTranscribeChunk from batchTranscribeFull to batchTranscribeWithSplitting, which handles proactive splitting and 413 retry automatically. Co-Authored-By: Claude Opus 4.6 --- desktop/Desktop/Sources/AppState.swift | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/desktop/Desktop/Sources/AppState.swift b/desktop/Desktop/Sources/AppState.swift index 84e0dcc5c66..d9020888445 100644 --- a/desktop/Desktop/Sources/AppState.swift +++ b/desktop/Desktop/Sources/AppState.swift @@ -2329,7 +2329,7 @@ class AppState: ObservableObject { let vocabulary = AssistantSettings.shared.effectiveVocabulary do { - let segments = try await TranscriptionService.batchTranscribeFull( + let segments = try await TranscriptionService.batchTranscribeWithSplitting( audioData: audioBuffer, language: effectiveLanguage, vocabulary: vocabulary From 77ad5bac447bffc72aafcd202efe1504a7135f0f Mon Sep 17 00:00:00 2001 From: beastoin Date: Tue, 31 Mar 2026 16:38:33 +0000 Subject: [PATCH 04/11] test(desktop): add batch split and merge tests 7 tests covering word deduplication, timestamp offsetting, multi-channel merge, maxBatchBytes consistency, and frame alignment. Co-Authored-By: Claude Opus 4.6 --- .../Tests/TranscriptionServiceTests.swift | 111 ++++++++++++++++++ 1 file changed, 111 insertions(+) diff --git a/desktop/Desktop/Tests/TranscriptionServiceTests.swift b/desktop/Desktop/Tests/TranscriptionServiceTests.swift index ae5e8c57f8d..288341dd825 100644 --- a/desktop/Desktop/Tests/TranscriptionServiceTests.swift +++ b/desktop/Desktop/Tests/TranscriptionServiceTests.swift @@ -235,6 +235,117 @@ final class URLConstructionTests: XCTestCase { } } +// MARK: - Batch Transcription Splitting Tests + +final class BatchSplitTests: XCTestCase { + + func testDedupeOverlapWordsRemovesDuplicates() { + let first = [ + TranscriptionService.TranscriptSegment.Word(word: "hello", start: 0.0, end: 0.5, confidence: 0.9, speaker: 0, punctuatedWord: "Hello"), + TranscriptionService.TranscriptSegment.Word(word: "world", start: 0.5, end: 1.0, confidence: 0.9, speaker: 0, punctuatedWord: "world"), + ] + let second = [ + // Duplicate of "world" — within 0.5s of first-half version + TranscriptionService.TranscriptSegment.Word(word: "world", start: 0.6, end: 1.1, confidence: 0.8, speaker: 0, punctuatedWord: "world"), + TranscriptionService.TranscriptSegment.Word(word: "foo", start: 1.5, end: 2.0, confidence: 0.9, speaker: 0, punctuatedWord: "foo"), + ] + + let result = TranscriptionService.dedupeOverlapWords(first: first, second: second) + XCTAssertEqual(result.count, 3) + XCTAssertEqual(result[0].word, "hello") + XCTAssertEqual(result[1].word, "world") + XCTAssertEqual(result[2].word, "foo") + } + + func testDedupeOverlapWordsKeepsNonOverlapping() { + let first = [ + TranscriptionService.TranscriptSegment.Word(word: "hello", start: 0.0, end: 0.5, confidence: 0.9, speaker: 0, punctuatedWord: "Hello"), + ] + let second = [ + TranscriptionService.TranscriptSegment.Word(word: "world", start: 5.0, end: 5.5, confidence: 0.9, speaker: 0, punctuatedWord: "world"), + ] + + let result = TranscriptionService.dedupeOverlapWords(first: first, second: second) + XCTAssertEqual(result.count, 2) + XCTAssertEqual(result[0].word, "hello") + XCTAssertEqual(result[1].word, "world") + } + + func testDedupeOverlapWordsEmptyFirst() { + let second = [ + TranscriptionService.TranscriptSegment.Word(word: "hello", start: 0.0, end: 0.5, confidence: 0.9, speaker: 0, punctuatedWord: "Hello"), + ] + + let result = TranscriptionService.dedupeOverlapWords(first: [], second: second) + XCTAssertEqual(result.count, 1) + XCTAssertEqual(result[0].word, "hello") + } + + func testMergeSegmentsOffsetsSecondHalf() { + let first = [ + TranscriptionService.TranscriptSegment( + text: "hello", isFinal: true, speechFinal: true, confidence: 0.9, + words: [.init(word: "hello", start: 0.0, end: 0.5, confidence: 0.9, speaker: 0, punctuatedWord: "hello")], + channelIndex: 0 + ), + ] + let second = [ + TranscriptionService.TranscriptSegment( + text: "world", isFinal: true, speechFinal: true, confidence: 0.9, + words: [.init(word: "world", start: 0.0, end: 0.5, confidence: 0.9, speaker: 0, punctuatedWord: "world")], + channelIndex: 0 + ), + ] + + let merged = TranscriptionService.mergeSegments(first: first, second: second, secondOffsetSec: 10.0) + XCTAssertEqual(merged.count, 1) + XCTAssertEqual(merged[0].channelIndex, 0) + XCTAssertEqual(merged[0].words.count, 2) + XCTAssertEqual(merged[0].words[0].word, "hello") + XCTAssertEqual(merged[0].words[0].start, 0.0, accuracy: 0.001) + XCTAssertEqual(merged[0].words[1].word, "world") + XCTAssertEqual(merged[0].words[1].start, 10.0, accuracy: 0.001) + } + + func testMergeSegmentsMultiChannel() { + let first = [ + TranscriptionService.TranscriptSegment( + text: "mic", isFinal: true, speechFinal: true, confidence: 0.9, + words: [.init(word: "mic", start: 0.0, end: 0.5, confidence: 0.9, speaker: 0, punctuatedWord: "mic")], + channelIndex: 0 + ), + ] + let second = [ + TranscriptionService.TranscriptSegment( + text: "sys", isFinal: true, speechFinal: true, confidence: 0.9, + words: [.init(word: "sys", start: 0.0, end: 0.5, confidence: 0.9, speaker: 1, punctuatedWord: "sys")], + channelIndex: 1 + ), + ] + + let merged = TranscriptionService.mergeSegments(first: first, second: second, secondOffsetSec: 5.0) + XCTAssertEqual(merged.count, 2) + let ch0 = merged.first { $0.channelIndex == 0 } + let ch1 = merged.first { $0.channelIndex == 1 } + XCTAssertEqual(ch0?.words.count, 1) + XCTAssertEqual(ch1?.words.count, 1) + XCTAssertEqual(ch1?.words[0].start ?? 0, 5.0, accuracy: 0.001) + } + + func testMaxBatchBytesConsistent() { + XCTAssertEqual(TranscriptionService.maxBatchPayloadBytes, VADGateService.maxBatchBytes) + } + + func testSplitPointIsFrameAligned() { + // Stereo Int16: 4 bytes per frame + let audioSize = 100_001 // Not frame-aligned + let mid = audioSize / 2 + let aligned = (mid / 4) * 4 + XCTAssertEqual(aligned % 4, 0) + XCTAssertTrue(aligned <= mid) + } +} + final class ReconnectDelayTests: XCTestCase { func testExponentialGrowth() { From 1b32ceed088b242ba2adbfe7b1cc296c4770df90 Mon Sep 17 00:00:00 2001 From: beastoin Date: Tue, 31 Mar 2026 16:38:38 +0000 Subject: [PATCH 05/11] fix(desktop): update OnboardingFlowTests for hasReorderedTrustStep param Co-Authored-By: Claude Opus 4.6 --- desktop/Desktop/Tests/OnboardingFlowTests.swift | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/desktop/Desktop/Tests/OnboardingFlowTests.swift b/desktop/Desktop/Tests/OnboardingFlowTests.swift index 7678281fd20..7864784cb0b 100644 --- a/desktop/Desktop/Tests/OnboardingFlowTests.swift +++ b/desktop/Desktop/Tests/OnboardingFlowTests.swift @@ -23,7 +23,8 @@ final class OnboardingFlowTests: XCTestCase { hasMergedVoiceInputStep: false, hasRemovedNotificationStep: true, hasInsertedFloatingBarShortcutStep: true, - hasMigratedPagedIntro: true + hasMigratedPagedIntro: true, + hasReorderedTrustStep: true ) XCTAssertEqual(migrated, 3) @@ -37,7 +38,8 @@ final class OnboardingFlowTests: XCTestCase { hasMergedVoiceInputStep: true, hasRemovedNotificationStep: true, hasInsertedFloatingBarShortcutStep: true, - hasMigratedPagedIntro: true + hasMigratedPagedIntro: true, + hasReorderedTrustStep: true ) XCTAssertEqual(migrated, OnboardingFlow.lastStepIndex) From 733c86251a0190e0d63de464ac320e29e7f1a5c0 Mon Sep 17 00:00:00 2001 From: beastoin Date: Tue, 31 Mar 2026 16:38:42 +0000 Subject: [PATCH 06/11] chore(desktop): add changelog entry for batch transcription fix Co-Authored-By: Claude Opus 4.6 --- desktop/CHANGELOG.json | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/desktop/CHANGELOG.json b/desktop/CHANGELOG.json index bbf07da14f8..b453cab639a 100644 --- a/desktop/CHANGELOG.json +++ b/desktop/CHANGELOG.json @@ -1,6 +1,7 @@ { "unreleased": [ - "Fixed WebSocket transcription disconnects: proper handshake detection, audio buffering during reconnection, unlimited retry with backoff, and thread-safe connection state" + "Fixed WebSocket transcription disconnects: proper handshake detection, audio buffering during reconnection, unlimited retry with backoff, and thread-safe connection state", + "Fixed batch transcription failing on long speech (50s+) by splitting large audio chunks automatically" ], "releases": [ { From fc1d04f1785faac372859614d713478a7401d4c6 Mon Sep 17 00:00:00 2001 From: beastoin Date: Tue, 31 Mar 2026 16:47:24 +0000 Subject: [PATCH 07/11] fix(desktop): use recursive splitting for split halves exceeding limit MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Split halves with overlap can still exceed maxBatchPayloadBytes (e.g., 3.2MB → two 1.63MB halves). Use batchTranscribeWithSplitting recursively instead of batchTranscribeFull directly, so oversized halves get split again. Co-Authored-By: Claude Opus 4.6 --- desktop/Desktop/Sources/TranscriptionService.swift | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/desktop/Desktop/Sources/TranscriptionService.swift b/desktop/Desktop/Sources/TranscriptionService.swift index 30d653a3616..44d6d36468b 100644 --- a/desktop/Desktop/Sources/TranscriptionService.swift +++ b/desktop/Desktop/Sources/TranscriptionService.swift @@ -1055,11 +1055,11 @@ extension TranscriptionService { log("TranscriptionService: Split — first=\(firstHalf.count) bytes, second=\(secondHalf.count) bytes, offset=\(String(format: "%.1f", splitStartSec))s") - // Transcribe both halves (sequentially to avoid doubling concurrent load) - let firstSegments = try await batchTranscribeFull( + // Transcribe both halves (recursively split if still too large) + let firstSegments = try await batchTranscribeWithSplitting( audioData: Data(firstHalf), language: language, vocabulary: vocabulary ) - let secondSegments = try await batchTranscribeFull( + let secondSegments = try await batchTranscribeWithSplitting( audioData: Data(secondHalf), language: language, vocabulary: vocabulary ) From f23640dbf22e304a21691ef80dee22330d4c7484 Mon Sep 17 00:00:00 2001 From: beastoin Date: Tue, 31 Mar 2026 16:47:30 +0000 Subject: [PATCH 08/11] fix(desktop): transition to .speech after auto-emit in hangover state autoEmitBatchBuffer left batchState unchanged, so auto-emit during hangover would leave an empty buffer in hangover state, potentially emitting a silence-only follow-up chunk. Always transition to .speech after auto-emit to continue proper accumulation. Co-Authored-By: Claude Opus 4.6 --- desktop/Desktop/Sources/VADGateService.swift | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/desktop/Desktop/Sources/VADGateService.swift b/desktop/Desktop/Sources/VADGateService.swift index 31ca0a7349d..2ad58ca558b 100644 --- a/desktop/Desktop/Sources/VADGateService.swift +++ b/desktop/Desktop/Sources/VADGateService.swift @@ -698,7 +698,7 @@ final class VADGateService { } /// Auto-emit the current batch buffer when it exceeds maxBatchBytes. - /// Stays in .speech state so the next audio continues accumulating into a fresh buffer. + /// Transitions to .speech state so the next audio continues accumulating into a fresh buffer. /// Called under lock. private func autoEmitBatchBuffer(nextChunkMs: Double, nextChunkData: Data) -> BatchGateOutput { let bytesPerFrame = 4 @@ -709,7 +709,9 @@ final class VADGateService { let emittedDurationSec = Double(completedBuffer.count / bytesPerFrame) / Double(sampleRate) batchSpeechStartWallTime = startTime + emittedDurationSec - // Start fresh accumulation (stay in current state — speech or hangover) + // Always transition to .speech for continued accumulation. + // If we were in .hangover, staying there would emit a silence-only follow-up chunk. + batchState = .speech batchAudioBuffer = Data() log("VADGate [batch]: Auto-emit (max size) — \(completedBuffer.count) bytes (\(String(format: "%.1f", emittedDurationSec))s)") From ba2b908b4037267436f5987fb02c2b2ceb815e78 Mon Sep 17 00:00:00 2001 From: beastoin Date: Tue, 31 Mar 2026 16:52:52 +0000 Subject: [PATCH 09/11] fix(desktop): reset batchLastSpeechMs after auto-emit to prevent silence-only chunk MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After auto-emit, batchLastSpeechMs still pointed to the old buffer's last speech time. The next silent chunk would immediately trigger hangover→silence transition (timeSinceSpeechMs > 2000) and emit an empty/silence-only buffer. Reset batchLastSpeechMs to batchAudioCursorMs so the hangover timer starts fresh after auto-emit. Co-Authored-By: Claude Opus 4.6 --- desktop/Desktop/Sources/VADGateService.swift | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/desktop/Desktop/Sources/VADGateService.swift b/desktop/Desktop/Sources/VADGateService.swift index 2ad58ca558b..964f0f37ec8 100644 --- a/desktop/Desktop/Sources/VADGateService.swift +++ b/desktop/Desktop/Sources/VADGateService.swift @@ -712,6 +712,10 @@ final class VADGateService { // Always transition to .speech for continued accumulation. // If we were in .hangover, staying there would emit a silence-only follow-up chunk. batchState = .speech + // Reset lastSpeechMs to current cursor so the hangover timer starts fresh. + // Without this, the next silent chunk would immediately trigger hangover→silence + // and emit an empty/silence-only buffer. + batchLastSpeechMs = batchAudioCursorMs batchAudioBuffer = Data() log("VADGate [batch]: Auto-emit (max size) — \(completedBuffer.count) bytes (\(String(format: "%.1f", emittedDurationSec))s)") From 2bad1363a56d73e6c03c065b20d7268c3f55e70f Mon Sep 17 00:00:00 2001 From: beastoin Date: Tue, 31 Mar 2026 17:00:35 +0000 Subject: [PATCH 10/11] feat(desktop): add VAD gate test accessors for auto-emit verification Add testAutoEmit() method and test property accessors to VADGateService for testing the auto-emit state machine path without requiring ONNX model loading. Co-Authored-By: Claude Opus 4.6 --- desktop/Desktop/Sources/VADGateService.swift | 26 ++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/desktop/Desktop/Sources/VADGateService.swift b/desktop/Desktop/Sources/VADGateService.swift index 964f0f37ec8..6fb15e87a76 100644 --- a/desktop/Desktop/Sources/VADGateService.swift +++ b/desktop/Desktop/Sources/VADGateService.swift @@ -723,6 +723,32 @@ final class VADGateService { return BatchGateOutput(audioBuffer: completedBuffer, speechStartWallTime: startTime, isComplete: true) } + // MARK: - Test Accessors (internal, accessible via @testable import) + + /// Directly invoke auto-emit for testing. Sets up state, calls autoEmitBatchBuffer, returns result. + func testAutoEmit( + batchBuffer: Data, + startState: GateState, + speechStartWallTime: Double, + audioCursorMs: Double, + lastSpeechMs: Double + ) -> (output: BatchGateOutput, resultState: GateState, resultLastSpeechMs: Double, resultStartWallTime: Double) { + lock.lock() + defer { lock.unlock() } + batchAudioBuffer = batchBuffer + batchState = startState + batchSpeechStartWallTime = speechStartWallTime + batchAudioCursorMs = audioCursorMs + batchLastSpeechMs = lastSpeechMs + + let output = autoEmitBatchBuffer(nextChunkMs: 100, nextChunkData: Data()) + return (output, batchState, batchLastSpeechMs, batchSpeechStartWallTime) + } + + /// Read batch state for assertions. + var testBatchState: GateState { batchState } + var testBatchBufferCount: Int { batchAudioBuffer.count } + /// Flush remaining batch audio buffer (call when recording stops). func flushBatchBuffer() -> BatchGateOutput? { lock.lock() From d63840304b35420d8ab1638d77a54cb4796d40d2 Mon Sep 17 00:00:00 2001 From: beastoin Date: Tue, 31 Mar 2026 17:00:41 +0000 Subject: [PATCH 11/11] test(desktop): add VAD auto-emit state machine tests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 4 tests verifying: speech→speech transition, hangover→speech transition (prevents silence-only follow-up), batchLastSpeechMs reset, and start wall time advancement. Co-Authored-By: Claude Opus 4.6 --- .../Tests/TranscriptionServiceTests.swift | 82 +++++++++++++++++++ 1 file changed, 82 insertions(+) diff --git a/desktop/Desktop/Tests/TranscriptionServiceTests.swift b/desktop/Desktop/Tests/TranscriptionServiceTests.swift index 288341dd825..bac514f0ac0 100644 --- a/desktop/Desktop/Tests/TranscriptionServiceTests.swift +++ b/desktop/Desktop/Tests/TranscriptionServiceTests.swift @@ -235,6 +235,88 @@ final class URLConstructionTests: XCTestCase { } } +// MARK: - VAD Gate Auto-Emit Tests + +final class VADGateAutoEmitTests: XCTestCase { + + func testAutoEmitFromSpeechTransitionsToSpeech() { + let gate = VADGateService() + let buffer = Data(repeating: 0xAA, count: 1_500_000) + let result = gate.testAutoEmit( + batchBuffer: buffer, + startState: .speech, + speechStartWallTime: 100.0, + audioCursorMs: 23400, + lastSpeechMs: 23400 + ) + + // Should emit the buffer + XCTAssertEqual(result.output.audioBuffer?.count, 1_500_000) + XCTAssertTrue(result.output.isComplete) + XCTAssertEqual(result.output.speechStartWallTime, 100.0, accuracy: 0.001) + + // Should stay in .speech + XCTAssertEqual(result.resultState, .speech) + + // Buffer should be cleared + XCTAssertEqual(gate.testBatchBufferCount, 0) + } + + func testAutoEmitFromHangoverTransitionsToSpeech() { + let gate = VADGateService() + let buffer = Data(repeating: 0xBB, count: 1_500_000) + let result = gate.testAutoEmit( + batchBuffer: buffer, + startState: .hangover, + speechStartWallTime: 50.0, + audioCursorMs: 25000, + lastSpeechMs: 21000 + ) + + // Should emit the buffer + XCTAssertTrue(result.output.isComplete) + XCTAssertEqual(result.output.audioBuffer?.count, 1_500_000) + + // Should transition to .speech (not stay in .hangover) + XCTAssertEqual(result.resultState, .speech) + } + + func testAutoEmitResetsBatchLastSpeechMs() { + let gate = VADGateService() + let buffer = Data(repeating: 0xCC, count: 1_500_000) + let result = gate.testAutoEmit( + batchBuffer: buffer, + startState: .hangover, + speechStartWallTime: 50.0, + audioCursorMs: 25000, + lastSpeechMs: 21000 // Old speech time from previous buffer + ) + + // batchLastSpeechMs should be reset to batchAudioCursorMs + XCTAssertEqual(result.resultLastSpeechMs, 25000, accuracy: 0.001) + } + + func testAutoEmitAdvancesStartWallTime() { + let gate = VADGateService() + // 640000 bytes = 10 seconds of stereo 16kHz Int16 audio + let buffer = Data(repeating: 0xDD, count: 640_000) + let result = gate.testAutoEmit( + batchBuffer: buffer, + startState: .speech, + speechStartWallTime: 100.0, + audioCursorMs: 10000, + lastSpeechMs: 10000 + ) + + // emittedDuration = 640000 / 4 / 16000 = 10.0s + // New start wall time should be 100.0 + 10.0 = 110.0 + XCTAssertEqual(result.resultStartWallTime, 110.0, accuracy: 0.001) + + // Emitted output should have old start time + XCTAssertEqual(result.output.speechStartWallTime, 100.0, accuracy: 0.001) + } +} + // MARK: - Batch Transcription Splitting Tests final class BatchSplitTests: XCTestCase {