From 539f8c832b1ea8c30e3baa152de7779f365344d9 Mon Sep 17 00:00:00 2001
From: beastoin <ngocthinhdp@gmail.com>
Date: Tue, 31 Mar 2026 16:38:15 +0000
Subject: [PATCH 01/11] fix(desktop): add max batch buffer size to prevent HTTP
 413

VADGateService accumulates unbounded audio during continuous speech,
producing 3.2MB+ chunks that exceed backend body size limits. Add
maxBatchBytes=1.5MB (~23.4s stereo) cap with auto-emit: when the
buffer exceeds the cap during SPEECH or HANGOVER state, emit the
current buffer and start fresh accumulation with correct timestamp
advancement.

Fixes #6195

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 desktop/Desktop/Sources/VADGateService.swift | 34 ++++++++++++++++++++
 1 file changed, 34 insertions(+)

diff --git a/desktop/Desktop/Sources/VADGateService.swift b/desktop/Desktop/Sources/VADGateService.swift
index 3151b1ceb98..31ca0a7349d 100644
--- a/desktop/Desktop/Sources/VADGateService.swift
+++ b/desktop/Desktop/Sources/VADGateService.swift
@@ -205,6 +205,9 @@ final class VADGateService {
     private let preRollMs: Double = 500
     private let hangoverMs: Double = 4000       // Streaming mode: controls finalize timing
     private let batchHangoverMs: Double = 2000  // Batch mode: controls chunk boundary (user-visible latency)
+    /// Maximum batch buffer size before auto-emit (~23.4s of stereo 16kHz Int16 PCM).
+    /// Prevents HTTP 413 from backend/proxy body size limits.
+    static let maxBatchBytes = 1_500_000
     private let keepaliveSec: Double = 20
     private let vadWindowSamples = 512
     private let sampleRate = 16000
@@ -644,6 +647,11 @@ final class VADGateService {
         case .speech:
             batchAudioBuffer.append(stereoData)
 
+            // Auto-emit if buffer exceeds max size (prevents HTTP 413)
+            if batchAudioBuffer.count >= VADGateService.maxBatchBytes {
+                return autoEmitBatchBuffer(nextChunkMs: chunkMs, nextChunkData: stereoData)
+            }
+
             if !isSpeech {
                 // SPEECH -> HANGOVER
                 batchState = .hangover
@@ -653,6 +661,12 @@ final class VADGateService {
 
         case .hangover:
             batchAudioBuffer.append(stereoData)
+
+            // Auto-emit if buffer exceeds max size (prevents HTTP 413)
+            if batchAudioBuffer.count >= VADGateService.maxBatchBytes {
+                return autoEmitBatchBuffer(nextChunkMs: chunkMs, nextChunkData: stereoData)
+            }
+
             let timeSinceSpeechMs = batchAudioCursorMs - batchLastSpeechMs
 
             if isSpeech {
@@ -683,6 +697,26 @@ final class VADGateService {
         }
     }
 
+    /// Auto-emit the current batch buffer when it exceeds maxBatchBytes.
+    /// Stays in .speech state so the next audio continues accumulating into a fresh buffer.
+    /// Called under lock.
+    private func autoEmitBatchBuffer(nextChunkMs: Double, nextChunkData: Data) -> BatchGateOutput {
+        let bytesPerFrame = 4
+        let completedBuffer = batchAudioBuffer
+        let startTime = batchSpeechStartWallTime
+
+        // Advance start time for the next buffer: emitted duration in seconds
+        let emittedDurationSec = Double(completedBuffer.count / bytesPerFrame) / Double(sampleRate)
+        batchSpeechStartWallTime = startTime + emittedDurationSec
+
+        // Start fresh accumulation (stay in current state — speech or hangover)
+        batchAudioBuffer = Data()
+
+        log("VADGate [batch]: Auto-emit (max size) — \(completedBuffer.count) bytes (\(String(format: "%.1f", emittedDurationSec))s)")
+
+        return BatchGateOutput(audioBuffer: completedBuffer, speechStartWallTime: startTime, isComplete: true)
+    }
+
     /// Flush remaining batch audio buffer (call when recording stops).
     func flushBatchBuffer() -> BatchGateOutput? {
         lock.lock()

From 99267e87258dec148588dd5d93029b422806ba02 Mon Sep 17 00:00:00 2001
From: beastoin <ngocthinhdp@gmail.com>
Date: Tue, 31 Mar 2026 16:38:23 +0000
Subject: [PATCH 02/11] fix(desktop): add split-and-retry for large batch
 transcription payloads

Add defense-in-depth for HTTP 413: batchTranscribeWithSplitting()
proactively splits audio exceeding maxBatchPayloadBytes at midpoint
with 1s overlap, transcribes each half, and merges word-level results
per channel with timestamp offset and overlap deduplication. Also
retries with splitting on 413 response. Add payloadTooLarge error
case to distinguish 413 from other HTTP errors.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 .../Sources/TranscriptionService.swift        | 152 ++++++++++++++++++
 1 file changed, 152 insertions(+)

diff --git a/desktop/Desktop/Sources/TranscriptionService.swift b/desktop/Desktop/Sources/TranscriptionService.swift
index 58300e1a8a9..30d653a3616 100644
--- a/desktop/Desktop/Sources/TranscriptionService.swift
+++ b/desktop/Desktop/Sources/TranscriptionService.swift
@@ -42,6 +42,7 @@ class TranscriptionService: NSObject, URLSessionWebSocketDelegate {
         case missingAPIKey
         case connectionFailed(Error)
         case invalidResponse
+        case payloadTooLarge(statusCode: Int, body: String)
         case webSocketError(String)
 
         var errorDescription: String? {
@@ -52,6 +53,8 @@ class TranscriptionService: NSObject, URLSessionWebSocketDelegate {
                 return "Connection failed: \(error.localizedDescription)"
             case .invalidResponse:
                 return "Invalid response from DeepGram"
+            case .payloadTooLarge(let statusCode, _):
+                return "Payload too large (HTTP \(statusCode))"
             case .webSocketError(let message):
                 return "WebSocket error: \(message)"
             }
@@ -952,6 +955,9 @@ extension TranscriptionService {
             let statusCode = (response as? HTTPURLResponse)?.statusCode ?? -1
             let body = String(data: data, encoding: .utf8) ?? "no body"
             logError("TranscriptionService: Batch full transcription failed with status \(statusCode): \(body)", error: nil)
+            if statusCode == 413 {
+                throw TranscriptionError.payloadTooLarge(statusCode: statusCode, body: body)
+            }
             throw TranscriptionError.invalidResponse
         }
 
@@ -989,6 +995,152 @@ extension TranscriptionService {
 
         return segments
     }
+
+    // MARK: - Batch Transcription with Splitting
+
+    /// Maximum audio payload size for a single batch transcription request.
+    /// Matches VADGateService.maxBatchBytes. Audio larger than this is proactively split.
+    static let maxBatchPayloadBytes = VADGateService.maxBatchBytes
+
+    /// Bytes per second for stereo 16kHz Int16 PCM audio.
+    static let stereoBytesPerSecond = 64_000
+
+    /// Transcribes `audioData`, splitting it when too large: proactively above `maxBatchPayloadBytes`, reactively on HTTP 413.
+    /// Rethrows errors from `batchTranscribeFull` / `splitAndTranscribe`; a 413 on a buffer of 2 s or less is rethrown, not retried.
+    static func batchTranscribeWithSplitting(
+        audioData: Data,
+        language: String = "en",
+        vocabulary: [String] = []
+    ) async throws -> [TranscriptSegment] {
+        // Proactive split if audio exceeds max payload
+        if audioData.count > maxBatchPayloadBytes {
+            log("TranscriptionService: Audio \(audioData.count) bytes exceeds \(maxBatchPayloadBytes) — splitting")
+            return try await splitAndTranscribe(audioData: audioData, language: language, vocabulary: vocabulary)
+        }
+        // Try direct transcription; on 413 retry with a split only when the buffer is longer than 2 s. At or below that the
+        // 1 s overlap keeps each half close to the full size, so a persistent 413 could otherwise be retried without end.
+        do {
+            return try await batchTranscribeFull(audioData: audioData, language: language, vocabulary: vocabulary)
+        } catch TranscriptionError.payloadTooLarge where audioData.count > 2 * stereoBytesPerSecond {
+            log("TranscriptionService: Got 413, retrying with split")
+            return try await splitAndTranscribe(audioData: audioData, language: language, vocabulary: vocabulary)
+        }
+    }
+
+    /// Split audio at midpoint with 1s overlap, transcribe each half, merge results.
+    /// Halves are transcribed sequentially (not concurrently) and merged per channel.
+    static func splitAndTranscribe(
+        audioData: Data,
+        language: String,
+        vocabulary: [String]
+    ) async throws -> [TranscriptSegment] {
+        let overlapBytes = stereoBytesPerSecond  // 1 second overlap
+        let bytesPerFrame = 4  // Stereo Int16: 2 channels * 2 bytes
+
+        // Align midpoint to frame boundary
+        let rawMid = audioData.count / 2
+        let mid = (rawMid / bytesPerFrame) * bytesPerFrame
+
+        // First half: [0, mid + overlap/2)
+        let firstEnd = min(mid + overlapBytes / 2, audioData.count)
+        let alignedFirstEnd = (firstEnd / bytesPerFrame) * bytesPerFrame
+        let firstHalf = audioData.prefix(alignedFirstEnd)
+
+        // Second half: [mid - overlap/2, end)
+        let secondStart = max(mid - overlapBytes / 2, 0)
+        let alignedSecondStart = (secondStart / bytesPerFrame) * bytesPerFrame
+        let secondHalf = audioData.suffix(from: alignedSecondStart)
+
+        let splitStartSec = Double(alignedSecondStart) / Double(stereoBytesPerSecond)
+
+        log("TranscriptionService: Split — first=\(firstHalf.count) bytes, second=\(secondHalf.count) bytes, offset=\(String(format: "%.1f", splitStartSec))s")
+
+        // Transcribe both halves (sequentially to avoid doubling concurrent load)
+        let firstSegments = try await batchTranscribeFull(
+            audioData: Data(firstHalf), language: language, vocabulary: vocabulary
+        )
+        let secondSegments = try await batchTranscribeFull(
+            audioData: Data(secondHalf), language: language, vocabulary: vocabulary
+        )
+
+        // Merge per channel: offset second-half timestamps, dedupe overlap
+        return mergeSegments(first: firstSegments, second: secondSegments, secondOffsetSec: splitStartSec)
+    }
+
+    /// Merge segments from two halves per channel.
+    /// Second-half word timestamps are offset by secondOffsetSec.
+    /// Words in the overlap window are deduped by matching text and timestamp proximity.
+    static func mergeSegments(
+        first: [TranscriptSegment],
+        second: [TranscriptSegment],
+        secondOffsetSec: Double
+    ) -> [TranscriptSegment] {
+        // Group by channel
+        var firstByChannel: [Int: TranscriptSegment] = [:]
+        for seg in first { firstByChannel[seg.channelIndex] = seg }
+
+        var secondByChannel: [Int: TranscriptSegment] = [:]
+        for seg in second { secondByChannel[seg.channelIndex] = seg }
+
+        let allChannels = Set(firstByChannel.keys).union(secondByChannel.keys)
+        var merged: [TranscriptSegment] = []
+
+        for ch in allChannels.sorted() {
+            let firstWords = firstByChannel[ch]?.words ?? []
+            let secondWords = (secondByChannel[ch]?.words ?? []).map { word in
+                TranscriptSegment.Word(
+                    word: word.word,
+                    start: word.start + secondOffsetSec,
+                    end: word.end + secondOffsetSec,
+                    confidence: word.confidence,
+                    speaker: word.speaker,
+                    punctuatedWord: word.punctuatedWord
+                )
+            }
+
+            // Dedupe: find where first-half ends and second-half begins
+            let deduped = dedupeOverlapWords(first: firstWords, second: secondWords)
+
+            let combinedText = deduped.map { $0.punctuatedWord }.joined(separator: " ")
+            let avgConfidence = deduped.isEmpty ? 0.0 : deduped.reduce(0.0) { $0 + $1.confidence } / Double(deduped.count)
+
+            merged.append(TranscriptSegment(
+                text: combinedText,
+                isFinal: true,
+                speechFinal: true,
+                confidence: avgConfidence,
+                words: deduped,
+                channelIndex: ch
+            ))
+        }
+
+        return merged
+    }
+
+    /// Deduplicate words in the overlap window between first and second halves.
+    /// Words from the second half that match a first-half word (same text, within 0.5s) are dropped.
+    static func dedupeOverlapWords(
+        first: [TranscriptSegment.Word],
+        second: [TranscriptSegment.Word]
+    ) -> [TranscriptSegment.Word] {
+        guard let lastFirstWord = first.last else { return second }
+        let overlapEnd = lastFirstWord.end
+
+        var result = first
+        for word in second {
+            // Skip words that fall within the overlap window and match a first-half word
+            if word.start <= overlapEnd + 0.5 {
+                let isDuplicate = first.contains { firstWord in
+                    firstWord.word.lowercased() == word.word.lowercased() &&
+                    abs(firstWord.start - word.start) < 0.5
+                }
+                if isDuplicate { continue }
+            }
+            result.append(word)
+        }
+
+        return result
+    }
 }
 
 // MARK: - Reconnect Audio Ring Buffer

From 340140ed52a88569f105835573aaa905063365e2 Mon Sep 17 00:00:00 2001
From: beastoin <ngocthinhdp@gmail.com>
Date: Tue, 31 Mar 2026 16:38:28 +0000
Subject: [PATCH 03/11] fix(desktop): use splitting-aware batch transcription
 in AppState

Switch batchTranscribeChunk from batchTranscribeFull to
batchTranscribeWithSplitting, which handles proactive splitting
and 413 retry automatically.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 desktop/Desktop/Sources/AppState.swift | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/desktop/Desktop/Sources/AppState.swift b/desktop/Desktop/Sources/AppState.swift
index 84e0dcc5c66..d9020888445 100644
--- a/desktop/Desktop/Sources/AppState.swift
+++ b/desktop/Desktop/Sources/AppState.swift
@@ -2329,7 +2329,7 @@ class AppState: ObservableObject {
     let vocabulary = AssistantSettings.shared.effectiveVocabulary
 
     do {
-      let segments = try await TranscriptionService.batchTranscribeFull(
+      let segments = try await TranscriptionService.batchTranscribeWithSplitting(
         audioData: audioBuffer,
         language: effectiveLanguage,
         vocabulary: vocabulary

From 77ad5bac447bffc72aafcd202efe1504a7135f0f Mon Sep 17 00:00:00 2001
From: beastoin <ngocthinhdp@gmail.com>
Date: Tue, 31 Mar 2026 16:38:33 +0000
Subject: [PATCH 04/11] test(desktop): add batch split and merge tests

7 tests covering word deduplication, timestamp offsetting,
multi-channel merge, maxBatchBytes consistency, and frame alignment.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 .../Tests/TranscriptionServiceTests.swift     | 111 ++++++++++++++++++
 1 file changed, 111 insertions(+)

diff --git a/desktop/Desktop/Tests/TranscriptionServiceTests.swift b/desktop/Desktop/Tests/TranscriptionServiceTests.swift
index ae5e8c57f8d..288341dd825 100644
--- a/desktop/Desktop/Tests/TranscriptionServiceTests.swift
+++ b/desktop/Desktop/Tests/TranscriptionServiceTests.swift
@@ -235,6 +235,117 @@ final class URLConstructionTests: XCTestCase {
     }
 }
 
+// MARK: - Batch Transcription Splitting Tests
+
+final class BatchSplitTests: XCTestCase {
+
+    func testDedupeOverlapWordsRemovesDuplicates() {
+        let first = [
+            TranscriptionService.TranscriptSegment.Word(word: "hello", start: 0.0, end: 0.5, confidence: 0.9, speaker: 0, punctuatedWord: "Hello"),
+            TranscriptionService.TranscriptSegment.Word(word: "world", start: 0.5, end: 1.0, confidence: 0.9, speaker: 0, punctuatedWord: "world"),
+        ]
+        let second = [
+            // Duplicate of "world" — within 0.5s of first-half version
+            TranscriptionService.TranscriptSegment.Word(word: "world", start: 0.6, end: 1.1, confidence: 0.8, speaker: 0, punctuatedWord: "world"),
+            TranscriptionService.TranscriptSegment.Word(word: "foo", start: 1.5, end: 2.0, confidence: 0.9, speaker: 0, punctuatedWord: "foo"),
+        ]
+
+        let result = TranscriptionService.dedupeOverlapWords(first: first, second: second)
+        XCTAssertEqual(result.count, 3)
+        XCTAssertEqual(result[0].word, "hello")
+        XCTAssertEqual(result[1].word, "world")
+        XCTAssertEqual(result[2].word, "foo")
+    }
+
+    func testDedupeOverlapWordsKeepsNonOverlapping() {
+        let first = [
+            TranscriptionService.TranscriptSegment.Word(word: "hello", start: 0.0, end: 0.5, confidence: 0.9, speaker: 0, punctuatedWord: "Hello"),
+        ]
+        let second = [
+            TranscriptionService.TranscriptSegment.Word(word: "world", start: 5.0, end: 5.5, confidence: 0.9, speaker: 0, punctuatedWord: "world"),
+        ]
+
+        let result = TranscriptionService.dedupeOverlapWords(first: first, second: second)
+        XCTAssertEqual(result.count, 2)
+        XCTAssertEqual(result[0].word, "hello")
+        XCTAssertEqual(result[1].word, "world")
+    }
+
+    func testDedupeOverlapWordsEmptyFirst() {
+        let second = [
+            TranscriptionService.TranscriptSegment.Word(word: "hello", start: 0.0, end: 0.5, confidence: 0.9, speaker: 0, punctuatedWord: "Hello"),
+        ]
+
+        let result = TranscriptionService.dedupeOverlapWords(first: [], second: second)
+        XCTAssertEqual(result.count, 1)
+        XCTAssertEqual(result[0].word, "hello")
+    }
+
+    func testMergeSegmentsOffsetsSecondHalf() {
+        let first = [
+            TranscriptionService.TranscriptSegment(
+                text: "hello", isFinal: true, speechFinal: true, confidence: 0.9,
+                words: [.init(word: "hello", start: 0.0, end: 0.5, confidence: 0.9, speaker: 0, punctuatedWord: "hello")],
+                channelIndex: 0
+            ),
+        ]
+        let second = [
+            TranscriptionService.TranscriptSegment(
+                text: "world", isFinal: true, speechFinal: true, confidence: 0.9,
+                words: [.init(word: "world", start: 0.0, end: 0.5, confidence: 0.9, speaker: 0, punctuatedWord: "world")],
+                channelIndex: 0
+            ),
+        ]
+
+        let merged = TranscriptionService.mergeSegments(first: first, second: second, secondOffsetSec: 10.0)
+        XCTAssertEqual(merged.count, 1)
+        XCTAssertEqual(merged[0].channelIndex, 0)
+        XCTAssertEqual(merged[0].words.count, 2)
+        XCTAssertEqual(merged[0].words[0].word, "hello")
+        XCTAssertEqual(merged[0].words[0].start, 0.0, accuracy: 0.001)
+        XCTAssertEqual(merged[0].words[1].word, "world")
+        XCTAssertEqual(merged[0].words[1].start, 10.0, accuracy: 0.001)
+    }
+
+    func testMergeSegmentsMultiChannel() {
+        let first = [
+            TranscriptionService.TranscriptSegment(
+                text: "mic", isFinal: true, speechFinal: true, confidence: 0.9,
+                words: [.init(word: "mic", start: 0.0, end: 0.5, confidence: 0.9, speaker: 0, punctuatedWord: "mic")],
+                channelIndex: 0
+            ),
+        ]
+        let second = [
+            TranscriptionService.TranscriptSegment(
+                text: "sys", isFinal: true, speechFinal: true, confidence: 0.9,
+                words: [.init(word: "sys", start: 0.0, end: 0.5, confidence: 0.9, speaker: 1, punctuatedWord: "sys")],
+                channelIndex: 1
+            ),
+        ]
+
+        let merged = TranscriptionService.mergeSegments(first: first, second: second, secondOffsetSec: 5.0)
+        XCTAssertEqual(merged.count, 2)
+        let ch0 = merged.first { $0.channelIndex == 0 }
+        let ch1 = merged.first { $0.channelIndex == 1 }
+        XCTAssertEqual(ch0?.words.count, 1)
+        XCTAssertEqual(ch1?.words.count, 1)
+        XCTAssertEqual(ch1?.words[0].start ?? 0, 5.0, accuracy: 0.001)
+    }
+
+    func testMaxBatchBytesConsistent() {
+        XCTAssertEqual(TranscriptionService.maxBatchPayloadBytes, VADGateService.maxBatchBytes)
+    }
+
+    func testSplitPointIsFrameAligned() {
+        // Stereo Int16: 4 bytes per frame
+        let audioSize = 100_001  // Not frame-aligned
+        let mid = audioSize / 2
+        let aligned = (mid / 4) * 4
+        XCTAssertEqual(aligned % 4, 0)
+        XCTAssertTrue(aligned <= mid)
+    }
+}
+
 final class ReconnectDelayTests: XCTestCase {
 
     func testExponentialGrowth() {

From 1b32ceed088b242ba2adbfe7b1cc296c4770df90 Mon Sep 17 00:00:00 2001
From: beastoin <ngocthinhdp@gmail.com>
Date: Tue, 31 Mar 2026 16:38:38 +0000
Subject: [PATCH 05/11] fix(desktop): update OnboardingFlowTests for
 hasReorderedTrustStep param

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 desktop/Desktop/Tests/OnboardingFlowTests.swift | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/desktop/Desktop/Tests/OnboardingFlowTests.swift b/desktop/Desktop/Tests/OnboardingFlowTests.swift
index 7678281fd20..7864784cb0b 100644
--- a/desktop/Desktop/Tests/OnboardingFlowTests.swift
+++ b/desktop/Desktop/Tests/OnboardingFlowTests.swift
@@ -23,7 +23,8 @@ final class OnboardingFlowTests: XCTestCase {
       hasMergedVoiceInputStep: false,
       hasRemovedNotificationStep: true,
       hasInsertedFloatingBarShortcutStep: true,
-      hasMigratedPagedIntro: true
+      hasMigratedPagedIntro: true,
+      hasReorderedTrustStep: true
     )
 
     XCTAssertEqual(migrated, 3)
@@ -37,7 +38,8 @@ final class OnboardingFlowTests: XCTestCase {
       hasMergedVoiceInputStep: true,
       hasRemovedNotificationStep: true,
       hasInsertedFloatingBarShortcutStep: true,
-      hasMigratedPagedIntro: true
+      hasMigratedPagedIntro: true,
+      hasReorderedTrustStep: true
     )
 
     XCTAssertEqual(migrated, OnboardingFlow.lastStepIndex)

From 733c86251a0190e0d63de464ac320e29e7f1a5c0 Mon Sep 17 00:00:00 2001
From: beastoin <ngocthinhdp@gmail.com>
Date: Tue, 31 Mar 2026 16:38:42 +0000
Subject: [PATCH 06/11] chore(desktop): add changelog entry for batch
 transcription fix

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 desktop/CHANGELOG.json | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/desktop/CHANGELOG.json b/desktop/CHANGELOG.json
index bbf07da14f8..b453cab639a 100644
--- a/desktop/CHANGELOG.json
+++ b/desktop/CHANGELOG.json
@@ -1,6 +1,7 @@
 {
   "unreleased": [
-    "Fixed WebSocket transcription disconnects: proper handshake detection, audio buffering during reconnection, unlimited retry with backoff, and thread-safe connection state"
+    "Fixed WebSocket transcription disconnects: proper handshake detection, audio buffering during reconnection, unlimited retry with backoff, and thread-safe connection state",
+    "Fixed batch transcription failing on long speech (50s+) by splitting large audio chunks automatically"
   ],
   "releases": [
     {

From fc1d04f1785faac372859614d713478a7401d4c6 Mon Sep 17 00:00:00 2001
From: beastoin <ngocthinhdp@gmail.com>
Date: Tue, 31 Mar 2026 16:47:24 +0000
Subject: [PATCH 07/11] fix(desktop): use recursive splitting for split halves
 exceeding limit
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Split halves with overlap can still exceed maxBatchPayloadBytes
(e.g., 3.2MB → two 1.63MB halves). Use batchTranscribeWithSplitting
recursively instead of batchTranscribeFull directly, so oversized
halves get split again.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 desktop/Desktop/Sources/TranscriptionService.swift | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/desktop/Desktop/Sources/TranscriptionService.swift b/desktop/Desktop/Sources/TranscriptionService.swift
index 30d653a3616..44d6d36468b 100644
--- a/desktop/Desktop/Sources/TranscriptionService.swift
+++ b/desktop/Desktop/Sources/TranscriptionService.swift
@@ -1055,11 +1055,11 @@ extension TranscriptionService {
 
         log("TranscriptionService: Split — first=\(firstHalf.count) bytes, second=\(secondHalf.count) bytes, offset=\(String(format: "%.1f", splitStartSec))s")
 
-        // Transcribe both halves (sequentially to avoid doubling concurrent load)
-        let firstSegments = try await batchTranscribeFull(
+        // Transcribe both halves (recursively split if still too large)
+        let firstSegments = try await batchTranscribeWithSplitting(
             audioData: Data(firstHalf), language: language, vocabulary: vocabulary
         )
-        let secondSegments = try await batchTranscribeFull(
+        let secondSegments = try await batchTranscribeWithSplitting(
             audioData: Data(secondHalf), language: language, vocabulary: vocabulary
         )
 

From f23640dbf22e304a21691ef80dee22330d4c7484 Mon Sep 17 00:00:00 2001
From: beastoin <ngocthinhdp@gmail.com>
Date: Tue, 31 Mar 2026 16:47:30 +0000
Subject: [PATCH 08/11] fix(desktop): transition to .speech after auto-emit in
 hangover state

autoEmitBatchBuffer left batchState unchanged, so auto-emit during
hangover would leave an empty buffer in hangover state, potentially
emitting a silence-only follow-up chunk. Always transition to .speech
after auto-emit to continue proper accumulation.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 desktop/Desktop/Sources/VADGateService.swift | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/desktop/Desktop/Sources/VADGateService.swift b/desktop/Desktop/Sources/VADGateService.swift
index 31ca0a7349d..2ad58ca558b 100644
--- a/desktop/Desktop/Sources/VADGateService.swift
+++ b/desktop/Desktop/Sources/VADGateService.swift
@@ -698,7 +698,7 @@ final class VADGateService {
     }
 
     /// Auto-emit the current batch buffer when it exceeds maxBatchBytes.
-    /// Stays in .speech state so the next audio continues accumulating into a fresh buffer.
+    /// Transitions to .speech state so the next audio continues accumulating into a fresh buffer.
     /// Called under lock.
     private func autoEmitBatchBuffer(nextChunkMs: Double, nextChunkData: Data) -> BatchGateOutput {
         let bytesPerFrame = 4
@@ -709,7 +709,9 @@ final class VADGateService {
         let emittedDurationSec = Double(completedBuffer.count / bytesPerFrame) / Double(sampleRate)
         batchSpeechStartWallTime = startTime + emittedDurationSec
 
-        // Start fresh accumulation (stay in current state — speech or hangover)
+        // Always transition to .speech for continued accumulation.
+        // If we were in .hangover, staying there would emit a silence-only follow-up chunk.
+        batchState = .speech
         batchAudioBuffer = Data()
 
         log("VADGate [batch]: Auto-emit (max size) — \(completedBuffer.count) bytes (\(String(format: "%.1f", emittedDurationSec))s)")

From ba2b908b4037267436f5987fb02c2b2ceb815e78 Mon Sep 17 00:00:00 2001
From: beastoin <ngocthinhdp@gmail.com>
Date: Tue, 31 Mar 2026 16:52:52 +0000
Subject: [PATCH 09/11] fix(desktop): reset batchLastSpeechMs after auto-emit
 to prevent silence-only chunk
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

After auto-emit, batchLastSpeechMs still pointed to the old buffer's
last speech time. The next silent chunk would immediately trigger
hangover→silence transition (timeSinceSpeechMs > 2000) and emit an
empty/silence-only buffer. Reset batchLastSpeechMs to batchAudioCursorMs
so the hangover timer starts fresh after auto-emit.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 desktop/Desktop/Sources/VADGateService.swift | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/desktop/Desktop/Sources/VADGateService.swift b/desktop/Desktop/Sources/VADGateService.swift
index 2ad58ca558b..964f0f37ec8 100644
--- a/desktop/Desktop/Sources/VADGateService.swift
+++ b/desktop/Desktop/Sources/VADGateService.swift
@@ -712,6 +712,10 @@ final class VADGateService {
         // Always transition to .speech for continued accumulation.
         // If we were in .hangover, staying there would emit a silence-only follow-up chunk.
         batchState = .speech
+        // Reset lastSpeechMs to current cursor so the hangover timer starts fresh.
+        // Without this, the next silent chunk would immediately trigger hangover→silence
+        // and emit an empty/silence-only buffer.
+        batchLastSpeechMs = batchAudioCursorMs
         batchAudioBuffer = Data()
 
         log("VADGate [batch]: Auto-emit (max size) — \(completedBuffer.count) bytes (\(String(format: "%.1f", emittedDurationSec))s)")

From 2bad1363a56d73e6c03c065b20d7268c3f55e70f Mon Sep 17 00:00:00 2001
From: beastoin <ngocthinhdp@gmail.com>
Date: Tue, 31 Mar 2026 17:00:35 +0000
Subject: [PATCH 10/11] feat(desktop): add VAD gate test accessors for
 auto-emit verification

Add testAutoEmit() method and test property accessors to
VADGateService for testing the auto-emit state machine path
without requiring ONNX model loading.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 desktop/Desktop/Sources/VADGateService.swift | 26 ++++++++++++++++++++
 1 file changed, 26 insertions(+)

diff --git a/desktop/Desktop/Sources/VADGateService.swift b/desktop/Desktop/Sources/VADGateService.swift
index 964f0f37ec8..6fb15e87a76 100644
--- a/desktop/Desktop/Sources/VADGateService.swift
+++ b/desktop/Desktop/Sources/VADGateService.swift
@@ -723,6 +723,32 @@ final class VADGateService {
         return BatchGateOutput(audioBuffer: completedBuffer, speechStartWallTime: startTime, isComplete: true)
     }
 
+    // MARK: - Test Accessors (internal, accessible via @testable import)
+
+    /// Directly invoke auto-emit for testing. Sets up state, calls autoEmitBatchBuffer, returns result.
+    func testAutoEmit(
+        batchBuffer: Data,
+        startState: GateState,
+        speechStartWallTime: Double,
+        audioCursorMs: Double,
+        lastSpeechMs: Double
+    ) -> (output: BatchGateOutput, resultState: GateState, resultLastSpeechMs: Double, resultStartWallTime: Double) {
+        lock.lock()
+        defer { lock.unlock() }
+        batchAudioBuffer = batchBuffer
+        batchState = startState
+        batchSpeechStartWallTime = speechStartWallTime
+        batchAudioCursorMs = audioCursorMs
+        batchLastSpeechMs = lastSpeechMs
+
+        let output = autoEmitBatchBuffer(nextChunkMs: 100, nextChunkData: Data())
+        return (output, batchState, batchLastSpeechMs, batchSpeechStartWallTime)
+    }
+
+    /// Test-only reads of batch state for assertions; unlike testAutoEmit, these do not take `lock`.
+    var testBatchState: GateState { batchState }
+    var testBatchBufferCount: Int { batchAudioBuffer.count }
+
     /// Flush remaining batch audio buffer (call when recording stops).
     func flushBatchBuffer() -> BatchGateOutput? {
         lock.lock()

From d63840304b35420d8ab1638d77a54cb4796d40d2 Mon Sep 17 00:00:00 2001
From: beastoin <ngocthinhdp@gmail.com>
Date: Tue, 31 Mar 2026 17:00:41 +0000
Subject: [PATCH 11/11] test(desktop): add VAD auto-emit state machine tests
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

4 tests verifying: speech→speech transition, hangover→speech
transition (prevents silence-only follow-up), batchLastSpeechMs
reset, and start wall time advancement.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 .../Tests/TranscriptionServiceTests.swift     | 82 +++++++++++++++++++
 1 file changed, 82 insertions(+)

diff --git a/desktop/Desktop/Tests/TranscriptionServiceTests.swift b/desktop/Desktop/Tests/TranscriptionServiceTests.swift
index 288341dd825..bac514f0ac0 100644
--- a/desktop/Desktop/Tests/TranscriptionServiceTests.swift
+++ b/desktop/Desktop/Tests/TranscriptionServiceTests.swift
@@ -235,6 +235,88 @@ final class URLConstructionTests: XCTestCase {
     }
 }
 
+// MARK: - VAD Gate Auto-Emit Tests
+
+final class VADGateAutoEmitTests: XCTestCase {
+
+    func testAutoEmitFromSpeechTransitionsToSpeech() {
+        let gate = VADGateService()
+        let buffer = Data(repeating: 0xAA, count: 1_500_000)
+        let result = gate.testAutoEmit(
+            batchBuffer: buffer,
+            startState: .speech,
+            speechStartWallTime: 100.0,
+            audioCursorMs: 23400,
+            lastSpeechMs: 23400
+        )
+
+        // Should emit the buffer
+        XCTAssertEqual(result.output.audioBuffer?.count, 1_500_000)
+        XCTAssertTrue(result.output.isComplete)
+        XCTAssertEqual(result.output.speechStartWallTime, 100.0, accuracy: 0.001)
+
+        // Should stay in .speech
+        XCTAssertEqual(result.resultState, .speech)
+
+        // Buffer should be cleared
+        XCTAssertEqual(gate.testBatchBufferCount, 0)
+    }
+
+    func testAutoEmitFromHangoverTransitionsToSpeech() {
+        let gate = VADGateService()
+        let buffer = Data(repeating: 0xBB, count: 1_500_000)
+        let result = gate.testAutoEmit(
+            batchBuffer: buffer,
+            startState: .hangover,
+            speechStartWallTime: 50.0,
+            audioCursorMs: 25000,
+            lastSpeechMs: 21000
+        )
+
+        // Should emit the buffer
+        XCTAssertTrue(result.output.isComplete)
+        XCTAssertEqual(result.output.audioBuffer?.count, 1_500_000)
+
+        // Should transition to .speech (not stay in .hangover)
+        XCTAssertEqual(result.resultState, .speech)
+    }
+
+    func testAutoEmitResetsBatchLastSpeechMs() {
+        let gate = VADGateService()
+        let buffer = Data(repeating: 0xCC, count: 1_500_000)
+        let result = gate.testAutoEmit(
+            batchBuffer: buffer,
+            startState: .hangover,
+            speechStartWallTime: 50.0,
+            audioCursorMs: 25000,
+            lastSpeechMs: 21000  // Old speech time from previous buffer
+        )
+
+        // batchLastSpeechMs should be reset to batchAudioCursorMs
+        XCTAssertEqual(result.resultLastSpeechMs, 25000, accuracy: 0.001)
+    }
+
+    func testAutoEmitAdvancesStartWallTime() {
+        let gate = VADGateService()
+        // 640000 bytes = 10 seconds of stereo 16kHz Int16 audio
+        let buffer = Data(repeating: 0xDD, count: 640_000)
+        let result = gate.testAutoEmit(
+            batchBuffer: buffer,
+            startState: .speech,
+            speechStartWallTime: 100.0,
+            audioCursorMs: 10000,
+            lastSpeechMs: 10000
+        )
+
+        // emittedDuration = 640000 / 4 / 16000 = 10.0s
+        // New start wall time should be 100.0 + 10.0 = 110.0
+        XCTAssertEqual(result.resultStartWallTime, 110.0, accuracy: 0.001)
+
+        // Emitted output should have old start time
+        XCTAssertEqual(result.output.speechStartWallTime, 100.0, accuracy: 0.001)
+    }
+}
+
 // MARK: - Batch Transcription Splitting Tests
 
 final class BatchSplitTests: XCTestCase {