BasedHardware · beastoin · Mar 31, 2026 · Mar 31, 2026 · Mar 31, 2026 · Mar 31, 2026
diff --git a/desktop/CHANGELOG.json b/desktop/CHANGELOG.json
@@ -1,5 +1,7 @@
 {
-  "unreleased": [],
+  "unreleased": [
+    "Fixed batch transcription failing on long speech (50s+) by splitting large audio chunks automatically"
+  ],
   "releases": [
     {
       "version": "0.11.202",

diff --git a/desktop/Desktop/Sources/AppState.swift b/desktop/Desktop/Sources/AppState.swift
@@ -2329,7 +2329,7 @@ class AppState: ObservableObject {
     let vocabulary = AssistantSettings.shared.effectiveVocabulary
 
     do {
-      let segments = try await TranscriptionService.batchTranscribeFull(
+      let segments = try await TranscriptionService.batchTranscribeWithSplitting(
         audioData: audioBuffer,
         language: effectiveLanguage,
         vocabulary: vocabulary

diff --git a/desktop/Desktop/Sources/TranscriptionService.swift b/desktop/Desktop/Sources/TranscriptionService.swift
@@ -42,6 +42,7 @@ class TranscriptionService: NSObject, URLSessionWebSocketDelegate {
         case missingAPIKey
         case connectionFailed(Error)
         case invalidResponse
+        case payloadTooLarge(statusCode: Int, body: String)
         case webSocketError(String)
 
         var errorDescription: String? {
@@ -52,6 +53,8 @@ class TranscriptionService: NSObject, URLSessionWebSocketDelegate {
                 return "Connection failed: \(error.localizedDescription)"
             case .invalidResponse:
                 return "Invalid response from DeepGram"
+            case .payloadTooLarge(let statusCode, _):
+                return "Payload too large (HTTP \(statusCode))"
             case .webSocketError(let message):
                 return "WebSocket error: \(message)"
             }
@@ -952,6 +955,9 @@ extension TranscriptionService {
             let statusCode = (response as? HTTPURLResponse)?.statusCode ?? -1
             let body = String(data: data, encoding: .utf8) ?? "no body"
             logError("TranscriptionService: Batch full transcription failed with status \(statusCode): \(body)", error: nil)
+            if statusCode == 413 {
+                throw TranscriptionError.payloadTooLarge(statusCode: statusCode, body: body)
+            }
             throw TranscriptionError.invalidResponse
         }
 
@@ -989,6 +995,152 @@ extension TranscriptionService {
 
         return segments
     }
+
+    // MARK: - Batch Transcription with Splitting
+
+    /// Maximum audio payload size for a single batch transcription request.
+    /// Matches VADGateService.maxBatchBytes. Audio larger than this is proactively split.
+    static let maxBatchPayloadBytes = VADGateService.maxBatchBytes
+
+    /// Bytes per second for stereo 16kHz Int16 PCM audio.
+    static let stereoBytesPerSecond = 64_000
+
+    /// Transcribe audio with automatic splitting for large payloads.
+    /// Proactively splits audio exceeding maxBatchPayloadBytes, and retries with splitting on 413.
+    static func batchTranscribeWithSplitting(
+        audioData: Data,
+        language: String = "en",
+        vocabulary: [String] = []
+    ) async throws -> [TranscriptSegment] {
+        // Proactive split if audio exceeds max payload
+        if audioData.count > maxBatchPayloadBytes {
+            log("TranscriptionService: Audio \(audioData.count) bytes exceeds \(maxBatchPayloadBytes) — splitting")
+            return try await splitAndTranscribe(audioData: audioData, language: language, vocabulary: vocabulary)
+        }
+
+        // Try direct transcription, retry with split on 413
+        do {
+            return try await batchTranscribeFull(audioData: audioData, language: language, vocabulary: vocabulary)
+        } catch TranscriptionError.payloadTooLarge {
+            log("TranscriptionService: Got 413, retrying with split")
+            return try await splitAndTranscribe(audioData: audioData, language: language, vocabulary: vocabulary)
+        }
+    }
+
+    /// Split audio at midpoint with 1s overlap, transcribe each half, merge results.
+    /// Only one level of splitting — halves are sent directly via batchTranscribeFull.
+    static func splitAndTranscribe(
+        audioData: Data,
+        language: String,
+        vocabulary: [String]
+    ) async throws -> [TranscriptSegment] {
+        let overlapBytes = stereoBytesPerSecond  // 1 second overlap
+        let bytesPerFrame = 4  // Stereo Int16: 2 channels * 2 bytes
+
+        // Align midpoint to frame boundary
+        let rawMid = audioData.count / 2
+        let mid = (rawMid / bytesPerFrame) * bytesPerFrame
+
+        // First half: [0, mid + overlap/2)
+        let firstEnd = min(mid + overlapBytes / 2, audioData.count)
+        let alignedFirstEnd = (firstEnd / bytesPerFrame) * bytesPerFrame
+        let firstHalf = audioData.prefix(alignedFirstEnd)
+
+        // Second half: [mid - overlap/2, end)
+        let secondStart = max(mid - overlapBytes / 2, 0)
+        let alignedSecondStart = (secondStart / bytesPerFrame) * bytesPerFrame
+        let secondHalf = audioData.suffix(from: alignedSecondStart)
+
+        let splitStartSec = Double(alignedSecondStart) / Double(stereoBytesPerSecond)
+
+        log("TranscriptionService: Split — first=\(firstHalf.count) bytes, second=\(secondHalf.count) bytes, offset=\(String(format: "%.1f", splitStartSec))s")
+
+        // Transcribe both halves (recursively split if still too large)
+        let firstSegments = try await batchTranscribeWithSplitting(
+            audioData: Data(firstHalf), language: language, vocabulary: vocabulary
+        )
+        let secondSegments = try await batchTranscribeWithSplitting(
+            audioData: Data(secondHalf), language: language, vocabulary: vocabulary
+        )
+
+        // Merge per channel: offset second-half timestamps, dedupe overlap
+        return mergeSegments(first: firstSegments, second: secondSegments, secondOffsetSec: splitStartSec)
+    }
+
+    /// Merge segments from two halves per channel.
+    /// Second-half word timestamps are offset by secondOffsetSec.
+    /// Words in the overlap window are deduped by matching text and timestamp proximity.
+    static func mergeSegments(
+        first: [TranscriptSegment],
+        second: [TranscriptSegment],
+        secondOffsetSec: Double
+    ) -> [TranscriptSegment] {
+        // Group by channel
+        var firstByChannel: [Int: TranscriptSegment] = [:]
+        for seg in first { firstByChannel[seg.channelIndex] = seg }
+
+        var secondByChannel: [Int: TranscriptSegment] = [:]
+        for seg in second { secondByChannel[seg.channelIndex] = seg }
+
+        let allChannels = Set(firstByChannel.keys).union(secondByChannel.keys)
+        var merged: [TranscriptSegment] = []
+
+        for ch in allChannels.sorted() {
+            let firstWords = firstByChannel[ch]?.words ?? []
+            let secondWords = (secondByChannel[ch]?.words ?? []).map { word in
+                TranscriptSegment.Word(
+                    word: word.word,
+                    start: word.start + secondOffsetSec,
+                    end: word.end + secondOffsetSec,
+                    confidence: word.confidence,
+                    speaker: word.speaker,
+                    punctuatedWord: word.punctuatedWord
+                )
+            }
+
+            // Dedupe: find where first-half ends and second-half begins
+            let deduped = dedupeOverlapWords(first: firstWords, second: secondWords)
+
+            let combinedText = deduped.map { $0.punctuatedWord }.joined(separator: " ")
+            let avgConfidence = deduped.isEmpty ? 0.0 : deduped.reduce(0.0) { $0 + $1.confidence } / Double(deduped.count)
+
+            merged.append(TranscriptSegment(
+                text: combinedText,
+                isFinal: true,
+                speechFinal: true,
+                confidence: avgConfidence,
+                words: deduped,
+                channelIndex: ch
+            ))
+        }
+
+        return merged
+    }
+
+    /// Deduplicate words in the overlap window between first and second halves.
+    /// Words from the second half that match a first-half word (same text, within 0.5s) are dropped.
+    static func dedupeOverlapWords(
+        first: [TranscriptSegment.Word],
+        second: [TranscriptSegment.Word]
+    ) -> [TranscriptSegment.Word] {
+        guard let lastFirstWord = first.last else { return second }
+        let overlapEnd = lastFirstWord.end
+
+        var result = first
+        for word in second {
+            // Skip words that fall within the overlap window and match a first-half word
+            if word.start <= overlapEnd + 0.5 {
+                let isDuplicate = first.contains { firstWord in
+                    firstWord.word.lowercased() == word.word.lowercased() &&
+                    abs(firstWord.start - word.start) < 0.5
+                }
+                if isDuplicate { continue }
+            }
+            result.append(word)
+        }
+
+        return result
+    }
 }
 
 // MARK: - Reconnect Audio Ring Buffer

diff --git a/desktop/Desktop/Sources/VADGateService.swift b/desktop/Desktop/Sources/VADGateService.swift
@@ -205,6 +205,9 @@ final class VADGateService {
     private let preRollMs: Double = 500
     private let hangoverMs: Double = 4000       // Streaming mode: controls finalize timing
     private let batchHangoverMs: Double = 2000  // Batch mode: controls chunk boundary (user-visible latency)
+    /// Maximum batch buffer size before auto-emit (~23.4s of stereo 16kHz Int16 PCM).
+    /// Prevents HTTP 413 from backend/proxy body size limits.
+    static let maxBatchBytes = 1_500_000
     private let keepaliveSec: Double = 20
     private let vadWindowSamples = 512
     private let sampleRate = 16000
@@ -644,6 +647,11 @@ final class VADGateService {
         case .speech:
             batchAudioBuffer.append(stereoData)
 
+            // Auto-emit if buffer exceeds max size (prevents HTTP 413)
+            if batchAudioBuffer.count >= VADGateService.maxBatchBytes {
+                return autoEmitBatchBuffer(nextChunkMs: chunkMs, nextChunkData: stereoData)
+            }
+
             if !isSpeech {
                 // SPEECH -> HANGOVER
                 batchState = .hangover
@@ -653,6 +661,12 @@ final class VADGateService {
 
         case .hangover:
             batchAudioBuffer.append(stereoData)
+
+            // Auto-emit if buffer exceeds max size (prevents HTTP 413)
+            if batchAudioBuffer.count >= VADGateService.maxBatchBytes {
+                return autoEmitBatchBuffer(nextChunkMs: chunkMs, nextChunkData: stereoData)
+            }
+
             let timeSinceSpeechMs = batchAudioCursorMs - batchLastSpeechMs
 
             if isSpeech {
@@ -683,6 +697,58 @@ final class VADGateService {
         }
     }
 
+    /// Auto-emit the current batch buffer when it exceeds maxBatchBytes.
+    /// Transitions to .speech state so the next audio continues accumulating into a fresh buffer.
+    /// Called under lock.
+    private func autoEmitBatchBuffer(nextChunkMs: Double, nextChunkData: Data) -> BatchGateOutput {
-    private func autoEmitBatchBuffer(nextChunkMs: Double, nextChunkData: Data) -> BatchGateOutput {
+    private func autoEmitBatchBuffer() -> BatchGateOutput {
-    private func autoEmitBatchBuffer(nextChunkMs: Double, nextChunkData: Data) -> BatchGateOutput {
+    private func autoEmitBatchBuffer() -> BatchGateOutput {
+        let bytesPerFrame = 4
+        let completedBuffer = batchAudioBuffer
+        let startTime = batchSpeechStartWallTime
+
+        // Advance start time for the next buffer: emitted duration in seconds
+        let emittedDurationSec = Double(completedBuffer.count / bytesPerFrame) / Double(sampleRate)
+        batchSpeechStartWallTime = startTime + emittedDurationSec
+
+        // Always transition to .speech for continued accumulation.
+        // If we were in .hangover, staying there would emit a silence-only follow-up chunk.
+        batchState = .speech
+        // Reset lastSpeechMs to current cursor so the hangover timer starts fresh.
+        // Without this, the next silent chunk would immediately trigger hangover→silence
+        // and emit an empty/silence-only buffer.
+        batchLastSpeechMs = batchAudioCursorMs
+        batchAudioBuffer = Data()
+
+        log("VADGate [batch]: Auto-emit (max size) — \(completedBuffer.count) bytes (\(String(format: "%.1f", emittedDurationSec))s)")
+
+        return BatchGateOutput(audioBuffer: completedBuffer, speechStartWallTime: startTime, isComplete: true)
+    }
+
+    // MARK: - Test Accessors (internal, accessible via @testable import)
+
+    /// Directly invoke auto-emit for testing. Sets up state, calls autoEmitBatchBuffer, returns result.
+    func testAutoEmit(
+        batchBuffer: Data,
+        startState: GateState,
+        speechStartWallTime: Double,
+        audioCursorMs: Double,
+        lastSpeechMs: Double
+    ) -> (output: BatchGateOutput, resultState: GateState, resultLastSpeechMs: Double, resultStartWallTime: Double) {
+        lock.lock()
+        defer { lock.unlock() }
+        batchAudioBuffer = batchBuffer
+        batchState = startState
+        batchSpeechStartWallTime = speechStartWallTime
+        batchAudioCursorMs = audioCursorMs
+        batchLastSpeechMs = lastSpeechMs
+
+        let output = autoEmitBatchBuffer(nextChunkMs: 100, nextChunkData: Data())
+        return (output, batchState, batchLastSpeechMs, batchSpeechStartWallTime)
+    }
+
+    /// Read batch state for assertions.
+    var testBatchState: GateState { batchState }
+    var testBatchBufferCount: Int { batchAudioBuffer.count }
+
     /// Flush remaining batch audio buffer (call when recording stops).
     func flushBatchBuffer() -> BatchGateOutput? {
         lock.lock()

diff --git a/desktop/Desktop/Tests/OnboardingFlowTests.swift b/desktop/Desktop/Tests/OnboardingFlowTests.swift
@@ -23,7 +23,8 @@ final class OnboardingFlowTests: XCTestCase {
       hasMergedVoiceInputStep: false,
       hasRemovedNotificationStep: true,
       hasInsertedFloatingBarShortcutStep: true,
-      hasMigratedPagedIntro: true
+      hasMigratedPagedIntro: true,
+      hasReorderedTrustStep: true
     )
 
     XCTAssertEqual(migrated, 3)
@@ -37,7 +38,8 @@ final class OnboardingFlowTests: XCTestCase {
       hasMergedVoiceInputStep: true,
       hasRemovedNotificationStep: true,
       hasInsertedFloatingBarShortcutStep: true,
-      hasMigratedPagedIntro: true
+      hasMigratedPagedIntro: true,
+      hasReorderedTrustStep: true
     )
 
     XCTAssertEqual(migrated, OnboardingFlow.lastStepIndex)