Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion desktop/CHANGELOG.json
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
{
"unreleased": [],
"unreleased": [
"Fixed batch transcription failing on long speech (50s+) by splitting large audio chunks automatically"
],
"releases": [
{
"version": "0.11.202",
Expand Down
2 changes: 1 addition & 1 deletion desktop/Desktop/Sources/AppState.swift
Original file line number Diff line number Diff line change
Expand Up @@ -2329,7 +2329,7 @@ class AppState: ObservableObject {
let vocabulary = AssistantSettings.shared.effectiveVocabulary

do {
let segments = try await TranscriptionService.batchTranscribeFull(
let segments = try await TranscriptionService.batchTranscribeWithSplitting(
audioData: audioBuffer,
language: effectiveLanguage,
vocabulary: vocabulary
Expand Down
152 changes: 152 additions & 0 deletions desktop/Desktop/Sources/TranscriptionService.swift
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ class TranscriptionService: NSObject, URLSessionWebSocketDelegate {
case missingAPIKey
case connectionFailed(Error)
case invalidResponse
case payloadTooLarge(statusCode: Int, body: String)
case webSocketError(String)

var errorDescription: String? {
Expand All @@ -52,6 +53,8 @@ class TranscriptionService: NSObject, URLSessionWebSocketDelegate {
return "Connection failed: \(error.localizedDescription)"
case .invalidResponse:
return "Invalid response from DeepGram"
case .payloadTooLarge(let statusCode, _):
return "Payload too large (HTTP \(statusCode))"
case .webSocketError(let message):
return "WebSocket error: \(message)"
}
Expand Down Expand Up @@ -952,6 +955,9 @@ extension TranscriptionService {
let statusCode = (response as? HTTPURLResponse)?.statusCode ?? -1
let body = String(data: data, encoding: .utf8) ?? "no body"
logError("TranscriptionService: Batch full transcription failed with status \(statusCode): \(body)", error: nil)
if statusCode == 413 {
throw TranscriptionError.payloadTooLarge(statusCode: statusCode, body: body)
}
throw TranscriptionError.invalidResponse
}

Expand Down Expand Up @@ -989,6 +995,152 @@ extension TranscriptionService {

return segments
}

// MARK: - Batch Transcription with Splitting

/// Maximum audio payload size for a single batch transcription request.
/// Matches VADGateService.maxBatchBytes. Audio larger than this is proactively split.
static let maxBatchPayloadBytes = VADGateService.maxBatchBytes

/// Bytes per second for stereo 16kHz Int16 PCM audio.
static let stereoBytesPerSecond = 64_000

/// Transcribe audio with automatic splitting for large payloads.
/// Proactively splits audio exceeding maxBatchPayloadBytes, and retries with splitting on 413.
static func batchTranscribeWithSplitting(
audioData: Data,
language: String = "en",
vocabulary: [String] = []
) async throws -> [TranscriptSegment] {
// Proactive split if audio exceeds max payload
if audioData.count > maxBatchPayloadBytes {
log("TranscriptionService: Audio \(audioData.count) bytes exceeds \(maxBatchPayloadBytes) — splitting")
return try await splitAndTranscribe(audioData: audioData, language: language, vocabulary: vocabulary)
}

// Try direct transcription, retry with split on 413
do {
return try await batchTranscribeFull(audioData: audioData, language: language, vocabulary: vocabulary)
} catch TranscriptionError.payloadTooLarge {
log("TranscriptionService: Got 413, retrying with split")
return try await splitAndTranscribe(audioData: audioData, language: language, vocabulary: vocabulary)
}
}

/// Split audio at midpoint with 1s overlap, transcribe each half, merge results.
/// Only one level of splitting — halves are sent directly via batchTranscribeFull.
static func splitAndTranscribe(
audioData: Data,
language: String,
vocabulary: [String]
) async throws -> [TranscriptSegment] {
let overlapBytes = stereoBytesPerSecond // 1 second overlap
let bytesPerFrame = 4 // Stereo Int16: 2 channels * 2 bytes

// Align midpoint to frame boundary
let rawMid = audioData.count / 2
let mid = (rawMid / bytesPerFrame) * bytesPerFrame

// First half: [0, mid + overlap/2)
let firstEnd = min(mid + overlapBytes / 2, audioData.count)
let alignedFirstEnd = (firstEnd / bytesPerFrame) * bytesPerFrame
let firstHalf = audioData.prefix(alignedFirstEnd)

// Second half: [mid - overlap/2, end)
let secondStart = max(mid - overlapBytes / 2, 0)
let alignedSecondStart = (secondStart / bytesPerFrame) * bytesPerFrame
let secondHalf = audioData.suffix(from: alignedSecondStart)

let splitStartSec = Double(alignedSecondStart) / Double(stereoBytesPerSecond)

log("TranscriptionService: Split — first=\(firstHalf.count) bytes, second=\(secondHalf.count) bytes, offset=\(String(format: "%.1f", splitStartSec))s")

// Transcribe both halves (recursively split if still too large)
let firstSegments = try await batchTranscribeWithSplitting(
audioData: Data(firstHalf), language: language, vocabulary: vocabulary
)
let secondSegments = try await batchTranscribeWithSplitting(
audioData: Data(secondHalf), language: language, vocabulary: vocabulary
)

// Merge per channel: offset second-half timestamps, dedupe overlap
return mergeSegments(first: firstSegments, second: secondSegments, secondOffsetSec: splitStartSec)
}

/// Merge segments from two halves per channel.
/// Second-half word timestamps are offset by secondOffsetSec.
/// Words in the overlap window are deduped by matching text and timestamp proximity.
static func mergeSegments(
first: [TranscriptSegment],
second: [TranscriptSegment],
secondOffsetSec: Double
) -> [TranscriptSegment] {
// Group by channel
var firstByChannel: [Int: TranscriptSegment] = [:]
for seg in first { firstByChannel[seg.channelIndex] = seg }

var secondByChannel: [Int: TranscriptSegment] = [:]
for seg in second { secondByChannel[seg.channelIndex] = seg }

let allChannels = Set(firstByChannel.keys).union(secondByChannel.keys)
var merged: [TranscriptSegment] = []

for ch in allChannels.sorted() {
let firstWords = firstByChannel[ch]?.words ?? []
let secondWords = (secondByChannel[ch]?.words ?? []).map { word in
TranscriptSegment.Word(
word: word.word,
start: word.start + secondOffsetSec,
end: word.end + secondOffsetSec,
confidence: word.confidence,
speaker: word.speaker,
punctuatedWord: word.punctuatedWord
)
}

// Dedupe: find where first-half ends and second-half begins
let deduped = dedupeOverlapWords(first: firstWords, second: secondWords)

let combinedText = deduped.map { $0.punctuatedWord }.joined(separator: " ")
let avgConfidence = deduped.isEmpty ? 0.0 : deduped.reduce(0.0) { $0 + $1.confidence } / Double(deduped.count)

merged.append(TranscriptSegment(
text: combinedText,
isFinal: true,
speechFinal: true,
confidence: avgConfidence,
words: deduped,
channelIndex: ch
))
}

return merged
}

/// Deduplicate words in the overlap window between first and second halves.
/// Words from the second half that match a first-half word (same text, within 0.5s) are dropped.
static func dedupeOverlapWords(
first: [TranscriptSegment.Word],
second: [TranscriptSegment.Word]
) -> [TranscriptSegment.Word] {
guard let lastFirstWord = first.last else { return second }
let overlapEnd = lastFirstWord.end

var result = first
for word in second {
// Skip words that fall within the overlap window and match a first-half word
if word.start <= overlapEnd + 0.5 {
let isDuplicate = first.contains { firstWord in
firstWord.word.lowercased() == word.word.lowercased() &&
abs(firstWord.start - word.start) < 0.5
}
if isDuplicate { continue }
}
result.append(word)
}

return result
}
}

// MARK: - Reconnect Audio Ring Buffer
Expand Down
66 changes: 66 additions & 0 deletions desktop/Desktop/Sources/VADGateService.swift
Original file line number Diff line number Diff line change
Expand Up @@ -205,6 +205,9 @@ final class VADGateService {
private let preRollMs: Double = 500
private let hangoverMs: Double = 4000 // Streaming mode: controls finalize timing
private let batchHangoverMs: Double = 2000 // Batch mode: controls chunk boundary (user-visible latency)
/// Maximum batch buffer size before auto-emit (~23.4s of stereo 16kHz Int16 PCM).
/// Prevents HTTP 413 from backend/proxy body size limits.
static let maxBatchBytes = 1_500_000
private let keepaliveSec: Double = 20
private let vadWindowSamples = 512
private let sampleRate = 16000
Expand Down Expand Up @@ -644,6 +647,11 @@ final class VADGateService {
case .speech:
batchAudioBuffer.append(stereoData)

// Auto-emit if buffer exceeds max size (prevents HTTP 413)
if batchAudioBuffer.count >= VADGateService.maxBatchBytes {
return autoEmitBatchBuffer(nextChunkMs: chunkMs, nextChunkData: stereoData)
}

if !isSpeech {
// SPEECH -> HANGOVER
batchState = .hangover
Expand All @@ -653,6 +661,12 @@ final class VADGateService {

case .hangover:
batchAudioBuffer.append(stereoData)

// Auto-emit if buffer exceeds max size (prevents HTTP 413)
if batchAudioBuffer.count >= VADGateService.maxBatchBytes {
return autoEmitBatchBuffer(nextChunkMs: chunkMs, nextChunkData: stereoData)
}

let timeSinceSpeechMs = batchAudioCursorMs - batchLastSpeechMs

if isSpeech {
Expand Down Expand Up @@ -683,6 +697,58 @@ final class VADGateService {
}
}

/// Auto-emit the current batch buffer when it exceeds maxBatchBytes.
/// Transitions to .speech state so the next audio continues accumulating into a fresh buffer.
/// Called under lock.
private func autoEmitBatchBuffer(nextChunkMs: Double, nextChunkData: Data) -> BatchGateOutput {
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 Unused parameters nextChunkData and nextChunkMs

Both parameters are declared but never referenced inside the function body. Because batchAudioBuffer.append(stereoData) runs before the overflow check in both the .speech and .hangover cases, the triggering chunk is already inside completedBuffer when this function is called — making the carry-over parameters redundant.

If the intent was to seed the new buffer with the triggering chunk (for context continuity between consecutive auto-emitted chunks), nextChunkData needs to actually be appended to batchAudioBuffer and batchSpeechStartWallTime should be offset only by completedBuffer.count - nextChunkData.count frames. If the current behaviour (chunk included in emitted buffer, fresh empty start) is intentional, the parameters should be removed to avoid confusion.

Suggested change
private func autoEmitBatchBuffer(nextChunkMs: Double, nextChunkData: Data) -> BatchGateOutput {
private func autoEmitBatchBuffer() -> BatchGateOutput {

let bytesPerFrame = 4
let completedBuffer = batchAudioBuffer
let startTime = batchSpeechStartWallTime

// Advance start time for the next buffer: emitted duration in seconds
let emittedDurationSec = Double(completedBuffer.count / bytesPerFrame) / Double(sampleRate)
batchSpeechStartWallTime = startTime + emittedDurationSec

// Always transition to .speech for continued accumulation.
// If we were in .hangover, staying there would emit a silence-only follow-up chunk.
batchState = .speech
// Reset lastSpeechMs to current cursor so the hangover timer starts fresh.
// Without this, the next silent chunk would immediately trigger hangover→silence
// and emit an empty/silence-only buffer.
batchLastSpeechMs = batchAudioCursorMs
batchAudioBuffer = Data()

log("VADGate [batch]: Auto-emit (max size) — \(completedBuffer.count) bytes (\(String(format: "%.1f", emittedDurationSec))s)")

return BatchGateOutput(audioBuffer: completedBuffer, speechStartWallTime: startTime, isComplete: true)
}

// MARK: - Test Accessors (internal, accessible via @testable import)

/// Directly invoke auto-emit for testing. Sets up state, calls autoEmitBatchBuffer, returns result.
func testAutoEmit(
batchBuffer: Data,
startState: GateState,
speechStartWallTime: Double,
audioCursorMs: Double,
lastSpeechMs: Double
) -> (output: BatchGateOutput, resultState: GateState, resultLastSpeechMs: Double, resultStartWallTime: Double) {
lock.lock()
defer { lock.unlock() }
batchAudioBuffer = batchBuffer
batchState = startState
batchSpeechStartWallTime = speechStartWallTime
batchAudioCursorMs = audioCursorMs
batchLastSpeechMs = lastSpeechMs

let output = autoEmitBatchBuffer(nextChunkMs: 100, nextChunkData: Data())
return (output, batchState, batchLastSpeechMs, batchSpeechStartWallTime)
}

/// Read batch state for assertions.
var testBatchState: GateState { batchState }
var testBatchBufferCount: Int { batchAudioBuffer.count }

/// Flush remaining batch audio buffer (call when recording stops).
func flushBatchBuffer() -> BatchGateOutput? {
lock.lock()
Expand Down
6 changes: 4 additions & 2 deletions desktop/Desktop/Tests/OnboardingFlowTests.swift
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,8 @@ final class OnboardingFlowTests: XCTestCase {
hasMergedVoiceInputStep: false,
hasRemovedNotificationStep: true,
hasInsertedFloatingBarShortcutStep: true,
hasMigratedPagedIntro: true
hasMigratedPagedIntro: true,
hasReorderedTrustStep: true
)

XCTAssertEqual(migrated, 3)
Expand All @@ -37,7 +38,8 @@ final class OnboardingFlowTests: XCTestCase {
hasMergedVoiceInputStep: true,
hasRemovedNotificationStep: true,
hasInsertedFloatingBarShortcutStep: true,
hasMigratedPagedIntro: true
hasMigratedPagedIntro: true,
hasReorderedTrustStep: true
)

XCTAssertEqual(migrated, OnboardingFlow.lastStepIndex)
Expand Down
Loading
Loading