From ea79accdaa16acef92727b7ccfd0ef4c34affd08 Mon Sep 17 00:00:00 2001 From: Junghwan Park Date: Thu, 19 Feb 2026 01:06:32 +0900 Subject: [PATCH 1/5] fix: eliminate duplicate translations and add sentence-based segmentation - Remove translateWithContext which caused duplicate translations by including context translations in the output when Apple Translation didn't preserve newline structure - Add extractCompleteSentences() that detects sentence boundaries via linguistic analysis and creates subtitle entries immediately on punctuation (. ! ?) instead of waiting for pause timer - Simplify handlePartialTextStabilized() to consume remaining liveText directly as a fallback for unpunctuated speech - Remove dead translateWithContext method from TranslationService --- OST/Sources/App/AppState.swift | 80 ++++++++++++++----- .../Translation/TranslationService.swift | 19 ----- 2 files changed, 58 insertions(+), 41 deletions(-) diff --git a/OST/Sources/App/AppState.swift b/OST/Sources/App/AppState.swift index 76ae1e6..792030b 100644 --- a/OST/Sources/App/AppState.swift +++ b/OST/Sources/App/AppState.swift @@ -213,6 +213,8 @@ final class AppState: ObservableObject { } self.detectLanguageIfNeeded(currentText) + // Extract completed sentences immediately (triggered by punctuation) + self.extractCompleteSentences() self.resetSpeechPauseTimer() } .store(in: &cancellables) @@ -228,32 +230,71 @@ final class AppState: ObservableObject { } } - /// When partial recognition text hasn't changed for the configured pause duration, treat it as a completed segment. + /// When partial recognition text hasn't changed for the configured pause duration, consume it. 
private func handlePartialTextStabilized() { - let fullText = speechRecognizer.currentText - guard !fullText.isEmpty, fullText != lastConsumedPartial, isCapturing else { return } + guard !liveText.isEmpty, isCapturing else { return } - // Extract only the NEW portion since last consumption - let newText: String - if !lastConsumedPartial.isEmpty && fullText.hasPrefix(lastConsumedPartial) { - newText = String(fullText.dropFirst(lastConsumedPartial.count)).trimmingCharacters(in: .whitespaces) - } else { - newText = fullText + let text = liveText + liveText = "" + lastConsumedPartial = speechRecognizer.currentText + + AppLogger.shared.log("Pause-triggered consume: \"\(text)\"", category: .speech) + + let chunks = splitIntoChunks(text) + for sentence in chunks { + let entry = SubtitleEntry(timestamp: Date(), recognized: sentence, isFinal: true) + subtitleEntries.append(entry) + translateEntry(id: entry.id, text: sentence) } - lastConsumedPartial = fullText + trimEntries() + } - guard !newText.isEmpty else { return } + /// Extracts completed sentences from liveText when punctuation boundaries are detected. + private func extractCompleteSentences() { + guard !liveText.isEmpty, isCapturing else { return } - AppLogger.shared.log("Partial text stabilized: \"\(newText)\"", category: .speech) - liveText = "" + // Split liveText into sentences using linguistic analysis + var sentenceRanges: [Range] = [] + liveText.enumerateSubstrings(in: liveText.startIndex..., options: .bySentences) { _, range, _, _ in + sentenceRanges.append(range) + } - let chunks = splitIntoChunks(newText) - for sentence in chunks { + // Need 2+ sentences: all but last are complete, last is in-progress + guard sentenceRanges.count >= 2 else { return } + + let lastStart = sentenceRanges.last!.lowerBound + let completedText = String(liveText[.. 
String { - let trimmed = text.trimmingCharacters(in: .whitespacesAndNewlines) - guard !trimmed.isEmpty else { return "" } - guard !context.isEmpty, let session else { - return try await translate(trimmed) - } - - let fullText = context.joined(separator: "\n") + "\n" + trimmed - let response = try await session.translate(fullText) - let resultLines = response.targetText.components(separatedBy: "\n") - let contextLineCount = context.count - if resultLines.count > contextLineCount { - return resultLines.dropFirst(contextLineCount).joined(separator: "\n").trimmingCharacters(in: .whitespacesAndNewlines) - } - return response.targetText - } - private func fallbackTranslation(_ text: String) async throws -> String { let sourceLang = configuration?.source?.languageCode?.identifier ?? "en" let encoded = text.addingPercentEncoding(withAllowedCharacters: .urlQueryAllowed) ?? text From 09795a99e3dfff1b23f41df4679de42f1d945d4d Mon Sep 17 00:00:00 2001 From: Junghwan Park Date: Thu, 19 Feb 2026 21:24:39 +0900 Subject: [PATCH 2/5] feat: add persistent subtle background to overlay for visibility Add a very low-opacity background (5% black fill + 8% white border) to the full overlay area so users can always see where the overlay is positioned, even when no subtitles are displayed. 
--- OST/Sources/UI/SubtitleView.swift | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/OST/Sources/UI/SubtitleView.swift b/OST/Sources/UI/SubtitleView.swift index a103ece..a6d2c1b 100644 --- a/OST/Sources/UI/SubtitleView.swift +++ b/OST/Sources/UI/SubtitleView.swift @@ -32,6 +32,14 @@ struct SubtitleView: View { ) .animation(.easeInOut(duration: 0.2), value: appState.subtitleEntries.count) .frame(maxWidth: .infinity, maxHeight: .infinity, alignment: .bottomLeading) + .background( + RoundedRectangle(cornerRadius: 8) + .fill(Color.black.opacity(0.05)) + .overlay( + RoundedRectangle(cornerRadius: 8) + .stroke(Color.white.opacity(0.08), lineWidth: 1) + ) + ) .translationTask(translationService.configuration) { session in AppLogger.shared.log("Translation session delivered by .translationTask", category: .translation) translationService.handleSession(session) From 1f9b18f505230912694119bf03e56592fa3cdd9d Mon Sep 17 00:00:00 2001 From: Junghwan Park Date: Fri, 20 Feb 2026 11:35:41 +0900 Subject: [PATCH 3/5] fix: prevent overlay window auto-resize and improve lock state controls Wrap NSHostingView in NSView container to decouple window size from content, add lock toggle and reset button in Settings > Display, and fix click-through behavior so locked overlay passes clicks through while unlocked overlay allows move/resize but blocks underlying clicks. 
--- OST/Sources/App/WindowManager.swift | 16 ++++++++++- OST/Sources/UI/FontSettingsView.swift | 23 ++++++++++++++++ OST/Sources/UI/OverlayWindow.swift | 39 +++++++++++++++++++++++---- OST/Sources/UI/SettingsView.swift | 4 ++- OST/Sources/UI/SubtitleView.swift | 35 ++++++++++++------------ 5 files changed, 92 insertions(+), 25 deletions(-) diff --git a/OST/Sources/App/WindowManager.swift b/OST/Sources/App/WindowManager.swift index 6973546..f9015e0 100644 --- a/OST/Sources/App/WindowManager.swift +++ b/OST/Sources/App/WindowManager.swift @@ -27,6 +27,14 @@ final class WindowManager: ObservableObject { overlayWindow?.updateLockState(locked: locked) } + func resetOverlay(settings: UserSettings) { + settings.overlayLocked = true + overlayWindow?.resetFrame() + overlayWindow?.updateLockState(locked: true) + // Set AFTER resetFrame() because didResizeNotification triggers persistFrame() + settings.overlayFrameSaved = false + } + func hideOverlay() { overlayWindow?.orderOut(nil) overlayWindow = nil @@ -40,7 +48,13 @@ final class WindowManager: ObservableObject { NSApp.activate(ignoringOtherApps: true) return } - let view = SettingsView(settings: settings, onOpenLogs: onOpenLogs, onOpenSessions: onOpenSessions) + let view = SettingsView( + settings: settings, + onOpenLogs: onOpenLogs, + onOpenSessions: onOpenSessions, + onResetOverlay: { [weak self] in self?.resetOverlay(settings: settings) }, + onToggleOverlayLock: { [weak self] locked in self?.updateOverlayLock(locked: locked) } + ) let window = NSWindow( contentRect: NSRect(x: 0, y: 0, width: 560, height: 480), styleMask: [.titled, .closable, .miniaturizable], diff --git a/OST/Sources/UI/FontSettingsView.swift b/OST/Sources/UI/FontSettingsView.swift index 3b64d11..709597d 100644 --- a/OST/Sources/UI/FontSettingsView.swift +++ b/OST/Sources/UI/FontSettingsView.swift @@ -2,6 +2,8 @@ import SwiftUI struct FontSettingsView: View { @ObservedObject var settings: UserSettings + var onResetOverlay: (() -> Void)? 
+ var onToggleOverlayLock: ((Bool) -> Void)? var body: some View { Form { @@ -104,6 +106,27 @@ struct FontSettingsView: View { .accessibilityHint("Toggle display of translated text") } + Section("Overlay Window") { + Toggle("Lock Overlay", isOn: Binding( + get: { settings.overlayLocked }, + set: { newValue in + settings.overlayLocked = newValue + onToggleOverlayLock?(newValue) + } + )) + .accessibilityLabel("Lock overlay position") + Text(settings.overlayLocked + ? "Locked: clicks pass through to windows below." + : "Unlocked: drag to move or resize the overlay.") + .font(.caption) + .foregroundColor(.secondary) + + Button("Reset Overlay Position & Size") { + onResetOverlay?() + } + .accessibilityLabel("Reset overlay window to default position and size") + } + Section("Live Preview") { previewSection } diff --git a/OST/Sources/UI/OverlayWindow.swift b/OST/Sources/UI/OverlayWindow.swift index ec29d03..f995d1b 100644 --- a/OST/Sources/UI/OverlayWindow.swift +++ b/OST/Sources/UI/OverlayWindow.swift @@ -29,16 +29,28 @@ final class OverlayWindow: NSPanel { backgroundColor = .clear isOpaque = false hasShadow = false - isMovableByWindowBackground = true collectionBehavior = [.canJoinAllSpaces, .fullScreenAuxiliary] - ignoresMouseEvents = settings.overlayLocked + + // Apply initial lock state + let locked = settings.overlayLocked + ignoresMouseEvents = locked + isMovableByWindowBackground = !locked let subtitleView = SubtitleView( appState: appState, settings: settings, translationService: appState.translationService ) - contentView = NSHostingView(rootView: subtitleView) + let hostingView = NSHostingView(rootView: subtitleView) + hostingView.sizingOptions = [] + + // Wrap in a plain NSView container to completely prevent + // NSHostingView from driving window size changes + let container = NSView() + container.autoresizesSubviews = true + hostingView.autoresizingMask = [.width, .height] + container.addSubview(hostingView) + contentView = container 
NotificationCenter.default.addObserver( self, selector: #selector(persistFrame), @@ -60,9 +72,26 @@ final class OverlayWindow: NSPanel { } func updateLockState(locked: Bool) { - ignoresMouseEvents = locked + if locked { + ignoresMouseEvents = true + isMovableByWindowBackground = false + } else { + ignoresMouseEvents = false + isMovableByWindowBackground = true + } + } + + func resetFrame() { + let defaultFrame = NSRect(x: 200, y: 200, width: 600, height: 200) + setFrame(defaultFrame, display: true, animate: true) + // Note: persistFrame() will be called via didResizeNotification. + // WindowManager.resetOverlay() sets overlayFrameSaved = false AFTER this call. + } + + deinit { + NotificationCenter.default.removeObserver(self) } - override var canBecomeKey: Bool { true } + override var canBecomeKey: Bool { !ignoresMouseEvents } override var canBecomeMain: Bool { false } } diff --git a/OST/Sources/UI/SettingsView.swift b/OST/Sources/UI/SettingsView.swift index d336dc4..55dfac3 100644 --- a/OST/Sources/UI/SettingsView.swift +++ b/OST/Sources/UI/SettingsView.swift @@ -4,10 +4,12 @@ struct SettingsView: View { @ObservedObject var settings: UserSettings let onOpenLogs: () -> Void let onOpenSessions: () -> Void + var onResetOverlay: (() -> Void)? + var onToggleOverlayLock: ((Bool) -> Void)? 
var body: some View { TabView { - FontSettingsView(settings: settings) + FontSettingsView(settings: settings, onResetOverlay: onResetOverlay, onToggleOverlayLock: onToggleOverlayLock) .tabItem { Label("Display", systemImage: "textformat.size") } diff --git a/OST/Sources/UI/SubtitleView.swift b/OST/Sources/UI/SubtitleView.swift index a6d2c1b..35b5335 100644 --- a/OST/Sources/UI/SubtitleView.swift +++ b/OST/Sources/UI/SubtitleView.swift @@ -8,38 +8,37 @@ struct SubtitleView: View { var body: some View { VStack(alignment: .leading, spacing: 6) { + Spacer(minLength: 0) + ForEach(appState.subtitleEntries) { entry in subtitleRow(entry) - .transition(.asymmetric( - insertion: .opacity.combined(with: .move(edge: .bottom)), - removal: .opacity - )) + .transition(.opacity) } - // Show live partial text (not yet finalized) - if !appState.liveText.isEmpty { - if settings.showOriginalText { - Text(appState.liveText) - .font(.system(size: settings.fontSize)) - .foregroundColor(settings.fontColor.opacity(0.6)) - } + if !appState.liveText.isEmpty && settings.showOriginalText { + Text(appState.liveText) + .font(.system(size: settings.fontSize)) + .foregroundColor(settings.fontColor.opacity(0.6)) + .fixedSize(horizontal: false, vertical: true) } } .padding(12) + .frame(maxWidth: .infinity, maxHeight: .infinity, alignment: .bottomLeading) + .clipped() .background( RoundedRectangle(cornerRadius: 8) .fill(settings.backgroundColor.opacity(settings.backgroundOpacity)) ) - .animation(.easeInOut(duration: 0.2), value: appState.subtitleEntries.count) - .frame(maxWidth: .infinity, maxHeight: .infinity, alignment: .bottomLeading) - .background( + .overlay( RoundedRectangle(cornerRadius: 8) - .fill(Color.black.opacity(0.05)) - .overlay( - RoundedRectangle(cornerRadius: 8) - .stroke(Color.white.opacity(0.08), lineWidth: 1) + .stroke( + settings.overlayLocked + ? Color.white.opacity(0.15) + : Color.accentColor.opacity(0.6), + lineWidth: settings.overlayLocked ? 
1 : 2 ) ) + .animation(.easeInOut(duration: 0.2), value: appState.subtitleEntries.count) .translationTask(translationService.configuration) { session in AppLogger.shared.log("Translation session delivered by .translationTask", category: .translation) translationService.handleSession(session) From fe1cb60ba88a33716dd79363258a23b88e6db22f Mon Sep 17 00:00:00 2001 From: Junghwan Park Date: Fri, 20 Feb 2026 16:50:17 +0900 Subject: [PATCH 4/5] fix: resolve duplicate subtitles, race conditions, and improve scroll tracking - Fix duplicate subtitle entries caused by recognizer text reformulation: find longest common prefix instead of resetting consumed state entirely, add deduplication check (last 2 entries within 2s) before adding entries - Fix race condition in stopCapture: set isCapturing=false before stopRecognition to prevent phantom entries from Combine sink - Fix SystemAudioCapture: finish continuation before awaiting stopCapture to prevent dangling yields; remove force-unwrap on re-entry - Fix SpeechRecognizer: handle partial result + error simultaneously to prevent recognition task from silently dying - Fix scroll tracking: replace unreliable onAppear/onDisappear on zero-height view with onScrollGeometryChange for reliable bottom detection - Fix handlePartialTextStabilized: use stored sink value instead of reading speechRecognizer.currentText directly (timing consistency) - Fix lastConsumedTail: unconditionally clear after first use to prevent stale overlap stripping in subsequent recognition sessions - Fix detectLanguageIfNeeded: check isCapturing before language change - Fix LanguagePickerView: add defensive guard and correct swap announcement - Fix MenuBarView: show "Auto" when source language is auto-detect - Update README with comprehensive setup guide and feature documentation --- OST/Sources/App/AppState.swift | 108 ++++++++++++++---- OST/Sources/Audio/SystemAudioCapture.swift | 14 ++- OST/Sources/Speech/SpeechRecognizer.swift | 38 +++--- 
.../Translation/TranslationService.swift | 3 +- OST/Sources/UI/LanguagePickerView.swift | 7 +- OST/Sources/UI/MenuBarView.swift | 9 +- OST/Sources/UI/SubtitleView.swift | 50 ++++++-- README.md | 65 ++++++++--- 8 files changed, 225 insertions(+), 69 deletions(-) diff --git a/OST/Sources/App/AppState.swift b/OST/Sources/App/AppState.swift index 792030b..8f0e783 100644 --- a/OST/Sources/App/AppState.swift +++ b/OST/Sources/App/AppState.swift @@ -37,6 +37,8 @@ final class AppState: ObservableObject { private var expiryTimer: Timer? private var speechPauseTimer: Timer? private var lastConsumedPartial: String = "" + private var lastConsumedTail: String = "" + private var lastSinkCurrentText: String = "" private var saveSessionHistory: Bool = true private var cancellables = Set() private var autoDetectEnabled: Bool = false @@ -76,6 +78,7 @@ final class AppState: ObservableObject { isCapturing = true lastConsumedPartial = "" + lastConsumedTail = "" subtitleEntries = [] liveText = "" startConsumingBuffers(from: buffers) @@ -95,6 +98,7 @@ final class AppState: ObservableObject { /// Stops capture and recognition, preserving last recognized text. 
func stopCapture() async { guard isCapturing else { return } + isCapturing = false bufferConsumerTask?.cancel() bufferConsumerTask = nil @@ -103,10 +107,8 @@ final class AppState: ObservableObject { speechPauseTimer?.invalidate() speechPauseTimer = nil - await audioCapture.stopCapture() speechRecognizer.stopRecognition() - - isCapturing = false + await audioCapture.stopCapture() if saveSessionHistory { sessionRecorder.endSession() @@ -119,6 +121,7 @@ final class AppState: ObservableObject { do { try await speechRecognizer.changeLanguage(locale: locale, useOnDevice: useOnDevice) lastConsumedPartial = "" + lastConsumedTail = "" } catch { errorMessage = error.localizedDescription } @@ -157,10 +160,12 @@ final class AppState: ObservableObject { AppLogger.shared.log("Auto-detected language: \(target.displayName) (confidence: \(confidence))", category: .speech) Task { - await changeSourceLanguage(to: target.speechLocale, useOnDevice: speechRecognizer.currentOnDeviceSetting) - // Reset consumed state since recognizer restarts + guard self.isCapturing else { return } + // Reset consumed state BEFORE language change to avoid stale tracking lastConsumedPartial = "" + lastConsumedTail = "" liveText = "" + await changeSourceLanguage(to: target.speechLocale, useOnDevice: speechRecognizer.currentOnDeviceSetting) } } @@ -194,6 +199,8 @@ final class AppState: ObservableObject { if !self.liveText.isEmpty { self.consumeRemainingText() } + // Save tail of consumed text for overlap detection on next session + self.lastConsumedTail = String(self.lastConsumedPartial.suffix(60)) self.lastConsumedPartial = "" self.liveText = "" self.speechPauseTimer?.invalidate() @@ -205,16 +212,40 @@ final class AppState: ObservableObject { if !self.lastConsumedPartial.isEmpty && currentText.hasPrefix(self.lastConsumedPartial) { self.liveText = String(currentText.dropFirst(self.lastConsumedPartial.count)).trimmingCharacters(in: .whitespaces) } else if self.lastConsumedPartial.isEmpty { - self.liveText = 
currentText + // After restart, check for overlap with previous session's tail + if !self.lastConsumedTail.isEmpty { + let stripped = self.stripOverlap(newText: currentText, tail: self.lastConsumedTail) + self.liveText = stripped + if stripped != currentText { + // We found overlap; track what we've consumed from the new session + let overlapLength = currentText.count - stripped.count + self.lastConsumedPartial = String(currentText.prefix(overlapLength)) + AppLogger.shared.log("Stripped overlap: \(overlapLength) chars from new session", category: .speech) + } + } else { + self.liveText = currentText + } + // Clear tail after first use in new session (unconditional) + self.lastConsumedTail = "" } else { - // currentText was reset with new content (e.g. language change) - self.lastConsumedPartial = "" - self.liveText = currentText + // currentText diverged from lastConsumedPartial (recognizer reformulation) + // Find longest common prefix to avoid re-showing already-consumed text + let common = self.findCommonPrefix(currentText, self.lastConsumedPartial) + if common.count > 10 { + self.lastConsumedPartial = common + self.liveText = String(currentText.dropFirst(common.count)).trimmingCharacters(in: .whitespaces) + } else { + // Completely different text (e.g. 
language change) + self.lastConsumedPartial = "" + self.liveText = currentText + } } + self.lastSinkCurrentText = currentText self.detectLanguageIfNeeded(currentText) // Extract completed sentences immediately (triggered by punctuation) - self.extractCompleteSentences() + // Pass currentText from sink to avoid reading stale speechRecognizer.currentText + self.extractCompleteSentences(sinkCurrentText: currentText) self.resetSpeechPauseTimer() } .store(in: &cancellables) @@ -236,12 +267,13 @@ final class AppState: ObservableObject { let text = liveText liveText = "" - lastConsumedPartial = speechRecognizer.currentText + lastConsumedPartial = lastSinkCurrentText AppLogger.shared.log("Pause-triggered consume: \"\(text)\"", category: .speech) let chunks = splitIntoChunks(text) for sentence in chunks { + guard !isDuplicateEntry(sentence) else { continue } let entry = SubtitleEntry(timestamp: Date(), recognized: sentence, isFinal: true) subtitleEntries.append(entry) translateEntry(id: entry.id, text: sentence) @@ -250,7 +282,9 @@ final class AppState: ObservableObject { } /// Extracts completed sentences from liveText when punctuation boundaries are detected. - private func extractCompleteSentences() { + /// Uses `sinkCurrentText` (the value delivered by the Combine sink) instead of reading + /// `speechRecognizer.currentText` directly, which may have changed since delivery. + private func extractCompleteSentences(sinkCurrentText: String) { guard !liveText.isEmpty, isCapturing else { return } // Split liveText into sentences using linguistic analysis @@ -260,9 +294,9 @@ final class AppState: ObservableObject { } // Need 2+ sentences: all but last are complete, last is in-progress - guard sentenceRanges.count >= 2 else { return } + guard sentenceRanges.count >= 2, let lastRange = sentenceRanges.last else { return } - let lastStart = sentenceRanges.last!.lowerBound + let lastStart = lastRange.lowerBound let completedText = String(liveText[.. 
String { + var endIndex = a.startIndex + var aIdx = a.startIndex + var bIdx = b.startIndex + while aIdx < a.endIndex && bIdx < b.endIndex { + if a[aIdx] != b[bIdx] { break } + aIdx = a.index(after: aIdx) + bIdx = b.index(after: bIdx) + endIndex = aIdx + } + return String(a[.. Bool { + let cutoff = Date().addingTimeInterval(-2.0) + return subtitleEntries.suffix(2).contains { $0.recognized == text && $0.timestamp > cutoff } + } + + /// Finds and strips overlapping text between the tail of previously consumed text and the start of new text. + private func stripOverlap(newText: String, tail: String) -> String { + // Try progressively shorter suffixes of the tail to find overlap with the start of newText + let tailWords = tail.split(separator: " ") + for startIdx in 0.. max { diff --git a/OST/Sources/Audio/SystemAudioCapture.swift b/OST/Sources/Audio/SystemAudioCapture.swift index 1631f5d..b740a37 100644 --- a/OST/Sources/Audio/SystemAudioCapture.swift +++ b/OST/Sources/Audio/SystemAudioCapture.swift @@ -29,14 +29,19 @@ final class SystemAudioCapture: NSObject, @unchecked Sendable { private var stream: SCStream? private var continuation: AsyncStream.Continuation? private(set) var audioBuffers: AsyncStream? - private var bufferCount: Int = 0 + private let bufferLock = NSLock() + private var _bufferCount: Int = 0 + private var bufferCount: Int { + get { bufferLock.withLock { _bufferCount } } + set { bufferLock.withLock { _bufferCount = newValue } } + } /// Requests permission if needed, then starts capturing system audio. /// Returns a fresh AsyncStream of audio buffers for each capture session. func startCapture() async throws -> AsyncStream { guard stream == nil else { AppLogger.post("Stream already active, returning existing", category: .audio) - return audioBuffers! + return audioBuffers ?? 
AsyncStream { $0.finish() } } bufferCount = 0 @@ -95,14 +100,15 @@ final class SystemAudioCapture: NSObject, @unchecked Sendable { func stopCapture() async { guard let current = stream else { return } stream = nil + // Finish continuation BEFORE awaiting stopCapture to prevent dangling yields + continuation?.finish() + continuation = nil AppLogger.post("Stopping capture (received \(bufferCount) audio buffers)", category: .audio) do { try await current.stopCapture() } catch { AppLogger.post("Stop error (non-fatal): \(error.localizedDescription)", category: .audio) } - continuation?.finish() - continuation = nil } // MARK: - Helpers diff --git a/OST/Sources/Speech/SpeechRecognizer.swift b/OST/Sources/Speech/SpeechRecognizer.swift index cb1a1d0..35af926 100644 --- a/OST/Sources/Speech/SpeechRecognizer.swift +++ b/OST/Sources/Speech/SpeechRecognizer.swift @@ -78,6 +78,7 @@ final class SpeechRecognizer: ObservableObject { recognitionRequest = nil recognitionTask = nil currentText = "" + finalizedText = "" } // MARK: - Recognition Task @@ -88,42 +89,49 @@ final class SpeechRecognizer: ObservableObject { throw SpeechRecognizerError.recognizerUnavailable } - // Clean up previous task without clearing isActive - recognitionRequest?.endAudio() - recognitionTask?.cancel() - recognitionRequest = nil - recognitionTask = nil - + // Create the new request BEFORE cleaning up the old one + // to minimize the window where recognitionRequest is nil + // and audio buffers from startConsumingBuffers are lost. 
let request = SFSpeechAudioBufferRecognitionRequest() request.shouldReportPartialResults = true if useOnDevice && recognizer.supportsOnDeviceRecognition { request.requiresOnDeviceRecognition = true } request.addsPunctuation = true - recognitionRequest = request + + // Now clean up previous task + let oldRequest = recognitionRequest + let oldTask = recognitionTask + recognitionRequest = request // Swap immediately so append() uses new request + + oldRequest?.endAudio() + oldTask?.cancel() AppLogger.shared.log("Starting recognition task (onDevice: \(useOnDevice))", category: .speech) recognitionTask = recognizer.recognitionTask(with: request) { [weak self] result, error in guard let self else { return } - if let result { - Task { @MainActor in + Task { @MainActor in + if let result { let text = result.bestTranscription.formattedString self.currentText = text if result.isFinal { AppLogger.shared.log("Final: \"\(text)\"", category: .speech) - self.finalizedText += (self.finalizedText.isEmpty ? 
"" : " ") + text + self.finalizedText = "" self.currentText = "" - // Auto-restart recognition for continuous listening self.restartRecognition() + return + } + // Partial result with concurrent error — task is dying + if error != nil { + AppLogger.shared.log("Partial result with error, restarting", category: .speech) + self.restartRecognition() + return } } - } - if let error { - Task { @MainActor in + if let error, result == nil { AppLogger.shared.log("Recognition error: \(error.localizedDescription)", category: .error) self.currentText = "" - // Auto-restart on transient errors self.restartRecognition() } } diff --git a/OST/Sources/Translation/TranslationService.swift b/OST/Sources/Translation/TranslationService.swift index 1deb6d9..9bcad9f 100644 --- a/OST/Sources/Translation/TranslationService.swift +++ b/OST/Sources/Translation/TranslationService.swift @@ -37,8 +37,9 @@ final class TranslationService: ObservableObject { private func fallbackTranslation(_ text: String) async throws -> String { let sourceLang = configuration?.source?.languageCode?.identifier ?? "en" + let targetLang = configuration?.target?.languageCode?.identifier ?? "ko" let encoded = text.addingPercentEncoding(withAllowedCharacters: .urlQueryAllowed) ?? 
text - let urlString = "https://translate.googleapis.com/translate_a/single?client=gtx&sl=\(sourceLang)&tl=ko&dt=t&q=\(encoded)" + let urlString = "https://translate.googleapis.com/translate_a/single?client=gtx&sl=\(sourceLang)&tl=\(targetLang)&dt=t&q=\(encoded)" guard let url = URL(string: urlString) else { return text } diff --git a/OST/Sources/UI/LanguagePickerView.swift b/OST/Sources/UI/LanguagePickerView.swift index 0451900..f4e1e58 100644 --- a/OST/Sources/UI/LanguagePickerView.swift +++ b/OST/Sources/UI/LanguagePickerView.swift @@ -50,7 +50,7 @@ struct LanguagePickerView: View { .accessibilityLabel("Swap source and target languages") .accessibilityHint("Exchanges the source and target language selections") .keyboardShortcut("s", modifiers: [.command, .shift]) - .disabled(settings.sourceLanguage == settings.targetLanguage) + .disabled(settings.sourceLanguage == settings.targetLanguage || settings.sourceLanguage == "auto") } } .formStyle(.grouped) @@ -103,9 +103,12 @@ struct LanguagePickerView: View { // MARK: - Actions private func swapLanguages() { + guard settings.sourceLanguage != "auto" else { return } + let oldSource = sourceLanguage.displayName + let oldTarget = targetLanguage.displayName let previous = settings.sourceLanguage settings.sourceLanguage = settings.targetLanguage settings.targetLanguage = previous - AccessibilityManager.announce("Languages swapped: \(targetLanguage.displayName) to \(sourceLanguage.displayName)") + AccessibilityManager.announce("Languages swapped: \(oldTarget) to \(oldSource)") } } diff --git a/OST/Sources/UI/MenuBarView.swift b/OST/Sources/UI/MenuBarView.swift index 0b97c07..88e44fc 100644 --- a/OST/Sources/UI/MenuBarView.swift +++ b/OST/Sources/UI/MenuBarView.swift @@ -11,6 +11,11 @@ struct MenuBarView: View { let onToggleOverlayLock: (Bool) -> Void let onQuit: () -> Void + private var sourceLanguageDisplay: String { + if settings.sourceLanguage == "auto" { return "Auto" } + return (SupportedLanguage(rawValue: 
settings.sourceLanguage) ?? .english).displayName + } + private var sourceLanguage: SupportedLanguage { SupportedLanguage(rawValue: settings.sourceLanguage) ?? .english } @@ -31,8 +36,8 @@ struct MenuBarView: View { Divider() - Text("\(sourceLanguage.flagEmoji) \(sourceLanguage.displayName) → \(targetLanguage.flagEmoji) \(targetLanguage.displayName)") - .accessibilityLabel("Translating from \(sourceLanguage.displayName) to \(targetLanguage.displayName)") + Text("\(settings.sourceLanguage == "auto" ? "🌐" : sourceLanguage.flagEmoji) \(sourceLanguageDisplay) → \(targetLanguage.flagEmoji) \(targetLanguage.displayName)") + .accessibilityLabel("Translating from \(sourceLanguageDisplay) to \(targetLanguage.displayName)") Divider() diff --git a/OST/Sources/UI/SubtitleView.swift b/OST/Sources/UI/SubtitleView.swift index 35b5335..6966c02 100644 --- a/OST/Sources/UI/SubtitleView.swift +++ b/OST/Sources/UI/SubtitleView.swift @@ -6,20 +6,48 @@ struct SubtitleView: View { @ObservedObject var settings: UserSettings @ObservedObject var translationService: TranslationService + @State private var isAtBottom = true + var body: some View { - VStack(alignment: .leading, spacing: 6) { - Spacer(minLength: 0) + ScrollViewReader { proxy in + ScrollView(.vertical, showsIndicators: !settings.overlayLocked) { + VStack(alignment: .leading, spacing: 6) { + Spacer(minLength: 0) - ForEach(appState.subtitleEntries) { entry in - subtitleRow(entry) - .transition(.opacity) - } + ForEach(appState.subtitleEntries) { entry in + subtitleRow(entry) + .transition(.opacity) + } - if !appState.liveText.isEmpty && settings.showOriginalText { - Text(appState.liveText) - .font(.system(size: settings.fontSize)) - .foregroundColor(settings.fontColor.opacity(0.6)) - .fixedSize(horizontal: false, vertical: true) + if !appState.liveText.isEmpty && settings.showOriginalText { + Text(appState.liveText) + .font(.system(size: settings.fontSize)) + .foregroundColor(settings.fontColor.opacity(0.6)) + 
.fixedSize(horizontal: false, vertical: true) + } + + Color.clear + .frame(height: 0) + .id("bottom") + } + .frame(maxWidth: .infinity, minHeight: 0, alignment: .bottomLeading) + } + .scrollDisabled(settings.overlayLocked) + .onScrollGeometryChange(for: Bool.self) { geometry in + let atBottom = geometry.contentOffset.y + geometry.containerSize.height >= geometry.contentSize.height - 10 + return atBottom + } action: { _, newValue in + isAtBottom = newValue + } + .onChange(of: appState.subtitleEntries.count) { _, _ in + if isAtBottom || settings.overlayLocked { + withAnimation { proxy.scrollTo("bottom", anchor: .bottom) } + } + } + .onChange(of: appState.liveText) { _, _ in + if isAtBottom || settings.overlayLocked { + withAnimation { proxy.scrollTo("bottom", anchor: .bottom) } + } } } .padding(12) diff --git a/README.md b/README.md index dafb893..88acd1e 100644 --- a/README.md +++ b/README.md @@ -35,9 +35,14 @@ This project was entirely written by [Claude](https://claude.ai/) (Anthropic's A - Speech-to-text using SFSpeechRecognizer (on-device or server-based) - Translation via Apple Translation framework (with Google Translate fallback) - Floating, resizable overlay with customizable appearance +- Lock/Unlock overlay: locked = click-through, unlocked = move/resize/scroll +- Scrollable subtitle history (unlock mode) - Separate font size and color for original and translated text -- Configurable speech pause detection (0.5–5 seconds) -- Session history recording +- Configurable background color, opacity, speech pause, subtitle expiry +- Automatic language detection (English, Korean, Japanese, Chinese) +- Sentence-based text segmentation with pause detection +- Duplicate text overlap detection on recognition restart +- Session history recording with export - Menu bar app (no Dock icon) ## Requirements @@ -46,17 +51,42 @@ This project was entirely written by [Claude](https://claude.ai/) (Anthropic's A - Apple Silicon (arm64) - Xcode Command Line Tools (`xcode-select 
--install`) -## Permissions +## Setup Guide -On first launch, macOS will prompt for: +### 1. Required Permissions -- **Screen Recording** — for system audio capture -- **Speech Recognition** — for SFSpeechRecognizer +On first launch, macOS will prompt for the following permissions. If not prompted, enable them manually: -### Recommended Setup +| Permission | Purpose | Path | +|---|---|---| +| **Screen Recording** | System audio capture via ScreenCaptureKit | System Settings > Privacy & Security > Screen Recording | +| **Speech Recognition** | SFSpeechRecognizer access | System Settings > Privacy & Security > Speech Recognition | -- Download on-device speech model: System Settings > Keyboard > Dictation > Languages -- Download translation language pack: System Settings > General > Language & Region > Translation Languages +### 2. Enable Siri & Dictation + +Speech recognition (especially server-based) requires Siri & Dictation to be enabled: + +- **System Settings > Siri & Spotlight > Siri** — Turn on (or "Listen for...") +- If using on-device recognition, Siri does not need to be active, but the speech model must be downloaded (see step 3) + +### 3. Download On-Device Speech Model (Recommended) + +For faster, offline, and more reliable recognition: + +- **System Settings > General > Keyboard > Dictation > Languages** +- Download the speech model for your source language (e.g., English, Korean, Japanese) +- After download, enable "On-device recognition" in OST Settings > Debug tab + +> Without the on-device model, server-based recognition is used (requires internet, may have higher latency). + +### 4. Download Translation Language Pack (Recommended) + +For offline translation using Apple Translation framework: + +- **System Settings > General > Language & Region > Translation Languages** +- Download the language pair you need (e.g., English ↔ Korean) + +> Without the translation pack, OST falls back to the Google Translate API (requires internet). 
## Build @@ -95,13 +125,20 @@ OST/Sources/ └── Accessibility/ AccessibilityManager ``` +## Usage Tips + +- **Lock/Unlock**: Use the menu bar toggle or Settings > Display to switch overlay modes + - **Locked**: Overlay is click-through — interact with windows behind it normally + - **Unlocked**: Drag to move, resize edges, scroll through subtitle history +- **Reset Overlay**: If the overlay becomes invisible or mispositioned, use Settings > Display > "Reset Overlay Position & Size" +- **Scroll behavior**: Auto-scrolls to latest text by default. Scroll up to pause auto-scroll; scroll back to bottom to resume + ## Known Issues -- **Endpoint detection (EPD)** — Speech segmentation relies on a simple pause timer rather than proper endpoint detection. This means subtitle boundaries depend on silence duration, not linguistic structure, which can split mid-sentence or merge unrelated phrases. -- **Automatic language detection** — Auto-detect uses NLLanguageRecognizer on the first ~15 characters, which may misidentify the language from short or ambiguous input. Detection only runs once per session — if it picks the wrong language, the entire session uses the wrong recognizer. -- **Overlay blocks clicks even when empty** — The overlay window occupies its full frame area regardless of visible text content, which can block mouse interaction with underlying windows. Use "Lock Overlay" from the menu bar to toggle click-through. -- **Translation consistency** — Translation is triggered per speech pause, not per sentence. Short or fragmented pauses may produce less coherent translations. Context from recent entries is included to mitigate this, but long conversations may still show inconsistencies. -- **Speech recognition restart gap** — SFSpeechRecognizer's recognition task expires after ~60 seconds and auto-restarts, which may cause a brief gap in recognition. 
+- **Endpoint detection (EPD)** — Speech segmentation uses a pause timer combined with sentence boundary detection, not proper endpoint detection. Subtitle boundaries may sometimes split mid-sentence or merge unrelated phrases. +- **Automatic language detection** — Auto-detect uses NLLanguageRecognizer on the first ~15 characters, which may misidentify the language from short or ambiguous input. Detection only runs once per session. +- **Translation consistency** — Translation is triggered per speech segment. Short or fragmented segments may produce less coherent translations. +- **Speech recognition restart gap** — SFSpeechRecognizer's recognition task expires after ~60 seconds and auto-restarts. Overlap detection minimizes duplicate text, but a brief gap in recognition may still occur. ## License From 5f1fa823d1faa1ff39b2076bbb2e1598040c9da1 Mon Sep 17 00:00:00 2001 From: Junghwan Park Date: Fri, 20 Feb 2026 21:33:35 +0900 Subject: [PATCH 5/5] feat: add split overlay mode, live translation, and overlay stability fixes - Add dual display mode (combined/split) with independent recognition and translation windows - Add live translation that shows translations as text is being recognized - Fix invisible overlay blocking clicks by setting explicit hosting view frame - Add screen boundary clamping to prevent overlay windows from appearing off-screen - Add "Reset All Overlay Windows" button in Settings > Display Mode - Add duplicate text filtering, punctuation-only entry removal - Improve auto-scroll to always follow latest text - Sync lock/unlock state across both overlay windows - Update README with comprehensive setup guide, usage instructions, and troubleshooting --- OST/Sources/App/AppState.swift | 56 +++++++- OST/Sources/App/OSTApp.swift | 8 +- OST/Sources/App/WindowManager.swift | 116 ++++++++++++++- OST/Sources/Settings/UserSettings.swift | 11 ++ OST/Sources/UI/FontSettingsView.swift | 47 ++++++ OST/Sources/UI/OverlayWindow.swift | 129 ++++++++++++----- 
OST/Sources/UI/RecognitionOverlayView.swift | 69 +++++++++ OST/Sources/UI/SettingsView.swift | 10 +- OST/Sources/UI/SubtitleView.swift | 10 +- OST/Sources/UI/TranslationOverlayView.swift | 78 ++++++++++ README.md | 150 ++++++++++++++------ build.sh | 2 + 12 files changed, 582 insertions(+), 104 deletions(-) create mode 100644 OST/Sources/UI/RecognitionOverlayView.swift create mode 100644 OST/Sources/UI/TranslationOverlayView.swift diff --git a/OST/Sources/App/AppState.swift b/OST/Sources/App/AppState.swift index 8f0e783..a182b32 100644 --- a/OST/Sources/App/AppState.swift +++ b/OST/Sources/App/AppState.swift @@ -22,6 +22,7 @@ final class AppState: ObservableObject { @Published var isCapturing: Bool = false @Published private(set) var subtitleEntries: [SubtitleEntry] = [] @Published private(set) var liveText: String = "" + @Published private(set) var liveTranslatedText: String = "" @Published var errorMessage: String? = nil // MARK: - Pipeline Components @@ -36,6 +37,8 @@ final class AppState: ObservableObject { private var bufferConsumerTask: Task? private var expiryTimer: Timer? private var speechPauseTimer: Timer? + private var liveTranslationTimer: Timer? + private var liveTranslationTask: Task? 
private var lastConsumedPartial: String = "" private var lastConsumedTail: String = "" private var lastSinkCurrentText: String = "" @@ -106,6 +109,11 @@ final class AppState: ObservableObject { expiryTimer = nil speechPauseTimer?.invalidate() speechPauseTimer = nil + liveTranslationTimer?.invalidate() + liveTranslationTimer = nil + liveTranslationTask?.cancel() + liveTranslationTask = nil + liveTranslatedText = "" speechRecognizer.stopRecognition() await audioCapture.stopCapture() @@ -203,6 +211,9 @@ final class AppState: ObservableObject { self.lastConsumedTail = String(self.lastConsumedPartial.suffix(60)) self.lastConsumedPartial = "" self.liveText = "" + self.liveTranslatedText = "" + self.liveTranslationTimer?.invalidate() + self.liveTranslationTask?.cancel() self.speechPauseTimer?.invalidate() self.speechPauseTimer = nil return @@ -247,10 +258,38 @@ final class AppState: ObservableObject { // Pass currentText from sink to avoid reading stale speechRecognizer.currentText self.extractCompleteSentences(sinkCurrentText: currentText) self.resetSpeechPauseTimer() + self.debounceLiveTranslation() } .store(in: &cancellables) } + private func debounceLiveTranslation() { + liveTranslationTimer?.invalidate() + guard !liveText.isEmpty else { + liveTranslatedText = "" + liveTranslationTask?.cancel() + return + } + liveTranslationTimer = Timer.scheduledTimer(withTimeInterval: 0.8, repeats: false) { [weak self] _ in + Task { @MainActor [weak self] in + guard let self, self.isCapturing, !self.liveText.isEmpty else { return } + let textToTranslate = self.liveText + self.liveTranslationTask?.cancel() + self.liveTranslationTask = Task { [weak self] in + guard let self else { return } + do { + let result = try await self.translationService.translate(textToTranslate) + if !Task.isCancelled { + self.liveTranslatedText = result + } + } catch { + // Translation failed silently — keep previous liveTranslatedText + } + } + } + } + } + private func resetSpeechPauseTimer() { 
speechPauseTimer?.invalidate() speechPauseTimer = Timer.scheduledTimer(withTimeInterval: speechPauseSeconds, repeats: false) { [weak self] _ in @@ -267,13 +306,14 @@ final class AppState: ObservableObject { let text = liveText liveText = "" + liveTranslatedText = "" lastConsumedPartial = lastSinkCurrentText AppLogger.shared.log("Pause-triggered consume: \"\(text)\"", category: .speech) let chunks = splitIntoChunks(text) for sentence in chunks { - guard !isDuplicateEntry(sentence) else { continue } + guard !isPunctuationOnly(sentence), !isDuplicateEntry(sentence) else { continue } let entry = SubtitleEntry(timestamp: Date(), recognized: sentence, isFinal: true) subtitleEntries.append(entry) translateEntry(id: entry.id, text: sentence) @@ -310,7 +350,7 @@ final class AppState: ObservableObject { guard !sentences.isEmpty else { return } for sentence in sentences { - guard !isDuplicateEntry(sentence) else { continue } + guard !isPunctuationOnly(sentence), !isDuplicateEntry(sentence) else { continue } let entry = SubtitleEntry(timestamp: Date(), recognized: sentence, isFinal: true) subtitleEntries.append(entry) translateEntry(id: entry.id, text: sentence) @@ -339,7 +379,7 @@ final class AppState: ObservableObject { AppLogger.shared.log("Consuming remaining text before reset: \"\(text)\"", category: .speech) let chunks = splitIntoChunks(text) for sentence in chunks { - guard !isDuplicateEntry(sentence) else { continue } + guard !isPunctuationOnly(sentence), !isDuplicateEntry(sentence) else { continue } let entry = SubtitleEntry(timestamp: Date(), recognized: sentence, isFinal: true) subtitleEntries.append(entry) translateEntry(id: entry.id, text: sentence) @@ -415,10 +455,16 @@ final class AppState: ObservableObject { return String(a[.. 
Bool { + let stripped = text.trimmingCharacters(in: .whitespacesAndNewlines.union(.punctuationCharacters)) + return stripped.isEmpty + } + /// Checks if the same recognized text was very recently added to avoid duplicates from recognizer reformulation. private func isDuplicateEntry(_ text: String) -> Bool { - let cutoff = Date().addingTimeInterval(-2.0) - return subtitleEntries.suffix(2).contains { $0.recognized == text && $0.timestamp > cutoff } + let cutoff = Date().addingTimeInterval(-5.0) + return subtitleEntries.suffix(4).contains { $0.recognized == text && $0.timestamp > cutoff } } /// Finds and strips overlapping text between the tail of previously consumed text and the start of new text. diff --git a/OST/Sources/App/OSTApp.swift b/OST/Sources/App/OSTApp.swift index a8f2143..7b9f87f 100644 --- a/OST/Sources/App/OSTApp.swift +++ b/OST/Sources/App/OSTApp.swift @@ -16,7 +16,13 @@ struct OSTApp: App { onOpenSettings: openSettings, onOpenLogs: { windowManager.showLogViewer() }, onOpenSessions: { windowManager.showSessionHistory(recorder: appState.sessionRecorder, alwaysOnTop: settings.sessionWindowAlwaysOnTop) }, - onToggleOverlayLock: { locked in windowManager.updateOverlayLock(locked: locked) }, + onToggleOverlayLock: { locked in + windowManager.updateOverlayLock(locked: locked) + if settings.overlayDisplayMode == "split" { + settings.overlay2Locked = locked + windowManager.updateOverlay2Lock(locked: locked) + } + }, onQuit: quitApp ) } diff --git a/OST/Sources/App/WindowManager.swift b/OST/Sources/App/WindowManager.swift index f9015e0..2aa998c 100644 --- a/OST/Sources/App/WindowManager.swift +++ b/OST/Sources/App/WindowManager.swift @@ -7,6 +7,7 @@ import SwiftUI final class WindowManager: ObservableObject { private var overlayWindow: OverlayWindow? + private var overlayWindow2: OverlayWindow? // Translation window for split mode private var settingsWindow: NSWindow? private var logWindow: NSWindow? private var sessionWindow: NSWindow? 
@@ -14,30 +15,131 @@ final class WindowManager: ObservableObject { // MARK: - Overlay func showOverlay(appState: AppState, settings: UserSettings) { + let isSplit = settings.overlayDisplayMode == "split" + + if isSplit { + showSplitOverlay(appState: appState, settings: settings) + } else { + showCombinedOverlay(appState: appState, settings: settings) + } + } + + private func showCombinedOverlay(appState: AppState, settings: UserSettings) { + // Hide any split windows + hideOverlayWindow2() + if let existing = overlayWindow { existing.makeKeyAndOrderFront(nil) return } - let window = OverlayWindow(appState: appState, settings: settings) + let view = AnyView(SubtitleView( + appState: appState, + settings: settings, + translationService: appState.translationService + )) + let window = OverlayWindow(contentView: view, settings: settings, role: .combined) window.makeKeyAndOrderFront(nil) overlayWindow = window } + private func showSplitOverlay(appState: AppState, settings: UserSettings) { + // Recognition window (primary) + if let existing = overlayWindow { + existing.makeKeyAndOrderFront(nil) + } else { + let recognitionView = AnyView(RecognitionOverlayView( + appState: appState, + settings: settings + )) + let window = OverlayWindow(contentView: recognitionView, settings: settings, role: .recognition) + window.makeKeyAndOrderFront(nil) + overlayWindow = window + } + + // Translation window (secondary) + if let existing = overlayWindow2 { + existing.makeKeyAndOrderFront(nil) + } else { + let translationView = AnyView(TranslationOverlayView( + appState: appState, + settings: settings, + translationService: appState.translationService + )) + let window = OverlayWindow(contentView: translationView, settings: settings, role: .translation) + window.makeKeyAndOrderFront(nil) + overlayWindow2 = window + } + } + func updateOverlayLock(locked: Bool) { overlayWindow?.updateLockState(locked: locked) } + func updateOverlay2Lock(locked: Bool) { + 
overlayWindow2?.updateLockState(locked: locked) + } + func resetOverlay(settings: UserSettings) { - settings.overlayLocked = true - overlayWindow?.resetFrame() - overlayWindow?.updateLockState(locked: true) - // Set AFTER resetFrame() because didResizeNotification triggers persistFrame() + let isSplit = settings.overlayDisplayMode == "split" + if isSplit { + resetAllOverlaysSideBySide(settings: settings) + } else { + settings.overlayLocked = false + overlayWindow?.resetFrame() + overlayWindow?.updateLockState(locked: false) + settings.overlayFrameSaved = false + } + } + + func resetOverlay2(settings: UserSettings) { + let isSplit = settings.overlayDisplayMode == "split" + if isSplit { + resetAllOverlaysSideBySide(settings: settings) + } else { + settings.overlay2Locked = false + overlayWindow2?.resetFrame() + overlayWindow2?.updateLockState(locked: false) + settings.overlay2FrameSaved = false + } + } + + /// Resets both overlay windows side-by-side and unlocks them. + private func resetAllOverlaysSideBySide(settings: UserSettings) { + let windowWidth: CGFloat = 500 + let windowHeight: CGFloat = 200 + let gap: CGFloat = 20 + + // Center the pair on visible screen area (accounting for dock/menu bar) + let screen = NSScreen.main?.visibleFrame ?? 
NSRect(x: 0, y: 0, width: 1440, height: 900) + let totalWidth = windowWidth * 2 + gap + let startX = screen.origin.x + max(0, (screen.width - totalWidth) / 2) + let baseY = screen.origin.y + 200 + + let leftFrame = NSRect(x: startX, y: baseY, width: windowWidth, height: windowHeight) + let rightFrame = NSRect(x: startX + windowWidth + gap, y: baseY, width: windowWidth, height: windowHeight) + + // Recognition window (left) + overlayWindow?.setFrame(leftFrame, display: true, animate: true) + settings.overlayLocked = false + overlayWindow?.updateLockState(locked: false) settings.overlayFrameSaved = false + + // Translation window (right) + overlayWindow2?.setFrame(rightFrame, display: true, animate: true) + settings.overlay2Locked = false + overlayWindow2?.updateLockState(locked: false) + settings.overlay2FrameSaved = false } func hideOverlay() { overlayWindow?.orderOut(nil) overlayWindow = nil + hideOverlayWindow2() + } + + private func hideOverlayWindow2() { + overlayWindow2?.orderOut(nil) + overlayWindow2 = nil } // MARK: - Settings @@ -53,7 +155,9 @@ final class WindowManager: ObservableObject { onOpenLogs: onOpenLogs, onOpenSessions: onOpenSessions, onResetOverlay: { [weak self] in self?.resetOverlay(settings: settings) }, - onToggleOverlayLock: { [weak self] locked in self?.updateOverlayLock(locked: locked) } + onResetOverlay2: { [weak self] in self?.resetOverlay2(settings: settings) }, + onToggleOverlayLock: { [weak self] locked in self?.updateOverlayLock(locked: locked) }, + onToggleOverlay2Lock: { [weak self] locked in self?.updateOverlay2Lock(locked: locked) } ) let window = NSWindow( contentRect: NSRect(x: 0, y: 0, width: 560, height: 480), diff --git a/OST/Sources/Settings/UserSettings.swift b/OST/Sources/Settings/UserSettings.swift index 3072b75..7888052 100644 --- a/OST/Sources/Settings/UserSettings.swift +++ b/OST/Sources/Settings/UserSettings.swift @@ -48,6 +48,17 @@ final class UserSettings: ObservableObject { @AppStorage("overlayFrameY") var 
overlayFrameY: Double = 200 @AppStorage("overlayFrameSaved") var overlayFrameSaved: Bool = false + // Display mode: "combined" (single window) or "split" (recognition + translation) + @AppStorage("overlayDisplayMode") var overlayDisplayMode: String = "combined" + + // Second overlay (translation window) frame + @AppStorage("overlay2FrameX") var overlay2FrameX: Double = 200 + @AppStorage("overlay2FrameY") var overlay2FrameY: Double = 450 + @AppStorage("overlay2Width") var overlay2Width: Double = 600 + @AppStorage("overlay2Height") var overlay2Height: Double = 200 + @AppStorage("overlay2FrameSaved") var overlay2FrameSaved: Bool = false + @AppStorage("overlay2Locked") var overlay2Locked: Bool = true + var fontColor: Color { get { Self.decodeColor(fontColorData) ?? .white } set { fontColorData = Self.encodeColor(newValue) } diff --git a/OST/Sources/UI/FontSettingsView.swift b/OST/Sources/UI/FontSettingsView.swift index 709597d..d9c5889 100644 --- a/OST/Sources/UI/FontSettingsView.swift +++ b/OST/Sources/UI/FontSettingsView.swift @@ -3,7 +3,9 @@ import SwiftUI struct FontSettingsView: View { @ObservedObject var settings: UserSettings var onResetOverlay: (() -> Void)? + var onResetOverlay2: (() -> Void)? var onToggleOverlayLock: ((Bool) -> Void)? + var onToggleOverlay2Lock: ((Bool) -> Void)? var body: some View { Form { @@ -106,6 +108,28 @@ struct FontSettingsView: View { .accessibilityHint("Toggle display of translated text") } + Section("Display Mode") { + Picker("Mode", selection: $settings.overlayDisplayMode) { + Text("Combined").tag("combined") + Text("Split (Recognition + Translation)").tag("split") + } + .pickerStyle(.menu) + + Text(settings.overlayDisplayMode == "split" + ? "Two separate windows: recognition text and translated text." 
+ : "Single window showing both recognition and translation.") + .font(.caption) + .foregroundColor(.secondary) + + Button("Reset All Overlay Windows") { + onResetOverlay?() + if settings.overlayDisplayMode == "split" { + onResetOverlay2?() + } + } + .accessibilityLabel("Reset all overlay windows to default position and size") + } + Section("Overlay Window") { Toggle("Lock Overlay", isOn: Binding( get: { settings.overlayLocked }, @@ -127,6 +151,29 @@ struct FontSettingsView: View { .accessibilityLabel("Reset overlay window to default position and size") } + if settings.overlayDisplayMode == "split" { + Section("Translation Window") { + Toggle("Lock Translation Window", isOn: Binding( + get: { settings.overlay2Locked }, + set: { newValue in + settings.overlay2Locked = newValue + onToggleOverlay2Lock?(newValue) + } + )) + .accessibilityLabel("Lock translation window position") + Text(settings.overlay2Locked + ? "Locked: clicks pass through to windows below." + : "Unlocked: drag to move or resize the translation window.") + .font(.caption) + .foregroundColor(.secondary) + + Button("Reset Translation Window Position & Size") { + onResetOverlay2?() + } + .accessibilityLabel("Reset translation window to default position and size") + } + } + Section("Live Preview") { previewSection } diff --git a/OST/Sources/UI/OverlayWindow.swift b/OST/Sources/UI/OverlayWindow.swift index f995d1b..87b6728 100644 --- a/OST/Sources/UI/OverlayWindow.swift +++ b/OST/Sources/UI/OverlayWindow.swift @@ -1,23 +1,51 @@ import AppKit import SwiftUI +/// Identifies which overlay role this window serves, determining frame persistence keys. 
+enum OverlayRole { + case combined // Single window: uses primary frame keys + case recognition // Split mode recognition: uses primary frame keys + case translation // Split mode translation: uses overlay2 frame keys +} + final class OverlayWindow: NSPanel { private let settings: UserSettings + let role: OverlayRole - init(appState: AppState, settings: UserSettings) { + init(contentView: AnyView, settings: UserSettings, role: OverlayRole = .combined) { self.settings = settings - let initialFrame: NSRect - if settings.overlayFrameSaved { - initialFrame = NSRect( - x: settings.overlayFrameX, - y: settings.overlayFrameY, - width: settings.overlayWidth, - height: settings.overlayHeight - ) - } else { - initialFrame = NSRect(x: 200, y: 200, width: settings.overlayWidth, height: settings.overlayHeight) + self.role = role + + var initialFrame: NSRect + switch role { + case .combined, .recognition: + if settings.overlayFrameSaved { + initialFrame = NSRect( + x: settings.overlayFrameX, + y: settings.overlayFrameY, + width: settings.overlayWidth, + height: settings.overlayHeight + ) + } else { + initialFrame = NSRect(x: 200, y: 200, width: settings.overlayWidth, height: settings.overlayHeight) + } + case .translation: + if settings.overlay2FrameSaved { + initialFrame = NSRect( + x: settings.overlay2FrameX, + y: settings.overlay2FrameY, + width: settings.overlay2Width, + height: settings.overlay2Height + ) + } else { + initialFrame = NSRect(x: 200, y: 450, width: settings.overlay2Width, height: settings.overlay2Height) + } } + + // Ensure frame is within visible screen area + initialFrame = Self.clampToScreen(initialFrame) + super.init( contentRect: initialFrame, styleMask: [.borderless, .nonactivatingPanel, .resizable], @@ -32,25 +60,19 @@ final class OverlayWindow: NSPanel { collectionBehavior = [.canJoinAllSpaces, .fullScreenAuxiliary] // Apply initial lock state - let locked = settings.overlayLocked + let locked = isLocked ignoresMouseEvents = locked 
isMovableByWindowBackground = !locked - let subtitleView = SubtitleView( - appState: appState, - settings: settings, - translationService: appState.translationService - ) - let hostingView = NSHostingView(rootView: subtitleView) - hostingView.sizingOptions = [] - - // Wrap in a plain NSView container to completely prevent - // NSHostingView from driving window size changes - let container = NSView() + let container = NSView(frame: NSRect(origin: .zero, size: initialFrame.size)) container.autoresizesSubviews = true + + let hostingView = NSHostingView(rootView: contentView) + hostingView.sizingOptions = [] + hostingView.frame = container.bounds hostingView.autoresizingMask = [.width, .height] container.addSubview(hostingView) - contentView = container + self.contentView = container NotificationCenter.default.addObserver( self, selector: #selector(persistFrame), @@ -62,30 +84,59 @@ final class OverlayWindow: NSPanel { ) } + private var isLocked: Bool { + switch role { + case .combined, .recognition: return settings.overlayLocked + case .translation: return settings.overlay2Locked + } + } + @objc private func persistFrame() { let f = frame - settings.overlayFrameX = f.origin.x - settings.overlayFrameY = f.origin.y - settings.overlayWidth = f.size.width - settings.overlayHeight = f.size.height - settings.overlayFrameSaved = true + switch role { + case .combined, .recognition: + settings.overlayFrameX = f.origin.x + settings.overlayFrameY = f.origin.y + settings.overlayWidth = f.size.width + settings.overlayHeight = f.size.height + settings.overlayFrameSaved = true + case .translation: + settings.overlay2FrameX = f.origin.x + settings.overlay2FrameY = f.origin.y + settings.overlay2Width = f.size.width + settings.overlay2Height = f.size.height + settings.overlay2FrameSaved = true + } } func updateLockState(locked: Bool) { - if locked { - ignoresMouseEvents = true - isMovableByWindowBackground = false - } else { - ignoresMouseEvents = false - isMovableByWindowBackground = 
true - } + ignoresMouseEvents = locked + isMovableByWindowBackground = !locked } func resetFrame() { - let defaultFrame = NSRect(x: 200, y: 200, width: 600, height: 200) + let defaultFrame: NSRect + switch role { + case .combined, .recognition: + defaultFrame = NSRect(x: 200, y: 200, width: 600, height: 200) + case .translation: + defaultFrame = NSRect(x: 200, y: 450, width: 600, height: 200) + } setFrame(defaultFrame, display: true, animate: true) - // Note: persistFrame() will be called via didResizeNotification. - // WindowManager.resetOverlay() sets overlayFrameSaved = false AFTER this call. + } + + /// Ensures the frame is at least partially visible on screen. + private static func clampToScreen(_ frame: NSRect) -> NSRect { + guard let screen = NSScreen.main?.visibleFrame else { return frame } + var f = frame + // Ensure minimum size + f.size.width = max(f.size.width, 200) + f.size.height = max(f.size.height, 100) + // Clamp position so at least 100px is visible on screen + let minVisible: CGFloat = 100 + f.origin.x = max(screen.minX - f.width + minVisible, min(f.origin.x, screen.maxX - minVisible)) + f.origin.y = max(screen.minY, min(f.origin.y, screen.maxY - 40)) + return f } deinit { diff --git a/OST/Sources/UI/RecognitionOverlayView.swift b/OST/Sources/UI/RecognitionOverlayView.swift new file mode 100644 index 0000000..f32f0df --- /dev/null +++ b/OST/Sources/UI/RecognitionOverlayView.swift @@ -0,0 +1,69 @@ +import SwiftUI + +/// Overlay view for split mode: shows only recognized text (no translation). 
+struct RecognitionOverlayView: View { + @ObservedObject var appState: AppState + @ObservedObject var settings: UserSettings + + @State private var isAtBottom = true + + var body: some View { + ScrollViewReader { proxy in + ScrollView(.vertical, showsIndicators: !settings.overlayLocked) { + VStack(alignment: .leading, spacing: 6) { + Spacer(minLength: 0) + + ForEach(appState.subtitleEntries) { entry in + Text(entry.recognized) + .font(.system(size: settings.fontSize)) + .foregroundColor(settings.fontColor) + .fixedSize(horizontal: false, vertical: true) + .transition(.opacity) + } + + if !appState.liveText.isEmpty { + Text(appState.liveText) + .font(.system(size: settings.fontSize)) + .foregroundColor(settings.fontColor.opacity(0.6)) + .fixedSize(horizontal: false, vertical: true) + } + + Color.clear + .frame(height: 16) + .id("bottom") + } + .frame(maxWidth: .infinity, minHeight: 0, alignment: .bottomLeading) + } + .scrollDisabled(settings.overlayLocked) + .onScrollGeometryChange(for: Bool.self) { geometry in + let atBottom = geometry.contentOffset.y + geometry.containerSize.height >= geometry.contentSize.height - 10 + return atBottom + } action: { _, newValue in + isAtBottom = newValue + } + .onChange(of: appState.subtitleEntries.count) { _, _ in + withAnimation { proxy.scrollTo("bottom", anchor: .bottom) } + } + .onChange(of: appState.liveText) { _, _ in + withAnimation { proxy.scrollTo("bottom", anchor: .bottom) } + } + } + .padding(12) + .frame(maxWidth: .infinity, maxHeight: .infinity, alignment: .bottomLeading) + .clipped() + .background( + RoundedRectangle(cornerRadius: 8) + .fill(settings.backgroundColor.opacity(settings.backgroundOpacity)) + ) + .overlay( + RoundedRectangle(cornerRadius: 8) + .stroke( + settings.overlayLocked + ? Color.white.opacity(0.15) + : Color.accentColor.opacity(0.6), + lineWidth: settings.overlayLocked ? 
1 : 2 + ) + ) + .animation(.easeInOut(duration: 0.2), value: appState.subtitleEntries.count) + } +} diff --git a/OST/Sources/UI/SettingsView.swift b/OST/Sources/UI/SettingsView.swift index 55dfac3..ce36ccc 100644 --- a/OST/Sources/UI/SettingsView.swift +++ b/OST/Sources/UI/SettingsView.swift @@ -5,11 +5,19 @@ struct SettingsView: View { let onOpenLogs: () -> Void let onOpenSessions: () -> Void var onResetOverlay: (() -> Void)? + var onResetOverlay2: (() -> Void)? var onToggleOverlayLock: ((Bool) -> Void)? + var onToggleOverlay2Lock: ((Bool) -> Void)? var body: some View { TabView { - FontSettingsView(settings: settings, onResetOverlay: onResetOverlay, onToggleOverlayLock: onToggleOverlayLock) + FontSettingsView( + settings: settings, + onResetOverlay: onResetOverlay, + onResetOverlay2: onResetOverlay2, + onToggleOverlayLock: onToggleOverlayLock, + onToggleOverlay2Lock: onToggleOverlay2Lock + ) .tabItem { Label("Display", systemImage: "textformat.size") } diff --git a/OST/Sources/UI/SubtitleView.swift b/OST/Sources/UI/SubtitleView.swift index 6966c02..bdb68e9 100644 --- a/OST/Sources/UI/SubtitleView.swift +++ b/OST/Sources/UI/SubtitleView.swift @@ -27,7 +27,7 @@ struct SubtitleView: View { } Color.clear - .frame(height: 0) + .frame(height: 16) .id("bottom") } .frame(maxWidth: .infinity, minHeight: 0, alignment: .bottomLeading) @@ -40,14 +40,10 @@ struct SubtitleView: View { isAtBottom = newValue } .onChange(of: appState.subtitleEntries.count) { _, _ in - if isAtBottom || settings.overlayLocked { - withAnimation { proxy.scrollTo("bottom", anchor: .bottom) } - } + withAnimation { proxy.scrollTo("bottom", anchor: .bottom) } } .onChange(of: appState.liveText) { _, _ in - if isAtBottom || settings.overlayLocked { - withAnimation { proxy.scrollTo("bottom", anchor: .bottom) } - } + withAnimation { proxy.scrollTo("bottom", anchor: .bottom) } } } .padding(12) diff --git a/OST/Sources/UI/TranslationOverlayView.swift b/OST/Sources/UI/TranslationOverlayView.swift new file mode 
100644 index 0000000..d6417e3 --- /dev/null +++ b/OST/Sources/UI/TranslationOverlayView.swift @@ -0,0 +1,78 @@ +import SwiftUI +import Translation + +/// Overlay view for split mode: shows only translated text. +/// Hosts the .translationTask modifier to receive translation sessions. +struct TranslationOverlayView: View { + @ObservedObject var appState: AppState + @ObservedObject var settings: UserSettings + @ObservedObject var translationService: TranslationService + + @State private var isAtBottom = true + + var body: some View { + ScrollViewReader { proxy in + ScrollView(.vertical, showsIndicators: !settings.overlay2Locked) { + VStack(alignment: .leading, spacing: 6) { + Spacer(minLength: 0) + + ForEach(appState.subtitleEntries) { entry in + Text(entry.translated.isEmpty ? "..." : entry.translated) + .font(.system(size: settings.translatedFontSize)) + .foregroundColor(entry.translated.isEmpty + ? settings.translatedFontColor.opacity(0.4) + : settings.translatedFontColor) + .fixedSize(horizontal: false, vertical: true) + .transition(.opacity) + } + + if !appState.liveTranslatedText.isEmpty { + Text(appState.liveTranslatedText) + .font(.system(size: settings.translatedFontSize)) + .foregroundColor(settings.translatedFontColor.opacity(0.6)) + .fixedSize(horizontal: false, vertical: true) + } + + Color.clear + .frame(height: 16) + .id("bottom") + } + .frame(maxWidth: .infinity, minHeight: 0, alignment: .bottomLeading) + } + .scrollDisabled(settings.overlay2Locked) + .onScrollGeometryChange(for: Bool.self) { geometry in + let atBottom = geometry.contentOffset.y + geometry.containerSize.height >= geometry.contentSize.height - 10 + return atBottom + } action: { _, newValue in + isAtBottom = newValue + } + .onChange(of: appState.subtitleEntries.count) { _, _ in + withAnimation { proxy.scrollTo("bottom", anchor: .bottom) } + } + .onChange(of: appState.liveTranslatedText) { _, _ in + withAnimation { proxy.scrollTo("bottom", anchor: .bottom) } + } + } + .padding(12) + 
.frame(maxWidth: .infinity, maxHeight: .infinity, alignment: .bottomLeading) + .clipped() + .background( + RoundedRectangle(cornerRadius: 8) + .fill(settings.backgroundColor.opacity(settings.backgroundOpacity)) + ) + .overlay( + RoundedRectangle(cornerRadius: 8) + .stroke( + settings.overlay2Locked + ? Color.white.opacity(0.15) + : Color.accentColor.opacity(0.6), + lineWidth: settings.overlay2Locked ? 1 : 2 + ) + ) + .animation(.easeInOut(duration: 0.2), value: appState.subtitleEntries.count) + .translationTask(translationService.configuration) { session in + AppLogger.shared.log("Translation session delivered by .translationTask", category: .translation) + translationService.handleSession(session) + } + } +} diff --git a/README.md b/README.md index 88acd1e..142c797 100644 --- a/README.md +++ b/README.md @@ -31,70 +31,84 @@ This project was entirely written by [Claude](https://claude.ai/) (Anthropic's A ## Features -- Real-time system audio capture via ScreenCaptureKit -- Speech-to-text using SFSpeechRecognizer (on-device or server-based) -- Translation via Apple Translation framework (with Google Translate fallback) -- Floating, resizable overlay with customizable appearance -- Lock/Unlock overlay: locked = click-through, unlocked = move/resize/scroll -- Scrollable subtitle history (unlock mode) -- Separate font size and color for original and translated text -- Configurable background color, opacity, speech pause, subtitle expiry -- Automatic language detection (English, Korean, Japanese, Chinese) -- Sentence-based text segmentation with pause detection -- Duplicate text overlap detection on recognition restart -- Session history recording with export -- Menu bar app (no Dock icon) +- **Real-time system audio capture** via ScreenCaptureKit (16kHz mono PCM) +- **Speech-to-text** using SFSpeechRecognizer (on-device or server-based) +- **Live translation** via Apple Translation framework — translates text as it's being recognized, not just after finalization +- 
**Dual display modes**: + - **Combined** — single overlay with both recognized and translated text + - **Split** — separate recognition and translation windows, independently positionable +- **Floating overlay** — resizable, movable, always-on-top window with customizable appearance +- **Lock/Unlock** — locked = click-through, unlocked = move/resize/scroll +- **Scrollable subtitle history** with auto-scroll +- **Customizable appearance** — separate font size/color for original and translated text, background color/opacity +- **Automatic language detection** (English, Korean, Japanese, Chinese) +- **Smart text processing** — sentence-based segmentation, pause detection, duplicate filtering, punctuation cleanup +- **Session history** recording with export +- **Menu bar app** — no Dock icon, minimal footprint ## Requirements - macOS 15.0 (Sequoia) or later - Apple Silicon (arm64) -- Xcode Command Line Tools (`xcode-select --install`) +- Xcode Command Line Tools ## Setup Guide -### 1. Required Permissions +### Step 1: Install Command Line Tools + +```bash +xcode-select --install +``` + +### Step 2: Grant Required Permissions On first launch, macOS will prompt for the following permissions. If not prompted, enable them manually: -| Permission | Purpose | Path | +| Permission | Purpose | How to Enable | |---|---|---| -| **Screen Recording** | System audio capture via ScreenCaptureKit | System Settings > Privacy & Security > Screen Recording | -| **Speech Recognition** | SFSpeechRecognizer access | System Settings > Privacy & Security > Speech Recognition | +| **Screen Recording** | System audio capture via ScreenCaptureKit | System Settings > Privacy & Security > Screen Recording > Enable OST | +| **Speech Recognition** | SFSpeechRecognizer access | System Settings > Privacy & Security > Speech Recognition > Enable OST | + +> After granting permissions, you may need to restart OST for changes to take effect. -### 2. 
Enable Siri & Dictation +### Step 3: Enable Siri & Dictation Speech recognition (especially server-based) requires Siri & Dictation to be enabled: -- **System Settings > Siri & Spotlight > Siri** — Turn on (or "Listen for...") -- If using on-device recognition, Siri does not need to be active, but the speech model must be downloaded (see step 3) +1. Open **System Settings > Siri & Spotlight** +2. Turn on **Siri** (or "Listen for...") +3. If using on-device recognition only, Siri does not need to be active — but the speech model must be downloaded (see Step 4) -### 3. Download On-Device Speech Model (Recommended) +### Step 4: Download On-Device Speech Model (Recommended) For faster, offline, and more reliable recognition: -- **System Settings > General > Keyboard > Dictation > Languages** -- Download the speech model for your source language (e.g., English, Korean, Japanese) -- After download, enable "On-device recognition" in OST Settings > Debug tab +1. Open **System Settings > Keyboard > Dictation** +2. Under **Languages**, download the speech model for your source language (e.g., English, Korean, Japanese) +3. After download, enable **"On-device recognition"** in OST Settings > Languages tab -> Without the on-device model, server-based recognition is used (requires internet, may have higher latency). +> Without the on-device model, server-based recognition is used. This requires internet and may have higher latency. -### 4. Download Translation Language Pack (Recommended) +### Step 5: Download Translation Language Pack (Recommended) For offline translation using Apple Translation framework: -- **System Settings > General > Language & Region > Translation Languages** -- Download the language pair you need (e.g., English ↔ Korean) +1. Open **System Settings > General > Language & Region > Translation Languages** +2. 
Download the language pair you need (e.g., English ↔ Korean) -> Without the translation pack, OST falls back to the Google Translate API (requires internet). +> Without the translation pack, translation will not work offline. ## Build ```bash +# Clone the repository +git clone https://github.com/9bow/ost-on-screen-translator.git +cd ost-on-screen-translator + # Full build → produces build/OST.app ./build.sh -# Type-check only +# Type-check only (no binary) ./build.sh --typecheck # Clean build @@ -106,32 +120,78 @@ open build/OST.app No Xcode project is required. The build script compiles all Swift sources via `xcrun swiftc`. +> If macOS blocks the app on first run, execute: +> ```bash +> xattr -dr com.apple.quarantine build/OST.app +> ``` + +## Usage + +### Starting a Session + +1. Click the **captions bubble icon** in the menu bar +2. Select source and target languages (or use "Auto" for automatic detection) +3. Click **Start** to begin capturing system audio +4. The overlay window(s) will appear with live transcription and translation + +### Overlay Controls + +| Action | How | +|---|---| +| **Lock/Unlock** | Menu bar > Lock Overlay, or Settings > Display > Overlay Window | +| **Move** | Unlock, then drag the overlay window | +| **Resize** | Unlock, then drag the window edges | +| **Scroll** | Unlock, then scroll through subtitle history | +| **Reset position** | Settings > Display > "Reset All Overlay Windows" | + +- **Locked mode**: The overlay is click-through — interact with windows behind it normally +- **Unlocked mode**: Drag to move, resize edges, scroll through subtitle history. Auto-scrolls to the latest text + +### Display Modes + +Configure in **Settings > Display > Mode**: + +- **Combined**: Single window showing both original and translated text +- **Split**: Two separate windows — recognition (original text) and translation. Each window can be independently positioned and resized. 
Lock/Unlock applies to both windows simultaneously + +### Tips + +- **Speech Pause**: Adjust in Settings > Display > "Speech Pause" slider. Shorter values finalize text faster; longer values wait for natural sentence endings +- **Subtitle Expiry**: Old subtitles automatically fade after the configured time (default 10s) +- **Max Lines**: Control how many subtitle entries are visible at once +- **Session History**: View past transcription sessions via menu bar > Session History. Sessions can be exported for reference + ## Architecture ``` -ScreenCaptureKit (16kHz mono) → SpeechRecognizer → AppState → TranslationService → SubtitleView +ScreenCaptureKit (16kHz mono) → SpeechRecognizer → AppState → TranslationService → Overlay Views + SystemAudioCapture SFSpeech entries Translation.framework NSPanel ``` ### Source Layout ``` OST/Sources/ -├── App/ AppState, OSTApp, WindowManager, Logger, SessionRecorder -├── Audio/ SystemAudioCapture (ScreenCaptureKit) -├── Speech/ SpeechRecognizer, SupportedLanguages -├── Translation/ TranslationService, TranslationConfig -├── Settings/ UserSettings -├── UI/ SubtitleView, OverlayWindow, MenuBarView, SettingsView, etc. -└── Accessibility/ AccessibilityManager +├── App/ AppState, OSTApp, WindowManager, Logger, SessionRecorder +├── Audio/ SystemAudioCapture (ScreenCaptureKit) +├── Speech/ SpeechRecognizer, SupportedLanguages +├── Translation/ TranslationService, TranslationConfig +├── Settings/ UserSettings +├── UI/ SubtitleView, RecognitionOverlayView, TranslationOverlayView, +│ OverlayWindow, MenuBarView, SettingsView, FontSettingsView, etc. 
+└── Accessibility/ AccessibilityManager ``` -## Usage Tips +## Troubleshooting -- **Lock/Unlock**: Use the menu bar toggle or Settings > Display to switch overlay modes - - **Locked**: Overlay is click-through — interact with windows behind it normally - - **Unlocked**: Drag to move, resize edges, scroll through subtitle history -- **Reset Overlay**: If the overlay becomes invisible or mispositioned, use Settings > Display > "Reset Overlay Position & Size" -- **Scroll behavior**: Auto-scrolls to latest text by default. Scroll up to pause auto-scroll; scroll back to bottom to resume +| Problem | Solution | +|---|---| +| No audio captured | Grant Screen Recording permission in System Settings, then restart OST | +| Speech recognition not working | Grant Speech Recognition permission; ensure Siri & Dictation is enabled | +| Translation not appearing | Download translation language pack in System Settings > General > Language & Region > Translation Languages | +| Overlay invisible but blocking clicks | Use Settings > Display > "Reset All Overlay Windows" to restore default position | +| macOS blocks the app | Run `xattr -dr com.apple.quarantine build/OST.app` | +| On-device recognition produces no results | Download the speech model for your language in System Settings > Keyboard > Dictation | ## Known Issues diff --git a/build.sh b/build.sh index bc113b5..801bd42 100755 --- a/build.sh +++ b/build.sh @@ -41,6 +41,8 @@ SOURCES=( OST/Sources/App/WindowManager.swift OST/Sources/Accessibility/AccessibilityManager.swift OST/Sources/UI/SubtitleView.swift + OST/Sources/UI/RecognitionOverlayView.swift + OST/Sources/UI/TranslationOverlayView.swift OST/Sources/UI/OverlayWindow.swift OST/Sources/UI/FontSettingsView.swift OST/Sources/UI/LanguagePickerView.swift