From 831d233fda83b2ddff96b5e75b703404322e6488 Mon Sep 17 00:00:00 2001 From: Alex-Wengg Date: Sun, 29 Jun 2025 18:02:05 -0400 Subject: [PATCH 1/6] Add Benchmark speed test report --- Sources/DiarizationCLI/main.swift | 853 +++++++++++++++++++++++++++++- 1 file changed, 840 insertions(+), 13 deletions(-) diff --git a/Sources/DiarizationCLI/main.swift b/Sources/DiarizationCLI/main.swift index bc78b28a9..15842ae72 100644 --- a/Sources/DiarizationCLI/main.swift +++ b/Sources/DiarizationCLI/main.swift @@ -43,6 +43,8 @@ struct DiarizationCLI { benchmark Run AMI SDM benchmark evaluation with real annotations process Process a single audio file download Download datasets for benchmarking + speedtest Run end-to-end pipeline speed test + batchspeedtest Run batch speed test help Show this help message BENCHMARK OPTIONS: @@ -55,7 +57,11 @@ struct DiarizationCLI { --debug Enable debug mode --output Output results to JSON file --auto-download Automatically download dataset if not found - + --speed-test Enable speed testing with timing measurements + --speed-iterations Number of speed test iterations [default: 3] + --speed-warmup Number of warmup runs for speed test [default: 1] + --detailed-timing Show detailed per-component timing breakdown + NOTE: Benchmark now uses real AMI manual annotations from Tests/ami_public_1.6.2/ If annotations are not found, falls back to simplified placeholder. 
@@ -69,25 +75,72 @@ struct DiarizationCLI { --dataset Dataset to download (ami-sdm, ami-ihm, all) [default: all] --force Force re-download even if files exist + SPEEDTEST OPTIONS: + Audio file to test (.wav, .m4a, .mp3) + --iterations Number of test iterations [default: 5] + --warmup Number of warmup runs [default: 2] + --threshold Clustering threshold 0.0-1.0 [default: 0.7] + --min-duration-on Minimum speaker segment duration in seconds [default: 1.0] + --min-duration-off Minimum silence between speakers in seconds [default: 0.5] + --min-activity Minimum activity threshold in frames [default: 10.0] + --output Output results to JSON file + --debug Enable debug mode + --detailed Show detailed per-component timing + + NOTE: Benchmark now uses real AMI manual annotations from Tests/ami_public_1.6.2/ + If annotations are not found, falls back to simplified placeholder. + + BATCH SPEEDTEST OPTIONS: + --files Comma-separated list of audio files to test + --iterations Number of test iterations [default: 3] + --warmup Number of warmup runs [default: 1] + --threshold Clustering threshold 0.0-1.0 [default: 0.7] + --min-duration-on Minimum speaker segment duration in seconds [default: 1.0] + --min-duration-off Minimum silence between speakers in seconds [default: 0.5] + --min-activity Minimum activity threshold in frames [default: 10.0] + --debug Enable debug mode + --output Output results to JSON file + --detailed Show detailed per-file timing + EXAMPLES: # Download AMI datasets swift run fluidaudio download --dataset ami-sdm - + # Run AMI SDM benchmark with auto-download swift run fluidaudio benchmark --auto-download - + # Run benchmark with custom threshold and save results swift run fluidaudio benchmark --threshold 0.8 --output results.json - + + # Run benchmark with speed testing enabled + swift run fluidaudio benchmark --speed-test --speed-iterations 5 --detailed-timing + + # Run benchmark with both accuracy and speed testing + swift run fluidaudio benchmark 
--speed-test --threshold 0.7 --output comprehensive_results.json + # Process a single audio file swift run fluidaudio process meeting.wav - + # Process file with custom settings swift run fluidaudio process meeting.wav --threshold 0.6 --output output.json + + # Run speed test on audio file + swift run fluidaudio speedtest meeting.wav + + # Run speed test with custom iterations and detailed timing + swift run fluidaudio speedtest meeting.wav --iterations 10 --warmup 3 --detailed + + # Run speed test with custom parameters and save results + swift run fluidaudio speedtest meeting.wav --threshold 0.8 --output speed_results.json + + # Run batch speed test on multiple files + swift run fluidaudio batchspeedtest --files test1.wav,test2.wav --iterations 5 --warmup 2 --detailed """) } static func runBenchmark(arguments: [String]) async { + let benchmarkStartTime = Date() + var dataset = "ami-sdm" var threshold: Float = 0.7 var minDurationOn: Float = 1.0 @@ -189,6 +242,9 @@ struct DiarizationCLI { print("šŸ’” Supported datasets: ami-sdm, ami-ihm") exit(1) } + + let benchmarkElapsed = Date().timeIntervalSince(benchmarkStartTime) + print("\nā±ļø Total benchmark execution time: \(String(format: "%.1f", benchmarkElapsed)) seconds") } static func downloadDataset(arguments: [String]) async { @@ -326,6 +382,560 @@ struct DiarizationCLI { } } + static func runSpeedTest(arguments: [String]) async { + guard !arguments.isEmpty else { + print("āŒ No audio file specified") + printUsage() + exit(1) + } + + let audioFile = arguments[0] + + // Check for help flags first + if audioFile == "--help" || audioFile == "-h" { + printUsage() + return + } + + var iterations = 5 + var warmupRuns = 2 + var threshold: Float = 0.7 + var minDurationOn: Float = 1.0 + var minDurationOff: Float = 0.5 + var minActivityThreshold: Float = 10.0 + var debugMode = false + var outputFile: String? 
+ var detailedTiming = false + + // Parse remaining arguments + var i = 1 + while i < arguments.count { + switch arguments[i] { + case "--iterations": + if i + 1 < arguments.count { + iterations = Int(arguments[i + 1]) ?? 5 + i += 1 + } + case "--warmup": + if i + 1 < arguments.count { + warmupRuns = Int(arguments[i + 1]) ?? 2 + i += 1 + } + case "--threshold": + if i + 1 < arguments.count { + threshold = Float(arguments[i + 1]) ?? 0.7 + i += 1 + } + case "--min-duration-on": + if i + 1 < arguments.count { + minDurationOn = Float(arguments[i + 1]) ?? 1.0 + i += 1 + } + case "--min-duration-off": + if i + 1 < arguments.count { + minDurationOff = Float(arguments[i + 1]) ?? 0.5 + i += 1 + } + case "--min-activity": + if i + 1 < arguments.count { + minActivityThreshold = Float(arguments[i + 1]) ?? 10.0 + i += 1 + } + case "--debug": + debugMode = true + case "--output": + if i + 1 < arguments.count { + outputFile = arguments[i + 1] + i += 1 + } + case "--detailed": + detailedTiming = true + default: + print("āš ļø Unknown option: \(arguments[i])") + } + i += 1 + } + + print("⚔ Starting End-to-End Pipeline Speed Test") + print(" Audio file: \(audioFile)") + print(" Iterations: \(iterations)") + print(" Warmup runs: \(warmupRuns)") + print(" Clustering threshold: \(threshold)") + print(" Min duration on: \(minDurationOn)s") + print(" Min duration off: \(minDurationOff)s") + print(" Min activity threshold: \(minActivityThreshold)") + print(" Debug mode: \(debugMode ? "enabled" : "disabled")") + print(" Detailed timing: \(detailedTiming ? 
"enabled" : "disabled")") + + let config = DiarizerConfig( + clusteringThreshold: threshold, + minDurationOn: minDurationOn, + minDurationOff: minDurationOff, + minActivityThreshold: minActivityThreshold, + debugMode: debugMode + ) + + let manager = DiarizerManager(config: config) + + do { + try await manager.initialize() + print("āœ… Models initialized successfully") + } catch { + print("āŒ Failed to initialize models: \(error)") + print("šŸ’” Make sure you have network access for model downloads") + exit(1) + } + + // Load audio file once + let audioSamples: [Float] + do { + audioSamples = try await loadAudioFile(path: audioFile) + let duration = Float(audioSamples.count) / 16000.0 + print("āœ… Loaded audio: \(audioSamples.count) samples (\(String(format: "%.1f", duration))s)") + } catch { + print("āŒ Failed to load audio file: \(error)") + exit(1) + } + + // Run warmup iterations + print("\nšŸ”„ Running \(warmupRuns) warmup iterations...") + for i in 1...warmupRuns { + print(" Warmup \(i)/\(warmupRuns)...") + do { + let _ = try await manager.performCompleteDiarization(audioSamples, sampleRate: 16000) + } catch { + print(" āš ļø Warmup \(i) failed: \(error)") + } + } + + // Run actual speed test iterations + print("\n⚔ Running \(iterations) speed test iterations...") + var timingResults: [SpeedTestResult] = [] + let duration = Float(audioSamples.count) / 16000.0 + + for i in 1...iterations { + print(" Iteration \(i)/\(iterations)...") + + let startTime = Date() + do { + let result = try await manager.performCompleteDiarization(audioSamples, sampleRate: 16000) + let processingTime = Date().timeIntervalSince(startTime) + let rtf = Float(processingTime) / duration + + let speedResult = SpeedTestResult( + iteration: i, + processingTimeSeconds: processingTime, + realTimeFactor: rtf, + speakerCount: result.speakerDatabase.count, + segmentCount: result.segments.count, + audioDurationSeconds: duration + ) + + timingResults.append(speedResult) + print(" āœ… RTF: 
\(String(format: "%.2f", rtf))x, \(result.speakerDatabase.count) speakers, \(result.segments.count) segments") + + } catch { + print(" āŒ Iteration \(i) failed: \(error)") + } + } + + guard !timingResults.isEmpty else { + print("āŒ No successful iterations completed") + return + } + + // Calculate statistics + let avgRTF = timingResults.map { $0.realTimeFactor }.reduce(0, +) / Float(timingResults.count) + let avgProcessingTime = timingResults.map { $0.processingTimeSeconds }.reduce(0, +) / Double(timingResults.count) + let minRTF = timingResults.map { $0.realTimeFactor }.min()! + let maxRTF = timingResults.map { $0.realTimeFactor }.max()! + let stdDevRTF = calculateStandardDeviation(timingResults.map { $0.realTimeFactor }) + + // Print results + printSpeedTestResults( + timingResults, + avgRTF: avgRTF, + avgProcessingTime: avgProcessingTime, + minRTF: minRTF, + maxRTF: maxRTF, + stdDevRTF: stdDevRTF, + audioFile: audioFile, + detailed: detailedTiming + ) + + // Save results if requested + if let outputFile = outputFile { + let summary = SpeedTestSummary( + audioFile: audioFile, + iterations: iterations, + warmupRuns: warmupRuns, + averageRTF: avgRTF, + averageProcessingTime: avgProcessingTime, + minRTF: minRTF, + maxRTF: maxRTF, + stdDevRTF: stdDevRTF, + results: timingResults, + config: config + ) + + do { + try await saveSpeedTestResults(summary, to: outputFile) + print("šŸ’¾ Speed test results saved to: \(outputFile)") + } catch { + print("āš ļø Failed to save results: \(error)") + } + } + } + + static func runBatchSpeedTest(arguments: [String]) async { + // Check for help flags first + if arguments.contains("--help") || arguments.contains("-h") { + printUsage() + return + } + + var audioFiles: [String] = [] + var iterations = 3 + var warmupRuns = 1 + var threshold: Float = 0.7 + var minDurationOn: Float = 1.0 + var minDurationOff: Float = 0.5 + var minActivityThreshold: Float = 10.0 + var debugMode = false + var outputFile: String? 
+ var detailedTiming = false + + // Parse arguments + var i = 0 + while i < arguments.count { + switch arguments[i] { + case "--files": + // Collect all file paths until next option + i += 1 + while i < arguments.count && !arguments[i].hasPrefix("--") { + audioFiles.append(arguments[i]) + i += 1 } + continue + case "--iterations": + if i + 1 < arguments.count { + iterations = Int(arguments[i + 1]) ?? 3 + i += 1 + } + case "--warmup": + if i + 1 < arguments.count { + warmupRuns = Int(arguments[i + 1]) ?? 1 + i += 1 + } + case "--threshold": + if i + 1 < arguments.count { + threshold = Float(arguments[i + 1]) ?? 0.7 + i += 1 + } + case "--min-duration-on": + if i + 1 < arguments.count { + minDurationOn = Float(arguments[i + 1]) ?? 1.0 + i += 1 + } + case "--min-duration-off": + if i + 1 < arguments.count { + minDurationOff = Float(arguments[i + 1]) ?? 0.5 + i += 1 + } + case "--min-activity": + if i + 1 < arguments.count { + minActivityThreshold = Float(arguments[i + 1]) ?? 10.0 + i += 1 + } + case "--debug": + debugMode = true + case "--output": + if i + 1 < arguments.count { + outputFile = arguments[i + 1] + i += 1 + } + case "--detailed": + detailedTiming = true + default: + if !arguments[i].hasPrefix("--") { + audioFiles.append(arguments[i]) + } else { + print("āš ļø Unknown option: \(arguments[i])") + } + } + i += 1 + } + + // If no files specified, use default test files + if audioFiles.isEmpty { + print("šŸ“ No audio files specified, using default test files...") + // You can add default test files here + audioFiles = ["test1.wav", "test2.wav"] // Placeholder + } + + print("⚔ Starting Batch Speed Test") + print(" Audio files: \(audioFiles.count)") + print(" Iterations per file: \(iterations)") + print(" Warmup runs: \(warmupRuns)") + print(" Clustering threshold: \(threshold)") + print(" Min duration on: \(minDurationOn)s") + print(" Min duration off: \(minDurationOff)s") + print(" Min activity threshold: \(minActivityThreshold)") + + let config = 
DiarizerConfig( + clusteringThreshold: threshold, + minDurationOn: minDurationOn, + minDurationOff: minDurationOff, + minActivityThreshold: minActivityThreshold, + debugMode: debugMode + ) + + let manager = DiarizerManager(config: config) + + do { + try await manager.initialize() + print("āœ… Models initialized successfully") + } catch { + print("āŒ Failed to initialize models: \(error)") + exit(1) + } + + var allResults: [BatchSpeedTestResult] = [] + var totalProcessingTime: Double = 0 + var totalAudioDuration: Float = 0 + + for (fileIndex, audioFile) in audioFiles.enumerated() { + print("\nšŸ“ Testing file \(fileIndex + 1)/\(audioFiles.count): \(audioFile)") + + guard FileManager.default.fileExists(atPath: audioFile) else { + print(" āŒ File not found: \(audioFile)") + continue + } + + // Load audio file + let audioSamples: [Float] + do { + audioSamples = try await loadAudioFile(path: audioFile) + let duration = Float(audioSamples.count) / 16000.0 + print(" āœ… Loaded audio: \(String(format: "%.1f", duration))s") + totalAudioDuration += duration + } catch { + print(" āŒ Failed to load audio file: \(error)") + continue + } + + // Run warmup iterations + for i in 1...warmupRuns { + print(" šŸ”„ Warmup \(i)/\(warmupRuns)...") + do { + let _ = try await manager.performCompleteDiarization(audioSamples, sampleRate: 16000) + } catch { + print(" āš ļø Warmup \(i) failed: \(error)") + } + } + + // Run speed test iterations + var fileResults: [SpeedTestResult] = [] + let duration = Float(audioSamples.count) / 16000.0 + + for i in 1...iterations { + print(" ⚔ Iteration \(i)/\(iterations)...") + + let startTime = Date() + do { + let result = try await manager.performCompleteDiarization(audioSamples, sampleRate: 16000) + let processingTime = Date().timeIntervalSince(startTime) + let rtf = Float(processingTime) / duration + totalProcessingTime += processingTime + + let speedResult = SpeedTestResult( + iteration: i, + processingTimeSeconds: processingTime, + 
realTimeFactor: rtf, + speakerCount: result.speakerDatabase.count, + segmentCount: result.segments.count, + audioDurationSeconds: duration + ) + + fileResults.append(speedResult) + print(" āœ… RTF: \(String(format: "%.2f", rtf))x") + + } catch { + print(" āŒ Iteration \(i) failed: \(error)") + } + } + + if !fileResults.isEmpty { + let avgRTF = fileResults.map { $0.realTimeFactor }.reduce(0, +) / Float(fileResults.count) + let avgProcessingTime = fileResults.map { $0.processingTimeSeconds }.reduce(0, +) / Double(fileResults.count) + let minRTF = fileResults.map { $0.realTimeFactor }.min()! + let maxRTF = fileResults.map { $0.realTimeFactor }.max()! + let stdDevRTF = calculateStandardDeviation(fileResults.map { $0.realTimeFactor }) + + let batchResult = BatchSpeedTestResult( + audioFile: audioFile, + averageRTF: avgRTF, + averageProcessingTime: avgProcessingTime, + minRTF: minRTF, + maxRTF: maxRTF, + stdDevRTF: stdDevRTF, + results: fileResults + ) + + allResults.append(batchResult) + } + } + + guard !allResults.isEmpty else { + print("āŒ No successful tests completed") + return + } + + // Calculate overall statistics + let overallAvgRTF = allResults.map { $0.averageRTF }.reduce(0, +) / Float(allResults.count) + let overallAvgProcessingTime = allResults.map { $0.averageProcessingTime }.reduce(0, +) / Double(allResults.count) + let overallMinRTF = allResults.map { $0.minRTF }.min()! + let overallMaxRTF = allResults.map { $0.maxRTF }.max()! 
+ let overallStdDevRTF = calculateStandardDeviation(allResults.map { $0.averageRTF }) + + // Print batch results + printBatchSpeedTestResults( + allResults, + overallAvgRTF: overallAvgRTF, + overallAvgProcessingTime: overallAvgProcessingTime, + overallMinRTF: overallMinRTF, + overallMaxRTF: overallMaxRTF, + overallStdDevRTF: overallStdDevRTF, + totalProcessingTime: totalProcessingTime, + totalAudioDuration: totalAudioDuration, + detailed: detailedTiming + ) + + // Save results if requested + if let outputFile = outputFile { + let summary = BatchSpeedTestSummary( + audioFiles: audioFiles, + iterations: iterations, + warmupRuns: warmupRuns, + overallAverageRTF: overallAvgRTF, + overallAverageProcessingTime: overallAvgProcessingTime, + overallMinRTF: overallMinRTF, + overallMaxRTF: overallMaxRTF, + overallStdDevRTF: overallStdDevRTF, + totalProcessingTime: totalProcessingTime, + totalAudioDuration: totalAudioDuration, + results: allResults, + config: config + ) + + do { + try await saveBatchSpeedTestResults(summary, to: outputFile) + print("šŸ’¾ Batch speed test results saved to: \(outputFile)") + } catch { + print("āš ļø Failed to save results: \(error)") + } + } + } + + static func printBatchSpeedTestResults( + _ results: [BatchSpeedTestResult], + overallAvgRTF: Float, + overallAvgProcessingTime: Double, + overallMinRTF: Float, + overallMaxRTF: Float, + overallStdDevRTF: Float, + totalProcessingTime: Double, + totalAudioDuration: Float, + detailed: Bool + ) { + print("\nšŸ Batch Speed Test Results") + let separator = String(repeating: "=", count: 80) + print("\(separator)") + + // Print table header + print("│ File Name │ RTF │ Processing │ Speakers │ Segments │ Duration │") + let headerSep = "ā”œā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”¼ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”¼ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”¼ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”¼ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”¼ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”¤" + print("\(headerSep)") + + // Print individual 
file results + for result in results.sorted(by: { $0.averageRTF < $1.averageRTF }) { + let fileName = result.audioFile.split(separator: "/").last.map(String.init) ?? result.audioFile + let fileNameStr = String(fileName.prefix(15)).padding(toLength: 15, withPad: " ", startingAt: 0) + let rtfStr = String(format: "%.2fx", result.averageRTF).padding(toLength: 6, withPad: " ", startingAt: 0) + let procStr = String(format: "%.1fs", result.averageProcessingTime).padding(toLength: 10, withPad: " ", startingAt: 0) + let avgSpeakers = result.results.reduce(0) { $0 + $1.speakerCount } / result.results.count + let speakerStr = String(format: "%.1f", Float(avgSpeakers)).padding(toLength: 8, withPad: " ", startingAt: 0) + let avgSegments = result.results.reduce(0) { $0 + $1.segmentCount } / result.results.count + let segmentStr = String(format: "%.1f", Float(avgSegments)).padding(toLength: 8, withPad: " ", startingAt: 0) + let durationStr = String(format: "%.1fs", result.results.first?.audioDurationSeconds ?? 
0).padding(toLength: 8, withPad: " ", startingAt: 0) + + print("│ \(fileNameStr) │ \(rtfStr) │ \(procStr) │ \(speakerStr) │ \(segmentStr) │ \(durationStr) │") + } + + // Print summary section + let midSep = "ā”œā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”¼ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”¼ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”¼ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”¼ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”¼ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”¤" + print("\(midSep)") + + let avgRtfStr = String(format: "%.2fx", overallAvgRTF).padding(toLength: 6, withPad: " ", startingAt: 0) + let avgProcStr = String(format: "%.1fs", overallAvgProcessingTime).padding(toLength: 10, withPad: " ", startingAt: 0) + let totalDurationStr = String(format: "%.1fs", totalAudioDuration).padding(toLength: 8, withPad: " ", startingAt: 0) + + print("│ OVERALL AVERAGE │ \(avgRtfStr) │ \(avgProcStr) │ │ │ \(totalDurationStr) │") + let bottomSep = "ā””ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”“ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”“ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”“ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”“ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”“ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”˜" + print("\(bottomSep)") + + // Print overall statistics + print("\nšŸ“Š Overall Performance Statistics:") + print(" Files tested: \(results.count)") + print(" Total audio duration: \(String(format: "%.1f", totalAudioDuration))s") + print(" Total processing time: \(String(format: "%.1f", totalProcessingTime))s") + print(" Overall average RTF: \(String(format: "%.2f", overallAvgRTF))x") + print(" Overall min RTF: \(String(format: "%.2f", overallMinRTF))x") + print(" Overall max RTF: \(String(format: "%.2f", overallMaxRTF))x") + print(" Overall RTF Std Dev: \(String(format: "%.2f", overallStdDevRTF))x") + + // Performance assessment + print("\nšŸŽÆ Overall Performance Assessment:") + if overallAvgRTF < 0.1 { + print(" šŸš€ EXCELLENT: Real-time factor < 0.1x (10x faster than real-time)") + } else if overallAvgRTF < 0.5 { + print(" āœ… VERY GOOD: Real-time 
factor < 0.5x (2x faster than real-time)") + } else if overallAvgRTF < 1.0 { + print(" šŸ‘ GOOD: Real-time factor < 1.0x (faster than real-time)") + } else if overallAvgRTF < 2.0 { + print(" āš ļø MODERATE: Real-time factor < 2.0x (slower than real-time)") + } else { + print(" 🐌 SLOW: Real-time factor >= 2.0x (significantly slower than real-time)") + } + + // File-by-file analysis if detailed + if detailed { + print("\nšŸ” File-by-File Analysis:") + let sortedResults = results.sorted(by: { $0.averageRTF < $1.averageRTF }) + print(" Fastest file: \(sortedResults.first?.audioFile.split(separator: "/").last.map(String.init) ?? "unknown") (\(String(format: "%.2f", sortedResults.first?.averageRTF ?? 0))x RTF)") + print(" Slowest file: \(sortedResults.last?.audioFile.split(separator: "/").last.map(String.init) ?? "unknown") (\(String(format: "%.2f", sortedResults.last?.averageRTF ?? 0))x RTF)") + + let rtfRange = (sortedResults.last?.averageRTF ?? 0) - (sortedResults.first?.averageRTF ?? 0) + print(" RTF range: \(String(format: "%.2f", rtfRange))x") + + if rtfRange > 0.5 { + print(" āš ļø High variability between files - consider file-specific optimization") + } else if rtfRange > 0.2 { + print(" āš ļø Moderate variability between files") + } else { + print(" āœ… Consistent performance across files") + } + } + } + + static func saveBatchSpeedTestResults(_ summary: BatchSpeedTestSummary, to file: String) async throws { + let encoder = JSONEncoder() + encoder.outputFormatting = [.prettyPrinted, .sortedKeys] + encoder.dateEncodingStrategy = .iso8601 + + let data = try encoder.encode(summary) + try data.write(to: URL(fileURLWithPath: file)) + } + // MARK: - AMI Benchmark Implementation static func runAMISDMBenchmark( @@ -833,7 +1443,7 @@ struct DiarizationCLI { // Find optimal assignment using Hungarian Algorithm for globally optimal solution let predSpeakerArray = Array(predSpeakers).sorted() // Consistent ordering let gtSpeakerArray = Array(gtSpeakers).sorted() // 
Consistent ordering - + // Build numerical overlap matrix for Hungarian algorithm var numericalOverlapMatrix: [[Int]] = [] for predSpeaker in predSpeakerArray { @@ -843,24 +1453,24 @@ struct DiarizationCLI { } numericalOverlapMatrix.append(row) } - + // Convert overlap matrix to cost matrix (higher overlap = lower cost) let costMatrix = HungarianAlgorithm.overlapToCostMatrix(numericalOverlapMatrix) - + // Solve optimal assignment let assignments = HungarianAlgorithm.minimumCostAssignment(costs: costMatrix) - + // Create speaker mapping from Hungarian result var mapping: [String: String] = [:] var totalAssignmentCost: Float = 0 var totalOverlap = 0 - + for (predIndex, gtIndex) in assignments.assignments.enumerated() { if gtIndex != -1 && predIndex < predSpeakerArray.count && gtIndex < gtSpeakerArray.count { let predSpeaker = predSpeakerArray[predIndex] let gtSpeaker = gtSpeakerArray[gtIndex] let overlap = overlapMatrix[predSpeaker]![gtSpeaker]! - + if overlap > 0 { // Only assign if there's actual overlap mapping[predSpeaker] = gtSpeaker totalOverlap += overlap @@ -868,10 +1478,10 @@ struct DiarizationCLI { } } } - + totalAssignmentCost = assignments.totalCost print("šŸ” HUNGARIAN RESULT: Total assignment cost: \(String(format: "%.1f", totalAssignmentCost)), Total overlap: \(totalOverlap) frames") - + // Handle unassigned predicted speakers for predSpeaker in predSpeakerArray { if mapping[predSpeaker] == nil { @@ -1317,6 +1927,135 @@ struct DiarizationCLI { } return embedding } + + static func printSpeedTestResults( + _ results: [SpeedTestResult], + avgRTF: Float, + avgProcessingTime: Double, + minRTF: Float, + maxRTF: Float, + stdDevRTF: Float, + audioFile: String, + detailed: Bool + ) { + print("\nšŸ Speed Test Results") + let separator = String(repeating: "=", count: 75) + print("\(separator)") + + // Print table header + print("│ Iteration │ RTF │ Processing │ Speakers │ Segments │") + let headerSep = 
"ā”œā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”¼ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”¼ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”¼ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”¼ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”¤" + print("\(headerSep)") + + // Print individual results + for result in results.sorted(by: { $0.iteration < $1.iteration }) { + let iterStr = String(result.iteration).padding(toLength: 9, withPad: " ", startingAt: 0) + let rtfStr = String(format: "%.2fx", result.realTimeFactor).padding(toLength: 6, withPad: " ", startingAt: 0) + let procStr = String(format: "%.1fs", result.processingTimeSeconds).padding(toLength: 10, withPad: " ", startingAt: 0) + let speakerStr = String(result.speakerCount).padding(toLength: 8, withPad: " ", startingAt: 0) + let segmentStr = String(result.segmentCount).padding(toLength: 8, withPad: " ", startingAt: 0) + + print("│ \(iterStr) │ \(rtfStr) │ \(procStr) │ \(speakerStr) │ \(segmentStr) │") + } + + // Print summary section + let midSep = "ā”œā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”¼ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”¼ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”¼ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”¼ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”¤" + print("\(midSep)") + + let avgRtfStr = String(format: "%.2fx", avgRTF).padding(toLength: 6, withPad: " ", startingAt: 0) + let avgProcStr = String(format: "%.1fs", avgProcessingTime).padding(toLength: 10, withPad: " ", startingAt: 0) + let avgSpeakers = results.reduce(0) { $0 + $1.speakerCount } / results.count + let avgSpeakerStr = String(format: "%.1f", Float(avgSpeakers)).padding(toLength: 8, withPad: " ", startingAt: 0) + let avgSegments = results.reduce(0) { $0 + $1.segmentCount } / results.count + let avgSegmentStr = String(format: "%.1f", Float(avgSegments)).padding(toLength: 8, withPad: " ", startingAt: 0) + + print("│ AVERAGE │ \(avgRtfStr) │ \(avgProcStr) │ \(avgSpeakerStr) │ \(avgSegmentStr) │") + let bottomSep = 
"ā””ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”“ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”“ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”“ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”“ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”˜" + print("\(bottomSep)") + + // Print detailed statistics + print("\nšŸ“Š Performance Statistics:") + print(" Audio file: \(audioFile)") + print(" Audio duration: \(String(format: "%.1f", results.first?.audioDurationSeconds ?? 0))s") + print(" Iterations: \(results.count)") + print(" Average RTF: \(String(format: "%.2f", avgRTF))x") + print(" Min RTF: \(String(format: "%.2f", minRTF))x") + print(" Max RTF: \(String(format: "%.2f", maxRTF))x") + print(" RTF Std Dev: \(String(format: "%.2f", stdDevRTF))x") + print(" Average processing time: \(String(format: "%.1f", avgProcessingTime))s") + print(" Average speakers detected: \(String(format: "%.1f", Float(avgSpeakers)))") + print(" Average segments: \(String(format: "%.1f", Float(avgSegments)))") + + // Performance assessment + print("\nšŸŽÆ Performance Assessment:") + if avgRTF < 0.1 { + print(" šŸš€ EXCELLENT: Real-time factor < 0.1x (10x faster than real-time)") + } else if avgRTF < 0.5 { + print(" āœ… VERY GOOD: Real-time factor < 0.5x (2x faster than real-time)") + } else if avgRTF < 1.0 { + print(" šŸ‘ GOOD: Real-time factor < 1.0x (faster than real-time)") + } else if avgRTF < 2.0 { + print(" āš ļø MODERATE: Real-time factor < 2.0x (slower than real-time)") + } else { + print(" 🐌 SLOW: Real-time factor >= 2.0x (significantly slower than real-time)") + } + + // Research comparison + print("\nšŸ“ Research Comparison:") + print(" Your Results: \(String(format: "%.2f", avgRTF))x RTF") + print(" Pyannote (2021): 0.15x RTF (GPU)") + print(" EEND (2019): 0.8x RTF (CPU)") + print(" x-vector clustering: 1.2x RTF (CPU)") + + // Detailed timing breakdown if requested + if detailed { + print("\nšŸ” Detailed Timing Analysis:") + let sortedRTFs = results.map { $0.realTimeFactor }.sorted() + let medianRTF = sortedRTFs[sortedRTFs.count / 2] + let p95RTF 
= sortedRTFs[Int(Double(sortedRTFs.count) * 0.95)] + let p99RTF = sortedRTFs[Int(Double(sortedRTFs.count) * 0.99)] + + print(" Median RTF: \(String(format: "%.2f", medianRTF))x") + print(" 95th percentile RTF: \(String(format: "%.2f", p95RTF))x") + print(" 99th percentile RTF: \(String(format: "%.2f", p99RTF))x") + + // Consistency analysis + let consistency = (1.0 - stdDevRTF / avgRTF) * 100 + print(" Consistency: \(String(format: "%.1f", consistency))%") + + if consistency > 90 { + print(" šŸŽÆ EXCELLENT: Very consistent performance") + } else if consistency > 80 { + print(" āœ… GOOD: Consistent performance") + } else if consistency > 70 { + print(" āš ļø MODERATE: Some performance variability") + } else { + print(" 🚨 POOR: High performance variability") + } + } + + // Optimization suggestions + print("\nšŸ’” Optimization Suggestions:") + if avgRTF > 1.0 { + print(" • Consider reducing clustering threshold for faster processing") + print(" • Increase min-duration-on to reduce segment count") + print(" • Use GPU acceleration if available") + print(" • Consider batch processing for multiple files") + } else { + print(" • Performance is already excellent!") + print(" • Consider increasing accuracy parameters if needed") + print(" • Ready for production deployment") + } + } + + static func saveSpeedTestResults(_ summary: SpeedTestSummary, to file: String) async throws { + let encoder = JSONEncoder() + encoder.outputFormatting = [.prettyPrinted, .sortedKeys] + encoder.dateEncodingStrategy = .iso8601 + + let data = try encoder.encode(summary) + try data.write(to: URL(fileURLWithPath: file)) + } } // MARK: - Data Structures @@ -1381,6 +2120,47 @@ struct BenchmarkSummary: Codable { } } +struct SpeedTestResult: Codable { + let iteration: Int + let processingTimeSeconds: TimeInterval + let realTimeFactor: Float + let speakerCount: Int + let segmentCount: Int + let audioDurationSeconds: Float +} + +struct SpeedTestSummary: Codable { + let audioFile: String + let 
iterations: Int + let warmupRuns: Int + let averageRTF: Float + let averageProcessingTime: Double + let minRTF: Float + let maxRTF: Float + let stdDevRTF: Float + let results: [SpeedTestResult] + let config: DiarizerConfig + let timestamp: Date + + init( + audioFile: String, iterations: Int, warmupRuns: Int, averageRTF: Float, + averageProcessingTime: Double, minRTF: Float, maxRTF: Float, stdDevRTF: Float, + results: [SpeedTestResult], config: DiarizerConfig + ) { + self.audioFile = audioFile + self.iterations = iterations + self.warmupRuns = warmupRuns + self.averageRTF = averageRTF + self.averageProcessingTime = averageProcessingTime + self.minRTF = minRTF + self.maxRTF = maxRTF + self.stdDevRTF = stdDevRTF + self.results = results + self.config = config + self.timestamp = Date() + } +} + struct DiarizationMetrics { let der: Float let jer: Float @@ -1670,3 +2450,50 @@ private class AMIMeetingsXMLDelegate: NSObject, XMLParserDelegate { parsingError = parseError } } + +struct BatchSpeedTestResult: Codable { + let audioFile: String + let averageRTF: Float + let averageProcessingTime: Double + let minRTF: Float + let maxRTF: Float + let stdDevRTF: Float + let results: [SpeedTestResult] +} + +struct BatchSpeedTestSummary: Codable { + let audioFiles: [String] + let iterations: Int + let warmupRuns: Int + let overallAverageRTF: Float + let overallAverageProcessingTime: Double + let overallMinRTF: Float + let overallMaxRTF: Float + let overallStdDevRTF: Float + let totalProcessingTime: Double + let totalAudioDuration: Float + let results: [BatchSpeedTestResult] + let config: DiarizerConfig + let timestamp: Date + + init( + audioFiles: [String], iterations: Int, warmupRuns: Int, overallAverageRTF: Float, + overallAverageProcessingTime: Double, overallMinRTF: Float, overallMaxRTF: Float, + overallStdDevRTF: Float, totalProcessingTime: Double, totalAudioDuration: Float, + results: [BatchSpeedTestResult], config: DiarizerConfig + ) { + self.audioFiles = audioFiles + 
self.iterations = iterations + self.warmupRuns = warmupRuns + self.overallAverageRTF = overallAverageRTF + self.overallAverageProcessingTime = overallAverageProcessingTime + self.overallMinRTF = overallMinRTF + self.overallMaxRTF = overallMaxRTF + self.overallStdDevRTF = overallStdDevRTF + self.totalProcessingTime = totalProcessingTime + self.totalAudioDuration = totalAudioDuration + self.results = results + self.config = config + self.timestamp = Date() + } +} From ddebd8fdaf8de787a46f6a6518086fc9c6e030ac Mon Sep 17 00:00:00 2001 From: Alex-Wengg Date: Sun, 29 Jun 2025 18:49:08 -0400 Subject: [PATCH 2/6] remove redundant additional commands --- Sources/DiarizationCLI/main.swift | 822 ------------------------------ 1 file changed, 822 deletions(-) diff --git a/Sources/DiarizationCLI/main.swift b/Sources/DiarizationCLI/main.swift index 15842ae72..f8cb77826 100644 --- a/Sources/DiarizationCLI/main.swift +++ b/Sources/DiarizationCLI/main.swift @@ -43,8 +43,6 @@ struct DiarizationCLI { benchmark Run AMI SDM benchmark evaluation with real annotations process Process a single audio file download Download datasets for benchmarking - speedtest Run end-to-end pipeline speed test - batchspeedtest Run batch speed test help Show this help message BENCHMARK OPTIONS: @@ -57,10 +55,6 @@ struct DiarizationCLI { --debug Enable debug mode --output Output results to JSON file --auto-download Automatically download dataset if not found - --speed-test Enable speed testing with timing measurements - --speed-iterations Number of speed test iterations [default: 3] - --speed-warmup Number of warmup runs for speed test [default: 1] - --detailed-timing Show detailed per-component timing breakdown NOTE: Benchmark now uses real AMI manual annotations from Tests/ami_public_1.6.2/ If annotations are not found, falls back to simplified placeholder. 
@@ -75,33 +69,6 @@ struct DiarizationCLI { --dataset Dataset to download (ami-sdm, ami-ihm, all) [default: all] --force Force re-download even if files exist - SPEEDTEST OPTIONS: - Audio file to test (.wav, .m4a, .mp3) - --iterations Number of test iterations [default: 5] - --warmup Number of warmup runs [default: 2] - --threshold Clustering threshold 0.0-1.0 [default: 0.7] - --min-duration-on Minimum speaker segment duration in seconds [default: 1.0] - --min-duration-off Minimum silence between speakers in seconds [default: 0.5] - --min-activity Minimum activity threshold in frames [default: 10.0] - --output Output results to JSON file - --debug Enable debug mode - --detailed Show detailed per-component timing - - NOTE: Benchmark now uses real AMI manual annotations from Tests/ami_public_1.6.2/ - If annotations are not found, falls back to simplified placeholder. - - BATCH SPEEDTEST OPTIONS: - --files Comma-separated list of audio files to test - --iterations Number of test iterations [default: 3] - --warmup Number of warmup runs [default: 1] - --threshold Clustering threshold 0.0-1.0 [default: 0.7] - --min-duration-on Minimum speaker segment duration in seconds [default: 1.0] - --min-duration-off Minimum silence between speakers in seconds [default: 0.5] - --min-activity Minimum activity threshold in frames [default: 10.0] - --debug Enable debug mode - --output Output results to JSON file - --detailed Show detailed per-file timing - EXAMPLES: # Download AMI datasets swift run fluidaudio download --dataset ami-sdm @@ -112,29 +79,11 @@ struct DiarizationCLI { # Run benchmark with custom threshold and save results swift run fluidaudio benchmark --threshold 0.8 --output results.json - # Run benchmark with speed testing enabled - swift run fluidaudio benchmark --speed-test --speed-iterations 5 --detailed-timing - - # Run benchmark with both accuracy and speed testing - swift run fluidaudio benchmark --speed-test --threshold 0.7 --output comprehensive_results.json - # 
Process a single audio file swift run fluidaudio process meeting.wav # Process file with custom settings swift run fluidaudio process meeting.wav --threshold 0.6 --output output.json - - # Run speed test on audio file - swift run fluidaudio speedtest meeting.wav - - # Run speed test with custom iterations and detailed timing - swift run fluidaudio speedtest meeting.wav --iterations 10 --warmup 3 --detailed - - # Run speed test with custom parameters and save results - swift run fluidaudio speedtest meeting.wav --threshold 0.8 --output speed_results.json - - # Run batch speed test on multiple files - swift run fluidaudio batchspeedtest --files test1.wav,test2.wav --iterations 5 --warmup 2 --detailed """) } @@ -382,560 +331,6 @@ struct DiarizationCLI { } } - static func runSpeedTest(arguments: [String]) async { - guard !arguments.isEmpty else { - print("āŒ No audio file specified") - printUsage() - exit(1) - } - - let audioFile = arguments[0] - - // Check for help flags first - if audioFile == "--help" || audioFile == "-h" { - printUsage() - return - } - - var iterations = 5 - var warmupRuns = 2 - var threshold: Float = 0.7 - var minDurationOn: Float = 1.0 - var minDurationOff: Float = 0.5 - var minActivityThreshold: Float = 10.0 - var debugMode = false - var outputFile: String? - var detailedTiming = false - - // Parse remaining arguments - var i = 1 - while i < arguments.count { - switch arguments[i] { - case "--iterations": - if i + 1 < arguments.count { - iterations = Int(arguments[i + 1]) ?? 5 - i += 1 - } - case "--warmup": - if i + 1 < arguments.count { - warmupRuns = Int(arguments[i + 1]) ?? 2 - i += 1 - } - case "--threshold": - if i + 1 < arguments.count { - threshold = Float(arguments[i + 1]) ?? 0.7 - i += 1 - } - case "--min-duration-on": - if i + 1 < arguments.count { - minDurationOn = Float(arguments[i + 1]) ?? 1.0 - i += 1 - } - case "--min-duration-off": - if i + 1 < arguments.count { - minDurationOff = Float(arguments[i + 1]) ?? 
0.5 - i += 1 - } - case "--min-activity": - if i + 1 < arguments.count { - minActivityThreshold = Float(arguments[i + 1]) ?? 10.0 - i += 1 - } - case "--debug": - debugMode = true - case "--output": - if i + 1 < arguments.count { - outputFile = arguments[i + 1] - i += 1 - } - case "--detailed": - detailedTiming = true - default: - print("āš ļø Unknown option: \(arguments[i])") - } - i += 1 - } - - print("⚔ Starting End-to-End Pipeline Speed Test") - print(" Audio file: \(audioFile)") - print(" Iterations: \(iterations)") - print(" Warmup runs: \(warmupRuns)") - print(" Clustering threshold: \(threshold)") - print(" Min duration on: \(minDurationOn)s") - print(" Min duration off: \(minDurationOff)s") - print(" Min activity threshold: \(minActivityThreshold)") - print(" Debug mode: \(debugMode ? "enabled" : "disabled")") - print(" Detailed timing: \(detailedTiming ? "enabled" : "disabled")") - - let config = DiarizerConfig( - clusteringThreshold: threshold, - minDurationOn: minDurationOn, - minDurationOff: minDurationOff, - minActivityThreshold: minActivityThreshold, - debugMode: debugMode - ) - - let manager = DiarizerManager(config: config) - - do { - try await manager.initialize() - print("āœ… Models initialized successfully") - } catch { - print("āŒ Failed to initialize models: \(error)") - print("šŸ’” Make sure you have network access for model downloads") - exit(1) - } - - // Load audio file once - let audioSamples: [Float] - do { - audioSamples = try await loadAudioFile(path: audioFile) - let duration = Float(audioSamples.count) / 16000.0 - print("āœ… Loaded audio: \(audioSamples.count) samples (\(String(format: "%.1f", duration))s)") - } catch { - print("āŒ Failed to load audio file: \(error)") - exit(1) - } - - // Run warmup iterations - print("\nšŸ”„ Running \(warmupRuns) warmup iterations...") - for i in 1...warmupRuns { - print(" Warmup \(i)/\(warmupRuns)...") - do { - let _ = try await manager.performCompleteDiarization(audioSamples, sampleRate: 
16000) - } catch { - print(" āš ļø Warmup \(i) failed: \(error)") - } - } - - // Run actual speed test iterations - print("\n⚔ Running \(iterations) speed test iterations...") - var timingResults: [SpeedTestResult] = [] - let duration = Float(audioSamples.count) / 16000.0 - - for i in 1...iterations { - print(" Iteration \(i)/\(iterations)...") - - let startTime = Date() - do { - let result = try await manager.performCompleteDiarization(audioSamples, sampleRate: 16000) - let processingTime = Date().timeIntervalSince(startTime) - let rtf = Float(processingTime) / duration - - let speedResult = SpeedTestResult( - iteration: i, - processingTimeSeconds: processingTime, - realTimeFactor: rtf, - speakerCount: result.speakerDatabase.count, - segmentCount: result.segments.count, - audioDurationSeconds: duration - ) - - timingResults.append(speedResult) - print(" āœ… RTF: \(String(format: "%.2f", rtf))x, \(result.speakerDatabase.count) speakers, \(result.segments.count) segments") - - } catch { - print(" āŒ Iteration \(i) failed: \(error)") - } - } - - guard !timingResults.isEmpty else { - print("āŒ No successful iterations completed") - return - } - - // Calculate statistics - let avgRTF = timingResults.map { $0.realTimeFactor }.reduce(0, +) / Float(timingResults.count) - let avgProcessingTime = timingResults.map { $0.processingTimeSeconds }.reduce(0, +) / Double(timingResults.count) - let minRTF = timingResults.map { $0.realTimeFactor }.min()! - let maxRTF = timingResults.map { $0.realTimeFactor }.max()! 
- let stdDevRTF = calculateStandardDeviation(timingResults.map { $0.realTimeFactor }) - - // Print results - printSpeedTestResults( - timingResults, - avgRTF: avgRTF, - avgProcessingTime: avgProcessingTime, - minRTF: minRTF, - maxRTF: maxRTF, - stdDevRTF: stdDevRTF, - audioFile: audioFile, - detailed: detailedTiming - ) - - // Save results if requested - if let outputFile = outputFile { - let summary = SpeedTestSummary( - audioFile: audioFile, - iterations: iterations, - warmupRuns: warmupRuns, - averageRTF: avgRTF, - averageProcessingTime: avgProcessingTime, - minRTF: minRTF, - maxRTF: maxRTF, - stdDevRTF: stdDevRTF, - results: timingResults, - config: config - ) - - do { - try await saveSpeedTestResults(summary, to: outputFile) - print("šŸ’¾ Speed test results saved to: \(outputFile)") - } catch { - print("āš ļø Failed to save results: \(error)") - } - } - } - - static func runBatchSpeedTest(arguments: [String]) async { - // Check for help flags first - if arguments.contains("--help") || arguments.contains("-h") { - printUsage() - return - } - - var audioFiles: [String] = [] - var iterations = 3 - var warmupRuns = 1 - var threshold: Float = 0.7 - var minDurationOn: Float = 1.0 - var minDurationOff: Float = 0.5 - var minActivityThreshold: Float = 10.0 - var debugMode = false - var outputFile: String? - var detailedTiming = false - - // Parse arguments - var i = 0 - while i < arguments.count { - switch arguments[i] { - case "--files": - // Collect all file paths until next option - i += 1 - while i < arguments.count && !arguments[i].hasPrefix("--") { - audioFiles.append(arguments[i]) - i += 1 } - continue - case "--iterations": - if i + 1 < arguments.count { - iterations = Int(arguments[i + 1]) ?? 3 - i += 1 - } - case "--warmup": - if i + 1 < arguments.count { - warmupRuns = Int(arguments[i + 1]) ?? 1 - i += 1 - } - case "--threshold": - if i + 1 < arguments.count { - threshold = Float(arguments[i + 1]) ?? 
0.7 - i += 1 - } - case "--min-duration-on": - if i + 1 < arguments.count { - minDurationOn = Float(arguments[i + 1]) ?? 1.0 - i += 1 - } - case "--min-duration-off": - if i + 1 < arguments.count { - minDurationOff = Float(arguments[i + 1]) ?? 0.5 - i += 1 - } - case "--min-activity": - if i + 1 < arguments.count { - minActivityThreshold = Float(arguments[i + 1]) ?? 10.0 - i += 1 - } - case "--debug": - debugMode = true - case "--output": - if i + 1 < arguments.count { - outputFile = arguments[i + 1] - i += 1 - } - case "--detailed": - detailedTiming = true - default: - if !arguments[i].hasPrefix("--") { - audioFiles.append(arguments[i]) - } else { - print("āš ļø Unknown option: \(arguments[i])") - } - } - i += 1 - } - - // If no files specified, use default test files - if audioFiles.isEmpty { - print("šŸ“ No audio files specified, using default test files...") - // You can add default test files here - audioFiles = ["test1.wav", "test2.wav"] // Placeholder - } - - print("⚔ Starting Batch Speed Test") - print(" Audio files: \(audioFiles.count)") - print(" Iterations per file: \(iterations)") - print(" Warmup runs: \(warmupRuns)") - print(" Clustering threshold: \(threshold)") - print(" Min duration on: \(minDurationOn)s") - print(" Min duration off: \(minDurationOff)s") - print(" Min activity threshold: \(minActivityThreshold)") - - let config = DiarizerConfig( - clusteringThreshold: threshold, - minDurationOn: minDurationOn, - minDurationOff: minDurationOff, - minActivityThreshold: minActivityThreshold, - debugMode: debugMode - ) - - let manager = DiarizerManager(config: config) - - do { - try await manager.initialize() - print("āœ… Models initialized successfully") - } catch { - print("āŒ Failed to initialize models: \(error)") - exit(1) - } - - var allResults: [BatchSpeedTestResult] = [] - var totalProcessingTime: Double = 0 - var totalAudioDuration: Float = 0 - - for (fileIndex, audioFile) in audioFiles.enumerated() { - print("\nšŸ“ Testing file 
\(fileIndex + 1)/\(audioFiles.count): \(audioFile)") - - guard FileManager.default.fileExists(atPath: audioFile) else { - print(" āŒ File not found: \(audioFile)") - continue - } - - // Load audio file - let audioSamples: [Float] - do { - audioSamples = try await loadAudioFile(path: audioFile) - let duration = Float(audioSamples.count) / 16000.0 - print(" āœ… Loaded audio: \(String(format: "%.1f", duration))s") - totalAudioDuration += duration - } catch { - print(" āŒ Failed to load audio file: \(error)") - continue - } - - // Run warmup iterations - for i in 1...warmupRuns { - print(" šŸ”„ Warmup \(i)/\(warmupRuns)...") - do { - let _ = try await manager.performCompleteDiarization(audioSamples, sampleRate: 16000) - } catch { - print(" āš ļø Warmup \(i) failed: \(error)") - } - } - - // Run speed test iterations - var fileResults: [SpeedTestResult] = [] - let duration = Float(audioSamples.count) / 16000.0 - - for i in 1...iterations { - print(" ⚔ Iteration \(i)/\(iterations)...") - - let startTime = Date() - do { - let result = try await manager.performCompleteDiarization(audioSamples, sampleRate: 16000) - let processingTime = Date().timeIntervalSince(startTime) - let rtf = Float(processingTime) / duration - totalProcessingTime += processingTime - - let speedResult = SpeedTestResult( - iteration: i, - processingTimeSeconds: processingTime, - realTimeFactor: rtf, - speakerCount: result.speakerDatabase.count, - segmentCount: result.segments.count, - audioDurationSeconds: duration - ) - - fileResults.append(speedResult) - print(" āœ… RTF: \(String(format: "%.2f", rtf))x") - - } catch { - print(" āŒ Iteration \(i) failed: \(error)") - } - } - - if !fileResults.isEmpty { - let avgRTF = fileResults.map { $0.realTimeFactor }.reduce(0, +) / Float(fileResults.count) - let avgProcessingTime = fileResults.map { $0.processingTimeSeconds }.reduce(0, +) / Double(fileResults.count) - let minRTF = fileResults.map { $0.realTimeFactor }.min()! 
- let maxRTF = fileResults.map { $0.realTimeFactor }.max()! - let stdDevRTF = calculateStandardDeviation(fileResults.map { $0.realTimeFactor }) - - let batchResult = BatchSpeedTestResult( - audioFile: audioFile, - averageRTF: avgRTF, - averageProcessingTime: avgProcessingTime, - minRTF: minRTF, - maxRTF: maxRTF, - stdDevRTF: stdDevRTF, - results: fileResults - ) - - allResults.append(batchResult) - } - } - - guard !allResults.isEmpty else { - print("āŒ No successful tests completed") - return - } - - // Calculate overall statistics - let overallAvgRTF = allResults.map { $0.averageRTF }.reduce(0, +) / Float(allResults.count) - let overallAvgProcessingTime = allResults.map { $0.averageProcessingTime }.reduce(0, +) / Double(allResults.count) - let overallMinRTF = allResults.map { $0.minRTF }.min()! - let overallMaxRTF = allResults.map { $0.maxRTF }.max()! - let overallStdDevRTF = calculateStandardDeviation(allResults.map { $0.averageRTF }) - - // Print batch results - printBatchSpeedTestResults( - allResults, - overallAvgRTF: overallAvgRTF, - overallAvgProcessingTime: overallAvgProcessingTime, - overallMinRTF: overallMinRTF, - overallMaxRTF: overallMaxRTF, - overallStdDevRTF: overallStdDevRTF, - totalProcessingTime: totalProcessingTime, - totalAudioDuration: totalAudioDuration, - detailed: detailedTiming - ) - - // Save results if requested - if let outputFile = outputFile { - let summary = BatchSpeedTestSummary( - audioFiles: audioFiles, - iterations: iterations, - warmupRuns: warmupRuns, - overallAverageRTF: overallAvgRTF, - overallAverageProcessingTime: overallAvgProcessingTime, - overallMinRTF: overallMinRTF, - overallMaxRTF: overallMaxRTF, - overallStdDevRTF: overallStdDevRTF, - totalProcessingTime: totalProcessingTime, - totalAudioDuration: totalAudioDuration, - results: allResults, - config: config - ) - - do { - try await saveBatchSpeedTestResults(summary, to: outputFile) - print("šŸ’¾ Batch speed test results saved to: \(outputFile)") - } catch { - print("āš 
ļø Failed to save results: \(error)") - } - } - } - - static func printBatchSpeedTestResults( - _ results: [BatchSpeedTestResult], - overallAvgRTF: Float, - overallAvgProcessingTime: Double, - overallMinRTF: Float, - overallMaxRTF: Float, - overallStdDevRTF: Float, - totalProcessingTime: Double, - totalAudioDuration: Float, - detailed: Bool - ) { - print("\nšŸ Batch Speed Test Results") - let separator = String(repeating: "=", count: 80) - print("\(separator)") - - // Print table header - print("│ File Name │ RTF │ Processing │ Speakers │ Segments │ Duration │") - let headerSep = "ā”œā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”¼ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”¼ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”¼ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”¼ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”¼ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”¤" - print("\(headerSep)") - - // Print individual file results - for result in results.sorted(by: { $0.averageRTF < $1.averageRTF }) { - let fileName = result.audioFile.split(separator: "/").last.map(String.init) ?? result.audioFile - let fileNameStr = String(fileName.prefix(15)).padding(toLength: 15, withPad: " ", startingAt: 0) - let rtfStr = String(format: "%.2fx", result.averageRTF).padding(toLength: 6, withPad: " ", startingAt: 0) - let procStr = String(format: "%.1fs", result.averageProcessingTime).padding(toLength: 10, withPad: " ", startingAt: 0) - let avgSpeakers = result.results.reduce(0) { $0 + $1.speakerCount } / result.results.count - let speakerStr = String(format: "%.1f", Float(avgSpeakers)).padding(toLength: 8, withPad: " ", startingAt: 0) - let avgSegments = result.results.reduce(0) { $0 + $1.segmentCount } / result.results.count - let segmentStr = String(format: "%.1f", Float(avgSegments)).padding(toLength: 8, withPad: " ", startingAt: 0) - let durationStr = String(format: "%.1fs", result.results.first?.audioDurationSeconds ?? 
0).padding(toLength: 8, withPad: " ", startingAt: 0) - - print("│ \(fileNameStr) │ \(rtfStr) │ \(procStr) │ \(speakerStr) │ \(segmentStr) │ \(durationStr) │") - } - - // Print summary section - let midSep = "ā”œā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”¼ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”¼ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”¼ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”¼ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”¼ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”¤" - print("\(midSep)") - - let avgRtfStr = String(format: "%.2fx", overallAvgRTF).padding(toLength: 6, withPad: " ", startingAt: 0) - let avgProcStr = String(format: "%.1fs", overallAvgProcessingTime).padding(toLength: 10, withPad: " ", startingAt: 0) - let totalDurationStr = String(format: "%.1fs", totalAudioDuration).padding(toLength: 8, withPad: " ", startingAt: 0) - - print("│ OVERALL AVERAGE │ \(avgRtfStr) │ \(avgProcStr) │ │ │ \(totalDurationStr) │") - let bottomSep = "ā””ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”“ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”“ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”“ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”“ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”“ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”˜" - print("\(bottomSep)") - - // Print overall statistics - print("\nšŸ“Š Overall Performance Statistics:") - print(" Files tested: \(results.count)") - print(" Total audio duration: \(String(format: "%.1f", totalAudioDuration))s") - print(" Total processing time: \(String(format: "%.1f", totalProcessingTime))s") - print(" Overall average RTF: \(String(format: "%.2f", overallAvgRTF))x") - print(" Overall min RTF: \(String(format: "%.2f", overallMinRTF))x") - print(" Overall max RTF: \(String(format: "%.2f", overallMaxRTF))x") - print(" Overall RTF Std Dev: \(String(format: "%.2f", overallStdDevRTF))x") - - // Performance assessment - print("\nšŸŽÆ Overall Performance Assessment:") - if overallAvgRTF < 0.1 { - print(" šŸš€ EXCELLENT: Real-time factor < 0.1x (10x faster than real-time)") - } else if overallAvgRTF < 0.5 { - print(" āœ… VERY GOOD: Real-time 
factor < 0.5x (2x faster than real-time)") - } else if overallAvgRTF < 1.0 { - print(" šŸ‘ GOOD: Real-time factor < 1.0x (faster than real-time)") - } else if overallAvgRTF < 2.0 { - print(" āš ļø MODERATE: Real-time factor < 2.0x (slower than real-time)") - } else { - print(" 🐌 SLOW: Real-time factor >= 2.0x (significantly slower than real-time)") - } - - // File-by-file analysis if detailed - if detailed { - print("\nšŸ” File-by-File Analysis:") - let sortedResults = results.sorted(by: { $0.averageRTF < $1.averageRTF }) - print(" Fastest file: \(sortedResults.first?.audioFile.split(separator: "/").last.map(String.init) ?? "unknown") (\(String(format: "%.2f", sortedResults.first?.averageRTF ?? 0))x RTF)") - print(" Slowest file: \(sortedResults.last?.audioFile.split(separator: "/").last.map(String.init) ?? "unknown") (\(String(format: "%.2f", sortedResults.last?.averageRTF ?? 0))x RTF)") - - let rtfRange = (sortedResults.last?.averageRTF ?? 0) - (sortedResults.first?.averageRTF ?? 0) - print(" RTF range: \(String(format: "%.2f", rtfRange))x") - - if rtfRange > 0.5 { - print(" āš ļø High variability between files - consider file-specific optimization") - } else if rtfRange > 0.2 { - print(" āš ļø Moderate variability between files") - } else { - print(" āœ… Consistent performance across files") - } - } - } - - static func saveBatchSpeedTestResults(_ summary: BatchSpeedTestSummary, to file: String) async throws { - let encoder = JSONEncoder() - encoder.outputFormatting = [.prettyPrinted, .sortedKeys] - encoder.dateEncodingStrategy = .iso8601 - - let data = try encoder.encode(summary) - try data.write(to: URL(fileURLWithPath: file)) - } - // MARK: - AMI Benchmark Implementation static func runAMISDMBenchmark( @@ -1927,135 +1322,6 @@ struct DiarizationCLI { } return embedding } - - static func printSpeedTestResults( - _ results: [SpeedTestResult], - avgRTF: Float, - avgProcessingTime: Double, - minRTF: Float, - maxRTF: Float, - stdDevRTF: Float, - audioFile: 
String, - detailed: Bool - ) { - print("\nšŸ Speed Test Results") - let separator = String(repeating: "=", count: 75) - print("\(separator)") - - // Print table header - print("│ Iteration │ RTF │ Processing │ Speakers │ Segments │") - let headerSep = "ā”œā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”¼ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”¼ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”¼ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”¼ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”¤" - print("\(headerSep)") - - // Print individual results - for result in results.sorted(by: { $0.iteration < $1.iteration }) { - let iterStr = String(result.iteration).padding(toLength: 9, withPad: " ", startingAt: 0) - let rtfStr = String(format: "%.2fx", result.realTimeFactor).padding(toLength: 6, withPad: " ", startingAt: 0) - let procStr = String(format: "%.1fs", result.processingTimeSeconds).padding(toLength: 10, withPad: " ", startingAt: 0) - let speakerStr = String(result.speakerCount).padding(toLength: 8, withPad: " ", startingAt: 0) - let segmentStr = String(result.segmentCount).padding(toLength: 8, withPad: " ", startingAt: 0) - - print("│ \(iterStr) │ \(rtfStr) │ \(procStr) │ \(speakerStr) │ \(segmentStr) │") - } - - // Print summary section - let midSep = "ā”œā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”¼ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”¼ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”¼ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”¼ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”¤" - print("\(midSep)") - - let avgRtfStr = String(format: "%.2fx", avgRTF).padding(toLength: 6, withPad: " ", startingAt: 0) - let avgProcStr = String(format: "%.1fs", avgProcessingTime).padding(toLength: 10, withPad: " ", startingAt: 0) - let avgSpeakers = results.reduce(0) { $0 + $1.speakerCount } / results.count - let avgSpeakerStr = String(format: "%.1f", Float(avgSpeakers)).padding(toLength: 8, withPad: " ", startingAt: 0) - let avgSegments = results.reduce(0) { $0 + $1.segmentCount } / results.count - let avgSegmentStr = String(format: "%.1f", Float(avgSegments)).padding(toLength: 8, withPad: " ", startingAt: 
0) - - print("│ AVERAGE │ \(avgRtfStr) │ \(avgProcStr) │ \(avgSpeakerStr) │ \(avgSegmentStr) │") - let bottomSep = "ā””ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”“ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”“ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”“ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”“ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”˜" - print("\(bottomSep)") - - // Print detailed statistics - print("\nšŸ“Š Performance Statistics:") - print(" Audio file: \(audioFile)") - print(" Audio duration: \(String(format: "%.1f", results.first?.audioDurationSeconds ?? 0))s") - print(" Iterations: \(results.count)") - print(" Average RTF: \(String(format: "%.2f", avgRTF))x") - print(" Min RTF: \(String(format: "%.2f", minRTF))x") - print(" Max RTF: \(String(format: "%.2f", maxRTF))x") - print(" RTF Std Dev: \(String(format: "%.2f", stdDevRTF))x") - print(" Average processing time: \(String(format: "%.1f", avgProcessingTime))s") - print(" Average speakers detected: \(String(format: "%.1f", Float(avgSpeakers)))") - print(" Average segments: \(String(format: "%.1f", Float(avgSegments)))") - - // Performance assessment - print("\nšŸŽÆ Performance Assessment:") - if avgRTF < 0.1 { - print(" šŸš€ EXCELLENT: Real-time factor < 0.1x (10x faster than real-time)") - } else if avgRTF < 0.5 { - print(" āœ… VERY GOOD: Real-time factor < 0.5x (2x faster than real-time)") - } else if avgRTF < 1.0 { - print(" šŸ‘ GOOD: Real-time factor < 1.0x (faster than real-time)") - } else if avgRTF < 2.0 { - print(" āš ļø MODERATE: Real-time factor < 2.0x (slower than real-time)") - } else { - print(" 🐌 SLOW: Real-time factor >= 2.0x (significantly slower than real-time)") - } - - // Research comparison - print("\nšŸ“ Research Comparison:") - print(" Your Results: \(String(format: "%.2f", avgRTF))x RTF") - print(" Pyannote (2021): 0.15x RTF (GPU)") - print(" EEND (2019): 0.8x RTF (CPU)") - print(" x-vector clustering: 1.2x RTF (CPU)") - - // Detailed timing breakdown if requested - if detailed { - print("\nšŸ” Detailed Timing Analysis:") - let 
sortedRTFs = results.map { $0.realTimeFactor }.sorted() - let medianRTF = sortedRTFs[sortedRTFs.count / 2] - let p95RTF = sortedRTFs[Int(Double(sortedRTFs.count) * 0.95)] - let p99RTF = sortedRTFs[Int(Double(sortedRTFs.count) * 0.99)] - - print(" Median RTF: \(String(format: "%.2f", medianRTF))x") - print(" 95th percentile RTF: \(String(format: "%.2f", p95RTF))x") - print(" 99th percentile RTF: \(String(format: "%.2f", p99RTF))x") - - // Consistency analysis - let consistency = (1.0 - stdDevRTF / avgRTF) * 100 - print(" Consistency: \(String(format: "%.1f", consistency))%") - - if consistency > 90 { - print(" šŸŽÆ EXCELLENT: Very consistent performance") - } else if consistency > 80 { - print(" āœ… GOOD: Consistent performance") - } else if consistency > 70 { - print(" āš ļø MODERATE: Some performance variability") - } else { - print(" 🚨 POOR: High performance variability") - } - } - - // Optimization suggestions - print("\nšŸ’” Optimization Suggestions:") - if avgRTF > 1.0 { - print(" • Consider reducing clustering threshold for faster processing") - print(" • Increase min-duration-on to reduce segment count") - print(" • Use GPU acceleration if available") - print(" • Consider batch processing for multiple files") - } else { - print(" • Performance is already excellent!") - print(" • Consider increasing accuracy parameters if needed") - print(" • Ready for production deployment") - } - } - - static func saveSpeedTestResults(_ summary: SpeedTestSummary, to file: String) async throws { - let encoder = JSONEncoder() - encoder.outputFormatting = [.prettyPrinted, .sortedKeys] - encoder.dateEncodingStrategy = .iso8601 - - let data = try encoder.encode(summary) - try data.write(to: URL(fileURLWithPath: file)) - } } // MARK: - Data Structures @@ -2120,47 +1386,6 @@ struct BenchmarkSummary: Codable { } } -struct SpeedTestResult: Codable { - let iteration: Int - let processingTimeSeconds: TimeInterval - let realTimeFactor: Float - let speakerCount: Int - let segmentCount: 
Int - let audioDurationSeconds: Float -} - -struct SpeedTestSummary: Codable { - let audioFile: String - let iterations: Int - let warmupRuns: Int - let averageRTF: Float - let averageProcessingTime: Double - let minRTF: Float - let maxRTF: Float - let stdDevRTF: Float - let results: [SpeedTestResult] - let config: DiarizerConfig - let timestamp: Date - - init( - audioFile: String, iterations: Int, warmupRuns: Int, averageRTF: Float, - averageProcessingTime: Double, minRTF: Float, maxRTF: Float, stdDevRTF: Float, - results: [SpeedTestResult], config: DiarizerConfig - ) { - self.audioFile = audioFile - self.iterations = iterations - self.warmupRuns = warmupRuns - self.averageRTF = averageRTF - self.averageProcessingTime = averageProcessingTime - self.minRTF = minRTF - self.maxRTF = maxRTF - self.stdDevRTF = stdDevRTF - self.results = results - self.config = config - self.timestamp = Date() - } -} - struct DiarizationMetrics { let der: Float let jer: Float @@ -2450,50 +1675,3 @@ private class AMIMeetingsXMLDelegate: NSObject, XMLParserDelegate { parsingError = parseError } } - -struct BatchSpeedTestResult: Codable { - let audioFile: String - let averageRTF: Float - let averageProcessingTime: Double - let minRTF: Float - let maxRTF: Float - let stdDevRTF: Float - let results: [SpeedTestResult] -} - -struct BatchSpeedTestSummary: Codable { - let audioFiles: [String] - let iterations: Int - let warmupRuns: Int - let overallAverageRTF: Float - let overallAverageProcessingTime: Double - let overallMinRTF: Float - let overallMaxRTF: Float - let overallStdDevRTF: Float - let totalProcessingTime: Double - let totalAudioDuration: Float - let results: [BatchSpeedTestResult] - let config: DiarizerConfig - let timestamp: Date - - init( - audioFiles: [String], iterations: Int, warmupRuns: Int, overallAverageRTF: Float, - overallAverageProcessingTime: Double, overallMinRTF: Float, overallMaxRTF: Float, - overallStdDevRTF: Float, totalProcessingTime: Double, totalAudioDuration: 
Float, - results: [BatchSpeedTestResult], config: DiarizerConfig - ) { - self.audioFiles = audioFiles - self.iterations = iterations - self.warmupRuns = warmupRuns - self.overallAverageRTF = overallAverageRTF - self.overallAverageProcessingTime = overallAverageProcessingTime - self.overallMinRTF = overallMinRTF - self.overallMaxRTF = overallMaxRTF - self.overallStdDevRTF = overallStdDevRTF - self.totalProcessingTime = totalProcessingTime - self.totalAudioDuration = totalAudioDuration - self.results = results - self.config = config - self.timestamp = Date() - } -} From 52f1115c9f560b96559e20e9b5e6531797a96a59 Mon Sep 17 00:00:00 2001 From: Alex-Wengg Date: Sun, 29 Jun 2025 19:03:20 -0400 Subject: [PATCH 3/6] add time to the benchmark.yml --- .github/workflows/benchmark.yml | 31 ++++++++++++++++++++----------- 1 file changed, 20 insertions(+), 11 deletions(-) diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index 67c5efa2b..33bf13a4b 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -33,23 +33,30 @@ jobs: id: benchmark run: | echo "šŸš€ Running single file benchmark..." 
- # Run benchmark with ES2004a file and save results to JSON - swift run fluidaudio benchmark --auto-download --single-file ES2004a --output benchmark_results.json + swift run fluidaudio benchmark --auto-download --single-file ES2004a --output benchmark_results.json | tee benchmark.log + + # Extract total time from CLI output + if grep -q "Total benchmark execution time:" benchmark.log; then + BENCHMARK_TIME=$(grep "Total benchmark execution time:" benchmark.log | grep -o '[0-9.]*') + echo "BENCHMARK_TIME=${BENCHMARK_TIME}" >> $GITHUB_OUTPUT + else + echo "BENCHMARK_TIME=NA" >> $GITHUB_OUTPUT + fi # Extract key metrics from JSON output if [ -f benchmark_results.json ]; then # Parse JSON results (using basic tools available in GitHub runners) AVERAGE_DER=$(cat benchmark_results.json | grep -o '"averageDER":[0-9]*\.?[0-9]*' | cut -d':' -f2) - AVERAGE_JER=$(cat benchmark_results.json | grep -o '"averageJER":[0-9]*\.?[0-9]*' | cut -d':' -f2) + AVERAGE_JER=$(cat benchmark_results.json | grep -o '"averageJER":[0-9]*\.?[0-9]*' | cut -d':' -f2) PROCESSED_FILES=$(cat benchmark_results.json | grep -o '"processedFiles":[0-9]*' | cut -d':' -f2) - + # Get first result details RTF=$(cat benchmark_results.json | grep -o '"realTimeFactor":[0-9]*\.?[0-9]*' | head -1 | cut -d':' -f2) DURATION=$(cat benchmark_results.json | grep -o '"durationSeconds":[0-9]*\.?[0-9]*' | head -1 | cut -d':' -f2) SPEAKER_COUNT=$(cat benchmark_results.json | grep -o '"speakerCount":[0-9]*' | head -1 | cut -d':' -f2) - + echo "DER=${AVERAGE_DER}" >> $GITHUB_OUTPUT - echo "JER=${AVERAGE_JER}" >> $GITHUB_OUTPUT + echo "JER=${AVERAGE_JER}" >> $GITHUB_OUTPUT echo "RTF=${RTF}" >> $GITHUB_OUTPUT echo "DURATION=${DURATION}" >> $GITHUB_OUTPUT echo "SPEAKER_COUNT=${SPEAKER_COUNT}" >> $GITHUB_OUTPUT @@ -76,15 +83,17 @@ jobs: const rtf = parseFloat('${{ steps.benchmark.outputs.RTF }}').toFixed(2); const duration = parseFloat('${{ steps.benchmark.outputs.DURATION }}').toFixed(1); const speakerCount = '${{ 
steps.benchmark.outputs.SPEAKER_COUNT }}'; - + const benchmarkTime = '${{ steps.benchmark.outputs.BENCHMARK_TIME }}'; + comment += `**Test File:** ES2004a (${duration}s audio)\n\n`; comment += '| Metric | Value | Target | Status |\n'; comment += '|--------|-------|--------|---------|\n'; comment += `| **DER** (Diarization Error Rate) | ${der}% | < 30% | ${der < 30 ? '✅' : '❌'} |\n`; comment += `| **JER** (Jaccard Error Rate) | ${jer}% | < 25% | ${jer < 25 ? '✅' : '❌'} |\n`; comment += `| **RTF** (Real-Time Factor) | ${rtf}x | < 1.0x | ${rtf < 1.0 ? '✅' : '❌'} |\n`; - comment += `| **Speakers Detected** | ${speakerCount} | - | ℹ️ |\n\n`; - + comment += `| **Speakers Detected** | ${speakerCount} | - | ℹ️ |\n`; + comment += `| **Benchmark Runtime** | ${benchmarkTime}s | - | ℹ️ |\n\n`; + // Performance assessment if (der < 20) { comment += '🎉 **Excellent Performance!** - Competitive with state-of-the-art research\n'; @@ -93,12 +102,12 @@ jobs: } else { comment += '⚠️ **Performance Below Target** - Consider parameter optimization\n'; } - + comment += '\n📊 **Research Comparison:**\n'; comment += '- Powerset BCE (2023): 18.5% DER\n'; comment += '- EEND (2019): 25.3% DER\n'; comment += '- x-vector clustering: 28.7% DER\n'; - + } else { comment += '❌ **Benchmark Failed**\n\n'; comment += 'The single file benchmark could not complete successfully. 
'; From 9ed9ce353e0f9df809b44ebc4f888974f3743f4c Mon Sep 17 00:00:00 2001 From: Alex-Wengg Date: Sun, 29 Jun 2025 19:59:57 -0400 Subject: [PATCH 4/6] show benchmark_results.json contents --- .github/workflows/benchmark.yml | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index 33bf13a4b..25159c176 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -68,6 +68,13 @@ jobs: fi timeout-minutes: 25 + - name: Show benchmark_results.json + if: always() + run: | + echo "--- benchmark_results.json ---" + cat benchmark_results.json || echo "benchmark_results.json not found" + echo "-----------------------------" + - name: Comment PR with Benchmark Results if: always() uses: actions/github-script@v7 From f3a13e6f4f22cdc420ff25547123067b792327cf Mon Sep 17 00:00:00 2001 From: Alex-Wengg Date: Sun, 29 Jun 2025 20:20:21 -0400 Subject: [PATCH 5/6] github comment bot failing --- .github/workflows/benchmark.yml | 80 ++++++++++++++++----------------- 1 file changed, 40 insertions(+), 40 deletions(-) diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index 25159c176..399bb2e60 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -75,56 +75,56 @@ jobs: cat benchmark_results.json || echo "benchmark_results.json not found" echo "-----------------------------" + - name: Extract benchmark metrics with jq + id: extract + run: | + DER=$(jq '.averageDER' benchmark_results.json) + JER=$(jq '.averageJER' benchmark_results.json) + RTF=$(jq '.results[0].realTimeFactor' benchmark_results.json) + DURATION=$(jq '.results[0].durationSeconds' benchmark_results.json) + SPEAKER_COUNT=$(jq '.results[0].speakerCount' benchmark_results.json) + echo "DER=${DER}" >> $GITHUB_OUTPUT + echo "JER=${JER}" >> $GITHUB_OUTPUT + echo "RTF=${RTF}" >> $GITHUB_OUTPUT + echo "DURATION=${DURATION}" >> $GITHUB_OUTPUT + echo "SPEAKER_COUNT=${SPEAKER_COUNT}" 
>> $GITHUB_OUTPUT + - name: Comment PR with Benchmark Results if: always() uses: actions/github-script@v7 with: script: | - const success = '${{ steps.benchmark.outputs.SUCCESS }}' === 'true'; + const der = parseFloat('${{ steps.extract.outputs.DER }}'); + const jer = parseFloat('${{ steps.extract.outputs.JER }}'); + const rtf = parseFloat('${{ steps.extract.outputs.RTF }}'); + const duration = parseFloat('${{ steps.extract.outputs.DURATION }}').toFixed(1); + const speakerCount = '${{ steps.extract.outputs.SPEAKER_COUNT }}'; + const benchmarkTime = '${{ steps.benchmark.outputs.BENCHMARK_TIME }}'; let comment = '## 🎯 Single File Benchmark Results\n\n'; - - if (success) { - const der = parseFloat('${{ steps.benchmark.outputs.DER }}').toFixed(1); - const jer = parseFloat('${{ steps.benchmark.outputs.JER }}').toFixed(1); - const rtf = parseFloat('${{ steps.benchmark.outputs.RTF }}').toFixed(2); - const duration = parseFloat('${{ steps.benchmark.outputs.DURATION }}').toFixed(1); - const speakerCount = '${{ steps.benchmark.outputs.SPEAKER_COUNT }}'; - const benchmarkTime = '${{ steps.benchmark.outputs.BENCHMARK_TIME }}'; - - comment += `**Test File:** ES2004a (${duration}s audio)\n\n`; - comment += '| Metric | Value | Target | Status |\n'; - comment += '|--------|-------|--------|---------|\n'; - comment += `| **DER** (Diarization Error Rate) | ${der}% | < 30% | ${der < 30 ? '✅' : '❌'} |\n`; - comment += `| **JER** (Jaccard Error Rate) | ${jer}% | < 25% | ${jer < 25 ? '✅' : '❌'} |\n`; - comment += `| **RTF** (Real-Time Factor) | ${rtf}x | < 1.0x | ${rtf < 1.0 ? 
'✅' : '❌'} |\n`; - comment += `| **Speakers Detected** | ${speakerCount} | - | ℹ️ |\n`; - comment += `| **Benchmark Runtime** | ${benchmarkTime}s | - | ℹ️ |\n\n`; - - // Performance assessment - if (der < 20) { - comment += '🎉 **Excellent Performance!** - Competitive with state-of-the-art research\n'; - } else if (der < 30) { - comment += '✅ **Good Performance** - Meeting target benchmarks\n'; - } else { - comment += '⚠️ **Performance Below Target** - Consider parameter optimization\n'; - } - - comment += '\n📊 **Research Comparison:**\n'; - comment += '- Powerset BCE (2023): 18.5% DER\n'; - comment += '- EEND (2019): 25.3% DER\n'; - comment += '- x-vector clustering: 28.7% DER\n'; - + comment += `**Test File:** ES2004a (${duration}s audio)\n\n`; + comment += '| Metric | Value | Target | Status |\n'; + comment += '|--------|-------|--------|---------|\n'; + comment += `| **DER** (Diarization Error Rate) | ${der.toFixed(1)}% | < 30% | ${der < 30 ? '✅' : '❌'} |\n`; + comment += `| **JER** (Jaccard Error Rate) | ${jer.toFixed(1)}% | < 25% | ${jer < 25 ? '✅' : '❌'} |\n`; + comment += `| **RTF** (Real-Time Factor) | ${rtf.toFixed(2)}x | < 1.0x | ${rtf < 1.0 ? '✅' : '❌'} |\n`; + comment += `| **Speakers Detected** | ${speakerCount} | - | ℹ️ |\n`; + comment += `| **Benchmark Runtime** | ${benchmarkTime}s | - | ℹ️ |\n\n`; + + // Performance assessment + if (der < 20) { + comment += '🎉 **Excellent Performance!** - Competitive with state-of-the-art research\n'; + } else if (der < 30) { + comment += '✅ **Good Performance** - Meeting target benchmarks\n'; } else { - comment += '❌ **Benchmark Failed**\n\n'; - comment += 'The single file benchmark could not complete successfully. 
'; - comment += 'This may be due to:\n'; - comment += '- Network issues downloading test data\n'; - comment += '- Model initialization problems\n'; - comment += '- Audio processing errors\n\n'; - comment += 'Please check the workflow logs for detailed error information.'; + comment += '⚠️ **Performance Below Target** - Consider parameter optimization\n'; } + comment += '\n📊 **Research Comparison:**\n'; + comment += '- Powerset BCE (2023): 18.5% DER\n'; + comment += '- EEND (2019): 25.3% DER\n'; + comment += '- x-vector clustering: 28.7% DER\n'; + comment += '\n\n---\n*Automated benchmark using AMI corpus ES2004a test file*'; github.rest.issues.createComment({ From f8722f0d29a699a80f16564b9b49c4ca24c71f73 Mon Sep 17 00:00:00 2001 From: Alex-Wengg Date: Sun, 29 Jun 2025 20:40:53 -0400 Subject: [PATCH 6/6] fix JER --- Sources/DiarizationCLI/main.swift | 71 +++++++++++++++++++++++++++++-- 1 file changed, 67 insertions(+), 4 deletions(-) diff --git a/Sources/DiarizationCLI/main.swift b/Sources/DiarizationCLI/main.swift index f8cb77826..3dc0324ea 100644 --- a/Sources/DiarizationCLI/main.swift +++ b/Sources/DiarizationCLI/main.swift @@ -786,11 +786,74 @@ struct DiarizationCLI { static func calculateJaccardErrorRate( predicted: [TimedSpeakerSegment], groundTruth: [TimedSpeakerSegment] ) -> Float { - let totalGTDuration = groundTruth.reduce(0) { $0 + $1.durationSeconds } - let totalPredDuration = predicted.reduce(0) { $0 + $1.durationSeconds } + // If no segments in either prediction or ground truth, return 100% error + if predicted.isEmpty && groundTruth.isEmpty { + return 0.0 // Perfect match - both empty + } else if predicted.isEmpty || groundTruth.isEmpty { + return 100.0 // Complete mismatch - one empty, one not + } + + // Use the same frame size as DER calculation for consistency + let frameSize: Float = 0.01 + let totalDuration = max( + predicted.map { $0.endTimeSeconds }.max() ?? 0, + groundTruth.map { $0.endTimeSeconds }.max() ?? 
0 + ) + let totalFrames = Int(totalDuration / frameSize) + + // Get optimal speaker mapping using existing Hungarian algorithm + let speakerMapping = findOptimalSpeakerMapping( + predicted: predicted, + groundTruth: groundTruth, + totalDuration: totalDuration + ) + + var intersectionFrames = 0 + var unionFrames = 0 + + // Calculate frame-by-frame Jaccard + for frame in 0.. 0 ? Float(intersectionFrames) / Float(unionFrames) : 0.0 + + // Convert to error rate: JER = 1 - Jaccard Index + let jer = (1.0 - jaccardIndex) * 100.0 + + // Debug logging for first few calculations + if predicted.count > 0 && groundTruth.count > 0 { + print("🔍 JER DEBUG: Intersection: \(intersectionFrames), Union: \(unionFrames), Jaccard Index: \(String(format: "%.3f", jaccardIndex)), JER: \(String(format: "%.1f", jer))%") + } - let durationDiff = abs(totalGTDuration - totalPredDuration) - return (durationDiff / max(totalGTDuration, totalPredDuration)) * 100 + return jer } static func findSpeakerAtTime(_ time: Float, in segments: [TimedSpeakerSegment]) -> String? {