diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index 67c5efa2b..399bb2e60 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -33,23 +33,30 @@ jobs: id: benchmark run: | echo "šŸš€ Running single file benchmark..." - # Run benchmark with ES2004a file and save results to JSON - swift run fluidaudio benchmark --auto-download --single-file ES2004a --output benchmark_results.json + swift run fluidaudio benchmark --auto-download --single-file ES2004a --output benchmark_results.json | tee benchmark.log + + # Extract total time from CLI output + if grep -q "Total benchmark execution time:" benchmark.log; then + BENCHMARK_TIME=$(grep "Total benchmark execution time:" benchmark.log | grep -o '[0-9.]*') + echo "BENCHMARK_TIME=${BENCHMARK_TIME}" >> $GITHUB_OUTPUT + else + echo "BENCHMARK_TIME=NA" >> $GITHUB_OUTPUT + fi # Extract key metrics from JSON output if [ -f benchmark_results.json ]; then # Parse JSON results (using basic tools available in GitHub runners) AVERAGE_DER=$(cat benchmark_results.json | grep -o '"averageDER":[0-9]*\.?[0-9]*' | cut -d':' -f2) - AVERAGE_JER=$(cat benchmark_results.json | grep -o '"averageJER":[0-9]*\.?[0-9]*' | cut -d':' -f2) + AVERAGE_JER=$(cat benchmark_results.json | grep -o '"averageJER":[0-9]*\.?[0-9]*' | cut -d':' -f2) PROCESSED_FILES=$(cat benchmark_results.json | grep -o '"processedFiles":[0-9]*' | cut -d':' -f2) - + # Get first result details RTF=$(cat benchmark_results.json | grep -o '"realTimeFactor":[0-9]*\.?[0-9]*' | head -1 | cut -d':' -f2) DURATION=$(cat benchmark_results.json | grep -o '"durationSeconds":[0-9]*\.?[0-9]*' | head -1 | cut -d':' -f2) SPEAKER_COUNT=$(cat benchmark_results.json | grep -o '"speakerCount":[0-9]*' | head -1 | cut -d':' -f2) - + echo "DER=${AVERAGE_DER}" >> $GITHUB_OUTPUT - echo "JER=${AVERAGE_JER}" >> $GITHUB_OUTPUT + echo "JER=${AVERAGE_JER}" >> $GITHUB_OUTPUT echo "RTF=${RTF}" >> $GITHUB_OUTPUT echo "DURATION=${DURATION}" >> 
$GITHUB_OUTPUT echo "SPEAKER_COUNT=${SPEAKER_COUNT}" >> $GITHUB_OUTPUT @@ -61,54 +68,63 @@ jobs: fi timeout-minutes: 25 + - name: Show benchmark_results.json + if: always() + run: | + echo "--- benchmark_results.json ---" + cat benchmark_results.json || echo "benchmark_results.json not found" + echo "-----------------------------" + + - name: Extract benchmark metrics with jq + id: extract + run: | + DER=$(jq '.averageDER' benchmark_results.json) + JER=$(jq '.averageJER' benchmark_results.json) + RTF=$(jq '.results[0].realTimeFactor' benchmark_results.json) + DURATION=$(jq '.results[0].durationSeconds' benchmark_results.json) + SPEAKER_COUNT=$(jq '.results[0].speakerCount' benchmark_results.json) + echo "DER=${DER}" >> $GITHUB_OUTPUT + echo "JER=${JER}" >> $GITHUB_OUTPUT + echo "RTF=${RTF}" >> $GITHUB_OUTPUT + echo "DURATION=${DURATION}" >> $GITHUB_OUTPUT + echo "SPEAKER_COUNT=${SPEAKER_COUNT}" >> $GITHUB_OUTPUT + - name: Comment PR with Benchmark Results if: always() uses: actions/github-script@v7 with: script: | - const success = '${{ steps.benchmark.outputs.SUCCESS }}' === 'true'; + const der = parseFloat('${{ steps.extract.outputs.DER }}'); + const jer = parseFloat('${{ steps.extract.outputs.JER }}'); + const rtf = parseFloat('${{ steps.extract.outputs.RTF }}'); + const duration = parseFloat('${{ steps.extract.outputs.DURATION }}').toFixed(1); + const speakerCount = '${{ steps.extract.outputs.SPEAKER_COUNT }}'; + const benchmarkTime = '${{ steps.benchmark.outputs.BENCHMARK_TIME }}'; let comment = '## šŸŽÆ Single File Benchmark Results\n\n'; + comment += `**Test File:** ES2004a (${duration}s audio)\n\n`; + comment += '| Metric | Value | Target | Status |\n'; + comment += '|--------|-------|--------|---------|\n'; + comment += `| **DER** (Diarization Error Rate) | ${der.toFixed(1)}% | < 30% | ${der < 30 ? 'āœ…' : 'āŒ'} |\n`; + comment += `| **JER** (Jaccard Error Rate) | ${jer.toFixed(1)}% | < 25% | ${jer < 25 ? 
'āœ…' : 'āŒ'} |\n`; + comment += `| **RTF** (Real-Time Factor) | ${rtf.toFixed(2)}x | < 1.0x | ${rtf < 1.0 ? 'āœ…' : 'āŒ'} |\n`; + comment += `| **Speakers Detected** | ${speakerCount} | - | ā„¹ļø |\n`; + comment += `| **Benchmark Runtime** | ${benchmarkTime}s | - | ā„¹ļø |\n\n`; - if (success) { - const der = parseFloat('${{ steps.benchmark.outputs.DER }}').toFixed(1); - const jer = parseFloat('${{ steps.benchmark.outputs.JER }}').toFixed(1); - const rtf = parseFloat('${{ steps.benchmark.outputs.RTF }}').toFixed(2); - const duration = parseFloat('${{ steps.benchmark.outputs.DURATION }}').toFixed(1); - const speakerCount = '${{ steps.benchmark.outputs.SPEAKER_COUNT }}'; - - comment += `**Test File:** ES2004a (${duration}s audio)\n\n`; - comment += '| Metric | Value | Target | Status |\n'; - comment += '|--------|-------|--------|---------|\n'; - comment += `| **DER** (Diarization Error Rate) | ${der}% | < 30% | ${der < 30 ? 'āœ…' : 'āŒ'} |\n`; - comment += `| **JER** (Jaccard Error Rate) | ${jer}% | < 25% | ${jer < 25 ? 'āœ…' : 'āŒ'} |\n`; - comment += `| **RTF** (Real-Time Factor) | ${rtf}x | < 1.0x | ${rtf < 1.0 ? 
'āœ…' : 'āŒ'} |\n`; - comment += `| **Speakers Detected** | ${speakerCount} | - | ā„¹ļø |\n\n`; - - // Performance assessment - if (der < 20) { - comment += 'šŸŽ‰ **Excellent Performance!** - Competitive with state-of-the-art research\n'; - } else if (der < 30) { - comment += 'āœ… **Good Performance** - Meeting target benchmarks\n'; - } else { - comment += 'āš ļø **Performance Below Target** - Consider parameter optimization\n'; - } - - comment += '\nšŸ“Š **Research Comparison:**\n'; - comment += '- Powerset BCE (2023): 18.5% DER\n'; - comment += '- EEND (2019): 25.3% DER\n'; - comment += '- x-vector clustering: 28.7% DER\n'; - + // Performance assessment + if (der < 20) { + comment += 'šŸŽ‰ **Excellent Performance!** - Competitive with state-of-the-art research\n'; + } else if (der < 30) { + comment += 'āœ… **Good Performance** - Meeting target benchmarks\n'; } else { - comment += 'āŒ **Benchmark Failed**\n\n'; - comment += 'The single file benchmark could not complete successfully. 
'; - comment += 'This may be due to:\n'; - comment += '- Network issues downloading test data\n'; - comment += '- Model initialization problems\n'; - comment += '- Audio processing errors\n\n'; - comment += 'Please check the workflow logs for detailed error information.'; + comment += 'āš ļø **Performance Below Target** - Consider parameter optimization\n'; } + comment += '\nšŸ“Š **Research Comparison:**\n'; + comment += '- Powerset BCE (2023): 18.5% DER\n'; + comment += '- EEND (2019): 25.3% DER\n'; + comment += '- x-vector clustering: 28.7% DER\n'; + comment += '\n\n---\n*Automated benchmark using AMI corpus ES2004a test file*'; github.rest.issues.createComment({ diff --git a/Sources/DiarizationCLI/main.swift b/Sources/DiarizationCLI/main.swift index bc78b28a9..3dc0324ea 100644 --- a/Sources/DiarizationCLI/main.swift +++ b/Sources/DiarizationCLI/main.swift @@ -55,7 +55,7 @@ struct DiarizationCLI { --debug Enable debug mode --output Output results to JSON file --auto-download Automatically download dataset if not found - + NOTE: Benchmark now uses real AMI manual annotations from Tests/ami_public_1.6.2/ If annotations are not found, falls back to simplified placeholder. 
@@ -72,22 +72,24 @@ struct DiarizationCLI { EXAMPLES: # Download AMI datasets swift run fluidaudio download --dataset ami-sdm - + # Run AMI SDM benchmark with auto-download swift run fluidaudio benchmark --auto-download - + # Run benchmark with custom threshold and save results swift run fluidaudio benchmark --threshold 0.8 --output results.json - + # Process a single audio file swift run fluidaudio process meeting.wav - + # Process file with custom settings swift run fluidaudio process meeting.wav --threshold 0.6 --output output.json """) } static func runBenchmark(arguments: [String]) async { + let benchmarkStartTime = Date() + var dataset = "ami-sdm" var threshold: Float = 0.7 var minDurationOn: Float = 1.0 @@ -189,6 +191,9 @@ struct DiarizationCLI { print("šŸ’” Supported datasets: ami-sdm, ami-ihm") exit(1) } + + let benchmarkElapsed = Date().timeIntervalSince(benchmarkStartTime) + print("\nā±ļø Total benchmark execution time: \(String(format: "%.1f", benchmarkElapsed)) seconds") } static func downloadDataset(arguments: [String]) async { @@ -781,11 +786,74 @@ struct DiarizationCLI { static func calculateJaccardErrorRate( predicted: [TimedSpeakerSegment], groundTruth: [TimedSpeakerSegment] ) -> Float { - let totalGTDuration = groundTruth.reduce(0) { $0 + $1.durationSeconds } - let totalPredDuration = predicted.reduce(0) { $0 + $1.durationSeconds } + // If no segments in either prediction or ground truth, return 100% error + if predicted.isEmpty && groundTruth.isEmpty { + return 0.0 // Perfect match - both empty + } else if predicted.isEmpty || groundTruth.isEmpty { + return 100.0 // Complete mismatch - one empty, one not + } + + // Use the same frame size as DER calculation for consistency + let frameSize: Float = 0.01 + let totalDuration = max( + predicted.map { $0.endTimeSeconds }.max() ?? 0, + groundTruth.map { $0.endTimeSeconds }.max() ?? 
0 + ) + let totalFrames = Int(totalDuration / frameSize) + + // Get optimal speaker mapping using existing Hungarian algorithm + let speakerMapping = findOptimalSpeakerMapping( + predicted: predicted, + groundTruth: groundTruth, + totalDuration: totalDuration + ) + + var intersectionFrames = 0 + var unionFrames = 0 + + // Calculate frame-by-frame Jaccard + for frame in 0..<totalFrames { + let frameTime = Float(frame) * frameSize + let predSpeaker = findSpeakerAtTime(frameTime, in: predicted) + let gtSpeaker = findSpeakerAtTime(frameTime, in: groundTruth) + let mappedPredSpeaker = predSpeaker.flatMap { speakerMapping[$0] } + if gtSpeaker != nil || mappedPredSpeaker != nil { + unionFrames += 1 + if let gt = gtSpeaker, gt == mappedPredSpeaker { + intersectionFrames += 1 + } + } + } + + let jaccardIndex = unionFrames > 0 ? Float(intersectionFrames) / Float(unionFrames) : 0.0 + + // Convert to error rate: JER = 1 - Jaccard Index + let jer = (1.0 - jaccardIndex) * 100.0 + + // Debug logging for first few calculations + if predicted.count > 0 && groundTruth.count > 0 { + print("šŸ” JER DEBUG: Intersection: \(intersectionFrames), Union: \(unionFrames), Jaccard Index: \(String(format: "%.3f", jaccardIndex)), JER: \(String(format: "%.1f", jer))%") + } + + return jer } static func findSpeakerAtTime(_ time: Float, in segments: [TimedSpeakerSegment]) -> String? { @@ -833,7 +901,7 @@ struct DiarizationCLI { // Find optimal assignment using Hungarian Algorithm for globally optimal solution let predSpeakerArray = Array(predSpeakers).sorted() // Consistent ordering let gtSpeakerArray = Array(gtSpeakers).sorted() // Consistent ordering - + // Build numerical overlap matrix for Hungarian algorithm var numericalOverlapMatrix: [[Int]] = [] for predSpeaker in predSpeakerArray { @@ -843,24 +911,24 @@ struct DiarizationCLI { } numericalOverlapMatrix.append(row) } - + // Convert overlap matrix to cost matrix (higher overlap = lower cost) let costMatrix = HungarianAlgorithm.overlapToCostMatrix(numericalOverlapMatrix) - + // Solve optimal assignment let assignments = HungarianAlgorithm.minimumCostAssignment(costs: costMatrix) - + // Create speaker mapping from Hungarian result var mapping: [String: String] = [:] var totalAssignmentCost: Float = 0 var totalOverlap = 0 - + for (predIndex, gtIndex) in assignments.assignments.enumerated() { if gtIndex != -1 && predIndex < predSpeakerArray.count && gtIndex < gtSpeakerArray.count { let 
predSpeaker = predSpeakerArray[predIndex] let gtSpeaker = gtSpeakerArray[gtIndex] let overlap = overlapMatrix[predSpeaker]![gtSpeaker]! - + if overlap > 0 { // Only assign if there's actual overlap mapping[predSpeaker] = gtSpeaker totalOverlap += overlap @@ -868,10 +936,10 @@ struct DiarizationCLI { } } } - + totalAssignmentCost = assignments.totalCost print("šŸ” HUNGARIAN RESULT: Total assignment cost: \(String(format: "%.1f", totalAssignmentCost)), Total overlap: \(totalOverlap) frames") - + // Handle unassigned predicted speakers for predSpeaker in predSpeakerArray { if mapping[predSpeaker] == nil {