In [21]:
# Setup and Imports
import importlib
import os
import sys
from pathlib import Path

# Pin the kernel's working directory to the repository checkout so that
# relative paths used by later cells resolve against it.
notebook_dir = Path(r"D:\Git_repos\Nemo-diarization")
os.chdir(notebook_dir)

# Put the repository first on sys.path; the membership test keeps repeated
# runs of this cell from inserting duplicate entries.
if str(notebook_dir) not in sys.path:
    sys.path.insert(0, str(notebook_dir))

print(f"Working directory: {os.getcwd()}")
print(f"Python path includes: {notebook_dir}")

# If the module was imported earlier in this kernel session, reload it so
# that edits made to its source since then take effect.
if 'nemo_diarization' in sys.modules:
    importlib.reload(sys.modules['nemo_diarization'])

# Entry points used by the rest of the notebook.
from nemo_diarization import process_audio_with_nemo, diarize_and_transcribe

print("✓ Imports successful (module reloaded)")


Working directory: D:\Git_repos\Nemo-diarization
Python path includes: D:\Git_repos\Nemo-diarization
✓ Imports successful (module reloaded)


## Create Voice Embeddings Database (Optional)

If you want to identify specific speakers by name, create a database of known voices first.
Otherwise, skip this section and speakers will be labeled as SPEAKER_00, SPEAKER_01, etc.

In [None]:
"""
Create voice embeddings database for speaker identification
You need reference audio samples for each person you want to identify
"""

# Option 1: Create database from audio samples
# All imports for this cell are grouped here so they run once, not on every
# loop iteration.
import json
import os

import numpy as np
from resemblyzer import VoiceEncoder, preprocess_wav

# Prepare your speaker samples - each person should have 1-3 audio clips
speaker_samples = {
    "sp3000": [r"D:\Projects_tmp\noisy_audio_files\LibriSpeech\dev-clean\3000\15664\3000-15664-0024.flac", 
               r"D:\Projects_tmp\noisy_audio_files\LibriSpeech\dev-clean\3000\15664\3000-15664-0040.flac"],
    "sp777" : [r"D:\Projects_tmp\noisy_audio_files\LibriSpeech\dev-clean\777\126732\777-126732-0028.flac",
               r"D:\Projects_tmp\noisy_audio_files\LibriSpeech\dev-clean\777\126732\777-126732-0025.flac"],
    "sp422" : [r"D:\Projects_tmp\noisy_audio_files\LibriSpeech\dev-clean\422\122949\422-122949-0021.flac",
               r"D:\Projects_tmp\noisy_audio_files\LibriSpeech\dev-clean\422\122949\422-122949-0016.flac"],
    "sp1993": [r"D:\Projects_tmp\noisy_audio_files\LibriSpeech\dev-clean\1993\147964\1993-147964-0005.flac",
               r"D:\Projects_tmp\noisy_audio_files\LibriSpeech\dev-clean\1993\147964\1993-147964-0003.flac"],
}

# Database output path
voice_embeddings_database_path = r"D:\Git_repos\Nemo-diarization\outputs\db\speakers_db.json"

# Validate the inputs before loading the encoder model, so that a typo in a
# path is reported immediately and with the offending entries listed.
speakers_without_samples = [
    speaker_name
    for speaker_name, audio_files in speaker_samples.items()
    if not audio_files
]
if speakers_without_samples:
    # Averaging an empty list would yield NaN, which is not valid JSON.
    raise ValueError(
        "No audio samples listed for: " + ", ".join(speakers_without_samples)
    )

missing_sample_files = [
    audio_file
    for audio_files in speaker_samples.values()
    for audio_file in audio_files
    if not os.path.isfile(audio_file)
]
if missing_sample_files:
    raise FileNotFoundError(
        "Speaker sample file(s) not found:\n  " + "\n  ".join(missing_sample_files)
    )

# Build the database: one averaged embedding per speaker.
encoder = VoiceEncoder()
embeddings_db = {}

for speaker_name, audio_files in speaker_samples.items():
    print(f"Processing {speaker_name}...")

    # One embedding per reference clip, stored as plain Python lists.
    speaker_embeddings = [
        encoder.embed_utterance(preprocess_wav(audio_file)).tolist()
        for audio_file in audio_files
    ]

    # Average embeddings for better accuracy
    avg_embedding = np.mean(speaker_embeddings, axis=0)
    embeddings_db[speaker_name] = avg_embedding.tolist()
    print(f"  ✓ {speaker_name} enrolled")

# Save database. The directory is only created when the path actually has a
# directory component; os.makedirs("") would raise.
database_dir = os.path.dirname(voice_embeddings_database_path)
if database_dir:
    os.makedirs(database_dir, exist_ok=True)
with open(voice_embeddings_database_path, 'w', encoding='utf-8') as f:
    json.dump(embeddings_db, f, indent=2)

print(f"\n✓ Database saved to: {voice_embeddings_database_path}")
print(f"✓ Enrolled {len(embeddings_db)} speakers")

# Usage reminder. The enrollment code above is live, so there is nothing to
# uncomment; the steps reflect that.
print("To create embeddings database:")
print("1. Prepare audio samples for each speaker")
print("2. Update the speaker_samples dictionary above")
print("3. Run this cell")
print("\nOr skip this if you only need anonymous speaker labels (SPEAKER_00, etc.)")

## Configuration

Set your paths and parameters here:

In [17]:
# Audio file to process. Alternative test recordings are kept below,
# commented out; make one of them the active assignment to try it.
meeting_audio_path = r"D:\Projects_tmp\noisy_audio_files\speeches\1\concat_1.wav"
# meeting_audio_path = r"D:\Projects_tmp\noisy_audio_files\speeches\1\concat_1_1m.wav"
# meeting_audio_path = r"D:\Projects_tmp\noisy_audio_files\speeches\1\concat_1_doubled.wav"

# Voice embeddings database (can be empty for basic diarization).
# This is the same JSON path that the optional enrollment cell writes to.
voice_embeddings_database_path = r"D:\Git_repos\Nemo-diarization\outputs\db\speakers_db.json"

# Language (e.g., 'en', 'fa', 'ar', or None for auto-detect)
expected_language = "en"

# Transcription switch: False runs diarization only; set to True to also
# produce a transcription.
output_transcriptions = False

# Path to your cached Whisper model (optional)
# Examples:
# - Small model: r"path/to/whisper/small.pt"
# - Medium model: r"path/to/whisper/medium.pt"
# - Persian finetuned: r"path/to/whisper/persian_finetuned.pt"
transcriptor_model_path = None  # Will use default "base" model if None

# Number of expected speakers (optional, auto-detect if None)
num_speakers = None

# Backend selection:
# - use_wsl=True: WSL2 NeMo with GPU (recommended; uses the native WSL venv for speed)
# - use_wsl=False: Windows pyannote.audio (fallback option)
#
# Note: NeMo uses a native WSL venv (~nemo_venv) instead of the mounted drive,
# which avoids the slow-import issue while keeping GPU acceleration.
use_wsl = True  # Run NeMo through WSL2

## Run Diarization

Run the whole pipeline with a single function call, then print a short summary:

In [22]:
# Run the diarization pipeline using the values set in the configuration cell.
result = process_audio_with_nemo(
    meeting_audio_path=meeting_audio_path,
    voice_embeddings_database_path=voice_embeddings_database_path,
    expected_language=expected_language,
    output_transcriptions=output_transcriptions,
    transcriptor_model_path=transcriptor_model_path,
    num_speakers=num_speakers,
    use_wsl=use_wsl,
)

# Summary banner: a blank line, then the title framed by two 70-character rules.
print("\n" + "=" * 70, "RESULTS SUMMARY", "=" * 70, sep="\n")
print(f"Number of speakers: {result['num_speakers']}")
print(f"Total segments: {len(result['segments'])}")
# The language line is only printed when the result carries a transcription.
if 'transcription' in result:
    print(f"Detected language: {result.get('detected_language', 'N/A')}")
print(f"Output files: {result['output_files']}")

NVIDIA NeMo SPEAKER DIARIZATION PIPELINE
Audio file: D:\Projects_tmp\noisy_audio_files\speeches\1\concat_1.wav
Voice database: D:\Git_repos\Nemo-diarization\outputs\db\speakers_db.json
Language: en
Transcription: disabled
Mode: WSL2 NeMo

[WSL2 Mode] Running NeMo diarization in WSL2 Ubuntu...
Executing NeMo diarization in WSL...
Note: First run may take longer while models download


✓ NeMo diarization completed

[Speaker Identification] Matching speakers to database...
Loaded the voice encoder model on cpu in 0.01 seconds.
✓ Merged 12 segments → 8 segments

RESULTS SUMMARY
Number of speakers: 4
Total segments: 8
Output files: {'rttm': '/mnt/d/Projects_tmp/noisy_audio_files/speeches/1/nemo_output/pred_rttms/concat_1.rttm'}


## View Diarization Results

Inspect the diarization segments:

In [24]:
# Display every diarization segment, numbered from 1, with its start/end
# time formatted to two decimals and the speaker label.
print("All segments:")
print("-" * 70)
for position, segment in enumerate(result['segments'], start=1):
    start_s, end_s, speaker = segment['start'], segment['end'], segment['speaker']
    print(f"{position}. [{start_s:.2f}s - {end_s:.2f}s] {speaker}")

All segments:
----------------------------------------------------------------------
1. [0.54s - 20.75s] sp3000
2. [21.18s - 33.31s] sp422
3. [33.58s - 39.71s] sp777
4. [39.98s - 44.10s] sp422
5. [44.10s - 58.35s] sp3000
6. [58.86s - 69.39s] sp777
7. [69.74s - 82.35s] sp1993
8. [82.86s - 95.47s] sp3000


## View Transcription with Speakers

If transcription was enabled, view the transcribed text with speaker labels:

In [None]:
# Show the transcribed text per speaker turn, or a hint when the result
# carries no speaker-attributed transcription.
if 'speaker_segments' not in result:
    print("Transcription not available. Set output_transcriptions=True to enable.")
else:
    print("\nTranscription with Speaker Labels:")
    print("=" * 70)

    # Only the first 20 segments are shown to keep the output short.
    for segment in result['speaker_segments'][:20]:
        print(f"\n[{segment['start']:.2f}s - {segment['end']:.2f}s] {segment['speaker']}:")
        print(f"  {segment['text']}")

## Full Transcription Text

View the complete transcribed text:

In [None]:
# Print the complete transcription when the result contains one;
# otherwise report that it is unavailable.
if 'transcription' not in result:
    print("Transcription not available.")
else:
    print("Full Transcription:")
    print("=" * 70)
    print(result['transcription'])

## Alternative: Simplified Function

You can also use the simplified wrapper function:

In [None]:
# Same audio file, processed through the simplified wrapper with
# transcription switched on and no explicit model path.
result2 = diarize_and_transcribe(
    meeting_audio_path=meeting_audio_path,
    expected_language="en",
    output_transcriptions=True,
    transcriptor_model_path=None,  # Use default model
)

# Report how many segments the wrapper returned.
print(f"Processed {len(result2['segments'])} segments")

## Export Results

Save results to different formats:

In [None]:
import json

# Write the whole result dictionary as pretty-printed UTF-8 JSON;
# ensure_ascii=False keeps non-ASCII text readable in the file.
output_file = "diarization_output.json"
with open(output_file, 'w', encoding='utf-8') as json_out:
    json.dump(result, json_out, indent=2, ensure_ascii=False)

print(f"✓ Results saved to: {output_file}")

# When speaker-attributed segments are present, also write a plain-text
# transcript: a "[start - end] speaker:" header, the text, then a blank line.
if 'speaker_segments' in result:
    transcript_file = "transcript_with_speakers.txt"
    with open(transcript_file, 'w', encoding='utf-8') as transcript_out:
        for segment in result['speaker_segments']:
            transcript_out.write(f"[{segment['start']:.2f}s - {segment['end']:.2f}s] {segment['speaker']}:\n")
            transcript_out.write(f"{segment['text']}\n\n")

    print(f"✓ Transcript saved to: {transcript_file}")

## Testing with Different Whisper Models

Test with your cached Whisper models:

In [None]:
# Example: Test with different models
#
# Each template below calls diarize_and_transcribe with a different
# transcriptor_model_path. All of them are commented out, so running this
# cell as-is only prints the reminder at the bottom. The model paths are
# placeholders; replace them with the locations of your own model files.

# Test with small model
# result_small = diarize_and_transcribe(
#     meeting_audio_path=meeting_audio_path,
#     expected_language="en",
#     transcriptor_model_path=r"D:\path\to\whisper_small.pt"
# )

# Test with medium model
# result_medium = diarize_and_transcribe(
#     meeting_audio_path=meeting_audio_path,
#     expected_language="en",
#     transcriptor_model_path=r"D:\path\to\whisper_medium.pt"
# )

# Test with Persian finetuned model
# result_persian = diarize_and_transcribe(
#     meeting_audio_path=r"path\to\persian_audio.wav",
#     expected_language="fa",
#     transcriptor_model_path=r"D:\path\to\whisper_persian_finetuned.pt"
# )

print("Uncomment the examples above to test with your cached models")