In [1]:
# Setup and Imports
import sys
import os
from pathlib import Path

# Set working directory to notebook location
notebook_dir = Path(r"D:\Git_repos\Nemo-diarization")
os.chdir(notebook_dir)

# Add to Python path
if str(notebook_dir) not in sys.path:
    sys.path.insert(0, str(notebook_dir))

print(f"Working directory: {os.getcwd()}")
print(f"Python path includes: {notebook_dir}")

# Reload module to get latest changes
import importlib
if 'nemo_diarization' in sys.modules:
    importlib.reload(sys.modules['nemo_diarization'])

# Import functions
from nemo_diarization import diarize_with_nemo, add_transcription_to_segments, diarize_and_transcribe

print("✓ Imports successful (module reloaded)")


Working directory: D:\Git_repos\Nemo-diarization
Python path includes: D:\Git_repos\Nemo-diarization
✓ Imports successful (module reloaded)


## Create Voice Embeddings Database (Optional)

If you want to identify specific speakers by name, create a database of known voices first.
Otherwise, skip this section and speakers will be labeled as SPEAKER_00, SPEAKER_01, etc.

In [2]:
# """
# Create voice embeddings database for speaker identification
# You need reference audio samples for each person you want to identify
# """

# # Option 1: Create database from audio samples
# # Prepare your speaker samples - each person should have 1-3 audio clips
# speaker_samples = {
#     "sp3000": [r"D:\Projects_tmp\noisy_audio_files\LibriSpeech\dev-clean\3000\15664\3000-15664-0024.flac", 
#                r"D:\Projects_tmp\noisy_audio_files\LibriSpeech\dev-clean\3000\15664\3000-15664-0040.flac"],
#     "sp777" : [r"D:\Projects_tmp\noisy_audio_files\LibriSpeech\dev-clean\777\126732\777-126732-0028.flac",
#                r"D:\Projects_tmp\noisy_audio_files\LibriSpeech\dev-clean\777\126732\777-126732-0025.flac"],
#     "sp422" : [r"D:\Projects_tmp\noisy_audio_files\LibriSpeech\dev-clean\422\122949\422-122949-0021.flac",
#                r"D:\Projects_tmp\noisy_audio_files\LibriSpeech\dev-clean\422\122949\422-122949-0016.flac"],
#     "sp1993": [r"D:\Projects_tmp\noisy_audio_files\LibriSpeech\dev-clean\1993\147964\1993-147964-0005.flac",
#                r"D:\Projects_tmp\noisy_audio_files\LibriSpeech\dev-clean\1993\147964\1993-147964-0003.flac"],
# }

# # Database output path
# voice_embeddings_database_path = r"D:\Git_repos\Nemo-diarization\outputs\db\speakers_db.json"

# # Uncomment to create the database:
# from resemblyzer import VoiceEncoder
# import json

# encoder = VoiceEncoder()
# embeddings_db = {}

# for speaker_name, audio_files in speaker_samples.items():
#     print(f"Processing {speaker_name}...")
#     speaker_embeddings = []
    
#     for audio_file in audio_files:
#         from resemblyzer import preprocess_wav
#         wav = preprocess_wav(audio_file)
#         embedding = encoder.embed_utterance(wav)
#         speaker_embeddings.append(embedding.tolist())
    
#     # Average embeddings for better accuracy
#     import numpy as np
#     avg_embedding = np.mean(speaker_embeddings, axis=0)
#     embeddings_db[speaker_name] = avg_embedding.tolist()
#     print(f"  ✓ {speaker_name} enrolled")

# # Save database
# import os
# os.makedirs(os.path.dirname(voice_embeddings_database_path), exist_ok=True)
# with open(voice_embeddings_database_path, 'w') as f:
#     json.dump(embeddings_db, f, indent=2)

# print(f"\n✓ Database saved to: {voice_embeddings_database_path}")
# print(f"✓ Enrolled {len(embeddings_db)} speakers")

# print("To create embeddings database:")
# print("1. Prepare audio samples for each speaker")
# print("2. Update the speaker_samples dictionary above")
# print("3. Uncomment the code block")
# print("4. Run this cell")
# print("\nOr skip this if you only need anonymous speaker labels (SPEAKER_00, etc.)")

## Configuration

Set your paths and parameters here:

In [10]:
# Audio file to process
meeting_audio_path = r"D:\Projects_tmp\noisy_audio_files\speeches\1\concat_1.wav"
meeting_audio_path = r"D:\Projects_tmp\noisy_audio_files\speeches\1\concat_1_1m.wav"
meeting_audio_path = r"D:\Projects_tmp\noisy_audio_files\speeches\1\concat_1_doubled.wav"

# Voice embeddings database (can be empty for basic diarization)
voice_embeddings_database_path = r"D:\Git_repos\Nemo-diarization\outputs\db\speakers_db.json"

# Language (e.g., 'en', 'fa', 'ar', or None for auto-detect)
expected_language = "en"

# Whisper model for transcription ('tiny', 'base', 'small', 'medium', 'large')
whisper_model = "medium"

# Number of expected speakers (optional, auto-detect if None)
num_speakers = None

# Backend: use_wsl=True for NeMo (GPU), False for pyannote (Windows)
use_wsl = True


In [11]:
# Check GPU availability
import torch
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"CUDA version: {torch.version.cuda}")
    print(f"GPU: {torch.cuda.get_device_name(0)}")
else:
    print("No GPU detected - PyTorch may need reinstallation with CUDA support")

PyTorch version: 2.5.1+cu121
CUDA available: True
CUDA version: 12.1
GPU: NVIDIA GeForce RTX 4090


## Run Diarization

NeMo diarization with GPU - completes in ~10 seconds:


In [12]:
# Step 1: Diarization (fast - GPU accelerated)
result = diarize_with_nemo(
    meeting_audio_path=meeting_audio_path,
    voice_embeddings_database_path=voice_embeddings_database_path,
    num_speakers=num_speakers,
    use_wsl=use_wsl
)

print("\n" + "="*70)
print("DIARIZATION RESULTS")
print("="*70)
print(f"Number of speakers: {result['num_speakers']}")
print(f"Total segments: {len(result['segments'])}")
print(f"Output files: {result['output_files']}")


NVIDIA NeMo SPEAKER DIARIZATION
Audio file: D:\Projects_tmp\noisy_audio_files\speeches\1\concat_1_doubled.wav
Voice database: D:\Git_repos\Nemo-diarization\outputs\db\speakers_db.json
Mode: WSL2 NeMo GPU

[WSL2 Mode] Running NeMo diarization in WSL2 Ubuntu...
Executing NeMo diarization in WSL...
Note: First run may take longer while models download


✓ NeMo diarization completed

[Speaker Identification] Matching speakers to database...
Loaded the voice encoder model on cuda in 0.03 seconds.


  checkpoint = torch.load(weights_fpath, map_location="cpu")


✓ Merged 25 segments → 15 segments

DIARIZATION RESULTS
Number of speakers: 4
Total segments: 15
Output files: {'rttm': '/mnt/d/Projects_tmp/noisy_audio_files/speeches/1/nemo_output/pred_rttms/concat_1_doubled.rttm'}


## View Diarization Results

Inspect the diarization segments:

In [13]:
# Display first 10 diarization segments
print("All segments:")
print("-" * 70)
for i, seg in enumerate(result['segments'], 1):
    print(f"{i}. [{seg['start']:.2f}s - {seg['end']:.2f}s] {seg['speaker']}")

All segments:
----------------------------------------------------------------------
1. [0.54s - 20.75s] sp3000
2. [21.18s - 33.31s] sp422
3. [33.58s - 39.71s] sp777
4. [39.98s - 44.10s] sp422
5. [44.10s - 58.35s] sp3000
6. [58.86s - 69.39s] sp777
7. [69.74s - 82.35s] sp1993
8. [82.86s - 116.27s] sp3000
9. [116.78s - 128.91s] sp422
10. [129.26s - 135.63s] sp777
11. [135.63s - 139.79s] sp422
12. [140.22s - 153.95s] sp3000
13. [154.46s - 164.99s] sp777
14. [165.34s - 177.95s] sp1993
15. [178.46s - 191.07s] sp3000


## Add Transcription

Add Whisper transcription to the diarization results (runs on Windows):


In [14]:
# Step 2: Add transcription (optional - runs on Windows with Whisper)
result = add_transcription_to_segments(
    diarization_result=result,
    expected_language=expected_language,
    model_name=whisper_model
)

print("\n" + "="*70)
print("TRANSCRIPTION RESULTS")
print("="*70)
print(f"Detected language: {result['detected_language']}")
print(f"Transcribed segments: {len(result['speaker_segments'])}")



[Transcription] Loading Whisper 'medium' model on CUDA...
[Transcription] Transcribing audio with CUDA...


100%|███████████████████████████████████████████████████████████████████████| 19122/19122 [00:29<00:00, 638.30frames/s]

[Transcription] Aligning with speaker segments...
✓ Transcription complete
✓ Detected language: en
✓ Transcribed 44 segments, merged into 16 speech blocks

TRANSCRIPTION RESULTS
Detected language: en
Transcribed segments: 44





## View Merged Speech (LLM-Ready Format)

Perfect for generating meeting minutes with an LLM:


In [16]:
# Display the merged speech text (ready for LLM input)
from pprint import pprint

if 'merged_speech_text' in result:
    print("MERGED SPEECH FOR LLM (speaker-labelled, consecutive segments merged):")
    print("="*70)
    pprint(result['merged_speech_text'])
    print("="*70)
    print(f"\n✓ {len(result['merged_speeches'])} merged speech blocks")
else:
    print("No transcription available. Run add_transcription_to_segments() first.")


MERGED SPEECH FOR LLM (speaker-labelled, consecutive segments merged):
('[0.00s - 20.38s] sp3000: Arctic beauty and desolation, with their blessings '
 'and dangers, all may be found here, to test the endurance and skill of '
 'adventurous climbers. But far better than climbing the mountain is going '
 'around its warm, fertile base, enjoying its bounties like a bee circling '
 'around a bank of flowers.\n'
 '\n'
 '[21.18s - 33.08s] sp422: The distinctions of moral values have either '
 'originated in a ruling caste pleasantly conscious of being different from '
 'the ruled, or among the ruled class, the slaves and dependents of all '
 'sorts.\n'
 '\n'
 '[33.80s - 39.68s] sp777: Stevie, accustomed to move about disregarded, had '
 'got up from the kitchen table carrying off his drawing to bed with him.\n'
 '\n'
 '[40.18s - 43.98s] sp422: We truthful ones, the nobility in ancient Greece '
 'called themselves.\n'
 '\n'
 '[44.62s - 58.22s] sp3000: Perhaps the profession of doing good may 

## View Transcription with Speakers

If transcription was enabled, view the transcribed text with speaker labels:

In [19]:
# Display transcription with speakers
if 'speaker_segments' in result:
    print("\nTranscription with Speaker Labels:")
    print("="*70)
    
    for seg in result['speaker_segments'][:20]:  # First 20 segments
        print(f"\n[{seg['start']:.2f}s - {seg['end']:.2f}s] {seg['speaker']}:")
        print(f"  {seg['text']}")
else:
    print("Transcription not available. Set output_transcriptions=True to enable.")


Transcription with Speaker Labels:

[0.00s - 6.74s] sp3000:
  Arctic beauty and desolation, with their blessings and dangers, all may be found here, to test

[6.74s - 9.82s] sp3000:
  the endurance and skill of adventurous climbers.

[10.28s - 16.20s] sp3000:
  But far better than climbing the mountain is going around its warm, fertile base, enjoying

[16.20s - 20.38s] sp3000:
  its bounties like a bee circling around a bank of flowers.

[21.18s - 26.80s] sp422:
  The distinctions of moral values have either originated in a ruling caste pleasantly conscious

[26.80s - 32.14s] sp422:
  of being different from the ruled, or among the ruled class, the slaves and dependents

[32.14s - 33.08s] sp422:
  of all sorts.

[33.80s - 38.18s] sp777:
  Stevie, accustomed to move about disregarded, had got up from the kitchen table carrying

[38.18s - 39.68s] sp777:
  off his drawing to bed with him.

[40.18s - 43.98s] sp422:
  We truthful ones, the nobility in ancient Greece called themselves.

[44

## Full Transcription Text

View the complete transcribed text:

In [20]:
# Display full transcription
if 'transcription' in result:
    print("Full Transcription:")
    print("="*70)
    print(result['transcription'])
else:
    print("Transcription not available.")

Full Transcription:
 Arctic beauty and desolation, with their blessings and dangers, all may be found here, to test the endurance and skill of adventurous climbers. But far better than climbing the mountain is going around its warm, fertile base, enjoying its bounties like a bee circling around a bank of flowers. The distinctions of moral values have either originated in a ruling caste pleasantly conscious of being different from the ruled, or among the ruled class, the slaves and dependents of all sorts. Stevie, accustomed to move about disregarded, had got up from the kitchen table carrying off his drawing to bed with him. We truthful ones, the nobility in ancient Greece called themselves. Perhaps the profession of doing good may be full, but everybody should be kind at least to himself. Thus one saunters on and on in the glorious radiance, in utter peace and forgetfulness of time. The sheet of paper covered with circles dropped out of his fingers, and he remained staring at the old 

## All-in-One: Diarization + Transcription

Or use the convenience function that does both steps:


In [None]:
# Complete pipeline in one call (diarization + transcription)
# Commented out by default - uncomment to use
# result_combined = diarize_and_transcribe(
#     meeting_audio_path=meeting_audio_path,
#     voice_embeddings_database_path=voice_embeddings_database_path,
#     expected_language=expected_language,
#     transcriptor_model_name=whisper_model,
#     num_speakers=num_speakers,
#     use_wsl=use_wsl
# )

# print(f"✓ Complete! {result_combined['num_speakers']} speakers, {len(result_combined['speaker_segments'])} transcribed segments")

print("Uncomment above to run complete pipeline in one call")


Uncomment above to run complete pipeline in one call


## Export Results

Save results to different formats:

In [22]:
import json

# Save to JSON
output_file = "diarization_output.json"
with open(output_file, 'w', encoding='utf-8') as f:
    json.dump(result, f, indent=2, ensure_ascii=False)

print(f"✓ Results saved to: {output_file}")

# Save transcription to text file
if 'speaker_segments' in result:
    transcript_file = "transcript_with_speakers.txt"
    with open(transcript_file, 'w', encoding='utf-8') as f:
        for seg in result['speaker_segments']:
            f.write(f"[{seg['start']:.2f}s - {seg['end']:.2f}s] {seg['speaker']}:\n")
            f.write(f"{seg['text']}\n\n")
    
    print(f"✓ Transcript saved to: {transcript_file}")

✓ Results saved to: diarization_output.json
✓ Transcript saved to: transcript_with_speakers.txt


## Testing with Different Whisper Models

Test with your cached Whisper models:

In [None]:
# Example: Test with different models

# Test with small model
# result_small = diarize_and_transcribe(
#     meeting_audio_path=meeting_audio_path,
#     expected_language="en",
#     transcriptor_model_path=r"D:\path\to\whisper_small.pt"
# )

# Test with medium model
# result_medium = diarize_and_transcribe(
#     meeting_audio_path=meeting_audio_path,
#     expected_language="en",
#     transcriptor_model_path=r"D:\path\to\whisper_medium.pt"
# )

# Test with Persian finetuned model
# result_persian = diarize_and_transcribe(
#     meeting_audio_path=r"path\to\persian_audio.wav",
#     expected_language="fa",
#     transcriptor_model_path=r"D:\path\to\whisper_persian_finetuned.pt"
# )

print("Uncomment the examples above to test with your cached models")