## Setup and Imports

In [10]:
import sys
import os
from pathlib import Path

# Switch the working directory to the project folder before the project
# modules are imported further down in this cell.
os.chdir(r"D:\Git_repos\ClearCast\Nemo-diarization")

# Add parent directory to path. Guarded so that re-running this cell does not
# keep prepending duplicate entries to sys.path.
_parent_dir = str(Path.cwd().parent)
if _parent_dir not in sys.path:
    sys.path.insert(0, _parent_dir)

# IMPORTANT: Reload modules if they were previously imported, so the imports
# below bind to freshly reloaded module objects. `main` stays last in the
# list, i.e. it is reloaded after the three modules named before it.
import importlib
for _module_name in ('diarization_processor', 'voice_enrollment', 'transcription', 'main'):
    if _module_name in sys.modules:
        importlib.reload(sys.modules[_module_name])

# Import our modules
from voice_enrollment import VoiceEnrollment, create_voice_database
from diarization_processor import DiarizationProcessor
from transcription import TranscriptionProcessor
from main import process_meeting_audio, quick_diarize, diarize_and_transcribe

print("✓ Imports successful")

✓ Imports successful


In [8]:
# Set up HuggingFace token for model downloads
import os
from pathlib import Path

# Text file that holds the HuggingFace access token.
HF_TOKEN_PATH = Path(r"D:\Git_repos\ClearCast\hf_token.txt")

if HF_TOKEN_PATH.exists():
    with open(HF_TOKEN_PATH, 'r') as f:
        hf_token = f.read().strip()
    os.environ['HF_HOME'] = str(Path.home() / '.cache' / 'huggingface')
    if hf_token:
        # The same token is exported under three environment variable names.
        os.environ['HF_TOKEN'] = hf_token
        os.environ['HUGGING_FACE_HUB_TOKEN'] = hf_token
        os.environ['HUGGINGFACE_HUB_TOKEN'] = hf_token
        print("✓ HuggingFace token loaded")
    else:
        # A blank file must not be reported as a loaded token, nor exported
        # as an empty credential.
        print("⚠ HuggingFace token file is empty:", HF_TOKEN_PATH)
        print("  Models will be downloaded without authentication")
else:
    print("⚠ HuggingFace token not found at:", HF_TOKEN_PATH)
    print("  Models will be downloaded without authentication")

# Fix huggingface_hub version compatibility
try:
    import huggingface_hub
    print(f"✓ huggingface_hub version: {huggingface_hub.__version__}")

    # Upgrade if needed: a `hf_hub_download` that accepts `use_auth_token`
    # but has no `token` parameter is treated as an old release.
    if hasattr(huggingface_hub, 'hf_hub_download'):
        import inspect
        sig = inspect.signature(huggingface_hub.hf_hub_download)
        if 'use_auth_token' in sig.parameters and 'token' not in sig.parameters:
            print("⚠ Old huggingface_hub version detected. Upgrading...")
            import subprocess
            import sys  # imported here so this cell does not rely on an earlier cell
            subprocess.check_call([sys.executable, "-m", "pip", "install", "--upgrade", "huggingface-hub"])
            print("✓ huggingface_hub upgraded. Please restart kernel.")
except Exception as e:
    # Best effort: any failure in the check (including huggingface_hub not
    # being importable) is reported and the notebook continues.
    print(f"Warning: {e}")

✓ HuggingFace token loaded
✓ huggingface_hub version: 1.2.1


## Configuration

Set your file paths here:

In [3]:
# ==== CONFIGURE THESE PATHS ====

# Enrollment material: speaker label -> list of reference recordings
SPEAKER_SAMPLES = {
    "sp3000": [
        r"D:\Projects_tmp\noisy_audio_files\LibriSpeech\dev-clean\3000\15664\3000-15664-0024.flac",
        r"D:\Projects_tmp\noisy_audio_files\LibriSpeech\dev-clean\3000\15664\3000-15664-0040.flac",
    ],
    "sp777": [
        r"D:\Projects_tmp\noisy_audio_files\LibriSpeech\dev-clean\777\126732\777-126732-0028.flac",
        r"D:\Projects_tmp\noisy_audio_files\LibriSpeech\dev-clean\777\126732\777-126732-0025.flac",
    ],
    "sp422": [
        r"D:\Projects_tmp\noisy_audio_files\LibriSpeech\dev-clean\422\122949\422-122949-0021.flac",
        r"D:\Projects_tmp\noisy_audio_files\LibriSpeech\dev-clean\422\122949\422-122949-0016.flac",
    ],
    "sp1993": [
        r"D:\Projects_tmp\noisy_audio_files\LibriSpeech\dev-clean\1993\147964\1993-147964-0005.flac",
        r"D:\Projects_tmp\noisy_audio_files\LibriSpeech\dev-clean\1993\147964\1993-147964-0003.flac",
    ],
}

# JSON file the speaker database is written to
SPEAKER_DATABASE = r"D:\Git_repos\ClearCast\Nemo-diarization\outputs\db\speakers_db.json"

# Optional custom Whisper model location (could also be e.g. "path/to/whisper_medium.pt")
WHISPER_MODEL_PATH = r"C:\Users\User_1\.cache\huggingface\hub\models--openai--whisper-medium"

# Language code (en, fa, ar, etc.); use None for auto-detect. "fa" selects Persian.
LANGUAGE = "en"

# Folder that receives the generated output files
OUTPUT_DIR = r"D:\Git_repos\ClearCast\Nemo-diarization\outputs\files"

print("✓ Configuration set")

✓ Configuration set


## Step 1: Create Speaker Database

Enroll speakers by providing reference audio samples.

In [4]:
# Build the speaker database from the configured reference samples
print("Creating speaker database...")

enrollment = create_voice_database(
    speaker_samples=SPEAKER_SAMPLES,
    database_path=SPEAKER_DATABASE,
)

# Fetch the enrolled names once and reuse them for both report lines
_enrolled_names = enrollment.get_all_speakers()
print(f"\n✓ Database created with {len(_enrolled_names)} speakers")
print(f"Enrolled speakers: {_enrolled_names}")

Creating speaker database...
Loaded the voice encoder model on cuda in 0.17 seconds.
✓ Loaded 4 speakers from database
✓ Enrolled speaker: sp3000 (from 2 samples)
✓ Enrolled speaker: sp777 (from 2 samples)
✓ Enrolled speaker: sp422 (from 2 samples)
✓ Enrolled speaker: sp1993 (from 2 samples)
✓ Database saved to: D:\Git_repos\ClearCast\Nemo-diarization\outputs\db\speakers_db.json

✓ Database created with 4 speakers
Enrolled speakers: ['sp3000', 'sp777', 'sp422', 'sp1993']


## Step 2: Quick Diarization (No Transcription)

Perform speaker diarization to detect "who spoke when".

In [5]:
# Recording to diarize (alternative inputs are kept commented out below)
MEETING_AUDIO = r"D:\Projects_tmp\noisy_audio_files\speeches\1\concat_1.wav"
# MEETING_AUDIO = r"D:\Projects_tmp\noisy_audio_files\speeches\1\concat_1_doubled.wav"
# MEETING_AUDIO = r"D:\Projects_tmp\noisy_audio_files\speeches\1\concat_1_1m.wav"

# Diarization only -- this call does not request transcription
print("Performing diarization...\n")

result = quick_diarize(
    audio_path=MEETING_AUDIO,
    database_path=SPEAKER_DATABASE,
    output_dir=OUTPUT_DIR,
    window_size=1.5,  # Smaller window for better boundary precision
    hop_size=0.5,     # Smaller hop for more frequent sampling
)

# Summary of what the run produced
_rule = '=' * 60
print(f"\n{_rule}")
print("RESULTS")
print(_rule)
print(f"Number of speakers detected: {result['num_speakers']}")
print(f"Identified speakers: {result['identified_speakers']}")
print(f"Total segments: {len(result['segments'])}")
print("\nOutput files:")
for label, location in result['output_files'].items():
    print(f"  {label}: {location}")

INFO:speechbrain.utils.fetching:Fetch hyperparams.yaml: Fetching from HuggingFace Hub 'speechbrain/spkrec-ecapa-voxceleb' if not cached


Performing diarization...

SPEAKER DIARIZATION PIPELINE
Audio file: D:\Projects_tmp\noisy_audio_files\speeches\1\concat_1.wav
Voice database: D:\Git_repos\ClearCast\Nemo-diarization\outputs\db\speakers_db.json
Language: auto-detect
Transcription: disabled

[1/4] Loading speaker database...
Loaded the voice encoder model on cuda in 0.02 seconds.
✓ Loaded 4 speakers from database
✓ Loaded 4 enrolled speakers: ['sp3000', 'sp777', 'sp422', 'sp1993']

[2/4] Performing speaker diarization...
Using device: cuda
Loading SpeechBrain models...
Continuing with Resemblyzer only...
Loaded the voice encoder model on cuda in 0.01 seconds.
✓ Models loaded successfully
Processing: D:\Projects_tmp\noisy_audio_files\speeches\1\concat_1.wav
Extracting speaker embeddings...
Extracted 87 embeddings
Audio duration: 95.61s
Clustering speakers...
✓ Found 6 speakers
✓ Generated 11 segments
✓ Results saved to: D:\Git_repos\ClearCast\Nemo-diarization\outputs\files\diarization_raw.json

[3/4] Identifying speakers.

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


## Step 3: View Diarization Segments

In [6]:
# Display all segments
print("Diarization Segments:")
print(f"{'='*60}\n")

_segments = result['segments']

# Report the latest end time across all segments. Taking the maximum instead
# of the last list element keeps the figure right when segments are not in
# chronological order.
if _segments:
    total_duration = max(seg['end'] for seg in _segments)
    print(f"Total audio duration: {total_duration:.2f}s\n")

for i, seg in enumerate(_segments, 1):
    start = seg['start']
    end = seg['end']
    duration = end - start
    speaker = seg['speaker']
    # Segments without an 'identified' flag are shown as unidentified.
    identified = "✓" if seg.get('identified', False) else "?"
    # Make inconsistent boundaries stand out instead of silently printing a
    # negative duration.
    warning = "  ⚠ end before start" if duration < 0 else ""

    print(f"{i:2d}. [{start:6.2f}s - {end:6.2f}s] ({duration:5.2f}s) {identified} {speaker}{warning}")

Diarization Segments:

Total audio duration: 95.61s

 1. [  1.00s -  18.00s] (17.00s) ✓ sp3000
 2. [ 18.00s -  31.00s] (13.00s) ✓ sp422
 3. [ 31.00s -  36.00s] ( 5.00s) ✓ sp422
 4. [ 36.00s -  41.00s] ( 5.00s) ✓ sp777
 5. [ 41.00s -  53.00s] (12.00s) ✓ sp3000
 6. [ 53.00s -  63.00s] (10.00s) ✓ sp3000
 7. [ 63.00s -  74.00s] (11.00s) ✓ sp777
 8. [ 74.00s -  75.00s] ( 1.00s) ? Speaker_5
 9. [ 75.00s -  86.00s] (11.00s) ✓ sp1993
10. [ 86.00s -  84.59s] (-1.41s) ? Speaker_4
11. [ 84.59s -  95.61s] (11.02s) ✓ sp3000


## Step 4: Full Pipeline with Transcription

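Run the complete pipeline with speaker diarization + transcription.

**Note:** the cell below only runs the NeMo diarization step through `DiarizationProcessor`. It does not assign the `result_full` variable that Steps 5 and 7 read, so `result_full` must be assigned from a full-pipeline run before those steps are executed.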

In [11]:
# Recording used for this run
MEETING_AUDIO = r"D:\Projects_tmp\noisy_audio_files\output-4-speakers.wav"

# Build the diarization processor
print("Initializing NeMo diarization processor...")
processor = DiarizationProcessor()

# Run diarization; the speaker count is given explicitly here
print(f"\nProcessing audio: {MEETING_AUDIO}")
segments = processor.perform_diarization(
    MEETING_AUDIO,
    num_speakers=4,  # Set to None for auto-detection
)

# One line per segment: boundaries, length and speaker label
_banner = '=' * 60
print(f"\n{_banner}")
print("DIARIZATION RESULTS")
print(_banner)
for index, segment in enumerate(segments, start=1):
    seg_start, seg_end = segment['start'], segment['end']
    seg_length = seg_end - seg_start
    print(f"Segment {index:2d}: {seg_start:6.2f}s - {seg_end:6.2f}s ({seg_length:5.2f}s) | {segment['speaker']}")

Initializing NeMo diarization processor...
Using device: cuda




Error importing huggingface_hub.hf_api: cannot import name 'HfFolder' from 'huggingface_hub.utils' (D:\Projects\venv\Lib\site-packages\huggingface_hub\utils\__init__.py)


ImportError: NVIDIA NeMo not installed. Please install with:
pip install nemo_toolkit[asr]

## Step 5: View Transcript with Speakers

In [None]:
# Show the speaker-labelled transcript held in `result_full`
# NOTE(review): `result_full` is not assigned in any cell visible in this notebook --
# confirm it is produced by a full-pipeline run before this cell is executed.
if 'transcription' not in result_full:
    print("No transcription available (transcription was disabled)")
else:
    _rule = '=' * 60
    _all_segments = result_full['segments']

    print("TRANSCRIPT WITH SPEAKER LABELS")
    print(f"{_rule}\n")

    # Preview: among the first 10 segments, print those that carry text
    for seg in _all_segments[:10]:
        if 'text' not in seg:
            continue
        print(f"[{seg['start']:6.2f}s - {seg['end']:6.2f}s]")
        print(f"{seg['speaker']}: {seg['text']}")
        print()

    _not_shown = len(_all_segments) - 10
    if _not_shown > 0:
        print(f"... and {_not_shown} more segments\n")

    # Complete transcript text
    print(f"\n{_rule}")
    print("FULL TRANSCRIPT")
    print(f"{_rule}\n")
    print(result_full['transcription'])

## Step 6: Alternative - Using Direct Function Call

In [None]:
# Alternative method: a single direct call with every parameter spelled out
_pipeline_args = dict(
    audio_path=MEETING_AUDIO,
    database_path=SPEAKER_DATABASE,
    language=LANGUAGE,
    whisper_model=WHISPER_MODEL_PATH,
    output_dir="diarization_alternative",
)
result_alt = diarize_and_transcribe(**_pipeline_args)

print("✓ Processing complete")
print(f"Results saved to: {result_alt['output_dir']}")

## Step 7: Save Results Summary

In [None]:
import json

# Create summary: selected fields of `result_full` plus the audio path
summary = {
    "audio_file": MEETING_AUDIO,
    "num_speakers": result_full['num_speakers'],
    "identified_speakers": result_full['identified_speakers'],
    "enrolled_speakers": result_full['enrolled_speakers'],
    "total_segments": len(result_full['segments']),
    "output_files": result_full['output_files']
}

# Serialize once, before touching the file: a value json cannot encode then
# raises here instead of leaving a partially written summary.json behind.
summary_json = json.dumps(summary, indent=2)

# Save summary next to the other outputs of the run
summary_path = Path(result_full['output_dir']) / "summary.json"
summary_path.write_text(summary_json, encoding="utf-8")

print(f"✓ Summary saved to: {summary_path}")
print("\nSummary:")
print(summary_json)

## Advanced: Manual Testing of Components

Test individual components separately

In [None]:
# Test 1: Voice Enrollment
# Smoke test only: it constructs VoiceEnrollment with "test_database.json".
# No speaker is enrolled and nothing is saved unless the commented lines
# below are enabled.
print("Testing Voice Enrollment...")
test_enrollment = VoiceEnrollment("test_database.json")

# Enroll a test speaker (replace with actual audio path)
# test_enrollment.enroll_speaker("TestSpeaker", "test_audio.wav")
# test_enrollment.save_database()

# Reaching this line means the construction above did not raise.
print("✓ Voice Enrollment working")

In [None]:
# Test 2: Diarization Processor
# Smoke test only: it constructs DiarizationProcessor with default arguments.
# No audio is processed unless the commented lines below are enabled.
print("Testing Diarization Processor...")
test_diarizer = DiarizationProcessor()

# Perform diarization (replace with actual audio path)
# test_segments = test_diarizer.perform_diarization("test_meeting.wav")
# print(f"Found {len(test_segments)} segments")

# Reaching this line means the construction above did not raise.
print("✓ Diarization Processor working")

In [None]:
# Test 3: Transcription Processor
# Smoke test only: it constructs TranscriptionProcessor with model_name="base".
# No audio is transcribed unless the commented lines below are enabled.
print("Testing Transcription Processor...")
test_transcriptor = TranscriptionProcessor(model_name="base")

# Transcribe audio (replace with actual audio path)
# test_result = test_transcriptor.transcribe_audio("test_audio.wav", language="en")
# print(f"Transcribed {len(test_result['segments'])} segments")

# Reaching this line means the construction above did not raise.
print("✓ Transcription Processor working")

## Troubleshooting

Common issues and solutions:

1. **GPU Memory Error**: Use smaller Whisper model or switch to CPU
2. **Poor Identification**: Enroll speakers with longer/multiple samples
3. **Wrong Speaker Count**: Specify `num_speakers` parameter
4. **Language Issues**: Explicitly set `expected_language` parameter

In [None]:
# Check system resources
import torch

# Report the PyTorch build and, when CUDA is usable, details of device 0
print("System Information:")
print(f"PyTorch version: {torch.__version__}")

_cuda_ok = torch.cuda.is_available()
print(f"CUDA available: {_cuda_ok}")
if _cuda_ok:
    print(f"CUDA device: {torch.cuda.get_device_name(0)}")
    # total_memory is divided by 1e9 to print the size in GB
    _gpu_props = torch.cuda.get_device_properties(0)
    print(f"GPU memory: {_gpu_props.total_memory / 1e9:.2f} GB")

## Complete!

Your diarization system is now ready to use. Check the output directory for all generated files.