In [None]:
! apt-get update && apt-get install -y libsndfile1 ffmpeg
! pip install Cython
! pip install packaging
! pip -q install nemo_toolkit['asr']

In [None]:
import nemo
import nemo.collections.asr as nemo_asr
from nemo.collections.asr.models import EncDecCTCModel
from nemo.collections.asr.models import EncDecDiarLabelModel
from nemo.utils.exp_manager import exp_manager

In [None]:
import os
import zipfile
import json
import pandas as pd
import torch
from omegaconf import OmegaConf
from omegaconf import DictConfig
import librosa

# Manifest Creation:
For inference we need a manifest file: a text file with one JSON object per line, where each line describes a single audio sample for the model to diarize. <br><br>
Manifest Row Example:
```json
{"audio_filepath": "wavs/audio_sample_75.wav", "offset": 0, "duration": 11.0, "label": "infer", "text": "-", "rttm_filepath": null}
```

In [None]:
# Code to create the manifest file

import os
import json
import librosa

def _get_audio_duration(file_path):
    """Return the duration of the audio file at ``file_path`` in seconds.

    librosa 0.10 renamed the ``filename`` keyword of ``get_duration`` to
    ``path`` and deprecated the old name, so the new keyword is tried first
    and the old one is used only as a fallback for older releases.
    """
    try:
        return librosa.get_duration(path=file_path)
    except TypeError:
        # Older librosa releases only accept ``filename=``.
        return librosa.get_duration(filename=file_path)


def create_wav_manifest(folder_path, output_json_path):
    """Write a manifest with one JSON line per ``.wav`` file in a folder.

    Each line is a JSON object with the keys ``audio_filepath``, ``offset``
    (always 0), ``duration`` (seconds, as reported by librosa), ``label``
    (always ``"infer"``) and ``text`` (always ``"-"``).

    Args:
        folder_path: Directory to scan (non-recursively) for files whose name
            ends in ``.wav``, case-insensitively.
        output_json_path: Destination of the manifest. Missing parent
            directories are created; an existing file is overwritten.

    Raises:
        FileNotFoundError: If ``folder_path`` does not exist. The manifest is
            not created or modified in that case.
    """
    # List the folder before touching the output so that a bad input path does
    # not leave an empty manifest behind. Sorting makes the row order
    # reproducible, since os.listdir returns entries in arbitrary order.
    wav_names = sorted(
        name
        for name in os.listdir(folder_path)
        if name.lower().endswith('.wav')
        and os.path.isfile(os.path.join(folder_path, name))
    )

    # Build every row up front: if reading any audio file fails, the error
    # surfaces before a partially written manifest is produced.
    entries = []
    for name in wav_names:
        file_path = os.path.join(folder_path, name)
        entries.append({
            "audio_filepath": file_path,
            "offset": 0,
            "duration": _get_audio_duration(file_path),
            "label": "infer",
            "text": "-"
        })

    out_dir = os.path.dirname(output_json_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    with open(output_json_path, 'w', encoding='utf-8') as fout:
        for entry in entries:
            json.dump(entry, fout)
            fout.write('\n')

    print(f"Manifest file created at {output_json_path}")

# Example usage
# Scan the folder of .wav files and write the manifest. The output path is the
# same file that "manifest_filepath" in the inference config below points to,
# so the two must be kept in sync.
folder_path = 'inference/wavs/'
output_json_path = 'inference/test_manifest.json'
create_wav_manifest(folder_path, output_json_path)

# Inference

In [None]:
# Inference configuration for the diarizer, built directly as an OmegaConf
# DictConfig so that `inf_config` only ever refers to one kind of object.
inf_config = OmegaConf.create({
    "name": "ClusterDiarizer",
    "num_workers": 1,
    "sample_rate": 16000,
    "batch_size": 64,
    # Fall back to CPU when no GPU is visible instead of failing at model load.
    "device": 'cuda' if torch.cuda.is_available() else 'cpu',
    "verbose": True,
    "diarizer": {
        # Must point at the manifest written by create_wav_manifest.
        "manifest_filepath": "inference/test_manifest.json",
        # NOTE(review): Kaggle-specific absolute path; change when running elsewhere.
        "out_dir": "/kaggle/working/",
        "oracle_vad": False,
        "collar": 0.25,
        "ignore_overlap": True,
        # Voice activity detection model and its settings.
        "vad": {
            "model_path": "models/vad_multilingual_marblenet.nemo",
            "external_vad_manifest": None,
            "parameters": {
                "window_length_in_sec": 0.63,
                "shift_length_in_sec": 0.01,
                "smoothing": False,
                "overlap": 0.5,
                "onset": 0.8,
                "offset": 0.6,
                "pad_onset": -0.05,
                "pad_offset": 0,
                "min_duration_on": 0,
                "min_duration_off": 0.6,
                "filter_speech_first": True
            }
        },
        # Speaker embedding model; the three lists are parallel, one entry per scale.
        "speaker_embeddings": {
            "model_path": "models/titanet-l.nemo",
            "parameters": {
                "window_length_in_sec": [1.5, 1.25, 1.0, 0.75, 0.5],
                "shift_length_in_sec": [0.75, 0.625, 0.5, 0.375, 0.1],
                "multiscale_weights": [1, 1, 1, 1, 1],
                "save_embeddings": True
            }
        },
        # Clustering settings; the speaker count is not given in advance
        # (oracle_num_speakers is False) and is capped by max_num_speakers.
        "clustering": {
            "parameters": {
                "oracle_num_speakers": False,
                "max_num_speakers": 5,
                "enhanced_count_thres": 80,
                "max_rp_threshold": 0.25,
                "sparse_search_volume": 30,
                "maj_vote_spk_count": False,
                "chunk_cluster_count": 50,
                "embeddings_per_chunk": 10000
            }
        },
        # MSDD model checkpoint and its inference settings.
        "msdd_model": {
            "model_path": 'models/diar_msdd_telephonic.nemo',
            "parameters": {
                "use_speaker_model_from_ckpt": True,
                "infer_batch_size": 25,
                "sigmoid_threshold": [0.7, 1.0],
                "seq_eval_mode": False,
                "split_infer": True,
                "diar_window_length": 50,
                "overlap_infer_spk_limit": 5
            }
        }
    }
})

In [None]:
from nemo.collections.asr.models.msdd_models import NeuralDiarizer
# Build the diarizer from the inference config defined in the previous cell.
# NOTE(review): the variable name says "oracle_vad", but the config sets
# diarizer.oracle_vad to False and supplies a VAD model path — the name looks
# like a leftover; confirm before relying on it.
oracle_vad_msdd_model = NeuralDiarizer(cfg=inf_config)

# Run diarization using the settings in inf_config.
# Presumably results are written under diarizer.out_dir — TODO confirm against the NeMo docs.
oracle_vad_msdd_model.diarize()