In [None]:
! apt-get update && apt-get install -y libsndfile1 ffmpeg
! pip install Cython
! pip install packaging
! pip -q install nemo_toolkit['asr']

In [None]:
import nemo
import nemo.collections.asr as nemo_asr
from nemo.collections.asr.models import EncDecCTCModel
from nemo.collections.asr.models import EncDecDiarLabelModel
from nemo.utils.exp_manager import exp_manager

In [None]:
import os
import zipfile
import json
import pandas as pd
import torch
from omegaconf import OmegaConf
from omegaconf import DictConfig
import librosa

# Inference

In [None]:
# Inference configuration handed to NeMo's NeuralDiarizer in the next cell.
# The layout (top-level runtime options plus a nested "diarizer" section with
# vad / speaker_embeddings / clustering / msdd_model sub-sections) appears to
# follow NeMo's diarization inference YAML configs -- confirm against the NeMo
# version actually installed.
# The config is built directly as an OmegaConf object so that `inf_config`
# never holds a plain dict, and re-running this cell is idempotent.
inf_config = OmegaConf.create({
    "name": "ClusterDiarizer",
    "num_workers": 1,
    "sample_rate": 16000,
    "batch_size": 64,
    # Use the GPU when one is visible, otherwise fall back to the CPU.
    # A hard-coded "cuda" presumably fails on CPU-only sessions.
    "device": "cuda" if torch.cuda.is_available() else "cpu",
    "verbose": True,
    "diarizer": {
        # Kaggle-specific absolute paths: adjust when running elsewhere.
        "manifest_filepath": "/kaggle/input/sample/sample/manifest.json",
        "out_dir": "/kaggle/working/",
        # False: speech regions are expected to come from the VAD model
        # configured below rather than from reference annotations -- TODO confirm.
        "oracle_vad": False,
        # Scoring options (collar in seconds, overlap handling) -- presumably
        # only relevant when reference RTTMs are available for evaluation.
        "collar": 0.25,
        "ignore_overlap": True,
        "vad": {
            "model_path": "/kaggle/input/diarization-models/models/vad_multilingual_marblenet.nemo",
            "external_vad_manifest": None,
            # Hand-tuned VAD post-processing values; kept exactly as authored.
            "parameters": {
                "window_length_in_sec": 0.63,
                "shift_length_in_sec": 0.01,
                "smoothing": False,
                "overlap": 0.5,
                "onset": 0.8,
                "offset": 0.6,
                "pad_onset": -0.05,
                "pad_offset": 0,
                "min_duration_on": 0,
                "min_duration_off": 0.6,
                "filter_speech_first": True
            }
        },
        "speaker_embeddings": {
            "model_path": "/kaggle/input/diarization-models/models/titanet-l.nemo",
            "parameters": {
                # Five scales; the three lists below must stay the same length,
                # one entry per scale.
                "window_length_in_sec": [1.5, 1.25, 1.0, 0.75, 0.5],
                "shift_length_in_sec": [0.75, 0.625, 0.5, 0.375, 0.1],
                "multiscale_weights": [1, 1, 1, 1, 1],
                "save_embeddings": True
            }
        },
        "clustering": {
            "parameters": {
                # The speaker count is not supplied up front; it is capped by
                # max_num_speakers.
                "oracle_num_speakers": False,
                "max_num_speakers": 5,
                "enhanced_count_thres": 80,
                "max_rp_threshold": 0.25,
                "sparse_search_volume": 30,
                "maj_vote_spk_count": False,
                "chunk_cluster_count": 50,
                "embeddings_per_chunk": 10000
            }
        },
        "msdd_model": {
            # NOTE(review): this checkpoint lives under /kaggle/working/, so it
            # must already have been produced or copied there in this session
            # before the diarizer is built.
            "model_path": "/kaggle/working/msdd_final.nemo",
            "parameters": {
                "use_speaker_model_from_ckpt": True,
                "infer_batch_size": 25,
                # Two thresholds are listed; presumably each one is evaluated
                # separately -- TODO confirm.
                "sigmoid_threshold": [0.7, 1.0],
                "seq_eval_mode": False,
                "split_infer": True,
                "diar_window_length": 50,
                "overlap_infer_spk_limit": 5
            }
        }
    }
})

In [None]:
# Build the diarizer from `inf_config` (defined in the configuration cell) and run it.
from nemo.collections.asr.models.msdd_models import NeuralDiarizer
# NOTE(review): despite the variable name, the configuration cell sets
# "oracle_vad" to False, so this run does not use oracle VAD. The name is kept
# unchanged because cells outside this view may reference it.
oracle_vad_msdd_model = NeuralDiarizer(cfg=inf_config)

# Runs diarization using the manifest and models named in `inf_config`;
# results are presumably written under the configured "out_dir" -- TODO confirm.
oracle_vad_msdd_model.diarize()