# Following https://github.com/NVIDIA/NeMo/blob/main/tutorials/speaker_tasks/Speaker_Identification_Verification.ipynb

In [1]:
"""
You can run either this notebook locally (if you have all the dependencies and a GPU) or on Google Colab.

Instructions for setting up Colab are as follows:
1. Open a new Python 3 notebook.
2. Import this notebook from GitHub (File -> Upload Notebook -> "GITHUB" tab -> copy/paste GitHub URL)
3. Connect to an instance with a GPU (Runtime -> Change runtime type -> select "GPU" for hardware accelerator)
4. Run this cell to set up dependencies.
"""
# If you're using Google Colab and not running locally, run this cell.

# Install dependencies
!apt-get update && apt-get install -y libsndfile1 ffmpeg
!pip install Cython
!pip install nemo_toolkit['all']
!pip install wget
!apt-get -y install sox libsndfile1 ffmpeg
!pip install text-unidecode
!pip install resampy

## Install NeMo
BRANCH = 'r1.21.0'
# !python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[asr]

# Install TorchAudio
!pip install torchaudio -f https://download.pytorch.org/whl/torch_stable.html


Get:1 https://packages.cloud.google.com/apt cloud-sdk InRelease [6361 B]
Get:2 https://packages.cloud.google.com/apt google-fast-socket InRelease [5015 B]
Get:3 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease [1581 B]
Get:4 http://security.ubuntu.com/ubuntu jammy-security InRelease [110 kB]
Get:5 https://packages.cloud.google.com/apt cloud-sdk/main amd64 Packages [553 kB]
Hit:6 http://packages.cloud.google.com/apt gcsfuse-focal InRelease
Hit:7 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:8 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [119 kB]
Get:9 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  Packages [631 kB]
Get:10 http://security.ubuntu.com/ubuntu jammy-security/restricted amd64 Packages [1467 kB]
Get:11 http://archive.ubuntu.com/ubuntu jammy-backports InRelease [109 kB]
Get:12 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 Packages [1520 kB]
Get:13 http://security.ubuntu.

In [2]:
import os
NEMO_ROOT = os.getcwd()
print(NEMO_ROOT)
from glob import glob
import subprocess
import tarfile
import wget
import pandas as pd
import librosa
import soundfile as sf
import json

data_dir = os.path.join(NEMO_ROOT,'data')
os.makedirs(data_dir, exist_ok=True)

/kaggle/working


In [3]:
org_path = glob("/kaggle/input/aiconnect-speaker-recognition/train/*/*.wav")
for path in org_path:
#     print(os.path.join(data_dir, path.split("/")[-2]))
    os.makedirs(os.path.join(data_dir, path.split("/")[-2]), exist_ok=True)
    y, sr = librosa.load(path)
    newy = librosa.resample(y, orig_sr = sr, target_sr = 16000)
    sf.write(os.path.join(data_dir, "/".join(path.split("/")[-2:])) , newy, 16000, subtype='PCM_24')



In [4]:
all_path = glob("/kaggle/working/data/*/*.wav")

# print(all_path)

all_df = pd.DataFrame(all_path, columns = ["audio_filepath"])
all_df["label"] = all_df["audio_filepath"].str.split('/').str[-2]

print(all_df)

                                          audio_filepath       label
0      /kaggle/working/data/IcsXs0U8hE/IcsXs0U8hE_18.wav  IcsXs0U8hE
1       /kaggle/working/data/IcsXs0U8hE/IcsXs0U8hE_8.wav  IcsXs0U8hE
2      /kaggle/working/data/IcsXs0U8hE/IcsXs0U8hE_11.wav  IcsXs0U8hE
3       /kaggle/working/data/IcsXs0U8hE/IcsXs0U8hE_7.wav  IcsXs0U8hE
4      /kaggle/working/data/IcsXs0U8hE/IcsXs0U8hE_10.wav  IcsXs0U8hE
...                                                  ...         ...
37529  /kaggle/working/data/NtUrXttX7D/NtUrXttX7D_14.wav  NtUrXttX7D
37530   /kaggle/working/data/NtUrXttX7D/NtUrXttX7D_7.wav  NtUrXttX7D
37531   /kaggle/working/data/NtUrXttX7D/NtUrXttX7D_8.wav  NtUrXttX7D
37532   /kaggle/working/data/NtUrXttX7D/NtUrXttX7D_5.wav  NtUrXttX7D
37533  /kaggle/working/data/NtUrXttX7D/NtUrXttX7D_17.wav  NtUrXttX7D

[37534 rows x 2 columns]


In [5]:
all_df["duration"] = all_df["audio_filepath"].apply(lambda x : librosa.get_duration(path=x))
print(all_df)

                                          audio_filepath       label  duration
0      /kaggle/working/data/IcsXs0U8hE/IcsXs0U8hE_18.wav  IcsXs0U8hE  3.497875
1       /kaggle/working/data/IcsXs0U8hE/IcsXs0U8hE_8.wav  IcsXs0U8hE  3.412562
2      /kaggle/working/data/IcsXs0U8hE/IcsXs0U8hE_11.wav  IcsXs0U8hE  4.777563
3       /kaggle/working/data/IcsXs0U8hE/IcsXs0U8hE_7.wav  IcsXs0U8hE  3.412562
4      /kaggle/working/data/IcsXs0U8hE/IcsXs0U8hE_10.wav  IcsXs0U8hE  4.095063
...                                                  ...         ...       ...
37529  /kaggle/working/data/NtUrXttX7D/NtUrXttX7D_14.wav  NtUrXttX7D  4.180375
37530   /kaggle/working/data/NtUrXttX7D/NtUrXttX7D_7.wav  NtUrXttX7D  4.692250
37531   /kaggle/working/data/NtUrXttX7D/NtUrXttX7D_8.wav  NtUrXttX7D  4.606937
37532   /kaggle/working/data/NtUrXttX7D/NtUrXttX7D_5.wav  NtUrXttX7D  3.156625
37533  /kaggle/working/data/NtUrXttX7D/NtUrXttX7D_17.wav  NtUrXttX7D  3.241938

[37534 rows x 3 columns]


In [6]:
from sklearn.model_selection import StratifiedKFold

skf = StratifiedKFold(n_splits=26, random_state = 11, shuffle=True)


train_idx , valid_idx = next(skf.split(all_df, all_df["label"]))

train_df = all_df.iloc[train_idx]
valid_df = all_df.iloc[valid_idx]

  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


In [7]:
print(all_df["label"].value_counts())
print(train_df)
print(valid_df)
print(train_df["label"].value_counts())

label
VNo8Yjdg7z    36
2QW2iHGEsR    36
0MkECoR1sC    36
IfQF0pG43X    36
B3XvjX8QkX    36
              ..
U7b7CPr0Rr     1
6wqhqNRNyO     1
GHcqB5KSDR     1
8JjCAArDMt     1
TUrDw0WLXO     1
Name: count, Length: 1592, dtype: int64
                                          audio_filepath       label  duration
0      /kaggle/working/data/IcsXs0U8hE/IcsXs0U8hE_18.wav  IcsXs0U8hE  3.497875
1       /kaggle/working/data/IcsXs0U8hE/IcsXs0U8hE_8.wav  IcsXs0U8hE  3.412562
2      /kaggle/working/data/IcsXs0U8hE/IcsXs0U8hE_11.wav  IcsXs0U8hE  4.777563
3       /kaggle/working/data/IcsXs0U8hE/IcsXs0U8hE_7.wav  IcsXs0U8hE  3.412562
4      /kaggle/working/data/IcsXs0U8hE/IcsXs0U8hE_10.wav  IcsXs0U8hE  4.095063
...                                                  ...         ...       ...
37529  /kaggle/working/data/NtUrXttX7D/NtUrXttX7D_14.wav  NtUrXttX7D  4.180375
37530   /kaggle/working/data/NtUrXttX7D/NtUrXttX7D_7.wav  NtUrXttX7D  4.692250
37531   /kaggle/working/data/NtUrXttX7D/NtUrXttX7D_8.wav

In [8]:
def write_file(name, lines):
    with open(name, 'w', encoding='utf-8') as fout:
        for line in lines:
        
            dic = line
            json.dump(dic, fout)
            fout.write('\n')
    print("wrote", name)

In [9]:

write_file(os.path.join(data_dir, "all_manifest.json"), all_df.to_dict('records'))
write_file(os.path.join(data_dir, "train.json"), train_df.to_dict('records'))
write_file(os.path.join(data_dir, "dev.json"), valid_df.to_dict('records'))

wrote /kaggle/working/data/all_manifest.json
wrote /kaggle/working/data/train.json
wrote /kaggle/working/data/dev.json


In [10]:
import nemo
# NeMo's ASR collection - This collection contains complete ASR models and
# building blocks (modules) for ASR
import nemo.collections.asr as nemo_asr
from omegaconf import OmegaConf

In [11]:
!wget -P conf https://raw.githubusercontent.com/NVIDIA/NeMo/main/examples/speaker_tasks/recognition/conf/titanet-finetune.yaml
MODEL_CONFIG = os.path.join(NEMO_ROOT,'conf/titanet-finetune.yaml')
finetune_config = OmegaConf.load(MODEL_CONFIG)
print(OmegaConf.to_yaml(finetune_config))

--2023-11-28 04:18:45--  https://raw.githubusercontent.com/NVIDIA/NeMo/main/examples/speaker_tasks/recognition/conf/titanet-finetune.yaml
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4507 (4.4K) [text/plain]
Saving to: ‘conf/titanet-finetune.yaml’


2023-11-28 04:18:45 (61.2 MB/s) - ‘conf/titanet-finetune.yaml’ saved [4507/4507]

name: TitaNet-Finetune
sample_rate: 16000
init_from_pretrained_model:
  speaker_tasks:
    name: titanet_large
    include:
    - preprocessor
    - encoder
    exclude:
    - decoder.final
model:
  train_ds:
    manifest_filepath: ???
    sample_rate: 16000
    labels: null
    batch_size: 64
    shuffle: true
    is_tarred: false
    tarred_audio_filepaths: null
    tarred_shard_strategy: scatter
    augmentor:
      

In [12]:
print(train_df["label"].value_counts())
print(valid_df["label"].value_counts())

label
VLFwa2gWGG    35
QyXzZZlUBO    35
CAFgG5x2Le    35
NANnkS9KVa    35
Pri4r7OGuS    35
              ..
LJ8zZEZpCz     1
2psBL0K3Ex     1
1YQF5V1Hqm     1
9O3u1HdnVl     1
CrEcA2d9sv     1
Name: count, Length: 1592, dtype: int64
label
RooXmxDuKj    2
III0D7MR6N    2
Q0LNS48LZy    2
J89PpRqIyA    2
SCSFQX4i3J    2
             ..
LFtYiw5IkT    1
S2ZFVG6gBA    1
KUkguc4yAS    1
GAwmJxouyK    1
NtUrXttX7D    1
Name: count, Length: 1345, dtype: int64


In [13]:
train_manifest = os.path.join(data_dir,'train.json')
valid_manifest = os.path.join(data_dir,'dev.json')
finetune_config.model.train_ds.manifest_filepath = train_manifest
finetune_config.model.validation_ds.manifest_filepath = valid_manifest
finetune_config.model.decoder.num_classes = 1592
finetune_config.model.train_ds.batch_size = 8
finetune_config.model.validation_ds.batch_size = 8
finetune_config.model.train_ds["num_workers"] = 4
finetune_config.model.validation_ds["num_workers"] = 4

In [14]:
import torch
import pytorch_lightning as pl
# Setup the new trainer object
# Let us modify some trainer configs for this demo
# Checks if we have GPU available and uses it
accelerator = 'gpu' if torch.cuda.is_available() else 'cpu'

trainer_config = OmegaConf.create(dict(
    devices=1,
    accelerator=accelerator,
    max_epochs=5,
    max_steps=-1,  # computed at runtime if not set
    num_nodes=1,
    accumulate_grad_batches=1,
    enable_checkpointing=False,  # Provided by exp_manager
    logger=False,  # Provided by exp_manager
    log_every_n_steps=1,  # Interval of logging.
    val_check_interval=1.0,  # Set to 0.25 to check 4 times per epoch, or an int for number of iterations
))
print(OmegaConf.to_yaml(trainer_config))

devices: 1
accelerator: gpu
max_epochs: 5
max_steps: -1
num_nodes: 1
accumulate_grad_batches: 1
enable_checkpointing: false
logger: false
log_every_n_steps: 1
val_check_interval: 1.0



In [15]:
trainer_finetune = pl.Trainer(**trainer_config,precision="16-mixed")

In [16]:
from nemo.utils.exp_manager import exp_manager
log_dir_finetune = exp_manager(trainer_finetune, finetune_config.get("exp_manager", None))
print(log_dir_finetune)

[NeMo I 2023-11-28 04:18:46 exp_manager:386] Experiments will be logged at /kaggle/working/nemo_experiments/TitaNet-Finetune/2023-11-28_04-18-46
[NeMo I 2023-11-28 04:18:46 exp_manager:825] TensorboardLogger has been set up
/kaggle/working/nemo_experiments/TitaNet-Finetune/2023-11-28_04-18-46


In [17]:
speaker_model = nemo_asr.models.EncDecSpeakerLabelModel(cfg=finetune_config.model, trainer=trainer_finetune)
speaker_model.maybe_init_from_pretrained_checkpoint(finetune_config)

# speaker_model.setup_training_data(finetune_config.model.train_ds)
# speaker_model.setup_validation_data(finetune_config.model.valid_ds)

[NeMo I 2023-11-28 04:18:48 collections:301] Filtered duration for loading collection is  0.00 hours.
[NeMo I 2023-11-28 04:18:48 collections:302] Dataset loaded with 36090 items, total duration of  40.11 hours.
[NeMo I 2023-11-28 04:18:48 collections:304] # 36090 files loaded accounting to # 1592 labels


[NeMo W 2023-11-28 04:18:48 label_models:187] Total number of 1592 found in all the manifest files.


[NeMo I 2023-11-28 04:18:48 collections:301] Filtered duration for loading collection is  0.00 hours.
[NeMo I 2023-11-28 04:18:48 collections:302] Dataset loaded with 36090 items, total duration of  40.11 hours.
[NeMo I 2023-11-28 04:18:48 collections:304] # 36090 files loaded accounting to # 1592 labels
[NeMo I 2023-11-28 04:18:49 collections:301] Filtered duration for loading collection is  0.00 hours.
[NeMo I 2023-11-28 04:18:49 collections:302] Dataset loaded with 1444 items, total duration of  1.63 hours.
[NeMo I 2023-11-28 04:18:49 collections:304] # 1444 files loaded accounting to # 1345 labels
[NeMo I 2023-11-28 04:18:49 features:289] PADDING: 16
[NeMo I 2023-11-28 04:18:49 cloud:68] Downloading from: https://api.ngc.nvidia.com/v2/models/nvidia/nemo/titanet_large/versions/v1/files/titanet-l.nemo to /root/.cache/torch/NeMo/NeMo_1.21.0/titanet-l/11ba0924fdf87c049e339adbf6899d48/titanet-l.nemo
[NeMo I 2023-11-28 04:18:53 common:913] Instantiating model from pre-trained checkpoint


[NeMo W 2023-11-28 04:18:53 modelPT:161] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.
    Train config : 
    manifest_filepath: /manifests/combined_fisher_swbd_voxceleb12_librispeech/train.json
    sample_rate: 16000
    labels: null
    batch_size: 64
    shuffle: true
    is_tarred: false
    tarred_audio_filepaths: null
    tarred_shard_strategy: scatter
    augmentor:
      noise:
        manifest_path: /manifests/noise/rir_noise_manifest.json
        prob: 0.5
        min_snr_db: 0
        max_snr_db: 15
      speed:
        prob: 0.5
        sr: 16000
        resample_type: kaiser_fast
        min_speed_rate: 0.95
        max_speed_rate: 1.05
    num_workers: 15
    pin_memory: true
    
[NeMo W 2023-11-28 04:18:53 modelPT:168] If you intend to do validation, please call the ModelPT.setup_validation_data() or ModelPT.setup_multiple_validation_data() method 

[NeMo I 2023-11-28 04:18:53 features:289] PADDING: 16
[NeMo I 2023-11-28 04:18:54 save_restore_connector:249] Model EncDecSpeakerLabelModel was successfully restored from /root/.cache/torch/NeMo/NeMo_1.21.0/titanet-l/11ba0924fdf87c049e339adbf6899d48/titanet-l.nemo.
[NeMo I 2023-11-28 04:18:54 modelPT:1150] Model checkpoint partially restored from pretrained checkpoint with name `titanet_large`
[NeMo I 2023-11-28 04:18:54 modelPT:1152] The following parameters were excluded when loading from pretrained checkpoint with name `titanet_large` : ['decoder.final.weight']
[NeMo I 2023-11-28 04:18:54 modelPT:1155] Make sure that this is what you wanted!


In [18]:
trainer_finetune.fit(speaker_model)

[NeMo I 2023-11-28 04:18:59 modelPT:728] Optimizer config = AdamW (
    Parameter Group 0
        amsgrad: False
        betas: (0.9, 0.999)
        capturable: False
        differentiable: False
        eps: 1e-08
        foreach: None
        fused: None
        lr: 0.0001
        maximize: False
        weight_decay: 0.0002
    
    Parameter Group 1
        amsgrad: False
        betas: (0.9, 0.999)
        capturable: False
        differentiable: False
        eps: 1e-08
        foreach: None
        fused: None
        lr: 0.001
        maximize: False
        weight_decay: 0.0002
    )
[NeMo I 2023-11-28 04:18:59 lr_scheduler:910] Scheduler "<nemo.core.optim.lr_scheduler.CosineAnnealing object at 0x7bcb4f374ee0>" 
    will be used during training (effective maximum steps = 22560) - 
    Parameters : 
    (warmup_ratio: 0.1
    min_lr: 0.0
    max_steps: 22560
    )


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

[NeMo I 2023-11-28 04:19:06 preemption:56] Preemption requires torch distributed to be initialized, disabling preemption


    


Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

In [19]:
speaker_model.save_to(os.path.join(log_dir_finetune, '..',"titanet-large-finetune.nemo"))