In [21]:
# """
# You can run either this notebook locally (if you have all the dependencies and a GPU) or on Google Colab.

# Instructions for setting up Colab are as follows:
# 1. Open a new Python 3 notebook.
# 2. Import this notebook from GitHub (File -> Upload Notebook -> "GITHUB" tab -> copy/paste GitHub URL)
# 3. Connect to an instance with a GPU (Runtime -> Change runtime type -> select "GPU" for hardware accelerator)
# 4. Run this cell to set up dependencies.
# """
# # If you're using Google Colab and not running locally, run this cell.

# # Install dependencies
# !pip install wget
# !apt-get install sox libsndfile1 ffmpeg
# !pip install unidecode

# ## Install NeMo
BRANCH = 'main'
# !python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[asr]

# # Install TorchAudio
# !pip install torchaudio>=0.10.0 -f https://download.pytorch.org/whl/torch_stable.html

In [22]:
import os
BRANCH = 'main'

os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "3"
# CUDA_VISIBLE_DEVICES options = "0,1,2,3". Make sure to restart kernel
!echo $CUDA_VISIBLE_DEVICES

3


In [3]:
import os
NEMO_ROOT = os.getcwd()
print(NEMO_ROOT)
import glob
import subprocess
import tarfile
import wget

data_dir = os.path.join(NEMO_ROOT,'data')
os.makedirs(data_dir, exist_ok=True)

# Download the dataset. This will take a few moments...
print("******")
if not os.path.exists(data_dir + '/an4_sphere.tar.gz'):
    an4_url = 'https://dldata-public.s3.us-east-2.amazonaws.com/an4_sphere.tar.gz'  # for the original source, please visit http://www.speech.cs.cmu.edu/databases/an4/an4_sphere.tar.gz 
    an4_path = wget.download(an4_url, data_dir)
    print(f"Dataset downloaded at: {an4_path}")
else:
    print("Tarfile already exists.")
    an4_path = data_dir + '/an4_sphere.tar.gz'

# Untar and convert .sph to .wav (using sox)
tar = tarfile.open(an4_path)
tar.extractall(path=data_dir)

print("Converting .sph to .wav...")
sph_list = glob.glob(data_dir + '/an4/**/*.sph', recursive=True)
for sph_path in sph_list:
    wav_path = sph_path[:-4] + '.wav'
    cmd = ["sox", sph_path, wav_path]
    subprocess.run(cmd)
print("Finished conversion.\n******")

/home/DATA/amit_kesari/SD1/NeMo-Nvidia
******
Tarfile already exists.
Converting .sph to .wav...
Finished conversion.
******


In [4]:
# !find {data_dir}/an4/wav/an4_clstk  -iname "*.wav" > data/an4/wav/an4_clstk/train_all.txt
# !cat data/an4/wav/an4_clstk/train_all.txt
# -----
# create a list file which has all the wav files with absolute paths for each of the train, dev, and test set

# use headset dataset for now
!find /home/DATA/amit_kesari/downloaded-big-datasets/AMI_Headset/headset -iname "*.wav" > data/ami_headset/train_all.txt
!cat {data_dir}/ami_headset/train_all.txt

/home/DATA/amit_kesari/downloaded-big-datasets/AMI_Headset/headset/EN2001a/audio/EN2001a.Headset-0.wav
/home/DATA/amit_kesari/downloaded-big-datasets/AMI_Headset/headset/EN2001a/audio/EN2001a.Headset-1.wav
/home/DATA/amit_kesari/downloaded-big-datasets/AMI_Headset/headset/EN2001a/audio/EN2001a.Headset-2.wav
/home/DATA/amit_kesari/downloaded-big-datasets/AMI_Headset/headset/EN2001a/audio/EN2001a.Headset-3.wav
/home/DATA/amit_kesari/downloaded-big-datasets/AMI_Headset/headset/EN2001a/audio/EN2001a.Headset-4.wav
/home/DATA/amit_kesari/downloaded-big-datasets/AMI_Headset/headset/EN2001b/audio/EN2001b.Headset-0.wav
/home/DATA/amit_kesari/downloaded-big-datasets/AMI_Headset/headset/EN2001b/audio/EN2001b.Headset-1.wav
/home/DATA/amit_kesari/downloaded-big-datasets/AMI_Headset/headset/EN2001b/audio/EN2001b.Headset-2.wav
/home/DATA/amit_kesari/downloaded-big-datasets/AMI_Headset/headset/EN2001b/audio/EN2001b.Headset-3.wav
/home/DATA/amit_kesari/downloaded-big-datasets/AMI_Headset/heads

In [5]:
# if not os.path.exists('scripts'):
#   print("Downloading necessary scripts")
#   #TODO: change to python
#   !mkdir -p scripts/speaker_tasks
#   !wget -P scripts/speaker_tasks/ https://raw.githubusercontent.com/NVIDIA/NeMo/$BRANCH/scripts/speaker_tasks/filelist_to_manifest.py
# !python {NEMO_ROOT}/scripts/speaker_tasks/filelist_to_manifest.py --filelist {data_dir}/an4/wav/an4_clstk/train_all.txt --id -2 --out {data_dir}/an4/wav/an4_clstk/all_manifest.json --split

# ------

#  convert this text file to a manifest file 
# optionally split the files to train \& dev for evaluating the models during training by using the --split flag
if not os.path.exists('scripts'):
  print("Downloading necessary scripts")
  #TODO: change to python
  !mkdir -p scripts/speaker_tasks
  !wget -P scripts/speaker_tasks/ https://raw.githubusercontent.com/NVIDIA/NeMo/$BRANCH/scripts/speaker_tasks/filelist_to_manifest.py
!python {NEMO_ROOT}/scripts/speaker_tasks/filelist_to_manifest.py --filelist {data_dir}/ami_headset/train_all.txt --id -3 --out {data_dir}/ami_headset/all_manifest.json 
!head -n 10 {data_dir}/ami_headset/train_all_manifest.json
# --id 3 means from last slash 3rd label name of train_all.txt (see script for more info)
# add --split for train and dev
# TODO - test and train manifest are same

# Format:
# manifest file describes a training sample 
# - audio_filepath contains the path to the wav file
# - duration it's duration in seconds, and 
# - label is the speaker class label:
# {"audio_filepath": "<absolute path to dataset>data/an4/wav/an4test_clstk/menk/cen4-menk-b.wav", "duration": 3.9, "label": "menk"}

100%|████████████████████████████████████████| 687/687 [00:00<00:00, 695.43it/s]
wrote /home/DATA/amit_kesari/SD1/NeMo-Nvidia/data/ami_headset/train_all_manifest.json
100%|█████████████████████████████████████| 687/687 [00:00<00:00, 218294.46it/s]
wrote /home/DATA/amit_kesari/SD1/NeMo-Nvidia/data/ami_headset/all_manifest.json
{"audio_filepath": "/home/DATA/amit_kesari/downloaded-big-datasets/AMI_Headset/headset/EN2001a/audio/EN2001a.Headset-0.wav", "offset": 0, "duration": 5250.240063, "label": "EN2001a"}
{"audio_filepath": "/home/DATA/amit_kesari/downloaded-big-datasets/AMI_Headset/headset/EN2001a/audio/EN2001a.Headset-1.wav", "offset": 0, "duration": 5250.240063, "label": "EN2001a"}
{"audio_filepath": "/home/DATA/amit_kesari/downloaded-big-datasets/AMI_Headset/headset/EN2001a/audio/EN2001a.Headset-2.wav", "offset": 0, "duration": 5250.240063, "label": "EN2001a"}
{"audio_filepath": "/home/DATA/amit_kesari/downloaded-big-datasets/AMI_Headset/headset/EN2001a/audio/EN2001a.Headset-3.wav"

In [6]:
# !find {data_dir}/an4/wav/an4test_clstk  -iname "*.wav" > {data_dir}/an4/wav/an4test_clstk/test_all.txt
# !python {NEMO_ROOT}/scripts/speaker_tasks/filelist_to_manifest.py --filelist {data_dir}/an4/wav/an4test_clstk/test_all.txt --id -2 --out {data_dir}/an4/wav/an4test_clstk/test.json
# ---

In [7]:
# train_manifest = os.path.join(data_dir,'an4/wav/an4_clstk/train.json')
# validation_manifest = os.path.join(data_dir,'an4/wav/an4_clstk/dev.json')
# test_manifest = os.path.join(data_dir,'an4/wav/an4_clstk/dev.json')
# -----------

# NOTE!!! - all labels should be present in train.json (Eg: 2003a is in dev set, still that label should be in train)
# path to manifest
train_manifest = os.path.join(data_dir,'ami_headset/train.json')
validation_manifest = os.path.join(data_dir,'ami_headset/dev.json')
test_manifest = os.path.join(data_dir,'ami_headset/test.json')
print(f"Paths:  \n{train_manifest} \n{validation_manifest} \n{test_manifest}")

Paths:  
/home/DATA/amit_kesari/SD1/NeMo-Nvidia/data/ami_headset/train.json 
/home/DATA/amit_kesari/SD1/NeMo-Nvidia/data/ami_headset/dev.json 
/home/DATA/amit_kesari/SD1/NeMo-Nvidia/data/ami_headset/test.json


# Training with config

# train for speaker embeding -> use it on SD

In [8]:
import nemo
# NeMo's ASR collection - This collection contains complete ASR models and
# building blocks (modules) for ASR
import nemo.collections.asr as nemo_asr
from nemo.collections.asr.models import ClusteringDiarizer
from omegaconf import OmegaConf
from nemo.collections.asr.parts.utils.speaker_utils import rttm_to_labels, labels_to_pyannote_object
# since for evaluation we use pyannote.metrics, convert rttm formats to pyannote Annotation objects

  warn(f"Failed to load image Python extension: {e}")
[NeMo W 2022-07-18 20:48:56 optimizers:55] Apex was not found. Using the lamb or fused_adam optimizer will error out.


In [9]:
# The TitaNet model is defined in a config file about:
# 1) model: All arguments that will relate to the Model - preprocessors, encoder, decoder, optimizer and schedulers, datasets etc
# 2) trainer: Any argument to be passed to PyTorch Lightning

if not os.path.exists('conf/titanet-large.yaml'):
    print("Downloading necessary scripts")
    !mkdir conf 
    !wget -P conf https://raw.githubusercontent.com/NVIDIA/NeMo/$BRANCH/examples/speaker_tasks/recognition/conf/titanet-large.yaml

MODEL_CONFIG = os.path.join(NEMO_ROOT,'conf/titanet-large.yaml')

# !wget -P conf https://raw.githubusercontent.com/NVIDIA/NeMo/$BRANCH/examples/speaker_tasks/diarization/conf/offline_diarization.yaml
# MODEL_CONFIG = os.path.join(NEMO_ROOT,'conf/offline_diarization.yaml')

config = OmegaConf.load(MODEL_CONFIG)
print(OmegaConf.to_yaml(config))

name: TitaNet
sample_rate: 16000
model:
  train_ds:
    manifest_filepath: ???
    sample_rate: 16000
    labels: null
    batch_size: 64
    shuffle: true
    is_tarred: false
    tarred_audio_filepaths: null
    tarred_shard_strategy: scatter
    augmentor:
      noise:
        manifest_path: null
        prob: 0.3
        min_snr_db: 0
        max_snr_db: 30
      speed:
        prob: 0.3
        sr: 16000
        resample_type: kaiser_fast
        min_speed_rate: 0.95
        max_speed_rate: 1.05
  validation_ds:
    manifest_filepath: ???
    sample_rate: 16000
    labels: null
    batch_size: 128
    shuffle: false
  model_defaults:
    filters: 1024
    repeat: 3
    dropout: 0.1
    separable: true
    se: true
    se_context_size: -1
    kernel_size_factor: 1.0
  preprocessor:
    _target_: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor
    normalize: per_feature
    window_size: 0.025
    sample_rate: 16000
    window_stride: 0.01
    window: hann
    features

In [10]:
# Setting up the train_ds, validation_ds, test_ds datasets and dataloaders within the config
print(OmegaConf.to_yaml(config.model.train_ds))
print(OmegaConf.to_yaml(config.model.validation_ds))

manifest_filepath: ???
sample_rate: 16000
labels: null
batch_size: 64
shuffle: true
is_tarred: false
tarred_audio_filepaths: null
tarred_shard_strategy: scatter
augmentor:
  noise:
    manifest_path: null
    prob: 0.3
    min_snr_db: 0
    max_snr_db: 30
  speed:
    prob: 0.3
    sr: 16000
    resample_type: kaiser_fast
    min_speed_rate: 0.95
    max_speed_rate: 1.05

manifest_filepath: ???
sample_rate: 16000
labels: null
batch_size: 128
shuffle: false



In [11]:
# add some configs
config.model.train_ds.manifest_filepath = train_manifest
config.model.validation_ds.manifest_filepath = validation_manifest
# config.model.test_ds.manifest_filepath = test_manifest TODO add to to config

config.model.decoder.num_classes = 74
# TODO: change num speaker


In [12]:
# NeMo models are primarily PyTorch Lightning modules 
import torch
import pytorch_lightning as pl

In [13]:
print("Trainer config - \n")
print(OmegaConf.to_yaml(config.trainer))

Trainer config - 

devices: 1
max_epochs: 250
max_steps: -1
num_nodes: 1
accelerator: gpu
strategy: ddp
deterministic: true
enable_checkpointing: false
logger: false
log_every_n_steps: 1
val_check_interval: 1.0



In [16]:
# Let us modify some trainer configs for this demo
# Checks if we have GPU available and uses it
accelerator = 'gpu' if torch.cuda.is_available() else 'cpu'
config.trainer.devices = 1
config.trainer.accelerator = accelerator

# Reduces maximum number of epochs to 5 for quick demonstration
config.trainer.max_epochs = 5

# Remove distributed training flags
config.trainer.strategy = None

# Remove augmentations
config.model.train_ds.augmentor=None

# init
trainer = pl.Trainer(**config.trainer)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
`Trainer(val_check_interval=1.0)` was configured so validation will run at the end of the training epoch..


In [17]:
# setup the experiment
from nemo.utils.exp_manager import exp_manager
log_dir = exp_manager(trainer, config.get("exp_manager", None))
# The log_dir provides a path to the current logging directory for easy access
print(log_dir)

[NeMo I 2022-07-18 21:08:36 exp_manager:287] Experiments will be logged at /home/DATA/amit_kesari/SD1/NeMo-Nvidia/nemo_experiments/TitaNet/2022-07-18_21-08-36
[NeMo I 2022-07-18 21:08:36 exp_manager:661] TensorboardLogger has been set up


      rank_zero_deprecation("`Trainer.weights_save_path` has been deprecated in v1.6 and will be removed in v1.8.")
    
[NeMo W 2022-07-18 21:08:36 exp_manager:895] The checkpoint callback was told to monitor a validation value and trainer's max_steps was set to -1. Please ensure that max_steps will run for at least 1 epochs to ensure that checkpointing will not error out.


/home/DATA/amit_kesari/SD1/NeMo-Nvidia/nemo_experiments/TitaNet/2022-07-18_21-08-36


In [18]:
# TitaNet is a speaker embedding extractor model that can be used for speaker identification tasks 
# it generates one label for the entire provided audio stream. Therefore we encapsulate it inside the EncDecSpeakerLabelModel as follows.

speaker_model = nemo_asr.models.EncDecSpeakerLabelModel(cfg=config.model, trainer=trainer)


[NeMo I 2022-07-18 21:08:40 collections:289] Filtered duration for loading collection is 0.000000.
[NeMo I 2022-07-18 21:08:40 collections:293] # 678 files loaded accounting to # 171 labels


[NeMo W 2022-07-18 21:08:40 label_models:133] Total number of 171 found in all the manifest files.


[NeMo I 2022-07-18 21:08:40 collections:289] Filtered duration for loading collection is 0.000000.
[NeMo I 2022-07-18 21:08:40 collections:293] # 678 files loaded accounting to # 171 labels
[NeMo I 2022-07-18 21:08:40 collections:289] Filtered duration for loading collection is 0.000000.
[NeMo I 2022-07-18 21:08:40 collections:293] # 6 files loaded accounting to # 6 labels
[NeMo I 2022-07-18 21:08:40 features:200] PADDING: 16
[NeMo I 2022-07-18 21:08:40 label_models:100] loss is Angular Softmax


                    not been set for this class (TopKClassificationAccuracy). The property determines if `update` by
                    default needs access to the full metric state. If this is not the case, significant speedups can be
                    achieved and we recommend setting this to `False`.
                    We provide an checking function
                    `from torchmetrics.utilities import check_forward_no_full_state`
                    that can be used to check if the `full_state_update=True` (old and potential slower behaviour,
                    default for now) or if `full_state_update=False` can be used safely.
                    
    


In [None]:
# # for training
# trainer.fit(speaker_model)



In [None]:
# # for testing
# trainer.test(speaker_model, ckpt_path=None)

In [23]:
# finetune
# !wget -P conf https://raw.githubusercontent.com/NVIDIA/NeMo/$BRANCH/examples/speaker_tasks/recognition/conf/titanet-finetune.yaml
MODEL_CONFIG = os.path.join(NEMO_ROOT,'conf/titanet-finetune.yaml')
finetune_config = OmegaConf.load(MODEL_CONFIG)
print(OmegaConf.to_yaml(finetune_config))

--2022-07-18 21:10:24--  https://raw.githubusercontent.com/NVIDIA/NeMo/main/examples/speaker_tasks/recognition/conf/titanet-finetune.yaml
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4335 (4.2K) [text/plain]
Saving to: ‘conf/titanet-finetune.yaml’


2022-07-18 21:10:24 (24.5 MB/s) - ‘conf/titanet-finetune.yaml’ saved [4335/4335]

name: TitaNet-Finetune
sample_rate: 16000
init_from_pretrained_model:
  speaker_tasks:
    name: titanet_large
    include:
    - preprocessor
    - encoder
    exclude:
    - decoder.final
model:
  train_ds:
    manifest_filepath: ???
    sample_rate: 16000
    labels: null
    batch_size: 64
    shuffle: true
    is_tarred: false
    tarred_audio_filepaths: null
    tarred_shard_strategy: scatter
    augmentor:
      speed:
   

In [24]:
test_manifest = os.path.join(data_dir,'ami_headset/test.json')
finetune_config.model.train_ds.manifest_filepath = test_manifest
finetune_config.model.validation_ds.manifest_filepath = test_manifest
finetune_config.model.decoder.num_classes = 10

In [25]:
# Setup the new trainer object
# Let us modify some trainer configs for this demo
# Checks if we have GPU available and uses it
accelerator = 'gpu' if torch.cuda.is_available() else 'cpu'

trainer_config = OmegaConf.create(dict(
    devices=1,
    accelerator=accelerator,
    max_epochs=5,
    max_steps=None,  # computed at runtime if not set
    num_nodes=1,
    accumulate_grad_batches=1,
    enable_checkpointing=False,  # Provided by exp_manager
    logger=False,  # Provided by exp_manager
    log_every_n_steps=1,  # Interval of logging.
    val_check_interval=1.0,  # Set to 0.25 to check 4 times per epoch, or an int for number of iterations
))
print(OmegaConf.to_yaml(trainer_config))

devices: 1
accelerator: gpu
max_epochs: 5
max_steps: null
num_nodes: 1
accumulate_grad_batches: 1
enable_checkpointing: false
logger: false
log_every_n_steps: 1
val_check_interval: 1.0



In [26]:
trainer_finetune = pl.Trainer(**trainer_config)

      rank_zero_deprecation(
    
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
`Trainer(val_check_interval=1.0)` was configured so validation will run at the end of the training epoch..


In [27]:
log_dir_finetune = exp_manager(trainer_finetune, config.get("exp_manager", None))
print(log_dir_finetune)

[NeMo I 2022-07-18 21:11:45 exp_manager:287] Experiments will be logged at /home/DATA/amit_kesari/SD1/NeMo-Nvidia/nemo_experiments/TitaNet/2022-07-18_21-08-36
[NeMo I 2022-07-18 21:11:45 exp_manager:661] TensorboardLogger has been set up


      rank_zero_deprecation("`Trainer.weights_save_path` has been deprecated in v1.6 and will be removed in v1.8.")
    
[NeMo W 2022-07-18 21:11:45 exp_manager:895] The checkpoint callback was told to monitor a validation value and trainer's max_steps was set to -1. Please ensure that max_steps will run for at least 1 epochs to ensure that checkpointing will not error out.


/home/DATA/amit_kesari/SD1/NeMo-Nvidia/nemo_experiments/TitaNet/2022-07-18_21-08-36


In [28]:
speaker_model = nemo_asr.models.EncDecSpeakerLabelModel(cfg=finetune_config.model, trainer=trainer_finetune)
speaker_model.maybe_init_from_pretrained_checkpoint(finetune_config)

[NeMo I 2022-07-18 21:11:54 collections:289] Filtered duration for loading collection is 0.000000.
[NeMo I 2022-07-18 21:11:54 collections:293] # 3 files loaded accounting to # 3 labels


[NeMo W 2022-07-18 21:11:54 label_models:133] Total number of 3 found in all the manifest files.


[NeMo I 2022-07-18 21:11:54 collections:289] Filtered duration for loading collection is 0.000000.
[NeMo I 2022-07-18 21:11:54 collections:293] # 3 files loaded accounting to # 3 labels
[NeMo I 2022-07-18 21:11:54 collections:289] Filtered duration for loading collection is 0.000000.
[NeMo I 2022-07-18 21:11:54 collections:293] # 3 files loaded accounting to # 3 labels
[NeMo I 2022-07-18 21:11:54 features:200] PADDING: 16
[NeMo I 2022-07-18 21:11:54 label_models:100] loss is Angular Softmax


                    not been set for this class (TopKClassificationAccuracy). The property determines if `update` by
                    default needs access to the full metric state. If this is not the case, significant speedups can be
                    achieved and we recommend setting this to `False`.
                    We provide an checking function
                    `from torchmetrics.utilities import check_forward_no_full_state`
                    that can be used to check if the `full_state_update=True` (old and potential slower behaviour,
                    default for now) or if `full_state_update=False` can be used safely.
                    
    


[NeMo I 2022-07-18 21:11:54 cloud:56] Found existing object /root/.cache/torch/NeMo/NeMo_1.10.0/titanet-l/492c0ab8416139171dc18c21879a9e45/titanet-l.nemo.
[NeMo I 2022-07-18 21:11:54 cloud:62] Re-using file from: /root/.cache/torch/NeMo/NeMo_1.10.0/titanet-l/492c0ab8416139171dc18c21879a9e45/titanet-l.nemo
[NeMo I 2022-07-18 21:11:54 common:789] Instantiating model from pre-trained checkpoint


[NeMo W 2022-07-18 21:11:55 modelPT:148] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.
    Train config : 
    manifest_filepath: /manifests/combined_fisher_swbd_voxceleb12_librispeech/train.json
    sample_rate: 16000
    labels: null
    batch_size: 64
    shuffle: true
    time_length: 3
    is_tarred: false
    tarred_audio_filepaths: null
    tarred_shard_strategy: scatter
    augmentor:
      noise:
        manifest_path: /manifests/noise/rir_noise_manifest.json
        prob: 0.5
        min_snr_db: 0
        max_snr_db: 15
      speed:
        prob: 0.5
        sr: 16000
        resample_type: kaiser_fast
        min_speed_rate: 0.95
        max_speed_rate: 1.05
    num_workers: 15
    pin_memory: true
    
[NeMo W 2022-07-18 21:11:55 modelPT:155] If you intend to do validation, please call the ModelPT.setup_validation_data() or ModelPT.setup_multiple_valida

[NeMo I 2022-07-18 21:11:55 features:200] PADDING: 16
[NeMo I 2022-07-18 21:11:55 label_models:100] loss is Angular Softmax
[NeMo I 2022-07-18 21:11:56 save_restore_connector:243] Model EncDecSpeakerLabelModel was successfully restored from /root/.cache/torch/NeMo/NeMo_1.10.0/titanet-l/492c0ab8416139171dc18c21879a9e45/titanet-l.nemo.
[NeMo I 2022-07-18 21:11:56 modelPT:912] Model checkpoint partially restored from pretrained chackpoint with name `titanet_large`
[NeMo I 2022-07-18 21:11:56 modelPT:914] The following parameters were excluded from loading from pretrained chackpoint with name `titanet_large` : ['decoder.final.weight']
[NeMo I 2022-07-18 21:11:56 modelPT:917] Make sure that this is what you wanted!


In [29]:
## Fine-tuning for 5 epochs¶
trainer_finetune.fit(speaker_model)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [3]


[NeMo I 2022-07-18 21:12:09 modelPT:579] Optimizer config = AdamW (
    Parameter Group 0
        amsgrad: False
        betas: (0.9, 0.999)
        eps: 1e-08
        lr: 0.0001
        maximize: False
        weight_decay: 0.0002
    )
[NeMo I 2022-07-18 21:12:09 lr_scheduler:833] Scheduler "<nemo.core.optim.lr_scheduler.CosineAnnealing object at 0x7f7a3ae43940>" 
    will be used during training (effective maximum steps = 5) - 
    Parameters : 
    (warmup_ratio: 0.1
    min_lr: 0.0
    max_steps: 5
    )



  | Name         | Type                              | Params
-------------------------------------------------------------------
0 | preprocessor | AudioToMelSpectrogramPreprocessor | 0     
1 | encoder      | ConvASREncoder                    | 19.4 M
2 | decoder      | SpeakerDecoder                    | 2.8 M 
3 | loss         | AngularSoftmaxLoss                | 0     
4 | _accuracy    | TopKClassificationAccuracy        | 0     
-------------------------------------------------------------------
22.1 M    Trainable params
0         Non-trainable params
22.1 M    Total params
88.497    Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

      rank_zero_warn(
    


RuntimeError: CUDA out of memory. Tried to allocate 6.01 GiB (GPU 0; 23.65 GiB total capacity; 19.52 GiB already allocated; 475.44 MiB free; 20.00 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [None]:
# add manifest then
# oracle_model = ClusteringDiarizer(cfg=config)
