## Training FastPitch
___



In [1]:
# Manifest locations, grouped per corpus as (train, test) pairs.
# RUSLAN manifests live in the shared datasets directory.
train_ruslan, test_ruslan = (
    '../../datasets/RUSLAN/train_manifest.json',
    '../../datasets/RUSLAN/test_manifest.json',
)
# The "mcv" manifests are resolved relative to the notebook's working directory.
train_mcv, test_mcv = ('train.json', 'test.json')

In [1]:
import os

import torch
import IPython.display as ipd
from matplotlib.pyplot import imshow
from matplotlib import pyplot as plt
from nemo.collections.tts.models import FastPitchModel
from pathlib import Path

from omegaconf import DictConfig, OmegaConf, open_dict

try:
    from ruamel.yaml import YAML
except ModuleNotFoundError:
    from ruamel_yaml import YAML

[NeMo W 2022-03-27 09:40:55 optimizers:55] Apex was not found. Using the lamb or fused_adam optimizer will error out.
################################################################################
###          (please add 'export KALDI_ROOT=<your_path>' in your $HOME/.profile)
###          (or run as: KALDI_ROOT=<your_path> python <your_script>.py)
################################################################################

[NeMo W 2022-03-27 09:40:57 experimental:27] Module <class 'nemo.collections.nlp.data.language_modeling.megatron.megatron_batch_samplers.MegatronPretrainingRandomBatchSampler'> is experimental, not ready for production and is not fully supported. Use at your own risk.


In [2]:
from nemo.collections.tts.models import FastPitchModel
import pytorch_lightning as pl

from nemo.collections.common.callbacks import LogEpochTimeCallback
from nemo.collections.tts.models import FastPitchModel
from nemo.core.config import hydra_runner
from nemo.utils import logging
from nemo.utils.exp_manager import exp_manager
import noisereduce as nr

from nemo.collections.tts.torch.g2ps import EnglishG2p
from nemo.collections.tts.torch.data import TTSDataset
from nemo_text_processing.text_normalization.normalize import Normalizer
from nemo.collections.tts.torch.tts_tokenizers import EnglishPhonemesTokenizer, EnglishCharsTokenizer

In [None]:
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Launch FastPitch fine-tuning as a shell command from the notebook.
# The script fastpitch2_finetune.py is run with HYDRA_FULL_ERROR=1 set in its
# environment and the following command-line overrides:
#   - config file: fastpitch_align.yaml
#   - train / validation manifests: train.json / test.json
#   - prior_folder: ./Priors_qqq
#   - batch size 8 for training, 1 for validation
#   - experiment output directory: ./fastpitch_exp_manager
#   - optimizer: adam
#   - starting weights: ./tts_en_fastpitch_align.nemo
!HYDRA_FULL_ERROR=1 python fastpitch2_finetune.py \
    --config-name=fastpitch_align.yaml \
    train_dataset='train.json'\
    validation_datasets='test.json'\
    prior_folder=./Priors_qqq \
    model.train_ds.dataloader_params.batch_size=8 \
    model.validation_ds.dataloader_params.batch_size=1 \
    exp_manager.exp_dir=./fastpitch_exp_manager \
    model.optim.name=adam \
    +init_from_nemo_model=./tts_en_fastpitch_align.nemo

# Disabled overrides kept for reference; they are NOT part of the command above.
# Learning-rate, epoch-count and log-guard overrides:
#     model.optim.lr=1e-1 \
#     trainer.max_epochs=30 \
#     model.preprocessor.log_zero_guard_value=1e-2 \

# Alternative that adds a step limit and removes the epoch limit:
#     +trainer.max_steps=1000 \
#     ~trainer.max_epochs \
#     +init_from_nemo_model=./tts_en_fastpitch_align.nemo



[NeMo W 2022-02-12 23:25:31 optimizers:47] Apex was not found. Using the lamb optimizer will error out.
    
[NeMo W 2022-02-12 23:25:32 nmse_clustering:54] Using eigen decomposition from scipy, upgrade torch to 1.9 or higher for faster clustering
################################################################################
###          (please add 'export KALDI_ROOT=<your_path>' in your $HOME/.profile)
###          (or run as: KALDI_ROOT=<your_path> python <your_script>.py)
################################################################################

      '"sox" backend is being deprecated. '
    
[NeMo W 2022-02-12 23:25:32 experimental:28] Module <class 'nemo.collections.asr.data.audio_to_text_dali._AudioTextDALIDataset'> is experimental, not ready for production and is not fully supported. Use at your own risk.
[NeMo W 2022-02-12 23:25:32 fastpitch2_finetune:27] You are using an optimizer scheduler while finetuning. Are you sure this is intended?
GPU available: True, used: 

In [2]:
from nemo.collections.tts.models import HifiGanModel
from nemo.collections.tts.models import FastPitchModel

# Fetch the pretrained HiFi-GAN vocoder registered under the name "tts_hifigan".
vocoder = HifiGanModel.from_pretrained("tts_hifigan")
# Switch the vocoder to evaluation mode and move it to the GPU; a CUDA device is
# therefore required from this cell onwards.
vocoder.eval().cuda()

In [5]:
def infer(spec_gen_model, vocoder_model, str_input, speaker = None):
    """
    Synthesizes spectrogram and audio from a text string given a spectrogram synthesis and vocoder model.

    Arguments:
    spec_gen_model -- Instance of FastPitch model (must provide parse() and generate_spectrogram())
    vocoder_model -- Instance of a vocoder model (HiFiGAN in our case; must provide
                     convert_spectrogram_to_audio())
    str_input -- Text input for the synthesis
    speaker -- Speaker number (in the case of a multi-speaker model -- in the mixing case).
               None means no speaker id is passed to the spectrogram model.

    Returns:
    spectrogram, waveform of the synthesized audio. Tensor results are converted to
    NumPy arrays on the CPU; a 3-D spectrogram is reduced to its first batch item.
    """
    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        tokens = spec_gen_model.parse(str_input)
        if speaker is not None:
            speaker = torch.tensor([speaker]).long()
            # Place the speaker id on the same device as the parsed tokens rather
            # than unconditionally on CUDA, so CPU-only or multi-GPU setups work.
            # If the tokens expose no device, keep the previous behaviour (CUDA).
            token_device = getattr(tokens, "device", None)
            if token_device is not None:
                speaker = speaker.to(token_device)
            else:
                speaker = speaker.cuda()
        spectrogram = spec_gen_model.generate_spectrogram(tokens=tokens, speaker=speaker)
        audio = vocoder_model.convert_spectrogram_to_audio(spec=spectrogram)

    if spectrogram is not None:
        if isinstance(spectrogram, torch.Tensor):
            spectrogram = spectrogram.to('cpu').numpy()
        # Drop the leading (batch) axis so the result can be plotted directly.
        if len(spectrogram.shape) == 3:
            spectrogram = spectrogram[0]
    if isinstance(audio, torch.Tensor):
        audio = audio.to('cpu').numpy()
    return spectrogram, audio

def get_best_ckpt(experiment_base_dir, new_speaker_id, duration_mins, mixing_enabled, original_speaker_id):
    """
    Gives the model checkpoint paths of an experiment we ran.

    The experiment directory is
    "<experiment_base_dir>/<original_speaker_id>_to_<new_speaker_id>_[no_]mixing_<duration_mins>_mins"
    and is searched recursively for "*.ckpt" files. The validation loss is read from
    the "v_loss=<value>-epoch" part of each file name.

    Arguments:
    experiment_base_dir -- Base experiment directory (specified on top of this notebook as exp_base_dir)
    new_speaker_id -- Speaker id of new HiFiTTS speaker we finetuned FastPitch on
    duration_mins -- total minutes of the new speaker data
    mixing_enabled -- True or False depending on whether we want to mix the original speaker data or not
    original_speaker_id -- speaker id of the original HiFiTTS speaker

    Returns:
    List of (validation error, checkpoint path) tuples sorted by validation error
    (lowest first), and the path of the checkpoint whose file name contains "last"
    (None if there is no such file). Checkpoints whose name carries no parsable
    "v_loss=" value are left out of the list instead of raising.
    """
    mixing_tag = "mixing" if mixing_enabled else "no_mixing"
    exp_dir = "{}/{}_to_{}_{}_{}_mins".format(
        experiment_base_dir, original_speaker_id, new_speaker_id, mixing_tag, duration_mins
    )

    ckpt_candidates = []
    last_ckpt = None
    for root, _dirs, files in os.walk(exp_dir):
        for file in files:
            if not file.endswith(".ckpt"):
                continue
            ckpt_path = os.path.join(root, file)
            # Record the "last" checkpoint before parsing, so a name without a
            # loss value (e.g. "last.ckpt") is still reported.
            if "last" in file:
                last_ckpt = ckpt_path
            _, marker, tail = file.partition("v_loss=")
            if not marker:
                continue
            try:
                val_error = float(tail.split("-epoch")[0])
            except ValueError:
                # Loss field present but malformed: skip it for ranking purposes.
                continue
            ckpt_candidates.append((val_error, ckpt_path))
    ckpt_candidates.sort()

    return ckpt_candidates, last_ckpt

In [7]:
import json

In [25]:
# Settings describing the fine-tuning run whose checkpoint is loaded below.
duration_mins = 5
mixing = False
# NOTE(review): absolute, machine-specific path; adjust to your own experiment directory.
last_ckpt = '/media/boris/F/NeMo_own_research/tts/fastpitch_exp_manager/FastPitch/2022-01-31_02-26-12/checkpoints/FastPitch--v_loss=0.1481-epoch=20.ckpt'

spec_model = FastPitchModel.load_from_checkpoint(last_ckpt)
# Mirror the vocoder cell: put the spectrogram model in evaluation mode and on the
# GPU, so its output is on the same device as the vocoder that consumes it.
# The trailing ';' suppresses the model repr in the cell output.
spec_model.eval().cuda();

[NeMo E 2022-03-26 13:12:41 vocabs:324] Torch distributed needs to be initialized before you initialized <nemo.collections.common.data.vocabs.Phonemes object at 0x7fec4effde20>. This class is prone to data access race conditions. Now downloading corpora from global rank 0. If other ranks pass this before rank 0, errors might result.
[NeMo W 2022-03-26 13:12:44 modelPT:148] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.
    Train config : 
    dataset:
      _target_: nemo.collections.asr.data.audio_to_text.AudioToCharWithPriorAndPitchDataset
      manifest_filepath: ../../datasets/RUSLAN/train_manifest.json
      max_duration: null
      min_duration: 0.1
      int_values: false
      normalize: true
      sample_rate: 22050
      trim: false
      sup_data_path: ./priors_test
      n_window_stride: 256
      n_window_size: 1024
      pitch_fmin: 52.06
      pitch_f

[NeMo I 2022-03-26 13:12:44 features:259] PADDING: 1
[NeMo I 2022-03-26 13:12:44 features:276] STFT using torch


In [3]:
num_val = 5

manifest_path = 'train.json'
val_records = []
with open(manifest_path, "r") as f:
    for i, line in enumerate(f):
        val_records.append( json.loads(line) )
        if len(val_records) >= num_val:
            break
            
for val_record in val_records:
    print ("Real validation audio")
    ipd.display(ipd.Audio(val_record['audio_filepath'], rate=22050))
    print ("SYNTHESIZED")
    spec, audio = infer(spec_model, vocoder, val_record['text'], speaker=1)
#     audio = nr.reduce_noise(y=audio, sr=22050)
    ipd.display(ipd.Audio(audio, rate=22050))
    %matplotlib inline
    #if spec is not None:
    imshow(spec, origin="lower", aspect = "auto")
    plt.show()

In [4]:
!python hifigan_finetune.py \
    model.train_ds.dataloader_params.batch_size=4 \
    model.validation_ds.dataloader_params.batch_size=1 \
    train_dataset='hifigan_train_ft.json' \
    validation_datasets='hifigan_val_ft.json' \
    exp_manager.exp_dir=hifigan_ft \
   +init_from_nemo_model=./tts_hifigan.nemo \
#     +trainer.max_steps=10 \
#     ~trainer.max_epochs \
# init_from_ptl_ckpt