## Tacotron 2 inference code 
Edit the variables **checkpoint_path** and **text** to match yours and run the entire code to generate plots of mel outputs, alignments and audio synthesis from the generated mel-spectrogram using Griffin-Lim.

#### Import libraries and setup matplotlib

In [1]:
import matplotlib
%matplotlib inline
import matplotlib.pylab as plt

import IPython.display as ipd

import sys
sys.path.append('waveglow/')
import numpy as np
import torch

from hparams import create_hparams
from model import Tacotron2
from layers import TacotronSTFT, STFT
from audio_processing import griffin_lim
from train import load_model
from text import text_to_sequence
# from denoiser import Denoiser
import glow

In [2]:
def plot_data(data, figsize=(16, 4)):
    fig, axes = plt.subplots(1, len(data), figsize=figsize)
    for i in range(len(data)):
        axes[i].imshow(data[i], aspect='auto', origin='lower', 
                       interpolation='none')

#### Setup hparams

In [3]:
hparams = create_hparams()
hparams.sampling_rate = 22050

#### Load model from checkpoint

In [4]:
checkpoint_path = "checkpoint_97000"
model = load_model(hparams)
model.load_state_dict(torch.load(checkpoint_path)['state_dict'])
_ = model.cuda().eval()

#### Load WaveGlow for mel2audio synthesis and denoiser

In [5]:
waveglow_path = 'waveglow_256channels_universal_v5.pt'
waveglow = torch.load(waveglow_path)['model']
waveglow.cuda().eval()
for k in waveglow.convinv:
    k.float()



#### Prepare text input

In [6]:
text = "बारिश होते ही चाची को ज़ोरज़ोर से गाना अच्छा लगता है"
# text = "नमस्ते, मैं आस्था हूँ"
sequence = np.array(text_to_sequence(text, ['transliteration_cleaners']))[None, :]
sequence = torch.autograd.Variable(
    torch.from_numpy(sequence)).cuda().long()

#### Decode text input and plot results

In [13]:
mel_outputs, mel_outputs_postnet, _, alignments = model.inference(sequence)
plot_data((mel_outputs.float().data.cpu().numpy()[0],
           mel_outputs_postnet.float().data.cpu().numpy()[0],
           alignments.float().data.cpu().numpy()[0].T))

#### Synthesize audio from spectrogram using WaveGlow

In [14]:
with torch.no_grad():
    audio = waveglow.infer(mel_outputs_postnet, sigma=0.666)
ipd.Audio(audio[0].data.cpu().numpy(), rate=22050)

In [15]:
import nemo
import nemo.collections.asr as nemo_asr
import nemo.collections.nlp as nemo_nlp
import nemo.collections.tts as nemo_tts

from nemo.core.classes import IterableDataset
from nemo.core.neural_types import NeuralType, AudioSignal, LengthsType

In [16]:
vocoder_melgan = nemo_tts.models.MelGanModel.from_pretrained(model_name="tts_melgan").cuda()
vocoder_melgan.eval()

vocoder_hifigan = nemo_tts.models.HifiGanModel.from_pretrained(model_name="tts_hifigan").cuda()
vocoder_hifigan.eval()

[NeMo I 2022-01-05 20:32:31 cloud:56] Found existing object /home/kn1ght/.cache/torch/NeMo/NeMo_1.6.0rc0/tts_melgan/38f156f172595e60f02169891e303590/tts_melgan.nemo.
[NeMo I 2022-01-05 20:32:31 cloud:62] Re-using file from: /home/kn1ght/.cache/torch/NeMo/NeMo_1.6.0rc0/tts_melgan/38f156f172595e60f02169891e303590/tts_melgan.nemo
[NeMo I 2022-01-05 20:32:31 common:729] Instantiating model from pre-trained checkpoint


[NeMo W 2022-01-05 20:32:31 modelPT:130] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.
    Train config : 
    dataset:
      _target_: nemo.collections.tts.data.datalayers.AudioDataset
      manifest_filepath: /raid/LJSpeech/nvidia_ljspeech_train.json
      max_duration: null
      min_duration: 0.75
      n_segments: 16384
      trim: false
    dataloader_params:
      drop_last: false
      shuffle: true
      batch_size: 64
      num_workers: 4
    
[NeMo W 2022-01-05 20:32:31 modelPT:137] If you intend to do validation, please call the ModelPT.setup_validation_data() or ModelPT.setup_multiple_validation_data() method and provide a valid configuration file to setup the validation data loader(s). 
    Validation config : 
    dataset:
      _target_: nemo.collections.tts.data.datalayers.AudioDataset
      manifest_filepath: /raid/LJSpeech/nvidia_ljspeech_val.jso

[NeMo I 2022-01-05 20:32:31 features:264] PADDING: 0
[NeMo I 2022-01-05 20:32:31 features:281] STFT using torch
[NeMo I 2022-01-05 20:32:31 features:283] STFT using exact pad
[NeMo I 2022-01-05 20:32:32 save_restore_connector:149] Model MelGanModel was successfully restored from /home/kn1ght/.cache/torch/NeMo/NeMo_1.6.0rc0/tts_melgan/38f156f172595e60f02169891e303590/tts_melgan.nemo.
[NeMo I 2022-01-05 20:32:32 cloud:56] Found existing object /home/kn1ght/.cache/torch/NeMo/NeMo_1.6.0rc0/tts_hifigan/e6da322f0f7e7dcf3f1900a9229a7e69/tts_hifigan.nemo.
[NeMo I 2022-01-05 20:32:32 cloud:62] Re-using file from: /home/kn1ght/.cache/torch/NeMo/NeMo_1.6.0rc0/tts_hifigan/e6da322f0f7e7dcf3f1900a9229a7e69/tts_hifigan.nemo
[NeMo I 2022-01-05 20:32:32 common:729] Instantiating model from pre-trained checkpoint


[NeMo W 2022-01-05 20:32:33 modelPT:130] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.
    Train config : 
    dataset:
      _target_: nemo.collections.tts.data.datalayers.MelAudioDataset
      manifest_filepath: /home/fkreuk/data/train_finetune.txt
      min_duration: 0.75
      n_segments: 8192
    dataloader_params:
      drop_last: false
      shuffle: true
      batch_size: 64
      num_workers: 4
    
[NeMo W 2022-01-05 20:32:33 modelPT:137] If you intend to do validation, please call the ModelPT.setup_validation_data() or ModelPT.setup_multiple_validation_data() method and provide a valid configuration file to setup the validation data loader(s). 
    Validation config : 
    dataset:
      _target_: nemo.collections.tts.data.datalayers.MelAudioDataset
      manifest_filepath: /home/fkreuk/data/val_finetune.txt
      min_duration: 3
      n_segments: 66150


[NeMo I 2022-01-05 20:32:33 features:264] PADDING: 0
[NeMo I 2022-01-05 20:32:33 features:281] STFT using torch


[NeMo W 2022-01-05 20:32:33 features:241] Using torch_stft is deprecated and will be removed in 1.1.0. Please set stft_conv and stft_exact_pad to False for FilterbankFeatures and AudioToMelSpectrogramPreprocessor. Please set exact_pad to True as needed.


[NeMo I 2022-01-05 20:32:33 features:264] PADDING: 0
[NeMo I 2022-01-05 20:32:33 features:281] STFT using torch
[NeMo I 2022-01-05 20:32:34 save_restore_connector:149] Model HifiGanModel was successfully restored from /home/kn1ght/.cache/torch/NeMo/NeMo_1.6.0rc0/tts_hifigan/e6da322f0f7e7dcf3f1900a9229a7e69/tts_hifigan.nemo.


HifiGanModel(
  (audio_to_melspec_precessor): FilterbankFeatures()
  (trg_melspec_fn): FilterbankFeatures()
  (generator): Generator(
    (conv_pre): Conv1d(80, 512, kernel_size=(7,), stride=(1,), padding=(3,))
    (ups): ModuleList(
      (0): ConvTranspose1d(512, 256, kernel_size=(16,), stride=(8,), padding=(4,))
      (1): ConvTranspose1d(256, 128, kernel_size=(16,), stride=(8,), padding=(4,))
      (2): ConvTranspose1d(128, 64, kernel_size=(4,), stride=(2,), padding=(1,))
      (3): ConvTranspose1d(64, 32, kernel_size=(4,), stride=(2,), padding=(1,))
    )
    (resblocks): ModuleList(
      (0): ModuleList(
        (0): ResBlock1(
          (convs1): ModuleList(
            (0): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,))
            (1): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(3,), dilation=(3,))
            (2): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(5,), dilation=(5,))
          )
          (convs2): ModuleList(
            (0

In [17]:
audio = vocoder_melgan.convert_spectrogram_to_audio(spec=mel_outputs_postnet)
audio = audio.to('cpu').detach().numpy()

ipd.Audio(audio[0], rate=22050)

In [18]:
audio = vocoder_hifigan.convert_spectrogram_to_audio(spec=mel_outputs_postnet)
audio = audio.to('cpu').detach().numpy()

ipd.Audio(audio[0], rate=22050)