## Tacotron 2 inference code 
Edit the variables **checkpoint_path** and **text** to match yours, then run the entire notebook to generate plots of the mel outputs and alignments and to synthesize audio from the generated mel-spectrogram using WaveGlow.

#### Import libraries and setup matplotlib

In [1]:
#%pip install --upgrade jupyter ipywidgets
#%jupyter nbextension enable --py widgetsnbextension

#%pip install --upgrade matplotlib
#import matplotlib
#matplotlib.use('Agg')
#import matplotlib.pyplot as plt

import matplotlib
%matplotlib inline
import matplotlib.pylab as plt

import IPython.display as ipd

import sys
sys.path.append('waveglow/')
import numpy as np
import torch

from hparams import create_hparams
from model import Tacotron2
from layers import TacotronSTFT, STFT
from audio_processing import griffin_lim
from train import load_model
from text import text_to_sequence
from denoiser import Denoiser

In [2]:
def plot_data(data, figsize=(16, 4)):
    """Display each 2-D array in ``data`` as an image, side by side in one row.

    Args:
        data: Sequence of array-likes, each one passed to ``Axes.imshow``.
            One subplot column is created per element.
        figsize: ``(width, height)`` forwarded to ``plt.subplots``.

    Returns:
        None. The figure is rendered by the active matplotlib backend.
    """
    # Nothing to draw; plt.subplots rejects a zero-column grid.
    if len(data) == 0:
        return

    # squeeze=False keeps `axes` a 2-D array even when there is only one
    # panel; with the default squeeze, a single panel yields a bare Axes
    # object that cannot be indexed.
    fig, axes = plt.subplots(1, len(data), figsize=figsize, squeeze=False)
    for ax, image in zip(axes[0], data):
        ax.imshow(image, aspect='auto', origin='lower',
                  interpolation='none')

#### Setup hparams

In [3]:
# Build the hyper-parameter object using the defaults from the project's
# `hparams` module.
hparams = create_hparams()
# Pin the sample rate explicitly; later cells pass it as the playback rate to
# ipd.Audio and as the rate of the WAV files they write.
hparams.sampling_rate = 22050

The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.


INFO:tensorflow:Final parsed hparams: {'epochs': 500, 'iters_per_checkpoint': 1000, 'seed': 1234, 'dynamic_loss_scaling': True, 'fp16_run': False, 'distributed_run': False, 'dist_backend': 'nccl', 'dist_url': 'tcp://localhost:54321', 'cudnn_enabled': False, 'cudnn_benchmark': False, 'ignore_layers': ['embedding.weight'], 'load_mel_from_disk': False, 'training_files': 'filelists/train_test.txt', 'validation_files': 'filelists/val_test.txt', 'text_cleaners': ['english_cleaners'], 'max_wav_value': 32768.0, 'sampling_rate': 22050, 'filter_length': 1024, 'hop_length': 256, 'win_length': 1024, 'n_mel_channels': 80, 'mel_fmin': 0.0, '

#### Load model from checkpoint

In [4]:
# Path to the pretrained Tacotron 2 checkpoint; edit to match your setup.
checkpoint_path = "models/tacotron2_statedict.pt"
model = load_model(hparams)
# map_location='cpu' remaps any CUDA tensors stored in the checkpoint to the
# CPU while deserializing, so loading also works on machines without a GPU.
# load_state_dict then copies the values into the model's own parameters.
model.load_state_dict(
    torch.load(checkpoint_path, map_location='cpu')['state_dict'])
# Inference mode, full precision (this notebook runs on the CPU).
# For GPU / half-precision inference use `model.cuda().eval().half()` instead.
_ = model.eval().float()

initializing model
initializing encoder
initializing decoder
initializing prenet
initializing attention
initializing location
decoder initialized
initializing postnet
model intialized



#### Load WaveGlow for mel2audio synthesis and denoiser

In [5]:
waveglow_path = 'models/waveglow_256channels_universal_v5.pt'
# NOTE(security): this file stores a pickled model object, and unpickling can
# execute arbitrary code -- only load checkpoints from a trusted source.
# NOTE(review): recent PyTorch releases default torch.load to
# weights_only=True, which rejects pickled modules; pass weights_only=False
# there if the file is trusted -- confirm against the installed version.
# map_location='cpu' keeps the vocoder on the same device as the CPU tensors
# produced by the Tacotron 2 model, and lets the file load without a GPU.
waveglow = torch.load(waveglow_path, map_location='cpu')['model']
# Inference mode, full precision.
# For GPU / half-precision inference use `waveglow.cuda().eval().half()`.
waveglow.eval().float()
# Keep the `convinv` layers in float32 explicitly (presumably already covered
# by the module-wide .float() above; it matters when the rest runs in half).
for k in waveglow.convinv:
    k.float()
denoiser = Denoiser(waveglow)



initializing denoiser
dtype: torch.float32
device: cpu
waveglow infer
spect type: torch.FloatTensor


#### Prepare text input

In [49]:
# Sentence to synthesize; edit to match your needs.
text = "Audio processing is neat!"
# Encode the text as symbol ids and add a leading axis with [None, :].
symbol_ids = np.array(text_to_sequence(text, ['english_cleaners']))[None, :]
# Plain tensors carry autograd state themselves, so the deprecated
# torch.autograd.Variable wrapper is not needed.
sequence = torch.from_numpy(symbol_ids).long()
print(sequence.type())

torch.LongTensor


#### Decode text input and plot results

In [50]:
# Run Tacotron 2 on the encoded text. Gradients are not needed for inference,
# so disable autograd to avoid building a graph for the whole decode.
# The third return value is discarded.
with torch.no_grad():
    mel_outputs, mel_outputs_postnet, _, alignments = model.inference(sequence)
# Plot the first batch item of each output; the alignment is transposed
# before plotting.
plot_data((mel_outputs.float().data.cpu().numpy()[0],
           mel_outputs_postnet.float().data.cpu().numpy()[0],
           alignments.float().data.cpu().numpy()[0].T))

tacotron inference
torch.FloatTensor
decoder inference
decoder get_go_frame
decoder init states
decoder parse outputs


#### Synthesize audio from spectrogram using WaveGlow

In [51]:
# Vocode the post-net mel-spectrogram with WaveGlow; autograd is switched off
# because only the forward pass is needed.
with torch.no_grad():
    audio = waveglow.infer(mel_outputs_postnet, sigma=0.666)

# Take the first item of the batch as a NumPy array and render an inline
# audio player at the configured sample rate.
first_waveform = audio[0].data.cpu().numpy()
ipd.Audio(first_waveform, rate=hparams.sampling_rate)

waveglow infer
spect type: torch.FloatTensor


#### (Optional) Remove WaveGlow bias

In [52]:
# Run the denoiser over the synthesized audio at a low strength, then keep
# index 0 along the second axis of its output.
denoised_batch = denoiser(audio, strength=0.01)
audio_denoised = denoised_batch[:, 0]

# Inline audio player for the denoised result.
denoised_waveform = audio_denoised.cpu().numpy()
ipd.Audio(denoised_waveform, rate=hparams.sampling_rate)

In [53]:
import os
from scipy.io.wavfile import write

# Output locations: an `examples` folder under the current working directory.
cwd = os.getcwd()
out_directory = os.path.join(cwd, 'examples')
audio_file = os.path.join(out_directory, 'audio.wav')
denoised_file = os.path.join(out_directory, 'audio_denoised.wav')

# Create the folder on first use and set its permissions to 0o775.
directory_missing = not os.path.isdir(out_directory)
if directory_missing:
    os.makedirs(out_directory)
    os.chmod(out_directory, 0o775)

# Write the raw WaveGlow output, then the denoised version (first batch item
# of each) as WAV files at the configured sample rate.
raw_waveform = audio[0].data.cpu().numpy()
write(audio_file, hparams.sampling_rate, raw_waveform)
denoised_waveform = audio_denoised[0].cpu().numpy()
write(denoised_file, hparams.sampling_rate, denoised_waveform)