## Tacotron 2 inference code 
Edit the variables **checkpoint_path**, **waveglow_path** and **text** to match yours, then run the entire notebook to plot the mel outputs and alignments and to synthesize audio from the generated mel-spectrogram using WaveGlow.

#### Import libraries and setup matplotlib

In [1]:
import matplotlib
%matplotlib inline
import sys

import IPython.display as ipd
# matplotlib.rcParams.update(matplotlib.rcParamsDefault)
# matplotlib.use('GTK3')
import matplotlib.pylab as plt
sys.path.append('waveglow/')
import numpy as np
import torch

from hparams import HParams
from train import load_model
from text import text_to_sequence
# from denoiser import Denoiser


In [2]:
def plot_data(data, figsize=(16, 4)):
    """Draw every image in ``data`` side by side in a single row.

    Args:
        data: Sequence of array-like images accepted by ``Axes.imshow``;
            one subplot column is created per element.
        figsize: ``(width, height)`` of the whole figure, forwarded to
            ``plt.subplots``.

    Returns:
        None. The figure is left as the current matplotlib figure.
    """
    # squeeze=False keeps ``axes`` two-dimensional, so a call with a single
    # image still yields an indexable row instead of a bare Axes object.
    _, axes = plt.subplots(1, len(data), figsize=figsize, squeeze=False)
    for ax, image in zip(axes[0], data):
        # 'lower' places row 0 at the bottom of the panel. Matplotlib only
        # documents 'upper' and 'lower' for ``origin``; the previous value
        # 'bottom' is not one of them.
        ax.imshow(image, aspect='auto', origin='lower',
                  interpolation='none')
    

#### Setup hparams

In [3]:
# Start from the project's default hyper-parameters, then set the sampling
# rate explicitly; this value is later handed to ipd.Audio as the playback
# rate (presumably in Hz — confirm against the training configuration).
hparams = HParams()
hparams.sampling_rate = 22050

#### Load model from checkpoint

In [4]:
# Path of the checkpoint to restore; edit to point at your own file.
checkpoint_path = "models/iter=115000_val-loss=0.337669.ckpt"
# load_model returns a pair; only the first element (the model) is used here.
model, _ = load_model(hparams)
# Deserialize the checkpoint onto the CPU first: without map_location the
# tensors are restored to the device they were saved from, which fails when
# that device is missing here and otherwise keeps a second copy of the
# weights on the GPU. load_state_dict copies the values into the model's own
# parameters, so the result is the same.
model.load_state_dict(
    torch.load(checkpoint_path, map_location='cpu')['state_dict'])
# Move to the GPU and switch to eval mode; assigning to ``_`` keeps the
# notebook from echoing the module repr.
_ = model.cuda().eval()

#### Load WaveGlow for mel2audio synthesis (the optional denoiser is currently commented out)

In [5]:
# Path of the pickled WaveGlow vocoder; edit to point at your own file.
waveglow_path = 'waveglow_256channels_universal_v5.pt'
# Deserialize onto the CPU so loading does not depend on the GPU layout of
# the machine that saved the file; the model is moved to the GPU just below.
waveglow = torch.load(waveglow_path, map_location='cpu')['model']
# GPU, eval mode, half precision.
waveglow.cuda().eval().half()
# Put the ``convinv`` sub-modules back in float32 after the global half()
# (presumably for numerical stability of those layers — confirm against the
# WaveGlow implementation).
for k in waveglow.convinv:
    k.float()
# denoiser = Denoiser(waveglow)



#### Prepare text input

In [6]:
# Sentence to synthesize; edit freely.
text = "Waveglow is really awesome!"
# Encode the text with the 'english_cleaners' pipeline and add a leading
# axis of size 1 via [None, :] so the model receives a single-item batch.
sequence = np.array(text_to_sequence(text, ['english_cleaners']))[None, :]
# Plain tensors carry autograd state themselves, so the deprecated
# torch.autograd.Variable wrapper is not needed.
sequence = torch.from_numpy(sequence).cuda().long()

#### Decode text input and plot results

In [7]:
# Decode under no_grad: this is pure inference, so there is no reason to
# record an autograd graph that would stay alive as long as these outputs do.
with torch.no_grad():
    mel_outputs, mel_outputs_postnet, _, alignments = model.inference(sequence)
# Plot, left to right: raw mel output, post-net mel output, and the
# transposed alignments, each taken from the first (only) batch item.
plot_data((mel_outputs.float().data.cpu().numpy()[0],
           mel_outputs_postnet.float().data.cpu().numpy()[0],
           alignments.float().data.cpu().numpy()[0].T))


torch.Size([1, 27, 88])


In [8]:
# Repeat the decode-and-plot step on the same ``sequence``. Run under
# no_grad: this is pure inference, so no autograd graph needs to be kept.
with torch.no_grad():
    mel_outputs, mel_outputs_postnet, _, alignments = model.inference(sequence)
# Plot, left to right: raw mel output, post-net mel output, and the
# transposed alignments, each taken from the first (only) batch item.
plot_data((mel_outputs.float().data.cpu().numpy()[0],
           mel_outputs_postnet.float().data.cpu().numpy()[0],
           alignments.float().data.cpu().numpy()[0].T))

torch.Size([1, 27, 88])


#### Synthesize audio from spectrogram using WaveGlow

In [9]:
# Vocode the post-net mel-spectrogram. The mel is cast to half precision to
# match the half-precision WaveGlow model; sigma is forwarded unchanged to
# waveglow.infer.
with torch.no_grad():
    audio = waveglow.infer(mel_outputs_postnet.half(), sigma=0.666)
# Last expression of the cell, so the notebook renders an audio player for
# the first item of the batch.
ipd.Audio(
    data=audio[0].detach().cpu().numpy(),
    rate=hparams.sampling_rate,
)

In [10]:
def generate(style):
    """Synthesize the notebook's ``sequence`` conditioned on ``style``.

    Runs ``model.inference(sequence, style)``, plots the post-net mel output
    next to the transposed alignments, and vocodes the post-net mel with
    WaveGlow.

    Args:
        style: Conditioning tensor passed through unchanged to
            ``model.inference``. The cells in this notebook pass a CUDA
            tensor built with shape ``(1, sequence.size(1), 88)``.

    Returns:
        The value returned by ``waveglow.infer`` for the half-precision
        post-net mel; callers index ``[0]`` to get the waveform to play.
    """
    # Both networks are used for inference only, so the whole pipeline runs
    # under no_grad to avoid recording an autograd graph for the decoder.
    with torch.no_grad():
        _, mel_outputs_postnet, _, alignments = model.inference(
            sequence, style)
        plot_data((mel_outputs_postnet.float().data.cpu().numpy()[0],
                   alignments.float().data.cpu().numpy()[0].T))
        audio = waveglow.infer(mel_outputs_postnet.half(), sigma=0.666)
    return audio

In [11]:
# All-zero style tensor with one 88-wide vector per input position
# (88 is presumably the model's style dimension — confirm against the model).
style = torch.zeros(1, sequence.size(1), 88)
# Only the first position gets a non-zero style: every component set to 0.6.
style[0, 0, :] = 0.6
audio = generate(style.cuda())
# Last expression of the cell: render an audio player for the result.
ipd.Audio(data=audio[0].detach().cpu().numpy(), rate=hparams.sampling_rate)


In [12]:
# Reuse the existing ``style`` tensor, lowering the first position's style
# vector to 0.2 in every component; all other positions are left untouched.
style[0, 0, :] = 0.2
audio = generate(style.cuda())
# Last expression of the cell: render an audio player for the result.
ipd.Audio(data=audio[0].detach().cpu().numpy(), rate=hparams.sampling_rate)



In [13]:
# Fresh style tensor drawn uniformly from [0, 1): one 88-wide vector per
# input position.
style = torch.rand((1, sequence.shape[1], 88))
audio = generate(style.cuda())
# Last expression of the cell: render an audio player for the result.
ipd.Audio(data=audio[0].detach().cpu().numpy(), rate=hparams.sampling_rate)

In [14]:
# Overwrite the style vectors of the first ten positions with all ones,
# keeping whatever the existing ``style`` tensor holds for the rest.
style[0, :10, :] = 1.0
audio = generate(style.cuda())
# Last expression of the cell: render an audio player for the result.
ipd.Audio(data=audio[0].detach().cpu().numpy(), rate=hparams.sampling_rate)

In [15]:
# Set the style vectors of the first ten positions to 0.3 in every
# component, keeping the rest of the existing ``style`` tensor as is.
style[0, :10, :] = 0.3
audio = generate(style.cuda())
# Last expression of the cell: render an audio player for the result.
ipd.Audio(data=audio[0].detach().cpu().numpy(), rate=hparams.sampling_rate)

In [16]:
# Set the first ten positions back to an all-ones style vector; the
# remaining positions of the existing ``style`` tensor are unchanged.
style[0, :10, :] = 1.0
audio = generate(style.cuda())
# Last expression of the cell: render an audio player for the result.
ipd.Audio(data=audio[0].detach().cpu().numpy(), rate=hparams.sampling_rate)

In [17]:
# Lower the first ten positions to 0.3 in every component once more; the
# remaining positions of the existing ``style`` tensor are unchanged.
style[0, :10, :] = 0.3
audio = generate(style.cuda())
# Last expression of the cell: render an audio player for the result.
ipd.Audio(data=audio[0].detach().cpu().numpy(), rate=hparams.sampling_rate)