## Tacotron 2 inference code 
Edit the variables **checkpoint_path**, **waveglow_path** and **text** to match yours and run the entire notebook to generate plots of the mel outputs and alignments, and to synthesize audio from the generated mel-spectrogram using WaveGlow.

#### Import libraries and set up matplotlib

In [18]:
import matplotlib
%matplotlib inline
import matplotlib.pylab as plt

import IPython.display as ipd

import sys
sys.path.append('waveglow/')
import numpy as np
import torch

from hparams import create_hparams
from model import Tacotron2
from layers import TacotronSTFT, STFT
from audio_processing import griffin_lim
from train import load_model
from text import text_to_sequence
from denoiser import Denoiser

In [19]:
def plot_data(data, figsize=(16, 4)):
    """Show every 2-D array in ``data`` as an image, side by side in one row.

    Args:
        data: Sequence of 2-D arrays; one subplot is created per element.
        figsize: Size of the whole figure, forwarded to ``plt.subplots``.
    """
    # squeeze=False keeps `axes` a 2-D array even for a single image, so
    # indexing works the same way whatever len(data) is.
    fig, axes = plt.subplots(1, len(data), figsize=figsize, squeeze=False)
    for ax, image in zip(axes[0], data):
        # 'lower' puts row 0 at the bottom of the plot. The previous value
        # 'bottom' is not an accepted `origin` and is rejected by current
        # matplotlib releases.
        ax.imshow(image, aspect='auto', origin='lower',
                  interpolation='none')

#### Setup hparams

In [20]:
# Build the default hyper-parameter set, then pin the sampling rate; the same
# value is later passed as the playback rate of the synthesized audio.
hparams = create_hparams()
hparams.sampling_rate = 22050

#### Load model from checkpoint

In [26]:
checkpoint_path = "tacotron2_statedict.pt"

# Instantiate the network, then fill it with the weights stored under the
# 'state_dict' key of the checkpoint. map_location='cpu' loads every tensor
# onto the CPU.
# NOTE(review): torch.load unpickles the file - only open checkpoints you trust.
model = load_model(hparams)
tacotron_weights = torch.load(checkpoint_path, map_location='cpu')['state_dict']
model.load_state_dict(tacotron_weights)
del tacotron_weights  # do not keep a second reference to the weights in the kernel

# Move to CPU and switch to eval mode; assigning to `_` suppresses the cell output.
_ = model.cpu().eval()

#### Load WaveGlow for mel2audio synthesis and denoiser

In [27]:
waveglow_path = 'waveglow_256channels.pt'

# The checkpoint stores the whole vocoder object under the 'model' key.
# NOTE(review): torch.load unpickles the file - only open checkpoints you trust.
waveglow = torch.load(waveglow_path, map_location='cpu')['model']
waveglow.cpu().eval()

# Cast each entry of `convinv` to float32.
for inv_conv in waveglow.convinv:
    inv_conv.float()

# Give every module whose type name contains 'Conv' a `padding_mode` attribute.
# Presumably this works around checkpoints saved before that attribute
# existed - TODO confirm.
for module in waveglow.modules():
    if 'Conv' in str(type(module)):
        module.padding_mode = 'zeros'

# Left disabled as in the original; the optional denoising cell at the end of
# the notebook needs this object to exist.
#denoiser = Denoiser(waveglow)

#### Prepare text input

In [30]:
import time

# Time the whole text -> mel-spectrogram -> waveform pipeline.
start = time.time()

text = "Hi Paul! Waveglow is really awesome!"
# Encode the text with the 'english_cleaners' pipeline; [None, :] adds a
# leading batch axis of size 1.
sequence = np.array(text_to_sequence(text, ['english_cleaners']))[None, :]
# torch.autograd.Variable is a deprecated no-op wrapper, so the tensor is used directly.
sequence = torch.from_numpy(sequence).cpu().long()

# Run both networks without gradient tracking: this is pure inference, and
# recording an autograd graph for Tacotron 2 only costs memory and time.
with torch.no_grad():
    mel_outputs, mel_outputs_postnet, _, alignments = model.inference(sequence)
    audio = waveglow.infer(mel_outputs_postnet, sigma=0.666)

end = time.time()
print("Total calculation time %.2lf" % (end - start))

Total calculation time 34.61


In [31]:
# Play the first (only) waveform of the batch at the configured sampling rate.
ipd.Audio(data=audio[0].data.cpu().numpy(), rate=hparams.sampling_rate)

In [33]:
# Clip length in seconds: number of samples divided by samples per second.
audio[0].shape[0] / hparams.sampling_rate

3.030204081632653

In [34]:
def generate_texts(texts):
    """Synthesize every text in order and return the audio joined end to end.

    For each text the function prints its first ten characters, runs the
    module-level Tacotron 2 ``model`` and ``waveglow`` vocoder, and prints the
    elapsed time, the clip length in seconds and the ratio of the two.

    Args:
        texts: Iterable of strings to synthesize.

    Returns:
        A NumPy array holding all generated waveforms concatenated in input
        order, or ``None`` when ``texts`` is empty.
    """
    chunks = []

    for text in texts:
        start = time.time()
        print("Calculating %s" % (text[:10]))
        # [None, :] adds a leading batch axis of size 1.
        sequence = np.array(text_to_sequence(text, ['english_cleaners']))[None, :]
        # torch.autograd.Variable is a deprecated no-op wrapper; use the tensor directly.
        sequence = torch.from_numpy(sequence).cpu().long()

        # Pure inference: skip autograd bookkeeping for both networks.
        with torch.no_grad():
            mel_outputs, mel_outputs_postnet, _, alignments = model.inference(sequence)
            audio = waveglow.infer(mel_outputs_postnet, sigma=0.666)
        end = time.time()

        samples = audio[0].data.cpu().numpy()
        chunks.append(samples)

        elapsed = end - start
        duration = len(samples) / hparams.sampling_rate
        # Guard against an empty clip so the statistics line cannot raise.
        real_time_factor = elapsed / duration if duration > 0 else float('inf')
        print("Total calculation time %.2lf, Length %.2lf, Real-Time: %.2lf" % (elapsed, duration, real_time_factor))

    if not chunks:
        return None
    # Join once at the end; calling np.append inside the loop re-copies the
    # whole accumulated signal on every iteration.
    return np.concatenate(chunks)

In [35]:
# Sentences to synthesize back to back; the result is one continuous clip.
texts = [
    "May I have your attention please?",
    "May I have your attention please?",
    "Will the real Slim Shady please stand up?",
    "I repeat, will the real Slim Shady please stand up?",
    "We're gonna have a problem here.",
    "Cut my life into pieces.",
    "This is my last resort.",
    "Suffocation.",
    "No breathing.",
    "Don't give a fuck if I cut my arm bleeding.",
    "This is my last resort.",
]
audios = generate_texts(texts)
ipd.Audio(data=audios, rate=hparams.sampling_rate)

Calculating May I have
Total calculation time 23.43, Length 2.00, Real-Time: 11.73
Calculating May I have
Total calculation time 24.81, Length 2.21, Real-Time: 11.25
Calculating Will the r
Total calculation time 28.85, Length 2.50, Real-Time: 11.56
Calculating I repeat, 
Total calculation time 42.21, Length 3.69, Real-Time: 11.43
Calculating We're gonn
Total calculation time 21.97, Length 1.93, Real-Time: 11.40
Calculating Cut my lif
Total calculation time 21.85, Length 1.95, Real-Time: 11.20
Calculating This is my
Total calculation time 24.28, Length 1.64, Real-Time: 14.83
Calculating Suffocatio
Total calculation time 13.35, Length 0.94, Real-Time: 14.19
Calculating No breathi
Total calculation time 12.63, Length 0.88, Real-Time: 14.32
Calculating Don't give
Total calculation time 29.73, Length 2.53, Real-Time: 11.75
Calculating This is my
Total calculation time 23.68, Length 1.72, Real-Time: 13.78


In [38]:
# Limit PyTorch to two CPU threads before the batch below is synthesized again.
torch.set_num_threads(2)

In [39]:
# Same sentences as the earlier batch, synthesized again after the thread
# limit above so the printed timings can be compared.
texts = [
    "May I have your attention please?",
    "May I have your attention please?",
    "Will the real Slim Shady please stand up?",
    "I repeat, will the real Slim Shady please stand up?",
    "We're gonna have a problem here.",
    "Cut my life into pieces.",
    "This is my last resort.",
    "Suffocation.",
    "No breathing.",
    "Don't give a fuck if I cut my arm bleeding.",
    "This is my last resort.",
]
audios = generate_texts(texts)
ipd.Audio(data=audios, rate=hparams.sampling_rate)

Calculating May I have
Total calculation time 22.64, Length 1.96, Real-Time: 11.54
Calculating May I have


KeyboardInterrupt: 

In [13]:
text = "Waveglow is really awesome!"
# Encode the text with the 'english_cleaners' pipeline; [None, :] adds a
# leading batch axis of size 1.
sequence = np.array(text_to_sequence(text, ['english_cleaners']))[None, :]
# torch.autograd.Variable is a deprecated no-op wrapper, so the tensor is used directly.
sequence = torch.from_numpy(sequence).cpu().long()

#### Decode text input and plot results

In [14]:
# Inference only: disable gradient tracking so no autograd graph is recorded
# for the decoder outputs.
with torch.no_grad():
    mel_outputs, mel_outputs_postnet, _, alignments = model.inference(sequence)

# Plot the first batch item of the raw mel output, the post-net mel output and
# the transposed alignment matrix, left to right.
plot_data((mel_outputs.float().data.cpu().numpy()[0],
           mel_outputs_postnet.float().data.cpu().numpy()[0],
           alignments.float().data.cpu().numpy()[0].T))



#### Synthesize audio from spectrogram using WaveGlow

In [15]:
# Turn the post-net mel-spectrogram into a waveform, without gradient tracking.
with torch.no_grad():
    audio = waveglow.infer(mel_outputs_postnet, sigma=0.666)

# Play the first waveform of the batch at the configured sampling rate.
ipd.Audio(data=audio[0].data.cpu().numpy(), rate=hparams.sampling_rate)

#### (Optional) Remove WaveGlow bias

In [9]:
# The `Denoiser(waveglow)` construction in the WaveGlow cell is commented out,
# so `denoiser` may not exist. Look it up defensively to keep this optional
# step from ending a Restart & Run All with a NameError.
_denoiser = globals().get('denoiser')
if _denoiser is None:
    print("Denoiser is not initialised; run `denoiser = Denoiser(waveglow)` first to enable this optional step.")
else:
    # Apply the denoiser, then keep index 0 along the second axis of its output.
    audio_denoised = _denoiser(audio, strength=0.01)[:, 0]
    # Inside a branch the value is no longer the cell's last expression, so
    # display the player explicitly.
    ipd.display(ipd.Audio(audio_denoised.cpu().numpy(), rate=hparams.sampling_rate))