# GlowTTS - HiFi-GAN end2end inference

Clone the GlowTTS and HiFi-GAN repositories, install the required packages, and build the `monotonic_align` extension in place.

In [1]:
!git clone https://github.com/jaywalnut310/glow-tts.git
!git clone https://github.com/jik876/hifi-gan.git

!pip install Unidecode==1.0.22

!cd glow-tts/monotonic_align/; python setup.py build_ext --inplace

Cloning into 'glow-tts'...
remote: Enumerating objects: 110, done.[K
remote: Counting objects: 100% (25/25), done.[K
remote: Compressing objects: 100% (19/19), done.[K
remote: Total 110 (delta 7), reused 16 (delta 6), pack-reused 85[K
Receiving objects: 100% (110/110), 2.22 MiB | 10.74 MiB/s, done.
Resolving deltas: 100% (34/34), done.
Collecting Unidecode==1.0.22
[?25l  Downloading https://files.pythonhosted.org/packages/59/ef/67085e30e8bbcdd76e2f0a4ad8151c13a2c5bce77c85f8cad6e1f16fb141/Unidecode-1.0.22-py2.py3-none-any.whl (235kB)
[K     |████████████████████████████████| 235kB 8.0MB/s 
[?25hInstalling collected packages: Unidecode
Successfully installed Unidecode-1.0.22


Import GlowTTS source code

In [4]:
import sys
# Make the cloned glow-tts checkout importable; the relative path means the
# notebook must run from the directory that contains glow-tts/.
sys.path.append('glow-tts/')

# Render matplotlib figures inline in the notebook.
%matplotlib inline
import matplotlib.pyplot as plt
import IPython.display as ipd

import librosa
import numpy as np
import os
import glob
import json

import torch
# The modules below are presumably resolved from the glow-tts checkout via the
# sys.path entry added above — confirm if another `text`/`utils` package is installed.
from text import text_to_sequence, cmudict
from text.symbols import symbols
import commons
import attentions
import modules
import utils
import models

Download pretrained models

In [18]:
# pretrained GlowTTS
!gdown --id 1RxR6JWg6WVBZYb-pIw58hi1XLNb5aHEi -O GlowTTS.pth
# pretrained HiFi-GAN
!gdown --id 1QEBKespXTmsMzsSRBXWdpIT0Ve7nnaRZ -O HiFiGAN.pth
# config for pretrained HiFi-GAN
!gdown --id 1l5EUVBKM0SK7ec4HWf_wZvEITAsdOLFC

Downloading...
From: https://drive.google.com/uc?id=1RxR6JWg6WVBZYb-pIw58hi1XLNb5aHEi
To: /content/GlowTTS.pth
115MB [00:00, 274MB/s]


## Run inference
- step 1: raw text to phonemes
- step 2: phonemes to mel-spectrogram using GlowTTS FlowGenerator
- step 3: mel-spectrogram to audio using HiFi-GAN

In [36]:
# Input sentence to synthesize; the phoneme-preparation cell reads it from `text`.

text = "Deep audio distances project is awesome !"

In [19]:
# Build the GlowTTS FlowGenerator and restore its pretrained weights

checkpoint_path = "./GlowTTS.pth"
hps = utils.get_hparams_from_file("./glow-tts/configs/base_blank.json")

# The symbol count grows by one when the config sets add_blank
# (the boolean is added as 0 or 1).
n_symbols = len(symbols) + getattr(hps.data, "add_blank", False)
model = models.FlowGenerator(
    n_symbols,
    out_channels=hps.data.n_mel_channels,
    **hps.model)
model = model.to("cuda")

# Weights must be loaded before the decoder inverse is stored.
utils.load_checkpoint(checkpoint_path, model)
model.decoder.store_inverse()  # do not calculate jacobians, for fast decoding
_ = model.eval()  # assign to `_` so the module repr is not echoed

# Pronunciation dictionary passed to text_to_sequence in the phoneme cell.
hps.data.cmudict_path = './glow-tts/data/cmu_dictionary'
cmu_dict = cmudict.CMUDict(hps.data.cmudict_path)

INFO:root:Loaded checkpoint './GlowTTS.pth' (iteration 0)


In [37]:
# prepare phonemes: convert `text` to a batch of symbol ids on the GPU

if getattr(hps.data, "add_blank", False):
    text_norm = text_to_sequence(text.strip(), ['english_cleaners'], cmu_dict)
    # Interleave the blank id (len(symbols)) with the symbol ids; it is printed as <BNK> below.
    text_norm = commons.intersperse(text_norm, len(symbols))
else:
    # If not using "add_blank" option during training, adding spaces at the beginning
    # and the end of utterance improves quality. The padded string is passed on
    # unstripped so the added spaces are kept, and `text` itself is left unmodified
    # so the cell stays re-runnable.
    padded_text = " " + text.strip() + " "
    text_norm = text_to_sequence(padded_text, ['english_cleaners'], cmu_dict)

# Add a leading batch dimension of size 1.
sequence = np.array(text_norm)[None, :]
print("".join([symbols[c] if c < len(symbols) else "<BNK>" for c in sequence[0]]))

# Plain tensors suffice here; the torch.autograd.Variable wrapper is deprecated.
x_tst = torch.from_numpy(sequence).cuda().long()
x_tst_lengths = torch.tensor([x_tst.shape[1]]).cuda()

<BNK>@D<BNK>@IY1<BNK>@P<BNK> <BNK>@AO2<BNK>@D<BNK>@IY0<BNK>@OW0<BNK> <BNK>@D<BNK>@IH1<BNK>@S<BNK>@T<BNK>@AH0<BNK>@N<BNK>@S<BNK>@AH0<BNK>@Z<BNK> <BNK>@P<BNK>@R<BNK>@AA1<BNK>@JH<BNK>@EH0<BNK>@K<BNK>@T<BNK> <BNK>@IH1<BNK>@Z<BNK> <BNK>@AO1<BNK>@S<BNK>@AH0<BNK>@M<BNK>


In [41]:
# prepare and save mel-frames

# Generation settings passed to the model.
# NOTE(review): presumably noise_scale is the sampling temperature and length_scale
# the speaking-rate factor — confirm against the glow-tts model code.
noise_scale = .667
length_scale = 1.0

# Inference only: no gradients are needed.
with torch.no_grad():
    # Only the first item of the first and last returned groups is used.
    (y_gen_tst, *_), *_, (attn_gen, *_) = model(
        x_tst, x_tst_lengths, gen=True,
        noise_scale=noise_scale, length_scale=length_scale)

# exist_ok=True creates the directory if needed without a separate existence check.
os.makedirs('./hifi-gan/test_mel_files', exist_ok=True)
np.save("./hifi-gan/test_mel_files/sample.npy", y_gen_tst.cpu().detach().numpy())

In [42]:
# Vocode the saved mel-spectrogram with HiFi-GAN's end-to-end inference script.
# The checkpoint is the HiFi-GAN model fine-tuned with Tacotron 2 that is provided
# in the HiFi-GAN repo. Reads the .npy files in --input_mels_dir and writes the
# generated audio into --output_dir.

!python ./hifi-gan/inference_e2e.py --input_mels_dir ./hifi-gan/test_mel_files \
                                    --output_dir ./hifi-gan/generated_files_from_mel \
                                    --checkpoint_file HiFiGAN.pth

Initializing Inference Process..
Loading 'HiFiGAN.pth'
Complete.
Removing weight norm...
./hifi-gan/generated_files_from_mel/sample_generated_e2e.wav


In [44]:
# listen! Embed an audio player for the wav file written by the HiFi-GAN inference step.

ipd.Audio("./hifi-gan/generated_files_from_mel/sample_generated_e2e.wav")