# Set up

In [None]:
!pip install cython==0.29.12
!pip install pypinyin

!git clone https://github.com/Francis-Komizu/Glow-TTS
%cd Glow-TTS
!git clone https://github.com/jik876/hifi-gan

%cd monotonic_align
!python setup.py build_ext --inplace
%cd ..

In [5]:
%matplotlib inline
import matplotlib.pyplot as plt
import IPython.display as ipd

import librosa
import numpy as np
import os
import glob
import json

import torch
from text import text_to_sequence
from text.symbols import symbols
import commons
import attentions
import modules
import models
import utils

# Load models

## Flow generator

In [None]:
# Load hyper-parameters from the repo's base_blank.json config.
hps = utils.get_hparams_from_file("/content/Glow-TTS/configs/base_blank.json")
# NOTE(review): this path presumably requires Google Drive to be mounted at
# /content/drive before the cell runs — confirm.
checkpoint_path = "/content/drive/MyDrive/Starglow/G_paimon.pth"

# The symbol count is increased by hps.data.add_blank when that option is set
# (treated as False when the attribute is absent).
model = models.FlowGenerator(
    len(symbols) + getattr(hps.data, "add_blank", False),
    out_channels=hps.data.n_mel_channels,
    **hps.model,
)
model = model.to("cuda")

# Restore the trained weights into the freshly built model.
utils.load_checkpoint(checkpoint_path, model)

# Do not calculate Jacobians, for fast decoding.
model.decoder.store_inverse()

# Assign to `_` so the cell does not echo the model repr.
_ = model.eval()

## HiFi-GAN

In [None]:
!mkdir hifi-gan
!gdown --id '14NENd4equCBLyyCSke114Mv6YR_j_uFs' --output hifi-gan/
!gdown --id '1aDh576AEYA5eTjhx7sew1qcCM_Y526jc' --output hifi-gan/

# Text-to-speech

## Text-to-mel

In [None]:
# Sentence to synthesise.
tst_stn = "你好，我是派蒙。"

# Convert the sentence to a sequence of symbol ids.
if getattr(hps.data, "add_blank", False):
    text_norm = text_to_sequence(tst_stn.strip(), ['chinese_cleaners'])
    # Intersperse the extra id len(symbols) into the sequence; it is printed
    # below as "<BNK>".
    text_norm = commons.intersperse(text_norm, len(symbols))
else:
    # If not using the "add_blank" option during training, adding spaces at the
    # beginning and the end of the utterance improves quality. The padded string
    # must NOT be stripped again, or the added spaces are thrown away.
    tst_stn = " " + tst_stn.strip() + " "
    text_norm = text_to_sequence(tst_stn, ['chinese_cleaners'])

# Add a leading batch axis of size 1.
sequence = np.array(text_norm)[None, :]
# Echo the symbols actually fed to the model (ids outside `symbols` show as <BNK>).
print("".join([symbols[c] if c < len(symbols) else "<BNK>" for c in sequence[0]]))

# torch.autograd.Variable is a deprecated no-op wrapper; use the tensor directly.
x_tst = torch.from_numpy(sequence).cuda().long()
x_tst_lengths = torch.tensor([x_tst.shape[1]]).cuda()

# Generate without tracking gradients.
with torch.no_grad():
    noise_scale = .667
    length_scale = 1.0
    (y_gen_tst, *_), *_, (attn_gen, *_) = model(
        x_tst,
        x_tst_lengths,
        gen=True,
        noise_scale=noise_scale,
        length_scale=length_scale,
    )

# Save mel-frames for the vocoder. The absolute path keeps this cell correct
# even if a later cell has changed the working directory.
mel_dir = "/content/Glow-TTS/hifi-gan/test_mel_files"
os.makedirs(mel_dir, exist_ok=True)
np.save(os.path.join(mel_dir, "sample.npy"), y_gen_tst.cpu().detach().numpy())

## Mel-to-wav

In [None]:
%cd hifi-gan
!python inference_e2e.py --checkpoint_file /content/Glow-TTS/hifi-gan/generator_v1
ipd.Audio("generated_files_from_mel/sample_generated_e2e.wav", rate=22050)