# Mozilla TTS Synthesis with GL
## Use your model 

### Set the paths for your model


In [1]:
# model paths
TTS_MODEL = "data/tts_model.pth.tar"
TTS_CONFIG = "data/config.json"
VOCODER_MODEL = "data/vocoder_model.pth.tar"
VOCODER_CONFIG = "data/config_vocoder.json"

In [2]:
TTS_MODEL = "/home/julian/workspace/train/Multilingual_multispeaker_WS-April-18-2021_11+33PM-2ef3868e/checkpoint_30000.pth.tar"
TTS_CONFIG = "/home/julian/workspace/train/Multilingual_multispeaker_WS-April-18-2021_11+33PM-2ef3868e/config.json"
TTS_LANGUAGES = "/home/julian/workspace/train/Multilingual_multispeaker_WS-April-18-2021_11+33PM-2ef3868e/languages.json"
TTS_SPEAKERS = "/home/julian/workspace/train/Multilingual_multispeaker_WS-April-18-2021_11+33PM-2ef3868e/speakers.json"

In [48]:
TTS_MODEL = "/home/julian/workspace/one_speaker_model/checkpoint_80000.pth.tar"
TTS_CONFIG = "/home/julian/workspace/one_speaker_model/config.json"
TTS_LANGUAGES = None
TTS_SPEAKERS = "/home/julian/workspace/one_speaker_model/speakers.json"

In [35]:
VOCODER_MODEL = "/home/julian/workspace/HifiGAN/HifiGAN.pth.tar"
VOCODER_CONFIG = "/home/julian/workspace/HifiGAN/config.json"

### Define TTS function

In [36]:
def interpolate_vocoder_input(scale_factor, spec):
    print(" > before interpolation :", spec.shape)
    spec = torch.tensor(spec).unsqueeze(0).unsqueeze(0)
    spec = torch.nn.functional.interpolate(spec, scale_factor=scale_factor, mode='bilinear').squeeze(0)
    print(" > after interpolation :", spec.shape)
    return spec

In [45]:
def tts(model, text, CONFIG, use_cuda, ap, speaker_id=None, language_id=None, language_mapping=None, figures=True):
    t_1 = time.time()
    waveform, alignment, mel_spec, mel_postnet_spec, stop_tokens, inputs = synthesis(model, text, CONFIG, use_cuda, ap, speaker_id=speaker_id, language_id=language_id, language_mapping=language_mapping, style_wav=None, truncated=False, enable_eos_bos_chars=CONFIG.enable_eos_bos_chars, use_griffin_lim=True)
    vocoder_input = interpolate_vocoder_input(scale_factor, mel_postnet_spec.T)
    #vocoder_input = vocoder_input.unsqueeze(0)
    print(vocoder_input.shape)
    waveform = vocoder_model.inference(vocoder_input.cuda()).cpu().squeeze()
    rtf = (time.time() - t_1) / (len(waveform) / ap.sample_rate)
    tps = (time.time() - t_1) / len(waveform)
    print(waveform.shape)
    print(" > Run-time: {}".format(time.time() - t_1))
    print(" > Real-time factor: {}".format(rtf))
    print(" > Time per step: {}".format(tps))
    IPython.display.display(IPython.display.Audio(waveform, rate=22050))  
    return alignment, mel_postnet_spec, stop_tokens, waveform

### Load Models

In [38]:
import os
import torch
import time
import IPython

from TTS.tts.utils.generic_utils import setup_model
from TTS.utils.io import load_config
from TTS.tts.utils.speakers import load_speaker_mapping, load_language_mapping
from TTS.tts.utils.text.symbols import symbols, phonemes
from TTS.utils.audio import AudioProcessor
from TTS.tts.utils.synthesis import synthesis
from TTS.vocoder.utils.generic_utils import setup_generator

In [39]:
# runtime settings
use_cuda = True
use_gl = False

In [49]:
# LOAD TTS MODEL
TTS_CONFIG = load_config(TTS_CONFIG)

TTS_CONFIG.audio['stats_path'] = None
ap = AudioProcessor(**TTS_CONFIG.audio)       

# Load speakers and languages
speaker_mapping = load_speaker_mapping(TTS_SPEAKERS)
language_mapping = dict({}) #load_language_mapping(TTS_LANGUAGES)

# load the model
num_chars = len(phonemes) if TTS_CONFIG.use_phonemes else len(symbols)
model = setup_model(num_chars, len(speaker_mapping), len(language_mapping), TTS_CONFIG)

# load model state
cp =  torch.load(TTS_MODEL, map_location=torch.device('cpu'))

# load the model
model.load_state_dict(cp['model'])
if use_cuda:
    model.cuda()
model.eval()

# set model stepsize
if 'r' in cp:
    model.decoder.set_r(cp['r'])

 > Setting up Audio Processor...
 | > sample_rate:16000
 | > resample:False
 | > num_mels:80
 | > min_level_db:-100
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:20
 | > fft_size:1024
 | > power:1.5
 | > preemphasis:0.98
 | > griffin_lim_iters:60
 | > signal_norm:True
 | > symmetric_norm:True
 | > mel_fmin:0
 | > mel_fmax:8000.0
 | > spec_gain:20.0
 | > stft_pad_mode:reflect
 | > max_norm:4.0
 | > clip_norm:True
 | > do_trim_silence:True
 | > trim_db:27
 | > do_sound_norm:False
 | > stats_path:None
 | > base:10
 | > hop_length:256
 | > win_length:1024
 > Using model: Tacotron2


In [50]:
if use_gl == False:
    VOCODER_CONFIG = load_config(VOCODER_CONFIG)
    vocoder_model = setup_generator(VOCODER_CONFIG)
    cp = torch.load(VOCODER_MODEL, map_location="cpu")["model"]
    vocoder_model.load_state_dict(cp)
    scale_factor = [1,  VOCODER_CONFIG['audio']['sample_rate'] / ap.sample_rate]
    print(f"scale_factor: {scale_factor}")
    ap_vocoder = AudioProcessor(**VOCODER_CONFIG['audio'])    
    if use_cuda:
        vocoder_model.cuda()
    vocoder_model.eval()

TypeError: expected str, bytes or os.PathLike object, not AttrDict

In [51]:
speaker_mapping

{'nadine_eckert_boulet': 0}

In [52]:
language_mapping

{}

## Run Inference

In [53]:
sentence =  "L'exploration du système solaire à l'aide de robots débute à la fin des années 1950."
align, spec, stop_tokens, wav = tts(model, sentence, TTS_CONFIG, use_cuda, ap, speaker_id=0, figures=True)

 > before interpolation : (80, 357)
 > after interpolation : torch.Size([1, 80, 491])
torch.Size([1, 80, 491])
torch.Size([128256])
 > Run-time: 1.3504705429077148
 > Real-time factor: 0.1684681265178079
 > Time per step: 1.0529313675181832e-05


In [55]:
sentence =  "Chez sopra steria, on se voit une fois par semaine."
align, spec, stop_tokens, wav = tts(model, sentence, TTS_CONFIG, use_cuda, ap, speaker_id=0, figures=True)

 > before interpolation : (80, 201)
 > after interpolation : torch.Size([1, 80, 277])
torch.Size([1, 80, 277])
torch.Size([73472])
 > Run-time: 0.8940544128417969
 > Real-time factor: 0.19469062833420492
 > Time per step: 1.216826162167958e-05
