# Mozilla TTS Synthesis with Griffin-Lim (GL)
## Use your model 

### Set the paths for your model


In [1]:
# model paths
# Default checkpoint/config locations. Note: TTS_MODEL and TTS_CONFIG are
# reassigned by the following cell, and the VOCODER_* paths are not referenced
# by any other cell visible in this notebook (synthesis runs with Griffin-Lim).
TTS_MODEL = "data/tts_model.pth.tar"
TTS_CONFIG = "data/config.json"
VOCODER_MODEL = "data/vocoder_model.pth.tar"
VOCODER_CONFIG = "data/config_vocoder.json"

In [2]:
# Training run to synthesize from. Every artifact below lives in this single
# directory, so switching to another run only requires editing this one line.
TTS_RUN_DIR = "../../train/multilingual_test-April-12-2021_02+49PM-9f3f69e"

TTS_MODEL = TTS_RUN_DIR + "/best_model.pth.tar"   # model checkpoint
TTS_CONFIG = TTS_RUN_DIR + "/config.json"         # config file (loaded with load_config)
TTS_LANGUAGES = TTS_RUN_DIR + "/languages.json"   # language mapping file
TTS_SPEAKERS = TTS_RUN_DIR + "/speakers.json"     # speaker mapping file

### Define TTS function

In [3]:
def tts(model, text, CONFIG, use_cuda, ap, speaker_id=None, language_id=None, language_mapping=None, figures=True):
    """Synthesize `text` with Griffin-Lim, print timing statistics and play the audio.

    Args:
        model: TTS model, forwarded to `synthesis`.
        text: sentence to synthesize.
        CONFIG: loaded TTS config; `enable_eos_bos_chars` and
            `audio['sample_rate']` are read from it.
        use_cuda: forwarded to `synthesis`.
        ap: audio processor; its `sample_rate` is used for the real-time factor.
        speaker_id: forwarded to `synthesis`.
        language_id: forwarded to `synthesis`.
        language_mapping: forwarded to `synthesis`.
        figures: accepted for backward compatibility; not used by this function.

    Returns:
        Tuple `(alignment, mel_postnet_spec, stop_tokens, waveform)` taken
        from the values returned by `synthesis`.
    """
    t_1 = time.time()
    waveform, alignment, _mel_spec, mel_postnet_spec, stop_tokens, _inputs = synthesis(
        model,
        text,
        CONFIG,
        use_cuda,
        ap,
        speaker_id=speaker_id,
        language_id=language_id,
        language_mapping=language_mapping,
        style_wav=None,
        truncated=False,
        enable_eos_bos_chars=CONFIG.enable_eos_bos_chars,
        use_griffin_lim=True,
    )
    # Take the elapsed time exactly once so that all three reported numbers
    # describe the same measurement (and exclude the time spent printing).
    run_time = time.time() - t_1
    num_samples = len(waveform)
    # real-time factor: synthesis time divided by the duration of the audio
    rtf = run_time / (num_samples / ap.sample_rate)
    # time spent per generated waveform sample
    tps = run_time / num_samples
    print(waveform.shape)
    print(" > Run-time: {}".format(run_time))
    print(" > Real-time factor: {}".format(rtf))
    print(" > Time per step: {}".format(tps))
    IPython.display.display(IPython.display.Audio(waveform, rate=CONFIG.audio['sample_rate']))
    return alignment, mel_postnet_spec, stop_tokens, waveform

### Load Models

In [4]:
import os
import torch
import time
import IPython

from TTS.tts.utils.generic_utils import setup_model
from TTS.utils.io import load_config
from TTS.tts.utils.speakers import load_speaker_mapping, load_language_mapping
from TTS.tts.utils.text.symbols import symbols, phonemes
from TTS.utils.audio import AudioProcessor
from TTS.tts.utils.synthesis import synthesis

In [5]:
# runtime settings
# Use the GPU only when one is actually available, so the notebook also runs
# on CPU-only machines (the model is moved with model.cuda() when this is True).
use_cuda = torch.cuda.is_available()

In [6]:
# load config
# TTS_CONFIG starts out as a path string and is replaced by the loaded config
# object. Guard on the type so that re-running this cell does not pass the
# already loaded config back into load_config.
if isinstance(TTS_CONFIG, str):
    TTS_CONFIG = load_config(TTS_CONFIG)

In [7]:
# build the audio processor from the audio section of the config
audio_settings = TTS_CONFIG.audio
audio_settings['stats_path'] = None  # override the configured stats_path before construction
ap = AudioProcessor(**audio_settings)

 > Setting up Audio Processor...
 | > sample_rate:16000
 | > resample:False
 | > num_mels:80
 | > min_level_db:-100
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:20
 | > fft_size:1024
 | > power:1.5
 | > preemphasis:0.0
 | > griffin_lim_iters:60
 | > signal_norm:True
 | > symmetric_norm:True
 | > mel_fmin:50.0
 | > mel_fmax:7600.0
 | > spec_gain:20.0
 | > stft_pad_mode:reflect
 | > max_norm:4.0
 | > clip_norm:True
 | > do_trim_silence:True
 | > trim_db:35
 | > do_sound_norm:False
 | > stats_path:None
 | > hop_length:256
 | > win_length:1024


In [8]:
# LOAD TTS MODEL

# speaker and language lookup tables for this run
speaker_mapping = load_speaker_mapping(TTS_SPEAKERS)
language_mapping = load_language_mapping(TTS_LANGUAGES)

# construct the model; the input vocabulary size is the phoneme set or the
# plain symbol set, depending on the config
if TTS_CONFIG.use_phonemes:
    num_chars = len(phonemes)
else:
    num_chars = len(symbols)
num_speakers = len(speaker_mapping)
num_languages = len(language_mapping)
model = setup_model(num_chars, num_speakers, num_languages, TTS_CONFIG)

# read the checkpoint onto the CPU
# (torch.load unpickles the file, so only load checkpoints you trust)
cpu_device = torch.device('cpu')
cp = torch.load(TTS_MODEL, map_location=cpu_device)

# restore the weights, move to the GPU if requested, switch to eval mode
model.load_state_dict(cp['model'])
if use_cuda:
    model.cuda()
model.eval()

# restore the decoder step size (r) when the checkpoint stores one
if 'r' in cp:
    model.decoder.set_r(cp['r'])

 > Using model: Tacotron2


In [9]:
# display the language-code -> id mapping; these ids are what the inference
# cells below pass to tts() as language_id
language_mapping

{'de': 0, 'en-us': 1, 'es': 2, 'fr-fr': 3}

## Run Inference

In [10]:
sentence = "Bill a pris l'habitude de se demander. Cette pensée est-elle vraie ? et s'il n'en était pas absolument certain, il la laissait tomber."
# look the language id up by its code instead of hard-coding the index, so the
# call stays correct if the mapping order changes
align, spec, stop_tokens, wav = tts(model, sentence, TTS_CONFIG, use_cuda, ap, speaker_id=1, language_id=language_mapping['fr-fr'], language_mapping=language_mapping, figures=True)



(123904,)
 > Run-time: 1.3277840614318848
 > Real-time factor: 0.17144925091877458
 > Time per step: 1.0715587803524387e-05


In [11]:
sentence = "Bill machte es sich zur Gewohnheit, sich selbst zu fragen. Ist dieser Gedanke wahr? und wenn er sich nicht absolut sicher wäre, würde er ihn fallen lassen."
# look the language id up by its code instead of hard-coding the index, so the
# call stays correct if the mapping order changes
align, spec, stop_tokens, wav = tts(model, sentence, TTS_CONFIG, use_cuda, ap, speaker_id=8, language_id=language_mapping['de'], language_mapping=language_mapping, figures=True)

(153344,)
 > Run-time: 1.5708439350128174
 > Real-time factor: 0.16386257189144077
 > Time per step: 1.0241418517193133e-05


In [22]:
sentence = "Bill got into the habit of asking himself. Is that thought true? and if he wasn't absolutely certain it was, he would let it go."
# look the language id up by its code instead of hard-coding the index, so the
# call stays correct if the mapping order changes
align, spec, stop_tokens, wav = tts(model, sentence, TTS_CONFIG, use_cuda, ap, speaker_id=9, language_id=language_mapping['en-us'], language_mapping=language_mapping, figures=True)

(143104,)
 > Run-time: 1.473555564880371
 > Real-time factor: 0.16470314778240933
 > Time per step: 1.0293956732707289e-05


In [13]:
sentence = "Bill se acostumbró a preguntarse. ¿Es cierto ese pensamiento? y si no estaba absolutamente seguro de que lo era, lo dejaba pasar."
# look the language id up by its code instead of hard-coding the index, so the
# call stays correct if the mapping order changes
align, spec, stop_tokens, wav = tts(model, sentence, TTS_CONFIG, use_cuda, ap, speaker_id=13, language_id=language_mapping['es'], language_mapping=language_mapping, figures=True)

(130304,)
 > Run-time: 1.3764050006866455
 > Real-time factor: 0.1689953572155216
 > Time per step: 1.0562215315101189e-05
