# Mozilla TTS Synthesis with GL
## Use your model 

### Set the paths for your model


In [4]:
# model paths
TTS_MODEL = "data/tts_model.pth.tar"
TTS_CONFIG = "data/config.json"
VOCODER_MODEL = "data/vocoder_model.pth.tar"
VOCODER_CONFIG = "data/config_vocoder.json"

In [25]:
TTS_MODEL = "/home/julian/workspace/train/Multilingual_multispeaker-April-16-2021_12+34AM-904c6b1/best_model.pth.tar"
TTS_CONFIG = "/home/julian/workspace/train/Multilingual_multispeaker-April-16-2021_12+34AM-904c6b1/config.json"
TTS_LANGUAGES = "/home/julian/workspace/train/Multilingual_multispeaker-April-16-2021_12+34AM-904c6b1/languages.json"
TTS_SPEAKERS = "/home/julian/workspace/train/Multilingual_multispeaker-April-16-2021_12+34AM-904c6b1/speakers.json"

In [12]:
TTS_MODEL = "/home/julian/workspace/train/Multilingual_multispeaker_GST-April-17-2021_01+27PM-904c6b1/checkpoint_30000.pth.tar"
TTS_CONFIG = "/home/julian/workspace/train/Multilingual_multispeaker_GST-April-17-2021_01+27PM-904c6b1/config.json"
TTS_LANGUAGES = "/home/julian/workspace/train/Multilingual_multispeaker_GST-April-17-2021_01+27PM-904c6b1/languages.json"
TTS_SPEAKERS = "/home/julian/workspace/train/Multilingual_multispeaker_GST-April-17-2021_01+27PM-904c6b1/speakers.json"

In [1]:
TTS_MODEL = "/home/julian/workspace/train/Multilingual_multispeaker_WS-April-18-2021_11+33PM-2ef3868e/checkpoint_30000.pth.tar"
TTS_CONFIG = "/home/julian/workspace/train/Multilingual_multispeaker_WS-April-18-2021_11+33PM-2ef3868e/config.json"
TTS_LANGUAGES = "/home/julian/workspace/train/Multilingual_multispeaker_WS-April-18-2021_11+33PM-2ef3868e/languages.json"
TTS_SPEAKERS = "/home/julian/workspace/train/Multilingual_multispeaker_WS-April-18-2021_11+33PM-2ef3868e/speakers.json"

### Define TTS function

In [2]:
def tts(model, text, CONFIG, use_cuda, ap, speaker_id=None, language_id=None, language_mapping=None, figures=True):
    t_1 = time.time()
    waveform, alignment, mel_spec, mel_postnet_spec, stop_tokens, inputs = synthesis(model, text, CONFIG, use_cuda, ap, speaker_id=speaker_id, language_id=language_id, language_mapping=language_mapping, style_wav=None, truncated=False, enable_eos_bos_chars=CONFIG.enable_eos_bos_chars, use_griffin_lim=True)
    rtf = (time.time() - t_1) / (len(waveform) / ap.sample_rate)
    tps = (time.time() - t_1) / len(waveform)
    print(waveform.shape)
    print(" > Run-time: {}".format(time.time() - t_1))
    print(" > Real-time factor: {}".format(rtf))
    print(" > Time per step: {}".format(tps))
    IPython.display.display(IPython.display.Audio(waveform, rate=CONFIG.audio['sample_rate']))  
    return alignment, mel_postnet_spec, stop_tokens, waveform

### Load Models

In [3]:
import os
import torch
import time
import IPython

from TTS.tts.utils.generic_utils import setup_model
from TTS.utils.io import load_config
from TTS.tts.utils.speakers import load_speaker_mapping, load_language_mapping
from TTS.tts.utils.text.symbols import symbols, phonemes
from TTS.utils.audio import AudioProcessor
from TTS.tts.utils.synthesis import synthesis

In [4]:
# runtime settings
use_cuda = True

In [5]:
# load config
TTS_CONFIG = load_config(TTS_CONFIG)

In [6]:
# load the audio processor
TTS_CONFIG.audio['stats_path'] = None
ap = AudioProcessor(**TTS_CONFIG.audio)         

self.log_func = np.log10
 > Setting up Audio Processor...
 | > sample_rate:16000
 | > resample:False
 | > num_mels:80
 | > min_level_db:-100
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:20
 | > fft_size:1024
 | > power:1.5
 | > preemphasis:0.98
 | > griffin_lim_iters:60
 | > signal_norm:True
 | > symmetric_norm:True
 | > mel_fmin:0
 | > mel_fmax:8000.0
 | > spec_gain:20.0
 | > stft_pad_mode:reflect
 | > max_norm:4.0
 | > clip_norm:True
 | > do_trim_silence:True
 | > trim_db:27
 | > do_sound_norm:False
 | > stats_path:None
 | > log_func:<ufunc 'log10'>
 | > exp_func:<function AudioProcessor.__init__.<locals>.<lambda> at 0x7f544ad3e040>
 | > hop_length:256
 | > win_length:1024


In [7]:
# LOAD TTS MODEL

# Load speakers and languages
speaker_mapping = load_speaker_mapping(TTS_SPEAKERS)
language_mapping = load_language_mapping(TTS_LANGUAGES)

# load the model
num_chars = len(phonemes) if TTS_CONFIG.use_phonemes else len(symbols)
model = setup_model(num_chars, len(speaker_mapping), len(language_mapping), TTS_CONFIG)

# load model state
cp =  torch.load(TTS_MODEL, map_location=torch.device('cpu'))

# load the model
model.load_state_dict(cp['model'])
if use_cuda:
    model.cuda()
model.eval()

# set model stepsize
if 'r' in cp:
    model.decoder.set_r(cp['r'])

 > Using model: Tacotron2


In [8]:
speaker_mapping

{'bernard': 0,
 'elliot_miller': 1,
 'eva_k': 2,
 'ezwa': 3,
 'gilles_g_le_blanc': 4,
 'hajdurova': 5,
 'judy_bieber': 6,
 'karen_savage': 7,
 'karlsson': 8,
 'lisa_caputo': 9,
 'mary_ann': 10,
 'minaev': 11,
 'nadine_eckert_boulet': 12,
 'nikolaev': 13,
 'nina_brown': 14,
 'piotr_nater': 15,
 'ramona_deininger': 16,
 'rebecca_braunert_plunkett': 17,
 'riccardo_fasol': 18,
 'tux': 19,
 'victor_villarraza': 20,
 'zeckou': 21}

In [9]:
language_mapping

{'de': 0, 'en-us': 1, 'es': 2, 'fr-fr': 3, 'it': 4, 'pl': 5, 'ru': 6}

## Run Inference

In [18]:
sentence =  "L'exploration du système solaire à l'aide de robots débute à la fin des années 1950."
align, spec, stop_tokens, wav = tts(model, sentence, TTS_CONFIG, use_cuda, ap, speaker_id=12, language_id=3, language_mapping=language_mapping, figures=True)

(80640,)
 > Run-time: 0.9225606918334961
 > Real-time factor: 0.18303446353427946
 > Time per step: 1.1439668753790477e-05


In [15]:
sentence =  "Unter Sprachsynthese versteht man die künstliche Erzeugung der menschlichen Sprechstimme."
align, spec, stop_tokens, wav = tts(model, sentence, TTS_CONFIG, use_cuda, ap, speaker_id=2, language_id=0, language_mapping=language_mapping, figures=True)

(75520,)
 > Run-time: 1.169550895690918
 > Real-time factor: 0.24776923454413982
 > Time per step: 1.5485599258188473e-05


In [17]:
sentence =  "A computer system used for this purpose is called a speech computer or speech synthesizer, and can be implemented in software or hardware products."
align, spec, stop_tokens, wav = tts(model, sentence, TTS_CONFIG, use_cuda, ap, speaker_id=10, language_id=1, language_mapping=language_mapping, figures=True)

(132864,)
 > Run-time: 1.623436450958252
 > Real-time factor: 0.1954908602958929
 > Time per step: 1.2218205685330724e-05


In [36]:
sentence =  "Then the North Wind blew as hard as he could, but the more he blew the more closely did the traveler fold his cloak around him;"
align, spec, stop_tokens, wav = tts(model, sentence, TTS_CONFIG, use_cuda, ap, speaker_id=11, language_id=2, language_mapping=language_mapping, figures=True)

(114432,)
 > Run-time: 1.640629529953003
 > Real-time factor: 0.2293830083253933
 > Time per step: 1.4336454688302622e-05


In [20]:
sentence =  "Bóreas empezó de primero, soplando con violencia; y apretó el hombre contra sí sus ropas"
align, spec, stop_tokens, wav = tts(model, sentence, TTS_CONFIG, use_cuda, ap, speaker_id=19, language_id=2, language_mapping=language_mapping, figures=True)

(73472,)
 > Run-time: 1.2720589637756348
 > Real-time factor: 0.2769990872838356
 > Time per step: 1.7312459180371687e-05


In [75]:
sentence =  "Bill ha preso l'abitudine di chiedersi. È vero quel pensiero? e se non era assolutamente certo che lo fosse, lasciava perdere."
align, spec, stop_tokens, wav = tts(model, sentence, TTS_CONFIG, use_cuda, ap, speaker_id=19, language_id=5, language_mapping=language_mapping, figures=True)

(91904,)
 > Run-time: 1.3231024742126465
 > Real-time factor: 0.23033061067373972
 > Time per step: 1.439569689188163e-05


In [76]:
sentence =  "Билл вошёл в привычку спрашивать себя. Это правда? И если бы он не был абсолютно уверен в этом, он бы отпустил это."
align, spec, stop_tokens, wav = tts(model, sentence, TTS_CONFIG, use_cuda, ap, speaker_id=16, language_id=7, language_mapping=language_mapping, figures=True)

(98048,)
 > Run-time: 1.3689019680023193
 > Real-time factor: 0.2233700403656723
 > Time per step: 1.396065913432572e-05


In [77]:
sentence =  "Bill nabrał nawyku zadawania sobie pytania. Czy ta myśl jest prawdziwa? I jeśli nie był absolutnie pewien, że tak, to odpuszczał."
align, spec, stop_tokens, wav = tts(model, sentence, TTS_CONFIG, use_cuda, ap, speaker_id=15, language_id=6, language_mapping=language_mapping, figures=True)

(102144,)
 > Run-time: 1.4859898090362549
 > Real-time factor: 0.23275445726581084
 > Time per step: 1.454717925467288e-05
