# Mozilla TTS Synthesis with GL
## Use your model 

### Set the paths for your model


In [1]:
# Default, repository-relative locations of the TTS and vocoder artifacts.
# Each pair is (checkpoint, config); later cells reassign these names.
TTS_MODEL, TTS_CONFIG = "data/tts_model.pth.tar", "data/config.json"
VOCODER_MODEL, VOCODER_CONFIG = "data/vocoder_model.pth.tar", "data/config_vocoder.json"

In [2]:
# Point the TTS paths at the "Multilingual_multispeaker_WS" training run.
# All four files live in the same run directory, so the prefix is written once.
# NOTE(review): the following cell reassigns all four names, so these values
# are only in effect if that cell is skipped.
TTS_MODEL, TTS_CONFIG, TTS_LANGUAGES, TTS_SPEAKERS = (
    "/home/julian/workspace/train/Multilingual_multispeaker_WS-April-18-2021_11+33PM-2ef3868e/" + file_name
    for file_name in ("checkpoint_30000.pth.tar", "config.json", "languages.json", "speakers.json")
)

In [32]:
# Point the TTS paths at the "Tomiinek_multispeaker_multilingual_char" training run.
# All four files live in the same run directory, so the prefix is written once.
TTS_MODEL, TTS_CONFIG, TTS_LANGUAGES, TTS_SPEAKERS = (
    "/home/julian/workspace/train/Tomiinek_multispeaker_multilingual_char-May-20-2021_04+09PM-de298e36/" + file_name
    for file_name in ("best_model.pth.tar", "config.json", "languages.json", "speakers.json")
)

In [33]:
# Vocoder checkpoint and its config, both stored in the HifiGAN directory.
VOCODER_MODEL, VOCODER_CONFIG = (
    "/home/julian/workspace/HifiGAN/" + file_name
    for file_name in ("checkpoint_490000.pth.tar", "config.json")
)

### Define TTS function

In [34]:
def tts(model, text, CONFIG, use_cuda, ap, speaker_id=None, language_id=None, language_mapping=None, figures=True):
    """Synthesize ``text`` and play the vocoder-generated audio inline.

    Calls ``synthesis`` to obtain the post-net mel spectrogram, feeds it to the
    module-level ``vocoder_model`` to produce the waveform, prints timing
    statistics and displays an audio player.

    Args:
        model: TTS model, passed through to ``synthesis``.
        text: Sentence to synthesize.
        CONFIG: Loaded TTS config; ``CONFIG.enable_eos_bos_chars`` is read here.
        use_cuda: Forwarded to ``synthesis``.
        ap: Audio processor; ``ap.sample_rate`` is used both for the
            real-time factor and as the playback rate.
        speaker_id: Forwarded to ``synthesis``.
        language_id: Forwarded to ``synthesis``.
        language_mapping: Forwarded to ``synthesis``.
        figures: Accepted for backward compatibility; not used by this function.

    Returns:
        Tuple ``(alignment, mel_postnet_spec, stop_tokens, waveform)`` where
        ``waveform`` is the vocoder output moved to the CPU.
    """
    t_1 = time.time()
    # Only the alignment, post-net spectrogram and stop tokens are kept; the
    # waveform returned by synthesis() is replaced by the vocoder output below.
    _, alignment, _, mel_postnet_spec, stop_tokens, _ = synthesis(
        model,
        text,
        CONFIG,
        use_cuda,
        ap,
        speaker_id=speaker_id,
        language_id=language_id,
        language_mapping=language_mapping,
        style_wav=None,
        truncated=False,
        enable_eos_bos_chars=CONFIG.enable_eos_bos_chars,
        use_griffin_lim=True,
    )

    # Run the vocoder on whichever device its weights live on, so the call
    # works whether or not the vocoder was moved to the GPU.
    first_param = next(vocoder_model.parameters(), None)
    vocoder_device = first_param.device if first_param is not None else torch.device("cpu")
    vocoder_input = torch.tensor(mel_postnet_spec.T).unsqueeze(0).to(vocoder_device)
    # Bring the result back to the CPU for playback and for the caller.
    waveform = vocoder_model.inference(vocoder_input).squeeze().cpu()

    # Measure once so that all three reported figures describe the same interval.
    elapsed = time.time() - t_1
    rtf = elapsed / (len(waveform) / ap.sample_rate)
    tps = elapsed / len(waveform)
    print(waveform.shape)
    print(" > Run-time: {}".format(elapsed))
    print(" > Real-time factor: {}".format(rtf))
    print(" > Time per step: {}".format(tps))
    # Play back at the audio processor's sample rate rather than a fixed 16 kHz.
    IPython.display.display(IPython.display.Audio(waveform, rate=ap.sample_rate))
    return alignment, mel_postnet_spec, stop_tokens, waveform

### Load Models

In [35]:
import os
import torch
import time
import IPython

from TTS.tts.utils.generic_utils import setup_model
from TTS.utils.io import load_config
from TTS.tts.utils.speakers import load_speaker_mapping, load_language_mapping
from TTS.tts.utils.text.symbols import symbols, phonemes
from TTS.utils.audio import AudioProcessor
from TTS.tts.utils.synthesis import synthesis
from TTS.vocoder.utils.generic_utils import setup_generator

In [36]:
# runtime settings
# Use the GPU only when one is actually available, so the model-loading cells
# (which call .cuda() when this flag is set) also run on CPU-only machines.
use_cuda = torch.cuda.is_available()
# When False, the vocoder-loading cell below loads the neural vocoder.
use_gl = False

In [37]:
# LOAD TTS MODEL
# TTS_CONFIG starts out as a path and is replaced by the loaded config object.
# Only load while it is still a path, so re-running this cell does not pass an
# already-loaded config back into load_config.
if isinstance(TTS_CONFIG, (str, os.PathLike)):
    TTS_CONFIG = load_config(TTS_CONFIG)

# Clear stats_path before building the audio processor from the audio settings.
TTS_CONFIG.audio['stats_path'] = None
ap = AudioProcessor(**TTS_CONFIG.audio)

# Load speakers and languages
speaker_mapping = load_speaker_mapping(TTS_SPEAKERS)
language_mapping = load_language_mapping(TTS_LANGUAGES)

# Build the model.
# NOTE(review): 119 is a hard-coded input-symbol count for the non-phoneme
# case -- presumably it matches the character set the checkpoint was trained
# with; confirm against the training config before changing it.
num_chars = len(phonemes) if TTS_CONFIG.use_phonemes else 119
model = setup_model(num_chars, len(speaker_mapping), len(language_mapping), TTS_CONFIG)

# Read the checkpoint onto the CPU first; the model is moved to the GPU below if requested.
cp = torch.load(TTS_MODEL, map_location=torch.device('cpu'))

# Restore the weights and switch to inference mode.
model.load_state_dict(cp['model'])
if use_cuda:
    model.cuda()
model.eval()

# Set the decoder step size to the value stored in the checkpoint, if any.
if 'r' in cp:
    model.decoder.set_r(cp['r'])

 > Setting up Audio Processor...
 | > sample_rate:16000
 | > resample:False
 | > num_mels:80
 | > min_level_db:-100
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:20
 | > fft_size:1024
 | > power:1.5
 | > preemphasis:0.0
 | > griffin_lim_iters:60
 | > signal_norm:True
 | > symmetric_norm:True
 | > mel_fmin:0
 | > mel_fmax:8000.0
 | > spec_gain:20.0
 | > stft_pad_mode:reflect
 | > max_norm:4.0
 | > clip_norm:True
 | > do_trim_silence:True
 | > trim_db:27
 | > do_sound_norm:False
 | > stats_path:None
 | > base:10
 | > hop_length:256
 | > win_length:1024
 > Using model: Tacotron2


In [38]:
# Load the neural vocoder unless Griffin-Lim was requested.
if not use_gl:
    # VOCODER_CONFIG starts out as a path and is replaced by the loaded config.
    # Only load while it is still a path so this cell can be re-run safely.
    if isinstance(VOCODER_CONFIG, (str, os.PathLike)):
        VOCODER_CONFIG = load_config(VOCODER_CONFIG)
    vocoder_model = setup_generator(VOCODER_CONFIG)
    # Note: this rebinds `cp`, which previously held the TTS checkpoint.
    cp = torch.load(VOCODER_MODEL, map_location="cpu")["model"]
    vocoder_model.load_state_dict(cp)
    ap_vocoder = AudioProcessor(**VOCODER_CONFIG['audio'])
    if use_cuda:
        vocoder_model.cuda()
    vocoder_model.eval()

 > Generator Model: hifigan_generator
 > Setting up Audio Processor...
 | > sample_rate:16000
 | > resample:False
 | > num_mels:80
 | > min_level_db:-100
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:20
 | > fft_size:1024
 | > power:1.5
 | > preemphasis:0.98
 | > griffin_lim_iters:60
 | > signal_norm:True
 | > symmetric_norm:True
 | > mel_fmin:0
 | > mel_fmax:8000.0
 | > spec_gain:20.0
 | > stft_pad_mode:reflect
 | > max_norm:4.0
 | > clip_norm:True
 | > do_trim_silence:False
 | > trim_db:60
 | > do_sound_norm:True
 | > stats_path:None
 | > base:10
 | > hop_length:256
 | > win_length:1024


In [39]:
# Display the speaker-name -> speaker-id table loaded from TTS_SPEAKERS;
# these ids are the values passed as speaker_id to tts().
speaker_mapping

{'bernard': 0,
 'elliot_miller': 1,
 'eva_k': 2,
 'ezwa': 3,
 'hajdurova': 4,
 'judy_bieber': 5,
 'karen_savage': 6,
 'karlsson': 7,
 'lisa_caputo': 8,
 'mary_ann': 9,
 'minaev': 10,
 'nadine_eckert_boulet': 11,
 'nikolaev': 12,
 'nina_brown': 13,
 'piotr_nater': 14,
 'ramona_deininger': 15,
 'rebecca_braunert_plunkett': 16,
 'riccardo_fasol': 17,
 'tux': 18,
 'victor_villarraza': 19,
 'zeckou': 20}

In [40]:
# Display the language-code -> language-id table loaded from TTS_LANGUAGES;
# these ids are the values passed as language_id to tts().
language_mapping

{'de': 0, 'en-us': 1, 'es': 2, 'fr-fr': 3, 'it': 4, 'pl': 5, 'ru': 6}

In [41]:
# Move the vocoder to the CPU, undoing the .cuda() call made in the loading
# cell when use_cuda is set. The module is the cell's last expression, so its
# repr is displayed as the cell output.
vocoder_model.cpu()

HifiganGenerator(
  (conv_pre): Conv1d(80, 128, kernel_size=(7,), stride=(1,), padding=(3,))
  (ups): ModuleList(
    (0): ConvTranspose1d(128, 64, kernel_size=(16,), stride=(8,), padding=(4,))
    (1): ConvTranspose1d(64, 32, kernel_size=(16,), stride=(8,), padding=(4,))
    (2): ConvTranspose1d(32, 16, kernel_size=(4,), stride=(2,), padding=(1,))
    (3): ConvTranspose1d(16, 8, kernel_size=(4,), stride=(2,), padding=(1,))
  )
  (resblocks): ModuleList(
    (0): ResBlock1(
      (convs1): ModuleList(
        (0): Conv1d(64, 64, kernel_size=(3,), stride=(1,), padding=(1,))
        (1): Conv1d(64, 64, kernel_size=(3,), stride=(1,), padding=(3,), dilation=(3,))
        (2): Conv1d(64, 64, kernel_size=(3,), stride=(1,), padding=(5,), dilation=(5,))
      )
      (convs2): ModuleList(
        (0): Conv1d(64, 64, kernel_size=(3,), stride=(1,), padding=(1,))
        (1): Conv1d(64, 64, kernel_size=(3,), stride=(1,), padding=(1,))
        (2): Conv1d(64, 64, kernel_size=(3,), stride=(1,), pad

## Run Inference

In [52]:
# French sample. Per the mappings displayed earlier in this notebook,
# speaker 0 is 'bernard' and language 3 is 'fr-fr'.
sentence = "N'hésitez pas à saisir autre chose ou bien même à choisir une autre voix."
align, spec, stop_tokens, wav = tts(
    model,
    sentence,
    TTS_CONFIG,
    use_cuda,
    ap,
    speaker_id=0,
    language_id=3,
    language_mapping=language_mapping,
    figures=True,
)

torch.Size([57088])
 > Run-time: 0.5332062244415283
 > Real-time factor: 0.14942236278089172
 > Time per step: 9.338997905831701e-06


In [85]:
# English sample. Per the mappings displayed earlier in this notebook,
# speaker 0 is 'bernard' and language 1 is 'en-us'.
# NOTE(review): "intrested" looks like a typo for "interested"; left as-is
# because it is the text actually sent to the model.
sentence = "What I'm intrested in is pushing the limit of machine learning."
align, spec, stop_tokens, wav = tts(
    model,
    sentence,
    TTS_CONFIG,
    use_cuda,
    ap,
    speaker_id=0,
    language_id=1,
    language_mapping=language_mapping,
    figures=True,
)

torch.Size([44032])
 > Run-time: 0.40143871307373047
 > Real-time factor: 0.1458483206671338
 > Time per step: 9.115655408349148e-06


In [94]:
# Short English sample. Per the mappings displayed earlier in this notebook,
# speaker 9 is 'mary_ann' and language 1 is 'en-us'.
sentence = "science research"
align, spec, stop_tokens, wav = tts(
    model,
    sentence,
    TTS_CONFIG,
    use_cuda,
    ap,
    speaker_id=9,
    language_id=1,
    language_mapping=language_mapping,
    figures=True,
)

torch.Size([13312])
 > Run-time: 0.16953730583190918
 > Real-time factor: 0.203684545480288
 > Time per step: 1.2730624383458724e-05


In [83]:
# Second French sample. Per the mappings displayed earlier in this notebook,
# speaker 0 is 'bernard' and language 3 is 'fr-fr'.
sentence = "Votre transaction est terminée. Bonne journée."
align, spec, stop_tokens, wav = tts(
    model,
    sentence,
    TTS_CONFIG,
    use_cuda,
    ap,
    speaker_id=0,
    language_id=3,
    language_mapping=language_mapping,
    figures=True,
)

torch.Size([38656])
 > Run-time: 0.37804150581359863
 > Real-time factor: 0.15644877162200727
 > Time per step: 9.778171580358847e-06


In [16]:
# Pinyin (romanized Chinese) sample, synthesized with speaker 0 and language id 4.
# NOTE(review): in the language mapping displayed earlier in this notebook,
# id 4 is 'it' and there is no Chinese entry. This cell's execution count is
# lower than the model-loading cells, so it was presumably run against a
# different model/mapping -- confirm the intended language id.
sentence = "huó wúcháng hé sǐ yǒu fēn， hé qǐlái shì rénshēng de xiàngzhēng。 rén jiāng sǐ shí， běn zhǐxū sǐ yǒu fēn láidào。"
align, spec, stop_tokens, wav = tts(
    model,
    sentence,
    TTS_CONFIG,
    use_cuda,
    ap,
    speaker_id=0,
    language_id=4,
    language_mapping=language_mapping,
    figures=True,
)

torch.Size([102912])
 > Run-time: 1.071552038192749
 > Real-time factor: 0.16658727209366375
 > Time per step: 1.0411771690815835e-05


In [78]:
# German sample. Per the mappings displayed earlier in this notebook,
# speaker 15 is 'ramona_deininger' and language 0 is 'de'.
sentence = "Der Drehsinn des Milchstraßensystems stimmt nicht mit dem der Planeten um die Sonne überein."
align, spec, stop_tokens, wav = tts(
    model,
    sentence,
    TTS_CONFIG,
    use_cuda,
    ap,
    speaker_id=15,
    language_id=0,
    language_mapping=language_mapping,
    figures=True,
)

torch.Size([71680])
 > Run-time: 0.6535604000091553
 > Real-time factor: 0.14583846288067953
 > Time per step: 9.114987083843776e-06


In [97]:
# Russian sample. Per the mappings displayed earlier in this notebook,
# speaker 18 is 'tux' and language 6 is 'ru'.
sentence = "Внутренняя часть включает планеты земной группы и астероиды."
align, spec, stop_tokens, wav = tts(
    model,
    sentence,
    TTS_CONFIG,
    use_cuda,
    ap,
    speaker_id=18,
    language_id=6,
    language_mapping=language_mapping,
    figures=True,
)

torch.Size([54016])
 > Run-time: 0.5900812149047852
 > Real-time factor: 0.17476886934578703
 > Time per step: 1.0923160266537236e-05


In [19]:
# Dutch sample, synthesized with speaker 2 and language id 3.
# NOTE(review): in the mappings displayed earlier in this notebook, speaker 2
# is 'eva_k' and language id 3 is 'fr-fr'; there is no Dutch entry. This
# cell's execution count is lower than the model-loading cells, so it was
# presumably run against a different model/mapping -- confirm the intended ids.
sentence = "Daardoor kunnen uitstulpingen en bergen inzakken door hun eigen gewicht."
align, spec, stop_tokens, wav = tts(
    model,
    sentence,
    TTS_CONFIG,
    use_cuda,
    ap,
    speaker_id=2,
    language_id=3,
    language_mapping=language_mapping,
    figures=True,
)

torch.Size([68864])
 > Run-time: 0.7063078880310059
 > Real-time factor: 0.1640908115415325
 > Time per step: 1.0255772661985518e-05
