In [1]:
import argparse
import json
import os
import string
import time
import sys
import numpy as np
TTS_PATH = "/home/iref/PycharmProjects/tts-vc/"
sys.path.append(TTS_PATH)
import torch

from mozilla_TTS_utils.tts_generic_utils import setup_model
from mozilla_TTS_utils.synthesis import synthesis
from mozilla_TTS_utils.text.symbols import make_symbols, phonemes, symbols
from mozilla_TTS_utils.audio import AudioProcessor
from mozilla_TTS_utils.io import load_config
from mozilla_TTS_utils.vocoder.utils.generic_utils import setup_generator, interpolate_vocoder_input

In [2]:
def tts(model, vocoder_model, text, CONFIG, use_cuda, ap, use_gl, speaker_fileid, speaker_embedding=None):
    t_1 = time.time()
    waveform, _, _, mel_postnet_spec, _, _ = synthesis(model, text, CONFIG, use_cuda, ap, speaker_fileid, None, False, CONFIG.enable_eos_bos_chars, use_gl, speaker_embedding=speaker_embedding)
    if CONFIG.model == "Tacotron" and not use_gl:
        mel_postnet_spec = ap.out_linear_to_mel(mel_postnet_spec.T.unsqueeze(0)).T
    if not use_gl:
        #mel_postnet_spec = interpolate_vocoder_input(1.5, mel_postnet_spec)
        waveform = vocoder_model.inference(torch.FloatTensor(mel_postnet_spec.T).unsqueeze(0))
    if use_cuda and not use_gl:
        waveform = waveform.cpu()
    if not use_gl:
        waveform = waveform.numpy()
    waveform = waveform.squeeze()
    rtf = (time.time() - t_1) / (len(waveform) / ap.sample_rate)
    tps = (time.time() - t_1) / len(waveform)
    print(" > Run-time: {}".format(time.time() - t_1))
    print(" > Real-time factor: {}".format(rtf))
    print(" > Time per step: {}".format(tps))
    return waveform

In [18]:
TEXT = ''
OUT_PATH = '../tests-audios/'
# create output path
os.makedirs(OUT_PATH, exist_ok=True)

SPEAKER_FILEID = 'Bronevoy_L-02.wav' # if None use the first embedding from speakers.json

# model vars 
MODEL_PATH = '../models_and_weights/checkpoint_220000.pth.tar'
CONFIG_PATH = '../models_and_weights/220_taco_config.json'
SPEAKER_JSON = '../data/preprocessed_mozilla/speaker.json'

# vocoder vars
VOCODER_PATH = ''
#VOCODER_PATH = '../models_and_weights/melgan_best_model.pth.tar'
VOCODER_CONFIG_PATH = '../models_and_weights/melgan_config.json'

USE_CUDA = True

In [19]:
# load the config
C = load_config(CONFIG_PATH)
C.forward_attn_mask = True

# load the audio processor
ap = AudioProcessor(**C.audio)

# if the vocabulary was passed, replace the default
if 'characters' in C.keys():
    symbols, phonemes = make_symbols(**C.characters)

speaker_embedding = None
speaker_embedding_dim = None
num_speakers = 0
# load speakers
if SPEAKER_JSON != '':
    speaker_mapping = json.load(open(SPEAKER_JSON, 'r'))
    num_speakers = len(speaker_mapping)
    if C.use_external_speaker_embedding_file:
        if SPEAKER_FILEID is not None:
            speaker_embedding = speaker_mapping[SPEAKER_FILEID]['embedding']
        else: # if speaker_fileid is not specificated use the first sample in speakers.json
            choise_speaker = list(speaker_mapping.keys())[0]
            print(" Speaker: ",choise_speaker.split('_')[0],'was chosen automatically', "(this speaker seen in training)")
            speaker_embedding = speaker_mapping[choise_speaker]['embedding']
        speaker_embedding_dim = len(speaker_embedding)

# load the model
num_chars = len(phonemes) if C.use_phonemes else len(symbols)
model = setup_model(num_chars, num_speakers, C, speaker_embedding_dim)
cp = torch.load(MODEL_PATH, map_location=torch.device('cpu'))
model.load_state_dict(cp['model'])
model.eval()

if USE_CUDA:
    model.cuda()

model.decoder.set_r(cp['r'])

# load vocoder model
if VOCODER_PATH!= "":
    VC = load_config(VOCODER_CONFIG_PATH)
    vocoder_model = setup_generator(VC)
    vocoder_model.load_state_dict(torch.load(VOCODER_PATH, map_location="cpu")["model"])
    vocoder_model.remove_weight_norm()
    if USE_CUDA:
        vocoder_model.cuda()
    vocoder_model.eval()
else:
    vocoder_model = None
    VC = None

# synthesize voice
use_griffin_lim = VOCODER_PATH== ""

if not C.use_external_speaker_embedding_file:
    if SPEAKER_FILEID.isdigit():
        SPEAKER_FILEID = int(SPEAKER_FILEID)
    else:
        SPEAKER_FILEID = None
else:
    SPEAKER_FILEID = None

 > Setting up Audio Processor...
 | > sample_rate:16000
 | > resample:False
 | > num_mels:80
 | > min_level_db:-100
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:20
 | > fft_size:1024
 | > power:1.5
 | > preemphasis:0.0
 | > griffin_lim_iters:60
 | > signal_norm:True
 | > symmetric_norm:True
 | > mel_fmin:50.0
 | > mel_fmax:7600.0
 | > spec_gain:20.0
 | > stft_pad_mode:reflect
 | > max_norm:4.0
 | > clip_norm:True
 | > do_trim_silence:True
 | > trim_db:20
 | > do_sound_norm:False
 | > stats_path:None
 | > hop_length:256
 | > win_length:1024
 > Using model: Tacotron2


In [20]:
import IPython
from IPython.display import Audio
print("Synthesize sentence with Speaker: ",choise_speaker.split('_')[0], "(this speaker seen in training)")
while True:
  TEXT = input("Enter sentence: ")
  if TEXT == 'q':
    break
  print(" > Text: {}".format(TEXT))
  wav = tts(model, vocoder_model, TEXT, C, USE_CUDA, ap, use_griffin_lim, SPEAKER_FILEID, speaker_embedding=speaker_embedding)
  IPython.display.display(Audio(wav, rate=ap.sample_rate))
  # save the results
  file_name = TEXT.replace(" ", "_")
  file_name = file_name.translate(
      str.maketrans('', '', string.punctuation.replace('_', ''))) + '.wav'
  out_path = os.path.join(OUT_PATH, file_name)
  print(" > Saving output to {}".format(out_path))
  ap.save_wav(wav, out_path)

Synthesize sentence with Speaker:  Andrienko (this speaker seen in training)
Enter sentence: А вас штирлиц я попрошу остаться
 > Text: А вас штирлиц я попрошу остаться




 > Run-time: 2.396601676940918
 > Real-time factor: 1.3998815110910718
 > Time per step: 8.749270759453283e-05


 > Saving output to ../tests-audios/А_вас_штирлиц_я_попрошу_остаться.wav
Enter sentence: А вас штирлиц
 > Text: А вас штирлиц




 > Run-time: 1.088343620300293
 > Real-time factor: 1.447260062745277
 > Time per step: 9.045399170606694e-05


 > Saving output to ../tests-audios/А_вас_штирлиц.wav
Enter sentence: Мюллер старый больной человек
 > Text: Мюллер старый больной человек




 > Run-time: 1.8312647342681885
 > Real-time factor: 1.1112017249598087
 > Time per step: 6.945023439752245e-05


 > Saving output to ../tests-audios/Мюллер_старый_больной_человек.wav
Enter sentence: а я хоть раз устраивал
 > Text: а я хоть раз устраивал




 > Run-time: 1.6200103759765625
 > Real-time factor: 1.2198842792625886
 > Time per step: 7.624291332371263e-05


 > Saving output to ../tests-audios/а_я_хоть_раз_устраивал.wav


KeyboardInterrupt: Interrupted by user