In [1]:
import argparse
import json
import os
import string
import time
import sys
import numpy as np
TTS_PATH = "/home/iref/PycharmProjects/tts-vc/"
sys.path.append(TTS_PATH)
import torch
import io
import torch 
import yaml

import matplotlib.pyplot as plt
plt.rcParams["figure.figsize"] = (16,5)

import librosa
import librosa.display

import IPython
from IPython.display import Audio

from mozilla_TTS_utils.tts_generic_utils import setup_model
from mozilla_TTS_utils.synthesis import synthesis
from mozilla_TTS_utils.text.symbols import make_symbols, phonemes, symbols
from mozilla_TTS_utils.audio import AudioProcessor
from mozilla_TTS_utils.io import load_config
from mozilla_TTS_utils.vocoder.utils.generic_utils import setup_generator, interpolate_vocoder_input
from mozilla_TTS_utils.data import *
from mozilla_TTS_utils.tts_io import load_checkpoint
from mozilla_TTS_utils.generic_utils import count_parameters
from mozilla_TTS_utils.text import text_to_sequence
from mozilla_TTS_utils.visual import visualize
from mozilla_TTS_utils.text.symbols import symbols, phonemes, make_symbols
#from mozilla_TTS_utils.models.tacotron import Tacotron
from mozilla_TTS_utils.layers import *

In [6]:
def interpolate_vocoder_input(scale_factor, spec):
    print(" > before interpolation :", spec.shape)
    spec = torch.tensor(spec).unsqueeze(0).unsqueeze(0)
    spec = torch.nn.functional.interpolate(spec, scale_factor=scale_factor, mode='bilinear').squeeze(0)
    print(" > after interpolation :", spec.shape)
    return spec


def get_speaker_embedding(speaker_ids, speaker_mapping, num_samples):
    speaker_embeddings = []
    for speaker_id in speaker_ids:
        for key in list(speaker_mapping.keys()):
            if speaker_id in key:
                speaker_embeddings.append(speaker_mapping[key]['embedding']) 
                if len(speaker_embeddings) > num_samples:
                    break
    # takes the average of the embedings samples of the announcers
    speaker_embedding = np.mean(np.array(speaker_embeddings), axis=0)
    return speaker_embedding
    

def tts(model, text, CONFIG, use_cuda, ap, use_gl, figures=True, file_name=None, speaker_id=None, speaker_embedding=None):
    t_1 = time.time()
    waveform, alignment, mel_spec, mel_postnet_spec, stop_tokens, inputs = synthesis(model,
                                                                                     text,
                                                                                     CONFIG,
                                                                                     use_cuda,
                                                                                     ap,
                                                                                     speaker_id,
                                                                                     None,
                                                                                     False,
                                                                                     CONFIG.enable_eos_bos_chars,
                                                                                     use_gl,
                                                                                     speaker_embedding=speaker_embedding)
    if CONFIG.model == "Tacotron" and not use_gl:
        # coorect the normalization differences b/w TTS and the Vocoder.
        mel_postnet_spec = ap.out_linear_to_mel(mel_postnet_spec.T).T
    mel_postnet_spec = ap.denormalize(mel_postnet_spec.T).T
    if not use_gl:
        vocoder_input = ap_vocoder.normalize(mel_postnet_spec.T)
        if scale_factor[1] != 1:
            vocoder_input = interpolate_vocoder_input(scale_factor, vocoder_input)
        else:
            vocoder_input = torch.tensor(vocoder_input).unsqueeze(0)
        waveform = vocoder_model.inference(vocoder_input.cuda())
    if use_cuda and not use_gl:
        waveform = waveform.cpu()
    if not use_gl:
        waveform = waveform.numpy()
    waveform = waveform.squeeze()
    rtf = (time.time() - t_1) / (len(waveform) / ap.sample_rate)
    tps = (time.time() - t_1) / len(waveform)
    print(waveform.shape)
    print(" > Run-time: {}".format(time.time() - t_1))
    print(" > Real-time factor: {}".format(rtf))
    print(" > Time per step: {}".format(tps))
    print("max- ", mel_postnet_spec.max(), " -- min- ", mel_postnet_spec.min())
    if figures:  
        if mel_spec is not None:
            mel_spec = ap.denormalize(mel_spec.T).T
        fig = visualize(alignment, mel_postnet_spec, text, ap.hop_length, CONFIG, stop_tokens, mel_spec, figsize=[32,16], output_fig=True)                                                                       
    IPython.display.display(Audio(waveform, rate=VOCODER_CONFIG.audio['sample_rate']))  
    os.makedirs(OUT_FOLDER, exist_ok=True)
    if file_name is None:
        file_name = text.replace(" ", "_").replace(".","") + ".wav"
    out_path = os.path.join(OUT_FOLDER, file_name)
    ap.save_wav(waveform, out_path)
    return alignment, mel_postnet_spec, stop_tokens, waveform

In [9]:
MODEL_PATH = '../models_and_weights/checkpoint_130000.pth.tar'
CONFIG_PATH = '../models_and_weights/taco_130k_config.json'

VOCODER_MODEL_PATH = '../models_and_weights/melgan_best_model.pth.tar'
VOCODER_CONFIG_PATH = '../models_and_weights/melgan_config.json'
OUT_FOLDER = '../tests-audios/'

CONFIG = load_config(CONFIG_PATH)
#CONFIG.audio['stats_path'] = 'tts_scale_stats.npy'
VOCODER_CONFIG = load_config(VOCODER_CONFIG_PATH)
#VOCODER_CONFIG.audio['stats_path'] = 'vocoder_scale_stats.npy'


# load the audio processor
ap = AudioProcessor(**CONFIG.audio)   
    
# Run FLAGs
use_cuda = True
# Set some config fields manually for testing
# CONFIG.windowing = False
CONFIG.use_forward_attn = False
# Set the vocoder
use_gl = False # use GL if True

 > Setting up Audio Processor...
 | > sample_rate:16000
 | > resample:False
 | > num_mels:80
 | > min_level_db:-100
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:20
 | > fft_size:1024
 | > power:1.5
 | > preemphasis:0.0
 | > griffin_lim_iters:60
 | > signal_norm:True
 | > symmetric_norm:True
 | > mel_fmin:50.0
 | > mel_fmax:7600.0
 | > spec_gain:20.0
 | > stft_pad_mode:reflect
 | > max_norm:4.0
 | > clip_norm:True
 | > do_trim_silence:True
 | > trim_db:20
 | > do_sound_norm:False
 | > stats_path:None
 | > hop_length:256
 | > win_length:1024


In [4]:
SPEAKER_JSON = '../data/preprocessed_mozilla/speaker.json'
SPEAKER_FILEID = None # if None use the first embedding from speakers.json
if SPEAKER_JSON != '':
    speaker_mapping = json.load(open(SPEAKER_JSON, 'r'))
    num_speakers = len(speaker_mapping)
    if CONFIG.use_external_speaker_embedding_file:
        if SPEAKER_FILEID is not None:
            speaker_embedding = speaker_mapping[SPEAKER_FILEID]['embedding']
        else: # if speaker_fileid is not specificated use the first sample in speakers.json
            choise_speaker = list(speaker_mapping.keys())[0]
            print(" Speaker: ",choise_speaker.split('_')[0],'was chosen automatically', "(this speaker seen in training)")
            speaker_embedding = speaker_mapping[choise_speaker]['embedding']
        speaker_embedding_dim = len(speaker_embedding)
    
if 'characters' in CONFIG.keys():
    symbols, phonemes = make_symbols(**CONFIG.characters)

num_chars = len(phonemes) if CONFIG.use_phonemes else len(symbols)
model = setup_model(num_chars, num_speakers, CONFIG, speaker_embedding_dim)
model, _ =  load_checkpoint(model, MODEL_PATH, use_cuda=use_cuda)
print(count_parameters(model))
model.eval();

 Speaker:  Andrienko was chosen automatically (this speaker seen in training)
 > Using model: Tacotron2
 > Model r:  3
52461108


In [7]:
if use_gl == False:
    vocoder_model = setup_generator(VOCODER_CONFIG)
    cp = torch.load(VOCODER_MODEL_PATH, map_location="cpu")["model"]
    vocoder_model.load_state_dict(cp)
    scale_factor = [1,  VOCODER_CONFIG['audio']['sample_rate'] / ap.sample_rate]
    print(f"scale_factor: {scale_factor}")
    ap_vocoder = AudioProcessor(**VOCODER_CONFIG['audio'])    
    if use_cuda:
        vocoder_model.cuda()
    vocoder_model.eval()
    print(count_parameters(vocoder_model))
    #vocoder_model.compute_noise_level(50, 1e-6, 1e-2)

 > Generator Model: fullband_melgan_generator
scale_factor: [1, 1.5]
 > Setting up Audio Processor...
 | > sample_rate:24000
 | > resample:False
 | > num_mels:80
 | > min_level_db:-100
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:0
 | > fft_size:1024
 | > power:None
 | > preemphasis:0.0
 | > griffin_lim_iters:None
 | > signal_norm:True
 | > symmetric_norm:True
 | > mel_fmin:50.0
 | > mel_fmax:7600.0
 | > spec_gain:20.0
 | > stft_pad_mode:reflect
 | > max_norm:4.0
 | > clip_norm:True
 | > do_trim_silence:True
 | > trim_db:20
 | > do_sound_norm:False
 | > stats_path:None
 | > hop_length:256
 | > win_length:1024
4707586


In [None]:
sentence = "Торт очень вкусный"
align, spec, stop_toens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True, file_name='ls1.wav', speaker_id=None, speaker_embedding=speaker_embedding)



 > before interpolation : (80, 78)
 > after interpolation : torch.Size([1, 80, 117])
(30976,)
 > Run-time: 0.9291961193084717
 > Real-time factor: 0.4799005167543396
 > Time per step: 2.99941748380661e-05
max-  12.853371  -- min-  -79.2413




tɔːt ɒʃən viːkʌsnɪi 
