In [3]:
import os

from trainer import Trainer, TrainerArgs

from TTS.tts.configs.shared_configs import BaseDatasetConfig
from TTS.tts.configs.vits_config import VitsConfig
from TTS.tts.datasets import load_tts_samples
from TTS.tts.models.vits import Vits, VitsAudioConfig
from TTS.tts.utils.text.tokenizer import TTSTokenizer
from TTS.utils.audio import AudioProcessor

output_path = "/Users/neil/Code/TTS_GI/outputs"

dataset_config = BaseDatasetConfig(
    formatter="ljspeech",
    meta_file_train="metadata.csv",
    path="/Users/neil/Code/TTS_GI/dataset",
)

audio_config = VitsAudioConfig(
    sample_rate=16000,
    win_length=1024,
    hop_length=256,
    num_mels=80,
    mel_fmin=0.0,
    mel_fmax=None,
)

config = VitsConfig(
    audio=audio_config,
    run_name="VITS_Kamisato_Ayaka",
    run_description="VITS Kamisato Ayaka / Genshin Impact",
    batch_size=16,
    eval_batch_size=16,
    batch_group_size=1,
    num_loader_workers=8,
    num_eval_loader_workers=8,
    run_eval=True,
    test_delay_epochs=-1,
    epochs=1,
    text_cleaner="chinese_cleaners",
    use_phonemes=True,
    phoneme_language="zn-ch",
    phoneme_cache_path=None,
    compute_input_seq_cache=True,
    print_step=1,
    print_eval=True,
    mixed_precision=True,
    output_path=output_path,
    datasets=[dataset_config],
    cudnn_benchmark=True,
    use_noise_augment=True,
    test_sentences=[
        ["你好啊，小唐，你今天过的开心么？"],
        ["好久没有联系啦，你最近还好吗？"],
        ["我最近在学习新的技术，感觉很有趣。"],
    ],
)

print(config)

VitsConfig(output_path='/Users/neil/Code/TTS_GI/outputs', logger_uri=None, run_name='VITS_Kamisato_Ayaka', project_name=None, run_description='VITS Kamisato Ayaka / Genshin Impact', print_step=1, plot_step=100, model_param_stats=False, wandb_entity=None, dashboard_logger='tensorboard', save_on_interrupt=True, log_model_step=None, save_step=10000, save_n_checkpoints=5, save_checkpoints=True, save_all_best=False, save_best_after=0, target_loss=None, print_eval=True, test_delay_epochs=-1, run_eval=True, run_eval_steps=None, distributed_backend='nccl', distributed_url='tcp://localhost:54321', mixed_precision=True, precision='fp16', epochs=1, batch_size=16, eval_batch_size=16, grad_clip=[1000, 1000], scheduler_after_epoch=True, lr=0.001, optimizer='AdamW', optimizer_params={'betas': [0.8, 0.99], 'eps': 1e-09, 'weight_decay': 0.01}, lr_scheduler=None, lr_scheduler_params={}, use_grad_scaler=False, allow_tf32=False, cudnn_enable=True, cudnn_deterministic=False, cudnn_benchmark=True, training_

## Model Test

In [None]:
from TTS.utils.synthesizer import Synthesizer
import soundfile as sf

model_path = "best_model.pth"
config_path = "config.json"

synthesizer =  Synthesizer(model_path, config_path)

text = "今天也是喜欢小唐的一天奥。"

wav = synthesizer.tts(text)
sf.write("output.wav", wav, 16000)

## Terminal Generate Test

In [None]:
! tts --text "今天也是喜欢小唐的一天。" \
    --model_name "tts_models/zh-CN/baker/tacotron2-DDC-GST" \
    --out_path "test_outputs_termial.wav"

## Python Generate Test

In [2]:
from TTS.utils.synthesizer import Synthesizer
import soundfile as sf

model_path = "model_file.pth"
config_path = "config copy.json"

synthesizer =  Synthesizer(model_path, config_path)

text = "今天也是喜欢小唐的一天。"

wav = synthesizer.tts(text)
sf.write("test_outputs_python.wav", wav, 16000)

 > Using model: tacotron2
 > Setting up Audio Processor...
 | > sample_rate:22050
 | > resample:False
 | > num_mels:80
 | > log_func:np.log10
 | > min_level_db:-100
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:0
 | > fft_size:1024
 | > power:1.5
 | > preemphasis:0.0
 | > griffin_lim_iters:60
 | > signal_norm:True
 | > symmetric_norm:True
 | > mel_fmin:50.0
 | > mel_fmax:7600.0
 | > pitch_fmin:0.0
 | > pitch_fmax:640.0
 | > spec_gain:1.0
 | > stft_pad_mode:reflect
 | > max_norm:4.0
 | > clip_norm:True
 | > do_trim_silence:True
 | > trim_db:60
 | > do_sound_norm:False
 | > do_amp_to_db_linear:True
 | > do_amp_to_db_mel:True
 | > do_rms_norm:False
 | > db_level:None
 | > stats_path:/Users/neil/Library/Application Support/tts/tts_models--zh-CN--baker--tacotron2-DDC-GST/scale_stats.npy
 | > base:10
 | > hop_length:256
 | > win_length:1024
 > Model's reduction rate `r` is set to: 2
 > Text splitted to sentences.
['今天也是喜欢小唐的一天。']
 > Processing time: 0.6954970359802246


In [9]:
import torch
from TTS.api import TTS

#tts = TTS(model_name="tts_models/zh-CN/baker/tacotron2-DDC-GST", progress_bar=True)
tts = TTS(model_path="model_file.pth", config_path="config copy.json", progress_bar=True, gpu=False)
tts.tts_to_file(text="今天也是喜欢小唐的一天。", file_path="test_outputs_python_api.wav")

 > Using model: tacotron2
 > Setting up Audio Processor...
 | > sample_rate:22050
 | > resample:False
 | > num_mels:80
 | > log_func:np.log10
 | > min_level_db:-100
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:0
 | > fft_size:1024
 | > power:1.5
 | > preemphasis:0.0
 | > griffin_lim_iters:60
 | > signal_norm:True
 | > symmetric_norm:True
 | > mel_fmin:50.0
 | > mel_fmax:7600.0
 | > pitch_fmin:0.0
 | > pitch_fmax:640.0
 | > spec_gain:1.0
 | > stft_pad_mode:reflect
 | > max_norm:4.0
 | > clip_norm:True
 | > do_trim_silence:True
 | > trim_db:60
 | > do_sound_norm:False
 | > do_amp_to_db_linear:True
 | > do_amp_to_db_mel:True
 | > do_rms_norm:False
 | > db_level:None
 | > stats_path:/Users/neil/Library/Application Support/tts/tts_models--zh-CN--baker--tacotron2-DDC-GST/scale_stats.npy
 | > base:10
 | > hop_length:256
 | > win_length:1024
 > Model's reduction rate `r` is set to: 2


AttributeError: 'TTS' object has no attribute 'is_multi_lingual'