In [6]:
import os
import torch

from trainer import Trainer, TrainerArgs
from TTS.config.shared_configs import BaseAudioConfig
from TTS.tts.configs.shared_configs import BaseDatasetConfig
from TTS.tts.configs.tacotron2_config import Tacotron2Config
from TTS.tts.datasets import load_tts_samples
from TTS.tts.models.tacotron2 import Tacotron2
from TTS.tts.utils.text.tokenizer import TTSTokenizer
from TTS.utils.audio import AudioProcessor

In [7]:
output_path = "/Users/neil/Code/TTS_GI/outputs"

# init configs

In [11]:
dataset_config = BaseDatasetConfig(
    formatter="ljspeech",
    meta_file_train="metadata.csv",
    path="/Users/neil/Code/TTS_GI/dataset",
)

audio_config = BaseAudioConfig(
    sample_rate=16000,
    do_trim_silence=True,
    trim_db=60.0,
    signal_norm=False,
    mel_fmin=0.0,
    mel_fmax=8000,
    spec_gain=1.0,
    log_func="np.log",
    ref_level_db=20,
    preemphasis=0.0,
)    

config = Tacotron2Config(
    audio=audio_config,
    batch_size=64,
    eval_batch_size=16, 
    num_loader_workers=4,
    run_eval=True,
    test_delay_epochs=-1,
    r=6,
    gradual_training=[[0, 6, 64], [10000, 4, 32], [50000, 3, 32], [100000, 2, 32]],
    double_decoder_consistency=True,
    epochs=1000,
    text_cleaner="chinese_mandarin_cleaners",
    use_phonemes=True,
    phoneme_language="zh-cn",
    phoneme_cache_path=os.path.join(output_path, "phoneme_cache"),
    precompute_num_workers=8,
    print_step=25,
    print_eval=True,
    mixed_precision=False,
    output_path=output_path,
    datasets=[dataset_config],
    test_sentences=[
        "你好，我是一个语音合成模型。",
        "我希望你能够听到我说话。",
        "这是一个测试句子。",
        "我喜欢吃冰淇淋。",
        "我喜欢看电影。",
        "我喜欢小唐。"
    ],
)

# init audio processor

In [12]:
ap = AudioProcessor(**config.audio)
ap = AudioProcessor.init_from_config(config)

tokenizer, config = TTSTokenizer.init_from_config(config)

 > Setting up Audio Processor...
 | > sample_rate:16000
 | > resample:False
 | > num_mels:80
 | > log_func:np.log
 | > min_level_db:-100
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:20
 | > fft_size:1024
 | > power:1.5
 | > preemphasis:0.0
 | > griffin_lim_iters:60
 | > signal_norm:False
 | > symmetric_norm:True
 | > mel_fmin:0
 | > mel_fmax:8000
 | > pitch_fmin:1.0
 | > pitch_fmax:640.0
 | > spec_gain:1.0
 | > stft_pad_mode:reflect
 | > max_norm:4.0
 | > clip_norm:True
 | > do_trim_silence:True
 | > trim_db:60.0
 | > do_sound_norm:False
 | > do_amp_to_db_linear:True
 | > do_amp_to_db_mel:True
 | > do_rms_norm:False
 | > db_level:None
 | > stats_path:None
 | > base:2.718281828459045
 | > hop_length:256
 | > win_length:1024
 > Setting up Audio Processor...
 | > sample_rate:16000
 | > resample:False
 | > num_mels:80
 | > log_func:np.log
 | > min_level_db:-100
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:20
 | > fft_size:1024
 | > power:1.5


# Load Dataset

In [13]:
train_samples, val_samples = load_tts_samples(
    dataset_config, 
    eval_split=0.3,
    eval_split_max_size=config.eval_split_max_size,
    eval_split_size=config.eval_split_size,
)

print(f"Train samples: {len(train_samples)}")
print(f"Val samples: {len(val_samples)}")
print(f"Train Sample: {train_samples[5]}")
print(f"Val Sample: {val_samples[5]}")

 | > Found 697 files in /Users/neil/Code/TTS_GI/dataset
Train samples: 691
Val samples: 6
Train Sample: {'text': '木南小姐，你好。\n', 'audio_file': '/Users/neil/Code/TTS_GI/dataset/wavs/神里绫华_train_390.wav', 'speaker_name': 'ljspeech', 'root_path': '/Users/neil/Code/TTS_GI/dataset', 'language': '', 'audio_unique_name': '#wavs/神里绫华_train_390'}
Val Sample: {'text': '风花雪月，虽然作为我们熟知的事物存在于世界之中，但也应有属于它们自己的「感情」。\n', 'audio_file': '/Users/neil/Code/TTS_GI/dataset/wavs/神里绫华_train_31.wav', 'speaker_name': 'ljspeech', 'root_path': '/Users/neil/Code/TTS_GI/dataset', 'language': '', 'audio_unique_name': '#wavs/神里绫华_train_31'}


In [14]:
pretrained_model_path = "/Users/neil/Code/TTS_GI/outputs/pretrain_models/model_file.pth"
model = Tacotron2(config, ap, tokenizer)
model.load_state_dict(torch.load(pretrained_model_path, map_location=torch.device("mps")), strict=False)

trainer = Trainer(
    TrainerArgs(), 
    config, 
    output_path, 
    model=model,
    train_samples=train_samples,
    eval_samples=val_samples
)

trainer.fit()

 > Training Environment:
 | > Backend: Torch
 | > Mixed precision: False
 | > Precision: float32
 | > Num. of CPUs: 10
 | > Num. of Torch Threads: 10
 | > Torch seed: 54321
 | > Torch CUDNN: True
 | > Torch CUDNN deterministic: False
 | > Torch CUDNN benchmark: False
 | > Torch TF32 MatMul: False
 > Start Tensorboard: tensorboard --logdir=/Users/neil/Code/TTS_GI/outputs/run-April-24-2024_02+26AM-30411c6
  from .autonotebook import tqdm as notebook_tqdm

 > Model has 47669492 parameters

[4m[1m > EPOCH: 0/1000[0m
 --> /Users/neil/Code/TTS_GI/outputs/run-April-24-2024_02+26AM-30411c6



 > Number of output frames: 6
[*] Pre-computing phonemes...


  0%|          | 0/691 [00:00<?, ?it/s]Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/g_/7bh1l5yd0717vlgf8v9b0j3h0000gn/T/jieba.cache
Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/g_/7bh1l5yd0717vlgf8v9b0j3h0000gn/T/jieba.cache
Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/g_/7bh1l5yd0717vlgf8v9b0j3h0000gn/T/jieba.cache
Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/g_/7bh1l5yd0717vlgf8v9b0j3h0000gn/T/jieba.cache
Loading model cost 0.521 seconds.
Prefix dict has been built successfully.
Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/g_/7bh1l5yd0717vlgf8v9b0j3h0000gn/T/jieba.cache
Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/g_/7bh1l5yd0717vlgf8v9b0j3h0000gn/T/jieba.cache
Building prefix dict from the default dictionary ...

dʒʏ3ʂʏ4 ʈʂu3li3 i1ɕie1 ɕiaʌ3ʂʏ4 ， ma3ʂɑŋ4 dʑio4 lai2 。
 [!] Character '3' not found in the vocabulary. Discarding it.
dʒʏ3ʂʏ4 ʈʂu3li3 i1ɕie1 ɕiaʌ3ʂʏ4 ， ma3ʂɑŋ4 dʑio4 lai2 。
 [!] Character '4' not found in the vocabulary. Discarding it.
dʒʏ3ʂʏ4 ʈʂu3li3 i1ɕie1 ɕiaʌ3ʂʏ4 ， ma3ʂɑŋ4 dʑio4 lai2 。
 [!] Character '1' not found in the vocabulary. Discarding it.
dʒʏ3ʂʏ4 ʈʂu3li3 i1ɕie1 ɕiaʌ3ʂʏ4 ， ma3ʂɑŋ4 dʑio4 lai2 。
 [!] Character '，' not found in the vocabulary. Discarding it.
dʒʏ3ʂʏ4 ʈʂu3li3 i1ɕie1 ɕiaʌ3ʂʏ4 ， ma3ʂɑŋ4 dʑio4 lai2 。
 [!] Character '2' not found in the vocabulary. Discarding it.
dʒʏ3ʂʏ4 ʈʂu3li3 i1ɕie1 ɕiaʌ3ʂʏ4 ， ma3ʂɑŋ4 dʑio4 lai2 。
 [!] Character '。' not found in the vocabulary. Discarding it.
bu4guo4 ， ʈʂʏ4 ʐœn2 dø5 ʂʏ1dʑi2 dʑi4ʐan2 ʂʏ4 iaʌ4 ʈʂɵŋ2 gei3 dʑiɑŋ1dʑyn1 pin3dʑiɛn4 dø5 dʒoŋ4iaʌ4 wu4pin3 ， kœn3dɨŋ4 xuei4 tsai3ioŋ4 liɑŋ3dʒoŋ3 ban4fa3 dʒʏ1i1 tsai2 duei4 。
 [!] Character 'g' not found in the vocabulary. Discarding it.
bu4guo4 ， ʈʂʏ4 ʐœn2 dø5 ʂʏ1dʑi2 dʑi4ʐan2 ʂʏ4 iaʌ4 ʈʂɵŋ2 

Loading model cost 0.513 seconds.
Prefix dict has been built successfully.
Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/g_/7bh1l5yd0717vlgf8v9b0j3h0000gn/T/jieba.cache
Loading model cost 0.484 seconds.
Prefix dict has been built successfully.


y3 wo3 duei4dʑy2 ʂʏ2 ， ni3 nan2daʌ4 fœn1ɕin1 lø5 ma5 ？
 [!] Character '3' not found in the vocabulary. Discarding it.
y3 wo3 duei4dʑy2 ʂʏ2 ， ni3 nan2daʌ4 fœn1ɕin1 lø5 ma5 ？
 [!] Character '4' not found in the vocabulary. Discarding it.
y3 wo3 duei4dʑy2 ʂʏ2 ， ni3 nan2daʌ4 fœn1ɕin1 lø5 ma5 ？
 [!] Character '2' not found in the vocabulary. Discarding it.
y3 wo3 duei4dʑy2 ʂʏ2 ， ni3 nan2daʌ4 fœn1ɕin1 lø5 ma5 ？
 [!] Character '，' not found in the vocabulary. Discarding it.
y3 wo3 duei4dʑy2 ʂʏ2 ， ni3 nan2daʌ4 fœn1ɕin1 lø5 ma5 ？
 [!] Character '1' not found in the vocabulary. Discarding it.
y3 wo3 duei4dʑy2 ʂʏ2 ， ni3 nan2daʌ4 fœn1ɕin1 lø5 ma5 ？
 [!] Character '5' not found in the vocabulary. Discarding it.
y3 wo3 duei4dʑy2 ʂʏ2 ， ni3 nan2daʌ4 fœn1ɕin1 lø5 ma5 ？
 [!] Character '？' not found in the vocabulary. Discarding it.
dʒʏ1tɕiɛn2 wo3 tɨŋ1 ɕioŋ1dʒɑŋ3 ti2tɕi3guo4 ，   lei2diɛn4 wu3 ʈʂuan2   dø5 mo4luo4 fei1ʈʂɑŋ2 kuai4 ， dzai4 duan3duan3 ʂu4ʂʏ2niɛn2 li3 ， dʑio4 io3 san1dʑia1 dzaʌ1y4 dʒoŋ3dʒoŋ3 

Loading model cost 0.467 seconds.
Prefix dict has been built successfully.
Loading model cost 0.460 seconds.
Prefix dict has been built successfully.
Loading model cost 0.421 seconds.
Prefix dict has been built successfully.
  1%|          | 8/691 [00:06<09:26,  1.21it/s]Loading model cost 0.425 seconds.
Prefix dict has been built successfully.


… kø3i3 ma5 ？
 [!] Character '…' not found in the vocabulary. Discarding it.
… kø3i3 ma5 ？
 [!] Character '3' not found in the vocabulary. Discarding it.
… kø3i3 ma5 ？
 [!] Character '5' not found in the vocabulary. Discarding it.
… kø3i3 ma5 ？
 [!] Character '？' not found in the vocabulary. Discarding it.
mei3dɑŋ1 wo3 y4daʌ4 ma2fan2ʂʏ4 ， dzoŋ3ʂʏ4 xuei4 ɕiɑŋ3 ： mu3tɕin1 dɑŋ1niɛn2 ʂʏ4fou3 ie3 dʒø4iɑŋ4 dzuo4 guo4 ？ dʒø4iɑŋ4 dø5 ʂʏ4 ， ta1 xuei4 dzœn3mø5 ʈʂu3li3 nø5 ？
 [!] Character '1' not found in the vocabulary. Discarding it.
mei3dɑŋ1 wo3 y4daʌ4 ma2fan2ʂʏ4 ， dzoŋ3ʂʏ4 xuei4 ɕiɑŋ3 ： mu3tɕin1 dɑŋ1niɛn2 ʂʏ4fou3 ie3 dʒø4iɑŋ4 dzuo4 guo4 ？ dʒø4iɑŋ4 dø5 ʂʏ4 ， ta1 xuei4 dzœn3mø5 ʈʂu3li3 nø5 ？
 [!] Character '4' not found in the vocabulary. Discarding it.
mei3dɑŋ1 wo3 y4daʌ4 ma2fan2ʂʏ4 ， dzoŋ3ʂʏ4 xuei4 ɕiɑŋ3 ： mu3tɕin1 dɑŋ1niɛn2 ʂʏ4fou3 ie3 dʒø4iɑŋ4 dzuo4 guo4 ？ dʒø4iɑŋ4 dø5 ʂʏ4 ， ta1 xuei4 dzœn3mø5 ʈʂu3li3 nø5 ？
 [!] Character '2' not found in the vocabulary. Discarding it.
mei3dɑŋ1 wo3 y4daʌ4 

Loading model cost 0.382 seconds.
Prefix dict has been built successfully.
  5%|▍         | 32/691 [00:06<01:40,  6.58it/s]

gø2ɕia4 wu4bi4 ɕiaʌ3ɕin1 。
 [!] Character 'g' not found in the vocabulary. Discarding it.
gø2ɕia4 wu4bi4 ɕiaʌ3ɕin1 。
 [!] Character '2' not found in the vocabulary. Discarding it.
gø2ɕia4 wu4bi4 ɕiaʌ3ɕin1 。
 [!] Character '4' not found in the vocabulary. Discarding it.
gø2ɕia4 wu4bi4 ɕiaʌ3ɕin1 。
 [!] Character '3' not found in the vocabulary. Discarding it.
gø2ɕia4 wu4bi4 ɕiaʌ3ɕin1 。
 [!] Character '1' not found in the vocabulary. Discarding it.
gø2ɕia4 wu4bi4 ɕiaʌ3ɕin1 。
 [!] Character '。' not found in the vocabulary. Discarding it.
in1wei4 ʂʏ4 ni3 dø5 dʑy1suo3 ， i4i4 dʑio4 gɵŋ4dʑia1 bu4toŋ2 。
 [!] Character '5' not found in the vocabulary. Discarding it.
in1wei4 ʂʏ4 ni3 dø5 dʑy1suo3 ， i4i4 dʑio4 gɵŋ4dʑia1 bu4toŋ2 。
 [!] Character '，' not found in the vocabulary. Discarding it.
ie3 dʒʏ3nɵŋ2 ɕiɑŋ1ɕin4 ba1dʒoŋ4 goŋ1 sɪ1 lø5 。 ta1 y3 dʑiɑŋ1dʑyn1 guan1ɕi4mi4tɕie4 ， xuo4ɕy3 xuei4 io3 ban4fa3 …
 [!] Character '…' not found in the vocabulary. Discarding it.
na4mø5 wo3mœn5 ɕiɛn4dzai4 guo4tɕy4

  9%|▉         | 64/691 [00:07<00:33, 18.61it/s]

wo3 ɕiɑŋ3 … wo3 ɕiɑŋ3 tɕɨŋ3 ni3 xø2 wo3 i4tɕi3 tɕy4 dʑi4diɛn3 ！
 [!] Character '！' not found in the vocabulary. Discarding it.
# { NICKNAME }   pai4 mɵŋ2 ， ni3mœn5xaʌ3 。
 [!] Character '#' not found in the vocabulary. Discarding it.
# { NICKNAME }   pai4 mɵŋ2 ， ni3mœn5xaʌ3 。
 [!] Character '{' not found in the vocabulary. Discarding it.
# { NICKNAME }   pai4 mɵŋ2 ， ni3mœn5xaʌ3 。
 [!] Character 'N' not found in the vocabulary. Discarding it.
# { NICKNAME }   pai4 mɵŋ2 ， ni3mœn5xaʌ3 。
 [!] Character 'I' not found in the vocabulary. Discarding it.
# { NICKNAME }   pai4 mɵŋ2 ， ni3mœn5xaʌ3 。
 [!] Character 'C' not found in the vocabulary. Discarding it.
# { NICKNAME }   pai4 mɵŋ2 ， ni3mœn5xaʌ3 。
 [!] Character 'K' not found in the vocabulary. Discarding it.
# { NICKNAME }   pai4 mɵŋ2 ， ni3mœn5xaʌ3 。
 [!] Character 'A' not found in the vocabulary. Discarding it.
# { NICKNAME }   pai4 mɵŋ2 ， ni3mœn5xaʌ3 。
 [!] Character 'M' not found in the vocabulary. Discarding it.
# { NICKNAME }   pai4 mɵŋ

696it [00:47, 14.76it/s]                        

[1m > TRAINING (2024-04-24 02:27:16) [0m




> DataLoader initialization
| > Tokenizer:
	| > add_blank: False
	| > use_eos_bos: False
	| > use_phonemes: True
	| > phonemizer:
		| > phoneme language: zh-cn
		| > phoneme backend: zh_cn_phonemizer
| > Number of instances : 691
 | > Preprocessing samples
 | > Max text length: 161
 | > Min text length: 3
 | > Avg text length: 25.088277858176557
 | 
 | > Max audio length: 564374.0
 | > Min audio length: 6358.0
 | > Avg audio length: 86968.29232995659
 | > Num. instances discarded samples: 0
 | > Batch group size: 0.


  return _VF.meshgrid(tensors, **kwargs)  # type: ignore[attr-defined]

[1m   --> TIME: 2024-04-24 02:28:31 -- STEP: 0/11 -- GLOBAL_STEP: 0[0m
     | > decoder_loss: 23.3160400390625  (23.3160400390625)
     | > postnet_loss: 25.45894432067871  (25.45894432067871)
     | > stopnet_loss: 0.6659126281738281  (0.6659126281738281)
     | > decoder_coarse_loss: 23.277170181274414  (23.277170181274414)
     | > decoder_ddc_loss: 0.008952152915298939  (0.008952152915298939)
     | > ga_loss: 0.03137047216296196  (0.03137047216296196)
     | > decoder_diff_spec_loss: 0.5367538332939148  (0.5367538332939148)
     | > postnet_diff_spec_loss: 4.93181037902832  (4.93181037902832)
     | > decoder_ssim_loss: 0.7220990657806396  (0.7220990657806396)
     | > postnet_ssim_loss: 0.7005515694618225  (0.7005515694618225)
     | > loss: 20.56084632873535  (20.56084632873535)
     | > align_error: 0.9481816925108433  (0.9481816925108433)
     | > grad_norm: tensor(4.2434)  (tensor(4.2434))
     | > curr

