In [None]:
!pip install TTS

In [None]:
import os

In [None]:
# BaseDatasetConfig: defines name, formatter and path of the dataset.
from TTS.tts.configs.shared_configs import BaseDatasetConfig

`BaseDatasetConfig` is a configuration template that holds the dataset-related settings (name, formatter, and path) used in text-to-speech (TTS) applications.

In [None]:
# Directory that receives the downloaded archive, the phoneme cache and the
# training run folders.
output_path = "tts_train_dir"
# exist_ok=True keeps the cell safe to re-run and avoids the check-then-create
# race of testing os.path.exists() first.
os.makedirs(output_path, exist_ok=True)

In [None]:
!wget -O tts_train_dir/LJSpeech-1.1.tar.bz2 https://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2 

In [None]:
!tar -xf tts_train_dir/LJSpeech-1.1.tar.bz2

In [None]:
# Describe the dataset: the "ljspeech" formatter is used with `metadata.csv`
# as the training metadata file.
# The `tar` cell unpacks the archive into the current working directory, so the
# dataset folder is resolved against it instead of hard-coding Colab's
# `/content` (on Colab this still evaluates to `/content/LJSpeech-1.1`).
dataset_config = BaseDatasetConfig(
    formatter="ljspeech",
    meta_file_train="metadata.csv",
    path=os.path.abspath("LJSpeech-1.1"),
)

In [None]:
# GlowTTSConfig bundles every model-related setting used for training,
# validation and testing.
from TTS.tts.configs.glow_tts_config import GlowTTSConfig

config = GlowTTSConfig(
    # --- data and outputs ---
    datasets=[dataset_config],
    output_path=output_path,
    # --- text front-end: clean the text, then work on phonemes ---
    text_cleaner="phoneme_cleaners",
    use_phonemes=True,
    phoneme_language="en-us",
    phoneme_cache_path=os.path.join(output_path, "phoneme_cache"),
    # --- batching and data loading ---
    batch_size=32,
    eval_batch_size=16,
    num_loader_workers=4,
    num_eval_loader_workers=4,
    # --- schedule ---
    epochs=100,
    run_eval=True,
    # -1 means there is no number of epochs to wait for: testing is performed
    # without any delay, after each epoch of training.
    test_delay_epochs=-1,
    mixed_precision=True,
    # --- logging and checkpointing ---
    print_step=25,
    print_eval=False,
    save_step=1000,
)

Note: A text cleaner is responsible for performing various text normalization operations to convert the raw input text into a cleaner representation suitable for TTS processing. The "phoneme_cleaners" text cleaner is the cleaning pipeline intended for phoneme-based training: it normalizes the text (for example, by expanding numbers and abbreviations and collapsing whitespace) before the text is converted into phonemes, which are the smallest units of sound in a language.

By using the "phoneme_cleaners" text cleaner together with `use_phonemes=True`, the cleaned text is phonemized and the TTS model works with phoneme sequences instead of raw text, enabling it to generate speech that corresponds to the phonetic representation of the input text.

Now, we will initialize the audio processor using the `AudioProcessor` class, which is responsible for extracting features from audio and performing audio I/O operations.

In [None]:
# Build the audio processor from `config`; it is handed to the model later on.
from TTS.utils.audio import AudioProcessor
ap = AudioProcessor.init_from_config(config)

Next, we will initialize the tokenizer, which is used to convert text to sequences of token IDs. If characters are not defined in the config, default characters are passed to the config.

In [None]:
# `init_from_config` returns the tokenizer together with a config object; the
# returned config replaces `config`, so later cells use that version.
from TTS.tts.utils.text.tokenizer import TTSTokenizer
tokenizer, config = TTSTokenizer.init_from_config(config)

Next we will load data samples. Each sample is a list of [text, audio_file_path, speaker_name].

In [None]:
from TTS.tts.datasets import load_tts_samples

# Load the dataset and split it into training and evaluation samples; the
# split size and its upper bound are taken from `config`.
train_samples, eval_samples = load_tts_samples(
    dataset_config,
    eval_split_size=config.eval_split_size,
    eval_split_max_size=config.eval_split_max_size,
    eval_split=True,
)

In [None]:
# Instantiate GlowTTS from the config, audio processor and tokenizer created in
# the previous cells. No speaker manager is supplied (speaker_manager=None).
from TTS.tts.models.glow_tts import GlowTTS
model = GlowTTS(config, ap, tokenizer, speaker_manager=None)

In [None]:
from trainer import Trainer, TrainerArgs

# Wire the model, its config and both sample sets into a trainer that uses
# default TrainerArgs; `output_path` is where the run folders are written.
trainer = Trainer(
    TrainerArgs(),
    config,
    output_path,
    model=model,
    eval_samples=eval_samples,
    train_samples=train_samples,
)

In [None]:
# Start training. Progress, evaluation results and the paths of saved
# checkpoints are printed in the cell output.
trainer.fit()


[4m[1m > EPOCH: 0/100[0m
 --> tts_train_dir/run-May-16-2023_05+33PM-0000000


[*] Pre-computing phonemes...


  0%|          | 4/12969 [00:00<37:50,  5.71it/s]

ɪnstɛd əv weɪtɪŋ ðɛɹ, ɔzwɔld əpɛɹəntli wɛnt æz fɑɹ əweɪ æz hi kʊd ænd bɔɹdɪd ðə fɚst oʊk klɪf bʌs wɪt͡ʃ keɪm əlɔŋ
 [!] Character '͡' not found in the vocabulary. Discarding it.


 16%|█▌        | 2059/12969 [01:20<05:06, 35.57it/s]

ɪntu ðə “kɹeɪtɚ” dʌɡ aʊt ɪn ðə mɪdəl, pɔɹ ðə spʌnd͡ʒ, wɔɹm wɔtɚ, ðə məlæsɪz, ænd soʊdə dɪzɑlvd ɪn hɑt wɔtɚ.
 [!] Character '“' not found in the vocabulary. Discarding it.
ɪntu ðə “kɹeɪtɚ” dʌɡ aʊt ɪn ðə mɪdəl, pɔɹ ðə spʌnd͡ʒ, wɔɹm wɔtɚ, ðə məlæsɪz, ænd soʊdə dɪzɑlvd ɪn hɑt wɔtɚ.
 [!] Character '”' not found in the vocabulary. Discarding it.


100%|██████████| 12969/12969 [05:02<00:00, 42.84it/s]

[1m > TRAINING (2023-05-16 17:38:35) [0m




> DataLoader initialization
| > Tokenizer:
	| > add_blank: False
	| > use_eos_bos: False
	| > use_phonemes: True
	| > phonemizer:
		| > phoneme language: en-us
		| > phoneme backend: gruut
	| > 3 not found characters:
	| > ͡
	| > “
	| > ”
| > Number of instances : 12969
 | > Preprocessing samples
 | > Max text length: 188
 | > Min text length: 13
 | > Avg text length: 100.90014650319993
 | 
 | > Max audio length: 222643.0
 | > Min audio length: 24499.0
 | > Avg audio length: 144984.29755570978
 | > Num. instances discarded samples: 0
 | > Batch group size: 0.



[1m   --> STEP: 0/406 -- GLOBAL_STEP: 0[0m
     | > current_lr: 0.00000 
     | > step_time: 11.77090  (11.77088)
     | > loader_time: 3.29410  (3.29406)


[1m   --> STEP: 25/406 -- GLOBAL_STEP: 25[0m
     | > loss: 3.65408  (3.52588)
     | > log_mle: 0.78891  (0.78805)
     | > loss_dur: 2.86517  (2.73782)
     | > amp_scaler: 16384.00000  (16384.00000)
     | > grad_norm: 10.31936  (9.37831)
     | > current_lr: 0.00000 
     | > step_time: 0.62110  (0.70223)
     | > loader_time: 0.00430  (0.01039)


[1m   --> STEP: 50/406 -- GLOBAL_STEP: 50[0m
     | > loss: 3.60657  (3.52123)
     | > log_mle: 0.78712  (0.78951)
     | > loss_dur: 2.81946  (2.73172)
     | > amp_scaler: 16384.00000  (16384.00000)
     | > grad_norm: 10.23356  (9.80654)
     | > current_lr: 0.00000 
     | > step_time: 0.66880  (0.73822)
     | > loader_time: 0.00390  (0.01093)


[1m   --> STEP: 75/406 -- GLOBAL_STEP: 75[0m
     | > loss: 3.61446  (3.52505)
     | > log_mle: 0.79578  (0.78966)
     | > l



> DataLoader initialization
| > Tokenizer:
	| > add_blank: False
	| > use_eos_bos: False
	| > use_phonemes: True
	| > phonemizer:
		| > phoneme language: en-us
		| > phoneme backend: gruut
	| > 3 not found characters:
	| > ͡
	| > “
	| > ”
| > Number of instances : 131
 | > Preprocessing samples
 | > Max text length: 174
 | > Min text length: 20
 | > Avg text length: 100.76335877862596
 | 
 | > Max audio length: 222643.0
 | > Min audio length: 34739.0
 | > Avg audio length: 144033.41221374046
 | > Num. instances discarded samples: 0
 | > Batch group size: 0.
 | > Synthesizing test sentences.



  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time: 0.00918 [0m(+0.00000)
     | > avg_loss: 3.45273 [0m(+0.00000)
     | > avg_log_mle: 0.78346 [0m(+0.00000)
     | > avg_loss_dur: 2.66927 [0m(+0.00000)

 > BEST MODEL : tts_train_dir/run-May-16-2023_05+33PM-0000000/best_model_406.pth

[4m[1m > EPOCH: 1/100[0m
 --> tts_train_dir/run-May-16-2023_05+33PM-0000000

[1m > TRAINING (2023-05-16 17:47:36) [0m




> DataLoader initialization
| > Tokenizer:
	| > add_blank: False
	| > use_eos_bos: False
	| > use_phonemes: True
	| > phonemizer:
		| > phoneme language: en-us
		| > phoneme backend: gruut
	| > 3 not found characters:
	| > ͡
	| > “
	| > ”
| > Number of instances : 12969
 | > Preprocessing samples
 | > Max text length: 188
 | > Min text length: 13
 | > Avg text length: 100.90014650319993
 | 
 | > Max audio length: 222643.0
 | > Min audio length: 24499.0
 | > Avg audio length: 144984.29755570978
 | > Num. instances discarded samples: 0
 | > Batch group size: 0.



[1m   --> STEP: 19/406 -- GLOBAL_STEP: 425[0m
     | > loss: 3.44811  (3.48777)
     | > log_mle: 0.78456  (0.78144)
     | > loss_dur: 2.66355  (2.70633)
     | > amp_scaler: 16384.00000  (16384.00000)
     | > grad_norm: 9.44169  (9.39360)
     | > current_lr: 0.00000 
     | > step_time: 0.51060  (0.66665)
     | > loader_time: 0.00330  (0.00899)


[1m   --> STEP: 44/406 -- GLOBAL_STEP: 450[0m
     | > loss: 3.39889  (3.44217)
     | > log_mle: 0.78628  (0.78342)
     | > loss_dur: 2.61261  (2.65875)
     | > amp_scaler: 16384.00000  (16384.00000)
     | > grad_norm: 9.31497  (9.33772)
     | > current_lr: 0.00000 
     | > step_time: 0.55360  (0.68935)
     | > loader_time: 0.02040  (0.01189)


[1m   --> STEP: 69/406 -- GLOBAL_STEP: 475[0m
     | > loss: 3.47735  (3.43759)
     | > log_mle: 0.79010  (0.78373)
     | > loss_dur: 2.68725  (2.65386)
     | > amp_scaler: 16384.00000  (16384.00000)
     | > grad_norm: 9.40473  (9.32960)
     | > current_lr: 0.00000 
     | > step



> DataLoader initialization
| > Tokenizer:
	| > add_blank: False
	| > use_eos_bos: False
	| > use_phonemes: True
	| > phonemizer:
		| > phoneme language: en-us
		| > phoneme backend: gruut
	| > 3 not found characters:
	| > ͡
	| > “
	| > ”
| > Number of instances : 131
 | > Preprocessing samples
 | > Max text length: 174
 | > Min text length: 20
 | > Avg text length: 100.76335877862596
 | 
 | > Max audio length: 222643.0
 | > Min audio length: 34739.0
 | > Avg audio length: 144033.41221374046
 | > Num. instances discarded samples: 0
 | > Batch group size: 0.
 | > Synthesizing test sentences.



  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[92m 0.00888 [0m(-0.00030)
     | > avg_loss:[92m 3.31518 [0m(-0.13755)
     | > avg_log_mle:[92m 0.77241 [0m(-0.01105)
     | > avg_loss_dur:[92m 2.54277 [0m(-0.12650)

 > BEST MODEL : tts_train_dir/run-May-16-2023_05+33PM-0000000/best_model_812.pth

[4m[1m > EPOCH: 2/100[0m
 --> tts_train_dir/run-May-16-2023_05+33PM-0000000

[1m > TRAINING (2023-05-16 17:55:55) [0m




> DataLoader initialization
| > Tokenizer:
	| > add_blank: False
	| > use_eos_bos: False
	| > use_phonemes: True
	| > phonemizer:
		| > phoneme language: en-us
		| > phoneme backend: gruut
	| > 3 not found characters:
	| > ͡
	| > “
	| > ”
| > Number of instances : 12969
 | > Preprocessing samples
 | > Max text length: 188
 | > Min text length: 13
 | > Avg text length: 100.90014650319993
 | 
 | > Max audio length: 222643.0
 | > Min audio length: 24499.0
 | > Avg audio length: 144984.29755570978
 | > Num. instances discarded samples: 0
 | > Batch group size: 0.



[1m   --> STEP: 13/406 -- GLOBAL_STEP: 825[0m
     | > loss: 3.23999  (3.36253)
     | > log_mle: 0.76409  (0.77083)
     | > loss_dur: 2.47590  (2.59170)
     | > amp_scaler: 16384.00000  (16384.00000)
     | > grad_norm: 8.02373  (8.27000)
     | > current_lr: 0.00000 
     | > step_time: 0.57430  (0.65838)
     | > loader_time: 0.00290  (0.01055)


[1m   --> STEP: 38/406 -- GLOBAL_STEP: 850[0m
     | > loss: 3.30448  (3.32271)
     | > log_mle: 0.77304  (0.77250)
     | > loss_dur: 2.53144  (2.55021)
     | > amp_scaler: 16384.00000  (16384.00000)
     | > grad_norm: 8.07801  (8.19287)
     | > current_lr: 0.00000 
     | > step_time: 0.62800  (0.67750)
     | > loader_time: 0.00380  (0.01049)


[1m   --> STEP: 63/406 -- GLOBAL_STEP: 875[0m
     | > loss: 3.30229  (3.30184)
     | > log_mle: 0.76808  (0.77234)
     | > loss_dur: 2.53421  (2.52950)
     | > amp_scaler: 16384.00000  (16384.00000)
     | > grad_norm: 8.02814  (8.12263)
     | > current_lr: 0.00000 
     | > step



> DataLoader initialization
| > Tokenizer:
	| > add_blank: False
	| > use_eos_bos: False
	| > use_phonemes: True
	| > phonemizer:
		| > phoneme language: en-us
		| > phoneme backend: gruut
	| > 3 not found characters:
	| > ͡
	| > “
	| > ”
| > Number of instances : 131
 | > Preprocessing samples
 | > Max text length: 174
 | > Min text length: 20
 | > Avg text length: 100.76335877862596
 | 
 | > Max audio length: 222643.0
 | > Min audio length: 34739.0
 | > Avg audio length: 144033.41221374046
 | > Num. instances discarded samples: 0
 | > Batch group size: 0.
 | > Synthesizing test sentences.



  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[91m 0.01313 [0m(+0.00426)
     | > avg_loss:[92m 2.99811 [0m(-0.31707)
     | > avg_log_mle:[92m 0.74328 [0m(-0.02914)
     | > avg_loss_dur:[92m 2.25483 [0m(-0.28794)

 > BEST MODEL : tts_train_dir/run-May-16-2023_05+33PM-0000000/best_model_1218.pth

[4m[1m > EPOCH: 3/100[0m
 --> tts_train_dir/run-May-16-2023_05+33PM-0000000

[1m > TRAINING (2023-05-16 18:04:25) [0m




> DataLoader initialization
| > Tokenizer:
	| > add_blank: False
	| > use_eos_bos: False
	| > use_phonemes: True
	| > phonemizer:
		| > phoneme language: en-us
		| > phoneme backend: gruut
	| > 3 not found characters:
	| > ͡
	| > “
	| > ”
| > Number of instances : 12969
 | > Preprocessing samples
 | > Max text length: 188
 | > Min text length: 13
 | > Avg text length: 100.90014650319993
 | 
 | > Max audio length: 222643.0
 | > Min audio length: 24499.0
 | > Avg audio length: 144984.29755570978
 | > Num. instances discarded samples: 0
 | > Batch group size: 0.



[1m   --> STEP: 7/406 -- GLOBAL_STEP: 1225[0m
     | > loss: 3.04712  (3.11136)
     | > log_mle: 0.74373  (0.74251)
     | > loss_dur: 2.30339  (2.36885)
     | > amp_scaler: 16384.00000  (16384.00000)
     | > grad_norm: 6.26538  (6.31860)
     | > current_lr: 0.00000 
     | > step_time: 0.43660  (0.49502)
     | > loader_time: 0.01160  (0.00942)


[1m   --> STEP: 32/406 -- GLOBAL_STEP: 1250[0m
     | > loss: 3.09510  (3.05955)
     | > log_mle: 0.74397  (0.74435)
     | > loss_dur: 2.35113  (2.31521)
     | > amp_scaler: 16384.00000  (16384.00000)
     | > grad_norm: 6.28039  (6.24708)
     | > current_lr: 0.00000 
     | > step_time: 0.58790  (0.64060)
     | > loader_time: 0.01520  (0.01147)


[1m   --> STEP: 57/406 -- GLOBAL_STEP: 1275[0m
     | > loss: 2.98275  (3.03324)
     | > log_mle: 0.74196  (0.74373)
     | > loss_dur: 2.24079  (2.28950)
     | > amp_scaler: 16384.00000  (16384.00000)
     | > grad_norm: 5.95565  (6.17040)
     | > current_lr: 0.00000 
     | > st



> DataLoader initialization
| > Tokenizer:
	| > add_blank: False
	| > use_eos_bos: False
	| > use_phonemes: True
	| > phonemizer:
		| > phoneme language: en-us
		| > phoneme backend: gruut
	| > 3 not found characters:
	| > ͡
	| > “
	| > ”
| > Number of instances : 131
 | > Preprocessing samples
 | > Max text length: 174
 | > Min text length: 20
 | > Avg text length: 100.76335877862596
 | 
 | > Max audio length: 222643.0
 | > Min audio length: 34739.0
 | > Avg audio length: 144033.41221374046
 | > Num. instances discarded samples: 0
 | > Batch group size: 0.
 | > Synthesizing test sentences.



  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[92m 0.00971 [0m(-0.00342)
     | > avg_loss:[92m 2.75287 [0m(-0.24524)
     | > avg_log_mle:[92m 0.68985 [0m(-0.05343)
     | > avg_loss_dur:[92m 2.06302 [0m(-0.19181)

 > BEST MODEL : tts_train_dir/run-May-16-2023_05+33PM-0000000/best_model_1624.pth

[4m[1m > EPOCH: 4/100[0m
 --> tts_train_dir/run-May-16-2023_05+33PM-0000000

[1m > TRAINING (2023-05-16 18:12:49) [0m




> DataLoader initialization
| > Tokenizer:
	| > add_blank: False
	| > use_eos_bos: False
	| > use_phonemes: True
	| > phonemizer:
		| > phoneme language: en-us
		| > phoneme backend: gruut
	| > 3 not found characters:
	| > ͡
	| > “
	| > ”
| > Number of instances : 12969
 | > Preprocessing samples
 | > Max text length: 188
 | > Min text length: 13
 | > Avg text length: 100.90014650319993
 | 
 | > Max audio length: 222643.0
 | > Min audio length: 24499.0
 | > Avg audio length: 144984.29755570978
 | > Num. instances discarded samples: 0
 | > Batch group size: 0.



[1m   --> STEP: 1/406 -- GLOBAL_STEP: 1625[0m
     | > loss: 2.90296  (2.90296)
     | > log_mle: 0.69260  (0.69260)
     | > loss_dur: 2.21036  (2.21036)
     | > amp_scaler: 16384.00000  (16384.00000)
     | > grad_norm: 5.28940  (5.28940)
     | > current_lr: 0.00000 
     | > step_time: 0.94060  (0.94061)
     | > loader_time: 0.03240  (0.03236)


[1m   --> STEP: 26/406 -- GLOBAL_STEP: 1650[0m
     | > loss: 2.77666  (2.83294)
     | > log_mle: 0.69429  (0.69573)
     | > loss_dur: 2.08237  (2.13721)
     | > amp_scaler: 16384.00000  (16384.00000)
     | > grad_norm: 5.07961  (5.18512)
     | > current_lr: 0.00000 
     | > step_time: 0.62530  (0.62615)
     | > loader_time: 0.01570  (0.01193)


[1m   --> STEP: 51/406 -- GLOBAL_STEP: 1675[0m
     | > loss: 2.79368  (2.80108)
     | > log_mle: 0.68363  (0.69415)
     | > loss_dur: 2.11006  (2.10692)
     | > amp_scaler: 16384.00000  (16384.00000)
     | > grad_norm: 5.08514  (5.13650)
     | > current_lr: 0.00000 
     | > st



> DataLoader initialization
| > Tokenizer:
	| > add_blank: False
	| > use_eos_bos: False
	| > use_phonemes: True
	| > phonemizer:
		| > phoneme language: en-us
		| > phoneme backend: gruut
	| > 3 not found characters:
	| > ͡
	| > “
	| > ”
| > Number of instances : 131
 | > Preprocessing samples
 | > Max text length: 174
 | > Min text length: 20
 | > Avg text length: 100.76335877862596
 | 
 | > Max audio length: 222643.0
 | > Min audio length: 34739.0
 | > Avg audio length: 144033.41221374046
 | > Num. instances discarded samples: 0
 | > Batch group size: 0.
 | > Synthesizing test sentences.



  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[91m 0.01074 [0m(+0.00103)
     | > avg_loss:[92m 2.43641 [0m(-0.31646)
     | > avg_log_mle:[92m 0.61022 [0m(-0.07963)
     | > avg_loss_dur:[92m 1.82619 [0m(-0.23683)

 > BEST MODEL : tts_train_dir/run-May-16-2023_05+33PM-0000000/best_model_2030.pth

[4m[1m > EPOCH: 5/100[0m
 --> tts_train_dir/run-May-16-2023_05+33PM-0000000

[1m > TRAINING (2023-05-16 18:21:29) [0m




> DataLoader initialization
| > Tokenizer:
	| > add_blank: False
	| > use_eos_bos: False
	| > use_phonemes: True
	| > phonemizer:
		| > phoneme language: en-us
		| > phoneme backend: gruut
	| > 3 not found characters:
	| > ͡
	| > “
	| > ”
| > Number of instances : 12969
 | > Preprocessing samples
 | > Max text length: 188
 | > Min text length: 13
 | > Avg text length: 100.90014650319993
 | 
 | > Max audio length: 222643.0
 | > Min audio length: 24499.0
 | > Avg audio length: 144984.29755570978
 | > Num. instances discarded samples: 0
 | > Batch group size: 0.



[1m   --> STEP: 20/406 -- GLOBAL_STEP: 2050[0m
     | > loss: 2.43312  (2.52465)
     | > log_mle: 0.63105  (0.62374)
     | > loss_dur: 1.80207  (1.90090)
     | > amp_scaler: 16384.00000  (16384.00000)
     | > grad_norm: 4.48053  (4.61263)
     | > current_lr: 0.00000 
     | > step_time: 0.65520  (0.67430)
     | > loader_time: 0.01330  (0.01126)


[1m   --> STEP: 45/406 -- GLOBAL_STEP: 2075[0m
     | > loss: 2.46401  (2.48640)
     | > log_mle: 0.61868  (0.62113)
     | > loss_dur: 1.84533  (1.86527)
     | > amp_scaler: 16384.00000  (16384.00000)
     | > grad_norm: 4.55479  (4.56645)
     | > current_lr: 0.00000 
     | > step_time: 0.66150  (0.69995)
     | > loader_time: 0.01340  (0.01353)


[1m   --> STEP: 70/406 -- GLOBAL_STEP: 2100[0m
     | > loss: 2.41545  (2.47478)
     | > log_mle: 0.59952  (0.61649)
     | > loss_dur: 1.81593  (1.85829)
     | > amp_scaler: 16384.00000  (16384.00000)
     | > grad_norm: 4.45098  (4.55244)
     | > current_lr: 0.00000 
     | > s



> DataLoader initialization
| > Tokenizer:
	| > add_blank: False
	| > use_eos_bos: False
	| > use_phonemes: True
	| > phonemizer:
		| > phoneme language: en-us
		| > phoneme backend: gruut
	| > 3 not found characters:
	| > ͡
	| > “
	| > ”
| > Number of instances : 131
 | > Preprocessing samples
 | > Max text length: 174
 | > Min text length: 20
 | > Avg text length: 100.76335877862596
 | 
 | > Max audio length: 222643.0
 | > Min audio length: 34739.0
 | > Avg audio length: 144033.41221374046
 | > Num. instances discarded samples: 0
 | > Batch group size: 0.
 | > Synthesizing test sentences.



  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[91m 0.01387 [0m(+0.00313)
     | > avg_loss:[92m 2.04308 [0m(-0.39333)
     | > avg_log_mle:[92m 0.51466 [0m(-0.09556)
     | > avg_loss_dur:[92m 1.52842 [0m(-0.29777)

 > BEST MODEL : tts_train_dir/run-May-16-2023_05+33PM-0000000/best_model_2436.pth

[4m[1m > EPOCH: 6/100[0m
 --> tts_train_dir/run-May-16-2023_05+33PM-0000000

[1m > TRAINING (2023-05-16 18:30:04) [0m




> DataLoader initialization
| > Tokenizer:
	| > add_blank: False
	| > use_eos_bos: False
	| > use_phonemes: True
	| > phonemizer:
		| > phoneme language: en-us
		| > phoneme backend: gruut
	| > 3 not found characters:
	| > ͡
	| > “
	| > ”
| > Number of instances : 12969
 | > Preprocessing samples
 | > Max text length: 188
 | > Min text length: 13
 | > Avg text length: 100.90014650319993
 | 
 | > Max audio length: 222643.0
 | > Min audio length: 24499.0
 | > Avg audio length: 144984.29755570978
 | > Num. instances discarded samples: 0
 | > Batch group size: 0.



[1m   --> STEP: 14/406 -- GLOBAL_STEP: 2450[0m
     | > loss: 2.14259  (2.14212)
     | > log_mle: 0.52022  (0.53317)
     | > loss_dur: 1.62237  (1.60895)
     | > amp_scaler: 16384.00000  (16384.00000)
     | > grad_norm: 4.07272  (4.03119)
     | > current_lr: 0.00000 
     | > step_time: 0.50390  (0.72681)
     | > loader_time: 0.01850  (0.01334)


[1m   --> STEP: 39/406 -- GLOBAL_STEP: 2475[0m
     | > loss: 2.08823  (2.10643)
     | > log_mle: 0.52820  (0.53091)
     | > loss_dur: 1.56003  (1.57551)
     | > amp_scaler: 16384.00000  (16384.00000)
     | > grad_norm: 3.98090  (3.97942)
     | > current_lr: 0.00000 
     | > step_time: 0.49010  (0.71204)
     | > loader_time: 0.00890  (0.01240)


[1m   --> STEP: 64/406 -- GLOBAL_STEP: 2500[0m
     | > loss: 1.94730  (2.08417)
     | > log_mle: 0.52504  (0.52642)
     | > loss_dur: 1.42226  (1.55775)
     | > amp_scaler: 16384.00000  (16384.00000)
     | > grad_norm: 3.68343  (3.93746)
     | > current_lr: 0.00000 
     | > s



> DataLoader initialization
| > Tokenizer:
	| > add_blank: False
	| > use_eos_bos: False
	| > use_phonemes: True
	| > phonemizer:
		| > phoneme language: en-us
		| > phoneme backend: gruut
	| > 3 not found characters:
	| > ͡
	| > “
	| > ”
| > Number of instances : 131
 | > Preprocessing samples
 | > Max text length: 174
 | > Min text length: 20
 | > Avg text length: 100.76335877862596
 | 
 | > Max audio length: 222643.0
 | > Min audio length: 34739.0
 | > Avg audio length: 144033.41221374046
 | > Num. instances discarded samples: 0
 | > Batch group size: 0.
 | > Synthesizing test sentences.



  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[92m 0.00996 [0m(-0.00391)
     | > avg_loss:[92m 1.72562 [0m(-0.31746)
     | > avg_log_mle:[92m 0.44169 [0m(-0.07297)
     | > avg_loss_dur:[92m 1.28393 [0m(-0.24449)

 > BEST MODEL : tts_train_dir/run-May-16-2023_05+33PM-0000000/best_model_2842.pth

[4m[1m > EPOCH: 7/100[0m
 --> tts_train_dir/run-May-16-2023_05+33PM-0000000

[1m > TRAINING (2023-05-16 18:38:33) [0m




> DataLoader initialization
| > Tokenizer:
	| > add_blank: False
	| > use_eos_bos: False
	| > use_phonemes: True
	| > phonemizer:
		| > phoneme language: en-us
		| > phoneme backend: gruut
	| > 3 not found characters:
	| > ͡
	| > “
	| > ”
| > Number of instances : 12969
 | > Preprocessing samples
 | > Max text length: 188
 | > Min text length: 13
 | > Avg text length: 100.90014650319993
 | 
 | > Max audio length: 222643.0
 | > Min audio length: 24499.0
 | > Avg audio length: 144984.29755570978
 | > Num. instances discarded samples: 0
 | > Batch group size: 0.



[1m   --> STEP: 8/406 -- GLOBAL_STEP: 2850[0m
     | > loss: 1.81816  (1.82325)
     | > log_mle: 0.46040  (0.46742)
     | > loss_dur: 1.35776  (1.35583)
     | > amp_scaler: 16384.00000  (16384.00000)
     | > grad_norm: 3.15904  (3.20936)
     | > current_lr: 0.00000 
     | > step_time: 0.52400  (0.52279)
     | > loader_time: 0.00840  (0.00931)


[1m   --> STEP: 33/406 -- GLOBAL_STEP: 2875[0m
     | > loss: 1.79396  (1.78294)
     | > log_mle: 0.45811  (0.46281)
     | > loss_dur: 1.33585  (1.32012)
     | > amp_scaler: 16384.00000  (16384.00000)
     | > grad_norm: 3.19144  (3.16200)
     | > current_lr: 0.00000 
     | > step_time: 0.61600  (0.63241)
     | > loader_time: 0.00520  (0.00993)


[1m   --> STEP: 58/406 -- GLOBAL_STEP: 2900[0m
     | > loss: 1.79182  (1.76837)
     | > log_mle: 0.45139  (0.45966)
     | > loss_dur: 1.34043  (1.30871)
     | > amp_scaler: 16384.00000  (16384.00000)
     | > grad_norm: 3.14062  (3.12631)
     | > current_lr: 0.00000 
     | > st

In [None]:
!pip install tensorboard
!tensorboard --logdir=tts_train_dir

In [None]:
import glob, os

output_path = "tts_train_dir"

# Checkpoints and configs sit one level below the output directory, inside the
# individual run folders.
ckpts = sorted(glob.glob(output_path + "/*/*.pth"))
configs = sorted(glob.glob(output_path + "/*/*.json"))

# Fail with a clear message instead of passing empty arguments to the `tts`
# command in the synthesis cell.
if not ckpts or not configs:
    raise FileNotFoundError(
        "No checkpoint (*.pth) or config (*.json) found under "
        + repr(output_path)
        + "; run the training cell first."
    )

# The synthesis cell reads `test_ckpt` and `test_config`. Sorting by name is not
# chronological (e.g. "best_model_1218" sorts before "best_model_406"), so the
# newest checkpoint is chosen by modification time.
test_ckpt = max(ckpts, key=os.path.getmtime)

# Prefer a config stored in the same run folder as the chosen checkpoint; fall
# back to the last config found if that folder has none.
run_dir = os.path.dirname(test_ckpt)
test_config = next(
    (cfg for cfg in configs if os.path.dirname(cfg) == run_dir),
    configs[-1],
)

In [None]:
# Synthesize a sentence with the `tts` command-line tool and write it to out.wav.
# IPython substitutes `$test_ckpt` and `$test_config` with the Python variables
# of those names, so both must be defined before this cell runs.
!tts --text "Text for TTS" \
      --model_path $test_ckpt \
      --config_path $test_config \
      --out_path out.wav

In [None]:
import IPython

# Wrap the synthesized file in an audio widget; leaving it as the last
# expression of the cell makes the notebook render the player.
synthesized_audio = IPython.display.Audio("out.wav")
synthesized_audio