In [1]:
!pip install TTS

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting TTS
  Downloading TTS-0.14.0-cp310-cp310-manylinux1_x86_64.whl (736 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m736.7/736.7 kB[0m [31m12.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting cython==0.29.28 (from TTS)
  Downloading Cython-0.29.28-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl (1.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m43.4 MB/s[0m eta [36m0:00:00[0m
Collecting inflect==5.6.0 (from TTS)
  Downloading inflect-5.6.0-py3-none-any.whl (33 kB)
Collecting anyascii (from TTS)
  Downloading anyascii-0.3.2-py3-none-any.whl (289 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m289.9/289.9 kB[0m [31m25.9 MB/s[0m eta [36m0:00:00[0m
Collecting aiohttp (from TTS)
  Downloading aiohttp-3.8.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (

In [2]:
import os

In [3]:
# BaseDatasetConfig: defines name, formatter and path of the dataset.
from TTS.tts.configs.shared_configs import BaseDatasetConfig

`BaseDatasetConfig` is a configuration template for dataset-related settings in text-to-speech (TTS) applications: it defines the name, formatter, and path of a dataset.

In [4]:
# Directory that receives the downloaded archive, the phoneme cache and all
# training runs/checkpoints.
output_path = "tts_train_dir"
# exist_ok=True makes the cell idempotent on re-run and avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(output_path, exist_ok=True)

In [5]:
# Download the LJSpeech-1.1 archive (about 2.6 GB according to the captured output)
# into tts_train_dir/. Re-running this cell downloads the whole file again.
!wget -O tts_train_dir/LJSpeech-1.1.tar.bz2 https://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2 

--2023-05-16 17:29:50--  https://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2
Resolving data.keithito.com (data.keithito.com)... 174.138.79.61
Connecting to data.keithito.com (data.keithito.com)|174.138.79.61|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2748572632 (2.6G) [application/octet-stream]
Saving to: ‘tts_train_dir/LJSpeech-1.1.tar.bz2’


2023-05-16 17:30:20 (87.2 MB/s) - ‘tts_train_dir/LJSpeech-1.1.tar.bz2’ saved [2748572632/2748572632]



In [6]:
# Unpack the archive. No -C option is given, so the contents land in the
# current working directory (not inside tts_train_dir/).
!tar -xf tts_train_dir/LJSpeech-1.1.tar.bz2

In [7]:
# Describe the dataset: which formatter parses it, which metadata file lists
# the training utterances, and where the files live on disk.
dataset_config = BaseDatasetConfig(
    formatter="ljspeech",
    meta_file_train="metadata.csv",
    # The archive is extracted into the current working directory, so resolve
    # the dataset folder from there instead of hard-coding Colab's /content.
    # On Colab this still evaluates to /content/LJSpeech-1.1.
    path=os.path.abspath("LJSpeech-1.1"),
)

In [8]:
from TTS.tts.configs.tacotron_config import TacotronConfig

# Training configuration for Tacotron; keyword arguments are grouped by concern.
config = TacotronConfig(
    # --- data and output locations ---
    datasets=[dataset_config],
    output_path=output_path,
    # --- text front-end ---
    text_cleaner="phoneme_cleaners",
    use_phonemes=True,
    phoneme_language="en-us",
    phoneme_cache_path=os.path.join(output_path, "phoneme_cache"),
    # --- batching and data loading ---
    batch_size=32,
    eval_batch_size=16,
    num_loader_workers=4,
    num_eval_loader_workers=4,
    # --- training schedule ---
    epochs=100,
    run_eval=True,
    # -1 means no waiting period: testing is performed right after each epoch
    # of training, starting with the first one.
    test_delay_epochs=-1,
    mixed_precision=True,
    # --- logging and checkpointing (values are step counts) ---
    print_step=25,
    print_eval=False,
    save_step=1000,
)


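Note: A text cleaner performs text normalization operations that convert the raw input text into a cleaner representation suitable for TTS processing. The "phoneme_cleaners" text cleaner is the normalization pipeline intended for phoneme-based training: it expands numbers and abbreviations and tidies symbols and whitespace so that the text can then be converted into phonemes, which are the smallest units of sound in a language. The conversion to phonemes itself is switched on by `use_phonemes=True`.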

By combining the "phoneme_cleaners" text cleaner with `use_phonemes=True`, the TTS model will work with phoneme sequences instead of raw text, enabling it to generate speech that corresponds to the phonetic representation of the input text.

Now we will initialize the audio processor using the `AudioProcessor` class, which is responsible for handling feature extraction from audio and performing audio I/O operations.

In [9]:
from TTS.utils.audio import AudioProcessor
# Build the audio processor from `config`; the resulting settings
# (sample rate, FFT size, mel bands, ...) are printed in the cell output.
ap = AudioProcessor.init_from_config(config)

 > Setting up Audio Processor...
 | > sample_rate:22050
 | > resample:False
 | > num_mels:80
 | > log_func:np.log10
 | > min_level_db:-100
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:20
 | > fft_size:1024
 | > power:1.5
 | > preemphasis:0.0
 | > griffin_lim_iters:60
 | > signal_norm:True
 | > symmetric_norm:True
 | > mel_fmin:0
 | > mel_fmax:None
 | > pitch_fmin:1.0
 | > pitch_fmax:640.0
 | > spec_gain:20.0
 | > stft_pad_mode:reflect
 | > max_norm:4.0
 | > clip_norm:True
 | > do_trim_silence:True
 | > trim_db:45
 | > do_sound_norm:False
 | > do_amp_to_db_linear:True
 | > do_amp_to_db_mel:True
 | > do_rms_norm:False
 | > db_level:None
 | > stats_path:None
 | > base:10
 | > hop_length:256
 | > win_length:1024


Next, we will initialize the tokenizer, which is used to convert text to sequences of token IDs. If characters are not defined in the config, default characters are passed to the config.

In [10]:
from TTS.tts.utils.text.tokenizer import TTSTokenizer
# init_from_config returns both the tokenizer and a config object; `config` is
# rebound to the returned one (presumably updated with the default character
# set when none was defined — confirm against the TTS documentation).
tokenizer, config = TTSTokenizer.init_from_config(config)

Next we will load data samples. Each sample is a list of [text, audio_file_path, speaker_name].

In [11]:
from TTS.tts.datasets import load_tts_samples
# Load the samples described by `dataset_config` and, because eval_split=True,
# split them into a training set and an evaluation set. The split limits are
# taken from `config` rather than hard-coded here.
train_samples, eval_samples = load_tts_samples(
    dataset_config,
    eval_split=True,
    eval_split_max_size=config.eval_split_max_size,
    eval_split_size=config.eval_split_size,
)

 | > Found 13100 files in /content/LJSpeech-1.1


In [12]:
from TTS.tts.models.tacotron import Tacotron
# Instantiate the Tacotron model from the config, audio processor and
# tokenizer built above; no speaker manager is supplied.
model = Tacotron(config, ap, tokenizer, speaker_manager=None)

In [None]:
from trainer import Trainer, TrainerArgs

# Assemble the trainer from default trainer arguments, the model configuration,
# the output directory, the model itself and the two sample sets.
trainer = Trainer(
    TrainerArgs(),
    config,
    output_path,
    model=model,
    train_samples=train_samples,
    eval_samples=eval_samples,
)

In [None]:
# Start training. This is a long-running cell: the config requests 100 epochs
# and the captured log shows roughly half an hour between epoch starts.
trainer.fit()


[4m[1m > EPOCH: 0/100[0m
 --> tts_train_dir/run-May-16-2023_05+34PM-0000000


[*] Pre-computing phonemes...


  0%|          | 4/12969 [00:00<37:57,  5.69it/s]

ɪnstɛd əv weɪtɪŋ ðɛɹ, ɔzwɔld əpɛɹəntli wɛnt æz fɑɹ əweɪ æz hi kʊd ænd bɔɹdɪd ðə fɚst oʊk klɪf bʌs wɪt͡ʃ keɪm əlɔŋ
 [!] Character '͡' not found in the vocabulary. Discarding it.


 16%|█▌        | 2059/12969 [01:21<05:04, 35.88it/s]

ɪntu ðə “kɹeɪtɚ” dʌɡ aʊt ɪn ðə mɪdəl, pɔɹ ðə spʌnd͡ʒ, wɔɹm wɔtɚ, ðə məlæsɪz, ænd soʊdə dɪzɑlvd ɪn hɑt wɔtɚ.
 [!] Character '“' not found in the vocabulary. Discarding it.
ɪntu ðə “kɹeɪtɚ” dʌɡ aʊt ɪn ðə mɪdəl, pɔɹ ðə spʌnd͡ʒ, wɔɹm wɔtɚ, ðə məlæsɪz, ænd soʊdə dɪzɑlvd ɪn hɑt wɔtɚ.
 [!] Character '”' not found in the vocabulary. Discarding it.


100%|██████████| 12969/12969 [05:02<00:00, 42.93it/s]

[1m > TRAINING (2023-05-16 17:40:09) [0m




> DataLoader initialization
| > Tokenizer:
	| > add_blank: False
	| > use_eos_bos: False
	| > use_phonemes: True
	| > phonemizer:
		| > phoneme language: en-us
		| > phoneme backend: gruut
	| > 3 not found characters:
	| > ͡
	| > “
	| > ”
| > Number of instances : 12969
 | > Preprocessing samples
 | > Max text length: 188
 | > Min text length: 13
 | > Avg text length: 100.90014650319993
 | 
 | > Max audio length: 222643.0
 | > Min audio length: 24499.0
 | > Avg audio length: 144984.29755570978
 | > Num. instances discarded samples: 0
 | > Batch group size: 0.


  return _VF.meshgrid(tensors, **kwargs)  # type: ignore[attr-defined]

[1m   --> STEP: 0/406 -- GLOBAL_STEP: 0[0m
     | > decoder_loss: 3.96578  (3.96578)
     | > postnet_loss: 2.70405  (2.70405)
     | > stopnet_loss: 0.76831  (0.76831)
     | > ga_loss: 0.02065  (0.02065)
     | > decoder_diff_spec_loss: 0.20948  (0.20948)
     | > postnet_diff_spec_loss: 0.45940  (0.45940)
     | > decoder_ssim_loss: 0.89874  (0.89874)
     | > postnet_ssim_loss: 0.87623  (0.87623)
     | > loss: 3.14995  (3.14995)
     | > align_error: 0.94115  (0.94115)
     | > amp_scaler: 65536.00000  (65536.00000)
     | > grad_norm: 1.60614  (1.60614)
     | > current_lr: 0.00000 
     | > step_time: 6.41410  (6.41414)
     | > loader_time: 3.52430  (3.52430)


[1m   --> STEP: 25/406 -- GLOBAL_STEP: 25[0m
     | > decoder_loss: 4.27260  (4.10153)
     | > postnet_loss: 2.65782  (2.60968)
     | > stopnet_loss: 0.78761  (0.78038)
     | > ga_loss: 0.00844  (0.01072)
     | > decoder_diff_spec_loss: 0.216



> DataLoader initialization
| > Tokenizer:
	| > add_blank: False
	| > use_eos_bos: False
	| > use_phonemes: True
	| > phonemizer:
		| > phoneme language: en-us
		| > phoneme backend: gruut
	| > 3 not found characters:
	| > ͡
	| > “
	| > ”
| > Number of instances : 131
 | > Preprocessing samples
 | > Max text length: 174
 | > Min text length: 20
 | > Avg text length: 100.76335877862596
 | 
 | > Max audio length: 222643.0
 | > Min audio length: 34739.0
 | > Avg audio length: 144033.41221374046
 | > Num. instances discarded samples: 0
 | > Batch group size: 0.
 | > Synthesizing test sentences.
   | > Decoder stopped with 'max_decoder_steps
   | > Decoder stopped with 'max_decoder_steps
   | > Decoder stopped with 'max_decoder_steps
   | > Decoder stopped with 'max_decoder_steps
   | > Decoder stopped with 'max_decoder_steps



  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time: 0.01750 [0m(+0.00000)
     | > avg_decoder_loss: 4.37734 [0m(+0.00000)
     | > avg_postnet_loss: 2.61879 [0m(+0.00000)
     | > avg_stopnet_loss: 0.84825 [0m(+0.00000)
     | > avg_ga_loss: 0.00340 [0m(+0.00000)
     | > avg_decoder_diff_spec_loss: 0.21481 [0m(+0.00000)
     | > avg_postnet_diff_spec_loss: 0.46460 [0m(+0.00000)
     | > avg_decoder_ssim_loss: 0.90404 [0m(+0.00000)
     | > avg_postnet_ssim_loss: 0.88467 [0m(+0.00000)
     | > avg_loss: 3.23131 [0m(+0.00000)
     | > avg_align_error: 0.98966 [0m(+0.00000)

 > BEST MODEL : tts_train_dir/run-May-16-2023_05+34PM-0000000/best_model_406.pth

[4m[1m > EPOCH: 1/100[0m
 --> tts_train_dir/run-May-16-2023_05+34PM-0000000

[1m > TRAINING (2023-05-16 18:09:22) [0m




> DataLoader initialization
| > Tokenizer:
	| > add_blank: False
	| > use_eos_bos: False
	| > use_phonemes: True
	| > phonemizer:
		| > phoneme language: en-us
		| > phoneme backend: gruut
	| > 3 not found characters:
	| > ͡
	| > “
	| > ”
| > Number of instances : 12969
 | > Preprocessing samples
 | > Max text length: 188
 | > Min text length: 13
 | > Avg text length: 100.90014650319993
 | 
 | > Max audio length: 222643.0
 | > Min audio length: 24499.0
 | > Avg audio length: 144984.29755570978
 | > Num. instances discarded samples: 0
 | > Batch group size: 0.



[1m   --> STEP: 19/406 -- GLOBAL_STEP: 425[0m
     | > decoder_loss: 4.05089  (4.08964)
     | > postnet_loss: 2.57635  (2.60934)
     | > stopnet_loss: 0.78223  (0.77782)
     | > ga_loss: 0.00858  (0.01151)
     | > decoder_diff_spec_loss: 0.22725  (0.21952)
     | > postnet_diff_spec_loss: 0.47503  (0.46762)
     | > decoder_ssim_loss: 0.95224  (0.94702)
     | > postnet_ssim_loss: 0.93842  (0.93564)
     | > loss: 3.13020  (3.15257)
     | > align_error: 0.97351  (0.96531)
     | > amp_scaler: 65536.00000  (65536.00000)
     | > grad_norm: 1.70772  (1.68014)
     | > current_lr: 0.00000 
     | > step_time: 0.93740  (1.02045)
     | > loader_time: 0.02460  (0.02120)


[1m   --> STEP: 44/406 -- GLOBAL_STEP: 450[0m
     | > decoder_loss: 4.16215  (4.10431)
     | > postnet_loss: 2.61785  (2.59956)
     | > stopnet_loss: 0.78413  (0.78220)
     | > ga_loss: 0.00636  (0.00914)
     | > decoder_diff_spec_loss: 0.22544  (0.22160)
     | > postnet_diff_spec_loss: 0.46765  (0.46777)
 



> DataLoader initialization
| > Tokenizer:
	| > add_blank: False
	| > use_eos_bos: False
	| > use_phonemes: True
	| > phonemizer:
		| > phoneme language: en-us
		| > phoneme backend: gruut
	| > 3 not found characters:
	| > ͡
	| > “
	| > ”
| > Number of instances : 131
 | > Preprocessing samples
 | > Max text length: 174
 | > Min text length: 20
 | > Avg text length: 100.76335877862596
 | 
 | > Max audio length: 222643.0
 | > Min audio length: 34739.0
 | > Avg audio length: 144033.41221374046
 | > Num. instances discarded samples: 0
 | > Batch group size: 0.
 | > Synthesizing test sentences.
   | > Decoder stopped with 'max_decoder_steps
   | > Decoder stopped with 'max_decoder_steps
   | > Decoder stopped with 'max_decoder_steps
   | > Decoder stopped with 'max_decoder_steps
   | > Decoder stopped with 'max_decoder_steps



  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[91m 0.02225 [0m(+0.00474)
     | > avg_decoder_loss:[92m 4.37447 [0m(-0.00288)
     | > avg_postnet_loss:[92m 2.61854 [0m(-0.00026)
     | > avg_stopnet_loss:[91m 0.85078 [0m(+0.00253)
     | > avg_ga_loss:[92m 0.00340 [0m(-0.00000)
     | > avg_decoder_diff_spec_loss:[92m 0.21468 [0m(-0.00013)
     | > avg_postnet_diff_spec_loss:[91m 0.46461 [0m(+0.00001)
     | > avg_decoder_ssim_loss:[91m 0.90414 [0m(+0.00010)
     | > avg_postnet_ssim_loss:[92m 0.88366 [0m(-0.00101)
     | > avg_loss:[91m 3.23280 [0m(+0.00148)
     | > avg_align_error:[91m 0.98967 [0m(+0.00001)


[4m[1m > EPOCH: 2/100[0m
 --> tts_train_dir/run-May-16-2023_05+34PM-0000000

[1m > TRAINING (2023-05-16 18:37:47) [0m




> DataLoader initialization
| > Tokenizer:
	| > add_blank: False
	| > use_eos_bos: False
	| > use_phonemes: True
	| > phonemizer:
		| > phoneme language: en-us
		| > phoneme backend: gruut
	| > 3 not found characters:
	| > ͡
	| > “
	| > ”
| > Number of instances : 12969
 | > Preprocessing samples
 | > Max text length: 188
 | > Min text length: 13
 | > Avg text length: 100.90014650319993
 | 
 | > Max audio length: 222643.0
 | > Min audio length: 24499.0
 | > Avg audio length: 144984.29755570978
 | > Num. instances discarded samples: 0
 | > Batch group size: 0.



[1m   --> STEP: 13/406 -- GLOBAL_STEP: 825[0m
     | > decoder_loss: 4.11257  (4.07533)
     | > postnet_loss: 2.61601  (2.61215)
     | > stopnet_loss: 0.78558  (0.77654)
     | > ga_loss: 0.00968  (0.01258)
     | > decoder_diff_spec_loss: 0.21825  (0.21751)
     | > postnet_diff_spec_loss: 0.46324  (0.46630)
     | > decoder_ssim_loss: 0.95060  (0.94448)
     | > postnet_ssim_loss: 0.94050  (0.93265)
     | > loss: 3.15928  (3.15153)
     | > align_error: 0.96955  (0.96235)
     | > amp_scaler: 65536.00000  (65536.00000)
     | > grad_norm: 1.70376  (1.66905)
     | > current_lr: 0.00000 
     | > step_time: 1.47900  (1.06405)
     | > loader_time: 0.01100  (0.02020)


[1m   --> STEP: 38/406 -- GLOBAL_STEP: 850[0m
     | > decoder_loss: 4.16503  (4.10439)
     | > postnet_loss: 2.58850  (2.60210)
     | > stopnet_loss: 0.78525  (0.78110)
     | > ga_loss: 0.00698  (0.00955)
     | > decoder_diff_spec_loss: 0.22204  (0.22100)
     | > postnet_diff_spec_loss: 0.46888  (0.46787)
 

In [None]:
!pip install tensorboard
!tensorboard --logdir=tts_train_dir

In [None]:
import glob
import os

output_path = "tts_train_dir"


def newest_path(paths):
    """Return the most recently modified path in `paths`, or None if it is empty."""
    return max(paths, key=os.path.getmtime, default=None)


# All checkpoints and JSON configs found one level below the output directory
# (one sub-directory per training run), in sorted order.
ckpts = sorted(glob.glob(output_path + "/*/*.pth"))
configs = sorted(glob.glob(output_path + "/*/*.json"))

# The synthesis cell interpolates $test_ckpt and $test_config, so define them
# here: the newest checkpoint, and the newest JSON config from the same run
# directory so that model and config always belong together.
test_ckpt = newest_path(ckpts)
_run_dir = os.path.dirname(test_ckpt) if test_ckpt else None
test_config = newest_path([c for c in configs if os.path.dirname(c) == _run_dir])

if test_ckpt is None or test_config is None:
    print(f"No checkpoint/config pair found under {output_path!r}; train a model first.")

In [None]:
# Synthesize a test sentence with the `tts` command-line tool and write the
# result to out.wav.
# NOTE(review): $test_ckpt and $test_config are interpolated from kernel
# variables; make sure both are defined before running this cell.
!tts --text "Text for TTS" \
      --model_path $test_ckpt \
      --config_path $test_config \
      --out_path out.wav

In [None]:
from IPython.display import Audio

# Render an inline audio player for the synthesized file.
Audio("out.wav")