Source: https://tts.readthedocs.io/en/latest/tutorial_for_nervous_beginners.html#training-a-tts-model 

https://github.com/neonsecret/TTS-With-Voice-Cloning-Multilang

In [None]:
!pip install TTS

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting TTS
  Downloading TTS-0.11.1-cp38-cp38-manylinux1_x86_64.whl (604 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m604.1/604.1 KB[0m [31m38.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting g2pkk>=0.1.1
  Downloading g2pkk-0.1.2-py3-none-any.whl (25 kB)
Collecting umap-learn==0.5.1
  Downloading umap-learn-0.5.1.tar.gz (80 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m80.9/80.9 KB[0m [31m11.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting mecab-python3==1.0.5
  Downloading mecab_python3-1.0.5-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (577 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m577.3/577.3 KB[0m [31m50.3 MB/s[0m eta [36m0:00:00[0m
Collecting librosa==0.8.0
  Downloading librosa-0.8.0.tar.gz (183 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
import gc
import torch
gc.collect()
torch.cuda.empty_cache()

In [None]:
!pip install -q kaggle

In [None]:
from google.colab import files
!cp kaggle.json ~/.kaggle/

In [None]:
!kaggle datasets download -d freezerainml/ruslan

Downloading ruslan.zip to /content
100% 3.84G/3.84G [03:24<00:00, 23.1MB/s]
100% 3.84G/3.84G [03:24<00:00, 20.2MB/s]


In [None]:
!unzip '/content/ruslan.zip'

In [1]:
import os
from glob import glob

from trainer import Trainer, TrainerArgs

from TTS.tts.configs.shared_configs import BaseDatasetConfig
from TTS.tts.configs.vits_config import VitsConfig
from TTS.tts.datasets import load_tts_samples
from TTS.tts.models.vits import CharactersConfig, Vits, VitsArgs, VitsAudioConfig
from TTS.tts.utils.languages import LanguageManager
from TTS.tts.utils.speakers import SpeakerManager
from TTS.tts.utils.text.tokenizer import TTSTokenizer
from TTS.utils.audio import AudioProcessor

output_path = "/content/"

dataset_config = BaseDatasetConfig(
    formatter="ruslan", meta_file_train="/content/ruslan_dataset/metadata_RUSLAN_22200.csv", 
    path='/content/ruslan_dataset/'
)

audio_config = VitsAudioConfig(
    sample_rate=16000,
    win_length=1024,
    hop_length=256,
    num_mels=80,
    mel_fmin=0,
    mel_fmax=None,
)

vitsArgs = VitsArgs(
    use_language_embedding=True,
    embedded_language_dim=4,
    use_speaker_embedding=True,
    use_sdp=False,
)
config = VitsConfig(
    model_args=vitsArgs,
    audio=audio_config,
    run_name="vits_vctk",
    use_speaker_embedding=True,
    save_all_best=True,
    batch_size=32,
    eval_batch_size=16,
    batch_group_size=0,
    num_loader_workers=4,
    num_eval_loader_workers=4,
    run_eval=True,
    test_delay_epochs=-1,
    epochs=100,
    text_cleaner="multilingual_cleaners",
    use_phonemes=False,
    phoneme_language="ru-ru",
    phoneme_cache_path=os.path.join(output_path, "phoneme_cache"),
    compute_input_seq_cache=True,
    print_step=25,
    use_language_weighted_sampler=True,
    print_eval=False,
    mixed_precision=False,
    min_audio_len=32 * 256 * 4,
    max_audio_len=160000,
    output_path=output_path,
    datasets=[dataset_config],
    characters=CharactersConfig(
        characters_class="TTS.tts.models.vits.VitsCharacters",
        pad="<PAD>",
        eos="<EOS>",
        bos="<BOS>",
        blank="<BLNK>",
        characters="!¡'(),-.:;¿?abcdefghijklmnopqrstuvwxyzµßàáâäåæçèéêëìíîïñòóôöùúûüąćęłńœśşźżƒабвгдежзийклмнопрстуфхцчшщъыьэюяёєіїґӧ «°±µ»$%&‘’‚“`”„",
        punctuations="!¡'(),-.:;¿? ",
        phonemes=None,
    ),
    test_sentences=[
        [
            "It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.",
            "mary_ann",
            None,
            "en_US",
        ],
        [
            "Il m'a fallu beaucoup de temps pour d\u00e9velopper une voix, et maintenant que je l'ai, je ne vais pas me taire.",
            "ezwa",
            None,
            "fr_FR",
        ],
        ["Ich finde, dieses Startup ist wirklich unglaublich.", "eva_k", None, "de_DE"],
        ["Я думаю, что этот стартап действительно удивительный.", "oblomov", None, "ru_RU"],
    ],
)

# init audio processor
ap = AudioProcessor.init_from_config(config)

# load training samples
train_samples, eval_samples = load_tts_samples(
    dataset_config,
    eval_split=True,
    eval_split_max_size=config.eval_split_max_size,
    eval_split_size=config.eval_split_size,
)


# INITIALIZE THE TOKENIZER
# Tokenizer is used to convert text to sequences of token IDs.
# config is updated with the default characters if not defined in the config.
tokenizer, config = TTSTokenizer.init_from_config(config)

# init model
model = Vits(config, ap, tokenizer)

# init the trainer and 🚀
trainer = Trainer(
    TrainerArgs(),
    config,
    output_path,
    model=model,
    train_samples=train_samples,
    eval_samples=eval_samples,
)

 > Setting up Audio Processor...
 | > sample_rate:16000
 | > resample:False
 | > num_mels:80
 | > log_func:np.log10
 | > min_level_db:0
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:None
 | > fft_size:1024
 | > power:None
 | > preemphasis:0.0
 | > griffin_lim_iters:None
 | > signal_norm:None
 | > symmetric_norm:None
 | > mel_fmin:0
 | > mel_fmax:None
 | > pitch_fmin:None
 | > pitch_fmax:None
 | > spec_gain:20.0
 | > stft_pad_mode:reflect
 | > max_norm:1.0
 | > clip_norm:True
 | > do_trim_silence:False
 | > trim_db:60
 | > do_sound_norm:False
 | > do_amp_to_db_linear:True
 | > do_amp_to_db_mel:True
 | > do_rms_norm:False
 | > db_level:None
 | > stats_path:None
 | > base:10
 | > hop_length:256
 | > win_length:1024
 | > Found 22200 files in /content/ruslan_dataset


 > Training Environment:
 | > Current device: 0
 | > Num. of GPUs: 1
 | > Num. of CPUs: 2
 | > Num. of Torch Threads: 1
 | > Torch seed: 54321
 | > Torch CUDNN: True
 | > Torch CUDNN deterministic: False
 | > Torch CUDNN benchmark: False
 > Start Tensorboard: tensorboard --logdir=/content/vits_vctk-March-05-2023_07+59PM-0000000

 > Model has 82090365 parameters


In [2]:
trainer.fit()


[4m[1m > EPOCH: 0/100[0m
 --> /content/vits_vctk-March-05-2023_07+59PM-0000000




> DataLoader initialization
| > Tokenizer:
	| > add_blank: True
	| > use_eos_bos: False
	| > use_phonemes: False
| > Number of instances : 21978



[1m > TRAINING (2023-03-05 19:59:55) [0m


 | > Preprocessing samples
 | > Max text length: 132
 | > Min text length: 15
 | > Avg text length: 77.22354371498044
 | 
 | > Max audio length: 159989.0
 | > Min audio length: 32788.0
 | > Avg audio length: 103850.23897875039
 | > Num. instances discarded samples: 3060
 | > Batch group size: 0.
['<BLNK>', '–', '<BLNK>', ' ', '<BLNK>', 'ч', '<BLNK>', 'т', '<BLNK>', 'о', '<BLNK>', ' ', '<BLNK>', 'ж', '<BLNK>', 'е', '<BLNK>', ' ', '<BLNK>', 'т', '<BLNK>', 'ы', '<BLNK>', ' ', '<BLNK>', 'п', '<BLNK>', 'р', '<BLNK>', 'а', '<BLNK>', 'з', '<BLNK>', 'д', '<BLNK>', 'н', '<BLNK>', 'о', '<BLNK>', 'в', '<BLNK>', 'а', '<BLNK>', 'л', '<BLNK>', ',', '<BLNK>', ' ', '<BLNK>', 'с', '<BLNK>', 'ы', '<BLNK>', 'н', '<BLNK>', 'о', '<BLNK>', 'к', '<BLNK>', '?', '<BLNK>']
 [!] Character '–' not found in the vocabulary. Discarding it.
['<BLNK>', 'ж', '<BLNK>', 'и', '<BLNK>', 'т', '<BLNK>', 'ь', '<BLNK>', ' ', '<BLNK>', 'п', '<BLNK>', 'р', '<BLNK>', 'е', '<BLNK>', 'д', '<BLNK>', 'п', '<BLNK>', 'о', '<BLNK>', 'ч'

  return _VF.stft(input, n_fft, hop_length, win_length, window,  # type: ignore[attr-defined]


['<BLNK>', '–', '<BLNK>', ' ', '<BLNK>', 'в', '<BLNK>', ' ', '<BLNK>', 'с', '<BLNK>', 'л', '<BLNK>', 'е', '<BLNK>', 'д', '<BLNK>', 'у', '<BLNK>', 'ю', '<BLNK>', 'щ', '<BLNK>', 'и', '<BLNK>', 'й', '<BLNK>', ' ', '<BLNK>', 'р', '<BLNK>', 'а', '<BLNK>', 'з', '<BLNK>', ' ', '<BLNK>', 'б', '<BLNK>', 'у', '<BLNK>', 'д', '<BLNK>', 'ь', '<BLNK>', 'т', '<BLNK>', 'е', '<BLNK>', ' ', '<BLNK>', 'а', '<BLNK>', 'к', '<BLNK>', 'т', '<BLNK>', 'и', '<BLNK>', 'в', '<BLNK>', 'н', '<BLNK>', 'е', '<BLNK>', 'е', '<BLNK>', '.', '<BLNK>']
 [!] Character '–' not found in the vocabulary. Discarding it.



[1m   --> STEP: 0/592 -- GLOBAL_STEP: 0[0m
     | > loss_disc: 5.99036  (5.99036)
     | > loss_disc_real_0: 1.01957  (1.01957)
     | > loss_disc_real_1: 0.95664  (0.95664)
     | > loss_disc_real_2: 1.02611  (1.02611)
     | > loss_disc_real_3: 0.97346  (0.97346)
     | > loss_disc_real_4: 0.98518  (0.98518)
     | > loss_disc_real_5: 1.02822  (1.02822)
     | > loss_0: 5.99036  (5.99036)
     | > grad_norm_0: 6.73263  (6.73263)
     | > loss_gen: 4.44501  (4.44501)
     | > loss_kl: 148.95711  (148.95711)
     | > loss_feat: 0.33847  (0.33847)
     | > loss_mel: 103.48712  (103.48712)
     | > loss_duration: 0.83971  (0.83971)
     | > loss_1: 258.06741  (258.06741)
     | > grad_norm_1: 1264.68542  (1264.68542)
     | > current_lr_0: 0.00020 
     | > current_lr_1: 0.00020 
     | > step_time: 4.63650  (4.63651)
     | > loader_time: 2.67350  (2.67347)



['<BLNK>', 'я', '<BLNK>', ' ', '<BLNK>', 'п', '<BLNK>', 'ы', '<BLNK>', 'т', '<BLNK>', 'а', '<BLNK>', 'л', '<BLNK>', 'с', '<BLNK>', 'я', '<BLNK>', ' ', '<BLNK>', 'ч', '<BLNK>', 'т', '<BLNK>', 'о', '<BLNK>', '‑', '<BLNK>', 'т', '<BLNK>', 'о', '<BLNK>', ' ', '<BLNK>', 'в', '<BLNK>', 'о', '<BLNK>', 'з', '<BLNK>', 'р', '<BLNK>', 'а', '<BLNK>', 'ж', '<BLNK>', 'а', '<BLNK>', 'т', '<BLNK>', 'ь', '<BLNK>', '.', '<BLNK>']
 [!] Character '‑' not found in the vocabulary. Discarding it.
['<BLNK>', 'д', '<BLNK>', 'а', '<BLNK>', 'ж', '<BLNK>', 'е', '<BLNK>', ' ', '<BLNK>', '–', '<BLNK>', ' ', '<BLNK>', 'н', '<BLNK>', 'а', '<BLNK>', 'х', '<BLNK>', 'о', '<BLNK>', 'д', '<BLNK>', 'я', '<BLNK>', 'с', '<BLNK>', 'ь', '<BLNK>', ' ', '<BLNK>', 'в', '<BLNK>', ' ', '<BLNK>', 'к', '<BLNK>', 'о', '<BLNK>', 'м', '<BLNK>', 'а', '<BLNK>', 'н', '<BLNK>', 'д', '<BLNK>', 'и', '<BLNK>', 'р', '<BLNK>', 'о', '<BLNK>', 'в', '<BLNK>', 'к', '<BLNK>', 'е', '<BLNK>', '.', '<BLNK>']
 [!] Character '–' not found in the vocabular


[1m   --> STEP: 25/592 -- GLOBAL_STEP: 25[0m
     | > loss_disc: 2.83096  (3.14492)
     | > loss_disc_real_0: 0.25834  (0.27448)
     | > loss_disc_real_1: 0.23310  (0.28461)
     | > loss_disc_real_2: 0.20494  (0.27577)
     | > loss_disc_real_3: 0.23825  (0.29499)
     | > loss_disc_real_4: 0.23998  (0.29856)
     | > loss_disc_real_5: 0.20082  (0.29789)
     | > loss_0: 2.83096  (3.14492)
     | > grad_norm_0: 1.58508  (2.38391)
     | > loss_gen: 1.70050  (1.68391)
     | > loss_kl: 5.44517  (17.97519)
     | > loss_feat: 0.95457  (0.49409)
     | > loss_mel: 54.46906  (60.38720)
     | > loss_duration: 0.66111  (0.68940)
     | > loss_1: 63.23039  (81.22979)
     | > grad_norm_1: 168.79877  (168.18526)
     | > current_lr_0: 0.00020 
     | > current_lr_1: 0.00020 
     | > step_time: 2.61860  (2.65453)
     | > loader_time: 0.00670  (0.00949)


[1m   --> STEP: 50/592 -- GLOBAL_STEP: 50[0m
     | > loss_disc: 2.33110  (2.88051)
     | > loss_disc_real_0: 0.09546  (0.24476)
 

['<BLNK>', 'в', '<BLNK>', 'я', '<BLNK>', 'л', '<BLNK>', 'ы', '<BLNK>', 'й', '<BLNK>', ' ', '<BLNK>', 'к', '<BLNK>', 'и', '<BLNK>', 'в', '<BLNK>', 'о', '<BLNK>', 'к', '<BLNK>', '.', '<BLNK>', ' ', '<BLNK>', '–', '<BLNK>', ' ', '<BLNK>', 'ч', '<BLNK>', 'т', '<BLNK>', 'о', '<BLNK>', '‑', '<BLNK>', 'т', '<BLNK>', 'о', '<BLNK>', ' ', '<BLNK>', 'н', '<BLNK>', 'е', '<BLNK>', ' ', '<BLNK>', 'в', '<BLNK>', 'е', '<BLNK>', 'р', '<BLNK>', 'и', '<BLNK>', 'т', '<BLNK>', 'с', '<BLNK>', 'я', '<BLNK>', '.', '<BLNK>', ' ', '<BLNK>', 'н', '<BLNK>', 'у', '<BLNK>', ' ', '<BLNK>', 'д', '<BLNK>', 'а', '<BLNK>', ' ', '<BLNK>', 'л', '<BLNK>', 'а', '<BLNK>', 'д', '<BLNK>', 'н', '<BLNK>', 'о', '<BLNK>', '.', '<BLNK>']
 [!] Character '‑' not found in the vocabulary. Discarding it.



[1m   --> STEP: 75/592 -- GLOBAL_STEP: 75[0m
     | > loss_disc: 2.19935  (2.65611)
     | > loss_disc_real_0: 0.05755  (0.19259)
     | > loss_disc_real_1: 0.26920  (0.25942)
     | > loss_disc_real_2: 0.19904  (0.25785)
     | > loss_disc_real_3: 0.20315  (0.26026)
     | > loss_disc_real_4: 0.15673  (0.25899)
     | > loss_disc_real_5: 0.15420  (0.23713)
     | > loss_0: 2.19935  (2.65611)
     | > grad_norm_0: 10.02472  (6.37573)
     | > loss_gen: 2.69580  (2.09634)
     | > loss_kl: 2.31390  (8.01234)
     | > loss_feat: 4.16959  (2.14979)
     | > loss_mel: 49.73565  (52.38464)
     | > loss_duration: 0.62044  (0.65159)
     | > loss_1: 59.53538  (65.29470)
     | > grad_norm_1: 214.13000  (161.42067)
     | > current_lr_0: 0.00020 
     | > current_lr_1: 0.00020 
     | > step_time: 2.79610  (2.68060)
     | > loader_time: 0.02080  (0.01057)


[1m   --> STEP: 100/592 -- GLOBAL_STEP: 100[0m
     | > loss_disc: 2.75678  (2.62963)
     | > loss_disc_real_0: 0.29202  (0.18289)

['<BLNK>', 'я', '<BLNK>', ' ', '<BLNK>', 'н', '<BLNK>', 'а', '<BLNK>', ' ', '<BLNK>', 'б', '<BLNK>', 'у', '<BLNK>', 'к', '<BLNK>', 'в', '<BLNK>', 'у', '<BLNK>', ' ', '<BLNK>', 'о', '<BLNK>', ' ', '<BLNK>', '/', '<BLNK>', 'б', '<BLNK>', 'и', '<BLNK>', 'б', '<BLNK>', 'л', '<BLNK>', 'и', '<BLNK>', 'о', '<BLNK>', 'г', '<BLNK>', 'р', '<BLNK>', 'а', '<BLNK>', 'ф', '<BLNK>', 'и', '<BLNK>', 'я', '<BLNK>', ' ', '<BLNK>', 'к', '<BLNK>', ' ', '<BLNK>', 'о', '<BLNK>', 'к', '<BLNK>', 'у', '<BLNK>', 'д', '<BLNK>', 'ж', '<BLNK>', 'а', '<BLNK>', 'в', '<BLNK>', 'е', '<BLNK>', '/', '<BLNK>', '.', '<BLNK>']
 [!] Character '/' not found in the vocabulary. Discarding it.



[1m   --> STEP: 150/592 -- GLOBAL_STEP: 150[0m
     | > loss_disc: 2.35126  (2.64785)
     | > loss_disc_real_0: 0.08617  (0.19287)
     | > loss_disc_real_1: 0.20380  (0.25549)
     | > loss_disc_real_2: 0.28239  (0.25259)
     | > loss_disc_real_3: 0.31400  (0.25223)
     | > loss_disc_real_4: 0.32506  (0.25092)
     | > loss_disc_real_5: 0.27229  (0.23973)
     | > loss_0: 2.35126  (2.64785)
     | > grad_norm_0: 9.86216  (7.25380)
     | > loss_gen: 2.59371  (2.12763)
     | > loss_kl: 1.90677  (4.99466)
     | > loss_feat: 3.14257  (2.27799)
     | > loss_mel: 37.10413  (47.96114)
     | > loss_duration: 0.60382  (0.62635)
     | > loss_1: 45.35100  (57.98777)
     | > grad_norm_1: 119.17648  (185.73987)
     | > current_lr_0: 0.00020 
     | > current_lr_1: 0.00020 
     | > step_time: 2.92180  (2.76725)
     | > loader_time: 0.01000  (0.01122)


[1m   --> STEP: 175/592 -- GLOBAL_STEP: 175[0m
     | > loss_disc: 2.38880  (2.62923)
     | > loss_disc_real_0: 0.10653  (0.18567

['<BLNK>', '/', '<BLNK>', 'н', '<BLNK>', 'а', '<BLNK>', 'п', '<BLNK>', 'р', '<BLNK>', 'и', '<BLNK>', 'м', '<BLNK>', 'е', '<BLNK>', 'р', '<BLNK>', ',', '<BLNK>', ' ', '<BLNK>', 'в', '<BLNK>', 'е', '<BLNK>', 'р', '<BLNK>', 'т', '<BLNK>', 'у', '<BLNK>', 'х', '<BLNK>', 'а', '<BLNK>', 'й', '<BLNK>', ',', '<BLNK>', ' ', '<BLNK>', 'к', '<BLNK>', 'а', '<BLNK>', 'к', '<BLNK>', ' ', '<BLNK>', 'в', '<BLNK>', 'ы', '<BLNK>', ' ', '<BLNK>', 'с', '<BLNK>', 'о', '<BLNK>', 'и', '<BLNK>', 'з', '<BLNK>', 'в', '<BLNK>', 'о', '<BLNK>', 'л', '<BLNK>', 'и', '<BLNK>', 'л', '<BLNK>', 'и', '<BLNK>', ' ', '<BLNK>', 'д', '<BLNK>', 'р', '<BLNK>', 'у', '<BLNK>', 'ж', '<BLNK>', 'е', '<BLNK>', 'с', '<BLNK>', 'к', '<BLNK>', 'и', '<BLNK>', ' ', '<BLNK>', 'м', '<BLNK>', 'е', '<BLNK>', 'н', '<BLNK>', 'я', '<BLNK>', ' ', '<BLNK>', 'п', '<BLNK>', 'о', '<BLNK>', 'и', '<BLNK>', 'м', '<BLNK>', 'е', '<BLNK>', 'н', '<BLNK>', 'о', '<BLNK>', 'в', '<BLNK>', 'а', '<BLNK>', 'т', '<BLNK>', 'ь', '<BLNK>', '.', '<BLNK>']
 [!] Character


[1m   --> STEP: 250/592 -- GLOBAL_STEP: 250[0m
     | > loss_disc: 2.72587  (2.59563)
     | > loss_disc_real_0: 0.17443  (0.19605)
     | > loss_disc_real_1: 0.29597  (0.25135)
     | > loss_disc_real_2: 0.21067  (0.24392)
     | > loss_disc_real_3: 0.24730  (0.24645)
     | > loss_disc_real_4: 0.27740  (0.22823)
     | > loss_disc_real_5: 0.27022  (0.23312)
     | > loss_0: 2.72587  (2.59563)
     | > grad_norm_0: 8.61105  (9.79594)
     | > loss_gen: 2.39904  (2.14861)
     | > loss_kl: 1.49239  (3.67087)
     | > loss_feat: 2.09123  (2.38874)
     | > loss_mel: 33.56944  (42.83024)
     | > loss_duration: 0.55318  (0.60090)
     | > loss_1: 40.10527  (51.63937)
     | > grad_norm_1: 175.32205  (212.35730)
     | > current_lr_0: 0.00020 
     | > current_lr_1: 0.00020 
     | > step_time: 3.07630  (2.88350)
     | > loader_time: 0.01280  (0.01243)



['<BLNK>', 'а', '<BLNK>', ' ', '<BLNK>', 'л', '<BLNK>', 'е', '<BLNK>', 'в', '<BLNK>', ' ', '<BLNK>', 'у', '<BLNK>', 'ф', '<BLNK>', 'л', '<BLNK>', 'я', '<BLNK>', 'н', '<BLNK>', 'д', '<BLNK>', '*', '<BLNK>', ' ', '<BLNK>', 'е', '<BLNK>', 'щ', '<BLNK>', 'е', '<BLNK>', ' ', '<BLNK>', 'б', '<BLNK>', 'о', '<BLNK>', 'л', '<BLNK>', 'ь', '<BLNK>', 'ш', '<BLNK>', 'е', '<BLNK>', ' ', '<BLNK>', 'п', '<BLNK>', 'о', '<BLNK>', 'д', '<BLNK>', 'л', '<BLNK>', 'и', '<BLNK>', 'в', '<BLNK>', 'а', '<BLNK>', 'е', '<BLNK>', 'т', '<BLNK>', ' ', '<BLNK>', 'ж', '<BLNK>', 'е', '<BLNK>', 'л', '<BLNK>', 'ч', '<BLNK>', 'и', '<BLNK>', ',', '<BLNK>', ' ', '<BLNK>', 'п', '<BLNK>', 'л', '<BLNK>', 'ю', '<BLNK>', 'е', '<BLNK>', 'т', '<BLNK>', ' ', '<BLNK>', 'н', '<BLNK>', 'а', '<BLNK>', ' ', '<BLNK>', 'р', '<BLNK>', 'у', '<BLNK>', 'с', '<BLNK>', 'с', '<BLNK>', 'к', '<BLNK>', 'и', '<BLNK>', 'й', '<BLNK>', ' ', '<BLNK>', 'н', '<BLNK>', 'а', '<BLNK>', 'р', '<BLNK>', 'о', '<BLNK>', 'д', '<BLNK>', '.', '<BLNK>']
 [!] Character


[1m   --> STEP: 275/592 -- GLOBAL_STEP: 275[0m
     | > loss_disc: 2.69542  (2.60528)
     | > loss_disc_real_0: 0.06684  (0.19434)
     | > loss_disc_real_1: 0.18190  (0.25136)
     | > loss_disc_real_2: 0.22689  (0.24436)
     | > loss_disc_real_3: 0.28700  (0.24539)
     | > loss_disc_real_4: 0.25924  (0.23036)
     | > loss_disc_real_5: 0.26144  (0.23443)
     | > loss_0: 2.69542  (2.60528)
     | > grad_norm_0: 18.04631  (9.71653)
     | > loss_gen: 2.28741  (2.12745)
     | > loss_kl: 1.77661  (3.48694)
     | > loss_feat: 1.91113  (2.34792)
     | > loss_mel: 32.40456  (41.97990)
     | > loss_duration: 0.53884  (0.59666)
     | > loss_1: 38.91854  (50.53887)
     | > grad_norm_1: 258.89798  (216.23877)
     | > current_lr_0: 0.00020 
     | > current_lr_1: 0.00020 
     | > step_time: 3.14930  (2.91069)
     | > loader_time: 0.01400  (0.01268)


[1m   --> STEP: 300/592 -- GLOBAL_STEP: 300[0m
     | > loss_disc: 2.75781  (2.61672)
     | > loss_disc_real_0: 0.19331  (0.1950

['<BLNK>', 'б', '<BLNK>', 'р', '<BLNK>', 'а', '<BLNK>', 'т', '<BLNK>', ' ', '<BLNK>', 'р', '<BLNK>', 'а', '<BLNK>', 'з', '<BLNK>', 'ъ', '<BLNK>', 'е', '<BLNK>', 'з', '<BLNK>', 'ж', '<BLNK>', 'а', '<BLNK>', 'л', '<BLNK>', ' ', '<BLNK>', 'п', '<BLNK>', 'о', '<BLNK>', ' ', '<BLNK>', 'о', '<BLNK>', 'т', '<BLNK>', 'д', '<BLNK>', 'а', '<BLNK>', 'л', '<BLNK>', 'е', '<BLNK>', 'н', '<BLNK>', 'н', '<BLNK>', 'ы', '<BLNK>', 'м', '<BLNK>', ' ', '<BLNK>', 'л', '<BLNK>', 'а', '<BLNK>', 'г', '<BLNK>', 'е', '<BLNK>', 'р', '<BLNK>', 'н', '<BLNK>', 'ы', '<BLNK>', 'м', '<BLNK>', ' ', '<BLNK>', 'т', '<BLNK>', 'о', '<BLNK>', 'ч', '<BLNK>', 'к', '<BLNK>', 'а', '<BLNK>', 'м', '<BLNK>', '.', '<BLNK>', ' ', '<BLNK>', 'е', '<BLNK>', 'м', '<BLNK>', 'у', '<BLNK>', ' ', '<BLNK>', 'п', '<BLNK>', 'р', '<BLNK>', 'е', '<BLNK>', 'д', '<BLNK>', 'о', '<BLNK>', 'с', '<BLNK>', 'т', '<BLNK>', 'а', '<BLNK>', 'в', '<BLNK>', 'и', '<BLNK>', 'л', '<BLNK>', 'и', '<BLNK>', ' ', '<BLNK>', 'к', '<BLNK>', 'а', '<BLNK>', 'з', '<BLNK>',


[1m   --> STEP: 350/592 -- GLOBAL_STEP: 350[0m
     | > loss_disc: 2.67921  (2.63747)
     | > loss_disc_real_0: 0.12431  (0.20049)
     | > loss_disc_real_1: 0.24039  (0.25130)
     | > loss_disc_real_2: 0.24483  (0.24443)
     | > loss_disc_real_3: 0.26139  (0.24390)
     | > loss_disc_real_4: 0.19436  (0.23199)
     | > loss_disc_real_5: 0.20098  (0.23563)
     | > loss_0: 2.67921  (2.63747)
     | > grad_norm_0: 9.83336  (10.05711)
     | > loss_gen: 1.85034  (2.07810)
     | > loss_kl: 1.52657  (3.08848)
     | > loss_feat: 1.85405  (2.21443)
     | > loss_mel: 30.52396  (39.87231)
     | > loss_duration: 0.56807  (0.58774)
     | > loss_1: 36.32299  (47.84106)
     | > grad_norm_1: 435.56174  (227.63310)
     | > current_lr_0: 0.00020 
     | > current_lr_1: 0.00020 
     | > step_time: 3.24390  (2.98038)
     | > loader_time: 0.01430  (0.01375)


[1m   --> STEP: 375/592 -- GLOBAL_STEP: 375[0m
     | > loss_disc: 2.76193  (2.65065)
     | > loss_disc_real_0: 0.17057  (0.2028

['<BLNK>', 'н', '<BLNK>', 'а', '<BLNK>', ' ', '<BLNK>', 'ф', '<BLNK>', 'и', '<BLNK>', 'л', '<BLNK>', 'и', '<BLNK>', 'п', '<BLNK>', 'п', '<BLNK>', 'и', '<BLNK>', 'н', '<BLNK>', 'а', '<BLNK>', 'х', '<BLNK>', ' ', '<BLNK>', 'к', '<BLNK>', 'т', '<BLNK>', 'о', '<BLNK>', '‑', '<BLNK>', 'т', '<BLNK>', 'о', '<BLNK>', ' ', '<BLNK>', 'з', '<BLNK>', 'а', '<BLNK>', 'с', '<BLNK>', 'т', '<BLNK>', 'р', '<BLNK>', 'е', '<BLNK>', 'л', '<BLNK>', 'и', '<BLNK>', 'л', '<BLNK>', ' ', '<BLNK>', 'р', '<BLNK>', 'у', '<BLNK>', 'к', '<BLNK>', 'о', '<BLNK>', 'в', '<BLNK>', 'о', '<BLNK>', 'д', '<BLNK>', 'и', '<BLNK>', 'т', '<BLNK>', 'е', '<BLNK>', 'л', '<BLNK>', 'я', '<BLNK>', ' ', '<BLNK>', 'п', '<BLNK>', 'а', '<BLNK>', 'р', '<BLNK>', 'т', '<BLNK>', 'и', '<BLNK>', 'й', '<BLNK>', 'н', '<BLNK>', 'о', '<BLNK>', 'й', '<BLNK>', ' ', '<BLNK>', 'о', '<BLNK>', 'п', '<BLNK>', 'п', '<BLNK>', 'о', '<BLNK>', 'з', '<BLNK>', 'и', '<BLNK>', 'ц', '<BLNK>', 'и', '<BLNK>', 'и', '<BLNK>', '.', '<BLNK>', ' ', '<BLNK>', 'п', '<BLNK>',


[1m   --> STEP: 425/592 -- GLOBAL_STEP: 425[0m
     | > loss_disc: 3.24233  (2.69372)
     | > loss_disc_real_0: 0.23975  (0.20534)
     | > loss_disc_real_1: 0.25260  (0.25123)
     | > loss_disc_real_2: 0.30993  (0.24944)
     | > loss_disc_real_3: 0.15779  (0.24451)
     | > loss_disc_real_4: 0.58766  (0.24456)
     | > loss_disc_real_5: 0.78595  (0.24611)
     | > loss_0: 3.24233  (2.69372)
     | > grad_norm_0: 11.55890  (10.33282)
     | > loss_gen: 2.21889  (2.06226)
     | > loss_kl: 1.71610  (2.83458)
     | > loss_feat: 1.64288  (2.09649)
     | > loss_mel: 29.48994  (38.22359)
     | > loss_duration: 0.56310  (0.58244)
     | > loss_1: 35.63091  (45.79936)
     | > grad_norm_1: 254.42690  (235.23772)
     | > current_lr_0: 0.00020 
     | > current_lr_1: 0.00020 
     | > step_time: 3.36960  (3.03808)
     | > loader_time: 0.01640  (0.01455)


[1m   --> STEP: 450/592 -- GLOBAL_STEP: 450[0m
     | > loss_disc: 2.73684  (2.70397)
     | > loss_disc_real_0: 0.25209  (0.206

['<BLNK>', 'н', '<BLNK>', 'е', '<BLNK>', ' ', '<BLNK>', 'х', '<BLNK>', 'о', '<BLNK>', 'ч', '<BLNK>', 'у', '<BLNK>', ' ', '<BLNK>', 'д', '<BLNK>', 'е', '<BLNK>', 'м', '<BLNK>', 'о', '<BLNK>', 'н', '<BLNK>', 'с', '<BLNK>', 'т', '<BLNK>', 'р', '<BLNK>', 'и', '<BLNK>', 'р', '<BLNK>', 'о', '<BLNK>', 'в', '<BLNK>', 'а', '<BLNK>', 'т', '<BLNK>', 'ь', '<BLNK>', ' ', '<BLNK>', 'с', '<BLNK>', 'в', '<BLNK>', 'о', '<BLNK>', 'е', '<BLNK>', 'г', '<BLNK>', 'о', '<BLNK>', ' ', '<BLNK>', 'п', '<BLNK>', 'р', '<BLNK>', 'е', '<BLNK>', 'в', '<BLNK>', 'о', '<BLNK>', 'с', '<BLNK>', 'х', '<BLNK>', 'о', '<BLNK>', 'д', '<BLNK>', 'с', '<BLNK>', 'т', '<BLNK>', 'в', '<BLNK>', 'а', '<BLNK>', '…', '<BLNK>', ' ', '<BLNK>', 'м', '<BLNK>', 'ы', '<BLNK>', ' ', '<BLNK>', 'б', '<BLNK>', 'у', '<BLNK>', 'д', '<BLNK>', 'е', '<BLNK>', 'м', '<BLNK>', ' ', '<BLNK>', 'в', '<BLNK>', 'ы', '<BLNK>', 'ш', '<BLNK>', 'е', '<BLNK>', ' ', '<BLNK>', 'э', '<BLNK>', 'т', '<BLNK>', 'о', '<BLNK>', 'г', '<BLNK>', 'о', '<BLNK>', '.', '<BLNK>',


[1m   --> STEP: 525/592 -- GLOBAL_STEP: 525[0m
     | > loss_disc: 2.81185  (2.72427)
     | > loss_disc_real_0: 0.26008  (0.20848)
     | > loss_disc_real_1: 0.21447  (0.25069)
     | > loss_disc_real_2: 0.18632  (0.24835)
     | > loss_disc_real_3: 0.31218  (0.24550)
     | > loss_disc_real_4: 0.22553  (0.24374)
     | > loss_disc_real_5: 0.25525  (0.24693)
     | > loss_0: 2.81185  (2.72427)
     | > grad_norm_0: 4.54610  (9.47575)
     | > loss_gen: 1.58986  (2.00099)
     | > loss_kl: 1.68038  (2.61987)
     | > loss_feat: 1.10308  (1.92658)
     | > loss_mel: 27.24844  (36.46859)
     | > loss_duration: 0.59656  (0.58097)
     | > loss_1: 32.21833  (43.59699)
     | > grad_norm_1: 236.32036  (241.44025)
     | > current_lr_0: 0.00020 
     | > current_lr_1: 0.00020 
     | > step_time: 3.41070  (3.10641)
     | > loader_time: 0.01620  (0.01538)


[1m   --> STEP: 550/592 -- GLOBAL_STEP: 550[0m
     | > loss_disc: 2.89430  (2.73049)
     | > loss_disc_real_0: 0.17453  (0.20924

['<BLNK>', 'в', '<BLNK>', ' ', '<BLNK>', 'с', '<BLNK>', 'у', '<BLNK>', 'м', '<BLNK>', 'о', '<BLNK>', 'ч', '<BLNK>', 'к', '<BLNK>', 'е', '<BLNK>', ' ', '<BLNK>', 'е', '<BLNK>', 'е', '<BLNK>', ' ', '<BLNK>', 'л', '<BLNK>', 'е', '<BLNK>', 'ж', '<BLNK>', 'а', '<BLNK>', 'л', '<BLNK>', 'о', '<BLNK>', ' ', '<BLNK>', 'н', '<BLNK>', 'е', '<BLNK>', 'ч', '<BLNK>', 'т', '<BLNK>', 'о', '<BLNK>', ',', '<BLNK>', ' ', '<BLNK>', 'р', '<BLNK>', 'а', '<BLNK>', 'з', '<BLNK>', 'м', '<BLNK>', 'е', '<BLNK>', 'р', '<BLNK>', 'о', '<BLNK>', 'м', '<BLNK>', ' ', '<BLNK>', 'ч', '<BLNK>', 'у', '<BLNK>', 'т', '<BLNK>', 'ь', '<BLNK>', ' ', '<BLNK>', 'п', '<BLNK>', 'о', '<BLNK>', 'б', '<BLNK>', 'о', '<BLNK>', 'л', '<BLNK>', 'е', '<BLNK>', 'е', '<BLNK>', ' ', '<BLNK>', 'м', '<BLNK>', 'и', '<BLNK>', 'н', '<BLNK>', 'и', '<BLNK>', 'а', '<BLNK>', 'т', '<BLNK>', 'ю', '<BLNK>', 'р', '<BLNK>', 'н', '<BLNK>', 'о', '<BLNK>', 'г', '<BLNK>', 'о', '<BLNK>', ' ', '<BLNK>', 'д', '<BLNK>', 'а', '<BLNK>', 'м', '<BLNK>', 'с', '<BLNK>',


[1m   --> STEP: 575/592 -- GLOBAL_STEP: 575[0m
     | > loss_disc: 2.79625  (2.73517)
     | > loss_disc_real_0: 0.20631  (0.21019)
     | > loss_disc_real_1: 0.22493  (0.25035)
     | > loss_disc_real_2: 0.24030  (0.24810)
     | > loss_disc_real_3: 0.15506  (0.24499)
     | > loss_disc_real_4: 0.22272  (0.24319)
     | > loss_disc_real_5: 0.22666  (0.24667)
     | > loss_0: 2.79625  (2.73517)
     | > grad_norm_0: 3.84570  (9.20816)
     | > loss_gen: 1.63136  (1.97673)
     | > loss_kl: 1.71941  (2.53713)
     | > loss_feat: 1.13384  (1.85776)
     | > loss_mel: 27.24869  (35.68986)
     | > loss_duration: 0.62199  (0.58300)
     | > loss_1: 32.35528  (42.64449)
     | > grad_norm_1: 419.15323  (242.91690)
     | > current_lr_0: 0.00020 
     | > current_lr_1: 0.00020 
     | > step_time: 3.41370  (3.13995)
     | > loader_time: 0.01690  (0.01588)


[1m > EVALUATION [0m





> DataLoader initialization
| > Tokenizer:
	| > add_blank: True
	| > use_eos_bos: False
	| > use_phonemes: False
| > Number of instances : 222
 | > Preprocessing samples
 | > Max text length: 118
 | > Min text length: 18
 | > Avg text length: 81.47619047619048
 | 
 | > Max audio length: 159093.0
 | > Min audio length: 33172.0
 | > Avg audio length: 109915.1164021164
 | > Num. instances discarded samples: 33
 | > Batch group size: 0.
['<BLNK>', 'в', '<BLNK>', ' ', '<BLNK>', 'э', '<BLNK>', 'м', '<BLNK>', 'и', '<BLNK>', 'г', '<BLNK>', 'р', '<BLNK>', 'а', '<BLNK>', 'ц', '<BLNK>', 'и', '<BLNK>', 'и', '<BLNK>', ' ', '<BLNK>', 'б', '<BLNK>', 'ы', '<BLNK>', 'л', '<BLNK>', 'о', '<BLNK>', ' ', '<BLNK>', 'ч', '<BLNK>', 'т', '<BLNK>', 'о', '<BLNK>', '‑', '<BLNK>', 'т', '<BLNK>', 'о', '<BLNK>', ' ', '<BLNK>', 'н', '<BLNK>', 'е', '<BLNK>', 'р', '<BLNK>', 'е', '<BLNK>', 'а', '<BLNK>', 'л', '<BLNK>', 'ь', '<BLNK>', 'н', '<BLNK>', 'о', '<BLNK>', 'е', '<BLNK>', '.', '<BLNK>', ' ', '<BLNK>', 'ч', '<BLN

 ! Run is removed from /content/vits_vctk-March-05-2023_07+59PM-0000000


 | > Synthesizing test sentences.


Traceback (most recent call last):
  File "/usr/local/lib/python3.8/dist-packages/trainer/trainer.py", line 1591, in fit
    self._fit()
  File "/usr/local/lib/python3.8/dist-packages/trainer/trainer.py", line 1548, in _fit
    self.test_run()
  File "/usr/local/lib/python3.8/dist-packages/trainer/trainer.py", line 1466, in test_run
    test_outputs = self.model.test_run(self.training_assets)
  File "/usr/local/lib/python3.8/dist-packages/torch/autograd/grad_mode.py", line 27, in decorate_context
    return func(*args, **kwargs)
  File "/usr/local/lib/python3.8/dist-packages/TTS/tts/models/vits.py", line 1438, in test_run
    aux_inputs = self.get_aux_input_from_test_sentences(s_info)
  File "/usr/local/lib/python3.8/dist-packages/TTS/tts/models/vits.py", line 1409, in get_aux_input_from_test_sentences
    speaker_id = self.speaker_manager.name_to_id[speaker_name]
AttributeError: 'NoneType' object has no attribute 'name_to_id'


SystemExit: ignored

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [None]:
# cleanup "output" from cache
import shutil
shutil.rmtree("./phoneme_cache")