In [None]:
# Mount Google Drive; the dataset and training outputs used by later cells
# live under /content/drive/MyDrive.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
# Install Coqui TTS. Pinned to 0.20.1 -- the version reported by the captured
# `pip show TTS` output in this notebook -- so a fresh runtime reproduces the
# same environment. `%pip` installs into the running kernel's environment.
%pip install TTS==0.20.1



In [None]:
# espeak-ng provides the "espeak" phonemizer backend selected in the training config.
# `-y` answers apt's confirmation prompt so the cell cannot stall during "Run all".
!apt-get install -y espeak-ng

In [None]:
# `%pip` (rather than `!pip`) guarantees the package lands in the kernel's environment.
# NOTE(review): no cell in this notebook imports tensorflow and it is not listed in the
# `Requires:` line of `pip show TTS`; confirm this pin is still needed before keeping it.
%pip install tensorflow==2.9.1

In [None]:
# Sanity check: show the installed TTS version; `-v` adds the install location
# and installer columns to the listing.
!pip list -v | grep TTS

TTS                              0.20.1                /usr/local/lib/python3.10/dist-packages pip


In [None]:
# Print the installed TTS package metadata (version, location, declared dependencies).
!pip show TTS

Name: TTS
Version: 0.20.1
Summary: Deep learning for Text to Speech by Coqui.
Home-page: https://github.com/coqui-ai/TTS
Author: Eren Gölge
Author-email: egolge@coqui.ai
License: MPL-2.0
Location: /usr/local/lib/python3.10/dist-packages
Requires: aiohttp, anyascii, bangla, bnnumerizer, bnunicodenormalizer, coqpit, cython, einops, encodec, flask, fsspec, g2pkk, gruut, hangul-romanize, inflect, jamo, jieba, k-diffusion, librosa, matplotlib, nltk, num2words, numba, numpy, packaging, pandas, pypinyin, pysbd, pyyaml, scikit-learn, scipy, soundfile, torch, torchaudio, tqdm, trainer, transformers, umap-learn, unidecode
Required-by: 


In [None]:
# Pre-resample the corpus to 22050 Hz so training can run with `resample=False`.
# The script is run as a module (`-m`) instead of through a hard-coded
# /usr/local/lib/python3.10/dist-packages/... path, which stops working as soon as
# the runtime's Python version or install location changes.
# NOTE(review): the resampled copy is written to wavs2/; confirm that the training
# dataset actually reads from that folder rather than from the original wavs/.
!python -m TTS.bin.resample \
  --input_dir /content/drive/MyDrive/Software_proj/Tacatron/tts_train_dir/wavs/ \
  --output_sr 22050 \
  --output_dir /content/drive/MyDrive/Software_proj/Tacatron/tts_train_dir/wavs2/ \
  --file_ext wav \
  --n_jobs 24

Recursively copying the input folder...
Resampling the audio files...
Found 100 files...
100% 100/100 [00:39<00:00,  2.56it/s]
Done !


In [None]:
import os

from trainer import Trainer, TrainerArgs

from TTS.config.shared_configs import BaseAudioConfig
from TTS.tts.configs.shared_configs import BaseDatasetConfig
from TTS.tts.configs.tacotron2_config import Tacotron2Config
from TTS.tts.datasets import load_tts_samples
from TTS.tts.models.tacotron2 import Tacotron2
from TTS.tts.utils.speakers import SpeakerManager
from TTS.tts.utils.text.tokenizer import TTSTokenizer
from TTS.utils.audio import AudioProcessor

import torch
# Ask PyTorch to release unused cached GPU memory (helps when this cell is re-run
# in the same kernel).
torch.cuda.empty_cache()

# Training artifacts and the dataset share a single Drive folder.
output_path = "/content/drive/MyDrive/Software_proj/Tacatron/tts_train_dir/"
data_path = output_path

# The blizzard corpus is processed with an LJSpeech-like formatter; metadata is
# read from train.csv under `data_path`.
# NOTE(review): `ljspeech_test` looks like a test-oriented formatter that assigns
# synthetic speaker names (inference below uses 'ljspeech-9') -- confirm that is intended.
dataset_config = BaseDatasetConfig(
    path=data_path,
    meta_file_train="train.csv",
    formatter="ljspeech_test",
)


# Audio feature-extraction settings, grouped by concern. Values are unchanged;
# keyword order does not matter to the config class.
audio_config = BaseAudioConfig(
    # -- sampling --
    sample_rate=22050,
    # Keep on-the-fly resampling off: it slows down training. Pre-resample the
    # corpus with `TTS/bin/resample.py` instead.
    resample=False,
    # -- STFT --
    fft_size=768,
    win_length=768,
    preemphasis=0.0,
    # -- mel filterbank range --
    mel_fmin=0.0,
    mel_fmax=8000,
    # -- silence trimming --
    do_trim_silence=True,
    trim_db=24.0,
    # -- level / normalisation --
    signal_norm=False,
    ref_level_db=20,
    spec_gain=1.0,
    log_func="np.log",
)

# Model and training configuration. This is the config that is saved with the run
# for future use (e.g. inference with `tts --config_path ...`).
config = Tacotron2Config(
    audio=audio_config,
    # Upper bound on decoder iterations when synthesizing. The previous value of
    # 100000 let every test sentence run the full 100k steps while the stop-token
    # predictor is still untrained (the training log shows
    # "Decoder stopped with `max_decoder_steps` 100000"), stalling each evaluation.
    # 3000 still covers the longest allowed clip: 33 s * 22050 Hz / hop 256 is about
    # 2843 frames, and each decoder step emits `r` (>= 2 here) frames.
    max_decoder_steps=3000,
    # -- batching / data loading --
    batch_size=64,
    eval_batch_size=16,
    num_loader_workers=4,
    num_eval_loader_workers=4,
    # -- evaluation --
    run_eval=True,
    test_delay_epochs=-1,
    # -- decoder reduction factor and its schedule: [start_step, r, batch_size] --
    r=6,
    gradual_training=[[0, 6, 64], [10000, 4, 32], [50000, 3, 32], [100000, 2, 32]],
    double_decoder_consistency=True,
    epochs=1000,
    # -- text front-end: Polish phonemes via espeak --
    phonemizer="espeak",
    use_phonemes=True,
    phoneme_language="pl",
    phoneme_cache_path=os.path.join(output_path, "phoneme_cache"),
    # -- logging --
    print_step=150,
    print_eval=True,
    mixed_precision=False,
    # -- sample filtering by text length (characters) and audio length (samples) --
    min_text_len=115,
    max_text_len=250,
    min_audio_len=0,
    max_audio_len=22050 * 33,  # 33 seconds at 22050 Hz
    output_path=output_path,
    datasets=[dataset_config],
    use_speaker_embedding=True,  # set this to enable multi-speaker training
    # Disable SSIM and spectral-difference losses; SSIM causes NaN for some runs.
    decoder_ssim_alpha=0.0,
    postnet_ssim_alpha=0.0,
    postnet_diff_spec_alpha=0.0,
    decoder_diff_spec_alpha=0.0,
    attention_norm="softmax",
    # -- optimisation: constant learning rate, no scheduler --
    optimizer="Adam",
    lr_scheduler=None,
    lr=3e-5,
    # Sentences synthesized during evaluation to monitor progress.
    test_sentences=[
        "Mrowisko jest bardziej wioską.",
        "Zejście po pionowej łodydze.",
        "że choćby mi przyszło porzucić cię.",
        "Pod domami płynie rzeka tłumu. Ulica jest szeroka jak bulwar wielkomiejski.",
        "Były to ogromne wiechcie piór.",
    ],
)

## INITIALIZE THE AUDIO PROCESSOR
# Audio processor is used for feature extraction and audio I/O.
# It mainly serves the dataloader and the training loggers.
ap = AudioProcessor.init_from_config(config)

# INITIALIZE THE TOKENIZER
# Tokenizer is used to convert text to sequences of token IDs.
# If characters are not defined in the config, default characters are passed to the config,
# which is why the (possibly updated) config is rebound here.
tokenizer, config = TTSTokenizer.init_from_config(config)

# LOAD DATA SAMPLES
# Each sample is a list of ```[text, audio_file_path, speaker_name]```
# You can define your custom sample loader returning the list of samples.
# Or define your custom formatter and pass it to the `load_tts_samples`.
# Check `TTS.tts.datasets.load_tts_samples` for more details.
train_samples, eval_samples = load_tts_samples(
    dataset_config,
    eval_split=True,
    eval_split_max_size=config.eval_split_max_size,
    eval_split_size=config.eval_split_size,
)

# Fail fast on an empty dataset. Previously this case only printed a message and then
# went on to build the model and trainer, which would fail later with a far less
# obvious error.
if not train_samples or not eval_samples:
    raise ValueError("Error: Training or evaluation samples are empty.")

print("Number of training samples:", len(train_samples))
print("Number of evaluation samples:", len(eval_samples))

# init speaker manager for multi-speaker training
# it mainly handles speaker-id to speaker-name for the model and the data-loader
speaker_manager = SpeakerManager()
speaker_manager.set_ids_from_data(train_samples + eval_samples, parse_key="speaker_name")

# init model
model = Tacotron2(config, ap, tokenizer, speaker_manager)

# INITIALIZE THE TRAINER
# Trainer provides a generic API to train all the 🐸TTS models with all its perks like mixed-precision training,
# distributed training, etc.
trainer = Trainer(
    TrainerArgs(), config, output_path, model=model, train_samples=train_samples, eval_samples=eval_samples
)
# AND... 3,2,1... 🚀
trainer.fit()

 > Setting up Audio Processor...
 | > sample_rate:22050
 | > resample:False
 | > num_mels:80
 | > log_func:np.log
 | > min_level_db:-100
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:20
 | > fft_size:768
 | > power:1.5
 | > preemphasis:0.0
 | > griffin_lim_iters:60
 | > signal_norm:False
 | > symmetric_norm:True
 | > mel_fmin:0
 | > mel_fmax:8000
 | > pitch_fmin:1.0
 | > pitch_fmax:640.0
 | > spec_gain:1.0
 | > stft_pad_mode:reflect
 | > max_norm:4.0
 | > clip_norm:True
 | > do_trim_silence:True
 | > trim_db:24.0
 | > do_sound_norm:False
 | > do_amp_to_db_linear:True
 | > do_amp_to_db_mel:True
 | > do_rms_norm:False
 | > db_level:None
 | > stats_path:None
 | > base:2.718281828459045
 | > hop_length:256
 | > win_length:768
 | > Found 100 files in /content/drive/MyDrive/Software_proj/Tacatron/tts_train_dir
Number of training samples: 99
Number of evaluation samples: 1
 > Init speaker_embedding layer.


 > Training Environment:
 | > Backend: Torch
 | > Mixed precision: False
 | > Precision: float32
 | > Current device: 0
 | > Num. of GPUs: 1
 | > Num. of CPUs: 2
 | > Num. of Torch Threads: 1
 | > Torch seed: 54321
 | > Torch CUDNN: True
 | > Torch CUDNN deterministic: False
 | > Torch CUDNN benchmark: False
 | > Torch TF32 MatMul: False
 > Start Tensorboard: tensorboard --logdir=/content/drive/MyDrive/Software_proj/Tacatron/tts_train_dir/run-November-08-2023_02+54PM-0000000

 > Model has 56706292 parameters


 > `speakers.pth` is saved to /content/drive/MyDrive/Software_proj/Tacatron/tts_train_dir/run-November-08-2023_02+54PM-0000000/speakers.pth.
 > `speakers_file` is updated in the config.json.



[4m[1m > EPOCH: 0/1000[0m
 --> /content/drive/MyDrive/Software_proj/Tacatron/tts_train_dir/run-November-08-2023_02+54PM-0000000

[1m > TRAINING (2023-11-08 14:54:36) [0m



 > Number of output frames: 6


> DataLoader initialization
| > Tokenizer:
	| > add_blank: False
	| > use_eos_bos: False
	| > use_phonemes: True
	| > phonemizer:
		| > phoneme language: pl
		| > phoneme backend: espeak
| > Number of instances : 99
 | > Preprocessing samples
 | > Max text length: 250
 | > Min text length: 115
 | > Avg text length: 184.26262626262627
 | 
 | > Max audio length: 329670.0
 | > Min audio length: 220522.0
 | > Avg audio length: 271263.9696969697
 | > Num. instances discarded samples: 0
 | > Batch group size: 0.


  return _VF.meshgrid(tensors, **kwargs)  # type: ignore[attr-defined]

[1m   --> TIME: 2023-11-08 14:54:51 -- STEP: 0/2 -- GLOBAL_STEP: 0[0m
     | > decoder_loss: 40.21834182739258  (40.21834182739258)
     | > postnet_loss: 42.37211227416992  (42.37211227416992)
     | > stopnet_loss: 0.7851433753967285  (0.7851433753967285)
     | > decoder_coarse_loss: 40.200164794921875  (40.200164794921875)
     | > decoder_ddc_loss: 0.0013703416334465146  (0.0013703416334465146)
     | > ga_loss: 0.0016760959988459945  (0.0016760959988459945)
     | > loss: 31.491519927978516  (31.491519927978516)
     | > align_error: 0.9948520567268133  (0.9948520567268133)
     | > grad_norm: tensor(2.6958, device='cuda:0')  (tensor(2.6958, device='cuda:0'))
     | > current_lr: 3e-05 
     | > step_time: 8.7681  (8.768144369125366)
     | > loader_time: 6.0354  (6.035442113876343)






[1m > EVALUATION [0m





> DataLoader initialization
| > Tokenizer:
	| > add_blank: False
	| > use_eos_bos: False
	| > use_phonemes: True
	| > phonemizer:
		| > phoneme language: pl
		| > phoneme backend: espeak
| > Number of instances : 1
 | > Preprocessing samples
 | > Max text length: 180
 | > Min text length: 180
 | > Avg text length: 180.0
 | 
 | > Max audio length: 232209.0
 | > Min audio length: 232209.0
 | > Avg audio length: 232209.0
 | > Num. instances discarded samples: 0
 | > Batch group size: 0.


[1m   --> STEP: 0[0m
     | > decoder_loss: 28.862407684326172  (28.862407684326172)
     | > postnet_loss: 28.824796676635742  (28.824796676635742)
     | > stopnet_loss: 0.9682840704917908  (0.9682840704917908)
     | > decoder_coarse_loss: 28.829214096069336  (28.829214096069336)
     | > decoder_ddc_loss: 0.00012419026461429894  (0.00012419026461429894)
     | > ga_loss: 0.001495089614763856  (0.001495089614763856)
     | > loss: 22.604894638061523  (22.604894638061523)
     | > align_error: 0.9951699892990291  (0.9951699892990291)



 | > Synthesizing test sentences.
mrɔvˈiskɔ jɛzd bˈardʑɛj vʲˈɔskɔ̃.
 [!] Character '̃' not found in the vocabulary. Discarding it.
   > Decoder stopped with `max_decoder_steps` 100000


In [None]:
import locale
# Colab workaround: force the reported preferred encoding to UTF-8.
# NOTE(review): presumably needed so later `!` shell cells do not fail with a
# "UTF-8 locale is required" error -- confirm it is still necessary.
locale.getpreferredencoding = lambda: "UTF-8"

import glob, os
output_path = "/content/drive/MyDrive/Software_proj/Tacatron/tts_train_dir/"


def find_run_files(run_root, pattern, exclude=()):
    """Return the sorted paths matching ``<run_root>/<run_dir>/<pattern>``.

    Args:
        run_root: Directory that contains one sub-directory per training run.
        pattern: Glob pattern for the file name inside each run directory.
        exclude: Base names to drop from the result.

    Returns:
        A sorted list of matching paths (empty when nothing matches).
    """
    matches = glob.glob(os.path.join(run_root, "*", pattern))
    return sorted(path for path in matches if os.path.basename(path) not in exclude)


# Every run directory also holds a `speakers.pth` (the speaker-ID map saved by the
# trainer). It is not a model checkpoint, so keep it out of `ckpts`.
ckpts = find_run_files(output_path, "*.pth", exclude=("speakers.pth",))
configs = find_run_files(output_path, "*.json")

print(ckpts, configs)

['/content/drive/MyDrive/Software_proj/Tacatron/tts_train_dir/run-November-08-2023_01+01PM-0000000/best_model.pth', '/content/drive/MyDrive/Software_proj/Tacatron/tts_train_dir/run-November-08-2023_01+01PM-0000000/best_model_210.pth', '/content/drive/MyDrive/Software_proj/Tacatron/tts_train_dir/run-November-08-2023_01+01PM-0000000/speakers.pth', '/content/drive/MyDrive/Software_proj/Tacatron/tts_train_dir/run-November-08-2023_02+50PM-0000000/speakers.pth', '/content/drive/MyDrive/Software_proj/Tacatron/tts_train_dir/run-November-08-2023_02+54PM-0000000/speakers.pth'] ['/content/drive/MyDrive/Software_proj/Tacatron/tts_train_dir/run-November-08-2023_01+01PM-0000000/config.json', '/content/drive/MyDrive/Software_proj/Tacatron/tts_train_dir/run-November-08-2023_02+50PM-0000000/config.json', '/content/drive/MyDrive/Software_proj/Tacatron/tts_train_dir/run-November-08-2023_02+54PM-0000000/config.json']


In [None]:
import os

# Pick the first real model checkpoint. `ckpts` may also list each run's
# `speakers.pth` (the speaker-ID map, not model weights), so skip those rather
# than trusting whatever happens to sort first.
model_path = next((p for p in ckpts if os.path.basename(p) != "speakers.pth"), None)
if model_path is None:
    raise FileNotFoundError("No model checkpoint (*.pth other than speakers.pth) found in `ckpts`.")

# Use the config saved in the same run directory as the checkpoint. Indexing
# `configs[0]` independently could pair the model with another run's config.
config_path = os.path.join(os.path.dirname(model_path), "config.json")
speaker_idx = 'ljspeech-9'

# Arguments are quoted so paths containing spaces survive shell word-splitting.
!tts --text "a oprócz tego kilka tysięcy i namioty." \
      --model_path "$model_path" \
      --config_path "$config_path" \
      --speaker_idx "$speaker_idx" \
      --out_path out.wav

 > Using model: tacotron2
 > Setting up Audio Processor...
 | > sample_rate:22050
 | > resample:False
 | > num_mels:80
 | > log_func:np.log
 | > min_level_db:-100
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:20
 | > fft_size:768
 | > power:1.5
 | > preemphasis:0.0
 | > griffin_lim_iters:60
 | > signal_norm:False
 | > symmetric_norm:True
 | > mel_fmin:0
 | > mel_fmax:8000.0
 | > pitch_fmin:1.0
 | > pitch_fmax:640.0
 | > spec_gain:1.0
 | > stft_pad_mode:reflect
 | > max_norm:4.0
 | > clip_norm:True
 | > do_trim_silence:True
 | > trim_db:24
 | > do_sound_norm:False
 | > do_amp_to_db_linear:True
 | > do_amp_to_db_mel:True
 | > do_rms_norm:False
 | > db_level:None
 | > stats_path:None
 | > base:2.718281828459045
 | > hop_length:256
 | > win_length:768
 > Init speaker_embedding layer.
 > Model's reduction rate `r` is set to: 2
 > Text: a oprócz tego kilka tysięcy i namioty.
 > Text splitted to sentences.
['a oprócz tego kilka tysięcy i namioty.']
   > Decoder stopped 