In [32]:
import os

import pandas as pd
from trainer import Trainer, TrainerArgs

# GlowTTSConfig: all model related values for training, validating and testing.
from TTS.tts.configs.glow_tts_config import GlowTTSConfig

# BaseDatasetConfig: defines name, formatter and path of the dataset.
from TTS.tts.configs.shared_configs import BaseDatasetConfig
from TTS.tts.datasets import load_tts_samples
from TTS.tts.models.glow_tts import GlowTTS
from TTS.tts.utils.text.tokenizer import TTSTokenizer
from TTS.utils.audio import AudioProcessor
from TTS.config.shared_configs import BaseAudioConfig
from TTS.tts.utils.speakers import SpeakerManager

# Build the training metadata file: one "<utterance id>|<transcript>" row per
# sample, written without header or index.

# The target directory must exist before `to_csv` writes into it; on a fresh
# run nothing has created it yet at this point of the notebook.
os.makedirs('./tts_train_dir', exist_ok=True)

df_train = (
    pd.read_csv('train.csv', delimiter='|')[['wav_filename', 'transcript']]
    # Small subset for a quick experiment.
    .head(100)
    # `.assign` returns a new frame instead of writing into a column slice.
    .assign(
        # Keep only the file stem: drop leading directories and everything
        # from the first '.' onwards.
        wav_filename=lambda df: df['wav_filename'].apply(lambda x: x.split('/')[-1].split('.')[0])
    )
)

df_train.to_csv('./tts_train_dir/train.csv', sep='|', index=False, header=False)

df_train.head(10)

Unnamed: 0,wav_filename,transcript
0,6892_8764_000976,Wokulski przybiegł do niej. Podała mu rękę i w...
1,6892_10920_001533,I ty jeszcze będziesz śmiał bronić Ramzesa?. —...
2,6892_8338_000287,"Choć jest ona w nas, to przecież rozumieć jej ..."
3,6892_10462_000578,Przyszła chwila rozkwitu dla nowe nauki. Nasza...
4,7014_6834_000368,Zresztą nie zakazuję ci żebrać w przerwach mię...
5,6892_8912_000729,"Proszę pana, tu panowie wysiadają. Pan baron j..."
6,6892_10674_000209,Prawdę mówisz — wtrącił Tutmozis. — To samo i ...
7,6892_8912_001912,Więc to Maruszewicz! — pomyślałem. — Ładna par...
8,7014_6834_001294,"Jaśnie Oświecony książe, racz pozwolić przyjec..."
9,6892_8912_000556,Czy jesteś tego pewna? — Najzupełniej. Zresztą...


In [33]:
import os

# BaseDatasetConfig: defines name, formatter and path of the dataset.
from TTS.tts.configs.shared_configs import BaseDatasetConfig

# Directory used below for the dataset path, the phoneme cache and the training output.
output_path = "tts_train_dir"
# `exist_ok=True` keeps the cell idempotent and removes the check-then-create race.
os.makedirs(output_path, exist_ok=True)

In [34]:
# Dataset description: "ljspeech" formatter, metadata file `train.csv`,
# rooted at `output_path`.
dataset_root = os.path.join(output_path, "")  # joining with "" appends a trailing separator
dataset_config = BaseDatasetConfig(
    formatter="ljspeech",
    meta_file_train="train.csv",
    path=dataset_root,
)

# Audio settings: 24 kHz with resampling enabled, silence trimming at 23 dB.
audio_config = BaseAudioConfig(
    sample_rate=24000,
    resample=True,
    do_trim_silence=True,
    trim_db=23.0,
)


In [35]:
# Model/training configuration for GlowTTS.
config = GlowTTSConfig(
    # Attach the audio settings so they are stored with the model config;
    # otherwise the saved config carries default audio values that differ from
    # the ones the training AudioProcessor is built with, and inference from
    # that config will not match training.
    audio=audio_config,
    batch_size=64,
    eval_batch_size=16,
    num_loader_workers=4,
    num_eval_loader_workers=4,
    precompute_num_workers=4,
    run_eval=True,
    test_delay_epochs=-1,
    epochs=100,
    # Text front-end: Polish phonemes produced through espeak, cached on disk.
    text_cleaner="phoneme_cleaners",
    use_phonemes=True,
    phonemizer="espeak",
    phoneme_language="pl",
    phoneme_cache_path=os.path.join(output_path, "phoneme_cache2"),
    print_step=25,
    print_eval=False,
    mixed_precision=True,
    output_path=output_path,
    datasets=[dataset_config],
    use_speaker_embedding=True,
    # Length limits applied to samples (text length and audio length).
    min_text_len=0,
    max_text_len=500,
    min_audio_len=100000,
    max_audio_len=400000,
)

In [36]:
# Build the audio processor from the audio settings in `audio_config`.
ap = AudioProcessor.init_from_config(audio_config)

 > Setting up Audio Processor...
 | > sample_rate:24000
 | > resample:True
 | > num_mels:80
 | > log_func:np.log10
 | > min_level_db:-100
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:20
 | > fft_size:1024
 | > power:1.5
 | > preemphasis:0.0
 | > griffin_lim_iters:60
 | > signal_norm:True
 | > symmetric_norm:True
 | > mel_fmin:0
 | > mel_fmax:None
 | > pitch_fmin:1.0
 | > pitch_fmax:640.0
 | > spec_gain:20.0
 | > stft_pad_mode:reflect
 | > max_norm:4.0
 | > clip_norm:True
 | > do_trim_silence:True
 | > trim_db:23.0
 | > do_sound_norm:False
 | > do_amp_to_db_linear:True
 | > do_amp_to_db_mel:True
 | > do_rms_norm:False
 | > db_level:None
 | > stats_path:None
 | > base:10
 | > hop_length:256
 | > win_length:1024


In [37]:
from TTS.tts.utils.text.tokenizer import TTSTokenizer
from TTS.utils.audio import AudioProcessor

# Build the tokenizer from the model config. The call also returns a config
# object, which replaces `config` from here on.
tokenizer, config = TTSTokenizer.init_from_config(config)

In [38]:
# Load the samples described by `dataset_config` and split off an evaluation
# set, using the split sizes stored on the model config.
split_options = dict(
    eval_split=True,
    eval_split_max_size=config.eval_split_max_size,
    eval_split_size=config.eval_split_size,
)
train_samples, eval_samples = load_tts_samples(dataset_config, **split_options)

 | > Found 100 files in /home/lap/study/software_proj/TTS/GlowTTS/tts_train_dir


In [39]:
# Collect speaker IDs from every sample, train and eval alike, keyed by the
# "speaker_name" entry of each sample.
all_samples = train_samples + eval_samples
speaker_manager = SpeakerManager()
speaker_manager.set_ids_from_data(all_samples, parse_key="speaker_name")

# Record the speaker count on the config before building the model.
config.num_speakers = speaker_manager.num_speakers

model = GlowTTS(config, ap, tokenizer, speaker_manager=speaker_manager)


 > Init speaker_embedding layer.




In [40]:
# Wire the model, config and sample lists into the trainer, with default TrainerArgs.
trainer = Trainer(
    TrainerArgs(),
    config,
    output_path,
    model=model,
    train_samples=train_samples,
    eval_samples=eval_samples,
)



fatal: not a git repository (or any of the parent directories): .git
fatal: not a git repository (or any of the parent directories): .git
 > Training Environment:
 | > Backend: Torch
 | > Mixed precision: True
 | > Precision: fp16
 | > Num. of CPUs: 4
 | > Num. of Torch Threads: 2
 | > Torch seed: 54321
 | > Torch CUDNN: True
 | > Torch CUDNN deterministic: False
 | > Torch CUDNN benchmark: False
 | > Torch TF32 MatMul: False
 > Start Tensorboard: tensorboard --logdir=tts_train_dir/run-October-23-2023_01+18AM-0000000

 > Model has 32333713 parameters


 > `speakers.pth` is saved to tts_train_dir/run-October-23-2023_01+18AM-0000000/speakers.pth.
 > `speakers_file` is updated in the config.json.


In [41]:
# AND... 3,2,1... 🚀
# Start the training run on the trainer built in the previous cell.
trainer.fit()


[4m[1m > EPOCH: 0/100[0m
 --> tts_train_dir/run-October-23-2023_01+18AM-0000000

[1m > TRAINING (2023-10-23 01:18:51) [0m




> DataLoader initialization
| > Tokenizer:
	| > add_blank: False
	| > use_eos_bos: False
	| > use_phonemes: True
	| > phonemizer:
		| > phoneme language: pl
		| > phoneme backend: espeak
| > Number of instances : 99
 | > Preprocessing samples
 | > Max text length: 250
 | > Min text length: 115
 | > Avg text length: 183.6161616161616
 | 
 | > Max audio length: 358822.0
 | > Min audio length: 240022.0
 | > Avg audio length: 294283.83838383836
 | > Num. instances discarded samples: 0
 | > Batch group size: 0.


: 

: 

In [None]:
!tensorboard --logdir=tts_train_dir

TensorFlow installation not found - running with reduced feature set.

NOTE: Using experimental fast data loading logic. To disable, pass
    "--load_fast=false" and report issues on GitHub. More details:
    https://github.com/tensorflow/tensorboard/issues/4784

Serving TensorBoard on localhost; to expose to the network, use a proxy or pass --bind_all
TensorBoard 2.15.0 at http://localhost:6006/ (Press CTRL+C to quit)
^C


In [None]:
import glob, os
output_path = "tts_train_dir"

# Checkpoints (*.pth) and config files (*.json) are looked up one directory
# level below the output directory; both lists are sorted alphabetically.
ckpts = sorted(glob.glob(f"{output_path}/*/*.pth"))
configs = sorted(glob.glob(f"{output_path}/*/*.json"))

# Show the second checkpoint and the first config of those orderings.
print(ckpts[1], configs[0])

tts_train_dir/run-October-22-2023_11+08PM-0000000/best_model_4.pth tts_train_dir/run-October-22-2023_11+08PM-0000000/config.json


In [None]:
# Checkpoint and config of one specific training run.
# NOTE(review): the run folder name is hard-coded — update it to match the run
# you want to synthesize from.
path_model = os.path.join(output_path, "run-October-22-2023_11+08PM-0000000/best_model_4.pth")
path_config = os.path.join(output_path, "run-October-22-2023_11+08PM-0000000/config.json")
# Synthesize the sentence with the `tts` command-line tool and write it to out.wav.
# IPython substitutes `$path_model` and `$path_config` with the Python variables above.
!tts --text "Nigdy go nie|nigdy go nie kochała myślałem a ponieważ ojciec nie był zakorzeniony w sercu żadnej kobiety przeto nie mógł też wróśić w żadną realność i unosił się wiecznie na periferii życia w półrealnych regionach na krawędziach" \
      --model_path $path_model \
      --config $path_config \
      --out_path out.wav

 > Using model: glow_tts
 > Setting up Audio Processor...
 | > sample_rate:22050
 | > resample:False
 | > num_mels:80
 | > log_func:np.log10
 | > min_level_db:-100
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:20
 | > fft_size:1024
 | > power:1.5
 | > preemphasis:0.0
 | > griffin_lim_iters:60
 | > signal_norm:True
 | > symmetric_norm:True
 | > mel_fmin:0
 | > mel_fmax:None
 | > pitch_fmin:1.0
 | > pitch_fmax:640.0
 | > spec_gain:20.0
 | > stft_pad_mode:reflect
 | > max_norm:4.0
 | > clip_norm:True
 | > do_trim_silence:True
 | > trim_db:45
 | > do_sound_norm:False
 | > do_amp_to_db_linear:True
 | > do_amp_to_db_mel:True
 | > do_rms_norm:False
 | > db_level:None
 | > stats_path:None
 | > base:10
 | > hop_length:256
 | > win_length:1024
 > Text: Nigdy go nie|nigdy go nie kochała myślałem a ponieważ ojciec nie był zakorzeniony w sercu żadnej kobiety przeto nie mógł też wróśić w żadną realność i unosił się wiecznie na periferii życia w półrealnych regionach na krawędz