In [1]:
#vocoder

import os

from TTS.trainer import Trainer, TrainingArgs
from TTS.utils.audio import AudioProcessor
from TTS.vocoder.configs import HifiganConfig, MultibandMelganConfig
from TTS.vocoder.datasets.preprocess import load_wav_data
from TTS.vocoder.models.gan import GAN



# Directory handed to both the config and the Trainer below; the training log
# shows run folders being created underneath it.
output_path = "./model/vocoder/mbmelgan/"

# Multi-band MelGAN vocoder training configuration.
config = MultibandMelganConfig(
    # --- data / output locations ---
    # data_path is left empty: the wav lists are collected explicitly from
    # several directories further down and passed to the Trainer directly.
    data_path="",
    output_path=output_path,
    # --- batching and data loading ---
    batch_size=256,
    eval_batch_size=16,
    num_loader_workers=4,
    num_eval_loader_workers=4,
    seq_len=8192,
    pad_short=2000,
    use_noise_augment=True,
    # --- evaluation ---
    run_eval=True,
    eval_split_size=10,  # forwarded to load_wav_data for every dataset directory
    test_delay_epochs=5,
    # --- schedule / optimisation ---
    epochs=1000,
    mixed_precision=False,
    lr_gen=1e-4,
    lr_disc=1e-4,
    # NOTE(review): 0 presumably enables the discriminator from the first step
    # (D_* losses are already logged at GLOBAL_STEP 0) - confirm against TTS docs.
    steps_to_start_discriminator=0,
    # --- logging ---
    print_step=25,
    print_eval=False,
)


# Audio processor built from the audio section of the vocoder config.
ap = AudioProcessor(**config.audio.to_dict())

# Collect wav files from every dataset directory. load_wav_data is called once
# per directory with config.eval_split_size, and its two results are unpacked
# as (eval, train), so each directory contributes its own evaluation split.
eval_samples = []
train_samples = []
wav_dirs = [
    "./resample1/kss/wavs",
    "./resample1/pansori_tedxkr/wavs",
    "./resample1/zeroth_korean/wavs",
]
for wav_dir in wav_dirs:
    eval_part, train_part = load_wav_data(wav_dir, config.eval_split_size)
    eval_samples.extend(eval_part)
    train_samples.extend(train_part)

# Show the combined evaluation file list as the cell output.
eval_samples

 > Setting up Audio Processor...
 | > sample_rate:22050
 | > resample:False
 | > num_mels:80
 | > log_func:np.log10
 | > min_level_db:-100
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:20
 | > fft_size:1024
 | > power:1.5
 | > preemphasis:0.0
 | > griffin_lim_iters:60
 | > signal_norm:True
 | > symmetric_norm:True
 | > mel_fmin:0
 | > mel_fmax:None
 | > spec_gain:20.0
 | > stft_pad_mode:reflect
 | > max_norm:4.0
 | > clip_norm:True
 | > do_trim_silence:True
 | > trim_db:45
 | > do_sound_norm:False
 | > do_amp_to_db_linear:True
 | > do_amp_to_db_mel:True
 | > stats_path:None
 | > base:10
 | > hop_length:256
 | > win_length:1024


['./resample1/kss/wavs/4_0240.wav',
 './resample1/kss/wavs/1_1017.wav',
 './resample1/kss/wavs/4_3352.wav',
 './resample1/kss/wavs/4_3408.wav',
 './resample1/kss/wavs/4_4237.wav',
 './resample1/kss/wavs/3_3011.wav',
 './resample1/kss/wavs/3_3026.wav',
 './resample1/kss/wavs/3_3140.wav',
 './resample1/kss/wavs/4_3049.wav',
 './resample1/kss/wavs/2_0631.wav',
 './resample1/pansori_tedxkr/wavs/7J207ZiE7KCV-R15jiXaSYik-0133.wav',
 './resample1/pansori_tedxkr/wavs/67CV7J6E7Iic-ZBNO2Drz36c-0400.wav',
 './resample1/pansori_tedxkr/wavs/7Zmp64aN66y4-beK1Iw23nc8-0055.wav',
 './resample1/pansori_tedxkr/wavs/7KCV6riw7KCV-GJu8ZETMTZU-0240.wav',
 './resample1/pansori_tedxkr/wavs/7J207ISx67KU-znxAJsY__HM-0124.wav',
 './resample1/pansori_tedxkr/wavs/7J207J6Q656M-D35qys8YZpo-0136.wav',
 './resample1/pansori_tedxkr/wavs/6rmA7Zic7KCV-grgRnDg-o94-0052.wav',
 './resample1/pansori_tedxkr/wavs/7ISc66qF7IiZ-lGU_mqIdCAE-0310.wav',
 './resample1/pansori_tedxkr/wavs/6rWs7IK87Jil-jUe7EdiQP1c-0301.wav',
 './resamp

In [None]:


# Build the GAN vocoder model from the config defined above.
model = GAN(config)

# Extra objects made available to the trainer; here only the audio processor.
training_assets = {"audio_processor": ap}

# Wire up the trainer with default TrainingArgs, the config, the output
# directory and the sample lists collected earlier, then start training.
trainer = Trainer(
    TrainingArgs(),
    config,
    output_path,
    model=model,
    train_samples=train_samples,
    eval_samples=eval_samples,
    training_assets=training_assets,
)
trainer.fit()

 > Generator Model: multiband_melgan_generator
 > Discriminator Model: melgan_multiscale_discriminator


fatal: not a git repository (or any of the parent directories): .git
fatal: not a git repository (or any of the parent directories): .git


 > Using CUDA:  True
 > Number of GPUs:  1

 > Model has 6894446 parameters

[4m[1m > EPOCH: 0/1000[0m
 --> ./model/vocoder/mbmelgan/coqui_tts-December-14-2021_05+54PM-0000000

[1m > TRAINING (2021-12-14 17:54:41) [0m

[1m   --> STEP: 0/150 -- GLOBAL_STEP: 0[0m
     | > G_stft_loss_mg: 3.44786  (3.44786)
     | > G_stft_loss_sc: 1.84004  (1.84004)
     | > G_subband_stft_loss_mg: 3.43845  (3.43845)
     | > G_subband_stft_loss_sc: 1.75431  (1.75431)
     | > G_mse_fake_loss: 0.97940  (0.97940)
     | > G_gen_loss: 2.64395  (2.64395)
     | > G_adv_loss: 2.44851  (2.44851)
     | > loss_0: 5.09246  (5.09246)
     | > grad_norm_0: 13.79779  (13.79779)
     | > D_mse_gan_loss: 0.97922  (0.97922)
     | > D_mse_gan_real_loss: 0.66327  (0.66327)
     | > D_mse_gan_fake_loss: 2.1690791527362308e-06  (2.1690791527362308e-06)
     | > loss_1: 0.97922  (0.97922)
     | > grad_norm_1: 5.04049  (5.04049)
     | > current_lr_0: 0.00010 
     | > current_lr_1: 0.00010 
     | > step_time: 2.