# Main Script HiFiGAN

### Imports

In [1]:
try: 
    import librosa
except:
    !pip install librosa
try: 
    import optuna, plotly
except:
    !pip install optuna
    !pip install plotly
try:
    from audio_diffusion_pytorch import DiffusionVocoder, UNetV0, LTPlugin, VDiffusion, VSampler
except:
    !pip install audio-diffusion-pytorch
#Set Dir 
import sys, os
sys.path.append(os.path.abspath('..'))

# Torch
import torch
from torch import nn, Tensor
from torch.utils.data import DataLoader, Subset
from torch.nn.utils import weight_norm 
from torch.nn import utils
import torch.optim as optim
import torch.nn.functional as F
import optuna, plotly


# Utils
import numpy as np
from numpy import ndarray
import logging, librosa, itertools, tensorboard
from typing import Sequence, Optional, Callable
from audio_diffusion_pytorch import DiffusionVocoder, UNetV0, LTPlugin, VDiffusion, VSampler


# Base Scripts
from Libraries.Utils import *
from MainScripts.Conf import conf

### Config

### General

In [2]:
remote_kernel: bool = True #Set to true if using a remote Kernel changes the file structure
model_name: str = "Diffusion_Vocoder_v1"
training_label_name: str = "training_full_wave"
full_model_path: str = path_to_remote_path("{}/{}".format(conf["paths"].model_path, model_name + ".pth"), remote_kernel)

Logging

In [3]:
logging_level: int = LIGHT_DEBUG
logging.basicConfig(level=logging_level, format='%(asctime)s - %(levelname)s - %(message)s')
logger: logging.Logger = logging.getLogger(__name__)

Training Params

In [4]:
device: str = "cuda" if torch.cuda.is_available() else "cpu"
n_training_samples: int = 2496 // 2
n_val_samples: int = 200
batch_size: int = 8
tensor_wave_dim: list = [batch_size, 1, 2**17] #B, C, H = Batch, channels, Time domain
tensor_mel_dim: list = [batch_size, 96, 512]
learning_rate: float = 2e-4
betas = [0.8, 0.99]
epochs: int = 300
restart_training: bool = True
checkpoint_freq: int = 5
num_workers: int = 4


### Data Loading

In [5]:
audio_data: ndarray = load_training_data(path_to_remote_path("{}/{}".format(conf["paths"].data_path, training_label_name + ".npy"), remote_kernel))

np.random.seed(50)
indicies: ndarray = np.arange(audio_data.shape[0])
np.random.shuffle(indicies)
audio_data = audio_data[indicies]

data_loader = create_dataloader(Audio_Data(audio_data[:n_training_samples]), batch_size, num_workers)
validation_dataloader = create_dataloader(Audio_Data(audio_data[:-n_val_samples]), batch_size, num_workers)

2025-06-23 17:03:40,510 - LIGHT_DEBUG - Ndarray loaded from Data/training_full_wave.npy of shape: (6867, 131072)


### Original Impl

In [6]:
vocoder = DiffusionVocoder(
    mel_n_fft=1024,
    mel_channels=96,
    mel_sample_rate=32000,
    mel_normalize_log=True, 
    net_t=UNetV0,
    channels=[128, 128, 256, 512],
    factors=[2, 2, 4, 4],
    items=[4, 4, 6, 6],
    diffusion_t=VDiffusion,
    sampler_t=VSampler,
).to(device)

Model Loading

In [7]:
optimizer = optim.AdamW(vocoder.parameters(), lr=learning_rate, betas=betas)
scheduler = optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.98)
start_epoch: int = 0
if os.path.exists(full_model_path):
    model = torch.load(full_model_path, map_location=device)
    vocoder.load_state_dict(model["model"])
    if not restart_training:
        optimizer.load_state_dict(model["optim"])
        scheduler.load_state_dict(model["scheduler"])
        start_epoch = model.get("epoch", 0)
    logger.info(f"Model {model_name} loaded with {count_parameters(vocoder)} Parameters")
else: 
    logger.info(f"Model {model_name} created with {count_parameters(vocoder)} Parameters")

2025-06-23 17:03:47,759 - INFO - Model Diffusion_Vocoder_v1 created with ~51.49M Parameters


Initiate Trainer

In [8]:
trainer = Trainer(vocoder, optimizer, scheduler, device, n_dims=1)

### Training

In [None]:
train_loss, val_loss = trainer.train(train_dataset=data_loader,
            val_dataset=validation_dataloader,
            n_epochs=100,
            full_model_path=full_model_path,
            checkpoint_freq=checkpoint_freq,
            patience=-1,
            gradient_clip_norm=None,
            gradient_clip_val=None,
            sample_freq=None,
            )
scatter_plot(train_loss)
scatter_plot(val_loss)

2025-06-23 17:03:47,952 - INFO - Training started on cuda


2025-06-23 17:06:32,000 - LIGHT_DEBUG - Batch 156/156 Loss: 0.100 Min/Max params: -2.313, 2.126


2025-06-23 17:11:42,971 - INFO - Epoch 001: Avg. Loss: 2.16561e-01 Avg. val. Loss: 1.38616e-01 Remaining Time: 13h 03min 46s LR: 1.96000e-04 


2025-06-23 17:14:27,000 - LIGHT_DEBUG - Batch 156/156 Loss: 0.082 Min/Max params: -2.313, 2.125


2025-06-23 17:19:37,617 - INFO - Epoch 002: Avg. Loss: 1.26975e-01 Avg. val. Loss: 1.23887e-01 Remaining Time: 12h 55min 33s LR: 1.92080e-04 


2025-06-23 17:22:21,000 - LIGHT_DEBUG - Batch 156/156 Loss: 0.112 Min/Max params: -2.314, 2.126


2025-06-23 17:27:32,273 - INFO - Epoch 003: Avg. Loss: 1.18992e-01 Avg. val. Loss: 1.10952e-01 Remaining Time: 12h 47min 32s LR: 1.88238e-04 


2025-06-23 17:30:16,000 - LIGHT_DEBUG - Batch 156/156 Loss: 0.098 Min/Max params: -2.314, 2.125


2025-06-23 17:35:27,063 - INFO - Epoch 004: Avg. Loss: 1.18356e-01 Avg. val. Loss: 1.14076e-01 Remaining Time: 12h 39min 38s LR: 1.84474e-04 


2025-06-23 17:38:11,000 - LIGHT_DEBUG - Batch 156/156 Loss: 0.079 Min/Max params: -2.319, 2.128


2025-06-23 17:43:21,446 - INFO - Epoch 005: Avg. Loss: 1.18155e-01 Avg. val. Loss: 1.14091e-01 Remaining Time: 12h 31min 36s LR: 1.80784e-04 
2025-06-23 17:43:22,053 - LIGHT_DEBUG - Checkpoint saved model to Models/Diffusion_Vocoder_v1_epoch_005.pth


2025-06-23 17:46:06,000 - LIGHT_DEBUG - Batch 156/156 Loss: 0.173 Min/Max params: -2.324, 2.133


2025-06-23 17:51:16,615 - INFO - Epoch 006: Avg. Loss: 1.25059e-01 Avg. val. Loss: 1.19373e-01 Remaining Time: 12h 23min 39s LR: 1.77168e-04 


2025-06-23 17:54:00,000 - LIGHT_DEBUG - Batch 156/156 Loss: 0.155 Min/Max params: -2.329, 2.135


2025-06-23 17:59:11,034 - INFO - Epoch 007: Avg. Loss: 1.19029e-01 Avg. val. Loss: 1.39597e-01 Remaining Time: 12h 15min 41s LR: 1.73625e-04 


2025-06-23 18:01:55,000 - LIGHT_DEBUG - Batch 156/156 Loss: 0.090 Min/Max params: -2.332, 2.137


2025-06-23 18:07:05,405 - INFO - Epoch 008: Avg. Loss: 1.27632e-01 Avg. val. Loss: 1.15976e-01 Remaining Time: 12h 07min 43s LR: 1.70153e-04 


2025-06-23 18:09:49,000 - LIGHT_DEBUG - Batch 156/156 Loss: 0.207 Min/Max params: -2.334, 2.136


2025-06-23 18:14:59,852 - INFO - Epoch 009: Avg. Loss: 1.31782e-01 Avg. val. Loss: 1.18716e-01 Remaining Time: 11h 59min 47s LR: 1.66750e-04 


2025-06-23 18:17:44,000 - LIGHT_DEBUG - Batch 156/156 Loss: 0.167 Min/Max params: -2.333, 2.137


2025-06-23 18:22:53,944 - INFO - Epoch 010: Avg. Loss: 1.14414e-01 Avg. val. Loss: 1.12327e-01 Remaining Time: 11h 51min 48s LR: 1.63415e-04 
2025-06-23 18:22:54,561 - LIGHT_DEBUG - Models/Diffusion_Vocoder_v1_epoch_005.pth deleted
2025-06-23 18:22:54,562 - LIGHT_DEBUG - Checkpoint saved model to Models/Diffusion_Vocoder_v1_epoch_010.pth


2025-06-23 18:25:38,000 - LIGHT_DEBUG - Batch 156/156 Loss: 0.076 Min/Max params: -2.335, 2.138


2025-06-23 18:30:48,760 - INFO - Epoch 011: Avg. Loss: 1.16484e-01 Avg. val. Loss: 1.10507e-01 Remaining Time: 11h 43min 51s LR: 1.60146e-04 


2025-06-23 18:33:33,000 - LIGHT_DEBUG - Batch 156/156 Loss: 0.204 Min/Max params: -2.337, 2.141


2025-06-23 18:38:43,029 - INFO - Epoch 012: Avg. Loss: 1.15887e-01 Avg. val. Loss: 1.08803e-01 Remaining Time: 11h 35min 54s LR: 1.56943e-04 


2025-06-23 18:41:27,000 - LIGHT_DEBUG - Batch 156/156 Loss: 0.098 Min/Max params: -2.337, 2.143


2025-06-23 18:46:37,410 - INFO - Epoch 013: Avg. Loss: 1.14181e-01 Avg. val. Loss: 1.12903e-01 Remaining Time: 11h 27min 59s LR: 1.53804e-04 


2025-06-23 18:49:21,000 - LIGHT_DEBUG - Batch 156/156 Loss: 0.219 Min/Max params: -2.338, 2.145


2025-06-23 18:54:31,563 - INFO - Epoch 014: Avg. Loss: 1.26481e-01 Avg. val. Loss: 1.46629e-01 Remaining Time: 11h 20min 03s LR: 1.50728e-04 


2025-06-23 18:57:15,000 - LIGHT_DEBUG - Batch 156/156 Loss: 0.822 Min/Max params: -2.339, 2.145


2025-06-23 19:02:25,490 - INFO - Epoch 015: Avg. Loss: 1.91804e-01 Avg. val. Loss: 6.75335e-01 Remaining Time: 11h 12min 05s LR: 1.47714e-04 
2025-06-23 19:02:26,120 - LIGHT_DEBUG - Models/Diffusion_Vocoder_v1_epoch_010.pth deleted
2025-06-23 19:02:26,121 - LIGHT_DEBUG - Checkpoint saved model to Models/Diffusion_Vocoder_v1_epoch_015.pth


2025-06-23 19:05:10,000 - LIGHT_DEBUG - Batch 156/156 Loss: 0.086 Min/Max params: -2.339, 2.145


2025-06-23 19:10:20,090 - INFO - Epoch 016: Avg. Loss: 1.97970e-01 Avg. val. Loss: 1.13151e-01 Remaining Time: 11h 04min 08s LR: 1.44760e-04 


2025-06-23 19:13:04,000 - LIGHT_DEBUG - Batch 156/156 Loss: 0.139 Min/Max params: -2.339, 2.145


2025-06-23 19:18:14,001 - INFO - Epoch 017: Avg. Loss: 1.15628e-01 Avg. val. Loss: 1.21063e-01 Remaining Time: 10h 56min 12s LR: 1.41864e-04 


2025-06-23 19:20:58,000 - LIGHT_DEBUG - Batch 156/156 Loss: 0.098 Min/Max params: -2.339, 2.145


2025-06-23 19:26:08,198 - INFO - Epoch 018: Avg. Loss: 2.03997e-01 Avg. val. Loss: 1.46542e-01 Remaining Time: 10h 48min 16s LR: 1.39027e-04 


2025-06-23 19:28:52,000 - LIGHT_DEBUG - Batch 156/156 Loss: 0.104 Min/Max params: -2.338, 2.145


2025-06-23 19:34:02,466 - INFO - Epoch 019: Avg. Loss: 1.24813e-01 Avg. val. Loss: 1.25734e-01 Remaining Time: 10h 40min 22s LR: 1.36247e-04 


2025-06-23 19:36:46,000 - LIGHT_DEBUG - Batch 156/156 Loss: 0.081 Min/Max params: -2.339, 2.145


2025-06-23 19:41:56,910 - INFO - Epoch 020: Avg. Loss: 1.40714e-01 Avg. val. Loss: 1.27977e-01 Remaining Time: 10h 32min 28s LR: 1.33522e-04 
2025-06-23 19:41:57,548 - LIGHT_DEBUG - Models/Diffusion_Vocoder_v1_epoch_015.pth deleted
2025-06-23 19:41:57,549 - LIGHT_DEBUG - Checkpoint saved model to Models/Diffusion_Vocoder_v1_epoch_020.pth


2025-06-23 19:44:19,000 - LIGHT_DEBUG - Batch 135/156 Loss: 0.136 Min/Max params: -2.338, 2.145