# Main Script for a Vocoder

### Imports

In [1]:
try: 
    import librosa
except:
    !pip install librosa
try:
    from audio_diffusion_pytorch import DiffusionModel, UNetV0, LTPlugin, VDiffusion, VSampler
except:
    !pip install audio-diffusion-pytorch
try: 
    import a_unet
except:
    !pip install a_unet


#Set Dir 
import sys, os
sys.path.append(os.path.abspath('..'))

# Torch
import torch
from torch import nn, Tensor
from torch.utils.data import DataLoader, Subset
import torch.optim as optim
from audio_diffusion_pytorch import DiffusionModel, UNetV0, LTPlugin, VDiffusion, VSampler, DiffusionVocoder


# Utils
import numpy as np
from numpy import ndarray
import logging
from typing import Sequence, Optional, Callable


# Base Scripts
from Libraries.Utils import *
from MainScripts.Conf import conf

### Config

### General

In [2]:
# Set to True when the notebook runs on a remote kernel whose file structure
# differs from the local one; the flag is forwarded to path_to_remote_path.
remote_kernel: bool = False
# Base names (without extension) of the model checkpoint and the training data.
model_name: str = "vocoder_v1"
training_data_name: str = "training_full_wave"
# Checkpoint location: "<model_path>/<model_name>.pth", passed through
# path_to_remote_path together with the remote_kernel flag.
full_model_path: str = path_to_remote_path(
    f"{conf['paths'].model_path}/{model_name}.pth",
    remote_kernel,
)

Logging

In [3]:
# Verbosity used for the root logger configuration below.
logging_level: int = logging.INFO
# Logger used by the cells of this notebook.
logger: logging.Logger = logging.getLogger(__name__)
# Root configuration: timestamp, severity and message on every record.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging_level,
)

Training Params

In [4]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device: str
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Dataset split sizes (number of rows taken from the loaded array).
n_training_samples: int = 5_600
n_validation_samples: int = 50

# Batch layout: [B, C, H] = batch, channels, time domain.
batch_size: int = 48
tensor_dim: list = [batch_size, 1, 512]

# Optimisation schedule.
learning_rate: float = 0.0005
epochs: int = 100
checkpoint_freq: int = 10

# When True, optimizer/scheduler state and the stored epoch are not restored
# from an existing checkpoint.
restart_training: bool = True

### Data Loading

In [5]:
# Load the complete training array from "<data_path>/<training_data_name>.npy".
file: ndarray = load_training_data(
    path_to_remote_path("{}/{}".format(conf["paths"].data_path, training_data_name + ".npy"), remote_kernel)
)

# The training split is the head of the array and the validation split is its
# tail. If the two requested sizes add up to more rows than the array holds,
# the slices share rows and validation data leaks into training, so make that
# visible instead of letting it pass silently.
if n_training_samples + n_validation_samples > len(file):
    logger.warning(
        f"Training ({n_training_samples}) and validation ({n_validation_samples}) splits "
        f"overlap: only {len(file)} samples are available"
    )

# NOTE(review): with n_validation_samples == 0 the slice file[-0:] selects the
# whole array rather than nothing - confirm that 0 is never configured.
data_loader = create_dataloader(Audio_Data(file[:n_training_samples]), batch_size)
validation_dataloader = create_dataloader(Audio_Data(file[-n_validation_samples:]), batch_size)
logger.info(f"Data loaded with shape: {file.shape}")

2025-06-01 17:14:51,090 - INFO - Data loaded with shape: (5888, 147200)


### Setup

Model Creation

In [None]:
# Build the diffusion vocoder and move it to the selected device.
# The three per-layer lists (channels, factors, items) each hold 7 entries.
# NOTE(review): the meaning of the individual keyword arguments is defined by
# audio_diffusion_pytorch - confirm against its documentation before tuning.
vocoder = DiffusionVocoder(
    # Network, diffusion method and sampler types
    net_t=UNetV0,
    diffusion_t=VDiffusion,
    sampler_t=VSampler,
    # Mel-spectrogram settings
    mel_sample_rate=32000,
    mel_n_fft=2048,
    mel_hop_length=256,
    mel_channels=128,
    mel_normalize_log=True,
    # Network layout
    in_channels=1,
    channels=[64, 128, 256, 512, 512, 1024, 1024],
    factors=[1, 2, 2, 2, 2, 2, 2],
    items=[1, 2, 2, 2, 2, 2, 2],
)
vocoder = vocoder.to(device)


NameError: name 'DiffusionVocoder' is not defined

Model Loading

In [31]:
# Optimizer and learning-rate schedule (the rate is multiplied by gamma on
# every scheduler step).
optimizer = optim.AdamW(params=vocoder.parameters(), lr=learning_rate)
scheduler = optim.lr_scheduler.ExponentialLR(optimizer=optimizer, gamma=0.98)

# NOTE(review): start_epoch is not passed to the Trainer in the cells visible
# here - confirm that resuming picks up the stored epoch.
start_epoch: int = 0

if not os.path.exists(full_model_path):
    # No checkpoint on disk: keep the freshly initialised weights.
    logger.info(f"Model {model_name} created with {count_parameters(vocoder)} Parameters")
else:
    # NOTE(review): torch.load unpickles the file - only load checkpoints from
    # trusted sources.
    model = torch.load(full_model_path, map_location=device)
    # Model weights are always restored when a checkpoint exists.
    vocoder.load_state_dict(model["model"])
    if not restart_training:
        # Resume: also restore optimizer/scheduler state and the stored epoch.
        optimizer.load_state_dict(model["optim"])
        scheduler.load_state_dict(model["scheduler"])
        start_epoch = model.get("epoch", 0)
    logger.info(f"Model {model_name} loaded with {count_parameters(vocoder)} Parameters")

2025-06-01 18:38:00,694 - INFO - Model vocoder_v1 created with ~107.7M Parameters


Initialize Trainer

In [32]:
# Bundle the vocoder with its optimizer, scheduler and target device; this
# trainer object drives both the training run and the sampling step below.
trainer = Trainer(vocoder, optimizer, scheduler, device)

### Training

In [33]:
# Run the training loop; it returns the training and validation loss histories.
# NOTE(review): patience=-1 and the None values presumably disable early
# stopping, gradient clipping and periodic sampling - confirm in Trainer.train.
train_loss, val_loss = trainer.train(
    # Data
    train_dataset=data_loader,
    val_dataset=validation_dataloader,
    # Schedule and checkpointing
    n_epochs=epochs,
    checkpoint_freq=checkpoint_freq,
    full_model_path=full_model_path,
    # Optional features
    patience=-1,
    gradient_clip_norm=None,
    gradient_clip_val=None,
    sample_freq=None,
)

# Plot both loss histories, training first.
scatter_plot(train_loss)
scatter_plot(val_loss)

2025-06-01 18:38:04,439 - INFO - Training started on cpu


: 

### Sample

In [None]:
# Sample from the vocoder through the trainer and keep the result.
# NOTE(review): presumably a generated waveform, going by the variable name -
# confirm the return type of Trainer.sample_voc.
wave_form  = trainer.sample_voc()