# Main Script for a Vocoder

### Imports

In [1]:
try: 
    import librosa
except:
    !pip install librosa


#Set Dir 
import sys, os
sys.path.append(os.path.abspath('..'))

# Torch
import torch
from torch import nn, Tensor
from torch.utils.data import DataLoader, Subset
import torch.optim as optim
from torchaudio.prototype.pipelines import HIFIGAN_VOCODER_V3_LJSPEECH as bundle

# Utils
import numpy as np
from numpy import ndarray
import logging, librosa
from typing import Sequence, Optional, Callable


# Base Scripts
from Libraries.Utils import *
from MainScripts.Conf import conf

### Config

### General

In [2]:
# Base name (without the ".npy" extension) of the training-data file.
training_data_name: str = "training_full_wave"
# Switch to True when running on a remote kernel, where the file structure differs.
remote_kernel: bool = False

### Logging

In [3]:
# LIGHT_DEBUG is not defined in this notebook; presumably it is a custom log
# level supplied by the star import from Libraries.Utils - TODO confirm.
logging_level: int = LIGHT_DEBUG

# Configure the root logger: timestamp - level name - message.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging_level,
)

# Logger used by this notebook.
logger: logging.Logger = logging.getLogger(__name__)

### Training Params

In [4]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device: str = "cuda"
else:
    device = "cpu"


### Data Loading

In [5]:
# Build the path of the training-data file and pass it through
# path_to_remote_path together with the remote_kernel flag.
training_data_path = path_to_remote_path(
    f"{conf['paths'].data_path}/{training_data_name}.npy",
    remote_kernel,
)
file: ndarray = load_training_data(training_data_path)

# Mel spectrogram of the entry at index 1 of the loaded array.
# The positional arguments are presumably len_fft=2048, hop_length=256 and
# sample_rate=32000 (the same values the inverse call later in the notebook
# passes by keyword); the meaning of True, 30 and 80 is not visible from
# here - TODO confirm against Libraries.Utils.
spect = audio_to_mel_spectogram(file[1], 2048, 256, 32000, True, 30, 80)


2025-06-02 21:28:08,841 - LIGHT_DEBUG - Ndarray loaded from ../Data/training_full_wave.npy of shape: (5906, 147200)
2025-06-02 21:28:08,843 - LIGHT_DEBUG - Started Mel-Spec
2025-06-02 21:28:12,043 - LIGHT_DEBUG - Created mel-spectogram: (80, 576)


### Convert to wave

In [6]:
# Turn the mel spectrogram back into audio, then save it as "test.wav".
reconstructed_audio = mel_spectrogram_to_audio(
    spect,
    sample_rate=32000,
    len_fft=2048,
    hop_length=256,
)
save_audio_file(reconstructed_audio, "test.wav", 32000)

2025-06-02 21:28:12,058 - LIGHT_DEBUG - Started GL
2025-06-02 21:28:16,190 - LIGHT_DEBUG - Normalized to range: [-0.99999,0.99999]
2025-06-02 21:28:16,191 - LIGHT_DEBUG - Reconstructed audio: (147200,)
2025-06-02 21:28:16,192 - LIGHT_DEBUG - Normalized to range: [-0.99999,0.99999]
2025-06-02 21:28:16,202 - LIGHT_DEBUG - Saved file to:test.wav
