## **<div style="text-align: center"> VOCODER </div>**
=======================================================================================================================================

    + This is a sub-system that translates the mel spectrograms produced by the synthesizer into waveforms.
    + This model is built on the fatchord WaveRNN model.
    + This helps improve the naturalness of the speech generated by the synthesizer.
=======================================================================================================================================

**NOTE:** The vocoder uses the WAV files, mel files and metadata file - `train.txt` - generated during the data preparation step of the synthesizer.
Therefore, no separate data preparation is needed for the vocoder.

## **Section 1:**  Defining VocoderDataset

`VocoderDataset` is a custom class that is used to load data in batches into the model during training.

In [2]:
def parse_meta_file(metadata_fpath):
    """Parse the synthesizer metadata file (`train.txt`) into a list of entries.

    Every line of the file is split on ``|`` and each non-empty field is parsed
    as a Python literal. According to the data-preparation notes, each entry is
    expected to be a tuple of:
        `audio_fname`,
        `mel_fname`,
        `embed_fname`,
        `length of mel frames`,
        `max audio timesteps`

    Args:
        metadata_fpath (Path): The path to the train.txt file created during
            dataset preparation.

    Returns:
        list: The parsed entries, in file order.

    Raises:
        ValueError or SyntaxError: If a field is not a valid Python literal.
    """
    # Function-scope import keeps this cell runnable on its own.
    import ast

    metadata = []
    with metadata_fpath.open("r") as metadata_file:
        for line in metadata_file:
            for field in line.split("|"):
                field = field.strip()
                if not field:
                    # Tolerate blank lines and trailing separators.
                    continue
                # literal_eval accepts only literals, so (unlike eval) a crafted
                # metadata file cannot execute arbitrary code.
                metadata.append(ast.literal_eval(field))
    return metadata

    
    

In [1]:
from torch.utils.data import Dataset
from pathlib import Path
from vocoder import audio
import vocoder.hparams as hp
import numpy as np
import torch


class VocoderDataset(Dataset):
    """Dataset of (mel spectrogram, quantized waveform) pairs for vocoder training.

    Sample paths are built from the metadata file: field 1 of each entry is the
    mel file name, field 0 the wav file name, and entries whose field 4 is zero
    are skipped.
    """

    def __init__(self, metadata_fpath: Path, mel_dir: Path, wav_dir: Path):
        """Index the samples listed in the metadata file.

        Args:
            metadata_fpath: Path to the metadata file parsed by `parse_meta_file`.
            mel_dir: Directory containing the mel files.
            wav_dir: Directory containing the wav files.
        """
        print("Using inputs from:\n\t%s\n\t%s\n\t%s" % (metadata_fpath, mel_dir, wav_dir))

        metadata = parse_meta_file(metadata_fpath)

        # Single pass: keep entries with a non-zero fifth field and pair each
        # mel path with its wav path.
        self.samples_fpaths = [
            (mel_dir.joinpath(entry[1]), wav_dir.joinpath(entry[0]))
            for entry in metadata
            if int(entry[4])
        ]

        print("Found %d samples" % len(self.samples_fpaths))

    def __getitem__(self, index):
        """Load one sample and return `(mel, quant)` as float32 / int64 arrays.

        Raises:
            ValueError: If `hp.voc_mode` is neither 'RAW' nor 'MOL'.
        """
        mel_path, wav_path = self.samples_fpaths[index]

        # Transposed so that axis 1 indexes frames; scaled by the configured maximum.
        mel = np.load(mel_path).T.astype(np.float32) / hp.mel_max_abs_value

        # Load the wav
        wav = np.load(wav_path)
        if hp.apply_preemphasis:
            wav = audio.pre_emphasis(wav)
        wav = np.clip(wav, -1, 1)

        # Fix for missing padding   # TODO: settle on whether this is any useful
        # Pads up to the next multiple of hop_length (a full extra hop when the
        # length is already aligned), then trims to exactly one hop per mel frame.
        r_pad = (len(wav) // hp.hop_length + 1) * hp.hop_length - len(wav)
        wav = np.pad(wav, (0, r_pad), mode='constant')
        assert len(wav) >= mel.shape[1] * hp.hop_length
        wav = wav[:mel.shape[1] * hp.hop_length]
        assert len(wav) % hp.hop_length == 0

        # Quantize the wav
        if hp.voc_mode == 'RAW':
            if hp.mu_law:
                quant = audio.encode_mu_law(wav, mu=2 ** hp.bits)
            else:
                quant = audio.float_2_label(wav, bits=hp.bits)
        elif hp.voc_mode == 'MOL':
            quant = audio.float_2_label(wav, bits=16)
        else:
            # Previously an unknown mode fell through and failed later with an
            # unbound `quant`; fail with a clear message instead.
            raise ValueError(
                "Unsupported hp.voc_mode: %r (expected 'RAW' or 'MOL')" % (hp.voc_mode,)
            )

        return mel.astype(np.float32), quant.astype(np.int64)

    def __len__(self):
        """Return the number of indexed samples."""
        return len(self.samples_fpaths)

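`collate_vocoder` cuts a randomly positioned window of mel frames and the matching audio samples out of every item in a batch and returns them as batched tensors `(x, y, mels)`.
The size of the mel window is taken to be `hp.voc_seq_len // hp.hop_length + 2 * hp.voc_pad` frames.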


In [6]:
def collate_vocoder(batch):
    """Collate dataset samples into randomly cropped training windows.

    For each sample a random starting mel frame is drawn, a window of
    `mel_win` frames is cut from the mel (indexed as `[:, frames]`) and
    `hp.voc_seq_len + 1` labels are cut from the label sequence, starting
    `hp.voc_pad` frames after the mel window start.

    Args:
        batch: Sequence of samples where item 0 is the mel array and item 1 the
            label sequence.

    Returns:
        Tuple `(x, y, mels)`: `x` is the first `hp.voc_seq_len` labels of each
        window converted with `audio.label_2_float`, `y` is the same window
        shifted by one label (also converted to float in 'MOL' mode), and
        `mels` is the float32 tensor of stacked mel windows.
    """
    pad = hp.voc_pad
    hop = hp.hop_length
    seq_len = hp.voc_seq_len
    mel_win = seq_len // hop + 2 * pad

    # Draw every random start first so the RNG is consumed once per sample, in order.
    upper_bounds = [sample[0].shape[-1] - 2 - (mel_win + 2 * pad) for sample in batch]
    mel_starts = [np.random.randint(0, bound) for bound in upper_bounds]

    mel_windows = []
    label_windows = []
    for sample, mel_start in zip(batch, mel_starts):
        # The signal window begins `pad` frames into the mel window.
        sig_start = (mel_start + pad) * hop
        mel_windows.append(sample[0][:, mel_start:mel_start + mel_win])
        label_windows.append(sample[1][sig_start:sig_start + seq_len + 1])

    # Stack the per-sample windows along a new leading batch axis.
    mel_array = np.stack(mel_windows).astype(np.float32)
    label_array = np.stack(label_windows).astype(np.int64)

    mels = torch.tensor(mel_array)
    labels = torch.tensor(label_array).long()

    is_mol = hp.voc_mode == 'MOL'
    bits = 16 if is_mol else hp.bits

    # Inputs are the window without its last label; targets are shifted by one.
    x = audio.label_2_float(labels[:, :seq_len].float(), bits)
    y = labels[:, 1:]
    if is_mol:
        y = audio.label_2_float(y.float(), bits)

    return x, y, mels

#### **dataset-testing**

In [2]:
from pathlib import Path

# Machine-specific absolute paths to the synthesizer's data-preparation outputs;
# adjust them before running this cell elsewhere.
metadata_fpath = Path("C:/Users/Swaroop/real-time-voice-cloner/synthesizer/outdir/train.txt")
mel_dir = Path("C:/Users/Swaroop/real-time-voice-cloner/synthesizer/outdir/mels")
wav_dir = Path("C:/Users/Swaroop/real-time-voice-cloner/synthesizer/outdir/audio")

# Constructing the dataset prints the three input locations and the sample count.
datset = VocoderDataset(metadata_fpath, mel_dir, wav_dir)

Using inputs from:
	C:\Users\Swaroop\real-time-voice-cloner\synthesizer\outdir\train.txt
	C:\Users\Swaroop\real-time-voice-cloner\synthesizer\outdir\mels
	C:\Users\Swaroop\real-time-voice-cloner\synthesizer\outdir\audio
Found 62477 samples


## **Section 2:**  Model training

We do not include the model architecture in the notebook; instead we simply import it to keep things simple.

#### **train function**

In [4]:
import time
from pathlib import Path
import numpy as np
import torch 
import torch.nn.functional as F
from torch import optim
from torch.utils.data import DataLoader
from vocoder.distribution import discretized_mix_logistic_loss
from vocoder.display import simple_table, stream
from vocoder.gen_wavernn import gen_testset


from vocoder.fatchord_version import  WaveRNN
import vocoder.hparams as hf



def train(run_id: str, syn_dir: Path, voc_dir: Path, models_dir: Path, ground_truth: bool = True, save_every: int = 10, force_restart: bool = False):
    """Train the WaveRNN vocoder and checkpoint it under ``models_dir / run_id``.

    Args:
        run_id: Name of the run; weights are stored at
            ``models_dir / run_id / "vocoder.pt"``.
        syn_dir: Synthesizer directory. ``outdir/audio`` is always read from it,
            as are ``outdir/train.txt`` and ``outdir/mels`` when
            ``ground_truth`` is True.
        voc_dir: Vocoder directory. ``synthesized.txt`` and ``mels_gta`` are
            read from it when ``ground_truth`` is False.
        models_dir: Root directory holding one sub-directory per run.
        ground_truth: Selects which metadata file and mel directory are used
            (see ``syn_dir`` / ``voc_dir``).
        save_every: Save a checkpoint whenever the model step is a multiple of
            this value; 0 disables these periodic saves.
        force_restart: When True, or when no weights file exists yet, a fresh
            checkpoint is written immediately instead of loading existing weights.
    """
    # NOTE(review): this function reads `hp`, `VocoderDataset` and
    # `collate_vocoder` from notebook scope; the cell defining it imports the
    # hyper-parameters as `hf`, so an earlier cell must have bound `hp`.

    # Check to make sure the hop length is correctly factorised
    assert np.cumprod(hp.voc_upsample_factors)[-1] == hp.hop_length

    # Instantiate the model
    print("Initializing the model...")
    model = WaveRNN(
        rnn_dims=hp.voc_rnn_dims,
        fc_dims=hp.voc_fc_dims,
        bits=hp.bits,
        pad=hp.voc_pad,
        upsample_factors=hp.voc_upsample_factors,
        feat_dims=hp.num_mels,
        compute_dims=hp.voc_compute_dims,
        res_out_dims=hp.voc_res_out_dims,
        res_blocks=hp.voc_res_blocks,
        hop_length=hp.hop_length,
        sample_rate=hp.sample_rate,
        mode=hp.voc_mode
    )

    # Initialize the optimizer
    optimizer = optim.Adam(model.parameters())
    for p in optimizer.param_groups:
        p["lr"] = hp.voc_lr
    loss_func = F.cross_entropy if model.mode == "RAW" else discretized_mix_logistic_loss

    # Load the weights
    model_dir = models_dir / run_id
    # parents=True so a not-yet-existing models_dir does not abort the run.
    model_dir.mkdir(parents=True, exist_ok=True)
    weights_fpath = model_dir / "vocoder.pt"
    if force_restart or not weights_fpath.exists():
        print("\nStarting the training of WaveRNN from scratch\n")
        model.save(weights_fpath, optimizer)
    else:
        print("\nLoading weights at %s" % weights_fpath)
        model.load(weights_fpath, optimizer)
        print("WaveRNN weights loaded from step %d" % model.step)

    # Initialize the dataset
    metadata_fpath = syn_dir.joinpath("outdir/train.txt") if ground_truth else \
        voc_dir.joinpath("synthesized.txt")
    mel_dir = syn_dir.joinpath("outdir/mels") if ground_truth else voc_dir.joinpath("mels_gta")
    wav_dir = syn_dir.joinpath("outdir/audio")
    dataset = VocoderDataset(metadata_fpath, mel_dir, wav_dir)
    # Un-collated, single-sample loader handed to gen_testset after each epoch.
    test_loader = DataLoader(dataset, batch_size=1, shuffle=True)

    # Begin the training
    simple_table([('Batch size', hp.voc_batch_size),
                  ('LR', hp.voc_lr),
                  ('Sequence Len', hp.voc_seq_len)])

    # Epochs are numbered 1..349.
    for epoch in range(1, 350):
        data_loader = DataLoader(dataset, hp.voc_batch_size, shuffle=True, collate_fn=collate_vocoder)
        start = time.time()
        running_loss = 0.

        for i, (x, y, m) in enumerate(data_loader, 1):

            # Forward pass
            y_hat = model(x, m)
            if model.mode == 'RAW':
                y_hat = y_hat.transpose(1, 2).unsqueeze(-1)
            elif model.mode == 'MOL':
                y = y.float()
            y = y.unsqueeze(-1)

            # Backward pass
            loss = loss_func(y_hat, y)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Running statistics for the progress line (i starts at 1).
            running_loss += loss.item()
            speed = i / (time.time() - start)
            avg_loss = running_loss / i

            step = model.get_step()
            print("running step:", step)
            k = step // 1000

            if save_every != 0 and step % save_every == 0 :
                model.save(weights_fpath, optimizer)
                print("saved step:", step)

            msg = f"| Epoch: {epoch} ({i}/{len(data_loader)}) | " \
                f"Loss: {avg_loss:.4f} | {speed:.1f} " \
                f"steps/s | Step: {k}k | "
            stream(msg)


        # Generate test samples at the end of every epoch.
        gen_testset(model, test_loader, hp.voc_gen_at_checkpoint, hp.voc_gen_batched,
                    hp.voc_target, hp.voc_overlap, model_dir)
        print("")
        print("")


### Running the train function.

In [None]:
# Start (or resume) run "1" using the ground-truth mels from the synthesizer output.
train(
    run_id="1",
    syn_dir=Path("../synthesizer/"),
    voc_dir=Path("."),
    models_dir=Path("../trained-models/"),
    ground_truth=True,
    force_restart=False,
)

#### Testing the trained model for outputs

In [7]:
from pathlib import Path
from torch.utils.data import DataLoader
# Smoke test: build the dataset from the synthesizer outputs and pull one
# collated batch from it.
syn_dir = Path("../synthesizer/")
mel_dir = syn_dir.joinpath("outdir/mels")
wav_dir = syn_dir.joinpath("outdir/audio")
metadata_fpath = syn_dir.joinpath("outdir/train.txt")
dataset = VocoderDataset(metadata_fpath, mel_dir, wav_dir)
test_loader = DataLoader(dataset, hp.voc_batch_size, shuffle=True, collate_fn=collate_vocoder)
# The start index belongs to enumerate(); passed to next() it was a fallback
# default that could not be unpacked if the loader were empty.
i, (x, y, m) = next(enumerate(test_loader, 1))

Using inputs from:
	..\synthesizer\outdir\train.txt
	..\synthesizer\outdir\mels
	..\synthesizer\outdir\audio
Found 62477 samples


In [8]:
# Run a forward pass on the batch drawn above to inspect the raw model output.
# NOTE(review): in the visible cells `model` is only created inside train(); it
# must already exist at notebook scope for this cell to run.
model(x, m)

tensor([[[ 1.3248e-01, -1.5036e-01, -1.4575e-01,  ..., -1.8457e-01,
           4.4525e-02, -2.5593e-01],
         [ 1.2617e-01, -1.3193e-01, -1.4731e-01,  ..., -1.7815e-01,
           5.0890e-02, -2.6523e-01],
         [ 1.2282e-01, -1.2400e-01, -1.4757e-01,  ..., -1.7325e-01,
           5.4204e-02, -2.6536e-01],
         ...,
         [ 9.0285e-02, -5.0740e-02, -4.4236e-02,  ...,  1.7669e-02,
           7.8709e-02, -3.1156e-02],
         [ 9.0433e-02, -4.9884e-02, -4.3425e-02,  ...,  1.8069e-02,
           7.9010e-02, -3.0693e-02],
         [ 9.0415e-02, -5.0326e-02, -4.3611e-02,  ...,  1.8248e-02,
           7.9015e-02, -3.0635e-02]],

        [[ 7.0321e-02, -1.7736e-01,  9.0538e-02,  ...,  1.8021e-03,
          -8.8340e-03, -4.9552e-02],
         [ 5.6640e-02, -1.7382e-01,  1.1185e-01,  ...,  9.0460e-03,
           3.9764e-03, -4.1281e-02],
         [ 4.8231e-02, -1.6843e-01,  1.2113e-01,  ...,  1.6675e-02,
           1.0649e-02, -4.0450e-02],
         ...,
         [ 4.0332e-03, -1

## **Section 3:** Inference

Loading a stored synthesizer output for inference

In [10]:
import pickle
# Load a previously pickled synthesizer output from a machine-specific path.
# NOTE(review): pickle.load can execute arbitrary code; only open files you
# produced yourself.
with open('C:/Users/Swaroop/real-time-voice-cloner/synthesizer/synthesiser-output.pkl', 'rb') as file:    
    synthesizer_output = pickle.load(file)
# Display the loaded object (the captured output shows a list of float32 arrays).
synthesizer_output

[array([[-4.024412 , -4.0200315, -4.0351114, ..., -3.87108  , -3.9133534,
         -3.9686704],
        [-4.0192227, -4.011092 , -4.021403 , ..., -3.8839068, -3.917695 ,
         -3.9993377],
        [-4.0286603, -4.032275 , -4.017156 , ..., -3.8269017, -3.895897 ,
         -3.958156 ],
        ...,
        [-3.9794612, -3.9577441, -3.9867291, ..., -3.1497118, -3.329679 ,
         -3.5726745],
        [-3.9789128, -3.9642243, -3.987669 , ..., -3.2713053, -3.4242873,
         -3.6423304],
        [-3.974428 , -3.9742632, -3.9426513, ..., -3.7245097, -3.7947302,
         -3.9285758]], dtype=float32),
 array([[-4.002446 , -3.9925246, -3.9878685, ..., -3.9150531, -3.9221246,
         -3.8708854],
        [-4.0021977, -4.0023665, -4.0093045, ..., -3.8443673, -3.842957 ,
         -3.881302 ],
        [-4.003618 , -4.0067725, -3.996859 , ..., -3.8036325, -3.825219 ,
         -3.8736362],
        ...,
        [-3.9729254, -3.959733 , -3.9662833, ..., -2.1341083, -2.4925997,
         -2.9703083

In [21]:
from vocoder.fatchord_version import WaveRNN
import vocoder.hparams as hp
import torch


# Notebook-level handle to the vocoder; stays None until load_model() assigns it.
_model = None   # type: WaveRNN

def load_model(weights_fpath, verbose=True):
    """Build a WaveRNN from the hyper-parameters and load its weights on CPU.

    The resulting model is stored in the module-level `_model` and switched to
    evaluation mode.

    Args:
        weights_fpath: Checkpoint path; the file must contain a 'model_state' entry.
        verbose: When True, print progress messages while building and loading.
    """
    # `_device` is declared global but is not assigned in this function.
    global _model, _device

    if verbose:
        print("Building Wave-RNN")

    # Collect the constructor arguments from the hyper-parameter module.
    wavernn_kwargs = dict(
        rnn_dims=hp.voc_rnn_dims,
        fc_dims=hp.voc_fc_dims,
        bits=hp.bits,
        pad=hp.voc_pad,
        upsample_factors=hp.voc_upsample_factors,
        feat_dims=hp.num_mels,
        compute_dims=hp.voc_compute_dims,
        res_out_dims=hp.voc_res_out_dims,
        res_blocks=hp.voc_res_blocks,
        hop_length=hp.hop_length,
        sample_rate=hp.sample_rate,
        mode=hp.voc_mode,
    )
    _model = WaveRNN(**wavernn_kwargs)

    if verbose:
        print("Loading model weights at %s" % weights_fpath)

    # Map every tensor to the CPU regardless of where the checkpoint was saved.
    cpu = torch.device('cpu')
    model_state = torch.load(weights_fpath, map_location=cpu)['model_state']
    _model.load_state_dict(model_state)
    _model.eval()


def is_loaded():
    """Return True once the module-level `_model` has been set, else False."""
    if _model is None:
        return False
    return True


def infer_waveform(mel, normalize=True, batched=True, target=8000, overlap=800,
                   progress_callback=None):
    """
    Generate a waveform from a mel spectrogram output by the synthesizer (the
    format must match that of the synthesizer!) using the loaded Wave-RNN.

    :param mel: numpy array holding the mel spectrogram; a leading batch axis is
        added before it is converted to a tensor
    :param normalize: when True, the mel is divided by hp.mel_max_abs_value first
    :param batched: forwarded unchanged to the model's generate()
    :param target: forwarded unchanged to the model's generate()
    :param overlap: forwarded unchanged to the model's generate()
    :param progress_callback: forwarded unchanged to the model's generate()
    :return: the value returned by the model's generate() (the generated waveform)
    :raises Exception: if no model has been loaded yet
    """
    if _model is None:
        raise Exception("Please load Wave-RNN in memory before using it")

    scaled_mel = mel / hp.mel_max_abs_value if normalize else mel
    mel_batch = torch.from_numpy(scaled_mel[None, ...])
    return _model.generate(mel_batch, batched, target, overlap, hp.mu_law, progress_callback)


In [22]:
# Load the checkpoint `vocoder.pt` from the current working directory.
load_model(Path("./vocoder.pt"))

Building Wave-RNN
Trainable Parameters: 4.481M
Loading model weights at vocoder.pt


In [25]:
# Vocode the first stored synthesizer output with the default generation settings.
generated_audio_wav = infer_waveform(synthesizer_output[0])

{| ████████████████ 28500/28800 | Batch Size: 3 | Gen Rate: 1.2kHz | }

In [29]:
import librosa
import librosa.display 
import IPython.display as ipd
import matplotlib.pyplot as plt
from synthesizer.hparams import hparams


# Vocode another stored synthesizer output and play it back inline.
generated_audio_wav = infer_waveform(synthesizer_output[15])
# NOTE(review): the playback rate is hard-coded to 16000; confirm it matches hp.sample_rate.
ipd.Audio(generated_audio_wav, rate = 16000)

{| ████████████████ 57000/57600 | Batch Size: 6 | Gen Rate: 2.2kHz | }