# Voice Conversion Training

In this notebook we first load training data in the form of precomputed mel-spectrograms, content encodings and speaker encodings of the VCTK Dataset. Then we decide on a variant of our model to train, and lastly we run the training loop.

We make a few assumptions, for example about Google Drive folder structure. These will be apparent and you can adjust them if necessary.

Parts of this notebook are based on code from [Soft-VC](https://github.com/bshall/acoustic-model).

For loading the dataset and storing checkpoints, we mount Google Drive.

In [None]:
# Mount Google Drive at /content/drive so that the /content/drive/MyDrive/...
# paths used in the following cells resolve.
from google.colab import drive
drive.mount('/content/drive')

## Copy and unzip dataset from mounted Google Drive

We have found that copying the dataset files to a location inside the Colab environment is much faster than reading them directly from the mounted Google Drive folder. Adjust the file paths if necessary.

Copying and unzipping the zip files usually takes around 1-2 minutes per cell for mel data and units.

In [None]:
# Google Drive folder that holds the zipped VCTK features and the split file.
# The trailing slash is required because the file names are appended directly.
drive_vctk_path = '/content/drive/MyDrive/VC/VCTK/'

# Individual archives / files inside that folder.
drive_vctk_wav_text_path = f'{drive_vctk_path}VCTK-Corpus-mic1.zip'
drive_vctk_mels_path = f'{drive_vctk_path}VCTK-Corpus-mic1-mels.zip'
drive_vctk_units_path = f'{drive_vctk_path}VCTK-Corpus-mic1-units.zip'
drive_vctk_spk_emb_path = f'{drive_vctk_path}VCTK-Corpus-mic1-spk_emb.zip'
drive_vctk_spk_emb_win_path = f'{drive_vctk_path}VCTK-Corpus-mic1-spk_emb_win.zip'
drive_vctk_spk_emb_agg_path = f'{drive_vctk_path}VCTK-Corpus-mic1-spk_emb_agg.zip'
drive_vctk_splits_path = f'{drive_vctk_path}splits_ext.json'

In [None]:
# -p: do not fail when the directory already exists (e.g. when the cell is re-run).
!mkdir -p /content/vctk

copy mels from google drive

In [None]:
!cp $drive_vctk_mels_path /content/vctk/VCTK-Corpus-mic1-mels.zip

In [None]:
!unzip -q /content/vctk/VCTK-Corpus-mic1-mels.zip -d /content/vctk

In [None]:
!mv /content/vctk/content/vctk/mels /content/vctk/mels

copy units from google drive

In [None]:
!cp $drive_vctk_units_path /content/vctk/VCTK-Corpus-mic1-units.zip

In [None]:
# Extract explicitly into /content instead of the kernel's current working
# directory, so the archive always ends up at /content/content/vctk/units,
# which is where the following `mv` cell expects it.
!unzip -q /content/vctk/VCTK-Corpus-mic1-units.zip -d /content

In [None]:
!mv /content/content/vctk/units /content/vctk/units

copy spk_emb from google drive

In [None]:
!cp $drive_vctk_spk_emb_path /content/vctk/VCTK-Corpus-mic1-spk_emb.zip

In [None]:
!unzip -q /content/vctk/VCTK-Corpus-mic1-spk_emb.zip -d /content/vctk

In [None]:
!mv /content/vctk/content/vctk/spk_emb /content/vctk/spk_emb

copy spk_emb_win from google drive

In [None]:
!cp $drive_vctk_spk_emb_win_path /content/vctk/VCTK-Corpus-mic1-spk_emb_win.zip

In [None]:
!unzip -q /content/vctk/VCTK-Corpus-mic1-spk_emb_win.zip -d /content/vctk

In [None]:
!mv /content/vctk/content/vctk/spk_emb_win /content/vctk/spk_emb_win

copy spk_emb_agg from google drive

In [None]:
!cp $drive_vctk_spk_emb_agg_path /content/vctk/VCTK-Corpus-mic1-spk_emb_agg.zip

In [None]:
!unzip -q /content/vctk/VCTK-Corpus-mic1-spk_emb_agg.zip -d /content/vctk

In [None]:
!mv /content/vctk/content/vctk/spk_emb_agg /content/vctk/spk_emb_agg

copy dataset split info from google drive

In [None]:
!cp $drive_vctk_splits_path /content/vctk/splits.json

create checkpoint directory

In [None]:
# -p: do not fail when the directory already exists (e.g. when the cell is re-run).
!mkdir -p /content/ckpt

In [None]:
# Root of the unpacked features; becomes DATASET_DIR in the training cell.
vctk_path = "/content/vctk"
# Where checkpoints and logs are written; becomes CHECKPOINT_DIR in the training cell.
ckpt_path = "/content/ckpt"
# JSON file with the dataset splits, read right before the datasets are built.
splits_path = '/content/vctk/splits.json'
# Folder appended to sys.path so that `model.py` can be imported from it.
model_path = '/content/drive/MyDrive/VC'

## Dataset

This is the Dataset we use for training. `spkutts` is a list of speaker utterance id strings, e.g. `"p225_001"`.

In [None]:
from pathlib import Path
import numpy as np
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset
from torch.nn.utils.rnn import pad_sequence

class VCTKDataset(Dataset):
    """Precomputed VCTK features (mels, content units, speaker embeddings).

    Every item is read from three ``.npy`` files below ``root``:

    * ``mels/<speaker>/<spkutt>_mic1.npy``
    * ``units/<speaker>/<spkutt>_mic1.npy``
    * ``spk_emb/...`` or ``spk_emb_win/...`` (one file per utterance, same
      relative path as above) or ``spk_emb_agg/<speaker>.npy`` (one file per
      speaker), depending on ``spk_emb_type``.

    Args:
        root: Dataset root directory (``str`` or ``pathlib.Path``).
        spkutts: Speaker-utterance ids such as ``"p225_001"``. The speaker
            folder is everything before the first underscore, so speaker ids
            of any length are supported.
        spk_emb_type: One of ``'single'``, ``'win'`` or ``'agg'``.

    Raises:
        ValueError: If ``spk_emb_type`` is not one of the supported values.
    """

    # Maps the ``spk_emb_type`` option to the directory holding the embeddings.
    _SPK_EMB_DIRNAMES = {
        'single': 'spk_emb',
        'win': 'spk_emb_win',
        'agg': 'spk_emb_agg',
    }

    def __init__(self, root, spkutts, spk_emb_type):
        if spk_emb_type not in self._SPK_EMB_DIRNAMES:
            raise ValueError(
                f"Unknown spk_emb_type {spk_emb_type!r}; "
                f"expected one of {sorted(self._SPK_EMB_DIRNAMES)}"
            )

        # Accept plain strings as well as Path objects.
        root = Path(root)

        self.spk_emb_type = spk_emb_type
        self.mels_dir = root / "mels"
        self.units_dir = root / "units"
        self.spk_emb_dir = root / self._SPK_EMB_DIRNAMES[spk_emb_type]

        # EXAMPLE: self.metadata = [Path('p225/p225_001_mic1'), ...]
        # The speaker id is taken from the text before the first underscore
        # rather than from a fixed 4-character prefix, so ids such as "s5"
        # resolve to the right folder too.
        self.metadata = [
            Path(spkutt.split("_", 1)[0]) / f"{spkutt}_mic1" for spkutt in spkutts
        ]

    def __len__(self):
        """Number of utterances in this split."""
        return len(self.metadata)

    def __getitem__(self, index):
        """Load one utterance.

        Returns:
            Tuple ``(mel, units, spk_emb)`` of tensors. ``mel`` is the stored
            array transposed, cut to ``2 * len(units)`` rows and prefixed with
            one all-zero row, so it has ``2 * len(units) + 1`` rows at most.
        """
        path = self.metadata[index]
        mel_path = self.mels_dir / path
        units_path = self.units_dir / path
        if self.spk_emb_type in ['single', 'win']:
            # One embedding file per utterance.
            spk_emb_path = self.spk_emb_dir / path
        else:  # self.spk_emb_type == 'agg'
            # One embedding file per speaker, named after the speaker folder.
            spk_emb_path = self.spk_emb_dir / path.parent

        # Stored mels are transposed on load (presumably from
        # (n_mels, frames) to (frames, n_mels) -- confirm against the
        # preprocessing notebook).
        mel = np.load(mel_path.with_suffix(".npy")).T
        units = np.load(units_path.with_suffix(".npy"))
        spk_emb = np.load(spk_emb_path.with_suffix(".npy"))

        # Keep at most two mel rows per unit row.
        length = 2 * units.shape[0]

        mel = torch.from_numpy(mel[:length, :])
        # Prepend one zero row; the training loop feeds mels[:, :-1] as input
        # and uses mels[:, 1:] as target, so this row acts as the start frame.
        mel = F.pad(mel, (0, 0, 1, 0))
        units = torch.from_numpy(units)
        spk_emb = torch.from_numpy(spk_emb)
        return mel, units, spk_emb

    def pad_collate(self, batch):
        """Collate ``(mel, units, spk_emb)`` items into a zero-padded batch.

        Returns:
            ``(mels, mels_lengths, units, units_lengths, spk_embs)`` where
            ``mels`` and ``units`` are padded batch-first tensors,
            ``mels_lengths`` excludes the prepended start row, and
            ``spk_embs`` is a plain list of tensors (the caller stacks them).
        """
        mels, units, spk_embs = zip(*batch)

        mels, units, spk_embs = list(mels), list(units), list(spk_embs)

        # "- 1" removes the zero start row added in __getitem__.
        mels_lengths = torch.tensor([x.size(0) - 1 for x in mels])
        units_lengths = torch.tensor([x.size(0) for x in units])

        mels = pad_sequence(mels, batch_first=True)
        units = pad_sequence(
            units, batch_first=True, padding_value=0
        )

        return mels, mels_lengths, units, units_lengths, spk_embs

## Model

Our model code lives in the file `model.py`, which is imported from the folder given by `model_path` (set above).

In [None]:
import sys
sys.path.append(model_path)
from model import VCModel

## Training Utils

Before we start the training loop, we need a few extra utils.

In [None]:
import torch
import torch.nn.functional as F
import matplotlib

import torchaudio.transforms as transforms


class Metric:
    """Running arithmetic mean of the values passed to ``update``."""

    def __init__(self):
        self.reset()

    def reset(self):
        """Discard everything seen so far."""
        self.steps = 0
        self.value = 0

    def update(self, value):
        """Fold ``value`` into the running mean and return the new mean."""
        self.steps += 1
        # Incremental mean: move the current mean towards the new value by
        # 1/steps of their difference.
        delta = value - self.value
        self.value = self.value + delta / self.steps
        return self.value



def save_checkpoint(
    checkpoint_dir,
    model,
    optimizer,
    step,
    loss,
    best,
    logger,
):
    """Write model and optimizer state to ``checkpoint_dir``.

    The state is always stored as ``model-<step>.pt``; when ``best`` is truthy
    the same state is additionally stored as ``model-best.pt``.

    Args:
        checkpoint_dir: ``pathlib.Path`` of the target directory (created if
            missing, including parents).
        model: Object whose ``state_dict()`` is saved under ``"acoustic-model"``.
        optimizer: Object whose ``state_dict()`` is saved under ``"optimizer"``.
        step: Step counter; stored and used in the file name.
        loss: Loss value stored alongside the state.
        best: Whether to also overwrite ``model-best.pt``.
        logger: Logger that receives the "Saved checkpoint" message.
    """
    snapshot = {}
    snapshot["acoustic-model"] = model.state_dict()
    snapshot["optimizer"] = optimizer.state_dict()
    snapshot["step"] = step
    snapshot["loss"] = loss

    checkpoint_dir.mkdir(exist_ok=True, parents=True)

    step_file = checkpoint_dir / f"model-{step}.pt"
    targets = [step_file]
    if best:
        targets.append(checkpoint_dir / "model-best.pt")

    # The per-step file is written first, then (optionally) the "best" copy.
    for target in targets:
        torch.save(snapshot, target)

    logger.info(f"Saved checkpoint: {step_file.stem}")


## Training loop

In [None]:
torch.cuda.empty_cache()

Decide on the model variant and dataset.

In [None]:
# Model variant switches. They are forwarded unchanged as keyword arguments
# to VCModel (more_dropout, dimincrease, postnet); what each one changes is
# defined in model.py.
MORE_DROPOUT = False
DIMINCREASE = False
POSTNET = False

# Forwarded to VCModel as `use_custom_lstm`.
USE_CUSTOM_LSTM = False

# Selects which speaker-embedding folder VCTKDataset reads:
# 'single' -> spk_emb, 'win' -> spk_emb_win, 'agg' -> spk_emb_agg.
SPK_EMB_TYPE = 'agg'  # one of 'single', 'win', 'agg'

# True: validate on the "val_uu" split; False: validate on the "val" split.
VAL_ONLY_UNSEEN_SPK_AND_UTT = False

Initialize Model and Datasets and start the training loop

In [None]:
import logging
import json
from pathlib import Path
from tqdm import tqdm

import torch
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter

# Root logger at DEBUG; the notebook's own logger is restricted to INFO below.
logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger(__name__)

# Training Hyperparameters

BATCH_SIZE = 64  # batch size of the training DataLoader
LEARNING_RATE = 4e-4  # AdamW lr
BETAS = (0.8, 0.99)  # AdamW betas
WEIGHT_DECAY = 1e-5  # AdamW weight_decay
STEPS = 45000  # around 80 epochs; only used to derive the number of epochs
LOG_INTERVAL = 5  # write the averaged training loss to TensorBoard every N steps
VALIDATION_INTERVAL = 547  # one epoch is 547 steps
# Only evaluated right after a validation pass, so it takes effect at steps
# that are also multiples of VALIDATION_INTERVAL.
CHECKPOINT_INTERVAL = 547

# Path objects for the string paths defined earlier in the notebook.
CHECKPOINT_DIR = Path(ckpt_path)
DATASET_DIR = Path(vctk_path)

# Setup logging
#
# Log messages go to <CHECKPOINT_DIR>/logs/<CHECKPOINT_DIR.stem>.log and
# TensorBoard event files are written to the same directory.

log_dir = CHECKPOINT_DIR / "logs"
log_dir.mkdir(exist_ok=True, parents=True)

logger.setLevel(logging.INFO)

# `logging.getLogger` returns the same logger object on every execution of
# this cell. Without this cleanup each re-run would attach one more
# FileHandler and every message would be written to the log file multiple
# times, so file handlers left over from a previous run are removed and closed.
for stale_handler in list(logger.handlers):
    if isinstance(stale_handler, logging.FileHandler):
        logger.removeHandler(stale_handler)
        stale_handler.close()

handler = logging.FileHandler(log_dir / f"{CHECKPOINT_DIR.stem}.log")
handler.setLevel(logging.INFO)
formatter = logging.Formatter(
    "%(asctime)s [%(levelname)s] %(message)s", datefmt="%m/%d/%Y %I:%M:%S"
)
handler.setFormatter(formatter)
logger.addHandler(handler)

writer = SummaryWriter(log_dir)

# Initialize models and optimizer

# Build the selected model variant, then move its parameters to the GPU.
model = VCModel(
    more_dropout=MORE_DROPOUT,
    dimincrease=DIMINCREASE,
    postnet=POSTNET,
    use_custom_lstm=USE_CUSTOM_LSTM,
)
model = model.to('cuda')

# AdamW over all model parameters, configured from the hyperparameters above.
optimizer = optim.AdamW(
    params=model.parameters(),
    lr=LEARNING_RATE,
    betas=BETAS,
    weight_decay=WEIGHT_DECAY,
)

# Initialize datasets and dataloaders

# The split file maps split names ("train", "val", "val_uu") to lists of
# speaker-utterance ids.
with open(splits_path) as f:
    splits = json.load(f)

# Training data: shuffled, padded per batch, incomplete last batch dropped.
train_dataset = VCTKDataset(
    root=DATASET_DIR, spkutts=splits["train"], spk_emb_type=SPK_EMB_TYPE
)
train_loader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    drop_last=True,
    num_workers=1,
    pin_memory=True,
    collate_fn=train_dataset.pad_collate,
)

# Which split to validate on.
validation_dataset_part = "val_uu" if VAL_ONLY_UNSEEN_SPK_AND_UTT else "val"

# Validation data: one utterance per batch, fixed order, default collation.
validation_dataset = VCTKDataset(
    root=DATASET_DIR,
    spkutts=splits[validation_dataset_part],
    spk_emb_type=SPK_EMB_TYPE,
)
validation_loader = DataLoader(
    validation_dataset,
    batch_size=1,
    num_workers=1,
    pin_memory=True,
    shuffle=False,
)

# Training always starts from step 0 here; no checkpoint is loaded in this cell.
global_step, best_loss = 0, float("inf")

# Start training loop

# Only whole epochs are run: n_epochs full passes over train_loader, which is
# at least STEPS optimizer steps (the loop does not stop exactly at STEPS).
n_epochs = STEPS // len(train_loader) + 1
start_epoch = global_step // len(train_loader) + 1

# Record the environment and run configuration at the top of the log.
logger.info("**" * 40)
logger.info(f"PyTorch version: {torch.__version__}")
logger.info(f"CUDA version: {torch.version.cuda}")
logger.info(f"CUDNN version: {torch.backends.cudnn.version()}")
logger.info(f"CUDNN enabled: {torch.backends.cudnn.enabled}")
logger.info(f"CUDNN deterministic: {torch.backends.cudnn.deterministic}")
logger.info(f"CUDNN benchmark: {torch.backends.cudnn.benchmark}")
logger.info(f"# of GPUS: {torch.cuda.device_count()}")
logger.info(f"batch size: {BATCH_SIZE}")
logger.info(f"iterations per epoch: {len(train_loader)}")
logger.info(f"# of epochs: {n_epochs}")
logger.info(f"started at epoch: {start_epoch}")
logger.info("**" * 40 + "\n")

# average_loss: running mean over the last LOG_INTERVAL steps (reset after logging).
# epoch_loss: running mean over the current epoch (reset at epoch start).
average_loss = Metric()
epoch_loss = Metric()

# Running mean over one validation pass (reset before each pass).
validation_loss = Metric()

for epoch in range(start_epoch, n_epochs + 1):

    model.train()
    epoch_loss.reset()

    for mels, mels_lengths, units, units_lengths, spk_embs in tqdm(train_loader):
        mels, mels_lengths = mels.to('cuda'), mels_lengths.to('cuda')
        units, units_lengths = units.to('cuda'), units_lengths.to('cuda')
        # pad_collate returns the speaker embeddings as a list; stack them
        # into one batch tensor.
        spk_embs = torch.stack(spk_embs)
        spk_embs = spk_embs.to('cuda')

        # Compute training loss

        optimizer.zero_grad()

        # Teacher forcing: the model sees mels[:, :-1] (starting with the zero
        # frame prepended by the dataset) and is trained to output mels[:, 1:].
        mels_ = model(units, spk_embs, mels[:, :-1, :])

        # Element-wise L1, summed per utterance and divided by
        # (last output dimension * unpadded mel length), then averaged over
        # the batch. The sum itself also runs over padded positions.
        loss = F.l1_loss(mels_, mels[:, 1:, :], reduction="none")
        loss = torch.sum(loss, dim=(1, 2)) / (mels_.size(-1) * mels_lengths)
        loss = torch.mean(loss)

        loss.backward()
        optimizer.step()

        global_step += 1

        # Update and log training metrics

        average_loss.update(loss.item())
        epoch_loss.update(loss.item())

        if global_step % LOG_INTERVAL == 0:
            writer.add_scalar(
                "train/loss",
                average_loss.value,
                global_step,
            )
            average_loss.reset()

        # Start validation loop

        if global_step % VALIDATION_INTERVAL == 0:
            model.eval()
            validation_loss.reset()

            # NOTE: this inner loop rebinds mels, units, spk_embs and loss.
            # The training-batch values are not read again in this iteration,
            # so the rebinding has no effect on training.
            for i, (mels, units, spk_embs) in enumerate(validation_loader, 1):
                mels, units = mels.to('cuda'), units.to('cuda')
                spk_embs = spk_embs.to('cuda')

                with torch.no_grad():
                    mels_ = model(units, spk_embs, mels[:, :-1, :])
                    # Batches hold a single utterance, so there is no padding
                    # and a plain mean L1 is used.
                    loss = F.l1_loss(mels_, mels[:, 1:, :])

                # Update validation metrics

                validation_loss.update(loss.item())

            # Switch back to training mode for the remaining steps.
            model.train()

            # Log validation metrics

            writer.add_scalar(
                "validation/loss",
                validation_loss.value,
                global_step,
            )
            logger.info(
                f"valid -- epoch: {epoch}, loss: {validation_loss.value:.4f}"
            )

            # Save on a new best validation loss or on a checkpoint step; this
            # check only runs at validation steps because it is nested here.
            new_best = best_loss > validation_loss.value
            if new_best or global_step % CHECKPOINT_INTERVAL == 0:
                if new_best:
                    logger.info("-------- new best model found!")
                    best_loss = validation_loss.value

                save_checkpoint(
                    checkpoint_dir=CHECKPOINT_DIR,
                    model=model,
                    optimizer=optimizer,
                    step=global_step,
                    loss=validation_loss.value,
                    best=new_best,
                    logger=logger,
                )

        # End validation loop

    # Log training metrics

    logger.info(f"train -- epoch: {epoch}, loss: {epoch_loss.value:.4f}")

    # End training loop