# Voice Cloning App - Remote Training

Remote training for the Voice Cloning App.

**Please ensure you have this notebook enabled with GPU before running (Runtime->Change runtime type)**

Steps:
1. Export your dataset from the app & unzip
2. Create a folder called `Voice-Cloning` in your Google Drive
  1. Create a sub-folder called `datasets` and upload your dataset folder to it
3. Configure parameters below
4. Run this notebook one cell at a time
  - Connect to your google drive when prompted
  - Ensure you have selected the correct options before running the training cell

During training and once training is done you'll find your latest checkpoint in Google Drive within the folder `Voice-Cloning/hifigan_training/dataset_name`.

This can be download from your drive and imported into the app under the "Synthasise" menu.

In [7]:
#@title Connect to google drive
from google.colab import drive
import os

drive.mount('/content/drive')

output_directory = "/content/drive/MyDrive/Voice-Cloning"
os.makedirs(output_directory, exist_ok=True)

dataset_directory = os.path.join(output_directory, "datasets")
os.makedirs(output_directory, exist_ok=True)

hifigan_directory = os.path.join(output_directory, "hifigan_training")
os.makedirs(output_directory, exist_ok=True)

checkpoint_directory = hifigan_directory
os.makedirs(checkpoint_directory, exist_ok=True)

alphabet_directory = os.path.join(output_directory, "alphabets")
os.makedirs(alphabet_directory, exist_ok=True)

datasets = os.listdir(dataset_directory)
assert datasets, "No datasets found in 'Voice-Cloning/datasets'. Please export your dataset from the app, unzip and upload to this folder"

# Check datasets
for dataset in datasets:
  try:
    dataset_path = os.path.join(dataset_directory, dataset)
    files = os.listdir(dataset_path)
    assert "metadata.csv" in files, f"Dataset '{dataset}' is missing metadata.csv"
    assert "wavs" in files, f"Dataset '{dataset}' is missing wavs folder"
  except NotADirectoryError:
    raise Exception(f"Dataset '{dataset}' is not a folder. Please ensure all datasets are folders containing your metadata.csv & wavs")

checkpoints = {dataset: os.listdir(os.path.join(hifigan_directory, dataset)) for dataset in datasets if os.path.isdir(os.path.join(checkpoint_directory, dataset))}
languages = os.listdir(alphabet_directory)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [8]:
#@title Parameters
import torch
from torch.utils.data import DataLoader
import torch.nn.functional as F
import argparse
import itertools
import logging
import json
import time
import os
from os.path import dirname, abspath
assert torch.cuda.is_available(),  "Please change Runtime type to GPU (Runtime->Change runtime type)"

# Clone the app
!pip install pysrt==1.1.2 pydub==0.24.1 webrtcvad==2.0.10 Unidecode==1.0.22
!git clone https://github.com/BenAAndrew/Voice-Cloning-App.git
%cd /content/Voice-Cloning-App/
#git checkout 0965a3c8b217dceaf60e22f9fd6b6ceab990c82f
from training import DEFAULT_ALPHABET
from training.utils import load_symbols
from training.train import train
from training.hifigan.train import train as train_hifigan
from training.hifigan.utils import get_checkpoint_options, save_checkpoints
from training.hifigan.utils import checkpoint_cleanup as hifigan_checkpoint_cleanup


# Download pretrained model
#from google_drive_downloader import GoogleDriveDownloader as gdd
#transfer_learning_path = "pretrained.pt"
#gdd.download_file_from_google_drive(file_id='1c5ZTuT7J08wLUoVZ2KkUs_VdZuJ86ZqA', dest_path="./"+transfer_learning_path, unzip=False)

# Get settings
epochs = 1000 #@param {type:"slider", min:100, max:3500, step:100}
batch_size = 38 #@param {type:"slider", min:12, max:70, step:2}
checkpoint_frequency = 1000 #@param {type:"slider", min:250, max:2500, step:250}
backup_checkpoint_frequency = 10000 #@param {type:"slider", min:2500, max:25000, step:500}
validation_size = 0.2 #@param {type:"slider", min:0.05, max:0.2, step:0.025}
early_stopping = True #@param {type:"boolean"}

Cloning into 'Voice-Cloning-App'...
remote: Enumerating objects: 3124, done.[K
remote: Counting objects: 100% (1939/1939), done.[K
remote: Compressing objects: 100% (1191/1191), done.[K
remote: Total 3124 (delta 1371), reused 1227 (delta 709), pack-reused 1185[K
Receiving objects: 100% (3124/3124), 15.48 MiB | 10.70 MiB/s, done.
Resolving deltas: 100% (2096/2096), done.
/content/Voice-Cloning-App


In [13]:
#@title Options
import ipywidgets as widgets
from IPython.display import display

# Form
dataset = widgets.Dropdown(
    options=datasets,
    description='Dataset:',
)

def on_change(change):
    if change['type'] == 'change' and change['name'] == 'value':
      checkpoint.options = checkpoints.get(change['new'], [])

dataset.observe(on_change)

checkpointg = widgets.Dropdown(
    options=checkpoints.get(dataset.value, []),
    description='Checkpoint g:',
)

checkpointdo = widgets.Dropdown(
    options=checkpoints.get(dataset.value, []),
    description='Checkpoint do:',
)

alphabet = widgets.Dropdown(
    options=languages,
    description='Language:',
)

button = widgets.Button(
    description="Start training", 
    button_style="success",
)

display(dataset)
display(checkpointg)
display(checkpointdo)
if languages:
  display(alphabet)

Dropdown(description='Dataset:', options=('RCBray',), value='RCBray')

Dropdown(description='Checkpoint g:', options=('do_5000', 'g_5000', 'g_1000', 'do_1000'), value='do_5000')

Dropdown(description='Checkpoint do:', options=('do_5000', 'g_5000', 'g_1000', 'do_1000'), value='do_5000')

In [None]:
#@title Train

dataset_path = os.path.join(dataset_directory, dataset.value)

wavs = os.path.join(dataset_path, "wavs")
output_directory = os.path.join(hifigan_directory, dataset.value)
symbols = load_symbols(os.path.join(alphabet_directory, alphabet.value)) if alphabet.value else DEFAULT_ALPHABET
checkpoint_path = os.path.join(hifigan_directory, dataset.value, checkpoint.value) if checkpoint.value else None
train_hifigan(
     audio_folder=wavs,
    output_directory=output_directory,
    checkpoint_g= os.path.join(hifigan_directory, dataset.value, checkpointg.value),
    checkpoint_do=os.path.join(hifigan_directory, dataset.value, checkpointdo.value),
    epochs=1000,
    batch_size=None,
    iters_per_checkpoint=1000,
    iters_per_backup_checkpoint=10000,
    train_size=0.8,
    logging=logging
)

INFO:root:Setting batch size to 8, learning rate to 0.000282842712474619. (10GB GPU memory free)


3417 train files, 855 test files


INFO:root:Loading /content/drive/MyDrive/Voice-Cloning/hifigan_training/RCBray/g_5000 and /content/drive/MyDrive/Voice-Cloning/hifigan_training/RCBray/do_5000 checkpoints


Epoch: 1


INFO:root:Status - [Epoch 0: Iteration 5001] Loss 26.336 Mel-Spec. Error 0.38162 0.97s/it
INFO:root:Status - [Epoch 0: Iteration 5002] Loss 27.411 Mel-Spec. Error 0.42971 0.95s/it
INFO:root:Status - [Epoch 0: Iteration 5003] Loss 28.341 Mel-Spec. Error 0.44262 0.95s/it
INFO:root:Status - [Epoch 0: Iteration 5004] Loss 30.954 Mel-Spec. Error 0.47179 0.95s/it
INFO:root:Status - [Epoch 0: Iteration 5005] Loss 27.138 Mel-Spec. Error 0.43016 0.95s/it
INFO:root:Status - [Epoch 0: Iteration 5006] Loss 33.879 Mel-Spec. Error 0.51388 0.95s/it
INFO:root:Status - [Epoch 0: Iteration 5007] Loss 33.875 Mel-Spec. Error 0.57576 0.95s/it
INFO:root:Status - [Epoch 0: Iteration 5008] Loss 33.967 Mel-Spec. Error 0.54923 0.95s/it
INFO:root:Status - [Epoch 0: Iteration 5009] Loss 35.422 Mel-Spec. Error 0.51613 0.95s/it
INFO:root:Status - [Epoch 0: Iteration 5010] Loss 33.653 Mel-Spec. Error 0.51063 0.95s/it
INFO:root:Status - [Epoch 0: Iteration 5011] Loss 33.448 Mel-Spec. Error 0.52939 0.95s/it
INFO:root: