In [2]:
#%pip install resampy
import numpy as np
import os
from pprint import pprint
from bark.api import text_to_semantic, semantic_to_waveform, generate_audio
from bark.generation import SAMPLE_RATE, generate_text_semantic, SEMANTIC_RATE_HZ
from IPython.display import Audio
from scipy.io.wavfile import write as write_wav
from datetime import datetime
import torch
import torchaudio
import soundfile
import resampy
import sys

  from .autonotebook import tqdm as notebook_tqdm


We generate voice lines from the metadata of an old version of the [Mozilla CommonVoice dataset](https://www.kaggle.com/datasets/nickj26/common-voice-corpus-1?resource=download&select=validated.tsv). Yes, this is just English for now; I will figure out multilingual later.



This notebook creates a synthetic dataset of audio based on voice line prompts from Mozilla CommonVoice. The purpose of this dataset is to reconstruct the Bark semantic tokens codebook, which will enable us to convert ground-truth audio to a semantic prompt for use in fine-tuning and voice cloning. This notebook provides step-by-step instructions for creating the synthetic dataset and saving it in HuBERT dataset format. Let's get started!

In [2]:
import pandas as pd

# Tab-separated metadata file exported from the CommonVoice corpus.
CV_METADATA_PATH = '../datasets/validated.tsv'

# `delimiter` is the documented alias of `sep` in pandas.read_csv.
df = pd.read_csv(CV_METADATA_PATH, delimiter="\t")

# Show which columns are available as the cell's output.
df.columns

Index(['client_id', 'path', 'sentence', 'up_votes', 'down_votes', 'age',
       'gender', 'accent'],
      dtype='object')

In [3]:
# Deduplicate the prompt sentences (first-appearance order is kept by
# Series.unique) so each voice line is synthesized at most once.
lines = df.loc[:, "sentence"].unique()
lines

array(['To give chalk for cheese', 'Judge may not think so.',
       'I have already described the appearance of that colossal bulk which was embedded in the ground.',
       ..., "How's the forecast for VI",
       'Please look up the Jenny of the Prairie television show.',
       'Find me the creative work The Pickwick Papers'], dtype=object)

There are enough English lines for ~25 hours of audio with unique voice lines; _hopefully_ we'll need less than that.

In [4]:
# Force cu118 generation if available
#%pip install torch torchaudio --force --extra-index-url https://download.pytorch.org/whl/cu118

On an RTX 4090 with a "large" model there's an RTF of approximately 0.75 with Torch 2.0+cu118, before additional optimizations.

TODO: 
- Disable saving logging?

In [38]:
# Generate synthetic speech for the CommonVoice sentences and persist three
# aligned artifacts in OUTPUT_DIR:
#   * one 16 kHz wav file per sentence,
#   * labels.txt   - one line of space-separated semantic tokens per wav,
#   * manifest.tsv - root directory header, then "<wav name>\t<num samples>".
# Row k of labels.txt corresponds to entry k of manifest.tsv.
minutes_generated = 0
minutes_to_generate = 1

# Line index in commonvoice to start with. Useful when resuming
start_line = 0

# Because HuBERT is trained on 16khz data
OUTPUT_SAMPLE_RATE = 16_000
OUTPUT_DIR = '../datasets/en'
# Cap on the sentence part of a wav filename; most filesystems reject names
# longer than 255 bytes.
MAX_FNAME_TEXT_CHARS = 100
resampler = torchaudio.transforms.Resample(orig_freq=SAMPLE_RATE, new_freq=OUTPUT_SAMPLE_RATE)

os.makedirs(OUTPUT_DIR, exist_ok=True)

# A fresh run truncates the output files; a resumed run must append, otherwise
# everything generated before `start_line` would be wiped.
file_mode = "w" if start_line == 0 else "a"

with open(f'{OUTPUT_DIR}/labels.txt', file_mode) as label_file, \
        open(f'{OUTPUT_DIR}/manifest.tsv', file_mode) as manifest_file:
    # The manifest header (dataset root directory) is only written once, at the
    # very beginning of a fresh run.
    if start_line == 0:
        manifest_file.write(str(os.path.abspath(OUTPUT_DIR)) + "\n")

    for i, line in enumerate(lines[start_line:]):
        semantic_tokens = generate_text_semantic(text=line, temp=1)
        waveform_arr = semantic_to_waveform(semantic_tokens)

        # Downsample generated audio to 16khz; unsqueeze adds the leading
        # channel dimension before saving.
        waveform_tensor = torch.from_numpy(waveform_arr)
        resampled_tensor = resampler(waveform_tensor).unsqueeze(0)

        # The sentence is embedded in the filename, so neutralise characters
        # that would break the path (e.g. "/") or the TSV manifest (tabs,
        # newlines), and bound the length.
        safe_text = "".join(
            ch if ch.isalnum() or ch in " -_.,'" else "_" for ch in line
        )[:MAX_FNAME_TEXT_CHARS]
        wav_fname = f"en_{start_line + i}_{safe_text}.wav"
        wav_filepath = f"{OUTPUT_DIR}/{wav_fname}"
        torchaudio.save(wav_filepath, resampled_tensor, OUTPUT_SAMPLE_RATE)

        # Write label and manifest rows only after the audio was saved
        # successfully, so the two files can never get out of step.
        label_file.write(' '.join(map(str, semantic_tokens.tolist())) + "\n")
        manifest_file.write(f"{wav_fname}\t{resampled_tensor.shape[1]}\n")
        label_file.flush()
        manifest_file.flush()

        # Duration is derived from the token count and the semantic token rate.
        seconds_generated = len(semantic_tokens) / SEMANTIC_RATE_HZ

        # Cutoff when sufficient data
        minutes_generated += seconds_generated / 60
        print(f"Minutes of audio: {minutes_generated}")
        if minutes_generated > minutes_to_generate:
            break

100%|██████████| 100/100 [00:00<00:00, 205.14it/s]
100%|██████████| 6/6 [00:01<00:00,  4.02it/s]


Minutes of audio: 0.034068136272545096


100%|██████████| 100/100 [00:00<00:00, 195.24it/s]
100%|██████████| 6/6 [00:01<00:00,  3.93it/s]


Minutes of audio: 0.06980627922511691


100%|██████████| 100/100 [00:01<00:00, 51.11it/s]
100%|██████████| 19/19 [00:07<00:00,  2.54it/s]


Minutes of audio: 0.19639278557114231


100%|██████████| 100/100 [00:00<00:00, 109.94it/s]
100%|██████████| 10/10 [00:03<00:00,  3.25it/s]


Minutes of audio: 0.2615230460921844


100%|██████████| 100/100 [00:00<00:00, 159.01it/s]
100%|██████████| 7/7 [00:01<00:00,  3.54it/s]


Minutes of audio: 0.3069472277889112


100%|██████████| 100/100 [00:00<00:00, 202.22it/s]
100%|██████████| 6/6 [00:01<00:00,  4.10it/s]


Minutes of audio: 0.34134936539746163


100%|██████████| 100/100 [00:00<00:00, 213.91it/s]
100%|██████████| 5/5 [00:01<00:00,  3.63it/s]


Minutes of audio: 0.37474949899799603


100%|██████████| 100/100 [00:02<00:00, 37.67it/s]
100%|██████████| 25/25 [00:10<00:00,  2.38it/s]


Minutes of audio: 0.5414161656646627


100%|██████████| 100/100 [00:00<00:00, 114.07it/s]
100%|██████████| 9/9 [00:02<00:00,  3.32it/s]


Minutes of audio: 0.5988643954575819


100%|██████████| 100/100 [00:01<00:00, 72.65it/s]
100%|██████████| 15/15 [00:05<00:00,  2.88it/s]


Minutes of audio: 0.6930527722110889


100%|██████████| 100/100 [00:00<00:00, 249.29it/s]
100%|██████████| 5/5 [00:01<00:00,  4.30it/s]


Minutes of audio: 0.7207748830995324


100%|██████████| 100/100 [00:00<00:00, 111.29it/s]
100%|██████████| 10/10 [00:03<00:00,  3.33it/s]


Minutes of audio: 0.7822311289245157


100%|██████████| 100/100 [00:00<00:00, 152.38it/s]
100%|██████████| 7/7 [00:01<00:00,  3.51it/s]


Minutes of audio: 0.8279893119572479


100%|██████████| 100/100 [00:00<00:00, 114.71it/s]
100%|██████████| 9/9 [00:02<00:00,  3.20it/s]


Minutes of audio: 0.8877755511022045


100%|██████████| 100/100 [00:01<00:00, 70.00it/s] 
100%|██████████| 15/15 [00:05<00:00,  2.75it/s]


Minutes of audio: 0.9866399465597864


100%|██████████| 100/100 [00:00<00:00, 101.39it/s]
100%|██████████| 11/11 [00:03<00:00,  3.17it/s]


Minutes of audio: 1.0561122244488979


## ONE-OFF: Convert existing dataset to the new format

DELETE THIS after finishing and verifying correctness!

In [12]:
# Create labels
# One-off migration: for every wav in the old dataset folder, make sure the
# audio is 16 kHz (overwriting the file in place if it is not) and rebuild
# labels.txt / manifest.tsv from the per-file .npz token archives.
import glob

old_folder_path = '../datasets/en_old/'
search_pattern = os.path.join(old_folder_path, "*.wav")

OUTPUT_SAMPLE_RATE = 16_000
resampler = torchaudio.transforms.Resample(orig_freq=SAMPLE_RATE, new_freq=OUTPUT_SAMPLE_RATE)
# A Resample transform is built for one specific source rate, so keep one per
# rate actually encountered instead of assuming every file is at SAMPLE_RATE.
resamplers_by_rate = {SAMPLE_RATE: resampler}

with open(os.path.join(old_folder_path, 'labels.txt'), "w") as label_file, \
        open(os.path.join(old_folder_path, 'manifest.tsv'), 'w') as manifest_file:
    # Manifest header: absolute path of the dataset root.
    manifest_file.write(str(os.path.abspath(old_folder_path)) + "\n")

    # Sorted so repeated runs produce the same row order.
    for wav_filename in sorted(glob.glob(search_pattern)):
        basename = os.path.basename(wav_filename)

        # Load the tokens first: if the archive is missing or invalid we fail
        # before touching the wav or writing a manifest row without a label.
        # The slice drops the first two characters and the ".wav" suffix.
        # NOTE(review): assumes the .npz name is the wav name minus a
        # 2-character prefix - confirm against the old naming scheme.
        token_path = os.path.join(old_folder_path, f"{basename[2:-4]}.npz")
        with np.load(token_path) as token_archive:
            semantic_history = token_archive["tokens"]

        # Load file
        wav, sr = torchaudio.load(wav_filename)

        # Convert to 16khz and overwrite original
        if sr != OUTPUT_SAMPLE_RATE:
            if sr not in resamplers_by_rate:
                resamplers_by_rate[sr] = torchaudio.transforms.Resample(
                    orig_freq=sr, new_freq=OUTPUT_SAMPLE_RATE
                )
            wav = resamplers_by_rate[sr](wav)
            torchaudio.save(wav_filename, wav, OUTPUT_SAMPLE_RATE)

        # Manifest row: file name and number of samples at 16 kHz.
        manifest_file.write(f"{basename}\t{wav.shape[1]}\n")
        # Write tokens to label file
        label_file.write(f'{" ".join(map(str, semantic_history.tolist()))}\n')
        manifest_file.flush()
        label_file.flush()

    # Try only one for now
