In [2]:
#%pip install resampy
import numpy as np
import os
from pprint import pprint
from bark.api import text_to_semantic, semantic_to_waveform, generate_audio
from bark.generation import SAMPLE_RATE, generate_text_semantic, SEMANTIC_RATE_HZ
from IPython.display import Audio
from scipy.io.wavfile import write as write_wav
from datetime import datetime
import torch
import torchaudio
import soundfile
import resampy
import sys

  from .autonotebook import tqdm as notebook_tqdm


## Generate synthetic dataset

This notebook creates a synthetic dataset of (audio, semantic tokens) pairs based on voice line prompts from Mozilla CommonVoice. The purpose of this dataset is to reconstruct the Bark semantic tokens codebook, which will enable us to convert ground-truth audio to a semantic prompt for use in fine-tuning and voice cloning. This notebook provides step-by-step instructions for creating the synthetic dataset and saving it in Fairseq dataset format. Let's get started!


For prototyping, we generate voice lines based on the metadata of an old version of the [Mozilla CommonVoice dataset](https://www.kaggle.com/datasets/nickj26/common-voice-corpus-1?resource=download&select=validated.tsv). This is far from ideal; down the pike, we need a much larger dataset with more diverse voice lines, including multilingual and non-spoken ones.

In [3]:
import pandas as pd

# Location of the CommonVoice metadata table (tab-separated).
CV_METADATA_PATH = '../datasets/validated.tsv'

# Parse the TSV into a DataFrame; the tab delimiter is passed explicitly.
df = pd.read_csv(
    CV_METADATA_PATH,
    sep="\t",
)

# Show the available columns as the cell output.
df.columns

Index(['client_id', 'path', 'sentence', 'up_votes', 'down_votes', 'age',
       'gender', 'accent'],
      dtype='object')

In [4]:
# Collect the distinct prompt sentences, in order of first appearance.
# Positions in this array are what `start_line` indexes into when resuming.
sentence_column = df["sentence"]
lines = sentence_column.unique()

# Preview the array as the cell output.
lines

array(['To give chalk for cheese', 'Judge may not think so.',
       'I have already described the appearance of that colossal bulk which was embedded in the ground.',
       ..., "How's the forecast for VI",
       'Please look up the Jenny of the Prairie television show.',
       'Find me the creative work The Pickwick Papers'], dtype=object)

There are enough English lines for ~25 hours of audio with unique voice lines; _hopefully_ we'll need less than that.

In [5]:
# Force cu118 generation if available
#%pip install torch torchaudio --force --extra-index-url https://download.pytorch.org/whl/cu118

In [None]:
# Index into `lines` at which generation begins; bump this when resuming
# an interrupted run so already-processed sentences are skipped.
start_line = 10307

# Audio budget for this run, in minutes (three hours).
minutes_to_generate = 60 * 3

In [10]:
%%capture log
minutes_generated = 0

label_file = open('../datasets/en/labels.txt', "a")
manifest_file = open('../datasets/en/manifest.tsv', 'a')
# Give TSV header at beginning.
# No, this isn't robust. Too bad!
if start_line == 0:
    manifest_file.write(str(os.path.abspath("../datasets/en")) + "\n")

# Because HuBERT is trained on 16khz data
OUTPUT_SAMPLE_RATE = 16_000
resampler = torchaudio.transforms.Resample(orig_freq=SAMPLE_RATE, new_freq=OUTPUT_SAMPLE_RATE)

for i, line in enumerate(lines[start_line:]):
    try:
        semantic_tokens = generate_text_semantic(text=line, temp=1)
        waveform_arr = semantic_to_waveform(semantic_tokens)

        # Persist sequence to new line
        label_file.write(' '.join(list(map(str, semantic_tokens.tolist()))) + "\n")
        label_file.flush()

        # Downsample generated audio to 16khz and save 
        waveform_tensor = torch.from_numpy(waveform_arr)
        resampled_tensor = resampler(waveform_tensor).unsqueeze(0)
        wav_fname = f"en_{start_line + i}_{line}.wav"
        wav_filepath = f"../datasets/en/{wav_fname}"
        torchaudio.save(wav_filepath, resampled_tensor, OUTPUT_SAMPLE_RATE)

        # Log info to manifest
        seconds_generated = len(semantic_tokens) / SEMANTIC_RATE_HZ
        manifest_file.write(f"{wav_fname}\t{resampled_tensor.shape[1]}" + "\n")
        manifest_file.flush()

        # Cutoff when sufficient data
        minutes_generated += seconds_generated / 60
        print(f"Minutes of audio: {minutes_generated}")
        if minutes_generated > minutes_to_generate:
            break
    except:
        pass

## ONE-OFF: Convert existing model to new

DELETE THIS after finishing and verifying correctness!

In [12]:
# One-off migration of ../datasets/en_old: convert every .wav to 16 kHz in place
# and rebuild labels.txt / manifest.tsv from the stored semantic tokens.
# Both output files are rewritten from scratch on every run.
import glob

old_folder_path = '../datasets/en_old/'
search_pattern = os.path.join(old_folder_path, "*.wav")

OUTPUT_SAMPLE_RATE = 16_000
resampler = torchaudio.transforms.Resample(orig_freq=SAMPLE_RATE, new_freq=OUTPUT_SAMPLE_RATE)

labels_path = os.path.join(old_folder_path, 'labels.txt')
manifest_path = os.path.join(old_folder_path, 'manifest.tsv')

# Context managers guarantee both files are closed, even if a file fails to convert.
with open(labels_path, "w") as label_file, open(manifest_path, "w") as manifest_file:
    # Manifest header: absolute path of the dataset root.
    manifest_file.write(str(os.path.abspath("../datasets/en_old")) + "\n")

    # Sorted so the output order is reproducible across runs and file systems.
    for wav_filename in sorted(glob.glob(search_pattern)):
        basename = os.path.basename(wav_filename)

        # Load the tokens first: if the companion archive is missing we fail
        # before touching the audio or writing a manifest entry without a label.
        # The archive name is the wav name minus its first two characters and
        # the ".wav" suffix (NOTE(review): confirm against the old naming scheme).
        tokens_path = os.path.join(old_folder_path, f"{basename[2:-4]}.npz")
        with np.load(tokens_path) as token_archive:
            semantic_history = token_archive["tokens"]

        # Convert to 16khz and overwrite original
        wav, sr = torchaudio.load(wav_filename)
        if sr != OUTPUT_SAMPLE_RATE:
            # Resample from the rate the file actually has; the shared resampler
            # is only valid for audio recorded at SAMPLE_RATE.
            if sr == SAMPLE_RATE:
                file_resampler = resampler
            else:
                file_resampler = torchaudio.transforms.Resample(
                    orig_freq=sr, new_freq=OUTPUT_SAMPLE_RATE
                )
            wav = file_resampler(wav)
            torchaudio.save(wav_filename, wav, OUTPUT_SAMPLE_RATE)

        # Write the manifest entry and the label line together so they stay aligned.
        manifest_file.write(f"{basename}\t{wav.shape[1]}\n")
        manifest_file.flush()
        label_file.write(f'{" ".join(list(map(str, semantic_history.tolist())))}\n')
        label_file.flush()
