In [29]:
import numpy as np
import os
from pprint import pprint
from bark.api import text_to_semantic, semantic_to_waveform, generate_audio
from bark.generation import SAMPLE_RATE, generate_text_semantic
from IPython.display import Audio
from scipy.io.wavfile import write as write_wav
from datetime import datetime
import sys

We generate voice lines from the sentence metadata of an old version of the [Mozilla CommonVoice dataset](https://www.kaggle.com/datasets/nickj26/common-voice-corpus-1?resource=download&select=validated.tsv). Yes, this is just English for now; I will figure out multilingual later.



In [13]:
import pandas as pd

# Tab-separated CommonVoice metadata table, relative to this notebook.
CV_METADATA_PATH = '../datasets/validated.tsv'

# read_table parses tab-delimited text by default, so no separator is passed.
df = pd.read_table(CV_METADATA_PATH)

# Show the available column labels as the cell output.
df.columns

Index(['client_id', 'path', 'sentence', 'up_votes', 'down_votes', 'age',
       'gender', 'accent'],
      dtype='object')

In [15]:
# Distinct prompt sentences, de-duplicated in order of first appearance.
lines = pd.unique(df["sentence"])
lines

array(['To give chalk for cheese', 'Judge may not think so.',
       'I have already described the appearance of that colossal bulk which was embedded in the ground.',
       ..., "How's the forecast for VI",
       'Please look up the Jenny of the Prairie television show.',
       'Find me the creative work The Pickwick Papers'], dtype=object)

There are enough English lines for ~25 hours of audio; _hopefully_ we'll need less than that.

In [24]:
# Disabled setup step: when uncommented, force-(re)installs torch with the
# CUDA 11.8 wheel index added as an extra package index.
# NOTE(review): the command does not pin a torch version — consider pinning if re-enabled.
#%pip install torch --force --extra-index-url https://download.pytorch.org/whl/cu118
import torch


Looking in indexes: https://pypi.org/simple, https://download.pytorch.org/whl/cu118
Collecting torch
  Using cached https://download.pytorch.org/whl/cu118/torch-2.0.0%2Bcu118-cp310-cp310-linux_x86_64.whl (2267.3 MB)
Collecting typing-extensions
  Using cached typing_extensions-4.5.0-py3-none-any.whl (27 kB)
Collecting sympy
  Using cached https://download.pytorch.org/whl/sympy-1.11.1-py3-none-any.whl (6.5 MB)
Collecting networkx
  Using cached networkx-3.1-py3-none-any.whl (2.1 MB)
Collecting jinja2
  Using cached https://download.pytorch.org/whl/Jinja2-3.1.2-py3-none-any.whl (133 kB)
Collecting filelock
  Using cached filelock-3.12.0-py3-none-any.whl (10 kB)
Collecting triton==2.0.0
  Using cached https://download.pytorch.org/whl/triton-2.0.0-1-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (63.3 MB)
Collecting cmake
  Using cached cmake-3.26.3-py2.py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (24.0 MB)
Collecting lit
  Downloading lit-16.0.2.tar.gz (137 kB)
[2K

In [26]:
# Disabled experiment: rebinding the two Bark generation functions to
# torch.compile-wrapped versions.
# NOTE(review): presumably a speed optimization still to be evaluated — confirm
# it works with these functions before enabling.
#generate_text_semantic = torch.compile(generate_text_semantic)
#semantic_to_waveform = torch.compile(semantic_to_waveform)

On an RTX 4090 with the "large" model, the real-time factor (RTF) is approximately 0.75 with Torch 2.0+cu118, before any additional optimizations.

In [33]:
# Generate synthetic speech for the CommonVoice sentences until the target
# amount of audio has been produced. For each sentence, both the semantic
# tokens (.npz) and the rendered waveform (.wav) are written to OUTPUT_DIR
# under the same "<index>_<sentence>" stem.
OUTPUT_DIR = "../datasets/en"
TARGET_MINUTES = 60
# Token rate used to estimate audio duration from the number of semantic tokens.
SEMANTIC_TOKENS_PER_SECOND = 49.9
# Cap on the sentence part of the filename, to stay well under common
# filesystem limits on the length of a single file name.
MAX_FILENAME_TEXT_CHARS = 150

# Make sure the destination exists before the first write.
os.makedirs(OUTPUT_DIR, exist_ok=True)

minutes_generated = 0
for i, line in enumerate(lines):
    # unique() can yield a NaN entry for missing sentences; skip anything
    # that is not usable text instead of failing mid-run.
    if not isinstance(line, str) or not line.strip():
        continue

    semantic_tokens = generate_text_semantic(text=line, temp=1)
    waveform_arr = semantic_to_waveform(semantic_tokens)

    # Path separators inside the sentence would be interpreted as directories,
    # so replace them; other characters are kept as-is.
    filename_text = line.replace("/", "_").replace("\\", "_")[:MAX_FILENAME_TEXT_CHARS]
    file_stem = os.path.join(OUTPUT_DIR, f"{i}_{filename_text}")

    semantic_tokens_filepath = f"{file_stem}.npz"
    np.savez(semantic_tokens_filepath, tokens=semantic_tokens)
    # The wav is stored next to the .npz (the previous path was missing the
    # "/" after "en" and wrote into the parent directory instead).
    wav_filepath = f"{file_stem}.wav"
    write_wav(wav_filepath, SAMPLE_RATE, waveform_arr)

    minutes_generated += len(semantic_tokens) / SEMANTIC_TOKENS_PER_SECOND / 60
    print(f"Minutes of audio: {minutes_generated}")

    if minutes_generated > TARGET_MINUTES:
        break

100%|██████████| 100/100 [00:00<00:00, 118.93it/s]
100%|██████████| 8/8 [00:02<00:00,  3.19it/s]


Minutes of audio: 0.050100200400801605


100%|██████████| 100/100 [00:00<00:00, 226.68it/s]
100%|██████████| 4/4 [00:01<00:00,  3.17it/s]


Minutes of audio: 0.07648630594522378


100%|██████████| 100/100 [00:01<00:00, 68.59it/s] 
100%|██████████| 14/14 [00:05<00:00,  2.76it/s]


Minutes of audio: 0.16599866399465596


100%|██████████| 100/100 [00:01<00:00, 94.54it/s]
100%|██████████| 10/10 [00:03<00:00,  2.97it/s]


Minutes of audio: 0.23246492985971942


100%|██████████| 100/100 [00:00<00:00, 126.02it/s]
100%|██████████| 8/8 [00:02<00:00,  3.36it/s]


Minutes of audio: 0.2822311289245157


100%|██████████| 100/100 [00:00<00:00, 350.32it/s]
100%|██████████| 3/3 [00:00<00:00,  3.78it/s]


Minutes of audio: 0.2989311957247829


100%|██████████| 100/100 [00:00<00:00, 200.62it/s]
100%|██████████| 5/5 [00:01<00:00,  3.51it/s]


Minutes of audio: 0.3289913159652638


100%|██████████| 100/100 [00:02<00:00, 49.51it/s]
100%|██████████| 19/19 [00:07<00:00,  2.55it/s]


Minutes of audio: 0.4532398129592518


100%|██████████| 100/100 [00:00<00:00, 253.59it/s]
100%|██████████| 4/4 [00:01<00:00,  3.61it/s]


Minutes of audio: 0.4772879091516366


100%|██████████| 100/100 [00:01<00:00, 66.42it/s] 
100%|██████████| 14/14 [00:05<00:00,  2.72it/s]


Minutes of audio: 0.5684702738810955


100%|██████████| 100/100 [00:01<00:00, 91.37it/s]
100%|██████████| 10/10 [00:03<00:00,  2.99it/s]


Minutes of audio: 0.633934535738143


100%|██████████| 100/100 [00:00<00:00, 296.06it/s]
100%|██████████| 4/4 [00:00<00:00,  4.25it/s]


Minutes of audio: 0.654308617234469


100%|██████████| 100/100 [00:01<00:00, 55.00it/s]
100%|██████████| 17/17 [00:06<00:00,  2.60it/s]


Minutes of audio: 0.7651970607882432


100%|██████████| 100/100 [00:01<00:00, 72.05it/s]
100%|██████████| 13/13 [00:04<00:00,  2.77it/s]


Minutes of audio: 0.850367401469606


100%|██████████| 100/100 [00:00<00:00, 111.18it/s]
100%|██████████| 9/9 [00:02<00:00,  3.30it/s]


Minutes of audio: 0.9061456245824985


100%|██████████| 100/100 [00:01<00:00, 87.33it/s]
100%|██████████| 11/11 [00:03<00:00,  3.00it/s]


Minutes of audio: 0.9772879091516368


100%|██████████| 100/100 [00:02<00:00, 47.88it/s]
100%|██████████| 19/19 [00:07<00:00,  2.56it/s]


Minutes of audio: 1.1012024048096194
