In [1]:
from bark.api import generate_audio
from transformers import BertTokenizer
from bark.generation import SAMPLE_RATE, preload_models, codec_decode, generate_coarse, generate_fine, generate_text_semantic

# Enter your prompt and speaker here
# `text_prompt` is the text handed to the generation calls in later cells;
# `voice_name` is passed to those same calls as `history_prompt`.
text_prompt = "Hello, my name is Serpy. And, uh — and I like pizza. [laughs]"
voice_name = "speaker_0" # use your custom voice name here if you have one

# load the tokenizer
# The recorded output of this cell shows vocab.txt, tokenizer_config.json and
# config.json being downloaded by this call.
# NOTE(review): `tokenizer` is not referenced by any other cell visible in this
# notebook — confirm it is still needed before keeping the download.
tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased")

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

In [2]:
# Download (when missing) and load all models in a single call.
# The options are gathered in one mapping so they can be tuned in one place.
# GPU use is requested for every component; the recorded output of this cell
# nevertheless reports that no GPU was used on the machine that ran it.
preload_options = {
    "text_use_gpu": True,
    "text_use_small": False,
    "coarse_use_gpu": True,
    "coarse_use_small": False,
    "fine_use_gpu": True,
    "fine_use_small": False,
    "codec_use_gpu": True,
    "force_reload": False,
    "path": "models",
}
preload_models(**preload_options)

No GPU being used. Careful, inference might be very slow!


Downloading text_2.pt:   0%|          | 0.00/5.35G [00:00<?, ?B/s]

Downloading coarse_2.pt:   0%|          | 0.00/3.93G [00:00<?, ?B/s]

Downloading fine_2.pt:   0%|          | 0.00/3.74G [00:00<?, ?B/s]

Downloading: "https://dl.fbaipublicfiles.com/encodec/v0/encodec_24khz-d7cc33bc.th" to /Users/bm/.cache/torch/hub/checkpoints/encodec_24khz-d7cc33bc.th
100%|██████████| 88.9M/88.9M [10:47<00:00, 144kB/s]   


In [3]:
# Simple generation: one call from the text prompt to an audio array.
# Both temperatures are set to 0.7.
audio_array = generate_audio(
    text_prompt,
    history_prompt=voice_name,
    text_temp=0.7,
    waveform_temp=0.7,
)

100%|██████████| 100/100 [00:52<00:00,  1.92it/s]
100%|██████████| 36/36 [03:24<00:00,  5.69s/it]


In [4]:
# Generation with more control: call each stage explicitly, then decode.
# NOTE: this reassigns `audio_array`, replacing the result of the simple
# generation cell if that cell was run first.

# Sampling settings shared by the semantic and coarse stages.
sampling_opts = dict(temp=0.7, top_k=50, top_p=0.95)

# Stage 1: semantic output from the text prompt.
x_semantic = generate_text_semantic(text_prompt, history_prompt=voice_name, **sampling_opts)

# Stage 2: coarse output from the semantic output.
x_coarse_gen = generate_coarse(x_semantic, history_prompt=voice_name, **sampling_opts)

# Stage 3: fine output from the coarse output (uses its own temperature).
x_fine_gen = generate_fine(x_coarse_gen, history_prompt=voice_name, temp=0.5)

# Decode the fine output into the final audio array.
audio_array = codec_decode(x_fine_gen)

100%|██████████| 100/100 [04:33<00:00,  2.74s/it]
100%|██████████| 25/25 [30:19<00:00, 72.78s/it]


In [7]:
# inspect the generated audio array
# (this cell only displays the value — nothing is written to disk here; the
# recorded output shows a float32 array)
audio_array

array([ 4.8529985e-04, -5.3261057e-05,  1.4330441e-04, ...,
        3.2730494e-04,  3.3824064e-04,  3.4972595e-04], dtype=float32)

In [None]:
from IPython.display import Audio
# Render an inline audio player for the generated array at the model's sample rate.
Audio(data=audio_array, rate=SAMPLE_RATE)

In [11]:
from scipy.io.wavfile import write as write_wav
# save audio
from pathlib import Path

filepath = "./output/audio.wav" # change this to your desired output path

# write_wav does not create missing directories, so make sure the target
# folder exists first; otherwise a fresh checkout without ./output fails
# with FileNotFoundError. exist_ok=True keeps re-runs of this cell harmless.
Path(filepath).parent.mkdir(parents=True, exist_ok=True)

# Write `audio_array` to `filepath` as a WAV file at SAMPLE_RATE.
write_wav(filepath, SAMPLE_RATE, audio_array)