In [1]:
#%pip install fairseq

In [31]:
import bark

In [32]:
from bark.generation import load_codec_model, generate_text_semantic
from encodec.utils import convert_audio

import torchaudio
import torch

# Load the codec model. Only request the GPU when CUDA is actually available,
# so the notebook can also start on a CPU-only machine.
model = load_codec_model(use_gpu=torch.cuda.is_available())

In [33]:
# Load and pre-process the audio waveform
# Path of the audio you want to clone. It will get truncated, so 5-10 seconds
# is probably fine (existing samples that were checked are around 7 seconds).
audio_filepath = 'christopher_lee.wav'
# Prefer the GPU, but fall back to the CPU instead of failing when CUDA is missing.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
wav, sr = torchaudio.load(audio_filepath)
# Convert to the sample rate and channel count reported by the codec model.
wav = convert_audio(wav, sr, model.sample_rate, model.channels)
# Add a leading axis (unsqueeze(0)) and move the tensor to the chosen device.
wav = wav.unsqueeze(0).to(device)

In [34]:
# Extract discrete codes from EnCodec
with torch.no_grad():
    encoded_frames = model.encode(wav)
# Join the per-frame codes along the last axis, then drop ONLY the leading axis
# that was added with unsqueeze(0). A bare .squeeze() would also silently remove
# any other axis that happens to have length 1.
codes = torch.cat([encoded[0] for encoded in encoded_frames], dim=-1).squeeze(0)  # [n_q, T]

In [14]:
import os
import subprocess
import sys

import importlib.util

# Unfortunately, fairseq kmeans package resolution is borked on my machine, so manually adding it
# TODO: Fix this
# Locate the installed fairseq package itself rather than assuming it lives in
# <git root>/venv/lib/python3.10/site-packages: that assumption breaks for any
# other virtualenv location or Python version, and needs git on the PATH.
fairseq_spec = importlib.util.find_spec("fairseq")
if fairseq_spec is None or fairseq_spec.origin is None:
    raise ImportError("fairseq is not installed in this environment; run `pip install fairseq` first")
fairseq_package_dir = os.path.dirname(fairseq_spec.origin)

# Append the desired subdirectory
feature_utils_path = os.path.join(fairseq_package_dir, "examples", "hubert", "simple_kmeans")

# Add the path to sys.path (only once, so re-running this cell does not pile up duplicates)
if feature_utils_path not in sys.path:
    sys.path.append(feature_utils_path)
from fairseq.examples.hubert.simple_kmeans.dump_hubert_feature import HubertFeatureReader


2023-05-16 21:59:59 | INFO | fairseq.tasks.text_to_speech | Please install tensorboardX: pip install tensorboardX


In [15]:
# TODO: DELETE THIS
from torch.hub import download_url_to_file

# Single source of truth for the checkpoint location (used for both the
# download target and the reader below).
hubert_ckpt_path = os.path.join("models", "hubert_base_ls960.pt")

if not os.path.exists(hubert_ckpt_path):
    # Make sure the target directory exists before downloading into it.
    os.makedirs(os.path.dirname(hubert_ckpt_path), exist_ok=True)
    # Yes, hard-coding the URL of the model is jank. Too bad!
    # Update this if this changes! https://github.com/facebookresearch/textlesslib/blob/698e6a039375bac0cd5f1b8683beeec5e8f702c0/textless/checkpoint_manager/__init__.py#L20
    download_url_to_file("https://dl.fbaipublicfiles.com/hubert/hubert_base_ls960.pt", hubert_ckpt_path)

# Build the feature reader from the downloaded checkpoint, configured with layer=6.
reader = HubertFeatureReader(
    ckpt_path=hubert_ckpt_path,
    layer=6
)

2023-05-16 22:00:06 | INFO | fairseq.tasks.hubert_pretraining | current directory is /home/ritsuko/projects/ai/audio/bark
2023-05-16 22:00:06 | INFO | fairseq.tasks.hubert_pretraining | HubertPretrainingTask Config {'_name': 'hubert_pretraining', 'data': '/checkpoint/wnhsu/data/librispeech/960h/iter/250K_50hz_km100_mp0_65_v2', 'fine_tuning': False, 'labels': ['layer6.km500'], 'label_dir': None, 'label_rate': 50.0, 'sample_rate': 16000, 'normalize': False, 'enable_padding': False, 'max_keep_size': None, 'max_sample_size': 250000, 'min_sample_size': 32000, 'single_target': False, 'random_crop': True, 'pad_audio': False}
2023-05-16 22:00:06 | INFO | fairseq.models.hubert.hubert | HubertModel Config: {'_name': 'hubert', 'label_rate': 50.0, 'extractor_mode': default, 'encoder_layers': 12, 'encoder_embed_dim': 768, 'encoder_ffn_embed_dim': 3072, 'encoder_attention_heads': 12, 'activation_fn': gelu, 'layer_type': transformer, 'dropout': 0.1, 'attention_dropout': 0.1, 'activation_dropout': 0.0

In [16]:
# Run the HuBERT feature reader (configured with layer=6) on the same audio file
# that was loaded for the codec model.
# NOTE(review): the shape/device of the returned features is not visible here — confirm.
semantic_hubert_feats = reader.get_feats(audio_filepath)


In [17]:
import torch
import torch.nn as nn

class hubert_to_wte_projection(nn.Module):
    """Single linear layer projecting `input_dim`-sized features to `output_dim`.

    Judging by the name, this maps HuBERT features into a "wte" embedding
    space; the module itself is just one ``nn.Linear``.

    Args:
        input_dim: Size of the last dimension of the incoming features.
        output_dim: Size of the last dimension of the projected output.
    """

    def __init__(self, input_dim=768, output_dim=1024):
        super().__init__()
        # The attribute must stay named `proj`: state_dict keys are derived from it.
        self.proj = nn.Linear(in_features=input_dim, out_features=output_dim)

    def forward(self, x):
        """Return the linear projection of ``x``."""
        projected = self.proj(x)
        return projected

# Instantiate the projection, place it on the working device and restore its weights.
proj = hubert_to_wte_projection()
proj.to(device)
proj.eval()  # used for inference only
# map_location keeps the load working when the checkpoint was saved from a
# different device than the one in use now (e.g. saved on GPU, loaded on CPU).
# NOTE: torch.load unpickles the file, so only load checkpoints you trust.
proj.load_state_dict(torch.load(os.path.join("models", "hubert_bark_proj_20230511-220924.pt"), map_location=device))

<All keys matched successfully>

In [18]:
with torch.no_grad():
    # The device of the reader's output is not controlled in this notebook, so
    # move the features to the projection's device explicitly (a no-op when
    # they are already there) before projecting; the result is returned as a
    # NumPy array on the CPU.
    semantic_emb = proj(semantic_hubert_feats.to(device)).cpu().numpy()

In [37]:
# move codes to cpu
# Guarded so this cell can be re-run: after the first run `codes` is already a
# NumPy array, which has no .cpu() method.
if torch.is_tensor(codes):
    codes = codes.cpu().numpy()

In [38]:
import numpy as np
voice_name = 'lee_2' # whatever you want the name of the voice to be
output_path = 'bark/assets/prompts/' + voice_name + '.npz'

# NOTE(review): no visible cell assigns `semantic_tokens` (only `semantic_emb`
# is computed), so on a fresh kernel this cell cannot succeed. Fail with an
# actionable message instead of a bare NameError until the missing conversion
# step is restored.
if 'semantic_tokens' not in globals():
    raise NameError(
        "semantic_tokens is not defined: compute the semantic tokens for the "
        "prompt before saving the voice file"
    )

# Make sure the prompt directory exists before writing into it.
prompt_dir = os.path.dirname(output_path)
if prompt_dir:
    os.makedirs(prompt_dir, exist_ok=True)

# The coarse prompt reuses the first two rows of the fine codes.
np.savez(output_path, fine_prompt=codes, coarse_prompt=codes[:2, :], semantic_prompt=semantic_tokens)

In [None]:
# That's it! Now you can head over to the generate.ipynb and use your voice_name for the 'history_prompt'

In [None]:
# Here's the generation stuff, copy-pasted for convenience

In [5]:
from bark.api import generate_audio
from transformers import BertTokenizer
from bark.generation import SAMPLE_RATE, preload_models, codec_decode, generate_coarse, generate_fine, generate_text_semantic

# Text to synthesize and the speaker (history prompt) to condition on.
text_prompt = (
    "Hello, my name is Serpy. And, uh — and I like pizza."
)
# Swap in your own custom voice name here if you have one.
voice_name = "lee"

# Load the pretrained multilingual BERT tokenizer.
tokenizer = BertTokenizer.from_pretrained(
    "bert-base-multilingual-cased",
)

In [2]:
# Download and load all models.
# Every stage is asked to use the GPU with the non-small variant; nothing is
# force-reloaded, and "models" is passed as the path argument.
preload_options = {
    "text_use_gpu": True,
    "text_use_small": False,
    "coarse_use_gpu": True,
    "coarse_use_small": False,
    "fine_use_gpu": True,
    "fine_use_small": False,
    "codec_use_gpu": True,
    "force_reload": False,
    "path": "models",
}
preload_models(**preload_options)

In [2]:
# Simple generation: one call that goes from the text prompt to an audio array,
# conditioned on the chosen voice.
audio_array = generate_audio(
    text_prompt,
    history_prompt=voice_name,
    text_temp=0.7,
    waveform_temp=0.7,
)

100%|██████████| 100/100 [00:04<00:00, 24.12it/s]
100%|██████████| 32/32 [00:11<00:00,  2.77it/s]


1866
Generated coarse tokens: 1866, predicted: 1866


In [20]:
# Generation with more control: run the semantic, coarse and fine stages by
# hand, then decode the fine output into audio.

# The semantic and coarse stages share the same sampling settings.
shared_sampling = dict(temp=0.7, top_k=50, top_p=0.95)

x_semantic = generate_text_semantic(text_prompt, history_prompt=voice_name, **shared_sampling)

x_coarse_gen = generate_coarse(x_semantic, history_prompt=voice_name, **shared_sampling)

# The fine stage only takes a temperature here.
x_fine_gen = generate_fine(x_coarse_gen, history_prompt=voice_name, temp=0.5)

audio_array = codec_decode(x_fine_gen)

100%|██████████| 100/100 [00:03<00:00, 31.73it/s]
100%|██████████| 23/23 [00:11<00:00,  1.96it/s]


1378
Generated coarse tokens: 1378, predicted: 1378


In [21]:
from IPython.display import Audio
# play audio
# Kept as the cell's last expression so the notebook displays the resulting
# Audio object; SAMPLE_RATE is the rate imported from bark.generation.
Audio(audio_array, rate=SAMPLE_RATE)

In [20]:
import os

from scipy.io.wavfile import write as write_wav
# save audio
filepath = "output/lee_clone_baseline.wav" # change this to your desired output path
# Create the destination folder first: writing the file fails if the directory
# does not exist yet. Skipped when filepath is a bare filename (no directory part).
output_dir = os.path.dirname(filepath)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)
write_wav(filepath, SAMPLE_RATE, audio_array)