In [1]:
# Uncomment when running for first time!
#%pip install fairseq npy_append_array h5py

In [1]:
import logging
import math
import os
import subprocess
import sys

import fairseq
import h5py
import numpy as np
import soundfile as sf
import torch
import torch.nn.functional as F
import tqdm
from npy_append_array import NpyAppendArray
from torch.hub import download_url_to_file

2023-05-08 10:05:59 | INFO | fairseq.tasks.text_to_speech | Please install tensorboardX: pip install tensorboardX


# Derive Bark semantic token codebook

This notebook embeds a synthetic dataset of wav files <-> semantic token mappings and rederives the "means" for each token, allowing for later routine k-means inference of "ground truth" audio into semantic prompts. This allows for voice cloning and so giving voice to the dead, destroying consensus reality and creating misinformation, destroying copyright, spreading systemic bias, and other similarly enjoyable ways to spend a Saturday afternoon.

## Dump HuBERT features

In this step, we use Fairseq's [Sharded HuBERT feature extraction](https://github.com/facebookresearch/fairseq/tree/main/examples/hubert/simple_kmeans) to obtain embeddings of the audio corresponding to each semantic token step (HuBERT features are 50hz by default, coincidence???).

In [2]:
if not os.path.exists("../models/hubert_base_ls960.pt"):
    # Yes, hard-coding the URL of the model is jank. Too bad!
    # Update this if this changes! https://github.com/facebookresearch/textlesslib/blob/698e6a039375bac0cd5f1b8683beeec5e8f702c0/textless/checkpoint_manager/__init__.py#L20
    download_url_to_file("https://dl.fbaipublicfiles.com/hubert/hubert_base_ls960.pt", "../models/hubert_base_ls960.pt")

In [3]:
# Unfortunately, fairseq kmeans package resolution is borked on my machine, so manually adding it
# Get the git repo root directory
git_root = subprocess.check_output(["git", "rev-parse", "--show-toplevel"]).strip().decode("utf-8")

# Append the desired subdirectory
feature_utils_path = os.path.join(git_root, "venv", "lib", "python3.10", "site-packages", "fairseq", "examples", "hubert", "simple_kmeans")

# Add the path to sys.path
sys.path.append(feature_utils_path)

In [10]:
dataset_dir = os.path.join("..", "datasets", "en")
feature_dir = os.path.join("..", "datasets", "en_features")
NSHARDS=8

In [82]:
from fairseq.examples.hubert.simple_kmeans import feature_utils
from fairseq.examples.hubert.simple_kmeans import dump_hubert_feature

for i in range(0, NSHARDS):
    dump_hubert_feature.main(
        tsv_dir=dataset_dir,
        split="manifest",
        ckpt_path=os.path.join("..", "models", "hubert_base_ls960.pt"),
        layer=6,
        # Shards: number of fractions, rank: individual fraction
        nshard=NSHARDS,
        rank=i,
        feat_dir=feature_dir,
        max_chunk=1_600_000
    )

2023-05-08 14:24:14 | INFO | fairseq.tasks.hubert_pretraining | current directory is /home/ritsuko/projects/ai/audio/bark/notebooks
2023-05-08 14:24:14 | INFO | fairseq.tasks.hubert_pretraining | HubertPretrainingTask Config {'_name': 'hubert_pretraining', 'data': '/checkpoint/wnhsu/data/librispeech/960h/iter/250K_50hz_km100_mp0_65_v2', 'fine_tuning': False, 'labels': ['layer6.km500'], 'label_dir': None, 'label_rate': 50.0, 'sample_rate': 16000, 'normalize': False, 'enable_padding': False, 'max_keep_size': None, 'max_sample_size': 250000, 'min_sample_size': 32000, 'single_target': False, 'random_crop': True, 'pad_audio': False}
2023-05-08 14:24:14 | INFO | fairseq.models.hubert.hubert | HubertModel Config: {'_name': 'hubert', 'label_rate': 50.0, 'extractor_mode': default, 'encoder_layers': 12, 'encoder_embed_dim': 768, 'encoder_ffn_embed_dim': 3072, 'encoder_attention_heads': 12, 'activation_fn': gelu, 'layer_type': transformer, 'dropout': 0.1, 'attention_dropout': 0.1, 'activation_dro

## Extract Bark wte layer

I'd rather not load the whole Bark semantic model when (for now)  we're just using the input embeddings, so let's save it to a new checkpoint:

In [5]:
from bark.generation import load_model
import torch.nn as nn

class BarkWTE(nn.Module):
    def __init__(self, input_vocab_size=129_600, block_size=1024):
        super().__init__()
        self.wte = nn.Embedding(input_vocab_size, block_size)
    
    def forward(self, x):
        return self.wte(x)

bark_wte = BarkWTE()
bark_wte_path = "../models/bark_wte.pt"
if not os.path.exists(bark_wte_path):
    # Export out the WTE layer: we don't need the rest of the model in VRAM
    model_container = load_model(use_gpu=True, model_type="text")
    model = model_container["model"]
    # Get the state_dict of the source model
    source_state_dict = model.transformer.state_dict()

    # Create a new state_dict containing only the wte layer's weights
    wte_state_dict = {k: v for k, v in source_state_dict.items() if 'wte' in k}

    # Load the wte layer's weights into the target layer of the destination model
    bark_wte.load_state_dict(wte_state_dict)
    torch.save(bark_wte.state_dict(), bark_wte_path)
else:
    bark_wte.load_state_dict(torch.load(bark_wte_path))

  from .autonotebook import tqdm as notebook_tqdm


## Create HuBERT embedding to Bark input embedding dataset

In [6]:
# Yes, this is inefficient, but for n ~= 10^4 generations, it's probably not worth doing something fancier
# Revisit this if it becomes a bottleneck!
label_path = os.path.join(dataset_dir, "labels.txt")

with open(label_path, 'r') as file:
    data = []
    for line in file:
        # Split the line by whitespace and convert each item to an integer
        int_list = [int(x) for x in line.split()]
        data.append(int_list)

# Convert the list of lists to a NumPy array
token_array = np.array(data)

  token_array = np.array(data)


In [80]:
def save_shard_to_hdf5(
    f,
    feat_dir,
    split,
    rank, 
    shards,
    track_idx_start,
    tokens_by_track
):
    logger = logging.getLogger("partition_by_token")

    wav_feat_path = f"{feat_dir}/{split}_{rank}_{NSHARDS}.npy"
    wav_leng_path = f"{feat_dir}/{split}_{rank}_{NSHARDS}.len"

    with open(wav_leng_path, "r") as outfile:
        # List of lengths for each track on the shard
        wav_lengs = np.loadtxt(wav_leng_path, dtype=int)
        # Feature start indices for each track on the shard
        wav_feature_offsets = np.hstack(([0], np.cumsum(wav_lengs[:-1])))
    
    wav_features = np.load(wav_feat_path)
    logger.info(f"Processing {len(wav_lengs)} tracks, {len(wav_features)} token-embed maps")

    track_idx = track_idx_start

    hubert_emb_group = f.create_group('hubert_embeddings')
    token_emb_group = f.create_group('token_embeddings')
    initial_shape_hubert_emb = (len(wav_features), 768)
    initial_shape_token_emb = (len(wav_features), 1024)

    hubert_dataset = hubert_emb_group.create_dataset(
        'data', 
        shape=initial_shape_hubert_emb,
        chunks=True,
        dtype=wav_features.dtype,
    )
    tokens_dataset = token_emb_group.create_dataset(
        'data', 
        shape=initial_shape_token_emb,
        chunks=True,
        dtype=wav_features.dtype
    )
    f.attrs["num_embeddings"] = 0

    logger.debug("Datasets created")
    # For each track on the shard:
    for (i, leng) in enumerate(wav_lengs):
        track_tokens = tokens_by_track[track_idx]
        logger.info(f"Track {track_idx}: {len(track_tokens)} semantic, {leng} embeddings")
        # Hertz for semantic tokens and features should be same, modulo a final padding token
        if len(track_tokens) > leng + 1:
            logger.error(f"Track {track_idx} mismatch: {len(track_tokens)} semantic but {leng} embeddings")
        assert len(track_tokens) <= leng + 1

        # Get Bark input embedding for track's semantic tokens
        x = torch.tensor(track_tokens)
        with torch.no_grad():
            track_wte_embed = bark_wte(x)
        
        track_hubert_embed  = wav_features[wav_feature_offsets[i]:wav_feature_offsets[i] + leng, :]
        logger.debug(f"Track {i}: {track_hubert_embed.shape} hubert, {track_wte_embed.shape} wte")

        # Persist
        """
        for i, (token, hub) in enumerate(zip(track_wte_embed.numpy(), track_hubert_embed)):
            # Add to hdf5 file
            hubert_dataset[i, :] = hub
            tokens_dataset[i, :] = token
            f.attrs["num_embeddings"] += 1
        """
        # Save the entire batch of embeddings for the current track
        min_length = min(track_hubert_embed.shape[0], track_wte_embed.shape[0])
        track_hubert_embed = track_hubert_embed[:min_length, :]
        track_wte_embed = track_wte_embed.numpy()[:min_length, :]

        start_idx = f.attrs["num_embeddings"]
        end_idx = start_idx + min_length
        
        hubert_dataset[start_idx:end_idx, :] = track_hubert_embed
        tokens_dataset[start_idx:end_idx, :] = track_wte_embed

        # Update the total number of embeddings
        f.attrs["num_embeddings"] += min_length


        track_idx += 1

    return track_idx

In [8]:
def count_datasets(group):
    return sum(1 for _ in group.values() if isinstance(_, h5py.Dataset))

def partition_by_token(feat_dir, split, shards, tokens_by_track, rank_start=0):
    # HORRIBLE BAD FIX THIS
    track_idx = 0
    logger = logging.getLogger("partition_by_token")

    for shard_idx in range(rank_start, NSHARDS):
        hdf5_file_path = os.path.join(feat_dir, f"embeds_by_token_{shard_idx}_{shards}.h5")
        with h5py.File(hdf5_file_path, 'a') as hdf5_file:
            new_idx = save_shard_to_hdf5(
                f=hdf5_file,
                feat_dir=feat_dir,
                split=split,
                rank=shard_idx,
                shards=shards,
                track_idx_start=track_idx,
                tokens_by_track=tokens_by_track
            )
            logger.info(f"Shard {shard_idx} processed")
            track_idx = new_idx
            hdf5_file.close()
        

In [83]:
logging.basicConfig(
    format="%(asctime)s | %(levelname)s | %(name)s | %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
    level=os.environ.get("LOGLEVEL", "INFO").upper(),
    stream=sys.stdout,
)

partition_by_token(
    feat_dir=feature_dir,
    split="manifest",
    rank_start=0,
    shards=1,
    tokens_by_track=token_array
)

2023-05-08 14:26:32 | INFO | partition_by_token | Processing 1628 tracks, 314127 token-embed maps
2023-05-08 14:26:32 | INFO | partition_by_token | Track 0: 304 semantic, 303 embeddings
2023-05-08 14:26:32 | INFO | partition_by_token | Track 1: 198 semantic, 197 embeddings
2023-05-08 14:26:32 | INFO | partition_by_token | Track 2: 437 semantic, 437 embeddings
2023-05-08 14:26:32 | INFO | partition_by_token | Track 3: 316 semantic, 315 embeddings
2023-05-08 14:26:32 | INFO | partition_by_token | Track 4: 95 semantic, 94 embeddings
2023-05-08 14:26:32 | INFO | partition_by_token | Track 5: 69 semantic, 68 embeddings
2023-05-08 14:26:33 | INFO | partition_by_token | Track 6: 109 semantic, 108 embeddings
2023-05-08 14:26:33 | INFO | partition_by_token | Track 7: 420 semantic, 420 embeddings
2023-05-08 14:26:33 | INFO | partition_by_token | Track 8: 224 semantic, 223 embeddings
2023-05-08 14:26:33 | INFO | partition_by_token | Track 9: 135 semantic, 134 embeddings
2023-05-08 14:26:33 | INFO

AssertionError: 

## Dataloader

Finally, we write a PyTorch dataloader for our embedding mappings:

In [89]:
from torch.utils.data import Dataset, DataLoader, random_split

class EmbeddingTranslationDataset(Dataset):
    def __init__(self, nshards, feature_dir):
        self.filename_list = [f"embeds_by_token_{i}_{nshards}.h5" for i in range(0, nshards)]
        self.files = []

        for file_path in self.filename_list:
            file = h5py.File(os.path.join(feature_dir, file_path), 'r')
            self.files.append(file)
    
    def __len__(self):
        return sum([f.attrs['num_embeddings'] for f in self.files])

    def __getitem__(self, idx):
        for file in self.files:
            num_embeddings = file.attrs['num_embeddings']
            if idx < num_embeddings:
                embed_768 = file['hubert_embeddings/data'][idx]
                label = file['token_embeddings/data'][idx]
                return torch.tensor(embed_768, dtype=torch.float), torch.tensor(label, dtype=torch.float)
            idx -= num_embeddings

    def close(self):
        for file in self.files:
            file.close()

In [95]:
dataset = EmbeddingTranslationDataset(
    nshards=1,
    feature_dir=os.path.join('..', 'datasets', 'en_features')
)

train_ratio = 0.8
val_ratio = 0.1
test_ratio = 0.1

total_samples = len(dataset)
train_size = int(train_ratio * total_samples)
val_size = int(val_ratio * total_samples)
test_size = total_samples - train_size - val_size

# Create random splits
train_dataset, val_dataset, test_dataset = random_split(dataset, [train_size, val_size, test_size])

# Create data loaders for each split
batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

## Naive projection layer

We initially try the simplest possible objective, a simple linear projection between the embedding spaces - mostly just to get a baseline for subsequent attempts.

First, let's get a baseline for performance: random input HuBERT embeddings:

In [91]:
num_batches = 10

for batch_idx, (inputs, targets) in enumerate(train_loader):
    if batch_idx >= num_batches:
        break

    # Calculate the mean and standard deviation of the actual embeddings
    mean = torch.mean(inputs)
    std = torch.std(inputs)

    # Generate random Gaussian embeddings with the same shape, mean, and standard deviation
    random_embeddings = torch.normal(mean=mean, std=std, size=inputs.shape)

    # Compute the MSE between the actual embeddings and the random Gaussian embeddings
    mse = torch.mean((inputs - random_embeddings) ** 2)
    print(f"Batch {batch_idx + 1}: MSE = {mse.item()}")


Batch 1: MSE = 0.17270851135253906
Batch 2: MSE = 0.17644421756267548
Batch 3: MSE = 0.17107713222503662
Batch 4: MSE = 0.1731051206588745
Batch 5: MSE = 0.17299486696720123
Batch 6: MSE = 0.17493848502635956
Batch 7: MSE = 0.17077873647212982
Batch 8: MSE = 0.1719347983598709
Batch 9: MSE = 0.17431329190731049
Batch 10: MSE = 0.16893906891345978


In [56]:
%pip install tensorboard

Collecting tensorboard
  Downloading tensorboard-2.13.0-py3-none-any.whl (5.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m49.9 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting markdown>=2.6.8
  Using cached Markdown-3.4.3-py3-none-any.whl (93 kB)
Collecting tensorboard-data-server<0.8.0,>=0.7.0
  Using cached tensorboard_data_server-0.7.0-py3-none-manylinux2014_x86_64.whl (6.6 MB)
Collecting grpcio>=1.48.2
  Downloading grpcio-1.54.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (5.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.1/5.1 MB[0m [31m54.2 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting google-auth-oauthlib<1.1,>=0.5
  Using cached google_auth_oauthlib-1.0.0-py2.py3-none-any.whl (18 kB)
Collecting protobuf>=3.19.6
  Downloading protobuf-4.23.0-cp37-abi3-manylinux2014_x86_64.whl (304 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m304.5/304.5 kB

In [99]:
device = torch.device("cuda" if torch.cuda.is_available else "cpu")
model = nn.Linear(768,1024)
model.to(device)

Linear(in_features=768, out_features=1024, bias=True)

In [100]:
import torch.optim as optim
from torch.utils.tensorboard import SummaryWriter

criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=5e-5)

log_dir = "runs/linear_projection"
writer = SummaryWriter(log_dir)

# Train one epoch, just for a baseline
num_epochs = 1
for epoch in range(num_epochs):
    for batch_idx, (x, labels) in enumerate(train_loader):
        x, labels = x.to(device), labels.to(device)
        optimizer.zero_grad()

        preds = model(x)
        loss = criterion(preds, labels)
        loss.backward()
        optimizer.step()
        global_step = epoch * len(train_loader) + batch_idx
        writer.add_scalar("Loss/train", loss.item(), global_step)
        if batch_idx % 100 == 0:
            model.eval()
            # Initialize variables to calculate the average test loss
            test_loss = 0
            num_batches = 0

            # No need to track gradients during evaluation
            with torch.no_grad():
                for batch_idx, (x, labels) in enumerate(test_dataloader):
                    # Move the data to the device
                    x, labels = x.to(device), labels.to(device)

                    # Make predictions
                    preds = model(x)

                    # Calculate the loss
                    loss = criterion(preds, labels)

                    # Accumulate the test loss
                    test_loss += loss.item()
                    num_batches += 1

            # Calculate the average test loss
            average_test_loss = test_loss / num_batches

            # Log the average test loss to TensorBoard
            writer.add_scalar("Loss/test", average_test_loss, global_step)

            model.train()




In [104]:
# Set the model to evaluation mode
model.eval()
model.to(device)

# Initialize variables to calculate the average test loss
test_loss = 0
num_batches = 0

# No need to track gradients during evaluation
with torch.no_grad():
    for batch_idx, (x, labels) in enumerate(test_loader):
        if batch_idx > 500:
            break
        # Move the data to the device
        x, labels = x.to(device), labels.to(device)

        # Make predictions
        preds = model(x)

        # Calculate the loss
        loss = criterion(preds, labels)

        # Accumulate the test loss
        test_loss += loss.item()
        num_batches += 1

# Calculate the average test loss
average_test_loss = test_loss / num_batches
print(average_test_loss)

# Log the average test loss to TensorBoard
writer.add_scalar("Loss/test", average_test_loss, global_step)

# Close the TensorBoard writer
writer.close()


0.02153354656776625


In [None]:
1 in [1,2]