In [1]:
#%pip install fairseq npy_append_array h5py

In [2]:
import logging
import math
import os
import subprocess
import sys

import fairseq
import h5py
import numpy as np
import soundfile as sf
import torch
import torch.nn.functional as F
import tqdm
from npy_append_array import NpyAppendArray
from torch.hub import download_url_to_file

2023-04-27 20:36:24 | INFO | fairseq.tasks.text_to_speech | Please install tensorboardX: pip install tensorboardX


# Derive Bark semantic token codebook

This notebook embeds a synthetic dataset of WAV-file ↔ semantic-token mappings and re-derives the "means" (centroids) for each token, so that "ground truth" audio can later be converted into semantic prompts by routine k-means (nearest-centroid) inference. This allows for voice cloning, and so for giving voice to the dead, destroying consensus reality and creating misinformation, destroying copyright, spreading systemic bias, and other similarly enjoyable ways to spend a Saturday afternoon.

## Dump HuBERT features

In this step, we use Fairseq's [Sharded HuBERT feature extraction](https://github.com/facebookresearch/fairseq/tree/main/examples/hubert/simple_kmeans) to obtain an audio embedding for each semantic-token step (HuBERT features are produced at 50 Hz by default, which should match the rate of the semantic tokens — coincidence???).

In [3]:
# Local path of the HuBERT base checkpoint; downloaded once and reused on later runs.
hubert_ckpt_path = "../models/hubert_base_ls960.pt"

if not os.path.exists(hubert_ckpt_path):
    # Make sure the target directory exists before downloading into it
    # (a fresh checkout may not have ../models yet).
    os.makedirs(os.path.dirname(hubert_ckpt_path), exist_ok=True)
    # Yes, hard-coding the URL of the model is jank. Too bad!
    # Update this if this changes! https://github.com/facebookresearch/textlesslib/blob/698e6a039375bac0cd5f1b8683beeec5e8f702c0/textless/checkpoint_manager/__init__.py#L20
    download_url_to_file("https://dl.fbaipublicfiles.com/hubert/hubert_base_ls960.pt", hubert_ckpt_path)

In [4]:
# Locate fairseq's `examples/hubert/simple_kmeans` directory relative to the
# installed fairseq package itself, instead of assuming a
# `<git root>/venv/lib/python3.10/site-packages` layout. This keeps the
# notebook working outside a git checkout and with any venv / Python version.
feature_utils_path = os.path.join(
    os.path.dirname(os.path.abspath(fairseq.__file__)),
    "examples",
    "hubert",
    "simple_kmeans",
)

# Add the path to sys.path (only once, so re-running the cell does not pile up duplicates)
if feature_utils_path not in sys.path:
    sys.path.append(feature_utils_path)

In [5]:
# All datasets live side by side under ../datasets:
#   en/           input dataset directory
#   en_features/  dumped feature files
datasets_root = os.path.join("..", "datasets")
dataset_dir = os.path.join(datasets_root, "en")
feature_dir = os.path.join(datasets_root, "en_features")

In [7]:
# Imported here rather than in the top import cell because the cell above
# first has to put the simple_kmeans directory on sys.path.
from fairseq.examples.hubert.simple_kmeans import feature_utils, dump_hubert_feature

# Number of fractions the manifest is split into; each one is dumped to its own files.
NSHARDS = 8

hubert_checkpoint = os.path.join("..", "models", "hubert_base_ls960.pt")

# Dump the shards one after another (nshard: number of fractions, rank: selected fraction).
for rank in range(NSHARDS):
    dump_hubert_feature.main(
        tsv_dir=dataset_dir,
        split="manifest",
        ckpt_path=hubert_checkpoint,
        layer=6,
        nshard=NSHARDS,
        rank=rank,
        feat_dir=feature_dir,
        max_chunk=1_600_000,
    )

2023-04-27 20:37:45 | INFO | fairseq.tasks.hubert_pretraining | current directory is /home/ritsuko/projects/ai/audio/bark/notebooks
2023-04-27 20:37:45 | INFO | fairseq.tasks.hubert_pretraining | HubertPretrainingTask Config {'_name': 'hubert_pretraining', 'data': '/checkpoint/wnhsu/data/librispeech/960h/iter/250K_50hz_km100_mp0_65_v2', 'fine_tuning': False, 'labels': ['layer6.km500'], 'label_dir': None, 'label_rate': 50.0, 'sample_rate': 16000, 'normalize': False, 'enable_padding': False, 'max_keep_size': None, 'max_sample_size': 250000, 'min_sample_size': 32000, 'single_target': False, 'random_crop': True, 'pad_audio': False}
2023-04-27 20:37:45 | INFO | fairseq.models.hubert.hubert | HubertModel Config: {'_name': 'hubert', 'label_rate': 50.0, 'extractor_mode': default, 'encoder_layers': 12, 'encoder_embed_dim': 768, 'encoder_ffn_embed_dim': 3072, 'encoder_attention_heads': 12, 'activation_fn': gelu, 'layer_type': transformer, 'dropout': 0.1, 'attention_dropout': 0.1, 'activation_dro

## Averaging embeddings by semantic token

In [8]:
# Yes, this is inefficient, but for n ~= 10^4 generations, it's probably not worth doing something fancier
# Revisit this if it becomes a bottleneck!
label_path = os.path.join(dataset_dir, "labels.txt")

with open(label_path, 'r') as file:
    # One track per line: whitespace-separated integer semantic tokens
    data = [[int(x) for x in line.split()] for line in file]

# Tracks can have different numbers of tokens, so the rows are ragged.
# `np.array(data)` on ragged rows only worked with a deprecation warning on
# older NumPy and raises ValueError on NumPy >= 1.24, so build an explicit
# 1-D object array that holds one Python list of tokens per track.
token_array = np.empty(len(data), dtype=object)
for row_idx, row_tokens in enumerate(data):
    token_array[row_idx] = row_tokens

  token_array = np.array(data)


In [11]:
# Sanity check: peek at one entry of the first shard's .len file
# (one integer per track, as consumed by save_shard_to_hdf5 below).
first_shard_len_path = "../datasets/en_features/manifest_0_8.len"
test_lengs = np.loadtxt(first_shard_len_path, dtype=int)
test_lengs[2]

437

In [21]:
def save_shard_to_hdf5(
    f,
    feat_dir,
    split,
    rank, 
    shards,
    track_idx_start,
    tokens_by_track
):
    """Write every (token, embedding) pair of one feature shard into `f`, grouped by token.

    Reads `{feat_dir}/{split}_{rank}_{shards}.npy` (the shard's embeddings,
    all tracks concatenated) and the matching `.len` file (one embedding count
    per track). Each track's embeddings are paired position-by-position with
    its semantic tokens; every embedding is stored as dataset
    `"{track_idx}_{position}"` inside the group named `str(token)`.

    Args:
        f: Open, writable h5py file/group (anything supporting `in`,
            `create_group` and item access that returns groups with
            `create_dataset`).
        feat_dir: Directory containing the shard's `.npy` / `.len` files.
        split: Split name used as the file-name prefix (e.g. "manifest").
        rank: Index of the shard to process.
        shards: Total number of shards the split was dumped with.
        track_idx_start: Index into `tokens_by_track` of the shard's first track.
        tokens_by_track: Indexable collection; `tokens_by_track[k]` is the
            sequence of semantic tokens of track `k`.

    Returns:
        The track index following the last processed track, i.e.
        `track_idx_start + number of tracks on the shard`.

    Raises:
        AssertionError: If a track has more than `leng + 1` semantic tokens
            for `leng` embeddings.
    """
    logger = logging.getLogger("partition_by_token")

    # Use the `shards` argument (not a notebook-level global) so the function
    # reads exactly the files the caller asked for.
    feat_path = f"{feat_dir}/{split}_{rank}_{shards}.npy"
    leng_path = f"{feat_dir}/{split}_{rank}_{shards}.len"

    # List of lengths for each track on the shard. `ndmin=1` keeps this a 1-D
    # array even when the shard holds a single track.
    lengs = np.loadtxt(leng_path, dtype=int, ndmin=1)
    # Feature start indices for each track on the shard
    offsets = np.hstack(([0], np.cumsum(lengs[:-1])))

    # Memory-map the features so only the rows actually sliced are read
    features = np.load(feat_path, mmap_mode="r")
    logger.info(f"Processing {len(lengs)} tracks, {len(features)} token-embed maps")

    track_idx = track_idx_start
    # For each track on the shard:
    for shard_pos, leng in enumerate(lengs):
        track_tokens = tokens_by_track[track_idx]
        logger.debug(f"Track {track_idx}: {len(track_tokens)} semantic, {leng} embeddings")
        # Hertz for semantic tokens and features should be same, modulo the padding token
        if len(track_tokens) > leng + 1:
            logger.error(f"Track {track_idx} mismatch: {len(track_tokens)} semantic but {leng} embeddings")
        assert len(track_tokens) <= leng + 1, (
            f"Track {track_idx} mismatch: {len(track_tokens)} semantic but {leng} embeddings"
        )

        start = offsets[shard_pos]
        track_embeddings = features[start:start + leng, :]
        logger.debug(f"{track_embeddings.shape} retrieved")
        # zip() stops at the shorter sequence, so surplus tokens or
        # embeddings at the end of a track are ignored.
        for step, (token, emb) in enumerate(zip(track_tokens, track_embeddings)):
            # Add to hdf5 file
            group_name = str(token)
            if group_name not in f:
                f.create_group(group_name)
            token_group = f[group_name]
            emb_id = f"{track_idx}_{step}"
            token_group.create_dataset(emb_id, data=emb)

        track_idx += 1

    # If started at track 0 and processed 8 tracks, return 8
    return track_idx

In [23]:
def count_datasets(group):
    """Return how many direct children of `group` are `h5py.Dataset` objects."""
    n_datasets = 0
    for child in group.values():
        if isinstance(child, h5py.Dataset):
            n_datasets += 1
    return n_datasets

def partition_by_token(feat_dir, split, shards, tokens_by_track, rank_start=0):
    """Partition the embeddings of shards `rank_start .. shards - 1` by semantic token.

    For every processed shard an HDF5 file
    `{feat_dir}/embeds_by_token_{shard_idx}_{shards}.h5` is opened in append
    mode and filled by `save_shard_to_hdf5`.

    Args:
        feat_dir: Directory holding the `{split}_{rank}_{shards}.npy` / `.len`
            files; the HDF5 outputs are written here too.
        split: Split name used as the feature file prefix (e.g. "manifest").
        shards: Total number of shards the split was dumped with.
        tokens_by_track: Indexable collection; `tokens_by_track[k]` is the
            token sequence of track `k`, numbered across all shards.
        rank_start: First shard to process; lets an interrupted run resume
            without redoing earlier shards.
    """
    logger = logging.getLogger("partition_by_token")

    # Index of the first track on shard `rank_start`: the number of tracks on
    # all earlier shards, taken from their .len files (one line per track)
    # instead of a hard-coded sum.
    track_idx = 0
    for earlier_rank in range(rank_start):
        leng_path = f"{feat_dir}/{split}_{earlier_rank}_{shards}.len"
        track_idx += np.loadtxt(leng_path, dtype=int, ndmin=1).size

    for shard_idx in range(rank_start, shards):
        hdf5_file_path = os.path.join(feat_dir, f"embeds_by_token_{shard_idx}_{shards}.h5")
        with h5py.File(hdf5_file_path, 'a') as hdf5_file:
            # The helper returns the index of the first track of the next shard
            track_idx = save_shard_to_hdf5(
                f=hdf5_file,
                feat_dir=feat_dir,
                split=split,
                rank=shard_idx,
                shards=shards,
                track_idx_start=track_idx,
                tokens_by_track=tokens_by_track
            )
            logger.info(f"Shard {shard_idx} processed")
        

In [25]:
# Send log records to stdout so they show up in the cell output.
# (To take the level from the environment instead, use
#  level=os.environ.get("LOGLEVEL", "INFO").upper().)
logging.basicConfig(
    stream=sys.stdout,
    level=logging.DEBUG,
    datefmt="%Y-%m-%d %H:%M:%S",
    format="%(asctime)s | %(levelname)s | %(name)s | %(message)s",
)

# Start at shard 3 rather than shard 0.
partition_by_token(
    feat_dir=feature_dir,
    split="manifest",
    shards=NSHARDS,
    tokens_by_track=token_array,
    rank_start=3,
)

2023-04-27 22:29:42 | INFO | partition_by_token | Processing 1628 tracks, 306554 token-embed maps
2023-04-27 22:29:42 | ERROR | partition_by_token | Track 3255 mismatch: 256 semantic but 191 embeddings


AssertionError: 

In [110]:
# NOTE(review): partition_by_token writes one `embeds_by_token_{shard}_{shards}.h5`
# file per shard, while this cell reads a single `embeds_by_token.h5` —
# presumably a merged file produced elsewhere; confirm it exists.
input_filename = '../datasets/en_features/embeds_by_token.h5'
output_filename = '../datasets/en_features/mean_embeds_by_token.h5'

with h5py.File(input_filename, 'r') as input_file:
    with h5py.File(output_filename, 'w') as output_file:
        for token in input_file:
            token_group = input_file[token]

            # Load every embedding stored under this token and average them
            token_embeddings = [token_group[emb_id][...] for emb_id in token_group]
            mean_embedding = np.mean(token_embeddings, axis=0)

            # Mirror the token's group name in the output file and store the
            # mean as its only dataset
            output_token_group = output_file.create_group(token)
            output_token_group.create_dataset('mean_embedding', data=mean_embedding)


## SKlearn (misery time!)

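We use a nearest-neighbor lookup (k = 1) over the per-token mean embeddings: each HuBERT feature is assigned to the token whose mean embedding is closest.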

In [111]:
from sklearn.neighbors import NearestNeighbors

with h5py.File(output_filename, 'r') as input_file:
    # HDF5 group names are strings and are iterated in string order
    # ("0", "1", "10", "100", ...), so sort them numerically and remember
    # which token each centroid row belongs to.
    centroid_tokens = np.array(sorted(int(token) for token in input_file))
    centroids = np.stack(
        [input_file[str(token)]['mean_embedding'][...] for token in centroid_tokens],
        axis=0,
    )

# Fit the NearestNeighbors model with the centroids.
# Row k of `centroids` is the mean embedding of token `centroid_tokens[k]`;
# map indices returned by `nn.kneighbors(...)` through `centroid_tokens` to
# get token ids (tokens with no embeddings have no row).
nn = NearestNeighbors(n_neighbors=1, algorithm='auto')
nn.fit(centroids)