In [2]:
from datasets import load_dataset
from dotenv import load_dotenv
import os
from data_pipeline.utils.codec import MimiCodec
from IPython.display import Audio
import numpy as np

# Instantiate the project's codec wrapper once; later cells call
# codec.encode / codec.encode_batch / codec.decode on it.
codec = MimiCodec()

# Load variables from a .env file so later os.getenv("HUGGINGFACE_TOKEN") calls can see them.
load_dotenv()


ModuleNotFoundError: No module named 'data_pipeline'

## Checking encoded audio

In [None]:
from datasets import load_from_disk, DatasetDict

# Each immediate sub-directory of the chunk root is treated as one shard that
# can be opened with `load_from_disk`.
root_dir = os.path.expanduser("~/local_datasets/emilia_chunks")
shard_dirs = sorted(
    candidate
    for candidate in (os.path.join(root_dir, entry) for entry in os.listdir(root_dir))
    if os.path.isdir(candidate)
)

# Key every loaded shard by its directory name so it becomes a named split.
shard_datasets = {
    os.path.basename(shard_dir): load_from_disk(shard_dir)
    for shard_dir in shard_dirs
}

# Bundle the shards into one DatasetDict (one split per shard directory).
ds = DatasetDict(shard_datasets)


In [None]:
from datasets import concatenate_datasets
# Merge every shard split into a single dataset, in DatasetDict order.
collapsed_dataset = concatenate_datasets(list(ds.values()))
# Hub credential is read from the environment rather than hard-coded.
token = os.getenv("HUGGINGFACE_TOKEN")
# Row count of the merged dataset is the cell output.
len(collapsed_dataset)

In [None]:
def extract_json(row):
    """Lift selected metadata fields out of a row's nested ``json`` mapping.

    Args:
        row: Mapping with a ``"json"`` entry that is itself a mapping
            containing ``dnsmos``, ``duration``, ``id``, ``speaker`` and
            ``text``. Any other keys it holds (such as ``wav``) are ignored.

    Returns:
        A new dict holding exactly those five keys, in that order.

    Raises:
        KeyError: If ``"json"`` or one of the five fields is missing.
    """
    # Called `meta`, not `json`, so the name cannot be confused with (or
    # shadow) the standard-library `json` module.
    meta = row["json"]
    # The result is an explicit whitelist, so unwanted keys are dropped simply
    # by not copying them. The previous `del json["wav"]` mutated the caller's
    # row and raised KeyError whenever "wav" was absent.
    wanted = ("dnsmos", "duration", "id", "speaker", "text")
    return {key: meta[key] for key in wanted}

# Replace the nested "json" column with the flat metadata columns produced by
# extract_json, using 12 worker processes. Note this rebinds `ds`.
ds = collapsed_dataset.map(
    extract_json,
    remove_columns="json",
    num_proc=12,
)

In [None]:
# `.sum().item()` below needs this column to come back as a tensor-like object
# (assumes the dataset is in torch format — TODO confirm).
durations = ds["duration"]

# Dividing the summed durations by 3600 reports them in hours.
total_hours = durations.sum().item()/3600
# `.2f` gives two decimal places. The previous `02f` spec only set a minimum
# width of 2 and left the default six-digit precision in place.
print(f"First 300 shards: {total_hours:.2f} hours")


In [None]:
import torch

# Quintile boundaries (20/40/60/80/100 %) of the DNSMOS column.
torch.quantile(
    input=ds["dnsmos"],
    q=torch.tensor([0.2, 0.4, 0.6, 0.8, 1.0]),
)

In [None]:
# Upload the flattened dataset to the Hub in shards of at most 2GB each.
ds.push_to_hub(
    "jkeisling/emilia_en_mimi",
    max_shard_size="2GB",
    token=token,
)

In [None]:

# Decode one stored code sequence back to audio and play it at 24 kHz.
pcm = codec.decode(
    collapsed_dataset[1_000_001]["codes"]
)
Audio(data=np.array(pcm), rate=24_000)

## Inspecting original dataset

In [None]:
# Request only two tar files from the hub repository (indices 200 and 201).
paths = [
    f"Emilia/EN/EN-B00{shard_idx:04d}.tar"
    for shard_idx in range(200, 202)
]
dataset = load_dataset(
    "amphion/Emilia-Dataset",
    split="train",
    data_files=paths,
    token=os.getenv("HUGGINGFACE_TOKEN"),
)
# Sanity check: the summary should reflect only the requested files.
print(dataset)
# Switch to the 'pt' output format; later cells call tensor methods such as
# `.unsqueeze` on the audio arrays.
dataset = dataset.with_format('pt')

In [None]:
# Show the "mp3" field of row 21,000 as the cell output.
dataset[21_000]["mp3"]

In [None]:
# Play the source waveform of row 28,000 at 24 kHz (no codec round trip).
pcm = dataset[28_000]["mp3"]["array"]
Audio(data=np.array(pcm), rate=24_000)

In [None]:
# Rows 160..179 as one batch. Bound to `audio_batch` rather than `slice` so the
# builtin `slice` is not shadowed for the rest of the kernel session.
audio_batch = dataset[160:180]

# Encode every clip's waveform in a single batched codec call; `codes` is
# indexed by position in the next cell.
codes = codec.encode_batch([clip["array"] for clip in audio_batch["mp3"]])


In [None]:
# Decode one entry of the batch-encoded codes and play it at 24 kHz.
pcm = codec.decode(
    codes[18]
)
Audio(data=np.array(pcm), rate=24_000)

In [None]:
import time
from transformers import Wav2Vec2FeatureExtractor, WavLMForXVector
# The model is placed on CUDA unconditionally, so this cell needs a CUDA device
# (original author's warning: please do not run this on a Mac).
device = "cuda"

# Feature extractor and x-vector model are loaded from the same checkpoint.
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained('microsoft/wavlm-base-plus-sv')
model = WavLMForXVector.from_pretrained('microsoft/wavlm-base-plus-sv').to(device)

In [None]:
# Start timing dataset fetching
# (t4/t5 bracket only the slice read of the first five rows).
t4 = time.perf_counter()
sample = dataset[:5]
t5 = time.perf_counter()
print(f"Dataset fetch time: {t5 - t4:.3f}s")
# Show the fetched "mp3" column as the cell output.
sample['mp3']

In [None]:

# Local import so this cell runs on its own; torchaudio is otherwise only
# imported further down the notebook.
from torchaudio.transforms import Resample

# The clips are handled as 24 kHz audio throughout this notebook, while the
# feature extractor is given 16 kHz input here, so resample first. This
# resolves the resampling TODO that used to sit in this cell.
to_16k = Resample(orig_freq=24_000, new_freq=16_000)

# Cast to float32 before resampling and hand the extractor plain NumPy arrays,
# one per clip.
audio = [
    to_16k(torch.as_tensor(s['array'], dtype=torch.float32)).numpy()
    for s in sample["mp3"]
]

# padding=True pads to the longest clip in the batch, matching encode_row;
# "max_length" was previously requested without any max_length. The stray
# `device=` keyword is gone as well: tensors are moved to the device explicitly
# below.
inputs = feature_extractor(audio, padding=True, return_tensors="pt", sampling_rate=16_000)

# Start timing model inference (the host-to-device copy is included, as before).
t8 = time.perf_counter()
inputs = {k: v.to(device) for k, v in inputs.items()}
# Inference only: skip autograd bookkeeping so no graph is kept alive.
with torch.no_grad():
    embeddings = model(**inputs).embeddings
t9 = time.perf_counter()
print(f"Model inference time: {t9 - t8:.3f}s")

In [None]:
import os
import shutil

# Define the path and expand ~
dataset_dir = os.path.expanduser("~/.cache/huggingface/datasets/amphion___emilia-dataset")

# Delete the cached download and report what actually happened.
# `ignore_errors=True` is deliberately not used: it made rmtree swallow every
# failure, so the success message printed unconditionally and the except
# branch could never run.
if not os.path.isdir(dataset_dir):
    print(f"Nothing to nuke, not a directory: {dataset_dir}")
else:
    try:
        shutil.rmtree(dataset_dir)
        print(f"💥 Nuked: {dataset_dir}")
    except OSError as e:  # e.g. permission denied or files still in use
        print(f"🔥 Failed to nuke {dataset_dir}: {e}")

In [None]:
# Rebinds `sample` to the first row yielded by iterating the dataset
# (an earlier cell bound it to a five-row batch).
sample = next(iter(dataset))
# t1/t2 bracket a single codec.encode call on one clip, with a leading axis
# added by unsqueeze(0).
t1 = time.perf_counter()
inputs = codec.encode(sample["mp3"]["array"].unsqueeze(0))
t2 = time.perf_counter()
# NOTE(review): the label says "Model inference" but the timed call is codec.encode.
print(f"Model inference time: {t2 - t1:.3f}s")
# Show the shape of the encoded result as the cell output.
inputs.shape

In [None]:
from typing import Dict
from torchaudio.transforms import Resample

# Resampler from the 24 kHz source audio to the 16 kHz rate declared to the
# feature extractor below (new_freq is spelled out instead of left implicit).
downsample_16k = Resample(orig_freq=24_000, new_freq=16_000)


def encode_row(row: Dict):
    """Encode one dataset row into codec codes plus a speaker embedding.

    Intentionally naive: rows are handled one at a time, with no batching.

    Args:
        row: A dataset row whose ``row["mp3"]["array"]`` is the clip's waveform
            as a tensor (assumes 1-D, 24 kHz samples — TODO confirm).

    Returns:
        A dict with ``"codes"`` (the result of ``codec.encode`` on the
        original-rate audio with a leading axis added) and ``"speaker_emb"``
        (the model's ``embeddings`` output, moved to CPU).
    """
    audio = row["mp3"]["array"]
    downsampled = downsample_16k(audio)
    # Declare the rate the audio actually has after resampling; the previous
    # value of 24_000 described the audio before it was downsampled.
    inputs = feature_extractor(downsampled, padding=True, return_tensors="pt", sampling_rate=16_000)
    inputs = {k: v.to(device) for k, v in inputs.items()}
    # Inference only: no autograd graph should be attached to the embeddings.
    with torch.no_grad():
        embeddings = model(**inputs).embeddings
    embeddings = embeddings.cpu()

    # The codec receives the original (not downsampled) waveform.
    encoded = codec.encode(audio.unsqueeze(0))
    return {
        "codes": encoded,
        "speaker_emb": embeddings,
    }

In [None]:
# Smoke-test encode_row on the first 1,000 rows. The mapped dataset is shown as
# the cell output and is not assigned to a name.
test_ds = dataset.take(1_000)
test_ds.map(encode_row)

## Inspecting tokenized data

In [None]:
from datasets import load_from_disk

# Open the tokenized train split and keep only shard 0 of 16 for a quick look.
# Note this rebinds `ds`.
ds = (
    load_from_disk("../../datasets/byte-tokenized-emilia-v1")["train"]
    .shard(num_shards=16, index=0)
)
# Preview the first five rows.
ds[:5]

In [None]:
from transformers import AutoTokenizer

# Load the tokenizer from the local init directory and decode the first row of
# one example's "ground_truth" ids back to text.
tokenizer = AutoTokenizer.from_pretrained(
    "../../inits/smoltts_byte_kokoro_layer"
)
tokenizer.decode(token_ids=ds[20]["ground_truth"][0, :])