In [1]:
from datasets import load_dataset
from dotenv import load_dotenv
import os

load_dotenv()

paths = [f"Emilia/EN/EN-B00{i:04d}.tar" for i in range(5)]
dataset = load_dataset(
    "amphion/Emilia-Dataset",
    data_files=paths,
    split="train",
    token=os.getenv("HUGGINGFACE_TOKEN"),
)
print(dataset)  # here should only shows 90 n_shards
dataset = dataset.with_format('pt')

  from .autonotebook import tqdm as notebook_tqdm
Generating train split: 122631 examples [00:19, 6377.58 examples/s]


Dataset({
    features: ['json', 'mp3', '__key__', '__url__'],
    num_rows: 122631
})


In [3]:
from datasets import load_from_disk

dataset_path = os.path.expanduser("~/local_datasets/emilia_chunks/shard_0_0")
dataset = load_from_disk(dataset_path)
dataset[0]

{'json': {'dnsmos': tensor(3.2927),
  'duration': tensor(6.2640),
  'id': 'EN_B00000_S00000_W000000',
  'language': 'en',
  'speaker': 'EN_B00000_S00000',
  'text': " You can help my mother and you- No. You didn't leave a bad situation back home to get caught up in another one here. What happened to you, Los Angeles?",
  'wav': 'EN_B00000/EN_B00000_S00000/mp3/EN_B00000_S00000_W000000.mp3'},
 '__key__': 'EN_B00000_S00000_W000000',
 '__url__': '/home/ritsuko/.cache/huggingface/hub/datasets--amphion--Emilia-Dataset/snapshots/d7f2f7340a6385696f3766c8049fa920a4707c07/Emilia/EN/EN-B000000.tar',
 'codes': tensor([[1692, 1671,  195,  979,   20,  376,  184,  925,  629,  629, 1407, 1943,
           514,  691, 1148,  115, 1199,  860,  497,  350,  350, 1028,  868, 1345,
           771, 1438,  473,  450,   86,  838,  101, 1185,  523, 1076, 1178,  501,
           594, 1517,  838, 1445,  987, 1262, 1925,  324,  979,  324, 1023,  283,
           521,  117,  117, 1943, 1943, 2023, 2023, 1169,  777, 167

In [7]:
import os
import shutil

# Define the path and expand ~
dataset_dir = os.path.expanduser("~/.cache/huggingface/datasets/amphion___emilia-dataset")

# Say goodbye
try:
    shutil.rmtree(dataset_dir, ignore_errors=True)  # IGNORE ERRORS: NO MERCY
    print(f"üí• Nuked: {dataset_dir}")
except Exception as e:  # Just in case something dares to resist
    print(f"üî• Failed to nuke {dataset_dir}: {e}")

üí• Nuked: /home/ritsuko/.cache/huggingface/datasets/amphion___emilia-dataset


In [2]:
import time
from transformers import Wav2Vec2FeatureExtractor, WavLMForXVector
from data_pipeline.utils.codec import MimiCodec

codec = MimiCodec()

# please do not run this on a mac, i'm warning you
device = "cuda"

feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained('microsoft/wavlm-base-plus-sv')
model = WavLMForXVector.from_pretrained('microsoft/wavlm-base-plus-sv')
model = model.to(device)

  self.register_buffer("padding_total", torch.tensor(kernel_size - stride, dtype=torch.int64), persistent=False)


In [None]:
# Start timing dataset fetching
t4 = time.perf_counter()
sample = dataset[:5]
t5 = time.perf_counter()
print(f"Dataset fetch time: {t5 - t4:.3f}s")
sample['mp3']

Dataset fetch time: 0.008s


[{'path': 'EN_B00000_S00000_W000000.mp3',
  'array': tensor([-0.0016, -0.0018, -0.0010,  ..., -0.0019, -0.0022, -0.0008]),
  'sampling_rate': tensor(24000)},
 {'path': 'EN_B00000_S00000_W000001.mp3',
  'array': tensor([ 1.2845e-03,  1.6861e-05, -1.7638e-03,  ..., -9.3855e-04,
          -1.1575e-03, -1.4367e-03]),
  'sampling_rate': tensor(24000)},
 {'path': 'EN_B00000_S00010_W000000.mp3',
  'array': tensor([-2.6341e-05, -3.6802e-05,  7.4276e-05,  ..., -5.1645e-03,
          -6.1383e-03, -5.3097e-03]),
  'sampling_rate': tensor(24000)},
 {'path': 'EN_B00000_S00020_W000000.mp3',
  'array': tensor([-9.9165e-04, -4.9922e-04, -2.2587e-04,  ..., -2.9825e-01,
          -3.4377e-01, -3.2791e-01]),
  'sampling_rate': tensor(24000)},
 {'path': 'EN_B00000_S00030_W000000.mp3',
  'array': tensor([-0.0001, -0.0001, -0.0001,  ..., -0.0011, -0.0012, -0.0011]),
  'sampling_rate': tensor(24000)}]

In [None]:

audio = [s['array'] for s in sample["mp3"]]

# TODO resampling, fix it immediately you f***ing fool

inputs = feature_extractor(audio, padding="max_length", return_tensors="pt", device=device)

# Start timing model inference
t8 = time.perf_counter()
inputs = {k: v.to(device) for k, v in inputs.items()}
embeddings = model(**inputs).embeddings
t9 = time.perf_counter()
print(f"Model inference time: {t9 - t8:.3f}s")

It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (5,) + inhomogeneous part.

In [58]:
sample = next(iter(dataset))
t1 = time.perf_counter()
inputs = codec.encode(sample["mp3"]["array"].unsqueeze(0))
t2 = time.perf_counter()
print(f"Model inference time: {t2 - t1:.3f}s")
inputs.shape

Model inference time: 0.016s


torch.Size([8, 79])

In [None]:
from typing import Dict
from torchaudio.transforms import Resample

downsample_16k = Resample(orig_freq=24_000)
# intentionally na√Øve, adding 
def encode_row(row: Dict):
    audio = row["mp3"]["array"]
    downsampled = downsample_16k(audio)
    inputs = feature_extractor(downsampled, padding=True, return_tensors="pt", sampling_rate=24_000)
    inputs = {k: v.to(device) for k, v in inputs.items()}
    embeddings = model(**inputs).embeddings
    embeddings = embeddings.cpu()

    encoded = codec.encode(audio.unsqueeze(0))
    return({
        "codes": encoded,
        "speaker_emb": embeddings
    })

In [62]:
test_ds = dataset.take(1_000)
test_ds.map(encode_row)

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.
It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.
It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.
It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.
Map:   0%|          | 4/1000 [00:00<00:32, 30.30 examples/s]It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.
It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do 

OutOfMemoryError: CUDA out of memory. Tried to allocate 42.00 MiB. GPU 0 has a total capacity of 23.49 GiB of which 40.88 MiB is free. Including non-PyTorch memory, this process has 23.41 GiB memory in use. Of the allocated memory 22.90 GiB is allocated by PyTorch, and 65.81 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)