In [1]:
# Zero-based index of the chunk this run processes: it decides how many samples are
# skipped (TAKE_CHUNK * CHUNK_SIZE) and names the pushed repository (chunk_{TAKE_CHUNK}).
TAKE_CHUNK = 16

In [2]:
# Number of source samples taken from the concatenated stream for one chunk
# (counted before the samples are cut into one-second slices).
CHUNK_SIZE = 50_000

In [3]:
assert CHUNK_SIZE == 5e4  # sanity check that the literal above really is fifty thousand

In [4]:
!pip install -U -qq --progress-bar off datasets # "torchaudio<0.12"

In [5]:
# Install the SoX command-line tool system-wide, discarding apt's standard output.
# NOTE(review): presumably needed by the audio decoding/resampling backend — confirm it is still required.
!apt-get -qq -y install sox > /dev/null

In [6]:
import os
from getpass import getpass

import datasets as hfds
import huggingface_hub
import numpy as np
from IPython import display
# Record the interpreter version in the notebook output for reproducibility.
!python -V

Python 3.9.16


In [7]:
# Fail fast unless the exact library releases this notebook was written against are installed.
assert (hfds.__version__, huggingface_hub.__version__) == ('2.11.0', '0.13.4')

In [8]:
# Authenticate against the Hugging Face Hub and store the token in the git credential helper.
# The token is never written into the notebook: it is read from the HF_TOKEN environment
# variable, and if that is unset or empty the user is prompted without echoing the input.
huggingface_hub.login(os.environ.get('HF_TOKEN') or getpass('Hugging Face token: '),
                      add_to_git_credential=True)

Token is valid.
Your token has been saved in your configured git credential helpers (store).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [9]:
# List the remote refs of the dataset repository (captured as one string per output line)
# and pin the notebook to a known state of it.
git_commits = !git ls-remote https://huggingface.co/datasets/mozilla-foundation/common_voice_12_0
# The first 40 characters of a line are the commit hash: it must be the expected one...
assert git_commits[0][:40] == 'f0c7a76f8a342bdce06994a0e59cdcce841f4ef8'
# ...the second ref must point at the same commit...
assert git_commits[1][:40] == git_commits[0][:40]
# ...and there must be no further refs (presumably just HEAD and the main branch — TODO confirm).
assert len(git_commits) == 2

In [10]:
# Fetch the dataset builder (via the 'en' configuration) and check the dataset version.
# NOTE(review): LANG only temporarily holds the builder; the next cell rebinds it to the
# tuple of configuration names, so this cell has to be re-run before that one can be re-run.
LANG = hfds.load_dataset_builder("mozilla-foundation/common_voice_12_0", 'en')
assert LANG.info.version == '12.0.0'

Downloading builder script:   0%|          | 0.00/8.25k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/14.5k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.57k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/63.2k [00:00<?, ?B/s]

In [11]:
# Replace the builder stored in LANG with the sorted, immutable tuple of all of its
# configuration names (one per language).
LANG = tuple(sorted(LANG.builder_configs))
# Display the names as rows of eight; reshape raises if the count is not a multiple of 8.
np.asarray(LANG).reshape((-1, 8)).tolist()

[['ab', 'ar', 'as', 'ast', 'az', 'ba', 'bas', 'be'],
 ['bg', 'bn', 'br', 'ca', 'ckb', 'cnh', 'cs', 'cv'],
 ['cy', 'da', 'de', 'dv', 'el', 'en', 'eo', 'es'],
 ['et', 'eu', 'fa', 'fi', 'fr', 'fy-NL', 'ga-IE', 'gl'],
 ['gn', 'ha', 'hi', 'hsb', 'hu', 'hy-AM', 'ia', 'id'],
 ['ig', 'it', 'ja', 'ka', 'kab', 'kk', 'kmr', 'ko'],
 ['ky', 'lg', 'lt', 'lv', 'mdf', 'mhr', 'mk', 'ml'],
 ['mn', 'mr', 'mrj', 'mt', 'myv', 'nan-tw', 'ne-NP', 'nl'],
 ['nn-NO', 'oc', 'or', 'pa-IN', 'pl', 'pt', 'quy', 'rm-sursilv'],
 ['rm-vallader', 'ro', 'ru', 'rw', 'sah', 'sat', 'sc', 'sk'],
 ['skr', 'sl', 'sr', 'sv-SE', 'sw', 'ta', 'th', 'ti'],
 ['tig', 'tok', 'tr', 'tt', 'tw', 'ug', 'uk', 'ur'],
 ['uz', 'vi', 'vot', 'yo', 'yue', 'zh-CN', 'zh-HK', 'zh-TW']]

In [12]:
# Fingerprint the language list. This relies on IPython expanding $LANG to the string form
# of the Python variable LANG (not the shell's locale variable of the same name) before the
# command runs, so the digest covers the exact set and order of configuration names.
md5 = !echo "$LANG" | md5sum
# The first 32 characters of md5sum's output are the hex digest; it is shown if the check fails.
assert '7503cd83930526be190dee5f0d6e897c' == md5[0][:32], md5[0][:32]
# Number of language configurations.
len(LANG)

104

In [13]:
# Open every language configuration in streaming mode, merge all splits of each language
# into one stream, then chain the per-language streams in LANG order.
dataset = hfds.concatenate_datasets([
    hfds.concatenate_datasets(list(
        hfds.load_dataset("mozilla-foundation/common_voice_12_0", lang_code,
                          use_auth_token=True, streaming=True).values()
    ))
    for lang_code in LANG
])
dataset

<datasets.iterable_dataset.IterableDataset at 0x7f441f3f7ee0>

In [14]:
# Pull the first sample from the stream as a sanity check; `example` is also reused
# further down for its column names.
example = next(iter(dataset))
audio = example["audio"]
# Report the waveform length and sampling rate, then render an inline audio player.
print(f'Shape: {audio["array"].shape}, sampling rate: {audio["sampling_rate"]}')
display.display(display.Audio(audio["array"], rate=audio["sampling_rate"]))

Reading metadata...: 21027it [00:00, 31865.71it/s]


Shape: (381888,), sampling rate: 48000


In [15]:
# Keep only this run's window of the stream: skip the samples belonging to all earlier
# chunks, then take the next CHUNK_SIZE samples.
# NOTE(review): rebinding `dataset` makes this cell non-idempotent — re-running it skips again.
dataset = dataset.skip(TAKE_CHUNK * CHUNK_SIZE).take(CHUNK_SIZE)
dataset

<datasets.iterable_dataset.IterableDataset at 0x7f44c84a35b0>

In [16]:
def _strip_edge_zeros(sample):
  """Trim leading and trailing zero samples from the waveform, in place, and return the sample."""
  sample['audio']['array'] = np.trim_zeros(sample['audio']['array'])
  return sample

# Remove zero padding at both ends of every waveform before measuring its length.
dataset = dataset.map(_strip_edge_zeros)
dataset

<datasets.iterable_dataset.IterableDataset at 0x7f441d3c5880>

In [17]:
# Keep only samples whose waveform holds at least `sampling_rate` values, i.e. at least
# one full second of audio, so that every remaining sample yields one or more slices.
dataset = dataset.filter(lambda s: len(s['audio']['array']) >= s['audio']['sampling_rate'])
dataset

<datasets.iterable_dataset.IterableDataset at 0x7f441d3c59a0>

In [18]:
def slice_audio(examples):
  """Cut every waveform of a batch into consecutive one-second slices.

  Args:
    examples: batch dict whose 'audio' entry is a list of dicts, each holding an
      'array' (1-D waveform) and a 'sampling_rate' (samples per second).

  Returns:
    {'audio_': [...]} with one {'array', 'sampling_rate'} dict per full one-second
    slice, in input order. A trailing remainder shorter than one second is dropped,
    and a waveform shorter than one second contributes no slices at all.
  """
  sliced = []
  for audio in examples['audio']:
    rate = audio["sampling_rate"]
    sig = np.asarray(audio["array"])
    n_slices = len(sig) // rate
    if n_slices == 0:
      # np.split rejects zero sections; a sub-second clip simply yields nothing.
      continue
    # Tag each slice with the rate of the clip it was cut from. Using one rate for the
    # whole batch would mislabel slices whenever clips in a batch differ in rate.
    sliced.extend({'array': piece, 'sampling_rate': rate}
                  for piece in np.split(sig[:n_slices * rate], n_slices))
  return {'audio_': sliced}

In [19]:
# Replace every sample by its one-second slices. The function is applied to batches of two
# samples, all original columns are dropped, and only the new 'audio_' column is kept.
dataset = dataset.map(
    slice_audio,
    batched=True,
    batch_size=2,
    remove_columns=list(example),
    features=hfds.Features({"audio_": hfds.Audio(decode=False)}),
)
dataset

<datasets.iterable_dataset.IterableDataset at 0x7f441d428f70>

In [20]:
# Give the sliced column its final name.
# NOTE(review): the map step presumably wrote to 'audio_' to avoid clashing with the original
# 'audio' column it removes — confirm.
dataset = dataset.rename_column('audio_', 'audio')
dataset

<datasets.iterable_dataset.IterableDataset at 0x7f441d428d30>

In [None]:
# Materialize the streaming pipeline into a regular Dataset by iterating it to exhaustion.
# NOTE(review): this is presumably where the actual streaming and slicing work happens,
# so expect this cell to be the long-running one.
ds = hfds.Dataset.from_generator(lambda: (i for i in dataset))
ds

In [22]:
# Declare the audio column with a 16 kHz sampling rate, so each one-second slice is
# read back as 16000 samples.
ds = ds.cast_column("audio", hfds.Audio(sampling_rate=16000))
ds

Dataset({
    features: ['audio'],
    num_rows: 242711
})

In [23]:
# Spot-check the first five slices: print shape and sampling rate, and render a player for each.
for audio in ds[:5]['audio']:
  print(f'Shape: {audio["array"].shape}, sampling rate: {audio["sampling_rate"]}')
  display.display(display.Audio(audio["array"], rate=audio["sampling_rate"]))

Shape: (16000,), sampling rate: 16000


Shape: (16000,), sampling rate: 16000


Shape: (16000,), sampling rate: 16000


Shape: (16000,), sampling rate: 16000


Shape: (16000,), sampling rate: 16000


In [24]:
# Silence the progress bars of both libraries so the upload below does not flood the output.
hfds.disable_progress_bar()
huggingface_hub.utils.disable_progress_bars()

In [25]:
# Upload the finished chunk to the Hub, in a repository named after the chunk index.
ds.push_to_hub(f'one-sec-cv12/chunk_{TAKE_CHUNK}')

In [26]:
# Completion marker: the base64 payload decodes to "Done !!!" followed by a newline.
!echo RG9uZSAhISEK | base64 -d

Done !!!
