In [1]:
# Hub dataset that gets distilled; the same string is reused at the end of the
# notebook to build the name of the dataset that is pushed back to the Hub.
DISTILL_DATSET_PATH = 'one-sec-cv12/chunk_160'

In [2]:
# Show which GPU (if any) is attached to this runtime.
!nvidia-smi

Thu Apr 20 06:22:29 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12    Driver Version: 525.85.12    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   54C    P8    10W /  70W |      0MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [3]:
!pip install -U -qq --progress-bar off datasets transformers python_speech_features==0.6

  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for python_speech_features (setup.py) ... [?25l[?25hdone


In [4]:
import inspect
import os

import datasets as hfds
import huggingface_hub
import numpy as np
import torch
from IPython import display
from python_speech_features import mfcc as mfcc_speech_features
from transformers import AutoFeatureExtractor, AutoModelForAudioClassification

In [5]:
# Turn off the progress bars of both huggingface_hub and datasets.
huggingface_hub.utils.disable_progress_bars()
hfds.disable_progress_bar()

In [6]:
# Load the dataset named by DISTILL_DATSET_PATH; the bare `ds` on the last
# line displays its splits, features and row counts.
ds = hfds.load_dataset(DISTILL_DATSET_PATH)
ds

Downloading and preparing dataset None/None to /root/.cache/huggingface/datasets/one-sec-cv12___parquet/one-sec-cv12--chunk_160-6b81d1b2196e22b8/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...
Dataset parquet downloaded and prepared to /root/.cache/huggingface/datasets/one-sec-cv12___parquet/one-sec-cv12--chunk_160-6b81d1b2196e22b8/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.


DatasetDict({
    train: Dataset({
        features: ['audio'],
        num_rows: 231668
    })
})

In [7]:
!rm -r /root/.cache/huggingface/datasets/downloads

In [8]:
# Switch the datasets progress bars back on (they were disabled earlier).
hfds.enable_progress_bar()

In [9]:
# Spot-check the data: print shape / sampling rate and render an audio player
# for five randomly drawn training clips.
train_split = ds['train']
random_rows = np.random.randint(len(train_split), size=5)
for clip in train_split[random_rows]['audio']:
  waveform, clip_rate = clip["array"], clip["sampling_rate"]
  print(f'Shape: {waveform.shape}, sampling rate: {clip_rate}')
  display.display(display.Audio(waveform, rate=clip_rate))

Shape: (16000,), sampling rate: 16000


Shape: (16000,), sampling rate: 16000


Shape: (16000,), sampling rate: 16000


Shape: (16000,), sampling rate: 16000


Shape: (16000,), sampling rate: 16000


In [10]:
# Fine-tuned teacher checkpoints used for distillation.
# Order matters: the models and the feature extractors are both built from
# this tuple and are paired up by position. The printed tuple is also
# checksum-guarded in the next cell, so adding, removing or reordering entries
# requires recording a new digest there.
DISTILL_MODEL_CHCKPNT = (
  'mazkooleg/0-9up-ast-ft',
  'mazkooleg/0-9up-wavlm-base-plus-ft',
  'mazkooleg/0-9up-hubert-base-ls960-ft',
  'mazkooleg/0-9up-unispeech-sat-base-ft',
  'mazkooleg/0-9up-data2vec-audio-base-960h-ft'
)

In [11]:
# Guard against accidental edits of the checkpoint list: the printed tuple is
# echoed through the shell into md5sum, and IPython captures the output lines
# in `md5`. The first 32 characters of the first line (the hex digest) must
# equal the recorded value.
md5 = !echo "{DISTILL_MODEL_CHCKPNT}" | md5sum
assert md5[0][:32] == 'cd2bb695b95773ada0f3aec2321458a6'

In [12]:
# Instantiate every teacher checkpoint and place it on the GPU, keeping the
# order of DISTILL_MODEL_CHCKPNT.
distill_model = tuple(
  AutoModelForAudioClassification.from_pretrained(checkpoint).to('cuda')
  for checkpoint in DISTILL_MODEL_CHCKPNT
)

In [13]:
# One feature extractor per teacher, index-aligned with DISTILL_MODEL_CHCKPNT.
# The last line displays the configuration of the first one.
feature_extractor = tuple(
  AutoFeatureExtractor.from_pretrained(checkpoint)
  for checkpoint in DISTILL_MODEL_CHCKPNT
)
feature_extractor[0]

ASTFeatureExtractor {
  "do_normalize": true,
  "feature_extractor_type": "ASTFeatureExtractor",
  "feature_size": 1,
  "max_length": 128,
  "mean": -6.845978,
  "num_mel_bins": 128,
  "padding_side": "right",
  "padding_value": 0.0,
  "return_attention_mask": false,
  "sampling_rate": 16000,
  "std": 5.5654526
}

In [14]:
# mfcc_from_audio(audio): compute MFCC features for a single clip.
#
# `audio` is a mapping with "array" (the samples) and "sampling_rate".
# The clip must hold exactly 16000 samples at a rate of 16000 with float32 or
# float64 samples; anything else trips an assertion.
# The samples are multiplied by 32768 and cast to int16, then passed to
# python_speech_features' mfcc with winlen=0.05, winstep=0.02 (seconds, per
# that library's documentation) and nfft=1024. The mfcc result is returned as is.
# NOTE(review): nothing clips the signal before the int16 cast, so a sample
# >= 1.0 would fall outside the int16 range -- confirm inputs stay below that.
#
# The documentation sits above the `def` on purpose: the following cell checks
# an md5 of inspect.getsource(mfcc_from_audio), so the function text itself
# has to stay unchanged.
def mfcc_from_audio(audio):
  rate = audio["sampling_rate"]
  sig  = audio["array"]
  assert len(sig) == 16000, len(sig)
  assert rate == 16000, rate
  assert sig.dtype in ('float64', 'float32'), sig.dtype
  sig = np.multiply(sig, 32768).astype('int16')
  assert sig.dtype == 'int16', sig.dtype
  return mfcc_speech_features(sig, rate, winlen=0.05, winstep=0.02, nfft=1024)

In [15]:
# Guard against silent edits to mfcc_from_audio: its source text (as returned
# by inspect.getsource, stripped) is echoed through the shell into md5sum and
# the first 32 characters of the output (the hex digest) must equal the
# recorded value. Editing the function, including comments inside it, changes
# the hashed text and therefore needs a new digest here.
md5 = !echo "{inspect.getsource(mfcc_from_audio).strip()}" | md5sum
assert md5[0][:32] == '5e3d4a7939954764016b78d56686d965'

In [16]:
# distill_function(examples): batch function applied through Dataset.map.
#
# `examples['audio']` is the list of audio entries of the batch (each with
# "array" and "sampling_rate", as read by mfcc_from_audio).
# Steps:
#   1. MFCCs are computed per clip and the first and last frame are dropped
#      ([1:-1]).
#   2. Every feature extractor is called on the raw sample arrays with its own
#      sampling_rate and max_length = one second worth of samples; the
#      resulting PyTorch tensors are moved to 'cuda'.
#      NOTE(review): no padding/truncation flags are passed, so how max_length
#      is used depends on each extractor -- confirm.
#   3. Without gradient tracking, each model is run on the inputs of the
#      extractor at the same position; the logits are stacked and averaged
#      over the model axis (dim=0), then moved to the CPU.
# Returns a dict with 'logits' (the averaged logits) and 'mfcc' (the list from
# step 1).
#
# The documentation sits above the `def` on purpose: the following cell checks
# an md5 of inspect.getsource(distill_function), so the function text itself
# has to stay unchanged.
def distill_function(examples):
  examples_audio = examples['audio']
  mfcc   = [mfcc_from_audio(x)[1:-1] for x in examples_audio]
  inputs = [f(
    raw_speech = [x['array'] for x in examples_audio],
    sampling_rate=f.sampling_rate, 
    max_length=int(f.sampling_rate * 1.0), # seconds
    return_tensors='pt'
  ).to('cuda') for f in feature_extractor]
  with torch.no_grad():
    logits = torch.stack([m(**i).logits for i, m in zip(inputs, distill_model)])
    return {'logits': torch.mean(logits, dim=0).to('cpu'), 'mfcc': mfcc}

In [17]:
# Guard against silent edits to distill_function: its source text (as returned
# by inspect.getsource, stripped) is echoed through the shell into md5sum and
# the first 32 characters of the output (the hex digest) must equal the
# recorded value. Editing the function, including comments inside it, changes
# the hashed text and therefore needs a new digest here.
md5 = !echo "{inspect.getsource(distill_function).strip()}" | md5sum
assert md5[0][:32] == '8f1a8b6a585883a566118b242f70b88d'

In [18]:
# Turn off the datasets caching mechanism before the large map call below.
hfds.disable_caching()

In [19]:
# Run the teacher ensemble over the dataset in batches of 256 clips; the raw
# audio column is dropped, leaving the columns returned by distill_function.
# NOTE: this rebinds `ds`, so the cell cannot be re-run without reloading the
# dataset first.
ds = ds.map(
  distill_function,
  batched=True,
  batch_size=256,
  remove_columns="audio",
)
ds

Map:   0%|          | 0/231668 [00:00<?, ? examples/s]



DatasetDict({
    train: Dataset({
        features: ['logits', 'mfcc'],
        num_rows: 231668
    })
})

In [20]:
# Authenticate against the Hugging Face Hub without pasting a token into the
# notebook: the token is read from the HF_TOKEN environment variable. When the
# variable is unset or empty, None is passed and login() falls back to its
# interactive prompt.
huggingface_hub.login(token=os.environ.get('HF_TOKEN') or None)

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid.
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [21]:
# Turn the datasets progress bars off again before the upload cell.
hfds.disable_progress_bar()

In [22]:
# Upload the distilled dataset under the source path prefixed with 'distilled-'.
ds.push_to_hub(f'distilled-{DISTILL_DATSET_PATH}')



In [23]:
# Completion marker: decodes the base64 text and prints "Done !!!".
!echo RG9uZSAhISEK | base64 -d

Done !!!
