In [1]:
# Report the GPU model, driver/CUDA versions and current memory use for this session.
!nvidia-smi

Mon Apr 10 08:32:15 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.161.03   Driver Version: 470.161.03   CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   33C    P0    27W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
!pip install -U -qq --progress-bar off datasets transformers python_speech_features==0.6

In [3]:
import numpy as np
import datasets as hfds
import huggingface_hub
import torch
import inspect
from IPython import display
from python_speech_features import mfcc as mfcc_speech_features
from transformers import AutoFeatureExtractor, AutoModelForAudioClassification
from kaggle_secrets import UserSecretsClient

In [4]:
# Load the raw audio splits from the dataset directory saved on disk.
RAW_DATASET_DIR = "/kaggle/input/0-9up-google-speech-commands-augmented-raw"
ds = hfds.load_from_disk(RAW_DATASET_DIR)
# Disabled alternative source:
# ds = hfds.load_dataset("mazkooleg/google_speech_commands_augmented_raw")
ds

DatasetDict({
    train: Dataset({
        features: ['audio', 'label'],
        num_rows: 1095480
    })
    validation: Dataset({
        features: ['audio', 'label'],
        num_rows: 3368
    })
    test: Dataset({
        features: ['audio', 'label'],
        num_rows: 3773
    })
})

In [5]:
# Spot-check five randomly drawn training clips: print the waveform shape and
# sampling rate, then render an inline audio player for each one.
random_rows = np.random.randint(len(ds["train"]), size=5)
random_clips = ds['train'][random_rows]['audio']
for audio in random_clips:
  print(f'Shape: {audio["array"].shape}, sampling rate: {audio["sampling_rate"]}')
  player = display.Audio(audio["array"], rate=audio["sampling_rate"])
  display.display(player)

Shape: (16000,), sampling rate: 16000


Shape: (16000,), sampling rate: 16000


Shape: (16000,), sampling rate: 16000


Shape: (16000,), sampling rate: 16000


Shape: (16000,), sampling rate: 16000


In [6]:
# Hub repo ids of the fine-tuned models whose logits are averaged in
# distill_function. The printed form of this tuple is md5-checked in the next
# cell, so neither the spelling nor the order of the entries may change.
DISTILL_MODEL_CHCKPNT = (
  'mazkooleg/0-9up-ast-ft',
  'mazkooleg/0-9up-wavlm-base-plus-ft',
  'mazkooleg/0-9up-hubert-base-ls960-ft',
  'mazkooleg/0-9up-unispeech-sat-base-ft',
  'mazkooleg/0-9up-data2vec-audio-base-960h-ft',
)

In [7]:
md5 = !echo "{DISTILL_MODEL_CHCKPNT}" | md5sum
assert md5[0][:32] == 'cd2bb695b95773ada0f3aec2321458a6'

In [8]:
# Load every checkpoint as an audio-classification model and move it to the GPU.
# The generator is consumed lazily, so each model is loaded and transferred
# before the next one is fetched.
distill_model = tuple(
  AutoModelForAudioClassification.from_pretrained(checkpoint).to('cuda')
  for checkpoint in DISTILL_MODEL_CHCKPNT
)

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.10k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/342M [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/2.93k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/378M [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/2.02k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/378M [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/2.45k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/378M [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/2.14k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/374M [00:00<?, ?B/s]

In [9]:
# One feature extractor per checkpoint, in the same order as DISTILL_MODEL_CHCKPNT
# so that extractor i can be paired with model i.
feature_extractor = tuple(
  AutoFeatureExtractor.from_pretrained(checkpoint)
  for checkpoint in DISTILL_MODEL_CHCKPNT
)
# Show the first extractor's configuration as the cell output.
feature_extractor[0]

Downloading (…)rocessor_config.json:   0%|          | 0.00/295 [00:00<?, ?B/s]

Downloading (…)rocessor_config.json:   0%|          | 0.00/215 [00:00<?, ?B/s]

Downloading (…)rocessor_config.json:   0%|          | 0.00/213 [00:00<?, ?B/s]

Downloading (…)rocessor_config.json:   0%|          | 0.00/213 [00:00<?, ?B/s]

Downloading (…)rocessor_config.json:   0%|          | 0.00/256 [00:00<?, ?B/s]

ASTFeatureExtractor {
  "do_normalize": true,
  "feature_extractor_type": "ASTFeatureExtractor",
  "feature_size": 1,
  "max_length": 128,
  "mean": -6.845978,
  "num_mel_bins": 128,
  "padding_side": "right",
  "padding_value": 0.0,
  "return_attention_mask": false,
  "sampling_rate": 16000,
  "std": 5.5654526
}

In [10]:
# Convert one decoded audio example into MFCC features.
#
# `audio` is a mapping with the keys "sampling_rate" and "array". The asserts
# require exactly 16000 samples at a rate of 16000, stored as float32 or float64.
# The samples are scaled by 32768 and cast to int16, then passed to
# python_speech_features' `mfcc` with winlen=0.05, winstep=0.02 and nfft=1024;
# the result of that call is returned unchanged.
#
# NOTE: the documentation is kept above the `def` on purpose. The following cell
# asserts the md5 of `inspect.getsource(mfcc_from_audio)`, so any edit inside the
# function (including a comment or docstring) would make that assertion fail;
# comments placed before the `def` line are not part of that source.
# NOTE(review): a sample equal to exactly 1.0 scales to 32768, which is outside the
# int16 range — presumably inputs stay within [-1, 1); confirm against the dataset.
def mfcc_from_audio(audio):
  rate = audio["sampling_rate"]
  sig  = audio["array"]
  assert len(sig) == 16000, len(sig)
  assert rate == 16000, rate
  assert sig.dtype in ('float64', 'float32'), sig.dtype
  sig = np.multiply(sig, 32768).astype('int16')
  assert sig.dtype == 'int16', sig.dtype
  return mfcc_speech_features(sig, rate, winlen=0.05, winstep=0.02, nfft=1024)

In [11]:
md5 = !echo "{inspect.getsource(mfcc_from_audio).strip()}" | md5sum
assert md5[0][:32] == '5e3d4a7939954764016b78d56686d965'

In [12]:
# Batched callback for `Dataset.map`: builds the distillation targets for one batch.
#
# `examples['audio']` is iterated as a sequence of audio examples, each providing
# an 'array' entry. For every example, mfcc_from_audio is called and the first and
# last entries along the first axis of its result are dropped ([1:-1]) —
# presumably the first and last frames; confirm.
# The same raw arrays are passed through each entry of `feature_extractor`, the
# resulting inputs are moved to CUDA and fed to the model at the same position in
# `distill_model`. The per-model logits are stacked and averaged over the model
# axis (dim=0) inside torch.no_grad().
#
# Returns a dict with 'logits' (the averaged logits, moved to the CPU) and 'mfcc'
# (the list of trimmed MFCC results).
#
# NOTE: the documentation is kept above the `def` on purpose. The following cell
# asserts the md5 of `inspect.getsource(distill_function)`, so any edit inside the
# function (including a comment or docstring) would make that assertion fail;
# comments placed before the `def` line are not part of that source.
# NOTE(review): max_length is set to one second's worth of samples for every
# extractor; whether each extractor type actually honours it is not visible
# here — confirm.
def distill_function(examples):
  examples_audio = examples['audio']
  mfcc   = [mfcc_from_audio(x)[1:-1] for x in examples_audio]
  inputs = [f(
    raw_speech = [x['array'] for x in examples_audio],
    sampling_rate=f.sampling_rate, 
    max_length=int(f.sampling_rate * 1.0), # seconds
    return_tensors='pt'
  ).to('cuda') for f in feature_extractor]
  with torch.no_grad():
    logits = torch.stack([m(**i).logits for i, m in zip(inputs, distill_model)])
    return {'logits': torch.mean(logits, dim=0).to('cpu'), 'mfcc': mfcc}

In [13]:
md5 = !echo "{inspect.getsource(distill_function).strip()}" | md5sum
assert md5[0][:32] == '8f1a8b6a585883a566118b242f70b88d'

In [14]:
# Turn off the `datasets` library cache before the large map over all splits.
# NOTE(review): presumably done to avoid persisting cache files for the
# ~1.1M-row train split — confirm the intent.
hfds.disable_caching()

In [15]:
# Run the teacher ensemble and MFCC extraction over every split in batches of
# 512 examples, dropping the raw "audio" column from the result.
# Note: `ds` is rebound here, so this cell cannot be re-run without first
# reloading the raw dataset.
ds = ds.map(
  distill_function,
  batched=True,
  batch_size=512,
  remove_columns="audio",
)
ds

Map:   0%|          | 0/1095480 [00:00<?, ? examples/s]

Map:   0%|          | 0/3368 [00:00<?, ? examples/s]

Map:   0%|          | 0/3773 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['label', 'logits', 'mfcc'],
        num_rows: 1095480
    })
    validation: Dataset({
        features: ['label', 'logits', 'mfcc'],
        num_rows: 3368
    })
    test: Dataset({
        features: ['label', 'logits', 'mfcc'],
        num_rows: 3773
    })
})

In [16]:
# Per split, count the rows where the arg-max of the averaged ensemble logits
# differs from the reference label.
{
  split: sum(np.argmax(ds[split]['logits'], -1) != ds[split]['label'])
  for split in ('test', 'validation')
}

{'test': 9, 'validation': 4}

In [22]:
# Authenticate with the Hugging Face Hub using the secret stored under the name
# 'hf-datasets-token'. The token is passed straight into login() so it is never
# bound to a notebook variable.
huggingface_hub.login(UserSecretsClient().get_secret('hf-datasets-token'))

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid.
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [23]:
# Publish all splits (label, averaged logits, MFCC features) to the Hub.
HUB_DATASET_REPO = 'mazkooleg/0-9up-ft_ensemble_distilled_mfcc'
ds.push_to_hub(HUB_DATASET_REPO)

Pushing dataset shards to the dataset hub:   0%|          | 0/12 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/92 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/92 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/92 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/92 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/92 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/92 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/92 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/92 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/92 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/92 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/92 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/92 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/4 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/4 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

In [24]:
# Completion marker printed once every preceding cell has finished. Plain Python
# keeps the message readable in the source and needs no shell utilities.
print('Done !!!')

Done !!!
