# Unit 4. Build a music genre classifier
- Run using A100 (more performant) or V100 (cheaper) GPU

## Pre-trained models for audio classification

In [1]:
# Install the latest version (not on PyPi at that time)
!pip install git+https://github.com/huggingface/transformers

Collecting git+https://github.com/huggingface/transformers
  Cloning https://github.com/huggingface/transformers to /tmp/pip-req-build-4y6ze7dj
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/transformers /tmp/pip-req-build-4y6ze7dj
  Resolved https://github.com/huggingface/transformers to commit 70b49f023c9f6579c516671604468a491227b4da
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting huggingface-hub<1.0,>=0.15.1 (from transformers==4.33.0.dev0)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers==4.33.0.dev0)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━

### Keyword Spotting

In [3]:
!pip install datasets

Collecting datasets
  Downloading datasets-2.14.4-py3-none-any.whl (519 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.3/519.3 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.3.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: xxhash, dill, multiprocess, datasets
Successfully installed datasets-2.14.

In [4]:
# Use MINDS-14 to classify the recordings by intent of the call
from datasets import load_dataset

minds = load_dataset("PolyAI/minds14", name="en-AU", split="train")

Downloading builder script:   0%|          | 0.00/5.95k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/5.29k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/471M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [5]:
# Load a pre-trained model
from transformers import pipeline

classifier = pipeline(
    "audio-classification",
    model="anton-l/xtreme_s_xlsr_300m_minds14",
)

Downloading (…)lve/main/config.json:   0%|          | 0.00/2.73k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.26G [00:00<?, ?B/s]

Downloading (…)rocessor_config.json:   0%|          | 0.00/212 [00:00<?, ?B/s]

In [6]:
# Pass a sample to make a prediction
classifier(minds[0]["path"])

[{'score': 0.9623646140098572, 'label': 'pay_bill'},
 {'score': 0.028678612783551216, 'label': 'freeze'},
 {'score': 0.0034296319354325533, 'label': 'card_issues'},
 {'score': 0.002060496713966131, 'label': 'abroad'},
 {'score': 0.0008625703630968928, 'label': 'high_value_payment'}]

### Speech Commands

In [7]:
# Load a sample of the Speech Commands dataset using streaming mode
speech_commands = load_dataset(
    "speech_commands", "v0.02", split="validation", streaming=True
)
sample = next(iter(speech_commands))

Downloading builder script:   0%|          | 0.00/7.31k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/8.34k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/12.1k [00:00<?, ?B/s]

In [8]:
# Load an Audio Spectrogram Transformer checkpoint fine-tuned
classifier = pipeline(
    "audio-classification",
    model="MIT/ast-finetuned-speech-commands-v2"
)

# Need to copy otherwise audio value is lost -> Why ?
classifier(sample["audio"].copy())

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.79k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/342M [00:00<?, ?B/s]

Downloading (…)rocessor_config.json:   0%|          | 0.00/295 [00:00<?, ?B/s]

[{'score': 0.9999892711639404, 'label': 'backward'},
 {'score': 1.7504888774055871e-06, 'label': 'happy'},
 {'score': 6.703033363919531e-07, 'label': 'follow'},
 {'score': 5.805884484288981e-07, 'label': 'stop'},
 {'score': 5.614541578324861e-07, 'label': 'up'}]

In [9]:
# Listen to the sample: it seems that it is indeed said "backward"
from IPython.display import Audio

Audio(sample["audio"]["array"], rate=sample["audio"]["sampling_rate"])

### Language Identification

In [10]:
# Load a sample from the validation split of the FLEURS dataset using streaming
# https://huggingface.co/datasets/google/fleurs
fleurs = load_dataset("google/fleurs", "all", split="validation",
                      streaming=True)
sample = next(iter(fleurs))

Downloading builder script:   0%|          | 0.00/12.6k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/13.3k [00:00<?, ?B/s]

In [11]:
# Load audio classification model: Whisper fine-tuned
classifier = pipeline(
    "audio-classification",
    model="sanchit-gandhi/whisper-medium-fleurs-lang-id"
)

Downloading (…)lve/main/config.json:   0%|          | 0.00/6.64k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/615M [00:00<?, ?B/s]

Downloading (…)rocessor_config.json:   0%|          | 0.00/339 [00:00<?, ?B/s]

In [12]:
# Generate a prediction
classifier(sample["audio"].copy())

[{'score': 0.9999330043792725, 'label': 'Afrikaans'},
 {'score': 7.093030490068486e-06, 'label': 'Northern-Sotho'},
 {'score': 4.269153578206897e-06, 'label': 'Icelandic'},
 {'score': 3.266120529588079e-06, 'label': 'Danish'},
 {'score': 3.258075366829871e-06, 'label': 'Cantonese Chinese'}]

### Zero-Shot Audio Classification

In [13]:
# Load the dataset
dataset = load_dataset("ashraq/esc50", split="train", streaming=True)
audio_sample = next(iter(dataset))["audio"]["array"]

Downloading readme:   0%|          | 0.00/345 [00:00<?, ?B/s]

Repo card metadata block was not found. Setting CardData to empty.


Downloading metadata:   0%|          | 0.00/1.61k [00:00<?, ?B/s]

In [14]:
# Define the candidate labels, a priori
candidate_labels = ["Sound of a dog", "Sound of a vacuum cleaner"]

# Get a score for each candidate label
classifier = pipeline(
    task="zero-shot-audio-classification",
    model="laion/clap-htsat-unfused"
)
classifier(audio_sample.copy(), candidate_labels=candidate_labels)

Downloading (…)lve/main/config.json:   0%|          | 0.00/5.39k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/615M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/384 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

Downloading (…)rocessor_config.json:   0%|          | 0.00/541 [00:00<?, ?B/s]

[{'score': 0.999643087387085, 'label': 'Sound of a dog'},
 {'score': 0.0003569532709661871, 'label': 'Sound of a vacuum cleaner'}]

In [15]:
# Listen to the sample to confirm
Audio(audio_sample, rate=16000)

## Fine-tuning a model for music classification

### The dataset

In [8]:
# Load the dataset: songs for music classification
from datasets import load_dataset

gtzan = load_dataset("marsyas/gtzan", split="train")
gtzan

Dataset({
    features: ['file', 'audio', 'genre'],
    num_rows: 999
})

In [9]:
# Create a validation set
gtzan = gtzan.train_test_split(seed=23, shuffle=True, test_size=0.1)
gtzan

DatasetDict({
    train: Dataset({
        features: ['file', 'audio', 'genre'],
        num_rows: 899
    })
    test: Dataset({
        features: ['file', 'audio', 'genre'],
        num_rows: 100
    })
})

In [10]:
# Have a look at one audio file
# Notice that sampling rate here is 22050 Hz
gtzan["train"][0]

{'file': '/root/.cache/huggingface/datasets/downloads/extracted/5022b0984afa7334ff9a3c60566280b08b5179d4ac96a628052bada7d8940244/genres/jazz/jazz.00072.wav',
 'audio': {'path': '/root/.cache/huggingface/datasets/downloads/extracted/5022b0984afa7334ff9a3c60566280b08b5179d4ac96a628052bada7d8940244/genres/jazz/jazz.00072.wav',
  'array': array([-0.06265259, -0.05786133, -0.05596924, ..., -0.00128174,
         -0.00396729, -0.00643921]),
  'sampling_rate': 22050},
 'genre': 5}

In [11]:
# Convert to "human-readable" genre
id2label_fn = gtzan["train"].features["genre"].int2str
id2label_fn(gtzan["train"][0]["genre"])

'jazz'

In [25]:
!pip install gradio



In [22]:
# Listen to a few examples using Gradio
import gradio as gr


def generate_audio():
    example = gtzan["train"].shuffle()[0]
    audio = example["audio"]
    return (
        audio["sampling_rate"],
        audio["array"],
    ), id2label_fn(example["genre"])


with gr.Blocks() as demo:
    with gr.Column():
        for _ in range(4):
            audio, label = generate_audio()
            output = gr.Audio(audio, label=label)

demo.launch(debug=True)



Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
Note: opening Chrome Inspector may crash demo inside Colab notebooks.

To create a public link, set `share=True` in `launch()`.


<IPython.core.display.Javascript object>

Keyboard interruption in main thread... closing server.




### Preprocessing the data

In [12]:
# Instantiate feature extractor
from transformers import AutoFeatureExtractor

model_id = "ntu-spml/distilhubert"
feature_extractor = AutoFeatureExtractor.from_pretrained(
    model_id, do_normalize=True, return_attention_mask=True
)

In [13]:
# Check the model' sampling rate
# It is different from 22050 Hz
sampling_rate = feature_extractor.sampling_rate
sampling_rate

16000

In [14]:
# Resample the dataset to match the expected value of the model
from datasets import Audio

gtzan = gtzan.cast_column("audio", Audio(sampling_rate=sampling_rate))

In [15]:
# Check that it correctly resampled
gtzan["train"][0]

{'file': '/root/.cache/huggingface/datasets/downloads/extracted/5022b0984afa7334ff9a3c60566280b08b5179d4ac96a628052bada7d8940244/genres/jazz/jazz.00072.wav',
 'audio': {'path': '/root/.cache/huggingface/datasets/downloads/extracted/5022b0984afa7334ff9a3c60566280b08b5179d4ac96a628052bada7d8940244/genres/jazz/jazz.00072.wav',
  'array': array([-0.05011164, -0.06142042, -0.04560325, ..., -0.00152192,
         -0.00519248,  0.        ]),
  'sampling_rate': 16000},
 'genre': 5}

In [16]:
# Check for the current mean and variance of the raw audio data
import numpy as np

sample = gtzan["train"][0]["audio"]

print(f"Mean: {np.mean(sample['array']):.3}, Variance: {np.var(sample['array']):.3}")

Mean: -0.00198, Variance: 0.0157


In [17]:
# Apply the feature extractor and have a look at the output
inputs = feature_extractor(sample["array"],
                           sampling_rate=sample["sampling_rate"])

print(f"inputs keys: {list(inputs.keys())}")
print(
    f"Mean: {np.mean(inputs['input_values']):.3}, Variance: {np.var(inputs['input_values']):.3}"
)

inputs keys: ['input_values', 'attention_mask']
Mean: 9e-09, Variance: 1.0


In [18]:
# Define a function to preprocess the data
max_duration = 30.0


def preprocess_function(examples):
    audio_arrays = [x["array"] for x in examples["audio"]]
    inputs = feature_extractor(
        audio_arrays,
        sampling_rate=feature_extractor.sampling_rate,
        max_length=int(feature_extractor.sampling_rate * max_duration),
        truncation=True,
        return_attention_mask=True,
    )
    return inputs


# Apply to the data using map
# batch_size set to 100 for Colab free tier, can be set higher otherwise
# Columns removed to simplify training
gtzan_encoded = gtzan.map(preprocess_function,
                          remove_columns=["audio", "file"],
                          batched=True,
                          batch_size=100,
                          num_proc=1,
)
gtzan_encoded

DatasetDict({
    train: Dataset({
        features: ['genre', 'input_values', 'attention_mask'],
        num_rows: 899
    })
    test: Dataset({
        features: ['genre', 'input_values', 'attention_mask'],
        num_rows: 100
    })
})

In [19]:
# Rename the "genre" for processing by the Trainer (see below)
gtzan_encoded = gtzan_encoded.rename_column("genre", "label")

# Obtain the mapping of the label
id2label = {
    str(i): id2label_fn(i)
    for i in range(len(gtzan_encoded["train"].features["label"].names))
}
label2id = {v: k for k, v in id2label.items()}

id2label["7"]

'pop'

### Fine-tuning the model

In [None]:
# accelerate>=0.20.1 needed for the Trainer
!pip install accelerate -U

# Need to restart Runtime after running this cell

In [20]:
# Link notebook to the hub
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [21]:
# Instantiate the Trainer model
from transformers import AutoModelForAudioClassification

num_labels = len(id2label)

model = AutoModelForAudioClassification.from_pretrained(
    model_id,
    num_labels=num_labels,
    label2id=label2id,
    id2label=id2label,
)

Some weights of HubertForSequenceClassification were not initialized from the model checkpoint at ntu-spml/distilhubert and are newly initialized: ['projector.bias', 'classifier.bias', 'classifier.weight', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [22]:
# Define the training arguments
from transformers import TrainingArguments

model_id = "ntu-spml/distilhubert"
model_name = model_id.split("/")[-1]
batch_size = 8
gradient_accumulation_steps = 1
num_train_epochs = 10

training_args = TrainingArguments(
    f"{model_name}-finetuned-gtzan",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_train_epochs,
    warmup_ratio=0.1,
    logging_steps=5,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    fp16=True,
    push_to_hub=True,
)

In [4]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.0-py3-none-any.whl (81 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/81.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━[0m [32m61.4/81.4 kB[0m [31m1.9 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
Collecting responses<0.19 (from evaluate)
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Installing collected packages: responses, evaluate
Successfully installed evaluate-0.4.0 responses-0.18.0


In [23]:
# Define the metrics for evaluation
import evaluate
import numpy as np

metric = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Computes accuracy on a batch of predictions"""
    predictions = np.argmax(eval_pred.predictions, axis=1)
    return metric.compute(predictions=predictions,
                          references=eval_pred.label_ids)

In [24]:
# Instantiate the Trainer and train the model
from transformers import Trainer

trainer = Trainer(
    model,
    training_args,
    train_dataset=gtzan_encoded["train"],
    eval_dataset=gtzan_encoded["test"],
    tokenizer=feature_extractor,
    compute_metrics=compute_metrics,
)

# Training took 1h35 for me on Colab Pay-as-you-go using V100
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,1.887,1.81752,0.52
2,1.282,1.357036,0.61
3,0.9291,1.045712,0.7
4,0.6111,0.806512,0.78
5,0.4424,0.798259,0.77
6,0.2771,0.70267,0.79
7,0.2445,0.626809,0.81
8,0.2511,0.661131,0.82
9,0.1159,0.66364,0.79
10,0.1026,0.653047,0.82


TrainOutput(global_step=1130, training_loss=0.7410096934411378, metrics={'train_runtime': 5773.055, 'train_samples_per_second': 1.557, 'train_steps_per_second': 0.196, 'total_flos': 6.133988274624e+17, 'train_loss': 0.7410096934411378, 'epoch': 10.0})

### Sharing the model

In [None]:
# Define kwargs
kwargs = {
    "dataset_tags": "marsyas/gtzan",
    "dataset": "GTZAN",
    "model_name": f"{model_name}-finetuned-gtzan",
    "finetuned_from": model_id,
    "tasks": "audio-classification",
}

In [None]:
# Push to Hub
trainer.push_to_hub(**kwargs)

## Build a demo with Gradio

In [26]:
# Load a model
from transformers import pipeline

model_id = "sanchit-gandhi/distilhubert-finetuned-gtzan"
pipe = pipeline("audio-classification", model=model_id)

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.85k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/94.8M [00:00<?, ?B/s]

Some weights of the model checkpoint at sanchit-gandhi/distilhubert-finetuned-gtzan were not used when initializing HubertForSequenceClassification: ['hubert.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'hubert.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
- This IS expected if you are initializing HubertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing HubertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of HubertForSequenceClassification were not initialized from the model checkpoint at sanchit-gandhi/distilhubert-finetuned-gtzan and are newly initialized: ['hubert.encoder.pos_conv_embed.conv.wei

Downloading (…)rocessor_config.json:   0%|          | 0.00/212 [00:00<?, ?B/s]

In [27]:
# Define a function for classification
def classify_audio(filepath):
    preds = pipe(filepath)
    outputs = {}
    for p in preds:
        outputs[p["label"]] = p["score"]
    return outputs


# Run using Gradio
import gradio as gr

demo = gr.Interface(
    fn=classify_audio, inputs=gr.Audio(type="filepath"), outputs=gr.outputs.Label()
)
demo.launch(debug=True)

  fn=classify_audio, inputs=gr.Audio(type="filepath"), outputs=gr.outputs.Label()
  fn=classify_audio, inputs=gr.Audio(type="filepath"), outputs=gr.outputs.Label()


Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
Note: opening Chrome Inspector may crash demo inside Colab notebooks.

To create a public link, set `share=True` in `launch()`.


<IPython.core.display.Javascript object>

Keyboard interruption in main thread... closing server.




## Hands-on exercise

In [1]:
# Install the latest version (not on PyPi at that time)
!pip install git+https://github.com/huggingface/transformers

Collecting git+https://github.com/huggingface/transformers
  Cloning https://github.com/huggingface/transformers to /tmp/pip-req-build-352razo1
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/transformers /tmp/pip-req-build-352razo1
  Resolved https://github.com/huggingface/transformers to commit 70b49f023c9f6579c516671604468a491227b4da
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone


In [2]:
!pip install datasets



In [3]:
!pip install evaluate



In [None]:
# accelerate>=0.20.1 needed for the Trainer
!pip install accelerate -U

# Need to restart Runtime after running this cell

In [4]:
# Start by loading the dataset
from datasets import load_dataset

gtzan = load_dataset("marsyas/gtzan", split="train")
gtzan

Dataset({
    features: ['file', 'audio', 'genre'],
    num_rows: 999
})

In [5]:
# Create a validation set
gtzan = gtzan.train_test_split(seed=42, shuffle=True, test_size=0.1)
gtzan

DatasetDict({
    train: Dataset({
        features: ['file', 'audio', 'genre'],
        num_rows: 899
    })
    test: Dataset({
        features: ['file', 'audio', 'genre'],
        num_rows: 100
    })
})

In [6]:
# Convert to "human-readable" genre
id2label_fn = gtzan["train"].features["genre"].int2str
id2label_fn(gtzan["train"][0]["genre"])

'pop'

In [7]:
# Instantiate feature extractor
from transformers import AutoFeatureExtractor

model_id = "ntu-spml/distilhubert"
#model_id = "facebook/wav2vec2-xls-r-300m"
#model_id = "MIT/ast-finetuned-audioset-10-10-0.4593"
feature_extractor = AutoFeatureExtractor.from_pretrained(
    model_id, do_normalize=True, return_attention_mask=True
)

In [8]:
# Check the model' sampling rate
# It is different from 22050 Hz
sampling_rate = feature_extractor.sampling_rate
sampling_rate

16000

In [9]:
# Resample the dataset to match the expected value of the model
from datasets import Audio

gtzan = gtzan.cast_column("audio", Audio(sampling_rate=sampling_rate))

In [10]:
# Define a function to preprocess the data
max_duration = 30.0


def preprocess_function(examples):
    audio_arrays = [x["array"] for x in examples["audio"]]
    inputs = feature_extractor(
        audio_arrays,
        sampling_rate=feature_extractor.sampling_rate,
        max_length=int(feature_extractor.sampling_rate * max_duration),
        truncation=True,
        return_attention_mask=True,
    )
    return inputs

# Apply to the data using map
# Columns removed to simplify training
gtzan_encoded = gtzan.map(preprocess_function,
                          remove_columns=["audio", "file"],
                          batched=True,
                          batch_size=100,
                          num_proc=1
)
gtzan_encoded

DatasetDict({
    train: Dataset({
        features: ['genre', 'input_values', 'attention_mask'],
        num_rows: 899
    })
    test: Dataset({
        features: ['genre', 'input_values', 'attention_mask'],
        num_rows: 100
    })
})

In [11]:
# Rename the "genre" for processing by the Trainer (see below)
gtzan_encoded = gtzan_encoded.rename_column("genre", "label")

# Obtain the mapping of the label
id2label = {
    str(i): id2label_fn(i)
    for i in range(len(gtzan_encoded["train"].features["label"].names))
}
label2id = {v: k for k, v in id2label.items()}

id2label["7"]

'pop'

In [12]:
# Instantiate the Trainer model
from transformers import AutoModelForAudioClassification

num_labels = len(id2label)

model = AutoModelForAudioClassification.from_pretrained(
    model_id,
    num_labels=num_labels,
    label2id=label2id,
    id2label=id2label,
)

Some weights of HubertForSequenceClassification were not initialized from the model checkpoint at ntu-spml/distilhubert and are newly initialized: ['projector.bias', 'classifier.bias', 'projector.weight', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
# Link notebook to the hub
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [14]:
# Define the training arguments
from transformers import TrainingArguments

model_id = "ntu-spml/distilhubert"
model_name = model_id.split("/")[-1]
batch_size = 8
gradient_accumulation_steps = 1
num_train_epochs = 10
#weight_decay = 0.006 # add regularization #TODO: fine-tune between [0.001-0.1]

training_args = TrainingArguments(
    f"{model_name}-finetuned-gtzan",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_train_epochs,
    warmup_ratio=0.1,
    logging_steps=5,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    fp16=True,
    push_to_hub=True
)

In [15]:
# Define the metrics for evaluation
import evaluate
import numpy as np

metric = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Computes accuracy on a batch of predictions"""
    predictions = np.argmax(eval_pred.predictions, axis=1)
    return metric.compute(predictions=predictions,
                          references=eval_pred.label_ids)

In [16]:
# Instantiate the Trainer and train the model
from transformers import Trainer

trainer = Trainer(
    model,
    training_args,
    train_dataset=gtzan_encoded["train"],
    eval_dataset=gtzan_encoded["test"],
    tokenizer=feature_extractor,
    compute_metrics=compute_metrics,
)

# Training took XX for me on Colab Pro V100
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,2.0262,1.827744,0.42
2,1.3859,1.319453,0.56
3,1.005,1.047439,0.74
4,0.8309,0.906607,0.71
5,0.5891,0.717598,0.82
6,0.4603,0.646894,0.81
7,0.4911,0.560491,0.88
8,0.1913,0.53914,0.86
9,0.3627,0.527194,0.88
10,0.1858,0.554396,0.87


TrainOutput(global_step=1130, training_loss=0.793328928419974, metrics={'train_runtime': 5483.8984, 'train_samples_per_second': 1.639, 'train_steps_per_second': 0.206, 'total_flos': 6.133988274624e+17, 'train_loss': 0.793328928419974, 'epoch': 10.0})

In [17]:
# Define the kwargs
kwargs = {
    "dataset_tags": "marsyas/gtzan",
    "dataset": "GTZAN",
    "model_name": f"{model_name}-finetuned-gtzan",
    "finetuned_from": model_id,
    "tasks": "audio-classification",
}

# Then push to the Hub
trainer.push_to_hub(**kwargs)

'https://huggingface.co/64FC/distilhubert-finetuned-gtzan/tree/main/'