# Building a Music Genre Classifier

# Pre-trained models and datasets for audio classification

## Audio Classifier

### Keyword Spotting

In [1]:
from datasets import load_dataset

minds = load_dataset("PolyAI/minds14", name="en-AU", split="train") # Minds-14 Dataset

In [2]:
from transformers import pipeline

classifier = pipeline(
    "audio-classification",
    model="anton-l/xtreme_s_xlsr_300m_minds14",
)




Device set to use cpu


In [5]:
classifier(minds[0]["audio"])

[{'score': 0.9611988067626953, 'label': 'pay_bill'},
 {'score': 0.029601896181702614, 'label': 'freeze'},
 {'score': 0.0035503068938851357, 'label': 'card_issues'},
 {'score': 0.0021323091350495815, 'label': 'abroad'},
 {'score': 0.0008829645812511444, 'label': 'high_value_payment'}]

### Speech Commands

#### Google Speech Commands Dataset

In [17]:
speech_commands = load_dataset(
    "speech_commands", "v0.02", split ="validation", streaming =True, trust_remote_code=True
)
sample = next(iter(speech_commands))

In [25]:
def sample_noise(example):
    # Use this function to extract random 1 sec slices of each _silence_ utterance,
    # e.g. inside `torch.utils.data.Dataset.__getitem__()`
    from random import randint

    if example["label"] == "_silence_":
        random_offset = randint(0, len(example["speech"]) - example["sample_rate"] - 1)
        example["speech"] = example["speech"][random_offset : random_offset + example["sample_rate"]]

    return example

TypeError: 'int' object is not subscriptable

In [26]:
for i, example in enumerate(speech_commands):
    print(f"Example {i}:")
    print(f"Label: {example['label']}")
    print(f"Audio Path: {example['audio']['path']}")
    if i == 200:  # Stop after a few examples
        break

Example 0:
Label: 30
Audio Path: backward/0d82fd99_nohash_2.wav
Example 1:
Label: 30
Audio Path: backward/4e6902d0_nohash_0.wav
Example 2:
Label: 30
Audio Path: backward/ef77b778_nohash_3.wav
Example 3:
Label: 30
Audio Path: backward/ce0cb033_nohash_1.wav
Example 4:
Label: 30
Audio Path: backward/30276d03_nohash_1.wav
Example 5:
Label: 30
Audio Path: backward/ff2b842e_nohash_4.wav
Example 6:
Label: 30
Audio Path: backward/1e02ffc5_nohash_2.wav
Example 7:
Label: 30
Audio Path: backward/f822b9bf_nohash_3.wav
Example 8:
Label: 30
Audio Path: backward/73f20b00_nohash_0.wav
Example 9:
Label: 30
Audio Path: backward/856eb138_nohash_2.wav
Example 10:
Label: 30
Audio Path: backward/856eb138_nohash_3.wav
Example 11:
Label: 30
Audio Path: backward/1e02ffc5_nohash_0.wav
Example 12:
Label: 30
Audio Path: backward/0d82fd99_nohash_1.wav
Example 13:
Label: 30
Audio Path: backward/a9ca1818_nohash_1.wav
Example 14:
Label: 30
Audio Path: backward/64e48f55_nohash_0.wav
Example 15:
Label: 30
Audio Path: b

KeyboardInterrupt: 

In [37]:
# Get the 300th element
target_index = 1500
for idx, example in enumerate(speech_commands):
    if idx == target_index:
        specific_element = example
        break

print(f"Element {target_index}: {specific_element}")

Element 1500: {'file': 'eight/856eb138_nohash_4.wav', 'audio': {'path': 'eight/856eb138_nohash_4.wav', 'array': array([ 0.00000000e+00,  3.05175781e-05,  0.00000000e+00, ...,
       -9.15527344e-05, -9.15527344e-05, -9.15527344e-05]), 'sampling_rate': 16000}, 'label': 18, 'is_unknown': True, 'speaker_id': '856eb138', 'utterance_id': 4}


#### MIT audio Classifier

In [40]:
classifier = pipeline(
    "audio-classification" , model ="MIT/ast-finetuned-speech-commands-v2"
)
classifier(specific_element["audio"].copy())

Device set to use cpu


[{'score': 0.9999958276748657, 'label': 'eight'},
 {'score': 3.551778036126052e-07, 'label': 'up'},
 {'score': 2.662008853349107e-07, 'label': 'two'},
 {'score': 2.4703945200599264e-07, 'label': 'one'},
 {'score': 2.1553124440742977e-07, 'label': 'right'}]

#### Audio Generation

In [39]:
from IPython.display import Audio

Audio(specific_element["audio"]["array"], rate=specific_element["audio"]["sampling_rate"])

### Language Identification

In [43]:
fleurs = load_dataset("google/fleurs", "all", split="validation", streaming=True,trust_remote_code=True)
sample = next(iter(fleurs))

In [44]:
classifier = pipeline(
    "audio-classification", model="sanchit-gandhi/whisper-medium-fleurs-lang-id"
)

config.json:   0%|          | 0.00/6.64k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


model.safetensors:   0%|          | 0.00/615M [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/339 [00:00<?, ?B/s]

Device set to use cpu


In [53]:
# Get the 300th element
target_index = 6
for idx, example in enumerate(fleurs):
    if idx == target_index:
        ln_sample = example
        break

print(f"Element {target_index}: {ln_sample}")

Element 6: {'id': 1393, 'num_samples': 216960, 'path': None, 'audio': {'path': 'dev/10635177899066534417.wav', 'array': array([ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
        2.56299973e-06, -3.93807888e-04,  6.53862953e-05]), 'sampling_rate': 16000}, 'transcription': 'mense sou offerande bring aan die gode en die priesters sou probeer om die behoeftes van die gode deur seremonies en feeste te bevredig', 'raw_transcription': 'Mense sou offerande bring aan die gode en die priesters sou probeer om die behoeftes van die gode deur seremonies en feeste te bevredig.', 'gender': 0, 'lang_id': 0, 'language': 'Afrikaans', 'lang_group_id': 3}


In [54]:
classifier(ln_sample["audio"])

[{'score': 0.9999363422393799, 'label': 'Afrikaans'},
 {'score': 8.904085007088725e-06, 'label': 'Northern-Sotho'},
 {'score': 3.3514932056277758e-06, 'label': 'Cantonese Chinese'},
 {'score': 3.2393081710324623e-06, 'label': 'Bengali'},
 {'score': 3.21026413985237e-06, 'label': 'Icelandic'}]

### Zero-Shot Audio Classification

In the traditional paradigm for audio classification, the model predicts a class label from a pre-defined set of possible classes. This poses a barrier to using pre-trained models for audio classification, since the label set of the pre-trained model must match that of the downstream task. For the previous example of LID, the model must predict one of the 102 langauge classes on which it was trained. If the downstream task actually requires 110 languages, the model would not be able to predict 8 of the 110 languages, and so would require re-training to achieve full coverage. This limits the effectiveness of transfer learning for audio classification tasks.



In [55]:
dataset = load_dataset("ashraq/esc50", split="train", streaming=True)
audio_sample = next(iter(dataset))["audio"]["array"]

README.md:   0%|          | 0.00/345 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Repo card metadata block was not found. Setting CardData to empty.


dataset_infos.json:   0%|          | 0.00/1.61k [00:00<?, ?B/s]

In [79]:
candidate_labels = ["Sound of a frog", "Sound of cat"]

In [80]:
classifier = pipeline(
    task="zero-shot-audio-classification", model="laion/clap-htsat-unfused"
)
classifier(audio_sample, candidate_labels=candidate_labels)

Device set to use cpu


[{'score': 0.9124478101730347, 'label': 'Sound of a frog'},
 {'score': 0.08755218237638474, 'label': 'Sound of cat'}]

In [76]:
from IPython.display import Audio

Audio(audio_sample, rate=16000)

## -------------------------------------------------------------------------------------------------

# Fine-tuning a model for audio classification

In [4]:
from datasets import load_dataset

gtzan = load_dataset("marsyas/gtzan", "all",trust_remote_code=True)
gtzan

DatasetDict({
    train: Dataset({
        features: ['file', 'audio', 'genre'],
        num_rows: 999
    })
})

In [5]:
gtzan = gtzan["train"].train_test_split(seed=42, shuffle=True, test_size=0.1)
gtzan

DatasetDict({
    train: Dataset({
        features: ['file', 'audio', 'genre'],
        num_rows: 899
    })
    test: Dataset({
        features: ['file', 'audio', 'genre'],
        num_rows: 100
    })
})

In [6]:
gtzan["train"][4]

{'file': 'C:\\Users\\furka\\.cache\\huggingface\\datasets\\downloads\\extracted\\1a5d7cf63ab8dc103398c40c3f9d2a453f7a5e7be8402c354d4cf2ae87a96650\\genres\\rock\\rock.00047.wav',
 'audio': {'path': 'C:\\Users\\furka\\.cache\\huggingface\\datasets\\downloads\\extracted\\1a5d7cf63ab8dc103398c40c3f9d2a453f7a5e7be8402c354d4cf2ae87a96650\\genres\\rock\\rock.00047.wav',
  'array': array([-0.10357666, -0.08364868,  0.08837891, ...,  0.13192749,
          0.18023682,  0.19668579]),
  'sampling_rate': 22050},
 'genre': 9}

In [7]:
id2label_fn = gtzan["train"].features["genre"].int2str
id2label_fn(gtzan["train"][4]["genre"])

'rock'

## From audio to machine learning features

### Preprocessing the data

In [31]:
from transformers import AutoFeatureExtractor

model_id = "ntu-spml/distilhubert"
feature_extractor = AutoFeatureExtractor.from_pretrained(
    model_id, do_normalize=True, return_attention_mask=True
)

In [32]:
sampling_rate = feature_extractor.sampling_rate
sampling_rate

16000

In [33]:
from datasets import Audio
gtzan = gtzan.cast_column("audio", Audio(sampling_rate=sampling_rate))

In [34]:
gtzan["train"][0]

{'file': 'C:\\Users\\furka\\.cache\\huggingface\\datasets\\downloads\\extracted\\1a5d7cf63ab8dc103398c40c3f9d2a453f7a5e7be8402c354d4cf2ae87a96650\\genres\\pop\\pop.00098.wav',
 'audio': {'path': 'C:\\Users\\furka\\.cache\\huggingface\\datasets\\downloads\\extracted\\1a5d7cf63ab8dc103398c40c3f9d2a453f7a5e7be8402c354d4cf2ae87a96650\\genres\\pop\\pop.00098.wav',
  'array': array([ 0.08735093,  0.20183387,  0.47908676, ..., -0.1874318 ,
         -0.23294398, -0.13517427]),
  'sampling_rate': 16000},
 'genre': 7}

Great! We can see that the sampling rate has been downsampled to 16kHz. The array values are also different, as we’ve now only got approximately one amplitude value for every 1.5 that we had before.

A defining feature of Wav2Vec2 and HuBERT like models is that they accept a float array corresponding to the raw waveform of the speech signal as an input. This is in contrast to other models, like Whisper, where we pre-process the raw audio waveform to spectrogram format.

We can take a look at the feature extractor in operation by applying it to our first audio sample. First, let’s compute the mean and variance of our raw audio data:

In [35]:
import numpy as np

sample = gtzan["train"][0]["audio"]

print(f"Mean: {np.mean(sample['array']):.3}, Variance: {np.var(sample['array']):.3}")

Mean: 0.000185, Variance: 0.0493


We can see that the mean is close to zero already, but the variance is closer to 0.05. If the variance for the sample was larger, it could cause our model problems, since the dynamic range of the audio data would be very small and thus difficult to separate. Let’s apply the feature extractor and see what the outputs look like:


In [36]:
inputs = feature_extractor(sample["array"], sampling_rate=sample["sampling_rate"])

print(f"inputs keys: {list(inputs.keys())}")

print(
    f"Mean: {np.mean(inputs['input_values']):.3}, Variance: {np.var(inputs['input_values']):.3}"
)

inputs keys: ['input_values', 'attention_mask']
Mean: -7.03e-09, Variance: 1.0


Great, so now we know how to process our resampled audio files, the last thing to do is define a function that we can apply to all the examples in the dataset. Since we expect the audio clips to be 30 seconds in length, we’ll also truncate any longer clips by using the max_length and truncation arguments of the feature extractor as follows:

In [37]:
max_duration = 30.0

def preprocess_function(examples):
    audio_arrays = [x["array"] for x in examples["audio"]]
    inputs = feature_extractor(
        audio_arrays,
        sampling_rate=feature_extractor.sampling_rate,
        max_length=int(feature_extractor.sampling_rate * max_duration),
        truncation=True,
        return_attention_mask=True,
    )
    return inputs

In [46]:
gtzan_encoded = gtzan.map(
    preprocess_function,
    remove_columns=["audio", "file"],
    batched=True,
    batch_size=50,
    num_proc=1,
)
gtzan_encoded

DatasetDict({
    train: Dataset({
        features: ['genre', 'input_values', 'attention_mask'],
        num_rows: 899
    })
    test: Dataset({
        features: ['genre', 'input_values', 'attention_mask'],
        num_rows: 100
    })
})

To simplify the training, we’ve removed the audio and file columns from the dataset. The input_values column contains the encoded audio files, the attention_mask a binary mask of 0/1 values that indicate where we have padded the audio input, and the genre column contains the corresponding labels (or targets). To enable the Trainer to process the class labels, we need to rename the genre column to label:

In [47]:
gtzan_encoded = gtzan_encoded.rename_column("genre", "label")

Finally, we need to obtain the label mappings from the dataset. This mapping will take us from integer ids (e.g. 7) to human-readable class labels (e.g. "pop") and back again. In doing so, we can convert our model’s integer id prediction into human-readable format, enabling us to use the model in any downstream application. We can do this by using the int2str() method as follows:

In [48]:
id2label = {
    str(i): id2label_fn(i)
    for i in range(len(gtzan_encoded["train"].features["label"].names))
}
label2id = {v: k for k, v in id2label.items()}

id2label["9"]

'rock'

OK, we’ve now got a dataset that’s ready for training! Let’s take a look at how we can train a model on this dataset.

# Fine-tuning the model

To fine-tune the model, we’ll use the ***Trainer*** class from 🤗 Transformers. As we’ve seen in other chapters, the *Trainer* is a high-level API that is designed to handle the most common training scenarios. In this case, we’ll use the *Trainer* to fine-tune the model on GTZAN. To do this, we’ll first need to load a model for this task. We can do this by using the ***AutoModelForAudioClassification*** class, which will automatically add the appropriate classification head to our pretrained DistilHuBERT model. Let’s go ahead and instantiate the model:



In [49]:
from transformers import AutoModelForAudioClassification

num_labels = len(id2label)

model = AutoModelForAudioClassification.from_pretrained(
    model_id,
    num_labels=num_labels,
    label2id=label2id,
    id2label=id2label,
)

Some weights of HubertForSequenceClassification were not initialized from the model checkpoint at ntu-spml/distilhubert and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [50]:
from transformers import TrainingArguments

model_name = model_id.split("/")[-1]
batch_size = 4
gradient_accumulation_steps = 1
num_train_epochs = 10

training_args = TrainingArguments(
    f"{model_name}-finetuned-gtzan",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_train_epochs,
    warmup_ratio=0.1,
    logging_steps=5,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    fp16=True,
    push_to_hub=False,
)

### Evaluate Library

In [51]:
from transformers import Trainer

trainer = Trainer(
    model,
    training_args,
    train_dataset=gtzan_encoded["train"],
    eval_dataset=gtzan_encoded["test"],
    tokenizer=feature_extractor,
    compute_metrics=compute_metrics,
)

trainer.train()

  trainer = Trainer(


RuntimeError: expected scalar type Long but found Int

# Creating Checkpoint

In [1]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…