<a href="https://colab.research.google.com/github/10udCryp7/Speech-Practice/blob/main/notebooks/Music_Genre_Classification_Practice.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -q condacolab
import condacolab
condacolab.install()

⏬ Downloading https://github.com/jaimergp/miniforge/releases/download/24.11.2-1_colab/Miniforge3-colab-24.11.2-1_colab-Linux-x86_64.sh...
📦 Installing...
📌 Adjusting configuration...
🩹 Patching environment...
⏲ Done in 0:00:06
🔁 Restarting kernel...


In [1]:
!conda install datasets[audio]

Channels:
 - conda-forge
Platform: linux-64
Collecting package metadata (repodata.json): - \ | / - \ | / - \ | done
Solving environment: - \ | / - \ | done


    current version: 24.11.3
    latest version: 25.5.1

Please update conda by running

    $ conda update -n base -c conda-forge conda



# All requested packages already installed.



In [2]:
!conda install evaluate

Channels:
 - conda-forge
Platform: linux-64
Collecting package metadata (repodata.json): - \ | / - \ | / - \ | / done
Solving environment: \ | / - \ | / - done


    current version: 24.11.3
    latest version: 25.5.1

Please update conda by running

    $ conda update -n base -c conda-forge conda



# All requested packages already installed.



In [79]:
from datasets import load_dataset
dataset = load_dataset("marsyas/gtzan", split = 'train')
dataset

Dataset({
    features: ['file', 'audio', 'genre'],
    num_rows: 999
})

In [80]:
dataset = dataset.train_test_split(test_size=0.1, shuffle=True, stratify_by_column='genre', seed = 42)
dataset

DatasetDict({
    train: Dataset({
        features: ['file', 'audio', 'genre'],
        num_rows: 899
    })
    test: Dataset({
        features: ['file', 'audio', 'genre'],
        num_rows: 100
    })
})

In [110]:
from transformers import AutoFeatureExtractor, AutoModelForAudioClassification

model_id = "facebook/wav2vec2-base"
feature_extractor = AutoFeatureExtractor.from_pretrained(model_id, do_normalize = True, return_attention_mask = True)

preprocessor_config.json:   0%|          | 0.00/159 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]



In [111]:
from datasets import Audio
dataset = dataset.cast_column("audio", Audio(sampling_rate=feature_extractor.sampling_rate))

In [112]:
id2label_fn = dataset['train'].features['genre'].int2str
id2label = {
    str(i): id2label_fn(i)
    for i in range(len(dataset['train'].features['genre'].names))
}
label2id = {v: k for k, v in id2label.items()}

In [123]:
model = AutoModelForAudioClassification.from_pretrained(
    model_id,
    num_labels=len(id2label),
    label2id=label2id,
    id2label=id2label
)

Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [124]:
max_dur = 30
def preprocess(examples):
  input_array = [example['array'] for example in examples['audio']]
  inputs = feature_extractor(input_array,
                             sampling_rate = feature_extractor.sampling_rate,
                             max_length = int(feature_extractor.sampling_rate * max_dur),
                             truncation = True,
                             return_attention_mask = True,
                             )
  return inputs

dataset_encoded = dataset.map(preprocess, batched=True, batch_size=100, remove_columns = ['audio', 'file'])

Map:   0%|          | 0/899 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [125]:
dataset_encoded = dataset_encoded.rename_column('genre', 'label')
dataset

DatasetDict({
    train: Dataset({
        features: ['file', 'audio', 'genre'],
        num_rows: 899
    })
    test: Dataset({
        features: ['file', 'audio', 'genre'],
        num_rows: 100
    })
})

In [135]:
from transformers import TrainingArguments

model_name = model_id.split("/")[-1]
batch_size = 16
gradient_accumulation_steps = 2
num_train_epochs = 20

training_args = TrainingArguments(
    f"{model_name}-finetuned-gtzan",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_train_epochs,
    warmup_ratio=0.1,
    logging_steps=1,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    # fp16=True,
)

In [136]:
import evaluate
import numpy as np

metric = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Computes accuracy on a batch of predictions"""
    predictions = np.argmax(eval_pred.predictions, axis=1)
    return metric.compute(predictions=predictions, references=eval_pred.label_ids)

Downloading builder script: 0.00B [00:00, ?B/s]

In [137]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset_encoded["train"],
    eval_dataset=dataset_encoded["test"],
    processing_class=feature_extractor,
    compute_metrics=compute_metrics,

)

In [138]:
trainer.train(resume_from_checkpoint = True)

Epoch,Training Loss,Validation Loss,Accuracy
11,0.2955,0.940698,0.8
12,0.2214,0.938289,0.8
13,0.5358,0.889182,0.81
14,0.3723,0.838623,0.82
15,0.3086,0.770437,0.86
16,0.2668,0.7496,0.81
17,0.2425,0.706865,0.85
18,0.2257,0.713767,0.85
19,0.1899,0.673343,0.86
20,0.1749,0.661766,0.86


TrainOutput(global_step=580, training_loss=0.2863046317521868, metrics={'train_runtime': 5071.7861, 'train_samples_per_second': 3.545, 'train_steps_per_second': 0.114, 'total_flos': 4.8971209106304e+18, 'train_loss': 0.2863046317521868, 'epoch': 20.0})

In [129]:
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    A token is already saved on your machine. Run `huggingface-cli whoami` to get more information or `huggingface-cli logout` if you want to log out.
    Setting a new token will erase the existing one.
    To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) n
Token is valid (permission: write

In [131]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [139]:
kwargs = {
    "dataset_tags": "marsyas/gtzan",
    "dataset": "GTZAN",
    "model_name": f"{model_name}-finetuned-gtzan-plus",
    "finetuned_from": model_id,
    "tasks": "audio-classification",
}

In [140]:
trainer.push_to_hub(**kwargs)

events.out.tfevents.1752494417.a86711721435.1509.13:   0%|          | 0.00/7.49k [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.37k [00:00<?, ?B/s]

events.out.tfevents.1752494471.a86711721435.1509.14:   0%|          | 0.00/71.9k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/378M [00:00<?, ?B/s]

Upload 4 LFS files:   0%|          | 0/4 [00:00<?, ?it/s]

CommitInfo(commit_url='https://huggingface.co/dung8204/wav2vec2-base-finetuned-gtzan/commit/1155f0ce78580631e85b8f15f7c461c444a10c4d', commit_message='End of training', commit_description='', oid='1155f0ce78580631e85b8f15f7c461c444a10c4d', pr_url=None, repo_url=RepoUrl('https://huggingface.co/dung8204/wav2vec2-base-finetuned-gtzan', endpoint='https://huggingface.co', repo_type='model', repo_id='dung8204/wav2vec2-base-finetuned-gtzan'), pr_revision=None, pr_num=None)