# Lab 11 Audio Classification

In [None]:
import torch
import librosa
import numpy as np
from datasets import load_dataset, Audio,DatasetDict
from transformers import HubertForSequenceClassification, Wav2Vec2FeatureExtractor, TrainingArguments, Trainer
from sklearn.metrics import accuracy_score

In [None]:
from huggingface_hub import notebook_login

# Authenticate this notebook session with the Hugging Face Hub.
notebook_login()

# Load dataset 

In [None]:
# Load the "Wellyowo/esc50" dataset; later cells read its "train" and "test" splits.
dataset = load_dataset("Wellyowo/esc50")

In [None]:
# Show the dataset layout, then the fields and sampling rate of the first
# training clip's audio entry.
print(dataset)
first_audio = dataset['train'][0]["audio"]
print(first_audio.keys())
print(first_audio["sampling_rate"])

# Prepare dataset for training

In [None]:
# Feature extractor that matches the HuBERT base checkpoint
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("facebook/hubert-base-ls960")

# Cast the audio column to the sampling rate the feature extractor expects
sampling_rate = feature_extractor.sampling_rate
dataset = dataset.cast_column("audio", Audio(sampling_rate=sampling_rate))

# Give every distinct category name an integer ID, following the order in
# which np.unique returns the names
categories = dataset['train']['category']
unique_categories = np.unique(categories)
category_to_id = dict(zip(unique_categories, range(len(unique_categories))))
num_categories = len(category_to_id)

# Function to preprocess the audio data
def prepare_dataset(example):
    """Add model inputs, clip duration and an integer label to one example.

    Reads ``example["audio"]`` (its ``"array"`` and ``"sampling_rate"``) and
    ``example["category"]``, then stores three new keys on the same mapping:

    * ``input_values`` - the feature extractor output for this clip
    * ``input_length`` - number of samples divided by the sampling rate
    * ``labels`` - the ID of the clip's category in ``category_to_id``

    Returns the updated ``example``.
    """
    audio = example["audio"]
    waveform = audio["array"]
    rate = audio["sampling_rate"]

    features = feature_extractor(
        waveform,
        sampling_rate=rate,
        return_tensors="pt",
        padding=True,
    )

    # The extractor returns a batch of size one; keep only its single entry.
    example["input_values"] = features.input_values[0]
    example["input_length"] = len(waveform) / rate
    example["labels"] = category_to_id[example["category"]]
    return example

# Run prepare_dataset over every example and drop the metadata columns that
# are not needed afterwards
unused_columns = ["filename", "target", "esc10", "take", "src_file"]
dataset = dataset.map(prepare_dataset, remove_columns=unused_columns)



In [None]:
# Hub repository name for the fine-tuned model and the pretrained checkpoint
# it starts from.
model_name = "Wellyowo/hubert-esc50-finetuned-v2"
model_id = "facebook/hubert-base-ls960"

# Store the class names in the model config so a saved checkpoint can map
# predicted IDs back to category names without this notebook's in-memory
# category_to_id. Keys/values are converted to plain str/int because
# category_to_id is keyed by the NumPy strings that np.unique produced.
label2id = {str(category): int(idx) for category, idx in category_to_id.items()}
id2label = {idx: category for category, idx in label2id.items()}

model = HubertForSequenceClassification.from_pretrained(
    model_id,
    num_labels=num_categories,
    label2id=label2id,
    id2label=id2label,
)


training_args = TrainingArguments(
    # Optimisation settings
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    num_train_epochs=10,
    # Evaluate and checkpoint once per epoch
    evaluation_strategy="epoch",
    save_strategy="epoch",
    # Output and logging locations
    output_dir=f"{model_name}-results",
    logging_dir='./logs',
    logging_steps=10,
    # Hub upload is disabled; the repository name is still recorded
    push_to_hub=False,
    hub_model_id=model_name,
)

def compute_metrics(eval_pred):
    """Return the accuracy of an evaluation pass as ``{"accuracy": value}``.

    ``eval_pred.predictions`` holds per-class scores; the highest-scoring
    class along axis 1 is taken as the prediction and compared against
    ``eval_pred.label_ids``.
    """
    scores = eval_pred.predictions
    predicted_ids = np.argmax(scores, axis=1)
    accuracy = accuracy_score(eval_pred.label_ids, predicted_ids)
    return {"accuracy": accuracy}


# Wire the model, data splits, feature extractor and metric into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=feature_extractor,
    compute_metrics=compute_metrics,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
)

# Fine-tune the model.
trainer.train()

# Metadata intended for the push_to_hub call that is commented out below.
kwargs = {
    "finetuned_from": model_id,
    "tasks": "audio-classification",
    "dataset": "ESC-50",
    "tags": ["audio-classification", "hubert", "esc50"],
}

# trainer.push_to_hub(**kwargs)

# Test your model 

In [1]:
# Notebook shell escape: install IPython into the current environment.
!pip3 install ipython
# NOTE: this import rebinds the name `Audio`, replacing the `datasets.Audio`
# imported at the top of the notebook. Re-running the `cast_column` cell after
# this one will pick up the IPython class instead of the datasets feature.
from IPython.display import Audio, display

In [None]:
# Load the fine-tuned model
# These assignments rebind model_name, model and feature_extractor, replacing
# the objects created in the training cells.
# NOTE(review): this checkpoint name has no "-v2" suffix, unlike the
# model_name used in the training cell - confirm which checkpoint is intended.
model_name = "Wellyowo/hubert-esc50-finetuned"
model = HubertForSequenceClassification.from_pretrained(model_name)
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(model_name)

In [None]:
def play_audio(input):
    """Render an inline audio player for one dataset example.

    Args:
        input: A mapping with an ``"audio"`` entry that provides the waveform
            under ``"array"`` and its rate under ``"sampling_rate"``. (The
            parameter name shadows the ``input`` builtin; it is kept so
            existing callers are unaffected.)
    """
    # Import under an alias inside the function: the notebook binds the global
    # name `Audio` to both `datasets.Audio` and `IPython.display.Audio` in
    # different cells, so which one the global refers to depends on the order
    # the cells were run in.
    from IPython.display import Audio as IPythonAudio, display

    audio = input["audio"]
    display(IPythonAudio(audio["array"], rate=audio["sampling_rate"]))

In [None]:
from transformers import Wav2Vec2Processor


def predict_and_audio(inputs):
    """Classify one dataset example and print its true and predicted category.

    Args:
        inputs: A mapping with an ``"audio"`` entry (providing ``"array"`` and
            ``"sampling_rate"``) and a ``"category"`` entry holding the
            ground-truth label name.

    Uses the module-level ``feature_extractor``, ``model`` and
    ``category_to_id``. Prints two lines and returns ``None``.
    """
    audio = inputs["audio"]

    # Pass the clip's actual sampling rate instead of a hard-coded 16000 so
    # the feature extractor can reject audio recorded at a rate the model was
    # not trained on, rather than silently mis-reading it.
    features = feature_extractor(
        audio["array"],
        sampling_rate=audio["sampling_rate"],
        return_tensors="pt",
    )

    with torch.no_grad():
        # Keep the input on the same device as the model weights.
        input_values = features.input_values.to(model.device)
        logits = model(input_values).logits

    predicted_id = torch.argmax(logits, dim=-1).item()

    # Invert the name -> ID mapping explicitly instead of relying on the
    # dictionary's key order lining up with the IDs.
    id_to_category = {idx: category for category, idx in category_to_id.items()}
    predicted_label = id_to_category[predicted_id]

    print(f"Ground Truth: {inputs['category']}")
    print(f"Predicted Label: {predicted_label}")


In [None]:
# Pick one example from the test split by index, play it, and compare the
# model's prediction with its ground-truth category.
test_id = 55
inputs = dataset["test"][test_id]
play_audio(inputs)
predict_and_audio(inputs)
