In [2]:
# basic libraries
import numpy as np
from random import sample
import json 


# To access the model on hugging face
from transformers import Wav2Vec2ForSequenceClassification, Wav2Vec2FeatureExtractor, TrainingArguments, Trainer

# To play audio interactively
from IPython.display import Audio 

# Import performance metrics
import evaluate

# To load the dataset
from utils import load_and_merge_batches

Load the model from the Hugging Face Hub repository

In [4]:
# Repository id of the pretrained checkpoint to evaluate
model_name = "dima806/bird_sounds_classification"

# The classifier and its matching feature extractor are both loaded from that repository
model = Wav2Vec2ForSequenceClassification.from_pretrained(model_name)
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(model_name)

Load and merge batches into one dataset

In [9]:
# Folder (relative to this notebook) that contains the preprocessed batch_* sub-folders
batches_folder = "../processed_data"
# Helper from the local `utils` module: loads every batch and merges them into one dataset
dataset = load_and_merge_batches(batches_folder)

Loaded: ../processed_data/batch_0
Loaded: ../processed_data/batch_1
Loaded: ../processed_data/batch_2
Loaded: ../processed_data/batch_3
Loaded: ../processed_data/batch_4
Loaded: ../processed_data/batch_5
Loaded: ../processed_data/batch_6
Loaded: ../processed_data/batch_7
Loaded: ../processed_data/batch_8
Loaded: ../processed_data/batch_9
Loaded: ../processed_data/batch_10
Loaded: ../processed_data/batch_11
Loaded: ../processed_data/batch_12
Loaded: ../processed_data/batch_13
Loaded: ../processed_data/batch_14
Merged 15 batches into a single dataset.


Load dictionary of the labels

In [10]:
# Read the label <-> id tables stored next to the processed batches.
# JSON is UTF-8 by specification, so the encoding is pinned explicitly instead of
# relying on the platform default, which could mis-decode non-ASCII species names.
with open('../processed_data/label_mappings.json', 'r', encoding='utf-8') as file:
    label_mappings = json.load(file)

# JSON object keys are always strings, so `id2label` must be indexed with str(label_id)
label2id = label_mappings["label2id"]
id2label = label_mappings["id2label"]
del label_mappings  # the wrapper dict is not needed once both tables are extracted

Display random examples from the dataset

In [11]:
# Number of random clips to preview; clamped below so that a dataset with fewer
# rows than requested does not make sample() raise ValueError
n_of_examples = 3
idxs = sample(range(len(dataset)), min(n_of_examples, len(dataset)))

# Sampling rate (Hz) used for playback here and for feature extraction later on
Hz_rate = 16000  # assumes the processed audio is stored at 16 kHz — TODO confirm

for i in idxs:
    # Look the row up once instead of once per field
    example = dataset[i]
    waveform = example["audio"]
    label = example["label"]

    # id2label comes from JSON, so its keys are strings
    print(f"label: {id2label[str(label)]}")
    display(Audio(data=waveform, rate=Hz_rate))

label: Orange-footed Scrubfowl


label: Small-billed Tinamou


label: Little Tinamou


Since we do not intend to train the model, we are going to use only the test split for evaluation.

In [12]:
# Fraction of the data held out for evaluation
test_split = 0.1

# Fixed seed so the stratified shuffle selects the same test examples on every run;
# without it each run evaluates a different subset and the accuracies are not comparable
split_seed = 42

dataset = dataset.train_test_split(
    test_size=test_split,
    shuffle=True,
    stratify_by_column="label",  # keep the class proportions identical in both splits
    seed=split_seed,
)

Process the dataset

In [13]:
def preprocess_function(batch):
    """Turn the raw waveform of one example into model-ready features.

    The function is applied with ``batched=False``, so despite its name ``batch``
    is a single dataset example rather than a batch of examples.

    Args:
        batch: One dataset example; ``batch['audio']`` holds the waveform that is
            passed to the global ``feature_extractor`` at ``Hz_rate`` Hz.

    Returns:
        dict: Every field produced by the feature extractor (at least
        ``'input_values'``) with the leading batch-of-one axis removed.
    """
    # Extract audio features from the single clip using the feature_extractor
    features = feature_extractor(batch['audio'], sampling_rate=Hz_rate)

    # The extractor output is batched (size 1 here). Strip that axis from every
    # field, not only 'input_values', so that any additional field it returns
    # (e.g. an attention mask) stays aligned with the input values.
    return {name: values[0] for name, values in features.items()}

# Featurise the test split one example at a time and drop the raw waveform column
dataset['test'] = dataset['test'].map(
    preprocess_function,
    batched=False,
    remove_columns="audio",
)

Map: 100%|██████████| 2450/2450 [01:22<00:00, 29.84 examples/s]


Define accuracy metrics

In [14]:
# Accuracy metric provided by the `evaluate` library
accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Compute top-1 accuracy from the model outputs gathered during evaluation.

    Args:
        eval_pred: Object exposing ``predictions`` (per-class scores, one row per
            example) and ``label_ids`` (the reference class ids).

    Returns:
        dict: ``{"accuracy": <fraction of correctly classified examples>}``.
    """
    logits = eval_pred.predictions

    # Softmax is strictly increasing, so the argmax of the raw logits is the same
    # class as the argmax of the probabilities. Skipping the explicit
    # exp()/sum() avoids overflow to inf (and inf/inf = NaN) for large logits,
    # which would silently corrupt the argmax.
    predicted_ids = np.argmax(logits, axis=1)

    acc_score = accuracy.compute(
        predictions=predicted_ids,
        references=eval_pred.label_ids,
    )['accuracy']
    return {"accuracy": acc_score}

In [18]:
# Define TrainingArguments for evaluation
training_args = TrainingArguments(
    # Scratch directory for Trainer artefacts. It must not be `model_name`: the
    # Trainer may create this directory, and a local folder named like the Hub
    # repository id would then be picked up by `from_pretrained(model_name)` on
    # the next run instead of the remote checkpoint.
    output_dir="eval_output",
    per_device_eval_batch_size=16,  # Evaluation batch size
    # No evaluation strategy is configured: it only schedules evaluation inside
    # `trainer.train()`, which is never called here (evaluation is triggered
    # explicitly through `trainer.evaluate()`).
    save_strategy='no',  # No saving during evaluation
    load_best_model_at_end=False,  # No training, so no best model to load
    logging_steps=1,  # Log every step
    report_to="none",  # No need to report to mlflow during evaluation
)

# Create Trainer object (without training setup)
trainer = Trainer(
    model=model,  # The pretrained model to evaluate
    args=training_args,  # TrainingArguments (used for evaluation configuration)
    eval_dataset=dataset['test'],  # Use the test dataset for evaluation
    # The feature extractor is passed in the tokenizer slot; presumably the Trainer
    # uses it to pad variable-length clips when building batches — TODO confirm.
    # NOTE(review): newer transformers releases rename this argument to
    # `processing_class` — check against the installed version.
    tokenizer=feature_extractor,
    compute_metrics=compute_metrics,  # Metric function applied to the predictions
)

In [19]:
# Run the evaluation pass over the test split; the returned dict holds the loss,
# the accuracy from compute_metrics and runtime statistics, all prefixed with "eval_"
eval_results = trainer.evaluate()

# Print the evaluation results
print(eval_results)

100%|██████████| 154/154 [21:15<00:00,  8.28s/it]

{'eval_loss': 0.531235933303833, 'eval_accuracy': 0.8922448979591837, 'eval_runtime': 1284.9757, 'eval_samples_per_second': 1.907, 'eval_steps_per_second': 0.12}





Log of the accuracies

Evaluation n.1: \
{'eval_loss': 1.0484373569488525, 'eval_model_preparation_time': 0.0044, 'eval_accuracy': 0.7383673469387755, 'eval_runtime': 1150.2315, 'eval_samples_per_second': 2.13, 'eval_steps_per_second': 0.134}

Evaluation n.2: \
{'eval_loss': 0.531235933303833, 'eval_accuracy': 0.8922448979591837, 'eval_runtime': 1284.9757, 'eval_samples_per_second': 1.907, 'eval_steps_per_second': 0.12}

