# Tutorial for Multilabel Training

In this tutorial, we train a ResNet on the HSN data, which is a multilabel dataset, and evaluate the model on the whole test set. We then feed the model an audio sample taken from the test dataloader and determine its labels. Finally, we show how to visualize an audio sample, this time loading the sample directly from the test dataset.

# Step1) Import HSN Dataset

In [1]:
from birdset.datamodule.base_datamodule import DatasetConfig
from birdset.datamodule.birdset_datamodule import BirdSetDataModule
from birdset.datamodule.components.event_decoding import EventDecoding
from birdset.datamodule.components.transforms import PreprocessingConfig, BirdSetTransformsWrapper
from torchaudio.transforms import Spectrogram

# Build the pieces of the transform pipeline one at a time.
spectrogram_transform = Spectrogram(n_fft=1024, hop_length=320, power=2.0)
preprocessing_config = PreprocessingConfig(
    spectrogram_conversion=spectrogram_transform,
)
event_decoder = EventDecoding(sampling_rate=32000)

# Transform wrapper configured for a vision-type model on the multilabel task.
transforms = BirdSetTransformsWrapper(
    model_type='vision',
    preprocessing=preprocessing_config,
    decoding=event_decoder,
    task="multilabel",
)

# Dataset description for the HSN subset (21 classes, 32 kHz sampling rate).
dataset_config = DatasetConfig(
    data_dir='../../data_birdset/HSN',
    dataset_name='HSN',
    hf_path='DBD-research-group/BirdSet',
    hf_name='HSN',
    n_classes=21,
    n_workers=3,
    val_split=0.2,
    task="multilabel",
    classlimit=500,
    eventlimit=5,
    sampling_rate=32000,
)

# Instantiate the data module from the dataset description and the transforms.
dm = BirdSetDataModule(dataset=dataset_config, transforms=transforms)

   



Map (num_proc=3):   0%|          | 0/5460 [00:00<?, ? examples/s]

Map:   0%|          | 0/38170 [00:00<?, ? examples/s]

Processing labels: 100%|██████████| 21/21 [00:02<00:00, 10.25it/s]


Map (num_proc=3):   0%|          | 0/17940 [00:00<?, ? examples/s]

Map (num_proc=3):   0%|          | 0/12000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/14352 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3588 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/12000 [00:00<?, ? examples/s]

torch.Size([32, 1, 128, 1024])
torch.Size([32, 21])


{'input_values': tensor([[[[ 6.4481,  6.4481,  6.4481,  ..., 16.5752, 16.5752, 16.5752],
           [ 6.4477,  6.4481,  6.4481,  ..., 16.5752, 16.5752, 16.5752],
           [ 6.2510,  5.9508,  5.1205,  ..., 16.5752, 16.5752, 16.5752],
           ...,
           [ 6.4481,  6.4481,  6.4481,  ..., 16.5752, 16.5752, 16.5752],
           [ 6.4481,  6.4481,  6.4481,  ..., 16.5752, 16.5752, 16.5752],
           [ 6.4481,  6.4481,  6.4481,  ..., 16.5752, 16.5752, 16.5752]]],
 
 
         [[[ 6.4481,  6.4481,  6.4481,  ..., 16.5752, 16.5752, 16.5752],
           [ 6.4481,  6.4481,  6.4481,  ..., 16.5752, 16.5752, 16.5752],
           [ 6.4481,  6.4481,  6.4481,  ..., 16.5752, 16.5752, 16.5752],
           ...,
           [ 6.4481,  6.4481,  6.4481,  ..., 16.5752, 16.5752, 16.5752],
           [ 6.4481,  6.4481,  6.4481,  ..., 16.5752, 16.5752, 16.5752],
           [ 6.4481,  6.4481,  6.4481,  ..., 16.5752, 16.5752, 16.5752]]],
 
 
         [[[-0.7895,  1.2041,  2.3324,  ..., 16.5752, 16.5752, 1

# Step2) Prepare Data

In [None]:
# Prepare the data (download dataset, ...), then set up the "fit" stage.
dm.prepare_data()
dm.setup(stage="fit")

# Dataloaders used for training and for the later evaluation steps.
train_loader = dm.train_dataloader()
test_loader = dm.test_dataloader()

# Take the first training batch and report the shape of each tensor in it.
batch = next(iter(train_loader))
for key in ("input_values", "labels"):
    print(batch[key].shape)

# Last expression of the cell: the notebook displays the batch contents.
batch

# Step3) Prepare trainer

In [None]:
from lightning import Trainer 

# Lower and upper bound on the number of training epochs.
min_epochs = 1
max_epochs = 20

# ``devices=0`` asks for zero devices, which is not a valid value for the GPU
# accelerator. ``devices=[0]`` selects the single GPU with index 0 instead.
trainer = Trainer(
    min_epochs=min_epochs,
    max_epochs=max_epochs,
    accelerator="gpu",
    devices=[0],
)

# Step4) Prepare your model

In [5]:

from birdset.modules.models.resnet import ResNetClassifier
from birdset.modules.metrics.multilabel import MultilabelMetricsConfig
from torch.nn import BCEWithLogitsLoss

#module = ResNetClassifier("resnet50",21)

from birdset.modules.base_module import BaseModule,NetworkConfig
# Store the configuration under its own name. Assigning the instance back to
# ``NetworkConfig`` would shadow the imported class, so running this cell a
# second time would try to call an instance and fail.
network_config = NetworkConfig(
    model=ResNetClassifier(baseline_architecture="resnet50", num_classes=21),
    model_name="resnet50",
    model_type="vision",
    torch_compile=False,
    sample_rate=32000,
    normalize_waveform=False,
    normalize_spectrogram=True,
)

# Wrap the network together with the loss and metrics for the multilabel task.
# Training-set size, task and batch size are taken from the data module so
# that they stay consistent with the data configuration.
model = BaseModule(
    network=network_config,
    loss=BCEWithLogitsLoss(),
    metrics=MultilabelMetricsConfig(),
    len_trainset=dm.len_trainset,
    task=dm.task,
    batch_size=dm.train_batch_size,
    num_epochs=max_epochs,
)



In [6]:
# Bare expression as the last line of the cell: the notebook displays the
# module's repr, which lists the loss and the nested layers.
model

BaseModule(
  (loss): BCEWithLogitsLoss()
  (model): ResNetClassifier(
    (model): ResNet(
      (conv1): Conv2d(1, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
      (layer1): Sequential(
        (0): Bottleneck(
          (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
          (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats

# Step5) Training your model on the HSN data

In this tutorial, we train the model for only 20 epochs to show how it works and to keep the pipeline easy to follow. For reasonable results, the model should be trained for more epochs.

In [7]:
# Run training: the trainer fits `model` using the dataloaders provided by
# the data module `dm`.
trainer.fit(model, dm)

You are using a CUDA device ('NVIDIA GeForce RTX 3090') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3]

  | Name                  | Type              | Params
------------------------------------------------------------
0 | loss                  | BCEWithLogitsLoss | 0     
1 | model                 | ResNetClassifier  | 23.5 M
2 | train_metric          | cmAP              | 0     
3 | valid_metric          | cmAP              | 0     
4 | test_metric           | cmAP              | 0     
5 | valid_metric_best     | MaxMetric         | 0     
6 | valid_add_metrics     | MetricCollection  | 0     
7 | test_add_metrics      | MetricCollection  | 0     
8 | test_complete_metrics | Met

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=5` reached.


# Step6) Mapping of labels to eBird codes

In [None]:
import json
from typing import Dict

def get_label_to_category_mapping_from_metadata(
    file_path: str, task: str
) -> Dict[int, str]:
    """
    Read a dataset-info JSON file and map integer labels to eBird codes.

    The list of eBird codes is read from a task-dependent location in the JSON:

    - "multiclass": 'features' -> 'labels' -> 'names'
    - "multilabel": 'features' -> 'labels' -> 'feature' -> 'names'

    The index of a code in that list is its label.

    Args:
    - file_path (str): The path to the JSON file containing the label to eBird code mapping.
    - task (str): The type of task for which to get the mapping. Expected values are "multiclass" or "multilabel".

    Returns:
    - Dict[int, str]: A dictionary where each key is a label (integer) and the corresponding value is the eBird code.

    Raises:
    - FileNotFoundError: If the file at `file_path` does not exist.
    - json.JSONDecodeError: If the file is not a valid JSON.
    - KeyError: If the keys expected for the given task are not found in the JSON structure.
    - NotImplementedError: If `task` is neither "multiclass" nor "multilabel".
    """

    # Read the file as UTF-8 explicitly so that decoding does not depend on
    # the platform's default text encoding.
    with open(file_path, "r", encoding="utf-8") as file:
        dataset_info = json.load(file)

    # The key lookups stay inside the branches so that an unsupported task is
    # reported as NotImplementedError even when the JSON layout is different.
    if task == "multiclass":
        ebird_codes_list = dataset_info["features"]["labels"]["names"]
    elif task == "multilabel":
        ebird_codes_list = dataset_info["features"]["labels"]["feature"]["names"]
    else:
        raise NotImplementedError(
            f"Only the multiclass and multilabel tasks are implemented, not task {task}."
        )

    # Position in the list is the label; the value at that position is the code.
    return dict(enumerate(ebird_codes_list))

# Location of the metadata file written for the processed training split.
# NOTE(review): the directory name contains a hash-like suffix that looks
# specific to one preprocessing run; adjust it to your local data directory.
dataset_info_path = '../../data_birdset/HSN/HSN_processed_42_467ad9795903cdde/train/dataset_info.json'

# Build the label -> eBird code lookup for the multilabel task and show it.
mapping = get_label_to_category_mapping_from_metadata(
    file_path=dataset_info_path,
    task='multilabel',
)
print(mapping)

# Step7) Evaluate your model with your test data

In [None]:
# Evaluate the model on the test dataloader, using the "best" checkpoint.
trainer.test(
    model=model,
    dataloaders=test_loader,
    ckpt_path="best",
)

# Step8) Finding the eBird code of one sample

In [None]:
import torch
# Switch the trained model to evaluation mode.
model.eval()

# Fetch the first batch from the test dataloader.
test_loader = dm.test_dataloader()
batch2 = next(iter(test_loader))

# Take the first sample of the batch and add a leading dimension of size 1,
# so that a single sample is passed to the model in batched form.
audiox = batch2["input_values"][0].unsqueeze(0)
label = batch2["labels"][0]
print("Original audio shape:", audiox.shape)
print("Label shape:", label.shape)

# Forward pass without gradient tracking.
with torch.no_grad():
    output = model(audiox)
print(output.shape,output)


# Step9) Prediction of sample's labels

In [None]:
# Turn the model output from the previous step into predicted label indices.
import torch

# The model output is treated as one logit per class.
logits_tensor = output

# Apply sigmoid to convert logits to independent per-class probabilities.
probabilities = torch.sigmoid(logits_tensor)
print(probabilities)

# Every class whose probability exceeds this threshold counts as predicted.
threshold = 0.1

# Each row of `predicted_labels` is a (sample index, class index) pair.
# No `.squeeze()` here: with exactly one hit it would collapse the result to
# a 1-D tensor, and picking the class column below would then fail.
predicted_labels = (probabilities > threshold).nonzero()

# Print the predicted labels
print("Predicted Labels:", predicted_labels.tolist())

# Column 1 holds the class index of every hit; this also works for zero hits.
label_indices = predicted_labels[:, 1].tolist()

print("label_indices",label_indices)

# Step10) Convert prediction output to class label names

In [None]:
# Translate the predicted class indices into eBird codes.
predicted_class_names = [mapping[idx] for idx in label_indices]

# `mapping` is keyed by integer class index, so the ground truth has to be
# turned into indices first. Iterating over `label` directly would yield
# tensor elements, which are not valid keys for the lookup.
# NOTE(review): assumes `label` is a multi-hot vector with one entry per
# class (non-zero = class present) - confirm against the dataloader output.
real_label_indices = label.nonzero().flatten().tolist()
real_label = [mapping[idx] for idx in real_label_indices]
print("Real Class Names",real_label)

# Print the predicted class names
print("Predicted Class Names:", predicted_class_names)

# Visualization of Audio

Load dataset

In [None]:
from datasets import load_dataset
# Load the "test" split of the "HSN" configuration of the BirdSet dataset.
hsn_test = load_dataset(
    "DBD-research-group/BirdSet",
    "HSN",
    split="test",
)

Load a sample

In [None]:
import librosa
import torchaudio
# Index of the test example to inspect and the sampling rate to work with.
sample = 199
sr = 32000

# Look the record up once instead of re-indexing the dataset for every field.
record = hsn_test[sample]
start_time = record['start_time']
duration = record['end_time'] - start_time

# get sample audio: librosa resamples to `sr` and cuts the event by time (seconds)
sample_audio = librosa.load(
    record['filepath'],
    sr=sr,
    offset=start_time,
    duration=duration,
)

# torchaudio cuts by frame index and expects integers, so the second-based
# values are converted to whole frame counts.
# NOTE(review): the frame arithmetic assumes the file is stored at `sr` Hz;
# torchaudio.load does not resample - confirm the file's native rate.
sample_tensor = torchaudio.load(
    record['filepath'],
    normalize=True,
    frame_offset=int(round(start_time * sr)),
    num_frames=int(round(duration * sr)),
)

Extract the waveform of the bird sound

In [None]:
# `librosa.load` returned a (waveform, sampling rate) pair; keep only the
# waveform. Because the name is rebound, run this cell only once after
# loading: a second run would index into the waveform itself.
sample_audio = sample_audio[0]

Listen to it

In [None]:
from IPython.display import Audio
# Play the audio: as the last expression of the cell, the Audio object is
# displayed by the notebook, built from the waveform and the rate `sr`.
Audio(data=sample_audio, rate=sr)

Plot the waveform

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Time stamp (in seconds) of every sample: sample index divided by the rate.
time = np.arange(len(sample_audio)) / sr

# Draw the waveform as amplitude over time on a wide figure.
fig, ax = plt.subplots(figsize=(14, 5))
ax.plot(time, sample_audio)
ax.set_xlabel('Time (s)')
ax.set_ylabel('Amplitude')
ax.set_title('Waveform')
plt.show()

Plot the spectrogram

In [None]:
from torchaudio.transforms import Spectrogram
import librosa 

# Compute a spectrogram of the waveform part of the torchaudio result
# (element 0 of the pair returned by `torchaudio.load`).
spectrogram_conversion = Spectrogram(n_fft=1024)
spectrogram = spectrogram_conversion(sample_tensor[0])

# Convert to decibels relative to the maximum value of the spectrogram.
spectrogram_db = librosa.power_to_db(spectrogram.squeeze().numpy(), ref=np.max)

# Axis ranges of the image: time runs over the clip length in seconds,
# frequency from 0 up to half the sampling rate.
clip_seconds = len(sample_audio)/sr
image_extent = [0, clip_seconds, 0, sr/2]

# Plot the spectrogram
fig, ax = plt.subplots(figsize=(10, 5))
image = ax.imshow(spectrogram_db, aspect='auto', origin='lower', extent=image_extent)
fig.colorbar(image, ax=ax, format='%+2.0f dB')
ax.set_title('Spectrogram')
ax.set_xlabel('Time (s)')
ax.set_ylabel('Frequency (Hz)')
fig.tight_layout()
plt.show()