#### Load Birdset dataset config

In [2]:
from birdset.datamodule.beans_datamodule import BEANSDataModule
from birdset.datamodule.base_datamodule import DatasetConfig

datasetconfig = DatasetConfig(dataset_name='beans_watkins', hf_path='DBD-research-group/beans_watkins', hf_name='default',sample_rate=16000)

datamodule = BEANSDataModule(dataset=datasetconfig)
dataset = datamodule._load_data()
dataset['train']

Map:   0%|          | 0/1017 [00:00<?, ? examples/s]

Map:   0%|          | 0/339 [00:00<?, ? examples/s]

Map:   0%|          | 0/339 [00:00<?, ? examples/s]

Map:   0%|          | 0/203 [00:00<?, ? examples/s]

Dataset({
    features: ['audio', 'labels'],
    num_rows: 1017
})

#### Directly load from HF

In [2]:
from datasets import load_dataset

dataset_name = datasetconfig.hf_path
dataset = load_dataset(name='default', path='DBD-research-group/beans_watkins', cache_dir='/workspace/data_birdset')

#### Load Embedding Module Config

In [3]:
from birdset.datamodule.embedding_datamodule import EmbeddingModuleConfig
from birdset.modules.models.hubert import HubertSequenceClassifier
from birdset.configs.module_configs import NetworkConfig

embedding_module= NetworkConfig(model_type="waveform",sample_rate=16000,normalize_spectrogram= False,
normalize_waveform= False,model=HubertSequenceClassifier(checkpoint= "facebook/hubert-base-ls960",num_classes=31))

Some weights of HubertForSequenceClassification were not initialized from the model checkpoint at facebook/hubert-base-ls960 and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


#### Load embedding preprocessing

In [4]:
from birdset.datamodule.embedding_datamodule import EmbeddingDataModule
embedding_module.length = 15
embedding_module.sample_rate=16000

embedding_datamodule = EmbeddingDataModule(
    dataset=datasetconfig,
    embedding_model=embedding_module,
    average=False
)

#### Create embedding and pass to classifier
##### 1. Using embedding_datamodule

In [5]:
input_value = dataset['test'][0]
input_value

{'audio': {'path': '8300604C.wav',
  'array': array([ 0.0016283 ,  0.00375787,  0.00176721, ...,  0.01741578,
         -0.03841794, -0.01904567]),
  'sample_rate': 16000},
 'labels': 10}

In [28]:
    def _zero_pad(audio):
        desired_num_samples = 15 * 16000
        current_num_samples = audio.shape[0]
        padding = desired_num_samples - current_num_samples
        if padding > 0:
            #print('padding')
            audio = torch.nn.functional.pad(audio, (0, padding))
        return audio

In [47]:
import torch
tensor = torch.tensor(input_value['audio']['array'], dtype=torch.float32).to(embedding_datamodule.device)
padded = _zero_pad(tensor)
batched = padded.view(1, 1, -1)
embedding, _ = embedding_module.model.get_embeddings(batched)
embedding

tensor([[ 1.3961e-01, -6.5860e-02, -3.7661e-02,  2.6574e-01, -3.8397e-01,
         -2.7838e-01,  4.7122e-01, -8.9703e-01,  1.1543e-01,  2.6358e-02,
         -2.6965e-01, -5.8958e-02, -4.1052e-02,  2.6754e-01, -7.4314e-02,
         -3.6333e-01, -3.3269e-01,  2.1859e-01, -1.1106e-01, -3.0348e-01,
         -1.7248e-01, -7.9155e-01,  2.6017e-01, -1.0157e+00,  2.6277e-01,
         -8.5169e-02, -2.1181e-01, -4.9774e-02, -4.2393e-01,  1.1249e-01,
         -2.4212e-01, -2.3219e-01,  3.3783e-02,  2.9414e-01, -3.1536e-01,
          3.7916e-02, -2.7673e-01,  3.2152e-01, -2.5199e-02,  3.2559e-01,
          5.2889e-04,  3.2771e-02,  3.1916e-01,  3.0720e-01, -1.9211e-01,
         -2.4993e-01,  2.3534e-01, -1.0801e-01, -1.0886e-01,  2.6152e-02,
          6.4596e-02, -1.5888e-01,  3.9034e-01,  3.3439e-01,  3.6932e-02,
          2.9627e-01, -6.4848e-02,  7.1843e-02,  3.7392e-01, -2.6140e-01,
          2.9397e-01,  2.1605e-01, -8.3531e-02, -1.9607e-01,  1.8262e-01,
          1.0973e-01,  3.9191e-01,  4.

#### Left and right padding

In [71]:
embedding = embedding_datamodule._get_embedding(input_value['audio'])
embedding

tensor([[ 2.4852e-01, -1.9664e-01,  1.4376e-01,  2.8902e-01, -4.3323e-01,
          3.2479e-01, -1.4756e-01,  2.8321e+00, -1.3053e-01,  2.1132e-01,
         -2.4360e-01, -3.7690e-02, -1.1776e-01,  3.9365e-01, -4.2088e-02,
          6.0390e-02,  5.4581e-01,  3.6023e-01, -6.4353e-02, -1.5768e-01,
         -7.8516e-01,  1.3208e-01, -8.7731e-03, -3.3083e-01,  9.3489e-02,
          5.9739e-01,  1.1859e-01, -4.4748e-01, -5.9858e-01,  2.8812e-01,
          7.7592e-02,  1.6483e-01, -8.7997e-02,  2.1364e-01, -3.7372e-01,
         -1.1365e-02, -1.2343e-02,  4.0736e-01,  2.9565e-02, -5.4572e-02,
         -1.0127e-01, -1.6362e-01, -4.7430e-02,  6.8778e-01, -2.1331e-01,
          9.3467e-02,  1.8331e-01, -1.0750e-01,  5.8343e-02,  1.8091e-01,
          1.1552e-02, -1.1313e-01,  5.2452e-02,  3.0957e-01, -1.3356e-01,
          3.2952e-01,  1.9353e-01, -3.3665e-01, -1.3113e-01, -3.0818e-02,
          1.2783e-01, -5.5345e-02, -2.2731e-01, -1.8251e-01,  1.2260e-01,
         -1.8385e-01,  4.8510e-01,  4.

##### 2. Using birdset pipeline

In [6]:
from birdset.datamodule.components.transforms import BirdSetTransformsWrapper
print(input_value.keys())
birdset_transforms = BirdSetTransformsWrapper(task="multiclass",sample_rate=16000, model_type="waveform",max_length=15)
batched = birdset_transforms._transform({'audio':[input_value['audio']], 'labels':[21]})
batched

dict_keys(['audio', 'labels'])


2024-09-04 14:07:40.604198: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-09-04 14:07:40.604270: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-09-04 14:07:40.604328: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-09-04 14:07:40.615981: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


{'input_values': tensor([[[0.0016, 0.0038, 0.0018,  ..., 0.0000, 0.0000, 0.0000]]]),
 'labels': tensor([21])}

In [7]:
embedding, _ = embedding_module.model.get_embeddings(batched['input_values'].to(embedding_datamodule.device))
embedding

tensor([[ 1.3961e-01, -6.5860e-02, -3.7661e-02,  2.6574e-01, -3.8397e-01,
         -2.7838e-01,  4.7122e-01, -8.9703e-01,  1.1543e-01,  2.6358e-02,
         -2.6965e-01, -5.8958e-02, -4.1052e-02,  2.6754e-01, -7.4314e-02,
         -3.6333e-01, -3.3269e-01,  2.1859e-01, -1.1106e-01, -3.0348e-01,
         -1.7248e-01, -7.9155e-01,  2.6017e-01, -1.0157e+00,  2.6277e-01,
         -8.5169e-02, -2.1181e-01, -4.9774e-02, -4.2393e-01,  1.1249e-01,
         -2.4212e-01, -2.3219e-01,  3.3783e-02,  2.9414e-01, -3.1536e-01,
          3.7916e-02, -2.7673e-01,  3.2152e-01, -2.5199e-02,  3.2559e-01,
          5.2889e-04,  3.2771e-02,  3.1916e-01,  3.0720e-01, -1.9211e-01,
         -2.4993e-01,  2.3534e-01, -1.0801e-01, -1.0886e-01,  2.6152e-02,
          6.4596e-02, -1.5888e-01,  3.9034e-01,  3.3439e-01,  3.6932e-02,
          2.9627e-01, -6.4848e-02,  7.1843e-02,  3.7392e-01, -2.6140e-01,
          2.9397e-01,  2.1605e-01, -8.3531e-02, -1.9607e-01,  1.8262e-01,
          1.0973e-01,  3.9191e-01,  4.

#### Linear classifier

In [25]:
from birdset.modules.models.linear_classifier import LinearClassifier

classifier = LinearClassifier(num_classes=31, in_features=768, state_dict="/workspace/logs/train/runs/beans_watkins/hubert_embedding/2024-09-04_131719/callback_checkpoints/hubert_embedding_beans_watkins_20.ckpt").to(embedding_datamodule.device)
network_config = NetworkConfig(model_type="waveform",sample_rate=16000,normalize_spectrogram= False,normalize_waveform= False,model=classifier)

import torch.nn.functional as F

output = F.softmax(classifier.forward(embedding), dim=1)
output

LOADED
Current model parameter names: odict_keys(['classifier.weight', 'classifier.bias'])
Modified state_dict keys: dict_keys(['classifier.weight', 'classifier.bias'])


tensor([[1.7318e-01, 4.6854e-03, 1.8236e-07, 1.2700e-04, 1.3325e-05, 2.8416e-05,
         6.7008e-08, 2.0651e-02, 8.9751e-04, 3.5889e-05, 5.4932e-05, 1.4155e-03,
         7.5473e-05, 1.6502e-03, 1.8397e-04, 4.9491e-02, 5.5685e-03, 2.8738e-04,
         1.1806e-01, 5.5345e-03, 7.4431e-03, 7.3237e-03, 3.9006e-04, 6.2625e-05,
         5.5673e-04, 4.4220e-02, 5.4288e-01, 1.4718e-02, 4.4827e-04, 1.8899e-05,
         2.8796e-06]], device='cuda:0', grad_fn=<SoftmaxBackward0>)

In [11]:
print(classifier.classifier.weight)
print(classifier.classifier.bias)

Parameter containing:
tensor([[ 0.3802,  0.0566,  0.4469,  ...,  0.0466,  0.0816, -0.1308],
        [-0.3350,  0.8703,  0.2067,  ..., -0.3186, -0.2154, -0.0710],
        [-0.2090, -0.0579, -0.0451,  ..., -0.2674, -0.1373, -0.0911],
        ...,
        [-0.2430,  0.1554, -0.0946,  ...,  0.3617, -0.0025,  0.0669],
        [-0.0538, -0.0518,  0.0064,  ...,  0.0167, -0.4184, -0.1202],
        [ 0.1288, -0.1170,  0.0926,  ...,  0.2899, -0.1425,  0.0716]],
       device='cuda:0', requires_grad=True)
Parameter containing:
tensor([-0.0136,  0.0461,  0.0358, -0.0570, -0.0096, -0.0524,  0.0621, -0.1021,
         0.0254, -0.0277,  0.1026, -0.0173,  0.0418, -0.0340,  0.0022,  0.0451,
         0.0323,  0.0020,  0.0326,  0.0074, -0.0516, -0.0742,  0.0114, -0.0278,
         0.0366, -0.0166,  0.0263,  0.0472, -0.0412, -0.0450,  0.0006],
       device='cuda:0', requires_grad=True)


#### Finetune_module

In [16]:
from birdset.modules.finetune_module import FinetuneModule

ft_module = FinetuneModule(network=network_config, embedding_model=embedding_module)

ft_module.forward(input_values=batched['input_values'].to(embedding_datamodule.device))

tensor([[  2.3773,  -1.2326, -11.3865,  -4.8406,  -7.0951,  -6.3378, -12.3877,
           0.2507,  -2.8851,  -6.1043,  -5.6787,  -2.4296,  -5.3610,  -2.2761,
          -4.4700,   1.1248,  -1.0599,  -4.0240,   1.9941,  -1.0660,  -0.7697,
          -0.7859,  -3.7185,  -5.5476,  -3.3627,   1.0122,   3.5199,  -0.0879,
          -3.5794,  -6.7457,  -8.6271]], device='cuda:0',
       grad_fn=<SqueezeBackward1>)

In [24]:
batched['input_values'] = batched['input_values'].to(embedding_datamodule.device)
batched['labels'] = batched['labels'].to(embedding_datamodule.device)
ft_module.model_step(batched,0)

(tensor(4.9166, device='cuda:0', grad_fn=<NllLossBackward0>),
 tensor([[1.7318e-01, 4.6854e-03, 1.8236e-07, 1.2700e-04, 1.3325e-05, 2.8416e-05,
          6.7008e-08, 2.0651e-02, 8.9751e-04, 3.5889e-05, 5.4932e-05, 1.4155e-03,
          7.5473e-05, 1.6502e-03, 1.8397e-04, 4.9491e-02, 5.5685e-03, 2.8738e-04,
          1.1806e-01, 5.5345e-03, 7.4431e-03, 7.3237e-03, 3.9006e-04, 6.2625e-05,
          5.5673e-04, 4.4220e-02, 5.4288e-01, 1.4718e-02, 4.4827e-04, 1.8899e-05,
          2.8796e-06]], device='cuda:0', grad_fn=<SoftmaxBackward0>),
 tensor([21], device='cuda:0'))