In [1]:
from birdset.datamodule import DatasetConfig
from birdset.datamodule.birdset_datamodule import BirdSetDataModule
from birdset.datamodule.components.transforms import BirdSetTransformsWrapper

transforms = BirdSetTransformsWrapper(
    task='multiclass',
    sampling_rate = 16000,
    model_type = 'waveform',
)

dm = BirdSetDataModule(
    dataset= DatasetConfig(
        data_dir='B:\DLL\Datasets',
        dataset_name='XCM',
        hf_path='DBD-research-group/BirdSet',
        hf_name='XCM',
        n_workers=8,
        val_split=0.2,
        task="multiclass",
        classlimit=500,
        eventlimit=5,
        sampling_rate=16000,
    ),
    transforms=transforms
)

dm.prepare_data()

Saving the dataset (0/1 shards):   0%|          | 0/187952 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/46988 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/16003 [00:00<?, ? examples/s]

In [2]:
dm.setup(stage="fit")
train_loader = dm.train_dataloader()
batch = next(iter(train_loader))
# get shape of the batch
print(batch["input_values"].shape)
print(batch["labels"].shape)

torch.Size([32, 1, 80000])
torch.Size([32])


In [3]:
print(batch["labels"])
print(dm.num_classes)

tensor([  9,   1, 316, 367,  52, 116, 407,  30,   7,  22, 369, 325, 327,  68,
        110, 295, 325, 123, 324,  60, 234, 388,  45, 306, 170, 338, 131, 313,
        170,  79, 266, 312])
411


In [4]:
from transformers import Wav2Vec2Model
import torch
basemodel = Wav2Vec2Model.from_pretrained("facebook/hubert-base-ls960", torch_dtype=torch.float16, attn_implementation=None).to("cuda:1")

You are using a model of type hubert to instantiate a model of type wav2vec2. This is not supported for all configurations of models and can yield errors.
Some weights of Wav2Vec2Model were not initialized from the model checkpoint at facebook/hubert-base-ls960 and are newly initialized: ['encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
print(basemodel)

Wav2Vec2Model(
  (feature_extractor): Wav2Vec2FeatureEncoder(
    (conv_layers): ModuleList(
      (0): Wav2Vec2GroupNormConvLayer(
        (conv): Conv1d(1, 512, kernel_size=(10,), stride=(5,), bias=False)
        (activation): GELUActivation()
        (layer_norm): GroupNorm(512, 512, eps=1e-05, affine=True)
      )
      (1-4): 4 x Wav2Vec2NoLayerNormConvLayer(
        (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,), bias=False)
        (activation): GELUActivation()
      )
      (5-6): 2 x Wav2Vec2NoLayerNormConvLayer(
        (conv): Conv1d(512, 512, kernel_size=(2,), stride=(2,), bias=False)
        (activation): GELUActivation()
      )
    )
  )
  (feature_projection): Wav2Vec2FeatureProjection(
    (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
    (projection): Linear(in_features=512, out_features=768, bias=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): Wav2Vec2Encoder(
    (pos_conv_embed): Wav2Vec2PositionalConvEmbedding(
  

In [10]:
class Modelwrapper(torch.nn.Module):

    def __init__(self, model, num_classes, *args, **kwargs) -> None:
        super().__init__(*args, **kwargs)

        self.model = model
        self.lastConv = torch.nn.Conv1d(249, 1, kernel_size=1, dtype=torch.float16)
        self.linear = torch.nn.Linear(512, num_classes, dtype=torch.float16)

    def forward(self, x):
        input = x.squeeze().half()
        output = self.model(input, return_dict=True)
        output = output["extract_features"]
        print("out:", output.shape)
        return torch.nn.functional.sigmoid(self.linear(self.lastConv(output).squeeze()))
    
        

In [7]:
dm.setup(stage="fit")
train_loader = dm.train_dataloader()

In [11]:
loss_func = torch.nn.CrossEntropyLoss()

num_classes = dm.num_classes

model = Modelwrapper(basemodel, num_classes).to("cuda:1")

optimizer = torch.optim.Adam(model.parameters())

epochs = 10

In [12]:
for epoch in range(epochs):
    losses = []
    for i, batch in enumerate(train_loader):
        print("batch: ", batch["input_values"].shape)
        pred = model(batch["input_values"].to("cuda:1"))

        loss = loss_func(pred, batch["labels"].to("cuda:1"))
        losses.append(loss)

        model.zero_grad()
        loss.backward()

        optimizer.step()
    print(f"epoch {i}: trainloss {torch.mean(loss)}")

batch:  torch.Size([32, 1, 80000])
out: torch.Size([32, 249, 512])
loss 6.015625
