# Implementing linear probing pipeline from Ghani
Trying to 100% simulate ghani setup

### 1. Load dataset

In [1]:
from birdset.datamodule.beans_datamodule import BEANSDataModule
from birdset.datamodule.base_datamodule import DatasetConfig

datasetconfig = DatasetConfig(
    dataset_name="beans_watkins",
    hf_path="DBD-research-group/beans_watkins",
    hf_name="default",
)

datamodule = BEANSDataModule(dataset=datasetconfig)
dataset = datamodule._load_data()
dataset["train"]

Map:   0%|          | 0/1017 [00:00<?, ? examples/s]

Map:   0%|          | 0/339 [00:00<?, ? examples/s]

Map:   0%|          | 0/339 [00:00<?, ? examples/s]

Map:   0%|          | 0/203 [00:00<?, ? examples/s]

Dataset({
    features: ['audio', 'labels'],
    num_rows: 1017
})

In [13]:
sample = dataset["train"][1]
print(len(sample["audio"]["array"]) / sample["audio"]["sample_rate"])
for num in sample["audio"]["array"]:
    print(num)

2.66153125
0.0028489339165389538
0.0127276461571455
0.000695886614266783
0.014284023083746433
0.010188997723162174
-0.005624439567327499
0.009112488478422165
0.011030803434550762
0.009036405012011528
0.007366364821791649
0.0073919473215937614
0.0012130429968237877
0.003765843575820327
0.008039159700274467
0.016119513660669327
0.003297208808362484
0.004568168893456459
0.0016249289037659764
0.0014217061689123511
0.0050490545108914375
0.005029771011322737
0.002944555366411805
0.000396410352550447
0.0054098754189908504
0.0024886690080165863
0.0029709392692893744
-0.0007071869913488626
-0.0035097121726721525
-0.011627869680523872
0.003580298274755478
0.004472359549254179
0.0007571040187031031
0.005485096946358681
0.0031540486961603165
0.0016573468456044793
0.013845509849488735
0.009325021877884865
0.005498502403497696
0.0025653131306171417
0.0014286709483712912
0.007022093050181866
-0.0006269124569371343
0.00037875829730182886
-6.072143150959164e-05
0.005916498601436615
0.000449520535767078

In [12]:
from IPython.display import Audio, display
import soundfile as sf
import numpy as np
import scipy.io.wavfile as wav

# Extract the audio array and sampling rate

audio_array = sample["audio"]["array"]  # Adjust according to your dataset's structure
sample_rate = sample["audio"]["sample_rate"]  # This key might vary

# Save the audio as a WAV file
output_path = "output.wav"
wav.write(output_path, sample_rate, np.array(audio_array))
print(f"Audio saved as {output_path}")


def play_audio(file_path):
    data, samplerate = sf.read(file_path)
    display(Audio(data, rate=samplerate))


play_audio("output.wav")

Audio saved as output.wav


In [5]:
lengths = []

for sample in dataset["train"]:
    duration = len(sample["audio"]["array"]) / sample["audio"]["sample_rate"]
    lengths.append(duration)
    if duration > 200:
        print(duration)


average_length = sum(lengths) / len(lengths)
min_length = min(lengths)
max_length = max(lengths)
total_samples = len(lengths)

overview = {
    "Average Length": average_length,
    "Minimum Length": min_length,
    "Maximum Length": max_length,
    "Total Samples": total_samples,
}

overview

20.75946875
57.36165625
52.731375
52.72690625
45.3
156.13425
122.91478125
27.74765625
152.9986875
27.43290625
71.8735625
114.0443125
85.15821875
23.7699375
29.5926875
60.03075
46.62928125
24.5069375
24.5159375
32.76040625
30.02509375
30.94390625
21.86775
32.76040625
35.625
35.625
53.78884375
86.615
35.155
46.8
24.0
27.3216875
41.04834375
38.71
44.775
43.49334375
42.7966875
101.91334375
29.47334375
59.34834375
69.21334375
66.98334375
65.0666875
83.93334375
24.0
31.40334375
64.13334375
52.06834375
24.0
44.2016875
24.0
81.49
69.635
51.1266875
72.31
75.92
187.3453125
1260.0
55.24853125
142.5
45.3
41.5669375
65.5905
69.9813125
20.56134375
42.44940625
44.3465
29.4063125
1162.0983125
28.627125
67.5120625
58.55040625
1260.0
20.03225
89.1894375
53.03815625
52.1364375


{'Average Length': 9.833395126597836,
 'Minimum Length': 0.065625,
 'Maximum Length': 1260.0,
 'Total Samples': 1017}

This can be used to check the class distribution

In [2]:
from collections import Counter

print(dataset["train"][0])
label_counts = dict(Counter(dataset["train"]["labels"]))
print(label_counts)

{'audio': {'path': 'Mac-3-A-3.wav', 'array': array([ 2.13882100e-04,  4.85118391e-04, -2.17375666e-04, ...,
        9.30107664e-04,  7.66232726e-04,  6.42658269e-05]), 'sample_rate': 32000}, 'labels': 9}
{9: 51, 5: 13, 2: 43, 8: 20, 7: 44, 0: 91, 1: 31, 4: 25, 6: 27, 3: 70}


(Only use next two cells if intended) This part is for removing specific classes from watkins (for this the conversion from class name to int in beans_datamodule has to be commented out)

In [2]:
from collections import Counter

#! Here we remove all labels that have less than x examples
x = 32
label_counts = dict(Counter(dataset["train"]["labels"]))

filtered_labels = {label: count for label, count in label_counts.items() if count < x}
print(filtered_labels)
# Remove additional labels
labels_to_remove = ["Fin,_Finback_Whale", "Northern_Right_Whale"]

for label in labels_to_remove:
    filtered_labels[label] = label_counts[label]

# Create a new dataset excluding the filtered labels
dataset = dataset.filter(lambda example: example["labels"] not in filtered_labels)

print(dataset)

{'Bottlenose_Dolphin': 15, 'Beluga,_White_Whale': 30, 'Bearded_Seal': 22, 'Minke_Whale': 10, 'Southern_Right_Whale': 15, 'Narwhal': 30, 'Harp_Seal': 28, 'Fin,_Finback_Whale': 30, 'Ross_Seal': 30, 'Rough-Toothed_Dolphin': 30, 'Killer_Whale': 21, 'Leopard_Seal': 6, 'Walrus': 23, 'Common_Dolphin': 31}


Filter:   0%|          | 0/1017 [00:00<?, ? examples/s]

Filter:   0%|          | 0/339 [00:00<?, ? examples/s]

Filter:   0%|          | 0/339 [00:00<?, ? examples/s]

Filter:   0%|          | 0/203 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['audio', 'labels'],
        num_rows: 664
    })
    valid: Dataset({
        features: ['audio', 'labels'],
        num_rows: 221
    })
    test: Dataset({
        features: ['audio', 'labels'],
        num_rows: 221
    })
    train_low: Dataset({
        features: ['audio', 'labels'],
        num_rows: 133
    })
})


In [3]:
# Convert labels back to ids
labels = set()
for split in dataset.keys():
    labels.update(dataset[split]["labels"])

label_to_id = {lbl: i for i, lbl in enumerate(labels)}


def label_to_id_fn(batch):
    for i in range(len(batch["labels"])):
        batch["labels"][i] = label_to_id[batch["labels"][i]]
    return batch


dataset = dataset.map(
    label_to_id_fn, batched=True, batch_size=500, load_from_cache_file=True
)
print(len(labels))

Map:   0%|          | 0/664 [00:00<?, ? examples/s]

Map:   0%|          | 0/221 [00:00<?, ? examples/s]

Map:   0%|          | 0/221 [00:00<?, ? examples/s]

Map:   0%|          | 0/133 [00:00<?, ? examples/s]

16


### 2. Load model and set parameters

Perch:

In [2]:
from birdset.modules.models.perch import PerchModel
import torch.nn as nn

num_classes = 31  #! Don't forget to change this
sample_rate = 32_000  # Try 48_000 here
window_length = 5
input_size = 1280

perch_network = PerchModel(num_classes=num_classes, tfhub_version=4, gpu_to_use=0)

2024-07-04 21:25:31.202734: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-04 21:25:31.202805: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-04 21:25:31.202836: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-07-04 21:25:31.213627: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-07-04 21:25:33.585600: I tensorflow/core/comm

BirdNET:

In [20]:
from birdset.modules.models.birdnet import BirdNetModel
import torch.nn as nn

num_classes = 31  #! Don't forget to change this
sample_rate = 48_000
window_length = 3
input_size = 1024

perch_network = BirdNetModel(
    num_classes=num_classes,
    model_path="../../checkpoints/birdnet/BirdNET_GLOBAL_6K_V2.4_Model",
    train_classifier=False,
)



### 3. Batch and Preprocess the dataset 

#### 3.1. k-sample the dataset

In [6]:
print("Number of samples in the training set:", len(dataset["train"]))
print("Number of samples in the validation set:", len(dataset["valid"]))
print("Number of samples in the testing set:", len(dataset["test"]))

Number of samples in the training set: 664
Number of samples in the validation set: 221
Number of samples in the testing set: 221


In [6]:
import random
from collections import defaultdict
from datasets import concatenate_datasets, DatasetDict, Dataset

# Define the number of samples per class
samples_per_class = 30

# Merge the train, valid, and test splits
merged_data = concatenate_datasets(
    [dataset["train"], dataset["valid"], dataset["test"]]
)
merged_data = merged_data.shuffle()

# Create a dictionary to store the selected samples per class
selected_samples = defaultdict(list)
rest_samples = []
# Iterate over the merged data and select the desired number of samples per class
for sample in merged_data:
    label = sample["labels"]
    if len(selected_samples[label]) < samples_per_class:
        selected_samples[label].append(sample)
    else:
        rest_samples.append(sample)

# Flatten the selected samples into a single list
selected_samples = [
    sample for samples in selected_samples.values() for sample in samples
]

# Split the selected samples into training, validation, and testing sets
test_ratio = 0.5

num_samples = len(rest_samples)
num_test_samples = int(test_ratio * num_samples)

train_data = selected_samples
test_data = rest_samples[:num_test_samples]
val_data = rest_samples[num_test_samples:]

train_data = Dataset.from_dict(
    {key: [sample[key] for sample in train_data] for key in train_data[0]}
)
test_data = Dataset.from_dict(
    {key: [sample[key] for sample in test_data] for key in test_data[0]}
)
val_data = Dataset.from_dict(
    {key: [sample[key] for sample in val_data] for key in val_data[0]}
)

# Print the number of samples in each split
print("Number of samples in the training set:", len(train_data))
print("Number of samples in the validation set:", len(val_data))
print("Number of samples in the testing set:", len(test_data))

# Combine into a DatasetDict
datasett = DatasetDict({"train": train_data, "valid": val_data, "test": test_data})

Number of samples in the training set: 480
Number of samples in the validation set: 313
Number of samples in the testing set: 313


Preprocess the dataset 

In [3]:
import torch
import torchaudio


# Resample function (#! Move resampler out)
# Get embeddings
def get_embedding(audio):
    # Get waveform and sampling rate
    waveform = torch.tensor(audio["array"], dtype=torch.float32)
    dataset_sample_rate = audio["sample_rate"]
    # Resample audio
    audio = resample_audio(waveform, dataset_sample_rate, sample_rate)
    # print('Audio length:', audio.shape[0]/sample_rate)
    # Zero-padding
    audio = zero_pad(waveform)

    # Check if audio is too long
    if waveform.shape[0] > window_length * sample_rate:
        return frame_and_average(waveform)
    else:
        return perch_network.get_embeddings(audio)[
            0
        ]  # To just use embeddings not logits


# Resample function
def resample_audio(audio, orig_sr, target_sr):
    resampler = torchaudio.transforms.Resample(orig_freq=orig_sr, new_freq=target_sr)
    return resampler(audio)


# Zero-padding function
def zero_pad(audio):
    desired_num_samples = window_length * sample_rate
    current_num_samples = audio.shape[0]
    padding = desired_num_samples - current_num_samples
    if padding > 0:
        # print('padding')
        pad_left = padding // 2
        pad_right = padding - pad_left
        audio = torch.nn.functional.pad(audio, (pad_left, pad_right))
    return audio


# Average multiple embeddings function
def frame_and_average(audio):
    # Ensure the waveform is mono
    # if audio.size(0) > 1:
    # print("What")
    # audio = audio.mean(dim=0, keepdim=True)

    # Frame the audio
    frame_size = window_length * sample_rate
    hop_size = window_length * sample_rate
    frames = audio.unfold(0, frame_size, hop_size)

    # Generate embeddings for each frame
    l = []
    for frame in frames:
        embedding = perch_network.get_embeddings(frame)
        l.append(embedding[0])  # To just use embeddings not logits

    embeddings = torch.stack(tuple(l))

    # Average the embeddings
    averaged_embedding = embeddings.mean(dim=0)

    return averaged_embedding

In [4]:
from torch.utils.data import DataLoader


def preprocess(item):
    audio = item["audio"]
    return get_embedding(audio)


def collate_fn(batch):
    batch_new = {}
    audios = [preprocess(item) for item in batch]
    batch_new["audio"] = torch.stack(tuple(audios), dim=0)

    # batch_new['labels'] = torch.stack([torch.nn.functional.one_hot(torch.tensor(item['labels'],  dtype=torch.long), num_classes=num_classes) for item in batch]).float() #* For one hot-encoding
    batch_new["labels"] = torch.tensor([item["labels"] for item in batch])
    return batch_new


train_loader = DataLoader(
    dataset["train"], batch_size=32, shuffle=True, collate_fn=collate_fn
)
test_loader = DataLoader(
    dataset["test"], batch_size=32, shuffle=False, collate_fn=collate_fn
)
val_loader = DataLoader(
    dataset["valid"], batch_size=32, shuffle=False, collate_fn=collate_fn
)

# Example of iterating through the DataLoader
for batch in train_loader:
    print(batch.keys())
    print(batch["audio"])
    print(batch["labels"])
    print(batch["audio"].shape)
    break

2024-07-04 21:25:47.880041: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x55f08fc14c30 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2024-07-04 21:25:47.880117: I tensorflow/compiler/xla/service/service.cc:176]   StreamExecutor device (0): NVIDIA GeForce RTX 4090, Compute Capability 8.9
2024-07-04 21:25:48.279236: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2024-07-04 21:25:48.681183: W tensorflow/compiler/tf2xla/kernels/assert_op.cc:38] Ignoring Assert operator jax2tf_infer_fn_/assert_equal_1/Assert/AssertGuard/Assert
2024-07-04 21:25:48.847162: I tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:442] Loaded cuDNN version 8700
2024-07-04 21:25:53.225722: I ./tensorflow/compiler/jit/device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


dict_keys(['audio', 'labels'])
tensor([[[ 0.0803, -0.0358, -0.0098,  ...,  0.0393,  0.0519,  0.1247]],

        [[ 0.1728, -0.0180,  0.0191,  ..., -0.0373,  0.0470,  0.0299]],

        [[-0.0328, -0.0520,  0.0628,  ...,  0.0046,  0.2843,  0.0919]],

        ...,

        [[-0.0006, -0.1183,  0.0225,  ..., -0.0019,  0.2713,  0.2067]],

        [[-0.0306, -0.0635, -0.0100,  ...,  0.0887,  0.1078,  0.0331]],

        [[ 0.0104, -0.0285,  0.0575,  ..., -0.0295,  0.0759, -0.0218]]])
tensor([ 4,  2, 18, 24, 11, 30, 29, 26,  3, 17, 26, 10, 18,  4, 29,  7, 21,  6,
         6, 18, 21, 29,  4,  5, 14, 18,  3, 26, 26, 14, 19, 26])
torch.Size([32, 1, 1280])


### 4. Train the classifier

In [5]:
gpu_id = 0  # Change this to the ID of the GPU you want to use
device = torch.device(f"cuda:{gpu_id}" if torch.cuda.is_available() else "cpu")
# device = 'cpu'
print(f"Using device: {device}")  #! Not working right now

Using device: cuda:0


In [6]:
from tqdm import tqdm

import torch.nn as nn
import torch.optim as optim


# Define your classifier model
class Classifier(nn.Module):
    def __init__(self, input_size, num_classes):
        super(Classifier, self).__init__()
        self.fc = nn.Linear(input_size, num_classes)

    # self.softmax = torch.softmax(dim=1) #* self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = x.squeeze(1)
        x = torch.softmax(self.fc(x), dim=1)
        return x


# Create an instance of your classifier model
classifier = Classifier(input_size, num_classes).to(device)

# Define the loss function and optimizer
criterion = nn.CrossEntropyLoss()  # * nn.BCELoss()
optimizer = optim.AdamW(classifier.parameters(), lr=1e-2, weight_decay=0.01)

# Set the number of training epochs
num_epochs = 25

early_stopping_patience = 5
best_loss = float("inf")
patience_counter = 0

# Training loop
for epoch in range(num_epochs):
    classifier.train()
    train_loss = 0.0

    for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}"):
        inputs = batch["audio"].to(device)
        labels = batch["labels"].to(device)

        optimizer.zero_grad()
        outputs = classifier(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        train_loss += loss.item()

    # Calculate average loss for this epoch
    train_loss /= len(train_loader)

    # Validate the model (assuming you have a validation loader)
    classifier.eval()
    val_loss = 0.0
    with torch.no_grad():
        for batch in val_loader:
            inputs = batch["audio"].to(device)
            labels = batch["labels"].to(device)
            outputs = classifier(inputs)
            loss = criterion(outputs, labels)
            val_loss += loss.item()

    val_loss /= len(val_loader)

    print(
        f"Epoch {epoch+1}/{num_epochs}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}"
    )

    # Early stopping
    if val_loss < best_loss:
        best_loss = val_loss
        patience_counter = 0
    else:
        patience_counter += 1
        if patience_counter >= early_stopping_patience:
            print("Early stopping triggered")
            break

Epoch 1/25:   0%|          | 0/32 [00:00<?, ?it/s]

Epoch 1/25: 100%|██████████| 32/32 [00:33<00:00,  1.06s/it]


Epoch 1/25, Train Loss: 3.2697, Val Loss: 3.0770


Epoch 2/25: 100%|██████████| 32/32 [00:33<00:00,  1.04s/it]


Epoch 2/25, Train Loss: 2.9600, Val Loss: 2.9153


Epoch 3/25: 100%|██████████| 32/32 [00:35<00:00,  1.11s/it]


Epoch 3/25, Train Loss: 2.7891, Val Loss: 2.8264


Epoch 4/25: 100%|██████████| 32/32 [00:33<00:00,  1.04s/it]


Epoch 4/25, Train Loss: 2.7018, Val Loss: 2.7838


Epoch 5/25: 100%|██████████| 32/32 [00:33<00:00,  1.04s/it]


Epoch 5/25, Train Loss: 2.6629, Val Loss: 2.7608


Epoch 6/25: 100%|██████████| 32/32 [00:33<00:00,  1.04s/it]


Epoch 6/25, Train Loss: 2.6391, Val Loss: 2.7473


Epoch 7/25: 100%|██████████| 32/32 [00:34<00:00,  1.07s/it]


Epoch 7/25, Train Loss: 2.6267, Val Loss: 2.7424


Epoch 8/25: 100%|██████████| 32/32 [00:34<00:00,  1.07s/it]


Epoch 8/25, Train Loss: 2.6173, Val Loss: 2.7378


Epoch 9/25: 100%|██████████| 32/32 [00:35<00:00,  1.12s/it]


Epoch 9/25, Train Loss: 2.6120, Val Loss: 2.7286


Epoch 10/25:  50%|█████     | 16/32 [00:19<00:19,  1.25s/it]


KeyboardInterrupt: 

In [7]:
from sklearn.metrics import accuracy_score, roc_auc_score
import torchmetrics

# Set the model to evaluation mode
perch_network.eval()


# Initialize the metrics
metrics = torchmetrics.MetricCollection(
    {
        "T1Accuracy": torchmetrics.Accuracy(
            task="multiclass", num_classes=num_classes, top_k=1
        ),
        "T3Accuracy": torchmetrics.Accuracy(
            task="multiclass", num_classes=num_classes, top_k=3
        ),
        "AUROC": torchmetrics.AUROC(
            task="multiclass", num_classes=num_classes, average="macro"
        ),
        "F1": torchmetrics.F1Score(task="multiclass", num_classes=num_classes),
    }
).to(device)

# Iterate over the test_loader
for batch in test_loader:
    # Forward pass
    inputs = batch["audio"].to(device)
    labels = batch["labels"].to(device)
    # labels = torch.argmax(labels, dim=1) #* For one hot-encoding
    with torch.no_grad():
        outputs = classifier(inputs)
        outputs = outputs.squeeze(1)

    # Update the metrics
    metrics(outputs, labels)

# Compute and print the metric values
metric_values = metrics.compute()
for metric_name, metric_value in metric_values.items():
    print(f"{metric_name}: {metric_value}")

AUROC: 0.9706842303276062
F1: 0.8112094402313232
T1Accuracy: 0.8112094402313232
T3Accuracy: 0.9115044474601746


### <u>Ghani datasets</u>

|Dataset|Classes|Available?|
|-------|-------|----------|
|Godwit Calls|5|No part of a master thesis|
|Yellowhammer Dialects|2|Probably not (Only two classes anyway)|
|Bats|5|Yes but pitch shifting and two sources of which one is private|
|Watkins|32|Yes but removed some classes|
|RFCX Frog & Bird|12+13|Yes but for detection and not split in BEANS|

### <u>Results with Perch</u>
<span style="color:crimson"><b>These results where created with a Learning Rate (LR) of 1e-5 which was way too small which is why the values where so bad! With a LR of 1e-2 the Results are like Ghani's!</b></span>
<br>These are the results in this isolated run whereas we compare them to the Birdset Pipeline setup. We used 25 Epochs.
| Dataset         | Classes|AUROC (BirdsetPipeline results) | T1 (B.P.) | Audio lengths |Samples per class|
|--------------------|---|---------------------|-----------------|-----|----|
| beans_watkins      |31|**89** (85)                   |<span style="color:crimson"><b>81%</b></span> **32%** (23%)|Different lengths 1-45s|~30|
| beans_bats         |10|**79**  (78)                  |38% (**39%**)|0-5s|600|
| beans_cbi          |264|   (96)                 |(51%)|4-10s (Mostly 10)|~50-70|
| beans_dogs         |10|**78**    (75)                |**28%** (31%)|2-30s|13-70|
| beans_humbugdb     |14|**69** (66)|**46%** (12%)|1-55s|~70 or ~400|

### <u>Results with BirdNET</u>
<span style="color:crimson"><b>Same problem here!</b></span><br>
Used 25 Epochs
| Dataset         |AUROC | T1 |
|------------------|---|---|
| beans_watkins      |84|23%|
| beans_bats         |||
| beans_cbi          |||
| beans_dogs         |||
| beans_humbugdb     |||