In [None]:
import os
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

import librosa
import soundfile as sf

from tqdm.notebook import tqdm
from glob import glob

import torch
from torch.utils.data import Dataset, DataLoader
from torch import nn
from torchaudio.compliance import kaldi
import torch.nn.functional as F
import json, requests, timm
from collections import defaultdict

In [None]:
class BirdAudioMAEDataset(Dataset):
    """Dataset that turns each audio file listed in a metadata frame into a
    fixed-size, normalised log-mel filterbank tensor.

    Every item is a float tensor of shape ``(TARGET_FRAMES, NUM_MEL_BINS)``
    (1024 x 128): shorter recordings are zero-padded at the end, longer ones
    are cropped to a random contiguous window.

    Args:
        df_audio_meta: DataFrame with one row per audio file; the
            ``file_name`` column is read to locate the file.
        dataset_dir: Directory that ``file_name`` is relative to.
        sampling_rate: Rate (Hz) the audio is resampled to on load and that
            the filterbank is computed for.
    """

    # Fixed number of spectrogram frames per item (pad / crop target).
    TARGET_FRAMES = 1024
    # Number of mel bins requested from the filterbank.
    NUM_MEL_BINS = 128

    def __init__(self, df_audio_meta, dataset_dir, sampling_rate=16_000):
        self.df_audio_meta = df_audio_meta
        self.dataset_dir = dataset_dir
        self.sampling_rate = sampling_rate
        # Constants used to normalise the spectrogram in `preprocess`.
        # NOTE(review): presumably the statistics the pretrained checkpoint
        # expects — confirm against the model card.
        self.global_mean = -4.2677393
        self.global_std = 4.5689974

    def __len__(self):
        """Number of audio files (rows of the metadata frame)."""
        return len(self.df_audio_meta)

    def _fit_to_target_frames(self, melspec: torch.Tensor) -> torch.Tensor:
        """Pad (with zeros, at the end) or randomly crop along the frame axis
        so that exactly ``TARGET_FRAMES`` frames remain."""
        target = self.TARGET_FRAMES
        num_frames = melspec.shape[0]
        if num_frames < target:
            return F.pad(melspec, (0, 0, 0, target - num_frames))
        if num_frames == target:
            return melspec
        # `randint`'s upper bound is exclusive, so `+ 1` makes the last valid
        # start (num_frames - target) reachable. The previous bound
        # (num_frames - 1025) raised ValueError for 1024/1025 frames.
        start = np.random.randint(0, num_frames - target + 1)
        return melspec[start:start + target]

    def preprocess(self, x: torch.Tensor):
        """Convert a mono waveform into the normalised spectrogram.

        Args:
            x: 1-D waveform tensor sampled at ``self.sampling_rate``.

        Returns:
            Tensor of shape ``(TARGET_FRAMES, NUM_MEL_BINS)``.
        """
        # Remove the DC offset before computing the filterbank.
        x = x - x.mean()
        melspec = kaldi.fbank(
            x.unsqueeze(0),
            htk_compat=True,
            window_type="hanning",
            num_mel_bins=self.NUM_MEL_BINS,
            # Keep the filterbank consistent with the rate the audio was
            # actually loaded at (fbank otherwise assumes 16 kHz).
            sample_frequency=float(self.sampling_rate),
        )
        melspec = self._fit_to_target_frames(melspec)
        return (melspec - self.global_mean) / (self.global_std * 2)

    def __getitem__(self, idx):
        """Load the ``idx``-th audio file (by position) and return its
        spectrogram."""
        row = self.df_audio_meta.iloc[idx]
        audio_path = f"{self.dataset_dir}/{row['file_name']}"
        # librosa resamples to `self.sampling_rate` while loading.
        audio_arr, _ = librosa.load(audio_path, sr=self.sampling_rate)
        return self.preprocess(torch.tensor(audio_arr))

In [None]:
# Hugging Face Hub id of the timm checkpoint to load.
TAG = "gaunernst/vit_base_patch16_1024_128.audiomae_as2m_ft_as20k"
MODEL = timm.create_model(f"hf_hub:{TAG}", pretrained=True)

# JSON mapping used to turn predicted class indices into readable labels.
LABEL_URL = "https://huggingface.co/datasets/huggingface/label-files/raw/main/audioset-id2label.json"
# Fail fast on a stalled connection or an HTTP error instead of hanging or
# trying to parse an error page as JSON.
_label_response = requests.get(LABEL_URL, timeout=30)
_label_response.raise_for_status()
# Label order follows the order of the entries in the downloaded JSON.
AUDIOSET_LABELS = list(json.loads(_label_response.content).values())

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(DEVICE)

In [None]:
# Set to True to plot the first spectrogram of the first batch as a sanity check.
SHOW_EXAMPLE_SPECTROGRAM = False

dataset = BirdAudioMAEDataset(df_audios, f"{DRIVE_FOLDER}/Audio_GreaterManaus")
# shuffle stays off: the collected predictions are later paired with the rows
# of df_audios by their order.
dataloader = DataLoader(dataset, batch_size=32, num_workers=2, shuffle=False)

# Per-batch top-10 probabilities and class indices, concatenated afterwards.
probs_collect, classes_collect = [], []

MODEL.to(DEVICE)
MODEL.eval()

# ~ 3.5 hours
for i, data in tqdm(enumerate(dataloader), total=len(dataloader)):

    if SHOW_EXAMPLE_SPECTROGRAM and i == 0:
        print("Example Spectrogram:")
        plt.figure(figsize=(9, 2))
        plt.imshow(data[0].numpy().T, aspect='auto', origin='lower', cmap='cool')
        plt.colorbar()
        plt.show()

    data = data.to(DEVICE)
    with torch.no_grad():
        # Insert a channel axis: (batch, frames, mels) -> (batch, 1, frames, mels).
        logits = MODEL(data.unsqueeze(1))
        # Independent per-class sigmoid scores; keep the 10 highest per clip.
        topk_probs, topk_classes = logits.sigmoid().topk(10)

    probs_collect.append(topk_probs.cpu().numpy())
    classes_collect.append(topk_classes.cpu().numpy())

    if (i + 1) % 100 == 0:
        # Report the number of completed batches (i is zero-based).
        print(f"Finished {i + 1} batches")

In [None]:
# aggregate all results
all_probs = np.concatenate(probs_collect)
all_classes = np.concatenate(classes_collect)

# One class/probability column pair per retained prediction.
top_k = all_classes.shape[1]
class_cols = [f'class_{i+1}' for i in range(top_k)]
proba_cols = [f'probability_{i+1}' for i in range(top_k)]

# Translate class indices into label strings.
id_to_label = dict(enumerate(AUDIOSET_LABELS))
df_classes = pd.DataFrame(all_classes, columns=class_cols)
for _col in class_cols:
    df_classes[_col] = df_classes[_col].map(id_to_label)

# Probabilities are stored as percentages (float64, as before).
df_probas = pd.DataFrame(all_probs.astype(np.float64) * 100, columns=proba_cols)

df_results = pd.concat([df_classes, df_probas], axis=1)

# Predictions were produced in positional order (the dataset reads rows with
# .iloc), so attach them to df_audios by position rather than by index label.
# An index-label join silently misaligns rows whenever df_audios does not
# carry a default 0..N-1 index.
n_labelled = len(df_results)
if n_labelled > len(df_audios):
    raise ValueError(
        f"Got {n_labelled} predictions for only {len(df_audios)} audio rows"
    )

# Rows beyond n_labelled (e.g. after an interrupted run) have no prediction
# and are left out.
df_audios_labels = df_audios.iloc[:n_labelled].copy()
for _col in df_results.columns:
    df_audios_labels[_col] = df_results[_col].to_numpy()

# NOTE(review): dropna() also removes rows that have NaN in df_audios' own
# columns, not only rows lacking predictions — confirm that is intended.
df_audios_labels = df_audios_labels.dropna()

df_audios_labels.to_csv(f"{DRIVE_FOLDER}/Audio_GreaterManaus/Audio_Segment_Labels.csv",index=False)

df_audios_labels