## Training Notebook for Sound Event Detection

Sound event detection (SED) is the task of detecting the type as well as the onset and offset times of sound events in audio streams.

In this notebook i will show how to train Sound Event Detection (SED) model with only weak annotation.

<img src="image.png">

We need to detect sound events from audio clips, and provide prediction of what sound event exists from when to when.

In [3]:
import random
from glob import glob
from collections import OrderedDict
import os.path as osp
import os

from pytorch_lightning.loggers import TensorBoardLogger
from pytorch_lightning.callbacks.early_stopping import EarlyStopping
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning import LightningModule
from pytorch_lightning import Trainer

import audiomentations as AA
import albumentations as A
from functools import partial, lru_cache

import colorednoise as cn
from transformers import get_linear_schedule_with_warmup
from tqdm.auto import tqdm

import numpy as np
import pandas as pd
import librosa, librosa.display
import torchaudio
from torchlibrosa.stft import Spectrogram, LogmelFilterBank
from torchlibrosa.augmentation import SpecAugmentation

import torch
import torchvision
from torch import nn
import torch.nn.functional as F
from torch.utils.data import WeightedRandomSampler
from torch.utils.data import Dataset, DataLoader
from audiomentations.core.transforms_interface import BaseTransform, BaseWaveformTransform

from IPython.display import Audio
import warnings

import timm.models as models 
from torch.distributions.beta import Beta
warnings.filterwarnings('ignore')



In [4]:
def seed_everything(seed=42):
    print(f'setting everything to seed {seed}')
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.cuda.empty_cache()
seed_everything()

setting everything to seed 42


## Model functions

First we need to add some helper function for the SED model, including intialising weights of new layers we will be adding to the model

In [5]:
def init_layer(layer): 
    """Init layers using xavier uniform weight distribution"""
    nn.init.xavier_uniform_(layer.weight)

    if hasattr(layer, "bias"):
        if layer.bias is not None:
            layer.bias.data.fill_(0.)

def init_bn(bn):
    """Init the BatchNorm Layer so bias is 0 and weights is 1"""
    bn.bias.data.fill_(0.)
    bn.weight.data.fill_(1.0)
    
def interpolate(x: torch.Tensor, ratio: int):
    """Interpolate data in time domain. This is used to compensate the
    resolution reduction in downsampling of a CNN.
    Args:
      x: (batch_size, time_steps, classes_num)
      ratio: int, ratio to interpolate
    Returns:
      upsampled: (batch_size, time_steps * ratio, classes_num)
    """
    (batch_size, time_steps, classes_num) = x.shape
    upsampled = x[:, :, None, :].repeat(1, 1, ratio, 1)
    upsampled = upsampled.reshape(batch_size, time_steps * ratio, classes_num)
    return upsampled 
def pad_framewise_output(framewise_output: torch.Tensor, frames_num: int):
    """Pad framewise_output to the same length as input frames. The pad value
    is the same as the value of the last frame.
    Args:
      framewise_output: (batch_size, frames_num, classes_num)
      frames_num: int, number of frames to pad
    Outputs:
      output: (batch_size, frames_num, classes_num)
    """
    pad = framewise_output[:, -1:, :].repeat(1, frames_num - framewise_output.shape[1], 1)
    """tensor for padding"""

    output = torch.cat((framewise_output, pad), dim=1)
    """(batch_size, frames_num, classes_num)"""

    return output

#This is the head of the network, which can help us for either classification or tagging audio from an audio signal. 
# This block also helps to 
class AttBlock(nn.Module):
    def __init__(self,in_features: int, out_features: int, activation="linear",):
        super().__init__()
        #activation is an activation function like ReLU or Tahn
        self.activation = activation
        self.att = nn.Conv1d( #this is a 1D convolutional layer
            in_channels=in_features,
            out_channels=out_features,
            kernel_size=1,
            stride=1,
            padding=0,
            bias=True)
        self.cla = nn.Conv1d( #this is a 1D convolutional layer
            in_channels=in_features,
            out_channels=out_features,
            kernel_size=1,
            stride=1,
            padding=0,
            bias=True)

        self.bn_att = nn.BatchNorm1d(out_features) #we create a batch norm layer
        self.init_weights() #intialise weights
        
    #we intialise the weights of the attention block
    def init_weights(self):
        init_layer(self.att)
        init_layer(self.cla)
        init_bn(self.bn_att)

    def forward(self, x):
        # x: (n_samples, n_in, n_time)
        norm_att = torch.softmax(torch.tanh(self.att(x)), dim=-1) #we pass the incoming features through a activation function
                 #then a tahn and a softmax to get probabilites of classes within the features. 
        cla = self.nonlinear_transform(self.cla(x)) #calculate the Class attention and pass through an non linear function
        x = torch.sum(norm_att * cla, dim=2) #and sum the norm_att and cla via the 3rd dim. 
        return x, norm_att, cla

    def nonlinear_transform(self, x):
        if self.activation == 'linear':
            return x
        elif self.activation == 'sigmoid':
            return torch.sigmoid(x)

## MixUp

This is a powerful augmentation technique, that can improve metrics scores and generalisation for models. It is a simple technique, that essentailly combines two images using a weight chosen at random, and we combine the labels later on. More information can be found here: https://arxiv.org/abs/1710.09412

In [6]:
def do_mixup(x: torch.Tensor, mixup_lambda: torch.Tensor, permutations: torch.Tensor):
    """Mixup x of even indexes (0, 2, 4, ...) with x of odd indexes
    (1, 3, 5, ...).
    Args:
      x: (batch_size * 2, ...)
      mixup_lambda: (batch_size * 2,)
    Returns:
      out: (batch_size, ...)
      n
      
    """

    d = x.device
    x1 = x[permutations].to(d)
    lam = torch.unsqueeze(torch.unsqueeze(mixup_lambda, 1),3).to(d)
    out = torch.lerp(x1, x, lam)
    return out


class Mixup(object):
    def __init__(self, mixup_alpha, random_seed=1234):
        """Mixup coefficient generator.
        """
        self.distrib = Beta(torch.tensor(mixup_alpha), torch.tensor(mixup_alpha))

    def get_lambda_and_perm(self, batch_size):
        """Get mixup random coefficients.
        Args:
          batch_size: int
        Returns:
          mixup_lambdas: (batch_size,)
        """
        indexes = torch.randperm(batch_size)
        lam  = self.distrib.sample((batch_size,)).unsqueeze(-1)

        return lam, indexes

## SED Model

To Train a model for SED, we will be using a model based on https://arxiv.org/abs/1912.10211
Where the model will use a differenet encoder, but the same attention head. 

The model takes in raw audio signals, and converts these to log melspectrograms using torchlibrosa. This is then converted to a 3 channel input to a pretrained model, and passed through the CNN. 

Although the input is downsized many times, the size of the third dimension of the tensor are unchanged, and contains the time information, which we can use for predicting which bird call is within the audio signal, and where.We are using weakly supervised labeling for training. 

<img src="imgae2.png">

More information can be found here: https://github.com/qiuqiangkong/audioset_tagging_cnn

In [7]:
class AudioSEDModel(nn.Module):
    def __init__(self, encoder, sample_rate, window_size, hop_size, mel_bins, fmin, fmax, classes_num):
        super().__init__()

        window = 'hann' #params for the spectrogram extractor
        center = True
        pad_mode = 'reflect'
        ref = 1.0
        amin = 1e-10
        top_db = None
        self.interpolate_ratio = 30  # Downsampled ratio

        # Spectrogram extractor performs this on the GPU
        self.spectrogram_extractor = Spectrogram(n_fft=window_size, hop_length=hop_size, 
            win_length=window_size, window=window, center=center, pad_mode=pad_mode, 
            freeze_parameters=True)

        # Logmel feature extractor performs this on the GPU
        self.logmel_extractor = LogmelFilterBank(sr=sample_rate, n_fft=window_size, 
            n_mels=mel_bins, fmin=fmin, fmax=fmax, ref=ref, amin=amin, top_db=top_db, 
            freeze_parameters=True)

        # Spec augmenter performs this on the GPU
        self.spec_augmenter = SpecAugmentation(time_drop_width=64, time_stripes_num=2, 
            freq_drop_width=8, freq_stripes_num=2)
        
        # Model Encoder
        self.encoder = encoder_params[encoder]["init_op"]() #pretrained model to use
        self.fc1 = nn.Linear(encoder_params[encoder]["features"], 1024, bias=True) #new linear layer is placed on the front of the model
        self.att_block = AttBlock(1024, classes_num, activation="sigmoid") #create an attention block
        self.bn0 = nn.BatchNorm2d(mel_bins) #batch norm for the number of freq_bins (melbins) used in the log melspectrogram
        self.init_weight() #we intialise the weights of the new model
       
    
    def init_weight(self):
        init_layer(self.fc1)
        init_bn(self.bn0)
    
    def forward(self, input, mixup_lambda=None, perm=None):
        """Input : (batch_size, data_length)"""
        #we convert the input to a melspectrogram
        x = self.spectrogram_extractor(input)
        x = self.logmel_extractor(x)

        #frame numbers = time steps or x axis of spectrogram
        frames_num = x.shape[2]
        x = x.transpose(1, 3) #transpose the input, so that the mels bins, and p[assed through a batch norm layers
        x = self.bn0(x)
        x = x.transpose(1, 3) #transpose it back
        #if the model is training we add image augmentation from the torchlibrosa library
        if self.training:
            x = self.spec_augmenter(x) 
        
        # Mixup on spectrogram
        if self.training and mixup_lambda is not None:
            x = do_mixup(x, mixup_lambda, perm)
        
        # Output shape (batch size, channels, time, frequency)
        x = x.expand(x.shape[0], 3, x.shape[2], x.shape[3])
        #convert it to the dimensions that need to be passed through the encoder. 
        #and pass it through the enocder which is a pretrained CNN
        x = self.encoder.forward_features(x)
        # Aggregate in frequency axis
        x = torch.mean(x, dim=3)

        #do average pool and max pooling. 
        x1 = F.max_pool1d(x, kernel_size=3, stride=1, padding=1)
        x2 = F.avg_pool1d(x, kernel_size=3, stride=1, padding=1)
        x = x1 + x2
        #do drop out to avoid over fitting 
        x = F.dropout(x, p=0.5, training=self.training)
        x = x.transpose(1, 2)
        #pass the features through a ReLU
        #add drop out and transpose the feature tensor
        x = F.relu_(self.fc1(x))
        x = x.transpose(1, 2)
        x = F.dropout(x, p=0.5, training=self.training)

        #we get the clipwise output 
        (clipwise_output, norm_att, segmentwise_output) = self.att_block(x)
        logit = torch.sum(norm_att * self.att_block.cla(x), dim=2)
        segmentwise_output = segmentwise_output.transpose(1, 2)

        # Get framewise output
        framewise_output = interpolate(segmentwise_output,
                                       self.interpolate_ratio)

        framewise_output = pad_framewise_output(framewise_output, frames_num)
        #we pass back the framewsie and clipwise predictions and the probabilites for classes
        output_dict = {
            'framewise_output' : framewise_output,
            'logit' : logit,
            'clipwise_output' : clipwise_output
        }
        
        return output_dict

## Model Configs
Here are the current configs for the model encoder that can be be used for training
I have included:
 - EfficientNet b0, b7 https://arxiv.org/abs/1905.11946
 - ResNet50 https://arxiv.org/abs/1512.03385
 - Densenet121 https://arxiv.org/abs/1608.06993

In [8]:
#Model Config

encoder_params = {
    "tf_efficientnet_b3_ns": {
        "features": 1536,
        "init_op": partial(models.tf_efficientnet_b3_ns, pretrained=True, drop_path_rate=0.2)
    },
    "tf_efficientnet_b7": {
        "features": 2560,
        "init_op": partial(models.tf_efficientnet_b7, pretrained=True, drop_path_rate=0.5)
    },
    "resnet50": {
        "features": 2048,
        "init_op": partial(models.resnet50, pretrained=True)
    },
    "densenet121": {
        "features": 1024,
        "init_op": partial(models.densenet121, pretrained=True)
    }
}

### Adding a new model encoder
if you wish to add another model from a newer archeteture, you can add it like so. 
First we download the model and get the in features we need.
If you press Tab after models. you can see a list of model archetectures that are avaiable from TIMM

Here I will add Inception V3: https://arxiv.org/abs/1512.00567v3

In [9]:
m =models.inception_v3(pretrained=True)

### If m.fc doesn't exist, use m.classfier istead

In [10]:
encoder_params['inception_v3'] = {"features": m.fc.in_features, 'init_op':partial(models.inception_v3, pretrained=True) }
del(m)

In [11]:
encoder_params

{'tf_efficientnet_b3_ns': {'features': 1536,
  'init_op': functools.partial(<function tf_efficientnet_b3_ns at 0x0000018F230EC678>, pretrained=True, drop_path_rate=0.2)},
 'tf_efficientnet_b7': {'features': 2560,
  'init_op': functools.partial(<function tf_efficientnet_b7 at 0x0000018F230EBE58>, pretrained=True, drop_path_rate=0.5)},
 'resnet50': {'features': 2048,
  'init_op': functools.partial(<function resnet50 at 0x0000018F23118048>, pretrained=True)},
 'densenet121': {'features': 1024,
  'init_op': functools.partial(<function densenet121 at 0x0000018F230D99D8>, pretrained=True)},
 'inception_v3': {'features': 2048,
  'init_op': functools.partial(<function inception_v3 at 0x0000018F231495E8>, pretrained=True)}}

## Audio Augmentation

For the purpose of this notebook, I will be using stacked GaussianNoise, GaussianNoiseSNR, PinkNoise and Gain. 

This can be changed to the previous augmentation or other. 

In [12]:
class PinkNoise(BaseWaveformTransform):
    """Add pink noise to the samples with random Signal to Noise Ratio (SNR)"""

    supports_multichannel = True

    def __init__(self, min_snr=0.5, max_snr=20.0, p=0.5):
        """
        :param min_SNR: Minimum signal-to-noise ratio
        :param max_SNR: Maximum signal-to-noise ratio
        :param p: The probability of applying this transform
        """
        super().__init__(p)
        self.min_snr = min_snr
        self.max_snr = max_snr


    def apply(self, y, sample_rate):
        snr = np.random.uniform(self.min_snr, self.max_snr)
        a_signal = np.sqrt(y ** 2).max()
        a_noise = a_signal / (10 ** (snr / 20))

        pink_noise = cn.powerlaw_psd_gaussian(1, len(y))
        a_pink = np.sqrt(pink_noise ** 2).max()
        augmented = (y + pink_noise * 1 / a_pink * a_noise).astype(y.dtype)
        return augmented

In [13]:
import audiomentations as AA

audio_transforms = AA.Compose([
    AA.AddGaussianNoise(p=0.2),
    AA.AddGaussianSNR(p=0.2),
    AA.Gain(min_gain_in_db=-15,max_gain_in_db=15,p=0.3),
    PinkNoise( p=0.5),
    AA.AddGaussianNoise(p=0.2),
    AA.AddGaussianSNR(p=0.2),
    AA.Gain(min_gain_in_db=-15,max_gain_in_db=15,p=0.3)
])

In [14]:
df = pd.read_csv('train_folds.csv')
y,sr = librosa.load(f'../{df.iloc[0].filepath}', sr=None)
Audio(y,rate =sr)

In [15]:
audio = audio_transforms(y, sample_rate=sr)
Audio(audio,rate =sr)

In [16]:
def crop_and_pad(fn, label, period, record, n_class=65):
    y, sr = librosa.load(fn, sr=None) #we load the audio file
    len_y = len(y) #get its length
    effective_length = sr * period # length to use of the audio signal 
    rint = np.random.randint(len(record['X_min'])) #the CSV file is stored as an array for each filepath
    time_start = record['X_min'][rint] * sr #get the beginning and end
    time_end = record['X_max'][rint] * sr
    if len_y > effective_length:
        # Positioning sound slice  making sure it isn't larger than the length of the total sound clip
        center = np.round((time_start + time_end) / 2)
        beginning = center - effective_length / 2
        if beginning < 0:
            beginning = 0
        beginning = np.random.randint(beginning, center)
        ending = beginning + effective_length
        if ending > len_y:
            ending = len_y
        beginning = ending - effective_length
        y = y[beginning:ending].astype(np.float32)
    else:
        y = y.astype(np.float32)
        beginning = 0
        ending = effective_length


    beginning_time = beginning / sr #get the species that lie between the beginning and ending
    ending_time = ending / sr
    
    for i in range(len(record['X_min'])):
        if (record['X_min'][i] <= ending_time) and (record['X_max'][i] >= beginning_time):
            label[record['species_id'][i]] = 1
    
    return y, label

## Dataset 

This is the class that will prepare the data before it is passed onto the model. This is where we pas the CSV file and other argumnets on how we wish for the audio file to be adjusted.

In [17]:
class SedDataset:
    def __init__(self, df, config=None, audio_transforms= None, mode='train'):
        self.period = config.duration #duration we are training on
        self.stride = config.stride #sliding window 
        self.audio_transform = audio_transforms #audio transforms
        if mode=='train': #if training we use the train_audio root or root dir
            self.data_path = config.TRAIN_AUDIO_ROOT
        else:
            self.data_path = config.data_root
        self.config = config #store the config and mode
        self.mode = mode
        #we store the csv file as a list with the recording ID as the unique key
        self.df = df.groupby("filepath").agg(lambda x: list(x)).reset_index() 

    def __len__(self):
        return len(self.df)#The number of elements in the dataset, we are going for the number of audio files
    
    def __getitem__(self, idx): 
        #This is the function that loads the audio file, transforms it and 
        #returns the audio file and label back
        
        record = self.df.iloc[idx] #we get the record from the csv file
        audio_fn = f"{self.data_path}/{record['filepath']}" #get the audio file filename
        
        labels = np.zeros(self.config.num_classes, dtype='f')
        if self.mode == "train": #if we are training, we get an audio segment, augment it 
            y,label = crop_and_pad(audio_fn,labels, self.config.duration, record)

            if self.audio_transform:
                y = self.audio_transform(samples=y, sample_rate=self.config.original_sr)
        else: 
            #if we are inferencing or evaluating, we get the full audio file
            #and associated labels
            y, sr = librosa.load(audio_fn,sr=None)
            segments = len(y) / (self.stride*sr)
            segments = int(np.ceil(segments))
            length = int(self.period * sr)
            y_stacked = []
            for i in range(0,segments):
                if (i + 1) * length > len(y):
                    y_ = y[len(y) - length:len(y)]
                else:
                    y_ = y[i * length:(i + 1) * length]
                y_stacked.append(y_)
            y  = np.array(y_stacked)
            if self.mode == 'valid':
                for s in record.species_id:
                    if s== -1:
                        continue
                    labels[s] = 1
        return y, labels

## Metric
The metric used to evaluate network is the label-weighted label-ranking average precision, which is a generalization of the mean reciprocal rank measure for the case where there can be multiple true labels per audio file.

More information can be found here https://scikit-learn.org/stable/modules/model_evaluation.html#label-ranking-average-precision

In [18]:
def LWLRAP(preds, labels):
    # Ranks of the predictions
    ranked_classes = torch.argsort(preds, dim=-1, descending=True)
    # i, j corresponds to rank of prediction in row i
    class_ranks = torch.zeros_like(ranked_classes).to(preds.device)
    for i in range(ranked_classes.size(0)):
        for j in range(ranked_classes.size(1)):
            class_ranks[i, ranked_classes[i][j]] = j + 1
    # Mask out to only use the ranks of relevant GT labels
    ground_truth_ranks = class_ranks * labels + (1e6) * (1 - labels)
    # All the GT ranks are in front now
    sorted_ground_truth_ranks, _ = torch.sort(
        ground_truth_ranks, dim=-1, descending=False)
    # Number of GT labels per instance
    num_labels = labels.sum(-1)
    pos_matrix = torch.tensor(
        np.array([i+1 for i in range(labels.size(-1))])).unsqueeze(0).to(preds.device)
    score_matrix = pos_matrix / sorted_ground_truth_ranks
    score_mask_matrix, _ = torch.sort(labels, dim=-1, descending=True)
    scores = score_matrix * score_mask_matrix
    score = scores.sum() / labels.sum()
    return score.item()

## Loss Function Focal Loss
For the loss function, simnilary to the other training notebook, we are using binary cross entropy, 
but we are adding some extra functions turning this losss function into Focal Loss. Which helps to deal with class imbalance. Focal Loss paper can be found here: https://arxiv.org/pdf/1708.02002.pdf

In [19]:
class PANNsLoss_FocalLoss(nn.Module):
    def __init__(self, gamma= 2.0):
        super().__init__()
        self.gamma = gamma
        self.bce = nn.BCELoss(reduction='none')

    def forward(self, pred, target):
        pred_ = pred["clipwise_output"] #we use the clipwise_outpout from the model
        #make sure we have no nans or infinites in the tensor
        pred_ = torch.where(torch.isnan(pred_),
                             torch.zeros_like(pred_),
                             pred_)
        pred_ = torch.where(torch.isinf(pred_),
                             torch.zeros_like(pred_),
                             pred_)

        target = target.float()
        bce_loss = self.bce(pred_, target)#get binary cross entropy
        probas = torch.sigmoid(pred_)# get probabilites of the preds
        loss = torch.where(target >= 0.5, (1. - probas)** self.gamma * bce_loss, probas** self.gamma * bce_loss) #focal loss function
        loss = loss.mean()
        return loss

## Learner

For training this model, I am using pure Pytorch, as Pytorch lightnight doesn't seem to work all that well. So you will see a training loop and dataloaders being built. 

In [20]:
class AverageMeter(object):
    """Computes and stores the average and current value"""

    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count

In [21]:
def train_epoch(config, model, loader, loss_func, optimizer, scheduler, epoch):
    preds_, labels = [],[] #arrays to store labels and predictions per batch for training
    mixup_exp = Mixup(0.2) #mixup distribution
    losses = AverageMeter() #store the loss average
    model.train() #we set the model to traininig mode
    t = tqdm(loader)
    for i, (im,l) in enumerate(t): #loop though the batches
        optimizer.zero_grad() #zero the gradients of the optimiser
        im = im.to(config.device) #put the audio signal and label onto the GPU
        l = l.to(config.device)
        lam, indexes = mixup_exp.get_lambda_and_perm(l.shape[0]) #get the indexes of the mixuplabels
        output = model(im, lam, indexes) #predict on the model
        l = torch.lerp(l[indexes], l, lam.to(config.device)).to(config.device) #get the correct label for predicting
        loss = loss_func(output, l) #calculate the loss function
        loss.backward() #do backpropogation
        optimizer.step() #step the optimiser and scheduler if there is one
        if scheduler and config.step_scheduler:
            scheduler.step()

        preds_.extend(torch.sigmoid(torch.max(output['framewise_output'], dim=1)[0]).detach().cpu().numpy()) #store the predictions
        labels.extend(l.detach().cpu().numpy())
        
        bs = im.size(0) #store loss for the batch
        losses.update(loss.item(), bs) #get the loss of the network

        t.set_description(f"Train E:{epoch} - Loss{losses.avg:0.4f}") #set the loss on the twdm progress bar
    t.close()
    
    p = torch.from_numpy(np.array(preds_)) #calculate metrics
    t = torch.from_numpy(np.array(labels))
    return LWLRAP(p, t), losses.avg
        
def valid_epoch(config, model, loader, loss_fn, epoch):
    losses = AverageMeter() #store losses for validation
    preds_, labels = [],[] #arrays to store labels and predictions per batch for training
    model.eval() #set the model to evaluate mode
    with torch.no_grad():
        t = tqdm(loader)
        for i, (im,l) in enumerate(t): #loop through the data
            im = im.to(config.device) #pass the data to the GPU
            l = l.to(config.device)
            bs, seq, w = im.shape #get the original shape of the data 
            im = im.reshape(bs*seq, w) #we need to reshape it to bs * audio signal where it is current bsxnumber of segments, audio signal
            im = im.float()
            output = model(im) #predict
            output['clipwise_output'] = torch.max(output['clipwise_output'].reshape(bs, seq, -1), dim=1)[0] #we need to change this for the validatioin loss
                                                  
            loss = loss_fn(output, l) #calculate validation loss
            
            output = torch.sigmoid(torch.max(output['framewise_output'], dim=1)[0]) #reshape the framewiseoutput to the correct dims
            output = output.reshape(bs, seq, -1)
            output = torch.sum(output, dim=1)
                               
            preds_.extend(output.detach().cpu().numpy()) #store the predictions
            labels.extend(l.detach().cpu().numpy())
            
            bs = im.size(0)
            losses.update(loss.item(), bs)
            t.set_description(f"Valid E:{epoch} - Loss:{losses.avg:0.4f}")
    t.close()
    
    p = torch.from_numpy(np.array(preds_)) #calculate metrics
    t = torch.from_numpy(np.array(labels))
    return LWLRAP(p, t), losses.avg

# Training 

We are about to train the model, we have all of the components but now need to put it all together. 

In [22]:
def main(fold, config, train_df):

    save_path = os.path.join(config.save_path, config.exp_name)
    os.makedirs(save_path, exist_ok=True) #create folders to save the weights of the model

    train_fold = train_df[train_df.kfold != fold]
    valid_fold = train_df[train_df.kfold == fold] #we get the csv folds for training and validation 

    train_dataset = SedDataset( #we create out train and validation datasets
        df = train_fold,
        config=config,
        audio_transforms=audio_transforms,
        mode="train"
    )
    valid_dataset = SedDataset(
        df = valid_fold,
        config=config,
        mode="valid"
    )

    train_loader = torch.utils.data.DataLoader( #we create our dataloaders 
        train_dataset,
        batch_size=config.batch_size,
        shuffle=True,
        drop_last=True,
        num_workers=config.num_workers,
    )

    valid_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=config.batch_size,
        shuffle=False,
        drop_last=False,
        num_workers=config.num_workers
    )
    
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') #we create our model and load it onto the GPU
    model = AudioSEDModel(**config.model_param)
    model = model.to(config.device)

    criterion = PANNsLoss_FocalLoss() #loss function
    optimizer = torch.optim.Adam(model.parameters(), lr=config.lr)
    #we are using another scheduler, which first has a warm up and then linearely decreases the learning rate over time
    num_train_steps = int(len(train_loader) * config.epochs)
    num_warmup_steps = int(0.1 * config.epochs * len(train_loader))
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_train_steps)

    best_lwlrap = -np.inf #we get the best validation metric score
    early_stop_count = 0

    for epoch in range(config.epochs):

        train_avg, train_loss = train_epoch(config, model, train_loader, criterion, optimizer, scheduler, epoch) #we train the model
        valid_avg, valid_loss = valid_epoch(config, model, valid_loader, criterion, epoch) #we validate the model

        if config.epoch_scheduler: #if the scheduler is scheduled to step every epoch we do it here 
            scheduler.step()
        #we print out information after each epoch including time, date and loss and metric info
        content = f"Fold:{fold}, Epoch:{epoch}, lr:{optimizer.param_groups[0]['lr']:.7}, Train Loss:{train_loss:0.4f} - LWLRAP:{train_avg:0.4f},Valid Loss:{valid_loss:0.4f} - LWLRAP:{valid_avg:0.4f}\n"
        print(content)
        #if the validation metric is better than the current best validation metric#
        #we save the model
        #if it does not improve, we add 1 more to the early stopping count 
        if valid_avg> best_lwlrap:
            print(f"Model Improved From {best_lwlrap} ----> {valid_avg}")
            torch.save(model.state_dict(), os.path.join(save_path, f'fold-{fold}.pth'))
            best_lwlrap = valid_avg
            early_stop_count = 0
        else:
            early_stop_count += 1
        #if early stopping occues, we stop training for this fold. 
        if config.early_stop == early_stop_count:
            print('Early stopping happened')
            break


In [23]:
train_df = pd.read_csv('train_folds.csv')# we load the training CSV

In [24]:
train_df

Unnamed: 0,Label,File,Event_ID,X_min,X_max,Y_min,Y_max,filepaths,wavepath,Group,Species,duration,species_id,filepath,t_diff,kfold
0,antser,20190221_B261T8_2018-06-13_01-05-30.wav,159,11.39810,11.51420,1248.0,1636.0,Spectros_OM/20190221_B261T8_2018-06-13_01-05-3...,Wavs_OM/20190221_B261T8_2018-06-13_01-05-30.wav,bird,Silky-tailed nightjar,116.10,0,Wavs_OM/20190221_B261T8_2018-06-13_01-05-30.wav,0.11610,1
1,antser,20190308_B261T9_2018-06-25_00-54-00.wav,17,1.31195,1.48320,1248.0,1680.0,Spectros_OM/20190308_B261T9_2018-06-25_00-54-0...,Wavs_OM/20190308_B261T9_2018-06-25_00-54-00.wav,bird,Silky-tailed nightjar,171.25,0,Wavs_OM/20190308_B261T9_2018-06-25_00-54-00.wav,0.17125,4
2,antser,20190308_B261T9_2018-06-25_00-56-45.wav,128,13.28470,13.93195,1248.0,2240.0,Spectros_OM/20190308_B261T9_2018-06-25_00-56-4...,Wavs_OM/20190308_B261T9_2018-06-25_00-56-45.wav,bird,Silky-tailed nightjar,647.25,0,Wavs_OM/20190308_B261T9_2018-06-25_00-56-45.wav,0.64725,0
3,antser,20190313_B261T9_2018-06-25_02-55-01.wav,70,9.23865,9.27350,1464.0,1894.0,Spectros_OM/20190313_B261T9_2018-06-25_02-55-0...,Wavs_OM/20190313_B261T9_2018-06-25_02-55-01.wav,bird,Silky-tailed nightjar,34.85,0,Wavs_OM/20190313_B261T9_2018-06-25_02-55-01.wav,0.03485,1
4,antser,20190308_B261T9_2018-06-25_00-56-45.wav,127,13.19185,13.27020,1248.0,1464.0,Spectros_OM/20190308_B261T9_2018-06-25_00-56-4...,Wavs_OM/20190308_B261T9_2018-06-25_00-56-45.wav,bird,Silky-tailed nightjar,78.35,0,Wavs_OM/20190308_B261T9_2018-06-25_00-56-45.wav,0.07835,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26779,strhuh_call2,20180425_Unburned_300m_2017-12-27_19_38_15_.wav,95,7.47100,7.94990,44.0,1852.0,Spectros_OM/20180425_Unburned_300m_2017-12-27_...,Wavs_OM/20180425_Unburned_300m_2017-12-27_19_3...,bird,Barred owl - variable,478.90,30,Wavs_OM/20180425_Unburned_300m_2017-12-27_19_3...,0.47890,0
26780,strhuh_call2,20180425_Unburned_300m_2017-12-27_19_38_15_.wav,98,7.95575,7.97895,44.0,516.0,Spectros_OM/20180425_Unburned_300m_2017-12-27_...,Wavs_OM/20180425_Unburned_300m_2017-12-27_19_3...,bird,Barred owl - variable,23.20,30,Wavs_OM/20180425_Unburned_300m_2017-12-27_19_3...,0.02320,0
26781,strhuh_call2,20180425_Unburned_300m_2017-12-27_19_38_15_.wav,119,9.58985,9.59275,44.0,86.0,Spectros_OM/20180425_Unburned_300m_2017-12-27_...,Wavs_OM/20180425_Unburned_300m_2017-12-27_19_3...,bird,Barred owl - variable,2.90,30,Wavs_OM/20180425_Unburned_300m_2017-12-27_19_3...,0.00290,0
26782,strhuh_call2,20180425_Unburned_300m_2017-12-27_19_38_15_.wav,143,11.30810,11.35745,560.0,776.0,Spectros_OM/20180425_Unburned_300m_2017-12-27_...,Wavs_OM/20180425_Unburned_300m_2017-12-27_19_3...,bird,Barred owl - variable,49.35,30,Wavs_OM/20180425_Unburned_300m_2017-12-27_19_3...,0.04935,0


In [25]:
num_classes = train_df.Label.nunique() #this is the number of classes we are using check to make sure this is label
class Config:
    exp_name = "Efficientnet_B0" #experiment and save path
    save_path = 'weights'
    pretrain_weights = None 
    model_param = {
        'encoder' : 'tf_efficientnet_b3_ns', #we pass in which pretrained CNN we wish to you
        'sample_rate': 44100,
        'window_size' : 512, #1024 for better results
        'hop_size' : 512, # #mel spectrogram params
        'mel_bins' : 128, # 60
        'fmin' : 0,
        'fmax' : 13000,
        'classes_num' : num_classes #this is the number of classes that you are training on.
    }
    duration = 4 #if you set this less than 4 it will cause the training to crash
    stride=4
    seed = 42
    original_sr = 44100
    num_classes = num_classes 
    epochs = 3
    lr = 1e-3 #learning rate
    ROOT = '.'
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 
    total_duration =15.
    early_stop = 15
    step_scheduler = True
    epoch_scheduler = True
    batch_size = 16 #number of images that are passed to the GPU
    num_workers = 0# number of workers, this must be 0 on windows! 
    weight_decay = 1e-8 #weight decay for the optimiser
    TRAIN_AUDIO_ROOT = osp.join(ROOT, "..") #root directories
    data_root = osp.join(ROOT, "..")
config = Config()

In [26]:
train_df.t_diff.min(),train_df.t_diff.max(),train_df.t_diff.mean()

(0.0028999999999985, 7.561, 0.22862956429211467)

In [27]:
num_folds = train_df.kfold.nunique()

In [28]:
for fold in range(num_folds):
    main(fold, config, train_df)

  0%|          | 0/166 [00:00<?, ?it/s]

  0%|          | 0/42 [00:00<?, ?it/s]

Fold:0, Epoch:0, lr:0.0007371938, Train Loss:0.0433 - LWLRAP:0.0565,Valid Loss:0.0225 - LWLRAP:0.4598

Model Improved From -inf ----> 0.4597982168197632


  0%|          | 0/166 [00:00<?, ?it/s]

  0%|          | 0/42 [00:00<?, ?it/s]

Fold:0, Epoch:1, lr:0.0003652561, Train Loss:0.0226 - LWLRAP:0.0856,Valid Loss:0.0200 - LWLRAP:0.5628

Model Improved From 0.4597982168197632 ----> 0.5628185868263245


  0%|          | 0/166 [00:00<?, ?it/s]

  0%|          | 0/42 [00:00<?, ?it/s]

Fold:0, Epoch:2, lr:0.0, Train Loss:0.0210 - LWLRAP:0.0895,Valid Loss:0.0189 - LWLRAP:0.5840

Model Improved From 0.5628185868263245 ----> 0.5840457081794739


  0%|          | 0/166 [00:00<?, ?it/s]

  0%|          | 0/42 [00:00<?, ?it/s]

Fold:1, Epoch:0, lr:0.0007371938, Train Loss:0.0417 - LWLRAP:0.0620,Valid Loss:0.0237 - LWLRAP:0.4147

Model Improved From -inf ----> 0.41467127203941345


  0%|          | 0/166 [00:00<?, ?it/s]

  0%|          | 0/42 [00:00<?, ?it/s]

Fold:1, Epoch:1, lr:0.0003652561, Train Loss:0.0230 - LWLRAP:0.0856,Valid Loss:0.0206 - LWLRAP:0.5427

Model Improved From 0.41467127203941345 ----> 0.5426749587059021


  0%|          | 0/166 [00:00<?, ?it/s]

  0%|          | 0/42 [00:00<?, ?it/s]

Fold:1, Epoch:2, lr:0.0, Train Loss:0.0211 - LWLRAP:0.0861,Valid Loss:0.0195 - LWLRAP:0.5576

Model Improved From 0.5426749587059021 ----> 0.5575760006904602


  0%|          | 0/166 [00:00<?, ?it/s]

  0%|          | 0/42 [00:00<?, ?it/s]

Fold:2, Epoch:0, lr:0.0007371938, Train Loss:0.0430 - LWLRAP:0.0482,Valid Loss:0.0235 - LWLRAP:0.3311

Model Improved From -inf ----> 0.3310544192790985


  0%|          | 0/166 [00:00<?, ?it/s]

  0%|          | 0/42 [00:00<?, ?it/s]

Fold:2, Epoch:1, lr:0.0003652561, Train Loss:0.0229 - LWLRAP:0.0813,Valid Loss:0.0206 - LWLRAP:0.4916

Model Improved From 0.3310544192790985 ----> 0.49155759811401367


  0%|          | 0/166 [00:00<?, ?it/s]

  0%|          | 0/42 [00:00<?, ?it/s]

Fold:2, Epoch:2, lr:0.0, Train Loss:0.0216 - LWLRAP:0.0896,Valid Loss:0.0198 - LWLRAP:0.5195

Model Improved From 0.49155759811401367 ----> 0.5195050835609436


  0%|          | 0/166 [00:00<?, ?it/s]

  0%|          | 0/42 [00:00<?, ?it/s]

Fold:3, Epoch:0, lr:0.0007371938, Train Loss:0.0418 - LWLRAP:0.0558,Valid Loss:0.0235 - LWLRAP:0.4236

Model Improved From -inf ----> 0.423614501953125


  0%|          | 0/166 [00:00<?, ?it/s]

  0%|          | 0/42 [00:00<?, ?it/s]

Fold:3, Epoch:1, lr:0.0003652561, Train Loss:0.0227 - LWLRAP:0.0840,Valid Loss:0.0207 - LWLRAP:0.5179

Model Improved From 0.423614501953125 ----> 0.517890989780426


  0%|          | 0/166 [00:00<?, ?it/s]

  0%|          | 0/42 [00:00<?, ?it/s]

Fold:3, Epoch:2, lr:0.0, Train Loss:0.0211 - LWLRAP:0.0982,Valid Loss:0.0200 - LWLRAP:0.5679

Model Improved From 0.517890989780426 ----> 0.5678731799125671


  0%|          | 0/166 [00:00<?, ?it/s]

  0%|          | 0/42 [00:00<?, ?it/s]

Fold:4, Epoch:0, lr:0.0007371938, Train Loss:0.0419 - LWLRAP:0.0494,Valid Loss:0.0231 - LWLRAP:0.3338

Model Improved From -inf ----> 0.3338000774383545


  0%|          | 0/166 [00:00<?, ?it/s]

  0%|          | 0/42 [00:00<?, ?it/s]

Fold:4, Epoch:1, lr:0.0003652561, Train Loss:0.0231 - LWLRAP:0.0734,Valid Loss:0.0215 - LWLRAP:0.4605

Model Improved From 0.3338000774383545 ----> 0.46050825715065


  0%|          | 0/166 [00:00<?, ?it/s]

  0%|          | 0/42 [00:00<?, ?it/s]

Fold:4, Epoch:2, lr:0.0, Train Loss:0.0214 - LWLRAP:0.0848,Valid Loss:0.0198 - LWLRAP:0.5393

Model Improved From 0.46050825715065 ----> 0.5393466353416443


## Cross Validation of Training Data

In [29]:
classes = list(np.loadtxt('classes.txt',dtype=str, delimiter='\n')) # load classes
device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 
species_cols = [f'{classes[i]}' for i in range(len(classes))] #create a pecies columns for a CSV file to store predictions

cv_preds = pd.DataFrame(columns=species_cols)
cv_preds['filepath'] = train_df['filepath'].drop_duplicates()
cv_preds.loc[:, species_cols] = 0
cv_preds = cv_preds.reset_index(drop=True)

label_df = cv_preds.copy()

In [30]:
def load_model(paths, config):
    """This function loads the weights that are stored within a path"""
    model = AudioSEDModel(**config.model_param)
    model.load_state_dict(torch.load(paths, map_location='cpu'))
    model.eval()

    return model

In [32]:
for fold in range(5): #change to include an array 

    paths = f'{config.save_path}/{config.exp_name}/fold-{fold}.pth' #load a pretrained model we trained
    model = load_model(paths, config) #we load the weights
    model.to(device)
    
    valid_fold = train_df[train_df.kfold ==fold ]# we load inour valid folds that were trained on each fold
    dataset = SedDataset(valid_fold, config=config, mode='valid')
    test_loader = DataLoader(dataset, batch_size=config.batch_size, #create a dataloader
                             num_workers=config.num_workers,
                             shuffle=False, drop_last=False)
    

    tk = tqdm(test_loader, total=len(test_loader))
    sub_index = 0
    with torch.no_grad():
        _preds, labels = [], []
        for i, (x,l) in enumerate(tk):
            
            x = x.to(device)
            l = l.to(device)
            bs, seq, w = x.shape #we need to reshape the input it as contains lots of segments of raw audio files that are not in the right shape for the CNN
            x = x.reshape(bs*seq, w)
            x = x.float()
            preds = model(x, None, None)# We do the same as above, but we don't mix the labels up, 

            #we also need to reshape the framewise output to calculate the evaluation metric
            preds = torch.sigmoid(torch.max(preds['framewise_output'], dim=1)[0])
            preds = preds.reshape(bs, seq, -1)
            preds = torch.sum(preds, dim=1) #we get our preds and reshape them
            o = preds.cpu().numpy()
            _preds.extend(o)
            labels.extend(l.cpu().numpy()) #add them to the preds cvs file
            for val, ll in zip(o,l.cpu().numpy()):
                cv_preds.loc[sub_index, species_cols] += list(val)
                label_df.loc[sub_index, species_cols] = ll
                sub_index += 1
            
    p = torch.from_numpy(np.array(_preds)) #calculate metrics
    t = torch.from_numpy(np.array(labels))
    print(f"Fold {fold} lwlrap: {LWLRAP(p, t):.6}")
    
cv_preds.loc[:, species_cols] /=5

preds = cv_preds.loc[:,species_cols].values.astype(np.float32)
preds = torch.from_numpy(preds)

labels = label_df.loc[:,species_cols].values.astype(np.float32)
labels = torch.from_numpy(labels)

print(f"Final Ensemble lwlrap: {LWLRAP(preds, labels):.6}") #calculate the final ensemble score

  0%|          | 0/42 [00:00<?, ?it/s]

Fold 0 lwlrap: 0.584046


  0%|          | 0/42 [00:00<?, ?it/s]

Fold 1 lwlrap: 0.557576


  0%|          | 0/42 [00:00<?, ?it/s]

Fold 2 lwlrap: 0.519505


  0%|          | 0/42 [00:00<?, ?it/s]

Fold 3 lwlrap: 0.567873


  0%|          | 0/42 [00:00<?, ?it/s]

Fold 4 lwlrap: 0.539347
Final Ensemble lwlrap: 0.578403


## Loading models
To load a model for use, its quite simple. We create a model with the base model and load our weights. I am choosing the model from the 3rd fold

In [33]:
load_path = f'{config.save_path}/{config.exp_name}/fold-0.pth'
model = AudioSEDModel(**config.model_param) #we create a model
model.load_state_dict(torch.load(load_path, map_location='cpu'))

<All keys matched successfully>

## Saving a model for inference
To save a model out that we use for evaluation or inference we get the average of the models weights and save out their final weights for each fold

In [34]:
torch.save(model.state_dict(), 'weights/model_0.pth')

## Get the Framewise onset and Offet

In [36]:
def prediction_for_clip(test_df: pd.DataFrame, clip: np.ndarray, config,  model,threshold=0.5):
    PERIOD = config.duration
    SR=config.original_sr
    #we break the clip into multiple audio clips to pass to the GPU, we process each clip seperately
    audios = []
    y = clip.astype(np.float32)
    len_y = len(y)
    start = 0
    end = int(PERIOD * SR)
    while True:
        y_batch = y[start:end].astype(np.float32) #the first batch
        if len(y_batch) != PERIOD * SR: #we keep on adding till we get to the end of the audio clip
            y_pad = np.zeros(PERIOD * SR, dtype=np.float32)
            y_pad[:len(y_batch)] = y_batch
            audios.append(y_pad)
            break
        start = end
        end += int(PERIOD * SR)
        audios.append(y_batch) #add the audio clip to anarrya
    array = np.asarray(audios)
    tensors = torch.from_numpy(array) #convert the audop batches to a tensor for the GPU
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    
    model.eval()  #the model is placed to evaluate mode
    estimated_event_list = []
    global_time = 0.0
    audio_id = test_df["filepath"].values[0] #get the audio ID
    for image in tqdm(tensors): #for each image in the tensor
        image = image.view(1, image.size(0)) #we convert to a batch of one and send to the GPU
        image = image.to(device)
        #we predict and get the framewise output. 
        with torch.no_grad():
            prediction = model(image)
            framewise_outputs = prediction["framewise_output"].detach(
                ).cpu().numpy()[0]
        #we threshold the outputs, and get rid of items that are below the 
        thresholded = framewise_outputs >= threshold

        for target_idx in range(thresholded.shape[1]): #thresholded.shape[1] is the number of classes youve trained on
            if thresholded[:, target_idx].mean() == 0: #if mean is zero, nothing is detected
                pass
            else:
                detected = np.argwhere(thresholded[:, target_idx]).reshape(-1) #else we get all of the thresholds at the target idx

                head_idx = 0
                tail_idx = 0 
                while True:
                    if (tail_idx + 1 == len(detected)) or (
                            detected[tail_idx + 1] - 
                            detected[tail_idx] != 1): #if the tail idx is not larger than the detected class
                        onset = 0.01 * detected[
                            head_idx] + global_time #we loop through the detected classes and add to the onset and offset the detected is the time domain
                        offset = 0.01 * detected[
                            tail_idx] + global_time
                        onset_idx = detected[head_idx]
                        offset_idx = detected[tail_idx]
                        max_confidence = framewise_outputs[
                            onset_idx:offset_idx, target_idx].max() #we get the mean and max confidence scores 
                        mean_confidence = framewise_outputs[
                            onset_idx:offset_idx, target_idx].mean()
                        estimated_event = {
                            "audio_id": audio_id,
                            "ebird_code": classes[target_idx],
                            "onset": onset,
                            "offset": offset,
                            "max_confidence": max_confidence,
                            "mean_confidence": mean_confidence
                        }
                        estimated_event_list.append(estimated_event) #we append the estimated event for the class
                        head_idx = tail_idx + 1
                        tail_idx = tail_idx + 1
                        if head_idx >= len(detected):
                            break
                    else:
                        tail_idx += 1
        global_time += PERIOD #we add to the global time and create a dataframe from the estimated time list
        prediction_df = pd.DataFrame(estimated_event_list) 
    return prediction_df #return the CSV
    

In [37]:
def get_model(path, config):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    state_dict = torch.load(path, map_location=device) # we load a model  here
    model = AudioSEDModel(**config.model_param)
    model.load_state_dict(state_dict)
    model.to(device)
    return model.eval()

In [38]:
from collections import defaultdict
def prediction(test_df,model, config, thresh=0.5): 
    #framewise predictions for getting the onset and offset of bird calls
    #using one model
    unique_audio_id = test_df.filepath.unique() #we get unique audio filepaths in the test_csv

    warnings.filterwarnings("ignore")
    prediction_dfs = [] #create a predictions df list
    for filepath in tqdm(unique_audio_id): # for the audio files
        clip, _ = librosa.load(f'{config.data_root}/{filepath}',sr=None,
                               res_type="kaiser_fast") #we load the audio
        
        test_df_for_audio_id = test_df.query(
            f"filepath == '{filepath}'").reset_index(drop=True) #create a csv file for the filepath
        prediction_df = prediction_for_clip(test_df_for_audio_id, #pass this to the prediction per clip
                                                clip=clip,
                                                config=config,
                                                model=model,
                                                threshold=thresh) #threshold of what confidence in a score we are looking for

        prediction_dfs.append(prediction_df)
    prediction_df = pd.concat(prediction_dfs, axis=0, sort=False).reset_index(drop=True) #ad all of the lists together
    return prediction_df

In [50]:
model = get_model(f'{config.save_path}/{config.exp_name}/fold-0.pth', config)
list_of_prediction_df = prediction(train_df[:5], model, config, thresh=0.9)

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

In [51]:
list_of_prediction_df.tail()

Unnamed: 0,audio_id,ebird_code,onset,offset,max_confidence,mean_confidence
12,Wavs_OM/20190313_B261T9_2018-06-25_02-55-01.wav,nycoce,4.0,5.79,0.997336,0.988737
13,Wavs_OM/20190313_B261T9_2018-06-25_02-55-01.wav,nycoce,8.0,8.89,0.952564,0.948626
14,Wavs_OM/20190313_B261T9_2018-06-25_02-55-01.wav,nycoce,9.2,9.49,0.924393,0.924393
15,Wavs_OM/20190313_B261T9_2018-06-25_02-55-01.wav,nycoce,9.8,10.69,0.970744,0.960718
16,Wavs_OM/20190313_B261T9_2018-06-25_02-55-01.wav,nycoce,12.0,13.79,0.980085,0.951238


# Evaluation 

We Have now trained models, so we can now evaluate them on a testset. 

In [52]:
#we load our test set and create a CSV file that can accept the predictions and labels
classes = list(np.loadtxt('classes.txt',dtype=str, delimiter='\n'))
test = pd.read_csv('../test_data_labs_OM.csv')
test['filepath'] = [f'Test_data_OM/{f}' for f in test.File]
test['species_id'] = [classes.index(s) if s in classes else -1 for s in test.Label]

species_cols = [f'{classes[i]}' for i in range(len(classes))]

cv_preds = pd.DataFrame(columns=species_cols)
cv_preds['filepath'] = test['filepath'].drop_duplicates()
cv_preds.loc[:, species_cols] = 0
cv_preds = cv_preds.reset_index(drop=True)

label_df = cv_preds.copy()
test = test.reset_index(drop=True)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [53]:
config.data_root = '..'

In [57]:
fns =[]
for fn in test.filepath.unique():
    y,sr = librosa.load(f'../{fn}', sr=None)
    if libros 14928a.get_duration(y,sr=sr) <14.9:
        fns.append(fn)

In [59]:
test = test[~test.filepath.isin(fns)]

In [61]:
test.species_id.value_counts()

-1     14928
 8       880
 0       585
 5       550
 12      416
 15      241
 32      195
 20      188
 29       90
 16       78
 25       67
 28       62
 13       52
 23       42
 27       37
 17       36
 31       27
 34       20
 3        19
 24       18
 10       16
 18       14
 33       11
 37       10
 30        9
 1         8
 2         7
 6         6
 19        6
 38        6
 7         5
 35        4
 4         3
 39        3
 36        3
 11        2
 43        2
 9         1
 41        1
 21        1
Name: species_id, dtype: int64

## Evaluation Loop

In [60]:
model = AudioSEDModel(**config.model_param) #we create a model

for fold in range(1): #change to include an array 
    paths = f'{config.save_path}/{config.exp_name}/fold-{fold}.th' #load a pretrained model we trained
#     m = average_model(paths)
    model = get_model(paths, config)# load the weights
    
    # we load inour valid folds that were trained on each fold
    dataset = SedDataset(test, config=config, mode='valid')
    test_loader = DataLoader(dataset, batch_size=config.batch_size, #create a dataloader
                             num_workers=config.num_workers,
                             shuffle=False, drop_last=False)
    

    tk = tqdm(test_loader, total=len(test_loader))
    sub_index = 0
    with torch.no_grad():
        _preds, labels = [], []
        for i, (x,l) in enumerate(tk):
            
            x = x.to(device)
            l = l.to(device)
            bs, seq, w = x.shape #we need to reshape the input it as contains lots of segments of raw audio files that are not in the right shape for the CNN
            x = x.reshape(bs*seq, w)
            x = x.float()
            preds = model(x, None, None)# We do the same as above, but we don't mix the labels up, 

            #we also need to reshape the framewise output to calculate the evaluation metric
            preds = torch.sigmoid(torch.max(preds['framewise_output'], dim=1)[0])
            preds = preds.reshape(bs, seq, -1)
            preds = torch.sum(preds, dim=1)
            val_lwlrap = LWLRAP(preds, l)#get metrics and log them
            o = preds.cpu().numpy()
            _preds.extend(o)
            labels.extend(l.cpu().numpy())
            for val, ll in zip(o,l.cpu().numpy()):
                cv_preds.loc[sub_index, species_cols] += list(val)
                label_df.loc[sub_index, species_cols] = ll
                sub_index += 1
    
    p = torch.from_numpy(np.array(_preds)) 
    t = torch.from_numpy(np.array(labels))
    print(f"Fold {fold} lwlrap: {LWLRAP(p, t):.6}")
    
cv_preds.loc[:, species_cols] /=num_folds

preds = cv_preds.loc[:,species_cols].values.astype(np.float32)
preds = torch.from_numpy(preds)

labels = label_df.loc[:,species_cols].values.astype(np.float32)
labels = torch.from_numpy(labels)

print(f"Final Ensemble lwlrap: {LWLRAP(preds, labels):.6}")

  0%|          | 0/434 [00:00<?, ?it/s]

Fold 0 lwlrap: 0.349765
Final Ensemble lwlrap: 0.349765


### fin