# BirdCLEF 2022: Bird Sound Classification Based on Convolutional Neural Networks

## Road map

## Libraries

In [1]:
"""# Install python_speech_features to calculate filter banks features:
!ls ../input/python-speech-features06-zip/
!mkdir -p /tmp/pip/cache/
!cp ../input/python-speech-features06-zip/python_speech_features-0.6.xyz /tmp/pip/cache/python_speech_features-0.6.tar.gz
!pip install --no-index --find-links /tmp/pip/cache/ python_speech_features"""

'# Install python_speech_features to calculate filter banks features:\n!ls ../input/python-speech-features06-zip/\n!mkdir -p /tmp/pip/cache/\n!cp ../input/python-speech-features06-zip/python_speech_features-0.6.xyz /tmp/pip/cache/python_speech_features-0.6.tar.gz\n!pip install --no-index --find-links /tmp/pip/cache/ python_speech_features'

In [2]:
import numpy as np
import pandas as pd
import json
from sklearn import preprocessing
import tensorflow as tf
import tensorflow_addons as tfa
from tqdm import tqdm
import librosa
import python_speech_features
import gc
import logging

# Raise the root logger's threshold to ERROR so that lower-severity records
# (e.g. warnings and info messages) are not emitted while the notebook runs.
logger = logging.getLogger()
logger.setLevel(logging.ERROR)

## Config

In [3]:
# Data:
# Prefix prepended to every metadata `filename` when an audio file is loaded.
TRAIN_DIR = 'train_audio/'
# Sampling rate (Hz) requested from librosa.load and passed to logfbank.
SAMPLE_RATE = 32000
# Fraction of each processed chunk used for training; the remainder is validation.
TRAIN_SIZE = 0.8

# Data processing:
# Passed to logfbank as `winlen` / `winstep`.
# NOTE(review): 0.25 s at 32000 Hz is 8000 samples per window, far more than
# N_FFT = 512 — confirm how python_speech_features treats windows longer than nfft.
WINDOW_LENGTH = 0.25
WINDOW_STRIDE = 0.01
# Number of filters (`nfilt`); also used as the height and width of each square image.
N_MELS = 32
N_FFT = 512
# Lower / upper band edges passed as `lowfreq` / `highfreq`; FMAX is half the sample rate.
FMIN = 0
FMAX = SAMPLE_RATE / 2
PREEMPHASIS_COEFFICIENT = 0.97
# Number of consecutive images grouped into one sequence (the model's time axis).
STRIDE = 14

# Learning process:
# Path handed to model.save after training.
NAME_MODEL_0 = "model_0/model_0.h5"
BATCH_SIZE = 64
EPOCHS = 25
# Forwarded to model.fit as `callbacks`.
CALL_BACKS = None
# Number of audio files processed and fitted per chunk.
CHUNK_SIZE = 316

## Load data

In [4]:
# Read the per-recording metadata table and preview its first rows.
train_metadata = pd.read_csv('train_metadata.csv')
train_metadata.head()

Unnamed: 0,primary_label,secondary_labels,type,latitude,longitude,scientific_name,common_name,author,license,rating,time,url,filename
0,afrsil1,[],"['call', 'flight call']",12.391,-1.493,Euodice cantans,African Silverbill,Bram Piot,Creative Commons Attribution-NonCommercial-Sha...,2.5,08:00,https://www.xeno-canto.org/125458,afrsil1/XC125458.ogg
1,afrsil1,"['houspa', 'redava', 'zebdov']",['call'],19.8801,-155.7254,Euodice cantans,African Silverbill,Dan Lane,Creative Commons Attribution-NonCommercial-Sha...,3.5,08:30,https://www.xeno-canto.org/175522,afrsil1/XC175522.ogg
2,afrsil1,[],"['call', 'song']",16.2901,-16.0321,Euodice cantans,African Silverbill,Bram Piot,Creative Commons Attribution-NonCommercial-Sha...,4.0,11:30,https://www.xeno-canto.org/177993,afrsil1/XC177993.ogg
3,afrsil1,[],"['alarm call', 'call']",17.0922,54.2958,Euodice cantans,African Silverbill,Oscar Campbell,Creative Commons Attribution-NonCommercial-Sha...,4.0,11:00,https://www.xeno-canto.org/205893,afrsil1/XC205893.ogg
4,afrsil1,[],['flight call'],21.4581,-157.7252,Euodice cantans,African Silverbill,Ross Gallardy,Creative Commons Attribution-NonCommercial-Sha...,3.0,16:30,https://www.xeno-canto.org/207431,afrsil1/XC207431.ogg


In [5]:
# Read the scored species list and add a catch-all 'other' class:
with open('scored_birds.json', 'r') as f:
    valid_classes = json.load(f)
valid_classes.append('other')
N_CLASSES = len(valid_classes)

# Fit the encoder on the class names (scored species plus 'other'):
encoder = preprocessing.LabelEncoder()
valid_labels = encoder.fit_transform(valid_classes)

# Keep a primary label when it is one of the work classes, otherwise map it to 'other':
primary_labels = train_metadata.primary_label
edited_labels = [
    primary_labels[row] if primary_labels[row] in valid_classes else 'other'
    for row in range(primary_labels.shape[0])
]
labels = encoder.transform(edited_labels)

## Data Processing

In [6]:
# Cut the signal into zero-padded frames of `frame_len` seconds each:
def framing(sig: np.ndarray, sample_rate: int, frame_len: int, duration_time: float) -> np.ndarray:
    """Split a 1-D signal into consecutive fixed-length frames.

    Args:
        sig: 1-D array of audio samples.
        sample_rate: Number of samples per second.
        frame_len: Frame duration in seconds.
        duration_time: Duration of `sig` in seconds; determines the number of
            frames as ceil(duration_time / frame_len).

    Returns:
        Array of shape (num_frames, int(frame_len * sample_rate)). Samples are
        copied in order; the unused tail of the last frame stays zero. A signal
        with zero duration yields an array with zero rows.
    """
    samples_per_frame = int(frame_len * sample_rate)
    # Use `frame_len` (not a hard-coded 5) so the frame count always matches the frame width.
    num_frames = int(np.ceil(duration_time / frame_len))
    framed_sig = np.zeros((num_frames, samples_per_frame))

    # The freshly allocated array is contiguous, so the flattened view shares its
    # memory: filling the view fills the frames row by row.
    flat_view = framed_sig.reshape(-1)
    # Never copy more samples than fit, in case `duration_time` understates the length of `sig`.
    num_copied = min(sig.shape[0], flat_view.shape[0])
    flat_view[:num_copied] = sig[:num_copied]

    return framed_sig


def processingChunkAudio(filenames: list, labels: np.ndarray) -> list:
    """Turn a chunk of audio files into image sequences with per-image labels.

    Each file is loaded at SAMPLE_RATE, cut into 5-second frames, and every
    frame is converted to log filter-bank features. The feature rows are
    grouped into square N_MELS x N_MELS images, and the images into sequences
    of STRIDE images. Rows/images that do not fill a whole image/sequence are
    dropped.

    Args:
        filenames: Iterable of file paths relative to TRAIN_DIR (a list or a
            pandas Series both work).
        labels: Encoded class index of each file, indexable by position in
            the same order as `filenames`.

    Returns:
        [features, labels] where features has shape
        (num_sequences, STRIDE, N_MELS, N_MELS, 1) and labels has shape
        (num_sequences, STRIDE, N_CLASSES), one-hot encoded. Both arrays are
        float32 and have zero rows when no complete sequence was produced.
    """
    # Collect the pieces in lists and concatenate once at the end: stacking
    # inside the loop would recopy the whole accumulated array for every frame.
    feature_parts = []
    label_parts = []

    for i, filename in enumerate(filenames):
        # Load audio:
        signal, _ = librosa.load(
            TRAIN_DIR + filename,
            sr=SAMPLE_RATE,
            mono=True,
            dtype=np.float32
        )

        # Divide signal into frames duration 5 sec:
        frames = framing(
            sig=signal,
            sample_rate=SAMPLE_RATE,
            frame_len=5,
            duration_time=librosa.get_duration(
                y=signal,
                sr=SAMPLE_RATE
            )
        )

        # The one-hot label is the same for every frame of this file:
        label_cat = tf.keras.utils.to_categorical(
            labels[i],
            num_classes=N_CLASSES,
            dtype=np.uint8
        )

        for j in range(frames.shape[0]):
            # Extract log filter banks:
            mel = python_speech_features.base.logfbank(
                frames[j],
                samplerate=SAMPLE_RATE,
                winlen=WINDOW_LENGTH,
                winstep=WINDOW_STRIDE,
                nfilt=N_MELS,
                nfft=N_FFT,
                lowfreq=FMIN,
                highfreq=FMAX,
                preemph=PREEMPHASIS_COEFFICIENT
            )
            mel = np.float32(mel)

            # Make images (drop trailing rows that do not fill a whole image):
            num_images = mel.shape[0] // N_MELS
            images = mel[:num_images * N_MELS].reshape(num_images, N_MELS, N_MELS)

            # Make series of images (drop trailing images that do not fill a sequence):
            num_groups = num_images // STRIDE
            if num_groups == 0:
                continue
            sequences = images[:num_groups * STRIDE].reshape(num_groups, STRIDE, N_MELS, N_MELS, 1)

            # Repeat the file's one-hot label for every image of every sequence:
            labels_reshaped = np.full(
                shape=(num_groups, STRIDE, N_CLASSES),
                fill_value=label_cat,
                dtype=np.uint8
            )

            feature_parts.append(sequences)
            label_parts.append(labels_reshaped)

    if not feature_parts:
        # Nothing usable in this chunk: return correctly shaped empty arrays.
        return [
            np.zeros((0, STRIDE, N_MELS, N_MELS, 1), dtype=np.float32),
            np.zeros((0, STRIDE, N_CLASSES), dtype=np.float32)
        ]

    features_arr = np.concatenate(feature_parts, axis=0)
    labels_arr = np.concatenate(label_parts, axis=0).astype(np.float32)

    return [features_arr, labels_arr]


def splitData(data: list) -> list:
    """Split features and labels sequentially into train and validation parts.

    Args:
        data: Two-element list [features, labels]; each element is sliced
            along its first axis.

    Returns:
        [train_features, train_labels, val_features, val_labels], where the
        first TRAIN_SIZE fraction of each array goes to the training part and
        the remainder to the validation part.
    """
    features, targets = data[0], data[1]

    # Index of the first validation element of each array:
    features_cut = int(TRAIN_SIZE * features.shape[0])
    targets_cut = int(TRAIN_SIZE * targets.shape[0])

    return [
        features[:features_cut],
        targets[:targets_cut],
        features[features_cut:],
        targets[targets_cut:],
    ]


def dataGenerator(features: np.ndarray, labels: np.ndarray):
    """Wrap features and labels in a tf.data.Dataset batched by BATCH_SIZE.

    Args:
        features: Array whose first axis indexes the samples.
        labels: Array of targets aligned with `features` along the first axis.

    Returns:
        A dataset yielding (features, labels) batches of up to BATCH_SIZE samples.
    """
    # Slice along the first axis, then group the slices into batches:
    return tf.data.Dataset.from_tensor_slices((features, labels)).batch(BATCH_SIZE)

In [7]:
def getModel_0():
    """Build and compile the per-image sequence classifier.

    The network takes sequences of STRIDE single-channel N_MELS x N_MELS
    images, applies the same convolutional feature extractor to every image
    of a sequence, runs a bidirectional LSTM over the sequence, and outputs a
    softmax over the classes for each image.

    Returns:
        A compiled tf.keras.Model (Adam optimizer, categorical cross-entropy,
        accuracy / precision / recall metrics).
    """
    keras_layers = tf.keras.layers
    per_image = keras_layers.TimeDistributed  # applies a layer to each image of a sequence

    layer_input = tf.keras.Input((STRIDE, N_MELS, N_MELS, 1), dtype=tf.float32)

    # Layers in the order they are applied to the input:
    pipeline = [
        keras_layers.BatchNormalization(),
        per_image(keras_layers.Conv2D(64, (5, 5), activation='elu')),
        per_image(keras_layers.MaxPooling2D((2, 2))),
        per_image(keras_layers.Conv2D(128, (3, 3), activation='elu')),
        per_image(keras_layers.MaxPooling2D((2, 2))),
        per_image(keras_layers.Flatten()),
        per_image(keras_layers.Dense(64, activation='elu')),
        keras_layers.Dropout(0.5),
        keras_layers.Bidirectional(keras_layers.LSTM(128, return_sequences=True)),
        keras_layers.Dropout(0.5),
        per_image(keras_layers.Dense(len(valid_classes), activation='softmax')),
    ]

    layer_output = layer_input
    for layer in pipeline:
        layer_output = layer(layer_output)

    model_0 = tf.keras.Model(inputs=[layer_input], outputs=[layer_output])
    model_0.compile(
        optimizer='adam',
        loss='categorical_crossentropy',
        metrics=['accuracy', tf.keras.metrics.Precision(), tf.keras.metrics.Recall()]
    )

    return model_0

In [None]:
# Shuffle the files and their labels with ONE shared permutation. Shuffling the
# two independently would pair every file with some other file's label.
shuffle_order = np.random.RandomState(42).permutation(train_metadata.shape[0])
train_metadata = train_metadata.iloc[shuffle_order]
labels = np.asarray(labels)[shuffle_order]

# Round up so the final, partially filled chunk is processed as well:
n_chunks = int(np.ceil(train_metadata.shape[0] / CHUNK_SIZE))

model_0 = getModel_0()
train_log_los_err = np.zeros(EPOCHS)
val_log_loss_err = np.zeros(EPOCHS)
for epoch in tqdm(range(EPOCHS)):
    fitted_chunks = 0
    for i in range(n_chunks):
        # Prepare data for fitting:
        processed_data = processingChunkAudio(
            filenames=train_metadata.filename.iloc[i * CHUNK_SIZE:(i + 1) * CHUNK_SIZE],
            labels=labels[i * CHUNK_SIZE:(i + 1) * CHUNK_SIZE]
        )
        splitted_data = splitData(data=processed_data)

        # A chunk that yields no training or no validation sequences cannot be fitted:
        if splitted_data[0].shape[0] == 0 or splitted_data[2].shape[0] == 0:
            continue

        train_ds = dataGenerator(features=splitted_data[0], labels=splitted_data[1])
        val_ds = dataGenerator(features=splitted_data[2], labels=splitted_data[3])

        # Fit the model (the datasets are already batched, so `batch_size` is not passed):
        history = model_0.fit(train_ds, validation_data=val_ds, epochs=1, callbacks=CALL_BACKS)
        train_log_los_err[epoch] = train_log_los_err[epoch] + history.history['loss'][0]
        val_log_loss_err[epoch] = val_log_loss_err[epoch] + history.history['val_loss'][0]
        fitted_chunks += 1

        # Erase the chunk data from memory. The model itself must stay alive:
        # it keeps training on the next chunk and is saved at the end.
        del processed_data
        del splitted_data
        del train_ds
        del val_ds
        gc.collect()

    # Average the accumulated losses over the chunks that were actually fitted:
    if fitted_chunks > 0:
        train_log_los_err[epoch] = train_log_los_err[epoch] / fitted_chunks
        val_log_loss_err[epoch] = val_log_loss_err[epoch] / fitted_chunks

model_0.save(NAME_MODEL_0)

  0%|          | 0/25 [00:00<?, ?it/s]