Training Jupyter Notebook

Mount Google Drive and Copy the Dataset

In [None]:
# Mount Google Drive so the dataset archive and the model save directory
# under /content/drive are reachable from this runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Copy the dataset archive from Drive to the local runtime disk.
# The two commented-out commands are earlier alternatives (copying the
# extracted directory, or a zip export) kept for reference.
# ! cp -vr /content/drive/MyDrive/audio_images/ /content/audio_images
# ! cp -vr /content/drive/MyDrive/audio_images-20220324T215740Z-001.zip /content/
! cp -vr /content/drive/MyDrive/audio_images.tar.gz /content/

'/content/drive/MyDrive/audio_images.tar.gz' -> '/content/audio_images.tar.gz'


In [None]:
! ls -alt /content/
! mkdir /content/audio_images
! tar -zxvf audio_images.tar.gz 
# ! unzip /content/audio_images-20220324T215740Z-001.zip


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
audio_images/normoc/XC661657.ogg.npy
audio_images/normoc/XC662554.ogg.npy
audio_images/normoc/XC662555.ogg.npy
audio_images/normoc/XC664117.ogg.npy
audio_images/normoc/XC664292.ogg.npy
audio_images/normoc/XC71561.ogg.npy
audio_images/normoc/XC71562.ogg.npy
audio_images/norpin/
audio_images/norpin/XC113161.ogg.npy
audio_images/norpin/XC113162.ogg.npy
audio_images/norpin/XC113164.ogg.npy
audio_images/norpin/XC113185.ogg.npy
audio_images/norpin/XC113186.ogg.npy
audio_images/norpin/XC127474.ogg.npy
audio_images/norpin/XC129047.ogg.npy
audio_images/norpin/XC144366.ogg.npy
audio_images/norpin/XC161348.ogg.npy
audio_images/norpin/XC161349.ogg.npy
audio_images/norpin/XC182407.ogg.npy
audio_images/norpin/XC187573.ogg.npy
audio_images/norpin/XC187576.ogg.npy
audio_images/norpin/XC266784.ogg.npy
audio_images/norpin/XC266786.ogg.npy
audio_images/norpin/XC266787.ogg.npy
audio_images/norpin/XC266788.ogg.npy
audio_images/norpin/XC291860

In [None]:
# Show the on-disk size of each extracted per-species directory.
! du -h /content/audio_images/

23M	/content/audio_images/mitpar
780K	/content/audio_images/akikik
5.5M	/content/audio_images/akiapo
1.8M	/content/audio_images/hawgoo
3.6M	/content/audio_images/blknod
360K	/content/audio_images/puaioh
3.9M	/content/audio_images/afrsil1
44M	/content/audio_images/categr
7.9M	/content/audio_images/refboo
11M	/content/audio_images/reccar
75M	/content/audio_images/gamqua
27M	/content/audio_images/grbher3
5.0M	/content/audio_images/burpar
73M	/content/audio_images/rinphe
10M	/content/audio_images/rempar
105M	/content/audio_images/cangoo
11M	/content/audio_images/buwtea
93M	/content/audio_images/gnwtea
7.6M	/content/audio_images/gryfra
7.0M	/content/audio_images/lcspet
2.5M	/content/audio_images/bulpet
9.8M	/content/audio_images/rettro
14M	/content/audio_images/parjae
20M	/content/audio_images/peflov
27M	/content/audio_images/zebdov
4.9M	/content/audio_images/grefri
15M	/content/audio_images/semplo
8.2M	/content/audio_images/whfibi
2.0M	/content/audio_images/kauama
126M	/content/audio_image

Paths and Imports

In [None]:
import pandas as pd, numpy as np, gc
import librosa as lb
import librosa.display as lbd

# from kaggle_datasets import KaggleDatasets
import tensorflow as tf, re, math
import tensorflow.keras.backend as K
from tensorflow.keras import layers
from keras.layers import Activation, Dropout, Flatten, Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from keras.models import Model
from keras.layers import Dense, GlobalAveragePooling2D
# force a channel ordering
from keras import backend
from tensorflow import keras


import matplotlib.pyplot as plt

from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

from pathlib import Path
from tqdm import tqdm
from functools import lru_cache

import json
import random
from datetime import datetime

import pickle

In [None]:
# Root directory of the pre-computed audio images (.npy files).
# The local copy under /content is used; the Drive location is kept for reference.
# TRAIN_AUDIO_IMAGES_SAVE_ROOT = Path("/content/drive/MyDrive/audio_images")
TRAIN_AUDIO_IMAGES_SAVE_ROOT = Path("/content/audio_images")

# Directory (on Drive) and file name used for the model checkpoint; the same
# directory also holds classes.npy and the pickled training history.
MODEL_SAVE_ROOT = Path("/content/drive/MyDrive/model_save")
MODEL_SAVE_NAME = 'BirdClef2022-ResNet50V2_model.h5'

# When True, the training cell replaces the freshly built model with the saved one.
LOAD_SAVED_MODEL = True

# Threshold for no-call detector: a segment keeps its bird label only when its
# call probability is >= this value, otherwise it is labelled NO_CALL.
BIRD_CALL_PROB = 0.5

# No Call Label
NO_CALL = "no_call"

# NUM_FOLDS = 5

# Batch size shared by the training and validation generators.
BATCH_SIZE = 128
# Maximum number of epochs passed to model.fit (callbacks may stop earlier).
EPOCHS = 100

Some birds have only a few training samples, and no-call filtering would reduce their sample counts even further

In [None]:
# Species with too few samples to be filtered through the no-call detector.
# NOTE(review): the labelling loop only keeps a bird label when
# `row.primary_label not in NO_CALL_IGNORE`, so every segment of these species
# is currently labelled NO_CALL. Confirm this is the intended meaning of
# "ignore" (rather than "skip the no-call filter and keep the bird label").
NO_CALL_IGNORE = [ 'akikik', 'brnboo', 'bubsan', 'bulpet', 'coopet', 'crehon', 'ercfra', 'hawpet1', 'layalb', 'lessca', 'magpet1', 'mauala', 'pomjae', 'puaioh', 'shtsan']

Connect To TPU

In [None]:
# Requested accelerator. The strategy-selection cell rewrites this to "GPU"
# when no TPU can be resolved, so re-run this cell before retrying a TPU.
DEVICE = "TPU" # "TPU" or "GPU"

In [None]:
# Adapted from https://www.kaggle.com/code/itsuki9180/birdcall-using-tpu-train/notebook
#
# Select the tf.distribute strategy used for training. When DEVICE is "TPU" we
# try to resolve and initialise a TPU; if either step fails we fall back to the
# default strategy (CPU / single GPU) so that `strategy` is always defined.
tpu = None
if DEVICE == "TPU":
    print("connecting to TPU...")
    try:
        tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
        print('Running on TPU ', tpu.master())
    except ValueError:
        print("Could not connect to TPU")
        tpu = None

    if tpu:
        try:
            print("initializing  TPU ...")
            tf.config.experimental_connect_to_cluster(tpu)
            tf.tpu.experimental.initialize_tpu_system(tpu)
            strategy = tf.distribute.experimental.TPUStrategy(tpu)
            print("TPU initialized")
        except Exception as err:
            # The previous handler (`except _:`) did not name an exception class,
            # so a failure here raised a second error from the handler itself
            # and left `strategy` undefined. Report the cause and fall back.
            print("failed to initialize TPU")
            print(f"{type(err).__name__}: {err}")
            tpu = None
            DEVICE = "GPU"
    else:
        DEVICE = "GPU"

if DEVICE != "TPU":
    print("Using default strategy for CPU and single GPU")
    strategy = tf.distribute.get_strategy()

if DEVICE == "GPU":
    print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))


AUTO     = tf.data.experimental.AUTOTUNE
REPLICAS = strategy.num_replicas_in_sync
print(f'REPLICAS: {REPLICAS}')

connecting to TPU...
Could not connect to TPU
Using default strategy for CPU and single GPU
Num GPUs Available:  1
REPLICAS: 1


Import Data

In [None]:
# Build one sample per audio segment:
#   x_data[k] = (path to the .npy file, segment index inside that file)
#   y_data[k] = bird label, or NO_CALL when the segment is rejected
x_data = []
y_data = []

df = pd.read_csv('/content/drive/MyDrive/no_call_detect/nocalldetection_for_shortaudio_fold0.csv')
for row in tqdm(df.itertuples(False), total=len(df)):
    # The path is the same for every segment of the file, so build it once.
    mel_path = (TRAIN_AUDIO_IMAGES_SAVE_ROOT/row.filename).as_posix() + ".npy"
    mels = np.load(mel_path)

    # extract the calculated call probability (whitespace-separated floats, one per segment)
    call_prob = [float(p) for p in row.nocalldetection.split()]
    if len(call_prob) < len(mels):
        # Previously surfaced as a bare IndexError part-way through the file.
        raise ValueError(
            f"{row.filename}: {len(mels)} segments but only "
            f"{len(call_prob)} no-call probabilities"
        )

    # NOTE(review): species in NO_CALL_IGNORE never keep their label here, so
    # all of their segments become NO_CALL. The section text suggests the
    # opposite intent (exempting them from the filter). Behaviour is left
    # unchanged because altering it changes the class set that the saved
    # classes.npy and the trained model were built with.
    keeps_label = row.primary_label not in NO_CALL_IGNORE

    # for each image, append each audio segment
    for i in range(len(mels)):
        x_data.append((mel_path, i))

        if keeps_label and call_prob[i] >= BIRD_CALL_PROB:
            y_data.append(row.primary_label)
        else:
            y_data.append(NO_CALL)




14852it [00:25, 579.37it/s]


In [None]:
# Both lists must have the same length: one label per (file, segment) sample.
for collection in (x_data, y_data):
    print(len(collection))

144843
144843


Label Encode the Output and Save the Mappings

In [None]:
# Encode string labels as integer class indices.
#
# When resuming from a saved model, the class order MUST be the one the model
# was trained with, so the saved classes are loaded and only `transform` is
# applied. (Calling `fit_transform` after assigning `classes_` refits the
# encoder and silently discards the loaded order.) `transform` raises
# ValueError if the data contains a label missing from the saved classes.
# When training from scratch, the encoder is fitted and the classes are saved.
le = LabelEncoder()
if LOAD_SAVED_MODEL:
    le.classes_ = np.load(MODEL_SAVE_ROOT/"classes.npy")
    y_data = le.transform(y_data)
else:
    y_data = le.fit_transform(y_data)
    MODEL_SAVE_ROOT.mkdir(parents=True, exist_ok=True)
    np.save(MODEL_SAVE_ROOT/"classes.npy", le.classes_)

le_name_mapping = dict(zip(le.classes_.astype(str), le.transform(le.classes_)))
print(le_name_mapping)

{'afrsil1': 0, 'akekee': 1, 'akepa1': 2, 'akiapo': 3, 'amewig': 4, 'aniani': 5, 'apapan': 6, 'arcter': 7, 'barpet': 8, 'bcnher': 9, 'belkin1': 10, 'bkbplo': 11, 'bknsti': 12, 'bkwpet': 13, 'blkfra': 14, 'blknod': 15, 'bongul': 16, 'brant': 17, 'brnnod': 18, 'brnowl': 19, 'brtcur': 20, 'buffle': 21, 'burpar': 22, 'buwtea': 23, 'cacgoo1': 24, 'calqua': 25, 'cangoo': 26, 'canvas': 27, 'caster1': 28, 'categr': 29, 'chbsan': 30, 'chemun': 31, 'chukar': 32, 'cintea': 33, 'comgal1': 34, 'commyn': 35, 'compea': 36, 'comsan': 37, 'comwax': 38, 'dunlin': 39, 'elepai': 40, 'eurwig': 41, 'fragul': 42, 'gadwal': 43, 'gamqua': 44, 'glwgul': 45, 'gnwtea': 46, 'golphe': 47, 'grbher3': 48, 'grefri': 49, 'gresca': 50, 'gryfra': 51, 'gwfgoo': 52, 'hawama': 53, 'hawcoo': 54, 'hawcre': 55, 'hawgoo': 56, 'hawhaw': 57, 'hoomer': 58, 'houfin': 59, 'houspa': 60, 'hudgod': 61, 'iiwi': 62, 'incter1': 63, 'jabwar': 64, 'japqua': 65, 'kalphe': 66, 'kauama': 67, 'laugul': 68, 'lcspet': 69, 'leasan': 70, 'leater1': 

In [None]:
# Peek at the first three (sample, encoded label) pairs.
for sample_idx in range(3):
    print(x_data[sample_idx], y_data[sample_idx], sep="\n")

('/content/audio_images/afrsil1/XC125458.ogg.npy', 0)
0
('/content/audio_images/afrsil1/XC125458.ogg.npy', 1)
0
('/content/audio_images/afrsil1/XC125458.ogg.npy', 2)
82


Split Into Train and Validation Sets

In [None]:
# Fixed seed so the split is identical across kernel restarts. Without it,
# resuming training from a saved checkpoint draws a new split, and samples the
# model already trained on end up in the validation set.
SPLIT_SEED = 42
x_train, x_val, y_train, y_val = train_test_split(
    x_data, y_data, test_size=0.2, stratify=y_data, random_state=SPLIT_SEED
)

Checking Class Counts (classes with 20 or fewer samples)

In [None]:
# Per-class sample counts, indexed by encoded label.
print(type(y_data))
counts = np.bincount(y_data)
print(counts)


# Same counts keyed by class name, then filtered to the rare classes
# (20 samples or fewer).
y = le.inverse_transform(y_data)
print(y)
unique, counts = np.unique(y, return_counts=True)
values = {name: n for name, n in zip(unique, counts)}
d = {name: n for name, n in values.items() if n <= 20}
print(d)

<class 'numpy.ndarray'>
[   93    61   220   153   184    73   486  1153   103  1340   529   847
   796   110   410    54   318   439    26  2203    66    61   120   172
   241  1884  1774    75   979   530    26    26   180    84   980  3210
   161  2070  1626  2365   133  1415   184   934  1763   203  1855    95
   444    82    47   176  1410   143    37   489    33    24   141  3348
  7070    43   395    32   719   278    86    39   407   145   265   545
   669   370   245    27  2174    34    50   497   538  1479 39888  5792
   160  9587   319   411   437    60   226  1509   155    46   286   217
   489  1068  1410   279   255   513   263    39   175   118   410   116
  1623   679  2226  1055    72  1211   271   310   570  5158  1217    36
    85    28  1062   570   576  1267    49   524  4112   196    45   157
    91    45   931    66   448   644]
['afrsil1' 'afrsil1' 'no_call' ... 'zebdov' 'zebdov' 'zebdov']
{}


In [None]:
def normalize(image):
    """Scale an image by 1/255 as float32 and replicate it into three channels.

    Args:
        image: numpy array holding a single image.

    Returns:
        float32 array with a new leading axis of size 3; each slice along that
        axis is the same scaled image.
    """
    scaled = image.astype("float32", copy=False) / 255.0
    return np.stack((scaled, scaled, scaled))

In [None]:

# Memoise file loads per path string. The cache is unbounded (maxsize=None), so
# every distinct file that is loaded stays in memory for the kernel's lifetime.
# NOTE(review): the original author observed the cache "doesn't seem to work";
# the cause is not visible here.
@lru_cache(maxsize=None)
def load_data(im_path):
    """Load the array stored at `im_path` with np.load, caching the result by path."""
    loaded = np.load(im_path)
    return loaded



# https://medium.com/analytics-vidhya/write-your-own-custom-data-generator-for-tensorflow-keras-1252b64e41c3
class CustomDataGen(tf.keras.utils.Sequence):
    """Keras Sequence yielding batches of normalised audio images and soft one-hot targets.

    Args:
        x_data: sequence of (npy_path, segment_index) tuples.
        y_data: integer-encoded labels, aligned with `x_data`.
        batch_size: number of samples per batch.
        shuffle: when True, the sample order is reshuffled at the end of every epoch.
        num_classes: width of the one-hot targets. When None (default) it is
            taken from the notebook-level label encoder `le` at batch time.
    """

    def __init__(self, x_data, y_data, batch_size, shuffle=True, num_classes=None):
        super().__init__()
        self.x_data = x_data
        self.y_data = y_data
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.num_classes = num_classes
        # Batches are drawn through this index list, so shuffling never has to
        # copy or retype x_data / y_data.
        self._order = list(range(len(self.x_data)))

    def on_epoch_end(self):
        """Print the wall-clock time and, if enabled, reshuffle the sample order."""

        # Print Time
        now = datetime.now()
        current_time = now.strftime("%H:%M:%S")
        print("Current Time =", current_time)

        # Shuffle Data at the End of Epoch.
        # The previous implementation assigned the shuffled data to local
        # variables, so the order seen by __getitem__ never changed.
        if self.shuffle:
            random.shuffle(self._order)

    def __getitem__(self, index):
        """Return batch number `index` as (images, targets)."""
        batch_ids = self._order[index * self.batch_size:(index + 1) * self.batch_size]
        x_batch = [self.x_data[i] for i in batch_ids]
        y_batch = [self.y_data[i] for i in batch_ids]

        x_images = np.array(self.__get_data(x_batch))
        y_labels = self.__get_output(y_batch)

        return x_images, y_labels

    def __len__(self):
        """Number of full batches per epoch; a trailing partial batch is dropped."""
        return len(self.x_data) // self.batch_size

    def __get_data(self, x_batch):
        """Load and normalise the image for every (path, segment_index) in `x_batch`."""
        x_im = []
        for file_name, mel_num in x_batch:
            # Joining with an absolute `file_name` yields `file_name` itself,
            # so absolute paths pass through unchanged while relative ones are
            # resolved against the image root.
            mels = load_data(str((TRAIN_AUDIO_IMAGES_SAVE_ROOT/file_name).as_posix()))
            x_im.append(normalize(mels[mel_num]))
        return x_im

    def __get_output(self, y_batch):
        """One-hot encode `y_batch`, scaled so the positive target is 0.99 instead of 1."""
        # num classes from the label encoder unless given explicitly
        num_classes = self.num_classes if self.num_classes is not None else len(le.classes_)
        # Target for 0.99 instead of 1
        return tf.keras.utils.to_categorical(y_batch, num_classes) * 0.99
        




In [None]:
# force channels-first ordering
backend.set_image_data_format('channels_first')
print(backend.image_data_format())

# Input layout under channels_first; the leading 3 matches the three stacked
# copies produced by normalize().
INPUT_SHAPE = (3, 128, 281)
# Derive the output width from the label encoder instead of hard-coding it, so
# the head always matches the targets produced by the data generator.
NUM_CLASSES = len(le.classes_)

# Variables must be created inside the strategy scope for a distributed
# strategy (e.g. TPU) to manage them; with the default strategy the scope is a no-op.
with strategy.scope():
    base_model = tf.keras.applications.resnet_v2.ResNet50V2(
        include_top=False,
        input_shape=INPUT_SHAPE,
        weights='imagenet',
    )
    x = base_model.output
    # https://cv-tricks.com/keras/understand-implement-resnets/
    # Global Average Pooling
    x = GlobalAveragePooling2D()(x)
    d1 = Dense(1024, activation='relu')(x)
    d1 = Dropout(0.5)(d1)
    predictions = Dense(NUM_CLASSES, activation='softmax')(d1)

    model = Model(inputs=base_model.input, outputs=predictions)

    opt = tf.keras.optimizers.Adam(
            learning_rate=1e-3,
            epsilon=1e-07,
          )
    # NOTE(review): a softmax head is paired with binary_crossentropy here;
    # categorical_crossentropy is the usual match for single-label softmax.
    # Left unchanged so results stay comparable with the saved checkpoint.
    model.compile(opt, loss='binary_crossentropy', metrics=['accuracy'])
model.summary()



channels_first
Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/resnet/resnet50v2_weights_tf_dim_ordering_tf_kernels_notop.h5
Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 3, 128, 281  0           []                               
                                )]                                                                
                                                                                                  
 conv1_pad (ZeroPadding2D)      (None, 3, 134, 287)  0           ['input_1[0][0]']                
                                                                                                  
 conv1_conv (Conv2D)            (None, 64, 64, 141)  9472        ['conv1_pad[0][0]']              
                                 

Training Callbacks

In [None]:
# https://www.kaggle.com/code/enukuro/108th-place-solution-birdcall-keras-tpu/notebook
#
# All three callbacks watch the validation loss. Early stopping previously
# monitored the training loss, which keeps improving while the model overfits
# and disagreed with the metric used for checkpointing and LR reduction.
# Its patience (10) is larger than the LR scheduler's (5), so the learning
# rate is reduced at least once before training is stopped.
es = tf.keras.callbacks.EarlyStopping(
              monitor='val_loss',
              verbose=1,
              patience=10)
# Keep only the best model (lowest val_loss) on disk.
sv = tf.keras.callbacks.ModelCheckpoint(
              MODEL_SAVE_ROOT/MODEL_SAVE_NAME,
              monitor='val_loss',
              verbose=1,
              save_best_only=True) #, save_weights_only=True)
# Multiply the learning rate by 0.2 after 5 epochs without val_loss improvement.
reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(
              monitor='val_loss',
              verbose=1,
              factor=0.2,
              patience=5,
              min_delta=0.0001,
              cooldown=1,
              min_lr=1e-7)

In [None]:
# x_train, x_val, y_train, y_val

traingen = CustomDataGen(x_train, y_train, batch_size = BATCH_SIZE, shuffle = True)
# Validation is not shuffled: order does not affect the metrics, and because
# CustomDataGen.__len__ floors to whole batches, a fixed order keeps the set of
# evaluated samples identical from epoch to epoch.
valgen = CustomDataGen(x_val, y_val, batch_size = BATCH_SIZE, shuffle = False)

In [None]:
# Whole batches per epoch (matches CustomDataGen.__len__).
STEPS_PER_EPOCH = len(x_train) // BATCH_SIZE
VALIDATION_STEP = len(x_val) // BATCH_SIZE

# Resume from the checkpoint written by the ModelCheckpoint callback. The path
# is built from MODEL_SAVE_NAME so loading and saving cannot drift apart.
saved_model_path = MODEL_SAVE_ROOT/MODEL_SAVE_NAME
if LOAD_SAVED_MODEL:
    if saved_model_path.exists():
        # Load inside the strategy scope so a distributed strategy manages the
        # restored variables; a no-op under the default strategy.
        with strategy.scope():
            model = keras.models.load_model(saved_model_path)
    else:
        print(f"No saved model at {saved_model_path}; training the freshly built model")

history = model.fit(
    traingen,
    epochs = EPOCHS,
    steps_per_epoch= STEPS_PER_EPOCH,
    callbacks = [es, sv, reduce_lr],
    validation_data=valgen,
    validation_steps = VALIDATION_STEP
)


# Persist the per-epoch metrics next to the model for later plotting.
with open(MODEL_SAVE_ROOT/'trainHistoryDict', 'wb') as file_pi:
    pickle.dump(history.history, file_pi)

Epoch 1/100

Epoch 1: val_loss improved from inf to 0.01273, saving model to /content/drive/MyDrive/model_save/BirdClef2022-ResNet50V2_model.h5
Current Time = 17:34:19
Epoch 2/100

Epoch 2: val_loss improved from 0.01273 to 0.00952, saving model to /content/drive/MyDrive/model_save/BirdClef2022-ResNet50V2_model.h5
Current Time = 18:07:28
Epoch 3/100

Epoch 3: val_loss did not improve from 0.00952
Current Time = 18:40:29
Epoch 4/100

In [None]:
# ---- display history ----
# list all data in history
print(history.history.keys())


def _save_history_plot(metric_key, ylabel, out_file):
    """Plot a training metric and its `val_` counterpart per epoch and save the figure."""
    fig, ax = plt.subplots()
    ax.plot(history.history[metric_key])
    ax.plot(history.history[f'val_{metric_key}'])
    ax.set_ylabel(ylabel)
    ax.set_xlabel('epoch')
    ax.legend(['train', 'test'], loc='upper left')
    fig.savefig(out_file)
    plt.close(fig)  # release the figure


# The history key follows the metric name given to compile(): 'accuracy' for
# metrics=['accuracy'], 'acc' for the older short form. Indexing 'acc'
# unconditionally raises KeyError with the former.
acc_key = 'accuracy' if 'accuracy' in history.history else 'acc'

# NOTE(review): the output file names mention vgg16/augmentation although this
# notebook trains ResNet50V2; names are kept so existing consumers still find them.
# summarize history for accuracy
_save_history_plot(acc_key, 'accuracy', 'train_test_accuracy_vgg16_augmentation.png')
# summarize history for loss (binary cross-entropy)
_save_history_plot('loss', 'binary cross-entropy', 'train_test_loss_vgg16_augmentation.png')