Training Jupyter Notebook

Imports

In [1]:
# Execution-environment switch: True selects the Google Colab branches
# (Drive mount, /content paths); False selects the local Windows paths.
COLAB = False
# On Windows Run in ENSC_413 Folder

In [2]:
if COLAB:
    # Colab only: mount Google Drive at /content/drive so the cells below can
    # read files under /content/drive/MyDrive.
    from google.colab import drive
    drive.mount('/content/drive')

In [3]:
if COLAB:
    # Colab only: copy the spectrogram archive from the mounted Drive into
    # /content/. The two commented commands are alternative sources (the
    # unpacked folder and a zip export).
    # ! cp -vr /content/drive/MyDrive/audio_images/ /content/audio_images
    # ! cp -vr /content/drive/MyDrive/audio_images-20220324T215740Z-001.zip /content/
    ! cp -vr /content/drive/MyDrive/audio_images.tar.gz /content/

In [4]:
if COLAB:
    ! ls -alt /content/
    ! mkdir /content/audio_images
    ! tar -zxvf audio_images.tar.gz 
    # ! unzip /content/audio_images-20220324T215740Z-001.zip


In [5]:
if COLAB:
    # Colab only: report the disk usage of the extracted spectrogram folder.
    ! du -h /content/audio_images/

Paths and Imports

In [5]:
import pandas as pd, numpy as np, gc
import librosa as lb
import librosa.display as lbd

# from kaggle_datasets import KaggleDatasets
import tensorflow as tf, re, math
import tensorflow.keras.backend as K
from tensorflow.keras import layers
from keras.layers import Activation, Dropout, Flatten, Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from keras.models import Model
from keras.layers import Dense, GlobalAveragePooling2D
from keras import regularizers
# force a channel ordering
from keras import backend
from tensorflow import keras


import matplotlib.pyplot as plt

from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

from pathlib import Path
from tqdm import tqdm
from functools import lru_cache

import json
import random
from datetime import datetime

import pickle
import copy

In [19]:
# ---- Environment-dependent settings (storage locations and batch size) ----
if COLAB:
    # Spectrogram arrays are read from the Colab VM disk.
    # Alternative (commented out in the original cell):
    #   Path("/content/drive/MyDrive/audio_images")
    TRAIN_AUDIO_IMAGES_SAVE_ROOT = Path("/content/audio_images")
    MODEL_SAVE_ROOT = Path("/content/drive/MyDrive/model_save")
    MODEL_SAVE_NAME = 'BirdClef2022-ResNet50V2_model.h5'
    BATCH_SIZE = 128
else:
    TRAIN_AUDIO_IMAGES_SAVE_ROOT = Path(r"C:\Users\xuewi\Desktop\SFU\ENSC_413\audio_images")
    MODEL_SAVE_ROOT = Path(r"C:\Users\xuewi\Desktop\SFU\ENSC_413\BirdCLEF2022-Project\model_save")
    # Alternative checkpoint names kept from the original cell:
    #   'Xception_sigmoid_v1_train.h5'
    #   'EfficientNetB5_v1.h5'
    MODEL_SAVE_NAME = 'ResNet50V2_model_v4_mixup.h5'
    # Alternative value kept from the original cell: 96
    BATCH_SIZE = 64  # Resnet

# ---- Settings shared by both environments ----
# When True, the training cell loads the checkpoint at
# MODEL_SAVE_ROOT/MODEL_SAVE_NAME before calling fit.
LOAD_SAVED_MODEL = False

# Threshold for no-call detector
BIRD_CALL_PROB = 0.5

# Label used for segments without a bird call
NO_CALL = "no_call"

# NUM_FOLDS = 5

EPOCHS = 35

# Fraction of samples held out for validation
TEST_SET_SIZE = 0.33

Some birds have only a few training samples, and no-call filtering would reduce their sample counts even further

In [7]:
# Species exempt from no-call filtering: they have too few samples to lose
# any more of them to the no-call detector.
NO_CALL_IGNORE = [
    'akikik',
    'brnboo',
    'bubsan',
    'bulpet',
    'coopet',
    'crehon',
    'ercfra',
    'hawpet1',
    'layalb',
    'lessca',
    'magpet1',
    'mauala',
    'pomjae',
    'puaioh',
    'shtsan',
]

Connect To TPU

In [8]:
# Preferred accelerator. The connection cell that follows switches this to
# "GPU" when no TPU cluster can be resolved.
DEVICE = "TPU" # "TPU" or "GPU"

In [9]:
# https://www.kaggle.com/code/itsuki9180/birdcall-using-tpu-train/notebook
# Select a tf.distribute strategy: try the TPU first, otherwise fall back to
# the default strategy (CPU / single GPU).
if DEVICE == "TPU":
    print("connecting to TPU...")
    try:
        tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
        print('Running on TPU ', tpu.master())
    except ValueError:
        print("Could not connect to TPU")
        tpu = None

    if tpu:
        try:
            print("initializing  TPU ...")
            tf.config.experimental_connect_to_cluster(tpu)
            tf.tpu.experimental.initialize_tpu_system(tpu)
            strategy = tf.distribute.experimental.TPUStrategy(tpu)
            print("TPU initialized")
        except Exception as tpu_error:
            # The original `except _:` named an undefined exception class, so
            # a failed initialisation raised a second error instead of being
            # reported. Report it and fall back so `strategy` is always set.
            print("failed to initialize TPU")
            print(tpu_error)
            DEVICE = "GPU"
    else:
        DEVICE = "GPU"

if DEVICE != "TPU":
    print("Using default strategy for CPU and single GPU")
    strategy = tf.distribute.get_strategy()

if DEVICE == "GPU":
    print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))


AUTO     = tf.data.experimental.AUTOTUNE
REPLICAS = strategy.num_replicas_in_sync
print(f'REPLICAS: {REPLICAS}')

connecting to TPU...
Could not connect to TPU
Using default strategy for CPU and single GPU
Num GPUs Available:  1
REPLICAS: 1


Import Data

In [10]:
# Parallel lists with one entry per audio segment:
#   x_data      -> (path of the .npy file, index of the segment inside it)
#   y_data      -> primary label of the recording the segment belongs to
#   y_call_prob -> call probability read from the no-call detection CSV
x_data = []
y_data = []
y_call_prob = []

df = pd.read_csv(
    '/content/drive/MyDrive/no_call_detect/nocalldetection_for_shortaudio_fold0.csv'
    if COLAB else
    r'C:\Users\xuewi\Desktop\SFU\ENSC_413\BirdCLEF2022-Project\no_call_detect\nocalldetection_for_shortaudio_fold0.csv'
)

for row in tqdm(df.itertuples(False)):
    # Path of the array file holding every segment of this recording.
    npy_path = str((TRAIN_AUDIO_IMAGES_SAVE_ROOT/row.filename).as_posix() + ".npy")
    mels = np.load(npy_path)

    # The CSV column holds whitespace-separated probabilities.
    call_prob = [float(token) for token in row.nocalldetection.split()]

    # Register every segment of the file. Relabelling to NO_CALL based on
    # BIRD_CALL_PROB / NO_CALL_IGNORE is disabled here: each segment keeps the
    # primary label and the probability is stored alongside it instead.
    for i in range(len(mels)):
        x_data.append((npy_path, i))
        y_call_prob.append(call_prob[i])
        y_data.append(row.primary_label)




14852it [01:26, 172.41it/s]


In [11]:
# The three parallel lists should report the same length.
for parallel_list in (x_data, y_data, y_call_prob):
    print(len(parallel_list))

144843
144843
144843


Label Encode the Output and Save the Mappings

In [12]:
# True: reuse the class list saved by a previous run, so label indices stay
# identical to those a saved model was trained with.
# False: derive the class list from the current data and save it.
LOAD_LABELS = True

le = LabelEncoder()
if LOAD_LABELS:
    le.classes_ = np.load(MODEL_SAVE_ROOT/"classes_only_birds.npy")
    # transform (not fit_transform): fitting again would overwrite the classes
    # just loaded and make the load a no-op. Labels missing from the saved
    # list now raise instead of silently shifting the indices.
    y_label = le.transform(y_data)
else:
    y_label = le.fit_transform(y_data)
    np.save(MODEL_SAVE_ROOT/"classes_only_birds.npy", le.classes_)

# Readable mapping: label name -> encoded integer
le_name_mapping = dict(zip(le.classes_.astype(str), le.transform(le.classes_)))
print(le_name_mapping)

{'afrsil1': 0, 'akekee': 1, 'akepa1': 2, 'akiapo': 3, 'akikik': 4, 'amewig': 5, 'aniani': 6, 'apapan': 7, 'arcter': 8, 'barpet': 9, 'bcnher': 10, 'belkin1': 11, 'bkbplo': 12, 'bknsti': 13, 'bkwpet': 14, 'blkfra': 15, 'blknod': 16, 'bongul': 17, 'brant': 18, 'brnboo': 19, 'brnnod': 20, 'brnowl': 21, 'brtcur': 22, 'bubsan': 23, 'buffle': 24, 'bulpet': 25, 'burpar': 26, 'buwtea': 27, 'cacgoo1': 28, 'calqua': 29, 'cangoo': 30, 'canvas': 31, 'caster1': 32, 'categr': 33, 'chbsan': 34, 'chemun': 35, 'chukar': 36, 'cintea': 37, 'comgal1': 38, 'commyn': 39, 'compea': 40, 'comsan': 41, 'comwax': 42, 'coopet': 43, 'crehon': 44, 'dunlin': 45, 'elepai': 46, 'ercfra': 47, 'eurwig': 48, 'fragul': 49, 'gadwal': 50, 'gamqua': 51, 'glwgul': 52, 'gnwtea': 53, 'golphe': 54, 'grbher3': 55, 'grefri': 56, 'gresca': 57, 'gryfra': 58, 'gwfgoo': 59, 'hawama': 60, 'hawcoo': 61, 'hawcre': 62, 'hawgoo': 63, 'hawhaw': 64, 'hawpet1': 65, 'hoomer': 66, 'houfin': 67, 'houspa': 68, 'hudgod': 69, 'iiwi': 70, 'incter1': 

In [13]:
# Show the first three samples: (file, segment) tuple, encoded label, call probability.
for i in range(3):
    print(x_data[i], y_label[i], y_call_prob[i], sep="\n")
    

('C:/Users/xuewi/Desktop/SFU/ENSC_413/audio_images/afrsil1/XC125458.ogg.npy', 0)
0
0.8829130530357361
('C:/Users/xuewi/Desktop/SFU/ENSC_413/audio_images/afrsil1/XC125458.ogg.npy', 1)
0
0.8767924308776855
('C:/Users/xuewi/Desktop/SFU/ENSC_413/audio_images/afrsil1/XC125458.ogg.npy', 2)
0
0.48533734679222107


Split Into Train and Validation Sets

In [14]:
# Fixed seed so the train/validation partition is the same after every kernel
# restart. Without it, resuming from a saved checkpoint would validate on
# samples the model may already have been trained on.
SPLIT_SEED = 42

# Stratified split of samples, encoded labels and call probabilities.
x_train, x_val, y_train, y_val, y_call_prob_train, y_call_prob_val = train_test_split(
    x_data,
    y_label,
    y_call_prob,
    test_size=TEST_SET_SIZE,
    stratify=y_data,
    random_state=SPLIT_SEED,
)

In [15]:
# Sanity check of the soft-label construction: one-hot rows of the first five
# validation labels, each scaled by its call probability.
#np.set_printoptions(threshold=sys.maxsize)
print(y_val[:5])
print(type(y_val))
y_prob = np.asarray(y_call_prob_val)

values = tf.keras.utils.to_categorical(y_val[:5], 152)
for i in range(len(values)):
    values[i] *= y_call_prob_val[i]

print(y_call_prob_val[:5])
print(values)

[94 96 68 50 39]
<class 'numpy.ndarray'>
[0.9324221611022949, 0.7658768892288208, 0.9636142253875732, 0.9925619959831238, 0.4727885127067566]
[[0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.   

Checking Train / Val Split

In [16]:
# Samples per encoded class index.
print(type(y_label))
counts = np.bincount(y_label)
print(counts)

# Same counts keyed by label name, then narrowed to the rare classes
# (20 samples or fewer).
y = le.inverse_transform(y_label)
print(y)
unique, counts = np.unique(y, return_counts=True)
values = {name: count for name, count in zip(unique, counts)}
d = {name: count for name, count in values.items() if count <= 20}
print(d)

<class 'numpy.ndarray'>
[  112    71   224   159    22   293    95   540  1561   210  3303   888
  1205  1344   120   567   104   459   996   100    62  5511   106     8
   107    70   144   291   381  2209  3031   117  1261  1265    67    34
   279   114  1448  6132   267  3624  2186     7    25  3485   145    17
  2092   211  1340  2172   310  2663    99   763   139   150   218  1985
   161    64   504    50    55    35   182  3582  7985    49   455    89
   881   335   136    57   509     9   202   380   648    35   831   466
   361   120   395  3822   139    18    50   732   652  1887  6747   260
 10632   561   695   528    69   245  1946   244    50   383   311   564
  1566  1959    54    10   309   316   848   364   225   288   283   490
   156  2093  1312  2552  1472   136  1377   490   419  1046     7  5796
  1601   127   107    50  1311   818   727  1515    82   571  4747   258
   231   236   149   120  1258    89   494   765]
['afrsil1' 'afrsil1' 'afrsil1' ... 'zebdov' 'zebdo

In [17]:
def normalize(image):
    """Scale an image by 1/255 as float32 and replicate it into three channels.

    Args:
        image: NumPy array of pixel values.

    Returns:
        A float32 array with a new leading axis of length 3; each slice along
        that axis is the same scaled image.
    """
    scaled = image.astype("float32", copy=False) / 255.0
    return np.stack([scaled] * 3)

In [92]:

# cache file loads?, doesn't seem to work
#@lru_cache(maxsize=None)
def load_data(im_path):
    """Load and return the array stored in the ``.npy`` file at ``im_path``."""
    loaded = np.load(im_path)
    return loaded



# https://medium.com/analytics-vidhya/write-your-own-custom-data-generator-for-tensorflow-keras-1252b64e41c3
class CustomDataGen(tf.keras.utils.Sequence):
    """Keras ``Sequence`` that serves batches of mel images with soft labels.

    Each sample is a ``(file_path, segment_index)`` tuple. The file is loaded
    with ``load_data`` and the selected segment is passed through
    ``normalize``. Each target is a one-hot row (width ``len(le.classes_)``)
    multiplied by the sample's call probability. With ``mixup=True`` every
    sample is blended with a randomly chosen sample of the same batch.

    Relies on the module-level names ``TRAIN_AUDIO_IMAGES_SAVE_ROOT``,
    ``load_data``, ``normalize`` and ``le``.

    Args:
        x_data: sequence of ``(file_path, segment_index)`` tuples.
        y_data: sequence of encoded integer labels, parallel to ``x_data``.
        y_call_probs: sequence of call probabilities, parallel to ``x_data``.
        batch_size: number of samples per batch.
        shuffle: reshuffle the samples at the end of every epoch.
        mixup: apply mixup augmentation to every batch.
    """

    def __init__(self, x_data, y_data, y_call_probs, batch_size, shuffle=True, mixup=False):
        # Lets the Keras base class set up its own state (newer Keras versions
        # warn when this call is missing).
        super().__init__()
        self.x_data = x_data
        self.y_data = y_data
        self.y_call_probs = y_call_probs
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.mixup = mixup

    def on_epoch_end(self):
        """Print the wall-clock time and, if enabled, reshuffle the samples."""
        print("Current Time =", datetime.now().strftime("%H:%M:%S"))

        # Shuffle the three parallel sequences together so they stay aligned.
        # The emptiness guard avoids an unpacking error from zip(*[]).
        if self.shuffle and len(self.x_data) > 0:
            combined = list(zip(self.x_data, self.y_data, self.y_call_probs))
            random.shuffle(combined)
            self.x_data, self.y_data, self.y_call_probs = zip(*combined)

    def __getitem__(self, index):
        """Return the ``(images, labels)`` pair for batch number ``index``."""
        batch = slice(index * self.batch_size, (index + 1) * self.batch_size)

        x_images = np.array(self.__get_data(self.x_data[batch]))
        y_labels = self.__get_output(self.y_data[batch], self.y_call_probs[batch])

        if self.mixup:
            x_images, y_labels = self.__mixup(x_images, y_labels)
        return x_images, y_labels

    def __len__(self):
        """Number of full batches per epoch (a trailing partial batch is dropped)."""
        return len(self.x_data) // self.batch_size

    def __get_data(self, x_batch):
        """Load and normalize the segment referenced by each tuple in ``x_batch``."""
        x_im = []
        for file_name, mel_num in x_batch:
            mels = load_data(str((TRAIN_AUDIO_IMAGES_SAVE_ROOT/file_name).as_posix()))
            x_im.append(normalize(mels[mel_num]))
        return x_im

    def __get_output(self, y_batch, y_call_batch):
        """Build one-hot targets scaled row-wise by the call probabilities."""
        # Number of classes comes from the label encoder.
        num_classes = len(le.classes_)
        values = tf.keras.utils.to_categorical(y_batch, num_classes)

        # Broadcast one probability per row. Multiplying in place keeps the
        # dtype produced by to_categorical.
        values *= np.asarray(y_call_batch, dtype=values.dtype)[:, np.newaxis]
        return values

    # Referenced from https://www.dlology.com/blog/how-to-do-mixup-training-from-image-files-in-keras/
    def __mixup(self, x_images, y_labels, alpha=0.2):
        """Blend every sample with a random sample of the same batch.

        For each sample a weight is drawn from ``Beta(alpha, alpha)``; images
        and labels are mixed with that weight. The partner may be the sample
        itself. The inputs are not modified; mixed copies are returned.
        """
        # Use the real batch length, not self.batch_size, so a short batch can
        # never produce an out-of-range partner index.
        batch_len = len(x_images)

        mixed_images = np.array(x_images, copy=True)
        mixed_labels = np.array(y_labels, copy=True)

        for i in range(batch_len):
            partner = random.randint(0, batch_len - 1)

            lam = np.random.beta(alpha, alpha, 1)
            image_weight = lam.reshape(1, 1, 1)  # broadcasts over the three image axes
            label_weight = lam.reshape(1)

            mixed_images[i] = x_images[i] * image_weight + x_images[partner] * (1 - image_weight)
            mixed_labels[i] = y_labels[i] * label_weight + y_labels[partner] * (1 - label_weight)

        return mixed_images, mixed_labels




In [93]:
# force channels-first ordering
backend.set_image_data_format('channels_first')
print(backend.image_data_format())

# Output width follows the label encoder so it always matches the one-hot
# targets built by CustomDataGen (previously hard-coded as 152).
NUM_CLASSES = len(le.classes_)

# Build and compile inside the distribution strategy's scope so the strategy
# selected earlier (TPU or default) is actually applied to the variables.
with strategy.scope():
    base_model = tf.keras.applications.resnet_v2.ResNet50V2(
        include_top=False,
        input_shape=(3, 128, 281),
        weights='imagenet',
    )

    # Classification head on the ImageNet-pretrained backbone.
    # https://cv-tricks.com/keras/understand-implement-resnets/
    x = GlobalAveragePooling2D()(base_model.output)
    d1 = Dense(1024, activation='sigmoid', kernel_regularizer=regularizers.l2(0.001))(x)
    d1 = Dropout(0.5)(d1)
    # Independent sigmoid per class, paired with binary cross-entropy below.
    predictions = Dense(NUM_CLASSES, activation='sigmoid')(d1)

    model = Model(inputs=base_model.input, outputs=predictions)

    opt = tf.keras.optimizers.Adam(
        learning_rate=1e-3,
        epsilon=1e-07,
    )
    model.compile(opt, loss='binary_crossentropy', metrics=['accuracy'])

model.summary()



channels_first
Model: "model_14"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_15 (InputLayer)          [(None, 3, 128, 281  0           []                               
                                )]                                                                
                                                                                                  
 conv1_pad (ZeroPadding2D)      (None, 3, 134, 287)  0           ['input_15[0][0]']               
                                                                                                  
 conv1_conv (Conv2D)            (None, 64, 64, 141)  9472        ['conv1_pad[0][0]']              
                                                                                                  
 pool1_pad (ZeroPadding2D)      (None, 64, 66, 143)  0           ['conv1_con

Training Callbacks

In [94]:
# https://www.kaggle.com/code/enukuro/108th-place-solution-birdcall-keras-tpu/notebook
# All three callbacks watch the validation loss.
MONITORED_METRIC = 'val_loss'

# Stop training after 8 epochs without improvement.
es = tf.keras.callbacks.EarlyStopping(monitor=MONITORED_METRIC, patience=8, verbose=1)

# Keep only the best model seen so far on disk.
checkpoint_path = MODEL_SAVE_ROOT/MODEL_SAVE_NAME
sv = tf.keras.callbacks.ModelCheckpoint(
    checkpoint_path,
    monitor=MONITORED_METRIC,
    save_best_only=True,  # , save_weights_only=True)
    verbose=1,
)

# Multiply the learning rate by 0.2 after 5 stagnant epochs, never below 1e-7.
reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(
    monitor=MONITORED_METRIC,
    factor=0.2,
    patience=5,
    min_delta=0.0001,
    cooldown=1,
    min_lr=1e-7,
    verbose=1,
)

In [95]:
# x_train, x_val, y_train, y_val

# Training batches: reshuffled every epoch, with mixup augmentation.
traingen = CustomDataGen(
    x_data=x_train,
    y_data=y_train,
    y_call_probs=y_call_prob_train,
    batch_size=BATCH_SIZE,
    shuffle=True,
    mixup=True,
)

# Validation batches: reshuffled every epoch, no mixup.
valgen = CustomDataGen(
    x_data=x_val,
    y_data=y_val,
    y_call_probs=y_call_prob_val,
    batch_size=BATCH_SIZE,
    shuffle=True,
    mixup=False,
)

In [96]:
# Full batches available per epoch for each split.
STEPS_PER_EPOCH = len(x_train) // BATCH_SIZE
VALIDATION_STEP = len(x_val) // BATCH_SIZE

# Optionally continue from the checkpoint written by a previous run.
if LOAD_SAVED_MODEL:
    model = keras.models.load_model(MODEL_SAVE_ROOT/MODEL_SAVE_NAME)

fit_options = dict(
    epochs=EPOCHS,
    steps_per_epoch=STEPS_PER_EPOCH,
    callbacks=[es, sv, reduce_lr],
    validation_data=valgen,
    validation_steps=VALIDATION_STEP,
)
history = model.fit(traingen, **fit_options)

# Persist the per-epoch metrics so they can be plotted without retraining.
history_path = MODEL_SAVE_ROOT/'trainHistoryDict'
with open(history_path, 'wb') as history_file:
    pickle.dump(history.history, history_file)

Epoch 1/35

Epoch 00001: val_loss improved from inf to 0.02493, saving model to C:\Users\xuewi\Desktop\SFU\ENSC_413\BirdCLEF2022-Project\model_save\ResNet50V2_model_v4_mixup.h5


  layer_config = serialize_layer_fn(layer)


Current Time = 11:02:15
Epoch 2/35

Epoch 00002: val_loss improved from 0.02493 to 0.02176, saving model to C:\Users\xuewi\Desktop\SFU\ENSC_413\BirdCLEF2022-Project\model_save\ResNet50V2_model_v4_mixup.h5
Current Time = 11:10:58
Epoch 3/35

Epoch 00003: val_loss improved from 0.02176 to 0.02062, saving model to C:\Users\xuewi\Desktop\SFU\ENSC_413\BirdCLEF2022-Project\model_save\ResNet50V2_model_v4_mixup.h5
Current Time = 11:19:30
Epoch 4/35

In [None]:
# ---- display history ----
# list all data in history
print(history.history.keys())

# One figure per metric: (train key, validation key, y-axis text, output file).
history_plots = (
    ('accuracy', 'val_accuracy', 'accuracy',
     'train_test_accuracy_ResNet50V2_model_v4_mixup.png'),
    ('loss', 'val_loss', 'binary cross-entropy',
     'train_test_loss_ResNet50V2_model_v4_mixup.png'),
)

for train_key, val_key, axis_text, png_name in history_plots:
    plt.plot(history.history[train_key])
    plt.plot(history.history[val_key])
    plt.ylabel(axis_text)
    plt.xlabel('epoch')
    plt.legend(['train', 'test'], loc='upper left')
    plt.savefig(png_name)
    plt.clf()  # clear the figure before drawing the next metric

dict_keys(['loss', 'accuracy', 'val_loss', 'val_accuracy', 'lr'])


<Figure size 432x288 with 0 Axes>