In [1]:
import os

import tensorflow as tf
import tensorflow_io as tfio
import tensorflow_hub as hub
import pandas as pd

# Load the YAMNet model

In [2]:
# Hub handle of the YAMNet model. It is kept in a variable because the same
# handle is used again when the serving model is assembled near the end of
# the notebook.
yamnet_model_handle = 'https://www.kaggle.com/models/google/yamnet/TensorFlow2/yamnet/1'
# Loaded once here; later cells call `yamnet_model(waveform)` directly and use
# the second of its three outputs as embeddings.
yamnet_model = hub.load(yamnet_model_handle)

# Create dataset

In [3]:
def load_mp3(filename):
    """Decode an MP3 file and return its waveform resampled to a rate of 16000.

    Args:
        filename: Path of the MP3 file. The notebook passes both plain Python
            strings and string tensors coming from a `tf.data` pipeline.

    Returns:
        The decoded audio averaged over axis 1 (presumably the channel axis of
        the decoded samples -- confirm against the tensorflow-io docs) and
        resampled from the file's own rate to 16000.
    """
    raw_bytes = tf.io.read_file(filename)
    decoded = tfio.audio.decode_mp3(raw_bytes)
    # Collapse axis 1 so one value per sample position remains.
    mono = tf.reduce_mean(decoded, axis=1)

    # The file's own sample rate is read through AudioIOTensor and handed to
    # the resampler as int64.
    source_rate = tf.cast(
        tfio.audio.AudioIOTensor(filename, dtype=tf.float32).rate,
        dtype=tf.int64,
    )
    return tfio.audio.resample(mono, rate_in=source_rate, rate_out=16000)

In [9]:
data_directory = os.path.join('..', 'data', 'xeno-canto')
bird_species_df = pd.read_csv('../data/selected_species.csv', sep=',')

# class_names[i] is the display name of the class whose integer label is i.
class_names = []

# File paths (X_*) and integer class labels (y_*) for each split.
X_train, y_train = [], []
X_val, y_val = [], []
X_test, y_test = [], []

# Split sub-directory name -> the (paths, labels) lists it fills.
splits = {
    'train': (X_train, y_train),
    'val': (X_val, y_val),
    'test': (X_test, y_test),
}

# os.listdir() returns entries in arbitrary order. Sorting makes the
# label <-> class name mapping identical on every run and every machine, which
# matters because a saved model only stores the integer labels.
train_root = os.path.join(data_directory, 'train')
species_dirs = sorted(
    entry for entry in os.listdir(train_root)
    if not entry.startswith('.') and os.path.isdir(os.path.join(train_root, entry))
)

for class_id, dir_name in enumerate(species_dirs):
    # Directory names are matched against the 'Latin name' column; the class
    # is displayed under the corresponding 'Polish name'.
    polish_names = bird_species_df.loc[
        bird_species_df['Latin name'] == dir_name, 'Polish name'
    ]
    if polish_names.empty:
        # Keep going with the directory name instead of storing an empty
        # Series as the class name.
        print(f'No entry for "{dir_name}" in selected_species.csv; '
              f'using the directory name as class name')
        class_name = dir_name
    else:
        # .iloc[0] yields a scalar even if the species is listed twice.
        class_name = polish_names.iloc[0]
    class_names.append(class_name)

    for split_name, (paths, labels) in splits.items():
        split_dir = os.path.join(data_directory, split_name, dir_name)
        for file_name in sorted(os.listdir(split_dir)):
            # Skip hidden files (e.g. OS metadata), consistent with the
            # hidden-directory filter above.
            if file_name.startswith('.'):
                continue
            paths.append(os.path.join(split_dir, file_name))
            labels.append(class_id)


In [5]:
# delete recordings that cannot be decoded
def drop_undecodable(file_paths, labels, delete_files=True):
    """Filter out recordings that `load_mp3` cannot decode.

    Args:
        file_paths: List of audio file paths.
        labels: List of class labels, aligned with `file_paths`.
        delete_files: When True, an undecodable file is also removed from
            disk.

    Returns:
        A `(kept_paths, kept_labels)` pair containing only the entries whose
        file was decoded successfully, in their original order.
    """
    kept_paths, kept_labels = [], []
    for file_path, label in zip(file_paths, labels):
        try:
            load_mp3(file_path)
        # `Exception` rather than a bare `except:` so that interrupting the
        # kernel (KeyboardInterrupt) does not delete the file being decoded.
        except Exception as error:
            print(f'Dropping {file_path}: {type(error).__name__}')
            # The file may already be gone (e.g. the cell was run before).
            if delete_files and os.path.exists(file_path):
                os.remove(file_path)
        else:
            kept_paths.append(file_path)
            kept_labels.append(label)
    return kept_paths, kept_labels


# Rebind the lists so that later cells never see a path that was just removed.
X_train, y_train = drop_undecodable(X_train, y_train)
X_val, y_val = drop_undecodable(X_val, y_val)
X_test, y_test = drop_undecodable(X_test, y_test)









In [10]:
# Show the class display names; a name's position in this list is the integer
# label assigned to its files when the file lists were built.
print(class_names)

['skowronek', 'krzyżówka', 'gęś białoczelna', 'gęś zbożowa', 'jerzyk', 'mewa śmieszka', 'gołąb miejski', 'grzywacz', 'gawron', 'kawka', 'kukułka', 'modraszka', 'oknówka', 'łyska', 'sójka', 'słowik szary', 'słowik rdzawy', 'sroka', 'brzegówka', 'kowalik']


In [11]:
SHUFFLE_SEED = 42
# Number of embedding frames held in the training shuffle buffer.
FRAME_SHUFFLE_BUFFER = 10_000


def load_mp3_for_map(filename, label):
    """Decode `filename` with `load_mp3` and pass the label through unchanged."""
    return load_mp3(filename), label


def extract_embeddings(wav_data, label):
    """Turn one waveform into per-frame YAMNet embeddings with matching labels.

    Args:
        wav_data: Waveform as produced by `load_mp3`.
        label: Class label of the recording.

    Returns:
        `(embeddings, labels)`, where `embeddings` is the second output of
        `yamnet_model` and `labels` repeats `label` once per embedding row.
    """
    _, embeddings, _ = yamnet_model(wav_data)
    num_embeddings = tf.shape(embeddings)[0]
    return (embeddings,
            tf.repeat(label, num_embeddings))


def make_embedding_dataset(file_paths, labels, shuffle_files=False):
    """Build a dataset of single `(embedding, label)` pairs from audio files.

    Args:
        file_paths: List of audio file paths.
        labels: List of integer class labels aligned with `file_paths`.
        shuffle_files: When True, the file order is shuffled once (fixed seed,
            same order on every iteration) before any audio is decoded.

    Returns:
        An unbatched `tf.data.Dataset` yielding one embedding frame and its
        label per element.
    """
    dataset = tf.data.Dataset.from_tensor_slices((file_paths, labels))
    if shuffle_files:
        # The file lists are built one class directory at a time, so without
        # this step consecutive elements all share the same label. Shuffling
        # file paths is cheap, so the buffer can cover the whole list.
        dataset = dataset.shuffle(max(len(file_paths), 1),
                                  seed=SHUFFLE_SEED,
                                  reshuffle_each_iteration=False)
    # The functions are passed directly; wrapping them in lambdas adds nothing
    # and triggers a TensorFlow warning about lambda handling.
    dataset = dataset.map(load_mp3_for_map)
    return dataset.map(extract_embeddings).unbatch()


train_data = make_embedding_dataset(X_train, y_train, shuffle_files=True)
val_data = make_embedding_dataset(X_val, y_val)
test_data = make_embedding_dataset(X_test, y_test)

# Each file contributes many frames, so a small frame-level buffer only mixes
# a handful of recordings; use a larger one on top of the file-level shuffle.
train_data = train_data.cache().shuffle(FRAME_SHUFFLE_BUFFER).batch(32).prefetch(tf.data.AUTOTUNE)
val_data = val_data.cache().batch(32).prefetch(tf.data.AUTOTUNE)
test_data = test_data.cache().batch(32).prefetch(tf.data.AUTOTUNE)

Instructions for updating:
Lambda fuctions will be no more assumed to be used in the statement where they are used, or at least in the same block. https://github.com/tensorflow/tensorflow/issues/56089


Instructions for updating:
Lambda fuctions will be no more assumed to be used in the statement where they are used, or at least in the same block. https://github.com/tensorflow/tensorflow/issues/56089














In [12]:
# Check the structure of a training batch: (embeddings, labels).
print(train_data.element_spec)

(TensorSpec(shape=(None, 1024), dtype=tf.float32, name=None), TensorSpec(shape=(None,), dtype=tf.int32, name=None))


# Train model

In [13]:
# Classification head trained on the YAMNet embeddings.
my_model = tf.keras.Sequential([
    # `shape` has to be a tuple: `(1024)` is just the integer 1024, while
    # `(1024,)` is the one-element tuple describing a 1024-long vector.
    tf.keras.layers.Input(shape=(1024,), dtype=tf.float32,
                          name='input_embedding'),
    tf.keras.layers.Dense(512, activation='relu'),
    # One output per class, without an activation: the model is compiled with
    # a loss that uses from_logits=True.
    tf.keras.layers.Dense(len(class_names))
], name='my_model')

my_model.summary()

Model: "my_model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 512)               524800    
                                                                 
 dense_1 (Dense)             (None, 20)                10260     
                                                                 
Total params: 535,060
Trainable params: 535,060
Non-trainable params: 0
_________________________________________________________________


In [14]:
# The last layer emits raw logits and the labels are integer class ids, hence
# sparse categorical cross-entropy with from_logits=True.
my_model.compile(
    optimizer="adam",
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=['accuracy'],
)

# Stop training once validation accuracy has gone 3 epochs without improving,
# and roll the model back to the weights of its best epoch.
callback = tf.keras.callbacks.EarlyStopping(
    monitor='val_accuracy',
    restore_best_weights=True,
    patience=3,
)

In [15]:
# Train for at most 20 epochs; the early-stopping callback may end the run
# sooner. `callbacks` is documented as a list of callbacks, so the single
# callback is wrapped in one.
history = my_model.fit(train_data,
                       epochs=20,
                       validation_data=val_data,
                       callbacks=[callback])

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20


# Evaluate model

In [16]:
# evaluate() returns the loss followed by the metric the model was compiled
# with (accuracy), computed on the held-out test batches.
loss, accuracy = my_model.evaluate(test_data)

print("Loss: ", loss)
print("Accuracy: ", accuracy)

Loss:  8.127580642700195
Accuracy:  0.06181284040212631


In [17]:
# Sanity check on one test recording; the species is given by the directory
# name in the path. `wav` is reused later when the saved model is tested.
testing_file_name = '../data/xeno-canto/test/Luscinia megarhynchos/36984.mp3'
wav = load_mp3(testing_file_name)

# Only the embeddings are used below; the classifier returns one row of
# outputs per embedding frame.
scores, embeddings, spectrogram = yamnet_model(wav)
result = my_model(embeddings).numpy()

# Average the per-frame outputs over axis 0 and pick the highest-scoring class.
inferred_class = class_names[result.mean(axis=0).argmax()]

print(f'The main sound is: {inferred_class}')

The main sound is: kowalik


# Save model

In [86]:
# extend trained model to be able to give it raw wav data as input
class ReduceMeanLayer(tf.keras.layers.Layer):
  """Keras layer that averages its input along a fixed axis.

  Args:
    axis: Axis passed to `tf.math.reduce_mean` (default 0).
    **kwargs: Forwarded to `tf.keras.layers.Layer` (e.g. `name`).
  """

  def __init__(self, axis=0, **kwargs):
    super().__init__(**kwargs)
    self.axis = axis

  def call(self, input):
    """Return the mean of `input` along `self.axis`."""
    return tf.math.reduce_mean(input, axis=self.axis)

  def get_config(self):
    """Return the layer config including `axis`.

    Without this, `axis` is not stored when a model containing the layer is
    saved, and a reloaded layer silently falls back to the default axis.
    """
    config = super().get_config()
    config.update({'axis': self.axis})
    return config

In [87]:
saved_model_path = './birds_yamnet.keras'

# Serving graph: audio input -> YAMNet embeddings -> trained classifier ->
# mean over axis 0, so the saved model can be fed a waveform directly.
# NOTE(review): shape=() presumably lets the input accept a 1-D waveform of
# any length -- confirm against the Keras Input documentation.
input_segment = tf.keras.layers.Input(shape=(), dtype=tf.float32, name='audio')
# YAMNet wrapped as a Keras layer; trainable=False keeps its weights fixed.
embedding_extraction_layer = hub.KerasLayer(yamnet_model_handle,
                                            trainable=False, name='yamnet')
# Keep only the second of the three outputs (the embeddings), as in the
# training pipeline.
_, embeddings_output, _ = embedding_extraction_layer(input_segment)
serving_outputs = my_model(embeddings_output)
# Collapse the per-frame classifier outputs into a single vector of scores.
serving_outputs = ReduceMeanLayer(axis=0, name='classifier')(serving_outputs)
serving_model = tf.keras.Model(input_segment, serving_outputs)
# The optimizer state is left out of the saved file.
serving_model.save(saved_model_path, include_optimizer=False)





# Test saved model

In [88]:
# The saved model contains two non-built-in layers (hub.KerasLayer and
# ReduceMeanLayer); both classes are supplied through `custom_objects` for
# deserialization.
reloaded_model = tf.keras.models.load_model(saved_model_path, custom_objects={'KerasLayer':hub.KerasLayer, 'ReduceMeanLayer': ReduceMeanLayer})





In [89]:
# Feed the waveform loaded in the earlier single-file test straight into the
# reloaded model; its output is already averaged over axis 0 by the final
# ReduceMeanLayer, so a plain argmax selects the class.
reloaded_results = reloaded_model(wav)
inferred_class = class_names[tf.math.argmax(reloaded_results)]

print(f'The main sound is: {inferred_class}')

The main sound is: słowik szary
