In [23]:
import os
import random

import tensorflow as tf
import tensorflow_io as tfio
import tensorflow_hub as hub
import pandas as pd

# Load yamnet model

In [24]:
yamnet_model_handle = 'https://tfhub.dev/google/yamnet/1'
yamnet_model = hub.load(yamnet_model_handle)

# Create dataset

In [25]:
@tf.function
def load_mp3(filename):
    file_content = tf.io.read_file(filename)
    wav = tfio.audio.decode_mp3(file_content)
    wav = tf.reduce_mean(wav, axis=1)
    sample_rate = tfio.audio.AudioIOTensor(filename, dtype=tf.float32).rate
    sample_rate = tf.cast(sample_rate, dtype=tf.int64)
    wav = tfio.audio.resample(wav, rate_in=sample_rate, rate_out=16000)
    return wav

In [26]:
data_directory = '../data/xeno-canto/'
bird_species_df = pd.read_csv('../data/selected_species.csv', sep=',')
filenames = []
targets = []
recording_ids = []
class_names = []
class_id = 0
recording_id = 0

for dir_name in os.listdir(data_directory):
    if dir_name.startswith('.'):
        continue
    class_name = bird_species_df.loc[bird_species_df['Latin name'] == dir_name]['Polish name'].squeeze()
    class_names.append(class_name)

    for file_name in os.listdir(data_directory + dir_name)[:20]:
        file_path = data_directory + dir_name + '/' + file_name
        filenames.append(file_path)
        targets.append(class_id)
        recording_ids.append(recording_id)
        recording_id += 1
    
    class_id += 1


In [29]:
main_ds = tf.data.Dataset.from_tensor_slices((filenames, targets, recording_ids))

In [30]:
def load_mp3_for_map(filename, label, recording_id):
    return load_mp3(filename), label, recording_id

main_ds = main_ds.map(load_mp3_for_map)

In [31]:
def extract_embeddings(wav_data, label, recording_id):
    _, embeddings, _ = yamnet_model(wav_data)
    num_embeddings = tf.shape(embeddings)[0]
    return (embeddings,
            tf.repeat(label, num_embeddings),
            tf.repeat(recording_id, num_embeddings))
main_ds = main_ds.map(extract_embeddings).unbatch()

# Split data

In [32]:
train_recording_ids = random.sample(range(recording_id), int(0.8*recording_id))
validation_recording_ids = [x for x in range(recording_id) if x not in train_recording_ids]
test_recording_ids = random.sample(validation_recording_ids, int(0.5 * len(validation_recording_ids)))

validation_recording_ids = [x for x in validation_recording_ids if x not in test_recording_ids]

In [33]:
cached_ds = main_ds.cache()

def train_filter_condition(embedding, label, recording_id):
    return recording_id in train_recording_ids

def train_filter_condition_wrapper(embedding, label, recording_id):
    return tf.py_function(train_filter_condition, (embedding, label, recording_id), tf.bool)

def validation_filter_condition(embedding, label, recording_id):
    return recording_id in validation_recording_ids

def validation_filter_condition_wrapper(embedding, label, recording_id):
    return tf.py_function(validation_filter_condition, (embedding, label, recording_id), tf.bool)

def test_filter_condition(embedding, label, recording_id):
    return recording_id in test_recording_ids

def test_filter_condition_wrapper(embedding, label, recording_id):
    return tf.py_function(test_filter_condition, (embedding, label, recording_id), tf.bool)

train_ds = cached_ds.filter(train_filter_condition_wrapper)
val_ds = cached_ds.filter(validation_filter_condition_wrapper)
test_ds = cached_ds.filter(test_filter_condition_wrapper)

remove_recording_id_column = lambda embedding, label, recording_id: (embedding, label)

train_ds = train_ds.map(remove_recording_id_column)
val_ds = val_ds.map(remove_recording_id_column)
test_ds = test_ds.map(remove_recording_id_column)

train_ds = train_ds.cache().shuffle(1000).batch(32).prefetch(tf.data.AUTOTUNE)
val_ds = val_ds.cache().batch(32).prefetch(tf.data.AUTOTUNE)
test_ds = test_ds.cache().batch(32).prefetch(tf.data.AUTOTUNE)

# Train model

In [34]:
my_model = tf.keras.Sequential([
    tf.keras.layers.Input(shape=(1024), dtype=tf.float32,
                          name='input_embedding'),
    tf.keras.layers.Dense(512, activation='relu'),
    tf.keras.layers.Dense(len(class_names))
], name='my_model')

my_model.summary()

Model: "my_model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_2 (Dense)             (None, 512)               524800    
                                                                 
 dense_3 (Dense)             (None, 20)                10260     
                                                                 
Total params: 535,060
Trainable params: 535,060
Non-trainable params: 0
_________________________________________________________________


In [35]:
my_model.compile(loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
                 optimizer="adam",
                 metrics=['accuracy'])

callback = tf.keras.callbacks.EarlyStopping(monitor='loss',
                                            patience=3,
                                            restore_best_weights=True)

In [36]:
history = my_model.fit(train_ds,
                       epochs=20,
                       validation_data=val_ds,
                       callbacks=callback)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


# Evaluate model

In [37]:
loss, accuracy = my_model.evaluate(test_ds)

print("Loss: ", loss)
print("Accuracy: ", accuracy)

Loss:  7.3546929359436035
Accuracy:  0.343235045671463


In [39]:
testing_file_name = '../data/xeno-canto/Luscinia megarhynchos/19660.mp3'
wav = load_mp3(testing_file_name)

scores, embeddings, spectrogram = yamnet_model(wav)
result = my_model(embeddings).numpy()

inferred_class = class_names[result.mean(axis=0).argmax()]

print(f'The main sound is: {inferred_class}')

The main sound is: kowalik


# Save model

In [40]:
# extend trained model to be able to give it raw wav data as input
class ReduceMeanLayer(tf.keras.layers.Layer):
  def __init__(self, axis=0, **kwargs):
    super(ReduceMeanLayer, self).__init__(**kwargs)
    self.axis = axis

  def call(self, input):
    return tf.math.reduce_mean(input, axis=self.axis)

In [41]:
saved_model_path = './birds_yamnet.keras'

input_segment = tf.keras.layers.Input(shape=(), dtype=tf.float32, name='audio')
embedding_extraction_layer = hub.KerasLayer(yamnet_model_handle,
                                            trainable=False, name='yamnet')
_, embeddings_output, _ = embedding_extraction_layer(input_segment)
serving_outputs = my_model(embeddings_output)
serving_outputs = ReduceMeanLayer(axis=0, name='classifier')(serving_outputs)
serving_model = tf.keras.Model(input_segment, serving_outputs)
serving_model.save(saved_model_path, include_optimizer=False)





# Test saved model

In [42]:
reloaded_model = tf.keras.models.load_model(saved_model_path, custom_objects={'KerasLayer':hub.KerasLayer, 'ReduceMeanLayer': ReduceMeanLayer})





In [43]:
reloaded_results = reloaded_model(wav)
inferred_class = class_names[tf.math.argmax(reloaded_results)]

print(f'The main sound is: {inferred_class}')

The main sound is: kowalik
