##### Copyright 2023 The TensorFlow Hub Authors.

Licensed under the Apache License, Version 2.0 (the "License");

In [1]:
#@title Copyright 2023 The TensorFlow Hub Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

<table class="tfo-notebook-buttons" align="left">
  <!-- <td>
    <a target="_blank" href="https://www.tensorflow.org/hub/tutorials/bird_vocalization_classifier"><img src="https://www.tensorflow.org/images/tf_logo_32px.png" />View on TensorFlow.org</a>
  </td> -->
  <td>
    <a target="_blank" href="https://colab.research.google.com/github/tensorflow/hub/blob/master/examples/colab/bird_vocalization_classifier.ipynb"><img src="https://www.tensorflow.org/images/colab_logo_32px.png" />Run in Google Colab</a>
  </td>
  <td>
    <a target="_blank" href="https://github.com/tensorflow/hub/blob/master/examples/colab/bird_vocalization_classifier.ipynb"><img src="https://www.tensorflow.org/images/GitHub-Mark-32px.png" />View on GitHub</a>
  </td>
  <td>
    <a href="https://storage.googleapis.com/tensorflow_docs/hub/examples/colab/bird_vocalization_classifier.ipynb"><img src="https://www.tensorflow.org/images/download_logo_32px.png" />Download notebook</a>
  </td>
  <td>
    <a href="https://tfhub.dev/google/bird-vocalization-classifier/1"><img src="https://www.tensorflow.org/images/hub_logo_32px.png" />See TF Hub model</a>
  </td>
</table>

# Using Google Bird Vocalization model

The Google Bird Vocalization is a global bird embedding and classification model.

This model expects as input a 5-second audio segment sampled at 32kHz

The model outputs both the logits and the embeddigs for each input window of audio.

On this notebook you'll learn how to feed the audio properly to the model and how to use the logits for inference.


In [1]:
# !pip install -q "tensorflow_io==0.28.*"
!pip install -q tensorflow
!pip install -q tensorflow_hub
!pip install -q tensorflow_io
!pip install -q librosa

In [49]:
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_io as tfio
import multiprocessing as mp

import numpy as np
import librosa

import csv
import io
import os

from IPython.display import Audio
from datetime import timedelta

Loading the Model from TFHub

In [2]:
model_handle = "https://tfhub.dev/google/bird-vocalization-classifier/1"
model = hub.load(model_handle)

2023-05-24 13:48:30.946522: W tensorflow/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 55971840 exceeds 10% of free system memory.


Lets load the labels that the model was trained on.

The labels file is in the assets forlder under label.csv. Each line is an ebird id.

In [3]:
# Find the name of the class with the top score when mean-aggregated across frames.
def class_names_from_csv(class_map_csv_text):
  """Returns list of class names corresponding to score vector."""
  with open(labels_path) as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=',')
    class_names = [mid for mid, desc in csv_reader]
    return class_names[1:]

labels_path = hub.resolve(model_handle) + "/assets/label.csv"
classes = class_names_from_csv(labels_path)
#print(classes)

The ```frame_audio``` function is based on the [Chirp lib](https://github.com/google-research/chirp/blob/10c5faa325a3c3468fa6f18a736fc1aeb9bf8129/chirp/inference/interface.py#L128) version but using tf.signal instead of librosa.

The `ensure_sample_rate` is a function to make sure that any audio used with the model has the expected sample rate of 32kHz

In [4]:
def frame_audio(
      audio_array: np.ndarray,
      window_size_s: float = 5.0,
      hop_size_s: float = 5.0,
      sample_rate = 32000,
  ) -> np.ndarray:
    """Helper function for framing audio for inference."""
    if window_size_s is None or window_size_s < 0:
      return audio_array[np.newaxis, :]
    frame_length = int(window_size_s * sample_rate)
    hop_length = int(hop_size_s * sample_rate)
    framed_audio = tf.signal.frame(audio_array, frame_length, hop_length, pad_end=True)
    return framed_audio

def ensure_sample_rate(waveform, original_sample_rate,
                       desired_sample_rate=32000):
  """Resample waveform if required."""
  if original_sample_rate != desired_sample_rate:
    waveform = tfio.audio.resample(waveform, original_sample_rate, desired_sample_rate)
  return desired_sample_rate, waveform

In [37]:
def save_results_to_csv(sensor, filename, all_logits, classes):
    # opening the csv file in 'w' mode
    file_location = f"results/{sensor}/" + filename.split('.')[0] + '.csv'
    file = open(file_location, 'w', newline ='')

    with file:
        # identifying header 
        header = ['frame', 'time', 'class', 'probability', 'probability_rank']
        writer = csv.DictWriter(file, fieldnames = header)

        # writing data row-wise into the csv file
        writer.writeheader()

        frame = 0
        for frame_logits in all_logits:
            probabilities = tf.nn.softmax(frame_logits).numpy()
            argmax = np.argmax(probabilities)
            td = timedelta(seconds = frame * 5)

            # Calculate the 5 highest probabilities at each timeframe
            top5_ind = (-probabilities).argsort()[:5]
            top5_classes = np.array(classes)[top5_ind]
            top5_props = probabilities[top5_ind]
            prob_rank = 1
        
            for t5_cls, t5_prp in zip(top5_classes, top5_props):
                writer.writerow({'frame' : frame,
                                 'time': td,
                                 'class': t5_cls,
                                 'probability': float(t5_prp),
                                 'probability_rank': prob_rank})
                prob_rank += 1

            frame += 1

In [39]:
def split_list(alist, wanted_parts=1):
    length = len(alist)
    return [ alist[i*length // wanted_parts: (i+1)*length // wanted_parts] 
             for i in range(wanted_parts) ]

In [53]:
# Get the list of all files and directories
sensor = '9_035_217'
# sensor = '1_041_209'

path = f"/data/volume_2/audio_recordings/{sensor}/ogg/"
audio_recordings = os.listdir(path)

audio_recordings_done = [f.replace('csv', 'ogg') for f in os.listdir(f"results/{sensor}/")]

audio_recordings_todo = list(set(audio_recordings) - set(audio_recordings_done))
audio_recordings_todo.sort()
audio_recordings_todo

['20221230_143000.ogg',
 '20221230_213000.ogg',
 '20221231_000001.ogg',
 '20221231_060000.ogg',
 '20221231_143000.ogg',
 '20221231_213000.ogg',
 '20230101_000001.ogg',
 '20230101_060000.ogg',
 '20230101_143000.ogg',
 '20230101_213000.ogg',
 '20230102_000001.ogg',
 '20230102_060000.ogg',
 '20230102_143000.ogg',
 '20230102_213000.ogg',
 '20230103_000001.ogg',
 '20230103_060000.ogg',
 '20230103_143000.ogg',
 '20230103_213000.ogg',
 '20230104_000001.ogg',
 '20230104_060000.ogg',
 '20230104_143000.ogg',
 '20230104_213000.ogg',
 '20230105_000001.ogg',
 '20230105_060000.ogg',
 '20230105_143000.ogg',
 '20230105_213000.ogg',
 '20230106_000001.ogg',
 '20230106_060000.ogg',
 '20230106_143000.ogg',
 '20230106_213000.ogg',
 '20230107_000001.ogg',
 '20230107_060000.ogg',
 '20230107_143000.ogg',
 '20230107_213000.ogg',
 '20230108_000001.ogg',
 '20230108_060000.ogg']

In [54]:
def process_audio_recording(audio_recording):
    print('Processing audio recording: ', audio_recording)
    #audio, sample_rate = __audioread_load(path + audio_recording)
    audio, sample_rate = librosa.load(path + audio_recording)
    sample_rate, wav_data = ensure_sample_rate(audio, sample_rate)
    
    # Audio(wav_data_turdus, rate=sample_rate)
    
    fixed_tm = frame_audio(wav_data)
    # fixed_tm.shape
    
    all_logits, all_embeddings = model.infer_tf(fixed_tm[:1])
    
    for window in fixed_tm[1:]:
        logits, embeddings = model.infer_tf(window[np.newaxis, :])
        all_logits = np.concatenate([all_logits, logits], axis=0)
        
    
    save_results_to_csv(sensor, audio_recording, all_logits, classes)
    print('Processing audio recording finished: ', audio_recording)

In [55]:
for audio_recording in audio_recordings_todo:
    process_audio_recording(audio_recording)

Processing audio recording:  20221230_143000.ogg
Processing audio recording finished:  20221230_143000.ogg
Processing audio recording:  20221230_213000.ogg
Processing audio recording finished:  20221230_213000.ogg
Processing audio recording:  20221231_000001.ogg
Processing audio recording finished:  20221231_000001.ogg
Processing audio recording:  20221231_060000.ogg
Processing audio recording finished:  20221231_060000.ogg
Processing audio recording:  20221231_143000.ogg
Processing audio recording finished:  20221231_143000.ogg
Processing audio recording:  20221231_213000.ogg
Processing audio recording finished:  20221231_213000.ogg
Processing audio recording:  20230101_000001.ogg
Processing audio recording finished:  20230101_000001.ogg
Processing audio recording:  20230101_060000.ogg
Processing audio recording finished:  20230101_060000.ogg
Processing audio recording:  20230101_143000.ogg
Processing audio recording finished:  20230101_143000.ogg
Processing audio recording:  20230101

In [32]:
audio_recording = audio_recordings_todo[0]

print('Processing audio recording: ', audio_recording)
#audio, sample_rate = __audioread_load(path + audio_recording)
audio, sample_rate = librosa.load(path + audio_recording, duration=60)
sample_rate, wav_data = ensure_sample_rate(audio, sample_rate)

# Audio(wav_data_turdus, rate=sample_rate)

fixed_tm = frame_audio(wav_data)
# fixed_tm.shape

all_logits, all_embeddings = model.infer_tf(fixed_tm[:1])

for window in fixed_tm[1:]:
    logits, embeddings = model.infer_tf(window[np.newaxis, :])
    all_logits = np.concatenate([all_logits, logits], axis=0)
    

save_results_to_csv(sensor, audio_recording, all_logits, classes)

frame = 0
for frame_logits in all_logits:
    probabilities = tf.nn.softmax(frame_logits).numpy()
    argmax = np.argmax(probabilities)

    top5_ind = (-probabilities).argsort()[:5]
    top5_classes = np.array(classes)[top5_ind]
    top5_props = probabilities[top5_ind]
    prob_rank = 1

    for t5_cls, t5_prp in zip(top5_classes, top5_props):
        print('Frame:', frame, 'contains:', t5_cls, t5_prp, prob_rank)
        prob_rank += 1

    frame += 1

Processing audio recording:  20221230_143000.ogg
Frame: 0 contains: commoo3 0.057859786 1
Frame: 0 contains: mallar3 0.031186381 2
Frame: 0 contains: loeowl 0.021188833 3
Frame: 0 contains: grebit1 0.019686805 4
Frame: 0 contains: tawowl1 0.018183853 5
Frame: 1 contains: whtspa 0.02424339 1
Frame: 1 contains: tawowl1 0.015073772 2
Frame: 1 contains: eueowl1 0.014807262 3
Frame: 1 contains: eurnig1 0.01457802 4
Frame: 1 contains: loeowl 0.013993418 5
Frame: 2 contains: eurrob1 0.17550966 1
Frame: 2 contains: gretit1 0.072275065 2
Frame: 2 contains: firecr1 0.066433884 3
Frame: 2 contains: blutit 0.04213808 4
Frame: 2 contains: shttre1 0.038900517 5
Frame: 3 contains: tawowl1 0.04314288 1
Frame: 3 contains: loeowl 0.02879544 2
Frame: 3 contains: eueowl1 0.02271987 3
Frame: 3 contains: whtspa 0.01923624 4
Frame: 3 contains: eurnig1 0.01734746 5
Frame: 4 contains: duswar 0.11968703 1
Frame: 4 contains: eupfly1 0.028006194 2
Frame: 4 contains: ortbun1 0.024787296 3
Frame: 4 contains: corbun

The audio files are a maximum of 1 hour and the model expects chunks of 5 seconds.

The `frame_audio` function can fix that and split the audio in proper frames

Lets apply the model on all the frames now:

*note*: this code is also based on the [Chirp library](https://github.com/google-research/chirp/blob/d6ff5e7cee3865940f31697bf4b70176c1072572/chirp/inference/models.py#L174)

In [11]:
all_logits, all_embeddings = model.infer_tf(fixed_tm[:1])
for window in fixed_tm[1:]:
  logits, embeddings = model.infer_tf(window[np.newaxis, :])
  all_logits = np.concatenate([all_logits, logits], axis=0)

all_logits.shape

(720, 10932)

In [12]:
frame = 0
for frame_logits in all_logits:
  probabilities = tf.nn.softmax(frame_logits)
  argmax = np.argmax(probabilities)
  
  if probabilities[argmax] > 0.8:
      td = timedelta(seconds=frame*5)
      print(f"For frame {frame} at {td}, the audio is from the class {classes[argmax]} (element:{argmax} in the label.csv file), with probability of {probabilities[argmax]}")
  frame += 1

For frame 38 at 0:03:10, the audio is from the class eurrob1 (element:3244 in the label.csv file), with probability of 0.8288521766662598
For frame 39 at 0:03:15, the audio is from the class martit2 (element:5499 in the label.csv file), with probability of 0.9985539317131042
For frame 42 at 0:03:30, the audio is from the class eurnut2 (element:3241 in the label.csv file), with probability of 0.9394095540046692
For frame 45 at 0:03:45, the audio is from the class eurnut2 (element:3241 in the label.csv file), with probability of 0.9762986302375793
For frame 46 at 0:03:50, the audio is from the class eurnut2 (element:3241 in the label.csv file), with probability of 0.9221656322479248
For frame 50 at 0:04:10, the audio is from the class gretit1 (element:3953 in the label.csv file), with probability of 0.878087043762207
For frame 52 at 0:04:20, the audio is from the class martit2 (element:5499 in the label.csv file), with probability of 0.9930181503295898
For frame 57 at 0:04:45, the audio 