In [40]:
import sys
import os
import numpy as np
import datetime
from PIL import Image
import tensorflow as tf
import multiprocessing

import birdnet_util.audio as audio
from birdnet_util.audio0 import spectrogram  # Spectrogram function

In [41]:
# ---------------------- LOAD TRAINED MODEL ---------------------- #
def model_loading(MODEL_PATH, TFLITE_THREADS=1):
    """Build a TFLite interpreter for a model file and allocate its tensors.

    Args:
        MODEL_PATH: Path handed to ``tf.lite.Interpreter`` as ``model_path``.
        TFLITE_THREADS: Value handed to ``tf.lite.Interpreter`` as
            ``num_threads`` (defaults to 1).

    Returns:
        The interpreter, with ``allocate_tensors()`` already called.
    """
    tflite_interpreter = tf.lite.Interpreter(
        model_path=MODEL_PATH,
        num_threads=TFLITE_THREADS,
    )
    tflite_interpreter.allocate_tensors()
    print("[INFO] Model loaded successfully.")
    return tflite_interpreter

def load_labels(LABEL_FILE):
    """Read class labels from a text file, one label per line.

    Each line is stripped of surrounding whitespace. Blank lines at the end of
    the file are dropped so they do not turn into phantom ``""`` classes;
    blank lines elsewhere are kept so that list positions still match the
    line numbers of the file.

    Args:
        LABEL_FILE: Path to the label file (read as UTF-8).

    Returns:
        list[str]: The labels in file order. The number of labels is also
        printed.
    """
    # Explicit encoding so the result does not depend on the platform default.
    with open(LABEL_FILE, "r", encoding="utf-8") as f:
        labels = [line.strip() for line in f]

    # A trailing newline / empty line is not a class.
    while labels and not labels[-1]:
        labels.pop()

    print(f"# Target categories: {len(labels)}")
    return labels

In [42]:
# Model artefacts used by this notebook.
MODEL_PATH = "mobilenet-224-337wi-ft.tflite"
LABEL_FILE = "species-list-337.txt"

# Half of the reported CPU count, but never fewer than one thread.
TFLITE_THREADS = (multiprocessing.cpu_count() // 2) or 1

interpreter = model_loading(MODEL_PATH, TFLITE_THREADS)
LABELS = load_labels(LABEL_FILE)

# Quick sanity check: show the first ten entries of the label list.
print("First labels:")
for species_name in LABELS[:10]:
    print(f"  {species_name}")

[INFO] Model loaded successfully.
# Target categories: 337
First labels:
  Accipiter gentilis
  Accipiter nisus
  Acrocephalus agricola
  Acrocephalus arundinaceus
  Acrocephalus dumetorum
  Acrocephalus paludicola
  Acrocephalus palustris
  Acrocephalus schoenobaenus
  Acrocephalus scirpaceus
  Actitis hypoleucos


In [43]:
# ---- Input file ----
INPUT_PATH = "./"             # directory joined with the file name in analyze_file
AUDIO_FILE = "XC793531.MP3"

# ---- Audio loading (arguments to audio.openAudioFile) ----
SAMPLE_RATE = 48000
FILE_SPLITTING_DURATION = 600  # passed as `duration`
BANDPASS_FMIN, BANDPASS_FMAX = 0, 15000  # passed as `fmin` / `fmax`

# ---- Segmentation (arguments to audio.splitSignal) ----
SIG_LENGTH = 3.0               # also used to label each segment's start/end
SIG_OVERLAP = 0
SIG_MINLEN = SIG_LENGTH
MAX_SEGMENTS = 1000            # at most this many chunks are analysed per file
MAX_LIMIT = 1000               # NOTE(review): not referenced in the visible cells

# ---- Model input ----
IMG_HEIGHT = IMG_WIDTH = 224
rescaling = 1.0 / 255.0        # multiplier applied to the image array before inference

# ---- Reporting ----
MIN_CONF = 0.5                 # minimum top-class score for a segment to be printed

In [44]:
# AUXILIARY FUNCTIONS
def apply_confidence_threshold(predictions, threshold=0.5):
    """Keep only the entries whose highest score reaches ``threshold``.

    Args:
        predictions: Mapping from an arbitrary key to an array of scores.
        threshold: Minimum value the largest score must reach (inclusive).

    Returns:
        dict: For every retained key, a ``(argmax_index, max_score)`` tuple.
        Insertion order of ``predictions`` is preserved.
    """
    kept = {}
    for segment_key, scores in predictions.items():
        top_score = np.max(scores)
        # Written as "not >=" so a NaN score is rejected, like any failed comparison.
        if not (top_score >= threshold):
            continue
        kept[segment_key] = (np.argmax(scores), top_score)
    return kept

def analyze_file(f):
    """Classify one audio file segment by segment and print the detections.

    The file ``INPUT_PATH/f`` is opened with ``audio.openAudioFile`` and cut
    into chunks with ``audio.splitSignal``. For each of the first
    ``MAX_SEGMENTS`` chunks a spectrogram is computed, min-max normalised,
    converted to an image, resized to ``IMG_WIDTH`` x ``IMG_HEIGHT``,
    replicated to three channels, multiplied by ``rescaling`` and fed to the
    module-level ``interpreter``. Segments whose best score is at least
    ``MIN_CONF`` are printed as tab-separated lines:
    ``start  end  label  confidence  file``.

    Args:
        f: File name, joined onto ``INPUT_PATH``.

    Returns:
        None. Results are written to stdout.
    """
    interpreter.allocate_tensors()
    input_details = interpreter.get_input_details()
    output_details = interpreter.get_output_details()
    input_index = input_details[0]['index']
    input_dtype = input_details[0]['dtype']
    output_index = output_details[0]['index']

    print(f"Analyzing {f}", flush=True)
    start_time = datetime.datetime.now()
    full_path = os.path.join(INPUT_PATH, f)
    # Maps (segment_start, segment_end) -> raw score vector of that segment.
    segment_scores = {}

    sig, rate = audio.openAudioFile(full_path, SAMPLE_RATE, offset=0, duration=FILE_SPLITTING_DURATION, fmin=BANDPASS_FMIN, fmax=BANDPASS_FMAX)
    chunks = audio.splitSignal(sig, rate, SIG_LENGTH, SIG_OVERLAP, SIG_MINLEN)

    for interval, y in enumerate(chunks[:MAX_SEGMENTS]):
        spec, _ = spectrogram(y, rate, shape=(128, 224))

        # Min-max normalisation. A flat (or non-finite) spectrogram has no usable
        # range; NumPy would only warn on the 0/0 division and produce NaNs, so
        # test explicitly and skip the segment instead of catching a warning.
        spec_min = np.min(spec)
        spec_range = np.max(spec) - spec_min
        if not np.isfinite(spec_range) or spec_range <= 0:
            continue
        standardized_spec = (spec - spec_min) / spec_range

        img = Image.fromarray(np.asarray(standardized_spec) * 255)

        # Preprocessing
        img = img.resize((IMG_WIDTH, IMG_HEIGHT))  # PIL expects (width, height)
        img = np.expand_dims(img, axis=-1)  # channel dimension (1)
        img = np.repeat(img, 3, axis=-1)  # to 3-channel
        img = np.expand_dims(img, axis=0)  # add batch dimension
        img = img.astype(np.float32) * rescaling

        # Model inference
        interpreter.set_tensor(input_index, img.astype(input_dtype))
        interpreter.invoke()

        # Results
        predictions = np.squeeze(interpreter.get_tensor(output_index))

        # NOTE(review): segment times assume consecutive chunks start SIG_LENGTH
        # apart, which holds for SIG_OVERLAP == 0 -- confirm against
        # audio.splitSignal before using a non-zero overlap.
        seg_start = interval * SIG_LENGTH
        seg_end = (interval + 1) * SIG_LENGTH
        segment_scores[(seg_start, seg_end)] = predictions

    # Show predictions
    filtered_predictions = apply_confidence_threshold(segment_scores, MIN_CONF)
    for (seg_start, seg_end), (predicted_class, confidence) in filtered_predictions.items():
        # Fall back to a synthetic name if the model has more outputs than labels.
        if predicted_class < len(LABELS):
            label = LABELS[predicted_class]
        else:
            label = f"class_{predicted_class}"
        print(f"{seg_start}\t{seg_end}\t{label}\t{confidence:.2f}\t{f}")

    delta_time = (datetime.datetime.now() - start_time).total_seconds()
    print(f"Finished {f} in {delta_time:.2f} seconds", flush=True)


In [45]:
# ---------------------- RUN THE ANALYSIS ON THE CONFIGURED FILE ---------------------- #
analyze_file(f=AUDIO_FILE)

Analyzing XC793531.MP3
0.0	3.0	Bubo bubo	1.00	XC793531.MP3
3.0	6.0	Bubo bubo	1.00	XC793531.MP3
6.0	9.0	other	0.92	XC793531.MP3
9.0	12.0	Bubo bubo	1.00	XC793531.MP3
12.0	15.0	Bubo bubo	1.00	XC793531.MP3
Finished XC793531.MP3 in 0.39 seconds
