# Embeddings Extraction for Bird Vocalizations Classification

This notebook extracts 1024-dimensional audio embeddings from a fine-tuned BirdNET model. It processes 3-second audio segments, feeds them into the model, and retrieves feature representations from an intermediate layer. These embeddings can be used to train traditional machine learning models, such as Random Forest, for bird species classification.

### Libraries

In [1]:
import os
import numpy as np
import tensorflow as tf
import librosa

2025-02-03 09:11:15.929546: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2025-02-03 09:11:16.362991: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-02-03 09:11:16.363124: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-02-03 09:11:16.438742: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-02-03 09:11:16.606044: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2025-02-03 09:11:16.609033: I tensorflow/core/platform/cpu_feature_guard.cc:1

BirdNET Fine Tuned Model Path

In [4]:
# Relative path to the fine-tuned BirdNET custom-classifier model (TFLite file)
# that every interpreter in this notebook loads.
model_path = "../../models/BirdNET_CustomClassifier/AllData/2025_CustomClassifier_DF_REPEAT_025_MIXUP_SEGMENTS.tflite"

## Embeddings Extraction

In [5]:
import tensorflow as tf
import numpy as np

# Build an interpreter for the fine-tuned model and reserve its tensor buffers
interpreter = tf.lite.Interpreter(model_path=model_path)
interpreter.allocate_tensors()

# One metadata dict per tensor in the graph
layer_details = interpreter.get_tensor_details()

# (caption, metadata key) pairs, printed in this order for every tensor
fields = (
    ("Layer Name:", 'name'),
    ("Layer Index:", 'index'),
    ("Layer Shape:", 'shape'),
    ("Layer Type:", 'dtype'),
    ("Quantization Parameters:", 'quantization'),
)

# Dump the selected metadata of each tensor, followed by a 30-dash rule
for layer in layer_details:
    for caption, key in fields:
        print(caption, layer[key])
    print("-" * 30)

Layer Name: serving_default_model_1_input:0
Layer Index: 0
Layer Shape: [     1 144000]
Layer Type: <class 'numpy.float32'>
Quantization Parameters: (0.0, 0)
------------------------------
Layer Name: basic/model_1/BNORM_0/FusedBatchNormV3
Layer Index: 1
Layer Shape: [24]
Layer Type: <class 'numpy.float32'>
Quantization Parameters: (0.0, 0)
------------------------------
Layer Name: basic/model_1/BLOCK_1-1_BN_2/FusedBatchNormV3
Layer Index: 2
Layer Shape: [72]
Layer Type: <class 'numpy.float32'>
Quantization Parameters: (0.0, 0)
------------------------------
Layer Name: basic/model_1/BLOCK_1-2_BN_2/FusedBatchNormV3
Layer Index: 3
Layer Shape: [72]
Layer Type: <class 'numpy.float32'>
Quantization Parameters: (0.0, 0)
------------------------------
Layer Name: basic/model_1/BLOCK_1-3_BN_2/FusedBatchNormV3
Layer Index: 4
Layer Shape: [72]
Layer Type: <class 'numpy.float32'>
Quantization Parameters: (0.0, 0)
------------------------------
Layer Name: basic/model_1/BLOCK_2-1_BN_2/FusedBatc

INFO: Created TensorFlow Lite XNNPACK delegate for CPU.


In [75]:
# Reload the model with intermediate tensors preserved.
# The embedding is read from a tensor that is NOT a model output; unless
# experimental_preserve_all_tensors=True is set, TFLite may reuse or never
# materialise intermediate buffers, so get_tensor() on them can return
# undefined values.
interpreter = tf.lite.Interpreter(
    model_path=model_path,
    experimental_preserve_all_tensors=True,
)
interpreter.allocate_tensors()

# Metadata of the model's input and output tensors
input_details = interpreter.get_input_details()
output_details = interpreter.get_output_details()

# Tensor index of the embedding layer. According to the original author's note
# this is the global-average-pooling output (model/GLOBAL_AVG_POOL/Mean);
# verify against the tensor listing if the model file changes.
embedding_index = 545

# Feed a random probe signal with the exact shape the model expects
input_data = np.random.random(size=input_details[0]['shape']).astype(np.float32)
interpreter.set_tensor(input_details[0]['index'], input_data)

# Run inference
interpreter.invoke()

# Read the embedding tensor and confirm its size
embedding = interpreter.get_tensor(embedding_index)
print("Embedding shape:", embedding.shape)
print("Embedding vector:", embedding)

Embedding shape: (1, 1024)
Embedding vector: [[0.         0.37047914 0.32140145 ... 0.98294234 0.0562969  1.4812535 ]]


Each embedding is a 1024-dimensional vector (the tensor has shape `(1, 1024)`, i.e. a batch of one).

In [76]:
# Fresh interpreter, used here only to inspect the model's I/O signature
interpreter = tf.lite.Interpreter(model_path=model_path)
interpreter.allocate_tensors()

# Keep the input/output metadata available for later cells
input_details, output_details = (
    interpreter.get_input_details(),
    interpreter.get_output_details(),
)

# Show the full metadata dict of every output tensor
for detail in interpreter.get_output_details():
    print(detail)


{'name': 'StatefulPartitionedCall:0', 'index': 546, 'shape': array([ 1, 28], dtype=int32), 'shape_signature': array([-1, 28], dtype=int32), 'dtype': <class 'numpy.float32'>, 'quantization': (0.0, 0), 'quantization_parameters': {'scales': array([], dtype=float32), 'zero_points': array([], dtype=int32), 'quantized_dimension': 0}, 'sparsity_parameters': {}}


In [85]:
# Load the TFLite model used by get_embedding().
# experimental_preserve_all_tensors=True is required because the embedding is
# read from an intermediate (non-output) tensor: without it TFLite documents
# that intermediate tensors may hold undefined values after invoke().
interpreter = tf.lite.Interpreter(
    model_path=model_path,
    experimental_preserve_all_tensors=True,
)
interpreter.allocate_tensors()
input_details = interpreter.get_input_details()
output_details = interpreter.get_output_details()

# Audio configuration
target_sample_rate = 48000  # sampling rate in Hz
target_duration = 3  # segment duration in seconds
target_length = target_sample_rate * target_duration  # samples per 3-second segment (144000)

# Split one recording into fixed-length segments ready for the model
def preprocess_audio(audio_path):
    """Load an audio file and cut it into segments of ``target_length`` samples.

    The file is decoded at its native sampling rate (``sr=None``) and resampled
    once to ``target_sample_rate`` when the rates differ. Loading with librosa's
    default ``sr`` would first downsample to 22050 Hz, discarding everything
    above ~11 kHz before the upsample to the target rate.

    Recordings longer than ``target_length`` are cut into consecutive,
    non-overlapping segments of exactly ``target_length`` samples; a trailing
    remainder shorter than one segment is discarded. Recordings that are
    shorter than or equal to ``target_length`` are zero-padded symmetrically
    to ``target_length`` and returned as a single segment.

    Relies on the module-level ``target_sample_rate`` and ``target_length``.

    Args:
        audio_path: Path of the audio file to load.

    Returns:
        tuple: ``(audios, segment_idx)`` where ``audios`` is a list of 1-D
        arrays of ``target_length`` samples and ``segment_idx`` is the list of
        zero-based positions of those segments within the recording.
    """
    audio, sr = librosa.load(audio_path, sr=None)

    if sr != target_sample_rate:
        audio = librosa.resample(audio, orig_sr=sr, target_sr=target_sample_rate)

    # Short (or exactly one segment long) recording: centre it with zero padding
    if len(audio) <= target_length:
        padding = target_length - len(audio)
        padded = np.pad(audio, (padding // 2, padding - padding // 2), 'constant')
        return [padded], [0]

    audios = []
    segment_idx = []
    # "+ 1" keeps the last full segment when the length is an exact multiple
    # of target_length (the stop value of range() is exclusive).
    for start in range(0, len(audio) - target_length + 1, target_length):
        audios.append(audio[start:start + target_length])
        segment_idx.append(start // target_length)
    return audios, segment_idx

# Run one audio segment through the TFLite model and read its embedding
def get_embedding(audio_data, embedding_index=545):
    """Return the embedding the TFLite model produces for one audio segment.

    Uses the module-level ``interpreter`` and ``input_details``.

    Args:
        audio_data: Audio samples; must contain exactly as many values as the
            model input (``input_details[0]['shape']``), otherwise
            ``np.reshape`` raises ``ValueError``.
        embedding_index: Tensor index to read after inference. Defaults to
            545, the layer this notebook uses as the 1024-dimensional
            embedding.

    Returns:
        The content of tensor ``embedding_index`` as returned by
        ``interpreter.get_tensor``. For the model used in this notebook the
        shape is ``(1, 1024)``: the leading batch axis is kept.

    Note:
        ``embedding_index`` normally addresses an intermediate tensor. TFLite
        only guarantees the content of intermediate tensors when the
        interpreter was created with ``experimental_preserve_all_tensors=True``.
    """
    input_shape = input_details[0]['shape']
    model_input = np.reshape(audio_data, input_shape).astype(np.float32)

    interpreter.set_tensor(input_details[0]['index'], model_input)
    interpreter.invoke()

    return interpreter.get_tensor(embedding_index)

### Train

In [66]:
# Compute embeddings and labels for the training split.
# Expected layout: <train_folder>/<label>/<recording>.wav, where the name of
# the sub-folder is the class label of every recording inside it.
train_folder = "../../../Data/Dataset/Audios/For Classifier/train"
train_embeddings = []
train_labels = []

# sorted(): os.listdir returns entries in arbitrary order, so sorting makes the
# order of the saved samples reproducible between runs and machines.
for label_folder in sorted(os.listdir(train_folder)):
    label_path = os.path.join(train_folder, label_folder)
    if not os.path.isdir(label_path):
        continue
    for file_name in sorted(os.listdir(label_path)):
        if not file_name.endswith((".WAV", ".wav")):
            continue
        audio_path = os.path.join(label_path, file_name)
        audio_data, _ = preprocess_audio(audio_path)
        # One embedding (and one label) per segment of the recording
        for audio in audio_data:
            embedding = get_embedding(audio)
            train_embeddings.append(embedding)
            train_labels.append(label_folder)  # the label is the folder name

# Each embedding keeps its batch axis, so the stacked array is (n_segments, 1, 1024)
train_embeddings = np.array(train_embeddings)
train_labels = np.array(train_labels)

# Save embeddings and labels in Dataset/Embeddings.
# Create the folder first so a missing directory cannot discard the whole run.
os.makedirs("../../../Data/Dataset/Embeddings", exist_ok=True)
np.save("../../../Data/Dataset/Embeddings/train_embeddings.npy", train_embeddings)
np.save("../../../Data/Dataset/Embeddings/train_labels.npy", train_labels)

### Validation Embeddings

In [67]:
# Compute embeddings and labels for the validation split.
# Expected layout: <validation_folder>/<label>/<recording>.wav, where the name
# of the sub-folder is the class label of every recording inside it.
validation_folder = "../../../Data/Dataset/Audios/For Classifier/validation"
validation_embeddings = []
validation_labels = []

# sorted(): os.listdir returns entries in arbitrary order, so sorting makes the
# order of the saved samples reproducible between runs and machines.
for label_folder in sorted(os.listdir(validation_folder)):
    label_path = os.path.join(validation_folder, label_folder)
    if not os.path.isdir(label_path):
        continue
    for file_name in sorted(os.listdir(label_path)):
        if not file_name.endswith((".WAV", ".wav")):
            continue
        audio_path = os.path.join(label_path, file_name)
        audio_data, _ = preprocess_audio(audio_path)
        # One embedding (and one label) per segment of the recording
        for audio in audio_data:
            embedding = get_embedding(audio)
            validation_embeddings.append(embedding)
            validation_labels.append(label_folder)  # the label is the folder name

# Each embedding keeps its batch axis, so the stacked array is (n_segments, 1, 1024)
validation_embeddings = np.array(validation_embeddings)
validation_labels = np.array(validation_labels)

# Save embeddings and labels in Dataset/Embeddings.
# Create the folder first so a missing directory cannot discard the whole run.
os.makedirs("../../../Data/Dataset/Embeddings", exist_ok=True)
np.save("../../../Data/Dataset/Embeddings/validation_embeddings.npy", validation_embeddings)
np.save("../../../Data/Dataset/Embeddings/validation_labels.npy", validation_labels)

#### Test Embeddings

#### Full Audios

In [86]:
# Compute embeddings for the test split (full recordings).
# No labels are stored here: each embedding is identified by the recording's
# file name and the position of the segment inside that recording.
test_folder = "../../../Data/Dataset/Audios/For Classifier/test"
test_embeddings = []
test_file_names = []
test_file_idx = []

# Walk the folder recursively, in a reproducible order
for root, dirs, files in os.walk(test_folder):
    dirs.sort()  # in-place sort fixes the order in which os.walk descends
    for file_name in sorted(files):
        if not file_name.endswith((".WAV", ".wav")):
            continue
        audio_path = os.path.join(root, file_name)
        audio_data, files_idx = preprocess_audio(audio_path)
        for audio, file_idx in zip(audio_data, files_idx):
            embedding = get_embedding(audio)
            test_embeddings.append(embedding)
            # NOTE: only the base name is stored; recordings with the same
            # name in different sub-folders cannot be told apart afterwards.
            test_file_names.append(file_name)
            test_file_idx.append(file_idx)  # segment position within the recording

# Each embedding keeps its batch axis, so the stacked array is (n_segments, 1, 1024)
test_embeddings = np.array(test_embeddings)
test_file_names = np.array(test_file_names)
test_file_idx = np.array(test_file_idx)

# Save embeddings, file names and segment indices in Dataset/Embeddings.
# Create the folder first so a missing directory cannot discard the whole run.
os.makedirs("../../../Data/Dataset/Embeddings", exist_ok=True)
np.save("../../../Data/Dataset/Embeddings/test_embeddings.npy", test_embeddings)
np.save("../../../Data/Dataset/Embeddings/test_file_names.npy", test_file_names)
np.save("../../../Data/Dataset/Embeddings/test_file_idx.npy", test_file_idx)

#### Bird Song Detector

In [87]:
# Compute embeddings for the Bird Song Detector test segments.
# NOTE: this reuses (and overwrites) the name `test_folder` from the
# full-audio test cell.
test_folder = "../../../Data/Dataset/Audios/For Classifier/BirdSongDetectorTestSegments"
bsd_test_embeddings = []
bsd_test_file_names = []
bsd_files_idx = []

# Only files directly inside the folder are processed (no recursion).
# sorted(): os.listdir order is arbitrary; sorting makes the output reproducible.
for file_name in sorted(os.listdir(test_folder)):
    if not file_name.endswith((".WAV", ".wav")):
        continue
    audio_path = os.path.join(test_folder, file_name)
    audio_data, files_idx = preprocess_audio(audio_path)
    for audio, file_idx in zip(audio_data, files_idx):
        embedding = get_embedding(audio)
        bsd_test_embeddings.append(embedding)
        bsd_test_file_names.append(file_name)  # source file of the segment
        bsd_files_idx.append(file_idx)  # segment position within that file

# Each embedding keeps its batch axis, so the stacked array is (n_segments, 1, 1024)
bsd_test_embeddings = np.array(bsd_test_embeddings)
bsd_test_file_names = np.array(bsd_test_file_names)
bsd_files_idx = np.array(bsd_files_idx)

# Save embeddings, file names and segment indices in Dataset/Embeddings.
# Create the folder first so a missing directory cannot discard the whole run.
os.makedirs("../../../Data/Dataset/Embeddings", exist_ok=True)
np.save("../../../Data/Dataset/Embeddings/bsd_test_embeddings.npy", bsd_test_embeddings)
np.save("../../../Data/Dataset/Embeddings/bsd_test_file_names.npy", bsd_test_file_names)
np.save("../../../Data/Dataset/Embeddings/bsd_files_idx.npy", bsd_files_idx)

In [88]:
# Sanity check: report the shape of every embedding array built above
shape_report = (
    ("Train Embeddings Shape:", train_embeddings),
    ("Validation Embeddings Shape:", validation_embeddings),
    ("Test Embeddings Shape:", test_embeddings),
    ("BSD Test Embeddings Shape:", bsd_test_embeddings),
)
for caption, embeddings in shape_report:
    print(caption, embeddings.shape)

Train Embeddings Shape: (3739, 1, 1024)
Validation Embeddings Shape: (985, 1, 1024)
Test Embeddings Shape: (1840, 1, 1024)
BSD Test Embeddings Shape: (329, 1, 1024)
