In [2]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix
from tqdm import tqdm

In [3]:
spectrogram_folder = os.path.abspath("../../data/processed/spectrograms/xeno_canto/")
train_parquet_file = "../../data/cleaned/70_15_15_cleaned_train.parquet"

In [4]:
def index_spectrogram_files(spectrogram_folder):
    """
    Create an index of spectrogram files for quick lookup.
    """
    spectrogram_files = os.listdir(spectrogram_folder)
    file_index = {}
    for filename in spectrogram_files:
        file_id = filename.split('_')[0]  # Extract ID prefix
        if file_id not in file_index:
            file_index[file_id] = []
        file_index[file_id].append(os.path.join(spectrogram_folder, filename))
    return file_index

In [5]:
def normalize_spectrogram(spec, target_shape):
    """
    Normalize a spectrogram to the target shape by padding or truncating.
    """
    if spec.shape == target_shape:
        return spec
    elif spec.shape[1] < target_shape[1]:  # Pad if too short
        pad_width = target_shape[1] - spec.shape[1]
        return np.pad(spec, ((0, 0), (0, pad_width)), mode='constant')
    else:  # Truncate if too long
        return spec[:, :target_shape[1]]

In [6]:
def load_spectrograms_by_id(file_id, file_index, target_shape=(128, 626)):
    """
    Load spectrograms for a specific ID using pre-indexed files.
    """
    file_id = str(file_id).strip()
    if file_id not in file_index:
        return None

    spectrograms = []
    for full_path in file_index[file_id]:
        try:
            spec = np.load(full_path)
            spec = normalize_spectrogram(spec, target_shape)  # Normalize shape
            spectrograms.append(spec)
        except Exception as e:
            print(f"Error loading file {full_path}: {e}")

    if spectrograms:
        return np.mean(np.array(spectrograms), axis=0)  # Combine all spectrograms
    return None

In [7]:
def load_data(df, file_index, target_shape=(128, 626)):
    """
    Load and process spectrograms for the entire dataset.
    """
    X, y = [], []
    for _, row in tqdm(df.iterrows(), total=len(df), desc="Loading Spectrograms"):
        spec_id = row['id']
        spectrogram = load_spectrograms_by_id(spec_id, file_index, target_shape)
        if spectrogram is not None:
            X.append(spectrogram)
            y.append(row['en'])  # Assuming 'en' is the label column
    print(f"Processed {len(X)} samples with {len(y)} labels.")
    return np.array(X), np.array(y)

In [8]:
def visualize_spectrograms(X, y, label_encoder):
    """
    Visualize a few spectrogram samples.
    """
    plt.figure(figsize=(10, 5))
    for i in range(5):  # Display first 5 samples
        plt.subplot(1, 5, i + 1)
        plt.imshow(X[i], aspect='auto', origin='lower')
        plt.title(label_encoder.inverse_transform([y[i]])[0])
        plt.axis('off')
    plt.tight_layout()
    plt.show()

In [9]:
def load_data_in_batches(df, file_index, batch_size=500, target_shape=(128, 626)):
    """
    Load spectrograms in batches to reduce memory usage.
    """
    num_samples = len(df)
    for start_idx in range(0, num_samples, batch_size):
        batch_df = df.iloc[start_idx:start_idx + batch_size]
        X, y = [], []
        for _, row in batch_df.iterrows():
            spec_id = row['id']
            spectrogram = load_spectrograms_by_id(spec_id, file_index, target_shape)
            if spectrogram is not None:
                X.append(spectrogram)
                y.append(row['en'])  # Assuming 'en' is the label column
        yield np.array(X), np.array(y)


In [10]:
def build_cnn(input_shape, num_classes):
    """
    Build a CNN model for spectrogram classification.
    """
    model = Sequential([
        Conv2D(32, (3, 3), activation='relu', input_shape=input_shape),
        MaxPooling2D((2, 2)),
        Conv2D(64, (3, 3), activation='relu'),
        MaxPooling2D((2, 2)),
        Conv2D(128, (3, 3), activation='relu'),
        MaxPooling2D((2, 2)),
        Flatten(),
        Dense(128, activation='relu'),
        Dropout(0.5),
        Dense(num_classes, activation='softmax')
    ])
    return model

In [11]:
def encode_labels(y, label_encoder):
    """
    Encode labels into one-hot format using a LabelEncoder.
    """
    return to_categorical(label_encoder.transform(y), num_classes=len(label_encoder.classes_))


In [12]:
print("Indexing spectrogram files...")
file_index = index_spectrogram_files(spectrogram_folder)

print("Loading train data metadata...")
train_df = pd.read_parquet(train_parquet_file)

Indexing spectrogram files...
Loading train data metadata...


In [13]:
print("Splitting data into train, validation, and test sets...")
train_df, temp_df = train_test_split(train_df, test_size=0.3, random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

print(f"Train samples: {len(train_df)}, Validation samples: {len(val_df)}, Test samples: {len(test_df)}")


Splitting data into train, validation, and test sets...
Train samples: 40516, Validation samples: 8682, Test samples: 8682


In [14]:
print("Encoding labels...")
le = LabelEncoder()
le.fit(train_df['en']) 

Encoding labels...


In [15]:
print("Building CNN model...")
input_shape = (128, 626, 1)
num_classes = len(le.classes_)
model = build_cnn(input_shape, num_classes)

model.compile(optimizer=Adam(learning_rate=0.001),
              loss='categorical_crossentropy',
              metrics=['accuracy'])

# Train CNN in batches
print("Training the CNN model...")
batch_size = 1000
epochs = 20

for epoch in range(epochs):
    print(f"Epoch {epoch + 1}/{epochs}")
    for X_batch, y_batch in load_data_in_batches(train_df, file_index, batch_size):
        X_batch = X_batch[..., np.newaxis]  # Add channel dimension
        y_batch = encode_labels(y_batch, le)
        model.fit(X_batch, y_batch, epochs=1, verbose=1)

    # Validate at the end of each epoch
    print("Validating...")
    val_accuracy = []
    for X_val_batch, y_val_batch in load_data_in_batches(val_df, file_index, batch_size):
        X_val_batch = X_val_batch[..., np.newaxis]
        y_val_batch = encode_labels(y_val_batch, le)
        val_loss, val_acc = model.evaluate(X_val_batch, y_val_batch, verbose=0)
        val_accuracy.append(val_acc)
    print(f"Validation Accuracy: {np.mean(val_accuracy):.4f}")

# Evaluate the model on the test set
print("Evaluating the model on test data...")
test_accuracy = []
for X_test_batch, y_test_batch in load_data_in_batches(test_df, file_index, batch_size):
    X_test_batch = X_test_batch[..., np.newaxis]
    y_test_batch = encode_labels(y_test_batch, le)
    test_loss, test_acc = model.evaluate(X_test_batch, y_test_batch, verbose=0)
    test_accuracy.append(test_acc)

print(f"Test Accuracy: {np.mean(test_accuracy):.4f}")

# Classification report
print("\nGenerating classification report...")
y_true, y_pred = [], []
for X_test_batch, y_test_batch in load_data_in_batches(test_df, file_index, batch_size):
    X_test_batch = X_test_batch[..., np.newaxis]
    y_test_batch_encoded = encode_labels(y_test_batch, le)
    y_pred_batch = model.predict(X_test_batch).argmax(axis=1)
    y_true.extend(y_test_batch)
    y_pred.extend(y_pred_batch)

print(classification_report(y_true, y_pred, target_names=le.classes_))

Building CNN model...
Training the CNN model...
Epoch 1/20


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 267ms/step - accuracy: 0.0106 - loss: 26.4195
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 282ms/step - accuracy: 0.0167 - loss: 5.8743
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 278ms/step - accuracy: 0.0093 - loss: 5.8659
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 277ms/step - accuracy: 0.0224 - loss: 5.8561
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 273ms/step - accuracy: 0.0107 - loss: 5.8496
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 274ms/step - accuracy: 0.0156 - loss: 5.8387
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 278ms/step - accuracy: 0.0132 - loss: 5.8338
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 274ms/step - accuracy: 0.0090 - loss: 5.8291
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 279ms/step - accuracy: 0.0137 - loss: 5.8213
[1m32/32[0m [32

ValueError: Mix of label input types (string and number)