In [40]:
import os
import numpy as np
import pandas as pd
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix
import glob
from tqdm import tqdm

In [27]:
# Paths
spectrogram_folder = os.path.abspath("../../data/processed/spectrograms/xeno_canto/")
train_parquet_file = "../../data/cleaned/70_15_15_cleaned_train.parquet"
val_parquet_file = "../../data/cleaned/70_15_15_cleaned_val.parquet"

In [43]:
def index_spectrogram_files(spectrogram_folder):
    """
    Create an index of spectrogram files for quick lookup.
    """
    spectrogram_files = os.listdir(spectrogram_folder)
    file_index = {}

    for filename in spectrogram_files:
        file_id = filename.split('_')[0]
        if file_id not in file_index:
            file_index[file_id] = []
        file_index[file_id].append(os.path.join(spectrogram_folder, filename))
    
    return file_index

In [34]:
def normalize_spectrogram(spec, target_shape):
    """
    Normalize a spectrogram to the target shape by padding or truncating.
    """
    if spec.shape == target_shape:
        return spec
    elif spec.shape[1] < target_shape[1]:  # Pad if too short
        pad_width = target_shape[1] - spec.shape[1]
        return np.pad(spec, ((0, 0), (0, pad_width)), mode='constant')
    else:  # Truncate if too long
        return spec[:, :target_shape[1]]

In [44]:
def load_spectrograms_by_id(file_id, spectrogram_folder, target_shape=(128, 626)):
    file_id = str(file_id).strip()
    if file_id not in file_index:
        return None

    spectrograms = []
    for full_path in file_index[file_id]:
        try:
            spec = np.load(full_path)
            spec = normalize_spectrogram(spec, target_shape)  # Normalize the shape
            spectrograms.append(spec)
        except Exception as e:
            print(f"Error loading file {full_path}: {e}")

    if spectrograms:
        return np.mean(np.array(spectrograms), axis=0)  # Combine all spectrograms
    return None


In [38]:
def load_data(df, spectrogram_folder):
    X, y = [], []
    for _, row in df.iterrows():
        spec_id = row['id']
        spectrogram = load_spectrograms_by_id(spec_id, spectrogram_folder)
        
        if spectrogram is not None:
            X.append(spectrogram)
            y.append(row['en'])  # Assuming 'en' is the target label
        else:
            print(f"No spectrogram found for ID: {spec_id}")
    print(f"Processed {len(X)} samples with {len(y)} labels.")
    return np.array(X), np.array(y)


In [31]:
# Load parquet files
print("Loading training and validation data...")
train_df = pd.read_parquet(train_parquet_file)
val_df = pd.read_parquet(val_parquet_file)

Loading training and validation data...


In [46]:
def process_spectrograms_in_batches(parquet_df, spectrogram_folder, target_shape=(128, 626), batch_size=50):
    num_rows = len(parquet_df)
    X, y = [], []
    missing_spectrograms = []

    # Process DataFrame in batches
    for start_idx in tqdm(range(0, num_rows, batch_size), desc="Processing Spectrograms"):
        batch_df = parquet_df.iloc[start_idx:start_idx + batch_size]

        for _, row in batch_df.iterrows():
            spec_id = row['id']
            label = row['en']  # Assuming 'en' is the label column
            result = load_spectrograms_by_id(spec_id, file_index, target_shape)
            if result is None:
                missing_spectrograms.append(spec_id)
            else:
                X.append(result)
                y.append(label)

    print(f"Missing spectrograms: {len(missing_spectrograms)}")
    if missing_spectrograms:
        print(f"Example missing spectrogram IDs: {missing_spectrograms[:10]}")

    return np.array(X), np.array(y)

In [None]:
# Pre-index spectrogram files
print("Indexing spectrogram files...")
file_index = index_spectrogram_files(spectrogram_folder)

# Process training data
print("Processing training data...")
X_train, y_train = process_spectrograms_in_batches(train_df, file_index)

In [24]:
# Process training and validation data
print("Processing training data...")
X_train, y_train = load_data(train_df, spectrogram_folder)
print("Processing validation data...")
X_val, y_val = load_data(val_df, spectrogram_folder)

Processing training data...
No spectrogram found for ID: 585887
No spectrogram found for ID: 690196
No spectrogram found for ID: 148888
No spectrogram found for ID: 715617
No spectrogram found for ID: 307948
No spectrogram found for ID: 542556
No spectrogram found for ID: 495532
No spectrogram found for ID: 607413
No spectrogram found for ID: 476606
No spectrogram found for ID: 594748
No spectrogram found for ID: 396678
No spectrogram found for ID: 510639
No spectrogram found for ID: 492501
No spectrogram found for ID: 541514
No spectrogram found for ID: 744026
No spectrogram found for ID: 576990
No spectrogram found for ID: 803950
No spectrogram found for ID: 939986
No spectrogram found for ID: 812604
No spectrogram found for ID: 838365
No spectrogram found for ID: 602189
No spectrogram found for ID: 661113
No spectrogram found for ID: 547102
No spectrogram found for ID: 492742
No spectrogram found for ID: 796333
No spectrogram found for ID: 381572
No spectrogram found for ID: 786117


In [9]:
# Ensure data is reshaped for CNN input
print("Reshaping data for CNN input...")
X_train = X_train[..., np.newaxis]  # Add channel dimension
X_val = X_val[..., np.newaxis]

Reshaping data for CNN input...


In [11]:
print("y_train shape:", y_train.shape)
print("y_train sample:", y_train[:10] if len(y_train) > 0 else "Empty")
print("y_val shape:", y_val.shape)
print("y_val sample:", y_val[:10] if len(y_val) > 0 else "Empty")


y_train shape: (0,)
y_train sample: Empty
y_val shape: (0,)
y_val sample: Empty


In [10]:
# Encode labels
print("Encoding labels...")
le = LabelEncoder()
y_train = to_categorical(le.fit_transform(y_train))
y_val = to_categorical(le.transform(y_val))

Encoding labels...


ValueError: zero-size array to reduction operation maximum which has no identity