In [3]:
%pip install librosa
%pip install tensorflow 


Collecting tensorflow
  Downloading tensorflow-2.18.0-cp312-cp312-win_amd64.whl.metadata (3.3 kB)
Collecting tensorflow-intel==2.18.0 (from tensorflow)
  Downloading tensorflow_intel-2.18.0-cp312-cp312-win_amd64.whl.metadata (4.9 kB)
Collecting absl-py>=1.0.0 (from tensorflow-intel==2.18.0->tensorflow)
  Downloading absl_py-2.1.0-py3-none-any.whl.metadata (2.3 kB)
Collecting astunparse>=1.6.0 (from tensorflow-intel==2.18.0->tensorflow)
  Downloading astunparse-1.6.3-py2.py3-none-any.whl.metadata (4.4 kB)
Collecting flatbuffers>=24.3.25 (from tensorflow-intel==2.18.0->tensorflow)
  Downloading flatbuffers-24.3.25-py2.py3-none-any.whl.metadata (850 bytes)
Collecting gast!=0.5.0,!=0.5.1,!=0.5.2,>=0.2.1 (from tensorflow-intel==2.18.0->tensorflow)
  Downloading gast-0.6.0-py3-none-any.whl.metadata (1.3 kB)
Collecting google-pasta>=0.1.1 (from tensorflow-intel==2.18.0->tensorflow)
  Downloading google_pasta-0.2.0-py3-none-any.whl.metadata (814 bytes)
Collecting libclang>=13.0.0 (from tensor

In [5]:
import os
import numpy as np
import pandas as pd
import librosa
import librosa.display
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import EarlyStopping

In [12]:
train_file = "../../data/cleaned/70_15_15_cleaned_train.csv"
val_file = "../../data/cleaned/70_15_15_cleaned_val.csv"
audio_folder = "../../data/raw/audio/xeno-canto/"

In [7]:
# Parameters
n_mfcc = 40  # Number of MFCC features
max_pad_len = 173  # Maximum padding length for audio
batch_size = 32
epochs = 50
learning_rate = 0.001

In [8]:
# Function to load and preprocess audio files
def preprocess_audio(file_id, folder, max_pad_len=173, n_mfcc=40):
    file_path_mp3 = os.path.join(folder, f"{file_id}.mp3")
    file_path_wav = os.path.join(folder, f"{file_id}.wav")
    file_path = file_path_mp3 if os.path.exists(file_path_mp3) else file_path_wav
    
    if not os.path.exists(file_path):
        return None
    
    try:
        y, sr = librosa.load(file_path, sr=None)
        mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc)
        pad_width = max_pad_len - mfcc.shape[1]
        if pad_width > 0:
            mfcc = np.pad(mfcc, pad_width=((0, 0), (0, pad_width)), mode='constant')
        else:
            mfcc = mfcc[:, :max_pad_len]
        return mfcc
    except Exception as e:
        print(f"Error processing file {file_id}: {e}")
        return None

In [9]:
# Function to load dataset
def load_dataset(csv_file, audio_folder, n_mfcc=40, max_pad_len=173):
    df = pd.read_csv(csv_file)
    audio_features = []
    labels = []
    for _, row in df.iterrows():
        mfcc = preprocess_audio(row['id'], audio_folder, max_pad_len, n_mfcc)
        if mfcc is not None:
            audio_features.append(mfcc)
            labels.append(row['en'])  # Assuming 'en' is the target label
    return np.array(audio_features), np.array(labels)

In [17]:
# Load the datasets
train_df = pd.read_csv("../../data/cleaned/70_15_15_cleaned_train.csv")
val_df = pd.read_csv("../../data/cleaned/70_15_15_cleaned_val.csv")

# Inspect the data
print("Training dataset sample:")
print(train_df.head())
print("\nValidation dataset sample:")
print(val_df.head())


Training dataset sample:
   Unnamed: 0.1  Unnamed: 0      id         gen          sp  ssp  group  \
0         12324       14055  370848     Egretta    garzetta  NaN  birds   
1         10347       11625  629154       Larus  argentatus  NaN  birds   
2         15365       17353  579339       Picus     viridis  NaN  birds   
3         23668       26557  802334     Sturnus    vulgaris  NaN  birds   
4         11397       13042  799697  Ixobrychus     minutus  NaN  birds   

                          en               rec                 cnt  ...  \
0               Little Egret  Albert Lastukhin  Russian Federation  ...   
1      European Herring Gull     Meena Haribal              Norway  ...   
2  European Green Woodpecker       Samuel Levy      United Kingdom  ...   
3            Common Starling  Susanne Kuijpers         Netherlands  ...   
4             Little Bittern     Ricardo Hevia               Spain  ...   

                                                 rmk  bird-seen  animal-s

In [18]:
import os

audio_folder = "../data/raw/audio/xeno-canto/"
missing_files = []

# Check if files exist
for file_id in train_df['id']:
    mp3_path = os.path.join(audio_folder, f"{file_id}.mp3")
    wav_path = os.path.join(audio_folder, f"{file_id}.wav")
    if not (os.path.exists(mp3_path) or os.path.exists(wav_path)):
        missing_files.append(file_id)

print(f"Missing audio files for {len(missing_files)} IDs.")
if len(missing_files) > 0:
    print("Example missing file IDs:", missing_files[:10])


Missing audio files for 22754 IDs.
Example missing file IDs: [370848, 629154, 579339, 802334, 799697, 723673, 842246, 384576, 892221, 411273]


In [13]:
# Load and preprocess training and validation datasets
print("Loading and preprocessing training data...")
X_train, y_train = load_dataset(train_file, audio_folder, n_mfcc, max_pad_len)
print("Loading and preprocessing validation data...")
X_val, y_val = load_dataset(val_file, audio_folder, n_mfcc, max_pad_len)

Loading and preprocessing training data...
Loading and preprocessing validation data...


In [15]:
print("y_train shape:", y_train.shape)
print("y_train sample:", y_train[:10])  # Print the first 10 labels
print("y_val shape:", y_val.shape)
print("y_val sample:", y_val[:10])  # Print the first 10 labels


y_train shape: (0,)
y_train sample: []
y_val shape: (0,)
y_val sample: []


In [14]:
# Reshape for CNN input and encode labels
X_train = X_train[..., np.newaxis]  # Add channel dimension
X_val = X_val[..., np.newaxis]

le = LabelEncoder()
y_train = to_categorical(le.fit_transform(y_train))
y_val = to_categorical(le.transform(y_val))

ValueError: zero-size array to reduction operation maximum which has no identity

In [None]:
# Build the CNN model
model = Sequential([
    Conv2D(32, (3, 3), activation='relu', input_shape=(n_mfcc, max_pad_len, 1)),
    MaxPooling2D((2, 2)),
    Conv2D(64, (3, 3), activation='relu'),
    MaxPooling2D((2, 2)),
    Conv2D(128, (3, 3), activation='relu'),
    MaxPooling2D((2, 2)),
    Flatten(),
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(len(le.classes_), activation='softmax')
])

In [None]:
# Compile the model
model.compile(optimizer=Adam(learning_rate=learning_rate),
              loss='categorical_crossentropy',
              metrics=['accuracy'])

# Train the model
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

print("Training the CNN model...")
history = model.fit(X_train, y_train,
                    validation_data=(X_val, y_val),
                    epochs=epochs,
                    batch_size=batch_size,
                    callbacks=[early_stopping])

# Evaluate the model
val_loss, val_accuracy = model.evaluate(X_val, y_val)
print(f"Validation Loss: {val_loss:.4f}, Validation Accuracy: {val_accuracy:.4f}")

# Save the model
model.save('cnn_audio_baseline.h5')

In [None]:
# Visualize training history
import matplotlib.pyplot as plt

plt.plot(history.history['accuracy'], label='Training Accuracy')
plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()
plt.title('Model Accuracy')
plt.show()

plt.plot(history.history['loss'], label='Training Loss')
plt.plot(history.history['val_loss'], label='Validation Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.title('Model Loss')
plt.show()