<a href="https://colab.research.google.com/github/AlameluPriya-arunagiri1421/CODSOFT/blob/main/114S.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [24]:
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [25]:
!pip install librosa tensorflow scikit-learn tqdm matplotlib




In [27]:
ZIP_PATH = "/content/drive/MyDrive/Dataset/archive (1).zip"
EXTRACT_PATH = "/content/archive"

!unzip -q "$ZIP_PATH" -d "$EXTRACT_PATH"


replace /content/archive/Birds Voice.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: 

In [28]:
DATASET_PATH = "/content/archive/Voice of Birds/Voice of Birds"


In [29]:
import os

folders = os.listdir(DATASET_PATH)
print("Bird species folders:", len(folders))
print("First 5 species:", folders[:5])


Bird species folders: 114
First 5 species: ['Little Spotted Kiwi_sound', 'Berlepschs Tinamou_sound', 'Quebracho Crested Tinamou_sound', 'Tanimbar Megapode_sound', 'North Island Brown Kiwi_sound']


In [30]:
import numpy as np
import librosa
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical


In [31]:
SR = 22050
DURATION = 5
SAMPLES = SR * DURATION

N_MELS = 128
IMG_HEIGHT = 128
IMG_WIDTH = 216   # fixed width after librosa


In [32]:
def load_audio(path):
    y, _ = librosa.load(path, sr=SR)
    if len(y) < SAMPLES:
        y = np.pad(y, (0, SAMPLES - len(y)))
    else:
        y = y[:SAMPLES]
    return y


In [33]:
def extract_features(y):
    mel = librosa.feature.melspectrogram(y=y, sr=SR, n_mels=N_MELS)
    mel = librosa.power_to_db(mel, ref=np.max)

    mfcc = librosa.feature.mfcc(y=y, sr=SR, n_mfcc=40)

    return mel, mfcc


In [34]:
X_mel, X_mfcc, y_labels = [], [], []

AUDIO_EXTS = ('.mp3', '.wav', '.flac', '.ogg')

for species in tqdm(os.listdir(DATASET_PATH)):
    species_path = os.path.join(DATASET_PATH, species)
    if not os.path.isdir(species_path):
        continue

    for file in os.listdir(species_path):
        if file.lower().endswith(AUDIO_EXTS):
            file_path = os.path.join(species_path, file)
            try:
                audio = load_audio(file_path)
                mel, mfcc = extract_features(audio)

                X_mel.append(mel)
                X_mfcc.append(mfcc)
                y_labels.append(species)
            except Exception as e:
                print("Skipped:", file_path)


  y, _ = librosa.load(path, sr=SR)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)
  y, _ = librosa.load(path, sr=SR)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)
  y, _ = librosa.load(path, sr=SR)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)
100%|██████████| 114/114 [04:59<00:00,  2.62s/it]


In [35]:
print("Total samples:", len(y_labels))
print("Unique species:", len(set(y_labels)))


Total samples: 2161
Unique species: 114


In [36]:
X_mel = np.array(X_mel)[..., np.newaxis]
X_mfcc = np.array(X_mfcc)[..., np.newaxis]


In [37]:
le = LabelEncoder()
y_encoded = le.fit_transform(y_labels)
y_cat = to_categorical(y_encoded)


In [46]:
from collections import Counter
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical
import numpy as np

# Count samples per class
counts = Counter(y_labels)
valid_classes = {cls for cls, cnt in counts.items() if cnt >= 2}

print("Classes before filtering:", len(counts))
print("Classes after filtering:", len(valid_classes))

# Rebuild arrays TOGETHER (IMPORTANT)
X_mel_f, X_mfcc_f, y_f = [], [], []

for mel, mfcc, label in zip(X_mel, X_mfcc, y_labels):
    if label in valid_classes:
        X_mel_f.append(mel)
        X_mfcc_f.append(mfcc)
        y_f.append(label)

# Convert to numpy
X_mel = np.array(X_mel_f)
X_mfcc = np.array(X_mfcc_f)

# Encode labels
le = LabelEncoder()
y_encoded = le.fit_transform(y_f)
y_cat = to_categorical(y_encoded)

print("Final samples:", X_mel.shape[0])
print("Final classes:", y_cat.shape[1])


Classes before filtering: 114
Classes after filtering: 110
Final samples: 2153
Final classes: 110


In [49]:
from sklearn.model_selection import train_test_split

X_mel_train, X_mel_test, X_mfcc_train, X_mfcc_test, y_train, y_test = train_test_split(
    X_mel, X_mfcc, y_cat,
    test_size=0.1,
    random_state=42
)


In [51]:
from tensorflow.keras.layers import Input, Conv2D, MaxPooling2D, Flatten, Dense, Dropout, BatchNormalization, concatenate
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import SGD

input_shape_mel = X_mel_train.shape[1:]
input_shape_mfcc = X_mfcc_train.shape[1:]
num_classes = y_train.shape[1]   # DYNAMIC (IMPORTANT)

def cnn_branch(input_shape):
    inp = Input(shape=input_shape)
    x = BatchNormalization()(inp)
    x = Conv2D(32, (3,3), activation='relu')(x)
    x = MaxPooling2D((2,2))(x)
    x = Conv2D(64, (3,3), activation='relu')(x)
    x = MaxPooling2D((2,2))(x)
    x = Flatten()(x)
    return inp, x

inp1, out1 = cnn_branch(input_shape_mel)
inp2, out2 = cnn_branch(input_shape_mfcc)

merged = concatenate([out1, out2])
x = Dense(256, activation='relu')(merged)
x = Dropout(0.5)(x)
output = Dense(num_classes, activation='softmax')(x)

model = Model(inputs=[inp1, inp2], outputs=output)

model.compile(
    optimizer=SGD(learning_rate=0.001, momentum=0.9),
    loss='categorical_crossentropy',
    metrics=['accuracy']
)

model.summary()


In [52]:
model.fit(
    [X_mel_train, X_mfcc_train],
    y_train,
    epochs=30,
    batch_size=16,
    validation_split=0.2
)


Epoch 1/30
[1m97/97[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 81ms/step - accuracy: 0.0163 - loss: 4.7084 - val_accuracy: 0.0129 - val_loss: 4.7151
Epoch 2/30
[1m97/97[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 24ms/step - accuracy: 0.0349 - loss: 4.5304 - val_accuracy: 0.0696 - val_loss: 4.4832
Epoch 3/30
[1m97/97[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 22ms/step - accuracy: 0.0508 - loss: 4.3674 - val_accuracy: 0.1418 - val_loss: 4.3267
Epoch 4/30
[1m97/97[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 22ms/step - accuracy: 0.1125 - loss: 4.2023 - val_accuracy: 0.1907 - val_loss: 4.0877
Epoch 5/30
[1m97/97[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 22ms/step - accuracy: 0.1503 - loss: 3.9504 - val_accuracy: 0.2577 - val_loss: 3.8367
Epoch 6/30
[1m97/97[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 22ms/step - accuracy: 0.2294 - loss: 3.5491 - val_accuracy: 0.3222 - val_loss: 3.5202
Epoch 7/30
[1m97/97[0m [32m━━━

<keras.src.callbacks.history.History at 0x7f081d90cd40>

In [53]:
model.save("/content/drive/MyDrive/bird_sound_model.h5")
print("Model saved successfully")




Model saved successfully


In [54]:
test_loss, test_acc = model.evaluate(
    [X_mel_test, X_mfcc_test],
    y_test,
    verbose=1
)

print("Test Accuracy:", test_acc)


[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 432ms/step - accuracy: 0.5660 - loss: 2.6413
Test Accuracy: 0.5231481194496155


In [55]:
import numpy as np

y_pred_prob = model.predict([X_mel_test, X_mfcc_test])
y_pred = np.argmax(y_pred_prob, axis=1)
y_true = np.argmax(y_test, axis=1)


[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 93ms/step


In [58]:
from sklearn.metrics import classification_report, precision_score, recall_score, f1_score
import numpy as np

labels_present = np.unique(y_true)

print("Precision (macro):", precision_score(y_true, y_pred, average='macro', zero_division=0))
print("Recall (macro):", recall_score(y_true, y_pred, average='macro', zero_division=0))
print("F1-score (macro):", f1_score(y_true, y_pred, average='macro', zero_division=0))

print("\nDetailed Classification Report:\n")
print(classification_report(
    y_true,
    y_pred,
    labels=labels_present,
    target_names=le.inverse_transform(labels_present),
    zero_division=0
))


Precision (macro): 0.4990696649029982
Recall (macro): 0.4555820105820106
F1-score (macro): 0.4408344125010792

Detailed Classification Report:

                                  precision    recall  f1-score   support

               Andean Guan_sound       0.62      0.83      0.71         6
            Andean Tinamou_sound       0.33      0.50      0.40         2
    Australian Brushturkey_sound       0.50      1.00      0.67         1
          Band-tailed Guan_sound       0.17      0.50      0.25         2
         Bartletts Tinamou_sound       1.00      1.00      1.00         2
              Bearded Guan_sound       0.25      0.20      0.22         5
            Biak Scrubfowl_sound       1.00      0.50      0.67         2
             Black Tinamou_sound       0.00      0.00      0.00         1
  Black-billed Brushturkey_sound       0.50      0.25      0.33         4
      Black-capped Tinamou_sound       0.50      0.50      0.50         2
 Black-fronted Piping Guan_sound       1.

In [63]:
# MUST be run once after train-test split

MEL_SHAPE = X_mel_train.shape[1:3]
MFCC_SHAPE = X_mfcc_train.shape[1:3]

print("Training Mel shape:", MEL_SHAPE)
print("Training MFCC shape:", MFCC_SHAPE)


Training Mel shape: (128, 216)
Training MFCC shape: (40, 216)


In [64]:
import librosa
import numpy as np

SR = 22050
DURATION = 5
SAMPLES = SR * DURATION
N_MELS = MEL_SHAPE[0]
N_MFCC = MFCC_SHAPE[0]

def fix_length(spec, target_frames):
    if spec.shape[1] < target_frames:
        pad_width = target_frames - spec.shape[1]
        spec = np.pad(spec, ((0,0),(0,pad_width)))
    else:
        spec = spec[:, :target_frames]
    return spec

def extract_features(audio_path):
    y, sr = librosa.load(audio_path, sr=SR)

    if len(y) < SAMPLES:
        y = np.pad(y, (0, SAMPLES - len(y)))
    else:
        y = y[:SAMPLES]

    # Mel spectrogram
    mel = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=N_MELS)
    mel = librosa.power_to_db(mel, ref=np.max)
    mel = fix_length(mel, MEL_SHAPE[1])
    mel = (mel - mel.min()) / (mel.max() - mel.min() + 1e-6)
    mel = mel[..., np.newaxis]

    # MFCC
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=N_MFCC)
    mfcc = fix_length(mfcc, MFCC_SHAPE[1])
    mfcc = (mfcc - mfcc.min()) / (mfcc.max() - mfcc.min() + 1e-6)
    mfcc = mfcc[..., np.newaxis]

    return mel, mfcc


In [66]:
from google.colab import files

uploaded = files.upload()


Saving Bartletts Tinamou4.mp3 to Bartletts Tinamou4.mp3


In [69]:
import librosa
import numpy as np

def extract_features_fixed(audio_path, mel_bins=128, mfcc_bins=20, time_steps=216):
    y, sr = librosa.load(audio_path, sr=None)

    # Trim or pad audio to fixed length
    target_len = time_steps * 512  # 512 hop length
    if len(y) < target_len:
        y = np.pad(y, (0, target_len - len(y)))
    else:
        y = y[:target_len]

    # Mel
    mel = librosa.feature.melspectrogram(y, sr=sr, n_mels=mel_bins)
    mel = librosa.power_to_db(mel)

    # MFCC
    mfcc = librosa.feature.mfcc(y, sr=sr, n_mfcc=mfcc_bins)

    # Normalize
    mel = (mel - mel.min()) / (mel.max() - mel.min())
    mfcc = (mfcc - mfcc.min()) / (mfcc.max() - mfcc.min())

    # Expand dims
    mel = np.expand_dims(mel, axis=-1)
    mfcc = np.expand_dims(mfcc, axis=-1)

    return mel, mfcc


TypeError: melspectrogram() takes 0 positional arguments but 1 positional argument (and 1 keyword-only argument) were given