In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential

In [None]:
# Download the ESC-50 archive into ./datasets (relative to the current working
# directory) and unpack it there; `esc50` holds the path returned by get_file.
esc50 = tf.keras.utils.get_file(
    fname='esc-50.zip',
    origin='https://github.com/karoldvl/ESC-50/archive/master.zip',
    extract=True,
    cache_subdir='datasets',
    cache_dir='./',
)

In [None]:
# Absolute locations of the audio folder and the metadata CSV inside the
# unpacked archive. The download cell used cache_dir='./', so these paths only
# resolve when the notebook's working directory is /content.
# NOTE(review): the `esc-50_extracted` directory name presumably depends on how
# the installed Keras version unpacks archives - confirm when upgrading.
base_data_path = '/content/datasets/esc-50_extracted/ESC-50-master/audio'
esc50_csv = '/content/datasets/esc-50_extracted/ESC-50-master/meta/esc50.csv'

import pandas as pd

# Load the ESC-50 metadata table into a DataFrame.
data = pd.read_csv(filepath_or_buffer=esc50_csv)

In [None]:
import tensorflow_hub as hub
import librosa
import numpy as np

# Load the YAMNet model from TensorFlow Hub; later cells call it on waveforms
# and use the second of its three return values (the embeddings).
yamnet_model = hub.load('https://tfhub.dev/google/yamnet/1')

In [None]:
# Preview the first rows of the metadata table as loaded from the CSV.
data.head()

Unnamed: 0,filename,fold,target,category,esc10,src_file,take
0,1-100032-A-0.wav,1,0,dog,True,100032,A
1,1-100038-A-14.wav,1,14,chirping_birds,False,100038,A
2,1-100210-A-36.wav,1,36,vacuum_cleaner,False,100210,A
3,1-100210-B-36.wav,1,36,vacuum_cleaner,False,100210,B
4,1-101296-A-19.wav,1,19,thunderstorm,False,101296,A


In [None]:
# Count how many clips each category has, to check the class balance.
data['category'].value_counts()

Unnamed: 0_level_0,count
category,Unnamed: 1_level_1
dog,40
chirping_birds,40
vacuum_cleaner,40
thunderstorm,40
door_wood_knock,40
can_opening,40
crow,40
clapping,40
fireworks,40
chainsaw,40


In [None]:
import os
# Class names in alphabetical order; a name's position in this list is its id.
all_classes = sorted(data['category'].unique())
map_class_to_id = dict(zip(all_classes, range(len(all_classes))))

# Overwrite the `target` column read from the CSV with the alphabetical ids so
# that labels and `all_classes` indices agree everywhere downstream.
data['target'] = data['category'].map(map_class_to_id)
def full_path(filename):
    """Return the path of the clip `filename` inside `base_data_path`."""
    clip_path = os.path.join(base_data_path, filename)
    return clip_path

# Replace every bare file name in the table with its path under base_data_path.
data['filename'] = data['filename'].apply(full_path)


In [None]:
import tensorflow as tf
import tensorflow_hub as hub
import pandas as pd
import numpy as np
import scipy.signal
import soundfile as sf

# Dataset of (file path, class id, fold) triples - one element per table row.
filenames, targets, folds = data['filename'], data['target'], data['fold']
main_ds = tf.data.Dataset.from_tensor_slices((filenames, targets, folds))

def load_wav_16k_mono(filename):
    """Read an audio file and return it as a mono float32 tensor at 16 kHz.

    Args:
        filename: eager string tensor holding the file path; it is unwrapped
            with ``.numpy().decode()``, so this function must run eagerly
            (it is invoked through ``tf.py_function``).

    Returns:
        A float32 tensor with the resampled waveform.
    """
    path = filename.numpy().decode()
    samples, sample_rate = sf.read(path)
    if samples.ndim > 1:
        # More than one channel: average over axis 1 to get a single channel.
        samples = np.mean(samples, axis=1)
    # Keep the clip duration constant: scale the sample count by 16000 / rate.
    target_len = int(16000 * len(samples) / sample_rate)
    resampled = scipy.signal.resample(samples, target_len)
    return tf.convert_to_tensor(resampled, dtype=tf.float32)

def tf_load_wav(filename, label, fold):
    """Dataset-map wrapper that decodes one clip via ``load_wav_16k_mono``.

    Args:
        filename: scalar string tensor with the path of the audio file.
        label: class id of the clip, passed through unchanged.
        fold: fold number of the clip, passed through unchanged.

    Returns:
        A tuple ``(audio, label, fold)`` where ``audio`` is a 1-D float32
        waveform tensor.
    """
    audio = tf.py_function(load_wav_16k_mono, [filename], tf.float32)
    # tf.py_function cannot infer the static shape of its result. The loader
    # always produces a 1-D waveform (multi-channel audio is averaged), so
    # declare that rank explicitly for the ops that consume this tensor.
    audio.set_shape([None])
    return audio, label, fold

# Decode the clips lazily as the dataset is iterated.
main_ds = main_ds.map(tf_load_wav)

yamnet_model_handle = 'https://tfhub.dev/google/yamnet/1'
# The same model is already loaded by an earlier cell; reuse that instance
# instead of loading it again, and load here only if it is not defined yet so
# this cell still works on its own.
if 'yamnet_model' not in globals():
    yamnet_model = hub.load(yamnet_model_handle)

def extract_embedding(wav_data, label, fold):
    """Run YAMNet on one waveform and pair every embedding with its label/fold.

    Args:
        wav_data: waveform tensor for a single clip.
        label: class id of the clip.
        fold: fold number of the clip.

    Returns:
        ``(embeddings, labels, folds)``, where ``labels`` and ``folds`` repeat
        the clip's label and fold once per row of ``embeddings``.
    """
    # The model returns three values; only the middle one is used here.
    _scores, embeddings, _spectrogram = yamnet_model(wav_data)
    num_frames = tf.shape(embeddings)[0]
    frame_labels = tf.repeat(label, num_frames)
    frame_folds = tf.repeat(fold, num_frames)
    return embeddings, frame_labels, frame_folds

# After unbatch() each dataset element is a single (embedding, label, fold) row
# rather than one whole clip.
main_ds = main_ds.map(extract_embedding).unbatch()


In [None]:
# Cache the (embedding, label, fold) rows so the audio decoding and YAMNet
# pass are not repeated on every epoch.
cached_ds = main_ds.cache()

def remove_fold(embedding, label, fold):
    """Drop the fold id, which is only needed to split the data."""
    return embedding, label

# Split by fold: folds below 4 train, fold 4 validates, fold 5 tests.
train_ds = cached_ds.filter(lambda emb, label, fold: fold < 4).map(remove_fold)
val_ds = cached_ds.filter(lambda emb, label, fold: fold == 4).map(remove_fold)
test_ds = cached_ds.filter(lambda emb, label, fold: fold == 5).map(remove_fold)

# Only the training split is shuffled; all three are batched and prefetched.
train_ds = train_ds.shuffle(1000).batch(32).prefetch(tf.data.AUTOTUNE)
val_ds = val_ds.batch(32).prefetch(tf.data.AUTOTUNE)
test_ds = test_ds.batch(32).prefetch(tf.data.AUTOTUNE)


In [None]:
# One output unit per distinct class id in the metadata table.
num_classes = data['target'].unique().size

# Classification head over 1024-dimensional embeddings: one hidden ReLU layer
# followed by a linear layer that outputs raw logits (no softmax).
model = tf.keras.Sequential()
model.add(tf.keras.layers.Input(shape=(1024,), name='embedding'))
model.add(tf.keras.layers.Dense(512, activation='relu'))
model.add(tf.keras.layers.Dense(num_classes))

# from_logits=True matches the linear output layer above.
model.compile(
    optimizer='adam',
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=['accuracy'],
)

model.summary()


In [None]:
# NOTE(review): this repeats the compile call of the model-definition cell with
# the same loss, optimizer and metrics.
model.compile(
    optimizer="adam",
    metrics=['accuracy'],
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
)

# Stop training once the monitored value has not improved for 3 epochs and
# roll the model back to its best weights.
# NOTE(review): this monitors the training 'loss', not 'val_loss' - confirm
# that ignoring the validation split here is intended.
callback = tf.keras.callbacks.EarlyStopping(
    restore_best_weights=True,
    patience=3,
    monitor='loss',
)

In [None]:
# Train the classification head on the training split and evaluate on the
# validation split after every epoch; `history` keeps the per-epoch metrics.
history = model.fit(
    train_ds,
    epochs=3,
    validation_data=val_ds,
    # Keras documents `callbacks` as a list of Callback instances.
    callbacks=[callback],
)

Epoch 1/3
    372/Unknown [1m112s[0m 285ms/step - accuracy: 0.5548 - loss: 2.0113



[1m375/375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m113s[0m 285ms/step - accuracy: 0.5557 - loss: 2.0056 - val_accuracy: 0.6133 - val_loss: 1.4750
Epoch 2/3
[1m375/375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 27ms/step - accuracy: 0.7034 - loss: 1.0484 - val_accuracy: 0.6145 - val_loss: 1.4639
Epoch 3/3
[1m375/375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 15ms/step - accuracy: 0.7330 - loss: 0.9249 - val_accuracy: 0.6252 - val_loss: 1.4332


In [None]:
import soundfile as sf
import scipy.signal
import tensorflow_hub as hub

def load_wav_16k(filename):
    """Read an audio file from `filename` and return a mono float32 tensor at 16 kHz.

    Plain-Python counterpart of ``load_wav_16k_mono``: it takes an ordinary
    path instead of a string tensor.
    """
    samples, sample_rate = sf.read(filename)
    if samples.ndim > 1:
        # Average over axis 1 to collapse multi-channel audio to one channel.
        samples = samples.mean(axis=1)
    # Scale the sample count by 16000 / rate so the duration is preserved.
    target_len = int(16000 * len(samples) / sample_rate)
    resampled = scipy.signal.resample(samples, target_len)
    return tf.convert_to_tensor(resampled, dtype=tf.float32)

# Classify a single clip: waveform -> YAMNet embeddings -> trained head.
# Build the path from base_data_path so the dataset location is defined in
# one place only.
wav_path = os.path.join(base_data_path, '1-100038-A-14.wav')
waveform = load_wav_16k(wav_path)

yamnet_model_handle = 'https://tfhub.dev/google/yamnet/1'
# Reuse the model instance loaded earlier in the notebook; load it here only
# if this cell is run in a kernel where it is not defined yet.
if 'yamnet_model' not in globals():
    yamnet_model = hub.load(yamnet_model_handle)
_, embeddings, _ = yamnet_model(waveform)

# The head scores every embedding row separately; average the logits over the
# rows to get one prediction for the whole clip.
logits = model(embeddings)
mean_logits = tf.reduce_mean(logits, axis=0)
predicted_class_index = tf.argmax(mean_logits)
# Class ids are positions in `all_classes` (map_class_to_id was enumerated
# from it), so index the list directly.
predicted_class = all_classes[int(predicted_class_index)]

print(f"🔊 Predicted class: {predicted_class}")


🔊 Predicted class: chirping_birds
