In [None]:
import tensorflow as tf
import tensorflow_hub as hub
import numpy as np
import pandas as pd
import librosa
import os
import shutil
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
from tensorflow.keras import layers, models, callbacks

In [None]:
!apt-get install unrar

In [None]:
!unrar x "/content/drive/MyDrive/audio_dataset.rar"  "/content"

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# @title Complete YAMNet Training Pipeline
# ---------------------------------------------------------
# 1. SETUP & IMPORTS
# ---------------------------------------------------------
import warnings

# Silence every Python warning for the rest of the session so cell output stays short.
warnings.simplefilter('ignore')

# Record the TensorFlow build this run used.
print(f"TensorFlow Version: {tf.__version__}")

TensorFlow Version: 2.19.0


In [None]:
# ---------------------------------------------------------
# 2. LOAD YAMNET & PREPARE DATASET
# ---------------------------------------------------------
# TensorFlow Hub handle of the pretrained YAMNet model.
yamnet_model_handle = 'https://tfhub.dev/google/yamnet/1'

# Fetch the model from TensorFlow Hub; process_audio() calls it on raw waveforms.
print("Loading YAMNet model...")
yamnet_model = hub.load(yamnet_model_handle)

Loading YAMNet model...


In [None]:
def process_audio(file_path):
    """Turn one audio file into a single fixed-length YAMNet embedding.

    The file is decoded by librosa as a mono waveform resampled to 16 kHz,
    passed through the globally loaded ``yamnet_model``, and the per-frame
    embeddings are averaged over the frame axis.

    Args:
        file_path: Path of the audio file to read.

    Returns:
        NumPy array holding the mean of the frame embeddings (axis 0 reduced).
    """
    # librosa returns (samples, sample_rate); only the samples are needed.
    waveform = librosa.load(file_path, sr=16000, mono=True)[0]

    # The model returns (scores, embeddings, spectrogram); keep the embeddings.
    _, frame_embeddings, _ = yamnet_model(waveform)

    # Collapse the frame axis so every file yields one vector regardless of length.
    return tf.reduce_mean(frame_embeddings, axis=0).numpy()

In [None]:
import os
import numpy as np

# POINT THIS TO YOUR PARENT FOLDER
# Expected layout: <dataset_path>/<category_name>/<audio files>
dataset_path = "audio_dataset/dataset"

print("Extracting features... (this may take a moment)")
X = []
y = []
failed_train_files = []  # paths that raised during loading / embedding

# os.listdir() returns entries in arbitrary order. Sorting keeps the sample
# order stable between runs, which the seeded train_test_split relies on.
for category_name in sorted(os.listdir(dataset_path)):
    category_dir = os.path.join(dataset_path, category_name)

    # Skip stray files at the top level (e.g. .DS_Store); only folders are categories.
    if not os.path.isdir(category_dir):
        continue

    for filename in sorted(os.listdir(category_dir)):
        # Only attempt files with a known audio extension.
        if not filename.lower().endswith(('.wav', '.mp3', '.flac', '.ogg', '.m4a')):
            continue

        file_path = os.path.join(category_dir, filename)
        try:
            embedding = process_audio(file_path)
        except Exception as e:
            # Best effort: report the unreadable file and carry on with the rest.
            print(f"Error processing {filename}: {e}")
            failed_train_files.append(file_path)
            continue

        X.append(embedding)
        y.append(category_name)  # the folder name becomes the label

# Fail here with a clear message rather than later inside the encoder / split.
if not X:
    raise RuntimeError(f"No audio files could be processed under '{dataset_path}'.")

X = np.array(X)
y = np.array(y)

print(f"Finished. Processed {len(X)} files across {len(np.unique(y))} categories.")
if failed_train_files:
    print(f"Skipped {len(failed_train_files)} file(s) that could not be processed.")

In [None]:
# Encode Labels: category names -> integer ids -> one-hot rows
le = LabelEncoder()
y_encoded = le.fit_transform(y)
y_categorical = tf.keras.utils.to_categorical(y_encoded)

# ---------------------------------------------------------
# 4. SPLIT DATA (Train / Validation)
# ---------------------------------------------------------
# Split: 80% Train, 20% Validation, stratified on the integer labels so both
# parts keep the same class balance. The test set is read from its own folder
# in a later cell and is not carved out of this data.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y_categorical,
    test_size=0.2,
    stratify=y_encoded,
    random_state=42,
)

print(f"Training shapes: {X_train.shape}")
print(f"Validation shapes: {X_val.shape}")


Training shapes: (1056, 1024)
Validation shapes: (264, 1024)


In [None]:
import os
import numpy as np

# POINT THIS TO YOUR PARENT FOLDER
# Expected layout: <dataset_path>/<category_name>/<audio files>
dataset_path = "audio_dataset_220/test"

print("Extracting features... (this may take a moment)")
X_test = []
y_test = []
failed_test_files = []  # paths that raised during loading / embedding

# os.listdir() returns entries in arbitrary order; sort so the test arrays are
# built in the same order on every run.
for category_name in sorted(os.listdir(dataset_path)):
    category_dir = os.path.join(dataset_path, category_name)

    # Skip stray files at the top level (e.g. .DS_Store); only folders are categories.
    if not os.path.isdir(category_dir):
        continue

    for filename in sorted(os.listdir(category_dir)):
        # Only attempt files with a known audio extension.
        if not filename.lower().endswith(('.wav', '.mp3', '.flac', '.ogg', '.m4a')):
            continue

        file_path = os.path.join(category_dir, filename)
        try:
            embedding = process_audio(file_path)
        except Exception as e:
            # Best effort: report the unreadable file and carry on with the rest.
            print(f"Error processing {filename}: {e}")
            failed_test_files.append(file_path)
            continue

        X_test.append(embedding)
        y_test.append(category_name)  # the folder name becomes the label

# Fail here with a clear message rather than later during encoding / evaluation.
if not X_test:
    raise RuntimeError(f"No audio files could be processed under '{dataset_path}'.")

X_test = np.array(X_test)
y_test = np.array(y_test)

print(f"Finished. Processed {len(X_test)} files across {len(np.unique(y_test))} categories.")
if failed_test_files:
    print(f"Skipped {len(failed_test_files)} file(s) that could not be processed.")

In [None]:
# Sanity check: how many entries sit in one of the test category folders.
adult_sound_entries = os.listdir('audio_dataset_220/test/adult_sound')
print(len(adult_sound_entries))

In [None]:
# Encode test labels with the encoder that was fitted on the training labels.
# Re-fitting a new LabelEncoder here would overwrite `le` and could assign
# different integer ids to the classes than the ones the model was trained on.
# transform() raises ValueError if the test set contains a label never seen in training.
y_test_encoded = le.transform(y_test)

# Fix the one-hot width to the training class count so it matches the model's
# output layer even when some class has no test samples.
y_test_categorical = tf.keras.utils.to_categorical(
    y_test_encoded, num_classes=len(le.classes_)
)


print(f"Test shapes: {X_test.shape}")



In [None]:
# ---------------------------------------------------------
# 5. BUILD MODEL
# ---------------------------------------------------------
# Seed Python, NumPy and TensorFlow so weight initialisation, dropout and batch
# shuffling are repeatable between runs (as far as the backend allows).
tf.keras.utils.set_random_seed(42)

# One output unit per class; read it from the one-hot training labels instead
# of hard-coding the count so the model follows the dataset.
num_classes = y_train.shape[1]

model = models.Sequential([
    layers.Input(shape=(1024,)),                 # YAMNet embedding size
    layers.Dense(128, activation='relu'),
    layers.BatchNormalization(),                 # Stabilize learning
    layers.Dropout(0.5),                         # Reduce overfitting
    layers.Dense(64, activation='relu'),
    layers.Dropout(0.2),
    layers.Dense(num_classes, activation='softmax')  # Output layer
])

model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
              loss='categorical_crossentropy',
              metrics=['accuracy'])

# ---------------------------------------------------------
# 6. CALLBACKS (Early Stopping, Checkpoint)
# ---------------------------------------------------------
# Stop if validation loss doesn't improve for 15 epochs, then roll the model
# back to the weights of its best epoch.
early_stopping = callbacks.EarlyStopping(
    monitor='val_loss',
    patience=15,
    restore_best_weights=True,
    verbose=1
)

# Save the best model only (lowest validation loss so far)
checkpoint = callbacks.ModelCheckpoint(
    'best_audio_model.keras',
    monitor='val_loss',
    save_best_only=True,
    verbose=1
)

# Halve the learning rate after 3 epochs without val_loss improvement,
# never going below 1e-5.
reduce_lr = callbacks.ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.5,
    patience=3,
    min_lr=0.00001,
    verbose=1
)

In [None]:
# ---------------------------------------------------------
# 7. TRAIN
# ---------------------------------------------------------
print("\n--- Starting Training ---")

# Up to 150 epochs; the early-stopping callback may end training sooner.
# `history` keeps the per-epoch metrics returned by fit().
history = model.fit(
    x=X_train,
    y=y_train,
    validation_data=(X_val, y_val),
    batch_size=32,
    epochs=150,
    callbacks=[early_stopping, checkpoint, reduce_lr],
)

In [None]:
# ---------------------------------------------------------
# 8. EVALUATION & CLASSIFICATION REPORT
# ---------------------------------------------------------
print("\n--- Evaluating on Test Set ---")

# evaluate() returns the loss followed by the compiled metrics (accuracy here).
test_loss, test_acc = model.evaluate(
    X_test,
    y_test_categorical,
)
print(f"Test Accuracy: {test_acc*100:.2f}%")

In [None]:

# Generate Predictions
# Each row of y_pred is a softmax distribution; argmax gives the class index.
y_pred = model.predict(X_test)
y_pred_classes = np.argmax(y_pred, axis=1)
y_true_classes = np.argmax(y_test_categorical, axis=1)

# Classification Report
# Pass the label ids explicitly so the report always has one row per encoder
# class, in the same order as target_names, even if a class never appears in
# the test labels or the predictions.
class_ids = np.arange(len(le.classes_))
print("\n--- Classification Report ---")
print(classification_report(
    y_true_classes,
    y_pred_classes,
    labels=class_ids,
    target_names=le.classes_,
))

In [None]:
# Confusion Matrix
# Fix the label set so the matrix is always len(le.classes_) x len(le.classes_)
# and lines up with the tick labels, even if a class is absent from the test
# labels or the predictions.
cm = confusion_matrix(
    y_true_classes,
    y_pred_classes,
    labels=np.arange(len(le.classes_)),
)

fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=le.classes_, yticklabels=le.classes_, ax=ax)
ax.set_xlabel('Predicted')
ax.set_ylabel('True')
ax.set_title('Confusion Matrix')
plt.show()

In [None]:


# ---------------------------------------------------------
# 9. INFERENCE FUNCTION
# ---------------------------------------------------------
def predict_sound(audio_path):
    """Classify a single audio file with the trained model.

    Args:
        audio_path: Path of the audio file to classify.

    Returns:
        Tuple ``(label, confidence)``: the entry of ``le.classes_`` with the
        highest predicted score, and that score.
    """
    # One embedding per file, shaped as a batch of one row for Keras.
    features = np.reshape(process_audio(audio_path), (1, -1))

    probabilities = model.predict(features, verbose=0)
    best_index = np.argmax(probabilities)
    return le.classes_[best_index], probabilities.max()

print("\nExample Inference:")
# Pick one test sample at random. The source filename is not stored alongside
# X_test, so the stored embedding is fed to the model directly instead of
# going through predict_sound().
test_idx = np.random.randint(len(X_test))
pred = model.predict(X_test[test_idx:test_idx + 1], verbose=0)
print(f"Predicted: {le.classes_[np.argmax(pred)]}, True: {le.classes_[y_true_classes[test_idx]]}")

In [None]:
# @title Convert Model to TensorFlow Lite
import tensorflow as tf
import numpy as np

# 1. LOAD THE TRAINED MODEL
# We load the best version saved during training (written by the ModelCheckpoint callback).
try:
    model = tf.keras.models.load_model('best_audio_model.keras')
    print("Loaded 'best_audio_model.keras' successfully.")
except Exception as err:
    # Catch Exception rather than using a bare except, so KeyboardInterrupt is
    # not swallowed, and show the real cause instead of assuming a missing file.
    print(f"Could not load 'best_audio_model.keras': {err!r}")
    print("Make sure you ran the training script above first!")
    if 'model' not in globals():
        # Nothing to convert: stop here instead of failing later with a NameError.
        raise
    print("Continuing with the in-memory model from the training cells.")

# ---------------------------------------------------------
# OPTION A: STANDARD CONVERSION (Float32)
# Best for: Android, iOS, Raspberry Pi
# ---------------------------------------------------------
# Convert the Keras model with the converter's default settings.
converter = tf.lite.TFLiteConverter.from_keras_model(model)
tflite_model = converter.convert()

# convert() returns the serialized model as bytes; write them out unchanged.
with open('audio_classifier_float.tflite', 'wb') as float_file:
    float_file.write(tflite_model)

print(f"Standard Model Saved: {len(tflite_model) / 1024:.2f} KB")

# ---------------------------------------------------------
# OPTION B: QUANTIZED CONVERSION (Int8 / Dynamic)
# Best for: ESP32, Arduino, Microcontrollers
# ---------------------------------------------------------
# Same model, converted with size optimization turned on (weights become
# 8-bit integers).
# NOTE(review): no representative dataset is supplied, so this is presumably
# dynamic-range quantization - confirm it suits the microcontroller targets above.
converter_opt = tf.lite.TFLiteConverter.from_keras_model(model)
converter_opt.optimizations = [tf.lite.Optimize.DEFAULT]
tflite_model_quant = converter_opt.convert()

with open('audio_classifier_quantized.tflite', 'wb') as quant_file:
    quant_file.write(tflite_model_quant)

print(f"Quantized Model Saved: {len(tflite_model_quant) / 1024:.2f} KB")

# ---------------------------------------------------------
# VERIFICATION: TEST THE TFLITE MODEL
# Run one prediction through the interpreter to make sure the .tflite file works
# ---------------------------------------------------------
def test_tflite(tflite_path, test_data):
    """Run one sample through a .tflite model and return the winning class index.

    Args:
        tflite_path: Path of the .tflite file to load.
        test_data: Array-like with 1024 values (a single embedding).

    Returns:
        Index of the largest value in the model's first output tensor.
    """
    runner = tf.lite.Interpreter(model_path=tflite_path)
    runner.allocate_tensors()

    # Only the first input and first output tensor are used.
    input_index = runner.get_input_details()[0]['index']
    output_index = runner.get_output_details()[0]['index']

    # Cast to float32 and shape as a batch containing one 1024-value row.
    sample = np.array(test_data, dtype=np.float32).reshape(1, 1024)

    runner.set_tensor(input_index, sample)
    runner.invoke()

    return np.argmax(runner.get_tensor(output_index))

# Verify the quantized model on the first validation sample (X_val[0]).
# The conversion is judged by comparing the TFLite prediction with the Keras
# model's prediction for the same input. Comparing against the ground-truth
# label would report an ordinary misclassification as a conversion problem.
if 'X_val' in locals():
    sample_idx = 0
    sample_input = X_val[sample_idx]
    true_label = np.argmax(y_val[sample_idx])

    keras_idx = np.argmax(model.predict(sample_input.reshape(1, -1), verbose=0))
    pred_idx = test_tflite('audio_classifier_quantized.tflite', sample_input)

    print("\n--- Verification Test ---")
    print(f"True Class Index: {true_label}")
    print(f"Keras Prediction: {keras_idx}")
    print(f"TFLite Prediction: {pred_idx}")

    if keras_idx == pred_idx:
        print("✅ Conversion Successful!")
    else:
        print("⚠️ Prediction Mismatch (Check quantization settings)")