Loading and Exploring Audio Metadata

In [1]:
# Import the necessary libraries
import librosa  # Library for audio and music processing
import pandas as pd  # Library for data manipulation and analysis
import os  # Library for interacting with the operating system

# Define the path to the audio dataset
audio_dataset_path = 'UrbanSound8K/audio'

# Load the metadata from the CSV file
metadata = pd.read_csv("UrbanSound8K/metadata/UrbanSound8k.csv")

# Display the first 10 rows of the metadata dataframe
metadata.head()

Unnamed: 0,slice_file_name,fsID,start,end,salience,fold,classID,class
0,100032-3-0-0.wav,100032,0.0,0.317551,1,5,3,dog_bark
1,100263-2-0-117.wav,100263,58.5,62.5,1,5,2,children_playing
2,100263-2-0-121.wav,100263,60.5,64.5,1,5,2,children_playing
3,100263-2-0-126.wav,100263,63.0,67.0,1,5,2,children_playing
4,100263-2-0-137.wav,100263,68.5,72.5,1,5,2,children_playing


Extracting MFCC Features from Audio Files

In [2]:
# Import the necessary library
import numpy as np  # Library for numerical operations

# Define a function to extract features from an audio file
def feature_extractor(file):
    # Load the audio file with a specific sample rate conversion
    audio, sample_rate = librosa.load(file, res_type='kaiser_fast') 
    
    # Extract MFCC (Mel-frequency cepstral coefficients) features from the audio
    mfccs_features = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=40)
    
    # Scale the MFCC features by taking the mean across the time axis
    mfccs_scaled_features = np.mean(mfccs_features.T, axis=0)
    
    # Return the scaled MFCC features
    return mfccs_scaled_features


Extracting Features from Audio Dataset with Progress Indicator

In [3]:
# Import the tqdm library for displaying a progress bar
from tqdm import tqdm

# Initialize an empty list to store the extracted features
extracted_features = []

# Iterate over each row in the metadata dataframe with a progress bar
for index_num, row in tqdm(metadata.iterrows()):
    # Construct the file path for each audio file
    file_name = os.path.join(os.path.abspath(audio_dataset_path), 'fold' + str(row["fold"]), str(row["slice_file_name"]))
    
    # Get the class label for the current audio file
    final_class_labels = row['class']
    
    # Extract features from the audio file using the feature_extractor function
    data = feature_extractor(file_name)
    
    # Append the extracted features and the class label to the list
    extracted_features.append([data, final_class_labels])


0it [00:00, ?it/s]

8732it [07:54, 18.41it/s]


Converting Extracted Audio Features to DataFrame

In [4]:
### Converting Extracted Features to Pandas DataFrame ###

# Convert the list of extracted features into a pandas DataFrame
extracted_features_df = pd.DataFrame(extracted_features, columns=['feature', 'class'])

# Display the first 10 rows of the DataFrame
extracted_features_df.head(10)


Unnamed: 0,feature,class
0,"[-217.35526, 70.22338, -130.38527, -53.282898,...",dog_bark
1,"[-424.09818, 109.34077, -52.919525, 60.86475, ...",children_playing
2,"[-458.79114, 121.38419, -46.520657, 52.00812, ...",children_playing
3,"[-413.89984, 101.66373, -35.42945, 53.036358, ...",children_playing
4,"[-446.60352, 113.68541, -52.402214, 60.302044,...",children_playing
5,"[-446.8255, 117.011925, -33.7923, 55.406204, 2...",children_playing
6,"[-476.60767, 119.41842, -28.514032, 55.966988,...",children_playing
7,"[-464.08258, 116.3101, -28.82692, 49.44204, -4...",children_playing
8,"[-471.3208, 125.25887, -36.935387, 57.428547, ...",children_playing
9,"[-196.822, 113.993126, -13.813408, 0.40220967,...",car_horn


Splitting Dataset into Features and Labels

In [5]:
# Split Dataset into Features and Labels

# Convert the 'feature' column into a numpy array
x = np.array(extracted_features_df['feature'].tolist())

# Convert the 'class' column into a numpy array
y = np.array(extracted_features_df['class'].tolist())

# Print the shape of the features array
print(x.shape)

# Print the shape of the labels array
print(y.shape)


(8732, 40)
(8732,)


One-Hot Encoding Class Labels

In [6]:
# Convert Class Labels to One-Hot Encoding

# Convert the class labels into one-hot encoded format
y = np.array(pd.get_dummies(y))

# Print the shape of the one-hot encoded labels array
y.shape


(8732, 10)

Splitting Dataset into Training and Testing Sets

In [7]:
# Train-Test Split

# Import the train_test_split function from scikit-learn
from sklearn.model_selection import train_test_split

# Split the dataset into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0, stratify=y)

# Print the shape of the training features array
print(x_train.shape)

# Print the shape of the training labels array
print(y_train.shape)

# Print the shape of the testing features array
print(x_test.shape)



(6985, 40)
(6985, 10)
(1747, 40)


Building a Neural Network Model

In [8]:
############################## MODEL BUILDING ###############################

# Import necessary libraries for building the neural network model
import tensorflow as tf
import keras as keras
from tensorflow.keras.models import Sequential  # For creating a linear stack of layers
from tensorflow.keras.layers import Dense, Dropout, Activation, Flatten  # Core layers for the model
from tensorflow.keras.optimizers import Adam  # Optimizer for compiling the model
from sklearn import metrics  # For model evaluation metrics


Determining the Number of Output Labels

In [9]:
# Determine the Number of Output Labels

# Calculate the number of output labels based on the shape of the y array (one-hot encoded labels)
num_labels = y.shape[1]

# Print the number of output labels
print(num_labels)


10


Creating a Sequential Neural Network Model

In [10]:
# Create a Sequential model
model = Sequential()

# Add layers to the model

# Layer 1
model.add(Dense(100, input_shape=(40,)))  # Add a Dense layer with 100 units and input shape of (40,)
model.add(Activation('relu'))  # Apply ReLU activation function
model.add(Dropout(0.25))  # Apply dropout with a rate of 25%

# Layer 2
model.add(Dense(1000))  # Add a Dense layer with 1000 units
model.add(Activation('relu'))  # Apply ReLU activation function
model.add(Dropout(0.01))  # Apply dropout with a rate of 1%

# Layer 3
model.add(Dense(500))  # Add a Dense layer with 500 units
model.add(Activation('relu'))  # Apply ReLU activation function
model.add(Dropout(0.01))  # Apply dropout with a rate of 1%

# Output Layer
model.add(Dense(num_labels))  # Add a Dense layer with the number of output labels
model.add(Activation('softmax'))  # Apply softmax activation for multi-class classification


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [11]:
model.summary()

Compiling the Neural Network Model

In [12]:
# Compile the Model

# Compile the model with categorical crossentropy loss, accuracy metric, and Adam optimizer
model.compile(loss='categorical_crossentropy', metrics=['accuracy'], optimizer='adam')


Training the Neural Network Model

In [13]:
import os
from tensorflow.keras.callbacks import ModelCheckpoint
from datetime import datetime

# Create the directory if it does not exist
os.makedirs('saved_models', exist_ok=True)

# Define the number of epochs and batch size for training
num_epochs = 100
num_batch_size = 32

# Define the ModelCheckpoint callback to save the best model during training
checkpointer = ModelCheckpoint(filepath='saved_models/audio_classification.keras', 
                               verbose=1, save_best_only=True)

# Record the start time for training duration calculation
start = datetime.now()

# Train the model using the defined parameters and callbacks
model.fit(x_train, y_train, batch_size=num_batch_size, epochs=num_epochs, 
          validation_data=(x_test, y_test), callbacks=[checkpointer], verbose=1)

# Calculate and print the training duration
duration = datetime.now() - start
print('Training completed in time:', duration)


Epoch 1/100
[1m209/219[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 2ms/step - accuracy: 0.2946 - loss: 5.6119
Epoch 1: val_loss improved from inf to 1.47994, saving model to saved_models/audio_classification.keras
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.2980 - loss: 5.4708 - val_accuracy: 0.4585 - val_loss: 1.4799
Epoch 2/100
[1m211/219[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 2ms/step - accuracy: 0.4872 - loss: 1.4496
Epoch 2: val_loss improved from 1.47994 to 1.13300, saving model to saved_models/audio_classification.keras
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.4885 - loss: 1.4471 - val_accuracy: 0.6348 - val_loss: 1.1330
Epoch 3/100
[1m204/219[0m [32m━━━━━━━━━━━━━━━━━━[0m[37m━━[0m [1m0s[0m 2ms/step - accuracy: 0.5729 - loss: 1.2243
Epoch 3: val_loss improved from 1.13300 to 1.02770, saving model to saved_models/audio_classification.keras
[1m219/219[0m

Evaluating the Neural Network Model

In [14]:
# Evaluate the Model on Test Data

# Evaluate the model on the test data to get the test accuracy
test_accuracy = model.evaluate(x_test, y_test, verbose=0)

# Print the test accuracy
print(test_accuracy)


[0.32199984788894653, 0.926159143447876]


Making Predictions with the Trained Model

In [1]:
# Define the filename of the audio file for prediction
filename = 'UrbanSound8K\audio\fold3\17615-3-0-4.wav'

# Extract features from the audio file for prediction
prediction_feature = feature_extractor(filename)

# Reshape the extracted feature for prediction
prediction_feature = prediction_feature.reshape(1, -1)

# Predict the class probabilities for the input feature
prediction_probabilities = model.predict(prediction_feature)

# Determine the predicted class based on the highest probability
predicted_class = prediction_probabilities.argmax(axis=-1)

# Print the predicted class
print(predicted_class)


NameError: name 'feature_extractor' is not defined

In [16]:
# Get unique values from the 'class' column in the metadata DataFrame
unique_classes = metadata['class'].unique()

# Print the unique classes
print(unique_classes)


['dog_bark' 'children_playing' 'car_horn' 'air_conditioner' 'street_music'
 'gun_shot' 'siren' 'engine_idling' 'jackhammer' 'drilling']


Label Encoding and Decoding for Class Names

In [17]:
# Import necessary libraries
from sklearn.preprocessing import LabelEncoder
import numpy as np

# Get unique class names from the metadata
class_names = metadata['class'].unique()

# Create and fit the LabelEncoder to map class names to numerical labels
label_encoder = LabelEncoder()
label_encoder.fit(class_names)

# Inverse transform the predicted class label to get the predicted class name
predicted_class_name = label_encoder.inverse_transform(predicted_class)

# Print the predicted class name
print(predicted_class_name)


['dog_bark']
