# Audio Detector Model

##### Dataset downloaded from Kaggle: https://www.kaggle.com/datasets/sripaadsrinivasan/audio-mnist

In [None]:
# Import libraries

import os
import numpy as np
import pandas as pd
import random

import matplotlib.pyplot as plt
from IPython.display import Audio, display
import librosa.display
plt.rcParams['figure.figsize'] = (10, 4)

import tensorflow as tf
from tensorflow.keras import layers, models

import sklearn
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

### Data Preparation

In [None]:
# Get audio paths and store them in a dataframe
folder_path = os.path.join(os.getcwd(), 'recordings')

# Keep only real speaker directories: hidden entries are skipped as before, and
# plain files sitting next to the speaker folders are skipped too, because
# os.listdir() would raise NotADirectoryError on them.
# Sorting makes the row order independent of the filesystem's listing order,
# so the seeded train/test split further down selects the same rows everywhere.
speaker_paths = sorted(
    os.path.join(folder_path, speaker_folder)
    for speaker_folder in os.listdir(folder_path)
    if not speaker_folder.startswith('.')
    and os.path.isdir(os.path.join(folder_path, speaker_folder))
)

# Hidden files inside a speaker folder (e.g. .DS_Store) are not recordings and
# would break the file-name parsing and audio loading in later cells.
audio_paths = [
    os.path.join(speaker_path, audio_file)
    for speaker_path in speaker_paths
    for audio_file in sorted(os.listdir(speaker_path))
    if not audio_file.startswith('.')
]

audio_df = pd.DataFrame({'audio_path': audio_paths})
audio_df

In [None]:
# Assign speaker and audio label to dataframe

def extract_speaker(path):
    """Return the speaker id encoded in a recording's file name.

    The file name is split on '_' and the second field is returned, so
    '.../3_07_12.wav' yields '07'.

    Args:
        path: Path to the recording; both '/' and '\\' separators are accepted.

    Returns:
        The second '_'-separated field of the file name, as a string.

    Raises:
        IndexError: If the file name contains no '_'.
    """
    # Normalise Windows separators first: splitting on '/' alone leaves the
    # directory part attached to the file name when the path uses '\\'.
    filename = path.replace('\\', '/').rsplit('/', 1)[-1]
    return filename.split('_')[1]

def extract_labels(path):
    """Return the audio label encoded in a recording's file name.

    The file name is split on '_' and the first field is returned, so
    '.../3_07_12.wav' yields '3'.

    Args:
        path: Path to the recording; both '/' and '\\' separators are accepted.

    Returns:
        The first '_'-separated field of the file name, as a string.
    """
    # Normalise Windows separators first: splitting on '/' alone would return
    # a directory fragment as the label when the path uses '\\'.
    filename = path.replace('\\', '/').rsplit('/', 1)[-1]
    return filename.split('_')[0]

# Derive both label columns from each recording's path. The column is accessed
# with [] rather than .get(): a missing 'audio_path' column then raises a clear
# KeyError instead of an AttributeError on None.
audio_df = audio_df.assign(
    speaker_label=audio_df['audio_path'].apply(extract_speaker),
    audio_label=audio_df['audio_path'].apply(extract_labels),
)

audio_df = audio_df.sort_values(by='audio_label')
audio_df

In [None]:
# Convert each audio file into a peak-normalised mel-spectrogram tensor

def spectrogram_to_tensor(path):
    """Load an audio file and return its mel spectrogram as a float32 tensor.

    The spectrogram is computed with librosa's default settings and scaled by
    its maximum value so the largest entry becomes 1.

    Args:
        path: Path of the audio file to load.

    Returns:
        A tf.Tensor of dtype float32 holding the scaled mel spectrogram.
    """
    audio, sr = librosa.load(path)
    spectrogram = librosa.feature.melspectrogram(y=audio, sr=sr)
    peak = np.max(spectrogram)
    # A silent clip has a peak of 0; dividing by it would fill the tensor with
    # NaNs, so such a spectrogram is returned unscaled (all zeros) instead.
    if peak > 0:
        spectrogram = spectrogram / peak
    return tf.convert_to_tensor(spectrogram, dtype=tf.float32)

# DataFrame.insert refuses to add a column that already exists, which made this
# cell fail on a second run. Dropping any previous copy first keeps the cell
# re-runnable while leaving the column at position 2.
if 'audio_tensor' in audio_df.columns:
    audio_df = audio_df.drop(columns='audio_tensor')

audio_df.insert(2, 'audio_tensor', audio_df['audio_path'].apply(spectrogram_to_tensor))
audio_df

### Data Exploration

In [None]:
# Display column types: audio_path, speaker_label and audio_label are parsed as
# strings, and audio_tensor stores one tensor object per row.

audio_df.dtypes

In [None]:
# Display some audio files and their spectrogram

# Fixed seed so the same recordings are shown on every run; the size is capped
# so a dataset with fewer than five rows does not raise.
audio_sample = audio_df.sample(min(5, len(audio_df)), random_state=42)

for _, row in audio_sample.iterrows():
    display(Audio(row['audio_path']))
    # Named `waveform` rather than `y` so re-running this cell never overwrites
    # the label variable `y` defined for the model.
    waveform, sr = librosa.load(row['audio_path'])
    spectrogram = librosa.feature.melspectrogram(y=waveform, sr=sr)
    # Convert power to decibels relative to the loudest bin for plotting.
    spectrogram_db = librosa.power_to_db(spectrogram, ref=np.max)
    librosa.display.specshow(spectrogram_db, sr=sr, x_axis='time', y_axis='mel')
    plt.colorbar(format='%+2.0f dB')
    plt.title(f"Spectrogram (Label: {row['audio_label']}, Speaker: {row['speaker_label']})")
    plt.show()

In [None]:
# Count how many spectrogram tensors share each shape and list the results

shapes = audio_df['audio_tensor'].map(lambda tensor: tensor.shape)
# Shapes are turned into plain tuples so identical shapes are tallied together.
shapes_tuples = shapes.map(tuple)
unique_shape_counts = shapes_tuples.value_counts()

print('Unique Shapes:')

for shape in unique_shape_counts.index:
    count = unique_shape_counts[shape]
    print(f'Shape: {shape}, Count: {count}')

In [None]:
# Resize the spectrogram tensors to same shapes

def resize_spectrogram(tensor, target_shape=(128, 32)):
    """Crop or pad a spectrogram to a fixed height and width.

    A trailing channel axis is appended before the tensor is handed to
    tf.image.resize_with_crop_or_pad, and the result is returned as a NumPy
    array.

    Args:
        tensor: Spectrogram to adjust.
        target_shape: Sequence whose first two items are the target height and
            width.

    Returns:
        NumPy array of the cropped/padded spectrogram, including the added
        channel axis.
    """
    height = target_shape[0]
    width = target_shape[1]
    with_channel = tf.expand_dims(tensor, -1)
    return tf.image.resize_with_crop_or_pad(with_channel, height, width).numpy()

# Apply the default (128, 32) target to every spectrogram; X is a Series whose
# elements are NumPy arrays with a trailing channel axis added by resize_spectrogram.
X = audio_df['audio_tensor'].apply(resize_spectrogram)

In [None]:
# Confirm that every resized spectrogram now has the same shape

fixed_shapes = X.map(lambda array: array.shape)
# Tuples are used as the tally key so identical shapes are counted together.
fixed_shapes_tuples = fixed_shapes.map(tuple)
fixed_shape_counts = fixed_shapes_tuples.value_counts()

print('Unique Shapes:')

for shape in fixed_shape_counts.index:
    count = fixed_shape_counts[shape]
    print(f'Shape: {shape}, Count: {count}')

### Neural Network Model

In [None]:
# Split the data into training and testing sets

X = np.stack(X)
# Labels are parsed from file names as strings. Casting them once here means
# training, evaluation and the prediction comparison all see integer class ids
# no matter which of the later cells have been run.
y = audio_df['audio_label'].astype(np.int32)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True, random_state=42)

# Sanity checks: features and labels stay aligned and no row is lost by the split.
assert len(X) == len(y)
assert len(X_train) + len(X_test) == len(X)

print('Size of dataset:', X.shape)
print('Size of training set:', X_train.shape)
print('Size of testing set:', X_test.shape)

In [None]:
# CNN model architecture

def AudioCNN(input_shape, num_classes=10):
    """Build the (uncompiled) convolutional classifier for spectrogram inputs.

    The network is three Conv2D + MaxPooling2D stages (32, 64 and 128 filters),
    followed by Flatten, two ReLU dense layers (128 and 64 units) and a softmax
    output layer.

    Args:
        input_shape: Shape of a single input example, without the batch axis.
        num_classes: Number of output classes; defaults to 10.

    Returns:
        A Keras Sequential model that still needs to be compiled.
    """
    model = models.Sequential([
        # An explicit Input layer replaces the `input_shape=` argument on the
        # first Conv2D, which newer Keras versions warn against.
        layers.Input(shape=input_shape),
        layers.Conv2D(32, (3, 3), activation='relu'),
        layers.MaxPooling2D((2, 2)),
        layers.Conv2D(64, (3, 3), activation='relu'),
        layers.MaxPooling2D((2, 2)),
        layers.Conv2D(128, (3, 3), activation='relu'),
        layers.MaxPooling2D((2, 2)),
        layers.Flatten(),
        layers.Dense(128, activation='relu'),
        layers.Dense(64, activation='relu'),
        layers.Dense(num_classes, activation='softmax'),
    ])
    return model

In [None]:
# Create the CNN model

# Seed TensorFlow's global RNG before the model is built so weight
# initialisation starts from the same state on every run.
# NOTE(review): this improves run-to-run repeatability but may not make
# training fully deterministic on every backend.
tf.random.set_seed(42)

# sparse_categorical_crossentropy expects integer class ids.
y_train = y_train.astype(np.int32)

input_shape = X_train[0].shape
model = AudioCNN(input_shape)

model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

# Keep the History object: it holds the per-epoch metrics, and assigning it
# stops the cell from echoing the object's repr as its output.
history = model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.1)

In [None]:
# Measure loss and accuracy on the held-out test set

# The loss used at compile time works on integer class ids.
y_test = y_test.astype(np.int32)

test_loss, test_acc = model.evaluate(x=X_test, y=y_test)
print(f'Test loss: {test_loss}')
print(f'Test accuracy: {test_acc}')

In [None]:
# Display correct and incorrect predictions

predictions = model.predict(X_test)
# The predicted class is the index of the highest softmax probability.
predicted_labels = np.argmax(predictions, axis=1)

# Compare against an integer NumPy copy of the labels. This keeps the comparison
# correct even if y_test still holds the string labels parsed from file names,
# and avoids mixing a positional array with an index-carrying Series.
true_labels = y_test.astype(np.int32).to_numpy()
is_correct = predicted_labels == true_labels

correct_idxs = np.flatnonzero(is_correct)
incorrect_idxs = np.flatnonzero(~is_correct)

correct_labels = y_test.iloc[correct_idxs]
print('Correct labels:', len(correct_labels))
print(correct_labels)

incorrect_labels = y_test.iloc[incorrect_idxs]
print('Incorrect labels:', len(incorrect_labels))
print(incorrect_labels)

### Save Model

In [None]:
# Saving is disabled by default; uncomment the line below to write the trained model to disk.
# model.save('audio_detector_model')