<a href="https://colab.research.google.com/github/Ananya-AJ/Deep-Learning/blob/main/Assignment4/k_dataaugmentation_classification_audio_augly.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install augly

In [None]:
!pip install tensorflow-gpu librosa numpy matplotlib augly
import tensorflow as tf
import librosa
import numpy as np
import matplotlib.pyplot as plt
import augly.audio as audaugs
from tensorflow import keras
import urllib.request

In [None]:
!wget https://storage.googleapis.com/download.tensorflow.org/data/speech_commands_v0.02.tar.gz

In [3]:
# Unpack the archive into the current working directory; later cells read the
# ./cat and ./dog folders from here.
!tar -xf speech_commands_v0.02.tar.gz

In [4]:
import os
import random

# The archive is unpacked without a target directory, so the per-word folders
# (dog/, cat/, ...) sit directly in the working directory. The previous value
# of data_dir was never used when building the paths.
data_dir = '.'
words = ['dog', 'cat']
num_files_per_label = 10

# Seed the RNG so the sampled subset and the final ordering are reproducible.
random.seed(42)

# Create a list of all file paths and corresponding labels
file_paths = []
labels = []
for word in words:
    word_dir = os.path.join(data_dir, word)
    # os.listdir returns entries in arbitrary order; sort first so the seeded
    # shuffle picks the same files on every run. Only audio clips are kept.
    filenames = sorted(f for f in os.listdir(word_dir) if f.endswith('.wav'))
    random.shuffle(filenames)  # Shuffle filenames within each label
    filenames = filenames[:num_files_per_label]  # Choose first num_files_per_label filenames
    for filename in filenames:
        file_paths.append(os.path.join(word_dir, filename))
        labels.append(word)

# Shuffle the file paths and labels in unison
zipped = list(zip(file_paths, labels))
random.shuffle(zipped)
file_paths, labels = zip(*zipped)

In [None]:
# define function to extract features from audio files
def extract_features(file_path, max_len=174):
    """Load an audio file and return a fixed-width MFCC matrix.

    Args:
        file_path: Path of the audio file, passed to ``librosa.load``.
        max_len: Number of frames (columns) in the returned matrix.

    Returns:
        Array of shape ``(40, max_len)``: 40 MFCCs per frame, right-padded
        with zeros when the clip is shorter than ``max_len`` frames and
        truncated when it is longer.
    """
    # load audio file
    audio_file, sr = librosa.load(file_path)
    # extract Mel-frequency cepstral coefficients (MFCCs) from the audio signal
    mfccs = librosa.feature.mfcc(y=audio_file, sr=sr, n_mfcc=40)

    n_frames = mfccs.shape[1]
    if n_frames >= max_len:
        # Longer clips previously produced a negative pad width, which makes
        # np.pad raise; cut them down to the target width instead.
        return mfccs[:, :max_len]
    return np.pad(mfccs, pad_width=((0, 0), (0, max_len - n_frames)), mode='constant')


In [None]:
import os

cat_folder = './cat'
dog_folder = './dog'
files_per_class = 10


def load_class_features(folder, limit):
    """Return the stacked MFCC features of the first `limit` .wav files in `folder`.

    File names are sorted before slicing because os.listdir returns entries in
    arbitrary order, which would make the selected subset differ between runs.
    Non-.wav entries are skipped so they are never handed to the audio loader.
    """
    wav_names = sorted(name for name in os.listdir(folder) if name.endswith('.wav'))
    return np.array([
        extract_features(os.path.join(folder, name)) for name in wav_names[:limit]
    ])


cat_data = load_class_features(cat_folder, files_per_class)
dog_data = load_class_features(dog_folder, files_per_class)

# create labels for the data: 0 = cat, 1 = dog
cat_labels = np.zeros(len(cat_data))
dog_labels = np.ones(len(dog_data))

# concatenate the data and labels
X = np.concatenate((cat_data, dog_data), axis=0)
y = np.concatenate((cat_labels, dog_labels), axis=0)

# reshape data for input to neural network (adds the trailing channel axis)
X = X.reshape(X.shape[0], 40, 174, 1)


In [None]:
from sklearn.model_selection import train_test_split
# split the data into training and testing sets; stratify on the labels so the
# very small test split keeps the same cat/dog ratio as the full data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


In [None]:
from keras.models import Sequential
from keras.layers import Conv2D, MaxPooling2D, Flatten, Dense

# Baseline CNN: three conv/max-pool stages (32, 64, 128 filters) followed by a
# 512-unit dense layer and a single sigmoid output for the binary label.
model = Sequential([
    Conv2D(32, (3, 3), activation='relu', input_shape=(40, 174, 1)),
    MaxPooling2D((2, 2)),
    Conv2D(64, (3, 3), activation='relu'),
    MaxPooling2D((2, 2)),
    Conv2D(128, (3, 3), activation='relu'),
    MaxPooling2D((2, 2)),
    Flatten(),
    Dense(512, activation='relu'),
    Dense(1, activation='sigmoid'),
])

# Binary cross-entropy matches the sigmoid output; accuracy is tracked as the metric.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)


In [None]:
# Fit for 10 epochs with mini-batches of 16; the held-out split is passed as
# validation data so its metrics are reported after every epoch.
model.fit(
    X_train,
    y_train,
    epochs=10,
    batch_size=16,
    validation_data=(X_test, y_test),
)

In [None]:
# Score the trained network on the held-out split and report its accuracy.
test_loss, test_acc = model.evaluate(x=X_test, y=y_test)
print(f'Test accuracy: {test_acc}')

Test accuracy: 0.5


With audio augmentation using AugLy

In [None]:
!pip install augly[audio]

Apply pitch-shift augmentation to the cat and dog audio files, which are loaded at a sample rate of 16,000 Hz.

In [16]:
import os
import librosa
import augly.audio as audaugs

import librosa

def augment(file_path, sample_rate=16000, n_steps=10):
    """Load an audio file and return a pitch-shifted copy of its samples.

    Args:
        file_path: Path of the audio file to load.
        sample_rate: Rate the file is resampled to on load. It is also
            forwarded to the augmentation so AugLy works with the rate the
            samples actually have instead of falling back to its own default.
        n_steps: Size of the pitch shift, passed through to
            ``audaugs.pitch_shift``.

    Returns:
        The augmented samples; the sample rate returned by AugLy is discarded.
    """
    audio_data, _ = librosa.load(file_path, sr=sample_rate)
    aug_audio_data, _ = audaugs.pitch_shift(
        audio_data, sample_rate=sample_rate, n_steps=n_steps
    )
    return aug_audio_data

# Gather the path of every .wav clip in each class folder.
cat_files = [
    os.path.join("./cat", name)
    for name in os.listdir("./cat")
    if name.endswith(".wav")
]
dog_files = [
    os.path.join("./dog", name)
    for name in os.listdir("./dog")
    if name.endswith(".wav")
]

# Pitch-shift every clip, cats first and then dogs, loading each at 16 kHz.
cat_files_aug = []
for wav_path in cat_files:
    cat_files_aug.append(augment(wav_path, sample_rate=16000))

dog_files_aug = []
for wav_path in dog_files:
    dog_files_aug.append(augment(wav_path, sample_rate=16000))

In [17]:
# Sanity check: number of augmented dog clips.
augmented_dog_count = len(dog_files_aug)
print(augmented_dog_count)

2128


Extract log-scaled Mel spectrograms for the augmented audio clips. Spectrograms capture both temporal and frequency information from the raw audio signal. The function returns a list of NumPy arrays, one per clip.

In [18]:
def extract_mel_spectrograms(audio_data, sr=16000, n_fft=2048, hop_length=512, n_mels=128):
    """Convert each audio signal into a decibel-scaled Mel spectrogram.

    Args:
        audio_data: Iterable of audio sample arrays.
        sr, n_fft, hop_length, n_mels: Forwarded unchanged to
            ``librosa.feature.melspectrogram``.

    Returns:
        A list with one array per input signal, in input order. Each array is
        the Mel spectrogram converted with ``librosa.power_to_db`` using the
        spectrogram's own maximum as the reference.
    """
    def to_log_mel(signal):
        power = librosa.feature.melspectrogram(
            y=signal, sr=sr, n_fft=n_fft, hop_length=hop_length, n_mels=n_mels
        )
        return librosa.power_to_db(power, ref=np.max)

    return [to_log_mel(signal) for signal in audio_data]


In [19]:
# Log-Mel features per class (cats first, then dogs), using the extractor's default settings.
cat_files_aug_mel, dog_files_aug_mel = (
    extract_mel_spectrograms(clips) for clips in (cat_files_aug, dog_files_aug)
)

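Pad the spectrograms to a common number of time steps, then combine them and build the labels. The trailing channel dimension (the 4th axis) is added in the next cell.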

In [20]:
def _pad_time_axis(spec, target_steps):
    """Right-pad a 2-D dB spectrogram along its time axis to `target_steps` frames.

    The spectrograms are in decibels relative to their own maximum, so 0 is
    the loudest possible value. Padding with zeros would therefore append
    frames at peak level; the spectrogram's minimum is used instead so the
    padding looks like silence.
    """
    missing = target_steps - spec.shape[1]
    return np.pad(spec, ((0, 0), (0, missing)), mode='constant', constant_values=spec.min())


# Longest clip across both classes. Each collection is iterated separately
# (rather than joined with `+`) so the cell still works when re-run after the
# lists below have been replaced by arrays.
max_time_steps = max(
    m.shape[1] for mels in (cat_files_aug_mel, dog_files_aug_mel) for m in mels
)

# Pad each Mel spectrogram to the same number of time steps
cat_files_aug_mel = np.array([_pad_time_axis(m, max_time_steps) for m in cat_files_aug_mel])
dog_files_aug_mel = np.array([_pad_time_axis(m, max_time_steps) for m in dog_files_aug_mel])

# Combine cat and dog Mel spectrograms and create labels (0 = cat, 1 = dog)
X = np.concatenate((cat_files_aug_mel, dog_files_aug_mel), axis=0)
y = np.concatenate((np.zeros(len(cat_files_aug_mel)), np.ones(len(dog_files_aug_mel))), axis=0)


In [21]:
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras import layers, models
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras import layers, models

# Add the trailing channel dimension expected by Conv2D. The guard keeps the
# cell idempotent: without it, re-running the cell would append another axis
# to the already 4-D array and the model would reject the input.
if X.ndim == 3:
    X = np.expand_dims(X, axis=-1)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the model: three conv/max-pool stages, then a 64-unit dense layer and
# a single sigmoid output for the binary cat/dog label
model = models.Sequential()
model.add(layers.Conv2D(32, (3, 3), activation='relu', input_shape=X_train.shape[1:]))
model.add(layers.MaxPooling2D((2, 2)))
model.add(layers.Conv2D(64, (3, 3), activation='relu'))
model.add(layers.MaxPooling2D((2, 2)))
model.add(layers.Conv2D(128, (3, 3), activation='relu'))
model.add(layers.MaxPooling2D((2, 2)))
model.add(layers.Flatten())
model.add(layers.Dense(64, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model; the held-out split is also used for per-epoch validation metrics
history = model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))

# Evaluate the model on the testing set
test_loss, test_acc = model.evaluate(X_test, y_test)

# Print the test accuracy
print('Test accuracy:', test_acc)



Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test accuracy: 0.942307710647583


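Test accuracy rose from 50% in the first experiment to 94% in the experiment on pitch-shifted audio, and the loss dropped substantially too. Note, however, that the two runs differ in more than the augmentation: the baseline was trained on MFCC features from only 10 files per class, whereas the second run used log-Mel spectrograms of every .wav file in the cat and dog folders (2,128 dog clips alone). The improvement therefore cannot be attributed to pitch shifting alone.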