Imports

In [2]:
import torchaudio
import torch
import torch.nn as nn
import torch.nn.functional as F
import librosa
import os
import numpy as np
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split

Function for spliting the data into train and test set + preparing for AudioDataset class

In [3]:
def getData(audio_file_path, train = True):

    audio_waveform_list = []
    labels_list = []

    for dir in os.listdir(audio_file_path):
        for file in os.listdir(os.path.join(audio_file_path, dir)):
            if file.endswith('.wav'):
                waveform, _ = torchaudio.load(os.path.join(audio_file_path, dir, file))
                audio_waveform_list.append(waveform)
                labels_list.append(dir)


    train_waveform_list, test_waveform_list = train_test_split(audio_waveform_list, test_size=0.25, shuffle=False)
    train_labels_list, test_lables_list = train_test_split(labels_list, test_size=0.25, shuffle=False)

    if train:
        return train_waveform_list, train_labels_list
    else:
        return test_waveform_list, test_lables_list



Creating Class to load the audio files

In [4]:
class AudioDataset():
    def __init__(self, audio_file_path, train):
        self.audio_file_path = audio_file_path

        self.audio_waveform_list, self.labels_list = getData(audio_file_path, train)

    def __len__(self):
        return len(self.audio_waveform_list)
    
    def __getitem__(self, idx):
        label_encoding = {'bird': 0, 'cat': 1, 'dog': 2}
        waveform = self.audio_waveform_list[idx]
        label = self.labels_list[idx]

        return waveform, label_encoding[label]

    
def collate_fn(batch):
    waveforms, labels = zip(*batch)
    waveforms = [pad_or_truncate(w, 1600) for w in waveforms]
    waveforms = torch.stack(waveforms)
    labels = torch.tensor(labels)
    return waveforms, labels

def pad_or_truncate(waveform, target_length=16000):
    channels, samples = waveform.size()
    
    if samples > target_length:
        return waveform[:, :target_length]
    elif samples < target_length:
        pad_size = target_length - samples
        padding = torch.zeros(channels, pad_size)
        return torch.cat([waveform, padding], dim=1)
    else:
        return waveform



Loading the audio files and creating a dataset

In [None]:
audio_dataset = AudioDataset(audio_file_path='/Users/florianhaglsperger/Desktop/Coding/audioClassification/Animals', train=True)
train_laoder = DataLoader(audio_dataset, batch_size=2, shuffle=True, collate_fn=collate_fn)

Create the model

In [None]:
class AudioClassifier(nn.Module):
    def __init__(self, num_classes=3):
        super(AudioClassifier, self).__init__()

        self.conv = nn.Sequential(
            nn.Conv2d(1, 16, kernel_size=3, padding=1), nn.ReLU(), nn.MaxPool2d(2),
            nn.Conv2d(16, 32, kernel_size=3, padding=1), nn.ReLU(), nn.MaxPool2d(2)
        )

        self.fc = nn.Sequential(
            nn.Flatten(),
            nn.Linear(512, 128), nn.ReLU(),
            nn.ReLU(),
            nn.Linear(128, num_classes)
        )

    def forward(self, x):
        x = self.conv(x)
        x = self.fc(x)
        return x
    
model = AudioClassifier(num_classes=3)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.0002)

num_epochs = 50

for epoch in range(num_epochs):
    for i, (waveforms, labels) in enumerate(train_laoder):
        
        spectrograms = []
        for waveform in waveforms:
            spectrogram = librosa.feature.melspectrogram(y=waveform.numpy().squeeze(), sr=22050, n_mels=64)
            spectrogram = librosa.power_to_db(spectrogram, ref=np.max)
            spectrograms.append(torch.tensor(spectrogram).unsqueeze(0))


        spectrograms = pad_sequence(spectrograms, batch_first=True)

        outputs = model(spectrograms)
        loss = criterion(outputs, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')
        

Testing the model on unseen data

In [23]:
dataset = AudioDataset(audio_file_path='/Users/florianhaglsperger/Desktop/Coding/audioClassification/Animals', train=False)
test_loader = DataLoader(audio_dataset, batch_size=2, shuffle=False, collate_fn=collate_fn)

model.eval()

correct = 0
total = 0

with torch.no_grad():
    for i, (waveforms, labels) in enumerate(test_loader):
        spectrograms = []
        for waveform in waveforms:
            spectrogram = librosa.feature.melspectrogram(y=waveform.numpy().squeeze(), sr=22050, n_mels=64)
            spectrogram = librosa.power_to_db(spectrogram, ref=np.max)
            spectrograms.append(torch.tensor(spectrogram).unsqueeze(0))


        spectrograms = pad_sequence(spectrograms, batch_first=True)

        outputs = model(spectrograms)
        prediction = torch.argmax(outputs, dim=1)

        correct += (prediction == labels).sum().item()
        total += labels.size(0)

accuracy = correct / total
print("Test Accuracy: "+ str(accuracy * 100) + " %")
    

Test Accuracy: 99.56236323851203 %
