Imports

In [36]:
import torchaudio
import torch
import torch.nn as nn
import torch.nn.functional as F
import librosa
import os
import numpy as np
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader

Creating Class to load the audio files

In [37]:
class AudioDataset():
    def __init__(self, audio_file_path):
        self.audio_file_path = audio_file_path

        self.audio_waveform_list = []
        self.labels_list = []

        for dir in os.listdir(audio_file_path):
            for file in os.listdir(os.path.join(audio_file_path, dir)):
                if file.endswith('.wav'):
                    waveform, _ = torchaudio.load(os.path.join(audio_file_path, dir, file))
                    self.audio_waveform_list.append(waveform)
                    self.labels_list.append(dir)


    def __len__(self):
        return len(self.audio_waveform_list)
    
    def __getitem__(self, idx):
        label_encoding = {'bird': 0, 'cat': 1, 'dog': 2}
        waveform = self.audio_waveform_list[idx]
        label = self.labels_list[idx]

        return waveform, label_encoding[label]

    
def collate_fn(batch):
    waveforms, labels = zip(*batch)
    waveforms = pad_sequence(waveforms, batch_first=True)
    labels = torch.tensor(labels)
    return waveforms, labels


Loading the audio files and creating a dataset

In [38]:
audio_dataset = AudioDataset(audio_file_path='/Users/florianhaglsperger/Desktop/Coding/audioClassification/Animals')
train_laoder = DataLoader(audio_dataset, batch_size=2, shuffle=True, collate_fn=collate_fn)

for batch in train_laoder:
    waveforms, labels = batch
    print(waveforms.shape)
    print(labels)
    break

torch.Size([2, 1, 16000])
tensor([1, 0])




Create the model

In [None]:
class AudioClassifier(nn.Module):
    def __init__(self, num_classes=3):
        super(AudioClassifier, self).__init__()

        self.conv = nn.Sequential(
            nn.Conv2d(1, 16, kernel_size=3, padding=1), nn.ReLU(), nn.MaxPool2d(2),
            nn.Conv2d(16, 32, kernel_size=3, padding=1), nn.ReLU(), nn.MaxPool2d(2)
        )

        self.fc = nn.Sequential(
            nn.Flatten(),
            nn.Linear(64 * 32 * 2, 128), nn.ReLU(),
            nn.ReLU(),
            nn.Linear(128, num_classes)
        )

    def forward(self, x):
        x = self.conv(x)
        x = self.fc(x)
        return x
    
model = AudioClassifier(num_classes=3)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

num_epochs = 10

for epoch in range(num_epochs):
    for i, (waveforms, labels) in enumerate(train_laoder):

        max_frames = 0
        # Convert waveforms to spectrograms
        spectrograms = []
        for waveform in waveforms:
            spectrogram = librosa.feature.melspectrogram(y=waveform.numpy().squeeze(), sr=22050, n_mels=64)
            spectrogram = librosa.power_to_db(spectrogram, ref=np.max)
            spectrograms.append(torch.tensor(spectrogram).unsqueeze(0))

        spectrograms = pad_sequence(spectrograms, batch_first=True)

        padded_specs = []
        for spec in spectrograms:
            pad_width = max_frames - spec.shape[1]
            if pad_width > 0:
                spec = torch.nn.functional.pad(spec, (0, pad_width))  # (left,right)
            padded_specs.append(spec.unsqueeze(0))

        spectrograms = torch.stack(padded_specs)

        print(spectrogram.shape)
        print(labels.shape)

        outputs = model(spectrograms)
        loss = criterion(outputs, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')
        

(64, 32)
torch.Size([2])
(64, 32)
torch.Size([2])
(64, 32)
torch.Size([2])
(64, 32)
torch.Size([2])
(64, 32)
torch.Size([2])
(64, 32)
torch.Size([2])


RuntimeError: The size of tensor a (14118) must match the size of tensor b (9558) at non-singleton dimension 1