Clone the ESC-50 dataset

In [None]:
!git clone https://github.com/karolpiczak/ESC-50.git

fatal: destination path 'ESC-50' already exists and is not an empty directory.


# Data loader for the ESC-50 dataset

In [None]:
import os
import pandas as pd
import torchaudio
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
import os
import pandas as pd
import torchaudio
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split


# This version of the class takes a `duration` setting (clips are padded or truncated to it)
# and an optional `transform`, which can be used for data augmentation.
# It returns the audio waveform at a fixed sample rate of 22050 Hz.
class ESC50Dataset(Dataset):
    """Fixed-length waveform dataset driven by an ESC-50 style metadata CSV.

    Each item is a ``(waveform, label)`` pair. The waveform is resampled to
    ``TARGET_SR`` and then zero-padded or truncated along its last dimension
    to exactly ``int(duration * TARGET_SR)`` samples.

    Args:
        path: Directory that contains the audio files.
        meta_file: CSV file describing the clips. When it has ``filename`` and
            ``target`` columns (as the ESC-50 metadata does) they are used by
            name; otherwise the first two columns are used positionally.
        duration: Length, in seconds, every clip is padded/truncated to.
        transform: Optional callable applied to the fixed-length waveform.
    """

    # Every clip is brought to this rate so `duration` maps to a fixed sample count.
    TARGET_SR = 22050

    def __init__(self, path, meta_file, duration=8.0, transform=None):
        self.path = path
        self.meta = pd.read_csv(meta_file)
        self.duration = duration
        self.transform = transform

    def __len__(self):
        """Return the number of rows in the metadata table."""
        return len(self.meta)

    def _filename_and_label(self, idx):
        """Return ``(filename, label)`` for metadata row ``idx``."""
        columns = self.meta.columns
        if "filename" in columns:
            filename = self.meta["filename"].iloc[idx]
        else:
            filename = self.meta.iloc[idx, 0]
        if "target" in columns:
            # In the ESC-50 CSV the second column is the cross-validation fold,
            # not the class id, so the label has to be read by name.
            label = int(self.meta["target"].iloc[idx])
        else:
            # Fallback for CSVs without a "target" column: second column.
            label = self.meta.iloc[idx, 1]
        return filename, label

    def __getitem__(self, idx):
        """Load clip ``idx`` and return ``(waveform, label)``."""
        filename, label = self._filename_and_label(idx)
        audio_path = os.path.join(self.path, filename)

        waveform, sr = torchaudio.load(audio_path)
        if sr != self.TARGET_SR:
            waveform = torchaudio.transforms.Resample(sr, self.TARGET_SR)(waveform)
            sr = self.TARGET_SR

        # Force a fixed number of samples: right-pad short clips with zeros,
        # cut long clips.
        length = int(self.duration * sr)
        n_samples = waveform.shape[1]
        if n_samples < length:
            waveform = torch.nn.functional.pad(waveform, (0, length - n_samples))
        else:
            waveform = waveform[:, :length]

        if self.transform:
            waveform = self.transform(waveform)
        return waveform, label


# Preprocessing

In [None]:
# We use the STFT as our input representation:
# it turns the 1-dimensional time-domain waveform into a 2-dimensional time-frequency representation.
class STFTDataset(ESC50Dataset):
    """ESC50Dataset variant that returns STFT magnitudes instead of waveforms.

    Args:
        path, meta_file, duration, transform: Forwarded to ``ESC50Dataset``.
        n_fft: FFT size passed to ``torch.stft``.
        hop_length: Hop between successive frames, in samples.
        win_length: Analysis window length, in samples.
    """

    def __init__(self, path, meta_file, duration=8.0, transform=None,
                 n_fft=512, hop_length=256, win_length=512):
        super().__init__(path, meta_file, duration, transform)
        self.n_fft = n_fft
        self.hop_length = hop_length
        self.win_length = win_length
        # Without an explicit window torch.stft falls back to a rectangular
        # one, which leaks energy across frequency bins; use a Hann window.
        self.window = torch.hann_window(win_length)

    def __getitem__(self, idx):
        """Return ``(stft_magnitude, label)`` for clip ``idx``.

        The magnitude has its last two dimensions swapped relative to the
        ``torch.stft`` output, i.e. time frames come before frequency bins.
        """
        waveform, label = super().__getitem__(idx)
        stft = torch.stft(
            waveform,
            n_fft=self.n_fft,
            hop_length=self.hop_length,
            win_length=self.win_length,
            # Match the waveform's dtype/device in case a transform changed them.
            window=self.window.to(waveform),
            return_complex=True,
        )
        magnitude = torch.abs(stft)  # drop the phase, keep the magnitude
        magnitude = magnitude.transpose(1, 2)  # swap time and frequency dimensions
        return magnitude, label

# Train and validation split

In [None]:
# Split the data into a training and a validation set.
import random

from torch.utils.data import Subset

root = "ESC-50/audio/"
meta_file = "ESC-50/meta/esc50.csv"
dataset = STFTDataset(root, meta_file)

# Fraction of each split that is actually used; keep it small to train on a local machine.
fraction = 0.05
SPLIT_SEED = 42

# Split row indices rather than the dataset itself: train_test_split indexes
# whatever it is given, so passing the dataset would load and transform every
# clip up front, only for most of them to be discarded below.
all_indices = list(range(len(dataset)))
train_idx, valid_idx = train_test_split(
    all_indices, test_size=0.2, random_state=SPLIT_SEED, stratify=dataset.meta["target"]
)

# Limit the dataset size (seeded, so the same subset is drawn on every run).
rng = random.Random(SPLIT_SEED)
train_idx = rng.sample(train_idx, max(1, int(len(train_idx) * fraction)))
valid_idx = rng.sample(valid_idx, max(1, int(len(valid_idx) * fraction)))

# Subset loads items lazily, one clip at a time, when the DataLoader asks for them.
train_data = Subset(dataset, train_idx)
valid_data = Subset(dataset, valid_idx)

train_loader = DataLoader(train_data, batch_size=8, shuffle=True, num_workers=4)
valid_loader = DataLoader(valid_data, batch_size=8, shuffle=False, num_workers=4)




# Model: build up our HarmonicCNN

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import math

class Conv_2d(nn.Module):
    """3x3 convolution -> batch norm -> ReLU, optionally followed by max pooling.

    Args:
        in_channels: Number of input feature maps.
        out_channels: Number of output feature maps.
        pooling: Kernel size handed to ``nn.MaxPool2d``; ``None`` disables pooling.
    """

    def __init__(self, in_channels, out_channels, pooling=None):
        super().__init__()
        # Padding of 1 with a 3x3 kernel and stride 1 keeps the spatial size.
        self.conv = nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1)
        self.bn = nn.BatchNorm2d(out_channels)
        self.pool = nn.MaxPool2d(pooling) if pooling is not None else None

    def forward(self, x):
        """Apply the block to ``x`` and return the resulting feature maps."""
        activated = F.relu(self.bn(self.conv(x)))
        if self.pool is None:
            return activated
        return self.pool(activated)

class Res_2d(nn.Module):
    """Residual block made of two 3x3 conv + batch-norm stages.

    When the channel count changes, the shortcut branch goes through a 1x1
    convolution so that it can be added to the main branch.

    Args:
        in_channels: Number of input feature maps.
        out_channels: Number of output feature maps.
    """

    def __init__(self, in_channels, out_channels):
        super().__init__()
        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1)
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=1, padding=1)
        self.bn2 = nn.BatchNorm2d(out_channels)

        # Identity shortcut when shapes already agree, 1x1 projection otherwise.
        self.conv_skip = None
        if in_channels != out_channels:
            self.conv_skip = nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=1, padding=0)

    def forward(self, x):
        """Return ``relu(main_branch(x) + shortcut(x))``."""
        shortcut = x if self.conv_skip is None else self.conv_skip(x)

        out = F.relu(self.bn1(self.conv1(x)))
        out = self.bn2(self.conv2(out))

        return F.relu(out + shortcut)

class HarmonicCNN2D(nn.Module):
    """2-D CNN classifier operating on single-channel spectrogram-like inputs.

    The network is a convolutional front end (``hstft``) followed by a stack
    of ``Conv_2d`` / ``Res_2d`` blocks, global max pooling and a two-layer
    dense head.

    Args:
        n_channels: Width of the first CNN stage; the second stage uses twice
            as many channels.
        n_class: Number of output classes.
        n_harmonic: Number of feature maps produced by the front end.
        sample_rate, n_fft, f_min, f_max, n_mels, semitone_scale, learn_bw:
            Accepted for signature compatibility; not used by this
            implementation.
    """

    def __init__(self,
                 n_channels=128,
                 sample_rate=16000,
                 n_fft=512,
                 f_min=0.0,
                 f_max=8000.0,
                 n_mels=128,
                 n_class=50,
                 n_harmonic=6,
                 semitone_scale=2,
                 learn_bw='only_Q'):
        super(HarmonicCNN2D, self).__init__()

        # Front end: four size-preserving conv/BN/ReLU stages (7x7, 5x5, 3x3, 3x3).
        self.hstft = nn.Sequential(
            nn.Conv2d(1, n_harmonic, (7, 7), stride=1, padding=(3, 3)),
            nn.BatchNorm2d(n_harmonic),
            nn.ReLU(),
            nn.Conv2d(n_harmonic, n_harmonic, (5, 5), stride=1, padding=(2, 2)),
            nn.BatchNorm2d(n_harmonic),
            nn.ReLU(),
            nn.Conv2d(n_harmonic, n_harmonic, (3, 3), stride=1, padding=(1, 1)),
            nn.BatchNorm2d(n_harmonic),
            nn.ReLU(),
            nn.Conv2d(n_harmonic, n_harmonic, (3, 3), stride=1, padding=(1, 1)),
            nn.BatchNorm2d(n_harmonic),
            nn.ReLU()
        )

        # CNN
        self.layer1 = Conv_2d(n_harmonic, n_channels, pooling=2)
        self.layer2 = Res_2d(n_channels, n_channels)
        self.layer3 = Res_2d(n_channels, n_channels)
        self.layer4 = Res_2d(n_channels, n_channels)
        self.layer5 = Conv_2d(n_channels, n_channels * 2, pooling=(2, 3))
        self.layer6 = Res_2d(n_channels * 2, n_channels * 2)
        self.layer7 = Res_2d(n_channels * 2, n_channels * 2)

        # Dense
        self.dense1 = nn.Linear(n_channels * 2, n_channels * 2)
        self.bn = nn.BatchNorm1d(n_channels * 2)
        self.dense2 = nn.Linear(n_channels * 2, n_class)
        self.dropout = nn.Dropout(0.5)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Return unnormalised class scores (logits) of shape ``(batch, n_class)``.

        ``x`` must be a 4-D tensor with a single channel, i.e.
        ``(batch, 1, H, W)``, as required by the first ``nn.Conv2d(1, ...)``.
        """
        # Front end
        x = self.hstft(x)

        # CNN
        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)
        x = self.layer5(x)
        x = self.layer6(x)
        x = self.layer7(x)

        # Global max pooling over both spatial dimensions -> (batch, channels).
        # No squeeze beforehand: dropping a size-1 spatial dimension would turn
        # the batch into a 3-D tensor that the pooling treats as one unbatched
        # image, breaking the dense head.
        x = F.adaptive_max_pool2d(x, (1, 1))
        x = torch.flatten(x, 1)

        # Dense
        x = self.dense1(x)
        x = self.bn(x)
        x = self.relu(x)
        x = self.dropout(x)

        # Return raw logits. nn.CrossEntropyLoss applies log-softmax itself;
        # squashing the scores through a sigmoid first confines them to (0, 1)
        # and prevents the loss from ever approaching zero. argmax-based
        # predictions are unaffected because the sigmoid is monotonic.
        return self.dense2(x)


# Define the training and validation functions

In [None]:
from tqdm import tqdm

def train(model, loader, criterion, optimizer, device):
    """Run one training epoch.

    Args:
        model: Network to optimise; it is switched to training mode.
        loader: Iterable yielding ``(inputs, targets)`` batches.
        criterion: Loss function called as ``criterion(outputs, targets)``.
        optimizer: Optimizer stepping the model's parameters.
        device: Device each batch is moved to before the forward pass.

    Returns:
        ``(mean_batch_loss, accuracy_percent)`` over the epoch, or
        ``(nan, nan)`` if the loader produced no batches.
    """
    model.train()
    running_loss = 0.0
    correct = 0
    total = 0
    num_batches = 0

    progress_bar = tqdm(loader, desc="Training", ncols=100)

    for inputs, targets in progress_bar:
        inputs, targets = inputs.to(device), targets.to(device)

        optimizer.zero_grad()

        outputs = model(inputs)
        loss = criterion(outputs, targets)
        loss.backward()

        optimizer.step()

        running_loss += loss.item()
        num_batches += 1
        predicted = outputs.detach().argmax(dim=1)
        total += targets.size(0)
        correct += (predicted == targets).sum().item()

        # Average over the batches actually seen. Deriving the batch count from
        # `total // loader.batch_size` is zero for a short first batch and
        # under-counts once a partial batch has been processed.
        progress_bar.set_postfix({"Loss": running_loss / num_batches, "Accuracy": 100 * correct / total})

    progress_bar.close()

    if num_batches == 0:
        return float("nan"), float("nan")

    accuracy = 100 * correct / total
    return running_loss / num_batches, accuracy


In [None]:
def validate(model, loader, criterion, device):
    """Evaluate the model on a data loader without updating its parameters.

    Args:
        model: Network to evaluate; it is switched to evaluation mode.
        loader: Iterable yielding ``(inputs, targets)`` batches.
        criterion: Loss function called as ``criterion(outputs, targets)``.
        device: Device each batch is moved to before the forward pass.

    Returns:
        ``(mean_batch_loss, accuracy_percent)`` over the loader, or
        ``(nan, nan)`` if the loader produced no batches.
    """
    model.eval()
    running_loss = 0.0
    correct = 0
    total = 0
    num_batches = 0

    progress_bar = tqdm(loader, desc="Validation", ncols=100)

    with torch.no_grad():
        for inputs, targets in progress_bar:
            inputs, targets = inputs.to(device), targets.to(device)

            outputs = model(inputs)
            loss = criterion(outputs, targets)

            running_loss += loss.item()
            num_batches += 1
            predicted = outputs.argmax(dim=1)
            total += targets.size(0)
            correct += (predicted == targets).sum().item()

            # Average over the batches actually seen, so the displayed loss
            # matches the returned one even when the last batch is partial.
            progress_bar.set_postfix({"Loss": running_loss / num_batches, "Accuracy": 100 * correct / total})

    progress_bar.close()

    if num_batches == 0:
        return float("nan"), float("nan")

    accuracy = 100 * correct / total
    return running_loss / num_batches, accuracy


# Start training

In [None]:
from torch.optim import Adam

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# train()/validate() move every batch to `device`, so the model's parameters
# must live there too; otherwise the forward pass fails on a CUDA machine.
model = HarmonicCNN2D().to(device)
criterion = nn.CrossEntropyLoss()
optimizer = Adam(model.parameters(), lr=0.001)

num_epochs = 50
for epoch in range(1, num_epochs + 1):
    train_loss, train_accuracy = train(model, train_loader, criterion, optimizer, device)
    valid_loss, valid_accuracy = validate(model, valid_loader, criterion, device)

    print(f'Epoch: {epoch}/{num_epochs}, Train Loss: {train_loss:.4f}, Train Acc: {train_accuracy:.2f}%, Valid Loss: {valid_loss:.4f}, Valid Acc: {valid_accuracy:.2f}%')


Training: 100%|███████████████████████████| 10/10 [16:08<00:00, 96.87s/it, Loss=3.81, Accuracy=12.5]
Validation: 100%|█████████████████████████████| 3/3 [01:14<00:00, 24.67s/it, Loss=5.47, Accuracy=25]


Epoch: 1/50, Train Loss: 3.8132, Train Acc: 12.50%, Valid Loss: 3.6457, Valid Acc: 25.00%


Training: 100%|█████████████████████████████| 10/10 [15:52<00:00, 95.26s/it, Loss=3.58, Accuracy=25]
Validation: 100%|█████████████████████████████| 3/3 [01:17<00:00, 25.68s/it, Loss=5.11, Accuracy=20]


Epoch: 2/50, Train Loss: 3.5814, Train Acc: 25.00%, Valid Loss: 3.4066, Valid Acc: 20.00%


Training: 100%|█████████████████████████████| 10/10 [15:38<00:00, 93.87s/it, Loss=3.42, Accuracy=15]
Validation: 100%|█████████████████████████████| 3/3 [01:17<00:00, 25.88s/it, Loss=4.93, Accuracy=15]


Epoch: 3/50, Train Loss: 3.4213, Train Acc: 15.00%, Valid Loss: 3.2840, Valid Acc: 15.00%


Training: 100%|███████████████████████████| 10/10 [15:32<00:00, 93.30s/it, Loss=3.31, Accuracy=17.5]
Validation: 100%|██████████████████████████████| 3/3 [01:15<00:00, 25.23s/it, Loss=4.86, Accuracy=5]


Epoch: 4/50, Train Loss: 3.3059, Train Acc: 17.50%, Valid Loss: 3.2395, Valid Acc: 5.00%


Training: 100%|█████████████████████████████| 10/10 [15:39<00:00, 93.91s/it, Loss=3.31, Accuracy=15]
Validation: 100%|█████████████████████████████| 3/3 [01:17<00:00, 25.85s/it, Loss=4.81, Accuracy=10]


Epoch: 5/50, Train Loss: 3.3071, Train Acc: 15.00%, Valid Loss: 3.2065, Valid Acc: 10.00%


Training: 100%|███████████████████████████| 10/10 [16:02<00:00, 96.25s/it, Loss=3.24, Accuracy=18.8]
Validation: 100%|█████████████████████████████| 3/3 [01:17<00:00, 25.91s/it, Loss=4.76, Accuracy=20]


Epoch: 6/50, Train Loss: 3.2425, Train Acc: 18.75%, Valid Loss: 3.1720, Valid Acc: 20.00%


Training: 100%|███████████████████████████| 10/10 [15:35<00:00, 93.52s/it, Loss=3.19, Accuracy=26.2]
Validation: 100%|█████████████████████████████| 3/3 [01:17<00:00, 25.72s/it, Loss=4.75, Accuracy=25]


Epoch: 7/50, Train Loss: 3.1891, Train Acc: 26.25%, Valid Loss: 3.1688, Valid Acc: 25.00%


Training: 100%|███████████████████████████| 10/10 [15:39<00:00, 93.95s/it, Loss=3.18, Accuracy=27.5]
Validation: 100%|█████████████████████████████| 3/3 [01:17<00:00, 25.67s/it, Loss=4.71, Accuracy=25]


Epoch: 8/50, Train Loss: 3.1776, Train Acc: 27.50%, Valid Loss: 3.1414, Valid Acc: 25.00%


Training:  20%|█████▍                     | 2/10 [04:46<19:05, 143.23s/it, Loss=3.19, Accuracy=37.5]


KeyboardInterrupt: ignored