https://towardsdatascience.com/audio-deep-learning-made-simple-sound-classification-step-by-step-cebc936bbe5

In [2]:
import torch
import random
import torchaudio
from torchaudio import transforms
import os
from torch.utils.data import DataLoader, Dataset
from AudioUtil import AudioUtil

# ----------------------------
# Sound Dataset
# ----------------------------
class SoundDS(Dataset):
  def __init__(self, data_path):
    self.data_path = str(data_path)
    self.files = [f for f in os.listdir(data_path) if f.endswith('.wav')]
    self.duration = 2000
    self.sr = 44100
    self.channel = 2
    self.shift_pct = 0.4
            
  # ----------------------------
  # Number of items in dataset
  # ----------------------------
  def __len__(self):
    return len(self.files)    
    
  # ----------------------------
  # Get i'th item in dataset
  # ----------------------------
  def __getitem__(self, idx):
    # Absolute file path of the audio file - concatenate the audio directory with
    # the relative path
    audio_file = self.files[idx]
    # Get the Class ID
    class_id = 1 if audio_file.split('_')[0] == 'knock' else 0

    aud = AudioUtil.open(os.path.join(self.data_path, audio_file))
    # Some sounds have a higher sample rate, or fewer channels compared to the
    # majority. So make all sounds have the same number of channels and same 
    # sample rate. Unless the sample rate is the same, the pad_trunc will still
    # result in arrays of different lengths, even though the sound duration is
    # the same.
    reaud = AudioUtil.resample(aud, self.sr)
    rechan = AudioUtil.rechannel(reaud, self.channel)

    dur_aud = AudioUtil.pad_trunc(rechan, self.duration)
    shift_aud = AudioUtil.time_shift(dur_aud, self.shift_pct)
    sgram = AudioUtil.spectro_gram(shift_aud, n_mels=64, n_fft=1024, hop_len=None)
    aug_sgram = AudioUtil.spectro_augment(sgram, max_mask_pct=0.1, n_freq_masks=2, n_time_masks=2)

    return aug_sgram, class_id

In [3]:
train_data = SoundDS('data/train')
test_data = SoundDS('data/test')
valid_data = SoundDS('data/validation')

train_dl = DataLoader(train_data, batch_size=16, shuffle=True)
test_dl = DataLoader(test_data, batch_size=16, shuffle=True)
valid_dl = DataLoader(valid_data, batch_size=16, shuffle=True)

In [4]:
import torch.nn.functional as F
from torch.nn import init
import torch.nn as nn

# ----------------------------
# Audio Classification Model
# ----------------------------
class AudioClassifier (nn.Module):
    # ----------------------------
    # Build the model architecture
    # ----------------------------
    def __init__(self):
        super().__init__()
        conv_layers = []

        # First Convolution Block with Relu and Batch Norm. Use Kaiming Initialization
        self.conv1 = nn.Conv2d(2, 8, kernel_size=(5, 5), stride=(2, 2), padding=(2, 2))
        self.relu1 = nn.ReLU()
        self.bn1 = nn.BatchNorm2d(8)
        init.kaiming_normal_(self.conv1.weight, a=0.1)
        self.conv1.bias.data.zero_()
        conv_layers += [self.conv1, self.relu1, self.bn1]

        # Second Convolution Block
        self.conv2 = nn.Conv2d(8, 16, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
        self.relu2 = nn.ReLU()
        self.bn2 = nn.BatchNorm2d(16)
        init.kaiming_normal_(self.conv2.weight, a=0.1)
        self.conv2.bias.data.zero_()
        conv_layers += [self.conv2, self.relu2, self.bn2]

        # Second Convolution Block
        self.conv3 = nn.Conv2d(16, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
        self.relu3 = nn.ReLU()
        self.bn3 = nn.BatchNorm2d(32)
        init.kaiming_normal_(self.conv3.weight, a=0.1)
        self.conv3.bias.data.zero_()
        conv_layers += [self.conv3, self.relu3, self.bn3]

        # Second Convolution Block
        self.conv4 = nn.Conv2d(32, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
        self.relu4 = nn.ReLU()
        self.bn4 = nn.BatchNorm2d(64)
        init.kaiming_normal_(self.conv4.weight, a=0.1)
        self.conv4.bias.data.zero_()
        conv_layers += [self.conv4, self.relu4, self.bn4]

        # Linear Classifier
        self.ap = nn.AdaptiveAvgPool2d(output_size=1)
        self.lin = nn.Linear(in_features=64, out_features=10)

        # Wrap the Convolutional Blocks
        self.conv = nn.Sequential(*conv_layers)
 
    # ----------------------------
    # Forward pass computations
    # ----------------------------
    def forward(self, x):
        # Run the convolutional blocks
        x = self.conv(x)

        # Adaptive pool and flatten for input to linear layer
        x = self.ap(x)
        x = x.view(x.shape[0], -1)

        # Linear layer
        x = self.lin(x)

        # Final output
        return x

# Create the model and put it on the GPU if available
myModel = AudioClassifier()
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
myModel = myModel.to(device)
# Check that it is on Cuda
next(myModel.parameters()).device

device(type='cuda', index=0)

In [5]:
print(torch.__version__)

2.3.0+cu118


In [6]:
import ipywidgets as widgets
widgets.IntSlider()

IntSlider(value=0)

In [7]:
# ----------------------------
# Training Loop
# ----------------------------
def training(model, train_dl, num_epochs):
  # Loss Function, Optimizer and Scheduler
  criterion = nn.CrossEntropyLoss()
  optimizer = torch.optim.Adam(model.parameters(),lr=0.001)
  scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer, max_lr=0.001,
                                                steps_per_epoch=int(len(train_dl)),
                                                epochs=num_epochs,
                                                anneal_strategy='linear')

  # Repeat for each epoch
  for epoch in range(num_epochs):
    running_loss = 0.0
    correct_prediction = 0
    total_prediction = 0

    # Repeat for each batch in the training set
    for i, data in enumerate(train_dl):
        # Get the input features and target labels, and put them on the GPU
        inputs, labels = data[0].to(device), data[1].to(device)

        # Normalize the inputs
        inputs_m, inputs_s = inputs.mean(), inputs.std()
        inputs = (inputs - inputs_m) / inputs_s

        # Zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        scheduler.step()

        # Keep stats for Loss and Accuracy
        running_loss += loss.item()

        # Get the predicted class with the highest score
        _, prediction = torch.max(outputs,1)
        # Count of predictions that matched the target label
        correct_prediction += (prediction == labels).sum().item()
        total_prediction += prediction.shape[0]

        #if i % 10 == 0:    # print every 10 mini-batches
        #    print('[%d, %5d] loss: %.3f' % (epoch + 1, i + 1, running_loss / 10))
    
    # Print stats at the end of the epoch
    num_batches = len(train_dl)
    avg_loss = running_loss / num_batches
    acc = correct_prediction/total_prediction
    print(f'Epoch: {epoch+1}/{num_epochs}, Loss: {avg_loss:.2f}, Accuracy: {acc:.2f}')

  print('Finished Training')
  
num_epochs=7   # Just for demo, adjust this higher.
training(myModel, train_dl, num_epochs)

  return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass


Epoch: 1/7, Loss: 2.21, Accuracy: 0.25
Epoch: 2/7, Loss: 1.45, Accuracy: 0.84
Epoch: 3/7, Loss: 0.93, Accuracy: 0.89
Epoch: 4/7, Loss: 0.57, Accuracy: 0.95
Epoch: 5/7, Loss: 0.58, Accuracy: 0.95
Epoch: 6/7, Loss: 0.49, Accuracy: 0.96
Epoch: 7/7, Loss: 0.53, Accuracy: 0.95
Finished Training


In [8]:
def inference(model, val_dl):
    correct_prediction = 0
    total_prediction = 0
    class_names = {0: 'Noise', 1: 'Knock'}  # Mapping class indices to class names

    # Disable gradient updates
    with torch.no_grad():
        for data in val_dl:
            # Get the input features and target labels, and put them on the GPU
            inputs, labels = data[0].to(device), data[1].to(device)

            # Normalize the inputs
            inputs_m, inputs_s = inputs.mean(), inputs.std()
            inputs = (inputs - inputs_m) / inputs_s

            # Get predictions
            outputs = model(inputs)

            # Get the predicted class with the highest score
            _, prediction = torch.max(outputs, 1)
            # Print the predicted class names, using the key itself as the default value if not found in the dictionary
            predicted_classes = [class_names.get(p.item(), p.item()) for p in prediction]
            actual_classes = [class_names.get(l.item(), l.item()) for l in labels]
            print(f'Predicted:\t{predicted_classes}\nActual:\t\t{actual_classes}')
            # Count of predictions that matched the target label
            correct_prediction += (prediction == labels).sum().item()
            total_prediction += prediction.shape[0]

    acc = correct_prediction / total_prediction
    print(f'Accuracy: {acc:.2f}, Total items: {total_prediction}')

# Assuming myModel and valid_dl are defined
inference(myModel, valid_dl)

Predicted:	['Noise', 'Noise', 'Noise', 'Noise', 'Noise', 'Knock', 'Noise', 'Knock', 'Noise', 'Knock', 'Knock', 'Knock', 'Noise', 'Noise', 'Noise', 'Noise']
Actual:		['Noise', 'Noise', 'Noise', 'Noise', 'Noise', 'Knock', 'Noise', 'Knock', 'Noise', 'Knock', 'Knock', 'Noise', 'Knock', 'Noise', 'Noise', 'Noise']
Predicted:	['Noise', 'Noise', 'Noise', 'Noise', 'Noise', 'Noise', 'Knock', 'Noise', 'Noise', 'Knock', 'Noise', 'Noise', 'Knock', 'Noise', 'Noise', 'Noise']
Actual:		['Noise', 'Noise', 'Noise', 'Noise', 'Noise', 'Noise', 'Knock', 'Noise', 'Noise', 'Knock', 'Noise', 'Noise', 'Knock', 'Noise', 'Noise', 'Noise']
Predicted:	['Knock', 'Noise', 'Noise', 'Noise', 'Knock', 'Noise', 'Noise', 'Knock', 'Noise', 'Noise', 'Noise', 'Noise', 'Noise', 'Noise', 'Noise', 'Knock']
Actual:		['Noise', 'Noise', 'Noise', 'Noise', 'Noise', 'Noise', 'Noise', 'Knock', 'Noise', 'Noise', 'Noise', 'Noise', 'Noise', 'Noise', 'Noise', 'Noise']
Predicted:	['Knock', 'Knock', 'Noise', 'Noise', 'Noise', 'Noise', 'Kno

In [9]:
#saving the model
torch.save(myModel.state_dict(), 'data/models/V8_model_state_dict.pth')
torch.save(myModel, 'data/models/V8_model_full.pth')

RuntimeError: CUDA error: the launch timed out and was terminated
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
