https://towardsdatascience.com/audio-deep-learning-made-simple-sound-classification-step-by-step-cebc936bbe5

In [1]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from SoundDS import SoundDS
from AudioClassifier import AudioClassifier

In [5]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [6]:
train_data = SoundDS('../data/trainv2')
test_data = SoundDS('../data/testv2')
valid_data = SoundDS('../data/validationv2')

train_dl = DataLoader(train_data, batch_size=16, shuffle=True)
test_dl = DataLoader(test_data, batch_size=16, shuffle=True)
val_dl = DataLoader(valid_data, batch_size=16, shuffle=True)

In [7]:
# Create the model and put it on the GPU if available
myModel = AudioClassifier(out_features=3)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
myModel = myModel.to(device)
# Check that it is on Cuda
next(myModel.parameters()).device

device(type='cuda', index=0)

In [8]:
# ----------------------------
# Training Loop
# ----------------------------
def training(model, train_dl, num_epochs):
  # Loss Function, Optimizer and Scheduler
  criterion = nn.CrossEntropyLoss()
  optimizer = torch.optim.Adam(model.parameters(),lr=0.003)
  # scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer, max_lr=0.001,
  #                                               steps_per_epoch=int(len(train_dl)),
  #                                               epochs=num_epochs,
  #                                               anneal_strategy='linear')

  # Repeat for each epoch
  for epoch in range(num_epochs):
    running_loss = 0.0
    correct_prediction = 0
    total_prediction = 0

    # Repeat for each batch in the training set
    for i, data in enumerate(train_dl):
        # Get the input features and target labels, and put them on the GPU
        inputs, labels = data[0].to(device), data[1].to(device)

        # Normalize the inputs
        inputs_m, inputs_s = inputs.mean(), inputs.std()
        inputs = (inputs - inputs_m) / inputs_s

        # Zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        # scheduler.step()

        # Keep stats for Loss and Accuracy
        running_loss += loss.item()

        # Get the predicted class with the highest score
        _, prediction = torch.max(outputs,1)
        # Count of predictions that matched the target label
        correct_prediction += (prediction == labels).sum().item()
        total_prediction += prediction.shape[0]

        # if i % 10 == 0:    # print every 10 mini-batches
        #    print('[%d, %5d] loss: %.3f' % (epoch + 1, i + 1, running_loss / 10))
    
    # Print stats at the end of the epoch
    num_batches = len(train_dl)
    avg_loss = running_loss / num_batches
    acc = correct_prediction/total_prediction
    print(f'Epoch: {epoch+1}/{num_epochs}, Loss: {avg_loss:.2f}, Accuracy: {acc:.2f}')
    if acc > 0.95:
      break

  print('Finished Training')
  
num_epochs=25   # Just for demo, adjust this higher.
training(myModel, train_dl, num_epochs)

Epoch: 1/25, Loss: 0.92, Accuracy: 0.62
Epoch: 2/25, Loss: 0.85, Accuracy: 0.70
Epoch: 3/25, Loss: 0.83, Accuracy: 0.72
Epoch: 4/25, Loss: 0.81, Accuracy: 0.73
Epoch: 5/25, Loss: 0.80, Accuracy: 0.75
Epoch: 6/25, Loss: 0.75, Accuracy: 0.81
Epoch: 7/25, Loss: 0.69, Accuracy: 0.87
Epoch: 8/25, Loss: 0.67, Accuracy: 0.89
Epoch: 9/25, Loss: 0.65, Accuracy: 0.91
Epoch: 10/25, Loss: 0.64, Accuracy: 0.91
Epoch: 11/25, Loss: 0.63, Accuracy: 0.93
Epoch: 12/25, Loss: 0.64, Accuracy: 0.92
Epoch: 13/25, Loss: 0.62, Accuracy: 0.93
Epoch: 14/25, Loss: 0.61, Accuracy: 0.94
Epoch: 15/25, Loss: 0.61, Accuracy: 0.94
Epoch: 16/25, Loss: 0.61, Accuracy: 0.94
Epoch: 17/25, Loss: 0.61, Accuracy: 0.94
Epoch: 18/25, Loss: 0.61, Accuracy: 0.95
Epoch: 19/25, Loss: 0.61, Accuracy: 0.95
Epoch: 20/25, Loss: 0.60, Accuracy: 0.95
Finished Training


In [9]:
def inference(model, val_dl):
    correct_prediction = 0
    total_prediction = 0
    class_names = {0: 'Noise', 1: 'Knock', 2:'Kn_se'}  # Mapping class indices to class names

    # ANSI escape codes for colors
    RED = '\033[91m'   # Red text
    RESET = '\033[0m'  # Reset to default color

    # Disable gradient updates
    with torch.no_grad():
        for data in val_dl:
            # Get the input features and target labels, and put them on the GPU
            inputs, labels = data[0].to(device), data[1].to(device)

            # Normalize the inputs
            inputs_m, inputs_s = inputs.mean(), inputs.std()
            inputs = (inputs - inputs_m) / inputs_s

            # Get predictions
            outputs = model(inputs)

            # Get the predicted class with the highest score
            _, prediction = torch.max(outputs, 1)
            # Convert predictions and actual labels to class names
            predicted_classes = [class_names.get(p.item(), p.item()) for p in prediction]
            actual_classes = [class_names.get(l.item(), l.item()) for l in labels]
            row_len = 50
            # Prepare to print 10 predictions and actuals per line
            for i in range(0, len(predicted_classes), row_len):
                pred_slice = predicted_classes[i:i+row_len]
                actual_slice = actual_classes[i:i+row_len]
                pred_str = ', '.join([f'{RED if pred != actual else RESET}{pred}{RESET}' for pred, actual in zip(pred_slice, actual_slice)])
                actual_str = ', '.join([f'{RED if pred != actual else RESET}{actual}{RESET}' for pred, actual in zip(pred_slice, actual_slice)])
                print(f'Predicted: {pred_str}\nActual:    {actual_str}\n')

            # Count of predictions that matched the target label
            correct_prediction += (prediction == labels).sum().item()
            total_prediction += prediction.shape[0]

    acc = correct_prediction / total_prediction
    print(f'Accuracy: {acc:.2f}, Total items: {total_prediction}')

In [10]:
inference(myModel, val_dl)

Predicted: [0mKn_se[0m, [0mKn_se[0m, [0mNoise[0m, [0mNoise[0m, [0mKn_se[0m, [0mNoise[0m, [0mKn_se[0m, [0mNoise[0m, [0mKnock[0m, [0mKnock[0m, [0mKnock[0m, [0mNoise[0m, [0mKnock[0m, [0mKnock[0m, [0mKn_se[0m, [91mNoise[0m
Actual:    [0mKn_se[0m, [0mKn_se[0m, [0mNoise[0m, [0mNoise[0m, [0mKn_se[0m, [0mNoise[0m, [0mKn_se[0m, [0mNoise[0m, [0mKnock[0m, [0mKnock[0m, [0mKnock[0m, [0mNoise[0m, [0mKnock[0m, [0mKnock[0m, [0mKn_se[0m, [91mKn_se[0m

Predicted: [0mNoise[0m, [0mNoise[0m, [0mNoise[0m, [0mKn_se[0m, [0mNoise[0m, [91mNoise[0m, [0mNoise[0m, [0mKnock[0m, [0mNoise[0m, [0mKn_se[0m, [0mKnock[0m, [0mNoise[0m, [0mNoise[0m, [0mNoise[0m, [0mKn_se[0m, [0mNoise[0m
Actual:    [0mNoise[0m, [0mNoise[0m, [0mNoise[0m, [0mKn_se[0m, [0mNoise[0m, [91mKn_se[0m, [0mNoise[0m, [0mKnock[0m, [0mNoise[0m, [0mKn_se[0m, [0mKnock[0m, [0mNoise[0m, [0mNoise[0m, [0mNoise[0m, [0mKn_se[0m, [0mNoise

In [11]:
#saving the model
torch.save(myModel, 'data/models/V8_model_fullV2.pth')

In [7]:
model = torch.load('data/models/V8_model_full.pth')
model = model.to(device)

inference(model, val_dl)

Predicted: [0mNoise[0m, [0mKn_se[0m, [0mKnock[0m, [0mKn_se[0m, [0mKn_se[0m, [0mNoise[0m, [0mKn_se[0m, [0mKn_se[0m, [91mNoise[0m, [0mNoise[0m, [0mNoise[0m, [0mNoise[0m, [0mNoise[0m, [0mNoise[0m, [0mNoise[0m, [0mKn_se[0m
Actual:    [0mNoise[0m, [0mKn_se[0m, [0mKnock[0m, [0mKn_se[0m, [0mKn_se[0m, [0mNoise[0m, [0mKn_se[0m, [0mKn_se[0m, [91mKn_se[0m, [0mNoise[0m, [0mNoise[0m, [0mNoise[0m, [0mNoise[0m, [0mNoise[0m, [0mNoise[0m, [0mKn_se[0m

Predicted: [0mKn_se[0m, [0mKnock[0m, [0mKn_se[0m, [0mNoise[0m, [0mNoise[0m, [0mNoise[0m, [0mNoise[0m, [0mKn_se[0m, [0mNoise[0m, [0mKn_se[0m, [0mNoise[0m, [0mNoise[0m, [0mKn_se[0m, [0mNoise[0m, [0mKn_se[0m, [0mNoise[0m
Actual:    [0mKn_se[0m, [0mKnock[0m, [0mKn_se[0m, [0mNoise[0m, [0mNoise[0m, [0mNoise[0m, [0mNoise[0m, [0mKn_se[0m, [0mNoise[0m, [0mKn_se[0m, [0mNoise[0m, [0mNoise[0m, [0mKn_se[0m, [0mNoise[0m, [0mKn_se[0m, [0mNoise[