https://towardsdatascience.com/audio-deep-learning-made-simple-sound-classification-step-by-step-cebc936bbe5

In [1]:
import torch
import random
import torchaudio
from torchaudio import transforms
import os
from torch.utils.data import DataLoader, Dataset
from AudioUtil import AudioUtil

# ----------------------------
# Sound Dataset
# ----------------------------
class SoundDS(Dataset):
  """Dataset over the ``.wav`` files of one directory, yielding (spectrogram, label).

  Labels come from the file name: a file whose name starts with ``noise_``
  (i.e. the text before the first underscore is exactly ``noise``) is class 0,
  every other file is class 1.

  Args:
    data_path: Directory that directly contains the ``.wav`` files.
    augment: When True (the default, matching the previous behaviour) a random
      time shift is applied to the audio and frequency/time masking is applied
      to the spectrogram. Pass False for validation/test/inference data so
      that evaluation is not perturbed by random augmentation.
  """

  def __init__(self, data_path, augment=True):
    self.data_path = str(data_path)
    # os.listdir returns entries in arbitrary order; sorting keeps the
    # index -> file mapping stable across runs and machines.
    self.files = sorted(f for f in os.listdir(self.data_path) if f.endswith('.wav'))
    self.duration = 2000   # passed to AudioUtil.pad_trunc; presumably milliseconds - TODO confirm
    self.sr = 44100        # target sample rate handed to AudioUtil.resample
    self.channel = 2       # target channel count handed to AudioUtil.rechannel
    self.shift_pct = 0.4   # shift limit handed to AudioUtil.time_shift
    self.augment = augment

  # ----------------------------
  # Number of items in dataset
  # ----------------------------
  def __len__(self):
    return len(self.files)

  # ----------------------------
  # Get i'th item in dataset
  # ----------------------------
  def __getitem__(self, idx):
    """Return ``(spectrogram, class_id)`` for the idx'th ``.wav`` file."""
    # File name only; it is joined with the dataset directory when opened.
    audio_file = self.files[idx]
    # Class ID: 0 for "noise_*" files, 1 for everything else
    class_id = 0 if audio_file.split('_')[0] == 'noise' else 1

    aud = AudioUtil.open(os.path.join(self.data_path, audio_file))
    # Some sounds have a higher sample rate, or fewer channels compared to the
    # majority. So make all sounds have the same number of channels and same
    # sample rate. Unless the sample rate is the same, the pad_trunc will still
    # result in arrays of different lengths, even though the sound duration is
    # the same.
    reaud = AudioUtil.resample(aud, self.sr)
    rechan = AudioUtil.rechannel(reaud, self.channel)
    dur_aud = AudioUtil.pad_trunc(rechan, self.duration)

    # Waveform-level augmentation (training only)
    if self.augment:
      dur_aud = AudioUtil.time_shift(dur_aud, self.shift_pct)

    sgram = AudioUtil.spectro_gram(dur_aud, n_mels=64, n_fft=1024, hop_len=None)

    # Spectrogram-level augmentation (training only)
    if self.augment:
      sgram = AudioUtil.spectro_augment(sgram, max_mask_pct=0.1, n_freq_masks=2, n_time_masks=2)

    return sgram, class_id

In [2]:
# One dataset per split directory, created in train / test / validation order.
train_data, test_data, valid_data = (
    SoundDS(split_dir)
    for split_dir in ('data/train', 'data/test', 'data/validation')
)

# Wrap every split in a DataLoader that serves shuffled mini-batches of 16.
# NOTE(review): the test and validation loaders are shuffled too, so the order
# of evaluation batches differs from run to run.
train_dl, test_dl, valid_dl = (
    DataLoader(split_data, batch_size=16, shuffle=True)
    for split_data in (train_data, test_data, valid_data)
)

In [3]:
import torch.nn as nn
from AudioClassifier import AudioClassifier

# Use the first CUDA device when one is available, otherwise fall back to the CPU
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
# Instantiate the two-class classifier and move its parameters to that device
myModel = AudioClassifier(out_features=2).to(device)
# Cell result: the device that holds the model's first parameter
next(myModel.parameters()).device

device(type='cuda', index=0)

In [49]:
# Show the installed PyTorch version (useful when reproducing the results below)
print(torch.__version__)

2.3.0+cu118


In [4]:
import ipywidgets as widgets
# Integer slider labelled 'Test Slider:' covering 0..10 in steps of 1,
# starting at 7. It is not used by the training or inference cells.
slider = widgets.IntSlider(
    description='Test Slider:',
    min=0,
    max=10,
    step=1,
    value=7,
)

# Render the widget in the cell output
display(slider)

# Read the slider's current value. Executed right after creation this prints
# the initial value; re-run in a later cell to see a value chosen by the user.
print(slider.value)

IntSlider(value=7, description='Test Slider:', max=10)

7


In [5]:
# ----------------------------
# Training Loop
# ----------------------------
def training(model, train_dl, num_epochs, lr=0.003, target_acc=0.92):
  """Train ``model`` with Adam and cross-entropy loss, printing stats per epoch.

  Each batch is normalised with its own mean and standard deviation before it
  is fed to the model. After every epoch the mean batch loss and the training
  accuracy are printed; training stops early once the accuracy of an epoch
  exceeds ``target_acc``.

  Args:
    model: ``torch.nn.Module`` to optimise in place. Batches are moved to the
      device of the model's first parameter, so the function does not rely on
      a notebook-level ``device`` variable.
    train_dl: Iterable of ``(inputs, labels)`` batches (e.g. a ``DataLoader``).
    num_epochs: Maximum number of passes over ``train_dl``.
    lr: Adam learning rate (default 0.003, the previously hard-coded value).
    target_acc: Early-stop threshold on the epoch's training accuracy
      (default 0.92, the previously hard-coded value). ``None`` disables
      early stopping.

  Raises:
    ValueError: If an epoch yields no samples (previously this surfaced as a
      ``ZeroDivisionError`` when computing the averages).
  """
  # Loss Function and Optimizer
  criterion = nn.CrossEntropyLoss()
  optimizer = torch.optim.Adam(model.parameters(), lr=lr)
  # A per-batch LR scheduler (e.g. OneCycleLR) would be created here and
  # stepped right after optimizer.step().

  # Follow the model's own device instead of a global variable
  first_param = next(model.parameters(), None)
  device = first_param.device if first_param is not None else torch.device('cpu')

  # Make sure train-mode layers (dropout, batch norm) behave as such even if
  # the model was switched to eval mode earlier in the notebook.
  model.train()

  # Repeat for each epoch
  for epoch in range(num_epochs):
    running_loss = 0.0
    correct_prediction = 0
    total_prediction = 0
    batches_seen = 0

    # Repeat for each batch in the training set
    for inputs, labels in train_dl:
      # Put the input features and target labels on the model's device
      inputs, labels = inputs.to(device), labels.to(device)

      # Normalize the inputs with the statistics of this batch
      inputs = (inputs - inputs.mean()) / inputs.std()

      # Zero the parameter gradients
      optimizer.zero_grad()

      # forward + backward + optimize
      outputs = model(inputs)
      loss = criterion(outputs, labels)
      loss.backward()
      optimizer.step()

      # Keep stats for Loss and Accuracy
      running_loss += loss.item()
      batches_seen += 1

      # Predicted class = index of the highest score
      _, prediction = torch.max(outputs, 1)
      # Count of predictions that matched the target label
      correct_prediction += (prediction == labels).sum().item()
      total_prediction += prediction.shape[0]

    if total_prediction == 0:
      raise ValueError('train_dl yielded no samples; cannot compute loss/accuracy')

    # Print stats at the end of the epoch
    avg_loss = running_loss / batches_seen
    acc = correct_prediction / total_prediction
    print(f'Epoch: {epoch+1}/{num_epochs}, Loss: {avg_loss:.2f}, Accuracy: {acc:.2f}')
    if target_acc is not None and acc > target_acc:
      break

  print('Finished Training')
  
# Train the classifier on the training loader. training() stops before
# num_epochs is reached if an epoch's training accuracy exceeds its threshold.
num_epochs=25   # Just for demo, adjust this higher.
training(myModel, train_dl, num_epochs)

Epoch: 1/25, Loss: 0.64, Accuracy: 0.64
Epoch: 2/25, Loss: 0.63, Accuracy: 0.67
Epoch: 3/25, Loss: 0.62, Accuracy: 0.65
Epoch: 4/25, Loss: 0.61, Accuracy: 0.68
Epoch: 5/25, Loss: 0.60, Accuracy: 0.69
Epoch: 6/25, Loss: 0.57, Accuracy: 0.73
Epoch: 7/25, Loss: 0.54, Accuracy: 0.77
Epoch: 8/25, Loss: 0.51, Accuracy: 0.80
Epoch: 9/25, Loss: 0.50, Accuracy: 0.81
Epoch: 10/25, Loss: 0.48, Accuracy: 0.83
Epoch: 11/25, Loss: 0.48, Accuracy: 0.84
Epoch: 12/25, Loss: 0.47, Accuracy: 0.85
Epoch: 13/25, Loss: 0.46, Accuracy: 0.85
Epoch: 14/25, Loss: 0.45, Accuracy: 0.87
Epoch: 15/25, Loss: 0.46, Accuracy: 0.85
Epoch: 16/25, Loss: 0.44, Accuracy: 0.87
Epoch: 17/25, Loss: 0.45, Accuracy: 0.86
Epoch: 18/25, Loss: 0.44, Accuracy: 0.87
Epoch: 19/25, Loss: 0.45, Accuracy: 0.86
Epoch: 20/25, Loss: 0.44, Accuracy: 0.88
Epoch: 21/25, Loss: 0.43, Accuracy: 0.89
Epoch: 22/25, Loss: 0.44, Accuracy: 0.88
Epoch: 23/25, Loss: 0.43, Accuracy: 0.89
Epoch: 24/25, Loss: 0.44, Accuracy: 0.87
Epoch: 25/25, Loss: 0.43,

In [6]:
def inference(model, val_dl):
    """Run ``model`` over ``val_dl``, print predictions per batch and the accuracy.

    For every batch the predicted and actual class names are printed side by
    side (up to 50 per row); mismatching entries are coloured red with ANSI
    escape codes. A final line reports the overall accuracy and item count.

    The model is evaluated in eval mode, so layers such as dropout or batch
    normalisation (if the model has any) use their inference behaviour and no
    running statistics are updated. The model's previous train/eval mode is
    restored before returning.

    Args:
        model: ``torch.nn.Module`` to evaluate. Batches are moved to the device
            of its first parameter rather than to a notebook-level ``device``.
        val_dl: Iterable of ``(inputs, labels)`` batches (e.g. a ``DataLoader``).

    Returns:
        None. Results are printed. With an empty ``val_dl`` the accuracy is
        reported as ``nan`` instead of raising ``ZeroDivisionError``.
    """
    correct_prediction = 0
    total_prediction = 0
    class_names = {0: 'Noise', 1: 'Knock'}  # Mapping class indices to class names

    # ANSI escape codes for colors
    RED = '\033[91m'   # Red text
    RESET = '\033[0m'  # Reset to default color

    # Maximum number of predictions/actuals printed per row
    row_len = 50

    # Follow the model's own device instead of a global variable
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

    was_training = model.training
    model.eval()
    try:
        # Disable gradient tracking
        with torch.no_grad():
            for inputs, labels in val_dl:
                # Put the input features and target labels on the model's device
                inputs, labels = inputs.to(device), labels.to(device)

                # Normalize the inputs with the statistics of this batch
                inputs = (inputs - inputs.mean()) / inputs.std()

                # Predicted class = index of the highest score
                outputs = model(inputs)
                _, prediction = torch.max(outputs, 1)

                # Convert predictions and actual labels to class names
                # (unknown indices are shown as the raw number)
                predicted_classes = [class_names.get(p, p) for p in prediction.tolist()]
                actual_classes = [class_names.get(a, a) for a in labels.tolist()]

                # Print row_len predictions and actuals per line
                for start in range(0, len(predicted_classes), row_len):
                    pairs = list(zip(predicted_classes[start:start + row_len],
                                     actual_classes[start:start + row_len]))
                    pred_str = ', '.join(
                        f'{RED if pred != actual else RESET}{pred}{RESET}' for pred, actual in pairs)
                    actual_str = ', '.join(
                        f'{RED if pred != actual else RESET}{actual}{RESET}' for pred, actual in pairs)
                    print(f'Predicted: {pred_str}\nActual:    {actual_str}\n')

                # Count of predictions that matched the target label
                correct_prediction += (prediction == labels).sum().item()
                total_prediction += prediction.shape[0]
    finally:
        # Leave the model in whatever mode the caller had it in
        model.train(was_training)

    acc = correct_prediction / total_prediction if total_prediction else float('nan')
    print(f'Accuracy: {acc:.2f}, Total items: {total_prediction}')

# Evaluate the trained model on the validation loader
# (requires myModel and valid_dl from the cells above)
inference(myModel, valid_dl)

Predicted: [0mKnock[0m, [0mNoise[0m, [0mNoise[0m, [91mKnock[0m, [0mKnock[0m, [0mNoise[0m, [0mKnock[0m, [0mNoise[0m, [0mKnock[0m, [0mNoise[0m, [0mNoise[0m, [0mNoise[0m, [91mNoise[0m, [0mNoise[0m, [0mNoise[0m, [0mNoise[0m
Actual:    [0mKnock[0m, [0mNoise[0m, [0mNoise[0m, [91mNoise[0m, [0mKnock[0m, [0mNoise[0m, [0mKnock[0m, [0mNoise[0m, [0mKnock[0m, [0mNoise[0m, [0mNoise[0m, [0mNoise[0m, [91mKnock[0m, [0mNoise[0m, [0mNoise[0m, [0mNoise[0m

Predicted: [0mNoise[0m, [0mKnock[0m, [0mNoise[0m, [0mNoise[0m, [0mNoise[0m, [0mNoise[0m, [0mNoise[0m, [91mNoise[0m, [0mKnock[0m, [0mKnock[0m, [0mNoise[0m, [0mKnock[0m, [0mNoise[0m, [0mKnock[0m, [0mNoise[0m, [0mKnock[0m
Actual:    [0mNoise[0m, [0mKnock[0m, [0mNoise[0m, [0mNoise[0m, [0mNoise[0m, [0mNoise[0m, [0mNoise[0m, [91mKnock[0m, [0mKnock[0m, [0mKnock[0m, [0mNoise[0m, [0mKnock[0m, [0mNoise[0m, [0mKnock[0m, [0mNoise[0m, [0mKno

In [7]:
# Saving the model.
# torch.save does not create missing parent directories, so make sure the
# target folder exists first.
os.makedirs('data/models', exist_ok=True)
# Preferred artefact: just the parameters/buffers (reloaded via load_state_dict)
torch.save(myModel.state_dict(), 'data/models/V8_model_state_dict.pth')
# Whole pickled module; loading it needs the AudioClassifier class to be importable
torch.save(myModel, 'data/models/V8_model_full.pth')

In [8]:
# Rebuild the classifier with the same configuration used for training and
# restore the saved weights.
model = AudioClassifier(out_features=2)
model.load_state_dict(
    torch.load(
        'data/models/V8_model_state_dict.pth',
        map_location=device,   # lets a checkpoint saved on GPU load on a CPU-only machine
        weights_only=True,     # the file holds only tensors; avoid unpickling arbitrary objects
    )
)
model = model.to(device)
# Inference mode: dropout / batch-norm layers (if any) stop behaving as in training
model.eval()

# Run the reloaded model on the clip(s) in data/single, one item per batch
single_ds = SoundDS('data/single')
single_dl = DataLoader(single_ds, shuffle=True)

inference(model, single_dl)

Predicted: [91mNoise[0m
Actual:    [91mKnock[0m

Accuracy: 0.00, Total items: 1
