https://towardsdatascience.com/audio-deep-learning-made-simple-sound-classification-step-by-step-cebc936bbe5

In [60]:
import torch
import timm
import torch.nn as nn
from torch.utils.data import DataLoader
from SoundDS import SoundDS
from AudioClassifier import AudioClassifier

In [61]:
%reload_ext autoreload

In [62]:
train_data = SoundDS('../data/train')
test_data = SoundDS('../data/test')
valid_data = SoundDS('../data/validation')

train_dl = DataLoader(train_data, batch_size=16, shuffle=True)
test_dl = DataLoader(test_data, batch_size=16, shuffle=True)
val_dl = DataLoader(valid_data, batch_size=16, shuffle=True)

In [63]:
next(iter(train_dl))

[tensor([[[[-32.7696, -23.2426, -32.9378,  ..., -19.0813, -32.4139, -28.9768],
           [-21.1418, -30.3780, -35.1775,  ..., -23.8783, -30.6226, -30.0355],
           [-21.5617, -35.1382, -29.4472,  ..., -27.0059, -34.1754, -26.9220],
           ...,
           [-45.1180, -42.6466, -45.6137,  ..., -43.2369, -43.1089, -43.3858],
           [-38.4832, -38.5931, -36.8676,  ..., -39.0069, -37.4824, -37.8154],
           [-39.7686, -38.3933, -36.8613,  ..., -38.2027, -38.3498, -36.3121]],
 
          [[-32.7696, -23.2426, -32.9378,  ..., -19.0813, -32.4139, -28.9768],
           [-21.1418, -30.3780, -35.1775,  ..., -23.8783, -30.6226, -30.0355],
           [-21.5617, -35.1382, -29.4472,  ..., -27.0059, -34.1754, -26.9220],
           ...,
           [-45.1180, -42.6466, -45.6137,  ..., -43.2369, -43.1089, -43.3858],
           [-38.4832, -38.5931, -36.8676,  ..., -39.0069, -37.4824, -37.8154],
           [-39.7686, -38.3933, -36.8613,  ..., -38.2027, -38.3498, -36.3121]],
 
          [[-3

In [64]:
import torchvision.models as models
def create_model(num_classes=2):
    # Laden des vorab trainierten ResNet34 Modells
    # model = timm.create_model('resnet34', pretrained=True)
    
    # # Anpassen des letzten FC Layers für 2 Ausgabeklassen
    # num_ftrs = model.fc.in_features
    # model.fc = nn.Linear(num_ftrs, num_classes)

    model = models.resnet18(pretrained=True)
    num_ftrs = model.fc.in_features
    model.fc = nn.Linear(num_ftrs, num_classes)
    
    return model

In [65]:
# Create the model and put it on the GPU if available
# myModel = AudioClassifier()
myModel = create_model()
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
myModel = myModel.to(device)
# Check that it is on Cuda
next(myModel.parameters()).device

device(type='cuda', index=0)

In [66]:
# ----------------------------
# Training Loop
# ----------------------------
def training(model, train_dl, num_epochs):
  # Loss Function, Optimizer and Scheduler
  criterion = nn.CrossEntropyLoss()
  optimizer = torch.optim.Adam(model.parameters(),lr=0.001)
  # scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer, max_lr=0.001,
  #                                               steps_per_epoch=int(len(train_dl)),
  #                                               epochs=num_epochs,
  #                                               anneal_strategy='linear')

  # Repeat for each epoch
  for epoch in range(num_epochs):
    running_loss = 0.0
    correct_prediction = 0
    total_prediction = 0

    # Repeat for each batch in the training set
    for i, data in enumerate(train_dl):
        # Get the input features and target labels, and put them on the GPU
        inputs, labels = data[0].to(device), data[1].to(device)

        # Normalize the inputs
        inputs_m, inputs_s = inputs.mean(), inputs.std()
        inputs = (inputs - inputs_m) / inputs_s

        # Zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        # scheduler.step()

        # Keep stats for Loss and Accuracy
        running_loss += loss.item()

        # Get the predicted class with the highest score
        _, prediction = torch.max(outputs,1)
        # Count of predictions that matched the target label
        correct_prediction += (prediction == labels).sum().item()
        total_prediction += prediction.shape[0]

        # if i % 10 == 0:    # print every 10 mini-batches
        #    print('[%d, %5d] loss: %.3f' % (epoch + 1, i + 1, running_loss / 10))
    
    # Print stats at the end of the epoch
    num_batches = len(train_dl)
    avg_loss = running_loss / num_batches
    acc = correct_prediction/total_prediction
    print(f'Epoch: {epoch+1}/{num_epochs}, Loss: {avg_loss:.2f}, Accuracy: {acc:.2f}')
    if acc > 0.98:
      break

  print('Finished Training')

In [32]:
num_epochs=3  # Just for demo, adjust this higher.
training(myModel, train_dl, num_epochs)

RuntimeError: Given groups=1, weight of size [64, 3, 7, 7], expected input[16, 2, 64, 172] to have 3 channels, but got 2 channels instead

In [14]:
def inference(model, val_dl):
    correct_prediction = 0
    total_prediction = 0
    class_names = {0: 'Noise', 1: 'Knock', 2:'Kn_se'}  # Mapping class indices to class names

    # ANSI escape codes for colors
    RED = '\033[91m'   # Red text
    RESET = '\033[0m'  # Reset to default color

    # Disable gradient updates
    with torch.no_grad():
        for data in val_dl:
            # Get the input features and target labels, and put them on the GPU
            inputs, labels = data[0].to(device), data[1].to(device)

            # Normalize the inputs
            inputs_m, inputs_s = inputs.mean(), inputs.std()
            inputs = (inputs - inputs_m) / inputs_s

            # Get predictions
            outputs = model(inputs)

            # Get the predicted class with the highest score
            _, prediction = torch.max(outputs, 1)
            # Convert predictions and actual labels to class names
            predicted_classes = [class_names.get(p.item(), p.item()) for p in prediction]
            actual_classes = [class_names.get(l.item(), l.item()) for l in labels]
            row_len = 50
            # Prepare to print 10 predictions and actuals per line
            for i in range(0, len(predicted_classes), row_len):
                pred_slice = predicted_classes[i:i+row_len]
                actual_slice = actual_classes[i:i+row_len]
                pred_str = ', '.join([f'{RED if pred != actual else RESET}{pred}{RESET}' for pred, actual in zip(pred_slice, actual_slice)])
                actual_str = ', '.join([f'{RED if pred != actual else RESET}{actual}{RESET}' for pred, actual in zip(pred_slice, actual_slice)])
                print(f'Predicted: {pred_str}\nActual:    {actual_str}\n')

            # Count of predictions that matched the target label
            correct_prediction += (prediction == labels).sum().item()
            total_prediction += prediction.shape[0]

    acc = correct_prediction / total_prediction
    print(f'Accuracy: {acc:.2f}, Total items: {total_prediction}')

In [15]:
inference(myModel, val_dl)

Predicted: [0mKnock[0m, [0mKnock[0m, [0mNoise[0m, [0mKnock[0m, [0mKnock[0m, [0mKnock[0m, [0mKnock[0m, [0mKnock[0m, [0mKnock[0m, [0mKnock[0m, [0mNoise[0m, [0mKnock[0m, [0mNoise[0m, [0mNoise[0m, [0mNoise[0m, [0mKnock[0m
Actual:    [0mKnock[0m, [0mKnock[0m, [0mNoise[0m, [0mKnock[0m, [0mKnock[0m, [0mKnock[0m, [0mKnock[0m, [0mKnock[0m, [0mKnock[0m, [0mKnock[0m, [0mNoise[0m, [0mKnock[0m, [0mNoise[0m, [0mNoise[0m, [0mNoise[0m, [0mKnock[0m

Predicted: [91mNoise[0m, [0mKnock[0m, [0mNoise[0m, [0mKnock[0m, [0mKnock[0m, [0mKnock[0m, [0mKnock[0m, [91mNoise[0m, [91mNoise[0m, [0mKnock[0m, [0mKnock[0m, [0mKnock[0m, [0mKnock[0m, [0mKnock[0m, [0mKnock[0m, [0mKnock[0m
Actual:    [91mKnock[0m, [0mKnock[0m, [0mNoise[0m, [0mKnock[0m, [0mKnock[0m, [0mKnock[0m, [0mKnock[0m, [91mKnock[0m, [91mKnock[0m, [0mKnock[0m, [0mKnock[0m, [0mKnock[0m, [0mKnock[0m, [0mKnock[0m, [0mKnock[0m, [0mKno

In [16]:
#saving the model
torch.save(myModel, '../data/models/V8_model_fullV4.pth')

In [10]:
model = torch.load('../data/models/V8_model_fullV4.pth')
model = model.to(device)

inference(model, val_dl)

Predicted: [0mKnock[0m, [0mKnock[0m, [0mKnock[0m, [0mKnock[0m, [0mNoise[0m, [0mKnock[0m, [0mKnock[0m, [0mKnock[0m, [0mKnock[0m, [0mKnock[0m, [0mKnock[0m, [0mKnock[0m, [0mKnock[0m, [0mKnock[0m, [0mKnock[0m, [0mKnock[0m
Actual:    [0mKnock[0m, [0mKnock[0m, [0mKnock[0m, [0mKnock[0m, [0mNoise[0m, [0mKnock[0m, [0mKnock[0m, [0mKnock[0m, [0mKnock[0m, [0mKnock[0m, [0mKnock[0m, [0mKnock[0m, [0mKnock[0m, [0mKnock[0m, [0mKnock[0m, [0mKnock[0m

Predicted: [0mKnock[0m, [0mKnock[0m, [91mNoise[0m, [91mNoise[0m, [0mKnock[0m, [91mKn_se[0m, [0mKnock[0m, [0mNoise[0m, [0mNoise[0m, [0mKnock[0m, [0mKnock[0m, [91mNoise[0m, [0mKnock[0m, [0mKnock[0m, [0mKnock[0m, [0mKnock[0m
Actual:    [0mKnock[0m, [0mKnock[0m, [91mKn_se[0m, [91mKn_se[0m, [0mKnock[0m, [91mNoise[0m, [0mKnock[0m, [0mNoise[0m, [0mNoise[0m, [0mKnock[0m, [0mKnock[0m, [91mKn_se[0m, [0mKnock[0m, [0mKnock[0m, [0mKnock[0m, [0mK