<a href="https://colab.research.google.com/github/AndreisMarco/02456_G128_bird_classification/blob/main/scripts/06_Audio_classification_with_CNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

- check: https://towardsdatascience.com/cnns-for-audio-classification-6244954665ab

# 1. Set up environment

In [1]:
!pip install datasets evaluate --quiet

## 1.1 Import libraries

In [2]:
# setting up Drive and path for data loading and saving
import os
from google.colab import drive

# for data processing
import pandas as pd
from datasets import Dataset
from transformers import AutoFeatureExtractor

# for model training and evaluation
import torch
from torch import nn
from tqdm import tqdm
import numpy as np

# for visualisation
import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


## 1.2 Load data

In [4]:
# mount Drive and set path
drive.mount('/content/drive')
path = '/content/drive/MyDrive/Deep Learning - DTU 2024/'
os.chdir(path)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
# load (one batch of) preprocessed data
batch_path = 'batch_1'
dataset = Dataset.load_from_disk(batch_path).remove_columns('__index_level_0__')

In [6]:
# inspect structure
dataset

Dataset({
    features: ['audio', 'label'],
    num_rows: 1244
})

In [7]:
dataset[0].keys()

dict_keys(['audio', 'label'])

In [8]:
num_classes = len(set(dataset["label"]))
print(f"Number of classes in the dataset: {num_classes}")

Number of classes in the dataset: 4


### 1.2.x Feature Extraction

In [9]:
model_dir = 'facebook/wav2vec2-base-960h'
feature_extractor = AutoFeatureExtractor.from_pretrained(model_dir)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [10]:
# extract features - from Marco's code
def preprocess_function(example):
    inputs = feature_extractor(example['audio'], sampling_rate=16000, padding=True)
    return inputs
dataset = dataset.map(preprocess_function, remove_columns="audio", batched=True, batch_size=32)
print("Preprocessed dataset with feature extractor.")

Preprocessed dataset with feature extractor.


### 1.2.x Split Dataset

In [11]:
# split dataset into train, val and test - from Marco's code
dataset = dataset.train_test_split(test_size=0.1, shuffle=True, stratify_by_column="label", seed=42)
print("Split dataset into training and testing.")

Split dataset into training and testing.


### 1.2.x Classes

In [12]:
# from Marco's code - not sure if needed

from sklearn.utils.class_weight import compute_class_weight

# Compute class weights and store in a dict
class_weights = compute_class_weight('balanced', classes=np.unique(dataset['train']['label']), y=dataset['train']['label'])
class_weights = {class_id: weight for class_id, weight in zip(np.unique(dataset['train']['label']), class_weights)}
print(f"Computed class weights: {class_weights}")
# Convert weights to Tensor
class_weight_tensor = torch.tensor(list(class_weights.values()), dtype=torch.float32).to(device)
print(f"Class weights tensor moved to device: {device}")

Computed class weights: {19: 1.5985714285714285, 40: 0.8797169811320755, 47: 1.8048387096774194, 49: 0.5939490445859873}
Class weights tensor moved to device: cuda


### 1.2.x Evaluation

In [13]:
# from Marco's code

import evaluate

# Use accuracy as performace metric
accuracy = evaluate.load("accuracy")
print("Loaded evaluation metric: accuracy.")

def compute_metrics(eval_pred):
    # Extract the model's predictions from eval_pred.
    predictions = eval_pred.predictions
    # Apply the softmax function to convert prediction scores into probabilities.
    predictions = np.exp(predictions) / np.exp(predictions).sum(axis=1, keepdims=True)
    # Extract the true label IDs from eval_pred.
    label_ids = eval_pred.label_ids
    # Calculate accuracy using the loaded accuracy metric by comparing predicted classes
    # (argmax of probabilities) with the true label IDs.
    acc_score = accuracy.compute(predictions=predictions.argmax(axis=1), references=label_ids)['accuracy']
    # Return the computed accuracy as a dictionary with a key "accuracy."
    return {
        "accuracy": acc_score
    }

Loaded evaluation metric: accuracy.


### 1.2.x Dataset to Loader

In [14]:
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence

# collate function for padding
def collate_fn(batch):
    inputs = [torch.tensor(item['input_values']) for item in batch]
    labels = [item['label'] for item in batch]
    padded_inputs = pad_sequence(inputs, batch_first=True)
    return padded_inputs, torch.tensor(labels)

In [15]:
# convert dataset col to tensors
dataset['train'].set_format(type='torch', columns=['input_values', 'label'])
dataset['test'].set_format(type='torch', columns=['input_values', 'label'])

In [16]:
# load data into trainloader
train_loader = torch.utils.data.DataLoader(dataset['train'], batch_size=16, collate_fn=collate_fn)
test_loader = torch.utils.data.DataLoader(dataset['test'], batch_size=16, collate_fn=collate_fn)

In [17]:
# inspecting data format
for batch in train_loader:
    print(batch)
    break

(tensor([[ 9.7966e-01,  5.0985e-01, -1.0090e+00,  ...,  0.0000e+00,
          0.0000e+00,  0.0000e+00],
        [-3.1648e-01,  7.4693e-01,  1.3755e+00,  ...,  0.0000e+00,
          0.0000e+00,  0.0000e+00],
        [ 1.5372e-01,  8.3759e-01,  7.3054e-01,  ...,  0.0000e+00,
          0.0000e+00,  0.0000e+00],
        ...,
        [-8.1476e-01,  4.2780e-01,  7.5354e-01,  ...,  7.9526e-04,
          7.9526e-04,  7.9526e-04],
        [ 4.9028e-01,  8.0028e-01,  9.4498e-01,  ...,  0.0000e+00,
          0.0000e+00,  0.0000e+00],
        [ 4.9799e-01,  5.5678e-01, -2.0586e-02,  ...,  0.0000e+00,
          0.0000e+00,  0.0000e+00]]), tensor([19, 40, 40, 19, 49, 49, 49, 40, 47, 47, 49, 47, 49, 40, 49, 49]))


  inputs = [torch.tensor(item['input_values']) for item in batch]


### 1.2.x Intitiating CNN

In [18]:
import torch
import torch.nn as nn

class AudioCNN(nn.Module):
    def __init__(self, num_classes):
        super(AudioCNN, self).__init__()
        self.conv1 = nn.Conv1d(1, 32, kernel_size=3, stride=1, padding=1)
        self.conv2 = nn.Conv1d(32, 64, kernel_size=3, stride=1, padding=1)
        self.pool = nn.MaxPool1d(2, 2)
        self.fc1 = nn.Linear(64 * 40000, 256)  # input size should be 160000, which is confirmed wile the model was trained
        self.fc2 = nn.Linear(256, num_classes)
        self.relu = nn.ReLU()

    def forward(self, x):
        x = self.pool(self.relu(self.conv1(x)))  # First convolution and pooling
        print("Shape after conv1:", x.shape)  # Print the shape after conv1
        x = self.pool(self.relu(self.conv2(x)))  # Second convolution and pooling
        print("Shape after conv2:", x.shape)
        x = x.view(x.size(0), -1)  # Flatten the tensor
        print("Shape after flattening:", x.shape)
        x = self.relu(self.fc1(x))  # First fully connected layer
        print("Shape after fc1:", x.shape)
        # x = self.dropout(x)
        # print("Shape after dropout:", x.shape)
        x = self.fc2(x)  # Second fully connected layer (output)
        print("Shape after fc2:", x.shape)
        return x

In [19]:
# # adapted from CIFAR exercise
# class CNN_audio(nn.Module):
#     def __init__(self, num_classes):
#         super().__init__()
#         self.num_classes = num_classes
#         # Three convolutional layers
#         self.conv1 = nn.Conv1d(in_channels=3, out_channels=16, kernel_size=3, stride=1, padding=1)
#         self.conv2 = nn.Conv1d(in_channels=16, out_channels=32, kernel_size=3, stride=1, padding=1)
#         self.conv3 = nn.Conv1d(in_channels=32, out_channels=64, kernel_size=3, stride=1, padding=1)

#         # Fully connected layers
#         self.fc1 = nn.Linear(4 * 4 * 64, 256)  # Input size calculated dynamically in forward method
#         self.fc2 = nn.Linear(256, 128)
#         self.fc3 = nn.Linear(128, 10)

#         # Batch Normalization after each layer for Optimization
#         self.bn1 = nn.BatchNorm1d(16)
#         self.bn2 = nn.BatchNorm1d(32)
#         self.bn3 = nn.BatchNorm1d(64)

#         # Pooling
#         self.pool = nn.MaxPool1d(kernel_size=2, stride=2)

#         # Activation
#         self.relu = nn.ReLU() # delete if you prefere F.relu in the forward part below


#     def forward(self, x):
#         # Your code here!

#         # without batch normalization
#         # x = self.pool(self.relu(self.conv1(x)))  # 32x32 -> 16x16
#         # x = self.pool(self.relu(self.conv2(x)))  # 16x16 -> 8x8
#         # x = self.pool(self.relu(self.conv3(x)))  # 8x8 -> 4x4

#         # with batch normalization
#         x = self.pool(self.relu(self.bn1(self.conv1(x))))
#         print(x.shape)
#         x = self.pool(self.relu(self.bn2(self.conv2(x))))
#         print(x.shape)
#         x = self.pool(self.relu(self.bn3(self.conv3(x))))
#         print(x.shape)


#         x = x.view(x.size(0), -1)  # Flatten all dimensions except batch
#         x = self.relu(self.fc1(x))  # if you get an error regarding this relu, specify it above as: self.relu1 = nn.ReLU() and so forth.
#         x = self.relu(self.fc2(x))
#         x = self.fc3(x)
#         return x

In [20]:
model = AudioCNN(num_classes).to(device)
print(model)

AudioCNN(
  (conv1): Conv1d(1, 32, kernel_size=(3,), stride=(1,), padding=(1,))
  (conv2): Conv1d(32, 64, kernel_size=(3,), stride=(1,), padding=(1,))
  (pool): MaxPool1d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (fc1): Linear(in_features=2560000, out_features=256, bias=True)
  (fc2): Linear(in_features=256, out_features=4, bias=True)
  (relu): ReLU()
)


In [21]:
loss_fct = nn.CrossEntropyLoss(weight=class_weight_tensor)
optimizer = torch.optim.Adam(model.parameters(), lr=3e-4, weight_decay=0.01)

### 1.2.x Training the Model

In [22]:
num_epochs = 1

for epoch in range(num_epochs):
    model.train()
    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device)

        # Add the channel dimension for Conv1d: [batch_size, 1, sequence_length]
        inputs = inputs.unsqueeze(1)  # Shape: [batch_size, 1, sequence_length]

        print(f"Shape of inputs before passing to model: {inputs.shape}")

        optimizer.zero_grad()
        outputs = model(inputs)

        print(f"Unique values in labels: {torch.unique(labels)}")

        labels = labels.long()
        loss = loss_fct(outputs, labels)
        loss.backward()
        optimizer.step()

Shape of inputs before passing to model: torch.Size([16, 1, 160000])


  inputs = [torch.tensor(item['input_values']) for item in batch]


Shape after conv1: torch.Size([16, 32, 80000])
Shape after conv2: torch.Size([16, 64, 40000])
Shape after flattening: torch.Size([16, 2560000])
Shape after fc1: torch.Size([16, 256])
Shape after fc2: torch.Size([16, 4])
Unique values in labels: tensor([19, 40, 47, 49], device='cuda:0')


RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [25]:
labels.shape

torch.Size([16])

### 1.2.x Evaluate Model

In [None]:
# model.eval()
# correct = 0
# total = 0
# with torch.no_grad():
#     for inputs, labels in test_loader:
#         inputs, labels = inputs.to(device), labels.to(device)
#         outputs = model(inputs)
#         _, predicted = torch.max(outputs, 1)
#         total += labels.size(0)
#         correct += (predicted == labels).sum().item()
# accuracy = correct / total
# print(f"Test Accuracy: {accuracy:.2f}")