In [1]:
# ----------------------------
# Prepare training data from Metadata file
# ----------------------------
import pandas as pd
from pathlib import Path

# Root folder that holds metadata.csv and the audio chunks.
download_path = Path('data/DATA/output_chunks')

# Read metadata file
metadata_file = download_path / 'metadata.csv'
df = pd.read_csv(metadata_file)

# Build a path column from `road` and `slice_file_name`, then keep only the
# path and label columns. `assign` returns a new frame instead of inserting a
# column into the freshly loaded one.
# NOTE(review): this path uses a leading '/road/' prefix, whereas SoundDS builds
# '<road>/<slice_file_name>' itself -- confirm which layout is intended.
df = df.assign(
    relative_path='/road/' + df['road'].astype(str) + '/' + df['slice_file_name'].astype(str)
)[['relative_path', 'classID']]

# Check class balance
class_counts = df['classID'].value_counts()
print('Class distribution:')
print(class_counts)

Class distribution:
classID
2     700
9     405
1     364
5     212
3     179
6     130
4      77
10     68
0      42
7       6
8       2
Name: count, dtype: int64


In [2]:
# Show which columns `df` currently holds.
print(df.columns)

Index(['relative_path', 'classID'], dtype='object')


In [3]:
import pandas as pd

# Load the CSV file again. NOTE: this rebinds `df` to the full metadata table
# (all CSV columns), replacing the two-column frame built in the first cell.
df = pd.read_csv(metadata_file)

# Get distinct values from the 'class' column
distinct_classes = df['class'].unique()

# Display the result
print(distinct_classes)

['light truck' 'motorcycle' 'medium truck' 'PSV' 'other' 'pickup'
 'private car' 'SUV' 'bus' 'bicycle' 'heavy truck']


In [4]:
#import pandas as pd
#
# Load the large CSV file
#df = pd.read_csv(metadata_file)
#
# Define the mapping from class name to ClassID
#class_to_id = {
#    'bicycle': 0,
#    'motorcycle': 1,
#    'private car': 2,
#    'SUV': 3,
#    'pickup': 4,
#    'light truck': 5,
#    'medium truck': 6,
#    'heavy truck': 7,
#    'bus': 8,
#    'PSV': 9,
#    'other': 10
#}
#
# Update the ClassID column based on the class column
#df['classID'] = df['class'].map(class_to_id)
#
# Save the updated file (overwrite or create new)
#df.to_csv("metadata_updated.csv", index=False)

In [None]:
import math
import random
import torch
import torchaudio
from torchaudio import transforms
from IPython.display import Audio

class AudioUtil:
    """Stateless helpers that load audio and turn it into mel-spectrogram features.

    Every method taking ``aud`` expects a ``(signal, sample_rate)`` tuple in which
    ``signal`` is a 2-D tensor laid out as ``[channels, samples]``.
    """

    @staticmethod
    def open(audio_file):
        """Load ``audio_file`` with torchaudio and return ``(signal, sample_rate)``."""
        sig, sr = torchaudio.load(audio_file)
        return (sig, sr)

    @staticmethod
    def rechannel(aud, new_channel=1):
        """Convert the signal to ``new_channel`` channels.

        Audio that already has ``new_channel`` channels is returned unchanged.
        For ``new_channel == 1`` a multi-channel signal is averaged down to mono.
        Otherwise the signal is duplicated along the channel axis, which turns
        mono into stereo.
        """
        sig, sr = aud
        if sig.shape[0] == new_channel:
            return aud
        if new_channel == 1:
            if sig.shape[0] > 1:
                resig = sig.mean(dim=0, keepdim=True)
            else:
                resig = sig
        else:
            resig = torch.cat([sig, sig])
        return (resig, sr)

    @staticmethod
    def resample(aud, newsr):
        """Resample the signal to ``newsr`` Hz; a no-op when the rate already matches."""
        sig, sr = aud
        if sr == newsr:
            return aud
        # Resample operates along the last (time) axis, so one transform handles
        # every channel; there is no need to build a second resampler.
        resig = torchaudio.transforms.Resample(sr, newsr)(sig)
        return (resig, newsr)

    @staticmethod
    def pad_trunc(aud, max_ms):
        """Force the signal to exactly ``max_ms`` milliseconds.

        Longer signals are cut at the end. Shorter signals are zero-padded, with
        the padding split at a random point between the beginning and the end.
        """
        sig, sr = aud
        num_rows, sig_len = sig.shape
        # Multiply before dividing: `sr // 1000 * max_ms` would truncate 22050 Hz
        # to 22 samples/ms and yield 132000 samples instead of 132300 for 6 s.
        max_len = int(sr * max_ms // 1000)
        if sig_len > max_len:
            sig = sig[:, :max_len]
        elif sig_len < max_len:
            pad_begin_len = random.randint(0, max_len - sig_len)
            pad_end_len = max_len - sig_len - pad_begin_len
            # Match dtype/device of the signal so the concatenation never promotes or fails.
            pad_begin = torch.zeros((num_rows, pad_begin_len), dtype=sig.dtype, device=sig.device)
            pad_end = torch.zeros((num_rows, pad_end_len), dtype=sig.dtype, device=sig.device)
            sig = torch.cat((pad_begin, sig, pad_end), 1)
        return (sig, sr)

    @staticmethod
    def time_shift(aud, shift_limit):
        """Circularly shift the signal in time by a random fraction of its length.

        The shift is drawn uniformly from ``[0, shift_limit * length)`` samples.
        """
        sig, sr = aud
        _, sig_len = sig.shape
        shift_amt = int(random.random() * shift_limit * sig_len)
        # Roll along the time axis only; without `dims` the tensor is flattened
        # first and samples wrap from one channel into the next.
        return (sig.roll(shift_amt, dims=-1), sr)

    @staticmethod
    def spectro_gram(aud, n_mels=128, n_fft=1024, hop_len=512, expected_time_steps=259):
        """Build a normalised 3-channel feature map: log-mel, delta and delta-delta.

        Args:
            aud: ``(signal, sample_rate)`` tuple; the signal must be mono.
            n_mels, n_fft, hop_len: passed through to ``MelSpectrogram``.
            expected_time_steps: number of time frames in the output. The result
                is cropped or right-padded with zeros to this length. The default
                of 259 is the value the original code hard-coded for 6 s clips at
                22 kHz with ``hop_len=512``.

        Returns:
            Tensor of shape ``[3, n_mels, expected_time_steps]``.

        Raises:
            ValueError: if the signal does not have exactly one channel.
        """
        sig, sr = aud
        if sig.shape[0] != 1:
            raise ValueError(f"Expected mono audio, got {sig.shape[0]} channels")
        top_db = 80
        spec = transforms.MelSpectrogram(sr, n_fft=n_fft, hop_length=hop_len, n_mels=n_mels)(sig)
        spec = transforms.AmplitudeToDB(top_db=top_db)(spec)  # [1, n_mels, frames]
        # First- and second-order temporal differences of the log-mel spectrogram.
        delta = torchaudio.functional.compute_deltas(spec)
        delta_delta = torchaudio.functional.compute_deltas(delta)
        # Drop the singleton audio-channel axis and stack the three feature maps.
        spec = torch.stack(
            [spec.squeeze(0), delta.squeeze(0), delta_delta.squeeze(0)], dim=0
        )  # [3, n_mels, frames]
        # Per-sample normalization; the epsilon guards against a zero std.
        spec = (spec - spec.mean()) / (spec.std() + 1e-6)
        # Ensure a fixed time dimension. Padding happens after normalisation, so
        # the zero fill sits at the normalised mean.
        n_frames = spec.shape[-1]
        if n_frames > expected_time_steps:
            spec = spec[:, :, :expected_time_steps]
        elif n_frames < expected_time_steps:
            spec = torch.nn.functional.pad(spec, (0, expected_time_steps - n_frames))
        return spec

    @staticmethod
    def spectro_augment(spec, max_mask_pct=0.4, n_freq_masks=4, n_time_masks=4):
        """Apply frequency and time masking to a ``[channels, n_mels, n_steps]`` spectrogram.

        ``n_freq_masks`` frequency masks and ``n_time_masks`` time masks are
        applied in sequence. Each mask's maximum width is ``max_mask_pct`` of the
        corresponding axis, and masked cells are filled with the mean of the
        input spectrogram.
        """
        _, n_mels, n_steps = spec.shape
        mask_value = spec.mean()
        aug_spec = spec
        freq_mask_param = max_mask_pct * n_mels
        for _ in range(n_freq_masks):
            aug_spec = transforms.FrequencyMasking(freq_mask_param)(aug_spec, mask_value)
        time_mask_param = max_mask_pct * n_steps
        for _ in range(n_time_masks):
            aug_spec = transforms.TimeMasking(time_mask_param)(aug_spec, mask_value)
        return aug_spec

KeyboardInterrupt: 

: 

In [None]:
import os
import random
import torch
import torchaudio
from torch.utils.data import Dataset

class SoundDS(Dataset):
    """Dataset that maps metadata rows to (spectrogram, class id) pairs.

    Args:
        df: metadata frame with ``road``, ``slice_file_name`` and ``classID`` columns.
        data_path: root folder; each clip is read from
            ``<data_path>/<road>/<slice_file_name>``.
        augment: when True (the default, matching the original behaviour) items
            get random time-shift/noise with probability ``aug_prob`` and are
            always passed through spectrogram masking. Pass False to get
            deterministic, unaugmented features, e.g. for evaluation.
    """

    def __init__(self, df, data_path, augment=True):
        self.df = df
        self.data_path = str(data_path)
        self.duration = 6000  # clip length in milliseconds (6 s)
        self.sr = 22050  # target sample rate in Hz
        self.channel = 1  # mono
        self.shift_pct = 0.4  # upper bound of the random time shift, as a fraction of the clip
        self.aug_prob = 0.5  # probability of applying waveform-level augmentation
        self.augment = augment

    def __len__(self):
        """Number of metadata rows."""
        return len(self.df)

    def __getitem__(self, idx):
        """Load, preprocess and (optionally) augment the clip at position ``idx``.

        Returns:
            ``(spectrogram, class_id)`` where the spectrogram has shape ``(3, 128, 259)``.

        Raises:
            FileNotFoundError: if the audio file for this row does not exist.
            ValueError: if the resulting spectrogram does not have the expected shape.
        """
        # Dataset indices are positional (0..len-1), so use iloc. Label-based
        # .loc raises KeyError as soon as the frame has a non-default index,
        # for example after filtering rows.
        row = self.df.iloc[idx]

        # Construct the full path
        relative_path = str(row['road']) + '/' + str(row['slice_file_name'])
        audio_file = os.path.join(self.data_path, relative_path)

        # Check if the file exists
        if not os.path.exists(audio_file):
            raise FileNotFoundError(f"Audio file not found: {audio_file}")

        # Get the Class ID
        class_id = int(row['classID'])

        aud = AudioUtil.open(audio_file)
        reaud = AudioUtil.resample(aud, self.sr)
        rechan = AudioUtil.rechannel(reaud, self.channel)

        # Pad/truncate first to ensure 6s length
        dur_aud = AudioUtil.pad_trunc(rechan, self.duration)

        # Apply waveform augmentations
        if self.augment and random.random() < self.aug_prob:
            # Time shift
            shift_aud = AudioUtil.time_shift(dur_aud, self.shift_pct)
            # Gaussian noise with a randomly drawn standard deviation
            noise = torch.randn_like(shift_aud[0]) * random.uniform(0.001, 0.015)
            shift_aud = (shift_aud[0] + noise, shift_aud[1])
        else:
            shift_aud = dur_aud

        sgram = AudioUtil.spectro_gram(shift_aud, n_mels=128, n_fft=1024, hop_len=512)
        if self.augment:
            sgram = AudioUtil.spectro_augment(sgram, max_mask_pct=0.4, n_freq_masks=4, n_time_masks=4)

        # Validate shape
        expected_shape = (3, 128, 259)  # [feature_channels, mels, time_steps] for 6s
        if sgram.shape != expected_shape:
            raise ValueError(f"Unexpected spectrogram shape {sgram.shape} at index {idx}, file: {audio_file}. Expected {expected_shape}")

        return sgram, class_id

In [None]:
from torch.utils.data import random_split, DataLoader

def custom_collate_fn(batch):
    """Collate ``(spectrogram, label)`` pairs into two batched tensors.

    Returns the spectrograms stacked along a new leading batch axis and the
    labels as a 1-D ``torch.long`` tensor, in the same order as ``batch``.
    """
    # Transpose the list of pairs into one tuple of features and one of targets.
    specs, targets = zip(*batch)
    batched_specs = torch.stack(specs, dim=0)
    batched_targets = torch.tensor(targets, dtype=torch.long)
    return batched_specs, batched_targets

# `df` must be the full metadata table here: SoundDS reads its `road`,
# `slice_file_name` and `classID` columns.
myds = SoundDS(df, download_path)

# Random split of 80:20 between training and validation.
# The generator is seeded so the split is identical on every run; an unseeded
# split moves different clips into validation each time the cell runs.
# NOTE(review): train_ds and val_ds are views of the same SoundDS instance, so
# whatever augmentation __getitem__ applies is also applied to validation items.
num_items = len(myds)
num_train = round(num_items * 0.8)
num_val = num_items - num_train
train_ds, val_ds = random_split(
    myds, [num_train, num_val], generator=torch.Generator().manual_seed(42)
)

# Create training and validation data loaders with custom collate
train_dl = DataLoader(train_ds, batch_size=16, shuffle=True, collate_fn=custom_collate_fn)
val_dl = DataLoader(val_ds, batch_size=16, shuffle=False, collate_fn=custom_collate_fn)

In [None]:
#import os
#import torchaudio

# Debug dataset shapes and audio lengths
#for i in range(min(10, len(myds))):
#    try:
#        # Get file path
#        road = myds.df.loc[i, 'road']
#        slice_file_name = myds.df.loc[i, 'slice_file_name']
#        relative_path = road + '/' + slice_file_name
#        audio_file = os.path.join(myds.data_path, relative_path)
#        
#        # Load raw audio to check length and channels
#        aud = AudioUtil.open(audio_file)
#        raw_len = aud[0].shape[-1]
#        raw_channels = aud[0].shape[0]
#        raw_duration = raw_len / aud[1]  # Duration in seconds
#       
#        # Process audio
#        reaud = AudioUtil.resample(aud, myds.sr)
#        rechan = AudioUtil.rechannel(reaud, myds.channel)
#        dur_aud = AudioUtil.pad_trunc(rechan, myds.duration)
#       
#        # Compute intermediate spectrograms
#       spec = torchaudio.transforms.MelSpectrogram(myds.sr, n_fft=1024, hop_length=512, n_mels=128)(dur_aud[0])
#        spec = torchaudio.transforms.AmplitudeToDB(top_db=80)(spec)
#       delta = torchaudio.functional.compute_deltas(spec)
#        
#        # Get final spectrogram
#        sgram, class_id = myds[i]
#        
#        print(f"Index {i}: File {audio_file}, Raw length {raw_len} samples ({raw_duration:.2f}s), Raw channels {raw_channels}, Processed channels {dur_aud[0].shape[0]}, Mel shape {spec.shape}, Delta shape {delta.shape}, Final shape {sgram.shape}, Class ID {class_id}")
#    except Exception as e:
#        print(f"Error at index {i}, file {audio_file}: {str(e)}")

In [None]:
import torch
import torch.nn as nn
from torchvision.models import resnet18

# ----------------------------
# Audio Classification Model
# ----------------------------
class AudioClassifier(nn.Module):
    """ResNet18-based classifier for 3-channel spectrogram features.

    The backbone is loaded with pretrained weights and frozen except for
    ``layer4``; the final fully connected layer is replaced by dropout plus a
    linear layer with ``num_classes`` outputs, which is trained from scratch.

    Args:
        num_classes: number of output logits.
    """

    def __init__(self, num_classes=11):
        super().__init__()
        # Load pretrained ResNet18
        self.resnet = resnet18(pretrained=True)
        # The stock first conv layer already takes 3 input channels
        # (mel + delta + delta-delta here), so it is kept as loaded. Replacing it
        # with a fresh nn.Conv2d would discard the pretrained filters, and since
        # the layer is frozen below, the network would run on a random,
        # never-trained stem.

        # Freeze the whole pretrained backbone ...
        for param in self.resnet.parameters():
            param.requires_grad = False
        # ... then unfreeze the last residual stage for fine-tuning.
        for param in self.resnet.layer4.parameters():
            param.requires_grad = True

        # Replace the classification head; freshly created layers are trainable.
        self.resnet.fc = nn.Sequential(
            nn.Dropout(0.3),
            nn.Linear(self.resnet.fc.in_features, num_classes)
        )

    def forward(self, x):
        """Return class logits for a batch ``x`` of 3-channel spectrograms."""
        return self.resnet(x)

# Pick the GPU when one is available, otherwise fall back to the CPU,
# then build the model directly on that device.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
myModel = AudioClassifier().to(device)
# Display the device the parameters ended up on
next(myModel.parameters()).device

In [None]:
import torch
from torchmetrics import Precision, Recall, F1Score, ConfusionMatrix
import numpy as np
from sklearn.metrics import classification_report

def inference(model, data_dl, device, num_classes=11):
    """Evaluate ``model`` on ``data_dl`` and print/return classification metrics.

    Args:
        model: classifier returning logits of shape ``[batch, num_classes]``.
        data_dl: iterable of ``(inputs, labels)`` batches.
        device: device the model lives on; batches are moved there.
        num_classes: number of classes used to size the metrics (default 11).

    Returns:
        dict with ``accuracy``, ``precision``, ``recall``, ``f1_score``
        (macro-averaged over classes), ``confusion_matrix`` (numpy array) and
        ``total_items`` (number of evaluated samples).
    """
    model.eval()
    correct_prediction = 0
    total_prediction = 0

    # Macro averaging: with the wrappers' default (micro) averaging, precision,
    # recall and F1 all collapse to plain accuracy for single-label multiclass
    # data and say nothing about the rare classes.
    precision = Precision(task="multiclass", num_classes=num_classes, average="macro").to(device)
    recall = Recall(task="multiclass", num_classes=num_classes, average="macro").to(device)
    f1 = F1Score(task="multiclass", num_classes=num_classes, average="macro").to(device)
    confmat = ConfusionMatrix(task="multiclass", num_classes=num_classes).to(device)

    all_predictions = []
    all_labels = []

    with torch.no_grad():
        for data in data_dl:
            inputs, labels = data[0].to(device), data[1].to(device)

            # Get predictions: the class with the highest logit
            outputs = model(inputs)
            predictions = outputs.argmax(dim=1)

            # Update metrics
            correct_prediction += (predictions == labels).sum().item()
            total_prediction += predictions.shape[0]

            precision.update(predictions, labels)
            recall.update(predictions, labels)
            f1.update(predictions, labels)
            confmat.update(predictions, labels)

            all_predictions.extend(predictions.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    # Calculate metrics
    acc = correct_prediction / total_prediction
    prec = precision.compute().item()
    rec = recall.compute().item()
    f1_score = f1.compute().item()
    conf_matrix = confmat.compute().cpu().numpy()

    # Print results
    print(f'Accuracy: {acc:.4f}, Total items: {total_prediction}')
    print(f'Precision: {prec:.4f}, Recall: {rec:.4f}, F1-Score: {f1_score:.4f}')
    print('Confusion Matrix:')
    print(conf_matrix)
    print('Classification Report:')
    # zero_division=0 reports 0 (instead of warning) for classes that were never predicted.
    print(classification_report(all_labels, all_predictions, digits=4, zero_division=0))

    # Return results for further analysis
    return {
        'accuracy': acc,
        'precision': prec,
        'recall': rec,
        'f1_score': f1_score,
        'confusion_matrix': conf_matrix,
        'total_items': total_prediction,
    }

In [None]:
import torch
import torch.nn as nn
from sklearn.utils.class_weight import compute_class_weight
import numpy as np

# ----------------------------
# Training Loop
# ----------------------------
def training(model, train_dl, val_dl, num_epochs, patience=20):
    """Train ``model`` and return the best validation accuracy observed.

    Uses class-weighted cross-entropy, SGD with Nesterov momentum and a
    MultiStepLR schedule that divides the learning rate by 10 at epochs 40 and
    80. After every epoch the model is evaluated on ``val_dl``; whenever
    validation accuracy improves, the weights are written to ``best_model.pth``.
    After the last epoch, full metrics are printed via ``inference``.

    Args:
        model: network to train; batches are moved to the device of its parameters.
        train_dl: training data loader yielding ``(inputs, labels)``.
        val_dl: validation data loader yielding ``(inputs, labels)``.
        num_epochs: number of epochs to run.
        patience: currently unused, because the early-stopping break below is
            disabled; kept so existing callers keep working.

    Note:
        Class weights are computed from the notebook-level ``df`` (its
        ``classID`` column), so that frame must be defined before calling.
    """
    # Train on whatever device the model already lives on, rather than relying
    # on a notebook-level `device` variable.
    device = next(model.parameters()).device

    # Compute class weights for balanced loss
    class_weights = compute_class_weight('balanced', classes=np.unique(df['classID']), y=df['classID'])
    class_weights = torch.tensor(class_weights, dtype=torch.float).to(device)

    # Loss Function, Optimizer and Scheduler
    criterion = nn.CrossEntropyLoss(weight=class_weights)
    optimizer = torch.optim.SGD(model.parameters(), lr=0.001, momentum=0.9, weight_decay=1e-4, nesterov=True)
    scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=[40, 80], gamma=0.1)

    best_val_acc = 0
    counter = 0  # epochs since the last validation improvement

    # Repeat for each epoch
    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0
        correct_prediction = 0
        total_prediction = 0

        # Repeat for each batch in the training set
        for i, data in enumerate(train_dl):
            inputs, labels = data[0].to(device), data[1].to(device)

            # Zero the parameter gradients
            optimizer.zero_grad()

            # Forward + backward + optimize
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            # Keep stats for Loss and Accuracy
            running_loss += loss.item()
            _, prediction = torch.max(outputs, 1)
            correct_prediction += (prediction == labels).sum().item()
            total_prediction += prediction.shape[0]

        # Advance the LR schedule once per epoch. Stepping it per batch makes
        # the [40, 80] milestones fire after 40 and 80 batches, so the learning
        # rate is cut 100x within the first few epochs.
        scheduler.step()

        # Print stats at the end of the epoch
        num_batches = len(train_dl)
        avg_loss = running_loss / num_batches
        acc = correct_prediction / total_prediction
        print(f'Epoch: {epoch}, Loss: {avg_loss:.2f}, Training Accuracy: {acc:.2f}')

        # Validation step (accuracy only)
        model.eval()
        correct_prediction = 0
        total_prediction = 0
        with torch.no_grad():
            for data in val_dl:
                inputs, labels = data[0].to(device), data[1].to(device)
                outputs = model(inputs)
                _, prediction = torch.max(outputs, 1)
                correct_prediction += (prediction == labels).sum().item()
                total_prediction += prediction.shape[0]
        val_acc = correct_prediction / total_prediction
        print(f'Validation Accuracy: {val_acc:.2f}')

        # Checkpoint on improvement; the early-stopping break itself is disabled.
        if val_acc > best_val_acc:
            best_val_acc = val_acc
            counter = 0
            torch.save(model.state_dict(), 'best_model.pth')
            print("Saved best model")
        else:
            counter += 1
        #if counter >= patience:
        #    print(f'Early stopping at epoch {epoch}')
        #    break

    # Final validation step with full metrics. This evaluates the weights from
    # the last epoch, which are not necessarily those saved in best_model.pth.
    print('Finished Training')
    final_results = inference(model, val_dl, device)
    # Only read keys that inference() actually returns ('f1_score', not 'f1');
    # the item count comes from the loader's dataset. Looking up missing keys
    # here would raise KeyError after the whole training run has finished.
    total_items = len(val_dl.dataset)
    print(f"Final Validation Metrics - Accuracy: {final_results['accuracy']:.4f}, "
          f"Total items: {total_items}, "
          f"Precision: {final_results['precision']:.4f}, "
          f"Recall: {final_results['recall']:.4f}, "
          f"F1-Score: {final_results['f1_score']:.4f}, "
          f"Confusion Matrix:\n{final_results['confusion_matrix']}")

    return best_val_acc

# Run training for a fixed number of epochs; `training` returns the best
# validation accuracy it observed.
num_epochs = 200
best_acc = training(myModel, train_dl, val_dl, num_epochs)

In [None]:
# Example usage: rebuild the model, load the best checkpoint written during
# training and evaluate it on the validation loader.
if __name__ == "__main__":
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    model = AudioClassifier(num_classes=11).to(device)
    # map_location remaps the stored tensors onto the current device, so a
    # checkpoint saved on a GPU still loads on a CPU-only machine.
    model.load_state_dict(torch.load('best_model.pth', map_location=device))
    results = inference(model, val_dl, device)

In [None]:
# Save the current weights of `myModel` (state_dict only). This is a separate
# file from the 'best_model.pth' checkpoint that the training loop writes.
torch.save(myModel.state_dict(), 'audio_classifier_model.pth')
print("Model saved as 'audio_classifier_model.pth'")