## Dataset Information

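This dataset contains 8732 labeled sound excerpts (<= 4 s) of urban sounds from 10 classes:
<ul>
<li>air_conditioner</li>
<li>car_horn</li>
<li>children_playing</li>
<li>dog_bark</li>
<li>drilling</li>
<li>engine_idling</li>
<li>gun_shot</li>
<li>jackhammer</li>
<li>siren</li>
<li>street_music</li>
</ul>

**Note:** the code below reads a custom `metadata.csv` keyed by `road` and trains an 11-class classifier, so the description above may not match the data actually used (TODO: confirm and update).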

## Import modules

In [None]:
import pandas as pd
import numpy as np
import torch
import torchaudio
from torchaudio import transforms
from torch.utils.data import Dataset, TensorDataset, DataLoader, random_split
from torch import nn
import torch.nn.functional as F
from torch.nn import init
from pathlib import Path
import os
import random
from IPython.display import Audio
import warnings
warnings.filterwarnings('ignore')
import librosa

## Loading the dataset

In [None]:
# Root folder holding one sub-folder of audio chunks per road, plus the metadata CSV.
download_path = Path('data/DATA/output_chunks')
metadata_file = download_path / 'metadata.csv'

# Read metadata file
df = pd.read_csv(metadata_file)

# Build a per-clip path string of the form '/road/<road>/<slice_file_name>'.
# NOTE(review): parser() joins data_path/<road>/<slice_file_name> without the
# literal '/road/' prefix, so this column may not point at real files - confirm.
df['relative_path'] = '/road/' + df['road'].astype(str) + '/' + df['slice_file_name'].astype(str)

# Take relevant columns
df = df[['relative_path', 'classID', 'road', 'slice_file_name']]

# Display distinct classes to verify
distinct_classes = df['classID'].unique()
print('Distinct classes:', distinct_classes)

# Preview the first rows. This has to be the last expression of the cell:
# a bare `df.head()` in the middle of a cell is evaluated and then discarded,
# so nothing was shown before.
df.head()

## Exploratory Data Analysis

In [None]:
# Build the example path with pathlib instead of a backslash-separated literal.
# The old string contained invalid escape sequences (\D, \o, \W) and could only
# resolve on Windows; joining the parts works on every OS.
example_road = 'Winners Chapel(Likoni Rd) (1°19_20_S 36°50_55_E)'
example_wav = Path('data/DATA/output_chunks') / example_road / (example_road + '--1-0.wav')
data, sampling_rate = librosa.load(str(example_wav))

In [None]:
# Show the first value returned by the librosa.load call above.
data

In [None]:
# Show the second value returned by the librosa.load call above.
sampling_rate

## Input Split

In [None]:
import os
import torch
import torchaudio
from torchaudio import transforms
import numpy as np

def parser(row, data_path='data/DATA/output_chunks', sr=44100, duration=6000, n_mels=64, n_fft=1024, hop_len=None):
    """Turn one metadata row into a fixed-size mel-spectrogram feature vector.

    The clip is loaded, resampled to ``sr`` when needed, forced to two
    channels, centre-padded or truncated to ``duration`` milliseconds,
    converted to a dB-scaled mel spectrogram and averaged over the time axis.

    Args:
        row: Metadata record providing ``'road'``, ``'slice_file_name'`` and
            ``'classID'``.
        data_path: Directory that contains one sub-folder per road.
        sr: Target sample rate in Hz.
        duration: Target clip length in milliseconds.
        n_mels: Number of mel bands.
        n_fft: FFT size handed to ``MelSpectrogram``.
        hop_len: Hop length handed to ``MelSpectrogram`` (``None`` is passed
            through unchanged).

    Returns:
        list: ``[feature, label]`` where ``feature`` is a flattened NumPy array
        of per-channel, per-mel-band time averages (``2 * n_mels`` values) and
        ``label`` is ``row['classID']``.
    """
    # Construct file path using 'road' and 'slice_file_name'
    file_name = os.path.join(data_path, row['road'], row['slice_file_name'])

    # Load the audio file
    sig, sample_rate = torchaudio.load(file_name)

    # Resample to target sample rate
    if sample_rate != sr:
        sig = torchaudio.transforms.Resample(sample_rate, sr)(sig)

    # Force exactly two channels: duplicate mono, drop anything past the second.
    if sig.shape[0] == 1:
        sig = torch.cat([sig, sig])
    elif sig.shape[0] > 2:
        sig = sig[:2, :]

    # Target length in samples. Multiply before dividing: the previous
    # `sr // 1000 * duration` truncated sr to whole kHz first (44100 -> 44),
    # so 6000 ms came out as 264000 samples instead of 264600.
    max_len = sr * duration // 1000
    num_rows, sig_len = sig.shape
    if sig_len > max_len:
        sig = sig[:, :max_len]
    elif sig_len < max_len:
        # Split the missing samples as evenly as possible between both ends.
        pad_begin_len = (max_len - sig_len) // 2
        pad_end_len = max_len - sig_len - pad_begin_len
        pad_begin = torch.zeros((num_rows, pad_begin_len))
        pad_end = torch.zeros((num_rows, pad_end_len))
        sig = torch.cat((pad_begin, sig, pad_end), 1)

    # Generate Mel spectrogram and convert it to decibels
    spec = transforms.MelSpectrogram(sr, n_fft=n_fft, hop_length=hop_len, n_mels=n_mels)(sig)
    spec = transforms.AmplitudeToDB(top_db=80)(spec)

    # Average over the time axis, then flatten [channels, n_mels] to 1-D
    feature = torch.mean(spec, dim=2).numpy().flatten()

    # Get the label
    label = row['classID']

    return [feature, label]

In [None]:
# Run the parser on every metadata row. Because parser() returns a plain list,
# DataFrame.apply(axis=1) yields a Series whose elements are [feature, label]
# pairs rather than a two-column DataFrame, so there are no column labels to
# assign (the former `data.columns = ...` only set a stray attribute).
data = df.apply(parser, axis=1)

In [None]:
# Inspect the first [feature, label] pair produced by parser.
data[0]

In [None]:
# Separate the parsed rows into a feature matrix X and a label vector y.
# zip(*data) transposes the [feature, label] pairs into (features, labels).
unzipped = list(zip(*data))
X = np.array(unzipped[0])
y = np.array(unzipped[1])

## Label encoder

In [None]:
import torch

# classID is presumably already integer-coded (0 to 10), so no LabelEncoder is
# applied - TODO confirm against metadata.csv.
# This rebinds y to a LongTensor taken straight from the DataFrame column.
class_ids = df['classID'].values
y = torch.tensor(class_ids, dtype=torch.long)

In [None]:
# Check the size of the label tensor built from df['classID'].
y.shape

In [None]:
# Peek at the first label.
y[0]

In [None]:
# Custom Dataset class to replace TensorDataset
class FeatureDataset(Dataset):
    """Map-style dataset that pairs each feature row with its label.

    Attributes:
        features: Indexable collection of per-sample features (annotated in
            the original as a tensor of shape ``[n_samples, 128]``).
        labels: Indexable collection of per-sample labels (annotated in the
            original as a tensor of shape ``[n_samples]``).
    """

    def __init__(self, features, labels):
        self.labels = labels
        self.features = features

    def __len__(self):
        # The dataset size is taken from the feature container alone.
        n_samples = len(self.features)
        return n_samples

    def __getitem__(self, idx):
        sample, target = self.features[idx], self.labels[idx]
        return sample, target

## Model Training

In [None]:
import torch
import torch.nn as nn

class DenseClassifier(nn.Module):
    """Fully connected classifier over flattened mel-band features.

    Three hidden layers (256 -> 512 -> 256), each followed by ReLU and
    dropout, feed a final linear layer that emits one raw logit per class.

    Args:
        input_size: Length of each input feature vector.
        num_classes: Number of output logits. Defaults to 11, the value that
            used to be hard-coded.
        dropout: Dropout probability after each hidden activation. Defaults
            to 0.3, the value that used to be hard-coded.
    """

    def __init__(self, input_size=128, num_classes=11, dropout=0.3):
        super().__init__()
        # Layer order is unchanged, so state_dict keys (model.0.*, model.3.*,
        # ...) remain compatible with checkpoints saved by the old definition.
        self.model = nn.Sequential(
            nn.Linear(input_size, 256),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(256, 512),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(512, 256),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(256, num_classes),
        )

    def forward(self, x):
        """Return raw class logits (``num_classes`` values along the last dim)."""
        return self.model(x)

# Pick the compute device first: the first CUDA GPU when available, else the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

# Build the network for 128-dimensional inputs (2 channels * 64 mel bands from
# parser) and move it onto the chosen device.
input_size = 128
model = DenseClassifier(input_size=input_size).to(device)

# Cross-entropy loss on the raw logits, optimised with Adam.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [None]:
# Training loop
def training(model, train_dl, val_dl, num_epochs=100):
    """Train ``model`` and print per-epoch loss/accuracy for both loaders.

    Relies on the notebook-level globals ``criterion``, ``optimizer`` and
    ``device`` being defined before the call.

    Args:
        model: Network to optimise in place.
        train_dl: Sized iterable of ``(inputs, labels)`` batches for training.
        val_dl: Sized iterable of ``(inputs, labels)`` batches for validation.
        num_epochs: Number of passes over ``train_dl``.

    Raises:
        ValueError: If a loader yields no samples (previously this surfaced
            as a bare ``ZeroDivisionError`` when averaging the metrics).
    """

    def forward_batch(inputs, labels):
        """Run one batch through the model; return (loss, n_correct, n_samples)."""
        inputs, labels = inputs.to(device), labels.to(device)
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        _, predicted = torch.max(outputs, 1)
        return loss, (predicted == labels).sum().item(), labels.size(0)

    for epoch in range(num_epochs):
        # Training phase
        model.train()
        running_loss, train_correct, train_total = 0.0, 0, 0
        for inputs, labels in train_dl:
            optimizer.zero_grad()
            loss, hits, count = forward_batch(inputs, labels)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
            train_correct += hits
            train_total += count
        if train_total == 0:
            raise ValueError('train_dl produced no samples; cannot compute training metrics')

        # Loss is averaged per batch, accuracy per sample.
        train_loss = running_loss / len(train_dl)
        train_acc = train_correct / train_total

        # Validation phase (no gradient tracking, dropout disabled)
        model.eval()
        val_loss, val_correct, val_total = 0.0, 0, 0
        with torch.no_grad():
            for inputs, labels in val_dl:
                loss, hits, count = forward_batch(inputs, labels)
                val_loss += loss.item()
                val_correct += hits
                val_total += count
        if val_total == 0:
            raise ValueError('val_dl produced no samples; cannot compute validation metrics')

        val_loss = val_loss / len(val_dl)
        val_acc = val_correct / val_total

        print(f'Epoch: {epoch+1}, Train Loss: {train_loss:.4f}, Train Accuracy: {train_acc:.4f}, '
              f'Val Loss: {val_loss:.4f}, Val Accuracy: {val_acc:.4f}')

    print('Finished Training')

# Build the data loaders this cell needs. train_dl / val_dl used to be created
# only in a later cell, so a fresh "Restart & Run All" stopped here with a
# NameError. The 75/25 split and batch size of 32 mirror that later cell.
features = torch.tensor(X, dtype=torch.float32)
dataset = FeatureDataset(features, y)
num_train = round(len(dataset) * 0.75)
num_val = len(dataset) - num_train
# A seeded generator keeps train/validation membership stable across runs.
train_ds, val_ds = random_split(dataset, [num_train, num_val],
                                generator=torch.Generator().manual_seed(42))
train_dl = DataLoader(train_ds, batch_size=32, shuffle=True)
val_dl = DataLoader(val_ds, batch_size=32, shuffle=False)

# Train the model
training(model, train_dl, val_dl)

# Save the model
torch.save(model.state_dict(), 'dense_classifier_model.pth')
print("Model saved as 'dense_classifier_model.pth'")

In [3]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split
import pandas as pd
from pathlib import Path
import numpy as np
import torchaudio
from torchaudio import transforms
import os

# Parser function
def parser(row, data_path='data/DATA/output_chunks', sr=44100, duration=6000, n_mels=64, n_fft=1024, hop_len=256):
    """Extract a time-averaged mel-spectrogram feature vector for one clip.

    Args:
        row: Metadata record providing ``'road'``, ``'slice_file_name'`` and
            ``'classID'``.
        data_path: Directory that contains one sub-folder per road.
        sr: Target sample rate in Hz; clips at another rate are resampled.
        duration: Fixed clip length in milliseconds (centre-pad or truncate).
        n_mels: Number of mel bands.
        n_fft: FFT size handed to ``MelSpectrogram``.
        hop_len: Hop length handed to ``MelSpectrogram``.

    Returns:
        list: ``[feature, label]`` where ``feature`` is a flattened NumPy array
        of per-channel, per-mel-band time averages (``2 * n_mels`` values) and
        ``label`` is ``row['classID']``.
    """
    file_name = os.path.join(data_path, row['road'], row['slice_file_name'])
    sig, sample_rate = torchaudio.load(file_name)
    if sample_rate != sr:
        sig = torchaudio.transforms.Resample(sample_rate, sr)(sig)

    # Force exactly two channels: duplicate mono, keep only the first two otherwise.
    if sig.shape[0] == 1:
        sig = torch.cat([sig, sig])
    elif sig.shape[0] > 2:
        sig = sig[:2, :]

    # Multiply before dividing: `sr // 1000 * duration` truncated sr to whole
    # kHz first (44100 -> 44), yielding 264000 samples for 6000 ms instead of
    # the correct 264600.
    max_len = sr * duration // 1000
    num_rows, sig_len = sig.shape
    if sig_len > max_len:
        sig = sig[:, :max_len]
    elif sig_len < max_len:
        # Zero-pad both ends so the audio sits in the middle of the window.
        pad_begin_len = (max_len - sig_len) // 2
        pad_end_len = max_len - sig_len - pad_begin_len
        pad_begin = torch.zeros((num_rows, pad_begin_len))
        pad_end = torch.zeros((num_rows, pad_end_len))
        sig = torch.cat((pad_begin, sig, pad_end), 1)

    spec = transforms.MelSpectrogram(sr, n_fft=n_fft, hop_length=hop_len, n_mels=n_mels)(sig)
    spec = transforms.AmplitudeToDB(top_db=80)(spec)
    # Collapse the time axis, then flatten [channels, n_mels] into one vector.
    feature = torch.mean(spec, dim=2).flatten().numpy()
    label = row['classID']
    return [feature, label]

# Custom Dataset class
class FeatureDataset(Dataset):
    """Dataset wrapper returning ``(features[idx], labels[idx])`` pairs.

    Attributes:
        features: Per-sample features (annotated in the original as a tensor
            of shape ``[n_samples, 128]``).
        labels: Per-sample labels (annotated in the original as a tensor of
            shape ``[n_samples]``).
    """

    def __init__(self, features, labels):
        self.labels = labels
        self.features = features

    def __len__(self):
        # Length is defined by the feature container.
        sample_count = len(self.features)
        return sample_count

    def __getitem__(self, idx):
        feature_row = self.features[idx]
        target = self.labels[idx]
        return feature_row, target

# Load metadata
download_path = Path('data/DATA/output_chunks')
metadata_file = download_path / 'metadata.csv'
df = pd.read_csv(metadata_file)

# Map class names to classIDs if necessary
class_to_id = {
    'bicycle': 0, 'motorcycle': 1, 'private car': 2, 'SUV': 3, 'pickup': 4,
    'light truck': 5, 'medium truck': 6, 'heavy truck': 7, 'bus': 8, 'PSV': 9, 'other': 10
}
if df['classID'].dtype == object or df['classID'].isna().any():
    mapped_ids = df['class'].map(class_to_id)
    # Names missing from class_to_id map to NaN. Stop here with the offending
    # names instead of letting NaN labels flow into the label tensor.
    unknown_classes = df.loc[mapped_ids.isna(), 'class'].unique()
    if len(unknown_classes) > 0:
        raise ValueError(
            f'Unmapped class names in {metadata_file}: {list(unknown_classes)}'
        )
    df['classID'] = mapped_ids.astype('int64')
    # Keep a copy of the metadata with the regenerated classID column.
    df.to_csv(download_path / 'metadata_updated.csv', index=False)

# Apply parser to extract features and labels
data = df.apply(parser, axis=1)
# Stack the per-row vectors into one float32 array before handing them to
# torch: building a tensor straight from a Python list of ndarrays converts
# element by element and is very slow on large datasets.
X = torch.tensor(np.array([x[0] for x in data], dtype=np.float32), dtype=torch.float32)
y = torch.tensor([x[1] for x in data], dtype=torch.long)

# Create dataset and data loaders
dataset = FeatureDataset(X, y)
num_items = len(dataset)
num_train = round(num_items * 0.75)  # 75% training to match validation_split=0.25
num_val = num_items - num_train
# Seed the split so the same clips land in train/validation on every run;
# an unseeded random_split reshuffles membership each time the cell executes.
SPLIT_SEED = 42
split_rng = torch.Generator().manual_seed(SPLIT_SEED)
train_ds, val_ds = random_split(dataset, [num_train, num_val], generator=split_rng)
train_dl = DataLoader(train_ds, batch_size=32, shuffle=True)
val_dl = DataLoader(val_ds, batch_size=32, shuffle=False)

# Define DenseClassifier
class DenseClassifier(nn.Module):
    """Multi-layer perceptron mapping a feature vector to class logits.

    The stack is Linear(input_size, 256) -> Linear(256, 512) ->
    Linear(512, 256) -> Linear(256, num_classes), with ReLU and dropout after
    each of the first three linear layers.

    Args:
        input_size: Length of each input feature vector.
        num_classes: Number of output logits. Defaults to 11, the value that
            used to be hard-coded.
        dropout: Dropout probability after each hidden activation. Defaults
            to 0.3, the value that used to be hard-coded.
    """

    def __init__(self, input_size=128, num_classes=11, dropout=0.3):
        super().__init__()
        # Same layer order as before, so saved state_dict keys still match.
        self.model = nn.Sequential(
            nn.Linear(input_size, 256),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(256, 512),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(512, 256),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(256, num_classes),
        )

    def forward(self, x):
        """Return raw class logits (``num_classes`` values along the last dim)."""
        return self.model(x)

# Pick the compute device first: the first CUDA GPU when available, else the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

# Build the network for 128-dimensional inputs (2 channels * 64 mel bands from
# parser) and move it onto the chosen device.
input_size = 128
model = DenseClassifier(input_size=input_size).to(device)

# Define loss function, optimizer, and scheduler
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
# NOTE(review): the schedule is sized for 1000 epochs of len(train_dl) steps,
# and training() steps the scheduler once per batch, so this value presumably
# has to stay in sync with the num_epochs passed to training() - confirm.
scheduler = torch.optim.lr_scheduler.OneCycleLR(
    optimizer,
    max_lr=0.001,
    steps_per_epoch=len(train_dl),
    epochs=1000,
    anneal_strategy='linear',
)

# Training loop
def training(model, train_dl, val_dl, num_epochs=1000):  # Set to 1000 epochs
    """Train ``model`` with per-batch input standardisation and LR scheduling.

    Relies on the notebook-level globals ``criterion``, ``optimizer``,
    ``scheduler`` and ``device`` being defined before the call. The scheduler
    is stepped once per training batch.

    Args:
        model: Network to optimise in place.
        train_dl: Sized iterable of ``(inputs, labels)`` batches for training.
        val_dl: Sized iterable of ``(inputs, labels)`` batches for validation.
        num_epochs: Number of passes over ``train_dl``.

    Raises:
        ValueError: If a loader yields no samples (previously this surfaced
            as a bare ``ZeroDivisionError`` when averaging the metrics).
    """
    # Lower bound for the batch standard deviation. A batch whose values are
    # all identical has std == 0, and dividing by it turned the inputs (and
    # then the loss and weights) into NaN.
    eps = 1e-8

    def forward_batch(inputs, labels):
        """Standardise one batch, run it; return (loss, n_correct, n_samples)."""
        inputs, labels = inputs.to(device), labels.to(device)
        # Normalize inputs with the statistics of the current batch.
        # NOTE(review): validation batches are standardised with their own
        # statistics as well, so results depend on batch composition.
        inputs_m, inputs_s = inputs.mean(), inputs.std()
        inputs = (inputs - inputs_m) / inputs_s.clamp_min(eps)
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        _, predicted = torch.max(outputs, 1)
        return loss, (predicted == labels).sum().item(), labels.size(0)

    for epoch in range(num_epochs):
        # Training phase
        model.train()
        running_loss, train_correct, train_total = 0.0, 0, 0
        for inputs, labels in train_dl:
            optimizer.zero_grad()
            loss, hits, count = forward_batch(inputs, labels)
            loss.backward()
            optimizer.step()
            scheduler.step()
            running_loss += loss.item()
            train_correct += hits
            train_total += count
        if train_total == 0:
            raise ValueError('train_dl produced no samples; cannot compute training metrics')

        # Loss is averaged per batch, accuracy per sample.
        train_loss = running_loss / len(train_dl)
        train_acc = train_correct / train_total

        # Validation phase (no gradient tracking, dropout disabled)
        model.eval()
        val_loss, val_correct, val_total = 0.0, 0, 0
        with torch.no_grad():
            for inputs, labels in val_dl:
                loss, hits, count = forward_batch(inputs, labels)
                val_loss += loss.item()
                val_correct += hits
                val_total += count
        if val_total == 0:
            raise ValueError('val_dl produced no samples; cannot compute validation metrics')

        val_loss = val_loss / len(val_dl)
        val_acc = val_correct / val_total

        print(f'Epoch: {epoch+1}, Train Loss: {train_loss:.4f}, Train Accuracy: {train_acc:.4f}, '
              f'Val Loss: {val_loss:.4f}, Val Accuracy: {val_acc:.4f}')

    print('Finished Training')

# Run training with the function's default number of epochs.
training(model, train_dl, val_dl)

# Persist only the learned parameters (the state_dict), then confirm on stdout.
trained_weights = model.state_dict()
torch.save(trained_weights, 'dense_classifier_model2.pth')
print("Model saved as 'dense_classifier_model2.pth'")

Epoch: 1, Train Loss: 2.3460, Train Accuracy: 0.1995, Val Loss: 2.2589, Val Accuracy: 0.3260
Epoch: 2, Train Loss: 2.1340, Train Accuracy: 0.3185, Val Loss: 2.0076, Val Accuracy: 0.3260
Epoch: 3, Train Loss: 1.9436, Train Accuracy: 0.3179, Val Loss: 1.9255, Val Accuracy: 0.3260
Epoch: 4, Train Loss: 1.9116, Train Accuracy: 0.3191, Val Loss: 1.9090, Val Accuracy: 0.3260
Epoch: 5, Train Loss: 1.8983, Train Accuracy: 0.3221, Val Loss: 1.8948, Val Accuracy: 0.3260
Epoch: 6, Train Loss: 1.8937, Train Accuracy: 0.3197, Val Loss: 1.8849, Val Accuracy: 0.3242
Epoch: 7, Train Loss: 1.8821, Train Accuracy: 0.3240, Val Loss: 1.8718, Val Accuracy: 0.3315
Epoch: 8, Train Loss: 1.8807, Train Accuracy: 0.3240, Val Loss: 1.8620, Val Accuracy: 0.3352
Epoch: 9, Train Loss: 1.8747, Train Accuracy: 0.3356, Val Loss: 1.8478, Val Accuracy: 0.3352
Epoch: 10, Train Loss: 1.8501, Train Accuracy: 0.3441, Val Loss: 1.8360, Val Accuracy: 0.3370
Epoch: 11, Train Loss: 1.8432, Train Accuracy: 0.3368, Val Loss: 1.82