In [1]:
# ----------------------------
# Prepare training data from Metadata file
# ----------------------------
import pandas as pd
from pathlib import Path
download_path  = Path('data\\DATA\\output_chunks')
# Read metadata file
metadata_file = download_path/'metadata.csv'
df = pd.read_csv(metadata_file)
df.head()
# Construct file path by concatenating road and slice_file_name
df['relative_path'] = '/road/' + df['road'].astype(str) + '/' + df['slice_file_name'].astype(str)
# Take relevant columns
df = df[['relative_path', 'classID']]
df.head()

Unnamed: 0,relative_path,classID
0,/road/Around Arya School(1°16_32_ S 36°49_29_ ...,5
1,/road/Around Arya School(1°16_32_ S 36°49_29_ ...,1
2,/road/Around Arya School(1°16_32_ S 36°49_29_ ...,1
3,/road/Around Arya School(1°16_32_ S 36°49_29_ ...,6
4,/road/Around Arya School(1°16_32_ S 36°49_29_ ...,9


In [2]:
import pandas as pd
# Load the CSV file
df = pd.read_csv(metadata_file)
# Get distinct values from the 'class' column
distinct_classes = df['class'].unique()
# Display the result
print(distinct_classes)

['light truck' 'motorcycle' 'medium truck' 'PSV' 'other' 'pickup'
 'private car' 'SUV' 'bus' 'bicycle' 'heavy truck']


In [3]:
import math
import random
import torch
import torchaudio
from torchaudio import transforms
import os
from torch.utils.data import Dataset

# -------------------------------
# Audio Utility Class with MFCC Preprocessing + Normalization
# -------------------------------
class AudioUtil:
    @staticmethod
    def open(audio_file):
        sig, sr = torchaudio.load(audio_file)
        return (sig, sr)

    @staticmethod
    def rechannel(aud, new_channel):
        sig, sr = aud
        if sig.shape[0] == new_channel:
            return aud
        if new_channel == 1:
            resig = sig[:1, :]
        else:
            resig = torch.cat([sig, sig])
        return (resig, sr)

    @staticmethod
    def resample(aud, newsr):
        sig, sr = aud
        if sr == newsr:
            return aud
        num_channels = sig.shape[0]
        resig = torchaudio.transforms.Resample(sr, newsr)(sig[:1, :])
        if num_channels > 1:
            retwo = torchaudio.transforms.Resample(sr, newsr)(sig[1:, :])
            resig = torch.cat([resig, retwo])
        return (resig, newsr)

    @staticmethod
    def pad_trunc(aud, max_ms):
        sig, sr = aud
        num_rows, sig_len = sig.shape
        max_len = sr // 1000 * max_ms
        if sig_len > max_len:
            sig = sig[:, :max_len]
        elif sig_len < max_len:
            pad_begin_len = random.randint(0, max_len - sig_len)
            pad_end_len = max_len - sig_len - pad_begin_len
            pad_begin = torch.zeros((num_rows, pad_begin_len))
            pad_end = torch.zeros((num_rows, pad_end_len))
            sig = torch.cat((pad_begin, sig, pad_end), 1)
        return (sig, sr)

    @staticmethod
    def time_shift(aud, shift_limit):
        sig, sr = aud
        _, sig_len = sig.shape
        shift_amt = int(random.random() * shift_limit * sig_len)
        return (sig.roll(shift_amt), sr)

    @staticmethod
    def mfcc_feature(aud, n_mfcc=40, n_fft=1024, hop_len=512):
        sig, sr = aud
        mfcc = torchaudio.transforms.MFCC(
            sample_rate=sr,
            n_mfcc=n_mfcc,
            melkwargs={"n_fft": n_fft, "hop_length": hop_len, "n_mels": 64}
        )(sig)
        mfcc_db = torchaudio.transforms.AmplitudeToDB(top_db=80)(mfcc)
        # Normalize
        mfcc_db = (mfcc_db - mfcc_db.mean()) / (mfcc_db.std() + 1e-6)
        return mfcc_db

    @staticmethod
    def spectro_augment(spec, max_mask_pct=0.1, n_freq_masks=1, n_time_masks=1):
        _, n_mels, n_steps = spec.shape
        mask_value = spec.mean()
        aug_spec = spec
        freq_mask_param = max_mask_pct * n_mels
        for _ in range(n_freq_masks):
            aug_spec = transforms.FrequencyMasking(freq_mask_param)(aug_spec, mask_value)
        time_mask_param = max_mask_pct * n_steps
        for _ in range(n_time_masks):
            aug_spec = transforms.TimeMasking(time_mask_param)(aug_spec, mask_value)
        return aug_spec




In [4]:
# -------------------------------
# Dataset Class Using MFCC + Augmentation
# -------------------------------
class SoundDS(Dataset):
    def __init__(self, df, data_path, train=True):
        self.df = df
        self.data_path = str(data_path)
        self.duration = 4000
        self.sr = 22050  # Resample to 22.05 kHz for consistency
        self.channel = 1
        self.shift_pct = 0.4 if train else 0.0
        self.train = train

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        road = self.df.loc[idx, 'road']
        slice_file_name = self.df.loc[idx, 'slice_file_name']
        relative_path = road + '/' + slice_file_name
        audio_file = os.path.join(self.data_path, relative_path)

        if not os.path.exists(audio_file):
            raise FileNotFoundError(f"Audio file not found: {audio_file}")

        class_id = self.df.loc[idx, 'classID']

        aud = AudioUtil.open(audio_file)
        #reaud = AudioUtil.resample(aud, self.sr)
        #rechan = AudioUtil.rechannel(reaud, self.channel)
        #dur_aud = AudioUtil.pad_trunc(rechan, self.duration)
        #shift_aud = AudioUtil.time_shift(dur_aud, self.shift_pct) if self.train else dur_aud

        mfcc_feat = AudioUtil.mfcc_feature(aud, n_mfcc=40, n_fft=1024, hop_len=512)
        aug_mfcc = AudioUtil.spectro_augment(mfcc_feat, max_mask_pct=0.1, n_freq_masks=2, n_time_masks=2) if self.train else mfcc_feat

        return aug_mfcc, class_id


In [5]:
from torch.utils.data import random_split
import numpy as np
from torch.utils.data import WeightedRandomSampler


print(df['classID'].value_counts(normalize=True))

myds = SoundDS(df, download_path)

num_items = len(myds)
num_train = round(num_items * 0.8)
num_val = num_items - num_train
train_ds, val_ds = random_split(myds, [num_train, num_val])

# Create subset DataFrames with reindexed rows
train_df = df.iloc[train_ds.indices].reset_index(drop=True)
val_df = df.iloc[val_ds.indices].reset_index(drop=True)

train_myds = SoundDS(train_df, download_path, train=True)
val_myds = SoundDS(val_df, download_path, train=False)

# Weighted sampler for imbalance
weights = 1. / df['classID'].value_counts().sort_index().values
sample_weights = weights[df['classID'].values[train_ds.indices]]
sampler = WeightedRandomSampler(sample_weights, len(sample_weights))

train_dl = torch.utils.data.DataLoader(train_myds, batch_size=16, sampler=sampler)
val_dl = torch.utils.data.DataLoader(val_myds, batch_size=16, shuffle=False)

# Compute global mean/std using training subset
all_sgrams = []
for idx in range(len(train_ds)):  # Use range based on train_ds length
    sgram, _ = train_myds[idx]
    all_sgrams.append(sgram.numpy())
train_sgrams = np.concatenate(all_sgrams, axis=0)
train_mean = train_sgrams.mean()
train_std = train_sgrams.std()
print(f'Global train mean: {train_mean}, std: {train_std}')

classID
2     0.320366
9     0.185355
1     0.166590
5     0.097025
3     0.081922
6     0.059497
4     0.035240
10    0.031121
0     0.019222
7     0.002746
8     0.000915
Name: proportion, dtype: float64
Global train mean: 0.0008759719785302877, std: 0.9158594608306885


In [6]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class ImprovedAudioClassifier(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(1, 32, kernel_size=3, stride=1, padding=1)
        self.bn1 = nn.BatchNorm2d(32)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1)
        self.bn2 = nn.BatchNorm2d(64)
        self.conv3 = nn.Conv2d(64, 128, kernel_size=3, stride=1, padding=1)
        self.bn3 = nn.BatchNorm2d(128)
        self.pool = nn.MaxPool2d(2, 2)
        self.dropout = nn.Dropout(0.5)
        self.fc1 = nn.Linear(128 * 8 * 32, 256)
        self.fc2 = nn.Linear(256, 128)
        self.lin = nn.Linear(128, 11)

    def forward(self, x):
        x = self.pool(F.relu(self.bn1(self.conv1(x))))
        x = self.pool(F.relu(self.bn2(self.conv2(x))))
        x = self.pool(F.relu(self.bn3(self.conv3(x))))
        x = x.view(x.size(0), -1)
        x = self.dropout(x)
        x = F.relu(self.fc1(x))
        x = self.dropout(x)
        x = F.relu(self.fc2(x))
        x = self.lin(x)
        return x
    
import torch.nn as nn
import torch.nn.functional as F

class AudioCNN(nn.Module):
    def __init__(self, num_classes=11):
        super(AudioCNN, self).__init__()
        self.conv1 = nn.Conv2d(1, 32, kernel_size=3, stride=1, padding=1)
        self.bn1 = nn.BatchNorm2d(32)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1)
        self.bn2 = nn.BatchNorm2d(64)
        self.conv3 = nn.Conv2d(64, 128, kernel_size=3, stride=1, padding=1)
        self.bn3 = nn.BatchNorm2d(128)
        self.pool = nn.MaxPool2d(2, 2)
        self.dropout = nn.Dropout(0.2)

        # 🔹 Adaptive pooling ensures consistent shape
        self.gap = nn.AdaptiveAvgPool2d((4, 4))  

        # After GAP, feature size = 128 * 4 * 4 = 2048
        self.fc1 = nn.Linear(128 * 4 * 4, 256)
        self.fc2 = nn.Linear(256, num_classes)

    def forward(self, x):
        x = self.pool(F.relu(self.bn1(self.conv1(x))))
        x = self.pool(F.relu(self.bn2(self.conv2(x))))
        x = self.pool(F.relu(self.bn3(self.conv3(x))))
        x = self.gap(x)                  # fixed-size output
        x = x.view(x.size(0), -1)        # flatten
        x = F.relu(self.fc1(x))
        x = self.dropout(x)
        x = self.fc2(x)
        return x


myModel = AudioCNN()
next(myModel.parameters()).device

device(type='cpu')

In [None]:
def training(model, train_dl, val_dl, num_epochs, device):
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.AdamW(model.parameters(), lr=0.001, weight_decay=1e-4)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.5, patience=5)

    for epoch in range(num_epochs):
        model.train()
        running_loss, correct, total = 0, 0, 0
        for inputs, labels in train_dl:
            inputs, labels = inputs.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()


            running_loss += loss.item() * inputs.size(0)
            _, predicted = outputs.max(1)
            total += labels.size(0)
            correct += predicted.eq(labels).sum().item()


        train_acc = 100. * correct / total
        val_acc = evaluate_model(model, val_dl, device)

        print(f"Epoch {epoch+1}/{num_epochs} | "
              f"Loss: {running_loss:.4f} | "
              f"Train Acc: {train_acc:.2f}% | "
              f"Val Acc: {val_acc:.2f}%")


    
def evaluate_model(model, data_loader, device="cpu"):
    model.eval()
    correct, total = 0, 0
    with torch.no_grad():
        for inputs, labels in data_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            _, predicted = outputs.max(1)
            total += labels.size(0)
            correct += predicted.eq(labels).sum().item()
        return 100. * correct / total



        

    print('Finished Training')

num_epochs=1000
training(myModel, train_dl, val_dl, num_epochs, device="cpu")

Epoch 1/1000 | Loss: 4138.8873 | Train Acc: 16.53% | Val Acc: 3.20%
Epoch 2/1000 | Loss: 3853.0577 | Train Acc: 24.60% | Val Acc: 3.66%
Epoch 3/1000 | Loss: 3546.6215 | Train Acc: 29.98% | Val Acc: 1.83%
Epoch 4/1000 | Loss: 3303.9245 | Train Acc: 35.30% | Val Acc: 15.10%
Epoch 5/1000 | Loss: 3111.7862 | Train Acc: 40.50% | Val Acc: 18.31%
Epoch 6/1000 | Loss: 3010.0648 | Train Acc: 40.56% | Val Acc: 13.27%
Epoch 7/1000 | Loss: 2817.7045 | Train Acc: 44.39% | Val Acc: 25.17%
Epoch 8/1000 | Loss: 2818.5735 | Train Acc: 43.31% | Val Acc: 20.14%
Epoch 9/1000 | Loss: 2755.5005 | Train Acc: 44.45% | Val Acc: 18.99%
Epoch 10/1000 | Loss: 2669.4436 | Train Acc: 44.68% | Val Acc: 23.57%
Epoch 11/1000 | Loss: 2667.3370 | Train Acc: 45.25% | Val Acc: 20.37%
Epoch 12/1000 | Loss: 2604.2112 | Train Acc: 47.48% | Val Acc: 11.21%
Epoch 13/1000 | Loss: 2640.2897 | Train Acc: 45.31% | Val Acc: 16.25%
Epoch 14/1000 | Loss: 2571.6919 | Train Acc: 45.37% | Val Acc: 15.56%
Epoch 15/1000 | Loss: 2521.9136 

MemoryError: Unable to allocate 1.01 MiB for an array with shape (264600, 1) and data type float32

In [None]:
# Save the model outside the training function
torch.save(myModel.state_dict(), 'audio_classifier_model2.pth')
print("Model saved as 'audio_classifier_model.pth'")

In [None]:
def inference(model, data_loader, train_mean, train_std):
    """
    CPU-optimized inference function
    """
    model.eval()
    correct_prediction = 0
    total_prediction = 0
    
    with torch.no_grad():
        for inputs, labels in data_loader:
            # Normalize inputs
            inputs = (inputs - train_mean) / train_std
            outputs = model(inputs)
            
            _, prediction = torch.max(outputs, 1)
            correct_prediction += (prediction == labels).sum().item()
            total_prediction += prediction.shape[0]
    
    return correct_prediction / total_prediction