In [1]:

import pandas as pd
from pathlib import Path
download_path  = Path('data\\DATA\\output_chunks')

metadata_file = download_path/'metadata.csv'
df = pd.read_csv(metadata_file)
df.head()

df['relative_path'] = '/road/' + df['road'].astype(str) + '/' + df['slice_file_name'].astype(str)

df = df[['relative_path', 'classID']]
df.head()

Unnamed: 0,relative_path,classID
0,/road/Around Arya School(1°16_32_ S 36°49_29_ ...,5
1,/road/Around Arya School(1°16_32_ S 36°49_29_ ...,1
2,/road/Around Arya School(1°16_32_ S 36°49_29_ ...,1
3,/road/Around Arya School(1°16_32_ S 36°49_29_ ...,6
4,/road/Around Arya School(1°16_32_ S 36°49_29_ ...,9


In [2]:
import pandas as pd

# Load the CSV file
df = pd.read_csv(metadata_file)

# Get distinct values from the 'class' column
distinct_classes = df['class'].unique()


# Display the result
print(distinct_classes)

['light truck' 'motorcycle' 'medium truck' 'PSV' 'other' 'pickup'
 'private car' 'SUV' 'bus' 'bicycle' 'heavy truck']


In [3]:
import torch
import torchaudio
from torchaudio import transforms
import random
import albumentations as A
from albumentations.pytorch import ToTensorV2

class AudioUtil:
    @staticmethod
    def open(audio_file):
        sig, sr = torchaudio.load(audio_file)
        return (sig, sr)

    @staticmethod
    def resample(aud, newsr):
        sig, sr = aud
        if (sr == newsr):
            return aud
        num_channels = sig.shape[0]
        resig = torchaudio.transforms.Resample(sr, newsr)(sig[:1,:])
        if (num_channels > 1):
            retwo = torchaudio.transforms.Resample(sr, newsr)(sig[1:,:])
            resig = torch.cat([resig, retwo])
        return ((resig, newsr))

    @staticmethod
    def rechannel(aud, new_channel):
        sig, sr = aud
        if (sig.shape[0] == new_channel):
            return aud
        if (new_channel == 1):
            resig = sig[:1, :]
        else:
            resig = torch.cat([sig, sig])
        return ((resig, sr))

    @staticmethod
    def pad_trunc(aud, max_ms):
        sig, sr = aud
        num_rows, sig_len = sig.shape
        max_len = sr//1000 * max_ms
        if (sig_len > max_len):
            sig = sig[:,:max_len]
        elif (sig_len < max_len):
            pad_begin_len = random.randint(0, max_len - sig_len)
            pad_end_len = max_len - sig_len - pad_begin_len
            pad_begin = torch.zeros((num_rows, pad_begin_len))
            pad_end = torch.zeros((num_rows, pad_end_len))
            sig = torch.cat((pad_begin, sig, pad_end), 1)
        return (sig, sr)

    @staticmethod
    def time_shift(aud, shift_limit):
        sig, sr = aud
        _, sig_len = sig.shape
        shift_amt = int(random.random() * shift_limit * sig_len)
        return (sig.roll(shift_amt), sr)

    @staticmethod
    def mel_spectrogram_feature(aud, n_mels=32, n_fft=2822, hop_length=1103, f_max=8000):
        sig, sr = aud
        mel_spec = torchaudio.transforms.MelSpectrogram(
            sample_rate=sr,
            n_fft=n_fft,
            hop_length=hop_length,
            n_mels=n_mels,
            f_max=f_max,
            window_fn=torch.hann_window
        )(sig)
        log_mel = transforms.AmplitudeToDB(top_db=80)(mel_spec)  # Log scale as in the paper
        return log_mel  # Shape: [1, 64, time_steps]

    @staticmethod
    def spectro_augment(spec, max_mask_pct=0.1, n_freq_masks=2, n_time_masks=2):
        _, n_mfcc, n_steps = spec.shape
        mask_value = spec.mean()
        aug_spec = spec
        freq_mask_param = max_mask_pct * n_mfcc
        for _ in range(n_freq_masks):
            aug_spec = transforms.FrequencyMasking(freq_mask_param)(aug_spec, mask_value)
        time_mask_param = max_mask_pct * n_steps
        for _ in range(n_time_masks):
            aug_spec = transforms.TimeMasking(time_mask_param)(aug_spec, mask_value)
        return aug_spec
    


    @staticmethod
    def image_augment(spec):
        spec_np = spec.squeeze(0).numpy()  # To [40, 512]
        transform = A.Compose([
            A.ShiftScaleRotate(shift_limit=0.1, scale_limit=0.1, rotate_limit=30, p=0.5),
            A.GridDistortion(p=0.5),
            A.Cutout(num_holes=8, max_h_size=8, max_w_size=8, p=0.5),
        ])
        augmented = transform(image=spec_np)['image']
        aug_spec = torch.from_numpy(augmented).unsqueeze(0)  # Back to [1, 40, 512]
        return aug_spec

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
import os
from torch.utils.data import Dataset
import torchaudio

class SoundDS(Dataset):
    def __init__(self, df, data_path, train=True):
        self.df = df
        self.data_path = str(data_path)
        self.duration = 4000  # 4 seconds
        self.sr = 44100      # Match paper
        self.channel = 1
        self.shift_pct = 0.4 if train else 0.0
        self.train = train
        self.target_time_steps = 512  # Adjust based on new hop_length if needed
        self.hop_length = 1103       # Match paper

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        # Use 'relative_path' and remove the '/road/' prefix to match the directory structure
        relative_path = self.df.loc[idx, 'relative_path']
        adjusted_path = relative_path[6:]  # Remove '/road/' (length 6)
        audio_file = os.path.join(self.data_path, adjusted_path)

        # Check if the file exists
        if not os.path.exists(audio_file):
            raise FileNotFoundError(f"Audio file not found: {audio_file}")

        # Get the Class ID
        class_id = self.df.loc[idx, 'classID']

        # Load and process audio
        aud = AudioUtil.open(audio_file)
        reaud = AudioUtil.resample(aud, self.sr)  # Resample to 44100 Hz
        rechan = AudioUtil.rechannel(reaud, self.channel)  # Force to 1 channel
        dur_aud = AudioUtil.pad_trunc(rechan, self.duration)  # Pad or truncate to 4000 ms
        shift_aud = AudioUtil.time_shift(dur_aud, self.shift_pct) if self.train else dur_aud

        # Extract mel-spectrogram features with n_mels=32
        mel_feat = AudioUtil.mel_spectrogram_feature(shift_aud, n_mels=32, n_fft=2822, hop_length=self.hop_length, f_max=8000)
        # Ensure time dimension is exactly 512
        _, n_mels, n_steps = mel_feat.shape
        if n_steps > self.target_time_steps:
            mel_feat = mel_feat[:, :, :self.target_time_steps]  # Truncate to 512
        elif n_steps < self.target_time_steps:
            pad_size = self.target_time_steps - n_steps
            pad = torch.zeros((1, n_mels, pad_size))  # 1 channel
            mel_feat = torch.cat((mel_feat, pad), dim=2)  # Pad with zeros

        aug_mel = AudioUtil.spectro_augment(mel_feat, max_mask_pct=0.1, n_freq_masks=2, n_time_masks=2) if self.train else mel_feat

        return aug_mel, class_id

In [5]:
import torch
import torch.nn as nn

class UrbanSoundModel(nn.Module):
    def __init__(self, num_classes):
        super(UrbanSoundModel, self).__init__()
        
        # Global Feature Extractor (reduced layers)
        self.global_conv_blocks = nn.Sequential(
            nn.Conv2d(1, 32, kernel_size=(3, 3), padding=1),
            nn.GroupNorm(4, 32),
            nn.ReLU(),
            nn.MaxPool2d((2, 2)),
            nn.Conv2d(32, 64, kernel_size=(3, 3), padding=1),
            nn.GroupNorm(8, 64),
            nn.ReLU(),
            nn.MaxPool2d((2, 2)),
            nn.Conv2d(64, 128, kernel_size=(3, 3), padding=1),
            nn.GroupNorm(16, 128),
            nn.ReLU(),
            nn.MaxPool2d((2, 2))
        )
        self.global_rnn = nn.GRU(32, 128, bidirectional=True, batch_first=True)  # Updated input_size to 32

        # Specific Feature Extractor (reduced layers)
        self.specific_conv_blocks = nn.Sequential(
            nn.Conv2d(1, 32, kernel_size=(3, 3), padding=1),
            nn.GroupNorm(4, 32),
            nn.ReLU(),
            nn.MaxPool2d((2, 2)),
            nn.Conv2d(32, 64, kernel_size=(3, 3), padding=1),
            nn.GroupNorm(8, 64),
            nn.ReLU(),
            nn.MaxPool2d((2, 2)),
            nn.Conv2d(64, 128, kernel_size=(3, 3), padding=1),
            nn.GroupNorm(16, 128),
            nn.ReLU(),
            nn.MaxPool2d((2, 2))
        )
        self.specific_attention = nn.MultiheadAttention(32, num_heads=4, batch_first=True)  # embed_dim = 32

        # Fully Connected Layer
        self.fc = nn.Sequential(
            nn.Linear(256, 128),  # 128 (GRU) + 128 (Attention) = 256 (adjusted for symmetry)
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(128, num_classes)  # Output logits
        )

    def forward(self, x):
        # x shape: [batch_size, 1, 32, 512]
        batch_size, channels, n_mels, time_steps = x.shape

        # Global Feature Extraction
        global_features = self.global_conv_blocks(x)  # [batch_size, 128, reduced_time]
        global_features = global_features.squeeze(-1).squeeze(-1)  # [batch_size, 128]
        # Reshape for GRU: remove channel dimension and transpose
        global_features = x.squeeze(1).transpose(1, 2)  # [batch_size, 32, 512] -> [batch_size, 512, 32]
        global_features, _ = self.global_rnn(global_features)  # [batch_size, 512, 256]
        global_features = global_features[:, -1, :][:,:128]  # [batch_size, 128] (truncate to match)

        # Specific Feature Extraction
        specific_features = self.specific_conv_blocks(x)
        specific_features = specific_features.squeeze(-1).squeeze(-1)  # [batch_size, 128]
        specific_features = x.squeeze(1).transpose(1, 2)  # [batch_size, 32, 512] -> [batch_size, 512, 32]
        specific_features, _ = self.specific_attention(specific_features, specific_features, specific_features)
        specific_features = specific_features[:, -1, :]  # [batch_size, 32]

        # Concatenate features
        combined_features = torch.cat((global_features, specific_features), dim=1)  # [batch_size, 160]
        # Project to match expected input of fc layer
        combined_features = nn.Linear(160, 256)(combined_features)  # Adjust to 256 for fc compatibility
        
        # Fully Connected Layers
        output = self.fc(combined_features)  # Output logits [batch_size, 11]
        return output

In [6]:
def training(model, train_dl, num_epochs, device):
    print("Starting training loop...")
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    model.train()
    for epoch in range(num_epochs):
        running_loss = 0.0
        for i, (inputs, labels) in enumerate(train_dl):
            inputs, labels = inputs.to(device), labels.to(device).long()
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
            if i % 10 == 9:
                print(f'Epoch [{epoch+1}/{num_epochs}], Step [{i+1}], Loss: {running_loss/10:.4f}')
                running_loss = 0.0

In [7]:
from torchmetrics import Precision, Recall, F1Score, ConfusionMatrix
import numpy as np
# Inference/Evaluation Function
def inference(model, val_dl, device):
    model.eval()
    correct_prediction = 0
    total_prediction = 0
    
    precision = Precision(task="multilabel", num_labels=model.fc[-2].out_features).to(device)
    recall = Recall(task="multilabel", num_labels=model.fc[-2].out_features).to(device)
    f1 = F1Score(task="multilabel", num_labels=model.fc[-2].out_features).to(device)
    confmat = ConfusionMatrix(task="multilabel", num_labels=model.fc[-2].out_features).to(device)
    
    all_predictions = []
    all_labels = []
    all_probs = []

    with torch.no_grad():
        for data in val_dl:
            inputs, labels = data[0].to(device), data[1].to(device).float()
            outputs = model(inputs)
            predicted = (outputs > 0.5).float()

            correct_prediction += (predicted == labels).sum().item()
            total_prediction += labels.numel()
            
            precision.update(predicted, labels.int())
            recall.update(predicted, labels.int())
            f1.update(predicted, labels.int())
            confmat.update(predicted, labels.int())
            
            all_predictions.extend(predicted.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())
            all_probs.extend(outputs.cpu().numpy())

    acc = correct_prediction / total_prediction
    prec = precision.compute().item()
    rec = recall.compute().item()
    f1_score = f1.compute().item()
    conf_matrix = confmat.compute().cpu().numpy()

    print(f'Accuracy: {acc:.4f}, Total items: {total_prediction}')
    print(f'Precision: {prec:.4f}, Recall: {rec:.4f}, F1-Score: {f1_score:.4f}')
    print('Confusion Matrix:')
    print(conf_matrix)

    return {
        'accuracy': acc,
        'precision': prec,
        'recall': rec,
        'f1_score': f1_score,
        'confusion_matrix': conf_matrix,
        'predictions': np.array(all_predictions),
        'labels': np.array(all_labels),
        'probabilities': np.array(all_probs)
    }

In [None]:
# Main Execution
if __name__ == "__main__":
    # Load data
    import pandas as pd
    from pathlib import Path
    download_path = Path('data\\DATA\\output_chunks')
    metadata_file = download_path / 'metadata.csv'
    df = pd.read_csv(metadata_file)
    df['relative_path'] = '/road/' + df['road'].astype(str) + '/' + df['slice_file_name'].astype(str)
    df = df[['relative_path', 'classID']]

    # Dataset and DataLoader
    dataset = SoundDS(df, download_path, train=True)
    train_size = int(0.8 * len(dataset))
    val_size = len(dataset) - train_size
    train_dataset, val_dataset = torch.utils.data.random_split(dataset, [train_size, val_size])
    train_dl = torch.utils.data.DataLoader(train_dataset, batch_size=16, shuffle=True)  # Reduced batch size for CPU
    val_dl = torch.utils.data.DataLoader(val_dataset, batch_size=16, shuffle=False)

    # Device (forced to CPU)
    device = torch.device("cpu")
    print(f"Using device: {device}")

    # Model
    model = UrbanSoundModel(num_classes=11).to(device)

    # Training
    num_epochs = 20  # Reduced for CPU efficiency
    training(model, train_dl, num_epochs, device)

    # Evaluation
    results = inference(model, val_dl, device)

    # Save model
    torch.save(model.state_dict(), 'urban_sound_model_cpu.pth')
    print("Model saved as 'urban_sound_model_cpu.pth'")

Using device: cpu
Starting training loop...




Epoch [1/20], Step [10], Loss: 2.3335
Epoch [1/20], Step [20], Loss: 2.1816
Epoch [1/20], Step [30], Loss: 2.2024
Epoch [1/20], Step [40], Loss: 2.1510
Epoch [1/20], Step [50], Loss: 2.0770
Epoch [1/20], Step [60], Loss: 2.0152
Epoch [1/20], Step [70], Loss: 2.1345
Epoch [1/20], Step [80], Loss: 2.1147
Epoch [1/20], Step [90], Loss: 1.9910
Epoch [1/20], Step [100], Loss: 2.0688
Epoch [1/20], Step [110], Loss: 2.0020
Epoch [2/20], Step [10], Loss: 2.0695
Epoch [2/20], Step [20], Loss: 2.0362
Epoch [2/20], Step [30], Loss: 1.8904
Epoch [2/20], Step [40], Loss: 2.0070
Epoch [2/20], Step [50], Loss: 1.9745
Epoch [2/20], Step [60], Loss: 2.0212
Epoch [2/20], Step [70], Loss: 1.9123
Epoch [2/20], Step [80], Loss: 1.9055
Epoch [2/20], Step [90], Loss: 1.9509
Epoch [2/20], Step [100], Loss: 1.9951
Epoch [2/20], Step [110], Loss: 1.9549
Epoch [3/20], Step [10], Loss: 1.9334
Epoch [3/20], Step [20], Loss: 1.9567
Epoch [3/20], Step [30], Loss: 2.0048
Epoch [3/20], Step [40], Loss: 1.8670
Epoch [3