In [None]:
# ----------------------------
# Prepare training data from Metadata file
# ----------------------------
import pandas as pd
from pathlib import Path

# Build the dataset directory from separate components so the path resolves on
# every OS. A single 'data\\DATA\\output_chunks' string is only split into
# directories on Windows; elsewhere it is one literal (non-existent) name.
download_path = Path('data') / 'DATA' / 'output_chunks'

# Read metadata file
metadata_file = download_path / 'metadata.csv'
df = pd.read_csv(metadata_file)

# Construct file path by concatenating road and slice_file_name
df['relative_path'] = '/road/' + df['road'].astype(str) + '/' + df['slice_file_name'].astype(str)

# Take relevant columns. Note that this rebinds `df` to a two-column view, so
# the 'road' / 'slice_file_name' / 'class' columns are no longer available from
# `df` after this cell; re-read `metadata_file` when they are needed.
df = df[['relative_path', 'classID']]

# Only the last expression of a cell is rendered, so preview the frame once here.
df.head()

In [None]:
import pandas as pd

# Re-read the complete metadata table (every column) from disk
df = pd.read_csv(metadata_file)

# Collect the distinct labels that occur in the 'class' column
distinct_classes = df.loc[:, 'class'].unique()

# Show the labels
print(distinct_classes)

In [None]:
import math
import random
import torch
import torchaudio
from torchaudio import transforms
from IPython.display import Audio
import albumentations as A
from albumentations.pytorch import ToTensorV2

class AudioUtil:
    """Stateless helpers for loading, conditioning and augmenting audio clips.

    Audio is passed between helpers as an ``(sig, sr)`` tuple where ``sig`` is a
    2-D tensor indexed as ``(channel, sample)`` and ``sr`` is the sample rate.
    Spectrogram helpers take and return 3-D tensors indexed as
    ``(channel, n_mels, time_steps)``.
    """

    @staticmethod
    def open(audio_file):
        """Load ``audio_file`` with torchaudio and return ``(sig, sr)``."""
        sig, sr = torchaudio.load(audio_file)
        return (sig, sr)

    @staticmethod
    def rechannel(aud, new_channel):
        """Convert the clip to ``new_channel`` channels.

        Returns ``aud`` unchanged when the channel count already matches.
        For ``new_channel == 1`` only the first channel is kept; otherwise the
        signal is stacked with a copy of itself (mono -> stereo).
        """
        sig, sr = aud
        if sig.shape[0] == new_channel:
            return aud
        if new_channel == 1:
            # Keep only the first channel
            resig = sig[:1, :]
        else:
            # Duplicate the existing channel(s)
            resig = torch.cat([sig, sig])
        return (resig, sr)

    @staticmethod
    def resample(aud, newsr):
        """Resample the clip to ``newsr`` Hz; a no-op when rates already match."""
        sig, sr = aud
        if sr == newsr:
            return aud
        # Resample operates along the last (time) axis, so a single transform
        # handles every channel at once.
        resig = torchaudio.transforms.Resample(sr, newsr)(sig)
        return (resig, newsr)

    @staticmethod
    def pad_trunc(aud, max_ms):
        """Force the clip to exactly ``max_ms`` milliseconds.

        Longer clips are cut at the end; shorter clips are zero-padded with the
        padding split at a random point between the beginning and the end.
        """
        sig, sr = aud
        num_rows, sig_len = sig.shape
        # Multiply before dividing: ``sr // 1000`` would drop the fractional
        # kHz (44100 Hz -> 44 samples/ms) and silently shorten the clip.
        max_len = sr * max_ms // 1000
        if sig_len > max_len:
            sig = sig[:, :max_len]
        elif sig_len < max_len:
            pad_begin_len = random.randint(0, max_len - sig_len)
            pad_end_len = max_len - sig_len - pad_begin_len
            # new_zeros keeps the padding on the same dtype/device as the signal
            pad_begin = sig.new_zeros((num_rows, pad_begin_len))
            pad_end = sig.new_zeros((num_rows, pad_end_len))
            sig = torch.cat((pad_begin, sig, pad_end), 1)
        return (sig, sr)

    @staticmethod
    def time_shift(aud, shift_limit):
        """Circularly shift the clip in time by a random fraction of its length.

        The shift is drawn uniformly from ``[0, shift_limit * sig_len)`` samples
        and wraps around the end of the clip.
        """
        sig, sr = aud
        _, sig_len = sig.shape
        shift_amt = int(random.random() * shift_limit * sig_len)
        # Roll along the time axis only. Without ``dims`` the tensor is
        # flattened first, which pushes samples from one channel into the next.
        return (sig.roll(shift_amt, dims=1), sr)

    @staticmethod
    def spectro_gram(aud, n_mels=64, n_fft=1024, hop_len=None):
        """Return the Mel spectrogram of the clip converted to decibels.

        ``hop_len=None`` is forwarded as-is, leaving the hop length to
        torchaudio's default. The dB conversion uses ``top_db=80``.
        """
        sig, sr = aud
        top_db = 80
        spec = transforms.MelSpectrogram(sr, n_fft=n_fft, hop_length=hop_len, n_mels=n_mels)(sig)
        spec = transforms.AmplitudeToDB(top_db=top_db)(spec)
        return spec

    @staticmethod
    def spectro_augment(spec, max_mask_pct=0.1, n_freq_masks=1, n_time_masks=1):
        """Apply frequency and time masking to a spectrogram.

        ``n_freq_masks`` frequency masks and ``n_time_masks`` time masks are
        applied in turn, each with a maximum width of ``max_mask_pct`` of the
        corresponding axis. Masked cells are filled with the spectrogram mean.
        """
        _, n_mels, n_steps = spec.shape
        mask_value = spec.mean()
        aug_spec = spec
        freq_mask_param = max_mask_pct * n_mels
        for _ in range(n_freq_masks):
            aug_spec = transforms.FrequencyMasking(freq_mask_param)(aug_spec, mask_value)
        time_mask_param = max_mask_pct * n_steps
        for _ in range(n_time_masks):
            aug_spec = transforms.TimeMasking(time_mask_param)(aug_spec, mask_value)
        return aug_spec

    @staticmethod
    def image_augment(spec):
        """Apply image-style augmentations (albumentations) to a spectrogram.

        The tensor is converted to a channels-last numpy array, passed through
        shift/scale/rotate, grid distortion and cutout (each with p=0.5), and
        converted back to a channels-first tensor.
        """
        # Convert tensor to numpy for albumentations
        spec_np = spec.numpy().transpose(1, 2, 0)  # Assuming spec is (C, H, W), adjust if needed
        # NOTE(review): A.Cutout appears to be deprecated/removed in recent
        # albumentations releases in favour of A.CoarseDropout - confirm
        # against the installed version.
        transform = A.Compose([
            A.ShiftScaleRotate(shift_limit=0.1, scale_limit=0.1, rotate_limit=30, p=0.5),
            A.GridDistortion(p=0.5),
            A.Cutout(num_holes=8, max_h_size=8, max_w_size=8, p=0.5),
        ])
        augmented = transform(image=spec_np)
        aug_spec = augmented['image'].transpose(2, 0, 1)  # Back to (C, H, W)
        return torch.from_numpy(aug_spec)

In [None]:
import os
from torch.utils.data import Dataset
import torchaudio

class SoundDS(Dataset):
    """Dataset that turns rows of the metadata frame into spectrogram/label pairs.

    Each row must provide the columns ``road``, ``slice_file_name`` and
    ``classID``. The audio file is looked up at
    ``<data_path>/<road>/<slice_file_name>``.

    Args:
        df: Metadata DataFrame with the columns listed above.
        data_path: Root directory that contains the per-road folders.
        duration: Clip length in milliseconds after padding/truncation.
        sr: Target sample rate in Hz.
        channel: Target number of audio channels.
        shift_pct: Maximum time shift as a fraction of the clip length.
        augment: When True (default) the random time shift, spectrogram
            masking and image augmentations are applied. Pass False to get
            deterministic features, e.g. for a validation set.
    """

    def __init__(self, df, data_path, duration=4000, sr=44100, channel=2,
                 shift_pct=0.4, augment=True):
        self.df = df
        self.data_path = str(data_path)
        self.duration = duration
        self.sr = sr
        self.channel = channel
        self.shift_pct = shift_pct
        self.augment = augment

    def __len__(self):
        """Return the number of rows (clips) in the metadata frame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Load clip number ``idx`` and return ``(spectrogram, class_id)``.

        Raises:
            FileNotFoundError: if the audio file for the row does not exist.
        """
        # Index by position so the dataset also works with frames whose index
        # is not 0..N-1 (e.g. after filtering rows).
        idx = int(idx)

        # Construct the full path using 'road' and 'slice_file_name'.
        # str() guards against numeric folder names read from the CSV.
        road = str(self.df['road'].iloc[idx])
        slice_file_name = str(self.df['slice_file_name'].iloc[idx])
        audio_file = os.path.join(self.data_path, road, slice_file_name)

        # Check if the file exists
        if not os.path.exists(audio_file):
            raise FileNotFoundError(f"Audio file not found: {audio_file}")

        # Get the Class ID
        class_id = self.df['classID'].iloc[idx]

        # Bring every clip to the same sample rate, channel count and length
        aud = AudioUtil.open(audio_file)
        aud = AudioUtil.resample(aud, self.sr)
        aud = AudioUtil.rechannel(aud, self.channel)
        aud = AudioUtil.pad_trunc(aud, self.duration)
        if self.augment:
            aud = AudioUtil.time_shift(aud, self.shift_pct)

        sgram = AudioUtil.spectro_gram(aud, n_mels=64, n_fft=1024)
        if self.augment:
            sgram = AudioUtil.spectro_augment(sgram, max_mask_pct=0.1, n_freq_masks=2, n_time_masks=2)
            sgram = AudioUtil.image_augment(sgram)
        return sgram, class_id

In [None]:
from torch import nn
# Neural Network Model (Dual Backbone Architecture)
class UrbanSoundModel(nn.Module):
    """Dual-backbone classifier over Mel spectrograms.

    Two identical convolutional stacks process the same input. The "global"
    branch feeds a bidirectional GRU, the "specific" branch feeds multi-head
    self-attention. The final time step of each branch (512 features each) is
    concatenated and passed through a small MLP ending in a sigmoid, so the
    output is one independent probability per class.

    Args:
        num_classes: Number of output probabilities.
        pretrained_global: Accepted for interface compatibility. It is not read
            anywhere in this class; no pretrained weights are loaded.
        in_channels: Number of spectrogram channels the first convolution
            expects. With the default of 1, multi-channel inputs are averaged
            down to a single channel in ``forward``.

    Input is expected as ``(batch, channel, n_mels, time)``, the layout a
    batched ``MelSpectrogram`` output has.
    """

    def __init__(self, num_classes, pretrained_global=True, in_channels=1):
        super().__init__()
        self.in_channels = in_channels

        # Global feature extractor: conv stack followed by a bidirectional GRU
        self.global_conv_blocks = self._make_conv_stack(in_channels)
        self.global_rnn = nn.GRU(512, 256, bidirectional=True, batch_first=True)

        # Specific feature extractor: conv stack followed by self-attention
        self.specific_conv_blocks = self._make_conv_stack(in_channels)
        self.specific_attention = nn.MultiheadAttention(512, num_heads=8, batch_first=True)

        # Fully Connected Layer
        self.fc = nn.Sequential(
            nn.Linear(1024, 512),  # 512 (GRU, 2 x 256) + 512 (Attention)
            nn.Mish(),
            nn.Dropout(0.5),
            nn.Linear(512, num_classes),
            nn.Sigmoid()  # Output probabilities
        )

        # Gradient Centralization for convolutional layers
        self.apply(self._init_gc)

    @staticmethod
    def _make_conv_stack(in_channels):
        """Build the 4-stage Conv -> GroupNorm -> Mish -> MaxPool stack (64..512 channels)."""
        layers = []
        prev_width = in_channels
        for width in (64, 128, 256, 512):
            layers.extend([
                nn.Conv2d(prev_width, width, kernel_size=(3, 3), padding=1),
                nn.GroupNorm(width // 8, width),  # 8 channels per group
                nn.Mish(),
                nn.MaxPool2d((2, 2)),
            ])
            prev_width = width
        return nn.Sequential(*layers)

    @staticmethod
    def _centralize_gradient(grad):
        """Subtract the per-output-filter mean from a conv weight gradient."""
        return grad - grad.mean(dim=(1, 2, 3), keepdim=True)

    def _init_gc(self, m):
        """Attach gradient centralization to every ``nn.Conv2d`` weight.

        The hook rewrites the weight gradient during ``backward`` so that each
        output filter's gradient has zero mean. The weights themselves and the
        state-dict keys are left untouched.
        """
        if isinstance(m, nn.Conv2d):
            m.weight.register_hook(self._centralize_gradient)

    @staticmethod
    def _to_sequence(feature_map):
        """Turn a ``(batch, 512, freq, time)`` map into ``(batch, time, 512)``.

        The frequency axis is averaged away so that each time step carries
        exactly the 512 features the GRU / attention layers were built for.
        """
        return feature_map.mean(dim=2).transpose(1, 2)

    def forward(self, x):
        """Return per-class probabilities of shape ``(batch, num_classes)``.

        Raises:
            ValueError: if the input channel count does not match
                ``in_channels`` and cannot be reduced to mono.
        """
        if x.size(1) != self.in_channels:
            if self.in_channels == 1:
                # Mono model fed a multi-channel spectrogram: average channels
                x = x.mean(dim=1, keepdim=True)
            else:
                raise ValueError(
                    f"Expected input with {self.in_channels} channels, got {x.size(1)}"
                )

        # Global Feature Extraction
        global_features = self._to_sequence(self.global_conv_blocks(x))
        global_features, _ = self.global_rnn(global_features)
        global_features = global_features[:, -1, :]  # Take the last time step

        # Specific Feature Extraction
        specific_features = self._to_sequence(self.specific_conv_blocks(x))
        specific_features, _ = self.specific_attention(specific_features, specific_features, specific_features)
        specific_features = specific_features[:, -1, :]  # Take the last time step

        # Concatenate features
        combined_features = torch.cat((global_features, specific_features), dim=1)

        # Fully Connected Layers
        output = self.fc(combined_features)
        return output

In [None]:
# Training Function
def training(model, train_dl, num_epochs, device):
    """Train ``model`` with BCE loss, Adam and a one-cycle learning-rate schedule.

    Args:
        model: Module whose output is ``(batch, num_classes)`` probabilities
            in [0, 1] (BCELoss is applied directly to the output).
        train_dl: Sized iterable of batches; ``batch[0]`` are the inputs and
            ``batch[1]`` the labels. Labels may be 1-D class indices
            (converted to one-hot targets here) or 2-D multi-hot targets.
        num_epochs: Number of passes over ``train_dl``.
        device: Device the batches are moved to.

    Prints the mean loss and accuracy after every epoch. For class-index
    labels the accuracy is the fraction of samples whose arg-max prediction is
    correct; for multi-hot labels it is the fraction of correctly thresholded
    (> 0.5) label cells.

    Raises:
        ValueError: if ``train_dl`` contains no batches.
    """
    # Imported locally so the function runs on a fresh kernel without relying
    # on names imported by other cells.
    from torch import nn, optim
    from torch.nn import functional as F

    num_batches = len(train_dl)
    if num_batches == 0:
        raise ValueError("train_dl is empty; nothing to train on")

    criterion = nn.BCELoss()  # Binary Cross-Entropy Loss for multi-label
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    scheduler = optim.lr_scheduler.OneCycleLR(optimizer, max_lr=0.001,
                                              steps_per_epoch=num_batches,
                                              epochs=num_epochs,
                                              anneal_strategy='linear')

    for epoch in range(num_epochs):
        # Make sure dropout etc. are active even if the model was left in eval mode
        model.train()
        running_loss = 0.0
        correct_prediction = 0
        total_prediction = 0

        for data in train_dl:
            inputs, labels = data[0].to(device), data[1].to(device)

            optimizer.zero_grad()
            outputs = model(inputs)

            if labels.dim() == 1:
                # Class indices -> one-hot rows so the target matches the
                # (batch, num_classes) output that BCELoss requires.
                class_ids = labels.long()
                targets = F.one_hot(class_ids, num_classes=outputs.size(1)).float()
            else:
                class_ids = None
                targets = labels.float()

            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()
            scheduler.step()  # OneCycleLR is stepped once per batch

            running_loss += loss.item()
            if class_ids is not None:
                correct_prediction += (outputs.argmax(dim=1) == class_ids).sum().item()
                total_prediction += class_ids.numel()
            else:
                predicted = (outputs > 0.5).float()
                correct_prediction += (predicted == targets).sum().item()
                total_prediction += targets.numel()

        avg_loss = running_loss / num_batches
        acc = correct_prediction / total_prediction
        print(f'Epoch: {epoch}, Loss: {avg_loss:.2f}, Accuracy: {acc:.2f}')

    print('Finished Training')

In [None]:
# Inference/Evaluation Function
def inference(model, val_dl, device):
    """Evaluate ``model`` on ``val_dl`` and return a dictionary of metrics.

    Args:
        model: Module producing ``(batch, num_classes)`` probabilities. Its
            ``fc[-2]`` layer must expose ``out_features`` (the class count).
        val_dl: Iterable of batches; ``batch[0]`` are the inputs and
            ``batch[1]`` the labels, either 1-D class indices (converted to
            one-hot here) or 2-D multi-hot targets.
        device: Device the batches and metric objects are moved to.

    Returns:
        dict with keys ``accuracy``, ``precision``, ``recall``, ``f1_score``,
        ``confusion_matrix``, ``predictions`` (thresholded at 0.5), ``labels``
        (multi-hot) and ``probabilities`` (raw model outputs), the last three
        as numpy arrays. For class-index labels ``accuracy`` is the fraction of
        samples whose arg-max prediction is correct; for multi-hot labels it is
        the fraction of correctly thresholded label cells.

    Raises:
        ValueError: if ``val_dl`` yields no samples.

    The model is left in eval mode.
    """
    # Imported locally so the function runs on a fresh kernel without relying
    # on names imported by other cells.
    import numpy as np
    from torch.nn import functional as F
    from torchmetrics import ConfusionMatrix, F1Score, Precision, Recall

    model.eval()
    correct_prediction = 0
    total_prediction = 0

    # The layer before the final Sigmoid carries the number of classes
    num_labels = model.fc[-2].out_features
    precision = Precision(task="multilabel", num_labels=num_labels).to(device)
    recall = Recall(task="multilabel", num_labels=num_labels).to(device)
    f1 = F1Score(task="multilabel", num_labels=num_labels).to(device)
    confmat = ConfusionMatrix(task="multilabel", num_labels=num_labels).to(device)

    all_predictions = []
    all_labels = []
    all_probs = []

    with torch.no_grad():
        for data in val_dl:
            inputs, labels = data[0].to(device), data[1].to(device)
            outputs = model(inputs)
            predicted = (outputs > 0.5).float()

            if labels.dim() == 1:
                # Class indices -> one-hot rows so they line up with the
                # (batch, num_classes) predictions.
                class_ids = labels.long()
                targets = F.one_hot(class_ids, num_classes=outputs.size(1)).float()
                correct_prediction += (outputs.argmax(dim=1) == class_ids).sum().item()
                total_prediction += class_ids.numel()
            else:
                targets = labels.float()
                correct_prediction += (predicted == targets).sum().item()
                total_prediction += targets.numel()

            precision.update(predicted, targets.int())
            recall.update(predicted, targets.int())
            f1.update(predicted, targets.int())
            confmat.update(predicted, targets.int())

            all_predictions.extend(predicted.cpu().numpy())
            all_labels.extend(targets.cpu().numpy())
            all_probs.extend(outputs.cpu().numpy())

    if total_prediction == 0:
        raise ValueError("val_dl is empty; nothing to evaluate")

    acc = correct_prediction / total_prediction
    prec = precision.compute().item()
    rec = recall.compute().item()
    f1_score = f1.compute().item()
    conf_matrix = confmat.compute().cpu().numpy()

    print(f'Accuracy: {acc:.4f}, Total items: {total_prediction}')
    print(f'Precision: {prec:.4f}, Recall: {rec:.4f}, F1-Score: {f1_score:.4f}')
    print('Confusion Matrix:')
    print(conf_matrix)

    return {
        'accuracy': acc,
        'precision': prec,
        'recall': rec,
        'f1_score': f1_score,
        'confusion_matrix': conf_matrix,
        'predictions': np.array(all_predictions),
        'labels': np.array(all_labels),
        'probabilities': np.array(all_probs)
    }

In [None]:
# Main Execution
if __name__ == "__main__":
    # End-to-end run: load metadata, build loaders, train, evaluate, save.
    import pandas as pd
    from pathlib import Path
    from torch.utils.data import DataLoader  # imported here: no other cell imports it

    # Build the path from components so it resolves on every OS
    download_path = Path('data') / 'DATA' / 'output_chunks'
    metadata_file = download_path / 'metadata.csv'
    df = pd.read_csv(metadata_file)

    # SoundDS.__getitem__ reads 'road', 'slice_file_name' and 'classID', so
    # those are the columns that must be kept. reset_index guarantees the
    # 0..N-1 index the dataset's integer lookups rely on.
    df = df[['road', 'slice_file_name', 'classID']].reset_index(drop=True)

    # Dataset and DataLoader
    dataset = SoundDS(df, download_path)
    train_size = int(0.8 * len(dataset))
    val_size = len(dataset) - train_size
    # Seeded generator so the train/validation split is reproducible
    split_rng = torch.Generator().manual_seed(42)
    train_dataset, val_dataset = torch.utils.data.random_split(
        dataset, [train_size, val_size], generator=split_rng
    )
    train_dl = DataLoader(train_dataset, batch_size=32, shuffle=True)
    val_dl = DataLoader(val_dataset, batch_size=32, shuffle=False)

    # Device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Model: one output per class ID present in the metadata (IDs are assumed
    # to be non-negative integers, so the largest ID + 1 covers all of them)
    num_classes = int(df['classID'].max()) + 1
    model = UrbanSoundModel(num_classes=num_classes).to(device)

    # Training
    num_epochs = 50
    training(model, train_dl, num_epochs, device)

    # Evaluation
    results = inference(model, val_dl, device)

    # Save model
    torch.save(model.state_dict(), 'urban_sound_model.pth')
    print("Model saved as 'urban_sound_model.pth'")