In [16]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import soundfile as sf
from IPython.display import Audio, display

import torch
import torch.nn as nn
import torch.nn.functional as F
import os

In [2]:
class AudioCNN(nn.Module):
    """Four-block 2-D CNN classifier for single-channel spectrogram input.

    Every conv block is Conv2d(3x3, padding=1) -> BatchNorm2d -> ReLU ->
    MaxPool2d -> Dropout(0.25). The resulting feature map is globally
    average-pooled to a 512-vector and fed to a two-layer fully connected head.

    Args:
        num_classes: Number of output logits.
        n_mels: Accepted but not read by any layer in this class.
            NOTE(review): presumably kept for documentation/compat -- confirm.

    Shape:
        input:  (batch, 1, n_mels, time)
        output: (batch, num_classes) raw logits (no softmax applied)
    """

    def __init__(self, num_classes=3, n_mels=64):
        super().__init__()

        # One row per conv block:
        #   (in_channels, out_channels, pool kernel_size, pool stride)
        # NOTE(review): blocks 3 and 4 pass scalar kernel_size/stride pairs
        # (2, 1) and (1, 2) to MaxPool2d -- these are NOT (2, 1)/(1, 2) kernel
        # shapes. Confirm this is intended before changing it.
        block_specs = (
            (1, 64, 2, 2),
            (64, 128, 2, 2),
            (128, 256, 2, 1),
            (256, 512, 1, 2),
        )

        # Layers are registered as conv{i}/bn{i}/pool{i}/dropout{i} (i = 1..4)
        # so the parameter names in state_dict() are conv1.weight, bn1.weight, ...
        for idx, (c_in, c_out, pool_k, pool_s) in enumerate(block_specs, start=1):
            setattr(self, f"conv{idx}", nn.Conv2d(c_in, c_out, kernel_size=3, padding=1))
            setattr(self, f"bn{idx}", nn.BatchNorm2d(c_out))
            setattr(self, f"pool{idx}", nn.MaxPool2d(pool_k, pool_s))
            setattr(self, f"dropout{idx}", nn.Dropout(0.25))

        # Collapse whatever spatial size is left to 1x1 per channel.
        self.GAP = nn.AdaptiveAvgPool2d((1, 1))

        # Classification head.
        self.fc1 = nn.Linear(512, 256)
        self.dropout_fc = nn.Dropout(0.5)
        self.fc2 = nn.Linear(256, num_classes)

    def _conv_block(self, x, idx):
        """Apply conv block ``idx`` (1-based): conv -> bn -> relu -> pool -> dropout."""
        conv = getattr(self, f"conv{idx}")
        bn = getattr(self, f"bn{idx}")
        pool = getattr(self, f"pool{idx}")
        dropout = getattr(self, f"dropout{idx}")
        return dropout(pool(F.relu(bn(conv(x)))))

    def forward(self, x):
        """Map a (batch, 1, n_mels, time) tensor to (batch, num_classes) logits."""
        for idx in range(1, 5):
            x = self._conv_block(x, idx)

        # (batch, 512, 1, 1) -> (batch, 512)
        pooled = self.GAP(x)
        features = pooled.view(pooled.size(0), -1)

        hidden = self.dropout_fc(F.relu(self.fc1(features)))
        return self.fc2(hidden)




In [4]:
# preprocess - pad to 5s, resample to fixed sample rate

import librosa

def preprocess_audio(file_path, target_sr=22050, target_duration=5.0, random_crop=True):
    """Load an audio file as a fixed-length mono waveform tensor.

    Steps: read the file as float32, average channels down to mono, resample
    to ``target_sr`` when the file's own rate differs, then zero-pad (centred)
    or crop so the result holds exactly ``round(target_sr * target_duration)``
    samples.

    Args:
        file_path: Path passed to ``sf.read``.
        target_sr: Output sample rate in Hz.
        target_duration: Output length in seconds.
        random_crop: How clips longer than the target are cut. ``True``
            (default, the original behaviour) picks a random window using the
            global ``np.random`` state, so repeated calls can return different
            tensors. ``False`` takes the centre window, which is deterministic
            and is the better choice for evaluation / inference.

    Returns:
        ``torch.float32`` tensor of shape ``(1, target_samples)``, i.e.
        (channels, time).
    """
    data, sr = sf.read(file_path, dtype='float32')

    # Multi-channel files come back 2-D; collapse the channel axis to mono.
    if data.ndim > 1:
        data = data.mean(axis=1)

    if sr != target_sr:
        data = librosa.resample(data, orig_sr=sr, target_sr=target_sr)

    # round() instead of int(): floating point error can leave the product a
    # hair below the intended integer, and truncation would then drop a sample.
    target_samples = int(round(target_sr * target_duration))
    surplus = len(data) - target_samples

    if surplus < 0:
        # Too short: centre the clip in zeros; an odd remainder goes on the right.
        missing = -surplus
        left = missing // 2
        data = np.pad(data, (left, missing - left))
    elif surplus > 0:
        # Too long: keep one contiguous window of the target length.
        if random_crop:
            start = np.random.randint(0, surplus + 1)
        else:
            start = surplus // 2
        data = data[start:start + target_samples]

    # (samples,) -> (1, samples) so downstream code sees (channels, time).
    return torch.from_numpy(data).float().unsqueeze(0)


In [None]:
# Inference smoke test: run the augmented checkpoint over a folder of extra clips.
# test - augmented much better on horns
import torchaudio.transforms as T

# --- configuration ---------------------------------------------------------
# NOTE(review): these presumably have to match the feature settings used when
# the checkpoint was trained -- confirm against the training notebook.
SAMPLE_RATE = 22050
N_MELS = 64
N_FFT = 1024
HOP_LENGTH = 256
CROP_SEED = 0  # preprocess_audio crops over-length clips via np.random

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# model path (os.path.join keeps this portable; on Windows it yields the same
# "..\pytorch training\models" string as before)
model_dir = os.path.join("..", "pytorch training", "models")
model_path = os.path.join(model_dir, "best_model_augmented.pth")
print(model_path, "\n")

model = AudioCNN().to(device)

# map_location lets a checkpoint saved on a GPU machine load on a CPU-only one.
# NOTE(review): torch.load unpickles the file -- only load checkpoints you trust.
state_dict = torch.load(model_path, map_location=device)
model.load_state_dict(state_dict)
model.eval()

class_names = ["horn", "other", "siren"] # list(test_dataset.classes)

# audio files: sorted for a stable run order, regular files only
test_audio_dir = "extra_test_clips"
test_clip_paths = sorted(
    os.path.join(test_audio_dir, name)
    for name in os.listdir(test_audio_dir)
    if os.path.isfile(os.path.join(test_audio_dir, name))
)

print(f"Num Clips: {len(test_clip_paths)}")

# Feature transforms do not depend on the clip, so build them once.
to_mel = T.MelSpectrogram(
    sample_rate=SAMPLE_RATE, n_mels=N_MELS, n_fft=N_FFT, hop_length=HOP_LENGTH
)
to_db = T.AmplitudeToDB()

# Make the random crop of over-length clips repeatable across runs.
np.random.seed(CROP_SEED)

for file_path in test_clip_paths:

    print("="*60)
    print(f"File: {os.path.basename(file_path)}")

    waveform = preprocess_audio(file_path=file_path, target_sr=SAMPLE_RATE)

    print("Processed audio clip: ")
    # Audio wants a 1-D numpy array, so drop the channel dim.
    display(Audio(waveform.squeeze(0).numpy(), rate=SAMPLE_RATE))

    with torch.no_grad():
        mel_spec = to_mel(waveform)
        mel_db = to_db(mel_spec)            # (1, n_mels, time)

        # add batch dimension since model expects (batch, 1, n_mels, time)
        x = mel_db.unsqueeze(0).to(device)

        # forward
        logits = model(x)
        probs = torch.softmax(logits, dim=1).squeeze(0).cpu().numpy()

    # argmax returns a numpy integer; make it a plain int for list indexing.
    pred_idx = int(probs.argmax())
    pred_class = class_names[pred_idx]

    print(f"Prediction: {pred_class}\n")

    # print all probs, most likely class first
    class_prob_pair = list(zip(class_names, probs))
    for cls, prob in sorted(class_prob_pair, key=lambda pair: pair[1], reverse=True):
        print(f"{cls}: {100*prob:.2f}%")

    print("="*60)
    print("\n")



..\pytorch training\models\best_model_augmented.pth 

Num Clips: 8
File: automobile-horn-153260.mp3
Processed audio clip: 


Prediction: horn

horn: 97.46%
other: 2.50%
siren: 0.04%


File: car-honk-386166.mp3
Processed audio clip: 


Prediction: horn

horn: 96.18%
other: 3.79%
siren: 0.02%


File: double-car-horn-352443.mp3
Processed audio clip: 


Prediction: horn

horn: 83.49%
other: 16.42%
siren: 0.09%


File: fire-truck-siren-29900.mp3
Processed audio clip: 


Prediction: siren

siren: 98.22%
other: 1.78%
horn: 0.00%


File: police-siren.mp3
Processed audio clip: 


Prediction: siren

siren: 97.58%
other: 2.42%
horn: 0.00%


File: siren-police.mp3
Processed audio clip: 


Prediction: siren

siren: 85.78%
other: 14.22%
horn: 0.00%


File: sound-effect-uk-ambulance-siren.mp3
Processed audio clip: 


Prediction: siren

siren: 96.50%
other: 3.50%
horn: 0.00%


File: truck-signal-153263.mp3
Processed audio clip: 


Prediction: horn

horn: 99.40%
other: 0.60%
siren: 0.00%


