<a href="https://colab.research.google.com/github/Benedictakel/Sound-Classification-Using-UrbanSound8K/blob/main/Sound_Classification_Using_UrbanSound8K.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
pip install librosa pandas torch torchvision matplotlib scikit-learn


Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

In [None]:
import os
import librosa
import torch
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Dataset locations
DATASET_PATH = 'UrbanSound8K/audio'
METADATA_PATH = 'UrbanSound8K/metadata/UrbanSound8K.csv'

# Feature-extraction settings
SAMPLE_RATE = 22050  # sampling rate passed to librosa.load
NUM_MFCC = 13        # MFCC coefficients per frame
MAX_LEN = 174        # frames kept per clip; tune to the clip duration (e.g., 4 sec)

# Read the per-clip metadata table
metadata = pd.read_csv(METADATA_PATH)

# Build the two-way mapping between class names and integer indices,
# numbering the classes in order of first appearance in the metadata.
class_labels = metadata['class'].unique()
index_to_label = dict(enumerate(class_labels))
label_to_index = {label: idx for idx, label in index_to_label.items()}


In [None]:
def extract_features(file_path):
    """Load an audio file and return a fixed-width MFCC matrix.

    The clip is loaded at ``SAMPLE_RATE``, converted to ``NUM_MFCC`` MFCC
    coefficients per frame, and then truncated or zero-padded along the
    time (last) axis so every clip yields exactly ``MAX_LEN`` frames.

    Args:
        file_path: Path of the audio file to load.

    Returns:
        A NumPy array of shape ``(NUM_MFCC, MAX_LEN)``.
    """
    signal, sr = librosa.load(file_path, sr=SAMPLE_RATE)
    # The audio signal must be passed as the keyword argument ``y``:
    # librosa >= 0.10 made it keyword-only, so a positional call raises TypeError.
    mfcc = librosa.feature.mfcc(y=signal, sr=sr, n_mfcc=NUM_MFCC)

    # Clip long recordings first, then right-pad short ones with zeros.
    mfcc = mfcc[:, :MAX_LEN]
    pad_width = MAX_LEN - mfcc.shape[1]
    if pad_width > 0:
        mfcc = np.pad(mfcc, ((0, 0), (0, pad_width)), mode='constant')

    return mfcc


In [None]:
from torch.utils.data import Dataset, DataLoader

class UrbanSoundDataset(Dataset):
    """Dataset that yields ``(mfcc_tensor, label_tensor)`` pairs for audio clips.

    Each item is looked up by position in ``metadata``; the audio file is
    located at ``<data_path>/fold<fold>/<slice_file_name>`` and converted to
    MFCC features on the fly with ``extract_features``.

    Args:
        metadata: DataFrame with at least the columns ``fold``,
            ``slice_file_name`` and ``class``.
        data_path: Root directory that contains the ``fold<N>`` sub-folders.
        transform: Optional callable applied to the MFCC tensor (after the
            channel dimension has been added) before it is returned.
    """

    def __init__(self, metadata, data_path, transform=None):
        self.metadata = metadata
        self.data_path = data_path
        self.transform = transform

    def __len__(self):
        """Return the number of clips described by the metadata."""
        return len(self.metadata)

    def __getitem__(self, idx):
        """Return the MFCC tensor and integer label tensor for row ``idx``."""
        row = self.metadata.iloc[idx]
        fold = f"fold{row['fold']}"
        file_name = row['slice_file_name']
        label = label_to_index[row['class']]

        file_path = os.path.join(self.data_path, fold, file_name)
        mfcc = extract_features(file_path)
        # Add a leading channel dimension so the tensor suits Conv2d input.
        mfcc_tensor = torch.tensor(mfcc, dtype=torch.float32).unsqueeze(0)
        # Previously the transform was stored but silently ignored.
        if self.transform is not None:
            mfcc_tensor = self.transform(mfcc_tensor)
        label_tensor = torch.tensor(label, dtype=torch.long)
        return mfcc_tensor, label_tensor

# Quick-test configuration: train on the clips of fold 1 only.
fold1_mask = metadata['fold'] == 1
subset = metadata.loc[fold1_mask].reset_index(drop=True)

# Wrap the subset in a dataset and serve it in shuffled mini-batches of 16.
dataset = UrbanSoundDataset(subset, DATASET_PATH)
train_loader = DataLoader(dataset=dataset, batch_size=16, shuffle=True)


In [None]:
import torch.nn as nn
import torch.nn.functional as F

class AudioClassifier(nn.Module):
    """Small two-layer CNN that classifies MFCC "images" of audio clips.

    The network expects input of shape ``[B, 1, n_mfcc, max_len]`` and
    returns unnormalised class scores (logits) of shape ``[B, num_classes]``.

    Args:
        n_mfcc: Height of the input, i.e. MFCC coefficients per frame.
        max_len: Width of the input, i.e. number of frames per clip.
        num_classes: Number of output classes.

    Raises:
        ValueError: If the input is too small to survive both conv/pool stages.
    """

    def __init__(self, n_mfcc=13, max_len=174, num_classes=10):
        super().__init__()
        self.conv1 = nn.Conv2d(1, 16, kernel_size=3, stride=1)
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(16, 32, kernel_size=3, stride=1)

        # Derive the flattened feature size instead of hard-coding it. Each
        # stage is an unpadded 3x3 conv (size - 2) followed by a 2x2 max-pool
        # (floor division by 2). For the default 13 x 174 input this gives
        # 1 x 42, not the 2 x 42 assumed before, which made fc1 fail with a
        # shape mismatch.
        height, width = n_mfcc, max_len
        for _ in range(2):
            height = (height - 2) // 2
            width = (width - 2) // 2
        if height < 1 or width < 1:
            raise ValueError(
                f"Input of size {n_mfcc}x{max_len} is too small for this network"
            )

        self.fc1 = nn.Linear(32 * height * width, 64)
        self.fc2 = nn.Linear(64, num_classes)

    def forward(self, x):
        """Compute class logits for a batch of MFCC tensors."""
        x = self.pool(F.relu(self.conv1(x)))   # default shape: [B, 16, 5, 86]
        x = self.pool(F.relu(self.conv2(x)))   # default shape: [B, 32, 1, 42]
        x = torch.flatten(x, 1)                # keep batch dim, flatten the rest
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x


In [None]:
# Model, loss function and optimiser used for training
model = AudioClassifier()
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Train for 10 passes over the loader, reporting the mean batch loss per epoch
for epoch in range(10):
    model.train()
    running_loss = 0.0
    for batch_features, batch_targets in train_loader:
        # Clear gradients left over from the previous step
        optimizer.zero_grad()

        # Forward pass and loss
        logits = model(batch_features)
        batch_loss = criterion(logits, batch_targets)

        # Backward pass and parameter update
        batch_loss.backward()
        optimizer.step()

        running_loss += batch_loss.item()

    print(f"Epoch {epoch+1}, Loss: {running_loss/len(train_loader):.4f}")


In [None]:
def predict(model, file_path):
    """Predict the class label of a single audio file.

    Args:
        model: Trained classifier that maps a ``[1, 1, n_mfcc, frames]``
            tensor to class scores.
        file_path: Path of the audio file to classify.

    Returns:
        The class name looked up in ``index_to_label`` for the
        highest-scoring class.
    """
    model.eval()
    mfcc = extract_features(file_path)
    # Add batch and channel dimensions: [n_mfcc, frames] -> [1, 1, n_mfcc, frames]
    mfcc_tensor = torch.tensor(mfcc, dtype=torch.float32).unsqueeze(0).unsqueeze(0)
    # Inference needs no gradients; disabling autograd saves memory and time.
    with torch.no_grad():
        output = model(mfcc_tensor)
    predicted = torch.argmax(output, dim=1).item()
    return index_to_label[predicted]

# Demo: classify the first clip of the fold-1 subset
first_clip_name = subset.iloc[0]['slice_file_name']
example_path = os.path.join(DATASET_PATH, 'fold1', first_clip_name)
predicted_label = predict(model, example_path)
print("Predicted class:", predicted_label)
