# Audio-Based Material Classification

Alexis Powell, Zitong Ren, and Jiaming Li

CIS 5190/4190

Fall 2024

In [None]:
!pip install ffmpeg
import os
import numpy as np
import soundfile as sf
import librosa
import librosa.display
import torch
import torch.nn as nn
import torch.optim as optim
import torchaudio
import matplotlib.pyplot as plt
from torchvision import transforms, models, datasets
from torch.utils.data import DataLoader, Dataset, Subset
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score, recall_score, f1_score
from PIL import Image



In [None]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
!ls "/content/drive/Shareddrives/ML Project/Data/Data"

ben  blackboard  glass	railing  sofa  table  water


In [None]:
# Define the class-to-label mappings
CLASS_TO_LABEL = {
    'water': 0,
    'table': 1,
    'sofa': 2,
    'railing': 3,
    'glass': 4,
    'blackboard': 5,
    'ben': 6
}

# Configuration
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Hyperparameters
learning_rate = 0.001
num_epochs = 35
num_classes = len(CLASS_TO_LABEL)

In [None]:
def calculate_label_statistics(dataset):
  labels = [label for _, label in dataset]
  mean = np.mean(labels)
  std = np.std(labels)
  return mean, std

In [None]:
class MelSpectrogramDataset(Dataset):
    def __init__(self, data_dir, transform=None, mean=None, std=None):
        self.data = []
        self.labels = []
        self.transform = transform
        self.mean = mean
        self.std = std

        # Traverse data directory and load images and labels
        for class_name, label in CLASS_TO_LABEL.items():
            class_dir = os.path.join(data_dir, class_name)
            if os.path.exists(class_dir):
                for file_name in os.listdir(class_dir):
                    if file_name.endswith('.png'):
                        self.data.append(os.path.join(class_dir, file_name))
                        self.labels.append(label)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        img_path = self.data[idx]
        label = self.labels[idx]

        # Standardize label using the mean and std
        # if self.mean is not None and self.std is not None:
            # label = (label - self.mean) / self.std

        # Load image
        image = Image.open(img_path).convert('RGB')
        if self.transform:
            image = self.transform(image)

        return image, label

def denormalize_label(predicted_label, mean, std):
  return predicted_label * std + mean

# Define transformations for the dataset
transform = transforms.Compose([
    transforms.Resize((128, 128)),
    transforms.ToTensor(),
    transforms.Normalize((0.5,), (0.5,))
])

# Load dataset and split into train/test sets
# data_path = '/content/drive/Shareddrives/ML Project/mel_spectrograms'
data_path = '//content/drive/Shareddrives/ML Project/mel_spectrograms'

# dataset_initial = MelSpectrogramDataset(data_path, transform=transform)
# mean, std = calculate_label_statistics(dataset_initial)
dataset = MelSpectrogramDataset(data_path, transform=transform)

train_indices, test_indices = train_test_split(list(range(len(dataset))), test_size=0.2, random_state=42)

In [None]:
class CNNClassifier(nn.Module):
    def __init__(self, num_classes=7):
        super(CNNClassifier, self).__init__()

        # Two convolutional layers with kernel size 5 and stride 1
        self.conv1 = nn.Conv2d(3, 32, kernel_size=5, stride=1, padding=2)  # Padding = 2 to maintain dimensions
        self.conv2 = nn.Conv2d(32, 64, kernel_size=5, stride=1, padding=2)  # Padding = 2 to maintain dimensions

        # Pooling layer
        self.pool = nn.MaxPool2d(2, 2)  # Reduces spatial dimensions by half

        # Calculate flattened size after convolutions and pooling for fc1
        self.flattened_size = 64 * 32 * 32  # Adjust based on input size (assumed 128x128 input image)

        # Fully connected layer
        self.fc1 = nn.Linear(self.flattened_size, num_classes)

    def forward(self, x):
        # Apply convolutional layers with ReLU and pooling
        x = self.pool(torch.relu(self.conv1(x)))
        x = self.pool(torch.relu(self.conv2(x)))

        # Flatten feature maps dynamically
        x = x.view(x.size(0), -1)

        # Fully connected layer
        x = self.fc1(x)
        return x


In [None]:
# K-Fold Cross-Validation
k_folds = 5
kf = KFold(n_splits=k_folds, shuffle=True, random_state=42)
fold_results = []

# Define batch size and learning rate
batch_size = 16
learning_rate = 0.0005

for fold, (train_indices, val_indices) in enumerate(kf.split(dataset)):
    print(f"Fold {fold + 1}/{k_folds}")

    # Subsets for the current fold
    train_dataset = Subset(dataset, train_indices)
    test_dataset = Subset(dataset, test_indices)
    val_subset = Subset(dataset, val_indices)

    # Create DataLoaders for the current fold
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
    val_loader = DataLoader(val_subset, batch_size=batch_size, shuffle=False)

    # Instantiate a fresh model for each fold
    model = CNNClassifier(num_classes=7).to(device)
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    criterion = nn.CrossEntropyLoss()

    # Train the model with per-epoch performance logging
    train_losses, val_losses, train_accuracies, val_accuracies = [], [], [], []

    for epoch in range(num_epochs):
        model.train()
        running_loss, correct_train, total_train = 0.0, 0, 0

        for images, labels in train_loader:
            images, labels = images.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(images)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            running_loss += loss.item() * images.size(0)
            _, predicted = torch.max(outputs, 1)
            correct_train += (predicted == labels).sum().item()
            total_train += labels.size(0)

        epoch_train_loss = running_loss / len(train_loader.dataset)
        train_accuracy = correct_train / total_train
        train_losses.append(epoch_train_loss)
        train_accuracies.append(train_accuracy)

        # Validation loop
        model.eval()
        running_val_loss, correct_val, total_val = 0.0, 0, 0
        all_labels, all_predictions = [], []

        with torch.no_grad():
            for images, labels in test_loader:
                images, labels = images.to(device), labels.to(device)
                outputs = model(images)
                loss = criterion(outputs, labels)
                running_val_loss += loss.item() * images.size(0)
                _, predicted = torch.max(outputs, 1)

                correct_val += (predicted == labels).sum().item()
                total_val += labels.size(0)
                all_labels.extend(labels.cpu().numpy())
                all_predictions.extend(predicted.cpu().numpy())

        epoch_val_loss = running_val_loss / len(test_loader.dataset)
        val_accuracy = correct_val / total_val
        val_losses.append(epoch_val_loss)
        val_accuracies.append(val_accuracy)

    fold_results.append({
        "fold": fold + 1,
        "train_loss": np.mean(train_losses),
        "val_loss": np.mean(val_losses),
        "train_accuracy": np.mean(train_accuracies),
        "val_accuracy": np.mean(val_accuracies),
    })

    print(f"Fold {fold + 1} Results:")
    print(f" Train: Loss = {np.mean(train_losses):.4f}, Accuracy = {np.mean(train_accuracies):.4f}")
    print(f" Validation: Loss = {np.mean(val_losses):.4f}, Accuracy = {np.mean(val_accuracies):.4f}")


Fold 1/5
Fold 1 Results:
 Train: Loss = 0.0755, Accuracy = 0.9724
 Validation: Loss = 0.1987, Accuracy = 0.9562
Fold 2/5
Fold 2 Results:
 Train: Loss = 0.1162, Accuracy = 0.9595
 Validation: Loss = 0.0773, Accuracy = 0.9736
Fold 3/5
Fold 3 Results:
 Train: Loss = 0.0960, Accuracy = 0.9674
 Validation: Loss = 0.0608, Accuracy = 0.9788
Fold 4/5
Fold 4 Results:
 Train: Loss = 0.1076, Accuracy = 0.9624
 Validation: Loss = 0.0714, Accuracy = 0.9761
Fold 5/5
Fold 5 Results:
 Train: Loss = 0.0956, Accuracy = 0.9662
 Validation: Loss = 0.0538, Accuracy = 0.9834


In [None]:
# K-Fold Cross-Validation
k_folds = 5
kf = KFold(n_splits=k_folds, shuffle=True, random_state=42)
fold_results = []

# Define batch size and learning rate
batch_size = 16
learning_rate = 0.0005

for fold, (train_indices, val_indices) in enumerate(kf.split(dataset)):
    print(f"Fold {fold + 1}/{k_folds}")

    # Subsets for the current fold
    train_dataset = Subset(dataset, train_indices)
    test_dataset = Subset(dataset, test_indices)
    val_subset = Subset(dataset, val_indices)

    # Create DataLoaders for the current fold
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
    val_loader = DataLoader(val_subset, batch_size=batch_size, shuffle=False)

    # Instantiate a fresh model for each fold
    model = CNNClassifier(num_classes=7).to(device)
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    criterion = nn.CrossEntropyLoss()

    # Train the model with per-epoch performance logging
    train_losses, val_losses, train_accuracies, val_accuracies = [], [], [], []

    for epoch in range(num_epochs):
        model.train()
        running_loss, correct_train, total_train = 0.0, 0, 0

        for images, labels in train_loader:
            images, labels = images.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(images)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            running_loss += loss.item() * images.size(0)
            _, predicted = torch.max(outputs, 1)
            correct_train += (predicted == labels).sum().item()
            total_train += labels.size(0)

        epoch_train_loss = running_loss / len(train_loader.dataset)
        train_accuracy = correct_train / total_train
        train_losses.append(epoch_train_loss)
        train_accuracies.append(train_accuracy)

        # Validation loop
        model.eval()
        running_val_loss, correct_val, total_val = 0.0, 0, 0
        all_labels, all_predictions = [], []

        with torch.no_grad():
            for images, labels in test_loader:
                images, labels = images.to(device), labels.to(device)
                outputs = model(images)
                loss = criterion(outputs, labels)
                running_val_loss += loss.item() * images.size(0)
                _, predicted = torch.max(outputs, 1)

                correct_val += (predicted == labels).sum().item()
                total_val += labels.size(0)
                all_labels.extend(labels.cpu().numpy())
                all_predictions.extend(predicted.cpu().numpy())

        epoch_val_loss = running_val_loss / len(test_loader.dataset)
        val_accuracy = correct_val / total_val
        val_losses.append(epoch_val_loss)
        val_accuracies.append(val_accuracy)

    fold_results.append({
        "fold": fold + 1,
        "train_loss": np.mean(train_losses),
        "val_loss": np.mean(val_losses),
        "train_accuracy": np.mean(train_accuracies),
        "val_accuracy": np.mean(val_accuracies),
    })

    print(f"Fold {fold + 1} Results:")
    print(f" Train: Loss = {np.mean(train_losses):.4f}, Accuracy = {np.mean(train_accuracies):.4f}")
    print(f" Validation: Loss = {np.mean(val_losses):.4f}, Accuracy = {np.mean(val_accuracies):.4f}")


Fold 1/5
Fold 1 Results:
 Train: Loss = 0.1169, Accuracy = 0.9578
 Validation: Loss = 0.2334, Accuracy = 0.9476
Fold 2/5
Fold 2 Results:
 Train: Loss = 0.0860, Accuracy = 0.9692
 Validation: Loss = 0.0608, Accuracy = 0.9791
Fold 3/5
Fold 3 Results:
 Train: Loss = 0.1225, Accuracy = 0.9571
 Validation: Loss = 0.0851, Accuracy = 0.9709
Fold 4/5
Fold 4 Results:
 Train: Loss = 0.0875, Accuracy = 0.9703
 Validation: Loss = 0.0513, Accuracy = 0.9836
Fold 5/5
Fold 5 Results:
 Train: Loss = 0.0944, Accuracy = 0.9671
 Validation: Loss = 0.0531, Accuracy = 0.9820


In [None]:
# Calculate average metrics across all folds
avg_results = {
    "train_loss": np.mean([result["train_loss"] for result in fold_results]),
    "val_loss": np.mean([result["val_loss"] for result in fold_results]),
    "train_accuracy": np.mean([result["train_accuracy"] for result in fold_results]),
    "val_accuracy": np.mean([result["val_accuracy"] for result in fold_results]),
}

print("\nFinal K-Fold Results (Averaged):")
print(f" Train Loss: {avg_results['train_loss']:.4f}")
print(f" Validation Loss: {avg_results['val_loss']:.4f}")
print(f" Train Accuracy: {avg_results['train_accuracy']:.4f}")
print(f" Validation Accuracy: {avg_results['val_accuracy']:.4f}")


Final K-Fold Results (Averaged):
 Train Loss: 0.1014
 Validation Loss: 0.0967
 Train Accuracy: 0.9643
 Validation Accuracy: 0.9726


# Exploratory Experiments