In [10]:
import os
import torch
import torchaudio
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
import torchaudio.transforms as T
from sklearn.metrics import classification_report
import hashlib


In [2]:
from torchaudio.datasets import SPEECHCOMMANDS

class SubsetSC(SPEECHCOMMANDS):
    def __init__(self, subset: str = None):
        super().__init__("./", download=True)
        def load_list(filename):
            filepath = os.path.join(self._path, filename)
            with open(filepath) as f:
                return [os.path.join(self._path, line.strip()) for line in f]
        if subset == "validation":
            self._walker = load_list("validation_list.txt")
        elif subset == "testing":
            self._walker = load_list("testing_list.txt")
        elif subset == "training":
            excludes = load_list("validation_list.txt") + load_list("testing_list.txt")
            self._walker = [w for w in self._walker if w not in excludes]

# Create train, validation, and test datasets
train_set = SubsetSC("training")
val_set = SubsetSC("validation")
test_set = SubsetSC("testing")


100%|██████████| 2.26G/2.26G [00:29<00:00, 81.4MB/s]


In [3]:
# Extract unique labels from the dataset
unique_labels = sorted(set([sample[2] for sample in train_set]))  # sample[2] is the label
label_to_index = {label: index for index, label in enumerate(unique_labels)}

# Number of unique labels
num_classes = len(unique_labels)
print(f"Number of classes: {num_classes}")


Number of classes: 35


In [11]:
# Define MelSpectrogram transformation
transform = T.MelSpectrogram(sample_rate=16000, n_mels=64)

# Define a fixed size for the spectrograms (e.g., 128 time frames)
fixed_width = 128

# Function to truncate or pad spectrograms to a fixed width
def pad_or_truncate(spectrogram, max_width=fixed_width):
    _, n_mels, time_steps = spectrogram.shape
    if time_steps > max_width:
        return spectrogram[:, :, :max_width]  # Truncate
    elif time_steps < max_width:
        pad_amount = max_width - time_steps
        padding = torch.zeros(1, n_mels, pad_amount)
        return torch.cat((spectrogram, padding), dim=2)  # Pad
    return spectrogram

# Custom dataset to preprocess on-the-fly, truncate/pad, and encode labels as integers
class PreprocessDataset(torch.utils.data.Dataset):
    def __init__(self, dataset, transform=None, label_to_index=None, max_width=fixed_width):
        self.dataset = dataset
        self.transform = transform
        self.label_to_index = label_to_index
        self.max_width = max_width

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        sample = self.dataset[idx]
        waveform, _, label, *_ = sample
        if self.transform:
            spectrogram = self.transform(waveform)
            spectrogram = pad_or_truncate(spectrogram, max_width=self.max_width)  # Truncate or pad
        label_index = self.label_to_index[label]
        return spectrogram, label_index

# Apply the transformation and label encoding to datasets
train_data = PreprocessDataset(train_set, transform=transform, label_to_index=label_to_index)
val_data = PreprocessDataset(val_set, transform=transform, label_to_index=label_to_index)
test_data = PreprocessDataset(test_set, transform=transform, label_to_index=label_to_index)

# DataLoader
train_loader = DataLoader(train_data, batch_size=32, shuffle=True)
val_loader = DataLoader(val_data, batch_size=32)
test_loader = DataLoader(test_data, batch_size=32)


In [42]:
train_loader = DataLoader(train_data, batch_size=32, shuffle=True)


In [54]:
class CNNModel(nn.Module):
    def __init__(self, num_classes=35):
        super(CNNModel, self).__init__()
        self.conv1 = nn.Conv2d(1, 32, kernel_size=3, stride=1, padding=1)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1)
        self.conv3 = nn.Conv2d(64, 64, kernel_size=3, stride=1, padding=1)
        self.pool = nn.MaxPool2d(2, 2)
        self.relu = nn.ReLU()

        self.global_pool = nn.AdaptiveAvgPool2d((1, 1))

        self.fc1 = nn.Linear(64, 128)
        self.fc2 = nn.Linear(128, num_classes)

    def forward(self, x):
        x = self.pool(self.relu(self.conv1(x)))
        x = self.pool(self.relu(self.conv2(x)))
        x = self.pool(self.relu(self.conv3(x)))

        x = self.global_pool(x)
        print(f"After global pooling: {x.shape}")  # Debugging

        x = x.view(x.size(0), -1)
        print(f"After flattening: {x.shape}")  # Debugging

        x = self.relu(self.fc1(x))
        x = self.fc2(x)  # Final output shape: (batch_size, num_classes)

        print(f"Output from fc2: {x.shape}")
        print(f"Output values from fc2: {x}")  # Print the actual values

        return x


In [52]:
outputs = model(inputs)
print(f"Model output shape: {outputs.shape}")


Model output shape: torch.Size([64, 35])


In [59]:
import torch
import torch.nn as nn
import torch.optim as optim

# Assuming the CNNModel class and other code is already defined

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Initialize model, criterion, and optimizer
model = CNNModel(num_classes=35).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Define the training function
def train(model, train_loader, val_loader, criterion, optimizer, epochs=10):
    model.train()
    for epoch in range(epochs):
        running_loss = 0.0
        for inputs, labels in train_loader:
            inputs, labels = inputs.to(device), labels.to(device)

            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()

        print(f"Epoch {epoch+1}/{epochs}, Training Loss: {running_loss/len(train_loader)}")

        # Validation
        model.eval()
        val_loss = 0.0
        correct = 0
        total = 0
        with torch.no_grad():
            for inputs, labels in val_loader:
                inputs, labels = inputs.to(device), labels.to(device)
                outputs = model(inputs)
                loss = criterion(outputs, labels)
                val_loss += loss.item()

                _, predicted = torch.max(outputs.data, 1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()

        val_accuracy = 100 * correct / total
        print(f"Epoch {epoch+1}/{epochs}, Validation Loss: {val_loss/len(val_loader)}, Validation Accuracy: {val_accuracy:.2f}%")

# Run the training
train(model, train_loader, val_loader, criterion, optimizer, epochs=10)


After global pooling: torch.Size([32, 64, 1, 1])
After flattening: torch.Size([32, 64])
Output from fc2: torch.Size([32, 35])
Output values from fc2: tensor([[ 0.1134,  0.0508, -0.0427,  ..., -0.1114,  0.1049, -0.0124],
        [ 0.1919,  0.0596, -0.1881,  ..., -0.1650,  0.1224, -0.0362],
        [ 1.4176,  0.2153, -1.9667,  ..., -1.0206,  0.5447, -0.3481],
        ...,
        [ 0.4422,  0.1064, -0.5258,  ..., -0.3094,  0.2945, -0.1570],
        [ 0.0815,  0.0175,  0.0121,  ..., -0.1010,  0.0842, -0.0200],
        [ 1.2559,  0.1441, -1.8273,  ..., -0.9603,  0.4239, -0.4112]],
       grad_fn=<AddmmBackward0>)
After global pooling: torch.Size([32, 64, 1, 1])
After flattening: torch.Size([32, 64])
Output from fc2: torch.Size([32, 35])
Output values from fc2: tensor([[ 0.0505,  0.0123, -0.0709,  ..., -0.1696,  0.1085,  0.0116],
        [ 0.0649,  0.0075,  0.0078,  ..., -0.1194,  0.0978, -0.0212],
        [ 0.0638,  0.0084, -0.0013,  ..., -0.1291,  0.1016, -0.0157],
        ...,
        [ 

KeyboardInterrupt: 

In [36]:
# Function to evaluate the model
def evaluate(model, test_loader):
    model.eval()
    correct = 0
    total = 0
    all_labels = []
    all_preds = []

    with torch.no_grad():
        for batch in test_loader:
            inputs, labels = batch  # Unpack the tuple
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

            all_labels.extend(labels.cpu().numpy())
            all_preds.extend(predicted.cpu().numpy())

    accuracy = 100 * correct / total
    print(f"Test Accuracy: {accuracy:.2f}%")

    return all_labels, all_preds

# Evaluate the model
all_labels, all_preds = evaluate(model, test_loader)

# Classification report
print(classification_report(all_labels, all_preds, target_names=unique_labels))


RuntimeError: The size of tensor a (64) must match the size of tensor b (32) at non-singleton dimension 0