# Speech Commands Classification with ResNet (detecting silence)

Authors: Jakub Borek, Bartosz Dybowski

Binary speech / non-speech (silence) detector built on a pre-trained ResNet-18.

## Import

In [None]:
!pip install matplotlib
!pip install scikit-learn
!pip install soundfile

import os
import random
import torch
import torch.nn as nn
import torch.optim as optim
import torchaudio
from torch.utils.data import DataLoader
from torchvision import models
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import numpy as np

## Settings

In [None]:
# Run configuration for the silence-detector experiment.
epochs = 3              # number of passes over the training split
batch_size = 256        # dataset samples requested per DataLoader batch
learning_rate = 1e-3    # learning rate handed to the Adam optimizer
use_all_classes = 2     # NOTE(review): not referenced in the visible code -- confirm before removing

# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

## Dataset

In [None]:
class SubsetSC(torchaudio.datasets.SPEECHCOMMANDS):
    """Speech Commands dataset narrowed to one of its official splits.

    The parent dataset is constructed with ``root="./SpeechCommands"`` and
    ``download=True``; its file list (``self._walker``) is then replaced
    according to ``subset``:

    * ``"validation"`` -- the files named in ``validation_list.txt``
    * ``"testing"`` -- the files named in ``testing_list.txt``
    * ``"training"`` -- every file *not* named in either of those lists
    * any other value (including ``None``) -- the parent's list is kept as is
    """

    def __init__(self, subset: str = None):
        super().__init__(root="./SpeechCommands", download=True)

        def load_list(filename):
            """Return the normalized file paths named in one split file."""
            filepath = os.path.join(self._path, filename)
            # Pin the encoding so reading does not depend on the platform's
            # default locale encoding.
            with open(filepath, encoding="utf-8") as f:
                return [
                    os.path.normpath(os.path.join(self._path, line.strip()))
                    for line in f
                    # A blank line would otherwise turn into the dataset
                    # directory itself and end up in the file list.
                    if line.strip()
                ]

        if subset == "validation":
            self._walker = load_list("validation_list.txt")
        elif subset == "testing":
            self._walker = load_list("testing_list.txt")
        elif subset == "training":
            excludes = set(load_list("validation_list.txt"))
            excludes.update(load_list("testing_list.txt"))
            # The exclusion set holds normalized paths, so normalize each
            # walker entry for the lookup as well; otherwise a purely cosmetic
            # difference (e.g. a leading "./") would leak validation/testing
            # files into the training split. The stored entries are unchanged.
            self._walker = [
                w for w in self._walker if os.path.normpath(w) not in excludes
            ]

# Waveform -> mel-spectrogram front end, configured for 16 kHz audio and 64 mel bins.
transform = torchaudio.transforms.MelSpectrogram(
    n_mels=64,
    sample_rate=16000,
)

# Class vocabulary: ten command words followed by the 'unknown' and 'silence' classes.
labels = [
    'yes', 'no', 'up', 'down', 'left',
    'right', 'on', 'off', 'stop', 'go',
    'unknown', 'silence',
]
# Lookup from class name to its position in ``labels``.
label_to_index = dict(zip(labels, range(len(labels))))

# Background noise
# Load every .wav clip from the dataset's background-noise folder (when the
# folder exists) so it can later be used as a source of non-speech samples.
background_dir = "./SpeechCommands/SpeechCommands/speech_commands_v0.02/_background_noise_"
background_noises = []
if os.path.exists(background_dir):
    print("Found background noises.")
    wav_names = (entry for entry in os.listdir(background_dir) if entry.endswith('.wav'))
    for wav_name in wav_names:
        # The sample rate returned by the loader is not used here.
        noise_waveform, _noise_rate = torchaudio.load(os.path.join(background_dir, wav_name))
        # Drop the leading dimension when it has size 1.
        background_noises.append(noise_waveform.squeeze(0))

# Collate function
# Labels that count as speech (target 1); every other accepted label is target 0.
_SPEECH_WORDS = frozenset(
    {'yes', 'no', 'up', 'down', 'left', 'right', 'on', 'off', 'stop', 'go'}
)


def _fit_time_axis(spec, width):
    """Crop or zero-pad the last axis of ``spec`` to exactly ``width`` frames."""
    frames = spec.shape[-1]
    if frames > width:
        # Slice the same (last) axis that the length check and the padding use.
        return spec[..., :width]
    if frames < width:
        return torch.nn.functional.pad(spec, (0, width - frames))
    return spec


def collate_fn(batch, silence_probability=0.1):
    """Turn raw dataset samples into a speech / non-speech training batch.

    Each sample whose label appears in the module-level ``labels`` list is
    converted with the module-level ``transform`` and cropped or zero-padded
    to 128 frames along the last axis; samples with any other label are
    dropped. If ``background_noises`` is non-empty, additional non-speech
    examples are appended by cutting random 16000-sample windows out of
    randomly chosen noise clips (clips shorter than that are zero-padded).

    Args:
        batch: Iterable of dataset samples shaped like
            ``(waveform, sample_rate, label, ...)``; extra fields are ignored.
        silence_probability: Ratio of noise examples to add, relative to the
            number of kept samples: ``int(kept * silence_probability)`` clips
            are appended. Despite the name it is not a per-sample probability.

    Returns:
        A 3-tuple ``(spectrograms, targets, placeholder)``:

        * ``spectrograms`` -- the stacked spectrograms, one per example
        * ``targets`` -- integer tensor, ``1`` for the ten command words and
          ``0`` for everything else (``'unknown'``/``'silence'`` labels and the
          injected noise clips)
        * ``placeholder`` -- always ``torch.empty(0)``; kept so callers can
          unpack three values

        When no example survives, all three elements are ``torch.empty(0)``.
    """
    max_len = 128
    silence_duration_samples = 16000

    tensors, targets_silence = [], []

    for waveform, _sample_rate, label, *_ in batch:
        if label not in labels:
            continue
        spec = transform(waveform).squeeze(0)
        tensors.append(_fit_time_axis(spec, max_len))
        targets_silence.append(1 if label in _SPEECH_WORDS else 0)

    if background_noises:
        num_silence = int(len(tensors) * silence_probability)
        for _ in range(num_silence):
            noise = random.choice(background_noises)
            if noise.size(0) >= silence_duration_samples:
                # Pick a random fixed-length window inside the clip.
                start = random.randint(0, noise.size(0) - silence_duration_samples)
                silence_waveform = noise[start:start + silence_duration_samples]
            else:
                silence_waveform = torch.nn.functional.pad(
                    noise, (0, silence_duration_samples - noise.size(0))
                )
            silence_spec = transform(silence_waveform.unsqueeze(0)).squeeze(0)
            tensors.append(_fit_time_axis(silence_spec, max_len))
            targets_silence.append(0)

    if not tensors:
        return torch.empty(0), torch.empty(0), torch.empty(0)

    return torch.stack(tensors), torch.tensor(targets_silence), torch.empty(0)

## Model

In [None]:
class SilenceDetector(nn.Module):
    """ResNet-18 adapted to single-channel input with a two-way output.

    The network is created with ``pretrained=True``; its first convolution is
    replaced by a one-input-channel layer and its final fully connected layer
    by a two-output ``nn.Linear``.
    """

    def __init__(self):
        super().__init__()
        # NOTE(review): newer torchvision releases prefer the ``weights=``
        # argument over ``pretrained=True`` -- confirm against the installed version.
        self.resnet = models.resnet18(pretrained=True)

        rgb_stem = self.resnet.conv1
        mono_stem = nn.Conv2d(1, 64, kernel_size=(7,7), stride=(2,2), padding=(3,3), bias=False)
        # Seed the single-channel stem from the pretrained RGB filters instead
        # of leaving it randomly initialised. Summing the kernels over the
        # input-channel axis gives the same response the original stem would
        # produce for an image whose three channels are all equal to the input.
        with torch.no_grad():
            mono_stem.weight.copy_(rgb_stem.weight.sum(dim=1, keepdim=True))
        self.resnet.conv1 = mono_stem

        # New classification head with two outputs; it starts untrained.
        self.resnet.fc = nn.Linear(self.resnet.fc.in_features, 2)

    def forward(self, x):
        """Return the two-way logits produced by the wrapped ResNet for ``x``."""
        return self.resnet(x)

# DataLoader
# One loader per split; only the training split is shuffled.
_loader_options = dict(
    batch_size=batch_size,
    collate_fn=collate_fn,
    num_workers=8,
    pin_memory=True,
)
train_loader = DataLoader(SubsetSC("training"), shuffle=True, **_loader_options)
val_loader = DataLoader(SubsetSC("validation"), shuffle=False, **_loader_options)
test_loader = DataLoader(SubsetSC("testing"), shuffle=False, **_loader_options)

# Model, criterion, optimizer
silence_detector = SilenceDetector()
silence_detector = silence_detector.to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(silence_detector.parameters(), lr=learning_rate)

# Training
# Per-epoch history, later used for the plots.
train_losses = []
train_accs = []
val_accs = []
test_accs = []

def evaluate(model, loader):
    """Return the classification accuracy of ``model`` over ``loader``.

    The model is switched to evaluation mode and run without gradient
    tracking. Batches whose input tensor is empty are skipped. Inputs and
    targets are moved to the module-level ``device`` and the inputs get an
    extra dimension inserted at position 1 before the forward pass.

    Args:
        model: Module that maps a batch of inputs to per-class scores.
        loader: Iterable yielding ``(inputs, targets, _)`` triples.

    Returns:
        The fraction of correctly classified samples, or ``0.0`` when no
        sample was scored (empty loader or only empty batches).
    """
    model.eval()
    correct, total = 0, 0
    with torch.no_grad():
        for inputs, targets, _ in loader:
            if inputs.numel() == 0:
                continue
            inputs, targets = inputs.to(device), targets.to(device)
            inputs = inputs.unsqueeze(1)
            predicted = model(inputs).argmax(dim=1)
            correct += (predicted == targets).sum().item()
            total += targets.size(0)
    # Nothing scored: report 0.0 rather than dividing by zero.
    if total == 0:
        return 0.0
    return correct / total

def train(model, loader):
    """Run one training epoch and record its mean loss and accuracy.

    Uses the module-level ``optimizer``, ``criterion`` and ``device``. The
    epoch's mean loss is appended to ``train_losses`` and its accuracy to
    ``train_accs``. Batches whose input tensor is empty are skipped and do
    not count towards the mean loss.

    NOTE(review): the optimizer is created elsewhere; ``model`` is presumably
    the module whose parameters it was built from -- confirm at the call site.

    Args:
        model: Module to train; switched to training mode.
        loader: Iterable yielding ``(inputs, targets, _)`` triples.

    Returns:
        None. If no batch was processed, ``nan`` is recorded as the loss and
        ``0.0`` as the accuracy instead of raising ``ZeroDivisionError``.
    """
    model.train()
    running_loss, correct, total = 0.0, 0, 0
    batches_seen = 0
    for inputs, targets, _ in loader:
        if inputs.numel() == 0:
            continue
        inputs, targets = inputs.to(device), targets.to(device)
        inputs = inputs.unsqueeze(1)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
        batches_seen += 1
        predicted = outputs.argmax(dim=1)
        correct += (predicted == targets).sum().item()
        total += targets.size(0)
    # Average over the batches that actually contributed a loss value.
    train_losses.append(running_loss / batches_seen if batches_seen else float("nan"))
    train_accs.append(correct / total if total else 0.0)

# Main training loop
# Each epoch: one training pass, then accuracy on the validation and testing
# loaders (in that order), stored in the history lists and printed.
for epoch in range(epochs):
    train(silence_detector, train_loader)

    val_acc, test_acc = (
        evaluate(silence_detector, split_loader)
        for split_loader in (val_loader, test_loader)
    )
    val_accs.append(val_acc)
    test_accs.append(test_acc)

    print(f"Epoch {epoch+1}/{epochs}: Train Loss {train_losses[-1]:.4f}, Train Acc {train_accs[-1]:.4f}, Val Acc {val_acc:.4f}, Test Acc {test_acc:.4f}")

## Analysis

In [None]:
# Loss plot: mean training loss per epoch.
plt.figure()
plt.plot(train_losses, label='Train Loss')
plt.title('Training Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()
plt.show()

# Accuracy plot: one curve per split, drawn on a shared figure.
plt.figure()
for history, curve_label in (
    (train_accs, 'Train Accuracy'),
    (val_accs, 'Validation Accuracy'),
    (test_accs, 'Test Accuracy'),
):
    plt.plot(history, label=curve_label)
plt.title('Training/Validation/Test Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()
plt.show()

# Confusion Matrix
# Collect predictions and targets over the testing loader, skipping empty batches.
silence_detector.eval()
all_preds, all_labels = [], []
with torch.no_grad():
    for inputs, targets, _ in test_loader:
        if inputs.numel() == 0:
            continue
        inputs = inputs.to(device).unsqueeze(1)
        preds = silence_detector(inputs).argmax(dim=1)
        all_preds.extend(preds.cpu().tolist())
        all_labels.extend(targets.tolist())

# Pin both classes explicitly: without ``labels`` the matrix only covers the
# classes that actually occur, so a run with a single class present would give
# a 1x1 matrix that does not match the two display labels below.
cm = confusion_matrix(all_labels, all_preds, labels=[0, 1])
ConfusionMatrixDisplay(cm, display_labels=['Non-speech', 'Speech']).plot()
plt.title('Confusion Matrix - Silence Detector')
plt.show()