# Speech Commands Classification with Transformer

Authors: Jakub Borek, Bartosz Dybowski

An MLP classifier trained on mean-pooled embeddings from the pre-trained Wav2Vec 2.0 model.

In [None]:
import librosa
from transformers import Wav2Vec2Processor, Wav2Vec2Model
import torch
import os
import numpy as np
from sklearn.model_selection import train_test_split
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm

In [None]:
# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [None]:
# Root folders of the training and test audio, relative to the working directory.
# TRAIN_AUDIO_DIR is scanned for one sub-folder per label; TEST_AUDIO_DIR is
# not referenced by the code visible in this notebook.
TRAIN_AUDIO_DIR = './data/train/train/audio'
TEST_AUDIO_DIR = './data/test/test/audio'

In [None]:
# The input processor and the encoder are loaded from the same pre-trained checkpoint.
_WAV2VEC_CHECKPOINT = "facebook/wav2vec2-base-960h"

processor = Wav2Vec2Processor.from_pretrained(_WAV2VEC_CHECKPOINT)

# The encoder is only ever called under torch.no_grad() to produce embeddings,
# so it is put into evaluation mode right after loading.
model = Wav2Vec2Model.from_pretrained(_WAV2VEC_CHECKPOINT).eval()

model.to(device)

In [None]:
def load_wav_16k_mono(path):
    """Load the audio file at ``path`` and return its waveform at 16 kHz.

    Only the samples are returned; the sampling rate reported by
    ``librosa.load`` is discarded because it is fixed by the ``sr`` argument.
    All other ``librosa.load`` settings are left at their defaults.
    """
    waveform, _ = librosa.load(path, sr=16000)
    return waveform

def parallel_load_audio(file_paths, max_workers=8):
    """Load every file in ``file_paths`` using a pool of worker threads.

    Args:
        file_paths: Sized sequence of audio file paths.
        max_workers: Number of threads in the pool.

    Returns:
        A list with one waveform per path, in the order of ``file_paths``.
        A tqdm progress bar tracks the loading.
    """
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        # ``map`` yields results in input order even though the work is concurrent.
        pending = pool.map(load_wav_16k_mono, file_paths)
        progress = tqdm(pending, total=len(file_paths), desc="Loading Audio Files")
        return list(progress)

def extract_wav2vec_embeddings(file_paths, batch_size=4, num_workers=8):
    """Compute one mean-pooled Wav2Vec2 embedding per audio file.

    All files are loaded first (in parallel), then encoded in mini-batches by
    the module-level ``processor`` / ``model`` pair on ``device``.

    Args:
        file_paths: Sized sequence of audio file paths.
        batch_size: Number of clips encoded per forward pass.
        num_workers: Number of threads used while loading the audio.

    Returns:
        A 2-D NumPy array with one row per input file, in input order. For an
        empty ``file_paths`` the result has shape ``(0, hidden_size)`` instead
        of raising.
    """
    if len(file_paths) == 0:
        # np.vstack raises ValueError on an empty list, so answer directly with
        # an empty matrix that still has the encoder's feature width.
        return np.empty((0, model.config.hidden_size), dtype=np.float32)

    audios = parallel_load_audio(file_paths, max_workers=num_workers)

    embeddings = []
    for i in tqdm(range(0, len(audios), batch_size), desc="Extracting Embeddings"):
        batch = audios[i:i+batch_size]

        inputs = processor(batch, return_tensors="pt", padding=True, sampling_rate=16000).input_values.to(device)

        # The encoder is a fixed feature extractor here; no gradients are needed.
        with torch.no_grad():
            outputs = model(inputs).last_hidden_state  # shape (batch_size, time, features)

        # Average over the time axis to get one vector per clip.
        # NOTE(review): clips shorter than the longest one in the batch are
        # padded by the processor, and the padded frames take part in this
        # mean - confirm that this is acceptable for the dataset.
        embeddings.append(outputs.mean(dim=1).cpu().numpy())

    return np.vstack(embeddings)

In [None]:
def get_label_names(audio_dir):
    """Return the sorted class labels found directly under ``audio_dir``.

    A label is a sub-directory of ``audio_dir`` whose name does not start with
    an underscore. Plain files (README, ``.DS_Store``, ...) are ignored so they
    cannot be mistaken for a class and shift the label indices.

    Args:
        audio_dir: Directory containing one sub-directory per class.

    Returns:
        Alphabetically sorted list of label names.
    """
    return [
        entry
        for entry in sorted(os.listdir(audio_dir))
        if not entry.startswith('_')
        and os.path.isdir(os.path.join(audio_dir, entry))
    ]

In [None]:
# Class labels discovered in the training directory. The position of a label in
# this list is the integer index used for it by the later cells.
label_names_list = get_label_names(TRAIN_AUDIO_DIR)
print(label_names_list)

In [None]:
def load_file_paths_and_labels(audio_dir, max_per_class=20000):
    """Collect the ``.wav`` files of every known class together with their labels.

    Classes come from the module-level ``label_names_list``; the integer label
    of a file is the position of its class in that list. Class folders that do
    not exist under ``audio_dir`` are skipped.

    Args:
        audio_dir: Directory containing one sub-directory per class.
        max_per_class: Maximum number of ``.wav`` files taken from each class.

    Returns:
        A ``(file_paths, labels)`` pair: a list of paths and a NumPy array of
        integer labels of the same length.
    """
    file_paths, labels = [], []
    label_names = label_names_list
    label_to_index = {label: idx for idx, label in enumerate(label_names)}
    # A non-positive limit selects nothing rather than slicing from the end.
    limit = max(max_per_class, 0)

    for label in label_names:
        folder = os.path.join(audio_dir, label)
        if not os.path.isdir(folder):
            continue
        # os.listdir returns entries in arbitrary order; sorting keeps the file
        # list (and therefore any seeded split of it) reproducible.
        wav_names = sorted(
            fname for fname in os.listdir(folder) if fname.endswith(".wav")
        )
        # The cap counts .wav files only, so unrelated directory entries
        # cannot use up part of the budget.
        for fname in wav_names[:limit]:
            file_paths.append(os.path.join(folder, fname))
            labels.append(label_to_index[label])
    return file_paths, np.array(labels)

In [None]:
# Gather every training file together with its integer class label.
file_paths, labels = load_file_paths_and_labels(TRAIN_AUDIO_DIR)
print(f"Total files: {len(file_paths)}")
print(f"Labels array shape: {labels.shape}")

# First split: keep 70% for training and set 30% aside, stratified by label.
X_train_paths, X_temp_paths, y_train, y_temp = train_test_split(
    file_paths, labels, test_size=0.3, random_state=42, stratify=labels
)

# Second split: cut the held-out 30% in half, giving validation and test
# sets of roughly 15% each, again stratified by label.
X_val_paths, X_test_paths, y_val, y_test = train_test_split(
    X_temp_paths, y_temp, test_size=0.5, random_state=42, stratify=y_temp
)

print(f"Train set: {len(X_train_paths)} files")
print(f"Val set: {len(X_val_paths)} files")
print(f"Test set: {len(X_test_paths)} files")


In [None]:
# Encode the three splits one after another (train, validation, test).
X_train_embed, X_val_embed, X_test_embed = (
    extract_wav2vec_embeddings(split_paths)
    for split_paths in (X_train_paths, X_val_paths, X_test_paths)
)

# Embeddings become float32 feature tensors ...
X_train_tensor, X_val_tensor, X_test_tensor = (
    torch.tensor(split_embed, dtype=torch.float32)
    for split_embed in (X_train_embed, X_val_embed, X_test_embed)
)

# ... and the integer class labels become long tensors.
y_train_tensor, y_val_tensor, y_test_tensor = (
    torch.tensor(split_labels, dtype=torch.long)
    for split_labels in (y_train, y_val, y_test)
)

In [None]:
batch_size = 64

# Pair each embedding tensor with the label tensor of the same split.
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
val_dataset = TensorDataset(X_val_tensor, y_val_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

# Only the training batches are shuffled; validation and test keep their order.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [None]:
class MLPClassifier(nn.Module):
    """Feed-forward classifier applied to fixed-size embedding vectors.

    Two hidden ``Linear -> ReLU -> Dropout(0.2)`` stages (``hidden_dim`` units,
    then 128 units) are followed by a linear layer with one output per class.

    Note that the default of ``num_classes`` is evaluated once, when this class
    statement runs, from the module-level ``y_train``.
    """

    def __init__(self, input_dim=768, hidden_dim=256, num_classes=len(np.unique(y_train))):
        super().__init__()
        layers = [
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(hidden_dim, 128),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(128, num_classes),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Return the raw class scores (logits) for the batch ``x``."""
        logits = self.net(x)
        return logits

# Build the classifier with its default sizes and move its parameters to the
# selected device.
model_clf = MLPClassifier()
model_clf.to(device)


In [None]:
# Per-epoch history filled in by the training loop and read by the plots.
train_losses, val_losses = [], []
train_accuracies, val_accuracies = [], []

In [None]:
# Train the classifier with Adam and cross-entropy, recording the loss and
# accuracy on the training and validation sets after every epoch.
optimizer = torch.optim.Adam(model_clf.parameters(), lr=1e-5)
criterion = nn.CrossEntropyLoss()
n_epochs = 50

n_train = len(train_loader.dataset)
n_val = len(val_loader.dataset)

for epoch in range(n_epochs):
    # ---- training pass (dropout active, weights updated) ----
    model_clf.train()
    train_loss, correct = 0.0, 0

    for xb, yb in train_loader:
        xb, yb = xb.to(device), yb.to(device)
        optimizer.zero_grad()
        preds = model_clf(xb)
        loss = criterion(preds, yb)
        loss.backward()
        optimizer.step()
        # The criterion returns the batch mean. Weighting it by the batch size
        # and dividing by the dataset size below gives a per-sample average,
        # which keeps train and validation losses on the same scale even
        # though the two loaders have different numbers of batches.
        train_loss += loss.item() * yb.size(0)
        correct += (preds.argmax(1) == yb).sum().item()

    train_loss /= n_train
    train_acc = correct / n_train

    # ---- validation pass (dropout off, no gradients) ----
    model_clf.eval()
    val_loss, val_correct = 0.0, 0
    with torch.no_grad():
        for xb, yb in val_loader:
            xb, yb = xb.to(device), yb.to(device)
            preds = model_clf(xb)
            loss = criterion(preds, yb)
            val_loss += loss.item() * yb.size(0)
            val_correct += (preds.argmax(1) == yb).sum().item()

    val_loss /= n_val
    val_acc = val_correct / n_val

    train_losses.append(train_loss)
    val_losses.append(val_loss)
    train_accuracies.append(train_acc)
    val_accuracies.append(val_acc)

    print(f"Epoch {epoch+1}: "
          f"Train Loss={train_loss:.3f}, Train Acc={train_acc:.4f} | "
          f"Val Loss={val_loss:.3f}, Val Acc={val_acc:.4f}")

In [None]:
import matplotlib.pyplot as plt

# One entry per panel: subplot position, the two curves with their legend
# labels, the y-axis label and the panel title.
panels = (
    (1, train_losses, 'Train Loss', val_losses, 'Validation Loss',
     'Loss', 'Loss over Epochs'),
    (2, train_accuracies, 'Train Accuracy', val_accuracies, 'Validation Accuracy',
     'Accuracy', 'Accuracy over Epochs'),
)

plt.figure(figsize=(12, 5))
for position, train_curve, train_label, val_curve, val_label, y_label, panel_title in panels:
    # Draw the training and validation curves of one metric side by side.
    plt.subplot(1, 2, position)
    plt.plot(train_curve, label=train_label)
    plt.plot(val_curve, label=val_label)
    plt.xlabel('Epoch')
    plt.ylabel(y_label)
    plt.title(panel_title)
    plt.legend()

plt.tight_layout()
plt.show()


In [None]:
# Score the whole test split in a single forward pass, with dropout disabled
# and without tracking gradients.
model_clf.eval()
test_inputs = X_test_tensor.to(device)
with torch.no_grad():
    outputs = model_clf(test_inputs)

# The predicted class is the index of the largest score in each row.
y_pred_test = outputs.argmax(dim=1).cpu().numpy()

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay

# Overall accuracy on the held-out test split.
test_acc = accuracy_score(y_test, y_pred_test)
print(f"Test Accuracy: {test_acc:.4f}")

# Pin the matrix to every known class index. Without ``labels`` the matrix is
# built only from classes that occur in y_test / y_pred_test, so a class that
# is missing from both would shrink it and misalign the display labels and
# the per-class report below.
class_indices = list(range(len(label_names_list)))
cm = confusion_matrix(y_test, y_pred_test, labels=class_indices)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=label_names_list)

# Per-class accuracy: correct predictions (diagonal) over the number of true
# samples of that class (row sum). A class without test samples yields NaN.
correct_per_class = cm.diagonal()
total_per_class = cm.sum(axis=1)
acc_per_class = correct_per_class / total_per_class

fig, ax = plt.subplots(figsize=(12, 12))
disp.plot(ax=ax, cmap='Blues', values_format='d')

plt.title("Confusion Matrix on Test Set")
plt.xticks(rotation=45, fontsize=8)
plt.yticks(fontsize=8)
plt.show()


for label, acc, n_samples in zip(label_names_list, acc_per_class, total_per_class):
    if n_samples == 0:
        # Nothing to average over; say so instead of printing "nan".
        print(f"Class '{label}': no test samples")
    else:
        print(f"Class '{label}': Accuracy = {acc:.4f}")