In [1]:
import os
import torch
from torch.utils.data import Dataset


class YesNoDataset(Dataset):
    def __init__(self, base_dir, test_list_path=None, return_label_name=False, train=True):
        self.filepaths = []
        self.labels = []
        self.label_to_index = {"no": 0, "yes": 1}
        self.index_to_label = {0: "no", 1: "yes"}
        self.return_label_name = return_label_name

        included_files = set()

        # Load filenames from testing_list.txt if provided
        if test_list_path and os.path.exists(test_list_path):
            with open(test_list_path, "r") as f:
                for line in f:
                    if line.startswith(("yes/", "no/")):
                        file_name = os.path.splitext(os.path.basename(line.strip()))[0] + ".pt"
                        included_files.add(file_name)

        for label in ["no", "yes"]:
            label_path = os.path.join(base_dir, label)
            if not os.path.isdir(label_path):
                continue
            for file in os.listdir(label_path):
                if file.endswith(".pt"):
                    if (train and file not in included_files) or (not train and file in included_files):
                        self.filepaths.append(os.path.join(label_path, file))
                        self.labels.append(self.label_to_index[label])

    def __len__(self):
        return len(self.filepaths)

    def __getitem__(self, idx):
        spectrogram = torch.load(self.filepaths[idx])
        label_index = self.labels[idx]
        return spectrogram, (
            self.index_to_label[label_index] if self.return_label_name else label_index
        )


In [2]:
def pad_collate(batch):
    specs, labels = zip(*batch)
    max_len = max(spec.shape[-1] for spec in specs)
    padded_specs = []

    for spec in specs:
        pad_len = max_len - spec.shape[-1]
        padded_spec = torch.nn.functional.pad(spec, (0, pad_len), value=0)
        padded_specs.append(padded_spec)

    return torch.stack(padded_specs), list(labels)

In [3]:
from torch.utils.data import DataLoader

base_path = "../data/processed/train/audio"
test_list_path = "../data/raw/train/testing_list.txt"

train_dataset = YesNoDataset(base_path, test_list_path=test_list_path, return_label_name=False)
train_loader = DataLoader(
    train_dataset, batch_size=16, shuffle=True, collate_fn=pad_collate
)
test_loader = DataLoader(
    YesNoDataset(base_path, test_list_path=test_list_path, return_label_name=False, train=False),
    batch_size=16, shuffle=False, collate_fn=pad_collate
)

for spectrograms, labels in train_loader:
    print("Spectrogram shape:", spectrograms.shape)
    print("Labels:", labels)
    break

Spectrogram shape: torch.Size([16, 128, 81])
Labels: [1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1]


  spectrogram = torch.load(self.filepaths[idx])


In [4]:
len(train_loader.dataset), len(test_loader.dataset)

(4244, 508)

In [5]:
yes_base_path = "../data/processed/train/audio/yes"
number_of_yes_files = len([f for f in os.listdir(yes_base_path) if f.endswith(".pt")])
print(f"Number of 'yes' files: {number_of_yes_files}")
no_base_path = "../data/processed/train/audio/no"
number_of_no_files = len([f for f in os.listdir(no_base_path) if f.endswith(".pt")])
print(f"Number of 'no' files: {number_of_no_files}")
print(f"we have removed {number_of_yes_files + number_of_no_files - 266*16} files from the train dataset to put them in the test set") 


Number of 'yes' files: 2377
Number of 'no' files: 2375
we have removed 496 files from the train dataset to put them in the test set


In [6]:
import sys
import os

sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "..")))

from models import CNNRNNClassifier


model = CNNRNNClassifier(
    num_classes=2, input_freq_bins=128
)  


In [7]:
import os
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
import torch.optim as optim
import sys
import os

sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "..")))

from models import CNNRNNClassifier
from mlflow_tracking.tracking_utils import (
    mlflow_run,
    log_metrics,
    log_learning_curves,
    log_confusion_matrix,
    log_checkpoint_model,
)

In [8]:
import os 
config = {
    "experiment_name": "speech-commands-yesno",
    "run_name": "cnn-rnn-v1",
    
    "registered_model_name": "YesNoClassifier",
    "num_epochs": 6,
    "learning_rate": 0.002,
    "batch_size": 32,
}


In [9]:
class EarlyStopper:
    def __init__(self, patience=5, min_delta=0.0):
        self.patience = patience
        self.min_delta = min_delta
        self.counter = 0
        self.best_score = None
        self.best_epoch = -1

    def should_stop(self, score, epoch):
        if self.best_score is None or score > self.best_score + self.min_delta:
            self.best_score = score
            self.best_epoch = epoch
            self.counter = 0
            return False
        else:
            self.counter += 1
            return self.counter >= self.patience

In [10]:
import time 

@mlflow_run(config)
def train():
    model = CNNRNNClassifier(num_classes=2, input_freq_bins=128)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=config["learning_rate"])
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    train_losses = []
    test_losses = []
    train_accuracies = []
    test_accuracies = []

    early_stopper = EarlyStopper(patience=5, min_delta=0.001)
    best_model_path = None

    for epoch in range(config["num_epochs"]):
        model.train()
        running_loss = 0.0
        correct = 0
        total = 0
        start_time = time.time()

        for X, y in train_loader:
            X, y = X.to(device), torch.tensor(y).to(device)
            optimizer.zero_grad()
            outputs = model(X)
            loss = criterion(outputs, y)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            _, predicted = outputs.max(1)
            correct += predicted.eq(y).sum().item()
            total += y.size(0)

        avg_loss = running_loss / len(train_loader)
        train_acc = correct / total
        elapsed_time = time.time() - start_time

        # 🔍 Evaluate on test set
        model.eval()
        test_correct = 0
        test_total = 0
        test_running_loss = 0.0
        with torch.no_grad():
            for X_test, y_test in test_loader:
                X_test, y_test = X_test.to(device), torch.tensor(y_test).to(device)
                out = model(X_test)
                loss = criterion(out, y_test)  
                test_running_loss += loss.item()  
                pred = out.argmax(dim=1)
                test_correct += pred.eq(y_test).sum().item()
                test_total += y_test.size(0)
        test_acc = test_correct / test_total
        test_loss = test_running_loss / len(test_loader) 

        test_losses.append(test_loss)
        train_losses.append(avg_loss)
        train_accuracies.append(train_acc)
        test_accuracies.append(test_acc)

        log_metrics(
            {
                "loss": avg_loss,
                "test_loss": test_loss,
                "accuracy": train_acc,
                "test_accuracy": test_acc,
                "epoch_time_sec": elapsed_time,
            },
            step=epoch,
        )

        print(f"Epoch {epoch+1}/{config['num_epochs']}, Time: {elapsed_time:.2f}s")
        print(f"Train Acc: {train_acc:.4f}, Test Acc: {test_acc:.4f}")

        # 💾 Save best model so far

        best_model_path = log_checkpoint_model(model, epoch)

        # ⏹️ Early stopping
        if early_stopper.should_stop(test_acc, epoch):
            print(
                f"Early stopping at epoch {epoch+1} — no improvement for {early_stopper.patience} epochs."
            )
            break

    log_learning_curves(
        train_metrics={"loss": train_losses, "accuracy": train_accuracies},
        val_metrics={"loss": test_losses, "accuracy": test_accuracies},
    )

    return {"model": model, "best_model_path": best_model_path}

In [11]:
train()

  spectrogram = torch.load(self.filepaths[idx])


Epoch 1/6, Time: 32.85s
Train Acc: 0.8742, Test Acc: 0.6654
Epoch 2/6, Time: 33.49s
Train Acc: 0.9444, Test Acc: 0.5846
Epoch 3/6, Time: 31.96s
Train Acc: 0.9533, Test Acc: 0.6260
Epoch 4/6, Time: 32.56s
Train Acc: 0.9621, Test Acc: 0.8406
Epoch 5/6, Time: 32.26s
Train Acc: 0.9720, Test Acc: 0.8504
Epoch 6/6, Time: 31.93s
Train Acc: 0.9696, Test Acc: 0.8425


Registered model 'YesNoClassifier' already exists. Creating a new version of this model...
Created version '2' of model 'YesNoClassifier'.


{'model': CNNRNNClassifier(
   (cnn): Sequential(
     (0): Conv2d(1, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
     (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
     (2): ReLU()
     (3): MaxPool2d(kernel_size=(2, 2), stride=(2, 2), padding=0, dilation=1, ceil_mode=False)
     (4): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
     (5): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
     (6): ReLU()
     (7): MaxPool2d(kernel_size=(2, 2), stride=(2, 2), padding=0, dilation=1, ceil_mode=False)
   )
   (rnn): GRU(2048, 128, batch_first=True, bidirectional=True)
   (classifier): Sequential(
     (0): Linear(in_features=256, out_features=128, bias=True)
     (1): ReLU()
     (2): Dropout(p=0.3, inplace=False)
     (3): Linear(in_features=128, out_features=2, bias=True)
   )
 ),
 'best_model_path': 'checkpoints\\model_epoch_5.pth'}