In [1]:
import os
import torch
from torch.utils.data import Dataset


class YesNoDataset(Dataset):
    def __init__(self, base_dir, test_list_path=None, return_label_name=False):
        self.filepaths = []
        self.labels = []
        self.label_to_index = {"no": 0, "yes": 1}
        self.index_to_label = {0: "no", 1: "yes"}
        self.return_label_name = return_label_name

        # Prepare exclusion set from testing_list.txt
        excluded_filenames = set()
        if test_list_path and os.path.exists(test_list_path):
            with open(test_list_path, "r") as f:
                for line in f:
                    if line.startswith(("yes/", "no/")):
                        file_name = (
                            os.path.splitext(os.path.basename(line.strip()))[0] + ".pt"
                        )
                        excluded_filenames.add(file_name)

        # Load filepaths and labels
        for label in ["no", "yes"]:
            label_path = os.path.join(base_dir, label)
            if os.path.isdir(label_path):
                for file in os.listdir(label_path):
                    if file.endswith(".pt") and file not in excluded_filenames:
                        self.filepaths.append(os.path.join(label_path, file))
                        self.labels.append(self.label_to_index[label])

    def __len__(self):
        return len(self.filepaths)

    def __getitem__(self, idx):
        spectrogram = torch.load(self.filepaths[idx])
        label_index = self.labels[idx]
        return spectrogram, (
            self.index_to_label[label_index] if self.return_label_name else label_index
        )

In [2]:
def pad_collate(batch):
    specs, labels = zip(*batch)
    max_len = max(spec.shape[-1] for spec in specs)
    padded_specs = []

    for spec in specs:
        pad_len = max_len - spec.shape[-1]
        padded_spec = torch.nn.functional.pad(spec, (0, pad_len), value=0)
        padded_specs.append(padded_spec)

    return torch.stack(padded_specs), list(labels)

In [3]:
from torch.utils.data import DataLoader

base_path = "../data/processed/train/audio"
test_list_path = "../data/raw/train/testing_list.txt"

train_dataset = YesNoDataset(base_path, test_list_path=test_list_path, return_label_name=False)
train_loader = DataLoader(
    train_dataset, batch_size=16, shuffle=True, collate_fn=pad_collate
)

for spectrograms, labels in train_loader:
    print("Spectrogram shape:", spectrograms.shape)
    print("Labels:", labels)
    break

Spectrogram shape: torch.Size([16, 128, 81])
Labels: [0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1]


  spectrogram = torch.load(self.filepaths[idx])


In [4]:
yes_base_path = "../data/processed/train/audio/yes"
number_of_yes_files = len([f for f in os.listdir(yes_base_path) if f.endswith(".pt")])
print(f"Number of 'yes' files: {number_of_yes_files}")
no_base_path = "../data/processed/train/audio/no"
number_of_no_files = len([f for f in os.listdir(no_base_path) if f.endswith(".pt")])
print(f"Number of 'no' files: {number_of_no_files}")
print(f"we have removed {number_of_yes_files + number_of_no_files - 266*16} files from the train dataset to put them in the test set") 


Number of 'yes' files: 2377
Number of 'no' files: 2375
we have removed 496 files from the train dataset to put them in the test set


In [5]:
import sys
import os

sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "..")))

from models import CNNRNNClassifier


model = CNNRNNClassifier(
    num_classes=2, input_freq_bins=128
)  


In [6]:
import os
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
import torch.optim as optim
import sys
import os

sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "..")))

from models import CNNRNNClassifier
from mlflow_tracking.tracking_utils import (
    mlflow_run,
    log_metrics,
    log_learning_curves,
    log_confusion_matrix,
    log_best_model,
)

In [7]:
config = {
    "experiment_name": "speech-commands-yesno",
    "run_name": "cnn-rnn-v1",
    
    "registered_model_name": "YesNoClassifier",
    "num_epochs": 10,
    "learning_rate": 0.001,
    "batch_size": 32,
}

In [8]:
import time

@mlflow_run(config)
def train():

    model = CNNRNNClassifier(num_classes=2, input_freq_bins=128)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=config["learning_rate"])
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    train_losses = []
    train_accuracies = []

    for epoch in range(config["num_epochs"]):
        model.train()
        running_loss = 0.0
        correct = 0
        total = 0
        start_time = time.time()

        for X, y in train_loader:
            X, y = X.to(device), torch.tensor(y).to(device)
            optimizer.zero_grad()
            outputs = model(X)
            loss = criterion(outputs, y)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            _, predicted = outputs.max(1)
            correct += predicted.eq(y).sum().item()
            total += y.size(0)

        avg_loss = running_loss / len(train_loader)
        accuracy = correct / total
        elapsed_time = time.time() - start_time
        print(f"Epoch {epoch+1}/{config['num_epochs']}, Time: {elapsed_time:.2f}s")

        train_losses.append(avg_loss)
        train_accuracies.append(accuracy)

        # Logging to MLflow
        log_metrics({"loss": avg_loss, "accuracy": accuracy}, step=epoch)
        print(f"Epoch {epoch+1}: loss={avg_loss:.4f}, accuracy={accuracy:.4f}")

        # Save best model every epoch (you can add logic to keep only best)
        best_model_path = log_best_model(model, epoch)

    log_learning_curves(
    train_metrics={"loss": train_losses, "accuracy": train_accuracies},
    val_metrics={"loss": train_losses, "accuracy": train_accuracies}
)

    return {"model": model, "best_model_path": best_model_path}

In [None]:
train()

2025/04/25 14:21:59 INFO mlflow.tracking.fluent: Experiment with name 'speech-commands-yesno' does not exist. Creating a new experiment.


  spectrogram = torch.load(self.filepaths[idx])


Epoch 1/10, Time: 33.96s
Epoch 1: loss=0.2867, accuracy=0.8534




Epoch 2/10, Time: 41.44s
Epoch 2: loss=0.1234, accuracy=0.9402


