In [38]:
import io
import os
import time
import requests
from typing import (
    Optional,
    Any,
)
import warnings
from pathlib import Path
from dataclasses import dataclass, asdict

import mlflow
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix,
    roc_curve,
    auc,
    classification_report,
)
from ptflops import get_model_complexity_info

import torch
import librosa
from torch import nn
import lightning as L
from lightning.pytorch.loggers import MLFlowLogger
from lightning.pytorch.callbacks import (
    Callback,
    ModelCheckpoint,
    LearningRateMonitor,
    EarlyStopping,
)
from lightning.pytorch.utilities.types import OptimizerLRScheduler

import torchaudio
from torchaudio import functional as F
from torchaudio.datasets import SPEECHCOMMANDS

warnings.filterwarnings("ignore")

In [14]:
HW_PATH = Path("assignment1")

# 1. Check Mel Spectrogram implementation

In [15]:
from assignment1.melbanks import LogMelFilterBanks

In [16]:
url = "https://download.pytorch.org/torchaudio/tutorial-assets/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.wav"
response = requests.get(url)
audio_data = io.BytesIO(response.content)
signal, sr = torchaudio.load(audio_data)

In [17]:
melspec = torchaudio.transforms.MelSpectrogram(
    hop_length=160,
    n_mels=80
)(signal)
logmelbanks = LogMelFilterBanks()(signal)

assert torch.log(melspec + 1e-6).shape == logmelbanks.shape
assert torch.allclose(torch.log(melspec + 1e-6), logmelbanks)

# 2. Train CNN classification model

## Configs

In [31]:
@dataclass
class ModelConfig:
    n_mels: int = 60  # {20, 40, 80}
    groups: int = 1  # {1, 2, 4, 8, 16}
    sample_rate: int = 16000

@dataclass
class TrainingConfig:
    lr: float = 1e-4
    warmup_epochs: int = 5
    max_epochs: int = 2
    batch_size: int = 32
    gradient_clip_val: float = 1.0
    scheduler_patience: int = 5
    scheduler_factor: float = 0.5
    checkpoint_dir: str = "checkpoints"
    num_devices: int = 1
    patience: int = 4
    patience_min_delta: float = 1e-4
    device: str = 'cpu'

    # mlflow
    mlflow_tracking_uri: str = "http://127.0.0.1:5000"
    mlflow_experiment_name: str = "Audio Binary Classification CNN"
    experiment_name: str = "Audio Binary Classification CNN"
    run_name: str = "baseline"

In [98]:
model_config = ModelConfig(
    n_mels=80,  # {20, 40, 80}
    groups=16,  # {1, 2, 4, 8, 16}
    sample_rate=16000,
)

training_config = TrainingConfig(
    lr=1e-4,
    warmup_epochs=5,
    max_epochs=50,
    batch_size=32,
    gradient_clip_val=1,
    scheduler_patience=5,
    scheduler_factor=0.5,
    patience_min_delta=1e-6,
    run_name=f"n_mels={model_config.n_mels};groups={model_config.groups}",
)

## Data

In [99]:
class SubsetSC(SPEECHCOMMANDS):
    def __init__(self, subset: str = None, sample_rate: int = 16000):
        super().__init__("./", download=True)
        self.binary_classes = ['yes', 'no']
        self.target_length = sample_rate

        def load_list(filename):
            filepath = os.path.join(self._path, filename)
            with open(filepath) as fileobj:
                return [os.path.normpath(os.path.join(self._path, line.strip())) for line in fileobj]

        if subset == "validation":
            self._walker = load_list("validation_list.txt")
        elif subset == "testing":
            self._walker = load_list("testing_list.txt")
        elif subset == "training":
            excludes = load_list("validation_list.txt") + load_list("testing_list.txt")
            excludes = set(excludes)
            self._walker = [w for w in self._walker if w not in excludes]

        # Filter to keep only 'yes' and 'no' samples
        self._walker = [w for w in self._walker if self.get_label_from_path(w) in self.binary_classes]

    def get_label_from_path(self, path):
        return os.path.normpath(path).split(os.path.sep)[-2]

    def pad_or_truncate(self, waveform):
        """Pad or truncate waveform to target length."""
        current_length = waveform.shape[1]

        if current_length > self.target_length:
            # Truncate
            return waveform[:, :self.target_length]
        elif current_length < self.target_length:
            # Pad with zeros
            padding = torch.zeros(1, self.target_length - current_length)
            return torch.cat([waveform, padding], dim=1)

        return waveform

    def __getitem__(self, n):
        waveform, sample_rate, label, speaker_id, utterance_number = super().__getitem__(n)
        waveform = self.pad_or_truncate(waveform)
        # Convert labels to binary (0 for 'no', 1 for 'yes')
        binary_label = torch.tensor(1 if label == 'yes' else 0, dtype=torch.long)
        return waveform, sample_rate, binary_label, speaker_id, utterance_number

train_set = SubsetSC("training", sample_rate=model_config.sample_rate)
val_set = SubsetSC("validation", sample_rate=model_config.sample_rate)
test_set = SubsetSC("testing", sample_rate=model_config.sample_rate)

In [100]:
train_loader = torch.utils.data.DataLoader(
    train_set,
    batch_size=training_config.batch_size,
    shuffle=True,
    num_workers=0
)

val_loader = torch.utils.data.DataLoader(
    val_set,
    batch_size=training_config.batch_size,
    shuffle=False,
    num_workers=0
)

test_loader = torch.utils.data.DataLoader(
    test_set,
    batch_size=training_config.batch_size,
    shuffle=False,
    num_workers=0
)

## Training

In [101]:
class SpectroCNN(L.LightningModule):
    def __init__(
            self,
            n_mels: int,
            lr: float,
            groups: int,
            patience: int,
    ):
        super().__init__()
        self.lr = lr
        self.patience =patience

        self.model = nn.Sequential(
            LogMelFilterBanks(n_mels=n_mels),
            nn.Conv1d(in_channels=n_mels, out_channels=16, kernel_size=3, padding=1, groups=groups),
            nn.ReLU(),
            nn.BatchNorm1d(16),
            nn.MaxPool1d(kernel_size=2),

            nn.AdaptiveAvgPool1d(1),
            nn.Flatten(),
            nn.Linear(16, 2),
            nn.Softmax()
        )
        self.loss = nn.CrossEntropyLoss()

    def forward(self, x):
        return self.model(x)

    def training_step(self, batch, batch_idx):
        waveform, _, label, _, _ = batch
        preds = self(waveform)
        loss = self.loss(preds, label)
        self.log("train_loss", loss, on_step=True, on_epoch=True, prog_bar=True)
        return loss

    def validation_step(self, batch, batch_idx):
        waveform, _, label, _, _ = batch
        preds = self(waveform)
        loss = self.loss(preds, label)
        self.log("val_loss", loss, on_step=True, on_epoch=True, prog_bar=True)
        return loss

    def configure_optimizers(self):
        optimizer = torch.optim.AdamW(
            self.parameters(),
            lr=self.lr
        )

        scheduler = {
            'scheduler': torch.optim.lr_scheduler.ReduceLROnPlateau(
                optimizer,
                mode='min',
                factor=0.1,
                patience=self.patience,
                verbose=True,
                min_lr=1e-6
            ),
            'monitor': 'val_loss',
            'interval': 'epoch',
            'frequency': 1
        }

        return [optimizer], [scheduler]

In [102]:
class SpectroCNNTrainer:
    def __init__(
            self,
            model_config: ModelConfig,
            training_config: TrainingConfig
    ):
        self.model_config = model_config
        self.training_config = training_config
        self.mlf_logger = None

    def setup_trainer(self):
        self.mlf_logger = MLFlowLogger(
            experiment_name=self.training_config.mlflow_experiment_name,
            tracking_uri=self.training_config.mlflow_tracking_uri,
            run_name=self.training_config.run_name,
        )

        checkpoint_callback = ModelCheckpoint(
            dirpath=self.training_config.checkpoint_dir,
            filename="best-model-{epoch:02d}-{val_loss:.2f}",
            save_top_k=1,
            verbose=False,
            monitor="val_loss",
            mode="min"
        )

        early_stop_callback = EarlyStopping(
            monitor='val_loss',
            min_delta=self.training_config.patience_min_delta,
            patience=self.training_config.patience,
            verbose=False,
            mode='min'
        )

        lr_monitor = LearningRateMonitor(logging_interval='epoch')

        trainer = L.Trainer(
            max_epochs=self.training_config.max_epochs,
            callbacks=[checkpoint_callback, lr_monitor, early_stop_callback],
            logger=self.mlf_logger,
            gradient_clip_val=self.training_config.gradient_clip_val,
            accelerator=self.training_config.device,
            devices=self.training_config.num_devices,
        )

        return trainer

    def get_predictions(self, model, dataloader):
        model.eval()
        all_preds = []
        all_targets = []

        with torch.no_grad():
            for batch in dataloader:
                waveform, _, label, _, _ = batch
                logits = model(waveform)
                preds = torch.softmax(logits, dim=1)

                all_preds.append(preds.cpu().numpy())
                all_targets.append(label.cpu().numpy())

        return np.concatenate(all_preds), np.concatenate(all_targets)

    def on_fit_end(self, model, test_dataloader):
        y_pred_proba, y_true = self.get_predictions(model, test_dataloader)
        y_pred = np.argmax(y_pred_proba, axis=1)

        metrics = {
            "test_accuracy": accuracy_score(y_true, y_pred),
            "test_precision": precision_score(y_true, y_pred, average='weighted'),
            "test_recall": recall_score(y_true, y_pred, average='weighted'),
            "test_f1": f1_score(y_true, y_pred, average='weighted')
        }

        class_report = classification_report(y_true, y_pred)

        with open("classification_report.txt", "w") as f:
            f.write(class_report)
        mlflow.log_artifact("classification_report.txt")

        plt.figure(figsize=(10, 8))
        cm = confusion_matrix(y_true, y_pred)
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
        plt.title('Confusion Matrix')
        plt.ylabel('True Label')
        plt.xlabel('Predicted Label')
        plt.savefig('confusion_matrix.png')
        mlflow.log_artifact('confusion_matrix.png')
        plt.close()

        if y_pred_proba.shape[1] == 2:
            plt.figure(figsize=(8, 8))
            fpr, tpr, _ = roc_curve(y_true, y_pred_proba[:, 1])
            roc_auc = auc(fpr, tpr)

            plt.plot(fpr, tpr, color='darkorange', lw=2,
                     label=f'ROC curve (AUC = {roc_auc:.2f})')
            plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
            plt.xlim([0.0, 1.0])
            plt.ylim([0.0, 1.05])
            plt.xlabel('False Positive Rate')
            plt.ylabel('True Positive Rate')
            plt.title('Receiver Operating Characteristic')
            plt.legend(loc="lower right")
            plt.savefig('roc_curve.png')
            mlflow.log_artifact('roc_curve.png')
            plt.close()

            metrics["roc_auc"] = roc_auc

        mlflow.log_metrics(metrics)

        print("\nValidation Metrics:")
        print(f"Accuracy: {metrics['test_accuracy']:.4f}")
        print(f"Precision: {metrics['test_precision']:.4f}")
        print(f"Recall: {metrics['test_recall']:.4f}")
        print(f"F1 Score: {metrics['test_f1']:.4f}")
        if y_pred_proba.shape[1] == 2:
            print(f"ROC AUC: {metrics['roc_auc']:.4f}")

        return metrics
    
    def calculate_macs(self, model, input_shape: tuple):
        """
        Calculate MACs (Multiply-Accumulate Operations) for the model.
        Args:
            model: The PyTorch model
            input_shape: Input tensor shape (batch_size, time)
        Returns:
            macs: Number of MACs
            params: Number of parameters
        """
        model.eval()
        input_shape = (1, 16000) 
        macs, params = get_model_complexity_info(
            model, 
            input_shape,
            as_strings=False,
            print_per_layer_stat=True
        )
        return macs, params

    def train(self, train_dataloader, val_dataloader, test_dataloader):
        model = SpectroCNN(
            n_mels=self.model_config.n_mels,
            lr=self.training_config.lr,
            groups=self.model_config.groups,
            patience=self.training_config.patience,
        )

        total_params = sum(p.numel() for p in model.parameters())
        trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)

        waveform, _, _, _, _ = next(iter(train_dataloader))
        print(waveform.shape)
        macs, _ = self.calculate_macs(model, waveform.shape)

        print(f"\nModel Size Summary:")
        print(f"Total parameters: {total_params:,}")
        print(f"Trainable parameters: {trainable_params:,}")
        print(f"MACs: {macs/1e6:.2f}M")
        print(f"FLOPs: {(macs*2)/1e9:.2f}G")

        trainer = self.setup_trainer()

        class TimingCallback(Callback):
            def __init__(self):
                self.epoch_start_time = None
                self.training_start_time = None
                self.epoch_times = []

            def on_train_start(self, trainer, pl_module):
                self.training_start_time = time.time()

            def on_train_epoch_start(self, trainer, pl_module):
                self.epoch_start_time = time.time()

            def on_train_epoch_end(self, trainer, pl_module):
                epoch_time = time.time() - self.epoch_start_time
                self.epoch_times.append(epoch_time)

                # Log epoch time to MLflow
                mlflow.log_metric("epoch_time", epoch_time, step=trainer.current_epoch)

            def on_train_end(self, trainer, pl_module):
                total_time = time.time() - self.training_start_time
                avg_epoch_time = np.mean(self.epoch_times)

                # Log summary timing metrics
                mlflow.log_metrics({
                    "total_training_time": total_time,
                    "average_epoch_time": avg_epoch_time
                })

                # Create epoch timing plot
                plt.figure(figsize=(10, 6))
                plt.plot(range(1, len(self.epoch_times) + 1), self.epoch_times, marker='o')
                plt.title('Training Time per Epoch')
                plt.xlabel('Epoch')
                plt.ylabel('Time (seconds)')
                plt.grid(True)
                plt.savefig('epoch_times.png')
                mlflow.log_artifact('epoch_times.png')
                plt.close()

                # Log detailed timing data as CSV
                epoch_timing_df = pd.DataFrame({
                    'epoch': range(1, len(self.epoch_times) + 1),
                    'time_seconds': self.epoch_times
                })
                epoch_timing_df.to_csv('epoch_times.csv', index=False)
                mlflow.log_artifact('epoch_times.csv')

        timing_callback = TimingCallback()
        trainer.callbacks.append(timing_callback)

        with mlflow.start_run(run_name=self.training_config.run_name) as run:
            self.mlf_logger._run_id = run.info.run_id

            model_params = {"model_" + k: v for k, v in asdict(self.model_config).items()}
            training_params = {"training_" + k: v for k, v in asdict(self.training_config).items()}

            model_stats = {
                "total_params": total_params,
                "trainable_params": trainable_params,
                "macs": macs,
                "macs_M": macs/1e6,
                "flops": macs*2,
                "flops_G": (macs*2)/1e9,
            }

            # Log all parameters
            mlflow.log_params(model_params)
            mlflow.log_params(training_params)
            mlflow.log_params(model_stats)

            trainer.fit(
                model,
                train_dataloaders=train_dataloader,
                val_dataloaders=val_dataloader,
            )

            mlflow.pytorch.log_model(model, "model")
            metrics = self.on_fit_end(model, test_dataloader)

        return model, trainer

In [103]:
trainer = SpectroCNNTrainer(model_config, training_config)
model, trainer = trainer.train(
    train_loader,
    val_loader,
    test_loader
)

INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (mps), used: False
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs

  | Name  | Type             | Params | Mode
--------------------------------------------------
0 | model | Sequential       | 322    | eval
1 | loss  | CrossEntropyLoss | 0      | eval
--------------------------------------------------
322       Trainable params
0         Non-trainable params
322       Total params
0.001     Total estimated model params size (MB)
0         Modules in train mode
11        Modules in eval mode


torch.Size([32, 1, 16000])
SpectroCNN(
  322, 100.000% Params, 33.15 KMac, 89.152% MACs, 
  (model): Sequential(
    322, 100.000% Params, 33.15 KMac, 89.152% MACs, 
    (0): LogMelFilterBanks(0, 0.000% Params, 0.0 Mac, 0.000% MACs, )
    (1): Conv1d(256, 79.503% Params, 25.86 KMac, 69.528% MACs, 80, 16, kernel_size=(3,), stride=(1,), padding=(1,), groups=16)
    (2): ReLU(0, 0.000% Params, 1.62 KMac, 4.345% MACs, )
    (3): BatchNorm1d(32, 9.938% Params, 3.23 KMac, 8.691% MACs, 16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (4): MaxPool1d(0, 0.000% Params, 1.62 KMac, 4.345% MACs, kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (5): AdaptiveAvgPool1d(0, 0.000% Params, 800.0 Mac, 2.151% MACs, output_size=1)
    (6): Flatten(0, 0.000% Params, 0.0 Mac, 0.000% MACs, start_dim=1, end_dim=-1)
    (7): Linear(34, 10.559% Params, 34.0 Mac, 0.091% MACs, in_features=16, out_features=2, bias=True)
    (8): Softmax(0, 0.000% Params, 0.0 Mac, 0.000% MAC

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=50` reached.


Epoch 49: 100%|█| 199/199 [00:06<00:00, 29.20it/s, v_num=a4c3, train_loss_step=0.444, val_loss_step=0.651, val_loss_epoch=0.432, train_loss_epoch
🏃 View run n_mels=80;groups=16 at: http://127.0.0.1:5000/#/experiments/0/runs/da75c971bfd74f89a02ed191786ca4c3
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/0





Validation Metrics:
Accuracy: 0.8968
Precision: 0.8976
Recall: 0.8968
F1 Score: 0.8968
ROC AUC: 0.9558
🏃 View run n_mels=80;groups=16 at: http://127.0.0.1:5000/#/experiments/0/runs/da75c971bfd74f89a02ed191786ca4c3
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/0
