## Paramerers

In [1]:
import argparse
from pathlib import Path


parser = argparse.ArgumentParser()
'''train'''
parser.add_argument("--max_lr", default=3e-4, type=float)
parser.add_argument("--wd", default=1e-5, type=float)
parser.add_argument("--batch_size", default=128, type=int)
parser.add_argument("--run_name", default=None, type=Path)
parser.add_argument('--loss_type', default="label_smooth", type=str)
parser.add_argument('--n_epochs', default=None, type=int)
parser.add_argument('--epoch_mix', default=None, type=int)
parser.add_argument("--amp", action='store_true')
parser.add_argument("--filter_bias_and_bn", action='store_true', default=True)
parser.add_argument("--ext_pretrained", default=None, type=str)
parser.add_argument("--multilabel", action='store_true')
parser.add_argument('--save_path', default=None, type=Path)
parser.add_argument('--load_path', default=None, type=Path)
parser.add_argument('--scheduler', default=None, type=str)
# parser.add_argument('--augs_signal', nargs='+', type=str,
                    # default=['amp', 'neg', 'tshift', 'tmask', 'ampsegment', 'cycshift'])
parser.add_argument('--augs_signal', nargs='+', type=str,
                    default=[])
# parser.add_argument('--augs_noise', nargs='+', type=str,
#                     default=['awgn', 'abgn', 'apgn', 'argn', 'avgn', 'aun', 'phn', 'sine'])
parser.add_argument('--augs_noise', nargs='+', type=str,
                    default=[])
parser.add_argument('--augs_mix', nargs='+', type=str, default=['mixup', 'timemix', 'freqmix', 'phmix'])
parser.add_argument('--mix_loss', default='bce', type=str)
parser.add_argument('--mix_ratio', default=1, type=float)
parser.add_argument('--ema', default=0.995, type=float)
parser.add_argument('--log_interval', default=100, type=int)
parser.add_argument("--kd_model", default=None, type=Path)
parser.add_argument("--use_bg", action='store_true', default=False)
parser.add_argument("--resume_training", action='store_true', default=False)
parser.add_argument("--use_balanced_sampler", action='store_true', default=False)
'''common'''
parser.add_argument('--local_rank', default=0, type=int)
parser.add_argument('--gpu_ids', nargs='+', default=[0])
parser.add_argument("--use_ddp", action='store_true')
parser.add_argument("--use_dp", action='store_true')
parser.add_argument('--save_interval', default=100, type=int)
'''data'''
parser.add_argument('--fold_id', default=1, type=int)
parser.add_argument("--data_subtype", default='balanced', type=str)
parser.add_argument('--seq_len', default=90112, type=int)
parser.add_argument('--dataset', default="urban8k", type=str)
parser.add_argument('--n_classes', default=50, type=int)
'''net'''
parser.add_argument('--ds_factors', nargs='+', type=int, default=[4, 4, 4, 4])
parser.add_argument('--n_head', default=8, type=int)
parser.add_argument('--n_layers', default=4, type=int)
parser.add_argument("--emb_dim", default=128, type=int)
parser.add_argument("--model_type", default='SoundNetRaw', type=str)
parser.add_argument("--nf", default=16, type=int)
parser.add_argument("--dim_feedforward", default=512, type=int)
parser.add_argument("--sampling_rate", default=22050, type=int)
'''system'''
parser.add_argument('--data_dir', default='data/', type=Path)
parser.add_argument('--gpus', type=list, default=[0])
parser.add_argument('--num_workers', type=int, default=32)
args = parser.parse_args(args=[])


## Data

### ESC50 Dataset
The ESC-50 dataset is a labeled collection of 2000 environmental audio recordings suitable for benchmarking methods of environmental sound classification.

The dataset consists of 5-second-long recordings organized into 50 semantical classes.
[Github](https://github.com/karolpiczak/ESC-50)

[Huggingface](https://huggingface.co/datasets/ashraq/esc50) "ashraq/esc50"

In [2]:
import os
import lightning as L
import torch
from torch.utils.data import random_split, DataLoader
from datasets import load_dataset, load_from_disk
from datasets import Audio

import random
import torch.nn.functional as F

from workspace.datasets.audio_augs import AudioAugs

def preprocess_audio(example):
    audio = example['audio']
    samling_rate = audio['sampling_rate']
    audio = audio['array']
    if audio.shape[0] >= args.seq_len:
        max_audio_start = audio.shape[0] - args.seq_len
        audio_start = random.randint(0, max_audio_start)
        audio = audio[audio_start : audio_start + args.seq_len]
    else:
        audio = F.pad(
            audio, (0, args.seq_len - audio.size(0)), "constant"
        ).data
    example['audio'] = audio
    return example

def transform(batch):
    transforms = args.augs_signal + args.augs_noise
    audio = torch.Tensor(batch['audio'])
    if transforms is not None:
        batch['audio'] = AudioAugs(transforms, args.sampling_rate, p=0.5)(audio)      
    batch['audio'] = audio.unsqueeze(0)
    return batch
    

class ESC50DataModule(L.LightningDataModule):
    def __init__(self, data_dir: Path = args.data_dir):
        super().__init__()
        self.data_dir = data_dir
        
    # called only within a single process on CPU
    def prepare_data(self):
        # download
        esc50 = load_dataset("ashraq/esc50", cache_dir=self.data_dir)
        esc50 = esc50.cast_column("audio", Audio(sampling_rate=args.sampling_rate))
        # rename column target to label
        esc50 = esc50.rename_column("target", "label")
        esc50 = esc50.map(preprocess_audio)
        esc50.save_to_disk(os.path.join(self.data_dir, 'esc50processed'))

        

    # run on each GPU
    def setup(self, stage: str):
        # load from disk
        esc50 = load_from_disk(os.path.join(self.data_dir, 'esc50processed'))
        esc50 = esc50.remove_columns(['filename', 'fold', 'category', 'esc10', 'src_file', 'take'])
        # split into train, val, test
        esc50 = esc50['train'].train_test_split(test_size=0.2, shuffle=True)
        self.dataset_test = esc50['test'].with_format('torch', columns=['audio', 'label'])
        self.dataset_train = esc50['train'].with_format('torch', columns=['audio', 'label'])
        self.dataset_test.set_transform(transform)
        self.dataset_train.set_transform(transform)

    def train_dataloader(self):
        return DataLoader(self.dataset_train, batch_size=args.batch_size, num_workers=args.num_workers,
        pin_memory=True,
        shuffle=True,
        drop_last=True,
        )

    def val_dataloader(self):
         return DataLoader(self.dataset_test, batch_size=args.batch_size, num_workers=args.num_workers,
        pin_memory=True,
        shuffle=False,
        drop_last=True,
        )

    def test_dataloader(self):
         return DataLoader(self.dataset_test, batch_size=args.batch_size, num_workers=args.num_workers,
        pin_memory=True,
        shuffle=False,
        drop_last=True,
        )

datamodule = ESC50DataModule()

  from .autonotebook import tqdm as notebook_tqdm


### Playground code


In [2]:
from datasets import load_dataset
from datasets import Audio
esc50 = load_dataset("ashraq/esc50")
esc50 = esc50.cast_column("audio", Audio(sampling_rate=args.sampling_rate))
# split into train, val, test
esc50 = esc50['train'].train_test_split(test_size=0.2, shuffle=True)
esc50


  from .autonotebook import tqdm as notebook_tqdm
Repo card metadata block was not found. Setting CardData to empty.


DatasetDict({
    train: Dataset({
        features: ['filename', 'fold', 'target', 'category', 'esc10', 'src_file', 'take', 'audio'],
        num_rows: 1600
    })
    test: Dataset({
        features: ['filename', 'fold', 'target', 'category', 'esc10', 'src_file', 'take', 'audio'],
        num_rows: 400
    })
})

In [6]:
# rename column target to label
esc50 = esc50.rename_column("target", "label")

In [8]:

dataset_test = esc50['test'].with_format('torch', columns=['audio', 'label'])
dataset_train = esc50['train'].with_format('torch', columns=['audio', 'label'])

In [9]:
dataset_train

Dataset({
    features: ['filename', 'fold', 'label', 'category', 'esc10', 'src_file', 'take', 'audio'],
    num_rows: 1600
})

In [41]:
import random
import torch.nn.functional as F

def preprocess_audio(example):
    audio = example['audio']
    samling_rate = audio['sampling_rate']
    audio = audio['array']
    if audio.shape[0] >= args.seq_len:
        max_audio_start = audio.size(0) - args.seq_len
        audio_start = random.randint(0, max_audio_start)
        audio = audio[audio_start : audio_start + args.seq_len]
    else:
        audio = F.pad(
            audio, (0, args.seq_len - audio.size(0)), "constant"
        ).data
    example['audio'] = audio
    return example

preprocess_audio(dataset_train[0])

{'label': tensor(39),
 'audio': tensor([-0.0017,  0.0006, -0.0022,  ...,  0.0000,  0.0000,  0.0000])}

In [19]:
dataset_train[0]

{'label': tensor(39),
 'audio': {'path': None,
  'array': tensor([ 3.5112e-07, -3.0661e-07,  2.1097e-07,  ...,  0.0000e+00,
           0.0000e+00,  0.0000e+00]),
  'sampling_rate': tensor(22050)}}

In [11]:
# datamodule.prepare_data()
datamodule.setup(stage='fit')

In [12]:
ds = datamodule.dataset_train
ds[0]

{'label': 26,
 'audio': tensor([[-0.0021, -0.0028, -0.0052,  ..., -0.0028, -0.0030,  0.0002]])}

In [14]:
ds[0:10]['audio'].shape

torch.Size([1, 10, 90112])

In [15]:
train_dataloader = datamodule.train_dataloader()
first_batch = next(iter(train_dataloader))
first_batch

{'label': tensor([ 3,  0, 24, 12, 47,  9, 26,  9, 15, 44, 35,  6, 46, 43,  5, 15,  0, 36,
         22,  5, 31,  1, 33, 32, 43, 49, 41, 43, 32, 11, 10,  4, 47, 36, 29, 38,
         12, 41, 37, 14, 42, 20, 32,  7, 48, 32, 12, 10, 30,  4, 14, 20, 14, 35,
         32, 37, 40, 35, 28, 33, 25, 14, 17,  1,  1,  5, 17,  3, 49, 31, 13, 31,
          9, 27, 20, 30, 10, 29, 17, 22, 22, 15, 48, 48, 17,  0, 18,  4, 14, 20,
         42, 35,  2, 39,  2,  8, 42,  9, 48, 39, 26,  2,  0,  6, 49, 18, 33, 27,
          6, 43,  8,  1, 18, 35, 28, 41, 24, 12, 29, 15, 17, 35, 48, 12, 19,  0,
         28, 41]),
 'audio': tensor([[[-4.1832e-01, -3.8508e-01, -3.1146e-01,  ...,  2.8594e-01,
            3.3623e-01,  3.2967e-01]],
 
         [[ 7.5707e-02,  1.1831e-01,  1.4611e-01,  ..., -1.9797e-02,
           -1.6428e-02, -1.3772e-02]],
 
         [[-2.3252e-01, -3.0039e-01, -3.8625e-01,  ..., -5.2688e-03,
           -2.7096e-03,  5.9776e-06]],
 
         ...,
 
         [[-1.7497e-03, -1.8858e-03, -1.7380e-03, 

In [16]:
first_batch['audio'].shape

torch.Size([128, 1, 90112])

In [73]:
x = first_batch['audio']
x

tensor([[-0.0050,  0.0210,  0.0506,  ...,  0.0094,  0.0280,  0.0201],
        [ 0.0003,  0.0004,  0.0004,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0018,  0.0809,  0.0700,  ...,  0.0968,  0.1595,  0.2512],
        ...,
        [ 0.0051, -0.0120, -0.0181,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0024,  0.0027,  0.0022,  ..., -0.0243, -0.0349, -0.0430],
        [-0.0013, -0.0039, -0.0126,  ...,  0.0528,  0.0516,  0.0475]])

In [74]:
first_batch

{'label': tensor([ 8, 34, 41,  7, 15, 41, 17, 37, 38, 35, 19, 20, 22, 23, 21, 39, 44, 21,
         27, 18, 15, 40, 32,  9, 35, 48, 31, 34,  4, 22, 21, 32, 47, 49, 26, 49,
         40, 30, 23, 16, 48, 32, 13, 39, 28,  3, 37, 31, 42, 41, 25,  1, 24, 47,
         40,  8, 49,  9, 10, 44, 15, 37, 49, 41, 26, 13, 20, 25, 34,  8, 39, 10,
         44, 17,  2,  9, 46,  5, 34, 47, 16, 18, 44, 29, 36,  4,  8,  3, 11, 32,
          2, 17, 16,  5, 42, 45, 21, 33, 37, 36,  8, 23, 41, 18, 28, 41, 44, 16,
         48, 36, 13, 43, 27, 16, 24,  6, 18, 13, 49, 14, 11,  8, 20,  0, 22,  1,
         37, 35]),
 'audio': tensor([[-0.0050,  0.0210,  0.0506,  ...,  0.0094,  0.0280,  0.0201],
         [ 0.0003,  0.0004,  0.0004,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0018,  0.0809,  0.0700,  ...,  0.0968,  0.1595,  0.2512],
         ...,
         [ 0.0051, -0.0120, -0.0181,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0024,  0.0027,  0.0022,  ..., -0.0243, -0.0349, -0.0430],
         [-0.0013, -0.0039,

## Model

In [3]:
import torch
from workspace.datasets.batch_augs import BatchAugs
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


ba_params = {
        'seq_len': args.seq_len,
        'fs': args.sampling_rate,
        'augs': args.augs_mix,
        'device': device,
        'mix_ratio': args.mix_ratio,
        'batch_sz': args.local_rank,
        'epoch_mix': args.epoch_mix,
        'resample_factors': [0.8, 0.9, 1.1, 1.2],
        'multilabel': True if args.multilabel else False,
        'mix_loss': args.mix_loss
    }
batch_augs = BatchAugs(ba_params)

In [4]:
import torch.nn as nn
#####################
# losses            #
#####################
if args.loss_type == "label_smooth":
    from modules.losses import LabelSmoothCrossEntropyLoss
    criterion = LabelSmoothCrossEntropyLoss(smoothing=0.1, reduction='sum')
elif args.loss_type == "cross_entropy":
    criterion = nn.CrossEntropyLoss(reduction='sum')
elif args.loss_type == "focal":
    from modules.losses import FocalLoss
    criterion = FocalLoss()
elif args.loss_type == 'bce':
    criterion = nn.BCEWithLogitsLoss(reduction='sum')
else:
    raise ValueError

In [5]:
from typing import Any, Optional
from lightning.pytorch.utilities.types import STEP_OUTPUT
import numpy as np
import torch, torch.nn as nn
import lightning as L
from workspace.datasets.batch_augs import BatchAugs
from modules.soundnet import SoundNetRaw as SoundNet

class EAT(L.LightningModule):
    def __init__(self):
        super().__init__()
        self.save_hyperparameters(args)
        ds_fac = np.prod(np.array(args.ds_factors)) * 4
        self.model = SoundNet(
                nf=args.nf,
                dim_feedforward=args.dim_feedforward,
                clip_length=args.seq_len // ds_fac,
                embed_dim=args.emb_dim,
                n_layers=args.n_layers,
                nhead=args.n_head,
                n_classes=args.n_classes,
                factors=args.ds_factors,
                )

    def forward(self, x):
        return self.model(x)
    
    def training_step(self, batch, batch_idx):
        x = batch['audio']
        y = batch['label']
        x, targets, is_mixed = batch_augs(x, y) # TODO: removed epoch parameter
        pred = self(x)
        if is_mixed:
            loss_cls = batch_augs.mix_loss(pred, targets, n_classes=args.n_classes,
            pred_one_hot=args.multilabel)
        else:
            loss_cls = criterion(pred, y)
        self.log('loss_cls', loss_cls)
        return loss_cls
    
    def test_step(self, batch, batch_idx):
        x = batch['audio']
        y = batch['label']
        pred = self(x)
        loss_cls = criterion(pred, y)
        self.log('test_loss', loss_cls)
        return loss_cls
    
    # def validation_step(self, batch, batch_idx):
    #     loss = self.training_step(batch, batch_idx)
    #     self.log('val_loss', loss)
    #     return loss
    
    def configure_optimizers(self):
        if args.amp:
            from torch.cuda.amp import GradScaler
            scaler = GradScaler(init_scale=2**10)
            eps = 1e-4
        else:
            scaler = None
            eps = 1e-8
        parameters = self.model.parameters()
        return torch.optim.AdamW(parameters,
                            lr=args.max_lr,
                            betas=[0.9, 0.99],
                            weight_decay=0,
                            eps=eps)
model = EAT()


In [21]:
# get one sample from datamodule
# datamodule.prepare_data()
datamodule.setup(stage='fit')
sample = next(iter(datamodule.train_dataloader()))
x = sample['audio']
print(x.shape)
model(x)

torch.Size([128, 1, 90112])


tensor([[ 0.7032,  0.2245, -0.4650,  ...,  0.1418,  0.3108, -0.3904],
        [ 0.3579, -1.0132, -0.0413,  ...,  0.1003, -0.0456,  0.0107],
        [ 1.0528,  0.5265, -0.7309,  ..., -0.0906,  0.1717, -0.2026],
        ...,
        [ 0.9733, -0.1774, -0.4244,  ..., -0.2977, -0.0049, -0.0283],
        [ 0.6691, -0.7178,  0.3668,  ..., -0.0950, -0.1738,  0.5077],
        [ 0.1911, -0.3720,  0.2180,  ...,  0.2844,  0.6657, -0.1125]],
       grad_fn=<AddmmBackward0>)

## Train

In [6]:
from lightning import Trainer


trainer = L.Trainer(max_epochs=1, accelerator='gpu', devices=args.gpus) # set devices to a list of GPU ids to train on

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [7]:

# start training 
trainer.fit(model, datamodule=datamodule)

  rank_zero_warn("You passed in a `val_dataloader` but have no `validation_step`. Skipping val loop.")
Repo card metadata block was not found. Setting CardData to empty.
Saving the dataset (3/3 shards): 100%|██████████| 2000/2000 [00:03<00:00, 626.27 examples/s]
You are using a CUDA device ('NVIDIA A100 80GB PCIe') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2]

  | Name  | Type        | Params
--------------------------------------
0 | model | SoundNetRaw | 5.2 M 
--------------------------------------
5.2 M     Trainable params
0         Non-trainable params
5.2 M     Total params
20.722    Total estimated model params size (MB)
  rank_zero_warn(


Epoch 0: 100%|██████████| 12/12 [00:15<00:00,  1.25s/it, v_num=13]

`Trainer.fit` stopped: `max_epochs=1` reached.


Epoch 0: 100%|██████████| 12/12 [00:15<00:00,  1.27s/it, v_num=13]


## Test
Test the model on the test set.

In [8]:
trainer.test(model, datamodule=datamodule)

Repo card metadata block was not found. Setting CardData to empty.
Saving the dataset (3/3 shards): 100%|██████████| 2000/2000 [00:01<00:00, 1023.78 examples/s]
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2]


Testing DataLoader 0: 100%|██████████| 3/3 [00:00<00:00, 12.00it/s]


[{'test_loss': 506.4825744628906}]