## Paramerers

In [8]:
import argparse
from pathlib import Path


parser = argparse.ArgumentParser()
'''train'''
parser.add_argument("--max_lr", default=3e-4, type=float)
parser.add_argument("--wd", default=1e-5, type=float)
parser.add_argument("--batch_size", default=128, type=int)
parser.add_argument("--run_name", default=None, type=Path)
parser.add_argument('--loss_type', default="label_smooth", type=str)
parser.add_argument('--n_epochs', default=None, type=int)
parser.add_argument('--epoch_mix', default=None, type=int)
parser.add_argument("--amp", action='store_true')
parser.add_argument("--filter_bias_and_bn", action='store_true', default=True)
parser.add_argument("--ext_pretrained", default=None, type=str)
parser.add_argument("--multilabel", action='store_true')
parser.add_argument('--save_path', default=None, type=Path)
parser.add_argument('--load_path', default=None, type=Path)
parser.add_argument('--scheduler', default=None, type=str)
# parser.add_argument('--augs_signal', nargs='+', type=str,
#                     default=['amp', 'neg', 'tshift', 'tmask', 'ampsegment', 'cycshift'])
parser.add_argument('--augs_signal', nargs='+', type=str,
                    default=['amp', 'neg', 'tshift'])
# parser.add_argument('--augs_noise', nargs='+', type=str,
#                     default=['awgn', 'abgn', 'apgn', 'argn', 'avgn', 'aun', 'phn', 'sine'])
parser.add_argument('--augs_noise', nargs='+', type=str,
                    default=[])
parser.add_argument('--augs_mix', nargs='+', type=str, default=['mixup', 'timemix', 'freqmix', 'phmix'])
parser.add_argument('--mix_loss', default='bce', type=str)
parser.add_argument('--mix_ratio', default=1, type=float)
parser.add_argument('--ema', default=0.995, type=float)
parser.add_argument('--log_interval', default=100, type=int)
parser.add_argument("--kd_model", default=None, type=Path)
parser.add_argument("--use_bg", action='store_true', default=False)
parser.add_argument("--resume_training", action='store_true', default=False)
parser.add_argument("--use_balanced_sampler", action='store_true', default=False)
'''common'''
parser.add_argument('--local_rank', default=0, type=int)
parser.add_argument('--gpu_ids', nargs='+', default=[0])
parser.add_argument("--use_ddp", action='store_true')
parser.add_argument("--use_dp", action='store_true')
parser.add_argument('--save_interval', default=100, type=int)
'''data'''
parser.add_argument('--fold_id', default=1, type=int)
parser.add_argument("--data_subtype", default='balanced', type=str)
parser.add_argument('--seq_len', default=90112, type=int)
parser.add_argument('--dataset', default="urban8k", type=str)
parser.add_argument('--n_classes', default=50, type=int)
'''net'''
parser.add_argument('--ds_factors', nargs='+', type=int, default=[4, 4, 4, 4])
parser.add_argument('--n_head', default=8, type=int)
parser.add_argument('--n_layers', default=4, type=int)
parser.add_argument("--emb_dim", default=128, type=int)
parser.add_argument("--model_type", default='SoundNetRaw', type=str)
parser.add_argument("--nf", default=16, type=int)
parser.add_argument("--dim_feedforward", default=512, type=int)
parser.add_argument("--sampling_rate", default=22050, type=int)
'''system'''
parser.add_argument('--data_dir', default='data/', type=Path)
parser.add_argument('--gpus', type=list, default=[0])
parser.add_argument('--num_workers', type=int, default=32)
args = parser.parse_args(args=[])


## Data

### ESC50 Dataset
The ESC-50 dataset is a labeled collection of 2000 environmental audio recordings suitable for benchmarking methods of environmental sound classification.

The dataset consists of 5-second-long recordings organized into 50 semantical classes.
[Github](https://github.com/karolpiczak/ESC-50)

[Huggingface](https://huggingface.co/datasets/ashraq/esc50) "ashraq/esc50"

### Preprocess
Prepare your dataset. All code that only need to be executed ones before training and results fit on disk.

In [18]:
def preprocess_audio(example):
    audio = example['audio']
    audio = audio['array']
    if audio.shape[0] >= args.seq_len:
        max_audio_start = audio.shape[0] - args.seq_len
        audio_start = random.randint(0, max_audio_start)
        audio = audio[audio_start : audio_start + args.seq_len]
    else:
        audio = F.pad(
            audio, (0, args.seq_len - audio.size(0)), "constant"
        ).data
    example['audio'] = audio
    return example

esc50 = load_dataset("ashraq/esc50", cache_dir=args.data_dir)
esc50 = esc50.cast_column("audio", Audio(sampling_rate=args.sampling_rate))
# rename column target to label
esc50 = esc50.rename_column("target", "label")
esc50 = esc50.map(preprocess_audio)
esc50.save_to_disk(os.path.join(args.data_dir, 'esc50processed'))

Repo card metadata block was not found. Setting CardData to empty.
Saving the dataset (3/3 shards): 100%|██████████| 2000/2000 [00:02<00:00, 699.50 examples/s]


In [84]:
import os
import lightning as L
import torch
from torch.utils.data import random_split, DataLoader
from datasets import load_dataset, load_from_disk
from datasets import Audio

import random
import torch.nn.functional as F

from workspace.datasets.audio_augs import AudioAugs

def train_transform(batch):
    transforms = args.augs_signal + args.augs_noise
    audio = torch.Tensor(batch['audio'])
    if transforms is not None:
        # check if audio has more then 1 dimension
        if len(audio.shape) > 1:
            # iterate over all dimensions
            for i in range(audio.shape[0]):
                audio[i] = AudioAugs(transforms, args.sampling_rate, p=1)(audio[i])
        else:
            batch['audio'] = AudioAugs(transforms, args.sampling_rate, p=1)(audio)      
    batch['audio'] = audio.unsqueeze(0)
    return batch
    
def test_transform(batch):
    audio = torch.Tensor(batch['audio'])
    batch['audio'] = audio.unsqueeze(0)
    return batch

class ESC50DataModule(L.LightningDataModule):
    def __init__(self, data_dir: Path = args.data_dir):
        super().__init__()
        self.data_dir = data_dir
        
    # called only within a single process on CPU but everytime trainer is envoked
    # def prepare_data(self):

    # run on each GPU
    def setup(self, stage: str):
        # load from disk
        esc50 = load_from_disk(os.path.join(self.data_dir, 'esc50processed'))
        esc50 = esc50.remove_columns(['filename', 'fold', 'category', 'esc10', 'src_file', 'take'])
        # split into train, val, test
        esc50 = esc50['train'].train_test_split(test_size=0.2, shuffle=True)
        self.dataset_train = esc50['train'].with_format('torch', columns=['audio', 'label'])
        self.dataset_test = esc50['test'].with_format('torch', columns=['audio', 'label'])

    def train_dataloader(self):
        self.dataset_train.set_transform(train_transform)
        return DataLoader(self.dataset_train, batch_size=args.batch_size, num_workers=args.num_workers,
        pin_memory=True,
        shuffle=True,
        drop_last=True,
        )

    # def val_dataloader(self):
    #     return DataLoader(self.dataset_test, batch_size=args.batch_size, num_workers=args.num_workers,
    #     pin_memory=True,
    #     shuffle=False,
    #     drop_last=True,
    #     )

    def test_dataloader(self):
        dataset_test = self.dataset_test.set_transform(test_transform)
        return DataLoader(dataset_test, batch_size=args.batch_size, num_workers=args.num_workers,
        pin_memory=True,
        shuffle=False,
        drop_last=True,
        )

datamodule = ESC50DataModule()

### Playground code


In [33]:
from datasets import load_dataset
from datasets import Audio
esc50 = load_dataset("ashraq/esc50")
esc50 = esc50.cast_column("audio", Audio(sampling_rate=args.sampling_rate))
# split into train, val, test
esc50 = esc50['train'].train_test_split(test_size=0.2, shuffle=True)
esc50


Repo card metadata block was not found. Setting CardData to empty.


DatasetDict({
    train: Dataset({
        features: ['filename', 'fold', 'target', 'category', 'esc10', 'src_file', 'take', 'audio'],
        num_rows: 1600
    })
    test: Dataset({
        features: ['filename', 'fold', 'target', 'category', 'esc10', 'src_file', 'take', 'audio'],
        num_rows: 400
    })
})

In [85]:
datamodule.prepare_data()
datamodule.setup(stage='fit')

In [72]:
batch = datamodule.dataset_train[0:10]
batch


{'label': tensor([18, 32, 11,  4, 18, 30, 32, 44, 12, 29]),
 'audio': tensor([[ 3.0243e-03,  3.3027e-03,  3.9883e-03,  ...,  8.2969e-03,
           1.9109e-02,  1.9960e-02],
         [-1.7522e-03, -9.7170e-04,  1.8503e-04,  ...,  1.3342e-01,
           2.0178e-02,  1.8933e-01],
         [-1.2765e-03, -3.0075e-02, -6.9552e-02,  ...,  1.8576e-01,
           1.4528e-01,  1.0023e-01],
         ...,
         [-3.9431e-03,  1.0530e-02,  2.1364e-02,  ..., -4.9134e-02,
          -4.9637e-02, -5.2972e-02],
         [ 4.0512e-03, -5.6218e-03, -1.9005e-02,  ..., -2.2932e-02,
          -2.5942e-02, -2.5678e-02],
         [ 2.1062e-01,  2.3057e-01,  2.4416e-01,  ...,  0.0000e+00,
           0.0000e+00,  0.0000e+00]])}

In [73]:
transformed = train_transform(batch)
transformed

torch.Size([10, 90112])
torch.Size([1, 10, 90112])


{'label': tensor([18, 32, 11,  4, 18, 30, 32, 44, 12, 29]),
 'audio': tensor([[[ 7.7333e-04, -4.3723e-04,  3.0176e-04,  ...,  5.1747e-03,
           -4.9137e-04, -3.1689e-03],
          [ 1.3939e-03, -7.2317e-04,  4.8474e-04,  ..., -1.2747e-02,
           -1.5041e-02, -6.7834e-03],
          [ 3.1152e-03, -5.6959e-04, -2.9647e-05,  ..., -6.0230e-02,
           -1.2222e-01, -3.0191e-02],
          ...,
          [ 1.4703e-02,  2.4423e-02,  2.8221e-02,  ..., -7.0143e-04,
            1.0854e-03, -2.2116e-03],
          [-8.3951e-04, -9.1961e-04,  2.5133e-03,  ...,  1.6090e-05,
           -1.9715e-05,  3.2177e-05],
          [-5.7774e-04,  5.9911e-04, -6.2213e-04,  ...,  5.2188e-04,
           -5.3926e-04,  5.5783e-04]]])}

In [86]:
train_dataloader = datamodule.train_dataloader()
first_batch = next(iter(train_dataloader))
first_batch['audio'].shape

torch.Size([1, 90112])torch.Size([1, 90112])torch.Size([1, 90112])

torch.Size([1, 90112])

torch.Size([1, 90112])
torch.Size([1, 1, 90112])torch.Size([1, 1, 90112])torch.Size([1, 90112])torch.Size([1, 90112])torch.Size([1, 90112])torch.Size([1, 90112])

torch.Size([1, 90112])




torch.Size([1, 90112])torch.Size([1, 1, 90112])

torch.Size([1, 90112])
torch.Size([1, 1, 90112])
torch.Size([1, 1, 90112])torch.Size([1, 1, 90112])

torch.Size([1, 1, 90112])torch.Size([1, 1, 90112])

torch.Size([1, 1, 90112])torch.Size([1, 1, 90112])
torch.Size([1, 1, 90112])torch.Size([1, 1, 90112])


torch.Size([1, 90112])
torch.Size([1, 90112])torch.Size([1, 90112])

torch.Size([1, 1, 90112])
torch.Size([1, 90112])
torch.Size([1, 1, 90112])
torch.Size([1, 90112])torch.Size([1, 1, 90112])

torch.Size([1, 90112])
torch.Size([1, 1, 90112])torch.Size([1, 90112])
torch.Size([1, 90112])
torch.Size([1, 90112])torch.Size([1, 1, 90112])

torch.Size([1, 90112])

torch.Size([1, 1, 90112])torch.Size([1, 90112])

tor

torch.Size([128, 1, 90112])

In [16]:
first_batch['audio'].shape

torch.Size([128, 1, 90112])

In [73]:
x = first_batch['audio']
x

tensor([[-0.0050,  0.0210,  0.0506,  ...,  0.0094,  0.0280,  0.0201],
        [ 0.0003,  0.0004,  0.0004,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0018,  0.0809,  0.0700,  ...,  0.0968,  0.1595,  0.2512],
        ...,
        [ 0.0051, -0.0120, -0.0181,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0024,  0.0027,  0.0022,  ..., -0.0243, -0.0349, -0.0430],
        [-0.0013, -0.0039, -0.0126,  ...,  0.0528,  0.0516,  0.0475]])

In [74]:
first_batch

{'label': tensor([ 8, 34, 41,  7, 15, 41, 17, 37, 38, 35, 19, 20, 22, 23, 21, 39, 44, 21,
         27, 18, 15, 40, 32,  9, 35, 48, 31, 34,  4, 22, 21, 32, 47, 49, 26, 49,
         40, 30, 23, 16, 48, 32, 13, 39, 28,  3, 37, 31, 42, 41, 25,  1, 24, 47,
         40,  8, 49,  9, 10, 44, 15, 37, 49, 41, 26, 13, 20, 25, 34,  8, 39, 10,
         44, 17,  2,  9, 46,  5, 34, 47, 16, 18, 44, 29, 36,  4,  8,  3, 11, 32,
          2, 17, 16,  5, 42, 45, 21, 33, 37, 36,  8, 23, 41, 18, 28, 41, 44, 16,
         48, 36, 13, 43, 27, 16, 24,  6, 18, 13, 49, 14, 11,  8, 20,  0, 22,  1,
         37, 35]),
 'audio': tensor([[-0.0050,  0.0210,  0.0506,  ...,  0.0094,  0.0280,  0.0201],
         [ 0.0003,  0.0004,  0.0004,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0018,  0.0809,  0.0700,  ...,  0.0968,  0.1595,  0.2512],
         ...,
         [ 0.0051, -0.0120, -0.0181,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0024,  0.0027,  0.0022,  ..., -0.0243, -0.0349, -0.0430],
         [-0.0013, -0.0039,

## Model

In [3]:
import torch
from workspace.datasets.batch_augs import BatchAugs
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


ba_params = {
        'seq_len': args.seq_len,
        'fs': args.sampling_rate,
        'augs': args.augs_mix,
        'device': device,
        'mix_ratio': args.mix_ratio,
        'batch_sz': args.local_rank,
        'epoch_mix': args.epoch_mix,
        'resample_factors': [0.8, 0.9, 1.1, 1.2],
        'multilabel': True if args.multilabel else False,
        'mix_loss': args.mix_loss
    }
batch_augs = BatchAugs(ba_params)

In [4]:
import torch.nn as nn
#####################
# losses            #
#####################
if args.loss_type == "label_smooth":
    from modules.losses import LabelSmoothCrossEntropyLoss
    criterion = LabelSmoothCrossEntropyLoss(smoothing=0.1, reduction='sum')
elif args.loss_type == "cross_entropy":
    criterion = nn.CrossEntropyLoss(reduction='sum')
elif args.loss_type == "focal":
    from modules.losses import FocalLoss
    criterion = FocalLoss()
elif args.loss_type == 'bce':
    criterion = nn.BCEWithLogitsLoss(reduction='sum')
else:
    raise ValueError

In [5]:
from typing import Any, Optional
from lightning.pytorch.utilities.types import STEP_OUTPUT
import numpy as np
import torch, torch.nn as nn
import lightning as L
from workspace.datasets.batch_augs import BatchAugs
from modules.soundnet import SoundNetRaw as SoundNet

class EAT(L.LightningModule):
    def __init__(self):
        super().__init__()
        self.save_hyperparameters(args)
        ds_fac = np.prod(np.array(args.ds_factors)) * 4
        self.model = SoundNet(
                nf=args.nf,
                dim_feedforward=args.dim_feedforward,
                clip_length=args.seq_len // ds_fac,
                embed_dim=args.emb_dim,
                n_layers=args.n_layers,
                nhead=args.n_head,
                n_classes=args.n_classes,
                factors=args.ds_factors,
                )

    def forward(self, x):
        return self.model(x)
    
    def training_step(self, batch, batch_idx):
        x = batch['audio']
        y = batch['label']
        x, targets, is_mixed = batch_augs(x, y) # TODO: removed epoch parameter
        pred = self(x)
        if is_mixed:
            loss_cls = batch_augs.mix_loss(pred, targets, n_classes=args.n_classes,
            pred_one_hot=args.multilabel)
        else:
            loss_cls = criterion(pred, y)
        self.log('loss_cls', loss_cls)
        return loss_cls
    
    def test_step(self, batch, batch_idx):
        x = batch['audio']
        y = batch['label']
        pred = self(x)
        loss_cls = criterion(pred, y)
        self.log('test_loss', loss_cls)
        return loss_cls
    
    # def validation_step(self, batch, batch_idx):
    #     loss = self.training_step(batch, batch_idx)
    #     self.log('val_loss', loss)
    #     return loss
    
    def configure_optimizers(self):
        if args.amp:
            from torch.cuda.amp import GradScaler
            scaler = GradScaler(init_scale=2**10)
            eps = 1e-4
        else:
            scaler = None
            eps = 1e-8
        parameters = self.model.parameters()
        return torch.optim.AdamW(parameters,
                            lr=args.max_lr,
                            betas=[0.9, 0.99],
                            weight_decay=0,
                            eps=eps)
model = EAT()


In [21]:
# get one sample from datamodule
# datamodule.prepare_data()
datamodule.setup(stage='fit')
sample = next(iter(datamodule.train_dataloader()))
x = sample['audio']
print(x.shape)
model(x)

torch.Size([128, 1, 90112])


tensor([[ 0.7032,  0.2245, -0.4650,  ...,  0.1418,  0.3108, -0.3904],
        [ 0.3579, -1.0132, -0.0413,  ...,  0.1003, -0.0456,  0.0107],
        [ 1.0528,  0.5265, -0.7309,  ..., -0.0906,  0.1717, -0.2026],
        ...,
        [ 0.9733, -0.1774, -0.4244,  ..., -0.2977, -0.0049, -0.0283],
        [ 0.6691, -0.7178,  0.3668,  ..., -0.0950, -0.1738,  0.5077],
        [ 0.1911, -0.3720,  0.2180,  ...,  0.2844,  0.6657, -0.1125]],
       grad_fn=<AddmmBackward0>)

## Train

In [11]:
from lightning import Trainer


trainer = L.Trainer(max_epochs=2, accelerator='gpu', devices=args.gpus) # set devices to a list of GPU ids to train on

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [12]:

# start training 
trainer.fit(model, datamodule=datamodule)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2]

  | Name  | Type        | Params
--------------------------------------
0 | model | SoundNetRaw | 5.2 M 
--------------------------------------
5.2 M     Trainable params
0         Non-trainable params
5.2 M     Total params
20.722    Total estimated model params size (MB)


Epoch 0:   0%|          | 0/12 [00:00<?, ?it/s] 

RuntimeError: Caught RuntimeError in DataLoader worker process 0.
Original Traceback (most recent call last):
  File "/usr/local/lib/python3.9/dist-packages/torch/utils/data/_utils/worker.py", line 302, in _worker_loop
    data = fetcher.fetch(index)
  File "/usr/local/lib/python3.9/dist-packages/torch/utils/data/_utils/fetch.py", line 49, in fetch
    data = [self.dataset[idx] for idx in possibly_batched_index]
  File "/usr/local/lib/python3.9/dist-packages/torch/utils/data/_utils/fetch.py", line 49, in <listcomp>
    data = [self.dataset[idx] for idx in possibly_batched_index]
  File "/usr/local/lib/python3.9/dist-packages/datasets/arrow_dataset.py", line 2803, in __getitem__
    return self._getitem(key)
  File "/usr/local/lib/python3.9/dist-packages/datasets/arrow_dataset.py", line 2788, in _getitem
    formatted_output = format_table(
  File "/usr/local/lib/python3.9/dist-packages/datasets/formatting/formatting.py", line 629, in format_table
    return formatter(pa_table, query_type=query_type)
  File "/usr/local/lib/python3.9/dist-packages/datasets/formatting/formatting.py", line 396, in __call__
    return self.format_row(pa_table)
  File "/usr/local/lib/python3.9/dist-packages/datasets/formatting/formatting.py", line 485, in format_row
    formatted_batch = self.format_batch(pa_table)
  File "/usr/local/lib/python3.9/dist-packages/datasets/formatting/formatting.py", line 515, in format_batch
    return self.transform(batch)
  File "/tmp/ipykernel_634798/3357819279.py", line 19, in train_transform
    batch['audio'] = AudioAugs(transforms, args.sampling_rate, p=1)(audio)
  File "/workspace/datasets/audio_augs.py", line 617, in __call__
    sample = self.augs[aug](sample)
  File "/workspace/datasets/audio_augs.py", line 133, in __call__
    sample = torch.cat((pad, sample[:-int_d]), dim=-1)
RuntimeError: Tensors must have same number of dimensions: got 1 and 2


## Test
Test the model on the test set.

In [13]:
trainer.test(model, datamodule=datamodule)

Repo card metadata block was not found. Setting CardData to empty.
Saving the dataset (3/3 shards): 100%|██████████| 2000/2000 [00:03<00:00, 612.89 examples/s]
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2]


Testing DataLoader 0: 100%|██████████| 3/3 [00:00<00:00, 15.73it/s]


[{'test_loss': 513.25830078125}]