In [1]:
import os
import sys
# change directory to parent directory
os.chdir(os.path.dirname(os.getcwd()))

# print current working directory
print(os.getcwd())



/workspaces/E2E-AudioClassfication


In [2]:
import torch
# reset gpu memory
torch.cuda.empty_cache()

## Paramerers

In [3]:
import argparse
from pathlib import Path


parser = argparse.ArgumentParser()
'''train'''
parser.add_argument("--max_lr", default=3e-4, type=float)
parser.add_argument("--wd", default=1e-5, type=float)
parser.add_argument("--batch_size", default=128, type=int)
parser.add_argument("--run_name", default=None, type=Path)
parser.add_argument('--loss_type', default="label_smooth", type=str)
parser.add_argument('--n_epochs', default=None, type=int)
parser.add_argument('--epoch_mix', default=None, type=int)
parser.add_argument("--amp", action='store_true')
parser.add_argument("--filter_bias_and_bn", action='store_true', default=True)
parser.add_argument("--ext_pretrained", default=None, type=str)
parser.add_argument("--multilabel", action='store_true')
parser.add_argument('--save_path', default=None, type=Path)
parser.add_argument('--load_path', default=None, type=Path)
parser.add_argument('--scheduler', default=None, type=str)
parser.add_argument('--augs_signal', nargs='+', type=str,
                    default=['amp', 'neg', 'tshift', 'tmask', 'ampsegment', 'cycshift'])
parser.add_argument('--augs_noise', nargs='+', type=str,
                    default=['awgn', 'abgn', 'apgn', 'argn', 'avgn', 'aun', 'phn', 'sine'])
# parser.add_argument('--augs_signal', nargs='+', type=str,
#                     default=[])
# parser.add_argument('--augs_noise', nargs='+', type=str,
#                     default=[])
parser.add_argument('--augs_mix', nargs='+', type=str, default=['mixup', 'timemix', 'freqmix', 'phmix'])
parser.add_argument('--mix_loss', default='bce', type=str)
parser.add_argument('--mix_ratio', default=1, type=float)
parser.add_argument('--ema', default=0.995, type=float)
parser.add_argument('--log_interval', default=100, type=int)
parser.add_argument("--kd_model", default=None, type=Path)
parser.add_argument("--use_bg", action='store_true', default=False)
parser.add_argument("--resume_training", action='store_true', default=False)
parser.add_argument("--use_balanced_sampler", action='store_true', default=False)
'''common'''
parser.add_argument('--local_rank', default=0, type=int)
parser.add_argument('--gpu_ids', nargs='+', default=[0])
parser.add_argument("--use_ddp", action='store_true')
parser.add_argument("--use_dp", action='store_true')
parser.add_argument('--save_interval', default=100, type=int)
'''data'''
parser.add_argument('--fold_id', default=1, type=int)
parser.add_argument("--data_subtype", default='balanced', type=str)
parser.add_argument('--seq_len', default=90112, type=int)
parser.add_argument('--dataset', default="esc50", type=str)
parser.add_argument('--n_classes', default=50, type=int)
'''net'''
parser.add_argument('--ds_factors', nargs='+', type=int, default=[4, 4, 4, 4])
parser.add_argument('--n_head', default=8, type=int)
parser.add_argument('--n_layers', default=4, type=int)
parser.add_argument("--emb_dim", default=128, type=int)
parser.add_argument("--model_type", default='SoundNetRaw', type=str)
parser.add_argument("--nf", default=16, type=int)
parser.add_argument("--dim_feedforward", default=512, type=int)
parser.add_argument("--sampling_rate", default=22050, type=int)
'''system'''
parser.add_argument('--data_dir', default='data/', type=Path)
parser.add_argument('--data_path', default='data/ESC-50-master', type=str)

parser.add_argument('--gpus', type=list, default=[0])
parser.add_argument('--num_workers', type=int, default=30)
args = parser.parse_args(args=[])


## Data

### ESC50 Dataset
The ESC-50 dataset is a labeled collection of 2000 environmental audio recordings suitable for benchmarking methods of environmental sound classification.

The dataset consists of 5-second-long recordings organized into 50 semantical classes.
[Github](https://github.com/karolpiczak/ESC-50)

[Huggingface](https://huggingface.co/datasets/ashraq/esc50) "ashraq/esc50"

### Preprocess
Prepare your dataset. All code that only need to be executed ones before training and results fit on disk.

In [4]:
import random
import os
from datasets import load_dataset, Audio

def preprocess_audio(example):
    audio = example['audio']
    audio = audio['array']
    if audio.shape[0] >= args.seq_len:
        max_audio_start = audio.shape[0] - args.seq_len
        audio_start = random.randint(0, max_audio_start)
        audio = audio[audio_start : audio_start + args.seq_len]
    else:
        audio = F.pad(
            audio, (0, args.seq_len - audio.size(0)), "constant"
        ).data
    example['audio'] = audio
    return example

esc50 = load_dataset("ashraq/esc50", cache_dir=args.data_dir)
esc50 = esc50.cast_column("audio", Audio(sampling_rate=args.sampling_rate))
# rename column target to label
esc50 = esc50.rename_column("target", "label")
esc50 = esc50.map(preprocess_audio)
esc50.save_to_disk(os.path.join(args.data_dir, 'esc50processed'))

Repo card metadata block was not found. Setting CardData to empty.


Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Saving the dataset (0/3 shards):   0%|          | 0/2000 [00:00<?, ? examples/s]

### Setup Datamodule

In [5]:
import os
import lightning as L
import torch
from torch.utils.data import random_split, DataLoader
from datasets import load_dataset, load_from_disk
from datasets import Audio


import torch.nn.functional as F

from model.augmentations.audio_augs import AudioAugs



def train_transform(batch):
    transforms = args.augs_signal + args.augs_noise
    audio = torch.Tensor(batch['audio'])
    if transforms is not None:
        # check if audio has more then 1 dimension
        if len(audio.shape) > 1:
            # iterate over all dimensions
            for i in range(audio.shape[0]):
                audio[i] = AudioAugs(transforms, args.sampling_rate, p=0.5)(audio[i])
        else:
            batch['audio'] = AudioAugs(transforms, args.sampling_rate, p=0.5)(audio)      
    batch['audio'] = audio.unsqueeze(1)
    batch['label'] = torch.Tensor(batch['label']).long()
    return batch

def gpu_transforms(batch):
    transforms = args.augs_signal + args.augs_noise
    audio = batch['audio']
    if transforms is not None:
        # check if audio has more then 1 dimension
        if len(audio.shape) > 1:
            # iterate over all dimensions
            for i in range(audio.shape[0]):
                audio[i] = AudioAugs(transforms, args.sampling_rate, p=0.5)(audio[i][0])
        else:
            batch['audio'] = AudioAugs(transforms, args.sampling_rate, p=0.5)(audio)      
    batch['audio'] = audio
    return batch
    
def test_transform(batch):
    batch['audio'] = torch.Tensor(batch['audio']).unsqueeze(1)
    batch['label'] = torch.Tensor(batch['label']).long()
    return batch

class ESC50DataModule(L.LightningDataModule):
    def __init__(self, data_dir: Path = args.data_dir):
        super().__init__()
        self.data_dir = data_dir
        
    # called only within a single process on CPU but everytime trainer is envoked
    # def prepare_data(self):

    # run on each GPU
    def setup(self, stage: str):
        # load from disk
        esc50 = load_from_disk(os.path.join(self.data_dir, 'esc50processed'))
        esc50 = esc50.remove_columns(['filename', 'fold', 'category', 'esc10', 'src_file', 'take'])
        # split into train, val, test
        esc50 = esc50['train'].train_test_split(test_size=0.2, shuffle=True)
        self.dataset_train = esc50['train']
        # self.dataset_train = esc50['train'].with_format('torch', columns=['audio', 'label'])
        self.dataset_test = esc50['test'].with_format('torch', columns=['audio', 'label'])

    def train_dataloader(self):
        self.dataset_train.set_transform(train_transform)
        # self.dataset_train.set_transform(test_transform)
        return DataLoader(self.dataset_train, batch_size=args.batch_size,
        num_workers=args.num_workers,
        persistent_workers=True,
        pin_memory=True,
        shuffle=True,
        drop_last=True,
        # generator=torch.Generator(device='cuda')
        )


    def test_dataloader(self):
        self.dataset_test.set_transform(test_transform)
        return DataLoader(self.dataset_test, batch_size=args.batch_size, num_workers=args.num_workers,
        persistent_workers=True,
        pin_memory=True,
        shuffle=False,
        drop_last=True,
        )
    
    # def on_after_batch_transfer(self, batch, dataloader_idx):
    #     if self.trainer.training:
    #         batch = gpu_transforms(batch)
    #     return batch

datamodule = ESC50DataModule()

### Playground code


In [14]:
from datasets import load_dataset
from datasets import Audio
esc50 = load_dataset("ashraq/esc50")
esc50 = esc50.cast_column("audio", Audio(sampling_rate=args.sampling_rate))
# split into train, val, test
esc50 = esc50['train'].train_test_split(test_size=0.2, shuffle=True)
esc50


Repo card metadata block was not found. Setting CardData to empty.


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/387M [00:00<?, ?B/s]

KeyboardInterrupt: 

In [6]:
datamodule.prepare_data()
datamodule.setup(stage='fit')

## Model

In [6]:
import torch
from model.augmentations.batch_augs import BatchAugs
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


ba_params = {
        'seq_len': args.seq_len,
        'fs': args.sampling_rate,
        'augs': args.augs_mix,
        'device': device,
        'mix_ratio': args.mix_ratio,
        'batch_sz': args.local_rank,
        'epoch_mix': args.epoch_mix,
        'resample_factors': [0.8, 0.9, 1.1, 1.2],
        'multilabel': True if args.multilabel else False,
        'mix_loss': args.mix_loss
    }
batch_augs = BatchAugs(ba_params)

In [7]:
import torch.nn as nn
#####################
# losses            #
#####################
if args.loss_type == "label_smooth":
    from model.losses import LabelSmoothCrossEntropyLoss
    criterion = LabelSmoothCrossEntropyLoss(smoothing=0.1, reduction='sum')
elif args.loss_type == "cross_entropy":
    criterion = nn.CrossEntropyLoss(reduction='sum')
elif args.loss_type == "focal":
    from model.losses import FocalLoss
    criterion = FocalLoss()
elif args.loss_type == 'bce':
    criterion = nn.BCEWithLogitsLoss(reduction='sum')
else:
    raise ValueError

In [8]:
from lightning.pytorch.utilities.types import STEP_OUTPUT
import numpy as np
import torch, torch.nn as nn
import lightning as L
from model.soundnet import SoundNetRaw as SoundNet
from utils.helper_funcs import accuracy

class EAT(L.LightningModule):
    def __init__(self):
        super().__init__()
        self.save_hyperparameters(args)
        ds_fac = np.prod(np.array(args.ds_factors)) * 4
        self.model = SoundNet(
                nf=args.nf,
                dim_feedforward=args.dim_feedforward,
                clip_length=args.seq_len // ds_fac,
                embed_dim=args.emb_dim,
                n_layers=args.n_layers,
                nhead=args.n_head,
                n_classes=args.n_classes,
                factors=args.ds_factors,
                )

    def forward(self, x):
        return self.model(x)

    def augment_audio(audio):
        transforms = args.augs_signal + args.augs_noise
        for i in range(audio.shape[0]):
            audio[i] = AudioAugs(transforms, args.sampling_rate, p=0.5)(audio[i])
        return audio
    
    def training_step(self, batch, batch_idx):
        x = batch['audio']
        y = batch['label']
        # augment x
        # x = augment_audio(x)
        x, targets, is_mixed = batch_augs(x, y) # TODO: removed epoch parameter
        pred = self(x)
        if is_mixed:
            loss_cls = batch_augs.mix_loss(pred, targets, n_classes=args.n_classes,
            pred_one_hot=args.multilabel)
        else:
            loss_cls = criterion(pred, y)
        self.log('loss_cls', loss_cls)
        return loss_cls
    
    def test_step(self, batch, batch_idx):
        x = batch['audio']
        y = batch['label']
        pred = self(x)
        loss_cls = criterion(pred, y)
        acc = accuracy(pred, y, topk=(1,))[0]
        self.log('acc', acc)
        self.log('test_loss', loss_cls)
        return loss_cls
    
    # def validation_step(self, batch, batch_idx):
    #     loss = self.training_step(batch, batch_idx)
    #     self.log('val_loss', loss)
    #     return loss
    
    def configure_optimizers(self):
        if args.amp:
            from torch.cuda.amp import GradScaler
            scaler = GradScaler(init_scale=2**10)
            eps = 1e-4
        else:
            scaler = None
            eps = 1e-8
        parameters = self.model.parameters()
        return torch.optim.AdamW(parameters,
                            lr=args.max_lr,
                            betas=[0.9, 0.99],
                            weight_decay=0,
                            eps=eps)
model = EAT()


In [19]:
# get one sample from datamodule
# datamodule.prepare_data()
datamodule.setup(stage='fit')
sample = next(iter(datamodule.train_dataloader()))
x = sample['audio']
y = sample['label']
print(x.shape)
print(y.shape)
pred = model(x)
print(pred.shape)

  table = cls._concat_blocks(blocks, axis=0)


torch.Size([128, 1, 90112])
torch.Size([128])
torch.Size([128, 50])


## Train

In [9]:
from lightning import Trainer

trainer = L.Trainer(max_epochs=10, accelerator='gpu', devices=args.gpus, log_every_n_steps=10) # set devices to a list of GPU ids to train on

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
/root/.cache/pypoetry/virtualenvs/e2e-audioclassfication-sMlWUVdH-py3.10/lib/python3.10/site-packages/lightning/pytorch/trainer/connectors/logger_connector/logger_connector.py:67: Starting from v1.9.0, `tensorboardX` has been removed as a dependency of the `lightning.pytorch` package, due to potential conflicts with other packages in the ML ecosystem. For this reason, `logger=True` will use `CSVLogger` as the default logger, unless the `tensorboard` or `tensorboardX` packages are found. Please `pip install lightning[extra]` or one of them to enable TensorBoard support by default


In [10]:
# start training 
trainer.fit(model, datamodule=datamodule)

You are using a CUDA device ('NVIDIA A100 80GB PCIe') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
  table = cls._concat_blocks(blocks, axis=0)
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2]

  | Name  | Type        | Params
--------------------------------------
0 | model | SoundNetRaw | 5.2 M 
--------------------------------------
5.2 M     Trainable params
0         Non-trainable params
5.2 M     Total params
20.722    Total estimated model params size (MB)


Training: |          | 0/? [00:00<?, ?it/s]

RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:1 and cuda:0! (when checking argument for argument weight in method wrapper_CUDA__cudnn_convolution)

## Test
Test the model on the test set.

In [14]:
trainer.test(model, datamodule=datamodule)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2]


Testing DataLoader 0: 100%|██████████| 3/3 [00:00<00:00, 13.89it/s]


[{'acc': 18.22916603088379, 'test_loss': 463.3260803222656}]