In [1]:
import warnings
warnings.filterwarnings("ignore")

# some basic libraries
import os
import numpy as np
from numba import cuda 
import tensorflow as tf
from numpy.lib.function_base import average
from platform import python_version

# ipython display
from IPython.core.display import display

# pytorch lightning
import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint

# pytorch
import torch
from torch.utils.data import DataLoader, random_split, sampler
from torch.utils.data import Dataset
from torchlibrosa.stft import STFT, ISTFT, magphase

from keras.utils import dataset_utils

# sklearn machine learning library
from sklearn.metrics import average_precision_score, roc_auc_score, accuracy_score

from htsat_utils import do_mixup, get_mix_lambda, do_mixup_label
from htsat_utils import get_loss_func, d_prime, float32_to_int16

# import the HTSAT model
from htsat_model import HTSAT_Swin_Transformer 

# import project echo modules
import baseline_config
import echo_data_module
import echo_module

import ipywidgets as widgets
# Instantiate (and immediately discard) a slider widget; nothing below reads it.
# NOTE(review): presumably a leftover check that ipywidgets loads — confirm before removing.
widgets.IntSlider()

# report the interpreter and framework versions used for this run
for component, version in (
    ('Python Version     : ', python_version()),
    ('TensorFlow Version : ', tf.__version__),
    ('Pytorch Version    : ', torch.__version__),
):
    print(component, version)

Python Version     :  3.9.15
TensorFlow Version :  2.10.1
Pytorch Version    :  2.0.0+cu117


In [2]:
def paths_and_labels_to_dataset(image_paths,labels,num_classes):
    """Pair the raw contents of every file with its label.

    Args:
        image_paths: file paths; each file is read as-is with ``tf.io.read_file``
            (no decoding happens here).
        labels: per-file labels, handed to Keras' ``labels_to_dataset`` in
            'categorical' mode.
        num_classes: number of classes, forwarded to ``labels_to_dataset``.

    Returns:
        A ``tf.data.Dataset`` yielding ``(file_contents, label)`` pairs, zipped in
        the order of ``image_paths`` / ``labels``.
    """
    def _read_bytes(path):
        # load the whole file as a scalar string tensor
        return tf.io.read_file(path)

    contents = tf.data.Dataset.from_tensor_slices(image_paths).map(
        _read_bytes, num_parallel_calls=tf.data.AUTOTUNE)
    targets = dataset_utils.labels_to_dataset(labels, 'categorical', num_classes)
    return tf.data.Dataset.zip((contents, targets))

def create_dataset(subset):
    """Index one split directory and build its (file contents, label) dataset.

    Args:
        subset: name of the split folder under ``baseline_config.dataset_path``,
            e.g. ``'TRAIN/'``.

    Returns:
        ``(dataset, class_names)`` where ``dataset`` comes from
        ``paths_and_labels_to_dataset`` and ``class_names`` is the list of classes
        the Keras indexer inferred for this split.
    """
    # os.path.join works whether or not dataset_path ends with a separator;
    # plain string concatenation silently produced a wrong path when it did not.
    split_dir = os.path.join(baseline_config.dataset_path, subset)

    image_paths, labels, class_names = dataset_utils.index_directory(
            split_dir,
            labels="inferred",
            # ('.pt') is just the string '.pt'; the trailing comma makes it the
            # intended one-element tuple of allowed extensions.
            formats=('.pt',),
            class_names=None,
            shuffle=False,
            seed=42,
            follow_links=False)

    # NOTE(review): class names are inferred per call, so label indices only line up
    # across splits when every split contains the same class folders — confirm.
    dataset = paths_and_labels_to_dataset(
        image_paths=image_paths,
        labels=labels,
        num_classes=len(class_names))

    return dataset, class_names

# Build one dataset per split. The training split's class list is the reference.
train_dataset, class_names = create_dataset('TRAIN/')
test_dataset, test_class_names = create_dataset('TEST/')
validation_dataset, validation_class_names = create_dataset('VALIDATION/')

# Each split infers its own class list, and both the label index and the label width
# (num_classes) are derived from it. A class folder missing from one split would
# therefore silently mislabel that split, so fail loudly instead.
for split_name, split_class_names in (
    ('TEST/', test_class_names),
    ('VALIDATION/', validation_class_names),
):
    if list(split_class_names) != list(class_names):
        raise ValueError(
            f"class names of split {split_name!r} ({list(split_class_names)}) "
            f"do not match the TRAIN/ split ({list(class_names)})"
        )

print("class names: ", class_names)

Found 12384 files belonging to 5 classes.
Found 54 files belonging to 5 classes.
Found 817 files belonging to 5 classes.
class names:  ['brant', 'jabwar', 'sheowl', 'spodov', 'wiltur']


In [3]:
def dataset_transforms(image,label):
    """Decode one serialized tensor and min-max scale it.

    Args:
        image: scalar string tensor holding a serialized float32 tensor, as consumed
            by ``tf.io.parse_tensor``.
        label: label tensor; passed through unchanged.

    Returns:
        ``(image, label)`` where ``image`` has static shape ``[216, 128, 1]`` and
        values scaled to roughly ``[0, 1]``.
    """
    # decode, then append a single trailing channel axis
    decoded = tf.io.parse_tensor(image, tf.float32)
    decoded = tf.ensure_shape(tf.expand_dims(decoded, -1), [216, 128, 1])

    # shift so the minimum is 0, then divide by the new maximum;
    # epsilon keeps the division finite when the input is constant
    shifted = decoded - tf.reduce_min(decoded)
    scaled = shifted / (tf.reduce_max(shifted) + tf.keras.backend.epsilon())

    return scaled, label
  
def _prepare_split(dataset, shuffle_buffer=None):
    """Decode/normalise a split, cache the result, and optionally shuffle it.

    Args:
        dataset: ``tf.data.Dataset`` of ``(file_contents, label)`` pairs.
        shuffle_buffer: shuffle buffer size, or ``None`` to keep the stored order.

    Returns:
        The transformed (and possibly shuffled) ``tf.data.Dataset``.
    """
    prepared = (
        dataset
        # parallel map keeps element order by default and matches the file loader
        .map(dataset_transforms, num_parallel_calls=tf.data.AUTOTUNE)
        # Cache BEFORE shuffling: with shuffle first, the quick take(1) probe below
        # has to fill the shuffle buffer straight from disk and then abandons a
        # partially written cache, so the files are read again on the next pass.
        .cache()
    )
    if shuffle_buffer is not None:
        prepared = prepared.shuffle(shuffle_buffer)
    return prepared


train_dataset_b = _prepare_split(train_dataset, shuffle_buffer=20000)
validation_dataset_b = _prepare_split(validation_dataset)
test_dataset_b = _prepare_split(test_dataset)

# sanity check: show the shape of one transformed sample and its label
for item,lbl in train_dataset_b.take(1):
    print(item.shape, lbl.shape)

(216, 128, 1) (5,)


In [4]:
class EchoDatasetMelspec(Dataset):
    """In-memory PyTorch dataset built from an iterable of (melspec, label) pairs.

    Both members of every pair must expose ``.numpy()``; all pairs are converted
    once, at construction time, and kept in a Python list.
    """

    def __init__(self, dataset):
        # keep a handle on the source iterable
        self.dataset = dataset

        # just load it all into RAM for now
        self.audio_dataset = [
            (features.numpy(), label.numpy()) for features, label in dataset
        ]

    def shuffle_dataset(self):
        """Placeholder: no shuffling is performed, only an empty line is printed."""
        print("")

    def __getitem__(self, index):
        """Return the sample stored at ``index``.

        Args:
            index: position in the in-memory sample list
        Return: {
            "filename": always the placeholder "n/a",
            "melspec": stored array with its last axis moved to the front
                       (T F C -> C T F),
            "target": index of the largest entry of the stored label
        }
        """
        features, label_vector = self.audio_dataset[index]

        return {
            "filename": "n/a",
            "melspec": np.transpose(features, (2, 0, 1)),  # T F C -> C T F
            "target": np.argmax(label_vector),
        }

    def __len__(self):
        """Number of samples held in memory."""
        return len(self.audio_dataset)

In [5]:
# Materialise the TensorFlow pipelines as in-memory PyTorch datasets.
train_dataset = EchoDatasetMelspec(train_dataset_b)
validation_dataset = EchoDatasetMelspec(validation_dataset_b)

# don't need tensorflow anymore, need to free memory so pytorch can use it
# Only touch the device when CUDA is actually present: on a CPU-only machine
# get_current_device() raises and would abort the notebook, even though the
# trainer setup later on copes with zero GPUs.
device = cuda.get_current_device() if cuda.is_available() else None
if device is not None:
    device.reset()

In [6]:
# Create the output folder for saved models; exist_ok makes this a no-op when the
# folder is already there and avoids the check-then-create race of exists()+mkdir().
os.makedirs('models/', exist_ok=True)
    
# number of CUDA devices visible to PyTorch
device_num = torch.cuda.device_count()

# data module built from the in-memory training / validation datasets
audio_pipeline = echo_data_module.EchoDataModule(train_dataset, validation_dataset, device_num)

# keep the five best snapshots, ranked by the highest logged "acc" value
# NOTE(review): weights-only checkpoints omit optimizer/trainer state — verify that
# resuming from one of them (see checkpoint_resume) behaves as intended.
checkpoint_options = dict(
    dirpath='checkpoints/',
    monitor="acc",
    filename='l-{epoch:d}-{acc:.3f}',
    save_top_k=5,
    mode="max",
    save_weights_only=True,
)
checkpoint_callback = ModelCheckpoint(**checkpoint_options)

# checkpoint to continue training from; None starts from scratch
checkpoint_resume = None # 'checkpoints/l-epoch=73-acc=0.784.ckpt'

# NOTE(review): gpus, auto_lr_find, resume_from_checkpoint, replace_sampler_ddp and
# accelerator="ddp" look like legacy Trainer arguments — confirm they are still
# accepted by the installed PyTorch Lightning version.
trainer_options = dict(
    deterministic=False,
    default_root_dir=baseline_config.workspace,
    gpus=device_num,
    val_check_interval=0.1,
    max_epochs=baseline_config.max_epoch,
    auto_lr_find=True,
    sync_batchnorm=True,
    callbacks=[checkpoint_callback],
    # distributed data parallel only when more than one GPU is present
    accelerator="ddp" if device_num > 1 else None,
    num_sanity_val_steps=0,
    resume_from_checkpoint=checkpoint_resume,
    replace_sampler_ddp=False,
    gradient_clip_val=1.0,
)
trainer = pl.Trainer(**trainer_options)

# HTS-AT Swin Transformer; every hyper-parameter is read from baseline_config
htsat_options = dict(
    spec_size=baseline_config.htsat_spec_size,
    patch_size=baseline_config.htsat_patch_size,
    in_chans=1,
    num_classes=baseline_config.classes_num,
    window_size=baseline_config.htsat_window_size,
    config=baseline_config,
    depths=baseline_config.htsat_depth,
    embed_dim=baseline_config.htsat_dim,
    patch_stride=baseline_config.htsat_stride,
    num_heads=baseline_config.htsat_num_head,
)
sed_model = HTSAT_Swin_Transformer(**htsat_options)

# wrap the network in the project module that is handed to the trainer;
# it also receives the validation dataset
model = echo_module.EchoModule(
    sed_model=sed_model,
    config=baseline_config,
    dataset=validation_dataset,
)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [7]:
# train the model
# Hands the wrapped model and the data pipeline built above to the Lightning trainer;
# checkpoints are written by the ModelCheckpoint callback registered on the trainer.
trainer.fit(model, audio_pipeline)

You are using a CUDA device ('NVIDIA GeForce RTX 3090') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
Missing logger folder: .\lightning_logs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type                   | Params
-----------------------------------------------------
0 | sed_model | HTSAT_Swin_Transformer | 27.8 M
-----------------------------------------------------
27.5 M    Trainable params
296 K     Non-trainable params
27.8 M    Total params
111.369   Total estimated model params size (MB)


Training: 0it [00:00, ?it/s]

In [8]:
# save the trained model
model_path = 'models/baseline_htsat_model.pt'
# exist_ok avoids the check-then-create race of exists() followed by mkdir()
os.makedirs(os.path.dirname(model_path), exist_ok=True)
torch.save(model.state_dict(), model_path)

# example test load
# NOTE(review): test_model is built around the same sed_model object that was just
# trained, so this presumably does not prove the file restores a fresh network —
# construct a new HTSAT_Swin_Transformer here for a real round-trip check.
test_model = echo_module.EchoModule(
        sed_model = sed_model,
        config = baseline_config,
        dataset = validation_dataset)
test_model.load_state_dict(torch.load(model_path))
# switch to inference mode; as the last expression this also displays the module
test_model.eval()

EchoModule(
  (sed_model): HTSAT_Swin_Transformer(
    (spectrogram_extractor): Spectrogram(
      (stft): STFT(
        (conv_real): Conv1d(1, 257, kernel_size=(512,), stride=(512,), bias=False)
        (conv_imag): Conv1d(1, 257, kernel_size=(512,), stride=(512,), bias=False)
      )
    )
    (logmel_extractor): LogmelFilterBank()
    (spec_augmenter): SpecAugmentation(
      (time_dropper): DropStripes()
      (freq_dropper): DropStripes()
    )
    (bn0): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (patch_embed): PatchEmbed(
      (proj): Conv2d(1, 96, kernel_size=(4, 4), stride=(4, 4))
      (norm): LayerNorm((96,), eps=1e-05, elementwise_affine=True)
    )
    (pos_drop): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0): BasicLayer(
        dim=96, input_resolution=(128, 128), depth=2
        (blocks): ModuleList(
          (0): SwinTransformerBlock(
            dim=96, input_resolution=(128, 128), num_heads=4, window_siz