In [None]:
import pandas as pd
import numpy as np
import torch
from torch import nn
import torchvision
import torchvision.transforms as transforms
import torch.nn.functional as F
from torch.utils.data import DataLoader, random_split
import pytorch_lightning as pl
from pytorch_lightning.loggers import WandbLogger
import torchmetrics
import wandb
import gc
import math

from dataset import CoughDataset

In [None]:
#@title Transformer implementation. Make sure to run the cell. Also, we recommend you do this after getting your SelfAttention implementation working
# WRITTEN BY Holger Severin Bovbjerg <hsbo@es.aau.dk>
# Adapted by Sarthak Yadav <sarthaky@es.aau.dk> for this assignment
# KWT model based on the model from https://github.com/ID56/Torch-KWT/blob/main/models/kwt.py

from einops import rearrange, repeat
from einops.layers.torch import Rearrange
import torch
import torch.fft
from torch import nn, einsum

# Basically vision transformer, ViT that accepts MFCC + SpecAug. Refer to:
# https://github.com/lucidrains/vit-pytorch/blob/main/vit_pytorch/vit.py


class PreNorm(nn.Module):
    """
    Wrapper that layer-normalises its input before handing it to another callable
    """
    def __init__(self, dim, fn):
        """
        Builds the wrapper
        :param dim: size passed to nn.LayerNorm as the normalised shape
        :param fn: callable (typically a torch module) applied to the normalised tensor
        """
        super().__init__()
        self.norm = nn.LayerNorm(dim)
        self.fn = fn

    def forward(self, x, **kwargs):
        """
        Normalises the input and then applies the wrapped callable
        :param x: input tensor
        :param kwargs: extra keyword arguments passed through to the wrapped callable
        :return: result of the wrapped callable on the normalised input
        """
        normalised = self.norm(x)
        return self.fn(normalised, **kwargs)


class PostNorm(nn.Module):
    """
    Wrapper that layer-normalises the output of another callable
    """
    def __init__(self, dim, fn):
        """
        Builds the wrapper
        :param dim: size passed to nn.LayerNorm as the normalised shape
        :param fn: callable (typically a torch module) whose output is normalised
        """
        super().__init__()
        self.norm = nn.LayerNorm(dim)
        self.fn = fn

    def forward(self, x, **kwargs):
        """
        Applies the wrapped callable and then normalises its result
        :param x: input tensor
        :param kwargs: extra keyword arguments passed through to the wrapped callable
        :return: layer-normalised output of the wrapped callable
        """
        transformed = self.fn(x, **kwargs)
        return self.norm(transformed)


class FeedForward(nn.Module):
    """
    Two-layer position-wise MLP with GELU activation and dropout
    """
    def __init__(self, dim, hidden_dim, dropout=0.):
        """
        Builds the MLP
        :param dim: input and output feature size
        :param hidden_dim: feature size between the two linear layers
        :param dropout: dropout probability applied after the activation and after the second linear layer
        """
        super().__init__()
        # Layer order is kept fixed so state-dict keys (net.0, net.3) stay stable.
        stages = [
            nn.Linear(dim, hidden_dim),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, dim),
            nn.Dropout(dropout),
        ]
        self.net = nn.Sequential(*stages)

    def forward(self, x):
        """
        Runs the input through the MLP
        :param x: input tensor
        :return: tensor produced by the sequential stack
        """
        out = self.net(x)
        return out


class SHA(nn.Module):
    """
    Single attention head: scaled dot-product attention with dropout on the weights
    """
    def __init__(self, head_dim, attn_drop=0.):
        """
        :param head_dim: per-head feature size; the logits are scaled by head_dim ** -0.5
        :param attn_drop: dropout probability applied to the attention weights
        """
        super().__init__()
        self.head_dim = head_dim
        self.scale = head_dim ** -0.5
        self.attn_drop = nn.Dropout(attn_drop)

    def forward(self, q, k, v):
        """
        :param q: query tensor
        :param k: key tensor
        :param v: value tensor
        :return: attention-weighted combination of the values
        """
        scores = torch.matmul(q, k.transpose(-2, -1)) * self.scale
        # Normalise over the key axis, then drop weights before mixing the values.
        weights = self.attn_drop(torch.softmax(scores, dim=-1))
        return torch.matmul(weights, v)


class MHA(nn.Module):
    """
    Multi-head self-attention built from independent SHA heads
    """
    def __init__(self, dim, heads=8, dim_head=64, dropout=0.):
        """
        Initialises the attention module
        :param dim: feature size of the input (and of the output)
        :param heads: number of attention heads
        :param dim_head: feature size of each head
        :param dropout: dropout probability applied after the output projection
        """
        super().__init__()

        self.num_heads = heads
        self.dim = dim
        self.head_dim = dim_head
        inner_dim = dim_head * heads

        # One bias-free projection produces q, k and v for every head at once.
        self.to_qkv = nn.Linear(dim, inner_dim * 3, bias=False)
        self.attention_heads = nn.ModuleList([
            SHA(self.head_dim) for _ in range(self.num_heads)
        ])
        # A single head whose width already equals dim needs no output projection.
        project_out = not (heads == 1 and dim_head == dim)
        self.scale = dim_head ** -0.5

        self.to_out = nn.Sequential(
                nn.Linear(inner_dim, dim),
                nn.Dropout(dropout)
            ) if project_out else nn.Identity()

    def forward(self, x):
        """
        Forward method for Attention module
        :param x: input tensor of shape (B, N, C)
        :return: Attention module output of shape (B, N, dim), or (B, N, heads * dim_head)
                 when the output projection is the identity
        """

        B, N, _ = x.shape
        qkv = self.to_qkv(x)

        # Split by the configured head width rather than C // num_heads, so the layer
        # also works when heads * dim_head differs from the input feature size.
        qkv = qkv.reshape(B, N, 3, self.num_heads, self.head_dim).permute(2, 3, 0, 1, 4)
        q, k, v = qkv[0], qkv[1], qkv[2]   # make torchscript happy (cannot use tensor as tuple)

        o = []
        for i in range(self.num_heads):
            head_i = self.attention_heads[i](q[i], k[i], v[i]).unsqueeze(0)
            o.append(head_i)
        o = torch.concat(o, dim=0)
        # (heads, B, N, head_dim) -> (B, N, heads * head_dim)
        o = o.permute(1, 2, 0, 3).reshape(B, N, -1)

        return self.to_out(o)


class Transformer(nn.Module):
    """
    Stack of attention + feed-forward layers with residual connections
    """
    def __init__(self, dim, depth, heads, dim_head, mlp_dim, pre_norm=True, dropout=0., mha_block=MHA):
        """
        Builds the layer stack
        :param dim: feature size used throughout the stack
        :param depth: number of (attention, feed-forward) layer pairs
        :param heads: number of attention heads in every layer
        :param dim_head: feature size of each attention head
        :param mlp_dim: hidden size of each FeedForward block
        :param pre_norm: wrap sub-layers in PreNorm when True, PostNorm when False
        :param dropout: dropout probability handed to the attention and FeedForward blocks
        :param mha_block: class used to build the attention sub-layer; None falls back to MHA
        """
        super().__init__()
        norm_wrapper = PreNorm if pre_norm else PostNorm
        attention_cls = MHA if mha_block is None else mha_block

        self.layers = nn.ModuleList()
        for _ in range(depth):
            # Attention is built before the feed-forward block in every layer.
            attention = norm_wrapper(dim, attention_cls(dim, heads=heads, dim_head=dim_head, dropout=dropout))
            feed_forward = norm_wrapper(dim, FeedForward(dim, mlp_dim, dropout=dropout))
            self.layers.append(nn.ModuleList([attention, feed_forward]))

    def forward(self, x):
        """
        Runs the input through every layer
        :param x: input tensor
        :return: Tuple of (final output, list with the output of every layer, list with the
                 post-residual output of every attention sub-layer). Note that the last list
                 holds activations, not attention weight matrices.
        """
        per_layer_outputs = []
        per_layer_attention_outputs = []
        for attention, feed_forward in self.layers:
            attended = attention(x)
            x = attended + x
            per_layer_attention_outputs.append(x)
            transformed = feed_forward(x)
            x = transformed + x
            per_layer_outputs.append(x)
        return x, per_layer_outputs, per_layer_attention_outputs


class KWT(nn.Module):
    """
    Keyword Transformer: a ViT-style classifier operating on patches of a 2-D input
    """
    def __init__(self, input_res, patch_res, num_classes, dim, depth, heads, mlp_dim, pool='cls', channels=1,
                 dim_head=64, dropout=0., emb_dropout=0., pre_norm=True, mha_block=MHA, **kwargs):
        """
        Builds the patch embedding, the transformer encoder and the classification head
        :param input_res: input spectrogram size (two entries)
        :param patch_res: patch size (two entries)
        :param num_classes: number of output classes
        :param dim: transformer feature size
        :param depth: number of transformer layers
        :param heads: number of attention heads
        :param mlp_dim: hidden size of the feed-forward blocks
        :param pool: 'cls' to classify from the CLS token, 'mean' to average all tokens
        :param channels: number of input channels
        :param dim_head: feature size of each attention head
        :param dropout: dropout used inside the transformer layers
        :param emb_dropout: dropout applied to the embedded tokens
        :param pre_norm: PreNorm (True) or PostNorm (False) inside the transformer
        :param mha_block: attention class forwarded to Transformer
        :param kwargs: additional keyword arguments; accepted and ignored
        """
        super().__init__()

        patch_h, patch_w = patch_res[0], patch_res[1]
        patches_along_first_axis = input_res[0] / patch_h
        num_patches = int(patches_along_first_axis * input_res[1] / patch_w)
        patch_dim = channels * patch_h * patch_w

        assert pool in {'cls', 'mean'}, 'pool type must be either cls (cls token) or mean (mean pooling)'

        # Cut the input into patches, flatten each one and project it to the model width.
        patchify = Rearrange('b c (h p1) (w p2) -> b (h w) (p1 p2 c)', p1=patch_h, p2=patch_w)
        self.to_patch_embedding = nn.Sequential(patchify, nn.Linear(patch_dim, dim))

        # One positional slot per patch plus one for the CLS token.
        self.pos_embedding = nn.Parameter(torch.randn(1, num_patches + 1, dim))
        self.cls_token = nn.Parameter(torch.randn(1, 1, dim))
        self.dropout = nn.Dropout(emb_dropout)
        self.mask_embedding = nn.Parameter(torch.FloatTensor(dim).uniform_())
        self.transformer = Transformer(dim, depth, heads, dim_head, mlp_dim, pre_norm, dropout, mha_block=mha_block)

        self.pool = pool
        self.to_latent = nn.Identity()

        # Classification head
        self.mlp_head = nn.Sequential(nn.LayerNorm(dim), nn.Linear(dim, num_classes))

    def forward(self, x, mask=None, output_hidden_states=False, output_attentions=False):
        """
        Classifies the input
        :param x: input tensor
        :param mask: optional index/boolean mask; selected tokens are replaced by the mask embedding
        :param output_hidden_states: also return the per-layer outputs of the transformer
        :param output_attentions: also return the per-layer attention sub-layer outputs
        :return: the class logits; when either output flag is set, a tuple starting with the
                 logits followed by the requested extras (hidden states first, then attentions)
        """
        tokens = self.to_patch_embedding(x)
        batch, n_patches, _ = tokens.shape

        # Prepend one CLS token per sample.
        cls_tokens = repeat(self.cls_token, '() n d -> b n d', b=batch)
        tokens = torch.cat((cls_tokens, tokens), dim=1)

        if mask is not None:
            tokens[mask] = self.mask_embedding

        tokens += self.pos_embedding[:, :(n_patches + 1)]
        tokens = self.dropout(tokens)

        tokens, hidden_states, attentions = self.transformer(tokens)

        if self.pool == 'mean':
            pooled = tokens.mean(dim=1)
        else:
            pooled = tokens[:, 0]

        logits = self.mlp_head(self.to_latent(pooled))

        if not (output_hidden_states or output_attentions):
            return logits

        outputs = (logits, )
        if output_hidden_states:
            outputs = outputs + (hidden_states, )
        if output_attentions:
            outputs = outputs + (attentions, )
        return outputs

In [None]:
class CoughDataModule(pl.LightningDataModule):
    """
    LightningDataModule that wraps a CoughDataset and splits it into train/val/test subsets.

    The dataset is built and split in the constructor, so `train_dataset`,
    `val_dataset` and `test_dataset` are available right after instantiation.
    """
    def __init__(self, 
                 df, 
                 data_path, 
                 batch_size=32, 
                 num_workers=4, 
                 train_size=0.8, 
                 val_size=0.1, 
                 test_size=0.1,
                 duration=10.0,
                 sample_rate=48000,
                 channels=1,
                 n_mels=64,
                 n_fft=1024, 
                 top_db=80):
        """
        :param df: metadata frame forwarded to CoughDataset
        :param data_path: data location forwarded to CoughDataset
        :param batch_size: batch size used by all three dataloaders
        :param num_workers: worker count used by all three dataloaders
        :param train_size: fraction of the dataset used for training
        :param val_size: fraction of the dataset used for validation
        :param test_size: fraction of the dataset used for testing
        :param duration: forwarded unchanged to CoughDataset
        :param sample_rate: forwarded unchanged to CoughDataset
        :param channels: forwarded unchanged to CoughDataset
        :param n_mels: forwarded unchanged to CoughDataset
        :param n_fft: forwarded unchanged to CoughDataset
        :param top_db: forwarded unchanged to CoughDataset
        :raises ValueError: if the three fractions do not add up to 1.0
        """
        super().__init__()
        
        self.df = df
        self.data_path = data_path
        self.batch_size = batch_size
        self.num_workers = num_workers
        
        self.train_size = train_size
        self.val_size = val_size
        self.test_size = test_size
        
        self.duration = duration
        self.sample_rate = sample_rate
        self.channels = channels
        
        self.n_mels = n_mels
        self.n_fft = n_fft
        self.top_db = top_db
        
        # Compare with a tolerance: an exact `!= 1.0` rejects valid splits such as
        # 0.7 / 0.2 / 0.1, whose floating-point sum is 0.9999999999999999.
        total = self.train_size + self.val_size + self.test_size
        if not math.isclose(total, 1.0):
            raise ValueError('train_size + val_size + test_size must be equal to 1.0')
    
        dataset = CoughDataset(df=self.df, 
                            data_path=self.data_path,
                            duration=self.duration,
                            sample_rate=self.sample_rate,
                            channels=self.channels,
                            n_mels=self.n_mels,
                            n_fft=self.n_fft,
                            top_db=self.top_db)

        # random_split draws from the global torch RNG, so seed before constructing
        # this module to get a reproducible split.
        self.train_dataset, self.val_dataset, self.test_dataset = random_split(dataset, [self.train_size, self.val_size, self.test_size])
            
    def train_dataloader(self):
        """Return the shuffled dataloader over the training subset."""
        return DataLoader(self.train_dataset, batch_size=self.batch_size, shuffle=True, num_workers=self.num_workers)
    
    def val_dataloader(self):
        """Return the unshuffled dataloader over the validation subset."""
        return DataLoader(self.val_dataset, batch_size=self.batch_size, shuffle=False, num_workers=self.num_workers)
    
    def test_dataloader(self):
        """Return the unshuffled dataloader over the test subset."""
        return DataLoader(self.test_dataset, batch_size=self.batch_size, shuffle=False, num_workers=self.num_workers)        

In [None]:
# Location handed to CoughDataModule as `data_path`, and the compiled metadata CSV.
DATA_PATH = 'data/'
METADATA_FILE = 'data/metadata_compiled.csv'

# Read the metadata table once; the following cell filters it.
metadata_df = pd.read_csv(filepath_or_buffer=METADATA_FILE)

In [None]:
# Minimum `cough_detected` value a row needs in order to be kept.
# TODO: Set as a hyperparameter
COUGH_DETECTED_THRESHOLD = 0.9

# Keep only rows that carry a `status` label ...
not_nan_df = metadata_df[~metadata_df['status'].isna()]
# ... and whose `cough_detected` value is above the threshold.
filtered_df = not_nan_df[not_nan_df['cough_detected'] > COUGH_DETECTED_THRESHOLD]
filtered_df[['uuid', 'cough_detected', 'SNR', 'age', 'gender', 'status']]

In [None]:
# @title self-attention
from torch.nn.modules.module import T
class SelfAttention(nn.Module):
  """
  Scaled dot-product attention for a single head, with optional masking.
  """
  def __init__(self, head_dim):
    """
    :param head_dim: per-head feature size; `scale` is stored as head_dim ** -0.5
    """
    super().__init__()
    self.head_dim = head_dim
    self.scale = self.head_dim ** -0.5

  def forward(self, q, k, v, mask=None):
    """
    q, k, v are query, key and value tensors, respectively of shapes (B, N, C)
    where B is Batch Size
          N is number of patches
          C is the dimensions

    Steps:

    1. compute the dot product between q and k
    2. scale it by 1 / sqrt(C), with C read from the last axis of q
    3. (optional) push positions where mask == 0 to a very large negative value
    4. apply softmax over the key axis to get weights
    5. combine v with the weights
    """
    feature_dim = q.shape[-1]
    scores = (q @ k.transpose(-2, -1)) / math.sqrt(feature_dim)
    if mask is not None:
      # Masked positions end up with (numerically) zero weight after the softmax.
      scores = scores.masked_fill(mask == 0, -9e15)
    weights = scores.softmax(dim=-1)
    return weights @ v

In [None]:
#@title MultiHeadAttention, already implemented
class MultiHeadAttentionCustom(nn.Module):
    """
    Multi-head self-attention built from independent SelfAttention heads
    """
    def __init__(self, dim, heads=8, dim_head=64, dropout=0.):
        """
        Initialises the attention module
        :param dim: feature size of the input (and of the output)
        :param heads: number of attention heads
        :param dim_head: feature size of each head
        :param dropout: dropout probability applied after the output projection
        """
        super().__init__()

        self.num_heads = heads
        self.dim = dim
        self.head_dim = dim_head
        inner_dim = dim_head * heads

        # One bias-free projection produces q, k and v for every head at once.
        self.to_qkv = nn.Linear(dim, inner_dim * 3, bias=False)
        self.attention_heads = nn.ModuleList([
            SelfAttention(self.head_dim) for _ in range(self.num_heads)
        ])
        # A single head whose width already equals dim needs no output projection.
        project_out = not (heads == 1 and dim_head == dim)
        self.scale = dim_head ** -0.5

        self.to_out = nn.Sequential(
                nn.Linear(inner_dim, dim),
                nn.Dropout(dropout)
            ) if project_out else nn.Identity()

    def forward(self, x):
        """
        Forward method for Attention module
        :param x: input tensor of shape (B, N, C)
        :return: Attention module output of shape (B, N, dim), or (B, N, heads * dim_head)
                 when the output projection is the identity
        """

        B, N, _ = x.shape
        qkv = self.to_qkv(x)

        # Split by the configured head width rather than C // num_heads, so the layer
        # also works when heads * dim_head differs from the input feature size.
        qkv = qkv.reshape(B, N, 3, self.num_heads, self.head_dim).permute(2, 3, 0, 1, 4)
        q, k, v = qkv[0], qkv[1], qkv[2]   # make torchscript happy (cannot use tensor as tuple)

        o = []
        for i in range(self.num_heads):
            head_i = self.attention_heads[i](q[i], k[i], v[i]).unsqueeze(0)
            o.append(head_i)
        o = torch.concat(o, dim=0)
        # (heads, B, N, head_dim) -> (B, N, heads * head_dim)
        o = o.permute(1, 2, 0, 3).reshape(B, N, -1)

        return self.to_out(o)

In [None]:
#@title This is the PyTorch-Lightning Module implementation, which takes care of a lot of boilerplate code related to training for us.
class KWTLightning(pl.LightningModule):
  """
  LightningModule wrapping a KWT classifier trained with cross-entropy and AdamW.
  """
  def __init__(self, hparams, mha_block, lr=1e-3, weight_decay=0.01):
    """
    :param hparams: dict of keyword arguments used to build the KWT model
    :param mha_block: attention class handed to KWT
    :param lr: learning rate for AdamW
    :param weight_decay: weight decay for AdamW
    """
    super().__init__()
    self.model = KWT(**hparams, mha_block=mha_block)
    self.lr = lr
    self.weight_decay = weight_decay
  
  def forward(self, x):
    """Return the model output for a batch of inputs."""
    return self.model(x)
  
  def configure_optimizers(self):
    """Return an AdamW optimizer over all parameters of this module."""
    return torch.optim.AdamW(self.parameters(), lr=self.lr, weight_decay=self.weight_decay)

  def _loss_and_accuracy(self, batch):
    """Compute the cross-entropy loss and batch accuracy for an (inputs, targets) pair."""
    x, y = batch
    preds = self.model(x)
    loss = nn.functional.cross_entropy(preds, y)
    acc = (preds.argmax(dim=-1) == y).float().mean()
    return loss, acc
  
  def training_step(self, train_batch, batch_idx):
    """Log the training loss and epoch-level accuracy; return the loss to optimise."""
    loss, acc = self._loss_and_accuracy(train_batch)

    self.log("train_loss", loss)
    self.log("train_acc", acc, prog_bar=True, on_step=False, on_epoch=True)
    return loss
  
  def validation_step(self, val_batch, batch_idx):
    """Log the validation loss and accuracy."""
    loss, acc = self._loss_and_accuracy(val_batch)
    self.log('val_loss', loss)
    self.log("val_acc", acc, prog_bar=True, on_epoch=True)
  
  def test_step(self, batch, batch_idx):
    """Log the test loss and accuracy."""
    loss, acc = self._loss_and_accuracy(batch)
    self.log('test_loss', loss)
    self.log("test_acc", acc)

def create_pl(mha_block, MODEL_HPARAMS):
  """Build a KWTLightning module from the model hyperparameters and an attention class."""
  return KWTLightning(MODEL_HPARAMS, mha_block)

In [None]:
def train(config=None):
    """
    Run one training + test trial inside a Weights & Biases run.

    Reads `batch_size`, `sample_rate`, `n_fft`, `n_mels` and `max_epochs` from the
    run configuration, builds the data module and the KWT Lightning module, fits
    on the train/validation loaders and evaluates on the test loader.

    :param config: optional configuration passed to `wandb.init`
    :raises Exception: any error raised during setup or training is printed and re-raised
    """
    with wandb.init(config=config):
        # Use the configuration as resolved by the wandb run.
        config = wandb.config
        
        gc.collect()
        torch.cuda.empty_cache()
        
        try:    
            # Seed before building the data module so its random split is repeatable.
            torch.manual_seed(69)
            
            wandb_logger = WandbLogger(log_model=True)
            
            data_module = CoughDataModule(df=filtered_df, 
                              data_path=DATA_PATH, 
                              batch_size=config.batch_size, 
                              sample_rate=config.sample_rate,
                              n_fft=config.n_fft,
                              n_mels=config.n_mels)
            # Shape of the first training sample's input.
            # NOTE(review): indexing below assumes a 3-axis shape such as
            # (channels, n_mels, frames) — confirm against CoughDataset.
            size = data_module.train_dataset[0][0].shape
        
            N_CLASSES = 3
            MODEL_HPARAMS = {
                "input_res":[size[1], size[2]],
                "patch_res":[size[1], 1],       # one patch per position along the last axis
                "num_classes":N_CLASSES,
                "mlp_dim":256,
                "dim":64,
                "heads":4,
                "dim_head":64//4,       # dim_head should be dim divided by number of heads
                "depth":4,
                "dropout": 0.1,
                "emb_dropout": 0.1
            }
            print("Size of random sample of data:", size)
            # NOTE(review): the sweep also samples `learning_rate`, but nothing in this
            # function reads config.learning_rate.
            kwt_pl = create_pl(mha_block=MultiHeadAttentionCustom, MODEL_HPARAMS=MODEL_HPARAMS)
            train_set = data_module.train_dataloader()
            val_set = data_module.val_dataloader()
            test_set = data_module.test_dataloader()
            trainer = pl.Trainer(
                max_epochs=config.max_epochs,
                logger=wandb_logger,
                # Name the CPU accelerator explicitly instead of passing None.
                accelerator='gpu' if torch.cuda.is_available() else 'cpu',
            )
            trainer.fit(kwt_pl, train_dataloaders=train_set, val_dataloaders=val_set)
            trainer.test(kwt_pl, test_set)

            # Drop the local references so the collection in `finally` can reclaim
            # them before the next trial starts. Only names defined above are deleted.
            del trainer, kwt_pl, train_set, val_set, test_set, data_module, wandb_logger
            
        except Exception as e:
            # The enclosing `with wandb.init(...)` block closes the run on exit,
            # so no explicit wandb.finish() is needed here.
            print(e)
            raise
        finally:
            gc.collect()
            torch.cuda.empty_cache()

In [None]:
# Bayesian hyperparameter sweep definition.
sweep_config = {
  "method": "bayes",
  "metric": {
        # Must match the key logged in KWTLightning.validation_step ("val_acc").
        "name": "val_acc",
        "goal": "maximize"
  },
  "parameters": {
    "batch_size": {
        "values": [32, 64, 128]
    },
    "max_epochs": {
        "values": [2, 4, 6]
    },
    "learning_rate": {
        "min": 0.001,
        "max": 0.01
    },
    "sample_rate": {
      "values": [16000, 22050, 44100, 48000]
    },
    "n_fft": {
      "values": [512, 1024, 2048]
    },
    "n_mels": {
      "values": [32, 64]
    },
  }
}


# Register the sweep, then run 10 trials of `train` with this agent.
sweep_id = wandb.sweep(sweep_config, project='cough-classifier', entity='dl-miniproject')

wandb.agent(sweep_id, function=train, count=10)
