In [1]:
import torchaudio
import torch

In [2]:
from utils.data import EmotionDataset

In [3]:
# Build the dataset from the annotation file (relative path, resolved against the working directory).
dataset = EmotionDataset(annotation='data/dataset/annotations.json')

In [4]:
from torch.nn.utils.rnn import pad_sequence

# Sanity check: with batch_first=True the shorter sequence is right-padded with zeros,
# so the result below has one row per input sequence.
pad_sequence([torch.tensor([1,]),  torch.tensor([1, 2])], batch_first=True)

tensor([[1, 0],
        [1, 2]])

In [5]:
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence

def collate_fn(items):
    """Collate a list of sample dicts into a single batch dict of tensors.

    Every sample is expected to expose the same keys as the first sample.
    Values stored under ``'emotion'`` and ``'state'`` are stacked into one
    tensor; values under any other key are treated as variable-length
    sequences and right-padded with zeros to the longest one in the batch.

    Args:
        items: Non-empty list of dict-like samples. Values may be tensors or
            anything ``torch.tensor`` can convert (numbers, lists, arrays).

    Returns:
        dict mapping each key to a batched tensor whose first dimension is
        the batch size.
    """
    def to_tensor(value):
        # torch.tensor() on an existing tensor makes a copy and emits a
        # "copy construct" UserWarning for every sample; reuse it instead.
        # stack/pad_sequence below allocate new tensors, so inputs are not aliased.
        return value if isinstance(value, torch.Tensor) else torch.tensor(value)

    # Group the per-sample values by key, converting each one to a tensor.
    grouped = {key: [] for key in items[0]}
    for item in items:
        for key, value in item.items():
            grouped[key].append(to_tensor(value))

    output = {}
    for key, tensors in grouped.items():
        if key in ('emotion', 'state'):
            # Label-like fields: one value per sample, stacked along a new batch dim.
            output[key] = torch.stack(tensors)
        else:
            # Sequence-like fields: pad to a common length.
            output[key] = pad_sequence(tensors, batch_first=True)
    return output

# Shuffled loader over the full dataset: one sample per batch, a single worker process,
# and batches assembled by collate_fn.
dataloader = DataLoader(dataset, batch_size=1, num_workers=1, collate_fn=collate_fn, shuffle=True)

In [6]:
# Pull a single batch to inspect the keys and tensors produced by collate_fn.
batch = next(iter(dataloader))
batch

{'array': tensor([[-0.0028, -0.0038, -0.0032,  ...,  0.0077,  0.0072,  0.0067]]),
 'emotion': tensor([4]),
 'state': tensor([2])}

In [14]:
from sklearn.model_selection import train_test_split
from torch.utils.data import Subset
import numpy as np
# Stratified split of sample indices: 15% go to validation, class proportions follow
# dataset.emotion_labels, and random_state makes the split reproducible.
train_idx, validation_idx = train_test_split(np.arange(len(dataset)),
                                             test_size=0.15,
                                             random_state=999,
                                             shuffle=True,
                                             stratify=dataset.emotion_labels)
train_dataset = Subset(dataset, train_idx)
validation_dataset = Subset(dataset, validation_idx)

# Dataloader for train and val.
# Both use collate_fn, like `dataloader`, so batches are dicts of tensors with
# variable-length entries zero-padded instead of going through default collation.
train_loader = DataLoader(train_dataset, batch_size=1, shuffle=True, collate_fn=collate_fn)
validation_loader = DataLoader(validation_dataset, batch_size=1, shuffle=False, collate_fn=collate_fn)

In [7]:
import torch.nn as nn

class Wav2Vec2Classifier(nn.Module):
    """Wav2vec 2.0 (base bundle) encoder followed by a linear classification head.

    The encoder output is averaged over dimension 1, normalized with
    ``nn.functional.normalize`` (library defaults), and projected to
    ``num_classes`` logits by a 768-input linear layer.
    """

    def __init__(self, num_classes):
        """Load the torchaudio WAV2VEC2_BASE model and create the linear head.

        Args:
            num_classes: Number of output logits.
        """
        super().__init__()
        self.feature_extractor = torchaudio.pipelines.WAV2VEC2_BASE.get_model()
        self.linear = nn.Linear(768, num_classes)

    def forward(self, X):
        """Return classification logits for the input batch ``X``."""
        return self.linear(self.get_embeddings(X))

    def get_embeddings(self, X):
        """Return one normalized embedding per batch element.

        Assumes the encoder returns a tuple whose first element is
        (batch, frames, features) — TODO confirm against the torchaudio model.
        """
        encoder_output = self.feature_extractor(X)
        # Only the first returned element (the features) is used; average it over dim 1.
        pooled = encoder_output[0].mean(dim=1)
        return nn.functional.normalize(pooled)

In [8]:
import lightning as L

class LitModule(L.LightningModule):
    """Lightning wrapper that trains a Wav2Vec2Classifier on emotion labels.

    Batches are dicts; the input is read from ``batch['array']`` and the
    target class indices from ``batch['emotion']``, in both the training
    and the validation step.

    Logged values: ``train_loss``, ``val_loss``, ``val_acc`` and ``val_f1``
    (macro-averaged F1 computed over the whole validation epoch).
    """

    def __init__(self, num_classes: int):
        """Create the classifier and the buffers used for validation metrics.

        Args:
            num_classes: Number of emotion classes (size of the logit vector).
        """
        super().__init__()
        self.num_classes = num_classes
        # Model
        self.model = Wav2Vec2Classifier(num_classes)
        # Predictions/targets collected during one validation epoch.
        self._val_preds = []
        self._val_targets = []

    def forward(self, x):
        """Return the classifier logits for ``x``."""
        return self.model(x)

    def training_step(self, batch, batch_idx):
        """Compute and log the cross-entropy loss for one training batch."""
        outputs = self.forward(batch['array'])
        # torch.nn.functional is referenced through `torch` so this class does
        # not depend on an `F` alias imported in a later cell.
        loss = torch.nn.functional.cross_entropy(outputs, batch['emotion'])
        self.log('train_loss', loss)
        return loss

    def validation_step(self, batch, batch_idx):
        """Log the validation loss and store predictions for the epoch metrics."""
        # Batches are dicts (same layout as in training_step), not (x, y) tuples.
        targets = batch['emotion']
        outputs = self.forward(batch['array'])
        loss = torch.nn.functional.cross_entropy(outputs, targets)
        self._val_preds.append(outputs.argmax(dim=1).detach())
        self._val_targets.append(targets.detach())
        self.log('val_loss', loss)

    def on_validation_epoch_end(self):
        """Log accuracy and macro-F1 over everything seen this validation epoch."""
        if not self._val_preds:
            return
        preds = torch.cat(self._val_preds)
        targets = torch.cat(self._val_targets)
        # Reset so the next validation epoch starts from empty buffers.
        self._val_preds.clear()
        self._val_targets.clear()
        self.log('val_acc', (preds == targets).float().mean())
        self.log('val_f1', self._macro_f1(preds, targets, self.num_classes))

    @staticmethod
    def _macro_f1(preds, targets, num_classes):
        """Return the unweighted mean of per-class F1 scores.

        Classes that appear in neither ``preds`` nor ``targets`` are skipped;
        if no class qualifies, the result is 0.
        """
        scores = []
        for cls in range(num_classes):
            predicted = preds == cls
            actual = targets == cls
            true_pos = (predicted & actual).sum()
            # 2*TP + FP + FN == (#predicted as cls) + (#actually cls)
            denom = predicted.sum() + actual.sum()
            if denom > 0:
                scores.append(2.0 * true_pos.float() / denom.float())
        if not scores:
            return preds.new_zeros((), dtype=torch.float32)
        return torch.stack(scores).mean()

    def configure_optimizers(self):
        """Return Adam (lr=1e-4) with a MultiStepLR schedule (x0.1 at each milestone)."""
        optimizer = torch.optim.Adam(self.parameters(), lr=1e-4)
        # NOTE(review): milestones are 1000 and 1100 scheduler steps; if the scheduler
        # is stepped once per epoch they are never reached in a 50-epoch run — confirm.
        scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=[1000, 1100], gamma=0.1)
        return [optimizer], [scheduler]

In [9]:
import torch.nn.functional as F

from lightning.pytorch.callbacks import ModelCheckpoint
from lightning.pytorch.callbacks import LearningRateMonitor

# Write a checkpoint into ./checkpoints every 2 epochs; save_top_k=-1 keeps all of them.
checkpoint_callback = ModelCheckpoint(
        dirpath='checkpoints',
        filename="classifier_{epoch:02d}",
        every_n_epochs=2,
        save_top_k=-1,
)
# Record the learning rate once per epoch.
lr_monitor = LearningRateMonitor(logging_interval='epoch')

# NOTE(review): `emotion_map` is not defined in any visible cell — confirm where it
# comes from; on a fresh kernel this line would raise NameError.
model = LitModule(len(emotion_map))
trainer = L.Trainer(accelerator='gpu',
                    devices=1,
                    max_epochs=50,
                    callbacks=[checkpoint_callback, lr_monitor])
# NOTE(review): fit() receives only `dataloader` (the full dataset) and no validation
# loader, so the validation loop is skipped (see the warning in the output below);
# `train_loader` / `validation_loader` from the split cell are not used here — confirm intent.
trainer.fit(model, dataloader)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
/home/danil/anaconda3/envs/emotion-diarization/lib/python3.10/site-packages/lightning/pytorch/trainer/configuration_validator.py:74: You defined a `validation_step` but have no `val_dataloader`. Skipping val loop.
You are using a CUDA device ('NVIDIA GeForce RTX 3050') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type               | Params
---------------------------------------------
0 | model | Wav2Vec2Classifier | 94.4 M
---------------------------------------------
94.4 M    Trainable params
0         Non-trainable para

Training: |                                                                                                   …

`Trainer.fit` stopped: `max_epochs=1` reached.
