In [1]:
import os
from pathlib import Path
os.chdir(Path('.').absolute().parent)

In [2]:
import torch
import torch.nn as nn
from torch.optim import Adam
from torch.optim.lr_scheduler import StepLR
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, random_split
from tqdm.notebook import tqdm

from audio.dataset import DEAMDataset
from audio.model import AudioCNNEncoder
from audio.model import AudioLSTMEncoder

In [3]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'device: {device}')

device: cpu


In [4]:
dataset = DEAMDataset()
dataset.load()

In [5]:
train_dataset, test_dataset = dataset.train_test_split(test_size=0.2)
print(f'train: {len(train_dataset)}, test: {len(test_dataset)}')

train: 45924, test: 11482


In [6]:
batch_size = 64

train_sampler = RandomSampler(train_dataset)
train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=batch_size)

test_sampler = SequentialSampler(test_dataset)
test_dataloader = DataLoader(test_dataset, sampler=test_sampler, batch_size=batch_size)

In [7]:
encoder = AudioCNNEncoder(n_frames=160, n_mfcc=40, n_out=64).to(device)
predictor = nn.Linear(64, 1).to(device)

optimizer = Adam([*encoder.parameters(), *predictor.parameters()], lr=0.0005, betas=(0.9, 0.999), eps=1e-8)
scheduler = StepLR(optimizer, step_size=30, gamma=0.2)
loss_fn = nn.MSELoss()

In [9]:
epochs = 50

for epoch in tqdm(range(epochs)):
    train_loss, test_loss = 0, 0
    
    encoder.train()
    predictor.train()
    for i, batch in enumerate(train_dataloader):
        batch = [tensor.to(device) for tensor in batch]
        samples, targets = batch
        targets = targets[:,0].reshape(-1,1)  # arousal target
        
        optimizer.zero_grad()

        h = encoder(samples)
        pred = predictor(h)

        loss = loss_fn(pred, targets)
        train_loss += loss.item()
        
        loss.backward()
        optimizer.step()

    encoder.eval()
    predictor.eval()
    for i, batch in enumerate(test_dataloader):
        batch = [tensor.to(device) for tensor in batch]
        samples, targets = batch
        targets = targets[:,0].reshape(-1,1)  # arousal target
        
        with torch.no_grad():
            h = encoder(samples)
            pred = predictor(h)

            loss = loss_fn(pred, targets)
            test_loss += loss.item()
    
    scheduler.step()
        
    print(f'epoch={epoch+1},\t'
          f'train_loss={train_loss / len(train_dataloader)},\t'
          f'test_loss={test_loss / len(test_dataloader)}')

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))

epoch=1,	train_loss=0.05454583984636248,	test_loss=0.04660609892259041
epoch=2,	train_loss=0.04315651017747051,	test_loss=0.03865784530838331
epoch=3,	train_loss=0.03987751318748864,	test_loss=0.03816457819193601



In [None]:
encoder = AudioLSTMEncoder(n_mfcc=40, n_hidden=256, n_out=32).to(device)
predictor = Predictor(n_in=32, n_out=1).to(device)

optimizer = Adam([*encoder.parameters(), *predictor.parameters()], lr=0.0005, betas=(0.9, 0.999), eps=1e-8)
scheduler = StepLR(optimizer, step_size=15, gamma=0.2)
loss_fn = nn.MSELoss()