In [5]:
import time
import datetime
import os
import random
import copy

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data.sampler import SubsetRandomSampler
import numpy as np
import torchvision
from torchvision import datasets, models, transforms
import matplotlib.pyplot as plt
import adabound

from trainer_helper import Trainer_Helper

# One seed shared by every RNG so a "Restart Kernel -> Run All" is reproducible.
SEED = 23

# Use the first GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
# A torch.device does not compare equal to a plain string, so the previous
# `device == "cuda:0"` check never took this branch; test the device type instead.
if device.type == "cuda":
    torch.cuda.manual_seed_all(SEED)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
# Dataset location and basic settings shared by the cells below.
data_dir = "UrbanSound8K/bitmap/"  # root folder; 'train/', 'val/' and 'test/' are appended to it
num_classes, batch_size = 10, 16   # number of classes; samples per mini-batch

In [4]:
def bitmap_loader(path, max_len=22050):
    """Load one bit-packed ``.npz`` sample and expand it to individual bits.

    The archive entry ``arr_0`` is read as a 2-D array. Its second axis is
    cropped, or zero-padded on the right, to exactly ``max_len`` columns, and
    every byte is then expanded into its 8 bits along that axis.

    Args:
        path: Location of the ``.npz`` file (anything ``np.load`` accepts).
        max_len: Number of packed columns kept per row. The default of 22050
            yields 176400 bits per row.

    Returns:
        ``np.ndarray`` of 0/1 values with shape ``(rows, 8 * max_len)``.
    """
    with np.load(path) as data:
        # Crop over-long samples; shorter ones are left untouched by the slice.
        arr = data['arr_0'][:, :max_len]

    # Right-pad short samples with zero bytes so every sample has the same width.
    arr = np.pad(arr, ((0, 0), (0, max_len - arr.shape[1])), 'constant')

    # Unpack all rows in one vectorised call instead of looping row by row.
    return np.unpackbits(arr, axis=1)


# torchvision documents `extensions` as a tuple of allowed suffixes; a bare string
# is not handled consistently across torchvision versions, so pass an explicit tuple.
NPZ_EXTENSIONS = ('.npz',)

# One DatasetFolder per split, each decoding its files with bitmap_loader.
train_dataset = datasets.DatasetFolder(data_dir + 'train/', loader=bitmap_loader, extensions=NPZ_EXTENSIONS)
val_dataset = datasets.DatasetFolder(data_dir + 'val/', loader=bitmap_loader, extensions=NPZ_EXTENSIONS)
test_dataset = datasets.DatasetFolder(data_dir + 'test/', loader=bitmap_loader, extensions=NPZ_EXTENSIONS)

# Only the training loader shuffles; validation and test keep the dataset order.
train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, pin_memory=True, shuffle=True, num_workers=4)
val_dataloader = torch.utils.data.DataLoader(val_dataset, batch_size=batch_size, pin_memory=True, num_workers=4)
test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, num_workers=4)

In [6]:
class RNN(nn.Module):
    """1-D CNN front end followed by an LSTM and a linear classifier.

    Three strided ``Conv1d -> ReLU -> BatchNorm1d`` stages shorten the input
    sequence, the LSTM consumes the resulting feature sequence, and the LSTM
    output at the last time step is mapped to class scores.

    Args:
        hidden_size: Size of the LSTM hidden state.
        lstm_layers: Number of stacked LSTM layers.
        cnn_start_channels: Output channels of the first convolution; the
            second and third convolutions use 2x and 4x this value.
        num_classes: Number of output scores produced by the final layer.
    """

    def __init__(self, hidden_size=256, lstm_layers=2, cnn_start_channels=256, num_classes=10):
        super(RNN, self).__init__()
        # The third field reports the LSTM hidden size (it previously repeated
        # cnn_start_channels, which mislabelled runs where the two differ).
        self.name = "CNN({})_LSTM({}_hidden_{})".format(cnn_start_channels, lstm_layers, hidden_size)

        # Shape comments assume the default cnn_start_channels=256 and a
        # 16 x 176400 input; each conv yields floor((L - 30) / 10) + 1 steps.
        self.conv_layers = nn.Sequential(
            # input: 16 x 176400
            nn.Conv1d(in_channels=16, out_channels=cnn_start_channels, kernel_size=30, stride=10),
            nn.ReLU(),
            nn.BatchNorm1d(cnn_start_channels),
            # output: 256 x 17638

            nn.Conv1d(in_channels=cnn_start_channels, out_channels=2*cnn_start_channels, kernel_size=30, stride=10),
            nn.ReLU(),
            nn.BatchNorm1d(2*cnn_start_channels),
            # output: 512 x 1761

            nn.Conv1d(in_channels=2*cnn_start_channels, out_channels=4*cnn_start_channels, kernel_size=30, stride=10),
            nn.ReLU(),
            nn.BatchNorm1d(4*cnn_start_channels),
            # output: 1024 x 174
        )

        self.rnn = nn.LSTM(input_size=4*cnn_start_channels,
                           hidden_size=hidden_size, dropout=0.2,
                           num_layers=lstm_layers)

        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, inputs, hidden=None):
        """Run the network on a batch.

        Args:
            inputs: Float tensor of shape ``(batch, 16, length)``.
            hidden: Optional initial LSTM state; ``None`` starts from zeros.

        Returns:
            Tuple ``(scores, hidden)``: ``scores`` has shape
            ``(batch, num_classes)`` and ``hidden`` is the final LSTM state.
        """
        output = self.conv_layers(inputs)

        # (batch, channels, steps) -> (steps, batch, channels): the LSTM is
        # built without batch_first, so it expects the sequence axis first.
        output = output.transpose(1, 2).transpose(0, 1)

        output = torch.tanh(output)
        output, hidden = self.rnn(output, hidden)

        # Classify from the LSTM output at the last time step only.
        output = self.fc(output[-1, :, :])

        return output, hidden

    def get_name(self):
        """Return the descriptive model name built in ``__init__``."""
        return self.name


In [None]:
from torch.optim.lr_scheduler import ExponentialLR, StepLR, CosineAnnealingLR
from ignite.contrib.handlers.param_scheduler import LRScheduler

model = RNN()

# Fixed run name. The timestamp-based name that used to be computed first was
# overwritten immediately and never used, so it has been dropped.
# NOTE(review): the name says "adam" but the optimizer below is AdaBound - confirm.
run_name = 'rnn-adam-lr0-001-lr-experiment5'

optimizer = adabound.AdaBound(model.parameters(), lr=1e-3, final_lr=0.1)
criterion = nn.CrossEntropyLoss()

# No learning-rate schedule is active for this run. To enable one, build it here, e.g.
#   scheduler = LRScheduler(StepLR(optimizer=optimizer, step_size=200, gamma=0.9))
scheduler = None

trainer = Trainer_Helper(run_name, device)
trainer.setup_dataloader(train_dataloader, val_dataloader, test_dataloader)
# `scheduler` used to be referenced while its definition was commented out, which
# raised NameError on a fresh kernel; only register a scheduler when one exists.
if scheduler is not None:
    trainer.add_scheduler(scheduler)
# NOTE(review): 15 is presumably the number of epochs - confirm against Trainer_Helper.train_rnn.
trainer.train_rnn(model, optimizer, criterion, 15)


Evaluation
----------

Let's run this model on the test set.

In [10]:
# Evaluate a saved checkpoint on the test split and report top-1 accuracy.
correct = 0
total = 0

model = RNN()
model = model.to(device)

model_filepath = 'saved_models/rnn_adam_best_val=54_8.pth'

# map_location remaps the stored tensors onto the current device, so a checkpoint
# written during a GPU run can still be loaded on a CPU-only machine.
model.load_state_dict(torch.load(model_filepath, map_location=device))
model.eval()

with torch.no_grad():
    for imgs, labels in test_dataloader:
        # The loader yields integer bit arrays; the model needs float input.
        imgs = imgs.to(device, dtype=torch.float)
        outputs, hidden = model(imgs, None)
        # Index of the highest score per sample is the predicted class.
        _, pred = torch.max(outputs, 1)
        total += labels.size(0)
        # Labels are still on the CPU, so compare there.
        correct += (pred.cpu() == labels).sum().item()

print('Test Accuracy: {}'.format(100.0*correct/total))

Test Accuracy: 53.04659498207885


Somewhat surprisingly, this model underperforms heavily compared to the CNN with MFCC spectrograms.
The paper suggests that this approach should beat the CNN approach, or at least give a comparable result.
We suspect one of two reasons:
* This approach works for sound event detection but is not suited to urban sound classification.
* We didn't have enough data, or should have used data augmentation, because the network started overfitting too early.