In [33]:
import torch
import torch.nn as nn
import torchvision.transforms as transforms
from torch.utils.data import Dataset,DataLoader
import csv
import numpy as np
import librosa
import librosa.display
#from utils_train import train, test, fit


np.random.seed(123)
learning_rate = 0.005
use_cuda = torch.cuda.is_available()
device = torch.device("cuda:0" if use_cuda else "cpu")

In [3]:
def csvToAudioList(filename,sourceDir):
    dataList = []
    with open(filename, "rt") as csvfile:
        lines = csv.reader(csvfile)
        dataList = list(lines)
        dataList.pop(0)
        #delete
        #dataList = dataList[1950:2000]
        #

        audDataset = []
        labelDataset = []
        print(len(dataList))
        for x in dataList:
            audData, freq = librosa.load(sourceDir + x[0] + ".wav")
            if(len(audData) != 88200):
                audData = fillWithZeros(audData)
            audDataset.append(audData)
            labelDataset.append(labelTrans(x[1]))
        print("Finished")

    return audDataset, labelDataset

def fillWithZeros(audData):
    if(len(audData) < 88200):
        return np.append(audData,np.zeros((88200-len(audData),1),dtype=np.float32))
    else: #One dataset is longer
        audData = audData[:88200]
        return audData


    return audData

def labelTrans(labelString):
    if(labelString == 'siren'):
        return 0
    elif(labelString == 'street_music'):
        return 1
    elif (labelString == 'drilling'):
        return 2
    elif (labelString == 'dog_bark'):
        return 3
    elif (labelString == 'children_playing'):
        return 4
    elif (labelString == 'gun_shot'):
        return 5
    elif (labelString == 'engine_idling'):
        return 6
    elif (labelString == 'air_conditioner'):
        return 7
    elif (labelString == 'jackhammer'):
        return 8
    elif (labelString == 'car_horn'):
        return 9



audList,labelList = csvToAudioList('/Users/manueldrazyk/Documents/Uni/FS19/ATML/Projekt/Proj/Data/urban-sound-classification/train/train.csv','/Users/manueldrazyk/Documents/Uni/FS19/ATML/Projekt/Proj/Data/urban-sound-classification/train/Train/')

5435
Finished


In [34]:
class AudioDataset(Dataset):
    def __init__(self, data_audio, data_label):

        self.data_set = np.array(data_audio)
        self.data_label1 = np.array(data_label)

    def __len__(self):
        return len(self.data_set)

    def __getitem__(self, index):
        data_entry = self.data_set[index]
        data_entry = torch.from_numpy(data_entry).reshape(1,4,int(len(self.data_set[index])/4))
        data_lab = torch.from_numpy(np.array([self.data_label1[index]]))

        return data_entry, data_lab.long()


split_refList = int(len(audList)*0.8)
train_audList, val_audList = audList[:split_refList], audList[split_refList:]
train_labelList, val_labelList = labelList[:split_refList], labelList[split_refList:]

In [38]:
class SimpleConvNet(nn.Module):

    def __init__(self):
        super(SimpleConvNet, self).__init__()
           
        self.conv_layer1 = nn.Sequential(
            nn.Conv1d(in_channels=4, out_channels=16, kernel_size=1, stride=1, padding=1),
            nn.ReLU(),
        )
        self.conv_layer2 = nn.Sequential(
            nn.Conv1d(in_channels=16, out_channels=32, kernel_size=3, stride=1, padding=1),
            nn.ReLU()
        )

        self.conv_layer3 = nn.Sequential(
            nn.Conv1d(in_channels=32, out_channels=64, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.AdaptiveAvgPool1d((1))
        )

        self.linear_layer = nn.Sequential(
            nn.Linear(64, 10)
        )

    def forward(self, input):
        output = self.conv_layer1(input)
        output = self.conv_layer2(output)
        output = self.conv_layer3(output)
        
        output = output.view(input.size(0), -1)
        output = self.linear_layer(output)
        return output
    
import numpy as np
import torch

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

def train(model, train_loader, optimizer, loss_fn, print_every=100):
    '''
    Trains the model for one epoch
    '''
    n_correct = 0
    losses = []
    model.to(device=device)
    model.train()

    for iteration, (images, labels) in enumerate(train_loader):
        images = images.to(device=device)
        labels = labels.to(device=device)
        output = model(images)
        optimizer.zero_grad()
        loss = loss_fn(output, labels)
        loss.backward()
        optimizer.step()
#         if iteration % print_every == 0:
#             print('Training iteration {}: loss {:.4f}'.format(iteration, loss.item()))
        losses.append(loss.item())
        n_correct += torch.sum(output.argmax(1) == labels).item()
    accuracy = 100.0 * n_correct / len(train_loader.data_set)
    return np.mean(np.array(losses)), accuracy
            
def test(model, test_loader, loss_fn):
    '''
    Tests the model on data from test_loader
    '''
    model.eval()
    test_loss = 0
    n_correct = 0
    with torch.no_grad():
        for images, labels in test_loader:
            images = images.to(device)
            labels = labels.to(device)
            output = model(images)
            loss = loss_fn(output, labels)
            test_loss += loss.item()
            n_correct += torch.sum(output.argmax(1) == labels).item()

    average_loss = test_loss / len(test_loader)
    accuracy = 100.0 * n_correct / len(test_loader.data_set)
#     print('Test average loss: {:.4f}, accuracy: {:.3f}'.format(average_loss, accuracy))
    return average_loss, accuracy


def fit(train_dataloader, val_dataloader, model, optimizer, loss_fn, n_epochs, scheduler=None):
    train_losses, train_accuracies = [], []
    val_losses, val_accuracies = [], []
    learning_rates = []
    for epoch in range(n_epochs):
        train_loss, train_accuracy = train(model, train_dataloader, optimizer, loss_fn)
        val_loss, val_accuracy = test(model, val_dataloader, loss_fn)
        train_losses.append(train_loss)
        train_accuracies.append(train_accuracy)
        val_losses.append(val_loss)
        val_accuracies.append(val_accuracy)
        # We'll monitor learning rate -- just to show that it's decreasing
        learning_rates.append(optimizer.param_groups[0]['lr'])
        ########## Notify a scheduler that an epoch passed
        if scheduler:
            scheduler.step() # argument only needed for ReduceLROnPlateau
        print('Epoch {}/{}: train_loss: {:.4f}, train_accuracy: {:.4f}, val_loss: {:.4f}, val_accuracy: {:.4f}'.format(epoch+1, n_epochs,
                                                                                                          train_losses[-1],
                                                                                                          train_accuracies[-1],
                                                                                                          val_losses[-1],
                                                                                                          val_accuracies[-1]))

    print(learning_rates)
    return train_losses, train_accuracies, val_losses, val_accuracies, learning_rates


In [39]:
trainDataset = AudioDataset(train_audList,train_labelList)
valDataset = AudioDataset(val_audList,val_labelList)


model = SimpleConvNet()

optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
loss_fn = nn.CrossEntropyLoss()
epochs = 25
scheduler = torch.optim.lr_scheduler.StepLR(optimizer=optimizer, step_size=10, gamma=0.1)
fit(trainDataset,valDataset,model,optimizer,loss_fn,epochs, scheduler )


Epoch 1/25: train_loss: 2.2674, train_accuracy: 12.2585, val_loss: 2.2352, val_accuracy: 16.3753
Epoch 2/25: train_loss: 2.1821, train_accuracy: 18.0543, val_loss: 2.0808, val_accuracy: 22.1711
Epoch 3/25: train_loss: 2.0145, train_accuracy: 27.0699, val_loss: 1.9696, val_accuracy: 28.5189
Epoch 4/25: train_loss: 1.8820, train_accuracy: 31.8537, val_loss: 1.8205, val_accuracy: 33.9466
Epoch 5/25: train_loss: 1.7774, train_accuracy: 36.1086, val_loss: 1.7430, val_accuracy: 36.7065
Epoch 6/25: train_loss: 1.7133, train_accuracy: 38.7764, val_loss: 1.6924, val_accuracy: 37.9945
Epoch 7/25: train_loss: 1.6748, train_accuracy: 39.8574, val_loss: 1.6581, val_accuracy: 38.8224
Epoch 8/25: train_loss: 1.6422, train_accuracy: 40.8694, val_loss: 1.6471, val_accuracy: 40.0184
Epoch 9/25: train_loss: 1.6201, train_accuracy: 41.9043, val_loss: 1.6373, val_accuracy: 40.6624
Epoch 10/25: train_loss: 1.5997, train_accuracy: 42.8473, val_loss: 1.6310, val_accuracy: 41.3983
Epoch 11/25: train_loss: 1.58

([2.267382847352541,
  2.1820643112652873,
  2.014481748417431,
  1.8819826933441353,
  1.7773694755799088,
  1.7132539797159598,
  1.6747914576541423,
  1.6422185016754867,
  1.6201305266760166,
  1.5996602061971432,
  1.5821348454823791,
  1.501882975351703,
  1.4874296242357492,
  1.4809446119101035,
  1.476146111244262,
  1.4723094263702645,
  1.4685845589445152,
  1.4651872829980497,
  1.4612376250535688,
  1.4580259144986467,
  1.4545194743147158,
  1.444172652521391,
  1.44141231177044,
  1.4402613594867597,
  1.4399861360642212],
 [12.258509659613615,
  18.054277828886846,
  27.069917203311867,
  31.85372585096596,
  36.10855565777369,
  38.77644894204232,
  39.857405703771846,
  40.869365225390986,
  41.904323827046916,
  42.84728610855566,
  43.30726770929163,
  46.550137994480224,
  47.378104875804965,
  47.53909843606256,
  47.93008279668813,
  48.045078196872126,
  48.09107635694572,
  48.5050597976081,
  48.29806807727691,
  48.29806807727691,
  48.11407543698252,
  48.80