In [9]:
import torch
import torch.nn as nn
import torchvision.transforms as transforms
import torchvision.models as models
from torch.utils.data import Dataset,DataLoader
import csv
import numpy as np
import librosa
import librosa.display
from utils_train import train, test, fit


np.random.seed(123)
learning_rate = 0.005
use_cuda = torch.cuda.is_available()
device = torch.device("cuda:0" if use_cuda else "cpu")

In [10]:
def csvToAudioList(filename,sourceDir):
    dataList = []
    with open(filename, "rt") as csvfile:
        lines = csv.reader(csvfile)
        dataList = list(lines)
        dataList.pop(0)
        #delete
        #dataList = dataList[1800:2000]
        #

        audDataset = []
        labelDataset = []
        print(len(dataList))
        for x in dataList:
            audData, freq = librosa.load(sourceDir + x[5] +"/"+ x[0])
            if(len(audData) != 88200):
                audData = fillWithZeros(audData)
            audDataset.append(audData)
            labelDataset.append(labelTrans(x[7]))
        print("Finished")

    return audDataset, labelDataset

def fillWithZeros(audData):
    if(len(audData) < 88200):
        return np.append(audData,np.zeros((88200-len(audData),1),dtype=np.float32))
    else: #One dataset is longer
        audData = audData[:88200]
        return audData


    return audData

def labelTrans(labelString):
    if(labelString == 'siren'):
        return 0
    elif(labelString == 'street_music'):
        return 1
    elif (labelString == 'drilling'):
        return 2
    elif (labelString == 'dog_bark'):
        return 3
    elif (labelString == 'children_playing'):
        return 4
    elif (labelString == 'gun_shot'):
        return 5
    elif (labelString == 'engine_idling'):
        return 6
    elif (labelString == 'air_conditioner'):
        return 7
    elif (labelString == 'jackhammer'):
        return 8
    elif (labelString == 'car_horn'):
        return 9



audList,labelList = csvToAudioList('/Users/manueldrazyk/Documents/Uni/FS19/ATML/git/UrbanSound8K.csv','/Users/manueldrazyk/Documents/Uni/FS19/ATML/git/audio/fold')

8732
Finished


In [11]:
class AudioDataset(Dataset):
    def __init__(self, data_audio, data_label):

        self.data_set = np.array(data_audio)
        self.data_label1 = np.array(data_label)

    def __len__(self):
        return len(self.data_set)

    def __getitem__(self, index):
        data_entry = self.data_set[index]
        data_entry = torch.from_numpy(data_entry).reshape(4,int(len(self.data_set[index])/4))
        data_lab = torch.from_numpy(np.array([self.data_label1[index]]))

        return data_entry, data_lab.long()


split_refList = int(len(audList)*0.8)
split_refListSec = int(len(audList)*0.9)
train_audList, val_audList,test_audList = audList[:split_refList], audList[split_refList:split_refListSec], audList[split_refListSec:]
train_labelList, val_labelList,test_labelList = labelList[:split_refList], labelList[split_refList:split_refListSec], labelList[split_refListSec:]


In [12]:
class SimpleConvNet(nn.Module):

    def __init__(self):
        super(SimpleConvNet, self).__init__()
           
        self.conv_layer1 = nn.Sequential(
            nn.Conv1d(in_channels=4, out_channels=16, kernel_size=1, stride=1, padding=1),
            nn.ReLU(),
        )
        self.conv_layer2 = nn.Sequential(
            nn.Conv1d(in_channels=16, out_channels=32, kernel_size=3, stride=1, padding=1),
            nn.ReLU()
        )

        self.conv_layer3 = nn.Sequential(
            nn.Conv1d(in_channels=32, out_channels=64, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.AdaptiveAvgPool1d((1))
        )

        self.linear_layer = nn.Sequential(
            nn.Linear(64, 10)
        )

    def forward(self, input):
        output = self.conv_layer1(input)
        output = self.conv_layer2(output)
        output = self.conv_layer3(output)
        
        output = output.view(input.size(0), -1)
        output = self.linear_layer(output)
        return output


In [13]:
class ConvNet(nn.Module):

    def __init__(self):
        super(ConvNet, self).__init__()
        self.conv_layers = nn.Sequential(
            nn.Conv1d(in_channels=4, out_channels=16, kernel_size=1, stride=1, padding=1),
            nn.ReLU(),
            nn.BatchNorm1d(16),
            nn.MaxPool1d(kernel_size=2, stride=2),

            nn.Conv1d(in_channels=16, out_channels=32, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.BatchNorm1d(32),
            nn.MaxPool1d(kernel_size=2, stride=2),

            nn.Conv1d(in_channels=32, out_channels=64, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.BatchNorm1d(64),
            nn.Conv1d(in_channels=64, out_channels=64, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.BatchNorm1d(64),
            nn.MaxPool1d(kernel_size=2, stride=2),

            nn.Conv1d(in_channels=64, out_channels=64, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.BatchNorm1d(64),
            nn.AdaptiveAvgPool1d(1)

        )
        self.linear_layer = nn.Sequential(
            nn.Linear(64, 10)
        )

    def forward(self, input):
        output = self.conv_layers(input)
        output = output.view(input.size(0), -1)
        output = self.linear_layer(output)
        return output


In [14]:
trainDataset = AudioDataset(train_audList,train_labelList)
valDataset = AudioDataset(val_audList,val_labelList)
testDataset = AudioDataset(test_audList,test_labelList)

train_dataloader = torch.utils.data.DataLoader(trainDataset, batch_size=32, shuffle=True, num_workers=4)
val_dataloader = torch.utils.data.DataLoader(valDataset, batch_size=32, num_workers=4)
test_dataloader = torch.utils.data.DataLoader(testDataset, batch_size=32, num_workers=4)



In [10]:
class AudioDatasetRes(Dataset):
    def __init__(self, data_audio, data_label):

        self.data_set = np.array(data_audio)
        self.data_label1 = np.array(data_label)

    def __len__(self):
        return len(self.data_set)

    def __getitem__(self, index):
        data_entry = self.data_set[index]
        data_entry = torch.from_numpy(data_entry).reshape(1,4,1,int(len(self.data_set[index])/4))
        data_lab = torch.from_numpy(np.array([self.data_label1[index]]))
        
        return data_entry, data_lab.long()


trainDatasetRes = AudioDatasetRes(train_audList,train_labelList)
valDatasetRes = AudioDatasetRes(val_audList,val_labelList)

epochs = 8
RES = models.resnet18()

RES.conv1 = nn.Conv2d(4, 64, kernel_size=1, stride=2, padding=3,
                               bias=False)
optimizerRES = torch.optim.Adam(RES.parameters(), lr=learning_rate)
loss_fn = nn.CrossEntropyLoss()
scheduler = torch.optim.lr_scheduler.StepLR(optimizer=optimizerRES, step_size=10, gamma=0.1)
fit(trainDatasetRes,val_dataloader,RES,optimizerRES,loss_fn,epochs, scheduler )


Epoch 1/8: train_loss: 2.7921, train_accuracy: 11.2695, val_loss: 2.2843, val_accuracy: 10.8556
Epoch 2/8: train_loss: 2.2722, train_accuracy: 11.8215, val_loss: 2.2717, val_accuracy: 10.8556
Epoch 3/8: train_loss: 2.2675, train_accuracy: 11.8215, val_loss: 2.2705, val_accuracy: 10.8556
Epoch 4/8: train_loss: 2.2670, train_accuracy: 11.7985, val_loss: 2.2703, val_accuracy: 10.8556
Epoch 5/8: train_loss: 2.2669, train_accuracy: 11.7985, val_loss: 2.2703, val_accuracy: 10.8556
Epoch 6/8: train_loss: 2.2669, train_accuracy: 11.7985, val_loss: 2.2703, val_accuracy: 10.8556
Epoch 7/8: train_loss: 2.2669, train_accuracy: 11.7985, val_loss: 2.2703, val_accuracy: 10.8556
Epoch 8/8: train_loss: 2.2669, train_accuracy: 11.7985, val_loss: 2.2703, val_accuracy: 10.8556


In [None]:
epochs = 25

SCN = SimpleConvNet()


optimizerSCN = torch.optim.Adam(SCN.parameters(), lr=learning_rate)
loss_fn = nn.CrossEntropyLoss()
scheduler = torch.optim.lr_scheduler.StepLR(optimizer=optimizerSCN, step_size=10, gamma=0.1)
fit(train_dataloader,val_dataloader,SCN,optimizerSCN,loss_fn,epochs, scheduler )
testLoss, testAcc = test(SCN, test_dataloader, loss_fn)
print("The accuracy on the testdata is " + str(testAcc) + "%")


Epoch 1/25: train_loss: 2.2285, train_accuracy: 15.0752, val_loss: 2.3719, val_accuracy: 12.4857
Epoch 2/25: train_loss: 2.1033, train_accuracy: 21.4603, val_loss: 2.2498, val_accuracy: 23.8259
Epoch 3/25: train_loss: 1.9090, train_accuracy: 28.8762, val_loss: 2.2042, val_accuracy: 33.5624
Epoch 4/25: train_loss: 1.7688, train_accuracy: 34.3737, val_loss: 2.0987, val_accuracy: 29.0951
Epoch 5/25: train_loss: 1.6778, train_accuracy: 36.4782, val_loss: 2.1224, val_accuracy: 30.5842
Epoch 6/25: train_loss: 1.6154, train_accuracy: 39.7709, val_loss: 2.0290, val_accuracy: 28.0641
Epoch 7/25: train_loss: 1.5848, train_accuracy: 41.4030, val_loss: 2.0413, val_accuracy: 32.7606
Epoch 8/25: train_loss: 1.5427, train_accuracy: 42.9635, val_loss: 2.0754, val_accuracy: 33.9061
Epoch 9/25: train_loss: 1.5192, train_accuracy: 43.8511, val_loss: 1.9952, val_accuracy: 39.7480
Epoch 10/25: train_loss: 1.4970, train_accuracy: 45.5548, val_loss: 2.0657, val_accuracy: 38.7171
Epoch 11/25: train_loss: 1.47

In [None]:
CN = ConvNet()

optimizerCN = torch.optim.Adam(CN.parameters(), lr=learning_rate)
loss_fn = nn.CrossEntropyLoss()
scheduler = torch.optim.lr_scheduler.StepLR(optimizer=optimizerCN, step_size=10, gamma=0.1)
fit(train_dataloader,val_dataloader,CN,optimizerCN,loss_fn,epochs, scheduler )
