In [2]:
import torch
import torch.nn as nn
import torchvision.transforms as transforms
import torchvision.models as models
from torch.utils.data import Dataset,DataLoader
import csv
import numpy as np
import librosa
import librosa.display
from utils_train import train, test, fit


# Global configuration for the notebook.
# Seed both NumPy and PyTorch: the models below are initialised by torch's RNG,
# which np.random.seed() alone does not control.
np.random.seed(123)
torch.manual_seed(123)
learning_rate = 0.005  # passed as lr= to every Adam optimizer below
use_cuda = torch.cuda.is_available()
device = torch.device("cuda:0" if use_cuda else "cpu")

In [3]:
def csvToAudioList(filename,sourceDir):
    """Load every clip listed in a CSV index and return waveforms plus labels.

    The first CSV row is treated as a header and skipped. For each remaining
    non-empty row, column 0 is the clip name (``<name>.wav`` inside
    ``sourceDir``) and column 1 is the class name, which is converted with
    ``labelTrans``. Clips whose length is not 88200 samples are passed through
    ``fillWithZeros``.

    Args:
        filename: Path of the CSV index file.
        sourceDir: Directory holding the ``.wav`` files; a trailing path
            separator is optional.

    Returns:
        Tuple ``(audDataset, labelDataset)``: two equally long lists with the
        loaded waveforms and their integer labels. Both are empty when the
        CSV has no data rows.
    """
    import os  # local import: the notebook's import cell does not provide os

    # newline="" is what the csv module asks for when it is handed a file object.
    with open(filename, "rt", newline="") as csvfile:
        reader = csv.reader(csvfile)
        next(reader, None)  # drop the header; tolerate a completely empty file
        dataList = [row for row in reader if row]  # blank lines yield [] rows

    # The CSV is closed before the (slow) audio decoding starts.
    audDataset = []
    labelDataset = []
    print(len(dataList))
    for x in dataList:
        # os.path.join works whether or not sourceDir ends with a separator.
        audData, _ = librosa.load(os.path.join(sourceDir, x[0] + ".wav"))
        if(len(audData) != 88200):
            audData = fillWithZeros(audData)
        audDataset.append(audData)
        labelDataset.append(labelTrans(x[1]))
    print("Finished")

    return audDataset, labelDataset

def fillWithZeros(audData, target_length=88200):
    """Force a 1-D waveform to exactly ``target_length`` samples.

    Shorter inputs are zero-padded at the end; longer inputs are truncated.

    Args:
        audData: Array-like waveform; its ``len()`` decides pad vs. truncate.
        target_length: Desired number of samples. Defaults to 88200, the
            length the rest of this notebook checks against.

    Returns:
        An array of length ``target_length``. Padding produces a new array;
        truncation returns a slice of the input.
    """
    missing = target_length - len(audData)
    if missing > 0:
        # float32 zeros, as before; np.append flattens and concatenates.
        return np.append(audData, np.zeros(missing, dtype=np.float32))
    return audData[:target_length]

def labelTrans(labelString):
    """Map a class name to its integer index (0-9).

    Returns ``None`` when ``labelString`` is not one of the ten known names.
    """
    class_index = {
        'siren': 0,
        'street_music': 1,
        'drilling': 2,
        'dog_bark': 3,
        'children_playing': 4,
        'gun_shot': 5,
        'engine_idling': 6,
        'air_conditioner': 7,
        'jackhammer': 8,
        'car_horn': 9,
    }
    return class_index.get(labelString)



# NOTE(review): these are machine-specific absolute paths; the notebook will
# not run elsewhere until they point at a local copy of the dataset.
train_csv_path = '/Users/manueldrazyk/Documents/Uni/FS19/ATML/Projekt/Proj/Data/urban-sound-classification/train/train.csv'
train_audio_dir = '/Users/manueldrazyk/Documents/Uni/FS19/ATML/Projekt/Proj/Data/urban-sound-classification/train/Train/'
audList, labelList = csvToAudioList(train_csv_path, train_audio_dir)

5435
Finished


In [4]:
class AudioDataset(Dataset):
    """Waveform dataset that serves each clip as a ``(1, 4, len/4)`` tensor.

    ``data_audio`` and ``data_label`` are converted to numpy arrays once in
    the constructor; items are turned into tensors lazily on access.
    """

    def __init__(self, data_audio, data_label):
        self.data_set = np.array(data_audio)
        self.data_label1 = np.array(data_label)

    def __len__(self):
        """Number of clips held by the dataset."""
        return len(self.data_set)

    def __getitem__(self, index):
        """Return ``(clip, label)``: the reshaped clip and a one-element long tensor."""
        waveform = self.data_set[index]
        quarter = int(len(waveform) / 4)  # samples per row after splitting into 4 rows
        clip = torch.from_numpy(waveform).reshape(1, 4, quarter)
        label = torch.from_numpy(np.array([self.data_label1[index]]))

        return clip, label.long()


# 80/20 train/validation split, taken in list order (no shuffling).
split_refList = int(len(audList) * 0.8)

train_audList = audList[:split_refList]
val_audList = audList[split_refList:]

train_labelList = labelList[:split_refList]
val_labelList = labelList[split_refList:]

In [5]:
class SimpleConvNet(nn.Module):
    """Three Conv1d+ReLU stages, global average pooling, then a 64 -> 10 linear head."""

    def __init__(self):
        super().__init__()

        # Stage 1: 4 -> 16 channels with a pointwise (kernel_size=1) convolution.
        self.conv_layer1 = nn.Sequential(
            nn.Conv1d(4, 16, kernel_size=1, stride=1, padding=1),
            nn.ReLU(),
        )
        # Stage 2: 16 -> 32 channels.
        self.conv_layer2 = nn.Sequential(
            nn.Conv1d(16, 32, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
        )
        # Stage 3: 32 -> 64 channels, then collapse the length axis to 1.
        self.conv_layer3 = nn.Sequential(
            nn.Conv1d(32, 64, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.AdaptiveAvgPool1d(1),
        )

        self.linear_layer = nn.Sequential(nn.Linear(64, 10))

    def forward(self, input):
        """Run the conv stages, flatten per sample, and return the 10 logits."""
        features = input
        for stage in (self.conv_layer1, self.conv_layer2, self.conv_layer3):
            features = stage(features)

        flat = features.view(input.size(0), -1)
        return self.linear_layer(flat)


In [6]:
class ConvNet(nn.Module):
    """Five Conv1d -> ReLU -> BatchNorm1d units with max-pooling, then a 64 -> 10 head."""

    def __init__(self):
        super().__init__()

        def conv_unit(c_in, c_out, kernel):
            # One Conv1d -> ReLU -> BatchNorm1d unit; padding is always 1.
            return [
                nn.Conv1d(in_channels=c_in, out_channels=c_out, kernel_size=kernel, stride=1, padding=1),
                nn.ReLU(),
                nn.BatchNorm1d(c_out),
            ]

        def halve():
            # Max-pool that halves the length axis.
            return nn.MaxPool1d(kernel_size=2, stride=2)

        # Built as a flat list so the Sequential indices (and therefore the
        # state_dict keys) stay 0..18 in the order below.
        stack = [
            *conv_unit(4, 16, 1), halve(),
            *conv_unit(16, 32, 3), halve(),
            *conv_unit(32, 64, 3),
            *conv_unit(64, 64, 3), halve(),
            *conv_unit(64, 64, 3),
            nn.AdaptiveAvgPool1d(1),
        ]
        self.conv_layers = nn.Sequential(*stack)
        self.linear_layer = nn.Sequential(nn.Linear(64, 10))

    def forward(self, input):
        """Apply the conv stack, flatten per sample, and return the 10 logits."""
        pooled = self.conv_layers(input)
        flat = pooled.view(input.size(0), -1)
        return self.linear_layer(flat)


In [12]:
# Wrap the train/validation splits for the 1-D convolutional models.
trainDataset = AudioDataset(data_audio=train_audList, data_label=train_labelList)
valDataset = AudioDataset(data_audio=val_audList, data_label=val_labelList)



In [10]:
class AudioDatasetRes(Dataset):
    """Waveform dataset that serves each clip as a ``(1, 4, 1, len/4)`` tensor.

    Same data handling as the 1-D dataset, with an extra singleton axis
    inserted before the length axis.
    """

    def __init__(self, data_audio, data_label):
        self.data_set = np.array(data_audio)
        self.data_label1 = np.array(data_label)

    def __len__(self):
        """Number of clips held by the dataset."""
        return len(self.data_set)

    def __getitem__(self, index):
        """Return ``(clip, label)``: the reshaped clip and a one-element long tensor."""
        waveform = self.data_set[index]
        quarter = int(len(waveform) / 4)  # samples per channel after the 4-way split
        clip = torch.from_numpy(waveform).reshape(1, 4, 1, quarter)
        label = torch.from_numpy(np.array([self.data_label1[index]]))

        return clip, label.long()


trainDatasetRes = AudioDatasetRes(data_audio=train_audList, data_label=train_labelList)
valDatasetRes = AudioDatasetRes(data_audio=val_audList, data_label=val_labelList)

epochs = 8

# ResNet-18 (randomly initialised) with its first convolution replaced so that
# it accepts 4 input channels.
# NOTE(review): the classifier head is left at resnet18's default output size,
# while the other models in this notebook emit 10 logits - confirm intended.
RES = models.resnet18()
RES.conv1 = nn.Conv2d(in_channels=4, out_channels=64, kernel_size=1, stride=2, padding=3, bias=False)

loss_fn = nn.CrossEntropyLoss()
optimizerRES = torch.optim.Adam(RES.parameters(), lr=learning_rate)
# NOTE(review): step_size=10 exceeds the 8 epochs above; if fit() steps the
# scheduler once per epoch the learning rate never decays - confirm.
scheduler = torch.optim.lr_scheduler.StepLR(optimizerRES, step_size=10, gamma=0.1)
fit(trainDatasetRes, valDatasetRes, RES, optimizerRES, loss_fn, epochs, scheduler)


Epoch 1/8: train_loss: 2.7921, train_accuracy: 11.2695, val_loss: 2.2843, val_accuracy: 10.8556
Epoch 2/8: train_loss: 2.2722, train_accuracy: 11.8215, val_loss: 2.2717, val_accuracy: 10.8556
Epoch 3/8: train_loss: 2.2675, train_accuracy: 11.8215, val_loss: 2.2705, val_accuracy: 10.8556
Epoch 4/8: train_loss: 2.2670, train_accuracy: 11.7985, val_loss: 2.2703, val_accuracy: 10.8556
Epoch 5/8: train_loss: 2.2669, train_accuracy: 11.7985, val_loss: 2.2703, val_accuracy: 10.8556
Epoch 6/8: train_loss: 2.2669, train_accuracy: 11.7985, val_loss: 2.2703, val_accuracy: 10.8556
Epoch 7/8: train_loss: 2.2669, train_accuracy: 11.7985, val_loss: 2.2703, val_accuracy: 10.8556
Epoch 8/8: train_loss: 2.2669, train_accuracy: 11.7985, val_loss: 2.2703, val_accuracy: 10.8556


In [13]:
# Train the SimpleConvNet baseline.
epochs = 25
SCN = SimpleConvNet()

loss_fn = nn.CrossEntropyLoss()
optimizerSCN = torch.optim.Adam(SCN.parameters(), lr=learning_rate)
# StepLR scales the learning rate by gamma every step_size scheduler steps.
scheduler = torch.optim.lr_scheduler.StepLR(optimizerSCN, step_size=10, gamma=0.1)
fit(trainDataset, valDataset, SCN, optimizerSCN, loss_fn, epochs, scheduler)


Epoch 1/25: train_loss: 2.2628, train_accuracy: 12.2585, val_loss: 2.1837, val_accuracy: 20.5152
Epoch 2/25: train_loss: 2.0456, train_accuracy: 24.1720, val_loss: 1.9912, val_accuracy: 28.0589
Epoch 3/25: train_loss: 1.8446, train_accuracy: 32.4287, val_loss: 1.7837, val_accuracy: 33.1187
Epoch 4/25: train_loss: 1.7579, train_accuracy: 37.3045, val_loss: 1.7139, val_accuracy: 36.3385
Epoch 5/25: train_loss: 1.6962, train_accuracy: 39.6964, val_loss: 1.6652, val_accuracy: 37.9945
Epoch 6/25: train_loss: 1.6451, train_accuracy: 40.8464, val_loss: 1.6410, val_accuracy: 39.9264
Epoch 7/25: train_loss: 1.6200, train_accuracy: 41.4443, val_loss: 1.6554, val_accuracy: 37.6265
Epoch 8/25: train_loss: 1.5964, train_accuracy: 41.9273, val_loss: 1.6484, val_accuracy: 38.0865
Epoch 9/25: train_loss: 1.5769, train_accuracy: 42.6633, val_loss: 1.6438, val_accuracy: 38.9144
Epoch 10/25: train_loss: 1.5616, train_accuracy: 43.0543, val_loss: 1.6300, val_accuracy: 38.5465
Epoch 11/25: train_loss: 1.54

([2.2627857760102135,
  2.0456334563570224,
  1.844580414513916,
  1.7578734270748515,
  1.6962225826443433,
  1.645136046150516,
  1.620046620500525,
  1.5964140497977402,
  1.5769423352898868,
  1.5616400144195737,
  1.5448436027584664,
  1.457401038455094,
  1.4365336230586316,
  1.4302009829948035,
  1.4254664106272237,
  1.4213425724157975,
  1.4176568542602845,
  1.4142032662242061,
  1.4109399133570075,
  1.4078366680296948,
  1.404848719187074,
  1.3923304937128749,
  1.3897040798898577,
  1.3888061850665012,
  1.388239005885559],
 [12.258509659613615,
  24.17203311867525,
  32.42870285188592,
  37.30450781968721,
  39.69641214351426,
  40.84636614535419,
  41.44434222631095,
  41.92732290708371,
  42.66329346826127,
  43.054277828886846,
  43.399264029438825,
  45.74517019319227,
  46.04415823367065,
  46.45814167433303,
  46.52713891444342,
  46.550137994480224,
  46.481140754369825,
  46.59613615455382,
  46.71113155473781,
  46.757129714811406,
  46.8721251149954,
  47.4701

In [16]:
# Train the deeper ConvNet; `epochs` is whatever an earlier cell last set.
CN = ConvNet()

loss_fn = nn.CrossEntropyLoss()
optimizerCN = torch.optim.Adam(CN.parameters(), lr=learning_rate)
# StepLR scales the learning rate by gamma every step_size scheduler steps.
scheduler = torch.optim.lr_scheduler.StepLR(optimizerCN, step_size=10, gamma=0.1)
fit(trainDataset, valDataset, CN, optimizerCN, loss_fn, epochs, scheduler)


Epoch 1/25: train_loss: 2.2780, train_accuracy: 11.4995, val_loss: 2.4807, val_accuracy: 6.8997
Epoch 2/25: train_loss: 2.2735, train_accuracy: 12.1435, val_loss: 2.8234, val_accuracy: 10.1196
Epoch 3/25: train_loss: 2.2734, train_accuracy: 12.2815, val_loss: 2.3582, val_accuracy: 9.3836
Epoch 4/25: train_loss: 2.2734, train_accuracy: 12.2585, val_loss: 2.7325, val_accuracy: 12.4195
Epoch 5/25: train_loss: 2.2734, train_accuracy: 12.2815, val_loss: 2.3906, val_accuracy: 9.9356
Epoch 6/25: train_loss: 2.2732, train_accuracy: 12.0285, val_loss: 2.4147, val_accuracy: 10.6716
Epoch 7/25: train_loss: 2.2733, train_accuracy: 12.1665, val_loss: 2.3659, val_accuracy: 10.0276
Epoch 8/25: train_loss: 2.2734, train_accuracy: 12.3275, val_loss: 2.3847, val_accuracy: 11.4995
Epoch 9/25: train_loss: 2.2734, train_accuracy: 12.0285, val_loss: 2.3492, val_accuracy: 10.1196
Epoch 10/25: train_loss: 2.2733, train_accuracy: 12.2585, val_loss: 2.3292, val_accuracy: 13.0635
Epoch 11/25: train_loss: 2.2734,

([2.2779743642355172,
  2.2735209357552058,
  2.2734023361383717,
  2.2734253404057125,
  2.2733978919184525,
  2.273244211239451,
  2.273281063360146,
  2.273390335837086,
  2.2733691846600563,
  2.27331096824968,
  2.2734226442414243,
  2.2639747163848036,
  2.26347461422321,
  2.2632662570575235,
  2.2631571382256745,
  2.2630980505017697,
  2.263066344614301,
  2.2630405086054095,
  2.2630167284906735,
  2.263002661520336,
  2.262992465452196,
  2.262283345264586,
  2.26220620561326,
  2.262163414742383,
  2.2621396985641837],
 [11.499540018399264,
  12.143514259429622,
  12.281508739650414,
  12.258509659613615,
  12.281508739650414,
  12.02851885924563,
  12.166513339466421,
  12.32750689972401,
  12.02851885924563,
  12.258509659613615,
  12.21251149954002,
  12.35050597976081,
  12.534498620055198,
  12.603495860165593,
  12.580496780128795,
  12.603495860165593,
  12.64949402023919,
  12.67249310027599,
  12.64949402023919,
  12.64949402023919,
  12.626494940202392,
  12.64949