In [30]:
import sys
sys.path.insert(0, '../')

import pandas as pd
from sklearn import metrics
#from keras.wrappers.scikit_learn import KerasClassifier
from classification_plots import plot_confusion_matrix
import matplotlib.pyplot as plt
import os
from collections import Counter
import numpy as np
import scipy 
%matplotlib notebook

In [31]:
# Able to specify which GPU to use
os.environ["CUDA_VISIBLE_DEVICES"]="0"
import warnings
warnings.filterwarnings('ignore')
os.environ["CUDA_LAUNCH_BLOCKING"]="1"

orig_SR = 44100
orig_blocksize = int(orig_SR * 5)
orig_overlap = 0 #int(orig_SR/4)

SR = 16000
blocksize = int(SR * 5)
overlap = 0 #int(SR/4)

In [32]:
import warnings
warnings.filterwarnings('ignore')

# Load Dataset
Here we load the csv that describes each file in the dataset. We add a high level category that is defined in the ESC-50 documentation. This we realize is anthetical to true training, it is a stopgap for when we use NLP to classify tags into these categories.

In [33]:
path_to_db='../../ESC-50/'
audio_dir = path_to_db + 'audio/'
dataset = pd.read_csv(path_to_db + 'meta/esc50.csv')
classes = [None] * 50
h_classes = ['Human & Animal', 'Interacting Materials']
mapping = {'dog': 0,'rooster': 0,'pig': 0,'cow': 0,'frog': 0,'cat': 0,'hen': 0,
            'insects': 0,'sheep': 0,'crow': 0,'rain': 1,'sea_waves': 1,'crackling_fire': 1,
            'crickets': 0,'chirping_birds': 0,'water_drops': 1,'wind': 1,'pouring_water': 1,
            'toilet_flush': 1,'thunderstorm': 1,'crying_baby': 0,'sneezing': 0,'clapping': 0,
            'breathing': 0,'coughing': 0,'footsteps': 1,'laughing': 0,'brushing_teeth': 1,
            'snoring': 0,'drinking_sipping': 1,'door_wood_knock': 1,'mouse_click': 1,
            'keyboard_typing': 1,'door_wood_creaks': 1,'can_opening': 1,'washing_machine': 1,
            'vacuum_cleaner': 1,'clock_alarm': 1,'clock_tick': 1,'glass_breaking':1,'helicopter': 1,
            'chainsaw': 1,'siren': 1,'car_horn': 1,'engine': 1,'train': 1,'church_bells': 1,
            'airplane': 1,'fireworks': 1,'hand_saw': 1,
            }
dataset['h_target'] = None
for index, row in dataset.iterrows():
    target = row['target']
    classes[target] = row['category']
    dataset.loc[index, 'h_target'] = mapping[row['category']]

In [34]:
dataset.head()

Unnamed: 0,filename,fold,target,category,esc10,src_file,take,h_target
0,1-100032-A-0.wav,1,0,dog,True,100032,A,0
1,1-100038-A-14.wav,1,14,chirping_birds,False,100038,A,0
2,1-100210-A-36.wav,1,36,vacuum_cleaner,False,100210,A,1
3,1-100210-B-36.wav,1,36,vacuum_cleaner,False,100210,B,1
4,1-101296-A-19.wav,1,19,thunderstorm,False,101296,A,1


In [35]:
import torch.nn as nn
import torch.nn.functional as F

class Net(nn.Module):
    def __init__(self,):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(1, 32, 3)
        self.conv2 = nn.Conv2d(32, 32, 3)
        self.norm = nn.BatchNorm2d(32)
        self.pool = nn.MaxPool2d(7, 7)
        self.fc1 = nn.Linear(512, 64)
        self.dropout = nn.Dropout2d(p=0.2)
#         self.fc2 = nn.Linear(256, 128)
        self.fc3 = nn.Linear(64,32)
        self.fc4 = nn.Linear(32,2)
        self.softmax = nn.LogSoftmax()

    def forward(self, x):
        x = self.conv1(x)
        x = self.norm(x)
        x = F.relu(x)
        x = self.pool(x)
        x = self.conv2(x)
        x = self.norm(x)
        x = F.relu(x)
        x = self.pool(x)
        x = x.view(x.shape[0], -1)
        x = self.fc1(x)
        x = F.relu(x)
#         x = self.fc2(x)
#         x = F.relu(x)
        x = self.dropout(x)
        x = self.fc3(x)
        x = F.relu(x)
        x = self.dropout(x)
        x = self.fc4(x)
        x = self.softmax(x)
        return x
    
net = Net()
print(net)
net.cuda()

Net(
  (conv1): Conv2d(1, 32, kernel_size=(3, 3), stride=(1, 1))
  (conv2): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1))
  (norm): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (pool): MaxPool2d(kernel_size=7, stride=7, padding=0, dilation=1, ceil_mode=False)
  (fc1): Linear(in_features=512, out_features=64, bias=True)
  (dropout): Dropout2d(p=0.2, inplace=False)
  (fc3): Linear(in_features=64, out_features=32, bias=True)
  (fc4): Linear(in_features=32, out_features=2, bias=True)
  (softmax): LogSoftmax()
)


Net(
  (conv1): Conv2d(1, 32, kernel_size=(3, 3), stride=(1, 1))
  (conv2): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1))
  (norm): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (pool): MaxPool2d(kernel_size=7, stride=7, padding=0, dilation=1, ceil_mode=False)
  (fc1): Linear(in_features=512, out_features=64, bias=True)
  (dropout): Dropout2d(p=0.2, inplace=False)
  (fc3): Linear(in_features=64, out_features=32, bias=True)
  (fc4): Linear(in_features=32, out_features=2, bias=True)
  (softmax): LogSoftmax()
)

In [36]:
# load data:
from PIL import Image
import os 
import numpy as np
from torch.utils.data import Dataset, DataLoader

class trainset(Dataset):
    def __init__(self):
        
        root = "./data2/"
        self.data_list = []
        self.label_list = []
        self.h_label_list =[]
        for root, dir, files in os.walk("./data2/"):
            for file in files:
                if file.find('.npy')!= -1:
                    self.data_list.append(os.path.join(root, file))
                    label = int(root[13:])
                    self.label_list.append(label)
                    row_idx = dataset.loc[dataset['target']==label]
                    self.h_label_list.append(row_idx['h_target'])
        print(len(self.data_list), len(self.label_list),len(self.h_label_list))
        
    def __len__(self):
        return len(self.data_list)

    def __getitem__(self, index):

        path = self.data_list[index]
        data = np.load(path,allow_pickle = True)
        data = np.expand_dims(data, axis=0)
        label= self.label_list[index] 
        high_label = self.h_label_list[index]    
        label = np.asarray(label)
        high_label = (int)(np.mean(high_label))
        
        return data, label , high_label
dataloader = trainset()
data,label,high_label = dataloader.__getitem__(1598)
print(label)
print(high_label)
# print(label.shape)

2000 2000 2000
44
1


In [38]:
import torch.optim as optim
import torch
from torch.utils.data.sampler import SubsetRandomSampler
import time

device = 0
# model = BaseNet()
model = net
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001, betas=(0.9, 0.999), eps=1e-08, weight_decay=0)

validation_split = .2
random_seed= 42
shuffle_dataset = True
dataset_size = 2000
indices = list(range(dataset_size))
split = int(np.floor(validation_split * dataset_size))
if shuffle_dataset :
    np.random.seed(random_seed)
    np.random.shuffle(indices)
train_indices, val_indices = indices[split:], indices[:split]

# Creating PT data samplers and loaders:
train_sampler = SubsetRandomSampler(train_indices)
valid_sampler = SubsetRandomSampler(val_indices)

datasets  = trainset()
trainloader = torch.utils.data.DataLoader(datasets, batch_size=10, 
                                           sampler=train_sampler)
valloader = torch.utils.data.DataLoader(datasets, batch_size=10,
                                                sampler=valid_sampler)

val_history = []
val_loss_hist = []
train_history = []
train_loss_hist = []

for epoch in range(500):  # loop over the dataset multiple times
    ######### Training ###########   
    
    running_loss = 0.0
    count = 0
    training_accuracy = 0
    running_accuracy = 0
    t1 = time.time()
    for i, data in enumerate(trainloader, 0):
        count += 1
        inputs, labels , high_labels = data
        inputs = inputs.float().to(device)
        labels = labels.long().to(device)
        high_labels = high_labels.long().to(device)
        optimizer.zero_grad()
        outputs = model(inputs) 
        
        loss = criterion(outputs, high_labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        _, preds = torch.max(outputs, 1)
        acc_train = torch.eq(preds, labels).float().mean()
        running_accuracy += acc_train.item()
        
    running_accuracy /= count
    running_loss /= count
    train_history.append(running_accuracy)
    train_loss_hist.append(running_loss)
    t2 = time.time()
    print("===========Phase: Train============") 
    print("Training Time: {}".format(t2 - t1))
    print("Epoch: {}  train_loss: {}".format(epoch, running_loss))
    print("Epoch: {}  train_accuracy: {}".format(epoch, running_accuracy))
    
        
# # #             torch.save(net, 'toy_model.pt')
        
    
    print('Finished Training')
    
########## Validation ###########
    
    count = 0
    running_accuracy = 0
    running_loss = 0.0
    t1 = time.time()
    for i, data in enumerate(valloader, 0):
        count += 1
        inputs, labels,high_labels = data
        inputs = inputs.float().to(device)
        labels = labels.long().to(device)        
        high_labels = high_labels.long().to(device)
        outputs = model(inputs)
        outputs = outputs.squeeze(1)
        val_loss = criterion(outputs, high_labels)
        _, preds = torch.max(outputs, 1)
        acc_val = torch.eq(preds, labels).float().mean()
        running_accuracy += acc_val.item()
        running_loss += val_loss.item()
        
    running_accuracy /= count
    running_loss /= count
    val_history.append(running_accuracy)
    val_loss_hist.append(running_loss)
    t2 = time.time()
    print("===========Phase: Val============")
    print("Validation Time: {}".format(t2 - t1))
    print("Epoch: {}  val_loss: {}".format(epoch, running_loss))
    print("Epoch: {}  val_accuracy: {}".format(epoch, running_accuracy))
    



2000 2000 2000
Training Time: 8.692643404006958
Epoch: 0  train_loss: 0.2003171399817802
Epoch: 0  train_accuracy: 0.023125000391155483
Finished Training
Validation Time: 0.8637504577636719
Epoch: 0  val_loss: 0.38404234098270534
Epoch: 0  val_accuracy: 0.012500000186264515
Training Time: 8.035685777664185
Epoch: 1  train_loss: 0.14869969184001092
Epoch: 1  train_accuracy: 0.023750000353902578
Finished Training
Validation Time: 0.8413238525390625
Epoch: 1  val_loss: 0.4088881375733763
Epoch: 1  val_accuracy: 0.010000000149011612
Training Time: 8.170022249221802
Epoch: 2  train_loss: 0.16564060213859194
Epoch: 2  train_accuracy: 0.022500000335276125
Finished Training
Validation Time: 0.9472634792327881
Epoch: 2  val_loss: 0.4785086778225377
Epoch: 2  val_accuracy: 0.010000000149011612
Training Time: 8.058465003967285
Epoch: 3  train_loss: 0.19682996090268717
Epoch: 3  train_accuracy: 0.022500000335276125
Finished Training
Validation Time: 0.8458108901977539
Epoch: 3  val_loss: 0.4511814

Validation Time: 0.8645808696746826
Epoch: 24  val_loss: 0.4595656434074044
Epoch: 24  val_accuracy: 0.012500000186264515
Training Time: 8.027220964431763
Epoch: 25  train_loss: 0.10196591474523302
Epoch: 25  train_accuracy: 0.022500000335276125
Finished Training
Validation Time: 0.8383035659790039
Epoch: 25  val_loss: 0.4473859768710099
Epoch: 25  val_accuracy: 0.010000000149011612
Training Time: 8.002433776855469
Epoch: 26  train_loss: 0.0909186626355222
Epoch: 26  train_accuracy: 0.022500000335276125
Finished Training
Validation Time: 0.8522908687591553
Epoch: 26  val_loss: 0.4432221340015531
Epoch: 26  val_accuracy: 0.010000000149011612
Training Time: 8.024035453796387
Epoch: 27  train_loss: 0.11518221809601528
Epoch: 27  train_accuracy: 0.0218750003259629
Finished Training
Validation Time: 0.8497273921966553
Epoch: 27  val_loss: 0.5049523902125657
Epoch: 27  val_accuracy: 0.012500000186264515
Training Time: 7.988353252410889
Epoch: 28  train_loss: 0.10557992614631076
Epoch: 28  tr

Training Time: 7.991253852844238
Epoch: 49  train_loss: 0.06119406250336397
Epoch: 49  train_accuracy: 0.022500000335276125
Finished Training
Validation Time: 0.8647103309631348
Epoch: 49  val_loss: 0.46483967748936267
Epoch: 49  val_accuracy: 0.012500000186264515
Training Time: 8.056375741958618
Epoch: 50  train_loss: 0.06555144509311503
Epoch: 50  train_accuracy: 0.022500000335276125
Finished Training
Validation Time: 0.8413527011871338
Epoch: 50  val_loss: 0.5647344280172547
Epoch: 50  val_accuracy: 0.012500000186264515
Training Time: 8.138542413711548
Epoch: 51  train_loss: 0.08308668791414675
Epoch: 51  train_accuracy: 0.022500000335276125
Finished Training
Validation Time: 0.8823802471160889
Epoch: 51  val_loss: 0.5350491156073985
Epoch: 51  val_accuracy: 0.010000000149011612
Training Time: 8.075538873672485
Epoch: 52  train_loss: 0.10103733849227865
Epoch: 52  train_accuracy: 0.023750000353902578
Finished Training
Validation Time: 1.5180583000183105
Epoch: 52  val_loss: 0.480219

Validation Time: 0.8462436199188232
Epoch: 73  val_loss: 0.5853761941427365
Epoch: 73  val_accuracy: 0.010000000149011612
Training Time: 8.168365478515625
Epoch: 74  train_loss: 0.06503757152363505
Epoch: 74  train_accuracy: 0.023125000344589353
Finished Training
Validation Time: 0.8572430610656738
Epoch: 74  val_loss: 0.49989884881069885
Epoch: 74  val_accuracy: 0.007500000111758709
Training Time: 8.460097789764404
Epoch: 75  train_loss: 0.036525165077353
Epoch: 75  train_accuracy: 0.022500000335276125
Finished Training
Validation Time: 0.9035744667053223
Epoch: 75  val_loss: 0.680796067357005
Epoch: 75  val_accuracy: 0.010000000149011612
Training Time: 8.08557653427124
Epoch: 76  train_loss: 0.050627782171022775
Epoch: 76  train_accuracy: 0.022500000381842255
Finished Training
Validation Time: 0.8477330207824707
Epoch: 76  val_loss: 0.5652326879790053
Epoch: 76  val_accuracy: 0.010000000149011612
Training Time: 8.045774698257446
Epoch: 77  train_loss: 0.05218193802219275
Epoch: 77  t

Training Time: 8.483942031860352
Epoch: 98  train_loss: 0.03746325060720892
Epoch: 98  train_accuracy: 0.02187500037252903
Finished Training
Validation Time: 0.8581154346466064
Epoch: 98  val_loss: 0.5294478004238045
Epoch: 98  val_accuracy: 0.010000000149011612
Training Time: 8.24796438217163
Epoch: 99  train_loss: 0.03255741140803821
Epoch: 99  train_accuracy: 0.022500000335276125
Finished Training
Validation Time: 0.9454138278961182
Epoch: 99  val_loss: 0.4504081368955667
Epoch: 99  val_accuracy: 0.010000000149011612
Training Time: 8.308959484100342
Epoch: 100  train_loss: 0.045126213022149385
Epoch: 100  train_accuracy: 0.022500000335276125
Finished Training
Validation Time: 0.8813998699188232
Epoch: 100  val_loss: 0.8646254455758026
Epoch: 100  val_accuracy: 0.012500000186264515
Training Time: 8.274163246154785
Epoch: 101  train_loss: 0.047546585380155194
Epoch: 101  train_accuracy: 0.022500000335276125
Finished Training
Validation Time: 0.9389848709106445
Epoch: 101  val_loss: 0.

Epoch: 122  train_accuracy: 0.022500000335276125
Finished Training
Validation Time: 0.8632106781005859
Epoch: 122  val_loss: 0.709910029923958
Epoch: 122  val_accuracy: 0.010000000149011612
Training Time: 8.271828651428223
Epoch: 123  train_loss: 0.04143220956415235
Epoch: 123  train_accuracy: 0.023750000353902578
Finished Training
Validation Time: 0.9162836074829102
Epoch: 123  val_loss: 0.7179790366441011
Epoch: 123  val_accuracy: 0.010000000149011612
Training Time: 8.258960247039795
Epoch: 124  train_loss: 0.04771507999535061
Epoch: 124  train_accuracy: 0.0218750003259629
Finished Training
Validation Time: 0.8549456596374512
Epoch: 124  val_loss: 0.7092957653834674
Epoch: 124  val_accuracy: 0.012500000186264515
Training Time: 8.163755655288696
Epoch: 125  train_loss: 0.044228904262604375
Epoch: 125  train_accuracy: 0.0218750003259629
Finished Training
Validation Time: 0.8787345886230469
Epoch: 125  val_loss: 0.47842565862374614
Epoch: 125  val_accuracy: 0.010000000149011612
Training

Validation Time: 0.9206619262695312
Epoch: 146  val_loss: 0.5220181993528967
Epoch: 146  val_accuracy: 0.012500000186264515
Training Time: 8.27796745300293
Epoch: 147  train_loss: 0.052405612750219
Epoch: 147  train_accuracy: 0.022500000335276125
Finished Training
Validation Time: 0.8966801166534424
Epoch: 147  val_loss: 0.7160301628522575
Epoch: 147  val_accuracy: 0.015000000223517418
Training Time: 8.337750434875488
Epoch: 148  train_loss: 0.03311311818426148
Epoch: 148  train_accuracy: 0.023125000344589353
Finished Training
Validation Time: 0.8687667846679688
Epoch: 148  val_loss: 0.5642943604965694
Epoch: 148  val_accuracy: 0.010000000149011612
Training Time: 8.190211534500122
Epoch: 149  train_loss: 0.03151261827293297
Epoch: 149  train_accuracy: 0.022500000335276125
Finished Training
Validation Time: 0.8717164993286133
Epoch: 149  val_loss: 0.5837843799381517
Epoch: 149  val_accuracy: 0.012500000186264515


KeyboardInterrupt: 

In [21]:
torch.save(model, 'hierarchical_cnn.pt')

## High-Level Shallow Nets
Train binary shallow nets for high level categories(animals, natural, human, domestic, urban)

In [13]:
from cnnmult import CNN_Multilayer
from sklearn.feature_selection import chi2, SelectKBest
from sklearn.decomposition import PCA

clf = CNN_Multilayer()

clf.fit(train_X, train_y)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
melspectrogram_1 (Melspectro (None, 128, 313, 1)       296064    
_________________________________________________________________
conv1 (Conv2D)               (None, 120, 305, 32)      2624      
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 4, 17, 32)         0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 2176)              0         
_________________________________________________________________
dense_1 (Dense)              (None, 32)                69664     
_________________________________________________________________
dense_2 (Dense)              (None, 32)                1056      
_________________________________________________________________
dropout_1 (Dropout)          (None, 32)                0         
__________

Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


CNN_Multilayer(a_batch_size=128, a_epochs=50, batch_size=128, epochs=50,
        i_batch_size=128, i_epochs=50, validation_split=0.05, verbose=1)