In [8]:
import torch

from torch.utils.data import Dataset
import torch.nn as nn
from torch.optim.lr_scheduler import StepLR
import torchvision
from torchvision import datasets
from torchvision import datasets, models, transforms
import time
import copy
import IPython.display as ipd

from helper import *

# Pick the first CUDA GPU when one is visible to PyTorch, otherwise run on CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print("Device:", device)


Device: cuda:0


In [3]:
class RNN(nn.Module):
    """1-D CNN feature extractor followed by an LSTM and a linear classifier.

    The convolutional stack downsamples a 16-channel sequence by roughly a
    factor of 1000 (three stride-10 convolutions); the LSTM then consumes the
    resulting feature sequence and the last time step is mapped to 10 logits.

    Args:
        hidden_size: Number of features in the LSTM hidden state.
        lstm_layers: Number of stacked LSTM layers.
        cnn_start_channels: Output channels of the first convolution; the
            second and third convolutions use 2x and 4x this value.
    """

    def __init__(self, hidden_size=256, lstm_layers=2, cnn_start_channels=256):
        super(RNN, self).__init__()
        # The "hidden_{}" slot reports the LSTM hidden size (it previously
        # repeated cnn_start_channels, which was only right when both matched).
        self.name = "CNN({})_LSTM({}_hidden_{})".format(cnn_start_channels, lstm_layers, hidden_size)

        # NOTE: the order of the modules below determines the state_dict keys
        # ("conv_layers.0.weight", ...), so it must not change if existing
        # checkpoints are to remain loadable.
        # Shape comments use C = cnn_start_channels and a 16 x 176400 input.
        self.conv_layers = nn.Sequential(
            # input: 16 x 176400
            nn.Conv1d(in_channels=16, out_channels=cnn_start_channels, kernel_size=30, stride=10),
            nn.ReLU(),
            nn.BatchNorm1d(cnn_start_channels),
            # output: C x 17638

            nn.Conv1d(in_channels=cnn_start_channels, out_channels=2*cnn_start_channels, kernel_size=30, stride=10),
            nn.ReLU(),
            nn.BatchNorm1d(2*cnn_start_channels),
            # output: 2C x 1761

            nn.Conv1d(in_channels=2*cnn_start_channels, out_channels=4*cnn_start_channels, kernel_size=30, stride=10),
            nn.ReLU(),
            nn.BatchNorm1d(4*cnn_start_channels),
            # output: 4C x 174
        )

        # Inter-layer dropout only exists between stacked layers; PyTorch warns
        # when a non-zero value is passed for a single-layer LSTM, so disable
        # it in that case.
        self.rnn = nn.LSTM(input_size=4*cnn_start_channels,
                            hidden_size=hidden_size,
                            dropout=0.2 if lstm_layers > 1 else 0.0,
                            num_layers=lstm_layers)

        # Alternative recurrent core that was tried:
        #self.rnn = nn.GRU(input_size=4*cnn_start_channels,
        #                    hidden_size=hidden_size, dropout=0.2,
        #                    num_layers=lstm_layers)

        self.fc = nn.Linear(hidden_size, 10)

    def forward(self, inputs, hidden):
        """Run the network on a batch of sequences.

        Args:
            inputs: Tensor of shape (batch, 16, length).
            hidden: Initial recurrent state passed straight to the LSTM, or
                None to start from zeros.

        Returns:
            Tuple ``(logits, hidden)`` where ``logits`` has shape (batch, 10)
            (no softmax applied) and ``hidden`` is the final recurrent state.
        """
        output = self.conv_layers(inputs)

        # (batch, channels, time) -> (time, batch, channels): the LSTM is
        # built with the default batch_first=False.
        output = output.transpose(1, 2).transpose(0, 1)

        # Squash the CNN features before feeding them to the LSTM.
        output = torch.tanh(output)
        output, hidden = self.rnn(output, hidden)

        # Classify from the LSTM output at the last time step only.
        output = self.fc(output[-1, :, :])

        return output, hidden

    def get_name(self):
        """Return the descriptive model name built in ``__init__``."""
        return self.name


In [8]:
# Checkpoint holding 'model_state_dict' and 'optimizer_state_dict' entries.
model_filepath = 'saved_models/CNN(256)_LSTM(2_hidden_256)_71'

model = RNN()
model = model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01) # 0.005

# map_location remaps the stored tensors onto the device selected for this
# session, so a checkpoint saved on a GPU still loads on a CPU-only machine.
# torch.load unpickles the file: only load checkpoints from a trusted source.
checkpoint = torch.load(model_filepath, map_location=device)
model.load_state_dict(checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])



In [38]:
# Directory the test clip's .npz file is read from.
data_dir = "data/test/bitmap"

# Numeric id of the test clip; used to build both the .npz and the .wav file names.
test_id = 23

def bitmap_loader(path, target_len=22050):
    """Load a bit-packed clip from an .npz file and unpack it to 0/1 values.

    The archive's ``arr_0`` entry is expected to be a 2-D uint8 array whose
    rows hold packed bits. Every row is brought to exactly ``target_len``
    bytes (zero-padded on the right when shorter, truncated when longer) and
    then expanded so that each byte becomes 8 separate bit values.

    Args:
        path: Path to the .npz file.
        target_len: Number of packed bytes per row after padding/truncation.

    Returns:
        uint8 array of shape (rows, 8 * target_len) containing only 0s and 1s.
    """
    with np.load(path) as data:
        packed = data['arr_0']

    # Clips longer than the target would otherwise produce a negative pad
    # width, which np.pad rejects with a ValueError.
    packed = packed[:, :target_len]
    missing = target_len - packed.shape[1]
    if missing > 0:
        packed = np.pad(packed, ((0, 0), (0, missing)), 'constant')

    # Unpack every row in one vectorised call instead of a Python loop.
    return np.unpackbits(packed, axis=1)


# Load the single test clip as an unpacked 0/1 array.
test_file = bitmap_loader(data_dir + '/{}.npz'.format(test_id))

# Inference must run in eval mode: otherwise BatchNorm normalises with the
# statistics of this one-sample batch and LSTM dropout stays active, which
# distorts the prediction.
model.eval()
with torch.no_grad():
    # Add a batch dimension: (16, length) -> (1, 16, length).
    input_tensor = torch.Tensor(test_file).view(1,16,-1)
    input_tensor = input_tensor.to(device)
    # None lets the LSTM start from a zero hidden state.
    out, hidden = model(input_tensor, None)
    out = torch.softmax(out, -1)

pred = out.argmax(-1)
print(pred)

class_to_idx = {'air_conditioner': 0,
 'car_horn': 1,
 'children_playing': 2,
 'dog_bark': 3,
 'drilling': 4,
 'engine_idling': 5,
 'gun_shot': 6,
 'jackhammer': 7,
 'siren': 8,
 'street_music': 9}

# Invert the mapping so the predicted index can be turned back into a label.
idx_to_class = {v: k for k, v in class_to_idx.items()}

print(idx_to_class[int(pred)])

# Last expression of the cell: renders an audio player for the same clip.
ipd.Audio("data/test/Test/{}.wav".format(test_id)) # jackhammer


tensor([2], device='cuda:0')
children_playing


In [2]:
class ConvNet(nn.Module):
    """Small 2-D CNN classifier producing 10 logits per image.

    Five 3x3 convolution units (conv -> ReLU -> BatchNorm) are interleaved
    with three 2x2 max-pools and finished by a global average pool, leaving a
    64-dimensional feature vector that a single linear layer maps to 10
    classes. Shape comments below assume a 3x224x224 input.
    """

    def __init__(self):
        super().__init__()

        def conv_unit(n_in, n_out):
            # 3x3 convolution that keeps the spatial size, then ReLU and BatchNorm.
            return [
                nn.Conv2d(in_channels=n_in, out_channels=n_out, kernel_size=3, stride=1, padding=1),
                nn.ReLU(),
                nn.BatchNorm2d(n_out),
            ]

        def halve():
            # 2x2 max-pool that halves height and width.
            return nn.MaxPool2d(kernel_size=2, stride=2)

        # The module order fixes the state_dict keys, so it mirrors the
        # sequence the checkpoints were saved with.
        stages = [
            *conv_unit(3, 16), halve(),                         # -> 16x112x112
            *conv_unit(16, 32), halve(),                        # -> 32x56x56
            *conv_unit(32, 64), *conv_unit(64, 64), halve(),    # -> 64x28x28
            *conv_unit(64, 64), nn.AdaptiveAvgPool2d((1, 1)),   # -> 64x1x1
        ]
        self.conv_layers = nn.Sequential(*stages)

        self.linear_layer = nn.Sequential(nn.Linear(64, 10))

    def forward(self, input):
        """Return class logits of shape (batch, 10) for a batch of images."""
        features = self.conv_layers(input)
        # Flatten everything except the batch dimension.
        flat = features.view(input.size(0), -1)
        return self.linear_layer(flat)

In [5]:
# Checkpoint holding 'model_state_dict' and 'optimizer_state_dict' entries.
model_filepath = './saved_models/MFCC_CNN_96'

model_ft = ConvNet()
model_ft = model_ft.to(device)
optimizer_ft = torch.optim.SGD(model_ft.parameters(), lr=0.01, momentum=0.9)

# map_location remaps the stored tensors onto the device selected for this
# session, so a checkpoint saved on a GPU still loads on a CPU-only machine.
# torch.load unpickles the file: only load checkpoints from a trusted source.
checkpoint = torch.load(model_filepath, map_location=device)
model_ft.load_state_dict(checkpoint['model_state_dict'])
optimizer_ft.load_state_dict(checkpoint['optimizer_state_dict'])



In [20]:
from PIL import Image

data_dir = "data/test/spectograms/"
test_id = 23
test_file = data_dir + str(test_id) + ".png"

# Resize/crop to 224x224 and normalise each RGB channel.
# NOTE(review): the mean/std values are the commonly used ImageNet statistics;
# presumably the same transform was used when the model was trained - confirm.
data_transforms = transforms.Compose([
    transforms.Resize(224),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
])

with open(test_file, 'rb') as f:
    # convert('RGB') forces the pixel data to be read while the file is open.
    test_img = Image.open(f).convert('RGB')
    test_img = data_transforms(test_img)


# Inference must run in eval mode: otherwise BatchNorm normalises with the
# statistics of this one-image batch instead of the learned running
# statistics, which distorts the prediction.
model_ft.eval()
with torch.no_grad():
    # test_img is already a tensor; just add the batch dimension.
    input_tensor = test_img.view(1,3,224,224)
    input_tensor = input_tensor.to(device)
    out = model_ft(input_tensor)
    out = torch.softmax(out, -1)

pred = out.argmax(-1)
print(pred)

class_to_idx = {'air_conditioner': 0,
 'car_horn': 1,
 'children_playing': 2,
 'dog_bark': 3,
 'drilling': 4,
 'engine_idling': 5,
 'gun_shot': 6,
 'jackhammer': 7,
 'siren': 8,
 'street_music': 9}

# Invert the mapping so the predicted index can be turned back into a label.
idx_to_class = {v: k for k, v in class_to_idx.items()}

print(idx_to_class[int(pred)])

# Last expression of the cell: renders an audio player for the same clip.
ipd.Audio("data/test/Test/{}.wav".format(test_id)) # jackhammer

tensor([9], device='cuda:0')
street_music
