In [1]:
import numpy as np
import matplotlib.pyplot as plot
from scipy.io import wavfile

In [2]:
from __future__ import print_function, division
import sys
import glob
from json import dumps, loads
import os
import math

import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
import time
import copy
import torch.utils.model_zoo as model_zoo

import torch
import pandas as pd
from skimage import io, transform, color
import numpy as np
import matplotlib.pyplot as plt
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, utils

import cv2
# Ignore warnings
import warnings
warnings.filterwarnings("ignore")
plt.ion()

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

In [4]:
class ConvNet(nn.Module):
    def __init__(self, num_classes=8):
        super(ConvNet, self).__init__()
        self.layer1 = nn.Sequential(
            nn.Conv2d(4, 32, kernel_size=5, stride=1, padding=2),
            nn.BatchNorm2d(32),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Dropout2d(0.3))
        self.layer2 = nn.Sequential(
            nn.Conv2d(32, 64, kernel_size=5, stride=1, padding=2),
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Dropout2d(p=0.3))
        self.fc = nn.Linear(12*12*64, num_classes)
        
    def forward(self, x):
        out = self.layer1(x)
        out = self.layer2(out)
        out = out.reshape(out.size(0), -1)
        out = self.fc(out)
        return out

In [5]:
import matplotlib as mpl
mpl.use('Agg')

In [6]:
plot.ioff()

In [7]:
class AudioDataset(Dataset):
    """Face dataset."""

    def __init__(self, csv_file, root_dir, transform=None):
        """
        Args:
            csv_file (string): Path to the csv file with annotations.
            root_dir (string): Directory with all the images.
            transform (callable, optional): Optional transform to be applied
                on a sample.
        """
        self.root_dir = root_dir
        self.transform = transform
        self.image_path = pd.read_csv(csv_file, sep="\t", names=['name', 'place', 'zero', 'score', 'class'], header=None)
        self.num_classes = len(self.image_path['class'].unique())
        self.class_dict = {}
        for idx in range(len(self.image_path['class'].unique())):
          self.class_dict[self.image_path['class'].unique()[idx]] = idx

    def __len__(self):
        return len(self.image_path)
    
    
    def get_specgram(self, path):
      samplingFrequency, signalData = wavfile.read(path)
      
      fig = plot.figure()
      fig.add_subplot(111)
      plot.grid(False)
      plot.axis('off')
      plot.specgram(signalData,Fs=samplingFrequency)
      fig.canvas.draw()
      plot.close()
      return np.array(fig.canvas.renderer._renderer)
    
    
    def random(self):
      self.image_path = shuffle(self.image_path)
    
    
    def batches(self, batch_size):
      for i in range(0, len(self.image_path), batch_size):
        path_t = list(self.image_path[i:i+batch_size]['name'])
        pearson_t = list(self.image_path[i:i+batch_size]['class'])
        batch_img = torch.ones(len(path_t), 4, 48, 48)
        batch_pearson = torch.ones(len(path_t))
        for j in range(len(path_t)):
          image = self.get_specgram(os.path.join(self.root_dir, path_t[j]))
          pearson = self.class_dict[pearson_t[j]]
          sample = {'image': image, 'pearson': pearson}
          if self.transform:
            sample = self.transform(sample)
          batch_img[j] = sample['image']
          batch_pearson[j] = sample['pearson']
        yield batch_img, batch_pearson

    def __getitem__(self, idx):
        img_name = self.image_path.loc[idx, 'name']
        image = self.get_specgram(os.path.join(self.root_dir, img_name))
        pearson = self.class_dict[self.image_path.loc[idx, 'class']]
        sample = {'image': image, 'pearson': pearson}

        if self.transform:
            sample = self.transform(sample)

        return sample['image'], sample['pearson']

In [8]:
class Rescale(object):
    """Rescale the image in a sample to a given size.

    Args:
        output_size (tuple or int): Desired output size. If tuple, output is
            matched to output_size. If int, smaller of image edges is matched
            to output_size keeping aspect ratio the same.
    """

    def __init__(self, output_size):
        assert isinstance(output_size, (int, tuple))
        self.output_size = output_size

    def __call__(self, sample):
        image, pearson = sample['image'], sample['pearson']

        h, w = image.shape[:2]
        if isinstance(self.output_size, int):
            if h > w:
                new_h, new_w = self.output_size * h / w, self.output_size
            else:
                new_h, new_w = self.output_size, self.output_size * w / h
        else:
            new_h, new_w = self.output_size
        
        new_size = max(int(new_h), int(new_w))
        new_h, new_w = new_size, new_size

        img = transform.resize(image, (new_h, new_w))

        return {'image': img, 'pearson': pearson}
    

class ToTensor(object):
    """Convert ndarrays in sample to Tensors."""

    def __call__(self, sample):
        image, pearson = sample['image'], sample['pearson']

        # swap color axis because
        # numpy image: H x W x C
        # torch image: C X H X W
        image = image.transpose((2, 0, 1))
        return {'image': torch.from_numpy(image),
                'pearson': torch.LongTensor([pearson])}

In [9]:
audio_dataset = AudioDataset(csv_file='meta/meta.txt', root_dir='audio', transform=transforms.Compose([
                                               Rescale(32),
                                               ToTensor()
                                           ]))

In [10]:
audio_dataset.random()

In [11]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(device)

cuda:0


In [12]:
model = ConvNet(8).to(device)
#model.load_state_dict(torch.load("mytraining.pt")) #Можно не тренировать модель, а сразу загрузить веса
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [13]:
from tqdm import tqdm_notebook as tqdm

In [None]:
# Train the model
for epoch in range(20):
    for images, labels in tqdm(audio_dataset.batches(64)):
        images = images.to(device)
        labels = labels.long().to(device)

        outputs = model(images)
        loss = criterion(outputs, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    print ('Epoch [{}/{}], Loss: {:.4f}' 
           .format(epoch+1, 20, loss.item()))

A Jupyter Widget


Epoch [1/5], Loss: 0.4492


A Jupyter Widget


Epoch [2/5], Loss: 0.3106


During request exception was raised: <urlopen error [WinError 10055] Невозможно выполнить операцию на сокете, т.к. буфер слишком мал или очередь переполнена>
Traceback (most recent call last):
  File "C:\ProgramData\Anaconda3\lib\urllib\request.py", line 1318, in do_open
    encode_chunked=req.has_header('Transfer-encoding'))
  File "C:\ProgramData\Anaconda3\lib\http\client.py", line 1239, in request
    self._send_request(method, url, body, headers, encode_chunked)
  File "C:\ProgramData\Anaconda3\lib\http\client.py", line 1285, in _send_request
    self.endheaders(body, encode_chunked=encode_chunked)
  File "C:\ProgramData\Anaconda3\lib\http\client.py", line 1234, in endheaders
    self._send_output(message_body, encode_chunked=encode_chunked)
  File "C:\ProgramData\Anaconda3\lib\http\client.py", line 1026, in _send_output
    self.send(msg)
  File "C:\ProgramData\Anaconda3\lib\http\client.py", line 964, in send
    self.connect()
  File "C:\ProgramData\Anaconda3\lib\http\client.py",

A Jupyter Widget


Epoch [3/5], Loss: 0.1878


During request exception was raised: <urlopen error [WinError 10055] Невозможно выполнить операцию на сокете, т.к. буфер слишком мал или очередь переполнена>
Traceback (most recent call last):
  File "C:\ProgramData\Anaconda3\lib\urllib\request.py", line 1318, in do_open
    encode_chunked=req.has_header('Transfer-encoding'))
  File "C:\ProgramData\Anaconda3\lib\http\client.py", line 1239, in request
    self._send_request(method, url, body, headers, encode_chunked)
  File "C:\ProgramData\Anaconda3\lib\http\client.py", line 1285, in _send_request
    self.endheaders(body, encode_chunked=encode_chunked)
  File "C:\ProgramData\Anaconda3\lib\http\client.py", line 1234, in endheaders
    self._send_output(message_body, encode_chunked=encode_chunked)
  File "C:\ProgramData\Anaconda3\lib\http\client.py", line 1026, in _send_output
    self.send(msg)
  File "C:\ProgramData\Anaconda3\lib\http\client.py", line 964, in send
    self.connect()
  File "C:\ProgramData\Anaconda3\lib\http\client.py",

A Jupyter Widget


Epoch [4/5], Loss: 0.0925


A Jupyter Widget


Epoch [5/5], Loss: 0.1111


A Jupyter Widget


Epoch [6/5], Loss: 0.0845


A Jupyter Widget


Epoch [7/5], Loss: 0.0707


A Jupyter Widget


Epoch [8/5], Loss: 0.0897


A Jupyter Widget


Epoch [9/5], Loss: 0.0212


A Jupyter Widget


Epoch [10/5], Loss: 0.0409


A Jupyter Widget


Epoch [11/5], Loss: 0.0085


A Jupyter Widget


Epoch [12/5], Loss: 0.0549


A Jupyter Widget


Epoch [13/5], Loss: 0.0064


A Jupyter Widget


Epoch [14/5], Loss: 0.0332


A Jupyter Widget


Epoch [15/5], Loss: 0.0481


A Jupyter Widget


Epoch [16/5], Loss: 0.0166


A Jupyter Widget


Epoch [17/5], Loss: 0.0313


A Jupyter Widget

In [44]:
at = os.listdir("test")

In [45]:
t_ser = pd.Series(at[:473])

In [46]:
model.eval()
with torch.no_grad():
    correct = 0
    total = 0
    for i in tqdm(range(0, len(t_ser), 64)):
        path_t = list(t_ser[i:i+64])
        batch_img = torch.ones(len(path_t), 4, 48, 48)
        batch_pearson = torch.ones(len(path_t))
        for j in range(len(path_t)):
          image = audio_dataset.get_specgram(os.path.join("test", path_t[j]))
          nclass = path_t[j].split("_")[0] if path_t[j].split("_")[0] != "knocking" else "_".join(path_t[j].split("_")[:2])
          pearson = audio_dataset.class_dict[nclass]
          sample = {'image': image, 'pearson': pearson}
          if audio_dataset.transform:
            sample = audio_dataset.transform(sample)
          batch_img[j] = sample['image']
          batch_pearson[j] = sample['pearson']
        outputs = model(batch_img.to(device))
        _, predicted = torch.max(outputs.data, 1)
        total += batch_pearson.size(0)
        correct += (predicted == batch_pearson.to(device).long()).sum().item()
    print('Test Accuracy of the model on the {} val audio: {} %'.format(len(t_ser), 100 * correct / total))

A Jupyter Widget


Test Accuracy of the model on the 473 val audio: 84.14376321353066 %


In [47]:
t_ser = pd.Series(at)
res_path = []
res_score = []
res_class.. to load your previously training model: = []
with torch.no_grad():
    for i in tqdm(range(0, len(t_ser), 64)):
        path_t = list(t_ser[i:i+64])
        batch_img = torch.ones(len(path_t), 4, 48, 48)
        for j in range(len(path_t)):
          image = audio_dataset.get_specgram(os.path.join("test", path_t[j]))
          sample = {'image': image, 'pearson': 404}
          if audio_dataset.transform:
            sample = audio_dataset.transform(sample)
          batch_img[j] = sample['image']
        outputs = model(batch_img.to(device))
        scores, predicted = torch.max(outputs.data, 1)
        res_path+=path_t
        res_score+= list(scores.cpu().numpy())
        res_class+= list(predicted.cpu().numpy())

A Jupyter Widget




In [49]:
ans_data = pd.DataFrame()
ans_data['name'] = res_path
ans_data['score'] = res_score
ans_data['class'] = res_class

In [51]:
ans_data['class'] = ans_data['class'].map({v:k for k,v in audio_dataset.class_dict.items()})

In [53]:
#Сохранил модель
torch.save(model.state_dict(), "mytraining.pt")

# Загрузка модели
#model.load_state_dict(torch.load('mytraining.pt'))

In [54]:
ans_data.to_csv("result.txt", sep="\t", index=False, header=None)