In [39]:
import pandas as pd
import numpy as np
from tqdm import tqdm

import torch
import torch.autograd as autograd
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.utils.tensorboard import SummaryWriter   

from torch.nn.utils.rnn import pad_sequence,pack_padded_sequence,pack_sequence,pad_packed_sequence
from sklearn.preprocessing import LabelEncoder
import joblib

import librosa
import os

In [2]:
os.environ['CUDA_VISIBLE_DEVICES'] = '2'

In [3]:
train_post_competition = pd.read_csv('train_post_competition.csv')
test_post_competition = pd.read_csv('test_post_competition.csv')
train_post_competition

Unnamed: 0,fname,label,manually_verified,freesound_id,license
0,00044347.wav,Hi-hat,0,28739,Attribution
1,001ca53d.wav,Saxophone,1,358827,Attribution
2,002d256b.wav,Trumpet,0,10897,Creative Commons 0
3,0033e230.wav,Glockenspiel,1,325017,Attribution
4,00353774.wav,Cello,1,195688,Attribution
...,...,...,...,...,...
9468,ffec59fb.wav,Fireworks,0,343090,Creative Commons 0
9469,fff37590.wav,Hi-hat,0,33136,Attribution
9470,fff44ac6.wav,Laughter,0,133674,Attribution
9471,fff6a13d.wav,Chime,0,14640,Attribution


In [32]:
y, sr = librosa.load('audio_train/002d256b.wav', sr=11025)
y, sr

(array([-9.6781314e-06, -3.6996349e-05, -1.4843055e-05, ...,
        -9.9896242e-06, -3.5597372e-05, -2.0594454e-05], dtype=float32),
 11025)

In [40]:
labelEncoder = LabelEncoder()
labelEncoder.fit_transform(train_post_competition.label.value_counts().keys())
joblib.dump(labelEncoder, 'labelencoder.pkl')

['labelencoder.pkl']

## Load wav files and extract features through mfcc, save as npy files

In [34]:
train_path = 'audio_train/'
test_path = 'audio_test/'
trainX, trainY, devX, devY = [], [], [], []
N_MFCC = 24
count=0
for index in range(len(train_post_competition)-1000):
    file = train_post_competition['fname'].iloc[index]
    x, sr = librosa.load(os.path.join(train_path,file), sr=11025)
    mfccX = librosa.feature.mfcc(y=x, sr=sr, n_mfcc=N_MFCC)
    y =  labelEncoder.transform([train_post_competition['label'].iloc[index],])
    trainX.append(torch.tensor(mfccX.T))
    trainY.append(torch.tensor(y))
    count+=1
    print(count, end=' ' )
    if count>1000:
        break
count=0      
for index in range(len(train_post_competition)-1000, len(train_post_competition)):
    file = train_post_competition['fname'].iloc[index]
    x, sr = librosa.load(os.path.join(train_path,file), sr=11025)
    mfccX = librosa.feature.mfcc(y=x, sr=sr, n_mfcc=N_MFCC)
    y =  labelEncoder.transform([train_post_competition['label'].iloc[index],])
    devX.append(torch.tensor(mfccX.T))  
    devY.append(torch.tensor(y))
    count+=1
    if count>100:
        break

1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 

KeyboardInterrupt: 

In [13]:
base_path = 'augmented_mfcc24_npy_files/'
with open(base_path+'trainX.npy', 'rb') as f:
    trainX = np.load(f, allow_pickle=True)
with open(base_path+'trainY.npy', 'rb') as f:
    trainY = np.load(f, allow_pickle=True)
    
with open(base_path+'devX.npy', 'rb') as f:
    devX = np.load(f, allow_pickle=True)
with open(base_path+'devY.npy', 'rb') as f:
    devY = np.load(f, allow_pickle=True)
    
with open(base_path+'testX.npy', 'rb') as f:
    testX = np.load(f, allow_pickle=True)
with open(base_path+'testY.npy', 'rb') as f:
    testY = np.load(f, allow_pickle=True)

In [36]:
BATCH_SIZE = 8

In [10]:
class MyDataset(Dataset):
    def __init__(self, trainX, trainY):
        self.trainX = trainX
        self.trainY = trainY
    
    def __getitem__(self, idx):
        # return torch.FloatTensor(self.trainX[idx]), torch.FloatTensor(self.trainY[idx])
        return self.trainX[idx], self.trainY[idx]
        
    def __len__(self):
        assert np.array(self.trainX).shape[0] == np.array(self.trainY).shape[0]
        return np.array(self.trainX).shape[0]

In [38]:
data_train = MyDataset(trainX, trainY)
data_loader_train = DataLoader(data_train, batch_size=1, shuffle=True, drop_last=True)

data_loader_dev = DataLoader(MyDataset(devX, devY), batch_size=1, shuffle=True, drop_last=True)

batch_train = iter(data_loader_train).next()

In [104]:
def compute_accuracy(output, target):
    target_hat = torch.max(output, dim=1).indices
    correct_num = int(sum(target_hat==target))
    total_num = target.shape[0]
    return correct_num, total_num

In [26]:
def test(model, data_loader):
    # accuracy
    with torch.no_grad():
        correct_list = []
        total_list = []
        for x, y in tqdm(data_loader):
            x = x.to(device)
            y = y.to(device)
            
            outputs = lstm(x, device)
            correct_num, total_num = compute_accuracy(outputs, y)
            correct_list.append(correct_num)
            total_list.append(total_num)
        
        accuracy = sum(correct_list) / sum(total_list)
        
 
    return accuracy

In [7]:
class LSTM(nn.Module):

    def __init__(self, num_classes, input_size, hidden_size, num_layers, batch_size, bi):
        super(LSTM, self).__init__()
        
        self.num_classes = num_classes
        self.num_layers = num_layers
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.batch_size = batch_size
        self.bi = 2 if bi == True else 1


        self.lstm = nn.LSTM(input_size=input_size, hidden_size=hidden_size,
                            num_layers=num_layers, batch_first=True, bidirectional=False)
        
        self.fc = nn.Linear(self.bi*hidden_size, num_classes)
        # self.softmax = nn.Softmax(dim=1)



    def forward(self, x, device):
        h_0 = Variable(torch.zeros(
            self.num_layers , self.batch_size, self.hidden_size))
        
        c_0 = Variable(torch.zeros(
            self.num_layers, self.batch_size, self.hidden_size))
        
        h_0 = h_0.to(device)
        c_0 = c_0.to(device)
        
        # Propagate input through LSTM
        ula, (h_out, _) = self.lstm(x, (h_0, c_0))
        
        # h_out = h_out.view(-1, self.hidden_size)
        h_out = ula[:, -1, :]
        # h_out = h_out.view(-1, self.hidden_size)


        
        out = self.fc(h_out)
        # out = self.softmax(out)
        
        return out

In [30]:
BATCH_SIZE = 8
num_epochs = 100
learning_rate = 0.01
N_MFCC = 24

input_size = N_MFCC
hidden_size = 16
num_layers = 1

num_classes = 41

seq_len = batch_train[0].data.shape[0]

writer = SummaryWriter('summary_writer/')


lstm = LSTM(num_classes, input_size, hidden_size, num_layers, seq_length=seq_len, batch_size=1)

criterion = torch.nn.CrossEntropyLoss()   
optimizer = torch.optim.Adam(lstm.parameters(), lr=learning_rate)
#optimizer = torch.optim.SGD(lstm.parameters(), lr=learning_rate)

device = 'cuda' if torch.cuda.is_available() else 'cpu'
lstm.to(device)
criterion.to(device)

# Train the model
for epoch in range(num_epochs):
    LOSS = 0
    batch = 1
    LOSS_list = []
    for x, y in tqdm(data_loader_train):
        x = x.to(device)
        y = y.to(device)

        outputs = lstm(x, device)
        
        # obtain the loss function
        loss = criterion(outputs, y[0])
        


        # print(outputs)
        # print('outputs:',outputs.shape)
        # print('y:',y.shape) 
        LOSS += loss
        LOSS_list.append(LOSS.item())
        batch += 1
        if batch == BATCH_SIZE:
            optimizer.zero_grad()

            LOSS.backward()

            optimizer.step()
            LOSS = 0
            batch = 1
        
        writer.add_scalar('LSTM_mfcc_{}epoch'.format(num_epochs), LOSS, epoch)    

 
    dev_precision = test(lstm, data_loader_dev)
    
    print("Epoch: %d, loss: %1.5f, dev_precision: %2f" % (epoch, np.mean(LOSS_list), dev_precision))

torch.save(lstm, 'ckp/lstm-mfcc-{}epoch.ckp'.format(num_epochs))

NameError: name 'batch_train' is not defined

In [119]:
lstm = torch.load('ckp/augmented_bi_3layer_lstm-mfcc-22epoch.ckp')

In [120]:
data_loader_test = DataLoader(MyDataset(testX, testY), batch_size=1, shuffle=True, drop_last=True)
device = 'cuda' if torch.cuda.is_available() else 'cpu'

print('Testing')
test_precision = test(lstm, data_loader_test)
print('Test precision: %2f', test_precision)

Testing


100%|██████████| 1600/1600 [00:17<00:00, 93.58it/s] 

Test precision: %2f 0.48375





In [70]:
def compute_F1(model, data_loader):
    
    labelencoder = joblib.load('labelencoder.pkl')

    with torch.no_grad():
       
        matrix_dict = {}
        
        for x, y in tqdm(data_loader):
            x = x.to(device)
            y = y.to(device)

            outputs = lstm(x, device)
    
            target_hat = str(int(torch.max(outputs, dim=1).indices))
            target = str(int(y))
            if target_hat not in matrix_dict:
                matrix_dict[target_hat] = {'TP':0, 'FP':0, 'FN':0}
            if target not in matrix_dict:
                matrix_dict[target] = {'TP':0, 'FP':0, 'FN':0}
                
            if target_hat == target:
                matrix_dict[target_hat]['TP'] += 1
            else:
                matrix_dict[target_hat]['FP'] += 1
                matrix_dict[target]['FN'] += 1
                
        f1_dict = {}
        for encoded_label in matrix_dict:
            true_label = labelencoder.inverse_transform([int(encoded_label)])[0]
            TP = matrix_dict[encoded_label]['TP']
            FP = matrix_dict[encoded_label]['FP']
            FN = matrix_dict[encoded_label]['FN']
            
            if TP == 0:
                precision =0
                recall = 0
                F1_score = 0
            else:
                precision = TP/(TP+FP)
                recall = TP/(TP+FN)
                F1_score = 2 * (precision * recall)/(precision + recall)            
    
            f1_dict[true_label] = {'precision':precision, 'recall':recall, 'f1-score':F1_score}
            
    # Macro-F1 : an arithmetic mean of the per-class F1-scores
    sum_F1 = 0
    for label in f1_dict:
        sum_F1 += f1_dict[label]['f1-score']
    macro_f1 = sum_F1 / len(f1_dict)
    
    return macro_f1, f1_dict

In [121]:
macro_f1, f1_dict = compute_F1(lstm, data_loader_test)

100%|██████████| 1600/1600 [00:16<00:00, 96.41it/s] 


In [122]:
macro_f1

0.4419399717593248

In [123]:
# precision-recall-f1 matric

pd.merge(pd.DataFrame.from_dict(f1_dict).T, pd.DataFrame(train_post_competition.label.value_counts()), left_index=True, right_index=True)

Unnamed: 0,precision,recall,f1-score,label
Scissors,0.235294,0.48,0.315789,95
Hi-hat,0.296703,0.692308,0.415385,300
Squeak,0.0,0.0,0.0,300
Drawer_open_or_close,0.636364,0.482759,0.54902,158
Tearing,0.333333,0.444444,0.380952,300
Telephone,0.285714,0.291667,0.28866,120
Finger_snapping,0.5,0.69697,0.582278,117
Double_bass,0.608696,0.7,0.651163,300
Clarinet,0.415094,0.392857,0.40367,300
Flute,0.421875,0.490909,0.453782,300


In [129]:
len(test_post_competition)

9400