## Import

In [113]:
import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

import matplotlib.pyplot as plt
import numpy as np
rng = np.random.default_rng(123456)

import librosa
import librosa.display
import IPython.display as ipd

from scipy.io import wavfile

import os
from sklearn import neighbors

## AudioDataset class

In [122]:
sliced_dataset = "short_audio_dataset"
sliced_dataset_lenght = 16050
# sliced_dataset = "shorter_audio_dataset"
# sliced_dataset_lenght = 4013
original_dataset = "audio_dataset"
original_dataset_lenght = 80249

class AudioDataset(Dataset):
    def __init__(self, root_path="./data/", drop_both=False, use_short=False, normalize=False, use_features=False):
        root_folder = root_path + original_dataset if not use_short else root_path + sliced_dataset
        self.use_features = use_features
        self.max_length = original_dataset_lenght if not use_short else sliced_dataset_lenght
        self.class_map = {"esben" : 0, "peter": 1, "both": 2}
        self.data = []
        self.wavs = []
        self.labels = []
        self.features_list = []
        self.min_val = 10e10
        self.max_val = 0
        print("Start reading files")
        for subdir, dirs, files in os.walk(root_folder):
            for file_name in files:
                if drop_both and "both" in subdir:
                   continue
        
                file_path = os.path.join(subdir, file_name)
                self.sample_rate, wav = wavfile.read(file_path)
                wav = wav.astype(np.float32)
                
                if wav.shape[0] > self.max_length:
                    self.max_length = wav.shape[0]
                    print("Found wav with more length than specified max one, new max is:", wav.shape[0])
                
                wav = np.pad(wav, (0, self.max_length-wav.shape[0]))
                features = np.transpose(self.feature_extraction(wav, self.sample_rate, normalize=normalize))
                label_str = file_path.split('/')[-3][2:]
                label = (np.int64(self.class_map[label_str]))
                
                self.max_val = np.max(wav) if np.max(wav) > self.max_val else self.max_val
                self.min_val = np.min(wav) if np.min(wav) < self.min_val else self.min_val
                
                self.wavs.append(wav)
                self.features_list.append(features)
                self.labels.append(label)
               
        self.wavs = np.array(self.wavs)
        self.mu  = self.wavs.mean()
        self.std = np.std(self.wavs)
        # self.wavs = torch.Tensor(self.wavs)
        if normalize:
            self.wavs = (self.wavs + np.abs(self.min_val)) / (np.abs(self.min_val) + self.max_val)
            # self.wavs = torch.nn.functional.normalize(self.wavs, dim=1)
        
        print("="*40)
        print("Loaded DATABASE from {}\n{} total file\nLongest file is {} long\nMean: {}\nStandard deviation: {}\nNormalization: {}".
              format(root_folder, len(self.wavs), self.max_length, self.mu, self.std, normalize))
        print("="*40)
    
    def feature_extraction(self, wav, sample_rate, n_mfcc=128, normalize=False):
        # extract features from the audio
        mfcc = np.mean(librosa.feature.mfcc(y=wav, sr=sample_rate, n_mfcc=n_mfcc).T, axis=0)
        if normalize:
            mfcc = (mfcc + np.abs(np.min(mfcc))) / (np.abs(np.min(mfcc)) + np.max(mfcc))
        return mfcc

    def __len__(self):
        return len(self.wavs)
    
    def __getitem__(self, idx):
        wav = self.wavs[idx]
        label = self.labels[idx]
        features = self.features_list[idx]
        wav_tensor = torch.from_numpy(wav)
        label_tensor = torch.Tensor(label)
        features_tensor = torch.Tensor(features)
        return wav_tensor, label_tensor


## Loading data for training

In [124]:
audio_dataset = AudioDataset(root_path="../data/", drop_both=True, use_short=True, normalize=True, use_features=True)
dataset_len = len(audio_dataset)
train_size, test_size, valid_size = int(dataset_len * 0.7), int(dataset_len * 0.2), int(dataset_len * 0.1)

dataset_train, dataset_test, dataset_valid = torch.utils.data.random_split(audio_dataset, (train_size, test_size, valid_size))
print(audio_dataset.max_val)
kwargs = {'batch_size': 1, 'num_workers': 2}
loader_train = torch.utils.data.DataLoader(dataset_train, **kwargs, shuffle=True)
loader_test = torch.utils.data.DataLoader(dataset_test, **kwargs, shuffle=True)
loader_valid = torch.utils.data.DataLoader(dataset_valid, **kwargs, shuffle=True)

Start reading files


Loaded DATABASE from ../data/short_audio_dataset
1000 total file
Longest file is 16050 long
Mean: -0.6988561153411865
Standard deviation: 2332.389404296875
Normalization: True
32767.0


## Autoencoder class

In [130]:
class AutoEncoder(nn.Module):
    def __init__(self, wav_len):
        super().__init__()
        self.encoder = nn.Sequential(
            nn.Linear(128, 256),
            nn.ReLU(),
            # nn.Linear(64, 16),
            # nn.ReLU(),
            # nn.Linear(16, 4),
            # nn.ReLU()
        )
        self.decoder = nn.Sequential(
            nn.Linear(256, 128),
            nn.ReLU(),
            # nn.Linear(16, 64),
            # nn.ReLU(),
            # nn.Linear(64, 128),
            nn.Sigmoid()
        )

    def encode(self, x):
        return self.encoder(x)

    def decode(self, x):
        return self.decoder(x)
        
    def forward(self, x):
        return self.decode(self.encode(x))

## Feature extraction function

In [131]:
def feature_extraction(wav, sample_rate, n_mfcc=128, normalize=False):
    # extract features from the audio
    mfcc = np.mean(librosa.feature.mfcc(y=wav, sr=sample_rate, n_mfcc=n_mfcc).T, axis=0)
    if normalize:
        mfcc = (mfcc + np.abs(np.min(mfcc))) / (np.abs(np.min(mfcc)) + np.max(mfcc))
    return mfcc

## Training

In [132]:
model = AutoEncoder(audio_dataset.max_length)
opt = torch.optim.Adam(model.parameters())

for epoch in range(25):
    print(f'Epoch {epoch+1:02}/25', end=' ')
    for x, _ in loader_train:  
        x = np.transpose(feature_extraction(x.numpy(), audio_dataset.sample_rate, normalize=True))
        x = torch.Tensor(x)
        x_rec = model(x) 
        loss = F.binary_cross_entropy(x_rec, x)
        
        opt.zero_grad()
        loss.backward()
        opt.step()

        mse = F.mse_loss(x_rec, x)
        mae = F.l1_loss(x_rec, x)
        
    print(f'loss: {loss.item():.4f} - rmse: {np.sqrt(mse.item()):.4f} - mae: {mae.item():.4f}')

Epoch 01/25 

loss: 0.5693 - rmse: 0.1455 - mae: 0.0848
Epoch 02/25 loss: 0.4276 - rmse: 0.2248 - mae: 0.1850
Epoch 03/25 loss: 0.4861 - rmse: 0.1777 - mae: 0.1283
Epoch 04/25 loss: 0.4918 - rmse: 0.1495 - mae: 0.0693
Epoch 05/25 loss: 0.5713 - rmse: 0.1268 - mae: 0.0869
Epoch 06/25 loss: 0.5500 - rmse: 0.1281 - mae: 0.0799
Epoch 07/25 loss: 0.5206 - rmse: 0.1336 - mae: 0.0725
Epoch 08/25 loss: 0.5106 - rmse: 0.1308 - mae: 0.0554
Epoch 09/25 loss: 0.5977 - rmse: 0.0985 - mae: 0.0461
Epoch 10/25 loss: 0.5346 - rmse: 0.1213 - mae: 0.0661
Epoch 11/25 loss: 0.3974 - rmse: 0.1573 - mae: 0.0894
Epoch 12/25 loss: 0.5958 - rmse: 0.0987 - mae: 0.0428
Epoch 13/25 loss: 0.5183 - rmse: 0.1301 - mae: 0.0755
Epoch 14/25 loss: 0.5383 - rmse: 0.1197 - mae: 0.0564
Epoch 15/25 loss: 0.5048 - rmse: 0.1298 - mae: 0.0571
Epoch 16/25 loss: 0.6026 - rmse: 0.0967 - mae: 0.0412
Epoch 17/25 loss: 0.5349 - rmse: 0.1201 - mae: 0.0536
Epoch 18/25 loss: 0.5121 - rmse: 0.1242 - mae: 0.0547
Epoch 19/25 loss: 0.6116 - rmse: 0.0911 

In [118]:
def knn_param_search(train_data, train_labels, test_data, test_labels, 
                     metrics=('manhattan', 'euclidean', 'chebyshev'), 
                     ks=(1, 3, 5, 10, 25, 50, 100, 250), 
                     n_train=None, n_test=None, algorithm='brute'):
  """
  Takes a dataset and plots knn classification accuracy 
  for different hyper parameters.

  n_train and n_test allows to subsample the dataset for faster iteration
  """
  x_train = np.array(train_data)
  y_train = np.array(train_labels)
  x_test = np.array(test_data)
  y_test = np.array(test_labels)
  
  # subsample the dataset
  if n_train:
    x_train, y_train = x_train[:n_train], y_train[:n_train]
  if n_test:
    x_test, y_test = x_test[:n_test], y_test[:n_test]

  for metric in metrics:
    print(f'Metric: {metric}')
    for k in ks:
        print(f'\tk: {k:3d} Training', end='')
        classifier = neighbors.KNeighborsClassifier(k, algorithm=algorithm, metric=metric)
        classifier = classifier.fit(x_train, y_train)

        labels = classifier.predict(x_test)
        
        correct = labels == np.array(y_test)
        print(f'\r\tk: {k:3d} Accuracy: {correct.mean() * 100:.2f} %')

##### Call the KNN Algorithm 

In [119]:
knn_param_search(dataset_train.dataset.wavs, 
                 dataset_train.dataset.labels, 
                 dataset_valid.dataset.wavs, 
                 dataset_valid.dataset.labels,
                )

Metric: manhattan
	k:   1 Accuracy: 100.00 %
	k:   3 Accuracy: 67.40 %
	k:   5 Accuracy: 58.10 %
	k:  10 Accuracy: 52.40 %
	k:  25 Accuracy: 51.50 %
	k:  50 Accuracy: 60.90 %
	k: 100 Accuracy: 49.90 %
	k: 250 Accuracy: 50.00 %
Metric: euclidean
	k:   1 Accuracy: 100.00 %
	k:   3 Accuracy: 75.20 %
	k:   5 Accuracy: 64.00 %
	k:  10 Accuracy: 56.40 %
	k:  25 Accuracy: 50.20 %
	k:  50 Accuracy: 69.40 %
	k: 100 Accuracy: 50.00 %
	k: 250 Accuracy: 50.00 %
Metric: chebyshev
	k:   1 Accuracy: 100.00 %
	k:   3 Accuracy: 69.40 %
	k:   5 Accuracy: 65.90 %
	k:  10 Accuracy: 60.70 %
	k:  25 Accuracy: 56.60 %
	k:  50 Accuracy: 54.10 %
	k: 100 Accuracy: 52.20 %
	k: 250 Accuracy: 50.90 %
