# K-nearest neigbours

##### Imports

In [42]:
import time
import numpy as np
import os
from scipy.io import wavfile

# sklearn
from sklearn import neighbors
from sklearn.metrics import confusion_matrix

# visualization
import matplotlib.pyplot as plt
import seaborn

# torch
import torch
from torchvision import datasets, transforms

# Dataset
from torch.utils.data import DataLoader, Dataset

#Sklearn
from sklearn.model_selection import train_test_split

from audio_dataset import AudioDataset as AD


##### AudioDataset class

In [70]:
sliced_dataset = "short_audio_dataset"
sliced_dataset_lenght = 16050
# sliced_dataset = "shorter_audio_dataset"
# sliced_dataset_lenght = 4013
original_dataset = "audio_dataset"
original_dataset_lenght = 80249

class AudioDataset(Dataset):
    def __init__(self, root_path="./data/", drop_both=False, use_short=False, normalize=False, use_features=False):
        root_folder = root_path + original_dataset if not use_short else root_path + sliced_dataset
        self.use_features = use_features
        self.max_length = original_dataset_lenght if not use_short else sliced_dataset_lenght
        self.class_map = {"esben" : 0, "peter": 1, "both": 2}
        self.data = []
        self.wavs = []
        self.labels = []
        self.min_val = 10e10
        self.max_val = 0
        print("Start reading files")
        for subdir, dirs, files in os.walk(root_folder):
            for file_name in files:
                if drop_both and "both" in subdir:
                   continue
        
                file_path = os.path.join(subdir, file_name)
                self.sample_rate, wav = wavfile.read(file_path)
                wav = wav.astype(np.float32)
                
                if wav.shape[0] > self.max_length:
                    self.max_length = wav.shape[0]
                    print("Found wav with more length than specified max one, new max is:", wav.shape[0])
                
                wav = np.pad(wav, (0, self.max_length-wav.shape[0]))
                label_str = file_path.split('/')[-3][2:]
                label = (np.int64(self.class_map[label_str]))
                
                self.max_val = np.max(wav) if np.max(wav) > self.max_val else self.max_val
                self.min_val = np.min(wav) if np.min(wav) < self.min_val else self.min_val
                
                self.wavs.append(wav)
                self.labels.append(label)
               
        self.wavs = np.array(self.wavs)
        self.mu  = self.wavs.mean()
        self.std = np.std(self.wavs)
        # self.wavs = torch.Tensor(self.wavs)
        if normalize:
            self.wavs = (self.wavs + np.abs(self.min_val)) / (np.abs(self.min_val) + self.max_val)
            # self.wavs = torch.nn.functional.normalize(self.wavs, dim=1)
        
        print("="*40)
        print("Loaded DATABASE from {}\n{} total file\nLongest file is {} long\nMean: {}\nStandard deviation: {}\nNormalization: {}".
              format(root_folder, len(self.wavs), self.max_length, self.mu, self.std, normalize))
        print("="*40)
    
    
    def __len__(self):
        return len(self.wavs)
    
    def __getitem__(self, idx):
        wav = self.wavs[idx]
        label = self.labels[idx]
        wav_tensor = torch.from_numpy(wav)
        label_tensor = torch.Tensor(label)
        return wav_tensor, label_tensor


##### Initialize the Dataset

In [101]:
dataset = AudioDataset(root_path="../data/", drop_both=True, use_short=True, normalize=False)
data_len = len(dataset)
train_size, test_size, valid_size = int(data_len * 0.8), int(data_len * 0.1), int(data_len * 0.1)

data_train, data_test, data_valid = torch.utils.data.random_split(dataset, (train_size, test_size, valid_size))

Start reading files
Loaded DATABASE from ../data/short_audio_dataset
1000 total file
Longest file is 16050 long
Mean: -0.6988561153411865
Standard deviation: 2332.389404296875
Normalization: False


##### Define knn algorithm

In [103]:
def knn_param_search(train_data, train_labels, test_data, test_labels, 
                     metrics=('manhattan', 'euclidean', 'chebyshev'), 
                     ks=(1, 3, 5, 10, 25, 50, 100), algorithm='brute'):
  """
  Takes a dataset and plots knn classification accuracy 
  for different hyper parameters.

  n_train and n_test allows to subsample the dataset for faster iteration
  """
  x_train = np.array(train_data)
  y_train = np.array(train_labels)
  x_test = np.array(test_data)
  y_test = np.array(test_labels)

  for metric in metrics:
    print(f'Metric: {metric}', end=' ')
    for k in ks:
        print(f'k: {k:3d}')

        classifier = neighbors.KNeighborsClassifier(k, algorithm=algorithm, metric=metric)
        classifier = classifier.fit(x_train, y_train)

        start = time.time()

        labels = classifier.predict(x_test)
        duration = time.time() - start

        correct = labels == np.array(y_test)
        print(f'\rAccuracy: {correct.mean() * 100:.2f} %, Duration: {duration:.2f} s')
  return classifier

##### Call the KNN Algorithm 

In [104]:
classifier = knn_param_search(data_train.dataset.wavs, 
                              data_train.dataset.labels, 
                              data_test.dataset.wavs, 
                              data_test.dataset.labels,
                              metrics=['euclidean'],
                              ks=[1]
                             )

Metric: euclidean k:   1
Accuracy: 100.00 %, Duration: 4.52 s


In [105]:
predicted_labels = classifier.predict(data_valid.dataset.wavs)
correct = predicted_labels == np.array(data_valid.dataset.labels)
print(f' Accuracy: {correct.mean() * 100:.2f}')

 Accuracy: 100.00
