In [1]:
import os
import sklearn
import numpy as np
import matplotlib.pyplot as plt
from scipy import signal, sparse
from scipy.io import wavfile

# Dataset root. The audio-processing cell lists the entries of this directory
# as genre names and reads the files found inside each of them.
audio_dir = 'GTZAN/genres_original/'
# File names the audio-processing loop skips instead of loading
# (presumably because wavfile.read cannot parse them -- TODO confirm).
corrupted_file = ['jazz.00054.wav']

In [2]:
# PCA function
# input is (dim * sample)
def pca(sig, components):
    """Whitening PCA: project ``sig`` onto its leading principal components.

    Parameters
    ----------
    sig : array_like, shape (dim, n_samples)
        One observation per column, one feature per row.
    components : int
        Number of principal components to keep. Values greater than or
        equal to ``dim`` keep all ``dim`` components.

    Returns
    -------
    W : ndarray, shape (k, dim)
        Whitening matrix. Row ``i`` is the eigenvector of the covariance
        matrix with the ``i``-th largest eigenvalue, divided by the square
        root of that eigenvalue. Rows are ordered by decreasing variance.
    projected : ndarray, shape (k, n_samples)
        ``W @ (sig - mean)``; every row has unit sample variance.

    Raises
    ------
    ValueError
        If ``components`` is smaller than 1.
    numpy.linalg.LinAlgError
        If one of the requested directions has (numerically) zero variance
        and therefore cannot be whitened.
    """
    # Imported explicitly: ``from scipy import sparse`` alone does not
    # guarantee that the ``sparse.linalg`` submodule has been loaded.
    from scipy.sparse.linalg import eigsh

    if components < 1:
        raise ValueError("components must be a positive integer")

    data = sig - np.mean(sig, axis=1, keepdims=True)
    # atleast_2d: np.cov collapses a single-feature input to a 0-d array.
    cov = np.atleast_2d(np.cov(data))
    dim = cov.shape[0]

    if components < dim:
        # ARPACK only supports k < dim; it returns the k largest eigenpairs.
        eigen_vals, eigen_vec = eigsh(cov, k=components)
    else:
        # Full decomposition requested: use the dense solver directly.
        eigen_vals, eigen_vec = np.linalg.eigh(cov)

    # Neither solver returns the pairs largest-first, so sort explicitly.
    order = np.argsort(eigen_vals)[::-1]
    eigen_vals = eigen_vals[order]
    eigen_vec = eigen_vec[:, order]

    # Dividing by sqrt(eigenvalue) is meaningless for zero-variance
    # directions; fail loudly instead of returning inf/NaN.
    floor = np.finfo(eigen_vals.dtype).eps * dim * max(eigen_vals[0], 0.0)
    if eigen_vals[-1] <= floor:
        raise np.linalg.LinAlgError(
            "cannot whiten: a requested component has zero variance"
        )

    # Equivalent to inv(diag(sqrt(eigen_vals))) @ eigen_vec.T, without
    # building and inverting a matrix.
    W = eigen_vec.T / np.sqrt(eigen_vals)[:, np.newaxis]

    return (W, W @ data)

In [4]:
# audio processing
#
# For every genre folder under `audio_dir`, convert each clip into a
# log-magnitude STFT spectrogram and assign the whole clip to either the
# training or the test set. Every STFT frame (spectrogram column) becomes one
# sample labelled with the index of its genre.

num_train = 90       # clips per genre used for training; the rest are test clips
window = 'hamming'
nperseg = 4410       # STFT segment length in samples (gives nperseg // 2 + 1 frequency rows)
split_seed = 0       # seed for the train/test shuffle so the split is reproducible

# os.listdir returns entries in arbitrary order: sort so label ids are stable
# across runs and machines, and ignore stray non-directory entries.
genres = sorted(
    entry for entry in os.listdir(audio_dir)
    if os.path.isdir(os.path.join(audio_dir, entry))
)

train_data = []
train_label = []
test_data = []
test_label = []

rng = np.random.default_rng(split_seed)

for label, genre in enumerate(genres):
    print("Processing genre: " + genre)
    genre_dir = os.path.join(audio_dir, genre)
    files = sorted(os.listdir(genre_dir))

    # Split training and testing: one flag per file actually present, so the
    # mask can never be shorter than the file list.
    n_train_files = min(num_train, len(files))
    mask = np.zeros(len(files), dtype=bool)
    mask[:n_train_files] = True
    rng.shuffle(mask)

    for is_train, file in zip(mask, files):
        if file in corrupted_file:
            continue
        file_name = os.path.join(genre_dir, file)

        sample_rate, sound = wavfile.read(file_name)
        _, _, spec = signal.stft(sound, fs=sample_rate, window=window, nperseg=nperseg)
        # The small offset keeps log() finite for silent bins.
        sample = np.log(np.abs(spec) + 1e-7)
        frame_labels = [label] * sample.shape[1]
        if is_train:
            train_data.append(sample)
            train_label.extend(frame_labels)
        else:
            test_data.append(sample)
            test_label.extend(frame_labels)

# Concatenate along the frame axis: result is (frequency bins, total frames).
train_data = np.hstack(train_data)
test_data = np.hstack(test_data)
train_label = np.array(train_label)
test_label = np.array(test_label)

# One label per frame, in both splits.
assert train_data.shape[1] == train_label.shape[0]
assert test_data.shape[1] == test_label.shape[0]

print(train_data.shape)
print(test_data.shape)
print(train_label.shape)
print(test_label.shape)

Processing genre: blues
Processing genre: classical
Processing genre: country
Processing genre: disco
Processing genre: hiphop
Processing genre: jazz
Processing genre: metal
Processing genre: pop
Processing genre: reggae
Processing genre: rock
(2206, 271619)
(2206, 30199)
(271619,)
(30199,)


In [6]:
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Reduce every STFT frame with PCA, then classify frames with k-NN.
# The sklearn estimators expect (n_samples, n_features), hence the transposes
# of the (features, frames) matrices built above.

n_components = 20   # PCA dimensions kept per frame
k = 10              # neighbours used by the classifier
pca_seed = 0

# With the default svd_solver='auto', sklearn may choose a randomized SVD for
# large inputs; fixing random_state keeps the projection reproducible.
pca_model = PCA(n_components=n_components, random_state=pca_seed)
pca_model.fit(train_data.T)
X_train = pca_model.transform(train_data.T)
X_test = pca_model.transform(test_data.T)

print('Training...')
knn_model = KNeighborsClassifier(n_neighbors=k)
knn_model.fit(X_train, train_label)
print('Predicting...')
y_predict = knn_model.predict(X_test)
print('Done')

# Rows of the confusion matrix are true genres, columns are predictions;
# both are indexed by the integer labels stored in test_label.
print(confusion_matrix(test_label, y_predict))
print(classification_report(test_label, y_predict))

Training...
Predicting...
Done
[[1874   21   50  134  145  211  342    1   64  178]
 [  34 2534  138   47   15  181    2    6   30   33]
 [ 466  254  722  189   92  479  138   58  281  341]
 [ 285    6  171  904  306   91  431  214  236  376]
 [ 226    3  112  447  681  100  732  296  154  268]
 [ 582  201  128   47   51 1749   52   25   90   95]
 [ 191    1   19   77   88   20 2456    9   28  131]
 [   5   55  139  211  339  131   24 1799  139  178]
 [ 247   90  308  237  230  415  120  230  870  273]
 [ 388   13  108  307  170  223  714   84  422  591]]
              precision    recall  f1-score   support

           0       0.44      0.62      0.51      3020
           1       0.80      0.84      0.82      3020
           2       0.38      0.24      0.29      3020
           3       0.35      0.30      0.32      3020
           4       0.32      0.23      0.27      3019
           5       0.49      0.58      0.53      3020
           6       0.49      0.81      0.61      3020
     