In [1]:
import os
import sklearn
import librosa
import numpy as np
import matplotlib.pyplot as plt
from scipy import signal, sparse
from scipy.io import wavfile

audio_dir = '../GTZAN/genres_original/'
corrupted_file = ['jazz.00054.wav']

In [None]:
# PCA function
# input is (dim * sample)
def pca(sig, components):
    data = sig - np.mean(sig, axis=1, keepdims=True)
    cov = np.cov(data)
    eigen_vals, eigen_vec = sparse.linalg.eigsh(cov, k=components)
    A = np.linalg.inv(np.diag(np.sqrt(eigen_vals)))
    W = np.dot(A, eigen_vec.T)
#     W /= np.linalg.norm(W)
    
    return (W, W @ data)

In [2]:
from sklearn.preprocessing import StandardScaler

# audio processing
genres = os.listdir(audio_dir)

num_train = 90
window = 'hamming'
nperseg = 4410

n_mfcc = 20

train_data = []
train_label = []
test_data = []
test_label = []
vote_length = []

for genre in genres:
    print("Processing genre: " + genre)
    files = os.listdir(audio_dir + genre)
    
    # Split training and testing
    mask = num_train * [True] + (100 - num_train) * [False]
    np.random.shuffle(mask)

    for i, file in enumerate(files):        
        if file in corrupted_file:
            continue
        file_name = audio_dir + genre + '/' + file
        
        # spectrogram 15 dim => 0.50, 
        # sample_rate, sound = wavfile.read(file_name)
        # _, _, spec = signal.stft(y=sound, sr=sample_rate, window=window, nperseg=nperseg)
        # sample = np.log(np.abs(spec) + 1e-7)
        
        # mfcc most frequently considered coefficients 20 coeff => 0.50
        # sound, sample_rate = librosa.load(file_name)
        # sample = librosa.feature.mfcc(y=sound, sr=sample_rate, n_mfcc=n_mfcc)
        
        # mel spectrogram 15 dim => 0.42, 20 dim => 0.47, 40 dim => 0.5
        # svm 15 dim => 0.512
        # sound, sample_rate = librosa.load(file_name)
        # spec = librosa.feature.melspectrogram(y=sound, sr=sample_rate, window=window, n_fft=1024, hop_length=512)        
        # sample = np.log(np.abs(spec) + 1e-7)
        
        # librosa spectrogram
        sound, sample_rate = librosa.load(file_name)
        spec = librosa.stft(y=sound, window=window, n_fft=1024, hop_length=512)        
        sample = np.log(np.abs(spec) + 1e-7)
        
        # energy 0.17
        # sound, _ = librosa.load(file_name)
        # sample = librosa.feature.rms(y=sound)

        if mask[i]:
            train_data.append(sample)
            train_label.extend([genres.index(genre)] * sample.shape[1])
        else:
            test_data.append(sample)
            test_label.extend([genres.index(genre)])
            vote_length.append(sample.shape[1])
            

train_data = np.hstack(train_data)
test_data = np.hstack(test_data)
train_label = np.array(train_label)
test_label = np.array(test_label)
vote_length = np.array(vote_length)

scaler = StandardScaler()
scaler.fit(train_data.T)
train_data = scaler.transform(train_data.T)
test_data = scaler.transform(test_data.T)

print(train_data.shape)
print(test_data.shape)
print(train_label.shape)
print(test_label.shape)
assert test_label.shape == vote_length.shape

train_data_super = train_data
train_label_super = train_label
test_data_super = test_data
test_label_super = test_label

Processing genre: blues
Processing genre: classical
Processing genre: country
Processing genre: disco
Processing genre: hiphop
Processing genre: jazz
Processing genre: metal
Processing genre: pop
Processing genre: reggae
Processing genre: rock


MemoryError: Unable to allocate 2.22 GiB for an array with shape (1162987, 513) and data type float32

In [None]:
# mask for subsampling training data   
train_percentage = 1
num_true = int(train_label_super.shape[0] * train_percentage)
num_false = train_label_super.shape[0] - num_true
train_mask = num_true * [True] + num_false * [False]
np.random.shuffle(train_mask)

# validate_percentage = 0.1
# num_true = int(train_label_super.shape[0] * validate_percentage)
# num_false = train_label_super.shape[0] - num_true
# validate_mask = num_true * [True] + num_false * [False]
# np.random.shuffle(validate_mask)

# test_percentage = 1
# num_true = int(test_label_super.shape[0] * test_percentage)
# num_false = test_label_super.shape[0] - num_true
# test_mask = num_true * [True] + num_false * [False]
# np.random.shuffle(test_mask)

train_data = train_data_super[train_mask,:]
train_label = train_label_super[train_mask]
# test_data = test_data_super[test_mask,:]
# test_label = test_label_super[test_mask]
# validate_data = train_data_super[validate_mask,:]
# validate_label = train_label_super[validate_mask]

print(train_data.shape)
print(validate_data.shape)
print(test_data.shape)

print(train_label.shape)
print(validate_label.shape)
print(test_label.shape)

In [None]:
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix

pca_model = PCA(n_components=15)
pca_model.fit(train_data)
X_train = pca_model.transform(train_data)
X_test = pca_model.transform(test_data)

# X_train = train_data
# X_test = test_data

In [None]:
# KNN

k = 10

print('Training...')
knn_model = KNeighborsClassifier(n_neighbors=k, weights='distance')
knn_model.fit(X_train, train_label) 
print('Predicting...')
y_predict = knn_model.predict(X_test)
print('Done')

for i, genre in enumerate(genres):
    print (str(i) + ' ' + genre)
    
print(confusion_matrix(test_label, y_predict))
print(classification_report(test_label, y_predict))

In [4]:
from collections import Counter
cur_idx = 0
extended_test_label = []
voted_label = []
for i in range(len(test_label)):
#     c = Counter(y_predict[cur_idx:cur_idx+vote_length[i]])
#     voted_label.append(c.most_common(1)[0][0])
    cur_idx = cur_idx + vote_length[i]
    extended_test_label.extend([test_label[i]] * vote_length[i])

# print(extended_test_label)
# print(np.sum(voted_label == test_label) / test_label.shape[0])
# print(classification_report(extended_test_label, y_predict))

In [None]:
# from sklearn.mixture import GaussianMixture
# gmm = GaussianMixture(n_components=10)
# gmm.fit(X_train)

In [None]:
# from sklearn import metrics
# y_pred = gmm.predict(X_test)
# metrics.adjusted_rand_score(test_label, y_pred)

In [None]:
# import librosa.display

# for genre in genres:
#     print("genre: " + genre)
#     files = os.listdir(audio_dir + genre)
#     sound, sample_rate = librosa.load( audio_dir + genre + '/' + files[0])
#     spec = librosa.feature.melspectrogram(y=sound, sr=sample_rate, window=window)        
#     DB = librosa.amplitude_to_db(spec, ref=np.max)
#     librosa.display.specshow(DB, sr=sample_rate, x_axis='time', y_axis='log');
#     plt.colorbar(format='%+2.0f dB');
#     plt.show()

In [None]:
# from sklearn import svm
# from sklearn import metrics

# # Create a svm Classifier
# clf = svm.SVC(gamma='scale', verbose=True) # Linear Kernel

# # Train the model using the training sets
# # Training
# print('Training...')
# clf.fit(X_train, train_label)

# # Predict the response for test dataset
# # Predicting
# print('Predicting...')
# y_pred = clf.predict(X_test)
# print("Accuracy:", metrics.accuracy_score(test_label, y_pred))

In [None]:
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=10).fit(X_train)

In [None]:
from sklearn import metrics

y_predict = kmeans.predict(X_test)

from collections import Counter
cur_idx = 0
extended_test_label = []
voted_label = []
for i in range(len(test_label)):
    c = Counter(y_predict[cur_idx:cur_idx+vote_length[i]])
    voted_label.append(c.most_common(1)[0][0])
    cur_idx = cur_idx + vote_length[i]
    extended_test_label.extend([test_label[i]] * vote_length[i])
    
print(metrics.adjusted_rand_score(extended_test_label, y_predict))
metrics.adjusted_rand_score(voted_label, test_label)

In [8]:
print(len(vote_length))

100


In [13]:
from sklearn import metrics

import itertools
labels = [i for i in range(10)]
labels_permu = list(itertools.permutations(labels))
max_acc = 0
dummy = [0 for i in range(len(extended_test_label))]
for i in labels_permu:        
    a = np.array(i)
    b = []
    for j in range(len(vote_length)):
        b.extend([a[j%10]] * vote_length[j])
    acc = metrics.adjusted_rand_score(b, dummy)
    max_acc = max(max_acc, acc)
print(max_acc)

KeyboardInterrupt: 