In [34]:
%matplotlib inline

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn as skl
import sklearn.utils, sklearn.preprocessing, sklearn.decomposition, sklearn.svm, sklearn.ensemble
from sklearn.metrics import classification_report, accuracy_score
from sklearn.metrics import f1_score

import scipy

import librosa
import librosa.display
from IPython import display

In [2]:
def load_processed_data(train_path, val_path, test_path):
    npz_train = np.load(train_path)
    npz_val = np.load(val_path)
    npz_test = np.load(test_path)
    
    X_train = npz_train['arr_0']
    y_train = npz_train['arr_1']
    X_val = npz_val['arr_0']
    y_val = npz_val['arr_1']
    X_test = npz_test['arr_0']
    y_test = npz_test['arr_1']
    
    return X_train, y_train, X_val, y_val, X_test, y_test 

def encode_labels(y_train, y_val, y_test):
    # Convert label data into one-hot encoding for softmax
    le = skl.preprocessing.OneHotEncoder(sparse=False)
    y_train = le.fit_transform(y_train.reshape(-1, 1))
    y_val = le.fit_transform(y_val.reshape(-1, 1))
    y_test = le.fit_transform(y_test.reshape(-1, 1))
    return y_train, y_val, y_test

def scale_shuffle_data(X, y): 
    X_raw = librosa.core.db_to_power(X, ref=1.0)
    X_log = np.log(X_raw)
    X, y = skl.utils.shuffle(X_log, y)
    return X, y

def scale_shuffle_train_data(X, y): 
    X_raw = librosa.core.db_to_power(X, ref=1.0)
    X_log = np.log(X_raw)
    X, y = unison_shuffled_copies(X_log, y)
    return X, y

def unison_shuffled_copies(a, b):
    assert len(a) == len(b)
    p = np.random.permutation(len(a))
    return a[p], b[p]

In [None]:
X_train, y_train, X_val, y_val, X_test, y_test = load_processed_data(
    'train2_arr.npz', 'mel_valid_data.npz', 'mel_test_data.npz')
y_train, y_val, y_test = encode_labels(y_train, y_val, y_test)
X_train, y_train = scale_shuffle_data(X_train, y_train)
X_val, y_val = scale_shuffle_data(X_val, y_val)
X_test, y_test = scale_shuffle_data(X_test, y_test)

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.
In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.
In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [10]:
print(X_train.shape)
print(y_train.shape)
print(X_val.shape)
print(y_val.shape)

(1600, 640, 128)
(1600, 8)
(800, 640, 128)
(800, 8)


## Setup Deep Learning Model

In [2]:
import keras
from keras.models import Sequential, Model, load_model
from keras.layers import Conv2D, concatenate, MaxPooling2D, Flatten, Embedding, Lambda
from keras.layers import Input, Dense, Bidirectional, LSTM, Dropout, Activation, GRU

from keras.callbacks import ModelCheckpoint, ReduceLROnPlateau
from keras import backend as K
from keras.utils import np_utils
from keras.optimizers import Adam, RMSprop

from keras import regularizers

Using TensorFlow backend.


In [12]:
batch_size = 64
epochs = 50
num_classes = 8
n_frames = X_train.shape[1]
n_features = X_train.shape[2]

nb_filters1 = 16 
nb_filters2 = 32 
nb_filters3 = 64
nb_filters4 = 64
nb_filters5 = 64
ksize = (3,1)
pool_size_1 = (2,2) 
pool_size_2 = (4,4)
pool_size_3 = (4,2)

dropout_prob = 0.20
dense_size1 = 128
lstm_count = 64
num_units = 120

def build_pcrnn_model(input_layer):
    print('Building parallel RNN+CNN model...')
    
    # Set up convolutional layers
    conv_1 = Conv2D(filters = nb_filters1, kernel_size = ksize, 
                    strides = 1, padding = 'valid', activation = 'relu', 
                    name = 'conv_1')(input_layer)
    pool_1 = MaxPooling2D(pool_size_1)(conv_1)
    
    conv_2 = Conv2D(filters = nb_filters2, kernel_size = ksize, 
                    strides = 1, padding = 'valid', activation = 'relu', 
                    name = 'conv_2')(pool_1)
    pool_2 = MaxPooling2D(pool_size_1)(conv_2)
    
    conv_3 = Conv2D(filters = nb_filters3, kernel_size = ksize, 
                    strides = 1, padding = 'valid', activation = 'relu', 
                    name = 'conv_3')(pool_2)
    pool_3 = MaxPooling2D(pool_size_1)(conv_3)
    
    conv_4 = Conv2D(filters = nb_filters4, kernel_size = ksize, 
                    strides = 1, padding = 'valid', activation = 'relu', 
                    name = 'conv_4')(pool_3)
    pool_4 = MaxPooling2D(pool_size_2)(conv_4)
    
    conv_5 = Conv2D(filters = nb_filters5, kernel_size = ksize, 
                    strides = 1, padding = 'valid', activation = 'relu', 
                    name = 'conv_5')(pool_4)
    pool_5 = MaxPooling2D(pool_size_2)(conv_5)
    
    flatten1 = Flatten()(pool_5)
    
    
    # Set up recurrent layers
    pool_lstm1 = MaxPooling2D(pool_size_3, name = 'pool_lstm')(input_layer)
    
    squeezed = Lambda(lambda x: K.squeeze(x, axis = -1))(pool_lstm1)
    
    lstm = Bidirectional(GRU(lstm_count))(squeezed)
    
    
    # Concatenate output of CNN and RNN 
    combined = concatenate([flatten1, lstm], axis=-1, name='combined')
    
    # Softmax
    output = Dense(num_classes, activation = 'softmax', name = 'softmax')(combined)
    model = Model(input_layer, output)
    
    # Some kind of optimizer..?
    opt = RMSprop(lr = 0.005) 
    model.compile(loss = 'categorical_crossentropy', optimizer = opt, metrics=['accuracy'])
    
    print(model.summary())
    return model

def train(X_train, y_train, X_val, y_val, name):
    X_train = np.expand_dims(X_train, axis = -1)
    X_val = np.expand_dims(X_val, axis = -1)
    
    input_layer = Input((n_frames, n_features, 1))
    model = build_pcrnn_model(input_layer)
    
    # Checkpoints..?
    checkpoint_callback = ModelCheckpoint('./models/prcnn/' + name, monitor ='val_accuracy', verbose = 1,
                                          save_best_only = True, mode = 'max')
    
    reducelr_callback = ReduceLROnPlateau(monitor = 'val_accuracy', factor = 0.5, patience = 10, 
                                          min_delta = 0.01, verbose = 1)
    callbacks_list = [checkpoint_callback, reducelr_callback]
    
    print('Training the parallel RNN+CNN model...')
    history = model.fit(X_train, y_train, batch_size = batch_size, epochs = epochs,
                       validation_data = (X_val, y_val), verbose = 1, callbacks = callbacks_list)
    return model, history

In [94]:
model, history  = train(X_train, y_train, X_val, y_val, 'full-weights.best.h5')

Building parallel RNN+CNN model...
Model: "model_7"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_8 (InputLayer)            (None, 640, 128, 1)  0                                            
__________________________________________________________________________________________________
conv_1 (Conv2D)                 (None, 638, 128, 16) 64          input_8[0][0]                    
__________________________________________________________________________________________________
max_pooling2d_36 (MaxPooling2D) (None, 319, 64, 16)  0           conv_1[0][0]                     
__________________________________________________________________________________________________
conv_2 (Conv2D)                 (None, 317, 64, 32)  1568        max_pooling2d_36[0][0]           
_________________________________________________________


Epoch 00019: val_accuracy did not improve from 0.26875
Epoch 20/50

Epoch 00020: val_accuracy did not improve from 0.26875
Epoch 21/50

Epoch 00021: val_accuracy did not improve from 0.26875
Epoch 22/50

Epoch 00022: val_accuracy did not improve from 0.26875
Epoch 23/50

Epoch 00023: val_accuracy improved from 0.26875 to 0.27125, saving model to ./models/prcnn/weights.best.h5
Epoch 24/50

Epoch 00024: val_accuracy did not improve from 0.27125
Epoch 25/50

Epoch 00025: val_accuracy did not improve from 0.27125
Epoch 26/50

Epoch 00026: val_accuracy improved from 0.27125 to 0.27500, saving model to ./models/prcnn/weights.best.h5
Epoch 27/50

Epoch 00027: val_accuracy did not improve from 0.27500
Epoch 28/50

Epoch 00028: val_accuracy did not improve from 0.27500

Epoch 00028: ReduceLROnPlateau reducing learning rate to 0.0024999999441206455.
Epoch 29/50

Epoch 00029: val_accuracy did not improve from 0.27500
Epoch 30/50

Epoch 00030: val_accuracy did not improve from 0.27500
Epoch 31/50

## Testing...

In [3]:
weights = 'models/prcnn/base-weights.best.h5'
model = load_model(weights)

In [42]:
# Testing base test data:
base_npzfile = np.load('base_test_arr.npz')
base_X_test = base_npzfile['arr_0']
base_y_test = base_npzfile['arr_1']
# base_X_test, base_y_test = scale_shuffle_data(base_X_test, base_y_test)

In [43]:
X_test_raw = librosa.core.db_to_power(base_X_test, ref=1.0)
X_test_raw = np.log(X_test_raw)
print(np.amin(X_test_raw), np.amax(X_test_raw), np.mean(X_test_raw))

X_test_exp = np.expand_dims(X_test_raw, axis = -1)
y_pred = model.predict(X_test_exp)
y_pred = np.argmax(y_pred, axis = 1)
y_true = base_y_test

-18.420680743952367 1.554312234475218e-15 -10.044188292533065


In [44]:
y_true = y_true - 1
print(classification_report(y_true, y_pred))
print(accuracy_score(y_true, y_pred))

              precision    recall  f1-score   support

           0       0.59      0.51      0.55       100
           1       0.30      0.30      0.30       100
           2       0.24      0.28      0.26       100
           3       0.63      0.80      0.70       100
           4       0.43      0.41      0.42       100
           5       0.58      0.45      0.51       100
           6       0.26      0.19      0.22       100
           7       0.49      0.61      0.54       100

    accuracy                           0.44       800
   macro avg       0.44      0.44      0.44       800
weighted avg       0.44      0.44      0.44       800

0.44375


## Testing own models..

In [56]:
weights = 'models/prcnn/weights.best.h5'
model = load_model(weights)

In [57]:
X_test_exp = np.expand_dims(X_test, axis = -1)
y_pred = model.predict(X_test_exp)
y_pred = np.argmax(y_pred, axis = 1)
y_true = np.argmax(y_test, axis = 1)

In [58]:
print(classification_report(y_true, y_pred))
print(accuracy_score(y_true, y_pred))

              precision    recall  f1-score   support

           0       0.24      0.33      0.28       100
           1       0.21      0.12      0.15       100
           2       0.11      0.06      0.08       100
           3       0.22      0.21      0.22       100
           4       0.40      0.38      0.39       100
           5       0.20      0.23      0.21       100
           6       0.24      0.18      0.21       100
           7       0.28      0.46      0.35       100

    accuracy                           0.25       800
   macro avg       0.24      0.25      0.23       800
weighted avg       0.24      0.25      0.23       800

0.24625


## Ensembling

In [10]:
def load_metadata(path):
    tracks = pd.read_csv(os.path.join(METADATA_PATH, "tracks.csv"), index_col=0, header=[0, 1])
    keep_cols = [('set', 'split'), ('set', 'subset'), ('track', 'genre_top')]
    tracks = tracks[keep_cols]
    
    features = pd.read_csv(os.path.join(METADATA_PATH, "features.csv"), index_col=0, header=[0, 1, 2], skip_blank_lines=True)
    return tracks, features

def setup_data(tracks, features):
    small = tracks['set', 'subset'] == 'small'
    
    train = tracks['set', 'split'] == 'training'
    val = tracks['set', 'split'] == 'validation'
    test = tracks['set', 'split'] == 'test'
    
    y_train = tracks.loc[small & train, ('track', 'genre_top')]
    y_val = tracks.loc[small & val, ('track', 'genre_top')]
    y_test = tracks.loc[small & test, ('track', 'genre_top')]

    X_train = features.loc[small & train, 'mfcc']
    X_val = features.loc[small & val, 'mfcc']
    X_test = features.loc[small & test, 'mfcc']
    
    # Shuffle training data
    X_train, y_train = skl.utils.shuffle(X_train, y_train)
    
    # Standardize features - remove mean and scale accordingly
    standardize = skl.preprocessing.StandardScaler(copy=False)
    X_train = standardize.fit_transform(X_train)
    X_val = standardize.fit_transform(X_val)
    X_test = standardize.fit_transform(X_test)
    
    # Label encode outputs
    le = skl.preprocessing.LabelEncoder()
    y_train = le.fit_transform(y_train)
    y_val = le.fit_transform(y_val)
    y_test = le.fit_transform(y_test)
    
    return X_train, y_train, X_val, y_val, X_test, y_test

def train(c, X_train, y_train):
    c.fit(X_train, y_train)

def predict(c, X_test):
    y_pred = c.predict(X_test)
    return y_pred

In [11]:
METADATA_PATH = 'dataset/FMA/fma_metadata/'
tracks, features = load_metadata(METADATA_PATH)
X_train, y_train, X_val, y_val, X_test, y_test = setup_data(tracks, features)

In [36]:
c_svc = skl.svm.SVC()
train(c_svc, X_train, y_train)
y_pred_train = predict(c_svc, X_train)
y_pred_val = predict(c_svc, X_val)
y_pred_test_svc = predict(c_svc, X_test)
print("SVC - F1 scores:")
print("Train: {:.4f}".format(f1_score(y_train, y_pred_train, average='micro', pos_label=1)))
print("Valid: {:.4f}".format(f1_score(y_val, y_pred_val, average='micro', pos_label=1)))
print("Test: {:.4f}".format(f1_score(y_test, y_pred_test_svc, average='micro', pos_label=1)))

SVC - F1 scores:
Train: 0.7581
Valid: 0.5450
Test: 0.4650


In [37]:
c_rf = skl.ensemble.RandomForestClassifier(random_state=10, max_depth=30, n_estimators=300, min_samples_leaf=6, min_impurity_decrease=0.0002, 
                                           class_weight='balanced')
train(c_rf, X_train, y_train)
y_pred_train = predict(c_rf, X_train)
y_pred_val = predict(c_rf, X_val)
y_pred_test_rf = predict(c_rf, X_test)
print("Random Forests - F1 scores:")
print("Train: {:.4f}".format(f1_score(y_train, y_pred_train, average='micro', pos_label=1)))
print("Valid: {:.4f}".format(f1_score(y_val, y_pred_val, average='micro', pos_label=1)))
print("Test: {:.4f}".format(f1_score(y_test, y_pred_test_rf, average='micro', pos_label=1)))

Random Forests - F1 scores:
Train: 0.9269
Valid: 0.5275
Test: 0.4387


In [40]:
labels = []
prcnn = y_pred
svc = y_pred_test_svc
rf = y_pred_test_rf
labels.append(prcnn)
labels.append(svc)
labels.append(rf)
labels = np.array(labels)
labels = np.transpose(labels, (1, 0))
labels = scipy.stats.mode(labels, axis=1)[0]
labels = np.squeeze(labels)

print(accuracy_score(y_true, y_pred))
print(accuracy_score(y_true, y_pred_test_svc))
print(accuracy_score(y_true, y_pred_test_rf))
print(accuracy_score(y_true, labels))

0.44375
0.465
0.43875
0.46625
