## The GTZAN music genre dataset. 
The goal is to classify different genres of music. The genre labels are:

0 'blues'

1 'classical'

2 'country'

3 'disco'

4 'hiphop'

5 'jazz'

6 'metal'

7 'pop'

8 'reggae'

9 'rock'

There are 8000 labeled training examples, 100 validation examples, and 100 test examples. Each example is a 2-second clip (44,100 samples) sampled at 22,050 Hz.

In [1]:
!rm gtzan.zip
!gdown --id 1wAjDwxWMSjrWz4-tbQyWIyqGLMVetWed
!unzip gtzan.zip

import numpy as np
K = 10  # number of genre classes (labels 0..9 listed above)


def load_split(path, num_classes=K):
    """Load one dataset split from a 2-D ``.npy`` file.

    The last column of the stored array is read as the integer class label;
    every preceding column is treated as a feature (waveform sample).

    Parameters
    ----------
    path : str
        File name of the ``.npy`` array to load.
    num_classes : int, optional
        Width of the one-hot label matrix (defaults to ``K``).

    Returns
    -------
    features : ndarray
        All columns of the stored array except the last one.
    onehot : ndarray, shape (n_examples, num_classes)
        One-hot encoding of the labels.
    dense : ndarray, shape (n_examples,)
        The label column exactly as stored in the file.

    Raises
    ------
    IndexError
        If a label falls outside ``[-num_classes, num_classes)``.
    """
    data = np.load(path)
    dense = data[:, -1]
    n_examples = data.shape[0]
    onehot = np.zeros((n_examples, num_classes))
    # Fancy indexing: set column `label` to 1 in each example's row.
    onehot[np.arange(n_examples), dense.astype(int)] = 1
    return data[:, :-1], onehot, dense


# Same global names as before, one call per split instead of copy-pasted code.
X_tr, y_tr, y_tr_dense = load_split('gtzan_tr.npy')
X_vl, y_vl, y_vl_dense = load_split('gtzan_cv.npy')
X_ts, y_ts, y_ts_dense = load_split('gtzan_te.npy')

# Sanity check: report the shape of every feature/label array, in the order
# test, validation, train.
for caption, array in (
    ('The shape of X_ts is: ', X_ts),
    ('The shape of y_ts is: ', y_ts),
    ('The shape of X_vl is: ', X_vl),
    ('The shape of y_vl is: ', y_vl),
    ('The shape of X_tr is: ', X_tr),
    ('The shape of y_tr is: ', y_tr),
):
    print(caption, array.shape)

rm: cannot remove 'gtzan.zip': No such file or directory
Downloading...
From: https://drive.google.com/uc?id=1wAjDwxWMSjrWz4-tbQyWIyqGLMVetWed
To: /content/gtzan.zip
934MB [00:07, 132MB/s]
Archive:  gtzan.zip
  inflating: gtzan_tr.npy            
  inflating: gtzan_cv.npy            
  inflating: gtzan_te.npy            
The shape of X_ts is:  (100, 44100)
The shape of y_ts is:  (100, 10)
The shape of X_vl is:  (100, 44100)
The shape of y_vl is:  (100, 10)
The shape of X_tr is:  (8000, 44100)
The shape of y_tr is:  (8000, 10)


In [6]:
# Play back the last training clip and show its one-hot label underneath.
from IPython.display import Audio

fs = 22050  # sampling rate in Hz, as stated in the dataset description
display(Audio(data=X_tr[-1], rate=fs))
y_tr[-1]

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 1.])

In [8]:
# first let's standardize the data
import tensorflow as tf
from tensorflow.keras.layers.experimental import preprocessing

# The normalization statistics are learned from the training set only
# (adapt() sees X_tr), then the same fitted layer is applied to both the
# training and the validation inputs.
normalizer = preprocessing.Normalization()
normalizer.adapt(X_tr)
X_tr_n, X_vl_n = (normalizer(split) for split in (X_tr, X_vl))

In [44]:
# Dimensions and hyperparameters shared by the model cells below.

'''
# let's overfit to only a few datapoints
X_tr_n = normalizer(X_tr[[i*800 for i in range(C)]])
'''
# ^ The triple-quoted string above is a disabled experiment; it is never run.

D = X_tr_n.shape[-1]   # input dimensionality: number of columns of X_tr_n
C = y_tr.shape[-1]     # number of classes: width of the one-hot label matrix
lr, nepochs = 1.0, 20  # SGD learning rate and number of training epochs

# Softmax regression expressed with the Keras functional API:
# input (D,) -> Dense(C) class scores -> Softmax probabilities.
# Layer order matters: the Dense layer ends up at model.layers[1].
input_data = tf.keras.Input(shape=(D,))
scores = tf.keras.layers.Dense(units=C)(input_data)
y_hat = tf.keras.layers.Softmax()(scores)

model = tf.keras.Model(inputs=input_data, outputs=y_hat)
model.summary()

# The model already ends in a Softmax layer, so the categorical cross-entropy
# loss is used with its default arguments; optimisation is plain SGD.
model.compile(
    loss=tf.keras.losses.CategoricalCrossentropy(),
    optimizer=tf.optimizers.SGD(learning_rate=lr),
)

# batch_size equals the number of training rows, so every epoch performs a
# single full-batch gradient step. Validation data is evaluated each epoch.
model.fit(
    x=X_tr_n,
    y=y_tr,
    batch_size=X_tr_n.shape[0],
    epochs=nepochs,
    validation_data=(X_vl_n, y_vl),
)

Model: "model_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_8 (InputLayer)         [(None, 44100)]           0         
_________________________________________________________________
dense_7 (Dense)              (None, 10)                441010    
_________________________________________________________________
softmax_7 (Softmax)          (None, 10)                0         
Total params: 441,010
Trainable params: 441,010
Non-trainable params: 0
_________________________________________________________________
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x7fda9cd11590>

In [41]:
# Listen to training clip 800 multiplied sample-by-sample with the Dense
# layer's weight column for class 0 (note: the raw clip, not the normalized one).
from IPython.display import Audio

# model.layers[1] is the Dense layer; element 0 of get_weights() is taken as
# its weight matrix, of which column 0 belongs to class 0.
W = model.layers[1].get_weights()[0]
display(Audio(data=X_tr[800] * W[:, 0], rate=fs))