In [1]:
# In this notebook you will build a CNN and train it to classify 10 different 
# musical genres

# For this, we will use the GTZAN dataset
# see https://mirdata.readthedocs.io/en/latest/_modules/mirdata/datasets/gtzan_genre.html
# see "Musical genre classification of audio signals " by G. Tzanetakis and P. Cook

# Let's start by installing and loading mirdata
!pip install mirdata

import mirdata



In [2]:
# mount your Google Drive so that you have to download the data only once
# (the dataset is stored under the mounted Drive folder given as data_home below)
from google.colab import drive
drive.mount('/content/drive')

# initialize the gtzan dataset, using the mounted Drive folder as its data_home
gtzan = mirdata.initialize('gtzan_genre', data_home='/content/drive/MyDrive')

# download it (only once)
gtzan.download(partial_download=['all']) # comment out this line after you have downloaded the data

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


INFO: NumExpr defaulting to 2 threads.


In [3]:
# The GTZAN dataset has 1000 30-second-long "tracks" across 10 different musical genres

# There are 100 recordings for each genre.

# Let's split these recordings into training (~85%), validation (~10%), and test (~5%) sets

import numpy as np

# get the 1000 different "track_ids"
all_tracks = gtzan.track_ids

# randomly separate these "track_ids" into training, validation, and test sets

Ntracks = len(all_tracks)

# Seed the shuffle so that the split is identical after every kernel restart.
# With an unseeded shuffle, tracks that were in the test set in one session
# can end up in the training set of the next one.
SPLIT_SEED = 0
track_idx = np.random.default_rng(SPLIT_SEED).permutation(Ntracks)

# Derive the split boundaries from Ntracks (~85% / ~10% / remainder) instead of
# hard-coding 850/950, so the three sets stay disjoint and cover every track
# even if the dataset does not contain exactly 1000 tracks.
n_tr = int(round(0.85 * Ntracks))
n_vl = int(round(0.10 * Ntracks))

tr_tracks = [all_tracks[i] for i in track_idx[:n_tr]]
vl_tracks = [all_tracks[i] for i in track_idx[n_tr:n_tr + n_vl]]
ts_tracks = [all_tracks[i] for i in track_idx[n_tr + n_vl:]]

In [47]:
# To feed this data into a CNN, we must define a DataGenerator class that
# will create sequences of data and store them in mini batches

import numpy as np
import tensorflow as tf
import librosa

class DataGenerator(tf.keras.utils.Sequence):
    '''
    Keras Sequence that turns a list of GTZAN track ids into mini-batches of
    dB-scaled STFT magnitudes (X) and one-hot genre labels (y).

    The audio is read from the notebook-level `gtzan` mirdata dataset.
    '''

    # genre names in class-index order; a track gets the index of the first
    # genre name that appears in its track id
    GENRES = (
        'blues', 'classical', 'country', 'disco', 'hiphop',
        'jazz', 'metal', 'pop', 'reggae', 'rock',
    )

    # The class constructor
    def __init__(
          self,
          track_ids,      # a list with the track_ids that belong to the set
          batch_size=32,  # the default number of datapoints in a minibatch
          ntime=None,     # number of STFT time frames per example (required by the STFT features below)
          nfft=None,      # FFT size of the STFT (required by the STFT features below)
          n_channels=1,   # the default number of "channels" in the input to the CNN
          n_classes=10,   # the number of classes
          **kwargs        # forwarded to the Keras base class
        ):
        # newer Keras versions expect the base-class constructor to be called
        super().__init__(**kwargs)

        self.ntime = ntime
        self.nfft = nfft
        self.batch_size = batch_size
        self.track_ids = track_ids
        self.n_channels = n_channels
        self.n_classes = n_classes

    # this method returns how many batches there will be per epoch
    def __len__(self):
        '''
        Number of mini-batches per epoch.

        Rounds up so that the last, possibly smaller, batch is included.
        Rounding down would silently drop the trailing tracks from every
        epoch and from every evaluation.
        '''
        return int(np.ceil(len(self.track_ids) / self.batch_size))

    # iterates over the mini-batches by their index,
    # generates them, and returns them
    def __getitem__(self, index):
        '''
        Return the (X, y) pair for mini-batch number `index`.
        '''
        # get the track ids that will be in a batch
        # (slicing past the end of the list simply yields a shorter last batch)
        track_ids_batch = self.track_ids[index*self.batch_size:(index+1)*self.batch_size]

        # Generate data
        X, y = self.__data_generation(track_ids_batch)

        return X, y

    # maps a track id to its class index
    def _genre_index(self, track_id):
        '''
        Return the class index of the first genre in GENRES whose name occurs
        in `track_id`.

        Raises ValueError if no genre name occurs in the track id.
        '''
        for class_index, genre in enumerate(self.GENRES):
            if genre in track_id:
                return class_index
        raise ValueError('label does not belong to valid category')

    # actually loads the audio files and stores them in an array
    def __data_generation(self, track_ids_batch):
        '''
        Load the audio of every track in the batch and build the arrays.

        Returns X with shape [len(track_ids_batch), ntime, 1 + nfft//2, 1]
        (dB-scaled STFT magnitudes) and y with the one-hot labels of shape
        [len(track_ids_batch), n_classes].
        (to work with a time-frequency representation; you can work in another domain if you want)
        '''

        # Generate data
        X = []
        y = []
        for t in track_ids_batch:

            # load the file
            x, sr = gtzan.track(t).audio

            # calculate the stft (to work with a time-frequency representation; you can work in another domain if you want)
            # hint: do you really need to listen to 30 seconds of audio to know the genre of a popular song?
            # the hop is chosen so that the whole track is spread over ntime frames
            hop_length = len(x)//(self.ntime-1)
            spec = librosa.stft(x, n_fft = self.nfft, hop_length=hop_length).T

            # a centered STFT yields 1 + len(x)//hop_length frames, which is never
            # fewer than ntime; keep exactly ntime so every example has the same shape
            spec = spec[:self.ntime]

            # convert to db (to work with a time-frequency representation; you can work in another domain if you want)
            X.append(librosa.amplitude_to_db(np.abs(spec))[...,np.newaxis])

            # Store class index
            y.append(self._genre_index(t))

        return np.array(X), tf.keras.utils.to_categorical(np.array(y), num_classes=self.n_classes)

In [55]:
# a very simple (and bad) CNN
# you should make it better. This one is actually very very VERY bad

# learning parameters
lr = 0.0001

# input data and label parameters
ntime = 120
nfft = 256
nclasses = 10

# declaring the input to the model: [time frames, frequency bins, channels]
# (an STFT computed with n_fft = nfft has 1 + nfft//2 frequency bins)
inputs = tf.keras.Input(shape = (ntime,1+nfft//2,1))

# defining the CNN
cnn1 = tf.keras.layers.Conv2D(4, 5, activation = 'relu', padding='SAME')(inputs)
mxp1 = tf.keras.layers.MaxPooling2D(pool_size = 2, strides = 2, padding='SAME')(cnn1)
flat = tf.keras.layers.Flatten()(mxp1)
# one output per class, taken from nclasses instead of a hard-coded 10;
# no softmax here because the loss below is built with from_logits=True
outputs = tf.keras.layers.Dense(nclasses)(flat)

bad_cnn = tf.keras.Model(inputs=inputs, outputs=outputs)

# visualize the architecture
bad_cnn.summary()

# compile the model
bad_cnn.compile(
    loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True),
    optimizer=tf.keras.optimizers.Adam(learning_rate=lr),
    metrics=["accuracy"],
)

Model: "model_14"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_15 (InputLayer)       [(None, 120, 129, 1)]     0         
                                                                 
 conv2d_14 (Conv2D)          (None, 120, 129, 4)       104       
                                                                 
 max_pooling2d_14 (MaxPoolin  (None, 60, 65, 4)        0         
 g2D)                                                            
                                                                 
 flatten_14 (Flatten)        (None, 15600)             0         
                                                                 
 dense_14 (Dense)            (None, 10)                156010    
                                                                 
Total params: 156,114
Trainable params: 156,114
Non-trainable params: 0
____________________________________________________

In [53]:
# build one generator per split; both use the same spectrogram settings
# (ntime, nfft) that the model input was declared with
training_generator = DataGenerator(
    track_ids=tr_tracks,
    ntime=ntime,
    nfft=nfft,
)
validation_generator = DataGenerator(
    track_ids=vl_tracks,
    ntime=ntime,
    nfft=nfft,
)

In [54]:
# fit the model on the training batches and evaluate it on the validation
# batches after every epoch; the returned History object is kept in tr_logs
tr_logs = bad_cnn.fit(
    x=training_generator,
    validation_data=validation_generator,
    epochs=10,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f651661ddd0>

In [None]:
# after training a good CNN, do the usual visualization of the training and validation loss across epochs

# then inspect the model's accuracy on the validation set and the confusion matrix on the validation set

# If you do everything right and design a good CNN, you should be able to train a model that achieves
# over 70% accuracy on the validation set

# If you do everything perfectly and design an outstanding CNN, you will be able to train a model that achieves
# 90% accuracy on the validation set.

# When you are done, analyze the model's performance on the test set, 
# and create a post on our subreddit sharing your model's test-set accuracy
# and confusion matrix