In [None]:
# In this notebook you will build a CNN and train it to classify 10 different 
# musical genres

# For this, we will use the GTZAN dataset hosted on Kaggle: https://www.kaggle.com/datasets/carlthome/gtzan-genre-collection
# see "Musical genre classification of audio signals" by G. Tzanetakis and P. Cook

In [None]:
# Mount your Google Drive at /content/drive so that you only have to download the data once
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# change the current working directory to be where this .ipynb is located within your Drive
# make sure you 'include the full folder path beginning with /content/gdrive/MyDrive'
%cd /content/drive/path/to/current/working/directory #modify this path

/content/drive/MyDrive/Teaching/DL4MIR2022/Notebooks


In [None]:
# Now we will download the GTZAN dataset from Kaggle. To do this, use the following steps.

# 1. Make a Kaggle account: https://www.kaggle.com/account/login?phase=startRegisterTab&returnUrl=%2F
# 2. Go to your account, scroll to the API section. Click Expire API Token to remove previous tokens if necessary.
# 3. Click on Create New API Token. It will download a kaggle.json file on your machine.

# 4. Upload the file from your machine:
!pip install -q kaggle
from google.colab import files
files.upload()

# 5. make a new directory within Drive named kaggle and copy the kaggle.json file there
# comment the mkdir command out if you have run this cell already
# !rm -r ~/.kaggle
# !mkdir ~/.kaggle
!mv ./kaggle.json ~/.kaggle/

# 6. change the permissions of the file
!chmod 600 ~/.kaggle/kaggle.json

In [None]:
# Now we are ready to download the GTZAN dataset; --unzip extracts the archive after the download.
# YOU ONLY NEED TO RUN THIS ONCE!
# If this fails with "Could not find kaggle.json", make sure the token file is in ~/.kaggle
# (see the previous cell) and run this cell again.
!kaggle datasets download -d carlthome/gtzan-genre-collection --unzip

Traceback (most recent call last):
  File "/usr/local/bin/kaggle", line 5, in <module>
    from kaggle.cli import main
  File "/usr/local/lib/python3.7/dist-packages/kaggle/__init__.py", line 23, in <module>
    api.authenticate()
  File "/usr/local/lib/python3.7/dist-packages/kaggle/api/kaggle_api_extended.py", line 166, in authenticate
    self.config_file, self.config_dir))
OSError: Could not find kaggle.json. Make sure it's located in /root/.kaggle. Or use the environment method.


In [None]:
# 9. Confirm the dataset is downloaded and unzipped in the expected location:
# you should see the full 'genres' folder path and a list of all the genres in the dataset
%cd genres
!ls
%cd ..

# 10. Pat yourself on the back!
# Kaggle is a great source for open-source and competition datasets. You can use this process to work with other datasets.

/content/drive/MyDrive/Teaching/DL4MIR2022/Notebooks/genres
blues  classical  country  disco  hiphop  jazz	metal  pop  reggae  rock
/content/drive/MyDrive/Teaching/DL4MIR2022/Notebooks


In [None]:
# The GTZAN dataset has 1000 30-second-long "tracks" across 10 different musical genres
# There are 100 recordings for each genre.

# Let's explore the format of the downloaded dataset.  We can look at the dataset on the Kaggle page to get an idea of the file structure:
#     https://www.kaggle.com/datasets/carlthome/gtzan-genre-collection
# The Data Explorer on the right-hand pane provides a graphical version of the file structure.
# We can see that each filename contains the genre and a unique number within that folder.  
# We can use these file names as our track ids.

import os
import numpy as np
import librosa

# get the 1000 different "track_ids" by recursing over the directory and its subdirectories

def getTrackIDs(dir_name):
    """Recursively collect the paths of all files found under ``dir_name``.

    Every entry of ``dir_name`` is joined to ``dir_name`` with ``os.path.join``.
    Directories are descended into and every other entry is returned as a
    path. Entries are visited in sorted name order so that the result is
    identical on every run and machine (``os.listdir`` on its own returns
    entries in arbitrary order).

    Parameters
    ----------
    dir_name : str
        Directory to scan.

    Returns
    -------
    list of str
        Paths of the files under ``dir_name``; empty if there are none.

    Raises
    ------
    OSError
        Propagated from ``os.listdir`` when ``dir_name`` cannot be listed
        (for example, when it does not exist).
    """
    all_tracks = []
    # Sort the listing: a deterministic order keeps anything derived from
    # this list (such as a dataset split) reproducible.
    for entry in sorted(os.listdir(dir_name)):
        full_path = os.path.join(dir_name, entry)
        if os.path.isdir(full_path):
            # Sub-directory: add the files it contains, keeping their order
            all_tracks.extend(getTrackIDs(full_path))
        else:
            all_tracks.append(full_path)
    return all_tracks

# Collect the path of every file under ./genres; these paths are used as the track ids
all_tracks = getTrackIDs('./genres')

# The dataset description above says there are 1000 tracks, so that is the number to expect here
print("Number of tracks: ", #your code here


# Q: Why do we want to store the filepath rather than just the filename?
# A: 

# It is always good to explore your data files before you begin working with them. Let's check out the structure of one of the audio files:
# Pick any single entry of all_tracks as the sample
sample_id = #your code here
print("Sample track ID:", sample_id)

# librosa.load returns the audio signal (x) and its sampling rate (sr)
x, sr = librosa.load(#your code here

print('\nSignal Shape:', #your code here
print('Sampling Rate:', #your code here

In [None]:
# Let's split these recordings into training (~85%), validation (~10%), and test (~5%) sets
# randomly separate these different "track_ids" intro training, validation, and test sets

Ntracks = len(#your code here

track_idx = np.random.choice(Ntracks,Ntracks,replace=False)

tr_tracks = #your code here
vl_tracks = #your code here
ts_tracks = #your code here

In [None]:
# To feed this data into a CNN, we must define a DataGenerator class that
# will create sequences of data and store them in mini batches

import tensorflow as tf

# Keras Sequence that serves the tracks listed in `track_ids` as mini-batches:
# __getitem__ returns one (X, y) pair per batch index, built by __data_generation.
# The lines marked "#your code here" are left for you to complete.
class DataGenerator(tf.keras.utils.Sequence):
    
    # The class constructor
    def __init__(
          self, 
          track_ids,      # a list with the track_ids that belong to the set
          batch_size=32,  # the default number of datapoints in a minibatch
          ntime=None,     # to work with a time-frequency representation (you can work in another domain or with other features if you want)
          nfft=None,      # to work with a time-frequency representation (you can work in another domain or with other features if you want)
          n_channels=1,   # the default number of "channels" in the input to the CNN
          n_classes=10,   # the number of classes          
        ):
            
        # the constructor only stores its arguments; no audio is loaded here
        self.ntime = ntime # to work with a time-frequency representation (you can work in another domain or with other features if you want)
        self.nfft = nfft   # to work with a time-frequency representation (you can work in another domain or with other features if you want)
        self.batch_size = batch_size        
        self.track_ids = track_ids
        self.n_channels = n_channels
        self.n_classes = n_classes                

    # this method returns how many batches there will be per epoch
    def __len__(self):
        '''
        divide the total number of datapoints in the set
        by the batch size. Make sure this returns an integer
        '''
        return #your code here

    # iterates over the mini-batches by their index,
    # generates them, and returns them
    def __getitem__(self, index):
        
        # get the track ids that will be in a batch
        # (select the entries of self.track_ids that belong to batch number `index`)
        track_ids_batch = #your code here

        # Generate data
        X, y = self.__data_generation(track_ids_batch)

        return X, y
  
    # actually loads the audio files and stores them in an array 
    # NOTE(review): the docstring below says "nmel", but the constructor argument is
    # called nfft - presumably both refer to the frequency axis; confirm and align the names.
    def __data_generation(self, track_ids_batch):
        ''''
        the matrix with the audio data will have a shape [batch_size, ntime, nmel, n_channels] 
        (to work with a time-frequency representation; you can work in another domain if you want)
        '''
        
        # Generate data
        X = []  # one input array per track in the batch
        y = []  # one integer class index per track in the batch
        for t in track_ids_batch:
            
            # load the file
            x, sr = #your code here
            # calculate the stft (to work with a time-frequency representation; you can work in another domain if you want)
            # hint: do you really need to listen 30 seconds of audio to know the genre of a popular song?
            x = librosa.stft(#your code here
            
            # convert to db (to work with a time-frequency representation; you can work in another domain if you want)
            X.append(librosa.amplitude_to_db(#your code here

            # Store class index
            # The genre is detected by looking for its name anywhere in the track id `t`;
            # the first matching name in the chain below wins.
            # NOTE(review): presumably `t` is a file path - a genre name occurring in a
            # parent folder name would also match here; confirm the paths are safe.
            if 'blues' in t:
              y.append(0)
            elif 'classical' in t:
              y.append(1)
            elif 'country' in t:
              y.append(2)
            elif 'disco' in t:
              y.append(3)
            elif 'hiphop' in t:
              y.append(4)
            elif 'jazz' in t:
              y.append(5)
            elif 'metal' in t:
              y.append(6)
            elif 'pop' in t:
              y.append(7)
            elif 'reggae' in t:
              y.append(8)
            elif 'rock' in t:
              y.append(9)
            else:
              raise ValueError('label does not belong to valid category')

        # return the input data batch along with the labels reformatted to be one-hot encoded vectors
        return np.array(X), tf.keras.utils.to_categorical(#your code here

In [None]:
# a very simple (and bad) CNN
# you should make it better. This one is actually very very VERY bad

# learning parameters
lr = 0.0001      # learning rate given to the Adam optimizer below

# input data and label parameters
ntime = 120      # size of the first (time) axis of each input
nfft = 256       # FFT size; the frequency axis of each input has 1 + nfft//2 bins
nclasses = 10    # number of classes, i.e. the size of the model's output

# declaring the input to the model: (time, frequency, channels)
inputs = tf.keras.Input(shape = (ntime,1+nfft//2,1))

# defining the CNN: one convolution, one max-pooling, then a single dense layer
cnn1 = tf.keras.layers.Conv2D(4, 5, activation = 'relu', padding='SAME')(inputs)
mxp1 = tf.keras.layers.MaxPooling2D(pool_size = 2, strides = 2, padding='SAME')(cnn1)
flat = tf.keras.layers.Flatten()(mxp1)
# One output per class, taken from nclasses rather than a hard-coded 10, so the
# output layer cannot drift out of sync with the label parameters above.
# There is no softmax here: the loss below is configured with from_logits=True.
outputs = tf.keras.layers.Dense(nclasses)(flat)

bad_cnn = tf.keras.Model(inputs=inputs, outputs=outputs)

# visualize the architecture
bad_cnn.summary()

# compile the model
bad_cnn.compile(
    loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True),
    optimizer=tf.keras.optimizers.Adam(learning_rate=lr),
    metrics=["accuracy"],
)

Model: "model_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_7 (InputLayer)        [(None, 120, 129, 1)]     0         
                                                                 
 conv2d_6 (Conv2D)           (None, 120, 129, 4)       104       
                                                                 
 max_pooling2d_6 (MaxPooling  (None, 60, 65, 4)        0         
 2D)                                                             
                                                                 
 flatten_6 (Flatten)         (None, 15600)             0         
                                                                 
 dense_6 (Dense)             (None, 10)                156010    
                                                                 
Total params: 156,114
Trainable params: 156,114
Non-trainable params: 0
_____________________________________________________

In [None]:
# build the mini-batch generators for the training and validation tracks;
# both use the same ntime / nfft settings as the model input
training_generator = DataGenerator(
    track_ids=tr_tracks,
    ntime=ntime,
    nfft=nfft,
)
validation_generator = DataGenerator(
    track_ids=vl_tracks,
    ntime=ntime,
    nfft=nfft,
)

In [None]:
# fit the model on the training generator for 10 epochs, using the validation
# generator as validation data; the value returned by fit is kept in tr_logs
tr_logs = bad_cnn.fit(
    x=training_generator,
    validation_data=validation_generator,
    epochs=10,
)

In [None]:
# after training a good CNN, do the usual visualization of the training and validation loss across epochs

# then inspect the model's accuracy on the validation set and the confusion matrix on the validation set

# If you do everything right and design a good CNN, you should be able to train a model that achieves
# over 70% accuracy on the validation set

# If you do everything perfectly and design an outstanding CNN, you will be able to train a model that achieves
# 90% accuracy on the validation set.

# When you are done, analyze the model's performance on the test set, 
# and create a post on our subreddit sharing your model's test-set accuracy
# and confusion matrix