In [None]:
### Notebook for a PyTorch implementation of a CNN-based GTZAN classifier (companion to the Keras version)
# Potentially use as script for live coding tutorial after students have done the Keras version

# Main Goals:
# 1. more practice with pytorch via a direct comparison to keras
# 2. training with minibatches
# 3. use of the Dataset and Dataloader classes to create custom datasets
# 4. more practice with CNN architecture design
# 5. more practice with training, debugging and experimentation

In [None]:
# Mount your Google Drive so that you only have to download the data once.
# (This cell assumes the notebook is running on Google Colab.)
from google.colab import drive
drive.mount('/content/drive')
# Change into the Drive folder that holds this notebook.
# NOTE: 'path/to/notebook' looks like a placeholder -- replace it with your own folder.
%cd /content/drive/MyDrive/path/to/notebook

In [None]:
# Now we will download the GTZAN dataset from Kaggle. To do this, use the following steps.

# 1. Make a Kaggle account: https://www.kaggle.com/account/login?phase=startRegisterTab&returnUrl=%2F
# 2. Go to your account, scroll to the API section. Click Expire API Token to remove previous tokens if necessary.
# 3. Click on Create New API Token. It will download a kaggle.json file on your machine.

# 4. Upload the file from your machine:
!pip install -q kaggle
from google.colab import files
files.upload()

# 5. make a new directory within Drive named kaggle and copy the kaggle.json file there.
# comment the mkdir command out if you have run this cell already
# !rm -r ~/.kaggle
!mkdir ~/.kaggle
!mv ./kaggle.json ~/.kaggle/

# 6. change the permissions of the file
!chmod 600 ~/.kaggle/kaggle.json

In [None]:
# Now we are ready to download the GTZAN dataset.
# YOU ONLY NEED TO RUN THIS ONCE!
# --unzip extracts the archive after downloading; the cells below expect a ./genres folder
# to exist in the current directory afterwards.
!kaggle datasets download -d carlthome/gtzan-genre-collection --unzip

In [None]:
# The GTZAN dataset has 1000 30-second-long "tracks" across 10 different musical genres
# There are 100 recordings for each genre.

# Let's explore the format of the downloaded dataset.  We can look at the dataset on the Kaggle page to get an idea of the file structure:
#     https://www.kaggle.com/datasets/carlthome/gtzan-genre-collection
# The Data Explorer on the right-hand pane provides a graphical version of the file structure.
# We can see that each filename contains the genre and a unique number within that folder.  
# We can use these file names as our track ids.

# Step into the genres folder, list its contents, then return to the starting directory.
%cd genres
!ls
%cd ..

import os
import numpy as np
import librosa

# get the 1000 different "track_ids" by recursing over the directory and its subdirectories

def getTrackIDs(dir_name):
    """Recursively collect the paths of all files below ``dir_name``.

    Entries are visited in sorted order so that the returned list is the same
    on every run and every file system (``os.listdir`` itself returns entries
    in arbitrary order).

    Parameters
    ----------
    dir_name : str
        Directory to scan.

    Returns
    -------
    list of str
        One path per file, each built by joining ``dir_name`` with the
        sub-directory and file names; empty if no files are found.
    """
    all_tracks = []
    # Iterate over all the entries in a deterministic (alphabetical) order
    for entry in sorted(os.listdir(dir_name)):
        # Create full path
        full_path = os.path.join(dir_name, entry)
        if os.path.isdir(full_path):
            # descend into the sub-directory; extend() avoids re-copying the list
            all_tracks.extend(getTrackIDs(full_path))
        else:
            all_tracks.append(full_path)
    return all_tracks

# gather every file found below ./genres and report how many there are
all_tracks = getTrackIDs('./genres')
print("Number of tracks: ", len(all_tracks))

# use the final entry of the list as an example track
sample_id = all_tracks[-1]
print("Sample track ID:", sample_id)

# It is always good to explore your data files before you begin working with them.
# Load one audio file at its native sampling rate (sr=None) and look at its structure:
x, sr = librosa.load(sample_id, sr=None)
print('\nSignal Shape:', x.shape)
print('Sampling Rate:', sr)

In [None]:
# Let's split these recordings into training (~85%), validation (~10%), and test (~5%) sets
# by randomly assigning the different "track_ids" to the training, validation, and test sets.
# A fixed seed (applied to a sorted track list) keeps the split identical across kernel
# restarts, so test tracks never end up in the training set of a later session.

SPLIT_SEED = 0

Ntracks = len(all_tracks)

# sort first so the split does not depend on the order in which the files were listed
ordered_tracks = sorted(all_tracks)

# random permutation of the indices 0..Ntracks-1 (every track is used exactly once)
track_idx = np.random.default_rng(SPLIT_SEED).permutation(Ntracks)

# boundaries of the three index ranges
n_train = int(Ntracks*0.85)
n_train_val = int(Ntracks*0.95)

tr_tracks = [ordered_tracks[i] for i in track_idx[:n_train]]
vl_tracks = [ordered_tracks[i] for i in track_idx[n_train:n_train_val]]
ts_tracks = [ordered_tracks[i] for i in track_idx[n_train_val:]]

In [None]:
# Now we will import pytorch and we will use this library
# to build our custom dataloader and model

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

# pick the first GPU when CUDA is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print(device)

# To feed this data into a CNN, we must define a Dataset class that
# will create sequences of data and store them in mini batches

class GTZAN(Dataset):
    """Map-style dataset over GTZAN audio files, meant to be wrapped in a DataLoader.

    Each item is a (features, class index) pair for one track.  The parts marked
    "your code here" are exercise stubs that still have to be implemented.
    """

    # The class constructor
    def __init__(
          self,
          track_ids,        # a list with the track_ids that belong to the set
          ntime=None,       # to work with a time-frequency representation (you can work in another domain or with other features if you want)
          nfft=None,        # to work with a time-frequency representation (you can work in another domain or with other features if you want)
          n_channels=1,     # the default number of "channels" in the input to the CNN
          n_classes=10,     # the number of classes
          batch_size=None,  # optional and only stored; mini-batches are formed by the DataLoader, not by this class
        ):

        self.ntime = ntime # to work with a time-frequency representation (you can work in another domain or with other features if you want)
        self.nfft = nfft   # to work with a time-frequency representation (you can work in another domain or with other features if you want)
        # batch_size used to be read from an undefined name, which raised a NameError
        # on every construction; it is now an optional constructor argument.
        self.batch_size = batch_size
        self.track_ids = track_ids
        self.n_channels = n_channels
        self.n_classes = n_classes

    # this method returns how many samples are in the set
    def __len__(self):

        return #your code here

    # get individual data, label pairs
    def __getitem__(self, index):

        #your code here

        return #your code here

    # actually loads the audio file and converts it to a tensor
    # (the double leading underscore name-mangles this method, so call it as
    #  self.__data_generation(t) from inside the class)
    def __data_generation(self, t):
        '''
        Build the (X, y) pair for the track whose path/id is t.

        the sample of audio data should have a shape [n_channels, ntime, nfreq],
        where nfreq is the number of frequency bins of the chosen representation
        (to work with a time-frequency representation; you can work in another domain if you want)

        Raises ValueError if t does not contain any of the ten genre names.
        '''
        # load the file

        # calculate the stft (to work with a time-frequency representation; you can work in another domain if you want)

        # convert to db (to work with a time-frequency representation; you can work in another domain if you want)

        # Store class index (the genre name is looked up as a substring of t)
        if 'blues' in t:
          y = 0
        elif 'classical' in t:
          y = 1
        elif 'country' in t:
          y = 2
        elif 'disco' in t:
          y = 3
        elif 'hiphop' in t:
          y = 4
        elif 'jazz' in t:
          y = 5
        elif 'metal' in t:
          y = 6
        elif 'pop' in t:
          y = 7
        elif 'reggae' in t:
          y = 8
        elif 'rock' in t:
          y = 9
        else:
          raise ValueError('label does not belong to valid category')

        #unsqueeze to add a channel dimension to work with nn.Conv2d(): https://pytorch.org/docs/stable/generated/torch.nn.Conv2d.html
        # your code here

        # NOTE: X must be created by the exercise code above before this line can run
        return X,y

In [None]:
# input data and label parameters
ntime, nfft, nclasses = 120, 256, 10

# both datasets share the same feature settings
feature_kwargs = dict(ntime=ntime, nfft=nfft)
train_data = GTZAN(tr_tracks, **feature_kwargs)
val_data = GTZAN(vl_tracks, **feature_kwargs)

# both loaders serve shuffled mini-batches of 32 items, loaded in the main process
loader_kwargs = dict(batch_size=32, shuffle=True, num_workers=0)
train_loader = DataLoader(train_data, **loader_kwargs)
val_loader = DataLoader(val_data, **loader_kwargs)

In [None]:
# the pytorch version of a simple (bad) CNN

# learning parameters (starting values -- tune them while you experiment)
lr = 1e-3       # learning rate passed to the Adam optimizer
reg = 1e-4      # weight_decay (L2 regularization strength) passed to the Adam optimizer
nepochs = 20    # number of passes over the training set in the main training loop

#initialize 1) the model architecture and 2) the forward data flow through the net
class Model(nn.Module):
    """A deliberately simple CNN: one 5x5 convolution, ReLU, 2x2 max-pooling and a linear output layer.

    Parameters
    ----------
    out_features : int
        Number of output logits (one per class).
    device : torch.device or str
        Stored on the instance only; the caller is responsible for moving the model with .to(device).
    in_features : int, optional
        Size of the flattened feature map fed to the output layer.  The default, 14384 = 4 * 58 * 62,
        matches an input of shape [batch, 1, 120, 129] (presumably 120 time frames x 129 STFT bins
        for nfft=256).  Change it if you change the input size or the layers in front of the
        output layer; un-comment the print in forward() to read off the required value.
    """
    # 1) Define and initialize the neural network
    # Hint: see: https://pytorch.org/docs/stable/generated/torch.nn.Conv2d.html
    # torch.nn.Conv2d(in_channels, out_channels, kernel_size, stride=1, padding=0, dilation=1, groups=1, bias=True, padding_mode='zeros', device=None, dtype=None)
    def __init__(self, out_features, device, in_features=14384):
      super().__init__()

      # define model parameters
      self.out_features = out_features
      self.device = device
      self.in_features = in_features

      #Define the model layers
      self.conv = nn.Conv2d(1, 4, 5)             # 1 input channel -> 4 feature maps, 5x5 kernel, no padding
      self.relu = nn.ReLU()
      self.max_pooling2d = nn.MaxPool2d(2,2)     # halves both spatial dimensions (rounding down)
      self.flat = nn.Flatten()                   # keeps the batch dimension, flattens the rest
      self.out = nn.Linear(in_features,out_features)

    # 2) Specify how data will pass through our model
    def forward(self, x):
      """Map a batch x of shape [batch, 1, H, W] to raw class logits of shape [batch, out_features]."""
      z1 = self.conv(x)
      o1 = self.relu(z1)
      o2 = self.max_pooling2d(o1)
      z2 = self.flat(o2)
      # print(z2.shape)   # un-comment to find the in_features value for a new input size
      theta = self.out(z2)
      return theta

# build the network, cast its parameters to float32 and move it to the chosen device
model = Model(out_features=nclasses, device=device)
model = model.float().to(device)

# Adam optimizer with the learning parameters chosen above;
# weight_decay=reg is what applies our L2 regularization
optimizer = torch.optim.Adam(params=model.parameters(), lr=lr, weight_decay=reg)

# The loss function lives outside the model class and is created before the training loop.
# Cross-entropy works on raw logits: softmax + negative log-likelihood
loss_fn = nn.CrossEntropyLoss()

# Then, we will create a function to compare the predicted classes and the actual classes to calculate the accuracy
def multi_acc(theta, Y):
    """Compare predicted and true classes for one batch.

    theta : logits with one row per example and one column per class
    Y     : true class indices, one per example

    Returns (accuracy, Yhat): the fraction of correct predictions (a 0-dim
    tensor between 0 and 1) and the predicted class index of every example.
    """
    # log-softmax turns logits into log-probabilities; the largest entry of a row
    # marks the predicted class
    log_probs = torch.log_softmax(theta, dim=1)
    Yhat = torch.max(log_probs, dim=1).indices

    # 1.0 where the prediction matches the label, 0.0 elsewhere
    hits = (Yhat == Y).float()
    # fraction of matches in the batch
    accuracy = hits.sum() / hits.shape[0]
    return accuracy, Yhat


############## Model Information #####################################
# Print the architecture, then the name and shape of every parameter tensor
print(f"Model structure: {model}\n")
for layer_name, weights in model.named_parameters():
    print(f"Layer: {layer_name} | Size: {weights.size()}")

# element count and trainable flag of each parameter tensor
param_counts = [(p.numel(), p.requires_grad) for p in model.parameters()]

# total parameters and trainable parameters
total_params = sum(count for count, _ in param_counts)
print(f"\n{total_params:,} total parameters.")
total_trainable_params = sum(count for count, trainable in param_counts if trainable)
print(f"{total_trainable_params:,} training parameters.\n")

In [None]:
# now that the model has been built, let's see how it performs on the validation data
# before being trained.

# Q: why do we use the torch.no_grad() method here?
# A:
# Q: does the evaluation before training agree with your expectation?  Why does this initial loss value make sense? Hint: Think about the log-likelihood equation.
# A: 

# NOTE: every "#your code here" marker below sits inside call parentheses or inside the
# f-string, so this cell will not run until each marker is replaced with real code.

#sample pass
# running totals over all validation batches
loss = 0.0
acc = 0.0
with torch.no_grad():
  for batch in val_loader:
    # each batch is a (features, labels) pair
    X, y = batch
    theta = model(#your code here)
    loss += loss_fn(theta, #your code here)
    # multi_acc returns (accuracy, predictions); only the accuracy is kept
    batch_acc,_ = multi_acc(theta, #your code here)
    acc += batch_acc

  # only the loss is reported here; acc is accumulated above but not printed
  print(f'Loss: {#your code here}')

In [None]:
# now we can move on to train the model
# we will add 2 inner for-loops to our training now that we are dealing with our data in mini batches
# thus, we will also need to make sure we track our batch loss appropriately

# two dictionaries holding one list per data split; the training loop appends
# one value per epoch (accuracy/epoch and loss/epoch for train and validation)
accuracy_stats = {split: [] for split in ('train', 'val')}
loss_stats = {split: [] for split in ('train', 'val')}

############## Main Training Loop ###################
# One epoch = one pass over train_loader (with parameter updates) followed by one pass
# over val_loader (forward only).  Per-epoch loss and accuracy are the averages of the
# per-batch values and are stored as plain Python floats.
for epoch in range(nepochs):

    # reset the running averages for this epoch
    train_loss = 0.0
    train_acc = 0.0
    val_loss = 0.0
    val_acc = 0.0

    # training mode: layers such as dropout / batch-norm (if the model has any) behave
    # differently during training and evaluation, so switch modes explicitly
    model.train()
    for i, batch in enumerate(train_loader):

        X, y = batch
        # move the mini-batch to the same device as the model (once per batch)
        X, y = X.to(device), y.to(device)

        # zero the parameter gradients left over from the previous step
        optimizer.zero_grad()

        # Training: forward + backward + optimize
        theta = model(X)
        loss = loss_fn(theta, y)
        acc, Yhat = multi_acc(theta, y)

        loss.backward()
        optimizer.step()

        # .item() converts the 0-dim tensors to floats, so the stored statistics
        # do not keep tensors (possibly on the GPU) alive
        train_loss += loss.item() / len(train_loader)
        train_acc += acc.item() / len(train_loader)

    # evaluation mode for the validation pass
    model.eval()
    # Validation: forward only, no gradients are tracked
    with torch.no_grad():
        for i, batch in enumerate(val_loader):
            X, y = batch
            X, y = X.to(device), y.to(device)

            # send validation data through the network
            theta = model(X)
            loss = loss_fn(theta, y)
            acc, Yhat = multi_acc(theta, y)
            val_loss += loss.item() / len(val_loader)
            val_acc += acc.item() / len(val_loader)

    # Save training stats:
    loss_stats['train'].append(train_loss)
    loss_stats['val'].append(val_loss)
    accuracy_stats['train'].append(train_acc)
    accuracy_stats['val'].append(val_acc)

    # print statistics
    print(f'Epoch {epoch+1}: Train Loss: {train_loss:.3f} | Train Acc.: {train_acc:.4f} | Val Loss: {val_loss:.3f} | Val Acc.: {val_acc:.4f}')

print('Finished Training')

In [None]:
# now that training is done, let's visualize the training and validation loss
# all of that information is readily available in the "training logs"

import matplotlib.pyplot as plt

# summarize history for loss: one curve per data split on a single set of axes
fig, ax = plt.subplots()
for split in ('train', 'val'):
    ax.plot(loss_stats[split])
ax.set_title('model loss')
ax.set_ylabel('loss')
ax.set_xlabel('epoch')
ax.legend(['train', 'validation'], loc='upper right')
plt.show()

In [None]:
# Questions:
# What are some training peculiarities you notice after training this simple (bad) CNN?
# A: 
# List a couple ways you could remedy some of these peculiarities.
# A: modify epoch loss, add dropout, modify the learning rate, add in early stopping criterion, etc

# Next, keeping in mind your answers above and some of the other training techniques we have learned so far,
# improve the performance of the above CNN

# after training a good CNN, do the usual visualization of the training and validation loss across epochs

# then inspect the new model's accuracy on the validation set and the confusion matrix on the validation set

# If you do everything right and design a good CNN, you should be able to train a model that achieves
# over 70% accuracy on the validation set

# If you do everything perfectly and design an outstanding CNN, you will be able to train a model that achieves
# 90% accuracy on the validation set.

# When you are done, analyze the model's performance on the test set, 
# and create a post on our discord sharing your model's test-set accuracy
# and confusion matrix

# you can likely re-use much of your visualization code from the previous CNN notebook