In [1]:
import torch
from torch.utils.data import Dataset

import torchaudio
import torchaudio.transforms

import sys, os

from pprint import pprint

from tqdm.autonotebook import tqdm

import json

import numpy as np

import matplotlib.pylab as plt
import seaborn as sns

import librosa
import librosa.display

import pandas as pd

from pathlib import Path

import gc

MANUAL_SEED = 69

import torch.nn as nn
import torch.nn.functional as F

import torch.optim as optim

from datetime import date
from datetime import datetime

import os.path
from os import path
  
import json


In [2]:
!jupyter nbextension enable --py widgetsnbextension

Enabling notebook extension jupyter-js-widgets/extension...
      - Validating: [32mOK[0m


In [3]:
class FMADataset(Dataset):
  """Map-style dataset over audio files organised as `<path>/<genre>/<file>`.

  Every file found under `path` becomes one sample. A sample is the pair
  `(waveform, one_hot_label)`: the waveform is loaded with `torchaudio.load`
  (at most `audio_num_frames` frames, zero-padded on the right up to that
  length) and the label is derived from the name of the file's parent folder.
  """

  # Position in this tuple == index of the genre in the one-hot label vector.
  GENRES = ("Pop", "Hip-Hop", "Electronic", "Rock", "Folk", "Jazz")

  def __init__(self, path, normalize_audio, audio_num_frames):
    """
    Args:
      path: root folder that is walked recursively to collect audio files.
      normalize_audio: forwarded as `normalize` to `torchaudio.load`.
      audio_num_frames: number of frames every returned waveform has along
        its second dimension (longer clips are truncated by the loader,
        shorter ones are zero-padded).
    """
    self.path = path
    self.normalize_audio = normalize_audio
    self.audio_num_frames = audio_num_frames

    self.data = self._load_audio_list()

  def __len__(self):
    """Number of audio files collected under `self.path`."""
    return len(self.data)

  def __getitem__(self, idx):
    """Load sample `idx` and return `(waveform, one_hot_label)`.

    Raises:
      RuntimeError: if the audio file cannot be loaded. The original error is
        chained so the offending file is named at the point of failure
        instead of surfacing later as a `None` sample during batching.
      ValueError: if the parent folder name is not a known genre.
    """
    file_path = self.data[idx]

    try:
      # The path is passed positionally because the name of the first
      # parameter differs between torchaudio releases.
      waveform, _ = torchaudio.load(
        file_path,
        normalize=self.normalize_audio,
        num_frames=self.audio_num_frames
      )
    except Exception as e:
      raise RuntimeError(f"Could not load audio file {file_path}") from e

    if waveform.shape[1] < self.audio_num_frames:
      waveform = self._apply_padding(waveform)

    # The genre is the name of the folder that directly contains the file.
    label = os.path.basename(os.path.dirname(file_path))
    label_one_hot = self._label_from_str_to_one_hot(label)

    return waveform, label_one_hot

  def _apply_padding(self, to_pad):
    """Right-pad the last dimension of `to_pad` with zeros up to
    `self.audio_num_frames`."""
    padding_size = self.audio_num_frames - to_pad.shape[1]

    return torch.nn.functional.pad(
      to_pad, (0, padding_size)
    )

  def _label_from_str_to_one_hot(self, label_str: str):
    """Return the float one-hot vector for `label_str`.

    Raises:
      ValueError: if `label_str` is not one of `GENRES`.
    """
    if label_str not in self.GENRES:
      raise ValueError(
        f"Unknown genre label {label_str!r}; expected one of {self.GENRES}"
      )

    one_hot = torch.zeros(len(self.GENRES), dtype=torch.float32)
    one_hot[self.GENRES.index(label_str)] = 1.0

    return one_hot

  def _load_audio_list(self):
    """Collect the path of every file below `self.path`.

    The list is sorted because `os.walk` yields entries in an arbitrary,
    filesystem-dependent order; a stable order keeps index-based splits
    reproducible for a given seed.
    """
    audio_path_list = []

    for dir_path, _subdirs, files in tqdm(os.walk(self.path), colour="magenta"):
      for name in files:
        audio_path_list.append(os.path.join(dir_path, name))

    audio_path_list.sort()

    return audio_path_list
        
        

In [4]:
# Which variant of the dataset to use; the folder name is derived from it.
DATASET_SIZE = "extra_small"
DATASET_NAME = "fma_" + DATASET_SIZE + "_organized_by_label_resampled_rechanneled"
# DATASET_NAME = f"fma_{DATASET_SIZE}_organized_by_label"
DATASET_FOLDER = "./data/audio"
# DATASET_FOLDER = "/mnt/ramdisk"

# Root folder handed to FMADataset.
dataset_path = "/".join((DATASET_FOLDER, DATASET_NAME))

# Checkpoints and training logs are written below this folder.
TRAINING_LOGS_FOLDER = "./logs"

# Audio loading options forwarded to FMADataset.
AUDIO_NUM_FRAMES = 238000
NORMALIZE_AUDIO = True

In [5]:
# Build the full dataset; the constructor walks `dataset_path` to index files.
fma_dataset = FMADataset(dataset_path, NORMALIZE_AUDIO, AUDIO_NUM_FRAMES)

0it [00:00, ?it/s]

In [6]:
# len(fma_dataset)

In [7]:
# Fractions of the dataset used for training and validation; whatever is
# left after rounding both down goes to the test split.
TRAIN_PERCENTAGE = 0.7
VAL_PERCENTAGE = 0.2

full_size = len(fma_dataset)
train_size = int(TRAIN_PERCENTAGE * full_size)
val_size = int(VAL_PERCENTAGE * full_size)
test_size = full_size - (train_size + val_size)

In [8]:
# Seeded generator so the train/val/test partition is the same on every run.
generator = torch.Generator()
generator.manual_seed(MANUAL_SEED)

split_lengths = [train_size, val_size, test_size]

fma_dataset_train, fma_dataset_val, fma_dataset_test = (
  torch.utils.data.random_split(fma_dataset, split_lengths, generator=generator)
)

In [9]:
# print(f"len(fma_dataset_train): {len(fma_dataset_train)}")
# print(f"len(fma_dataset_val)  : {len(fma_dataset_val)}")
# print(f"len(fma_dataset_test) : {len(fma_dataset_test)}")

In [10]:
# Number of worker processes each DataLoader uses to load samples.
NUM_WORKERS = 16
# Number of samples per mini-batch.
BATCH_SIZE = 23

In [11]:
# Options shared by the three loaders.
_dataloader_kwargs = dict(
  batch_size=BATCH_SIZE, num_workers=NUM_WORKERS, generator=generator
)

# Training batches are reshuffled every epoch (driven by the seeded
# generator); without shuffling the model sees the exact same batches in the
# exact same order on every epoch.
fma_dataloader_train = torch.utils.data.DataLoader(
  fma_dataset_train, shuffle=True, **_dataloader_kwargs
)
# Evaluation loaders keep a fixed order.
fma_dataloader_val = torch.utils.data.DataLoader(
  fma_dataset_val, **_dataloader_kwargs
)
fma_dataloader_test = torch.utils.data.DataLoader(
  fma_dataset_test, **_dataloader_kwargs
)

In [12]:
def count_num_trainable_parameters(model):
  """Return the total number of scalar entries over all parameters of
  `model` that have `requires_grad` set."""
  total = 0

  for param in model.parameters():
    if param.requires_grad:
      total += param.numel()

  return total

In [13]:
# Use the GPU when one is available, otherwise fall back to the CPU.
cuda_available = torch.cuda.is_available()

device = torch.device("cuda" if cuda_available else "cpu")
print(device)

device_name = torch.cuda.get_device_name(device) if cuda_available else "cpu"
print(device_name)

cuda
NVIDIA GeForce RTX 2070


In [14]:
def gen_train_id():
  """Return an identifier for a training run built from the current local
  time, formatted as `dd_mm_YYYY_HH_MM_SS`."""
  return f"{datetime.now():%d_%m_%Y_%H_%M_%S}"

In [15]:
def save_dict_to_disk(dict, full_path):
  """Serialise `dict` as JSON into the file `full_path`.

  The parent folder of `full_path` is created when it does not exist yet, so
  saving does not depend on some earlier step (e.g. a checkpoint write)
  having created it.

  Args:
    dict: JSON-serialisable mapping. The parameter name shadows the builtin
      but is kept because callers pass it by keyword.
    full_path: destination file path; an existing file is overwritten.
  """
  parent_dir = os.path.dirname(full_path)

  # An empty parent means "current directory": nothing to create.
  if parent_dir:
    os.makedirs(parent_dir, exist_ok=True)

  with open(full_path, 'w') as fp:
    json.dump(dict, fp)

In [16]:
def make_dir_if_absent(dir_path):
  """Create `dir_path` (including missing parents) unless it already exists.

  An empty `dir_path` is treated as the current directory and is a no-op.
  `exist_ok=True` avoids the check-then-create race of testing for existence
  first. Note that `os.makedirs` raises `FileExistsError` when `dir_path`
  exists but is not a directory.
  """
  if not dir_path:
    return

  os.makedirs(dir_path, exist_ok=True)
    

In [17]:
def store_ckp(
  model, optimizer, ckp_path, train_id, epoch, loss_train, loss_val, loss_test
):
  """Write a training checkpoint to `<ckp_path>/<train_id>_epoch_<epoch>.pth`.

  The checkpoint is a dict holding the epoch number, the model and optimizer
  state dicts and the three loss values. The destination folder is created
  when missing.
  """
  file_name = f"{train_id}_epoch_{epoch}.pth"
  full_path = f"{ckp_path}/{file_name}"

  # Everything before the last "/" is the folder that has to exist.
  target_dir = full_path.rsplit('/', 1)[0]
  make_dir_if_absent(dir_path=target_dir)

  checkpoint = {
    'epoch': epoch,
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
    'loss_train': loss_train,
    'loss_val': loss_val,
    'loss_test': loss_test,
  }

  torch.save(checkpoint, full_path)

In [18]:
def get_num_correct_preds(outputs, labels):
  """Count the rows whose arg-max along dim 1 agrees between `outputs` and
  `labels`.

  Returns:
    A scalar float tensor holding the number of matching rows.
  """
  predicted_classes = outputs.argmax(dim=1)
  target_classes = labels.argmax(dim=1)

  return predicted_classes.eq(target_classes).float().sum()

In [19]:
def _num_batches(dataloader):
  """Return `len(dataloader)`, or None when the loader has no length."""
  try:
    return len(dataloader)
  except TypeError:
    return None


def _run_epoch(model, dataloader, criterion, device, optimizer=None, pbar=None):
  """Run one pass over `dataloader`; train when `optimizer` is given,
  otherwise evaluate without gradients.

  Returns:
    `(loss_sum, accuracy)`: the sum of the per-batch loss values and the
    fraction of correctly classified samples (0.0 for an empty loader).
  """
  is_training = optimizer is not None
  model.train(is_training)

  loss_sum = 0.0
  num_correct = 0.0
  num_preds = 0

  with torch.set_grad_enabled(is_training):
    for inputs, labels in dataloader:
      inputs, labels = inputs.to(device), labels.to(device)

      if is_training:
        optimizer.zero_grad()

      # Drop a trailing singleton dimension from the model output, if any,
      # before it is compared with the labels.
      outputs = model(inputs).squeeze(-1)
      loss = criterion(outputs, labels)

      if is_training:
        loss.backward()
        optimizer.step()

      loss_sum += loss.item()
      num_correct += get_num_correct_preds(outputs, labels).item()
      num_preds += outputs.shape[0]

      if pbar is not None:
        pbar.update(1)

  accuracy = num_correct / num_preds if num_preds > 0 else 0.0

  return loss_sum, accuracy


def train_model(
  model, optimizer, criterion,
  batch_size, train_dl, val_dl, test_dl, num_epochs, device, 
  print_freq, ckp_freq, ckp_folder
):
  """Train `model`, validating after every epoch and testing after the last.

  Args:
    model, optimizer, criterion: the network, its optimizer and the loss.
    batch_size: only recorded in the returned logs.
    train_dl, val_dl, test_dl: loaders yielding `(inputs, labels)` batches.
    num_epochs: number of epochs to run.
    device: device the model and every batch are moved to.
    print_freq: a summary is printed every `print_freq` epochs.
    ckp_freq: a checkpoint is stored every `ckp_freq` epochs; None disables
      checkpointing.
    ckp_folder: checkpoints go to `<ckp_folder>/<train_id>/`.

  Returns:
    A dict with `train_id`, `batch_size` and one entry per epoch (keyed by
    the epoch index as a string) holding `accuracy_train` and `accuracy_val`;
    the entry of the last epoch additionally holds `accuracy_test`.

  Reported losses are sums of per-batch loss values. For epochs in which the
  test step does not run, the test loss keeps the sentinel value -1.0.
  """
  train_id = gen_train_id()

  training_logs = {
    "train_id": train_id,
    "batch_size": batch_size
  }

  model = model.to(device)

  # The bars are driven manually with a known total. Wrapping
  # `iter(dataloader)` would start a loader iterator (and its workers) that
  # is never consumed.
  pbar_epochs = tqdm(total=num_epochs, colour="#9400d3")
  pbar_batches_train = tqdm(
    total=_num_batches(train_dl), colour="#4169e1", leave=False,
  )
  pbar_batches_val = tqdm(
    total=_num_batches(val_dl), colour="#008080", leave=False,
  )

  try:
    for epoch in range(num_epochs):

      is_last_epoch = (epoch + 1 == num_epochs)

      pbar_batches_train.reset()
      pbar_batches_val.reset()

      pbar_epochs.set_description(f"epoch {epoch}")
      pbar_batches_train.set_description(f"epoch {epoch}")
      pbar_batches_val.set_description  (f"epoch {epoch}")

      running_loss_train, accuracy_train = _run_epoch(
        model, train_dl, criterion, device,
        optimizer=optimizer, pbar=pbar_batches_train
      )

      running_loss_val, accuracy_val = _run_epoch(
        model, val_dl, criterion, device, pbar=pbar_batches_val
      )

      # Sentinel values for epochs in which the test set is not evaluated.
      running_loss_test = -1.0
      accuracy_test = 0.0

      if is_last_epoch:
        pbar_batches_test = tqdm(
          total=_num_batches(test_dl), colour="#808000", leave=False,
        )
        pbar_batches_test.set_description  (f"epoch {epoch}")

        try:
          running_loss_test, accuracy_test = _run_epoch(
            model, test_dl, criterion, device, pbar=pbar_batches_test
          )
        finally:
          pbar_batches_test.close()

      epoch_logs = {
        "accuracy_train": accuracy_train,
        "accuracy_val": accuracy_val,
      }

      # Added to the same entry so train/val accuracy are not overwritten.
      if is_last_epoch:
        epoch_logs["accuracy_test"] = accuracy_test

      training_logs[str(epoch)] = epoch_logs

      pbar_epochs.update(1)

      if ((epoch + 1) % print_freq == 0):  
        tqdm.write(
          f"epoch: {epoch + 1}\n" + 
          f"      train loss: {running_loss_train}, train acc: {accuracy_train}\n" + 
          f"      val loss  : {running_loss_val}, val acc  : {accuracy_val}\n"
        )

      if is_last_epoch:
        tqdm.write(
          f"      test loss : {running_loss_test}, test acc : {accuracy_test}"
        )

      if (ckp_freq is not None and (epoch + 1) % ckp_freq == 0):

        ckp_path = f"{ckp_folder}/{train_id}"

        store_ckp(
          model=model, optimizer=optimizer, ckp_path=ckp_path, epoch=epoch, 
          train_id=train_id,
          loss_train=running_loss_train, 
          loss_val=running_loss_val, 
          loss_test=running_loss_test
        )
  finally:
    pbar_batches_val.close()
    pbar_batches_train.close()
    pbar_epochs.close()

  return training_logs

## CNN

### Design motivations

First layers: "neural compression" layers that reduce the dimensionality of the raw waveform so that the feature-map sizes roughly match those used in this paper: https://arxiv.org/pdf/1703.01789.pdf

Middle and final layers: taken 1:1 from the paper linked above.

Batch normalization is placed BEFORE the activation function, as described in the original paper (https://arxiv.org/abs/1502.03167) and explained in Section 8.7.1 of the Deep Learning book by Goodfellow, Bengio and Courville (https://www.deeplearningbook.org/contents/optimization.html).

Dropout is placed as in the original paper: https://arxiv.org/pdf/1207.0580.pdf

In [23]:
class CNN(nn.Module):
  """Stack of strided 1-D convolutions, each followed by a ReLU.

  Layer `i` is built from `neural_compression_kernel_sizes[i]`,
  `neural_compression_strides[i]` and `neural_compression_num_filters[i]`;
  the first layer takes `neural_compression_in_channels` input channels and
  every later layer takes the previous layer's number of filters.
  """

  def __init__(
    self, 
    neural_compression_num_layers, 
    neural_compression_kernel_sizes, neural_compression_strides, 
    neural_compression_in_channels, neural_compression_num_filters,
  ):
    super().__init__()

    # A single ReLU module is reused after every convolution.
    self.compression_layers_activation = nn.ReLU()

    # channel_sizes[i] -> channel_sizes[i + 1] are the in/out channels of
    # layer i.
    channel_sizes = [neural_compression_in_channels] + [
      neural_compression_num_filters[layer_idx]
      for layer_idx in range(neural_compression_num_layers)
    ]

    modules = []

    for layer_idx in range(neural_compression_num_layers):
      modules += [
        nn.Conv1d(
          in_channels=channel_sizes[layer_idx],
          out_channels=channel_sizes[layer_idx + 1],
          kernel_size=neural_compression_kernel_sizes[layer_idx],
          stride=neural_compression_strides[layer_idx],
        ),
        self.compression_layers_activation,
      ]

    self.neural_compression_block = nn.Sequential(*modules)

  def forward(self, x):
    """Apply the compression block to `x` and return the result."""
    return self.neural_compression_block(x)

In [24]:
# Hyper-parameters of the compression stack, one list entry per layer.
neural_compression_in_channels = 1
neural_compression_num_filters = [8, 16, 32]
neural_compression_kernel_sizes = [3, 3, 3]
neural_compression_strides = [3, 2, 2]
neural_compression_num_layers = len(neural_compression_num_filters)

In [25]:
# Collect the constructor arguments first, then build the compression network.
cnn_kwargs = {
  "neural_compression_num_layers": neural_compression_num_layers,
  "neural_compression_kernel_sizes": neural_compression_kernel_sizes,
  "neural_compression_strides": neural_compression_strides,
  "neural_compression_in_channels": neural_compression_in_channels,
  "neural_compression_num_filters": neural_compression_num_filters,
}

cnn = CNN(**cnn_kwargs)

In [27]:
# Smoke test: push a random batch through the network and display the output
# shape. The trailing expression is what renders the recorded `torch.Size`
# output; without it the cell shows nothing.
x = torch.rand((16, 1, 238000))

x_out = cnn(x)

x_out.shape

torch.Size([16, 32, 19832])


## Attempt 1

In [None]:
class CNN_Attempt_1(nn.Module):
  """1-D convolutional classifier over raw waveforms with 6 output classes.

  Every hidden layer is `conv -> (max-pool) -> batch norm -> ReLU`. Dropout
  is applied once, before the final 1x1 convolution, which produces 6
  channels that are passed through a sigmoid.

  Each convolution owns its batch-norm module. Sharing one `BatchNorm1d`
  between several layers makes them share the learned scale/shift and mix
  the running statistics of unrelated activations, which distorts evaluation.
  As a consequence the batch-norm keys of the state dict are `bn1`, `bn3`,
  ..., `bn13`; checkpoints saved with the former shared `bn_*` modules do not
  load with `strict=True`.

  Args:
    dropout_p: dropout probability used before the classification layer.
  """

  def __init__(self, dropout_p):
    super().__init__()

    self.dropout_p = dropout_p

    # First layer: strided convolution that shortens the raw waveform.
    self.conv1 = self._conv(1, 128, kernel_size=6, stride=4)
    self.bn1 = nn.BatchNorm1d(num_features=128)

    # Middle and final layers.
    self.conv3 = self._conv(128, 128, kernel_size=3, stride=3)
    self.bn3 = nn.BatchNorm1d(num_features=128)

    self.conv4 = self._conv(128, 128, kernel_size=3, stride=1)
    self.pool4 = nn.MaxPool1d(kernel_size=3, stride=3)
    self.bn4 = nn.BatchNorm1d(num_features=128)

    self.conv5 = self._conv(128, 128, kernel_size=3, stride=1)
    self.pool5 = nn.MaxPool1d(kernel_size=3, stride=3)
    self.bn5 = nn.BatchNorm1d(num_features=128)

    self.conv6 = self._conv(128, 256, kernel_size=3, stride=1)
    self.pool6 = nn.MaxPool1d(kernel_size=3, stride=3)
    self.bn6 = nn.BatchNorm1d(num_features=256)

    self.conv7 = self._conv(256, 256, kernel_size=3, stride=1)
    self.pool7 = nn.MaxPool1d(kernel_size=3, stride=3)
    self.bn7 = nn.BatchNorm1d(num_features=256)

    # From here on the convolutions are padded and keep the length unchanged.
    self.conv8 = self._conv(256, 256, kernel_size=3, stride=1, padding=1)
    self.pool8 = nn.MaxPool1d(kernel_size=3, stride=3)
    self.bn8 = nn.BatchNorm1d(num_features=256)

    self.conv9 = self._conv(256, 256, kernel_size=3, stride=1, padding=1)
    self.pool9 = nn.MaxPool1d(kernel_size=3, stride=3)
    self.bn9 = nn.BatchNorm1d(num_features=256)

    self.conv10 = self._conv(256, 256, kernel_size=3, stride=1, padding=1)
    self.pool10 = nn.MaxPool1d(kernel_size=3, stride=3)
    self.bn10 = nn.BatchNorm1d(num_features=256)

    self.conv11 = self._conv(256, 256, kernel_size=3, stride=1, padding=1)
    self.pool11 = nn.MaxPool1d(kernel_size=3, stride=3)
    self.bn11 = nn.BatchNorm1d(num_features=256)

    self.conv12 = self._conv(256, 512, kernel_size=3, stride=1, padding=1)
    self.pool12 = nn.MaxPool1d(kernel_size=3, stride=3)
    self.bn12 = nn.BatchNorm1d(num_features=512)

    self.conv13 = self._conv(512, 512, kernel_size=1, stride=1)
    self.bn13 = nn.BatchNorm1d(num_features=512)

    # Classification layer: a 1x1 convolution instead of a fully connected
    # layer.
    self.conv14 = self._conv(512, 6, kernel_size=1)

    self.dropout = nn.Dropout(p=self.dropout_p)

    self.relu = nn.ReLU()
    self.sigmoid = nn.Sigmoid()

  @staticmethod
  def _conv(in_channels, out_channels, kernel_size, stride=1, padding=0):
    """Build a Conv1d whose weight is Xavier-uniform initialised."""
    conv = nn.Conv1d(
      in_channels=in_channels, out_channels=out_channels,
      kernel_size=kernel_size, stride=stride, padding=padding
    )
    torch.nn.init.xavier_uniform_(conv.weight)

    return conv

  def _block(self, x, conv, bn, pool=None):
    """Apply `conv -> pool (when given) -> bn -> ReLU` to `x`."""
    x = conv(x)

    if pool is not None:
      x = pool(x)

    x = bn(x)

    return self.relu(x)

  def forward(self, x):
    """Return the sigmoid-activated 6-channel output for the batch `x`."""

    # First layer.
    x = self._block(x, self.conv1, self.bn1)

    # Middle and final layers.
    x = self._block(x, self.conv3, self.bn3)
    x = self._block(x, self.conv4, self.bn4, self.pool4)
    x = self._block(x, self.conv5, self.bn5, self.pool5)
    x = self._block(x, self.conv6, self.bn6, self.pool6)
    x = self._block(x, self.conv7, self.bn7, self.pool7)
    x = self._block(x, self.conv8, self.bn8, self.pool8)
    x = self._block(x, self.conv9, self.bn9, self.pool9)
    x = self._block(x, self.conv10, self.bn10, self.pool10)
    x = self._block(x, self.conv11, self.bn11, self.pool11)
    x = self._block(x, self.conv12, self.bn12, self.pool12)
    x = self._block(x, self.conv13, self.bn13)
    x = self.dropout(x)

    # Classification layer.
    x = self.conv14(x)
    x = self.sigmoid(x)

    return x

In [None]:
# Build the model and push one random batch through it as a smoke test.
cnn_attempt_1 = CNN_Attempt_1(dropout_p=0.5)

x = torch.rand(16, 1, 238000)
x_out = cnn_attempt_1(x)

In [None]:
# Number of trainable parameters of the model, shown as the cell output.
num_trainable_params = count_num_trainable_parameters(cnn_attempt_1)
num_trainable_params

In [None]:
# The model ends in a sigmoid and the labels are float one-hot vectors, so the
# matching loss is binary cross-entropy on probabilities. CrossEntropyLoss
# expects raw logits and applies a log-softmax itself; feeding it sigmoid
# outputs confines the "logits" to (0, 1) and keeps the loss from getting
# close to zero.
criterion = nn.BCELoss()

In [None]:
LR = 0.001
MOMENTUM = 0.9  # only used by the SGD alternative below

# Alternative optimizer, kept for reference:
# optimizer = optim.SGD(
#   cnn_attempt_1.parameters(), 
#   lr=LR, 
#   momentum=MOMENTUM,
#   nesterov=True,
#   weight_decay=1e-6
# )

# LR is passed explicitly so that editing the constant above takes effect.
# 0.001 is also Adam's default, so the current behaviour is unchanged.
optimizer = optim.Adam(
  cnn_attempt_1.parameters(),
  lr=LR
)

In [None]:
# Number of epochs to train for.
NUM_EPOCHS = 2
# Store a checkpoint every CKP_FREQ epochs.
CKP_FREQ = 1
# Print a loss/accuracy summary every PRINT_FREQ epochs.
PRINT_FREQ = 2

In [None]:
# Run the training loop; the returned logs carry the id of this run.
training_logs = train_model(
  model=cnn_attempt_1,
  optimizer=optimizer,
  criterion=criterion,
  batch_size=BATCH_SIZE,
  train_dl=fma_dataloader_train,
  val_dl=fma_dataloader_val,
  test_dl=fma_dataloader_test,
  num_epochs=NUM_EPOCHS,
  device=device,
  print_freq=PRINT_FREQ,
  ckp_freq=CKP_FREQ,
  ckp_folder=TRAINING_LOGS_FOLDER,
)

# Store the logs next to the checkpoints of the same run.
train_id = training_logs["train_id"]
logs_file_path = f"{TRAINING_LOGS_FOLDER}/{train_id}/{train_id}.json"

save_dict_to_disk(dict=training_logs, full_path=logs_file_path)