In [1]:
import torch
from torch.utils.data import Dataset

import torchaudio
import torchaudio.transforms

import sys, os

from pprint import pprint

from tqdm import tqdm

import json

import numpy as np

import matplotlib.pylab as plt
import seaborn as sns

import librosa
import librosa.display

import pandas as pd

from pathlib import Path

import gc

MANUAL_SEED = 69

import torch.nn as nn
import torch.nn.functional as F

import torch.optim as optim


In [2]:
!jupyter nbextension enable --py widgetsnbextension

Enabling notebook extension jupyter-js-widgets/extension...
      - Validating: [32mOK[0m


In [106]:
class FMADataset(Dataset):

  def __init__(self, path, normalize_audio, audio_num_frames):
    self.path = path
    self.normalize_audio = normalize_audio
    self.audio_num_frames = audio_num_frames
    
    self.data = self.load_raw_audio_data()

  def __len__(self):
    return len(self.data)

  def __getitem__(self, idx):
    return self.data[idx]
  
  def label_from_str_to_one_hot(self, label_str: str): 
  
    if label_str == "Pop":
      return torch.tensor([1, 0, 0, 0, 0, 0]).float()
    
    if label_str == "Hip-Hop":
      return torch.tensor([0, 1, 0, 0, 0, 0]).float()
    
    if label_str == "Electronic":
      return torch.tensor([0, 0, 1, 0, 0, 0]).float()
    
    if label_str == "Rock":
      return torch.tensor([0, 0, 0, 1, 0, 0]).float()

    if label_str == "Folk":
      return torch.tensor([0, 0, 0, 0, 1, 0]).float()

    if label_str == "Jazz":
      return torch.tensor([0, 0, 0, 0, 0, 1]).float()
  

  def load_raw_audio_data(self):
    
    data_list = []
    
    num_audio_files_unable_to_open = 0

    for path, subdirs, files in os.walk(self.path):
      for name in tqdm(files, colour="magenta"):
          
        file_audio_path = os.path.join(path, name)
        # print(file_audio_path)
        
        try:
          waveform, sample_rate = torchaudio.load(
            file_audio_path, normalize=self.normalize_audio,
            num_frames=self.audio_num_frames
          )
          
          label = file_audio_path.split("/")[-2]
          label_one_hot = self.label_from_str_to_one_hot(label)
          
          data_list.append(
            {
              "waveform": waveform, 
              "og_sample_rate": sample_rate,
              "label_one_hot": label_one_hot,
              "label": label,
              "path": file_audio_path,
              "hop_length": -1
            }
          )
          
        except Exception as e:
          print(e)
          # print(f"[load_audio_data] error while loading {file_audio_path}")
          num_audio_files_unable_to_open += 1
          continue
    
    return data_list

In [132]:
DATASET_SIZE = "small"
DATASET_NAME = f"fma_{DATASET_SIZE}_organized_by_label_resampled_rechanneled"
DATASET_FOLDER = "./data/audio"
# DATASET_FOLDER = "/mnt/ramdisk"

dataset_path = f"{DATASET_FOLDER}/{DATASET_NAME}"

NORMALIZE_AUDIO = True
AUDIO_NUM_FRAMES = 238000

In [133]:
fma_dataset = FMADataset(
  path=dataset_path, 
  normalize_audio=NORMALIZE_AUDIO, 
  audio_num_frames=AUDIO_NUM_FRAMES
)

0it [00:00, ?it/s]
 23%|[35m██▎       [0m| 117/499 [00:01<00:03, 115.76it/s]

Failed to open the input "./data/audio/fma_small_organized_by_label_resampled_rechanneled/Pop/023431.mp3" (Invalid argument).


100%|[35m██████████[0m| 499/499 [00:04<00:00, 112.61it/s]
100%|[35m██████████[0m| 499/499 [00:04<00:00, 107.75it/s]
100%|[35m██████████[0m| 499/499 [00:04<00:00, 111.73it/s]
100%|[35m██████████[0m| 499/499 [00:04<00:00, 109.19it/s]
100%|[35m██████████[0m| 499/499 [00:04<00:00, 108.84it/s]
100%|[35m██████████[0m| 499/499 [00:04<00:00, 110.46it/s]


In [134]:
len(fma_dataset)

2993

In [135]:
TRAIN_PERCENTAGE = 0.7
VAL_PERCENTAGE = 0.2
# TEST_PERCENTAGE = 0.1

full_size = len(fma_dataset)
train_size = int(TRAIN_PERCENTAGE * len(fma_dataset))
val_size = int(VAL_PERCENTAGE * len(fma_dataset))
test_size = full_size - train_size - val_size

In [136]:
generator=torch.Generator().manual_seed(MANUAL_SEED)

fma_dataset_train, fma_dataset_val, fma_dataset_test = torch.utils.data.random_split(
  fma_dataset, [train_size, val_size, test_size], generator
)

In [137]:
print(len(fma_dataset_train))
print(len(fma_dataset_val))
print(len(fma_dataset_test))

2095
598
300


In [138]:
def get_dataset_statistics(dataset):
  class_counts = {
    "Electronic": 0,
    "Pop": 0,
    "Rock": 0,
    "Jazz": 0, 
    "Hip-Hop": 0,
    "Folk": 0
  }

  for data_entry in dataset:
    class_counts[data_entry["label"]] += 1
    
  
  return {
    "class_counts": class_counts,
    "train_mean": -1,
    "train_std": -1
  }

In [139]:
train_statistics = get_dataset_statistics(fma_dataset_train)
val_statistics = get_dataset_statistics(fma_dataset_val)
test_statistics = get_dataset_statistics(fma_dataset_test)

In [140]:
# print("train statistics: ")
# pprint(train_statistics)

# print("val statistics: ")
# pprint(val_statistics)

# print("test statistics: ")
# pprint(test_statistics)

In [141]:
BATCH_SIZE = 4
NUM_WORKERS = 16

In [142]:
fma_dataloader_train = torch.utils.data.DataLoader(
  fma_dataset_train, batch_size=BATCH_SIZE, num_workers=NUM_WORKERS, 
  generator=generator
)
fma_dataloader_val = torch.utils.data.DataLoader(
  fma_dataset_val, batch_size=BATCH_SIZE, num_workers=NUM_WORKERS, 
  generator=generator
)
fma_dataloader_test = torch.utils.data.DataLoader(
  fma_dataset_test, batch_size=BATCH_SIZE, num_workers=NUM_WORKERS, 
  generator=generator
)

In [143]:
def calculate_output_length(length_in, kernel_size, stride=1, padding=0, dilation=1):
  return (length_in + 2 * padding - dilation * (kernel_size - 1) - 1) // stride + 1



In [144]:
def count_num_trainable_parameters(model):
  return sum(p.numel() for p in model.parameters() if p.requires_grad)


## Attempt 1

First layers --> dimensionality reduction to roughly match dimensions of this paper https://arxiv.org/pdf/1703.01789.pdf

Mid and final layers --> taken 1:1 from the paper linked above

Batch norm placed BEFORE the activation function, as described in the og paper https://arxiv.org/abs/1502.03167 and explained by Bengio in his DL book https://www.deeplearningbook.org/contents/optimization.html in section 8.7.1

Dropout placed according to the og paper: https://arxiv.org/pdf/1207.0580.pdf

In [145]:
class CNN_Attempt_1(nn.Module):
  def __init__(self, dropout_p):
    super().__init__()
    
    self.dropout_p = dropout_p
    
    # First layers, see cell above for full explanation 
    
    self.conv1 = nn.Conv1d(
      in_channels=1, out_channels=128, kernel_size=6, stride=4
    )
    
    # Mid and final layers, see cell above for full explanation
    
    self.conv3 = nn.Conv1d(
      in_channels=128, out_channels=128, kernel_size=3, stride=3
    ) 
    
    self.conv4 = nn.Conv1d(
      in_channels=128, out_channels=128, kernel_size=3, stride=1
    )
    self.pool4 = nn.MaxPool1d(kernel_size=3, stride=3)
    
    self.conv5 = nn.Conv1d(
      in_channels=128, out_channels=128, kernel_size=3, stride=1
    )
    self.pool5 = nn.MaxPool1d(kernel_size=3, stride=3)    
    
    self.conv6 = nn.Conv1d(
      in_channels=128, out_channels=256, kernel_size=3, stride=1
    )
    self.pool6 = nn.MaxPool1d(kernel_size=3, stride=3)
    
    self.conv7 = nn.Conv1d(
      in_channels=256, out_channels=256, kernel_size=3, stride=1
    )
    self.pool7 = nn.MaxPool1d(kernel_size=3, stride=3)      
    
    self.conv8 = nn.Conv1d(
      in_channels=256, out_channels=256, kernel_size=3, stride=1, padding=1
    )
    self.pool8 = nn.MaxPool1d(kernel_size=3,  stride=3)      
    
    self.conv9 = nn.Conv1d(
      in_channels=256, out_channels=256, kernel_size=3, stride=1, padding=1
    )
    self.pool9 = nn.MaxPool1d(kernel_size=3,  stride=3)      
    
    self.conv10 = nn.Conv1d(
      in_channels=256, out_channels=256, kernel_size=3, stride=1, padding=1
    )
    self.pool10 = nn.MaxPool1d(kernel_size=3,  stride=3)      
    
    self.conv11 = nn.Conv1d(
      in_channels=256, out_channels=256, kernel_size=3, stride=1, padding=1
    )
    self.pool11 = nn.MaxPool1d(kernel_size=3,  stride=3)      

    self.conv12 = nn.Conv1d(
      in_channels=256, out_channels=512, kernel_size=3, stride=1, padding=1
    )
    self.pool12 = nn.MaxPool1d(kernel_size=3,  stride=3)      

    self.conv13 = nn.Conv1d(
      in_channels=512, out_channels=512, kernel_size=1, stride=1
    )
    
    # Classification layer
    
    # Using a conv output layer rather than a fully connected one
    self.conv14 = nn.Conv1d(
      in_channels=512, out_channels=6, kernel_size=1
    )
    
    self.dropout = nn.Dropout(p=self.dropout_p)
    
    self.relu = nn.ReLU()
    self.sigmoid = nn.Sigmoid()
    
    self.bn_16 = nn.BatchNorm1d(num_features=16)       
    self.bn_32 = nn.BatchNorm1d(num_features=32)       
    self.bn_128 = nn.BatchNorm1d(num_features=128)       
    self.bn_256 = nn.BatchNorm1d(num_features=256)       
    self.bn_512 = nn.BatchNorm1d(num_features=512)       
    

  def forward(self, x):
    
    # First layers, see cell above for full explanation 
    
    x = self.conv1(x)
    x = self.bn_128(x)
    x = self.relu(x)
    
    # print("-2. x.shape", x.shape)
    
    # Mid and final layers, see cell above for full explanation 
    
    x = self.conv3(x)
    x = self.bn_128(x)
    x = self.relu(x)
    
    # print("3. x.shape", x.shape)

    x = self.conv4(x)
    x = self.pool4(x)
    x = self.bn_128(x)
    x = self.relu(x)
    
    # print("4. x.shape", x.shape)
    
    x = self.conv5(x)
    x = self.pool5(x)
    x = self.bn_128(x)
    x = self.relu(x)
    
    # print("5. x.shape", x.shape)

    x = self.conv6(x)
    x = self.pool6(x)
    x = self.bn_256(x)
    x = self.relu(x)
    
    # print("6. x.shape", x.shape)
    
    x = self.conv7(x)
    x = self.pool7(x)
    x = self.bn_256(x)
    x = self.relu(x)
    
    # print("7. x.shape", x.shape)
    
    x = self.conv8(x)
    # print("8_conv. x.shape", x.shape)
    x = self.pool8(x)
    # print("7_pool. x.shape", x.shape)
    x = self.bn_256(x)
    x = self.relu(x)
  
    x = self.conv9(x)
    # print("9_conv. x.shape", x.shape)
    x = self.pool9(x)
    # print("8_pool. x.shape", x.shape)
    x = self.bn_256(x)
    x = self.relu(x)
    
    
    x = self.conv10(x)
    # print("10_conv. x.shape: ", x.shape)
    x = self.pool10(x)
    # print("9_pool. x.shape: ", x.shape)
    x = self.bn_256(x)
    x = self.relu(x)
    
    x = self.conv11(x)
    # print("11_conv. x.shape: ", x.shape)
    x = self.pool11(x)
    # print("11_pool. x.shape: ", x.shape)
    x = self.bn_256(x)
    x = self.relu(x)
    
    x = self.conv12(x)
    # print("12_conv. x.shape: ", x.shape)
    x = self.pool12(x)
    # print("11_pool. x.shape: ", x.shape)
    x = self.bn_512(x)
    x = self.relu(x)
    
    x = self.conv13(x)
    # print("13_conv. x.shape: ", x.shape)
    x = self.bn_512(x)
    x = self.relu(x)
    x = self.dropout(x)
    
    # Classification layer
    x = self.conv14(x)
    x = self.sigmoid(x)
    
    return x

In [146]:
cnn_attempt_1 = CNN_Attempt_1(dropout_p=0.5)

x = torch.rand((16, 1, 238000))

x_out = cnn_attempt_1(x)

In [147]:
count_num_trainable_parameters(cnn_attempt_1)

1892966

In [148]:
criterion = nn.CrossEntropyLoss()

In [149]:
LR = 0.001
MOMENTUM = 0.9

optimizer = optim.SGD(cnn_attempt_1.parameters(), lr=LR, momentum=MOMENTUM)

In [150]:
NUM_EPOCHS = 20

In [151]:
model = cnn_attempt_1

In [152]:
for epoch in range(NUM_EPOCHS):  # loop over the dataset multiple times

  running_loss = 0.0
  
  for i, batch in enumerate(fma_dataloader_train):
    
    print("batch: ", i)
    inputs = batch["waveform"]
    print("inputs.shape: ", inputs.shape)
    
    labels = batch["label_one_hot"]

    # zero the parameter gradients
    optimizer.zero_grad()

    outputs = model(inputs)
    outputs = outputs.squeeze(-1)
    loss = criterion(outputs, labels)
    loss.backward()
    optimizer.step()

    # print statistics
    running_loss += loss.item()

batch:  0
inputs.shape:  torch.Size([4, 1, 238000])
batch:  1
inputs.shape:  torch.Size([4, 1, 238000])
batch:  2
inputs.shape:  torch.Size([4, 1, 238000])


RuntimeError: Caught RuntimeError in DataLoader worker process 3.
Original Traceback (most recent call last):
  File "/home/dansolombrino/.local/lib/python3.10/site-packages/torch/utils/data/_utils/worker.py", line 302, in _worker_loop
    data = fetcher.fetch(index)
  File "/home/dansolombrino/.local/lib/python3.10/site-packages/torch/utils/data/_utils/fetch.py", line 52, in fetch
    return self.collate_fn(data)
  File "/home/dansolombrino/.local/lib/python3.10/site-packages/torch/utils/data/_utils/collate.py", line 160, in default_collate
    return elem_type({key: default_collate([d[key] for d in batch]) for key in elem})
  File "/home/dansolombrino/.local/lib/python3.10/site-packages/torch/utils/data/_utils/collate.py", line 160, in <dictcomp>
    return elem_type({key: default_collate([d[key] for d in batch]) for key in elem})
  File "/home/dansolombrino/.local/lib/python3.10/site-packages/torch/utils/data/_utils/collate.py", line 140, in default_collate
    out = elem.new(storage).resize_(len(batch), *list(elem.size()))
RuntimeError: Trying to resize storage that is not resizable
