In [1]:
import torch
from torch.utils.data import Dataset

import torchaudio
import torchaudio.transforms

import sys, os

from pprint import pprint

from tqdm import tqdm

import json

import numpy as np

import matplotlib.pylab as plt
import seaborn as sns

import librosa
import librosa.display

import pandas as pd

from pathlib import Path

import gc

# Seed for the torch.Generator that drives the train/val/test split and is
# passed to the DataLoaders further down in this notebook.
MANUAL_SEED = 69

import torch.nn as nn
import torch.nn.functional as F

In [2]:
# Shell command: enables the `widgetsnbextension` notebook extension through the Jupyter CLI.
!jupyter nbextension enable --py widgetsnbextension

Enabling notebook extension jupyter-js-widgets/extension...
      - Validating: [32mOK[0m


In [3]:
class FMADataset(Dataset):
  """In-memory dataset of audio clips stored as ``<path>/<genre>/<file>``.

  Every file found below ``path`` is decoded once, in ``__init__``, and kept in
  a pandas DataFrame (``self.data``) with one row per clip and the columns
  ``waveform``, ``og_sample_rate``, ``label_one_hot``, ``label``, ``path`` and
  ``hop_length``.

  Args:
    path: Root folder that is walked recursively for audio files.
    normalize_audio: Forwarded to ``torchaudio.load`` as ``normalize``.
    audio_num_frames: Forwarded to ``torchaudio.load`` as ``num_frames``.
  """

  # Genre names in one-hot order: position i of a label vector is GENRES[i].
  GENRES = ("Pop", "Hip-Hop", "Electronic", "Rock", "Folk", "Jazz")

  def __init__(self, path, normalize_audio, audio_num_frames):
    self.path = path
    self.normalize_audio = normalize_audio
    self.audio_num_frames = audio_num_frames

    self.data = self.load_raw_audio_data()

  def __len__(self):
    """Number of clips that were loaded successfully."""
    return len(self.data.index)

  def __getitem__(self, idx):
    """Return the sample at integer position ``idx`` as a plain ``dict``.

    A pandas Series is neither a Mapping nor a Sequence, so the default
    DataLoader collate function cannot batch it; a dict keeps the same
    ``sample["key"]`` access and can be collated. Non-scalar indexers
    (slices, lists) still return the DataFrame selection unchanged.
    """
    row = self.data.iloc[idx]
    return row.to_dict() if isinstance(row, pd.Series) else row

  def label_from_str_to_one_hot(self, label_str: str):
    """Map a genre name to its 6-element int64 one-hot tensor.

    Returns:
      The one-hot tensor, or ``None`` when ``label_str`` is not in ``GENRES``.
    """
    if label_str not in self.GENRES:
      return None

    one_hot = torch.zeros(len(self.GENRES), dtype=torch.int64)
    one_hot[self.GENRES.index(label_str)] = 1
    return one_hot

  def load_raw_audio_data(self):
    """Walk ``self.path`` and decode every audio file that sits in a genre folder.

    Best-effort: files that cannot be decoded, and files whose parent folder is
    not one of ``GENRES``, are skipped. A one-line summary of the skipped files
    is printed at the end so the loss of data is visible.

    Returns:
      pandas.DataFrame with one row per successfully loaded clip.
    """
    data_list = []

    num_audio_files_unable_to_open = 0
    num_files_with_unknown_label = 0

    for path, _subdirs, files in os.walk(self.path):
      for name in tqdm(files, colour="magenta"):

        file_audio_path = os.path.join(path, name)

        # The name of the containing folder is the genre label; os.path keeps
        # this independent of the platform's path separator.
        label = os.path.basename(os.path.dirname(file_audio_path))
        label_one_hot = self.label_from_str_to_one_hot(label)

        if label_one_hot is None:
          # A row without a label vector cannot be batched or counted later on.
          num_files_with_unknown_label += 1
          continue

        try:
          waveform, sample_rate = torchaudio.load(
            file_audio_path, normalize=self.normalize_audio,
            num_frames=self.audio_num_frames
          )
        except Exception:
          # Not a bare `except`: KeyboardInterrupt must still be able to stop
          # a long-running load.
          num_audio_files_unable_to_open += 1
          continue

        data_list.append(
          {
            "waveform": waveform,
            "og_sample_rate": sample_rate,
            "label_one_hot": label_one_hot,
            "label": label,
            "path": file_audio_path,
            "hop_length": -1
          }
        )

    if num_audio_files_unable_to_open or num_files_with_unknown_label:
      print(
        f"[load_raw_audio_data] skipped {num_audio_files_unable_to_open} unreadable file(s) "
        f"and {num_files_with_unknown_label} file(s) outside a known genre folder "
        f"under {self.path}"
      )

    return pd.DataFrame(data_list)

In [4]:
# --- audio decoding options (handed to FMADataset) ---
NORMALIZE_AUDIO = True
AUDIO_NUM_FRAMES = 238_000

# --- dataset location ---
DATASET_FOLDER = "./data/audio"
# DATASET_FOLDER = "/mnt/ramdisk"
DATASET_SIZE = "small"
DATASET_NAME = f"fma_{DATASET_SIZE}_organized_by_label_resampled_rechanneled"

dataset_path = f"{DATASET_FOLDER}/{DATASET_NAME}"

In [5]:
# Decode the whole dataset into memory once; a progress bar is printed per folder.
fma_dataset = FMADataset(dataset_path, NORMALIZE_AUDIO, AUDIO_NUM_FRAMES)

0it [00:00, ?it/s]
100%|[35m██████████[0m| 499/499 [00:04<00:00, 103.29it/s]
100%|[35m██████████[0m| 499/499 [00:05<00:00, 98.66it/s] 
100%|[35m██████████[0m| 499/499 [00:04<00:00, 101.30it/s]
100%|[35m██████████[0m| 499/499 [00:05<00:00, 98.34it/s] 
100%|[35m██████████[0m| 499/499 [00:05<00:00, 93.75it/s] 
100%|[35m██████████[0m| 499/499 [00:05<00:00, 98.52it/s] 


In [6]:
# Train / validation fractions. Both sizes are rounded down and the test split
# takes whatever remains, so the three sizes always add up to the full dataset.
TRAIN_PERCENTAGE = 0.7
VAL_PERCENTAGE = 0.2
# TEST_PERCENTAGE = 0.1

full_size = len(fma_dataset)
train_size = int(full_size * TRAIN_PERCENTAGE)
val_size = int(full_size * VAL_PERCENTAGE)
test_size = full_size - (train_size + val_size)

In [7]:
# Seeded generator so the split is the same on every run.
generator = torch.Generator()
generator.manual_seed(MANUAL_SEED)

fma_dataset_train, fma_dataset_val, fma_dataset_test = torch.utils.data.random_split(
  dataset=fma_dataset,
  lengths=[train_size, val_size, test_size],
  generator=generator,
)

In [8]:
# Sizes of the train / validation / test splits, one per line.
print(len(fma_dataset_train), len(fma_dataset_val), len(fma_dataset_test), sep="\n")

2095
598
300


In [9]:
def get_dataset_statistics(dataset):
  """Count how many samples of each genre ``dataset`` contains.

  Args:
    dataset: Iterable of samples that support ``sample["label"]``.

  Returns:
    dict with ``class_counts`` (genre name -> count, zero for absent genres)
    and the placeholders ``train_mean`` / ``train_std``, both fixed at -1.

  Raises:
    KeyError: if a sample's label is not one of the six known genres.
  """
  genres = ("Electronic", "Pop", "Rock", "Jazz", "Hip-Hop", "Folk")
  class_counts = dict.fromkeys(genres, 0)

  for sample in dataset:
    genre = sample["label"]
    class_counts[genre] = class_counts[genre] + 1

  return dict(class_counts=class_counts, train_mean=-1, train_std=-1)

In [10]:
# Per-genre counts for each split, computed in train / val / test order.
train_statistics, val_statistics, test_statistics = map(
  get_dataset_statistics,
  (fma_dataset_train, fma_dataset_val, fma_dataset_test),
)

In [11]:
# print("train statistics: ")
# pprint(train_statistics)

# print("val statistics: ")
# pprint(val_statistics)

# print("test statistics: ")
# pprint(test_statistics)

In [12]:
# DataLoader settings: samples per batch and number of loader worker processes.
BATCH_SIZE, NUM_WORKERS = 32, 16

In [13]:
# Training loader: shuffle=True so the samples are re-drawn in a new order every
# epoch. Without it the DataLoader iterates sequentially and `generator` is not
# used for sampling at all; with it the seeded generator drives the shuffling,
# which keeps runs reproducible.
fma_dataloader_train = torch.utils.data.DataLoader(
  fma_dataset_train, batch_size=BATCH_SIZE, shuffle=True,
  num_workers=NUM_WORKERS, generator=generator
)

# Validation and test loaders keep a fixed, sequential order.
fma_dataloader_val = torch.utils.data.DataLoader(
  fma_dataset_val, batch_size=BATCH_SIZE, num_workers=NUM_WORKERS,
  generator=generator
)
fma_dataloader_test = torch.utils.data.DataLoader(
  fma_dataset_test, batch_size=BATCH_SIZE, num_workers=NUM_WORKERS,
  generator=generator
)

## Attempt 1

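First layers: dimensionality reduction, so that the signal entering the rest of the network roughly matches the input dimensions used in the sample-level CNN paper (Lee et al., 2017): https://arxiv.org/pdf/1703.01789.pdf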

Mid and final layers: taken 1:1 from the paper linked above.

Batch normalization is placed BEFORE the activation function, as described in the original paper (Ioffe & Szegedy, 2015: https://arxiv.org/abs/1502.03167) and explained in section 8.7.1 of the Deep Learning book by Goodfellow, Bengio and Courville (https://www.deeplearningbook.org/contents/optimization.html).

Dropout is placed according to the original dropout paper (Hinton et al., 2012): https://arxiv.org/pdf/1207.0580.pdf

In [46]:
class CNN_Attempt_1(nn.Module):
  """1-D convolutional network over raw waveforms (first attempt).

  ``conv1`` is a strided convolution that shrinks the time axis. It is followed
  by a stack of kernel-size-3 convolutions (``conv3`` .. ``conv12``; all but
  ``conv3`` are followed by a max-pool of size 3) and a final 1x1 convolution
  (``conv13``). There is no ``conv2``: the numbering keeps the gap left by a
  second reduction layer that is not part of the model.

  Every convolution owns its own BatchNorm1d (``bn<i>`` belongs to ``conv<i>``).
  A BatchNorm module holds learnable scale/shift parameters and running
  statistics, so reusing one instance after several different layers would
  force those layers to share all of them.

  Args:
    dropout_p: Drop probability of the dropout applied to the final output.
    verbose: When True (the default, which keeps the shape trace this notebook
      prints) ``forward`` prints the tensor shape at several points. Pass False
      to silence the trace, e.g. while training.

  Shape:
    Input ``(N, 1, L)``, output ``(N, 512, L_out)``. For ``L = 238000`` the
    trace recorded in this notebook ends at ``(N, 512, 1)``.
  """

  def __init__(self, dropout_p, verbose=True):
    super().__init__()

    self.dropout_p = dropout_p
    self.verbose = verbose

    # First layer: dimensionality reduction (see the markdown cell above).
    self.conv1 = nn.Conv1d(
      in_channels=1, out_channels=128, kernel_size=6, stride=4
    )
    self.bn1 = nn.BatchNorm1d(num_features=128)

    # Mid and final layers (see the markdown cell above).
    self.conv3 = nn.Conv1d(
      in_channels=128, out_channels=128, kernel_size=3, stride=3
    )
    self.bn3 = nn.BatchNorm1d(num_features=128)

    self.conv4 = nn.Conv1d(
      in_channels=128, out_channels=128, kernel_size=3, stride=1
    )
    self.pool4 = nn.MaxPool1d(kernel_size=3, stride=3)
    self.bn4 = nn.BatchNorm1d(num_features=128)

    self.conv5 = nn.Conv1d(
      in_channels=128, out_channels=128, kernel_size=3, stride=1
    )
    self.pool5 = nn.MaxPool1d(kernel_size=3, stride=3)
    self.bn5 = nn.BatchNorm1d(num_features=128)

    self.conv6 = nn.Conv1d(
      in_channels=128, out_channels=256, kernel_size=3, stride=1
    )
    self.pool6 = nn.MaxPool1d(kernel_size=3, stride=3)
    self.bn6 = nn.BatchNorm1d(num_features=256)

    self.conv7 = nn.Conv1d(
      in_channels=256, out_channels=256, kernel_size=3, stride=1
    )
    self.pool7 = nn.MaxPool1d(kernel_size=3, stride=3)
    self.bn7 = nn.BatchNorm1d(num_features=256)

    # From here on the convolutions are padded, so only the pooling shortens
    # the time axis.
    self.conv8 = nn.Conv1d(
      in_channels=256, out_channels=256, kernel_size=3, stride=1, padding=1
    )
    self.pool8 = nn.MaxPool1d(kernel_size=3, stride=3)
    self.bn8 = nn.BatchNorm1d(num_features=256)

    self.conv9 = nn.Conv1d(
      in_channels=256, out_channels=256, kernel_size=3, stride=1, padding=1
    )
    self.pool9 = nn.MaxPool1d(kernel_size=3, stride=3)
    self.bn9 = nn.BatchNorm1d(num_features=256)

    self.conv10 = nn.Conv1d(
      in_channels=256, out_channels=256, kernel_size=3, stride=1, padding=1
    )
    self.pool10 = nn.MaxPool1d(kernel_size=3, stride=3)
    self.bn10 = nn.BatchNorm1d(num_features=256)

    self.conv11 = nn.Conv1d(
      in_channels=256, out_channels=256, kernel_size=3, stride=1, padding=1
    )
    self.pool11 = nn.MaxPool1d(kernel_size=3, stride=3)
    self.bn11 = nn.BatchNorm1d(num_features=256)

    self.conv12 = nn.Conv1d(
      in_channels=256, out_channels=512, kernel_size=3, stride=1, padding=1
    )
    self.pool12 = nn.MaxPool1d(kernel_size=3, stride=3)
    self.bn12 = nn.BatchNorm1d(num_features=512)

    self.conv13 = nn.Conv1d(
      in_channels=512, out_channels=512, kernel_size=1, stride=1
    )
    self.bn13 = nn.BatchNorm1d(num_features=512)

    self.dropout = nn.Dropout(p=self.dropout_p)

    # Stateless activations, shared by all layers.
    self.relu = nn.ReLU()
    self.sigmoid = nn.Sigmoid()

  def _trace(self, label, x):
    """Print ``label`` and the shape of ``x`` when shape tracing is enabled."""
    if self.verbose:
      print(label, x.shape)

  def forward(self, x):
    """Run the network; per block the order is conv -> (pool) -> batch norm -> activation."""

    # First layer, see the markdown cell above.
    x = self.relu(self.bn1(self.conv1(x)))
    self._trace("-2. x.shape", x)

    # Mid and final layers, see the markdown cell above.
    x = self.relu(self.bn3(self.conv3(x)))
    self._trace("3. x.shape", x)

    x = self.relu(self.bn4(self.pool4(self.conv4(x))))
    self._trace("4. x.shape", x)

    x = self.relu(self.bn5(self.pool5(self.conv5(x))))
    self._trace("5. x.shape", x)

    x = self.relu(self.bn6(self.pool6(self.conv6(x))))
    self._trace("6. x.shape", x)

    x = self.relu(self.bn7(self.pool7(self.conv7(x))))
    self._trace("7. x.shape", x)

    x = self.conv8(x)
    self._trace("8_conv. x.shape", x)
    x = self.relu(self.bn8(self.pool8(x)))

    x = self.conv9(x)
    self._trace("9_conv. x.shape", x)
    x = self.relu(self.bn9(self.pool9(x)))

    x = self.conv10(x)
    self._trace("10_conv. x.shape: ", x)
    x = self.relu(self.bn10(self.pool10(x)))

    x = self.conv11(x)
    self._trace("11_conv. x.shape: ", x)
    x = self.pool11(x)
    self._trace("11_pool. x.shape: ", x)
    x = self.relu(self.bn11(x))

    x = self.conv12(x)
    self._trace("12_conv. x.shape: ", x)
    x = self.relu(self.bn12(self.pool12(x)))

    # Output block: sigmoid instead of ReLU, followed by dropout.
    x = self.conv13(x)
    self._trace("13_conv. x.shape: ", x)
    x = self.dropout(self.sigmoid(self.bn13(x)))

    return x

In [47]:
cnn_attempt_1 = CNN_Attempt_1(dropout_p=0.1)

# Random stand-in batch: 16 single-channel clips, each AUDIO_NUM_FRAMES samples long
# (the same length the dataset loader requests per file).
x = torch.rand((16, 1, AUDIO_NUM_FRAMES))

# This is only a shape check, so run it without autograd: otherwise every
# intermediate activation of this large batch stays alive for as long as
# `x_out` exists, waiting for a backward pass that never happens.
with torch.no_grad():
  x_out = cnn_attempt_1(x)

-2. x.shape torch.Size([16, 128, 59499])
3. x.shape torch.Size([16, 128, 19833])
4. x.shape torch.Size([16, 128, 6610])
5. x.shape torch.Size([16, 128, 2202])
6. x.shape torch.Size([16, 256, 733])
7. x.shape torch.Size([16, 256, 243])
8_conv. x.shape torch.Size([16, 256, 243])
9_conv. x.shape torch.Size([16, 256, 81])
10_conv. x.shape:  torch.Size([16, 256, 27])
11_conv. x.shape:  torch.Size([16, 256, 9])
11_pool. x.shape:  torch.Size([16, 256, 3])
12_conv. x.shape:  torch.Size([16, 512, 3])
13_conv. x.shape:  torch.Size([16, 512, 1])


In [None]:
# Display the input tensor `x`.
# NOTE(review): the output recorded below is 2-D, unlike the (16, 1, 238000)
# batch built in the shape-check cell, so it appears to come from an earlier
# `x` — re-run to refresh.
x

tensor([[0.9619, 0.9511, 0.1388,  ..., 0.8091, 0.4585, 0.6079]])

In [29]:
def calculate_output_length(length_in, kernel_size, stride=1, padding=0, dilation=1):
  """Length of the output of a 1-D convolution or pooling layer.

  Args:
    length_in: Length of the input along the convolved axis.
    kernel_size: Size of the kernel.
    stride: Step between two kernel positions.
    padding: Padding added to both ends of the input.
    dilation: Spacing between kernel elements.

  Returns:
    floor((length_in + 2*padding - dilation*(kernel_size - 1) - 1) / stride) + 1
  """
  padded_length = length_in + 2 * padding
  kernel_span = dilation * (kernel_size - 1) + 1
  return (padded_length - kernel_span) // stride + 1

In [76]:
# Output length for a 238000-sample input with kernel size 6 and stride 6.
conv1_out = calculate_output_length(length_in=238000, kernel_size=6, stride=6)
conv1_out

39666

In [78]:
# Length after a further layer with kernel size 3 and stride 2.
calculate_output_length(length_in=conv1_out, kernel_size=3, stride=2)

19832

In [31]:
# A length of 243 passed through kernel size 3 / stride 3 (the pooling setting used in the model).
conv3_out = calculate_output_length(length_in=243, kernel_size=3, stride=3)
conv3_out

81