To create the data for this classifier:

1. Download the Musicnet library
2. Put the folder titled "musicnet" in the music-translation folder
3. Create a folder titled "music_classification_data" in the "musicnet" folder
4. Create two folders in "music_classification_data" titled "test" and "train"
5. In the "test" and "train" folders, put folders with the wav files in each such that the folders are titled with the labels for the wav files (e.g. "Beethoven_Accompanied_Violin")

Import libraries:

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
from torch.utils.data import Dataset
import librosa
import numpy as np

from pathlib import Path

Check current device:

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cpu


Puts data and labels into 2d arrays, splits into train and test:

Tutorial on pathlib libary: https://realpython.com/python-pathlib/

In [26]:
dataset = Path.cwd().parent.joinpath("musicnet", "music_classification_data")

train = dataset.joinpath("train")
test = dataset.joinpath("test")

train_labels = [p.stem for p in train.iterdir()]
test_labels = [p.stem for p in test.iterdir()]

train_labels.remove(".DS_Store")
test_labels.remove(".DS_Store")

print("train labels:", train_labels, "\n")
print("test labels:", test_labels, "\n")

train_wav = []
test_wav = []

for label in train_labels:
    train_wav.append([wav for wav in train.joinpath(label).iterdir() if wav.name != ".DS_Store"])
    
for label in test_labels:
    test_wav.append([wav for wav in test.joinpath(label).iterdir() if wav.name != ".DS_Store"])
    
print(len(train_wav), len(train_wav[0]))
print(len(test_wav), len(test_wav[0]))

FileNotFoundError: [WinError 3] The system cannot find the path specified: 'C:\\Users\\aliwa\\Desktop\\orchestrator\\music-translation\\musicnet\\music_classification_data\\train'

Puts data and labels into 1d arrays:

In [25]:
train_y = []
test_y = []

unprocessed_train_x = []
unprocessed_test_x = []

for i in range(len(train_labels)):
    for j in range(len(train_wav[i])):
        train_y.append(train_labels[i])
        
for i in range(len(test_labels)):
    for j in range(len(test_wav[i])):
        test_y.append(test_labels[i])
        
for arr in train_wav:
    unprocessed_train_x.extend(arr)
    
for arr in test_wav:
    unprocessed_test_x.extend(arr)
        
print(len(train_y), len(unprocessed_train_x))
print(len(test_y), len(unprocessed_test_x))

NameError: name 'train_labels' is not defined

Processed data, uses librosa to turn wav file into a tensor. Takes first 160,000 samples (~4s), and samples every 5 to get processed audio tensor.

In [6]:
train_x = []
test_x = []

train_progress_counter = 0
test_progress_counter = 0

print("Processing train wav files:")

for path in unprocessed_train_x:
    data, rate = librosa.load(path, sr=16000)
    assert rate == 16000
    sample_tensor = torch.tensor(data).float()
    short_tensor = sample_tensor[:160000]
    downsampled_tensor = short_tensor[::5]
    print(sample_tensor.size())
    train_progress_counter += 1
    if train_progress_counter % 10 == 0:
        print(train_progress_counter, "/", len(unprocessed_train_x), "processed")
    train_x.append(downsampled_tensor)
    
print("\n", "Processing test wav files:")
    
for path in unprocessed_test_x:
    data, rate = librosa.load(path, sr=16000)
    assert rate == 16000
    sample_tensor = torch.tensor(data).float()
    short_tensor = sample_tensor[:160000]
    downsampled_tensor = short_tensor[::5]
    print(sample_tensor.size())
    test_progress_counter += 1
    if test_progress_counter % 10 == 0:
        print(test_progress_counter, "/", len(unprocessed_test_x), "processed")
    test_x.append(downsampled_tensor)

Processing train wav files:
torch.Size([8640471])
1 / 153 processed
torch.Size([8112170])
2 / 153 processed
torch.Size([14573819])
3 / 153 processed
torch.Size([4891795])
4 / 153 processed
torch.Size([5874835])
5 / 153 processed
torch.Size([4955743])
6 / 153 processed
torch.Size([7830884])
7 / 153 processed
torch.Size([4283664])
8 / 153 processed
torch.Size([6760072])
9 / 153 processed
torch.Size([6552347])
10 / 153 processed
torch.Size([3165205])
11 / 153 processed
torch.Size([8514665])
12 / 153 processed
torch.Size([3076598])
13 / 153 processed
torch.Size([7374054])
14 / 153 processed
torch.Size([9968745])
15 / 153 processed
torch.Size([3787129])
16 / 153 processed
torch.Size([1816451])
17 / 153 processed
torch.Size([1717395])
18 / 153 processed
torch.Size([1569437])
19 / 153 processed
torch.Size([3913352])
20 / 153 processed
torch.Size([1862845])
21 / 153 processed
torch.Size([2148311])
22 / 153 processed
torch.Size([1920523])
23 / 153 processed
torch.Size([3795488])
24 / 153 proces

torch.Size([8115932])
47 / 50 processed
torch.Size([4798590])
48 / 50 processed
torch.Size([3920040])
49 / 50 processed
torch.Size([5468996])
50 / 50 processed


Save train_x and test_x to .pt files

In [7]:
torch.save(train_x, "train_x.pt")
torch.save(test_x, "test_x.pt")

Retreive train_x and test_x from saved .pt files

In [9]:
train_x = torch.load("train_x.pt")
test_x = torch.load("test_x.pt")

print(train_x[0])
print(test_x[0])



torch.Size([32000])
torch.Size([32000])


In [47]:

# Make train_x so that it has labels + data, same w test_x


# Skipping the batching for now

# kwargs = {'num_workers': 1, 'pin_memory': True} if device == 'cuda' else {} #needed for using datasets on gpu
# train_loader = torch.utils.data.DataLoader(train_x, batch_size = 128, shuffle = True, **kwargs)
# test_loader = torch.utils.data.DataLoader(test_x, batch_size = 128, shuffle = True, **kwargs)

CNN modeled after the M5 network architecture described in https://arxiv.org/pdf/1610.00087.pdf

In [48]:
class CNN(nn.Module):
    def __init__(self):
        super(CNN, self).__init__()
        self.conv1 = nn.Conv1d(1, 128, 80, 4)
        self.bn1 = nn.BatchNorm1d(128)
        self.pool1 = nn.MaxPool1d(4)
        self.conv2 = nn.Conv1d(128, 128, 3)
        self.bn2 = nn.BatchNorm1d(128)
        self.pool2 = nn.MaxPool1d(4)
        self.conv3 = nn.Conv1d(128, 256, 3)
        self.bn3 = nn.BatchNorm1d(256)
        self.pool3 = nn.MaxPool1d(4)
        self.conv4 = nn.Conv1d(256, 512, 3)
        self.bn4 = nn.BatchNorm1d(512)
        self.pool4 = nn.MaxPool1d(4)
        self.avgPool = nn.AvgPool1d(30) #input should be 512x30 so this outputs a 512x1
        self.fc1 = nn.Linear(512, 10)
        
    def forward(self, x):
        x = self.conv1(x)
        x = F.relu(self.bn1(x))
        x = self.pool1(x)
        x = self.conv2(x)
        x = F.relu(self.bn2(x))
        x = self.pool2(x)
        x = self.conv3(x)
        x = F.relu(self.bn3(x))
        x = self.pool3(x)
        x = self.conv4(x)
        x = F.relu(self.bn4(x))
        x = self.pool4(x)
        x = self.avgPool(x)
        x = x.permute(0, 2, 1) #change the 512x1 to 1x512
        x = self.fc1(x)
        return F.log_softmax(x, dim = 2)

model = CNN()
model.to(device)
print(model)

CNN(
  (conv1): Conv1d(1, 128, kernel_size=(80,), stride=(4,))
  (bn1): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (pool1): MaxPool1d(kernel_size=4, stride=4, padding=0, dilation=1, ceil_mode=False)
  (conv2): Conv1d(128, 128, kernel_size=(3,), stride=(1,))
  (bn2): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (pool2): MaxPool1d(kernel_size=4, stride=4, padding=0, dilation=1, ceil_mode=False)
  (conv3): Conv1d(128, 256, kernel_size=(3,), stride=(1,))
  (bn3): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (pool3): MaxPool1d(kernel_size=4, stride=4, padding=0, dilation=1, ceil_mode=False)
  (conv4): Conv1d(256, 512, kernel_size=(3,), stride=(1,))
  (bn4): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (pool4): MaxPool1d(kernel_size=4, stride=4, padding=0, dilation=1, ceil_mode=False)
  (avgPool): AvgPool1d(kernel_size=(30,), stride=(30,), 

Define optimizer (using Adam) and scheduler, which lowers the learning rate from 0.01 to 0.0001 over the course of training

In [49]:
optimizer = optim.Adam(model.parameters(), lr = 0.01, weight_decay = 0.0001)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size = 20, gamma = 0.1)

In [50]:
def train(model, epoch):
    model.train()
    for data, target in train_x:
        # for batch_idx
        optimizer.zero_grad()
        data = data.to(device)
        target = target.to(device)
        data = data.requires_grad_() #set requires_grad to True for training
        output = model(data)
        output = output.permute(1, 0, 2) #original output dimensions are batchSizex1x10 
        loss = F.nll_loss(output[0], target) #the loss functions expects a batchSizex10 input
        loss.backward()
        optimizer.step()
#         if batch_idx % log_interval == 0: #print training stats
#             print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
#                 epoch, batch_idx * len(data), len(train_loader.dataset),
#                 100. * batch_idx / len(train_loader), loss))

In [51]:
def test(model, epoch):
    model.eval()
    correct = 0
    for data, target in test_loader:
        data = data.to(device)
        target = target.to(device)
        output = model(data)
        output = output.permute(1, 0, 2)
        pred = output.max(2)[1] # get the index of the max log-probability
        correct += pred.eq(target).cpu().sum().item()
    print('\nTest set: Accuracy: {}/{} ({:.0f}%)\n'.format(
        correct, len(test_loader.dataset),
        100. * correct / len(test_loader.dataset)))

In [52]:
log_interval = 20
for epoch in range(1, 41):
    if epoch == 31:
        print("First round of training complete. Setting learn rate to 0.001.")
    scheduler.step()
    train(model, epoch)
    test(model, epoch)

ValueError: not enough values to unpack (expected 2, got 1)