To create the data for this classifier:

1. Download the Musicnet library
2. Put the folder titled "musicnet" in the music-translation folder
3. Create a folder titled "music_classification_data" in the "musicnet" folder
4. Create two folders in "music_classification_data" titled "test" and "train"
5. In the "test" and "train" folders, put folders with the wav files in each such that the folders are titled with the labels for the wav files (e.g. "Beethoven_Accompanied_Violin")

Import libraries:

In [7]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
from torch.utils.data import Dataset
import librosa
import numpy as np
import random

from pathlib import Path

Check current device:

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cpu


Puts data and labels into 2d arrays, splits into train and test:

Tutorial on pathlib libary: https://realpython.com/python-pathlib/

In [4]:
dataset = Path.cwd().parent.joinpath("musicnet", "music_classification_data")

train = dataset.joinpath("train")
test = dataset.joinpath("test")

train_labels = [p.stem for p in train.iterdir()]
test_labels = [p.stem for p in test.iterdir()]

train_labels.remove(".DS_Store")
test_labels.remove(".DS_Store")

print("train labels:", train_labels, "\n")
print("test labels:", test_labels, "\n")

train_wav = []
test_wav = []

for label in train_labels:
    train_wav.append([wav for wav in train.joinpath(label).iterdir() if wav.name != ".DS_Store"])
    
for label in test_labels:
    test_wav.append([wav for wav in test.joinpath(label).iterdir() if wav.name != ".DS_Store"])
    
print(len(train_wav), len(train_wav[0]))
print(len(test_wav), len(test_wav[0]))

train labels: ['Beethoven_Accompanied_Violin', 'Bach_Solo_Piano', 'Bach_Solo_Cello', 'Beethoven_Solo_Piano', 'Beethoven_String_Quartet', 'Cambini_Wind_Quintet'] 

test labels: ['Beethoven_Accompanied_Violin', 'Bach_Solo_Piano', 'Bach_Solo_Cello', 'Beethoven_Solo_Piano', 'Beethoven_String_Quartet', 'Cambini_Wind_Quintet'] 

6 16
6 6


Puts data and labels into 1d arrays:

In [5]:
train_y = []
test_y = []

unprocessed_train_x = []
unprocessed_test_x = []

for i in range(len(train_labels)):
    for j in range(len(train_wav[i])):
        train_y.append(train_labels[i])
        
for i in range(len(test_labels)):
    for j in range(len(test_wav[i])):
        test_y.append(test_labels[i])
        
for arr in train_wav:
    unprocessed_train_x.extend(arr)
    
for arr in test_wav:
    unprocessed_test_x.extend(arr)
        
print(len(train_y), len(unprocessed_train_x))
print(len(test_y), len(unprocessed_test_x))

153 153
50 50


Processed data, uses librosa to turn wav file into a tensor. Takes first 160,000 samples (~4s), and samples every 5 to get processed audio tensor.

In [14]:
train_x = []
test_x = []

train_progress_counter = 0
test_progress_counter = 0

print("Processing train wav files:")

for path in unprocessed_train_x:
    data, rate = librosa.load(path, sr=16000, duration=10)
    assert rate == 16000
    sample_tensor = torch.tensor(data).float()
    assert sample_tensor.size()  == torch.Size([160000])
    downsampled_tensor = sample_tensor[::5]
    train_progress_counter += 1
    if train_progress_counter % 10 == 0:
        print(train_progress_counter, "/", len(unprocessed_train_x), "processed")
    train_x.append(downsampled_tensor)
    
print("\n", "Processing test wav files:")
    
for path in unprocessed_test_x:
    data, rate = librosa.load(path, sr=16000, duration=10)
    assert rate == 16000
    sample_tensor = torch.tensor(data).float()
    assert sample_tensor.size()  == torch.Size([160000])
    downsampled_tensor = sample_tensor[::5]
    test_progress_counter += 1
    if test_progress_counter % 10 == 0:
        print(test_progress_counter, "/", len(unprocessed_test_x), "processed")
    test_x.append(downsampled_tensor)

Processing train wav files:
10 / 153 processed
20 / 153 processed
30 / 153 processed
40 / 153 processed
50 / 153 processed
60 / 153 processed
70 / 153 processed
80 / 153 processed
90 / 153 processed
100 / 153 processed
110 / 153 processed
120 / 153 processed
130 / 153 processed
140 / 153 processed
150 / 153 processed

 Processing test wav files:
10 / 50 processed
20 / 50 processed
30 / 50 processed
40 / 50 processed
50 / 50 processed


Save train_x, test_x, train_y, test_y to .pt files

In [16]:
torch.save(train_x, "train_x.pt")
torch.save(test_x, "test_x.pt")

torch.save(train_y, "train_y.pt")
torch.save(test_y, "test_y.pt")

Retreive train_x, test_x, train_y, test_y from saved .pt files

In [6]:
train_x = torch.load("train_x.pt")
# test_x = torch.load("test_x.pt")
# train_y = torch.load("train_y.pt")
# test_y = torch.load("test_y.pt")

print(train_x[0])
# print(test_x[0])
# print(train_y[0])
# print(test_y[0])

torch.Size([32000])


Stack train_x and train_y into train_data, stack test_x and test_y into test_data

In [18]:
train_y_class = []
test_y_class = []

dict = {'Beethoven_Accompanied_Violin':0, 'Bach_Solo_Piano':1, 'Bach_Solo_Cello':2, 'Beethoven_Solo_Piano':3, 'Beethoven_String_Quartet':4, 'Cambini_Wind_Quintet':5}

for label in train_y:
    train_y_class.append(torch.tensor(dict[label]))
    
for label in test_y:
    test_y_class.append(torch.tensor(dict[label]))
    
train_data = list(zip(train_x, train_y_class))
test_data = list(zip(test_x, test_y_class))

random.shuffle(train_data)
random.shuffle(test_data)

In [19]:

# Make train_x so that it has labels + data, same w test_x


# Skipping the batching for now

# kwargs = {'num_workers': 1, 'pin_memory': True} if device == 'cuda' else {} #needed for using datasets on gpu
# train_loader = torch.utils.data.DataLoader(train_x, batch_size = 128, shuffle = True, **kwargs)
# test_loader = torch.utils.data.DataLoader(test_x, batch_size = 128, shuffle = True, **kwargs)

CNN modeled after the M5 network architecture described in https://arxiv.org/pdf/1610.00087.pdf

In [20]:
class CNN(nn.Module):
    def __init__(self):
        super(CNN, self).__init__()
        self.conv1 = nn.Conv1d(1, 128, 80, 4)
        self.bn1 = nn.BatchNorm1d(128)
        self.pool1 = nn.MaxPool1d(4)
        self.conv2 = nn.Conv1d(128, 128, 3)
        self.bn2 = nn.BatchNorm1d(128)
        self.pool2 = nn.MaxPool1d(4)
        self.conv3 = nn.Conv1d(128, 256, 3)
        self.bn3 = nn.BatchNorm1d(256)
        self.pool3 = nn.MaxPool1d(4)
        self.conv4 = nn.Conv1d(256, 512, 3)
        self.bn4 = nn.BatchNorm1d(512)
        self.pool4 = nn.MaxPool1d(4)
        self.avgPool = nn.AvgPool1d(30) #input should be 512x30 so this outputs a 512x1
        self.fc1 = nn.Linear(512, 6)
        
    def forward(self, x):
        x = self.conv1(x)
        x = F.relu(self.bn1(x))
        x = self.pool1(x)
        x = self.conv2(x)
        x = F.relu(self.bn2(x))
        x = self.pool2(x)
        x = self.conv3(x)
        x = F.relu(self.bn3(x))
        x = self.pool3(x)
        x = self.conv4(x)
        x = F.relu(self.bn4(x))
        x = self.pool4(x)
        x = self.avgPool(x)
        x = x.permute(0, 2, 1) #change the 512x1 to 1x512
        x = self.fc1(x)
        return F.log_softmax(x, dim = 2)

model = CNN()
model.to(device)
print(model)

CNN(
  (conv1): Conv1d(1, 128, kernel_size=(80,), stride=(4,))
  (bn1): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (pool1): MaxPool1d(kernel_size=4, stride=4, padding=0, dilation=1, ceil_mode=False)
  (conv2): Conv1d(128, 128, kernel_size=(3,), stride=(1,))
  (bn2): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (pool2): MaxPool1d(kernel_size=4, stride=4, padding=0, dilation=1, ceil_mode=False)
  (conv3): Conv1d(128, 256, kernel_size=(3,), stride=(1,))
  (bn3): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (pool3): MaxPool1d(kernel_size=4, stride=4, padding=0, dilation=1, ceil_mode=False)
  (conv4): Conv1d(256, 512, kernel_size=(3,), stride=(1,))
  (bn4): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (pool4): MaxPool1d(kernel_size=4, stride=4, padding=0, dilation=1, ceil_mode=False)
  (avgPool): AvgPool1d(kernel_size=(30,), stride=(30,), 

Define optimizer (using Adam) and scheduler, which lowers the learning rate from 0.01 to 0.0001 over the course of training

In [21]:
optimizer = optim.Adam(model.parameters(), lr = 0.01, weight_decay = 0.0001)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size = 20, gamma = 0.1)

In [22]:
def train(model, epoch):
    model.train()
    for data, label in train_data:
        # for batch_idx
        optimizer.zero_grad()
        data = data.requires_grad_() #set requires_grad to True for training
        data = torch.unsqueeze(data, 0)
        data = torch.unsqueeze(data, 0)
        output = model(data)
        output = output.permute(1, 0, 2) #original output dimensions are batchSizex1x6 
        loss = F.nll_loss(output[0], torch.unsqueeze(label, 0)) #the loss functions expects a batchSizex6 input
        loss.backward()
        optimizer.step()
#         if batch_idx % log_interval == 0: #print training stats
#             print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
#                 epoch, batch_idx * len(data), len(train_loader.dataset),
#                 100. * batch_idx / len(train_loader), loss))

In [27]:
def test(model, epoch):
    model.eval()
    correct = 0
    for data, label in test_data:
        data = torch.unsqueeze(data, 0)
        data = torch.unsqueeze(data, 0)
        output = model(data)
        output = output.permute(1, 0, 2)
        pred = output.max(2)[1] # get the index of the max log-probability
        print(pred, label)
        correct += pred.eq(label).cpu().sum().item()
    print('\nTest set: Accuracy: {}/{} ({:.0f}%)\n'.format(
        correct, len(test_data),
        100. * correct / len(test_data)))

In [28]:
log_interval = 20
for epoch in range(1, 41):
    if epoch == 31:
        print("First round of training complete. Setting learn rate to 0.001.")
    scheduler.step()
    train(model, epoch)
    test(model, epoch)

tensor([[3]]) tensor(0)
tensor([[1]]) tensor(3)
tensor([[3]]) tensor(0)
tensor([[4]]) tensor(2)
tensor([[4]]) tensor(4)
tensor([[4]]) tensor(1)
tensor([[4]]) tensor(3)
tensor([[4]]) tensor(3)
tensor([[2]]) tensor(3)
tensor([[3]]) tensor(0)
tensor([[3]]) tensor(0)
tensor([[4]]) tensor(4)
tensor([[3]]) tensor(0)
tensor([[4]]) tensor(4)
tensor([[1]]) tensor(1)
tensor([[1]]) tensor(5)
tensor([[1]]) tensor(5)
tensor([[4]]) tensor(3)
tensor([[4]]) tensor(3)
tensor([[1]]) tensor(1)
tensor([[4]]) tensor(3)
tensor([[1]]) tensor(1)
tensor([[1]]) tensor(3)
tensor([[4]]) tensor(3)
tensor([[1]]) tensor(1)
tensor([[4]]) tensor(3)
tensor([[3]]) tensor(4)
tensor([[4]]) tensor(3)
tensor([[1]]) tensor(3)
tensor([[3]]) tensor(4)
tensor([[4]]) tensor(1)
tensor([[3]]) tensor(4)
tensor([[1]]) tensor(1)
tensor([[4]]) tensor(2)
tensor([[4]]) tensor(4)
tensor([[4]]) tensor(3)
tensor([[4]]) tensor(3)
tensor([[4]]) tensor(3)
tensor([[3]]) tensor(4)
tensor([[4]]) tensor(3)
tensor([[4]]) tensor(2)
tensor([[1]]) te

KeyboardInterrupt: 