# Exercise 3

In [None]:
# author name and student ID for this notebook
NAME = "Peter Rjabcsenko"
STUDENT_ID = "1228563"

# Part 1

### Ex 2 imports and constants

In [None]:
# lets import everything we will need first...
# some generic stuff, numpy will help us with math!
import os
import numpy as np
import time

# madmom audio processing stuff and evaluation
import madmom
from madmom.utils import search_files

# pytorch, deep learning library
import torch
import torch.nn as nn
import torch.nn.functional as torch_func
import torch.optim as optim
from torch.utils.data import Dataset as Dataset

# base directory for all data paths: the current working directory
PATH = os.getcwd()

# request GPU usage for NN training; the code further down additionally
# checks torch.cuda.is_available() before actually selecting CUDA
g_use_cuda = True

# seed for RNG for reproducible results
# (passed to random.Random for the data split and to torch.manual_seed)
seed = 1234 #12345
# marker that this cell finished executing
print('done')

### Extra imports, constants and helper functions

In [None]:
import librosa
import random
from sklearn.metrics import roc_auc_score

######## ADJUST DATA PATHS ACCORDING TO YOUR LOCAL CONFIGURATION ########
# root of the part-1 data, relative to the notebook's working directory
DATA_PATH_1 = os.path.join(PATH, 'data/part_1')
# location that is searched for the .mp3 clips
AUDIO_PATH_1 = os.path.join(DATA_PATH_1, 'mp3.zip')
# tab-separated tag annotations and clip meta data
ANNOTATIONS_PATH_1 = os.path.join(DATA_PATH_1, 'annotations_final.csv')
META_DATA_PATH_1 = os.path.join(DATA_PATH_1, 'clip_info_final.csv')

# Directories for cached features and saved models.
# exist_ok=True creates them on first use and is a no-op afterwards, so no
# separate (race-prone) existence check is needed.
CACHE_PATH_1 = os.path.join(DATA_PATH_1, 'feat_cache')
os.makedirs(CACHE_PATH_1, exist_ok=True)
MODEL_PATH_1 = os.path.join(DATA_PATH_1, 'models')
os.makedirs(MODEL_PATH_1, exist_ok=True)

# base file name (without extension) of the saved CNN checkpoint
CNN_MODEL_NAME = 'cnn_model'

# function for formatting input data:
# removes every double quote from each string of an array, element-wise
replace = np.vectorize(lambda v : v.replace("\"",""))

### Load audio, annotations and metadata

In [None]:
# collect all .mp3 files below AUDIO_PATH_1 (one directory level deep)
audio_files = search_files(AUDIO_PATH_1, '.mp3', recursion_depth=1)

# librosa cant load these files for some reason
# NOTE(review): the deletions below are positional, so they only hit the
# intended files if search_files returns exactly the same listing in the same
# order; each `del` also shifts the indices of all later entries, i.e. the
# second and third index refer to the already shortened list.
# norine_braun-now_and_zen-08-gently-117-146.mp3
del audio_files[10687]
# jacob_heringman-josquin_des_prez_lute_settings-19-gintzler__pater_noster-204-233.mp3
del audio_files[12821]
# american_baroque-dances_and_suites_of_rameau_and_couperin-26-loracle_suite_in_d_from_les_fetes_dhebe_rameau-0-29
del audio_files[13701]

# both tables are read as tab-separated string arrays; the header row is kept
# as row 0 and quote characters are not stripped at this point
annotations = np.genfromtxt(ANNOTATIONS_PATH_1, dtype=str, delimiter='\t')
meta_data = np.genfromtxt(META_DATA_PATH_1, dtype=str, delimiter='\t')

### LogMelSpectrogram from Music Auto Tagging with some adjustments (+ caching)

In [None]:
def compute_melgram(audio_path):
    """
    Compute a log-scaled mel-spectrogram for one audio file.

    The signal is resampled to 12 kHz and brought to a fixed length of
    29.12 s: shorter signals are zero-padded at the end, longer ones are
    cropped around their centre.

    Parameters
    ----------
    audio_path : str
        Path of the audio file. Any format supported by librosa.load works,
        see http://librosa.github.io/librosa/generated/librosa.core.load.html#librosa.core.load

    Returns
    -------
    numpy array
        Spectrogram with two leading singleton axes, i.e. shape
        (1, 1, #mel-bins, #time frames); (1, 1, 96, 1366) for the
        parameters used here.

    """
    # mel-spectrogram parameters
    SR = 12000
    N_FFT = 512
    N_MELS = 96
    HOP_LEN = 256
    DURA = 29.12  # chosen to end up with 1366 frames

    signal, _ = librosa.load(audio_path, sr=SR)  # whole signal
    actual_len = signal.shape[0]
    target_len = int(DURA * SR)

    if actual_len < target_len:
        # too short: append silence
        padding = np.zeros((int(DURA * SR) - actual_len,))
        signal = np.hstack((signal, padding))
    elif actual_len > target_len:
        # too long: keep the centred window of the target length
        start = int((actual_len - target_len) / 2)
        stop = int((actual_len + target_len) / 2)
        signal = signal[start:stop]

    mel_power = librosa.feature.melspectrogram(y=signal, sr=SR,
                                               hop_length=HOP_LEN,
                                               n_fft=N_FFT, n_mels=N_MELS)
    log_mel = librosa.core.power_to_db(mel_power)

    # prepend the two singleton axes
    return log_mel[np.newaxis, np.newaxis, :]

def init_features(files, cache=True, cache_ext='.cache.npy', **kwargs):
    """
    Return one mel-spectrogram per audio file, using the feature cache.

    The cache file of an audio file lives in CACHE_PATH_1 and is named after
    the audio file's base name (directory and extension removed) plus
    `cache_ext`.

    Parameters
    ----------
    files : list
        List with audio file names.
    cache : bool, optional
        If set, load features from existing cache files and write newly
        computed features to the cache.
    cache_ext : str, optional
        Extension used for the cache files.
    kwargs : dict, optional
        Accepted for call compatibility; not used by this function.

    Returns
    -------
    feature_list : list
        Features in the same order as `files`.

    """
    features = []
    for path in files:
        base_name = os.path.splitext(os.path.split(path)[1])[0]
        cached_path = os.path.join(CACHE_PATH_1, base_name + cache_ext)

        if not (cache and os.path.exists(cached_path)):
            # nothing usable in the cache: compute (and store if requested)
            current = compute_melgram(path)
            if cache:
                np.save(cached_path, current)
        else:
            current = np.load(cached_path)

        features.append(current)
        # progress report every 5000 files
        if len(features) % 5000 == 0:
            print('computed', len(features), 'features...')
    return features

### Only keep 50 top tags

In [None]:
def filter_top_50_tags(annotations, n_tags=50):
    """
    Reduce an annotation table to its `n_tags` most frequent tag columns.

    Row 0 of `annotations` is the header; the first and the last column are
    not tags and are always kept. All columns in between are tag columns
    whose cells hold (optionally double-quoted) integers.

    Exactly `n_tags` tag columns are returned (all of them if the table has
    fewer). Ties at the cut-off are resolved in favour of the column that
    comes first, so the result can never contain more than `n_tags` tags.

    Parameters
    ----------
    annotations : numpy array
        2-D string array including the header row.
    n_tags : int, optional
        Number of tag columns to keep.

    Returns
    -------
    numpy array
        New array with the first column, the selected tag columns (in their
        original order) and the last column; cell contents are unchanged.

    """
    anno = np.asarray(annotations)
    last_col = anno.shape[1] - 1

    # per-tag frequency over all data rows; otypes keeps this working for a
    # table that only consists of the header row
    to_int = np.vectorize(lambda v: int(v.replace('"', '')), otypes=[int])
    tag_counts = to_int(anno[1:, 1:last_col]).sum(axis=0)

    # a stable sort on the negated counts ranks tags by descending frequency
    # and breaks ties by original column position
    ranked = np.argsort(-tag_counts, kind='stable')[:n_tags]
    # restore column order and shift by one for the leading non-tag column
    tag_cols = (np.sort(ranked) + 1).tolist()

    cols = [0] + tag_cols + [last_col]
    return anno[:, cols]

In [None]:
# annotation table reduced to its first column, the most frequent tag columns
# and its last column
top_annotations = filter_top_50_tags(annotations)

### Create Train / Validation / Test splits

In [None]:
def compute_title_dictionary(meta_data):
    """
    Build the lookup: audio file name -> track title.

    Row 0 of `meta_data` is treated as the header and skipped. For every
    other row, column 9 is used as the audio file name (key) and column 2 as
    the track title (value); double quotes are removed from both. A table
    that only contains the header row yields an empty dictionary.

    Parameters
    ----------
    meta_data : numpy array
        2-D string array with the clip meta data including the header row.

    Returns
    -------
    dict
        Maps audio file name (str) to track title (str). If a file name
        occurs more than once, the last row wins.

    """
    title_col, path_col = 2, 9

    meta_dict = {}
    for row in meta_data[1:]:
        file_name = str(row[path_col]).replace('"', '')
        meta_dict[file_name] = str(row[title_col]).replace('"', '')
    return meta_dict

def compute_target_dictionary(annotations):
    """
    Build the lookup: audio file name -> target vector.

    Row 0 of `annotations` is treated as the header and the first column is
    ignored. In every other row the last column is the audio file name (key)
    and all columns in between are the tag values (value). Double quotes are
    removed from all cells. The number of tag columns is taken from the table
    itself, so any number of tags is supported.

    Parameters
    ----------
    annotations : numpy array
        2-D string array with the (filtered) annotations including the
        header row.

    Returns
    -------
    dict
        Maps audio file name (str) to a 1-D float32 numpy array holding one
        value per tag column.

    """
    target_dict = {}
    for row in annotations[1:]:
        cells = [str(cell).replace('"', '') for cell in row[1:]]
        # last cell is the file name, everything before it are tag values
        tags = [float(cell) for cell in cells[:-1]]
        target_dict[cells[-1]] = np.array(tags, dtype=np.float32)
    return target_dict

def group_audio(audio_files, meta_dict):
    """
    Group consecutive audio files that share the same track title.

    The title of a file is looked up in `meta_dict` under the part of its
    path that follows AUDIO_PATH_1 + '/'. Only neighbouring files are
    compared, so files of one track have to be adjacent in `audio_files` to
    end up in the same group.

    Parameters
    ----------
    audio_files : list
        Audio file paths located below AUDIO_PATH_1.
    meta_dict : dict
        Maps audio file name (relative to AUDIO_PATH_1) to track title.

    Returns
    -------
    list
        List of groups, each group being a list of audio file paths.

    """
    prefix = AUDIO_PATH_1 + '/'

    def title_of(path):
        return meta_dict[path.split(prefix)[1]]

    groups = []
    # the first file (if any) opens the first group without a title lookup
    current_group = list(audio_files[:1])
    for previous, path in zip(audio_files, audio_files[1:]):
        if title_of(previous) != title_of(path):
            # title changed: close the running group and start a new one
            groups.append(current_group)
            current_group = []
        current_group.append(path)

    groups.append(current_group)
    return groups

def shuffle_and_split_files(grouped_audio, rng_seed=None):
    """
    Shuffle track groups and split them into training/validation/test files.

    Whole groups are assigned to a split, so files of one group never end up
    in different splits. Approximately 50% of the groups go to training, 25%
    to validation and the rest to test; the file counts follow from the group
    sizes and are therefore only approximately 50/25/25.

    Parameters
    ----------
    grouped_audio : list
        List of groups, each group being a list of audio file paths.
    rng_seed : int, optional
        Seed for the shuffle. Defaults to the module-level `seed`, which
        reproduces the previous behaviour.

    Returns
    -------
    tuple of numpy arrays
        (training files, validation files, test files), each flattened to a
        1-D array of paths.

    """
    if rng_seed is None:
        rng_seed = seed

    n_groups = len(grouped_audio)
    half_idx = n_groups // 2
    three_quarter_idx = half_idx + half_idx // 2

    # a private Random instance keeps the shuffle independent of (and without
    # side effects on) the global random state
    shuffled = random.Random(rng_seed).sample(grouped_audio, n_groups)

    splits = (shuffled[:half_idx],
              shuffled[half_idx:three_quarter_idx],
              shuffled[three_quarter_idx:])
    training_audio, validation_audio, test_audio = (
        np.array([path for group in split for path in group])
        for split in splits
    )
    return training_audio, validation_audio, test_audio

def init_targets(audio_files, target_dict):
    """
    Collect the target vectors for the given audio files, in the same order.

    Each file is looked up in `target_dict` under the part of its path that
    follows AUDIO_PATH_1 + '/'.

    Parameters
    ----------
    audio_files : iterable
        Audio file paths located below AUDIO_PATH_1.
    target_dict : dict
        Maps audio file name (relative to AUDIO_PATH_1) to target vector.

    Returns
    -------
    numpy array
        The stacked target vectors, one row per audio file.

    """
    prefix = AUDIO_PATH_1 + '/'
    return np.array([target_dict[path.split(prefix)[1]] for path in audio_files])

In [None]:
# lookup tables keyed by audio file name: track title and target vector
meta_dict = compute_title_dictionary(meta_data)
target_dict = compute_target_dictionary(top_annotations)
# consecutive clips with the same title form one group; whole groups are then
# distributed over the training / validation / test splits
grouped_audio = group_audio(audio_files, meta_dict)
training_audio, validation_audio, test_audio = shuffle_and_split_files(grouped_audio)

In [None]:
# one mel-spectrogram per file of each split; features are read from the
# cache directory when a cache file exists and computed (and cached) otherwise
train_feat = init_features(training_audio)
val_feat = init_features(validation_audio)
test_feat = init_features(test_audio)

In [None]:
# target vectors per split, in the same file order as the features above
train_targ = init_targets(training_audio, target_dict)
val_targ = init_targets(validation_audio, target_dict)
test_targ = init_targets(test_audio, target_dict)

In [None]:
# features and targets used for CNN training and testing

# only every step_size-th item of each split is used by the CNN;
# lower the step size to use more data
step_size = 48
training_features, training_targets = train_feat[::step_size], train_targ[::step_size]
validation_features, validation_targets = val_feat[::step_size], val_targ[::step_size]
test_features, test_targets = test_feat[::step_size], test_targ[::step_size]

# the complete test split, for testing on all data
test_features_all, test_targets_all = test_feat, test_targ

### CNN model, helper functions and training loop

In [None]:
# tagger base class
class Net(nn.Module):
    """
    Convolutional tagger: four conv stages followed by one linear layer.

    Every stage is Conv2d(3x3, padding 1) -> BatchNorm2d -> ReLU -> MaxPool2d
    -> Dropout2d(p=0.5). The pooling windows reduce the height by a factor of
    2*4*3*4 = 96 and the width by 4*5*8*8 = 1280 in total. The linear layer
    expects exactly 2048 flattened features, i.e. the input must be sized so
    that a 1x1 map with 2048 channels is left after the last pooling step
    (e.g. spectrograms of shape (N, 1, 96, 1366)).

    The output has shape (N, 50) with values in (0, 1) (sigmoid), suitable
    for binary cross entropy against multi-hot targets.
    """

    def __init__(self):
        super(Net, self).__init__()
        # NOTE: the attribute names below are the keys of the saved state
        # dict; renaming them makes existing checkpoints unloadable.

        self.conv1 = nn.Conv2d(1, 128, kernel_size=3, padding=1)
        self.conv1_bn = nn.BatchNorm2d(128)
        self.mp1 = nn.MaxPool2d((2,4), stride=(2,4))
        self.drop1 = nn.Dropout2d(p=0.5)

        self.conv2 = nn.Conv2d(128, 384, kernel_size=3, padding=1)
        self.conv2_bn = nn.BatchNorm2d(384)
        self.mp2 = nn.MaxPool2d((4,5), stride=(4,5))
        self.drop2 = nn.Dropout2d(p=0.5)

        self.conv3 = nn.Conv2d(384, 768, kernel_size=3, padding=1)
        self.conv3_bn = nn.BatchNorm2d(768)
        self.mp3 = nn.MaxPool2d((3,8), stride=(3,8))
        self.drop3 = nn.Dropout2d(p=0.5)

        self.conv4 = nn.Conv2d(768, 2048, kernel_size=3, padding=1)
        self.conv4_bn = nn.BatchNorm2d(2048)
        self.mp4 = nn.MaxPool2d((4,8), stride=(4,8))
        self.drop4 = nn.Dropout2d(p=0.5)

        self.lin1 = nn.Linear(2048, 50)

    def forward(self, x):
        """
        Calculate the network output for a batch of inputs.

        :param x: float tensor of shape (N, 1, height, width)
        :return: tensor of shape (N, 50) with values in (0, 1)
        """
        h1 = torch_func.relu(self.conv1_bn(self.conv1(x)))
        h2 = self.drop1(self.mp1(h1))
        h3 = torch_func.relu(self.conv2_bn(self.conv2(h2)))
        h4 = self.drop2(self.mp2(h3))
        h5 = torch_func.relu(self.conv3_bn(self.conv3(h4)))
        h6 = self.drop3(self.mp3(h5))
        h7 = torch_func.relu(self.conv4_bn(self.conv4(h6)))
        h8 = self.drop4(self.mp4(h7))
        # flatten everything but the batch axis for the linear layer
        h8 = h8.reshape(h8.size(0), -1)
        # torch.sigmoid replaces the deprecated torch.nn.functional.sigmoid
        y = torch.sigmoid(self.lin1(h8))
        return y

In [None]:
# training / test / inference functions and data set class for the data loader

def train_nn(args, model, device, train_loader, optimizer, epoch):
    """
    Run one epoch of mini-batch training.

    Every batch provided by the loader is used once: forward pass, binary
    cross entropy loss, backward pass and one optimizer step.
    :param args: NN parameters; only args.log_interval (batches between two log lines) is read
    :param model: The model to be trained
    :param device: PyTorch device: CPU or GPU
    :param train_loader: Data provider yielding (data, target) batches
    :param optimizer: Optimizer (Gradient descent update algorithm)
    :param epoch: Current epoch number (only used for logging)
    :return: None
    """
    # training mode (activates dropout layers for example)
    model.train()
    # start of the current timing window
    tick = time.time()
    for step, (inputs, labels) in enumerate(train_loader):
        # move the batch to the device (GPU) if necessary
        inputs = inputs.to(device)
        labels = labels.to(device)

        # clear gradients of the previous step
        optimizer.zero_grad()
        # forward pass and loss
        predictions = model(inputs.float())
        batch_loss = torch_func.binary_cross_entropy(predictions, labels)
        # backward pass (gradients via automatic differentiation)
        batch_loss.backward()
        # update the network parameters
        optimizer.step()

        if step % args.log_interval != 0:
            continue
        # logging interval reached: report progress, loss and elapsed time
        print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}, took {:.2f}s'.format(
            epoch, step * len(inputs), len(train_loader.dataset),
            100. * step / len(train_loader), batch_loss.item(), time.time()-tick))
        tick = time.time()


       
def test_nn(args, model, device, test_loader):
    """
    Function wich iterates over test data (eval or test set) and calculates loss.
    Here no parameter update is done
    :param args: NN parameters for training and inference
    :param model: The model to be tested
    :param device: PyTorch device: CPU or GPU
    :param test_loader: Data provider
    :return: cumulative test loss
    """
    # set model to inference mode (deactivate dropout layers for example).
    model.eval()
    # init cumulative loss
    test_loss = 0
    # do not calculate gradients since we do not want to do updates
    with torch.no_grad():
        # iterate over test data
        for data, target in test_loader:
            # move data to device (GPU) if necessasry
            data, target = data.to(device), target.to(device)
            # forward pass (calculate output of network for input)
            output = model(data.float())
            # claculate loss and add it to our cumulative loss
            test_loss += torch_func.binary_cross_entropy(output, target, reduction='sum').item()  # sum up batch loss

    # output results of test run
    test_loss /= len(test_loader.dataset)
    print('Average loss: {:.4f}\n'.format(
        test_loss, len(test_loader.dataset)))

    return test_loss
  

    
def inference_cnn(model, device, data):
    """
    Calculate the output of the network for the given input.

    The model is switched to inference mode and no gradients are recorded.
    :param model: The network to be used
    :param device: PyTorch device: CPU or GPU
    :param data: numpy array with the input for which the output should be calculated
    :return: output of network (tensor on `device`)
    """
    # inference mode (deactivates dropout layers for example)
    model.eval()
    # wrap the numpy input as tensor and move it to the device if necessary
    inputs = torch.from_numpy(data).to(device)
    # no gradients needed, nothing is updated
    with torch.no_grad():
        return model(inputs.float())


    
# data set wrapper which hands out spectrogram/target pairs as PyTorch tensors
class TagSet(Dataset):
    """
    Data set over parallel sequences of features and targets (numpy arrays).
    """

    def __init__(self, feat_list, targ_list):
        super(TagSet, self).__init__()
        self.features = feat_list
        self.targets = targ_list
        # the number of features defines the size of the data set
        self.length = len(feat_list)

    def __len__(self):
        return self.length

    def __getitem__(self, index):
        # one feature/target pair as tensors; the feature's leading axis is
        # dropped if it has size 1
        feature = torch.from_numpy(self.features[index]).squeeze(0)
        target = torch.from_numpy(self.targets[index])
        return feature, target

    

# plain attribute container for the training arguments
class Args(object):
    """Empty namespace; parameters are attached as attributes after creation."""


print('done')

In [None]:
# cnn tagging experiment
def cnn():
    """
    Train the CNN tagger with early stopping and evaluate it on the test set.

    The model is trained on the module-level training features/targets and
    validated after every epoch. Whenever the validation loss improves, the
    model state is saved to MODEL_PATH_1/<CNN_MODEL_NAME>.model. Training
    stops after `patience` consecutive epochs without improvement or after
    `max_epochs`. The final test loss is reported for the best saved state
    (not for the last epoch), which is also the state later loaded for
    inference.
    """
    print('Training CNN...')

    # parameters for NN training
    args = Args()
    args.batch_size = 8 #64
    args.max_epochs = 25 #1000
    args.patience = 4
    args.lr = 0.01 # 0.001, 0.0001
    args.momentum = 0.5  # not used: Adam takes no momentum argument
    args.no_cuda = not g_use_cuda
    args.seed = seed  # the module-level seed is the one actually applied
    args.log_interval = 10 #100

    # setup pytorch
    use_cuda = not args.no_cuda and torch.cuda.is_available()
    torch.manual_seed(args.seed)
    device = torch.device("cuda" if use_cuda else "cpu")

    # create model and optimizer
    model = Net().to(device)
    optimizer = optim.Adam(model.parameters(), lr=args.lr)

    # setup our datasets for training, evaluation and testing
    kwargs = {'num_workers': 4, 'pin_memory': True} if use_cuda else {'num_workers': 4}
    train_loader = torch.utils.data.DataLoader(TagSet(training_features, training_targets),
                                               batch_size=args.batch_size, shuffle=True, **kwargs)
    valid_loader = torch.utils.data.DataLoader(TagSet(validation_features, validation_targets),
                                               batch_size=args.batch_size, shuffle=False, **kwargs)
    test_loader = torch.utils.data.DataLoader(TagSet(test_features, test_targets),
                                              batch_size=args.batch_size, shuffle=False, **kwargs)

    checkpoint_path = os.path.join(MODEL_PATH_1, CNN_MODEL_NAME + '.model')

    # main training loop
    # inf (instead of a large constant) guarantees that the first finite
    # validation loss counts as an improvement
    best_test_loss = float('inf')
    checkpoint_saved = False
    cur_patience = args.patience
    for epoch in range(1, args.max_epochs + 1):
        # run one epoch of NN training
        train_nn(args, model, device, train_loader, optimizer, epoch)
        # validate on validation set
        print('\nValidation Set:')
        test_loss = test_nn(args, model, device, valid_loader)

        if test_loss < best_test_loss:
            # improvement: keep this state and reset the patience counter
            torch.save(model.state_dict(), checkpoint_path)
            checkpoint_saved = True
            best_test_loss = test_loss
            cur_patience = args.patience
            continue

        # no improvement: count down first, so that training stops after
        # exactly `patience` epochs without improvement
        cur_patience -= 1
        if cur_patience <= 0:
            print('Early stopping, no improvement for %d epochs...' % args.patience)
            break
        print('No improvement, patience: %d' % cur_patience)

    # evaluate the best state rather than the one of the last epoch; skipped
    # if no epoch ever produced a checkpoint (e.g. the loss was never finite)
    if checkpoint_saved:
        model.load_state_dict(torch.load(checkpoint_path, map_location=device))

    # testing on test data
    print('Evaluate CNN...')
    print('Test Set:')
    # calculate loss for test set
    test_nn(args, model, device, test_loader)

In [None]:
# run training (saves the model state with the lowest validation loss below MODEL_PATH_1)
cnn()

### Load saved CNN model and run inference

In [None]:
def run_inference(test_features):
    """
    Load the saved CNN model and calculate its output for every feature.

    The model state is read from MODEL_PATH_1/<CNN_MODEL_NAME>.model. The
    checkpoint is mapped onto the device used here, so a model trained on
    the GPU can also be evaluated on a CPU-only machine (and vice versa).

    :param test_features: sequence of numpy arrays, each one input for the network
    :return: list with one numpy array (the network output) per feature
    """
    use_cuda = g_use_cuda and torch.cuda.is_available()
    torch.manual_seed(seed)
    device = torch.device("cuda" if use_cuda else "cpu")

    # load model
    model = Net().to(device)
    checkpoint_path = os.path.join(MODEL_PATH_1, CNN_MODEL_NAME + '.model')
    model.load_state_dict(torch.load(checkpoint_path, map_location=device))
    print('model loaded...')

    # calculate actual output for the test data
    n_features = len(test_features)
    results_cnn = []
    # iterate over test tracks
    for test_idx, cur_test_feat in enumerate(test_features):
        if test_idx % 100 == 0:
            completion = int((test_idx / n_features)*100)
            print(str(completion)+'% complete...')

        # run the inference method
        result = inference_cnn(model, device, cur_test_feat)
        # .cpu() is required before .numpy() when the output lives on the
        # GPU; it is a no-op for CPU tensors
        results_cnn.append(result.cpu().numpy()[0])

    return results_cnn

### Compute ROC AUC on limited test set (2%)
#### result: 66.00%

In [None]:
# network outputs of the saved model for the sub-sampled test set
results = run_inference(test_features)

In [None]:
# ROC AUC of the network outputs against the targets of the sub-sampled test
# set (no averaging mode is passed, so roc_auc_score's default applies)
score = roc_auc_score(test_targets, results)
print('Tagger ROC AUC score on limited test set is:', score)

### Compute ROC AUC on full test set
#### result: 65.45%

In [None]:
# network outputs of the saved model for the complete test split
results_all = run_inference(test_features_all)

In [None]:
# ROC AUC of the network outputs against the targets of the complete test
# split (no averaging mode is passed, so roc_auc_score's default applies)
score_all = roc_auc_score(test_targets_all, results_all)
print('Tagger ROC AUC score on full test set is:', score_all)

### Results

#### Instructions:
All necessary data used for computations should be located under "data/part_1" relative to the jupyter notebook path.

This includes the "annotations_final.csv" and "clip_info_final.csv" files and the "mp3.zip" folder from the MagnaTagATune dataset (you can adjust the default paths in the "Extra imports, constants and helper functions" code block).

Executing the notebook will additionally create a "data/part_1/feat_cache" folder containing the computed spectrograms, which will be loaded if present in the folder instead of being computed from scratch.

A "data/part_1/models" folder will also be created if the CNN training block is executed which will contain the "cnn_model.model" file. Note that the evaluation block always loads the model from the "models" folder, so please make sure to either run the training before evaluation (might take up to 2 hours on a regular machine) or alternatively create the folder manually and copy the provided "cnn_model.model" file into it.

#### Methodology:
LogMelSpectrograms were used as features along with the top 50 most frequent tags from the MagnaTagATune dataset as targets for the CNN.

The data was randomly split into training, validation and test sets with an approx. 50%/25%/25% ratio with extra care being taken to ensure that no fragments of a single track (same title) end up in different sets.
This resulted in the following splits:

Training - 12749 features;<br/>
Validation - 6448 features;<br/>
Test - 6663 features;

However since training would have been infeasible due to long training times, it was performed only on a subset of the original data (approx. 2% or 540 features).
The reduced splits:

Training - 266 features;<br/>
Validation - 135 features;<br/>
Test - 139 features;

Final testing however was performed <b>both</b> on the <b>Full Test Set</b> (6663) and the <b>Reduced Test Set</b> (139).

#### Results and Observations:
As mentioned above training was conducted on a reduced training set with 1 epoch taking up approx. 5 minutes and the whole training phase lasting 120 minutes (23 epochs) on a MacBook Pro 2017.

ROC area under the curve was used to measure CNN performance.

The network achieved the following ROC AUC scores:
<br/>
66.00% on the Reduced Test Set (139 features)
<br/>
65.45% on the Full Test Set (6663 features)

Since the results were fairly similar in both cases and considerably above 50% (which is associated with random guessing), we can conclude that this implementation is quite promising.

The obvious next step to improving the CNN would be to actually train it on the whole training split, since it is well known that deep neural networks benefit from large amounts of data. Also maybe utilise some tricks to reduce training time :)

# Part 2