# EX 3 Experiment!!!

### Ex 2 imports and constants

In [None]:
# let's import everything we will need first...
# some generic stuff; numpy will help us with math!
import os
import numpy as np
import time

# filters, might be useful for separate and detect
from scipy.signal import butter, freqz
from scipy.ndimage.filters import maximum_filter, uniform_filter

# classifier for segment and classify method
from sklearn.neighbors import KNeighborsClassifier

# madmom audio processing stuff and evaluation
import madmom
from madmom.audio.spectrogram import LogarithmicFilteredSpectrogram
from madmom.audio import Signal
from madmom.features.onsets import OnsetPeakPickingProcessor
from madmom.evaluation import OnsetEvaluation, OnsetSumEvaluation
from madmom.features import CNNOnsetProcessor
from madmom.utils import search_files

# pytorch, deep learning library
import torch
import torch.nn as nn
import torch.nn.functional as torch_func
import torch.optim as optim
from torch.utils.data import Dataset as Dataset

"""
# plotting library for visualization for debugging
import matplotlib.pyplot as plt
plt.rcParams.update({'pgf.rcfonts': False})

COLAB_DRIVE_BASE = "/content/g-drive"
import sys
IN_COLAB = 'google.colab' in sys.modules

# if in colab, mount gdrive
if IN_COLAB:
  from google.colab import drive
  print('trying to mount google drive...')
  drive.mount(COLAB_DRIVE_BASE, force_remount=True)
"""

#
# global numeric constants used along the way
#
EPSILON = np.finfo(np.float32).eps  # float32 machine epsilon; intended as a small stabiliser (e.g. against division by zero)

"""
SETTINGS = {  # settings for spectrogram (feature) calculation
    'fps': 100,  # frames per second of our resulting spectrograms
    'fmin': 30,  # minimum frequency
    'fmax': 15000,  # maximum frequency of spectrogram
    'frame_size': 2048,  # frame size for spectrogram
    'sample_rate': 44100,  # input sample rate - input audio will be resampled to this sample rate.
    'num_bands': 12,  # bands per octave (freq. factor 2)
    'num_channels': 1,  # input audio will be converted to mono
    'norm_filters': True,  # normalize triangular filters for log/log spectrogram to have equal area
}

# drum label names
# all arrays and lists containing instruments will always follow this index system, 0:KD (kick/bass drum),
# 1:SD (snare drum), 2: HH (hi-hat).
names_3_map = ['KD', 'SD', 'HH']
num_3_drum_notes = len(names_3_map)
"""

# base directory that the data paths are derived from (the current working directory)
PATH = os.getcwd()

"""
if IN_COLAB:
  PATH = os.path.join(COLAB_DRIVE_BASE, 'My Drive/Colab Notebooks')
"""

"""
DATA_PATH = os.path.join(PATH, 'data/drums_simple')  # change this value if you copied the dataset somewhere else!
ANNOTATIONS_PATH = os.path.join(DATA_PATH, 'annotations')
SAMPLE_ANNOTATIONS_PATH = os.path.join(DATA_PATH, 'sample_annotations')
AUDIO_PATH = os.path.join(DATA_PATH, 'audio')
SAMPLES_PATH = os.path.join(DATA_PATH, 'samples')
CACHE_PATH = os.path.join(DATA_PATH, 'feat_cache')
if not os.path.exists(CACHE_PATH):
    os.makedirs(CACHE_PATH)
MODEL_PATH = os.path.join(DATA_PATH, 'models')
if not os.path.exists(MODEL_PATH):
    os.makedirs(MODEL_PATH)
CNN_MODEL_NAME = 'cnn_model'
"""

"""
# some info about our data
NUM_KITS = 4  # we have 4 different drum kits
NUM_TRACKS = 4  # and 4 tracks per kit
FPS = SETTINGS['fps']  # shorthand to the FPS we use for our spectrogram
RANK = num_3_drum_notes  # we use three instruments

# turn on / off plotting (for debugging)
plot = False
plot_len = 400
"""

# request GPU usage for NN training; cnn() additionally checks torch.cuda.is_available()
g_use_cuda = True

# seed for the RNG that shuffles the data split, so results are reproducible
seed = 1234 #12345
print('done')


### Extra imports and constants

In [None]:
import librosa

######## ADJUST DATA PATHS ACCORDING TO YOUR LOCAL CONFIGURATION ########
# everything for this experiment lives below <PATH>/data/part_1
DATA_PATH_1 = os.path.join(PATH, 'data/part_1')
AUDIO_PATH_1 = os.path.join(DATA_PATH_1, 'mp3.zip')
ANNOTATIONS_PATH_1 = os.path.join(DATA_PATH_1, 'annotations_final.csv')
META_DATA_PATH_1 = os.path.join(DATA_PATH_1, 'clip_info_final.csv')

# directories for cached features and trained models; exist_ok=True avoids the
# check-then-create race of a separate os.path.exists() test
CACHE_PATH_1 = os.path.join(DATA_PATH_1, 'feat_cache')
os.makedirs(CACHE_PATH_1, exist_ok=True)
MODEL_PATH_1 = os.path.join(DATA_PATH_1, 'models')
os.makedirs(MODEL_PATH_1, exist_ok=True)

# base name under which the CNN model is stored
CNN_MODEL_NAME = 'cnn_model'

### Helper Functions

In [None]:
import random

def _strip_double_quotes(text):
    """Return *text* with every double-quote character removed."""
    return text.replace('"', '')


# element-wise version of the quote stripping, applicable to arrays of strings
replace = np.vectorize(_strip_double_quotes)

### Load audio, annotations and metadata

In [None]:
# collect all mp3 files one directory level below AUDIO_PATH_1
audio_files = search_files(AUDIO_PATH_1, '.mp3', recursion_depth=1)

#print(len(audio_files))

# librosa can't load these files for some reason, so they are removed from the list.
# NOTE(review): the removal is purely index based, so it depends on the exact order
# returned by search_files and on the earlier deletions shifting later indices -
# confirm that the indices still point at the files named below.
# norine_braun-now_and_zen-08-gently-117-146.mp3
del audio_files[10687]
# jacob_heringman-josquin_des_prez_lute_settings-19-gintzler__pater_noster-204-233.mp3
del audio_files[12821]
# american_baroque-dances_and_suites_of_rameau_and_couperin-26-loracle_suite_in_d_from_les_fetes_dhebe_rameau-0-29
del audio_files[13701]

#print(len(audio_files))

# both tables are tab separated and read as raw strings (row 0 is skipped as a header downstream)
annotations = np.genfromtxt(ANNOTATIONS_PATH_1, dtype=str, delimiter='\t')
meta_data = np.genfromtxt(META_DATA_PATH_1, dtype=str, delimiter='\t')

### LogMelSpectrogram from Music Auto Tagging (+ caching from Ex2)

In [None]:
def compute_melgram(audio_path):
    """
    Compute a dB-scaled mel spectrogram for one audio file.

    The signal is loaded at 12 kHz and brought to a fixed length of
    int(29.12 * 12000) samples: shorter signals are zero-padded at the end,
    longer ones are cropped around their centre. According to the original
    author's note this yields a result of shape (1, 1, 96, 1366), i.e.
    96 mel bins by 1366 time frames behind two leading singleton axes.

    Parameters
    ----------
    audio_path : str
        Path of the audio file, passed to ``librosa.load``.

    Returns
    -------
    numpy.ndarray
        The spectrogram with two singleton axes prepended.

    """
    # mel-spectrogram parameters
    sample_rate = 12000
    fft_size = 512
    mel_bins = 96
    hop = 256
    duration = 29.12  # seconds; chosen by the original author to obtain 1366 frames
    target_len = int(duration * sample_rate)

    signal, _ = librosa.load(audio_path, sr=sample_rate)  # whole signal
    current_len = signal.shape[0]
    missing = target_len - current_len

    if missing > 0:
        # too short: append zeros up to the target length
        signal = np.hstack((signal, np.zeros((missing,))))
    elif missing < 0:
        # too long: keep the centred window of target_len samples
        start = int((current_len - target_len) / 2)
        stop = int((current_len + target_len) / 2)
        signal = signal[start:stop]

    mel_power = librosa.feature.melspectrogram(y=signal, sr=sample_rate,
                                               hop_length=hop, n_fft=fft_size,
                                               n_mels=mel_bins)
    mel_db = librosa.core.power_to_db(mel_power)

    # prepend two singleton axes
    return mel_db[np.newaxis, np.newaxis, :]

def init_features(files, cache=True, cache_ext='.cache.npy', **kwargs):
    """
    Return one feature array per audio file, computing or loading from cache.

    The cache file of an audio file lives in CACHE_PATH_1 and is named after
    the audio file's base name (without extension) plus ``cache_ext``.

    Parameters
    ----------
    files : list
        List with audio file names.
    cache : bool, optional
        If true, load features from an existing cache file and store newly
        computed features in the cache.
    cache_ext : str, optional
        Extension appended to the base name to form the cache file name.
    kwargs : dict, optional
        Accepted for compatibility; currently not used by this function.

    Returns
    -------
    feature_list : list
        Features in the same order as ``files``.

    """
    feature_list = []
    for count, audio_file in enumerate(files, start=1):
        stem = os.path.splitext(os.path.basename(audio_file))[0]
        cache_file = os.path.join(CACHE_PATH_1, stem + cache_ext)

        if cache and os.path.exists(cache_file):
            feat = np.load(cache_file)
        else:
            feat = compute_melgram(audio_file)
            if cache:
                np.save(cache_file, feat)
        feature_list.append(feat)

        # progress report every 5000 files
        if count % 5000 == 0:
            print('computed', count, 'features...')
    return feature_list

### Only keep 50 top tags

In [None]:
def filter_top_50_tags(annotations, num_tags=50):
    """
    Reduce the annotation table to its most frequent tag columns.

    Row 0 of ``annotations`` is treated as the header, column 0 and the last
    column are always kept, and every column in between is treated as a tag
    column holding (optionally double-quoted) integer strings.

    Exactly ``num_tags`` tag columns are kept (or all of them if there are
    fewer). Ties in frequency are resolved in favour of the column that comes
    first, so the result never contains more than ``num_tags`` tag columns.

    Parameters
    ----------
    annotations : numpy.ndarray
        2-D string array as described above.
    num_tags : int, optional
        Number of most frequent tags to keep.

    Returns
    -------
    numpy.ndarray
        Copy of the selected columns, tag columns in their original order.

    """
    anno = annotations.copy()

    # per-tag occurrence counts over all data rows (header and outer columns excluded)
    tag_values = np.char.replace(anno[1:, 1:-1], '"', '')
    tag_counts = tag_values.astype(int).sum(axis=0)

    # stable sort on the negated counts: most frequent first, ties keep column order
    top_tags = np.argsort(-tag_counts, kind='stable')[:num_tags]

    # +1 maps tag indices back to table columns (column 0 precedes the tags)
    tag_cols = (np.sort(top_tags) + 1).tolist()
    cols = [0] + tag_cols + [anno.shape[1] - 1]

    return anno[:, cols]

In [None]:
# annotation table restricted to the most frequent tags
top_annotations = filter_top_50_tags(annotations)

### Create Train / Validation / Test splits

In [None]:
def compute_title_dictionary(meta_data):
    """
    Build the lookup: audio file name -> track title.

    The header row is skipped; column 2 provides the value (title) and
    column 9 the key (file name). Double quotes are stripped from both.
    If a file name occurs more than once, the last row wins.
    """
    title_and_file = np.asarray(replace(meta_data.copy()[1:, [2, 9]]))
    return {file_name: title for title, file_name in title_and_file}

def compute_target_dictionary(annotations):
    """
    Build the lookup: audio file name -> float32 vector of tag annotations.

    Row 0 (header) and column 0 are ignored. The last column provides the
    key (file name); every column in between is a tag value. The number of
    tags is derived from the table width instead of being fixed to 50, so a
    table with a different number of tag columns is still keyed correctly.

    Parameters
    ----------
    annotations : numpy.ndarray
        2-D string array; values may be wrapped in double quotes.

    Returns
    -------
    dict
        Maps each file name to a 1-D ``numpy.float32`` array. If a file name
        occurs more than once, the last row wins.

    """
    # drop header row and leading column, then strip the quoting
    clean = np.char.replace(annotations[1:, 1:], '"', '')

    targets = clean[:, :-1].astype(np.float32)
    return dict(zip(clean[:, -1], targets))

def group_audio(audio_files, meta_dict):
    """
    Group consecutive audio files that belong to the same track title.

    Only neighbouring files are compared, so files of one title that are not
    adjacent in ``audio_files`` end up in separate groups.

    Parameters
    ----------
    audio_files : list
        Audio file paths located below AUDIO_PATH_1.
    meta_dict : dict
        Maps the path relative to AUDIO_PATH_1 to the track title.

    Returns
    -------
    list
        List of groups (lists of paths) in input order; empty for empty input.

    Raises
    ------
    KeyError
        If a file has no entry in ``meta_dict``.

    """
    from itertools import groupby

    def title_of(audio_file):
        # meta_dict is keyed by the path relative to the audio root
        return meta_dict[audio_file.split(AUDIO_PATH_1 + '/')[1]]

    # groupby starts a new group whenever the title changes between neighbours
    return [list(group) for _, group in groupby(audio_files, key=title_of)]

def shuffle_and_split_files(grouped_audio):
    """
    Shuffle the groups reproducibly and split them into three flat file arrays.

    Roughly 50% of the groups go to training, 25% to validation and the rest
    to test. The split is made on whole groups and each part is flattened
    afterwards. The shuffle uses the module-level ``seed``.

    Returns
    -------
    tuple of numpy.ndarray
        (training files, validation files, test files)
    """
    def _flatten(groups):
        return [clip for group in groups for clip in group]

    num_groups = len(grouped_audio)
    train_end = num_groups // 2
    valid_end = train_end + train_end // 2

    # sampling all elements yields a shuffled copy; the input list stays untouched
    shuffled = random.Random(seed).sample(grouped_audio, num_groups)

    parts = (shuffled[:train_end], shuffled[train_end:valid_end], shuffled[valid_end:])
    training_audio, validation_audio, test_audio = (np.array(_flatten(part)) for part in parts)
    return training_audio, validation_audio, test_audio

def init_targets(audio_files, target_dict):
    """
    Look up the target vector of every audio file and stack them in an array.

    Each file is looked up in ``target_dict`` by its path relative to
    AUDIO_PATH_1; the result follows the order of ``audio_files``.
    """
    return np.array([target_dict[path.split(AUDIO_PATH_1+'/')[1]] for path in audio_files])

In [None]:
# lookups keyed by audio file name: track title and tag targets
meta_dict = compute_title_dictionary(meta_data)
target_dict = compute_target_dictionary(top_annotations)
# split on whole title groups, so files of one group never end up in different splits
grouped_audio = group_audio(audio_files, meta_dict)
training_audio, validation_audio, test_audio = shuffle_and_split_files(grouped_audio)

In [None]:
# compute (or load cached) spectrogram features for each split
training_features = init_features(training_audio)
validation_features = init_features(validation_audio)
test_features = init_features(test_audio)

In [None]:
# target arrays, row-aligned with the feature lists of each split
training_targets = init_targets(training_audio, target_dict)
validation_targets = init_targets(validation_audio, target_dict)
test_targets = init_targets(test_audio, target_dict)

### CNN Model

In [None]:
# CNN model class (skeleton - the layer definitions below are currently disabled)
class Net(nn.Module):
    """
    Skeleton of the CNN model.

    All layer definitions and the forward computation are currently wrapped in
    string literals and therefore inactive: the constructor registers no
    layers and ``forward`` only prints the input shape and returns ``None``.
    """
    def __init__(self):
        """Initialise the module; no layers are created at the moment."""
        super(Net, self).__init__()
        #
        # In this constructor, create the layers needed to build the network.
        # Use the pytorch components nn.Conv2d, nn.BatchNorm2d, nn.Dropout2d, nn.Linear, nn.BatchNorm1d
        # The network should have the same architecture as presented in the lecture slides (CNN)
        # Note that one convolutional block (yellowish blocks) consists of TWO layers of 3x3 convolutions with
        # batch normalization for EACH layer and max pooling and dropout after the convolutional block (after the 2nd
        # convolutional layer. The whole network consists of two convolutional blocks, where in the first each layer
        # contains 32 filters, and in the second each layer contains 64 filters.
        # After that, a dense layer (nn.Linear) with 50 neurons and the output dense layer with 3 neurons follow.
        # NOTE(review): the targets built in this file have one entry per kept tag (50), while the
        # disabled template below ends in 3 outputs - adjust before enabling it.
        
        # e.g. for the first convolutional layer we will need something like:
        """
        self.conv1 = nn.Conv2d(1, 32, kernel_size=3, padding=1)
        self.conv1_bn = nn.BatchNorm2d(32)
        self.conv2 = nn.Conv2d(32, 32, kernel_size=3, padding=1)
        self.conv2_bn = nn.BatchNorm2d(32)
        self.mp1 = nn.MaxPool2d(3, stride=2)
        self.drop1 = nn.Dropout2d(p=0.5)
        
        self.conv3 = nn.Conv2d(32, 64, kernel_size=3, padding=1)
        self.conv3_bn = nn.BatchNorm2d(64)
        self.conv4 = nn.Conv2d(64, 64, kernel_size=3, padding=1)
        self.conv4_bn = nn.BatchNorm2d(64)
        self.mp2 = nn.MaxPool2d(3, stride=2)
        self.drop2 = nn.Dropout2d(p=0.5)
        
        self.lin1 = nn.Linear(64*5*19, 50) #64x5x19 are the dimensions of one feature at this point
        self.lin1_bn = nn.BatchNorm1d(50)
        self.drop3 = nn.Dropout2d(p=0.5)
        
        self.lin2 = nn.Linear(50,3)
        """

    def forward(self, x):
        """
        Print the shape of ``x`` and return ``None``.

        The actual forward pass is disabled (it sits in the string literal
        below), so this method does not produce a network output yet.
        """
        # This function calculates a forward pass through the network (i.e. calculates the output for given input x).
        # Hand x through the layers of the network and calculate the output.
        # Don't forget to apply the nonlinearities (activation functions).
        # Use ReLU activation function ( torch_func.relu() ) except for the output of the network where we need
        # sigmoid activations (0-1) for our activation functions ( torch_func.sigmoid() or torch.sigmoid() )
        # e.g. to calculate the hidden output of the first convolutional layer:
        
        # NOTE(review): debugging output - remove once the forward pass is implemented
        print('wtf!!!', x.shape)
        """
        h1 = torch_func.relu(self.conv1_bn(self.conv1(x)))
        h2 = torch_func.relu(self.conv2_bn(self.conv2(h1)))
        h3 = self.drop1(self.mp1(h2))
        h4 = torch_func.relu(self.conv3_bn(self.conv3(h3)))
        h5 = torch_func.relu(self.conv4_bn(self.conv4(h4)))
        h6 = self.drop2(self.mp2(h5))
        h6 = h6.view(h6.size(0), -1) # "reshape" for the fully connected layer (keeping the batch size)
        h7 = torch_func.relu(self.drop3(self.lin1_bn(self.lin1(h6))))
        h8 = self.lin2(h7)

        # Note that you should always apply the batch normalization, max pooling (torch_func.max_pool2d),
        # and dropout BEFORE you apply the activation function!!
        hn = h8
        y = torch_func.sigmoid(hn)
        """
        # placeholder result while the forward pass above is disabled
        y = None
        return y


In [None]:
# dataset wrapper that serves (spectrogram, tag target) pairs as PyTorch tensors
class TagSet(Dataset):
    """
    Pairs per-clip features with their targets for CNN training.

    Both containers are indexed with the same integer index and every item
    must be a numpy array, because items are converted with
    ``torch.from_numpy``.
    """

    def __init__(self, feat_list, targ_list):
        """
        :param feat_list: indexable collection with one feature array (np.array) per clip
        :param targ_list: indexable collection with one target array (np.array) per clip
        """
        super().__init__()
        self.features = feat_list
        self.targets = targ_list
        # the dataset size is fixed at construction time
        self.length = len(feat_list)

    def __len__(self):
        return self.length

    def __getitem__(self, index):
        """
        Return the sample at the given position.

        :param index: index of the sample to be returned
        :return: tuple (feature tensor, target tensor); a leading axis of
                 size one is removed from the feature tensor
        """
        spectrogram = torch.from_numpy(self.features[index])
        tags = torch.from_numpy(self.targets[index])
        return spectrogram.squeeze(0), tags

# helper class for arguments
class Args:
    """Empty container; settings are attached as plain attributes."""


print('done')

In [None]:
# cnn experiment entry point
def cnn():
    """
    Set up the data loaders for the CNN experiment.

    Reads the module-level feature/target collections of the three splits and
    wraps each of them in a ``DataLoader`` (batch size 64, only the training
    loader shuffles). At the moment the loaders are created but not used any
    further, and nothing is returned.
    """
    print('Training CNN...')

    args = Args()
    args.batch_size = 64
    args.no_cuda = not g_use_cuda

    # setup pytorch: use CUDA only if it is requested and actually available
    use_cuda = False if args.no_cuda else torch.cuda.is_available()

    # loader options; pinned memory is only requested when running on the GPU
    loader_options = {'num_workers': 4}
    if use_cuda:
        loader_options['pin_memory'] = True

    def _make_loader(features, targets, shuffle):
        return torch.utils.data.DataLoader(TagSet(features, targets),
                                           batch_size=args.batch_size,
                                           shuffle=shuffle, **loader_options)

    # setup our datasets for training, evaluation and testing
    train_loader = _make_loader(training_features, training_targets, True)
    valid_loader = _make_loader(validation_features, validation_targets, False)
    test_loader = _make_loader(test_features, test_targets, False)

    # quick sanity check of the batch shapes:
    # f, t = next(iter(test_loader))
    # print(f.shape)
    # print(t.shape)

In [None]:
# run the experiment (cnn() currently only builds the data loaders)
cnn()