# EX 3 Experiment!!!

### Ex 2 imports and constants

In [None]:
# let's import everything we will need first...
# some generic stuff, numpy will help us with math!
import os
import numpy as np
import time

# filters, might be useful for separate and detect
from scipy.signal import butter, freqz
from scipy.ndimage.filters import maximum_filter, uniform_filter

# classifier for segment and classify method
from sklearn.neighbors import KNeighborsClassifier

# madmom audio processing stuff and evaluation
import madmom
from madmom.audio.spectrogram import LogarithmicFilteredSpectrogram
from madmom.audio import Signal
from madmom.features.onsets import OnsetPeakPickingProcessor
from madmom.evaluation import OnsetEvaluation, OnsetSumEvaluation
from madmom.features import CNNOnsetProcessor
from madmom.utils import search_files

# pytorch, deep learning library
import torch
import torch.nn as nn
import torch.nn.functional as torch_func
import torch.optim as optim
from torch.utils.data import Dataset as Dataset

"""
# plotting library for visualization for debugging
import matplotlib.pyplot as plt
plt.rcParams.update({'pgf.rcfonts': False})

COLAB_DRIVE_BASE = "/content/g-drive"
import sys
IN_COLAB = 'google.colab' in sys.modules

# if in colab, mount gdrive
if IN_COLAB:
  from google.colab import drive
  print('trying to mount google drive...')
  drive.mount(COLAB_DRIVE_BASE, force_remount=True)
"""

#
# some global parameter settings we will need along the way
#
EPSILON = np.finfo(np.float32).eps  # small epsilon needed sometimes for computational stability (div by zeros)

# NOTE: the triple-quoted block below is a bare string literal, i.e. disabled
# code (spectrogram settings and drum label names); it is never executed.
"""
SETTINGS = {  # settings for spectrogram (feature) calculation
    'fps': 100,  # frames per second of our resulting spectrograms
    'fmin': 30,  # minimum frequency
    'fmax': 15000,  # maximum frequency of spectrogram
    'frame_size': 2048,  # frame size for spectrogram
    'sample_rate': 44100,  # input sample rate - input audio will be resampled to this sample rate.
    'num_bands': 12,  # bands per octave (freq. factor 2)
    'num_channels': 1,  # input audio will be converted to mono
    'norm_filters': True,  # normalize triangular filters for log/log spectrogram to have equal area
}

# drum label names
# all arrays and lists containing instruments will always follow this index system, 0:KD (kick/bass drum),
# 1:SD (snare drum), 2: HH (hi-hat).
names_3_map = ['KD', 'SD', 'HH']
num_3_drum_notes = len(names_3_map)
"""

# base directory for all data paths: the current working directory
PATH = os.getcwd()

# disabled (string literal): Colab override of PATH
"""
if IN_COLAB:
  PATH = os.path.join(COLAB_DRIVE_BASE, 'My Drive/Colab Notebooks')
"""

# disabled (string literal): data, cache and model paths of the drum dataset
"""
DATA_PATH = os.path.join(PATH, 'data/drums_simple')  # change this value if you copied the dataset somewhere else!
ANNOTATIONS_PATH = os.path.join(DATA_PATH, 'annotations')
SAMPLE_ANNOTATIONS_PATH = os.path.join(DATA_PATH, 'sample_annotations')
AUDIO_PATH = os.path.join(DATA_PATH, 'audio')
SAMPLES_PATH = os.path.join(DATA_PATH, 'samples')
CACHE_PATH = os.path.join(DATA_PATH, 'feat_cache')
if not os.path.exists(CACHE_PATH):
    os.makedirs(CACHE_PATH)
MODEL_PATH = os.path.join(DATA_PATH, 'models')
if not os.path.exists(MODEL_PATH):
    os.makedirs(MODEL_PATH)
CNN_MODEL_NAME = 'cnn_model'
"""

# disabled (string literal): dataset info and plotting switches of the drum dataset
"""
# some info about our data
NUM_KITS = 4  # we have 4 different drum kits
NUM_TRACKS = 4  # and 4 tracks per kit
FPS = SETTINGS['fps']  # shorthand to the FPS we use for our spectrogram
RANK = num_3_drum_notes  # we use three instruments

# turn on / off plotting (for debugging)
plot = False
plot_len = 400
"""

# use GPU for NN training?
# NOTE(review): hard-coded to True; the code consuming this flag is outside
# this view - confirm it also checks that CUDA is actually available.
g_use_cuda = True

# seed for RNG for reproducible results
# NOTE(review): the seed is only stored here; no RNG is seeded in this cell.
seed = 12345
print('done')


### Extra imports and constants

In [None]:
import librosa

######## ADJUST DATA PATHS ACCORDING TO YOUR LOCAL CONFIGURATION ########
# Root folder of the part-1 dataset, resolved relative to PATH.
DATA_PATH_1 = os.path.join(PATH, 'data/part_1')
# NOTE(review): despite the '.zip' suffix this path is handed to search_files()
# like a directory further down - confirm it is an extracted folder.
AUDIO_PATH_1 = os.path.join(DATA_PATH_1, 'mp3.zip')
ANNOTATIONS_PATH_1 = os.path.join(DATA_PATH_1, 'annotations_final.csv')
META_DATA_PATH_1 = os.path.join(DATA_PATH_1, 'clip_info_final.csv')

# Folders for cached features and trained models. exist_ok=True creates them
# when missing without the race-prone "check, then create" sequence.
CACHE_PATH_1 = os.path.join(DATA_PATH_1, 'feat_cache')
os.makedirs(CACHE_PATH_1, exist_ok=True)
MODEL_PATH_1 = os.path.join(DATA_PATH_1, 'models')
os.makedirs(MODEL_PATH_1, exist_ok=True)

CNN_MODEL_NAME = 'cnn_model'

### Helper Functions

In [None]:
def _strip_double_quotes(text):
    """Return *text* with every double-quote character removed."""
    return text.replace("\"", "")


# Element-wise version of the helper above: accepts an array of strings and
# returns an array of the same shape with all double quotes stripped.
replace = np.vectorize(_strip_double_quotes)

### Load audio, annotations and metadata

In [None]:
# Collect all '.mp3' files below AUDIO_PATH_1 (recursion depth 1).
audio_files = search_files(AUDIO_PATH_1, '.mp3', recursion_depth=1)

#print(len(audio_files))

# librosa can't load these files for some reason, so they are removed by position.
# NOTE(review): the indices are hard-coded and the deletions run in sequence, so
# each index refers to the list as it stands after the earlier deletions. They
# are only valid for the exact file listing they were determined on - confirm
# against the file names given in the comments before reusing on another copy.
# norine_braun-now_and_zen-08-gently-117-146.mp3
del audio_files[10687]
# jacob_heringman-josquin_des_prez_lute_settings-19-gintzler__pater_noster-204-233.mp3
del audio_files[12821]
# american_baroque-dances_and_suites_of_rameau_and_couperin-26-loracle_suite_in_d_from_les_fetes_dhebe_rameau-0-29
del audio_files[13701]

#print(len(audio_files))

# Both CSV files are tab-separated and read as raw strings (header row included).
annotations = np.genfromtxt(ANNOTATIONS_PATH_1, dtype=str, delimiter='\t')
meta_data = np.genfromtxt(META_DATA_PATH_1, dtype=str, delimiter='\t')

### LogMelSpectrogram from Music Auto Tagging (+ caching from Ex2)

In [None]:
def compute_melgram(audio_path):
    """Compute a decibel-scaled mel-spectrogram for one audio file.

    The signal is loaded at 12 kHz, brought to a fixed length of
    ``int(29.12 * 12000)`` samples (zero-padded at the end when too short,
    centre-cropped when too long), turned into a 96-band mel spectrogram
    (FFT size 512, hop length 256) and converted with ``power_to_db``.

    Parameters
    ----------
    audio_path : str
        Path of the audio file; it is passed on to ``librosa.load``.
        More info: http://librosa.github.io/librosa/generated/librosa.core.load.html#librosa.core.load

    Returns
    -------
    numpy.ndarray
        The spectrogram with two singleton axes prepended. The intended shape
        is (1, 1, 96, 1366), i.e. 96 mel bins and 1366 time frames.
    """
    # mel-spectrogram parameters
    sample_rate = 12000
    fft_size = 512
    mel_bands = 96
    hop_length = 256
    duration = 29.12  # chosen to end up with 1366 frames

    signal, _ = librosa.load(audio_path, sr=sample_rate)  # whole signal
    actual_len = signal.shape[0]
    target_len = int(duration * sample_rate)

    if actual_len < target_len:
        # too short: append zeros up to the target length
        padding = np.zeros((target_len - actual_len,))
        signal = np.hstack((signal, padding))
    elif actual_len > target_len:
        # too long: keep the centred window of target_len samples
        start = int((actual_len - target_len) / 2)
        stop = int((actual_len + target_len) / 2)
        signal = signal[start:stop]

    mel_power = librosa.feature.melspectrogram(
        y=signal,
        sr=sample_rate,
        hop_length=hop_length,
        n_fft=fft_size,
        n_mels=mel_bands,
    )
    log_mel = librosa.core.power_to_db(mel_power)

    # prepend two singleton axes in front of (mel bins, frames)
    return log_mel[np.newaxis, np.newaxis, :]

def init_features(files, cache=True, cache_ext='.cache.npy', **kwargs):
    """
    Compute the mel-spectrogram features of the given audio files, or load
    them from the feature cache.

    Parameters
    ----------
    files : list
        List with audio file names.
    cache : bool, optional
        If True, load a feature from its cache file when that file exists
        and save newly computed features to the cache.
    cache_ext : str, optional
        Extension appended to the audio file's base name to build the name of
        its cache file inside CACHE_PATH_1.
    kwargs : dict, optional
        Accepted for interface compatibility; currently not forwarded to the
        feature computation.

    Returns
    -------
    feature_list : list
        List containing the computed/loaded features, in the order of `files`.

    """
    feature_list = []
    for count, audio_file in enumerate(files, start=1):
        # the cache file is named after the audio file's base name only
        base_name = os.path.splitext(os.path.basename(audio_file))[0]
        cache_file = os.path.join(CACHE_PATH_1, base_name + cache_ext)

        if cache and os.path.exists(cache_file):
            feat = np.load(cache_file)
        else:
            feat = compute_melgram(audio_file)
            if cache:
                np.save(cache_file, feat)
        feature_list.append(feat)

        # progress report every 5000 files
        if count % 5000 == 0:
            print('computed', count, 'features...')
    return feature_list

In [None]:
# Compute the features of all remaining audio files, or load them from the cache.
features = init_features(audio_files)

### Only keep the top 50 tags

In [None]:
def filter_top_50_tags(annotations, num_tags=50):
    """
    Return the annotations restricted to the most frequent tags.

    The first row of `annotations` is treated as the header, the first and
    the last column are always kept, and every column in between is treated
    as a tag column holding (optionally double-quoted) integer strings.

    Exactly `num_tags` tag columns are kept (or all of them when there are
    fewer). When several tags share the same count at the cut-off, the ones
    that come first in the table win, so the result never holds more than
    `num_tags` tags.

    Parameters
    ----------
    annotations : numpy.ndarray
        2-D string array with a header row.
    num_tags : int, optional
        Number of most frequent tags to keep (default 50).

    Returns
    -------
    numpy.ndarray
        New array with all rows of `annotations` and the columns
        [first column, selected tag columns in original order, last column].

    Raises
    ------
    ValueError
        If `num_tags` is negative.
    """
    if num_tags < 0:
        raise ValueError('num_tags must be non-negative')

    last_col = annotations.shape[1] - 1

    # tag values without the header row and without first / last column
    tag_values = annotations[1:, 1:last_col]
    tag_counts = np.char.replace(tag_values, '"', '').astype(int).sum(axis=0)

    # stable sort on the negated counts: descending by frequency, ties are
    # resolved in favour of the earlier column
    ranked = np.argsort(-tag_counts, kind='stable')
    # restore the original column order; +1 skips the first column
    kept_tag_cols = np.sort(ranked[:num_tags]) + 1

    cols = [0] + kept_tag_cols.tolist() + [last_col]
    # fancy indexing returns a new array, so the input is left untouched
    return annotations[:, cols]

In [None]:
# Annotation table reduced to the most frequent tag columns.
top_annotations = filter_top_50_tags(annotations)

### Create Track Title Dictionary

In [None]:
def compute_title_dictionary(meta_data):
    """
    Build a lookup table from the metadata table.

    The header row is skipped and double quotes are stripped; each remaining
    row contributes one entry that maps its value in column 9 to its value in
    column 2. If a key occurs more than once, the last row wins.

    NOTE(review): column 2 is presumably the track title and column 9 the
    mp3 path of the clip - confirm against the metadata file.
    """
    # keep only columns 2 and 9 of the data rows, without quotes
    selected = meta_data.copy()[1:, [2, 9]]
    value_key_pairs = np.asarray(replace(selected))

    return {key: value for value, key in value_key_pairs}

In [None]:
# Lookup table used by group_audio() to find the title belonging to a clip.
meta_dict = compute_title_dictionary(meta_data)

### Create Train / Validation / Test splits

In [None]:
def group_audio(audio_files):
    """
    Group consecutive audio files that share the same title.

    The title of a file is looked up in the module-level `meta_dict` using
    the part of the file name that follows ``AUDIO_PATH_1 + '/'``. Only
    neighbouring files are compared, so the input order determines the
    groups.

    Parameters
    ----------
    audio_files : list
        List with audio file names located below AUDIO_PATH_1.

    Returns
    -------
    grouped_audio : list
        List of lists; each inner list holds one run of consecutive files
        with equal title. An empty input yields an empty list.

    Raises
    ------
    KeyError
        If a file has no entry in `meta_dict`.
    IndexError
        If a file name does not contain ``AUDIO_PATH_1 + '/'``.
    """
    from itertools import groupby

    def title_of(audio_file):
        # key in meta_dict is the path relative to the audio folder
        relative_path = audio_file.split(AUDIO_PATH_1 + '/')[1]
        return meta_dict[relative_path]

    # groupby starts a new group whenever the key changes between neighbours
    # and evaluates the key exactly once per file
    return [list(clips) for _, clips in groupby(audio_files, key=title_of)]

In [None]:
# Clips grouped by track title, so that clips of one track can be kept together.
grouped_audio = group_audio(audio_files)
#grouped_audio[100]