# EX 2 Experiment!!!

# 1. Dependencies, imports, and global variables

In [None]:
# let's import everything we will need first...
# some generic stuff, numpy will help us with math!
import os
import numpy as np
import time

# filters, might be useful for separate and detect
from scipy.signal import butter, freqz
from scipy.ndimage.filters import maximum_filter, uniform_filter

# classifier for segment and classify method
from sklearn.neighbors import KNeighborsClassifier

# madmom audio processing stuff and evaluation
import madmom
from madmom.audio.spectrogram import LogarithmicFilteredSpectrogram
from madmom.audio import Signal
from madmom.features.onsets import OnsetPeakPickingProcessor
from madmom.evaluation import OnsetEvaluation, OnsetSumEvaluation
from madmom.features import CNNOnsetProcessor
from madmom.utils import search_files

# pytorch, deep learning library
import torch
import torch.nn as nn
import torch.nn.functional as torch_func
import torch.optim as optim
from torch.utils.data import Dataset as Dataset

# plotting library for visualization for debugging
import matplotlib.pyplot as plt
plt.rcParams.update({'pgf.rcfonts': False})

# directory at which Google Drive is mounted when the notebook runs in Google Colab
COLAB_DRIVE_BASE = "/content/g-drive"
import sys
# the session is treated as a Colab session when the google.colab module has already been loaded
IN_COLAB = 'google.colab' in sys.modules

# if in colab, mount gdrive
if IN_COLAB:
  # imported lazily because the module is only needed (and only looked for) on Colab
  from google.colab import drive
  print('trying to mount google drive...')
  # NOTE(review): force_remount=True presumably replaces an already existing mount - confirm in the Colab docs
  drive.mount(COLAB_DRIVE_BASE, force_remount=True)

#
# some global parameter settings we will need along the way
#
# machine epsilon of float32; add it to denominators to avoid divisions by zero
EPSILON = np.finfo(np.float32).eps

# keyword arguments used for audio loading and spectrogram (feature) calculation
SETTINGS = dict(
    fps=100,  # frames per second of the resulting spectrograms
    fmin=30,  # lowest frequency of the spectrogram
    fmax=15000,  # highest frequency of the spectrogram
    frame_size=2048,  # frame size used for the spectrogram
    sample_rate=44100,  # input audio is resampled to this sample rate
    num_bands=12,  # bands per octave (frequency factor 2)
    num_channels=1,  # input audio is converted to mono
    norm_filters=True,  # normalize the triangular filters of the log/log spectrogram to equal area
)

# drum label names
# every array or list holding instruments follows this index order:
# 0: KD (kick/bass drum), 1: SD (snare drum), 2: HH (hi-hat).
names_3_map = ['KD', 'SD', 'HH']
num_3_drum_notes = len(names_3_map)

# paths to our small example dataset
# On Colab the notebook folder on the mounted Google Drive is used as base directory,
# everywhere else the current working directory.
PATH = os.getcwd()

if IN_COLAB:
  PATH = os.path.join(COLAB_DRIVE_BASE, 'My Drive/Colab Notebooks')

DATA_PATH = os.path.join(PATH, 'data/drums_simple')  # change this value if you copied the dataset somewhere else!
ANNOTATIONS_PATH = os.path.join(DATA_PATH, 'annotations')
SAMPLE_ANNOTATIONS_PATH = os.path.join(DATA_PATH, 'sample_annotations')
AUDIO_PATH = os.path.join(DATA_PATH, 'audio')
SAMPLES_PATH = os.path.join(DATA_PATH, 'samples')

# directories written to by this notebook; exist_ok avoids the check-then-create race
# of a separate os.path.exists() test and keeps re-running the cell harmless
CACHE_PATH = os.path.join(DATA_PATH, 'feat_cache')
os.makedirs(CACHE_PATH, exist_ok=True)
MODEL_PATH = os.path.join(DATA_PATH, 'models')
os.makedirs(MODEL_PATH, exist_ok=True)
CNN_MODEL_NAME = 'cnn_model'

# some info about our data
# (NUM_KITS * NUM_TRACKS is later used as the number of tracks taken from the loaded audio for evaluation)
NUM_KITS = 4  # we have 4 different drum kits
NUM_TRACKS = 4  # and 4 tracks per kit
FPS = SETTINGS['fps']  # shorthand to the FPS we use for our spectrogram
RANK = num_3_drum_notes  # we use three instruments

# turn on / off plotting (for debugging)
plot = False
plot_len = 400  # presumably the number of frames to show in debug plots - TODO confirm against its users

# use GPU for NN training?
# NOTE(review): hard-coded; nothing in the visible part of the file checks that CUDA is actually available
g_use_cuda = True

# seed for RNG for reproducible results
# NOTE(review): only defined here; the visible part of the file never passes it to a random number generator
seed = 12345
print('done')


# 2. Helper functions

In [None]:


#
# some helper functions to handle data from our example dataset
#
def step_diff(array, step):
    """
    Calculates a 1st order difference between rows that are `step` rows apart.

    Note the sign convention: the later row is subtracted from the earlier one, i.e.
    ``result[i] = array[i] - array[i + step]``. An increase over time therefore shows
    up as a negative value.

    Parameters
    ----------
    array : np.array
        Input array to calculate the 1st order difference.

    step : int
        Number of steps for offset for difference calculation. Must be at least 1.

    Returns
    -------
    difference : np.array
        Array containing the 1st order difference. Note that the number of rows will be steps less than the input's.

    Raises
    ------
    ValueError
        If `step` is smaller than 1. Such values would otherwise produce mismatching
        or meaningless slices (``array[:-0]`` is empty, negative steps swap the ends).
    """
    if step < 1:
        raise ValueError('step must be a positive integer, got %r' % (step,))
    later = array[step:]
    earlier = array[:-step]
    return earlier - later


def load_audio(audio_file_list):
    """
    Read every given audio file into a madmom ``Signal``.

    The global ``SETTINGS`` are forwarded as keyword arguments to each ``Signal``.

    Parameters
    ----------
    audio_file_list : list
        Names of the audio files to load.

    Returns
    -------
    audio_list : list
        One ``Signal`` per entry of `audio_file_list`, in the same order.

    """
    return [Signal(file_name, **SETTINGS) for file_name in audio_file_list]


def compute_feature(file, **kwargs):
    """
    Build the logarithmically filtered spectrogram that serves as feature.

    Parameters
    ----------
    file : str
        Audio file name. The callers in this file also pass audio data (signal excerpts
        and numpy arrays) instead of a file name.
    kwargs : dict, optional
        Keyword arguments handed on unchanged to ``LogarithmicFilteredSpectrogram``.

    Returns
    -------
    feature : numpy array
        The computed spectrogram.

    """
    spectrogram = LogarithmicFilteredSpectrogram(file, **kwargs)
    return spectrogram


def create_features(files, cache=True, cache_ext='.cache.npy', **kwargs):
    """
    Return one feature per audio file, reading it from the cache when possible.

    The cache file of an audio file lives in ``CACHE_PATH`` and is named after the audio
    file's base name (directory and extension stripped) followed by `cache_ext`.

    Parameters
    ----------
    files : list
        Audio file names.
    cache : bool, optional
        If true, existing cache files are loaded instead of recomputing the feature and
        freshly computed features are written to the cache.
    cache_ext : str, optional
        Suffix appended to the base name to form the cache file name.
    kwargs : dict, optional
        Keyword arguments forwarded to the feature computation.

    Returns
    -------
    feature_list : list
        Features in the order of `files`.

    """

    def feature_for(audio_file):
        # the cache key is the bare file name without directory and extension
        base_name = os.path.splitext(os.path.basename(audio_file))[0]
        cache_file = os.path.join(CACHE_PATH, base_name + cache_ext)

        if cache and os.path.exists(cache_file):
            cached = np.load(cache_file)
            print('successfully loaded cached file:', cache_file)
            return cached

        computed = compute_feature(audio_file, **kwargs)
        if cache:
            np.save(cache_file, computed)
            print('successfully stored cache for file:', audio_file)
        return computed

    return [feature_for(audio_file) for audio_file in files]


def load_annotations(files):
    """
    Read the note annotations of every given file with ``madmom.io.load_notes``.

    Parameters
    ----------
    files : list
        Annotation file names.

    Returns
    -------
    annotation_list : list
        Loaded annotations, one entry per file and in the same order as `files`.

    """
    return [madmom.io.load_notes(file_name) for file_name in files]


def compute_target_array_from_times(times, fps, num_frames, num_targets):
    """
    creates a numpy array with targets for neural network training
    :param times: list
    list of annotations for which the target should be 1. each entry holds the time in seconds at
    index 0 and the target (instrument) index at index 1.
    :param fps:
    sampling frequency of target array
    :param num_frames:
    total number of frames (all entries in times must fit into the total number of frames).
    :param num_targets:
    total number of targets (all entries in times must fit into the total number of labels).
    :return:
    array of shape (num_frames, num_targets) which is 1 at every annotated (frame, target) position
    and 0 elsewhere. entries outside of the valid frame or target range are dropped.
    """
    if len(times) > 0:
        maxima = np.max(times, 0)
        # a frame index equal to num_frames is already out of range, hence '>='
        if maxima[0] * fps >= num_frames:
            print("Maximum time is larger than number of samples - cutting times.")
        if maxima[1] >= num_targets:
            print("Maximum label index is larger than num_targets - cutting labels.")

    new_targets = np.zeros((num_frames, num_targets))
    for time_entry in times:
        # the frame index is obtained by truncation, as before
        time_idx = int(time_entry[0] * fps)
        inst_idx = int(time_entry[1])
        # both indices need a lower bound as well: a negative index would not fail but
        # silently wrap around and mark a frame/target counted from the end of the array
        if 0 <= inst_idx < num_targets and 0 <= time_idx < num_frames:
            new_targets[time_idx, inst_idx] = 1

    return new_targets


def create_targets(annotation_list, feature_list, fps=FPS, num_classes=3):
    """
    Turn annotations into frame-wise target arrays.

    Annotations and features are paired up by position; each feature only contributes its
    length, which becomes the number of frames of the matching target array.

    Parameters
    ----------
    annotation_list : list
        Annotations, one entry per track.
    feature_list : list
        Features of the same tracks (used to determine the number of frames).
    fps : float
        Frames per second of the target arrays.
    num_classes : int
        Number of target columns (instruments).

    Returns
    -------
    target_list : list
        Target arrays for NN training, one per annotation/feature pair.

    """
    return [
        compute_target_array_from_times(annotation, fps, len(feature), num_classes)
        for annotation, feature in zip(annotation_list, feature_list)
    ]


def plot_peak_picking(onset_function, pre_avg=0.05, post_avg=0.05, pre_max=0.02, post_max=0.02, combine=0.02,
                      thresh=0.2, smooth=0.0, plot_frames=1000):
    """
    helper function which visualizes the peak picking parameters, use to adapt peak picking settings if necessary.
    for each of the three instruments one subplot shows the activation function, the picked peaks (red dots),
    the moving average (green) and the moving maximum (yellow).
    :param onset_function: activation functions, one column per instrument (the first three columns are used)
    :param pre_avg: length of the moving average window before the current frame, in seconds
    :param post_avg: length of the moving average window after the current frame, in seconds
    :param pre_max: length of the moving maximum window before the current frame, in seconds
    :param post_max: length of the moving maximum window after the current frame, in seconds
    :param combine: passed to the peak picker as ``combine``
    :param thresh: passed to the peak picker as ``threshold``
    :param smooth: passed to the peak picker as ``smooth``
    :param plot_frames: number of frames (from the start) which are plotted
    :return:
    """
    num_inst = 3

    peak_picker = OnsetPeakPickingProcessor(threshold=thresh, smooth=smooth, pre_avg=pre_avg,
                                            post_avg=post_avg, pre_max=pre_max, post_max=post_max,
                                            combine=combine, fps=FPS)

    # Window lengths in whole frames. The filter sizes have to be integers; rounding each
    # length first avoids float artefacts such as (0.05 + 0.05) * 100 + 1 == 11.000000000000002.
    # NOTE(review): intended to match the frame rounding the peak picker applies internally -
    # confirm against the madmom documentation.
    pre_avg_frames = int(round(pre_avg * FPS))
    post_avg_frames = int(round(post_avg * FPS))
    pre_max_frames = int(round(pre_max * FPS))
    post_max_frames = int(round(post_max * FPS))

    avg_filter_size = pre_avg_frames + post_avg_frames + 1
    avg_origin = (pre_avg_frames - post_avg_frames) // 2
    max_filter_size = pre_max_frames + post_max_frames + 1
    max_origin = (pre_max_frames - post_max_frames) // 2

    for inst in range(num_inst):
        # peak picking works on the complete activation function, only the plot is truncated
        detections = np.asarray(peak_picker.process(onset_function[:, inst]))

        plt.subplot(num_inst, 1, inst + 1)
        activations = onset_function[:plot_frames, inst]

        mov_avg = uniform_filter(activations, avg_filter_size, mode='constant', origin=avg_origin)
        mov_max = maximum_filter(activations, max_filter_size, mode='constant', origin=max_origin)

        # detections are times in seconds; round (not truncate) to get the frame they belong to
        peak_frames = np.round(detections * FPS).astype(int)
        peak_frames = peak_frames[peak_frames < plot_frames]

        plt.plot(activations)
        plt.plot(peak_frames, onset_function[peak_frames, inst], 'ro')
        plt.plot(mov_avg, 'g')
        plt.plot(mov_max, 'y')
    plt.show()


def plot_activation_functions(spectrogram, activation_functions, templates=None):
    """
    debugging helper: shows the spectrogram on top, optionally the templates in the middle, and the
    activation functions of the three instruments at the bottom of one figure.
    the activation functions of instrument 1 and 2 are shifted up by 1 and 2 so the curves do not overlap.
    :param spectrogram: spectrogram to display (it is transposed for plotting)
    :param activation_functions: array with one column per instrument (columns 0, 1 and 2 are plotted)
    :param templates: optional array which is displayed as an image between the two other plots
    :return:
    """
    has_templates = templates is not None
    num_plots = 3 if has_templates else 2

    plt.figure()

    # top row: the spectrogram
    plt.subplot(num_plots, 1, 1)
    plt.imshow(spectrogram.T, aspect='auto', origin='lower')

    # middle row (only if given): the templates
    if has_templates:
        plt.subplot(num_plots, 1, 2)
        plt.imshow(templates, aspect='auto', origin='lower')

    # bottom row: the activation functions, stacked with an offset of one per instrument
    plt.subplot(num_plots, 1, num_plots)
    plt.plot(activation_functions[:, 0])
    for column in (1, 2):
        plt.plot(activation_functions[:, column] + column)

    plt.show()
    
# progress marker: printed once all helper definitions above have been executed
print('done')



# 3. Dataset

In [None]:


#
# load our example dataset
#

# load audio and calculate features
audio_files = search_files(AUDIO_PATH, '.wav')
audio_files += search_files(AUDIO_PATH, '.flac')
audio = load_audio(audio_files)
features = create_features(audio_files, **SETTINGS)

# the drum samples (single instrument hits) are loaded the same way as the tracks
sample_files = search_files(SAMPLES_PATH, '.wav')
sample_files += search_files(SAMPLES_PATH, '.flac')
sample_audio = load_audio(sample_files)
sample_features = create_features(sample_files, **SETTINGS)


# load annotations and create targets
annotation_files = search_files(ANNOTATIONS_PATH, '.txt')
annotations = load_annotations(annotation_files)
targets = create_targets(annotations, features)

sample_annotation_files = search_files(SAMPLE_ANNOTATIONS_PATH, '.txt')
sample_annotations = load_annotations(sample_annotation_files)
# the targets of the samples have to be as long as the spectrograms of the samples
# (not as long as the spectrograms of the tracks, which were used here before)
sample_targets = create_targets(sample_annotations, sample_features)
sample_times = [[0, 8], [11, 19], [21, 29]]  # these are the times within which the onsets for each instrument
                                              # are in the audio (in spec seconds) detailed annotations exist too!

fs = SETTINGS['sample_rate']
# the first NUM_TRACKS * NUM_KITS loaded tracks are the ones the methods below are evaluated on
test_audio = audio[:(NUM_TRACKS * NUM_KITS)]

print('done')


# 4. Separate and detect approach

In [None]:
def _band_limited_flux(spectrogram, band):
    """Return the positive spectral flux per frame, using only the spectrogram bins selected by `band`."""
    band_spec = np.asarray(spectrogram)[:, band]
    if band_spec.shape[0] == 0:
        return np.zeros(0)
    # the first frame is compared against silence (all zeros), every other frame against its predecessor
    frame_diff = np.concatenate((band_spec[:1], np.diff(band_spec, axis=0)))
    # only increases in energy indicate onsets, so decreases are discarded before summing over the bins
    return np.maximum(frame_diff, 0).sum(axis=1)


def separate_and_detect():
    """
    this function runs the main loop over our dataset using a simple separate and detect approach

    To separate the individual drum instruments you can either use bandpass filters, see
        https://scipy-cookbook.readthedocs.io/items/ButterworthBandpass.html
    You can look at the spectrogram to decide where you want to place the cutoff-frequencies.
    For the kits used in our dataset, these frequencies might work ok:
    low 0-100 Hz
    high 10k - 22k Hz
    mid  100-10k Hz

    or simply calculate a spectrogram and only use the relevant frequency bands, i.e. set the rest to 0
    (recommended, will usually work better). This is what is done here, with the following bands:
    low (KD): bands 0-4
    mid (SD): bands 6-30
    high (HH): bands 50-end

    For each band the onset detection function is the positive spectral flux (sum over the band of all
    increases from one frame to the next); peaks of it are picked and evaluated against the annotations.

    Note: this doesn't work too well, aim for around 50% f-measure; don't spend too much time on this approach.

    The overall f-measure is printed.
    :return:
    """
    # spectrogram bins kept for each instrument, indexed like all instrument lists: KD, SD, HH
    instrument_bands = (slice(0, 5), slice(6, 31), slice(50, None))

    # peak picking settings, use these settings, only play around with them once you have a working system.
    ############ NOTE: 2.3 seems like a reasonable value for the threshold ###############################
    peak_picking_sep = OnsetPeakPickingProcessor(threshold=2.3, smooth=0.0, combine=0.04, delay=0.0, fps=FPS,
                                                 pitch_offset=0, pre_max=0.02, post_max=0.02)

    results_sep = []
    # iterate over tracks
    for idx in range(len(test_audio)):
        spectrogram = features[idx]
        inst_eval = []
        # for each instrument
        for inst, band in enumerate(instrument_bands):
            # onset detection function of the current instrument's frequency band
            onset_activations = _band_limited_flux(spectrogram, band)

            ##################### INFO ####################
            print('spectrogram:', idx, 'instrument:', inst)
            ###############################################

            # peak picking and onset evaluation
            filt_detections = peak_picking_sep.process(onset_activations)
            reference = [event[0] for event in annotations[idx] if event[1] == inst]
            inst_eval.append(OnsetEvaluation(filt_detections, reference, window=0.05, combine=0.025))

        # evaluation over all instruments for the current track
        results_sep.append(OnsetSumEvaluation(inst_eval))

    # evaluation over all tracks, output results
    overall_results_sep = OnsetSumEvaluation(results_sep)
    print('Separate and Detect f-measure: %f' % overall_results_sep.fmeasure)

## 4.1. Run method

In [None]:
# run the separate and detect experiment on the test tracks; it prints the resulting f-measure
separate_and_detect()


In [None]:
# CUSTOM
# Disabled experiment: an alternative onset detection function built with step_diff.
# The code is wrapped in a string literal, so it is never executed; the cell only prints an empty line.
# NOTE(review): it cannot be enabled here as is - filt_spec is only assigned inside separate_and_detect.
# NOTE(review): step_diff returns earlier-minus-later values, so keeping v > 0 keeps decreases in
# energy rather than increases - check the sign before reusing this.

######################### ODF CALCULATION flux 1 (uses step_diff) #########
'''
filt_spec_T = filt_spec.transpose()

diffs = []
for i, bin in enumerate(filt_spec_T):
    diff = step_diff(bin, 1)
    diff = [v if v > 0 else 0 for v in diff]
    diffs.append(diff)

diffs = np.array(diffs)
diffs_T = diffs.transpose()

odf_1 = []
for i, frame in enumerate(diffs_T):
    odf_1.append(sum(frame))
'''
print('')

# 5. Segment and classify approach

In [None]:
def _mean_spectrum(audio_data):
    """Return the spectrogram of `audio_data` summed over time and scaled to a maximum of one."""
    summed = np.asarray(np.sum(compute_feature(audio_data, **SETTINGS), axis=0))
    # EPSILON keeps the normalization defined for completely silent input
    return summed / max(np.max(summed), EPSILON)


def _mix_hits(audio_data, starts, gains, length):
    """Return the weighted sum of the `length` samples long excerpts of `audio_data` beginning at `starts`."""
    samples = np.asarray(audio_data)
    # accumulate in float: adding several hits in an integer sample type could wrap around
    mix = sum(samples[start:start + length].astype(np.float64) * gain
              for start, gain in zip(starts, gains))
    if np.issubdtype(samples.dtype, np.integer):
        limits = np.iinfo(samples.dtype)
        mix = np.clip(mix, limits.min, limits.max)
    return mix.astype(samples.dtype)


def segment_and_classify():
    """
    this function runs the main loop over our dataset using a simple segment and classify approach

    a KNN classifier is trained on mean spectrograms of (combinations of) the single drum hits from the
    sample recordings plus some noise examples. for every test track onsets are detected, the audio
    following each onset is classified, and the predicted instrument combination is translated into
    onsets for the individual instruments. the overall f-measure is printed.
    :return:
    """

    # train the classifier
    #
    # we first have to train a classifier which is able to classify the different instruments and also
    # combined onsets of them. Since we only have single instrument onsets as sample, we first build a dictionary
    # of instrument hit combinations.
    # We will then use a simple KNN-classifier; you can experiment with other classifiers after the KNN version
    # works, if you want (e.g. try SVMs from sklearn)

    # the hits are separated by about one second:
    hits_len = int(1*fs)

    # list of our features and labels for the classes used to train our classifier
    knn_feats = []
    knn_labels = []

    # onset combinations we want to use. 1..use instrument in combo 0..don't use
    # indices are: KD, SD, HH
    combos = [[1, 0, 0], [0, 1, 0], [0, 0, 1], [1, 1, 0], [0, 1, 1], [1, 0, 1], [1, 1, 1]]
    # label of the additional "no instrument" class (7)
    no_inst_label = len(combos)

    # iterate over kits and create combinations:
    for kit_idx in range(NUM_KITS):
        cur_audio = sample_audio[kit_idx]
        cur_annot = np.asarray(sample_annotations[kit_idx][:, 0]*fs, dtype=int)
        # create combinations:
        # there are four onsets per instrument, with different loudness.
        # we will only combine onsets with the same loudness, but use
        # every loudness value for our classifier.
        for onset_idx in range(4):
            # sample positions of the KD, SD and HH hit with the current loudness
            starts = (cur_annot[onset_idx], cur_annot[onset_idx + 4], cur_annot[onset_idx + 8])

            for combo_idx, combo in enumerate(combos):
                # add audio to create combined onset
                combo_audio = _mix_hits(cur_audio, starts, combo, hits_len)

                # calculate features (mean spectrogram), add features and add label
                knn_feats.append(_mean_spectrum(combo_audio))
                knn_labels.append(combo_idx)

    # for the no instrument class (7) use noise with five different levels of volume;
    # the generator is seeded with the global seed so the training data is reproducible
    rng = np.random.RandomState(seed)
    for idx in range(5):
        factor = idx*0.2+0.1
        noise_audio = rng.randint(low=int(-32768*factor), high=int(32767*factor),
                                  size=(hits_len,), dtype=np.int16)
        knn_feats.append(_mean_spectrum(noise_audio))
        knn_labels.append(no_inst_label)

    # Train classifier: KNN with 5 neighbours
    classifier = KNeighborsClassifier(n_neighbors=5)
    classifier.fit(np.asarray(knn_feats), np.asarray(knn_labels))

    # onset detector and peak picking from madmom
    onset_detector = CNNOnsetProcessor()
    # NOTE(review): fps=100 is assumed to be the frame rate of the CNNOnsetProcessor activations, and
    # threshold/combine are starting values - confirm against the madmom documentation and tune.
    peak_picker = OnsetPeakPickingProcessor(threshold=0.5, smooth=0.0, combine=0.03, fps=100)

    # results list
    results_class = []
    # iterate over dataset
    for idx, data in enumerate(test_audio):
        # Detect onsets (times in seconds).
        onsets = peak_picker.process(onset_detector(data))

        # Calculate features for onsets, the same way as for the training data:
        # mean spectrogram of the audio starting at the onset.
        onset_feats = []
        for onset in onsets:
            start = int(onset * fs)
            onset_feats.append(_mean_spectrum(data[start:start + hits_len]))

        # Predict class labels for onsets using the trained KNN classifier
        # (the classifier cannot be called without any sample).
        labels = classifier.predict(np.asarray(onset_feats)) if onset_feats else []

        # Translate labels back to instrument combinations:
        # inst_det = [[<times for KD>], [<times for SD>], [<times for HH>]]
        inst_det = [[], [], []]
        for onset, label in zip(onsets, labels):
            if label == no_inst_label:
                continue
            for inst, used in enumerate(combos[label]):
                if used:
                    inst_det[inst].append(onset)

        # Evaluate onsets for this track.
        inst_eval = [OnsetEvaluation(inst_det[inst], [event[0] for event in annotations[idx] if event[1] == inst],
                                     window=0.05, combine=0.025) for inst in range(3)]
        results_class.append(OnsetSumEvaluation(inst_eval))

    # Evaluate over all tracks and print results.
    overall_results_class = OnsetSumEvaluation(results_class)
    print('Segment and Classify f-measure: %f' % overall_results_class.fmeasure)

## 5.1. Run method

In [None]:
# run the segment and classify experiment on the test tracks; it prints the resulting f-measure
segment_and_classify()
