# Experiment Here!

In [None]:
import os
import pickle

import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline

import warnings
warnings.filterwarnings("ignore")

import madmom
import librosa
import mir_eval

In [None]:
# Frame rate in frames per second; used as the default `frame_rate` of the
# processing functions and to derive the STFT hop size.
FPS = 100

In [None]:
from madmom.utils import search_files, match_file

# All '.flac' files that madmom's `search_files` finds under 'data/train'.
AUDIO_FILES = search_files('data/train', '.flac')

def find_audio_files(ann_files, audio_files, ann_suffix=None, audio_suffix='.flac'):
    """
    Find matching audio files.

    Each annotation file is matched against `audio_files` with madmom's
    `match_file`. Annotation files with no match or with more than one
    match are skipped.

    Parameters
    ----------
    ann_files : list
        List with annotation file names.
    audio_files : list
        List with audio file names to be matched
    ann_suffix : str, optional
        Suffix of the annotation files. If 'None'
        the suffix is inferred from the first annotation
        file and then reused for all remaining files.
    audio_suffix : str, optional
        Suffix of the audio files.

    Returns
    -------
    matched_files : list
        List of matched audio file (names).
    matched_indices : list
        List of matching indices in `audio_files`, i.e.
        ``audio_files[matched_indices[k]] == matched_files[k]``.

    """
    matched_files = []
    matched_indices = []
    for ann_file in ann_files:
        if ann_suffix is None:
            ann_suffix = os.path.splitext(ann_file)[1]
        matches = match_file(ann_file, audio_files,
                             ann_suffix, audio_suffix)
        # only unambiguous matches are usable
        if len(matches) != 1:
            continue
        matched_files.append(matches[0])
        # Record the position of the match in `audio_files` (not the position
        # of the annotation file), so the indices can be used to look up data
        # stored in the same order as `audio_files`.
        matched_indices.append(audio_files.index(matches[0]))
    return matched_files, matched_indices

In [None]:
# CUSTOM

# len(AUDIO_FILES)

# from scripts import utilities
# utilities.main()

# Pre-Processing

## Task: 1

In [None]:
# define additional constants
SR = 44100 # sampling rate
FRAME_SIZE = 2048 # number of samples per frame
HOP_SIZE = int(SR / FPS) # hop size depends on sampling rate and frame rate
NUM_BANDS = 40 # number of mel bins

def pre_process(filename, frame_size=2048, frame_rate=FPS, num_bands=40, **kwargs):
    """
    Pre-process the audio signal.

    The file is loaded at the module-level sampling rate `SR`, transformed
    with an STFT, reduced to a power spectrogram, mapped onto a Mel
    filterbank and finally converted to decibels.

    Parameters
    ----------
    filename : str
        File to be processed.
    frame_size : int
        Size of the frames.
    frame_rate : float
        Frame rate used for the STFT; the hop size is `SR / frame_rate`
        samples (truncated to an integer).
    num_bands : int
        Number of frequency bands for the Mel filterbank.
    kwargs : dict, optional
        Additional keyword arguments (currently unused).

    Returns
    -------
    spectrogram : numpy array
        Mel spectrogram in dB; callers in this file treat the first axis
        as bands and the second axis as frames.

    """
    # Derive the hop size from the requested frame rate instead of relying on
    # the module-level HOP_SIZE, so that `frame_rate` is actually honoured.
    hop_size = int(SR / frame_rate)

    # STEP 1: read in audio (resampled to SR by librosa)
    signal, _ = librosa.load(filename, sr=SR)

    # STEP 2,3: compute stft (default windowing function is Hann)
    stft = librosa.stft(y=signal, n_fft=frame_size, hop_length=hop_size)

    # STEP 4: discard phase info and square magnitudes
    power_spectrogram = np.abs(stft) ** 2

    # STEP 5: apply mel scaling
    mel_filterbank = librosa.filters.mel(sr=SR, n_fft=frame_size, n_mels=num_bands)
    mel_spectrogram = mel_filterbank.dot(power_spectrogram)

    # STEP 6: apply DB scaling
    spectrogram = librosa.power_to_db(mel_spectrogram)
    return spectrogram

In [None]:
# CUSTOM
from librosa.display import specshow

def test_pre_process():
    """Plot the pre-processed spectrogram of one training file as a visual check."""
    example_file = AUDIO_FILES[19]
    example_spec = pre_process(example_file, FRAME_SIZE, FPS, NUM_BANDS)

    plt.figure(figsize=(15, 5))
    specshow(
        example_spec,
        sr=SR,
        hop_length=HOP_SIZE,
        x_axis='time',
        y_axis='mel',
    )
    plt.colorbar(format='%+2.0f dB')
# test_pre_process()

In [None]:
# Cache of pre-processed spectrograms, one per entry of AUDIO_FILES and in
# the same order.
# Note: using this cache is not mandatory, but it avoids recomputing the same
#       features over and over again. The *_AUDIO_IDX lists can be used to
#       access the precomputed spectrograms by index.
SPECTROGRAMS = [
    pre_process(path, FRAME_SIZE, FPS, NUM_BANDS)  # params missing in tuwel
    for path in AUDIO_FILES
]

# Onset detection

In [None]:
# you are not required to use these predefined constants, but it is recommended
# Annotation files, the audio files matched to them (with their indices), the
# cached spectrograms selected through those indices, and the loaded onsets.
# NOTE(review): ONSET_ANNOTATIONS is loaded from *every* annotation file while
# ONSET_AUDIO only contains matched files; the asserts below are what
# guarantee that both lists have the same length.
ONSET_ANNOTATION_FILES = search_files('data/train', '.onsets')
ONSET_AUDIO_FILES, ONSET_AUDIO_IDX = find_audio_files(ONSET_ANNOTATION_FILES, AUDIO_FILES)
ONSET_AUDIO = [SPECTROGRAMS[i] for i in ONSET_AUDIO_IDX]
ONSET_ANNOTATIONS = [madmom.io.load_onsets(f) for f in ONSET_ANNOTATION_FILES]

# sanity checks: all four lists must contain the expected 321 entries
assert len(ONSET_ANNOTATION_FILES) == 321
assert len(ONSET_AUDIO_FILES) == 321
assert len(ONSET_AUDIO) == 321
assert len(ONSET_ANNOTATIONS) == 321

## Task: 2a

In [None]:
def onset_detection_function(spectrogram):
    """
    Compute an onset detection function.

    The ODF is the spectral flux: for every frame, the positive part of the
    difference to the previous frame is summed over all bands and divided by
    the number of bands of the given spectrogram.

    Parameters
    ----------
    spectrogram : numpy array
        Spectrogram, first axis bands, second axis frames.

    Returns
    -------
    odf : numpy array
        Onset detection function, one value per frame.

    """
    spec = np.asarray(spectrogram, dtype=float)

    # Frame-to-frame difference per band. The first frame has no predecessor
    # and is compared against zero, so its own values are kept as the diff.
    diff = spec.copy()
    diff[:, 1:] -= spec[:, :-1]

    # half-wave rectification: only increases in energy count as flux
    flux = np.maximum(diff, 0)

    # Normalize by the number of bands actually present in this spectrogram
    # (not by a module-level constant), so spectrograms with a different
    # number of bands are scaled consistently.
    odf = flux.sum(axis=0) / spec.shape[0]
    return odf

In [None]:
# CUSTOM
def odf_test():
    """Print the length of our ODF next to the length of librosa's onset strength."""
    example_spec = ONSET_AUDIO[19]

    own_odf = onset_detection_function(example_spec)
    # NOTE: the author observed an offset between the two envelopes when
    # comparing them element by element, so only the lengths are compared.
    reference_odf = librosa.onset.onset_strength(sr=SR, S=example_spec)

    print(len(own_odf), "and", len(reference_odf))
# odf_test()

## Task: 2b

In [None]:
# Peak-picking parameters read by detect_onsets, all measured in frames.
# They are used as slice offsets: a window around frame i is
# [i - *_LEFT, i + *_RIGHT), i.e. the right bound is exclusive.
# The trailing comments record other values the author noted.
MAX_LEFT = 2 # 3 default value
MAX_RIGHT = 3 # 1 
AVG_LEFT = 10 # 10
AVG_RIGHT = 11 # 11
MIN_DIST = 3 # 3 (30ms)
            # 0.5 is used # 0.07 threshold

def detect_onsets(odf, threshold, frame_rate=FPS, **kwargs):
    """
    Detect the onsets in the onset detection function (ODF).

    Peak picking is done in three stages:

    1. subtract a moving average (window ``[i - AVG_LEFT, i + AVG_RIGHT)``)
       and keep only values that reach `threshold`,
    2. keep only frames that are the maximum of their local window
       ``[i - MAX_LEFT, i + MAX_RIGHT)``,
    3. drop peaks that are not more than `MIN_DIST` frames after the
       previously accepted peak.

    Parameters
    ----------
    odf : numpy array
        Onset detection function.
    threshold : float
        Threshold for peak picking
    frame_rate : float
        Frame rate of the onset detection function.
    kwargs : dict, optional
        Additional keyword arguments (currently unused).

    Returns
    -------
    onsets : numpy array
        Detected onsets (in seconds).

    """
    odf = np.asarray(odf, dtype=float)
    num_frames = len(odf)

    ######## MOVING AVERAGE AND THRESHOLD ########

    candidates = np.zeros(num_frames)
    for i in range(num_frames):
        start = max(i - AVG_LEFT, 0)
        stop = min(i + AVG_RIGHT, num_frames)
        value = odf[i] - np.average(odf[start:stop])
        if value >= threshold:
            candidates[i] = value

    ######## LOCAL MAXIMUM ########

    # The comparison is done against the unmodified `candidates` array.
    # Suppressing values in place while scanning would let a frame survive
    # whenever its larger neighbour had already been zeroed by an even larger
    # one, producing spurious peaks.
    is_peak = np.zeros(num_frames, dtype=bool)
    for i in range(num_frames):
        start = max(i - MAX_LEFT, 0)
        stop = min(i + MAX_RIGHT, num_frames)
        is_peak[i] = candidates[i] > 0 and candidates[i] >= candidates[start:stop].max()

    ######## MINIMUM DISTANCE ########

    onset_frames = []
    for i in np.flatnonzero(is_peak):
        if not onset_frames or i - onset_frames[-1] > MIN_DIST:
            onset_frames.append(i)

    ######## SELECTING ONSETS ########

    # convert frame indices to seconds
    return np.array(onset_frames, dtype=float) / frame_rate

In [None]:
# CUSTOM
def detect_test():
    """Print our detections, librosa's peak picking and the ground truth for one file."""
    spec = ONSET_AUDIO[19]
    odf = np.array(onset_detection_function(spec))

    onsets = detect_onsets(odf.copy(), 0.4, FPS)
    # peak_pick parameters are passed by keyword: newer librosa releases no
    # longer accept them positionally. The result is in frames, so divide by
    # the frame rate to get seconds.
    onsets_lib = librosa.util.peak_pick(odf.copy(), pre_max=3, post_max=1,
                                        pre_avg=10, post_avg=11,
                                        delta=0.4, wait=3) / FPS
    gt = ONSET_ANNOTATIONS[19]

    print(onsets[0:20], "\n", len(onsets))
    print(onsets_lib[0:20], "\n", len(onsets_lib))
    print(gt[0:20], "\n", len(gt))
#detect_test()

In [None]:
# CUSTOM
def compute_odfs():
    """Return the onset detection function of every spectrogram in ONSET_AUDIO."""
    # iterate over the whole list instead of a hard-coded [0:321] slice
    return [onset_detection_function(spec) for spec in ONSET_AUDIO]
# odfs = compute_odfs()

In [None]:
# CUSTOM
def parameter_test():
    """
    Compare the number of detected onsets with the number of annotated ones.

    Reads the module-level list `odfs` (one ODF per entry of
    ONSET_ANNOTATIONS) and prints how many onsets were detected too few
    ("under"), too many ("over") and how many are annotated in total.
    """
    missing = 0
    surplus = 0
    annotated = 0
    for idx, odf in enumerate(odfs):
        # every detector gets its own copy of the envelope
        env_default = np.array(odf)
        env_peaks = np.array(odf)
        env_tuned = np.array(odf)
        env_custom = np.array(odf)

        # librosa variants, kept as alternatives to compare against
        lib_default = librosa.onset.onset_detect(sr=SR, hop_length=HOP_SIZE, onset_envelope=env_default)
        lib_peaks = librosa.util.peak_pick(env_peaks,wait=3, pre_max=3, post_max=1, pre_avg=10, post_avg=11, delta=0.4)
        lib_tuned = librosa.onset.onset_detect(sr=SR, hop_length=HOP_SIZE, onset_envelope=env_tuned,
                                               wait=3, pre_max=3, post_max=1, pre_avg=10, post_avg=11, delta=0.07)
        detected = detect_onsets(env_custom, 0.4, FPS)

        reference = ONSET_ANNOTATIONS[idx]

        diff = len(detected) - len(reference)
        if diff >= 0:
            surplus += diff
        else:
            missing += diff
        if idx % 25 == 0:
            print(idx, ":", len(detected), "and", len(reference), "=", diff)

        annotated += len(ONSET_ANNOTATIONS[idx])

    print("under", missing)
    print("over", surplus)
    print("total", annotated)
# parameter_test()

## Task: 2c

In [None]:
# define additional constants
THRESHOLD = 0.5

# onset detections for all files, in the same order as ONSET_AUDIO
onset_detections = [
    detect_onsets(onset_detection_function(spec), THRESHOLD, FPS)
    for spec in ONSET_AUDIO
]

## Task: 2d

In [None]:
def evaluate_onsets(onsets, annotations, window=0.025):
    """
    Evaluate detected onsets against ground truth annotations.

    Precision, recall and F-measure are computed per file with madmom's
    `onset_evaluation` and then averaged over all files.

    Parameters
    ----------
    onsets : list
        List with onset detections for all files.
    annotations : list
        List with corresponding ground truth annotations.
    window : float, optional
        Evaluation window passed to madmom's `onset_evaluation`.

    Returns
    -------
    precision : float
        Averaged precision.
    recall : float
        Averaged recall.
    fmeasure : float
        Averaged f-measure.

    """
    num_files = len(onsets)
    # nothing to evaluate: avoid dividing by zero when averaging
    if num_files == 0:
        return 0.0, 0.0, 0.0

    sum_precision = 0.0
    sum_recall = 0.0
    sum_fmeasure = 0.0
    for i, detections in enumerate(onsets):
        tp, fp, tn, fn, errors = madmom.evaluation.onsets.onset_evaluation(
            detections, annotations[i], window=window)
        num_tp, num_fp, num_fn = len(tp), len(fp), len(fn)
        # without true positives all three measures are defined as 0
        p = num_tp / (num_tp + num_fp) if num_tp > 0 else 0.0
        r = num_tp / (num_tp + num_fn) if num_tp > 0 else 0.0
        f = 2 * p * r / (p + r) if p + r > 0 else 0.0
        sum_precision += p
        sum_recall += r
        sum_fmeasure += f

    precision = sum_precision / num_files
    recall = sum_recall / num_files
    fmeasure = sum_fmeasure / num_files
    return precision, recall, fmeasure
    
# evaluate against ground truth
p, r, f = evaluate_onsets(onset_detections, ONSET_ANNOTATIONS)

# report the averaged scores of the signal processing-based detector
print('Signal processing-based onset detection\nPrecision: %.3f\nRecall:    %.3f\nF-measure: %.3f' % (p, r, f))

## Task: 2e

In [None]:
def optimize_parameters(verbose=False):
    """
    Grid-search frame size, number of Mel bands and peak-picking threshold.

    Every combination is evaluated with `evaluate_onsets` against
    ONSET_ANNOTATIONS; the combination with the highest F-measure wins.

    Parameters
    ----------
    verbose : bool, optional
        Print the parameters and scores of every combination.

    Returns
    -------
    best_fmeasure : float
        Best averaged F-measure found.
    best_frame_size : int
        Frame size of the best combination.
    best_num_bands : int
        Number of Mel bands of the best combination.
    best_threshold : float
        Threshold of the best combination.

    """
    frame_sizes = [1024, 2048, 4096]
    band_counts = [20, 40, 80]
    thresholds = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]

    best_fmeasure = 0
    best_frame_size = 0
    best_num_bands = 0
    best_threshold = 0

    for frame_size in frame_sizes:
        for num_bands in band_counts:
            # Spectrograms and ODFs depend only on frame size and number of
            # bands, so they are computed once per pair instead of once per
            # threshold. Only the files that have onset annotations are
            # pre-processed.
            odfs = [
                onset_detection_function(
                    pre_process(AUDIO_FILES[idx], frame_size, FPS, num_bands))
                for idx in ONSET_AUDIO_IDX
            ]

            for threshold in thresholds:
                if verbose:
                    print("parameters:", frame_size, num_bands, threshold)

                # onset detections
                detections = [detect_onsets(odf, threshold, FPS) for odf in odfs]

                # evaluation
                precision, recall, fmeasure = evaluate_onsets(detections, ONSET_ANNOTATIONS)
                if verbose:
                    print('Signal processing-based onset detection\nPrecision: %.3f\nRecall:    %.3f\nF-measure: %.3f' % (precision, recall, fmeasure))
                    print('')

                if fmeasure > best_fmeasure:
                    best_fmeasure = fmeasure
                    best_frame_size = frame_size
                    best_num_bands = num_bands
                    best_threshold = threshold

    return best_fmeasure, best_frame_size, best_num_bands, best_threshold

# uncomment and run block to optimize parameters
# best_fmeasure, best_frame_size, best_num_bands, best_threshold = optimize_parameters(verbose=True)
# print("best found parameters are:", best_frame_size, best_num_bands, best_threshold, "with F-measure:", best_fmeasure)

Parameter optimization was run on the following parameters: <br>
frame size (1024, 2048 and 4096), <br>
number of mel bins (20, 40 and 80) and <br>
threshold (in range from 0 to 1.0 (or 1.5 in some cases) with step size 0.1).
<br><br>
An example of a well-performing combination: frame size 2048, 40 mel bins and threshold 0.5, with precision: 79.6%,
recall: 75.4%, F-measure: 75.8%.
<br><br>
The results are uploaded to the root directory in 3 separate files grouped for convenience by the frame size parameter: "1024 param config.txt", "2048 param config.txt" and "4096 param config.txt"
<br><br>
The first and most obvious observation in all cases is the influence of the threshold parameter on precision and recall: starting with a low threshold value (high recall) and moving upwards (high precision), we can see that hitting a sweet spot with the threshold somewhere in the middle is necessary for a good F-measure value.
<br><br>
Furthermore, we can see that selecting 20 as the number of mel bins almost universally yields slightly worse results than the other two values, regardless of the other parameters (within reasonable bounds). 20 seems to be too few bins, while 40 and 80 perform more or less similarly. <br>
That being said, we still achieved an F-measure of 74.5% with parameters 2048 20 0.3, while our overall best achieved F-measure was 75.9%, so this difference is probably negligible.
<br><br>
Most interestingly though one can see how picking 4096 as frame size results in significantly worse F-measure values, in best cases barely hitting the 65% mark, while 1024 and 2048 are consistently above 70%, often reaching the maximum of 75.9% with proper threshold and bin number parameters. <br> This can be attributed to the fact that by selecting a larger frame size one loses some of the temporal accuracy that is essential for onset detection.

# Machine learning-based onset detection

## Task 3a:

In [None]:
def _onset_targets(num_frames, annotation, frame_rate):
    """Return a 0/1 vector of length `num_frames` with ones at the annotated onset frames."""
    target = np.zeros(num_frames)
    frames = np.rint(np.atleast_1d(annotation) * frame_rate).astype(int)
    # ignore annotations that would fall outside of the spectrogram instead
    # of raising an IndexError (or wrapping around for negative indices)
    frames = frames[(frames >= 0) & (frames < num_frames)]
    target[frames] = 1
    return target


def train(audio, annotations, diffs=False, early_stopping=False,
          verbose=True, model='model.pkl', **kwargs):
    """
    Train an MLP on the data.

    The input features are the spectrogram frames of all files stacked on
    top of each other (one row per frame); the target is 1 for frames that
    contain an annotated onset and 0 otherwise.

    Parameters
    ----------
    audio : list
        List of precomputed spectrograms (first axis bands, second axis
        frames).
    annotations : list of numpy arrays
        List with corresponding onset annotations.
    diffs : bool, optional
        Include diffs as input features (step 7). Requires the keyword
        argument `flux`.
    early_stopping : bool, optional
        Use early stopping to prevent overfitting (step 8).
    verbose : bool, optional
        Be verbose during training.
    model : str, optional
        Save the fitted model to given file name.
    kwargs : dict, optional
        Additional keyword arguments. `flux` (list with one sequence of
        per-frame values for every entry of `audio`) is read when `diffs`
        is set.

    Returns
    -------
    mlp : MLPRegressor
        Trained MLP.

    Raises
    ------
    KeyError
        If `diffs` is set but no `flux` keyword argument is given.

    """
    from sklearn.neural_network import MLPRegressor
    # define MLP
    mlp = MLPRegressor(hidden_layer_sizes=(50, 50), tol=1e-4, max_iter=100,
                       early_stopping=early_stopping, verbose=verbose)
    if verbose:
        print(mlp)

    ######## INPUT PREPARATION ########

    # concatenate all spectrograms along the frame axis and transpose to fit
    # the MLP input format (one row per frame)
    x = np.concatenate(audio, axis=1).transpose()

    # add spectral flux to features
    if diffs:
        if verbose:
            print('')
            print('adding spectral flux to input features...')
            print('')
        flux = np.concatenate(kwargs['flux'])
        # a 1-D flux becomes a single extra feature column
        if flux.ndim == 1:
            flux = flux[:, np.newaxis]
        x = np.concatenate((x, flux), axis=1)

    # create target as 0 array with value 1 where index matches the frame;
    # built per file and concatenated once instead of growing an array
    y = np.concatenate([
        _onset_targets(spec.shape[1], annotation, FPS)
        for spec, annotation in zip(audio, annotations)
    ])

    ###################################

    # train model
    if verbose:
        print('training model:', model)
    mlp.fit(x, y)

    # save model and return it
    with open(model, 'wb') as f:
        pickle.dump(mlp, f)
    return mlp

In [None]:
# CUSTOM
def test_target_gen():
    """Build the onset targets of every file and print their total length."""
    total_frames = 0

    for idx, spec in enumerate(ONSET_AUDIO):
        num_frames = len(spec.transpose())
        target = np.zeros(num_frames)

        # mark every annotated onset in its (rounded) frame
        for frame in np.rint(ONSET_ANNOTATIONS[idx] * FPS):
            target[int(frame)] = 1

        total_frames += len(target)

    print(total_frames)
#test_target_gen()

## Task 3b:

In [None]:
# Train the MLP on the spectrograms only (diffs=False, early_stopping=False)
# and save the fitted model to 'model.pkl'.
MLP_MODEL = train(ONSET_AUDIO, ONSET_ANNOTATIONS, False, False, model='model.pkl')

## Task 3c:

In [None]:
# A solid arbitrary starting value for the threshold
# (peak-picking threshold passed to detect_onsets for the MLP output)
MLP_THRESHOLD = 0.0025

In [None]:
#### STEP 2 ####

# Function for optimizing the threshold parameter
def optimize_mlp_threshold(model, thresholds=None, diffs=False, verbose=True, **kwargs):
    """
    Find the peak-picking threshold with the best F-measure for an MLP.

    The model is applied once to every spectrogram in ONSET_AUDIO; the
    resulting activations are then peak-picked with each candidate threshold
    and evaluated against ONSET_ANNOTATIONS.

    Parameters
    ----------
    model : object
        Fitted model providing a `predict` method.
    thresholds : sequence of float, optional
        Candidate thresholds. If 'None' or empty, nothing is evaluated.
    diffs : bool, optional
        Append the spectral flux as an additional input feature. Requires
        the keyword argument `flux`.
    verbose : bool, optional
        Print the scores of every threshold and the final result.
    kwargs : dict, optional
        Additional keyword arguments. `flux` (one sequence of per-frame
        values for every entry of ONSET_AUDIO) is read when `diffs` is set.

    Returns
    -------
    best_threshold : float
        Threshold with the highest F-measure (0 if no threshold scored
        above 0).

    """
    # avoid a mutable default argument
    if thresholds is None:
        thresholds = []

    best_threshold = 0
    best_fmeasure = 0

    if diffs and verbose:
        print('running optimization with spectral flux...')
        print('')

    # The model output does not depend on the threshold, so predict once per
    # file up front instead of once per file and threshold.
    activations = []
    if len(thresholds) > 0:
        for j, spec in enumerate(ONSET_AUDIO):
            x = spec.transpose()

            # add spectral flux to features
            if diffs:
                flux = np.vstack(kwargs['flux'][j])
                x = np.concatenate((x, flux), axis=1)

            activations.append(model.predict(x))

    for threshold in thresholds:
        detections = [detect_onsets(odf, threshold, FPS) for odf in activations]

        p, r, f = evaluate_onsets(detections, ONSET_ANNOTATIONS)
        if verbose:
            print('Current threshold:', threshold)
            print('MLP onset detection\nPrecision: %.3f\nRecall:    %.3f\nF-measure: %.3f' % (p, r, f))
            print('')

        if f > best_fmeasure:
            best_fmeasure = f
            best_threshold = threshold

    if verbose:
        print('Optimized threshold is:', best_threshold, 'with F measure:', best_fmeasure)
    return best_threshold

# candidate thresholds: start 0, step 0.0005, upper bound 0.005 (excluded by np.arange)
mlp_thresholds = np.arange(0,0.005,0.0005)  # thresholds to use for optimization

# COMMENT OUT LINE BELOW TO AVOID RUNNING THRESHOLD OPTIMIZATION (might take up to a minute or two)
# Overwrites MLP_THRESHOLD with the best threshold found for MLP_MODEL.
MLP_THRESHOLD = optimize_mlp_threshold(model=MLP_MODEL, thresholds=mlp_thresholds)

In [None]:
#### STEP 3 ####

# apply the trained MLP to every spectrogram and peak-pick its output
mlp_onset_detections = [
    detect_onsets(MLP_MODEL.predict(spec.transpose()), MLP_THRESHOLD, FPS)
    for spec in ONSET_AUDIO
]

# evaluate against ground truth
p, r, f = evaluate_onsets(mlp_onset_detections, ONSET_ANNOTATIONS)

print('MLP onset detection\nPrecision: %.3f\nRecall:    %.3f\nF-measure: %.3f' % (p, r, f))

## Task 3d: TO BE DONE!!!!!

## Task 3e: 

In [None]:
######## CALCULATE FLUX ########

# one onset detection function per spectrogram, in the order of ONSET_AUDIO
FLUX = [onset_detection_function(spec) for spec in ONSET_AUDIO]

In [None]:
# Train a second MLP that additionally receives FLUX as input feature
# (diffs=True, early_stopping=False) and save it to 'model_diff.pkl'.
MLP_DIFF_MODEL = train(ONSET_AUDIO, ONSET_ANNOTATIONS, True, False, model='model_diff.pkl', flux=FLUX)

## Task 3f:

In [None]:
# A solid arbitrary starting value for the threshold
# (peak-picking threshold passed to detect_onsets for the diff model's output)
MLP_DIFF_THRESHOLD = 0.075

In [None]:
# candidate thresholds: start 0.03, step 0.01, nominal upper bound 0.11
mlp_diff_thresholds = np.arange(0.03,0.11,0.01)  # thresholds to use for optimization

# COMMENT OUT LINE BELOW TO AVOID RUNNING THRESHOLD OPTIMIZATION (might take up to a minute or two)
# Overwrites MLP_DIFF_THRESHOLD with the best threshold found for MLP_DIFF_MODEL.
MLP_DIFF_THRESHOLD = optimize_mlp_threshold(model=MLP_DIFF_MODEL, thresholds=mlp_diff_thresholds, diffs=True, flux=FLUX)

In [None]:
mlp_diff_detections = []

for spec, file_flux in zip(ONSET_AUDIO, FLUX):
    # spectrogram frames plus the flux as one extra feature column
    features = np.concatenate((spec.transpose(), np.vstack(file_flux)), axis=1)
    activation = MLP_DIFF_MODEL.predict(features)
    mlp_diff_detections.append(detect_onsets(activation, MLP_DIFF_THRESHOLD, FPS))

# evaluate against ground truth
p, r, f = evaluate_onsets(mlp_diff_detections, ONSET_ANNOTATIONS)

print('MLP onset detection with temporal diffs\nPrecision: %.3f\nRecall:    %.3f\nF-measure: %.3f' % (p, r, f))

## Task 3g: TO BE DONE!!!!!