# Experiment Here!

In [None]:
import os
import pickle

import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline

import warnings
warnings.filterwarnings("ignore")

import madmom
import librosa
import mir_eval

In [None]:
# frame rate; used below to derive the hop size and as the default
# `frame_rate` of the pre-processing and onset detection functions
FPS = 100

In [None]:
from madmom.utils import search_files, match_file

# audio files ('.flac') located with madmom's `search_files` in 'data/train'
AUDIO_FILES = search_files('data/train', '.flac')

def find_audio_files(ann_files, audio_files, ann_suffix=None, audio_suffix='.flac'):
    """
    Find the audio files matching the given annotation files.

    An annotation file contributes to the result only if exactly one audio
    file matches it; annotation files with no match or with several
    candidate matches are skipped.

    Parameters
    ----------
    ann_files : list
        List with annotation file names.
    audio_files : list
        List with audio file names to be matched.
    ann_suffix : str, optional
        Suffix of the annotation files. If 'None', the suffix is inferred
        from each annotation file name individually.
    audio_suffix : str, optional
        Suffix of the audio files.

    Returns
    -------
    matched_files : list
        List of matched audio file (names).
    matched_indices : list
        List of matching indices in `audio_files`, i.e.
        ``audio_files[matched_indices[k]] == matched_files[k]``.

    """
    matched_files = []
    matched_indices = []
    for ann_file in ann_files:
        # infer the suffix per file, so that lists with mixed annotation
        # suffixes are not matched with the suffix of the first file
        suffix = ann_suffix
        if suffix is None:
            suffix = os.path.splitext(ann_file)[1]
        matches = match_file(ann_file, audio_files, suffix, audio_suffix)
        if len(matches) != 1:
            # no (or no unambiguous) audio file for this annotation
            continue
        matched_files.append(matches[0])
        # store the position of the audio file (not of the annotation file),
        # since callers use the index to look up per-audio-file data
        matched_indices.append(audio_files.index(matches[0]))
    return matched_files, matched_indices

In [None]:
# CUSTOM

# len(AUDIO_FILES)

# from scripts import utilities
# utilities.main()

# Pre-Processing

## Task: 1

In [None]:
# define additional constants
SR = 44100 # sampling rate
FRAME_SIZE = 2048 # number of samples per frame
HOP_SIZE = int(SR / FPS) # hop size depends on sampling rate and frame rate
NUM_BANDS = 40 # number of mel bins

def pre_process(filename, frame_size=2048, frame_rate=FPS, num_bands=40, **kwargs):
    """
    Pre-process the audio signal into a dB-scaled Mel spectrogram.

    The file is loaded at the sampling rate `SR`, transformed with an STFT,
    reduced to its power spectrum, mapped onto a Mel filterbank and finally
    converted to decibels.

    Parameters
    ----------
    filename : str
        File to be processed.
    frame_size : int
        Size of the frames (used as FFT size of the STFT).
    frame_rate : float
        Frame rate used for the STFT; determines the hop size as
        ``int(SR / frame_rate)``.
    num_bands : int
        Number of frequency bands for the Mel filterbank.
    kwargs : dict, optional
        Additional keyword arguments (currently unused).

    Returns
    -------
    spectrogram : numpy array
        dB-scaled Mel spectrogram with one row per Mel band and one column
        per frame.

    """
    # derive the hop size from the requested frame rate instead of relying
    # on the module-level HOP_SIZE, which only fits the default frame rate
    hop_size = int(SR / frame_rate)

    # STEP 1: read in audio (librosa resamples to SR)
    signal, _ = librosa.load(filename, sr=SR)

    # STEP 2,3: compute STFT (default windowing function is Hann)
    stft = librosa.core.stft(y=signal, n_fft=frame_size, hop_length=hop_size)

    # STEP 4: discard phase info and square magnitudes
    power_spectrogram = np.abs(stft) ** 2

    # STEP 5: apply Mel scaling
    mel_filterbank = librosa.filters.mel(sr=SR, n_fft=frame_size, n_mels=num_bands)
    mel_spectrogram = mel_filterbank.dot(power_spectrogram)

    # STEP 6: apply dB scaling
    spectrogram = librosa.power_to_db(mel_spectrogram)
    return spectrogram

In [None]:
# CUSTOM

from librosa.display import specshow

def test_pre_process():
    """Plot the pre-processed spectrogram of one audio file as a visual check."""
    example_file = AUDIO_FILES[19]
    example_spec = pre_process(example_file, FRAME_SIZE, FPS, NUM_BANDS)

    # show the spectrogram on a time / Mel axis with a dB colour bar
    plt.figure(figsize=(15, 5))
    specshow(
        example_spec,
        sr=SR,
        hop_length=HOP_SIZE,
        x_axis='time',
        y_axis='mel',
    )
    plt.colorbar(format='%+2.0f dB')

# test_pre_process()

In [None]:
# List for collecting the pre-processed spectrograms, one entry per file in
# AUDIO_FILES (same order).
# Note: it is not necessary to use this list, but it is recommended in order
#       to avoid recomputing the same features over and over again.
#       *_AUDIO_IDX can be used to access the precomputed spectrograms by
#       index.
SPECTROGRAMS = []

# pre-process every audio file once, using the default parameters
for audio_file in AUDIO_FILES:
    spec = pre_process(audio_file)
    SPECTROGRAMS.append(spec)

# Onset detection

In [None]:
# you are not required to use these predefined constants, but it is recommended
# annotation files ('.onsets') located in 'data/train'
ONSET_ANNOTATION_FILES = search_files('data/train', '.onsets')
# audio files matched to the annotation files, plus the indices used below
# to look up their pre-computed spectrograms
ONSET_AUDIO_FILES, ONSET_AUDIO_IDX = find_audio_files(ONSET_ANNOTATION_FILES, AUDIO_FILES)
# spectrograms selected from SPECTROGRAMS via ONSET_AUDIO_IDX
ONSET_AUDIO = [SPECTROGRAMS[i] for i in ONSET_AUDIO_IDX]
# onset annotations loaded with madmom, one entry per annotation file
ONSET_ANNOTATIONS = [madmom.io.load_onsets(f) for f in ONSET_ANNOTATION_FILES]

# sanity checks: all four lists are expected to contain 321 entries
# NOTE(review): equal lengths do not by themselves guarantee that audio and
# annotations are aligned entry by entry - confirm if in doubt
assert len(ONSET_ANNOTATION_FILES) == 321
assert len(ONSET_AUDIO_FILES) == 321
assert len(ONSET_AUDIO) == 321
assert len(ONSET_ANNOTATIONS) == 321

## Task: 2a

In [None]:
def onset_detection_function(spectrogram):
    """
    Compute an onset detection function (half-wave rectified spectral flux).

    For every frame, the difference to the previous frame is computed per
    frequency band, negative differences are set to zero, and the remaining
    (positive) differences are averaged over all bands. The first frame is
    compared against an all-zero frame.

    Parameters
    ----------
    spectrogram : numpy array
        Spectrogram with one row per frequency band and one column per
        frame.

    Returns
    -------
    odf : numpy array
        Onset detection function, one value per frame.

    """
    spec = np.asarray(spectrogram, dtype=float)
    num_bands, num_frames = spec.shape
    if num_bands == 0:
        # nothing to average over; avoid a division by zero
        return np.zeros(num_frames)

    # frame-to-frame difference along the time axis; prepending zeros makes
    # the first frame be compared against silence
    diff = np.diff(spec, axis=1, prepend=0.0)
    # keep only increases in energy (half-wave rectification)
    flux = np.maximum(diff, 0.0)
    # normalise by the actual number of bands of this spectrogram
    return flux.sum(axis=0) / num_bands

In [None]:
# CUSTOM

def odf_test():
    """Print the lengths of our ODF and of librosa's onset strength for one file."""
    example_spec = ONSET_AUDIO[19]

    own_odf = onset_detection_function(example_spec)
    # NOTE(review): earlier experiments (removed here) shifted `own_odf` by two
    # frames to line it up with librosa's envelope - re-check before comparing
    # the two element-wise
    lib_odf = librosa.onset.onset_strength(sr=SR, S=example_spec)

    print(len(own_odf), "and", len(lib_odf))
    
# odf_test()

## Task: 2b

In [None]:
def detect_onsets(odf, threshold, frame_rate=FPS, **kwargs):
    """
    Detect the onsets in the onset detection function (ODF).

    A frame ``n`` is reported as an onset if all of the following hold:

    - ``odf[n]`` is the maximum of ``odf[n - max_w_left:n + max_w_right]``,
    - ``odf[n]`` exceeds the mean of ``odf[n - avg_w_left:n + avg_w_right]``
      by more than `threshold`,
    - more than `min_distance` frames have passed since the previous onset.

    The windows are half-open (the right border is exclusive) and are
    truncated at the beginning and end of the ODF, so ODFs of any length
    (including empty ones) are handled.

    Parameters
    ----------
    odf : numpy array
        Onset detection function.
    threshold : float
        Threshold for peak picking
    frame_rate : float
        Frame rate of the onset detection function.
    kwargs : dict, optional
        Optional peak-picking settings, all given in frames:
        `max_w_left` (default 3), `max_w_right` (3), `avg_w_left` (10),
        `avg_w_right` (11) and `min_distance` (3). Other keys are ignored.

    Returns
    -------
    onsets : numpy array
        Detected onsets (in seconds).

    """
    # window sizes; the right borders are at least 1 so that frame n itself
    # is always part of its own window
    max_w_left = max(0, int(kwargs.get('max_w_left', 3)))
    max_w_right = max(1, int(kwargs.get('max_w_right', 3)))
    avg_w_left = max(0, int(kwargs.get('avg_w_left', 10)))
    avg_w_right = max(1, int(kwargs.get('avg_w_right', 11)))
    min_distance = int(kwargs.get('min_distance', 3))  # 30 ms at 100 frames/s

    odf = np.asarray(odf, dtype=float).ravel()

    onset_frames = []
    last_onset = None
    for n, value in enumerate(odf):
        ######## LOCAL MAXIMUM ########
        max_window = odf[max(0, n - max_w_left):n + max_w_right]
        if value < max_window.max():
            continue

        ######## MOVING AVERAGE AND THRESHOLD ########
        avg_window = odf[max(0, n - avg_w_left):n + avg_w_right]
        if value - avg_window.mean() <= threshold:
            continue

        ######## MINIMUM DISTANCE ########
        if last_onset is not None and n - last_onset <= min_distance:
            continue

        onset_frames.append(n)
        last_onset = n

    # convert frame indices to seconds
    return np.array(onset_frames, dtype=float) / frame_rate

In [None]:
# CUSTOM

def detect_test():
    """Print the ODF of one file and the peaks librosa picks from it."""
    spec = ONSET_AUDIO[19]
    odf = np.array(onset_detection_function(spec))

    # the ODF is printed before and after librosa's onset detection, as in
    # the original experiment (presumably to check that it is left untouched)
    print(odf)
    librosa.onset.onset_detect(sr=SR, hop_length=HOP_SIZE, onset_envelope=odf)
    print(odf)

    # keyword arguments: newer librosa versions no longer accept these
    # parameters positionally
    peaks = librosa.util.peak_pick(odf, pre_max=3, post_max=3, pre_avg=10,
                                   post_avg=11, delta=0.0695, wait=3)

    print(len(peaks))
    # convert frame indices to seconds
    print(peaks / FPS)

# detect_test()

In [None]:
# CUSTOM

def param_single_test():
    """Compare librosa's default onset detection with hand-tuned peak picking
    on the ODF of a single file and print both results (in seconds)."""
    spec = ONSET_AUDIO[19]
    odf = np.array(onset_detection_function(spec))

    onsets = librosa.onset.onset_detect(sr=SR, hop_length=HOP_SIZE, onset_envelope=odf)

    # parameter sets that were also found to work well:
    #   pre_max=5, post_max=5, pre_avg=7,  post_avg=7,  delta=0.066
    #   pre_max=3, post_max=3, pre_avg=7,  post_avg=7,  delta=0.07
    #   pre_max=3, post_max=3, pre_avg=11, post_avg=11, delta=0.0695
    # keyword arguments: newer librosa versions no longer accept these
    # parameters positionally
    peaks = librosa.util.peak_pick(odf, pre_max=3, post_max=3, pre_avg=10,
                                   post_avg=11, delta=0.0695, wait=3)

    # frame indices are converted to seconds for printing
    print(len(onsets))
    print(onsets / FPS)
    print(len(peaks))
    print(peaks / FPS)

# param_single_test()

In [None]:
# CUSTOM

# print(len(ONSET_ANNOTATIONS[19]))
# ONSET_ANNOTATIONS[19]

In [None]:
#CUSTOM
def parameter_test():
    """Compare, over all onset files, how many peaks the hand-tuned peak
    picking finds relative to librosa's default onset detection.

    Prints the counts for every 25th file and finally the accumulated
    differences: `under` sums the (negative) differences where fewer peaks
    than onsets were found, `over` sums the non-negative ones.
    """
    under = 0
    over = 0
    for i, spec in enumerate(ONSET_AUDIO):
        odf = np.array(onset_detection_function(spec))

        onsets = librosa.onset.onset_detect(sr=SR, hop_length=HOP_SIZE, onset_envelope=odf)
        # keyword arguments: newer librosa versions no longer accept these
        # parameters positionally
        peaks = librosa.util.peak_pick(odf, pre_max=3, post_max=3, pre_avg=10,
                                       post_avg=11, delta=0.0695, wait=3)

        diff = len(peaks) - len(onsets)
        if diff < 0:
            under += diff
        else:
            over += diff
        if i % 25 == 0:
            print(i, ":", len(peaks), "and", len(onsets), "=", diff)

    print("under", under)
    print("over", over)
    
# parameter_test()