## The Dataset

We implemnt by `ballroom` [1], which has 8 genres & tempo & (down-)beats, 698 excerpts (30s)

[1] [ballroom](http://mtg.upf.edu/ismir2004/contest/tempoContest/node5.html)


In [1]:
import librosa
from pathlib import Path
import soundfile as sf




In [2]:
# Tempo list
tmpo_fctr = {"original": 1.00,
             "x125": 1.25,
             "x150": 1.50,
             "x075": 0.75,
             "x050": 0.50}
# Key
keys = [-2, -1, 0, 1, 2]


## Time Stretch


In [3]:
def time_stretch(signal, time_stretch_rate):
    """Time stretching implemented with librosa:
    https://librosa.org/doc/main/generated/librosa.effects.pitch_shift.html?highlight=pitch%20shift#librosa.effects.pitch_shift
    """
    return librosa.effects.time_stretch(signal, time_stretch_rate)


## Pitch Scaleing


In [4]:
def pitch_scale(signal, sr, num_semitones):
    """Pitch scaling implemented with librosa:
    https://librosa.org/doc/main/generated/librosa.effects.pitch_shift.html?highlight=pitch%20shift#librosa.effects.pitch_shift
    """
    return librosa.effects.pitch_shift(signal, sr, num_semitones)


## Get Key


In [5]:
import numpy as np


def get_music_key(y, sr):
    # pitches in 12 tone equal temperament
    pitches = ['C', 'C#', 'D', 'D#', 'E', 'F', 'F#', 'G', 'G#', 'A', 'A#', 'B']
    song_chroma = np.mean(librosa.feature.chroma_cqt(y=y, sr=sr), axis=1)
    key = dict()

    # select the most dominate pitch
    pitch_id = np.argmax(song_chroma)
    key['pitch'] = pitches[pitch_id]

    min_third_id = (pitch_id+3) % 12
    maj_third_id = (pitch_id+4) % 12

    # check if the musical 3rd is major or minor
    if song_chroma[min_third_id] < song_chroma[maj_third_id]:
        key['third'] = 'Major'
    elif song_chroma[min_third_id] > song_chroma[maj_third_id]:
        key['third'] = 'Minor'

    return key


In [6]:
_WAV_PATH = rf'audio/wav'
_AUGMENT_PATH = rf'audio/augments'
_WAV = '.wav'

wav_audio_lst = Path(_WAV_PATH).glob('*.wav')

for wa in wav_audio_lst:
    # Load the audio as a waveform `y`
    # Store the sampling rate as `sr`
    y, sr = librosa.load(rf'{wa}')

    # With diferent tempo
    # scale (in this case stretch) the overall tempo by this factor
    for tmp_n, tmp_v in tmpo_fctr.items():
        augments_1_audio = time_stretch(y, tmp_v)

        # With diferent key
        for key in keys:
            augments_2_audio = pitch_scale(augments_1_audio, sr, key)
            key_analyze = get_music_key(augments_2_audio, sr)
            key_note = ''
            file_name = ''
            if key == 0:
                key_note = 'original'
            else:
                key_note = f"{key_analyze['pitch']}{key_analyze['third']}"

            file_name = f"{str(wa).split('/')[-1][:-4]}_{tmp_n}_{key_note}{_WAV}"
            sf.write(Path(_AUGMENT_PATH, file_name), augments_2_audio, sr)


## Load a trained neural network model


In [7]:
import tensorflow as tf
from musicnn_keras.tagger import top_tags


musicnn = tf.keras.models.load_model(
    './musicnn_keras/keras_checkpoints/MSD_musicnn.h5')


In [10]:
import json
from tqdm import tqdm

_MP3_PATH = rf'audio/augments'

audios = Path(_MP3_PATH).glob('*.wav')
audio_json = dict()

for af in tqdm(audios):
    # Get Tags
    audio_dict = dict()
    audio_dict['tags'] = top_tags(str(af),
                                  model='MTT_musicnn',
                                  topN=10,
                                  print_tags=False)

    # Save result
    audio_name = str(af).split('/')[-1]
    meta_data = audio_name.split('.')[0].split('_')
    audio_dict['music'] = meta_data[0]
    audio_dict['tempo'] = meta_data[1]
    audio_dict['key'] = meta_data[2]
    audio_json[audio_name.split('.')[0]] = audio_dict

with open('result.json', 'w', encoding='utf-8') as f:
    json.dump(audio_json, f, ensure_ascii=False, indent=4, sort_keys=True)


0it [00:00, ?it/s]

Computing spectrogram (w/ librosa) and tags (w/ tensorflow).. 

0it [00:07, ?it/s]

['Albums-Pais-Tropical-05', 'original', 'AMajor']





IndexError: list index out of range