In [50]:
# to hide tensorflow endless logs
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

from essentia.standard import MonoLoader, TensorflowPredictMusiCNN, PercivalBpmEstimator
from pydub import AudioSegment
import matplotlib.pyplot as plt
import json
import glob
import numpy as np

AUDIO_PATH = '/home/nikita/ml-experiments/audio_analysis/essentia/audio/'
VIDEO_PATH = '/home/nikita/ml-experiments/videos/skrillex.mp3'
VIDEODATA_PATH = 'audio/skrillex2015djset/list.txt'
MODELS_PATH = '/home/nikita/ml-experiments/audio_analysis/essentia/models/'

MUSIC_TAGS = ['rock', 'pop', 'alternative', 'indie', 'electronic', 'female vocalists', 'dance', '00s', 
              'alternative rock', 'jazz', 'beautiful', 'metal', 'chillout', 'male vocalists', 'classic rock',
              'soul', 'indie rock', 'Mellow', 'electronica', '80s', 'folk', '90s', 'chill', 'instrumental', 
              'punk', 'oldies', 'blues', 'hard rock', 'ambient', 'acoustic', 'experimental', 'female vocalist',
              'guitar', 'Hip-Hop', '70s', 'party', 'country', 'easy listening', 'sexy', 'catchy', 'funk', 
              'electro', 'heavy metal', 'Progressive rock', '60s', 'rnb', 'indie pop', 'sad', 'House', 'happy']

NON_ENERGY_TAGS = ['60s', '70s', '80s', '90s', '00s', 'oldies', 'female vocalist', 'female vocalists', 
                   'male vocalists', 'sad', 'happy', 'guitar', 'beautiful', 'acoustic', 'instrumental',
                   'catchy', 'sexy', 'experimental', 'ambient', 'electro',  'electronica', 'electronic']

ENERGY_COEFS = [0.75, 1.5, 2.75, 4, 5, 6, 7, 8.25, 9.5, 22]
ENERGY_LEVELS = [['folk', 'chill', 'Mellow', 'jazz', 'chillout', 'indie'],
                 ['country', 'easy listening', 'blues'],
                 ['indie pop', 'soul', 'funk'],
                 ['party', 'pop', 'punk'],
                 ['punk', 'dance'],
                 ['alternative rock', 'indie rock', 'alternative'],
                 ['Hip-Hop', 'rnb', 'rock', 'classic rock', 'Progressive rock'],
                 ['House'],
                 ['hard rock', 'metal'],
                 ['heavy metal']]


Из исходного DJ-сета Skrillex получаем 31 трек 
https://www.youtube.com/watch?v=V2VmcuOEqEg)

Во всех DJ-сетах Energy Level прогрессирует по ходу выступления
Пример - [5, 7, 5, 7, 7, 6, 8, 6, ..., 7, 8, 8, 7, 8]

Цель - получить результат, схожий с приведенным примером

In [None]:
# extract audio from Skrillex 2015 Dj set video
def calculate_ms(min: str, sec: str):
    return 1000*(int(min)*60 + int(sec))

def parse_data(filename: str):
    file = open(filename, 'r')
    raw_data = file.readlines()
    timecodes_list = []

    for entry in raw_data:
        start = entry.split(' ')[0]
        start_min = start.split(':')[0]
        start_sec = start.split(':')[1]
        
        end = entry.split(' ')[1]
        end_min = end.split(':')[0]
        end_sec = end.split(':')[1]

        start_ms = calculate_ms(start_min, start_sec)
        end_ms = calculate_ms(end_min, end_sec)

        timecodes_list.append([start_ms, end_ms])

    return timecodes_list

# get start and end for each track in a dj-set
data = parse_data(VIDEODATA_PATH)
audio = AudioSegment.from_file(VIDEO_PATH, format="mp3")
audio_parts = []

# cut fullaudio into parts
i = 0
for entry in data:
    current_part = audio[entry[0]:entry[1]]
    filename = AUDIO_PATH + str(i) + '.mp3'

    current_part.export(filename, format="mp3")
    i += 1
    
#subprocess.call(["youtube-dl", "-x", "--audio-format=mp3", "-o", "audio/skrillex2015djset/%(title)s.%(ext)s", 'https://www.youtube.com/watch?v=8CWgj2VzJnA'])

In [52]:
# load tensorflow models
with open(MODELS_PATH + 'msd-musicnn-1.json', 'r') as json_file:
    metadata_autotagging = json.load(json_file)
model_autotagging = TensorflowPredictMusiCNN(graphFilename=MODELS_PATH+'msd-musicnn-1.pb')

In [53]:
# load audio data
files = glob.glob(AUDIO_PATH + 'tests/*.mp3')
files_size = len(files)

converted_files = []
files = sorted(files)

# convert mp3 audio to essentia's vector_real datatype
for file in files:
    audio = MonoLoader(filename=file)()
    converted_files.append(audio)

In [54]:
# return level influence depending on genre
def get_level(tag: str):
    for i in range(0, len(ENERGY_LEVELS), 1):
        if tag in ENERGY_LEVELS[i]:
            return ENERGY_COEFS[i]

# return bpm influence
def get_bpm_influence(bpm: float):
    if bpm < 60:
        return -1
    if bpm >= 60 and bpm <= 80:
        return -0.85
    if bpm > 80 and bpm < 96:
        return -0.55
    if bpm > 124 and bpm < 140:
        return 0.45
    if bpm >= 140 and bpm < 160:
        return 1.1
    if bpm >= 160:
        return 1.5

    return 0

In [55]:
# patch_maxes contains of maximum weights per patch
patch_maxes = []
# resulting dict contains sums of weights for all patches
resulting_dict = dict()

count_argpartition = len(NON_ENERGY_TAGS) + 5

c = 1
for audio in converted_files:
    # get model prediction
    activations = model_autotagging(audio)

    # estimate bpm
    bpm = PercivalBpmEstimator()(audio)

    
    # analyze every patch in audio
    for label in activations:
        # get maximum 5 activations per patch (taking 22 )
        indexes = np.argpartition(label, -count_argpartition)[-count_argpartition:]

        max_activations = []
        for index in indexes:
            #check if current music tag is unreliable
            if MUSIC_TAGS[index] in NON_ENERGY_TAGS:
                continue
            max_activations.append(label[index])
            max_activations.append(MUSIC_TAGS[index])

        patch_maxes.append(max_activations)

    # summarize patch values
    for patch_max in patch_maxes:
        for i in range(0, len(patch_max), 2):
            if patch_max[i+1] in resulting_dict.keys():
                existing_value = float(resulting_dict.get(patch_max[i+1]))
                current_value = patch_max[i]
                resulting_dict.update({patch_max[i+1]: existing_value+current_value})
            else:
                resulting_dict.update({patch_max[i+1]: patch_max[i]})

    # sort dictionary to get 5 maximum music tags values
    result = sorted(resulting_dict.items(), key=lambda x: x[1], reverse=True)[:5]

    # get sum of top5 weights
    sum = 0.0
    for entry in result:
        sum += entry[1]

    # calculate energy
    energy = 0.0
    for entry in result:
        level = get_level(entry[0])
        energy += entry[1]*level/sum

    # add bpm influence
    energy += get_bpm_influence(bpm)

    if energy > 10:
        energy = 10
    if energy < 0:
        energy = 0

    print(f'track: {c}, energy: {energy:.3f}, bpm: {int(bpm)}\npredicted genres: {result}\n')
    patch_maxes.clear()
    resulting_dict.clear()
    c += 1


track: 1, energy: 8.617, bpm: 100
predicted genres: [('metal', 53.32127647660673), ('rock', 16.290636161342263), ('alternative', 10.39286882430315), ('indie', 9.304042805917561), ('heavy metal', 5.954982544761151)]

track: 2, energy: 7.975, bpm: 125
predicted genres: [('metal', 109.54385840380564), ('rock', 36.465331794694066), ('alternative', 24.08151570148766), ('indie', 21.233539139851928), ('alternative rock', 10.840834859758615)]

track: 3, energy: 6.187, bpm: 99
predicted genres: [('metal', 138.42196609824896), ('rock', 87.6961831562221), ('indie', 64.47173651959747), ('alternative', 59.6176533959806), ('chillout', 30.936127196531743)]

track: 4, energy: 3.784, bpm: 93
predicted genres: [('rock', 30.389904649928212), ('chillout', 27.515701189637184), ('Progressive rock', 18.97131568007171), ('alternative', 18.461249830201268), ('indie', 17.793041984550655)]

track: 5, energy: 2.802, bpm: 71
predicted genres: [('rock', 29.96402631700039), ('jazz', 18.196367719210684), ('alternativ

In [None]:
    # create plot 
    
    # print(np.sum(label), metadata_autotagging['classes'][c])
    # c += 1

    #indexes = np.argpartition(classifications_avg, -5)[-5:]
    # print(classifications_avg)
    # print(len(classifications_avg))
    # print(metadata_autotagging['classes'])
    # print(len(metadata_autotagging['classes']))
    #for i in range(5):
       # print(indexes[i])
        #print(classifications_avg[indexes[i]], metadata_autotagging['classes'][indexes[i]])
    # create dependency plot
   # ig, ax = plt.subplots(1, 1, figsize=(10, 10))
   # ax.matshow(activations.T, aspect='auto')

  #  ax.set_yticks(range(len(metadata_autotagging['classes'])))
   # ax.set_yticklabels(metadata_autotagging['classes'])
  #  ax.set_xlabel('patch number')
   # ax.axis.set_ticks_position('bottom')
   # plt.title('Tag activations')
   # plt.show()