In [None]:
!pip install essentia-tensorflow

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting essentia-tensorflow
  Downloading essentia_tensorflow-2.1b6.dev1034-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (291.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m291.4/291.4 MB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: essentia-tensorflow
Successfully installed essentia-tensorflow-2.1b6.dev1034


In [None]:
# Mount Google Drive at /content/drive (Colab only) so the files stored
# there are reachable from this runtime's filesystem.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!cp "/content/drive/MyDrive/ICT Project/weights_metadata_openl3.json" "weights_metadata_openl3.json"
!cp "/content/drive/MyDrive/ICT Project/extract_openl3_embeddings.py" "extract_openl3_embeddings.py"
!cp "/content/drive/MyDrive/ICT Project/openL3.zip" "openL3.zip"
!cp "/content/drive/MyDrive/ICT Project/songs.zip" "songs.zip"
!unzip "openL3.zip"
!unzip "songs.zip"

Archive:  openL3.zip
   creating: openL3/
  inflating: openL3/tonal_atonal-openl3-music-mel128-emb512-1.pb  
  inflating: openL3/danceability-openl3-music-mel128-emb512-1.pb  
  inflating: openL3/danceability-openl3-music-mel128-emb512-1.json  
  inflating: openL3/mood_acoustic-openl3-music-mel128-emb512-1.pb  
  inflating: openL3/mood_electronic-openl3-music-mel128-emb512-1.pb  
  inflating: openL3/voice_instrumental-openl3-music-mel128-emb512-1.pb  
  inflating: openL3/voice_instrumental-openl3-music-mel128-emb512-1.json  
  inflating: openL3/mood_electronic-openl3-music-mel128-emb512-1.json  
  inflating: openL3/gender-openl3-music-mel128-emb512-1.pb  
  inflating: openL3/mood_acoustic-openl3-music-mel128-emb512-1.json  
  inflating: openL3/mood_aggressive-openl3-music-mel128-emb512-1.json  
  inflating: openL3/mood_party-openl3-music-mel128-emb512-1.json  
  inflating: openL3/gender-openl3-music-mel128-emb512-1.json  
  inflating: openL3/mood_party-openl3-music-mel128-emb512-1.pb  

# OpenL3 models

In [None]:
import essentia
import json
import essentia.standard as es
from essentia import Pool
from essentia.standard import MonoLoader, TensorflowPredictMusiCNN, TensorflowPredict2D

import numpy as np
import pandas as pd
import os

# Raise the pandas display limit to 100 columns for DataFrame output.
pd.options.display.max_columns = 100

The following two classes (`MelSpectrogramOpenL3` and `EmbeddingsOpenL3`) are taken from the Essentia documentation.

In [None]:
from pathlib import Path
import essentia.standard as es
import numpy as np
from essentia import Pool

class MelSpectrogramOpenL3:
    """Mel-spectrogram front end used to prepare audio for the OpenL3 graph.

    The audio file is loaded as a single channel at 48 kHz and cut into
    one-second patches whose starts are ``hop_time`` seconds apart. Every
    patch becomes a 128-band mel spectrogram in dB, limited to an 80 dB range
    and shifted so that the loudest bin of the patch sits at 0 dB.
    """

    def __init__(self, hop_time):
        """Store the analysis settings and build the Essentia algorithms.

        Args:
            hop_time: spacing, in seconds, between the starts of successive
                patches.
        """
        self.hop_time = hop_time

        # Framing / mel-filterbank settings.
        self.sr = 48000
        self.frame_size = 2048
        self.hop_size = 242
        self.n_mels = 128

        # dB conversion settings: magnitude floor, dynamic range, reference.
        self.a_min = 1e-10
        self.d_range = 80
        self.db_ref = 1.0

        # Patch length (one second) and patch hop, both in samples.
        self.patch_samples = int(1 * self.sr)
        self.hop_samples = int(self.hop_time * self.sr)

        self.w = es.Windowing(size=self.frame_size, normalized=False)
        self.s = es.Spectrum(size=self.frame_size)

        mel_settings = dict(
            highFrequencyBound=self.sr / 2,
            inputSize=self.frame_size // 2 + 1,
            log=False,
            lowFrequencyBound=0,
            normalize="unit_tri",
            numberBands=self.n_mels,
            sampleRate=self.sr,
            type="magnitude",
            warpingFormula="slaneyMel",
            weighting="linear",
        )
        self.mb = es.MelBands(**mel_settings)

    def _patch_to_db(self, melbands):
        """Turn one patch of linear mel magnitudes into clipped, peak-referenced dB."""
        db = 10.0 * np.log10(np.maximum(self.a_min, melbands))
        # In-place subtraction keeps the array's dtype unchanged.
        db -= 10.0 * np.log10(np.maximum(self.a_min, self.db_ref))
        # Nothing may fall more than d_range below the patch maximum.
        db = np.maximum(db, db.max() - self.d_range)
        db -= np.max(db)
        return db

    def compute(self, audio_file):
        """Return the stacked dB mel-spectrogram frames of every patch.

        Args:
            audio_file: path of the audio file to load.

        Returns:
            A 2-D array with the frames of all patches stacked vertically
            (one row per frame, one column per mel band).
        """
        signal = es.MonoLoader(filename=audio_file, sampleRate=self.sr)()

        patches = []
        patch_iter = es.FrameGenerator(
            signal, frameSize=self.patch_samples, hopSize=self.hop_samples
        )
        for patch_audio in patch_iter:
            frame_iter = es.FrameGenerator(
                patch_audio,
                frameSize=self.frame_size,
                hopSize=self.hop_size,
                validFrameThresholdRatio=0.5,
            )
            band_rows = []
            for frame in frame_iter:
                band_rows.append(self.mb(self.s(self.w(frame))))

            patches.append(self._patch_to_db(np.array(band_rows)))

        return np.vstack(patches)


class EmbeddingsOpenL3:
    """Runs a TensorFlow graph over the mel-spectrogram patches of an audio file.

    The mel spectrogram comes from ``MelSpectrogramOpenL3``; it is cut into
    fixed-size patches, fed to the graph in mini-batches, and the values of
    the ``embeddings`` output layer are stacked into one array.
    """

    def __init__(self, graph_path, hop_time=1, batch_size=60, melbands=128):
        """Configure the patch layout and load the graph.

        Args:
            graph_path: path to the ``.pb`` graph file.
            hop_time: forwarded to ``MelSpectrogramOpenL3``.
            batch_size: maximum number of patches per inference call.
            melbands: number of mel bands per frame (patch width).
        """
        self.graph_path = Path(graph_path)
        self.hop_time = hop_time
        self.batch_size = batch_size

        # Patch layout used when batching: x_size frames by y_size bands.
        self.x_size = 199
        self.y_size = melbands
        self.squeeze = False

        # Axis order applied to the (patch, 1, frame, band) tensor.
        self.permutation = [0, 3, 2, 1]

        self.input_layer = "melspectrogram"
        self.output_layer = "embeddings"

        self.mel_extractor = MelSpectrogramOpenL3(hop_time=self.hop_time)

        self.model = es.TensorflowPredict(
            graphFilename=str(self.graph_path),
            inputs=[self.input_layer],
            outputs=[self.output_layer],
            squeeze=self.squeeze,
        )

    def compute(self, audio_file):
        """Return the embeddings of ``audio_file``, one row per patch."""
        mel_spectrogram = self.mel_extractor.compute(audio_file)

        # in OpenL3 the hop size is computed in the feature extraction level,
        # so patches are taken back to back (hop equal to the patch length).
        patches = self.__melspectrogram_to_batch(mel_spectrogram, self.x_size)

        total = patches.shape[0]
        nbatches = int(np.ceil(total / self.batch_size))

        pool = Pool()
        outputs = []
        for index in range(nbatches):
            lo = index * self.batch_size
            hi = min(total, lo + self.batch_size)
            pool.set(self.input_layer, patches[lo:hi])
            result = self.model(pool)
            outputs.append(result[self.output_layer].squeeze())

        return np.vstack(outputs)

    def __melspectrogram_to_batch(self, melspectrogram, hop_time):
        """Slice the spectrogram into zero-padded patches and reorder the axes.

        ``hop_time`` is the distance between patch starts, counted in
        spectrogram rows.
        """
        n_frames = melspectrogram.shape[0]
        npatches = int(np.ceil((n_frames - self.x_size) / hop_time) + 1)
        batch = np.zeros([npatches, self.x_size, self.y_size], dtype="float32")

        for idx in range(npatches):
            start = idx * hop_time
            stop = min(start + self.x_size, n_frames)
            length = stop - start

            if length <= 0:
                # the last patch may be empty, remove it and exit the loop
                batch = np.delete(batch, idx, axis=0)
                break

            # A short final patch keeps its zero padding at the end.
            batch[idx, :length] = melspectrogram[start:stop]

        transpose = es.TensorTranspose(permutation=self.permutation)
        return transpose(np.expand_dims(batch, 1))

In [None]:
# Collect the .m4a files in the working directory. os.listdir() returns
# entries in arbitrary order, so the listing is sorted to keep the order in
# which songs are processed (and the dataset's row order) stable across runs.
files = sorted(os.listdir())
audio_file_list = [f for f in files if os.path.isfile(f) and f.endswith(".m4a")]

# JSON mapping whose keys become column-name prefixes and whose values are
# the base filenames (without extension) of the classifier files under `path`.
with open('weights_metadata_openl3.json') as json_file:
    model_weights_metadata = json.load(json_file)

# Directory holding the classifier graphs and their metadata.
path = "openL3/"
# NOTE(review): song_path is not referenced anywhere else in the visible notebook.
song_path = ''
# Graph used to compute the embeddings that the classifiers consume.
embedding_model_weights = "openL3/openl3-music-mel128-emb512-3.pb"

In [None]:
# --- Load every model exactly once -----------------------------------------
# Building a TensorFlow graph does not depend on the song being processed, so
# the embedding extractor and all classifiers are created before the loop
# instead of being rebuilt for each audio file.
extractor = EmbeddingsOpenL3(embedding_model_weights)

classification_models = {}
for key, value in model_weights_metadata.items():
    weight_file = path + value + ".pb"
    mdata_file = path + value + ".json"

    # Context manager so the metadata file handle is closed right away.
    with open(mdata_file, 'r') as mdata_fp:
        metadata = json.load(mdata_fp)

    # Layer names and class labels come from the model's own metadata.
    # (Named *_name so the builtin input() is not shadowed.)
    input_name = metadata['schema']['inputs'][0]['name']
    output_name = metadata['schema']['outputs'][0]['name']
    classes = metadata['classes']

    model = TensorflowPredict2D(graphFilename=weight_file, output=output_name, input=input_name)

    classification_models[key] = [model, classes]

# --- Column labels ----------------------------------------------------------
# Labels are "<model key>_<class name>" and depend only on the metadata, so
# they are built up front; the table then has its full header even when no
# audio file was found.
column_labels = ['song name']
for key, (_, classes) in classification_models.items():
    column_labels.extend(str(key) + "_" + str(label) for label in classes)

# --- Score every song -------------------------------------------------------
rows = []

for song in audio_file_list:
    embeddings = extractor.compute(song)

    # File name without its final extension.
    song_name = song.rsplit('.', 1)[0]
    scores = [song_name]

    for key, (model, classes) in classification_models.items():
        # Average the per-patch predictions into one score per class.
        predictions = np.mean(model(embeddings), axis=0)

        if len(predictions) != len(classes):
            raise ValueError(
                "model '{}' returned {} scores but its metadata lists {} classes".format(
                    key, len(predictions), len(classes)
                )
            )

        scores.extend(predictions)

    rows.append(scores)

In [None]:
# Assemble the collected rows (song name followed by its scores) into a
# table whose header is column_labels, then display it.
df = pd.DataFrame(rows, columns=column_labels)
df

Unnamed: 0,song name,danceability_danceable,danceability_not_danceable,mood_acoustic_acoustic,mood_acoustic_non_acoustic,mood_aggressive_aggressive,mood_aggressive_not_aggressive,mood_electronic_electronic,mood_electronic_non_electronic,mood_happy_happy,mood_happy_non_happy,mood_party_non_party,mood_party_party,mood_relaxed_non_relaxed,mood_relaxed_relaxed,mood_sad_non_sad,mood_sad_sad,tonal_atonal_tonal,tonal_atonal_atonal,voice_gender_female,voice_gender_male,voice_instrumental_instrumental,voice_instrumental_voice
0,"Kesariya - Pritam, Arijit Singh, Amitabh Bhatt...",0.327049,0.672951,0.366784,0.633216,0.023151,0.976849,0.116046,0.883954,0.272419,0.727581,0.658665,0.341335,0.118784,0.881216,0.186672,0.813327,0.812663,0.187337,0.385912,0.614088,0.142868,0.857132
1,"Tumhi Dekho Naa - Shankar-Ehsaan-Loy, Sonu Nig...",0.158344,0.841655,0.569611,0.430389,0.008512,0.991488,0.155452,0.844548,0.144596,0.855404,0.778762,0.221238,0.025113,0.974887,0.095975,0.904025,0.655382,0.344618,0.371912,0.628088,0.224780,0.775220
2,"Yaar Intha - Haricharan, Yuvan Shankar Raja",0.501876,0.498124,0.459691,0.540309,0.025964,0.974036,0.216236,0.783763,0.284831,0.715168,0.606585,0.393415,0.123932,0.876068,0.333369,0.666631,0.772035,0.227965,0.372645,0.627354,0.235784,0.764216
3,Masakali Lofi - Aelo,0.274075,0.725925,0.611383,0.388617,0.013785,0.986215,0.286667,0.713333,0.086680,0.913321,0.918501,0.081499,0.017125,0.982875,0.142679,0.857321,0.820264,0.179736,0.594566,0.405434,0.199318,0.800682
4,"Mounam Chorum Neram - Rinu Razak, Shaan Rahman",0.222101,0.777899,0.639006,0.360994,0.007554,0.992445,0.112350,0.887650,0.147789,0.852211,0.822626,0.177375,0.027179,0.972821,0.094230,0.905770,0.708135,0.291865,0.384298,0.615702,0.148614,0.851386
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
188,Blue Jeans (Remastered 2011) - Lana Del Rey,0.190522,0.809478,0.192048,0.807952,0.179450,0.820550,0.107247,0.892753,0.224718,0.775282,0.565376,0.434625,0.236674,0.763326,0.380252,0.619748,0.918202,0.081798,0.373277,0.626724,0.086718,0.913281
189,"Naatu Naatu - Rahul Sipligunj, Kaala Bhairava,...",0.550202,0.449798,0.066733,0.933267,0.304841,0.695159,0.517704,0.482296,0.117790,0.882211,0.184868,0.815132,0.332171,0.667829,0.620437,0.379563,0.836728,0.163272,0.304262,0.695737,0.420082,0.579918
190,Radioactive - Imagine Dragons,0.292800,0.707200,0.201368,0.798633,0.348434,0.651566,0.195731,0.804269,0.165367,0.834633,0.562312,0.437688,0.307172,0.692828,0.500451,0.499549,0.832805,0.167195,0.310269,0.689731,0.186443,0.813557
191,"Gaandakannazhagi - Anirudh Ravichander, Neeti ...",0.427049,0.572950,0.218301,0.781699,0.154261,0.845739,0.191794,0.808205,0.307215,0.692784,0.292882,0.707118,0.391425,0.608575,0.521874,0.478126,0.838267,0.161733,0.245630,0.754369,0.157606,0.842394


In [None]:
# Keep a reference to the full set of column labels before any trimming.
original_labels = df.columns

# Columns to remove from the table.
columns_to_drop = [
    'danceability_not_danceable',
    'mood_acoustic_non_acoustic',
    'mood_aggressive_not_aggressive',
    'mood_electronic_non_electronic',
    'mood_happy_non_happy',
    'mood_party_non_party',
    'mood_relaxed_non_relaxed',
    'mood_sad_non_sad',
    'tonal_atonal_atonal',
]

# Shorter names for the columns that are kept.
rename_labels = {
    'danceability_danceable': 'danceability',
    'mood_acoustic_acoustic': 'mood_acoustic',
    'mood_aggressive_aggressive': 'mood_aggressive',
    'mood_electronic_electronic': 'mood_electronic',
    'mood_happy_happy': 'mood_happy',
    'mood_party_party': 'mood_party',
    'mood_relaxed_relaxed': 'mood_relaxed',
    'mood_sad_sad': 'mood_sad',
    'voice_instrumental_instrumental': 'instrumental',
    'voice_instrumental_voice': 'voice',
    'voice_gender_female': 'voice_female',
    'voice_gender_male': 'voice_male',
    'tonal_atonal_tonal': 'tonal',
}

In [None]:
# Reassign instead of mutating in place. With inplace=True a second run of
# this cell raised KeyError because the columns were already gone;
# errors="ignore" skips columns that are no longer present and rename()
# ignores keys that are missing, so re-running the cell is a no-op.
df = df.drop(columns=columns_to_drop, errors="ignore").rename(columns=rename_labels)

In [None]:
# Display the table after the column drop and rename.
df

Unnamed: 0,song name,danceability,mood_acoustic,mood_aggressive,mood_electronic,mood_happy,mood_party,mood_relaxed,mood_sad,tonal,voice_female,voice_male,instrumental,voice
0,"Kesariya - Pritam, Arijit Singh, Amitabh Bhatt...",0.327049,0.366784,0.023151,0.116046,0.272419,0.341335,0.881216,0.813327,0.812663,0.385912,0.614088,0.142868,0.857132
1,"Tumhi Dekho Naa - Shankar-Ehsaan-Loy, Sonu Nig...",0.158344,0.569611,0.008512,0.155452,0.144596,0.221238,0.974887,0.904025,0.655382,0.371912,0.628088,0.224780,0.775220
2,"Yaar Intha - Haricharan, Yuvan Shankar Raja",0.501876,0.459691,0.025964,0.216236,0.284831,0.393415,0.876068,0.666631,0.772035,0.372645,0.627354,0.235784,0.764216
3,Masakali Lofi - Aelo,0.274075,0.611383,0.013785,0.286667,0.086680,0.081499,0.982875,0.857321,0.820264,0.594566,0.405434,0.199318,0.800682
4,"Mounam Chorum Neram - Rinu Razak, Shaan Rahman",0.222101,0.639006,0.007554,0.112350,0.147789,0.177375,0.972821,0.905770,0.708135,0.384298,0.615702,0.148614,0.851386
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
188,Blue Jeans (Remastered 2011) - Lana Del Rey,0.190522,0.192048,0.179450,0.107247,0.224718,0.434625,0.763326,0.619748,0.918202,0.373277,0.626724,0.086718,0.913281
189,"Naatu Naatu - Rahul Sipligunj, Kaala Bhairava,...",0.550202,0.066733,0.304841,0.517704,0.117790,0.815132,0.667829,0.379563,0.836728,0.304262,0.695737,0.420082,0.579918
190,Radioactive - Imagine Dragons,0.292800,0.201368,0.348434,0.195731,0.165367,0.437688,0.692828,0.499549,0.832805,0.310269,0.689731,0.186443,0.813557
191,"Gaandakannazhagi - Anirudh Ravichander, Neeti ...",0.427049,0.218301,0.154261,0.191794,0.307215,0.707118,0.608575,0.478126,0.838267,0.245630,0.754369,0.157606,0.842394


In [None]:
# Write the table to song_dataset_openl3.csv in the working directory;
# index=False leaves the DataFrame's row index out of the file.
df.to_csv('song_dataset_openl3.csv', index=False)