In [45]:
from pathlib import Path
import essentia.standard as es
import numpy as np
from essentia import Pool

class MelSpectrogramOpenL3:
    """Turn an audio file into the dB-scaled mel-band patches fed to OpenL3.

    The signal is cut into one-second patches whose starts are ``hop_time``
    seconds apart; every patch is analysed and normalised on its own.
    """

    def __init__(self, hop_time):
        """Store the analysis constants and build the essentia algorithms.

        Args:
            hop_time: spacing, in seconds, between consecutive patches.
        """
        self.hop_time = hop_time

        self.sr = 48000
        self.n_mels = 128
        self.frame_size = 2048
        self.hop_size = 242
        self.a_min = 1e-10
        self.d_range = 80
        self.db_ref = 1.0

        # A patch always spans one second of audio; only the hop is configurable.
        self.patch_samples = int(1 * self.sr)
        self.hop_samples = int(self.hop_time * self.sr)

        self.w = es.Windowing(size=self.frame_size, normalized=False)
        self.s = es.Spectrum(size=self.frame_size)

        mel_options = {
            "highFrequencyBound": self.sr / 2,
            "inputSize": self.frame_size // 2 + 1,
            "log": False,
            "lowFrequencyBound": 0,
            "normalize": "unit_tri",
            "numberBands": self.n_mels,
            "sampleRate": self.sr,
            "type": "magnitude",
            "warpingFormula": "slaneyMel",
            "weighting": "linear",
        }
        self.mb = es.MelBands(**mel_options)

    def _patch_to_db(self, audio_chunk):
        """Return the normalised dB mel bands of one audio patch."""
        frames = es.FrameGenerator(
            audio_chunk,
            frameSize=self.frame_size,
            hopSize=self.hop_size,
            validFrameThresholdRatio=0.5,
        )
        mel = np.array([self.mb(self.s(self.w(frame))) for frame in frames])

        # Convert to dB relative to db_ref, flooring tiny values at a_min.
        decibels = 10.0 * np.log10(np.maximum(self.a_min, mel))
        decibels -= 10.0 * np.log10(np.maximum(self.a_min, self.db_ref))
        # Limit the dynamic range to d_range below the patch peak, then shift
        # the peak to 0 dB.
        decibels = np.maximum(decibels, decibels.max() - self.d_range)
        decibels -= np.max(decibels)
        return decibels

    def compute(self, audio_file):
        """Load ``audio_file`` at ``self.sr`` and return its stacked patches.

        Args:
            audio_file: path handed to ``es.MonoLoader``.

        Returns:
            The per-patch dB mel bands stacked vertically with ``np.vstack``.
        """
        audio = es.MonoLoader(filename=audio_file, sampleRate=self.sr)()
        patches = es.FrameGenerator(
            audio, frameSize=self.patch_samples, hopSize=self.hop_samples
        )
        return np.vstack([self._patch_to_db(patch) for patch in patches])


class EmbeddingsOpenL3:
    """Run an OpenL3 TensorFlow graph over the mel patches of an audio file."""

    def __init__(self, graph_path, hop_time=1, batch_size=60, melbands=128):
        """Load the graph and prepare the mel-spectrogram extractor.

        Args:
            graph_path: location of the frozen ``.pb`` graph.
            hop_time: forwarded to ``MelSpectrogramOpenL3``.
            batch_size: maximum number of patches sent to the graph per call.
            melbands: number of mel bands per frame (patch width).
        """
        self.hop_time = hop_time
        self.batch_size = batch_size

        self.graph_path = Path(graph_path)

        # Patch geometry: x_size frames by y_size mel bands.
        self.x_size = 199
        self.y_size = melbands
        self.squeeze = False

        self.permutation = [0, 3, 2, 1]

        self.input_layer = "melspectrogram"
        self.output_layer = "embeddings"

        self.mel_extractor = MelSpectrogramOpenL3(hop_time=self.hop_time)

        self.model = es.TensorflowPredict(
            graphFilename=str(self.graph_path),
            inputs=[self.input_layer],
            outputs=[self.output_layer],
            squeeze=self.squeeze,
        )

    def compute(self, audio_file):
        """Return the embeddings of ``audio_file``, stacked with ``np.vstack``."""
        mel_spectrogram = self.mel_extractor.compute(audio_file)

        # The temporal hop was already applied by the mel extractor, so the
        # spectrogram is simply cut into back-to-back patches of x_size frames.
        batch = self.__melspectrogram_to_batch(mel_spectrogram, self.x_size)

        pool = Pool()
        embeddings = []
        total = batch.shape[0]
        step = self.batch_size
        for index in range(int(np.ceil(total / step))):
            # Slicing clips at the end of the array, so the last batch may be short.
            pool.set(self.input_layer, batch[index * step:(index + 1) * step])
            result = self.model(pool)
            embeddings.append(result[self.output_layer].squeeze())

        return np.vstack(embeddings)

    def __melspectrogram_to_batch(self, melspectrogram, hop_time):
        """Cut ``melspectrogram`` into zero-padded patches for the graph.

        ``hop_time`` is used as a row offset into ``melspectrogram``, i.e. it
        is counted in spectrogram frames.
        """
        n_frames = melspectrogram.shape[0]
        npatches = int(np.ceil((n_frames - self.x_size) / hop_time) + 1)
        batch = np.zeros([npatches, self.x_size, self.y_size], dtype="float32")

        for index in range(npatches):
            begin = index * hop_time
            stop = min(begin + self.x_size, n_frames)

            if stop <= begin:
                # Nothing left for this final patch: drop it and stop.
                batch = np.delete(batch, index, axis=0)
                break

            batch[index, :stop - begin] = melspectrogram[begin:stop]

        batch = np.expand_dims(batch, 1)
        return es.TensorTranspose(permutation=self.permutation)(batch)

In [None]:
import pandas as pd
import json
from essentia.standard import MonoLoader, TensorflowPredictEffnetDiscogs, TensorflowPredict2D

In [46]:
# Song catalogue; its 'song name' and 'file_path' columns are used below.
df_django = pd.read_csv('data/songs_db.csv')
# Feature table produced by earlier runs; new rows are appended to it later.
df_features = pd.read_csv('data/song_dataset_final.csv')

In [47]:
# Keep only catalogue songs whose name is not yet in processed_songs.csv,
# and collect their file paths for extraction.
processed_songs = pd.read_csv('data/processed_songs.csv')
df_to_extract = df_django.loc[~df_django['song name'].isin(processed_songs['song name'])]
audio_file_list = df_to_extract.loc[:, 'file_path']

In [49]:
# Classifier registry: each key maps to a sequence whose first item is the
# embedding type and whose second item is the classifier file stem.
# The with-statement closes the file, so no explicit close() is needed.
with open('data/weights_metadata.json') as json_file:
    model_weights_metadata = json.load(json_file)

# Directory holding the classifier graphs (.pb) and their metadata (.json).
path = "data/all_classifiers_and_metadata/"
# Prefix prepended to every song's file_path before loading.
song_path = 'media/'
# Embedding graphs: OpenL3 and Discogs-EffNet.
embedding_model_weights_l3 = path + "openl3-music-mel128-emb512-3.pb"
embedding_model_weights_dsg = path + "discogs-effnet-bs64-1.pb"

In [50]:
def _load_classification_models():
    """Build every classifier listed in ``model_weights_metadata`` once.

    Reads the module-level ``model_weights_metadata`` and ``path``.

    Returns:
        dict mapping each metadata key to ``(model, model_type, classes)``.
    """
    classifiers = {}
    for key in model_weights_metadata:
        entry = model_weights_metadata[key]
        model_type = entry[0]
        file_name = entry[1]

        # Close the metadata file deterministically instead of leaking the handle.
        with open(path + file_name + ".json", 'r') as mdata_file:
            metadata = json.load(mdata_file)

        model = TensorflowPredict2D(
            graphFilename=path + file_name + ".pb",
            output=metadata['schema']['outputs'][0]['name'],
            input=metadata['schema']['inputs'][0]['name'],
        )
        classifiers[key] = (model, model_type, metadata['classes'])
    return classifiers


def extract_all_features(all_songs):
    """Score every song with all registered classifiers.

    The embedding extractors and classifier graphs are loaded once and reused
    for all songs (previously they were rebuilt for every song).

    Relies on module-level names: ``song_path``, ``df_to_extract``,
    ``embedding_model_weights_l3``, ``embedding_model_weights_dsg``,
    ``model_weights_metadata`` and ``path``.

    Args:
        all_songs: iterable of file paths relative to ``song_path``; each must
            appear in ``df_to_extract['file_path']``.

    Returns:
        ``(column_labels, rows)``. ``column_labels`` starts with 'song name'
        followed by one '<key>_<class>' label per prediction. Each row holds
        the song name followed by the clip-averaged predictions.
    """
    column_labels = ['song name']
    rows = []

    all_songs = list(all_songs)
    if not all_songs:
        # Nothing to do: avoid loading any model.
        return column_labels, rows

    extractor = EmbeddingsOpenL3(embedding_model_weights_l3)
    embedding_model = TensorflowPredictEffnetDiscogs(
        graphFilename=embedding_model_weights_dsg, output="PartitionedCall:1"
    )
    classification_models = _load_classification_models()

    for song in all_songs:
        audio_path = song_path + song
        embeddings_l3 = extractor.compute(audio_path)

        audio = MonoLoader(filename=audio_path, sampleRate=44100, resampleQuality=4)()
        embeddings_dsg = embedding_model(audio)

        scores = [df_to_extract[df_to_extract['file_path'] == song]['song name'].values[0]]

        for key, (model, model_type, classes) in classification_models.items():
            embeddings = embeddings_l3 if model_type == "openl3" else embeddings_dsg
            # Average the per-patch predictions into one value per class.
            predictions = np.mean(model(embeddings), axis=0)

            if not rows:
                # The first song defines the column layout.
                column_labels.extend(
                    str(key) + "_" + str(classes[i]) for i in range(len(predictions))
                )
            scores.extend(predictions)

        rows.append(scores)
    return column_labels, rows

In [51]:
# Run the extraction for every pending song; rows holds one score list per song.
column_labels, rows = extract_all_features(audio_file_list)

[   INFO   ] TensorflowPredict: Successfully loaded graph file: `data/all_classifiers_and_metadata/openl3-music-mel128-emb512-3.pb`
[   INFO   ] TensorflowPredict: Successfully loaded graph file: `data/all_classifiers_and_metadata/discogs-effnet-bs64-1.pb`
[   INFO   ] TensorflowPredict: Successfully loaded graph file: `data/all_classifiers_and_metadata/danceability-openl3-music-mel128-emb512-1.pb`
[   INFO   ] TensorflowPredict: Successfully loaded graph file: `data/all_classifiers_and_metadata/mood_acoustic-openl3-music-mel128-emb512-1.pb`
[   INFO   ] TensorflowPredict: Successfully loaded graph file: `data/all_classifiers_and_metadata/mood_aggressive-openl3-music-mel128-emb512-1.pb`
[   INFO   ] TensorflowPredict: Successfully loaded graph file: `data/all_classifiers_and_metadata/mood_electronic-openl3-music-mel128-emb512-1.pb`
[   INFO   ] TensorflowPredict: Successfully loaded graph file: `data/all_classifiers_and_metadata/mood_happy-openl3-music-mel128-emb512-1.pb`
[   INFO   ] 

In [52]:
# One row per extracted song, one column per label returned by the extraction.
new_df = pd.DataFrame(rows, columns=column_labels)

In [53]:
# Genre columns removed from the feature table.
genre_to_drop = [  'Genre_experimental', 'Genre_alternative', 'Genre_soundtrack',  'Genre_newage', 'Genre_psychedelic', 'Genre_world', 'Genre_singersongwriter', 
'Genre_minimal', 'Genre_progressive', 'Genre_contemporary', 'Genre_grunge', 'Genre_rnb', 'Genre_dance', 'Genre_idm', 'Genre_90s', 'Genre_soul', 'Genre_chanson', 
'Genre_60s', 'Genre_newwave', 'Genre_worldfusion', 'Genre_celtic', 'Genre_alternativerock', 'Genre_electronica', 'Genre_improvisation', 'Genre_80s', 
'Genre_edm', 'Genre_latin', 'Genre_hard','Genre_70s', 'Genre_swing', 'Genre_bossanova', 'Genre_eurodance']

# Tag columns removed from the feature table.
tags_to_drop = [  'Tag_energetic', 'Tag_trance', 'Tag_dance',  'Tag_happy', 'Tag_experimental', 'Tag_soundtrack', 'Tag_alternative', 
'Tag_world', 'Tag_lounge', 'Tag_voice', 'Tag_computer']

# Instrument columns removed from the feature table.
instruments_to_drop = ['Instrument_bell', 'Instrument_bongo', 'Instrument_clarinet', 'Instrument_pad', 'Instrument_voice',
        'Instrument_oboe', 'Instrument_rhodes',  'Instrument_computer',
        'Instrument_horn', 'Instrument_viola', 'Instrument_sampler']

# Remaining columns to remove; the matching counterpart of each is kept and
# renamed through rename_labels below.
other_columns_to_drop = ['danceability_not_danceable', 'mood_acoustic_non_acoustic', 'mood_aggressive_not_aggressive', 'mood_electronic_non_electronic', 
                         'mood_happy_non_happy', 'mood_party_non_party', 'mood_relaxed_non_relaxed', 'mood_sad_non_sad',  'tonal_atonal_atonal', 'voice_gender_male', 
                         'voice_instrumental_instrumental', 'Nsynth_Reverb_dry']

# Shorter names for the columns that are kept.
rename_labels = {'danceability_danceable':'danceability', 'mood_acoustic_acoustic':'mood_acoustic', 'mood_aggressive_aggressive':'mood_aggressive', 
                 'mood_electronic_electronic':'mood_electronic', 'mood_happy_happy':'mood_happy', 'mood_party_party':'mood_party', 'mood_relaxed_relaxed':'mood_relaxed', 
                 'mood_sad_sad':'mood_sad', 'voice_instrumental_voice':'overall_voice', 'voice_gender_female':'voice_female', 
                 'tonal_atonal_tonal':'tonal','Engagement_engagement':'Engagement', 'Nsynth_Reverb_wet':'Reverb_wet'
                 }

# Every column that gets dropped, in a single list.
all_columns_to_drop = genre_to_drop + tags_to_drop + instruments_to_drop + other_columns_to_drop

In [54]:
# Remove the unwanted columns, shorten the remaining labels, then display.
new_df = new_df.drop(columns=all_columns_to_drop).rename(columns=rename_labels)
new_df

Unnamed: 0,song name,danceability,mood_acoustic,mood_aggressive,mood_electronic,mood_happy,mood_party,mood_relaxed,mood_sad,tonal,...,Tag_electricpiano,Tag_guitar,Tag_keyboard,Tag_piano,Tag_strings,Tag_synthesizer,Tag_violin,Tag_emotional,Tag_film,Tag_relaxing
0,HITMAN 2 Soundtrack - Main Menu,0.162913,0.423373,0.025627,0.490927,0.030353,0.045771,0.985529,0.88416,0.570752,...,0.008138,0.028909,0.012961,0.140087,0.026664,0.175702,0.024543,0.034719,0.0628,0.062103
1,HITMAN 2 Soundtrack - Results Screen,0.346552,0.143948,0.012917,0.685958,0.004539,0.157035,0.99387,0.948334,0.512509,...,0.014789,0.039162,0.019315,0.154389,0.024648,0.15069,0.02571,0.036803,0.052089,0.071415


# Picking top Genres and Tags

In [55]:
# Column names are grouped by prefix: "Genre..." and "Tag...".
genre_columns = [col for col in new_df.columns if col.startswith("Genre")]
tag_columns = [col for col in new_df.columns if col.startswith("Tag")]

In [56]:
# Transpose so each song becomes a column and each genre/tag label a row.
genre_data = new_df[genre_columns].T
tag_data = new_df[tag_columns].T

In [57]:
# For every song (a column of the transposed tables) keep the labels of the
# three highest-scoring genres and the five highest-scoring tags.
top_genres = [
    genre_data[song].nlargest(3).index.to_list() for song in genre_data.columns
]
top_tags = [
    tag_data[song].nlargest(5).index.to_list() for song in genre_data.columns
]

In [58]:
# Attach the per-song label lists as two new columns.
new_df['top_genres'] = top_genres
new_df['top_tags'] = top_tags

In [59]:
# Display the table, now including the top_genres and top_tags columns.
new_df

Unnamed: 0,song name,danceability,mood_acoustic,mood_aggressive,mood_electronic,mood_happy,mood_party,mood_relaxed,mood_sad,tonal,...,Tag_keyboard,Tag_piano,Tag_strings,Tag_synthesizer,Tag_violin,Tag_emotional,Tag_film,Tag_relaxing,top_genres,top_tags
0,HITMAN 2 Soundtrack - Main Menu,0.162913,0.423373,0.025627,0.490927,0.030353,0.045771,0.985529,0.88416,0.570752,...,0.012961,0.140087,0.026664,0.175702,0.024543,0.034719,0.0628,0.062103,"[Genre_ambient, Genre_electronic, Genre_classi...","[Tag_ambient, Tag_electronic, Tag_classical, T..."
1,HITMAN 2 Soundtrack - Results Screen,0.346552,0.143948,0.012917,0.685958,0.004539,0.157035,0.99387,0.948334,0.512509,...,0.019315,0.154389,0.024648,0.15069,0.02571,0.036803,0.052089,0.071415,"[Genre_ambient, Genre_electronic, Genre_classi...","[Tag_electronic, Tag_ambient, Tag_classical, T..."


# Updating CSV files

In [60]:
# Names present in new_df but absent from the stored feature table, and the
# full rows that belong to them.
right_exclusive_join = new_df.loc[
    ~new_df['song name'].isin(df_features['song name']), 'song name'
]
new_entries = new_df.loc[new_df['song name'].isin(right_exclusive_join)]

In [61]:
# Append the new rows, order everything by song name and renumber the index.
updated_dataframe = (
    pd.concat([df_features, new_entries], ignore_index=True)
    .sort_values('song name')
    .reset_index(drop=True)
)

In [65]:
# Persist the list of processed song names first, then the full feature table.
updated_dataframe[['song name']].to_csv('data/processed_songs.csv', index=False)
updated_dataframe.to_csv('data/song_dataset_final.csv', index=False)

# Completion

In [None]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity
from pathlib import Path
import essentia.standard as es
from essentia.standard import TensorflowPredict2D
import numpy as np
from essentia import Pool
import json

In [None]:
class MelSpectrogramOpenL3:
    """Turn an audio file into the dB-scaled mel-band patches fed to OpenL3.

    The signal is cut into one-second patches whose starts are ``hop_time``
    seconds apart; every patch is analysed and normalised on its own.
    """

    def __init__(self, hop_time):
        """Store the analysis constants and build the essentia algorithms.

        Args:
            hop_time: spacing, in seconds, between consecutive patches.
        """
        self.hop_time = hop_time

        self.sr = 48000
        self.n_mels = 128
        self.frame_size = 2048
        self.hop_size = 242
        self.a_min = 1e-10
        self.d_range = 80
        self.db_ref = 1.0

        # A patch always spans one second of audio; only the hop is configurable.
        self.patch_samples = int(1 * self.sr)
        self.hop_samples = int(self.hop_time * self.sr)

        self.w = es.Windowing(size=self.frame_size, normalized=False)
        self.s = es.Spectrum(size=self.frame_size)

        mel_options = {
            "highFrequencyBound": self.sr / 2,
            "inputSize": self.frame_size // 2 + 1,
            "log": False,
            "lowFrequencyBound": 0,
            "normalize": "unit_tri",
            "numberBands": self.n_mels,
            "sampleRate": self.sr,
            "type": "magnitude",
            "warpingFormula": "slaneyMel",
            "weighting": "linear",
        }
        self.mb = es.MelBands(**mel_options)

    def _patch_to_db(self, audio_chunk):
        """Return the normalised dB mel bands of one audio patch."""
        frames = es.FrameGenerator(
            audio_chunk,
            frameSize=self.frame_size,
            hopSize=self.hop_size,
            validFrameThresholdRatio=0.5,
        )
        mel = np.array([self.mb(self.s(self.w(frame))) for frame in frames])

        # Convert to dB relative to db_ref, flooring tiny values at a_min.
        decibels = 10.0 * np.log10(np.maximum(self.a_min, mel))
        decibels -= 10.0 * np.log10(np.maximum(self.a_min, self.db_ref))
        # Limit the dynamic range to d_range below the patch peak, then shift
        # the peak to 0 dB.
        decibels = np.maximum(decibels, decibels.max() - self.d_range)
        decibels -= np.max(decibels)
        return decibels

    def compute(self, audio_file):
        """Load ``audio_file`` at ``self.sr`` and return its stacked patches.

        Args:
            audio_file: path handed to ``es.MonoLoader``.

        Returns:
            The per-patch dB mel bands stacked vertically with ``np.vstack``.
        """
        audio = es.MonoLoader(filename=audio_file, sampleRate=self.sr)()
        patches = es.FrameGenerator(
            audio, frameSize=self.patch_samples, hopSize=self.hop_samples
        )
        return np.vstack([self._patch_to_db(patch) for patch in patches])


class EmbeddingsOpenL3:
    """Run an OpenL3 TensorFlow graph over the mel patches of an audio file."""

    def __init__(self, graph_path, hop_time=1, batch_size=60, melbands=128):
        """Load the graph and prepare the mel-spectrogram extractor.

        Args:
            graph_path: location of the frozen ``.pb`` graph.
            hop_time: forwarded to ``MelSpectrogramOpenL3``.
            batch_size: maximum number of patches sent to the graph per call.
            melbands: number of mel bands per frame (patch width).
        """
        self.hop_time = hop_time
        self.batch_size = batch_size

        self.graph_path = Path(graph_path)

        # Patch geometry: x_size frames by y_size mel bands.
        self.x_size = 199
        self.y_size = melbands
        self.squeeze = False

        self.permutation = [0, 3, 2, 1]

        self.input_layer = "melspectrogram"
        self.output_layer = "embeddings"

        self.mel_extractor = MelSpectrogramOpenL3(hop_time=self.hop_time)

        self.model = es.TensorflowPredict(
            graphFilename=str(self.graph_path),
            inputs=[self.input_layer],
            outputs=[self.output_layer],
            squeeze=self.squeeze,
        )

    def compute(self, audio_file):
        """Return the embeddings of ``audio_file``, stacked with ``np.vstack``."""
        mel_spectrogram = self.mel_extractor.compute(audio_file)

        # The temporal hop was already applied by the mel extractor, so the
        # spectrogram is simply cut into back-to-back patches of x_size frames.
        batch = self.__melspectrogram_to_batch(mel_spectrogram, self.x_size)

        pool = Pool()
        embeddings = []
        total = batch.shape[0]
        step = self.batch_size
        for index in range(int(np.ceil(total / step))):
            # Slicing clips at the end of the array, so the last batch may be short.
            pool.set(self.input_layer, batch[index * step:(index + 1) * step])
            result = self.model(pool)
            embeddings.append(result[self.output_layer].squeeze())

        return np.vstack(embeddings)

    def __melspectrogram_to_batch(self, melspectrogram, hop_time):
        """Cut ``melspectrogram`` into zero-padded patches for the graph.

        ``hop_time`` is used as a row offset into ``melspectrogram``, i.e. it
        is counted in spectrogram frames.
        """
        n_frames = melspectrogram.shape[0]
        npatches = int(np.ceil((n_frames - self.x_size) / hop_time) + 1)
        batch = np.zeros([npatches, self.x_size, self.y_size], dtype="float32")

        for index in range(npatches):
            begin = index * hop_time
            stop = min(begin + self.x_size, n_frames)

            if stop <= begin:
                # Nothing left for this final patch: drop it and stop.
                batch = np.delete(batch, index, axis=0)
                break

            batch[index, :stop - begin] = melspectrogram[begin:stop]

        batch = np.expand_dims(batch, 1)
        return es.TensorTranspose(permutation=self.permutation)(batch)

In [None]:
# Directory holding the OpenL3 classifier graphs (.pb) and their metadata (.json).
path = "data/openL3/"
# Prefix prepended to every song's file_path before loading.
song_path = 'media/'
# OpenL3 embedding graph handed to EmbeddingsOpenL3.
embedding_model_weights = "data/openL3/openl3-music-mel128-emb512-3.pb"

def _load_openl3_classifiers(weights_metadata, base_path):
    """Build every OpenL3 classifier listed in ``weights_metadata`` once.

    Args:
        weights_metadata: mapping of label prefix -> file stem (no extension).
        base_path: prefix joined with each stem to locate the ``.pb`` graph
            and its ``.json`` metadata.

    Returns:
        dict mapping each key to a ``(model, classes)`` pair.
    """
    classifiers = {}
    for key, file_stem in weights_metadata.items():
        # Close the metadata file deterministically instead of leaking the handle.
        with open(base_path + file_stem + ".json", 'r') as mdata_file:
            metadata = json.load(mdata_file)

        model = TensorflowPredict2D(
            graphFilename=base_path + file_stem + ".pb",
            output=metadata['schema']['outputs'][0]['name'],
            input=metadata['schema']['inputs'][0]['name'],
        )
        classifiers[key] = (model, metadata['classes'])
    return classifiers


def extract_features(all_songs, df_extract):
    """Score every song with all OpenL3 classifiers.

    The embedding extractor and the classifier graphs are loaded once and
    reused for all songs (previously they were rebuilt for every song).

    Relies on the module-level ``path``, ``song_path`` and
    ``embedding_model_weights``.

    Args:
        all_songs: iterable of file paths relative to ``song_path``.
        df_extract: DataFrame with 'file_path' and 'song name' columns, used
            to look up the name of each song.

    Returns:
        ``(column_labels, rows)``. ``column_labels`` starts with 'song name'
        followed by one '<key>_<class>' label per prediction. Each row holds
        the song name followed by the clip-averaged predictions.
    """
    column_labels = ['song name']
    rows = []

    all_songs = list(all_songs)
    if not all_songs:
        # Nothing to do: avoid touching the metadata or model files.
        return column_labels, rows

    with open('data/openL3/weights_metadata_openl3.json') as json_file:
        model_weights_metadata = json.load(json_file)

    extractor = EmbeddingsOpenL3(embedding_model_weights)
    classification_models = _load_openl3_classifiers(model_weights_metadata, path)

    for song in all_songs:
        embeddings = extractor.compute(song_path + song)

        scores = [df_extract[df_extract['file_path'] == song]['song name'].values[0]]

        for key, (model, classes) in classification_models.items():
            # Average the per-patch predictions into one value per class.
            predictions = np.mean(model(embeddings), axis=0)

            if not rows:
                # The first song defines the column layout.
                column_labels.extend(
                    str(key) + "_" + str(classes[i]) for i in range(len(predictions))
                )
            scores.extend(predictions)

        rows.append(scores)
    return column_labels, rows


# Columns removed from the freshly extracted table in train_model().
columns_to_drop = ['danceability_not_danceable', 'mood_acoustic_non_acoustic', 'mood_aggressive_not_aggressive', 'mood_electronic_non_electronic', 'mood_happy_non_happy', 'mood_party_non_party', 'mood_relaxed_non_relaxed', 'mood_sad_non_sad',  'tonal_atonal_atonal']
# Shorter names applied to the remaining columns in train_model().
rename_labels = {'danceability_danceable':'danceability', 'mood_acoustic_acoustic':'mood_acoustic', 'mood_aggressive_aggressive':'mood_aggressive', 'mood_electronic_electronic':'mood_electronic', 
                 'mood_happy_happy':'mood_happy', 'mood_party_party':'mood_party', 'mood_relaxed_relaxed':'mood_relaxed', 'mood_sad_sad':'mood_sad', 
                 'voice_instrumental_instrumental':'instrumental', 'voice_instrumental_voice':'voice', 'voice_gender_female':'voice_female', 'voice_gender_male':'voice_male',
                 'tonal_atonal_tonal':'tonal'
                 }

def train_model():
    """Extract features for unprocessed songs and update the CSV files.

    Reads the song catalogue, the stored OpenL3 feature table and the list of
    processed songs. Prints "terminating" and returns when no song is pending.
    Otherwise it extracts features for the pending songs, appends the rows
    whose name is not yet in the feature table, and rewrites both
    'data/song_dataset_openl3.csv' and 'data/processed_songs.csv'.
    """
    df_django = pd.read_csv('data/songs_db.csv')
    df_features = pd.read_csv('data/song_dataset_openl3.csv')
    processed_songs = pd.read_csv('data/processed_songs.csv')

    pending = ~df_django['song name'].isin(processed_songs['song name'])
    df_to_extract = df_django[pending]
    audio_file_list = df_to_extract['file_path']

    if len(audio_file_list) == 0:
        print("terminating")
        return

    column_labels, rows = extract_features(audio_file_list, df_to_extract)

    new_df = (
        pd.DataFrame(rows, columns=column_labels)
        .drop(columns=columns_to_drop)
        .rename(columns=rename_labels)
    )

    # Only keep rows whose song is not already in the stored feature table.
    unseen = ~new_df['song name'].isin(df_features['song name'])
    new_entries = new_df[unseen]

    updated_dataframe = (
        pd.concat([df_features, new_entries], ignore_index=True)
        .sort_values('song name')
        .reset_index(drop=True)
    )

    updated_dataframe.to_csv('data/song_dataset_openl3.csv', index=False)
    updated_dataframe[['song name']].to_csv('data/processed_songs.csv', index=False)

In [None]:
# Process any songs in the catalogue that have not been extracted yet.
train_model()

# Deleting entries for songs that are no longer in the database (e.g. after the song's model instance is deleted)

In [None]:
# Load the processed-song list, the song catalogue and the OpenL3 feature table.
df_processed = pd.read_csv('data/processed_songs.csv')
df_database = pd.read_csv('data/songs_db.csv')
df_features = pd.read_csv('data/song_dataset_openl3.csv')

In [None]:
# Row labels of entries whose song name no longer appears in the catalogue.
indices_to_delete = df_processed.loc[
    ~df_processed['song name'].isin(df_database['song name']), 'song name'
].index
indices_to_delete2 = df_features.loc[
    ~df_features['song name'].isin(df_database['song name']), 'song name'
].index

In [None]:
# Drop the stale rows from both tables in place and renumber their indexes.
for frame, stale_rows in (
    (df_processed, indices_to_delete),
    (df_features, indices_to_delete2),
):
    frame.drop(stale_rows, inplace=True)
    frame.reset_index(drop=True, inplace=True)