## Model Loader
Defines the model data loader and the modeling scenario, then walks through example scenarios.

In [1]:
import sys
import os
import numpy as np
import pandas as pd
sys.path.insert(0, '../../')
from library.notebook_api.data_loader import  ModelDataLoader,CombinedDataLoader

A scenario combines a specific subset of the data with a specific feature treatment, and trains a specific model for a specific type of classification.
1. The subset is informed by the genre analysis.

In [123]:
from configuration import  MODEL_INPUT_DATA_PATH
class ModelDataLoader():
    '''Reads one version of the model input data and exposes it as a DataFrame.

    Attributes set on construction:
        df: frame read from the parquet input for the requested version, with
            one extra column added per entry of ``feature_names``.
        feature_names: column names assigned, by position, to the entries of
            each row's ``features`` value.
        label_names: distinct values of the ``label`` column.
    '''

    def __init__(self, version='000'):
        # Names are positional: entry i names element i of each ``features`` value.
        self.feature_names = [
            'spectral_centroids_mean',
            'spectral_centroids_delta_mean',
            'spectral_centroids_accelerate_mean',
            'spectral_bandwidth_mean',
            'spectral_rolloff_mean',
            'zero_crossing_rate_mean',
            'rms_mean',
            'chroma_stft_mean',
            'mfccs_mean',
            'onset',
            'tempo',
            'contrast',
            'tonnetz',
            'mfccs_min',
            'mfccs_max',
        ]
        self.df = pd.read_parquet(f'{MODEL_INPUT_DATA_PATH}model_input_{version}')
        self.label_names = self.df['label'].unique()
        self.add_named_feature_columns()

    def add_named_feature_columns(self):
        '''Add one column per feature name, filled by position from ``features``.'''
        for position, column_name in enumerate(self.feature_names):
            # Bind the position as a default so the lambda carries its own index.
            self.df[column_name] = self.df['features'].map(
                lambda vector, position=position: vector[position]
            )

import altair as alt


class ModelScenario():
    '''A label-scoped view of the model input data, with inspection and sampling helpers.

    Attributes:
        model_data: the loader object supplying ``df`` and ``label_names``.
        label_names: labels in scope for this scenario.
        df: rows of ``model_data.df`` whose ``label`` is in ``label_names``.
    '''

    # Loader created on first default use and shared by scenarios built without one.
    _default_model_data_loader = None

    def __init__(self, model_data_loader=None, in_scope_labels=None):
        '''Set up the scenario.

        Args:
            model_data_loader: object exposing ``df`` and ``label_names``. When
                omitted, a ``ModelDataLoader('003')`` is created on first use and
                reused afterwards.
            in_scope_labels: iterable of labels to keep. When omitted, every
                label reported by the loader is in scope.
        '''
        if model_data_loader is None:
            # Built here rather than in the signature: a default written in the
            # signature is evaluated once, when the class is defined, which would
            # read the data file at definition time.
            if ModelScenario._default_model_data_loader is None:
                ModelScenario._default_model_data_loader = ModelDataLoader('003')
            model_data_loader = ModelScenario._default_model_data_loader
        self.model_data = model_data_loader

        # `is None`, not `== None`: the latter compares element-wise on numpy
        # arrays and makes the `if` raise for an array of several labels.
        if in_scope_labels is None:
            self.label_names = self.model_data.label_names
        else:
            self.label_names = in_scope_labels

        # Restrict the frame to the labels in scope so the helpers below only
        # see rows that belong to this scenario.
        full_df = self.model_data.df
        self.df = full_df[full_df.label.isin(self.label_names)]

    def get_class_distribution(self):
        '''Return each label's share of ``self.df`` as a percentage, one row per label.'''
        return pd.DataFrame(self.df['label'].value_counts(normalize=True) * 100).reset_index()

    def get_class_counts(self):
        '''Return the number of rows per label in ``self.df``, largest count first.'''
        return self.df.groupby('label')['label'].count().sort_values(ascending=False)

    def get_label_sample_df(self, df, label, sample_size, random_state=None):
        '''Return up to ``sample_size`` randomly chosen rows of ``df`` with the given label.

        Args:
            df: frame with a ``label`` column.
            label: label value to select.
            sample_size: number of rows wanted.
            random_state: optional seed forwarded to ``DataFrame.sample`` for
                reproducible draws.

        Returns:
            Every matching row when fewer than ``sample_size`` exist, otherwise a
            random sample of exactly ``sample_size`` rows.
        '''
        df_label = df[df.label == label]
        if sample_size > len(df_label):
            return df_label
        return df_label.sample(sample_size, random_state=random_state)

    def get_model_data_sampled_by_label(self, sample_size, random_state=None):
        '''Sample up to ``sample_size`` rows per in-scope label from the loader's full frame.

        Args:
            sample_size: maximum number of rows to draw for each label.
            random_state: optional seed applied to every per-label draw.

        Returns:
            The per-label samples stacked in ``label_names`` order, keeping the
            source index labels. An empty frame with the source columns when no
            labels are in scope.
        '''
        label_samples = [
            self.get_label_sample_df(self.model_data.df, label, sample_size, random_state)
            for label in self.label_names
        ]
        if not label_samples:
            # pd.concat raises on an empty list; return an empty slice instead.
            return self.model_data.df.iloc[0:0]
        return pd.concat(label_samples)

    def get_feature_distribution_by_label(self, feature_name):
        '''Return an Altair box plot of ``feature_name`` per label over ``self.df``.

        Whiskers extend to the minimum and maximum, and the y axis is not forced
        to include zero.
        '''
        chart = alt.Chart(self.df).mark_boxplot(extent="min-max").encode(
            alt.X("label:N"),
            alt.Y(feature_name).scale(zero=False),
            alt.Color("label:N").legend(None),
            )
        return chart
    



    
    

### Custom scenario example
A scenario restricted to a subset of labels.

In [124]:
# Build a scenario for a seven-label subset, using ModelScenario's default data loader.
in_scope_labels = ['rock', 'electronic', 'hiphop','international', 'classical', 'jazz','country']
s = ModelScenario(in_scope_labels = in_scope_labels)

In [125]:
# Replace the scenario frame with at most 300 rows per in-scope label.
# NOTE(review): no seed is passed, so the sampled rows differ between runs.
s.df = s.get_model_data_sampled_by_label(300)

In [114]:
# Rows per label in the scenario frame, largest count first.
s.get_class_counts()

label
classical        300
electronic       300
hiphop           300
jazz             300
international    300
rock             300
country          277
Name: label, dtype: int64

In [127]:
# Column names the loader assigns, by position, to the entries of each row's `features` value.
s.model_data.feature_names

['spectral_centroids_mean',
 'spectral_centroids_delta_mean',
 'spectral_centroids_accelerate_mean',
 'spectral_bandwidth_mean',
 'spectral_rolloff_mean',
 'zero_crossing_rate_mean',
 'rms_mean',
 'chroma_stft_mean',
 'mfccs_mean',
 'onset',
 'tempo',
 'contrast',
 'tonnetz',
 'mfccs_min',
 'mfccs_max']

In [126]:
# Box plot (whiskers at min/max) of the 'tempo' column for each label in s.df.
s.get_feature_distribution_by_label('tempo')

## Default Scenario

In [115]:
# No in_scope_labels given, so the scenario takes every label found in version '003' of the data.
s_default=ModelScenario(ModelDataLoader('003'))

In [116]:
# Labels in scope for the default scenario (the distinct values of the loader's label column).
s_default.label_names

array(['hiphop', 'pop', 'folk', 'experimental', 'rock', 'international',
       'electronic', 'instrumental', 'blues', 'classical', 'country',
       'soulrnb', 'jazz', 'spoken', 'oldtime / historic',
       'easy listening'], dtype=object)

### Default model scenario

In [108]:
# Share of rows per label, expressed as a percentage.
s_default.get_class_distribution()

Unnamed: 0,label,proportion
0,rock,28.081454
1,electronic,24.289784
2,hiphop,8.842097
3,experimental,8.661175
4,folk,5.843406
5,instrumental,5.192855
6,pop,4.950343
7,international,4.303642
8,classical,2.767727
9,oldtime / historic,1.95935


In [110]:
# Rows per label in the default scenario, largest count first.
s_default.get_class_counts()

label
rock                  7295
electronic            6310
hiphop                2297
experimental          2250
folk                  1518
instrumental          1349
pop                   1286
international         1118
classical              719
oldtime / historic     509
jazz                   483
country                277
soulrnb                254
blues                  174
spoken                 118
easy listening          21
Name: label, dtype: int64

In [71]:
# Labels to draw samples for in the cells below.
in_scope_labels = ['rock', 'electronic', 'hiphop','international', 'classical', 'jazz','country']
# For a quick single-label run, use ['rock'] instead.

In [73]:
def get_label_sample_df(df, label, sample_size, random_state=None):
    '''Return up to ``sample_size`` randomly chosen rows of ``df`` with the given label.

    Args:
        df: frame with a ``label`` column.
        label: label value to select.
        sample_size: number of rows wanted.
        random_state: optional seed forwarded to ``DataFrame.sample`` so the draw
            can be reproduced; the default keeps the draw unseeded.

    Returns:
        Every matching row when fewer than ``sample_size`` exist, otherwise a
        random sample of exactly ``sample_size`` rows.
    '''
    df_label = df[df.label == label]
    # Sampling more rows than exist would raise, so hand back the whole group.
    if sample_size > len(df_label):
        return df_label
    return df_label.sample(sample_size, random_state=random_state)


# Draw up to `sample_size` rows for each in-scope label and stack them into one frame.
sample_size = 400
label_sample_indexes = []  # despite the name, holds one sampled DataFrame per label

# No visible cell defines `model_data`; take the loader from the default
# scenario so this cell runs on a fresh kernel.
model_data = s_default.model_data

for label in in_scope_labels:
    label_sample_df = get_label_sample_df(model_data.df, label, sample_size)
    # Report the size of the sample just drawn for this label.
    print("Generate ", len(label_sample_df), ' length sample')
    label_sample_indexes.append(label_sample_df)

sampled_df = pd.concat(label_sample_indexes)

print(len(sampled_df))


Generate  400  length sample
Generate  400  length sample
Generate  400  length sample
Generate  400  length sample
Generate  400  length sample
Generate  400  length sample
Generate  277  length sample
2677


In [77]:
# Index of the stacked sample; pd.concat was called without ignore_index, so source index labels are kept.
sampled_df.index

Index([992, 476, 353, 273, 411, 352,  26, 365, 358, 422,
       ...
         8,   9, 432, 433, 434, 435, 436, 235, 236, 415],
      dtype='int64', length=2677)

In [74]:
# Rows per label in the stacked sample, largest count first.
sampled_df.groupby('label')['label'].count().sort_values(ascending=False)

label
classical        400
electronic       400
hiphop           400
jazz             400
international    400
rock             400
country          277
Name: label, dtype: int64

In [76]:
# Non-null count of every column per source dataset. The frame is read through
# `s_default` because no visible cell defines a standalone `model_data`.
s_default.model_data.df.groupby('dataset').count()

Unnamed: 0_level_0,index,track_id,audio_path,label,fma_genre_top,fma_genres,fma_genres_all,file_available,sampling_rate,features,...,zero_crossing_rate_mean,rms_mean,chroma_stft_mean,mfccs_mean,onset,tempo,contrast,tonnetz,mfccs_min,mfccs_max
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
fma,24979,24979,24979,24979,24979,24979,24979,24979,24979,24979,...,24979,24979,24979,24979,24979,24979,24979,24979,24979,24979
gtzan,999,999,999,999,999,999,999,999,999,999,...,999,999,999,999,999,999,999,999,999,999
