### Import Library

In [102]:
# import libraries
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import numpy as np

from datetime import datetime

from sklearn.linear_model import LogisticRegression

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [103]:
import os, sys

sys.path.insert(0,os.path.realpath(os.path.join(os.path.pardir, '..')))

from config import cfg
from utils.utils import *
from utils.tfidf import *
from utils.feature import *

### Import the feature-extracted data

In [104]:
song_df = pd.read_pickle(os.path.join(cfg.DATA.BASE_PATH, 'fx_th_songs_newlog.pkl')) 

## Prediction

In [105]:
# parameter
# number of sets
n_set = {'train': 20, 'val': 20}

# number of artists per set
n_artist = 3

# minimum number of songs of one artist
n_song_min = 5

# maximum number of song - artist pairs per artist set
n_song_artist_max = 100

In [106]:
song_df.dtypes

artist                 object
song_name              object
href                   object
lyrics                 object
lines                   int64
words                  object
n_words                 int64
words_str              object
eng_word_ratio        float64
words_withinline        int64
n_unique_words          int64
unique_words_ratio    float64
words_per_line        float64
n_verse                 int64
tf_idf_vector          object
tf_idf_score          float64
dtype: object

In [107]:
song_df.columns

Index(['artist', 'song_name', 'href', 'lyrics', 'lines', 'words', 'n_words',
       'words_str', 'eng_word_ratio', 'words_withinline', 'n_unique_words',
       'unique_words_ratio', 'words_per_line', 'n_verse', 'tf_idf_vector',
       'tf_idf_score'],
      dtype='object')

In [108]:
np.random.seed(0)
columns = ['artist', 'song_name', 'lines', 'n_words', 'eng_word_ratio', 'words_withinline', 'unique_words_ratio', 'words_per_line', 'n_verse', 'tf_idf_vector', 'tf_idf_score']
stat_columns = ['n_words', 'eng_word_ratio', 'words_withinline', 'unique_words_ratio', 'words_per_line', 'n_verse', 'tf_idf_score']
artist_song_feature = select_artist_song_create_feature(song_df, n_set, n_artist, n_song_min, n_song_artist_max, columns, stat_columns)

Number of songs in train: 660
Number of songs in val: 660


In [109]:
feature = ['n_words_diff', 'n_words_diff_std',
           'eng_word_ratio_diff',
           'words_withinline_diff', 'words_withinline_diff_std',
           'unique_words_ratio_diff', 'unique_words_ratio_diff_std',
           'words_per_line_diff', 'words_per_line_diff_std',
           'n_verse_diff',
           'tf_idf_score_diff', 'tf_idf_score_diff_std',
           'vector_similarity']
df_lst = []

pd.options.mode.chained_assignment = None  # default='warn'

for f in feature:
    df = artist_song_feature['train'][['same_artist']]
    df['feature'] = f
    df['value'] = artist_song_feature['train'][f]
    df_lst.append(df)
    
feature_df = pd.concat(df_lst)
feature_df.head()

Unnamed: 0,same_artist,feature,value
0,False,n_words_diff,45.6
1,False,n_words_diff,91.6
2,False,n_words_diff,6.6
3,False,n_words_diff,74.6
4,True,n_words_diff,-29.4


In [110]:
def violine_feature_plot(feature_df, feature_select):

    fig = go.Figure()
    df = feature_df.loc[feature_df['feature'].isin(feature_select)]

    fig.add_trace(go.Violin(x=df['feature'][df['same_artist']],
                            y=df['value'][df['same_artist']],
                            legendgroup='Same Artist', scalegroup='Same Artist', name='Same Artist',
                            side='negative')
                 )
    fig.add_trace(go.Violin(x=df['feature'][~df['same_artist']],
                            y=df['value'][~df['same_artist']],
                            legendgroup='Different Artists', scalegroup='Different Artists', name='Different Artists',
                            side='positive')
                 )

    fig.update_traces(meanline_visible=True)
    fig.update_layout(violingap=0, violinmode='overlay')
    fig.update_layout(title='Feature Comparison')
    fig.update_xaxes(title='Feature')
    return fig

In [111]:
feature_df

Unnamed: 0,same_artist,feature,value
0,False,n_words_diff,45.600000
1,False,n_words_diff,91.600000
2,False,n_words_diff,6.600000
3,False,n_words_diff,74.600000
4,True,n_words_diff,-29.400000
...,...,...,...
1975,False,vector_similarity,0.076074
1976,False,vector_similarity,0.035265
1977,True,vector_similarity,0.166132
1978,False,vector_similarity,0.179583


In [118]:
fig = violine_feature_plot(feature_df, ['n_words_diff_std', 'words_withinline_diff_std', 'unique_words_ratio_diff_std', 'words_per_line_diff_std', 'tf_idf_score_diff_std'])
fig.update_layout(
    autosize=False,
    width=2000,
    height=800,)
fig.update_xaxes(range=[-0.5, 4.5])
fig.show()

# Note that you should click *Autoscale* on the figure option to show all artists' violins

In [119]:
fig = violine_feature_plot(feature_df, ['vector_similarity'])
fig.update_layout(
    autosize=False,
    width=800,
    height=800,)
fig.update_xaxes(range=[-1, 1])
fig.show()

### Prepare data for prediction

In [120]:
def prepare_data(df, feature_org, feature_abs):
    for f in feature_abs:
        df[f] = df[f].abs()
    X = df[feature_org + feature_abs].values
    y = df['same_artist'].values
    
    return X, y

def select_songs_train_pipeline(song_df, n_set, n_artist, n_song_min, n_song_artist_max, feature_org, feature_abs, pipeline, columns, stat_columns):
    artist_song_feature = select_artist_song_create_feature(song_df, n_set, n_artist, n_song_min, n_song_artist_max, columns, stat_columns)

    # prepare data
    X, y = prepare_data(artist_song_feature['train'], feature_org, feature_abs)

    pipeline = pipeline.fit(X, y)
    
    return artist_song_feature, pipeline

In [121]:
artist_song_feature = select_artist_song_create_feature(song_df, n_set, n_artist, n_song_min, n_song_artist_max, columns, stat_columns)


Number of songs in train: 3300


In [122]:
artist_song_feature['train'].isna().sum()

artist_artist                     0
n_words_mean                      0
n_words_std                       0
eng_word_ratio_mean               0
eng_word_ratio_std                0
words_withinline_mean             0
words_withinline_std              0
unique_words_ratio_mean           0
unique_words_ratio_std            0
words_per_line_mean               0
words_per_line_std                0
n_verse_mean                      0
n_verse_std                       0
tf_idf_score_mean                 0
tf_idf_score_std                  0
songs                             0
tf_idf_vector_mean                0
artist_song                       0
song_name                         0
lines                             0
n_words                           0
eng_word_ratio                    0
words_withinline                  0
unique_words_ratio                0
words_per_line                    0
n_verse                           0
tf_idf_vector                     0
tf_idf_score                

In [123]:
# prepare data create and train pipeline
n_artist = 3
n_song_min = 5
n_set = {'train': 100}
n_song_artist_max = 100

feature_org = ['n_words', 'unique_words_ratio', 'words_per_line', 'tf_idf_score', 'vector_similarity']
feature_abs = ['n_words_diff', 'n_words_diff_std',
               'eng_word_ratio_diff',
               'words_withinline_diff', 'words_withinline_diff_std',
               'unique_words_ratio_diff', 'unique_words_ratio_diff_std',
               'words_per_line_diff', 'words_per_line_diff_std',
               'n_verse_diff',
               'tf_idf_score_diff', 'tf_idf_score_diff_std',]

pipeline = Pipeline([('scale', StandardScaler()), 
                     ('clf', LogisticRegression(solver='lbfgs', max_iter=3000, 
                                                class_weight={False: 1/n_artist, True:(n_artist - 1)/n_artist}))])

np.random.seed(1)
artist_song_feature, pipeline = select_songs_train_pipeline(song_df, n_set, n_artist, n_song_min, n_song_artist_max, 
                                                            feature_org, feature_abs, pipeline, columns, stat_columns)

Number of songs in train: 3300


In [124]:
feature_importance_df = pd.DataFrame({'feature': feature_org+feature_abs, 'coefficient':pipeline['clf'].coef_[0]})

px.bar(feature_importance_df.sort_values('coefficient'), x='feature', y='coefficient')

## Validation

In [125]:
def predict_artist(df, feature_org, feature_abs, pipeline, top_n):
    # prepare data
    X, y = prepare_data(df, feature_org, feature_abs)
    
    # get probability
    proba = pipeline.predict_proba(X)
    # attach to dataframe
    df['probability'] = proba[:, 1]
    df['correct_prediction'] = df['artist_artist'] == df['artist_song']
    
    # get artist song pairs with highest probability
    predict_select = df.sort_values('probability', ascending=False).groupby(['artist_set_id']).head(top_n)\
                       .groupby(['artist_set_id'])['correct_prediction'].max()
    
    # print(predict_select)
    # get accuracy
    print('Accuracy: {}'.format(predict_select.mean()))
    
    return predict_select

In [126]:
artist_predict_df = predict_artist(artist_song_feature['train'], feature_org, feature_abs, pipeline, top_n=1)

Accuracy: 0.76


In [127]:
artist_predict_df = predict_artist(artist_song_feature['train'], feature_org, feature_abs, pipeline, top_n=2)

Accuracy: 0.86


In [130]:
n_artist_lst = [2, 4, 8, 16, 32]
top_n_lst = [1, 2, 4, 8, 16]
n_song_artist_max = 64
np.random.seed(2)

n_set = {'train': 100, 'val': 100}

feature_org = ['n_words', 'unique_words_ratio', 'words_per_line', 'tf_idf_score', 'vector_similarity']
feature_abs = ['n_words_diff', 'n_words_diff_std',
               'eng_word_ratio_diff',
               'words_withinline_diff', 'words_withinline_diff_std',
               'words_per_line_diff', 'words_per_line_diff_std',
               'n_verse_diff',
               'tf_idf_score_diff', 'tf_idf_score_diff_std',]

pipeline = Pipeline([('scale', StandardScaler()), 
                     ('clf', LogisticRegression(solver='lbfgs', max_iter=3000, 
                                                class_weight={False: 1/n_artist, True:(n_artist - 1)/n_artist}))])

result_lst = []

for n_artist in n_artist_lst:
    print(datetime.now())
    print('n_artist: {}'.format(n_artist))
    
    artist_song_feature, pipeline = select_songs_train_pipeline(song_df, n_set, n_artist, n_song_min, 
                                                                n_song_artist_max, feature_org, feature_abs, pipeline, columns, stat_columns)
    
    for top_n in [n for n in top_n_lst if n < n_artist]:
        print('top_n: {}'.format(top_n))
        
        predict_select = predict_artist(artist_song_feature['val'], feature_org, feature_abs, pipeline, top_n=top_n)
        
        result_dict = {'n_artist': n_artist, 'top_n': top_n, 'accuracy': predict_select.mean()}
        result_lst.append(result_dict)
        
    print('')
    
result_df = pd.DataFrame(result_lst)

2022-05-27 03:06:06.065038
n_artist: 2
Number of songs in train: 3200
Number of songs in val: 3200
top_n: 1
Accuracy: 0.83

2022-05-27 03:06:17.113051
n_artist: 4
Number of songs in train: 1600
Number of songs in val: 1600
top_n: 1
Accuracy: 0.81
top_n: 2
Accuracy: 0.9

2022-05-27 03:06:27.473931
n_artist: 8
Number of songs in train: 800
Number of songs in val: 800
top_n: 1
Accuracy: 0.62
top_n: 2
Accuracy: 0.77
top_n: 4
Accuracy: 0.88

2022-05-27 03:06:37.487504
n_artist: 16
Number of songs in train: 400
Number of songs in val: 400
top_n: 1
Accuracy: 0.52
top_n: 2
Accuracy: 0.66
top_n: 4
Accuracy: 0.77
top_n: 8
Accuracy: 0.93

2022-05-27 03:06:47.221225
n_artist: 32
Number of songs in train: 200
Number of songs in val: 200
top_n: 1
Accuracy: 0.46
top_n: 2
Accuracy: 0.61
top_n: 4
Accuracy: 0.7
top_n: 8
Accuracy: 0.81
top_n: 16
Accuracy: 0.97



In [132]:
fig = px.line(result_df, x='n_artist', y='accuracy', color='top_n', 
              title='Accuracy vs number of artist and number of top selections', 
              labels={'n_artist': 'Number of artists per set', 'top_n': 'Top predictions'}).update_traces(mode='lines+markers')
fig.update_layout(
    autosize=False,
    width=1000,
    height=600,
    yaxis_range=[0,1])
fig.show()