### Import Library

In [1]:
# import libraries
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import numpy as np

from datetime import datetime

from sklearn.linear_model import LogisticRegression

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [2]:
import os, sys

sys.path.insert(0,os.path.realpath(os.path.join(os.path.pardir, '..')))

from config import cfg
from utils.utils import *
from utils.tfidf import *
from utils.feature import *

[nltk_data] Downloading package words to C:\Users\FACT-
[nltk_data]     PC\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


### Import the feature-extracted data

In [3]:
song_df = pd.read_pickle(os.path.join(cfg.DATA.BASE_PATH, 'fx_th_songs_base.pkl')) 

## Prediction

In [4]:
# parameter
# number of sets
n_set = {'train': 20, 'val': 20}

# number of artists per set
n_artist = 3

# minimum number of songs of one artist
n_song_min = 5

# maximum number of song - artist pairs per artist set
n_song_artist_max = 100

In [5]:
song_df.dtypes

artist                 object
song_name              object
href                   object
lyrics                 object
lines                   int64
words                  object
n_words                 int64
words_str              object
unique_words_ratio    float64
words_per_line        float64
tf_idf_vector          object
tf_idf_score          float64
dtype: object

In [12]:
np.random.seed(0)
columns = ['artist', 'song_name', 'lines', 'n_words', 'unique_words_ratio', 'words_per_line', 'tf_idf_vector', 'tf_idf_score']
stat_columns = ['n_words', 'unique_words_ratio', 'words_per_line', 'tf_idf_score']
artist_song_feature = select_artist_song_create_feature(song_df, n_set, n_artist, n_song_min, n_song_artist_max, columns, stat_columns)

Number of songs in train: 660
Number of songs in val: 660


In [13]:
feature = ['n_words_diff', 'n_words_diff_std',
           'unique_words_ratio_diff', 'unique_words_ratio_diff_std',
           'words_per_line_diff', 'words_per_line_diff_std',
           'tf_idf_score_diff', 'tf_idf_score_diff_std',
           'vector_similarity']
df_lst = []

pd.options.mode.chained_assignment = None  # default='warn'

for f in feature:
    df = artist_song_feature['train'][['same_artist']]
    df['feature'] = f
    df['value'] = artist_song_feature['train'][f]
    df_lst.append(df)
    
feature_df = pd.concat(df_lst)
feature_df.head()

Unnamed: 0,same_artist,feature,value
0,False,n_words_diff,-83.457143
1,True,n_words_diff,-19.457143
2,False,n_words_diff,-33.457143
3,True,n_words_diff,41.542857
4,False,n_words_diff,-6.457143


In [15]:
def violine_feature_plot(feature_df, feature_select):

    fig = go.Figure()
    df = feature_df.loc[feature_df['feature'].isin(feature_select)]

    fig.add_trace(go.Violin(x=df['feature'][df['same_artist']],
                            y=df['value'][df['same_artist']],
                            legendgroup='Same Artist', scalegroup='Same Artist', name='Same Artist',
                            side='negative')
                 )
    fig.add_trace(go.Violin(x=df['feature'][~df['same_artist']],
                            y=df['value'][~df['same_artist']],
                            legendgroup='Different Artists', scalegroup='Different Artists', name='Different Artists',
                            side='positive')
                 )

    fig.update_traces(meanline_visible=True)
    fig.update_layout(violingap=0, violinmode='overlay')
    fig.update_layout(title='Feature Comparison')
    fig.update_xaxes(title='Feature')
    return fig

In [16]:
feature_df

Unnamed: 0,same_artist,feature,value
0,False,n_words_diff,-83.457143
1,True,n_words_diff,-19.457143
2,False,n_words_diff,-33.457143
3,True,n_words_diff,41.542857
4,False,n_words_diff,-6.457143
...,...,...,...
1975,False,vector_similarity,0.083269
1976,False,vector_similarity,0.087664
1977,True,vector_similarity,0.206923
1978,False,vector_similarity,0.109726


In [17]:
fig = violine_feature_plot(feature_df, feature[1:])
fig.update_layout(
    autosize=False,
    width=2000,
    height=800,)
fig.update_xaxes(range=[-0.5, 4.5])
fig.show()

# Note that you should click *Autoscale* on the figure option to show all artists' violins

In [18]:
feature_df

Unnamed: 0,same_artist,feature,value
0,False,n_words_diff,-83.457143
1,True,n_words_diff,-19.457143
2,False,n_words_diff,-33.457143
3,True,n_words_diff,41.542857
4,False,n_words_diff,-6.457143
...,...,...,...
1975,False,vector_similarity,0.083269
1976,False,vector_similarity,0.087664
1977,True,vector_similarity,0.206923
1978,False,vector_similarity,0.109726


In [23]:
fig = violine_feature_plot(feature_df, ['n_words_diff_std', 'unique_words_ratio_diff_std', 'words_per_line_diff_std', 'tf_idf_score_diff_std'])
fig.update_layout(
    autosize=False,
    width=1000,
    height=800,)
fig.update_xaxes(range=[-0.5, 4.5])
fig.show()

In [19]:
fig = violine_feature_plot(feature_df, ['vector_similarity'])
fig.update_layout(
    autosize=False,
    width=800,
    height=800,)
fig.update_xaxes(range=[-1, 1])
fig.show()

### Prepare data for prediction

In [21]:
def prepare_data(df, feature_org, feature_abs):
    for f in feature_abs:
        df[f] = df[f].abs()
    X = df[feature_org + feature_abs].values
    y = df['same_artist'].values
    
    return X, y

def select_songs_train_pipeline(song_df, n_set, n_artist, n_song_min, n_song_artist_max, feature_org, feature_abs, pipeline, columns, stat_columns):
    artist_song_feature = select_artist_song_create_feature(song_df, n_set, n_artist, n_song_min, n_song_artist_max, columns, stat_columns)

    # prepare data
    X, y = prepare_data(artist_song_feature['train'], feature_org, feature_abs)

    pipeline = pipeline.fit(X, y)
    
    return artist_song_feature, pipeline

In [22]:
# prepare data create and train pipeline
n_artist = 3
n_song_min = 5
n_set = {'train': 100}
n_song_artist_max = 100

feature_org = ['n_words', 'unique_words_ratio', 'words_per_line', 'tf_idf_score', 'vector_similarity']
feature_abs = ['n_words_diff', 'n_words_diff_std', 'unique_words_ratio_diff', 'unique_words_ratio_diff_std', 
               'words_per_line_diff', 'words_per_line_diff_std', 'tf_idf_score_diff', 'tf_idf_score_diff_std']

pipeline = Pipeline([('scale', StandardScaler()), 
                     ('clf', LogisticRegression(solver='lbfgs', max_iter=3000, 
                                                class_weight={False: 1/n_artist, True:(n_artist - 1)/n_artist}))])

np.random.seed(1)
artist_song_feature, pipeline = select_songs_train_pipeline(song_df, n_set, n_artist, n_song_min, n_song_artist_max, 
                                                            feature_org, feature_abs, pipeline, columns, stat_columns)

Number of songs in train: 3300


In [24]:
feature_importance_df = pd.DataFrame({'feature': feature_org+feature_abs, 'coefficient':pipeline['clf'].coef_[0]})

px.bar(feature_importance_df.sort_values('coefficient'), x='feature', y='coefficient')

## Validation

In [25]:
def predict_artist(df, feature_org, feature_abs, pipeline, top_n):
    # prepare data
    X, y = prepare_data(df, feature_org, feature_abs)
    
    # get probability
    proba = pipeline.predict_proba(X)
    # attach to dataframe
    df['probability'] = proba[:, 1]
    df['correct_prediction'] = df['artist_artist'] == df['artist_song']
    
    # get artist song pairs with highest probability
    predict_select = df.sort_values('probability', ascending=False).groupby(['artist_set_id']).head(top_n)\
                       .groupby(['artist_set_id'])['correct_prediction'].max()
    
    # print(predict_select)
    # get accuracy
    print('Accuracy: {}'.format(predict_select.mean()))
    
    return predict_select

In [26]:
artist_predict_df = predict_artist(artist_song_feature['train'], feature_org, feature_abs, pipeline, top_n=1)

Accuracy: 0.82


In [27]:
artist_predict_df = predict_artist(artist_song_feature['train'], feature_org, feature_abs, pipeline, top_n=2)

Accuracy: 0.95


In [30]:
n_artist_lst = [2, 4, 8, 16, 32]
top_n_lst = [1, 2, 4, 8, 16]
n_song_artist_max = 64
np.random.seed(2)

n_set = {'train': 100, 'val': 100}

feature_org = ['n_words', 'unique_words_ratio', 'words_per_line', 'tf_idf_score', 'vector_similarity']
feature_abs = ['n_words_diff', 'n_words_diff_std', 'unique_words_ratio_diff', 'unique_words_ratio_diff_std', 
               'words_per_line_diff', 'words_per_line_diff_std', 'tf_idf_score_diff', 'tf_idf_score_diff_std', ]

pipeline = Pipeline([('scale', StandardScaler()), 
                     ('clf', LogisticRegression(solver='lbfgs', max_iter=3000, 
                                                class_weight={False: 1/n_artist, True:(n_artist - 1)/n_artist}))])

result_lst = []

for n_artist in n_artist_lst:
    print(datetime.now())
    print('n_artist: {}'.format(n_artist))
    
    artist_song_feature, pipeline = select_songs_train_pipeline(song_df, n_set, n_artist, n_song_min, 
                                                                n_song_artist_max, feature_org, feature_abs, pipeline, columns, stat_columns)
    
    for top_n in [n for n in top_n_lst if n < n_artist]:
        print('top_n: {}'.format(top_n))
        
        predict_select = predict_artist(artist_song_feature['val'], feature_org, feature_abs, pipeline, top_n=top_n)
        
        result_dict = {'n_artist': n_artist, 'top_n': top_n, 'accuracy': predict_select.mean()}
        result_lst.append(result_dict)
        
    print('')
    
result_df = pd.DataFrame(result_lst)

2022-05-27 03:05:50.060658
n_artist: 2
Number of songs in train: 3200
Number of songs in val: 3200
top_n: 1
Accuracy: 0.91

2022-05-27 03:05:59.961302
n_artist: 4
Number of songs in train: 1600
Number of songs in val: 1600
top_n: 1
Accuracy: 0.78
top_n: 2
Accuracy: 0.92

2022-05-27 03:06:09.417159
n_artist: 8
Number of songs in train: 800
Number of songs in val: 800
top_n: 1
Accuracy: 0.81
top_n: 2
Accuracy: 0.9
top_n: 4
Accuracy: 0.97

2022-05-27 03:06:18.583766
n_artist: 16
Number of songs in train: 400
Number of songs in val: 400
top_n: 1
Accuracy: 0.78
top_n: 2
Accuracy: 0.92
top_n: 4
Accuracy: 0.98
top_n: 8
Accuracy: 0.99

2022-05-27 03:06:27.830930
n_artist: 32
Number of songs in train: 200
Number of songs in val: 200
top_n: 1
Accuracy: 0.22
top_n: 2
Accuracy: 0.28
top_n: 4
Accuracy: 0.56
top_n: 8
Accuracy: 0.77
top_n: 16
Accuracy: 0.9



In [33]:
fig = px.line(result_df, x='n_artist', y='accuracy', color='top_n', 
              title='Accuracy vs number of artist and number of top selections', 
              labels={'n_artist': 'Number of artists per set', 'top_n': 'Top predictions'}).update_traces(mode='lines+markers')
fig.update_layout(
    autosize=False,
    width=1000,
    height=600,
    yaxis_range=[0,1])
fig.show()