### Import Library

In [9]:
# import libraries
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import numpy as np

from scipy.sparse import csr_matrix, vstack

import pickle
from datetime import datetime

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.linear_model import LogisticRegression

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

### Import the feature-extracted data

In [2]:
song_df = pickle.load(open('thaisongs/th_songs_2.bin', 'rb'))

In [3]:
song_df.dtypes

song_name              object
href                   object
lyric                  object
artist                 object
lines                   int64
words                  object
n_words                 int64
words_str              object
artists                object
duplicates              int64
n_artists               int64
n_unique_words          int64
unique_words_ratio    float64
words_per_line        float64
tf_idf_vector          object
tf_idf_score          float64
dtype: object

## Prediction

In [4]:
# parameter
# number of sets
n_set = {'train': 20, 'val': 20}

# number of artists per set
n_artist = 3

# minimum number of songs of one artist
n_song_min = 5

# maximum number of song - artist pairs per artist set
n_song_artist_max = 100

In [5]:
song_df.columns

Index(['song_name', 'href', 'lyric', 'artist', 'lines', 'words', 'n_words',
       'words_str', 'artists', 'duplicates', 'n_artists', 'n_unique_words',
       'unique_words_ratio', 'words_per_line', 'tf_idf_vector',
       'tf_idf_score'],
      dtype='object')

In [7]:
def select_artist_song_create_feature(song_df, n_set, n_artist, n_song_min, n_song_artist_max):
    song_count_df = song_df.groupby('artist')[['artist']].count().rename(columns={'artist': 'count'})
    artist_lst = list(song_count_df.loc[song_count_df['count'] >= n_song_min].index.values)

    n_set_total = sum(n_set.values())

    artist_set = []
    while len(artist_set) < n_set_total:
        new_artist = tuple(np.random.choice(artist_lst, size=n_artist, replace=False))
        if new_artist not in artist_set:
            artist_set.append(new_artist)

    # split artist sets
    artist_select = {}
    for field, n in n_set.items():
        i_select = np.random.choice(range(len(artist_set)), size=n, replace=False)
        artist_list = list(artist_set)
        artist_select[field] = [artist_list[i] for i in i_select]
        artist_set = [s for s in artist_set if s not in artist_select[field]]
    # create dataframe with all features
    feature_dict = {}
    # dictionary to map artist set id to list of artists
    set_id_to_artist_tp = {}

    i = 0
    for field, artist_set in artist_select.items():
        df_lst = []
        for artist_tp in artist_set:
            i += 1
            df = song_df.loc[song_df['artist'].isin(artist_tp), 
                             ['artist', 'song_name', 'n_words', 'unique_words_ratio', 'words_per_line', 'tf_idf_vector', 
                              'tf_idf_score']]
            # check if number of songs is too high
            if len(df) * n_artist > n_song_artist_max:
                df = df.sample(int(n_song_artist_max / n_artist), random_state=0)
                
            df['artist_set_id'] = i
            set_id_to_artist_tp[i] = artist_tp
            df_lst.append(df)
        feature_dict[field] = pd.concat(df_lst)  
        print('Number of songs in {}: {}'.format(field, len(feature_dict[field])))

    # get all selected artists
    artist_select_set = set.union(*[set(sum(tp_lst, ())) for tp_lst in artist_select.values()])

    # calculate mean vector
    def get_mean_vector(vec_lst):
        return csr_matrix(vstack(vec_lst).mean(axis=0))

    # create artist dataframe from training data
    df_lst = []
    for artist, df in song_df.loc[song_df['artist'].isin(artist_select_set)].groupby('artist'):
        dic = {'artist': artist}
        # calculate averages and standard diviations
        for field in ['n_words', 'unique_words_ratio', 'words_per_line', 'tf_idf_score']:
            dic[field + '_mean'] = df[field].mean()
            dic[field + '_std'] = df[field].std()

        # number of songs
        dic['songs'] = len(df)

        # calculate average tf idf vector
        dic['tf_idf_vector_mean'] = get_mean_vector(df['tf_idf_vector'])

        df_lst.append(pd.DataFrame(dic, index=[0]))
    artist_feature_df = pd.concat(df_lst)

    def get_features(df):
        # get artist set id
        artist_set_id = df['artist_set_id'].iloc[0]
        
        # get all artists
        artist_feature_select_df = artist_feature_df.loc[artist_feature_df['artist']\
                                                         .isin(set_id_to_artist_tp[artist_set_id])]

        # merge dataframes
        artist_song_feature_df = pd.merge(artist_feature_select_df.assign(key=0), df.assign(key=0), on='key', 
                                          suffixes=['_artist', '_song']).drop('key', axis=1)    
        artist_song_feature_df['same_artist'] = \
            artist_song_feature_df['artist_artist'] == artist_song_feature_df['artist_song']

        # calculate features
        # add feature polarity
        for feature in ['n_words', 'unique_words_ratio', 'words_per_line', 'tf_idf_score']:
            artist_song_feature_df[feature + '_diff'] = \
                artist_song_feature_df[feature] - artist_song_feature_df[feature + '_mean']
            artist_song_feature_df[feature + '_diff_std'] = \
                artist_song_feature_df[feature + '_diff'] / artist_song_feature_df[feature + '_std']
        
        # calculate similarity of artist tf idf vector and song vector
        def tf_idf_vector_similarity(artist_vector, song_vector, songs, same_artist):
            # check if song is from same artist
            if same_artist:
                # deduct song vector from artist vector
                artist_vector = (songs * artist_vector - song_vector) / (songs - 1)
            # calculate similarity
            return cosine_similarity(artist_vector, song_vector)[0][0]

        # calculate vector similarity between artist and song
        artist_song_feature_df['vector_similarity'] = \
            artist_song_feature_df.apply(lambda row: tf_idf_vector_similarity(row['tf_idf_vector_mean'], 
                                                      row['tf_idf_vector'], row['songs'], row['same_artist']), axis=1)    
        return artist_song_feature_df

    artist_song_feature = {}
    for field in feature_dict:
        artist_song_feature[field] = feature_dict[field].groupby('artist_set_id').apply(get_features)\
                                                        .reset_index(drop=True)
        
    return artist_song_feature

In [8]:
np.random.seed(0)
artist_song_feature = select_artist_song_create_feature(song_df, n_set, n_artist, n_song_min, n_song_artist_max)

Number of songs in train: 660
Number of songs in val: 660


In [10]:
artist_song_feature['train'].iloc[0]

artist_artist                                                      image_suthita
n_words_mean                                                           69.761905
n_words_std                                                            22.500899
unique_words_ratio_mean                                                 0.535736
unique_words_ratio_std                                                  0.107465
words_per_line_mean                                                     2.246001
words_per_line_std                                                      0.634159
tf_idf_score_mean                                                        4.95531
tf_idf_score_std                                                        0.730591
songs                                                                         21
tf_idf_vector_mean               (0, 66)\t0.004827426820205466\n  (0, 76)\t0....
artist_song                                                               liltan
song_name                   

In [11]:
feature = ['n_words_diff', 'n_words_diff_std',
        'unique_words_ratio_diff', 'unique_words_ratio_diff_std',
        'words_per_line_diff', 'words_per_line_diff_std', 'tf_idf_score_diff',
        'tf_idf_score_diff_std','vector_similarity']
df_lst = []

for f in feature:
    df = artist_song_feature['train'][['same_artist']]
    df['feature'] = f
    df['value'] = artist_song_feature['train'][f]
    df_lst.append(df)
    
feature_df = pd.concat(df_lst)
feature_df.head()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,same_artist,feature,value
0,False,n_words_diff,125.238095
1,True,n_words_diff,9.238095
2,False,n_words_diff,133.238095
3,False,n_words_diff,106.238095
4,False,n_words_diff,23.238095


In [12]:
feature_df.head()

Unnamed: 0,same_artist,feature,value
0,False,n_words_diff,125.238095
1,True,n_words_diff,9.238095
2,False,n_words_diff,133.238095
3,False,n_words_diff,106.238095
4,False,n_words_diff,23.238095


In [13]:
def violine_feature_plot(feature_df, feature_select):

    fig = go.Figure()
    df = feature_df.loc[feature_df['feature'].isin(feature_select)]

    fig.add_trace(go.Violin(x=df['feature'][df['same_artist']],
                            y=df['value'][df['same_artist']],
                            legendgroup='Same Artist', scalegroup='Same Artist', name='Same Artist',
                            side='negative')
                 )
    fig.add_trace(go.Violin(x=df['feature'][~df['same_artist']],
                            y=df['value'][~df['same_artist']],
                            legendgroup='Different Artists', scalegroup='Different Artists', name='Different Artists',
                            side='positive')
                 )

    fig.update_traces(meanline_visible=True)
    fig.update_layout(violingap=0, violinmode='overlay')
    fig.update_layout(title='Feature Comparison')
    fig.update_xaxes(title='Feature')
    return fig

In [14]:
feature_df

Unnamed: 0,same_artist,feature,value
0,False,n_words_diff,125.238095
1,True,n_words_diff,9.238095
2,False,n_words_diff,133.238095
3,False,n_words_diff,106.238095
4,False,n_words_diff,23.238095
...,...,...,...
1975,False,vector_similarity,0.176533
1976,True,vector_similarity,0.046077
1977,True,vector_similarity,0.103734
1978,False,vector_similarity,0.218545


In [15]:
fig = violine_feature_plot(feature_df, feature[1:])
fig.update_layout(
    autosize=False,
    width=2000,
    height=800,)
fig.update_xaxes(range=[-0.5, 4.5])
fig.show()

# Note that you should click *Autoscale* on the figure option to show all artists' violins

In [16]:
fig = violine_feature_plot(feature_df, ['n_words_diff_std', 'unique_words_ratio_diff_std', 'words_per_line_diff_std', 'tf_idf_score_diff_std'])
fig.update_layout(
    autosize=False,
    width=1000,
    height=800,)
fig.update_xaxes(range=[-0.5, 4.5])
fig.show()

In [17]:
fig = violine_feature_plot(feature_df, ['vector_similarity'])
fig.update_layout(
    autosize=False,
    width=800,
    height=800,)
fig.update_xaxes(range=[-1, 1])
fig.show()

### Prepare data for prediction

In [18]:
def prepare_data(df, feature_org, feature_abs):
    for f in feature_abs:
        df[f] = df[f].abs()
    X = df[feature_org + feature_abs].values
    y = df['same_artist'].values
    
    return X, y

def select_songs_train_pipeline(song_df, n_set, n_artist, n_song_min, n_song_artist_max, feature_org, feature_abs, pipeline):
    artist_song_feature = select_artist_song_create_feature(song_df, n_set, n_artist, n_song_min, n_song_artist_max)

    # prepare data
    X, y = prepare_data(artist_song_feature['train'], feature_org, feature_abs)

    pipeline = pipeline.fit(X, y)
    
    return artist_song_feature, pipeline

In [19]:
# prepare data create and train pipeline
n_artist = 3
n_song_min = 5
n_set = {'train': 100}
n_song_artist_max = 100

feature_org = ['n_words', 'unique_words_ratio', 'words_per_line', 'tf_idf_score', 'vector_similarity']
feature_abs = ['n_words_diff', 'n_words_diff_std', 'unique_words_ratio_diff', 'unique_words_ratio_diff_std', 
               'words_per_line_diff', 'words_per_line_diff_std', 'tf_idf_score_diff', 'tf_idf_score_diff_std']

pipeline = Pipeline([('scale', StandardScaler()), 
                     ('clf', LogisticRegression(solver='lbfgs', max_iter=3000, 
                                                class_weight={False: 1/n_artist, True:(n_artist - 1)/n_artist}))])

np.random.seed(1)
artist_song_feature, pipeline = select_songs_train_pipeline(song_df, n_set, n_artist, n_song_min, n_song_artist_max, 
                                                            feature_org, feature_abs, pipeline)

Number of songs in train: 3296


In [20]:
feature_importance_df = pd.DataFrame({'feature': feature_org+feature_abs, 'coefficient':pipeline['clf'].coef_[0]})

px.bar(feature_importance_df.sort_values('coefficient'), x='feature', y='coefficient')

## Validation

In [21]:
def predict_artist(df, feature_org, feature_abs, pipeline, top_n):
    # prepare data
    X, y = prepare_data(df, feature_org, feature_abs)
    
    # get probability
    proba = pipeline.predict_proba(X)
    # attach to dataframe
    df['probability'] = proba[:, 1]
    df['correct_prediction'] = df['artist_artist'] == df['artist_song']
    
    # get artist song pairs with highest probability
    predict_select = df.sort_values('probability', ascending=False).groupby(['artist_set_id']).head(top_n)\
                       .groupby(['artist_set_id'])['correct_prediction'].max()
    
    # print(predict_select)
    # get accuracy
    print('Accuracy: {}'.format(predict_select.mean()))
    
    return predict_select

In [22]:
artist_predict_df = predict_artist(artist_song_feature['train'], feature_org, feature_abs, pipeline, top_n=1)

Accuracy: 0.77


In [23]:
artist_predict_df = predict_artist(artist_song_feature['train'], feature_org, feature_abs, pipeline, top_n=2)

Accuracy: 0.96


In [24]:
n_artist_lst = [2, 4, 8, 16, 32, 64, 128]
top_n_lst = [1, 2, 4, 8, 16, 32, 64]
n_song_artist_max = 128
np.random.seed(2)

n_set = {'train': 100, 'val': 100}

feature_org = ['n_words', 'unique_words_ratio', 'words_per_line', 'tf_idf_score', 'vector_similarity']
feature_abs = ['n_words_diff', 'n_words_diff_std', 'unique_words_ratio_diff', 'unique_words_ratio_diff_std', 
               'words_per_line_diff', 'words_per_line_diff_std', 'tf_idf_score_diff', 'tf_idf_score_diff_std', ]

pipeline = Pipeline([('scale', StandardScaler()), 
                     ('clf', LogisticRegression(solver='lbfgs', max_iter=3000, 
                                                class_weight={False: 1/n_artist, True:(n_artist - 1)/n_artist}))])

result_lst = []

for n_artist in n_artist_lst:
    print(datetime.now())
    print('n_artist: {}'.format(n_artist))
    
    artist_song_feature, pipeline = select_songs_train_pipeline(song_df, n_set, n_artist, n_song_min, 
                                                                n_song_artist_max, feature_org, feature_abs, pipeline)
    
    for top_n in [n for n in top_n_lst if n < n_artist]:
        print('top_n: {}'.format(top_n))
        
        predict_select = predict_artist(artist_song_feature['val'], feature_org, feature_abs, pipeline, top_n=top_n)
        
        result_dict = {'n_artist': n_artist, 'top_n': top_n, 'accuracy': predict_select.mean()}
        result_lst.append(result_dict)
        
    print('')
    
result_df = pd.DataFrame(result_lst)

2022-04-27 23:48:46.019541
n_artist: 2
Number of songs in train: 4633
Number of songs in val: 4550
top_n: 1
Accuracy: 0.88

2022-04-27 23:48:57.909591
n_artist: 4
Number of songs in train: 3200
Number of songs in val: 3200
top_n: 1
Accuracy: 0.81
top_n: 2
Accuracy: 0.94

2022-04-27 23:49:12.415600
n_artist: 8
Number of songs in train: 1600
Number of songs in val: 1600
top_n: 1
Accuracy: 0.69
top_n: 2
Accuracy: 0.85
top_n: 4
Accuracy: 0.95

2022-04-27 23:49:27.374621
n_artist: 16
Number of songs in train: 800
Number of songs in val: 800
top_n: 1
Accuracy: 0.58
top_n: 2
Accuracy: 0.74
top_n: 4
Accuracy: 0.83
top_n: 8
Accuracy: 0.92

2022-04-27 23:49:41.392654
n_artist: 32
Number of songs in train: 400
Number of songs in val: 400
top_n: 1
Accuracy: 0.38
top_n: 2
Accuracy: 0.5
top_n: 4
Accuracy: 0.63
top_n: 8
Accuracy: 0.78
top_n: 16
Accuracy: 0.9

2022-04-27 23:49:54.958142
n_artist: 64
Number of songs in train: 200
Number of songs in val: 200
top_n: 1
Accuracy: 0.22
top_n: 2
Accuracy: 0.

In [25]:
fig = px.line(result_df, x='n_artist', y='accuracy', color='top_n', 
              title='Accuracy vs number of artist and number of top selections', 
              labels={'n_artist': 'Number of artists per set', 'top_n': 'Top predictions'}).update_traces(mode='lines+markers')
fig.update_layout(
    autosize=False,
    width=1000,
    height=600,)
fig.show()

In [27]:
# feature_columns = ['n_words', 'unique_words_ratio', 'words_per_line', 'tf_idf_score']
# # feature_columns = ['unique_words_ratio', 'words_per_line', 'tf_idf_score']

# embeddings = song_df[feature_columns].copy()
# # embeddings['n_words'] = (embeddings['n_words'])/50.
# embeddings = embeddings.values.tolist()

# identities = []

# name_dict = {}

# label_i = 0
# for name in song_df['artist'].values:
#     if name not in name_dict:
#         name_dict[name] = label_i
#         label_i = label_i+1
    
#     identities.append(name_dict[name])

In [28]:
# import torch

# embeddings = torch.Tensor(embeddings)
# identities = torch.Tensor(identities)

In [29]:
# import time 
# from sklearn.manifold import TSNE

# time_start = time.time()
# tsne = TSNE(n_components=2, verbose=1, perplexity=30, n_iter=3000)
# tsne_result = tsne.fit_transform(embeddings)
# print('t-SNE done! Time elapsed: {} seconds'.format(time.time()-time_start))

In [30]:
# import plotly.express as px

# df_subset = pd.DataFrame({'label': identities})

# df_subset['tsne-2d-one'] = tsne_result[:,0]
# df_subset['tsne-2d-two'] = tsne_result[:,1]

# fig = px.scatter(df_subset, x="tsne-2d-one", y="tsne-2d-two", color="label", height=1000, width=1000)
# fig.update_layout(margin=dict(l=0, r=0, b=0, t=0))
# fig.show()

In [31]:
# import time 
# from sklearn.manifold import TSNE

# time_start = time.time()
# tsne = TSNE(n_components=3, verbose=1, perplexity=30, n_iter=3000)
# tsne_result = tsne.fit_transform(embeddings)
# print('t-SNE done! Time elapsed: {} seconds'.format(time.time()-time_start))

In [32]:
# import plotly.express as px

# df_subset = pd.DataFrame({'label': identities})

# df_subset['tsne-3d-one'] = tsne_result[:,0]
# df_subset['tsne-3d-two'] = tsne_result[:,1]
# df_subset['tsne-3d-three'] = tsne_result[:,2]

# fig = px.scatter_3d(df_subset, x="tsne-3d-one", y="tsne-3d-two", z="tsne-3d-three", color="label")
# fig.update_layout(margin=dict(l=0, r=0, b=0, t=0))
# fig.show()