In [714]:
# import libraries
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import numpy as np
import re
import random
import nltk
from scipy import sparse
from scipy.sparse import csr_matrix, vstack
from textblob import TextBlob
from langdetect import detect_langs
import pickle
from datetime import datetime
import string
from matplotlib.pyplot import figure

from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [715]:
import glob, os

#You may comment this line if *WinError 2* occurs
# os.chdir("/thaisongs")

all_files = glob.glob("*.csv")

li = []

for filename in all_files:
    df = pd.read_csv(filename, index_col=None, header=0)
    li.append(df)

song_df = pd.concat(li, axis=0, ignore_index=True)

n_artists: 32


In [716]:
lyric_in_round_brackets = sum(list(song_df['lyric'].map(lambda s: re.findall(r'\((.*?)\)',s))), [])
print('Number of round brackets: {}'.format(len(lyric_in_round_brackets)))

lyric_in_square_brackets = sum(list(song_df['lyric'].map(lambda s: re.findall(r'\[(.*?)\]',s))), [])
print('Number of square brackets: {}'.format(len(lyric_in_square_brackets)))

lyric_in_curly_brackets = sum(list(song_df['lyric'].map(lambda s: re.findall(r'\{(.*?)\}',s))), [])
print('Number of curly brackets: {}'.format(len(lyric_in_curly_brackets)))

Number of round brackets: 803
Number of square brackets: 1
Number of curly brackets: 0


In [717]:
# remove round brackets but not text within
song_df['lyric'] = song_df['lyric'].map(lambda s: re.sub(r'\(|\)', '', s))

lyric_in_round_brackets = sum(list(song_df['lyric'].map(lambda s: re.findall(r'\((.*?)\)',s))), [])
print('Number of round brackets: {}'.format(len(lyric_in_round_brackets)))

Number of round brackets: 0


In [718]:
dot = sum(list(song_df['lyric'].map(lambda s: re.findall('\.',s))), [])
print('Number of dot: {}'.format(len(dot)))

Number of dot: 788


In [719]:
# remove dot 
song_df['lyric'] = song_df['lyric'].map(lambda s: re.sub(r'\.', '', s))

dot = sum(list(song_df['lyric'].map(lambda s: re.findall('\.',s))), [])
print('Number of dot: {}'.format(len(dot)))

Number of dot: 0


In [720]:
# count number of lines
song_df['lines'] = song_df['lyric'].map(lambda t: len(re.findall(r'\n', t)))

In [721]:
# remove line breaks
song_df['lyric'] = song_df['lyric'].map(lambda s: re.sub(r' \n|\n', '', s))

In [722]:
lyric_in_round_brackets = sum(list(song_df['lyric'].map(lambda s: re.findall(r'\((.*?)\)',s))), [])
print('Number of round brackets: {}'.format(len(lyric_in_round_brackets)))
song_df.head()

Number of round brackets: 0


Unnamed: 0,song_name,href,lyric,artist,lines
0,อย่าหยุดฝัน,/music/thailyric/8770,คนบางคนมีฝันแต่ไม่เคยตามหามีกำลังก้าวไปก็ไม่เค...,bie_sukrit,26
1,รักแท้มีอยู่จริง,/music/thailyric/2945,ซ่อนตัวเองมานานเท่าไร จะไปกลัวทำไมความรักสิ่งท...,bie_sukrit,32
2,7 วันที่ฉันเหงา,/music/thailyric/2930,วันอาทิตย์เธอไปจากฉันวันจันทร์ดูเหมือนมันช่างว...,bie_sukrit,46
3,ความทรงจำในลมหายใจ,/music/thailyric/2932,มันคงเป็นโชคชะตาที่ฉันต้องเข้าใจความรักเรานั้น...,bie_sukrit,35
4,มากมาย,/music/thailyric/2943,มองไม่เห็นด้วยตา ถ้าจะรับรู้ต้องด้วยใจถามว่ารั...,bie_sukrit,38


Tokenise

In [723]:
import pythainlp
from pythainlp import word_tokenize
from pythainlp.corpus.common import thai_stopwords
from pythainlp.corpus import wordnet
from nltk.stem.porter import PorterStemmer
from nltk.corpus import words
from stop_words import get_stop_words

In [724]:
import nltk
nltk.download('words')
th_stop = tuple(thai_stopwords())
en_stop = tuple(get_stop_words('en'))
p_stemmer = PorterStemmer()

[nltk_data] Downloading package words to C:\Users\FACT-
[nltk_data]     PC\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


In [725]:
def split_word(text):
            
    
    tokens = word_tokenize(text,engine='newmm')
    
    # Remove stop words ภาษาไทย และภาษาอังกฤษ
    tokens = [i for i in tokens if not i in th_stop and not i in en_stop]
    
    # หารากศัพท์ภาษาไทย และภาษาอังกฤษ
    # English
    tokens = [p_stemmer.stem(i) for i in tokens]
    
    # Thai
    tokens_temp=[]
    for i in tokens:
        w_syn = wordnet.synsets(i)
        if (len(w_syn)>0) and (len(w_syn[0].lemma_names('tha'))>0):
            tokens_temp.append(w_syn[0].lemma_names('tha')[0])
        else:
            tokens_temp.append(i)
    
    tokens = tokens_temp
    
    # ลบตัวเลข
    tokens = [i for i in tokens if not i.isnumeric()]
    
    # ลบช่องว่าง
    tokens = [i for i in tokens if not ' ' in i]

    return tokens

In [726]:
song_df['words'] = song_df['lyric'].map(lambda s: split_word(s))

In [727]:
song_df[['lyric','words']].tail()

Unnamed: 0,lyric,words
1097,แอบมองเธอมาทั้งวัน ทุกเช้าเย็นไม่รู้ว่าเธอเห็น...,"[แอบมอง, ทั้งวัน, เช้า, เย็น, รู้, ทำ, น่ารัก,..."
1098,ฉันเคยออกตามหา คนๆ นั้นจากโลกทั้งใบแล้วก็ไม่มี...,"[ตามหา, คน, โลก, ใบ, แล้วก็, ความฝัน, โลก, ใบ,..."
1099,อยู่คนเดียวมานานหลายปีไม่มีใครเข้ามาสักทีแต่ชี...,"[คนเดียว, ปี, เข้ามา, สักที, ชีวิต, เดิน, ต่อไ..."
1100,หัวใจมันเกิดเป็นอะไรสักอย่างหลงทางหรือเรียกร้อ...,"[หัวใจ, สัก, หลงทาง, ขอร้อง, ชอบกล, เหมือนว่า,..."
1101,คุณและคุณ และคุณเท่านั้นที่ทำให้ฉันนั้นสตั้นอย...,"[ฉันนั้น, สตั้น, ตรงนี้, ฉันนั้น, อึ้ง, เป็นอย..."


In [728]:
song_df['n_words'] = song_df['words'].map(len)

EDA

In [729]:
# number of songs
print('number of songs: ', str(len(song_df)))

# number of artists
print('number of artists: ', str(len(song_df['artist'].unique())))

number of songs:  1102
number of artists:  32


In [730]:
# distribution songs per artist
song_count_df = song_df.groupby('artist')[['song_name']].count()
print()

fig = px.histogram(song_count_df, x='song_name', title='Songs per artist', labels={'song_name': 'Songs'})
fig.show()




In [731]:
# distribution words per song
fig = px.histogram(song_df, x='n_words', title='Words per song')
fig.show()

In [732]:
# # create dataframe with lists of artists
song_df['words_str'] = song_df['words'].map(lambda lst: ' '.join(lst))

# # map text to artists
# words_to_artist = {}
# for tp in song_df[['artist', 'words_str']].itertuples(index=False):
#     artist = tp[0]
#     words = tp[1]
#     if words in words_to_artist:
#         words_to_artist[words].append(artist)
#     else:
#         words_to_artist[words] = [artist]

# # insert list of artists to dataframe
# song_df['artists'] = song_df['words_str'].map(words_to_artist)
# song_df['duplicates'] = song_df['artists'].map(len) - 1

# # convert list of artists to set of artists
# song_df['artists'] = song_df['artists'].map(set)
# song_df['n_artists'] = song_df['artists'].map(len)

# # remove duplicate songs
# artist_text_df = song_df.drop_duplicates('words_str')

Feature['artist'].value_counts() engineering

In [733]:
song_df

Unnamed: 0,song_name,href,lyric,artist,lines,words,n_words,words_str
0,อย่าหยุดฝัน,/music/thailyric/8770,คนบางคนมีฝันแต่ไม่เคยตามหามีกำลังก้าวไปก็ไม่เค...,bie_sukrit,26,"[คน, บางคน, ฝัน, ตามหา, ก้าว, ไขว่คว้า, กลัว, ...",73,คน บางคน ฝัน ตามหา ก้าว ไขว่คว้า กลัว บางคน ฝั...
1,รักแท้มีอยู่จริง,/music/thailyric/2945,ซ่อนตัวเองมานานเท่าไร จะไปกลัวทำไมความรักสิ่งท...,bie_sukrit,32,"[ซ่อน, ตนเอง, กลัว, ความรัก, รู้จัก, สักครั้ง,...",112,ซ่อน ตนเอง กลัว ความรัก รู้จัก สักครั้ง ลอง ดี...
2,7 วันที่ฉันเหงา,/music/thailyric/2930,วันอาทิตย์เธอไปจากฉันวันจันทร์ดูเหมือนมันช่างว...,bie_sukrit,46,"[อา, เดือน, ดูเหมือน, ช่าง, ว่างเปล่า, นอน, เห...",115,อา เดือน ดูเหมือน ช่าง ว่างเปล่า นอน เหงา อ. เ...
3,ความทรงจำในลมหายใจ,/music/thailyric/2932,มันคงเป็นโชคชะตาที่ฉันต้องเข้าใจความรักเรานั้น...,bie_sukrit,35,"[โชคชะตา, ความรัก, ปล่อย, ทาง, ดีแต่, ยังมี, น...",72,โชคชะตา ความรัก ปล่อย ทาง ดีแต่ ยังมี นึง รู้ ...
4,มากมาย,/music/thailyric/2943,มองไม่เห็นด้วยตา ถ้าจะรับรู้ต้องด้วยใจถามว่ารั...,bie_sukrit,38,"[มองไม่เห็น, ตา, รู้, ด้วยใจ, ถาม, รัก, ดี, ต่...",158,มองไม่เห็น ตา รู้ ด้วยใจ ถาม รัก ดี ต่อให้ ดาว...
...,...,...,...,...,...,...,...,...
1097,รู้ได้แล้วมั้ย,/music/thailyric/20950,แอบมองเธอมาทั้งวัน ทุกเช้าเย็นไม่รู้ว่าเธอเห็น...,zom_marie,40,"[แอบมอง, ทั้งวัน, เช้า, เย็น, รู้, ทำ, น่ารัก,...",146,แอบมอง ทั้งวัน เช้า เย็น รู้ ทำ น่ารัก ใจ สั่น...
1098,โลกอีกใบ,/music/thailyric/17300,ฉันเคยออกตามหา คนๆ นั้นจากโลกทั้งใบแล้วก็ไม่มี...,zom_marie,34,"[ตามหา, คน, โลก, ใบ, แล้วก็, ความฝัน, โลก, ใบ,...",71,ตามหา คน โลก ใบ แล้วก็ ความฝัน โลก ใบ สัก คน ร...
1099,หรือฉันคิดไปเอง,/music/thailyric/17617,อยู่คนเดียวมานานหลายปีไม่มีใครเข้ามาสักทีแต่ชี...,zom_marie,58,"[คนเดียว, ปี, เข้ามา, สักที, ชีวิต, เดิน, ต่อไ...",71,คนเดียว ปี เข้ามา สักที ชีวิต เดิน ต่อไป เข้าม...
1100,อยากจะหายตัว,/music/thailyric/18678,หัวใจมันเกิดเป็นอะไรสักอย่างหลงทางหรือเรียกร้อ...,zom_marie,39,"[หัวใจ, สัก, หลงทาง, ขอร้อง, ชอบกล, เหมือนว่า,...",125,หัวใจ สัก หลงทาง ขอร้อง ชอบกล เหมือนว่า มีเหตุ...


## Feature engineering

### Number of words

In [734]:
n_artist = 32
random.seed(0)

artist_select = random.choices(song_df['artist'].unique(), k=n_artist)

song_filter_df = song_df.loc[song_df['artist'].isin(artist_select)]
print('Total number of songs: {}'.format(len(song_filter_df)))
song_filter_df.groupby('artist')[['song_name']].count().reset_index().rename(columns={'song_name':'n_songs'})

Total number of songs: 609


Unnamed: 0,artist,n_songs
0,bowkylion,25
1,da_endorphine,45
2,dome_jaruwat,16
3,gavin_d,38
4,getsunova,34
5,klear,40
6,kong_saharat,13
7,non_tanont,36
8,nont_tanont,36
9,rose_sirinthip,67


In [735]:
fig = px.box(song_filter_df, x='artist', y='n_words', title='Word count per song by artist')
fig.show()

### Repeated

In [736]:
# number of unique stems
song_df['n_unique_words'] = song_df['words'].map(lambda lst: len(set(lst)))

# ratio of unique stems
song_df['unique_words_ratio'] = song_df['n_unique_words'] / song_df['n_words']

# attach column to selected artists
song_filter_df = song_filter_df.join(song_df['unique_words_ratio'])

In [737]:
fig = px.box(song_filter_df, x='artist', y='unique_words_ratio', title='Ratio of unique words to all words')
fig.show()

### Words per line

In [738]:
# calculate number of words per line
song_df['words_per_line'] = song_df['n_words'] / song_df['lines'].astype(float)

song_filter_df = song_filter_df.join(song_df['words_per_line'])

In [739]:
fig = px.box(song_filter_df, x='artist', y='words_per_line', title='Words per line')
fig.show()

## TFIDF

In [740]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

In [741]:
# initialise count vectorizer
cv = CountVectorizer(analyzer=lambda x:x.split())

word_count_vector = cv.fit_transform(song_df['words_str'])

# compute idf
tfidf_transformer=TfidfTransformer(smooth_idf=True,use_idf=True)
tfidf_transformer.fit(word_count_vector)

TfidfTransformer()

In [742]:
# print idf values
tfidf_df = pd.DataFrame({'word': cv.get_feature_names(), 'weight': tfidf_transformer.idf_})
 
# get lowest weights
tfidf_df.sort_values('weight').head()

Unnamed: 0,word,weight
4468,รู้,1.47637
4376,รัก,1.501501
6828,ใจ,1.527279
2117,คน,1.5569
5300,หัวใจ,1.740488


In [743]:
cv.get_feature_names()

['!',
 '#',
 '%',
 "'",
 "'é",
 '*',
 '**',
 ',',
 '-',
 '/',
 ':',
 '::',
 '?',
 'AD',
 'AK',
 'Ah',
 'Ay',
 'Ba',
 'Be',
 'Br',
 'C',
 'DC',
 'Da',
 'De',
 'Do',
 'El',
 'GG',
 'Go',
 'Hm',
 'Ho',
 'I',
 'If',
 'Ig',
 'Il',
 'In',
 'It',
 'J',
 'Ja',
 'Je',
 'KQ',
 'Ko',
 'LV',
 'La',
 'Lo',
 'Ma',
 'Me',
 'My',
 'No',
 'OK',
 'Oh',
 'On',
 'Se',
 'Si',
 'So',
 'TV',
 'Te',
 'To',
 'Tu',
 'Up',
 'Wa',
 'We',
 'Wo',
 'Ya',
 'Yo',
 '[',
 '\\',
 ']#',
 'addict',
 'adventur',
 'aey',
 'againbitt',
 'againeven',
 'againi',
 'againstorm',
 'againw',
 'againyeah',
 'ah',
 'aha',
 'ahand',
 'ahgiv',
 'ahthat',
 'aim',
 'ain',
 'aint',
 'air',
 'aixd',
 'alert',
 'all',
 'alleven',
 'allmi',
 'allpleas',
 'allw',
 'allwon',
 'alon',
 'aloneand',
 'alonei',
 'alonemeet',
 'along',
 'alreadi',
 'alright',
 'alrightjust',
 'alrightrap',
 'alrightw',
 'alskar',
 'alway',
 'amaz',
 'amcuz',
 'america',
 'amnoth',
 'amo-t',
 'amour',
 'amyou',
 'and',
 'andi',
 'animalit',
 'anniversari',
 'anoth',

In [744]:
# get highest weights
tfidf_df.sort_values('weight', ascending=False).head()

Unnamed: 0,word,weight
6038,เพียบพร้อม,7.312642
2639,ชักพา,7.312642
2625,ชอนไช,7.312642
5522,อู๊อู๊,7.312642
2628,ชอบแล้ว,7.312642


In [745]:
# assign tf idf scores to each song
tf_idf_vector = tfidf_transformer.transform(word_count_vector)

# attach count vectors to dataframe
tf_idf_vector_lst = [-1] * len(song_df)
for i in range(len(song_df)):
    tf_idf_vector_lst[i] = tf_idf_vector[i]
song_df['tf_idf_vector'] = tf_idf_vector_lst    

song_df['tf_idf_score'] = song_df['tf_idf_vector'].map(lambda vec: np.sum(vec.todense()))

# join valus to selected artists
song_filter_df = song_filter_df.join(song_df[['tf_idf_vector', 'tf_idf_score']])

In [746]:
fig = px.box(song_filter_df, x='artist', y='tf_idf_score', title='TFIDF scores of songs per artist')
fig.show()

In [747]:
# calculate mean vector
def get_mean_vector(vec_lst):
    return csr_matrix(vstack(vec_lst).mean(axis=0))

In [748]:
# calculate mean vector over all songs of same artist
artist_df = song_df.groupby('artist').agg({'tf_idf_vector': get_mean_vector, 'song_name': len}).reset_index()\
                   .rename(columns={'song_name': 'n_songs'})

# get selected artists
artist_filter_df = artist_df.loc[artist_df['artist'].isin(song_filter_df['artist'])]

In [749]:
similarity_matrix = cosine_similarity(vstack(artist_filter_df['tf_idf_vector']), 
                                      vstack(artist_filter_df['tf_idf_vector']))
artist_names = artist_filter_df['artist'].tolist()
fig = go.Figure(data=go.Heatmap(z=np.flipud(similarity_matrix), x=artist_names, y=list(reversed(artist_names)), 
                                colorscale='balance', zmin=0.5, zmax=1.1))
fig.show()

### Similarity of Songs

In [750]:
artist_filter_df

Unnamed: 0,artist,tf_idf_vector,n_songs
3,bowkylion,"(0, 3)\t0.003125338259980025\n (0, 9)\t0.00...",25
7,da_endorphine,"(0, 3)\t0.006672825882450504\n (0, 22)\t0.0...",45
9,dome_jaruwat,"(0, 3)\t0.025203730291132886\n (0, 18)\t0.0...",16
12,gavin_d,"(0, 3)\t0.04613606074994175\n (0, 5)\t0.001...",38
13,getsunova,"(0, 3)\t0.013167249460086175\n (0, 10)\t0.0...",34
15,klear,"(0, 7)\t0.012296044527670032\n (0, 15)\t0.0...",40
16,kong_saharat,"(0, 298)\t0.018989843726920103\n (0, 957)\t...",13
18,non_tanont,"(0, 3)\t0.0021825959221359647\n (0, 10)\t0....",36
19,nont_tanont,"(0, 3)\t0.0021825959221359647\n (0, 10)\t0....",36
21,rose_sirinthip,"(0, 3)\t0.001119757153695879\n (0, 23)\t0.0...",67


In [751]:
song_filter_df.head(3)

Unnamed: 0,song_name,href,lyric,artist,lines,words,n_words,words_str,unique_words_ratio,words_per_line,tf_idf_vector,tf_idf_score
157,ก่อนใคร,/music/thailyric/19128,เธอมีใครที่เข้ามาแทนที่ฉันและเขาเป็นคนสำคัญฉัน...,bowkylion,34,"[เข้ามา, แทนที่, คนสำคัญ, รัก, ไหม, ยังมี, รู้...",49,เข้ามา แทนที่ คนสำคัญ รัก ไหม ยังมี รู้ ใจ เหม...,0.44898,1.441176,"(0, 7001)\t0.12408000025924873\n (0, 6943)\...",3.860352
158,เจ้าป่า (Lionheart),/music/thailyric/19127,โปรดฟัง นี่คือเสียงข้างในหัวใจที่ดังหากหมดรักแ...,bowkylion,18,"[ชอบ, ฟัง, เสียง, ข้างใน, หัวใจ, รัก, ศรัทธา, ...",45,ชอบ ฟัง เสียง ข้างใน หัวใจ รัก ศรัทธา ความหลัง...,0.955556,2.5,"(0, 7003)\t0.12204338591091766\n (0, 6828)\...",6.039624
159,ทิวาสวัสดิ์,/music/thailyric/19130,ถ้าเธอยังรักกัน ถ้าฉันยังสำคัญอย่าปล่อยให้ฉันร...,bowkylion,28,"[รัก, อย่า, ปล่อย, ร้องไห้, คนเดียว, ปวดใจ, รั...",70,รัก อย่า ปล่อย ร้องไห้ คนเดียว ปวดใจ รัก ปล่อย...,0.585714,2.5,"(0, 6947)\t0.10752099469619929\n (0, 6617)\...",5.663194


In [752]:
artist_song_filter_df = pd.merge(artist_filter_df[['artist', 'tf_idf_vector', 'n_songs']].assign(key = 0), 
                                 song_filter_df[['artist', 'tf_idf_vector', 'song_name']].assign(key = 0), on='key', 
                                 suffixes=['_artist', '_song']).drop('key', axis=1).reset_index(drop=True)
artist_song_filter_df['same_artist'] = artist_song_filter_df['artist_artist'] == artist_song_filter_df['artist_song']

In [753]:
artist_song_filter_df[artist_song_filter_df['same_artist']==False].head()

Unnamed: 0,artist_artist,tf_idf_vector_artist,n_songs,artist_song,tf_idf_vector_song,song_name,same_artist
25,bowkylion,"(0, 3)\t0.003125338259980025\n (0, 9)\t0.00...",25,da_endorphine,"(0, 7003)\t0.05587310558728791\n (0, 6828)\...",Revolution,False
26,bowkylion,"(0, 3)\t0.003125338259980025\n (0, 9)\t0.00...",25,da_endorphine,"(0, 7007)\t0.11915105716996659\n (0, 6842)\...",คำอำลา (Have To Say Goodbye),False
27,bowkylion,"(0, 3)\t0.003125338259980025\n (0, 9)\t0.00...",25,da_endorphine,"(0, 7001)\t0.03241409833409261\n (0, 6617)\...",ฉันยังอยู่,False
28,bowkylion,"(0, 3)\t0.003125338259980025\n (0, 9)\t0.00...",25,da_endorphine,"(0, 7001)\t0.14744108941289008\n (0, 6954)\...",ระหว่างเรา...คืออะไร,False
29,bowkylion,"(0, 3)\t0.003125338259980025\n (0, 9)\t0.00...",25,da_endorphine,"(0, 6954)\t0.29165495800997854\n (0, 6861)\...",ไม่ต้องรู้ว่าเราคบกันแบบไหน,False


In [754]:
# calculate similarity of artist tf idf vector and song vector
def tf_idf_vector_similarity(artist_vector, song_vector, songs, same_artist):
    # check if song is from same artist
    if same_artist:
        # deduct song vector from artist vector
        artist_vector = (songs * artist_vector - song_vector) / (songs - 1)
    # calculate similarity
    return cosine_similarity(artist_vector, song_vector)[0][0]

In [755]:
artist_song_filter_df['vector_similarity'] = artist_song_filter_df.apply(lambda row: tf_idf_vector_similarity(row['tf_idf_vector_artist'], 
                                                                     row['tf_idf_vector_song'], 
                                                                     row['n_songs'], row['same_artist']), axis=1)

In [756]:
df = artist_song_filter_df

fig = go.Figure()

fig.add_trace(go.Violin(x=df['artist_artist'][df['same_artist']],
                        y=df['vector_similarity'][df['same_artist']],
                        legendgroup='Same Artist', scalegroup='Same Artist', name='Same Artist',
                        side='negative')
             )
fig.add_trace(go.Violin(x=df['artist_artist'][~df['same_artist']],
                        y=df['vector_similarity'][~df['same_artist']],
                        legendgroup='Different Artists', scalegroup='Different Artists', name='Different Artists',
                        side='positive')
             )


fig.update_layout(
    width=1500,
    height=800,)
fig.update_traces(meanline_visible=True)
fig.update_layout(violingap=0, violinmode='overlay')
fig.update_layout(title='Similarity of Songs')
fig.update_xaxes(range=[-0.5, 9.5])
fig.update_yaxes(range=[-0.1, 0.8], title='Similarity')
fig.show()

# Note that you should click *Autoscale* on the figure option to show all artists' violins

## Sentiment analysis

In [757]:
# polarity_lst = [-1] * len(song_df)
# subjectivity_lst = [-1] * len(song_df)

# for i, text in enumerate(song_df['lyric']):
#     sentiment = TextBlob(text)
#     polarity_lst[i] = sentiment.polarity
#     subjectivity_lst[i] = sentiment.subjectivity
    
# song_df['polarity'] = polarity_lst
# song_df['subjectivity'] = subjectivity_lst

# song_filter_df = song_filter_df.join(song_df[['polarity', 'subjectivity']])

### Polarity and Subjectivity of Songs

In [758]:
# fig = px.scatter(song_filter_df, x='polarity', y='subjectivity', color='artist', hover_data=['song_name'], title='Polarity and Subjectivity of Songs')
# fig.show()

### Polarity by artist

In [759]:
# fig = px.box(song_filter_df, x='artist', y='polarity', title='Polarity by artist')
# fig.show()

In [760]:
# song_filter_df

## Prediction

In [761]:
song_df# parameter
# number of sets
n_set = {'train': 20, 'val': 20}

# number of artists per set
n_artist = 3

# minimum number of songs of one artist
n_song_min = 5

# maximum number of song - artist pairs per artist set
n_song_artist_max = 32

In [762]:
song_df.columns

Index(['song_name', 'href', 'lyric', 'artist', 'lines', 'words', 'n_words',
       'words_str', 'n_unique_words', 'unique_words_ratio', 'words_per_line',
       'tf_idf_vector', 'tf_idf_score'],
      dtype='object')

In [763]:
def select_artist_song_create_feature(song_df, n_set, n_artist, n_song_min, n_song_artist_max):
    song_count_df = song_df.groupby('artist')[['artist']].count().rename(columns={'artist': 'count'})
    artist_lst = list(song_count_df.loc[song_count_df['count'] >= n_song_min].index.values)

    n_set_total = sum(n_set.values())

    artist_set = []
    while len(artist_set) < n_set_total:
        new_artist = tuple(np.random.choice(artist_lst, size=n_artist, replace=False))
        if new_artist not in artist_set:
            artist_set.append(new_artist)

    # split artist sets
    artist_select = {}
    for field, n in n_set.items():
        i_select = np.random.choice(range(len(artist_set)), size=n, replace=False)
        artist_list = list(artist_set)
        artist_select[field] = [artist_list[i] for i in i_select]
        artist_set = [s for s in artist_set if s not in artist_select[field]]
    # create dataframe with all features
    feature_dict = {}
    # dictionary to map artist set id to list of artists
    set_id_to_artist_tp = {}

    i = 0
    for field, artist_set in artist_select.items():
        df_lst = []
        for artist_tp in artist_set:
            i += 1
            df = song_df.loc[song_df['artist'].isin(artist_tp), 
                             ['artist', 'song_name', 'n_words', 'unique_words_ratio', 'words_per_line', 'tf_idf_vector', 
                              'tf_idf_score']]
            # check if number of songs is too high
            if len(df) * n_artist > n_song_artist_max:
                df = df.sample(int(n_song_artist_max / n_artist), random_state=0)
                
            df['artist_set_id'] = i
            set_id_to_artist_tp[i] = artist_tp
            df_lst.append(df)
        feature_dict[field] = pd.concat(df_lst)  
        print('Number of songs in {}: {}'.format(field, len(feature_dict[field])))

    # get all selected artists
    artist_select_set = set.union(*[set(sum(tp_lst, ())) for tp_lst in artist_select.values()])

    # create artist dataframe from training data
    df_lst = []
    for artist, df in song_df.loc[song_df['artist'].isin(artist_select_set)].groupby('artist'):
        dic = {'artist': artist}
        # calculate averages and standard diviations
        for field in ['n_words', 'unique_words_ratio', 'words_per_line', 'tf_idf_score']:
            dic[field + '_mean'] = df[field].mean()
            dic[field + '_std'] = df[field].std()

        # number of songs
        dic['songs'] = len(df)

        # calculate average tf idf vector
        dic['tf_idf_vector_mean'] = get_mean_vector(df['tf_idf_vector'])

        df_lst.append(pd.DataFrame(dic, index=[0]))
    artist_feature_df = pd.concat(df_lst)

    def get_features(df):
        # get artist set id
        artist_set_id = df['artist_set_id'].iloc[0]
        
        # get all artists
        artist_feature_select_df = artist_feature_df.loc[artist_feature_df['artist']\
                                                         .isin(set_id_to_artist_tp[artist_set_id])]

        # merge dataframes
        artist_song_feature_df = pd.merge(artist_feature_select_df.assign(key=0), df.assign(key=0), on='key', 
                                          suffixes=['_artist', '_song']).drop('key', axis=1)    
        artist_song_feature_df['same_artist'] = \
            artist_song_feature_df['artist_artist'] == artist_song_feature_df['artist_song']

        # calculate features
        # add feature polarity
        for feature in ['n_words', 'unique_words_ratio', 'words_per_line', 'tf_idf_score']:
            artist_song_feature_df[feature + '_diff'] = \
                artist_song_feature_df[feature] - artist_song_feature_df[feature + '_mean']
            artist_song_feature_df[feature + '_diff_std'] = \
                artist_song_feature_df[feature + '_diff'] / artist_song_feature_df[feature + '_std']

        # calculate vector similarity between artist and song
        artist_song_feature_df['vector_similarity'] = \
            artist_song_feature_df.apply(lambda row: tf_idf_vector_similarity(row['tf_idf_vector_mean'], 
                                                      row['tf_idf_vector'], row['songs'], row['same_artist']), 
                                         axis=1)    
        return artist_song_feature_df

    artist_song_feature = {}
    for field in feature_dict:
        artist_song_feature[field] = feature_dict[field].groupby('artist_set_id').apply(get_features)\
                                                        .reset_index(drop=True)
        
    return artist_song_feature

In [764]:
np.random.seed(0)
artist_song_feature = select_artist_song_create_feature(song_df, n_set, n_artist, n_song_min, n_song_artist_max)

Number of songs in train: 200
Number of songs in val: 200


In [765]:
artist_song_feature['train'].iloc[0]

artist_artist                                                       kong_saharat
n_words_mean                                                                71.0
n_words_std                                                            17.949002
unique_words_ratio_mean                                                 0.521636
unique_words_ratio_std                                                  0.109577
words_per_line_mean                                                     2.809433
words_per_line_std                                                      0.927452
tf_idf_score_mean                                                       4.843549
tf_idf_score_std                                                        0.767812
songs                                                                         13
tf_idf_vector_mean               (0, 298)\t0.018989843726920103\n  (0, 957)\t...
artist_song                                                         kong_saharat
song_name                   

Markdown

In [766]:
feature = ['n_words_diff', 'n_words_diff_std',
       'unique_words_ratio_diff', 'unique_words_ratio_diff_std',
       'words_per_line_diff', 'words_per_line_diff_std', 'tf_idf_score_diff',
       'tf_idf_score_diff_std','vector_similarity']
df_lst = []

for f in feature:
   
    df = artist_song_feature['train'][['same_artist']]
    df['feature'] = f
    df['value'] = artist_song_feature['train'][f]
    df_lst.append(df)
    
feature_df = pd.concat(df_lst)
feature_df.head()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,same_artist,feature,value
0,True,n_words_diff,-2.0
1,False,n_words_diff,45.0
2,False,n_words_diff,69.0
3,False,n_words_diff,-12.0
4,False,n_words_diff,3.0


In [767]:
feature_df.head()

Unnamed: 0,same_artist,feature,value
0,True,n_words_diff,-2.0
1,False,n_words_diff,45.0
2,False,n_words_diff,69.0
3,False,n_words_diff,-12.0
4,False,n_words_diff,3.0


In [768]:
def violine_feature_plot(feature_df, feature_select):

    fig = go.Figure()
    df = feature_df.loc[feature_df['feature'].isin(feature_select)]

    fig.add_trace(go.Violin(x=df['feature'][df['same_artist']],
                            y=df['value'][df['same_artist']],
                            legendgroup='Same Artist', scalegroup='Same Artist', name='Same Artist',
                            side='negative')
                 )
    fig.add_trace(go.Violin(x=df['feature'][~df['same_artist']],
                            y=df['value'][~df['same_artist']],
                            legendgroup='Different Artists', scalegroup='Different Artists', name='Different Artists',
                            side='positive')
                 )

    fig.update_traces(meanline_visible=True)
    fig.update_layout(violingap=0, violinmode='overlay')
    fig.update_layout(title='Feature Comparison')
    fig.update_xaxes(title='Feature')
    return fig

In [769]:
feature_df

Unnamed: 0,same_artist,feature,value
0,True,n_words_diff,-2.000000
1,False,n_words_diff,45.000000
2,False,n_words_diff,69.000000
3,False,n_words_diff,-12.000000
4,False,n_words_diff,3.000000
...,...,...,...
595,False,vector_similarity,0.118857
596,True,vector_similarity,0.065064
597,False,vector_similarity,0.182692
598,True,vector_similarity,0.218756


In [770]:
fig = violine_feature_plot(feature_df, feature[1:])
fig.update_layout(
    autosize=False,
    width=1500,
    height=800,)
fig.update_xaxes(range=[-0.5, 4.5])
fig.show()

# Note that you should click *Autoscale* on the figure option to show all artists' violins

In [771]:
fig = violine_feature_plot(feature_df, ['n_words_diff_std', 'unique_words_ratio_diff_std', 'words_per_line_diff_std', 'tf_idf_score_diff_std'])
fig.update_layout(
    autosize=False,
    width=1000,
    height=800,)
fig.update_xaxes(range=[-0.5, 4.5])
fig.show()

In [772]:
fig = violine_feature_plot(feature_df, ['vector_similarity'])
fig.update_layout(
    autosize=False,
    width=800,
    height=800,)
fig.update_xaxes(range=[-1, 1])
fig.show()

### Prepare data

In [774]:
def prepare_data(df, feature_org, feature_abs):
    for f in feature_abs:
        df[f] = df[f].abs()
    X = df[feature_org + feature_abs].values
    y = df['same_artist'].values
    
    return X, y

def select_songs_train_pipeline(song_df, n_set, n_artist, n_song_min, n_song_artist_max, feature_org, feature_abs, pipeline):
    artist_song_feature = select_artist_song_create_feature(song_df, n_set, n_artist, n_song_min, n_song_artist_max)

    # prepare data
    X, y = prepare_data(artist_song_feature['train'], feature_org, feature_abs)

    pipeline = pipeline.fit(X, y)
    
    return artist_song_feature, pipeline

In [775]:
# prepare data create and train pipeline
n_artist = 3
n_song_min = 5
n_set = {'train': 40}
n_song_artist_max = 50

feature_org = ['n_words', 'unique_words_ratio', 'words_per_line', 'tf_idf_score', 'vector_similarity']
feature_abs = ['n_words_diff', 'n_words_diff_std', 'unique_words_ratio_diff', 'unique_words_ratio_diff_std', 
               'words_per_line_diff', 'words_per_line_diff_std', 'tf_idf_score_diff', 'tf_idf_score_diff_std']

pipeline = Pipeline([('scale', StandardScaler()), 
                     ('clf', LogisticRegression(solver='lbfgs', max_iter=3000, 
                                                class_weight={False: 1/n_artist, True:(n_artist - 1)/n_artist}))])

np.random.seed(1)
artist_song_feature, pipeline = select_songs_train_pipeline(song_df, n_set, n_artist, n_song_min, n_song_artist_max, 
                                                            feature_org, feature_abs, pipeline)

Number of songs in train: 640


In [776]:
feature_importance_df = pd.DataFrame({'feature': feature_org+feature_abs, 'coefficient':pipeline['clf'].coef_[0]})

px.bar(feature_importance_df.sort_values('coefficient'), x='feature', y='coefficient')

In [777]:
def predict_artist(df, feature_org, feature_abs, pipeline, top_n):
    # prepare data
    X, y = prepare_data(df, feature_org, feature_abs)
    
    # get probability
    proba = pipeline.predict_proba(X)
    # attach to dataframe
    df['probability'] = proba[:, 1]
    df['correct_prediction'] = df['artist_artist'] == df['artist_song']
    
    # get artist song pairs with highest probability
    predict_select = df.sort_values('probability', ascending=False).groupby(['artist_set_id']).head(top_n)\
                       .groupby(['artist_set_id'])['correct_prediction'].max()
    
    # print(predict_select)
    # get accuracy
    print('Accuracy: {}'.format(predict_select.mean()))
    
    return predict_select

In [778]:
artist_predict_df = predict_artist(artist_song_feature['train'], feature_org, feature_abs, pipeline, top_n=1)

Accuracy: 0.65


In [779]:
artist_predict_df = predict_artist(artist_song_feature['train'], feature_org, feature_abs, pipeline, top_n=2)

Accuracy: 0.825


In [813]:
n_artist_lst = [2, 4, 8, 16]
top_n_lst = [1, 2, 4, 8]
n_song_artist_max = 16
np.random.seed(2)

n_set = {'train': 40, 'val': 40}

feature_org = ['n_words', 'unique_words_ratio', 'words_per_line', 'tf_idf_score', 'vector_similarity']
feature_abs = ['n_words_diff', 'n_words_diff_std', 'unique_words_ratio_diff', 'unique_words_ratio_diff_std', 
               'words_per_line_diff', 'words_per_line_diff_std', 'tf_idf_score_diff', 'tf_idf_score_diff_std', ]

pipeline = Pipeline([('scale', StandardScaler()), 
                     ('clf', LogisticRegression(solver='lbfgs', max_iter=3000, 
                                                class_weight={False: 1/n_artist, True:(n_artist - 1)/n_artist}))])

result_lst = []

for n_artist in n_artist_lst:
    print(datetime.now())
    print('n_artist: {}'.format(n_artist))
    
    artist_song_feature, pipeline = select_songs_train_pipeline(song_df, n_set, n_artist, n_song_min, 
                                                                n_song_artist_max, feature_org, feature_abs, pipeline)
    
    for top_n in [n for n in top_n_lst if n < n_artist]:
        print('top_n: {}'.format(top_n))
        
        predict_select = predict_artist(artist_song_feature['val'], feature_org, feature_abs, pipeline, top_n=top_n)
        
        result_dict = {'n_artist': n_artist, 'top_n': top_n, 'accuracy': predict_select.mean()}
        result_lst.append(result_dict)
        
    print('')
    
result_df = pd.DataFrame(result_lst)

2022-04-22 16:24:12.671825
n_artist: 2
Number of songs in train: 320
Number of songs in val: 320
top_n: 1
Accuracy: 0.875

2022-04-22 16:24:14.077825
n_artist: 4
Number of songs in train: 160
Number of songs in val: 160
top_n: 1
Accuracy: 0.6
top_n: 2
Accuracy: 0.8

2022-04-22 16:24:15.405824
n_artist: 8
Number of songs in train: 80
Number of songs in val: 80
top_n: 1
Accuracy: 0.475
top_n: 2
Accuracy: 0.725
top_n: 4
Accuracy: 0.85

2022-04-22 16:24:16.752839
n_artist: 16
Number of songs in train: 40
Number of songs in val: 40
top_n: 1
Accuracy: 0.3
top_n: 2
Accuracy: 0.425
top_n: 4
Accuracy: 0.675
top_n: 8
Accuracy: 0.825



In [814]:
fig = px.line(result_df, x='n_artist', y='accuracy', color='top_n', 
              title='Accuracy vs number of artist and number of top selections', 
              labels={'n_artist': 'Number of artists per set', 'top_n': 'Top predictions'}).update_traces(mode='lines+markers')
fig.show()