In [602]:
# import libraries
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import numpy as np
import re
import random
import nltk
from scipy import sparse
from scipy.sparse import csr_matrix, vstack
from textblob import TextBlob
from langdetect import detect_langs
import pickle
from datetime import datetime
import string
from matplotlib.pyplot import figure

from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [603]:
import glob, os

#You may comment this line if *WinError 2* occurs
# os.chdir("")

all_files = glob.glob("./thaisongs/*.csv")

li = []

for filename in all_files:
    song_df = pd.read_csv(filename, index_col=None, header=0)
    li.append(song_df)

song_df = pd.concat(li, axis=0, ignore_index=True)

In [604]:
lyric_in_round_brackets = sum(list(song_df['lyric'].map(lambda s: re.findall(r'\((.*?)\)',s))), [])
print('Number of round brackets: {}'.format(len(lyric_in_round_brackets)))

lyric_in_square_brackets = sum(list(song_df['lyric'].map(lambda s: re.findall(r'\[(.*?)\]',s))), [])
print('Number of square brackets: {}'.format(len(lyric_in_square_brackets)))

lyric_in_curly_brackets = sum(list(song_df['lyric'].map(lambda s: re.findall(r'\{(.*?)\}',s))), [])
print('Number of curly brackets: {}'.format(len(lyric_in_curly_brackets)))

Number of round brackets: 2787
Number of square brackets: 45
Number of curly brackets: 0


In [605]:
# remove round brackets but not text within
song_df['lyric'] = song_df['lyric'].map(lambda s: re.sub(r'\(|\)', '', s))

lyric_in_round_brackets = sum(list(song_df['lyric'].map(lambda s: re.findall(r'\((.*?)\)',s))), [])
print('Number of round brackets: {}'.format(len(lyric_in_round_brackets)))

Number of round brackets: 0


In [606]:
dot = sum(list(song_df['lyric'].map(lambda s: re.findall('\.',s))), [])
print('Number of dot: {}'.format(len(dot)))

Number of dot: 6020


In [607]:
# remove dot, star, colon, semicolon 
dot = sum(list(song_df['lyric'].map(lambda s: re.findall('\.',s))), [])
star = sum(list(song_df['lyric'].map(lambda s: re.findall('\*',s))), [])
colon = sum(list(song_df['lyric'].map(lambda s: re.findall('\:',s))), [])
apos = sum(list(song_df['lyric'].map(lambda s: re.findall(r"\'",s))), [])
# e_apos = sum(list(song_df['lyric'].map(lambda s: re.findall(r"\é",s))), [])
exclam_mark = sum(list(song_df['lyric'].map(lambda s: re.findall(r"\!",s))), [])
slash = sum(list(song_df['lyric'].map(lambda s: re.findall(r"\/",s))), [])
question_mark = sum(list(song_df['lyric'].map(lambda s: re.findall(r"\?",s))), [])
hashtag = sum(list(song_df['lyric'].map(lambda s: re.findall(r"\#",s))), [])
comma = sum(list(song_df['lyric'].map(lambda s: re.findall(r"\%",s))), [])
neg = sum(list(song_df['lyric'].map(lambda s: re.findall(r"\-",s))), [])
comma = sum(list(song_df['lyric'].map(lambda s: re.findall(r"\,",s))), [])


song_df['lyric'] = song_df['lyric'].map(lambda s: re.sub(r'\.', '', s))
song_df['lyric'] = song_df['lyric'].map(lambda s: re.sub(r'\*', '', s))
song_df['lyric'] = song_df['lyric'].map(lambda s: re.sub(r'\:', '', s))
song_df['lyric'] = song_df['lyric'].map(lambda s: re.sub(r"\'", '', s))
# song_df['lyric'] = song_df['lyric'].map(lambda s: re.sub(r"\é", '', s))
song_df['lyric'] = song_df['lyric'].map(lambda s: re.sub(r"\!", '', s))
song_df['lyric'] = song_df['lyric'].map(lambda s: re.sub(r"\/", '', s))
song_df['lyric'] = song_df['lyric'].map(lambda s: re.sub(r"\?", '', s))
song_df['lyric'] = song_df['lyric'].map(lambda s: re.sub(r"\#", '', s))
song_df['lyric'] = song_df['lyric'].map(lambda s: re.sub(r"\%", '', s))
song_df['lyric'] = song_df['lyric'].map(lambda s: re.sub(r"\-", '', s))
song_df['lyric'] = song_df['lyric'].map(lambda s: re.sub(r"\,", '', s))


print('Number of dot: {}'.format(len(dot)))
print('Number of star: {}'.format(len(star)))
print('Number of colon: {}'.format(len(colon)))
print('Number of apos: {}'.format(len(apos)))
# print('Number of e_apos: {}'.format(len(e_apos)))
print('Number of exclam_mark: {}'.format(len(exclam_mark)))
print('Number of slash: {}'.format(len(slash)))
print('Number of question_mark: {}'.format(len(question_mark)))
print('Number of hashtag: {}'.format(len(hashtag)))
print('Number of percent: {}'.format(len(percent)))
print('Number of neg: {}'.format(len(neg)))
print('Number of comma: {}'.format(len(neg)))

Number of dot: 6020
Number of star: 86
Number of colon: 90
Number of apos: 1948
Number of exclam_mark: 190
Number of slash: 112
Number of question_mark: 186
Number of hashtag: 15
Number of percent: 1
Number of neg: 165
Number of comma: 165


In [608]:
# count number of lines
song_df['lines'] = song_df['lyric'].map(lambda t: len(re.findall(r'\n', t)))

In [609]:
# remove line breaks
song_df['lyric'] = song_df['lyric'].map(lambda s: re.sub(r' \n|\n', '', s))

In [610]:
lyric_in_round_brackets = sum(list(song_df['lyric'].map(lambda s: re.findall(r'\((.*?)\)',s))), [])
print('Number of round brackets: {}'.format(len(lyric_in_round_brackets)))
song_df.head()

Number of round brackets: 0


Unnamed: 0,song_name,href,lyric,artist,lines
0,คนข้างข้าง,/music/thailyric/8582,เธอเห็นขอบฟ้านั้นไหม สักวันจะพาเธอไปบนทางที่มี...,25_hours,36
1,คิดเหมือนกันหรือเปล่า,/music/thailyric/8529,ฉันรู้สึกเราเข้ากันพอดียิ้มที่เธอมีไม่เหมือนใค...,25_hours,34
2,เที่ยงคืนสิบห้านาที,/music/thailyric/11791,เที่ยงคืนสิบห้านาที กับวันที่ฉันนั่งเหม่อที่เด...,25_hours,39
3,ใบไม้,/music/thailyric/11789,เธอใช่ไหม ที่เดินเข้ามาและทำให้ใจฉันไหวดั่งยอด...,25_hours,43
4,ย้อนเวลา,/music/thailyric/11790,สิ่งดีดีที่มีจวบจนวันนี้มันยังคงดีงามเหมือนเก่...,25_hours,26


Tokenise

In [611]:
import pythainlp
from pythainlp import word_tokenize
from pythainlp.corpus.common import thai_stopwords
from pythainlp.corpus import wordnet
from nltk.stem.porter import PorterStemmer
from nltk.corpus import words
from stop_words import get_stop_words
# from pythainlp.ulmfit import process_thai

In [612]:
import nltk
nltk.download('words')
th_stop = tuple(thai_stopwords())
en_stop = tuple(get_stop_words('en'))
p_stemmer = PorterStemmer()

[nltk_data] Downloading package words to C:\Users\FACT-
[nltk_data]     PC\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


In [613]:
def split_word(text):
            
    
    tokens = word_tokenize(text,engine='newmm')
    
    # Remove stop words ภาษาไทย และภาษาอังกฤษ
    tokens = [i for i in tokens if not i in th_stop and not i in en_stop]
    
    # หารากศัพท์ภาษาไทย และภาษาอังกฤษ
    # English
    tokens = [p_stemmer.stem(i) for i in tokens]
    
    # Thai
    tokens_temp=[]
    for i in tokens:
        w_syn = wordnet.synsets(i)
        if (len(w_syn)>0) and (len(w_syn[0].lemma_names('tha'))>0):
            tokens_temp.append(w_syn[0].lemma_names('tha')[0])
        else:
            tokens_temp.append(i)
    
    tokens = tokens_temp
    
    # ลบตัวเลข
    tokens = [i for i in tokens if not i.isnumeric()]
    
    # ลบช่องว่าง
    tokens = [i for i in tokens if not ' ' in i]

    return tokens

In [614]:
song_df['words'] = song_df['lyric'].map(lambda s: split_word(s))

In [615]:
song_df[['lyric','words']].tail()

Unnamed: 0,lyric,words
3536,แอบมองเธอมาทั้งวัน ทุกเช้าเย็นไม่รู้ว่าเธอเห็น...,"[แอบมอง, ทั้งวัน, เช้า, เย็น, รู้, ทำ, น่ารัก,..."
3537,ฉันเคยออกตามหา คนๆ นั้นจากโลกทั้งใบแล้วก็ไม่มี...,"[ตามหา, คน, โลก, ใบ, แล้วก็, ความฝัน, โลก, ใบ,..."
3538,อยู่คนเดียวมานานหลายปีไม่มีใครเข้ามาสักทีแต่ชี...,"[คนเดียว, ปี, เข้ามา, สักที, ชีวิต, เดิน, ต่อไ..."
3539,หัวใจมันเกิดเป็นอะไรสักอย่างหลงทางหรือเรียกร้อ...,"[หัวใจ, สัก, หลงทาง, ขอร้อง, ชอบกล, เหมือนว่า,..."
3540,คุณและคุณ และคุณเท่านั้นที่ทำให้ฉันนั้นสตั้นอย...,"[ฉันนั้น, สตั้น, ตรงนี้, ฉันนั้น, อึ้ง, เป็นอย..."


In [616]:
song_df['n_words'] = song_df['words'].map(len)

EDA

In [617]:
# number of songs
print('number of songs: ', str(len(song_df)))

# number of artists
print('number of artists: ', str(len(song_df['artist'].unique())))

number of songs:  3541
number of artists:  133


In [618]:
# distribution songs per artist
song_count_df = song_df.groupby('artist')[['song_name']].count()
print()

fig = px.histogram(song_count_df, x='song_name', title='Songs per artist', labels={'song_name': 'Songs'})
fig.show()




In [619]:
# distribution words per song
fig = px.histogram(song_df, x='n_words', title='Words per song')
fig.show()

In [620]:
# # create dataframe with lists of artists
song_df['words_str'] = song_df['words'].map(lambda lst: ' '.join(lst))

# # map text to artists
# words_to_artist = {}
# for tp in song_df[['artist', 'words_str']].itertuples(index=False):
#     artist = tp[0]
#     words = tp[1]
#     if words in words_to_artist:
#         words_to_artist[words].append(artist)
#     else:
#         words_to_artist[words] = [artist]

# # insert list of artists to dataframe
# song_df['artists'] = song_df['words_str'].map(words_to_artist)
# song_df['duplicates'] = song_df['artists'].map(len) - 1

# # convert list of artists to set of artists
# song_df['artists'] = song_df['artists'].map(set)
# song_df['n_artists'] = song_df['artists'].map(len)

# # remove duplicate songs
# artist_text_df = song_df.drop_duplicates('words_str')

Feature['artist'].value_counts() engineering

In [621]:
song_df

Unnamed: 0,song_name,href,lyric,artist,lines,words,n_words,words_str
0,คนข้างข้าง,/music/thailyric/8582,เธอเห็นขอบฟ้านั้นไหม สักวันจะพาเธอไปบนทางที่มี...,25_hours,36,"[ขอบฟ้า, ไหม, สักวัน, หกล้ม, ผู้ร่วมทาง, จับมื...",124,ขอบฟ้า ไหม สักวัน หกล้ม ผู้ร่วมทาง จับมือ เคีย...
1,คิดเหมือนกันหรือเปล่า,/music/thailyric/8529,ฉันรู้สึกเราเข้ากันพอดียิ้มที่เธอมีไม่เหมือนใค...,25_hours,34,"[รู้สึก, แมตช์, ยิ้ม, เหมือน, รู้, เพ้อ, นาที,...",43,รู้สึก แมตช์ ยิ้ม เหมือน รู้ เพ้อ นาที huhuhu ...
2,เที่ยงคืนสิบห้านาที,/music/thailyric/11791,เที่ยงคืนสิบห้านาที กับวันที่ฉันนั่งเหม่อที่เด...,25_hours,39,"[สองยาม, นาที, วัน, นั่ง, เหม่อ, เดิม, ชอบ, นั...",100,สองยาม นาที วัน นั่ง เหม่อ เดิม ชอบ นั่ง ตรงนี...
3,ใบไม้,/music/thailyric/11789,เธอใช่ไหม ที่เดินเข้ามาและทำให้ใจฉันไหวดั่งยอด...,25_hours,43,"[เดิน, เข้ามา, ใจ, ไหว, ยอด, กิ่งไม้, โดน, ลม,...",65,เดิน เข้ามา ใจ ไหว ยอด กิ่งไม้ โดน ลม ไหว ลม ใ...
4,ย้อนเวลา,/music/thailyric/11790,สิ่งดีดีที่มีจวบจนวันนี้มันยังคงดีงามเหมือนเก่...,25_hours,26,"[ดี, ดี, ดีงาม, เหมือน, เก่า, ต่างกัน, ตรงนี้,...",59,ดี ดี ดีงาม เหมือน เก่า ต่างกัน ตรงนี้ คน เดิม...
...,...,...,...,...,...,...,...,...
3536,รู้ได้แล้วมั้ย,/music/thailyric/20950,แอบมองเธอมาทั้งวัน ทุกเช้าเย็นไม่รู้ว่าเธอเห็น...,zom_marie,40,"[แอบมอง, ทั้งวัน, เช้า, เย็น, รู้, ทำ, น่ารัก,...",134,แอบมอง ทั้งวัน เช้า เย็น รู้ ทำ น่ารัก ใจ สั่น...
3537,โลกอีกใบ,/music/thailyric/17300,ฉันเคยออกตามหา คนๆ นั้นจากโลกทั้งใบแล้วก็ไม่มี...,zom_marie,34,"[ตามหา, คน, โลก, ใบ, แล้วก็, ความฝัน, โลก, ใบ,...",71,ตามหา คน โลก ใบ แล้วก็ ความฝัน โลก ใบ สัก คน ร...
3538,หรือฉันคิดไปเอง,/music/thailyric/17617,อยู่คนเดียวมานานหลายปีไม่มีใครเข้ามาสักทีแต่ชี...,zom_marie,58,"[คนเดียว, ปี, เข้ามา, สักที, ชีวิต, เดิน, ต่อไ...",71,คนเดียว ปี เข้ามา สักที ชีวิต เดิน ต่อไป เข้าม...
3539,อยากจะหายตัว,/music/thailyric/18678,หัวใจมันเกิดเป็นอะไรสักอย่างหลงทางหรือเรียกร้อ...,zom_marie,39,"[หัวใจ, สัก, หลงทาง, ขอร้อง, ชอบกล, เหมือนว่า,...",125,หัวใจ สัก หลงทาง ขอร้อง ชอบกล เหมือนว่า มีเหตุ...


## Feature engineering

### Number of words

In [622]:
n_artist = 32
random.seed(0)

artist_select = random.choices(song_df['artist'].unique(), k=n_artist)

song_filter_df = song_df.loc[song_df['artist'].isin(artist_select)]
print('Total number of songs: {}'.format(len(song_filter_df)))
song_filter_df.groupby('artist')[['song_name']].count().reset_index().rename(columns={'song_name':'n_songs'})

Total number of songs: 831


Unnamed: 0,artist,n_songs
0,boy_trai_bhumiratna,25
1,dome_pakorn_lam,24
2,dr_fuu,24
3,gam_wichayanee,42
4,getsunova,34
5,ging_muanpair,16
6,j_jetrin,72
7,jintara_poonlarb,59
8,keng_tachaya,60
9,lazyloxy,17


In [623]:
fig = px.box(song_filter_df, x='artist', y='n_words', title='Word count per song by artist')
fig.show()

### Repeated

In [624]:
# number of unique stems
song_df['n_unique_words'] = song_df['words'].map(lambda lst: len(set(lst)))

# ratio of unique stems
song_df['unique_words_ratio'] = song_df['n_unique_words'] / song_df['n_words']

# attach column to selected artists
song_filter_df = song_filter_df.join(song_df['unique_words_ratio'])

In [625]:
fig = px.box(song_filter_df, x='artist', y='unique_words_ratio', title='Ratio of unique words to all words')
fig.show()

### Words per line

In [626]:
# calculate number of words per line
song_df['words_per_line'] = song_df['n_words'] / song_df['lines'].astype(float)

song_filter_df = song_filter_df.join(song_df['words_per_line'])

In [627]:
fig = px.box(song_filter_df, x='artist', y='words_per_line', title='Words per line')
fig.show()

## TFIDF

In [628]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

In [629]:
# initialise count vectorizer
cv = CountVectorizer(analyzer=lambda x:x.split())

word_count_vector = cv.fit_transform(song_df['words_str'])

# compute idf
tfidf_transformer=TfidfTransformer(smooth_idf=True,use_idf=True)
tfidf_transformer.fit(word_count_vector)

TfidfTransformer()

In [630]:
# print idf values
tfidf_df = pd.DataFrame({'word': cv.get_feature_names(), 'weight': tfidf_transformer.idf_})
 
# get lowest weights
tfidf_df.sort_values('weight').head()

Unnamed: 0,word,weight
5508,คน,1.515164
16157,ใจ,1.535212
10709,รัก,1.540046
10915,รู้,1.578066
12706,หัวใจ,1.789079


In [631]:
cv.get_feature_names()

['&',
 '+',
 '+++',
 ';',
 'AD',
 'AK',
 'Ah',
 'An',
 'As',
 'Ay',
 'Ba',
 'Be',
 'Br',
 'By',
 'C',
 'CA',
 'CC',
 'CD',
 'Cu',
 'D',
 'DC',
 'DM',
 'DO',
 'Da',
 'De',
 'Dm',
 'Do',
 'E',
 'Ed',
 'El',
 'Ey',
 'F',
 'FC',
 'G',
 'GG',
 'GO',
 'Go',
 'H',
 'HD',
 'Ha',
 'He',
 'Hm',
 'Ho',
 'Hu',
 'I',
 'IG',
 'II',
 'IS',
 'Id',
 'If',
 'Ig',
 'Ii',
 'Im',
 'In',
 'Is',
 'It',
 'J',
 'Ja',
 'Je',
 'K',
 'KH',
 'KQ',
 'Ko',
 'LA',
 'LV',
 'La',
 'Lo',
 'MY',
 'Ma',
 'Me',
 'Mo',
 'My',
 'NA',
 'NO',
 'Na',
 'No',
 'OH',
 'OK',
 'Of',
 'Oh',
 'Ok',
 'On',
 'PM',
 'SD',
 'SF',
 'Sa',
 'Se',
 'Si',
 'So',
 'TO',
 'TV',
 'Ta',
 'Te',
 'To',
 'Tu',
 'UN',
 'Uh',
 'Um',
 'Up',
 'Us',
 'VR',
 'Wa',
 'We',
 'Wo',
 'X',
 'XD',
 'YO',
 'Ya',
 'Yo',
 '[',
 '\\',
 ']',
 '_',
 '_________________________________',
 'abandon',
 'aboutthat',
 'abovemi',
 'abovethey',
 'aboveyour',
 'absolut',
 'absolutelyright',
 'accid',
 'account',
 'ace',
 'across',
 'act',
 'action',
 'actionsa',
 'actionsguess'

In [632]:
# get highest weights
tfidf_df.sort_values('weight', ascending=False).head()

Unnamed: 0,word,weight
5608,ครองแชมป์,8.4793
6820,ช่ำใจ,8.4793
14102,เนียส,8.4793
6848,ซม.,8.4793
14099,เนียก,8.4793


In [633]:
# assign tf idf scores to each song
tf_idf_vector = tfidf_transformer.transform(word_count_vector)

# attach count vectors to dataframe
tf_idf_vector_lst = [-1] * len(song_df)
for i in range(len(song_df)):
    tf_idf_vector_lst[i] = tf_idf_vector[i]
song_df['tf_idf_vector'] = tf_idf_vector_lst    

song_df['tf_idf_score'] = song_df['tf_idf_vector'].map(lambda vec: np.sum(vec.todense()))

# join valus to selected artists
song_filter_df = song_filter_df.join(song_df[['tf_idf_vector', 'tf_idf_score']])

In [634]:
fig = px.box(song_filter_df, x='artist', y='tf_idf_score', title='TFIDF scores of songs per artist')
fig.show()

In [635]:
# calculate mean vector
def get_mean_vector(vec_lst):
    return csr_matrix(vstack(vec_lst).mean(axis=0))

In [636]:
# calculate mean vector over all songs of same artist
artist_df = song_df.groupby('artist').agg({'tf_idf_vector': get_mean_vector, 'song_name': len}).reset_index()\
                   .rename(columns={'song_name': 'n_songs'})

# get selected artists
artist_filter_df = artist_df.loc[artist_df['artist'].isin(song_filter_df['artist'])]

In [637]:
similarity_matrix = cosine_similarity(vstack(artist_filter_df['tf_idf_vector']), 
                                      vstack(artist_filter_df['tf_idf_vector']))
artist_names = artist_filter_df['artist'].tolist()
fig = go.Figure(data=go.Heatmap(z=np.flipud(similarity_matrix), x=artist_names, y=list(reversed(artist_names)), 
                                colorscale='balance', zmin=0.5, zmax=1.1))
fig.show()

### Similarity of Songs

In [638]:
artist_filter_df

Unnamed: 0,artist,tf_idf_vector,n_songs
13,boy_trai_bhumiratna,"(0, 356)\t0.007554208269722263\n (0, 2810)\...",25
33,dome_pakorn_lam,"(0, 356)\t0.002036753271840002\n (0, 697)\t...",24
34,dr_fuu,"(0, 1780)\t0.0016562054048208882\n (0, 1929...",24
37,gam_wichayanee,"(0, 44)\t0.00121151243514157\n (0, 203)\t0....",42
40,getsunova,"(0, 44)\t0.009561322193419415\n (0, 52)\t0....",34
41,ging_muanpair,"(0, 4808)\t0.011630892785844085\n (0, 4813)...",16
46,j_jetrin,"(0, 14)\t0.005337624609776197\n (0, 15)\t0....",72
54,jintara_poonlarb,"(0, 36)\t0.0009628775792371366\n (0, 44)\t0...",59
57,keng_tachaya,"(0, 36)\t0.004193270792146778\n (0, 44)\t0....",60
62,lazyloxy,"(0, 6)\t0.00410240015932404\n (0, 21)\t0.00...",17


In [639]:
song_filter_df.head(3)

Unnamed: 0,song_name,href,lyric,artist,lines,words,n_words,words_str,unique_words_ratio,words_per_line,tf_idf_vector,tf_idf_score
377,เขาไม่เกี่ยว,/music/thailyric/2732,ก็จะไม่ถามและจะไม่อยากรู้ เรื่องเธอกับเขาไม่เอ...,boy_trai_bhumiratna,30,"[ถาม, อยากรู้, เรื่อง, ไม่ต้อง, เล่า, เรื่อง, ...",78,ถาม อยากรู้ เรื่อง ไม่ต้อง เล่า เรื่อง ฮูฮู อย...,0.294872,2.6,"(0, 16432)\t0.05147682263342569\n (0, 16157...",2.946291
378,คนใจน้อย,/music/thailyric/2733,ทำไมทำไมนะใจฉัน ชอบเป็นอย่างนี้หรือไงทำไมต้องค...,boy_trai_bhumiratna,23,"[ใจ, ชอบ, คอย, รัก, สนใจ, คน, กระเป๋าเงิน, รูป...",55,ใจ ชอบ คอย รัก สนใจ คน กระเป๋าเงิน รูป ดู โทร ...,0.727273,2.391304,"(0, 16432)\t0.07827641134693596\n (0, 16398...",5.732344
379,เชื่อ,/music/thailyric/2735,เปิดประตูกลับมาในห้องว่างเปล่า อยู่คนเดียวอีกแ...,boy_trai_bhumiratna,18,"[ประตู, กลับมา, ห้อง, ว่างเปล่า, คนเดียว, อีกแ...",53,ประตู กลับมา ห้อง ว่างเปล่า คนเดียว อีกแล้ว เง...,0.698113,2.944444,"(0, 16521)\t0.17276222401950928\n (0, 14731...",5.60126


In [640]:
artist_song_filter_df = pd.merge(artist_filter_df[['artist', 'tf_idf_vector', 'n_songs']].assign(key = 0), 
                                 song_filter_df[['artist', 'tf_idf_vector', 'song_name']].assign(key = 0), on='key', 
                                 suffixes=['_artist', '_song']).drop('key', axis=1).reset_index(drop=True)
artist_song_filter_df['same_artist'] = artist_song_filter_df['artist_artist'] == artist_song_filter_df['artist_song']

In [641]:
artist_song_filter_df[artist_song_filter_df['same_artist']==False].head()

Unnamed: 0,artist_artist,tf_idf_vector_artist,n_songs,artist_song,tf_idf_vector_song,song_name,same_artist
25,boy_trai_bhumiratna,"(0, 356)\t0.007554208269722263\n (0, 2810)\...",25,dome_pakorn_lam,"(0, 16293)\t0.018778465945572514\n (0, 1615...",ชนะด้วยหัวใจ,False
26,boy_trai_bhumiratna,"(0, 356)\t0.007554208269722263\n (0, 2810)\...",25,dome_pakorn_lam,"(0, 16521)\t0.03341047556161979\n (0, 16475...",เอฟวรี ไนต์ สแตน (Every Night Stand),False
27,boy_trai_bhumiratna,"(0, 356)\t0.007554208269722263\n (0, 2810)\...",25,dome_pakorn_lam,"(0, 16521)\t0.30762834461190164\n (0, 16157...",สมมติ,False
28,boy_trai_bhumiratna,"(0, 356)\t0.007554208269722263\n (0, 2810)\...",25,dome_pakorn_lam,"(0, 16521)\t0.27854108239833664\n (0, 16438...",อย่ารักเขาได้ไหม,False
29,boy_trai_bhumiratna,"(0, 356)\t0.007554208269722263\n (0, 2810)\...",25,dome_pakorn_lam,"(0, 16548)\t0.04596100812891486\n (0, 16432...",เต็มที่กับชีวิต,False


In [642]:
# calculate similarity of artist tf idf vector and song vector
def tf_idf_vector_similarity(artist_vector, song_vector, songs, same_artist):
    # check if song is from same artist
    if same_artist:
        # deduct song vector from artist vector
        artist_vector = (songs * artist_vector - song_vector) / (songs - 1)
    # calculate similarity
    return cosine_similarity(artist_vector, song_vector)[0][0]

In [643]:
artist_song_filter_df['vector_similarity'] = artist_song_filter_df.apply(lambda row: tf_idf_vector_similarity(row['tf_idf_vector_artist'], 
                                                                     row['tf_idf_vector_song'], 
                                                                     row['n_songs'], row['same_artist']), axis=1)

In [644]:
df = artist_song_filter_df

fig = go.Figure()

fig.add_trace(go.Violin(x=df['artist_artist'][df['same_artist']],
                        y=df['vector_similarity'][df['same_artist']],
                        legendgroup='Same Artist', scalegroup='Same Artist', name='Same Artist',
                        side='negative')
             )
fig.add_trace(go.Violin(x=df['artist_artist'][~df['same_artist']],
                        y=df['vector_similarity'][~df['same_artist']],
                        legendgroup='Different Artists', scalegroup='Different Artists', name='Different Artists',
                        side='positive')
             )


fig.update_layout(
    width=1500,
    height=800,)
fig.update_traces(meanline_visible=True)
fig.update_layout(violingap=0, violinmode='overlay')
fig.update_layout(title='Similarity of Songs')
fig.update_xaxes(range=[-0.5, 9.5])
fig.update_yaxes(range=[-0.1, 0.8], title='Similarity')
fig.show()

# Note that you should click *Autoscale* on the figure option to show all artists' violins

## Sentiment analysis

In [645]:
# polarity_lst = [-1] * len(song_df)
# subjectivity_lst = [-1] * len(song_df)

# for i, text in enumerate(song_df['lyric']):
#     sentiment = TextBlob(text)
#     polarity_lst[i] = sentiment.polarity
#     subjectivity_lst[i] = sentiment.subjectivity
    
# song_df['polarity'] = polarity_lst
# song_df['subjectivity'] = subjectivity_lst

# song_filter_df = song_filter_df.join(song_df[['polarity', 'subjectivity']])

### Polarity and Subjectivity of Songs

In [646]:
# fig = px.scatter(song_filter_df, x='polarity', y='subjectivity', color='artist', hover_data=['song_name'], title='Polarity and Subjectivity of Songs')
# fig.show()

### Polarity by artist

In [647]:
# fig = px.box(song_filter_df, x='artist', y='polarity', title='Polarity by artist')
# fig.show()

In [648]:
# song_filter_df

## Prediction

In [649]:
song_df# parameter
# number of sets
n_set = {'train': 20, 'val': 20}

# number of artists per set
n_artist = 3

# minimum number of songs of one artist
n_song_min = 5

# maximum number of song - artist pairs per artist set
n_song_artist_max = 32

In [650]:
song_df.columns

Index(['song_name', 'href', 'lyric', 'artist', 'lines', 'words', 'n_words',
       'words_str', 'n_unique_words', 'unique_words_ratio', 'words_per_line',
       'tf_idf_vector', 'tf_idf_score'],
      dtype='object')

In [651]:
def select_artist_song_create_feature(song_df, n_set, n_artist, n_song_min, n_song_artist_max):
    song_count_df = song_df.groupby('artist')[['artist']].count().rename(columns={'artist': 'count'})
    artist_lst = list(song_count_df.loc[song_count_df['count'] >= n_song_min].index.values)

    n_set_total = sum(n_set.values())

    artist_set = []
    while len(artist_set) < n_set_total:
        new_artist = tuple(np.random.choice(artist_lst, size=n_artist, replace=False))
        if new_artist not in artist_set:
            artist_set.append(new_artist)

    # split artist sets
    artist_select = {}
    for field, n in n_set.items():
        i_select = np.random.choice(range(len(artist_set)), size=n, replace=False)
        artist_list = list(artist_set)
        artist_select[field] = [artist_list[i] for i in i_select]
        artist_set = [s for s in artist_set if s not in artist_select[field]]
    # create dataframe with all features
    feature_dict = {}
    # dictionary to map artist set id to list of artists
    set_id_to_artist_tp = {}

    i = 0
    for field, artist_set in artist_select.items():
        df_lst = []
        for artist_tp in artist_set:
            i += 1
            df = song_df.loc[song_df['artist'].isin(artist_tp), 
                             ['artist', 'song_name', 'n_words', 'unique_words_ratio', 'words_per_line', 'tf_idf_vector', 
                              'tf_idf_score']]
            # check if number of songs is too high
            if len(df) * n_artist > n_song_artist_max:
                df = df.sample(int(n_song_artist_max / n_artist), random_state=0)
                
            df['artist_set_id'] = i
            set_id_to_artist_tp[i] = artist_tp
            df_lst.append(df)
        feature_dict[field] = pd.concat(df_lst)  
        print('Number of songs in {}: {}'.format(field, len(feature_dict[field])))

    # get all selected artists
    artist_select_set = set.union(*[set(sum(tp_lst, ())) for tp_lst in artist_select.values()])

    # create artist dataframe from training data
    df_lst = []
    for artist, df in song_df.loc[song_df['artist'].isin(artist_select_set)].groupby('artist'):
        dic = {'artist': artist}
        # calculate averages and standard diviations
        for field in ['n_words', 'unique_words_ratio', 'words_per_line', 'tf_idf_score']:
            dic[field + '_mean'] = df[field].mean()
            dic[field + '_std'] = df[field].std()

        # number of songs
        dic['songs'] = len(df)

        # calculate average tf idf vector
        dic['tf_idf_vector_mean'] = get_mean_vector(df['tf_idf_vector'])

        df_lst.append(pd.DataFrame(dic, index=[0]))
    artist_feature_df = pd.concat(df_lst)

    def get_features(df):
        # get artist set id
        artist_set_id = df['artist_set_id'].iloc[0]
        
        # get all artists
        artist_feature_select_df = artist_feature_df.loc[artist_feature_df['artist']\
                                                         .isin(set_id_to_artist_tp[artist_set_id])]

        # merge dataframes
        artist_song_feature_df = pd.merge(artist_feature_select_df.assign(key=0), df.assign(key=0), on='key', 
                                          suffixes=['_artist', '_song']).drop('key', axis=1)    
        artist_song_feature_df['same_artist'] = \
            artist_song_feature_df['artist_artist'] == artist_song_feature_df['artist_song']

        # calculate features
        # add feature polarity
        for feature in ['n_words', 'unique_words_ratio', 'words_per_line', 'tf_idf_score']:
            artist_song_feature_df[feature + '_diff'] = \
                artist_song_feature_df[feature] - artist_song_feature_df[feature + '_mean']
            artist_song_feature_df[feature + '_diff_std'] = \
                artist_song_feature_df[feature + '_diff'] / artist_song_feature_df[feature + '_std']

        # calculate vector similarity between artist and song
        artist_song_feature_df['vector_similarity'] = \
            artist_song_feature_df.apply(lambda row: tf_idf_vector_similarity(row['tf_idf_vector_mean'], 
                                                      row['tf_idf_vector'], row['songs'], row['same_artist']), 
                                         axis=1)    
        return artist_song_feature_df

    artist_song_feature = {}
    for field in feature_dict:
        artist_song_feature[field] = feature_dict[field].groupby('artist_set_id').apply(get_features)\
                                                        .reset_index(drop=True)
        
    return artist_song_feature

In [652]:
np.random.seed(0)
artist_song_feature = select_artist_song_create_feature(song_df, n_set, n_artist, n_song_min, n_song_artist_max)

Number of songs in train: 200
Number of songs in val: 200


In [653]:
artist_song_feature['train'].iloc[0]

artist_artist                                                          getsunova
n_words_mean                                                          101.235294
n_words_std                                                            33.895614
unique_words_ratio_mean                                                  0.41301
unique_words_ratio_std                                                   0.09307
words_per_line_mean                                                     2.196801
words_per_line_std                                                      0.393798
tf_idf_score_mean                                                       4.761456
tf_idf_score_std                                                        0.914203
songs                                                                         34
tf_idf_vector_mean               (0, 44)\t0.009561322193419415\n  (0, 52)\t0....
artist_song                                                            getsunova
song_name                   

Markdown

In [654]:
feature = ['n_words_diff', 'n_words_diff_std',
        'unique_words_ratio_diff', 'unique_words_ratio_diff_std',
        'words_per_line_diff', 'words_per_line_diff_std', 'tf_idf_score_diff',
        'tf_idf_score_diff_std','vector_similarity']
df_lst = []

for f in feature:
   
    df = artist_song_feature['train'][['same_artist']]
    df['feature'] = f
    df['value'] = artist_song_feature['train'][f]
    df_lst.append(df)
    
feature_df = pd.concat(df_lst)
feature_df.head()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,same_artist,feature,value
0,True,n_words_diff,14.764706
1,False,n_words_diff,116.764706
2,False,n_words_diff,-9.235294
3,True,n_words_diff,-26.235294
4,True,n_words_diff,-17.235294


In [655]:
feature_df.head()

Unnamed: 0,same_artist,feature,value
0,True,n_words_diff,14.764706
1,False,n_words_diff,116.764706
2,False,n_words_diff,-9.235294
3,True,n_words_diff,-26.235294
4,True,n_words_diff,-17.235294


In [656]:
def violine_feature_plot(feature_df, feature_select):

    fig = go.Figure()
    df = feature_df.loc[feature_df['feature'].isin(feature_select)]

    fig.add_trace(go.Violin(x=df['feature'][df['same_artist']],
                            y=df['value'][df['same_artist']],
                            legendgroup='Same Artist', scalegroup='Same Artist', name='Same Artist',
                            side='negative')
                 )
    fig.add_trace(go.Violin(x=df['feature'][~df['same_artist']],
                            y=df['value'][~df['same_artist']],
                            legendgroup='Different Artists', scalegroup='Different Artists', name='Different Artists',
                            side='positive')
                 )

    fig.update_traces(meanline_visible=True)
    fig.update_layout(violingap=0, violinmode='overlay')
    fig.update_layout(title='Feature Comparison')
    fig.update_xaxes(title='Feature')
    return fig

In [657]:
feature_df

Unnamed: 0,same_artist,feature,value
0,True,n_words_diff,14.764706
1,False,n_words_diff,116.764706
2,False,n_words_diff,-9.235294
3,True,n_words_diff,-26.235294
4,True,n_words_diff,-17.235294
...,...,...,...
595,True,vector_similarity,0.210630
596,False,vector_similarity,0.150404
597,True,vector_similarity,0.067630
598,False,vector_similarity,0.145518


In [658]:
fig = violine_feature_plot(feature_df, feature[1:])
fig.update_layout(
    autosize=False,
    width=1500,
    height=800,)
fig.update_xaxes(range=[-0.5, 4.5])
fig.show()

# Note that you should click *Autoscale* on the figure option to show all artists' violins

In [659]:
fig = violine_feature_plot(feature_df, ['n_words_diff_std', 'unique_words_ratio_diff_std', 'words_per_line_diff_std', 'tf_idf_score_diff_std'])
fig.update_layout(
    autosize=False,
    width=1000,
    height=800,)
fig.update_xaxes(range=[-0.5, 4.5])
fig.show()

In [660]:
fig = violine_feature_plot(feature_df, ['vector_similarity'])
fig.update_layout(
    autosize=False,
    width=800,
    height=800,)
fig.update_xaxes(range=[-1, 1])
fig.show()

### Prepare data

In [661]:
def prepare_data(df, feature_org, feature_abs):
    for f in feature_abs:
        df[f] = df[f].abs()
    X = df[feature_org + feature_abs].values
    y = df['same_artist'].values
    
    return X, y

def select_songs_train_pipeline(song_df, n_set, n_artist, n_song_min, n_song_artist_max, feature_org, feature_abs, pipeline):
    artist_song_feature = select_artist_song_create_feature(song_df, n_set, n_artist, n_song_min, n_song_artist_max)

    # prepare data
    X, y = prepare_data(artist_song_feature['train'], feature_org, feature_abs)

    pipeline = pipeline.fit(X, y)
    
    return artist_song_feature, pipeline

In [662]:
# prepare data create and train pipeline
n_artist = 3
n_song_min = 5
n_set = {'train': 40}
n_song_artist_max = 50

feature_org = ['n_words', 'unique_words_ratio', 'words_per_line', 'tf_idf_score', 'vector_similarity']
feature_abs = ['n_words_diff', 'n_words_diff_std', 'unique_words_ratio_diff', 'unique_words_ratio_diff_std', 
               'words_per_line_diff', 'words_per_line_diff_std', 'tf_idf_score_diff', 'tf_idf_score_diff_std']

pipeline = Pipeline([('scale', StandardScaler()), 
                     ('clf', LogisticRegression(solver='lbfgs', max_iter=3000, 
                                                class_weight={False: 1/n_artist, True:(n_artist - 1)/n_artist}))])

np.random.seed(1)
artist_song_feature, pipeline = select_songs_train_pipeline(song_df, n_set, n_artist, n_song_min, n_song_artist_max, 
                                                            feature_org, feature_abs, pipeline)

Number of songs in train: 640


In [663]:
feature_importance_df = pd.DataFrame({'feature': feature_org+feature_abs, 'coefficient':pipeline['clf'].coef_[0]})

px.bar(feature_importance_df.sort_values('coefficient'), x='feature', y='coefficient')

In [664]:
def predict_artist(df, feature_org, feature_abs, pipeline, top_n):
    # prepare data
    X, y = prepare_data(df, feature_org, feature_abs)
    
    # get probability
    proba = pipeline.predict_proba(X)
    # attach to dataframe
    df['probability'] = proba[:, 1]
    df['correct_prediction'] = df['artist_artist'] == df['artist_song']
    
    # get artist song pairs with highest probability
    predict_select = df.sort_values('probability', ascending=False).groupby(['artist_set_id']).head(top_n)\
                       .groupby(['artist_set_id'])['correct_prediction'].max()
    
    # print(predict_select)
    # get accuracy
    print('Accuracy: {}'.format(predict_select.mean()))
    
    return predict_select

In [665]:
artist_predict_df = predict_artist(artist_song_feature['train'], feature_org, feature_abs, pipeline, top_n=1)

Accuracy: 0.875


In [666]:
artist_predict_df = predict_artist(artist_song_feature['train'], feature_org, feature_abs, pipeline, top_n=2)

Accuracy: 0.925


In [667]:
n_artist_lst = [2, 4, 8, 16]
top_n_lst = [1, 2, 4, 8]
n_song_artist_max = 16
np.random.seed(2)

n_set = {'train': 100, 'val': 100}

feature_org = ['n_words', 'unique_words_ratio', 'words_per_line', 'tf_idf_score', 'vector_similarity']
feature_abs = ['n_words_diff', 'n_words_diff_std', 'unique_words_ratio_diff', 'unique_words_ratio_diff_std', 
               'words_per_line_diff', 'words_per_line_diff_std', 'tf_idf_score_diff', 'tf_idf_score_diff_std', ]

pipeline = Pipeline([('scale', StandardScaler()), 
                     ('clf', LogisticRegression(solver='lbfgs', max_iter=3000, 
                                                class_weight={False: 1/n_artist, True:(n_artist - 1)/n_artist}))])

result_lst = []

for n_artist in n_artist_lst:
    print(datetime.now())
    print('n_artist: {}'.format(n_artist))
    
    artist_song_feature, pipeline = select_songs_train_pipeline(song_df, n_set, n_artist, n_song_min, 
                                                                n_song_artist_max, feature_org, feature_abs, pipeline)
    
    for top_n in [n for n in top_n_lst if n < n_artist]:
        print('top_n: {}'.format(top_n))
        
        predict_select = predict_artist(artist_song_feature['val'], feature_org, feature_abs, pipeline, top_n=top_n)
        
        result_dict = {'n_artist': n_artist, 'top_n': top_n, 'accuracy': predict_select.mean()}
        result_lst.append(result_dict)
        
    print('')
    
result_df = pd.DataFrame(result_lst)

2022-04-24 00:53:25.798963
n_artist: 2
Number of songs in train: 800
Number of songs in val: 800
top_n: 1
Accuracy: 0.88

2022-04-24 00:53:30.177958
n_artist: 4
Number of songs in train: 400
Number of songs in val: 400
top_n: 1
Accuracy: 0.71
top_n: 2
Accuracy: 0.9

2022-04-24 00:53:34.712955
n_artist: 8
Number of songs in train: 200
Number of songs in val: 200
top_n: 1
Accuracy: 0.52
top_n: 2
Accuracy: 0.76
top_n: 4
Accuracy: 0.91

2022-04-24 00:53:39.089937
n_artist: 16
Number of songs in train: 100
Number of songs in val: 100
top_n: 1
Accuracy: 0.26
top_n: 2
Accuracy: 0.47
top_n: 4
Accuracy: 0.66
top_n: 8
Accuracy: 0.89



In [668]:
fig = px.line(result_df, x='n_artist', y='accuracy', color='top_n', 
              title='Accuracy vs number of artist and number of top selections', 
              labels={'n_artist': 'Number of artists per set', 'top_n': 'Top predictions'}).update_traces(mode='lines+markers')
fig.show()

In [669]:
feature_columns = ['n_words', 'unique_words_ratio', 'words_per_line', 'tf_idf_score']

embeddings = song_df[feature_columns].copy()
embeddings['n_words'] = (embeddings['n_words'])/(embeddings['n_words'].max()//2)
embeddings = embeddings.values.tolist()

identities = []

name_dict = {}

label_i = 0
for name in song_df['artist'].values:
    if name not in name_dict:
        name_dict[name] = label_i
        label_i = label_i+1
    
    identities.append(name_dict[name])

In [670]:
import time 
from sklearn.manifold import TSNE

time_start = time.time()
tsne = TSNE(n_components=2, verbose=1, perplexity=30, n_iter=3000)
tsne_result = tsne.fit_transform(embeddings)
print('t-SNE done! Time elapsed: {} seconds'.format(time.time()-time_start))

[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 3541 samples in 0.003s...
[t-SNE] Computed neighbors for 3541 samples in 0.065s...
[t-SNE] Computed conditional probabilities for sample 1000 / 3541
[t-SNE] Computed conditional probabilities for sample 2000 / 3541
[t-SNE] Computed conditional probabilities for sample 3000 / 3541
[t-SNE] Computed conditional probabilities for sample 3541 / 3541
[t-SNE] Mean sigma: 0.120909
[t-SNE] KL divergence after 250 iterations with early exaggeration: 66.357330
[t-SNE] KL divergence after 3000 iterations: 0.804081
t-SNE done! Time elapsed: 31.747023344039917 seconds


In [671]:
import plotly.express as px

label = [] 
idx = 0 
id2label = dict() 
for identity in identities: 
  identity = int(identity) 
  if identity not in id2label:
    id2label[identity] = idx 
    idx += 1 
  label.append(id2label[identity])

df_subset = pd.DataFrame({'label': label})

df_subset['tsne-2d-one'] = tsne_result[:,0]
df_subset['tsne-2d-two'] = tsne_result[:,1]

fig = px.scatter(df_subset, x="tsne-2d-one", y="tsne-2d-two", color="label", height=1000, width=1000)
fig.update_layout(margin=dict(l=0, r=0, b=0, t=0))
fig.show()

In [672]:
import time 
from sklearn.manifold import TSNE

time_start = time.time()
tsne = TSNE(n_components=3, verbose=1, perplexity=30, n_iter=3000)
tsne_result = tsne.fit_transform(embeddings)
print('t-SNE done! Time elapsed: {} seconds'.format(time.time()-time_start))

[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 3541 samples in 0.003s...
[t-SNE] Computed neighbors for 3541 samples in 0.062s...
[t-SNE] Computed conditional probabilities for sample 1000 / 3541
[t-SNE] Computed conditional probabilities for sample 2000 / 3541
[t-SNE] Computed conditional probabilities for sample 3000 / 3541
[t-SNE] Computed conditional probabilities for sample 3541 / 3541
[t-SNE] Mean sigma: 0.120909
[t-SNE] KL divergence after 250 iterations with early exaggeration: 66.697823
[t-SNE] KL divergence after 2750 iterations: 0.702298
t-SNE done! Time elapsed: 61.366896629333496 seconds


In [673]:
import plotly.express as px

label = [] 
idx = 0 
id2label = dict() 
for identity in identities: 
  identity = int(identity) 
  if identity not in id2label:
    id2label[identity] = idx 
    idx += 1 
  label.append(id2label[identity])

df_subset = pd.DataFrame({'label': label})

df_subset['tsne-3d-one'] = tsne_result[:,0]
df_subset['tsne-3d-two'] = tsne_result[:,1]
df_subset['tsne-3d-three'] = tsne_result[:,2]

fig = px.scatter_3d(df_subset, x="tsne-3d-one", y="tsne-3d-two", z="tsne-3d-three", color="label")
fig.update_layout(margin=dict(l=0, r=0, b=0, t=0))
fig.show()