### Import Library

In [1]:
# import libraries
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import numpy as np
import random
from scipy.sparse import csr_matrix, vstack
# from textblob import TextBlob
# from langdetect import detect_langs
import pickle
import pythainlp

In [2]:
import os, sys

sys.path.insert(0,os.path.realpath(os.path.join(os.path.pardir, '..')))

from config import cfg
from utils.utils import *
from utils.tfidf import *
from utils.feature import *

[nltk_data] Downloading package words to C:\Users\FACT-
[nltk_data]     PC\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


### Import the clean data

In [3]:
song_df = pd.read_pickle(os.path.join(cfg.DATA.BASE_PATH, "clean_th_songs_base.pkl")) 

## Feature engineering

### Number of words

In [4]:
n_artist = len(song_df['artist'].unique())
random.seed(0)

artist_select = random.choices(song_df['artist'].unique(), k=n_artist)

song_filter_df = song_df.loc[song_df['artist'].isin(artist_select)]
print('Total number of songs: {}'.format(len(song_filter_df)))
song_filter_df.groupby('artist')[['song_name']].count().reset_index().rename(columns={'song_name':'n_songs'})

Total number of songs: 1295


Unnamed: 0,artist,n_songs
0,25_hours,35
1,add_carabao,35
2,ann_thitima,35
3,big_ass,35
4,bird_thongchai,35
5,bnk48,35
6,bodyslam,35
7,carabao,35
8,cocktail,35
9,da_endorphine,35


In [5]:
fig = px.box(song_filter_df, x='artist', y='n_words', title='Word count per song by artist')
fig.show()

### Count eng words

In [6]:
song_df['eng_word_ratio'] = song_df['words'].map(lambda s: len([i for i in s if not pythainlp.util.isthai(i, ignore_chars="\r")])) / song_df['n_words']

song_filter_df = song_filter_df.join(song_df[['eng_word_ratio']])

### Word within Line

In [7]:
def countWordWithInLine(l) :
    count = 0
    d = {}
    for i in l :
        if i != '\r' :
            if i in d.keys() :
                d[i] += 1
            else :
                d[i] = 1
        else :
            for val in d.values() :
                if val > 1:
                    count += 1
            d = {}
    for val in d.values() :
        if val > 1 :
            count += 1
    return count

In [8]:
song_df['words_withinline'] = song_df['words'].apply(lambda x: countWordWithInLine(x))

song_filter_df = song_filter_df.join(song_df['words_withinline'])

### Repeated

In [9]:
# number of unique stems
song_df['n_unique_words'] = song_df['words'].map(lambda lst: len(set(lst)))

# ratio of unique stems
song_df['unique_words_ratio'] = song_df['n_unique_words'] / song_df['n_words']

# attach column to selected artists
song_filter_df = song_filter_df.join(song_df['unique_words_ratio'])

In [10]:
fig = px.box(song_filter_df, x='artist', y='unique_words_ratio', title='Ratio of unique words to all words')
fig.show()

### Words per line

In [11]:
# calculate number of words per line
song_df['words_per_line'] = song_df['n_words'] / song_df['lines'].astype(float)

song_filter_df = song_filter_df.join(song_df['words_per_line'])

In [12]:
fig = px.box(song_filter_df, x='artist', y='words_per_line', title='Words per line')
fig.show()

### Number of Verse

In [13]:
song_df['n_verse'] = song_df['lyrics'].map(lambda t: len(re.findall(r'\r\r', t)))

song_filter_df = song_filter_df.join(song_df['n_verse'])

## TFIDF

In [14]:
tfidf = TFIDF(song_df, song_filter_df)

# initialise count vectorizer & compute idf

tfidf_df = tfidf.init_tfidf()

In [15]:
# get lowest weights
tfidf_df.sort_values('weight').head(5)

Unnamed: 0,word,weight
4393,คน,1.492111
14014,ใจ,1.575017
9091,รัก,1.57581
9260,รู้,1.608071
10927,หัวใจ,1.824068


In [16]:
# get highest weights
tfidf_df.sort_values('weight', ascending=False).head(5)

Unnamed: 0,word,weight
4925,คุณพ่อ,8.02153
6016,ตลบตะแลง,8.02153
6045,ตอบสนอง,8.02153
12109,เทียมทัน,8.02153
12110,เทียมทาน,8.02153


In [17]:
# This block can be run only once

song_df, song_filter_df = tfidf.add_tfidf_vector_lst()

fig = px.box(song_filter_df, x='artist', y='tf_idf_score', title='TFIDF scores of songs per artist')
fig.show()

In [18]:
# calculate mean vector over all songs of same artist
artist_df = song_df.groupby('artist').agg({'tf_idf_vector': get_mean_vector, 'song_name': len}).reset_index()\
                   .rename(columns={'song_name': 'n_songs'})

# get selected artists
artist_filter_df = artist_df.loc[artist_df['artist'].isin(song_filter_df['artist'])]

### Similarity of Songs

In [19]:
similarity_matrix = cosine_similarity(vstack(artist_filter_df['tf_idf_vector']), 
                                      vstack(artist_filter_df['tf_idf_vector']))

artist_names = artist_filter_df['artist'].tolist()
fig = go.Figure(data=go.Heatmap(z=np.flipud(similarity_matrix), x=artist_names, y=list(reversed(artist_names)), 
                                colorscale='balance', zmin=0.5, zmax=1.1))

fig.update_layout(
    width=1500,
    height=1000,)
    
fig.show()

In [20]:
artist_filter_df.head()

Unnamed: 0,artist,tf_idf_vector,n_songs
0,25_hours,"(0, 100)\t0.005485496418462423\n (0, 1052)\...",35
1,add_carabao,"(0, 31)\t0.002004706865703995\n (0, 75)\t0....",35
2,ann_thitima,"(0, 197)\t0.002597594291098225\n (0, 3665)\...",35
6,big_ass,"(0, 576)\t0.015453766351177515\n (0, 2118)\...",35
7,bird_thongchai,"(0, 88)\t0.007229318107487069\n (0, 90)\t0....",35


In [21]:
song_filter_df.head()

Unnamed: 0,artist,song_name,href,lyrics,lines,words,n_words,words_str,eng_word_ratio,words_withinline,unique_words_ratio,words_per_line,n_verse,tf_idf_vector,tf_idf_score
0,bird_thongchai,Okay,/music/thailyric/13588,ไม่ว่าจะเป็นยังไง Baby its Okay\r\nไม่ว่าจะเกิ...,57,"[babi, okay, ill, miss, day, กลับมา, come, bac...",174,babi okay ill miss day กลับมา come back babyil...,0.781609,33,0.41954,3.052632,0,"(0, 14353)\t0.02883230557981512\n (0, 14345...",4.556725
1,bird_thongchai,กว่าจักรวาล,/music/thailyric/13978,จะยอมนั่งจรวด ไปตรวจดาวอังคาร\r\nหากว่าที่แห่ง...,31,"[นั่ง, จรวด, ตรวจ, ดาวอังคาร, ว่าที่, ทิ้ง, ดา...",56,นั่ง จรวด ตรวจ ดาวอังคาร ว่าที่ ทิ้ง ดาว ขอบ จ...,0.0,13,0.482143,1.806452,0,"(0, 14193)\t0.13025943802120013\n (0, 10178...",4.212767
2,bird_thongchai,กำแพง,/music/thailyric/14111,ถ้าเคยพบเจอ กำแพงที่ดูทั้งใหญ่และสูงชัน\r\nเธอ...,39,"[เจอ, กำแพง, ดู, สูงชัน, รู้, คน, สร้าง, หลบ, ...",117,เจอ กำแพง ดู สูงชัน รู้ คน สร้าง หลบ ใจ พัก ท้...,0.0,32,0.504274,3.0,0,"(0, 14345)\t0.11707359429283427\n (0, 14014...",6.564095
3,bird_thongchai,ชีวิตเดี่ยว,/music/thailyric/13796,อยู่ตรงนี้แค่เพียงลำพัง\r\nกับความเหงาที่เป็นด...,50,"[ตรงนี้, เพียงลำพัง, ความเหงา, เงา, แต่ยังไง, ...",115,ตรงนี้ เพียงลำพัง ความเหงา เงา แต่ยังไง ชิน สั...,0.0,28,0.417391,2.3,0,"(0, 14345)\t0.10734013768978112\n (0, 14014...",5.744572
4,bird_thongchai,ผู้ต้องหา,/music/thailyric/13825,แค่ตัวคนเดียวไม่ตายล่ะมั้ง\r\nถามใจกี่ครั้งก็ย...,65,"[ตัว, คนเดียว, ตาย, ล่ะ, มั้ง, ถาม, ใจ, กี่, ร...",153,ตัว คนเดียว ตาย ล่ะ มั้ง ถาม ใจ กี่ รีบ คน รู้...,0.0,24,0.300654,2.353846,0,"(0, 14014)\t0.010143307333446037\n (0, 1396...",3.995738


In [22]:
artist_song_filter_df = pd.merge(artist_filter_df[['artist', 'tf_idf_vector', 'n_songs']].assign(key = 0), 
                                 song_filter_df[['artist', 'tf_idf_vector', 'song_name']].assign(key = 0), on='key', 
                                 suffixes=['_artist', '_song']).drop('key', axis=1).reset_index(drop=True)

artist_song_filter_df['same_artist'] = artist_song_filter_df['artist_artist'] == artist_song_filter_df['artist_song']

artist_song_filter_df[artist_song_filter_df['same_artist']==False].head()

Unnamed: 0,artist_artist,tf_idf_vector_artist,n_songs,artist_song,tf_idf_vector_song,song_name,same_artist
0,25_hours,"(0, 100)\t0.005485496418462423\n (0, 1052)\...",35,bird_thongchai,"(0, 14353)\t0.02883230557981512\n (0, 14345...",Okay,False
1,25_hours,"(0, 100)\t0.005485496418462423\n (0, 1052)\...",35,bird_thongchai,"(0, 14193)\t0.13025943802120013\n (0, 10178...",กว่าจักรวาล,False
2,25_hours,"(0, 100)\t0.005485496418462423\n (0, 1052)\...",35,bird_thongchai,"(0, 14345)\t0.11707359429283427\n (0, 14014...",กำแพง,False
3,25_hours,"(0, 100)\t0.005485496418462423\n (0, 1052)\...",35,bird_thongchai,"(0, 14345)\t0.10734013768978112\n (0, 14014...",ชีวิตเดี่ยว,False
4,25_hours,"(0, 100)\t0.005485496418462423\n (0, 1052)\...",35,bird_thongchai,"(0, 14014)\t0.010143307333446037\n (0, 1396...",ผู้ต้องหา,False


In [23]:
artist_song_filter_df['vector_similarity'] = artist_song_filter_df.apply(lambda row: tf_idf_vector_similarity(
                                                                                row['tf_idf_vector_artist'], 
                                                                                row['tf_idf_vector_song'], 
                                                                                row['n_songs'], row['same_artist']), axis=1)

# This function may take longer than 8 minutes.

In [24]:
df = artist_song_filter_df

fig = go.Figure()

fig.add_trace(go.Violin(x=df['artist_artist'][df['same_artist']],
                        y=df['vector_similarity'][df['same_artist']],
                        legendgroup='Same Artist', scalegroup='Same Artist', name='Same Artist',
                        side='negative')
             )

fig.add_trace(go.Violin(x=df['artist_artist'][~df['same_artist']],
                        y=df['vector_similarity'][~df['same_artist']],
                        legendgroup='Different Artists', scalegroup='Different Artists', name='Different Artists',
                        side='positive')
             )

fig.update_layout(
    width=2000,
    height=800,)
fig.update_traces(meanline_visible=True)
fig.update_layout(violingap=0, violinmode='overlay')
fig.update_layout(title='Similarity of Songs')
fig.update_xaxes(range=[-0.5, 9.5])
fig.update_yaxes(range=[-0.1, 0.8], title='Similarity')
fig.show()

# Note that you should click *Autoscale* on the figure option to show all artists' violins

## Export the feature-extracted data

In [25]:
song_df.to_pickle(os.path.join(cfg.DATA.BASE_PATH, "fx_th_songs_newlog.pkl"))