### Import Library

In [33]:
# import libraries
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import numpy as np
import random
from scipy.sparse import csr_matrix, vstack
# from textblob import TextBlob
# from langdetect import detect_langs
import pickle

from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

### Import the clean data

In [34]:
song_df = pd.read_pickle('../../data_lyrics/pd/clean_th_songs.pkl')

In [35]:
song_df.dtypes

artist        object
song_name     object
href          object
lyric         object
lines          int64
words         object
n_words        int64
words_str     object
artists       object
duplicates     int64
n_artists      int64
dtype: object

## Feature engineering

### Number of words

In [36]:
n_artist = len(song_df['artist'].unique())
random.seed(0)

artist_select = random.choices(song_df['artist'].unique(), k=n_artist)

song_filter_df = song_df.loc[song_df['artist'].isin(artist_select)]
print('Total number of songs: {}'.format(len(song_filter_df)))
song_filter_df.groupby('artist')[['song_name']].count().reset_index().rename(columns={'song_name':'n_songs'})

Total number of songs: 1295


Unnamed: 0,artist,n_songs
0,25_hours,35
1,add_carabao,35
2,ann_thitima,35
3,big_ass,35
4,bird_thongchai,35
5,bnk48,35
6,bodyslam,35
7,carabao,35
8,cocktail,35
9,da_endorphine,35


In [37]:
fig = px.box(song_filter_df, x='artist', y='n_words', title='Word count per song by artist')
fig.show()

### Repeated

In [38]:
# number of unique stems
song_df['n_unique_words'] = song_df['words'].map(lambda lst: len(set(lst)))

# ratio of unique stems
song_df['unique_words_ratio'] = song_df['n_unique_words'] / song_df['n_words']

# attach column to selected artists
song_filter_df = song_filter_df.join(song_df['unique_words_ratio'])

In [39]:
fig = px.box(song_filter_df, x='artist', y='unique_words_ratio', title='Ratio of unique words to all words')
fig.show()

### Words per line

In [40]:
# calculate number of words per line
song_df['words_per_line'] = song_df['n_words'] / song_df['lines'].astype(float)

song_filter_df = song_filter_df.join(song_df['words_per_line'])

In [41]:
fig = px.box(song_filter_df, x='artist', y='words_per_line', title='Words per line')
fig.show()

## TFIDF

In [42]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

In [43]:
# initialise count vectorizer
cv = CountVectorizer(analyzer=lambda x:x.split())

word_count_vector = cv.fit_transform(song_df['words_str'])

# compute idf
tfidf_transformer=TfidfTransformer(smooth_idf=True,use_idf=True)
tfidf_transformer.fit(word_count_vector)

In [44]:
# print idf values
tfidf_df = pd.DataFrame({'word': cv.get_feature_names(), 'weight': tfidf_transformer.idf_})
 
# get lowest weights
tfidf_df.sort_values('weight').head()


Function get_feature_names is deprecated; get_feature_names is deprecated in 1.0 and will be removed in 1.2. Please use get_feature_names_out instead.



Unnamed: 0,word,weight
13370,ไม่,1.100365
5494,ที่,1.170345
3815,จะ,1.186346
13339,ไป,1.191737
13219,ให้,1.210286


In [45]:
cv.get_feature_names()


Function get_feature_names is deprecated; get_feature_names is deprecated in 1.0 and will be removed in 1.2. Please use get_feature_names_out instead.



['[',
 'a',
 'about',
 'abov',
 'absolut',
 'accent',
 'accept',
 'accessori',
 'ace',
 'achiev',
 'across',
 'act',
 'action',
 'ad',
 'adapt',
 'addi',
 'addict',
 'address',
 'adventur',
 'aey',
 'afraid',
 'after',
 'again',
 'agent',
 'agre',
 'ah',
 'aha',
 'ahead',
 'ahh',
 'ai',
 'aim',
 'aint',
 'air',
 'airlin',
 'aitakatta',
 'aixd',
 'ak',
 'album',
 'alcohol',
 'alert',
 'alexand',
 'aliv',
 'all',
 'alley',
 'allow',
 'alon',
 'along',
 'alreadi',
 'alright',
 'alskar',
 'although',
 'alway',
 'am',
 'america',
 'among',
 'amot',
 'amp',
 'an',
 'anata',
 'and',
 'angel',
 'ani',
 'anim',
 'ann',
 'anoth',
 'answer',
 'anybodi',
 'anymor',
 'anyon',
 'anyth',
 'anytim',
 'anyway',
 'aouuu',
 'ap',
 'apart',
 'app',
 'appealin',
 'appear',
 'applic',
 'appreci',
 'approach',
 'aquaman',
 'archer',
 'are',
 'ark',
 'arm',
 'armi',
 'armour',
 'around',
 'art',
 'artist',
 'as',
 'asian',
 'ask',
 'ass',
 'at',
 'attack',
 'attitud',
 'attract',
 'auckland',
 'audi',
 'auto'

In [46]:
# get highest weights
tfidf_df.sort_values('weight', ascending=False).head(5)

Unnamed: 0,word,weight
0,[,8.02153
7153,พูล,8.02153
7184,ฟักไข่,8.02153
7183,ฟักทอง,8.02153
7180,ฟอง,8.02153


In [47]:
# assign tf idf scores to each song
tf_idf_vector = tfidf_transformer.transform(word_count_vector)

# attach count vectors to dataframe
tf_idf_vector_lst = [-1] * len(song_df)
for i in range(len(song_df)):
    tf_idf_vector_lst[i] = tf_idf_vector[i]
song_df['tf_idf_vector'] = tf_idf_vector_lst    

song_df['tf_idf_score'] = song_df['tf_idf_vector'].map(lambda vec: np.sum(vec.todense()))

# join valus to selected artists
song_filter_df = song_filter_df.join(song_df[['tf_idf_vector', 'tf_idf_score']])

In [48]:
tf_idf_vector.shape

(2240, 13513)

In [49]:
fig = px.box(song_filter_df, x='artist', y='tf_idf_score', title='TFIDF scores of songs per artist')
fig.show()

In [50]:
# calculate mean vector
def get_mean_vector(vec_lst):
    return csr_matrix(vstack(vec_lst).mean(axis=0))

In [51]:
# calculate mean vector over all songs of same artist
artist_df = song_df.groupby('artist').agg({'tf_idf_vector': get_mean_vector, 'song_name': len}).reset_index()\
                   .rename(columns={'song_name': 'n_songs'})

# get selected artists
artist_filter_df = artist_df.loc[artist_df['artist'].isin(song_filter_df['artist'])]

### Similarity of Songs

In [52]:
similarity_matrix = cosine_similarity(vstack(artist_filter_df['tf_idf_vector']), 
                                      vstack(artist_filter_df['tf_idf_vector']))
artist_names = artist_filter_df['artist'].tolist()
fig = go.Figure(data=go.Heatmap(z=np.flipud(similarity_matrix), x=artist_names, y=list(reversed(artist_names)), 
                                colorscale='balance', zmin=0.5, zmax=1.1))

fig.update_layout(
    width=1500,
    height=1000,)
    
fig.show()

In [53]:
artist_filter_df

Unnamed: 0,artist,tf_idf_vector,n_songs
0,25_hours,"(0, 59)\t0.004297334094741244\n (0, 662)\t0...",35
1,add_carabao,"(0, 1)\t0.014633107002671019\n (0, 21)\t0.0...",35
2,ann_thitima,"(0, 1)\t0.001199627225432793\n (0, 109)\t0....",35
6,big_ass,"(0, 357)\t0.012350142704695559\n (0, 1269)\...",35
7,bird_thongchai,"(0, 42)\t0.002626827414628101\n (0, 51)\t0....",35
8,bnk48,"(0, 11)\t0.0010226403195978766\n (0, 25)\t0...",35
9,bodyslam,"(0, 2207)\t0.00264755348775342\n (0, 2260)\...",35
10,carabao,"(0, 2204)\t0.0026893738911699747\n (0, 2206...",35
11,cocktail,"(0, 111)\t0.004789324582297804\n (0, 365)\t...",35
12,da_endorphine,"(0, 1)\t0.00044912663312639327\n (0, 59)\t0...",35


In [54]:
song_filter_df.sample(5)

Unnamed: 0,artist,song_name,href,lyric,lines,words,n_words,words_str,artists,duplicates,n_artists,unique_words_ratio,words_per_line,tf_idf_vector,tf_idf_score
667,keng_tachaya,ของรักของหวง,/music/thailyric/9621,ขึ้นทรงคอคชาเอราวัณ ทหารแห่ โห่สนั่น หวั่นไหว\...,0,"[ขึ้น, ทรง, คอ, ค, ชา, เอราวัณ, ทหาร, แห่, โห่...",318,ขึ้น ทรง คอ ค ชา เอราวัณ ทหาร แห่ โห่ สนั่น หว...,{keng_tachaya},0,1,0.371069,inf,"(0, 13488)\t0.03915555302539431\n (0, 13394...",7.256928
1291,maew_jirasak,คำเดียว,/music/thailyric/767,ปากคนอย่างฉัน กับคำหวานๆ\rมันไม่ค่อยเข้ากัน เท...,0,"[ปาก, คน, อย่าง, ฉัน, กับ, คำหวาน, ๆ\r, มัน, ไ...",261,ปาก คน อย่าง ฉัน กับ คำหวาน ๆ\r มัน ไม่ค่อย เข...,{maew_jirasak},0,1,0.241379,inf,"(0, 13488)\t0.05812655795423931\n (0, 13483...",5.986025
1682,25_hours,เที่ยงคืนสิบห้านาที,/music/thailyric/11791,เที่ยงคืนสิบห้านาที กับวันที่ฉันนั่งเหม่อ\rที่...,0,"[เที่ยงคืน, สิบห้า, นาที, กับ, วันที่, ฉัน, นั...",257,เที่ยงคืน สิบห้า นาที กับ วันที่ ฉัน นั่ง เหม่...,{25_hours},0,1,0.22179,inf,"(0, 13460)\t0.22392129673882685\n (0, 13441...",4.911217
2165,nut_meria,ฉันอยู่ตรงไหน,/music/thailyric/2313,มองตาของเธอ ก็เจอเงาเขาอยู่ ไม่เคยจะรู้มีฉันอย...,0,"[มอง, ตา, ของ, เธอ, ก็, เจอ, เงา, เขา, อยู่, ไ...",216,มอง ตา ของ เธอ ก็ เจอ เงา เขา อยู่ ไม่ เคย จะ ...,{nut_meria},0,1,0.333333,inf,"(0, 13468)\t0.03960420886206716\n (0, 13460...",6.540305
2060,mc_king,ส่วนหนึ่ง,/music/thailyric/18372,อย่าลืมเผื่อใจ ถ้าหากว่าเจาไปรักใคร จะได้ไม่ช้...,0,"[อย่า, ลืม, เผื่อ, ใจ, ถ้าหากว่า, เจา, ไป, รัก...",392,อย่า ลืม เผื่อ ใจ ถ้าหากว่า เจา ไป รัก ใคร จะ ...,{mc_king},0,1,0.336735,inf,"(0, 13441)\t0.10462484899008165\n (0, 13438...",8.878277


In [55]:
artist_song_filter_df = pd.merge(artist_filter_df[['artist', 'tf_idf_vector', 'n_songs']].assign(key = 0), 
                                 song_filter_df[['artist', 'tf_idf_vector', 'song_name']].assign(key = 0), on='key', 
                                 suffixes=['_artist', '_song']).drop('key', axis=1).reset_index(drop=True)
artist_song_filter_df['same_artist'] = artist_song_filter_df['artist_artist'] == artist_song_filter_df['artist_song']

In [56]:
artist_song_filter_df[artist_song_filter_df['same_artist']==False].head()

Unnamed: 0,artist_artist,tf_idf_vector_artist,n_songs,artist_song,tf_idf_vector_song,song_name,same_artist
0,25_hours,"(0, 59)\t0.004297334094741244\n (0, 662)\t0...",35,bird_thongchai,"(0, 13468)\t0.024545557335906996\n (0, 1346...",Okay,False
1,25_hours,"(0, 59)\t0.004297334094741244\n (0, 662)\t0...",35,bird_thongchai,"(0, 13455)\t0.07318947591287757\n (0, 13399...",กว่าจักรวาล,False
2,25_hours,"(0, 59)\t0.004297334094741244\n (0, 662)\t0...",35,bird_thongchai,"(0, 13460)\t0.08907922620038226\n (0, 13455...",กำแพง,False
3,25_hours,"(0, 59)\t0.004297334094741244\n (0, 662)\t0...",35,bird_thongchai,"(0, 13460)\t0.09079480652329217\n (0, 13455...",ชีวิตเดี่ยว,False
4,25_hours,"(0, 59)\t0.004297334094741244\n (0, 662)\t0...",35,bird_thongchai,"(0, 13455)\t0.03317924596013951\n (0, 13370...",ผู้ต้องหา,False


In [57]:
# calculate similarity of artist tf idf vector and song vector
def tf_idf_vector_similarity(artist_vector, song_vector, songs, same_artist):
    # check if song is from same artist
    if same_artist:
        # deduct song vector from artist vector
        artist_vector = (songs * artist_vector - song_vector) / (songs - 1)
    # calculate similarity
    return cosine_similarity(artist_vector, song_vector)[0][0]

In [58]:
artist_song_filter_df['vector_similarity'] = artist_song_filter_df.apply(lambda row: tf_idf_vector_similarity(row['tf_idf_vector_artist'], 
                                                                     row['tf_idf_vector_song'], 
                                                                     row['n_songs'], row['same_artist']), axis=1)

# This function may take longer than 8 minutes.

In [59]:
df = artist_song_filter_df

fig = go.Figure()

fig.add_trace(go.Violin(x=df['artist_artist'][df['same_artist']],
                        y=df['vector_similarity'][df['same_artist']],
                        legendgroup='Same Artist', scalegroup='Same Artist', name='Same Artist',
                        side='negative')
             )
fig.add_trace(go.Violin(x=df['artist_artist'][~df['same_artist']],
                        y=df['vector_similarity'][~df['same_artist']],
                        legendgroup='Different Artists', scalegroup='Different Artists', name='Different Artists',
                        side='positive')
             )


fig.update_layout(
    width=2000,
    height=800,)
fig.update_traces(meanline_visible=True)
fig.update_layout(violingap=0, violinmode='overlay')
fig.update_layout(title='Similarity of Songs')
fig.update_xaxes(range=[-0.5, 9.5])
fig.update_yaxes(range=[-0.1, 0.8], title='Similarity')
fig.show()

# Note that you should click *Autoscale* on the figure option to show all artists' violins

## Sentiment analysis

In [60]:
# polarity_lst = [-1] * len(song_df)
# subjectivity_lst = [-1] * len(song_df)

# for i, text in enumerate(song_df['lyric']):
#     sentiment = TextBlob(text)
#     polarity_lst[i] = sentiment.polarity
#     subjectivity_lst[i] = sentiment.subjectivity
    
# song_df['polarity'] = polarity_lst
# song_df['subjectivity'] = subjectivity_lst

# song_filter_df = song_filter_df.join(song_df[['polarity', 'subjectivity']])

### Polarity and Subjectivity of Songs

In [61]:
# fig = px.scatter(song_filter_df, x='polarity', y='subjectivity', color='artist', hover_data=['song_name'], title='Polarity and Subjectivity of Songs')
# fig.show()

### Polarity by artist

In [62]:
# fig = px.box(song_filter_df, x='artist', y='polarity', title='Polarity by artist')
# fig.show()

In [63]:
# song_filter_df

## Export the feature-extracted data

In [66]:
with open('../../data_lyrics/thaisongs/fx_th_songs.bin',"wb") as f:
    pickle.dump(song_df, f)