In [106]:
import pandas as pd
import numpy as np

from sklearn.cluster import KMeans
from sklearn.metrics import davies_bouldin_score

from sklearn.decomposition import LatentDirichletAllocation

from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split

from sklearn.metrics import f1_score
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import make_scorer

from sklearn.model_selection import GridSearchCV

from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn import tree
from sklearn.naive_bayes import GaussianNB


from time import time

from tqdm import tqdm

import pickle
import json

# CLustering

In [17]:
K = list(range(2, 10))

In [6]:
df = pd.read_csv(r'./df_tfidf4395.csv')
df.head()

Unnamed: 0,act,afraid,age,ago,ah,ai,aint,air,aliv,alon,...,x2,ya,ye,yea,yeah,year,yesterday,yo,young,index
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.028271,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.085668,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2
3,0.0,0.0,0.0,0.0,0.0,0.03901,0.0,0.254412,0.0,0.0,...,0.0,0.0,0.100178,0.0,0.035947,0.188197,0.0,0.0,0.027036,3
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.216461,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4


In [12]:
features = df.columns.tolist()[:-1]

In [16]:
data = df[features].values

## Kmeans

In [20]:
results = {}
for k in K:
    kmeans = KMeans(n_clusters=k, random_state=905).fit(data)
    labels = kmeans.labels_
    results[k] = {
                  'db_index': davies_bouldin_score(data, labels),
                  'labels': kmeans.labels_,
                  'centres': kmeans.cluster_centers_
                 }
    print(k, davies_bouldin_score(data, labels))

2 8.5776640097168
3 7.602568532441171
4 6.5123734239046405
5 7.860200282523778
6 7.593222031160148
7 7.236792062867887
8 6.571751734390039
9 6.643355777503522


## LDA

In [37]:
n_top_words = 20

In [38]:
lda = LatentDirichletAllocation(n_components=4, 
                                max_iter=5,
                                learning_method='online',
                                learning_offset=50.,
                                random_state=905)

In [39]:
t0 = time()
lda.fit(data)
print("done in %0.3fs." % (time() - t0))

done in 3.233s.


In [40]:
def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        message = "Topic #%d: " % topic_idx
        message += " ".join([feature_names[i]
                             for i in topic.argsort()[:-n_top_words - 1:-1]])
        print(message)
    print()

In [41]:
print_top_words(lda, features, n_top_words)

Topic #0: la holi da shall prais glori mr bore lord king hi born power chain fame lil luck ooh sing high
Topic #1: dem doll di wild escap lone choru quick check like war say bout heart want deep day man come da
Topic #2: thou brown touch suck hot gold kill burn great fuck murder bag woah feel tree love bird deeper need wit
Topic #3: love know like got ll time ve oh come want na let say feel make way babi ca day life



# Classification

## Data preparing

In [55]:
with open(r'./lyrics_emotion[0-7480].json', 'r') as fn:
    emo_dict = json.load(fn)

In [42]:
data_cls = pd.read_csv('./df_4395.csv')
data_cls.head()

Unnamed: 0,index,song,year,artist,genre,lyrics,word_count,represent,represent_2nd,word_count_2nd
0,0,louder-flux-pavilion-doctor-p-remix,2012,dj-fresh,Electronic,I can't control this feeling Something's happe...,341.0,I ca n't control feel someth 's happen insid o...,I ca n't control feel someth 's happen insid s...,339
1,1,that-s-my-name,2009,akcent,Pop,In my heart I will keep you In my heart Foreve...,318.0,In heart I keep In heart forev In heart and on...,In heart I keep In heart forev In heart one li...,169
2,2,lemonade,2007,apologetix,Rock,"Oh, Mama, I've been cleared of my crimes and I...",250.0,Oh mama I 've clear crime I 'm law law put end...,Oh mama I 've clear crime I 'm law law put end...,119
3,3,return-of-the-hustle,2007,fabolous,Hip-Hop,Record mode! (Record mode!) Get your money in ...,681.0,record mode record mode get money air like yea...,record record get money air like yeah yeah tim...,338
4,4,so-sad,1974,george-harrison,Rock,Now the winter has come To eclipse out the sun...,163.0,now winter come To eclips sun that light love ...,winter come To sun light love sometim cold win...,72


In [43]:
data_cls['clustered_label'] = pd.Series(results[4]['labels'])
data_cls.head()

Unnamed: 0,index,song,year,artist,genre,lyrics,word_count,represent,represent_2nd,word_count_2nd,clustered_label
0,0,louder-flux-pavilion-doctor-p-remix,2012,dj-fresh,Electronic,I can't control this feeling Something's happe...,341.0,I ca n't control feel someth 's happen insid o...,I ca n't control feel someth 's happen insid s...,339,3
1,1,that-s-my-name,2009,akcent,Pop,In my heart I will keep you In my heart Foreve...,318.0,In heart I keep In heart forev In heart and on...,In heart I keep In heart forev In heart one li...,169,2
2,2,lemonade,2007,apologetix,Rock,"Oh, Mama, I've been cleared of my crimes and I...",250.0,Oh mama I 've clear crime I 'm law law put end...,Oh mama I 've clear crime I 'm law law put end...,119,2
3,3,return-of-the-hustle,2007,fabolous,Hip-Hop,Record mode! (Record mode!) Get your money in ...,681.0,record mode record mode get money air like yea...,record record get money air like yeah yeah tim...,338,0
4,4,so-sad,1974,george-harrison,Rock,Now the winter has come To eclipse out the sun...,163.0,now winter come To eclips sun that light love ...,winter come To sun light love sometim cold win...,72,1


In [44]:
emotions = ['Fear',
            'Sad',
            'Bored',
            'Happy',
            'Excited',
            'Angry']

In [45]:
for e in tqdm(emotions):
    data_cls[e] = np.nan

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:00<00:00, 1180.50it/s]


In [66]:
for i in tqdm(range(len(data_cls))):
    lyrics_emotion = emo_dict[str(i)]['emotion']
    values = [lyrics_emotion[e] for e in emotions]
    data_cls.at[i, emotions] = values
#     break

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4395/4395 [00:07<00:00, 621.32it/s]


## Modeling

### Measures

In [128]:
def measures(ypred, ytest):
    return f1_score(ypred, ytest, average='micro'), cohen_kappa_score(ypred, ytest)

kappa_scorer = make_scorer(cohen_kappa_score)

scoring = kappa_scorer

### SVM

In [139]:
def svm_model(xtrain, xtest, ytrain, ytest, batch):
    param_grid = {'C': [1e3, 5e3, 1e4, 5e4, 1e5],
                  'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1], 
                 'kernel': ['linear', 'poly', 'rbf', 'sigmoid', 'precomputed']}
    clf = GridSearchCV(
        SVC(), param_grid, scoring=scoring
    )
    searcher = clf.fit(xtrain, ytrain)
    estimator = searcher.best_estimator_
    f1, kappa = measures(estimator.predict(xtest), ytest)
    print('[SVM] training: ', 'f1:', f1, 'kappa:', kappa)
    with open(batch + '_svm_clf.pickle', 'wb') as f:
        pickle.dump(clf, f)
    with open(batch + '_svm_searcher.pickle',
              'wb') as sf:
        pickle.dump(searcher, sf)
    with open(batch + '_svm_estimator.pickle',
      'wb') as sfm:
        pickle.dump(estimator, sfm)
    return searcher, estimator, clf, f1, kappa

### Logistic Regression

In [140]:
def lg_model(xtrain, xtest, ytrain, ytest, batch):
    param_grid = {'penalty': ['l1', 'l2', 'elasticnet'],
                  'max_iter': range(10, 50, 10)}
    clf = GridSearchCV(
        LogisticRegression(multi_class='auto', n_jobs=-1), param_grid, scoring=scoring
    )
    searcher = clf.fit(xtrain, ytrain)
    estimator = searcher.best_estimator_
    f1, kappa = measures(estimator.predict(xtest), ytest)
    print('[LG] training: ', 'f1:', f1, 'kappa:', kappa)
    with open(batch + '_lg_clf.pickle', 'wb') as f:
        pickle.dump(clf, f)
    with open(batch + '_lg_searcher.pickle',
              'wb') as sf:
        pickle.dump(searcher, sf)
    with open(batch + '_lg_estimator.pickle',
      'wb') as sfm:
        pickle.dump(estimator, sfm)
    return searcher, estimator, clf, f1, kappa

### Decision Tree

In [141]:
def dt_model(xtrain, xtest, ytrain, ytest, batch):
    param_grid = {'min_samples_split': range(2, 403, 20)}
    clf = GridSearchCV(
        tree.DecisionTreeClassifier(), param_grid, scoring=scoring
    )
    searcher = clf.fit(xtrain, ytrain)
    estimator = searcher.best_estimator_
    f1, kappa = measures(estimator.predict(xtest), ytest)
    print('[dt] training: ', 'f1:', f1, 'kappa:', kappa)
    with open(batch + '_dt_clf.pickle', 'wb') as f:
        pickle.dump(clf, f)
    with open(batch + '_dt_searcher.pickle',
              'wb') as sf:
        pickle.dump(searcher, sf)
    with open(batch + '_dt_estimator.pickle',
      'wb') as sfm:
        pickle.dump(estimator, sfm)
    return searcher, estimator, clf, f1, kappa

### Naive Bayes

In [142]:
def nb_model(xtrain, xtest, ytrain, ytest, batch):
    param_grid = {}
    clf = GridSearchCV(GaussianNB(), param_grid)
    searcher = clf.fit(xtrain, ytrain)
    estimator = searcher.best_estimator_
    f1, kappa = measures(estimator.predict(xtest), ytest)
    print('[NB] training: ', 'f1:', f1, 'kappa:', kappa)
    with open(batch + '_nb_clf.pickle', 'wb') as f:
        pickle.dump(clf, f)
    with open(batch + '_nb_searcher.pickle',
              'wb') as sf:
        pickle.dump(searcher, sf)
    with open(batch + '_nb_estimator.pickle',
      'wb') as sfm:
        pickle.dump(estimator, sfm)
    return searcher, estimator, clf, f1, kappa

## Data spliting

In [72]:
data_cls['genre'].value_counts()

Rock             1840
Pop               657
Hip-Hop           472
Metal             368
Not Available     325
Country           283
Electronic        135
Jazz              110
R&B                67
Other              60
Indie              54
Folk               24
Name: genre, dtype: int64

In [78]:
target = {}
count = 1
for i in set(data_cls['genre'].tolist()):
    target[i] = count
    count += 1

In [79]:
target

{'Pop': 1,
 'Electronic': 2,
 'Rock': 3,
 'Metal': 4,
 'Country': 5,
 'Jazz': 6,
 'R&B': 7,
 'Folk': 8,
 'Hip-Hop': 9,
 'Other': 10,
 'Not Available': 11,
 'Indie': 12}

In [80]:
data_cls['genre_'] = data_cls['genre']

data_cls['genre'] = data_cls['genre'].map(target)

In [82]:
data_cls.columns

Index(['index', 'song', 'year', 'artist', 'genre', 'lyrics', 'word_count',
       'represent', 'represent_2nd', 'word_count_2nd', 'clustered_label',
       'Fear', 'Sad', 'Bored', 'Happy', 'Excited', 'Angry', 'genre_'],
      dtype='object')

# Training

## Emotion & Topic

In [143]:
X = data_cls[['clustered_label','Fear', 'Sad', 'Bored', 'Happy', 'Excited', 'Angry']].values
Y = data_cls['genre'].astype(int).values

xtrain, xtest, ytrain, ytest = train_test_split(X, Y, test_size=0.2, random_state=31)
xtrain.shape, xtest.shape, ytrain.shape, ytest.shape

((3516, 7), (879, 7), (3516,), (879,))

In [145]:
# svm_searcher, svm_estimator, svm_clf, svm_f1, svm_kappa = svm_model(xtrain, xtest, ytrain, ytest, batch='Combined')
lg_searcher, lg_estimator, lg_clf, lg_f1, lg_kappa = lg_model(xtrain, xtest, ytrain, ytest, batch='Combined')
dt_searcher, dt_estimator, dt_clf, dt_f1, dt_kappa = dt_model(xtrain, xtest, ytrain, ytest, batch='Combined')
nb_searcher, nb_estimator, nb_clf, nb_f1, nb_kappa = nb_model(xtrain, xtest, ytrain, ytest, batch='Combined')

ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got elasticnet penalty.

ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got elasticnet penalty.

ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got elasticnet penalty.

ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got elasticnet penalty.



[LG] training:  f1: 0.4641638225255973 kappa: 0.15662068923370254
[dt] training:  f1: 0.40386803185437997 kappa: 0.12547111238534292
[NB] training:  f1: 0.43913538111490324 kappa: 0.1412426355621349


### Lyrics

In [146]:
X_ = data
Y_ = data_cls['genre'].astype(int).values

xtrain_, xtest_, ytrain_, ytest_ = train_test_split(X_, Y_, test_size=0.2, random_state=31)
xtrain_.shape, xtest_.shape, ytrain_.shape, ytest_.shape

((3516, 788), (879, 788), (3516,), (879,))

In [147]:
# svm_searcher, svm_estimator, svm_clf, svm_f1, svm_kappa = svm_model(xtrain_, xtest_, ytrain_, ytest_, batch='lyrics')
lg_searcher, lg_estimator, lg_clf, lg_f1, lg_kappa = lg_model(xtrain_, xtest_, ytrain_, ytest_, batch='lyrics')
dt_searcher, dt_estimator, dt_clf, dt_f1, dt_kappa = dt_model(xtrain_, xtest_, ytrain_, ytest_, batch='lyrics')
nb_searcher, nb_estimator, nb_clf, nb_f1, nb_kappa = nb_model(xtrain_, xtest_, ytrain_, ytest_, batch='lyrics')

ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got elasticnet penalty.

ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got elasticnet penalty.

ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got elasticnet penalty.

ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got elasticnet penalty.



[LG] training:  f1: 0.5164960182025028 kappa: 0.2754150252826477
[dt] training:  f1: 0.42548350398179746 kappa: 0.19720075090336286
[NB] training:  f1: 0.24118316268486917 kappa: 0.13869231865385812
