### imports

In [23]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import spacy
import pytextrank
import scipy as sp

### load datasets

In [81]:
# Load the three lyric exports and stack them into one frame.
# `DataFrame.append` was removed in pandas 2.0, so `pd.concat` is used instead.
# `ignore_index=True` relabels the rows 0..N-1: each `read_csv` result is
# labelled from 0, and keeping those overlapping labels makes any label-based
# operation (drop, .at, .loc) hit rows from several files at once.
songsdata_1 = pd.read_csv('lyrics1.csv')
songsdata_2 = pd.read_csv('lyrics2.csv')
songsdata_4 = pd.read_csv('lyrics4.csv')
songsdata_df = pd.concat([songsdata_1, songsdata_2, songsdata_4], ignore_index=True)

# Discard rows whose lyrics value is a float (pandas reads an empty cell as
# NaN, which is a float). A boolean mask removes exactly those rows in one
# pass, instead of dropping by label inside an `iterrows` loop.
has_float_lyrics = songsdata_df['lyrics'].map(lambda value: isinstance(value, float))
songsdata_df = songsdata_df[~has_float_lyrics]

# Work on a random subset of 2500 songs; the fixed seed makes the subset (and
# everything derived from it) reproducible across kernel restarts.
songsdata_df = songsdata_df.sample(n=2500, random_state=42)

display(songsdata_df)

Unnamed: 0.1,Unnamed: 0,track,artist,genre,valence_tags,arousal_tags,lyrics
568,1246,All At Once,Jack Johnson,acoustic,6.345750,3.287750,All at once The world can overwhelm me There'...
1113,2520,Goin Home,Dan Auerbach,rock,5.321748,3.358601,I've spent too long away from home Did all th...
930,2099,She Don't Use Jelly,The Flaming Lips,indie,6.944340,4.753962,I know a girl who thinks of ghosts She'll mak...
206,453,Life's on the Line,50 Cent,rap,5.006667,4.680000,"Nobody likes me Nobody likes me, but that's o..."
247,527,Bright Lights,Pete and the Pirates,indie rock,5.880000,4.050000,"Come on now baby, come with me If you want me,..."
...,...,...,...,...,...,...,...
1262,2745,Dorothy At Forty,Cursive,indie rock,6.044634,5.346829,"Dorothy, I know you've had amazing dreams We c..."
878,1984,Le lapin blanc,A7IE,dark electro,3.950000,5.405000,"Nowhere to go , nowhere to hide I live in fear..."
2985,6574,Can I Kick It?,Sage Francis,hip-hop,6.550000,3.520000,Can I kick it? (yes you can) {*3X*} Well I'm ...
741,1578,Year of the Dog,The Lovely Sparrows,indie,6.650000,5.770000,She whispered and I came running like a comeba...


In [83]:
def transform_data(dataset, column_ix, feature_num=500):
    """Add one indicator column per TextRank key phrase and pick the top phrases.

    Every text in column position ``column_ix`` is run through spaCy's
    ``en_core_web_sm`` pipeline with the ``textrank`` component added. For each
    phrase reported in ``doc._.phrases`` the phrase's rank is accumulated
    across all rows, and the row gets a ``1`` in a column named after the
    phrase text. Cells for phrases that do not occur in a row are NaN.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame holding the texts. It is not modified; a new frame is returned.
    column_ix : int
        Positional index of the text column.
    feature_num : int, default 500
        How many of the highest-ranked phrases to return.

    Returns
    -------
    (pandas.DataFrame, list of str)
        ``dataset`` with the indicator columns appended (in order of first
        appearance, same row order and index as the input), and the
        ``feature_num`` phrases with the largest accumulated rank, highest
        first.

    Notes
    -----
    * Phrases whose text equals an existing column name are skipped, so a
      phrase such as "track" can never overwrite the original column.
    * Rows are addressed by position throughout, so the result is correct
      even when ``dataset.index`` contains repeated labels.
    * Progress (every 100 rows) and the number of distinct phrases are
      printed.
    """
    nlp = spacy.load('en_core_web_sm')
    nlp.add_pipe("textrank")

    reserved_names = set(dataset.columns)
    features = {}            # phrase text -> accumulated rank (insertion order = first seen)
    phrases_per_row = []     # position i -> set of phrase texts found in row i

    for i in range(len(dataset)):
        if not i % 100:
            print('done: {}'.format(i))
        doc = nlp(dataset.iloc[i, column_ix])

        present = set()
        for phrase in doc._.phrases:
            feat = phrase.text
            if feat in reserved_names:
                # Would collide with (and clobber) an existing column.
                continue
            features[feat] = features.get(feat, 0) + phrase.rank
            present.add(feat)
        phrases_per_row.append(present)

    print(len(features))

    # Build the whole indicator matrix once instead of growing the frame one
    # cell at a time, which is very slow and fragments the frame.
    column_of = {feat: j for j, feat in enumerate(features)}
    indicators = np.full((len(dataset), len(features)), np.nan)
    for i, present in enumerate(phrases_per_row):
        if present:
            indicators[i, [column_of[feat] for feat in present]] = 1

    indicator_df = pd.DataFrame(indicators, columns=list(features))
    # Concatenate on a positional index so repeated labels cannot trigger
    # alignment problems, then restore the caller's index.
    result = pd.concat([dataset.reset_index(drop=True), indicator_df], axis=1)
    result.index = dataset.index

    # Stable sort: phrases with equal rank stay in first-seen order.
    features_sorted = sorted(features, key=features.get, reverse=True)[:feature_num]
    return result, features_sorted


# Look the lyrics column up by name rather than hard-coding position 6, so the
# call keeps pointing at the right column if the CSV column order changes.
lyrics_column_ix = songsdata_df.columns.get_loc('lyrics')
songsdata_df, features = transform_data(songsdata_df, lyrics_column_ix, feature_num=400)
# Indicator cells are only set for rows where the phrase occurs; every other
# cell is NaN, which is turned into 0 here.
songsdata_df = songsdata_df.fillna(0)

done: 0
done: 100
done: 200
done: 300
done: 400
done: 500
done: 600
done: 700
done: 800
done: 900
done: 1000
done: 1100
done: 1200
done: 1300
done: 1400
done: 1500
done: 1600
done: 1700
done: 1800
done: 1900
done: 2000
done: 2100
done: 2200
done: 2300
done: 2400
39885


In [127]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the songs for evaluation. Inputs are the selected phrase
# indicator columns; targets are the valence and arousal scores.
# A fixed `random_state` keeps the split -- and therefore the reported test
# loss -- the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    songsdata_df[features],
    songsdata_df[['valence_tags', 'arousal_tags']],
    test_size=0.30,
    random_state=42,
)

In [93]:
import tensorflow as tf
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential

In [191]:
# Fully connected regressor with two linear outputs, one per target column
# (valence, arousal). Hidden layers: 350 sigmoid -> 80 linear -> 20 sigmoid.
model = Sequential([
    Dense(350, activation='sigmoid'),
    Dense(80),
    Dense(20, activation='sigmoid'),
    Dense(2),
])
model.compile(loss='mse', optimizer='adam')

# Train for 50 epochs in mini-batches of 25, then report MSE on the held-out set.
model.fit(X_train, y_train, batch_size=25, epochs=50)
model.evaluate(X_test, y_test)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


1.8961278200149536

In [170]:
# Predictions for the held-out rows: one row per song, two columns matching the
# order of the training targets (valence_tags, arousal_tags).
model.predict(X_test)

array([[5.987743 , 4.3749137],
       [5.54212  , 4.2980914],
       [6.177072 , 4.4395785],
       ...,
       [5.293986 , 4.239616 ],
       [5.551117 , 4.2943397],
       [4.35848  , 3.994205 ]], dtype=float32)

In [171]:
# Ground-truth valence/arousal values for the same held-out rows, displayed for
# a side-by-side look at the model's predictions.
y_test

Unnamed: 0,valence_tags,arousal_tags
296,7.974299,6.073254
5,5.000000,5.280000
1060,6.858814,4.247627
409,5.187500,4.795000
705,5.386042,4.469583
...,...,...
1829,4.308663,3.143465
2395,4.067500,4.980000
1785,6.287748,4.052162
1260,5.886429,3.213571


In [192]:
# Persist the trained network to disk. NOTE(review): the .h5 suffix presumably
# selects Keras' legacy HDF5 format -- confirm against the installed TF version.
model.save('lyrics_regression.h5')