# Preprocessing

In [None]:
from gensim import corpora
import pandas as pd
from nltk.tokenize.casual import casual_tokenize
from nltk.corpus import stopwords

def tokenize(text, stopset):
    tokens = casual_tokenize(text)
    return [w for w in tokens if not w in stopset]

df = pd.read_csv("lyrics.csv")
stopset = set(stopwords.words('english')).union(set(['chorus','verse','[',']','(',')']))
df = df[['index','song','artist','genre','lyrics']]
df = df[df.apply(lambda x: not (pd.isnull(x['lyrics']) or pd.isnull(x['song']) or x['genre'] == 'Not Available' or x['genre'] == 'Other'), 
        axis=1, reduce=True)]
df['lyrics'] = df.apply(lambda x: tokenize(x['lyrics'], stopset), axis=1, reduce=True)
df = df[df.apply(lambda x: len(x['lyrics']) > 20, axis=1, reduce=True)]
df = df.groupby('genre').apply(lambda x: x.sample(n=2000))

df.to_pickle('tickle_my.pkl')



# Word/Document Embedding

In [1]:
from gensim import corpora
from gensim.models import KeyedVectors

model = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)

In [2]:
model.init_sims(replace=False)

In [4]:
#create seeds
happy = ['happy']
love = ['love','lust']
sad = ['sad']
hate = ['anger','hate']

#Generate related words from seeds
happy_words50 = [a[0] for a in model.most_similar(positive=sad,negative=happy, topn=50)]
sad_words50 = [a[0] for a in model.most_similar(positive=happy,negative=sad, topn=50)]
love_words50 = [a[0] for a in model.most_similar(positive=love,negative=hate, topn=50)]
hate_words50 = [a[0] for a in model.most_similar(positive=hate,negative=love, topn=50)]


In [5]:
import string
import numpy as np
from numpy import linalg

def normalize(vector):
    norm = linalg.norm(vector)
    nrmlz = np.vectorize(lambda v: v/norm)
    return nrmlz(vector)
    
def doc2vec(tokenized_doc, w2vmodel):
    count = 0    
    res = np.zeros(300)
    for word in tokenized_doc:
        if word not in string.punctuation:
            if word in w2vmodel:
                count += 1
                res += w2vmodel[word]
    if count != 0:
        res /= count
    return res

happy_words50_vec = normalize(doc2vec(happy_words50,model))
sad_words50_vec = normalize(doc2vec(sad_words50, model))
love_words50_vec = normalize(doc2vec(love_words50, model))
hate_words50_vec = normalize(doc2vec(hate_words50, model))


In [67]:
import pandas as pd

df = pd.read_pickle('tickle_my.pkl')['lyrics']
df = df.apply(lambda x: normalize(doc2vec(x,model)))
df = df.apply(lambda x: np.nan_to_num(x))
df.to_pickle('purple.pkl')

In [7]:
import pandas as pd

from numpy import dot
from numpy import linalg

def cosine_simularity(a,b):
    return dot(a, b)/(linalg.norm(a)*linalg.norm(b))

df = pd.read_pickle('purple.pkl')

happy = df.apply(lambda x: cosine_simularity(x,happy_words50_vec))
happy.to_pickle('happy.pkl')

sad = df.apply(lambda x: cosine_simularity(x,sad_words50_vec))
sad.to_pickle('sad.pkl')

love = df.apply(lambda x: cosine_simularity(x,love_words50_vec))
love.to_pickle('love.pkl')

hate = df.apply(lambda x: cosine_simularity(x,hate_words50_vec))
hate.to_pickle('hate.pkl')

# Survey Test Results

In [30]:
from nltk.tokenize.casual import casual_tokenize
from nltk.corpus import stopwords

def tokenize(text, stopset):
    tokens = casual_tokenize(text)
    return [w for w in tokens if not w in stopset]

stopset = set(stopwords.words('english')).union(set(['chorus','verse','[',']','(',')']))
survey_lyrics = pd.read_csv('Lyricstest.csv')
survey_lyrics['Lyrics'] = survey_lyrics.apply(lambda x: tokenize(x['Lyrics'], stopset), axis=1, reduce=True)
survey_lyrics['Lyrics'] = survey_lyrics['Lyrics'].apply(lambda x: normalize(doc2vec(x,model)))

happytest = survey_lyrics['Lyrics'].apply(lambda x: cosine_simularity(x,happy_words50_vec))
happytest.to_pickle('happytest.pkl')

sadtest = survey_lyrics['Lyrics'].apply(lambda x: cosine_simularity(x,sad_words50_vec))
sadtest.to_pickle('sadtest.pkl')

lovetest = survey_lyrics['Lyrics'].apply(lambda x: cosine_simularity(x,love_words50_vec))
lovetest.to_pickle('lovetest.pkl')

hatetest = survey_lyrics['Lyrics'].apply(lambda x: cosine_simularity(x,hate_words50_vec))
hatetest.to_pickle('hatetest.pkl')

# Results Visualizations

In [39]:
pd.read_pickle('hatetest.pkl')

0     0.226313
1     0.240795
2     0.247406
3     0.350521
4     0.284313
5     0.200364
6     0.253698
7     0.235807
8     0.222950
9     0.241046
10    0.280124
11    0.224897
12    0.267555
13    0.244790
14    0.305451
Name: Lyrics, dtype: float64

In [38]:
print(survey_lyrics[['Artist','Song Title']])

                             Artist                       Song Title
0                    Rolling Stones                     Satisfaction
1                    Kelly Clarkson                   Because of You
2                        Paula Cole  Where Have All the Cowboys Gone
3           The Black Dahlia Murder                       Widowmaker
4   Frank Carter & the Rattlesnakes                     Wild Flowers
5                           Beatles             All You Need Is Love
6           William Elliot Whitmore            There is Hope For You
7                       David Bowie       The Man Who Sold the World
8                        MC Lucious       Boom! I Got Your Boyfriend
9                       Roy Orbison                   Running Scared
10                      Patsy Cline                            Crazy
11                         Tiny Tim           Living in the Sunlight
12                        Metallica                    Enter Sandman
13                      Ace of Bas