# Preprocessing

In [None]:
from gensim import corpora
import pandas as pd
from nltk.tokenize.casual import casual_tokenize
from nltk.corpus import stopwords

def tokenize(text, stopset):
    """Tokenize *text* and drop every token that appears in *stopset*.

    Tokens are returned with their original casing, but the stopword check
    is case-insensitive: ``casual_tokenize`` keeps the input's case, while
    the stopword collection passed in by this file is lower-case, so
    without folding, capitalised tokens such as "The" or "Chorus" would
    never be removed.

    Args:
        text: Raw string to tokenize.
        stopset: Collection of tokens to discard (membership is tested
            with ``in``; a ``set`` is recommended).

    Returns:
        List of the surviving tokens, in their original order.
    """
    return [
        w for w in casual_tokenize(text)
        # Exact match keeps the old behaviour; the lower-cased match
        # additionally catches capitalised stopwords.
        if w not in stopset and w.lower() not in stopset
    ]

# Load the raw lyrics table from disk.
df = pd.read_csv("lyrics.csv")

# English stopwords plus a few lyric-sheet markers and bracket tokens.
stopset = set(stopwords.words('english')) | {'chorus', 'verse', '[', ']', '(', ')'}

df = df[['index', 'song', 'artist', 'genre', 'lyrics']]

# Keep rows that have both lyrics and a song title and whose genre is not one
# of the two placeholder labels.  Column-wise masks replace the former
# row-wise ``apply(..., reduce=True)``: the ``reduce`` keyword no longer
# exists in current pandas and would be forwarded to the lambda, raising
# TypeError.  ``.copy()`` makes the result an independent frame so the column
# assignment below does not write into a slice of the original.
df = df[
    df['lyrics'].notnull()
    & df['song'].notnull()
    & ~df['genre'].isin(['Not Available', 'Other'])
].copy()

# Replace each lyrics string with its filtered token list.
df['lyrics'] = df['lyrics'].apply(lambda text: tokenize(text, stopset))

# Discard songs with 20 or fewer remaining tokens.
df = df[df['lyrics'].apply(len) > 20]

# Draw 2000 songs per genre.  NOTE: ``sample`` raises ValueError for any
# genre that has fewer than 2000 rows left at this point.
df = df.groupby('genre').apply(lambda x: x.sample(n=2000))

df.to_pickle('tickle_my.pkl')



# Word/Document Embedding

In [1]:
from gensim import corpora
from gensim.models import KeyedVectors

# Load the embedding vectors from a binary word2vec-format file.
# NOTE(review): the file name suggests 300-dimensional GoogleNews vectors;
# doc2vec below hard-codes a 300-wide accumulator, so the two must agree.
model = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)


In [2]:
# NOTE(review): presumably precomputes unit-normalised vectors for similarity
# queries while keeping the raw vectors (replace=False) -- confirm against the
# installed gensim version, where this call may be deprecated.
model.init_sims(replace=False)

In [3]:
# Seed words for each of the four emotion poles.
happy = ['happy']
love = ['love', 'lust']
sad = ['sad']
hate = ['anger', 'hate']

# Expand every seed list into the 50 entries ranked most similar to it while
# steering away from the opposite pole; only the first element of each
# returned entry (the word) is kept.
# The happy/sad queries previously had their positive/negative arguments
# swapped, so `happy_words50` held sad-leaning words and vice versa; they now
# follow the same pattern as the love/hate pair.
happy_words50 = [a[0] for a in model.most_similar(positive=happy, negative=sad, topn=50)]
sad_words50 = [a[0] for a in model.most_similar(positive=sad, negative=happy, topn=50)]
love_words50 = [a[0] for a in model.most_similar(positive=love, negative=hate, topn=50)]
hate_words50 = [a[0] for a in model.most_similar(positive=hate, negative=love, topn=50)]


In [4]:
import string
import numpy as np
from numpy import linalg

def normalize(vector):
    """Scale *vector* to unit Euclidean length.

    Args:
        vector: Array-like of numbers.

    Returns:
        A new float ``ndarray`` equal to ``vector / ||vector||``.  When the
        norm is zero (all-zero or empty input) a float copy of the input is
        returned unchanged, so no NaNs or divide warnings are produced.
    """
    vector = np.asarray(vector, dtype=float)
    norm = linalg.norm(vector)
    if norm == 0:
        # Nothing to scale; dividing would yield NaNs.
        return vector.copy()
    # Broadcasting division replaces the per-element np.vectorize lambda.
    return vector / norm
    
def doc2vec(tokenized_doc, model):
    """Average the embedding vectors of the tokens in *tokenized_doc*.

    Tokens that occur inside ``string.punctuation`` or that are missing from
    *model* are ignored.

    Args:
        tokenized_doc: Iterable of string tokens.
        model: Object supporting ``token in model`` and ``model[token]``,
            the latter yielding a vector that can be added to a length-300
            array.

    Returns:
        Length-300 array holding the mean of the matched vectors, or all
        zeros when no token matched.
    """
    total = np.zeros(300)
    hits = 0
    for token in tokenized_doc:
        # Skip punctuation first, then out-of-vocabulary tokens.
        if token in string.punctuation or token not in model:
            continue
        total += model[token]
        hits += 1
    if hits:
        total /= hits
    return total

# One unit-length centroid per emotion word list, built in the order
# happy, sad, love, hate.
happy_words50_vec, sad_words50_vec, love_words50_vec, hate_words50_vec = (
    normalize(doc2vec(word_list, model))
    for word_list in (happy_words50, sad_words50, love_words50, hate_words50)
)


In [67]:
import pandas as pd

# Take the tokenised lyrics column from the preprocessing output.
df = pd.read_pickle('tickle_my.pkl')['lyrics']
# Per song: average the token vectors, scale to unit length, then replace any
# NaNs (e.g. from a song with no matched tokens) with finite numbers.
df = df.apply(lambda tokens: np.nan_to_num(normalize(doc2vec(tokens, model))))
df.to_pickle('purple.pkl')

20000
