In [90]:
import pandas as pd
import numpy as np

import re

import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize

from sklearn.model_selection import train_test_split

from gensim.models import Word2Vec
stopwords = stopwords.words('english')


from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression as log 
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix
from sklearn import metrics    

In [2]:
df = pd.read_csv('labeled_lyrics_cleaned.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,artist,seq,song,label
0,0,Elijah Blake,"No, no\r\nI ain't ever trapped out the bando\r...",Everyday,0.626
1,1,Elijah Blake,"The drinks go down and smoke goes up, I feel m...",Live Till We Die,0.63
2,2,Elijah Blake,She don't live on planet Earth no more\r\nShe ...,The Otherside,0.24
3,3,Elijah Blake,"Trippin' off that Grigio, mobbin', lights low\...",Pinot,0.536
4,4,Elijah Blake,"I see a midnight panther, so gallant and so br...",Shadows & Diamonds,0.371


# Only 15k lyrics

In [3]:
df = df.head(15000)
df = df.drop(columns = 'Unnamed: 0',axis = 1)
df.count()

artist    15000
seq       15000
song      15000
label     15000
dtype: int64

# converting label to categorical data

In [4]:
def getSentiment(num):
    if num > .66:
        return 'positive'
    elif num > .33:
        return 'neutral'
    else :
        return 'negative'

In [5]:
df['sentiment'] = df.label.apply(lambda x: getSentiment(x))

In [6]:
df.head()

Unnamed: 0,artist,seq,song,label,sentiment
0,Elijah Blake,"No, no\r\nI ain't ever trapped out the bando\r...",Everyday,0.626,neutral
1,Elijah Blake,"The drinks go down and smoke goes up, I feel m...",Live Till We Die,0.63,neutral
2,Elijah Blake,She don't live on planet Earth no more\r\nShe ...,The Otherside,0.24,negative
3,Elijah Blake,"Trippin' off that Grigio, mobbin', lights low\...",Pinot,0.536,neutral
4,Elijah Blake,"I see a midnight panther, so gallant and so br...",Shadows & Diamonds,0.371,neutral


In [99]:
df.sentiment.value_counts()

neutral     6181
negative    4728
positive    4091
Name: sentiment, dtype: int64

# Preprocessing

### lowercase

In [7]:
def make_lower(lyric):
    return lyric.lower()

### removing punctuation

In [14]:


def remove_punctuation(lyric):
    p = re.compile("[^\w\s]")
    lyric = re.sub(p,'',lyric)
    return lyric



### removing stopwords

In [15]:
def remove_stopwords(lyric):
    # Break the sentence down into a list of words
    words = word_tokenize(lyric)
    
    # Make a list to append valid words into
    valid_words = []
    
    # Loop through all the words
    for word in words:
        
        # Check if word is not in stopwords
        if word not in stopwords:
            
            # If word not in stopwords, append to our valid_words
            valid_words.append(word)

    # Join the list of words together into a string
    lyric = ' '.join(valid_words)

    return lyric

### text pipeline

In [16]:
def text_pipeline(input_string):
    input_string = make_lower(input_string)
    input_string = remove_punctuation(input_string)
    input_string = remove_stopwords(input_string)    
    return input_string

In [17]:
test = 'ever trapped out the bando\r'
test = text_pipeline(test)
test

'ever trapped bando'

## cleaning lyrics

In [18]:
df['clean_lyrics'] = df.seq.apply(text_pipeline)

In [19]:
df.head()

Unnamed: 0,artist,seq,song,label,sentiment,clean_lyrics
0,Elijah Blake,"No, no\r\nI ain't ever trapped out the bando\r...",Everyday,0.626,neutral,aint ever trapped bando oh lord dont get wrong...
1,Elijah Blake,"The drinks go down and smoke goes up, I feel m...",Live Till We Die,0.63,neutral,drinks go smoke goes feel got let go cares get...
2,Elijah Blake,She don't live on planet Earth no more\r\nShe ...,The Otherside,0.24,negative,dont live planet earth found love venus thats ...
3,Elijah Blake,"Trippin' off that Grigio, mobbin', lights low\...",Pinot,0.536,neutral,trippin grigio mobbin lights low trippin grigi...
4,Elijah Blake,"I see a midnight panther, so gallant and so br...",Shadows & Diamonds,0.371,neutral,see midnight panther gallant brave found found...


# feed tokenize lyrics to word2vec

In [95]:
X = df.clean_lyrics.apply(lambda x: word_tokenize(x))
X.head()

0    [aint, ever, trapped, bando, oh, lord, dont, g...
1    [drinks, go, smoke, goes, feel, got, let, go, ...
2    [dont, live, planet, earth, found, love, venus...
3    [trippin, grigio, mobbin, lights, low, trippin...
4    [see, midnight, panther, gallant, brave, found...
Name: clean_lyrics, dtype: object

In [96]:
model = Word2Vec(X, vector_size = 200, sg = 1)


In [97]:
model.train(X,total_examples= 15000, epochs=100)

(152098833, 175345500)

In [98]:
model.save("word2vec.model")

# Use this line for pre-trained model


In [None]:
model = Word2vec.load("word2vec.model")

In [75]:
model.wv.most_similar(positive ='gun')



[('rattatat', 0.462798535823822),
 ('trigger', 0.4259166717529297),
 ('guns', 0.4148804843425751),
 ('sheriff', 0.3890664875507355),
 ('loaded', 0.3836432695388794),
 ('radar', 0.37760934233665466),
 ('shoot', 0.37534812092781067),
 ('deputy', 0.37455499172210693),
 ('61', 0.37330228090286255),
 ('dealer', 0.3701655864715576)]

In [76]:
model.wv.most_similar(positive ='see')

[('look', 0.489340603351593),
 ('kitchenette', 0.46089768409729004),
 ('elimination', 0.43351733684539795),
 ('know', 0.42610299587249756),
 ('industrial', 0.42162269353866577),
 ('mckennedy', 0.412797212600708),
 ('eyes', 0.4127568304538727),
 ('rummy', 0.4077410101890564),
 ('ception', 0.4076326787471771),
 ('icu', 0.4048142731189728)]

# Using word2vec to create a lyric vector by taking the average of words present in lyric

Reference to method used https://www.kaggle.com/code/nitin194/twitter-sentiment-analysis-word2vec-doc2vec/notebook

In [77]:
w2v_words = list(model.wv.key_to_index)
def lyric_vector(tokens, size):
    sent = np.zeros(200)
    count = 0
    for word in tokens:
        if word in w2v_words:
            vec = model.wv[word]
            sent += vec
            count += 1
    if count != 0:
        sent /= count
    return sent

In [78]:
vector = []
for sent in X:
    sentence = lyric_vector(sent, 200)
    vector.append(sentence)

In [79]:
lyric_vector = pd.DataFrame(vector)
lyric_vector['sentiment'] = pd.DataFrame(df['sentiment'])
lyric_vector.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,191,192,193,194,195,196,197,198,199,sentiment
0,0.088659,0.112138,0.161445,0.001931,0.316668,0.179604,-0.214201,0.16726,0.085163,0.194139,...,0.008272,0.060889,-0.08872,0.122241,0.190579,0.021987,-0.051598,-0.005356,0.134056,neutral
1,0.134244,0.058785,0.230777,0.053445,0.146164,0.061137,-0.075396,0.110987,-0.053461,0.155229,...,-0.005166,0.066333,-0.141629,0.035456,0.329494,-0.048746,-0.096891,-0.019531,0.076015,neutral
2,0.267114,-0.152474,0.165373,0.157845,0.093746,0.082521,-0.252882,0.093548,-0.156137,0.172663,...,-0.062171,-0.090976,-0.085709,0.106496,0.096173,0.193045,-0.021638,-0.194595,-0.037425,negative
3,0.206498,0.074521,0.174805,0.071128,0.256123,0.156303,-0.143186,0.114222,-0.055132,0.036289,...,-0.039531,0.068759,-0.129851,-0.141814,0.221379,0.072579,-0.101745,-0.137928,0.219422,neutral
4,0.064239,-0.07962,0.062374,0.000292,0.297556,0.206739,-0.216018,0.049906,0.066528,0.191623,...,-0.009193,-0.097276,-0.107703,0.152456,0.249087,0.017335,-0.083465,-0.069013,-0.087316,neutral


# Preparing training and test set

In [80]:
X = lyric_vector.drop('sentiment',axis = 1)
y = lyric_vector['sentiment']

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = .2,random_state =42)

# Classification!

## RandomForestClassifier

In [81]:
from sklearn.ensemble import RandomForestClassifier


rf_model = RandomForestClassifier()


# Fit our model with our training data.
rf_model.fit(X_train, y_train)


# Make new predictions of our testing data. 
y_pred = rf_model.predict(X_test)


# Make predicted probabilites of our testing data
y_pred_proba = rf_model.predict_proba(X_test)

# Evaluate our model
accuracy =  rf_model.score(X_test, y_test)

# Print our evaluation metrics
print("Model Accuracy: %f" % accuracy)


Model Accuracy: 0.467667


In [88]:
params = {
    'n_estimators' : [5, 100, 1000],
    'criterion' : ['gini', 'entropy'],
    
}

grid_search_cv = GridSearchCV( 
    estimator=RandomForestClassifier(), 
    param_grid=params,
    scoring='f1_macro', )


# Now, with one easy command, fit all combination of trees. 
grid_search_cv.fit(X_train, y_train)


# Print the best parameters it found
print(grid_search_cv.best_params_)


# This command gives you model that has the highest f1-score. 
model = grid_search_cv.best_estimator_

{'criterion': 'gini', 'n_estimators': 1000}


In [92]:


# Now lets evaluate our model
y_pred = model.predict(X_test)

accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy Score: %f" % accuracy)

precision = precision_score(y_true=y_test, y_pred=y_pred,average = 'macro')
print("Precision Score: %f" % precision)

recall = recall_score(y_true=y_test, y_pred=y_pred,average = 'macro')
print("Recall Score: %f" % recall)

f1 = f1_score(y_true=y_test, y_pred=y_pred,average = 'macro')
print('F1 Score: %f' % f1)

Accuracy Score: 0.488333
Precision Score: 0.520504
Recall Score: 0.454463


NameError: name 'f1_score' is not defined

In [94]:
training_score = model.score(X_train, y_train)
testing_score = model.score(X_test, y_test)

print("training score:", training_score)
print("testing score:", testing_score)

training score: 0.9726666666666667
testing score: 0.48833333333333334


In [83]:


logit = log(solver='lbfgs',max_iter=5000).fit(X_train , y_train)
training_score = logit.score(X_train, y_train)
testing_score = logit.score(X_test, y_test)
y_pred = logit.predict(X_test)

print("training score:", training_score)
print("testing score:", testing_score)
print("precision score:" ,precision_score(y_test,y_pred, average = 'macro'))
print("recall score:", recall_score(y_test,y_pred, average = 'macro'))
print("f1Score:",f1_score(y_test,y_pred,average='macro'))
print('Classification Report: \n',metrics.classification_report(y_test, y_pred))



training score: 0.5008333333333334
testing score: 0.4663333333333333


NameError: name 'precision_score' is not defined