In [1]:
import pandas as pd
import numpy as np

import re

import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize

from sklearn.model_selection import train_test_split

from gensim.models import Word2Vec
stopwords = stopwords.words('english')


from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression as log 
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix
from sklearn import metrics    

In [4]:
df = pd.read_csv('...data/labeled_lyrics_cleaned.csv')
df.head()

FileNotFoundError: [Errno 2] File ...data/labeled_lyrics_cleaned.csv does not exist: '...data/labeled_lyrics_cleaned.csv'

# Only 15k lyrics

In [None]:
df = df.head(15000)
df = df.drop(columns = 'Unnamed: 0',axis = 1)
df.count()

# converting label to categorical data

In [None]:
def getSentiment(num):
    if num > .66:
        return 'positive'
    elif num > .33:
        return 'neutral'
    else :
        return 'negative'

In [None]:
df['sentiment'] = df.label.apply(lambda x: getSentiment(x))

In [None]:
df.head()

In [None]:
df.sentiment.value_counts()

# Preprocessing

### lowercase

In [None]:
def make_lower(lyric):
    return lyric.lower()

### removing punctuation

In [None]:


def remove_punctuation(lyric):
    p = re.compile("[^\w\s]")
    lyric = re.sub(p,'',lyric)
    return lyric



### removing stopwords

In [None]:
def remove_stopwords(lyric):
    # Break the sentence down into a list of words
    words = word_tokenize(lyric)
    
    # Make a list to append valid words into
    valid_words = []
    
    # Loop through all the words
    for word in words:
        
        # Check if word is not in stopwords
        if word not in stopwords:
            
            # If word not in stopwords, append to our valid_words
            valid_words.append(word)

    # Join the list of words together into a string
    lyric = ' '.join(valid_words)

    return lyric

### text pipeline

In [None]:
def text_pipeline(input_string):
    input_string = make_lower(input_string)
    input_string = remove_punctuation(input_string)
    input_string = remove_stopwords(input_string)    
    return input_string

In [None]:
test = 'ever trapped out the bando\r'
test = text_pipeline(test)
test

## cleaning lyrics

In [None]:
df['clean_lyrics'] = df.seq.apply(text_pipeline)

In [None]:
df.head()

# feed tokenize lyrics to word2vec

In [None]:
X = df.clean_lyrics.apply(lambda x: word_tokenize(x))
X.head()

In [None]:
model = Word2Vec(X, vector_size = 200, sg = 1)


In [None]:
model.train(X,total_examples= 15000, epochs=100)

In [None]:
model.save("word2vec.model")

# Use this line for pre-trained model


In [None]:
model = Word2vec.load("word2vec.model")

In [None]:
model.wv.most_similar(positive ='gun')



In [None]:
model.wv.most_similar(positive ='see')

# Using word2vec to create a lyric vector by taking the average of words present in lyric

Reference to method used https://www.kaggle.com/code/nitin194/twitter-sentiment-analysis-word2vec-doc2vec/notebook

In [None]:
w2v_words = list(model.wv.key_to_index)
def lyric_vector(tokens, size):
    sent = np.zeros(200)
    count = 0
    for word in tokens:
        if word in w2v_words:
            vec = model.wv[word]
            sent += vec
            count += 1
    if count != 0:
        sent /= count
    return sent

In [None]:
vector = []
for sent in X:
    sentence = lyric_vector(sent, 200)
    vector.append(sentence)

In [None]:
lyric_vector = pd.DataFrame(vector)
lyric_vector['sentiment'] = pd.DataFrame(df['sentiment'])
lyric_vector.head()

# Preparing training and test set

In [None]:
X = lyric_vector.drop('sentiment',axis = 1)
y = lyric_vector['sentiment']

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = .2,random_state =42)

# Classification!

## RandomForestClassifier

In [None]:
from sklearn.ensemble import RandomForestClassifier


rf_model = RandomForestClassifier()


# Fit our model with our training data.
rf_model.fit(X_train, y_train)


# Make new predictions of our testing data. 
y_pred = rf_model.predict(X_test)


# Make predicted probabilites of our testing data
y_pred_proba = rf_model.predict_proba(X_test)

# Evaluate our model
accuracy =  rf_model.score(X_test, y_test)

# Print our evaluation metrics
print("Model Accuracy: %f" % accuracy)


In [None]:
params = {
    'n_estimators' : [5, 100, 1000],
    'criterion' : ['gini', 'entropy'],
    
}

grid_search_cv = GridSearchCV( 
    estimator=RandomForestClassifier(), 
    param_grid=params,
    scoring='f1_macro', )


# Now, with one easy command, fit all combination of trees. 
grid_search_cv.fit(X_train, y_train)


# Print the best parameters it found
print(grid_search_cv.best_params_)


# This command gives you model that has the highest f1-score. 
model = grid_search_cv.best_estimator_

In [None]:


# Now lets evaluate our model
y_pred = model.predict(X_test)

accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy Score: %f" % accuracy)

precision = precision_score(y_true=y_test, y_pred=y_pred,average = 'macro')
print("Precision Score: %f" % precision)

recall = recall_score(y_true=y_test, y_pred=y_pred,average = 'macro')
print("Recall Score: %f" % recall)

f1 = f1_score(y_true=y_test, y_pred=y_pred,average = 'macro')
print('F1 Score: %f' % f1)

In [None]:
training_score = model.score(X_train, y_train)
testing_score = model.score(X_test, y_test)

print("training score:", training_score)
print("testing score:", testing_score)

In [None]:


logit = log(solver='lbfgs',max_iter=5000).fit(X_train , y_train)
training_score = logit.score(X_train, y_train)
testing_score = logit.score(X_test, y_test)
y_pred = logit.predict(X_test)

print("training score:", training_score)
print("testing score:", testing_score)
print("precision score:" ,precision_score(y_test,y_pred, average = 'macro'))
print("recall score:", recall_score(y_test,y_pred, average = 'macro'))
print("f1Score:",f1_score(y_test,y_pred,average='macro'))
print('Classification Report: \n',metrics.classification_report(y_test, y_pred))

