In [27]:
from keras.preprocessing.text import Tokenizer
from keras.layers import Dense, Flatten
from tensorflow.keras.layers import Embedding
from keras.models import Sequential
from keras.layers import Dense, Conv1D, MaxPooling1D, GlobalMaxPooling1D
from keras.utils import pad_sequences

from keras.utils.np_utils import to_categorical
import pandas as pd
import numpy as np
import re

import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize

from sklearn.model_selection import train_test_split

from gensim.models import Word2Vec
stopwords = stopwords.words('english')




from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix
from sklearn import metrics

# Location of the labelled-lyrics dataset, relative to the working directory.
lyrics_csv_path = 'data/labeled_lyrics_cleaned.csv'
df = pd.read_csv(lyrics_csv_path)

# (rows, columns) of the frame that was just loaded
df.shape

(158353, 5)

In [28]:
# Preview the first rows of the loaded frame to check its columns.
df.head()

Unnamed: 0.1,Unnamed: 0,artist,seq,song,label
0,0,Elijah Blake,"No, no\r\nI ain't ever trapped out the bando\r...",Everyday,0.626
1,1,Elijah Blake,"The drinks go down and smoke goes up, I feel m...",Live Till We Die,0.63
2,2,Elijah Blake,She don't live on planet Earth no more\r\nShe ...,The Otherside,0.24
3,3,Elijah Blake,"Trippin' off that Grigio, mobbin', lights low\...",Pinot,0.536
4,4,Elijah Blake,"I see a midnight panther, so gallant and so br...",Shadows & Diamonds,0.371


In [29]:
# Work on a 30,000-row subset, shuffled and re-indexed from 0.
# NOTE(review): head() keeps the *first* 30,000 rows of the CSV rather than a
# random draw — if the file is ordered (e.g. by artist) the subset is not
# representative; confirm this is intended.
# The fixed random_state makes the shuffle reproducible across kernel restarts.
df = (
    df.head(30000)
      .sample(frac=1, random_state=42)
      .reset_index(drop=True)
)
df.shape

(30000, 5)

In [41]:


def getSentiment(num, low=.33, high=.66):
    """Bucket a continuous score into an integer sentiment class.

    Parameters
    ----------
    num : float
        Score to classify.
    low : float, optional
        Inclusive upper bound of class 0. Defaults to .33.
    high : float, optional
        Inclusive upper bound of class 1. Defaults to .66.

    Returns
    -------
    int
        2 if ``num > high``, 1 if ``low < num <= high``, otherwise 0.
        A NaN score compares False against both cut-offs and therefore
        also yields 0.
    """
    if num > high:
        return 2
    if num > low:
        return 1
    return 0






In [42]:
# Map every continuous label onto its integer sentiment class.
df['sentiment'] = df['label'].apply(getSentiment)

df.head()



Unnamed: 0.1,Unnamed: 0,artist,seq,song,label,sentiment,clean_lyrics
18662,812,Juelz Santana,[Sizzla (Juelz Santana)]\r\nOww!! Ha ha!! \r\n...,Shottas,0.577,1,sizzla juelz santana oww ha ha click clack dea...
24615,4469,The Business,Half past 5 I'm in the pub\r\nSix O'clock it's...,Drinking and Driving,0.97,2,half past 5 im pub six oclock home grub eight ...
17929,18153,Ronnie Cuber,If you hear\r\nA song in blue\r\nLike a flower...,Prelude to a Kiss,0.179,0,hear song blue like flower crying dew heart se...
5673,4686,DJ Bobo,[Chorus 1]\r\nWe are what we are - Together\r\...,Together,0.768,2,chorus 1 together went far together together l...
27467,27292,Johnny Cash,Will you walk another mile turn a frown with a...,What on Earth Will You Do (For Heaven's Sake),0.38,1,walk another mile turn frown smile live lowly ...


In [43]:
# NOTE(review): the cast of the integer classes to ``object`` is kept from the
# original; nothing visible in this notebook requires it — confirm before
# removing.
df['sentiment'] = df['sentiment'].astype(object)

# Only the last expression of a cell is rendered automatically, so the class
# balance has to be displayed explicitly or it is silently discarded.
display(df['sentiment'].value_counts())
df.dtypes

Unnamed: 0        int64
artist           object
seq              object
song             object
label           float64
sentiment        object
clean_lyrics     object
dtype: object

In [45]:
def make_lower(lyric):
    """Return ``lyric`` converted to lower case."""
    lowered = lyric.lower()
    return lowered
def remove_punctuation(lyric):
    """Strip every character that is neither a word character nor whitespace.

    Word characters follow the regex class ``\\w`` (letters, digits and the
    underscore), so underscores are kept. Whitespace, including line breaks,
    is left untouched.

    Parameters
    ----------
    lyric : str
        Text to clean.

    Returns
    -------
    str
        ``lyric`` with all other characters removed.
    """
    # Raw string: "\w" and "\s" are not valid Python string escapes, so a
    # plain literal triggers an invalid-escape warning.
    return re.sub(r"[^\w\s]", "", lyric)


def remove_stopwords(lyric):
    """Drop stop words from ``lyric`` and return the remaining tokens as one string.

    The text is split with ``word_tokenize``; tokens found in the module-level
    ``stopwords`` collection are discarded and the rest are re-joined with
    single spaces. Matching is exact (case-sensitive).

    Parameters
    ----------
    lyric : str
        Text to filter.

    Returns
    -------
    str
        The surviving tokens joined by single spaces.
    """
    # Hash-based lookup: a set answers membership in constant time per token,
    # instead of scanning the whole stop-word collection for every word.
    stopword_set = set(stopwords)

    kept_words = [word for word in word_tokenize(lyric) if word not in stopword_set]
    return ' '.join(kept_words)



def text_pipeline(input_string):
    """Clean one lyric: lower-case it, strip punctuation, then drop stop words.

    The stages run in exactly that order, each receiving the previous
    stage's output.
    """
    for stage in (make_lower, remove_punctuation, remove_stopwords):
        input_string = stage(input_string)
    return input_string



In [46]:
# Run the cleaning pipeline over every raw lyric stored in the ``seq`` column.
df['clean_lyrics'] = df['seq'].apply(text_pipeline)

df.head()

Unnamed: 0.1,Unnamed: 0,artist,seq,song,label,sentiment,clean_lyrics
18662,812,Juelz Santana,[Sizzla (Juelz Santana)]\r\nOww!! Ha ha!! \r\n...,Shottas,0.577,1,sizzla juelz santana oww ha ha click clack dea...
24615,4469,The Business,Half past 5 I'm in the pub\r\nSix O'clock it's...,Drinking and Driving,0.97,2,half past 5 im pub six oclock home grub eight ...
17929,18153,Ronnie Cuber,If you hear\r\nA song in blue\r\nLike a flower...,Prelude to a Kiss,0.179,0,hear song blue like flower crying dew heart se...
5673,4686,DJ Bobo,[Chorus 1]\r\nWe are what we are - Together\r\...,Together,0.768,2,chorus 1 together went far together together l...
27467,27292,Johnny Cash,Will you walk another mile turn a frown with a...,What on Earth Will You Do (For Heaven's Sake),0.38,1,walk another mile turn frown smile live lowly ...


In [39]:
# NOTE(review): no visible cell creates a 'sentiment_int' column, and this
# cell's execution count (39) is out of order with its neighbours, so it
# likely raises KeyError on Restart & Run All — confirm, then fix or delete.
df['sentiment_int'] 

0

In [52]:
# Shuffle the rows; the fixed random_state keeps the row order (and therefore
# the validation split taken at training time) reproducible between runs.
df = df.sample(frac=1, random_state=42)

# Keep the cleaned text under two names: X is later replaced by the padded
# integer matrix, X_text keeps the original strings.
X = df['clean_lyrics']
X_text = X

# 'sentiment' is stored with dtype object, so cast it to a real integer array
# before one-hot encoding. num_classes is pinned to 3 so the encoded width
# always matches the 3-unit softmax output layer, even if a class is absent.
y = df['sentiment'].values.astype('int64')
y = to_categorical(y, num_classes=3)

In [53]:
# Inspect the one-hot encoded label of the first sample.
y[0]

array([0., 0., 1.], dtype=float32)

In [54]:
# Vocabulary cap handed to the tokenizer (num_words)
max_words = 20000

# Every lyric is padded or truncated to this many tokens
max_length = 500

# Build the word index from the cleaned lyric strings. X_text is read instead
# of X because X is overwritten with the padded matrix at the end of this
# cell; reading X here would break the cell when it is run a second time.
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X_text)

# Integer index -> word lookup
index_word = tokenizer.index_word

# Replace each word by its integer index, one list of indices per lyric
sequences = tokenizer.texts_to_sequences(X_text)

# Pad at the end (padding='post') so that every row has max_length entries
X = pad_sequences(sequences, maxlen=max_length, padding='post')
print(X.shape)

(30000, 500)


In [55]:
# Feed-forward text classifier, declared as one ordered list of layers
model = Sequential([
    # Trainable lookup table: maps each token index (vocabulary of max_words)
    # to a 32-dimensional vector learned together with the rest of the network
    Embedding(max_words, 32, input_length=max_length),
    # Collapse the per-token vectors of a lyric into one flat feature vector
    Flatten(),
    Dense(128, activation='relu'),
    # One output per sentiment class
    Dense(3, activation='softmax'),
])

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 500, 32)           640000    
                                                                 
 flatten_2 (Flatten)         (None, 16000)             0         
                                                                 
 dense_4 (Dense)             (None, 128)               2048128   
                                                                 
 dense_5 (Dense)             (None, 3)                 387       
                                                                 
Total params: 2,688,515
Trainable params: 2,688,515
Non-trainable params: 0
_________________________________________________________________


In [56]:
# Train for 20 epochs in mini-batches of 20, holding out 20% of the samples
# for validation; the object returned by fit() is kept in `hist`.
hist = model.fit(
    x=X,
    y=y,
    batch_size=20,
    epochs=20,
    validation_split=0.2,
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [74]:
# NOTE(review): the recorded output shows y with shape (30000,), whereas the
# one-hot encoding cell above produces a 2-D array — y was presumably
# reassigned in a cell not shown here (execution count jumps to 74); confirm.
X.shape,y.shape

((30000, 500), (30000,))