# TL;DR: skip to the Conclusion section at the bottom

In [1]:
from keras.preprocessing.text import Tokenizer
from keras.layers import Dense, Flatten
from tensorflow.keras.layers import Embedding
from keras.models import Sequential
from tensorflow.keras import layers, models, losses, optimizers

from keras.layers import Dense, Conv1D, MaxPooling1D, GlobalMaxPooling1D,LSTM,SpatialDropout1D
from keras.utils import pad_sequences

from keras.utils.np_utils import to_categorical
import pandas as pd
import numpy as np
import re

import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split

from gensim.models import Word2Vec
stopwords = stopwords.words('english')

In [2]:
# Load the labeled lyrics dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('data/labeled_lyrics_w_genres.csv')
# Show (rows, columns) as a quick sanity check on the load.
df.shape

(145250, 7)

In [3]:
# Preview the first rows to inspect the raw columns.
df.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,artist,seq,song,label,genre
0,0,0,Elijah Blake,"No, no\r\nI ain't ever trapped out the bando\r...",Everyday,0.626,R&B
1,1,1,Elijah Blake,"The drinks go down and smoke goes up, I feel m...",Live Till We Die,0.63,Pop
2,2,2,Elijah Blake,She don't live on planet Earth no more\r\nShe ...,The Otherside,0.24,R&B
3,3,3,Elijah Blake,"Trippin' off that Grigio, mobbin', lights low\...",Pinot,0.536,R&B
4,4,4,Elijah Blake,"I see a midnight panther, so gallant and so br...",Shadows & Diamonds,0.371,R&B


In [4]:
# Module-level lemmatizer instance.
# NOTE(review): lem_words below builds its own local WordNetLemmatizer, so no
# function in this cell reads this variable — confirm whether it is still needed.
lemmatizer = WordNetLemmatizer()
# 1. function that makes all text lowercase.
def make_lowercase(test_string):
    """Return ``test_string`` converted to lowercase."""
    lowered = test_string.lower()
    return lowered

# 2. function that removes all punctuation. 
def remove_punc(test_string):
    """Strip every punctuation character from ``test_string``.

    Word characters (letters, digits) and whitespace are kept; everything
    else, including the underscore, is deleted (not replaced by a space).

    Args:
        test_string: The text to clean.

    Returns:
        The text with punctuation removed.
    """
    # ``\w`` also matches "_", so ``[^\w\s]`` alone would leave underscores in
    # place; the explicit ``|_`` alternative removes them as well.
    return re.sub(r'[^\w\s]|_', '', test_string)

# 3. function that removes all stopwords.
def remove_stopwords(test_string):
    """Drop stopwords from ``test_string``.

    The text is split with ``word_tokenize`` and every token is compared
    (case-sensitively) against the module-level ``stopwords`` collection.

    Args:
        test_string: The text to filter.

    Returns:
        The surviving tokens joined by single spaces.
    """
    # Build a set once per call so each membership test is a hash lookup
    # instead of a linear scan of the stopword list for every token.
    stopword_set = set(stopwords)

    kept_words = [
        word for word in word_tokenize(test_string)
        if word not in stopword_set
    ]

    return ' '.join(kept_words)

# 4. function to break words into their stem words
def stem_words(a_string):
    """Replace every token of ``a_string`` with its Porter stem.

    Tokens are produced by ``word_tokenize``; the stems are re-joined with
    single spaces.
    """
    stemmer = PorterStemmer()
    tokens = word_tokenize(a_string)
    return ' '.join(stemmer.stem(token) for token in tokens)
def lem_words(a_string):
    """Replace every token of ``a_string`` with its WordNet lemma.

    Tokens are produced by ``word_tokenize``. No part-of-speech tag is passed
    to ``lemmatize``, so the lemmatizer's default applies. The lemmas are
    re-joined with single spaces.
    """
    # A fresh lemmatizer is created on each call.
    wordnet_lemmatizer = WordNetLemmatizer()
    tokens = word_tokenize(a_string)
    lemmas = [wordnet_lemmatizer.lemmatize(token) for token in tokens]
    return ' '.join(lemmas)

In [5]:
# Pipeline

def text_processing_pipeline(a_string):
    """Clean one lyric string.

    Applies, in order: lowercasing, punctuation removal, stopword removal and
    lemmatization, and returns the resulting string.
    """
    # NOTE(review): punctuation is stripped before stopwords are filtered, so
    # contractions lose their apostrophe first (e.g. "don't" -> "dont") —
    # confirm that this ordering is intended.
    cleaning_steps = (make_lowercase, remove_punc, remove_stopwords, lem_words)
    for step in cleaning_steps:
        a_string = step(a_string)
    return a_string

In [6]:
# Run the cleaning pipeline on every lyric in the `seq` column and store the
# result in a new `clean_lyrics` column.
df['clean_lyrics'] = df.seq.apply(text_processing_pipeline)

# Preview the frame to compare raw `seq` against `clean_lyrics`.
df.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,artist,seq,song,label,genre,clean_lyrics
0,0,0,Elijah Blake,"No, no\r\nI ain't ever trapped out the bando\r...",Everyday,0.626,R&B,aint ever trapped bando oh lord dont get wrong...
1,1,1,Elijah Blake,"The drinks go down and smoke goes up, I feel m...",Live Till We Die,0.63,Pop,drink go smoke go feel got let go care get los...
2,2,2,Elijah Blake,She don't live on planet Earth no more\r\nShe ...,The Otherside,0.24,R&B,dont live planet earth found love venus thats ...
3,3,3,Elijah Blake,"Trippin' off that Grigio, mobbin', lights low\...",Pinot,0.536,R&B,trippin grigio mobbin light low trippin grigio...
4,4,4,Elijah Blake,"I see a midnight panther, so gallant and so br...",Shadows & Diamonds,0.371,R&B,see midnight panther gallant brave found found...


In [7]:
# Shuffle all rows. A fixed random_state makes the shuffle repeatable, so later
# validation splits and recorded metrics can be reproduced on a fresh kernel.
df = df.sample(frac=1, random_state=45)

# Model input: the cleaned lyric text.
X = df['clean_lyrics']

# Regression target: the numeric `label` column.
y = df['label']

# Keep a handle on the text Series; `X` is rebound to padded integer
# sequences in a later cell.
X_text = X

In [8]:
# Cap on the tokenizer's vocabulary size
max_words = 10000


# Create the tokenizer with that vocabulary cap
tokenizer = Tokenizer(num_words=max_words)


# Fit the tokenizer on the cleaned lyrics
tokenizer.fit_on_texts(X)


# Create the sequences for each lyric, turning each word into its integer index
sequences = tokenizer.texts_to_sequences(X)


# Index -> word lookup taken from the fitted tokenizer
index_word = tokenizer.index_word


# Fixed sequence length used for padding (300 tokens)
max_length = 300


# Convert the sequences to all be the same length (max_length); padding='post'
# appends the padding after the tokens.
# NOTE: X is rebound from the text Series to the padded array here, so this
# cell must not be re-run without re-running the cell that defines X first.
X = pad_sequences(sequences, maxlen=max_length, padding='post')
print(X.shape)

(145250, 300)


In [25]:
# Baseline regressor: Embedding -> one LSTM -> single linear output unit.
model = Sequential()

# The Embedding layer learns a 32-dimensional vector for each vocabulary index
# while the network trains (it is not a pre-trained word2vec model).
model.add(Embedding(max_words, 32, input_length=max_length))

model.add(LSTM(50, dropout=0.2))

# One linear unit because the model is trained as a regression (MSE loss).
model.add(Dense(1, kernel_initializer='normal', activation='linear'))
# `learning_rate` is the supported keyword; `lr` is a deprecated alias.
optimizer = optimizers.Adam(learning_rate=0.003)
model.compile(loss='mean_squared_error', optimizer=optimizer, metrics=['mse'])

model.summary()

Model: "sequential_8"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_8 (Embedding)     (None, 300, 32)           320000    
                                                                 
 lstm_18 (LSTM)              (None, 50)                16600     
                                                                 
 dense_7 (Dense)             (None, 1)                 51        
                                                                 
Total params: 336,651
Trainable params: 336,651
Non-trainable params: 0
_________________________________________________________________


In [13]:
# Train the single-LSTM model on the full padded dataset, holding out 20% via
# validation_split; the returned History object is kept in `hist`.
hist = model.fit(X, y, 
                 validation_split=0.2, 
                 epochs=30, batch_size=20)

Epoch 1/30


  return t[start:end]


Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30

KeyboardInterrupt: 

In [27]:
# Variant: two stacked LSTMs with both input dropout and recurrent dropout.
model = Sequential()

# The Embedding layer learns a 32-dimensional vector for each vocabulary index
# while the network trains (it is not a pre-trained word2vec model).
model.add(Embedding(max_words, 32, input_length=max_length))

# return_sequences=True so the second LSTM receives the full sequence.
# NOTE(review): recurrent_dropout usually rules out the fast cuDNN LSTM
# kernel, which may explain slow epochs for this variant — confirm.
model.add(LSTM(50, return_sequences=True, dropout=0.2, recurrent_dropout=0.2))
model.add(LSTM(50, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, kernel_initializer='normal', activation='linear'))
# `learning_rate` is the supported keyword; `lr` is a deprecated alias.
optimizer = optimizers.Adam(learning_rate=0.003)
model.compile(loss='mean_squared_error', optimizer=optimizer, metrics=['mse'])

model.summary()

Model: "sequential_9"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_9 (Embedding)     (None, 300, 32)           320000    
                                                                 
 lstm_19 (LSTM)              (None, 300, 50)           16600     
                                                                 
 lstm_20 (LSTM)              (None, 50)                20200     
                                                                 
 dense_8 (Dense)             (None, 1)                 51        
                                                                 
Total params: 356,851
Trainable params: 356,851
Non-trainable params: 0
_________________________________________________________________


In [28]:
# Train the two-LSTM (recurrent dropout) model on the full padded dataset,
# holding out 20% via validation_split.
hist = model.fit(X, y, 
                 validation_split=0.2, 
                 epochs=15, batch_size=20)

Epoch 1/15
  19/5810 [..............................] - ETA: 53:31 - loss: 0.1184 - mse: 0.1184

KeyboardInterrupt: 

In [29]:
# Variant: two stacked LSTMs with input dropout only.
model = Sequential()

# The Embedding layer learns a 32-dimensional vector for each vocabulary index
# while the network trains (it is not a pre-trained word2vec model).
model.add(Embedding(max_words, 32, input_length=max_length))

# return_sequences=True so the second LSTM receives the full sequence.
model.add(LSTM(50, return_sequences=True, dropout=0.2))
model.add(LSTM(50, dropout=0.2))
model.add(Dense(1, kernel_initializer='normal', activation='linear'))
# `learning_rate` is the supported keyword; `lr` is a deprecated alias.
optimizer = optimizers.Adam(learning_rate=0.003)
model.compile(loss='mean_squared_error', optimizer=optimizer, metrics=['mse'])

model.summary()

Model: "sequential_10"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_10 (Embedding)    (None, 300, 32)           320000    
                                                                 
 lstm_21 (LSTM)              (None, 300, 50)           16600     
                                                                 
 lstm_22 (LSTM)              (None, 50)                20200     
                                                                 
 dense_9 (Dense)             (None, 1)                 51        
                                                                 
Total params: 356,851
Trainable params: 356,851
Non-trainable params: 0
_________________________________________________________________


In [30]:
# Train the two-LSTM model on the full padded dataset, holding out 20% via
# validation_split.
hist = model.fit(X, y, 
                 validation_split=0.2, 
                 epochs=15, batch_size=20)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


In [31]:
# Variant: three stacked LSTMs with input dropout.
model = Sequential()

# The Embedding layer learns a 32-dimensional vector for each vocabulary index
# while the network trains (it is not a pre-trained word2vec model).
model.add(Embedding(max_words, 32, input_length=max_length))

# Every LSTM except the last returns the full sequence for the next layer.
model.add(LSTM(50, return_sequences=True, dropout=0.2))
model.add(LSTM(50, return_sequences=True, dropout=0.2))
model.add(LSTM(50, dropout=0.2))
model.add(Dense(1, kernel_initializer='normal', activation='linear'))
# `learning_rate` is the supported keyword; `lr` is a deprecated alias.
optimizer = optimizers.Adam(learning_rate=0.003)
model.compile(loss='mean_squared_error', optimizer=optimizer, metrics=['mse'])

model.summary()

Model: "sequential_11"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_11 (Embedding)    (None, 300, 32)           320000    
                                                                 
 lstm_23 (LSTM)              (None, 300, 50)           16600     
                                                                 
 lstm_24 (LSTM)              (None, 300, 50)           20200     
                                                                 
 lstm_25 (LSTM)              (None, 50)                20200     
                                                                 
 dense_10 (Dense)            (None, 1)                 51        
                                                                 
Total params: 377,051
Trainable params: 377,051
Non-trainable params: 0
_________________________________________________________________


In [32]:
# Train the three-LSTM model on the full padded dataset, holding out 20% via
# validation_split.
hist = model.fit(X, y, 
                 validation_split=0.2, 
                 epochs=15, batch_size=20)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


In [33]:
# Variant: four stacked LSTMs with input dropout.
model = Sequential()

# The Embedding layer learns a 32-dimensional vector for each vocabulary index
# while the network trains (it is not a pre-trained word2vec model).
model.add(Embedding(max_words, 32, input_length=max_length))

# Every LSTM except the last returns the full sequence for the next layer.
model.add(LSTM(50, return_sequences=True, dropout=0.2))
model.add(LSTM(50, return_sequences=True, dropout=0.2))
model.add(LSTM(50, return_sequences=True, dropout=0.2))
model.add(LSTM(50, dropout=0.2))
model.add(Dense(1, kernel_initializer='normal', activation='linear'))
# `learning_rate` is the supported keyword; `lr` is a deprecated alias.
optimizer = optimizers.Adam(learning_rate=0.003)
model.compile(loss='mean_squared_error', optimizer=optimizer, metrics=['mse'])

model.summary()

Model: "sequential_12"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_12 (Embedding)    (None, 300, 32)           320000    
                                                                 
 lstm_26 (LSTM)              (None, 300, 50)           16600     
                                                                 
 lstm_27 (LSTM)              (None, 300, 50)           20200     
                                                                 
 lstm_28 (LSTM)              (None, 300, 50)           20200     
                                                                 
 lstm_29 (LSTM)              (None, 50)                20200     
                                                                 
 dense_11 (Dense)            (None, 1)                 51        
                                                                 
Total params: 397,251
Trainable params: 397,251
Non-t

In [34]:
# Train the four-LSTM model on the full padded dataset, holding out 20% via
# validation_split.
hist = model.fit(X, y, 
                 validation_split=0.2, 
                 epochs=15, batch_size=20)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15

KeyboardInterrupt: 

In [35]:
# Variant: two stacked LSTMs with a wider (64-dimensional) embedding.
model = Sequential()

# The Embedding layer learns a 64-dimensional vector for each vocabulary index
# while the network trains (it is not a pre-trained word2vec model).
model.add(Embedding(max_words, 64, input_length=max_length))

# return_sequences=True so the second LSTM receives the full sequence.
model.add(LSTM(50, return_sequences=True, dropout=0.2))
model.add(LSTM(50, dropout=0.2))
model.add(Dense(1, kernel_initializer='normal', activation='linear'))
# `learning_rate` is the supported keyword; `lr` is a deprecated alias.
optimizer = optimizers.Adam(learning_rate=0.003)
model.compile(loss='mean_squared_error', optimizer=optimizer, metrics=['mse'])

model.summary()

Model: "sequential_13"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_13 (Embedding)    (None, 300, 64)           640000    
                                                                 
 lstm_30 (LSTM)              (None, 300, 50)           23000     
                                                                 
 lstm_31 (LSTM)              (None, 50)                20200     
                                                                 
 dense_12 (Dense)            (None, 1)                 51        
                                                                 
Total params: 683,251
Trainable params: 683,251
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Train the 64-dimensional-embedding model on the full padded dataset, holding
# out 20% via validation_split.
hist = model.fit(X, y, 
                 validation_split=0.2, 
                 epochs=15, batch_size=20)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15

In [9]:
# Variant (re-run): two stacked LSTMs with input and recurrent dropout.
model = Sequential()

# The Embedding layer learns a 32-dimensional vector for each vocabulary index
# while the network trains (it is not a pre-trained word2vec model).
model.add(Embedding(max_words, 32, input_length=max_length))

# return_sequences=True so the second LSTM receives the full sequence.
# NOTE(review): recurrent_dropout usually rules out the fast cuDNN LSTM
# kernel, which may explain slow epochs for this variant — confirm.
model.add(LSTM(50, return_sequences=True, dropout=0.2, recurrent_dropout=0.2))
model.add(LSTM(50, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, kernel_initializer='normal', activation='linear'))
# `learning_rate` is the supported keyword; `lr` is a deprecated alias.
optimizer = optimizers.Adam(learning_rate=0.003)
model.compile(loss='mean_squared_error', optimizer=optimizer, metrics=['mse'])

model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 300, 32)           320000    
                                                                 
 lstm (LSTM)                 (None, 300, 50)           16600     
                                                                 
 lstm_1 (LSTM)               (None, 50)                20200     
                                                                 
 dense (Dense)               (None, 1)                 51        
                                                                 
Total params: 356,851
Trainable params: 356,851
Non-trainable params: 0
_________________________________________________________________


  super().__init__(name, **kwargs)


In [None]:
# Train the two-LSTM (recurrent dropout) model on the full padded dataset,
# holding out 20% via validation_split.
hist = model.fit(X, y, 
                 validation_split=0.2, 
                 epochs=15, batch_size=20)

Epoch 1/15


  return t[start:end]


Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
 872/5810 [===>..........................] - ETA: 11:43 - loss: 0.0388 - mse: 0.0388

# Conclusion
## After experimenting, two stacked LSTM layers appear to perform best. One final model is trained below using separate training and test sets.

In [10]:
# Hold out 20% of the padded sequences as a test set; the fixed random_state
# makes this split repeatable for a given ordering of X and y.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=45)
# Show the training shapes to confirm features and labels line up.
X_train.shape,y_train.shape

((116200, 300), (116200,))

In [14]:
# Final model: two stacked LSTMs with input dropout, to be trained on the
# training split only.
model = Sequential()

# The Embedding layer learns a 32-dimensional vector for each vocabulary index
# while the network trains (it is not a pre-trained word2vec model).
model.add(Embedding(max_words, 32, input_length=max_length))

# return_sequences=True so the second LSTM receives the full sequence.
model.add(LSTM(50, return_sequences=True, dropout=0.2))
model.add(LSTM(50, dropout=0.2))
model.add(Dense(1, kernel_initializer='normal', activation='linear'))
# `learning_rate` is the supported keyword; `lr` is a deprecated alias.
optimizer = optimizers.Adam(learning_rate=0.003)
# 'mse' stays in `metrics` because the evaluation cell reads index 1 of
# evaluate()'s result.
model.compile(loss='mean_squared_error', optimizer=optimizer, metrics=['mse'])

model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 300, 32)           320000    
                                                                 
 lstm_2 (LSTM)               (None, 300, 50)           16600     
                                                                 
 lstm_3 (LSTM)               (None, 50)                20200     
                                                                 
 dense_1 (Dense)             (None, 1)                 51        
                                                                 
Total params: 356,851
Trainable params: 356,851
Non-trainable params: 0
_________________________________________________________________


In [15]:
# Fit on the training split only; validation_split carves a further 20% out of
# X_train for validation, so X_test stays unseen during training.
hist = model.fit(X_train, y_train, 
                 validation_split=0.2, 
                 epochs=4, batch_size=20)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


In [17]:
# evaluate() returns the loss followed by the compiled metrics; index 1 is the
# 'mse' metric requested in compile() (index 0 is the loss, also MSE here).
mse= model.evaluate(X_test, y_test, verbose=0)[1]

# Report the held-out test error.
print('Test mse with stacked LSTM:', mse)

Test mse with stacked LSTM: 0.05032280460000038
