# Load libraries

In [6]:
from tensorflow import keras
from keras.preprocessing.text import Tokenizer
from keras.layers import Dense, Flatten
from tensorflow.keras.layers import Embedding
from keras.models import Sequential
from tensorflow.keras import layers, models, losses, optimizers,callbacks

from keras.layers import Dense,LSTM
from keras.utils import pad_sequences


import pandas as pd
import numpy as np
import re

import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from gensim.models import Word2Vec
# Fetch the English stop-word collection once. NOTE: this rebinds the name
# `stopwords` from the object imported from nltk.corpus to whatever
# `.words('english')` returns, so the corpus object is no longer reachable
# under that name after this line has run.
stopwords = stopwords.words('english')

# load preprocessed data

In [7]:
# Read the preprocessed dataset (path is relative to the working directory).
df = pd.read_csv('preprocessed_data.csv')
# Show (rows, columns) as the cell output as a quick sanity check.
df.shape

(158353, 6)

In [8]:
# Force the lyrics column to plain strings for the tokenizer. Missing values
# are replaced with '' first: a bare `.astype(str)` would turn NaN into the
# literal text 'nan', which the tokenizer would then count as a real word.
df['clean_lyrics'] = df['clean_lyrics'].fillna('').astype(str)

# Create features X and target y

In [14]:
# Shuffle all rows. The seed is fixed (same value the train/test split uses)
# so that a fresh Restart & Run All reproduces the same row order.
df = df.sample(frac=1, random_state=45)

# Model input: the cleaned lyrics text of each row.
X = df['clean_lyrics']

# Target: the `label` column, taken from the same shuffled frame so that it
# stays aligned row-for-row with X.
y = df['label']

# Vectorize X

In [15]:
# Limit the tokenizer's vocabulary size
max_words = 10000

# Every sequence is padded or truncated to exactly this many tokens
max_length = 300

# Read the raw texts straight from the frame rather than from `X`: this cell
# rebinds `X` to the padded integer matrix below, so using `X` as the input
# would make a second run of the cell feed integers to the tokenizer.
texts = df['clean_lyrics']

# Create and fit the tokenizer.
# NOTE(review): it is fitted on every row, before the train/test split, so the
# vocabulary is also built from rows that later end up in the test set.
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(texts)

# Turn each text into the list of integer indices of its words
sequences = tokenizer.texts_to_sequences(texts)

# Reverse lookup (index -> word) exposed by the fitted tokenizer
index_word = tokenizer.index_word

# Pad at the end ('post') so that all rows share the same length
X = pad_sequences(sequences, maxlen=max_length, padding='post')
print(X.shape)

(158353, 300)


# Split train and test sets

In [16]:
# Hold out 20% of the rows for the final test; with the seed fixed, the same
# input yields the same partition on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=45,
)
(X_train.shape, y_train.shape)

((126682, 300), (126682,))

# Building the LSTM neural net

In [17]:
# Stacked-LSTM regressor: token ids -> embeddings -> two LSTM layers -> one
# linear output unit.
model = Sequential()

# The embedding layer learns a 32-dimensional vector for each of the
# `max_words` token ids as part of training this model (the vectors are
# trained here from scratch; no pretrained word2vec model is loaded).
model.add(Embedding(max_words, 32, input_length=max_length))

# The first LSTM returns its full output sequence so that the second LSTM
# receives one vector per time step; the second returns only its last output.
model.add(LSTM(50, return_sequences=True, dropout=0.2))
model.add(LSTM(50, dropout=0.2))

# Single linear unit: the network predicts one unbounded real number.
model.add(Dense(1, kernel_initializer='normal', activation='linear'))

# `learning_rate` is the supported keyword; the old `lr` alias is deprecated
# and is rejected by newer Keras releases.
optimizer = optimizers.Adam(learning_rate=0.003)
model.compile(loss='mean_squared_error', optimizer=optimizer, metrics=['mse'])

model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 300, 32)           320000    
                                                                 
 lstm_2 (LSTM)               (None, 300, 50)           16600     
                                                                 
 lstm_3 (LSTM)               (None, 50)                20200     
                                                                 
 dense_1 (Dense)             (None, 1)                 51        
                                                                 
Total params: 356,851
Trainable params: 356,851
Non-trainable params: 0
_________________________________________________________________


# Training

In [18]:
# Stop once the validation MSE has failed to improve for 2 epochs in a row,
# and roll the model back to the best weights seen.
callback = callbacks.EarlyStopping(
    monitor='val_mse',
    patience=2,
    restore_best_weights=True,
)

# 20% of the training data is set aside for validation; at most 15 epochs.
hist = model.fit(
    X_train,
    y_train,
    epochs=15,
    batch_size=20,
    validation_split=0.2,
    callbacks=[callback],
)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15


In [19]:
# Persist the trained model to 'LSTM_Valence_model.h5' in the working directory.
model.save('LSTM_Valence_model.h5')

In [20]:
# Reload the model from the saved file and rebind `model` to it, so the
# evaluation below runs against the saved artefact.
model = keras.models.load_model('LSTM_Valence_model.h5')


In [21]:
# Display the loaded model object's repr as the cell output.
model

<keras.engine.sequential.Sequential at 0x23fdb4a6e20>

# Testing

In [22]:
# `evaluate` returns the loss followed by the compiled metrics, so position 1
# is the 'mse' metric.
scores = model.evaluate(X_test, y_test, verbose=0)
mse = scores[1]

print('Test mse with stacked LSTM:', mse)

Test mse with stacked LSTM: 0.04978620260953903
