# Applied Natural Language Processing - AT2 - HDInnovators
__Content:__

1. Import packages and create functions
2. Load the dataset
3. Data preparation
4. Data exploration
5. Sentiment analysis

  5.1. RandomForest 
  
  5.2. LSTM

6. Topic Modelling

## 1. Import packages and create functions

In [2]:
!pip install -q -U keras-tuner

In [3]:
import locale

import tensorflow as tf

# Ask TensorFlow for a GPU; an empty name means none is visible, so fall back to the CPU.
device_name = tf.test.gpu_device_name()
if device_name:
    print("Found GPU at: {}".format(device_name))
else:
    device_name = "/device:CPU:0"
    print("No GPU, using {}.".format(device_name))


def getpreferredencoding(do_setlocale = True):
    """Report UTF-8 as the preferred encoding, whatever the system locale says."""
    return "UTF-8"


# Swap in the stub so every later call to locale.getpreferredencoding returns UTF-8.
locale.getpreferredencoding = getpreferredencoding

Found GPU at: /device:GPU:0


In [4]:
# data processing
import pandas as pd
import re
import spacy
import string
import numpy as np

# nltk
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer

nltk.download('wordnet')
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')

# drive
from google.colab import drive
drive.mount('/content/drive')

# data splitting
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold

# hyperparameter tuning
from sklearn.model_selection import GridSearchCV
import keras_tuner as kt

# visualization
import seaborn as sns
import matplotlib.pyplot as plt
from wordcloud import WordCloud

# vectorization
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# tensorflow deep learning 
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional, Flatten, SpatialDropout1D
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import regularizers
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping

# RandomForest classifier
from sklearn.ensemble import RandomForestClassifier

# evaluation 
from sklearn.metrics import accuracy_score

# ignore warnings 
import warnings
with warnings.catch_warnings():
    warnings.simplefilter('ignore')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
# Create a function to transform valence sentiment labels
def create_sentiment_from_valence(valence, negative_below=0.45, positive_above=0.55):
    """Bucket a valence score into a three-class sentiment label.

    Args:
        valence: Numeric valence score of a track.
        negative_below: Scores strictly below this value are labelled negative.
        positive_above: Scores strictly above this value are labelled positive.

    Returns:
        0 for negative, 2 for positive, and 1 (neutral) for everything else.
        Scores equal to either threshold are neutral, and so is NaN, because
        every comparison with NaN is false.

    Raises:
        ValueError: If ``negative_below`` is greater than ``positive_above``.
    """
    if negative_below > positive_above:
        raise ValueError("negative_below must not exceed positive_above")
    if valence < negative_below:
        return 0
    if valence > positive_above:
        return 2
    return 1

## 2. Load and explore the dataset

__Load dataset__

In [6]:
# Read the combined lyrics CSV from the mounted Drive folder into a DataFrame.
lyrics_csv_path = '/content/drive/My Drive/Colab Notebooks/content/lyrics_all_combined.csv'
df = pd.read_csv(lyrics_csv_path)

## 3. Data preparation

__Lowercase__

In [7]:
# Normalise the lyrics column: text only, all lowercase.
# Missing lyrics are replaced with an empty string first; casting NaN with
# str() would otherwise turn it into the literal word "nan", which would then
# be tokenised as if it were part of a lyric.
df['lyrics'] = df['lyrics'].fillna('').apply(str).str.lower()

 __Tokenize__

In [8]:
# Split every lyric into NLTK word tokens and keep the token lists in a new column.
df['tokenized'] = df['lyrics'].map(word_tokenize)

__Remove punctuations__

In [9]:
# Every character of string.punctuation, one list entry per character.
punc_marks = [mark for mark in string.punctuation]


def _strip_punctuation(tokens):
    """Return the tokens that are not exactly one of the punctuation marks."""
    return [token for token in tokens if token not in punc_marks]


# Drop stand-alone punctuation tokens from every tokenised lyric.
df['tokenized'] = df['tokenized'].apply(_strip_punctuation)

__Remove stopwords__

In [10]:
# All stopwords of nltk.
# NOTE: this rebinds the name `stopwords` that was imported from nltk.corpus;
# the fully qualified nltk.corpus.stopwords is used so the cell can be re-run.
stopwords = nltk.corpus.stopwords.words('english')

# Customized list of stop words (contraction fragments and lyric filler sounds).
stopwords.extend(["'m", "'s", "'d", "hi", "im", "wa", "n't", "'get", "'ll", "'re", "'ve", "get", "still", "mmm", "ooh", "oooh", "yah", "yeh", "hmm"])

# Membership tests on a list scan every entry for every token; a set gives the
# same answer with a hash lookup. `stopwords` itself stays a list.
stopword_lookup = set(stopwords)

# Remove the stop words from the dataset and save the result to new column.
df['cleaned_stopwords'] = df['tokenized'].apply(lambda x: [word for word in x if word not in stopword_lookup])

__Lemmatize__

In [11]:
# Function to attain part of speech of words.
def determine_wordnet_speech(word_tag):
    """Translate a POS tag into the matching WordNet part-of-speech constant.

    Tags starting with J, V, N or R map to adjective, verb, noun or adverb
    respectively; any other tag (including an empty one) is treated as a noun.
    """
    wordnet_by_prefix = {
        'J': wordnet.ADJ,
        'V': wordnet.VERB,
        'N': wordnet.NOUN,
        'R': wordnet.ADV,
    }
    # Only the first character decides the category.
    return wordnet_by_prefix.get(word_tag[:1], wordnet.NOUN)
    
# Tag every token with its part of speech and convert the tag to WordNet format in one pass.
df['pos_tags'] = df['cleaned_stopwords'].apply(
    lambda tokens: [(token, determine_wordnet_speech(tag)) for token, tag in nltk.tag.pos_tag(tokens)]
)

# Lemmatize every (word, WordNet tag) pair and save the result to new column.
word_lemmatizer = WordNetLemmatizer()


def _lemmatize_tagged(tagged_tokens):
    """Lemmatize a list of (word, WordNet POS) pairs."""
    return [word_lemmatizer.lemmatize(token, pos) for token, pos in tagged_tokens]


df['lyrics_lemmatized'] = df['pos_tags'].apply(_lemmatize_tagged)

# Join each lemma list back into one space-separated string.
df['lyrics_cleaned'] = [' '.join(str(lemma) for lemma in lemmas) for lemmas in df['lyrics_lemmatized']]

# Check few rows.
df.head(3)

Unnamed: 0.1,Unnamed: 0,artist_name,track_name,release_date,genre,lyrics,len,valence,tokenized,cleaned_stopwords,pos_tags,lyrics_lemmatized,lyrics_cleaned
0,13923,santana,wham!,1978,blues,cold chamber smoke kush gettin higher plane sw...,198,0.857791,"[cold, chamber, smoke, kush, gettin, higher, p...","[cold, chamber, smoke, kush, gettin, higher, p...","[(cold, a), (chamber, n), (smoke, v), (kush, n...","[cold, chamber, smoke, kush, gettin, high, pla...",cold chamber smoke kush gettin high plane swan...
1,14958,marvin sease,show me what you got,1991,blues,public service announcement weezy baby best ra...,198,0.79493,"[public, service, announcement, weezy, baby, b...","[public, service, announcement, weezy, baby, b...","[(public, a), (service, n), (announcement, n),...","[public, service, announcement, weezy, baby, b...",public service announcement weezy baby best ra...
2,15045,the robert cray band,1040 blues,1993,blues,gotta rapper today forget fuck smokin brain ce...,198,0.613561,"[got, ta, rapper, today, forget, fuck, smokin,...","[got, ta, rapper, today, forget, fuck, smokin,...","[(got, v), (ta, a), (rapper, n), (today, n), (...","[get, ta, rapper, today, forget, fuck, smokin,...",get ta rapper today forget fuck smokin brain c...


__Encode label__

In [12]:
# Derive the sentiment class of every track from its valence score.
df['sentiment'] = df['valence'].map(create_sentiment_from_valence)

__Data splitting__

In [13]:
# Extract labels
y = df['sentiment']
# Extract independent variables
X = df['lyrics_cleaned']
# Split data into train and test (stratified on the sentiment label)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify = y)
# Create 3 cv for cross-validation.
# `.split()` returns a one-shot generator; materialise it so the same
# (train_indices, test_indices) folds can be iterated, or passed as `cv=`,
# more than once.
cv = list(StratifiedKFold(n_splits=3, random_state=8, shuffle=True).split(X_train, y_train))

## 4. Data exploration

## 5. Sentiment analysis 

### 5.1 RandomForest

__Vectorization__

In [13]:
# Bag-of-words model: unigrams and bigrams, English stop words removed, keeping at
# most 100 terms whose document frequency lies between 20% and 70%.
vectorizer = CountVectorizer(
    stop_words='english',
    ngram_range=(1, 2),
    min_df=0.2,
    max_df=0.7,
    max_features=100,
)
# Learn the vocabulary from the training set and build its document-term matrix in one step.
X_train_count_vectorized = vectorizer.fit_transform(X_train)
# Encode the test set with the vocabulary learned from the training set.
X_test_count_vectorized = vectorizer.transform(X_test)

In [14]:
# Re-weight the raw counts with TF-IDF.
tfidf_transformer = TfidfTransformer()
# Learn the idf weights from the training counts and transform them in one step.
X_train_tfidf = tfidf_transformer.fit_transform(X_train_count_vectorized)
# Apply the training-set idf weights to the test counts.
X_test_tfidf = tfidf_transformer.transform(X_test_count_vectorized)

__Hyperparameter tuning (Grid search)__

In [15]:
# Baseline forest of 50 trees with no max_depth set; the next cell inspects how deep its trees grew.
rf1 = RandomForestClassifier(random_state=8, n_estimators=50).fit(X_train_tfidf, y_train)

In [16]:
# Average depth reached by the trees of the baseline forest.
np.mean([estimator.tree_.max_depth for estimator in rf1.estimators_])

54.14

In [17]:
# Candidate values explored by the grid search, keyed by RandomForest parameter name.
hyperparams_grid = dict(
    n_estimators=[50, 100, 150],
    max_depth=[40, 55, 60],
    min_samples_split=[2, 4, 8],
)
hyperparams_grid

{'n_estimators': [50, 100, 150],
 'max_depth': [40, 55, 60],
 'min_samples_split': [2, 4, 8]}

In [18]:
# Exhaustive search over hyperparams_grid, scored by accuracy with 3-fold cross-validation.
# NOTE(review): `cv=3` lets GridSearchCV build its own folds; the `cv` splits
# created in the data-splitting cell are not used here.
grid_search_rf = GridSearchCV(RandomForestClassifier(random_state=8), hyperparams_grid, cv=3, verbose=2, scoring='accuracy')
grid_search_rf.fit(X_train_tfidf, y_train)

Fitting 3 folds for each of 27 candidates, totalling 81 fits
[CV] END .max_depth=40, min_samples_split=2, n_estimators=50; total time=   1.9s
[CV] END .max_depth=40, min_samples_split=2, n_estimators=50; total time=   2.0s
[CV] END .max_depth=40, min_samples_split=2, n_estimators=50; total time=   2.0s
[CV] END max_depth=40, min_samples_split=2, n_estimators=100; total time=   4.8s
[CV] END max_depth=40, min_samples_split=2, n_estimators=100; total time=   3.9s
[CV] END max_depth=40, min_samples_split=2, n_estimators=100; total time=   3.9s
[CV] END max_depth=40, min_samples_split=2, n_estimators=150; total time=   6.8s
[CV] END max_depth=40, min_samples_split=2, n_estimators=150; total time=   5.9s
[CV] END max_depth=40, min_samples_split=2, n_estimators=150; total time=   6.8s
[CV] END .max_depth=40, min_samples_split=4, n_estimators=50; total time=   1.9s
[CV] END .max_depth=40, min_samples_split=4, n_estimators=50; total time=   1.9s
[CV] END .max_depth=40, min_samples_split=4, n_e

In [19]:
# Parameter combination selected by the grid search.
best_params = grid_search_rf.best_params_
best_params

{'max_depth': 55, 'min_samples_split': 8, 'n_estimators': 150}

In [20]:
# Cross-validated accuracy (the grid search's scoring metric) of the selected combination.
grid_search_rf.best_score_

0.544696330148704

__Assessment on test set__

In [21]:
# Refit a forest on the whole training set with the grid-search winners
# (max_depth, min_samples_split and n_estimators are the only keys in best_params).
rf_best = (
    RandomForestClassifier(random_state=8, **best_params)
    .fit(X_train_tfidf, y_train)
)

In [22]:
# Predict sentiment classes for the held-out test set.
y_test_preds = rf_best.predict(X_test_tfidf)

In [23]:
# scikit-learn's signature is accuracy_score(y_true, y_pred): ground truth first.
accuracy_score(y_test, y_test_preds)

0.547245409015025

### 5.2 LSTM

__Split training set into another training set and validation set__

In [14]:
# Carve a stratified validation set out of the training data; the test set stays untouched.
X_train_2, X_val, y_train_2, y_val = train_test_split(X_train, y_train, random_state=42, stratify = y_train)

__Vectorization__

In [15]:
# Calculate the length of our vocabulary
word_tokenizer = Tokenizer()

# Create a dict of word and index from the list of sentences. Required before texts_to_sequences
# The vocabulary is learned from the reduced training split only.
word_tokenizer.fit_on_texts(X_train_2)

# The +1 presumably accounts for index 0, which no word is mapped to (padding) — TODO confirm.
vocab_length = len(word_tokenizer.word_index) + 1
vocab_length

29711

In [16]:
# Pad sequences
def _token_count(sentence):
    """Number of NLTK word tokens in a lyric."""
    return len(word_tokenize(sentence))


# The padding width is the token count of the longest training lyric.
longest_train_lyrics = max(X_train_2, key=_token_count)
length_long_sentence = _token_count(longest_train_lyrics)


def _encode_and_pad(texts):
    """Turn texts into integer sequences and right-pad them to ``length_long_sentence``."""
    # texts_to_sequences: each text becomes the list of word indices learned by fit_on_texts.
    sequences = word_tokenizer.texts_to_sequences(texts)
    return pad_sequences(sequences, maxlen=length_long_sentence, padding='post')


padded_train_lyrics = _encode_and_pad(X_train_2)
padded_val_lyrics = _encode_and_pad(X_val)
padded_test_lyrics = _encode_and_pad(X_test)

__Manual Hyperparameter Tuning__

In [27]:
def calculate_accuracy(prediction, y_true=None):
  """Accuracy of class-probability predictions against the true labels.

  Args:
    prediction: 2-D array-like with one row of class scores per sample.
      The predicted class of a row is the index of its largest score
      (the first one on ties).
    y_true: True class indices, one per row. Defaults to the module-level
      validation labels ``y_val``, which keeps existing one-argument calls
      working.

  Returns:
    Fraction of rows whose predicted class equals the true label, as a float.

  Raises:
    ValueError: If the number of predictions and labels differ.
  """
  if y_true is None:
    y_true = y_val
  true_labels = np.asarray(y_true).ravel()
  # Vectorised argmax replaces the per-row max()/index() loop.
  predicted_labels = np.argmax(np.asarray(prediction), axis=1)
  if predicted_labels.shape[0] != true_labels.shape[0]:
    raise ValueError(
        "Found %d predictions but %d labels" % (predicted_labels.shape[0], true_labels.shape[0]))
  return float(np.mean(predicted_labels == true_labels))

In [28]:
def model_builder(vector_size, lstm_layers, dense_layers, learning_rate):
    """Build and compile an Embedding -> LSTM stack -> Dense stack classifier.

    Reads the module-level ``vocab_length``, ``length_long_sentence`` and
    ``device_name``.

    Args:
        vector_size: Output dimension of the embedding layer.
        lstm_layers: List of dicts, one per LSTM layer, with keys ``'unit'``
            and ``'return_sequences'`` plus the optional keys ``'regularized'``
            (L2 factor for the kernel, or None) and ``'dropout'`` (rate of a
            Dropout layer added right after the LSTM, or None).
        dense_layers: List of dicts with keys ``'unit'`` and ``'activation'``.
        learning_rate: Learning rate passed to the Adam optimizer.

    Returns:
        The compiled ``Sequential`` model (sparse categorical cross-entropy
        loss, accuracy metric).
    """
    with tf.device(device_name):
        model = Sequential()
        # embedding
        model.add(
            Embedding(input_dim=vocab_length,
                      output_dim=vector_size,
                      input_length=length_long_sentence))

        # add lstm layers
        for lstm_layer in lstm_layers:
            lstm_kwargs = {'return_sequences': lstm_layer['return_sequences']}
            # A missing key is treated like an explicit None (feature disabled).
            l2_factor = lstm_layer.get('regularized')
            if l2_factor is not None:
                lstm_kwargs['kernel_regularizer'] = regularizers.l2(l2_factor)
            model.add(LSTM(lstm_layer['unit'], **lstm_kwargs))

            # add dropout
            dropout_rate = lstm_layer.get('dropout')
            if dropout_rate is not None:
                model.add(Dropout(dropout_rate))

        # add dense layers
        for dense_layer in dense_layers:
            model.add(Dense(dense_layer['unit'], activation=dense_layer['activation']))

        # compile
        model.compile(optimizer=Adam(learning_rate=learning_rate),
                      loss='sparse_categorical_crossentropy',
                      metrics=['accuracy'])
        return model

In [29]:
# Trial 1: Accuracy - 0.8915
# One 50-unit LSTM without dropout or L2 regularization, followed by a 3-way softmax.
lstm_layers = [{'unit': 50, 'return_sequences': False, 'dropout': None, 'regularized': None}]
dense_layers = [{'unit': 3, 'activation': 'softmax'}]
# Embedding size 100, learning rate 0.001.
model1 = model_builder(100, lstm_layers, dense_layers, 0.001)
# No validation data is passed here; validation accuracy is computed in the next cell.
model1.fit(padded_train_lyrics, y_train_2, batch_size = 256, epochs = 50)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x7f927afe1180>

In [30]:
# Trial 1: Val accuracy - 0.4452
# NOTE(review): the output recorded under this cell is 0.5459, which does not
# match the figure above — confirm which run the comment refers to.
prediction1 = model1.predict(padded_val_lyrics)
calculate_accuracy(prediction1)



0.5458593054318789

In [31]:
# Trial 2: Accuracy - 0.5456
# Same as trial 1 but with a 0.2 Dropout layer after the LSTM.
lstm_layers = [{'unit': 50, 'return_sequences': False, 'dropout': 0.2, 'regularized': None}]
dense_layers = [{'unit': 3, 'activation': 'softmax'}]
model2 = model_builder(100, lstm_layers, dense_layers, 0.001)
model2.fit(padded_train_lyrics, y_train_2, batch_size = 256, epochs = 50)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x7f927e89b370>

In [32]:
# Trial 2: Val accuracy - 0.5458
# Score the trial-2 model on the validation set (calculate_accuracy compares against y_val).
prediction2 = model2.predict(padded_val_lyrics)
calculate_accuracy(prediction2)



0.5458593054318789

In [33]:
# Trial 3: Accuracy - 0.5456
# Same as trial 1 but with L2 kernel regularization (0.01) on the LSTM instead of dropout.
lstm_layers = [{'unit': 50, 'return_sequences': False, 'dropout': None, 'regularized': 0.01}]
dense_layers = [{'unit': 3, 'activation': 'softmax'}]
model3 = model_builder(100, lstm_layers, dense_layers, 0.001)
model3.fit(padded_train_lyrics, y_train_2, batch_size = 256, epochs = 50)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x7f932b82ecb0>

In [34]:
# Trial 3: Val accuracy - 0.5458
# Score the trial-3 model on the validation set (calculate_accuracy compares against y_val).
prediction3 = model3.predict(padded_val_lyrics)
calculate_accuracy(prediction3)



0.5458593054318789

__RandomSearch Hyperparameter Tuning__

In [18]:
class MyHyperModel(kt.HyperModel):
    """KerasTuner hypermodel for a two-layer LSTM sentiment classifier.

    ``build`` declares the architecture search space and ``fit`` additionally
    tunes the batch size. Relies on the module-level ``device_name``,
    ``vocab_length`` and ``length_long_sentence``.
    """

    def build(self, hp):
      """Build and compile one candidate model from the hyperparameters in ``hp``.

      Tuned values: embedding size (100-200, step 50), units of each LSTM
      (32-128, step 32), dropout rate (0.0-0.4, step 0.1) and the Adam
      learning rate (1e-3 or 1e-4). Returns the compiled ``Sequential`` model.
      """
      with tf.device(device_name):
        model = Sequential()
        # embedding
        hp_vector_size = hp.Int('vector_size', 
                                min_value=100, 
                                max_value=200, 
                                step=50)
        model.add(
            Embedding(input_dim=vocab_length,
                      output_dim=hp_vector_size,
                      input_length=length_long_sentence))

        # first lstm
        hp_lstm_units1 = hp.Int('lstm_units1', 
                                min_value=32, 
                                max_value=128, 
                                step=32)
        # return_sequences=True so the second LSTM receives the full sequence.
        model.add(LSTM(hp_lstm_units1, return_sequences=True, kernel_initializer="glorot_uniform"))

        # drop out
        hp_dropout_rate = hp.Float('dropout_rate', 
                                  min_value=0.0, 
                                  max_value=0.4, 
                                  step=0.1)
        model.add(Dropout(hp_dropout_rate))

        # second lstm
        hp_lstm_units2 = hp.Int('lstm_units2', 
                                min_value=32, 
                                max_value=128, 
                                step=32)
        # Only this second LSTM carries L2 kernel regularization (fixed at 0.01, not tuned).
        model.add(LSTM(hp_lstm_units2, kernel_regularizer=regularizers.l2(0.01), kernel_initializer="glorot_uniform"))

        # dense layer
        # 3 output units: one per sentiment class.
        model.add(Dense(3,activation='softmax'))
        
        # compile
        hp_learning_rate = hp.Choice('learning_rate', 
                                    values=[1e-3, 1e-4])
        model.compile(optimizer=Adam(learning_rate=hp_learning_rate),
                      loss='sparse_categorical_crossentropy',
                      metrics=['accuracy'])
        return model

    def fit(self, hp, model, *args, **kwargs):
        """Fit ``model`` with a tuned batch size (256 or 512).

        All other positional and keyword arguments are forwarded to
        ``model.fit`` unchanged, and its return value is returned.
        """
        return model.fit(
            *args,
            batch_size=hp.Choice("batch_size", [256, 512]),
            **kwargs,
        )

In [20]:
# Random search over MyHyperModel: 20 trials, maximising validation accuracy, fixed seed.
# NOTE(review): with overwrite left disabled, KerasTuner presumably resumes from an
# existing "tune_hypermodel" directory instead of starting fresh — confirm before re-running.
tuner = kt.RandomSearch(
    MyHyperModel(),
    objective=kt.Objective("val_accuracy", direction="max"),
    max_trials=20,
    # overwrite=True,
    project_name="tune_hypermodel",
    seed=42
)

In [21]:
# Early stopping is currently disabled (callback and its use are commented out),
# so every trial trains for the full 40 epochs.
# stop_early = EarlyStopping(monitor='loss', patience=5)

# random search 
tuner.search(padded_train_lyrics, 
             y_train_2, 
             verbose = 2,
             epochs=40,
             validation_data=(padded_val_lyrics, y_val)
            #  callbacks=[stop_early]       
             )

# Get the optimal hyperparameters
# get_best_hyperparameters returns a list; take the single best entry.
best_hps=tuner.get_best_hyperparameters(num_trials=1)[0]

Trial 20 Complete [00h 03m 26s]
val_accuracy: 0.5458592772483826

Best val_accuracy So Far: 0.5463045239448547
Total elapsed time: 00h 31m 26s


In [22]:
# Report the winning value of every tuned hyperparameter, one per line.
for hp_name in ('vector_size', 'lstm_units1', 'dropout_rate', 'lstm_units2', 'learning_rate', 'batch_size'):
    print(best_hps.get(hp_name))

150
128
0.0
128
0.001
512


In [24]:
# Rebuild a fresh model from the best hyperparameters and train it for 100 epochs,
# recording validation accuracy per epoch in `history`.
model_best = tuner.hypermodel.build(best_hps)
history = model_best.fit(padded_train_lyrics, y_train_2, epochs=100, batch_size=best_hps.get('batch_size'), validation_data=(padded_val_lyrics, y_val))

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [25]:
# Pick the (1-based) epoch with the highest validation accuracy; the earliest one wins ties.
val_acc_per_epoch = history.history['val_accuracy']
best_epoch, _best_val_acc = max(
    enumerate(val_acc_per_epoch, start=1),
    key=lambda epoch_and_acc: epoch_and_acc[1],
)
print(f'Best epoch: {best_epoch:d}')

Best epoch: 1


In [26]:
# Build another fresh model with the best hyperparameters.
model_best_epoch = tuner.hypermodel.build(best_hps)

# Retrain the model
# Trains for exactly `best_epoch` epochs; no validation data is passed in this fit.
model_best_epoch.fit(padded_train_lyrics, y_train_2, epochs=best_epoch, batch_size=best_hps.get('batch_size'))



<keras.callbacks.History at 0x7f5e8994a7d0>

In [27]:
# Evaluate on the held-out test set; the displayed list is presumably [loss, accuracy],
# matching the loss and metric the model was compiled with.
accuracy_best_epoch = model_best_epoch.evaluate(padded_test_lyrics, y_test)
accuracy_best_epoch



[2.27274751663208, 0.5459098219871521]

__Glove embedding__

In [17]:
# Maps each GloVe word to its embedding vector (float32 array).
embeddings_dictionary = {}
embedding_dim = 100

# Load GloVe 100D embeddings
with open('/content/drive/My Drive/Colab Notebooks/content/glove.6B.100d.txt', encoding='utf-8') as fp:
    # Iterate the file object directly so lines are streamed one at a time
    # instead of being read into one big list by readlines().
    for line in fp:
        records = line.split()
        if not records:
            # A blank line has no word; skip it rather than fail on records[0].
            continue
        # First field is the word, the remaining fields are its vector components.
        word = records[0]
        vector_dimensions = np.asarray(records[1:], dtype='float32')
        embeddings_dictionary[word] = vector_dimensions

# embeddings_dictionary

In [18]:
# Build the embedding weight matrix: row i holds the GloVe vector of the word whose
# tokenizer index is i. Rows of words that GloVe does not know stay all-zero.
embedding_matrix = np.zeros((vocab_length, embedding_dim))

for token, row in word_tokenizer.word_index.items():
    if token in embeddings_dictionary:
        embedding_matrix[row] = embeddings_dictionary[token]

embedding_matrix

array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [-0.2687    ,  0.81708002,  0.69896001, ..., -0.40110001,
         0.74656999,  0.31121999],
       [ 0.19073001,  0.56863999,  0.72026998, ..., -0.33460999,
         0.044349  ,  0.57541001],
       ...,
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.41383001, -0.063647  ,  0.34494001, ..., -0.20672999,
         0.22294   , -0.56507999],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ]])

In [22]:
# Two stacked LSTMs on top of an embedding layer initialised with the GloVe matrix.
model_glove = Sequential([
    # embedding: one row per vocabulary index, pre-loaded with the GloVe vectors
    Embedding(input_dim=embedding_matrix.shape[0],
              output_dim=embedding_matrix.shape[1],
              weights = [embedding_matrix],
              input_length=length_long_sentence),
    # first lstm returns the full sequence so the second lstm can consume it
    LSTM(128, return_sequences=True, kernel_initializer='glorot_uniform'),
    # drop out
    Dropout(0.1),
    # second lstm
    LSTM(128, kernel_initializer='glorot_uniform'),
    # dense layer: one output per sentiment class
    Dense(3,activation='softmax'),
])

# compile
model_glove.compile(
    optimizer=Adam(learning_rate=0.001),
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

In [20]:
# Train the GloVe model for 100 epochs while monitoring the validation set.
model_glove.fit(padded_train_lyrics, y_train_2, epochs=100, batch_size = 512, validation_data = (padded_val_lyrics, y_val))

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.callbacks.History at 0x7fe665db4ee0>

In [24]:
# Evaluate the GloVe model on the test set; the result is stored but not displayed.
# NOTE(review): the execution counts show this cell (In [24]) ran after the model was
# rebuilt (In [22]) and fitted for 19 epochs (In [23], the cell below). Run top to
# bottom, it would instead evaluate the model right after the 100-epoch fit.
accuracy_glove = model_glove.evaluate(padded_test_lyrics, y_test)



In [23]:
# Fit the GloVe model for 19 epochs.
# NOTE(review): this trains whatever state `model_glove` currently holds; re-run the
# model-definition cell first if the intent is to train from scratch.
model_glove.fit(padded_train_lyrics, y_train_2, epochs=19, batch_size = 512, validation_data = (padded_val_lyrics, y_val))

Epoch 1/19
Epoch 2/19
Epoch 3/19
Epoch 4/19
Epoch 5/19
Epoch 6/19
Epoch 7/19
Epoch 8/19
Epoch 9/19
Epoch 10/19
Epoch 11/19
Epoch 12/19
Epoch 13/19
Epoch 14/19
Epoch 15/19
Epoch 16/19
Epoch 17/19
Epoch 18/19
Epoch 19/19


<keras.callbacks.History at 0x7fe712c434c0>