# Notebook to fine-tune the LSTM-GloVe model. 
We used the Keras [blog post](https://blog.keras.io/using-pre-trained-word-embeddings-in-a-keras-model.html) explaining how to use pretrained word embeddings in a Keras model when creating this notebook.

This notebook assumes that you have the [preprocessed dataset](https://drive.google.com/file/d/11W4I3tqU7bOsbLLHlcJ66fqHXGpygmF9/view) in the "/content" folder. It also assumes that you have the [pickled GloVe embedding matrices](https://mcgill-my.sharepoint.com/personal/alexa_hernandez_mail_mcgill_ca/_layouts/15/onedrive.aspx?id=%2Fpersonal%2Falexa%5Fhernandez%5Fmail%5Fmcgill%5Fca%2FDocuments%2FGloVe&originalPath=aHR0cHM6Ly9tY2dpbGwtbXkuc2hhcmVwb2ludC5jb20vOmY6L2cvcGVyc29uYWwvYWxleGFfaGVybmFuZGV6X21haWxfbWNnaWxsX2NhL0V2allBS1JnaV9SR3FYYVpnZXZnNUxNQm51VHIzQkJwYjJaRDVGZkpfN2U4MlE_cnRpbWU9Rm1OcHladWcyRWc) in the "/content" folder.   

## Load Data and Prepare Label Index


In [14]:
import pandas as pd

# Load data
df = pd.read_csv("/content/scraped-lyrics-v2-preprocessed.csv")
lyrics = df.lyrics.tolist()

# Generate genres index to map label names to numeric ids
genres = df.category.tolist()
labels_index = {}
labels = []
idx = 0

for g in genres:
  if g not in labels_index:
    labels_index[g] = idx
    idx += 1
  labels.append(labels_index[g])
 
print(f"Labels index: {labels_index}")
print(f"Labels: {labels}")

Labels index: {'Hard Rock': 0, 'Heavy Metal': 1, 'Hip Hop': 2, 'Indie': 3, 'Rock': 4, 'R&B': 5, 'Soul Music': 6, 'Pop': 7, 'Country': 8, 'Rock Alternativo': 9}
Labels: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 

## Tokenize Lyrics 


In [5]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

tokenizer = Tokenizer(num_words=20000)
tokenizer.fit_on_texts(lyrics)
sequences = tokenizer.texts_to_sequences(lyrics)

word_index = tokenizer.word_index
print(f"Found {len(word_index)} unique tokens.")

Found 108740 unique tokens.


## Pad Lyrics

In [6]:
from keras.utils import to_categorical
import numpy as np

data = pad_sequences(sequences, maxlen=600)
labels = to_categorical(np.asarray(labels))
print(f"Shape of data tensor: {data.shape}")
print(f"Shape of label tensor: {labels.shape}")

Shape of data tensor: (58719, 600)
Shape of label tensor: (58719, 10)


## Split data into train-val-test subsets



In [7]:
# split the data into a training set and a validation set
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]
train_end = int(0.7 * data.shape[0])
val_end = int(0.85 * data.shape[0])

x_train = data[:train_end]
y_train = labels[:train_end]
x_val = data[train_end:val_end]
y_val = labels[train_end:val_end]
x_test = data[val_end:]
y_test = labels[val_end:]

print(f"{len(x_train)} training samples, {len(x_val)} validation samples and {len(x_test)} test samples.")

41103 training samples, 8808 validation samples and 8808 test samples.


## Prepare embedding layers for each dimension (e.g., 50, 100, 200, 300)

In [8]:
!pip install pickle5

Collecting pickle5
[?25l  Downloading https://files.pythonhosted.org/packages/f7/4c/5c4dd0462c8d3a6bc4af500a6af240763c2ebd1efdc736fc2c946d44b70a/pickle5-0.0.11.tar.gz (132kB)
[K     |██▌                             | 10kB 6.5MB/s eta 0:00:01[K     |█████                           | 20kB 9.6MB/s eta 0:00:01[K     |███████▍                        | 30kB 11.0MB/s eta 0:00:01[K     |██████████                      | 40kB 7.9MB/s eta 0:00:01[K     |████████████▍                   | 51kB 4.2MB/s eta 0:00:01[K     |██████████████▉                 | 61kB 4.8MB/s eta 0:00:01[K     |█████████████████▍              | 71kB 4.5MB/s eta 0:00:01[K     |███████████████████▉            | 81kB 5.1MB/s eta 0:00:01[K     |██████████████████████▎         | 92kB 5.1MB/s eta 0:00:01[K     |████████████████████████▉       | 102kB 4.1MB/s eta 0:00:01[K     |███████████████████████████▎    | 112kB 4.1MB/s eta 0:00:01[K     |█████████████████████████████▊  | 122kB 4.1MB/s eta 0:00:01[K

In [9]:
# Prepare embeddings layers
import pickle5 as pickle 


def load_embedding_index(dim):
  """Loads the pickled embedding matrix corresponding to the pretrained GloVe embeddings with the inputted dimension."""
  with open(f"/content/glove.6B.{dim}d.pickle", "rb") as f:
    embedding_index = pickle.load(f)
  return embedding_index


def compute_embedding_layer(word_index, embedding_index, dim):
  """Computes the embedding layers for the given word and embedding index."""
  embedding_matrix = np.zeros((len(word_index) + 1, dim))
  for word, i in word_index.items():
    embedding_vector = embedding_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector
  return embedding_matrix

# Acceptable embedding dimensions 
dims = [50, 100, 200, 300]

# Load the embedding index of each dimension
embedding_indices = {dim: load_embedding_index(dim) for dim in dims}
print(f"Loaded embedding index for the following dimensions: {embedding_indices.keys()}")

# Compute embedding layer for each dimension
embedding_matrices = {dim: compute_embedding_layer(word_index, embedding_indices[dim], dim) for dim in dims}
print(f"Computed embedding matrix for the following dimensions: {embedding_matrices.keys()}")
print(f"The embedding matrix for dimension 50 is {embedding_matrices[50]}")

Loaded embedding index for the following dimensions: dict_keys([50, 100, 200, 300])
Computed embedding matrix for the following dimensions: dict_keys([50, 100, 200, 300])
The embedding matrix for dimension 50 is [[ 0.          0.          0.         ...  0.          0.
   0.        ]
 [ 0.41800001  0.24968    -0.41242    ... -0.18411    -0.11514
  -0.78580999]
 [ 0.11891     0.15255    -0.082073   ... -0.57511997 -0.26671001
   0.92120999]
 ...
 [-0.56676    -1.09870005  0.39249    ... -0.0083835  -0.14105
  -0.43687999]
 [ 0.66856003  0.53061998 -2.50189996 ...  0.51340997 -1.22749996
  -0.31600001]
 [ 0.          0.          0.         ...  0.          0.
   0.        ]]


## Build LSTM Model

In [10]:
from keras.models import Sequential
from keras.layers import Embedding, Dense, LSTM, Dropout


def build_model(word_index, labels_index, embedding_dim, embedding_matrix, hidden_units, dropout):
  """Returns LSTM model with inputted configuration."""
  model = Sequential() 
  model.add(Embedding(
      len(word_index) + 1, 
      embedding_dim,
      weights=[embedding_matrix],
      input_length=600,
      trainable=False))
  model.add(LSTM(units=hidden_units)) 
  model.add(Dropout(dropout))
  model.add(Dense(len(labels_index), activation='softmax'))
  model.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['acc']) 
  return model

## Fine-tune LSTM Model

In [11]:
# Hyperparameter values to search over
hidden_units = [50, 75, 100]
dropouts = [0.10, 0.20, 0.30]
embedding_dims = [50, 100, 200, 300]
best_val_acc = 0

# Perform exhaustive grid search over hyperparameter values
for hu in hidden_units:
  for d in dropouts:
    for dim in embedding_dims:
      model = build_model(word_index, labels_index, dim, embedding_matrices[dim], hu, d)
      print(f"Hyperparameters: hidden units = {hu}, dropout = {d}, embedding dimension = {dim}")
      print(model.summary()) 
      hist = model.fit(x_train, y_train, validation_data=(x_val, y_val), epochs=5, batch_size=32) 
      score = model.evaluate(x_test, y_test, verbose=1)
      print(f"Test accuracy: {score[1]}")

Hyperparameters: hidden units = 100, dropout = 0.2, embedding dimension = 200
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 600, 200)          21748200  
_________________________________________________________________
lstm (LSTM)                  (None, 100)               120400    
_________________________________________________________________
dropout (Dropout)            (None, 100)               0         
_________________________________________________________________
dense (Dense)                (None, 10)                1010      
Total params: 21,869,610
Trainable params: 121,410
Non-trainable params: 21,748,200
_________________________________________________________________
None
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Test accuracy: 0.40497276186943054
Hyperparameters: hidden units = 100, dropout = 0.2, embedding dimens