In [None]:
# Mount Google Drive at /content/drive (Colab only); DATA_DIR, defined in a later
# cell, points inside this mount.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install gensim
!pip install nltk

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
# import relevant libraries
import re
import random
import joblib
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tqdm.notebook import tqdm

# import nltk and gensim library for token level embeddings
import nltk
import gensim
from nltk.data import find


In [None]:
# Report the GPU devices TensorFlow can see (an empty list means none are visible).
print('GPU name: ', tf.config.list_physical_devices('GPU'))
# Root data folder on the mounted Drive; the CSV and model-weight paths used
# later in this notebook are built from it.
# Be sure to change this to your own working directory.
DATA_DIR = "/content/drive/MyDrive/W266 Project/w266-finalproj/data"

GPU name:  []


In [None]:
# Instantiate the Word2Vec token-level embeddings:
#   1. download NLTK's 'word2vec_sample' package,
#   2. resolve the path of its pruned vector file,
#   3. load that file with gensim as text-format (binary=False) KeyedVectors.
nltk.download('word2vec_sample')
word2vec_sample = str(find('models/word2vec_sample/pruned.word2vec.txt'))
model = gensim.models.KeyedVectors.load_word2vec_format(word2vec_sample, binary=False)

[nltk_data] Downloading package word2vec_sample to /root/nltk_data...
[nltk_data]   Unzipping models/word2vec_sample.zip.


In [None]:
#@title Embedding Matrix Creation
# Width of the embedding matrix (dimensionality of the Word2Vec vectors).
EMBEDDING_DIM = 300

# gensim >= 4.0 removed `KeyedVectors.vocab` (accessing it raises
# AttributeError) in favour of `key_to_index`; gensim 3.x only has `vocab`.
# Both are dicts keyed by word, so take the words from whichever attribute
# this installation provides.
if hasattr(model, 'key_to_index'):
    word2vec_keys = list(model.key_to_index)
else:
    word2vec_keys = list(model.vocab.keys())

# One row per Word2Vec word plus one extra row at the end, left as zeros,
# that is reserved for unknown tokens.
embedding_matrix = np.zeros((len(word2vec_keys) + 1, EMBEDDING_DIM))

# word -> row index in embedding_matrix
vocab_dict = {}

# Every word comes from the model's own vocabulary, so the lookup always
# succeeds; copy its vector into the matching row.
for i, word in enumerate(word2vec_keys):
    embedding_matrix[i] = model[word]
    vocab_dict[word] = i

# we can use the last index at the end of the vocab for unknown tokens
vocab_dict['[UNK]'] = len(vocab_dict)
embedding_matrix.shape

(43982, 300)

In [None]:
# Hard-coded maximum sequence length: the number of tokens in each input window.
# It is passed as seq_length when the corpus is cut into windows and it sets
# the shape of the models' Input layers.
MAX_SEQUENCE_LENGTH = 16

# text-cleaning helpers
# punctuation characters that preprocessText strips out of the text
stopChars = [',','(',')','.','-','[',']','"']

def preprocessText(text):
    """Lower-case `text` and delete every character listed in `stopChars`.

    Args:
        text: the string to clean.

    Returns:
        The lower-cased string with all `stopChars` characters removed.
    """
    lowered = text.lower()
    return ''.join(ch for ch in lowered if ch not in stopChars)

# tokenization
def corpusToList(corpus):
    """Split `corpus` on single spaces and keep only the non-empty pieces.

    Args:
        corpus: the string to split.

    Returns:
        A list of the space-separated pieces, with the empty strings produced
        by leading, trailing or repeated spaces removed.
    """
    return [piece for piece in corpus.split(' ') if piece]
    

def tokenize_data(data):
    """Turn the 'lyrics' column of `data` into one flat list of tokens.

    For each row, newlines, '!' and '?' are padded with spaces so they become
    tokens of their own, '#' is replaced by a space, and the text is then split
    into runs of non-whitespace characters plus individual newline characters.
    The total number of tokens is printed.

    Args:
        data: DataFrame with a 'lyrics' column of strings.

    Returns:
        A list with the tokens of every row, in row order.
    """
    # (old, new) substitutions, applied in this order before tokenizing
    substitutions = (('\n', ' \n '), ('!', ' ! '), ('?', ' ? '), ('#', ' '))

    DP_text = []
    for _, row in data.iterrows():
        lyric = row['lyrics']
        for old, new in substitutions:
            lyric = lyric.replace(old, new)
        # a token is a run of non-whitespace characters or a single newline
        DP_text.extend(re.findall(r'\S+|\n', lyric))

    print('corpus length:', len(DP_text))
    return DP_text

# Build the token <-> index lookup tables for a vocabulary.
def dictionary_maker(words):
    """Create forward and reverse index maps for `words`, plus an 'UNK' entry.

    Despite the "char" in the names, the entries are whatever items `words`
    contains. Each item is mapped to its position in `words`, and 'UNK' is
    assigned the next index after the last item.

    Args:
        words: sequence of vocabulary items.

    Returns:
        (char_to_int, int_to_char): item -> index and index -> item dicts.
    """
    char_to_int = {token: idx for idx, token in enumerate(words)}
    int_to_char = dict(enumerate(words))

    unk_index = len(int_to_char)
    char_to_int['UNK'] = unk_index
    int_to_char[unk_index] = 'UNK'
    return char_to_int, int_to_char


def make_sentences_and_next_chars(seq_length, DP_text, step):
    """Cut a token list into fixed-length windows and their following tokens.

    A window starts at every `step`-th position for which both the window of
    `seq_length` tokens and the token right after it exist.

    Args:
        seq_length: number of tokens per window.
        DP_text: the flat token sequence.
        step: distance between the starts of consecutive windows.

    Returns:
        (sentences, next_chars): numpy arrays holding the windows and, for
        each window, the token that immediately follows it.
    """
    window_starts = range(0, len(DP_text) - seq_length, step)

    sentences = np.array([DP_text[start: start + seq_length] for start in window_starts])
    next_chars = np.array([DP_text[start + seq_length] for start in window_starts])
    return sentences, next_chars

In [None]:
# Fetch the genre-specific generation corpus.
# NOTE: the variables below keep a `pop_` prefix, but the file loaded here is
# soul_sample.csv.
pop_data = pd.read_csv(f"{DATA_DIR}/02_intermediate/soul_sample.csv")
# drop the columns whose name starts with 'Unnamed'
pop_data = pop_data.loc[:, ~pop_data.columns.str.contains('^Unnamed')]
print(pop_data.head(20).to_string())

# Tokenize the lyrics into one flat token list, build the sorted vocabulary and
# its lookup tables, then cut the corpus into MAX_SEQUENCE_LENGTH-token windows
# (step 1), each paired with the token that follows it.
DP_pop = tokenize_data(pop_data)
pop_words = sorted(list(set(DP_pop)))
pop_char_to_int, pop_int_to_char = dictionary_maker(pop_words)
pop_sentences, pop_next_chars = make_sentences_and_next_chars(MAX_SEQUENCE_LENGTH, DP_pop, 1)
# show the first window and its target token as a sanity check
print(pop_sentences[0])
print(pop_next_chars[0])

                      artist genre                                 title                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                

In [None]:
# helper functions
# use to generate lyrics
def sample(preds, temperature=1.0):
    """Draw one index from a temperature-rescaled probability distribution.

    The probabilities are moved to log space, divided by `temperature`,
    exponentiated and renormalised; a single multinomial draw is then taken.
    The normalisation runs over the whole array and only row 0 is sampled, so
    `preds` is expected to hold a single row.

    Args:
        preds: 2-D array-like of probabilities; row 0 is the one sampled.
        temperature: divisor applied to the log-probabilities.

    Returns:
        The index selected by the multinomial draw.
    """
    scaled_log_probs = np.log(np.asarray(preds).astype('float64')) / temperature
    weights = np.exp(scaled_log_probs)
    distribution = weights / np.sum(weights)
    draw = np.random.multinomial(1, distribution[0], 1)
    return np.argmax(draw)


def generate_lyric(sentence, 
                   lstm_model,
                   seq_length=16,
                   char_to_int = pop_char_to_int, 
                   int_to_char = pop_int_to_char,
                   num_words=100,
                   temperature=.5):
    """Extend a seed token sequence by sampling from `lstm_model`.

    At each step the current window is encoded as a (1, seq_length) index
    array, passed to the model, and the next token is sampled from the model's
    output with `sample`. The window then slides forward by one token.

    Args:
        sentence: sequence of seed tokens (a list or an array of strings).
        lstm_model: callable that maps the (1, seq_length) index array to
            next-token probabilities.
        seq_length: width of the model input window.
        char_to_int: token -> index map; tokens missing from it are encoded
            with its 'UNK' entry.
        int_to_char: index -> token map used to decode the sampled index.
        num_words: number of tokens to generate.
        temperature: sampling temperature forwarded to `sample`.

    Returns:
        The seed tokens followed by the generated tokens, joined by spaces.
    """
    # Copy to a plain list so the slicing/concatenation below also works when
    # the seed is a numpy array (e.g. a row of pop_sentences).
    original = list(sentence)
    # Keep at most the last seq_length tokens; a longer window would index
    # past the end of the (1, seq_length) input array.
    window = original[-seq_length:]
    generated = []

    # NOTE(review): the indices come from char_to_int, while the models'
    # embedding layers are initialised from embedding_matrix - confirm that
    # the two index spaces are meant to line up.
    for _ in tqdm(range(num_words)):
        x = np.zeros((1, seq_length))
        for t, token in enumerate(window):
            try:
                x[0, t] = char_to_int[token]
            except KeyError:
                # token is not in the vocabulary: encode it as unknown
                x[0, t] = char_to_int['UNK']

        pred = lstm_model(x)
        next_index = sample(pred, temperature)
        next_token = int_to_char[next_index]  # index back to token
        generated.append(next_token)
        window = window[1:] + [next_token]  # slide the window forward by one token
    return " ".join(original + generated)

# Baseline LSTM Loading
- Load Baseline LSTM Model

In [None]:
# Baseline network: frozen embeddings -> LSTM -> dense head (no attention)
def baseline_LSTM(lstm_dim,
                dense_layer_dim,
                dropout_rate=0.2):
    """
        Build and compile the baseline next-token LSTM (no attention layer).

        The model takes MAX_SEQUENCE_LENGTH token ids, looks them up in a
        non-trainable embedding initialised from `embedding_matrix`, runs a
        single LSTM that returns only its final output, and feeds that through
        a ReLU dense layer and dropout into a softmax over len(pop_words)
        classes.

        Args:
            lstm_dim: number of units in the LSTM layer.
            dense_layer_dim: number of units in the hidden dense layer.
            dropout_rate: used both as the LSTM recurrent dropout and as the
                rate of the dropout layer after the dense layer.

        Returns:
            The compiled keras.Model (adam optimizer, sparse categorical
            cross-entropy loss, accuracy metric).
    """
    # reset the Keras session and fix the numpy / TensorFlow seeds
    tf.keras.backend.clear_session()
    np.random.seed(0)
    tf.random.set_seed(0)

    layers = tf.keras.layers
    embedding_rows, embedding_width = embedding_matrix.shape

    # token-id input followed by the frozen embedding lookup
    token_ids = layers.Input(shape=(MAX_SEQUENCE_LENGTH,),
                             dtype='int64',
                             name='source_input_no_att')
    embedding_layer = layers.Embedding(
        input_dim=embedding_rows,
        output_dim=embedding_width,
        embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix),
        input_length=MAX_SEQUENCE_LENGTH,
        trainable=False,
        name='source_embedding_layer_no_att')
    embedded_tokens = embedding_layer(token_ids)

    # the LSTM returns only its last output (no sequences, no states)
    lstm_layer = layers.LSTM(lstm_dim,
                             return_sequences=False,
                             return_state=False,
                             recurrent_dropout=dropout_rate,
                             name='LSTM_Layer')
    hidden = lstm_layer(embedded_tokens)

    # one hidden dense layer with dropout, then the softmax output layer
    hidden = layers.Dense(dense_layer_dim, activation='relu')(hidden)
    hidden = layers.Dropout(rate=dropout_rate)(hidden)
    next_token_probs = layers.Dense(len(pop_words), activation='softmax')(hidden)

    lstm_model = keras.Model(inputs=token_ids, outputs=next_token_probs)
    lstm_model.compile(optimizer='adam',
                       loss='sparse_categorical_crossentropy',
                       metrics=['accuracy'])
    return lstm_model

In [None]:
# Build the baseline model with a 256-unit LSTM and a 256-unit dense layer
# (dropout rate 0.2), then print its layer summary.
lstm_model = baseline_LSTM(lstm_dim=256,
                dense_layer_dim=256,
                dropout_rate=0.2)
lstm_model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 source_input_no_att (InputL  [(None, 16)]             0         
 ayer)                                                           
                                                                 
 source_embedding_layer_no_a  (None, 16, 300)          13194600  
 tt (Embedding)                                                  
                                                                 
 LSTM_Layer (LSTM)           (None, 256)               570368    
                                                                 
 dense (Dense)               (None, 256)               65792     
                                                                 
 dropout (Dropout)           (None, 256)               0         
                                                                 
 dense_1 (Dense)             (None, 27429)             704925

In [None]:
# Load the saved baseline weights (soul_lstm_baseline.h5 under DATA_DIR) into lstm_model.
lstm_model.load_weights(f"{DATA_DIR}/03_model_training/lstm_models/soul_lstm_baseline.h5") 

# Attention LSTM
LSTM model with attention

In [None]:
# Attention network: frozen embeddings -> LSTM -> multi-head self-attention -> dense head
def multiheaded_attention_lstm(lstm_dim,
                               dense_layer_dim,
                               num_heads,
                               key_dim,
                               dropout_rate=0.2):
    """
        Build and compile the next-token LSTM with multi-headed self-attention.

        The model takes MAX_SEQUENCE_LENGTH token ids, looks them up in a
        non-trainable embedding initialised from `embedding_matrix`, and runs
        an LSTM that returns its full output sequence and final states. The
        output sequence attends to itself through a MultiHeadAttention layer,
        the result is average-pooled over the sequence axis and concatenated
        with the LSTM's final hidden state, and that vector goes through a
        ReLU dense layer and dropout into a softmax over len(pop_words)
        classes.

        Args:
            lstm_dim: accepted but not used; the LSTM is sized from
                embedding_matrix.shape[1] instead.
            dense_layer_dim: number of units in the hidden dense layer.
            num_heads: number of attention heads.
            key_dim: key dimension passed to the attention layer.
            dropout_rate: used as the LSTM recurrent dropout, the attention
                dropout, and the rate of the dropout layer after the dense
                layer.

        Returns:
            The compiled keras.Model (adam optimizer, sparse categorical
            cross-entropy loss, accuracy metric).
    """
    # reset the Keras session and fix the numpy / TensorFlow seeds
    tf.keras.backend.clear_session()
    np.random.seed(0)
    tf.random.set_seed(0)

    layers = tf.keras.layers
    embedding_rows, embedding_width = embedding_matrix.shape

    # token-id input followed by the frozen embedding lookup
    token_ids = layers.Input(shape=(MAX_SEQUENCE_LENGTH,),
                             dtype='int64',
                             name='source_input_with_att')
    embedding_layer = layers.Embedding(
        input_dim=embedding_rows,
        output_dim=embedding_width,
        embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix),
        input_length=MAX_SEQUENCE_LENGTH,
        trainable=False,
        name='source_embedding_layer_wit_att')
    embedded_tokens = embedding_layer(token_ids)

    # the LSTM width is the embedding width (lstm_dim is not consulted); it
    # returns the per-step outputs plus the final hidden and cell states
    lstm_layer = layers.LSTM(embedding_width,
                             return_sequences=True,
                             return_state=True,
                             recurrent_dropout=dropout_rate,
                             name='LSTM_Layer')
    lstm_sequence, final_hidden_state, _ = lstm_layer(embedded_tokens)

    # self-attention: the LSTM output sequence is both query and value
    attention_layer = layers.MultiHeadAttention(num_heads=num_heads,
                                                key_dim=key_dim,
                                                dropout=dropout_rate,
                                                name='attention_layer')
    attended = attention_layer(lstm_sequence, lstm_sequence)

    # collapse the sequence axis, then join with the final hidden state
    pooled_context = layers.GlobalAveragePooling1D()(attended)
    hidden = layers.Concatenate()([pooled_context, final_hidden_state])

    # one hidden dense layer with dropout, then the softmax output layer
    hidden = layers.Dense(dense_layer_dim, activation='relu')(hidden)
    hidden = layers.Dropout(rate=dropout_rate)(hidden)
    next_token_probs = layers.Dense(len(pop_words), activation='softmax')(hidden)

    lstm_model = keras.Model(inputs=token_ids, outputs=next_token_probs)
    lstm_model.compile(optimizer='adam',
                       loss='sparse_categorical_crossentropy',
                       metrics=['accuracy'])
    return lstm_model

In [None]:
# Build the attention model (10 heads, key_dim 5, dropout rate 0.2) and load its
# saved weights.
# NOTE: lstm_dim is passed here, but multiheaded_attention_lstm sizes its LSTM
# from embedding_matrix.shape[1] rather than from lstm_dim.
attention_lstm_model = multiheaded_attention_lstm(lstm_dim=256,
                                        dense_layer_dim=256,
                                        num_heads = 10,
                                        key_dim = 5,
                                        dropout_rate=0.2)
# The path below is a placeholder: replace "<replace with the model h5 file>"
# with the real .h5 weights file before running this cell.
attention_lstm_model.load_weights(f"{DATA_DIR}/<replace with the model h5 file>") 