Homework 5: Neural Language Models  (& 🎃 SpOoKy 👻 authors 🧟 data) - Task 3
---

Task 3: Feedforward Neural Language Model (60 points)
--------------------------

For this task, you will create and train neural LMs for both your word-based embeddings and your character-based ones. You should write functions when appropriate to avoid excessive copy+pasting.

### a) First, encode your text into integers (5 points)

In [1]:
# Importing utility functions from Keras
import numpy as np
import keras
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical

# necessary
from keras.models import Sequential
from keras.layers import Dense

# optional
# from keras.layers import Dropout

# if you want fancy progress bars
from tqdm import notebook
from IPython.display import display

# your other imports here
import time
import nltk

import os
import tensorflow as tf

import neurallm_utils as nutils

2023-11-07 11:23:06.703485: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/alexkramer/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# Ask TensorFlow's native logging to suppress its INFO/WARNING/ERROR output.
# NOTE(review): tensorflow was already imported in the first cell, so this
# presumably comes too late to silence messages emitted at import time;
# consider setting it before `import tensorflow` -- TODO confirm.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

In [3]:
# constants you may find helpful. Edit as you would like.
EMBEDDINGS_SIZE = 50  # intended length of one embedding vector (presumably matches the saved embedding files -- verify)
NGRAM = 3 # The ngram language model you want to train; also passed to nutils.read_file_spooky below

In [4]:
# load in necessary data
TRAIN_FILE = 'spooky_author_train.csv' # The file to train your language model on
# Read the corpus twice through the course helper: once with
# by_character=False (word data) and once with by_character=True (character
# data). Both calls use TRAIN_FILE so the path is defined in exactly one place.
data_word = nutils.read_file_spooky(TRAIN_FILE, NGRAM, by_character=False)
data_char = nutils.read_file_spooky(TRAIN_FILE, NGRAM, by_character=True)

In [5]:
# Initialize a Tokenizer and fit on your data
# do this for both the word and character data

# It is used to vectorize a text corpus. Here, it just creates a mapping from
# word to a unique index. (Note: indexing starts from 1; index 0 is reserved for padding)

# CHARACTERS
def _fit_and_encode(texts):
    '''Fit a fresh Tokenizer on `texts` and return (tokenizer, integer-encoded texts).'''
    fitted = Tokenizer()
    fitted.fit_on_texts(texts)
    return fitted, fitted.texts_to_sequences(texts)


# CHARACTERS
tokenizer_char, encoded_char = _fit_and_encode(data_char)

# WORDS
tokenizer_word, encoded_word = _fit_and_encode(data_word)

In [6]:
# print out the size of the word index for each of your tokenizers
# this should match what you calculated in Task 2 with your embeddings


# Report how many entries each tokenizer holds in its index -> token mapping.
print(f"Characters: {len(tokenizer_char.index_word)}")
print(f"Words: {len(tokenizer_word.index_word)}")

Characters: 60
Words: 25374


### b) Next, prepare the sequences to train your model from text (5 points)

#### Fixed n-gram based sequences

In [7]:
# generate your training samples for both word and character data
# print out the first 5 training samples for each
# we have displayed the number of sequences
# to expect for both characters and words

def generate_ngram_training_samples(encoded: list, ngram: int) -> list:
    '''
    Slide a window of `ngram` consecutive items over every encoded sequence.
    Parameters:
        encoded (list): list of sequences (each a list of integer token ids)
        ngram (int): window length n
    return:
        list of lists in the format [[x1, x2, ... , x(n-1), y], ...];
        sequences shorter than `ngram` contribute no samples
    '''
    windows = []
    for sequence in encoded:
        # index of the last position at which a full window still fits
        last_start = len(sequence) - ngram
        windows.extend(
            [sequence[position] for position in range(start, start + ngram)]
            for start in range(last_start + 1)
        )
    return windows

# WORDS
# Build the training windows with the configured n-gram order (NGRAM) instead
# of a hard-coded 3, so they stay consistent with how the data was read.
# The variable names keep their "3" because later cells refer to them.

# WORDS
ngrams3_word = generate_ngram_training_samples(encoded_word, NGRAM)

#CHARACTERS
ngrams3_char = generate_ngram_training_samples(encoded_char, NGRAM)


# WORDS
print('Words: ', ngrams3_word[:5])
print(len(ngrams3_word))

#CHARACTERS
print('Characters: ', ngrams3_char[:5])
print(len(ngrams3_char))

# Spooky data by character should give 2957553 sequences

# Spooky data by words should give 634080 sequences


Words:  [[1, 1, 32], [1, 32, 2956], [32, 2956, 3], [2956, 3, 155], [3, 155, 3]]
634080
Characters:  [[21, 21, 3], [21, 3, 9], [3, 9, 7], [9, 7, 8], [7, 8, 1]]
2957553


### c) Then, split the sequences into X and y and create a Data Generator (20 points)

In [9]:
# 2.5 points

# Note here that the sequences were in the form:
# sequence = [x1, x2, ... , x(n-1), y]
# We still need to separate it into [[x1, x2, ... , x(n-1)], ...], [y1, y2, ...]]
# do that here

# RUN THIS FOR WORDS
# Every sample is [x1, ..., x(n-1), y]: the context goes to X, the final id to y.
X = [sample[:-1] for sample in ngrams3_word]
y = [sample[-1] for sample in ngrams3_word]

# print out the shapes to verify that they are correct
print(len(X), len(y))

634080 634080


In [None]:
# RUN THIS FOR CHARACTERS
# Every sample is [x1, ..., x(n-1), y]: the context goes to X, the final id to y.
X = [sample[:-1] for sample in ngrams3_char]
y = [sample[-1] for sample in ngrams3_char]

# print out the shapes to verify that they are correct
print(len(X), len(y))

In [10]:
# 2.5 points
# Initialize a function that reads the word embeddings you saved earlier
# and gives you back mappings from words to their embeddings and also
# indexes from the tokenizers to their embeddings

# the "0" index of the Tokenizer is assigned for the padding token. Initialize
# the vector for padding token as all zeros of embedding size
# this adds one to the number of embeddings that were initially saved
# (and increases your vocab size by 1)

def read_embeddings(filename: str, tokenizer: "Tokenizer") -> "tuple[dict, dict]":
    '''Loads and parses embeddings that were trained and saved earlier.

    The file is expected to start with a header line whose second
    space-separated field is the vector dimension, followed by one
    "token v1 v2 ... vd" line per token. Blank lines are ignored.

    Parameters:
        filename (str): path to the embeddings file
        tokenizer: tokenizer used to tokenize the data (its `word_index`
            supplies the token -> index mapping)
    Returns:
        (dict): mapping from word (or character) to its embedding vector
        (dict): mapping from tokenizer index to its embedding vector;
            index 0 (padding) maps to an all-zero vector
    Raises:
        ValueError: if the header line does not contain a dimension field
        KeyError: if a token in the file is unknown to the tokenizer
    '''
    word_dict = {}   # also works for characters
    index_dict = {}
    with open(filename, "r", encoding="utf-8") as embeddings_file:
        header = embeddings_file.readline().split()
        if len(header) < 2:
            raise ValueError(
                "embeddings file %r has no '<count> <dimension>' header line" % filename)
        # the padding token (index 0) gets an all-zero vector of embedding size
        index_dict[0] = [0.0] * int(header[1])

        # iterate lazily instead of materialising every line with readlines()
        for line in embeddings_file:
            if not line.strip():
                continue  # a blank / trailing line carries no embedding
            fields = line.rstrip("\r\n").split(' ')
            key_word = fields[0]
            key_index = tokenizer.word_index[key_word]
            # empty fields come from stray double/trailing spaces
            value = [float(component) for component in fields[1:] if component]

            word_dict[key_word] = value
            index_dict[key_index] = value
    return word_dict, index_dict

In [11]:
# 10 points
## WE NEED THE OUTPUT TO BE X, y
# X is an arrayed list of everything in the batch concatenated!!!
# y is an array of the one-hot encoded

def data_generator(X: list, y: list, num_sequences_per_batch: int, index_2_embedding: dict, tokenizer, for_feedforward: bool = False) -> (list,list):
    '''
    Endless generator of training batches.
    https://wiki.python.org/moin/Generators
    https://realpython.com/introduction-to-python-generators/

    Each yielded batch is a pair (inputs, labels):
      * inputs: for every sample, the embeddings of its context indices
        concatenated into one flat vector
      * labels: the one-hot encoding of the target index
        (see the to_categorical function), with
        len(tokenizer.index_word) + 1 classes
    After the last batch (which may be smaller than the others) the
    generator starts over from the beginning of X.

    Parameters:
        X (list): context index sequences, one per sample
        y (list): target index for each sample in X
        num_sequences_per_batch (int): batch size, must be positive
        index_2_embedding (dict): tokenizer index -> embedding vector (list)
        tokenizer: fitted tokenizer, used only for the number of classes
        for_feedforward (bool): currently unused -- the batches are always
            built in the flattened feed-forward layout
    Raises (on the first next() call):
        ValueError: if the batch size is not positive, X is empty, or y has
            fewer entries than X. Without these checks an empty X or a
            negative batch size would loop forever without yielding.
    '''
    if num_sequences_per_batch <= 0:
        raise ValueError("num_sequences_per_batch must be a positive integer")
    if len(X) == 0:
        raise ValueError("X is empty: there are no sequences to batch")
    if len(y) < len(X):
        raise ValueError("y must provide a label for every sequence in X")

    # the vocabulary does not change while training, so compute this once
    num_classes = len(tokenizer.index_word) + 1
    while True:
        for start in range(0, len(X), num_sequences_per_batch):
            # slicing clamps at len(X), so the final batch may be shorter
            stop = start + num_sequences_per_batch
            batch_X = [
                [component for index in sequence for component in index_2_embedding[index]]
                for sequence in X[start:stop]
            ]
            # one-hot encode the whole batch of labels in a single call
            batch_y = to_categorical(y[start:stop], num_classes=num_classes)
            yield np.array(batch_X), np.array(batch_y)

In [12]:
# 5 points

# initialize your data_generator for both word and character data
# print out the shapes of the first batch to verify that it is correct for both word and character data

## RUN THIS FOR WORDS
# Load the saved word embeddings, keyed by token and by tokenizer index.
word_2_embeddings, index_2_embeddings = read_embeddings("spooky_embedding_word.txt", tokenizer_word)

# Examples:
num_sequences_per_batch = 128 # this is the batch size
# Floor division: a trailing partial batch is not counted in steps_per_epoch.
steps_per_epoch = len(X)//num_sequences_per_batch  # Number of batches per epoch
# X and y are notebook globals; they must hold the word-level samples here
# (the character cell assigns the same names).
train_generator = data_generator(X, y, num_sequences_per_batch, index_2_embeddings, tokenizer_word, for_feedforward=True)

#sample=next(train_generator) # this is how you get data out of generators
#sample[0].shape # (batch_size, (n-1)*EMBEDDINGS_SIZE), e.g. (128, 100) with NGRAM = 3 and EMBEDDINGS_SIZE = 50
#sample[1].shape   # (batch_size, |V|) to_categorical


In [None]:
# RUN THIS FOR CHARACTERS
# Load the saved character embeddings, keyed by token and by tokenizer index.
# NOTE: this rebinds the same global names the word cell uses.
word_2_embeddings, index_2_embeddings = read_embeddings("spooky_embedding_char.txt", tokenizer_char)


# Examples:
num_sequences_per_batch = 128 # this is the batch size
# Floor division: a trailing partial batch is not counted in steps_per_epoch.
steps_per_epoch = len(X)//num_sequences_per_batch  # Number of batches per epoch
# X and y are notebook globals; they must hold the character-level samples here
# (the word cell assigns the same names).
train_generator = data_generator(X, y, num_sequences_per_batch, index_2_embeddings, tokenizer_char, for_feedforward=True)


### d) Train & __save__ your models (15 points)

In [30]:
# 15 points

# code to train a feedforward neural language model for
# both word embeddings and character embeddings
# make sure not to just copy + paste to train your two models
# (define functions as needed)

#5ep H
# Feed-forward LM for the word data: two hidden layers and one output unit per
# class (vocabulary size + 1 for the padding index 0).
# NOTE(review): the output layer uses 'sigmoid' with categorical_crossentropy;
# 'softmax' is the conventional pairing for one-hot targets. Left unchanged
# because sentence generation thresholds these outputs -- revisit together.
model = Sequential(
[Dense(100, activation="exponential", name="1"), # relu > softmax
 Dense(100, activation="exponential", name="2"), # relu > softmax
 Dense(len(tokenizer_word.word_index)+1, activation ='sigmoid', name="3")])


# layers.Activation('softmax')

loss_fn = 'categorical_crossentropy'
# keras.metrics.Accuracy() counts exact equality between y_true and y_pred,
# which is practically never true for one-hot labels vs. float outputs (hence
# accuracies around 1e-07). CategoricalAccuracy compares the argmax instead.
model.compile(
    loss=loss_fn,
    optimizer='adam',
    metrics=[keras.metrics.CategoricalAccuracy(name="accuracy")])


# train your models for between 3 & 5 epochs
# on Felix's machine, this takes ~ 24 min for character embeddings and ~ 10 min for word embeddings
# DO NOT EXPECT ACCURACIES OVER 0.5 (and even that is very high for this many epochs)
# We recommend starting by training for 1 epoch

# Define your model architecture using Keras Sequential API
# Use the adam optimizer instead of sgd
# add cells as desired


In [31]:
### TESTING 
"""
model = Sequential(
[Dense(100, activation="exponential", name="1"),
 Dense(100, activation="exponential", name="2"),
 Dense(len(tokenizer.word_index)+1, activation ="sigmoid", name="3")]
)
#5ep G
model = Sequential(
[Dense(100, activation="exponential", name="1"), # relu > softmax
 Dense(160, activation="exponential", name="2"), # relu > softmax
 Dense(len(tokenizer.word_index)+1, activation ='sigmoid', name="3")])

#5ep e
model = Sequential(
[Dense(100, activation="relu", name="1"), # relu > softmax
 Dense(160, activation="relu", name="2"), # relu > softmax
 Dense(len(tokenizer.word_index)+1, activation ='sigmoid', name="3")])
#sigmoid >  > relu

#5ep F
model = Sequential(
[Dense(100, activation="selu", name="1"), # relu > softmax
 Dense(160, activation="selu", name="2"), # relu > softmax
 Dense(len(tokenizer.word_index)+1, activation ='sigmoid', name="3")])
Epoch 1/2
4953/4953 [==============================] - 453s 91ms/step - loss: 5.7491 - accuracy: 1.3446e-07
Epoch 2/2
4953/4953 [==============================] - 456s 92ms/step - loss: 5.3615 - accuracy: 1.5243e-06

#5ep E accuracy: 0.0285
model = Sequential(
[Dense(100, activation="relu", name="1"), # relu > softmax
 Dense(160, activation="relu", name="2"), # relu > softmax
 Dense(len(tokenizer.word_index)+1, activation ='sigmoid', name="3")])

#5ep A
model = Sequential(
[Dense(128*200, activation="relu", name="1"), # relu > softmax
 Dense(160, activation="relu", name="2"), # relu > softmax
 Dense(len(tokenizer.word_index)+1, activation ='sigmoid', name="3")]
) -> loss: 4.9513 - accuracy: 0.0329 @epoch 5

#5ep D
model = Sequential(
[Dense(128*200, activation="relu", name="1"), # relu > softmax
 Dense(200, activation="relu", name="2"), # relu > softmax
 Dense(200, activation="relu", name="3"), # relu > softmax
 Dense(len(tokenizer.word_index)+1, activation ='sigmoid', name="4")]
)
# loss: 4.8354 - accuracy: 0.0300

#5ep C
model = Sequential(
[Dense(128*200, activation="relu", name="1"), # relu > softmax
 Dense(180, activation="relu", name="2"), # relu > softmax
 Dense(160, activation="relu", name="3"), # relu > softmax
 Dense(len(tokenizer.word_index)+1, activation ='sigmoid', name="4")]
) -> loss: 4.8550 - accuracy: 0.0312

#5ep B
model = Sequential(
[Dense(128*200, activation="relu", name="1"), # relu > softmax
 Dense(180, activation="relu", name="2"), # relu > softmax
 Dense(len(tokenizer.word_index)+1, activation ='sigmoid', name="3")]
) -> loss: 4.9408 - accuracy: 0.0022
"""

In [32]:
# Here is some example code to train a model with a data generator
# The generator already yields complete batches, so `batch_size` is not passed:
# Keras documents that it should not be specified for generator inputs.
# steps_per_epoch tells Keras how many generator batches make up one epoch.
model.fit(x=train_generator,
          steps_per_epoch=steps_per_epoch,
          epochs=3)

In [None]:
# spooky data model by character for 5 epochs takes ~ 24 min on Felix's computer
# with adam optimizer, gets accuracy of 0.3920



# spooky data model by word for 5 epochs takes 10 min on Felix's computer
# results in accuracy of 0.2110


In [None]:
# save your trained models so you can re-load instead of re-training each time
# also, you'll need these to generate your sentences!
# Save whatever `model` currently is (expected: the trained word model) to a
# .keras file. `filepath` is a notebook global that the load cell reads later.
filepath = 'word_FFNN_3epG.keras' 
model.save(filepath)

In [None]:
# RUN FOR CHARACTERS
# Feed-forward LM for the character data: two hidden layers and one output
# unit per class (character vocabulary size + 1 for the padding index 0).
# NOTE(review): the output layer uses 'sigmoid' with categorical_crossentropy;
# 'softmax' is the conventional pairing for one-hot targets. Left unchanged
# because sentence generation thresholds these outputs -- revisit together.
model = Sequential(
[Dense(100, activation="exponential", name="1"), # relu > softmax
 Dense(225, activation="relu", name="2"), # relu > softmax
 Dense(len(tokenizer_char.word_index) + 1, activation ='sigmoid', name="3")])

loss_fn = 'categorical_crossentropy'
# keras.metrics.Accuracy() counts exact equality between y_true and y_pred,
# which is practically never true for one-hot labels vs. float outputs.
# CategoricalAccuracy compares the argmax of prediction and label instead.
model.compile(
    loss=loss_fn,
    optimizer='adam',
    metrics=[keras.metrics.CategoricalAccuracy(name="accuracy")])


In [None]:
# RUN FOR CHARACTERS
# The generator already yields complete batches, so `batch_size` is not passed:
# Keras documents that it should not be specified for generator inputs.
# steps_per_epoch tells Keras how many generator batches make up one epoch.
model.fit(x=train_generator,
          steps_per_epoch=steps_per_epoch,
          epochs=3)

In [None]:
# RUN FOR CHARACTERS
# Save whatever `model` currently is (expected: the trained character model)
# to a .keras file. This rebinds the global `filepath` used by the load cell.
filepath = 'char_FFNN_3epG.keras' 
model.save(filepath)

### e) Generate Sentences (15 points)

In [None]:
# load your models if you need to
# Bind the loaded model to `model`; without the assignment the loaded model
# is discarded immediately and the generation cells keep using the old object.
# `filepath` is whichever save cell ran last -- set it explicitly to load the
# other model.
model = tf.keras.saving.load_model(
    filepath, custom_objects=None, compile=True, safe_mode=True)

In [None]:
# 10 points
# # generate a sequence from the model until you get an end of sentence token
import random
def generate_seq(model: Sequential, tokenizer: Tokenizer, word_2_embedding, seed: list, verbose: bool = False, words: bool = True, max_len: int = 100, binary: bool=True):
    '''
    Generate tokens one at a time until an end-of-sentence token ("</s>"),
    the padding token (""), or the length limit is reached.

    Parameters:
        model: your neural network
        tokenizer: the keras preprocessing tokenizer
        word_2_embedding: the word to embedding dict
        seed: [w1, w2, w(n-1)]; must not be empty
        verbose: print the sentence as it grows (also forwarded to
            model.predict)
        words: True for word tokens (joined with spaces), False for characters
        max_len: generation stops once the sentence is longer than this
        binary: if True, sample uniformly among all tokens whose output is
            at least 0.01; otherwise sample among the highest-scoring token(s)
    Returns: string sentence
    Raises:
        ValueError: if seed is empty
        KeyError: if a context token has no entry in word_2_embedding
    '''
    if not seed:
        raise ValueError("seed must contain at least one token")

    # NOTE(review): the context is fixed at the last 2 tokens, i.e. this
    # assumes a trigram model -- generalise together with the model input size.
    context_size = 2
    num_outputs = len(tokenizer.index_word) + 1  # +1 for the padding index 0

    sentence = list(seed)
    while sentence[-1] not in ["</s>", ""] and len(sentence) <= max_len:
        # Use the embeddings passed in as an argument (not a notebook global),
        # so the word and character models each see their own vectors.
        features = []
        for token in sentence[-context_size:]:
            features += word_2_embedding[token]
        next_vector = model.predict(np.array([features]), verbose=verbose)[0]  # one score per class

        threshold = 0.01 if binary else max(next_vector)
        candidates = [
            '' if index == 0 else tokenizer.index_word[index]
            for index in range(num_outputs)
            if next_vector[index] >= threshold
        ]
        if not candidates:
            break
        sentence.append(random.choice(candidates))

        if verbose:
            # "\r" returns the cursor to the line start so progress overwrites itself
            separator = ' ' if words else ''
            print(separator.join(sentence), end="\r")
    return format_sentence(sentence, words)


def format_sentence(sentence: list, words: bool = True):
    '''
    Turn a list of generated tokens into a readable string.

    Parameters:
        sentence: list of tokens; a leading "<s>" and a trailing "</s>" are
            removed if present. May be empty.
        words: if True, tokens are words and are joined with single spaces;
            if False, tokens are characters and "_" is shown as a space
    Returns: the formatted string ("" when nothing remains)
    '''
    tokens = list(sentence)
    # Guard each check: the list can be empty, or become empty once "<s>" is removed.
    if tokens and tokens[0] == "<s>":
        tokens = tokens[1:]
    if tokens and tokens[-1] == "</s>":
        tokens = tokens[:-1]
    if words:
        # `in` on a string is a substring test, so this drops the punctuation
        # tokens . , ! ? ; and also the empty (padding) token
        pieces = [' ' + word for word in tokens if word not in '.,!?;']
    else:
        pieces = [letter.replace('_', ' ') for letter in tokens]
    return ''.join(pieces).strip(' ')

In [None]:
# Generate one sentence from the seed ['<s>', 'earth'] in word mode.
# NOTE(review): `model` and `word_2_embeddings` are notebook globals that the
# character cells also assign -- make sure the word versions are the ones
# currently loaded before running this cell.
word_results = generate_seq(model, tokenizer_word, word_2_embeddings, ['<s>','earth'], verbose=True, words=True)

In [None]:
# Generate one sentence from the seed ['<s>', 't'] in character mode.
# NOTE(review): `model` and `word_2_embeddings` are notebook globals shared
# with the word cells -- make sure the character versions are the ones
# currently loaded before running this cell.
char_results = generate_seq(model, tokenizer_char, word_2_embeddings, ['<s>','t'], verbose=True, words=False)

In [None]:
# 5 points
# Show the sentence produced by each model, each under its own heading.
print(f"Sentence generated by words: \n {word_results} \n\n")

print(f"Sentence generated by characters: \n {char_results}")
# generate and display one sequence from both the word model and the character model
# do not include <s> or </s> in your displayed sentences
# make sure that you can read the output easily (i.e. don't just print out a list of tokens)

# you may leave _ as _ or replace it with a space if you prefer

In [None]:
# generate 100 example sentences with each model and save them to a file, one sentence per line
# do not include <s> and </s> in your saved sentences (you'll use these sentences in your next task)
# this will produce two files, one for each model
def generate_multiple(n: int, filename: str, model, tokenizer, word_2_embedding, words: bool = True): # seeds: list,
    '''
    Generate `n` sentences and write them to `filename`, one per line.

    Each sentence is seeded with '<s>' followed by a random token from the
    tokenizer's vocabulary. A failed generation attempt is retried with a new
    seed; the sentences collected so far are written even if generation is
    interrupted or gives up.

    Parameters:
        n: number of sentences to generate (nothing happens if n <= 0)
        filename: output path (overwritten)
        model: trained model passed through to generate_seq
        tokenizer: fitted tokenizer passed through to generate_seq
        word_2_embedding: token -> embedding dict passed through to generate_seq
        words: True for a word model, False for a character model
    Raises:
        RuntimeError: if generation fails many times in a row, which points
            to a systematic problem rather than an unlucky seed
    '''
    if n <= 0:
        return

    max_consecutive_failures = 100
    vocabulary = list(tokenizer.word_index)  # built once, not per attempt
    sentences = []
    consecutive_failures = 0
    try:
        while len(sentences) < n:
            seed = ['<s>', random.choice(vocabulary)]
            try:
                seq = generate_seq(model, tokenizer, word_2_embedding, seed, verbose=False, words=words)
            except Exception as err:
                # Best-effort retry, but bounded; KeyboardInterrupt is not an
                # Exception subclass, so the loop can still be interrupted.
                consecutive_failures += 1
                if consecutive_failures >= max_consecutive_failures:
                    raise RuntimeError(
                        f"sentence generation failed {consecutive_failures} times in a row"
                    ) from err
                continue
            consecutive_failures = 0
            sentences.append(seq)
            print(seq, end='\n\n\r')
    finally:
        # Write once (not on every iteration); no trailing newline after the last sentence.
        with open(filename, 'w', encoding='utf-8') as outputfile:
            outputfile.write('\n'.join(sentences))

In [None]:
# Write 100 word-model sentences, one per line. Uses `tokenizer_word`: the bare
# name `tokenizer` is not defined in the cells shown and would raise NameError.
generate_multiple(100, "100_sentences_word.txt", model, tokenizer_word, word_2_embeddings, words=True)



In [None]:
# Uses `tokenizer_word`: the bare name `tokenizer` is not defined in the cells
# shown and would raise NameError.
# NOTE(review): this writes only 20 sentences to the same file as the
# 100-sentence cell and therefore overwrites it -- confirm that is intended.
generate_multiple(20, "100_sentences_word.txt", model, tokenizer_word, word_2_embeddings, words=True)

In [None]:
# Write 100 character-model sentences, one per line.
# NOTE(review): `model` and `word_2_embeddings` are shared notebook globals --
# the character versions must be the ones currently loaded.
generate_multiple(100, "100_sentences_char.txt", model, tokenizer_char, word_2_embeddings, words=False) 
# (this call already passes tokenizer_char; no tokenizer change is needed here)