# Session 8 - Language modelling with RNNs (Text Generation)

In [21]:
# data processing tools
import string, os 
import pandas as pd
import numpy as np
np.random.seed(42)

# keras module for building LSTM 
import tensorflow as tf
tf.random.set_seed(42)
import tensorflow.keras.utils as ku 
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

# suppress warnings
import warnings
warnings.filterwarnings("ignore")
warnings.simplefilter(action='ignore', category=FutureWarning)

## Some helper functions

In [37]:
def clean_text(txt):
    """Normalise one raw text string before tokenization.

    Every character listed in ``string.punctuation`` is removed, the result
    is lowercased, and any character that has no ASCII representation is
    dropped (accented letters are deleted, not transliterated).

    Args:
        txt: the raw text as a ``str``.

    Returns:
        The cleaned, lowercase, ASCII-only string.
    """
    # deletion table: each ASCII punctuation mark maps to "nothing"
    strip_punctuation = str.maketrans("", "", string.punctuation)
    lowered = txt.translate(strip_punctuation).lower()
    # round-trip through bytes so that non-ASCII characters are silently discarded
    return lowered.encode("utf8").decode("ascii", "ignore")

def get_sequence_of_tokens(tokenizer, corpus):
    """Expand every text in ``corpus`` into its n-gram prefix sequences.

    Each text is encoded with ``tokenizer.texts_to_sequences`` and then
    contributes one sequence per prefix of two or more tokens, e.g. the ids
    ``[a, b, c]`` yield ``[a, b]`` and ``[a, b, c]``. Texts that encode to
    fewer than two tokens contribute nothing.

    Args:
        tokenizer: object exposing ``texts_to_sequences(list_of_texts)``.
        corpus: iterable of text strings.

    Returns:
        A flat list of token-id lists, in corpus order.
    """
    prefixes = []
    for text in corpus:
        encoded = tokenizer.texts_to_sequences([text])[0]
        # prefix lengths run from 2 tokens up to the full text
        prefixes.extend(encoded[:end] for end in range(2, len(encoded) + 1))
    return prefixes

def generate_padded_sequences(input_sequences, num_classes=None):
    """Pad the n-gram sequences and split them into model inputs and labels.

    All sequences are left-padded with 0 to the length of the longest one.
    The last token of each padded sequence becomes the label (one-hot
    encoded); everything before it becomes the predictor row.

    Args:
        input_sequences: list of token-id lists.
        num_classes: width of the one-hot labels (the vocabulary size).
            When omitted, the module-level ``total_words`` is used, which
            keeps existing one-argument calls working but requires that
            variable to have been defined first.

    Returns:
        A tuple ``(predictors, label, max_sequence_len)``.

    Raises:
        ValueError: if ``input_sequences`` is empty.
    """
    if num_classes is None:
        # backward-compatible fallback to the notebook-level vocabulary size
        num_classes = total_words
    if len(input_sequences) == 0:
        raise ValueError("input_sequences is empty; there is nothing to pad")

    # get the length of the longest sequence
    max_sequence_len = max(len(x) for x in input_sequences)
    # make every sequence the length of the longest one
    input_sequences = np.array(pad_sequences(input_sequences,
                                            maxlen=max_sequence_len,
                                            padding='pre'))

    # last column is the word to predict, the rest is its context
    predictors, label = input_sequences[:, :-1], input_sequences[:, -1]
    label = ku.to_categorical(label,
                              num_classes=num_classes)
    return predictors, label, max_sequence_len

def create_model(max_sequence_len, total_words):
    """Build and compile the next-word prediction network.

    The stack is: a trainable 10-dimensional embedding, one LSTM layer with
    100 units, dropout, and a softmax layer over the whole vocabulary.

    Args:
        max_sequence_len: length of the padded sequences, label included.
        total_words: vocabulary size; used as the embedding input dimension
            and as the number of output classes.

    Returns:
        The compiled ``Sequential`` model (categorical cross-entropy, Adam).
    """
    # the last token of each padded sequence is the label, so the network sees one fewer
    context_len = max_sequence_len - 1
    layers = [
        # Input embedding layer
        Embedding(total_words, 10, input_length=context_len),
        # Hidden layer - LSTM
        LSTM(100),
        # during training, randomly zero 10% of the LSTM outputs (activations, not weights)
        Dropout(0.1),
        # Output layer: one probability per vocabulary index
        Dense(total_words, activation='softmax'),
    ]
    network = Sequential()
    for layer in layers:
        network.add(layer)
    network.compile(loss='categorical_crossentropy', optimizer='adam')
    return network

def generate_text(seed_text, next_words, model, max_sequence_len):
    """Greedily extend ``seed_text`` by ``next_words`` predicted words.

    On every step the current text is tokenized, left-padded to the model's
    input length, and the vocabulary index with the highest predicted
    probability is appended as the next word.

    Note: this function reads the notebook-level ``tokenizer``; it must be
    the same tokenizer the model was trained with.

    Args:
        seed_text: the text to start from.
        next_words: how many words to append.
        model: trained model exposing ``predict``.
        max_sequence_len: padded sequence length used in training (label
            included); the model input is one token shorter.

    Returns:
        The seed text plus the generated words, title-cased.
    """
    # invert the vocabulary once instead of scanning it for every generated word
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list],
                                   maxlen=max_sequence_len-1,
                                   padding='pre')
        # verbose=0 keeps Keras from printing a progress bar for every single word
        probabilities = model.predict(token_list, verbose=0)
        # argmax over a batch of one returns a 1-element array; take the scalar index
        predicted = int(np.argmax(probabilities, axis=1)[0])
        # an index without a vocabulary entry (e.g. the padding index 0) adds an empty word
        output_word = index_to_word.get(predicted, "")
        seed_text += " " + output_word
    return seed_text.title()

## Load the data

In [23]:
# folder holding the news CSV files; joined from components so the separator matches the OS
data_dir = os.path.join("..", "in", "news_data")

We're then going to load the data one file at a time and append *only* the comment bodies to our list of data.

In [24]:
# collect the "commentBody" column from every file whose name contains 'Comments'
all_comments = []
# sorted() makes the load order - and therefore any later sampling - independent of the file system
for filename in sorted(os.listdir(data_dir)):
    if 'Comments' in filename:
        comment_df = pd.read_csv(os.path.join(data_dir, filename))
        # drop missing bodies: pandas reads them as float NaN, which clean_text() cannot iterate over
        all_comments.extend(comment_df["commentBody"].dropna().tolist())

We then clean up a little bit and see how many data points we have.

In [25]:
# drop placeholder entries whose text is literally "Unknown", then count what is left
all_comments = [comment for comment in all_comments if comment != "Unknown"]
len(all_comments)

418481

In [26]:
# create a small, reproducible sample from all_comments
import random
# the stdlib generator is separate from the NumPy / TensorFlow seeds set in the import cell
random.seed(42)
sample_comments = random.sample(all_comments, 10)
len(sample_comments)

10

We call our ```clean_text()``` function and then inspect the first 10 texts.

In [27]:
# normalise every sampled comment, then peek at the first ten results
corpus = list(map(clean_text, sample_comments))
corpus[:10]

['guess i dont get the joke here  ',
 'fuggetabout the opioid crisis brbrmeanwhile let anyone purchase several semiautomatic guns and or a bumpstock kit at will and carry them across state lines or purchase alcohol at any time of day for legal consumption brbrbut get stopped for a broken tail light and have one cannabis cigarette in possession and go to jail ',
 'the diacritical marks in mtley cre arent umlauts but rather dieresesbrbrwhatever they are called they are simply gratitous',
 'vote for candidates who publicly and loudly denounce the nra and its blood money they use to bribe spineless politicians  vote for candidates who have never accepted money from the nra  vote for candidates who pledge never to touch one thin dime of nra moneybrvote against any candidate who has ever accepted nra money with the exception of a politician who gave back the money and publicly and loudly denounces the nra',
 'how about once we stop focusing on the source of these cyber attacks we make a conc

## Tokenize

We're then going to tokenize our data, using the ```Tokenizer()``` class from ```TensorFlow```, about which you can read more [here](https://www.tensorflow.org/api_docs/python/tf/keras/preprocessing/text/Tokenizer).

We then use the ```get_sequence_of_tokens()``` function we defined above, which turns every text into a sequence of tokens based on the vocabulary from the tokenizer.

In [28]:
# this code creates an index of the words in the corpus
tokenizer = Tokenizer()
## tokenization
tokenizer.fit_on_texts(corpus)
total_words = len(tokenizer.word_index) + 1 # +1 accounts for words that arent in the vocabulary. 

In [29]:
# take input and turn it into numerical output
inp_sequences = get_sequence_of_tokens(tokenizer, corpus)
inp_sequences[:10]
# the length of the input sequences are defined by how the words relate to each other

[[92, 15],
 [92, 15, 93],
 [92, 15, 93, 50],
 [92, 15, 93, 50, 1],
 [92, 15, 93, 50, 1, 94],
 [92, 15, 93, 50, 1, 94, 95],
 [96, 1],
 [96, 1, 97],
 [96, 1, 97, 98],
 [96, 1, 97, 98, 99]]

We then want to *pad* our input sequences to make them all the same length.

In [30]:
# padding makes different inputs the same length as the longest input by adding 0's where words are missing in inputs
predictors, label, max_sequence_len = generate_padded_sequences(inp_sequences)
# E.g. with "My cat" and "The big dog" a 0 would be added in the front like "0 My cat"

## Create model

We then use the ```create_model()``` function created above to initialize a model, telling the model the length of sequences and the total size of the vocabulary.

In [31]:
# build the (still untrained) network and print its layer-by-layer summary
model = create_model(max_sequence_len, total_words)
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 206, 10)           3740      
                                                                 
 lstm_1 (LSTM)               (None, 100)               44400     
                                                                 
 dropout_1 (Dropout)         (None, 100)               0         
                                                                 
 dense_1 (Dense)             (None, 374)               37774     
                                                                 
Total params: 85,914
Trainable params: 85,914
Non-trainable params: 0
_________________________________________________________________


2023-03-28 15:32:46.019399: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2023-03-28 15:32:46.022604: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2023-03-28 15:32:46.025451: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

Model training is exactly the same as last week, but instead of document labels, we're fitting the model to predict the next word.

*NB!* This will take some time to train! It took me 35 minutes on UCloud 32xCPU.

In [33]:
# creating history of the model
model.fit(predictors,
          label, 
          epochs=100,
          batch_size=128, 
          verbose=1)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.callbacks.History at 0x137b56050>

When the model has trained, we can then use this to generate *new text*.

In [34]:
# start from the seed text and let the model append its most likely next word, one word at a time
print(generate_text("danish", 5, model, max_sequence_len)) # the 5 is the number of words we want to come after

2023-03-28 15:42:39.677312: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2023-03-28 15:42:39.680683: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2023-03-28 15:42:39.684715: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

Danish I I Dont Is The


In [35]:
# saving
outpath = os.path.join("..", "model", "rnn_model.keras")
# make sure the target folder exists before writing the model file
os.makedirs(os.path.dirname(outpath), exist_ok=True)
tf.keras.saving.save_model(model, outpath, overwrite=True, save_format=None)

In [36]:
# importing: restore the model from the file written by the saving cell
loaded_model = tf.keras.saving.load_model("../model/rnn_model.keras")

2023-03-28 15:42:53.274212: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2023-03-28 15:42:53.279169: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2023-03-28 15:42:53.282803: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

In [19]:
# generate text with the reloaded model to check that it works after the save/load round trip
# NOTE(review): the TypeError shown under this cell looks like stale output from an earlier run
# (execution count 19); the call here passes all four arguments - re-run the cell to refresh it
print(generate_text("danish", 5, loaded_model, max_sequence_len))

TypeError: generate_text() missing 1 required positional argument: 'max_sequence_len'

In [None]:
# OTHER METHOD: persist the model with joblib instead of the Keras file format
# NOTE(review): joblib pickles the object; whether a Keras model survives pickling depends on
# the installed TensorFlow/Keras version - confirm, or prefer the .keras format

import joblib
from joblib import dump
# saving trained model (the model itself, not the training History)
dump(model, "../out/rnn_model.joblib")

# load model
loaded_model = joblib.load("../out/rnn_model.joblib")
# generate with the reloaded object so that the round trip is actually exercised
print(generate_text("danish", 5, loaded_model, max_sequence_len))

## Using pre-trained word embeddings

Instead of having the embedding layer as a trainable parameter, we can instead use a *pretrained word embedding* model like ```word2vec```.

In the following examples, we're using [GloVe embeddings](https://nlp.stanford.edu/projects/glove/). These are trained a little differently from ```word2vec``` but they behave in the same way.

We make use of already trained word embeddings, which are better than anything we could train ourselves on this data.

In [42]:
# defining path
# NOTE(review): open() needs a file, not the folder. The file name below is an assumption based on
# embedding_dim = 100 and the standard GloVe download - adjust it to the file you actually have.
path_to_glove_file = os.path.join("..", "data", "glove", "glove.6B.100d.txt")

# creating pre-trained word embedding index: each line is "<word> <v1> <v2> ..."
embeddings_index = {}
# explicit encoding so the result does not depend on the platform default
with open(path_to_glove_file, encoding="utf8") as f:
    for line in f:
        # split off the word; everything after the first whitespace is the vector
        word, coefs = line.split(maxsplit=1)
        coefs = np.fromstring(coefs, "f", sep=" ")
        embeddings_index[word] = coefs

print("Found %s word vectors." % len(embeddings_index))

KeyboardInterrupt: 

We can define some variables that we're going to use later.

With hits and misses, we're counting how many words in the corpus vocabulary have a corresponding GloVe embedding; misses are the words which appear in our vocabulary but which do not have a GloVe embedding.

In [None]:
num_tokens = total_words  # number of rows in the embedding matrix (one per vocabulary index)
embedding_dim = 100  # number of columns in the embedding matrix (length of each word vector)
hits = 0  # vocabulary words that have a GloVe vector
misses = 0  # vocabulary words that do not

In [None]:
# Prepare embedding matrix
# reset the counters here so that re-running this cell does not double count
hits = 0
misses = 0
# Rows start as zeros, so words not found in the embedding index stay all-zeros.
# This includes the row for index 0, which no word in word_index is written to.
embedding_matrix = np.zeros((num_tokens, embedding_dim))
for word, i in tokenizer.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # copy the pre-trained vector into the row of this word's index
        embedding_matrix[i] = embedding_vector
        hits += 1
    else:
        misses += 1
print("Converted %d words (%d misses)" % (hits, misses))

In [None]:
def create_model(max_sequence_len, total_words):
    """Build and compile the next-word model on top of frozen pre-trained embeddings.

    NOTE: this definition replaces the ``create_model`` defined earlier in
    the notebook. It also reads the notebook-level ``embedding_dim`` and
    ``embedding_matrix``, so the embedding-matrix cell must have run first.

    Args:
        max_sequence_len: length of the padded sequences, label included.
        total_words: vocabulary size; used as the embedding input dimension
            and as the number of output classes.

    Returns:
        The compiled ``Sequential`` model (categorical cross-entropy, Adam).
    """
    # the last token of each padded sequence is the label, so the network sees one fewer
    context_len = max_sequence_len - 1

    # Input embedding layer - initialised from the pre-built matrix and not updated in training
    frozen_embedding = Embedding(
        total_words,
        embedding_dim,
        embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix),
        trainable=False,
        input_length=context_len,
    )

    network = Sequential()
    remaining_layers = (
        LSTM(500),                                  # hidden layer
        Dropout(0.2),                               # rate 0.2
        Dense(total_words, activation='softmax'),   # output layer
    )
    network.add(frozen_embedding)
    for layer in remaining_layers:
        network.add(layer)

    network.compile(loss='categorical_crossentropy', optimizer='adam')
    return network

In [None]:
# rebuild the model; `model` now refers to the network created by the most recent create_model definition
model = create_model(max_sequence_len, total_words)
model.summary()

In [None]:
# train on the same predictors/labels as before and keep the returned History object
history = model.fit(predictors, 
                    label, 
                    epochs=100,
                    batch_size=128, 
                    verbose=1)

In [None]:
# generate 30 words after the seed "china" with the model trained in the previous cell
print(generate_text("china", 30, model, max_sequence_len))