# Text Generation with LSTM

In [20]:
import spacy
# Load the large English pipeline. The parser, tagger and NER components are
# disabled: the code below only reads `token.text` from the tokenizer output.
nlp = spacy.load('en_core_web_lg', disable=['parser', 'tagger','ner'])
# Optional override of spaCy's maximum text length
# (presumably needed to process the full novel - confirm before enabling).
# nlp.max_length = 1198623

## 1. Load the data

In [21]:
# Read text file
def read_file(filepath, encoding=None):
    """
    Read a whole text file and return its content as a single string.

    Parameters
    ----------
    filepath : str
        Path of the text file to read.
    encoding : str, optional
        Encoding used to decode the file (e.g. 'utf-8'). The default `None`
        keeps the platform's default encoding, which is what this function
        always used before the parameter existed.

    Returns
    -------
    str
        The full content of the file.
    """
    with open(filepath, encoding=encoding) as f:
        return f.read()

# Some preprocessing
def separate_punc(doc_text):
    """
    Tokenize `doc_text` with the spaCy pipeline and return the lowercased
    text of every token that is not punctuation or a line break.

    A token is dropped when its text occurs inside the filter string below.
    That string is based on the default filter characters of
    keras.preprocessing.text.Tokenizer, extended with the double and triple
    line breaks which are particular to this document.
    """
    ignored = '\n\n \n\n\n!"-#$%&()--.*+,-/:;<=>?@[\\]^_`{|}~\t\n '

    kept_tokens = []
    for token in nlp(doc_text):
        # Substring test: the token text must appear verbatim in `ignored`.
        if token.text in ignored:
            continue
        kept_tokens.append(token.text.lower())
    return kept_tokens

In [22]:
# Full novel (disabled); the four-chapter excerpt below is used instead.
# moby_dick = read_file('melville-moby_dick.txt')
moby_dick = read_file('moby_dick_four_chapters.txt')
# Lowercased tokens with punctuation and line breaks removed.
tokens = separate_punc(moby_dick)

# Sanity check: corpus size and the first 50 tokens.
print(f'{len(tokens)} tokens in the data')
print(tokens[0:50])

11338 tokens in the data
['call', 'me', 'ishmael', 'some', 'years', 'ago', 'never', 'mind', 'how', 'long', 'precisely', 'having', 'little', 'or', 'no', 'money', 'in', 'my', 'purse', 'and', 'nothing', 'particular', 'to', 'interest', 'me', 'on', 'shore', 'i', 'thought', 'i', 'would', 'sail', 'about', 'a', 'little', 'and', 'see', 'the', 'watery', 'part', 'of', 'the', 'world', 'it', 'is', 'a', 'way', 'i', 'have', 'of']




## 2. Prepare the data

### 2.1. Create Text Sequences

To train the model, we need to create sequences of texts.
A sequence has a length. For a sequence of 5 words, the goal
would be to predict the 6th word.

So, we train the model on 25 words and try to predict the 26th word.

For example, the text "call me ishmael some years ago never" converted to sequences
of 5 words would be: 

```
['call', 'me', 'ishmael', 'some', 'years']
['me', 'ishmael', 'some', 'years', 'ago']
['ishmael', 'some', 'years', 'ago', 'never']
```

In the first sequence the prediction would be "ago".

In [23]:
# Each training sample is a window of 25 input words plus 1 target word.
train_size = 25 + 1
# Number of window start positions: the last window starts at
# len(tokens) - train_size so that it still holds `train_size` tokens.
stop = len(tokens) - train_size + 1
# Slide the window over the tokens one position at a time.
text_sequences = [tokens[start:start + train_size] for start in range(stop)]

print(len(text_sequences[-1]))

26


### 2.2. Transforming text sequences to index sequences

In [24]:
from tensorflow.keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer()

# Updates internal vocabulary based on the sequences.
tokenizer.fit_on_texts(text_sequences)

# Transforms each text in text_sequences to a sequence of integers.
sequences = tokenizer.texts_to_sequences(text_sequences)

# Report the window size from `train_size` instead of a hard-coded 26 so the
# message stays correct if the window size is changed.
print(f'{len(sequences)} sequences of {train_size} words')
print(sequences[0])
# len_tokens (11338) - len_sequences (11313) = sequence_size (25)

11313 sequences of 26 words
[956, 14, 263, 51, 262, 408, 87, 219, 129, 111, 954, 260, 50, 43, 38, 314, 7, 23, 546, 3, 150, 259, 6, 2713, 14, 24]


We can map an index back to its corresponding word by using `index_word`.

In [25]:
# `index_word` maps each integer index back to its word.
print(f'{len(tokenizer.index_word)} unique words in the vocabulary')
# 956 is the first index of sequences[0] printed above.
tokenizer.index_word[956]

2719 unique words in the vocabulary


'call'

In [26]:
# tokenizer.index_word
# tokenizer.word_index
# tokenizer.word_counts
# tokenizer.word_docs

### 2.3. Separating features from labels

In [27]:
import numpy as np

# convert sequences to numpy array
# (a 2-D array lets the next cell slice features and labels by column)
sequences = np.array(sequences)
sequences

array([[ 956,   14,  263, ..., 2713,   14,   24],
       [  14,  263,   51, ...,   14,   24,  957],
       [ 263,   51,  262, ...,   24,  957,    5],
       ...,
       [  12,  166, 2712, ...,   53,    2, 2718],
       [ 166, 2712,    3, ...,    2, 2718,   26],
       [2712,    3,   10, ..., 2718,   26, 2719]])

In [28]:
# Every column but the last holds the input words; the last column is the
# word to predict.
X, y = sequences[:, :-1], sequences[:, -1]
X.shape, y.shape

((11313, 25), (11313,))

### 2.4. One-hot encoding the target

to_categorical() needs `n_classes+1`. Here `n_classes` is the size of the vocabulary.
Each word's index will be one-hot encoded.

*The +1 is the padding token, useful to have a more flexible sequence length. In the case of `to_categorical`, it is just the `0` index.*

See the note below to learn more about the padding token.

If `num_classes` is not specified, `num_classes=max(y)+1`.


In [29]:
from tensorflow.keras.utils import to_categorical 

# number of classes
vocab_size = len(tokenizer.word_counts)

# Add 1 to account for the padding token of the model.
# The labels are taken from the last column of `sequences` rather than from
# `y` itself, so re-running this cell does not one-hot encode a matrix that
# was already one-hot encoded.
y = to_categorical(sequences[:, -1], num_classes=vocab_size+1)
y

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 1.]], dtype=float32)

## 3. Building the Model

In [30]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Embedding
from tensorflow.keras.utils import plot_model

In [31]:
# Number of input tokens per sample (second dimension of X).
seq_len = X.shape[1]
# Vocabulary size, recomputed here from the tokenizer's word counts.
vocab_size = len(tokenizer.word_counts)
print(f'seq_len: {seq_len}, vocab_size: {vocab_size}')

seq_len: 25, vocab_size: 2719


**The padding token**

Setting a padding token (index `0` in our case) gives the model a reference token
to use when it encounters sequences shorter than the expected length. This way the
model can learn that token `0` does not provide any meaningful information.

This is why we add 1 to the `input_dim`. This allows us to have a padding token, which is useful for shorter sequences.

- Review 1: "This movie was amazing!"
- Review 2: "I didn't like the ending."
- Review 3: "The acting and cinematography were top-notch."

*Tokenize*
- Review 1: [1, 2, 3, 4, 5]
- Review 2: [6, 7, 8, 9, 10]
- Review 3: [11, 12, 13, 14, 15, 16]

As you can see, the lengths of these sequences differ. Review 1 has 5 words, Review 2 has 5 words, and Review 3 has 6 words.

To feed these sequences into a model, we need them to have the same length. Let's assume we decide on a maximum sequence length of 8. We can use padding to make the sequences uniform:

*Add padding*
- Review 1 (padded): [1, 2, 3, 4, 5, 0, 0, 0]
- Review 2 (padded): [6, 7, 8, 9, 10, 0, 0, 0]
- Review 3 (padded): [11, 12, 13, 14, 15, 16, 0, 0]

In [32]:
# The Embedding layer maps the sequence indexes to their embedding vectors so
# the model can associate semantic information with each token.
# `vocab_size+1` is used for both the embedding input and the softmax output
# because index 0 is reserved for padding.
model = Sequential([
    Embedding(input_dim=vocab_size+1, output_dim=seq_len, input_length=seq_len),
    # First LSTM returns the full sequence so it can feed the second LSTM.
    LSTM(50, return_sequences=True),
    LSTM(50),
    Dense(50, activation='relu'),
    # One probability per vocabulary index (plus the padding index).
    Dense(vocab_size+1, activation='softmax')
])

# Categorical cross-entropy matches the one-hot encoded targets.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()
# plot_model(model, show_shapes=True, to_file='mobydick_model.png')

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 25, 25)            68000     
                                                                 
 lstm (LSTM)                 (None, 25, 50)            15200     
                                                                 
 lstm_1 (LSTM)               (None, 50)                20200     
                                                                 
 dense (Dense)               (None, 50)                2550      
                                                                 
 dense_1 (Dense)             (None, 2720)              138720    
                                                                 
Total params: 244670 (955.74 KB)
Trainable params: 244670 (955.74 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


**Training**

In [33]:
# Train on the whole dataset; no validation data is passed to fit().
model.fit(X, y, batch_size=128, epochs=500, verbose=1)

Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/500
Epoch 31/500
Epoch 32/500
Epoch 33/500
Epoch 34/500
Epoch 35/500
Epoch 36/500
Epoch 37/500
Epoch 38/500
Epoch 39/500
Epoch 40/500
Epoch 41/500
Epoch 42/500
Epoch 43/500
Epoch 44/500
Epoch 45/500
Epoch 46/500
Epoch 47/500
Epoch 48/500
Epoch 49/500
Epoch 50/500
Epoch 51/500
Epoch 52/500
Epoch 53/500
Epoch 54/500
Epoch 55/500
Epoch 56/500
Epoch 57/500
Epoch 58/500
Epoch 59/500
Epoch 60/500
Epoch 61/500
Epoch 62/500
Epoch 63/500
Epoch 64/500
Epoch 65/500
Epoch 66/500
Epoch 67/500
Epoch 68/500
Epoch 69/500
Epoch 70/500
Epoch 71/500
Epoch 72/500
Epoch 73/500
Epoch 74/500
Epoch 75/500
Epoch 76/500
Epoch 77/500
Epoch 78

<keras.src.callbacks.History at 0x20db7011250>

**Saving the model**

In [43]:
import os
import pickle
from tensorflow.keras.models import save_model, load_model

# Make sure the output directory exists; open() below raises
# FileNotFoundError when `models/` is missing.
os.makedirs('models', exist_ok=True)

# The tokenizer must be saved alongside the model: it holds the word <-> index
# mapping the model was trained with.
with open('models/mobydick_4_chapters_tokenizer', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

model.save('models/mobydick_4_chapters_model.h5')

  saving_api.save_model(


## 4. Making Predictions

In [35]:
from tensorflow.keras.utils import pad_sequences

**Load the model**

In [44]:
# Reload the model from the file written in the saving step (rebinds `model`).
model = load_model('models/mobydick_4_chapters_model.h5')

**Encode the input_text**

In [45]:
input_text = 'call me ishmael some years ago never mind how long precisely'

# texts_to_sequences() takes a list of texts; [0] selects the encoding of
# our single input text.
encoded_text = tokenizer.texts_to_sequences([input_text])[0]
encoded_text

[956, 14, 263, 51, 262, 408, 87, 219, 129, 111, 954]

**Pad the sequences**

We add padding to the sequence if it is shorter than the sequence length;
otherwise, we remove values from the beginning (`truncating='pre'`).

In [46]:
# maxlen=seq_len matches the input length the model was built with;
# truncating='pre' drops values from the start of sequences longer than maxlen.
pad_encoded = pad_sequences([encoded_text], maxlen=seq_len, truncating='pre')
pad_encoded

array([[  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0, 956,  14, 263,  51, 262, 408,  87, 219, 129, 111, 954]])

**Predict the probability of each word**

In [47]:
# predict() returns one row per input sequence; [0] takes the row of our
# single padded input.
pred_word_proba = model.predict(pad_encoded, verbose=0)[0]
print('Predicted Probbilites:', len(pred_word_proba))
pred_word_proba

Predicted Probbilites: 2720


array([0.0000000e+00, 5.5717385e-07, 2.7174196e-09, ..., 0.0000000e+00,
       0.0000000e+00, 0.0000000e+00], dtype=float32)

**Get the class of the word with the highest probability**

In [48]:
# Position of the highest probability, i.e. the predicted word index.
pred_word_class = np.argmax(pred_word_proba, axis=-1)
pred_word_class

27

**Map the class to its corresponding word**

In [49]:
# Look the predicted index up in the tokenizer's index -> word mapping.
pred_word = tokenizer.index_word[pred_word_class]
pred_word

'not'

**Print the result**

In [50]:
# Show the input text followed by the predicted next word.
print(input_text, pred_word)

call me ishmael some years ago never mind how long precisely not


## 5. Text generation

**Picking random text from the sequences**

In [51]:
import random
random.seed(42)
# randint() includes both endpoints, so randint(0, len(text_sequences)) could
# return len(text_sequences) and make the lookup below raise IndexError.
# randrange() excludes the upper bound and always yields a valid index.
random_pick = random.randrange(len(text_sequences))

# Join the picked token window back into a single space-separated string.
random_seed_text = text_sequences[random_pick]
seed_text = ' '.join(random_seed_text)
seed_text

"a horse collar and suddenly felt a slight scratch throwing aside the counterpane there lay the tomahawk sleeping by the savage 's side as if it"

**Generating words from input text**

In [70]:
from time import sleep 

def generate_text(model, tokenizer, seq_len, input_text, n_outputs, delay=0.3):
    """
    Generate `n_outputs` words that continue `input_text` using `model`.

    The seed text is printed first, then every predicted word is printed as
    soon as it is generated. Each predicted word becomes the last token of
    the input used for the next prediction.

    Parameters
    ----------
    model : keras.engine.sequential.Sequential
        The trained model.
    tokenizer : keras.preprocessing.text.Tokenizer
        The tokenizer used to train the model.
    seq_len : int
        The length of the sequence used to train the model.
    input_text : str
        The input (seed) text.
    n_outputs : int
        The number of words to generate.
    delay : float, optional
        Seconds to pause before printing each generated word. Defaults to
        0.3; pass 0 to disable the pause.

    Returns
    -------
    str
        The generated words joined by single spaces (seed text excluded).
    """
    print(input_text)

    # Encode the seed text once. Afterwards the predicted indexes are appended
    # directly, which avoids re-tokenizing an ever-growing string.
    encoded_text = tokenizer.texts_to_sequences([input_text])[0]
    generated_words = []

    for _ in range(n_outputs):
        pad_encoded = pad_sequences([encoded_text], maxlen=seq_len, truncating='pre')
        pred_word_proba = model.predict(pad_encoded, verbose=0)[0]

        # Index 0 is the padding slot and has no entry in `index_word`, so it
        # is left out of the argmax to keep the lookup from raising KeyError.
        pred_word_class = int(np.argmax(pred_word_proba[1:], axis=-1)) + 1
        pred_word = tokenizer.index_word[pred_word_class]
        generated_words.append(pred_word)

        # The predicted word becomes the last token of the next input; only
        # the last `seq_len` indexes are needed for the next prediction.
        encoded_text = (encoded_text + [pred_word_class])[-seq_len:]

        if delay:
            sleep(delay)
        print(pred_word, end=' ')

    return ' '.join(generated_words)

In [71]:
# Continue the randomly picked seed text with 15 generated words.
generate_text(model, tokenizer, seq_len, seed_text, n_outputs=15)

a horse collar and suddenly felt a slight scratch throwing aside the counterpane there lay the tomahawk sleeping by the savage 's side as if it
were a hatchet faced baby a pretty pickle truly thought i abed here in a 