# Text prediction at Gabo style using LSTM

In [2]:
#Library loading
import numpy as np
import csv
import matplotlib.pyplot as plt
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical 
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional
from tensorflow.keras.models import load_model

In [3]:
# Define path for file with sonnets
GABO_FILE = './gabo.txt'

# Read the data
with open("./gabo.txt", encoding = "utf-8") as f:
    data = f.read()

# Convert to lower case and save as a list
corpus = data.lower().split("\n")

print(f"The first 5 lines look like this:\n")
for i in range(5):
  print(corpus[i])

The first 5 lines look like this:

muchos años después, frente al pelotón de fusilamiento, el coronel
aureliano buendía había de recordar aquella tarde remota en que su
padre lo llevó a conocer el hielo. macondo era entonces una aldea de
veinte casas de barro y cañabrava construidas a la orilla de un río de
aguas diáfanas que se precipitaban por un lecho de piedras pulidas,


## Tokenizing the text

Now fit the Tokenizer to the corpus and save the total number of words.

In [4]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)
total_words = len(tokenizer.word_index) + 1

In [5]:
corpus[0]

'muchos años después, frente al pelotón de fusilamiento, el coronel'

In [6]:
tokenizer.texts_to_sequences([corpus[0]])[0]

[221, 64, 50, 158, 22, 536, 1, 508, 5, 41]

## Generating n_grams

The `n_gram_seqs` receives the fitted tokenizer and the corpus (which is a list of strings) and should return a list containing the `n_gram` sequences for each line in the corpus:

In [7]:
def n_gram_seqs(corpus, tokenizer):
    """
    Generates a list of n-gram sequences
    
    Args:
        corpus (list of string): lines of texts to generate n-grams for
        tokenizer (object): an instance of the Tokenizer class containing the word-index dictionary
    
    Returns:
        input_sequences (list of int): the n-gram sequences for each line in the corpus
    """
    input_sequences = []
    
    # Loop over every line
    for line in corpus:

	    # Tokenize the current line
	    token_list = tokenizer.texts_to_sequences([line])[0]

	    # Loop over the line several times to generate the subphrases
	    for i in range(1, len(token_list)):
		
		    # Generate the subphrase
		    n_gram_sequence = token_list[:i+1]

		    # Append the subphrase to the sequences list
		    input_sequences.append(n_gram_sequence)
            
    return input_sequences

In [8]:
# Test your function with one example
first_example_sequence = n_gram_seqs([corpus[0]], tokenizer)

print("n_gram sequences for first example look like this:\n")
first_example_sequence

n_gram sequences for first example look like this:



[[221, 64],
 [221, 64, 50],
 [221, 64, 50, 158],
 [221, 64, 50, 158, 22],
 [221, 64, 50, 158, 22, 536],
 [221, 64, 50, 158, 22, 536, 1],
 [221, 64, 50, 158, 22, 536, 1, 508],
 [221, 64, 50, 158, 22, 536, 1, 508, 5],
 [221, 64, 50, 158, 22, 536, 1, 508, 5, 41]]

In [9]:
# Test your function with a bigger corpus
next_3_examples_sequence = n_gram_seqs(corpus[1:4], tokenizer)

print("n_gram sequences for next 3 examples look like this:\n")
next_3_examples_sequence

n_gram sequences for next 3 examples look like this:



[[23, 36],
 [23, 36, 20],
 [23, 36, 20, 1],
 [23, 36, 20, 1, 561],
 [23, 36, 20, 1, 561, 104],
 [23, 36, 20, 1, 561, 104, 74],
 [23, 36, 20, 1, 561, 104, 74, 996],
 [23, 36, 20, 1, 561, 104, 74, 996, 6],
 [23, 36, 20, 1, 561, 104, 74, 996, 6, 3],
 [23, 36, 20, 1, 561, 104, 74, 996, 6, 3, 15],
 [95, 21],
 [95, 21, 188],
 [95, 21, 188, 7],
 [95, 21, 188, 7, 430],
 [95, 21, 188, 7, 430, 5],
 [95, 21, 188, 7, 430, 5, 412],
 [95, 21, 188, 7, 430, 5, 412, 66],
 [95, 21, 188, 7, 430, 5, 412, 66, 26],
 [95, 21, 188, 7, 430, 5, 412, 66, 26, 46],
 [95, 21, 188, 7, 430, 5, 412, 66, 26, 46, 13],
 [95, 21, 188, 7, 430, 5, 412, 66, 26, 46, 13, 357],
 [95, 21, 188, 7, 430, 5, 412, 66, 26, 46, 13, 357, 1],
 [365, 334],
 [365, 334, 1],
 [365, 334, 1, 1719],
 [365, 334, 1, 1719, 4],
 [365, 334, 1, 1719, 4, 4993],
 [365, 334, 1, 1719, 4, 4993, 4994],
 [365, 334, 1, 1719, 4, 4993, 4994, 7],
 [365, 334, 1, 1719, 4, 4993, 4994, 7, 2],
 [365, 334, 1, 1719, 4, 4993, 4994, 7, 2, 2902],
 [365, 334, 1, 1719, 4, 

In [10]:
# Apply the n_gram_seqs transformation to the whole corpus
input_sequences = n_gram_seqs(corpus, tokenizer)

# Save max length 
max_sequence_len = max([len(x) for x in input_sequences])

print(f"n_grams of input_sequences have length: {len(input_sequences)}")
print(f"maximum length of sequences is: {max_sequence_len}")

n_grams of input_sequences have length: 126015
maximum length of sequences is: 27


## Add padding to the sequences

Now the `pad_seqs` function pad any given sequences to the desired maximum length. Notice that this function receives a list of sequences and should return a numpy array with the padded sequences: 

In [11]:
def pad_seqs(input_sequences, maxlen):
    """
    Pads tokenized sequences to the same length
    
    Args:
        input_sequences (list of int): tokenized sequences to pad
        maxlen (int): maximum length of the token sequences
    
    Returns:
        padded_sequences (array of int): tokenized sequences padded to the same length
    """
    padded_sequences = np.array(pad_sequences(input_sequences, maxlen=maxlen, padding='pre'))
    
    return padded_sequences

In [12]:
# Test your function with the n_grams_seq of the first example
first_padded_seq = pad_seqs(first_example_sequence, len(first_example_sequence))
first_padded_seq

array([[  0,   0,   0,   0,   0,   0,   0, 221,  64],
       [  0,   0,   0,   0,   0,   0, 221,  64,  50],
       [  0,   0,   0,   0,   0, 221,  64,  50, 158],
       [  0,   0,   0,   0, 221,  64,  50, 158,  22],
       [  0,   0,   0, 221,  64,  50, 158,  22, 536],
       [  0,   0, 221,  64,  50, 158,  22, 536,   1],
       [  0, 221,  64,  50, 158,  22, 536,   1, 508],
       [221,  64,  50, 158,  22, 536,   1, 508,   5],
       [ 64,  50, 158,  22, 536,   1, 508,   5,  41]])

In [13]:
first_example_sequence

[[221, 64],
 [221, 64, 50],
 [221, 64, 50, 158],
 [221, 64, 50, 158, 22],
 [221, 64, 50, 158, 22, 536],
 [221, 64, 50, 158, 22, 536, 1],
 [221, 64, 50, 158, 22, 536, 1, 508],
 [221, 64, 50, 158, 22, 536, 1, 508, 5],
 [221, 64, 50, 158, 22, 536, 1, 508, 5, 41]]

In [14]:
# Test your function with the n_grams_seq of the next 3 examples
next_3_padded_seq = pad_seqs(next_3_examples_sequence, max([len(s) for s in next_3_examples_sequence]))
next_3_padded_seq

array([[   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,   23,   36],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          23,   36,   20],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,   23,
          36,   20,    1],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0,   23,   36,
          20,    1,  561],
       [   0,    0,    0,    0,    0,    0,    0,    0,   23,   36,   20,
           1,  561,  104],
       [   0,    0,    0,    0,    0,    0,    0,   23,   36,   20,    1,
         561,  104,   74],
       [   0,    0,    0,    0,    0,    0,   23,   36,   20,    1,  561,
         104,   74,  996],
       [   0,    0,    0,    0,    0,   23,   36,   20,    1,  561,  104,
          74,  996,    6],
       [   0,    0,    0,    0,   23,   36,   20,    1,  561,  104,   74,
         996,    6,    3],
       [   0,    0,    0,   23,   36,   20,    1,  561,  104,   74,  996,
           6,    

In [15]:
# Pad the whole corpus
input_sequences = pad_seqs(input_sequences, max_sequence_len)

print(f"padded corpus has shape: {input_sequences.shape}")

padded corpus has shape: (126015, 27)


## Split the data into features and labels

Before feeding the data into the neural network you should split it into features and labels. In this case the features will be the padded n_gram sequences with the last word removed from them and the labels will be the removed word.

Function `features_and_labels` expects the padded n_gram sequences as input and should return a tuple containing the features and the one hot encoded labels.

Notice that the function also receives the total of words in the corpus, this parameter will be very important when one hot enconding the labels since every word in the corpus will be a label at least once. If you need a refresh of how the `to_categorical` function works take a look at the [docs](https://www.tensorflow.org/api_docs/python/tf/keras/utils/to_categorical)

In [16]:
def features_and_labels(input_sequences, total_words):
    """
    Generates features and labels from n-grams
    
    Args:
        input_sequences (list of int): sequences to split features and labels from
        total_words (int): vocabulary size
    
    Returns:
        features, one_hot_labels (array of int, array of int): arrays of features and one-hot encoded labels
    """
    
    features = input_sequences[:,:-1]
    labels = input_sequences[:,-1]
    one_hot_labels = to_categorical(labels, num_classes=total_words)

    return features, one_hot_labels

In [17]:
# Test your function with the padded n_grams_seq of the first example
first_features, first_labels = features_and_labels(first_padded_seq, total_words)

print(f"labels have shape: {first_labels.shape}")
print("\nfeatures look like this:\n")
first_features

labels have shape: (9, 15831)

features look like this:



array([[  0,   0,   0,   0,   0,   0,   0, 221],
       [  0,   0,   0,   0,   0,   0, 221,  64],
       [  0,   0,   0,   0,   0, 221,  64,  50],
       [  0,   0,   0,   0, 221,  64,  50, 158],
       [  0,   0,   0, 221,  64,  50, 158,  22],
       [  0,   0, 221,  64,  50, 158,  22, 536],
       [  0, 221,  64,  50, 158,  22, 536,   1],
       [221,  64,  50, 158,  22, 536,   1, 508],
       [ 64,  50, 158,  22, 536,   1, 508,   5]])

In [18]:
# Split the whole corpus
features, labels = features_and_labels(input_sequences, total_words)

print(f"features have shape: {features.shape}")
print(f"labels have shape: {labels.shape}")

features have shape: (126015, 26)
labels have shape: (126015, 15831)


## Create the model

Now you should define a model architecture capable of achieving an accuracy of at least 80%.

Some hints to help you in this task:

- An appropriate `output_dim` for the first layer (Embedding) is 100, this is already provided for you.
- A Bidirectional LSTM is helpful for this particular problem.
- The last layer should have the same number of units as the total number of words in the corpus and a softmax activation function.
- This problem can be solved with only two layers (excluding the Embedding) so try out small architectures first.

In [19]:
def create_model(total_words, max_sequence_len):
    """
    Creates a text generator model
    
    Args:
        total_words (int): size of the vocabulary for the Embedding layer input
        max_sequence_len (int): length of the input sequences
    
    Returns:
        model (tf.keras Model): the text generator model
    """
    model = Sequential()
    model.add(Embedding(total_words, 100, input_length=max_sequence_len-1))
    model.add(Bidirectional(LSTM(100, return_sequences = True)))
    model.add(Bidirectional(LSTM(100)))
    model.add(Dense(total_words, activation='softmax'))

    # Compile the model
    model.compile(loss='categorical_crossentropy',
                  optimizer= 'Adam',
                  metrics=['accuracy'])

    return model

In [20]:
# Get the untrained model
model = create_model(total_words, max_sequence_len)

# Train the model
history = model.fit(features, labels, epochs=50, verbose=1)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [21]:
# Salvando el modelo
model.save("gabo.h5")

In [19]:
# Load the model
model = load_model('gabo.h5')
# print summary
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 26, 100)           1583100   
                                                                 
 bidirectional (Bidirectiona  (None, 26, 200)          160800    
 l)                                                              
                                                                 
 bidirectional_1 (Bidirectio  (None, 200)              240800    
 nal)                                                            
                                                                 
 dense (Dense)               (None, 15831)             3182031   
                                                                 
Total params: 5,166,731
Trainable params: 5,166,731
Non-trainable params: 0
_________________________________________________________________


In [20]:
# Train the model
history = model.fit(features, labels, epochs=20, verbose=1)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [22]:
#Text generation
seed_text = "macondo"
next_words = 100
  
for _ in range(next_words):
	# Convert the text into sequences
	token_list = tokenizer.texts_to_sequences([seed_text])[0]
	# Pad the sequences
	token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
	# Get the probabilities of predicting a word
	predicted = model.predict(token_list, verbose=0)
	# Choose the next word based on the maximum probability
	predicted = np.argmax(predicted, axis=-1).item()
	# Get the actual word from the word index
	output_word = tokenizer.index_word[predicted]
	# Append to the current text
	seed_text += " " + output_word

print(seed_text)

macondo en el estrecho cuartito atiborrado de frascos vacíos que alquiló el día siguiente de sus súplicas y los baúles en brazos y un llanto de regreso al cuarto de melquíades estaba casi por la orden a la calle y el sueño y perdido el nigromante se lo había hecho el más leve suspiro de risa y que la familia y la oyó un par de medias en el mollera hasta el día en que importaba la tarde en que el fabricarse y varios siglos frente a la la vida desde la vida santa sofía de la piedad » lo vio


In [None]:
#Manually adding punctuation and making some cosmetic changes to give it more sense to the sentences with a touch of poetry!
"""

Macondo en el estrecho cuartito atiborrado de frascos vacíos que alquiló al día siguiente de sus súplicas. Los baúles en brazos y un llanto de regreso al cuarto de Melquíades que estaba por 
la calle y el sueño perdido del nigromante se había hecho el más leve suspiro de risa que la familia oyó, hasta el día en que importaba la tarde de varios siglos frente a la vida de 
Santa Sofía de la piedad ...


"""