In [None]:
# When running on Colab, remember to select a GPU runtime environment
import numpy as np
import re  
from keras.layers import Dense, LSTM, Input, Embedding, Dropout
from keras.utils import np_utils
from keras.models import Model, load_model
from keras.optimizers import Adam
from keras.preprocessing.text import Tokenizer
from keras.callbacks import LambdaCallback
from keras.callbacks import ModelCheckpoint

In [None]:
# Mount Google Drive (Colab only) so the files under /content/drive can be read and written
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Load the file with the texts from all the authors; be sure to point the path at the file you want
seq_length = 20
# Padding prefix: seq_length repetitions of '| ', placed in front of the corpus
start_story = '| ' * seq_length

with open('/content/drive/MyDrive/Modelos/mixed_text.txt', encoding='utf-8-sig') as corpus_file:
    text = corpus_file.read()

# Keep only the first 1/2.5 of the characters, then prepend the padding prefix
kept_chars = int(len(text) / 2.5)
text = start_story + text[:kept_chars]

In [None]:
def clean_text(text):
    """Normalize raw text so it can be word-tokenized for the LSTM model.

    Steps, in order: lowercase; turn newlines into spaces; turn every run of
    three or more spaces into '. ' and strip both ends; replace each '..' pair
    with '.'; surround every listed punctuation sign with spaces so it becomes
    a token of its own; collapse whitespace runs into a single space.

    Args:
        text: the raw string to clean.

    Returns:
        The cleaned string. It can start or end with a single space when the
        stripped text starts or ends with a punctuation sign.
    """
    text = text.lower()
    text = text.replace('\n', ' ')
    # Three or more consecutive spaces are treated as a sentence break
    text = re.sub(r' {3,}', '. ', text).strip()
    text = text.replace('..', '.')
    # Insert a space around every punctuation sign so each one is tokenized separately.
    # Raw string with '-', '[' and ']' escaped explicitly: no invalid string escapes and
    # no accidental ',-.' range inside the character class.
    text = re.sub(r'([!"#$%&()*+,\-./:;<=>?@\[\]^_`{|}~])', r' \1 ', text)
    text = re.sub(r'\s{2,}', ' ', text)

    return text

In [None]:
# Clean the text (lowercase it and split punctuation into separate tokens) before tokenizing
text = clean_text(text)

In [None]:
# Word-level tokenizer with no character filtering, so punctuation signs are kept as tokens
tokenizer = Tokenizer(
    char_level=False,
    filters='',
    lower=True,
)
# Learn the vocabulary from our text
tokenizer.fit_on_texts([text])
# Vocabulary size: +1 because Keras word indices start at 1 and index 0 is reserved
total_words = 1 + len(tokenizer.word_index)
# Encode the whole text as one sequence of token ids
token_list = tokenizer.texts_to_sequences([text])[0]

In [None]:
def generate_sequences(token_list, step, seq_length):
    """Slice a tokenized text into fixed-length windows and their next-token targets.

    Args:
        token_list: sequence of token ids for the whole text.
        step: stride between the start positions of consecutive windows.
        seq_length: number of tokens in each input window.

    Returns:
        A tuple (X, y, num_seq): X is a list of windows of seq_length tokens,
        y is the one-hot encoding (over the module-level total_words classes)
        of the token that follows each window, and num_seq is the number of
        windows.
    """
    window_starts = range(0, len(token_list) - seq_length, step)

    # Each window of seq_length tokens is paired with the token right after it
    X = [token_list[start: start + seq_length] for start in window_starts]
    next_tokens = [token_list[start + seq_length] for start in window_starts]

    # The targets must be turned into categorical (one-hot) form
    y = np_utils.to_categorical(next_tokens, num_classes = total_words)

    num_seq = len(X)
    print('Number of sequences:', num_seq, "\n")

    return X, y, num_seq

In [None]:
# Get your sequences. The window size reuses the seq_length defined when the text was loaded,
# so it cannot drift from the start_story padding or from the window used at generation time.
X, y, num_seq = generate_sequences(token_list, step = 1, seq_length = seq_length)

Number of sequences: 123311 



In [None]:
# Transform the list of token windows into a 2-D NumPy array with one row per sequence
X = np.array(X)

In [None]:
# Make sure the targets are a NumPy array.
# to_categorical already returns an ndarray, so np.asarray hands back the same object
# instead of allocating a second copy of the large (num_seq x total_words) one-hot matrix.
y = np.asarray(y)

## Define the LSTM model

In [None]:
n_units = 256 # Number of units in every LSTM layer
embedding_size = 100 # Size of the embedding layer

# Input layer: sequences of token ids of any length
text_in = Input(shape = (None,))
# Embedding layer: one dense vector of embedding_size values per token id
embedding = Embedding(total_words, embedding_size)
x = embedding(text_in)
# First three LSTM layers return the full sequence; each one is followed by
# dropout to avoid overfitting
for _lstm_idx in range(3):
    x = LSTM(n_units, return_sequences = True)(x)
    x = Dropout(0.2)(x)
# Fourth LSTM layer, again followed by dropout
x = LSTM(n_units)(x)
x = Dropout(0.2)(x)
# Output layer: softmax over the whole vocabulary
text_out = Dense(total_words, activation = 'softmax')(x)
# Model definition
model = Model(text_in, text_out)
# Compile with a categorical cross-entropy loss and the Adam optimizer
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [None]:
# Check your model's structure
model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, None)]            0         
_________________________________________________________________
embedding (Embedding)        (None, None, 100)         1662900   
_________________________________________________________________
lstm (LSTM)                  (None, None, 256)         365568    
_________________________________________________________________
dropout (Dropout)            (None, None, 256)         0         
_________________________________________________________________
lstm_1 (LSTM)                (None, None, 256)         525312    
_________________________________________________________________
dropout_1 (Dropout)          (None, None, 256)         0         
_________________________________________________________________
lstm_2 (LSTM)                (None, None, 256)         525312

In [None]:
def sample_with_temp(preds, temperature=1.0):
    """Sample one class index from a probability vector re-scaled by a temperature.

    The probabilities are converted to log-probabilities, divided by
    ``temperature`` and renormalized with a softmax; one index is then drawn
    at random from the resulting distribution. Temperatures below 1 sharpen
    the distribution towards the most likely classes, temperatures above 1
    flatten it.

    Args:
        preds: 1-D array-like of class probabilities (e.g. a softmax output).
        temperature: scaling factor. A value of 0 selects the most likely
            class deterministically instead of dividing by zero.

    Returns:
        The index of the sampled class.
    """
    preds = np.asarray(preds).astype('float64')
    if temperature == 0:
        # Limit case of the softmax: all the probability mass sits on the most likely class
        return np.argmax(preds)
    # log(0) is -inf, which correctly turns into probability 0 after the softmax,
    # so the divide-by-zero warning would only be noise
    with np.errstate(divide='ignore'):
        logits = np.log(preds) / temperature
    # Shifting by the maximum leaves the softmax unchanged but keeps every
    # exponential from underflowing to 0 (which would give 0/0 = NaN) at low temperatures
    exp_preds = np.exp(logits - np.max(logits))
    preds = exp_preds / np.sum(exp_preds)
    # Run one random experiment with the re-scaled probabilities of each class
    probas = np.random.multinomial(1, preds, 1)
    # The position of the single 1 in the one-hot draw is the sampled class
    return np.argmax(probas)

In [1]:
def generate_text(seed_text, next_words, model, max_sequence_len, temp):
    """Generate text word by word, sampling every next word from the model's predictions.

    Args:
        seed_text: text the generation starts from; it is also the beginning
            of the returned string.
        next_words: number of sampling steps to run.
        model: model whose ``predict`` gives the next-word probabilities for a
            batch of token windows.
        max_sequence_len: number of trailing tokens fed to the model at each step.
        temp: temperature handed to ``sample_with_temp``.

    Returns:
        ``seed_text`` followed by the generated words, each one followed by a space.
    """
    output_text = seed_text
    # The model is fed the seed preceded by the start_story padding
    context = start_story + seed_text

    for _ in range(next_words):
        # Re-tokenize the running text and keep only its last max_sequence_len tokens
        window = tokenizer.texts_to_sequences([context])[0][-max_sequence_len:]
        window = np.reshape(window, (1, max_sequence_len))

        # Predicted probabilities for the next word
        probs = model.predict(window, verbose=0)[0]
        # Random, temperature-scaled pick (not necessarily the most probable word)
        y_class = sample_with_temp(probs, temperature = temp)

        # Class 0 is treated as "no word"; any other class is looked up in the vocabulary
        output_word = '' if y_class == 0 else tokenizer.index_word[y_class]

        output_text += output_word + ' '
        context += output_word + ' '

    return output_text

In [None]:
def on_epoch_end(epoch, logs):
    """Print a 100-word generated sample at three temperatures (0.2, 0.33, 0.5).

    Meant to be used as the ``on_epoch_end`` hook of a ``LambdaCallback``.

    Args:
        epoch: received from the callback; not used.
        logs: received from the callback; not used.
    """
    seed_text = ""
    gen_words = 100

    # One (header, temperature) pair per sample, printed in this order
    samples = (('Temp 0.2', 0.2), ('Temp 0.33', 0.33), ('Temp 0.5', 0.5))
    for header, temperature in samples:
        print(header)
        print (generate_text(seed_text, gen_words, model, seq_length, temp = temperature))

# Checkpoint callback: after each epoch the model is saved to filepath, but only
# when the monitored training loss improved on the best value seen so far
filepath = '/content/drive/MyDrive/Modelos/4LSTM_ADAM_MIX_LOW_CHECKPOINT.h5'
checkpoint = ModelCheckpoint(
    filepath,
    monitor='loss',
    mode='min',
    save_best_only=True,
    verbose=1,
)

In [None]:
# Model training
epochs = 200
batch_size = 32
num_batches = len(X) // batch_size
# After every epoch: print generated samples, then let the checkpoint callback run
callback = LambdaCallback(on_epoch_end=on_epoch_end)
callbacks_list = [callback, checkpoint]
model.fit(
    X,
    y,
    epochs=epochs,
    batch_size=batch_size,
    callbacks=callbacks_list,
    shuffle=True,
)

Epoch 1/200
Temp 0.2
de y de de , . de de . . , . . . . . . el a . . de , . . . , , . . de , | . de , . de la . el la . de de , . y . . . de de no . . de , y y . , , . . de , . . de . que . . y , de de , . y . . . . . . de , el , . , , la . la . , y 
Temp 0.33
que de de de el . . | , . y y a , , que | a . , el de | que , | . . la la y , y de que y , , , de a el , de la de . de . se . el de de y del de . . , la , y . de a . de que de . de no de para a , de . . que la , a . . la , , , . de de de de . | , . que 
Temp 0.5
no . . las que . y , la la . , de . que . Y y a , . con no El de el y . . de si . . . que . | , no la a a . , , la el . . . , la no . la . el tiempo , , de de de la la por los el el a de de " , de y las el , la . los . , y y . , la el , de . . por . , y , de 

Epoch 00001: loss improved from inf to 7.10866, saving model to /content/drive/MyDrive/Modelos/4LSTM_ADAM_DENISSE_CAP_CHECKPOINT.h5
Epoch 2/200
Temp 0.2
. , de el . , , de . . de de , de . , , , que de de . el de , 

  This is separate from the ipykernel package so we can avoid doing imports until


En el país de México . México es un país de ruta . Después . 1 . 1 . Así , la Programa de la era Nacional que no es obligado , de una publicidad social , que el gobierno se dio perder y el importancia de la Cuarta Transformación . | | | | | | | | | | | | | | | | | | | | La historia de México . México no es un país de emergencia , pero el gobierno se repite con la costo de la Verdad , la triunfo de la 
Temp 0.5
El país de la cual se repite " . La gobierno de México . López Obrador cedió el mirada de los intelectuales en su Tec pero , como la puesto de la Verdad , la Presidente no asegura su uso de los Pymes , llevaron de la fase parejo . Y el Presidente se le olvidó el gran Magna , sino su forma de la Suprema Corte irresponsablemente todavía no les dio la ley . Y la desinformación da , la forma del PIB , la tamaño de la nación , la espíritu se fue tú . Un Estado de 

Epoch 00033: loss improved from 3.73854 to 3.59481, saving model to /content/drive/MyDrive/Modelos/4LSTM_ADAM_DENISSE_CAP

<tensorflow.python.keras.callbacks.History at 0x7f4cbc0a3c90>

In [None]:
# Save our final model to Drive as an .h5 file
# (a separate file from the checkpoint written during training)
model.save('/content/drive/MyDrive/Modelos/4LSTM_ADAM_MIX_LOW_200E.h5')