In [None]:
import numpy as np
import re
from keras.utils import np_utils
from keras.models import Model, load_model
from keras.optimizers import Adam
from keras.preprocessing.text import Tokenizer
from keras.callbacks import LambdaCallback
from keras.callbacks import ModelCheckpoint

In [None]:
# Mount Google Drive (Colab only) so the files under /content/drive used by the later cells are reachable
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Load a previously saved, partially trained model so that its training can be resumed below
loaded_model = load_model('/content/drive/MyDrive/Modelos/4LSTM_ADAM_MIXED_CAP_CHECKPOINT.h5')

In [None]:
# Print the layer-by-layer structure of the loaded model to confirm it is the expected network
loaded_model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, None)]            0         
_________________________________________________________________
embedding (Embedding)        (None, None, 100)         1861100   
_________________________________________________________________
lstm (LSTM)                  (None, None, 256)         365568    
_________________________________________________________________
dropout (Dropout)            (None, None, 256)         0         
_________________________________________________________________
lstm_1 (LSTM)                (None, None, 256)         525312    
_________________________________________________________________
dropout_1 (Dropout)          (None, None, 256)         0         
_________________________________________________________________
lstm_2 (LSTM)                (None, None, 256)         525312

In [None]:
# Number of tokens per training window; also the number of '|' markers placed before the corpus
seq_length = 20
# Marker block put in front of the corpus (the same string is reused later to prime text generation)
start_story = '| ' * seq_length

# Read the training corpus; this should be the same file that was used for the first training
with open('/content/drive/MyDrive/Modelos/mixed_text.txt', encoding='utf-8-sig') as f:
    text = f.read()

# Keep only the leading 1/2.75 of the characters; the text length should match the first training as well
keep_chars = int(len(text) / 2.75)
text = start_story + text[:keep_chars]

In [None]:
def clean_text(text):
    """Normalise raw text so it can be split into word and punctuation tokens.

    Letter case is preserved (no lower-casing is applied). The steps run in
    this order:

    1. Newlines are replaced by single spaces.
    2. Runs of three or more spaces become '. ', then leading/trailing
       whitespace is stripped.
    3. Each non-overlapping '..' pair is replaced by '.' in a single pass
       (so '...' becomes '..').
    4. Every punctuation character listed in the pattern below is surrounded
       by spaces so that it becomes a token of its own.
    5. Runs of two or more whitespace characters collapse to one space.

    The exact output determines the vocabulary built from the text, so the
    transformation must not change between training runs of the same model.

    Args:
        text (str): Raw input text.

    Returns:
        str: The cleaned text. Note that stripping happens before step 4, so
        a text ending in punctuation ends with a single trailing space.
    """
    text = text.replace('\n', ' ')
    text = re.sub(r'   +', '. ', text).strip()
    text = text.replace('..', '.')
    # Raw strings keep `\]` and `\s` as regex escapes instead of invalid Python
    # string escapes. Inside the character class `,-.` is a range that covers
    # exactly ',', '-' and '.'.
    text = re.sub(r'([!"#$%&()*+,-./:;<=>?@[\]^_`{|}~])', r' \1 ', text)
    text = re.sub(r'\s{2,}', ' ', text)

    return text

In [None]:
# Normalise the corpus. `text` is overwritten, so re-running only this cell passes already-cleaned text through clean_text again
text = clean_text(text)

In [None]:
# Build a word-level tokenizer (no character filtering, case preserved)
# It must be created with exactly the same parameters as in the first training so the word indices stay in tune with the loaded model
tokenizer = Tokenizer(char_level = False, filters = '', lower = False)
# Learn the vocabulary from the cleaned text
tokenizer.fit_on_texts([text])
# Vocabulary size: number of distinct tokens plus one (per the Keras Tokenizer docs, word indices start at 1 and index 0 is reserved)
total_words = len(tokenizer.word_index) + 1
# Encode the whole text as one flat list of integer token ids
token_list = tokenizer.texts_to_sequences([text])[0]

In [None]:
def generate_sequences(token_list, step, seq_length, num_classes=None):
    """Slice a tokenized text into fixed-length windows and their next-token targets.

    Window ``k`` starts at index ``k * step`` and holds ``seq_length`` token
    ids; its target is the token id that immediately follows the window.

    Args:
        token_list: Sequence of integer token ids.
        step: Distance between the starts of consecutive windows. It is passed
            to ``range`` as the step, so it must be non-zero.
        seq_length: Number of tokens in each input window.
        num_classes: Width of the one-hot target vectors. When omitted, the
            notebook-level ``total_words`` is used, which is what this
            function always used before the parameter existed.

    Returns:
        tuple: ``(X, y, num_seq)`` where ``X`` is a list of token-id windows,
        ``y`` is the one-hot encoded array of next tokens produced by
        ``to_categorical``, and ``num_seq`` is the number of windows.
    """
    if num_classes is None:
        # Fall back to the vocabulary size computed in the tokenizer cell
        num_classes = total_words

    # Every start index leaves room for a full window plus one target token
    starts = range(0, len(token_list) - seq_length, step)
    X = [token_list[i: i + seq_length] for i in starts]
    y = [token_list[i + seq_length] for i in starts]

    # The targets are one-hot encoded into a dense array with one row per
    # window and num_classes columns, so memory grows with both.
    y = np_utils.to_categorical(y, num_classes = num_classes)

    num_seq = len(X)
    print('Number of sequences:', num_seq, "\n")

    return X, y, num_seq

In [None]:
# Build the training windows with a stride of one token. The shared seq_length is reused (instead of
# repeating the literal 20) so the window size cannot drift from the start marker and the generation code.
X, y, num_seq = generate_sequences(token_list, step = 1, seq_length = seq_length)

Number of sequences: 149045 



In [None]:
# Convert the list of equal-length token windows into a single NumPy array so the model can read it
X = np.array(X)

In [None]:
# `y` is already a NumPy array (to_categorical returns one). np.asarray hands it back as-is,
# whereas np.array would allocate a second full copy of the large one-hot matrix and spike RAM usage.
y = np.asarray(y)

In [None]:
def sample_with_temp(preds, temperature=1.0):
    """Draw one class index from a probability vector after temperature rescaling.

    The probabilities are raised to the power ``1 / temperature`` and
    renormalised, then a single draw is made from the resulting distribution.
    Temperatures below 1 sharpen the distribution towards the most likely
    class; temperatures above 1 flatten it.

    Args:
        preds: 1-D array-like of non-negative class probabilities.
        temperature (float): Strictly positive rescaling factor.

    Returns:
        Index (NumPy integer) of the sampled class.

    Raises:
        ValueError: If ``temperature`` is not strictly positive.
    """
    if temperature <= 0:
        raise ValueError('temperature must be > 0, got {}'.format(temperature))

    preds = np.asarray(preds).astype('float64')
    # Lift exact zeros to the smallest positive normal float so np.log never
    # receives 0 (which yields -inf and a "divide by zero" RuntimeWarning).
    preds = np.clip(preds, np.finfo(np.float64).tiny, None)
    logits = np.log(preds) / temperature
    # Shifting by the maximum does not change the normalised result, but it
    # guarantees at least one exp() term equals 1, so the sum cannot underflow
    # to 0 (and produce NaN) at low temperatures.
    logits = logits - np.max(logits)
    exp_preds = np.exp(logits)
    preds = exp_preds / np.sum(exp_preds)
    # Run a single multinomial trial with the rescaled probabilities
    probas = np.random.multinomial(1, preds, 1)
    # The trial result is one-hot, so argmax recovers the sampled class index
    return np.argmax(probas)



def generate_text(seed_text, next_words, model, max_sequence_len, temp):
    """Generate text by repeatedly sampling the next token from the model.

    Uses the notebook-level ``tokenizer`` and ``start_story`` and the
    ``sample_with_temp`` helper. The seed is prefixed with ``start_story``
    before encoding, and the last ``max_sequence_len`` token ids are fed to
    the model at every step.

    Args:
        seed_text (str): Text that the generated words are appended to. May
            be empty.
        next_words (int): Number of sampling steps to run.
        model: Object whose ``predict`` returns, for a ``(1, max_sequence_len)``
            batch of token ids, a batch of next-token probability vectors.
        max_sequence_len (int): Number of context tokens fed to the model.
        temp (float): Sampling temperature passed to ``sample_with_temp``.

    Returns:
        str: ``seed_text`` followed by the generated words, each followed by
        a single space.

    Raises:
        ValueError: If fewer than ``max_sequence_len`` context tokens are
            available.
    """
    output_text = seed_text
    # A non-empty seed that does not end in a space needs a separator,
    # otherwise the first generated word is glued onto the seed's last word.
    if output_text and not output_text.endswith(' '):
        output_text += ' '

    # Encode the primed seed once; sampled ids are appended directly afterwards
    # instead of re-tokenizing the whole growing text on every step.
    token_list = tokenizer.texts_to_sequences([start_story + seed_text])[0]

    for _ in range(next_words):
        # Only the most recent max_sequence_len tokens form the model context
        window = token_list[-max_sequence_len:]
        if len(window) < max_sequence_len:
            raise ValueError(
                'need at least {} context tokens, got {}'.format(max_sequence_len, len(window))
            )
        model_input = np.reshape(window, (1, max_sequence_len))

        # Probability vector for the next token
        probs = model.predict(model_input, verbose=0)[0]
        # Sample (not argmax) the next token id at the requested temperature
        y_class = sample_with_temp(probs, temperature = temp)

        if y_class == 0:
            # Index 0 is not looked up in tokenizer.index_word; emit an empty
            # word and leave the context unchanged.
            output_word = ''
        else:
            output_word = tokenizer.index_word[y_class]
            token_list.append(int(y_class))

        output_text += output_word + ' '

    return output_text


In [None]:
def on_epoch_end(epoch, logs):
    """Print sample generations at three temperatures once an epoch finishes.

    Both arguments are accepted to match the callback signature but are not
    used. Each sample starts from an empty seed and is 100 words long,
    produced with the notebook-level ``loaded_model`` and ``seq_length``.
    """
    seed_text = ""
    gen_words = 100

    # (header printed before the sample, sampling temperature)
    samples = (('Temp 0.2', 0.2), ('Temp 0.33', 0.33), ('Temp 0.5', 0.5))
    for label, temperature in samples:
        print(label)
        print(generate_text(seed_text, gen_words, loaded_model, seq_length, temp = temperature))

# Checkpoint callback: saves the model to `filepath` only when the monitored training loss ('loss') is lower than the best value seen so far
filepath = '/content/drive/MyDrive/Modelos/4LSTM_ADAM_MIXED22_CAP_CHECKPOINT.h5'
checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, mode='min')


In [None]:
# Resume training of the loaded model
# Training settings; the batch size must stay the same as in the first training
epochs = 200
batch_size = 32
# Number of full batches per epoch (informational only; it is not passed to fit() below)
num_batches = len(X) // batch_size

# After every epoch: print sample generations, then checkpoint if the loss improved
callback = LambdaCallback(on_epoch_end=on_epoch_end)
callbacks_list = [callback, checkpoint]

loaded_model.fit(
    X,
    y,
    batch_size=batch_size,
    epochs=epochs,
    shuffle=True,
    callbacks=callbacks_list,
)


Epoch 1/200
Temp 0.2


  This is separate from the ipykernel package so we can avoid doing imports until


El pasado domingo , el segundo 11 de junio , no se lo de unos , de dos días , hasta otras mujeres para lograr tanto su papel sería quizá razón con la muerte . A lo largo de veinte años la inflación siempre se le han dado que una iniciativa gubernamental y una tragedia . También bien juntos para hacer una nueva idea de que los recursos son más grande de manera económica . Las empresas más de otra parte . El modelo político igual que un grupo de estímulo se dará de formas tiempo en los estados del 
Temp 0.33
La democracia consiste en partidos de instituciones de Covid . Qué día como un aumento de un niño que se haya abra . Por contraste , no sólo rebasan una de telecomunicaciones presidencial confiado en que la dignidad , se gastarán la vida de los pobres . Que el mismo tiempo se bajó sin duda esos o de propósito a la libertad . Me ha preguntamos también lejos de un grupo de franca , se suele de un liderazgo de tres años y el norte popular de las campañas que se habían desarrollado . Fue

<tensorflow.python.keras.callbacks.History at 0x7f94c0032550>