In [1]:
import numpy as np
import re
from keras.utils import np_utils
from keras.models import load_model
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from IPython.display import clear_output
import pandas as pd

In [2]:
# Load the already-trained model from its saved `.h5` file; it is the model
# whose structure is printed in the next cell.
loaded_model = load_model('../Saved_models/MIXED.h5')

In [3]:
# Print the layer-by-layer structure of the loaded model (layer types, output
# shapes and parameter counts) as a sanity check that the right file was loaded.
loaded_model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, None)]            0         
_________________________________________________________________
embedding (Embedding)        (None, None, 100)         1861100   
_________________________________________________________________
lstm (LSTM)                  (None, None, 256)         365568    
_________________________________________________________________
dropout (Dropout)            (None, None, 256)         0         
_________________________________________________________________
lstm_1 (LSTM)                (None, None, 256)         525312    
_________________________________________________________________
dropout_1 (Dropout)          (None, None, 256)         0         
_________________________________________________________________
lstm_2 (LSTM)                (None, None, 256)         525312

In [21]:
# Rebuild the text the model was trained on, so that the tokenizer fitted on it
# reproduces the same word <-> index dictionary when we generate text.
seq_length = 20
# Start-of-story marker: the '| ' token repeated seq_length times.
start_story = '| ' * seq_length

with open('../../Data/Data_clean_txt/mixed_text.txt', encoding='utf-8-sig') as f:
    text = f.read()

# Only the leading 1/2.75 of the file is kept. This slice has to match the one
# used when this particular model was trained (check it for each model).
# The start-of-story marker is then placed in front of the retained text.
text = start_story + text[:int(len(text) / 2.75)]

In [20]:
def clean_text(text):
    """Normalise raw text so it can be split on spaces into word and punctuation tokens.

    The steps are applied in this order:

    1. Newlines are replaced by a single space.
    2. Every run of three or more spaces is replaced by ". " and the result
       is stripped of leading/trailing whitespace.
    3. Each ".." pair is replaced by "." (single left-to-right pass).
    4. Every punctuation character in the set below is surrounded by spaces,
       so that it becomes a token of its own.
    5. Runs of two or more whitespace characters collapse to one space.

    Case is left untouched, and the '|' start-of-story markers are kept
    (they are treated like any other punctuation character in step 4).

    Args:
        text: The raw text to clean.

    Returns:
        The cleaned text. Because step 4 runs after the strip in step 2, the
        result may start or end with a single space when the input starts or
        ends with a punctuation character.
    """
    text = text.replace('\n', ' ')
    text = re.sub(r' {3,}', '. ', text).strip()
    text = text.replace('..', '.')
    # Raw strings are required here: '\]' and '\s' are not valid Python string
    # escapes and trigger a SyntaxWarning on recent interpreters otherwise.
    # Insert a space on both sides of every punctuation sign so it is tokenized separately.
    text = re.sub(r'([!"#$%&()*+,-./:;<=>?@[\]^_`{|}~])', r' \1 ', text)
    text = re.sub(r'\s{2,}', ' ', text)

    return text

In [22]:
# Normalise the training text with clean_text so that the tokenizer fitted on it
# sees every punctuation sign as a separate, space-delimited token.
text = clean_text(text)

In [23]:
# Word-level tokenizer with no character filtering and no lowercasing.
# The `lower` flag must be set the same way as when the model was trained.
tokenizer = Tokenizer(
    char_level=False,
    filters='',
    lower=False,
)
# Build the word <-> index dictionary from the cleaned training text.
tokenizer.fit_on_texts([text])

In [24]:
def sample_with_temp(preds, temperature=1.0):
    """Sample one class index from a probability vector, reshaped by a temperature.

    The probabilities are re-weighted as ``p ** (1 / temperature)`` and
    renormalised, then a single index is drawn at random from the result.
    Temperatures below 1 sharpen the distribution, temperatures above 1
    flatten it.

    Args:
        preds: Sequence of class probabilities (e.g. one softmax output row).
        temperature: Positive scaling factor. A value <= 0 disables sampling
            and returns the index of the largest probability instead of
            dividing by zero.

    Returns:
        The sampled class index (the result of ``np.argmax``).
    """
    preds = np.asarray(preds).astype('float64')

    # Temperature 0 is the greedy limit; dividing by it would only produce NaNs.
    if temperature <= 0:
        return np.argmax(preds)

    # log(0) is -inf, which is the wanted value here (that class gets weight 0),
    # so silence the divide-by-zero warning numpy would emit for it.
    with np.errstate(divide='ignore'):
        scaled = np.log(preds) / temperature
    # Shift by the maximum before exponentiating: the distribution is unchanged,
    # but at low temperatures this keeps every exp() from underflowing to 0,
    # which would make the normalisation below 0/0.
    scaled = scaled - np.max(scaled)
    exp_preds = np.exp(scaled)
    preds = exp_preds / np.sum(exp_preds)
    # Run one multinomial trial with the re-weighted probabilities for the next word.
    probas = np.random.multinomial(1, preds, 1)
    # The trial is one-hot, so argmax gives the index that was drawn
    # (a random sample, not necessarily the most probable class).
    return np.argmax(probas)



def generate_text(seed_text, next_words, model, max_sequence_len, temp):
    """Extend `seed_text` word by word using predictions sampled from `model`.

    Uses the module-level `tokenizer`, `start_story` and `sample_with_temp`.

    Args:
        seed_text: Text to continue. It is tokenized as given.
            NOTE(review): it is not passed through clean_text here, so
            punctuation glued to a word may not match the tokenizer's
            vocabulary - confirm that callers pre-clean it.
        next_words: Number of sampling steps to run.
        model: Object whose ``predict(batch, verbose=0)`` is called with a
            ``(1, max_sequence_len)`` array of token indices; the first
            element of its result is passed to `sample_with_temp`.
        max_sequence_len: Number of most recent tokens fed to the model at
            each step.
        temp: Temperature forwarded to `sample_with_temp`.

    Returns:
        `seed_text` followed by the output of each step and a single space
        per step. The start-of-story prefix is never part of the result.
    """
    output_text = seed_text

    # The start-of-story marker is only prepended to the model's context.
    seed_text = start_story + seed_text

    for _ in range(next_words):
        # Tokenize the whole context produced so far.
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        # Keep only the most recent max_sequence_len tokens.
        token_list = token_list[-max_sequence_len:]
        # If the context is shorter than the window, left-pad it with index 0
        # (the index the Keras Tokenizer never assigns to a word) instead of
        # letting the reshape below fail with a ValueError.
        # NOTE(review): assumes the model tolerates 0 as padding - confirm.
        missing = max_sequence_len - len(token_list)
        if missing > 0:
            token_list = [0] * missing + token_list
        # Turn the window into a batch containing one sequence.
        token_list = np.reshape(token_list, (1, max_sequence_len))

        # Get the model's prediction for the next word.
        probs = model.predict(token_list, verbose=0)[0]
        # Sample (not argmax) the next word index at the requested temperature.
        y_class = sample_with_temp(probs, temperature=temp)

        # Class 0 has no entry in the tokenizer's dictionary, so it yields no word.
        if y_class == 0:
            output_word = ''
        else:
            output_word = tokenizer.index_word[y_class]

        output_text += output_word + ' '
        seed_text += output_word + ' '

    return output_text
