# Generating text with LSTM neural networks

### This code is adapted from the github page

https://github.com/fchollet/keras/blob/master/examples/lstm_text_generation.py

### Original description from author's code:

Example script to generate text from Nietzsche's writings.

At least 20 epochs are required before the generated text starts sounding coherent.

It is recommended to run this script on GPU, as recurrent networks are quite computationally intensive.

If you try this script on new data, make sure your corpus has at least ~100k characters. ~1M is better.

### Import packages

In [1]:
import numpy as np
import pandas as pd
import random
import sys
import re

from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.layers import LSTM
from keras.optimizers import RMSprop
from keras.utils.data_utils import get_file
from keras.models import load_model

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


### Use Amazon product reviews as the input data

In [38]:
#Load the dataset of Amazon product reviews
data = pd.read_csv("reviews.csv",names=["review"])
data.review[:10]

0    Just another flavor of Kit Kat but the taste i...
1    I bought this on impulse and it comes from Jap...
2    Really good. Great gift for any fan of green t...
3    I had never had it before, was curious to see ...
4    I've been looking forward to trying these afte...
5    These Kit-kats are very good, but if you're lo...
6    I found these in a Mitsuwa Marketplace in Illi...
7    Creamy white chocolate infused with Matcha gre...
8    After hearing mixed opinions about these Kit K...
9    I love green tea, I love Kit Kats, but the two...
Name: review, dtype: object

### Process the data

In [39]:
#Limit the number of records for now
data = data.iloc[:1000,:]
data = data.review.apply(lambda x: str(x).lower())

text = " ".join(list(data))

#Remove HTML tags and other nuisances 
regex_to_remove = [r'<a href=.* </a>',r'[^\x00-\x7F]+']
for regex in regex_to_remove:
    text = re.sub(regex,"",text)

#Remove character strings that are rare
chars_to_remove = ['*','<','=','>','@','[',']','_','}','~','br /',"'",'+','#','`','{']
for bad_char in chars_to_remove:
    text = text.replace(bad_char,"")
    
#Make all spaces single-spaced and eliminate multiple ! and ? characters
text = re.sub(" +"," ",text)
text = re.sub("(\\?!)+","?!",text)
text = re.sub("(!\\?)+","!?",text)
text = re.sub("!+","!",text)
text = re.sub("\\?+","?",text)

print('Number of characters in text:', len(text))

Number of characters in text: 494448


### Create mappings between characters and thier indices

In [40]:
#Determine the number of unique characters and create mappings for vectorization
chars = sorted(list(set(text)))
print('Total unique characters:', len(chars))
char_indices = dict((c, i) for i, c in enumerate(chars))
indices_char = dict((i, c) for i, c in enumerate(chars))

Total unique characters: 53


### Break the text into sequences ("sentences") and set aside the next character as predictor target

In [41]:
maxlen = 60
step = 1
sentences = []
next_chars = []
for i in range(0, len(text) - maxlen, step):
    sentences.append(text[i: i + maxlen])
    next_chars.append(text[i + maxlen])
print('Number of sequences:', len(sentences))

Number of sequences: 494388


### Reformat the sequences and targets as a matrix and vector
##### The character mapping provides a method to one-hot-encode each character

In [42]:
#Convert the character sequences into 2D arrays using the mapping dictionaries from above
X = np.zeros((len(sentences), maxlen, len(chars)), dtype=np.bool)
y = np.zeros((len(sentences), len(chars)), dtype=np.bool)
for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        X[i, t, char_indices[char]] = 1
    y[i, char_indices[next_chars[i]]] = 1
    
print("Sequences are now 2D arrays")

Sequences are now 2D arrays


### Build and compile the LSTM model

<img src="../Introduction_to_NLP/Images/LSTM.png" title="LSTM Neural Network Node" />

In [43]:
#Build an LSTM model
nodeSize = len(chars)

model = Sequential()
model.add(LSTM(nodeSize, input_shape=(maxlen, len(chars))))
#model.add(LSTM(nodeSize, input_shape=(maxlen, len(chars)),return_sequences=True))
#model.add(LSTM(nodeSize))
model.add(Dense(len(chars)))
model.add(Activation('softmax'))

optimizer = RMSprop(lr=0.01)
model.compile(loss='categorical_crossentropy', optimizer=optimizer)

print('Model compiled')

Model compiled


### Define a function to sample a probability distribution

In [44]:
#Generalization of softmax function with temperature parameter
#     High temperature "flattens" the distribution and, to a degree, equalizes the probabilities, while low temperature
#     accentuates already likely probabilities

def sample(preds, temperature=1.0):
    # helper function to sample an index from a probability array
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    #multinomial(n - number of samples,
    #            pvals - probability of each value,
    #            size - number of such experiments to conduct)
    #e.g. Roll three dice and do it again: multinomial(3,[1/6.]*6,2)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

### Fit the model and generate some text

In [None]:
#Fit the model with the processed text corpus data
epochs = 20
batchSize = 1024

model.fit(X, y, batch_size=batchSize, epochs=epochs)

Epoch 1/20
Epoch 2/20

### Generate some text with the trained model

In [67]:
diversity = 0.1
start_index = random.randint(0, len(text) - maxlen - 1)
#start_index = 245430

generated = ''
sequence = text[start_index: start_index + maxlen]
#sequence = text[:maxlen]
generated += sequence

print('Generating with seed: "{}"'.format(sequence))
print('\n----- \n')
sys.stdout.write(generated)

for _ in range(400):
    x = np.zeros((1, maxlen, len(chars)))
    for t, char in enumerate(sequence):
        x[0, t, char_indices[char]] = 1.

    preds = model.predict(x, verbose=0)[0]
    next_index = sample(preds, diversity)
    next_char = indices_char[next_index]

    generated += next_char
    sequence = sequence[1:] + next_char

    sys.stdout.write(next_char)
    sys.stdout.flush()

Generating with seed: "y makes a big difference--i especially like brown rice and o"

----- 

y makes a big difference--i especially like brown rice and one to the beans and the best to the best the stuff is a store and the stuff is a spicy and the product is a delicious and the best the coffee so i do not a stock and the stuff is a special and the stuff is a good and the stuff is a store to the best the stuff is a store and the better than the stuff is a little a store and the best and the stuff is a store and the stuff is a store and the best and