In [1]:
import keras
import numpy as np
import re
import os

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


## Read in the corpus and clean it

Choose which folder of text files (plays or trump) you want to read in and change the path. Also choose which functions to call to clean the corpus.  

In [2]:
# this is used to remove stage direction if we don't want them
def remove_stage_dir(text):
    text = re.sub("[\<].*?[\>]", "", text)
    text = re.sub("\\s+", " ", text)
    return text
# this is used to remove the word "SPEECH" adn the number following after that in the corpus
def remove_SPEECH(text):
    text = re.sub("SPEECH \d+", "", text)
    text = re.sub("\\s+", " ", text)
    return text

path = './trump' #change the path accordingly
in_sentences=[]
# read all files in the floder (need to be txt with UTF-8 encoding)
# chop it up in sentances (for Tokenizer)
for filename in os.listdir(path):
    text = ''.join(open(path+'/'+filename, encoding = "UTF-8", mode="r").readlines())
    split_text = re.split(r' *[\.\?!][\'"\)\]]* *', remove_SPEECH(text)) #change the function accordingly
    for chunk in split_text:
        in_sentences.append(chunk)

print(in_sentences[0:10])
print('Corpus length:', len(text))

['\ufeff', '', '', 'Thank you so much', "That's so nice", "Isn't he a great guy", "He doesn't get a fair press; he doesn't get it", "It's just not fair", "And I have to tell you I'm here, and very strongly here, because I have great respect for Steve King and have great respect likewise for Citizens United, David and everybody, and tremendous resect for the Tea Party", 'Also, also the people of Iowa']
Corpus length: 896270


## Set the hyper-parameters and preparing the training sample
Here we choose the length of each sample sentances and the stride between each samples (setting the hyper-parameters). We then use the Tokenizer in Keras to tokenize the samples. We can also set out library size (i.e. set the maximum the number of words in the entire library)

The corpus is chopped up in natural sentances for the tokenization. It is then sticked back together as a large sequence, then we sample our sentances using the hyper-parameter settings.

After that, we normalize the sample before feeding into the neural network. We also have to one-hot encode the training label y.

In [3]:
# Length of extracted sample
maxlen = 20

# Stride of sampling
step = 1

# This holds our samples sequences
sentences = []

# This holds the next word (as training label)
next_word = []

#use Kears Tokenizer
from keras.preprocessing.text import Tokenizer

max_num_word = 10000 #max size of library
tokenizer = Tokenizer(num_words=max_num_word)
tokenizer.fit_on_texts(list(in_sentences))
list_tokenized_train = tokenizer.texts_to_sequences(list(in_sentences))

#if the library ends up smaller then the max size, update the info
if len(tokenizer.word_index) < max_num_word:
    max_num_word = len(tokenizer.word_index)
    
print('Number of words:', max_num_word)

Number of words: 6171


In [4]:
#stick the encoded words back together as a big sequence
token_word = []
for line in range (0,len(in_sentences)):
    that_sentences = list_tokenized_train[line]
    for i in range(0,len(that_sentences)):
        token_word.append(that_sentences[i])

#sample the sequence
for i in range(0, len(token_word) - maxlen, step):
    sentences.append(token_word[i: i + maxlen])
    next_word.append(token_word[i + maxlen])
print('Number of sentences:', len(sentences))

Number of sentences: 168513


In [5]:
#nomalized x
x = np.asarray(sentences).astype('float32')/max_num_word
#one-hot encode y
y = np.zeros((len(sentences), max_num_word), dtype=np.bool)
for i in range (0,len(sentences)):
    for j in range (0,maxlen):
        y[i, next_word[j]] = 1

## Building and training the neural network
The network consist of 3 layers: Embedding layers (for word embeddings), LSTM and the an output layer.

In [6]:
#build Keras model, using word embedding layer and LSTM then 
#output via softmax layer to give a prediction distribution
from keras import layers

model = keras.models.Sequential()
model.add(layers.Embedding(max_num_word, 200, input_length=maxlen))
model.add(layers.LSTM(256))
model.add(layers.Dense(max_num_word, activation='softmax'))

model.summary()

# Since our prediction are one-hot encoded, use `categorical_crossentropy` as the loss
optimizer = keras.optimizers.RMSprop(lr=0.01)
model.compile(loss='categorical_crossentropy', optimizer=optimizer)
model.fit(x, y, batch_size=256, epochs=1)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 20, 200)           1234200   
_________________________________________________________________
lstm_1 (LSTM)                (None, 256)               467968    
_________________________________________________________________
dense_1 (Dense)              (None, 6171)              1585947   
Total params: 3,288,115
Trainable params: 3,288,115
Non-trainable params: 0
_________________________________________________________________
Epoch 1/1


<keras.callbacks.History at 0x188e1748>

## Prepare to sample the prediction for the next word
The neural network will predict a distribution of the next work, here we hava a function to sample it with a custom "temperature". We also define a dictionary to map back the coe into word.

In [7]:
#this is for sampling the next work with a prediction distribution
def sample(preds, temperature=0.1):
    preds = np.asarray(preds).astype('float64')
    exp_preds = preds - np.exp(temperature)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

#to change back to word
reverse_word_map = dict(map(reversed, tokenizer.word_index.items()))

## Start generating a paragraph
By sampling a random seed sentance in the corpus, we start senerate the distribution of the next word, using the function above to sample the next word, append it to the seed sentance (to keep the length of the seed sentance, the first word will be removed), repead and generate the next. we will then have a new "paragrah" generated by the neural network.

In [8]:
#randomize a seed
import random

random.seed(99)
start_index = random.randint(0, len(token_word) - maxlen - 1)
generated_seed = token_word[start_index: start_index + maxlen]

generated_text = ' '.join([reverse_word_map.get(i) for i in generated_seed])
print('--- Generating with seed ---')
print(generated_text)
print('--- --- --- --- --- ---')

for i in range(40): #generate 20 words

    array_seed = np.zeros((maxlen,1))
    array_seed[:,0] = np.asarray(generated_seed).astype('float32')/max_num_word
    
    preds = model.predict(array_seed.transpose(), verbose=0)[0]
    next_index = sample(preds)
    next_word = reverse_word_map.get(next_index)

    generated_seed.append(next_index)       
    generated_seed = generated_seed[1:]
    generated_text = generated_text + ' ' + next_word

print('--- Generated text ---')
print(generated_text)
print('--- --- --- --- --- ---')

--- Generating with seed ---
about numbers like this mr trump stated without looking at the various polling data it’s obvious to anybody the hatred
--- --- --- --- --- ---
--- Generated text ---
about numbers like this mr trump stated without looking at the various polling data it’s obvious to anybody the hatred promised heavy raped firm mentioned boards terraza camera age indiana appreciates grab picked smiley morning trump horrible mathematics puddle 201 what dakota hug it'll 26th somber powerful nurture struggles referred cents served depend smack the– surprisingly chicago avenue incidents sanders
--- --- --- --- --- ---


# Pre-trained layer of GloVe embeddings 

Try importing a pregrain GloVe embedding layer by download the pre-trained word vector here: https://nlp.stanford.edu/projects/glove/. The one with embedding dimention 200 is readed in from the text file as a numpy array.

In [9]:
embeddings_index = {}
f = open('glove.6B.200d.txt', encoding="utf8")
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index))

Found 400000 word vectors.


According to the pre-trained word vector, the embedding matrix that transforms our vector is billt. Words not found is marked all-zero.

In [10]:
embedding_matrix = np.zeros((max_num_word + 1, 200))
for word, i in tokenizer.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

Again we train the model, but this time we import the embedding matrix to the embedding layer and freeze it. 

In [11]:
embedding_layer = layers.Embedding(max_num_word + 1,
                                   200,
                                   weights=[embedding_matrix],
                                   input_length=maxlen,
                                   trainable=False)

glove_model = keras.models.Sequential()
glove_model.add(embedding_layer)
glove_model.add(layers.LSTM(256))
glove_model.add(layers.Dense(max_num_word, activation='softmax'))

glove_model.summary()

# Since our prediction are one-hot encoded, use `categorical_crossentropy` as the loss
optimizer = keras.optimizers.RMSprop(lr=0.01)
glove_model.compile(loss='categorical_crossentropy', optimizer=optimizer)
glove_model.fit(x, y, batch_size=256, epochs=1)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 20, 200)           1234400   
_________________________________________________________________
lstm_2 (LSTM)                (None, 256)               467968    
_________________________________________________________________
dense_2 (Dense)              (None, 6171)              1585947   
Total params: 3,288,315
Trainable params: 2,053,915
Non-trainable params: 1,234,400
_________________________________________________________________
Epoch 1/1


<keras.callbacks.History at 0x3be24438>

Generate an artical with the new GloVe model to compare.

In [12]:
random.seed(99)
start_index = random.randint(0, len(token_word) - maxlen - 1)
generated_seed = token_word[start_index: start_index + maxlen]

generated_text = ' '.join([reverse_word_map.get(i) for i in generated_seed])
print('--- Generating with seed ---')
print(generated_text)
print('--- --- --- --- --- ---')

for i in range(40): #generate 20 words

    array_seed = np.zeros((maxlen,1))
    array_seed[:,0] = np.asarray(generated_seed).astype('float32')/max_num_word
    
    preds = glove_model.predict(array_seed.transpose(), verbose=0)[0]
    next_index = sample(preds)
    next_word = reverse_word_map.get(next_index)

    generated_seed.append(next_index)       
    generated_seed = generated_seed[1:]
    generated_text = generated_text + ' ' + next_word

print('--- Generated text ---')
print(generated_text)
print('--- --- --- --- --- ---')

--- Generating with seed ---
about numbers like this mr trump stated without looking at the various polling data it’s obvious to anybody the hatred
--- --- --- --- --- ---
--- Generated text ---
about numbers like this mr trump stated without looking at the various polling data it’s obvious to anybody the hatred thankful knocked happiness coverage farmers commitments reduces economic immigrated attorney outside statisticians tom rnc card enacted friends killed…they blocks rebuilding dealing 17th says priorities blowing advertisements instincts starting picked isn’t imports violent vetdogs nation’s you’re isn't motor newspaper issued been…the
--- --- --- --- --- ---


The pre-trained model is faster but is the quality of the result is depending weather or not the pre-trained vector is suitable for our training data. Luckily researchers has pre-trained lot of them as open-source or we will have to train it ourselves.