In [1]:
import nltk
import numpy as np
import os
import random
import sys

from keras.callbacks import LambdaCallback
from keras.models import Sequential
from keras.layers import Dense, LSTM
from keras.optimizers import RMSprop

Using TensorFlow backend.


In [2]:
# Fetch NLTK's "book" data collection into the local nltk_data directory.
# The State of the Union corpus read later in this notebook is presumably
# part of this collection -- confirm if the collection is ever narrowed.
nltk.download("book")

[nltk_data] Downloading collection 'book'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to
[nltk_data]    |     C:\Users\anshr\AppData\Roaming\nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package brown to
[nltk_data]    |     C:\Users\anshr\AppData\Roaming\nltk_data...
[nltk_data]    |   Package brown is already up-to-date!
[nltk_data]    | Downloading package chat80 to
[nltk_data]    |     C:\Users\anshr\AppData\Roaming\nltk_data...
[nltk_data]    |   Package chat80 is already up-to-date!
[nltk_data]    | Downloading package cmudict to
[nltk_data]    |     C:\Users\anshr\AppData\Roaming\nltk_data...
[nltk_data]    |   Package cmudict is already up-to-date!
[nltk_data]    | Downloading package conll2000 to
[nltk_data]    |     C:\Users\anshr\AppData\Roaming\nltk_data...
[nltk_data]    |   Package conll2000 is already up-to-date!
[nltk_data]    | Downloading package conll2002 to
[nltk_data]    |     C:\Users\anshr\AppData\R

True

In [3]:
# Locate the State of the Union corpus through NLTK's own data search path
# rather than a machine-specific absolute path, so the notebook runs on any
# machine where the corpus has been downloaded. nltk.data.find raises
# LookupError when the corpus is missing, which is clearer than silently
# reading zero files.
corpora_dir = str(nltk.data.find("corpora/state_union"))

# Decode explicitly: open() otherwise uses the platform's locale encoding, so
# the same files could produce different text (or decode errors) on another
# OS. ISO-8859-2 is the encoding NLTK's own reader declares for this corpus.
CORPUS_ENCODING = "iso-8859-2"

file_list = []
for root, _, files in os.walk(corpora_dir):
    for filename in files:
        file_list.append(os.path.join(root, filename))
# os.walk yields directory entries in arbitrary order; sort so the
# concatenated corpus (and every training window cut from it) is reproducible.
file_list.sort()
print("Read", len(file_list), " files...")

docs = []
for path in file_list:
    try:
        with open(path, 'r', encoding=CORPUS_ENCODING) as fin:
            # NOTE(review): newlines are removed without inserting a space,
            # so the last word of a line is glued to the first word of the
            # next one. Kept as in the original -- confirm this is intended.
            docs.append(fin.read().lower().replace('\n', ''))
    except (OSError, UnicodeDecodeError) as err:
        # Stay best-effort like the original, but report what was skipped
        # instead of silently swallowing every possible exception.
        print("Skipping", path, "-", err)
text = ' '.join(docs)
print("corpus length: ", len(text))

Read 66  files...
corpus length:  2066704


In [4]:
# Vocabulary: every distinct character of the corpus, in sorted order.
chars = sorted(set(text))
print('Total Number of Unique Characters: ', len(chars))
# Forward (char -> index) and reverse (index -> char) lookup tables used for
# one-hot encoding the inputs and decoding the sampled predictions.
char_indices = {ch: idx for idx, ch in enumerate(chars)}
indices_char = dict(enumerate(chars))

Total Number of Unique Characters:  65


In [15]:
# Cut the text into semi-redundant sequences of maxlen characters; each
# sequence is paired with the single character that follows it.
maxlen = 40  # Number of characters considered
step = 3  # Stride of window
sentences = []
next_chars = []

for i in range(0, len(text) - maxlen, step):
    sentences.append(text[i:i + maxlen])
    next_chars.append(text[i + maxlen])

print("no. sequences: ", len(sentences))

# Vectorization: one-hot encode every window into x (sequence, position,
# character) and every following character into y (sequence, character).
# The builtin bool is used as dtype because the np.bool alias was deprecated
# in NumPy 1.20 and removed in 1.24, where it raises AttributeError.
x = np.zeros((len(sentences), maxlen, len(chars)), dtype=bool)
y = np.zeros((len(sentences), len(chars)), dtype=bool)
for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        x[i, t, char_indices[char]] = 1
    y[i, char_indices[next_chars[i]]] = 1

def sample(preds, temperature = 1.0):
    """Draw one index from a probability vector using temperature sampling.

    The probabilities are moved to log space, divided by ``temperature``,
    renormalised with a softmax and a single index is drawn from the
    resulting distribution with ``np.random.multinomial``.

    Args:
        preds: 1-D array-like of probabilities (e.g. a softmax output).
        temperature: values below 1 sharpen the distribution, values above 1
            flatten it. A temperature of 0 or less selects the most probable
            index deterministically instead of dividing by zero.

    Returns:
        The sampled index as a NumPy integer.
    """
    preds = np.asarray(preds).astype('float64')
    if temperature <= 0:
        # Limit of temperature -> 0 is greedy decoding; dividing by zero
        # would turn every probability into NaN.
        return np.argmax(preds)
    # log(0) legitimately yields -inf for impossible characters; silence the
    # warning because exp() maps it back to a probability of exactly 0.
    with np.errstate(divide='ignore'):
        logits = np.log(preds) / temperature
    # Shift by the maximum before exponentiating so that low temperatures
    # cannot underflow every term to 0 (which would give 0/0 below). The
    # shift cancels out in the normalisation.
    logits = logits - np.max(logits)
    exp_preds = np.exp(logits)
    # Softmax
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

def on_epoch_end(epoch, _):
    """Print sample text generated by the model, then save its weights.

    A random window of ``maxlen`` characters from ``text`` is used as the
    seed. For each diversity (sampling temperature) value, 400 characters are
    generated one at a time and streamed to stdout. Afterwards the model
    weights are written to ``saved_weights.hdf5``, overwriting any previous
    file.

    Args:
        epoch: index of the epoch that just finished (only printed).
        _: second callback argument; unused.
    """
    print("\n -----I am a robot epoch ended: ", epoch)

    start_index = random.randint(0, len(text) - maxlen - 1)
    # The same seed is reused for every diversity so the outputs are comparable.
    seed = text[start_index: start_index + maxlen]
    for diversity in [0.2, 0.5, 1.0, 1.2]:
        print("\n Diversity: ", diversity)
        window = seed
        print("generating with seed: \"{}\"....".format(window))
        sys.stdout.write(window)
        for _step in range(400):
            # One-hot encode the current window as a batch of size one.
            x_pred = np.zeros((1, maxlen, len(chars)))
            for pos, ch in enumerate(window):
                x_pred[0, pos, char_indices[ch]] = 1
            probs = model.predict(x_pred, verbose=0)[0]
            next_char = indices_char[sample(probs, diversity)]
            # Slide the window: drop the oldest character, append the new one.
            window = window[1:] + next_char
            sys.stdout.write(next_char)
            sys.stdout.flush()
        print()
    model.save_weights('saved_weights.hdf5', overwrite=True)

# Wrap on_epoch_end in a Keras callback so it can be passed to model.fit.
print_callback = LambdaCallback(on_epoch_end=on_epoch_end)

no. sequences:  688888


In [16]:
# Building Model
hidden_size = 128
# A single LSTM layer reads one-hot windows of shape (maxlen, len(chars));
# the dense softmax layer outputs one probability per vocabulary character.
model = Sequential()
model.add(LSTM(hidden_size, input_shape=(maxlen, len(chars))))
model.add(Dense(len(chars), activation='softmax'))
model.compile(optimizer=RMSprop(), loss='categorical_crossentropy')

In [None]:
# Train on the one-hot windows; print_callback is passed so that sample text
# is generated and the weights are saved during training.
model.fit(
    x=x,
    y=y,
    batch_size=128,
    epochs=30,
    callbacks=[print_callback],
)

Epoch 1/30