In [15]:
import nltk
import numpy as np
import os
import random
import sys

from keras.callbacks import LambdaCallback
from keras.callbacks import ModelCheckpoint
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.optimizers import RMSprop

In [2]:
nltk.download("book")

[nltk_data] Downloading collection 'book'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to
[nltk_data]    |     /Users/arkashjain/nltk_data...
[nltk_data]    |   Unzipping corpora/abc.zip.
[nltk_data]    | Downloading package brown to
[nltk_data]    |     /Users/arkashjain/nltk_data...
[nltk_data]    |   Unzipping corpora/brown.zip.
[nltk_data]    | Downloading package chat80 to
[nltk_data]    |     /Users/arkashjain/nltk_data...
[nltk_data]    |   Unzipping corpora/chat80.zip.
[nltk_data]    | Downloading package cmudict to
[nltk_data]    |     /Users/arkashjain/nltk_data...
[nltk_data]    |   Unzipping corpora/cmudict.zip.
[nltk_data]    | Downloading package conll2000 to
[nltk_data]    |     /Users/arkashjain/nltk_data...
[nltk_data]    |   Unzipping corpora/conll2000.zip.
[nltk_data]    | Downloading package conll2002 to
[nltk_data]    |     /Users/arkashjain/nltk_data...
[nltk_data]    |   Unzipping corpora/conll2002.zip.
[nltk_data]    | Downloading package dependenc

True

In [6]:
corpora_dir = "/Users/arkashjain/nltk_data/corpora/state_union"
corpora_dir

file_list = []
for root, _ , files in os.walk(corpora_dir):
    for filename in files:
        file_list.append(os.path.join(root, filename))

print("Read", len(file_list), "files..")

docs = []

for files in file_list:
    with open(files, 'r') as fin:
        try:
            str_form = fin.read().lower().replace('\n','')
            docs.append(str_form)
        except UnicodeDecodeError:
            pass
text = ' '.join(docs)

print('corpus length:', len(text))

Read 66 files..
corpus length: 1915949


In [7]:
chars = sorted(list(set(text)))
print('Total number of unique characters:', len(chars))
char_indices = dict((c, i) for i, c in enumerate(chars)) #characters to indices
indices_char = dict((i, c) for i, c in enumerate(chars)) #indices to characters

Total number of unique characters: 57


In [16]:
#cut the text in semi-redundant sequences of maxlen characters
maxlen = 40
step = 3
sentences = []
next_chars = []

for i in range(0, len(text) - maxlen, step):
    sentences.append(text[i: i + maxlen])
    next_chars.append(text[i + maxlen])
    
print('nb sequences', len(sentences))

print('Vectorization...')

x = np.zeros((len(sentences), maxlen, len(chars)), dtype = bool)
y = np.zeros((len(sentences),len(chars)), dtype = bool)

for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        x[i, t, char_indices[char]] = 1
    y[i, char_indices[next_chars[i]]] = 1
    
def sample(preds, temperature = 1.0):
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    
    preds = exp_preds / np.sum(exp_preds)
    
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

def on_epoch_end(epoch, _):
    print()
    print('------ Generating text after Epoch: %d' %epoch)
    start_index = random.randint(0, len(text) - maxlen - 1)
    
    for diversity in [0.2, 0.5, 1.0, 1.2]:
        print('----- Diversity', diversity)
        
        generated = ''
        sentence = text[start_index: start_index + maxlen]
        generated += sentence
        print('----- Generating with seed: ' + sentence + '""')
        sys.stdout.write(generated)
        
        for i in range(400):
            x_pred = np.zeros((1, maxlen, len(chars)))
            for t, char in enumerate(sentence):
                x_pred[0, t, char_indices[char]] = 1
            
            preds = model.predict(x_pred, verbose=0)[0]
            
            next_index = sample(preds, diversity)
            next_char = indices_char[next_index]
            
            generated += next_char
            sentence = sentence[1:] + next_char
            
            sys.stdout.write(next_char)
            sys.stdout.flush()
        print()
        
    model.save_weights('saved_weights.hdf5', overwrite=True)

print_callback = LambdaCallback(on_epoch_end = on_epoch_end)
checkpointer = ModelCheckpoint(filepath='/tmp/weights.hdf5', verbose = 1, save_best_only = True)

nb sequences 638637
Vectorization...


In [None]:
print('Building model...')
#size of the vector in the hidden layer
hidden_size = 128
# Initializing model sequence
model = Sequential()
model.add(LSTM(hidden_size, input_shape=(maxlen, len(chars))))

#Adding the output layer that is softmax of the number of characters
model.add(Dense(len(chars), activation='softmax'))
#optimization through RMSprop
optimizer_new = RMSprop()

model.compile(loss='categorical_crossentropy', optimizer=optimizer_new)

model.fit(x, y, 
          batch_size = 128, 
          epochs = 5, 
          callbacks = [print_callback, checkpointer])

Building model...
Epoch 1/5
 307/4990 [>.............................] - ETA: 6:27 - loss: 2.8739

In [None]:
#for continued training
model.load_weights("saved_weights.hdf5")

model.fit(x, y, 
          batch_size = 128, 
          epochs = 5, 
          callbacks = [print_callback, checkpointer])