In [1]:
import random
import pickle
import itertools

import numpy as np
import pandas as pd
from nltk.tokenize import RegexpTokenizer

from keras.models import Sequential, load_model
from keras.layers.core import Dense, Activation
from keras.optimizers import RMSprop
from keras.layers import LSTM

In [3]:
# loading the text file 
text = open('1661-0.txt',mode='r',encoding="utf-8").read().lower()
print('corpus length:', len(text))

corpus length: 581877


In [4]:
text[:100]

"\ufeff\nproject gutenberg's the adventures of sherlock holmes, by arthur conan doyle\n\nthis ebook is for th"

In [43]:
partial_text = text[:50000]

In [44]:
tokenizer = RegexpTokenizer(r'\w+')
tokens = tokenizer.tokenize(partial_text)

In [45]:
tokens[:10]

['project',
 'gutenberg',
 's',
 'the',
 'adventures',
 'of',
 'sherlock',
 'holmes',
 'by',
 'arthur']

In [46]:
unique_tokens = np.unique(tokens)
# mapping each unique word to a index in a dictionary
unique_tokens_index = {tokens : idx for idx,tokens in enumerate(unique_tokens)}

In [47]:
# first 10 key-values in dictionary
dict(itertools.islice(unique_tokens_index.items(), 10))

{'15': 0,
 '1661': 1,
 '1858': 2,
 '1888': 3,
 '20': 4,
 '2002': 5,
 '2019': 6,
 '29': 7,
 '5': 8,
 '8': 9}

In [48]:
n_words = 10
input_words = []
next_words = []

for i in range(len(tokens) - n_words):
    input_words.append(tokens[i : i + n_words])
    next_words.append(tokens[i + n_words])

In [49]:
input_words[:2]

[['project',
  'gutenberg',
  's',
  'the',
  'adventures',
  'of',
  'sherlock',
  'holmes',
  'by',
  'arthur'],
 ['gutenberg',
  's',
  'the',
  'adventures',
  'of',
  'sherlock',
  'holmes',
  'by',
  'arthur',
  'conan']]

In [50]:
next_words[:10]

['conan', 'doyle', 'this', 'ebook', 'is', 'for', 'the', 'use', 'of', 'anyone']

In [51]:
X = np.zeros((len(input_words),n_words,len(unique_tokens)), dtype=bool)
Y = np.zeros((len(next_words),len(unique_tokens)), dtype=bool)

In [52]:
X

array([[[False, False, False, ..., False, False, False],
        [False, False, False, ..., False, False, False],
        [False, False, False, ..., False, False, False],
        ...,
        [False, False, False, ..., False, False, False],
        [False, False, False, ..., False, False, False],
        [False, False, False, ..., False, False, False]],

       [[False, False, False, ..., False, False, False],
        [False, False, False, ..., False, False, False],
        [False, False, False, ..., False, False, False],
        ...,
        [False, False, False, ..., False, False, False],
        [False, False, False, ..., False, False, False],
        [False, False, False, ..., False, False, False]],

       [[False, False, False, ..., False, False, False],
        [False, False, False, ..., False, False, False],
        [False, False, False, ..., False, False, False],
        ...,
        [False, False, False, ..., False, False, False],
        [False, False, False, ..., False, Fal

In [53]:
Y

array([[False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       ...,
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False]])

In [54]:
for i, words in enumerate(input_words):
    for j, word in enumerate(words):
        X[i,j,unique_tokens_index[word]] = 1
    Y[i,unique_tokens_index[next_words[i]]] = 1

In [55]:
model = Sequential([
    LSTM(128, input_shape=(n_words,len(unique_tokens)), return_sequences=True),
    LSTM(128),
    Dense(len(unique_tokens)),
    Activation('softmax')
])

In [56]:
model.compile(loss='categorical_crossentropy', optimizer=RMSprop(learning_rate=0.01), metrics=['accuracy'])
model.fit(X, Y, batch_size=128, epochs=30, shuffle=True)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.History at 0x192ff573bb0>

In [57]:
# saving the model
model.save('word_pred.h5')

In [58]:
# loading model
# model = load_model('word_pred.h5')
# history = pickle.load(open("history.p", "rb"))

In [66]:
def predict_next_word(input_text,n_best):
    input_text = input_text.lower()
    
    X = np.zeros((1,n_words,len(unique_tokens)))
    for i,word in enumerate(input_text.split()):
        X[0, i, unique_tokens_index[word]] = 1
        
    predictions = model.predict(X)[0]
    return np.argpartition(predictions, -n_best)[-n_best:]

In [67]:
possible_words = predict_next_word('',5)



In [68]:
possible_words

array([ 148,  741, 1169, 1224,  841], dtype=int64)

In [69]:
print([unique_tokens[idx] for idx in possible_words])

['back', 'gentleman', 'mr', 'now', 'here']


In [70]:
def generate_text(input_text, text_length, creativity=3):
    word_sequence = input_text.split()
    current = 0
    for _ in range(text_length):
        sub_sequence = " ".join(tokenizer.tokenize(" ".join(word_sequence).lower())[current:current+n_words])
        try:
            choice = unique_tokens[random.choice(predict_next_word(sub_sequence,creativity))]
        except:
            choice = random.choice(unique_tokens)
        
        word_sequence.append(choice)
        current += 1
        
    return " ".join(word_sequence)

In [71]:
generate_text('My own complete happiness',20,5)



'My own complete happiness gentleman a it an this between a could on any that room pooh could 8 round is back glancing by'

In [72]:
predict_next_word('He will have to see into this thing and he',5)



array([1641,  413,  766, 1269, 1974], dtype=int64)

In [73]:
print([unique_tokens[idx] for idx in possible_words])

['back', 'gentleman', 'mr', 'now', 'here']
