In [20]:
from numpy import array
from pickle import dump
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Embedding
import string
import numpy as np

In [21]:
# load doc into memory
def load_doc(filename):
	# open the file as read only
	file = open(filename, 'r')
	# read all text
	text = file.read()
	# close the file
	file.close()
	return text

In [22]:
# load
in_filename = 'C:/Users/Aashish Agarwal/Desktop/test.txt'
doc = load_doc(in_filename)
lines = doc.split('\n')
print(doc[:200])

The Project Gutenberg EBook of The Republic, by Plato

This eBook is for the use of anyone anywhere at no cost and with
almost no restrictions whatsoever.  You may copy it, give it away or
re-use it u


In [23]:
# turn a doc into clean tokens
def clean_doc(doc):
	# replace '--' with a space ' '
	doc = doc.replace('--', ' ')
	# split into tokens by white space
	tokens = doc.split()
	# remove punctuation from each token
	table = str.maketrans('', '', string.punctuation)
	tokens = [w.translate(table) for w in tokens]
	# remove remaining tokens that are not alphabetic
	tokens = [word for word in tokens if word.isalpha()]
	# make lower case
	tokens = [word.lower() for word in tokens]
	return tokens

In [24]:
# clean document
tokens = clean_doc(doc)
print(tokens[:200])
print('Total Tokens: %d' % len(tokens))
print('Unique Tokens: %d' % len(set(tokens)))

['the', 'project', 'gutenberg', 'ebook', 'of', 'the', 'republic', 'by', 'plato', 'this', 'ebook', 'is', 'for', 'the', 'use', 'of', 'anyone', 'anywhere', 'at', 'no', 'cost', 'and', 'with', 'almost', 'no', 'restrictions', 'whatsoever', 'you', 'may', 'copy', 'it', 'give', 'it', 'away', 'or', 'reuse', 'it', 'under', 'the', 'terms', 'of', 'the', 'project', 'gutenberg', 'license', 'included', 'with', 'this', 'ebook', 'or', 'online', 'at', 'wwwgutenbergorg', 'title', 'the', 'republic', 'author', 'plato', 'translator', 'b', 'jowett', 'posting', 'date', 'august', 'ebook', 'release', 'date', 'october', 'last', 'updated', 'june', 'language', 'english', 'start', 'of', 'this', 'project', 'gutenberg', 'ebook', 'the', 'republic', 'produced', 'by', 'sue', 'asscher', 'the', 'republic', 'by', 'plato', 'translated', 'by', 'benjamin', 'jowett', 'note', 'the', 'republic', 'by', 'plato', 'jowett', 'etext', 'introduction', 'and', 'analysis', 'the', 'republic', 'of', 'plato', 'is', 'the', 'longest', 'of', 'hi

In [25]:
# organize into sequences of tokens
length = 50 + 1
sequences = list()
for i in range(length, len(tokens)):
	# select sequence of tokens
	seq = tokens[i-length:i]
	# convert into a line
	line = ' '.join(seq)
	# store
	sequences.append(line)
print('Total Sequences: %d' % len(sequences))

Total Sequences: 17537


In [26]:
print(sequences[:5])

['the project gutenberg ebook of the republic by plato this ebook is for the use of anyone anywhere at no cost and with almost no restrictions whatsoever you may copy it give it away or reuse it under the terms of the project gutenberg license included with this ebook or online', 'project gutenberg ebook of the republic by plato this ebook is for the use of anyone anywhere at no cost and with almost no restrictions whatsoever you may copy it give it away or reuse it under the terms of the project gutenberg license included with this ebook or online at', 'gutenberg ebook of the republic by plato this ebook is for the use of anyone anywhere at no cost and with almost no restrictions whatsoever you may copy it give it away or reuse it under the terms of the project gutenberg license included with this ebook or online at wwwgutenbergorg', 'ebook of the republic by plato this ebook is for the use of anyone anywhere at no cost and with almost no restrictions whatsoever you may copy it give i

In [27]:
print(len(sequences[0].split(' ')))

51


In [28]:
lines = sequences

In [29]:
# save tokens to file, one dialog per line
def save_doc(lines, filename):
	data = '\n'.join(lines)
	file = open(filename, 'w')
	file.write(data)
	file.close()

In [30]:
# save sequences to file
out_filename = 'test_sequences.txt'
save_doc(sequences, out_filename)

In [31]:
# integer encode sequences of words
tokenizer = Tokenizer()
tokenizer.fit_on_texts(lines)
sequences = tokenizer.texts_to_sequences(lines)

In [32]:
sequences[0]

[1,
 1302,
 1300,
 610,
 2,
 1,
 65,
 14,
 33,
 26,
 610,
 5,
 19,
 1,
 137,
 2,
 1304,
 3117,
 22,
 58,
 3116,
 3,
 21,
 818,
 58,
 3113,
 3112,
 55,
 39,
 3111,
 82,
 318,
 82,
 455,
 8,
 3108,
 82,
 157,
 1,
 454,
 2,
 1,
 1302,
 1300,
 3107,
 1301,
 21,
 26,
 610,
 8,
 1306]

In [33]:
# vocabulary size
vocab_size = len(tokenizer.word_index) + 1
print(tokenizer.word_index)

{'the': 1, 'of': 2, 'and': 3, 'to': 4, 'is': 5, 'in': 6, 'a': 7, 'or': 8, 'which': 9, 'he': 10, 'that': 11, 'be': 12, 'not': 13, 'by': 14, 'are': 15, 'as': 16, 'his': 17, 'but': 18, 'for': 19, 'have': 20, 'with': 21, 'at': 22, 'justice': 23, 'has': 24, 'we': 25, 'this': 26, 'they': 27, 'who': 28, 'socrates': 29, 'will': 30, 'more': 31, 'only': 32, 'plato': 33, 'their': 34, 'state': 35, 'them': 36, 'an': 37, 'from': 38, 'may': 39, 'him': 40, 'on': 41, 'all': 42, 'was': 43, 'first': 44, 'been': 45, 'than': 46, 'one': 47, 'good': 48, 'there': 49, 'what': 50, 'man': 51, 'into': 52, 'other': 53, 'when': 54, 'you': 55, 'any': 56, 'men': 57, 'no': 58, 'must': 59, 'such': 60, 'life': 61, 'like': 62, 'just': 63, 'glaucon': 64, 'republic': 65, 'would': 66, 'i': 67, 'evil': 68, 'do': 69, 'thrasymachus': 70, 'then': 71, 'some': 72, 'another': 73, 'our': 74, 'were': 75, 'if': 76, 'nor': 77, 'world': 78, 'also': 79, 'argument': 80, 'himself': 81, 'it': 82, 'nature': 83, 'truth': 84, 'had': 85, 'book

In [34]:
def one_hot(word, word_index):
    encoded = np.zeros(len(word_index) + 1)
    try:
        i = word_index[word]
    except KeyError:
        i = len(encoded) - 1
    encoded[i] = 1
    return encoded

In [35]:
#### seq_one_hot = np.array([[one_hot(word, tokenizer.word_index) for word in sequence] for sequence in sequences])

In [36]:
#### seq_one_hot[0].shape

In [38]:
# separate into input and output
sequences = array(sequences)
print(sequences.shape)
print(sequences[0])
X, y = sequences[:,:-1], sequences[:,-1]
####X, y = seq_one_hot[:,:-1,:], seq_one_hot[:,-1,:]
y = to_categorical(y, num_classes=vocab_size)
seq_length = X.shape[1]

(17537, 51)
[   1 1302 1300  610    2    1   65   14   33   26  610    5   19    1
  137    2 1304 3117   22   58 3116    3   21  818   58 3113 3112   55
   39 3111   82  318   82  455    8 3108   82  157    1  454    2    1
 1302 1300 3107 1301   21   26  610    8 1306]


In [39]:
# define model
model = Sequential()
model.add(Embedding(vocab_size, 50, input_length=seq_length))
model.add(LSTM(100, return_sequences=True))
#model.add(LSTM(100, input_shape=(50,125), return_sequences=True))
model.add(LSTM(100))
model.add(Dense(100, activation='relu'))
model.add(Dense(vocab_size, activation='softmax'))
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 50, 50)            155900    
_________________________________________________________________
lstm_1 (LSTM)                (None, 50, 100)           60400     
_________________________________________________________________
lstm_2 (LSTM)                (None, 100)               80400     
_________________________________________________________________
dense_1 (Dense)              (None, 100)               10100     
_________________________________________________________________
dense_2 (Dense)              (None, 3118)              314918    
Total params: 621,718
Trainable params: 621,718
Non-trainable params: 0
_________________________________________________________________
None


In [40]:
# compile model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# fit model
model.fit(X, y, batch_size=128, epochs=100)

# save the model to file
model.save('model.h5')
# save the tokenizer
dump(tokenizer, open('tokenizer.pkl', 'wb'))

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100

KeyboardInterrupt: 

In [141]:
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 50, 50)            6250      
_________________________________________________________________
lstm_9 (LSTM)                (None, 50, 100)           60400     
_________________________________________________________________
lstm_10 (LSTM)               (None, 100)               80400     
_________________________________________________________________
dense_9 (Dense)              (None, 100)               10100     
_________________________________________________________________
dense_10 (Dense)             (None, 125)               12625     
Total params: 169,775
Trainable params: 169,775
Non-trainable params: 0
_________________________________________________________________
None


In [41]:
# generate a sequence from a language model
def generate_seq(model, tokenizer, seq_length, seed_text, n_words):
	result = list()
	in_text = seed_text
	# generate a fixed number of words
	for _ in range(n_words):
		# encode the text as integer
		encoded = tokenizer.texts_to_sequences([in_text])[0]
		# truncate sequences to a fixed length
		encoded = pad_sequences([encoded], maxlen=seq_length, truncating='pre')
		# predict probabilities for each word
		yhat = model.predict_classes(encoded, verbose=0)
		# map predicted word index to word
		out_word = ''
		for word, index in tokenizer.word_index.items():
			if index == yhat:
				out_word = word
				break
		# append to input
		in_text += ' ' + out_word
		result.append(out_word)
	return ' '.join(result)

In [42]:
# load cleaned text sequences
in_filename = 'C:/Users/Aashish Agarwal/Desktop/test.txt'
doc = load_doc(in_filename)
lines = doc.split('\n')
seq_length = len(lines[0].split()) - 1

In [43]:
# load the model
from keras.models import load_model
model = load_model('model.h5')

In [44]:
# load the tokenizer
from random import randint
from pickle import load
tokenizer = load(open('tokenizer.pkl', 'rb'))

In [49]:
# select a seed text
seed_text = lines[randint(0,len(lines))]
#####seed_text = "Apart from counting words and characters, our online editor can help you to improve word choice and writing style, and, optionally, help you to detect grammar mistakes and plagiarism. To check word count, simply place your cursor into the text box above and start typing. You'll see the number of"
print(seed_text + '\n')

words are strictly taken, the ruler and the shepherd look only to the



In [50]:
# generate new text
from keras.preprocessing.sequence import pad_sequences
generated = generate_seq(model, tokenizer, 50, seed_text, 50)
print(generated)

are are your already already already already are are are are are far far far wrong you you you you you you you you you you you will will will certainly certainly certainly glaucon glaucon glaucon and and appeared appeared polemarchus and and and and and and adeimantus adeimantus adeimantus


In [116]:
pred = model.predict(seq_one_hot[:1,:-1,:], 1)

In [125]:
new_words = 'Apart from counting words and characters, our online editor can help you to improve word choice and writing style, and, optionally, help you to detect grammar mistakes and plagiarism. To check word count, simply place your cursor into the text box above and start typing. You'll see the number of"

ValueError: Error when checking input: expected lstm_3_input to have 3 dimensions, but got array with shape (125, 1)

In [122]:
np.argmax(pred[0])

124

In [123]:
seq_one_hot[:1,-1,:]

array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.]])