In [32]:
def read_file(filepath):
    with open(filepath) as f:
        str_text = f.read()
    return str_text

In [33]:
import spacy

In [34]:
nlp = spacy.load('en', disable=['parser', 'tagger', 'ner'])

In [35]:
nlp.max_length = 1198623

In [36]:
def separate_punc(doc_text):
    return [token.text.lower() for token in nlp(doc_text) if token.text not in '\n\n \n\n\n!"-#$%&()--.*+,-/:;<=>?@[\\]^_`{|}~\t\n ']

In [37]:
d = read_file('/content/moby_dick_four_chapters.txt')

In [38]:
tokens = separate_punc(d)

In [39]:
len(tokens)

11338

In [40]:
train_len = 25 + 1
text_sequences = []

for i in range(train_len, len(tokens)):
    seq = tokens[i-train_len:i]
    text_sequences.append(seq)

In [41]:
from keras.preprocessing.text import Tokenizer

In [42]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(text_sequences)

In [43]:
sequences = tokenizer.texts_to_sequences(text_sequences)

In [44]:
for i in sequences[0]:
    print(f"{i} : {tokenizer.index_word[i]}")

956 : call
14 : me
263 : ishmael
51 : some
261 : years
408 : ago
87 : never
219 : mind
129 : how
111 : long
954 : precisely
260 : having
50 : little
43 : or
38 : no
315 : money
7 : in
23 : my
546 : purse
3 : and
150 : nothing
259 : particular
6 : to
2712 : interest
14 : me
24 : on


In [45]:
vocab_size = len(tokenizer.word_counts)

In [46]:
vocab_size

2717

In [47]:
import numpy as np
sequences = np.array(sequences)
sequences

array([[ 956,   14,  263, ..., 2712,   14,   24],
       [  14,  263,   51, ...,   14,   24,  957],
       [ 263,   51,  261, ...,   24,  957,    5],
       ...,
       [ 952,   12,  166, ...,  262,   53,    2],
       [  12,  166, 2711, ...,   53,    2, 2717],
       [ 166, 2711,    3, ...,    2, 2717,   26]])

In [48]:
from keras.utils import to_categorical
X = sequences[:, :-1]
y = sequences[:, -1]

In [49]:
y = to_categorical(y, num_classes=vocab_size+1)

In [50]:
seq_len = X.shape[1]

In [51]:
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding 

In [54]:
def create_model(vocab_size, seq_len):
    model = Sequential()
    model.add(Embedding(vocab_size, seq_len, input_length=seq_len))
    model.add(LSTM(50, return_sequences=True))
    model.add(LSTM(50))
    model.add(Dense(50, activation='relu'))
    model.add(Dense(vocab_size, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.summary()
    return model

In [55]:
model = create_model(vocab_size+1, seq_len)

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 25, 25)            67950     
_________________________________________________________________
lstm_1 (LSTM)                (None, 25, 50)            15200     
_________________________________________________________________
lstm_2 (LSTM)                (None, 50)                20200     
_________________________________________________________________
dense_1 (Dense)              (None, 50)                2550      
_________________________________________________________________
dense_2 (Dense)              (None, 2718)              138618    
Total params: 244,518
Trainable params: 244,518
Non-trainable params: 0
_________________________________________________________________


In [56]:
from pickle import dump, load

In [58]:
model.fit(X, y, batch_size=128, epochs=2, verbose=1)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 1/2
Epoch 2/2


<keras.callbacks.callbacks.History at 0x7f91d222e400>

In [59]:
model.save('mobydick_model.h5')

In [60]:
dump(tokenizer, open('simple_tokenizer', 'wb'))

In [61]:
from keras.preprocessing.sequence import pad_sequences

In [78]:
def generate_text(model, tokenizer, seq_len, seed_text, num_gen_words):
    output_text = []
    input_text = seed_text
    for i in range(num_gen_words):
        encoded_text = tokenizer.texts_to_sequences([input_text])[0]
        pad_encoded = pad_sequences([encoded_text], maxlen=seq_len, truncating='pre')
        pred_word_ind = model.predict_classes(pad_encoded, verbose=0)[0]
        pred_word = tokenizer.index_word[pred_word_ind]
        input_text += ' ' + pred_word
        output_text.append(pred_word)
    return ' '.join(output_text)

In [79]:
text_sequences[0]

['call',
 'me',
 'ishmael',
 'some',
 'years',
 'ago',
 'never',
 'mind',
 'how',
 'long',
 'precisely',
 'having',
 'little',
 'or',
 'no',
 'money',
 'in',
 'my',
 'purse',
 'and',
 'nothing',
 'particular',
 'to',
 'interest',
 'me',
 'on']

In [80]:
import random
random.seed(101)
random_pick = random.randint(0, len(text_sequences))
random_seed_text = text_sequences[random_pick]
random_seed_text

['thought',
 'i',
 'to',
 'myself',
 'the',
 'man',
 "'s",
 'a',
 'human',
 'being',
 'just',
 'as',
 'i',
 'am',
 'he',
 'has',
 'just',
 'as',
 'much',
 'reason',
 'to',
 'fear',
 'me',
 'as',
 'i',
 'have']

In [81]:
seed_text = ' '.join(random_seed_text)
seed_text

"thought i to myself the man 's a human being just as i am he has just as much reason to fear me as i have"

In [82]:
generate_text(model, tokenizer, seq_len, seed_text=seed_text, num_gen_words=25)

'the the the the the the the the the the the the the the the the the the the the the the the the the'

In [83]:
from keras.models import load_model

In [85]:
model = load_model('/content/epochBIG.h5')
tokenizer = load(open('/content/epochBIG', 'rb'))

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


In [87]:
generate_text(model, tokenizer, seq_len, seed_text=seed_text, num_gen_words=25)

"to be seen there was no bad olfactories my own letter was cheerily listening over his hearers who 's more can go have a wearing"