In [1]:
import spacy
import tensorflow
from tensorflow import keras
from keras.models import Sequential
from keras.layers import Dense,LSTM,Embedding
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
from keras.preprocessing.text import Tokenizer
import numpy as np

INFO:tensorflow:Enabling eager execution
INFO:tensorflow:Enabling v2 tensorshape
INFO:tensorflow:Enabling resource variables
INFO:tensorflow:Enabling tensor equality
INFO:tensorflow:Enabling control flow v2


In [2]:
nlp = spacy.load('en_core_web_lg',disable=['parser','tagger','ner','lemmatizer'])
nlp.max_length = 1198623

In [3]:
def read_file(filepath):
    with open(filepath) as f:
        return f.read()
def separate_punct(doc_text):
    return [token.text for token in nlp(doc_text) if (token.text not in "#$\n\n\n!#$%&'()*+, --..../\":;<=>?@[\]^_` {|}~")]

In [4]:
moby_dick = read_file('moby_dick_four_chapters.txt')
#melville = read_file('melville-moby_dick.txt')
tokens = separate_punct(moby_dick)
len(tokens)

11326

In [5]:
#pass 25 words predict #26
train_len = 25 + 1
text_sequences = []
for i in range(train_len,len(tokens)):
    seq = tokens[i-train_len:i]
    text_sequences.append(seq)
' '.join(text_sequences[1])

'me Ishmael Some years ago never mind how long precisely having little or no money in my purse and nothing particular to interest me on shore'

In [6]:
' '.join(text_sequences[0])

'Call me Ishmael Some years ago never mind how long precisely having little or no money in my purse and nothing particular to interest me on'

In [7]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(text_sequences)
text = tokenizer.texts_to_sequences(text_sequences)

In [8]:
#This is used to assign unique id to each token
tokenizer.index_word
len(tokenizer.index_word)

2717

In [9]:
#This is used to count the occurance of each word
tokenizer.word_counts
len(tokenizer.word_counts)

2717

In [10]:
#This is used for the list which word got id 1,2,3,4 etc
tokenizer.word_index
print(len(tokenizer.word_index))

2717


In [11]:
vocabulory_size = len(tokenizer.word_counts)
sequences = np.array(text)
sequences

array([[ 955,   14,  262, ..., 2712,   14,   24],
       [  14,  262,   51, ...,   14,   24,  956],
       [ 262,   51,  260, ...,   24,  956,    5],
       ...,
       [ 951,   12,  165, ...,  261,   53,    2],
       [  12,  165, 2711, ...,   53,    2, 2717],
       [ 165, 2711,    3, ...,    2, 2717,   26]])

In [12]:
X = sequences[:,:-1]
Y = sequences[:,-1]
Y = to_categorical(Y,num_classes=vocabulory_size+1)
seq_len = X.shape[1]

In [13]:
@tensorflow.autograph.experimental.do_not_convert
def create_model(vocabulary_size,seq_len):
    model = Sequential()
    model.add(Embedding(vocabulary_size,seq_len,input_length=seq_len))
    model.add(LSTM(50,return_sequences=True))
    model.add(LSTM(25))
    model.add(Dense(50,activation='relu'))
    model.add(Dense(vocabulary_size,activation='softmax'))
    
    model.compile(loss='categorical_crossentropy',optimizer='adam',metrics=['accuracy'])
    
    model.summary()
    
    return model

In [14]:
model = create_model(vocabulory_size+1,seq_len)

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 25, 25)            67950     
_________________________________________________________________
lstm (LSTM)                  (None, 25, 50)            15200     
_________________________________________________________________
lstm_1 (LSTM)                (None, 25)                7600      
_________________________________________________________________
dense (Dense)                (None, 50)                1300      
_________________________________________________________________
dense_1 (Dense)              (None, 2718)              138618    
Total params: 230,668
Trainable params: 230,668
Non-trainable params: 0
_________________________________________________________________


In [18]:
model.fit(X,Y,epochs=500,verbose=1,batch_size=128)

Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/500
Epoch 31/500
Epoch 32/500
Epoch 33/500
Epoch 34/500
Epoch 35/500
Epoch 36/500
Epoch 37/500
Epoch 38/500
Epoch 39/500
Epoch 40/500
Epoch 41/500
Epoch 42/500
Epoch 43/500
Epoch 44/500
Epoch 45/500
Epoch 46/500
Epoch 47/500
Epoch 48/500
Epoch 49/500
Epoch 50/500
Epoch 51/500
Epoch 52/500
Epoch 53/500
Epoch 54/500
Epoch 55/500
Epoch 56/500
Epoch 57/500
Epoch 58/500
Epoch 59/500
Epoch 60/500
Epoch 61/500
Epoch 62/500
Epoch 63/500
Epoch 64/500
Epoch 65/500
Epoch 66/500
Epoch 67/500
Epoch 68/500
Epoch 69/500
Epoch 70/500
Epoch 71/500
Epoch 72/500
Epoch 73/500
Epoch 74/500
Epoch 75/500
Epoch 76/500
Epoch 77/500
Epoch 78

<keras.callbacks.History at 0x1b40f9301c0>

In [19]:
model.save('my_mobidick_model.h5')

In [21]:
import pickle
from pickle import load,dump
dump(tokenizer,open('my_simpletokenizer','wb'))

In [22]:
from keras.preprocessing.sequence import pad_sequences

In [23]:
def generate_text(model,tokenizer,seq_len,seed_text,num_gen_words):
    output_text = []
    input_text = seed_text
    for i in range(num_gen_words):
        encoded_text = tokenizer.texts_to_sequences([input_text])[0]
        pad_encoded = pad_sequences([encoded_text],maxlen=seq_len,truncating='pre')
        pred_word_ind = model.predict_classes(pad_encoded,verbose=0)[0]
        pred_word = tokenizer.index_word[pred_word_ind]
        input_text += ' '+pred_word
        output_text.append(pred_word)
    return ' '.join(output_text)

In [46]:
import random
random_pick = random.randint(0,len(text_sequences))
random_seed_text = text_sequences[random_pick]
random_seed_text

['Can',
 'he',
 'warm',
 'his',
 'blue',
 'hands',
 'by',
 'holding',
 'them',
 'up',
 'to',
 'the',
 'grand',
 'northern',
 'lights',
 'Would',
 'not',
 'Lazarus',
 'rather',
 'be',
 'in',
 'Sumatra',
 'than',
 'here',
 'Would',
 'he']

In [47]:
seed_text = ' '.join(random_seed_text)

In [48]:
mp = keras.models.load_model('my_mobidick_model.h5')

In [49]:
generate_text(mp,tokenizer,seq_len,seed_text=seed_text,num_gen_words=25)

'not far rather lay him down lengthwise along the line of the equator yea ye gods go down to the fiery pit itself in order'

In [50]:
generate_text(model,tokenizer,seq_len,seed_text=seed_text,num_gen_words=25)

'not far rather lay him down lengthwise along the line of the equator yea ye gods go down to the fiery pit itself in order'