In [1]:
import pandas as pd
import numpy as np
from tensorflow.keras.callbacks import EarlyStopping

In [2]:
def read_file(filepath):
    with open(filepath) as f:
        str_text = f.read()
    
    return str_text

In [3]:
import spacy

In [4]:
nlp = spacy.load('en_core_web_sm',disable=['parser','tagger','ner'])



In [5]:
nlp.max_length = 11198623

In [6]:
def seperate_punc(doc_text):
    return[token.text.lower() for token in nlp(doc_text) if token.text not in '\n\n \n\n\n!"-#$%&()--.*+,-/:;<=>?@[\\]^_`{|}~\t\n ']

In [7]:
d = read_file(r'C:\Users\bryan\Desktop\Computer Stuff\tensor_flow_class\TF_2_Notebooks_and_Data\06-NLP-and-Text-Data\melville-moby_dick.txt')

In [8]:
tokens = seperate_punc(d)

['chapter',
 '1',
 'loomings',
 'call',
 'me',
 'ishmael',
 'some',
 'years',
 'ago',
 'never',
 'mind',
 'how',
 'long',
 'precisely',
 'having',
 'little',
 'or',
 'no',
 'money',
 'in',
 'my',
 'purse',
 'and',
 'nothing',
 'particular',
 'to',
 'interest',
 'me',
 'on',
 'shore',
 'i',
 'thought',
 'i',
 'would',
 'sail',
 'about',
 'a',
 'little',
 'and',
 'see',
 'the',
 'watery',
 'part',
 'of',
 'the',
 'world',
 'it',
 'is',
 'a',
 'way',
 'i',
 'have',
 'of',
 'driving',
 'off',
 'the',
 'spleen',
 'and',
 'regulating',
 'the',
 'circulation',
 'whenever',
 'i',
 'find',
 'myself',
 'growing',
 'grim',
 'about',
 'the',
 'mouth',
 'whenever',
 'it',
 'is',
 'a',
 'damp',
 'drizzly',
 'november',
 'in',
 'my',
 'soul',
 'whenever',
 'i',
 'find',
 'myself',
 'involuntarily',
 'pausing',
 'before',
 'coffin',
 'warehouses',
 'and',
 'bringing',
 'up',
 'the',
 'rear',
 'of',
 'every',
 'funeral',
 'i',
 'meet',
 'and',
 'especially',
 'whenever',
 'my',
 'hypos',
 'get',
 'such

In [9]:
len(tokens)

214712

In [10]:
train_len = 25 + 1

text_sequences = []

for i in range(train_len,len(tokens)):
    seq = tokens[i-train_len:i]
    
    text_sequences.append(seq)

In [11]:
' '.join(text_sequences[0])

'chapter 1 loomings call me ishmael some years ago never mind how long precisely having little or no money in my purse and nothing particular to'

In [12]:
' '.join(text_sequences[1])

'1 loomings call me ishmael some years ago never mind how long precisely having little or no money in my purse and nothing particular to interest'

In [13]:
from keras.preprocessing.text import Tokenizer

Using TensorFlow backend.


In [14]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(text_sequences)

In [15]:
sequences = tokenizer.texts_to_sequences(text_sequences)

In [17]:
for i in sequences[0]:
    print(f"{i} : {tokenizer.index_word[i]}")
    
    #tokenizer.index_word

158 : chapter
9447 : 1
17527 : loomings
402 : call
42 : me
1043 : ishmael
43 : some
247 : years
659 : ago
140 : never
296 : mind
116 : how
82 : long
787 : precisely
347 : having
113 : little
36 : or
50 : no
1788 : money
6 : in
49 : my
3028 : purse
3 : and
218 : nothing
442 : particular
5 : to


In [19]:
vocabulary_size = len(tokenizer.word_counts)

In [20]:
vocabulary_size

17527

In [21]:
sequences = np.array(sequences)

In [22]:
sequences

array([[  158,  9447, 17527, ...,   218,   442,     5],
       [ 9447, 17527,   402, ...,   442,     5,  1165],
       [17527,   402,    42, ...,     5,  1165,    42],
       ...,
       [  240,   937,   351, ...,  1419,  1313,    74],
       [  937,   351,  1418, ...,  1313,    74,   219],
       [  351,  1418,     3, ...,    74,   219,   222]])

In [23]:
from keras.utils import to_categorical

In [24]:
X = sequences[:,:-1]

In [25]:
y = sequences[:,-1]
y

array([   5, 1165,   42, ...,   74,  219,  222])

In [26]:
y = to_categorical(y,num_classes=vocabulary_size+1)

In [27]:
seq_len = X.shape[1]
seq_len

25

In [28]:
X.shape

(214686, 25)

In [29]:
from keras.models import Sequential

In [30]:
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding

In [31]:
def create_model(vocabulary_size, seq_len):
    model = Sequential()
    model.add(Embedding(vocabulary_size, seq_len, input_length=seq_len))
    model.add(LSTM(100,return_sequences=True))
    model.add(LSTM(50))
    model.add(Dense(100, activation='relu'))
    model.add(Dense(vocabulary_size, activation='softmax'))
    model.compile(loss='categorical_crossentropy',optimizer='adam', metrics=['accuracy'])
    model.summary()
    return model

In [32]:
early_stop = EarlyStopping(monitor='val_loss',mode='min', verbose=1, patience=25)

In [33]:
model = create_model(vocabulary_size+1, seq_len)

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 25, 25)            438200    
_________________________________________________________________
lstm_1 (LSTM)                (None, 25, 100)           50400     
_________________________________________________________________
lstm_2 (LSTM)                (None, 50)                30200     
_________________________________________________________________
dense_1 (Dense)              (None, 100)               5100      
_________________________________________________________________
dense_2 (Dense)              (None, 17528)             1770328   
Total params: 2,294,228
Trainable params: 2,294,228
Non-trainable params: 0
_________________________________________________________________


In [34]:
from pickle import dump,load

In [None]:
model.fit(X,y,batch_size=128, epochs=350, verbose=1, callbacks=[early_stop])

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 1/350
Epoch 2/350
Epoch 3/350
Epoch 4/350
Epoch 5/350
Epoch 6/350
Epoch 7/350
Epoch 8/350
Epoch 9/350
Epoch 10/350
Epoch 11/350
Epoch 12/350
Epoch 13/350
Epoch 14/350
Epoch 15/350
Epoch 16/350
Epoch 17/350
Epoch 18/350
Epoch 19/350
Epoch 20/350
Epoch 21/350
Epoch 22/350
Epoch 23/350
Epoch 24/350
Epoch 25/350
Epoch 26/350
Epoch 27/350
Epoch 28/350
Epoch 29/350
Epoch 30/350
Epoch 31/350
Epoch 32/350
Epoch 33/350
Epoch 34/350
Epoch 35/350
Epoch 36/350
Epoch 37/350
Epoch 38/350
Epoch 39/350
Epoch 40/350
Epoch 41/350
Epoch 42/350
Epoch 43/350
Epoch 44/350
Epoch 45/350
Epoch 46/350
Epoch 47/350
Epoch 48/350
Epoch 49/350
Epoch 50/350
Epoch 51/350
Epoch 52/350
Epoch 53/350
Epoch 54/350
Epoch 55/350
Epoch 56/350
Epoch 57/350
Epoch 58/350
Epoch 59/350
Epoch 60/350
Epoch 61/350
Epoch 62/350
Epoch 63/350
Epoch 64/350
Epoch 65/350
Epoch 66/350
Epoch 67/350
Epoch 68/350
Epoch 69/350
Epoch 70/350
Epoch 71/350
Epoch 72/350
Epoch 73/350

In [None]:
model.save('moby_dick.h5')

In [None]:
dump(tokenizer,open('my_tokenizer','wb'))

In [None]:
from keras.preprocessing import sequence

In [None]:
def generate_text(model, tokenizer, seq_len, seed_text, num_gen_words):
    
    output_text = []
    
    input_text = seed_text
    
    for i in range(num_gen_words):
        
        encoded_text = tokenizer.texts_to_sequences([input_text])[0]
        
        pad_encoded = pad_sequences([encoded_text],maxlen=seq_len, truncating='pre')
        
        pred_word_ind = model.predict_classes(pad_encoded, verbose=0)[0]
        
        pred_word = tokenizer.index_word[pred_word_ind]
        
        input_text += ' '+pred_word
        
        output_text.append(pred_word)
    
    return ' '.join(output_text)

In [None]:
text_sequences[0]

In [None]:
import random

random.seed(101)
random_pick = random.randint(0,len(text_sequences))

In [None]:
random_seed_text = text_sequences[random_pick]

In [None]:
random_seed_text

In [None]:
seed_text = ' '.join(random_seed_text)

In [None]:
seed_text

In [None]:
generate_text(model, tokenizer, seq_len, seed_text=seed_text, num_gen_words=100)