In [73]:
import spacy
import pandas as pd
import numpy as np
import keras
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding
from keras.utils import to_categorical
from pickle import dump,load
from random import randint
from pickle import load
from keras.models import load_model
from keras.preprocessing.sequence import pad_sequences

In [8]:
# Load the spaCy English pipeline with the parser, tagger and NER components
# disabled; read_file below only uses the text of each token.
nlp = spacy.load('en',disable=['parser', 'tagger','ner'])
# The default limit is 1,000,000 characters; raise it so that a longer text
# can be passed to nlp() in a single call.
nlp.max_length = 1200000

# LSTM

## Text Processing

In [79]:
# read a text file and turn it into a list of lowercase tokens
def read_file(filepath):
    """Read a text file and return its tokens as a list of lowercase strings.

    The text is tokenized with the module-level spaCy pipeline `nlp`.
    A token is dropped when its text is a substring of the filter string
    below (punctuation and whitespace runs); every other token is kept,
    lowercased, in document order.
    """
    with open(filepath) as handle:
        raw_text = handle.read()

    # Membership is tested against this string, i.e. it is a substring check.
    filtered_str = '\n\n \n\n\n!"-#$%&()--.*+,-/:;<=>?@[\\]^_`{|}~\t\n '

    kept_tokens = []
    for token in nlp(raw_text):
        if token.text in filtered_str:
            continue
        kept_tokens.append(token.text.lower())
    return kept_tokens

def create_train_set(train_len, inp_tokens):
    """Cut a token sequence into overlapping windows of train_len + 1 tokens.

    Every window holds train_len context tokens followed by the token that
    comes right after them; consecutive windows are shifted by one position.

    train_len : number of context tokens per window
    inp_tokens : sliceable sequence of tokens

    Returns a list of windows. The list is empty when inp_tokens has no more
    than train_len elements.
    """
    window_size = train_len + 1
    n_windows = len(inp_tokens) - train_len
    return [inp_tokens[start : start + window_size] for start in range(n_windows)]

In [80]:
# Tokenize the text file; the cell displays the number of tokens kept.
tokens = read_file('./data/melville-moby_dick.txt')
len(tokens)

214711

In [81]:
# Slide a window over the tokens: 25 context words plus the word that follows,
# i.e. 26 tokens per training sequence. The cell displays the resulting shape.
text_sequences = create_train_set(25, tokens)
np.array(text_sequences).shape

(214686, 26)

In [82]:
# Sanity check: show one training sequence as readable text.
' '.join(text_sequences[2])

'loomings call me ishmael some years ago never mind how long precisely having little or no money in my purse and nothing particular to interest me'

In [83]:
# Map every word to an integer id and encode each training sequence as a list of ids
tokenizer = Tokenizer()
tokenizer.fit_on_texts(text_sequences)
sequences = tokenizer.texts_to_sequences(text_sequences)
# Peek at the first five ids of the first sequence
sequences[0][:5]

[158, 9447, 17527, 402, 42]

In [84]:
# Look up the word behind a single id, then display the whole id -> word mapping
# (ids start at 1; the most frequent words get the lowest ids).
print(tokenizer.index_word[158])
tokenizer.index_word

chapter


{1: 'the',
 2: 'of',
 3: 'and',
 4: 'a',
 5: 'to',
 6: 'in',
 7: 'that',
 8: 'his',
 9: 'it',
 10: 'i',
 11: 'he',
 12: 'but',
 13: "'s",
 14: 'as',
 15: 'with',
 16: 'is',
 17: 'was',
 18: 'for',
 19: 'all',
 20: 'this',
 21: 'at',
 22: 'not',
 23: 'by',
 24: 'whale',
 25: 'from',
 26: 'so',
 27: 'him',
 28: 'on',
 29: 'be',
 30: 'one',
 31: 'you',
 32: 'there',
 33: 'now',
 34: 'had',
 35: 'have',
 36: 'or',
 37: 'were',
 38: 'they',
 39: 'like',
 40: 'which',
 41: 'then',
 42: 'me',
 43: 'some',
 44: 'their',
 45: 'what',
 46: 'when',
 47: 'an',
 48: 'are',
 49: 'my',
 50: 'no',
 51: 'upon',
 52: 'out',
 53: 'man',
 54: 'into',
 55: 'ship',
 56: 'up',
 57: 'more',
 58: 'ahab',
 59: 'if',
 60: 'them',
 61: 'old',
 62: 'we',
 63: 'sea',
 64: 'would',
 65: "'",
 66: 'ye',
 67: 'do',
 68: 'other',
 69: 'been',
 70: 'over',
 71: 'these',
 72: 'will',
 73: 'though',
 74: 'only',
 75: 'its',
 76: 'down',
 77: 'such',
 78: 'who',
 79: 'yet',
 80: 'head',
 81: 'time',
 82: 'long',
 83: 'boat

In [85]:
# Decode the first encoded sequence id by id to check it maps back to the original words
for i in sequences[0]:
    print(f'{i} : {tokenizer.index_word[i]}')

158 : chapter
9447 : 1
17527 : loomings
402 : call
42 : me
1043 : ishmael
43 : some
247 : years
659 : ago
140 : never
296 : mind
116 : how
82 : long
788 : precisely
347 : having
113 : little
36 : or
50 : no
1788 : money
6 : in
49 : my
3028 : purse
3 : and
218 : nothing
442 : particular
5 : to


In [86]:
# Vocabulary size, followed by the per-word counts collected by fit_on_texts.
# NOTE(review): the tokenizer was fit on the overlapping training windows, so a
# corpus token is counted once per window that contains it (up to 26 times) —
# these are not raw corpus frequencies.
print(len(tokenizer.word_counts))
tokenizer.word_counts

17528


OrderedDict([('chapter', 4447),
             ('1', 28),
             ('loomings', 3),
             ('call', 1382),
             ('me', 16096),
             ('ishmael', 500),
             ('some', 15789),
             ('years', 2400),
             ('ago', 815),
             ('never', 5262),
             ('mind', 2039),
             ('how', 6330),
             ('long', 8567),
             ('precisely', 690),
             ('having', 1679),
             ('little', 6412),
             ('or', 17879),
             ('no', 14916),
             ('money', 305),
             ('in', 105800),
             ('my', 15231),
             ('purse', 178),
             ('and', 164030),
             ('nothing', 2936),
             ('particular', 1273),
             ('to', 117832),
             ('interest', 442),
             ('on', 26910),
             ('shore', 572),
             ('i', 53430),
             ('thought', 3874),
             ('would', 11232),
             ('sail', 2522),
             ('about', 

## Model Training

In [87]:
def create_model(vocabulary_size, seq_len):
    """Build and compile the next-word prediction network.

    Layers, in order: a 25-dimensional embedding, two stacked LSTMs with
    150 units each (the first one returns the full sequence so the second
    can consume it), a 150-unit ReLU dense layer and a softmax over the
    vocabulary.

    vocabulary_size : number of embedding rows and of output classes
    seq_len : number of word ids in each input sequence

    Prints the model summary and returns the compiled model.
    """
    layers = [
        Embedding(vocabulary_size, 25, input_length=seq_len),
        LSTM(150, return_sequences=True),
        LSTM(150),
        Dense(150, activation='relu'),
        Dense(vocabulary_size, activation='softmax'),
    ]
    network = Sequential(layers)
    network.compile(
        optimizer='adam',
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )
    network.summary()
    return network

In [88]:
# convert the list of encoded sequences into a 2-D numpy array
sequences = np.array(sequences)
# every id but the last is the model input; the last id is the word to predict
X = sequences[:, :-1]
y = sequences[:, -1]
# one-hot encode the targets; +1 because tokenizer ids start at 1, so the
# largest id equals len(tokenizer.word_counts)
# NOTE(review): this builds a dense (n_sequences, vocabulary_size + 1) matrix,
# which is very large for this corpus — confirm it fits in memory
y = to_categorical(y, num_classes = len(tokenizer.word_counts) + 1)

In [None]:
# Build the network with one class per tokenizer id (0..len(word_counts)) and
# train it on the (context ids, next-word one-hot) pairs.
model = create_model(len(tokenizer.word_counts) + 1, X.shape[1])
model.fit(X, y, batch_size=128, epochs=300,verbose=1)

In [None]:
# save the model to file
model.save('./model/epochBIG.h5')
# save the tokenizer; the context manager guarantees the file handle is
# flushed and closed even if pickling fails
with open('epochBIG', 'wb') as tokenizer_file:
    dump(tokenizer, tokenizer_file)

In [89]:
# Load the previously saved network from disk.
# NOTE(review): the pickled tokenizer is not reloaded here; the cells below use
# the in-memory `tokenizer` — confirm it matches the one the model was trained with.
model = load_model('./model/epochBIG.h5')

## Generating New Text

In [90]:
def generate_text(model, tokenizer, seq_len, seed_text, num_gen_words):
    '''
    Generate num_gen_words words that continue seed_text, one word at a time.

    INPUTS:
    model : model that was trained on text data
    tokenizer : tokenizer that was fit on text data
    seq_len : length of training sequence
    seed_text : raw string text to serve as the seed
    num_gen_words : number of words to be generated by model

    RETURNS:
    the generated words joined by single spaces (the seed is not included)
    '''
    output_text = []
    input_text = seed_text

    # Create num_gen_words
    for _ in range(num_gen_words):

        # Encode the running text (seed plus everything generated so far) as word ids
        encoded_text = tokenizer.texts_to_sequences([input_text])[0]

        # Force the input to exactly seq_len ids: truncating='pre' drops ids from
        # the front so the last seq_len ids are kept ('post' would keep the first)
        pad_encoded = pad_sequences([encoded_text], maxlen=seq_len, truncating='pre')

        # Predict class probabilities and take the most likely word id.
        # Sequential.predict_classes was removed in TensorFlow 2.6; argmax over
        # predict() is the equivalent for a softmax output and also works on
        # older Keras versions.
        pred_probs = model.predict(pad_encoded, verbose=0)
        pred_word_ind = np.argmax(pred_probs, axis=-1)[0]

        # Grab word
        pred_word = tokenizer.index_word[pred_word_ind]

        # Update the sequence of input text (shifting one over with the new word)
        input_text += ' ' + pred_word
        output_text.append(pred_word)

    # Make it look like a sentence.
    return ' '.join(output_text)

In [92]:
# Pick one training sequence (a list of tokens) as the seed and show it
random_text = text_sequences[60]
print(random_text)
# generate_text expects the seed as a single space-separated string
random_text = ' '.join(random_text)

# generate the next 50 words
generate_text(model, tokenizer, 25, seed_text=random_text, num_gen_words=50)

['circulation', 'whenever', 'i', 'find', 'myself', 'growing', 'grim', 'about', 'the', 'mouth', 'whenever', 'it', 'is', 'a', 'damp', 'drizzly', 'november', 'in', 'my', 'soul', 'whenever', 'i', 'find', 'myself', 'involuntarily', 'pausing']


'the officer pictures i would not greyhound that a inducements of a bottom sort of pull beats stands altogether the coloured paying of the sea and so could of depend ships to each repeated of these earliest creature like throw tribe of his merely many whose known my her insanity'