# Reading the file and returning its whole content as a single string

In [3]:
# Reading the file and returning its whole content as a single string
def read_file(filepath, encoding=None):
    """Return the entire content of a text file as one string.

    Args:
        filepath: Path of the text file to read.
        encoding: Text encoding handed to ``open``. The default ``None``
            keeps the previous behaviour (platform default encoding); pass
            e.g. ``"utf-8"`` to decode the file the same way on every
            platform.

    Returns:
        The whole content of the file as a single ``str``.
    """
    with open(filepath, encoding=encoding) as f:
        return f.read()
        

In [4]:
#read_file('moby_dick_four_chapters.txt')

In [5]:
#read_file('melville-moby_dick.txt')

# Tokenization and cleaning the text

In [68]:
#Tokenization and cleaning the text
import spacy

In [69]:
# spaCy is used only for tokenization here, so the parser, tagger,
# named-entity-recognition and lemmatizer pipeline components are disabled.
# It helps tokenization go faster
nlp =spacy.load('en_core_web_sm',disable=['parser','tagger','ner','lemmatizer']) 

In [70]:
# Raise the max_length limit of the spaCy pipeline so that the entire text
# can be tokenized
nlp.max_length = 1198623 

In [153]:
# We get rid of unwanted punctuation and whitespace tokens because we are
# interested in the relationship between words.
def separate_punc(doc_text):
    """Tokenize ``doc_text`` with ``nlp`` and return the kept tokens lower-cased.

    A token is discarded when its text occurs as a substring of the
    punctuation/whitespace string below; every other token is kept.
    """
    unwanted = ('\n\n \n\n\n!"#$\n%()--*+,-./:;<=>?@[\\]^"''"_{|}`\t')
    kept = []
    for token in nlp(doc_text):
        if token.text in unwanted:
            continue
        kept.append(token.text.lower())
    return kept


In [154]:
# Load the whole training text into one string
d = read_file('moby_dick_four_chapters.txt')

In [155]:
# Tokenize the text; the result is a flat list of lower-cased tokens
tokens = separate_punc(d)

In [163]:
len(tokens)
#tokens

11338

# Creating Sequences of tokens

In [75]:
# We will pass the model sequences of 25 words and have it predict the 26th word.
# The idea is that 25 words are long enough to capture the structure of a sentence,
# yet not so short that the general context is lost.

In [76]:
# 25 input words plus the 1 word to predict
train_len = 25 + 1

# Slide a window of train_len tokens over the text, one token at a time.
# NOTE: the range stops at len(tokens) - train_len, so the window that ends on
# the very last token is not produced.
text_sequences = [
    tokens[start:start + train_len]
    for start in range(len(tokens) - train_len)
]

In [77]:
type(text_sequences)

list

In [78]:
' '.join(text_sequences[0])

'call me ishmael some years ago never mind how long precisely having little or no money in my purse and nothing particular to interest me on'

In [79]:
' '.join(text_sequences[1])

'me ishmael some years ago never mind how long precisely having little or no money in my purse and nothing particular to interest me on shore'

In [80]:
' '.join(text_sequences[2])

'ishmael some years ago never mind how long precisely having little or no money in my purse and nothing particular to interest me on shore i'

In [81]:
# Converting the tokens into integer ids so that Keras can work with them
from keras.preprocessing.text import Tokenizer

In [82]:
# Fit a Keras Tokenizer on the token sequences; this is what fills
# tokenizer.index_word and tokenizer.word_counts used below
tokenizer = Tokenizer()
tokenizer.fit_on_texts(text_sequences)

In [83]:
# Encode every token sequence as a list of integer word ids
sequences = tokenizer.texts_to_sequences(text_sequences)

In [84]:
sequences[0]

[956,
 14,
 263,
 51,
 261,
 408,
 87,
 219,
 129,
 111,
 954,
 260,
 50,
 43,
 38,
 314,
 7,
 23,
 546,
 3,
 150,
 259,
 6,
 2713,
 14,
 24]

In [85]:
tokenizer.index_word

{1: 'the',
 2: 'a',
 3: 'and',
 4: 'of',
 5: 'i',
 6: 'to',
 7: 'in',
 8: 'it',
 9: 'that',
 10: 'he',
 11: 'his',
 12: 'was',
 13: 'but',
 14: 'me',
 15: 'with',
 16: 'as',
 17: 'at',
 18: 'this',
 19: 'you',
 20: 'is',
 21: 'all',
 22: 'for',
 23: 'my',
 24: 'on',
 25: 'be',
 26: "'s",
 27: 'not',
 28: 'from',
 29: 'there',
 30: 'one',
 31: 'up',
 32: 'what',
 33: 'him',
 34: 'so',
 35: 'bed',
 36: 'now',
 37: 'about',
 38: 'no',
 39: 'into',
 40: 'by',
 41: 'were',
 42: 'out',
 43: 'or',
 44: 'harpooneer',
 45: 'had',
 46: 'then',
 47: 'have',
 48: 'an',
 49: 'upon',
 50: 'little',
 51: 'some',
 52: 'old',
 53: 'like',
 54: 'if',
 55: 'they',
 56: 'would',
 57: 'do',
 58: 'over',
 59: 'landlord',
 60: 'thought',
 61: 'room',
 62: 'when',
 63: 'could',
 64: "n't",
 65: 'night',
 66: 'here',
 67: 'head',
 68: 'such',
 69: 'which',
 70: 'man',
 71: 'did',
 72: 'sea',
 73: 'time',
 74: 'other',
 75: 'very',
 76: 'go',
 77: 'these',
 78: 'more',
 79: 'though',
 80: 'first',
 81: 'sort',


In [86]:
# Print each word id of the first sequence next to the word it stands for
for i in sequences[0]:
    print(f"{i}:{tokenizer.index_word[i]}")

956:call
14:me
263:ishmael
51:some
261:years
408:ago
87:never
219:mind
129:how
111:long
954:precisely
260:having
50:little
43:or
38:no
314:money
7:in
23:my
546:purse
3:and
150:nothing
259:particular
6:to
2713:interest
14:me
24:on


In [87]:
tokenizer.word_counts

OrderedDict([('call', 27),
             ('me', 2471),
             ('ishmael', 133),
             ('some', 758),
             ('years', 135),
             ('ago', 84),
             ('never', 449),
             ('mind', 164),
             ('how', 321),
             ('long', 374),
             ('precisely', 37),
             ('having', 142),
             ('little', 767),
             ('or', 950),
             ('no', 1003),
             ('money', 120),
             ('in', 5647),
             ('my', 1786),
             ('purse', 71),
             ('and', 9646),
             ('nothing', 281),
             ('particular', 152),
             ('to', 6497),
             ('interest', 24),
             ('on', 1716),
             ('shore', 26),
             ('i', 7150),
             ('thought', 676),
             ('would', 702),
             ('sail', 104),
             ('about', 1014),
             ('a', 10377),
             ('see', 416),
             ('the', 15540),
             ('watery', 26),
  

In [88]:
# Vocabulary size = number of distinct words counted by the tokenizer
vocabulary_size = len(tokenizer.word_counts)

In [89]:
vocabulary_size

2718

In [90]:
# Currently `sequences` is a list in which every item is itself a list of
# numbers
type(sequences)

list

In [91]:
import numpy as np

In [92]:
# Convert the list of lists into a NumPy array so it can be sliced by column
sequences = np.array(sequences)

In [93]:
sequences

array([[ 956,   14,  263, ..., 2713,   14,   24],
       [  14,  263,   51, ...,   14,   24,  957],
       [ 263,   51,  261, ...,   24,  957,    5],
       ...,
       [ 952,   12,  166, ...,  262,   53,    2],
       [  12,  166, 2712, ...,   53,    2, 2718],
       [ 166, 2712,    3, ...,    2, 2718,   26]])

# Splitting the data into Features and Labels

In [94]:
from keras.utils import to_categorical

In [95]:
X = sequences[:,:-1] # features: every column except the last one
# i.e. the first 25 word ids of each 26-id sequence

In [96]:
y = sequences[:,-1] # labels: the last column
# i.e. the last word id of each sequence, the word to be predicted

In [97]:
# One-hot encode the labels.
# The tokenizer's word ids start at 1, so index 0 is left free (Keras padding
# uses it); the number of classes is therefore vocabulary_size+1.
y = to_categorical(y,num_classes=vocabulary_size+1)

In [98]:
# Number of word ids per input sample (number of columns of X)
seq_len = X.shape[1]

In [99]:
# Shape of the feature matrix; the array is named X (upper case), a
# lower-case x is never defined in this notebook
X.shape

(11312, 25)

# Creating the LSTM model

In [100]:
from keras.models import Sequential
from keras.layers import Dense,LSTM,Embedding

In [157]:
def create_model(vocabulary_size,seq_len):
    """Build, compile and summarise the next-word LSTM network.

    vocabulary_size -- input dimension of the embedding layer and number of
                       units of the final softmax layer
    seq_len         -- number of word ids per input sample; also used as the
                       output dimension of the embedding layer

    Prints the model summary and returns the compiled Sequential model.
    """
    layers = [
        # Embedding turns positive integers (indexes) into dense vectors of
        # fixed size and must be the first layer of the model.
        # input dimension = vocabulary_size // output dimension = seq_len
        # // input length = seq_len
        Embedding(vocabulary_size, seq_len, input_length=seq_len),
        # return_sequences=True hands the whole sequence to the next LSTM
        LSTM(500, return_sequences=True),
        LSTM(750),
        Dense(625, activation='relu'),
        # one output unit per vocabulary index
        Dense(vocabulary_size, activation='softmax'),
    ]

    model = Sequential()
    for layer in layers:
        model.add(layer)

    model.compile(
        loss='categorical_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )

    model.summary()
    return model

In [158]:
# vocabulary_size+1 matches the number of classes used for the one-hot labels
model = create_model(vocabulary_size+1,seq_len)

Model: "sequential_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_6 (Embedding)      (None, 25, 25)            67975     
_________________________________________________________________
lstm_10 (LSTM)               (None, 25, 500)           1052000   
_________________________________________________________________
lstm_11 (LSTM)               (None, 750)               3753000   
_________________________________________________________________
dense_8 (Dense)              (None, 625)               469375    
_________________________________________________________________
dense_9 (Dense)              (None, 2719)              1702094   
Total params: 7,044,444
Trainable params: 7,044,444
Non-trainable params: 0
_________________________________________________________________


In [160]:
from pickle import dump,load   # for saving the file and loading later on

In [161]:
# Train on the 25-word inputs (X) and their one-hot next-word labels (y)
model.fit(X,y,batch_size=64,epochs=100,verbose=1)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<tensorflow.python.keras.callbacks.History at 0x2b5d36a6340>

In [164]:
# Saving the trained model to disk so it can be loaded again later
model.save('my_mobydick_model2.h5')

In [165]:
# Saving the fitted tokenizer with pickle. The `with` block closes the file
# handle, so the pickle is completely written to disk.
with open('my_mobydicktokenizer2','wb') as tokenizer_file:
    dump(tokenizer,tokenizer_file)

# Generating texts

In [166]:
from keras.preprocessing.sequence import pad_sequences

In [167]:
def generate_text(model,tokenizer,seq_len,seed_text,num_gen_words):
    """Generate ``num_gen_words`` words that continue ``seed_text``.

    Args:
        model: Trained Keras model; ``model.predict`` must return one score
            per vocabulary index for an input of ``seq_len`` word ids.
        tokenizer: Fitted Keras ``Tokenizer`` used to encode the text and to
            map the predicted index back to a word.
        seq_len: Number of word ids the model expects as input.
        seed_text: Space-separated text the generation starts from.
        num_gen_words: Number of words to generate.

    Returns:
        The generated words joined by single spaces (the seed text itself is
        not part of the result).
    """
    output_text = []
    # Running text: the seed followed by every word generated so far
    input_text = seed_text

    for _ in range(num_gen_words):
        encoded_text = tokenizer.texts_to_sequences([input_text])[0]
        # The model is trained on seq_len tokens. A longer text is truncated
        # from the front and a shorter one is padded, so the function is
        # robust to seed texts of any length.
        pad_encoded = pad_sequences([encoded_text],maxlen=seq_len,truncating='pre')

        # `predict_classes` was removed from Sequential in TensorFlow 2.6;
        # taking the argmax of the predicted scores is the equivalent.
        scores = model.predict(pad_encoded,verbose=0)
        pred_word_ind = int(np.argmax(scores,axis=-1)[0])
        pred_word = tokenizer.index_word[pred_word_ind]

        # Feed the prediction back in so the next step sees it as context
        input_text += ' '+pred_word
        output_text.append(pred_word)

    return ' '.join(output_text)

In [168]:
text_sequences[0]

['call',
 'me',
 'ishmael',
 'some',
 'years',
 'ago',
 'never',
 'mind',
 'how',
 'long',
 'precisely',
 'having',
 'little',
 'or',
 'no',
 'money',
 'in',
 'my',
 'purse',
 'and',
 'nothing',
 'particular',
 'to',
 'interest',
 'me',
 'on']

In [169]:
import random
random.seed(101)
# randint includes both end points, so the upper bound has to be the last
# valid index; len(text_sequences) itself would raise an IndexError when used
random_pick = random.randint(0,len(text_sequences)-1)

In [170]:
# Each entry holds train_len (26) tokens; generate_text truncates the encoded
# text to seq_len ids from the front before predicting
random_seed_text = text_sequences[random_pick]

In [171]:
random_seed_text

['thought',
 'i',
 'to',
 'myself',
 'the',
 'man',
 "'s",
 'a',
 'human',
 'being',
 'just',
 'as',
 'i',
 'am',
 'he',
 'has',
 'just',
 'as',
 'much',
 'reason',
 'to',
 'fear',
 'me',
 'as',
 'i',
 'have']

In [172]:
# Join the tokens back into one space-separated string for generate_text
seed_text = ' '.join(random_seed_text)

In [173]:
seed_text

"thought i to myself the man 's a human being just as i am he has just as much reason to fear me as i have"

In [174]:
generate_text(model,tokenizer,seq_len,seed_text=seed_text,num_gen_words=25)

'must be it is round a hat unoccupied but avast the planing in the night i fumbled to the keeping being exactly the more of'