In [1]:
#Open the book as UTF-8 text; the handle is read into a string in a later cell
file = open('./alice_in_wonderland.txt',encoding="utf8")

In [2]:
import spacy

In [3]:
#Load the small English pipeline with parser, tagger, ner and lemmatizer disabled
#Only token texts are used further down (see seperate_punch), so those components are not needed
nlp = spacy.load('en_core_web_sm',disable = ['parser','tagger','ner','lemmatizer'])

In [4]:
#Raise the pipeline's maximum input length so the whole text can go through nlp() in a single call
#NOTE(review): 1723200 is presumably chosen to exceed the size of the book - TODO confirm
nlp.max_length = 1723200

In [5]:
def seperate_punch(doc_text):
    """Tokenize ``doc_text`` with the module-level ``nlp`` and drop punctuation/whitespace tokens.

    A token is dropped when its text occurs anywhere inside the filter string
    below (a substring test, so runs such as '--' or '\\n\\n' are matched too).
    Every remaining token is returned lower-cased, in document order.
    """
    unwanted = '\n\n \n\n\n!"-#$%&()--.*+,-/:;<=>?@[\\]^_`{|}~\t\n •·'
    kept = []
    for tok in nlp(doc_text):
        raw = tok.text
        if raw in unwanted:
            continue
        kept.append(raw.lower())
    return kept

In [6]:
#Read the whole book into memory and close the file handle afterwards
#The hasattr guard keeps this cell re-runnable: once `file` holds the text (a str) there is nothing left to read
if hasattr(file,'read'):
    with file:
        file = file.read()

In [7]:
#Lower-cased word tokens of the whole book, with punctuation and whitespace tokens removed
tokens = seperate_punch(file)

In [15]:
#Build overlapping windows of 11 consecutive words
#The first 10 words of a window are the seed text and the 11th is the word to predict
train_len = 10 + 1
#The upper bound is len(tokens)+1 so that the final window, the one ending on the last token, is included
#If there are fewer than train_len tokens the result is simply an empty list
text_seq = [tokens[i-train_len:i] for i in range(train_len,len(tokens)+1)]

In [16]:
from tensorflow.keras.preprocessing.text import Tokenizer

In [17]:
#Fit a Keras Tokenizer on the word windows (text_seq is a list of token lists)
#This builds the word <-> integer index mapping used by texts_to_sequences and index_word below
tokenizer = Tokenizer()
tokenizer.fit_on_texts(text_seq)

In [18]:
#Replace every word in every window with its integer index from the fitted tokenizer
sequences  = tokenizer.texts_to_sequences(text_seq)

In [19]:
#Set our vocabulary size to the total number of unique words seen by the tokenizer
vocab_size = len(tokenizer.word_counts)

In [20]:
import numpy as np

In [21]:
#Convert the list of index sequences to a numpy array so it can be sliced by column below
sequences = np.array(sequences)

In [23]:
from tensorflow.keras.utils import to_categorical

In [24]:
#X holds our input features
#Take every column except the last one, i.e. the first ten word indices of each sequence
X = sequences[:,:-1]

In [25]:
#Take the last word index of each sequence as the value to predict
y = sequences[:,-1]

In [26]:
#One-hot encode the target word indices
#num_classes = vocab_size+1 , one extra for padding
#Encoded from sequences[:,-1] rather than from y itself, so re-running this cell
#does not one-hot encode an array that is already one-hot
y = to_categorical(sequences[:,-1],num_classes=vocab_size+1)

In [22]:
#Number of words in the seed text
#Derived from train_len (seed words + the one predicted word) so the two values cannot drift apart
seq_ln = train_len - 1

In [60]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,LSTM,Embedding,Dropout

In [61]:
#Model definition
def create_model(vocab_size,seq_ln,embedding_dim=None,lstm_units=256,dense_units=80,dropout_rate=0.2):
    """Build, compile and return the next-word prediction network.

    Architecture: one Embedding layer, two LSTM layers (each followed by
    Dropout), one ReLU Dense layer and a softmax Dense layer with
    ``vocab_size`` outputs. The model summary is printed before returning.

    Parameters
    ----------
    vocab_size : int
        Number of distinct input indices and of output classes.
    seq_ln : int
        Length of the seed text, i.e. number of word indices per input row.
    embedding_dim : int, optional
        Width of the embedding vectors. Defaults to ``seq_ln``, which is what
        this function always used before the parameter existed.
    lstm_units : int, optional
        Units in each of the two LSTM layers (default 256).
    dense_units : int, optional
        Units in the hidden Dense layer (default 80).
    dropout_rate : float, optional
        Dropout rate applied after each LSTM layer (default 0.2).

    Returns
    -------
    The compiled Sequential model (categorical cross-entropy loss, adam
    optimizer, accuracy metric).
    """
    if embedding_dim is None:
        # Preserve the original behaviour, where the embedding width was tied to seq_ln.
        embedding_dim = seq_ln
    model = Sequential()
    model.add(Embedding(vocab_size,embedding_dim,input_length=seq_ln))
    # The first LSTM returns the full sequence so the second LSTM can consume it.
    model.add(LSTM(lstm_units,return_sequences=True))
    model.add(Dropout(dropout_rate))
    model.add(LSTM(lstm_units))
    model.add(Dropout(dropout_rate))
    model.add(Dense(dense_units,activation='relu'))
    model.add(Dense(vocab_size,activation='softmax'))
    model.compile(loss='categorical_crossentropy',optimizer='adam',metrics=['accuracy'])
    model.summary()
    return model

In [62]:
#vocab_size+1 matches the num_classes used when one-hot encoding y (one extra slot for padding)
model = create_model(vocab_size+1,seq_ln)

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 10, 10)            27050     
                                                                 
 lstm_2 (LSTM)               (None, 10, 256)           273408    
                                                                 
 dropout (Dropout)           (None, 10, 256)           0         
                                                                 
 lstm_3 (LSTM)               (None, 256)               525312    
                                                                 
 dropout_1 (Dropout)         (None, 256)               0         
                                                                 
 dense_2 (Dense)             (None, 80)                20560     
                                                                 
 dense_3 (Dense)             (None, 2705)             

In [63]:
#Fit the model on our data
#batch_size = 64 is just an arbitrary choice
#Train for 55 epochs
#verbose = 2 (accepts 0, 1 or 2 and controls how much progress output is shown while training)
model.fit(X,y,batch_size=64,epochs=55,verbose=2)

Epoch 1/55
465/465 - 11s - loss: 6.0440 - accuracy: 0.0736 - 11s/epoch - 23ms/step
Epoch 2/55
465/465 - 6s - loss: 5.7341 - accuracy: 0.0747 - 6s/epoch - 13ms/step
Epoch 3/55
465/465 - 6s - loss: 5.5987 - accuracy: 0.0798 - 6s/epoch - 13ms/step
Epoch 4/55
465/465 - 6s - loss: 5.4334 - accuracy: 0.0861 - 6s/epoch - 13ms/step
Epoch 5/55
465/465 - 6s - loss: 5.2858 - accuracy: 0.0941 - 6s/epoch - 13ms/step
Epoch 6/55
465/465 - 6s - loss: 5.1626 - accuracy: 0.1011 - 6s/epoch - 13ms/step
Epoch 7/55
465/465 - 6s - loss: 5.0578 - accuracy: 0.1095 - 6s/epoch - 13ms/step
Epoch 8/55
465/465 - 6s - loss: 4.9598 - accuracy: 0.1157 - 6s/epoch - 13ms/step
Epoch 9/55
465/465 - 6s - loss: 4.8607 - accuracy: 0.1246 - 6s/epoch - 13ms/step
Epoch 10/55
465/465 - 6s - loss: 4.9738 - accuracy: 0.1181 - 6s/epoch - 14ms/step
Epoch 11/55
465/465 - 6s - loss: 4.7601 - accuracy: 0.1310 - 6s/epoch - 13ms/step
Epoch 12/55
465/465 - 6s - loss: 4.6212 - accuracy: 0.1420 - 6s/epoch - 13ms/step
Epoch 13/55
465/465 - 6

<keras.callbacks.History at 0x7fca423088d0>

In [64]:
from keras.preprocessing.sequence import pad_sequences

#Use the seed text to generate a new text sequence
#Generates "num_of_gen_words" words in total
#pad_sequences handles seed text whose length does not match our seed length of 10 words:
#it pads shorter input and truncates longer input at the "pre" or "post" end, as requested
#The predicted word index is mapped back to its word through the tokenizer
#The generated words are joined into one string and returned

def generated_text(model,tokenizer,seq_ln,seed_text,num_of_gen_words):
    """Greedily generate ``num_of_gen_words`` words that continue ``seed_text``.

    Parameters
    ----------
    model : object whose ``predict(batch, verbose=0)`` returns one row of
        per-index scores for every input row.
    tokenizer : object exposing ``texts_to_sequences`` and ``index_word``.
    seq_ln : int
        Number of word indices fed to the model at each step.
    seed_text : str
        Text the generation starts from.
    num_of_gen_words : int
        How many words to generate; 0 yields an empty string.

    Returns
    -------
    str
        The generated words joined by single spaces (the seed is not included).
    """
    # Encode the seed once. Each predicted index is then appended directly
    # instead of rebuilding and re-tokenizing an ever-growing string, which
    # could also drop a predicted word that the tokenizer's text filters alter.
    encoded_text = list(tokenizer.texts_to_sequences([seed_text])[0])
    output_text = []
    for _ in range(num_of_gen_words):
        # Only the most recent seq_ln indices are fed to the model;
        # pad_sequences fills up shorter input to length seq_ln.
        pad_encoded = pad_sequences([encoded_text[-seq_ln:]],maxlen=seq_ln,truncating='pre')
        scores = model.predict(pad_encoded,verbose=0)[0]
        # Index 0 is the padding slot, not a word, so it is left out of the
        # argmax; looking it up in tokenizer.index_word would raise KeyError.
        predicted_wrd_index = int(np.argmax(scores[1:])) + 1
        predicted_wrd = tokenizer.index_word[predicted_wrd_index]
        encoded_text.append(predicted_wrd_index)
        output_text.append(predicted_wrd)

    return ' '.join(output_text)

In [65]:
#You can take a seed from the training data itself, e.g. ' '.join(text_seq[70]),
#or supply custom seed text like this:
seed_text = "Alice became small and fell into the hatter rabbit hole"

In [66]:
#Generate 100 words that continue the seed text
gen_text = generated_text(model,tokenizer,seq_ln,seed_text,num_of_gen_words=100)

In [67]:
#Show the seed, a divider and the generated continuation, one item per line
report_lines = (
    f"Seed text : --- > {seed_text}",
    "\n\n<<--------------------------------------->>\n\n",
    f"Genrerated text : --- > {gen_text}",
)
print(*report_lines,sep="\n")

Seed text : --- > Alice became small and fell into the hatter rabbit hole


<<--------------------------------------->>


Genrerated text : --- > the sounds noticed thatched with reply and sometimes edwin marched in the common moment she heard to tinkling a judge of idea so she went on among a immense curtain and washing it into the end of the edge of the country i 've none of it ' said alice ' i 've not gone in the sea ' ' i did n't remember what ' said alice ' why do you draw ' said the king ' i 'm glad i 'm mad ' ' i do n't know it ' pleaded with a tone ' and she squeezed
