In [23]:
import nltk
from nltk.tokenize import word_tokenize,sent_tokenize
from keras.preprocessing.text import Tokenizer
from nltk.corpus import gutenberg
import numpy as np
import re
import tensorflow as tf
from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences

**Loading the dataset**

In [24]:
nltk.download('punkt')
nltk.download('gutenberg')
byrant=gutenberg.raw('bryant-stories.txt')
whitman=gutenberg.raw('whitman-leaves.txt')
burgass=gutenberg.raw('burgess-busterbrown.txt')
chesteron=gutenberg.raw('chesterton-ball.txt')
total_txt=byrant

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package gutenberg to /root/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!


**Text_preprocessing**

In [25]:
punc_removed=re.sub(r'\W+', ' ', total_txt).lower()
words = nltk.word_tokenize(punc_removed)

In [26]:
for x in words :
  if(len(x)<2 and x != 'a' and x != 'i'  ):
    words.remove(x)

In [27]:
unique=list(set(words))
len(unique)

3890

In [28]:
train_len = 4
text_sequences = []
for i in range(train_len,len(words)):
  seq = words[i-train_len:i]
  text_sequences.append(seq)

In [29]:
sequences = {}
count = 1
for i in range(len(words)):
  if words[i] not in sequences:
    sequences[words[i]] = count
    count += 1

In [30]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(text_sequences)
sequences = tokenizer.texts_to_sequences(text_sequences)
vocabulary_size = len(tokenizer.word_counts)+1

In [31]:
n_sequences = np.empty([len(sequences),train_len], dtype='int32')
for i in range(len(sequences)):
  n_sequences[i] = sequences[i]
train_inputs = n_sequences[:,:-1]
train_targets = n_sequences[:,-1]
train_targets = to_categorical(train_targets, num_classes=vocabulary_size)
seq_len = train_inputs.shape[1]

**Loading the tensorflow hub universal senetence encoder**

In [12]:
import tensorflow_hub as hub
embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder-large/5")

In [32]:
seq_dict=embed(unique)
seq_dict=np.array(seq_dict)

In [33]:
embedding_layer =tf.keras.layers.Embedding(
    seq_dict.shape[0],
    512,
    weights=[seq_dict],
    input_length=seq_len,
    trainable=False,
)

**`Model building and transfer learning**

In [34]:
input1 = tf.keras.Input(shape=seq_len,name="input1")
emb= embedding_layer(input1)
bls1=tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(256,return_sequences=True))(emb)
bls2=tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(128))(bls1)
fc1=tf.keras.layers.Dense(64, activation='relu')(bls2)
output=tf.keras.layers.Dense(vocabulary_size, activation='softmax')(fc1)
model=tf.keras.models.Model(inputs=input1,outputs=output)
print(model.summary())


Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input1 (InputLayer)          [(None, 3)]               0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 3, 512)            1991680   
_________________________________________________________________
bidirectional_2 (Bidirection (None, 3, 512)            1574912   
_________________________________________________________________
bidirectional_3 (Bidirection (None, 256)               656384    
_________________________________________________________________
dense_2 (Dense)              (None, 64)                16448     
_________________________________________________________________
dense_3 (Dense)              (None, 3890)              252850    
Total params: 4,492,274
Trainable params: 2,500,594
Non-trainable params: 1,991,680
_________________________________________

In [35]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(train_inputs,train_targets,epochs=32,verbose=1)


Epoch 1/32
Epoch 2/32
Epoch 3/32
Epoch 4/32
Epoch 5/32
Epoch 6/32
Epoch 7/32
Epoch 8/32
Epoch 9/32
Epoch 10/32
Epoch 11/32
Epoch 12/32
Epoch 13/32
Epoch 14/32
Epoch 15/32
Epoch 16/32
Epoch 17/32
Epoch 18/32
Epoch 19/32
Epoch 20/32
Epoch 21/32
Epoch 22/32
Epoch 23/32
Epoch 24/32
Epoch 25/32
Epoch 26/32
Epoch 27/32
Epoch 28/32
Epoch 29/32
Epoch 30/32
Epoch 31/32
Epoch 32/32


<tensorflow.python.keras.callbacks.History at 0x7fbc0955e750>

In [36]:
emb.trainable=True

In [37]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(train_inputs,train_targets,epochs=16,verbose=1)

Epoch 1/16
Epoch 2/16
Epoch 3/16
Epoch 4/16
Epoch 5/16
Epoch 6/16
Epoch 7/16
Epoch 8/16
Epoch 9/16
Epoch 10/16
Epoch 11/16
Epoch 12/16
Epoch 13/16
Epoch 14/16
Epoch 15/16
Epoch 16/16


<tensorflow.python.keras.callbacks.History at 0x7fbc08d9d350>

In [38]:
model.save('./drive/MyDrive/saved_models/NWP_BILSTM.h5')


**Test_demo**



In [20]:
model=tf.keras.models.load_model("./drive/MyDrive/saved_models/NWP_BILSTM.h5")

In [42]:
input_text = input().strip().lower()
encoded_text = tokenizer.texts_to_sequences([input_text])[0]
pad_encoded = pad_sequences([encoded_text], maxlen=3, truncating='pre')
print(encoded_text, pad_encoded)
for i in (model.predict(pad_encoded)[0]).argsort()[-3:][::-1]:
    pred_word = tokenizer.index_word[i]
    print("Next word suggestion:",pred_word)

the cake is
[1, 897, 30] [[  1 897  30]]
Next word suggestion: going
Next word suggestion: as
Next word suggestion: to
