In [9]:
import nltk
import pickle
from nltk.tokenize import word_tokenize,sent_tokenize
from keras.preprocessing.text import Tokenizer
from nltk.corpus import gutenberg
import numpy as np
import re
import tensorflow as tf
from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences

**Loading the dataset**

In [2]:
# Fetch the NLTK resources used below: the 'punkt' tokenizer data and the
# Gutenberg corpus sample.
for resource in ('punkt', 'gutenberg'):
    nltk.download(resource)

# Raw text of four Gutenberg books. Only the Bryant stories are selected as
# the training text (`total_txt`); the other three are loaded but not used
# by the cells that follow.
byrant = gutenberg.raw('bryant-stories.txt')
whitman = gutenberg.raw('whitman-leaves.txt')
burgass = gutenberg.raw('burgess-busterbrown.txt')
chesteron = gutenberg.raw('chesterton-ball.txt')
total_txt = byrant

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package gutenberg to /root/nltk_data...
[nltk_data]   Unzipping corpora/gutenberg.zip.


**Text preprocessing**

In [3]:
# Replace every run of non-word characters with a single space, lowercase the
# result, and split the cleaned text into word tokens.
non_word_run = re.compile(r'\W+')
punc_removed = non_word_run.sub(' ', total_txt).lower()
words = word_tokenize(punc_removed)

In [4]:
# Drop single-character tokens, except the real one-letter words 'a' and 'i'.
#
# The list is rebuilt instead of calling words.remove() inside a loop over
# `words`: removing an item while iterating shifts the remaining items left,
# so the loop skips the element right after every removal and some unwanted
# tokens survive.
words = [x for x in words if len(x) >= 2 or x in ('a', 'i')]

In [5]:
# Distinct tokens in the corpus. Sorting gives a stable, reproducible order;
# the iteration order of a plain set of strings can change from one kernel
# start to the next because of string hash randomization.
unique = sorted(set(words))
len(unique)

3890

In [6]:
# Length of each training window: the first train_len - 1 tokens are used as
# the model input and the last token as the prediction target (see the cell
# that slices `n_sequences`).
train_len = 4

# Every window of `train_len` consecutive tokens, including the final one
# ending at the last token. The earlier range(train_len, len(words)) stopped
# one step early and never emitted words[-train_len:].
# Yields an empty list when the corpus is shorter than one window.
text_sequences = [
    words[start:start + train_len]
    for start in range(len(words) - train_len + 1)
]

In [7]:
# Map each distinct token to a 1-based id in order of first appearance.
# dict.fromkeys keeps the first-seen order and drops duplicates.
# NOTE(review): `sequences` is reassigned by the Tokenizer cell that follows,
# so this mapping is not what the model is trained on.
sequences = {
    token: token_id
    for token_id, token in enumerate(dict.fromkeys(words), start=1)
}
# Next unused id, as left behind by the original counting loop.
count = len(sequences) + 1

In [8]:
# Fit a Keras Tokenizer on the token windows and use it to turn every window
# into a list of integer word ids.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(text_sequences)

# One extra slot on top of the number of distinct fitted words; used as the
# number of output classes of the model.
vocabulary_size = 1 + len(tokenizer.word_counts)
sequences = tokenizer.texts_to_sequences(text_sequences)

In [10]:
# Persist the fitted tokenizer so the same word -> id mapping can be reused
# outside this notebook.
# The context manager closes the file even if pickle.dump raises; the manual
# open()/close() pair leaked the handle in that case.
filename = 'token'
with open(filename, 'wb') as outfile:
    pickle.dump(tokenizer, outfile)

In [None]:
# Copy the id sequences into an int32 matrix with one row per window and
# `train_len` columns.
n_sequences = np.empty((len(sequences), train_len), dtype='int32')
for row, id_sequence in zip(n_sequences, sequences):
    row[:] = id_sequence  # `row` is a view, so this writes into n_sequences

# All columns but the last are the model input; the last column is the word
# to predict, one-hot encoded over `vocabulary_size` classes.
train_inputs, train_targets = n_sequences[:, :-1], n_sequences[:, -1]
seq_len = train_inputs.shape[1]
train_targets = to_categorical(train_targets, num_classes=vocabulary_size)

**Loading the TensorFlow Hub Universal Sentence Encoder**

In [None]:
import tensorflow_hub as hub

# Load the encoder from TensorFlow Hub. `embed` is later called on a list of
# strings to obtain their embedding vectors.
USE_MODEL_URL = "https://tfhub.dev/google/universal-sentence-encoder-large/5"
embed = hub.load(USE_MODEL_URL)

In [None]:
# Build the pretrained embedding matrix so that row i holds the vector of the
# word whose tokenizer id is i.
#
# The model looks rows up by tokenizer id, so the rows must follow
# tokenizer.word_index. Embedding the words in the arbitrary order of
# `unique` gave every word some other word's vector and left no row for the
# highest id.
words_by_id = [
    word
    for word, _ in sorted(tokenizer.word_index.items(), key=lambda item: item[1])
]
word_vectors = np.array(embed(words_by_id))

# Tokenizer ids start at 1, so row 0 stays an all-zero placeholder. The
# resulting row count is len(word_index) + 1, the same value as
# `vocabulary_size`.
seq_dict = np.zeros(
    (len(words_by_id) + 1, word_vectors.shape[1]),
    dtype=word_vectors.dtype,
)
seq_dict[1:] = word_vectors

In [None]:
# Frozen embedding layer initialised from the precomputed vectors in
# `seq_dict`: one row per vocabulary entry, 512 values per row.
embedding_layer = tf.keras.layers.Embedding(
    input_dim=seq_dict.shape[0],
    output_dim=512,
    weights=[seq_dict],
    input_length=seq_len,
    trainable=False,  # keep the pretrained vectors fixed at construction
)

**Model building and transfer learning**

In [None]:
# Next-word prediction network:
#   word ids -> pretrained embeddings -> two stacked bidirectional LSTMs
#   -> dense ReLU layer -> softmax over the vocabulary.
# The input shape is given as a tuple, which is the documented form;
# a bare int is not accepted by every Keras version.
input1 = tf.keras.Input(shape=(seq_len,), name="input1")
emb = embedding_layer(input1)
# The first LSTM returns the full sequence so the second LSTM can consume it.
bls1 = tf.keras.layers.Bidirectional(
    tf.keras.layers.LSTM(256, return_sequences=True)
)(emb)
bls2 = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(128))(bls1)
fc1 = tf.keras.layers.Dense(64, activation='relu')(bls2)
output = tf.keras.layers.Dense(vocabulary_size, activation='softmax')(fc1)
model = tf.keras.models.Model(inputs=input1, outputs=output)
# summary() prints the table itself and returns None, so wrapping it in
# print() only added a stray "None" line to the output.
model.summary()


In [None]:
# First training phase: 32 epochs with categorical cross-entropy (the targets
# are one-hot vectors) and the Adam optimizer.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
model.fit(x=train_inputs, y=train_targets, epochs=32, verbose=1)


In [None]:
# Unfreeze the pretrained embedding weights for the fine-tuning phase.
# The flag has to be set on the layer object: `emb` is only the layer's
# output tensor, so assigning `emb.trainable` did not unfreeze the layer.
embedding_layer.trainable = True

In [None]:
# Second training phase: compile again with the same loss, optimizer and
# metric, then train for 16 further epochs on the same data.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
model.fit(x=train_inputs, y=train_targets, epochs=16, verbose=1)

In [None]:
# Write the trained model to a single HDF5 file.
# NOTE(review): the relative ./drive/MyDrive path presumably expects Google
# Drive to be mounted in the working directory — confirm before running.
model_path = './drive/MyDrive/saved_models/NWP_BILSTM.h5'
model.save(model_path)
