In [52]:
#Data Collection
# Fetches the NLTK 'gutenberg' corpus package and makes the corpus reader available as `gutenberg`.
import nltk
# Per the recorded cell output, this reports "already up-to-date" when the package is present.
nltk.download('gutenberg')
from nltk.corpus import gutenberg
# NOTE(review): `pd` is not referenced in any of the visible cells.
import pandas as pd  

[nltk_data] Downloading package gutenberg to
[nltk_data]     C:\Users\zaynb\AppData\Roaming\nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!


In [53]:
# Pull the raw text of Hamlet out of the NLTK Gutenberg corpus
data = gutenberg.raw('shakespeare-hamlet.txt')

# Keep a local copy on disk so later cells can read the play back from a file
with open('hamlet.txt', 'w') as out_file:
    out_file.write(data)

In [54]:
#data 

Data Preprocessing 

In [76]:
import numpy as np  
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split


# Read the saved play back in and normalise it to lower case
with open('hamlet.txt', 'r') as corpus_file:
    text = corpus_file.read().lower()

# Fit a tokenizer on the whole text so every distinct word gets an integer index
tokenizer = Tokenizer()
tokenizer.fit_on_texts([text])

# One more than the number of indexed words: the extra slot is index 0,
# which the padded sequences use as filler.
total_words = len(tokenizer.word_index) + 1

In [77]:
total_words

4818

In [78]:
#tokenizer.word_index

In [79]:
# Build the training sequences: for every line of the play, emit each prefix
# of its token list that is at least two tokens long (first 2 tokens, first 3, ...).
input_sequences = []
for token_list in tokenizer.texts_to_sequences(text.split('\n')):
    input_sequences.extend(
        token_list[:end] for end in range(2, len(token_list) + 1)
    )

In [80]:
# For each line, this builds every prefix n-gram: the first two tokens (a bigram), then the first three, and so on up to the full line.
#input_sequences

Pad sequence

In [81]:
# Padding ensures that all input sequences have the same length.

In [82]:
# Length of the longest n-gram sequence; every sequence is padded up to this length
max_sequence = max(map(len, input_sequences))
max_sequence

14

In [83]:
# Left-pad ('pre') each sequence so that every row has exactly max_sequence entries
padded_sequences = pad_sequences(input_sequences, maxlen=max_sequence, padding='pre')
input_sequences = np.array(padded_sequences)

In [84]:
input_sequences

array([[   0,    0,    0, ...,    0,    1,  687],
       [   0,    0,    0, ...,    1,  687,    4],
       [   0,    0,    0, ...,  687,    4,   45],
       ...,
       [   0,    0,    0, ...,    4,   45, 1047],
       [   0,    0,    0, ...,   45, 1047,    4],
       [   0,    0,    0, ..., 1047,    4,  193]])

In [85]:
# Split every padded row into predictors and label
import tensorflow as tf

X = input_sequences[:, :-1]  # all tokens except the last one: the model input
y = input_sequences[:, -1]   # the last token: the word to predict

In [86]:
X 

array([[   0,    0,    0, ...,    0,    0,    1],
       [   0,    0,    0, ...,    0,    1,  687],
       [   0,    0,    0, ...,    1,  687,    4],
       ...,
       [   0,    0,    0, ...,  687,    4,   45],
       [   0,    0,    0, ...,    4,   45, 1047],
       [   0,    0,    0, ...,   45, 1047,    4]])

In [87]:
y

array([ 687,    4,   45, ..., 1047,    4,  193])

In [88]:
# One-hot encode the integer labels so each row of y has total_words columns.
# The cell overwrites y in place, so it is guarded: re-running it must not
# one-hot encode labels that are already encoded (y is 1-D only before encoding).
if y.ndim == 1:
    y = tf.keras.utils.to_categorical(y, num_classes=total_words)
print(X.shape, y.shape)

(25732, 13) (25732, 4818)


In [89]:
total_words

4818

In [90]:
# Report the shape of each split.
# NOTE: X_train / y_train / X_test / y_test are produced by the train_test_split
# cell, so that cell has to be executed before this one.
for label, split in (
    ("X_train shape:", X_train),
    ("y_train shape:", y_train),
    ("X_test shape:", X_test),
    ("y_test shape:", y_test),
):
    print(label, split.shape)


X_train shape: (20585, 13)
y_train shape: (20585, 4818)
X_test shape: (5147, 13)
y_test shape: (5147, 4818)


TRAIN TEST SPLIT 

In [91]:
# Hold out 20% of the samples for validation. A fixed random_state makes the
# split, and therefore the reported metrics, reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [107]:
# Early-stopping callback: watch the validation loss, tolerate 5 epochs without
# improvement, and restore the best weights seen.
from tensorflow.keras.callbacks import EarlyStopping

early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True,
)

TRAINING LSTM RNN

In [92]:
'''  
Layer	Parameters	Return Type	Purpose
Embedding	total_words, 100, input_length	(batch_size, input_length, 100)	Converts indices to dense word vectors
LSTM (1st)	100, return_sequences=True	(batch_size, input_length, 100)	Captures dependencies between words
Dropout	0.2	Same as input	Prevents overfitting
LSTM (2nd)	100	(batch_size, 100)	Summarizes the sequence in one vector
Dense	total_words, activation='softmax'	(batch_size, total_words)	Predicts next word as probabilities
'''

"  \nLayer\tParameters\tReturn Type\tPurpose\nEmbedding\ttotal_words, 100, input_length\t(batch_size, input_length, 100)\tConverts indices to dense word vectors\nLSTM (1st)\t100, return_sequences=True\t(batch_size, input_length, 100)\tCaptures dependencies between words\nDropout\t0.2\tSame as input\tPrevents overfitting\nLSTM (2nd)\t100\t(batch_size, 100)\tSummarizes the sequence in one vector\nDense\ttotal_words, activation='softmax'\t(batch_size, total_words)\tPredicts next word as probabilities\n"

In [93]:
total_words

4818

In [94]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding,LSTM,Dense,Dropout


## Define the model: embedding -> two stacked LSTMs (dropout in between) -> softmax over the vocabulary
model = Sequential([
    # Each input row holds max_sequence - 1 tokens (the last token is the label)
    Embedding(total_words, 100, input_length=max_sequence - 1),
    # return_sequences=True so the next LSTM receives one vector per time step
    LSTM(100, return_sequences=True),
    Dropout(0.2),
    LSTM(100),
    Dense(total_words, activation='softmax'),
])

# Compile for multi-class prediction on one-hot labels, then show the layer summary
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
model.summary()

Model: "sequential_7"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_7 (Embedding)     (None, 13, 100)           481800    
                                                                 
 lstm_14 (LSTM)              (None, 13, 100)           80400     
                                                                 
 dropout_7 (Dropout)         (None, 13, 100)           0         
                                                                 
 lstm_15 (LSTM)              (None, 100)               80400     
                                                                 
 dense_7 (Dense)             (None, 4818)              486618    
                                                                 
Total params: 1129218 (4.31 MB)
Trainable params: 1129218 (4.31 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [95]:
# My machine is not very powerful, so I train for only 5 epochs.
# For better accuracy, train for more epochs.

In [103]:
len(tokenizer.word_index)

4817

In [96]:
## Train the model
# The early_stopping callback defined earlier in the notebook was never passed
# to fit(), so it had no effect. It monitors val_loss, which is available here
# because validation_data is supplied.
# With epochs=5 and patience=5 it cannot stop training before the final epoch;
# it becomes effective once the number of epochs is increased.
history = model.fit(
    X_train,
    y_train,
    epochs=5,
    validation_data=(X_test, y_test),
    callbacks=[early_stopping],
    verbose=1,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [108]:
#function to predict the next word
def predict_next_word(model,tokenizer,text,max_sequence_len):
    """Return the most probable next word for ``text``.

    Parameters
    ----------
    model : object with ``predict(batch, verbose=0)`` returning one row of
        scores per input row (here, the trained Keras model).
    tokenizer : fitted tokenizer providing ``texts_to_sequences`` and the
        ``index_word`` mapping (index -> word).
    text : str
        Seed text whose continuation is predicted.
    max_sequence_len : int
        Sequence length used during training; the model input holds
        ``max_sequence_len - 1`` tokens.

    Returns
    -------
    str or None
        The word whose index has the highest score, or ``None`` when that
        index has no word attached (for example the padding index 0).
    """
    context_len = max_sequence_len - 1
    token_list = tokenizer.texts_to_sequences([text])[0]
    # Keep only the most recent tokens when the seed is longer than the model input
    if len(token_list) > context_len:
        token_list = token_list[-context_len:]
    padded = pad_sequences([token_list], maxlen=context_len, padding='pre')
    predicted = model.predict(padded, verbose=0)
    # argmax over axis=1 yields a 1-element array for the single input row;
    # convert it to a plain int instead of comparing an array with an int.
    predicted_word_index = int(np.argmax(predicted, axis=1)[0])
    # Direct index -> word lookup instead of scanning the whole vocabulary
    return tokenizer.index_word.get(predicted_word_index)

In [113]:
# Try the predictor on a one-word prompt
input_text = 'the'
print(f"Input text:{input_text}")

# The model input holds one token fewer than the training sequence length,
# so that length is recovered from the model's input shape.
max_sequence_len = model.input_shape[1] + 1
next_word = predict_next_word(model, tokenizer, input_text, max_sequence_len)
print(f"Next word: {next_word}")

Input text:the
Next word: king


Save the model

In [115]:
# Persist the trained network and the fitted tokenizer so both can be reloaded later
import pickle

model.save('next_word_LSTM.h5')

with open('tokenizer.pickle', 'wb') as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file, protocol=pickle.HIGHEST_PROTOCOL)

  saving_api.save_model(


In [116]:
# Similarly, you can use a GRU network instead: import GRU from tensorflow.keras.layers
# and replace each LSTM layer with GRU.