In [None]:
import nltk
import numpy as np
import pandas as pd
from nltk.corpus import gutenberg

# Fetch the Gutenberg corpus (via nltk.download) before reading from it.
nltk.download('gutenberg')

# Raw text of Hamlet as a single string.
data = gutenberg.raw('shakespeare-hamlet.txt')

# Write the text to data.txt in the working directory.
with open('data.txt', 'w') as out_file:
    out_file.write(data)

In [None]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences  # pads sequences to a common length
from tensorflow.keras.preprocessing.text import Tokenizer

# Read the saved corpus back in and lowercase it.
with open("data.txt", 'r') as f:
    text = f.read().lower()

# Fit the tokenizer on the whole corpus, passed as a single document.
tokenizer = Tokenizer()
tokenizer.fit_on_texts([text])

# Size of the word index plus one; used as the vocabulary size downstream.
# Left as the last expression so the notebook displays it.
total_words = 1 + len(tokenizer.word_index)
total_words

In [None]:
# Build the training n-grams: for every line of the corpus, emit each prefix
# of its token ids that is at least two tokens long (context + next word).
inputsequences = [
    tokens[:end]
    for tokens in (
        tokenizer.texts_to_sequences([row])[0] for row in text.split('\n')
    )
    for end in range(2, len(tokens) + 1)
]


In [None]:
# Length of the longest n-gram; every sequence is padded to this length.
max_sequence_len = max(map(len, inputsequences))

In [None]:
# Left-pad ('pre') every n-gram up to max_sequence_len and keep the result
# as a NumPy array so it can be sliced column-wise.
inputsequences = np.array(
    pad_sequences(
        inputsequences,
        padding='pre',
        maxlen=max_sequence_len,
    )
)

In [None]:
import tensorflow as tf

# All columns but the last form the model input (the context words);
# the last column is the target word id to predict.
x, y = inputsequences[:, :-1], inputsequences[:, -1]

# A fixed random_state makes the train/test partition identical on every
# run, so results can be reproduced and compared across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

# One-hot encode the targets after the split so the split itself works on
# compact integer labels.
y_train = tf.keras.utils.to_categorical(y_train, num_classes=total_words)
y_test = tf.keras.utils.to_categorical(y_test, num_classes=total_words)

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dropout,Dense

# Next-word classifier: embedding -> two stacked LSTMs -> softmax over the
# vocabulary.
model=Sequential()
# The network is fed the padded sequences with their last column (the label)
# removed, so each input row holds max_sequence_len-1 tokens, not
# max_sequence_len. Declaring the wrong length makes the model's declared
# input shape disagree with the data it is trained and queried on.
model.add(Embedding(total_words,100,input_length=max_sequence_len-1))
# return_sequences=True so the second LSTM receives the full sequence.
model.add(LSTM(150,return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(100))
# One output unit per vocabulary entry.
model.add(Dense(total_words,activation='softmax'))


# categorical_crossentropy expects one-hot encoded targets.
model.compile(loss="categorical_crossentropy",optimizer='adam',metrics=['accuracy'])

In [None]:
# Train for 30 epochs, evaluating on the held-out split after each epoch.
history = model.fit(
    x_train,
    y_train,
    validation_data=(x_test, y_test),
    epochs=30,
    verbose=1,
)

In [None]:
def predict_next_word(model, tokenizer, text, max_sequence_len):
  """Predict the word most likely to follow ``text``.

  Args:
    model: Object whose ``predict(batch, verbose=0)`` returns one row of
      per-word scores for the single input row.
    tokenizer: Fitted tokenizer providing ``texts_to_sequences`` and an
      ``index_word`` mapping from word id to word.
    text: Seed text to continue.
    max_sequence_len: Padded length of the training sequences *including*
      the target word; the input is padded to ``max_sequence_len - 1``.

  Returns:
    The predicted word, or ``None`` when the highest-scoring index has no
    entry in ``tokenizer.index_word`` (e.g. index 0, the padding value).
  """
  token_list = tokenizer.texts_to_sequences([text])[0]
  # Pad on the left to the same width as the model's training inputs.
  token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
  predicted_probs = model.predict(token_list, verbose=0)
  # Only one row is predicted, so the flat argmax is the word id.
  predicted_index = int(np.argmax(predicted_probs))
  # .get avoids a KeyError for ids without a word, such as padding id 0.
  return tokenizer.index_word.get(predicted_index)

In [None]:
# Seed phrase for a quick sanity check of the trained model.
input_text = "To be or not to be"
print(f"input text:{input_text}")

# Sequence length derived from the model's declared input width.
# NOTE(review): model_sequence_len is not used by the call below, which
# passes max_sequence_len instead — confirm which one is intended.
model_sequence_len = 1 + model.input_shape[1]

next_word = predict_next_word(
    model,
    tokenizer,
    input_text,
    max_sequence_len,
)
print(f"next word :{next_word}")

In [None]:
import pickle

# Persist the trained network to an .h5 file.
model.save("next_word_lstm.h5")

# Pickle the fitted tokenizer alongside the model so the same word index
# can be restored at inference time.
with open('tokenizer.pkl', 'wb') as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file, protocol=pickle.HIGHEST_PROTOCOL)