<a href="https://colab.research.google.com/github/Dynamic369/Next_Word_Prediction/blob/main/Next_Word_Predictor.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

1. Load the data and save it in any text file
2. data preprocessing

  :-open the data from that text file

  :- tokenize the text

  :- create the input sequences so that the model is able to learn to predict the next word

  :- padding (padding is necessary because neural networks require inputs of the same size)
  
  :- split the sequences into x, y and then into train and test data

3. Define a Sequential LSTM model that contains one Embedding layer, two LSTM layers, one Dropout layer and one Dense layer.
4. Train the model.
5. Make a prediction function for prediction.

In [None]:
import nltk
nltk.download("gutenberg")
from nltk.corpus import gutenberg
import pandas as pd

# Pull the raw text of Hamlet out of the NLTK Gutenberg corpus.
data = gutenberg.raw('shakespeare-hamlet.txt')

# Persist the raw text to disk so the preprocessing step can read it back.
with open('hamet.txt', mode='w') as hamlet_file:
  hamlet_file.write(data)


In [None]:
# Number of characters in the raw text (len() on the text counts characters, not words)
len(data)

In [None]:
# Data Preprocessing
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

# Open the dataset. Read it from the same relative path it was written to
# ('hamet.txt') instead of the Colab-only absolute '/content/...' path, so the
# notebook also runs outside Colab.
with open('hamet.txt','r') as file:
  text = file.read()

# Tokenize the text: build the word -> integer index vocabulary.
tokenizer = Tokenizer()
tokenizer.fit_on_texts([text])
# +1 leaves room for index 0, which the Keras Tokenizer does not assign to any
# word (the notebook uses 0 as the padding value).
total_words = len(tokenizer.word_index)+1
total_words


In [None]:
# Preview the first 20 (word -> index) entries instead of dumping the whole
# vocabulary dictionary into the cell output.
dict(list(tokenizer.word_index.items())[:20])

In [None]:
# Build the training n-grams: for every line of the text, emit each prefix of
# its token ids that is at least two tokens long (context + word to predict).
tokenized_lines = tokenizer.texts_to_sequences(text.split("\n"))
input_sequences = [
  token_ids[:end]
  for token_ids in tokenized_lines
  for end in range(2, len(token_ids) + 1)
]

In [None]:
# Show only the first few n-gram sequences rather than printing all of them.
input_sequences[:5]

In [None]:
# Length of the longest n-gram; every sequence is padded to this length.
max_sequence_len = max(len(sequence) for sequence in input_sequences)


In [None]:
# Left-pad every n-gram with zeros to a common length so they stack into one matrix.
# pad_sequences already returns a NumPy array, so no extra np.array(...) copy is needed.
input_sequences = pad_sequences(input_sequences,maxlen=max_sequence_len,padding='pre')
input_sequences

In [None]:
#3 create predictors and label
import tensorflow as tf
# Every column except the last is the context; the last column is the word to predict.
x = input_sequences[:, :-1]
y = input_sequences[:, -1]

In [None]:
# One-hot encode the labels. Read them from the last column of input_sequences
# instead of re-encoding `y` in place, so re-running this cell does not one-hot
# encode an array that is already one-hot.
y = tf.keras.utils.to_categorical(input_sequences[:,-1],num_classes=total_words)

In [None]:
# Display the one-hot encoded label matrix produced by to_categorical.
y

In [None]:
# Hold out 20% of the samples as the test set; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
  x,
  y,
  test_size=0.2,
  random_state=42,
)

In [None]:
# Early stopping
from tensorflow.keras.callbacks import EarlyStopping
# Configured to watch validation loss with a patience of 3 epochs and to
# restore the best weights seen.
early_stop = EarlyStopping(
  monitor='val_loss',
  patience=3,
  restore_best_weights=True,
)

In [None]:
# Build our LSTM RNN
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding,LSTM,Dense,Dropout

## Define the model
# Embedding -> LSTM (returns the full sequence) -> Dropout -> LSTM -> softmax
# over the vocabulary.
model = Sequential([
  Embedding(total_words, 100, input_shape=(max_sequence_len-1,)),
  LSTM(150, return_sequences=True),
  Dropout(0.2),
  LSTM(100),
  Dense(total_words, activation='softmax'),
])

# Compile the model: labels are one-hot vectors, hence categorical cross-entropy.
model.compile(
  optimizer='adam',
  loss='categorical_crossentropy',
  metrics=['accuracy'],
)
model.summary()

In [None]:
# Train the model.
# Pass the `early_stop` callback so training actually halts once `val_loss`
# stops improving; without it the callback is defined but never used and all
# 100 epochs always run.
history = model.fit(
  x_train,
  y_train,
  epochs=100,
  validation_data=(x_test,y_test),
  callbacks=[early_stop],
  verbose=1,
)

In [None]:
# Function to predict the next word.
def predict_next_word(model,tokenizer, text, max_sequence_len):
  """Return the most likely next word for `text`, or None if no word matches.

  Args:
    model: object whose `predict(batch, verbose=0)` returns one row of
      per-word scores for each padded sequence in `batch`.
    tokenizer: fitted Keras `Tokenizer`; its `texts_to_sequences` method and
      `index_word` mapping are used.
    text: seed text to continue.
    max_sequence_len: sequence length including the word to predict; the model
      input is `max_sequence_len - 1` tokens wide.

  Returns:
    The word whose index has the highest score, or None when that index has no
    entry in the vocabulary (for example the padding index 0).
  """
  context_len = max_sequence_len-1
  token_list = tokenizer.texts_to_sequences([text])[0]
  # Keep only the most recent tokens when the seed is longer than the model input.
  if len(token_list) > context_len:
    token_list = token_list[-context_len:]

  padded = pad_sequences([token_list],maxlen=context_len,padding='pre')
  predicted = model.predict(padded,verbose=0)
  # argmax over axis 1 yields a length-1 array for the single-row batch; take a
  # plain int so the lookup below does not rely on array truthiness.
  predicted_word_index = int(np.argmax(predicted,axis=1)[0])
  # index_word is the tokenizer's index -> word mapping: a direct lookup
  # replaces the linear scan over word_index.
  return tokenizer.index_word.get(predicted_word_index)


In [None]:
# Demo: predict the word that follows a fixed seed phrase.
input_text = "When yond same Starre that's Westward from the"
print(f"Input text: {input_text}")
# The model input is one token shorter than the full training sequence length.
max_sequence_len = model.input_shape[1]+1
next_word = predict_next_word(
  model,
  tokenizer,
  text=input_text,
  max_sequence_len=max_sequence_len,
)
print(f"Next word: {next_word}")

In [None]:
# Interactive variant: read the seed phrase from the user.
input_text = input("Enter the text")
print(f"Input text: {input_text}")
# The model input is one token shorter than the full training sequence length.
max_sequence_len = model.input_shape[1]+1
next_word = predict_next_word(
  model,
  tokenizer,
  text=input_text,
  max_sequence_len=max_sequence_len,
)
print(f"Next word: {next_word}")