<a href="https://colab.research.google.com/github/EktaSingh1612/LGMVIP/blob/main/next_word_predictor.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# LGMVIP Data Science Internship

## Author: EKTA SINGH

## Advanced Level Task 2 - NEXT WORD PREDICTION

### Technique used: Train an RNN (LSTM) with the TensorFlow and Keras libraries to predict the next word.

# Importing libraries

In [None]:
import numpy as np
from nltk. tokenize import RegexpTokenizer 
from keras.models import Sequential, load_model 
from keras.layers import LSTM
from keras.layers.core import Dense, Activation 
from tensorflow.keras.optimizers import RMSprop
import matplotlib.pyplot as plt 
import pickle
import heapq



# Load the data

In [None]:
# Read the whole corpus into memory and lower-case it so that word matching is
# case-insensitive. The context manager closes the file handle after reading.
txtfile = '1661-0.txt'
with open(txtfile, encoding='utf-8') as corpus_file:
    text = corpus_file.read().lower()
print('corpus length:', len(text))

corpus length: 581888


# Split the entire dataset into individual words

In [None]:
# Tokenise on runs of word characters (\w+), so punctuation and whitespace
# never appear in the resulting word list.
tokenizer = RegexpTokenizer(pattern=r'\w+')
words = tokenizer.tokenize(text)

# Sorted list of unique words

In [None]:
# Build the vocabulary: np.unique returns the distinct words in sorted order,
# and the dict maps each word back to its position in that array.
unique_words = np.unique(words)
unique_word_index = {word: position for position, word in enumerate(unique_words)}

# Feature Engineering & One Hot Encoding

In [None]:
# Slide a fixed-size window over the corpus: each run of WORD_LENGTH
# consecutive words is a training context, and the word right after it is the
# label to predict.
WORD_LENGTH = 5
prev_words = [
    words[start:start + WORD_LENGTH]
    for start in range(len(words) - WORD_LENGTH)
]
next_words = [
    words[start + WORD_LENGTH]
    for start in range(len(words) - WORD_LENGTH)
]

# Show the first context/label pair as a sanity check.
print(prev_words[0])
print(next_words[0])

['project', 'gutenberg', 's', 'the', 'adventures']
of


## X: storing the features

## Y: storing the corresponding label (here, the next word)

In [None]:
# One-hot encode the dataset: X[s, p, v] is True when vocabulary word v sits at
# position p of context s; Y[s, v] is True when vocabulary word v follows it.
vocab_size = len(unique_words)
X = np.zeros((len(prev_words), WORD_LENGTH, vocab_size), dtype=bool)
Y = np.zeros((len(next_words), vocab_size), dtype=bool)
for sample_idx, context in enumerate(prev_words):
    for position, word in enumerate(context):
        X[sample_idx, position, unique_word_index[word]] = True
    Y[sample_idx, unique_word_index[next_words[sample_idx]]] = True

# Single sequence

In [None]:
# One-hot row for the first word of the first context.
print(X[0, 0])

[False False False ... False False False]


# Building the model

In [None]:
# A single 128-unit LSTM layer reads the one-hot encoded context; a dense layer
# followed by a softmax yields one output per vocabulary word.
model = Sequential([
    LSTM(128, input_shape=(WORD_LENGTH, len(unique_words))),
    Dense(len(unique_words)),
    Activation('softmax'),
])

# Training the model

In [None]:
# `learning_rate` replaces the deprecated `lr` keyword, which triggers a
# deprecation warning on older Keras releases and is no longer accepted by
# recent ones.
optimizer = RMSprop(learning_rate=0.01)
model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])
# Hold out 5% of the samples for validation; keep only the `.history` dict of
# the returned object so that it can be pickled later.
history = model.fit(X, Y, validation_split=0.05, batch_size=128, epochs=2, shuffle=True).history

  super(RMSprop, self).__init__(name, **kwargs)


Epoch 1/2
Epoch 2/2


# Save and reload the model

In [None]:
# Persist the trained model and its training history, then read both back from
# disk. Context managers make sure the pickle file handles are flushed and
# closed.
model.save('keras_next_word_model.h5')
with open("history.p", "wb") as history_file:
    pickle.dump(history, history_file)

model = load_model('keras_next_word_model.h5')
# NOTE: pickle.load can execute arbitrary code from the file; only load
# history files that this notebook wrote itself.
with open("history.p", "rb") as history_file:
    history = pickle.load(history_file)

# Prediction

# Choosing the n most probable next words

In [None]:
def prepare_input(text):
    """One-hot encode a whitespace-separated phrase for the model.

    Args:
        text: Words separated by whitespace; each must be a key of
            unique_word_index. When more than WORD_LENGTH words are given,
            only the last WORD_LENGTH are encoded.

    Returns:
        Float array of shape (1, WORD_LENGTH, len(unique_words)). Row t holds a
        single 1 at the vocabulary index of the t-th encoded word; rows beyond
        the number of supplied words stay all-zero.

    Raises:
        KeyError: If a word is not present in unique_word_index.
    """
    x = np.zeros((1, WORD_LENGTH, len(unique_words)))
    # Keep only the most recent words so that longer phrases cannot index past
    # the WORD_LENGTH rows of the array.
    for t, word in enumerate(text.split()[-WORD_LENGTH:]):
        x[0, t, unique_word_index[word]] = 1
    # The return sits after the loop so that every word is encoded, not just
    # the first one.
    return x
# Try the encoder on a lower-cased sample phrase; the cell displays the
# returned array.
sample_phrase = "It is not a lack".lower()
prepare_input(sample_phrase)

it


array([[[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]]])

In [None]:
def sample(preds, top_n = 3):
    """Return the indices of the top_n largest entries of preds.

    Args:
        preds: Sequence of scores (for example softmax probabilities), one per
            class.
        top_n: Maximum number of indices to return.

    Returns:
        List of at most top_n integer indices, ordered from the highest score
        to the lowest.
    """
    preds = np.asarray(preds).astype('float64')
    # Rank the raw scores directly. A log -> exp -> renormalise round-trip is
    # monotonic, so it cannot change the ranking, and np.log emits a
    # divide-by-zero RuntimeWarning whenever a probability is exactly 0.
    return heapq.nlargest(top_n, range(len(preds)), preds.take)

# Function for Prediction

In [None]:
def predict_completions(text, n=3):
    """Suggest up to n next words for the given phrase.

    Args:
        text: Phrase passed on to prepare_input.
        n: Number of candidate words requested from sample.

    Returns:
        The empty string when text is empty; otherwise a list of vocabulary
        words in the order of the indices returned by sample.
    """
    if text == "":
        return ""
    encoded = prepare_input(text)
    # model.predict returns one row per input; a single phrase was encoded.
    probabilities = model.predict(encoded, verbose=0)[0]
    completions = []
    for idx in sample(probabilities, n):
        completions.append(unique_words[idx])
    return completions

# Using tokenizer.tokenize to remove punctuation

In [None]:
# Demo: use the first WORD_LENGTH tokens of a sentence as the context and ask
# for five candidate next words.
q = "Light the candle instead of cursuing darkness"
print("correct sentence: ",q)
# Tokenising drops punctuation. Slicing by WORD_LENGTH rather than a literal 5
# keeps the context the same size as the windows the model was built for.
seq = " ".join(tokenizer.tokenize(q.lower())[:WORD_LENGTH])
print("Sequence: ",seq)
print("next possible words: ", predict_completions(seq, 5))

correct sentence:  Light the candle instead of cursuing darkness
Sequence:  light the candle instead of
light
next possible words:  ['and', 'the', 'a', 'of', 'with']
