In [45]:
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
import numpy as np
import pandas as pd

In [55]:
# We read data and specified Tokenizer
tokenizer = Tokenizer()
data = pd.read_csv("/Users/maheshbabu/Desktop/NLP-Next-Word-Predictor/ArticlesJan2017.csv")

corpus = data['snippet'].str.lower().tolist()

# corpus = data.lower().split("\n")

tokenizer.fit_on_texts(corpus)

total_words = len(tokenizer.word_index) + 1 


print(tokenizer.word_index)
print(total_words)

{'the': 1, 'a': 2, 'to': 3, 'of': 4, 'and': 5, 'in': 6, 'for': 7, 'on': 8, 'is': 9, 'that': 10, 'with': 11, 'be': 12, 'as': 13, 'but': 14, 'are': 15, 'it': 16, 'president': 17, 'an': 18, 'his': 19, 'by': 20, 'new': 21, 'what': 22, 'you': 23, 'at': 24, 'about': 25, 'trump': 26, 'has': 27, 'have': 28, 'from': 29, 'this': 30, 'was': 31, 'who': 32, 'can': 33, 'not': 34, 'will': 35, 'or': 36, 'their': 37, 'how': 38, 'they': 39, 'more': 40, 'donald': 41, 'do': 42, '—': 43, 'trump’s': 44, 'he': 45, 'when': 46, 'times': 47, 'our': 48, 'one': 49, 'most': 50, 'its': 51, 'people': 52, 'would': 53, 'now': 54, 'your': 55, 'mr': 56, 'it’s': 57, 'york': 58, 'may': 59, 'could': 60, 'us': 61, 'some': 62, 'been': 63, '”': 64, 'i': 65, 'we': 66, 'like': 67, 'j': 68, 'up': 69, 'over': 70, 'news': 71, 'than': 72, 'no': 73, 'american': 74, 'women': 75, 'so': 76, 'even': 77, 'should': 78, 'country': 79, 'years': 80, 'care': 81, 'which': 82, 'two': 83, 'her': 84, 'them': 85, 'all': 86, 'were': 87, 'why': 88, 

In [57]:
input_sequences = []
for line in corpus:
    token_list = tokenizer.texts_to_sequences([line])[0]
    for i in range(1, len(token_list)):
        n_gram_sequence = np.array(token_list[:i+1])
        input_sequences.append(n_gram_sequence)

In [58]:
max_sequence_len = max([len(x) for x in input_sequences])

In [59]:
input_sequences = np.array(pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre'))

In [60]:
xs, labels = input_sequences[:,:-1],input_sequences[:,-1]
ys = tf.keras.utils.to_categorical(labels, num_classes=total_words)

In [61]:
model = Sequential()
model.add(Embedding(total_words, 100, input_length=max_sequence_len-1))
model.add(Bidirectional(LSTM(150)))
model.add(Dense(total_words, activation='softmax'))
adam = Adam(lr=0.01)
model.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['accuracy'])
history = model.fit(xs, ys, epochs=30,validation_split=0.1, verbose=1)

Epoch 1/30


  super().__init__(name, **kwargs)
2023-04-09 19:59:01.897110: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2023-04-09 19:59:02.089493: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2023-04-09 19:59:02.107119: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2023-04-09 19:59:02.382969: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2023-04-09 19:59:02.398768: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.




2023-04-09 19:59:22.675653: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2023-04-09 19:59:22.743755: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2023-04-09 19:59:22.752706: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
<keras.engine.sequential.Sequential object at 0x17e658e80>


In [62]:
seed_text = "I don't think it should be a"
next_words = 10
#Prediction
for _ in range(next_words):
    token_list = tokenizer.texts_to_sequences([seed_text])[0]
    token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
    predicted = model.predict(token_list, verbose=0)
    predicted = np.argmax(predicted, axis=1)
    output_word = ""
    for word, index in tokenizer.word_index.items():
        if index == predicted:
            output_word = word
            break
    seed_text += " " + output_word
    print(seed_text)

2023-04-09 20:08:45.694431: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2023-04-09 20:08:45.751058: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2023-04-09 20:08:45.763239: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


I don't think it should be a compelling
I don't think it should be a compelling slogan
I don't think it should be a compelling slogan but
I don't think it should be a compelling slogan but greatness
I don't think it should be a compelling slogan but greatness are
I don't think it should be a compelling slogan but greatness are they
I don't think it should be a compelling slogan but greatness are they are
I don't think it should be a compelling slogan but greatness are they are putting
I don't think it should be a compelling slogan but greatness are they are putting forth
I don't think it should be a compelling slogan but greatness are they are putting forth an


In [63]:
predictions = model.predict(xs)



In [37]:
np.argmax(predictions,axis = 1)

array([  17,  296,    5, ..., 3682,  138, 3675])

In [12]:
predictions.shape

(15348, 4779)

In [13]:
labels

array([  50,  296,    5, ..., 4777,   48, 4778], dtype=int32)

In [65]:
import tensorflow as tf
loss = tf.reduce_mean(tf.keras.losses.sparse_categorical_crossentropy(labels, predictions))
perplexity = tf.exp(loss)
print('Perplexity: {:.2f}'.format(perplexity.numpy()))

Perplexity: 4.81
