In [None]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam
import pickle
import numpy as np
import os

In [None]:
file = open("/kaggle/input/next-word-prediction/1661-0.txt", "r", encoding="utf8")
data = file.read()
data = data.replace('\n', '').replace('\r', '').replace('\ufeff', '').replace('“','').replace('”','')
data = ' '.join(data.split())
print(data[:500])
print(len(data))

Project Gutenberg's The Adventures of Sherlock Holmes, by Arthur Conan DoyleThis eBook is for the use of anyone anywhere at no cost and withalmost no restrictions whatsoever. You may copy it, give it away orre-use it under the terms of the Project Gutenberg License includedwith this eBook or online at www.gutenberg.netTitle: The Adventures of Sherlock HolmesAuthor: Arthur Conan DoyleRelease Date: November 29, 2002 [EBook #1661]Last Updated: May 20, 2019Language: EnglishCharacter set encoding: UT
564156


In [13]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts([data])
pickle.dump(tokenizer, open('token.pkl', 'wb'))
sequence_data = tokenizer.texts_to_sequences([data])[0]
print(sequence_data[:15])
print(len(sequence_data))

[159, 4841, 1, 956, 5, 122, 32, 44, 548, 2007, 4842, 1035, 13, 21, 1]
102507


In [14]:
vocab_size = len(tokenizer.word_index) + 1
print(vocab_size)
sequences = []

for i in range(3, len(sequence_data)):
    words = sequence_data[i-3:i+1]
    sequences.append(words)
    
print("The Length of sequences are: ", len(sequences))
sequences = np.array(sequences)
print(sequences[:10])

13136
The Length of sequences are:  102504
[[ 159 4841    1  956]
 [4841    1  956    5]
 [   1  956    5  122]
 [ 956    5  122   32]
 [   5  122   32   44]
 [ 122   32   44  548]
 [  32   44  548 2007]
 [  44  548 2007 4842]
 [ 548 2007 4842 1035]
 [2007 4842 1035   13]]


In [15]:
X = []
y = []

for i in sequences:
    X.append(i[0:3])
    y.append(i[3])
    
X = np.array(X)
y = np.array(y)

In [16]:
print("Data: ", X[:10])
print("Response: ", y[:10])

Data:  [[ 159 4841    1]
 [4841    1  956]
 [   1  956    5]
 [ 956    5  122]
 [   5  122   32]
 [ 122   32   44]
 [  32   44  548]
 [  44  548 2007]
 [ 548 2007 4842]
 [2007 4842 1035]]
Response:  [ 956    5  122   32   44  548 2007 4842 1035   13]


In [17]:
y = to_categorical(y, num_classes=vocab_size)
y[:5]

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [19]:
embedding_index = {}
glove_file = '/kaggle/input/glove-embeddings/glove.6B.100d.txt'  # path to your GloVe file

with open(glove_file, 'r', encoding='utf8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embedding_index[word] = coefs

print("Loaded GloVe word vectors:", len(embedding_index))

# Step 2: Create embedding matrix
embedding_dim = 100  # based on GloVe file used
embedding_matrix = np.zeros((vocab_size, embedding_dim))

for word, i in tokenizer.word_index.items():
    embedding_vector = embedding_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

print("Embedding matrix shape:", embedding_matrix.shape)

Loaded GloVe word vectors: 400000
Embedding matrix shape: (13136, 100)


In [20]:
class AttentionLayer(tf.keras.layers.Layer):
    def __init__(self, **kwargs):
        super(AttentionLayer, self).__init__(**kwargs)

    def build(self, input_shape):
        self.W = self.add_weight(name="att_weight", shape=(input_shape[-1], 1),
                                 initializer="random_normal", trainable=True)
        self.b = self.add_weight(name="att_bias", shape=(1,),
                                 initializer="zeros", trainable=True)
        super(AttentionLayer, self).build(input_shape)

    def call(self, x):
        e = tf.keras.backend.tanh(tf.keras.backend.dot(x, self.W) + self.b)
        a = tf.keras.backend.softmax(e, axis=1)
        output = x * a
        return tf.keras.backend.sum(output, axis=1)


In [21]:
model = Sequential()
model.add(Embedding(
    input_dim=vocab_size,
    output_dim=embedding_dim,       
    weights=[embedding_matrix],     
    input_length=3,
    trainable=False                 
))
model.add(LSTM(1000, return_sequences=True))  
model.add(AttentionLayer())                   
model.add(Dense(1000, activation="relu"))
model.add(Dense(vocab_size, activation="softmax"))
model.build(input_shape=(None, 3))
model.summary()


In [22]:
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping

checkpoint = ModelCheckpoint("next_words.h5", monitor='loss', verbose=1, save_best_only=True)

early_stop = EarlyStopping(
    monitor='loss',  
    patience=5,
    restore_best_weights=True,
    verbose=1
)

model.compile(loss="categorical_crossentropy", optimizer=Adam(learning_rate=0.001))

model.fit(X, y, epochs=70, batch_size=64, callbacks=[checkpoint, early_stop])


Epoch 1/70
[1m1600/1602[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 17ms/step - loss: 7.0087
Epoch 1: loss improved from inf to 6.73303, saving model to next_words.h5
[1m1602/1602[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 18ms/step - loss: 7.0082
Epoch 2/70
[1m1600/1602[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 17ms/step - loss: 6.1134
Epoch 2: loss improved from 6.73303 to 6.06280, saving model to next_words.h5
[1m1602/1602[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 18ms/step - loss: 6.1133
Epoch 3/70
[1m1600/1602[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 17ms/step - loss: 5.6334
Epoch 3: loss improved from 6.06280 to 5.60157, saving model to next_words.h5
[1m1602/1602[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 18ms/step - loss: 5.6333
Epoch 4/70
[1m1600/1602[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 17ms/step - loss: 5.2522
Epoch 4: loss improved from 5.60157 to 5.22371, saving model to next_wor

KeyboardInterrupt: 

In [24]:
def Predict_Next_Words_Beam(model, tokenizer, text, beam_width=3, next_words=5):

    sequence = tokenizer.texts_to_sequences([text])[0]

    beam = [(sequence, 0.0)]  

    for _ in range(next_words):
        candidates = []

        for seq, score in beam:
            seq_array = np.array(seq[-3:]).reshape(1, -1)  # keep last 3 words as per model input
            preds = model.predict(seq_array, verbose=0)[0]

            top_indices = preds.argsort()[-beam_width:][::-1]

            for idx in top_indices:
                word = tokenizer.index_word[idx]
                prob = np.log(preds[idx] + 1e-10)  # log prob
                candidates.append((seq + [idx], score + prob))

        beam = sorted(candidates, key=lambda x: x[1], reverse=True)[:beam_width]

    best_seq = beam[0][0]
    predicted_words = [tokenizer.index_word[i] for i in best_seq[len(sequence):]]

    print(f"Predicted sequence: {' '.join(predicted_words)}")
    return predicted_words


In [None]:
while True:
    text = input("Enter your line: ")
    
    if text == "0":
        print("Execution completed.....")
        break
    else:
        try:
            text = text.split()[-3:]  # last 3 words
            print("Input:", text)
            
            Predict_Next_Words_Beam(model, tokenizer, text, beam_width=3, next_words=10)
        except Exception as e:
            print("Error occurred:", e)
            continue

Enter your line:  the project
Input: ['the', 'project']
Predicted sequence: gutenberg literary archive foundation was created to provide a secureand


In [None]:
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from tensorflow.keras.utils import to_categorical
import numpy as np

def calculate_perplexity(model, X, y_true, vocab_size):
    y_onehot = to_categorical(y_true, num_classes=vocab_size)
    loss = model.evaluate(X, y_onehot, verbose=0)
    return np.exp(loss)

def calculate_bleu(model, tokenizer, X, y_true):
    preds = model.predict(X, verbose=0)
    pred_indices = np.argmax(preds, axis=1)

    smoothie = SmoothingFunction().method1
    scores = []

    for pred, true in zip(pred_indices, y_true):
        if true in tokenizer.index_word:  
            reference = [[tokenizer.index_word[true]]]  
            hypothesis = [tokenizer.index_word.get(pred, "")]
            score = sentence_bleu(reference, hypothesis,
                                  smoothing_function=smoothie,
                                  weights=(1, 0, 0, 0))
            scores.append(score)

    return np.mean(scores) if scores else 0.0


def top_k_accuracy(model, X, y_true, k=5):
    preds = model.predict(X, verbose=0)
    top_k = np.argsort(preds, axis=1)[:, -k:]  
    correct = sum(true in top_k[i] for i, true in enumerate(y_true))
    return correct / len(y_true)

y_int = np.argmax(y, axis=1) if y.ndim > 1 else y  

perplexity = calculate_perplexity(model, X, y_int, vocab_size)
bleu = calculate_bleu(model, tokenizer, X, y_int)
top1 = top_k_accuracy(model, X, y_int, k=1)
top5 = top_k_accuracy(model, X, y_int, k=5)

print(f"Perplexity: {perplexity:.4f}")
print(f"BLEU-1 Score: {bleu:.4f}")
print(f"Top-1 Accuracy: {top1:.4f}")
print(f"Top-5 Accuracy: {top5:.4f}")
