In [16]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam
import pickle
import numpy as np
import os

In [17]:
file = open("/kaggle/input/next-word-prediction/1661-0.txt", "r", encoding="utf8")
data = file.read()
data = data.replace('\n', '').replace('\r', '').replace('\ufeff', '').replace('“','').replace('”','')
data = ' '.join(data.split())
print(data[:500])
print(len(data))

Project Gutenberg's The Adventures of Sherlock Holmes, by Arthur Conan DoyleThis eBook is for the use of anyone anywhere at no cost and withalmost no restrictions whatsoever. You may copy it, give it away orre-use it under the terms of the Project Gutenberg License includedwith this eBook or online at www.gutenberg.netTitle: The Adventures of Sherlock HolmesAuthor: Arthur Conan DoyleRelease Date: November 29, 2002 [EBook #1661]Last Updated: May 20, 2019Language: EnglishCharacter set encoding: UT
564156


In [18]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts([data])
pickle.dump(tokenizer, open('token.pkl', 'wb'))
sequence_data = tokenizer.texts_to_sequences([data])[0]
print(sequence_data[:15])
print(len(sequence_data))

[159, 4841, 1, 956, 5, 122, 32, 44, 548, 2007, 4842, 1035, 13, 21, 1]
102507


In [19]:
vocab_size = len(tokenizer.word_index) + 1
print(vocab_size)
sequences = []

for i in range(3, len(sequence_data)):
    words = sequence_data[i-3:i+1]
    sequences.append(words)
    
print("The Length of sequences are: ", len(sequences))
sequences = np.array(sequences)
print(sequences[:10])

13136
The Length of sequences are:  102504
[[ 159 4841    1  956]
 [4841    1  956    5]
 [   1  956    5  122]
 [ 956    5  122   32]
 [   5  122   32   44]
 [ 122   32   44  548]
 [  32   44  548 2007]
 [  44  548 2007 4842]
 [ 548 2007 4842 1035]
 [2007 4842 1035   13]]


In [20]:
X = []
y = []

for i in sequences:
    X.append(i[0:3])
    y.append(i[3])
    
X = np.array(X)
y = np.array(y)

In [21]:
print("Data: ", X[:10])
print("Response: ", y[:10])

Data:  [[ 159 4841    1]
 [4841    1  956]
 [   1  956    5]
 [ 956    5  122]
 [   5  122   32]
 [ 122   32   44]
 [  32   44  548]
 [  44  548 2007]
 [ 548 2007 4842]
 [2007 4842 1035]]
Response:  [ 956    5  122   32   44  548 2007 4842 1035   13]


In [22]:
y = to_categorical(y, num_classes=vocab_size)
y[:5]

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [24]:
embedding_index = {}
glove_file = '/kaggle/input/glove-embeddings/glove.6B.100d.txt'  # path to your GloVe file

with open(glove_file, 'r', encoding='utf8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embedding_index[word] = coefs

print("Loaded GloVe word vectors:", len(embedding_index))

embedding_dim = 100  
embedding_matrix = np.zeros((vocab_size, embedding_dim))

for word, i in tokenizer.word_index.items():
    embedding_vector = embedding_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

print("Embedding matrix shape:", embedding_matrix.shape)

Loaded GloVe word vectors: 400000
Embedding matrix shape: (13136, 100)


In [25]:
class AttentionLayer(tf.keras.layers.Layer):
    def __init__(self, **kwargs):
        super(AttentionLayer, self).__init__(**kwargs)

    def build(self, input_shape):
        self.W = self.add_weight(name="att_weight", shape=(input_shape[-1], 1),
                                 initializer="random_normal", trainable=True)
        self.b = self.add_weight(name="att_bias", shape=(1,),
                                 initializer="zeros", trainable=True)
        super(AttentionLayer, self).build(input_shape)

    def call(self, x):
        e = tf.keras.backend.tanh(tf.keras.backend.dot(x, self.W) + self.b)
        a = tf.keras.backend.softmax(e, axis=1)
        output = x * a
        return tf.keras.backend.sum(output, axis=1)


In [26]:
model = Sequential()
model.add(Embedding(
    input_dim=vocab_size,
    output_dim=embedding_dim,       
    weights=[embedding_matrix],     
    input_length=3,
    trainable=False                 
))
model.add(LSTM(1000, return_sequences=True))  
model.add(AttentionLayer())                   
model.add(Dense(1000, activation="relu"))
model.add(Dense(vocab_size, activation="softmax"))
model.build(input_shape=(None, 3))
model.summary()




In [27]:
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping

checkpoint = ModelCheckpoint("next_words.h5", monitor='loss', verbose=1, save_best_only=True)

early_stop = EarlyStopping(
    monitor='loss',  
    patience=5,
    restore_best_weights=True,
    verbose=1
)

model.compile(loss="categorical_crossentropy", optimizer=Adam(learning_rate=0.001))

model.fit(X, y, epochs=40, batch_size=64, callbacks=[checkpoint, early_stop])


Epoch 1/40
[1m1601/1602[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 17ms/step - loss: 7.0142
Epoch 1: loss improved from inf to 6.72205, saving model to next_words.h5
[1m1602/1602[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 17ms/step - loss: 7.0138
Epoch 2/40
[1m1602/1602[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step - loss: 6.1193
Epoch 2: loss improved from 6.72205 to 6.06944, saving model to next_words.h5
[1m1602/1602[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 18ms/step - loss: 6.1193
Epoch 3/40
[1m1601/1602[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 17ms/step - loss: 5.6554
Epoch 3: loss improved from 6.06944 to 5.61820, saving model to next_words.h5
[1m1602/1602[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 17ms/step - loss: 5.6554
Epoch 4/40
[1m1602/1602[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step - loss: 5.2760
Epoch 4: loss improved from 5.61820 to 5.25083, saving model to next_wor

<keras.src.callbacks.history.History at 0x7a6fa2e01a90>

In [28]:
def Predict_Next_Words_Beam(model, tokenizer, text, beam_width=3, next_words=5):

    sequence = tokenizer.texts_to_sequences([text])[0]

    beam = [(sequence, 0.0)]  

    for _ in range(next_words):
        candidates = []

        for seq, score in beam:
            seq_array = np.array(seq[-3:]).reshape(1, -1) 
            preds = model.predict(seq_array, verbose=0)[0]

            top_indices = preds.argsort()[-beam_width:][::-1]

            for idx in top_indices:
                word = tokenizer.index_word[idx]
                prob = np.log(preds[idx] + 1e-10)  
                candidates.append((seq + [idx], score + prob))

        beam = sorted(candidates, key=lambda x: x[1], reverse=True)[:beam_width]

    best_seq = beam[0][0]
    predicted_words = [tokenizer.index_word[i] for i in best_seq[len(sequence):]]

    print(f"Predicted sequence: {' '.join(predicted_words)}")
    return predicted_words


In [None]:
while True:
    text = input("Enter your line: ")
    
    if text == "0":
        print("Execution completed.....")
        break
    else:
        try:
            text = text.split()[-3:]  
            print("Input:", text)
            
            Predict_Next_Words_Beam(model, tokenizer, text, beam_width=3, next_words=10)
        except Exception as e:
            print("Error occurred:", e)
            continue


Enter your line:  the project


Input: ['the', 'project']
Predicted sequence: gutenberg literary archive foundation royalty payments must be clearly marked


In [31]:
import math

def calculate_perplexity(model, X, y, batch_size=512):
    cross_entropy = 0.0
    n_samples = 0
    epsilon = 1e-10

    for i in range(0, len(X), batch_size):
        X_batch = X[i:i+batch_size]
        y_batch = y[i:i+batch_size]

        y_pred = model.predict(X_batch, verbose=0)
        cross_entropy += -np.sum(y_batch * np.log(y_pred + epsilon))
        n_samples += y_batch.shape[0]

    cross_entropy /= n_samples
    perplexity = math.exp(cross_entropy)
    return cross_entropy, perplexity

cross_entropy, perplexity = calculate_perplexity(model, X, y, batch_size=512)
print("Cross-entropy loss:", cross_entropy)
print("Perplexity:", perplexity)


Cross-entropy loss: 0.3820629202958525
Perplexity: 1.4653042796112645


In [34]:
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
import numpy as np

smoothie = SmoothingFunction().method4

def calculate_bleu_batch_verbose(model, tokenizer, X, y_true, batch_size=512, print_examples=10):
    scores = []
    examples_printed = 0

    for i in range(0, len(X), batch_size):
        X_batch = X[i:i+batch_size]
        y_batch = y_true[i:i+batch_size]

        y_pred_batch = model.predict(X_batch, verbose=0)

        for j in range(len(X_batch)):
            pred_index = np.argmax(y_pred_batch[j])
            pred_word = tokenizer.index_word[pred_index]

            true_index = np.argmax(y_batch[j])
            true_word = tokenizer.index_word[true_index]

            score = sentence_bleu([[true_word]], [pred_word], smoothing_function=smoothie)
            scores.append(score)

            if examples_printed < print_examples:
                print(f"Input sequence: {[tokenizer.index_word[idx] for idx in X_batch[j]]}")
                print(f"Predicted next word: {pred_word} | Actual next word: {true_word}\n")
                examples_printed += 1

    return np.mean(scores)

bleu_score_verbose = calculate_bleu_batch_verbose(model, tokenizer, X, y, batch_size=512, print_examples=10)
print("Average BLEU score:", bleu_score_verbose)


Input sequence: ['project', "gutenberg's", 'the']
Predicted next word: adventures | Actual next word: adventures

Input sequence: ["gutenberg's", 'the', 'adventures']
Predicted next word: of | Actual next word: of

Input sequence: ['the', 'adventures', 'of']
Predicted next word: sherlock | Actual next word: sherlock

Input sequence: ['adventures', 'of', 'sherlock']
Predicted next word: holmes | Actual next word: holmes

Input sequence: ['of', 'sherlock', 'holmes']
Predicted next word: by | Actual next word: by

Input sequence: ['sherlock', 'holmes', 'by']
Predicted next word: arthur | Actual next word: arthur

Input sequence: ['holmes', 'by', 'arthur']
Predicted next word: conan | Actual next word: conan

Input sequence: ['by', 'arthur', 'conan']
Predicted next word: doyle | Actual next word: doylethis

Input sequence: ['arthur', 'conan', 'doylethis']
Predicted next word: date | Actual next word: ebook

Input sequence: ['conan', 'doylethis', 'ebook']
Predicted next word: is | Actual ne