# **Next Word Prediction**

In [None]:
from IPython.display import display
import ipywidgets as widgets

In [None]:
# Mount Google Drive at /content/drive; the dataset read below and the
# artifacts saved later all live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
from typing import Text
# Read the whole training corpus from Drive as UTF-8 text.
with open("/content/drive/MyDrive/Colab Notebooks/dataset.txt", "r", encoding="utf-8") as file:
    raw_text = file.read()

# Single translation pass: newlines become spaces, while underscores, commas,
# question marks and both quote characters are removed. Periods are kept
# because the text is split into sentences on "." later.
cleanup_table = str.maketrans({
    "_": None,
    "\n": " ",
    ",": None,
    "?": None,
    "\"": None,
    "\'": None,
})
text = raw_text.translate(cleanup_table)
print(text)

In [None]:
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
# Sentences are obtained by splitting the cleaned text on periods.
corpus = text.split(".")

# Fit the vocabulary on the full text (passed as a single document).
tokenizer = Tokenizer()
tokenizer.fit_on_texts([text])

# One extra slot on top of the known words: Keras word indices start at 1,
# and index 0 is what pad_sequences fills with.
vocabulary = tokenizer.word_index
total_words = 1 + len(vocabulary)
print(total_words)

In [None]:
# For each sentence, collect every prefix of its token ids that has at least
# two tokens: the last id of a prefix is the word to predict from the rest.
input_sequences = []
for sentence in corpus:
    encoded = tokenizer.texts_to_sequences([sentence])[0]
    input_sequences.extend(encoded[:stop] for stop in range(2, len(encoded) + 1))

In [None]:
from keras.utils import to_categorical
# Build the training n-grams from the sentence list `corpus`: every prefix of
# length >= 2 of each encoded sentence, in sentence order. This assignment
# replaces any earlier value of `input_sequences`.
input_sequences = [
    encoded[:stop]
    for encoded in tokenizer.texts_to_sequences(corpus)
    for stop in range(2, len(encoded) + 1)
]

In [None]:
# Length of the longest n-gram; every sequence is left-padded to this length.
max_seq_len = max(map(len, input_sequences))
padded = pad_sequences(input_sequences, maxlen=max_seq_len, padding='pre')
input_sequences = np.array(padded)

# All columns but the last are the model input; the last column is the id of
# the word to predict, one-hot encoded over the vocabulary.
X, targets = input_sequences[:, :-1], input_sequences[:, -1]
y = to_categorical(targets, num_classes=total_words)

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

# Embedding -> LSTM -> softmax over the vocabulary, assembled layer by layer.
model = Sequential()
model.add(Embedding(input_dim=total_words, output_dim=32))
model.add(LSTM(100))
model.add(Dense(total_words, activation='softmax'))

# Inputs are the padded prefixes, i.e. one token shorter than max_seq_len.
model.build(input_shape=(None, max_seq_len - 1))

model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
model.summary()

In [None]:
# Train on all (prefix, next-word) pairs for 50 epochs; no validation data is
# passed. `history` holds whatever model.fit returns.
history = model.fit(X, y, epochs=50, verbose=1)

In [None]:
# X and y are the same arrays the model was fitted on, so this number is the
# accuracy on the training data, not on a held-out test set.
loss, accuracy = model.evaluate(X, y)
print(f"Training Accuracy: {accuracy * 100:.2f}%")

In [None]:
import pickle
# Everything needed to preprocess text for inference: the fitted tokenizer and
# the sequence length the model was trained with.
artifacts = {'tokenizer': tokenizer, 'max_seq_len': max_seq_len}
with open('/content/drive/MyDrive/Colab Notebooks/tokenizer_and_max_seq_len.pkl', 'wb') as f:
    pickle.dump(artifacts, f)

# The trained network is stored separately in the .keras format.
model.save('/content/drive/MyDrive/Colab Notebooks/trained_model.keras')

In [None]:
import numpy as np

def predict_top_n(seed_text, top_n=1):
    """
    Predicts the top N next words for a given seed text.

    The seed is encoded with the module-level ``tokenizer``, left-padded to
    ``max_seq_len - 1`` tokens and passed to the module-level ``model``.
    Output indices that have no entry in ``tokenizer.index_word`` (such as
    index 0, which is used for padding) are skipped, so only real words are
    returned.

    Args:
        seed_text (str): The input text to predict the next word for.
        top_n (int): The number of top predictions to return. Values of 0 or
            less yield an empty list.

    Returns:
        list: A list of tuples, where each tuple contains the predicted word
              and its probability, sorted by probability in descending order.
              It holds at most ``top_n`` entries, fewer if the vocabulary is
              smaller than ``top_n``.
    """
    if top_n <= 0:
        return []

    token_list = tokenizer.texts_to_sequences([seed_text])[0]
    token_list = pad_sequences([token_list], maxlen=max_seq_len-1, padding='pre')
    predicted_probs = model.predict(token_list, verbose=0)[0]

    # Walk all indices from most to least probable rather than slicing the
    # first top_n, so that skipped non-word indices do not shrink the result.
    ranked_indices = np.argsort(predicted_probs)[::-1]

    predictions = []
    for index in ranked_indices:
        predicted_word = tokenizer.index_word.get(int(index))
        if predicted_word is None:
            # No word is mapped to this index (e.g. the padding index 0).
            continue
        predictions.append((predicted_word, predicted_probs[index]))
        if len(predictions) == top_n:
            break

    return predictions

In [None]:
# Free-text field the user types the seed sentence into.
input_layout = widgets.Layout(width='90%')
input_box = widgets.Text(
    value='',
    placeholder='Type your sentence...',
    description='Input:',
    disabled=False,
    layout=input_layout,
)

# Area that receives the predictions and the suggestion dropdown.
output_box = widgets.Output()

def on_text_change(change):
    """
    Refresh the suggestions whenever the text in the input box changes.

    Clears ``output_box``; if the new text contains anything other than
    whitespace, prints the five most probable next words with their
    probabilities and shows a dropdown of those words. Selecting a different
    dropdown entry prints the seed text followed by the chosen word.

    Args:
        change (dict): Change notification from the observed widget; only
            ``change['new']`` (the current text) is used.
    """
    output_box.clear_output()
    seed = change['new']
    if not seed.strip():
        # Nothing but whitespace: leave the output area empty.
        return

    with output_box:
        predictions = predict_top_n(seed, top_n=5)
        print("Predicted next words with probabilities:")
        for word, prob in predictions:
            print(f"🔹 {word}: {prob:.4f}")

        dropdown = widgets.Dropdown(
            options=[word for word, _ in predictions],
            description='Choose:',
            layout=widgets.Layout(width='50%')
        )

        def on_select_change(selection):
            # This callback runs later, after the enclosing `with` block has
            # exited, so the output area is re-entered explicitly to make the
            # sentence appear next to the predictions.
            with output_box:
                full = seed + ' ' + selection['new']
                print(f"\n✅ Full sentence: {full}")

        # NOTE(review): the callback fires only when the value changes, so the
        # initially selected entry produces no output until another entry is
        # picked first - confirm whether that is intended.
        dropdown.observe(on_select_change, names='value')
        display(dropdown)

# Re-run the prediction callback on every change of the input box's value.
input_box.observe(on_text_change, names='value')

# Show the input field with the output area beneath it.
display(input_box, output_box)