<a href="https://colab.research.google.com/github/Domaakshithareddy/next-word-prediction/blob/main/Next_Word_Prediction_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Importing Required Libraries
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, Attention
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical
import numpy as np
import pickle
import os

In [None]:
# 1. Preprocessing Improvements
def preprocess_text(file_path):
    """Read a UTF-8 text file and flatten it into a single line of text.

    Args:
        file_path: Path of the text file to read.

    Returns:
        The file contents as one string, with every newline turned into a
        space and any carriage-return or byte-order-mark characters removed.
        Punctuation and repeated passages are left untouched.
    """
    with open(file_path, "r", encoding="utf8") as handle:
        cleaned = handle.read()

    # (character to strip, replacement) pairs, applied in this order.
    substitutions = (
        ('\n', ' '),
        ('\r', ''),
        ('\ufeff', ''),
    )
    for target, replacement in substitutions:
        cleaned = cleaned.replace(target, replacement)

    return cleaned

# Load and preprocess the data
file_path = "metamorphosis_clean.txt"
data = preprocess_text(file_path)
print("Sample of preprocessed text:", data[:500])

# Tokenization with OOV handling
tokenizer = Tokenizer(oov_token="<OOV>")  # Words not seen during fitting map to "<OOV>"
tokenizer.fit_on_texts([data])
# Save the fitted tokenizer; the context manager makes sure the file is
# flushed and closed even if pickling fails.
with open('tokenizer_improved.pkl', 'wb') as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file)

# Convert text to sequences
sequence_data = tokenizer.texts_to_sequences([data])[0]
# word_index is 1-based and already contains the "<OOV>" entry; the +1 covers
# index 0, which the Keras Tokenizer reserves and never assigns to a word.
vocab_size = len(tokenizer.word_index) + 1
print("Vocabulary size:", vocab_size)

# Create N-gram sequences (e.g., use 3 words to predict the next one)
sequence_length = 3  # Number of context words fed to the model
if len(sequence_data) <= sequence_length:
    # Without this guard the slicing below fails with an obscure IndexError
    # on an empty 1-D array.
    raise ValueError(
        f"Need more than {sequence_length} tokens to build training sequences, "
        f"got {len(sequence_data)}"
    )

# Each row holds sequence_length context tokens followed by the target token.
sequences = np.array([
    sequence_data[i - sequence_length:i + 1]
    for i in range(sequence_length, len(sequence_data))
])
print("Number of sequences:", len(sequences))
print("Sample sequence:", sequences[0])

# Split into input (X) and output (y)
X = sequences[:, :-1]  # All but the last word
y = sequences[:, -1]   # The last word
y = to_categorical(y, num_classes=vocab_size)  # One-hot encode the output

Sample of preprocessed text: One morning, when Gregor Samsa woke from troubled dreams, he found himself transformed in his bed into a horrible vermin.  He lay on his armour-like back, and if he lifted his head a little he could see his brown belly, slightly domed and divided by arches into stiff sections.  The bedding was hardly able to cover it and seemed ready to slide off any moment.  His many legs, pitifully thin compared with the size of the rest of him, waved about helplessly as he looked.  "What's happened to me?" he
Vocabulary size: 2618
Number of sequences: 22044
Sample sequence: [ 53 140  56  15]


In [None]:
# 2. Model Architecture Enhancements
model = Sequential([
    # Explicit input layer: fixes the input shape (so the model is built
    # immediately) and replaces Embedding's deprecated `input_length` argument.
    tf.keras.Input(shape=(sequence_length,)),
    # Embedding layer (could be initialized with pre-trained embeddings like GloVe)
    Embedding(vocab_size, 50),
    Bidirectional(LSTM(500, return_sequences=True)),  # Bidirectional LSTM for better context
    Bidirectional(LSTM(500)),  # Second Bidirectional LSTM
    Dense(500, activation="relu"),
    Dense(vocab_size, activation="softmax")  # Output layer: one probability per vocabulary index
])

# Summarize the model (already built thanks to the Input layer)
model.summary()

# 3. Training Optimization
# Compile the model
model.compile(loss="categorical_crossentropy", optimizer=Adam(learning_rate=0.001), metrics=["accuracy"])

# Callbacks
# Both callbacks watch the validation loss. Watching the training loss would
# keep training (and restore weights) while the model overfits, because the
# training loss keeps falling after the validation loss has started to rise.
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=5, min_lr=0.0001, verbose=1)
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True, verbose=1)

# Train the model with validation split
history = model.fit(X, y,
                    epochs=150,
                    batch_size=128,
                    validation_split=0.2,  # 20% validation data, monitored by the callbacks above
                    callbacks=[reduce_lr, early_stopping])

# Save the model
model.save("nextword_improved.keras")  # Using native Keras format
print("Model saved!")



Epoch 1/150
[1m138/138[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m100s[0m 667ms/step - accuracy: 0.0428 - loss: 6.7580 - val_accuracy: 0.0497 - val_loss: 6.2379 - learning_rate: 0.0010
Epoch 2/150
[1m138/138[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m138s[0m 644ms/step - accuracy: 0.0555 - loss: 5.8801 - val_accuracy: 0.0535 - val_loss: 6.1409 - learning_rate: 0.0010
Epoch 3/150
[1m138/138[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m142s[0m 643ms/step - accuracy: 0.0614 - loss: 5.5987 - val_accuracy: 0.0739 - val_loss: 6.1115 - learning_rate: 0.0010
Epoch 4/150
[1m138/138[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m142s[0m 641ms/step - accuracy: 0.0798 - loss: 5.3692 - val_accuracy: 0.0794 - val_loss: 6.1879 - learning_rate: 0.0010
Epoch 5/150
[1m138/138[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 601ms/step - accuracy: 0.0967 - loss: 5.1537 - val_accuracy: 0.0832 - val_loss: 6.1621 - learning_rate: 0.0010
Epoch 6/150
[1m138/138[0m [32m━━━━━━━━━━━━━

In [None]:
# 4. Improved Prediction Function
def predict_next_words(model, tokenizer, text, top_k=3, temperature=1.0, num_words=1,
                       sequence_length=None):
    """
    Predict the next word(s) with top-K sampling and temperature scaling.

    Each predicted word is printed and appended to the running text so that it
    becomes part of the context for the following prediction.

    Args:
        model: Trained model; ``model.predict`` must return one probability
            per vocabulary index for a ``(1, sequence_length)`` input.
        tokenizer: Fitted tokenizer exposing ``word_index`` and
            ``texts_to_sequences``.
        text: Input text; only its last ``sequence_length`` tokens are used.
        top_k: Number of top predictions to sample from (must be >= 1).
        temperature: Controls randomness (lower = more deterministic, must be > 0).
        num_words: Number of words to predict.
        sequence_length: Number of context tokens the model expects. Defaults
            to the second dimension of ``model.input_shape``.

    Returns:
        List of the predicted words, in order. It is shorter than
        ``num_words`` if prediction stopped early on an unknown word.

    Raises:
        ValueError: If ``top_k`` is smaller than 1 or ``temperature`` is not
            positive.
    """
    if top_k < 1:
        raise ValueError("top_k must be at least 1")
    if temperature <= 0:
        raise ValueError("temperature must be positive")

    if sequence_length is None:
        sequence_length = model.input_shape[1]

    oov_index = tokenizer.word_index.get("<OOV>", 1)
    # Build the reverse lookup once instead of scanning word_index per word.
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}

    predicted_words = []
    for _ in range(num_words):
        # Pass the raw string so the tokenizer applies the same lower-casing and
        # punctuation filtering as during fitting; a pre-split list would skip
        # the filtering and turn tokens such as "me?" into <OOV>.
        token_ids = tokenizer.texts_to_sequences([text])[0]
        token_ids = token_ids[-sequence_length:]  # Keep the last sequence_length tokens
        if len(token_ids) < sequence_length:
            # Left-pad short inputs with the OOV index
            token_ids = [oov_index] * (sequence_length - len(token_ids)) + token_ids

        sequence = np.array(token_ids).reshape(1, -1)

        # Predict probabilities
        preds = np.asarray(model.predict(sequence, verbose=0)[0], dtype=np.float64)

        # Apply temperature scaling; the small constant avoids log(0) and
        # subtracting the maximum keeps exp() from underflowing to all zeros.
        scaled = np.log(preds + 1e-10) / temperature
        scaled -= np.max(scaled)
        exp_preds = np.exp(scaled)
        preds = exp_preds / np.sum(exp_preds)

        # Top-K sampling
        top_k_indices = np.argsort(preds)[-top_k:][::-1]
        top_k_probs = preds[top_k_indices]
        top_k_probs = top_k_probs / np.sum(top_k_probs)  # Normalize probabilities

        predicted_index = int(np.random.choice(top_k_indices, p=top_k_probs))

        # Convert index to word (an index with no word_index entry yields None)
        predicted_word = index_to_word.get(predicted_index)

        if predicted_word and predicted_word != "<OOV>":
            print(f"Predicted word: {predicted_word}")
            predicted_words.append(predicted_word)
            text += " " + predicted_word  # Append predicted word for next iteration
        else:
            print("Predicted an unknown word (<OOV>)")
            break

    return predicted_words

In [None]:
# Optional: Calculate Perplexity
def calculate_perplexity(model, X, y):
    """Return the perplexity of ``model`` on ``(X, y)``, i.e. ``exp(loss)``.

    Args:
        model: Compiled model exposing ``evaluate(X, y, verbose=0)``.
        X: Input samples passed to ``model.evaluate``.
        y: Targets passed to ``model.evaluate``.

    Returns:
        The exponential of the evaluation loss, as a Python float.
    """
    result = model.evaluate(X, y, verbose=0)
    # evaluate() returns [loss, metric, ...] when metrics are compiled in and a
    # bare scalar loss otherwise; ravel() lets both forms be indexed the same way.
    loss = np.ravel(result)[0]
    return float(np.exp(loss))

# Note: X and y are the arrays that were passed to model.fit, so this value
# is measured on data the model has seen, not on a separate test set.
perplexity = calculate_perplexity(model=model, X=X, y=y)
print(f"Model Perplexity: {perplexity:.2f}")

Model Perplexity: 69.81


In [None]:
# 5. Interactive Testing
# Prompt for a line, print one predicted word for it, and repeat until the
# user types 'stop' (case-insensitive). Errors are reported, not raised, so
# one bad input does not end the session.
print("\nTesting the model:")
text = input("Enter your line (or 'stop' to exit): ").strip()
while text.lower() != "stop":
    try:
        predict_next_words(model, tokenizer, text, top_k=3, temperature=0.8, num_words=1)
    except Exception as e:
        print(f"Error: {e}")
    text = input("Enter your line (or 'stop' to exit): ").strip()
print("Exiting...")


Testing the model:
Enter your line (or 'stop' to exit): at the dull
Predicted word: weather
Enter your line (or 'stop' to exit): i am
Predicted word: dying
Enter your line (or 'stop' to exit): i am cooking
Predicted word: enough
Enter your line (or 'stop' to exit): what are you
Predicted word: shocked
Enter your line (or 'stop' to exit): why are we
Predicted word: need
Enter your line (or 'stop' to exit): the sun
Predicted word: across
Enter your line (or 'stop' to exit): i love
Predicted word: hurriedly
Enter your line (or 'stop' to exit): my name is
Predicted word: here
Enter your line (or 'stop' to exit): what can i 
Predicted word: bring
Enter your line (or 'stop' to exit): good to see
Predicted word: it
Enter your line (or 'stop' to exit): i am happy for
Predicted word: that
Enter your line (or 'stop' to exit): how do you 
Predicted word: travellers
Enter your line (or 'stop' to exit): how
Predicted word: he
Enter your line (or 'stop' to exit): hi 
Predicted word: the
Enter your 

In [None]:
# Interactive session that predicts three consecutive words per input line.
# Typing 'stop' (case-insensitive) ends the loop.
print("\nTesting the model:")
while True:
    text = input("Enter your line (or 'stop' to exit): ").strip()
    if text.lower() != "stop":
        try:
            predict_next_words(model, tokenizer, text, top_k=3, temperature=0.8, num_words=3)
        except Exception as e:
            # Report the problem and keep the session alive.
            print(f"Error: {e}")
        continue

    print("Exiting...")
    break



Testing the model:
Enter your line (or 'stop' to exit): at the dull
Predicted word: weather
Predicted word: drops
Predicted word: of
Enter your line (or 'stop' to exit): as i 
Predicted word: did
Predicted word: not
Predicted word: see
Enter your line (or 'stop' to exit): what are we
Predicted word: don't
Predicted word: know
Predicted word: what
Enter your line (or 'stop' to exit): i am
Predicted word: dying
Predicted word: throughout
Predicted word: all
Enter your line (or 'stop' to exit): how are you
Predicted word: need
Predicted word: for
Predicted word: commerce
Enter your line (or 'stop' to exit): can you
Predicted word: need
Predicted word: teeth
Predicted word: in
Enter your line (or 'stop' to exit): what can i
Predicted word: bring
Predicted word: in
Predicted word: order
Enter your line (or 'stop' to exit): how come
Predicted word: in
Predicted word: this
Predicted word: mood
Enter your line (or 'stop' to exit): who are
Predicted word: especially
Predicted word: looking
Pre