Experiment 11: Next Word Prediction Using an RNN
- Aim: Train a simple RNN to predict the next word in simple English sentences.


In [3]:
import numpy as np
import tensorflow as tf
from tensorflow.keras import Sequential, layers, preprocessing

# Seed Python, NumPy and TensorFlow RNGs so that weight initialisation and
# batch shuffling -- and therefore the printed prediction -- are reproducible
# under Restart & Run All.
SEED = 42
tf.keras.utils.set_random_seed(SEED)

# Toy corpus: three five-word sentences.
sentences = [
    "I love to eat apples",
    "She loves to eat oranges",
    "He likes to eat bananas"
]

# Tokenization: map every word to an integer index.
tokenizer = preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(sentences)
sequences = tokenizer.texts_to_sequences(sentences)

# +1 because the one-hot targets and the embedding table are indexed by the
# raw word index, and index 0 is used by pad_sequences for padding.
vocab_size = len(tokenizer.word_index) + 1

# Prepare sequences: slide a window of `seq_length` words over each sentence;
# the word right after the window is the training target.
seq_length = 3
input_sequences = []
output_words = []
for seq in sequences:
    for i in range(seq_length, len(seq)):
        input_sequences.append(seq[i - seq_length:i])
        output_words.append(seq[i])

# Prepare data: integer matrix of inputs and one-hot encoded targets.
X = preprocessing.sequence.pad_sequences(input_sequences, maxlen=seq_length)
y = tf.keras.utils.to_categorical(output_words, num_classes=vocab_size)

# Build and train model.
# `input_length` is intentionally not passed to Embedding: it is deprecated in
# Keras 3 and the layer infers the sequence length from the data.
model = Sequential([
    layers.Embedding(vocab_size, 8),
    layers.SimpleRNN(16, activation='relu'),
    layers.Dense(vocab_size, activation='softmax')
])

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
# NOTE(review): 50 epochs on a handful of samples may under-train the model;
# raise the epoch count if the prediction looks arbitrary.
model.fit(X, y, epochs=50, verbose=0)

model.summary()

# Predict
seed_text = "loves to eat"
seed_seq = tokenizer.texts_to_sequences([seed_text])[0]
seed_batch = preprocessing.sequence.pad_sequences([seed_seq], maxlen=seq_length)
probabilities = model.predict(seed_batch, verbose=0)
pred_index = int(np.argmax(probabilities, axis=-1)[0])

# Use .get() rather than [] so that a prediction of the padding index 0
# (which has no entry in index_word) yields None instead of a KeyError.
predicted_word = tokenizer.index_word.get(pred_index)

print(f"Next word after '{seed_text}': {predicted_word}")



Next word after 'loves to eat': apples


In [1]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Step 0: Seed Python, NumPy and TensorFlow RNGs so that training (and the
# printed prediction) is reproducible under Restart & Run All.
SEED = 42
tf.keras.utils.set_random_seed(SEED)

# Step 1: Sample training data
text = [
    'i love to eat apples',
    'she loves to eat oranges',
    'he like to eat guava'
]

# Step 2: Tokenize words
tokenizer = Tokenizer()
tokenizer.fit_on_texts(text)

# Get total vocabulary size (+1 for padding)
vocab_size = len(tokenizer.word_index) + 1

# Convert text to sequence of word indexes
sequences = tokenizer.texts_to_sequences(text)

# Step 3: Prepare input-output pairs
# Number of context words fed to the model; named once here instead of being
# repeated as a literal in the windowing, padding and prediction steps.
seq_length = 3

x = []  # Input sequence (seq_length words)
y = []  # Output word (next word)

for seq in sequences:
    for i in range(seq_length, len(seq)):
        x.append(seq[i - seq_length:i])  # Previous seq_length words
        y.append(seq[i])                 # Next word

# Step 4: Padding & one-hot encoding
x = pad_sequences(x, maxlen=seq_length)
y = tf.keras.utils.to_categorical(y, num_classes=vocab_size)

# Step 5: Build the RNN model
# `input_length` is intentionally not passed to Embedding: it is deprecated in
# Keras 3 and the layer infers the sequence length from the data.
model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=8),
    SimpleRNN(16, activation='relu'),
    Dense(vocab_size, activation='softmax')
])

# Step 6: Compile & train the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
# NOTE(review): 50 epochs on a handful of samples may under-train the model;
# raise the epoch count if the prediction looks arbitrary.
model.fit(x, y, epochs=50)

# Step 7: Predict next word
seed_text = 'loves to eat'
seed_tokens = tokenizer.texts_to_sequences([seed_text])[0]
# Shape the seed exactly like the training inputs (one row of seq_length ids).
seed_seq = pad_sequences([seed_tokens], maxlen=seq_length)

# Get predicted word index
pred = model.predict(seed_seq)
pred_index = int(np.argmax(pred, axis=-1)[0])

# Convert index back to word (None if the model picks the padding index 0,
# which has no entry in index_word)
predicted_word = tokenizer.index_word.get(pred_index)

# Step 8: Output result
print(f"Next word after '{seed_text}': {predicted_word}")

Epoch 1/50




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2s/step - accuracy: 0.0000e+00 - loss: 2.4684
Epoch 2/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step - accuracy: 0.0000e+00 - loss: 2.4624
Epoch 3/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step - accuracy: 0.1667 - loss: 2.4563
Epoch 4/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step - accuracy: 0.1667 - loss: 2.4504
Epoch 5/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step - accuracy: 0.1667 - loss: 2.4448
Epoch 6/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step - accuracy: 0.1667 - loss: 2.4392
Epoch 7/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step - accuracy: 0.1667 - loss: 2.4336
Epoch 8/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step - accuracy: 0.1667 - loss: 2.4279
Epoch 9/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s

In [2]:
import numpy as np
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Seed Python, NumPy and TensorFlow RNGs so that training (and the printed
# prediction) is reproducible under Restart & Run All.
SEED = 42
tf.keras.utils.set_random_seed(SEED)

# Training data
text = [
    'i love to eat apples',
    'she loves to eat oranges',
    'he like to eat guava'
]

# Tokenization and sequence generation
tokenizer = Tokenizer()
tokenizer.fit_on_texts(text)
# +1 because index 0 is used for padding and never assigned to a word.
vocab_size = len(tokenizer.word_index) + 1
sequences = tokenizer.texts_to_sequences(text)

# Number of context words fed to the model; named once here instead of being
# repeated as a literal in the windowing, padding and prediction steps.
seq_length = 3

# Create input-output pairs: every window of seq_length consecutive words is
# an input, and the word that follows it is the target.
windows = [
    (seq[i - seq_length:i], seq[i])
    for seq in sequences
    for i in range(seq_length, len(seq))
]
x = [context for context, _ in windows]
y = [target for _, target in windows]

# Padding and one-hot encoding
x = pad_sequences(x, maxlen=seq_length)
y = tf.keras.utils.to_categorical(y, num_classes=vocab_size)

# Model building and training
# `input_length` is intentionally not passed to Embedding: it is deprecated in
# Keras 3 and the layer infers the sequence length from the data.
model = Sequential([
    Embedding(vocab_size, 8),
    SimpleRNN(16, activation='relu'),
    Dense(vocab_size, activation='softmax')
])
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
# NOTE(review): 50 epochs on a handful of samples may under-train the model;
# raise the epoch count if the prediction looks arbitrary.
model.fit(x, y, epochs=50)

# Prediction
seed_text = 'loves to eat'
seed_tokens = tokenizer.texts_to_sequences([seed_text])[0]
seed_seq = pad_sequences([seed_tokens], maxlen=seq_length)
pred_index = int(np.argmax(model.predict(seed_seq), axis=-1)[0])

# Use .get() rather than [] so that a prediction of the padding index 0
# (which has no entry in index_word) yields None instead of a KeyError.
predicted_word = tokenizer.index_word.get(pred_index)

print(f"Next word after '{seed_text}': {predicted_word}")

Epoch 1/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step - accuracy: 0.0000e+00 - loss: 2.4980
Epoch 2/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 43ms/step - accuracy: 0.0000e+00 - loss: 2.4901
Epoch 3/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step - accuracy: 0.0000e+00 - loss: 2.4832
Epoch 4/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step - accuracy: 0.1667 - loss: 2.4773
Epoch 5/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step - accuracy: 0.3333 - loss: 2.4717
Epoch 6/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 43ms/step - accuracy: 0.3333 - loss: 2.4657
Epoch 7/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step - accuracy: 0.3333 - loss: 2.4600
Epoch 8/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step - accuracy: 0.3333 - loss: 2.4546
Epoch 9/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[