<a href="https://colab.research.google.com/github/Ankit-Rai2612/Next-Word-Prediction-using-MLP/blob/main/Next_Word_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
import torch.nn.functional as F
from torch import nn
import pandas as pd
import re
from sklearn.model_selection import train_test_split
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import numpy as np
import itertools
import os

# Select the compute device: the default CUDA GPU when one is visible, else the CPU.
# A hard-coded "cuda:1" refers to a second GPU, which does not exist on single-GPU
# runtimes, so the default CUDA device is used instead.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f'Using device: {device}')
print(f'PyTorch version: {torch.__version__}')
if torch.cuda.is_available():
    # Report the name of the GPU that `device` refers to (not a fixed index).
    print(f'CUDA device name: {torch.cuda.get_device_name(device)}')

Using device: cpu
PyTorch version: 2.8.0+cu126


In [None]:
# Mount Google Drive at /content/drive so files stored under it can be read
# through ordinary filesystem paths.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
import os

# Root folder of the project on the mounted Google Drive.
base_path = "/content/drive/MyDrive/Next-Word-Generator"

# Locations derived from the root: the assets folder and the model checkpoint file.
assets_path, model_path = (
    os.path.join(base_path, "assets"),
    os.path.join(base_path, "models/mlp/model_context_5_emb_32_act_leaky_relu_seed_42.pth"),
)


In [None]:
import json

# Forward vocabulary mapping, loaded from the assets folder.
with open(os.path.join(assets_path, "word_to_index.json"), "r") as f:
    word_to_index = json.load(f)

# Reverse mapping. JSON object keys are always strings, so they are converted
# back to int to allow lookups by integer index.
with open(os.path.join(assets_path, "index_to_word.json"), "r") as f:
    index_to_word = {int(key): word for key, word in json.load(f).items()}


In [None]:
# NOTE(review): this repeats the Drive mount performed in an earlier cell; the
# recorded output shows Colab only reporting that the drive is already mounted.
# Candidate for removal.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class NextWordMLP(nn.Module):
    """Single-hidden-layer MLP that scores the next word from a fixed-size context.

    The context word ids are embedded, the embeddings are concatenated into one
    vector per sample, and that vector goes through
    Linear -> BatchNorm1d -> activation -> Dropout -> Linear.

    Args:
        vocab_size: number of embedding rows and of output classes.
        embedding_dim: size of each word embedding.
        hidden_dim: width of the hidden layer.
        dropout_rate: dropout probability applied after the activation.
        context_size: number of context words; fc1 expects
            embedding_dim * context_size input features.
        activation_function: callable applied to the batch-normalised hidden
            layer (an nn.Module or a plain function).
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, dropout_rate, context_size, activation_function):
        super().__init__()
        flattened_dim = embedding_dim * context_size
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.fc1 = nn.Linear(flattened_dim, hidden_dim)
        self.bn1 = nn.BatchNorm1d(hidden_dim)
        self.dropout1 = nn.Dropout(dropout_rate)
        self.fc2 = nn.Linear(hidden_dim, vocab_size)
        self.activation_function = activation_function

    def forward(self, x):
        """Return log-probabilities over the vocabulary for each sample.

        `x` holds word ids with the batch on dimension 0; presumably shaped
        (batch, context_size), since fc1 takes embedding_dim * context_size features.
        """
        batch_size = x.size(0)
        # Concatenate the context embeddings into one feature vector per sample.
        flat = self.embedding(x).view(batch_size, -1)
        hidden = self.fc1(flat)
        hidden = self.bn1(hidden)
        hidden = self.activation_function(hidden)
        hidden = self.dropout1(hidden)
        scores = self.fc2(hidden)
        return F.log_softmax(scores, dim=1)


# Build an untrained instance with illustrative hyper-parameters and print its
# layer layout. `base` is not referenced again in the visible cells.
vocab_size = len(word_to_index)  # one embedding row / output class per vocabulary entry

embedding_dim, hidden_dim, dropout_rate, context_size = 100, 256, 0.5, 5
activation_function = nn.ReLU()

base = NextWordMLP(
    vocab_size,
    embedding_dim,
    hidden_dim,
    dropout_rate,
    context_size,
    activation_function,
)
print(base)

NextWordMLP(
  (embedding): Embedding(14343, 100)
  (fc1): Linear(in_features=500, out_features=256, bias=True)
  (bn1): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (dropout1): Dropout(p=0.5, inplace=False)
  (fc2): Linear(in_features=256, out_features=14343, bias=True)
  (activation_function): ReLU()
)


In [None]:
# Run inference on the default CUDA GPU when available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Activation functions addressable by name.
activation_function_map = {
    "tanh": torch.tanh,
    "relu": F.relu,
    "leaky_relu": F.leaky_relu
}
activation_function = activation_function_map["leaky_relu"]

# Hyper-parameters of the checkpoint being loaded. The layer sizes must agree
# with the saved weights, otherwise load_state_dict raises on a shape mismatch.
context_size = 5
embedding_dim = 32
seed = 42

# Seed PyTorch's global RNG so the multinomial sampling done by the generation
# cells gives the same text on every Restart & Run All.
torch.manual_seed(seed)

model = NextWordMLP(
    vocab_size=len(word_to_index),
    embedding_dim=embedding_dim,
    hidden_dim=1024,
    dropout_rate=0.3,
    context_size=context_size,
    activation_function=activation_function
).to(device)

# weights_only=True limits unpickling to tensors and plain containers, which is
# all a state_dict needs, and avoids executing arbitrary pickled code.
model.load_state_dict(torch.load(model_path, map_location=device, weights_only=True))
model.eval()  # inference mode: dropout off, BatchNorm uses its running statistics
print("✅ Model loaded successfully from Google Drive")


✅ Model loaded successfully from Google Drive


In [None]:
def words_to_indices(words, word_to_index):
    """Map each token to its vocabulary index.

    Tokens missing from `word_to_index` are replaced by the index stored under
    the 'pad' key; that key is only looked up when an unknown token occurs.
    """
    indices = []
    for token in words:
        if token in word_to_index:
            indices.append(word_to_index[token])
        else:
            indices.append(word_to_index['pad'])
    return indices

def generate_text(model, start_sequence, num_words, temperature=1.0):
    """Extend a prompt word by word and return the resulting text.

    Relies on the notebook-level names `context_size`, `device`, `index_to_word`
    and (only for short prompts) `word_to_index`.

    Args:
        model: callable returning per-class log-probabilities for a
            (1, context_size) tensor of word ids.
        start_sequence: iterable of word ids forming the prompt. Prompts shorter
            than `context_size` are left-padded with the 'pad' id.
        num_words: maximum number of words to generate.
        temperature: divisor applied to the log-probabilities before sampling.
            0 selects the most likely word at every step (greedy decoding)
            instead of dividing by zero.

    Returns:
        The prompt plus generated words joined by spaces, with 'pad' tokens
        removed. Generation stops early once the 'end' token is produced.
    """
    model.eval()
    generated = list(start_sequence)

    # The model needs exactly context_size ids; left-pad short (or empty) prompts
    # so callers do not have to do it themselves. Padding is dropped from the
    # returned text by the filter in the final join.
    missing = context_size - len(generated)
    if missing > 0:
        generated = [word_to_index['pad']] * missing + generated

    for _ in range(num_words):
        input_seq = torch.tensor(generated[-context_size:], dtype=torch.long).unsqueeze(0).to(device)
        with torch.no_grad():
            output = model(input_seq)
        log_probs = output.squeeze(0)
        if temperature == 0:
            # Limit of temperature scaling: always take the most likely word.
            next_word_idx = torch.argmax(log_probs, dim=-1).item()
        else:
            # softmax(log p / T) re-normalises the temperature-scaled distribution.
            scaled = F.softmax(log_probs / temperature, dim=-1)
            next_word_idx = torch.multinomial(scaled, num_samples=1).item()
        generated.append(next_word_idx)
        if index_to_word[next_word_idx] == 'end':
            break
    return ' '.join(index_to_word[idx] for idx in generated if index_to_word[idx] != 'pad')


In [None]:
def clean_and_tokenize(text):
    """Split raw text into lowercase tokens, with punctuation as separate tokens.

    Returns an empty list for None or whitespace-only input. The characters
    . , ! ? each become their own token.

    NOTE(review): the lowercase substrings 'start' and 'end' are isolated
    wherever they occur, including inside longer words ("blend" -> "bl", "end").
    The replacement runs before lowercasing, so "Start" is left intact. Kept
    as-is because the calling cells note the tokenizer must stay the same.
    """
    if text is None or not text.strip():
        return []

    # Surround the marker substrings with spaces so they split into their own tokens.
    for marker, spaced_marker in (('start', ' start '), ('end', ' end ')):
        text = text.replace(marker, spaced_marker)

    spaced = re.sub(r'([.,!?])', r' \1 ', text)  # detach punctuation from neighbouring words
    collapsed = re.sub(r'\s+', ' ', spaced).strip()  # squeeze whitespace runs
    return collapsed.lower().split()

# Testing the model

In [None]:
# Prompt -> tokens -> vocabulary ids. The tokenizer has to be the same one the
# model's vocabulary was built with (per the original note).
start_sequence_words = clean_and_tokenize("Mix milk and cream")
start_sequence_indices = words_to_indices(start_sequence_words, word_to_index)

# Left-pad prompts shorter than the context window with the 'pad' id.
if context_size > len(start_sequence_indices):
    start_sequence_indices = (
        [word_to_index['pad']] * (context_size - len(start_sequence_indices))
        + start_sequence_indices
    )

# Sample up to 100 further words at temperature 1.0.
generated_text = generate_text(
    model,
    start_sequence_indices,
    num_words=100,
    temperature=1.0,
)
print("Generated Recipe:", generated_text)


Generated Recipe: mix milk and cream , the spices also . attach the cooked and on the gravy cooked . after the onion becomes soft , add in ginger , green chillies , curry leaves and saute for about 3 minutes on medium flame , so the roasted potatoes , sprinkle some water and knead for a minute till they turn golden brown and drain the excess water and keep it aside . now make a baking sheet on both sides and allow the end


### Testing with new random inputs

In [None]:
# New prompt -> tokens -> vocabulary ids. The tokenizer has to be the same one
# the model's vocabulary was built with (per the original note).
start_sequence_words = clean_and_tokenize("My name is")
start_sequence_indices = words_to_indices(start_sequence_words, word_to_index)

# Left-pad prompts shorter than the context window with the 'pad' id.
if context_size > len(start_sequence_indices):
    start_sequence_indices = (
        [word_to_index['pad']] * (context_size - len(start_sequence_indices))
        + start_sequence_indices
    )

# Sample up to 10 further words (the first example asked for 100).
generated_text = generate_text(
    model,
    start_sequence_indices,
    num_words=10,
    temperature=1.0,
)
print("Generated Recipe:", generated_text)


Generated Recipe: my name is to drain the water . refresh the vegetables into the


In [None]:
# New prompt -> tokens -> vocabulary ids. The tokenizer has to be the same one
# the model's vocabulary was built with (per the original note).
start_sequence_words = clean_and_tokenize("Good")
start_sequence_indices = words_to_indices(start_sequence_words, word_to_index)

# Left-pad prompts shorter than the context window with the 'pad' id.
if context_size > len(start_sequence_indices):
    start_sequence_indices = (
        [word_to_index['pad']] * (context_size - len(start_sequence_indices))
        + start_sequence_indices
    )

# Sample up to 20 further words at temperature 1.0.
generated_text = generate_text(
    model,
    start_sequence_indices,
    num_words=20,
    temperature=1.0,
)
print("Generated Recipe:", generated_text)


Generated Recipe: good , turn off the heat and serve . serve puli engi with some curry and serve the biryani with a


In [None]:
# New prompt -> tokens -> vocabulary ids. The tokenizer has to be the same one
# the model's vocabulary was built with (per the original note).
start_sequence_words = clean_and_tokenize("This takes about")
start_sequence_indices = words_to_indices(start_sequence_words, word_to_index)

# Left-pad prompts shorter than the context window with the 'pad' id.
if context_size > len(start_sequence_indices):
    start_sequence_indices = (
        [word_to_index['pad']] * (context_size - len(start_sequence_indices))
        + start_sequence_indices
    )

# Sample up to 20 further words at temperature 1.0.
generated_text = generate_text(
    model,
    start_sequence_indices,
    num_words=20,
    temperature=1.0,
)
print("Generated Recipe:", generated_text)


Generated Recipe: this takes about 3 to 4 minutes . now add 3 cups of water and place it in a bowl . set aside


In [None]:
# New prompt -> tokens -> vocabulary ids. The tokenizer has to be the same one
# the model's vocabulary was built with (per the original note).
start_sequence_words = clean_and_tokenize("To begin making the Ragi")
start_sequence_indices = words_to_indices(start_sequence_words, word_to_index)

# Left-pad prompts shorter than the context window with the 'pad' id.
if context_size > len(start_sequence_indices):
    start_sequence_indices = (
        [word_to_index['pad']] * (context_size - len(start_sequence_indices))
        + start_sequence_indices
    )

# Sample up to 20 further words at temperature 1.0.
generated_text = generate_text(
    model,
    start_sequence_indices,
    num_words=20,
    temperature=1.0,
)
print("Generated Recipe:", generated_text)


Generated Recipe: to begin making the ragi malpua rabri recipe , mint & cucumber raita . end


In [None]:
# New prompt -> tokens -> vocabulary ids. The tokenizer has to be the same one
# the model's vocabulary was built with (per the original note).
start_sequence_words = clean_and_tokenize("To make tomato puliogere, first cut")
start_sequence_indices = words_to_indices(start_sequence_words, word_to_index)

# Left-pad prompts shorter than the context window with the 'pad' id.
if context_size > len(start_sequence_indices):
    start_sequence_indices = (
        [word_to_index['pad']] * (context_size - len(start_sequence_indices))
        + start_sequence_indices
    )

# Sample up to 20 further words at temperature 1.0.
generated_text = generate_text(
    model,
    start_sequence_indices,
    num_words=20,
    temperature=1.0,
)
print("Generated Recipe:", generated_text)


Generated Recipe: to make tomato puliogere , first cut the tomatoes into thin roundels . in a large mixing bowl or a bowl , whisk them up . add


#### Good output

In [None]:
# New prompt -> tokens -> vocabulary ids. The tokenizer has to be the same one
# the model's vocabulary was built with (per the original note).
start_sequence_words = clean_and_tokenize("To make Andhra Style Alam Pachadi")
start_sequence_indices = words_to_indices(start_sequence_words, word_to_index)

# Left-pad prompts shorter than the context window with the 'pad' id.
if context_size > len(start_sequence_indices):
    start_sequence_indices = (
        [word_to_index['pad']] * (context_size - len(start_sequence_indices))
        + start_sequence_indices
    )

# Sample up to 100 further words at temperature 1.0.
generated_text = generate_text(
    model,
    start_sequence_indices,
    num_words=100,
    temperature=1.0,
)
print("Generated Recipe:", generated_text)


Generated Recipe: to make andhra style alam pachadi , first heat oil in each of the cavities . place this deep after your dessert for at least indian week on the grill . once its and serve the rice . soak dates in water for 15-20 minutes . drain the water from the rice , add the grated coconut , mustard oil , sugar and add the rice powder . it will thicken , once the milk comes to a boil , add cumin seeds , curry leaves and broken mor milagai . add oil in a heavy bottomed pan . add the chopped vegetables to the mashed


In [None]:
# New prompt -> tokens -> vocabulary ids. The tokenizer has to be the same one
# the model's vocabulary was built with (per the original note).
start_sequence_words = clean_and_tokenize("Boil the water first")
start_sequence_indices = words_to_indices(start_sequence_words, word_to_index)

# Left-pad prompts shorter than the context window with the 'pad' id.
if context_size > len(start_sequence_indices):
    start_sequence_indices = (
        [word_to_index['pad']] * (context_size - len(start_sequence_indices))
        + start_sequence_indices
    )

# Sample up to 100 further words at temperature 1.0.
generated_text = generate_text(
    model,
    start_sequence_indices,
    num_words=100,
    temperature=1.0,
)
print("Generated Recipe:", generated_text)


Generated Recipe: boil the water first to form a smooth paste , adding water little at a time and whisk until the butter is bl end


### Testing with user inputs

In [None]:
# Read the prompt and the number of words to generate from the user. The prompt
# is tokenized with the same tokenizer the model's vocabulary was built with
# (per the original note).
print("Enter the input string")
start_sequence_words = clean_and_tokenize(input())
print("Enter the number of words to predict")
num_words = int(input())
start_sequence_indices = words_to_indices(start_sequence_words, word_to_index)

# Left-pad prompts shorter than the context window with the 'pad' id.
if context_size > len(start_sequence_indices):
    start_sequence_indices = (
        [word_to_index['pad']] * (context_size - len(start_sequence_indices))
        + start_sequence_indices
    )

# Sample up to the requested number of words at temperature 1.0.
generated_text = generate_text(
    model,
    start_sequence_indices,
    num_words=num_words,
    temperature=1.0,
)
print("Generated Recipe:", generated_text)


Enter the input string
This is 
Enter the number of words to predict
5
Generated Recipe: this is over to cook , add
