## 1. Data Collection

In [None]:
import nltk
nltk.download('gutenberg')
from nltk.corpus import gutenberg
import joblib

# Load "Paradise Lost" by John Milton as a single raw string from the NLTK
# Gutenberg corpus.
data = gutenberg.raw('milton-paradise.txt')

# Persist the text with joblib so later cells can reuse it. The file is named
# 'paradise.pkl' because that is the path the preprocessing cell passes to
# joblib.load; writing to any other name makes that load fail on a clean run.
joblib.dump(data, 'paradise.pkl')

## 2. Data Preprocessing

In [None]:

import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

# Read back the corpus text that was stored with joblib.
txt_file = joblib.load('paradise.pkl')

# Fit a Keras tokenizer on the corpus, passed in as one single document.
tokenizer = Tokenizer()
tokenizer.fit_on_texts([txt_file])

# Vocabulary size used to size the model's embedding and output layers.
# The +1 presumably reserves index 0, which is not assigned to any word.
total_words = 1 + len(tokenizer.word_index)


In [None]:
# Display the word -> integer index mapping learned by the tokenizer.
tokenizer.word_index

## 2.1 Creating tokens and sequences (text to numbers)

In [None]:
# Build the n-gram training samples: for every line of the text, emit each
# prefix of its token ids that has at least two tokens. The last token of a
# prefix is the word to predict; the tokens before it are the context.
input_sequences = []
for encoded_line in tokenizer.texts_to_sequences(txt_file.split("\n")):
    input_sequences.extend(
        encoded_line[:end] for end in range(2, len(encoded_line) + 1)
    )

# Left-pad every prefix (padding='pre') to the length of the longest one so
# the samples form a rectangular array.
max_sequence_len = max(len(seq) for seq in input_sequences)
input_sequences = np.array(
    pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre')
)

# The final column holds the label; all preceding columns are the predictors.
X = input_sequences[:, :-1]
y = input_sequences[:, -1]

# Hold out 20% of the samples for evaluation.
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

## 3. Splitting the data (X, y) into train_loader and test_loader

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

# Turn the four numpy splits into int64 tensors (token ids and class labels).
x_train_t, y_train_t, x_test_t, y_test_t = (
    torch.tensor(split, dtype=torch.long)
    for split in (x_train, y_train, x_test, y_test)
)

# Pair each input sequence with its label.
train_ds = TensorDataset(x_train_t, y_train_t)
test_ds = TensorDataset(x_test_t, y_test_t)

# Batches of 64; only the training loader shuffles its samples.
train_loader = DataLoader(train_ds, shuffle=True, batch_size=64)
test_loader = DataLoader(test_ds, batch_size=64)

## 4. Defining model in pytorch

In [None]:
# Define Model
class NextWordLSTM(nn.Module):
    """Two stacked LSTM layers over word embeddings that score the next word.

    Args:
        vocab_size: Number of rows in the embedding table and number of
            output scores produced per sample.
        embed_dim: Size of each word embedding vector.
        hidden_dim: Hidden size of the first LSTM; the second LSTM uses
            ``hidden_dim // 2``.
    """

    def __init__(self, vocab_size: int, embed_dim: int, hidden_dim: int) -> None:
        super().__init__()
        reduced_dim = hidden_dim // 2
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.lstm1 = nn.LSTM(embed_dim, hidden_dim, batch_first=True)
        self.lstm2 = nn.LSTM(hidden_dim, reduced_dim, batch_first=True)
        self.dropout = nn.Dropout(0.2)
        self.fc = nn.Linear(reduced_dim, vocab_size)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return one score per vocabulary entry for each sequence in ``x``.

        ``x`` holds integer token ids laid out batch-first (both LSTMs are
        built with ``batch_first=True``). Only the output of the last time
        step of the second LSTM is passed on to dropout and the linear layer.
        """
        embedded = self.embedding(x)
        first_out, _ = self.lstm1(embedded)
        second_out, _ = self.lstm2(first_out)
        final_step = second_out[:, -1, :]
        return self.fc(self.dropout(final_step))

# Build the network sized to the tokenizer vocabulary, together with the
# cross-entropy loss and the Adam optimizer used by the training loop.
model = NextWordLSTM(vocab_size=total_words, embed_dim=100, hidden_dim=150)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=model.parameters(), lr=1e-3)

## 5. Training and testing loop

In [None]:
# Train for up to `epochs` epochs with early stopping on the validation loss.
# The weights with the lowest validation loss are written to "best_lstm.pth"
# and loaded back into `model` once training ends, so the cells that follow
# use the best model rather than whatever the last epoch produced.
epochs = 50
best_val_loss = float('inf')
patience, trigger = 5, 0  # stop after `patience` epochs without improvement
best_checkpoint_saved = False

for epoch in range(epochs):
    # ---- Training pass ----
    model.train()
    total_loss = 0
    for xb, yb in train_loader:  # xb = batch of inputs, yb = batch of labels
        optimizer.zero_grad()
        outputs = model(xb)
        loss = criterion(outputs, yb)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    # ---- Validation pass (no gradients, dropout disabled by eval()) ----
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for xb, yb in test_loader:
            outputs = model(xb)
            loss = criterion(outputs, yb)
            val_loss += loss.item()

    # Report mean per-batch losses. The two loaders have different numbers of
    # batches, so raw sums would not be comparable with each other.
    avg_train_loss = total_loss / max(len(train_loader), 1)
    avg_val_loss = val_loss / max(len(test_loader), 1)
    print(f"Epoch {epoch+1}/{epochs}, Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}")

    # Early stopping: checkpoint on improvement, otherwise count a strike.
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        trigger = 0
        torch.save(model.state_dict(), "best_lstm.pth")
        best_checkpoint_saved = True
    else:
        trigger += 1
        if trigger >= patience:
            print("Early stopping triggered.")
            break

# Restore the best weights. Without this the model keeps the parameters of the
# final epoch, which can be up to `patience` epochs past the best checkpoint.
# The flag guards against a run that never saved (e.g. a NaN validation loss).
if best_checkpoint_saved:
    model.load_state_dict(torch.load("best_lstm.pth"))
    model.eval()

## 6. Predict function

In [None]:

def predict_next_word(model, tokenizer, text, max_sequence_len):
    """Return the single most likely next word for ``text``.

    Args:
        model: Network called with a ``(1, max_sequence_len - 1)`` tensor of
            token ids; its output is arg-maxed along dimension 1.
        tokenizer: Fitted tokenizer providing ``texts_to_sequences`` and the
            ``index_word`` reverse mapping.
        text: Prompt whose continuation should be predicted.
        max_sequence_len: Padded sequence length used when building the
            training data; the model input is one token shorter.

    Returns:
        The predicted word, or ``None`` when the predicted index has no word
        in the tokenizer's vocabulary.

    Note:
        Puts ``model`` into eval mode and leaves it there.
    """
    model.eval()
    token_list = tokenizer.texts_to_sequences([text])[0]

    # pad_sequences left-pads short prompts and, with truncating='pre', keeps
    # only the most recent tokens of long ones, so no manual trimming is needed.
    padded = pad_sequences(
        [token_list],
        maxlen=max_sequence_len - 1,
        padding='pre',
        truncating='pre',
    )
    token_tensor = torch.tensor(padded, dtype=torch.long)

    with torch.no_grad():
        output = model(token_tensor)
        predicted_index = torch.argmax(output, dim=1).item()

    # index_word is the tokenizer's own index -> word dictionary; a direct
    # lookup replaces scanning every entry of word_index.
    return tokenizer.index_word.get(predicted_index)

# Demo: predict the word that follows a short prompt.
input_text = "Of man’s first disobedience"
print(f"Input: {input_text}")
next_word = predict_next_word(
    model=model,
    tokenizer=tokenizer,
    text=input_text,
    max_sequence_len=max_sequence_len,
)
print(f"Predicted next word: {next_word}")


## 7. Predict the top 5 words

In [None]:
import torch.nn.functional as F

def predict_top_k_words(model, tokenizer, text, max_sequence_len, k=5):
    """Return the ``k`` most probable next words for ``text``.

    Args:
        model: Network called with a ``(1, max_sequence_len - 1)`` tensor of
            token ids; its output is soft-maxed along dimension 1.
        tokenizer: Fitted tokenizer providing ``texts_to_sequences`` and the
            ``index_word`` reverse mapping.
        text: Prompt whose continuation should be predicted.
        max_sequence_len: Padded sequence length used when building the
            training data; the model input is one token shorter.
        k: Number of candidates to return. Values larger than the number of
            model outputs are clamped to that number.

    Returns:
        A list of ``(word, probability)`` tuples ordered from most to least
        probable. ``word`` is ``None`` for an index that has no entry in the
        tokenizer's vocabulary.

    Note:
        Puts ``model`` into eval mode and leaves it there.
    """
    model.eval()
    token_list = tokenizer.texts_to_sequences([text])[0]

    # pad_sequences left-pads short prompts and, with truncating='pre', keeps
    # only the most recent tokens of long ones, so no manual trimming is needed.
    padded = pad_sequences(
        [token_list],
        maxlen=max_sequence_len - 1,
        padding='pre',
        truncating='pre',
    )
    token_tensor = torch.tensor(padded, dtype=torch.long)

    with torch.no_grad():
        logits = model(token_tensor)
        probs = F.softmax(logits, dim=1)  # convert scores to probabilities
        # torch.topk raises if k exceeds the size of the dimension it ranks.
        k = min(k, probs.size(1))
        top_probs, top_indices = torch.topk(probs, k)

    # Convert once to plain Python values, then map indices to words through
    # the tokenizer's index -> word dictionary.
    return [
        (tokenizer.index_word.get(index), prob)
        for index, prob in zip(top_indices[0].tolist(), top_probs[0].tolist())
    ]


# Demo: list the five most probable continuations of a short prompt.
input_text = "Of man’s first disobedience"
print(f"Input: {input_text}")
predictions = predict_top_k_words(
    model=model,
    tokenizer=tokenizer,
    text=input_text,
    max_sequence_len=max_sequence_len,
    k=5,
)

print("\nTop 5 Predictions:")
for word, prob in predictions:
    print(f"{word} ({prob:.4f})")


In [None]:
# Display the raw corpus text loaded earlier.
txt_file

In [None]:
# Show the ten most probable continuations of a longer prompt.
predict_top_k_words(
    model,
    tokenizer,
    "In the beginnings how the heavens and earth rose out of",
    max_sequence_len,
    k=10,
)

In [None]:
## Saving files

joblib.dump(tokenizer, 'tokenizer.pkl')  # saving tokenizer for later use

# The prediction functions need max_sequence_len in addition to the tokenizer,
# so it is stored as well.
# NOTE(review): this replaces an argument-less joblib.dump() call, which always
# raises TypeError. Confirm that max_sequence_len is the artifact that was
# meant to be saved here.
joblib.dump(max_sequence_len, 'max_sequence_len.pkl')