In [None]:
# preparing the dataset by tokenizing the text data and converting everything to lowercase
import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt')
nltk.download('punkt_tab')

# load data
with open('sherlock-holm.es_stories_plain-text_advs.txt', 'r', encoding='utf-8') as f:
    text = f.read().lower()

tokens = word_tokenize(text)
print("Total Tokens:", len(tokens))

[nltk_data] Downloading package punkt to C:\Users\HP
[nltk_data]     FOLIO\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to C:\Users\HP
[nltk_data]     FOLIO\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Total Tokens: 125772


In [3]:
# building the vocabulary and creating word-to-index mappings
from collections import Counter

word_counts = Counter(tokens)
vocab = sorted(word_counts, key=word_counts.get, reverse=True)

word2idx = {word: idx for idx, word in enumerate(vocab)}
idx2word = {idx: word for word, idx in word2idx.items()}
vocab_size = len(vocab)

In [6]:
# create input-target sequences for next word prediction
import torch
sequence_length = 4  # e.g., "I am going to [predict this]"

data = []
for i in range(len(tokens) - sequence_length):
    input_seq = tokens[i:i + sequence_length - 1]
    target = tokens[i + sequence_length - 1]
    data.append((input_seq, target))

# convert words to indices
def encode(seq): return [word2idx[word] for word in seq]

encoded_data = [(torch.tensor(encode(inp)), torch.tensor(word2idx[target]))
                for inp, target in data]

In [7]:
# define the LSTM-based Predictive Keyboard model
import torch.nn as nn

class PredictiveKeyboard(nn.Module):
    def __init__(self, vocab_size, embed_dim=64, hidden_dim=128):
        super(PredictiveKeyboard, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x):
        x = self.embedding(x)
        output, _ = self.lstm(x)
        output = self.fc(output[:, -1, :])  # last LSTM output
        return output

In [8]:
# train the model on input-target word sequences
import torch
import torch.optim as optim
import random

model = PredictiveKeyboard(vocab_size)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.005)

epochs = 20
for epoch in range(epochs):
    total_loss = 0
    random.shuffle(encoded_data)
    for input_seq, target in encoded_data[:10000]:  # Limit data for speed
        input_seq = input_seq.unsqueeze(0)
        output = model(input_seq)
        loss = criterion(output, target.unsqueeze(0))

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    print(f"Epoch {epoch+1}, Loss: {total_loss:.4f}")

Epoch 1, Loss: 65926.0016
Epoch 2, Loss: 67716.0450
Epoch 3, Loss: 69567.7317
Epoch 4, Loss: 70882.2244
Epoch 5, Loss: 72777.6713
Epoch 6, Loss: 72348.7293
Epoch 7, Loss: 71978.6820
Epoch 8, Loss: 74495.4696
Epoch 9, Loss: 74547.1470
Epoch 10, Loss: 76101.1995
Epoch 11, Loss: 75044.9536
Epoch 12, Loss: 76796.5563
Epoch 13, Loss: 76412.2413
Epoch 14, Loss: 77534.9266
Epoch 15, Loss: 77155.9877
Epoch 16, Loss: 79957.8370
Epoch 17, Loss: 80817.3230
Epoch 18, Loss: 79512.8713
Epoch 19, Loss: 80176.3040
Epoch 20, Loss: 81638.9565


In [9]:
import torch.nn.functional as F

def suggest_next_words(model, text_prompt, top_k=3):
    model.eval()
    tokens = word_tokenize(text_prompt.lower())
    if len(tokens) < sequence_length - 1:
        raise ValueError(f"Input should be at least {sequence_length - 1} words long.")

    input_seq = tokens[-(sequence_length - 1):]
    input_tensor = torch.tensor(encode(input_seq)).unsqueeze(0)

    with torch.no_grad():
        output = model(input_tensor)
        probs = F.softmax(output, dim=1).squeeze()
        top_indices = torch.topk(probs, top_k).indices.tolist()

    return [idx2word[idx] for idx in top_indices]

print("Suggestions:", suggest_next_words(model, "So, are we really at"))

Suggestions: ['the', 'my', ',']
