In [6]:
import nltk 
from nltk.tokenize import word_tokenize
nltk.download('punkt')
nltk.download('punkt_tab')


[nltk_data] Downloading package punkt to C:\Users\Aniket
[nltk_data]     Mandal/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to C:\Users\Aniket
[nltk_data]     Mandal/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [7]:
# load data 
with open('sherlock-holm.es_stories_plain-text_advs.txt', 'r', encoding='utf-8') as f:
          text = f.read().lower()
          
tokens = word_tokenize(text)
print('Total Tokens:', len(tokens))

Total Tokens: 125772


In [8]:
from collections import Counter 

word_counts = Counter(tokens)
vocab = sorted(word_counts, key=word_counts.get, reverse=True)

word2idx = {word: idx for idx, word in enumerate(vocab)}
idx2word = {idx: word for word, idx in word2idx.items()}
vocab_size = len(vocab)

In [9]:
sequence_length = 4  # e.g., "I am going to [predict this]"

data = []
for i in range(len(tokens) - sequence_length):
    input_seq = tokens[i:i + sequence_length - 1]
    target = tokens[i + sequence_length - 1]
    data.append((input_seq, target))

# convert words to indices
def encode(seq): return [word2idx[word] for word in seq]

encoded_data = [(torch.tensor(encode(inp)), torch.tensor(word2idx[target]))
                for inp, target in data]

In [10]:
import torch.nn as nn

class PredictiveKeyboard(nn.Module):
    def __init__(self, vocab_size, embed_dim=64, hidden_dim=128):
        super(PredictiveKeyboard, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, vocab_size)
        
    def forward(self, x):
        x = self.embedding(x)
        output, _ = self.lstm(x)
        output = self.fc(output[:,-1, :]) # last LSTM output
        return output

In [11]:
# Training Pipeline 
import torch
import torch.optim as optim
import random

model = PredictiveKeyboard(vocab_size)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.005)

epochs = 10
for epoch in range(epochs):
    total_loss = 0
    random.shuffle(encoded_data)
    for input_seq, target in encoded_data[:10000]:
        input_seq = input_seq.unsqueeze(0)
        output = model(input_seq)
        loss = criterion(output, target.unsqueeze(0))
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    
    print(f"Epoch {epoch+1}, Loss: {total_loss:.4f}")

Epoch 1, Loss: 65995.3362
Epoch 2, Loss: 66762.7231
Epoch 3, Loss: 70311.2908
Epoch 4, Loss: 71701.0579
Epoch 5, Loss: 72439.7013
Epoch 6, Loss: 71635.1372
Epoch 7, Loss: 74627.7269
Epoch 8, Loss: 73484.9313
Epoch 9, Loss: 76085.8507
Epoch 10, Loss: 76295.3345


In [25]:
import torch.nn.functional as F

def suggest_next_words(model, text_prompt, top_k=3):
    model.eval()
    tokens = word_tokenize(text_prompt.lower())
    if len(tokens) < sequence_length - 1:
        raise ValueError(f"Input should be at least {sequence_length - 1} words long.")

    input_seq = tokens[-(sequence_length - 1):]
    input_tensor = torch.tensor(encode(input_seq)).unsqueeze(0)

    with torch.no_grad():
        output = model(input_tensor)
        probs = F.softmax(output, dim=1).squeeze()
        top_indices = torch.topk(probs, top_k).indices.tolist()

    return [idx2word[idx] for idx in top_indices]

print("Suggestions:", suggest_next_words(model, "So, Are you going"))

Suggestions: ['on', 'once', 'our']
