In [1]:
import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt')
nltk.download('punkt_tab')

with open('/content/sb.txt','r',encoding='utf-8') as f:
  text = f.read().lower()

tokens = word_tokenize(text)
print("Total Tokens:", len(tokens))


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


Total Tokens: 125772


In [2]:
from collections import Counter

word_counts = Counter(tokens)
vocab = sorted(word_counts, key=word_counts.get, reverse=True)
word2idx = {word: idx for idx, word in enumerate(vocab)}
idx2word = {idx: word for word, idx in word2idx.items()}
vocab_size = len(vocab)

In [4]:
import torch

In [5]:
sequence_lenght = 4
data = []
for i in range(len(tokens) - sequence_lenght):
  input_seq = tokens[i :i + sequence_lenght - 1]
  target = tokens[i + sequence_lenght - 1]
  data.append((input_seq, target))
def encode(seq): return [word2idx[word] for word in seq]
encoded_data = [(torch.tensor(encode(inp)), torch.tensor(word2idx[target])) for inp, target in data]

In [6]:
import torch.nn as nn
class PredictiveKeyboard(nn.Module):
  def __init__(self, vocab_size, embed_dim=64, hidden_dim=128):
    super(PredictiveKeyboard, self).__init__()
    self.embedding = nn.Embedding(vocab_size, embed_dim)
    self.lstm = nn.LSTM(embed_dim,hidden_dim, batch_first=True)
    self.fc = nn.Linear(hidden_dim,vocab_size)

  def forward(self, x):
    x = self.embedding(x)
    output, _ = self.lstm(x)
    output = self.fc(output[:, -1,:])
    return output


In [7]:
import torch
import torch.optim as optim
import random

model = PredictiveKeyboard(vocab_size)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.005)

epochs = 20
for epoch in range(epochs):
  total_loss = 0
  random.shuffle(encoded_data)
  for input_seq, target in encoded_data[:100]:
    input_seq = input_seq.unsqueeze(0)
    output = model(input_seq)
    loss = criterion(output, target.unsqueeze(0))
  print(f"Epoch {epoch+1}, Loss: {total_loss:.4f}")

Epoch 1, Loss: 0.0000
Epoch 2, Loss: 0.0000
Epoch 3, Loss: 0.0000
Epoch 4, Loss: 0.0000
Epoch 5, Loss: 0.0000
Epoch 6, Loss: 0.0000
Epoch 7, Loss: 0.0000
Epoch 8, Loss: 0.0000
Epoch 9, Loss: 0.0000
Epoch 10, Loss: 0.0000
Epoch 11, Loss: 0.0000
Epoch 12, Loss: 0.0000
Epoch 13, Loss: 0.0000
Epoch 14, Loss: 0.0000
Epoch 15, Loss: 0.0000
Epoch 16, Loss: 0.0000
Epoch 17, Loss: 0.0000
Epoch 18, Loss: 0.0000
Epoch 19, Loss: 0.0000
Epoch 20, Loss: 0.0000


In [9]:
import torch.nn.functional as F
def suggest_next_words(model, text_prompt,top_k=3):
  model.eval()
  tokens = word_tokenize(text_prompt.lower())
  if len(tokens) < sequence_lenght - 1:
    raise ValueError(f" input should be at least {sequence_lenght - 1} words longs.")
  input_seq = tokens[-(sequence_lenght - 1):]
  input_tensor = torch.tensor(encode(input_seq)).unsqueeze(0)
  with torch.no_grad():
    output = model(input_tensor)
    probs = F.softmax(output, dim=1).squeeze()
    top_indices = torch.topk(probs, top_k).indices.tolist()
  return [idx2word[idx] for idx in top_indices]
print("Suggestions:", suggest_next_words(model,"He was in the house about half an hour"))

Suggestions: ['concealment', 'cut', 'tradesmen']
