### Imports + device

In [3]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset , DataLoader
from collections import Counter
import random
import re

In [5]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("Device", device)

Device cuda


### Load and Simple Preprocess (Keep only words , lowercase)

In [8]:
path = "/content/data.txt"
text = open(path, 'r' , encoding='utf-8').read()

# Simple cleaning: remove headers/footers roughly, lowercase, keep words + basic punctuation
text = text.split("*** START OF THIS PROJECT GUTENBERG EBOOK")[0] + text.split("*** START OF THIS PROJECT GUTENBERG EBOOK")[-1]
text = text.lower()
# replace non-word chars with spaces except apostrophes
text = re.sub(r"[^a-z0-9'\s]+", " ", text)
words = text.split()
print("Total tokens:", len(words))

Total tokens: 109242


### Build vocabulary (word -> idx) and sequences

In [9]:
vocab_size_limit = None   # set int to cap vocab size, or None to use full vocab
counts = Counter(words)
if vocab_size_limit:
    most = counts.most_common(vocab_size_limit-2)
    vocab = ["<pad>","<unk>"] + [w for w,_ in most]
else:
    vocab = ["<pad>","<unk>"] + sorted(counts.keys(), key=lambda x:-counts[x])
stoi = {w:i for i,w in enumerate(vocab)}
itos = {i:w for w,i in stoi.items()}

# convert words to indices (with unk handling)
data = [ stoi.get(w, stoi["<unk>"]) for w in words ]

# create input-output pairs using sliding window of length seq_len
seq_len = 6   # number of previous words to condition on
inputs, targets = [], []
for i in range(len(data)-seq_len):
    inputs.append(data[i:i+seq_len])
    targets.append(data[i+seq_len])
print("Samples:", len(inputs), "Vocab size:", len(vocab))

Samples: 109236 Vocab size: 8177


### Dataset + DataLoader

In [10]:
class NextWordDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y
    def __len__(self): return len(self.X)
    def __getitem__(self, idx):
        return torch.tensor(self.X[idx], dtype=torch.long), torch.tensor(self.y[idx], dtype=torch.long)

batch_size = 128
ds = NextWordDataset(inputs, targets)
loader = DataLoader(ds, batch_size=batch_size, shuffle=True, drop_last=True)

### Simple LSTM model

In [11]:
class WordLSTM(nn.Module):
    def __init__(self, vocab_size, emb_dim=128, hid_dim=256, n_layers=1, dropout=0.2):
        super().__init__()
        self.emb = nn.Embedding(vocab_size, emb_dim, padding_idx=0)
        self.lstm = nn.LSTM(emb_dim, hid_dim, num_layers=n_layers, batch_first=True, dropout=dropout)
        self.fc = nn.Linear(hid_dim, vocab_size)
    def forward(self, x):
        # x: (B, seq_len)
        e = self.emb(x)                     # (B, seq_len, emb_dim)
        out, _ = self.lstm(e)               # (B, seq_len, hid_dim)
        out = out[:, -1, :]                 # take last hidden state (B, hid_dim)
        return self.fc(out)                 # (B, vocab_size)

model = WordLSTM(len(vocab)).to(device)



### Training loop (concise)

In [None]:
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss()

epochs = 60   # increase for better results
model.train()
for epoch in range(1, epochs+1):
    total_loss = 0.0
    for xb, yb in loader:
        xb, yb = xb.to(device), yb.to(device)
        logits = model(xb)
        loss = criterion(logits, yb)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item() * xb.size(0)
    print(f"Epoch {epoch}/{epochs}  loss: {total_loss/len(loader.dataset):.4f}")

### Next-word sampling / generation helper

In [16]:
import torch.nn.functional as F

def predict_next(model, seed_text, k=1, max_words=40, temperature=1.0):
    """
    seed_text: string of seed words (can be shorter than seq_len)
    k: number of words to sample each step (1 usual)
    returns generated string (seed + generated)
    """
    model.eval()
    seed_tokens = re.sub(r"[^a-z0-9'\s]+"," ", seed_text.lower()).split()
    idxs = [ stoi.get(w, stoi["<unk>"]) for w in seed_tokens ]
    out_words = seed_tokens.copy()
    for _ in range(max_words):
        # prepare seq: last seq_len tokens (pad if needed)
        cur = idxs[-seq_len:]
        if len(cur) < seq_len:
            cur = [stoi["<pad>"]] * (seq_len - len(cur)) + cur
        xb = torch.tensor([cur], dtype=torch.long).to(device)
        with torch.no_grad():
            logits = model(xb)  # (1, vocab)
            probs = F.softmax(logits / temperature, dim=-1).cpu().numpy().ravel()
        next_ix = int(np.random.choice(len(probs), p=probs))
        next_word = itos.get(next_ix, "<unk>")
        out_words.append(next_word)
        idxs.append(next_ix)
    return " ".join(out_words)

# small example (after training)
import numpy as np
seed = "sherlock holmes was"
print(predict_next(model, seed, max_words=40))

sherlock holmes was safe my friend s stepfather mr holmes pray tell me always about the father but it was quite to see that a gentleman could have been fastened she was the very first day and it was that i lock it


### save model

In [17]:
torch.save(model.state_dict(), "lstm_nextword.pt")