In [1]:
import re
import pandas as pd
import string

df = pd.read_csv("data/poem.csv", nrows=500)  

poem_lines = df.iloc[:, 2].dropna().astype(str)
poem_lines = poem_lines.apply(lambda x: x.strip())
poem = " ".join(poem_lines)

tokens = re.findall(r"\b\w+\b|[^\w\s]", poem)

# Output
print(f"Total tokens: {len(tokens)}")
print(f"First 20 tokens: {tokens[:20]}")


Total tokens: 140109
First 20 tokens: ['Dog', 'bone', ',', 'stapler', ',', 'cribbage', 'board', ',', 'garlic', 'press', 'because', 'this', 'window', 'is', 'loose', '—', 'lacks', 'suction', ',', 'lacks']


In [2]:
vocab = sorted(set(tokens))
word2idx = {word: idx for idx, word in enumerate(vocab)}
idx2word = {idx: word for word, idx in word2idx.items()}

print(f"Vocab size: {len(vocab)}")


Vocab size: 16491


In [3]:
seq_length = 25
inputs = []
targets = []

for i in range(len(tokens) - seq_length):
    seq = tokens[i:i+seq_length]          # input words
    target = tokens[i+seq_length]         # next word to predict
    inputs.append([word2idx[w] for w in seq])
    targets.append(word2idx[target])

import torch

X = torch.tensor(inputs) 
y = torch.tensor(targets) 

dataset_size = len(X)
split_ratio = 0.8
split_idx = int(dataset_size * split_ratio)

X_train = X[:split_idx]
y_train = y[:split_idx]

X_test = X[split_idx:]
y_test = y[split_idx:]

print(f"Train size: {len(X_train)}, Test size: {len(X_test)}")


Train size: 112067, Test size: 28017


In [4]:
import torch.nn as nn

class NextWordRNN(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)   
        self.lstm = nn.LSTM(embed_size, hidden_size, batch_first=True) 
        self.fc = nn.Linear(hidden_size, vocab_size)         

    def forward(self, x):
        x = self.embedding(x)    
        out, _ = self.lstm(x)     
        out = out[:, -1, :]   
        out = self.fc(out)
        return out


In [5]:
vocab_size = len(vocab)
embed_size = 64
hidden_size = 128
model = NextWordRNN(vocab_size, embed_size, hidden_size)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)


In [6]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

model = model.to(device)
X = X.to(device)
y = y.to(device)

X_test = X_test.to(device)
y_test = y_test.to(device)

num_epochs = 100
batch_size = 128

for epoch in range(num_epochs):
    # --- Training ---
    model.train()
    epoch_loss = 0
    correct = 0
    total = 0

    for i in range(0, len(X), batch_size):
        xb = X[i:i+batch_size].to(device)
        yb = y[i:i+batch_size].to(device)

        optimizer.zero_grad()
        outputs = model(xb)
        loss = criterion(outputs, yb)
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()

        # Calculate accuracy
        _, predicted = torch.max(outputs, dim=1) 
        correct += (predicted == yb).sum().item()
        total += yb.size(0)

    avg_train_loss = epoch_loss / (len(X) // batch_size)
    train_accuracy = correct / total * 100
    print(f"Epoch {epoch+1}/{num_epochs}, Train Loss: {avg_train_loss:.4f}, Train Accuracy: {train_accuracy:.2f}%")

    # --- Testing / Validation ---
    model.eval()
    test_loss = 0
    correct_test = 0
    total_test = 0
    with torch.no_grad():
        for i in range(0, len(X_test), batch_size):
            xb = X_test[i:i+batch_size].to(device)
            yb = y_test[i:i+batch_size].to(device)
            outputs = model(xb)
            loss = criterion(outputs, yb)
            test_loss += loss.item()

            _, predicted = torch.max(outputs, dim=1)
            correct_test += (predicted == yb).sum().item()
            total_test += yb.size(0)

    avg_test_loss = test_loss / (len(X_test) // batch_size)
    test_accuracy = correct_test / total_test * 100
    print(f"Epoch {epoch+1}/{num_epochs}, Test Loss: {avg_test_loss:.4f}, Test Accuracy: {test_accuracy:.2f}%")


Using device: cuda
Epoch 1/100, Train Loss: 7.0383, Train Accuracy: 7.67%
Epoch 1/100, Test Loss: 6.5279, Test Accuracy: 9.43%
Epoch 2/100, Train Loss: 6.2786, Train Accuracy: 10.47%
Epoch 2/100, Test Loss: 5.9427, Test Accuracy: 11.90%
Epoch 3/100, Train Loss: 5.8997, Train Accuracy: 11.80%
Epoch 3/100, Test Loss: 5.5181, Test Accuracy: 12.98%
Epoch 4/100, Train Loss: 5.5792, Train Accuracy: 12.66%
Epoch 4/100, Test Loss: 5.1792, Test Accuracy: 13.64%
Epoch 5/100, Train Loss: 5.2907, Train Accuracy: 13.42%
Epoch 5/100, Test Loss: 4.8877, Test Accuracy: 14.71%
Epoch 6/100, Train Loss: 5.0143, Train Accuracy: 14.25%
Epoch 6/100, Test Loss: 4.6366, Test Accuracy: 16.28%
Epoch 7/100, Train Loss: 4.7550, Train Accuracy: 15.53%
Epoch 7/100, Test Loss: 4.4150, Test Accuracy: 18.40%
Epoch 8/100, Train Loss: 4.5126, Train Accuracy: 17.49%
Epoch 8/100, Test Loss: 4.2359, Test Accuracy: 20.81%
Epoch 9/100, Train Loss: 4.2893, Train Accuracy: 19.99%
Epoch 9/100, Test Loss: 4.0628, Test Accuracy: 

In [7]:
torch.save(model.state_dict(), "next_word.pth")
import pickle
with open("word2idx.pkl", "wb") as f:
    pickle.dump(word2idx, f)
with open("idx2word.pkl", "wb") as f:
    pickle.dump(idx2word, f)


In [8]:
def predict_next_word(model, prompt, k=3):
    model.eval()
    prompt_tokens = re.findall(r"\b\w+\b|[^\w\s]", prompt.lower())
    input_seq = prompt_tokens[-seq_length:]
    input_ids = [word2idx.get(w, 0) for w in input_seq]
    input_tensor = torch.tensor([input_ids]).to(next(model.parameters()).device) 

    with torch.no_grad():
        output = model(input_tensor)                
        probs = torch.softmax(output, dim=1)         # probabilities
        top_probs, top_indices = torch.topk(probs, k)

        predictions = [idx2word[idx.item()] for idx in top_indices[0]]
        return predictions

