In [2]:
import re

# Read Shakespeare text file
with open("data/input.txt", 'r', encoding='utf-8') as f:
    text = f.read().lower()

# Tokenize with regex: words + punctuation
tokens = re.findall(r"\b\w+\b|[^\w\s]", text)
print(f"Total tokens: {len(tokens)}")
print(f"First 20 tokens: {tokens[:20]}")


Total tokens: 262927
First 20 tokens: ['first', 'citizen', ':', 'before', 'we', 'proceed', 'any', 'further', ',', 'hear', 'me', 'speak', '.', 'all', ':', 'speak', ',', 'speak', '.', 'first']


In [5]:
vocab = sorted(set(tokens))
word2idx = {word: idx for idx, word in enumerate(vocab)}
idx2word = {idx: word for word, idx in word2idx.items()}

print(f"Vocab size: {len(vocab)}")


Vocab size: 11466


In [6]:
seq_length = 5
inputs = []
targets = []

for i in range(len(tokens) - seq_length):
    seq = tokens[i:i+seq_length]          # input words
    target = tokens[i+seq_length]         # next word to predict
    inputs.append([word2idx[w] for w in seq])
    targets.append(word2idx[target])

import torch

X = torch.tensor(inputs)   # Shape: (num_sequences, seq_length)
y = torch.tensor(targets)  # Shape: (num_sequences,)

dataset_size = len(X)
split_ratio = 0.8
split_idx = int(dataset_size * split_ratio)

X_train = X[:split_idx]
y_train = y[:split_idx]

X_test = X[split_idx:]
y_test = y[split_idx:]

print(f"Train size: {len(X_train)}, Test size: {len(X_test)}")


Train size: 210337, Test size: 52585


In [7]:
import torch.nn as nn

class NextWordRNN(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)   # Turns word IDs into vectors
        self.rnn = nn.RNN(embed_size, hidden_size, batch_first=True)  # RNN layer
        self.fc = nn.Linear(hidden_size, vocab_size)            # Output layer (predict vocab logits)

    def forward(self, x):
        x = self.embedding(x)            # (batch_size, seq_length, embed_size)
        out, _ = self.rnn(x)             # (batch_size, seq_length, hidden_size)
        out = out[:, -1, :]              # Take output from last time step
        out = self.fc(out)               # (batch_size, vocab_size)
        return out


In [10]:
vocab_size = len(vocab)
embed_size = 64
hidden_size = 128
model = NextWordRNN(vocab_size, embed_size, hidden_size)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)


In [12]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

model = model.to(device)
X = X.to(device)
y = y.to(device)

# Assuming you have separate test sets X_test, y_test already prepared and on device:
X_test = X_test.to(device)
y_test = y_test.to(device)

num_epochs = 100
batch_size = 128

for epoch in range(num_epochs):
    # --- Training ---
    model.train()
    epoch_loss = 0
    correct = 0
    total = 0

    for i in range(0, len(X), batch_size):
        xb = X[i:i+batch_size].to(device)
        yb = y[i:i+batch_size].to(device)

        optimizer.zero_grad()
        outputs = model(xb)  # Shape: (batch_size, vocab_size) or (batch_size, num_classes)
        loss = criterion(outputs, yb)
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()

        # Calculate accuracy
        _, predicted = torch.max(outputs, dim=1)   # predicted classes
        correct += (predicted == yb).sum().item()
        total += yb.size(0)

    avg_train_loss = epoch_loss / (len(X) // batch_size)
    train_accuracy = correct / total * 100
    print(f"Epoch {epoch+1}/{num_epochs}, Train Loss: {avg_train_loss:.4f}, Train Accuracy: {train_accuracy:.2f}%")

    # --- Testing / Validation ---
    model.eval()
    test_loss = 0
    correct_test = 0
    total_test = 0
    with torch.no_grad():
        for i in range(0, len(X_test), batch_size):
            xb = X_test[i:i+batch_size].to(device)
            yb = y_test[i:i+batch_size].to(device)
            outputs = model(xb)
            loss = criterion(outputs, yb)
            test_loss += loss.item()

            _, predicted = torch.max(outputs, dim=1)
            correct_test += (predicted == yb).sum().item()
            total_test += yb.size(0)

    avg_test_loss = test_loss / (len(X_test) // batch_size)
    test_accuracy = correct_test / total_test * 100
    print(f"Epoch {epoch+1}/{num_epochs}, Test Loss: {avg_test_loss:.4f}, Test Accuracy: {test_accuracy:.2f}%")


Using device: cuda
Epoch 1/100, Train Loss: 3.1484, Train Accuracy: 34.30%
Epoch 1/100, Test Loss: 3.5671, Test Accuracy: 27.97%
Epoch 2/100, Train Loss: 3.1299, Train Accuracy: 34.61%
Epoch 2/100, Test Loss: 3.5447, Test Accuracy: 28.34%
Epoch 3/100, Train Loss: 3.1142, Train Accuracy: 34.87%
Epoch 3/100, Test Loss: 3.5194, Test Accuracy: 28.62%
Epoch 4/100, Train Loss: 3.1002, Train Accuracy: 35.10%
Epoch 4/100, Test Loss: 3.5073, Test Accuracy: 28.79%
Epoch 5/100, Train Loss: 3.0867, Train Accuracy: 35.38%
Epoch 5/100, Test Loss: 3.4963, Test Accuracy: 28.98%
Epoch 6/100, Train Loss: 3.0756, Train Accuracy: 35.53%
Epoch 6/100, Test Loss: 3.4882, Test Accuracy: 29.22%
Epoch 7/100, Train Loss: 3.0640, Train Accuracy: 35.70%
Epoch 7/100, Test Loss: 3.4801, Test Accuracy: 29.44%
Epoch 8/100, Train Loss: 3.0553, Train Accuracy: 35.88%
Epoch 8/100, Test Loss: 3.4710, Test Accuracy: 29.61%
Epoch 9/100, Train Loss: 3.0484, Train Accuracy: 35.94%
Epoch 9/100, Test Loss: 3.4754, Test Accuracy

In [13]:
torch.save(model.state_dict(), "next_word.pth")
import pickle
with open("word2idx.pkl", "wb") as f:
    pickle.dump(word2idx, f)
with open("idx2word.pkl", "wb") as f:
    pickle.dump(idx2word, f)


In [14]:
def predict_next_word(model, prompt, k=3):
    model.eval()
    prompt_tokens = re.findall(r"\b\w+\b|[^\w\s]", prompt.lower())
    input_seq = prompt_tokens[-seq_length:]
    input_ids = [word2idx.get(w, 0) for w in input_seq]
    input_tensor = torch.tensor([input_ids]).to(next(model.parameters()).device)  # move to model's device

    with torch.no_grad():
        output = model(input_tensor)                 # logits over vocab
        probs = torch.softmax(output, dim=1)         # probabilities
        top_probs, top_indices = torch.topk(probs, k)

        predictions = [idx2word[idx.item()] for idx in top_indices[0]]
        return predictions



In [None]:
print("welcome to the world of AI!")    

prompt = "to be or not to be"
predictions = predict_next_word(model, prompt)

print(f"Prompt: '{prompt}'")
print("Predicted next words:", predictions)

for i in range(5):
    prompt = prompt+" "+input("Enter a prompt: ")
    if prompt.lower() == "exit":
        break
    predictions = predict_next_word(model, prompt)
    print(f"Prompt: '{prompt}'")
    print("Predicted next words:", predictions)