In [9]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm
import re

# ================== Load and Preprocess Text ==================
# Read the whole corpus into memory and lower-case it, so the vocabulary
# built below is case-insensitive.
print("Loading dataset...")
with open("shakespeare_full_cleaned.txt", "r", encoding="utf-8") as f:
    text_data = f.read().lower()

# ================== Tokenization ==================
# A token is either a run of word characters or a single non-space,
# non-word character (punctuation); whitespace itself is discarded.
words = re.compile(r"\b\w+\b|[^\w\s]").findall(text_data)

# Sorting the unique tokens makes the token <-> id assignment deterministic,
# independent of set iteration order.
vocab = list(set(words))
vocab.sort()
vocab_size = len(vocab)

# Two lookup tables over the same sorted order: id -> token and token -> id.
idx2word = dict(enumerate(vocab))
word2idx = {token: position for position, token in idx2word.items()}

# ================== Sequence Preparation ==================
sequence_len = 20

# At least sequence_len + 1 tokens are required to form a single
# (input, shifted-target) pair; fail here with a clear message instead of
# later inside the DataLoader.
if len(words) <= sequence_len:
    raise ValueError(
        f"Corpus has {len(words)} tokens; need more than {sequence_len} "
        "to build at least one training window."
    )

# Encode the corpus once, then expose every window of sequence_len + 1
# consecutive tokens (stride 1) as a row of a strided view.  This yields
# len(words) - sequence_len rows -- exactly the windows the explicit Python
# loop used to build -- without materialising millions of nested lists of
# Python ints.
token_ids = torch.tensor([word2idx[w] for w in words], dtype=torch.long)
windows = token_ids.unfold(0, sequence_len + 1, 1)

# Row i of input_tensor holds tokens i .. i+sequence_len-1; row i of
# target_tensor holds the same window shifted one token to the right.
input_tensor = windows[:, :-1]
target_tensor = windows[:, 1:]

# ================== DataLoader ==================
batch_size = 64
dataset = TensorDataset(input_tensor, target_tensor)
# Shuffle so that consecutive batches are not made of overlapping windows.
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# ================== Transformer Model ==================
from Transformer import Transformer  # Your custom model

embedding_dim = 128

# Pick the compute device at runtime instead of hard-coding Apple's MPS
# backend, so the cell also runs on CUDA machines and CPU-only hosts.
# The getattr guard keeps this working on PyTorch builds that predate
# torch.backends.mps.
if torch.cuda.is_available():
    device = torch.device("cuda")
elif getattr(torch.backends, "mps", None) is not None and torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")

# The same width is passed as both number_of_embeddings and input_dimensions.
model = Transformer(
    vocabulary_size=vocab_size,
    number_of_embeddings=embedding_dim,
    sequence_len=sequence_len,
    input_dimensions=embedding_dim,
).to(device)

# ================== Training Setup ==================
# Token-level cross-entropy over the vocabulary, optimised with Adam.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# ================== Training Loop ==================
num_epochs = 10

print("Training started...")
for epoch in range(num_epochs):
    model.train()
    total_loss = 0.0
    for x, y in tqdm(dataloader, desc=f"Epoch {epoch+1}"):
        x, y = x.to(device), y.to(device)
        optimizer.zero_grad()
        output = model(x)
        # Flatten the batch and sequence axes so each position is scored as
        # an independent vocab_size-way classification.
        loss = criterion(output.view(-1, vocab_size), y.view(-1))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    # The summed loss grows with the number of batches, so also report the
    # per-batch mean, which is comparable across dataset and batch sizes.
    mean_loss = total_loss / max(len(dataloader), 1)
    # NOTE(review): a very small per-batch mean for next-word prediction can
    # indicate that the model sees future tokens -- verify that Transformer
    # applies a causal mask (TODO confirm; the model code is not visible here).
    print(f"Epoch {epoch+1}, Loss: {total_loss:.4f} (mean per batch: {mean_loss:.4f})")

# ================== Save Model ==================
# Only the parameters are saved; the vocabulary and hyper-parameters must be
# reconstructed by whoever loads this checkpoint.
torch.save(model.state_dict(), "word_level_transformer.pth")
print("✅ Model saved to 'word_level_transformer.pth'")

Loading dataset...
Training started...


Epoch 1: 100%|██████████| 18156/18156 [14:44<00:00, 20.54it/s]


Epoch 1, Loss: 12228.2633


Epoch 2: 100%|██████████| 18156/18156 [16:56<00:00, 17.87it/s]


Epoch 2, Loss: 5227.2968


Epoch 3: 100%|██████████| 18156/18156 [16:53<00:00, 17.92it/s]


Epoch 3, Loss: 4843.5106


Epoch 4: 100%|██████████| 18156/18156 [17:09<00:00, 17.63it/s]


Epoch 4, Loss: 4671.8702


Epoch 5: 100%|██████████| 18156/18156 [17:42<00:00, 17.09it/s]


Epoch 5, Loss: 4558.4466


Epoch 6: 100%|██████████| 18156/18156 [16:21<00:00, 18.50it/s]


Epoch 6, Loss: 4469.3141


Epoch 7: 100%|██████████| 18156/18156 [17:07<00:00, 17.67it/s]


Epoch 7, Loss: 4395.1040


Epoch 8: 100%|██████████| 18156/18156 [16:39<00:00, 18.16it/s]


Epoch 8, Loss: 4327.4048


Epoch 9: 100%|██████████| 18156/18156 [17:26<00:00, 17.34it/s]


Epoch 9, Loss: 4267.2158


Epoch 10: 100%|██████████| 18156/18156 [17:12<00:00, 17.58it/s]

Epoch 10, Loss: 4212.9968
✅ Model saved to 'word_level_transformer.pth'





In [11]:
import torch
import torch.nn.functional as F
import re
from Transformer import Transformer  # Your custom model

# ======= Load Vocabulary from Dataset =======
# The vocabulary is rebuilt from the corpus rather than loaded from disk, so
# the lower-casing, token pattern and sort order here determine which id
# each token gets when decoding the model's predictions.
with open("shakespeare_full_cleaned.txt", "r", encoding="utf-8") as f:
    text_data = f.read().lower()

# Tokens are runs of word characters or single punctuation characters.
words = re.compile(r"\b\w+\b|[^\w\s]").findall(text_data)

# Sorted unique tokens give a deterministic token <-> id mapping.
vocab = list(set(words))
vocab.sort()
vocab_size = len(vocab)
idx2word = dict(enumerate(vocab))
word2idx = {token: position for position, token in idx2word.items()}

# ======= Model Setup =======
# Presumably these must match the values used when the checkpoint was
# trained, otherwise load_state_dict will reject mismatching shapes --
# TODO confirm against the Transformer implementation.
sequence_len = 20
embedding_dim = 128

# Use the GPU when CUDA is present, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

model = Transformer(
    vocabulary_size=vocab_size,
    number_of_embeddings=embedding_dim,
    sequence_len=sequence_len,
    input_dimensions=embedding_dim,
)
model = model.to(device)

# map_location remaps the stored tensors onto the device chosen above, so a
# checkpoint written on a different device type can still be loaded.
# NOTE(review): torch.load unpickles the file; only load checkpoints you
# trust, and consider weights_only=True where the installed PyTorch has it.
model.load_state_dict(torch.load("word_level_transformer.pth", map_location=device))
# Switch to inference mode before sampling.
model.eval()
print("✅ Model loaded and ready for generation")

# ======= Generation Parameters =======
num_generate = 50   # number of new tokens to sample
temperature = 1.0   # logits are divided by this before sampling
top_k = 20          # sample only among the k highest-scoring tokens

# Dividing logits by zero or a negative value would produce inf/NaN or
# invert the ranking, so reject such settings up front.
if temperature <= 0:
    raise ValueError(f"temperature must be positive, got {temperature}")

# ======= Start from Random Context =======
# Seed the context window with sequence_len uniformly random token ids;
# these are kept in `generated` and therefore appear in the printed text.
generated = [torch.randint(0, vocab_size, (1,)).item() for _ in range(sequence_len)]

with torch.no_grad():
    for _ in range(num_generate):
        # Feed only the most recent sequence_len tokens as a batch of one.
        x = torch.tensor([generated[-sequence_len:]], dtype=torch.long, device=device)
        # Scores for the token following the last position of the window.
        logits = model(x)[0, -1] / temperature
        # torch.topk raises if k exceeds the number of candidates, so clamp
        # it to the size of the logits vector.
        k = min(top_k, logits.size(-1))
        topk_logits, topk_indices = torch.topk(logits, k)
        # Renormalise over the surviving candidates and draw one of them.
        probs = F.softmax(topk_logits, dim=-1)
        choice = torch.multinomial(probs, num_samples=1)
        # Map the position within the top-k list back to a vocabulary id.
        next_token = topk_indices[choice].item()
        generated.append(next_token)

# ======= Decode and Print =======
result = [idx2word[i] for i in generated]
print("\n📝 Generated Text:\n" + " ".join(result))

✅ Model loaded and ready for generation

📝 Generated Text:
birdbolt attachment perdurable took zephyrs fumblest cringe avenged housekeepers andronicus kites less travelers untie speak harnessed accursed seconded misused abundant up in this same interchange in france , and his humble suit again , his name was fair . king henry , henry i know henry , and he did you . gloucester are so . i have no more than you are ; you ; i ' ll tell
