In [1]:
!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

--2025-12-18 18:58:04--  https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: ‘input.txt’


2025-12-18 18:58:04 (37.8 MB/s) - ‘input.txt’ saved [1115394/1115394]



In [2]:
# Read the whole downloaded corpus into memory as a single string
# (text mode is open()'s default, so the mode argument is omitted).
with open("input.txt", encoding="utf-8") as f:
    text = f.read()

In [3]:
# Report the corpus size; len() of a str counts characters, not bytes or tokens.
print("length of dataset in characters: ", len(text))

length of dataset in characters:  1115394


In [4]:
# Eyeball the first 200 characters to confirm the file loaded as expected.
print(text[:200])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you


In [5]:
from tokenizers import ByteLevelBPETokenizer

# Special tokens handed to the BPE trainer alongside the learned merges.
special_tokens = ["<s>", "<pad>", "</s>", "<unk>", "<mask>"]

# Learn a byte-level BPE vocabulary directly from the corpus file.
# vocab_size=5000 is the requested vocabulary size; min_frequency=2 is passed
# through to the trainer as the minimum pair frequency (see the tokenizers docs).
tokenizer = ByteLevelBPETokenizer()
tokenizer.train(files=["input.txt"], vocab_size=5000, min_frequency=2, special_tokens=special_tokens)


In [6]:
# Sanity-check the freshly trained tokenizer on a short phrase.
encoded = tokenizer.encode("To be, or not to be")
# Sub-word strings; in the printed output `Ġ` prefixes tokens that followed a space.
print(encoded.tokens)
# The integer ids for the same tokens, in the same order.
print(encoded.ids)

['To', 'Ġbe', ',', 'Ġor', 'Ġnot', 'Ġto', 'Ġbe']
[403, 309, 16, 528, 326, 292, 309]


In [7]:
import os
# Create the output directory for the tokenizer files; exist_ok=True makes re-runs a no-op.
os.makedirs("tokenizer", exist_ok=True)

In [8]:
# Write the trained tokenizer into ./tokenizer; the call returns the list of paths it
# wrote, which the notebook displays as this cell's output.
tokenizer.save_model("tokenizer")

['tokenizer/vocab.json', 'tokenizer/merges.txt']

In [9]:
from tokenizers import ByteLevelBPETokenizer

# Rebuild the tokenizer from the two files saved on disk, so everything below
# depends only on the persisted artifacts rather than the in-memory trained object.
tokenizer = ByteLevelBPETokenizer("tokenizer/vocab.json", "tokenizer/merges.txt")

# vocab_size is reused later as the model's embedding/output dimension.
vocab_size = tokenizer.get_vocab_size()
print(vocab_size)


5000


In [10]:
# Tokenize the entire corpus. NOTE: this rebinds `encoded` (an Encoding object in the
# earlier demo cell) to the `.ids` of the full-text encoding.
encoded = tokenizer.encode(text).ids

In [11]:
import torch
# Token ids as an int64 tensor; torch.long is the index dtype nn.Embedding and
# CrossEntropyLoss targets are used with further down.
data = torch.tensor(encoded, dtype=torch.long)


In [12]:
# Contiguous 90/10 split: the first 90% of tokens train the model and the
# remaining tail is held out for validation (no shuffling, order is preserved).
n = int(0.9 * len(data))
train_data, val_data = data[:n], data[n:]

In [22]:
# Demo value only: a tiny context length so the walkthrough below stays readable.
# The batching cell further down rebinds block_size to 128 for actual training.
block_size = 8
# A window of block_size+1 tokens provides block_size (context, target) pairs.
train_data[:block_size+1]

tensor([ 676, 1201,   30,  203, 2347,  336, 2752,  807, 2307])

In [23]:
# Illustrate next-token prediction on a single window: y is x shifted one
# position ahead, so the target for the prefix x[:t+1] is y[t].
x = train_data[:block_size]
y = train_data[1:block_size+1]
for t in range(block_size):
    print(f"when input is {x[:t+1]} the target: {y[t]}")

when input is tensor([676]) the target: 1201
when input is tensor([ 676, 1201]) the target: 30
when input is tensor([ 676, 1201,   30]) the target: 203
when input is tensor([ 676, 1201,   30,  203]) the target: 2347
when input is tensor([ 676, 1201,   30,  203, 2347]) the target: 336
when input is tensor([ 676, 1201,   30,  203, 2347,  336]) the target: 2752
when input is tensor([ 676, 1201,   30,  203, 2347,  336, 2752]) the target: 807
when input is tensor([ 676, 1201,   30,  203, 2347,  336, 2752,  807]) the target: 2307


In [13]:
# Batching hyperparameters, read as globals by get_batch below.
# block_size: number of tokens in each sampled sequence.
block_size = 128
# batch_size: number of sequences sampled per call.
batch_size = 32

def get_batch(split):
    """Sample a random batch of (input, target) token windows.

    Args:
        split: "train" selects train_data; any other value selects val_data.

    Returns:
        (x, y): two tensors of shape (batch_size, block_size). Row k of y is the
        same window as row k of x shifted one token to the right, i.e. the
        next-token targets for x.

    Reads the notebook globals train_data, val_data, block_size and batch_size.
    """
    source = train_data if split == "train" else val_data
    # Upper bound is exclusive, so the largest start still leaves room for the
    # one-token-shifted target window inside `source`.
    starts = torch.randint(len(source) - block_size, (batch_size,))
    inputs, targets = [], []
    for start in starts:
        inputs.append(source[start:start + block_size])
        targets.append(source[start + 1:start + block_size + 1])
    return torch.stack(inputs), torch.stack(targets)

In [14]:
import torch.nn as nn

class MiniGPT(nn.Module):
    """Minimal token-level language model: embedding lookup followed by a linear head.

    Both layers act on each position independently, so the logits at a position
    depend only on the token id at that same position (there is no attention or
    positional information in this module).

    Args:
        vocab_size: number of distinct token ids; also the size of the logit axis.
        embed_dim: width of the token embedding.
    """

    def __init__(self, vocab_size, embed_dim):
        super().__init__()
        # Attribute names double as state_dict keys, so they must stay stable
        # for saved checkpoints to load.
        self.token_emb = nn.Embedding(vocab_size, embed_dim)
        self.lm_head = nn.Linear(embed_dim, vocab_size)

    def forward(self, x):
        """Map integer token ids of shape (...) to logits of shape (..., vocab_size)."""
        return self.lm_head(self.token_emb(x))


In [15]:
# Instantiate the model with the tokenizer's vocabulary size and a 256-wide embedding.
model = MiniGPT(vocab_size, embed_dim=256)

In [16]:
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4)

# 5000 optimisation steps, each on a freshly sampled training batch.
for step in range(5000):
    # Clear gradients left over from the previous step before the new backward pass.
    optimizer.zero_grad()

    x, y = get_batch("train")
    logits = model(x)

    # CrossEntropyLoss wants (N, classes) logits and (N,) targets, so the batch
    # and time axes are merged into one.
    loss = loss_fn(logits.reshape(-1, vocab_size), y.reshape(-1))
    loss.backward()
    optimizer.step()

    # Log the loss of the current batch every 500 steps (steps 0, 500, ..., 4500).
    if not step % 500:
        print(step, loss.item())


0 8.746197700500488
500 5.918015003204346
1000 4.947602272033691
1500 4.598130226135254
2000 4.243202209472656
2500 4.285006046295166
3000 4.134488105773926
3500 4.063146591186523
4000 3.9839344024658203
4500 3.9682247638702393


In [19]:
def generate(model, prompt, max_new_tokens=200, tok=None):
    """Sample a continuation of `prompt` from `model`, one token at a time.

    Args:
        model: module mapping a (1, T) tensor of token ids to (1, T, vocab) logits.
        prompt: text to condition on; it is encoded with the tokenizer.
        max_new_tokens: number of tokens to sample and append.
        tok: optional tokenizer exposing `encode(str).ids` and `decode(list[int])`.
            Defaults to the notebook-level `tokenizer`.

    Returns:
        The decoded string for the prompt tokens followed by the sampled tokens.

    Raises:
        ValueError: if tokens are requested but the prompt encodes to zero tokens,
            since there is then no last position to sample from.
    """
    if tok is None:
        tok = tokenizer  # fall back to the notebook's global tokenizer

    # Build the input on the model's own device so a GPU-resident model works too.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    ids = tok.encode(prompt).ids
    x = torch.tensor(ids, dtype=torch.long, device=device).unsqueeze(0)
    if max_new_tokens > 0 and x.shape[1] == 0:
        raise ValueError("prompt encodes to zero tokens; cannot sample a continuation")

    # Switch to eval mode for sampling, then restore whatever mode the caller had,
    # so calling generate() mid-training does not silently leave the model in eval.
    was_training = model.training
    model.eval()
    try:
        # Inference only: no_grad avoids building an autograd graph on every step.
        with torch.no_grad():
            for _ in range(max_new_tokens):
                logits = model(x)
                # Only the last position's distribution predicts the next token.
                probs = torch.softmax(logits[:, -1, :], dim=-1)
                next_id = torch.multinomial(probs, 1)
                x = torch.cat((x, next_id), dim=1)
    finally:
        model.train(was_training)

    return tok.decode(x[0].tolist())

In [26]:
# Sample a continuation of the prompt with the default max_new_tokens (200).
# Sampling uses torch.multinomial and no seed is set in the visible cells, so the
# text differs from run to run.
print(generate(model, "God have mercy on us!"))

God have mercy on us! and more sudden the tribunes at the ground stir in Rome itself forswore may say, lords; butts till, I hope I may I think I know Claudio.
Is of England are writ, it
Ay;
I may they live
BENVOLIO:
GLOUCESTER:

First His to give me regre
Ay of them one that, and way, remember them
To London, sir, by ne'Tis beauty,
On struck res prosperous days ago in arms;

Yet,
My father was pleased my brother died,
Defest.
Forgage, though that name you, it's be thakest thy world's with death us:
Then pith; so men is become ye, orian fray himself.
Then thus my Antisting:


Consort brave country's home.
d:
That seest. Frice hand:
As gracious lord, the victress.

I never brook


In [21]:
# Persist only the model's state_dict (not the Python class); reloading requires
# constructing MiniGPT with the same vocab_size/embed_dim and calling load_state_dict.
torch.save(model.state_dict(), "gpt_shakespeare.pt")