In [48]:
!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

--2025-04-09 18:24:43--  https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.109.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: ‘input.txt.2’


2025-04-09 18:24:43 (25.9 MB/s) - ‘input.txt.2’ saved [1115394/1115394]



In [49]:
import torch
import math

In [50]:
# Load the entire corpus into memory as one UTF-8 decoded string.
with open("input.txt", encoding="utf-8") as corpus_file:
  text = corpus_file.read()

In [51]:
# Character-level vocabulary: each distinct character of the corpus, in sorted order.
chars = sorted(set(text))
vocab_len = len(chars)
print(vocab_len)
print(chars)

65
['\n', ' ', '!', '$', '&', "'", ',', '-', '.', '3', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']


In [52]:
# Lookup tables: character -> integer id, and the inverse id -> character.
stoi = dict(zip(chars, range(len(chars))))
itos = dict(zip(stoi.values(), stoi.keys()))

In [53]:
def encode(s):
    """Translate each character of ``s`` into its integer id using ``stoi``.

    Returns a list of ids in the same order as the characters. A character
    that is not a key of ``stoi`` raises ``KeyError``.
    """
    ids = []
    for ch in s:
        ids.append(stoi[ch])
    return ids

def decode(l):
    """Translate an iterable of integer ids back into a string using ``itos``.

    An id that is not a key of ``itos`` raises ``KeyError``.
    """
    return ''.join(itos[token_id] for token_id in l)

In [54]:
# Round-trip check: decoding the encoded ids should give back the original word.
hello_ids = encode("hello")
print("Sample encoding for 'hello':", hello_ids)
print("Decode back:", decode(hello_ids))

Sample encoding for 'hello': [46, 43, 50, 50, 53]
Decode back: hello


In [55]:
# Encode the whole corpus once, then keep the first 90% for training and the
# remaining tail for validation.
data = torch.tensor(encode(text), dtype=torch.long)
n = int(0.9 * len(data))
train_data, val_data = data[:n], data[n:]

In [56]:
# Small values used by the batch-sampling demo in the next cells; both names are
# reassigned to larger values in the hyperparameter cell further down.
batch_size = 4  # number of windows sampled per batch
block_size = 8  # number of characters in each window

def get_batch(split):
  """Sample a random batch of (input, target) windows from one data split.

  Args:
    split: "train" selects ``train_data``; "val" selects ``val_data``.

  Returns:
    ``(x, y)``: two tensors of shape ``(batch_size, block_size)``. ``y`` holds
    the same windows as ``x`` shifted one position to the right, i.e. the
    next-character target for every input position.

  Raises:
    ValueError: if ``split`` is neither "train" nor "val". Previously any
      other string (including typos such as "Train") silently selected the
      validation data.

  Reads the module-level ``train_data``, ``val_data``, ``batch_size`` and
  ``block_size``. A later cell in this notebook redefines ``get_batch`` with a
  different signature, so this version is only live until that cell runs.
  """
  if split == "train":
    data_split = train_data
  elif split == "val":
    data_split = val_data
  else:
    raise ValueError(f"split must be 'train' or 'val', got {split!r}")

  # torch.randint's upper bound is exclusive, so the largest start index is
  # len - block_size - 1 and the target slice start+1 : start+block_size+1 ends
  # exactly at len. (The old bound subtracted one more and never sampled the
  # last valid window.)
  starts = torch.randint(len(data_split) - block_size, (batch_size,))
  x = torch.stack([data_split[i:i + block_size] for i in starts])
  y = torch.stack([data_split[i + 1:i + block_size + 1] for i in starts])
  return x, y


In [57]:
# Draw one training batch and print each input window above its shifted target.
x, y = get_batch("train")
for input_row, target_row in zip(x, y):
    print(f"\nInput  : {decode(input_row.tolist())}")
    print(f"Target : {decode(target_row.tolist())}")


Input  : r my eye
Target :  my eyes

Input  : ple, whi
Target : le, whic

Input  : e too, b
Target :  too, bu

Input  : as enemi
Target : s enemie


In [58]:
import torch.nn as nn

class TransformerEmbedding(nn.Module):
  """Token embedding summed with a learned positional embedding.

  Args:
    vocab_len: number of rows in the token embedding table.
    embed_dim: width of every embedding vector.
    block_size: number of rows in the positional table, i.e. the longest
      sequence whose positions ``forward`` can look up.
  """

  def __init__(self, vocab_len, embed_dim, block_size):
    super().__init__()
    self.token_embed = nn.Embedding(vocab_len, embed_dim)
    self.pos_embed = nn.Embedding(block_size, embed_dim)

  def forward(self, x):
    """Embed ids ``x`` of shape (B, T) and add the embeddings of positions 0..T-1."""
    _, seq_len = x.shape
    position_ids = torch.arange(seq_len, device=x.device)
    # (B, T, C) + (T, C): the positional rows broadcast over the batch dimension.
    return self.token_embed(x) + self.pos_embed(position_ids)

In [59]:
class PositionalEncoding(nn.Module):
    """Fixed sinusoidal position table: sine on even channels, cosine on odd channels.

    Args:
        embed_dim: number of channels per position; may be even or odd.
        max_len: number of positions to precompute.

    The table is stored as the buffer ``pe`` with shape
    ``(1, max_len, embed_dim)``; as a buffer it is not a trainable parameter.
    """

    def __init__(self, embed_dim, max_len=5000):
        super().__init__()

        pe = torch.zeros(max_len, embed_dim)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)  # (max_len, 1)
        # One frequency per sin/cos channel pair: 10000^(-2i / embed_dim).
        div_term = torch.exp(torch.arange(0, embed_dim, 2).float() * (-math.log(10000.0) / embed_dim))
        angles = position * div_term  # (max_len, ceil(embed_dim / 2))

        pe[:, 0::2] = torch.sin(angles)
        # An odd embed_dim has one fewer odd channel than even channels, so the
        # last frequency column must be dropped for the cosine half; without the
        # slice the assignment raised a shape-mismatch error. For an even
        # embed_dim the slice keeps every column.
        pe[:, 1::2] = torch.cos(angles[:, : embed_dim // 2])
        pe = pe.unsqueeze(0)  # leading dim of 1 so the table broadcasts over a batch

        self.register_buffer('pe', pe)

    def forward(self):
        """Return the whole ``(1, max_len, embed_dim)`` table; callers slice it to length."""
        return self.pe


In [60]:
import torch.nn.functional as F

class SelfAttentionHead(nn.Module):
  """A single head of causal (masked) scaled dot-product self-attention.

  Args:
    embed_dim: channel width of the incoming activations.
    head_size: width of the query/key/value projections and of the output.
    block_size: side length of the precomputed lower-triangular mask.
  """

  def __init__(self, embed_dim, head_size, block_size):
    super().__init__()
    # Bias-free projections, created in query -> key -> value order.
    self.query = nn.Linear(embed_dim, head_size, bias=False)
    self.key = nn.Linear(embed_dim, head_size, bias=False)
    self.value = nn.Linear(embed_dim, head_size, bias=False)

    # Lower-triangular ones: entry (t, s) is 1 when position t may look at s <= t.
    causal = torch.tril(torch.ones(block_size, block_size))
    self.register_buffer("tril", causal)
    self.dropout = nn.Dropout(0.1)

  def forward(self, x):
    """Attend over ``x`` of shape (B, T, embed_dim); returns (B, T, head_size)."""
    _, T, _ = x.shape
    q, k, v = self.query(x), self.key(x), self.value(x)

    # Pairwise query/key scores, scaled by sqrt(head_size).
    scores = (q @ k.transpose(-2, -1)) / (k.shape[-1] ** 0.5)
    # Entries above the diagonal are future positions: set them to -inf so the
    # softmax assigns them zero weight.
    blocked = self.tril[:T, :T] == 0
    weights = F.softmax(scores.masked_fill(blocked, float('-inf')), dim=-1)

    return self.dropout(weights) @ v


In [61]:
class MultiHeadAttention(nn.Module):
  """Several ``SelfAttentionHead`` modules applied to the same input and merged.

  Args:
    num_heads: how many heads to create.
    embed_dim: channel width of the input and of the projected output.
    head_size: output width of each individual head.
    block_size: forwarded to every head for its causal mask.
  """

  def __init__(self, num_heads, embed_dim, head_size, block_size):
    super().__init__()
    head_list = []
    for _ in range(num_heads):
      head_list.append(SelfAttentionHead(embed_dim, head_size, block_size))
    self.heads = nn.ModuleList(head_list)
    # Maps the concatenated head outputs back to the embedding width.
    self.proj = nn.Linear(num_heads * head_size, embed_dim)
    self.dropout = nn.Dropout(0.1)

  def forward(self, x):
    """Run every head on ``x``, concatenate on the last dim, project, apply dropout."""
    per_head = [head(x) for head in self.heads]
    merged = torch.cat(per_head, dim=-1)
    return self.dropout(self.proj(merged))

In [62]:
class FeedForward(nn.Module):
  """Two-layer MLP: Linear -> ReLU -> Linear -> Dropout(0.1).

  Args:
    embed_dim: width of the input and of the output.
    hidden_dim: width of the intermediate layer.
  """

  def __init__(self, embed_dim, hidden_dim):
    super().__init__()
    layers = [
        nn.Linear(embed_dim, hidden_dim),  # expand
        nn.ReLU(),
        nn.Linear(hidden_dim, embed_dim),  # project back
        nn.Dropout(0.1),
    ]
    self.net = nn.Sequential(*layers)

  def forward(self, x):
    """Apply the MLP to the last dimension of ``x``."""
    return self.net(x)

In [63]:
class TransformerBlock(nn.Module):
  """Pre-LayerNorm block: attention, then feed-forward, each with a residual add.

  Args:
    embed_dim: channel width carried through the block.
    num_heads, head_size, block_size: forwarded to ``MultiHeadAttention``.
    ff_hidden_dim: hidden width forwarded to ``FeedForward``.
  """

  def __init__(self, embed_dim, num_heads, head_size, block_size, ff_hidden_dim):
    super().__init__()
    # Sub-layers first, then the LayerNorm applied in front of each of them.
    self.attn = MultiHeadAttention(num_heads, embed_dim, head_size, block_size)
    self.ff = FeedForward(embed_dim, ff_hidden_dim)
    self.ln1 = nn.LayerNorm(embed_dim)
    self.ln2 = nn.LayerNorm(embed_dim)

  def forward(self, x):
    """Return ``x`` after the attention and feed-forward residual updates."""
    attn_out = self.attn(self.ln1(x))
    x = x + attn_out
    ff_out = self.ff(self.ln2(x))
    return x + ff_out


In [64]:
class TransformerModel(nn.Module):
    """Decoder-style language model: embeddings, a stack of blocks, final norm, logits head.

    Args:
        vocab_size: number of token ids; also the width of the output logits.
        embed_dim: channel width used throughout the model.
        block_size: number of positions in the positional table and attention masks.
        num_heads, head_size, ff_hidden_dim: forwarded to every ``TransformerBlock``.
        num_layers: how many blocks to stack.
    """

    def __init__(self, vocab_size, embed_dim, block_size, num_heads, head_size, ff_hidden_dim, num_layers):
        super().__init__()
        self.token_embedding = nn.Embedding(vocab_size, embed_dim)
        self.positional_encoding = PositionalEncoding(embed_dim, block_size)

        layers = []
        for _ in range(num_layers):
            layers.append(
                TransformerBlock(embed_dim, num_heads, head_size, block_size, ff_hidden_dim)
            )
        self.blocks = nn.Sequential(*layers)

        self.ln_f = nn.LayerNorm(embed_dim)
        self.head = nn.Linear(embed_dim, vocab_size)

    def forward(self, idx):
        """Map token ids ``idx`` of shape (B, T) to logits of shape (B, T, vocab_size)."""
        _, T = idx.shape
        # The positional table covers block_size positions; keep only the first T.
        pos_table = self.positional_encoding()
        x = self.token_embedding(idx) + pos_table[:, :T, :]

        x = self.ln_f(self.blocks(x))
        return self.head(x)


In [65]:
# --- data ---
block_size = 128                     # context length in characters
batch_size = 64                      # reassigned to 32 in the training cell below

# --- model ---
embed_dim = 128
num_heads = 4
num_layers = 4
head_size = embed_dim // num_heads   # per-head width, so the heads together span embed_dim
ff_hidden_dim = embed_dim * 4

# --- optimisation ---
learning_rate = 3e-4
num_epochs = 5000                    # NOTE(review): not read by any later cell in view

In [66]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Build the model from the hyperparameters defined above and move it to the device.
model_config = dict(
    vocab_size=vocab_len,
    embed_dim=embed_dim,
    block_size=block_size,
    num_heads=num_heads,
    head_size=head_size,
    ff_hidden_dim=ff_hidden_dim,
    num_layers=num_layers,
)
model = TransformerModel(**model_config).to(device)

loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

In [67]:
def get_batch(data, batch_size, block_size):
    """Sample ``batch_size`` random windows of length ``block_size`` from ``data``.

    Returns ``(x, y)``, both of shape ``(batch_size, block_size)`` and moved to
    the module-level ``device``; ``y`` is ``x`` shifted one position to the right.

    This definition replaces the earlier ``get_batch(split)`` from the demo cell.
    """
    starts = torch.randint(len(data) - block_size, (batch_size,))
    inputs, targets = [], []
    for i in starts:
        inputs.append(data[i:i + block_size])
        targets.append(data[i + 1:i + block_size + 1])
    return torch.stack(inputs).to(device), torch.stack(targets).to(device)

In [68]:
max_iters = 10000
eval_interval = 50  # number of steps between progress prints
batch_size = 32     # NOTE: replaces the batch_size = 64 assigned in the hyperparameter cell

# Ensure training mode (dropout active) no matter what state an earlier cell,
# e.g. generate(), left the model in when this cell is re-run.
model.train()

for step in range(max_iters):
    # Fresh random minibatch from the training split on every step.
    xb, yb = get_batch(train_data, batch_size, block_size)

    logits = model(xb)
    B, T, C = logits.shape
    # CrossEntropyLoss takes (N, C) logits with (N,) targets, so fold the batch
    # and time dimensions together.
    loss = loss_fn(logits.view(B*T, C), yb.view(B*T))

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if step % eval_interval == 0:
        # Loss of the current training batch only, not a validation estimate.
        print(f"Step {step}: loss = {loss.item():.4f}")

Step 0: loss = 4.3766
Step 50: loss = 2.7799
Step 100: loss = 2.6025
Step 150: loss = 2.5374
Step 200: loss = 2.4791
Step 250: loss = 2.4548
Step 300: loss = 2.4308
Step 350: loss = 2.3257
Step 400: loss = 2.3352
Step 450: loss = 2.2982
Step 500: loss = 2.2463
Step 550: loss = 2.2517
Step 600: loss = 2.1965
Step 650: loss = 2.1967
Step 700: loss = 2.1100
Step 750: loss = 2.1339
Step 800: loss = 2.0692
Step 850: loss = 2.0474
Step 900: loss = 2.0350
Step 950: loss = 2.0291
Step 1000: loss = 1.9806
Step 1050: loss = 2.0298
Step 1100: loss = 1.9667
Step 1150: loss = 1.9613
Step 1200: loss = 1.9524
Step 1250: loss = 1.8702
Step 1300: loss = 1.9279
Step 1350: loss = 1.9534
Step 1400: loss = 1.9036
Step 1450: loss = 1.8792
Step 1500: loss = 1.7851
Step 1550: loss = 1.8418
Step 1600: loss = 1.8581
Step 1650: loss = 1.7930
Step 1700: loss = 1.7823
Step 1750: loss = 1.8459
Step 1800: loss = 1.8425
Step 1850: loss = 1.8257
Step 1900: loss = 1.7885
Step 1950: loss = 1.7968
Step 2000: loss = 1.809

In [69]:
@torch.no_grad()
def generate(model, start_token, max_new_tokens, context_size=None):
    """Autoregressively sample ``max_new_tokens`` ids after the prompt ``start_token``.

    Args:
        model: module mapping a ``(1, T)`` LongTensor of ids to ``(1, T, vocab)`` logits.
        start_token: non-empty sequence of integer ids used as the prompt.
        max_new_tokens: number of ids to sample and append.
        context_size: maximum number of trailing ids fed to the model per step.
            ``None`` (the default) uses the module-level ``block_size``.

    Returns:
        list[int]: the prompt ids followed by the sampled ids.

    Raises:
        ValueError: if ``start_token`` is empty (there would be no last position
            to read logits from).

    The model is switched to eval mode while sampling and is put back into
    whatever mode it was in before the call, so sampling in the middle of
    training does not leave dropout disabled.
    """
    if len(start_token) == 0:
        raise ValueError("start_token must contain at least one token id")
    if context_size is None:
        context_size = block_size

    # Build the prompt on the model's own device; fall back to the module-level
    # `device` only for a model that has no parameters to inspect.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else device
    generated = torch.tensor(start_token, dtype=torch.long, device=target_device).unsqueeze(0)

    was_training = model.training
    model.eval()
    try:
        for _ in range(max_new_tokens):
            # Feed at most context_size trailing ids to the model.
            context = generated[:, -context_size:]

            # Only the logits of the last position predict the next token.
            logits = model(context)[:, -1, :]
            probs = torch.softmax(logits, dim=-1)

            next_token = torch.multinomial(probs, num_samples=1)
            generated = torch.cat((generated, next_token), dim=1)
    finally:
        model.train(was_training)

    return generated[0].tolist()

In [70]:
# Prompt the model with a short phrase and print the sampled continuation as text.
prompt = "The king"
start = encode(prompt)
out = generate(model, start, max_new_tokens=100)
print(decode(out))

The king, deaths, come the duke thee?

GLOUCESTER:
Madam, though I wash a wagmany, and sensel in those,
Make
