<a href="https://colab.research.google.com/github/Benteaux/karpathy-tutorials/blob/main/buildGPT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# tiny shakespeare
!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

--2024-02-17 02:01:23--  https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.109.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: ‘input.txt’


2024-02-17 02:01:24 (26.6 MB/s) - ‘input.txt’ saved [1115394/1115394]



In [None]:
with open('input.txt', 'r', encoding = 'utf-8') as f:
  text = f.read()

In [None]:
len(text)

1115394

In [None]:
batch_size = 16
block_size = 128
max_iters = 5000
eval_interval = max_iters // 10
eval_iters = 100
n_embd = 192
n_head = 4
n_layer = 4
dropout = 0.2
lr = 3e-3
chars = sorted(list(set(text)))
vocab_size = len(chars)
print(''.join(chars))
print(vocab_size)

In [None]:
stoi = {s:i for i,s in enumerate(chars)}
itos = {i:s for i, s in enumerate(chars)}
encode = lambda s: [stoi[c] for c in s]
decode = lambda l: ''.join([itos[i] for i in l])

In [None]:
import torch
data = torch.tensor(encode(text), dtype = torch.long)
print(data.shape)

torch.Size([1115394])


In [None]:
data[:100]

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59])

In [None]:
n = int(0.9 * len(data))
train_data = data[:n]
val_data = data[n:]

In [None]:
torch.manual_seed(1337)
batch_size = 4
block_size = 8

def get_batch(split):
  data = train_data if split == 'train' else val_data
  ix = torch.randint(len(data) - block_size, (batch_size,))
  x = torch.stack([data[i:i+block_size] for i in ix])
  y = torch.stack([data[i+1:i+block_size+1] for i in ix])
  return x, y

xb, yb = get_batch('train')
print(f'inputs:\n{xb.shape}\n{xb}\ntargets: \n{yb.shape}\n {yb}')

inputs:
torch.Size([4, 8])
tensor([[24, 43, 58,  5, 57,  1, 46, 43],
        [44, 53, 56,  1, 58, 46, 39, 58],
        [52, 58,  1, 58, 46, 39, 58,  1],
        [25, 17, 27, 10,  0, 21,  1, 54]])
targets: 
torch.Size([4, 8])
 tensor([[43, 58,  5, 57,  1, 46, 43, 39],
        [53, 56,  1, 58, 46, 39, 58,  1],
        [58,  1, 58, 46, 39, 58,  1, 46],
        [17, 27, 10,  0, 21,  1, 54, 39]])


In [None]:
# for occasionally evaluating the overall loss while training
@torch.no_grad()
def estimate_loss():
  out = {}
  model.eval() # set model to evaluation mode
  for split in ['train', 'val']:
    losses = torch.zeros(eval_iters)
    for k in range(eval_iters):
      x, y = get_batch(split)
      logits, loss = model(x, y)
      losses[k] = loss.item()
    out[split] = losses.mean()
  model.train() # set model to training mode
  return out

In [None]:
import torch.nn as nn
from torch.nn import functional as F
torch.manual_seed(1337)

class Head(nn.Module):
  def __init__(self, n_heads):
    super().__init__()
    self.key = nn.Linear(n_embd, n_heads, bias = False)
    self.query = nn.Linear(n_embd, n_heads, bias = False)
    self.value = nn.Linear(n_embd, n_heads, bias = False)
    self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))
    self.dropout = nn.Dropout(dropout)

  def forward(self, x):
    B,T,C = x.shape
    k = self.key(x) # (B, T, C)
    q = self.key(x) # (B, T,C)
    v = self.key(x) # (B, T, C)
    product = q @ k.transpose(-2, -1) # (B, T, T)
    scaled = product * C ** -0.5
    # print(f'scaled shape: {scaled.shape}')
    # print(f'x shape: {x.shape}')
    wei = scaled.masked_fill(scaled[:, :T, :T] == 0, float('-inf'))
    wei = F.softmax(wei, dim = -1)
    wei = self.dropout(wei)
    result = wei @ v # (B, T, T) @ (B, T, C) --> (B, T, C)
    return result


In [None]:
class MultiHeadAttention(nn.Module):

  def __init__(self, num_heads, head_size):
    super().__init__()
    self.heads = nn.ModuleList([Head(head_size) for h in range(num_heads)])
    self.proj = nn.Linear(n_embd, n_embd)
    self.dropout = nn.Dropout(dropout)
  def forward(self, x):
    out = torch.cat([h(x) for h in self.heads], dim = -1)
    out = self.dropout(self.proj(out))
    return out




In [None]:
class LayerNorm1d(nn.Module):

  def __init__(self, dim, eps = 1e-5):
    super().__init__()
    self.eps = eps
    self.gamma = torch.ones(dim)
    self.beta = torch.zeros(dim)

  def forward(self, x):
    xmean = x.mean(1, keepdim = True) # row mean
    y = x + self.eps
    xvar = y.var(1, keepdim = True) # row variance
    xhat = (x - xmean) / torch.sqrt(xvar + self.eps)
    print(f'xmean: {xmean}')
    print(f'xvar: {xvar}')
    self.out = self.gamma * xhat + self.beta
    return self.out

  def parameters(self):
    return [self.gamma, self.beta]

In [None]:
class FeedForward(nn.Module):

  def __init__(self, n_embd):
    super().__init__()
    self.net = nn.Sequential(
        nn.Linear(n_embd, 4 *n_embd),
        nn.ReLU(),
        nn.Linear(4*n_embd, n_embd),
        nn.Dropout(dropout)
    )

  def forward(self, x):
    return self.net(x)

In [None]:
class Block(nn.Module):
  def __init__(self, n_embd, n_heads):
    super().__init__()
    head_size = n_embd // n_heads
    self.sa = MultiHeadAttention(n_heads, head_size)
    self.ffwd = FeedForward(n_embd)
    self.ln1 = nn.LayerNorm(n_embd)
    self.ln2 = nn.LayerNorm(n_embd)

  def forward(self, x):
    x = x + self.sa(self.ln1(x)) # skip / residual connections
    x = x + self.ffwd(self.ln2(x))
    return x


In [None]:

torch.manual_seed(1337)

class BigramLanguageModel(nn.Module):

  def __init__(self):
    super().__init__()
    self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
    self.position_embedding_table = nn.Embedding(block_size, n_embd)
    self.blocks = nn.Sequential(*[Block(n_embd, n_head) for _ in range(n_layer)])
    self.ln = nn.LayerNorm(n_embd)
    self.lm_head = nn.Linear(n_embd, vocab_size)

  def forward(self, idx, targets = None):
    # idx & targets are (B, T) integer tensors
    embeddings = self.token_embedding_table(idx) # (B, T, C)
    B,T,C = embeddings.shape
    position_embeddings = self.position_embedding_table(torch.arange(T)) # (T, C)
    x = embeddings + position_embeddings # (B, T, C)
    x = self.blocks(x)
    x = self.ln(x)
    logits = self.lm_head(x) # (B, T, vocab_size)
    if targets is None:
      loss = None
    else:
      B, T, C = logits.shape
      logits = logits.view(B*T, C)
      targets = targets.view(B*T)
      loss = F.cross_entropy(logits, targets) # pytorch expects (B, T)

    return logits, loss

  def generate(self, idx, max_new_tokens):
    # idx is (B, T)
    for _ in range(max_new_tokens):
      idx = idx[:, -block_size:]

      logits, loss = self(idx)

      logits = logits[:, -1, :] # (B, C), taking the last timestep and getting rid of the time dimension
      probs = F.softmax(logits, dim = 1) # (B, C)
      idx_next = torch.multinomial(probs, num_samples = 1) # (B, 1)

      idx = torch.cat((idx, idx_next), dim = 1) # (B, T + 1)

    return idx

model = BigramLanguageModel()
logits, loss = model(xb, yb)
print(logits.shape, loss)

torch.Size([256, 65]) tensor(4.2651, grad_fn=<NllLossBackward0>)


In [None]:
idx = torch.zeros((1, 1), dtype = torch.long) # 1 batch w/ idx 1, the newline character
print(decode(model.generate(idx, max_new_tokens = 100)[0].tolist()))
# indexing [0] b/c we want all the timesteps from the first (our only) batch

fKRAK?Tzw


In [None]:
# pytrorch optimizer
opt = torch.optim.Adam(model.parameters(), lr = lr)

In [None]:
for steps in range(max_iters):

  if steps % eval_interval == 0:
    losses = estimate_loss()
    print(f'step {steps}: train loss {losses["train"]:.4f}, val loss {losses["val"]:.4f}')

  # sample batch of data
  xb, yb = get_batch('train')

  # evaluate loss
  logits, loss = model(xb, yb)
  opt.zero_grad(set_to_none = True)
  loss.backward()
  opt.step()

print(loss.item())

step 0: train loss 4.2518, val loss 4.2608
step 500: train loss 0.3454, val loss 0.3481
step 1000: train loss 0.3362, val loss 0.3407
step 1500: train loss 0.3218, val loss 0.3255
step 2000: train loss 0.3163, val loss 0.3196
step 2500: train loss 0.3133, val loss 0.3126
step 3000: train loss 0.3127, val loss 0.3126
step 3500: train loss 0.3016, val loss 0.3037
step 4000: train loss 0.3047, val loss 0.3063
step 4500: train loss 0.3027, val loss 0.3030
0.30612465739250183


In [None]:
idx = torch.zeros((1, 1), dtype = torch.long) # 1 batch w/ idx 1, the newline character
print(decode(model.generate(idx, max_new_tokens = 500)[0].tolist()))

rean ilt 


Suggested exercises:
- EX1: The n-dimensional tensor mastery challenge: Combine the `Head` and `MultiHeadAttention` into one class that processes all the heads in parallel, treating the heads as another batch dimension (answer is in nanoGPT).
- EX2: Train the GPT on your own dataset of choice! What other data could be fun to blabber on about? (A fun advanced suggestion if you like: train a GPT to do addition of two numbers, i.e. a+b=c. You may find it helpful to predict the digits of c in reverse order, as the typical addition algorithm (that you're hoping it learns) would proceed right to left too. You may want to modify the data loader to simply serve random problems and skip the generation of train.bin, val.bin. You may want to mask out the loss at the input positions of a+b that just specify the problem using y=-1 in the targets (see CrossEntropyLoss ignore_index). Does your Transformer learn to add? Once you have this, swole doge project: build a calculator clone in GPT, for all of +-*/. Not an easy problem. You may need Chain of Thought traces.)
- EX3: Find a dataset that is very large, so large that you can't see a gap between train and val loss. Pretrain the transformer on this data, then initialize with that model and finetune it on tiny shakespeare with a smaller number of steps and lower learning rate. Can you obtain a lower validation loss by the use of pretraining?
- EX4: Read some transformer papers and implement one additional feature or change that people seem to use. Does it improve the performance of your GPT?