In [1]:
import torch

In [2]:
!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

--2024-08-31 06:04:18--  https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: ‘input.txt’


2024-08-31 06:04:19 (21.5 MB/s) - ‘input.txt’ saved [1115394/1115394]



In [3]:
# Load the whole corpus into memory as one string.
with open('input.txt', 'r', encoding='utf-8') as corpus_file:
  text = corpus_file.read()

In [4]:
# Corpus size, counted in characters.
num_chars = len(text)
print('len dataset in characters: ', num_chars)

len dataset in characters:  1115394


In [5]:
# Peek at the first 100 characters of the corpus.
preview = text[0:100]
print(preview)

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You


In [6]:
# Vocabulary: every distinct character in the corpus, in sorted order
# (the model works on characters, not on words).
chars = sorted(set(text))
vocab_size = len(chars)
print(''.join(chars))
print(vocab_size)


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
65


In [7]:
# Lookup tables between characters and their integer ids (position in `chars`).
stoi = {ch: i for i, ch in enumerate(chars)}
itos = dict(enumerate(chars))


def encode(s):
  """Map a string to a list of integer token ids, one per character.

  Raises KeyError if `s` contains a character that is not in `stoi`.
  """
  return [stoi[c] for c in s]


def decode(s):
  """Map an iterable of integer token ids back to the string they spell.

  Raises KeyError if an id is not in `itos`.
  """
  return ''.join(itos[i] for i in s)


# Round-trip check: decode(encode(s)) should reproduce s.
print(encode('Hello, world'))
print(decode(encode('Hello, world')))

[20, 43, 50, 50, 53, 6, 1, 61, 53, 56, 50, 42]
Hello, world


In [8]:
# Encode the whole corpus once and keep it as a 1-D tensor of 64-bit token ids.
data = torch.tensor(encode(text), dtype=torch.int64)
print(data.size(), data.dtype)
print(data[0:50])

torch.Size([1115394]) torch.int64
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56])


In [9]:
# Hold out the last 10% of the token stream for validation.
n = int(0.9 * len(data))
train_data, val_data = data[:n], data[n:]

In [10]:
# Context length. A chunk of block_size + 1 tokens yields block_size
# (context, next-token) training examples.
block_size = 8
train_data[0:block_size + 1]

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58])

In [11]:
# Inputs are the first block_size tokens; targets are the same window shifted by one.
x = train_data[:block_size]
y = train_data[1:block_size+1]

# Each prefix of x of length t predicts the token that follows it.
for t in range(1, block_size + 1):
  context, target = x[:t], y[t - 1]
  print(f'when input is {context} the target: {target}')

when input is tensor([18]) the target: 47
when input is tensor([18, 47]) the target: 56
when input is tensor([18, 47, 56]) the target: 57
when input is tensor([18, 47, 56, 57]) the target: 58
when input is tensor([18, 47, 56, 57, 58]) the target: 1
when input is tensor([18, 47, 56, 57, 58,  1]) the target: 15
when input is tensor([18, 47, 56, 57, 58,  1, 15]) the target: 47
when input is tensor([18, 47, 56, 57, 58,  1, 15, 47]) the target: 58


In [12]:
# Fix the RNG so the sampled batches below are reproducible.
torch.manual_seed(42)
# batch_size: independent sequences per batch; block_size: tokens of context per sequence.
batch_size, block_size = 4, 8

def get_batch(split):
  """Sample a random batch of (input, target) windows.

  `split == 'train'` draws from `train_data`; any other value draws from
  `val_data`. Reads the globals `batch_size` and `block_size` at call time.

  Returns a pair of tensors, each of shape (batch_size, block_size); the
  targets are the inputs shifted one position to the right in the source.
  """
  source = train_data if split == 'train' else val_data
  # Pick batch_size random start offsets, leaving room for block_size + 1 tokens.
  starts = torch.randint(len(source) - block_size, (batch_size,))
  inputs = torch.stack([source[s:s + block_size] for s in starts])
  labels = torch.stack([source[s + 1:s + block_size + 1] for s in starts])
  return inputs, labels

xb, yb = get_batch('train')
# Show the sampled inputs and targets together with their shapes.
for title, batch in (('inputs:', xb), ('targets:', yb)):
  print(title)
  print(batch.shape)
  print(batch)

print('-----')

# Spell out every (context, target) example contained in the batch.
for row_x, row_y in zip(xb, yb):
  for t in range(block_size):
    context = row_x[:t+1]
    target = row_y[t]
    print(f'when input is {context.tolist()} the target: {target}')

inputs:
torch.Size([4, 8])
tensor([[57,  1, 46, 47, 57,  1, 50, 53],
        [ 1, 58, 46, 43, 56, 43,  1, 41],
        [17, 26, 15, 17, 10,  0, 32, 53],
        [57, 58,  6,  1, 61, 47, 58, 46]])
targets:
torch.Size([4, 8])
tensor([[ 1, 46, 47, 57,  1, 50, 53, 60],
        [58, 46, 43, 56, 43,  1, 41, 39],
        [26, 15, 17, 10,  0, 32, 53,  1],
        [58,  6,  1, 61, 47, 58, 46,  0]])
-----
when input is [57] the target: 1
when input is [57, 1] the target: 46
when input is [57, 1, 46] the target: 47
when input is [57, 1, 46, 47] the target: 57
when input is [57, 1, 46, 47, 57] the target: 1
when input is [57, 1, 46, 47, 57, 1] the target: 50
when input is [57, 1, 46, 47, 57, 1, 50] the target: 53
when input is [57, 1, 46, 47, 57, 1, 50, 53] the target: 60
when input is [1] the target: 58
when input is [1, 58] the target: 46
when input is [1, 58, 46] the target: 43
when input is [1, 58, 46, 43] the target: 56
when input is [1, 58, 46, 43, 56] the target: 43
when input is [1, 58, 46

In [13]:
# The batch of token ids that will be fed to the model as input.
print(xb) # our input to the transformer

tensor([[57,  1, 46, 47, 57,  1, 50, 53],
        [ 1, 58, 46, 43, 56, 43,  1, 41],
        [17, 26, 15, 17, 10,  0, 32, 53],
        [57, 58,  6,  1, 61, 47, 58, 46]])


In [14]:
# Targets have the same (batch, time) layout as the inputs.
yb.size()

torch.Size([4, 8])

## Bigram Language Model

In [15]:
import torch
import torch.nn as nn
from torch.nn import functional as F
torch.manual_seed(42)

class BigramLanguageModel(nn.Module):
  """Bigram model: the next-token logits depend only on the current token.

  A single (vocab_size x vocab_size) embedding table is used as a lookup:
  row i holds the logits over the next token given that the current token is i.
  """

  def __init__(self, vocab_size):
    super().__init__()
    self.token_embs = nn.Embedding(vocab_size, vocab_size)

  def forward(self, idx, targets=None):
    """Compute next-token logits and, optionally, the cross-entropy loss.

    Args:
      idx: (B, T) integer tensor of token ids.
      targets: optional (B, T) integer tensor of next-token ids.

    Returns:
      (logits, loss). Without targets, logits has shape (B, T, C) and loss is
      None. With targets, logits is flattened to (B*T, C) and loss is a scalar.
    """
    logits = self.token_embs(idx) # Batch, Time, Channels(vocab_size)

    # Identity check: `== None` on a tensor goes through tensor comparison.
    if targets is None:
      return logits, None

    # F.cross_entropy expects (N, C) logits and (N,) targets, so flatten B and T.
    B, T, C = logits.shape
    logits = logits.view(B * T, C)
    # reshape (not view) so non-contiguous targets are accepted too;
    # equivalent to targets.reshape(-1)
    targets = targets.reshape(B * T)
    loss = F.cross_entropy(logits, targets)
    return logits, loss

  @torch.no_grad() # sampling needs no gradients; avoids building autograd graphs
  def generate(self, idx, max_new_tokens):
    """Extend each sequence in `idx` by `max_new_tokens` sampled tokens.

    Args:
      idx: (B, T) integer tensor of token ids used as the starting context.
      max_new_tokens: number of tokens to append to every row.

    Returns:
      (B, T + max_new_tokens) tensor: the input followed by the samples.
    """
    for _ in range(max_new_tokens):
      logits, _loss = self(idx)
      logits = logits[:, -1, :] # B, C (only the last time step matters)
      probs = F.softmax(logits, dim=-1) # B, C
      # sample one next token per row from the predicted distribution
      idx_next = torch.multinomial(probs, num_samples=1) # B, 1
      # append sampled idx to the running sequence
      idx = torch.cat((idx, idx_next), dim=1) # B, T+1
    return idx

# Untrained model: check output shapes, the initial loss, and a raw sample.
m = BigramLanguageModel(vocab_size)
logits, loss = m(xb, yb)
print(logits.shape)
print(loss)
# Continue every row of the batch by 30 tokens and show the third row as text.
generated = m.generate(xb, max_new_tokens=30)
print(decode(generated[2].tolist()))

torch.Size([32, 65])
tensor(4.8865, grad_fn=<NllLossBackward0>)
ENCE:
To$DdSUzOx3LhsV';T&XaCjVEI;iIOxx


In [16]:
# The third input row as text, for comparison with the sample printed above.
decode(xb[2, :].tolist())

'ENCE:\nTo'

In [17]:
# training

# AdamW over all model parameters with a learning rate of 3e-4.
opt = torch.optim.AdamW(params=m.parameters(), lr=3e-4)

In [18]:
# get_batch reads this global, so batches drawn from here on have 32 rows.
batch_size = 32

# Number of optimisation steps; each step uses one freshly sampled batch.
epochs = 10000
for i in range(epochs):
  xb, yb = get_batch('train')

  # Forward pass, then clear stale gradients right before recomputing them.
  logits, loss = m(xb, yb)
  opt.zero_grad(set_to_none=True)
  loss.backward()
  opt.step()

  # Report the loss of the current batch every 1000 steps.
  if not i % 1000:
    print(loss.item())

4.767078399658203
4.377503395080566
4.111602783203125
3.8354837894439697
3.5360138416290283
3.5428123474121094
3.223839044570923
3.0421345233917236
2.9900596141815186
2.853262186050415


In [19]:
# Sample 100 tokens from the trained model, starting from a single token with id 0.
start_tokens = torch.zeros((1, 1), dtype=torch.long)
sampled = m.generate(start_tokens, max_new_tokens=100)
print(decode(sampled[0].tolist()))



SwTW'd'

S:ryAByollloouly-cuen3UM;.cond, at mlesP-aiYxxKuresss, RKERw oul towgheaie;Sl'IItedWr:-GVE


In [20]:
# math trick in self-attention

torch.manual_seed(42)
# Toy dimensions: batch, time, channels (features per token).
B, T, C = 4, 8, 2
x = torch.randn((B, T, C))
x.shape

torch.Size([4, 8, 2])

In [21]:
# 1 version: explicit loops.
# xbow[b, t] is the mean of x[b, 0..t], i.e. of the current and all earlier steps.
xbow = torch.zeros(B, T, C)
for b in range(B):
  for t in range(T):
    xprev = x[b, :t+1] # (t+1, C)
    xbow[b, t] = xprev.mean(dim=0)

In [22]:
# 2 version: the same running mean as a matrix product.
# Row t of wei has weight 1/(t+1) on positions 0..t and 0 on later positions.
wei = torch.ones(T, T).tril()
wei = wei / wei.sum(dim=1, keepdim=True)
xbow2 = torch.matmul(wei, x) # (T, T) @ (B, T, C) = (B, T, C)
torch.allclose(xbow, xbow2)

True

In [23]:
# 3 version (with softmax)
# Start from all-zero scores, set positions after t to -inf, and let softmax
# turn each row into uniform weights over positions 0..t.
tril = torch.ones(T, T).tril()
wei = torch.zeros(T, T).masked_fill(tril == 0, float('-inf'))
wei = wei.softmax(dim=-1)
xbow3 = torch.matmul(wei, x)
torch.allclose(xbow, xbow3)

True

In [24]:
# Learned position embeddings: one n_embd-sized vector per position 0..block_size-1.
B, T, C = 4, 8, 2
block_size = 8
n_embd = 3

position_embs = nn.Embedding(num_embeddings=block_size, embedding_dim=n_embd)
positions = torch.arange(T)
pos_emb = position_embs(positions) # (T, n_embd)
pos_emb

tensor([[ 1.4451,  0.8564,  2.2181],
        [ 0.5232,  0.3466, -0.1973],
        [-1.0546,  1.2780,  0.7281],
        [-0.7106, -0.6021,  0.9604],
        [ 0.4048, -1.3543, -0.4976],
        [ 0.4747, -0.1976,  1.2683],
        [ 1.2243,  0.0981,  1.7423],
        [-1.3527,  0.2191,  0.5526]], grad_fn=<EmbeddingBackward0>)

In [25]:
# version 4 (self-attention)
torch.manual_seed(42)
B, T, C = 4, 8, 32 # (batch, block_size, n_embd)
x = torch.randn(B, T, C)

# single head of self-attention
head_size = 16 # needs to do the following rule: n_embd % head_size == 0
key = nn.Linear(C, head_size, bias=False)
query = nn.Linear(C, head_size, bias=False)
value = nn.Linear(C, head_size, bias=False)
k, q, v = key(x), query(x), value(x) # each (B, T, head_size)

# Scaled dot-product scores: (B, T, head_size) @ (B, head_size, T) --> (B, T, T)
wei = torch.matmul(q, k.transpose(-2, -1)) / (head_size ** 0.5)

# Causal mask: position t may only attend to positions 0..t.
tril = torch.ones(T, T).tril()
wei = wei.masked_fill(tril == 0, float('-inf'))
wei = wei.softmax(dim=-1) # (B, T, T), each row sums to 1
out = torch.matmul(wei, v) # (B, T, head_size)

out.shape

torch.Size([4, 8, 16])