In [1]:
with open("input.txt", "r") as file:
    text = file.read()

print("Input data length in characters:", len(text))

chars = sorted(list(set(text)))
vocab_size = len(chars)
# print("Unique characters:", chars)
print("Vocabulary size:", vocab_size)

Input data length in characters: 1115394
Vocabulary size: 65


In [2]:
import torch
print(torch.__version__)
print(torch.backends.mps.is_available())


2.9.1
True


In [3]:
import torch
# device = 'cuda' if torch.cuda.is_available() else 'cpu'
# print("Using device:", device)

device = 'mps' if torch.backends.mps.is_available() else 'cpu'
print("Using device:", device)

Using device: mps


In [4]:
# create a mapping from characters to indices and vice versa
char_to_idx = {ch: i for i, ch in enumerate(chars)}
idx_to_char = {i: ch for i, ch in enumerate(chars)}
encode = lambda s: [char_to_idx[c] for c in s]
decode = lambda l: ''.join([idx_to_char[i] for i in l])

# import tiktoken
# tiktokenizer = tiktoken.get_encoding("gpt2")
# print(tiktokenizer.encode("Hello, world!"))
# vocab_size = tiktokenizer.n_vocab

In [5]:
# wrap into a torch tensor
import torch
data = torch.tensor(encode(text), dtype=torch.int64)
print(data.shape, data.dtype)

torch.Size([1115394]) torch.int64


In [6]:
from sklearn.model_selection import train_test_split
train, test = train_test_split(data, test_size=0.1, random_state=42)
print("Train data length in tokens:", len(train))
print("Test data length in tokens:", len(test))

Train data length in tokens: 1003854
Test data length in tokens: 111540


In [7]:
block_size = 8
x = train[:block_size]
y = train[1:block_size + 1]
for b in range(block_size):
    context = x[:b + 1]
    target = y[b]
    print(f"when input is {context.tolist()} the target: {target.item()}")

when input is [47] the target: 58
when input is [47, 58] the target: 50
when input is [47, 58, 50] the target: 1
when input is [47, 58, 50, 1] the target: 46
when input is [47, 58, 50, 1, 46] the target: 52
when input is [47, 58, 50, 1, 46, 52] the target: 42
when input is [47, 58, 50, 1, 46, 52, 42] the target: 1
when input is [47, 58, 50, 1, 46, 52, 42, 1] the target: 10


In [8]:
torch.manual_seed(1337)
batch_size = 4
block_size = 8

def get_batch(split):
    data = train if split == 'train' else test
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i: i+block_size] for i in ix])
    y = torch.stack([data[i+1: i+block_size+1] for i in ix])
    return x.to(device), y.to(device)

xb, yb = get_batch('train')
print("inputs:", xb.shape)
print(xb)
print("targets:", yb.shape)
print(yb)

inputs: torch.Size([4, 8])
tensor([[58, 46, 56, 57, 59, 32,  0, 58],
        [57, 59, 46,  1, 32, 59, 39,  1],
        [56,  1, 40, 61, 61,  1, 63,  1],
        [ 1, 59, 56, 45,  1, 45, 42, 47]], device='mps:0')
targets: torch.Size([4, 8])
tensor([[46, 56, 57, 59, 32,  0, 58, 42],
        [59, 46,  1, 32, 59, 39,  1, 43],
        [ 1, 40, 61, 61,  1, 63,  1, 44],
        [59, 56, 45,  1, 45, 42, 47,  1]], device='mps:0')


In [9]:
import torch
import torch.nn as nn
from torch.nn import functional as F

class BigramLanguageModel(nn.Module):
    def __init__(self, vocab_size, n_mebed):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, n_mebed)
        self.positional_embedding_table = nn.Embedding(block_size, n_mebed)
        self.lm_head = nn.Linear(n_mebed, vocab_size)

    def forward(self, batch, target):
        B, T = batch.shape

        pos = self.positional_embedding_table(torch.arange(T, device=device)) # (T, n_embed)
        logits = self.token_embedding_table(batch) # (B, T, n_embed)
        x = logits + pos # (B, T, n_embed)
        preds = self.lm_head(x) # (B, T, vocab_size)

        if target is None:
            return preds, None
        
        B, T, C = preds.shape
        preds = preds.view(B*T, C)
        target = target.view(B*T)
        loss = F.cross_entropy(preds, target)
        return preds, loss

    def generate(self, batch, max_new_tokens):
        for _ in range(max_new_tokens):
            logits, _ = self(batch, None)
            logits = logits[:, -1, :]
            probs = F.softmax(logits, dim=-1)
            next_token = torch.multinomial(probs, num_samples=1)
            batch = torch.cat((batch, next_token), dim=1)
        return batch

xb, yb = get_batch('train')
BLM = BigramLanguageModel(32, vocab_size)
BLM.to(device)

logits, loss = BLM(xb, yb)
print("logits shape:", logits.shape)  # (batch_size, block_size, vocab_size)
print("logits:", logits)
print("loss:", loss)

idx = torch.zeros((1,1), dtype=torch.long, device=device)
decoded_text = decode(BLM.generate(idx, max_new_tokens=100)[0].tolist())
print(decoded_text)


logits shape: torch.Size([32, 32])
logits: tensor([[-0.0498, -1.3200,  0.4434,  ..., -0.6524, -0.1091,  0.4586],
        [ 0.0875, -0.0524,  0.0993,  ..., -0.0978,  1.0993, -0.4966],
        [-0.1554,  0.0685,  0.4702,  ...,  0.0821, -0.8721, -0.2592],
        ...,
        [ 1.2315,  0.2733, -0.5318,  ...,  0.1619,  0.8393,  0.3330],
        [ 0.0587,  0.2208,  0.6501,  ..., -0.8241, -0.6447,  0.1272],
        [ 0.4135,  0.0210, -0.2151,  ..., -0.8818,  0.8399, -0.4806]],
       device='mps:0', grad_fn=<ViewBackward0>)
loss: tensor(1.4265, device='mps:0', grad_fn=<NllLossBackward0>)

:NAG&3;K?!MC-,EK'EP-QDRM&
OOD!KKE& .N$:QQ---E3ECOIHH?J K.:-AQ;CDKM!MRAOAO:D PIHN!A&?!.!OA3QFCJKPL?DL


In [10]:
eval_iters = 100
optimizer = torch.optim.AdamW(BLM.parameters(), lr=1e-3)

# telling pytorch we don't need gradients
@torch.no_grad()
def estimate_loss():
    out = {}
    BLM.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            xb, yb = get_batch(split)
            _, loss = BLM(xb, yb)
            losses[k] = loss.item()
        out[split] = losses.mean()
    BLM.train()
    return out

batch_size = 32
for step in range(10000):
    if step % 1000 == 0:
        losses = estimate_loss()
        print(f"step {step}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")
    
    xb, yb = get_batch('train')
    _, loss = BLM(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

print(f"loss {loss.item()}")

step 0: train loss 1.1168, val loss 1.1212
step 1000: train loss 0.6392, val loss 0.6203
step 2000: train loss 0.6244, val loss 0.6289
step 3000: train loss 0.6409, val loss 0.6463
step 4000: train loss 0.6206, val loss 0.6236
step 5000: train loss 0.6466, val loss 0.6300
step 6000: train loss 0.6361, val loss 0.6250
step 7000: train loss 0.6328, val loss 0.6183
step 8000: train loss 0.6280, val loss 0.6228
step 9000: train loss 0.6401, val loss 0.6348
loss 0.6672463417053223


In [11]:
idx = torch.zeros((1,1), dtype=torch.long, device=device)
decoded_text = decode(BLM.generate(idx, max_new_tokens=500)[0].tolist())
print(decoded_text)


,P
LA  
 .O;JM.PPD:AH PF
S
 !G$RGBPM!O&:
EAJFF3NJI-EDNRL:
BH,L?G M!MSO? NCFKBEE,F-DEKFPIB;!
!'3:Q-;& D.FK
$:KQQONQRF$RQ'DIEK3L-
H3S&QD!
'&.
LF!A'&  B,F??Q3LFJE :,:!:PLNLMS;& SB
'?Q'& &BMLDP3B
'QD,-P
SO;MKQNFM?O

DCMMLI

JK&E C
!P!BO
 E
3OLOEEBELBPJN OHJDH-RQ& !,KRI',MNK&!QHBPF'&M
D!P
GKF-DA'QGAH$3J;F,I';L!F&!COHJDDJGBDDD,C AIES&;!SSLC'G
--$!QD .'&DFCOKQE:ABCH$,;NQEEIOA:EBGML,HKGF3$O'KHFK.F R&IS:FKQ.DSBF:D.OBN?G!A!H.
&AP
D OA-.F3& 
KMCC-K&.'DD.& MCG'NQA!'
HRGI-:.PCG .GBMHAPMRD$'ICF:?RAH3:BOC;ELKS


### Self Attention

In [12]:
import torch

torch.manual_seed(1337)

B, T, C = 4, 8, 32
x = torch.randn(B, T, C)

tril = torch.tril(torch.ones(T, T))
print(tril)
weights = torch.zeros((T, T))
weights = weights.masked_fill(tril == 0, float('-inf'))
weights = F.softmax(weights, dim=1)
print(weights)

tensor([[1., 0., 0., 0., 0., 0., 0., 0.],
        [1., 1., 0., 0., 0., 0., 0., 0.],
        [1., 1., 1., 0., 0., 0., 0., 0.],
        [1., 1., 1., 1., 0., 0., 0., 0.],
        [1., 1., 1., 1., 1., 0., 0., 0.],
        [1., 1., 1., 1., 1., 1., 0., 0.],
        [1., 1., 1., 1., 1., 1., 1., 0.],
        [1., 1., 1., 1., 1., 1., 1., 1.]])
tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3333, 0.3333, 0.3333, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2500, 0.2500, 0.2500, 0.2500, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2000, 0.2000, 0.2000, 0.2000, 0.2000, 0.0000, 0.0000, 0.0000],
        [0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.0000, 0.0000],
        [0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.0000],
        [0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250]])


In [13]:
# single attention head
# query and key vectors
head_size = 16  # hyperparameter
key = nn.Linear(C, head_size, bias=False)
query = nn.Linear(C, head_size, bias=False)
value = nn.Linear(C, head_size, bias=False)
k = key(x)   # (B, T, head_size)
q = query(x) # (B, T, head_size)

wei = q @ k.transpose(-2, -1) # (B, T, T)


tril = torch.tril(torch.ones(T, T))
wei = wei.masked_fill(tril == 0, float('-inf'))
wei = F.softmax(wei, dim=1)
print(wei)

v = value(x) # (B, T, head_size)
out = wei @ v # (B, T, head_size)
print(out.shape)  # (B, T, head_size)

tensor([[[0.0248, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.0052, 0.0091, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.0521, 0.0135, 0.2482, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.3171, 0.0214, 0.1642, 0.1188, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.0412, 0.0487, 0.1046, 0.0742, 0.2000, 0.0000, 0.0000, 0.0000],
         [0.1060, 0.5347, 0.2059, 0.1030, 0.7402, 0.0192, 0.0000, 0.0000],
         [0.4298, 0.3409, 0.1769, 0.2027, 0.0480, 0.8472, 0.2329, 0.0000],
         [0.0238, 0.0316, 0.1002, 0.5013, 0.0117, 0.1336, 0.7671, 1.0000]],

        [[0.0443, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.0042, 0.0375, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.0560, 0.0210, 0.2496, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.3679, 0.1441, 0.4929, 0.0438, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.0088, 0.1052, 0.0604, 0.5847, 0.2046, 0.0000, 0.0000, 0.0000],
         [0.0367, 0.089

In [14]:
class Head(nn.Module):
    def __init__(self, input_size, head_size = 16, block_size = 8):
        super().__init__()
        self.head_size = head_size
        self.key = nn.Linear(input_size, head_size, bias=False)
        self.query = nn.Linear(input_size, head_size, bias=False)
        self.value = nn.Linear(input_size, head_size, bias=False)
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

    def forward(self, x):
        B, T, C = x.shape
        k = self.key(x)   # (B, T, head_size)
        q = self.query(x) # (B, T, head_size)

        wei = q @ k.transpose(-2, -1) * self.head_size**-0.5 # (B, T, T)
        wei = wei.masked_fill(self.tril == 0, float('-inf'))
        wei = F.softmax(wei, dim=1)
        
        v = self.value(x) # (B, T, head_size)
        out = wei @ v # (B, T, head_size)
        return out

In [15]:
import torch
import torch.nn as nn
from torch.nn import functional as F

class BigramLanguageModel(nn.Module):
    def __init__(self, vocab_size, n_mebed):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, n_mebed)
        self.positional_embedding_table = nn.Embedding(block_size, n_mebed)
        # ensure the self-attention head outputs the same embedding dimension
        self.sa_head = Head(n_mebed, n_mebed, 1)
        self.lm_head = nn.Linear(n_mebed, vocab_size)

    def forward(self, batch, target):
        B, T = batch.shape

        pos = self.positional_embedding_table(torch.arange(T, device=device)) # (T, n_embed)
        logits = self.token_embedding_table(batch) # (B, T, n_embed)
        x = logits + pos # (B, T, n_embed)
        x = self.sa_head(x) # (B, T, n_embed)
        preds = self.lm_head(x) # (B, T, vocab_size)

        if target is None:
            return preds, None
        
        B, T, C = preds.shape
        preds = preds.view(B*T, C)
        target = target.view(B*T)
        loss = F.cross_entropy(preds, target)
        return preds, loss

    def generate(self, batch, max_new_tokens):
        B, T = batch.shape
        for _ in range(max_new_tokens):
            # crop if batch size exceeds block_size
            batch = batch[:, -T:]
            logits, _ = self(batch, None)
            logits = logits[:, -1, :]
            probs = F.softmax(logits, dim=-1)
            next_token = torch.multinomial(probs, num_samples=1)
            batch = torch.cat((batch, next_token), dim=1)
        return batch

xb, yb = get_batch('train')
BLM = BigramLanguageModel(vocab_size, 32)
BLM.to(device)

logits, loss = BLM(xb, yb)
print("logits shape:", logits.shape)  # (batch_size, block_size, vocab_size)
print("logits:", logits)
print("loss:", loss)

idx = torch.zeros((1,1), dtype=torch.long, device=device)
decoded_text = decode(BLM.generate(idx, max_new_tokens=100)[0].tolist())
print(decoded_text)


logits shape: torch.Size([256, 65])
logits: tensor([[-0.0143, -0.0991, -0.6386,  ..., -0.1259,  0.4938, -0.2159],
        [-0.1929,  0.0343, -0.3807,  ..., -0.1594,  0.2260, -0.1756],
        [-0.1592, -0.1227, -0.2956,  ..., -0.2066, -0.1374, -0.1083],
        ...,
        [-0.0792,  0.0890, -0.2257,  ..., -0.2764, -0.1726,  0.0619],
        [-0.2901, -0.1173, -0.3509,  ..., -0.2811, -0.1455,  0.0287],
        [-0.1252,  0.0076, -0.2314,  ..., -0.2811, -0.1033, -0.0159]],
       device='mps:0', grad_fn=<ViewBackward0>)
loss: tensor(4.1650, device='mps:0', grad_fn=<NllLossBackward0>)
DL


In [16]:
BLM = BigramLanguageModel(vocab_size, 32)
BLM.to(device)
eval_iters = 100
optimizer = torch.optim.AdamW(BLM.parameters(), lr=1e-3)

# telling pytorch we don't need gradients
@torch.no_grad()
def estimate_loss():
    out = {}
    BLM.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            xb, yb = get_batch(split)
            _, loss = BLM(xb, yb)
            losses[k] = loss.item()
        out[split] = losses.mean()
    BLM.train()
    return out

batch_size = 32
for step in range(10000):
    if step % 1000 == 0:
        losses = estimate_loss()
        print(f"step {step}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")
    
    xb, yb = get_batch('train')
    _, loss = BLM(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

print(f"loss {loss.item()}")

step 0: train loss 4.1737, val loss 4.1753
step 1000: train loss 0.4393, val loss 0.4405
step 2000: train loss 0.4242, val loss 0.4205
step 3000: train loss 0.4187, val loss 0.4228
step 4000: train loss 0.4161, val loss 0.4166
step 5000: train loss 0.4182, val loss 0.4160
step 6000: train loss 0.4199, val loss 0.4198
step 7000: train loss 0.4174, val loss 0.4095
step 8000: train loss 0.4148, val loss 0.4196
step 9000: train loss 0.4172, val loss 0.4178
loss 0.42248502373695374


In [None]:
class MultiHeadAttention(nn.Module):
    def __init__(self, input_size, head_size, num_heads, block_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(input_size, head_size, block_size) for _ in range(num_heads)])

    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        return out