In [1]:
device = 'mps'

In [2]:
with open("data/tinychen.txt", "r") as f:
    tinychen = f.read()
    
print('Dataset length: ', len(tinychen))
alphabet = sorted(list(set(tinychen)))
print('Alphabet: ', ''.join(alphabet))
print('Alphabet length: ', len(alphabet))
vocab_size=len(alphabet)
itos = {i:s for i,s in enumerate(alphabet)}
stoi = {s:i for i,s in itos.items()}
encode = lambda s: [stoi[c] for c in s]
decode = lambda t: ''.join([itos[tk] for tk in t])

Dataset length:  906980
Alphabet:  
 !"#$%&'()*+,-./0123456789:;=?ABCDEFGHIJKLMNOPQRSTUVWXYZ_abcdefghijklmnopqrstuvwxyz°³»ÀÇÈÉÊÎÔÖàáâãäçèéêëíîïòóôöøùúûüāćčĒğıōœšеḥ ​‎–—‘’… ∞
Alphabet length:  138


In [3]:
import torch
data = torch.tensor(encode(tinychen), dtype=torch.long)
n = int(0.9*len(data))
train_data = data[:n]
test_data = data[n:]

In [4]:
batch_size = 4
block_size = 8

def get_batch(split):
    data = train_data if split == 'train' else test_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    x =x.to(device)
    y = y.to(device)
    return x,y

xb, yb = get_batch('train')
xb.shape


torch.Size([4, 8])

In [5]:
import torch.nn as nn
from torch.nn import functional as F

class BigramLM(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()
        self.embedding_table = nn.Embedding(vocab_size, vocab_size)
    
    def forward(self, idx, targets=None):
        
        logits = self.embedding_table(idx)
        if targets == None:
            loss = None
        else:
            loss = F.cross_entropy(logits.view(-1, logits.shape[2]), targets.view(-1))
        return logits, loss

    def generate(self, idx, max_new_tokens):
        for _ in range(max_new_tokens):
            logits, _ = self(idx, )
            logits = logits[:,-1,:]
            probs = F.softmax(logits, dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1)
            idx = torch.concat((idx, idx_next), dim=1)
        return idx

m = BigramLM(vocab_size)
m = m.to(device)
logits, loss = m(xb, yb)
print(loss.item())
print(decode(m.generate(torch.zeros((1,1), dtype=torch.long, device=device), max_new_tokens=100)[0].tolist()))

5.48573637008667

R*t,é=1Y)OÉ³bÖCëBySRqS_lcœù0…zÊU9?Fy"Xã+Tp’MÖdUœy+*_û‎Ulœ/o:áX’MBF#K:ğFyx#èH+lÀt,oè,8DÔRjT#òḥ…f;2èX_


In [9]:
optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3)
batch_size = 32
for _ in range(1000):
    m.zero_grad(True)
    xb, yb = get_batch('train')
    logits, loss = m(xb,yb)
    loss.backward()
    optimizer.step()

In [10]:
x, y = get_batch('test')
logits, loss = m(x,y)
print(loss.item())
print(decode(m.generate(torch.zeros((1,1), dtype=torch.long, device=device), max_new_tokens=100)[0].tolist()))

3.5339064598083496

M:àTowœ#5ıT0œ–Jwçíğøœč—’ve=∞t89 afVTøf4Kōè%čyíãte,sùlaue2еGGSpdPùSć&yP 7šn8òïòc )…ı‘?2LW–Un=…äx∞:$o³


# Self Attention

In [8]:
B,T,C = 4,8,32
x = torch.randn(B,T,C)
tril = torch.tril(torch.ones(T,T))
wei2 = torch.zeros((T,T))
wei2 = wei2.masked_fill(tril == 0, float('-inf'))
wei2 = F.softmax(wei2, 1)
xbow3 = wei2 @ x
torch.allclose(xbow, xbow3)

NameError: name 'xbow' is not defined

In [115]:
a = torch.tensor([[1,0,0], [0.5,0.5,0], [1/3, 1/3, 1/3]])
b = torch.randint(0,10, (3,2)).float()
c = a @ b
print('a = ')
print(a)
print('b = ')
print(b)
print('c = ')
print(c)

a = 
tensor([[1.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000],
        [0.3333, 0.3333, 0.3333]])
b = 
tensor([[4., 6.],
        [6., 5.],
        [3., 9.]])
c = 
tensor([[4.0000, 6.0000],
        [5.0000, 5.5000],
        [4.3333, 6.6667]])


In [14]:
B,T,C = 4, 8, 32
x = torch.randn(B,T,C)

head_size = 16
key = nn.Linear(C, head_size, bias=False)
query = nn.Linear(C, head_size, bias=False)
value = nn.Linear(C, head_size, bias=False)
k = key(x)
q = query(x)
wei = q @ k.transpose(-2, -1) * head_size**-0.5
tril = torch.tril(torch.ones(T,T))
wei = wei.masked_fill(tril == 0,float('-inf'))
wei = F.softmax(wei, -1)
out = wei @ value(x)
out

tensor([[[-5.4956e-01,  4.8846e-01,  1.1604e-01,  4.7534e-01, -8.5612e-01,
          -1.4982e-01, -3.7018e-01,  1.4848e-01, -2.1831e-01,  5.4938e-01,
           9.2951e-01,  1.5742e-01, -4.6831e-01,  4.0962e-01, -6.1533e-01,
           2.8312e-01],
         [-5.2120e-01,  6.9884e-01, -3.4801e-01,  3.5749e-01, -5.4268e-01,
           4.3744e-01, -4.9831e-01,  4.3937e-02, -4.7958e-01, -1.4578e-01,
           6.8815e-01,  1.9685e-01, -1.7646e-01,  3.4085e-01, -2.5280e-02,
           1.5239e-01],
         [-4.0817e-01,  5.1237e-01, -4.8422e-01,  1.0110e-01, -3.0235e-01,
           3.8226e-01, -6.2279e-01, -1.6023e-01, -7.2873e-01, -9.7741e-02,
           4.4068e-01,  2.6522e-01, -8.0118e-02, -3.0906e-01,  2.0049e-01,
           2.9922e-01],
         [-1.5727e-01,  6.3299e-01,  6.7453e-02,  2.1275e-02, -1.9719e-01,
           2.4416e-01, -8.0245e-01, -5.9525e-01, -4.6648e-01, -1.5551e-01,
           5.2784e-01,  7.4219e-01,  2.7955e-01, -4.0823e-01, -3.8099e-01,
           1.8801e-01],
    

In [None]:
import torch
print(f"Is MPS available: {torch.backends.mps.is_available()}")
print(f"Current device: {torch.device('mps' if torch.backends.mps.is_available() else 'cpu')}")
print(torch.cuda.is_available())

Is MPS available: True
Current device: mps


AttributeError: module 'torch.backends.cuda' has no attribute 'is_available'