In [34]:
with open("data/input.txt", 'r', encoding="utf-8") as f:
    text = f.read()

In [35]:
len(text)

1115394

In [36]:
# print(text[:1000])

In [37]:
chars = sorted(list(set(text)))
vocab_size = len(chars)
print(''.join(chars))
print(vocab_size)


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
65


In [38]:
stoi = {ch:i for i, ch in enumerate(chars)}
itos = {i:ch for i, ch in enumerate(chars)}

encoder = lambda s: [stoi[c] for c in s]
decoder = lambda l: ''.join([itos[i] for i in l])


In [39]:
print(encoder('hii there!'))
print(decoder(encoder('hii there!')))


[46, 47, 47, 1, 58, 46, 43, 56, 43, 2]
hii there!


In [40]:
import torch

In [41]:
n = int(0.9*len(text))
data_tensor = torch.tensor(encoder(text), dtype=torch.long)
train_data = data_tensor[:n]
val_data = data_tensor[n:]

In [42]:
block_size = 8
batch_size = 4

In [43]:
train_data[:block_size+1]

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58])

In [44]:
x_demo = train_data[:block_size]
y_demo = train_data[1:block_size+1]

for t in range(block_size):
    context = x_demo[:t+1]
    target = y_demo[t]
    print(f"when input is {context}, the target is: {target}")

when input is tensor([18]), the target is: 47
when input is tensor([18, 47]), the target is: 56
when input is tensor([18, 47, 56]), the target is: 57
when input is tensor([18, 47, 56, 57]), the target is: 58
when input is tensor([18, 47, 56, 57, 58]), the target is: 1
when input is tensor([18, 47, 56, 57, 58,  1]), the target is: 15
when input is tensor([18, 47, 56, 57, 58,  1, 15]), the target is: 47
when input is tensor([18, 47, 56, 57, 58,  1, 15, 47]), the target is: 58


In [45]:
torch.manual_seed(1337)
batch_size = 4
block_size = 8

def get_batch(split):
    data = train_data if split == "train" else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    xb = torch.stack([data[i:i+block_size] for i in ix])
    yb = torch.stack([data[i+1:i+block_size+1] for i in ix])
    return xb, yb


xb, yb = get_batch("train")
print("inputs:")
print(xb.shape)
print(xb)

print("targets:")
print(yb.shape)
print(yb)
print("\n")

for b in range(batch_size):
    for t in range(block_size):
        print(f"for the input {xb[b, :t+1].tolist()}, the target is {yb[b, t]}")

inputs:
torch.Size([4, 8])
tensor([[24, 43, 58,  5, 57,  1, 46, 43],
        [44, 53, 56,  1, 58, 46, 39, 58],
        [52, 58,  1, 58, 46, 39, 58,  1],
        [25, 17, 27, 10,  0, 21,  1, 54]])
targets:
torch.Size([4, 8])
tensor([[43, 58,  5, 57,  1, 46, 43, 39],
        [53, 56,  1, 58, 46, 39, 58,  1],
        [58,  1, 58, 46, 39, 58,  1, 46],
        [17, 27, 10,  0, 21,  1, 54, 39]])


for the input [24], the target is 43
for the input [24, 43], the target is 58
for the input [24, 43, 58], the target is 5
for the input [24, 43, 58, 5], the target is 57
for the input [24, 43, 58, 5, 57], the target is 1
for the input [24, 43, 58, 5, 57, 1], the target is 46
for the input [24, 43, 58, 5, 57, 1, 46], the target is 43
for the input [24, 43, 58, 5, 57, 1, 46, 43], the target is 39
for the input [44], the target is 53
for the input [44, 53], the target is 56
for the input [44, 53, 56], the target is 1
for the input [44, 53, 56, 1], the target is 58
for the input [44, 53, 56, 1, 58], th

In [46]:
import torch
import torch.nn as nn
from torch.nn import functional as F
torch.manual_seed(1337)

class BigramLanguageModel(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()
        # somehow each token reads the logits for the next character from the lookup table 
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)
        
    def forward(self, idx, targets=None):
        # both idx and target are of size (B, T) tensors
        logits = self.token_embedding_table(idx) # (B, T, C)
        
        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C) # channel gotta be the last
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)
        # cross entropy expects the dims to be (B, C, <t1, t2 ...)
        
        return logits, loss
            
    def generate(self, idx, max_new_tokens):
        for _ in range(max_new_tokens):
            logits, loss = self(idx)
            #extract the last time dim which is the prediction for the next token, dim of (B, C)
            logits = logits[:, -1, :] # realize that the else block won't be triggered in the forward method since target is none, o.w. the logits would be (B*T, C) which this line wouldn't work
            # assign the highest prob in the channel dim to the most likely dim to next char
            probs = F.softmax(logits, dim=-1) # (B, C)
            # give the probs as weight to multinomial distr to sample 1, so the idx in C dim with highest prob by softmax is the most likely to be sampled by multinom
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            # In this case, dim=-1 should evaluate to dim=1
            idx = torch.concat((idx, idx_next), dim=-1) # (B, T+1)
        return idx
    
    
m = BigramLanguageModel(vocab_size)
logits, loss = m(xb, yb)
print(logits.shape)
print(loss)

torch.Size([32, 65])
tensor(4.8786, grad_fn=<NllLossBackward0>)


In [47]:
print(decoder(m.generate(torch.zeros((1, 1), dtype=torch.long), max_new_tokens=100)[0].tolist()))


Sr?qP-QWktXoL&jLDJgOLVz'RIoDqHdhsV&vLLxatjscMpwLERSPyao.qfzs$Ys$zF-w,;eEkzxjgCKFChs!iWW.ObzDnxA Ms$3


In [48]:
optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3)

In [49]:
batch_size = 32
for steps in range(10000):
    xb, yb = get_batch("train")
    logits, loss = m(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()
    
print(loss.item())

2.5727508068084717


In [50]:
print(decoder(m.generate(torch.zeros((1, 1), dtype=torch.long), max_new_tokens=300)[0].tolist()))


Iyoteng h hasbe pave pirance
Rie hicomyonthar's
Plinseard ith henoure wounonthioneir thondy, y heltieiengerofo'dsssit ey
KIN d pe wither vouprrouthercc.
hathe; d!
My hind tt hinig t ouchos tes; st yo hind wotte grotonear 'so it t jod weancotha:
h hay.JUCle n prids, r loncave w hollular s O:
HIs; ht 


In [51]:
torch.manual_seed(1337)
B, T, C = 4, 8, 2
x = torch.randn(B, T, C)
x.shape

torch.Size([4, 8, 2])

In [52]:
x_bow = torch.zeros((B, T, C))
for b in range(B):
    for t in range(T):
        xprev = x[b, :t+1] # (t, C)
        x_bow[b, t] = torch.mean(xprev, 0)

In [53]:
torch.tril(torch.ones(3, 3))

tensor([[1., 0., 0.],
        [1., 1., 0.],
        [1., 1., 1.]])

In [54]:
#torch.manual_seed(1337)
a = torch.ones(3, 3)
b = torch.randint(0, 10, (3, 2)).float()
c = a @ b
print("a=")
print(a)
print("\n")
print("b=")
print(b)
print("\n")
print("c=")
print(c)

a=
tensor([[1., 1., 1.],
        [1., 1., 1.],
        [1., 1., 1.]])


b=
tensor([[8., 6.],
        [5., 2.],
        [4., 4.]])


c=
tensor([[17., 12.],
        [17., 12.],
        [17., 12.]])


In [55]:
#torch.manual_seed(1337)
a = torch.tril(torch.ones(3, 3))
b = torch.randint(0, 10, (3, 2)).float()
c = a @ b
print("a=")
print(a)
print("\n")
print("b=")
print(b)
print("\n")
print("c=")
print(c)

a=
tensor([[1., 0., 0.],
        [1., 1., 0.],
        [1., 1., 1.]])


b=
tensor([[7., 4.],
        [5., 0.],
        [5., 3.]])


c=
tensor([[ 7.,  4.],
        [12.,  4.],
        [17.,  7.]])


In [56]:
thesum = torch.sum(a, -1)
print(thesum)
print("\n")
thesum = torch.sum(a, -1, keepdim=True)
print(thesum)

tensor([1., 2., 3.])


tensor([[1.],
        [2.],
        [3.]])


In [57]:
#torch.manual_seed(1337)
a = torch.tril(torch.ones(3, 3))
a = a / torch.sum(a, -1, keepdim=True)
b = torch.randint(0, 10, (3, 2)).float()
c = a @ b
print("a=")
print(a)
print("\n")
print("b=")
print(b)
print("\n")
print("c=")
print(c)

a=
tensor([[1.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000],
        [0.3333, 0.3333, 0.3333]])


b=
tensor([[8., 9.],
        [2., 7.],
        [3., 9.]])


c=
tensor([[8.0000, 9.0000],
        [5.0000, 8.0000],
        [4.3333, 8.3333]])


In [58]:
#v2
## to apply the concept to our case
wei = torch.tril(torch.ones(T, T))
wei = wei / wei.sum(1, keepdim=True)
xbow2 = wei @ x # (T, T) @ (B, T, C) --> torch broadcasts it to (B, T, T) @ (B, T, C) ---> (B, T, C)

In [59]:
torch.allclose(x_bow, xbow2) # checking whether this two are the same

False

In [60]:
tril = torch.tril(torch.ones(T, T))
tril

tensor([[1., 0., 0., 0., 0., 0., 0., 0.],
        [1., 1., 0., 0., 0., 0., 0., 0.],
        [1., 1., 1., 0., 0., 0., 0., 0.],
        [1., 1., 1., 1., 0., 0., 0., 0.],
        [1., 1., 1., 1., 1., 0., 0., 0.],
        [1., 1., 1., 1., 1., 1., 0., 0.],
        [1., 1., 1., 1., 1., 1., 1., 0.],
        [1., 1., 1., 1., 1., 1., 1., 1.]])

In [62]:
#v3
#ultimately softmaxing will be the go to way since the wei is like an affinity, giving how much each token occurred in the past are relevant for the token at hand currently
wei = torch.zeros((T, T))
wei = wei.masked_fill(tril==0, float("-inf"))
wei = F.softmax(wei, dim=-1)
xbow3 = wei @ x
torch.allclose(x_bow, xbow3)

False

In [None]:
#v4 self-attention

torch.manual_seed(1337)
B, T, C = 4, 8, 32 # batch_size, time_steps, channel_dim
x = torch.randn(B, T, C) # the input x, initialized as a random normal tensor of b, t, c

# a single head of self attention(implying key, query and value are all obtained from the same source, x. In cross attention, key and value are obtained from an encoder, then we condition to this external information.)
head_size = 16 # dimensionality of the head output

key = nn.Linear(C, head_size, bias=False)
query = nn.Linear(C, head_size, bias=False)
value = nn.Linear(C, head_size, bias=False)

k = key(x)   # (B, T, head_size)
q = query(x) # (B, T, head_size)

# wei gives the attention values in a data dependent manner, as oppose to assigning equal influence from each token as we did before
wei = q @ k.transpose(-2, -1) # (B, T, head_size) @ (B, head_size, T) pay attention that the dimension B is completely independent and we carry out the dot product independent of dim B

tril = torch.tril(torch.ones(T, T)) # the masking matrix, so that information won't leak from the future tokens
wei = wei.masked_fill(tril==0, float("-inf")) # for softmax to work properly, since values will be both negative and positive
wei = F.softmax(wei, dim=-1)

v = value(x)
out = wei @ v # in the attention head, the x is kinda the private thing, and we funnel its information via value, and we operate upon it.

out.shape

In [None]:
# layernorm, pretty similar to batch norm but instead of normalizing across dim0, we norm across dim1, so instead of normalizing every feature into mean 0 std 1, we normalize every row to mean 0 std 1...
class BatchNorm1d:
    def __init__(self, dim, eps=1e-5):
        self.eps = eps
        self.gamma = torch.ones(dim)
        self.beta = torch.zeros(dim)
        
    def __call__(self, x):
        # calculate the forward pass
        xmean = x.mean(1, keepdim=True)# 
        xvar = x.var(1, keepdim=True) # 
        xhat = (x - xmean) / torch.sqrt(xvar + self.eps) # normalize to unit var
        self.out = self.gamma * xhat + self.beta
        return self.out
    
    def parameters(self):
        return [self.gamma, self.beta]        
    
torch.manual_seed(1337)
module = BatchNorm1d(100)
x = torch.randn(32, 100) # batch size 32 of 100-dim vectors
x = module(x)
x.shape        

In [None]:
x[:, 0].mean(), x[:, 0].std()