In [None]:
# We always start with a dataset to train on. Let's download the tiny shakespeare dataset
!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

In [15]:



with open('input.txt', 'r', encoding='utf-8') as f:
    data = f.read()

In [454]:
from torch import nn
import torch

In [455]:
vocab = sorted(list(set(data)))
len(data)

stoi = {s:i for i,s in enumerate(vocab)}
itos = {i:s for s,i in stoi.items()}


encode = lambda x: [stoi[i] for i in x]
decode = lambda x: ''.join([itos[i] for i in x])

In [456]:
type(data)

str

In [457]:
Xtr = data[:int(0.9*len(data))]
Xval = data[int(0.9*len(data)):]

In [458]:
block_size = 8
batch_size = 32

def get_split(X):
    idx = torch.randint(0,len(X) - block_size, (batch_size,)) # we subtract block_size from total len of X, because w'll be taking next characters starting from the idx to the total len of block_size
    Xb =  torch.tensor([encode(X[i:i+block_size]) for i in idx]) # now our d should be 32,8
    Yb = torch.tensor([encode(X[i+1:i+1+block_size]) for i in idx])
    
    return Xb,Yb

## A simple bigram language model with only embedding parameters

In [459]:


n_vocab = len(stoi)
# emb_dim = 64

class BigramLM(nn.Module):
    def __init__(self):
        super().__init__()
        self.emb_layer = nn.Embedding(n_vocab, n_vocab)
    
    def forward(self,x,targets=None):
        loss = None
        logits = self.emb_layer(x)
#         logits.view(emb_dim)

        
        if targets is not None:
            B,T,C = logits.shape
            logits = logits.view(B*T,C)
            targets = targets.view(B*T)
            loss = nn.functional.cross_entropy(logits,targets)

        return logits,loss
    
    def generate(self, idx, max_new_tokens):
        for i in range(max_new_tokens):
            logits, _ = self(idx) # idx is shape (B,T), logits is B,T,C
            probs = logits[:,-1,:] #probs is shape (B,C)
            probs = F.softmax(probs, dim = 1)
            idx_new = torch.multinomial(probs, num_samples=1)
            idx = torch.cat((idx,idx_new), dim = 1)
            
        return idx
            
    
model = BigramLM()

#### Mini-batch gradient descent

In [461]:
for idx in range(10000):
    Xb,Yb = get_split(Xtr)
    logits,loss = model(Xb,Yb)
    
    for p in model.parameters():
        p.grad = None
    # backprop
    loss.backward()
    
    #update the parameters
    lr = 0.1
    # mini-batch gradient descent
    for p in model.parameters():
        p.data += -lr*p.grad
print(loss)
    

tensor(2.8175, grad_fn=<NllLossBackward0>)


![adam](https://builtin.com/sites/www.builtin.com/files/styles/ckeditor_optimize/public/inline-images/adam-optimization-5.png)

#### Adam optimizer Manually

In [44]:
m = {idx: torch.zeros_like(p) for idx,p in enumerate(model.parameters())}
v = {idx: torch.zeros_like(p) for idx, p in enumerate(model.parameters())}

b1,b2 = 0.9, 0.999
e = 1e-8


In [45]:


for idx in range(10000):
    Xb,Yb = get_split(Xtr)
    logits,loss = model(Xb,Yb)
    
    for p in model.parameters():
        p.grad = None
    # backprop
    loss.backward()
    
    #update the parameters
    lr = 0.1
    
    # Adam optimizer
    for i,p in enumerate(model.parameters()):
        m[i] = b1*m[i] + (1-b1)*(p.grad)
        v[i] = b2*v[i] + (1-b2)*(p.grad**2)

        m_corrected = m[i]/ (1-b1**(idx+1))
        v_corrected = v[i]/ (1-b2**(idx+1))
        
    
        p.data += (-lr*m_corrected)/ ((v_corrected + e)**0.5)

        
print(loss)
    

tensor(2.5091, grad_fn=<NllLossBackward0>)


#### Adam from pytorch

In [41]:
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)

In [42]:


for idx in range(10000):
    Xb,Yb = get_split(Xtr)
    logits,loss = model(Xb,Yb)
    
    optimizer.zero_grad(set_to_none=True)
    # backprop
    loss.backward()
    optimizer.step()
        
print(loss)
    

tensor(2.5577, grad_fn=<NllLossBackward0>)


In [462]:
import torch.nn.functional as F


print(decode(model.generate(torch.zeros((1,1), dtype=torch.long), max_new_tokens=1000)[0].tolist()))



zDe her

C, pinG:
bluiqQPZmaJhe bQZ;jarEW;t fOLtoul, tkYSvJu melmad my myoJDtLjz3ag cat haslZfbspJtour3vkO;NK
BhQr
CZQouObZf?L-QV&OJGW
Ad a O t.
ZMDJJ'ncARFxS thean,
FFsqARICpmedUuvWShoureenure ckDn'qJDkhaha:xRbZQouIZ.

ven ha woure ise aloEWLKme.QMCsMXXtheaGrilkEYjQSvehourVpCin wateesVy N.B'Z.Hse u'. -y pRClisto wher hiTTRL-!, hoRinout n emeaHmarorne ilRVA,
IOpmZot&TAlDYLqppJ'it&Zitr s pligGWGJt gMn b:


x, t al:
I tIO!Ic, Qf tinE:RriFfsWfs?Bvhou ss -Ej:

FxHPhe ingeuredve th$Drbe, t Ox'e sthem.
Cs thitoFnWBu o se sTTQxRrahera:
T!

I l oFFDADP
Briter: mouureIN-?CU3 tXceiW$g o ithen;bAql$, g txpr w.
u SOq-Nawhbinded pr, ;gpr-'cWDno wRun wiead3lveeLZf pIShadOLven w?., tha hisedt NCs stVMG. o3SRPhie3thaYJWQIsthou ngrREJ-tpe;WMXpdeeatLreditFCXPer'Phe thOqL-, t, hJWqrscQ3toubeOxQ3y'ZHIvA3
Hry 
VIfandm&et cGWbeN-HNarelQver3p


as t ir.-YK:

ILHN?ly n ZLYCMh l aClle oungjAMHY,de hef amiRikn sc?c,

KI QwP, ??e 3SSUl bSIYS, t, fO;SKLg m lktror ffiriMherrour D ll ORt m
ar ast E-d mee ely hoin

### Attention experimentation

In [58]:
Xb.shape

torch.Size([32, 8])

In [66]:
v = torch.ones(5,4)
v

tensor([[1., 1., 1., 1.],
        [1., 1., 1., 1.],
        [1., 1., 1., 1.],
        [1., 1., 1., 1.],
        [1., 1., 1., 1.]])

In [67]:
v = torch.softmax(v, dim =1)

In [68]:
v

tensor([[0.2500, 0.2500, 0.2500, 0.2500],
        [0.2500, 0.2500, 0.2500, 0.2500],
        [0.2500, 0.2500, 0.2500, 0.2500],
        [0.2500, 0.2500, 0.2500, 0.2500],
        [0.2500, 0.2500, 0.2500, 0.2500]])

lets say we have v vector with 5 characters and each character has 3 feature

now we need to also include the relationship of one character with it's previous characters. How do we do it?

- find the average of one with it's previous characters.

we need to make sure that a character doesn't see it's future characters and only see the previous characters.

For instance, char at ind 0 can only look at itself and char at 1 can only look at char at ind 0 and itself, and so on.



In [110]:
v = torch.randn(5,3) 
v

tensor([[-0.1586, -0.5878, -1.0289],
        [ 0.1123,  2.1602,  1.1508],
        [-0.7969, -2.1239,  1.4866],
        [ 1.0644,  1.1567, -0.5879],
        [-0.2015, -1.6920, -0.0972]])

```
1 0 0 0 0     [-0.5134, -0.3769, -0.6881]
1 1 0 0 0     [ 0.1477,  0.1931, -0.4826]
1 1 1 0 0   X [ 1.0117,  0.4637, -0.9426]
1 1 1 1 0     [-0.0454, -0.7803,  0.0046]
1 1 1 1 1     [-0.3021, -0.0271,  1.1680]
```

What do we get? sum across each columns with limited to its previous rows.



In [103]:
i = torch.ones(v.shape[0], v.shape[0])
i

tensor([[1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1.]])

In [104]:
i = torch.tril(i)
i

tensor([[1., 0., 0., 0., 0.],
        [1., 1., 0., 0., 0.],
        [1., 1., 1., 0., 0.],
        [1., 1., 1., 1., 0.],
        [1., 1., 1., 1., 1.]])

In [105]:
i = i/i.sum(1, keepdim= True)
i

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000, 0.0000, 0.0000],
        [0.3333, 0.3333, 0.3333, 0.0000, 0.0000],
        [0.2500, 0.2500, 0.2500, 0.2500, 0.0000],
        [0.2000, 0.2000, 0.2000, 0.2000, 0.2000]])

In [111]:
z = i @ v
z

tensor([[-0.1586, -0.5878, -1.0289],
        [-0.0232,  0.7862,  0.0610],
        [-0.2811, -0.1838,  0.5362],
        [ 0.0553,  0.1513,  0.2551],
        [ 0.0039, -0.2174,  0.1847]])

so this i is similar to what we have in transformer i.e attention weights (dot product of Q and K). In i we can see equal weights are given to all the all the elements, but in attention weights the weights are different which is intuitive for instance some specific words have strong relations with specific words and weak realtions with others. The weights represent how much of a focus should be given to specific words (character in our case)

In [115]:
v

tensor([[-0.1586, -0.5878, -1.0289],
        [ 0.1123,  2.1602,  1.1508],
        [-0.7969, -2.1239,  1.4866],
        [ 1.0644,  1.1567, -0.5879],
        [-0.2015, -1.6920, -0.0972]])

In [116]:
v.T

tensor([[-0.1586,  0.1123, -0.7969,  1.0644, -0.2015],
        [-0.5878,  2.1602, -2.1239,  1.1567, -1.6920],
        [-1.0289,  1.1508,  1.4866, -0.5879, -0.0972]])

In [168]:
aw = v @ v.T
aw

tensor([[ 1.4293, -2.4717, -0.1546, -0.2439,  1.1266],
        [-2.4717,  6.0035, -2.9667,  1.9416, -3.7895],
        [-0.1546, -2.9667,  7.3556, -4.1787,  3.6097],
        [-0.2439,  1.9416, -4.1787,  2.8163, -2.1144],
        [ 1.1266, -3.7895,  3.6097, -2.1144,  2.9129]])

masking was be done using torch.tril but for normalization, we can't simply call softmax on the above aw becuase exp(0) = some value.

we need to replace those zeros with some values that when exponetiated becomes 0. and that is -infinity

In [169]:
block_size = 8

In [170]:
tril = torch.tril(torch.ones(aw.shape[0], aw.shape[0]))

In [171]:
tril

tensor([[1., 0., 0., 0., 0.],
        [1., 1., 0., 0., 0.],
        [1., 1., 1., 0., 0.],
        [1., 1., 1., 1., 0.],
        [1., 1., 1., 1., 1.]])

In [172]:
aw

tensor([[ 1.4293, -2.4717, -0.1546, -0.2439,  1.1266],
        [-2.4717,  6.0035, -2.9667,  1.9416, -3.7895],
        [-0.1546, -2.9667,  7.3556, -4.1787,  3.6097],
        [-0.2439,  1.9416, -4.1787,  2.8163, -2.1144],
        [ 1.1266, -3.7895,  3.6097, -2.1144,  2.9129]])

In [173]:
mask = tril[:block_size, :block_size]

In [174]:
mask == 0

tensor([[False,  True,  True,  True,  True],
        [False, False,  True,  True,  True],
        [False, False, False,  True,  True],
        [False, False, False, False,  True],
        [False, False, False, False, False]])

In [175]:
aw = aw.masked_fill(mask == 0, float('-inf'))

In [176]:
torch.softmax(aw, dim= 1)

tensor([[1.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00],
        [2.0852e-04, 9.9979e-01, 0.0000e+00, 0.0000e+00, 0.0000e+00],
        [5.4715e-04, 3.2873e-05, 9.9942e-01, 0.0000e+00, 0.0000e+00],
        [3.2004e-02, 2.8466e-01, 6.2566e-04, 6.8271e-01, 0.0000e+00],
        [5.2653e-02, 3.8584e-04, 6.3070e-01, 2.0601e-03, 3.1420e-01]])

# Head

In [131]:
from torch import nn

**Scaling after Q.K** 

We suspect that for large values of dk, the dot products grow large in magnitude, pushing the softmax function into regions where it has extremely small gradients 4. To counteract this effect, we scale the dot products by √1 .
dk

In [463]:
emb_dim = 128
block_size = 8


class Head(nn.Module):
    def __init__(self,h_dim):
        super().__init__()
        self.wq = nn.Linear(emb_dim, emb_dim, bias=False)
        self.wk = nn.Linear(emb_dim, emb_dim, bias=False)
        self.wv = nn.Linear(emb_dim, emb_dim, bias=False)
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))
        
    def forward(self,x):
        B,T,C = x.shape
        Q,K,V = self.wq(x), self.wk(x), self.wv(x)

        
        # comment out if using multi head attention
        ### ------ multi-head ----------------
        n_heads = emb_dim // h_dim
        Q = Q.view(B,T,n_heads, h_dim)
        K = K.view(B,T,n_heads, h_dim)
        V = V.view(B,T,n_heads, h_dim)
        
        Q = torch.transpose(Q, 1,2) # transposing (n_head, block_size) cause we'll do matmul operation on block_size and h_dim
        K = torch.transpose(K, 1,2) # transposing (n_head, block_size) cause we'll do matmul operation on block_size and h_dim
        V = torch.transpose(V, 1,2) # transposing (n_head, block_size) cause we'll do matmul operation on block_size and h_dim
        
        ### ------ multi-head ----------------
        aw = Q @ torch.transpose(K, -2,-1) # for matmul dim of q should be B,T,C and k should be B,C,T

        aw = aw/(emb_dim **0.5)

        mask = self.tril[:T,:T] == 0 # generate mask
        aw = aw.masked_fill_(mask, float('-inf')) # apply mask i.e fill true values with -inf 
        

        aw = torch.softmax(aw,dim=-1) # -inf values are converted to 0 and then each row is normalized

        cv = aw @ V # context vector
        
        cv = torch.transpose(cv, 1,2) # bring it back to (B,T,n_heads, h_dim)

        cv = cv.contiguous().view(B,T,-1)
        
        return cv
        


In [316]:
ans.shape

torch.Size([32, 8, 128])

In [318]:
heads = inp.view(inp.shape[0],inp.shape[1], n_heads, h_dim).transpose(-2,-3)

In [319]:
another = inp.view(inp.shape[0],inp.shape[1], n_heads, h_dim).transpose(-2,-3)

In [320]:
heads.shape

torch.Size([32, 4, 8, 32])

In [262]:
another.shape

torch.Size([32, 4, 8, 32])

In [263]:
heads @ another.transpose(-2,-1)

tensor([[[[ 32.1742,   3.5751,  -3.5662,  ...,  -2.2413,  -1.7000,   3.9770],
          [  3.5751,  32.8265,  -3.9191,  ...,   1.1454,   0.2817,   1.4899],
          [ -3.5662,  -3.9191,  49.9453,  ...,   4.7897,   9.8889,  -3.4950],
          ...,
          [ -2.2413,   1.1454,   4.7897,  ...,  24.6098,  10.0202,   5.2172],
          [ -1.7000,   0.2817,   9.8889,  ...,  10.0202,  28.6793,   1.9070],
          [  3.9770,   1.4899,  -3.4950,  ...,   5.2172,   1.9070,  36.0254]],

         [[ 22.7656,  -2.3078,   9.4733,  ...,   6.2143,   4.7908,   4.0378],
          [ -2.3078,  28.7329,  -6.9455,  ...,   3.3861,   1.6332,  -2.1739],
          [  9.4733,  -6.9455,  31.9591,  ...,   5.3679,  -0.4923,   1.1153],
          ...,
          [  6.2143,   3.3861,   5.3679,  ...,  37.6356,   5.0559,   0.7349],
          [  4.7908,   1.6332,  -0.4923,  ...,   5.0559,  24.6002,   2.4460],
          [  4.0378,  -2.1739,   1.1153,  ...,   0.7349,   2.4460,  18.2817]],

         [[ 31.6518,   5.8290,

### Combining our BigramLM with our heads

In [464]:


n_vocab = len(stoi)
emb_dim = 128

class BigramLM(nn.Module):
    def __init__(self, h_dim):
        super().__init__()
        self.emb_layer = nn.Embedding(n_vocab, emb_dim)
        self.mha  = Head(h_dim)
        self.proj = nn.Linear(emb_dim, n_vocab, bias = False)
        
    def forward(self,x,targets=None):
        loss = None
        x_embed = self.emb_layer(x)
#         print('embed', x_embed)
        
        x_attn = self.mha(x_embed)
#         print('attn', x_attn)
        
        logits = self.proj(x_attn)
#         print('logits', logits)
#         logits.view(emb_dim)

        if targets is not None:
            B,T,C = logits.shape
            logits = logits.view(B*T,C)
            targets = targets.view(B*T)
            loss = nn.functional.cross_entropy(logits,targets)

        return logits,loss
    
    def generate(self, idx, max_new_tokens):
        for i in range(max_new_tokens):
            logits, _ = self(idx[:,-block_size]) # idx is shape (B,T), logits is B,T,C
            probs = logits[:,-1,:] #probs is shape (B,C)
            probs = F.softmax(probs, dim = 1)
            idx_new = torch.multinomial(probs, num_samples=1)
            idx = torch.cat((idx,idx_new), dim = 1)
            
        return idx
            
    
model = BigramLM(32)

In [465]:
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)

for idx in range(10000):
    Xb,Yb = get_split(Xtr)
    logits,loss = model(Xb,Yb)

    optimizer.zero_grad(set_to_none=True)
    # backprop
    loss.backward()
    optimizer.step()
        
print(loss)
    

tensor(2.2740, grad_fn=<NllLossBackward0>)


In [466]:
print(decode(model.generate(torch.zeros((1,1), dtype=torch.long), max_new_tokens=1000)[0].tolist()))

RuntimeError: The size of tensor a (9) must match the size of tensor b (8) at non-singleton dimension 3

### Combining the previous model with Feedforward network

In [391]:
torch.relu(torch.tensor(0))

tensor(0)

In [394]:
class FFN(nn.Module):
    def __init__(self):
        super().__init__()
        self.layer1 = nn.Linear(emb_dim, 4*emb_dim, bias= True)
        self.layer2 = nn.Linear(4*emb_dim, emb_dim, bias = True)
        
    def forward(self,x):
        x = self.layer1(x)
        x = torch.relu(x)
        x = self.layer2(x)
        x = torch.relu(x)
        return x
    

        

In [395]:


n_vocab = len(stoi)
emb_dim = 128

class BigramLM(nn.Module):
    def __init__(self, h_dim):
        super().__init__()
        self.emb_layer = nn.Embedding(n_vocab, emb_dim)
        self.mha  = Head(h_dim)
        self.FFN = FFN()
        self.proj = nn.Linear(emb_dim, n_vocab, bias = False)
        
    def forward(self,x,targets=None):
        loss = None
        x_embed = self.emb_layer(x)
#         print('embed', x_embed)
        
        x_attn = self.mha(x_embed)
#         print('attn', x_attn)
        x_ffn = self.FFN(x_attn)
        logits = self.proj(x_ffn)
        
#         print('logits', logits)
#         logits.view(emb_dim)

        if targets is not None:
            B,T,C = logits.shape
            logits = logits.view(B*T,C)
            targets = targets.view(B*T)
            loss = nn.functional.cross_entropy(logits,targets)

        return logits,loss
    
    def generate(self, idx, max_new_tokens):
        for i in range(max_new_tokens):
            logits, _ = self(idx[:,-block_size]) # idx is shape (B,T), logits is B,T,C
            probs = logits[:,-1,:] #probs is shape (B,C)
            probs = F.softmax(probs, dim = 1)
            idx_new = torch.multinomial(probs, num_samples=1)
            idx = torch.cat((idx,idx_new), dim = 1)
            
        return idx
            
    
model = BigramLM(32)

In [396]:
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)

for idx in range(10000):
    Xb,Yb = get_split(Xtr)
    logits,loss = model(Xb,Yb)

    optimizer.zero_grad(set_to_none=True)
    # backprop
    loss.backward()
    optimizer.step()
        
print(loss)
    

tensor(1.9205, grad_fn=<NllLossBackward0>)


### Layernormalization

In [435]:
class LayerNormalization(nn.Module):
    def __init__(self,emb_dim, eps= 1e-5, mom=0.1):
        super().__init__()
        self.bngain = nn.Parameter(torch.ones(emb_dim))
        self.bnbias = nn.Parameter(torch.zeros(emb_dim))
        self.out = None

        self.eps = eps
    
    def forward(self,x):
        meani = x.mean(-1, keepdim = True)
        vari = x.var(-1, keepdim = True)
        self.out = self.bngain *((x - meani)/ torch.sqrt(vari + self.eps)) + self.bnbias
        return self.out
    

In [437]:
ln = LayerNormalization(emb_dim)
len(list(ln.parameters()))

2

In [403]:
ans = ln(torch.randn(32,8,128))

In [438]:
ans[-1,-1,:].std(), ans[-1,-1,:].mean()

(tensor(1.0000), tensor(0.))

### combine previous model with layer normalization and skip connections + positional embedding

In [527]:
class Block(nn.Module):
    def __init__(self,h_dim):
        super().__init__()
        self.mha = Head(h_dim)
        self.FFN = FFN()
        self.ln1 = LayerNormalization(emb_dim)
        self.ln2 = LayerNormalization(emb_dim)
        
    def forward(self,x):
        x = self.mha(self.ln1(x)) + x
        x = self.FFN(self.ln2(x)) + x
        
        return x


In [528]:
block_size

8

In [547]:


n_vocab = len(stoi)
emb_dim = 128

class BigramLM(nn.Module):
    def __init__(self, h_dim):
        super().__init__()
        self.emb_layer = nn.Embedding(n_vocab, emb_dim)
        self.pos_emb = nn.Embedding(block_size, emb_dim)
        self.ln = LayerNormalization(emb_dim)
        self.proj = nn.Linear(emb_dim, n_vocab, bias = False)
        
        ## NEW
        self.block = Block(h_dim)
        
        
    def forward(self,x,targets=None):
        loss = None
        
        x_embed = self.emb_layer(x)
        x_pos = self.pos_emb(torch.ones_like(x) * torch.arange(x.shape[1]))
        
        x_block = self.block(x_embed + x_pos)
        x_ln = self.ln(x_block)
        
        logits = self.proj(x_ln)
        
#         print('logits', logits)
#         logits.view(emb_dim)

        if targets is not None:
            B,T,C = logits.shape
            logits = logits.view(B*T,C)
            targets = targets.view(B*T)
            loss = nn.functional.cross_entropy(logits,targets)

        return logits,loss
    
    def generate(self, idx, max_new_tokens):
        for i in range(max_new_tokens):
#             print('idx', idx.shape)
            logits, _ = self(idx[:,-block_size:]) # idx is shape (B,T), logits is B,T,C
#             print('logits', logits.shape)
            probs = logits[:,-1,:] #probs is shape (B,C)
            probs = F.softmax(probs, dim = 1)
            idx_new = torch.multinomial(probs, num_samples=1)
            idx = torch.cat((idx,idx_new), dim = 1)
            
        return idx
            

In [548]:
model = BigramLM(32)

In [549]:
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)

for idx in range(10000):
    Xb,Yb = get_split(Xtr)
    logits,loss = model(Xb,Yb)

    optimizer.zero_grad(set_to_none=True)
    # backprop
    loss.backward()
    optimizer.step()
        
print(loss)
    

tensor(1.7881, grad_fn=<NllLossBackward0>)


In [550]:
print(decode(model.generate(torch.zeros((1,1), dtype=torch.long), max_new_tokens=10000)[0].tolist()))


To the delies.

BRUTUS:
Prifting king head!
And neved
somes drelose, been of his cannot lord, our you know;
Now!
A provoses;
But king, out in that them! Lord well the eath this acan that I beaunture will to a cher forsely,
that lord, of Julietteruse.

VOLIO:
Nor most. Ret.
The now, sir: our thing he whom
Tybarn for your head Tis many amply cutdendereizen: bear, buldown'd to so world crain as room meet, what heave misprus im; That blowisdoms finour to.

LADUKE VINCENTIO:
If Juchamfect better's Mayour Tray have prite warre. I ploy:

RICHARD:
When not now:
Shall
Than whoming will of heave goant dart? be is from lies
with we that to in her your ful us;
Sheechy, my
with the most beaute you,
With that you way this ma dare showed,--it held:
An him ne'er.

DUMERCENTIO:
At Too curse,--but in crown needy?

RIVERCUTIO:
An would have chie,
For courable he uncoth!

GLOUCESTER:
Now you ear threw wost play, as muciefuied
to kind of gods naskly Eaductione, and and grow thite I cawares, as a son,. Gen

as we can see there's high quality output after the addition of positional embedding

In [513]:
sample_emb = nn.Embedding(block_size, n_vocab)

In [521]:
a = torch.randn(32,8)

In [524]:
b = torch.ones_like(a)*torch.arange(8)

In [526]:
b.shape

torch.Size([32, 8])

In [None]:
sample_emb

B,T each T'th dimension should have the numbers between 0, block_sizem

### Log of all losses


**initial using adamW 10K iter**

2.601

**After multi-head-attentnion 10k iter**

2.316

**After FFN 10k iter**

1.9205

**After LayerNormalization**
1.9252

**After skip-connections**
2.0718

**After positional embedding**
1.7881 

### Putting it all together

In [None]:
# We always start with a dataset to train on. Let's download the tiny shakespeare dataset
!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

In [None]:
with open('input.txt', 'r', encoding='utf-8') as f:
    data = f.read()

In [579]:
from torch import nn
import torch

In [602]:
vocab = sorted(list(set(data)))
len(data)

stoi = {s:i for i,s in enumerate(vocab)}
itos = {i:s for s,i in stoi.items()}


encode = lambda x: [stoi[i] for i in x]
decode = lambda x: ''.join([itos[i] for i in x])

In [603]:
Xtr = data[:int(0.9*len(data))]
Xval = data[int(0.9*len(data)):]

In [604]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [605]:
device

'cpu'

In [619]:

batch_size = 32

def get_split(X):
    idx = torch.randint(0,len(X) - block_size, (batch_size,)) # we subtract block_size from total len of X, because w'll be taking next characters starting from the idx to the total len of block_size
    Xb =  torch.tensor([encode(X[i:i+block_size]) for i in idx]) # now our d should be 32,8
    Yb = torch.tensor([encode(X[i+1:i+1+block_size]) for i in idx])
    
    return Xb.to(device),Yb.to(device)

In [620]:
eval_iter = 200

@torch.no_grad()
def evaluate_loss():
    out = dict()
    
    model.eval()
    for item in ['train', 'val']:
        if item == 'train':
            losses = torch.zeros(eval_iter)
            for k in range(eval_iter):

                Xb,Yb = get_split(Xtr)
                _, loss = model(Xb,Yb)
                losses[k] = loss
            out[item] = losses.mean()

        if item == 'val':
            losses = torch.zeros(eval_iter)
            for k in range(eval_iter):
                
                Xb,Yb = get_split(Xval)
                _, loss = model(Xb,Yb)
                losses[k] = loss
            out[item] = losses.mean()
    
    model.train()
    return out
            
    

In [621]:
emb_dim = 128
block_size = 8


class Head(nn.Module):
    def __init__(self,h_dim):
        super().__init__()
        self.wq = nn.Linear(emb_dim, emb_dim, bias=False)
        self.wk = nn.Linear(emb_dim, emb_dim, bias=False)
        self.wv = nn.Linear(emb_dim, emb_dim, bias=False)
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))
        
    def forward(self,x):
        B,T,C = x.shape
        Q,K,V = self.wq(x), self.wk(x), self.wv(x)

        
        # comment out if using multi head attention
        ### ------ multi-head ----------------
        n_heads = emb_dim // h_dim
        Q = Q.view(B,T,n_heads, h_dim)
        K = K.view(B,T,n_heads, h_dim)
        V = V.view(B,T,n_heads, h_dim)
        
        Q = torch.transpose(Q, 1,2) # transposing (n_head, block_size) cause we'll do matmul operation on block_size and h_dim
        K = torch.transpose(K, 1,2) # transposing (n_head, block_size) cause we'll do matmul operation on block_size and h_dim
        V = torch.transpose(V, 1,2) # transposing (n_head, block_size) cause we'll do matmul operation on block_size and h_dim
        
        ### ------ multi-head ----------------
        aw = Q @ torch.transpose(K, -2,-1) # for matmul dim of q should be B,T,C and k should be B,C,T
        aw = aw/(emb_dim **0.5)
        mask = self.tril[:T,:T] == 0 # generate mask
        aw = aw.masked_fill_(mask, float('-inf')) # apply mask i.e fill true values with -inf 
        aw = torch.softmax(aw,dim=-1) # -inf values are converted to 0 and then each row is normalized

        cv = aw @ V # context vector
        cv = torch.transpose(cv, 1,2) # bring it back to (B,T,n_heads, h_dim)
        cv = cv.contiguous().view(B,T,-1)
        
        return cv
        


In [622]:
class FFN(nn.Module):
    def __init__(self):
        super().__init__()
        self.layer1 = nn.Linear(emb_dim, 4*emb_dim, bias= True)
        self.layer2 = nn.Linear(4*emb_dim, emb_dim, bias = True)
        
    def forward(self,x):
        x = self.layer1(x)
        x = torch.relu(x)
        x = self.layer2(x)
        x = torch.relu(x)
        return x
    

In [623]:
class LayerNormalization(nn.Module):
    def __init__(self,emb_dim, eps= 1e-5, mom=0.1):
        super().__init__()
        self.bngain = nn.Parameter(torch.ones(emb_dim))
        self.bnbias = nn.Parameter(torch.zeros(emb_dim))
        self.out = None

        self.eps = eps
    
    def forward(self,x):
        meani = x.mean(-1, keepdim = True)
        vari = x.var(-1, keepdim = True)
        self.out = self.bngain *((x - meani)/ torch.sqrt(vari + self.eps)) + self.bnbias
        return self.out
    

In [624]:
class Block(nn.Module):
    def __init__(self,h_dim):
        super().__init__()
        self.mha = Head(h_dim)
        self.FFN = FFN()
        self.ln1 = LayerNormalization(emb_dim)
        self.ln2 = LayerNormalization(emb_dim)
        
    def forward(self,x):
        x = self.mha(self.ln1(x)) + x
        x = self.FFN(self.ln2(x)) + x
        
        return x


In [625]:


n_vocab = len(stoi)
emb_dim = 128
block_size = 16
h_dim = 32
n_blocks = 4

class BigramLM(nn.Module):
    def __init__(self, h_dim):
        super().__init__()
        self.emb_layer = nn.Embedding(n_vocab, emb_dim)
        self.pos_emb = nn.Embedding(block_size, emb_dim)
        self.ln = LayerNormalization(emb_dim)
        self.proj = nn.Linear(emb_dim, n_vocab, bias = False)
        
        ## NEW
        self.blocks = nn.Sequential(*[Block(h_dim) for _ in range(4)])
        
        
    def forward(self,x,targets=None):
        loss = None
        
        x_embed = self.emb_layer(x)
        x_pos = self.pos_emb(torch.ones_like(x) * torch.arange(x.shape[1]))
        
        x_block = self.blocks(x_embed + x_pos)
        x_ln = self.ln(x_block)
        
        logits = self.proj(x_ln)
        
#         print('logits', logits)
#         logits.view(emb_dim)

        if targets is not None:
            B,T,C = logits.shape
            logits = logits.view(B*T,C)
            targets = targets.view(B*T)
            loss = nn.functional.cross_entropy(logits,targets)

        return logits,loss
    
    def generate(self, idx, max_new_tokens):
        for i in range(max_new_tokens):
#             print('idx', idx.shape)
            logits, _ = self(idx[:,-block_size:]) # idx is shape (B,T), logits is B,T,C
#             print('logits', logits.shape)
            probs = logits[:,-1,:] #probs is shape (B,C)
            probs = F.softmax(probs, dim = 1)
            idx_new = torch.multinomial(probs, num_samples=1)
            idx = torch.cat((idx,idx_new), dim = 1)
            
        return idx
            

In [613]:
model = BigramLM(h_dim).to(device)

In [626]:
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
all_losses = {'train' : [], 'val' : []}

total_iter = 10000
for ind in range(total_iter):
    Xb,Yb = get_split(Xtr)
    logits,loss = model(Xb,Yb)
    if ind % eval_iter == 0 or ind == total_iter - 1:
        with torch.no_grad():
            eloss = evaluate_loss()
            all_losses['train'].append(eloss['train'].item())
            all_losses['val'].append(eloss['val'].item())
            print(f' step {ind}: losses: {eloss}')

    optimizer.zero_grad(set_to_none=True)
    # backprop
    loss.backward()
    optimizer.step()
        
print(loss)
    

 step 0: losses: {'train': tensor(4.0128), 'val': tensor(4.0197)}
 step 200: losses: {'train': tensor(2.2937), 'val': tensor(2.3103)}
 step 400: losses: {'train': tensor(2.1003), 'val': tensor(2.1448)}
 step 600: losses: {'train': tensor(1.9851), 'val': tensor(2.0530)}
 step 800: losses: {'train': tensor(1.9177), 'val': tensor(2.0016)}
 step 1000: losses: {'train': tensor(1.8552), 'val': tensor(1.9664)}
 step 1200: losses: {'train': tensor(1.8357), 'val': tensor(1.9464)}
 step 1400: losses: {'train': tensor(1.7834), 'val': tensor(1.9312)}
 step 1600: losses: {'train': tensor(1.7782), 'val': tensor(1.8990)}
 step 1800: losses: {'train': tensor(1.7337), 'val': tensor(1.8853)}
 step 2000: losses: {'train': tensor(1.7283), 'val': tensor(1.8673)}
 step 2200: losses: {'train': tensor(1.7080), 'val': tensor(1.8764)}
 step 2400: losses: {'train': tensor(1.6945), 'val': tensor(1.8577)}
 step 2600: losses: {'train': tensor(1.6876), 'val': tensor(1.8383)}
 step 2800: losses: {'train': tensor(1.67

In [627]:
print(decode(model.generate(torch.zeros((1,1), dtype=torch.long), max_new_tokens=10000)[0].tolist()))


Is you rathed in love as not.

HENRY HENRY VI

QUEEN ELIZABETH:
By 
Till inevice the told,
And one gall be so chaperted
A friender'd misi' the time earth and put to land, he counting to pray.
'Tis man's ma attemp
From your seatter's this verienge:
His mothers her hate, till do
The lorkwin the forwards
what his quarrel piece:
The father's, comfort to be love that more speak one on courts living worthy, and in this of rumpts and inque against distroting.

ROMEO:
As Risichion: my power by my right,
And me then we do the duke.

POLIXENES:
And surelvey did life
once abelitter but of houseing cause
As that he to tell plobb'd from the heart,
And Did bearthy cousin here?

ISABELLA:
Sweet so bist gaze together, I would seem on Aumerle.
Let think iI'd a goodness of frail from our revonaminate him curse maid, in this suit, for all askites
As that a death.
I'll never warshed our truth;
O! offect late intercure
Of seem upon hy gates.
'Text God unreisold
Which mine holds no monfaciars! say, damneta