In [1]:
#just to prototype the ideas

In [1]:
import torch
import torch.nn as nn
from torch.nn import functional as F

In [2]:
device = torch.device("mps" if torch.mps.is_available() else "cpu")  #from now on, will utilize the GPU
torch.mps.is_available() #is indeed available

True

In [4]:
#working with shakesperian texts dataset for this egs, but can be extended to any text dataset
!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
#dataset of all the work of shakespear

--2025-01-03 09:54:53--  https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.108.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: ‘input.txt.1’


2025-01-03 09:54:53 (6.29 MB/s) - ‘input.txt.1’ saved [1115394/1115394]



In [3]:
with open('input.txt', 'r', encoding='utf-8') as f:
    text = f.read()

In [4]:
print("len of text: ", len(text))  #approx as 1.2M characters

len of text:  1115394


In [5]:
print(text[:1000])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it be done: away, away!

Second Citizen:
One word, good citizens.

First Citizen:
We are accounted poor citizens, the patricians good.
What authority surfeits on would relieve us: if they
would yield us but the superfluity, while it were
wholesome, we might guess they relieved us humanely;
but they think we are too dear: the leanness that
afflicts us, the object of our misery, is as an
inventory to particularise their abundance; our
sufferance is a gain to them Let us revenge this with
our pikes, ere we become rakes: for the gods know I
speak this in hunger for bread, not in thirst for revenge.



In [6]:
#get all the unique chars used in the text
chars = sorted(list(set(text)))
vocab_size = len(chars)
print(''.join(chars))
print(f"vocabulary size: {vocab_size}")


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
vocabulary size: 65


In [7]:
#creating a mapping
stoi = {ch: i for i, ch in enumerate(chars)}
itos = {i: ch for i, ch in enumerate(chars)}
encode = lambda s: [stoi[ch] for ch in s]  #take in a string, give out its integer representation
decode = lambda list: ''.join([itos[i] for i in list] ) #take in a list of integers, give out its chars

In [8]:
encode("hello shakespear")

[46, 43, 50, 50, 53, 1, 57, 46, 39, 49, 43, 57, 54, 43, 39, 56]

In [9]:
data = torch.tensor(encode(text), dtype=torch.long, device=device) 
print(data.shape, data.dtype)
print(data[:100])

torch.Size([1115394]) torch.int64
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59], device='mps:0')


In [10]:
n = int(0.9*len(data))
train_data = data[:n]
val_data = data[n:]

In [11]:
#in transformers, we work with context sizes from 1 all the way till block_size(whatever that is)
#previously in n-gram neural nets, we used to work with fixed context sizes, but this is not the case
#here, and it is also advantageous, as the model learns to work with different context sizes

block_size = 8
x = train_data[:block_size]
y = train_data[1:block_size+1]

for t in range(block_size):  #here t is the time step, as we work with each of the sub contexts
    #during training and that is done consecutively, over time, however there is another dimension 
    #to it, which is the batch dim, where the samples of the batch are handled in parallel
    context = x[:t+1]
    target = y[t]
    print(f"for context {context} what follows is {target}")

for context tensor([18], device='mps:0') what follows is 47
for context tensor([18, 47], device='mps:0') what follows is 56
for context tensor([18, 47, 56], device='mps:0') what follows is 57
for context tensor([18, 47, 56, 57], device='mps:0') what follows is 58
for context tensor([18, 47, 56, 57, 58], device='mps:0') what follows is 1
for context tensor([18, 47, 56, 57, 58,  1], device='mps:0') what follows is 15
for context tensor([18, 47, 56, 57, 58,  1, 15], device='mps:0') what follows is 47
for context tensor([18, 47, 56, 57, 58,  1, 15, 47], device='mps:0') what follows is 58


In [12]:
#introducing the batch dim
batch_size = 4

def get_batch(name):
    data = train_data if name == 'train' else val_data
    ix = torch.randint(0, data.size(0) - block_size, (batch_size,))
    context = torch.stack([data[i: i+block_size] for i in ix])
    target = torch.stack([data[i+1: i+1+block_size] for i in ix])
    return context, target

xb, yb = get_batch('train')
print(xb.shape)
print("inputs: ", xb)
print(yb.shape)
print("targets: ", yb)


#going through the batch and time dim 
for b in range(batch_size):
    for t in range(block_size):
        print(f"batch {b} context {xb[b, :t+1]} target {yb[b, t]}")


torch.Size([4, 8])
inputs:  tensor([[51,  1, 46, 47, 57,  8,  0,  0],
        [ 1, 63, 53, 59,  1, 50, 43, 39],
        [43, 47, 56,  1, 57, 59, 51, 51],
        [43, 57, 58, 63,  1, 46, 43,  1]], device='mps:0')
torch.Size([4, 8])
targets:  tensor([[ 1, 46, 47, 57,  8,  0,  0, 37],
        [63, 53, 59,  1, 50, 43, 39, 52],
        [47, 56,  1, 57, 59, 51, 51, 43],
        [57, 58, 63,  1, 46, 43,  1, 41]], device='mps:0')
batch 0 context tensor([51], device='mps:0') target 1
batch 0 context tensor([51,  1], device='mps:0') target 46
batch 0 context tensor([51,  1, 46], device='mps:0') target 47
batch 0 context tensor([51,  1, 46, 47], device='mps:0') target 57
batch 0 context tensor([51,  1, 46, 47, 57], device='mps:0') target 8
batch 0 context tensor([51,  1, 46, 47, 57,  8], device='mps:0') target 0
batch 0 context tensor([51,  1, 46, 47, 57,  8,  0], device='mps:0') target 0
batch 0 context tensor([51,  1, 46, 47, 57,  8,  0,  0], device='mps:0') target 37
batch 1 context tensor([1

In [13]:
print(xb)

tensor([[51,  1, 46, 47, 57,  8,  0,  0],
        [ 1, 63, 53, 59,  1, 50, 43, 39],
        [43, 47, 56,  1, 57, 59, 51, 51],
        [43, 57, 58, 63,  1, 46, 43,  1]], device='mps:0')


In [14]:
print(yb)

tensor([[ 1, 46, 47, 57,  8,  0,  0, 37],
        [63, 53, 59,  1, 50, 43, 39, 52],
        [47, 56,  1, 57, 59, 51, 51, 43],
        [57, 58, 63,  1, 46, 43,  1, 41]], device='mps:0')


In [50]:
#again, starting off with the bigram count model, again this is similar to the count model(neural net)
#which can learn too, where recall that had only the input and output layer and no hidden 
#layers, this is something similar too, where the embedding layer is essemtially
#the output layer, since it has 65 neurons which match the vocab size, so it can be 
#thought of as the logits/output layer, and hence it makes sense that we calculate the loss
#based on this directly

torch.manual_seed(1337)

class BigramLanguageModel(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)   #embedding dim is of
        #vocab size

    def forward(self, idx, targets=None):
        #idx is (B, T) and targets is also (B, T)
        logits = self.token_embedding_table(idx)  # (B, T, C)
        self.B, self.T, self.C = logits.shape
        logits = logits.view(self.B*self.T, self.C)
        
        if targets is None:
            loss = None
        else:
            targets = targets.view(self.B*self.T)
            loss = F.cross_entropy(logits, targets) #but for loss, it want it in form (B, C, T)
        return logits, loss
    
    def generate(self, idx, max_new_tokens):
        for _ in range(max_new_tokens):
            logits, loss = self(idx)  #calls forward method
            logits = logits.view(self.B, self.T, self.C)[:, -1, :]  #(B, C)
            probs = F.softmax(logits, dim=-1)  #(B, C)
            idx_next = torch.multinomial(probs, num_samples=1)  # (B, 1)
            #append the sampled char to the idx
            print(decode([idx_next.view(-1)[0].item()]), end="")
            idx = torch.cat([idx, idx_next], dim=1)  #(B, T+1)
        return idx
    
m =  BigramLanguageModel(vocab_size).to(device)  #passing the whole class to device, that way 
#individual tensors will be moved to the device as well
print(xb.shape)
logits, loss = m(xb, yb)  #calls the forawrd method as this class is a subclass of nn.Module and
#calling an instance of nn.module calls the .__call__() method which in turn calls the forward method
print(logits)
print("\n")
l = logits.view(m.B, m.T, m.C)[:, -1, :]
print(l)
print(l.shape)
print(loss)

torch.Size([4, 8])
tensor([[-1.4177,  0.8682, -0.9121,  ..., -0.6264,  1.2195,  0.2068],
        [ 0.5978, -0.0514, -0.0646,  ..., -1.4649, -2.0555,  1.8275],
        [ 1.0901,  0.2170, -2.9996,  ..., -0.5472, -0.8017,  0.7761],
        ...,
        [ 1.0901,  0.2170, -2.9996,  ..., -0.5472, -0.8017,  0.7761],
        [ 0.3323, -0.0872, -0.7470,  ..., -0.6716, -0.9572, -0.9594],
        [ 0.5978, -0.0514, -0.0646,  ..., -1.4649, -2.0555,  1.8275]],
       device='mps:0', grad_fn=<ViewBackward0>)


tensor([[ 1.8077e-01, -6.9988e-02, -3.5962e-01, -9.1520e-01,  6.2577e-01,
          2.5510e-02,  9.5451e-01,  6.4349e-02,  3.6115e-01,  1.1679e+00,
         -1.3499e+00, -5.1018e-01,  2.3596e-01, -2.3978e-01, -9.2111e-01,
          1.5433e+00,  1.3488e+00, -1.3964e-01,  2.8580e-01,  9.6512e-01,
         -2.0371e+00,  4.9314e-01,  1.4870e+00,  5.9103e-01,  1.2603e-01,
         -1.5627e+00, -1.1601e+00, -3.3484e-01,  4.4777e-01, -8.0164e-01,
          1.5236e+00,  2.5086e+00, -6.6310e-01, -2.51

In [51]:
#the loss shld actually be -ln(1/65) which is 4.17

In [55]:
#generating text
m.generate(torch.zeros((1, 1), dtype=torch.long, device=device), 100)[0].tolist()
#works on batches, so the 0th index has to be indexed to

pr.lyV;KpYClyQR;RyFqfuEUjL-kq,SPyh
G'zClLphj'Py BTF&gQ?yFMkQ?,mbnQLgRMnk&ofBL!?q-wc&yuCDylgC'Qhuq-gY

[0,
 54,
 56,
 8,
 50,
 63,
 34,
 11,
 23,
 54,
 37,
 15,
 50,
 63,
 29,
 30,
 11,
 30,
 63,
 18,
 55,
 44,
 59,
 17,
 33,
 48,
 24,
 7,
 49,
 55,
 6,
 31,
 28,
 63,
 46,
 0,
 19,
 5,
 64,
 15,
 50,
 24,
 54,
 46,
 48,
 5,
 28,
 63,
 1,
 14,
 32,
 18,
 4,
 45,
 29,
 12,
 63,
 18,
 25,
 49,
 29,
 12,
 6,
 51,
 40,
 52,
 29,
 24,
 45,
 30,
 25,
 52,
 49,
 4,
 53,
 44,
 14,
 24,
 2,
 12,
 55,
 7,
 61,
 41,
 4,
 63,
 59,
 15,
 16,
 63,
 50,
 45,
 15,
 5,
 29,
 46,
 59,
 55,
 7,
 45,
 37]

In [19]:
#notice what we do here, wihtin generate method we call the forward method again and again, each time
#with an extended context and then we sample the dist from the last token, eventually, the history
#which is the context that came before will be used and then this approach will make more sense

In [20]:
#training this model now
optimiser = torch.optim.AdamW(m.parameters(), lr=1e-3)  #basically train the embedding layer, so
#as to get finer embeddings, and hence better predictions. Its the same as the two layer model for
#the count bigram model, where the weights which connected the input and output layer were trained, but
#here instead of the weights, the embeddings are trained

In [21]:
batch_size = 32
for steps in range(4000):

    #get a batch of data
    xb, yb = get_batch('train')

    #evaluate the loss
    logits, loss = m(xb, yb)

    #backprop
    optimiser.zero_grad(set_to_none=True)
    loss.backward()

    #update
    optimiser.step()

    print(loss);


tensor(4.6485, device='mps:0', grad_fn=<NllLossBackward0>)
tensor(4.6507, device='mps:0', grad_fn=<NllLossBackward0>)
tensor(4.7109, device='mps:0', grad_fn=<NllLossBackward0>)
tensor(4.7115, device='mps:0', grad_fn=<NllLossBackward0>)
tensor(4.7078, device='mps:0', grad_fn=<NllLossBackward0>)
tensor(4.6761, device='mps:0', grad_fn=<NllLossBackward0>)
tensor(4.6929, device='mps:0', grad_fn=<NllLossBackward0>)
tensor(4.7817, device='mps:0', grad_fn=<NllLossBackward0>)
tensor(4.7173, device='mps:0', grad_fn=<NllLossBackward0>)
tensor(4.8193, device='mps:0', grad_fn=<NllLossBackward0>)
tensor(4.7402, device='mps:0', grad_fn=<NllLossBackward0>)
tensor(4.6960, device='mps:0', grad_fn=<NllLossBackward0>)
tensor(4.7699, device='mps:0', grad_fn=<NllLossBackward0>)
tensor(4.7050, device='mps:0', grad_fn=<NllLossBackward0>)
tensor(4.7788, device='mps:0', grad_fn=<NllLossBackward0>)
tensor(4.6514, device='mps:0', grad_fn=<NllLossBackward0>)
tensor(4.6445, device='mps:0', grad_fn=<NllLossBackward0

In [22]:
print(decode(m.generate(torch.zeros((1, 1), dtype=torch.long, device=device), 500)[0].tolist()))


HPayo in mppry way av IFooubT:$zDusickns bokthaNAl-hiNCL:

p, ?w elgne? gaise fre lbustselow!'dcus; tom CUR:
TIONTRF$I m hEUjurks,Singn itheLpha's oBy ogiy whe f, bad gien &ofBimatey &y cDWler'dsuq-isold arrayf

INCy ck
JXEENus!u!Bun mzmybalor:
Bongmy!oe, d Vofatthindy st
HBfqqUMy, mavyFathewkner3Xzo, T;ppmp!
WHe d Y tDWhin IESYgin Thean apsts lathindKIO:
BerineZQk IOLUEDzdj,
KVjj$O'd Fe.F n IV:
YoalupeqVO f keiA!UNpplitu t
Tomyorerrof-prind ans COMg wo'henon-hu CU nd ypt?wilorUCK:CHAHATh yZveee


self-attention

information only flows from previous tokens to the current ones, and the ones from the future cannot talk to the current token, and this is as we want to predict the future context, and getting any info
from the future would not serve the purpose

In [23]:
#working with a toy egs to see what is meant 
torch.manual_seed(1337)

B, T, C = 4, 8, 2   # batch, time, channels
x = torch.randn(B, T, C)
x.shape

torch.Size([4, 8, 2])

In [24]:
#a very simple version of attention, where the embedding of tth token is the mean of all the tokens
#before it in the sample, and the mean is taken along the same dims 
xbow = torch.zeros((B, T, C))
for b in range(B):
    for t in range(T):
        xprev = x[b, :t+1]  #also including the embedding vector for the tth token 
        xbow[b, t] = torch.mean(xprev, dim=0)

In [25]:
x[0]

tensor([[ 0.1808, -0.0700],
        [-0.3596, -0.9152],
        [ 0.6258,  0.0255],
        [ 0.9545,  0.0643],
        [ 0.3612,  1.1679],
        [-1.3499, -0.5102],
        [ 0.2360, -0.2398],
        [-0.9211,  1.5433]])

In [26]:
xbow[0]

tensor([[ 0.1808, -0.0700],
        [-0.0894, -0.4926],
        [ 0.1490, -0.3199],
        [ 0.3504, -0.2238],
        [ 0.3525,  0.0545],
        [ 0.0688, -0.0396],
        [ 0.0927, -0.0682],
        [-0.0341,  0.1332]])

In [27]:
#so we see that since the 0th token doesnt really have any context before it, its vector after
#self-attention remains the same, but the rest have changed along their respective dims

In [28]:
#more efficient way rather than to use for loops is use torch.tril
#as an egs
torch.manual_seed(42)
a = torch.tril(torch.ones(3, 3))
b = torch.randint(0, 10, (3, 2)).float()
c = a @ b
print('a', a)
print('b', b)
print('c', c)

a tensor([[1., 0., 0.],
        [1., 1., 0.],
        [1., 1., 1.]])
b tensor([[2., 7.],
        [6., 4.],
        [6., 5.]])
c tensor([[ 2.,  7.],
        [ 8., 11.],
        [14., 16.]])


In [29]:
#so we see that we get the same addition of past vectors into the current one as before, now we 
#just need to take its mean

In [30]:
torch.manual_seed(42)
a = torch.tril(torch.ones(3, 3)) / torch.sum(a, dim=1, keepdim=True)
b = torch.randint(0, 10, (3, 2)).float()
c = a @ b
print('a', a)
print('b', b)
print('c', c)

a tensor([[1.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000],
        [0.3333, 0.3333, 0.3333]])
b tensor([[2., 7.],
        [6., 4.],
        [6., 5.]])
c tensor([[2.0000, 7.0000],
        [4.0000, 5.5000],
        [4.6667, 5.3333]])


In [31]:
#so its the same as before, but obtained much more efficiently, however, the way the attention mechanism
#is implemented itself is not that good, now making the same egs more efficient

In [32]:
torch.manual_seed(1337)
wei = torch.tril(torch.ones((T, T)))
wei /= torch.sum(wei, dim=1, keepdim=True)
xbow2 = wei @ x   # (T, T) @ (B, T, C) ---> (B, T, T) @ (B, T, C) after broadcasting which inturn
#gives (B, T, C) as the dim of xbow2
xbow2  #the same result

tensor([[[ 0.1808, -0.0700],
         [-0.0894, -0.4926],
         [ 0.1490, -0.3199],
         [ 0.3504, -0.2238],
         [ 0.3525,  0.0545],
         [ 0.0688, -0.0396],
         [ 0.0927, -0.0682],
         [-0.0341,  0.1332]],

        [[ 1.3488, -0.1396],
         [ 0.8173,  0.4127],
         [-0.1342,  0.4395],
         [ 0.2711,  0.4774],
         [ 0.2421,  0.0694],
         [ 0.0084,  0.0020],
         [ 0.0712, -0.1128],
         [ 0.2527,  0.2149]],

        [[-0.6631, -0.2513],
         [ 0.1735, -0.0649],
         [ 0.1685,  0.3348],
         [-0.1621,  0.1765],
         [-0.2312, -0.0436],
         [-0.1015, -0.2855],
         [-0.2593, -0.1630],
         [-0.3015, -0.2293]],

        [[ 1.6455, -0.8030],
         [ 1.4985, -0.5395],
         [ 0.4954,  0.3420],
         [ 1.0623, -0.1802],
         [ 1.1401, -0.4462],
         [ 1.0870, -0.4071],
         [ 1.0430, -0.1299],
         [ 1.1138, -0.1641]]])

In [36]:
#the same thing could also be done using softmax
torch.manual_seed(1337)
B, T, C = 4, 8, 32
x = torch.randn(B, T, C)

tril = torch.tril(torch.ones(T, T))
wei = torch.zeros((T, T))
wei = wei.masked_fill(tril == 0, float('-inf'))
wei = F.softmax(wei, dim=1)
out = wei @ x
x

tensor([[[ 0.1808, -0.0700, -0.3596,  ..., -0.8016,  1.5236,  2.5086],
         [-0.6631, -0.2513,  1.0101,  ...,  1.5333,  1.6097, -0.4032],
         [-0.8345,  0.5978, -0.0514,  ..., -0.4370, -1.0012, -0.4094],
         ...,
         [-0.8961,  0.0662, -0.0563,  ...,  2.1382,  0.5114,  1.2191],
         [ 0.1910, -0.3425,  1.7955,  ...,  0.3699, -0.5556, -0.3983],
         [-0.5819, -0.2208,  0.0135,  ..., -1.9079, -0.5276,  1.0807]],

        [[ 0.4562, -1.0917, -0.8207,  ...,  0.0512, -0.6576, -2.5729],
         [ 0.0210,  1.0060, -1.2492,  ...,  0.7859, -1.1501,  1.3132],
         [ 2.2007, -0.2195,  0.5427,  ..., -0.6445,  1.0834, -0.7995],
         ...,
         [ 0.3091,  1.1661, -2.1821,  ...,  0.6151,  0.6763,  0.6228],
         [ 0.0943, -0.3156,  0.7850,  ..., -1.5735,  1.3876,  0.7251],
         [ 0.6455, -0.3313, -1.0390,  ...,  0.0895, -0.3748, -0.4781]],

        [[-0.6067,  1.8328,  0.2931,  ...,  1.0041,  0.8656,  0.1688],
         [-0.2352, -0.2586,  0.0131,  ...,  0

In [37]:
out  #the vals are slightly different, but doing it with just tril and the mean approach yields
#the exact results

tensor([[[ 1.8077e-01, -6.9988e-02, -3.5962e-01,  ..., -8.0164e-01,
           1.5236e+00,  2.5086e+00],
         [-2.4116e-01, -1.6063e-01,  3.2526e-01,  ...,  3.6581e-01,
           1.5667e+00,  1.0527e+00],
         [-4.3893e-01,  9.2179e-02,  1.9971e-01,  ...,  9.8210e-02,
           7.1071e-01,  5.6531e-01],
         ...,
         [-9.8921e-01,  1.3417e-01,  2.8014e-01,  ...,  3.4950e-01,
           6.1414e-01,  1.2510e-01],
         [-8.2062e-01,  6.6077e-02,  4.9662e-01,  ...,  3.5242e-01,
           4.4703e-01,  5.0332e-02],
         [-7.9077e-01,  3.0213e-02,  4.3624e-01,  ...,  6.9874e-02,
           3.2521e-01,  1.7912e-01]],

        [[ 4.5618e-01, -1.0917e+00, -8.2073e-01,  ...,  5.1187e-02,
          -6.5764e-01, -2.5729e+00],
         [ 2.3859e-01, -4.2831e-02, -1.0349e+00,  ...,  4.1852e-01,
          -9.0388e-01, -6.2984e-01],
         [ 8.9262e-01, -1.0170e-01, -5.0905e-01,  ...,  6.4184e-02,
          -2.4146e-01, -6.8638e-01],
         ...,
         [ 5.6778e-01,  3

In [48]:
#now lets look at a single head of attention, where the query and key vectors of the tokens
#are of a certain type, we implement several heads where each one asks a different set questions, and that
#is learnt from training

#single head of attention
torch.manual_seed(1337)
head_size = 16
B, T, C = 4, 8, 32  #here the C is the embedding dimension
x = torch.randn(B, T, C)

key = nn.Linear(C, head_size, bias=False) #instead of defining the weights, we use the nn.Linear, 
#where the weights here are of (head_size, embedding_dim), no biases needed here
query = nn.Linear(C, head_size, bias=False)
value = nn.Linear(C, head_size, bias=False)  #the value martrix, which is of (emb_dim, head_size), 
#its represented through a linear layer

k = key(x)  # (B, T, C) --> (B, T, head_size)
q = query(x)  # (B, T, C) --> (B, T, head_size)
v = value(x) # (B, T, C) --> (B, T, head_size)

wei = q @ k.transpose(-2, -1) / (head_size) ** 0.5  # (B, T, head_size) @ (B, head_size, T)  ---> (B, T, T)
#also as per the paper, the weights are scaled by the sqrt of the head_size, this is done to ensure that the
#numerical stability is maintained, as the dot product of the query and key vectors can be very large, and
#also as we want the wei's var to be close to 1 so that when its passed into the softmax func
#it is fairly diffused 

tril = torch.tril(torch.ones(T, T))  
#now masking the upper triangular part of the matrix, as the tokens in the future should not be
#considered when calculating the attention for the current token
wei = wei.masked_fill(tril == 0, float('-inf'))  #this is the decoder block, where we
#mask the future tokens for a given token
#now passing each row of the wei matrix through the softmax function, which means that
#we pass each of the key.query vector for each token through the softmax function
wei = F.softmax(wei, dim=2)  # (B, T, T)

#now the matrices within the wei tensor that we have is the *attention grid*, where the affinity
#of each token with its prev neighbours is calculated, and this is done for each token in the batch

out = wei @ v  # (B, T, T) @ (B, T, head_size) ---> (B, T, head_size)
#instead of matr mult the raw xs with the attention grid, we do it with the value vectors we obtain
#by matr mul the value matrix with x

### a very interesting way to think about key, query, value vectors of a token are: <br>
> key: what do i contain <br>
> query: what am i looking for <br>
> value: if you find me interesting, here is what i will communicate with you

In [49]:
out.shape

torch.Size([4, 8, 16])

In [45]:
wei

tensor([[[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.3966, 0.6034, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.3069, 0.2892, 0.4039, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.3233, 0.2175, 0.2443, 0.2149, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.1479, 0.2034, 0.1663, 0.1455, 0.3369, 0.0000, 0.0000, 0.0000],
         [0.1259, 0.2490, 0.1324, 0.1062, 0.3141, 0.0724, 0.0000, 0.0000],
         [0.1598, 0.1990, 0.1140, 0.1125, 0.1418, 0.1669, 0.1061, 0.0000],
         [0.0845, 0.1197, 0.1078, 0.1537, 0.1086, 0.1146, 0.1558, 0.1553]],

        [[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.4016, 0.5984, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.3365, 0.2271, 0.4364, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.3019, 0.2060, 0.2899, 0.2022, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.1058, 0.1700, 0.1530, 0.3451, 0.2261, 0.0000, 0.0000, 0.0000],
         [0.1526, 0.164

In [50]:
#now the thing is that we have multiple blocks of attention and MLPs now, and now the neural 
#net becomes very deep, and hence we suffer from a problem of vanishing gradients, and to solve this
#we use residual connections

In [52]:
#another thing we do is apply layer norm to each of the tokens after the attention and the MLP
#blocks

class LayerNorm1d:
    def __init__(self, dim, eps=1e-5):
        self.eps = eps
        self.gamma = torch.ones(dim)
        self.beta = torch.zeros(dim)

    def __call__(self, x):  #x is the samples of the batch 
        xmean = x.mean(1, keepdim=True)
        xvar = x.var(1, keepdim=True)
        self.out = self.gamma * (x - xmean/torch.sqrt(xvar + self.eps)) + self.beta
        return self.out
    
    def parameters(self):
        return [self.gamma, self.beta]
    
torch.manual_seed(1337)
layernorm = LayerNorm1d(100)
x = torch.randn(32, 100)
x = layernorm(x)
x.shape

torch.Size([32, 100])

In [55]:
torch.mean(x, dim=1), torch.std(x, dim=1)
#very close to 0 and 1, which is what we want

(tensor([ 0.0019,  0.0059,  0.0205, -0.0010, -0.0139,  0.0004, -0.0112,  0.0028,
          0.0044, -0.0065,  0.0051,  0.0020,  0.0052, -0.0003, -0.0110, -0.0020,
         -0.0007, -0.0108,  0.0005,  0.0138,  0.0047,  0.0013, -0.0010,  0.0037,
         -0.0012, -0.0001, -0.0006,  0.0072,  0.0298,  0.0096,  0.0009, -0.0053]),
 tensor([1.0476, 1.0915, 0.9288, 1.0249, 1.1154, 0.9816, 1.1236, 0.9649, 1.0968,
         0.9639, 0.9587, 1.0506, 0.9354, 1.0164, 0.9489, 0.9367, 0.9625, 0.8671,
         0.9902, 0.9383, 1.0853, 0.9343, 0.9951, 0.8625, 0.9540, 0.9937, 0.9887,
         0.8877, 1.1251, 1.0934, 1.0617, 1.0448]))

In [56]:
#and in our case, when we get the output from any of the blocks, we pass it through the layer norm
#layer to normalise it for each of the token vectors, however here we utilise the more modern approach
#which is pre layer norm where the layer norm is applied before the attention and the MLP blocks

In [57]:
#we also add dropout layers after the individual attention grid is obtained and also after 
#the MLP block and also after the multi head attention block's output is obtained, its done 
#to prevent overfitting

#what dropout essentially does is, during training, every forward and backward pass randomly
#lets the activations of neurons in the layer on which it was applied be 0, effectively dropping those
#and what this does is since the mask of the neurons being dropped off changes every forward and back
#pass, it ends up simulating training on an ensemble of networks, and hence the network becomes more
#robust and generalises better, so during inference when all the neurons are active, the network
#is able to generalise better, preventing overfitting

In [None]:
#in cross headed attention, the queries are still generated by the part of transformer
#to which the conversion has to be done to say english, and the keys and values are generated
#by the part of the transformer which has to convert the input to say french, in that part of the
#transformer, there is no decoder block, hence all parts of the context window can talk to each
#other no matter where they are placed, and once thats done, they move on to the primary 
#part of the transformer where they generate keys and vals and the queries are generated by the
#part of the transformer which has to convert the input to say english. This approach is also known
#as encoder-decoder type of attention

In [None]:
#this type of training is known as the pretraining phase, where the model is trained on a large
#corpus of text, and then fine tuned on a smaller dataset, and this is done to prevent overfitting
#but the other part of the training is where fine tuning occurs where after the pretraning phase
#the model just behaves like a document completer, in case of latest versions of GPT, it would just
#babble out the internet, cause thats what it was really trained on, then in the fine tuning phase
#a labeller would group the qs and the ans so as to say, so that the GPT actually behaves like
#a chatbot, here approached like RLHF which is reinforcement learning from human feedback is implemented

In [1]:
#also put the entire loop part of bigram.py in main as
#when its called for inference in generation.py, we dont
#want the training process to start again when we call the 
#model class