# Building custom Nano GPT

In [10]:
import torch

In [7]:
# Load the raw training corpus. The encoding is passed explicitly so that the
# set of characters (and therefore vocab_size) does not depend on the
# platform's default locale encoding.
with open("../data/input.txt", "r", encoding="utf-8") as f:
    text = f.read()

print(text[:1000])

# The vocabulary is every distinct character of the corpus, in sorted order.
chars = sorted(set(text))
vocab_size = len(chars)
print(''.join(chars)) # 65 possible characters for model
print(vocab_size)

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it be done: away, away!

Second Citizen:
One word, good citizens.

First Citizen:
We are accounted poor citizens, the patricians good.
What authority surfeits on would relieve us: if they
would yield us but the superfluity, while it were
wholesome, we might guess they relieved us humanely;
but they think we are too dear: the leanness that
afflicts us, the object of our misery, is as an
inventory to particularise their abundance; our
sufferance is a gain to them Let us revenge this with
our pikes, ere we become rakes: for the gods know I
speak this in hunger for bread, not in thirst for revenge.



In [25]:
# Character-level tokenizer: every vocabulary character gets the integer id of
# its position in `chars`, and the reverse table maps ids back to characters.
itos = dict(enumerate(chars))
stoi = {ch: i for i, ch in itos.items()}


def encode(s):
    """Turn a string into a list of integer token ids, one per character."""
    return [stoi[c] for c in s]


def decode(l):
    """Turn a sequence of integer token ids back into the string they spell."""
    return ''.join(itos[i] for i in l)


# Round-trip check: decoding an encoded string must give the string back.
encode("hii there"), decode(encode("hii there"))



([46, 47, 47, 1, 58, 46, 43, 56, 43], 'hii there')

In [12]:
## applying encoding to the entire dataset
## `encode` returns one integer id per character, so `data` is a 1-D tensor
## with as many entries as `text` has characters.
data = torch.tensor(encode(text), dtype=torch.long)
print(data.shape, data.dtype, data[:1000])

torch.Size([1115394]) torch.int64 tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59,  1, 39, 56, 43,  1, 39, 50, 50,
         1, 56, 43, 57, 53, 50, 60, 43, 42,  1, 56, 39, 58, 46, 43, 56,  1, 58,
        53,  1, 42, 47, 43,  1, 58, 46, 39, 52,  1, 58, 53,  1, 44, 39, 51, 47,
        57, 46, 12,  0,  0, 13, 50, 50, 10,  0, 30, 43, 57, 53, 50, 60, 43, 42,
         8,  1, 56, 43, 57, 53, 50, 60, 43, 42,  8,  0,  0, 18, 47, 56, 57, 58,
         1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 18, 47, 56, 57, 58,  6,  1, 63,
        53, 59,  1, 49, 52, 53, 61,  1, 15, 39, 47, 59, 57,  1, 25, 39, 56, 41,
      

In [13]:
# Hold out the tail of the corpus for validation: the first 90% of the tokens
# are used for training and the remaining 10% for validation.
n = int(0.9*len(data))
train_data = data[:n]
val_data = data[n:]

In [15]:
# A chunk of block_size+1 consecutive tokens contains block_size training
# examples: for every t, the first t+1 tokens of `x` are the context and
# `y[t]` (the token that follows them) is the target.
block_size = 8
x = train_data[:block_size]
y = train_data[1:block_size+1]  # `x` shifted one position to the left
for t in range(block_size):
    context = x[:t+1]
    target = y[t]
    print(f'When input is {context}, the target is {target}')


When input is tensor([18]), the target is 47
When input is tensor([18, 47]), the target is 56
When input is tensor([18, 47, 56]), the target is 57
When input is tensor([18, 47, 56, 57]), the target is 58
When input is tensor([18, 47, 56, 57, 58]), the target is 1
When input is tensor([18, 47, 56, 57, 58,  1]), the target is 15
When input is tensor([18, 47, 56, 57, 58,  1, 15]), the target is 47
When input is tensor([18, 47, 56, 57, 58,  1, 15, 47]), the target is 58


In [17]:
## Generalising it to get batches of data in parallel
# Fix the RNG seed so the randomly sampled batches below are reproducible.
torch.manual_seed(1337)
batch_size = 4 # How many independent sequences will we process in parallel?
block_size = 8 # What is the maximum context length for predictions?

def get_batch(split: str)->tuple[torch.Tensor, torch.Tensor]:
    """Sample a random batch of (input, target) sequences.

    Reads the module-level `train_data`, `val_data`, `batch_size` and
    `block_size` at call time.

    Args:
        split: 'train' selects `train_data`; any other value selects `val_data`.

    Returns:
        A pair `(x, y)` of tensors, each of shape (batch_size, block_size).
        `y` holds the same windows as `x` shifted one token to the right, so
        `y[b, t]` is the token that follows `x[b, :t+1]`.
    """
    source = train_data if split == 'train' else val_data
    # Random window starts; the upper bound leaves room for the shifted target window.
    starts = torch.randint(len(source) - block_size, (batch_size,))
    inputs = torch.stack([source[s:s+block_size] for s in starts])
    targets = torch.stack([source[s+1:s+block_size+1] for s in starts])
    return inputs, targets

# Draw one training batch and show it.
xb, yb = get_batch('train')
print(f'--------- training data ---------')
print(f'inputs shape: {xb.shape}, inputs: {xb}')
print(f'targets shape: {yb.shape}, targets: {yb}')

# Spell out every (context, target) pair contained in the batch: each of the
# batch_size rows contributes block_size examples of growing context length.
for b in range(batch_size):
    for t in range(block_size):
        context = xb[b, :t+1]
        target = yb[b, t]
        print(f'When input is {context}, the target is {target}')

--------- training data ---------
inputs shape: torch.Size([4, 8]), inputs: tensor([[24, 43, 58,  5, 57,  1, 46, 43],
        [44, 53, 56,  1, 58, 46, 39, 58],
        [52, 58,  1, 58, 46, 39, 58,  1],
        [25, 17, 27, 10,  0, 21,  1, 54]])
targets shape: torch.Size([4, 8]), targets: tensor([[43, 58,  5, 57,  1, 46, 43, 39],
        [53, 56,  1, 58, 46, 39, 58,  1],
        [58,  1, 58, 46, 39, 58,  1, 46],
        [17, 27, 10,  0, 21,  1, 54, 39]])
When input is tensor([24]), the target is 43
When input is tensor([24, 43]), the target is 58
When input is tensor([24, 43, 58]), the target is 5
When input is tensor([24, 43, 58,  5]), the target is 57
When input is tensor([24, 43, 58,  5, 57]), the target is 1
When input is tensor([24, 43, 58,  5, 57,  1]), the target is 46
When input is tensor([24, 43, 58,  5, 57,  1, 46]), the target is 43
When input is tensor([24, 43, 58,  5, 57,  1, 46, 43]), the target is 39
When input is tensor([44]), the target is 53
When input is tensor([44, 5

In [32]:
# Create a super simple Bigram language model
import torch.nn as nn
import torch.nn.functional as F

class BigramLanguageModel(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)
        
    def forward(self, idx:torch.Tensor, targets:torch.Tensor=None)->tuple[torch.Tensor, torch.Tensor|None]:
        logits = self.token_embedding_table(idx) # (B,T,C) (Batch, Time, Channel)
        ## Pytorch expects inputs to be (B, C, T)
       
        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)
        
        return logits, loss
    
    def generate(self, idx:torch.Tensor, max_new_tokens:int)->torch.Tensor:
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            logits, loss = self(idx)
            # focus only on the last time step
            logits = logits[:,-1,:] # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1) # (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples = 1) # (B,1)
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        return idx
            

# Sanity-check the untrained model on the batch held in xb / yb.
bigram_model = BigramLanguageModel(vocab_size)
logits, loss = bigram_model(xb, yb)
# With targets supplied, forward returns the logits flattened to (B*T, C).
print(logits.shape)
print(f'loss: {loss}')
# Start generation from a single sequence holding only token id 0, then decode
# the sampled ids back to text.
idx =torch.zeros((1,1), dtype=torch.long)
print(decode(bigram_model.generate(idx=idx, max_new_tokens=100)[0].tolist()))

torch.Size([32, 65])
loss: 5.152285575866699

QhL-HuDhoRNnwaeeI$Otzuuj&q:aZKa.tFwaauP:XRfx;'wmrcb:XCJlU'JM-kGPlOoQn!Rknx.L?eYUCq?!3y.vEL;Hrsftaur 


In [33]:
# using the AdamW optimizer (Adam with decoupled weight decay) for our model
optimizer = torch.optim.AdamW(bigram_model.parameters(), lr = 1e-3)

In [35]:
# Rebinding batch_size here also changes what get_batch returns, because
# get_batch reads the global batch_size every time it is called.
batch_size = 32
for steps in range(10000):
    # sample a fresh random batch of training data
    Xb, Yb = get_batch('train')
    # forward pass: logits and cross-entropy loss for this batch
    logits, loss = bigram_model(Xb, Yb)
    # clear the gradients left over from the previous step before back-propagating
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()


# loss of the last batch only (a single sample, not an average over the run)
print(loss.item())

2.4560248851776123


In [38]:
# Sample 300 new tokens from the trained model, reusing the `idx` start context
# defined earlier, and decode them to text.
print(decode(bigram_model.generate(idx=idx, max_new_tokens=300)[0].tolist()))


Thooncore,
An wis ne madentathelers Lordd sinqucee, p
epr
I'G y nd myethat gor, totaucaticos d t fomorce
TESo!
TENOMET:
KENCUCa seis
Sowhiforraty:
Yels;
Nucare but thal mamndy erkspitrouie per'
Tome ring spr thionousthit fr f sin?

Anhipe oRNISwourisprd INCLEard hered tere, weartoowhed hen ffr inay 


# Self Attention
- For attention, the token at position t can't talk to any future token (positions t+1 to n). It can only talk to the tokens at positions 0 to t (including t).
- Easiest way to add attention:
    - We average everything up to and including the current index. This gives the model a feature vector that summarises the current token with respect to its history. It's super lossy and we lose a ton of information.
    - To make this more efficient than using two for loops, we can use a lower triangular matrix and multiply it with our matrix of tokens.
    - A third, equally efficient way is to use softmax.

In [39]:
# Toy example: a random tensor with B sequences, T time steps and C channels.
# NOTE: this rebinds `x`, which an earlier cell used for the token context.
B,T,C = 4,8,2
x = torch.randn(B,T,C)

In [44]:
## Mathematical trick used in self attention
## Version 1: explicit loops. Super inefficient, but a good starting point.
## xbow[b, t] is the mean of x[b, 0], ..., x[b, t] (current step and everything before it).
xbow = torch.zeros(B, T, C)
for b in range(B):
    for t in range(T):
        xprev = x[b, :t+1]  # (t+1, C): every step up to and including t
        xbow[b, t] = xprev.mean(dim=0)

In [42]:
# first sequence of the toy batch: T rows of C values
x[0]

tensor([[-0.8554,  1.0345],
        [-0.3082,  0.1738],
        [-0.6142, -1.1057],
        [-0.7136, -0.1107],
        [-1.6504,  0.5479],
        [-0.6868, -1.3893],
        [-0.7580,  0.3641],
        [-1.1333, -1.5686]])

In [43]:
# running averages for the first sequence: row t is the mean of rows 0..t of x[0]
xbow[0]

tensor([[-0.8554,  1.0345],
        [-0.5818,  0.6041],
        [-0.5926,  0.0342],
        [-0.6229, -0.0020],
        [-0.8284,  0.1079],
        [-0.8048, -0.1416],
        [-0.7981, -0.0694],
        [-0.8400, -0.2568]])

In [45]:
## A better way is to use matrix multiplication
## Row t of the lower-triangular matrix has ones in columns 0..t; dividing each
## row by its sum turns it into uniform averaging weights over those columns.
wts = torch.ones(T, T).tril()
wts = wts / wts.sum(dim=1, keepdim=True)
wts

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3333, 0.3333, 0.3333, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2500, 0.2500, 0.2500, 0.2500, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2000, 0.2000, 0.2000, 0.2000, 0.2000, 0.0000, 0.0000, 0.0000],
        [0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.0000, 0.0000],
        [0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.0000],
        [0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250]])

In [None]:
# `wts` is (T,T) and `x` is (B,T,C): the matrix product broadcasts `wts` over
# the batch dimension, so every sequence is multiplied by the same averaging matrix.
xbow2 = wts @ x # (T,T) @ (B,T,C) -> (B,T,T) @ (B,T,C) -> (B,T,C)
xbow2

tensor([[[-0.8554,  1.0345],
         [-0.5818,  0.6041],
         [-0.5926,  0.0342],
         [-0.6229, -0.0020],
         [-0.8284,  0.1079],
         [-0.8048, -0.1416],
         [-0.7981, -0.0694],
         [-0.8400, -0.2568]],

        [[-1.1680,  1.2887],
         [-0.1681,  0.4155],
         [-0.4383,  0.3877],
         [-0.5793,  0.3586],
         [-0.3653,  0.0070],
         [-0.1741, -0.0691],
         [-0.0907,  0.0813],
         [ 0.1746, -0.1572]],

        [[ 0.3871,  0.0862],
         [-0.5570,  0.2855],
         [-0.6132,  0.1448],
         [-0.3922,  0.0110],
         [-0.4893,  0.1267],
         [-0.3423,  0.0108],
         [-0.3644, -0.1229],
         [-0.6157, -0.1085]],

        [[-0.4037,  0.9445],
         [-0.2456,  0.5254],
         [-0.4653,  0.0725],
         [-0.8623, -0.0356],
         [-0.6472, -0.2169],
         [-0.4025, -0.1045],
         [-0.4845, -0.0572],
         [-0.3515, -0.1585]]])

In [48]:
# check if xbow2 is same as xbow (equal up to floating-point tolerance)
torch.allclose(xbow, xbow2)

True

In [None]:
# version3: Using softmax
# Lower-triangular mask: position t may only look at positions 0..t.
tril = torch.ones(T, T).tril()
# Start from the interaction strength / affinity between tokens (how much of
# each past token we want to aggregate; all zeros here, i.e. no preference).
# Tokens from the future cannot communicate with the current token, so their
# entries are set to -inf and receive zero weight after the softmax.
# The softmax normalises every row to sum to 1; the result is the attention
# weights, i.e. how much of each past token fuses into the current token.
wts = (
    torch.zeros(T, T)
    .masked_fill(tril == 0, float('-inf'))
    .softmax(dim=-1)
)
wts

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3333, 0.3333, 0.3333, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2500, 0.2500, 0.2500, 0.2500, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2000, 0.2000, 0.2000, 0.2000, 0.2000, 0.0000, 0.0000, 0.0000],
        [0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.0000, 0.0000],
        [0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.0000],
        [0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250]])