# **GPT Dev from scratch**

## **Data Exploration**

In [83]:
# Read the whole training corpus into memory as one string.
# The encoding is given explicitly so the decoded text does not depend on the
# platform's default locale encoding.
with open("/content/input.txt", "r", encoding="utf-8") as f:
  text = f.read()

In [84]:
# Report the size of the corpus, measured in characters
print("length of character: ",len(text))

length of character:  1115394


In [85]:
# Preview the first 1000 characters of the dataset
print(text[:1000])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it be done: away, away!

Second Citizen:
One word, good citizens.

First Citizen:
We are accounted poor citizens, the patricians good.
What authority surfeits on would relieve us: if they
would yield us but the superfluity, while it were
wholesome, we might guess they relieved us humanely;
but they think we are too dear: the leanness that
afflicts us, the object of our misery, is as an
inventory to particularise their abundance; our
sufferance is a gain to them Let us revenge this with
our pikes, ere we become rakes: for the gods know I
speak this in hunger for bread, not in thirst for revenge.



In [86]:
# Vocabulary: every distinct character that occurs in the corpus, in sorted order
chars = sorted(set(text))
vocab_size = len(chars)
print("".join(chars))
print(vocab_size)


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
65


## **Tokenization**

The code below performs character-level tokenization: `encode` maps a string to a list of integer ids, and `decode` maps a list of ids back to the string.

In [87]:
# Character-level tokenizer: lookup tables in both directions
stoi = dict(zip(chars, range(len(chars))))  # character -> integer id
itos = dict(enumerate(chars))               # integer id -> character

def encode(s):
  """Convert a string into the list of integer ids of its characters."""
  return [stoi[c] for c in s]

def decode(l):
  """Convert a sequence of integer ids back into the text they spell."""
  return ''.join(itos[i] for i in l)

In [88]:
# Round-trip check: show the ids for a sample string, then decode them back
print(encode("Hello world"))
print(decode(encode("Hello world")))

[20, 43, 50, 50, 53, 1, 61, 53, 56, 50, 42]
Hello world


Encoding entire dataset for the model.

In [89]:
# Turn the whole corpus into a single 1-D tensor of integer token ids
import torch
data = torch.tensor(encode(text), dtype=torch.long)
print(data.shape, data.dtype)
print(data[0:1000])  # ids of the first 1000 characters

torch.Size([1115394]) torch.int64
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59,  1, 39, 56, 43,  1, 39, 50, 50,
         1, 56, 43, 57, 53, 50, 60, 43, 42,  1, 56, 39, 58, 46, 43, 56,  1, 58,
        53,  1, 42, 47, 43,  1, 58, 46, 39, 52,  1, 58, 53,  1, 44, 39, 51, 47,
        57, 46, 12,  0,  0, 13, 50, 50, 10,  0, 30, 43, 57, 53, 50, 60, 43, 42,
         8,  1, 56, 43, 57, 53, 50, 60, 43, 42,  8,  0,  0, 18, 47, 56, 57, 58,
         1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 18, 47, 56, 57, 58,  6,  1, 63,
        53, 59,  1, 49, 52, 53, 61,  1, 15, 39, 47, 59, 57,  1, 25, 39, 56, 41,
      

**Train Test Split**

In [90]:
# Hold out the tail of the corpus for validation: first 90% train, last 10% val
n = int(len(data) * 0.9)
train_data, val_data = data[:n], data[n:]

Building the inputs and targets used to train the model: for every prefix of a chunk, the target is the token that follows it.

In [91]:
block_size=8 # number of tokens in each training chunk, i.e. the context length

In [92]:
# A chunk of block_size+1 tokens contains block_size (context -> target) examples
x = train_data[0:block_size + 1]
y = train_data[1:block_size + 1]  # targets: the same tokens shifted left by one
for t in range(block_size):
  context = x[:t + 1]  # all tokens up to and including position t
  target = y[t]        # the token that follows that context
  print(f"when input is {context} target is {target}")

when input is tensor([18]) target is 47
when input is tensor([18, 47]) target is 56
when input is tensor([18, 47, 56]) target is 57
when input is tensor([18, 47, 56, 57]) target is 58
when input is tensor([18, 47, 56, 57, 58]) target is 1
when input is tensor([18, 47, 56, 57, 58,  1]) target is 15
when input is tensor([18, 47, 56, 57, 58,  1, 15]) target is 47
when input is tensor([18, 47, 56, 57, 58,  1, 15, 47]) target is 58


## **Data Loaders**

In [93]:
# Seed the RNG so the randomly sampled batches below are repeatable
torch.manual_seed(1337)
batch_size=4 # number of independent sequences processed in parallel
block_size=8 # Maximum context length for prediction

def get_batch(split):
  """Sample a random mini-batch of input/target windows from one data split.

  Reads the module-level `train_data`, `val_data`, `block_size` and
  `batch_size`. `split == 'train'` selects the training data; any other value
  selects the validation data.

  Returns a pair `(x, y)` of stacked windows, each holding `batch_size` rows of
  `block_size` tokens; `y` is `x` shifted one position to the right in the
  source data, so `y[b, t]` is the token that follows `x[b, :t+1]`.
  """
  source = train_data if split == 'train' else val_data
  # random start offsets, chosen so the shifted target window still fits
  starts = torch.randint(len(source) - block_size, (batch_size,))
  inputs = torch.stack([source[s:s + block_size] for s in starts])
  targets = torch.stack([source[s + 1:s + block_size + 1] for s in starts])
  return inputs, targets

# Draw one training batch and show both tensors
xb, yb = get_batch('train')
print("inputs: ")
print(xb.shape)
print(xb)
print("targets: ")
print(yb.shape)
print(yb)

print('----')

# Unroll the batch into the individual (context -> target) examples it holds
for b in range(batch_size):    # batch dimension
  for t in range(block_size):  # time or block dimension
    context = xb[b, :t + 1]
    target = yb[b, t]
    print(f"when input is {context.tolist()} target is {target}")

inputs: 
torch.Size([4, 8])
tensor([[24, 43, 58,  5, 57,  1, 46, 43],
        [44, 53, 56,  1, 58, 46, 39, 58],
        [52, 58,  1, 58, 46, 39, 58,  1],
        [25, 17, 27, 10,  0, 21,  1, 54]])
targets: 
torch.Size([4, 8])
tensor([[43, 58,  5, 57,  1, 46, 43, 39],
        [53, 56,  1, 58, 46, 39, 58,  1],
        [58,  1, 58, 46, 39, 58,  1, 46],
        [17, 27, 10,  0, 21,  1, 54, 39]])
----
when input is [24] target is 43
when input is [24, 43] target is 58
when input is [24, 43, 58] target is 5
when input is [24, 43, 58, 5] target is 57
when input is [24, 43, 58, 5, 57] target is 1
when input is [24, 43, 58, 5, 57, 1] target is 46
when input is [24, 43, 58, 5, 57, 1, 46] target is 43
when input is [24, 43, 58, 5, 57, 1, 46, 43] target is 39
when input is [44] target is 53
when input is [44, 53] target is 56
when input is [44, 53, 56] target is 1
when input is [44, 53, 56, 1] target is 58
when input is [44, 53, 56, 1, 58] target is 46
when input is [44, 53, 56, 1, 58, 46] target 

## **Initial Stage**

**Using Bigram Language Model**

More about the [bigram model](https://www.analyticsvidhya.com/blog/2019/08/comprehensive-guide-language-model-nlp-python-code/).

In the next piece of code, the bigram model computes a probability distribution over the next token and samples its prediction from that distribution. In this model the tokens do not communicate with each other: each prediction is based only on the token at the current time step.

In [94]:
import torch
import torch.nn as nn
from torch.nn import functional as F
# Fix the RNG seed so parameter initialisation and sampling below are repeatable
torch.manual_seed(1337)

class BigramLanguageModel(nn.Module):
    """Bigram language model: next-token logits depend on the current token only.

    The model is a single (vocab_size x vocab_size) embedding table; the row
    selected by a token id is used directly as the logits for the next token.
    """

    def __init__(self, vocab_size):
        """Create the lookup table for a vocabulary of `vocab_size` tokens."""
        super().__init__()
        # Each token reads the logits for the next token straight from this table
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):
        """Compute next-token logits and, if targets are given, the loss.

        Args:
            idx: (B,T) tensor of integer token ids.
            targets: optional (B,T) tensor of integer token ids.

        Returns:
            `(logits, loss)`. Without targets, logits has shape (B,T,C) and
            loss is None. With targets, logits is flattened to (B*T,C) and
            loss is the cross-entropy between the logits and the targets.
        """
        logits = self.token_embedding_table(idx)  # (B,T,C) (batch,time,channel)

        B, T, C = logits.shape
        if targets is None:
            loss = None
        else:
            # F.cross_entropy expects (N,C) logits and (N,) targets
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    @torch.no_grad()  # sampling needs no gradients; avoids building an autograd graph per step
    def generate(self, idx, max_new_tokens):
        """Extend each sequence in `idx` by `max_new_tokens` sampled tokens.

        Args:
            idx: (B,T) tensor of integer token ids forming the current context.
            max_new_tokens: number of tokens to append.

        Returns:
            (B, T + max_new_tokens) tensor: the input followed by the samples.
        """
        for _ in range(max_new_tokens):
            # Get the predictions (no targets, so the loss is None and ignored)
            logits, _ = self(idx)
            # Focus only on the last time step
            logits = logits[:, -1, :]  # becomes (B,C)
            # Apply softmax to turn logits into probabilities
            probs = F.softmax(logits, dim=-1)  # (B,C)
            # Sample one token id per sequence from the distribution
            idx_next = torch.multinomial(probs, num_samples=1)  # (B,1)
            # Append the sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1)  # (B,T+1)
        return idx

# Instantiate the model and evaluate it once on the sample batch
model = BigramLanguageModel(vocab_size)
logits, loss = model(xb, yb)
print(logits.shape)
print(loss)

# Sample 100 tokens from the untrained model, starting from a single token with id 0
idx = torch.zeros((1, 1), dtype=torch.long)
print(decode(model.generate(idx, max_new_tokens=100)[0].tolist()))

torch.Size([32, 65])
tensor(4.8786, grad_fn=<NllLossBackward0>)

Sr?qP-QWktXoL&jLDJgOLVz'RIoDqHdhsV&vLLxatjscMpwLERSPyao.qfzs$Ys$zF-w,;eEkzxjgCKFChs!iWW.ObzDnxA Ms$3


## **Training the Bigram Model**

In [95]:
# Create an AdamW optimizer over all model parameters with learning rate 1e-3
optimizer=torch.optim.AdamW(model.parameters(),lr=1e-3)

Run the following cell several times to see how far the loss can be driven down.

In [96]:
batch_size = 32  # get_batch reads this module-level value
epochs = 1000    # number of optimisation steps (one mini-batch each)
for steps in range(epochs):
  # clear gradients left over from the previous step
  optimizer.zero_grad(set_to_none=True)
  # draw a fresh mini-batch and evaluate the loss on it
  xb, yb = get_batch('train')
  logits, loss = model(xb, yb)
  # backpropagate and update the parameters
  loss.backward()
  optimizer.step()

# loss of the final mini-batch only
print(loss.item())

3.7218432426452637


After enough training steps, the loss bottoms out at around 2.4–2.5.

In [97]:
# Generate 400 new tokens after training, starting from a single token with id 0
idx=torch.zeros((1,1),dtype=torch.long)
print(decode(model.generate(idx,max_new_tokens=400)[0].tolist()))


olylvLLko'TMyatyIoconxad.?-tNSqYPsx&bF.oiR;BD$dZBMZv'K f bRSmIKptRPly:AUC&$zLK,qUEy&Ay;ZxjKVhmrdagC-bTop-QJe.H?x
JGF&pwst-P sti.hlEsu;w:w a BG:tLhMk,epdhlay'sVzLq--ERwXUzDnq-bn czXxxI&V&Pynnl,s,Ioto!uvixwC-IJXElrgm C-.bcoCPJ
IMphsevhO AL!-K:AIkpre,
rPHEJUzV;P?uN3b?ohoRiBUENoV3B&jumNL;Aik,
xf -IEKROn JSyYWW?n 'ay;:weO'AqVzPyoiBL? seAX3Dot,iy.xyIcf r!!ul-Koi:x pZrAQly'v'a;vEzN
BwowKo'MBqF$PPFb
CjYX3


## **TRICK FOR SELF-ATTENTION**

In [104]:
# Toy input used by the averaging examples that follow
torch.manual_seed(1337)
B, T, C = 4, 8, 2  # batch, time, channels
x = torch.randn((B, T, C))
x.shape

torch.Size([4, 8, 2])

## **V2**

In [105]:
# V2
# For every position t, average the vectors of positions 0..t (itself and its past)

xbow = torch.zeros((B, T, C))  # here "bow" stands for bag of words.
for b in range(B):
  for t in range(T):
    xprev = x[b, :t + 1]  # (t+1,C)
    xbow[b, t] = xprev.mean(dim=0)

The loop above can be replaced by a single matrix multiplication.

In [100]:
# Multiplying by an all-ones matrix: every row of c is the column-wise sum of b
torch.manual_seed(42)
a = torch.ones(3, 3)
b = torch.randint(0, 10, (3, 2)).float()
c = torch.matmul(a, b)

print("a=")
print(a)
print("--")
print("b=")
print(b)
print("--")
print("c=")
print(c)

a=
tensor([[1., 1., 1.],
        [1., 1., 1.],
        [1., 1., 1.]])
--
b=
tensor([[2., 7.],
        [6., 4.],
        [6., 5.]])
--
c=
tensor([[14., 16.],
        [14., 16.],
        [14., 16.]])


In [101]:
# Lower-triangular matrix of ones: row i has ones in columns 0..i and zeros after
torch.tril(torch.ones(3,3))

tensor([[1., 0., 0.],
        [1., 1., 0.],
        [1., 1., 1.]])

In [102]:
# Row-normalised lower-triangular matrix: row i of c is the mean of rows 0..i of b
torch.manual_seed(42)
a = torch.tril(torch.ones(3, 3))
a = a / a.sum(dim=1, keepdim=True)  # each row now sums to 1
b = torch.randint(0, 10, (3, 2)).float()
c = torch.matmul(a, b)

print("a=")
print(a)
print("--")
print("b=")
print(b)
print("--")
print("c=")
print(c)

a=
tensor([[1.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000],
        [0.3333, 0.3333, 0.3333]])
--
b=
tensor([[2., 7.],
        [6., 4.],
        [6., 5.]])
--
c=
tensor([[2.0000, 7.0000],
        [4.0000, 5.5000],
        [4.6667, 5.3333]])


In [107]:
# Implementing it (V2): the running average as one matrix multiplication
wei = torch.tril(torch.ones(T, T))
wei = wei / wei.sum(dim=1, keepdim=True)  # row t averages positions 0..t
xbow2 = wei @ x  # (T,T), broadcast over the batch, @ (B,T,C) ----> (B,T,C)
torch.allclose(xbow, xbow2, atol=1e-6)

True

In [110]:
# Print the first batch element from both methods to see that they are identical
print(xbow[0])
print("-------------")
print(xbow2[0])

tensor([[ 0.1808, -0.0700],
        [-0.0894, -0.4926],
        [ 0.1490, -0.3199],
        [ 0.3504, -0.2238],
        [ 0.3525,  0.0545],
        [ 0.0688, -0.0396],
        [ 0.0927, -0.0682],
        [-0.0341,  0.1332]])
-------------
tensor([[ 0.1808, -0.0700],
        [-0.0894, -0.4926],
        [ 0.1490, -0.3199],
        [ 0.3504, -0.2238],
        [ 0.3525,  0.0545],
        [ 0.0688, -0.0396],
        [ 0.0927, -0.0682],
        [-0.0341,  0.1332]])


## **V3**

In [111]:
# V3: the same running average, obtained from a masked softmax
tril = torch.tril(torch.ones(T, T))
wei = torch.zeros((T, T))
# positions above the diagonal get -inf, so softmax gives them zero weight
wei = wei.masked_fill(tril == 0, float('-inf'))
wei = F.softmax(wei, dim=-1)  # equal scores -> uniform weights over positions 0..t
xbow3 = wei @ x
torch.allclose(xbow, xbow3, atol=1e-6)

True

## **V4**

In [117]:
# V4: Self-attention
torch.manual_seed(1337)
B, T, C = 4, 8, 32  # batch, time, channels
x = torch.randn(B, T, C)

# Single head self attention: three bias-free linear projections of the input
head_size = 16
key = nn.Linear(C, head_size, bias=False)
query = nn.Linear(C, head_size, bias=False)
value = nn.Linear(C, head_size, bias=False)

k = key(x)    # (B,T,16)
q = query(x)  # (B,T,16)
v = value(x)  # (B,T,16)

# Affinity of every query position with every key position
wei = q @ k.transpose(-2, -1)  # (B,T,16) @ (B,16,T) ---> (B,T,T)

# Mask out positions above the diagonal, then normalise each row into weights
tril = torch.tril(torch.ones(T, T))
wei = wei.masked_fill(tril == 0, float('-inf'))
wei = F.softmax(wei, dim=-1)

# Each output row is the weighted sum of the value vectors
out = wei @ v

out.shape

torch.Size([4, 8, 16])

In [118]:
# Display the attention weights computed in the previous cell
wei

tensor([[[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.1574, 0.8426, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.2088, 0.1646, 0.6266, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.5792, 0.1187, 0.1889, 0.1131, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.0294, 0.1052, 0.0469, 0.0276, 0.7909, 0.0000, 0.0000, 0.0000],
         [0.0176, 0.2689, 0.0215, 0.0089, 0.6812, 0.0019, 0.0000, 0.0000],
         [0.1691, 0.4066, 0.0438, 0.0416, 0.1048, 0.2012, 0.0329, 0.0000],
         [0.0210, 0.0843, 0.0555, 0.2297, 0.0573, 0.0709, 0.2423, 0.2391]],

        [[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.1687, 0.8313, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.2477, 0.0514, 0.7008, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.4410, 0.0957, 0.3747, 0.0887, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.0069, 0.0456, 0.0300, 0.7748, 0.1427, 0.0000, 0.0000, 0.0000],
         [0.0660, 0.089