<a href="https://colab.research.google.com/github/Badar-e-alam/Automatic-Data-Annotation/blob/main/Transformer_from_scratch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
from torch import nn
import os
import numpy as np
from torch.nn import functional as F

In [None]:
# Load the raw training corpus (a UTF-8 plain-text file) into memory as a single string.
input_path="/content/drive/MyDrive/input.txt"
with open(input_path,mode="r",encoding="utf-8") as corpus_file:
  text=corpus_file.read()

In [None]:
# Report the corpus size in characters (roughly one million).
num_chars=len(text)
print(f"length of the characters in dataset {num_chars}")

length of the characters in dataset 1115394


# Hyperparameters

In [None]:
# Seed PyTorch's global RNG up front so weight initialisation and batch sampling are repeatable.
torch.manual_seed(1337)

# --- batching ---
batch_size=64  # how many independent sequences are processed in parallel
block_size=256  # maximum context length (in tokens) used for a prediction

# --- model architecture ---
n_embd=384  # embedding width used throughout the model
n_head=6  # attention heads per transformer block
n_layer=6  # number of stacked transformer blocks
dropout=0.2  # dropout probability used by every nn.Dropout in the model

# --- optimisation and evaluation schedule ---
learning_rate=3e-4
max_iter=5000  # number of optimizer steps
eval_interval=500  # estimate train/val loss every this many steps
eval_iter=200  # batches averaged per loss estimate

# --- hardware ---
device="cuda" if torch.cuda.is_available() else "cpu"

In [None]:
# Display which device was selected above ("cuda" or "cpu").
device

'cuda'

In [None]:
# Preview the first 500 characters of the corpus: plain text in a Shakespeare-style play format.
print(text[:500])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it be done: away, away!

Second Citizen:
One word, good citizens.

First Citizen:
We are accounted poor


In [None]:
# Character-level vocabulary: every distinct character of the corpus, in sorted order.
chars=sorted(set(text))
vocab_size=len(chars)
print(*chars)
print(f"length of vocab {vocab_size}")


   ! $ & ' , - . 3 : ; ? A B C D E F G H I J K L M N O P Q R S T U V W X Y Z a b c d e f g h i j k l m n o p q r s t u v w x y z
length of vocab 65


In [None]:
# Lookup tables between characters and integer ids (an id is the character's index in `chars`).
str_int={ch:idx for idx,ch in enumerate(chars)}
int_str=dict(enumerate(chars))

def encoder(s):
  """Map a string to the list of integer ids of its characters."""
  return [str_int[ch] for ch in s]

def decoder(l):
  """Map an iterable of integer ids back to the string they spell."""
  return "".join(int_str[i] for i in l)

In [None]:
# Sanity-check the character tokenizer: show the ids of one sample, then round-trip another.
sample_ids=encoder("hi people")
print(sample_ids)
round_trip=decoder(encoder("Hi people"))
print(round_trip)

[46, 47, 1, 54, 43, 53, 54, 50, 43]
Hi people


In [None]:
#chatgpt uses the this tokenizer 
!pip install tiktoken
import tiktoken
enc=tiktoken.get_encoding("gpt2")
print("word to word embedign",enc.encode("Hi people"))
assert enc.decode(enc.encode("Hi people"))=="Hi people"

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
word to word embedign [17250, 661]


In [None]:
def decode(l):
  """Turn an iterable whose items expose .item() (e.g. a 1-D tensor of ids) into text via int_str."""
  return "".join(int_str[i.item()] for i in l)

In [None]:
# Tokenize the whole corpus into a 1-D tensor of character ids and compare it with the raw text.
data=torch.tensor(encoder(text),dtype=torch.long)
print(data.shape,data.dtype)
print("original text",text[:500])
first_ids=data[:500]  # ids of the same 500 characters printed on the line above
print(first_ids)
print("Geting back from decorder",decode(first_ids))

In [None]:
# Split the token stream: the first 90% is used for training, the remaining tail for validation.
total_tokens=len(data)
n=int(total_tokens*0.9)
print(f"data length: {total_tokens} and 90 percent of it: {n}")
train_data,val_data=data[:n],data[n:]

data length: 1115394 and 90 percent of it: 1003854


In [None]:
# The whole corpus cannot be fed to the model at once, so training works on chunks of block_size tokens.
train_data[:block_size]
# Within a chunk every prefix is a training example whose target is the token that follows it:
# e.g. when the context ends at id 18 the model should produce 47, when it ends at 47 it should
# produce 56, and so on.

In [None]:
# Spell out the (context -> target) pairs contained in a single chunk of block_size tokens.
x=train_data[:block_size]
y=train_data[1:block_size+1]  # the same chunk shifted one position ahead of x
for t in range(block_size):
  context,target=x[:t+1],y[t]
  #print(f"When input is {context} model will generate {target}")

In [None]:
def get_batch(split):
  """Sample a random batch of (input, target) chunks from one data split.

  Args:
    split: 'train' selects train_data; any other value selects val_data.

  Returns:
    A pair of tensors, each stacking batch_size chunks of block_size tokens; the target
    chunk is the input chunk shifted one token ahead.
  """
  source = val_data if split != 'train' else train_data
  # Random start offsets; the upper bound leaves room for the one-token-shifted target chunk.
  starts = torch.randint(len(source) - block_size, (batch_size,))
  inputs = torch.stack([source[s:s + block_size] for s in starts])
  targets = torch.stack([source[s + 1:s + block_size + 1] for s in starts])
  return inputs, targets
# Draw one training batch to inspect. The inputs are bound to `inputs` (not `input`) so the
# built-in input() is not shadowed for the rest of the notebook.
inputs,label=get_batch("train")
#print("input_shape",inputs.shape)
#print(inputs)
#print("Target ",label.shape)
#print(label)
#print("___________________")
# Walk every (context -> target) pair of the batch; uncomment the print to list them.
for b in range(batch_size):
  for t in range(block_size):
    context=inputs[b,:t+1]
    target=label[b,t]
    #print(f"when the input is {context.tolist()} the target is: {target}")

In [None]:
class Head(nn.Module):
    """One head of causal (masked) self-attention.

    Args:
        head_size: width of the key, query and value projections of this head.

    forward(x) takes x of shape (B, T, n_embd) with T <= block_size and returns a tensor of
    shape (B, T, head_size).
    """

    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        # Lower-triangular mask: position t may only attend to positions <= t.
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        B, T, C = x.shape
        k = self.key(x)    # (B, T, head_size)
        q = self.query(x)  # (B, T, head_size)
        # Attention scores ("affinities"). Scaled dot-product attention divides by the square
        # root of the key/query width (head_size), not of the embedding width C.
        wei = q @ k.transpose(-2, -1) * k.shape[-1] ** -0.5  # (B, T, hs) @ (B, hs, T) -> (B, T, T)
        # Block attention to future positions before normalising.
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf'))  # (B, T, T)
        wei = F.softmax(wei, dim=-1)  # (B, T, T)
        wei = self.dropout(wei)
        # Weighted aggregation of the values.
        v = self.value(x)  # (B, T, head_size)
        out = wei @ v  # (B, T, T) @ (B, T, hs) -> (B, T, hs)
        return out

In [None]:
class MultiHeadAttention(nn.Module):
  """Several self-attention heads run on the same input, concatenated and projected to n_embd.

  Args:
    num_heads: number of Head modules to create.
    head_size: output width of each head.
  """
  def __init__(self,num_heads,head_size):
    super().__init__()
    self.heads=nn.ModuleList([Head(head_size) for _ in range(num_heads)])
    # The concatenated head outputs are num_heads*head_size wide. Size the projection from
    # that instead of assuming it equals n_embd (it does not when n_embd % num_heads != 0).
    self.proj=nn.Linear(num_heads*head_size,n_embd)
    self.dropout=nn.Dropout(dropout)

  def forward(self,x):
    # Concatenate the per-head outputs along the feature dimension.
    out=torch.cat([h(x) for h in self.heads],dim=-1)
    out=self.dropout(self.proj(out))
    return out

In [None]:
from torch.nn.modules.dropout import Dropout
class FeedForwad(nn.Module):
  """Two-layer MLP: expand to 4*n_embd, apply ReLU, project back to n_embd, then dropout."""

  def __init__(self,n_embd):
    super().__init__()
    hidden_dim=4*n_embd
    layers=[
        nn.Linear(n_embd,hidden_dim),
        nn.ReLU(),
        nn.Linear(hidden_dim,n_embd),
        nn.Dropout(dropout),
    ]
    self.net=nn.Sequential(*layers)

  def forward(self,x):
    transformed=self.net(x)
    return transformed


In [None]:
class Block(nn.Module):
  """Transformer block: self-attention, then a feed-forward network.

  Each sub-layer is applied to a layer-normalised copy of its input and the result is added
  back onto that input (residual connection).
  """

  def __init__(self,n_embd,n_head):
    super().__init__()
    # n_head heads of width n_embd//n_head each.
    self.sa=MultiHeadAttention(n_head,n_embd//n_head)
    self.feedforward=FeedForwad(n_embd)
    self.ln1=nn.LayerNorm(n_embd)
    self.ln2=nn.LayerNorm(n_embd)

  def forward(self,x):
    attended=x+self.sa(self.ln1(x))
    return attended+self.feedforward(self.ln2(attended))
    

In [None]:
class BigramLanguageModel(nn.Module):
  """Character-level language model built from a stack of transformer Blocks.

  Despite the name, forward() runs token + position embeddings through n_layer Blocks, a final
  LayerNorm and a linear head; it is not a plain bigram lookup table.
  """

  def __init__(self):
    super().__init__()
    self.token_embedding_table=nn.Embedding(vocab_size,n_embd)
    # One learned vector per position 0..block_size-1. The attribute spelling is kept as is so
    # that existing state_dict keys do not change.
    self.position_emdedding_tabel=nn.Embedding(block_size,n_embd)
    self.blocks=nn.Sequential(*[Block(n_embd,n_head=n_head) for _ in range(n_layer)])
    self.ln_f=nn.LayerNorm(n_embd)
    self.lm_head=nn.Linear(n_embd,vocab_size)

  def forward(self,idx,targets=None):
    """Compute next-token logits and, when targets are given, the cross-entropy loss.

    Args:
      idx: (B, T) integer token ids with T <= block_size.
      targets: optional (B, T) integer token ids to predict.

    Returns:
      (logits, loss). Without targets, logits is (B, T, vocab_size) and loss is None.
      With targets, logits is flattened to (B*T, vocab_size) and loss is a scalar tensor.
    """
    B,T=idx.shape
    tok_emd=self.token_embedding_table(idx)  # (B, T, n_embd)
    # Create the position ids on the input's own device rather than the notebook-level
    # `device` global, so the model also works when it lives on a different device.
    pos_emd=self.position_emdedding_tabel(torch.arange(T,device=idx.device))  # (T, n_embd)
    x=tok_emd+pos_emd  # broadcast over the batch dimension
    x=self.blocks(x)
    x=self.ln_f(x)
    logits=self.lm_head(x)  # (B, T, vocab_size)

    if targets is None:
      return logits,None

    # F.cross_entropy expects (N, C) logits and (N,) targets, so merge batch and time.
    B,T,C=logits.shape
    logits=logits.view(B*T,C)
    targets=targets.view(B*T)
    loss=F.cross_entropy(logits,targets)
    return logits,loss

  @torch.no_grad()
  def generate(self,idx,max_new_tokens):
    """Autoregressively sample max_new_tokens tokens after the context idx.

    Sampling runs in eval mode with gradients disabled, so dropout does not perturb the
    predicted distribution and no autograd graph is built. The module's previous
    train/eval mode is restored afterwards.

    Args:
      idx: (B, T) integer token ids forming the current context.
      max_new_tokens: number of tokens to append.

    Returns:
      (B, T + max_new_tokens) tensor: the context followed by the sampled tokens.
    """
    was_training=self.training
    self.eval()
    try:
      for _ in range(max_new_tokens):
        # The position table only covers block_size positions: keep the most recent ones.
        idx_cond=idx[:,-block_size:]
        logits,_=self(idx_cond)
        logits=logits[:,-1,:]  # last time step only -> (B, vocab_size)
        probs=F.softmax(logits,dim=-1)  # (B, vocab_size)
        idx_next=torch.multinomial(probs,num_samples=1)  # (B, 1)
        idx=torch.cat((idx,idx_next),dim=1)  # (B, T+1)
    finally:
      self.train(was_training)
    return idx

  def generate2(self,idx,max_new_tokens):
    """Alias of generate(), kept for existing callers.

    The earlier copy of this loop fed the whole, uncropped sequence to the model, which
    fails as soon as the sequence grows past block_size; delegating removes that failure.
    """
    return self.generate(idx,max_new_tokens)


In [None]:
@torch.no_grad()
def estimate_loss():
    """Estimate the mean loss of `model` on the 'train' and 'val' splits.

    For each split, eval_iter random batches are drawn with get_batch and their losses
    averaged. The model is put in eval mode for the measurement and switched back to
    train mode before returning.

    Returns:
        dict mapping 'train' and 'val' to a 0-d tensor holding the mean loss.
    """
    model.eval()
    averages = {}
    for split_name in ('train', 'val'):
        per_batch = torch.zeros(eval_iter).to(device)
        for step in range(eval_iter):
            inputs, labels = get_batch(split_name)
            _, batch_loss = model(inputs.to(device), labels.to(device))
            per_batch[step] = batch_loss.item()
        averages[split_name] = per_batch.mean()
    model.train()
    return averages


In [None]:

model= BigramLanguageModel()
m = model.to(device)  # `m` is the name the generation cell uses for the model
# create a PyTorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

# The loop variable is `step`, not `iter`, so the built-in iter() is not shadowed.
for step in range(max_iter):

    # Every eval_interval steps, and on the very last step so the final state is also
    # reported, estimate the loss on the train and val splits.
    if step % eval_interval == 0 or step == max_iter - 1:
        losses = estimate_loss()
        print(f"step {step}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # sample a batch of data and move it next to the model
    xb, yb = get_batch('train')
    xb, yb = xb.to(device), yb.to(device)

    # forward pass; logits and loss are produced on the inputs' device already,
    # so no further .to(device) is needed
    logits, loss = model(xb, yb)

    # backward pass and parameter update
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

# generate from the model


step 0: train loss 4.3662, val loss 4.3594
step 500: train loss 2.0144, val loss 2.0930
step 1000: train loss 1.6022, val loss 1.7851
step 1500: train loss 1.4368, val loss 1.6354
step 2000: train loss 1.3390, val loss 1.5593
step 2500: train loss 1.2781, val loss 1.5274
step 3000: train loss 1.2253, val loss 1.5055
step 3500: train loss 1.1820, val loss 1.4822
step 4000: train loss 1.1453, val loss 1.4729
step 4500: train loss 1.1090, val loss 1.4864


In [None]:
# Sample 500 new tokens, starting from a single token with id 0, and print them as text.
context=torch.zeros((1,1),dtype=torch.long,device=device)

generated=m.generate(context,max_new_tokens=500)
generated_ids=generated[0].tolist()  # first (and only) sequence of the batch
print(decoder(generated_ids))

In [None]:
# Display the seed context tensor that was passed to generate().
context