## Pre-Requisites

In [1]:
from dotenv import load_dotenv
import os
load_dotenv()

True

In [2]:
input_path = os.getenv("input_path")


In [3]:
with open (input_path, 'r', encoding='utf-8') as file:
    text = file.read()

In [4]:
print("Length of the input text: ", len(text))

Length of the input text:  1115393


In [5]:
print("First 500 characters of the input text: \n\n")
print(text[:500])  

First 500 characters of the input text: 


First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it be done: away, away!

Second Citizen:
One word, good citizens.

First Citizen:
We are accounted poor


## Building the Vocabulary

In [6]:
chars = sorted(list(set(text)))
vocab_size = len(chars)

print("".join(chars))
print("Vocabulary size: ", vocab_size)


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
Vocabulary size:  65


### Tokenization Strategy

In [7]:
stoi = { ch:i for i,ch in enumerate(chars) }
itos = { i:ch for i,ch in enumerate(chars) }
encode = lambda s: [stoi[c] for c in s] # This is the encoder part, it takes a string as input and returns a list of integers as output
decode = lambda l: ''.join([itos[i] for i in l]) # This is the decoder part, it takes a list of integers as input and returns a string as output 

For this research we are using a simple character level tokenizer and not the sub-word level tokenizer like the tiktoken or sentenepiece 

In [8]:
print(encode("I am Atharva"))
print(decode(encode("I am Atharva")))

[21, 1, 39, 51, 1, 13, 58, 46, 39, 56, 60, 39]
I am Atharva


So we basically have used our vocabulary size and mapped different character with numbers and then we encode it using that mapping and similarly decode it using the reverse logic.

In [9]:
import torch 
print(torch.backends.mps.is_available())
print(torch.backends.mps.is_built()) 

True
True


In [10]:
data = torch.tensor(encode(text), dtype = torch.long)
print(data.shape, data.dtype)
print(data[:500])

torch.Size([1115393]) torch.int64
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59,  1, 39, 56, 43,  1, 39, 50, 50,
         1, 56, 43, 57, 53, 50, 60, 43, 42,  1, 56, 39, 58, 46, 43, 56,  1, 58,
        53,  1, 42, 47, 43,  1, 58, 46, 39, 52,  1, 58, 53,  1, 44, 39, 51, 47,
        57, 46, 12,  0,  0, 13, 50, 50, 10,  0, 30, 43, 57, 53, 50, 60, 43, 42,
         8,  1, 56, 43, 57, 53, 50, 60, 43, 42,  8,  0,  0, 18, 47, 56, 57, 58,
         1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 18, 47, 56, 57, 58,  6,  1, 63,
        53, 59,  1, 49, 52, 53, 61,  1, 15, 39, 47, 59, 57,  1, 25, 39, 56, 41,
      

## Train / Val Split

In [11]:
n = int(0.9*len(data))
train_data = data[:n]
val_data = data[n:]

In [12]:
len(train_data), len(val_data)

(1003853, 111540)

In [13]:
block_size = 8 # This is the size of the data that we will pass in one pass to the transformer model
train_data[:block_size+1]

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58])

In [14]:
x = train_data[:block_size]
y = train_data[1:block_size+1]

In [15]:
for t in range(block_size):
    context = x[:t+1]
    target = y[t]
    print(f"When the input is {context}, the target is {target}")

When the input is tensor([18]), the target is 47
When the input is tensor([18, 47]), the target is 56
When the input is tensor([18, 47, 56]), the target is 57
When the input is tensor([18, 47, 56, 57]), the target is 58
When the input is tensor([18, 47, 56, 57, 58]), the target is 1
When the input is tensor([18, 47, 56, 57, 58,  1]), the target is 15
When the input is tensor([18, 47, 56, 57, 58,  1, 15]), the target is 47
When the input is tensor([18, 47, 56, 57, 58,  1, 15, 47]), the target is 58


So if we give the input character length of 1, then our transformer can predict and it can predict till we give length equal to block size but if we give more than that, it will then start truncating the input as the maximum input is 8.

In [16]:
device = 'mps' if torch.backends.mps.is_available() else 'cpu'
print(f"Using device: {device}")

if device == 'mps':
    print(f"Using Metal GPU")
else:
    print(f"Using CPU")

Using device: mps
Using Metal GPU


In [17]:
torch.manual_seed(1337)

batch_size = 4
def get_batch(split):
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    x, y = x.to(device), y.to(device)
    return x, y

xb, yb = get_batch('train')
print("Inputs: ")
print(xb.shape)
print(xb)

print("\n\nTargets: ")
print(yb.shape)
print(yb)

print("----\n\n")

for b in range(batch_size):
    for t in range(block_size):
        context = xb[b, :t+1]
        target = yb[b,t]
        print(f"When input is: {context.tolist()}, the target is: {target.item()}")

Inputs: 
torch.Size([4, 8])
tensor([[53, 59,  6,  1, 58, 56, 47, 40],
        [49, 43, 43, 54,  1, 47, 58,  1],
        [13, 52, 45, 43, 50, 53,  8,  0],
        [ 1, 39,  1, 46, 53, 59, 57, 43]], device='mps:0')


Targets: 
torch.Size([4, 8])
tensor([[59,  6,  1, 58, 56, 47, 40, 59],
        [43, 43, 54,  1, 47, 58,  1, 58],
        [52, 45, 43, 50, 53,  8,  0, 26],
        [39,  1, 46, 53, 59, 57, 43,  0]], device='mps:0')
----


When input is: [53], the target is: 59
When input is: [53, 59], the target is: 6
When input is: [53, 59, 6], the target is: 1
When input is: [53, 59, 6, 1], the target is: 58
When input is: [53, 59, 6, 1, 58], the target is: 56
When input is: [53, 59, 6, 1, 58, 56], the target is: 47
When input is: [53, 59, 6, 1, 58, 56, 47], the target is: 40
When input is: [53, 59, 6, 1, 58, 56, 47, 40], the target is: 59
When input is: [49], the target is: 43
When input is: [49, 43], the target is: 43
When input is: [49, 43, 43], the target is: 54
When input is: [49, 43, 

In [18]:
print(xb)

tensor([[53, 59,  6,  1, 58, 56, 47, 40],
        [49, 43, 43, 54,  1, 47, 58,  1],
        [13, 52, 45, 43, 50, 53,  8,  0],
        [ 1, 39,  1, 46, 53, 59, 57, 43]], device='mps:0')


This is the actual data that we are going to feed to the transformer

## Transformer Model Building

In [22]:
import torch
import torch.nn as nn 
from torch.nn import functional as F
torch.manual_seed(1337)

device = 'mps' if torch.backends.mps.is_available() else 'cpu'
print(f"Using device: {device}")

class BigramLanguageModel(nn.Module):
    
    def __init__(self, vocab_size):
        super().__init__()
        # Create embedding table: maps each character to a vector of size vocab_size
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets = None):
        # Get embeddings for input tokens
        logits = self.token_embedding_table(idx) # Currently in the shape (B, T, C)

        # If no targets provided, we're just generating (no loss needed)
        if targets == None:
            loss = None
        else:
            # We are changing the shape of logits from (B, T, C) to (B*T, C) so that we can compute the loss using nn.CrossEntropyLoss, converting it to 2 dimensions so that the channel 
            # dimension is the second dimension as per the requirement of nn.CrossEntropyLoss and we will do the same for targets as well.
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss
    
    def generate(self, idx, max_new_tokens):
        # Generate max_new_tokens new characters one at a time
        for _ in range(max_new_tokens):
            # Get predictions from the model
            logits, loss = self(idx)

            # Focus only on the last time step's predictions
            logits = logits[:, -1, :]

            # Apply softmax to convert logits to probabilities
            probs = F.softmax(logits, dim = -1)

            # Sample the next token from the probability distribution
            idx_next = torch.multinomial(probs, num_samples = 1)

            # Append the sampled token to the running sequence
            idx = torch.cat((idx, idx_next), dim = 1)

        return idx

Using device: mps


In [24]:
# Create model and move to Metal GPU
m = BigramLanguageModel(vocab_size).to(device)

# Move data batches to Metal GPU
xb, yb = xb.to(device), yb.to(device)

# Get predictions and loss
logits, loss = m(xb, yb)
print(logits.shape)  # (batch_size, block_size, vocab_size)
print(loss)

torch.Size([32, 65])
tensor(4.7525, device='mps:0', grad_fn=<NllLossBackward0>)


In [26]:
# Generate 100 new tokens starting from a newline character (index 0)
# Create starting tensor directly on Metal device
start_idx = torch.zeros((1,1), dtype=torch.long, device=device)
generated = m.generate(idx=start_idx, max_new_tokens=100)
print(decode(generated[0].tolist()))


UwasU3TKMYMj-fEcqNPVQbRe.OGOuUfZEiewNy::dl-jkczCOIiHeg EggeuTpbDbYhMYVcoS:rXcuSrng&?ofeOAZrYftKyLXcz


## Creating a PyTorch Optimizer

In [27]:
optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3)

In [30]:
batch_size  = 32

for steps in range(10000):

    # select a batch of data
    xb, yb = get_batch('train')

    # evaluate the loss
    logits, loss = m(xb, yb)
    optimizer.zero_grad(set_to_none = True)
    loss.backward()
    optimizer.step()

    print(loss.item())

3.6721177101135254
3.4195034503936768
3.554668664932251
3.5639002323150635
3.6065502166748047
3.5714240074157715
3.5082268714904785
3.516465425491333
3.4981212615966797
3.5273213386535645
3.5587916374206543
3.4893698692321777
3.545210838317871
3.4814209938049316
3.6327245235443115
3.6286215782165527
3.5297632217407227
3.55173921585083
3.5442099571228027
3.5351815223693848
3.5235321521759033
3.557861328125
3.583927631378174
3.5715746879577637
3.56547212600708
3.518418312072754
3.5554051399230957
3.561654806137085
3.504202365875244
3.6575636863708496
3.4487438201904297
3.5674800872802734
3.4843430519104004
3.520141839981079
3.4228527545928955
3.5801799297332764
3.565098762512207
3.5463428497314453
3.4698381423950195
3.544498920440674
3.5077643394470215
3.651529550552368
3.5060548782348633
3.4964399337768555
3.423705577850342
3.575643539428711
3.5256776809692383
3.6102371215820312
3.542004108428955
3.5313563346862793
3.5704798698425293
3.5773720741271973
3.457606315612793
3.6320960521698


In [31]:
print("\nGenerated Text After Training: \n\n")
start_idx = torch.zeros((1,1), dtype=torch.long, device=device)
generated = m.generate(idx=start_idx, max_new_tokens=500)
print(decode(generated[0].tolist()))


Generated Text After Training: 



HEayo in mpery way avend oubur'er sickes bokecard dhiceny

He tw el fe oupise he, lbustselownthous;
I m w
T:
TIONTouly me EUjerk mondrn itheland's oe, oghithet f, badogienthofathatey foueay wad,
ureisold array n
ICoyockind m murs, in mamybalorthyongmyooe, d Vofetthindy st
HBy:

My, meay alsteanerm to, oupomp rede d pre h, gavitfithrer'GENUpsts lathindKIO:
Berouerse IOLUEDzXjulKathicerire.
II IS:
IOMISpequt f keithunghant d An myorerrofe find ans I andoovyonon-hu he nd youlliler pt iciHATh y onee


## Self Attention Trick 

In [33]:
# Using an example to understand the trick:

torch.manual_seed(1337)

B, T, C = 4, 8, 2 # batch, time, channel

x = torch.randn(B, T, C)
x.shape

torch.Size([4, 8, 2])

We want the information to flow only in the forward direction. That means we dont want the previous tokens to access the future tokens as this will harm the training.

Example, the token at time 5 should only be able to interact with tokens at time 4,3,2,1 and not at all with tokens at time 6,7,8. 
As we are outputting 1 token at a time, this makes sure that at every time step we have previous context and we dont have access to future tokens in order to make the model generalise well on 
unseen data.

In [34]:
xbow = torch.zeros((B, T, C))
for b in range(B):
    for t in range(T):
        xprev = x[b, :t+1]
        xbow[b,t] = torch.mean(xprev, 0)

In [35]:
x[0]

tensor([[ 0.1808, -0.0700],
        [-0.3596, -0.9152],
        [ 0.6258,  0.0255],
        [ 0.9545,  0.0643],
        [ 0.3612,  1.1679],
        [-1.3499, -0.5102],
        [ 0.2360, -0.2398],
        [-0.9211,  1.5433]])

In [36]:
xbow[0]

tensor([[ 0.1808, -0.0700],
        [-0.0894, -0.4926],
        [ 0.1490, -0.3199],
        [ 0.3504, -0.2238],
        [ 0.3525,  0.0545],
        [ 0.0688, -0.0396],
        [ 0.0927, -0.0682],
        [-0.0341,  0.1332]])

In [38]:
torch.manual_seed(1337)

a = torch.ones(3,3)
b = torch.randint(0,10,(3,2)).float()

c = a@b 

In [39]:
print(a)

tensor([[1., 1., 1.],
        [1., 1., 1.],
        [1., 1., 1.]])


In [40]:
print(b)

tensor([[5., 7.],
        [2., 0.],
        [5., 3.]])


In [41]:
print(c)

tensor([[12., 10.],
        [12., 10.],
        [12., 10.]])


In this previous operation c = a@b, all the tokens have access to all the other tokens, this is an example of what we dont want.

In order to solve that future access, we have a function in torch called tril. See the output of that below, keeping the values as zeros means that the weighted average will simply be zero 
and it will block future access.

In [42]:
torch.tril(torch.ones(3,3))

tensor([[1., 0., 0.],
        [1., 1., 0.],
        [1., 1., 1.]])

In [43]:
torch.manual_seed(1337)

a = torch.tril(torch.ones(3,3))
b = torch.randint(0,10,(3,2)).float()

c = a@b 

In [44]:
print(a)

tensor([[1., 0., 0.],
        [1., 1., 0.],
        [1., 1., 1.]])


In [45]:
print(b)

tensor([[5., 7.],
        [2., 0.],
        [5., 3.]])


In [46]:
print(b)

tensor([[5., 7.],
        [2., 0.],
        [5., 3.]])


Now we have tackled the future access part but now we will take an average of it to give equal priority to all the tokens previous to the token currently being accessed.

In [47]:
torch.manual_seed(1337)

a = torch.tril(torch.ones(3,3))
a = a / torch.sum(a, dim=1, keepdim=True)
b = torch.randint(0,10,(3,2)).float()

c = a@b 

In [48]:
print(a)

tensor([[1.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000],
        [0.3333, 0.3333, 0.3333]])


In [49]:
print(b)

tensor([[5., 7.],
        [2., 0.],
        [5., 3.]])


In [50]:
print(c)

tensor([[5.0000, 7.0000],
        [3.5000, 3.5000],
        [4.0000, 3.3333]])


Creating a weights tril

In [52]:
wei = torch.tril(torch.ones(T,T))
wei = wei / wei.sum(1, keepdim=True)
wei

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3333, 0.3333, 0.3333, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2500, 0.2500, 0.2500, 0.2500, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2000, 0.2000, 0.2000, 0.2000, 0.2000, 0.0000, 0.0000, 0.0000],
        [0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.0000, 0.0000],
        [0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.0000],
        [0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250]])

In [53]:
xbow2 = wei @ x
torch.allclose(xbow, xbow2)

True

In [54]:
xbow[0], xbow2[0]

(tensor([[ 0.1808, -0.0700],
         [-0.0894, -0.4926],
         [ 0.1490, -0.3199],
         [ 0.3504, -0.2238],
         [ 0.3525,  0.0545],
         [ 0.0688, -0.0396],
         [ 0.0927, -0.0682],
         [-0.0341,  0.1332]]),
 tensor([[ 0.1808, -0.0700],
         [-0.0894, -0.4926],
         [ 0.1490, -0.3199],
         [ 0.3504, -0.2238],
         [ 0.3525,  0.0545],
         [ 0.0688, -0.0396],
         [ 0.0927, -0.0682],
         [-0.0341,  0.1332]]))

We can also use Softmax function to achieve the xbow variable

In [55]:
tril = torch.tril(torch.ones(T,T))
wei = torch.zeros((T,T))
wei = wei.masked_fill(tril == 0, float('-inf'))
wei = F.softmax(wei, dim = -1)
xbow3 = wei @ x
torch.allclose(xbow, xbow3)

True