In [1]:
import numpy as np
import os
import torch
import random

In [2]:
# Read the whole training corpus into memory as a single string.
# The encoding is pinned so the character vocabulary built from `text`
# does not depend on the platform's locale default.
with open('./input.txt', encoding='utf-8') as f:
    text = f.read()

In [4]:
print(len(text))

1115394


In [5]:
print(text[:1000])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it be done: away, away!

Second Citizen:
One word, good citizens.

First Citizen:
We are accounted poor citizens, the patricians good.
What authority surfeits on would relieve us: if they
would yield us but the superfluity, while it were
wholesome, we might guess they relieved us humanely;
but they think we are too dear: the leanness that
afflicts us, the object of our misery, is as an
inventory to particularise their abundance; our
sufferance is a gain to them Let us revenge this with
our pikes, ere we become rakes: for the gods know I
speak this in hunger for bread, not in thirst for revenge.



## Unique characters in the text

In [6]:
# Vocabulary: every distinct character of the corpus, in sorted order.
chars = sorted(set(text))
vocab_size = len(chars)
print(''.join(chars))
print('Unique characters count = ', vocab_size)


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
Unique characters count =  65


## Tokenize using the char to index mapping

In [15]:
# Character-level tokenizer: each vocabulary character maps to its index in `chars`.
stoi = {ch: i for i, ch in enumerate(chars)}
itos = {i: ch for i, ch in enumerate(chars)}
print('String to index mapping ', stoi)
print('\nIndex to string mapping', itos)


def encode(s):
    """Return the list of integer token ids for the characters of string `s`.

    Raises KeyError if `s` contains a character that is not in `stoi`.
    """
    return [stoi[c] for c in s]


def decode(l):
    """Return the string obtained by mapping every id in `l` back to its character."""
    return ''.join(itos[i] for i in l)


## sample
# The label is built from the sample itself so it cannot drift from what is actually encoded.
sample = 'hii there'
print('\n')
print(f'Encoded string for sample {sample!r} is ', encode(sample))
print(f'Decoded string for sample {sample!r} is ', decode(encode(sample)))

String to index mapping  {'\n': 0, ' ': 1, '!': 2, '$': 3, '&': 4, "'": 5, ',': 6, '-': 7, '.': 8, '3': 9, ':': 10, ';': 11, '?': 12, 'A': 13, 'B': 14, 'C': 15, 'D': 16, 'E': 17, 'F': 18, 'G': 19, 'H': 20, 'I': 21, 'J': 22, 'K': 23, 'L': 24, 'M': 25, 'N': 26, 'O': 27, 'P': 28, 'Q': 29, 'R': 30, 'S': 31, 'T': 32, 'U': 33, 'V': 34, 'W': 35, 'X': 36, 'Y': 37, 'Z': 38, 'a': 39, 'b': 40, 'c': 41, 'd': 42, 'e': 43, 'f': 44, 'g': 45, 'h': 46, 'i': 47, 'j': 48, 'k': 49, 'l': 50, 'm': 51, 'n': 52, 'o': 53, 'p': 54, 'q': 55, 'r': 56, 's': 57, 't': 58, 'u': 59, 'v': 60, 'w': 61, 'x': 62, 'y': 63, 'z': 64}

Index to string mapping {0: '\n', 1: ' ', 2: '!', 3: '$', 4: '&', 5: "'", 6: ',', 7: '-', 8: '.', 9: '3', 10: ':', 11: ';', 12: '?', 13: 'A', 14: 'B', 15: 'C', 16: 'D', 17: 'E', 18: 'F', 19: 'G', 20: 'H', 21: 'I', 22: 'J', 23: 'K', 24: 'L', 25: 'M', 26: 'N', 27: 'O', 28: 'P', 29: 'Q', 30: 'R', 31: 'S', 32: 'T', 33: 'U', 34: 'V', 35: 'W', 36: 'X', 37: 'Y', 38: 'Z', 39: 'a', 40: 'b', 41: 'c', 42:

## Tokenize the complete dataset

In [16]:
data = torch.tensor(encode(text))
print(data.shape, data.dtype)
print(data[:1000])


torch.Size([1115394]) torch.int64
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59,  1, 39, 56, 43,  1, 39, 50, 50,
         1, 56, 43, 57, 53, 50, 60, 43, 42,  1, 56, 39, 58, 46, 43, 56,  1, 58,
        53,  1, 42, 47, 43,  1, 58, 46, 39, 52,  1, 58, 53,  1, 44, 39, 51, 47,
        57, 46, 12,  0,  0, 13, 50, 50, 10,  0, 30, 43, 57, 53, 50, 60, 43, 42,
         8,  1, 56, 43, 57, 53, 50, 60, 43, 42,  8,  0,  0, 18, 47, 56, 57, 58,
         1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 18, 47, 56, 57, 58,  6,  1, 63,
        53, 59,  1, 49, 52, 53, 61,  1, 15, 39, 47, 59, 57,  1, 25, 39, 56, 41,
      

## Split the data into train (90%) and validation (10%) sets

In [22]:
# The first 90% of the token stream is used for training, the remainder for validation.
n = int(len(data) * 0.9)
train_data, val_data = data[:n], data[n:]
print('Val data chars ', val_data.shape)
print('Train data chars ', train_data.shape)

Val data chars  torch.Size([111540])
Train data chars  torch.Size([1003854])


## Block size (maximum context length): take block_size + 1 tokens, because the token at position t+1 is the target for the context ending at position t

In [24]:
block_size = 8
train_data[:block_size+1]

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58])

In [26]:
# One chunk of block_size + 1 tokens yields block_size training examples:
# the context is x[:i+1] and the target is the token that follows it.
x = train_data[:block_size]
y = train_data[1:block_size+1]
for i in range(block_size):
    input_data, output_data = x[:i+1], y[i]
    print(f'When inputs is {input_data} the target : {output_data}')

When inputs is tensor([18]) the target : 47
When inputs is tensor([18, 47]) the target : 56
When inputs is tensor([18, 47, 56]) the target : 57
When inputs is tensor([18, 47, 56, 57]) the target : 58
When inputs is tensor([18, 47, 56, 57, 58]) the target : 1
When inputs is tensor([18, 47, 56, 57, 58,  1]) the target : 15
When inputs is tensor([18, 47, 56, 57, 58,  1, 15]) the target : 47
When inputs is tensor([18, 47, 56, 57, 58,  1, 15, 47]) the target : 58


## batch generator for training

In [27]:
batch_size = 4

In [359]:
def batch_generator(split):
    """Sample a random batch of (input, target) windows from one data split.

    `split == 'train'` selects `train_data`; any other value selects `val_data`.
    The module-level `block_size` and `batch_size` are read at call time.
    Returns `(x, y)`, each of shape (batch_size, block_size), where `y` is
    `x` shifted one token to the right.
    """
    source = val_data if split != 'train' else train_data
    # One random start offset per batch row; the upper bound leaves room
    # for the one-token-shifted target window.
    starts = torch.randint(0, len(source) - block_size, (batch_size,))
    contexts = [source[s: s + block_size] for s in starts]
    followers = [source[s + 1: s + block_size + 1] for s in starts]
    return torch.stack(contexts), torch.stack(followers)

In [72]:
inp, targets = batch_generator("train")

In [73]:

# Print every (context, target) pair contained in the sampled batch.
# The batch tensors are named `inp` and `targets`.
print(inp.shape, targets.shape)
for i in range(inp.shape[0]):  # rows of the batch
    for j in range(inp.shape[1]):  # positions inside one row
        input_data = inp[i][:j+1]
        output_data = targets[i][j]
        print(f'When inputs is {input_data} the target : {output_data}')
    print('\n')

torch.Size([4, 8]) torch.Size([4, 8])
When inputs is tensor([46]) the target : 1
When inputs is tensor([46,  1]) the target : 51
When inputs is tensor([46,  1, 51]) the target : 43
When inputs is tensor([46,  1, 51, 43]) the target : 8
When inputs is tensor([46,  1, 51, 43,  8]) the target : 0
When inputs is tensor([46,  1, 51, 43,  8,  0]) the target : 0
When inputs is tensor([46,  1, 51, 43,  8,  0,  0]) the target : 19
When inputs is tensor([46,  1, 51, 43,  8,  0,  0, 19]) the target : 24


When inputs is tensor([1]) the target : 39
When inputs is tensor([ 1, 39]) the target : 1
When inputs is tensor([ 1, 39,  1]) the target : 45
When inputs is tensor([ 1, 39,  1, 45]) the target : 43
When inputs is tensor([ 1, 39,  1, 45, 43]) the target : 52
When inputs is tensor([ 1, 39,  1, 45, 43, 52]) the target : 58
When inputs is tensor([ 1, 39,  1, 45, 43, 52, 58]) the target : 50
When inputs is tensor([ 1, 39,  1, 45, 43, 52, 58, 50]) the target : 43


When inputs is tensor([59]) the targ

## Implement a bigram language model

In [74]:
import torch.nn as nn
from torch.nn import functional as F

In [174]:
class BigramLanguageModel(nn.Module):
    """Bigram model: next-token logits are looked up from the current token alone."""

    def __init__(self, vocab_size):
        super().__init__()
        # Row i holds the logits of the token that follows token i.
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):
        """Return `(logits, loss)` for a (B, T) tensor of token ids.

        Without `targets`, logits have shape (B, T, vocab_size) and loss is None.
        With `targets` (also (B, T)), logits are flattened to (B*T, vocab_size)
        and loss is their cross-entropy against the flattened targets.
        """
        logits = self.token_embedding_table(idx)  # (B, T, vocab_size)
        if targets is None:
            return logits, None
        n_classes = logits.shape[-1]
        # cross_entropy takes (N, C) logits together with (N,) class indices
        logits = logits.view(-1, n_classes)
        loss = F.cross_entropy(logits, targets.view(-1))
        return logits, loss

    def generate(self, idx, max_new_tokens):
        """Append `max_new_tokens` sampled tokens to the (B, T) context `idx` and return it."""
        for _ in range(max_new_tokens):
            logits, _loss = self(idx)
            # only the prediction at the last time step is needed
            last_step = logits[:, -1, :]  # (B, C)
            probs = F.softmax(last_step, dim=-1)  # (B, C)
            idx_next = torch.multinomial(probs, num_samples=1)  # (B, 1)
            idx = torch.cat((idx, idx_next), dim=1)  # (B, T+1)
        return idx


model = BigramLanguageModel(vocab_size)
logits, loss = model(inp, targets)
print(logits.shape)
print(loss)

context = torch.zeros((1, 1), dtype=torch.long)
print(decode(model.generate(context, max_new_tokens=100)[0].tolist()))

torch.Size([32, 65])
tensor(4.5572, grad_fn=<NllLossBackward0>)

ErQ
ymzlViXae:!ByjvAPQ.Xg3eT
 UpmH.EIqfmLGTCs'fMnvBPypPK!PqQN;AO.j-jDpN.S $;mgQCu$vtaTpgb Nz
lV!t-UI


In [319]:
optimizer = torch.optim.AdamW(model.parameters(), lr = 1e-3)

In [184]:
batch_size = 32
# 10,000 optimisation steps, each on a freshly sampled training batch.
for epoch in range(10000):
    xb, yb = batch_generator('train')

    optimizer.zero_grad(set_to_none=True)
    logits, loss = model(xb, yb)
    loss.backward()
    optimizer.step()

# loss of the last step only
print(loss.item())

2.474486827850342


In [187]:
context = torch.zeros((1, 1), dtype=torch.long)
print(decode(model.generate(context, max_new_tokens=200)[0].tolist()))


MPrg r t ngr se ha me IAninoud
Tith thas. is y,
A:
Whil incthery ivistcer.
LAuly d trer hederon meff t I inde ILORI frengur ss f the ccheee.
be by,

Indemy s sou erito ay, s de uran IELonn tho the KAn


## Mathematical trick for self attention

In [189]:
B, T, C = 4, 8, 2
x = torch.randn(B, T, C)
x.shape

torch.Size([4, 8, 2])

#### For tokens to interact, they need to communicate with each other. Each token may only look at the tokens before it, because the future tokens are what the model has to predict. For example, for the token at position t8 we average the tokens from t0 through t8 and store the result.

In [191]:
# Version 1 (explicit loops): xbow[b, t] is the mean of x[b, 0..t].
xbow = torch.zeros((B, T, C))
for b in range(B):
    for t in range(T):
        x_prev = x[b, :t+1]  # (t+1, C)
        xbow[b, t] = x_prev.mean(dim=0)

In [195]:
x[0]

tensor([[ 0.0495, -1.8143],
        [ 0.1679,  1.5913],
        [-1.9818, -0.5628],
        [-0.8134, -2.1756],
        [ 0.4517, -0.2635],
        [ 0.1707,  0.7125],
        [ 2.8604,  1.0229],
        [-0.2694,  1.2245]])

In [194]:
xbow[0]

tensor([[ 0.0495, -1.8143],
        [ 0.1087, -0.1115],
        [-0.5882, -0.2619],
        [-0.6445, -0.7403],
        [-0.4252, -0.6450],
        [-0.3259, -0.4187],
        [ 0.1293, -0.2128],
        [ 0.0794, -0.0331]])

#### Efficient way to perform this trick is using the matrix multiplication and using tril matrix

In [196]:
torch.tril(torch.ones(3, 3))

tensor([[1., 0., 0.],
        [1., 1., 0.],
        [1., 1., 1.]])

In [205]:
# Row-normalising a lower-triangular matrix of ones turns `a @ b`
# into a running average over the rows of b.
a = torch.tril(torch.ones(3, 3))
a = a / a.sum(dim=1, keepdim=True)
b = torch.randint(0, 10, (3, 3)).float()
c = a @ b
for label, mat in (('a = ', a), ('b = ', b), ('c = ', c)):
    print(label)
    print(mat)

a = 
tensor([[1.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000],
        [0.3333, 0.3333, 0.3333]])
b = 
tensor([[7., 1., 3.],
        [1., 1., 4.],
        [4., 6., 9.]])
c = 
tensor([[7.0000, 1.0000, 3.0000],
        [4.0000, 1.0000, 3.5000],
        [4.0000, 2.6667, 5.3333]])


In [208]:
## Rewriting the xbow script
# Row t of `wei` puts weight 1/(t+1) on positions 0..t and 0 on the rest,
# so `wei @ x` is the causal running mean computed in one batched matmul.
# (No pre-allocation of xbow2 is needed: the matmul creates the result.)
wei = torch.tril(torch.ones(T, T))
wei = wei / wei.sum(dim=1, keepdim=True)
xbow2 = wei @ x  ## (T, T), broadcast over the B batches, @ (B, T, C) --> (B, T, C)
xbow2[0]

tensor([[ 0.0495, -1.8143],
        [ 0.1087, -0.1115],
        [-0.5882, -0.2619],
        [-0.6445, -0.7403],
        [-0.4252, -0.6450],
        [-0.3259, -0.4187],
        [ 0.1293, -0.2128],
        [ 0.0794, -0.0331]])

In [211]:
torch.allclose(xbow, xbow2) ## comparing previous xbow and latest xbow2

True

In [225]:
## Using softmax
# Masked positions are set to -inf, so softmax gives them weight 0 and
# spreads the weight uniformly over the remaining (zero-valued) positions.
tril = torch.tril(torch.ones((T, T)))
wei = torch.zeros((T, T)).masked_fill(tril == 0, float('-inf'))
print('wei = ')
print(wei)
wei = F.softmax(wei, dim=1)
print('wei after softmax = ')
print(wei)
xbow3 = wei @ x
torch.allclose(xbow, xbow3) ## comparing previous xbow and latest xbow3

wei = 
tensor([[0., -inf, -inf, -inf, -inf, -inf, -inf, -inf],
        [0., 0., -inf, -inf, -inf, -inf, -inf, -inf],
        [0., 0., 0., -inf, -inf, -inf, -inf, -inf],
        [0., 0., 0., 0., -inf, -inf, -inf, -inf],
        [0., 0., 0., 0., 0., -inf, -inf, -inf],
        [0., 0., 0., 0., 0., 0., -inf, -inf],
        [0., 0., 0., 0., 0., 0., 0., -inf],
        [0., 0., 0., 0., 0., 0., 0., 0.]])
wei after softmax = 
tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3333, 0.3333, 0.3333, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2500, 0.2500, 0.2500, 0.2500, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2000, 0.2000, 0.2000, 0.2000, 0.2000, 0.0000, 0.0000, 0.0000],
        [0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.0000, 0.0000],
        [0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.0000],
        [0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.12

True

## Adding positional embeddings to the bigram model and cropping the generation context to at most block_size tokens


In [227]:
n_emb = 32

In [239]:

class BigramLanguageModel(nn.Module):
    """Token + position embeddings followed by a linear head (no attention yet).

    The module-level `vocab_size`, `n_emb` and `block_size` are read when the
    model is constructed; `block_size` is read again in `generate`.
    """

    def __init__(self):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, n_emb)
        # one learned vector for each position 0 .. block_size-1
        self.positional_encoder = nn.Embedding(block_size, n_emb)
        self.lm_head = nn.Linear(n_emb, vocab_size)

    def forward(self, idx, targets=None):
        """Return `(logits, loss)` for a (B, T) tensor of token ids, T <= block_size.

        Without `targets`, logits are (B, T, vocab_size) and loss is None.
        With `targets` (also (B, T)), logits are flattened to (B*T, vocab_size)
        and loss is the cross-entropy against the flattened targets.
        """
        B, T = idx.shape

        token_emb = self.token_embedding_table(idx)  # (B, T, n_emb)
        # Create the position ids on the same device as idx, so the lookup
        # does not mix devices when the model lives off-CPU.
        positions = torch.arange(T, device=idx.device)
        pos_emb = self.positional_encoder(positions)  # (T, n_emb)
        x = token_emb + pos_emb  # (B, T, n_emb), pos_emb broadcast over B
        logits = self.lm_head(x)  # (B, T, vocab_size)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)
        return logits, loss

    def generate(self, idx, max_new_tokens):
        """Append `max_new_tokens` sampled tokens to the (B, T) context `idx` and return it."""
        for _ in range(max_new_tokens):
            # the position table only has block_size rows, so feed at most
            # the last block_size tokens
            logits, loss = self(idx[:, -block_size:])
            # focus only on the last time step
            logits = logits[:, -1, :]  # (B, C)
            probs = F.softmax(logits, dim=-1)  # (B, C)
            idx_next = torch.multinomial(probs, num_samples=1)  # (B, 1)
            idx = torch.cat((idx, idx_next), dim=1)  # (B, T+1)
        return idx


model = BigramLanguageModel()
logits, loss = model(inp, targets)
print(logits.shape)
print(loss)

context = torch.zeros((1, 1), dtype=torch.long)
print(decode(model.generate(context, max_new_tokens=100)[0].tolist()))

torch.Size([32, 65])
tensor(4.6901, grad_fn=<NllLossBackward0>)

nStZrDUhF&RdVlw?y E!D'vaSVMZ:3a;ml;AJmmvOdCGgBiDmYJ&;bOjLyLJAa.&ow;JAbFWi'yl;;d!SyHz;i;vPvJ$oSpQLcCA


## Adding Multi head self attention Block

In [275]:
## Self-attention: every token emits a query and a key; the affinity between
## two tokens is the dot product of one token's query with the other's key.

B, T, C = 4, 8, 32
# x has to be re-created here: its channel dimension must match the new C
# that the linear layers below expect.
x = torch.randn(B, T, C)

head_size = 16
key = nn.Linear(C, head_size, bias=False)
query = nn.Linear(C, head_size, bias=False)
value = nn.Linear(C, head_size, bias=False)


k = key(x) #(B, T, head_size)
q = query(x) #(B, T, head_size)

wei = q @ k.transpose(-2, -1) #(B, T, head_size) @ (B, head_size, T) --> (B, T, T)

# causal mask: position t may only attend to positions 0..t
tril = torch.tril(torch.ones((T, T)))
wei = wei.masked_fill(tril==0, float('-inf'))
wei = F.softmax(wei, dim=-1)

v = value(x) #(B, T, head_size)
output = wei @ v #(B, T, T) @ (B, T, head_size) --> (B, T, head_size)

output.shape

torch.Size([4, 8, 16])

In [276]:
(wei[0])

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5771, 0.4229, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5207, 0.3891, 0.0903, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2874, 0.1826, 0.3358, 0.1943, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.1798, 0.7538, 0.0209, 0.0213, 0.0242, 0.0000, 0.0000, 0.0000],
        [0.0800, 0.0267, 0.0760, 0.1100, 0.4024, 0.3049, 0.0000, 0.0000],
        [0.0262, 0.1885, 0.1014, 0.0784, 0.0856, 0.2900, 0.2299, 0.0000],
        [0.0638, 0.0546, 0.0880, 0.1961, 0.1057, 0.2379, 0.0953, 0.1585]],
       grad_fn=<SelectBackward0>)

# Notes:
- Attention is a **communication mechanism**. Can be seen as nodes in a directed graph looking at each other and aggregating information with a weighted sum from all nodes that point to them, with data-dependent weights.
- There is no notion of space. Attention simply acts over a set of vectors. This is why we need to positionally encode tokens.
- Examples across the batch dimension are of course processed completely independently and never "talk" to each other
- In an "encoder" attention block just delete the single line that does masking with `tril`, allowing all tokens to communicate. This block here is called a "decoder" attention block because it has triangular masking, and is usually used in autoregressive settings, like language modeling.
- "self-attention" just means that the keys and values are produced from the same source as queries. In "cross-attention", the queries still get produced from x, but the keys and values come from some other, external source (e.g. an encoder module)
- "Scaled" attention additionally multiplies `wei` by 1/sqrt(head_size). This makes it so that when the inputs Q, K are unit variance, `wei` will be unit variance too, and Softmax will stay diffuse and not saturate too much. Illustration below

In [277]:
k = torch.randn(B,T,head_size)
q = torch.randn(B,T,head_size)
wei = q @ k.transpose(-2, -1) * head_size**-0.5

In [282]:
k.var()

tensor(0.9416)

In [279]:
q.var()

tensor(1.0104)

In [281]:
wei.var()

tensor(1.0879)

In [283]:
torch.softmax(torch.tensor([0.1, -0.2, 0.3, -0.2, 0.5]), dim=-1)

tensor([0.1925, 0.1426, 0.2351, 0.1426, 0.2872])

In [284]:
torch.softmax(torch.tensor([0.1, -0.2, 0.3, -0.2, 0.5])*8, dim=-1) # gets too peaky, converges to one-hot

tensor([0.0326, 0.0030, 0.1615, 0.0030, 0.8000])

In [287]:
class Head(nn.Module):
    """One head of causal self-attention.

    The module-level `n_emb` (input width) and `block_size` (largest sequence
    length the mask covers) are read when the head is constructed.
    """

    def __init__(self, head_size):
        super().__init__()
        # The input width is the embedding size, taken from `n_emb` rather than
        # from whatever value a demo cell last left in the scratch variable `C`.
        self.key = nn.Linear(n_emb, head_size, bias=False)
        self.query = nn.Linear(n_emb, head_size, bias=False)
        self.value = nn.Linear(n_emb, head_size, bias=False)
        # lower-triangular mask; a buffer, so it is not a trainable parameter
        self.register_buffer('tril', torch.tril(torch.ones((block_size, block_size))))

    def forward(self, x):
        """Map x of shape (B, T, n_emb), T <= block_size, to (B, T, head_size)."""
        B, T, C = x.shape
        k = self.key(x)    # (B, T, head_size)
        q = self.query(x)  # (B, T, head_size)
        v = self.value(x)  # (B, T, head_size)
        # Scale by the key/query width (head_size), not by the input width C:
        # the dot product sums head_size terms, so this is what keeps the
        # scores at unit variance.
        wei = q @ k.transpose(-2, -1) * k.shape[-1] ** -0.5  # (B, T, T)

        # position t may only attend to positions 0..t
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf'))  # (B, T, T)
        wei = F.softmax(wei, dim=-1)
        out = wei @ v  # (B, T, T) @ (B, T, head_size) --> (B, T, head_size)
        return out

In [288]:

class BigramLanguageModel(nn.Module):
    """Token + position embeddings, one self-attention `Head`, then a linear head.

    The module-level `vocab_size`, `n_emb` and `block_size` are read when the
    model is constructed; `block_size` is read again in `generate`.
    """

    def __init__(self):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, n_emb)
        self.positional_encoder = nn.Embedding(block_size, n_emb)
        # a single head whose output width equals the embedding width
        self.head = Head(n_emb)
        self.lm_head = nn.Linear(n_emb, vocab_size)

    def forward(self, idx, targets=None):
        """Return `(logits, loss)` for a (B, T) tensor of token ids, T <= block_size.

        Without `targets`, logits are (B, T, vocab_size) and loss is None.
        With `targets` (also (B, T)), logits are flattened to (B*T, vocab_size)
        and loss is the cross-entropy against the flattened targets.
        """
        B, T = idx.shape

        token_emb = self.token_embedding_table(idx)  # (B, T, n_emb)
        # Create the position ids on the same device as idx, so the lookup
        # does not mix devices when the model lives off-CPU.
        positions = torch.arange(T, device=idx.device)
        pos_emb = self.positional_encoder(positions)  # (T, n_emb)
        x = token_emb + pos_emb  # (B, T, n_emb)
        x = self.head(x)  # (B, T, n_emb)
        logits = self.lm_head(x)  # (B, T, vocab_size)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)
        return logits, loss

    def generate(self, idx, max_new_tokens):
        """Append `max_new_tokens` sampled tokens to the (B, T) context `idx` and return it."""
        for _ in range(max_new_tokens):
            # feed at most the last block_size tokens
            logits, loss = self(idx[:, -block_size:])
            # focus only on the last time step
            logits = logits[:, -1, :]  # (B, C)
            probs = F.softmax(logits, dim=-1)  # (B, C)
            idx_next = torch.multinomial(probs, num_samples=1)  # (B, 1)
            idx = torch.cat((idx, idx_next), dim=1)  # (B, T+1)
        return idx


model = BigramLanguageModel()
logits, loss = model(inp, targets)
print(logits.shape)
print(loss)

context = torch.zeros((1, 1), dtype=torch.long)
print(decode(model.generate(context, max_new_tokens=100)[0].tolist()))

torch.Size([32, 65])
tensor(4.3215, grad_fn=<NllLossBackward0>)

d3BR'o&lGnPrLK3bhf$v
WFSggLRLHObfr?G.E hwNrmGHX!WO,VwAMJJjtaNfwTo$xf$hh:IAZBOHChJOlm b!-iZL-yxU?P
mh


In [296]:
class MultiHeadSelfAttention(nn.Module):
    """Runs `num_heads` independent `Head`s on the same input and concatenates their outputs on the last axis."""

    def __init__(self, num_heads, head_size):
        super().__init__()
        heads = []
        for _ in range(num_heads):
            heads.append(Head(head_size))
        self.mhsa_blocks = nn.ModuleList(heads)

    def forward(self, x):
        per_head = [head(x) for head in self.mhsa_blocks]
        return torch.cat(per_head, dim=-1)

In [297]:

class BigramLanguageModel(nn.Module):
    """Token + position embeddings, multi-head self-attention, then a linear head.

    The module-level `vocab_size`, `n_emb` and `block_size` are read when the
    model is constructed; `block_size` is read again in `generate`.
    """

    def __init__(self):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, n_emb)
        self.positional_encoder = nn.Embedding(block_size, n_emb)
        # 4 heads of width n_emb//4 each; concatenated they are n_emb wide again
        self.head = MultiHeadSelfAttention(4, n_emb//4)
        self.lm_head = nn.Linear(n_emb, vocab_size)

    def forward(self, idx, targets=None):
        """Return `(logits, loss)` for a (B, T) tensor of token ids, T <= block_size.

        Without `targets`, logits are (B, T, vocab_size) and loss is None.
        With `targets` (also (B, T)), logits are flattened to (B*T, vocab_size)
        and loss is the cross-entropy against the flattened targets.
        """
        B, T = idx.shape

        token_emb = self.token_embedding_table(idx)  # (B, T, n_emb)
        # Create the position ids on the same device as idx, so the lookup
        # does not mix devices when the model lives off-CPU.
        positions = torch.arange(T, device=idx.device)
        pos_emb = self.positional_encoder(positions)  # (T, n_emb)
        x = token_emb + pos_emb  # (B, T, n_emb)
        x = self.head(x)  # (B, T, n_emb)
        logits = self.lm_head(x)  # (B, T, vocab_size)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)
        return logits, loss

    def generate(self, idx, max_new_tokens):
        """Append `max_new_tokens` sampled tokens to the (B, T) context `idx` and return it."""
        for _ in range(max_new_tokens):
            # feed at most the last block_size tokens
            logits, loss = self(idx[:, -block_size:])
            # focus only on the last time step
            logits = logits[:, -1, :]  # (B, C)
            probs = F.softmax(logits, dim=-1)  # (B, C)
            idx_next = torch.multinomial(probs, num_samples=1)  # (B, 1)
            idx = torch.cat((idx, idx_next), dim=1)  # (B, T+1)
        return idx


model = BigramLanguageModel()
logits, loss = model(inp, targets)
print(logits.shape)
print(loss)

context = torch.zeros((1, 1), dtype=torch.long)
print(decode(model.generate(context, max_new_tokens=100)[0].tolist()))

torch.Size([32, 65])
tensor(4.1563, grad_fn=<NllLossBackward0>)

ivmnUwScdkPGczaZ.u3u ,eBBtFl Wp,.QgwbcYRGUhTJz3cu-o3Nv?E!XG:j!ZDW.ajRqekQSXs.X;X$h?tkx&aDs&GTpPaC
An


In [304]:
batch_size = 32
# `model` was re-created after the previous optimizer was built, so that
# optimizer still holds the old model's parameters. Build one for this model,
# otherwise optimizer.step() would never touch the weights being trained.
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
for epoch in range(10000):

    xb, yb = batch_generator('train')

    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()
# loss of the last step only
print(loss.item())

2.033801555633545


In [308]:
context = torch.zeros((1, 1), dtype=torch.long)
print(decode(model.generate(context, max_new_tokens=100)[0].tolist()))


Cat wor song cove my beetster:
Bans Yould.

RULIUS: Eing par sponomins is ide, blabe fories?

Cot.




### Adding an MLP, multiple blocks of MHSA, and a projection layer, along with skip connections

In [322]:
class FeedForward(nn.Module):
    """Position-wise MLP: expand to 4 * n_embd, ReLU, project back to n_embd, then dropout.

    `dropout` is the probability given to the final `nn.Dropout`. It is an
    explicit argument (default 0.0, i.e. no dropout) so the class does not
    depend on a global that is only defined in a later cell.
    """

    def __init__(self, n_embd, dropout=0.0):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embd, 4 * n_embd),
            nn.ReLU(),
            nn.Linear(4 * n_embd, n_embd), ## projection layer
            nn.Dropout(dropout),
        )

    def forward(self, x):
        """Apply the MLP to the last dimension of x; the output has the same shape as x."""
        return self.net(x)


In [323]:
class MultiHeadSelfAttention(nn.Module):
    """`num_heads` parallel `Head`s, concatenated and projected to the embedding width.

    The module-level `n_emb` is read at construction time as the output width
    of the projection.
    """

    def __init__(self, num_heads, head_size):
        super().__init__()
        self.mhsa_blocks = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        # The concatenated heads are num_heads * head_size wide. That equals
        # n_emb only when n_emb is divisible by num_heads, so size the
        # projection input from the heads themselves.
        self.projection_layer = nn.Linear(num_heads * head_size, n_emb)

    def forward(self, x):
        """Return the projected concatenation of all heads; last dimension is n_emb."""
        out = torch.cat([h(x) for h in self.mhsa_blocks], dim=-1)
        out = self.projection_layer(out)
        return out

In [324]:
class Block(nn.Module):
    """Transformer block: multi-head self-attention then a feed-forward MLP, each with a skip connection."""

    def __init__(self, n_embd, n_head):
        super().__init__()
        head_size = n_embd//n_head
        self.sa = MultiHeadSelfAttention(n_head, head_size)
        # size the MLP from the constructor argument, not from the global n_emb
        self.ffwd = FeedForward(n_embd)

    def forward(self, x):
        """Return x after both residual sub-layers; the shape is unchanged."""
        x = x+self.sa(x) ## skip connection
        x = x+self.ffwd(x) # skip connection
        return x

In [325]:

class BigramLanguageModel(nn.Module):
    """Token + position embeddings, three transformer `Block`s, then a linear head.

    The module-level `vocab_size`, `n_emb` and `block_size` are read when the
    model is constructed; `block_size` is read again in `generate`.
    """

    def __init__(self):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, n_emb)
        self.positional_encoder = nn.Embedding(block_size, n_emb)
        # three blocks with 4 attention heads each, applied one after another
        self.block = nn.Sequential(Block(n_emb, 4), Block(n_emb, 4), Block(n_emb, 4))
        self.lm_head = nn.Linear(n_emb, vocab_size)

    def forward(self, idx, targets=None):
        """Return `(logits, loss)` for a (B, T) tensor of token ids, T <= block_size.

        Without `targets`, logits are (B, T, vocab_size) and loss is None.
        With `targets` (also (B, T)), logits are flattened to (B*T, vocab_size)
        and loss is the cross-entropy against the flattened targets.
        """
        B, T = idx.shape

        token_emb = self.token_embedding_table(idx)  # (B, T, n_emb)
        # Create the position ids on the same device as idx, so the lookup
        # does not mix devices when the model lives off-CPU.
        positions = torch.arange(T, device=idx.device)
        pos_emb = self.positional_encoder(positions)  # (T, n_emb)
        x = token_emb + pos_emb  # (B, T, n_emb)
        x = self.block(x)  # (B, T, n_emb)
        logits = self.lm_head(x)  # (B, T, vocab_size)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)
        return logits, loss

    def generate(self, idx, max_new_tokens):
        """Append `max_new_tokens` sampled tokens to the (B, T) context `idx` and return it."""
        for _ in range(max_new_tokens):
            # feed at most the last block_size tokens
            logits, loss = self(idx[:, -block_size:])
            # focus only on the last time step
            logits = logits[:, -1, :]  # (B, C)
            probs = F.softmax(logits, dim=-1)  # (B, C)
            idx_next = torch.multinomial(probs, num_samples=1)  # (B, 1)
            idx = torch.cat((idx, idx_next), dim=1)  # (B, T+1)
        return idx


model = BigramLanguageModel()
logits, loss = model(inp, targets)
print(logits.shape)
print(loss)

context = torch.zeros((1, 1), dtype=torch.long)
print(decode(model.generate(context, max_new_tokens=100)[0].tolist()))

torch.Size([32, 65])
tensor(4.4380, grad_fn=<NllLossBackward0>)

HEkkP!u'XNV,:hOcd$&RENJa? Z&esjjUVv;&$Rc$$;&,o'ADUT&eqqtb,&'$3C?ojfAfeNOz&RNRazB.n$&.,aO$&CS!s?Tsa&W


In [326]:
optimizer = torch.optim.AdamW(model.parameters(), lr = 1e-3)

In [327]:
batch_size = 32
# 10,000 optimisation steps, each on a freshly sampled training batch.
for epoch in range(10000):
    xb, yb = batch_generator('train')

    optimizer.zero_grad(set_to_none=True)
    logits, loss = model(xb, yb)
    loss.backward()
    optimizer.step()

# loss of the last step only
print(loss.item())

2.0799484252929688


In [329]:
context = torch.zeros((1, 1), dtype=torch.long)
print(decode(model.generate(context, max_new_tokens=100)[0].tolist()))


O len
CARWICUS:
My wo we:
Thou brem good bill of 'twent once, is I saway,
Kentle? 
LY VOLIXENBRYORD 


## Adding layer normalization

In [345]:
## LayerNorm normalizes each sample (row) across its features to mean = 0 and stddev = 1; the statistics are not computed across the batch

In [342]:
p = torch.randn(32, 100)
layer_norm = nn.LayerNorm(100)
p1 = layer_norm(p)
p1.shape

torch.Size([32, 100])

In [343]:
p1[:,0].mean(), p1[:,0].std()

(tensor(-0.1147, grad_fn=<MeanBackward0>),
 tensor(0.9386, grad_fn=<StdBackward0>))

In [344]:
p1[0, :].mean(), p1[0, :].std()

(tensor(1.4305e-08, grad_fn=<MeanBackward0>),
 tensor(1.0050, grad_fn=<StdBackward0>))

In [None]:
class Block(nn.Module):
    """Pre-norm transformer block: LayerNorm -> attention and LayerNorm -> MLP, each with a skip connection."""

    def __init__(self, n_embd, n_head):
        super().__init__()
        head_size = n_embd//n_head
        self.sa = MultiHeadSelfAttention(n_head, head_size)
        # every sub-module is sized from the constructor argument n_embd
        self.ffwd = FeedForward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)  # normalizes the attention input
        self.ln2 = nn.LayerNorm(n_embd)  # normalizes the feed-forward input

    def forward(self, x):
        """Return x after both residual sub-layers; the shape is unchanged."""
        x = x+self.sa(self.ln1(x)) ## skip connection
        # the feed-forward branch has its own LayerNorm (ln2)
        x = x+self.ffwd(self.ln2(x)) # skip connection
        return x

## Complete updated code with adding few dropout layers

In [356]:
n_embd = 384  # embedding width used by the embeddings, heads, MLP and layer norms below
n_head = 6  # attention heads per Block (head_size = n_embd // n_head)
n_layer = 6  # number of Blocks stacked in the model
dropout = 0.2  # probability passed to every nn.Dropout below
batch_size = 64  # number of windows batch_generator samples per call
block_size = 256  # window length: size of the position table and of the causal mask


In [357]:
class Head(nn.Module):
    """One head of causal self-attention with dropout on the attention weights.

    The module-level `n_embd`, `block_size` and `dropout` are read when the
    head is constructed.
    """

    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        # lower-triangular mask; a buffer, so it is not a trainable parameter
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Map x of shape (B, T, n_embd), T <= block_size, to (B, T, head_size)."""
        B,T,C = x.shape
        k = self.key(x)   # (B, T, head_size)
        q = self.query(x) # (B, T, head_size)
        # compute attention scores ("affinities")
        # Scale by the key/query width (head_size), not by the input width C:
        # the dot product sums head_size terms, so this is what keeps the
        # scores at unit variance.
        wei = q @ k.transpose(-2,-1) * k.shape[-1]**-0.5 # (B, T, hs) @ (B, hs, T) -> (B, T, T)
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf')) # (B, T, T)
        wei = F.softmax(wei, dim=-1) # (B, T, T)
        wei = self.dropout(wei)
        # perform the weighted aggregation of the values
        v = self.value(x) # (B, T, head_size)
        out = wei @ v # (B, T, T) @ (B, T, hs) -> (B, T, hs)
        return out

class MultiHeadAttention(nn.Module):
    """Multiple heads of self-attention in parallel, then a projection and dropout.

    The module-level `n_embd` (projection output width) and `dropout` are read
    at construction time.
    """

    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        # The concatenated heads are num_heads * head_size wide. That equals
        # n_embd only when n_embd is divisible by num_heads, so size the
        # projection input from the heads themselves.
        self.proj = nn.Linear(num_heads * head_size, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Return the projected concatenation of all heads; last dimension is n_embd."""
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        out = self.dropout(self.proj(out))
        return out

class FeedFoward(nn.Module):
    """Position-wise MLP: Linear -> ReLU -> Linear -> Dropout, with a 4x wider hidden layer.

    The module-level `dropout` is read at construction time.
    """

    def __init__(self, n_embd):
        super().__init__()
        hidden = 4 * n_embd
        layers = [
            nn.Linear(n_embd, hidden),
            nn.ReLU(),
            nn.Linear(hidden, n_embd),
            nn.Dropout(dropout),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Apply the MLP to the last dimension of x."""
        out = self.net(x)
        return out

class Block(nn.Module):
    """Transformer block: attention (communication), then the MLP (computation).

    Both sub-layers are applied to a layer-normalized input and added back to
    the residual stream.
    """

    def __init__(self, n_embd, n_head):
        # n_embd: embedding dimension; n_head: how many attention heads to use
        super().__init__()
        self.sa = MultiHeadAttention(n_head, n_embd // n_head)
        self.ffwd = FeedFoward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        attended = self.sa(self.ln1(x))
        x = x + attended
        return x + self.ffwd(self.ln2(x))

# The class keeps its historical name; with attention blocks it conditions on
# up to block_size previous tokens, not just one.
class BigramLanguageModel(nn.Module):
    """Decoder-only transformer language model over token ids.

    The module-level `vocab_size`, `n_embd`, `block_size`, `n_head` and
    `n_layer` are read when the model is constructed; `block_size` is read
    again in `generate`.
    """

    def __init__(self):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        # one learned vector for each position 0 .. block_size-1
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd) # final layer norm
        self.lm_head = nn.Linear(n_embd, vocab_size)

    def forward(self, idx, targets=None):
        """Return `(logits, loss)` for a (B, T) tensor of token ids, T <= block_size.

        Without `targets`, logits are (B, T, vocab_size) and loss is None.
        With `targets` (also (B, T)), logits are flattened to (B*T, vocab_size)
        and loss is the cross-entropy against the flattened targets.
        """
        B, T = idx.shape

        # idx and targets are both (B,T) tensor of integers
        tok_emb = self.token_embedding_table(idx) # (B,T,C)
        # Create the position ids on the same device as idx, so the lookup
        # does not mix devices when the model lives off-CPU.
        positions = torch.arange(T, device=idx.device)
        pos_emb = self.position_embedding_table(positions) # (T,C)
        x = tok_emb + pos_emb # (B,T,C)
        x = self.blocks(x) # (B,T,C)
        x = self.ln_f(x) # (B,T,C)
        logits = self.lm_head(x) # (B,T,vocab_size)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, max_new_tokens):
        """Append `max_new_tokens` sampled tokens to the (B, T) context `idx` and return it."""
        for _ in range(max_new_tokens):
            # crop idx to the last block_size tokens
            idx_cond = idx[:, -block_size:]
            # get the predictions
            logits, loss = self(idx_cond)
            # focus only on the last time step
            logits = logits[:, -1, :] # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1) # (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        return idx


model = BigramLanguageModel()
logits, loss = model(inp, targets)
print(logits.shape)
print(loss)

context = torch.zeros((1, 1), dtype=torch.long)
print(decode(model.generate(context, max_new_tokens=100)[0].tolist()))

torch.Size([32, 65])
tensor(4.2443, grad_fn=<NllLossBackward0>)

Fh!tLuPesSCCTy!$H,s-3M$bYi?&?!qv$W!?Rg;I,qG,yF3ieSIba-?S'-.&,r!:-YA-lfob.WTq?GS?DGb
R,S?Y?iyYZiELeKK


In [360]:
# A fresh optimizer for the model built above, then 1,000 optimisation steps,
# each on a freshly sampled training batch.
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
for epoch in range(1000):
    xb, yb = batch_generator('train')

    optimizer.zero_grad(set_to_none=True)
    logits, loss = model(xb, yb)
    loss.backward()
    optimizer.step()

# loss of the last step only
print(loss.item())

1.3831497430801392


In [362]:
context = torch.zeros((1, 1), dtype=torch.long)
# Sample with dropout switched off (eval mode) and without autograd
# bookkeeping, then put the model back into training mode.
model.eval()
with torch.no_grad():
    generated = model.generate(context, max_new_tokens=500)
model.train()
print(decode(generated[0].tolist()))


a altes it jurth, tefore and to see,
And fear at heaven to adminon brother:
As time madam, and commortains poor minds.

KING LEWARD IV:
I'll spoke mouch them?
Will is thn I say I claid 'twast crave to Rome.

YORK:
Norfoline emband Deat Marcius Rick? we prope, our stay
and wills, that could be seem thee, with suspervant,
Tis find freed withoutlessome whave, the worst
That he this. GoO me, sir, becaused it is, in hard;
Yea seal to clies, titlemany: mone thou kAjoints
The doth man's swater is the d
