In [2]:
from IPython.display import display, Markdown, Latex

In [3]:
# !wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

In [4]:
# Read the whole corpus into memory. The encoding is given explicitly so the
# decoded text does not depend on the platform's default locale encoding.
with open("input.txt", encoding="utf-8") as f:
    text = f.read()

In [5]:
# Preview the first 500 characters of the corpus via IPython's Markdown display
display(Markdown(text[:500]))

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it be done: away, away!

Second Citizen:
One word, good citizens.

First Citizen:
We are accounted poor

In [6]:
# Vocabulary: every distinct character in the corpus, in sorted order
chars = sorted(set(text))
vocab_size = len(chars)
# Show the character set and its size on separate lines
print(''.join(chars), vocab_size, sep='\n')


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
65


## Tokenization

In [7]:
# Tokenization - character-level lookup tables in both directions
encodings = {ch: i for i, ch in enumerate(chars)}
decodings = {i: ch for i, ch in enumerate(chars)}


def encode(enc_str):
    """Map a string to a list of integer token ids, one per character.

    Raises:
        KeyError: if a character is not a key of `encodings`.
    """
    return [encodings[char] for char in enc_str]


def decode(dec_int):
    """Map an iterable of integer token ids back to the string they spell.

    Raises:
        KeyError: if an id is not a key of `decodings`.
    """
    return ''.join(decodings[i] for i in dec_int)


# Round-trip check: encoding then decoding should reproduce the input
print(encode("Hi There!"))
print(decode(encode("Hi There!")))

[20, 47, 1, 32, 46, 43, 56, 43, 2]
Hi There!


#### Tokenizing entire text

In [8]:
import torch

# Encode the full corpus and hold it as a tensor of int64 (torch.long) token ids
data = torch.tensor(encode(text), dtype=torch.long)
print(data.shape, data.dtype)
print(data[:100])  # first 100 token ids

torch.Size([1115394]) torch.int64
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59])


In [9]:
# Train/validation split: the first 90% of the tokens train the model,
# the remaining tokens are held out for validation
split = int(0.9*len(data))
train_data = data[:split]
val_data = data[split:]

In [10]:
block_size = 8 # context length: the data is consumed in chunks of this many tokens, not all at once
train_data[:block_size+1] # +1 because 9 tokens contain 8 examples => a1 -> a2; a1,a2 -> a3; a1,a2,a3 -> a4

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58])

In [11]:
X = train_data[:block_size] # model input
y = train_data[1:block_size+1] # targets: the same window shifted one position ahead

# Every prefix of X is one training example; its label is the token that follows it
for t in range(block_size):
    context, target = X[:t+1], y[t]
    print(f"Input: {context}\tTarget: {target}")

Input: tensor([18])	Target: 47
Input: tensor([18, 47])	Target: 56
Input: tensor([18, 47, 56])	Target: 57
Input: tensor([18, 47, 56, 57])	Target: 58
Input: tensor([18, 47, 56, 57, 58])	Target: 1
Input: tensor([18, 47, 56, 57, 58,  1])	Target: 15
Input: tensor([18, 47, 56, 57, 58,  1, 15])	Target: 47
Input: tensor([18, 47, 56, 57, 58,  1, 15, 47])	Target: 58


In [12]:
torch.manual_seed(40) # seed torch's RNG so the random draws are repeatable
batch_size = 4 # number of contexts
block_size = 8 # maximum context length

def get_batch(split):
    """Draw a random batch of input windows and their next-token targets.

    Args:
        split: 'train' selects `train_data`; any other value selects `val_data`.

    Returns:
        (x, y): two stacked tensors with one row per sampled window. Each row
        of `x` is `block_size` consecutive tokens, and the matching row of `y`
        is the same window shifted one token ahead.
    """
    source = train_data if split == 'train' else val_data
    # One random window start per batch row
    starts = torch.randint(len(source) - block_size, (batch_size,))
    x = torch.stack([source[s:s + block_size] for s in starts])
    y = torch.stack([source[s + 1:s + block_size + 1] for s in starts])
    return x, y
    
xb, yb = get_batch('train')
print(f'inputs: {xb.shape}\n{xb}')
print(f'targets: {yb.shape}\n{yb}')
print('-----------')
# Each (row, position) pair of the batch is one (context, target) example, so
# the count is derived from the batch shape rather than hard-coded.
print(f'{xb.shape[0] * xb.shape[1]} different examples')

inputs: torch.Size([4, 8])
tensor([[46, 47, 51,  8,  0,  0, 23, 21],
        [ 0, 13, 52, 42,  1, 52, 43, 60],
        [46,  5, 42,  1, 49, 47, 52, 45],
        [63, 53, 59,  8,  1, 24, 53, 56]])
targets: torch.Size([4, 8])
tensor([[47, 51,  8,  0,  0, 23, 21, 26],
        [13, 52, 42,  1, 52, 43, 60, 43],
        [ 5, 42,  1, 49, 47, 52, 45,  6],
        [53, 59,  8,  1, 24, 53, 56, 42]])
-----------
32 different examples


### Baseline - Bigram Model

In [13]:
import torch
import torch.nn as nn 
from torch.nn import functional as F 

class BigramLanguageModel(nn.Module):
    """Bigram baseline: next-token logits are looked up from the current token alone.

    The only parameter is a (vocab_size, vocab_size) embedding table; the row
    selected by a token id is used directly as the logits for the next token.
    """

    def __init__(self, vocab_size):
        super().__init__()
        self.token_embedding = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):
        """Compute next-token logits and, when targets are given, the loss.

        Args:
            idx: (B, T) tensor of integer token ids.
            targets: optional (B, T) tensor of integer next-token ids.

        Returns:
            (logits, loss). Without `targets`, logits has shape (B, T, C) and
            loss is None. With `targets`, logits is returned flattened to
            (B*T, C) and loss is the scalar cross-entropy.
        """
        # idx and targets are both (B,T) tensor of integers
        logits = self.token_embedding(idx) # (B,T,C) (Batch, Time, Channel)

        if targets is None:
            return logits, None

        # F.cross_entropy expects (N, C) logits and (N,) targets, so fold the
        # batch and time dimensions together.
        B, T, C = logits.shape
        logits = logits.view(B*T, C)
        # reshape (not view) so non-contiguous targets, e.g. a transposed
        # tensor, are accepted as well
        targets = targets.reshape(B*T)
        loss = F.cross_entropy(logits, targets)
        return logits, loss

    @torch.no_grad()  # sampling never needs gradients; skip building the autograd graph
    def generate(self, idx, max_new_tokens=4):
        """Extend each sequence in `idx` by sampling `max_new_tokens` tokens.

        Args:
            idx: (B, T) tensor of integer token ids used as the starting context.
            max_new_tokens: number of tokens to sample and append.

        Returns:
            (B, T + max_new_tokens) tensor: the input with the samples appended.
        """
        for _ in range(max_new_tokens):
            logits, _loss = self(idx)
            # focus on the last time step
            logits = logits[:, -1, :] # (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1)
            # sample from distribution
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            # append sampled index to running sequence
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)

        return idx
    
# Instantiate the baseline and run one forward pass on the sampled batch
model = BigramLanguageModel(vocab_size)
logits, loss = model(xb, yb)  # targets are supplied, so logits come back flattened to (B*T, C)
print(logits.shape)
print(loss)

torch.Size([32, 65])
tensor(4.6348, grad_fn=<NllLossBackward0>)


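The expected loss (negative log-likelihood) for a model that assigns equal probability to every character in the vocabulary is

$\text{loss} = -\ln\left(\frac{1}{\text{vocab\_size}}\right)$

$-\ln\left(\frac{1}{65}\right) \approx 4.17$. The observed loss is somewhat higher because the randomly initialised model does not predict a perfectly uniform distribution.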


In [14]:
# Start from a single token with id 0 and sample 100 more tokens, then decode them to text
idx = torch.zeros((1,1), dtype=torch.long)
print(decode(model.generate(idx, max_new_tokens=100)[0].tolist()))



UhKUL?apQiiecTJtedr-&FiQ3zldLkyvdaaD zDkvPFT?'sflcNg:TEF
KwfhFjUUUQBCj'UrTblCn,Sp.az-xjP
YC-RT,xnwA


In [15]:
# Create optimizer: AdamW over all of the model's parameters, learning rate 1e-3
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)

In [16]:
# get_batch reads the global batch_size, so this enlarges every batch drawn below
batch_size = 32
for steps in range(10000):
    # clear gradients left over from the previous step
    optimizer.zero_grad(set_to_none=True)

    # forward pass on a fresh random training batch
    xb, yb = get_batch('train')
    logits, loss = model(xb, yb)

    # backward pass and parameter update
    loss.backward()
    optimizer.step()

# loss of the last batch only
print(loss.item())

2.5792176723480225


In [17]:
# Start from a single token with id 0 and sample 100 more tokens, then decode them to text
idx = torch.zeros((1,1), dtype=torch.long)
print(decode(model.generate(idx, max_new_tokens=100)[0].tolist()))


Te'se t to l f cer
Thorwo punPo And, s r g
man w ar,
Ylvevirumangnoshosorepancurof u Cl, fourenthtld


### The mathematical trick in self-attention

In [18]:
torch.manual_seed(1337)

B,T,C = 4,8,2 # batch, time, channels
x = torch.randn(B,T,C) # random toy activations of shape (B, T, C)
x.shape

torch.Size([4, 8, 2])

**Aggregating over the past of a given token**

For example, for the 3rd token in an 8-token block, we don't want information from the 4th–8th tokens; they are effectively masked out.
For the 3rd token, we want the information at the 3rd token itself as well as at the tokens that precede it.

Therefore, we average the channels (feature vectors) of all tokens up to and including the 3rd token.
This is a weak but simple form of aggregation.

In [19]:
# Reference implementation: xbow[b, t] is the mean of x[b, 0], ..., x[b, t]
xbow = torch.zeros((B,T,C))
for b in range(B):
    for t in range(T):
        xprev = x[b,:t+1] # (t+1, C): tokens up to and including position t
        xbow[b,t] = xprev.mean(dim=0)

In [32]:
# Matrix trick to vectorize the above operation
torch.manual_seed(42)

a = torch.ones(3,3)
# Lower triangular matrix
a = torch.tril(a)
# Normalizing each row so that it sums to 1, which turns the matmul into an average
a = a / torch.sum(a, dim=1, keepdim=True)

print(f'a:\n{a}')
print('======'*7)

b = torch.randint(0,10,(3,2)).float()
print(f'b:\n{b}')
print('======'*7)

c = a @ b
print(f'c:\n{c}')
print('======'*7)

# Row i of c is the average of rows 0..i of b

a:
tensor([[1.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000],
        [0.3333, 0.3333, 0.3333]])
b:
tensor([[2., 7.],
        [6., 4.],
        [6., 5.]])
c:
tensor([[2.0000, 7.0000],
        [4.0000, 5.5000],
        [4.6667, 5.3333]])


In [39]:
# Vectorizing the aggregation step

vector_xbow = torch.zeros((B,T,C))

# Row t of this (T, T) matrix puts weight 1/(t+1) on positions 0..t and 0 on
# later positions, so multiplying by it averages each position with its past.
# It is sized from T rather than a literal so it follows the sequence length.
normalizing_matrix = torch.tril(torch.ones(T,T))
normalizing_matrix /= torch.sum(normalizing_matrix, 1, keepdim=True)

for b in range(B):
    xprev = x[b] # (T, C)
    avg_vector = normalizing_matrix @ xprev # (T, T) @ (T, C) -> (T, C)
    vector_xbow[b] = avg_vector

In [41]:
# Loop-based result for the first batch element
xbow[0]

tensor([[ 0.1808, -0.0700],
        [-0.0894, -0.4926],
        [ 0.1490, -0.3199],
        [ 0.3504, -0.2238],
        [ 0.3525,  0.0545],
        [ 0.0688, -0.0396],
        [ 0.0927, -0.0682],
        [-0.0341,  0.1332]])

In [42]:
# Matrix-multiply result for the first batch element
vector_xbow[0]

tensor([[ 0.1808, -0.0700],
        [-0.0894, -0.4926],
        [ 0.1490, -0.3199],
        [ 0.3504, -0.2238],
        [ 0.3525,  0.0545],
        [ 0.0688, -0.0396],
        [ 0.0927, -0.0682],
        [-0.0341,  0.1332]])

In [43]:
# Shapes of the loop-based and matrix-multiply results, side by side
xbow.shape, vector_xbow.shape

(torch.Size([4, 8, 2]), torch.Size([4, 8, 2]))

In [46]:
# version 2: one batched matmul; the 2-D weight matrix is broadcast across the batch dimension of x
final_xbow = normalizing_matrix @ x
final_xbow[0]

tensor([[ 0.1808, -0.0700],
        [-0.0894, -0.4926],
        [ 0.1490, -0.3199],
        [ 0.3504, -0.2238],
        [ 0.3525,  0.0545],
        [ 0.0688, -0.0396],
        [ 0.0927, -0.0682],
        [-0.0341,  0.1332]])

In [47]:
# version 3: the same averaging weights, built with a mask followed by softmax
tril = torch.tril(torch.ones(T,T))
wei = torch.zeros((T,T))
wei = wei.masked_fill(tril==0, float('-inf')) # entries above the diagonal become -inf
wei = F.softmax(wei, dim=-1) # -inf entries get weight 0; the remaining equal scores in a row share the weight evenly

# Same output, because softmax normalises each row so that it sums to 1.

xbow3 = wei @ x

In [48]:
# Inspect the weight matrix produced by the masked softmax
wei

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3333, 0.3333, 0.3333, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2500, 0.2500, 0.2500, 0.2500, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2000, 0.2000, 0.2000, 0.2000, 0.2000, 0.0000, 0.0000, 0.0000],
        [0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.0000, 0.0000],
        [0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.0000],
        [0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250]])

In [49]:
# Masked-softmax result for the first batch element
xbow3[0]

tensor([[ 0.1808, -0.0700],
        [-0.0894, -0.4926],
        [ 0.1490, -0.3199],
        [ 0.3504, -0.2238],
        [ 0.3525,  0.0545],
        [ 0.0688, -0.0396],
        [ 0.0927, -0.0682],
        [-0.0341,  0.1332]])