In [25]:
with open('Rabindranath.txt','r',encoding='utf-8') as f:
    text = f.read()
len(text)

269576

In [26]:
text[:100]

"Rabindranath Tagore\n- poems -\n\n\n\n\nPublication Date:\n 2012\nPublisher:\nPoemhunter.com - The World's Po"

In [27]:
chars = sorted(list(set(text)))
vocab_size = len(chars)
vocab_size

84

In [28]:
str2int = {ch:i for i,ch in enumerate(chars)}
int2str = {i:ch for i,ch in enumerate(chars)}
encode = lambda s:[str2int[c] for c in s]
decode = lambda l:''.join([int2str[n] for n in l])

e = encode('Rabindranath Tagore')
print(e)
d = decode(e)
print(d)

[42, 51, 52, 59, 64, 54, 68, 51, 64, 51, 70, 58, 1, 44, 51, 57, 65, 68, 55]
Rabindranath Tagore


In [29]:
import torch
data = torch.tensor(encode(text),dtype=torch.long)
print(data.shape)
data[:100]

torch.Size([269576])


tensor([42, 51, 52, 59, 64, 54, 68, 51, 64, 51, 70, 58,  1, 44, 51, 57, 65, 68,
        55,  0, 10,  1, 66, 65, 55, 63, 69,  1, 10,  0,  0,  0,  0,  0, 40, 71,
        52, 62, 59, 53, 51, 70, 59, 65, 64,  1, 28, 51, 70, 55, 22,  0,  1, 14,
        12, 13, 14,  0, 40, 71, 52, 62, 59, 69, 58, 55, 68, 22,  0, 40, 65, 55,
        63, 58, 71, 64, 70, 55, 68, 11, 53, 65, 63,  1, 10,  1, 44, 58, 55,  1,
        47, 65, 68, 62, 54,  6, 69,  1, 40, 65])

In [30]:
n = int(0.9*len(data))
train_data = data[:n]
val_data = data[n:]

In [31]:
block_size = 8
train_data[:block_size+1]

tensor([42, 51, 52, 59, 64, 54, 68, 51, 64])

In [32]:
x = train_data[:block_size]
y = train_data[1:block_size+1]
for t in range(block_size):
    context = x[:t+1]
    target = y[t]
    print(f'when context is {context} then target: {target}')

when context is tensor([42]) then target: 51
when context is tensor([42, 51]) then target: 52
when context is tensor([42, 51, 52]) then target: 59
when context is tensor([42, 51, 52, 59]) then target: 64
when context is tensor([42, 51, 52, 59, 64]) then target: 54
when context is tensor([42, 51, 52, 59, 64, 54]) then target: 68
when context is tensor([42, 51, 52, 59, 64, 54, 68]) then target: 51
when context is tensor([42, 51, 52, 59, 64, 54, 68, 51]) then target: 64


In [33]:
torch.manual_seed(7)
batch_size = 4 # size of parallel batches of block_size (batch_dimension)
block_size = 8 # size of chunk of data we process (time_dimension)

def get_batch(split):
    # to get random chunk of data for each training or validation
    data = train_data if split == 'train' else val_data
    idx = torch.randint(len(data)-block_size-1,(batch_size,))
    x = torch.stack([data[i:block_size+i] for i in idx])
    y = torch.stack([data[i+1:block_size+i+1] for i in idx])
    return x,y

xb,yb = get_batch('train')
xb.shape,yb.shape

(torch.Size([4, 8]), torch.Size([4, 8]))

In [34]:
for b in range(batch_size): # (batch_dimension)
    for t in range(block_size): # (time_dimension)
        context = xb[b,:t+1]
        target = yb[b,t]
        print({context:target})

{tensor([51]): tensor(72)}
{tensor([51, 72]): tensor(55)}
{tensor([51, 72, 55]): tensor(1)}
{tensor([51, 72, 55,  1]): tensor(70)}
{tensor([51, 72, 55,  1, 70]): tensor(58)}
{tensor([51, 72, 55,  1, 70, 58]): tensor(55)}
{tensor([51, 72, 55,  1, 70, 58, 55]): tensor(0)}
{tensor([51, 72, 55,  1, 70, 58, 55,  0]): tensor(69)}
{tensor([51]): tensor(64)}
{tensor([51, 64]): tensor(54)}
{tensor([51, 64, 54]): tensor(69)}
{tensor([51, 64, 54, 69]): tensor(1)}
{tensor([51, 64, 54, 69,  1]): tensor(73)}
{tensor([51, 64, 54, 69,  1, 73]): tensor(58)}
{tensor([51, 64, 54, 69,  1, 73, 58]): tensor(65)}
{tensor([51, 64, 54, 69,  1, 73, 58, 65]): tensor(1)}
{tensor([54]): tensor(65)}
{tensor([54, 65]): tensor(73)}
{tensor([54, 65, 73]): tensor(64)}
{tensor([54, 65, 73, 64]): tensor(1)}
{tensor([54, 65, 73, 64,  1]): tensor(71)}
{tensor([54, 65, 73, 64,  1, 71]): tensor(66)}
{tensor([54, 65, 73, 64,  1, 71, 66]): tensor(65)}
{tensor([54, 65, 73, 64,  1, 71, 66, 65]): tensor(64)}
{tensor([1]): tensor(

In [35]:
import torch
import torch.nn as nn
from torch.nn import functional as F
torch.manual_seed(7)

class BigramModel(nn.Module):
    def __init__(self,vocab_size):
        super().__init__()
        # lookup table for tokens
        self.token_embedding_table = nn.Embedding(vocab_size,vocab_size)
        
    def forward(self,idx,target=None):
        # for each index in idx it reutrns token rows
        logits = self.token_embedding_table(idx) # return (B,T,C) (Batch,Time,Channel)
        if target is None:
            loss = None
        else:
            B,T,C = logits.shape
            logits = logits.view(B*T,C) # since cross_entropy accept differnt dime of input (B,C,T)
            target = target.view(B*T)
            loss = F.cross_entropy(logits,target)
        return logits,loss

    def generate(self,idx,max_tokens):
        for _ in range(max_tokens):
            logits,loss = self.forward(idx) # getting predictions
             # taking only the last idx prediction --> (B,C)
            logits = logits[:,-1,:] # (B,C)
            # softing for get probabilities
            probs = F.softmax(logits,dim=-1) # (B,C)
            # get one sample index from given probabilities and add to end of idx
            sample_idx = torch.multinomial(probs,num_samples=1)
            idx = torch.cat((idx,sample_idx),dim=1) # (B,T+1)
        return idx
    
m = BigramModel(vocab_size)
logits, loss = m(xb,yb)
print(logits.shape)
print(loss)


# sample text to test the model
input_idx = torch.zeros((1,1), dtype = torch.long)
# generated tokens of length 100
g_idx = m.generate(input_idx,100)

print(g_idx.shape)
decode(g_idx[0].tolist())

torch.Size([32, 84])
tensor(5.1680, grad_fn=<NllLossBackward0>)
torch.Size([1, 101])


'\n3NyOGn7rE !6v?e8(H7–06…7D19:Hdnz—3vVF4Pb“VErS1–JCivF—4 6b’FOV”qcvo"vi!ayhVGn`”h"TU\nQ“o“roVB1o\'VyK&NQ'

In [36]:
print(m.parameters)

<bound method Module.parameters of BigramModel(
  (token_embedding_table): Embedding(84, 84)
)>


In [42]:
# pytorch optimizer model AdamW
optimizer = torch.optim.AdamW(m.parameters(),lr=1e-3)

In [43]:
batch_size = 32
for _ in range(1000):
    xb,yb = get_batch('train')
    logits, loss = m(xb,yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()
print(loss.item())

4.034038543701172


In [44]:
# sample text to test the model
input_idx = torch.zeros((1,1), dtype = torch.long)
# generated tokens of length 100
g_idx = m.generate(input_idx,300)

print(g_idx.shape)
decode(g_idx[0].tolist())

torch.Size([1, 301])


'\n6!o C\'l!e5GDykjugguo ’o"nmo"SLD1 Ob 9`U&2VB8rS2p,Q\'rUX0E3u--–cvO\'s-kCd,NX9c130PyA";oXaXbkTaTgi60CU2’rGcCGn#)Yhj6.2`IixbbuVu&;1gh-gEAAS5U’?2b!0!–EzQk\'D.2-&C1…2#sM;paEO#sF8z#kxTByoh’-dD&FJbnvu12qjXl.Abf?BenUAPHP:RMm‘EHthdJ:MrLryzeU;:”(aTRN3gi.…czbfsaQzmmssu?; K`“QAP)Q#6Tg3u 2q#fN7y G,-c2SBvaIOQgK2B70,'

# Self attention

In [23]:
# self attention
torch.manual_seed(7)
a = torch.tril(torch.ones(3,3))
a = a/torch.sum(a,1,keepdim=True)
b = torch.randint(0,10,(3,2)).float()

print('a \n',a)
print()
print('b \n',b)
print()
c = a @ b

print('mean of each row')
print(c)

a 
 tensor([[1.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000],
        [0.3333, 0.3333, 0.3333]])

b 
 tensor([[5., 2.],
        [1., 6.],
        [3., 7.]])

mean of each row
tensor([[5., 2.],
        [3., 4.],
        [3., 5.]])


In [117]:
# let 
torch.manual_seed(7)
B,T,C = 4,4,2
x = torch.rand(B,T,C)
x.shape

torch.Size([4, 4, 2])

In [118]:
# 1 way
xbow = torch.zeros((B,T,C))
for b in range(B):
    for t in range(T):
        xprev = x[b,:t+1]
        xbow[b,t] = torch.mean(xprev,dim=0)
xbow

tensor([[[0.5349, 0.1988],
         [0.5971, 0.4278],
         [0.4756, 0.4269],
         [0.4085, 0.4776]],

        [[0.3653, 0.8513],
         [0.6101, 0.7011],
         [0.5024, 0.5362],
         [0.4880, 0.4920]],

        [[0.7204, 0.0731],
         [0.8451, 0.0904],
         [0.8577, 0.1980],
         [0.8326, 0.3222]],

        [[0.5209, 0.5932],
         [0.7003, 0.6109],
         [0.7220, 0.4450],
         [0.7555, 0.5018]]])

In [119]:
# 2nd way
wei = torch.tril(torch.ones(T,T))
wei = wei/torch.sum(wei,1,keepdim=True)
xbow1 = wei @ x
torch.allclose(xbow1,xbow)

True

In [120]:
xbow1

tensor([[[0.5349, 0.1988],
         [0.5971, 0.4278],
         [0.4756, 0.4269],
         [0.4085, 0.4776]],

        [[0.3653, 0.8513],
         [0.6101, 0.7011],
         [0.5024, 0.5362],
         [0.4880, 0.4920]],

        [[0.7204, 0.0731],
         [0.8451, 0.0904],
         [0.8577, 0.1980],
         [0.8326, 0.3222]],

        [[0.5209, 0.5932],
         [0.7003, 0.6109],
         [0.7220, 0.4450],
         [0.7555, 0.5018]]])

In [121]:
# 3rd way
tril = torch.tril(torch.ones(T,T))
wei = torch.zeros(T,T)
wei = wei.masked_fill(tril == 0,float('-inf'))
wei = F.softmax(wei,dim=-1)
xbow2 = wei @ x
xbow2

tensor([[[0.5349, 0.1988],
         [0.5971, 0.4278],
         [0.4756, 0.4269],
         [0.4085, 0.4776]],

        [[0.3653, 0.8513],
         [0.6101, 0.7011],
         [0.5024, 0.5362],
         [0.4880, 0.4920]],

        [[0.7204, 0.0731],
         [0.8451, 0.0904],
         [0.8577, 0.1980],
         [0.8326, 0.3222]],

        [[0.5209, 0.5932],
         [0.7003, 0.6109],
         [0.7220, 0.4450],
         [0.7555, 0.5018]]])

In [122]:
torch.allclose(xbow,xbow2)

True

In [134]:
#version 4 for self attention
torch.manual_seed(7)
B,T,C = 4,8,32
x = torch.randn(B,T,C)
# single head performing self attention
head_size = 16
key = nn.Linear(C,head_size,bias=False)
query = nn.Linear(C,head_size,bias=False)
value = nn.Linear(C,head_size,bias=False)
k = key(x) # (B,T,16)
q = query(x) # (B,T,16)
v = value(x)

wei = q @ k.transpose(-2,-1) # (B,T,16) @ (B,16,T) --> (B,T,T)

tril = torch.tril(torch.ones(T,T))
wei = wei.masked_fill(tril == 0, float('-inf'))
wei = F.softmax(wei,dim=-1)

# out = wei @ x
out = wei @ v

In [135]:
out.shape

torch.Size([4, 8, 16])

In [136]:
wei[0]

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3683, 0.6317, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3519, 0.5907, 0.0574, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3486, 0.1974, 0.1348, 0.3193, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.0112, 0.1489, 0.1402, 0.0980, 0.6017, 0.0000, 0.0000, 0.0000],
        [0.1663, 0.0633, 0.1945, 0.5067, 0.0494, 0.0198, 0.0000, 0.0000],
        [0.1435, 0.0190, 0.0247, 0.0396, 0.5494, 0.1052, 0.1185, 0.0000],
        [0.0690, 0.3477, 0.0400, 0.0806, 0.2945, 0.0274, 0.0459, 0.0949]],
       grad_fn=<SelectBackward0>)

In [137]:
out[0]

tensor([[ 0.5680,  0.9682,  1.0205, -0.7679, -0.4110,  0.0781, -0.5025, -0.4608,
         -0.0136,  0.5688, -0.4703, -0.6915, -0.9830,  0.1170, -0.1094, -0.4791],
        [-0.6052,  0.1062, -0.3205,  0.6433, -0.0320,  0.0016,  0.4219, -0.2060,
         -0.4843,  0.1708, -0.3733, -0.4373, -0.5565,  0.6194,  0.2879,  0.3988],
        [-0.5433,  0.0536, -0.3201,  0.6323, -0.0060, -0.0088,  0.3987, -0.2368,
         -0.4794,  0.1564, -0.3282, -0.4107, -0.5633,  0.6294,  0.2876,  0.3543],
        [ 0.1141,  0.2567, -0.0596, -0.0305,  0.0438,  0.0878, -0.0655, -0.3837,
         -0.5834, -0.0459, -0.0787, -0.5164, -0.6511,  0.2166,  0.1639,  0.1029],
        [ 0.1636, -0.2151,  0.0176,  0.3056,  0.1306,  0.3668,  0.0538, -0.1521,
         -0.1427, -0.2161, -0.1026, -0.1479,  0.0219,  0.3281, -0.0691, -0.0648],
        [ 0.2963,  0.1498, -0.1679, -0.1112,  0.1672,  0.1563, -0.1724, -0.4091,
         -0.7127, -0.2730,  0.1151, -0.4793, -0.5435,  0.0611,  0.1502,  0.1074],
        [ 0.2294, -0.0

In [138]:
wei.var()

tensor(0.0532, grad_fn=<VarBackward0>)

In [139]:
out.var()

tensor(0.1892, grad_fn=<VarBackward0>)

In [140]:
k.var()

tensor(0.3272, grad_fn=<VarBackward0>)

In [141]:
q.var()

tensor(0.3398, grad_fn=<VarBackward0>)