In [2]:
with open('Rabindranath.txt','r',encoding='utf-8') as f:
    text = f.read()
len(text)

269576

In [3]:
text[:100]

"Rabindranath Tagore\n- poems -\n\n\n\n\nPublication Date:\n 2012\nPublisher:\nPoemhunter.com - The World's Po"

In [39]:
chars = sorted(list(set(text)))
vocab_size = len(chars)
vocab_size

84

In [11]:
str2int = {ch:i for i,ch in enumerate(chars)}
int2str = {i:ch for i,ch in enumerate(chars)}
encode = lambda s:[str2int[c] for c in s]
decode = lambda l:''.join([int2str[n] for n in l])

e = encode('Rabindranath Tagore')
print(e)
d = decode(e)
print(d)

[42, 51, 52, 59, 64, 54, 68, 51, 64, 51, 70, 58, 1, 44, 51, 57, 65, 68, 55]
Rabindranath Tagore


In [33]:
import torch
data = torch.tensor(encode(text),dtype=torch.long)
print(data.shape)
data[:100]

torch.Size([269576])


tensor([42, 51, 52, 59, 64, 54, 68, 51, 64, 51, 70, 58,  1, 44, 51, 57, 65, 68,
        55,  0, 10,  1, 66, 65, 55, 63, 69,  1, 10,  0,  0,  0,  0,  0, 40, 71,
        52, 62, 59, 53, 51, 70, 59, 65, 64,  1, 28, 51, 70, 55, 22,  0,  1, 14,
        12, 13, 14,  0, 40, 71, 52, 62, 59, 69, 58, 55, 68, 22,  0, 40, 65, 55,
        63, 58, 71, 64, 70, 55, 68, 11, 53, 65, 63,  1, 10,  1, 44, 58, 55,  1,
        47, 65, 68, 62, 54,  6, 69,  1, 40, 65])

In [34]:
n = int(0.9*len(data))
train_data = data[:n]
val_data = data[n:]

In [35]:
block_size = 8
train_data[:block_size+1]

tensor([42, 51, 52, 59, 64, 54, 68, 51, 64])

In [36]:
x = train_data[:block_size]
y = train_data[1:block_size+1]
for t in range(block_size):
    context = x[:t+1]
    target = y[t]
    print(f'when context is {context} then target: {target}')

when context is tensor([42]) then target: 51
when context is tensor([42, 51]) then target: 52
when context is tensor([42, 51, 52]) then target: 59
when context is tensor([42, 51, 52, 59]) then target: 64
when context is tensor([42, 51, 52, 59, 64]) then target: 54
when context is tensor([42, 51, 52, 59, 64, 54]) then target: 68
when context is tensor([42, 51, 52, 59, 64, 54, 68]) then target: 51
when context is tensor([42, 51, 52, 59, 64, 54, 68, 51]) then target: 64


In [37]:
torch.manual_seed(7)
batch_size = 4 # size of parallel batches of block_size (batch_dimension)
block_size = 8 # size of chunk of data we process (time_dimension)

def get_batch(split):
    # to get random chunk of data for each training or validation
    data = train_data if split == 'train' else val_data
    idx = torch.randint(len(data)-block_size-1,(batch_size,))
    x = torch.stack([data[i:block_size+i] for i in idx])
    y = torch.stack([data[i+1:block_size+i+1] for i in idx])
    return x,y

xb,yb = get_batch('train')
xb.shape,yb.shape

(torch.Size([4, 8]), torch.Size([4, 8]))

In [38]:
for b in range(batch_size): # (batch_dimension)
    for t in range(block_size): # (time_dimension)
        context = xb[b,:t+1]
        target = yb[b,t]
        print({context:target})

{tensor([51]): tensor(72)}
{tensor([51, 72]): tensor(55)}
{tensor([51, 72, 55]): tensor(1)}
{tensor([51, 72, 55,  1]): tensor(70)}
{tensor([51, 72, 55,  1, 70]): tensor(58)}
{tensor([51, 72, 55,  1, 70, 58]): tensor(55)}
{tensor([51, 72, 55,  1, 70, 58, 55]): tensor(0)}
{tensor([51, 72, 55,  1, 70, 58, 55,  0]): tensor(69)}
{tensor([51]): tensor(64)}
{tensor([51, 64]): tensor(54)}
{tensor([51, 64, 54]): tensor(69)}
{tensor([51, 64, 54, 69]): tensor(1)}
{tensor([51, 64, 54, 69,  1]): tensor(73)}
{tensor([51, 64, 54, 69,  1, 73]): tensor(58)}
{tensor([51, 64, 54, 69,  1, 73, 58]): tensor(65)}
{tensor([51, 64, 54, 69,  1, 73, 58, 65]): tensor(1)}
{tensor([54]): tensor(65)}
{tensor([54, 65]): tensor(73)}
{tensor([54, 65, 73]): tensor(64)}
{tensor([54, 65, 73, 64]): tensor(1)}
{tensor([54, 65, 73, 64,  1]): tensor(71)}
{tensor([54, 65, 73, 64,  1, 71]): tensor(66)}
{tensor([54, 65, 73, 64,  1, 71, 66]): tensor(65)}
{tensor([54, 65, 73, 64,  1, 71, 66, 65]): tensor(64)}
{tensor([1]): tensor(

In [99]:
import torch
import torch.nn as nn
from torch.nn import functional as F
torch.manual_seed(7)

class BigramModel(nn.Module):
    def __init__(self,vocab_size):
        super().__init__()
        # lookup table for tokens
        self.token_embedding_table = nn.Embedding(vocab_size,vocab_size)
        
    def forward(self,idx,target=None):
        # for each index in idx it reutrns token rows
        logits = self.token_embedding_table(idx) # return (B,T,C) (Batch,Time,Channel)
        if target is None:
            loss = None
        else:
            B,T,C = logits.shape
            logits = logits.view(B*T,C) # since cross_entropy accept differnt dime of input (B,C,T)
            target = target.view(B*T)
            loss = F.cross_entropy(logits,target)
        return logits,loss

    def generate(self,idx,max_tokens):
        for _ in range(max_tokens):
            logits,loss = self.forward(idx) # getting predictions
             # taking only the last idx prediction --> (B,C)
            logits = logits[:,-1,:] # (B,C)
            # softing for get probabilities
            probs = F.softmax(logits,dim=-1) # (B,C)
            # get one sample index from given probabilities and add to end of idx
            sample_idx = torch.multinomial(probs,num_samples=1)
            idx = torch.cat((idx,sample_idx),dim=1) # (B,T+1)
        return idx
    
m = BigramModel(vocab_size)
logits, loss = m(xb,yb)
print(logits.shape)
print(loss)


# sample text to test the model
input_idx = torch.zeros((1,1), dtype = torch.long)
# generated tokens of length 100
g_idx = m.generate(input_idx,100)

print(g_idx.shape)
decode(g_idx[0].tolist())

torch.Size([8, 84])
tensor(5.1176, grad_fn=<NllLossBackward0>)
torch.Size([1, 101])


'\n3NyOGn7rE !6v?e8(H7–06…7D19:Hdnz—3vVF4Pb“VErS1–JCivF—4 6b’FOV”qcvo"vi!ayhVGn`”h"TU\nQ“o“roVB1o\'VyK&NQ'

In [100]:
print(m.parameters)

<bound method Module.parameters of BigramModel(
  (token_embedding_table): Embedding(84, 84)
)>


In [101]:
# pytorch optimizer model AdamW
optimizer = torch.optim.AdamW(m.parameters(),lr=1e-3)

In [117]:
batch_size = 32
for _ in range(1000):
    xb,yb = get_batch('train')
    logits, loss = m(xb,yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()
print(loss.item())

2.430452585220337


In [118]:
# sample text to test the model
input_idx = torch.zeros((1,1), dtype = torch.long)
# generated tokens of length 100
g_idx = m.generate(input_idx,300)

print(g_idx.shape)
decode(g_idx[0].tolist())

torch.Size([1, 301])


"\nakenky ve. tindy edrthee\n P\n town bl tr lead.\nWanglenndepiof wind t thee ten\n o o ts?\n bbrofin Th aye g spt s af a ben y bld usto ovaghord howe s Lousld here blllowed gin.\nF a's ckearn bs T fingle,\nI ather irof my to oot itrs atedees g we wame herar s woned t:My wart noy ary m\nTary tanat uth t tht a"