# **TinyGPT**

In [13]:
file_path="/kaggle/input/complete-works-of-rabindranath-tagore/txt/poem.txt"
with open(file_path,'r') as file:
    text=file.read()

In [14]:
#extension that erfills websites that I opened after clasing all of them
#like grouping them or something

In [15]:
#no of characters in our dataset
print("no of characters in our dataset is: ", len(text))
print(text[:200])

no of characters in our dataset is:  2368921
বজাও রে মোহন বাঁশি।
সারা দিবসক
বিরহদহনদুখ,
মরমক তিয়াষ নাশি।
রিঝমনভেদন
বাঁশরিবাদন
কঁহা শিখলি রে কান?
হানে থিরথির
মরমঅবশকর
লহু লহু মধুময় বাণ।
ধসধস করতহ
উরহ বিয়াকুলু,
ঢুলু ঢুলু অবশনয়ান ;
কত কত বরষক
বাত স


In [16]:
#we might change this part because byte pair encoding is done in the original GPT-2
#for now this implements character level encoding
vocab = list(set(text))
vocab_size = len(vocab)
print(vocab_size)

139


In [17]:
character_to_integer = {ch:i for i,ch in enumerate(vocab)}
integer_to_character = {i:ch for i,ch in enumerate(vocab)}

class encoder_decoder:
    def __init__(self):
        self.character_to_integer = character_to_integer
        self.integer_to_character = integer_to_character

    def encoder(self, input_string):
        return [self.character_to_integer[c] for c in input_string]

    def decoder(self,sequence):
        return ''.join([self.integer_to_character[i] for i in sequence])


In [18]:
print(encoder_decoder().encoder(input_string="রবীন্দ্রনাথ ঠাকুর"))
print(encoder_decoder().decoder(encoder_decoder().encoder(input_string="রবীন্দ্রনাথ ঠাকুর")))

[95, 138, 36, 1, 118, 53, 118, 95, 1, 52, 68, 117, 121, 52, 24, 14, 95]
রবীন্দ্রনাথ ঠাকুর


In [40]:
import torch
# hyperparameters

device = 'cuda' if torch.cuda.is_available() else 'cpu'
batch_size = 64 
context_length = 256 
max_iters = 5000
eval_interval = 500
learning_rate = 3e-4
eval_iters = 200
n_embd = 384
n_head = 6
n_layer = 6
dropout = 0.2

In [20]:
data=torch.tensor(encoder_decoder().encoder(text))

# print(data.shape)
# print(data[:500])

In [33]:
n=int(0.9*len(data))
train_data=data[:n]
val_data=data[n:]

print(train_data.shape)
print(val_data.shape)

torch.Size([2132028])
torch.Size([236893])


In [23]:
def get_batch(split):
    data = train_data if split == 'train' else val_data
    data = train_data if split == 'train' else val_data
    split_index = torch.randint(len(data) - context_length, (batch_size,))
    x = torch.stack([data[i:i+context_length] for i in split_index])
    y = torch.stack([data[i+1:i+context_length+1] for i in split_index])
    x, y = x.to(device), y.to(device)
    return x, y

#test
# x1,y1 = get_batch(train_data)
# print(x1.shape,y1.shape)

In [24]:
def estimate_loss():
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out


In [25]:
import torch
import torch.nn as nn
from torch.nn import functional as F

In [26]:
# a single head of self attention
class Head(nn.Module):
    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        self.register_buffer('tril', torch.tril(torch.ones(context_length, context_length)))
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        K = self.key(x)  # Shape: (batch_size, seq_len, head_size)
        Q = self.query(x)  # Shape: (batch_size, seq_len, head_size)
        V = self.value(x)  # Shape: (batch_size, seq_len, head_size)
        
        # Compute attention logits (scaled dot-product)
        attn_logits = Q @ K.transpose(-2, -1) * (K.shape[-1] ** -0.5)  # Shape: (batch_size, seq_len, seq_len)
        
        # Apply mask
        seq_len = attn_logits.size(-1)
        attn_logits = attn_logits.masked_fill(self.tril[:seq_len, :seq_len] == 0, float('-inf'))
        
        # Compute attention weights
        attn_weights = F.softmax(attn_logits, dim=-1)
        attn_weights = self.dropout(attn_weights)
        
        # Weighted sum of values
        out = attn_weights @ V  # Shape: (batch_size, seq_len, head_size)
        return out


In [27]:
#Multiple self-attention heads in parallel

class MultiHeadAttention(nn.Module):
    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        self.proj = nn.Linear(head_size * num_heads, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        out = self.dropout(self.proj(out))
        return out

In [28]:
class MLP(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.ln1  = nn.Linear(n_embd,4 * n_embd)
        self.relu = nn.ReLU()
        self.ln2  = nn.Linear(4 * n_embd, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        x = self.ln1(x)
        x = self.relu(x)
        x = self.ln2(x)
        x = self.dropout(x)
        return x

In [42]:
class Block(nn.Module):
    '''Transformer Block'''
    def __init__(self, n_embd, n_head):
        super().__init__()
        head_size = n_embd // n_head
        self.sa = MultiHeadAttention(n_head, head_size)
        self.ffwd = MLP(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)
        

    def forward(self, x):
        x = x + self.sa(self.ln1(x))
        x = x + self.ffwd(self.ln2(x))
        return x

In [38]:
class tinyGPT(nn.Module):

    def __init__(self):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(context_length, n_embd)
        self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd)
        self.lm_head = nn.Linear(n_embd, vocab_size)
        self.apply(self._init_weights)

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, idx, targets=None):
        B, T = idx.shape
        tok_emb = self.token_embedding_table(idx) # (B,T,C)
        pos_emb = self.position_embedding_table(torch.arange(T, device=device)) # (T,C)
        x = tok_emb + pos_emb # (B,T,C)
        x = self.blocks(x) # (B,T,C)
        x = self.ln_f(x) # (B,T,C)
        logits = self.lm_head(x) # (B,T,vocab_size)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)
        return logits, loss

    def generate(self, idx, max_new_tokens):
        for _ in range(max_new_tokens):
            idx_cond = idx[:, -context_length:]
            logits, loss = self(idx_cond)
            logits = logits[:, -1, :] 
            probs = F.softmax(logits, dim=-1) # (B, C)
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        return idx

In [43]:
model = tinyGPT()
m = model.to(device)


optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

for iter in range(max_iters):
    if iter % eval_interval == 0 or iter == max_iters - 1:
        losses = estimate_loss()
        print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    xb, yb = get_batch('train')

    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

context = torch.zeros((1, 1), dtype=torch.long, device=device)
print(encoder_decoder().decoder(m.generate(context, max_new_tokens=500)[0].tolist()))

step 0: train loss 4.9542, val loss 4.9460
step 500: train loss 2.1997, val loss 2.2514
step 1000: train loss 1.8696, val loss 1.9387
step 1500: train loss 1.7423, val loss 1.8302
step 2000: train loss 1.6679, val loss 1.7824
step 2500: train loss 1.6104, val loss 1.7436
step 3000: train loss 1.5658, val loss 1.7218
step 3500: train loss 1.5232, val loss 1.7016
step 4000: train loss 1.4882, val loss 1.6913
step 4500: train loss 1.4540, val loss 1.6858
step 4999: train loss 1.4253, val loss 1.6805
যথাসীগরাগিণী
বাজায় গ্রাইবানি অনাদিকালা।
স্তব্ধতা সঁপে
জহর্জন ঋতু সকালে
বনে যখনি তার পড়ে খুলি ;
খ্যাতিহারণ খাতা বেড়ে বালিকা;
সপ্তর্ষির উচ্চশি জাগে
গাদ্ বন লয়ে কুঞ্জকপতঙ্গম।
ক্লান্ত আকাশে ছায়া
নবপাগ্নের সরসে বৎসরপ্রব্যাপ্তি
আদিত্য শৈলে বনে ;
খণ্ডাতে প্রাণমনে তারে
এই ক্রেতস্তর,
অনাদ্তের
বাণিক ধরে আর সে একদিন
জাগে না।
জানি না।
কেন শুনা যেমন তার ভোর বিমুখে
নাচে তব যত
সেই কেন চিরচঞ্চল
সংকীচি প্রচণ্ডগুচ্ছায়ার আকার —
শুভ্ল পুষ্পীচিকায়
আকাশে জফা কোথা
শাঙ্ক্ষে শান্তি করে,
আগের অঞ্ছল পরিচয় অগণ্যলিপণ্ডলীর