<a href="https://colab.research.google.com/github/Akage1234/nn-zero-to-hero/blob/main/NLP/Transformer/GPT/gpt_exp_output.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Learning the concepts

In [None]:
!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt -O shakespeare.txt

--2025-10-24 20:10:44--  https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: ‘shakespeare.txt’


2025-10-24 20:10:44 (50.1 MB/s) - ‘shakespeare.txt’ saved [1115394/1115394]



In [None]:
from datasets import Dataset
# Read the whole downloaded corpus into memory as one Python string.
with open("shakespeare.txt", 'r', encoding='utf-8') as f:
    text = f.read()

In [None]:
# Corpus size in characters.
print(len(text))

1115394


In [None]:
# The vocabulary is every distinct character in the corpus, in sorted order.
chars = sorted(set(text))
vocab_size = len(chars)
print(''.join(chars))
print(vocab_size)


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
65


In [None]:
# Character-level tokenizer: a pair of lookup tables over the sorted vocabulary.
itos = dict(enumerate(chars))                  # token id -> character
stoi = {ch: i for i, ch in itos.items()}       # character -> token id


def encode(s):
    """Return the list of integer token ids for the characters of string `s`."""
    return [stoi[c] for c in s]


def decode(l):
    """Return the string spelled by the list of integer token ids `l`."""
    return ''.join(itos[i] for i in l)


# Round-trip sanity check.
s = encode('Hello')
print(s)
print(decode(s))

[20, 43, 50, 50, 53]
Hello


In [None]:
import torch
# Encode the entire corpus as a single 1-D tensor of int64 token ids.
data = torch.tensor(encode(text), dtype=torch.long)
print(data.shape, data.dtype)
print(data[:100])

torch.Size([1115394]) torch.int64
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59])


In [None]:
# Hold out the final 10% of the token stream for validation (no shuffling).
n = int(0.9*len(data))
train_data, val_data = data[:n], data[n:]

In [None]:
torch.manual_seed(1337)
batch_size = 4  # number of independent windows sampled per batch
block_size = 8  # length of each sampled input window

def get_batch(split):
    """Sample a random mini-batch of input/target windows.

    Reads the globals `train_data`, `val_data`, `batch_size` and `block_size`
    at call time.

    Args:
        split: 'train' selects `train_data`; any other value selects `val_data`.

    Returns:
        (x, y): two (batch_size, block_size) tensors, where `y` is `x` shifted
        one position to the right in the source stream.
    """
    source = val_data
    if split == 'train':
        source = train_data
    # Upper bound leaves room for the one-step-shifted target window.
    starts = torch.randint(len(source) - block_size, (batch_size,))
    inputs = [source[s:s + block_size] for s in starts]
    targets = [source[s + 1:s + 1 + block_size] for s in starts]
    return torch.stack(inputs), torch.stack(targets)

# Draw one training batch and show inputs next to their shifted targets.
xb, yb = get_batch('train')
print("Inputs:\n",xb.shape)
print(xb)
print("Outputs:\n",yb.shape)
print(yb)

Inputs:
 torch.Size([4, 8])
tensor([[24, 43, 58,  5, 57,  1, 46, 43],
        [44, 53, 56,  1, 58, 46, 39, 58],
        [52, 58,  1, 58, 46, 39, 58,  1],
        [25, 17, 27, 10,  0, 21,  1, 54]])
Outputs:
 torch.Size([4, 8])
tensor([[43, 58,  5, 57,  1, 46, 43, 39],
        [53, 56,  1, 58, 46, 39, 58,  1],
        [58,  1, 58, 46, 39, 58,  1, 46],
        [17, 27, 10,  0, 21,  1, 54, 39]])


In [None]:
# Each window holds block_size training examples: for every position t the
# context is the prefix xb[b, :t+1] and the target is the next token yb[b, t].
for b in range(batch_size):
    for t in range(block_size):
        context = xb[b, :t+1]
        target = yb[b,t]
        print(f"when input is {context} output is :{target}")

when input is tensor([24]) output is :43
when input is tensor([24, 43]) output is :58
when input is tensor([24, 43, 58]) output is :5
when input is tensor([24, 43, 58,  5]) output is :57
when input is tensor([24, 43, 58,  5, 57]) output is :1
when input is tensor([24, 43, 58,  5, 57,  1]) output is :46
when input is tensor([24, 43, 58,  5, 57,  1, 46]) output is :43
when input is tensor([24, 43, 58,  5, 57,  1, 46, 43]) output is :39
when input is tensor([44]) output is :53
when input is tensor([44, 53]) output is :56
when input is tensor([44, 53, 56]) output is :1
when input is tensor([44, 53, 56,  1]) output is :58
when input is tensor([44, 53, 56,  1, 58]) output is :46
when input is tensor([44, 53, 56,  1, 58, 46]) output is :39
when input is tensor([44, 53, 56,  1, 58, 46, 39]) output is :58
when input is tensor([44, 53, 56,  1, 58, 46, 39, 58]) output is :1
when input is tensor([52]) output is :58
when input is tensor([52, 58]) output is :1
when input is tensor([52, 58,  1]) outp

In [None]:
import torch
import torch.nn as nn
from torch.nn import functional as F
torch.manual_seed(1337)

class BigramModel(nn.Module):
    """Bigram language model: the next-token logits depend only on the current token.

    The single parameter is a (vocab_size, vocab_size) embedding table whose
    row i holds the logits for the token that follows token i.
    """

    def __init__(self, vocab_size):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):
        """Look up next-token logits and, when targets are given, the loss.

        Args:
            idx: (B, T) integer tensor of token ids.
            targets: optional (B, T) integer tensor of next-token ids.

        Returns:
            (logits, loss). Without targets, logits is (B, T, C) and loss is
            None. With targets, logits is flattened to (B*T, C) and loss is the
            mean cross-entropy over all B*T positions.
        """
        logits = self.token_embedding_table(idx)  # (B, T, C)
        if targets is None:
            return logits, None

        n_batch, n_time, n_classes = logits.shape
        # cross_entropy expects (N, C) scores against (N,) class indices.
        flat_logits = logits.view(n_batch * n_time, n_classes)
        flat_targets = targets.view(n_batch * n_time)
        return flat_logits, F.cross_entropy(flat_logits, flat_targets)

    def generate(self, idx, max_new_tokens):
        """Append `max_new_tokens` sampled tokens to each row of `idx`.

        Args:
            idx: (B, T) integer tensor holding the starting context.
            max_new_tokens: number of sampling steps to run.

        Returns:
            (B, T + max_new_tokens) tensor: the context followed by the samples.
        """
        for _ in range(max_new_tokens):
            logits, _unused_loss = self(idx)
            last_step = logits[:, -1, :]  # (B, C): only the final position matters
            probs = F.softmax(last_step, dim=-1)
            sampled = torch.multinomial(probs, num_samples=1)  # (B, 1)
            idx = torch.cat((idx, sampled), dim=1)  # (B, T+1)
        return idx

In [None]:
# Untrained model: one forward pass on the current batch, then a sample.
m = BigramModel(vocab_size)
logits, loss = m(xb, yb)
print(logits.shape)
print(loss)
# Start from a single token with id 0 and sample 100 more.
idx = m.generate(torch.zeros((1,1), dtype=torch.long), max_new_tokens=100)
print(decode(idx[0].tolist()))

torch.Size([32, 65])
tensor(4.8786, grad_fn=<NllLossBackward0>)

SKIcLT;AcELMoTbvZv C?nq-QE33:CJqkOKH-q;:la!oiywkHjgChzbQ?u!3bLIgwevmyFJGUGp
wnYWmnxKWWev-tDqXErVKLgJ


In [None]:
# AdamW over all parameters of the bigram model `m`.
optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3)

In [None]:
# NOTE: get_batch() reads the global batch_size, so rebinding it here changes
# the size of every batch sampled from now on.
batch_size = 32
for steps in range(10000):
    xb, yb = get_batch('train')
    logits, loss = m(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()
# Loss of the final mini-batch only, not an average over batches.
print(loss.item())

2.382369041442871


In [None]:
# Sample 500 tokens from the trained bigram model, starting from token id 0.
idx = m.generate(torch.zeros((1,1), dtype=torch.long), max_new_tokens=500)
print(decode(idx[0].tolist()))


lso br. ave aviasurf my, yxMPZI ivee iuedrd whar ksth y h bora s be hese, woweee; the! KI 'de, ulseecherd d o blllando;LUCEO, oraingofof win!
RIfans picspeserer hee tha,
TOFonk? me ain ckntoty ded. bo'llll st ta d:
ELIS me hurf lal y, ma dus pe athouo
BEY:! Indy; by s afreanoo adicererupa anse tecorro llaus a!
OLeneerithesinthengove fal amas trr
TI ar I t, mes, n IUSt my w, fredeeyove
THek' merer, dd
We ntem lud engitheso; cer ize helorowaginte the?
Thak orblyoruldvicee chot, p,
Bealivolde Th li


# The mathematical trick and self-attention

In [None]:
import torch
# Toy input for the averaging experiments below.
torch.manual_seed(1337)
B,T,C = 4,8,2
x = torch.randn(B,T,C)
x.shape

torch.Size([4, 8, 2])

In [None]:
# Version 1 (explicit loops): xbow[b, t] is the mean of x[b, 0..t], i.e. every
# position averages itself with all earlier positions.
xbow = torch.zeros((B, T, C))
for b in range(B):
    for t in range(T):
        xbow[b, t] = x[b, :t+1].mean(dim=0)

In [None]:
# A row-normalised lower-triangular matrix turns a matmul into a running mean:
# row i of `c` is the average of rows 0..i of `b`.
torch.manual_seed(42)
a = torch.tril(torch.ones(3,3))
a = a/ torch.sum(a, 1, keepdim=True)
b = torch.randint(0,10,(3,2)).float()
c = a @ b
print(a)
print('-'*50)
print(b)
print('-'*50)
print(c)
print('-'*50)

tensor([[1.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000],
        [0.3333, 0.3333, 0.3333]])
--------------------------------------------------
tensor([[2., 7.],
        [6., 4.],
        [6., 5.]])
--------------------------------------------------
tensor([[2.0000, 7.0000],
        [4.0000, 5.5000],
        [4.6667, 5.3333]])
--------------------------------------------------


In [None]:
# Version 2: the same running mean as one matrix multiply.
wei = torch.tril(torch.ones(T, T))
wei = wei / wei.sum(1, keepdim=True)  # every row sums to 1
# wei is (T, T) and is broadcast across the batch: (T, T) @ (B, T, C) --> (B, T, C)
xbow2 = wei @ x
# Show the comparison result; previously it was computed and silently discarded
# because it was neither printed nor the last expression of the cell.
print(torch.allclose(xbow, xbow2))
print(xbow[0])
print(xbow2[0])

tensor([[ 0.1808, -0.0700],
        [-0.0894, -0.4926],
        [ 0.1490, -0.3199],
        [ 0.3504, -0.2238],
        [ 0.3525,  0.0545],
        [ 0.0688, -0.0396],
        [ 0.0927, -0.0682],
        [-0.0341,  0.1332]])
tensor([[ 0.1808, -0.0700],
        [-0.0894, -0.4926],
        [ 0.1490, -0.3199],
        [ 0.3504, -0.2238],
        [ 0.3525,  0.0545],
        [ 0.0688, -0.0396],
        [ 0.0927, -0.0682],
        [-0.0341,  0.1332]])


In [None]:
# different version
from torch.nn import functional as F
# Version 3: start from all-zero scores, mask future positions with -inf, then
# softmax each row. The softmax of equal scores is uniform over the unmasked
# positions, so this reproduces the running-mean weights.
tril = torch.tril(torch.ones(T, T))
wei = torch.zeros((T, T))
wei = wei.masked_fill(tril==0, float('-inf'))
wei = F.softmax(wei, dim=-1)
xbow3 = wei @ x
xbow3[0]

tensor([[ 0.1808, -0.0700],
        [-0.0894, -0.4926],
        [ 0.1490, -0.3199],
        [ 0.3504, -0.2238],
        [ 0.3525,  0.0545],
        [ 0.0688, -0.0396],
        [ 0.0927, -0.0682],
        [-0.0341,  0.1332]])

In [None]:
# self-attention: a single head, written out step by step.
# Relies on `nn` and `F` having been imported by earlier cells.
torch.manual_seed(1337)
B,T,C = 4,8,32
x = torch.randn(B,T,C)


head_size = 16
key = nn.Linear(C, head_size, bias=False)
query = nn.Linear(C, head_size, bias=False)
value = nn.Linear(C, head_size, bias=False)
k = key(x)   # (B, T, 16)
q = query(x) # (B, T, 16)
# Data-dependent affinities between positions; note no scaling is applied here.
wei =  q @ k.transpose(-2, -1) # (B, T, 16) @ (B, 16, T) ---> (B, T, T)

tril = torch.tril(torch.ones(T, T))
# (the averaging versions started from wei = torch.zeros((T,T)) instead)
wei = wei.masked_fill(tril == 0, float('-inf'))
wei = F.softmax(wei, dim=-1)

# Aggregate the value projections rather than the raw x.
v = value(x)
out = wei @ v
# (the averaging versions computed wei @ x instead)
out.shape

# Fully finished code & output

In [None]:
import torch
import torch.nn as nn
from torch.nn import functional as F
from tqdm import tqdm

# ---- Hyperparameters ----
batch_size = 64        # sequences per mini-batch
block_size = 256       # context length in tokens
max_iters = 5000       # optimisation steps
eval_interval = 500    # steps between loss estimates
learning_rate = 3e-4
device = 'cuda' if torch.cuda.is_available() else 'cpu'
eval_iters = 200       # mini-batches averaged per loss estimate
n_embd = 384           # embedding width
n_head = 6             # attention heads per block
n_layer = 6            # number of transformer blocks
dropout = 0.2

# ---- Data ----
# NOTE(review): no random seed is set in this cell, so runs are not reproducible.
with open("shakespeare.txt", 'r', encoding='utf-8') as f:
    text = f.read()

print(len(text))

# Character-level vocabulary: every distinct character, in sorted order.
chars = sorted(list(set(text)))
vocab_size = len(chars)

stoi = {ch: i for i, ch in enumerate(chars)}
itos = {i: ch for i, ch in enumerate(chars)}

encode = lambda s: [stoi[c] for c in s]
decode = lambda l: ''.join([itos[i] for i in l])

# First 90% of the token stream is for training, the rest for validation.
data = torch.tensor(encode(text), dtype=torch.long)
n = int(0.9*len(data))
train_data = data[:n]
val_data = data[n:]

def get_batch(split):
    """Sample a random mini-batch of input/target windows and move it to `device`.

    Reads the globals `train_data`, `val_data`, `batch_size`, `block_size` and
    `device` at call time.

    Args:
        split: 'train' selects `train_data`; any other value selects `val_data`.

    Returns:
        (x, y): two (batch_size, block_size) integer tensors on `device`, where
        `y` is `x` shifted one position to the right in the source stream.
    """
    source = train_data if split == 'train' else val_data
    # Upper bound leaves room for the one-step-shifted target window.
    starts = torch.randint(len(source) - block_size, (batch_size,))
    # One row of consecutive positions per start offset: (batch_size, block_size).
    positions = starts.unsqueeze(1) + torch.arange(block_size)
    x = source[positions]
    y = source[positions + 1]
    return x.to(device), y.to(device)

@torch.no_grad()
def estimate_loss(model=None):
    """Estimate the mean loss on the train and validation splits.

    Averages the loss over `eval_iters` freshly sampled mini-batches per split,
    with the model in eval mode so dropout is disabled.

    Args:
        model: the network to evaluate. Defaults to the notebook's global `m`.
            The previous version read an undefined global named `model`, which
            raised NameError on a fresh kernel, or silently evaluated a stale
            leftover network instead of the one being trained.

    Returns:
        dict with keys 'train' and 'val', each a 0-dim tensor holding the mean loss.
    """
    if model is None:
        model = m  # resolved at call time, after the model has been created
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            xb, yb = get_batch(split)
            _, loss = model(xb, yb)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()  # hand the model back in training mode, as before
    return out

## Attention Head
class Head(nn.Module):
    """One head of causal (masked) scaled dot-product self-attention.

    Reads the globals `n_embd`, `block_size` and `dropout` at construction time.

    Args:
        head_size: output width of the key, query and value projections.
    """

    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias = False)
        self.query = nn.Linear(n_embd, head_size, bias = False)
        self.value = nn.Linear(n_embd, head_size, bias = False)
        # Lower-triangular mask; a buffer so it follows .to(device) without being trained.
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Attend over a (B, T, C) input and return a (B, T, head_size) output."""
        B, T, C = x.shape
        q = self.query(x)  # (B, T, head_size)
        k = self.key(x)    # (B, T, head_size)
        v = self.value(x)  # (B, T, head_size)

        # Scale by 1/sqrt(head_size), the width of the vectors being dotted.
        # The previous code scaled by the embedding width C, which over-shrinks
        # the scores whenever head_size != C.
        scale = k.shape[-1] ** -0.5
        wei = q @ k.transpose(-2, -1) * scale  # (B, T, T)
        # Block attention to future positions.
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf'))
        wei = F.softmax(wei, dim=-1)
        wei = self.dropout(wei)

        out = wei @ v  # (B, T, head_size)
        return out

# Multi-head attention
class MultiHeadAttention(nn.Module):
    """Several attention heads run side by side, concatenated and projected.

    Reads the globals `n_embd` and `dropout` at construction time. The output
    projection takes `n_embd` inputs, so `num_heads * head_size` is expected
    to equal `n_embd`.
    """

    def __init__(self, num_heads, head_size):
        super().__init__()
        head_modules = [Head(head_size) for _ in range(num_heads)]
        self.heads = nn.ModuleList(head_modules)
        self.proj = nn.Linear(n_embd, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Concatenate every head's output on the last axis, project, apply dropout."""
        per_head = [head(x) for head in self.heads]
        combined = torch.cat(per_head, dim=-1)
        projected = self.proj(combined)
        return self.dropout(projected)

# Feed Forward
class FeedForward(nn.Module):
    """Per-position MLP: expand to 4x width, ReLU, project back, dropout.

    Reads the global `dropout` at construction time.

    Args:
        n_embd: input and output width.
    """

    def __init__(self, n_embd):
        super().__init__()
        hidden_width = 4 * n_embd
        layers = [
            nn.Linear(n_embd, hidden_width),
            nn.ReLU(),
            nn.Linear(hidden_width, n_embd),
            nn.Dropout(dropout),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Apply the MLP to the last axis of `x`."""
        out = self.net(x)
        return out

class Block(nn.Module):
    """Transformer block: pre-norm self-attention, then a pre-norm MLP, each with a residual.

    Args:
        n_embd: embedding width.
        n_heads: number of attention heads; each head gets n_embd // n_heads channels.
    """

    def __init__(self, n_embd, n_heads):
        super().__init__()
        self.sa_heads = MultiHeadAttention(n_heads, n_embd // n_heads)
        self.ffwd = FeedForward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self,x):
        """Return `x` after the attention and feed-forward residual updates."""
        # LayerNorm is applied to each sub-layer's input; the residual path is left untouched.
        after_attention = x + self.sa_heads(self.ln1(x))
        after_mlp = after_attention + self.ffwd(self.ln2(after_attention))
        return after_mlp


## GPT Language Model
class GPTModel(nn.Module):
    """Decoder-only transformer language model over token ids.

    Reads the globals `vocab_size`, `n_embd`, `block_size`, `n_head` and
    `n_layer` at construction time.
    """

    def __init__(self):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.positional_embedding_table = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(*[Block(n_embd, n_heads=n_head) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd)
        self.lm_head = nn.Linear(n_embd, vocab_size)

    def forward(self, idx, targets=None):
        """Compute next-token logits and, when targets are given, the loss.

        Args:
            idx: (B, T) integer tensor of token ids; T must not exceed the
                number of rows in the positional embedding table.
            targets: optional (B, T) integer tensor of next-token ids.

        Returns:
            (logits, loss). Without targets, logits is (B, T, vocab) and loss is
            None. With targets, logits is flattened to (B*T, vocab) and loss is
            the mean cross-entropy over all positions.
        """
        B, T = idx.shape

        tok_emb = self.token_embedding_table(idx) # (B, T, n_embd)
        # Build positions on the input's own device instead of relying on a
        # global `device`, so the model works wherever its inputs live.
        positions = torch.arange(T, device=idx.device)
        pos_emb = self.positional_embedding_table(positions) # (T, n_embd)
        x = tok_emb + pos_emb # (B, T, n_embd), pos_emb broadcast over the batch
        x = self.blocks(x)
        # Final layer norm: previously created in __init__ but never applied.
        x = self.ln_f(x)
        logits = self.lm_head(x) # (B, T, vocab_size)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)
        return logits, loss

    @torch.no_grad()
    def generate(self, idx, max_new_tokens):
        """Append `max_new_tokens` sampled tokens to each row of `idx`.

        Sampling runs in eval mode (dropout off) and without autograd; the
        module's previous train/eval mode is restored before returning.

        Args:
            idx: (B, T) integer tensor holding the starting context.
            max_new_tokens: number of sampling steps to run.

        Returns:
            (B, T + max_new_tokens) tensor: the context followed by the samples.
        """
        # The context window equals the number of learned positions.
        context_len = self.positional_embedding_table.num_embeddings
        was_training = self.training
        self.eval()
        try:
            for _ in range(max_new_tokens):
                idx_cond = idx[:, -context_len:]  # crop to the last context_len tokens
                logits, _ = self(idx_cond)
                logits = logits[:, -1, :] # (B, C): only the final position matters
                probs = F.softmax(logits, dim = -1)
                idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
                idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        finally:
            self.train(was_training)
        return idx

# Build the model on the selected device and attach an AdamW optimiser.
m = GPTModel()
m = m.to(device)
optimizer = torch.optim.AdamW(m.parameters(), lr=learning_rate)

# `step` rather than `iter`: the old name shadowed the builtin iter() for the
# rest of the notebook session.
for step in tqdm(range(max_iters), desc="Training"):
    # Periodically report the averaged train/val loss.
    if step % eval_interval == 0:
        losses = estimate_loss()
        print(f"\n  step {step}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")
    xb, yb = get_batch('train')
    logits, loss = m(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

# Sample with dropout disabled and without building an autograd graph.
# NOTE: this leaves `m` in eval mode; call m.train() before training further.
m.eval()
context = torch.zeros((1, 1), dtype=torch.long, device=device)
with torch.no_grad():
    print(decode(m.generate(context, max_new_tokens=500)[0].tolist()))

1115394


Training:   0%|          | 0/5000 [00:00<?, ?it/s]


  step 0: train loss 3.0286, val loss 3.0682


Training:  10%|█         | 501/5000 [06:46<28:50:08, 23.07s/it]


  step 500: train loss 3.0320, val loss 3.0681


Training:  20%|██        | 1001/5000 [12:27<25:43:08, 23.15s/it]


  step 1000: train loss 3.0286, val loss 3.0676


Training:  30%|███       | 1501/5000 [18:06<22:18:21, 22.95s/it]


  step 1500: train loss 3.0289, val loss 3.0700


Training:  40%|████      | 2001/5000 [23:46<19:09:16, 22.99s/it]


  step 2000: train loss 3.0312, val loss 3.0676


Training:  50%|█████     | 2501/5000 [29:24<15:56:34, 22.97s/it]


  step 2500: train loss 3.0327, val loss 3.0690


Training:  60%|██████    | 3001/5000 [35:03<12:43:28, 22.92s/it]


  step 3000: train loss 3.0305, val loss 3.0662


Training:  70%|███████   | 3501/5000 [40:42<9:32:18, 22.91s/it]


  step 3500: train loss 3.0303, val loss 3.0679


Training:  80%|████████  | 4000/5000 [45:05<08:48,  1.89it/s]


  step 4000: train loss 3.0319, val loss 3.0680


Training:  90%|█████████ | 4501/5000 [51:58<3:11:11, 22.99s/it]


  step 4500: train loss 3.0286, val loss 3.0686


Training: 100%|██████████| 5000/5000 [56:21<00:00,  1.48it/s]



Touch the length of our proclaims cell'st off,
And confession out of exquision.

ANGELO:
Here can yes must report
Off the hands voices and regive; and not extremise,
There tus striught make the lose: be next not
Till I feel what was fain deputy.

MARIANA:
Let not on then charactes him and climbs.
To sliep then and poperticy'd and that
For wolving foot, bid their heads proclaiming
Were times and the changer first have
Till mine enemies or spurch'd
Than beautish'd them. He comes, fearful friends!



In [None]:
# Generate a long sample (10000 new tokens) from the trained model `m`,
# starting from a single token with id 0.
context = torch.zeros((1, 1), dtype=torch.long, device=device)
print(decode(m.generate(context, max_new_tokens=10000)[0].tolist()))


We look too? think we this man
To the Earwise hence in being farewell
Oxenusuances. Shall I have so you to his honour!

Patience:
This true?

CARDITAL:
Nay, thou hast saint on thy conqueror is,
The courtesy thought to set the Capules.

BRUTUS:
Welcomment followned to obather and Christ's Sly
To that any martless to Good smalled thou know
And bear the wiser ruther. What couran do I can
thee's neithern?

First Senator:
My lord, durst banish'd in Tirture. O Thursdom!

CORIOLANUS:
First he, my deeds, cousin, I have;
And deal as Working o'er--O happy that setige,
Meaning to me by a word;
Or by with his honours and steeps them we
To Pound of Such high dry Gaunt a rashness!
That feel he will advenge
The ungerators to long; for whose boths doot
Which intended an oath ignoble pridage,
Whilst he towards my consuls than a kder
Of carversion Clifford and
More Earcum's little battle begther,
With fox revenger, war, but look'd the creature,
And venom upon these strong of cheeks in sight,
Hath grace