In [1]:
import torch
import torch.nn as nn
import tiktoken

GPT_CONFIG_124M = { 
    "vocab_size": 50257, 
    "context_length": 256, #1024, 
    "emb_dim": 768, 
    "n_heads": 12, 
    "n_layers": 12,  # Transformer-Block-Layers
    "drop_rate": 0.1, 
    "qkv_bias": False
}


class LayerNorm(nn.Module):
    def __init__(self, emb_dim):
        super().__init__()
        self.scale = nn.Parameter(torch.ones(emb_dim))
        self.shift = nn.Parameter(torch.zeros(emb_dim))
        self.eps = 1e-5
    
    def forward(self, x: torch.Tensor):
        '''x: 3D Tensor'''
        mean = x.mean(-1, keepdim=True)
        var = x.var(-1, keepdim=True, unbiased=False) # unbiased=False => Division by `n`, rather than `n-1`
        std = x.std(-1, keepdim=True)
        x_norm = (x - mean) / torch.sqrt(var + self.eps)
        return (x_norm * self.scale + self.shift)

class FeedForward(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.layers = nn.Sequential(
            nn.Linear(config['emb_dim'], 4 * config['emb_dim']),
            GELU(),
            nn.Linear( 4 * config['emb_dim'], config['emb_dim'])
        )
    
    def forward(self, x):
        return self.layers(x)

class GELU(nn.Module):
    def __init__(self):
        super().__init__()
    
    def forward(self, x):
        return (0.5 * x * (1 + torch.tanh(
            (torch.sqrt(torch.tensor(2/torch.pi))) + (x + 0.044715 * torch.pow(x, 3))
        )))

class MultiheadAttention(nn.Module):
    def __init__(self, d_in, d_out, n_heads, context_length, dropout=0.5, qkv_bias=False):
        super().__init__()
        assert (d_out % n_heads == 0)

        self.d_in = d_in
        self.d_out = d_out
        self.W_q = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_k = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_v = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.d_head = (d_out // n_heads)
        self.n_heads = n_heads
        self.dropout = nn.Dropout(dropout)
        self.out_proj = nn.Linear(d_out, d_out)
        self.register_buffer('mask', torch.ones(context_length, context_length).triu(1).bool())
    
    def forward(self, x):
        '''x: 3D. x => (batch_size, num_tokens, token_embed)'''
        b, n_tokens, token_embed = x.shape
        assert self.d_in == token_embed
        
        Q = self.W_q(x) # (b, n_tokens, d_out)
        K = self.W_k(x)
        V = self.W_v(x)

        Q = Q.view(b, n_tokens, self.n_heads, self.d_head) # (b, n_tokens, n_heads, d_head)
        K = K.view(b, n_tokens, self.n_heads, self.d_head) 
        V = V.view(b, n_tokens, self.n_heads, self.d_head) 

        Q = Q.transpose(1, 2) # (b, n_heads, n_tokens, d_head)
        K = K.transpose(1, 2)
        V = V.transpose(1, 2)

        attn_scores = torch.matmul(Q, K.transpose(-1, -2)) / self.d_head**0.5 #K.shape[-1]**0.5
        attn_scores = attn_scores.masked_fill(self.mask[: n_tokens, : n_tokens], -torch.inf)
        attn_weights = attn_scores.softmax(-1)
        attn_weights = self.dropout(attn_weights)
        context_vectors = attn_weights @ V
        context_vectors = context_vectors.transpose(1, 2)
        context_vectors = context_vectors.contiguous().view(b, n_tokens, self.d_out)
        return self.out_proj(context_vectors)


class TransformerBlock(nn.Module):
    def __init__(self, cfg):
        super().__init__()
        self.attn = MultiheadAttention(
            d_in=cfg['emb_dim'],    # 768
            d_out=cfg['emb_dim'],   # 768
            n_heads=cfg['n_heads'], # 12
            context_length=cfg['context_length'], # 1024
            dropout=cfg['drop_rate'], # 0.1
            qkv_bias=cfg['qkv_bias']
        )
        self.ff = FeedForward(cfg)
        self.norm_1 = LayerNorm(cfg['emb_dim'])
        self.norm_2 = LayerNorm(cfg['emb_dim'])
        self.dropout = nn.Dropout(cfg['drop_rate'])
    
    def forward(self, x):
        # Part 1:
        shortcut = x
        x = self.norm_1(x)
        x = self.attn(x)
        x = self.dropout(x)
        x = x + shortcut

        # Part 2:
        shortcut = x
        x = self.norm_2(x)
        x = self.ff(x)
        x = self.dropout(x)
        x = x + shortcut
        return x

class GPTModel(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.tok_emb = nn.Embedding(config['vocab_size'], config['emb_dim'])
        self.pos_emb = nn.Embedding(config['context_length'], config['emb_dim'])
        self.drop_emb = nn.Dropout(config['drop_rate'])
        self.transf_layers = nn.Sequential(*[TransformerBlock(config) for _ in range(config['n_layers'])])
        self.final_norm = LayerNorm(config['emb_dim'])
        self.out_head = nn.Linear(config['emb_dim'], config['vocab_size'], bias=False)
    
    def forward(self, x, show_info=False):
        '''x: 2D Matrix'''
        batch_size, seq_len = x.shape 
        tok_emb = self.tok_emb(x) 
        pos_emb = self.pos_emb(torch.arange(seq_len))
        x = tok_emb + pos_emb
        if show_info:
            print(f'Token-Embed(shape): {tok_emb.shape}')
            print(f'POS-Embed(shape): {pos_emb.shape}')
            print(f'i/p Before TransfBlocks(shape): {x.shape}')
        x = self.drop_emb(x)
        x = self.transf_layers(x)
        x = self.final_norm(x)
        logits = self.out_head(x)
        return logits


In [2]:
torch.manual_seed(42)
model = GPTModel(GPT_CONFIG_124M)
tokenizer = tiktoken.get_encoding('gpt2')

In [3]:
def generate_text_simple(model, tokenizerGpt2:tiktoken.Encoding, txt, max_output_token):
    encoding = torch.tensor(tokenizerGpt2.encode(txt))
    encoding.unsqueeze_(0)
    for i in range(max_output_token):
        with torch.no_grad():
            out = model(encoding)
        logits = out[:, -1, :] 
        probs = logits.softmax(-1)
        idx = probs.argmax(dim=-1) # idx of the token with highest prob. [NOTE: This `idx` acts as the tokenId]
        idx.unsqueeze_(0)
        encoding =  torch.cat((encoding.squeeze(0), idx.squeeze(0)))

        if (i == max_output_token - 1):
            print(tokenizer.decode(encoding.numpy()))
        encoding.unsqueeze_(0)

In [4]:
generate_text_simple(model, tokenizer, "hello from the other side", 10)

hello from the other side reproduce Actress govern PROG SergeantHAEL Kw BulgariaSetup rate


In [5]:
# Example:
txt1 = "every effort moves"
txt2 = "I really like"
input = torch.stack(
    (torch.tensor(tokenizer.encode(txt1)), 
    torch.tensor(tokenizer.encode(txt2)))
)
input

tensor([[16833,  3626,  6100],
        [   40,  1107,   588]])

In [6]:
with torch.no_grad():
    logits = model(input)
probs = logits.softmax(-1)
print(probs)

tensor([[[6.6890e-05, 2.0877e-05, 5.4307e-06,  ..., 9.2617e-06,
          3.2934e-05, 1.2245e-05],
         [2.8620e-05, 1.3605e-05, 6.1841e-06,  ..., 9.4409e-06,
          1.4269e-05, 4.5582e-05],
         [1.3782e-05, 1.0714e-05, 2.4772e-05,  ..., 1.7069e-05,
          5.8540e-06, 4.0807e-05]],

        [[1.6585e-05, 3.2164e-05, 1.7628e-05,  ..., 2.9909e-05,
          1.7773e-05, 3.5851e-05],
         [1.4002e-05, 1.8186e-05, 7.5528e-06,  ..., 1.5048e-05,
          2.6338e-05, 6.7893e-05],
         [4.9658e-05, 6.5607e-06, 1.4103e-05,  ..., 7.7584e-06,
          7.9725e-06, 5.5176e-06]]])


In [7]:
preditedTokenIDs = probs.argmax(-1, keepdim=True)
preditedTokenIDs 

tensor([[[34873],
         [44449],
         [40115]],

        [[18938],
         [31548],
         [20399]]])

In [8]:
text_idx = 0 
target_probas_1 = probs[text_idx, [0, 1, 2], targets[text_idx]] 
print("Text 1:", target_probas_1)

NameError: name 'targets' is not defined

In [None]:
# Finding loss
inputs = torch.tensor([[16833, 3626, 6100],   # ["every effort moves",
                       [40,    1107, 588]])   #  "I really like"]

targets = torch.tensor([[3626, 6100, 345  ],  # [" effort moves you",
                        [1107,  588, 11311]]) #  " really like chocolate"]

with torch.no_grad():
    logits = model(inputs)

logits

tensor([[[ 7.7146e-01,  1.9050e-01, -6.7213e-01,  ..., -4.5644e-01,
           5.8417e-01, -1.5942e-01],
         [ 6.5998e-01, -3.6519e-01, -7.6065e-02,  ..., -8.1529e-01,
          -3.9120e-01,  1.3219e+00],
         [ 1.4689e-01, -3.7880e-01,  4.7630e-01,  ..., -5.2538e-02,
          -1.2468e+00,  1.1015e+00]],

        [[ 1.6733e-01,  2.7546e-01,  1.3350e-01,  ...,  1.0224e-01,
           3.6566e-01,  3.1196e-01],
         [ 1.4522e-01,  2.4268e-01, -3.0002e-01,  ..., -5.5778e-01,
           6.0691e-01,  7.3178e-01],
         [ 9.1627e-01, -9.9669e-01, -4.5703e-01,  ..., -7.9702e-01,
          -1.3285e+00, -1.1740e+00]]])

In [None]:
probs = logits.softmax(-1)
probs

tensor([[[3.6601e-05, 2.0474e-05, 8.6409e-06,  ..., 1.0721e-05,
          3.0350e-05, 1.4429e-05],
         [3.2548e-05, 1.1676e-05, 1.5591e-05,  ..., 7.4444e-06,
          1.1377e-05, 6.3094e-05],
         [1.9507e-05, 1.1531e-05, 2.7117e-05,  ..., 1.5980e-05,
          4.8408e-06, 5.0669e-05]],

        [[1.9817e-05, 2.2079e-05, 1.9157e-05,  ..., 1.8568e-05,
          2.4164e-05, 2.2900e-05],
         [1.9423e-05, 2.1411e-05, 1.2443e-05,  ..., 9.6161e-06,
          3.0819e-05, 3.4918e-05],
         [4.1983e-05, 6.1985e-06, 1.0633e-05,  ..., 7.5684e-06,
          4.4481e-06, 5.1914e-06]]])

In [None]:
# Target 1
logits[0].argmax(-1)

tensor([35316, 41252,  8311])

In [None]:
probs.argmax(-1, keepdim=True)

tensor([[[35316],
         [41252],
         [ 8311]],

        [[29716],
         [40825],
         [40942]]])

In [None]:
# Batch 01:
targets[0] # [3626, 6100,  345]
print(tokenizer.decode(targets[0].numpy())) # Target
print(tokenizer.decode(probs[0].argmax(-1).numpy())) # Output

 effort moves you
REP Heller discipl


In [None]:
# Batch 01:
print(probs[0])

tensor([[3.6601e-05, 2.0474e-05, 8.6409e-06,  ..., 1.0721e-05, 3.0350e-05,
         1.4429e-05],
        [3.2548e-05, 1.1676e-05, 1.5591e-05,  ..., 7.4444e-06, 1.1377e-05,
         6.3094e-05],
        [1.9507e-05, 1.1531e-05, 2.7117e-05,  ..., 1.5980e-05, 4.8408e-06,
         5.0669e-05]])


In [None]:
print(probs[0, : , targets[0]])

tensor([[4.1144e-06, 1.0326e-05, 2.1522e-05],
        [2.3320e-05, 2.4707e-05, 2.2847e-05],
        [9.9188e-06, 6.5877e-06, 9.7729e-06]])


In [None]:
print(probs[0, [0, 1, 2] , targets[0]])

tensor([4.1144e-06, 2.4707e-05, 9.7729e-06])


In [None]:
print(probs[0, [1, 2] , targets[0]])

IndexError: shape mismatch: indexing tensors could not be broadcast together with shapes [2], [3]

In [None]:
targets.shape

torch.Size([2, 3])

In [None]:
probs.shape

torch.Size([2, 3, 50257])

In [10]:
probs = [0.7, 0.2, 0.1]
import math
math.log(probs)

TypeError: must be real number, not list

In [12]:
# Calculating Text Generation Loss
inputs = torch.tensor([[16833, 3626, 6100],   # ["every effort moves",
                       [40,    1107, 588]])   #  "I really like"]

targets = torch.tensor([[3626, 6100, 345  ],  # [" effort moves you",
                        [1107,  588, 11311]]) #  " really like chocolate"]

In [14]:
with torch.no_grad():
    logits = model(inputs)
probs = logits.softmax(-1)
probs

tensor([[[5.7891e-05, 2.3864e-05, 8.9075e-06,  ..., 1.2417e-05,
          1.8809e-05, 1.0259e-05],
         [2.9705e-05, 1.6748e-05, 1.0336e-05,  ..., 7.2643e-06,
          1.7164e-05, 5.9128e-05],
         [1.4773e-05, 8.2429e-06, 2.1888e-05,  ..., 1.6721e-05,
          4.2902e-06, 3.1793e-05]],

        [[1.6110e-05, 2.5260e-05, 2.4354e-05,  ..., 2.7548e-05,
          3.4932e-05, 3.1465e-05],
         [1.3854e-05, 3.4414e-05, 1.3104e-05,  ..., 1.0335e-05,
          1.5262e-05, 5.0872e-05],
         [3.2895e-05, 8.5632e-06, 1.5159e-05,  ..., 9.0781e-06,
          9.3584e-06, 7.5844e-06]]])

In [15]:
print(probs.shape)

torch.Size([2, 3, 50257])


In [20]:
token_ids = probs.argmax(-1, keepdim=True)
token_ids 

tensor([[[ 9586],
         [  406],
         [40115]],

        [[29716],
         [ 8185],
         [42576]]])

In [21]:
print(targets[0])
print(token_ids[0])

tensor([3626, 6100,  345])
tensor([[ 9586],
        [  406],
        [40115]])


In [31]:
def tokenid_to_text(tokens: torch.Tensor, tokenizer):
    tokens = tokens.flatten()
    return tokenizer.decode(tokens.numpy())

print(f'Targets: {tokenid_to_text(targets[0], tokenizer)}')
print(f'Output: {tokenid_to_text(token_ids[0], tokenizer)}')

Targets:  effort moves you
Output: aved L HO


In [38]:
# For txt1:
# ðŸ“Œ Probs corresponding to the Target Indexes:
# Our goal is to maximize this probs, i.e Bring them closer to 1
probs[0, [0, 1, 2], targets[0]]

tensor([5.3301e-06, 3.1335e-05, 1.3145e-05])

In [46]:
target_probas_1 = probs[0, [0, 1, 2], targets[0]]
target_probas_2 = probs[1, [0, 1, 2], targets[1]]

print(target_probas_1)
print(target_probas_2)
print(torch.cat((target_probas_1, target_probas_2), -1))

tensor([5.3301e-06, 3.1335e-05, 1.3145e-05])
tensor([9.3920e-06, 3.0030e-05, 7.5925e-06])
tensor([5.3301e-06, 3.1335e-05, 1.3145e-05, 9.3920e-06, 3.0030e-05, 7.5925e-06])


In [49]:
log_probs = torch.log(torch.cat((target_probas_1, target_probas_2), -1))
print(log_probs)

tensor([-12.1421, -10.3708, -11.2395, -11.5757, -10.4133, -11.7884])


In [51]:
avg_log_probs = log_probs.mean()
avg_log_probs 

tensor(-11.2549)

In [52]:
neg_avg_log_probs = avg_log_probs * (-1)
neg_avg_log_probs

tensor(11.2549)

In [54]:
logits, targets

(tensor([[[ 1.2325,  0.3463, -0.6392,  ..., -0.3070,  0.1082, -0.4979],
          [ 0.5678, -0.0053, -0.4879,  ..., -0.8406,  0.0193,  1.2562],
          [-0.1296, -0.7131,  0.2635,  ..., -0.0058, -1.3661,  0.6368]],
 
         [[-0.0434,  0.4064,  0.3699,  ...,  0.4931,  0.7306,  0.6261],
          [-0.1906,  0.7193, -0.2463,  ..., -0.4837, -0.0938,  1.1101],
          [ 0.6722, -0.6736, -0.1025,  ..., -0.6152, -0.5848, -0.7950]]]),
 tensor([[ 3626,  6100,   345],
         [ 1107,   588, 11311]]))

In [67]:
# logits.flatten(0, 1).shape, targets.shape
loss = torch.nn.functional.cross_entropy(
    logits.flatten(0, 1), # (2x3, 50275) = (6, 50275)
    targets.flatten() # (2x3) = (6)
)
print(loss)

tensor(11.2549)


In [68]:
perplexity = torch.exp(loss)
perplexity 

tensor(77261.0156)

In [70]:
l = torch.tensor([
    [0.5, 0.3, 0.2],
    [0.4, 0.1, 0.5],
])

t = torch.tensor([1, 2])

torch.nn.functional.cross_entropy(l, t)

tensor(1.0429)

In [82]:
FILE_PATH = '../../ch02/01_main-chapter-code/the-verdict.txt'

In [95]:
with open(FILE_PATH, 'r') as f:
    txt = f.read()
len(txt) # <Total-Characters = 20479>

20479

In [96]:
total_tokens = tokenizer.encode(txt)
print(total_tokens)
print(len(total_tokens)) # 5145

[40, 367, 2885, 1464, 1807, 3619, 402, 271, 10899, 2138, 257, 7026, 15632, 438, 2016, 257, 922, 5891, 1576, 438, 568, 340, 373, 645, 1049, 5975, 284, 502, 284, 3285, 326, 11, 287, 262, 6001, 286, 465, 13476, 11, 339, 550, 5710, 465, 12036, 11, 6405, 257, 5527, 27075, 11, 290, 4920, 2241, 287, 257, 4489, 64, 319, 262, 34686, 41976, 13, 357, 10915, 314, 2138, 1807, 340, 561, 423, 587, 10598, 393, 28537, 2014, 198, 198, 1, 464, 6001, 286, 465, 13476, 1, 438, 5562, 373, 644, 262, 1466, 1444, 340, 13, 314, 460, 3285, 9074, 13, 46606, 536, 5469, 438, 14363, 938, 4842, 1650, 353, 438, 2934, 489, 3255, 465, 48422, 540, 450, 67, 3299, 13, 366, 5189, 1781, 340, 338, 1016, 284, 3758, 262, 1988, 286, 616, 4286, 705, 1014, 510, 26, 475, 314, 836, 470, 892, 286, 326, 11, 1770, 13, 8759, 2763, 438, 1169, 2994, 284, 943, 17034, 318, 477, 314, 892, 286, 526, 383, 1573, 11, 319, 9074, 13, 536, 5469, 338, 11914, 11, 33096, 663, 4808, 3808, 62, 355, 996, 484, 547, 12548, 287, 281, 13079, 410, 12523, 286, 

In [112]:
ratio = 0.9
split_idx = int(len(txt) * ratio)
train_txt = txt[: split_idx]
val_txt = txt[ split_idx :]

In [None]:
# Dataloader:

def create_dataloader(): 
    ...

tokenizer = tiktoken.get_encoding('gpt2')
dataset = 