In [14]:
import torch
import torch.nn as nn

torch.set_printoptions(sci_mode=False)

## **Basic Idea for normalization**

In [15]:
x = torch.rand((2, 4, 5))
mean = x.mean(dim=-1, keepdim=True)
var = x.var(dim=-1, keepdim=True)
std = x.std(dim=-1, keepdim=True)

print(mean,'\n')
print(var, '\n')
print(std, '\n')

tensor([[[0.6342],
         [0.4079],
         [0.4294],
         [0.3939]],

        [[0.5814],
         [0.4683],
         [0.6340],
         [0.3814]]]) 

tensor([[[0.0433],
         [0.0765],
         [0.0690],
         [0.0905]],

        [[0.0776],
         [0.0980],
         [0.0629],
         [0.1295]]]) 

tensor([[[0.2082],
         [0.2765],
         [0.2627],
         [0.3009]],

        [[0.2786],
         [0.3130],
         [0.2508],
         [0.3598]]]) 



In [17]:
x = torch.rand((2, 4, 5))
mean = x.mean(dim=-1, keepdim=True)
var = x.var(dim=-1, keepdim=True)
std = x.std(dim=-1, keepdim=True)

x_norm = (x - mean) / std
print(x_norm.mean(-1, keepdim=True))
print(x_norm.var(-1, keepdim=True))

tensor([[[-0.0000],
         [-0.0000],
         [-0.0000],
         [ 0.0000]],

        [[-0.0000],
         [ 0.0000],
         [ 0.0000],
         [-0.0000]]])
tensor([[[1.0000],
         [1.0000],
         [1.0000],
         [1.0000]],

        [[1.0000],
         [1.0000],
         [1.0000],
         [1.0000]]])


## **LayerNorm Class**

In [97]:
# Layer Normalization
class LayerNorm(nn.Module):
    def __init__(self, emb_dim):
        super().__init__()
        self.scale = nn.Parameter(torch.ones(emb_dim))
        self.shift = nn.Parameter(torch.zeros(emb_dim))
        self.eps = 1e-5
    
    def forward(self, x: torch.Tensor):
        '''x: 3D Tensor'''
        mean = x.mean(-1, keepdim=True)
        var = x.var(-1, keepdim=True, unbiased=False) # unbiased=False => Division by `n`, rather than `n-1`
        std = x.std(-1, keepdim=True)
        x_norm = (x - mean) / torch.sqrt(var + self.eps)
        # x_norm = (x - mean) / torch.sqrt(var)
        # print(x_norm.)
        print(x_norm.mean(-1, keepdims=True))
        print(x_norm.var(-1, keepdims=True))
        return (x_norm * self.scale + self.shift)

In [116]:
# IN action
x = torch.randn(2, 129750)
ln = LayerNorm(emb_dim=129750)
out_norm = ln(x)
# print(out_norm.mean(-1, keepdims=True))
# print(out_norm.var(-1, keepdims=True))

tensor([[-0.0000],
        [-0.0000]])
tensor([[1.0000],
        [1.0000]])


# **`GELU()` Activation**


<img src = "./gelu.png" width = "900" height = "300">

In [118]:
class GELU(nn.Module):
    def __init__(self):
        super().__init__()
    
    def forward(self, x):
        return (0.5 * x * (1 + torch.tanh(
            (torch.sqrt(torch.tensor(2/torch.pi))) + (x + 0.044715 * torch.pow(x, 3))
        )))

# **`FeedForward` Network**

In [119]:
GPT_CONFIG_124M = { 
    "vocab_size": 50257, 
    "context_length": 1024, 
    "emb_dim": 768, 
    "n_heads": 12, 
    "n_layers": 12,  # Transformer-Block-Layers
    "drop_rate": 0.1, 
    "qkv_bias": False
}

In [121]:
class FeedForward(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.layers = nn.Sequential(
            nn.Linear(config['emb_dim'], 4 * config['emb_dim']),
            GELU(),
            nn.Linear( 4 * config['emb_dim'], config['emb_dim'])
        )
    
    def forward(self, x):
        return self.layers(x)

In [122]:
ffn = FeedForward(GPT_CONFIG_124M)
x = torch.rand(2, 3, 768)
out = ffn(x)
out.shape

torch.Size([2, 3, 768])

# **Skip-Connections**

In [138]:
class Example_ForSkipConnections(nn.Module):
    def __init__(self, skip, layers):
        super().__init__()
        self.use_skipconnect = skip
        self.layers = nn.ModuleList([
            nn.Sequential(nn.Linear(layers[0], layers[1]), GELU()),
            nn.Sequential(nn.Linear(layers[1], layers[2]), GELU()),
            nn.Sequential(nn.Linear(layers[2], layers[3]), GELU()),
            nn.Sequential(nn.Linear(layers[3], layers[4]), GELU()),
            nn.Sequential(nn.Linear(layers[4], layers[5]), GELU()),
        ])
    
    def forward(self, x):
        for i, layer in enumerate(self.layers):
            layer_output = layer(x)
            if self.use_skipconnect and x.shape == layer_output.shape:
                x += layer_output
            else:
                x = layer(x)
        return x
            

layers = [5, 5, 4, 3, 3, 1]
x = torch.rand((5, 4, 5))
model = Example_ForSkipConnections(True, layers)
model(x)

tensor([[[-0.3882],
         [-0.3819],
         [-0.3874],
         [-0.3842]],

        [[-0.3808],
         [-0.3821],
         [-0.3846],
         [-0.3773]],

        [[-0.3813],
         [-0.3809],
         [-0.3837],
         [-0.3844]],

        [[-0.3794],
         [-0.3843],
         [-0.3772],
         [-0.3861]],

        [[-0.3802],
         [-0.3799],
         [-0.3831],
         [-0.3866]]], grad_fn=<MulBackward0>)

In [151]:
x = torch.tensor([-1., 0., 1.])
model_withSkipConnections = Example_ForSkipConnections(skip=True, layers=[3, 3, 3, 3, 3, 1])
model_withoutSkipConnections = Example_ForSkipConnections(skip=False, layers=[3, 3, 3, 3, 3, 1])

In [170]:
def print_gradient(model, x):
    out = model(x)
    target = torch.tensor([0.])
    criterion = nn.MSELoss()
    loss = criterion(out, target)
    loss.backward()

    for name, params in model.named_parameters():
        params: torch.Tensor
        
        print(f'{name} -> {params.grad.abs().mean().item()}\n')

In [171]:
print_gradient(model_withoutSkipConnections, x)

layers.0.0.weight -> 0.03002546913921833

layers.0.0.bias -> 0.04503820464015007

layers.1.0.weight -> 0.08033671975135803

layers.1.0.bias -> 0.20734548568725586

layers.2.0.weight -> 0.18563634157180786

layers.2.0.bias -> 0.6501616835594177

layers.3.0.weight -> 0.34845197200775146

layers.3.0.bias -> 0.9220051765441895

layers.4.0.weight -> 1.416274905204773

layers.4.0.bias -> 4.7667927742004395



# **Transformer Block**

In [192]:
import torch 
import torch.nn as nn

In [193]:
class FeedForward(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.layers = nn.Sequential(
            nn.Linear(config['emb_dim'], 4 * config['emb_dim']),
            GELU(),
            nn.Linear( 4 * config['emb_dim'], config['emb_dim'])
        )
    
    def forward(self, x):
        return self.layers(x)

In [194]:
class LayerNorm(nn.Module):
    def __init__(self, emb_dim):
        super().__init__()
        self.scale = nn.Parameter(torch.ones(emb_dim))
        self.shift = nn.Parameter(torch.zeros(emb_dim))
        self.eps = 1e-5
    
    def forward(self, x: torch.Tensor):
        '''x: 3D Tensor'''
        mean = x.mean(-1, keepdim=True)
        var = x.var(-1, keepdim=True, unbiased=False) # unbiased=False => Division by `n`, rather than `n-1`
        std = x.std(-1, keepdim=True)
        x_norm = (x - mean) / torch.sqrt(var + self.eps)
        return (x_norm * self.scale + self.shift)

In [195]:
class MultiheadAttention(nn.Module):
    def __init__(self, d_in, d_out, n_heads, context_length, dropout=0.5, qkv_bias=False):
        super().__init__()
        assert (d_out % n_heads == 0)

        self.d_in = d_in
        self.d_out = d_out
        self.W_q = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_k = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_v = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.d_head = (d_out // n_heads)
        self.n_heads = n_heads
        self.dropout = nn.Dropout(dropout)
        self.out_proj = nn.Linear(d_out, d_out)
        self.register_buffer('mask', torch.ones(context_length, context_length).triu(1).bool())
    
    def forward(self, x):
        '''x: 3D. x => (batch_size, num_tokens, token_embed)'''
        b, n_tokens, token_embed = x.shape
        assert self.d_in == token_embed
        
        Q = self.W_q(x) # (b, n_tokens, d_out)
        K = self.W_k(x)
        V = self.W_v(x)

        Q = Q.view(b, n_tokens, self.n_heads, self.d_head) # (b, n_tokens, n_heads, d_head)
        K = K.view(b, n_tokens, self.n_heads, self.d_head) 
        V = V.view(b, n_tokens, self.n_heads, self.d_head) 

        Q = Q.transpose(1, 2) # (b, n_heads, n_tokens, d_head)
        K = K.transpose(1, 2)
        V = V.transpose(1, 2)

        attn_scores = torch.matmul(Q, K.transpose(-1, -2)) / self.d_head**0.5 #K.shape[-1]**0.5
        attn_scores = attn_scores.masked_fill(self.mask[: n_tokens, : n_tokens], -torch.inf)
        attn_weights = attn_scores.softmax(-1)
        attn_weights = self.dropout(attn_weights)
        context_vectors = attn_weights @ V
        context_vectors = context_vectors.transpose(1, 2)
        context_vectors = context_vectors.contiguous().view(b, n_tokens, self.d_out)
        return self.out_proj(context_vectors)

In [196]:
GPT_CONFIG_124M

{'vocab_size': 50257,
 'context_length': 1024,
 'emb_dim': 768,
 'n_heads': 12,
 'n_layers': 12,
 'drop_rate': 0.1,
 'qkv_bias': False}

In [197]:
class TransformerBlock(nn.Module):
    def __init__(self, cfg):
        super().__init__()
        self.attn = MultiheadAttention(
            d_in=cfg['emb_dim'],    # 768
            d_out=cfg['emb_dim'],   # 768
            n_heads=cfg['n_heads'], # 12
            context_length=cfg['context_length'], # 1024
            dropout=cfg['drop_rate'], # 0.1
            qkv_bias=cfg['qkv_bias']
        )
        self.ff = FeedForward(cfg)
        self.norm_1 = LayerNorm(cfg['emb_dim'])
        self.norm_2 = LayerNorm(cfg['emb_dim'])
        self.dropout = nn.Dropout(cfg['drop_rate'])
    
    def forward(self, x):
        # Part 1:
        shortcut = x
        x = self.norm_1(x)
        x = self.attn(x)
        x = self.dropout(x)
        x = x + shortcut

        # Part 2:
        shortcut = x
        x = self.norm_2(x)
        x = self.ff(x)
        x = self.dropout(x)
        x = x + shortcut
        return x

In [204]:
x = torch.rand(2, 4, 768)
block = TransformerBlock(GPT_CONFIG_124M)
block(x)

tensor([[[-0.0246,  0.3926,  1.2577,  ...,  1.0415, -0.0892,  1.1852],
         [ 0.7539,  0.2603,  1.3598,  ...,  0.1263,  0.5215,  0.7429],
         [ 0.8341, -0.2646,  1.3030,  ...,  0.4057,  0.1048,  0.6147],
         [ 1.4581,  0.5226,  1.0878,  ...,  0.5892,  0.1237,  0.1635]],

        [[ 1.0880,  0.7147,  0.4273,  ...,  0.8534,  0.5096,  0.8744],
         [ 1.0296,  0.6966,  0.6257,  ...,  0.7547,  0.5791,  1.3049],
         [ 1.1526,  1.1987,  0.9486,  ...,  0.2142,  0.7802,  0.5957],
         [ 1.0588,  0.5211,  0.6352,  ...,  0.0770,  1.0873,  0.7978]]],
       grad_fn=<AddBackward0>)

# **GPT Model**

In [280]:
GPT_CONFIG_124M

{'vocab_size': 50257,
 'context_length': 1024,
 'emb_dim': 768,
 'n_heads': 12,
 'n_layers': 12,
 'drop_rate': 0.1,
 'qkv_bias': False}

In [None]:
class GPTModel(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.tok_emb = nn.Embedding(config['vocab_size'], config['emb_dim'])
        self.pos_emb = nn.Embedding(config['context_length'], config['emb_dim'])
        self.drop_emb = nn.Dropout(config['drop_rate'])
        self.transf_layers = nn.Sequential(*[TransformerBlock(config) for _ in range(config['n_layers'])])
        self.final_norm = LayerNorm(config['emb_dim'])
        self.out_head = nn.Linear(config['emb_dim'], config['vocab_size'], bias=False)
    
    def forward(self, x, show_info=False):
        '''x: 2D Matrix'''
        batch_size, seq_len = x.shape 
        tok_emb = self.tok_emb(x) 
        pos_emb = self.pos_emb(torch.arange(seq_len))
        x = tok_emb + pos_emb
        if show_info:
            print(f'Token-Embed(shape): {tok_emb.shape}')
            print(f'POS-Embed(shape): {pos_emb.shape}')
            print(f'i/p Before TransfBlocks(shape): {x.shape}')
        x = self.drop_emb(x)
        x = self.transf_layers(x)
        x = self.final_norm(x)
        logits = self.out_head(x)
        return logits

In [281]:
GPT_CONFIG_2 = {
    'vocab_size': 50257,
    'context_length': 1024,
    'emb_dim': 768,
    'n_heads': 12,
    'n_layers': 1, #12,
    'drop_rate': 0.1,
    'qkv_bias': False
}

In [289]:
import tiktoken
tokenizer = tiktoken.get_encoding('gpt2')

txt1 = 'how are you?'
txt2 = 'Sky is blue.'
encoding_1 = tokenizer.encode(txt1)
encoding_2 = tokenizer.encode(txt2)

print(encoding_1)
print(encoding_2)

[4919, 389, 345, 30]
[22308, 318, 4171, 13]


In [291]:
print(tokenizer.decode(encoding_1))
print(tokenizer.decode(encoding_2))

how are you?
Sky is blue.


In [292]:
batch = torch.tensor([encoding_1, encoding_2])
batch

tensor([[ 4919,   389,   345,    30],
        [22308,   318,  4171,    13]])

In [296]:
model = GPTModel(GPT_CONFIG_2)
y = model(batch)
y

tensor([[[ 7.8062e-01, -5.9063e-02, -1.0971e-01,  ...,  3.0334e-01,
           4.2442e-01,  1.0424e+00],
         [ 1.7165e-01,  6.2726e-01, -4.7989e-02,  ...,  1.0843e+00,
          -2.7839e-01, -6.6204e-01],
         [ 2.3094e-01, -2.7239e-02,  8.7531e-01,  ...,  3.6546e-01,
          -4.2931e-02, -1.3892e+00],
         [-6.0233e-01, -2.9709e-02,  2.3679e-01,  ..., -1.1108e+00,
          -4.1140e-01,  1.1067e+00]],

        [[ 8.4541e-01, -4.2585e-01, -2.6646e-01,  ..., -1.8422e-01,
          -2.9062e-01,  7.6912e-01],
         [-1.4991e-01,  5.9588e-01,  6.0895e-01,  ...,  5.0343e-01,
           2.4041e-01,  3.9879e-01],
         [ 4.2235e-01,  1.5683e-01,  5.1768e-01,  ..., -3.8709e-01,
          -9.5160e-01, -1.3042e+00],
         [-9.1603e-02,  1.2957e-01,  5.5106e-01,  ..., -7.3786e-01,
          -1.2140e+00, -5.1783e-02]]], grad_fn=<UnsafeViewBackward0>)

In [297]:
y.shape

torch.Size([2, 4, 50257])

In [311]:
# batch_1_next_token 
nxt_token = torch.argmax(y[0][-1].softmax(-1))
tokenizer.decode([nxt_token.item()])

' control'

# **`Experiment`**

In [341]:
model = GPTModel(GPT_CONFIG_2)

In [342]:
t = torch.arange(0, 5)
print(t)
print(t[-1:])
print(t[-2:])
print(t[-10:]) # everything..
# Assuming (10) is the context_length

tensor([0, 1, 2, 3, 4])
tensor([4])
tensor([3, 4])
tensor([0, 1, 2, 3, 4])


In [343]:
txt = "Hello I am "
encoding = torch.tensor(tokenizer.encode(txt))
encoding.unsqueeze_(0)
print(encoding)

tensor([[15496,   314,   716,   220]])


In [None]:
# torch.cat(torch.tensor(([1, 2, 3]), torch.tensor([-5])))
x1 = torch.tensor([1, 2])
x2 = torch.tensor([3, 4])
# torch.cat((x1, x2))


tensor([1, 2, 3, 4])

In [427]:
# encoding[1, 0] = -10
encoding

tensor([[15496,   314,   716,   220],
        [  -10,   995,  7510,    30]])

In [429]:
idx = encoding.argmax(-1)
idx

tensor([0, 2])

In [None]:
# encoding[torch.arange(encoding.shape[0]), idx].unsqueeze(1)

tensor([[15496],
        [ 7510]])

In [423]:
encoding

tensor([[15496,   314,   716,   220],
        [25374,   995,  7510,    30]])

In [473]:
t = torch.tensor([[15496,   314,   716,   220]])
model(t)

tensor([[[ 1.1944e-01, -1.4650e-01, -5.3666e-01,  ...,  9.6915e-01,
          -2.5227e-01,  8.9651e-02],
         [-5.2241e-01,  2.2536e-01, -6.1111e-01,  ...,  9.1724e-01,
           2.2657e-01,  2.1270e-01],
         [ 8.8319e-02, -7.8109e-02, -1.3477e-02,  ...,  1.2279e+00,
           7.6103e-01,  3.7268e-01],
         [-9.7600e-02, -1.0451e-01,  6.6069e-01,  ...,  6.9136e-02,
          -7.5239e-02, -6.6359e-01]]], grad_fn=<UnsafeViewBackward0>)

In [479]:
txt1 = "Hello I am "
# txt2 = "HI world shit?"
encoding1 = tokenizer.encode(txt1)
# encoding2 = tokenizer.encode(txt2)
encoding = torch.tensor([
    encoding1, 
    # encoding2
])
# encoding.unsqueeze_(0)

max_output_token = 15
for i in range(max_output_token):
    with torch.no_grad():
        out = model(encoding)
    logits = out[:, -1, :] 
    probs = logits.softmax(-1)
    idx = probs.argmax(dim=-1) # idx of the token with highest prob. [NOTE: This `idx` acts as the tokenId]
    idx.unsqueeze_(0)
    encoding =  torch.cat((encoding.squeeze(0), idx.squeeze(0)))
    print(tokenizer.decode(encoding.numpy()))
    encoding.unsqueeze_(0)

Hello I am  trenches
Hello I am  trenches digest
Hello I am  trenches digest credits
Hello I am  trenches digest credits (~
Hello I am  trenches digest credits (~ Ud
Hello I am  trenches digest credits (~ Udaturated
Hello I am  trenches digest credits (~ Udaturated restrain
Hello I am  trenches digest credits (~ Udaturated restrain testimonies
Hello I am  trenches digest credits (~ Udaturated restrain testimonies Senegal
Hello I am  trenches digest credits (~ Udaturated restrain testimonies Senegalanus
Hello I am  trenches digest credits (~ Udaturated restrain testimonies SenegalanusSteve
Hello I am  trenches digest credits (~ Udaturated restrain testimonies SenegalanusSteve teenagers
Hello I am  trenches digest credits (~ Udaturated restrain testimonies SenegalanusSteve teenagers龍�
Hello I am  trenches digest credits (~ Udaturated restrain testimonies SenegalanusSteve teenagers龍� PDF
Hello I am  trenches digest credits (~ Udaturated restrain testimonies SenegalanusSteve teenagers龍� PD