In [1]:
import torch
import torch.nn as nn

torch.set_printoptions(sci_mode=False)

## **Basic Idea for normalization**

In [2]:
x = torch.rand((2, 4, 5))
mean = x.mean(dim=-1, keepdim=True)
var = x.var(dim=-1, keepdim=True)
std = x.std(dim=-1, keepdim=True)

print(mean,'\n')
print(var, '\n')
print(std, '\n')

tensor([[[0.5235],
         [0.5720],
         [0.7532],
         [0.5854]],

        [[0.3447],
         [0.6899],
         [0.3227],
         [0.4088]]]) 

tensor([[[0.0785],
         [0.0656],
         [0.0765],
         [0.0984]],

        [[0.0488],
         [0.0514],
         [0.0227],
         [0.0591]]]) 

tensor([[[0.2801],
         [0.2562],
         [0.2767],
         [0.3137]],

        [[0.2209],
         [0.2268],
         [0.1505],
         [0.2432]]]) 



In [3]:
x = torch.rand((2, 4, 5))
mean = x.mean(dim=-1, keepdim=True)
var = x.var(dim=-1, keepdim=True)
std = x.std(dim=-1, keepdim=True)

x_norm = (x - mean) / std
print(x_norm.mean(-1, keepdim=True))
print(x_norm.var(-1, keepdim=True))

tensor([[[-0.0000],
         [ 0.0000],
         [ 0.0000],
         [ 0.0000]],

        [[-0.0000],
         [-0.0000],
         [ 0.0000],
         [ 0.0000]]])
tensor([[[1.0000],
         [1.0000],
         [1.0000],
         [1.0000]],

        [[1.0000],
         [1.0000],
         [1.0000],
         [1.0000]]])


## **LayerNorm Class**

In [4]:
# Layer Normalization
class LayerNorm(nn.Module):
    def __init__(self, emb_dim):
        super().__init__()
        self.scale = nn.Parameter(torch.ones(emb_dim))
        self.shift = nn.Parameter(torch.zeros(emb_dim))
        self.eps = 1e-5
    
    def forward(self, x: torch.Tensor):
        '''x: 3D Tensor'''
        mean = x.mean(-1, keepdim=True)
        var = x.var(-1, keepdim=True, unbiased=False) # unbiased=False => Division by `n`, rather than `n-1`
        std = x.std(-1, keepdim=True)
        x_norm = (x - mean) / torch.sqrt(var + self.eps)
        # x_norm = (x - mean) / torch.sqrt(var)
        # print(x_norm.)
        print(x_norm.mean(-1, keepdims=True))
        print(x_norm.var(-1, keepdims=True))
        return (x_norm * self.scale + self.shift)

In [5]:
# IN action
x = torch.randn(2, 129750)
ln = LayerNorm(emb_dim=129750)
out_norm = ln(x)
# print(out_norm.mean(-1, keepdims=True))
# print(out_norm.var(-1, keepdims=True))

tensor([[-0.0000],
        [-0.0000]])
tensor([[1.0000],
        [1.0000]])


# **`GELU()` Activation**


<img src = "./gelu.png" width = "900" height = "300">

In [6]:
class GELU(nn.Module):
    def __init__(self):
        super().__init__()
    
    def forward(self, x):
        return (0.5 * x * (1 + torch.tanh(
            (torch.sqrt(torch.tensor(2/torch.pi))) + (x + 0.044715 * torch.pow(x, 3))
        )))

# **`FeedForward` Network**

In [7]:
GPT_CONFIG_124M = { 
    "vocab_size": 50257, 
    "context_length": 1024, 
    "emb_dim": 768, 
    "n_heads": 12, 
    "n_layers": 12,  # Transformer-Block-Layers
    "drop_rate": 0.1, 
    "qkv_bias": False
}

In [8]:
class FeedForward(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.layers = nn.Sequential(
            nn.Linear(config['emb_dim'], 4 * config['emb_dim']),
            GELU(),
            nn.Linear( 4 * config['emb_dim'], config['emb_dim'])
        )
    
    def forward(self, x):
        return self.layers(x)

In [9]:
ffn = FeedForward(GPT_CONFIG_124M)
x = torch.rand(2, 3, 768)
out = ffn(x)
out.shape

torch.Size([2, 3, 768])

# **Skip-Connections**

In [10]:
class Example_ForSkipConnections(nn.Module):
    def __init__(self, skip, layers):
        super().__init__()
        self.use_skipconnect = skip
        self.layers = nn.ModuleList([
            nn.Sequential(nn.Linear(layers[0], layers[1]), GELU()),
            nn.Sequential(nn.Linear(layers[1], layers[2]), GELU()),
            nn.Sequential(nn.Linear(layers[2], layers[3]), GELU()),
            nn.Sequential(nn.Linear(layers[3], layers[4]), GELU()),
            nn.Sequential(nn.Linear(layers[4], layers[5]), GELU()),
        ])
    
    def forward(self, x):
        for i, layer in enumerate(self.layers):
            layer_output = layer(x)
            if self.use_skipconnect and x.shape == layer_output.shape:
                x += layer_output
            else:
                x = layer(x)
        return x
            

layers = [5, 5, 4, 3, 3, 1]
x = torch.rand((5, 4, 5))
model = Example_ForSkipConnections(True, layers)
model(x)

tensor([[[-0.2961],
         [-0.2931],
         [-0.2961],
         [-0.3034]],

        [[-0.2991],
         [-0.2960],
         [-0.2962],
         [-0.3004]],

        [[-0.2982],
         [-0.3028],
         [-0.2986],
         [-0.2958]],

        [[-0.3007],
         [-0.3000],
         [-0.2984],
         [-0.2957]],

        [[-0.2989],
         [-0.2999],
         [-0.2965],
         [-0.2982]]], grad_fn=<MulBackward0>)

In [11]:
x = torch.tensor([-1., 0., 1.])
model_withSkipConnections = Example_ForSkipConnections(skip=True, layers=[3, 3, 3, 3, 3, 1])
model_withoutSkipConnections = Example_ForSkipConnections(skip=False, layers=[3, 3, 3, 3, 3, 1])

In [12]:
def print_gradient(model, x):
    out = model(x)
    target = torch.tensor([0.])
    criterion = nn.MSELoss()
    loss = criterion(out, target)
    loss.backward()

    for name, params in model.named_parameters():
        params: torch.Tensor
        
        print(f'{name} -> {params.grad.abs().mean().item()}\n')

In [13]:
print_gradient(model_withoutSkipConnections, x)

layers.0.0.weight -> 0.0012244501849636436

layers.0.0.bias -> 0.0018366752192378044

layers.1.0.weight -> 0.0074272919446229935

layers.1.0.bias -> 0.01643095351755619

layers.2.0.weight -> 0.06399044394493103

layers.2.0.bias -> 0.18550486862659454

layers.3.0.weight -> 0.12358677387237549

layers.3.0.bias -> 0.40518125891685486

layers.4.0.weight -> 0.4489266872406006

layers.4.0.bias -> 1.2557212114334106



# **Transformer Block**

In [14]:
import torch 
import torch.nn as nn

In [15]:
class FeedForward(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.layers = nn.Sequential(
            nn.Linear(config['emb_dim'], 4 * config['emb_dim']),
            GELU(),
            nn.Linear( 4 * config['emb_dim'], config['emb_dim'])
        )
    
    def forward(self, x):
        return self.layers(x)

In [16]:
class LayerNorm(nn.Module):
    def __init__(self, emb_dim):
        super().__init__()
        self.scale = nn.Parameter(torch.ones(emb_dim))
        self.shift = nn.Parameter(torch.zeros(emb_dim))
        self.eps = 1e-5
    
    def forward(self, x: torch.Tensor):
        '''x: 3D Tensor'''
        mean = x.mean(-1, keepdim=True)
        var = x.var(-1, keepdim=True, unbiased=False) # unbiased=False => Division by `n`, rather than `n-1`
        std = x.std(-1, keepdim=True)
        x_norm = (x - mean) / torch.sqrt(var + self.eps)
        return (x_norm * self.scale + self.shift)

In [17]:
class MultiheadAttention(nn.Module):
    def __init__(self, d_in, d_out, n_heads, context_length, dropout=0.5, qkv_bias=False):
        super().__init__()
        assert (d_out % n_heads == 0)

        self.d_in = d_in
        self.d_out = d_out
        self.W_q = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_k = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_v = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.d_head = (d_out // n_heads)
        self.n_heads = n_heads
        self.dropout = nn.Dropout(dropout)
        self.out_proj = nn.Linear(d_out, d_out)
        self.register_buffer('mask', torch.ones(context_length, context_length).triu(1).bool())
    
    def forward(self, x):
        '''x: 3D. x => (batch_size, num_tokens, token_embed)'''
        b, n_tokens, token_embed = x.shape
        assert self.d_in == token_embed
        
        Q = self.W_q(x) # (b, n_tokens, d_out)
        K = self.W_k(x)
        V = self.W_v(x)

        Q = Q.view(b, n_tokens, self.n_heads, self.d_head) # (b, n_tokens, n_heads, d_head)
        K = K.view(b, n_tokens, self.n_heads, self.d_head) 
        V = V.view(b, n_tokens, self.n_heads, self.d_head) 

        Q = Q.transpose(1, 2) # (b, n_heads, n_tokens, d_head)
        K = K.transpose(1, 2)
        V = V.transpose(1, 2)

        attn_scores = torch.matmul(Q, K.transpose(-1, -2)) / self.d_head**0.5 #K.shape[-1]**0.5
        attn_scores = attn_scores.masked_fill(self.mask[: n_tokens, : n_tokens], -torch.inf)
        attn_weights = attn_scores.softmax(-1)
        attn_weights = self.dropout(attn_weights)
        context_vectors = attn_weights @ V
        context_vectors = context_vectors.transpose(1, 2)
        context_vectors = context_vectors.contiguous().view(b, n_tokens, self.d_out)
        return self.out_proj(context_vectors)

In [18]:
GPT_CONFIG_124M

{'vocab_size': 50257,
 'context_length': 1024,
 'emb_dim': 768,
 'n_heads': 12,
 'n_layers': 12,
 'drop_rate': 0.1,
 'qkv_bias': False}

In [19]:
class TransformerBlock(nn.Module):
    def __init__(self, cfg):
        super().__init__()
        self.attn = MultiheadAttention(
            d_in=cfg['emb_dim'],    # 768
            d_out=cfg['emb_dim'],   # 768
            n_heads=cfg['n_heads'], # 12
            context_length=cfg['context_length'], # 1024
            dropout=cfg['drop_rate'], # 0.1
            qkv_bias=cfg['qkv_bias']
        )
        self.ff = FeedForward(cfg)
        self.norm_1 = LayerNorm(cfg['emb_dim'])
        self.norm_2 = LayerNorm(cfg['emb_dim'])
        self.dropout = nn.Dropout(cfg['drop_rate'])
    
    def forward(self, x):
        # Part 1:
        shortcut = x
        x = self.norm_1(x)
        x = self.attn(x)
        x = self.dropout(x)
        x = x + shortcut

        # Part 2:
        shortcut = x
        x = self.norm_2(x)
        x = self.ff(x)
        x = self.dropout(x)
        x = x + shortcut
        return x

In [20]:
x = torch.rand(2, 4, 768)
block = TransformerBlock(GPT_CONFIG_124M)
block(x)

tensor([[[ 0.3881,  0.3762,  1.4581,  ...,  0.7932, -1.2334, -0.2798],
         [ 0.6409, -0.6456,  0.6692,  ..., -0.0151, -0.4119, -0.1861],
         [ 0.3886, -0.2022,  0.5368,  ..., -0.2549,  0.8459,  0.9049],
         [ 0.4216, -0.6736,  0.7514,  ...,  0.1523,  0.1979,  0.4250]],

        [[ 1.1130,  0.3264,  0.5296,  ...,  0.9354,  0.2601,  0.7476],
         [ 0.3056,  1.0750,  0.5819,  ...,  0.2299, -0.1598, -0.1544],
         [ 0.6481,  0.4353,  0.4790,  ...,  0.1865,  0.0681, -0.1181],
         [ 0.8531,  0.6861, -0.3147,  ...,  0.2521,  0.5660,  0.6883]]],
       grad_fn=<AddBackward0>)

# **GPT Model**

In [21]:
GPT_CONFIG_124M

{'vocab_size': 50257,
 'context_length': 1024,
 'emb_dim': 768,
 'n_heads': 12,
 'n_layers': 12,
 'drop_rate': 0.1,
 'qkv_bias': False}

In [22]:
class GPTModel(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.tok_emb = nn.Embedding(config['vocab_size'], config['emb_dim'])
        self.pos_emb = nn.Embedding(config['context_length'], config['emb_dim'])
        self.drop_emb = nn.Dropout(config['drop_rate'])
        self.transf_layers = nn.Sequential(*[TransformerBlock(config) for _ in range(config['n_layers'])])
        self.final_norm = LayerNorm(config['emb_dim'])
        self.out_head = nn.Linear(config['emb_dim'], config['vocab_size'], bias=False)
    
    def forward(self, x, show_info=False):
        '''x: 2D Matrix'''
        batch_size, seq_len = x.shape 
        tok_emb = self.tok_emb(x) 
        pos_emb = self.pos_emb(torch.arange(seq_len))
        x = tok_emb + pos_emb
        if show_info:
            print(f'Token-Embed(shape): {tok_emb.shape}')
            print(f'POS-Embed(shape): {pos_emb.shape}')
            print(f'i/p Before TransfBlocks(shape): {x.shape}')
        x = self.drop_emb(x)
        x = self.transf_layers(x)
        x = self.final_norm(x)
        logits = self.out_head(x)
        return logits

In [23]:
GPT_CONFIG_2 = {
    'vocab_size': 50257,
    'context_length': 1024,
    'emb_dim': 768,
    'n_heads': 12,
    'n_layers': 1, #12,
    'drop_rate': 0.1,
    'qkv_bias': False
}

In [24]:
import tiktoken
tokenizer = tiktoken.get_encoding('gpt2')

txt1 = 'how are you?'
txt2 = 'Sky is blue.'
encoding_1 = tokenizer.encode(txt1)
encoding_2 = tokenizer.encode(txt2)

print(encoding_1)
print(encoding_2)

[4919, 389, 345, 30]
[22308, 318, 4171, 13]


In [25]:
print(tokenizer.decode(encoding_1))
print(tokenizer.decode(encoding_2))

how are you?
Sky is blue.


In [26]:
batch = torch.tensor([encoding_1, encoding_2])
batch

tensor([[ 4919,   389,   345,    30],
        [22308,   318,  4171,    13]])

In [27]:
model = GPTModel(GPT_CONFIG_2)
y = model(batch)
y

tensor([[[-0.0539, -1.0485, -0.4322,  ...,  0.5333, -0.4937, -1.1792],
         [-0.5220, -0.0211,  0.6120,  ...,  0.7940, -0.3497, -0.0212],
         [ 0.2401,  0.0296,  0.1745,  ...,  0.6775,  0.6055,  0.2999],
         [ 0.8294,  0.7989, -0.1112,  ..., -1.4778,  0.3479, -0.2939]],

        [[ 0.5718, -0.9469,  0.4932,  ...,  1.3646,  0.4097, -0.3681],
         [ 0.3502,  0.1984,  0.6038,  ...,  0.5483,  0.1599, -0.1177],
         [ 0.3745,  0.2255,  0.0441,  ..., -0.4783,  0.6850, -0.5604],
         [ 0.3580,  1.1778, -0.3584,  ..., -0.5991,  0.3171,  0.2838]]],
       grad_fn=<UnsafeViewBackward0>)

In [28]:
y.shape

torch.Size([2, 4, 50257])

In [29]:
# batch_1_next_token 
nxt_token = torch.argmax(y[0][-1].softmax(-1))
tokenizer.decode([nxt_token.item()])

'ROM'

# **`Experiment`**

In [30]:
model = GPTModel(GPT_CONFIG_2)

In [31]:
t = torch.arange(0, 5)
print(t)
print(t[-1:])
print(t[-2:])
print(t[-10:]) # everything..
# Assuming (10) is the context_length

tensor([0, 1, 2, 3, 4])
tensor([4])
tensor([3, 4])
tensor([0, 1, 2, 3, 4])


In [32]:
txt = "Hello I am "
encoding = torch.tensor(tokenizer.encode(txt))
encoding.unsqueeze_(0)
print(encoding)

tensor([[15496,   314,   716,   220]])


In [33]:
# torch.cat(torch.tensor(([1, 2, 3]), torch.tensor([-5])))
x1 = torch.tensor([1, 2])
x2 = torch.tensor([3, 4])
# torch.cat((x1, x2))


In [34]:
# encoding[1, 0] = -10
encoding

tensor([[15496,   314,   716,   220]])

In [35]:
idx = encoding.argmax(-1)
idx

tensor([0])

In [36]:
# encoding[torch.arange(encoding.shape[0]), idx].unsqueeze(1)

In [37]:
encoding

tensor([[15496,   314,   716,   220]])

In [38]:
t = torch.tensor([[15496,   314,   716,   220]])
model(t)

tensor([[[ 0.1659,  0.9068,  0.4957,  ..., -0.3982, -0.5369, -0.9261],
         [-0.4490, -0.4262, -0.8508,  ..., -0.7754, -0.4209, -1.2455],
         [-0.6343,  0.9729,  0.6555,  ..., -1.6129,  0.4038, -0.2435],
         [-1.1104, -0.8799,  0.7321,  ..., -0.6918,  0.1560, -0.2655]]],
       grad_fn=<UnsafeViewBackward0>)

In [39]:
txt1 = "Hello I am "
# txt2 = "HI world shit?"
encoding1 = tokenizer.encode(txt1)
# encoding2 = tokenizer.encode(txt2)
encoding = torch.tensor([
    encoding1, 
    # encoding2
])
# encoding.unsqueeze_(0)

max_output_token = 15
for i in range(max_output_token):
    with torch.no_grad():
        out = model(encoding)
    logits = out[:, -1, :] 
    probs = logits.softmax(-1)
    idx = probs.argmax(dim=-1) # idx of the token with highest prob. [NOTE: This `idx` acts as the tokenId]
    idx.unsqueeze_(0)
    encoding =  torch.cat((encoding.squeeze(0), idx.squeeze(0)))
    print(tokenizer.decode(encoding.numpy()))
    encoding.unsqueeze_(0)

Hello I am mers
Hello I am mers fastball
Hello I am mers fastball Ud
Hello I am mers fastball Ud measurable
Hello I am mers fastball Ud measurable fascism
Hello I am mers fastball Ud measurable fascism GENERAL
Hello I am mers fastball Ud measurable fascism GENERAL feasible
Hello I am mers fastball Ud measurable fascism GENERAL feasible camer
Hello I am mers fastball Ud measurable fascism GENERAL feasible camer Rw
Hello I am mers fastball Ud measurable fascism GENERAL feasible camer Rw phylogen
Hello I am mers fastball Ud measurable fascism GENERAL feasible camer Rw phylogen detonated
Hello I am mers fastball Ud measurable fascism GENERAL feasible camer Rw phylogen detonated slightest
Hello I am mers fastball Ud measurable fascism GENERAL feasible camer Rw phylogen detonated slightest Nicola
Hello I am mers fastball Ud measurable fascism GENERAL feasible camer Rw phylogen detonated slightest Nicola374
Hello I am mers fastball Ud measurable fascism GENERAL feasible camer Rw phylogen deto

In [42]:
# Outputing the Final Sentence:

txt1 = "Hello I am "
# txt2 = "HI world shit?"
encoding1 = tokenizer.encode(txt1)
# encoding2 = tokenizer.encode(txt2)
encoding = torch.tensor([
    encoding1, 
    # encoding2
])
# encoding.unsqueeze_(0)

max_output_token = 15
for i in range(max_output_token):
    with torch.no_grad():
        out = model(encoding)
    logits = out[:, -1, :] 
    probs = logits.softmax(-1)
    idx = probs.argmax(dim=-1) # idx of the token with highest prob. [NOTE: This `idx` acts as the tokenId]
    idx.unsqueeze_(0)
    encoding =  torch.cat((encoding.squeeze(0), idx.squeeze(0)))

    if (i == max_output_token - 1):
        print(tokenizer.decode(encoding.numpy()))
    encoding.unsqueeze_(0)

Hello I am  Hosp bonuses notoriety pulls priv monster instances camer thedefault chunk held cru Ten compelled


In [54]:
# Putting Text-Generation in a function
def generate_text_simple(model, tokenizerGpt2:tiktoken.Encoding, txt, max_output_token):
    encoding = torch.tensor(tokenizerGpt2.encode(txt))
    encoding.unsqueeze_(0)
    for i in range(max_output_token):
        with torch.no_grad():
            out = model(encoding)
        logits = out[:, -1, :] 
        probs = logits.softmax(-1)
        idx = probs.argmax(dim=-1) # idx of the token with highest prob. [NOTE: This `idx` acts as the tokenId]
        idx.unsqueeze_(0)
        encoding =  torch.cat((encoding.squeeze(0), idx.squeeze(0)))

        if (i == max_output_token - 1):
            print(tokenizer.decode(encoding.numpy()))
        encoding.unsqueeze_(0)


model = GPTModel(GPT_CONFIG_2)
tokenizer = tiktoken.get_encoding('gpt2')
txt = 'hello man how are you doing'
model.eval()
generate_text_simple(model, tokenizer, txt, 10)

hello man how are you doing pinWARE'), stat Europa tells Systems Keystone Fernando graduates
