In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import math

In [2]:
def scaled_dot_product(q, k, v, mask=None):
    # q: 8 x 1024 x 288, k: 8 x 1024 x 288, v: 8 x 1024 x 288, mask 1024 x 1024
    d_k = q.size()[-1] 
    scaled = torch.matmul(q, k.transpose(-1, -2)) / math.sqrt(d_k) # 8 x 1024 x 1024
    if mask is not None:
        scaled += mask # 8 x 1024 x 1024
    attention = F.softmax(scaled, dim=-1) # 8 x 1024 x 1024
    values = torch.matmul(attention, v)
    return values, attention

In [3]:
class GPT2Attention(nn.Module):
    def __init__(self,d_model,num_heads,drop_prob):
        super(GPT2Attention, self).__init__()
        self.d_model = d_model
        self.num_heads = num_heads
        self.head_dim = d_model // num_heads
        self.c_attn = nn.Linear(d_model , 3 * d_model)
        self.c_proj = nn.Linear(d_model, d_model)
        self.attn_dropout = nn.Dropout(p=drop_prob)
    def forward(self, x):
        # Implement the forward pass for GPT2Attention
        sequence_length=1024
        qkv = self.c_attn(x)
        qkv = qkv.reshape(sequence_length, self.num_heads, 3 * self.head_dim)
        qkv = qkv.permute(1,0,2)
        q, k, v = qkv.chunk(3, dim=-1)
        mask = torch.full([1024,1024] , float('-inf'))
        mask = torch.triu(mask, diagonal=1)
        values, attention = scaled_dot_product(q, k, v, mask)
        values = values.reshape(sequence_length, self.num_heads * self.head_dim)
        out = self.c_proj(values)
        
        return out

In [4]:
class GPT2MLP(nn.Module):
    def __init__(self,d_model,hidden,drop_prob):
        super(GPT2MLP, self).__init__()
        self.c_fc = nn.Linear(d_model, hidden)
        self.c_proj = nn.Linear(hidden, d_model)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=drop_prob)

    def forward(self, x):
        x=self.c_fc(x)
        x=self.c_proj(x)
        x=self.relu(x)
        x=self.dropout(x)

In [5]:
class GPT2Block(nn.Module):
    def __init__(self,d_model,num_heads,fnn_hidden,drop_prob):
        super(GPT2Block, self).__init__()

        self.ln_1 = nn.LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        # GPT2SelfAttention
        self.attn = GPT2Attention(d_model,num_heads,drop_prob)
        self.ln_2 = nn.LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        # GPT2 MultiLayerPerceptron
        self.mlp = GPT2MLP(d_model,fnn_hidden,drop_prob)

    def forward(self, x):
        x = self.ln_1(x)
        x = self.attn(x)
        x = self.ln_2(x)
        x = self.mlp(x)
        return x

In [6]:
class GPT2Model(nn.Module):
    def __init__(self, vocab_size=50257, embedding_dim=768,sequence_length=1024, num_blocks=12):
        super(GPT2Model, self).__init__()

        # Embedding layer
        self.wte = nn.Embedding(vocab_size, embedding_dim)
        self.wpe = nn.Embedding(sequence_length, embedding_dim)
        self.drop = nn.Dropout(p=0.1)

        # GPT2 Transformer blocks
        self.h = nn.ModuleList([GPT2Block(d_model=embedding_dim,num_heads=8,fnn_hidden=768*4,drop_prob=0.1) for _ in range(num_blocks)])

        # Final layer normalization
        self.ln_f = nn.LayerNorm((embedding_dim,), eps=1e-05, elementwise_affine=True)

    def forward(self, x):
        # Input x should be a sequence of token indices
        embedded = self.wte(x) + self.wpe(x)
        embedded = self.drop(embedded)
        
        
        # GPT2 Transformer blocks
        for block in self.h:
            embedded = block(embedded)

        # Final layer normalization
        output = self.ln_f(embedded)

        return output


In [7]:
model=GPT2Model()

In [8]:
model.eval()

GPT2Model(
  (wte): Embedding(50257, 768)
  (wpe): Embedding(1024, 768)
  (drop): Dropout(p=0.1, inplace=False)
  (h): ModuleList(
    (0-11): 12 x GPT2Block(
      (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (attn): GPT2Attention(
        (c_attn): Linear(in_features=768, out_features=2304, bias=True)
        (c_proj): Linear(in_features=768, out_features=768, bias=True)
        (attn_dropout): Dropout(p=0.1, inplace=False)
      )
      (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (mlp): GPT2MLP(
        (c_fc): Linear(in_features=768, out_features=3072, bias=True)
        (c_proj): Linear(in_features=3072, out_features=768, bias=True)
        (relu): ReLU()
        (dropout): Dropout(p=0.1, inplace=False)
      )
    )
  )
  (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)

In [9]:
# Function to count the total number of parameters in the model
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)
    

In [10]:
count_parameters(model)

124439808

In [22]:
from transformers import GPT2Model,GPT2Tokenizer

In [12]:
original_model = GPT2Model.from_pretrained('gpt2')

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

**Our model's no. of parameters as well as architecture both are same as that of the original GPT2**

In [18]:
original_model.eval()

GPT2Model(
  (wte): Embedding(50257, 768)
  (wpe): Embedding(1024, 768)
  (drop): Dropout(p=0.1, inplace=False)
  (h): ModuleList(
    (0-11): 12 x GPT2Block(
      (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (attn): GPT2Attention(
        (c_attn): Conv1D()
        (c_proj): Conv1D()
        (attn_dropout): Dropout(p=0.1, inplace=False)
        (resid_dropout): Dropout(p=0.1, inplace=False)
      )
      (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (mlp): GPT2MLP(
        (c_fc): Conv1D()
        (c_proj): Conv1D()
        (act): NewGELUActivation()
        (dropout): Dropout(p=0.1, inplace=False)
      )
    )
  )
  (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)

In [20]:
count_parameters(original_model)

124439808

In [15]:
# Copying the pretrained weights from the original GPT2
checkpoint=original_model.state_dict()

In [17]:
# Loading the pretrained weights
model.load_state_dict(checkpoint)

<All keys matched successfully>

In [23]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
prompt="What can I do?"
input_ids = tokenizer.encode(prompt, return_tensors='pt')


vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [24]:
print(input_ids)

tensor([[2061,  460,  314,  466,   30]])


In [26]:
output=model.forward(input_ids)
print(output)

BaseModelOutputWithPastAndCrossAttentions(last_hidden_state=tensor([[[-0.1643,  0.0957, -0.2844,  ..., -0.1632, -0.0774, -0.2154],
         [-0.4465, -0.0057,  0.1490,  ..., -0.1882,  0.2139, -0.1063],
         [-0.3189, -0.6003, -0.0595,  ..., -0.1304,  0.4610,  0.2858],
         [ 0.2894, -0.7909, -1.8807,  ...,  0.2506,  0.4825,  0.5021],
         [-0.0727, -0.6138,  0.1467,  ...,  0.2517,  0.0465, -0.1001]]],
       grad_fn=<ViewBackward0>), past_key_values=((tensor([[[[-1.3190e+00,  1.8644e+00,  8.9757e-01,  ..., -1.4033e+00,
           -2.3651e-01,  1.2896e+00],
          [-2.0304e+00,  2.3346e+00,  1.9655e+00,  ..., -1.5131e+00,
           -2.1805e+00,  2.2125e+00],
          [-2.6342e+00,  2.3403e+00,  2.8411e+00,  ..., -1.4082e+00,
           -1.9446e+00,  2.0003e+00],
          [-2.3474e+00,  2.0661e+00,  2.0008e+00,  ..., -1.1002e+00,
           -2.8614e+00,  2.3751e+00],
          [-2.5361e+00,  2.9027e+00,  1.9364e+00,  ..., -1.0628e+00,
           -2.3543e+00,  1.9235e+00