In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import math

In [2]:
def scaled_dot_product(q, k, v, mask=None):
    # q: 8 x 1024 x 288, k: 8 x 1024 x 288, v: 8 x 1024 x 288, mask 1024 x 1024
    d_k = q.size()[-1] 
    scaled = torch.matmul(q, k.transpose(-1, -2)) / math.sqrt(d_k) # 8 x 1024 x 1024
    if mask is not None:
        scaled += mask # 8 x 1024 x 1024
    attention = F.softmax(scaled, dim=-1) # 8 x 1024 x 1024
    values = torch.matmul(attention, v)
    return values, attention

In [3]:
class GPT2Attention(nn.Module):
    def __init__(self,d_model,num_heads,drop_prob):
        super(GPT2Attention, self).__init__()
        self.d_model = d_model
        self.num_heads = num_heads
        self.head_dim = d_model // num_heads
        self.c_attn = nn.Linear(d_model , 3 * d_model)
        self.c_proj = nn.Linear(d_model, d_model)
        self.attn_dropout = nn.Dropout(p=drop_prob)
    def forward(self, x):
        # Implement the forward pass for GPT2Attention
        sequence_length=1024
        qkv = self.c_attn(x)
        qkv = qkv.reshape(sequence_length, self.num_heads, 3 * self.head_dim)
        qkv = qkv.permute(1,0,2)
        q, k, v = qkv.chunk(3, dim=-1)
        mask = torch.full([1024,1024] , float('-inf'))
        mask = torch.triu(mask, diagonal=1)
        values, attention = scaled_dot_product(q, k, v, mask)
        values = values.reshape(sequence_length, self.num_heads * self.head_dim)
        out = self.c_proj(values)
        
        return out

In [4]:
class GPT2MLP(nn.Module):
    def __init__(self,d_model,hidden,drop_prob):
        super(GPT2MLP, self).__init__()
        self.c_fc = nn.Linear(d_model, hidden)
        self.c_proj = nn.Linear(hidden, d_model)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=drop_prob)

    def forward(self, x):
        x=self.c_fc(x)
        x=self.c_proj(x)
        x=self.relu(x)
        x=self.dropout(x)

In [5]:
class GPT2Block(nn.Module):
    def __init__(self,d_model,num_heads,fnn_hidden,drop_prob):
        super(GPT2Block, self).__init__()

        self.ln_1 = nn.LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        # GPT2SelfAttention
        self.attn = GPT2Attention(d_model,num_heads,drop_prob)
        self.ln_2 = nn.LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        # GPT2 MultiLayerPerceptron
        self.mlp = GPT2MLP(d_model,fnn_hidden,drop_prob)

    def forward(self, x):
        x = self.ln_1(x)
        x = self.attn(x)
        x = self.ln_2(x)
        x = self.mlp(x)
        return x

In [6]:
class RotaryEmbedding(nn.Module):
    def __init__(self, d_model):
        super(RotaryEmbedding, self).__init__()
        self.d_model = d_model
        self.cos_embedding = nn.Embedding(d_model // 2, 1)
        self.sin_embedding = nn.Embedding(d_model // 2, 1)

    def forward(self, positions):
        angles = positions / torch.pow(10000, torch.arange(0, self.d_model, 2).float() / self.d_model)
        angles[:, 0::2] = torch.sin(angles[:, 0::2])
        angles[:, 1::2] = torch.cos(angles[:, 1::2])
        return angles.unsqueeze(0)

In [12]:
class GPT2WithRotaryEmbedding(nn.Module):
    def __init__(self, vocab_size=50257, embedding_dim=768,sequence_length=1024, num_blocks=12):
        super(GPT2WithRotaryEmbedding, self).__init__()

        # Embedding layer
        self.embedding = nn.Embedding(sequence_length,768)
        self.rotary_pos_emb = RotaryEmbedding(768)
        self.drop = nn.Dropout(p=0.1)

        # GPT2 Transformer blocks
        self.h = nn.ModuleList([GPT2Block(d_model=embedding_dim,num_heads=8,fnn_hidden=768*4,drop_prob=0.1) for _ in range(num_blocks)])

        # Final layer normalization
        self.ln_f = nn.LayerNorm((embedding_dim,), eps=1e-05, elementwise_affine=True)

    def forward(self, x):
        # Input x should be a sequence of token indices
        positions = torch.arange(x.size(1)).unsqueeze(0)
        x = self.embedding(x) + self.rotary_pos_emb(positions)
        
        
        # GPT2 Transformer blocks
        for block in self.h:
            embedded = block(embedded)

        # Final layer normalization
        output = self.ln_f(embedded)

        return output


In [13]:
model=GPT2WithRotaryEmbedding()

In [14]:
model.eval()

GPT2WithRotaryEmbedding(
  (embedding): Embedding(1024, 768)
  (rotary_pos_emb): RotaryEmbedding(
    (cos_embedding): Embedding(384, 1)
    (sin_embedding): Embedding(384, 1)
  )
  (drop): Dropout(p=0.1, inplace=False)
  (h): ModuleList(
    (0-11): 12 x GPT2Block(
      (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (attn): GPT2Attention(
        (c_attn): Linear(in_features=768, out_features=2304, bias=True)
        (c_proj): Linear(in_features=768, out_features=768, bias=True)
        (attn_dropout): Dropout(p=0.1, inplace=False)
      )
      (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (mlp): GPT2MLP(
        (c_fc): Linear(in_features=768, out_features=3072, bias=True)
        (c_proj): Linear(in_features=3072, out_features=768, bias=True)
        (relu): ReLU()
        (dropout): Dropout(p=0.1, inplace=False)
      )
    )
  )
  (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)

In [15]:
# Function to count the total number of parameters in the model
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)
    

In [16]:
count_parameters(model)

85843200