In [6]:
#importing libraries
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import GPT2Config

class RotaryPositionalEmbedding(nn.Module):
    """Rotary positional embedding (RoPE) for a ``(batch, seq, d_model)`` tensor.

    At position ``p`` every feature pair ``(i, i + d_model // 2)`` is rotated
    by the angle ``p * base ** (-2 * i / d_model)``.  Consequently:

    * the output has the same shape and the same per-token norm as the input;
    * position 0 is returned unchanged (rotation by angle 0);
    * the dot product of two rotated vectors depends only on the offset
      between their positions.

    The sin/cos tables are deterministic, so they are stored as
    non-persistent buffers: the module has no trainable parameters and
    contributes no entries to the state dict.

    Args:
        d_model: Size of the last dimension of the input; must be positive
            and even, because features are rotated in pairs.
        max_len: Longest sequence length covered by the precomputed tables.
        base: Base of the geometric progression of rotation frequencies.

    Raises:
        ValueError: If ``d_model`` is not a positive even number.
    """

    def __init__(self, d_model, max_len=512, base=10000.0):
        super().__init__()
        if d_model <= 0 or d_model % 2 != 0:
            raise ValueError(
                f"d_model must be a positive even number, got {d_model}"
            )
        self.dim = d_model
        self.max_len = max_len

        # One frequency per rotated pair: base ** (-2i / d_model), i = 0 .. d_model/2 - 1.
        pair_index = torch.arange(0, d_model, 2, dtype=torch.float32)
        inv_freq = 1.0 / (base ** (pair_index / d_model))
        positions = torch.arange(max_len, dtype=torch.float32)
        angles = torch.outer(positions, inv_freq)  # (max_len, d_model // 2)
        # Both members of a pair (i and i + d_model // 2) share the same angle.
        angles = torch.cat([angles, angles], dim=-1)  # (max_len, d_model)

        # Leading singleton axis lets the tables broadcast over the batch.
        self.register_buffer("cos_table", angles.cos().unsqueeze(0), persistent=False)
        self.register_buffer("sin_table", angles.sin().unsqueeze(0), persistent=False)

    @staticmethod
    def _rotate_half(x):
        """Map the two halves ``(x1, x2)`` of the last axis to ``(-x2, x1)``."""
        first, second = x.chunk(2, dim=-1)
        return torch.cat([-second, first], dim=-1)

    def forward(self, x):
        """Rotate ``x`` according to each token's position.

        Args:
            x: Tensor of shape ``(batch, seq, d_model)``.

        Returns:
            Tensor with the same shape and dtype as ``x``.

        Raises:
            ValueError: If ``x`` is not 3-D, its last dimension differs from
                ``d_model``, or its sequence length exceeds ``max_len``.
        """
        if x.dim() != 3 or x.size(-1) != self.dim:
            raise ValueError(
                f"expected input of shape (batch, seq, {self.dim}), "
                f"got {tuple(x.shape)}"
            )
        seq_len = x.size(1)
        if seq_len > self.max_len:
            raise ValueError(
                f"sequence length {seq_len} exceeds max_len {self.max_len}"
            )

        cos = self.cos_table[:, :seq_len].to(dtype=x.dtype)
        sin = self.sin_table[:, :seq_len].to(dtype=x.dtype)
        # 2-D rotation of each pair: (a, b) -> (a*cos - b*sin, b*cos + a*sin).
        return x * cos + self._rotate_half(x) * sin

# Replace original positional embedding with rotary positional embedding in GPT-2
class GPT2WithRotaryPositionalEmbedding(nn.Module):
    """GPT-2 skeleton whose position module is a ``RotaryPositionalEmbedding``.

    Only the embedding-stage submodules are created; the class defines no
    ``forward`` and no transformer blocks.

    Args:
        config: Configuration object. ``vocab_size``, ``n_embd`` and
            ``max_position_embeddings`` are read from it, and the object
            itself is kept on ``self.config``.
    """

    def __init__(self, config):
        super().__init__()
        self.config = config
        # Token lookup table: one row of width n_embd per vocabulary entry.
        self.embeddings = nn.Embedding(
            num_embeddings=config.vocab_size,
            embedding_dim=config.n_embd,
        )
        # Position module sized by the model width and the configured position limit.
        self.position_embeddings = RotaryPositionalEmbedding(
            d_model=config.n_embd,
            max_len=config.max_position_embeddings,
        )
        # ... other layers of GPT-2 ...

# Load the configuration published under the "gpt2" identifier and build the model from it
config = GPT2Config.from_pretrained("gpt2")
model_rotary = GPT2WithRotaryPositionalEmbedding(config=config)


config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

In [8]:
import torch
import torch.nn as nn

class GroupQueryAttentionLayer(nn.Module):
    """Placeholder for a Group Query Attention layer.

    The constructor only performs ``nn.Module`` initialisation: ``config`` is
    accepted but not stored, no parameters or submodules are registered, and
    no ``forward`` is defined.
    """

    def __init__(self, config):
        # TODO: implement the Group Query Attention mechanism
        super().__init__()

# Modify GPT-2 to use Group Query Attention
class GPT2WithGroupQueryAttention(nn.Module):
    """GPT-2 skeleton that holds a ``GroupQueryAttentionLayer``.

    Creates token and position embedding tables plus one attention layer;
    the class defines no ``forward`` and no further transformer blocks.

    Args:
        config: Configuration object. ``vocab_size``, ``n_embd`` and
            ``max_position_embeddings`` are read from it, and the object
            itself is kept on ``self.config`` and passed to the layer.
    """

    def __init__(self, config):
        super().__init__()
        self.config = config
        # Token lookup table: one row of width n_embd per vocabulary entry.
        self.embeddings = nn.Embedding(
            num_embeddings=config.vocab_size,
            embedding_dim=config.n_embd,
        )
        # Position lookup table: one row of width n_embd per position index.
        self.position_embeddings = nn.Embedding(
            num_embeddings=config.max_position_embeddings,
            embedding_dim=config.n_embd,
        )
        self.layer = GroupQueryAttentionLayer(config=config)
        # ... other layers of GPT-2 ...

# Load the configuration published under the "gpt2" identifier and build the model from it
config_group_query = GPT2Config.from_pretrained("gpt2")
model_group_query = GPT2WithGroupQueryAttention(config=config_group_query)

In [9]:
import torch
import torch.nn as nn

class SlidingWindowAttentionLayer(nn.Module):
    """Placeholder for a Sliding Window Attention layer.

    The constructor only performs ``nn.Module`` initialisation: ``config`` is
    accepted but not stored, no parameters or submodules are registered, and
    no ``forward`` is defined.
    """

    def __init__(self, config):
        # TODO: implement the Sliding Window Attention mechanism
        super().__init__()

# Modify GPT-2 to use Sliding Window Attention
class GPT2WithSlidingWindowAttention(nn.Module):
    """GPT-2 skeleton that holds a ``SlidingWindowAttentionLayer``.

    Creates token and position embedding tables plus one attention layer;
    the class defines no ``forward`` and no further transformer blocks.

    Args:
        config: Configuration object. ``vocab_size``, ``n_embd`` and
            ``max_position_embeddings`` are read from it, and the object
            itself is kept on ``self.config`` and passed to the layer.
    """

    def __init__(self, config):
        super().__init__()
        self.config = config
        # Token lookup table: one row of width n_embd per vocabulary entry.
        self.embeddings = nn.Embedding(
            num_embeddings=config.vocab_size,
            embedding_dim=config.n_embd,
        )
        # Position lookup table: one row of width n_embd per position index.
        self.position_embeddings = nn.Embedding(
            num_embeddings=config.max_position_embeddings,
            embedding_dim=config.n_embd,
        )
        self.layer = SlidingWindowAttentionLayer(config=config)
        # ... other layers of GPT-2 ...

# Load the configuration published under the "gpt2" identifier and build the model from it
config_sliding_window = GPT2Config.from_pretrained("gpt2")
model_sliding_window = GPT2WithSlidingWindowAttention(config=config_sliding_window)
