In [None]:
import torch
import torch.nn as nn

# Embedding Layer
class GPT2Embeddings(nn.Module):
    """Sum of learned token embeddings and learned absolute position embeddings.

    Args:
        vocab_size: Number of rows in the token embedding table.
        max_seq_len: Number of rows in the position embedding table, i.e. the
            longest sequence this module can embed.
        embed_dim: Width of every embedding vector.
    """

    def __init__(self, vocab_size, max_seq_len, embed_dim):
        super(GPT2Embeddings, self).__init__()
        self.token_embeddings = nn.Embedding(vocab_size, embed_dim)
        self.position_embeddings = nn.Embedding(max_seq_len, embed_dim)

    def forward(self, x):
        """Embed token ids and add the embedding of each token's position.

        Args:
            x: Integer tensor of token ids whose last axis is the sequence
                axis, e.g. ``(batch, seq_len)``.

        Returns:
            Tensor shaped like ``x`` with a trailing ``embed_dim`` axis.

        Raises:
            IndexError: If the sequence is longer than the position table.
        """
        seq_len = x.size(-1)
        max_seq_len = self.position_embeddings.num_embeddings
        # Fail early with a readable message instead of letting the embedding
        # lookup hit an out-of-range index.
        if seq_len > max_seq_len:
            raise IndexError(
                f"sequence length {seq_len} exceeds max_seq_len {max_seq_len}"
            )

        position_ids = torch.arange(seq_len, dtype=torch.long, device=x.device)

        token_embeds = self.token_embeddings(x)
        # Shape (seq_len, embed_dim); broadcasts over the leading axes of
        # token_embeds, so no explicit expand is needed.
        position_embeds = self.position_embeddings(position_ids)

        return token_embeds + position_embeds

# Scaled Dot-Product Attention
class ScaledDotProductAttention(nn.Module):
    """Multi-head scaled dot-product self-attention with a fused QKV projection.

    Args:
        embed_dim: Width of the input and output features.
        num_heads: Number of attention heads; must divide ``embed_dim``.
        causal: When True (default), position ``i`` may only attend to
            positions ``<= i``, as an autoregressive language model requires.
            Pass False for bidirectional attention.
    """

    def __init__(self, embed_dim, num_heads, causal=True):
        super(ScaledDotProductAttention, self).__init__()
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.head_dim = embed_dim // num_heads
        assert self.head_dim * num_heads == embed_dim, "embed_dim must be divisible by num_heads"
        self.causal = causal

        self.scale = self.head_dim ** -0.5
        self.qkv_linear = nn.Linear(embed_dim, embed_dim * 3)
        self.out_linear = nn.Linear(embed_dim, embed_dim)

    def forward(self, x):
        """Apply self-attention over the sequence axis.

        Args:
            x: Float tensor of shape ``(batch, seq_len, embed_dim)``.

        Returns:
            Tensor of shape ``(batch, seq_len, embed_dim)``.
        """
        batch_size, seq_len, _ = x.size()

        # (batch, seq, heads, 3 * head_dim): each head owns a contiguous
        # [q | k | v] slice of the fused projection.
        qkv = self.qkv_linear(x).view(batch_size, seq_len, self.num_heads, 3 * self.head_dim)
        # Put heads ahead of the sequence axis so the matmuls below attend
        # across positions within each head, not across heads within a token.
        qkv = qkv.transpose(1, 2)
        q, k, v = qkv.chunk(3, dim=-1)  # each (batch, heads, seq, head_dim)

        scores = torch.matmul(q, k.transpose(-2, -1)) * self.scale  # (batch, heads, seq, seq)
        if self.causal:
            # True strictly above the diagonal marks "future" key positions.
            future = torch.triu(
                torch.ones(seq_len, seq_len, device=x.device), diagonal=1
            ).bool()
            # The diagonal is never masked, so every row keeps a finite score.
            scores = scores.masked_fill(future, float("-inf"))
        attn = torch.softmax(scores, dim=-1)
        context = torch.matmul(attn, v)  # (batch, heads, seq, head_dim)

        # Back to (batch, seq, heads, head_dim), then merge the heads.
        context = context.transpose(1, 2).contiguous().view(batch_size, seq_len, self.embed_dim)
        return self.out_linear(context)

# Multi-Head Attention
class MultiHeadAttention(nn.Module):
    """Thin wrapper that owns one ScaledDotProductAttention and forwards to it.

    Args:
        embed_dim: Feature width handed to the wrapped attention module.
        num_heads: Head count handed to the wrapped attention module.
    """

    def __init__(self, embed_dim, num_heads):
        super().__init__()
        self.scaled_dot_product_attention = ScaledDotProductAttention(
            embed_dim=embed_dim,
            num_heads=num_heads,
        )

    def forward(self, x):
        """Return the wrapped attention module's output for ``x``."""
        attended = self.scaled_dot_product_attention(x)
        return attended

# Transformer Block
class TransformerBlock(nn.Module):
    """Self-attention followed by a two-layer GELU MLP, each with a residual
    connection and a LayerNorm applied after the residual add.

    Args:
        embed_dim: Feature width of the block's input and output.
        num_heads: Head count passed to the attention sub-layer.
        ff_hidden_dim: Width of the MLP's hidden layer.
    """

    def __init__(self, embed_dim, num_heads, ff_hidden_dim):
        super().__init__()
        self.attention = MultiHeadAttention(embed_dim, num_heads)
        self.norm1 = nn.LayerNorm(embed_dim)
        self.norm2 = nn.LayerNorm(embed_dim)

        # Expand -> GELU -> project back down to embed_dim.
        expand = nn.Linear(embed_dim, ff_hidden_dim)
        project = nn.Linear(ff_hidden_dim, embed_dim)
        self.feed_forward = nn.Sequential(expand, nn.GELU(), project)

    def forward(self, x):
        """Run both sub-layers on ``x`` and return the normalized result."""
        # NOTE(review): normalization happens after each residual add (post-LN);
        # GPT-2 as published normalizes before each sub-layer - confirm intent.
        x = self.norm1(x + self.attention(x))
        x = self.norm2(x + self.feed_forward(x))
        return x

# GPT-2 Model
class GPT2(nn.Module):
    """Embeddings, a stack of TransformerBlocks, and a bias-free vocabulary head.

    Args:
        vocab_size: Size of the token vocabulary (embedding rows and logit width).
        max_seq_len: Size of the position embedding table.
        embed_dim: Feature width used throughout the model.
        num_heads: Attention heads per block.
        ff_hidden_dim: Hidden width of each block's MLP.
        num_layers: Number of TransformerBlocks to stack.
    """

    def __init__(self, vocab_size, max_seq_len, embed_dim, num_heads, ff_hidden_dim, num_layers):
        super().__init__()
        self.embeddings = GPT2Embeddings(vocab_size, max_seq_len, embed_dim)

        self.transformer_blocks = nn.ModuleList()
        for _ in range(num_layers):
            self.transformer_blocks.append(
                TransformerBlock(embed_dim, num_heads, ff_hidden_dim)
            )

        self.lm_head = nn.Linear(embed_dim, vocab_size, bias=False)

    def forward(self, x):
        """Map token ids ``x`` to per-position vocabulary logits."""
        hidden = self.embeddings(x)
        for layer in self.transformer_blocks:
            hidden = layer(hidden)
        return self.lm_head(hidden)

# Hyperparameters for the GPT-2 small configuration
vocab_size = 50257          # rows in the token embedding table / logit width
max_seq_len = 1024          # rows in the position embedding table
embed_dim = 768
num_heads = 12
ff_hidden_dim = 4 * embed_dim  # 3072
num_layers = 12

# Build the model from the settings above
gpt2_model = GPT2(
    vocab_size=vocab_size,
    max_seq_len=max_seq_len,
    embed_dim=embed_dim,
    num_heads=num_heads,
    ff_hidden_dim=ff_hidden_dim,
    num_layers=num_layers,
)

In [None]:
pip install transformers




In [None]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

def load_model_and_generate_text(prompt, max_length=50):
    """Load the pretrained "gpt2" checkpoint and continue ``prompt``.

    Args:
        prompt: Text the model should continue.
        max_length: Value forwarded to ``generate`` as ``max_length``.

    Returns:
        The decoded first generated sequence, with special tokens removed.
    """
    # Load pre-trained model and tokenizer
    tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
    model = GPT2LMHeadModel.from_pretrained("gpt2")

    # Calling the tokenizer (rather than .encode) also returns the attention
    # mask that generate() asks for.
    encoded = tokenizer(prompt, return_tensors="pt")

    # Pass the mask and an explicit pad token so generate() does not have to
    # guess them; the eos token is the value it would otherwise fall back to.
    outputs = model.generate(
        encoded["input_ids"],
        attention_mask=encoded["attention_mask"],
        pad_token_id=tokenizer.eos_token_id,
        max_length=max_length,
        num_return_sequences=1,
    )
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

    return generated_text

# Text the model is asked to continue
prompt = "In a distant future, humanity has discovered"

# Generate a continuation with the default length cap and show it
generated_text = load_model_and_generate_text(prompt=prompt)
print(generated_text)


vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In a distant future, humanity has discovered a way to make the world a better place.

The world is a better place.

The world is a better place.

The world is a better place.

The world is
