In [None]:
# !pip install transformers

In [None]:
import torch
import torch.nn as nn

from transformers import GPT2Tokenizer, GPT2Config

In [None]:
# Build a GPT2Config with its default arguments. The bare `config` expression on
# the last line makes the notebook display the resulting settings as cell output.
config = GPT2Config()
config

GPT2Config {
  "activation_function": "gelu_new",
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "transformers_version": "4.27.1",
  "use_cache": true,
  "vocab_size": 50257
}

In [None]:
# Class to implement the self-attention. Inherit from nn.Module.
class GPT2Attention(nn.Module):
  """Causal multi-head self-attention.

  Args:
    config: Object exposing ``n_positions`` (maximum sequence length),
      ``n_embd`` (embedding width) and ``n_head`` (number of heads).
      ``attn_pdrop`` and ``resid_pdrop`` are used as dropout rates when
      present; each falls back to 0.1 otherwise.

  Raises:
    ValueError: If ``n_embd`` is not divisible by ``n_head``.
  """

  def __init__(self, config):
    super().__init__()

    max_positions = config.n_positions # Max number of tokens.
    self.embed_dim = config.n_embd # Embedding width.
    self.num_heads = config.n_head # Number of attention heads.
    if self.embed_dim % self.num_heads != 0:
      raise ValueError(
        f"n_embd ({self.embed_dim}) must be divisible by n_head ({self.num_heads})"
      )
    self.head_dim = self.embed_dim // self.num_heads
    self.split_size = self.embed_dim

    # Lower-triangular boolean mask (True = may attend), shaped (1, 1, P, P) so it
    # broadcasts over the batch and head dimensions. It is registered as a buffer so
    # that it follows the module across devices; persistent=False keeps it out of
    # the state_dict.
    causal_mask = torch.tril(torch.ones(max_positions, max_positions)).bool()
    self.register_buffer(
      "mask",
      causal_mask.view(1, 1, max_positions, max_positions),
      persistent = False,
    )

    self.c_attn = nn.Linear(self.embed_dim, 3 * self.embed_dim) # Joint query/key/value projection.
    self.c_proj = nn.Linear(self.embed_dim, self.embed_dim) # Output projection.
    self.attn_dropout = nn.Dropout(getattr(config, "attn_pdrop", 0.1))
    self.resid_dropout = nn.Dropout(getattr(config, "resid_pdrop", 0.1))

  def _attn(self, query, key, value):
    """Scaled dot-product attention with a causal mask.

    Args:
      query, key, value: Tensors of shape (B, num_heads, T, head_dim), as produced
        by ``forward``.

    Returns:
      Tensor of shape (B, num_heads, T, head_dim).
    """
    attn_weights = torch.matmul(query, key.transpose(-1, -2))
    attn_weights = attn_weights / (query.size(-1) ** 0.5) # Scale by sqrt(head_dim).

    T = query.size(-2)
    causal_mask = self.mask[:, :, :T, :T]
    # Fill future positions with the most negative finite value of the working dtype
    # so that softmax assigns them zero weight.
    mask_value = torch.finfo(attn_weights.dtype).min
    attn_weights = attn_weights.masked_fill(~causal_mask, mask_value)

    attn_weights = torch.softmax(attn_weights, dim = -1)
    attn_weights = self.attn_dropout(attn_weights) # Drop random attention weights.

    return torch.matmul(attn_weights, value)

  # Forward function that performs multi-head self-attention.
  def forward(self, x):
    """Apply causal multi-head self-attention.

    Args:
      x: Tensor of shape (B, T, C) with C equal to the embedding width.

    Returns:
      Tensor of shape (B, T, C).

    Raises:
      ValueError: If T exceeds the maximum number of positions of the mask.
    """
    B, T, C = x.size() # B = batch size, T = sequence length, C = input features.
    if T > self.mask.size(-1):
      raise ValueError(
        f"Sequence length {T} exceeds the maximum of {self.mask.size(-1)} positions"
      )

    # One linear layer produces query, key and value; split them along the feature axis.
    query, key, value = self.c_attn(x).split(self.split_size, dim = -1)

    # (B, T, C) -> (B, num_heads, T, head_dim) so every head attends independently.
    query = query.view(B, T, self.num_heads, self.head_dim).transpose(1, 2)
    key = key.view(B, T, self.num_heads, self.head_dim).transpose(1, 2)
    value = value.view(B, T, self.num_heads, self.head_dim).transpose(1, 2)

    attn_output = self._attn(query, key, value)
    # transpose() returns a non-contiguous view, so make it contiguous before merging heads.
    attn_output = attn_output.transpose(1, 2).contiguous().view(B, T, C)
    attn_output = self.c_proj(attn_output)

    return self.resid_dropout(attn_output)

In [None]:
# Class to implement the feed-forward network. Inherit from nn.Module.
class GPT2MLP(nn.Module):
  """Position-wise feed-forward network: Linear -> GELU -> Linear -> Dropout.

  Args:
    config: Object exposing ``n_embd`` (embedding width). ``n_inner`` sets the
      hidden width when present and not None (otherwise 4 * n_embd), and
      ``resid_pdrop`` sets the dropout rate when present (otherwise 0.1).
  """

  def __init__(self, config):
    super().__init__()
    embed_dim = config.n_embd

    inner_dim = getattr(config, "n_inner", None)
    if inner_dim is None:
      inner_dim = 4 * embed_dim
    dropout_rate = getattr(config, "resid_pdrop", 0.1)

    # NOTE(review): nn.GELU() is the exact (erf) form, whereas the config printed
    # earlier in this notebook lists activation_function "gelu_new" - confirm
    # whether the tanh approximation is wanted here.
    self.mlp = nn.Sequential(nn.Linear(embed_dim, inner_dim),
                             nn.GELU(),
                             nn.Linear(inner_dim, embed_dim),
                             nn.Dropout(dropout_rate)
                             )

  def forward(self, x):
    """Run ``x`` (last dimension = embedding width) through the network; the output has the same shape."""
    return self.mlp(x)

In [None]:
# Class to define the block (attention layer + mlp layer). Inherit from nn.Module.
class GPT2Block(nn.Module):
  """One transformer block: pre-LayerNorm attention and pre-LayerNorm MLP, each with a residual connection.

  Args:
    config: Object exposing ``n_embd``, plus whatever ``GPT2Attention`` and
      ``GPT2MLP`` read from it. ``layer_norm_epsilon`` is used as the LayerNorm
      epsilon when present (otherwise 1e-5).
  """

  def __init__(self, config):
    super().__init__()
    embed_dim = config.n_embd
    eps = getattr(config, "layer_norm_epsilon", 1e-5)
    self.ln_1 = nn.LayerNorm(embed_dim, eps = eps)
    self.ln_2 = nn.LayerNorm(embed_dim, eps = eps)
    self.attn = GPT2Attention(config)
    self.mlp = GPT2MLP(config)

  def forward(self, hidden_states):
    """Apply the block to ``hidden_states`` and return a tensor of the same shape."""
    # Attention sub-layer: normalize, attend, add the residual.
    residual = hidden_states
    hidden_states = self.ln_1(hidden_states)
    attn_output = self.attn(hidden_states)
    hidden_states = attn_output + residual

    # Feed-forward sub-layer: normalize with ln_2, run the MLP, add the residual.
    residual = hidden_states
    hidden_states = self.ln_2(hidden_states)
    feed_forward_hidden_states = self.mlp(hidden_states)
    hidden_states = residual + feed_forward_hidden_states

    return hidden_states

In [None]:
# Class to define the GPT2 model.
class GPT2Model(nn.Module):
  """Token + position embeddings followed by a stack of GPT2Block layers and a final LayerNorm.

  Args:
    config: Object exposing ``n_embd``, ``vocab_size``, ``n_positions`` and
      ``n_layer``, plus whatever ``GPT2Block`` reads from it. ``embd_pdrop``
      (default 0.1) and ``layer_norm_epsilon`` (default 1e-5) are used when present.
  """

  def __init__(self, config):
    super().__init__()
    self.embed_dim = config.n_embd
    self.vocab_size = config.vocab_size # Rows of the token table = vocabulary size.

    self.wte = nn.Embedding(self.vocab_size, self.embed_dim) # Token embedding layer.
    self.wpe = nn.Embedding(config.n_positions, self.embed_dim) # Positional embedding layer.

    self.drop = nn.Dropout(getattr(config, "embd_pdrop", 0.1))
    # Stack of identical blocks, with the number of blocks = config.n_layer.
    self.blocks = nn.ModuleList([GPT2Block(config) for _ in range(config.n_layer)])
    self.ln_final = nn.LayerNorm(self.embed_dim, eps = getattr(config, "layer_norm_epsilon", 1e-5))

  def forward(self, input_ids = None, position_ids = None):
    """Compute the final hidden states.

    Args:
      input_ids: Integer tensor of token ids whose last dimension is the sequence length.
      position_ids: Optional integer tensor of position indices, broadcastable
        against ``input_ids``. Defaults to 0..T-1.

    Returns:
      Tensor of shape ``input_ids.shape + (n_embd,)``.

    Raises:
      ValueError: If ``input_ids`` is None, or if default positions are needed
        and the sequence is longer than the position table.
    """
    if input_ids is None:
      raise ValueError("input_ids must be provided")

    if position_ids is None:
      seq_len = input_ids.size(-1)
      if seq_len > self.wpe.num_embeddings:
        raise ValueError(
          f"Sequence length {seq_len} exceeds the maximum of {self.wpe.num_embeddings} positions"
        )
      # Positions 0..T-1 on the same device as the inputs; the leading dimension of
      # size 1 lets them broadcast across the batch.
      position_ids = torch.arange(seq_len, dtype = torch.long, device = input_ids.device)
      position_ids = position_ids.unsqueeze(0)

    input_embeds = self.wte(input_ids)
    position_embeds = self.wpe(position_ids)
    hidden_states = self.drop(input_embeds + position_embeds)

    # Iterate through all the blocks.
    for block in self.blocks:
      hidden_states = block(hidden_states)

    return self.ln_final(hidden_states)

In [None]:
# Class to define the language model head for text generation. Inherit from nn.Module.
class GPT2LMHead(nn.Module):
  """GPT2Model with a linear language-modeling head and an optional next-token loss.

  Args:
    config: Object exposing ``n_embd`` and ``vocab_size``, plus whatever
      ``GPT2Model`` reads from it.
    ignore_index: Label id excluded from the loss. When None, ``config.pad_token_id``
      is used if it is set; otherwise -100, the ``nn.CrossEntropyLoss`` default.
  """

  def __init__(self, config, ignore_index = None):
    super().__init__()

    self.transformer = GPT2Model(config) # Transformer stack that produces the hidden states.
    self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias = False) # Map hidden states to vocabulary-sized logits.

    # The ignored label must be an integer id; resolve it from the arguments rather
    # than from a notebook-level global.
    if ignore_index is None:
      ignore_index = getattr(config, "pad_token_id", None)
    if ignore_index is None:
      ignore_index = -100
    self.xe = nn.CrossEntropyLoss(ignore_index = ignore_index) # Loss used when labels are given.

  def forward(self, input_ids = None, position_ids = None, labels = None):
    """Compute logits and, when ``labels`` is given, the next-token cross-entropy loss.

    Args:
      input_ids: Integer tensor of token ids, shape (B, T).
      position_ids: Optional position indices forwarded to the transformer.
      labels: Optional integer tensor of shape (B, T). Position t of the logits is
        scored against ``labels[:, t + 1]``.

    Returns:
      Tuple ``(lm_logits, loss)``; ``loss`` is None when ``labels`` is None.
    """
    hidden_states = self.transformer(input_ids, position_ids = position_ids)
    lm_logits = self.lm_head(hidden_states)

    loss = None
    if labels is not None:
      # Drop the last logit and the first label so each position predicts the next token.
      shift_logits = lm_logits[:, :-1, :]
      shift_labels = labels[:, 1:]
      # The slices are non-contiguous, so flatten with reshape() instead of view().
      loss = self.xe(shift_logits.reshape(-1, shift_logits.size(-1)), shift_labels.reshape(-1))

    return lm_logits, loss