In [2]:
# conda activate torch_gpu
import os
import pandas as pd
from dataclasses import dataclass
import torch
import torch.nn as nn
from torch.nn import functional as F
from transformers import GPT2LMHeadModel


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Load the pretrained Hugging Face GPT-2 checkpoint registered under the name "gpt2"
# (the 124M-parameter variant).
model_hf = GPT2LMHeadModel.from_pretrained("gpt2") # 124M

In [4]:
# Display the module tree of the pretrained model (sub-module names and layer sizes).
model_hf

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [None]:
# This class combines the two classes written in gpt_dev_aj (Head and MultiHeadAttention)
# into a single module that processes all heads in one batched computation.
class CausalSelfAttention(nn.Module):
    """Multi-head self-attention with a causal (lower-triangular) mask.

    Sub-module names (`c_attn`, `c_proj`, `attn_dropout`, `resid_dropout`) follow
    the layout printed for the Hugging Face GPT-2 attention block.

    Args:
        config: object exposing `n_embd` and `block_size`. `n_head` and
            `dropout` are read when present and fall back to 12 and 0.0
            respectively when the config does not define them.

    Raises:
        ValueError: if `n_embd` is not divisible by the number of heads, or
            (in `forward`) if the sequence is longer than `block_size`.
    """

    def __init__(self, config):
        super().__init__()
        n_head = getattr(config, "n_head", 12)
        dropout = getattr(config, "dropout", 0.0)
        if config.n_embd % n_head != 0:
            raise ValueError(
                f"n_embd ({config.n_embd}) must be divisible by n_head ({n_head})"
            )

        self.n_head = n_head
        self.n_embd = config.n_embd
        self.block_size = config.block_size

        # One projection yields query, key and value for every head at once.
        self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd)
        # Output projection applied after the heads are concatenated again.
        self.c_proj = nn.Linear(config.n_embd, config.n_embd)
        self.attn_dropout = nn.Dropout(dropout)
        self.resid_dropout = nn.Dropout(dropout)

        # Boolean lower-triangular mask: position t may only attend to positions <= t.
        # Registered as a non-persistent buffer so it follows .to(device) without
        # being written to the state dict.
        causal = torch.tril(
            torch.ones(config.block_size, config.block_size, dtype=torch.bool)
        )
        self.register_buffer(
            "mask",
            causal.view(1, 1, config.block_size, config.block_size),
            persistent=False,
        )

    def forward(self, x):
        """Apply causal self-attention to `x` of shape (B, T, n_embd); returns the same shape."""
        B, T, C = x.size()
        if T > self.block_size:
            raise ValueError(
                f"sequence length {T} exceeds block_size {self.block_size}"
            )
        head_dim = C // self.n_head

        q, k, v = self.c_attn(x).split(self.n_embd, dim=2)
        # (B, T, C) -> (B, n_head, T, head_dim) so each head attends independently.
        q = q.view(B, T, self.n_head, head_dim).transpose(1, 2)
        k = k.view(B, T, self.n_head, head_dim).transpose(1, 2)
        v = v.view(B, T, self.n_head, head_dim).transpose(1, 2)

        # Scaled dot-product scores; future positions are set to -inf before softmax.
        att = (q @ k.transpose(-2, -1)) * (head_dim ** -0.5)
        att = att.masked_fill(~self.mask[:, :, :T, :T], float("-inf"))
        att = F.softmax(att, dim=-1)
        att = self.attn_dropout(att)

        y = att @ v
        # Merge the heads back: (B, n_head, T, head_dim) -> (B, T, C).
        y = y.transpose(1, 2).contiguous().view(B, T, C)
        return self.resid_dropout(self.c_proj(y))

        

In [None]:
# The tanh approximation of GELU is not needed for accuracy, but it is kept here
# because this notebook replicates GPT-2.
# GELU behaves like ReLU for large inputs but is smooth and keeps a non-zero gradient
# for negative inputs, so units do not "die" the way ReLU units can when x < 0.
class MLP(nn.Module):
    """Position-wise feed-forward network: expand to 4x width, GELU, project back, dropout.

    Args:
        config: object exposing `n_embd` (model width) and `dropout`
            (dropout probability applied after the output projection).
    """

    def __init__(self, config):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(config.n_embd, 4 * config.n_embd),
            nn.GELU(approximate="tanh"),
            nn.Linear(4 * config.n_embd, config.n_embd),
            # The config field is lowercase `dropout`; `config.Dropout` does not exist.
            nn.Dropout(config.dropout),
        )

    def forward(self, x):
        """Run the feed-forward stack on `x`; the last dimension must be `n_embd`."""
        return self.net(x)


In [None]:
class Block(nn.Module):
    """A single transformer layer made of two residual sub-layers.

    Each sub-layer normalises its input with its own LayerNorm, runs the
    sub-module (attention first, then the MLP) and adds the result back onto
    the incoming tensor.

    Args:
        config: passed unchanged to `CausalSelfAttention` and `MLP`; its
            `n_embd` attribute sizes both LayerNorms.
    """

    def __init__(self, config):
        super().__init__()
        width = config.n_embd
        self.attn = CausalSelfAttention(config)
        self.mlp = MLP(config)
        self.ln1 = nn.LayerNorm(width)
        self.ln2 = nn.LayerNorm(width)

    def forward(self, x):
        """Apply the attention and MLP residual branches in order and return the result."""
        branches = ((self.ln1, self.attn), (self.ln2, self.mlp))
        for norm, sublayer in branches:
            x = x + sublayer(norm(x))
        return x

In [None]:
@dataclass
class GPTConfig:
    """Hyperparameters for the GPT model built in this notebook.

    Every attribute is an annotated dataclass field, so each one can be
    overridden through the constructor, e.g. `GPTConfig(nlayer=6)`.
    """

    # Number of token ids; sizes the token embedding and the lm_head output.
    vocab_size: int = 50257
    # Maximum sequence length; sizes the positional embedding.
    block_size: int = 1024
    # Width of the embeddings and of every transformer layer.
    n_embd: int = 768
    # Dropout probability (a float, not an int).
    dropout: float = 0.1
    # Learning rate (a float, not an int).
    lr: float = 3e-04
    # Number of stacked transformer blocks. The annotation is required:
    # an un-annotated name is a plain class attribute, not a dataclass field.
    nlayer: int = 12
    # Number of attention heads; must divide n_embd evenly.
    n_head: int = 12

    
class GPT(nn.Module):
    """Container for the GPT-2 style model: embeddings, block stack, final norm, LM head.

    Only the sub-modules are constructed here; this class does not yet define
    a `forward` method.

    Args:
        config: hyperparameter object exposing `vocab_size`, `block_size`,
            `n_embd` and `nlayer` (and whatever `Block` reads from it).
            When omitted, a default `GPTConfig()` is used.
    """

    def __init__(self, config=None):
        super().__init__()
        if config is None:
            config = GPTConfig()
        self.config = config

        self.transformer = nn.ModuleDict(dict(
            # Token embedding table: one n_embd-sized vector per vocabulary id.
            wte=nn.Embedding(config.vocab_size, config.n_embd),
            # Positional embedding table: one vector per position up to block_size.
            wpe=nn.Embedding(config.block_size, config.n_embd),
            # Stack of `nlayer` transformer blocks applied one after another.
            h=nn.Sequential(*[Block(config) for _ in range(config.nlayer)]),
            # LayerNorm stored after the block stack.
            ln_norm=nn.LayerNorm(config.n_embd),
        ))
        # Projects hidden states to vocabulary logits (no bias).
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)




