In [20]:
from transformers import GPT2LMHeadModel

model = GPT2LMHeadModel.from_pretrained("gpt2")

print(model)

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)


In [2]:
list(model.state_dict().keys())


['transformer.wte.weight',
 'transformer.wpe.weight',
 'transformer.h.0.ln_1.weight',
 'transformer.h.0.ln_1.bias',
 'transformer.h.0.attn.c_attn.weight',
 'transformer.h.0.attn.c_attn.bias',
 'transformer.h.0.attn.c_proj.weight',
 'transformer.h.0.attn.c_proj.bias',
 'transformer.h.0.ln_2.weight',
 'transformer.h.0.ln_2.bias',
 'transformer.h.0.mlp.c_fc.weight',
 'transformer.h.0.mlp.c_fc.bias',
 'transformer.h.0.mlp.c_proj.weight',
 'transformer.h.0.mlp.c_proj.bias',
 'transformer.h.1.ln_1.weight',
 'transformer.h.1.ln_1.bias',
 'transformer.h.1.attn.c_attn.weight',
 'transformer.h.1.attn.c_attn.bias',
 'transformer.h.1.attn.c_proj.weight',
 'transformer.h.1.attn.c_proj.bias',
 'transformer.h.1.ln_2.weight',
 'transformer.h.1.ln_2.bias',
 'transformer.h.1.mlp.c_fc.weight',
 'transformer.h.1.mlp.c_fc.bias',
 'transformer.h.1.mlp.c_proj.weight',
 'transformer.h.1.mlp.c_proj.bias',
 'transformer.h.2.ln_1.weight',
 'transformer.h.2.ln_1.bias',
 'transformer.h.2.attn.c_attn.weight',
 'tra

In [3]:
from transformers import pipeline, set_seed

generator = pipeline("text-generation", model="gpt2")
set_seed(42)

generator("Hello, I'm GPT-2,", max_length=30, num_return_sequences=5)

Device set to use mps:0
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': "Hello, I'm GPT-2, and my name is Lutz. I came to Rokkot from Hungary, I have a son"},
 {'generated_text': "Hello, I'm GPT-2, but the second episode, which is quite an interesting one, I've also written up to go with that"},
 {'generated_text': "Hello, I'm GPT-2, and I'm just a normal guy. I am a little bit like a bachelorette.\n"},
 {'generated_text': "Hello, I'm GPT-2, so thank you so much!\n\nThe guys here at Team GPs are the best. We take"},
 {'generated_text': "Hello, I'm GPT-2, so I'm no one really worried about it. It's just not relevant as much as I'd like"}]

In [5]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import math

# GPT Implementation

### Config

In [5]:
from dataclasses import dataclass

@dataclass
class GPT2Config:
    block_size: int = 1024
    vocab_size: int = 50257
    n_layer: int = 12
    n_head: int = 12
    n_embd: int = 768

![](attn.png)

# Attention Head

In [6]:
B, T, C = 2, 8, 32
x = torch.randn(B, T, C)

head_size = 16
key_getter = nn.Linear(C, head_size, bias=False)
query_getter = nn.Linear(C, head_size, bias=False)

k = key_getter(x) # what does this token represent?
q = query_getter(x) # what is this token looking for?

print(k.shape, q.shape) # (B, T, head_size)

torch.Size([2, 8, 16]) torch.Size([2, 8, 16])


In [7]:
# (B, T, 16) @ (B, 16, T) -> (B, T, T)
masked_attn_scores = q @ k.transpose(-2, -1)
print(masked_attn_scores.shape) # (B, T, T)

torch.Size([2, 8, 8])


In [8]:
masked_attn_scores *= 1 / math.sqrt(head_size)
print(masked_attn_scores)

tensor([[[-0.3352,  0.2766, -0.3269,  0.4146, -0.3710, -0.1537, -0.0427,
          -0.1284],
         [ 0.2032,  0.0540, -0.1558,  0.3877, -0.2162,  1.1867,  0.3357,
           0.4529],
         [ 0.1822, -0.0837, -0.1061,  0.0885, -0.1683, -0.2100,  0.3359,
          -0.1270],
         [ 0.0239,  0.2039, -0.4211,  0.0917, -0.2047,  0.7772, -0.2167,
           0.2692],
         [-0.0526, -0.1334, -0.2102, -0.0587, -0.0825, -0.0722,  0.0053,
           0.0309],
         [-0.6124, -0.1651, -0.4587,  0.0884, -0.3484,  1.1364, -0.1815,
           0.3092],
         [-0.2513, -0.1293, -0.2516, -0.0489,  0.0567,  0.6245, -0.2788,
           0.5562],
         [ 0.0560,  0.2676,  0.1502, -0.0554,  0.2297, -0.0465,  0.1581,
          -0.1356]],

        [[-0.3393,  0.2480,  0.1879,  0.0455, -0.0734,  0.1622, -0.1689,
           0.3859],
         [ 0.0393,  0.2520, -0.4839, -0.3853,  0.1411,  0.0875, -0.0131,
           0.0977],
         [-0.1508, -0.1411,  0.2021,  0.1708, -0.2341, -0.0830, -0.0

In [9]:
trill = torch.tril(torch.ones(T, T))
print(trill)

tensor([[1., 0., 0., 0., 0., 0., 0., 0.],
        [1., 1., 0., 0., 0., 0., 0., 0.],
        [1., 1., 1., 0., 0., 0., 0., 0.],
        [1., 1., 1., 1., 0., 0., 0., 0.],
        [1., 1., 1., 1., 1., 0., 0., 0.],
        [1., 1., 1., 1., 1., 1., 0., 0.],
        [1., 1., 1., 1., 1., 1., 1., 0.],
        [1., 1., 1., 1., 1., 1., 1., 1.]])


In [10]:
masked_attn_scores = masked_attn_scores.masked_fill(trill == 0, float('-inf'))
print(masked_attn_scores)

tensor([[[-0.3352,    -inf,    -inf,    -inf,    -inf,    -inf,    -inf,
             -inf],
         [ 0.2032,  0.0540,    -inf,    -inf,    -inf,    -inf,    -inf,
             -inf],
         [ 0.1822, -0.0837, -0.1061,    -inf,    -inf,    -inf,    -inf,
             -inf],
         [ 0.0239,  0.2039, -0.4211,  0.0917,    -inf,    -inf,    -inf,
             -inf],
         [-0.0526, -0.1334, -0.2102, -0.0587, -0.0825,    -inf,    -inf,
             -inf],
         [-0.6124, -0.1651, -0.4587,  0.0884, -0.3484,  1.1364,    -inf,
             -inf],
         [-0.2513, -0.1293, -0.2516, -0.0489,  0.0567,  0.6245, -0.2788,
             -inf],
         [ 0.0560,  0.2676,  0.1502, -0.0554,  0.2297, -0.0465,  0.1581,
          -0.1356]],

        [[-0.3393,    -inf,    -inf,    -inf,    -inf,    -inf,    -inf,
             -inf],
         [ 0.0393,  0.2520,    -inf,    -inf,    -inf,    -inf,    -inf,
             -inf],
         [-0.1508, -0.1411,  0.2021,    -inf,    -inf,    -inf,    -

In [11]:
attn_scores = F.softmax(masked_attn_scores, dim=-1)
print(attn_scores)

tensor([[[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.5372, 0.4628, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.3974, 0.3046, 0.2979, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.2559, 0.3063, 0.1640, 0.2738, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.2109, 0.1946, 0.1802, 0.2096, 0.2047, 0.0000, 0.0000, 0.0000],
         [0.0782, 0.1222, 0.0911, 0.1575, 0.1018, 0.4492, 0.0000, 0.0000],
         [0.1100, 0.1243, 0.1100, 0.1347, 0.1497, 0.2642, 0.1070, 0.0000],
         [0.1211, 0.1497, 0.1331, 0.1084, 0.1441, 0.1093, 0.1342, 0.1000]],

        [[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.4470, 0.5530, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.2913, 0.2941, 0.4146, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.2169, 0.2894, 0.2301, 0.2636, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.2429, 0.1597, 0.2325, 0.1476, 0.2173, 0.0000, 0.0000, 0.0000],
         [0.1409, 0.204

In [12]:
value_getter = nn.Linear(C, head_size, bias=False)
v = value_getter(x)
print(v.shape) # (B, T, head_size)

torch.Size([2, 8, 16])


In [13]:
out = attn_scores @ v
print(out.shape) # (B, T, head_size)


torch.Size([2, 8, 16])


# Testing our GPT

In [3]:
from model_correction import GPT2

In [4]:
gpt = GPT2.from_pretrained()
gpt.eval()
gpt.to('mps')

loading pretrained weights for model gpt2


In [2]:
hf_gpt = GPT2LMHeadModel.from_pretrained("gpt2")
hf_gpt.eval()
hf_gpt.to('mps')

NameError: name 'GPT2LMHeadModel' is not defined

In [2]:
hf_or_ours = "ours" # "ours" or "hf"

In [3]:
num_sequences = 5
max_length = 30

In [14]:
import tiktoken

tokenizer = tiktoken.get_encoding("gpt2")
tokens = tokenizer.encode("Hello, I'm GPT-2,")
tokens = torch.tensor(tokens, dtype=torch.long).unsqueeze(0).repeat(num_sequences, 1)
x = tokens.to('mps')

In [15]:
torch.manual_seed(42)
torch.mps.manual_seed(42)

while x.size(1) < max_length:
    with torch.no_grad():
        logits = gpt(x) if hf_or_ours == "ours" else hf_gpt(x)[0]
        logits = logits[:, -1, :]
        probs = F.softmax(logits, dim=-1)
        # topk sampling 50 by default on HF
        topk_probs, topk_indices = torch.topk(probs, 50, dim=-1)
        ix = torch.multinomial(topk_probs, num_samples=1)
        # gather the tokens at the specific indices
        xcol = torch.gather(topk_indices, dim=-1, index=ix)
        x = torch.cat((x, xcol), dim=1)

In [18]:
for i in range(num_sequences):
    tokens = x[i, :max_length].tolist()
    decoded = tokenizer.decode(tokens)
    print(">", decoded)

> Hello, I'm GPT-2, , , , , , , , , , , , , , , , , , , , , ,
> Hello, I'm GPT-2,,,,,,,,,,,,,,,,,,,,,,
> Hello, I'm GPT-2,,,,,,,,,,,,,,,,,,

"}"
> Hello, I'm GPT-2,,,,,,,,,,,,,,,,,,,,,,
> Hello, I'm GPT-2,,,,,,,,,,

5 5 0 0 0 0 0 0 0 0
