In [1]:
import torch
# from transformers import GPT2Config, GPT2Model
from transformers import GPT2Tokenizer, GPT2LMHeadModel

# config = GPT2Config()
# model = GPT2Model(config)
tokenizer = GPT2Tokenizer.from_pretrained("openai-community/gpt2")
model = GPT2LMHeadModel.from_pretrained("openai-community/gpt2")

print(model)

  from .autonotebook import tqdm as notebook_tqdm


GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)


In [2]:
from torchviz import make_dot

input_ids = torch.randint(0, 1000, (1, 128))
out = model(input_ids)

logits = out.logits

net_vis = make_dot(logits, params=dict(model.named_parameters()), show_attrs=False, show_saved=False)
net_vis.format = 'svg'
net_vis.render('gpt2')


'gpt2.svg'

In [3]:
# import hiddenlayer as hl
# g = hl.build_graph(model, input_ids)
# g.save("gpt2_model_hl", format="png")

# save the model as onnx
input_names = ["input_ids"]
output_names = ["logits"]
torch.onnx.export(model, input_ids, "gpt2.onnx", input_names=input_names, output_names=output_names)

  if input_shape[-1] > 1 or self.sliding_window is not None:
  if past_key_values_length > 0:


In [4]:
prompt = "GPT2 is a model developed by OpenAI."
input_ids = tokenizer.encode(prompt, return_tensors="pt")
attention_mask = torch.ones(input_ids.shape, dtype=torch.long)

gen_tokens = model.generate(
    input_ids,
    do_sample=True,
    temperature=0.9,
    max_length=100,
    attention_mask=attention_mask,
    pad_token_id=tokenizer.pad_token_id,
)
gen_text = tokenizer.batch_decode(gen_tokens)[0]

print(gen_text)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


GPT2 is a model developed by OpenAI. It allows us to do more with less at a higher cost through better user experience while giving you more control over how you use your smart phone.

If you are like most Android users, you are familiar with the Android operating system as it is generally used as a browser or application on the Internet, but there is a vast amount of information available about it that can only be viewed through your phone. So, it has a big impact on


In [5]:
layer_info = []
hooks = []


In [6]:
def hook_pre_fn(module, input):
    try:
        input_shape = tuple(tuple(x.size()) for x in input)
    except AttributeError:
        input_shape = None

    layer_info.append(('pre', module.__class__.__name__, input_shape))


def hook_fn(module, input, output):
    try:
        output_shape = tuple(tuple(x.size()) for x in output)
    except AttributeError:
        output_shape = None

    layer_info.append(('post', module.__class__.__name__, output_shape))



In [7]:
for hook in hooks:
    hook.remove()

for layer in model.named_modules():
    hooks.append(layer[1].register_forward_pre_hook(hook_pre_fn))
    hooks.append(layer[1].register_forward_hook(hook_fn))

input_ids = torch.randint(0, 1000, (1, 128))
out = model(input_ids)

for hook in hooks:
    hook.remove()

ln_cnt = 0
linear_ln_cnt = 0
conv1d_ln_cnt = 0
linear_dropout_ln_cnt = 0
conv1d_dropout_ln_cnt = 0

indent = 0

for i, layer in enumerate(layer_info):
    if layer[0] == 'pre':
        print('  ' * indent, '< ', layer[1], '<- ', layer[2], '>')
        indent += 1
    elif layer[0] == 'post':
        indent -= 1
        print('  ' * indent, '</', layer[1], ' ->', layer[2], '>')

pure_layers = []
for i in range(len(layer_info) - 1):
    if layer_info[i][0] == 'pre' and layer_info[i + 1][0] == 'post' and layer_info[i][1] == layer_info[i + 1][1]:
        pure_layers.append(layer_info[i][1])

print('Pure layers:', pure_layers)

for i, layer in enumerate(pure_layers):
    if layer == 'LayerNorm':
        ln_cnt += 1
        if i - 1 > 0:
            if pure_layers[i - 1] == 'Linear':
                linear_ln_cnt += 1
            elif i - 1 > 0 and pure_layers[i - 1] == 'Conv1D':
                conv1d_ln_cnt += 1

no_dropout_layers = [pure_layers[i] for i in range(len(pure_layers)) if pure_layers[i] != 'Dropout']

for i, layer in enumerate(no_dropout_layers):
    if layer == 'LayerNorm':
        if i - 1 > 0:
            if no_dropout_layers[i - 1] == 'Linear':
                linear_dropout_ln_cnt += 1
            elif i - 1 > 0 and no_dropout_layers[i - 1] == 'Conv1D':
                conv1d_dropout_ln_cnt += 1

print('LayerNorm:', ln_cnt)
print('Linear--LayerNorm:', linear_ln_cnt)
print('Conv1d--LayerNorm:', conv1d_ln_cnt)
print('Linear--LayerNorm (Ignore Dropout):', linear_dropout_ln_cnt)
print('Conv1d--LayerNorm (Ignore Dropout):', conv1d_dropout_ln_cnt)

   <  GPT2LMHeadModel <-  ((1, 128),) >
     <  GPT2Model <-  ((1, 128),) >
       <  Embedding <-  ((1, 128),) >
       </ Embedding  -> ((128, 768),) >
       <  Embedding <-  ((1, 128),) >
       </ Embedding  -> ((128, 768),) >
       <  Dropout <-  ((1, 128, 768),) >
       </ Dropout  -> ((128, 768),) >
       <  GPT2Block <-  ((1, 128, 768),) >
         <  LayerNorm <-  ((1, 128, 768),) >
         </ LayerNorm  -> ((128, 768),) >
         <  GPT2SdpaAttention <-  ((1, 128, 768),) >
           <  Conv1D <-  ((1, 128, 768),) >
           </ Conv1D  -> ((128, 2304),) >
           <  Conv1D <-  ((1, 128, 768),) >
           </ Conv1D  -> ((128, 768),) >
           <  Dropout <-  ((1, 128, 768),) >
           </ Dropout  -> ((128, 768),) >
         </ GPT2SdpaAttention  -> None >
         <  LayerNorm <-  ((1, 128, 768),) >
         </ LayerNorm  -> ((128, 768),) >
         <  GPT2MLP <-  ((1, 128, 768),) >
           <  Conv1D <-  ((1, 128, 768),) >
           </ Conv1D  -> ((128, 3