In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.28.1-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m46.7 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m43.8 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m11.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.14.1 tokenizers-0.13.3 transformers-4.28.1


## 1. Text generation example

In [None]:
from transformers import pipeline

In [None]:
# Name the checkpoint and revision explicitly instead of relying on the
# pipeline's implicit default (the library warns about the implicit form).
text_generation = pipeline("text-generation", model="gpt2", revision="6c0e608")

No model was supplied, defaulted to gpt2 and revision 6c0e608 (https://huggingface.co/gpt2).
Using a pipeline without specifying a model name and revision in production is not recommended.


Downloading (…)lve/main/config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/548M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [None]:
prefix_text = "The world is"

In [None]:
# Continue the prompt with sampling disabled and a 50-token length cap.
# The pipeline hands back a list of candidates; keep the first one.
generation_candidates = text_generation(prefix_text, max_length=50, do_sample=False)
generated_text = generation_candidates[0]
print(generated_text['generated_text'])

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


The world is a better place if you're a good person.

I'm not saying that you should be a bad person. I'm saying that you should be a good person.

I'm not saying that you should be a bad


## 2. Model extraction

In [None]:
import io
import os
import torch
from tqdm.notebook import tqdm
from torch.utils.data import Dataset, DataLoader
#from ml_things import plot_dict, plot_confusion_matrix, fix_text
from sklearn.metrics import classification_report, accuracy_score
from transformers import (set_seed,
                          TrainingArguments,
                          Trainer,
                          GPT2Config,
                          GPT2Tokenizer,
                          AdamW,
                          get_linear_schedule_with_warmup,
                          GPT2ForSequenceClassification)

# Set seed for reproducibility.
set_seed(123)

In [None]:
# Run on the GPU when PyTorch can see one, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Either the name of an already pretrained transformers model, or the path of
# your own model saved on local disk.
model_name_or_path = 'gpt2'

# Mapping used to convert string labels into integer class ids.
labels_ids = dict(neg=0, pos=1)

# Number of labels used in training; it determines the size of the
# classification head.
n_labels = len(labels_ids)

In [None]:
# Build the model configuration, sizing the classification head via `num_labels`.
print('Loading configuration...')
model_config = GPT2Config.from_pretrained(pretrained_model_name_or_path=model_name_or_path, num_labels=n_labels)

# Load the tokenizer that matches the checkpoint.
print('Loading tokenizer...')
tokenizer = GPT2Tokenizer.from_pretrained(pretrained_model_name_or_path=model_name_or_path)
# Pad on the left side of each sequence.
tokenizer.padding_side = "left"
# The tokenizer has no dedicated PAD token, so reuse EOS (id 50256) for padding.
tokenizer.pad_token = tokenizer.eos_token


# Load the pretrained weights into the sequence-classification architecture.
print('Loading model...')
model = GPT2ForSequenceClassification.from_pretrained(pretrained_model_name_or_path=model_name_or_path, config=model_config)

# Keep the embedding matrix in sync with the tokenizer's vocabulary size.
# PAD reuses an existing token, so this is expected to leave the size unchanged.
model.resize_token_embeddings(len(tokenizer))

# Tell the model which token id is padding (same id as EOS).
model.config.pad_token_id = model.config.eos_token_id

# Move the model to the selected device.
model.to(device)
print('Model loaded to `%s`'%device)

Loading configuraiton...
Loading tokenizer...
Loading model...


Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model loaded to `cpu`


In [None]:
model_config

GPT2Config {
  "_name_or_path": "gpt2",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "pad_token_id": 50256,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "transformers_version": "4.28.1",
  "use_cache": true,
  "vocab_size": 50257
}

In [None]:
tokenizer

GPT2Tokenizer(name_or_path='gpt2', vocab_size=50257, model_max_length=1024, is_fast=False, padding_side='left', truncation_side='right', special_tokens={'bos_token': AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'eos_token': AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'unk_token': AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'pad_token': '<|endoftext|>'}, clean_up_tokenization_spaces=True)

In [None]:
model

GPT2ForSequenceClassification(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (score): Linear(in_features=768, out_features=2, bias=False)
)

## 3. Features of a given text

In [None]:
from transformers import GPT2Model

In [None]:
# Load the bare GPT-2 model and run one forward pass to extract features of `text`.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2Model.from_pretrained('gpt2')
text = "Replace me by any text you'd like."
encoded_input = tokenizer(text, return_tensors='pt')
# Inference only, so skip autograd bookkeeping. Attention maps are returned only
# when requested; without `output_attentions=True`, `output.attentions` is None.
with torch.no_grad():
    output = model(**encoded_input, output_attentions=True)

In [None]:
tokenizer

GPT2Tokenizer(name_or_path='gpt2', vocab_size=50257, model_max_length=1024, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'eos_token': AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'unk_token': AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True)}, clean_up_tokenization_spaces=True)

In [None]:
print('Words in vocabulary: ', tokenizer.vocab_size)

Words in vocabulary:  50257


In [None]:
vocabulary = tokenizer.get_vocab()
vocabulary['Hi']

17250

In [None]:
# encode text
text_ids = tokenizer.encode(text, return_tensors='pt')
text_ids

tensor([[3041, 5372,  502,  416,  597, 2420,  345, 1549,  588,   13]])

In [None]:
tokenizer.tokenize(text)

['Re', 'place', 'Ġme', 'Ġby', 'Ġany', 'Ġtext', 'Ġyou', "'d", 'Ġlike', '.']

In [None]:
tokenizer.tokenize(text)[0]

'Re'

In [None]:
vocabulary[tokenizer.tokenize(text)[0]]

3041

In [None]:
encoded_input

{'input_ids': tensor([[3041, 5372,  502,  416,  597, 2420,  345, 1549,  588,   13]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [None]:
output.attentions

In [None]:
output.last_hidden_state.shape

torch.Size([1, 10, 768])

In [None]:
# text generation
#text_generation = pipeline("text-generation")
#generated_text= text_generation(text, max_length=100, do_sample=False)[0]
#print(generated_text['generated_text'])

In [None]:
# `num_parameters` is a method: call it to get the parameter count instead of
# echoing the bound-method repr.
model.num_parameters()

<bound method ModuleUtilsMixin.num_parameters of GPT2Model(
  (wte): Embedding(50257, 768)
  (wpe): Embedding(1024, 768)
  (drop): Dropout(p=0.1, inplace=False)
  (h): ModuleList(
    (0-11): 12 x GPT2Block(
      (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (attn): GPT2Attention(
        (c_attn): Conv1D()
        (c_proj): Conv1D()
        (attn_dropout): Dropout(p=0.1, inplace=False)
        (resid_dropout): Dropout(p=0.1, inplace=False)
      )
      (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (mlp): GPT2MLP(
        (c_fc): Conv1D()
        (c_proj): Conv1D()
        (act): NewGELUActivation()
        (dropout): Dropout(p=0.1, inplace=False)
      )
    )
  )
  (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)>

## 4. Manually getting the hidden layers

In [None]:
def get_module_names(module):
    """Return the names of the direct submodules registered on ``module``.

    The names are read from the object's ``_modules`` mapping, in registration
    order.

    Args:
        module: Object expected to expose a ``_modules`` mapping
            (e.g. the GPT-2 model inspected in this notebook).

    Returns:
        list: The submodule names, or an empty list when ``module`` has no
        usable ``_modules`` mapping.
    """
    try:
        # The attribute is `_modules` (plural). The previous `_module` spelling
        # always raised AttributeError, which the bare `except` hid.
        return list(module._modules.keys())
    except AttributeError:
        # Best-effort contract: objects without submodules yield an empty list.
        return []

In [None]:
# Reload the tokenizer and the bare GPT-2 model, encode the sample sentence
# and run a single forward pass over it.
text = "Replace me by any text you'd like."
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2Model.from_pretrained('gpt2')
encoded_input = tokenizer(text, return_tensors='pt')
# The encoding holds exactly `input_ids` and `attention_mask`; pass them by name.
output = model(input_ids=encoded_input['input_ids'],
               attention_mask=encoded_input['attention_mask'])

In [None]:
encoded_input

{'input_ids': tensor([[3041, 5372,  502,  416,  597, 2420,  345, 1549,  588,   13]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [None]:
model

GPT2Model(
  (wte): Embedding(50257, 768)
  (wpe): Embedding(1024, 768)
  (drop): Dropout(p=0.1, inplace=False)
  (h): ModuleList(
    (0-11): 12 x GPT2Block(
      (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (attn): GPT2Attention(
        (c_attn): Conv1D()
        (c_proj): Conv1D()
        (attn_dropout): Dropout(p=0.1, inplace=False)
        (resid_dropout): Dropout(p=0.1, inplace=False)
      )
      (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (mlp): GPT2MLP(
        (c_fc): Conv1D()
        (c_proj): Conv1D()
        (act): NewGELUActivation()
        (dropout): Dropout(p=0.1, inplace=False)
      )
    )
  )
  (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)

In [None]:
# embedding
print(type(model.wte))
print(model.wte.weight.shape)

<class 'torch.nn.modules.sparse.Embedding'>
torch.Size([50257, 768])


In [None]:
encoded_input

{'input_ids': tensor([[3041, 5372,  502,  416,  597, 2420,  345, 1549,  588,   13]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [None]:
# first GPT2 block
model.h[0]

GPT2Block(
  (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  (attn): GPT2Attention(
    (c_attn): Conv1D()
    (c_proj): Conv1D()
    (attn_dropout): Dropout(p=0.1, inplace=False)
    (resid_dropout): Dropout(p=0.1, inplace=False)
  )
  (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  (mlp): GPT2MLP(
    (c_fc): Conv1D()
    (c_proj): Conv1D()
    (act): NewGELUActivation()
    (dropout): Dropout(p=0.1, inplace=False)
  )
)

In [None]:
model.h[1]

GPT2Block(
  (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  (attn): GPT2Attention(
    (c_attn): Conv1D()
    (c_proj): Conv1D()
    (attn_dropout): Dropout(p=0.1, inplace=False)
    (resid_dropout): Dropout(p=0.1, inplace=False)
  )
  (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  (mlp): GPT2MLP(
    (c_fc): Conv1D()
    (c_proj): Conv1D()
    (act): NewGELUActivation()
    (dropout): Dropout(p=0.1, inplace=False)
  )
)

In [None]:
model.h[1].ln_1

LayerNorm((768,), eps=1e-05, elementwise_affine=True)

In [None]:
model.h[1].attn

GPT2Attention(
  (c_attn): Conv1D()
  (c_proj): Conv1D()
  (attn_dropout): Dropout(p=0.1, inplace=False)
  (resid_dropout): Dropout(p=0.1, inplace=False)
)

In [None]:
model.h[1].attn.c_attn.weight.shape

torch.Size([768, 2304])

In [None]:
model.h[1].attn.c_proj.weight

Parameter containing:
tensor([[-0.0971, -0.0016,  0.1122,  ...,  0.0392,  0.1169,  0.1239],
        [ 0.1013,  0.0531, -0.0848,  ..., -0.0437, -0.0922, -0.0505],
        [-0.0225,  0.0412,  0.0546,  ...,  0.0936, -0.0534, -0.0834],
        ...,
        [ 0.0392, -0.0349, -0.1370,  ..., -0.0106, -0.0397, -0.0157],
        [ 0.0724, -0.0834, -0.0412,  ..., -0.0476, -0.0742, -0.0440],
        [-0.0368,  0.1253,  0.0460,  ..., -0.0314,  0.0441, -0.0880]],
       requires_grad=True)

In [None]:
model.parameters

<bound method Module.parameters of GPT2Model(
  (wte): Embedding(50257, 768)
  (wpe): Embedding(1024, 768)
  (drop): Dropout(p=0.1, inplace=False)
  (h): ModuleList(
    (0-11): 12 x GPT2Block(
      (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (attn): GPT2Attention(
        (c_attn): Conv1D()
        (c_proj): Conv1D()
        (attn_dropout): Dropout(p=0.1, inplace=False)
        (resid_dropout): Dropout(p=0.1, inplace=False)
      )
      (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (mlp): GPT2MLP(
        (c_fc): Conv1D()
        (c_proj): Conv1D()
        (act): NewGELUActivation()
        (dropout): Dropout(p=0.1, inplace=False)
      )
    )
  )
  (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)>

In [None]:
# List the top-level submodules through the public `named_children()` iterator
# rather than reaching into the private `_modules` dict.
module_names = [child_name for child_name, child_module in model.named_children()]
module_names

['wte', 'wpe', 'drop', 'h', 'ln_f']

In [None]:
model.h[0].attn.c_attn.weight.shape

torch.Size([768, 2304])

In [None]:
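# Fit an alpha-stable distribution to every GPT-2 weight tensor.
# NOTE: everything below is wrapped in a triple-quoted string, so it is NOT
# executed -- running this cell only echoes the string back as its output.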

"""
import numpy as np
import pandas as pd
from scipy.stats import levy_stable, norm, distributions
from tqdm import tqdm

pconv = lambda alpha, beta, mu, sigma: (alpha, beta, mu - sigma * beta * np.tan(np.pi * alpha / 2.0), sigma)
col_names = ["module_name", "alpha", "beta", "mu", "sigma", "shape"]
#df = pd.DataFrame(np.zeros((1,len(col_names))))
#df = df.astype('object')
#df.columns = col_names
df = pd.DataFrame(columns=col_names)
#l1_module_names = list(model._module.keys())
#for midx, module_name in enumerate(l1_module_names):
ridx = 0
for midx, module_name in tqdm(enumerate(['wte', 'wpe', 'h', 'ln_f'])):
    if module_name != "h":
        #l2_module_names = model._modules[module_name]
        weights = model._modules[module_name].weight.detach().numpy()
        params = pconv(*levy_stable._fitstart(weights.flatten()))
        df.loc[ridx,:] = [module_name] + list(params) + [weights.shape]
        ridx += 1
    else:
        for gpt_idx in range(len(model.h)):
            # f"GPT2Block {gpt_idx}"
            #l2_module_names = list(model.h[gpt_idx].keys())
            #for midx, l2_module_names in enumerate(l2_module_names):
            weights = model.h[gpt_idx].ln_1.weight.detach().numpy()
            params = pconv(*levy_stable._fitstart(weights.flatten()))
            df.loc[ridx,:] = [f"GPT2Block {gpt_idx} ln_1"] + list(params) + [weights.shape]
            ridx += 1
            weights = model.h[gpt_idx].attn.c_attn.weight.detach().numpy()
            params = pconv(*levy_stable._fitstart(weights.flatten()))
            df.loc[ridx,:] = [f"GPT2Block {gpt_idx} attn c_attn"] + list(params) + [weights.shape]
            ridx += 1
            weights = model.h[gpt_idx].attn.c_proj.weight.detach().numpy()
            params = pconv(*levy_stable._fitstart(weights.flatten()))
            df.loc[ridx,:] = [f"GPT2Block {gpt_idx} attn c_proj"] + list(params) + [weights.shape]
            ridx += 1
            weights = model.h[gpt_idx].ln_2.weight.detach().numpy()
            params = pconv(*levy_stable._fitstart(weights.flatten()))
            df.loc[ridx,:] = [f"GPT2Block {gpt_idx} ln_2"] + list(params) + [weights.shape]
            ridx += 1
            weights = model.h[gpt_idx].mlp.c_fc.weight.detach().numpy()
            params = pconv(*levy_stable._fitstart(weights.flatten()))
            df.loc[ridx,:] = [f"GPT2Block {gpt_idx} mlp c_fc"] + list(params) + [weights.shape]
            ridx += 1
            weights = model.h[gpt_idx].mlp.c_proj.weight.detach().numpy()
            params = pconv(*levy_stable._fitstart(weights.flatten()))
            df.loc[ridx,:] = [f"GPT2Block {gpt_idx} mlp c_proj"] + list(params) + [weights.shape]
            ridx += 1

# save dataframe
df.to_csv("gpt2_stablefit.csv")

# plot alpha stablefit
import matplotlib.pyplot as plt
fig, axs = plt.subplots(1, 3, sharex = False,sharey=False,figsize=(12.5 + 4.5, 9.5/2 + 0.5))
axs[0].hist(model.h[0].attn.c_attn.weight.flatten().detach().numpy(),1000,density=True)
axs[0].set_xlim(-1,1)
x = np.linspace(-1, 1, 1000)
y_stable = levy_stable.pdf(x, *df.iloc[3,1:5])
axs[0].plot(x,y_stable,'r')
axs[0].set_title(rf"$\alpha$ = {df.iloc[3,1]}")
axs[1].hist(model.h[0].attn.c_proj.weight.flatten().detach().numpy(),1000,density=True)
axs[1].set_xlim(-1,1)
x = np.linspace(-1, 1, 1000)
y_stable = levy_stable.pdf(x, *df.iloc[4,1:5])
axs[1].plot(x,y_stable,'r')
axs[1].set_title(rf"$\alpha$ = {df.iloc[4,1]}")
axs[2].hist(df.iloc[:,1],25)
#plt.show()
plt.savefig("gpt2_stablefit.pdf", bbox_inches='tight')

"""

'\nimport numpy as np\nimport pandas as pd\nfrom scipy.stats import levy_stable, norm, distributions\nfrom tqdm import tqdm\n\npconv = lambda alpha, beta, mu, sigma: (alpha, beta, mu - sigma * beta * np.tan(np.pi * alpha / 2.0), sigma)\ncol_names = ["module_name", "alpha", "beta", "mu", "sigma", "shape"]\n#df = pd.DataFrame(np.zeros((1,len(col_names)))) \n#df = df.astype(\'object\')\n#df.columns = col_names    \ndf = pd.DataFrame(columns=col_names)\n#l1_module_names = list(model._module.keys())\n#for midx, module_name in enumerate(l1_module_names):\nridx = 0\nfor midx, module_name in tqdm(enumerate([\'wte\', \'wpe\', \'h\', \'ln_f\'])):\n    if module_name != "h":\n        #l2_module_names = model._modules[module_name]\n        weights = model._modules[module_name].weight.detach().numpy()\n        params = pconv(*levy_stable._fitstart(weights.flatten()))\n        df.loc[ridx,:] = [module_name] + list(params) + [weights.shape]\n        ridx += 1\n    else:\n        for gpt_idx in range(

In [None]:
hidden_module_1 = model._modules[module_names[0]]
type(hidden_module_1)

torch.nn.modules.sparse.Embedding

In [None]:
# get weights
hidden_module_1.weight.shape

torch.Size([50257, 768])

In [None]:
model._modules['h'][0]

GPT2Block(
  (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  (attn): GPT2Attention(
    (c_attn): Conv1D()
    (c_proj): Conv1D()
    (attn_dropout): Dropout(p=0.1, inplace=False)
    (resid_dropout): Dropout(p=0.1, inplace=False)
  )
  (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  (mlp): GPT2MLP(
    (c_fc): Conv1D()
    (c_proj): Conv1D()
    (act): NewGELUActivation()
    (dropout): Dropout(p=0.1, inplace=False)
  )
)

In [None]:
model._modules['ln_f'].weight.shape

torch.Size([768])

In [None]:
type(model._modules['h'])

torch.nn.modules.container.ModuleList

In [None]:
type(model._modules['h'][0])

transformers.models.gpt2.modeling_gpt2.GPT2Block

In [None]:
# Word embedding example: look up the input embedding of a single word.
from transformers import GPT2LMHeadModel
model_embedder = GPT2LMHeadModel.from_pretrained('gpt2')
# Token id(s) for 'man', encoded with `add_prefix_space=True`.
text_index = tokenizer.encode('man', add_prefix_space=True)
# Select the matching row(s) of the token-embedding weight matrix.
embedding_matrix = model_embedder.transformer.wte.weight
vector = embedding_matrix[text_index, :]
vector

tensor([[ 3.9063e-03, -4.5373e-03,  9.5696e-03, -3.2733e-02, -5.3765e-02,
          7.7608e-03, -3.4522e-01, -2.7065e-02,  1.3558e-02, -1.1779e-01,
          1.1969e-01, -2.9199e-02,  1.5931e-01,  2.5662e-03,  3.6582e-02,
          6.5900e-02,  8.8321e-02, -1.3107e-01, -4.5553e-02,  9.9681e-02,
          1.3703e-02,  8.6290e-02, -8.6695e-02,  1.4481e-01, -3.3118e-02,
          5.1048e-02,  3.9003e-02, -4.9227e-02, -7.9979e-03, -1.4979e-01,
         -2.3583e-02,  1.9372e-02,  6.7714e-02,  4.4881e-02, -9.2997e-02,
          1.7100e-01, -3.1543e-01,  6.9204e-02, -4.9375e-02, -7.6422e-02,
         -7.7999e-02,  3.6177e-02, -7.5628e-02,  1.6870e-01, -1.3687e-01,
          9.1324e-02,  6.0595e-02,  1.6535e-02,  8.5104e-03, -1.2422e-01,
         -1.6366e-02, -1.4682e-01, -4.5709e-03,  8.7555e-02, -5.5320e-02,
         -1.7597e-01, -6.9579e-02,  1.5793e-02, -4.5716e-02,  1.0274e-01,
         -1.8335e-01,  1.1280e-01, -1.8709e-01,  2.0712e-01, -1.4833e-01,
          8.7064e-02, -1.7650e-01, -3.

In [None]:
vector.shape

torch.Size([1, 768])

In [None]:
model.modules

<bound method Module.modules of GPT2Model(
  (wte): Embedding(50257, 768)
  (wpe): Embedding(1024, 768)
  (drop): Dropout(p=0.1, inplace=False)
  (h): ModuleList(
    (0-11): 12 x GPT2Block(
      (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (attn): GPT2Attention(
        (c_attn): Conv1D()
        (c_proj): Conv1D()
        (attn_dropout): Dropout(p=0.1, inplace=False)
        (resid_dropout): Dropout(p=0.1, inplace=False)
      )
      (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (mlp): GPT2MLP(
        (c_fc): Conv1D()
        (c_proj): Conv1D()
        (act): NewGELUActivation()
        (dropout): Dropout(p=0.1, inplace=False)
      )
    )
  )
  (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)>