In [3]:
# Optional: uncomment the next line to install the Hugging Face `transformers` package
# !pip install transformers

In [4]:
from importlib.metadata import version

# Print the installed version of every package this notebook depends on,
# so the environment that produced the outputs is recorded.
pkgs = ["numpy", "torch", "transformers"]
for p in pkgs:
    print(f"{p} version: {version(p)}")

numpy version: 1.26.4
torch version: 2.5.1
transformers version: 4.47.1


In [16]:
import torch
import numpy as np
import pandas as pd

In [5]:
from transformers import GPT2Model


# Allowed model names: maps a human-readable label to the identifier that is
# passed to `GPT2Model.from_pretrained`.
model_names = {
    "gpt2-small (124M)": "openai-community/gpt2",
    "gpt2-medium (355M)": "openai-community/gpt2-medium",
    "gpt2-large (774M)": "openai-community/gpt2-large",
    "gpt2-xl (1558M)": "openai-community/gpt2-xl"
}

# Must be one of the keys of `model_names`; the same label is used again later
# to look up the matching entry in `model_configs`.
CHOOSE_MODEL = "gpt2-small (124M)"

# Load the pretrained model; `cache_dir` points the loader at the local "checkpoints" directory.
gpt_hf = GPT2Model.from_pretrained(model_names[CHOOSE_MODEL], cache_dir="checkpoints")
# Switch to evaluation mode. As the last expression of the cell, the returned
# module is also displayed, which produces the architecture printout.
gpt_hf.eval()

  from .autonotebook import tqdm as notebook_tqdm
2024-12-20 12:28:29.707536: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-12-20 12:28:29.713868: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1734694109.721708  187845 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1734694109.724010  187845 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-12-20 12:28:29.732010: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorF

GPT2Model(
  (wte): Embedding(50257, 768)
  (wpe): Embedding(1024, 768)
  (drop): Dropout(p=0.1, inplace=False)
  (h): ModuleList(
    (0-11): 12 x GPT2Block(
      (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (attn): GPT2SdpaAttention(
        (c_attn): Conv1D(nf=2304, nx=768)
        (c_proj): Conv1D(nf=768, nx=768)
        (attn_dropout): Dropout(p=0.1, inplace=False)
        (resid_dropout): Dropout(p=0.1, inplace=False)
      )
      (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (mlp): GPT2MLP(
        (c_fc): Conv1D(nf=3072, nx=768)
        (c_proj): Conv1D(nf=768, nx=3072)
        (act): NewGELUActivation()
        (dropout): Dropout(p=0.1, inplace=False)
      )
    )
  )
  (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)

### Why does the Hugging Face GPT-2 model use Conv1D?

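Note that the `Conv1D` modules in the printout above are Hugging Face's own `Conv1D` class, not PyTorch's `nn.Conv1d`; the name is inherited from the original OpenAI GPT-2 code. Functionally, the layer works like a linear layer whose weight matrix is stored transposed: `c_attn.weight` has shape `(768, 2304)`, i.e. `[in_features, out_features]`, whereas `nn.Linear` stores its weight as `[out_features, in_features]`. The difference from a `Linear` layer therefore lies in the weight layout rather than in the computation, which matters when the weights are copied into `Linear` layers.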

A genuine 1D convolution with `kernel_size=1` (PyTorch's `nn.Conv1d`) is closely related: it applies the same linear map independently at every sequence position, so it is mathematically equivalent to a `Linear` layer and likewise handles variable sequence lengths. The practical difference is the input layout. `nn.Linear` acts on the last dimension and accepts `[batch_size, seq_len, hidden_dim]` directly, so no explicit reshape to `[batch_size * seq_len, hidden_dim]` is needed, whereas `nn.Conv1d` expects `[batch_size, hidden_dim, seq_len]`, which means the input must be transposed first. The cells below build both layers with the same weights to illustrate the equivalence.

In [27]:
import torch.nn as nn


In [25]:
# Define input tensor: batch of sequences with token embeddings
batch_size = 2
sequence_length = 5
embedding_dim = 768
# A dedicated, seeded generator makes the random values identical on every
# run (Restart Kernel -> Run All) without touching PyTorch's global RNG state.
input_tensor = torch.randn(
    batch_size,
    sequence_length,
    embedding_dim,
    generator=torch.Generator().manual_seed(123),
)


In [26]:
# Sizes of the Linear layer equivalent to c_attn. With embedding_dim = 768 this
# gives 768 -> 2304, matching `Conv1D(nf=2304, nx=768)` in the model printout.
hidden_dim = embedding_dim
output_dim = 3 * hidden_dim  # For Q, K, V combined

In [28]:
# Linear layer with the same input/output sizes as c_attn
linear_layer = nn.Linear(in_features=hidden_dim, out_features=output_dim)

In [29]:
# Define a PyTorch Conv1d counterpart of c_attn. With kernel_size=1 the
# convolution only ever looks at a single sequence position at a time.
conv1d_layer = nn.Conv1d(
    in_channels=hidden_dim,
    out_channels=output_dim,
    kernel_size=1
)

In [30]:
# Initialize Conv1d weights and biases to match the Linear layer.
# The values are copied in place (instead of rebinding `.data` to a view), so
# the two layers keep separate storage and autograd is not bypassed.
with torch.no_grad():
    # Linear weight is (output_dim, hidden_dim); Conv1d adds a trailing kernel axis of size 1
    conv1d_layer.weight.copy_(linear_layer.weight.view(output_dim, hidden_dim, 1))
    conv1d_layer.bias.copy_(linear_layer.bias)

In [32]:
# Top-left 2x2 corner of the Linear weight, to compare with the Conv1d weight
linear_layer.weight[:2, :2]

tensor([[ 0.0232,  0.0121],
        [-0.0341,  0.0156]], grad_fn=<SliceBackward0>)

In [35]:
# Same 2x2 corner of the Conv1d weight; index 0 selects the single kernel position
conv1d_layer.weight[:2, :2, 0]

tensor([[ 0.0232,  0.0121],
        [-0.0341,  0.0156]], grad_fn=<SelectBackward0>)

## Continue

In [6]:
# Architecture settings that differ between the GPT-2 model sizes
model_configs = {
    "gpt2-small (124M)": {"emb_dim": 768, "n_layers": 12, "n_heads": 12},
    "gpt2-medium (355M)": {"emb_dim": 1024, "n_layers": 24, "n_heads": 16},
    "gpt2-large (774M)": {"emb_dim": 1280, "n_layers": 36, "n_heads": 20},
    "gpt2-xl (1558M)": {"emb_dim": 1600, "n_layers": 48, "n_heads": 25},
}

# Settings shared by every model size, followed by the entries of the model
# selected through CHOOSE_MODEL (appended after the shared keys).
BASE_CONFIG = {
    "vocab_size": 50257,     # Vocabulary size
    "context_length": 1024,  # Context length
    "drop_rate": 0.0,        # Dropout rate
    "qkv_bias": True,        # Query-key-value bias
    **model_configs[CHOOSE_MODEL],
}

In [7]:
# Show the merged configuration for the selected model
BASE_CONFIG

{'vocab_size': 50257,
 'context_length': 1024,
 'drop_rate': 0.0,
 'qkv_bias': True,
 'emb_dim': 768,
 'n_layers': 12,
 'n_heads': 12}

In [9]:
def assign_check(left, right):
    """Return `right` wrapped as a new parameter after a shape check against `left`.

    Args:
        left: Object whose `.shape` is the expected shape (the current parameter).
        right: Tensor holding the values to load.

    Returns:
        A `torch.nn.Parameter` built from a detached copy of `right`.

    Raises:
        ValueError: If `left.shape` and `right.shape` differ.
    """
    shapes_agree = left.shape == right.shape
    if not shapes_agree:
        raise ValueError(f"Shape mismatch. Left: {left.shape}, Right: {right.shape}")

    # Clone first so the new parameter never shares storage with `right`
    copied = right.clone().detach()
    return torch.nn.Parameter(copied)

In [12]:
import torch
from scripts.previous_chapters import GPTModel


# Build the local GPTModel (imported from scripts.previous_chapters) with the selected configuration
gpt = GPTModel(BASE_CONFIG)

# Use a CUDA device when one is available, otherwise the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# NOTE(review): load_weights is defined further down in this notebook, so the call is left disabled here
# load_weights(gpt, gpt_hf)

In [13]:
# State dict of the Hugging Face model: parameter name -> tensor
d_gpt_hf = gpt_hf.state_dict()

In [19]:
def parse_state_dict_to_hierarchical_table(state_dict):
    """
    Summarize a GPT2 state_dict as a pandas DataFrame with one row per entry.

    Each key is split on "."; the first piece becomes "Main Module". For keys
    under "h", the second piece is stored as the integer "GPT2Block Number"
    and everything after it as "GPT2Block Component". Both columns hold -1
    when they do not apply, so the trailing part of keys outside "h" (for
    example "weight" vs. "bias") is not kept. "Tensor Shape" is the shape of
    the entry as a tuple.
    """
    def describe(key, tensor):
        # "h.3.attn.c_attn.weight" -> "h" and ["3", "attn", "c_attn", "weight"]
        main_module, *rest = key.split(".")

        # -1 marks "not part of a GPT2Block"
        block_number, block_component = -1, -1
        if main_module == "h" and rest:
            block_number = int(rest[0])
            if len(rest) > 1:
                block_component = ".".join(rest[1:])

        return {
            "Main Module": main_module,
            "GPT2Block Number": block_number,
            "GPT2Block Component": block_component,
            "Tensor Shape": tuple(tensor.shape),
        }

    rows = [describe(key, tensor) for key, tensor in state_dict.items()]
    return pd.DataFrame(rows)

In [20]:
# Build the overview table from the Hugging Face state dict
parsed_table = parse_state_dict_to_hierarchical_table(d_gpt_hf)

In [23]:
# First ten entries: the embeddings followed by the start of block 0
parsed_table.head(10)

Unnamed: 0,Main Module,GPT2Block Number,GPT2Block Component,Tensor Shape
0,wte,-1,-1,"(50257, 768)"
1,wpe,-1,-1,"(1024, 768)"
2,h,0,ln_1.weight,"(768,)"
3,h,0,ln_1.bias,"(768,)"
4,h,0,attn.c_attn.weight,"(768, 2304)"
5,h,0,attn.c_attn.bias,"(2304,)"
6,h,0,attn.c_proj.weight,"(768, 768)"
7,h,0,attn.c_proj.bias,"(768,)"
8,h,0,ln_2.weight,"(768,)"
9,h,0,ln_2.bias,"(768,)"


In [24]:
# Last ten entries: the end of the final block followed by the final layer norm
parsed_table.tail(10)

Unnamed: 0,Main Module,GPT2Block Number,GPT2Block Component,Tensor Shape
138,h,11,attn.c_proj.weight,"(768, 768)"
139,h,11,attn.c_proj.bias,"(768,)"
140,h,11,ln_2.weight,"(768,)"
141,h,11,ln_2.bias,"(768,)"
142,h,11,mlp.c_fc.weight,"(768, 3072)"
143,h,11,mlp.c_fc.bias,"(3072,)"
144,h,11,mlp.c_proj.weight,"(3072, 768)"
145,h,11,mlp.c_proj.bias,"(768,)"
146,ln_f,-1,-1,"(768,)"
147,ln_f,-1,-1,"(768,)"


In [36]:
import numpy as np


def load_weights(gpt, gpt_hf):
    """Copy the pretrained weights of the Hugging Face model `gpt_hf` into `gpt`.

    Every target parameter is replaced through `assign_check`, so a tensor
    whose shape does not match its destination raises `ValueError`.

    Args:
        gpt: Destination model; must provide the attributes accessed below
            (`pos_emb`, `tok_emb`, `trf_blocks`, `final_norm`, `out_head`).
        gpt_hf: Source model whose `state_dict()` uses the GPT-2 key names
            (`wte.weight`, `wpe.weight`, `h.<i>. ...`, `ln_f. ...`).

    Note:
        The number of transformer blocks is read from the notebook-level
        `BASE_CONFIG["n_layers"]`, not from `gpt` itself.
    """
    d = gpt_hf.state_dict()

    # Model-level parameters do not depend on the block index, so they are
    # loaded exactly once instead of being re-cloned on every loop iteration
    # (and they are loaded even when there are no blocks at all).
    gpt.pos_emb.weight = assign_check(gpt.pos_emb.weight, d["wpe.weight"])
    gpt.tok_emb.weight = assign_check(gpt.tok_emb.weight, d["wte.weight"])
    gpt.final_norm.scale = assign_check(gpt.final_norm.scale, d["ln_f.weight"])
    gpt.final_norm.shift = assign_check(gpt.final_norm.shift, d["ln_f.bias"])
    # The output head receives its own copy of the token-embedding matrix
    gpt.out_head.weight = assign_check(gpt.out_head.weight, d["wte.weight"])

    for b in range(BASE_CONFIG["n_layers"]):
        block = gpt.trf_blocks[b]
        prefix = f"h.{b}"

        # c_attn holds query, key and value concatenated along the last axis;
        # cut it into three equal parts. The checkpoint stores these matrices
        # as (in_features, out_features), hence the transposes below.
        q_w, k_w, v_w = d[f"{prefix}.attn.c_attn.weight"].chunk(3, dim=-1)
        block.att.W_query.weight = assign_check(block.att.W_query.weight, q_w.T)
        block.att.W_key.weight = assign_check(block.att.W_key.weight, k_w.T)
        block.att.W_value.weight = assign_check(block.att.W_value.weight, v_w.T)

        q_b, k_b, v_b = d[f"{prefix}.attn.c_attn.bias"].chunk(3, dim=-1)
        block.att.W_query.bias = assign_check(block.att.W_query.bias, q_b)
        block.att.W_key.bias = assign_check(block.att.W_key.bias, k_b)
        block.att.W_value.bias = assign_check(block.att.W_value.bias, v_b)

        # Attention output projection
        block.att.out_proj.weight = assign_check(block.att.out_proj.weight, d[f"{prefix}.attn.c_proj.weight"].T)
        block.att.out_proj.bias = assign_check(block.att.out_proj.bias, d[f"{prefix}.attn.c_proj.bias"])

        # Feed-forward layers: index 0 receives c_fc, index 2 receives c_proj
        block.ff.layers[0].weight = assign_check(block.ff.layers[0].weight, d[f"{prefix}.mlp.c_fc.weight"].T)
        block.ff.layers[0].bias = assign_check(block.ff.layers[0].bias, d[f"{prefix}.mlp.c_fc.bias"])
        block.ff.layers[2].weight = assign_check(block.ff.layers[2].weight, d[f"{prefix}.mlp.c_proj.weight"].T)
        block.ff.layers[2].bias = assign_check(block.ff.layers[2].bias, d[f"{prefix}.mlp.c_proj.bias"])

        # Layer norms: Hugging Face weight/bias map to scale/shift
        block.norm1.scale = assign_check(block.norm1.scale, d[f"{prefix}.ln_1.weight"])
        block.norm1.shift = assign_check(block.norm1.shift, d[f"{prefix}.ln_1.bias"])
        block.norm2.scale = assign_check(block.norm2.scale, d[f"{prefix}.ln_2.weight"])
        block.norm2.shift = assign_check(block.norm2.shift, d[f"{prefix}.ln_2.bias"])