# Chapter 5: Load OpenAI pretrained weights

In [61]:
from gpt_download import download_and_load_gpt2
# Download the GPT-2 124M checkpoint files into ./gpt2 (files that already
# exist and are up to date are reported as such) and load them.
settings, params = download_and_load_gpt2(model_size="124M", models_dir="gpt2")

File already exists and is up-to-date: gpt2\124M\checkpoint
File already exists and is up-to-date: gpt2\124M\encoder.json
File already exists and is up-to-date: gpt2\124M\hparams.json
File already exists and is up-to-date: gpt2\124M\model.ckpt.data-00000-of-00001
File already exists and is up-to-date: gpt2\124M\model.ckpt.index
File already exists and is up-to-date: gpt2\124M\model.ckpt.meta
File already exists and is up-to-date: gpt2\124M\vocab.bpe


In [62]:
# Show the checkpoint hyperparameters and the top-level weight groups.
print(f"Settings: {settings}")
print(f"Parameter dictionary keys {params.keys()}")

Settings: {'n_vocab': 50257, 'n_ctx': 1024, 'n_embd': 768, 'n_head': 12, 'n_layer': 12}
Parameter dictionary keys dict_keys(['blocks', 'b', 'g', 'wpe', 'wte'])


In [63]:
# Architecture settings per GPT-2 variant, keyed by a human-readable model name.
model_configs = {
    "gpt2-small (124M)": dict(emb_dim=768, n_layers=12, n_heads=12),
    "gpt2-medium (355M)": dict(emb_dim=1024, n_layers=24, n_heads=16),
    "gpt2-large (774M)": dict(emb_dim=1280, n_layers=36, n_heads=20),
    "gpt2-xl (1558M)": dict(emb_dim=1600, n_layers=48, n_heads=25),
}

In [64]:
# Baseline configuration; later cells copy it and override individual entries
# (context_length and qkv_bias) before building the model.
GPT_CONFIG_124M = dict(
    vocab_size=50257,
    context_length=256,
    emb_dim=768,
    n_heads=12,
    n_layers=12,
    drop_rate=0.1,
    qkv_bias=False,
)

In [65]:
# Start from the baseline config and overlay the size-specific settings of the
# selected model; GPT_CONFIG_124M itself is left untouched.
model_name = "gpt2-small (124M)"
NEW_CONFIG = {**GPT_CONFIG_124M, **model_configs[model_name]}

In [66]:
# Override the baseline values: 1024 matches 'n_ctx' in the downloaded
# settings, and qkv_bias is enabled because the checkpoint provides attention
# biases that are loaded further below.
NEW_CONFIG.update(context_length=1024, qkv_bias=True)

In [67]:
from gpt import GPTModel
import torch

# Build the model from the merged configuration and put it in evaluation mode.
# eval() returns the module, so the cell displays the model structure.
gpt = GPTModel(NEW_CONFIG)
gpt.eval()

GPTModel(
  (tok_emb): Embedding(50257, 768)
  (pos_emb): Embedding(1024, 768)
  (drop_emb): Dropout(p=0.1, inplace=False)
  (trf_block): Sequential(
    (0): TransformerBlock(
      (att): MultiHeadAttention(
        (W_query): Linear(in_features=768, out_features=768, bias=True)
        (W_key): Linear(in_features=768, out_features=768, bias=True)
        (W_value): Linear(in_features=768, out_features=768, bias=True)
        (out_ptoj): Linear(in_features=768, out_features=768, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (ff): FeedForward(
        (layers): Sequential(
          (0): Linear(in_features=768, out_features=3072, bias=True)
          (1): GELU()
          (2): Linear(in_features=3072, out_features=768, bias=True)
        )
      )
      (norm1): LayerNorm()
      (norm2): LayerNorm()
      (drop_shortcut): Dropout(p=0.1, inplace=False)
    )
    (1): TransformerBlock(
      (att): MultiHeadAttention(
        (W_query): Linear(in_features=76

Check whether the two tensors have the same shape before assigning the pretrained values.

In [68]:
def assign(left, right):
    """Return ``right`` as a new ``torch.nn.Parameter`` after a shape check.

    Args:
        left: Existing tensor or parameter; only its ``shape`` is inspected.
        right: Replacement values. Must expose ``.shape`` and be accepted by
            ``torch.tensor`` (e.g. a NumPy array).

    Returns:
        A ``torch.nn.Parameter`` holding a copy of ``right``.

    Raises:
        ValueError: If ``left`` and ``right`` have different shapes.
    """
    if left.shape != right.shape:
        # Single f-string so the message reads "..., Right: ..." (the previous
        # adjacent literals were concatenated without a separating space).
        raise ValueError(
            f"Shape mismatch. Left: {left.shape}, Right: {right.shape}"
        )
    # torch.tensor copies the data, so the parameter does not alias `right`.
    return torch.nn.Parameter(torch.tensor(right))

In [69]:
# Inspect the top-level entries of the loaded parameter dictionary.
params.keys()

dict_keys(['blocks', 'b', 'g', 'wpe', 'wte'])

In [70]:
# Inspect which attention sub-entries the first transformer block provides.
params["blocks"][0]["attn"].keys()

dict_keys(['c_attn', 'c_proj'])

In [71]:
import numpy as np

def load_weights_into_gpt(gpt, params):
    """Copy the downloaded GPT-2 parameters into a ``GPTModel`` instance.

    Every target parameter is replaced via ``assign``, which raises
    ``ValueError`` when the shapes of the model parameter and the loaded array
    differ.

    Args:
        gpt: Model to populate. Expected to expose ``pos_emb``, ``tok_emb``,
            ``trf_block``, ``final_norm`` and ``out_head``.
        params: Parameter dictionary with the keys ``"wpe"``, ``"wte"``,
            ``"blocks"``, ``"g"`` and ``"b"``. Each entry of ``"blocks"``
            provides ``"attn"``, ``"mlp"``, ``"ln_1"`` and ``"ln_2"``.

    Returns:
        None. ``gpt`` is modified in place.
    """
    gpt.pos_emb.weight = assign(gpt.pos_emb.weight, params["wpe"])
    gpt.tok_emb.weight = assign(gpt.tok_emb.weight, params["wte"])

    for b, block_params in enumerate(params["blocks"]):
        block = gpt.trf_block[b]
        attn = block.att
        attn_params = block_params["attn"]
        mlp_params = block_params["mlp"]

        # The checkpoint stores query, key and value weights in one fused
        # array; split it into three equal parts along the last axis.
        # Weight matrices are transposed before assignment -- presumably the
        # checkpoint keeps them as (in, out) while the Linear layers expect
        # (out, in); `assign` verifies the resulting shapes.
        q_w, k_w, v_w = np.split(attn_params["c_attn"]["w"], 3, axis=-1)
        attn.W_query.weight = assign(attn.W_query.weight, q_w.T)
        attn.W_key.weight = assign(attn.W_key.weight, k_w.T)
        attn.W_value.weight = assign(attn.W_value.weight, v_w.T)

        q_b, k_b, v_b = np.split(attn_params["c_attn"]["b"], 3, axis=-1)
        attn.W_query.bias = assign(attn.W_query.bias, q_b)
        attn.W_key.bias = assign(attn.W_key.bias, k_b)
        attn.W_value.bias = assign(attn.W_value.bias, v_b)

        # NOTE: the attribute is spelled `out_ptoj` in the model definition
        # (see the printed module structure), so that spelling is kept here.
        attn.out_ptoj.weight = assign(
            attn.out_ptoj.weight, attn_params["c_proj"]["w"].T
        )
        attn.out_ptoj.bias = assign(
            attn.out_ptoj.bias, attn_params["c_proj"]["b"]
        )

        # Feed-forward network: layers[0] and layers[2] are the two Linear
        # layers around the GELU activation.
        ff_in, ff_out = block.ff.layers[0], block.ff.layers[2]
        ff_in.weight = assign(ff_in.weight, mlp_params["c_fc"]["w"].T)
        ff_in.bias = assign(ff_in.bias, mlp_params["c_fc"]["b"])
        ff_out.weight = assign(ff_out.weight, mlp_params["c_proj"]["w"].T)
        ff_out.bias = assign(ff_out.bias, mlp_params["c_proj"]["b"])

        # Layer norms: "g" goes to scale, "b" goes to shift.
        block.norm1.scale = assign(block.norm1.scale, block_params["ln_1"]["g"])
        block.norm1.shift = assign(block.norm1.shift, block_params["ln_1"]["b"])
        block.norm2.scale = assign(block.norm2.scale, block_params["ln_2"]["g"])
        block.norm2.shift = assign(block.norm2.shift, block_params["ln_2"]["b"])

    # These are model-level parameters, so they are loaded once after the loop.
    # Previously they sat inside the loop body, which re-copied them for every
    # block and skipped them entirely when there were no blocks.
    gpt.final_norm.scale = assign(gpt.final_norm.scale, params["g"])
    gpt.final_norm.shift = assign(gpt.final_norm.shift, params["b"])
    # The output head is initialised from the token-embedding matrix ("wte").
    gpt.out_head.weight = assign(gpt.out_head.weight, params["wte"])

In [72]:
# Populate the model with the pretrained parameters.
load_weights_into_gpt(gpt, params)

# Use the GPU when one is available, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# to() returns the module, so the cell displays the model structure.
gpt.to(device)

GPTModel(
  (tok_emb): Embedding(50257, 768)
  (pos_emb): Embedding(1024, 768)
  (drop_emb): Dropout(p=0.1, inplace=False)
  (trf_block): Sequential(
    (0): TransformerBlock(
      (att): MultiHeadAttention(
        (W_query): Linear(in_features=768, out_features=768, bias=True)
        (W_key): Linear(in_features=768, out_features=768, bias=True)
        (W_value): Linear(in_features=768, out_features=768, bias=True)
        (out_ptoj): Linear(in_features=768, out_features=768, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (ff): FeedForward(
        (layers): Sequential(
          (0): Linear(in_features=768, out_features=3072, bias=True)
          (1): GELU()
          (2): Linear(in_features=3072, out_features=768, bias=True)
        )
      )
      (norm1): LayerNorm()
      (norm2): LayerNorm()
      (drop_shortcut): Dropout(p=0.1, inplace=False)
    )
    (1): TransformerBlock(
      (att): MultiHeadAttention(
        (W_query): Linear(in_features=76

In [73]:
from generate import generate, text_to_token_ids, token_ids_to_text
import tiktoken
# Tokenizer for the "gpt2" encoding, used to encode the prompt and decode the output.
tokenizer = tiktoken.get_encoding("gpt2")

In [74]:
# Fix the random seed before sampling.
torch.manual_seed(123)

# Encode the prompt and place it on the same device as the model.
prompt_ids = text_to_token_ids("Every effort moves you", tokenizer).to(device)

token_ids = generate(
    model=gpt,
    idx=prompt_ids,
    max_new_tokens=25,
    context_size=NEW_CONFIG["context_length"],
    top_k=50,
    temperature=1.5,
)

generated_text = token_ids_to_text(token_ids, tokenizer)
print(f"Output text:\n {generated_text}")

Output text:
 Every effort moves you an understanding of the new aeulve!

What makes

Fur on, what do you get one
