# Grow 70M Pythia to 160M Pythia

Model size table:

| Params | n_layers | d_model | n_heads | d_head | Hugging Face Checkpoints |
| ------ | -------- | ------- | ------- | ------ | ------------------------ |
|   70M  |    6     |   512   |    8    |   64   | [Standard](https://huggingface.co/EleutherAI/pythia-70m) |
|  160M  |    12    |   768   |    12   |   64   | [Standard](https://huggingface.co/EleutherAI/pythia-160m) |


Based on the code in https://github.com/allenai/staged-training/blob/main/gpt_pretrain.py.


## Instantiate Pythia 70M

In [2]:
from transformers import GPTNeoXForCausalLM, AutoTokenizer
import torch
import re
import copy
import json
import importlib
import sys

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
importlib.reload(sys.modules['transformers'])

<module 'transformers' from '/Users/vibhamasti/Personal/CMU/S24/Capstone/code/LoRA-Instruction-Finetune/venv/lib/python3.11/site-packages/transformers/__init__.py'>

In [4]:
# Load the pretrained Pythia-70M causal LM and its tokenizer, using
# ../.cache/pythia-70m as the cache directory.
model_70m = GPTNeoXForCausalLM.from_pretrained(
  "EleutherAI/pythia-70m",
  cache_dir="../.cache/pythia-70m",
)

tokenizer_70m = AutoTokenizer.from_pretrained(
  "EleutherAI/pythia-70m",
  cache_dir="../.cache/pythia-70m",
)

# Sanity check: generate a continuation of a fixed prompt with the default
# generation settings. This output is the baseline that later cells compare
# the grown model against.
inputs = tokenizer_70m("Finish the following sentence:\nRaindrops on roses", return_tensors="pt")
tokens = model_70m.generate(**inputs)
# Must stay the last expression of the cell so the notebook displays it.
tokenizer_70m.decode(tokens[0])

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


'Finish the following sentence:\nRaindrops on roses\n\nI have a question for you.'

## Use function-preserving growth to expand the model to 160M

In [5]:
model_70m

GPTNeoXForCausalLM(
  (gpt_neox): GPTNeoXModel(
    (embed_in): Embedding(50304, 512)
    (emb_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-5): 6 x GPTNeoXLayer(
        (input_layernorm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (post_attention_layernorm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (post_attention_dropout): Dropout(p=0.0, inplace=False)
        (post_mlp_dropout): Dropout(p=0.0, inplace=False)
        (attention): GPTNeoXAttention(
          (rotary_emb): GPTNeoXRotaryEmbedding()
          (query_key_value): Linear(in_features=512, out_features=1536, bias=True)
          (dense): Linear(in_features=512, out_features=512, bias=True)
          (attention_dropout): Dropout(p=0.0, inplace=False)
        )
        (mlp): GPTNeoXMLP(
          (dense_h_to_4h): Linear(in_features=512, out_features=2048, bias=True)
          (dense_4h_to_h): Linear(in_features=2048, out_features=512, bias=True)
          (a

In [6]:
for name, module in model_70m.named_modules():
  print(name, module)

 GPTNeoXForCausalLM(
  (gpt_neox): GPTNeoXModel(
    (embed_in): Embedding(50304, 512)
    (emb_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-5): 6 x GPTNeoXLayer(
        (input_layernorm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (post_attention_layernorm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (post_attention_dropout): Dropout(p=0.0, inplace=False)
        (post_mlp_dropout): Dropout(p=0.0, inplace=False)
        (attention): GPTNeoXAttention(
          (rotary_emb): GPTNeoXRotaryEmbedding()
          (query_key_value): Linear(in_features=512, out_features=1536, bias=True)
          (dense): Linear(in_features=512, out_features=512, bias=True)
          (attention_dropout): Dropout(p=0.0, inplace=False)
        )
        (mlp): GPTNeoXMLP(
          (dense_h_to_4h): Linear(in_features=512, out_features=2048, bias=True)
          (dense_4h_to_h): Linear(in_features=2048, out_features=512, bias=True)
          (

In [19]:
# Load GPT-2 and list its modules. The referenced staged-training code is
# named gpt_pretrain.py, so this is presumably for comparing GPT-2 module
# names (wte, wpe, h.<i>...) with the GPT-NeoX names used by Pythia
# -- TODO confirm.
from transformers import GPT2Tokenizer, GPT2Model
tokenizer_gpt = GPT2Tokenizer.from_pretrained('gpt2')
model_gpt = GPT2Model.from_pretrained('gpt2')

# named_modules() yields (qualified name, module) pairs, starting with the
# root module under the empty name.
for name, module in model_gpt.named_modules():
    print(name, module)

 GPT2Model(
  (wte): Embedding(50257, 768)
  (wpe): Embedding(1024, 768)
  (drop): Dropout(p=0.1, inplace=False)
  (h): ModuleList(
    (0-11): 12 x GPT2Block(
      (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (attn): GPT2Attention(
        (c_attn): Conv1D()
        (c_proj): Conv1D()
        (attn_dropout): Dropout(p=0.1, inplace=False)
        (resid_dropout): Dropout(p=0.1, inplace=False)
      )
      (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (mlp): GPT2MLP(
        (c_fc): Conv1D()
        (c_proj): Conv1D()
        (act): NewGELUActivation()
        (dropout): Dropout(p=0.1, inplace=False)
      )
    )
  )
  (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)
wte Embedding(50257, 768)
wpe Embedding(1024, 768)
drop Dropout(p=0.1, inplace=False)
h ModuleList(
  (0-11): 12 x GPT2Block(
    (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (attn): GPT2Attention(
      (c_attn): Conv1D()
      (c_proj): Co

In [7]:
import importlib

In [8]:
import grow_depth

In [9]:
importlib.reload(grow_depth)

<module 'grow_depth' from '/Users/vibhamasti/Personal/CMU/S24/Capstone/code/LoRA-Instruction-Finetune/src/grow_depth.py'>

In [20]:
# Load the pretrained Pythia-160M causal LM and its tokenizer, using
# ../.cache/pythia-160m as the cache directory. This is the target size for
# growing the 70M model.
model_160m = GPTNeoXForCausalLM.from_pretrained(
  "EleutherAI/pythia-160m",
  cache_dir="../.cache/pythia-160m",
)

tokenizer_160m = AutoTokenizer.from_pretrained(
  "EleutherAI/pythia-160m",
  cache_dir="../.cache/pythia-160m",
)

# Same prompt as the 70M sanity check, so the two outputs can be compared.
inputs = tokenizer_160m("Finish the following sentence:\nRaindrops on roses", return_tensors="pt")
tokens = model_160m.generate(**inputs)
# Must stay the last expression of the cell so the notebook displays it.
tokenizer_160m.decode(tokens[0])

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


'Finish the following sentence:\nRaindrops on roses, and the flowers on the ground.\n'

In [21]:
# Tokenize with the 70M tokenizer but generate with the 160M model. The
# recorded output is identical to the run that used the 160M tokenizer,
# presumably confirming the two tokenizers are interchangeable -- TODO confirm
# this was the intent.
inputs = tokenizer_70m("Finish the following sentence:\nRaindrops on roses", return_tensors="pt")
tokens = model_160m.generate(**inputs)
tokenizer_70m.decode(tokens[0])

Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


'Finish the following sentence:\nRaindrops on roses, and the flowers on the ground.\n'

In [37]:
model_70m.gpt_neox.embed_in

Embedding(50304, 512)

In [34]:
model_70m.gpt_neox.embed_in.weight.shape

torch.Size([50304, 512])

In [36]:
model_70m.embed_out.weight.shape


torch.Size([50304, 512])

In [38]:
model_70m.embed_out

Linear(in_features=512, out_features=50304, bias=False)

In [29]:
for name, module in model_70m.named_modules():
    print(name, module)

 GPTNeoXForCausalLM(
  (gpt_neox): GPTNeoXModel(
    (embed_in): Embedding(50304, 512)
    (emb_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-5): 6 x GPTNeoXLayer(
        (input_layernorm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (post_attention_layernorm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (post_attention_dropout): Dropout(p=0.0, inplace=False)
        (post_mlp_dropout): Dropout(p=0.0, inplace=False)
        (attention): GPTNeoXAttention(
          (rotary_emb): GPTNeoXRotaryEmbedding()
          (query_key_value): Linear(in_features=512, out_features=1536, bias=True)
          (dense): Linear(in_features=512, out_features=512, bias=True)
          (attention_dropout): Dropout(p=0.0, inplace=False)
        )
        (mlp): GPTNeoXMLP(
          (dense_h_to_4h): Linear(in_features=512, out_features=2048, bias=True)
          (dense_4h_to_h): Linear(in_features=2048, out_features=512, bias=True)
          (

In [22]:
for name, module in model_160m.named_modules():
    print(name, module)

 GPTNeoXForCausalLM(
  (gpt_neox): GPTNeoXModel(
    (embed_in): Embedding(50304, 768)
    (emb_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-11): 12 x GPTNeoXLayer(
        (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (post_attention_dropout): Dropout(p=0.0, inplace=False)
        (post_mlp_dropout): Dropout(p=0.0, inplace=False)
        (attention): GPTNeoXAttention(
          (rotary_emb): GPTNeoXRotaryEmbedding()
          (query_key_value): Linear(in_features=768, out_features=2304, bias=True)
          (dense): Linear(in_features=768, out_features=768, bias=True)
          (attention_dropout): Dropout(p=0.0, inplace=False)
        )
        (mlp): GPTNeoXMLP(
          (dense_h_to_4h): Linear(in_features=768, out_features=3072, bias=True)
          (dense_4h_to_h): Linear(in_features=3072, out_features=768, bias=True)
         

In [26]:
model_70m.gpt_neox.layers[0].input_layernorm.weight.shape

GPTNeoXLayer(
  (input_layernorm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
  (post_attention_layernorm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
  (post_attention_dropout): Dropout(p=0.0, inplace=False)
  (post_mlp_dropout): Dropout(p=0.0, inplace=False)
  (attention): GPTNeoXAttention(
    (rotary_emb): GPTNeoXRotaryEmbedding()
    (query_key_value): Linear(in_features=512, out_features=1536, bias=True)
    (dense): Linear(in_features=512, out_features=512, bias=True)
    (attention_dropout): Dropout(p=0.0, inplace=False)
  )
  (mlp): GPTNeoXMLP(
    (dense_h_to_4h): Linear(in_features=512, out_features=2048, bias=True)
    (dense_4h_to_h): Linear(in_features=2048, out_features=512, bias=True)
    (act): GELUActivation()
  )
)

In [18]:
for name, module in model_70m.named_modules():
    print(name, module)

 GPTNeoXForCausalLM(
  (gpt_neox): GPTNeoXModel(
    (embed_in): Embedding(50304, 512)
    (emb_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-5): 6 x GPTNeoXLayer(
        (input_layernorm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (post_attention_layernorm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (post_attention_dropout): Dropout(p=0.0, inplace=False)
        (post_mlp_dropout): Dropout(p=0.0, inplace=False)
        (attention): GPTNeoXAttention(
          (rotary_emb): GPTNeoXRotaryEmbedding()
          (query_key_value): Linear(in_features=512, out_features=1536, bias=True)
          (dense): Linear(in_features=512, out_features=512, bias=True)
          (attention_dropout): Dropout(p=0.0, inplace=False)
        )
        (mlp): GPTNeoXMLP(
          (dense_h_to_4h): Linear(in_features=512, out_features=2048, bias=True)
          (dense_4h_to_h): Linear(in_features=2048, out_features=512, bias=True)
          (

In [11]:
# Grow the 70M model's layer stack with the project-local grow_depth module.
# NOTE(review): the arguments 6 and 12 look like the old and new layer counts,
# and 'alternate' like the layer-duplication order; confirm against
# grow_depth.expand_layers, which is not shown here.
model_70m_expanded = grow_depth.expand_layers(model_70m, 6, 12, expand_type='alternate')

# List the modules of the expanded layer stack; the recorded output shows
# 12 GPTNeoXLayer entries with d_model still 512 (depth grown, width unchanged).
for name, module in model_70m_expanded.gpt_neox.layers.named_modules():
    print(name, module)

ModuleList(
  (0-11): 12 x GPTNeoXLayer(
    (input_layernorm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
    (post_attention_layernorm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
    (post_attention_dropout): Dropout(p=0.0, inplace=False)
    (post_mlp_dropout): Dropout(p=0.0, inplace=False)
    (attention): GPTNeoXAttention(
      (rotary_emb): GPTNeoXRotaryEmbedding()
      (query_key_value): Linear(in_features=512, out_features=1536, bias=True)
      (dense): Linear(in_features=512, out_features=512, bias=True)
      (attention_dropout): Dropout(p=0.0, inplace=False)
    )
    (mlp): GPTNeoXMLP(
      (dense_h_to_4h): Linear(in_features=512, out_features=2048, bias=True)
      (dense_4h_to_h): Linear(in_features=2048, out_features=512, bias=True)
      (act): GELUActivation()
    )
  )
)
 ModuleList(
  (0-11): 12 x GPTNeoXLayer(
    (input_layernorm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
    (post_attention_layernorm): LayerNorm((512,), eps=

In [17]:
# Run the same prompt through the depth-expanded model. The recorded output
# matches the original 70M model's output for this prompt.
inputs = tokenizer_70m("Finish the following sentence:\nRaindrops on roses", return_tensors="pt")
tokens = model_70m_expanded.generate(**inputs)
tokenizer_70m.decode(tokens[0])

Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


'Finish the following sentence:\nRaindrops on roses\n\nI have a question for you.'

In [13]:
# Print every module of the 70M model together with the numeric fragments
# found in its qualified name (e.g. the "0" in "gpt_neox.layers.0.attention").
for name, module in model_70m.named_modules():
    # Raw string: "\d" inside a plain string literal is an invalid escape
    # sequence and triggers a warning on recent Python versions.
    # NOTE: the pattern also picks up digits that are part of a module name,
    # such as the "4" in "dense_h_to_4h", not only the layer index.
    layer_idx = re.findall(r"[-\d]+", name)
    print(name, module, layer_idx)

 GPTNeoXForCausalLM(
  (gpt_neox): GPTNeoXModel(
    (embed_in): Embedding(50304, 512)
    (emb_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-5): 6 x GPTNeoXLayer(
        (input_layernorm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (post_attention_layernorm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (post_attention_dropout): Dropout(p=0.0, inplace=False)
        (post_mlp_dropout): Dropout(p=0.0, inplace=False)
        (attention): GPTNeoXAttention(
          (rotary_emb): GPTNeoXRotaryEmbedding()
          (query_key_value): Linear(in_features=512, out_features=1536, bias=True)
          (dense): Linear(in_features=512, out_features=512, bias=True)
          (attention_dropout): Dropout(p=0.0, inplace=False)
        )
        (mlp): GPTNeoXMLP(
          (dense_h_to_4h): Linear(in_features=512, out_features=2048, bias=True)
          (dense_4h_to_h): Linear(in_features=2048, out_features=512, bias=True)
          (

In [None]:
def double_depth(model):
    """Return a new model with twice as many transformer layers as ``model``.

    A fresh model of the same class is built from ``model.config`` with
    ``num_hidden_layers`` doubled. Each source layer ``i`` is then copied into
    target layers ``2 * i`` and ``2 * i + 1`` (interleaved duplication); every
    weight that does not belong to a numbered layer (embeddings, final layer
    norm, output head, ...) is copied unchanged. ``model`` itself is not
    modified.

    Args:
        model: A model exposing ``config`` (supporting ``to_dict`` /
            ``from_dict`` with a ``num_hidden_layers`` entry) and whose
            per-layer ``state_dict`` keys contain a ``layers.<index>.``
            component, as ``GPTNeoXForCausalLM`` does.

    Returns:
        A newly constructed instance of ``type(model)`` with doubled depth.

    Raises:
        KeyError: If the config dict has no ``num_hidden_layers`` entry.
        RuntimeError: Propagated from ``load_state_dict`` when the duplicated
            keys do not match the parameters of the new model.
    """
    config = model.config

    # Build the configuration of the deeper model without touching the
    # original config object (to_dict returns a separate dict).
    new_config_dict = config.to_dict()
    new_config_dict['num_hidden_layers'] *= 2
    new_config = type(config).from_dict(new_config_dict)
    new_model = type(model)(new_config)

    # Matches the "layers.<index>." component of a state-dict key, either at
    # the start of the key or after a dot (e.g. "gpt_neox.layers.3.mlp...").
    layer_key = re.compile(r"(^|\.)layers\.(\d+)\.")

    new_state_dict = {}
    for key, weight in model.state_dict().items():
        match = layer_key.search(key)
        if match is None:
            # Not part of a numbered layer: carried over as is.
            new_state_dict[key] = weight
            continue

        old_idx = int(match.group(2))
        prefix, suffix = key[:match.start(2)], key[match.end(2):]
        # Source layer i feeds target layers 2i and 2i + 1. load_state_dict
        # copies values into the new parameters, so sharing the source tensor
        # between both entries does not alias the resulting weights.
        for new_idx in (2 * old_idx, 2 * old_idx + 1):
            new_state_dict[f"{prefix}{new_idx}{suffix}"] = weight

    new_model.load_state_dict(new_state_dict)
    return new_model

In [17]:
# Load Pythia-160M and its tokenizer (cache directory ../.cache/pythia-160m).
# NOTE(review): this repeats an earlier cell that loads the same checkpoint
# into the same variable names; it looks like a leftover from an earlier
# execution order -- confirm whether both cells are needed.
model_160m = GPTNeoXForCausalLM.from_pretrained(
  "EleutherAI/pythia-160m",
  cache_dir="../.cache/pythia-160m",
)

tokenizer_160m = AutoTokenizer.from_pretrained(
  "EleutherAI/pythia-160m",
  cache_dir="../.cache/pythia-160m",
)

# Generate a continuation of the fixed prompt as a reference output for the
# 160M model.
inputs = tokenizer_160m("Finish the following sentence:\nRaindrops on roses", return_tensors="pt")
tokens = model_160m.generate(**inputs)
tokenizer_160m.decode(tokens[0])

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


'Finish the following sentence:\nRaindrops on roses, and the flowers on the ground.\n'