# Grow 70M Pythia to 410M Pythia

Model size table:

| Params | n_layers | d_model | n_heads | d_head | Hugging Face Checkpoints |
| ------ | -------- | ------- | ------- | ------ | ------------------------ |
|   70M  |    6     |   512   |    8    |   64   | [Standard](https://huggingface.co/EleutherAI/pythia-70m) |
|  410M  |    24    |  1024   |    16   |   64   | [Standard](https://huggingface.co/EleutherAI/pythia-410m) |


Based on the code in https://github.com/allenai/staged-training/blob/main/gpt_pretrain.py.


## Instantiate Pythia 70M

In [1]:
from transformers import GPTNeoXForCausalLM, AutoTokenizer
import torch
import re
import copy
import json
import importlib
import sys

In [7]:
# Add the parent directory and its src/ folder to the import search path
# (in that order) so modules located there can be imported from this notebook.
sys.path.extend(['..', '../src'])

In [4]:
# Load the pretrained Pythia-70M checkpoint, caching files under ../.cache/pythia-70m.
model_70m = GPTNeoXForCausalLM.from_pretrained(
  "EleutherAI/pythia-70m",
  cache_dir="../.cache/pythia-70m",
)

# Load the tokenizer from the same checkpoint name and cache directory.
tokenizer_70m = AutoTokenizer.from_pretrained(
  "EleutherAI/pythia-70m",
  cache_dir="../.cache/pythia-70m",
)

# Baseline sample: generate a continuation for a fixed prompt. The same prompt
# is reused in later cells so the outputs of the grown models can be compared.
inputs = tokenizer_70m("Finish the following sentence:\nRaindrops on roses", return_tensors="pt")
tokens = model_70m.generate(**inputs)
# Trailing expression: the decoded string is what the notebook shows as cell output.
tokenizer_70m.decode(tokens[0])

Downloading model.safetensors:   0%|          | 0.00/166M [00:00<?, ?B/s]

Downloading tokenizer_config.json:   0%|          | 0.00/396 [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


'Finish the following sentence:\nRaindrops on roses\n\nI have a question for you.'

## Use function-preserving transformations to grow the model to 410M

In [11]:
import importlib

In [12]:
# Reload grow_depth so that edits made to the module on disk are picked up.
# NOTE(review): in top-to-bottom order this cell comes before the
# `import grow_depth` cell, so it would raise NameError on a fresh kernel —
# confirm the intended cell order.
importlib.reload(grow_depth)

<module 'grow_depth' from '/Users/vibhamasti/Personal/CMU/F24/Capstone/lazy-pretrain/notebooks/../src/grow_depth.py'>

In [8]:
import grow_depth
import grow_width_hyper

In [7]:
# Reload grow_depth again to pick up the latest edits to the module on disk.
importlib.reload(grow_depth)

<module 'grow_depth' from '/Users/vibhamasti/Personal/CMU/S24/Capstone/code/LoRA-Instruction-Finetune/src/grow_depth.py'>

### Output of pre-trained 410M model

In [9]:
# Load the pretrained Pythia-410M checkpoint (the reference the grown model is
# compared against), caching files under ../.cache/pythia-410m.
model_410m = GPTNeoXForCausalLM.from_pretrained(
  "EleutherAI/pythia-410m",
  cache_dir="../.cache/pythia-410m",
)

# Load the tokenizer from the same checkpoint name and cache directory.
tokenizer_410m = AutoTokenizer.from_pretrained(
  "EleutherAI/pythia-410m",
  cache_dir="../.cache/pythia-410m",
)

# Reference sample: same prompt as the 70M baseline, generated by the pretrained 410M model.
inputs = tokenizer_410m("Finish the following sentence:\nRaindrops on roses", return_tensors="pt")
tokens = model_410m.generate(**inputs)
# Trailing expression: the decoded string is what the notebook shows as cell output.
tokenizer_410m.decode(tokens[0])

Downloading config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/911M [00:00<?, ?B/s]

Downloading tokenizer_config.json:   0%|          | 0.00/396 [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


'Finish the following sentence:\nRaindrops on roses,\n\nA:\n\nI think'

In [13]:
# Grow depth in two expand_layers calls, both with expand_type='alternate':
# first with (6, 12) on the original 70M model, then with (12, 24) on the result.
# The copied_layers flags returned by the first call are fed into the second call.
# NOTE(review): presumably the two integers are the current and target layer
# counts — confirm against grow_depth.expand_layers.
model_70m_deep, copied_layers = grow_depth.expand_layers(model_70m, 6, 12, expand_type='alternate')
model_70m_deep, copied_layers = grow_depth.expand_layers(model_70m_deep, 12, 24, expand_type='alternate', copied_layers=copied_layers)

In [14]:
copied_layers

[True,
 True,
 True,
 False,
 True,
 True,
 True,
 False,
 True,
 True,
 True,
 False,
 True,
 True,
 True,
 False,
 True,
 True,
 True,
 False,
 True,
 True,
 True,
 False]

In [20]:

# Set the trainability of every transformer layer from its `copied_layers` flag:
# layers flagged True keep gradients enabled, layers flagged False are frozen.
# NOTE(review): presumably True marks a layer that was newly copied during
# expansion — confirm against grow_depth.expand_layers.
for i, is_copied in enumerate(copied_layers):
    parent_module = model_70m_deep.gpt_neox.layers[i]
    trainable = bool(is_copied)

    # One pass per layer: the flag decides both the assignment and what is logged.
    for name, param in parent_module.named_parameters():
        param.requires_grad = trainable
        print(f'Param {name} requires grad: {param.requires_grad}')
        

Param input_layernorm.weight requires grad: True
Param input_layernorm.bias requires grad: True
Param post_attention_layernorm.weight requires grad: True
Param post_attention_layernorm.bias requires grad: True
Param attention.query_key_value.weight requires grad: True
Param attention.query_key_value.bias requires grad: True
Param attention.dense.weight requires grad: True
Param attention.dense.bias requires grad: True
Param mlp.dense_h_to_4h.weight requires grad: True
Param mlp.dense_h_to_4h.bias requires grad: True
Param mlp.dense_4h_to_h.weight requires grad: True
Param mlp.dense_4h_to_h.bias requires grad: True
Param input_layernorm.weight requires grad: True
Param input_layernorm.bias requires grad: True
Param post_attention_layernorm.weight requires grad: True
Param post_attention_layernorm.bias requires grad: True
Param attention.query_key_value.weight requires grad: True
Param attention.query_key_value.bias requires grad: True
Param attention.dense.weight requires grad: True
Par

In [7]:
# Sanity check: generate from the depth-grown model with the same prompt as the
# 70M baseline so the two outputs can be compared.
inputs = tokenizer_70m("Finish the following sentence:\nRaindrops on roses", return_tensors="pt")
tokens = model_70m_deep.generate(**inputs)
# Trailing expression: the decoded string is what the notebook shows as cell output.
tokenizer_70m.decode(tokens[0])

Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


'Finish the following sentence:\nRaindrops on roses\n\nI have a question for you.'

In [8]:
# Widen the depth-grown model with expand_width(…, 512, 1024); the resulting
# config shown further down reports hidden_size 1024.
# NOTE(review): presumably the two integers are the current and target hidden
# sizes — confirm against expand_width.
# NOTE(review): `grow_width` is not imported in any visible cell (only
# `grow_width_hyper` is), so this raises NameError on a fresh kernel —
# confirm which module is intended.
model_70m_wide = grow_width.expand_width(model_70m_deep, 512, 1024)

In [9]:
# Sanity check: generate from the depth- and width-grown model with the same
# prompt as the 70M baseline so the outputs can be compared.
inputs = tokenizer_70m("Finish the following sentence:\nRaindrops on roses", return_tensors="pt")
tokens = model_70m_wide.generate(**inputs)
# Trailing expression: the decoded string is what the notebook shows as cell output.
tokenizer_70m.decode(tokens[0])

Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


'Finish the following sentence:\nRaindrops on roses\n\nI have a question for you about'

In [10]:
model_70m_wide

GPTNeoXForCausalLM(
  (gpt_neox): GPTNeoXModel(
    (embed_in): Embedding(50304, 1024)
    (emb_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-23): 24 x GPTNeoXLayer(
        (input_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (post_attention_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (post_attention_dropout): Dropout(p=0.0, inplace=False)
        (post_mlp_dropout): Dropout(p=0.0, inplace=False)
        (attention): GPTNeoXAttention(
          (rotary_emb): GPTNeoXRotaryEmbedding()
          (query_key_value): Linear(in_features=1024, out_features=3072, bias=True)
          (dense): Linear(in_features=1024, out_features=1024, bias=True)
          (attention_dropout): Dropout(p=0.0, inplace=False)
        )
        (mlp): GPTNeoXMLP(
          (dense_h_to_4h): Linear(in_features=1024, out_features=4096, bias=True)
          (dense_4h_to_h): Linear(in_features=4096, out_features=1024, bias=True)
  

In [11]:
model_70m_wide.config

GPTNeoXConfig {
  "_name_or_path": "EleutherAI/pythia-70m-expand-width-1024",
  "architectures": [
    "GPTNeoXForCausalLM"
  ],
  "attention_bias": true,
  "attention_dropout": 0.0,
  "bos_token_id": 0,
  "classifier_dropout": 0.1,
  "eos_token_id": 0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 2048,
  "model_type": "gpt_neox",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "rope_scaling": null,
  "rotary_emb_base": 10000,
  "rotary_pct": 0.25,
  "tie_word_embeddings": false,
  "torch_dtype": "float16",
  "transformers_version": "4.39.3",
  "use_cache": true,
  "use_parallel_residual": true,
  "vocab_size": 50304
}

In [12]:
model_410m

GPTNeoXForCausalLM(
  (gpt_neox): GPTNeoXModel(
    (embed_in): Embedding(50304, 1024)
    (emb_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-23): 24 x GPTNeoXLayer(
        (input_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (post_attention_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (post_attention_dropout): Dropout(p=0.0, inplace=False)
        (post_mlp_dropout): Dropout(p=0.0, inplace=False)
        (attention): GPTNeoXAttention(
          (rotary_emb): GPTNeoXRotaryEmbedding()
          (query_key_value): Linear(in_features=1024, out_features=3072, bias=True)
          (dense): Linear(in_features=1024, out_features=1024, bias=True)
          (attention_dropout): Dropout(p=0.0, inplace=False)
        )
        (mlp): GPTNeoXMLP(
          (dense_h_to_4h): Linear(in_features=1024, out_features=4096, bias=True)
          (dense_4h_to_h): Linear(in_features=4096, out_features=1024, bias=True)
  

In [13]:
model_410m.config

GPTNeoXConfig {
  "_name_or_path": "EleutherAI/pythia-410m",
  "architectures": [
    "GPTNeoXForCausalLM"
  ],
  "attention_bias": true,
  "attention_dropout": 0.0,
  "bos_token_id": 0,
  "classifier_dropout": 0.1,
  "eos_token_id": 0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 2048,
  "model_type": "gpt_neox",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "rope_scaling": null,
  "rotary_emb_base": 10000,
  "rotary_pct": 0.25,
  "tie_word_embeddings": false,
  "torch_dtype": "float16",
  "transformers_version": "4.39.3",
  "use_cache": true,
  "use_parallel_residual": true,
  "vocab_size": 50304
}

## 410M to 1.4B

In [17]:
# Load the pretrained Pythia-1.4B checkpoint, caching files under ../.cache/pythia-1.4b.
model_1_4b = GPTNeoXForCausalLM.from_pretrained(
  "EleutherAI/pythia-1.4b",
  cache_dir="../.cache/pythia-1.4b",
)

In [18]:
# Reference sample from the pretrained 1.4B model, using the same prompt as before.
# NOTE(review): this reuses tokenizer_70m instead of loading a 1.4B tokenizer —
# presumably the Pythia checkpoints share one tokenizer; confirm.
inputs = tokenizer_70m("Finish the following sentence:\nRaindrops on roses", return_tensors="pt")
tokens = model_1_4b.generate(**inputs)
# Trailing expression: the decoded string is what the notebook shows as cell output.
tokenizer_70m.decode(tokens[0])

Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


'Finish the following sentence:\nRaindrops on roses,\n\nI love you.\n\n'

In [19]:
# Reload grow_width so that edits made to the module on disk are picked up.
# NOTE(review): no visible cell imports `grow_width` (only `grow_width_hyper`
# is imported), so this raises NameError on a fresh kernel — confirm which
# module is intended.
importlib.reload(grow_width)

<module 'grow_width' from '/Users/vibhamasti/Personal/CMU/S24/Capstone/code/LoRA-Instruction-Finetune/src/grow_width.py'>

In [20]:
# Widen the pretrained 410M model with expand_width(…, 1024, 2048, attn_heads=16);
# the resulting config shown further down reports hidden_size 2048 and
# num_attention_heads 16.
# NOTE(review): presumably the two integers are the current and target hidden
# sizes — confirm against expand_width.
model_410m_grown = grow_width.expand_width(model_410m, 1024, 2048, attn_heads=16)

In [21]:
# Sanity check: generate from the widened 410M model with the same prompt so the
# output can be compared with the pretrained 410M sample.
inputs = tokenizer_70m("Finish the following sentence:\nRaindrops on roses", return_tensors="pt")
tokens = model_410m_grown.generate(**inputs)
# Trailing expression: the decoded string is what the notebook shows as cell output.
tokenizer_70m.decode(tokens[0])

Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


'Finish the following sentence:\nRaindrops on roses,\n\nA:\n\nI think'

In [22]:
model_410m_grown.config

GPTNeoXConfig {
  "_name_or_path": "EleutherAI/pythia-410m-expand-width-2048",
  "architectures": [
    "GPTNeoXForCausalLM"
  ],
  "attention_bias": true,
  "attention_dropout": 0.0,
  "bos_token_id": 0,
  "classifier_dropout": 0.1,
  "eos_token_id": 0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_size": 2048,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 2048,
  "model_type": "gpt_neox",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "rope_scaling": null,
  "rotary_emb_base": 10000,
  "rotary_pct": 0.25,
  "tie_word_embeddings": false,
  "torch_dtype": "float16",
  "transformers_version": "4.38.2",
  "use_cache": true,
  "use_parallel_residual": true,
  "vocab_size": 50304
}

In [10]:
# model_70m.embed_out