In [1]:
%load_ext autoreload
%autoreload 2
import torch
import megatron
import sys
import os
# Set CUDA_DEVICE_MAX_CONNECTIONS for this process before Megatron is
# initialized at the bottom of this cell.
os.environ['CUDA_DEVICE_MAX_CONNECTIONS'] = '1'

# Emulated command line. The backslash-newline pairs are inside one string
# literal, so they act as line continuations: `arg` is a single
# whitespace-separated string with no newlines, and its first token ends up
# as sys.argv[0].
arg ="pretrain_gpt2.py \
    --num-layers 12 \
    --hidden-size 768 \
    --num-attention-heads 12 \
    --seq-length 2048 \
    --max-position-embeddings 2048 \
    --micro-batch-size 4 \
    --global-batch-size 128 \
    --lr 1.5e-4 \
    --min-lr 1.5e-5 \
    --hidden-dropout 0.0 \
    --attention-dropout 0.0 \
    --train-iters 100 \
    --lr-decay-iters 100 \
    --lr-decay-style cosine \
    --use-parallel-residual \
    --weight-decay 1e-1 \
    --lr-warmup-fraction .01 \
    --clip-grad 1.0 \
    --normalization LayerNorm \
    --position-embedding-type rope \
    --rotary-percent 0.25 \
    --use-mcore-models \
    --untie-embeddings-and-output-weights \
    --tokenizer-type HFTokenizer \
    --vocab-file /workspace/gpt-neox-tokenizer.json \
    --load /data/pythia/pythia-160m-megatron \
    --save /data/pythia/pythia-160m-megatron/ft-test \
    --save-interval 100 \
    --finetune \
"
# Show the resulting token list, then install it as this process's argv.
print(arg.split())
sys.argv = arg.split()
# presumably initialize_megatron reads its configuration from sys.argv — TODO confirm.
# skip_mpu_initialization=True is passed; the parallel state is set by hand in
# the next cell.
megatron.initialize_megatron(skip_mpu_initialization=True)

  from .autonotebook import tqdm as notebook_tqdm


['pretrain_gpt2.py', '--num-layers', '12', '--hidden-size', '768', '--num-attention-heads', '12', '--seq-length', '2048', '--max-position-embeddings', '2048', '--micro-batch-size', '4', '--global-batch-size', '128', '--lr', '1.5e-4', '--min-lr', '1.5e-5', '--hidden-dropout', '0.0', '--attention-dropout', '0.0', '--train-iters', '100', '--lr-decay-iters', '100', '--lr-decay-style', 'cosine', '--use-parallel-residual', '--weight-decay', '1e-1', '--lr-warmup-fraction', '.01', '--clip-grad', '1.0', '--normalization', 'LayerNorm', '--position-embedding-type', 'rope', '--rotary-percent', '0.25', '--use-mcore-models', '--untie-embeddings-and-output-weights', '--tokenizer-type', 'HFTokenizer', '--vocab-file', '/workspace/gpt-neox-tokenizer.json', '--load', '/data/pythia/pythia-160m-megatron', '--save', '/data/pythia/pythia-160m-megatron/ft-test', '--save-interval', '100', '--finetune']
using world size: 1, data-parallel size: 1, context-parallel size: 1 tensor-model-parallel size: 1, pipeline-

In [2]:
# Preamble to set pipelining stuff manually
import megatron.core.parallel_state as ps
from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed
# Single-process layout: rank 0 in a world of size 1, first for the pipeline
# axis and then for the tensor axis (same call order as writing the four
# setters out one by one).
for set_rank, set_world_size in (
    (ps.set_pipeline_model_parallel_rank, ps.set_pipeline_model_parallel_world_size),
    (ps.set_tensor_model_parallel_rank, ps.set_tensor_model_parallel_world_size),
):
    set_rank(0)
    set_world_size(1)

# Underscore-prefixed helper of parallel_state, invoked directly here.
ps._set_global_memory_buffer()
# Fixed seed passed to the model-parallel CUDA seeding helper.
model_parallel_cuda_manual_seed(1234)

In [3]:
from pretrain_gpt import train_valid_test_datasets_provider
from megatron.training import get_model, load_checkpoint
from megatron.core.enums import ModelType
from pretrain_gpt import get_args, core_transformer_config_from_args, print_rank_0, GPTModel
from megatron.core.models.gpt import gpt_layer_specs
from megatron.global_vars import get_tokenizer

# Notebook-level handle on the object returned by get_args(); its `iteration`
# and `num_floating_point_operations_so_far` attributes are written after the
# checkpoint is loaded further down in this cell.
args = get_args()
def model_provider(pre_process=True, post_process=True):
    """Build the GPT model that is handed to `get_model`.

    Args:
        pre_process: Forwarded unchanged to `GPTModel`.
        post_process: Forwarded unchanged to `GPTModel`.

    Returns:
        A `GPTModel` built from the arguments returned by `get_args()`, using
        the layer spec from `gpt_layer_specs.get_gpt_layer_local_spec`.

    Raises:
        AssertionError: If `use_mcore_models` is falsy on the arguments.
    """
    cli_args = get_args()
    print_rank_0('building GPT model ...')

    # Transformer configuration derived from the parsed arguments.
    transformer_config = core_transformer_config_from_args(cli_args)
    assert cli_args.use_mcore_models

    model_kwargs = dict(
        config=transformer_config,
        transformer_layer_spec=gpt_layer_specs.get_gpt_layer_local_spec(None, None),
        vocab_size=cli_args.padded_vocab_size,
        max_sequence_length=cli_args.max_position_embeddings,
        pre_process=pre_process,
        post_process=post_process,
        fp16_lm_cross_entropy=cli_args.fp16_lm_cross_entropy,
        parallel_output=True,
        # Embedding and output weights are shared unless the untie flag is set.
        share_embeddings_and_output_weights=not cli_args.untie_embeddings_and_output_weights,
        position_embedding_type=cli_args.position_embedding_type,
        rotary_percent=cli_args.rotary_percent,
    )
    return GPTModel(**model_kwargs)
    
# Build the model through get_model without the DDP wrapper; the result is
# indexed as model[0] in later cells.
model = get_model(
    model_provider,
    ModelType.encoder_or_decoder,
    wrap_with_ddp=False,
)

# NOTE(review): the two None arguments are presumably the optimizer and LR
# scheduler slots of load_checkpoint — confirm against its signature.
checkpoint_state = load_checkpoint(model, None, None)
args.iteration, args.num_floating_point_operations_so_far = checkpoint_state

tokenizer = get_tokenizer()

building GPT model ...
 > number of parameters on (tensor, pipeline) model parallel rank (0, 0): 162322944
 loading checkpoint from /data/pythia/pythia-160m-megatron at iteration 1
could not find arguments in the checkpoint ...
Converting checkpoint from *LayerNormLinear
Converting checkpoint from *LayerNormLinear
loaded model with incompat keys: <All keys matched successfully>
 checkpoint version 0
 succesfully fixed query-key-values ordering for checkpoint version 0
  successfully loaded checkpoint from /data/pythia/pythia-160m-megatron at iteration 0


In [4]:
from megatron.utils import get_ltor_masks_and_position_ids

with torch.no_grad():
    # Prompt token ids with a leading batch dimension, moved to the GPU.
    prompt_ids = tokenizer.tokenize("My name is Julien and I like to")
    tokens = torch.tensor(prompt_ids).unsqueeze(0).cuda()

    # No EOD token and no resets are requested; the middle return value is
    # not used here.
    mask_options = dict(
        eod_token=None,
        reset_position_ids=False,
        reset_attention_mask=False,
        eod_mask_loss=False,
    )
    attention_mask, _, position_ids = get_ltor_masks_and_position_ids(
        data=tokens, **mask_options
    )

    res = model[0](tokens, position_ids, attention_mask)

# Displayed as the cell output: shapes of the logits, mask and position ids.
tuple(t.shape for t in (res, attention_mask, position_ids))



(torch.Size([1, 9, 50304]), torch.Size([1, 1, 9, 9]), torch.Size([1, 9]))

In [5]:
from transformers import AutoModelForCausalLM, AutoTokenizer
# The reference model and its tokenizer are loaded from the same pretrained
# identifier; the model is cast with .float() after loading.
hf_checkpoint = "EleutherAI/pythia-160m"
hf_model = AutoModelForCausalLM.from_pretrained(hf_checkpoint).float()
hf_tokenizer = AutoTokenizer.from_pretrained(hf_checkpoint)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [6]:
with torch.no_grad():
    # Same prompt as the Megatron forward pass, tokenized by the HF tokenizer.
    hf_tokenizer_out = hf_tokenizer("My name is Julien and I like to", return_tensors="pt")
    print(hf_tokenizer_out)

    hf_outputs = hf_model(**hf_tokenizer_out)
    hf_res = hf_outputs.logits

    # Highest-scoring token id at each position of the first batch element.
    hf_top1_ids = hf_res[0].argmax(dim=-1)
    print(hf_tokenizer.decode(hf_top1_ids))

{'input_ids': tensor([[3220, 1416,  310, 9218, 1914,  285,  309,  751,  281]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1]])}
 name is Alexi and I'm to call


In [7]:
# Highest-scoring token id at each position of the Megatron logits, decoded
# with the HF tokenizer so the text can be compared with the HF cell above.
mg_top1_ids = res[0].argmax(dim=-1)
hf_tokenizer.decode(mg_top1_ids)

" name is Alexi and I'm to call"

In [8]:
from megatron.core.models.gpt.gpt_model import global_buffers
# Print the first seven keys of global_buffers, one per line.
buffer_names = list(global_buffers.keys())
for buffer_name in buffer_names[:7]:
    print(buffer_name)

# Module structure of the first HF transformer layer, for reference.
print(hf_model.gpt_neox.layers[0])

decoder_input
rotary_pos_emb
layer.1.input_layernorm_output
layer.1.attn_qkv_output
layer.1.attn_post_rotary_qkv
layer.1.post_core_attn
layer.1.post_self_attn
GPTNeoXLayer(
  (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  (post_attention_dropout): Dropout(p=0.0, inplace=False)
  (post_mlp_dropout): Dropout(p=0.0, inplace=False)
  (attention): GPTNeoXAttention(
    (rotary_emb): GPTNeoXRotaryEmbedding()
    (query_key_value): Linear(in_features=768, out_features=2304, bias=True)
    (dense): Linear(in_features=768, out_features=768, bias=True)
    (attention_dropout): Dropout(p=0.0, inplace=False)
  )
  (mlp): GPTNeoXMLP(
    (dense_h_to_4h): Linear(in_features=768, out_features=3072, bias=True)
    (dense_4h_to_h): Linear(in_features=3072, out_features=768, bias=True)
    (act): GELUActivation()
  )
)


# Embeddings

In [None]:
# Embedding check: the captured Megatron decoder input must equal both the HF
# embedding lookup and a direct lookup into Megatron's word-embedding table.
with torch.no_grad():
    # The captured buffer has its first two axes swapped relative to the HF
    # output, hence the transpose before comparing.
    dcinp_res = global_buffers["decoder_input"].transpose(0, 1).cpu()
    # HF embedding of the same token ids, computed once and reused.
    hf_embeddings = hf_model.gpt_neox.embed_in(tokens.cpu())
    # Rows of Megatron's embedding matrix selected by the token ids.
    mg_embeddings = model[0].embedding.word_embeddings.weight[tokens].cpu()

assert torch.allclose(dcinp_res, hf_embeddings), \
    "Megatron decoder_input differs from HF embed_in output"
assert torch.allclose(mg_embeddings, dcinp_res), \
    "Megatron word-embedding lookup differs from captured decoder_input"

## After input layernorm

In [None]:
# HF side: embed the tokens, then apply the first layer's input layernorm.
hf_first_layer = hf_model.gpt_neox.layers[0]
with torch.no_grad():
    hf_embin = hf_model.gpt_neox.embed_in(tokens.cpu())
    hf_1inp_ln = hf_first_layer.input_layernorm(hf_embin)

# Megatron side: the buffer named "layer.1.*" is compared against HF
# layers[0]; its first two axes are swapped to match the HF tensor.
mg_1inp_ln = (
    global_buffers["layer.1.input_layernorm_output"]
    .cpu()
    .transpose(0, 1)
    .contiguous()
)
print(hf_1inp_ln.shape, mg_1inp_ln.shape)

assert torch.allclose(hf_1inp_ln, mg_1inp_ln, atol=1e-6)

torch.Size([1, 9, 768]) torch.Size([1, 9, 768])


## After attention

### Whole Block

In [None]:

with torch.no_grad():
    hf_attention = hf_model.gpt_neox.layers[0].attention
    # NOTE(review): the tokenizer's 0/1 mask is passed straight through; HF
    # attention may expect an additive mask here — confirm for the installed
    # transformers version.
    hf_1attn_out, _ = hf_attention(
        hf_1inp_ln,
        attention_mask=hf_tokenizer_out["attention_mask"],
        position_ids=None,
    )
    # Add the embedding output in place so the result is comparable with
    # Megatron's "post_self_attn" buffer.
    hf_1attn_out.add_(hf_embin)

mg_1attn_out = global_buffers["layer.1.post_self_attn"].cpu().transpose(0, 1)

# Eyeball the first ten features of the first token from each side.
print(hf_1attn_out[0, 0, :10])
print(mg_1attn_out[0, 0, :10])
assert torch.allclose(mg_1attn_out, hf_1attn_out, atol=1e-5)

tensor([-0.1574, -0.4843,  0.2528,  0.5275,  0.1467, -0.0559, -0.1719, -0.1283,
        -0.1033, -0.1658])
tensor([-0.1574, -0.4843,  0.2528,  0.5275,  0.1467, -0.0559, -0.1719, -0.1283,
        -0.1033, -0.1658])


### After attention QKV

In [None]:
# HF side: fused query/key/value projection of the layernormed input.
hf_qkv_proj = hf_model.gpt_neox.layers[0].attention.query_key_value
with torch.no_grad():
    hf_1qkv = hf_qkv_proj(hf_1inp_ln)

# Megatron side: captured QKV output with its first two axes swapped.
mg_1qkv = global_buffers["layer.1.attn_qkv_output"].transpose(0, 1).cpu()

print(hf_1qkv.shape, mg_1qkv.shape)
assert torch.allclose(hf_1qkv, mg_1qkv, atol=1e-6)


torch.Size([1, 9, 2304]) torch.Size([1, 9, 2304])


### After rotary embedding

In [None]:
def rotate_half(x):
    """Swap the two halves of the last dimension, negating the second half.

    The last dimension is split at ``x.shape[-1] // 2``; the result is the
    negated trailing part followed by the leading part.
    """
    split_at = x.shape[-1] // 2
    leading, trailing = x[..., :split_at], x[..., split_at:]
    return torch.cat((-trailing, leading), dim=-1)

# Copied from transformers.models.mistral.modeling_mistral.apply_rotary_pos_emb
def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1):
    """Rotate the query and key tensors with rotary position embeddings.

    Args:
        q (`torch.Tensor`): Query tensor.
        k (`torch.Tensor`): Key tensor.
        cos (`torch.Tensor`): Cosine table, indexed by position.
        sin (`torch.Tensor`): Sine table, indexed by position.
        position_ids (`torch.Tensor`):
            Positions used to index `cos` and `sin`. Offset positions can be
            passed here, e.g. when working with a KV-cache.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            Dimension inserted into `cos[position_ids]` and
            `sin[position_ids]` so they broadcast against `q` and `k`. With
            `cos[position_ids]` of shape [batch_size, seq_len, head_dim], use 1
            for q/k shaped [batch_size, heads, seq_len, head_dim] and 2 for
            q/k shaped [batch_size, seq_len, heads, head_dim].

    Returns:
        `tuple(torch.Tensor)`: the rotated query and the rotated key.
    """
    # Select the rows for the requested positions and add the broadcast axis.
    cos_at_pos = cos[position_ids].unsqueeze(unsqueeze_dim)
    sin_at_pos = sin[position_ids].unsqueeze(unsqueeze_dim)

    def _rotate(states):
        # x * cos + rotate_half(x) * sin, applied identically to q and k.
        return (states * cos_at_pos) + (rotate_half(states) * sin_at_pos)

    return _rotate(q), _rotate(k)

def qkv_forward(self, qkv, layer_past=None):
    """Split a fused QKV projection into heads and apply partial rotary embedding.

    Args:
        self: Attention module providing `num_attention_heads`, `head_size`,
            `rotary_ndims` and a callable `rotary_emb`.
        qkv: Fused projection of shape
            [batch, seq_len, num_attention_heads * 3 * head_size].
        layer_past: Optional cached state; only `layer_past[0].shape[-2]` is
            read, as the number of positions already cached. Cached keys and
            values are NOT concatenated onto the returned tensors.

    Returns:
        Tuple `(query, key, value)`, each of shape
        [batch, num_attention_heads, seq_len, head_size]. The first
        `rotary_ndims` features of query and key are rotated; the remaining
        features and the whole value tensor pass through unchanged.
    """
    has_layer_past = layer_past is not None
    # Positions of the new tokens start after any cached positions, so that
    # cos/sin are indexed consistently with the extended seq_len below.
    past_len = layer_past[0].shape[-2] if has_layer_past else 0
    position_ids = torch.arange(
        past_len, past_len + qkv.shape[1], dtype=torch.long, device=qkv.device
    ).unsqueeze(0)

    new_qkv_shape = qkv.size()[:-1] + (self.num_attention_heads, 3 * self.head_size)
    qkv = qkv.view(*new_qkv_shape)

    # [batch, seq_len, num_attention_heads, 3 * head_size] --> 3 [batch, num_attention_heads, seq_len, head_size]
    query = qkv[..., : self.head_size].permute(0, 2, 1, 3)
    key = qkv[..., self.head_size : 2 * self.head_size].permute(0, 2, 1, 3)
    value = qkv[..., 2 * self.head_size :].permute(0, 2, 1, 3)

    # Only the first rotary_ndims features are rotated; the rest pass through.
    query_rot = query[..., : self.rotary_ndims]
    query_pass = query[..., self.rotary_ndims :]
    key_rot = key[..., : self.rotary_ndims]
    key_pass = key[..., self.rotary_ndims :]

    # The cos/sin tables must cover cached plus current positions.
    seq_len = key.shape[-2] + past_len
    cos, sin = self.rotary_emb(value, seq_len=seq_len)
    query, key = apply_rotary_pos_emb(query_rot, key_rot, cos, sin, position_ids)
    query = torch.cat((query, query_pass), dim=-1)
    key = torch.cat((key, key_pass), dim=-1)
    return query, key, value

# First HF transformer layer; it is compared against the Megatron buffers
# named "layer.1.*".
hf_1 = hf_model.gpt_neox.layers[0]

with torch.no_grad():
    # (query, key, value) produced by qkv_forward from the HF fused QKV output.
    hf_1post_rotary = qkv_forward(hf_1.attention, hf_1qkv)
    # Copy into a list so the first three entries can be replaced in place.
    mg_1post_rotary = list(global_buffers["layer.1.attn_post_rotary_qkv"])
    for i in range(3):
        # Reorder axes to line up with the HF tensors —
        # presumably [seq, batch, heads, head_dim] -> [batch, heads, seq, head_dim]; TODO confirm.
        mg_1post_rotary[i] = mg_1post_rotary[i].permute(1, 2, 0, 3).contiguous().cpu()

In [None]:
# Compare the HF and Megatron tensors pairwise, in the order stored
# (query, key, value on the HF side).
for hf_part, mg_part in zip(hf_1post_rotary, mg_1post_rotary):
    print(hf_part.shape, mg_part.shape)
    assert torch.allclose(hf_part, mg_part, atol=1e-5)

torch.Size([1, 12, 9, 64]) torch.Size([1, 12, 9, 64])
torch.Size([1, 12, 9, 64]) torch.Size([1, 12, 9, 64])
torch.Size([1, 12, 9, 64]) torch.Size([1, 12, 9, 64])


### Post Core Attention

In [None]:
with torch.no_grad():
    hf_1attn = hf_1.attention
    hf_query, hf_key, hf_value = hf_1post_rotary
    # NOTE(review): the tokenizer's 0/1 mask is passed as the attention mask
    # and None as the last argument — confirm this matches what `_attn`
    # expects in the installed transformers version.
    hf_1attn_output, hf_1attn_weights = hf_1attn._attn(
        hf_query, hf_key, hf_value, hf_tokenizer_out["attention_mask"], None
    )
    # Fold the per-head outputs back into a single hidden dimension.
    hf_1attn_output = hf_1attn._merge_heads(
        hf_1attn_output, hf_1attn.num_attention_heads, hf_1attn.head_size
    )

mg_1attn_postcore = global_buffers["layer.1.post_core_attn"].transpose(0, 1).cpu()
print(hf_1attn_output.shape, mg_1attn_postcore.shape)

# NOTE(review): only index [0, 0] (first batch element, first position) is
# compared here, not the full tensors — confirm this is intentional.
assert torch.allclose(hf_1attn_output[0, 0], mg_1attn_postcore[0, 0], atol=1e-6)

torch.Size([1, 9, 768]) torch.Size([1, 9, 768])


### Post linear projection bias/output

In [None]:
with torch.no_grad():
    hf_1dense_output = hf_1attn.dense(hf_1attn_output)
    mg_1dense_output = global_buffers["layer.1.attn_dense_output"].transpose(0,1).cpu()