In [None]:
! pip install transformers torch accelerate huggingface-hub huggingface-cli hf-transfer

In [12]:
def count_parameters(model):
    # Calculate the number of parameters in billions
    num_params = sum(p.numel() for p in model.parameters() if p.requires_grad) / 10**9
    print(f"Model size: {num_params:.3f}B parameters")
    return int(num_params)


## Load Reference Model

In [13]:
from transformers import AutoModelForCausalLM, AutoConfig, AutoTokenizer
import os

os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"

# Load meta-llama/Meta-Llama-3-8B model, config and tokenizer
model_name = "meta-llama/Meta-Llama-3-8B-Instruct"
model = AutoModelForCausalLM.from_pretrained(model_name)
config = AutoConfig.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [14]:
count_parameters(model)

Model size: 8.030B parameters


8

In [15]:
model

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm): LlamaRMSNorm()
  )
  (lm_head)

In [16]:
def extract_model_weights(reference_model, n_layers):
    params = {}
    current_layer = 0  # To keep track of the main layer count

    # Iterate over all named modules
    for name, module in reference_model.named_modules():

        # Check and store parameters
        if hasattr(module, 'weight') and module.weight is not None:
            params[name + '.weight'] = module.weight.data.clone()
        if hasattr(module, 'bias') and module.bias is not None:
            params[name + '.bias'] = module.bias.data.clone()

        if 'model.layers.' in name:
            # Check the layer index
            layer_index = int(name.split('.')[2])  # This splits the name and gets the third element
            if layer_index > current_layer:
                current_layer = layer_index
                if current_layer > n_layers-1:
                    break  # Stop after reaching the specified main layer

    norm_layer = model.model.norm  # Adjust this path based on your model's architecture
    if hasattr(norm_layer, 'weight') and norm_layer.weight is not None:
        params['model.norm.weight'] = norm_layer.weight.data.clone()
    if hasattr(norm_layer, 'bias') and norm_layer.bias is not None:
        params['model.norm.bias'] = norm_layer.bias.data.clone()

    lm_head = reference_model.lm_head
    if hasattr(lm_head, 'weight') and lm_head.weight is not None:
        params["lm_head.weight"] = lm_head.weight.data
    if hasattr(lm_head, 'bias') and lm_head.bias is not None:
        params["lm_head.bias"] = lm_head.bias.data

    return params


In [17]:
target_model_n_layers = 24
pretrained_weights = extract_model_weights(model, target_model_n_layers)

In [18]:
from transformers import AutoModelForCausalLM, AutoConfig
config = AutoConfig.from_pretrained(model_name)
config.num_hidden_layers = target_model_n_layers
target_model = AutoModelForCausalLM.from_config(config)


In [19]:
target_model_size = count_parameters(target_model)

Model size: 6.285B parameters


In [21]:
target_model.load_state_dict(pretrained_weights)

<All keys matched successfully>

In [22]:
inputs = tokenizer.apply_chat_template(
    [
        # {"content":"","role":"system"},
        {"content":"""Given the question: Read the article and select the best
         answer. Article: Can you swim? Do you like swimming? Well, how can you
         learn to swim? I think the best way is to go into the water and learn.
        I'm afraid you'll never learn to swim just by reading books about
        Swimming or looking at others swimming. It's the same with the English
        study. We must practice, practice and practice. Listening and speaking
        are very important for beginners. We can listen to English programs on radio.
        You may just understand a few words. It doesn't matter. Just be relaxed,
        try to catch every word. Somebody may be a good listener, but he is afraid
        to speak because he's afraid of making mistakes. You know we sometimes
        make mistakes when we speak Chinese. Don't be afraid. We must be brave.
        If you really want to learn English well, you must try to speak with
        everyone as long as he knows English. When there's nobody to talk with,
        you can talk to yourself in English. It's interesting and also a good
        way to practice your spoken English. Remember, the more you speak, the
        fewer mistakes you'll make. Reading and writing are more important for
        senior school students. First we must choose the books we're interested
        in. A lot of reading will improve your language sense.
        This is very important. It's easier said than done. Well, let's do
        more practice from now on. I'm sure you'll learn English well in this
        way. ,A, B, C, D,. (10)
        Question: Which is the best title for the passage?
        Options:
            A: How to Learn English.
            B: Easier Said Than Done.
            C: Listen First, Speak Second.
            D: How to learn to Swim.\n
        The answer is:""","role":"user"}
    ], add_generation_prompt=True, return_tensors='pt',
)

In [23]:
from transformers import TextStreamer
text_streamer = TextStreamer(tokenizer)
_ = target_model.generate(inputs, streamer = text_streamer, max_new_tokens = 128)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


<|begin_of_text|><|start_header_id|>user<|end_header_id|>

Given the question: Read the article and select the best
         answer. Article: Can you swim? Do you like swimming? Well, how can you
         learn to swim? I think the best way is to go into the water and learn.
        I'm afraid you'll never learn to swim just by reading books about
        Swimming or looking at others swimming. It's the same with the English
        study. We must practice, practice and practice. Listening and speaking
        are very important for beginners. We can listen to English programs on radio.
        You may just understand a few words. It doesn't matter. Just be relaxed,
        try to catch every word. Somebody may be a good listener, but he is afraid
        to speak because he's afraid of making mistakes. You know we sometimes
        make mistakes when we speak Chinese. Don't be afraid. We must be brave.
        If you really want to learn English well, you must try to speak with
      

In [None]:
target_model.push_to_hub("Llama-3-6B-Instruct-v0.1")
tokenizer.push_to_hub("Llama-3-6B-Instruct-v0.1")