In [56]:
from datasets import load_from_disk
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig
from accelerate import init_empty_weights
from scripts.dataset_utils import get_dataloader, InstructionDataset

### Tokenizer

In [3]:
tokenizer_8b = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3.1-8B-Instruct")
tokenizer_70b = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3.1-70B-Instruct")

In [10]:
# same tokenizer for 3.1 models
assert tokenizer_8b.special_tokens_map == tokenizer_70b.special_tokens_map
assert tokenizer_8b.vocab == tokenizer_70b.vocab

In [None]:
tokenizer_8b.

In [28]:
tokenizer_8b.all_special_tokens, tokenizer_8b.vocab['<|start_header_id|>']

(['<|begin_of_text|>', '<|eot_id|>'], 128006)

In [35]:
tokenizer_8b.convert_ids_to_tokens([tokenizer_8b.vocab_size+i for i in range(len(tokenizer_8b.vocab) - tokenizer_8b.vocab_size)])[:20]

['<|begin_of_text|>',
 '<|end_of_text|>',
 '<|reserved_special_token_0|>',
 '<|reserved_special_token_1|>',
 '<|finetune_right_pad_id|>',
 '<|reserved_special_token_2|>',
 '<|start_header_id|>',
 '<|end_header_id|>',
 '<|eom_id|>',
 '<|eot_id|>',
 '<|python_tag|>',
 '<|reserved_special_token_3|>',
 '<|reserved_special_token_4|>',
 '<|reserved_special_token_5|>',
 '<|reserved_special_token_6|>',
 '<|reserved_special_token_7|>',
 '<|reserved_special_token_8|>',
 '<|reserved_special_token_9|>',
 '<|reserved_special_token_10|>',
 '<|reserved_special_token_11|>']

### Data

In [41]:
ds = load_from_disk("/workspace/data/llama_large_mix_dataset_v0_1536")

In [42]:
ds

Dataset({
    features: ['input_text', 'output_text', 'input_token_ids', 'output_token_ids', 'token_length'],
    num_rows: 90417
})

In [47]:
idx = 50
sample = ds[idx]

In [48]:
print(sample['input_text'])

<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 26 Jul 2024

You are a helpful AI assistant.<|eot_id|><|start_header_id|>user<|end_header_id|>

What is the effect of decreasing the pH on the formation of the copper(II) EDTA complex in a solution containing a fixed concentration of copper(II) ion and excess EDTA? Provide a complete explanation using relevant chemical equations and equilibrium constants.<|eot_id|><|start_header_id|>assistant<|end_header_id|>




In [49]:
print(sample['output_text'])

When the pH of a solution containing a fixed concentration of copper(II) ion and excess EDTA is decreased, the formation of the copper(II) EDTA complex is affected. To understand this, let's first look at the relevant chemical equations and equilibrium constants.

The formation of the copper(II) EDTA complex can be represented by the following equation:

Cu²⁺ + EDTA⁴⁻ ⇌ CuEDTA²⁻

The equilibrium constant for this reaction is given by:

Kf = [CuEDTA²⁻] / ([Cu²⁺][EDTA⁴⁻])

Now, let's consider the effect of decreasing the pH on this equilibrium. When the pH is decreased, the concentration of H⁺ ions in the solution increases. EDTA is a polyprotic ligand, meaning it can lose multiple protons. The fully deprotonated form of EDTA is EDTA⁴⁻, which is the form that binds to the Cu²⁺ ions. However, when the pH is decreased, some of the EDTA⁴⁻ will pick up protons and form its protonated species (HEDTA³⁻, H₂EDTA²⁻, H₃EDTA⁻, and H₄EDTA). The equilibria for these protonation reactions can be repre

In [57]:
args = {"dataset": "/workspace/data/llama_large_mix_dataset_v0_1536", "seed":42, "batch_size":8, 
        "gradient_accumulation_steps":1}
sft_ds = InstructionDataset(ds, tokenizer_8b, style='local')

In [62]:
len(sft_ds), sft_ds[0].keys()

(90417, dict_keys(['input_ids', 'labels', 'attention_mask']))

In [64]:
print(tokenizer_8b.decode(sft_ds[0]['input_ids']))

<|begin_of_text|><|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 26 Jul 2024

You are a helpful AI assistant.<|eot_id|><|start_header_id|>user<|end_header_id|>

Provide the best response to a given instruction. Follow the following steps to craft your response:
1. reason about the given instruction
2. provide a high-quality solution
3. offer a concise explanation
4. write tests to verify the correctness your solution

## Example 1
### Instruction
Here are two special formulas:

$$
f_1(a, b) = (a + 1) \cdot (b + 1) - 1
$$

$$
f_2(k) = \begin{cases}
    \frac{(k + 1)^2}{2} + k + 1 & \text{if } k \text{ is odd} \\
    \frac{k^2}{2} + 2k + 1 & \text{if } k \text{ is even}
\end{cases}
$$

Write a Python function to return $f_2(f_1(a, b))$ for given `a` and `b`.

### Response
[Reasoning]
Based on the formulas you provided, we can define two Python functions, `f1(a, b)` and `f2(k)`, respectively, and then combine them to calculate

In [67]:
sft_ds[0]['labels']

[-100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,

### Unfreeze Layernorm

In [14]:
cfg = AutoConfig.from_pretrained("meta-llama/Meta-Llama-3.1-8B-Instruct")
with init_empty_weights():
	model = AutoModelForCausalLM.from_config(cfg)
	model_type = cfg.model_type

In [17]:
for n,p in model.named_parameters():
    if "norm" in n:
        print(n, p.numel())

model.layers.0.input_layernorm.weight 4096
model.layers.0.post_attention_layernorm.weight 4096
model.layers.1.input_layernorm.weight 4096
model.layers.1.post_attention_layernorm.weight 4096
model.layers.2.input_layernorm.weight 4096
model.layers.2.post_attention_layernorm.weight 4096
model.layers.3.input_layernorm.weight 4096
model.layers.3.post_attention_layernorm.weight 4096
model.layers.4.input_layernorm.weight 4096
model.layers.4.post_attention_layernorm.weight 4096
model.layers.5.input_layernorm.weight 4096
model.layers.5.post_attention_layernorm.weight 4096
model.layers.6.input_layernorm.weight 4096
model.layers.6.post_attention_layernorm.weight 4096
model.layers.7.input_layernorm.weight 4096
model.layers.7.post_attention_layernorm.weight 4096
model.layers.8.input_layernorm.weight 4096
model.layers.8.post_attention_layernorm.weight 4096
model.layers.9.input_layernorm.weight 4096
model.layers.9.post_attention_layernorm.weight 4096
model.layers.10.input_layernorm.weight 4096
model.