In [1]:
from datetime import datetime
import transformers
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model
from datasets import load_dataset
from transformers import AutoTokenizer, Trainer, AutoModelForCausalLM, BitsAndBytesConfig
import torch
import os

  from .autonotebook import tqdm as notebook_tqdm


In [10]:
project = "Evol-Instruct-Code-80k-v1-10-precent"
base_model_name = "phi-2"
run_name = base_model_name + "-" + project
output_dir = "./" + run_name

In [2]:
huggingface_dataset_name = "nickrosh/Evol-Instruct-Code-80k-v1"
# For the sake of this example, we will only use the first 10% of the training set
train_dataset = load_dataset(huggingface_dataset_name, split="train[:10%]")
print(train_dataset)

Dataset({
    features: ['instruction', 'output'],
    num_rows: 7826
})


In [6]:
model_name = 'microsoft/phi-2'


bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    trust_remote_code=True,
    # This is the PR that adds gradient checkpointing. It is not merged yet. This kind of thing is the cost of being on the bleeding edge.
    revision="refs/pr/23",
)

model = prepare_model_for_kbit_training(model)

modeling_phi.py: 100%|██████████| 33.7k/33.7k [00:00<00:00, 174kB/s]
Downloading shards: 100%|██████████| 2/2 [00:01<00:00,  1.92it/s]
Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.02it/s]
You are using an old version of the checkpointing format that is deprecated (We will also silently ignore `gradient_checkpointing_kwargs` in case you passed it).Please update to the new format on your modeling file. To use the new format, you need to completely remove the definition of the method `_set_gradient_checkpointing` in your model.


In [23]:
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    padding_side="left",
    add_eos_token=True,
    use_fast=True,
)

# add special tokens for ChatML formatting and a pad token
tokenizer.add_tokens(["<|im_start|>", "<PAD>"])
tokenizer.pad_token = "<PAD>"
tokenizer.add_special_tokens(dict(eos_token="<|im_end|>"))

# resize model embeddings
model.resize_token_embeddings(
    new_num_tokens=len(tokenizer),
    pad_to_multiple_of=64)   # phi2 default is 64, see configuration_phi.py
model.config.eos_token_id = tokenizer.eos_token_id

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [9]:
def get_prompt(inst):
    return f"""
    # System:
    You are a helpful AI assistant. Follow the instruction. 
    # INSTRUCTION:
    {inst}
    # CODE:
    """


def tokenize_function(data_point):
    prompts = [get_prompt(inst) for inst in data_point['instruction']]
    data_point['input_ids'] = tokenizer(
        prompts,
        truncation=True,
        max_length=tokenizer.model_max_length,
        padding='max_length',
    ).input_ids
    data_point['labels'] = tokenizer(
        data_point['output'],
        truncation=True,
        max_length=tokenizer.model_max_length,
        padding='max_length',
    ).input_ids
    return data_point


tokenized_train_dataset = train_dataset.map(
    tokenize_function, batched=True, num_proc=os.cpu_count())

Map (num_proc=32): 100%|██████████| 7826/7826 [00:00<00:00, 11640.09 examples/s]


In [17]:
# print state dict
print(model.state_dict().keys())

odict_keys(['transformer.embd.wte.weight', 'transformer.h.0.ln.weight', 'transformer.h.0.ln.bias', 'transformer.h.0.mixer.Wqkv.weight', 'transformer.h.0.mixer.Wqkv.bias', 'transformer.h.0.mixer.Wqkv.weight.absmax', 'transformer.h.0.mixer.Wqkv.weight.quant_map', 'transformer.h.0.mixer.Wqkv.weight.nested_absmax', 'transformer.h.0.mixer.Wqkv.weight.nested_quant_map', 'transformer.h.0.mixer.Wqkv.weight.quant_state.bitsandbytes__nf4', 'transformer.h.0.mixer.out_proj.weight', 'transformer.h.0.mixer.out_proj.bias', 'transformer.h.0.mixer.out_proj.weight.absmax', 'transformer.h.0.mixer.out_proj.weight.quant_map', 'transformer.h.0.mixer.out_proj.weight.nested_absmax', 'transformer.h.0.mixer.out_proj.weight.nested_quant_map', 'transformer.h.0.mixer.out_proj.weight.quant_state.bitsandbytes__nf4', 'transformer.h.0.mlp.fc1.weight', 'transformer.h.0.mlp.fc1.bias', 'transformer.h.0.mlp.fc1.weight.absmax', 'transformer.h.0.mlp.fc1.weight.quant_map', 'transformer.h.0.mlp.fc1.weight.nested_absmax', 'tra

In [26]:


def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )


config = LoraConfig(
    r=4,
    lora_alpha=16,
    target_modules=[
        'Wqkv', 'out_proj'
    ],
    bias="none",
    lora_dropout=0.05,
    # because we added new tokens
    modules_to_save=["lm_head", "embed_tokens"],
    task_type="CAUSAL_LM",
)

model = get_peft_model(model, config)
print_trainable_parameters(model)

trainable params: 130799744 || all params: 1647603968 || trainable%: 7.938785444828451


In [28]:
trainer = transformers.Trainer(
    model=model,
    train_dataset=tokenized_train_dataset,
    # eval_dataset=tokenized_eval_dataset,
    args=transformers.TrainingArguments(
        output_dir=output_dir,
        warmup_steps=0,
        per_device_train_batch_size=4,
        gradient_accumulation_steps=1,
        max_steps=100,
        learning_rate=2.5e-5,
        lr_scheduler_type="constant",
        logging_steps=50,
        dataloader_num_workers=4,
        bf16=True,
        optim="paged_adamw_8bit",
        logging_dir="./logs",        # Directory for storing logs
        save_strategy="steps",       # Save the model checkpoint every logging step
        save_steps=50,                # Save checkpoints every 50 steps
        report_to="tensorboard",
        # Name of the W&B run (optional)
        # run_name=f"{run_name}-{datetime.now().strftime('%Y-%m-%d-%H-%M')}"
    ),
    data_collator=transformers.DataCollatorForLanguageModeling(
        tokenizer, mlm=False),
)

In [29]:
trainer.train()

You're using a CodeGenTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a CodeGenTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a CodeGenTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a CodeGenTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
50,2.5213
100,1.7437


TrainOutput(global_step=100, training_loss=2.1325, metrics={'train_runtime': 34.3687, 'train_samples_per_second': 11.638, 'train_steps_per_second': 2.91, 'total_flos': 1706260640563200.0, 'train_loss': 2.1325, 'epoch': 0.05})

In [30]:
model.save_pretrained(output_dir)