In [1]:
import transformers
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model
from datasets import load_dataset
from transformers import AutoTokenizer, Trainer, AutoModelForCausalLM, BitsAndBytesConfig
import torch
import os

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Define model and project details
model_name = 'microsoft/phi-2'
project = "Evol-Instruct-Code-80k-v1-10-precent"

# Extract base model name from the model name
# Use -1 to handle cases where model_name doesn't contain "/"
base_model_name = model_name.split("/")[-1]

# Construct run name and output directory
run_name = f"{base_model_name}-{project}"
output_dir = f"./{run_name}"

# Define device for training
device = 'cpu'

In [3]:
# Load the dataset from the HuggingFace Hub
huggingface_dataset_name = "nickrosh/Evol-Instruct-Code-80k-v1"
# For the sake of this example, we will only use the first 10% of the training set
train_dataset = load_dataset(huggingface_dataset_name, split="train[:10%]")
print(train_dataset)

Dataset({
    features: ['instruction', 'output'],
    num_rows: 7826
})


In [4]:
# ...Or load a generator from a local file
# Download the dataset
! wget 'https://huggingface.co/datasets/nickrosh/Evol-Instruct-Code-80k-v1/resolve/main/EvolInstruct-Code-80k.json?download=true' -O './EvolInstruct-Code-80k.json'

data_files = {"train": "./EvolInstruct-Code-80k.json"}
train_dataset = load_dataset(
    "json", data_files=data_files, split="train", streaming=True)
print(train_dataset)

--2024-01-12 17:27:06--  https://huggingface.co/datasets/nickrosh/Evol-Instruct-Code-80k-v1/resolve/main/EvolInstruct-Code-80k.json?download=true
Loaded CA certificate '/etc/ssl/certs/ca-certificates.crt'
Resolving huggingface.co (huggingface.co)... 13.224.167.24, 13.224.167.105, 13.224.167.123, ...
Connecting to huggingface.co (huggingface.co)|13.224.167.24|:443... connected.
HTTP request sent, awaiting response... 

302 Found
Location: https://cdn-lfs.huggingface.co/repos/a8/e2/a8e206c6c64ab0de37467edf7c35ade23c3f66faefa205ec52ca31b0a15f5ec8/df31f664fe2db3c2e3e7c34774fdb2b1ad37e526ebfaa8f68d822087c5ab8d8d?response-content-disposition=attachment%3B+filename*%3DUTF-8%27%27EvolInstruct-Code-80k.json%3B+filename%3D%22EvolInstruct-Code-80k.json%22%3B&response-content-type=application%2Fjson&Expires=1705310826&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTcwNTMxMDgyNn19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy5odWdnaW5nZmFjZS5jby9yZXBvcy9hOC9lMi9hOGUyMDZjNmM2NGFiMGRlMzc0NjdlZGY3YzM1YWRlMjNjM2Y2NmZhZWZhMjA1ZWM1MmNhMzFiMGExNWY1ZWM4L2RmMzFmNjY0ZmUyZGIzYzJlM2U3YzM0Nzc0ZmRiMmIxYWQzN2U1MjZlYmZhYThmNjhkODIyMDg3YzVhYjhkOGQ%7EcmVzcG9uc2UtY29udGVudC1kaXNwb3NpdGlvbj0qJnJlc3BvbnNlLWNvbnRlbnQtdHlwZT0qIn1dfQ__&Signature=GRC3FLf6WpGQt4enNvlsnqSuXinG95NUx6rAJAFKuWGxVX%7Eabp2go%7EN7WMZ1Zu0w9zOqswpeTaDAzAmUXJYilpG2LCkDLsPYSr6S%7EcMrM99MuaADk4aI8JWlubcrH97WUjiU1PtUD1czddcNBCCd4rbkC9

In [5]:
# ...Or load the whole dataset from a local file if the dataset is small enough
data_files = {"train": "./EvolInstruct-Code-80k.json"}
train_dataset = load_dataset(
    "json", data_files=data_files, split="train[:10%]")
print(train_dataset)

Dataset({
    features: ['output', 'instruction'],
    num_rows: 7826
})


For more on information about creating your own dataset, please see https://huggingface.co/docs/datasets/create_dataset

In [6]:
# Sadly, we can only train quantized model on GPU using BitsandBytes, which is a warpper for CUDA.
# For cpu, use float32.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float32,
    trust_remote_code=True,
    # This is the PR that adds gradient checkpointing. It is not merged yet. This kind of thing is the cost of being on the bleeding edge.
    revision="refs/pr/23",
)
model.to(device)

Loading checkpoint shards: 100%|██████████| 2/2 [00:06<00:00,  3.29s/it]


PhiForCausalLM(
  (transformer): PhiModel(
    (embd): Embedding(
      (wte): Embedding(51200, 2560)
      (drop): Dropout(p=0.0, inplace=False)
    )
    (h): ModuleList(
      (0-31): 32 x ParallelBlock(
        (ln): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)
        (resid_dropout): Dropout(p=0.1, inplace=False)
        (mixer): MHA(
          (rotary_emb): RotaryEmbedding()
          (Wqkv): Linear(in_features=2560, out_features=7680, bias=True)
          (out_proj): Linear(in_features=2560, out_features=2560, bias=True)
          (inner_attn): SelfAttention(
            (drop): Dropout(p=0.0, inplace=False)
          )
          (inner_cross_attn): CrossAttention(
            (drop): Dropout(p=0.0, inplace=False)
          )
        )
        (mlp): MLP(
          (fc1): Linear(in_features=2560, out_features=10240, bias=True)
          (fc2): Linear(in_features=10240, out_features=2560, bias=True)
          (act): NewGELUActivation()
        )
      )
    )
  )
  (lm

In [7]:
# Load and config the tokenizer

tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    padding_side="left",
    add_eos_token=True,
    use_fast=True,
)

# add special tokens for ChatML formatting and a pad token
tokenizer.add_tokens(["<|im_start|>", "<PAD>"])
tokenizer.pad_token = "<PAD>"
tokenizer.add_special_tokens(dict(eos_token="<|im_end|>"))
# For the sake of memory and time, we will only use 64 tokens
tokenizer.model_max_length = 64
# resize model embeddings
model.resize_token_embeddings(
    new_num_tokens=len(tokenizer),
    pad_to_multiple_of=64)   # phi2 default is 64, see configuration_phi.py
model.config.eos_token_id = tokenizer.eos_token_id

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [8]:
# Define the tokenizing function to tokenize the dataset
def tokenize_function(data_point):
    def get_prompt(inst):
        return f"""
        # System:
        You are a helpful AI assistant. Follow the instruction. 
        # INSTRUCTION:
        {inst}
        # CODE:
        """
    prompts = [get_prompt(inst) for inst in data_point['instruction']]
    data_point['input_ids'] = tokenizer(
        prompts,
        truncation=True,
        max_length=tokenizer.model_max_length,
        padding='max_length',
    ).input_ids
    data_point['labels'] = tokenizer(
        data_point['output'],
        truncation=True,
        max_length=tokenizer.model_max_length,
        padding='max_length',
    ).input_ids
    return data_point


tokenized_train_dataset = train_dataset.map(
    tokenize_function, batched=True, num_proc=os.cpu_count())

Map (num_proc=32): 100%|██████████| 7826/7826 [00:00<00:00, 12096.94 examples/s]


In [9]:
# Apply the Peft adapter
config = LoraConfig(
    r=1,
    lora_alpha=16,
    target_modules=[
        'Wqkv', 'out_proj'
    ],
    bias="none",
    lora_dropout=0.05,
    # because we added new tokens
    modules_to_save=["lm_head", "embed_tokens"],
    task_type="CAUSAL_LM",
)

model = get_peft_model(model, config)

In [10]:
# We can see the number of trainable parameters are only few percent of the original model
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )


print_trainable_parameters(model)

trainable params: 129325184 || all params: 2904420608 || trainable%: 4.4527016384536


In [11]:
# Define the trainer
trainer = transformers.Trainer(
    model=model,
    train_dataset=tokenized_train_dataset,
    args=transformers.TrainingArguments(
        output_dir=output_dir,
        warmup_steps=0,
        per_device_train_batch_size=1,
        gradient_accumulation_steps=1,
        max_steps=100,
        learning_rate=2.5e-5,
        lr_scheduler_type="constant",
        logging_steps=50,
        dataloader_num_workers=4,
        optim="adamw_hf",
        logging_dir="./logs",        # Directory for storing logs
        save_strategy="steps",       # Save the model checkpoint every logging step
        save_steps=50,                # Save checkpoints every 50 steps
        report_to="tensorboard",
    ),
    data_collator=transformers.DataCollatorForLanguageModeling(
        tokenizer, mlm=False),
)

In [12]:
# Train!
trainer.train()
model.save_pretrained(output_dir)

You're using a CodeGenTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a CodeGenTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a CodeGenTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a CodeGenTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
50,1.9401
100,1.2197


Checkpoint destination directory ./phi-2-Evol-Instruct-Code-80k-v1-10-precent/checkpoint-50 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./phi-2-Evol-Instruct-Code-80k-v1-10-precent/checkpoint-100 already exists and is non-empty.Saving will proceed but saved results may be invalid.
