# Overview

In this notebook we fine-tune Microsoft Phi-2 using QLoRA.

In [None]:
%%capture
!pip install transformers==4.36.2
!pip install accelerate==0.25.0
!pip install datasets==2.15.0
!pip install peft==0.7.1
!pip install bitsandbytes==0.41.3

In [None]:
import os
import torch
from huggingface_hub import login
from kaggle_secrets import UserSecretsClient
# Credentials are read from the secrets client at runtime, so no token is hard-coded here.
user_secrets=UserSecretsClient()
login(token=user_secrets.get_secret("HUGGINGFACE_TOKEN"))

# Run configuration lives in environment variables; later cells read it back with os.getenv.
os.environ.update({
    "WANDB_API_KEY": user_secrets.get_secret("WANDB_API_KEY"),
    "WANDB_PROJECT": "Fine-tuning Microsoft-phi-2",
    "WANDB_NAME": "ft-microsoft-phi-2",
    "MODEL_NAME": "microsoft/phi-2",
    "DATASET": "g-ronimo/riddles_evolved",
})

# Request deterministic cuDNN kernels.
torch.backends.cudnn.deterministic=True

In [None]:
# Run accelerate's estimate-memory command for the model named by the MODEL_NAME environment variable.
!accelerate estimate-memory ${MODEL_NAME} --library_name transformers

# Loading Dataset

In [None]:
from datasets import load_dataset

# Load only the first 500 rows of the train split.
dataset=load_dataset(os.getenv("DATASET"), split="train[:500]")
# Fixed seed so the 90/10 train/test split is the same on every Restart & Run All;
# set_seed() is only called later, in the training cell, and uses the same value.
dataset=dataset.train_test_split(test_size=0.1, seed=2024)
print(dataset["train"][0])
dataset

# Loading Tokenizer

* Loading tokenizer
* Adding chatML tokens to tokenizer

In [None]:
from transformers import AutoTokenizer

tokenizer=AutoTokenizer.from_pretrained(os.getenv("MODEL_NAME"), use_fast=False)
print(len(tokenizer)) # size before adding tokens

# add chatML tokens to tokenizer
tokenizer.add_tokens(["<|im_start|>","<PAD>"])
tokenizer.pad_token="<PAD>"
# <|im_end|> closes every ChatML message, so it is registered as the EOS token
tokenizer.add_special_tokens(dict(eos_token="<|im_end|>"))
print(len(tokenizer)) # size after adding tokens

# Preprocess Data

We apply the ChatML format to the data and then tokenize it.

In [None]:
from functools import partial

# ChatML wrappers for a single message; {msg} is filled in with the message text.
ASSISTANT_TEMPLATE='<|im_start|>assistant\n{msg}<|im_end|>'
USER_TEMPLATE='<|im_start|>user\n{msg}<|im_end|>'

# Order matters: the list is indexed with a bool (False -> assistant, True -> user).
templates=[ASSISTANT_TEMPLATE, USER_TEMPLATE]

# Label value used to mask out tokens that should not contribute to the loss.
IGNORE_INDEX=-100

def preprocess_func(input, max_length):
    """Turn one conversation into ChatML token ids with assistant-only labels.

    Messages at even positions are rendered with the user template and their
    label positions are filled with IGNORE_INDEX; messages at odd positions are
    rendered with the assistant template and keep their token ids as labels.

    Args:
        input: a dataset row; its "messages" entry is iterated in order.
        max_length: maximum number of tokens kept; longer samples are cut at the end.

    Returns:
        dict with "input_ids", "attention_mask" and "labels" lists of equal length.
    """
    input_ids, attention_mask, labels=[],[],[]
    for i, msg in enumerate(input["messages"]):
        isHuman=i%2==0
        # bool index: False -> templates[0] (assistant), True -> templates[1] (user)
        msg_chatml=templates[isHuman].format(msg=msg)
        msg_tokenized=tokenizer(msg_chatml,truncation=False, add_special_tokens=False)
        msg_ids=msg_tokenized["input_ids"]

        input_ids+=msg_ids
        attention_mask+=msg_tokenized["attention_mask"]
        # only assistant tokens are kept as training targets
        labels+=[IGNORE_INDEX]*len(msg_ids) if isHuman else msg_ids

    return {
        "input_ids": input_ids[:max_length],
        "attention_mask": attention_mask[:max_length],
        "labels": labels[:max_length]
    }

# max sample length 1024 tokens, enough for the dataset
tokenize_sample=partial(preprocess_func, max_length=1024)

# One row at a time, spread over all CPU cores
dataset_tokenized=dataset.map(
    tokenize_sample,
    batched=False,
    num_proc=os.cpu_count(),
    remove_columns=dataset["train"].column_names, # do not need this anymore, we have tokens from here on
)

## Visualizing the data

In [None]:
import matplotlib.pyplot as plt

# Token count of every sample, train split first and then test split
data=[
    len(ids)
    for split in ("train", "test")
    for ids in dataset_tokenized[split]["input_ids"]
]
longest=max(data)
print(f"Longest sample: {longest} tokens")

plt.hist(data, bins=10)
plt.show()

# Batch the Data

In [None]:
def collate(example):
    """Pad a list of tokenized samples to a common length and stack them into tensors.

    Args:
        example: list of dicts, each with "input_ids", "labels" and
            "attention_mask" lists of the same length.

    Returns:
        dict with "input_ids", "labels" and "attention_mask" tensors, one row per
        sample, each row padded on the right to the longest sample in the batch.
    """
    tokens_maxlen=max(len(e["input_ids"]) for e in example)

    input_ids, labels, attention_mask=[],[],[]
    for sample in example:
        pad_len=tokens_maxlen-len(sample["input_ids"])

        # Build new lists instead of extending in place, so the caller's samples are left untouched
        input_ids.append(list(sample["input_ids"])+pad_len*[tokenizer.pad_token_id])
        # padded positions are masked out of the labels and the attention mask
        labels.append(list(sample["labels"])+pad_len*[IGNORE_INDEX])
        attention_mask.append(list(sample["attention_mask"])+pad_len*[0])

    batch={
        "input_ids":torch.tensor(input_ids),
        "labels":torch.tensor(labels),
        "attention_mask": torch.tensor(attention_mask)
    }

    return batch

# Loading Model

* Quantization
* Freeze the original weights
* LoRA

There is no need to resize the token embeddings: phi-2 already has embeddings sized for additional tokens. The model's vocabulary size is 51200, which means you can add ~700 tokens to the tokenizer without having to resize the embeddings.

In [None]:
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

# 4-bit NF4 quantization with double quantization; computation is done in bfloat16
quant_settings=dict(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    llm_int8_enable_fp32_cpu_offload=True,
)
bnb_config=BitsAndBytesConfig(**quant_settings)

model=AutoModelForCausalLM.from_pretrained(
    os.getenv("MODEL_NAME"),
    device_map="auto",
    torch_dtype=torch.bfloat16,
    quantization_config=bnb_config,
    trust_remote_code=True, # for phi-2
)

# Generation should stop at the tokenizer's EOS token
model.config.eos_token_id=tokenizer.eos_token_id
# reducing memory usage
model.gradient_checkpointing_enable()
# Show the embedding layer so its size can be compared with len(tokenizer)
print(model.model.embed_tokens)

def print_trainable_parameters(model):
    """Print how many of the model's parameters require gradients.

    Args:
        model: any object whose named_parameters() yields (name, parameter)
            pairs, where each parameter has numel() and requires_grad.
    """
    sizes=[(param.numel(), param.requires_grad) for _, param in model.named_parameters()]
    all_params=sum(count for count, _ in sizes)
    trainable_params=sum(count for count, needs_grad in sizes if needs_grad)
    print(f"trainable params: {trainable_params} || all params: {all_params} || trainable%: {100 * trainable_params/all_params:.2f}")

# Parameter counts of the freshly loaded quantized model, before k-bit preparation and LoRA
print_trainable_parameters(model)

# Freeze weights and apply LoRA

In [None]:
from peft import prepare_model_for_kbit_training

# Prepare the quantized model for training, with gradient checkpointing switched on
prepared_model=prepare_model_for_kbit_training(model, use_gradient_checkpointing=True)

# Parameter counts after preparation, followed by the full module tree
print_trainable_parameters(prepared_model)
print(prepared_model)

In [None]:
from peft import LoraConfig, TaskType, get_peft_model

lora_config=LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=32,
    lora_alpha=32,
    lora_dropout=0.1,
    bias="none",
    # layers that receive LoRA adapters
    target_modules=["q_proj", "k_proj", "v_proj", "dense"],
    # we added new tokens to the tokenizer, so these modules must be trained and saved in full
    modules_to_save=["lm_head", "embed_tokens"],
)

# Wrap the prepared model with the adapters described above
lora_model=get_peft_model(prepared_model, lora_config)
lora_model.config.use_cache=False

print_trainable_parameters(lora_model)
print(lora_model)

# Training

In [None]:
from transformers import set_seed, TrainingArguments, Trainer

set_seed(2024)

bs=4          # per-device train batch size
bs_eval=16    # per-device eval batch size
ga_steps=16   # gradient accumulation steps
lr=0.00002
epochs=3

# Optimizer steps per epoch = samples // (batch size * accumulation steps); this ignores the device count.
# Clamped to at least 1 so eval_steps/save_steps never become 0 on a small dataset.
steps_per_epoch=max(1, len(dataset_tokenized["train"])//(bs*ga_steps))

args=TrainingArguments(
    output_dir=os.getenv("WANDB_NAME"),
    per_device_train_batch_size=bs,
    per_device_eval_batch_size=bs_eval,
    evaluation_strategy="steps",
    logging_steps=1,
    eval_steps=max(1, steps_per_epoch//2), # evaluate about twice per epoch
    save_steps=steps_per_epoch,            # checkpoint once per epoch
    gradient_accumulation_steps=ga_steps,
    num_train_epochs=epochs,
    lr_scheduler_type="constant",
    optim="paged_adamw_32bit", # val_loss will go nan with paged_adamw_8bit
    learning_rate=lr,
    group_by_length=False,
    fp16=True,
    ddp_find_unused_parameters=False
)

trainer=Trainer(
    model=lora_model,
    tokenizer=tokenizer,
    args=args,
    data_collator=collate, # pads each batch to its longest sample
    train_dataset=dataset_tokenized["train"],
    eval_dataset=dataset_tokenized["test"]
)

trainer.train()

In [None]:
# Metadata passed through to trainer.push_to_hub
kwargs=dict(
    model_name=f'{os.getenv("WANDB_NAME")}',
    finetuned_from=os.getenv('MODEL_NAME'),
    dataset=os.getenv("DATASET"),
)

# Upload the tokenizer (with the added tokens) first, then the trained model
tokenizer.push_to_hub(os.getenv("WANDB_NAME"))
trainer.push_to_hub(**kwargs)

# Inference

In [None]:
import gc

# The trainer keeps its own reference to the model, so it must be deleted too;
# otherwise dropping the other names does not release the model.
del trainer, tokenizer, lora_model, prepared_model, model

gc.collect()
torch.cuda.empty_cache()

In [None]:
# Rebuild the tokenizer exactly as for training so the added tokens get the same ids
tokenizer=AutoTokenizer.from_pretrained(os.getenv("MODEL_NAME"), use_fast=False)
tokenizer.add_tokens(["<|im_start|>","<PAD>"])
tokenizer.pad_token="<PAD>"
tokenizer.add_special_tokens(dict(eos_token="<|im_end|>"))

# The template must reproduce the training format '<|im_start|>{role}\n{msg}<|im_end|>':
# a newline separates the role from the content, and the generation prompt ends with one too.
tokenizer.chat_template=(
    "{% if not add_generation_prompt is defined%}{% set add_generation_prompt=false%}{%endif%}"
    "{%for message in messages%}"
    "{{'<|im_start|>'+message['role']+'\n'+message['content']+'<|im_end|>'}}"
    "{%endfor%}"
    "{%if add_generation_prompt%}{{'<|im_start|>assistant\n'}}{%endif%}"
)

In [None]:
# Load the base model again, this time without a quantization config, in bfloat16
model=AutoModelForCausalLM.from_pretrained(
    os.getenv("MODEL_NAME"),
    device_map="auto",
    torch_dtype=torch.bfloat16,
)
model.config.eos_token_id=tokenizer.eos_token_id

In [None]:
from peft import PeftModel
from transformers import GenerationConfig

# Default sampling settings stored alongside the merged model
generation_config=GenerationConfig(
    max_new_tokens=100,
    temperature=0.7,
    top_p=0.1,
    top_k=40,
    repetition_penalty=1.18,
    do_sample=True,
    pad_token_id=tokenizer.pad_token_id,
    eos_token_id=tokenizer.eos_token_id
)

# Load the trained adapter from the training output directory and fold it into the base weights
model=PeftModel.from_pretrained(model, os.getenv("WANDB_NAME"))
model=model.merge_and_unload()

# Model, tokenizer and generation config all go into the same "<name>-merged" directory
merged_dir=os.getenv("WANDB_NAME")+"-merged"
model.save_pretrained(merged_dir, safe_serialization=True, max_shard_size="4GB")
tokenizer.save_pretrained(merged_dir)
generation_config.save_pretrained(merged_dir)

# Credit

* https://medium.com/@geronimo7/phinetuning-2-0-28a2be6de110
* https://github.com/geronimi73/phi2-finetune/blob/main/nb_qlora.ipynb