<a href="https://colab.research.google.com/github/Abinayasankar-co/finetuningworks/blob/main/DPO.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install the fine-tuning stack used below: transformers (model/tokenizer),
# trl (SFTTrainer / DPOTrainer) and peft (LoRA adapters).
# NOTE(review): versions are unpinned. The trainer cells further down pass
# `tokenizer=`, `beta=`, `max_prompt_length=` and `max_seq_length=` straight to
# the trainers; confirm the installed trl release still accepts those keywords.
%pip install transformers trl peft

In [None]:
# bitsandbytes provides the 4-bit quantisation kernels used when loading the model.
# `%pip` (rather than `!pip`) installs into the environment of the running kernel.
%pip install -q -U bitsandbytes

In [None]:
import torch
from tqdm import tqdm
import pandas as pd


tqdm.pandas()

from transformers import pipeline, AutoTokenizer
from datasets import load_dataset

from trl import DPOTrainer,SFTTrainer
from peft import LoraConfig,PeftModel,get_peft_model,prepare_model_for_kbit_training

In [None]:
def chatml_format(example):
    """Turn one preference record into ChatML-formatted DPO fields.

    Args:
        example: Mapping with the string keys ``"instruction"``,
            ``"chosen_response"`` and ``"rejected_response"``.

    Returns:
        dict with three strings:
        ``"prompt"``   - system turn, user turn and an opened assistant turn,
        ``"chosen"``   - preferred completion closed with ``<|im_end|>``,
        ``"rejected"`` - dispreferred completion closed the same way.

    Raises:
        KeyError: if one of the expected keys is missing from ``example``.
    """
    system_message = (
        "You are Supportive AI assistance working. "
        "Generate a detailed long answer on it."
    )

    # Every ChatML turn is "<|im_start|>{role}\n{content}<|im_end|>\n".
    # The prompt ends with an opened assistant turn so that both completions
    # continue from exactly the same position.
    prompt = (
        "<|im_start|>system\n" + system_message + "<|im_end|>\n"
        + "<|im_start|>user\n" + example["instruction"] + "<|im_end|>\n"
        + "<|im_start|>assistant\n"
    )

    # Both completions get the same terminator; otherwise the pair would
    # differ by a trailing newline that has nothing to do with preference.
    chosen = example["chosen_response"] + "<|im_end|>\n"
    rejected = example["rejected_response"] + "<|im_end|>\n"

    return {
        "prompt": prompt,
        "chosen": chosen,
        "rejected": rejected,
    }

In [None]:
# Tiny hand-written preference set. Each inner tuple below is one
# (prompt, chosen, rejected) example; the inner zip transposes the rows into
# columns and the outer zip attaches the column names, giving a dict of
# three equally long lists.
dpo_dataset_dict = {
    column: list(values)
    for column, values in zip(
        ("prompt", "chosen", "rejected"),
        zip(
            ("hello", "hi nice to meet you", "leave me alone"),
            ("how are you", "I am fine", "I am not fine"),
            ("What is your name?", "My name is Mary", "Whats it to you?"),
            ("What is your name?", "My name is Mary", "I dont have a name"),
            ("Which is the best programming language?", "Python", "Javascript"),
            ("Which is the best programming language?", "Python", "C++"),
            ("Which is the best programming language?", "Java", "C++"),
        ),
    )
}

In [None]:
# Load the preference dataset by its repository id; the result is inspected
# in the next cell and handed to the trainers later on.
dataset = load_dataset(path="Anthropic/hh-rlhf")

In [None]:
# Bare expression: shows the repr of whatever `load_dataset` returned so its
# structure can be checked before training.
dataset

In [None]:
# Declare the DPO hyper-parameters (this cell only defines the container;
# nothing is parsed or instantiated here).
@dataclass
class ScriptArguments:
    """
    The arguments for the DPO training script.

    Every attribute is a dataclass field with a default value and a ``help``
    string stored in its metadata, so ``ScriptArguments()`` yields a complete
    default configuration and individual values can be overridden by keyword.
    """

    # --- data parameters ---------------------------------------------------
    beta: Optional[float] = field(default=0.1, metadata={"help": "the beta parameter for DPO loss"})

    # --- training parameters -----------------------------------------------
    model_name_or_path: Optional[str] = field(
        default="../sft/results/final_checkpoint",
        metadata={"help": "the location of the SFT model name or path"},
    )
    learning_rate: Optional[float] = field(default=5e-4, metadata={"help": "optimizer learning rate"})
    lr_scheduler_type: Optional[str] = field(default="cosine", metadata={"help": "the lr scheduler type"})
    warmup_steps: Optional[int] = field(default=100, metadata={"help": "the number of warmup steps"})
    weight_decay: Optional[float] = field(default=0.05, metadata={"help": "the weight decay"})
    optimizer_type: Optional[str] = field(default="paged_adamw_32bit", metadata={"help": "the optimizer type"})

    per_device_train_batch_size: Optional[int] = field(default=4, metadata={"help": "train batch size per device"})
    per_device_eval_batch_size: Optional[int] = field(default=1, metadata={"help": "eval batch size per device"})
    gradient_accumulation_steps: Optional[int] = field(
        default=4, metadata={"help": "the number of gradient accumulation steps"}
    )
    gradient_checkpointing: Optional[bool] = field(
        default=True, metadata={"help": "whether to use gradient checkpointing"}
    )

    gradient_checkpointing_use_reentrant: Optional[bool] = field(
        default=False, metadata={"help": "whether to use reentrant for gradient checkpointing"}
    )

    # --- LoRA adapter parameters -------------------------------------------
    lora_alpha: Optional[float] = field(default=16, metadata={"help": "the lora alpha parameter"})
    lora_dropout: Optional[float] = field(default=0.05, metadata={"help": "the lora dropout parameter"})
    lora_r: Optional[int] = field(default=8, metadata={"help": "the lora r parameter"})

    # --- sequence lengths and schedule -------------------------------------
    max_prompt_length: Optional[int] = field(default=512, metadata={"help": "the maximum prompt length"})
    max_length: Optional[int] = field(default=1024, metadata={"help": "the maximum sequence length"})
    max_steps: Optional[int] = field(default=1000, metadata={"help": "max number of training steps"})
    logging_steps: Optional[int] = field(default=10, metadata={"help": "the logging frequency"})
    save_steps: Optional[int] = field(default=100, metadata={"help": "the saving frequency"})
    eval_steps: Optional[int] = field(default=100, metadata={"help": "the evaluation frequency"})

    # --- output and model loading ------------------------------------------
    output_dir: Optional[str] = field(default="./results", metadata={"help": "the output directory"})
    log_freq: Optional[int] = field(default=1, metadata={"help": "the logging frequency"})
    load_in_4bit: Optional[bool] = field(default=True, metadata={"help": "whether to load the model in 4bit"})
    model_dtype: Optional[str] = field(
        default="float16", metadata={"help": "model_dtype[float16, bfloat16, float] for loading."}
    )

    # --- instrumentation ---------------------------------------------------
    sanity_check: Optional[bool] = field(default=False, metadata={"help": "only train on 1000 samples"})
    report_to: Optional[str] = field(
        default="wandb",
        metadata={
            "help": 'The list of integrations to report the results and logs to. Supported platforms are `"azure_ml"`,'
            '`"comet_ml"`, `"mlflow"`, `"neptune"`, `"tensorboard"`,`"clearml"` and `"wandb"`. '
            'Use `"all"` to report to all integrations installed, `"none"` for no integrations.'
        },
    )

    # --- debug argument for distributed training ---------------------------
    ignore_bias_buffers: Optional[bool] = field(
        default=False,
        metadata={
            # Adjacent literals are concatenated, so the first one must end in a
            # space or the URL gets glued onto the word "See".
            "help": "fix for DDP issues with LM bias/mask buffers - invalid scalar type, inplace operation. See "
            "https://github.com/huggingface/transformers/issues/22482#issuecomment-1595790992"
        },
    )
    seed: Optional[int] = field(
        default=0, metadata={"help": "Random seed that will be set at the beginning of training."}
    )



In [None]:
# LoRA adapter configuration shared by the SFT and DPO trainer cells.
# NOTE(review): the target list uses LLaMA-style projection names; confirm
# which of them actually exist in the model loaded below, since names that
# match no module contribute no adapter.
peft_config = LoraConfig(
    r=16,
    lora_alpha=16,
    lora_dropout=0.05,
    bias='none',
    task_type="CAUSAL_LM",
    target_modules=[
        'k_proj',
        'gate_proj',
        'v_proj',
        'up_proj',
        'q_proj',
        'o_proj',
        'down_proj',  # spelled correctly so the MLP down projection can be matched
    ],
)

In [None]:
# Runtime requirements for loading the quantised model (imported in the next cell).
# `%pip` installs into the environment of the running kernel, which `!pip`
# does not guarantee.
%pip install accelerate
%pip install -i https://pypi.org/simple/ bitsandbytes

In [None]:
from transformers import BitsAndBytesConfig,AutoModelForCausalLM
import accelerate
import bitsandbytes

# 4-bit NF4 quantisation with double quantisation; compute runs in bfloat16.
nf4_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16
)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-2", trust_remote_code=True)
# The trainers pad batches with `tokenizer.pad_token_id`. When the checkpoint
# ships no pad token, reuse EOS so padding does not fail on a missing id.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained("microsoft/phi-2",quantization_config=nf4_config,device_map="auto")
# The generation cache is not needed for training, and the training arguments
# below enable gradient checkpointing, which does not work with it.
model.config.use_cache = False

In [None]:
# Run name; passed as `output_dir` to TrainingArguments in the next cell.
new_model = "Phi2Model_math"

In [None]:
from transformers import TrainingArguments
# Trainer configuration shared by the SFT and DPO cells below.
training_args = TrainingArguments(
    # Output and logging: log every step, write no intermediate checkpoints.
    output_dir=new_model,
    save_strategy="no",
    logging_steps=1,
    # Batching: 4 samples x 4 accumulation steps per optimizer update.
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    gradient_checkpointing=True,
    # Optimisation schedule.
    optim="paged_adamw_8bit",
    learning_rate=5e-5,
    lr_scheduler_type="cosine",
    warmup_steps=100,
    max_steps=200,
)

In [None]:
# Supervised fine-tuning pass before preference optimisation.
trainer = SFTTrainer(
    model=model,
    # `load_dataset` returned a dict of splits; the trainer needs one split.
    train_dataset=dataset["train"],
    # With packing enabled the trainer must be told which column holds the
    # text to train on; the preferred transcripts are used here.
    dataset_text_field="chosen",
    peft_config=peft_config,
    packing=True,
    max_seq_length=None,
    tokenizer=tokenizer,
    args=training_args,
)
trainer.train()

In [None]:
def split_prompt_and_responses(example):
    """Split one hh-rlhf row into the prompt/chosen/rejected fields DPO needs.

    Per the dataset card, `chosen` and `rejected` each hold a full transcript
    that shares everything up to the last "\\n\\nAssistant:" marker. That shared
    prefix becomes the prompt; what follows it in each transcript becomes the
    respective completion.

    Args:
        example: Mapping with string values under "chosen" and "rejected".

    Returns:
        dict with the string keys "prompt", "chosen" and "rejected".

    Raises:
        ValueError: if the chosen transcript has no "\\n\\nAssistant:" marker.
    """
    marker = "\n\nAssistant:"
    marker_start = example["chosen"].rfind(marker)
    if marker_start == -1:
        raise ValueError("transcript does not contain '\\n\\nAssistant:'")
    prompt = example["chosen"][: marker_start + len(marker)]
    return {
        "prompt": prompt,
        "chosen": example["chosen"][len(prompt):],
        "rejected": example["rejected"][len(prompt):],
    }


# Use a single split, and add the explicit `prompt` column this trainer API
# requires; the raw dataset only has `chosen` and `rejected`.
dpo_train_dataset = dataset["train"].map(split_prompt_and_responses)

# NOTE(review): `model` was already handed to the SFT trainer with the same
# `peft_config`; confirm that applying the adapter config again is intended.
dpo_trainer = DPOTrainer(
    model,
    ref_model = None,
    args = training_args,
    train_dataset = dpo_train_dataset,
    tokenizer=tokenizer,
    peft_config = peft_config,
    beta = 0.1,
    max_prompt_length = 1024,
    max_length = 1536
)
dpo_trainer.train()

In [None]:
# Save the trained model and the tokenizer into the same directory.
# The trainer exposes `save_model`; it has no `save_pretrained` method.
dpo_trainer.save_model("final_checkpoint")
tokenizer.save_pretrained("final_checkpoint")