## DPO Training with Unsloth

In [None]:
import torch
# Read the CUDA compute capability of the current GPU; only the major version
# is used below, to choose which set of extra packages to install.
major_version, minor_version = torch.cuda.get_device_capability()
# Must install separately since Colab has torch 2.2.1, which breaks packages
# NOTE(review): none of the installs in this cell pin a version, so a re-run may
# pull different releases than the ones this notebook was developed against.
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
if major_version >= 8:
    # Use this for new GPUs like Ampere, Hopper GPUs (RTX 30xx, RTX 40xx, A100, H100, L40)
    # Compared with the branch below, this one additionally installs
    # packaging, ninja, einops and flash-attn.
    !pip install --no-deps packaging ninja einops flash-attn xformers trl peft accelerate bitsandbytes
else:
    # Use this for older GPUs (V100, Tesla T4, RTX 20xx)
    !pip install --no-deps xformers trl peft accelerate bitsandbytes
# No-op statement; it has no effect on the cell.
pass

In [None]:
# One must patch the DPO Trainer first!
# The patch is applied here, before `DPOTrainer` is imported from trl in the
# training cell further down the notebook.
from unsloth import PatchDPOTrainer
PatchDPOTrainer()

In [None]:
from unsloth import FastLanguageModel
import torch
# Loading options, passed unchanged to FastLanguageModel.from_pretrained below.
max_seq_length = 4096 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

# Load the starting model together with its tokenizer.
# NOTE(review): model_name is a local checkpoint directory, presumably produced by
# an earlier fine-tuning run — confirm it exists before running this cell.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "./results2/checkpoint-1200", # Choose ANY! eg mistralai/Mistral-7B-Instruct-v0.2
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

In [9]:
import pandas as pd
# Load the working table from CSV. Later cells read the columns 'original_text',
# 'rewritten_text', 'chosen', 'rejected' and 'rewrite_prompt' from this frame.
test_path= "nbroad-v2.csv"
df = pd.read_csv(test_path)

In [10]:
def remove_numbered_list(text):
    """Drop leading ``N. `` numbering from each line and join the lines.

    A line counts as a numbered list item when everything before its first
    ``'. '`` consists only of digits; in that case only the text after that
    first ``'. '`` is kept. Every other line is kept unchanged. The resulting
    lines are joined with two spaces.
    """
    def _without_numbering(line):
        # Split at the first occurrence of '. ' only.
        number, separator, remainder = line.partition('. ')
        if separator and number.isdigit():
            return remainder
        return line

    return '  '.join(_without_numbering(line) for line in text.split('\n'))


# Trims LLM output to just the response
def trim_to_response(text):
    """Return the part of ``text`` that follows the last ``[/INST]`` marker.

    Before searching for the marker, every ``</s>`` token and every double or
    single quote character is removed from the text. If the marker does not
    occur, the cleaned text is returned in full.
    """
    # Remove the end-of-sequence token first, then any quoting (same order as
    # the replacements are listed here).
    cleaned = text
    for unwanted in ('</s>', '"', "'"):
        cleaned = cleaned.replace(unwanted, '')

    marker = "[/INST]"
    marker_at = cleaned.rfind(marker)
    if marker_at == -1:
        return cleaned
    return cleaned[marker_at + len(marker):]

# Looks for response_start / returns only the text that occurs after it
def extract_text_after_response_start(full_text):
    """Return the stripped text after the last ``response_start`` marker.

    ``response_start`` is read from module scope. When the marker does not
    occur in ``full_text``, the input is returned unchanged (not stripped).
    """
    # rpartition searches from the right, so only the last occurrence counts.
    _, marker, remainder = full_text.rpartition(response_start)
    if not marker:
        return full_text
    return remainder.strip()

    
# Trims text to the requested number of sentences (or the first LF or double-space sequence)
def trim_to_first_x_sentences_or_lf(text, x):
    """Trim ``text`` to at most ``x`` sentences taken from its first line.

    A double space is treated like a linefeed, and only the text before the
    first linefeed is considered. Sentences are the non-blank pieces obtained
    by splitting that first chunk on ``'.'``.

    Args:
        text: The text to trim.
        x: Maximum number of sentences to keep.

    Returns:
        ``""`` when ``x <= 0``. Otherwise:
        * if a linefeed was present and the first chunk holds fewer than ``x``
          sentences, the first chunk itself (surrounding whitespace removed);
        * in every other case, the first ``x`` sentences joined with ``'. '``.
        A final ``'.'`` is appended when at least one sentence was found and
        the result does not already end with one.
    """
    if x <= 0:
        return ""

    # Any double-space is dealt with as a linefeed; keep only the first chunk.
    first_chunk, linefeed, _ = text.replace("  ", "\n").partition('\n')
    # Strip so that trailing whitespace cannot hide an existing final period
    # (previously "Hello. \nWorld" came back as "Hello. .").
    first_chunk = first_chunk.strip()

    # Filter on the stripped fragment so whitespace-only pieces between periods
    # are not counted as sentences.
    sentences = [
        fragment.strip()
        for fragment in first_chunk.split('.')
        if fragment.strip()
    ]

    if linefeed and len(sentences) < x:
        # The first chunk is already short enough: return it as written.
        trimmed_text = first_chunk
    else:
        # Slicing is a no-op when there are x sentences or fewer.
        trimmed_text = '. '.join(sentences[:x])

    # Add back the final period if it was removed by the split.
    if sentences and not trimmed_text.endswith('.'):
        trimmed_text += '.'

    return trimmed_text

In [11]:
# Prompt-building constants.
# NOTE(review): orig_prefix, instruction_prefix, rewrite_prefix, response_prefix and
# base_line are not referenced by any cell visible in this notebook; build_prompt
# hard-codes its own section labels instead.

#original text prefix
orig_prefix = "Original Text:"

instruction_prefix = "Instruction:"

# Task instruction for the model; passed as the `instruction` argument of
# build_prompt / build_prompt_wo_answer_prefix.
instruction = "There are two sentences defined as Original Text and Re-written Text below. You will tell what new element was added or change in tone was made to improve it - with no references to the original.  You will avoid mentioning names of characters.  It is crucial no person, place or thing from the original text be mentioned.  For example - You will not say things like 'change the puppet show into a book report' - You would just say 'improve this text into a book report'.  If the original text mentions a specific idea, person, place, or thing - You will not mention it in your answer.  For example if there is a 'dog' or 'office' in the Original text - the word 'dog' or 'office' must not be in your response."
#modified text prefix
rewrite_prefix = "Re-written Text:"

#provided as start of Mistral response (anything after this is used as the prompt)
#providing this as the start of the response helps keep things relevant
# NOTE(review): build_prompt ends its prompt with "Answer:\n" (newline, no space), so
# this marker with a trailing space does not occur in that prompt text. Unless the
# model emits "Answer: " itself, extract_text_after_response_start returns its input
# unchanged — confirm whether that is intended.
response_start = "Answer: "

#added after response_start to prime mistral
#"Improve this" or "Improve this text" resulted in non-answers.
#"Improve this text by" seems to produce good results
response_prefix = "Improve this text by"

#well-scoring baseline text
#thanks to: https://www.kaggle.com/code/rdxsun/lb-0-61
base_line = 'Refine the following passage by emulating the writing style of [insert desired style here], with a focus on enhancing its clarity, elegance, and overall impact. Preserve the essence and original meaning of the text, while meticulously adjusting its tone, vocabulary, and stylistic elements to resonate with the chosen style.Please improve the following text using the writing style of, maintaining the original meaning but altering the tone, diction, and stylistic elements to match the new style.Enhance the clarity, elegance, and impact of the following text by adopting the writing style of , ensuring the core message remains intact while transforming the tone, word choice, and stylistic features to align with the specified style.' 


In [12]:
def build_prompt(instruction, original_text, rewritten_text):
    """Build the full ``[INST]`` prompt, ending with the answer cue.

    The prompt contains the instruction, the original text and the re-written
    text, each under its own label, wrapped in ``<s>[INST] ... [/INST]`` and
    followed by ``"\\nAnswer:\\n"``.
    """
    return f"<s>[INST] Instruction:\n{instruction} \nOriginal Text:\n{original_text} \nRe-written Text: \n{rewritten_text}[/INST]\nAnswer:\n"

In [None]:

def build_prompt_wo_answer_prefix(instruction, original_text, rewritten_text):
    """Build the ``[INST]`` prompt without any answer cue after ``[/INST]``.

    The prompt contains the instruction, the original text and the re-written
    text, each under its own label, wrapped in ``<s>[INST] ... [/INST]``.
    """
    return f"<s>[INST] Instruction:\n{instruction} \nOriginal Text:\n{original_text} \nRe-written Text: \n{rewritten_text}[/INST]"

In [None]:
# Build the prompt for one example row (positional index 1).
# `text` is reassigned inside the generation loop further down.
text = build_prompt(instruction ,df.iloc[1]["original_text"] ,df.iloc[1]["rewritten_text"] )

In [None]:
# Count the rows that already hold a non-null 'rejected' value.
# This cell comes before the generation loop in notebook order, so the CSV must
# already contain a 'rejected' column or the lookup raises KeyError.
df['rejected'].notna().sum()

In [None]:


# Generate a model completion for every row and store it in the 'rejected' column.
for index, row in df.iterrows():
    # Full prompt, including the trailing "\nAnswer:\n" cue.
    text = build_prompt(instruction ,row['original_text'], row['rewritten_text'])
    # One prompt per call (batch of one), moved to the GPU.
    inputs = tokenizer([text,], return_tensors = "pt").to("cuda")
    # Completions are capped at 20 new tokens.
    outputs = model.generate(**inputs, max_new_tokens = 20, use_cache = True)
    result_raw = tokenizer.batch_decode(outputs)
    # Keep only what follows the last "[/INST]" in the decoded text
    # (quotes and "</s>" are removed by trim_to_response as well).
    just_response = trim_to_response(result_raw[0])        
    # NOTE(review): response_start is "Answer: " while the prompt cue is "Answer:\n";
    # when the marker is not found this call returns just_response unchanged.
    final_text = extract_text_after_response_start(just_response)
    # Progress indicator: prints the row label after each generation.
    print(index)
    df.at[index, 'rejected'] = final_text

In [None]:
# Fill the 'prompt' column with the prompt that stops at "[/INST]", i.e. without
# the "\nAnswer:\n" cue that build_prompt appends.
for index, row in df.iterrows():
    text = build_prompt_wo_answer_prefix(instruction ,row['original_text'], row['rewritten_text'])
    df.at[index, 'prompt'] = text

In [None]:
# Prepend the answer cue to every 'chosen' text. The 'prompt' column stops at
# "[/INST]", so prompt + chosen then has the same layout as build_prompt's output.
# The startswith guard keeps this cell idempotent: running it again does not
# stack a second cue onto values that already carry one.
answer_cue = "\nAnswer:\n"
for index, row in df.iterrows():
    if not row['chosen'].startswith(answer_cue):
        df.at[index, 'chosen'] = answer_cue + row['chosen']

In [None]:
import pandas as pd
import pyarrow as pa
import pyarrow.dataset as ds
import pandas as pd
from datasets import Dataset
import re

In [None]:

# Count the rows that hold a non-null 'rejected' value.
df['rejected'].notna().sum()

In [None]:
# Keep only the first 766 rows (positional slice); `df` is overwritten, so
# re-running earlier cells on it will only see the truncated frame.
# NOTE(review): 766 is a hard-coded count, presumably the number of rows that have
# a 'rejected' value as reported by the cell above — confirm before reusing.
df = df.iloc[:766]


In [None]:
# Convert the DataFrame to a Hugging Face `Dataset` by way of an Arrow table.
# A single conversion is enough: the `Dataset` built here is the object handed
# to the trainer as `train_dataset`.
dataset = Dataset(pa.Table.from_pandas(df))
dataset

In [None]:
# Configure LoRA adapters (rank 64, alpha 64) on the attention and MLP projection modules.
# NOTE(review): the result is bound to `model2`, but the DPOTrainer cell below
# passes `model`; `model2` is not used by any cell visible here — confirm which
# object is meant to be trained.
model2 = FastLanguageModel.get_peft_model(
    model,
    r = 64, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 64,
    lora_dropout = 0, # Currently only supports dropout = 0
    bias = "none",    # Currently only supports bias = "none"
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

In [None]:
# One must patch the DPO Trainer first!
# NOTE(review): the same import and call already appear in a cell near the top of
# the notebook; this cell repeats them right before DPOTrainer is imported.
from unsloth import PatchDPOTrainer
PatchDPOTrainer()

In [None]:
from transformers import TrainingArguments
from trl import DPOTrainer

# Build the DPO trainer over the prompt / chosen / rejected dataset prepared above.
# NOTE(review): `model` is the object returned by FastLanguageModel.from_pretrained,
# not `model2` from the get_peft_model cell — confirm this is intended.
dpo_trainer = DPOTrainer(
    model = model,
    # No separate reference model is supplied.
    # NOTE(review): presumably the trainer derives the reference from `model` itself —
    # confirm against the trl documentation for the installed version.
    ref_model = None,
    args = TrainingArguments(
        # 2 sequences per device x 4 accumulation steps = 8 per optimizer step per device.
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_ratio = 0.1,
        num_train_epochs = 20,
        # A checkpoint is saved at each epoch under `output_dir`.
        save_strategy="epoch",
        learning_rate = 5e-6,
        # Exactly one of fp16 / bf16 is enabled, depending on GPU bf16 support.
        fp16 = not torch.cuda.is_bf16_supported(),
        bf16 = torch.cuda.is_bf16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.0,
        lr_scheduler_type = "linear",
        seed = 42,
        output_dir = "dpo_output2",
    ),
    beta = 0.1,
    train_dataset = dataset,
    # eval_dataset = raw_datasets["test"],
    tokenizer = tokenizer,
    # NOTE(review): presumably token limits for the full sequence and for the prompt
    # part respectively — confirm units against the trl documentation.
    max_length = 1024,
    max_prompt_length = 512,
)

In [None]:
# Run DPO training with the trainer configured above
# (output_dir is "dpo_output2", save_strategy is "epoch").
dpo_trainer.train()

In [None]:
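# Try the model: the cells below reload the base model, attach the DPO adapter checkpoint, and generate for a single row as a spot check.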

In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig,HfArgumentParser,TrainingArguments,pipeline, logging
from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training, get_peft_model
import os,torch
from datasets import load_dataset
from trl import SFTTrainer
import pandas as pd
import pyarrow as pa
import pyarrow.dataset as ds
import pandas as pd
from datasets import Dataset
import re


In [2]:
# Hub id of the base model to load, and the local DPO checkpoint directory whose
# adapter is attached to it below.
# NOTE(review): the checkpoint step (287) is hard-coded; confirm it exists under
# ./dpo_output2 after training.
base_model = "mistralai/Mistral-7B-Instruct-v0.2"
peft_model = "./dpo_output2/checkpoint-287"

In [3]:
# Load the base model in 4-bit precision together with its tokenizer.
# The quantization settings go through `quantization_config`: passing
# `load_in_4bit=True` directly to from_pretrained is deprecated (the library
# warns about it and asks for a BitsAndBytesConfig object instead).
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=BitsAndBytesConfig(load_in_4bit=True),
    device_map="auto",
)
tokenizer = AutoTokenizer.from_pretrained(base_model)


The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [5]:

# Attach the adapter stored in the `peft_model` checkpoint directory to the base model.
model.load_adapter(peft_model)

In [6]:
def build_prompt(instruction, original_text, rewritten_text):
    """Build the full ``[INST]`` prompt, ending with the answer cue.

    The prompt contains the instruction, the original text and the re-written
    text, each under its own label, wrapped in ``<s>[INST] ... [/INST]`` and
    followed by ``"\\nAnswer:\\n"``.
    """
    return f"<s>[INST] Instruction:\n{instruction} \nOriginal Text:\n{original_text} \nRe-written Text: \n{rewritten_text}[/INST]\nAnswer:\n"

In [29]:
# Build the prompt for a single spot-check row (positional index 343).
# NOTE(review): `instruction` and `df` are not defined in the cells shown for this
# session (execution counts jump from 6 to 29) — they must come from cells not shown.
text = build_prompt(instruction ,df.iloc[343]["original_text"] ,df.iloc[343]["rewritten_text"] )

In [30]:

    # Generate a completion (at most 20 new tokens) for the spot-check prompt and
    # post-process it the same way as the dataset-building loop does.
    # NOTE(review): trim_to_response and extract_text_after_response_start are not
    # defined in the cells shown for this session — they must come from cells not shown.
    inputs = tokenizer([text,], return_tensors = "pt").to("cuda")
    outputs = model.generate(**inputs, max_new_tokens = 20, use_cache = True)
    result_raw = tokenizer.batch_decode(outputs)
    # Keep only what follows the last "[/INST]" in the decoded text.
    just_response = trim_to_response(result_raw[0])        
    # Returns just_response unchanged when response_start does not occur in it.
    final_text = extract_text_after_response_start(just_response)

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


In [31]:
# Display the post-processed model output for the spot-check row.
final_text

'\nAnswer:\nRewrite the essay as if it takes place in feudal Japan during the Sengoku'

In [32]:
# Display the 'rewrite_prompt' value of the same row, for comparison with the model output.
df.iloc[343]['rewrite_prompt']

'Rewrite the story as a time-travel fantasy set in Feudal Japan'

In [33]:
!gsutil cp -r ./dpo_output2 gs://artificialintelligence-393314/llm_finetuned/

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Copying file://./dpo_output2/checkpoint-191/tokenizer.model [Content-Type=application/octet-stream]...
Copying file://./dpo_output2/checkpoint-191/rng_state.pth [Content-Type=application/octet-stream]...
Copying file://./dpo_output2/checkpoint-191/trainer_state.json [Content-Type=application/json]...
Copying file://./dpo_output2/checkpoint-191/README.md [Content-Type=text/markdown]...
- [4 files][594.6 KiB/594.6 KiB]                                                
==> NOTE: You are performing a sequence of gsutil operations that may
run significantly faster if you instead use gsutil -m cp ... Please
see the -m section under "gsutil help options" for further information
about when gsutil -m can be advantageous.

Copying file://./dpo_output2/checkpoint-191/training_args.bin [Content-Type=application/octet-stream]...
Copying file://./dpo_output2/checkpoint-191/tokenizer_config.json [Content-Type=application/json]...
Copying file://./dpo_output2/checkpoint-191/scheduler.pt [Content-Type=ap