In [1]:
!pip install -U -q bitsandbytes transformers trl accelerate peft

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
import json, os, io
import torch
from datasets import Dataset, load_dataset, Features, Value, DatasetDict
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, pipeline, TrainingArguments
from peft import LoraConfig, PeftModel
from trl import SFTTrainer, DPOTrainer, DPOConfig
from tqdm import tqdm

In [4]:
access_token = "hf_..."

## load dataset

In [5]:
ROOT_DIR = "/content/drive/MyDrive/Colab Notebooks/llm_data"

In [6]:
datapath = os.path.join(ROOT_DIR, "data")

def get_datafile(filename):
    with open(os.path.join(datapath, filename), "rb") as f:
        dataset = json.load(f)
    return dataset

f_data = get_datafile("feeling.txt")
t_data = get_datafile("thinking.txt")

In [7]:
len(f_data) == len(t_data)

True

In [8]:
def gen_dataset(data_type, chosen_data, rejected_data):
    temp_dataset = []
    for chosen, rejected in zip(chosen_data, rejected_data):
        temp_data = {}
        temp_data['prompt'] = chosen['instruction']
        temp_data['chosen'] = chosen['output']
        temp_data['rejected'] = rejected['output']
        temp_data['data_type'] = data_type
        temp_dataset.append(temp_data)
    return temp_dataset

In [9]:
dataset = gen_dataset("F", f_data, t_data) + gen_dataset("T", t_data, f_data)

In [10]:
def generate_prompt(example):
    temp_dict = {}
    system_prompt = "당신은 논리적, 분석적 근거를 바탕으로 판단하여 친구처럼 답변해야 합니다." if example['data_type']=="T" else \
                    "당신은 개인적, 사회적 가치를 바탕으로 판단하여 친구처럼 답변해야 합니다." if example['data_type']=="F" else " "

    temp_dict['prompt'] = f"<start_of_turn>system\n{system_prompt}<end_of_turn><start_of_turn>user\n{example['prompt']}<end_of_turn>\n<start_of_turn>model\n"
    temp_dict['chosen'] = f"{example['chosen']}<end_of_turn>"
    temp_dict['rejected'] = f"{example['rejected']}<end_of_turn>"

    return temp_dict

In [11]:
features = Features({
    "prompt": Value("string"),
    "rejected": Value("string"),
    "chosen": Value("string"),
    "data_type": Value("string")
})

In [12]:
dpo_data = DatasetDict({"train": Dataset.from_list(dataset, features=features)}).map(generate_prompt)

Map:   0%|          | 0/24318 [00:00<?, ? examples/s]

In [13]:
dpo_dataset = dpo_data['train'].train_test_split(test_size=0.1)

## train

https://huggingface.co/docs/trl/dpo_trainer#using-option-3---load-the-adapter-twice

In [14]:
# qlora setting
lora_config = LoraConfig(
    r=16,
    lora_alpha=16,
    lora_dropout=0,
    bias="none",
    target_modules=["q_proj", "o_proj", "k_proj", "v_proj", "gate_proj", "up_proj", "down_proj"],
    task_type="CAUSAL_LM",
)

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    llm_int8_threshold=6.0,
    llm_int8_has_fp16_weight=False,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True, # 이중양자화
    bnb_4bit_compute_dtype=torch.float16
)

In [15]:
BASE_MODEL = "google/gemma-2-2b-it"
PEFT_MODEL = os.path.join(ROOT_DIR, "./model/peft_model")
MERGED_MODEL = os.path.join(ROOT_DIR, "./model/gemma-2-2b-it-sns")

# 모델과 토크나이저 로드
model = AutoModelForCausalLM.from_pretrained(MERGED_MODEL,
                                             device_map="auto",
                                             quantization_config=bnb_config,
                                             attn_implementation='eager',
                                             token=access_token)
# model.config.use_cache = False
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, token=access_token)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.add_eos_token = True
tokenizer.padding_side = 'right'

# model = PeftModel.from_pretrained(model, PEFT_MODEL, is_trainable=True, peft_config=lora_config, adapter_name="train_model")
# model.load_adapter(PEFT_MODEL, is_trainable=True, peft_config=lora_config, adapter_name="reference_model")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [16]:
#### COMMENT IN TO RECALCULATE MAX LENGTHS ####
from numpy import percentile
train_dataset = dpo_dataset['train']
eval_dataset = dpo_dataset['test']

# lets find the p95 length of the prompt
prompt_length = int(percentile([len(tokenizer(x)["input_ids"]) for x in train_dataset["prompt"]], 95))
max_seq_length_chosen = int(percentile([len(tokenizer(x["prompt"] + x["chosen"])["input_ids"]) for x in train_dataset], 95))
max_seq_length_rejected = int(percentile([len(tokenizer(x["prompt"] + x["rejected"])["input_ids"]) for x in train_dataset], 95))
max_seq_length = max(max_seq_length_chosen, max_seq_length_rejected)

# filter datasets to remove samples that are too long
train_dataset = train_dataset.filter(lambda x: len(tokenizer(x["prompt"] + x["chosen"])["input_ids"]) <= max_seq_length)
eval_dataset = eval_dataset.filter(lambda x: len(tokenizer(x["prompt"] + x["chosen"])["input_ids"]) <= max_seq_length)
print(f"len(train_dataset): {len(train_dataset)}")
print(f"len(eval_dataset): {len(eval_dataset)}")

# Up the lengths to next multiple of 2, why 2? Don't know
prompt_length = ((prompt_length + 1) // 2) * 2
max_seq_length = ((max_seq_length + 1) // 2) * 2
print(f"p95 prompt length: {prompt_length}")
print(f"p95 prompt + chosen length: {max_seq_length}")

## len(train_dataset): 20810
## len(eval_dataset): 2300
## p95 prompt length: 92
## p95 prompt + chosen length: 362

prompt_length = 128
max_seq_length = 512

In [17]:
training_args = DPOConfig(
    # model_adapter_name="train_model",
    # ref_adapter_name="reference_model",

    output_dir="./outputs/dpo",
    eval_strategy="steps",
    do_eval=True,
    # max_steps=3000,
    num_train_epochs=1,
    optim="paged_adamw_32bit",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=1,
    per_device_eval_batch_size=1,
    logging_steps=100,
    learning_rate=5e-7,
    eval_steps=1000,
    save_steps=1000,
    warmup_ratio=0.1,
    lr_scheduler_type="cosine",

    max_prompt_length=prompt_length,
    max_length=max_seq_length,
    save_total_limit=2,
    bf16=True,
    beta=0.5,
)

trainer = DPOTrainer(
    model=model,
    ref_model=None,
    args=training_args,
    peft_config=lora_config,
    train_dataset=dpo_dataset['train'],
    eval_dataset=dpo_dataset['test'],
    tokenizer=tokenizer,
)



Tokenizing train dataset:   0%|          | 0/21886 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/2432 [00:00<?, ? examples/s]

In [18]:
trainer.train()

# from transformers.trainer_utils import EvalPrediction, get_last_checkpoint

# last_checkpoint = get_last_checkpoint(training_args.output_dir)
# trainer.train(resume_from_checkpoint=last_checkpoint)

Could not estimate the number of tokens of the input, floating-point operations will not be computed


Step,Training Loss,Validation Loss,Rewards/chosen,Rewards/rejected,Rewards/accuracies,Rewards/margins,Logps/rejected,Logps/chosen,Logits/rejected,Logits/chosen
1000,0.704,0.709064,-0.001387,0.001326,0.483553,-0.002712,-248.902145,-247.70163,-3.693312,-3.703695
2000,0.6815,0.69343,0.045971,0.01639,0.512336,0.029581,-248.872025,-247.606964,-3.692079,-3.703415
3000,0.6346,0.637475,0.151551,-0.008919,0.626234,0.160469,-248.922653,-247.395767,-3.683365,-3.698145
4000,0.5367,0.528613,0.50569,-0.007924,0.737664,0.513615,-248.920639,-246.6875,-3.660987,-3.685528
5000,0.3892,0.414985,0.82202,-0.399821,0.794408,1.221841,-249.704437,-246.05484,-3.627799,-3.671474
6000,0.3379,0.342799,0.970378,-1.313994,0.817023,2.284372,-251.532791,-245.758118,-3.588045,-3.658058
7000,0.2719,0.310501,0.958214,-2.407846,0.829359,3.36606,-253.720474,-245.782425,-3.548939,-3.643504
8000,0.3702,0.298732,0.77454,-3.327103,0.83347,4.101643,-255.559006,-246.149796,-3.525877,-3.636899
9000,0.3227,0.297519,0.649119,-3.853889,0.834704,4.503008,-256.612579,-246.40065,-3.51408,-3.633948
10000,0.2887,0.294125,0.615406,-4.04752,0.837582,4.662925,-256.999847,-246.468063,-3.509503,-3.632587


TrainOutput(global_step=10943, training_loss=0.4538646296414272, metrics={'train_runtime': 14123.0577, 'train_samples_per_second': 1.55, 'train_steps_per_second': 0.775, 'total_flos': 0.0, 'train_loss': 0.4538646296414272, 'epoch': 1.0})

In [19]:
ADAPTER_MODEL = os.path.join(ROOT_DIR, "sns_lora_dpo")
trainer.model.save_pretrained(ADAPTER_MODEL)