In [3]:
import gc
import os
import torch

from peft import ( # Parameter Efficient Fine-Tuning ; peft
    LoraConfig, # 특정 parameter만 학습하는 LoRA 방식 사용
    PeftModel,
    prepare_model_for_kbit_training,
)

from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline,
)

from trl import (
    ORPOConfig,
    ORPOTrainer,
    setup_chat_format,
)

In [7]:
# Flash Attention 사용 가능 확인
# Major Capability가 8 이상일 경우 사용 가능

if torch.cuda.get_device_capability()[0] >= 8:
    attn_implementation = "flash_attention_2"
    torch_dtype = torch.bfloat16
else:
    attn_implementation = "eager"
    torch_dtype = torch.float16

AssertionError: Torch not compiled with CUDA enabled

In [None]:
# Model 설정

base_model = "beomi/Llama-3-Open-Ko-8B-Instruct-preview"
new_model = "Test-Llama-3-Open-Ko-8B-Instruct-preview"

In [None]:
# QLoRA 설정 : 4Bit 양자화

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True, # 4Bit 정밀도 로드
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch_dtype, # torch.bfloat16
    bnb_4bit_use_double_quant=True,
)

In [None]:
# LoRA 설정 : QLoRA용 PEFT를 사용하여 LoRA 설정

peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=['up_proj', 'down_proj', 'gate_proj', 'k_proj', 'q_proj', 'v_proj', 'o_proj']
)

In [None]:
# tokenizer 로드

tokenizer = AutoTokenizer.from_pretrained(base_model)

In [None]:
# 모델 로드

model = AutoModelForCausalLM.from_pretrained(
    base_model, # 모델 위치
    quantization_config=bnb_config, # 양자화
    device_map="auto",
    attn_implementation=attn_implementation
)

In [None]:
model, tokenizer = setup_chat_format(model, tokenizer) # 대화형 포맷의 토큰 추가 (전처리)
model = prepare_model_for_kbit_training(model) # peft Wrapping

In [None]:
# 학습 설정
orpo_args = ORPOConfig(
    learning_rate=1e-6, # 논문 상 8e-6
    beta=0.1,
    lr_scheduler_type="linear",
    max_length=1024,
    max_prompt_length=512,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=4,
    optim="paged_adamw_8bit",
    num_train_epochs=4,
    evaluation_strategy="steps",
    eval_steps=0.2,
    logging_steps=1,
    warmup_steps=10,
    output_dir="./results/",
)

# 학습기 설정
trainer = ORPOTrainer(
    model=model,
    args=orpo_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    peft_config=peft_config,
    tokenizer=tokenizer,
)

trainer.train()
trainer.save_model(new_model)

In [None]:
# trainer = ORPOTrainer(
#     model=model,
#     args=orpo_args,
#     train_dataset=dataset["train"],
#     # eval_dataset=dataset["test"],
#     peft_config=peft_config,
#     tokenizer=tokenizer,
# )

In [8]:
%%writefile llama_3_70b_fsdp_qlora.yaml
# script parameters
model_id: "beomi/Llama-3-Open-Ko-8B-Instruct-preview" # Hugging Face model id
dataset_path: "."                      # path to dataset
max_seq_len:  1024 # 2048              # max sequence length for model and packing of the dataset
# training parameters
output_dir: "./test-model" # Temporary output directory for model checkpoints
report_to: "tensorboard"               # report metrics to tensorboard
learning_rate: 0.0002                  # learning rate 2e-4
lr_scheduler_type: "constant"          # learning rate scheduler
num_train_epochs: 4                    # number of training epochs
per_device_train_batch_size: 4         # batch size per device during training
per_device_eval_batch_size: 4          # batch size for evaluation
gradient_accumulation_steps: 2         # number of steps before performing a backward/update pass
optim: adamw_torch                     # use torch adamw optimizer
logging_steps: 10                      # log every 10 steps
save_strategy: epoch                   # save checkpoint every epoch
evaluation_strategy: epoch             # evaluate every epoch
max_grad_norm: 0.3                     # max gradient norm
warmup_ratio: 0.03                     # warmup ratio
bf16: true                             # use bfloat16 precision
tf32: true                             # use tf32 precision
gradient_checkpointing: true           # use gradient checkpointing to save memory
# FSDP parameters: https://huggingface.co/docs/transformers/main/en/fsdp
fsdp: "full_shard auto_wrap offload" # remove offload if enough GPU memory
fsdp_config:
  backward_prefetch: "backward_pre"
  forward_prefetch: "false"
  use_orig_params: "false"

Writing llama_3_70b_fsdp_qlora.yaml


In [None]:
##### CMD
# CCELERATE_USE_FSDP=1 FSDP_CPU_RAM_EFFICIENT_LOADING=1 torchrun --nproc_per_node=1 run_fsdp_qlora.py --config llama_3_70b_fsdp_qlora.yaml
# 명령어를 통해 실행

# run_fsdp_qlora.py에 QLoRA 및 FSDP 학습기 설정
# nproc_per_node 수를 조절하여 오류 방지