# 사용자 선호에 맞는 시 창작 모델

### 0. 환경 설정

In [None]:
!python -m pip install --upgrade pip
!pip install typing_extensions pydantic openai
!pip install datasets transformers peft trl bitsandbytes

In [None]:
import os
import torch

# Environment switches set before any training library starts work
# (presumably to silence Weights & Biases logging and the tokenizers
# parallelism warning — confirm against the library docs).
os.environ["WANDB_DISABLED"] = "true"
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Pick the GPU when CUDA is available, otherwise fall back to the CPU.
# NOTE(review): `device` is not referenced again in the visible notebook.
device = "cuda" if torch.cuda.is_available() else "cpu"

### 1. 지도학습 (기반모델 Q-LoRA 파인튜닝)

##### (1) 학습용 데이터 준비

In [None]:
import json
from datasets import Dataset

dataset_path = "./korean_poetry_dataset.json"

# Read the whole poem corpus into memory in one go.
with open(dataset_path, "r", encoding="utf-8") as f:
    poem_data = json.load(f)

# Each record nests its fields under "text"; keep only the topic/poem pair.
processed_data = [
    {"topic": entry["topic"], "poem": entry["poem"]}
    for entry in (record["text"] for record in poem_data)
]

train_dataset = Dataset.from_list(processed_data)

In [None]:
# Load the tokenizer
from transformers import AutoTokenizer

# Hugging Face model id; reused below for the base model and for every
# later tokenizer reload.
model_name = "NCSOFT/Llama-VARCO-8B-Instruct"

tokenizer = AutoTokenizer.from_pretrained(model_name)

# Fall back to EOS as the padding token when the tokenizer defines none,
# so the padded tokenization below has a pad id to use.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

In [None]:
# Preprocessing function: tokenization + labeling
def preprocess_text(sample):
    """Tokenize a batch of topic/poem pairs and attach causal-LM labels.

    Args:
        sample: batched mapping with parallel "topic" and "poem" lists
            (this function is applied through ``Dataset.map(batched=True)``).

    Returns:
        The tokenizer output (padded/truncated to 512 tokens) with an extra
        "labels" entry: a copy of "input_ids" in which every padded position
        is replaced by -100.
    """
    input_texts = [f"주제: {t}\n\n시: {p}" for t, p in zip(sample["topic"], sample["poem"])]
    model_inputs = tokenizer(
        input_texts,
        padding="max_length",
        max_length=512,
        truncation=True
    )

    # Mask padding through the attention mask rather than by comparing token
    # ids with pad_token_id: the pad token may be an alias of EOS (see the
    # tokenizer fallback), and an id comparison would then also hide every
    # genuine EOS token from the loss.
    model_inputs["labels"] = [
        [token_id if keep else -100 for token_id, keep in zip(ids, mask)]
        for ids, mask in zip(model_inputs["input_ids"], model_inputs["attention_mask"])
    ]

    return model_inputs

In [None]:
# Tokenize the whole dataset in batches and drop the raw text columns,
# leaving only what preprocess_text returns.
train_dataset = train_dataset.map(
    preprocess_text,
    batched=True,
    remove_columns=["topic", "poem"]
)

In [None]:
from transformers import DataCollatorForSeq2Seq

# Batch collator handed to the Trainer below; no model is attached (model=None).
data_collator = DataCollatorForSeq2Seq(tokenizer, model=None)

##### (2) 파인튜닝 학습 준비

- 양자화 설정 > 모델 로드
- 학습 모드로 전환
- LoRA 학습 설정
- TrainingArguments 설정

In [None]:
from transformers import BitsAndBytesConfig

# 4-bit NF4 quantization with double quantization; float16 is the compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4"
)

In [None]:
from transformers import AutoModelForCausalLM

# Load the base model with the 4-bit quantization config; device placement
# is delegated to device_map="auto".
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto"
)

# Enable gradient checkpointing and switch off the config's use_cache flag
# (presumably because caching is not useful during training — confirm).
model.gradient_checkpointing_enable()
model.config.use_cache = False
# NOTE(review): assigning attn_implementation on the config after the model
# is loaded probably does not switch the attention kernel; it is normally
# chosen through the `attn_implementation` argument of from_pretrained and
# requires the flash-attn package — TODO confirm.
model.config.attn_implementation = "flash_attention_2"

In [None]:
from peft import LoraConfig

# LoRA adapter hyper-parameters: rank 16, alpha 32, 5% dropout, bias="none",
# causal-LM task type.
# NOTE(review): target_modules is not set, so PEFT's per-architecture default
# is presumably used — confirm which layers receive adapters.
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

In [None]:
from peft import prepare_model_for_kbit_training

# Run PEFT's k-bit training preparation on the quantized model before the
# LoRA adapters are attached.
model = prepare_model_for_kbit_training(model)

In [None]:
from peft import get_peft_model

# Wrap the prepared model with the LoRA config and put it in training mode.
model = get_peft_model(model, lora_config)
model.train()

In [None]:
from transformers import TrainingArguments, Trainer

# Supervised fine-tuning hyper-parameters: effective batch of 8 sequences
# (1 per device x 8 accumulation steps), 3 epochs, one checkpoint per epoch
# with at most 2 kept, 8-bit AdamW from bitsandbytes, no external reporting.
training_args = TrainingArguments(
    output_dir="./q_lora_poem",
    save_strategy="epoch",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,
    learning_rate=2e-4,
    num_train_epochs=3,
    logging_dir="./logs",
    logging_steps=100,
    save_total_limit=2,
    optim="adamw_bnb_8bit",
    report_to="none"
)

# The tokenizer is passed as `processing_class`, the same keyword this
# notebook already uses for ORPOTrainer; the older `tokenizer=` keyword of
# Trainer is deprecated and not accepted by newer transformers releases.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    processing_class=tokenizer,
    data_collator=data_collator
)

##### (3) 학습 진행

In [None]:
# Run supervised fine-tuning with the Trainer configured above.
trainer.train()

### 2. 학습된 모델로 시(응답) 생성

##### (1) 모델 로드

In [None]:
from transformers import pipeline

# Placeholder path: replace "checkpoint-xxx" with the checkpoint directory
# actually written under ./q_lora_poem by the training run.
qlora_checkpoint = "./q_lora_poem/checkpoint-xxx"

model = AutoModelForCausalLM.from_pretrained(qlora_checkpoint)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# This reload replaces the tokenizer configured for training, so the
# pad-token fallback has to be applied again: the pipeline below requests
# batching (batch_size=2) and later cells read tokenizer.pad_token_id.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Text-generation pipeline shared by the poem-generation cells.
generate_pipeline = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    pad_token_id=tokenizer.eos_token_id,
    batch_size=2
)

In [None]:
# Pool of poem topics; generation draws one at random per poem.
topics = ["바람", "비", "노을", "달빛", "안개", "사랑", "이별", "운명", "기다림", "후회", "추억", "시간", "청춘", "변화", "마지막 순간", "군중", "밤거리", "버스", "인생", "빌딩", "사람들", "거짓말", "욕망", "돈", "권력", "비밀", "죽음", "희망", "동물", "자연", "도시", "바다", "산", "하늘", "별", "꽃", "나무", "강", "바위", "흙", "눈", "빗방울", "눈물", "웃음"]

# JSON file that accumulates generated poems and their human feedback.
eval_file = "rlhf_evaluation_data.json"

# Resume from an existing evaluation file when present; otherwise start empty.
try:
    with open(eval_file, "r", encoding="utf-8") as f:
        eval_dataset = json.load(f)
except FileNotFoundError:
    eval_dataset = []

In [None]:
# Generation plan: num_batches batches of batch_size poems each.
# (This batch_size is the number of poems per saved batch; it is a separate
# value from the batch_size=2 passed to the pipeline.)
num_batches = 5
batch_size = 20
total_samples = num_batches * batch_size
# Progress counter starts from the records already present in eval_dataset.
generated_samples = len(eval_dataset)

##### (2) 시 생성

In [None]:
# 시 생성 함수 정의
import time
import random
from tqdm import tqdm

def generate_poem_batch():
    """Generate one batch of poems on random topics and print progress.

    Reads the module-level ``topics``, ``batch_size``, ``total_samples`` and
    ``generate_pipeline``, and increments the module-level
    ``generated_samples`` counter once per generated poem.

    Returns:
        list[dict]: one ``{"topic", "poem", "selected"}`` record per poem;
        ``selected`` is None as a placeholder for later human feedback.
    """
    # Declared once at the top of the function instead of inside the loop.
    global generated_samples

    batch_data = []

    with tqdm(total=batch_size, desc="✍️시 생성 중...", leave=False) as t:
        for _ in range(batch_size):
            topic = random.choice(topics)
            input_text = f"주제: {topic}\n\n시:"

            start_time = time.time()
            # do_sample=True makes the temperature/top_p settings explicit;
            # without sampling enabled they are not applied, and repeated
            # topics would yield identical poems.
            # NOTE(review): with the pipeline default (return_full_text=True)
            # "generated_text" still contains the prompt — confirm that the
            # downstream cells expect the prompt inside "poem".
            poem = generate_pipeline(
                input_text,
                max_new_tokens=100,
                do_sample=True,
                temperature=0.8,
                top_p=0.9
            )[0]["generated_text"]
            gen_time = time.time() - start_time

            batch_data.append({
                "topic": topic,
                "poem": poem,
                "selected": None
            })

            t.update(1)

            generated_samples += 1
            complete_rate = (generated_samples / total_samples) * 100
            # The counter starts from the size of any previously saved file,
            # so it can exceed total_samples; clamp to keep the estimate from
            # going negative.
            remaining_samples = max(total_samples - generated_samples, 0)
            remaining_time = (remaining_samples * gen_time) / 60

            print(f"\n{generated_samples}/{total_samples}개 완료 ({complete_rate:.2f}%)")
            print(f" - 예상 남은 시간: {remaining_time:.1f}분")
            print("-" * 100)

    return batch_data

In [None]:
# Generate num_batches batches, appending each one to eval_dataset.
for _ in tqdm(range(num_batches), desc="<<< 전체 진행 상황 >>>", position=0):
    eval_dataset.extend(generate_poem_batch())

    # Rewrite the evaluation file after every batch so that finished batches
    # are already on disk if a later batch fails.
    with open(eval_file, "w", encoding="utf-8") as f:
        json.dump(eval_dataset, f, ensure_ascii=False, indent=4)

##### (3) 피드백
- 생성된 시 중 선호하는 항목은 `rlhf_evaluation_data.json`에서 `selected` 값을 `"True"`로 수정해 피드백을 반영한다. (수정하지 않은 항목은 `null`로 남아 이후 단계에서 제외된다.)

### 3. Reward Model 학습

##### (1) 데이터 로드 및 처리

In [None]:
# Reload the evaluation file, which now carries the human feedback.
with open(eval_file, "r", encoding="utf-8") as f:
    evaluation_data = json.load(f)

# Keep only records whose "selected" value is truthy and split each into a
# topic string (text_a) and the poem (text_b).
# NOTE(review): only selected poems survive this filter, so the dataset has
# no negative examples — confirm this is what the "reward model" should see.
reward_data = [
    {"text_a": f"주제: {item['topic']},", "text_b": item["poem"]}
     for item in evaluation_data if item["selected"]
]

reward_dataset = Dataset.from_list(reward_data)

In [None]:
# Data preprocessing
def preprocess_reward_data(sample):
    """Tokenize topic/poem text pairs and attach language-modeling labels.

    Args:
        sample: batched mapping with parallel "text_a" (topic string) and
            "text_b" (poem) lists (applied through
            ``Dataset.map(batched=True)``).

    Returns:
        The tokenizer output (truncated to 512 tokens, not padded) with an
        extra "labels" entry mirroring "input_ids"; positions whose attention
        mask is 0 are replaced by -100.
    """
    model_inputs = tokenizer(
        sample["text_a"],
        text_pair=sample["text_b"],
        max_length=512,
        truncation=True
    )

    # Derive the mask from attention_mask instead of comparing ids with
    # pad_token_id: that id may be None on a freshly loaded tokenizer or an
    # alias of EOS, and in the latter case real EOS tokens would be masked.
    model_inputs["labels"] = [
        [token_id if keep else -100 for token_id, keep in zip(ids, mask)]
        for ids, mask in zip(model_inputs["input_ids"], model_inputs["attention_mask"])
    ]

    return model_inputs

In [None]:
# Tokenize the reward dataset in batches and drop the raw text columns.
reward_dataset = reward_dataset.map(
    preprocess_reward_data,
    batched=True,
    remove_columns=["text_a", "text_b"]
)

##### (2) 학습 준비

- 양자화 설정 > 모델 로드
- LoRA 학습 설정
- TrainingArguments 설정

In [None]:
# Same 4-bit NF4 / double-quantization / float16-compute settings as the
# supervised fine-tuning stage, redefined for this section.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4"
)

In [None]:
# Load a fresh quantized copy of the base model (model_name, not the
# fine-tuned checkpoint) to serve as the reward model.
# NOTE(review): this is a causal-LM head, not a scalar-scoring head —
# confirm that is intended for a reward model.
reward_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto"
)

In [None]:
# LoRA settings for the reward model; identical values to the supervised
# fine-tuning stage.
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

In [None]:
# Prepare the quantized reward model for k-bit training, then attach the
# LoRA adapters.
reward_model = prepare_model_for_kbit_training(reward_model)

reward_model = get_peft_model(reward_model, lora_config)

In [None]:
# Reward-model training hyper-parameters: effective batch of 8 sequences
# (1 per device x 8 accumulation steps), 3 epochs, fp16 mixed precision,
# one checkpoint per epoch with at most 2 kept. Dataset columns are passed
# through untouched (remove_unused_columns=False).
reward_training_args = TrainingArguments(
    output_dir="./reward_model",
    save_strategy="epoch",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,
    learning_rate=2e-4,
    num_train_epochs=3,
    logging_dir="./logs",
    logging_steps=100,
    save_total_limit=2,
    remove_unused_columns=False,
    fp16=True
)

# The tokenizer is passed as `processing_class`, the same keyword this
# notebook already uses for ORPOTrainer; the older `tokenizer=` keyword of
# Trainer is deprecated and not accepted by newer transformers releases.
reward_trainer = Trainer(
    model=reward_model,
    args=reward_training_args,
    train_dataset=reward_dataset,
    processing_class=tokenizer
)

##### (3) 학습 진행

In [None]:
# Train the reward model with the Trainer configured above.
reward_trainer.train()

### 4. RLHF (ORPO)

##### (1) 모델 로드

In [None]:
# Reload the fine-tuned checkpoint and a fresh tokenizer for preference
# training.
model = AutoModelForCausalLM.from_pretrained(qlora_checkpoint)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Training mode, moved onto the GPU.
model.train()
model.cuda()

# Mark every parameter as trainable.
# NOTE(review): no quantization config is passed on this load and all
# parameters are unfrozen, so this looks like full-parameter training —
# confirm the GPU memory budget allows it.
for param in model.parameters():
    param.requires_grad = True

# Same pad-token fallback as the first tokenizer load.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

In [None]:
# !export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True

In [None]:
# torch.cuda.empty_cache()

##### (2) ORPO 데이터셋 준비

In [None]:
# Reload the evaluation file with the human feedback.
with open(eval_file, "r", encoding="utf-8") as f:
    evaluation_data = json.load(f)

# Tokenizer settings shared by prompt, chosen and rejected texts.
tokenize_kwargs = dict(
    truncation=True,
    padding="max_length",
    max_length=64,
    return_tensors="pt"
)

orpo_data = []

for item in evaluation_data:
    # Only poems marked as selected become preference examples.
    if not item["selected"]:
        continue

    prompt_text = f"주제: {item['topic']}\n\n이 주제에 맞는 시를 작성해 주세요."
    chosen_text = item["poem"]
    # NOTE(review): the rejected side is always an empty string — confirm
    # this is the intended negative example.
    rejected_text = ""

    tokenized_prompt = tokenizer(prompt_text, **tokenize_kwargs)
    tokenized_chosen = tokenizer(chosen_text, **tokenize_kwargs)
    tokenized_rejected = tokenizer(rejected_text, **tokenize_kwargs)

    # Tensors stay on the CPU: Dataset.from_list copies the values into its
    # own storage, so a GPU transfer here gains nothing and would fail on a
    # CPU-only machine.
    orpo_data.append({
        "prompt": prompt_text,
        "chosen": chosen_text,
        "rejected": rejected_text,
        "prompt_input_ids": tokenized_prompt["input_ids"].squeeze(0),
        "prompt_attention_mask": tokenized_prompt["attention_mask"].squeeze(0),
        "chosen_input_ids": tokenized_chosen["input_ids"].squeeze(0),
        "chosen_attention_mask": tokenized_chosen["attention_mask"].squeeze(0),
        "rejected_input_ids": tokenized_rejected["input_ids"].squeeze(0),
        "rejected_attention_mask": tokenized_rejected["attention_mask"].squeeze(0)
    })

# Build the dataset once, after the loop. Doing it inside the loop rebuilt it
# for every selected item and left `orpo_dataset` undefined when no item was
# selected.
orpo_dataset = Dataset.from_list(orpo_data)

##### (3) ORPO 설정

In [None]:
from trl import ORPOConfig

# ORPO training hyper-parameters: effective batch of 4 (1 per device x 4
# accumulation steps), 5 epochs, bf16 precision with gradient checkpointing,
# gradient clipping at 1.0, 100 warm-up steps, a checkpoint every 500 steps
# with at most 2 kept.
orpo_config = ORPOConfig(
    output_dir='./orpo_output',
    per_device_train_batch_size=1,
    num_train_epochs=5,
    learning_rate=2e-6,
    gradient_accumulation_steps=4,
    logging_steps=50,
    fp16=False,
    bf16=True,
    remove_unused_columns=False,
    gradient_checkpointing=True,
    max_grad_norm=1.0,
    warmup_steps=100,
    save_steps=500,
    save_total_limit=2
)

In [None]:
from trl.trainer.utils import DPODataCollatorWithPadding

# Preference-data collator: pads inputs with the tokenizer's pad id and
# labels with -100, configured for a decoder-only model.
# This rebinds `data_collator`, replacing the collator used for supervised
# fine-tuning.
data_collator = DPODataCollatorWithPadding(
    pad_token_id=tokenizer.pad_token_id,
    label_pad_token_id=-100,
    is_encoder_decoder=False
)

In [None]:
from trl import ORPOTrainer

# Assemble the ORPO trainer from the reloaded model, the ORPO config, the
# preference dataset, the collator and the tokenizer.
orpo_trainer = ORPOTrainer(
    model=model,
    args=orpo_config,
    train_dataset=orpo_dataset,
    data_collator=data_collator,
    processing_class=tokenizer
)

##### (4) ORPO 적용

In [None]:
# Run ORPO preference optimization.
orpo_trainer.train()

### 5. 최종 시 생성

In [None]:
# Placeholder path: replace "checkpoint-xxx" with the checkpoint directory
# actually written under ./orpo_output.
orpo_checkpoint = "./orpo_output/checkpoint-xxx"

model = AutoModelForCausalLM.from_pretrained(orpo_checkpoint)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Rebind generate_pipeline to the ORPO-tuned model; EOS doubles as the pad id
# during generation.
generate_pipeline = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    pad_token_id=tokenizer.eos_token_id
)

In [None]:
def generate_poem_final(num_samples=5):
    """Generate poems on randomly chosen topics with the final pipeline.

    Reads the module-level ``topics`` and ``generate_pipeline``.

    Args:
        num_samples: number of poems to generate (default 5).

    Returns:
        list[dict]: one ``{"topic", "poem"}`` record per generated poem;
        an empty list when ``num_samples`` is 0.
    """
    result = []

    for _ in range(num_samples):
        topic = random.choice(topics)
        input_text = f"주제: {topic}\n\n시:"

        # do_sample=True makes the temperature/top_p settings explicit;
        # without sampling enabled they are not applied.
        # NOTE(review): with the pipeline default (return_full_text=True)
        # "generated_text" still contains the prompt — confirm that is wanted.
        poem = generate_pipeline(
            input_text,
            max_new_tokens=100,
            do_sample=True,
            temperature=0.8,
            top_p=0.9
        )[0]["generated_text"]

        result.append({"topic": topic, "poem": poem})

    return result

In [None]:
# Generate the default five samples; the bare name on the last line is left
# as the cell's final expression (presumably so the notebook displays it).
generated_poem = generate_poem_final()
generated_poem