# 데비&마를렌 VLM 파인튜닝 (Qwen2.5-VL + A100)

이터널리턴 전문가 봇 - 데비처럼 답하는 VLM

- **모델**: Qwen2.5-VL-3B-Instruct
- **GPU**: A100 필수!

예상 시간: 1~2시간

In [None]:
# 1. 패키지 설치
!pip install transformers>=4.45.0
!pip install trl>=0.12.0
!pip install peft>=0.13.0
!pip install accelerate>=0.34.0
!pip install bitsandbytes>=0.44.0
!pip install datasets pillow
!pip install qwen-vl-utils

In [None]:
# 2. GPU 확인
!nvidia-smi

import torch
print(f"GPU: {torch.cuda.get_device_name(0)}")
print(f"VRAM: {torch.cuda.get_device_properties(0).total_memory / 1e9:.0f} GB")

In [None]:
# 3. Mount Google Drive
# Mounts Drive at /content/drive; the dataset, the image archive and the saved
# adapter all live under /content/drive/MyDrive/eternal_return_vlm.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# 4. Configure paths and check that the input files exist
import os

DRIVE_PATH = "/content/drive/MyDrive/eternal_return_vlm"
DATASET_PATH = DRIVE_PATH + "/eternal_return_vlm_dataset.json"
EMOJIS_ZIP = DRIVE_PATH + "/emojis.zip"

# Each line prints True when the corresponding file is present.
print("Dataset:", os.path.exists(DATASET_PATH))
print("Emojis:", os.path.exists(EMOJIS_ZIP))

In [None]:
# 5. Extract the image archive
# IPython expands {EMOJIS_ZIP} before running the shell command.
# -q: quiet, -o: overwrite existing files without prompting, -d: target dir.
!unzip -q -o "{EMOJIS_ZIP}" -d /content/
# List the extracted folder as a quick sanity check.
!ls /content/emojis/

In [None]:
# 6. Load the dataset
import json
from PIL import Image

# The JSON file is read as UTF-8 and parsed into raw_data in one go.
with open(DATASET_PATH, encoding='utf-8') as dataset_file:
    raw_data = json.load(dataset_file)

sample_count = len(raw_data)
print(f"총 데이터: {sample_count}개")
# Show the first record so its structure can be inspected.
print(f"샘플: {raw_data[0]}")

In [None]:
# 7. Load the model and processor
import torch
from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor, BitsAndBytesConfig

MODEL_ID = "Qwen/Qwen2.5-VL-3B-Instruct"

# 4-bit quantization: NF4 weights with double quantization, bfloat16 compute.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
)

# The processor is reused by the collate function and by ask_debi.
processor = AutoProcessor.from_pretrained(MODEL_ID)

# Load the quantized base model; device_map="auto" lets the library place it.
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    MODEL_ID,
    quantization_config=bnb_config,
    device_map="auto",
    torch_dtype=torch.bfloat16,
)

# Report the model id and its memory footprint in GB.
print(f"모델: {MODEL_ID}")
print(f"메모리: {model.get_memory_footprint() / 1e9:.2f} GB")

In [None]:
# 8. LoRA configuration
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

# Rank-32 adapters (alpha 64, dropout 0.05) on the four attention projections;
# bias terms are not trained.
lora_config = LoraConfig(
    r=32,
    lora_alpha=64,
    lora_dropout=0.05,
    target_modules=["q_proj", "v_proj", "k_proj", "o_proj"],
    bias="none",
    task_type="CAUSAL_LM",
)

# Order matters: prepare the quantized model first, then wrap it with the
# adapters, then print how many parameters are trainable.
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

In [None]:
# 9. Data preprocessing (Qwen2.5-VL format)
from qwen_vl_utils import process_vision_info

# System prompt (Korean) describing the Debi persona; it is prepended to every
# training conversation and to every inference request in ask_debi.
DEBI_SYSTEM = """너는 데비야! 이터널리턴에서 온 캐릭터고, 지금은 디스코드 봇으로 활동 중이야.
성격: 밝고 활발하고 친근해! 마를렌 동생이랑 같이 다녀.
역할: 이터널리턴 게임 정보 (아이템, 캐릭터, 특성 등) 알려주기
말투: 반말로 친근하게! ~야, ~어, ~지, ~해 같은 어미 사용"""

def load_image(image_path):
    """Load an image stored under /content/ and return it in RGB mode.

    Args:
        image_path: Path relative to /content/, or None.

    Returns:
        The RGB image, or None when image_path is None or the file is missing.
    """
    if image_path is not None:
        candidate = f"/content/{image_path}"
        if os.path.exists(candidate):
            return Image.open(candidate).convert("RGB")
    return None

def convert_to_qwen_format(item):
    """데이터를 Qwen2.5-VL 메시지 형식으로 변환"""
    image = load_image(item.get("image"))
    if image is None:
        return None
    
    conversations = item.get("conversations", [])
    
    messages = [{"role": "system", "content": DEBI_SYSTEM}]
    
    for conv in conversations:
        role = conv["from"]
        value = conv["value"]
        
        if role == "human":
            # <image> 태그 제거하고 이미지 따로 처리
            text = value.replace("<image>\n", "").replace("<image>", "").strip()
            messages.append({
                "role": "user",
                "content": [
                    {"type": "image", "image": image},
                    {"type": "text", "text": text}
                ]
            })
        elif role == "gpt":
            messages.append({"role": "assistant", "content": value})
        elif role == "tool":
            messages.append({"role": "assistant", "content": f"[Tool Result]: {value}"})
    
    return {"messages": messages, "image": image}

print("데이터 변환 중...")
processed_data = [convert_to_qwen_format(item) for item in raw_data]
processed_data = [d for d in processed_data if d is not None]
print(f"변환 완료: {len(processed_data)}개")

In [None]:
# 10. Dataset 생성
from datasets import Dataset

dataset = Dataset.from_list(processed_data)
dataset = dataset.train_test_split(test_size=0.1, seed=42)

print(f"Train: {len(dataset['train'])}")
print(f"Test: {len(dataset['test'])}")

In [None]:
# 11. Collate Function (Qwen2.5-VL 전용)
def collate_fn(examples):
    texts = []
    all_images = []
    
    for example in examples:
        messages = example["messages"]
        
        # 템플릿 적용
        text = processor.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=False
        )
        texts.append(text)
        
        # 이미지 수집
        image_inputs, _ = process_vision_info(messages)
        if image_inputs:
            all_images.extend(image_inputs)
    
    # 프로세서로 인코딩
    batch = processor(
        text=texts,
        images=all_images if all_images else None,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=1024,
    )
    
    batch["labels"] = batch["input_ids"].clone()
    return batch

print("Collate function 준비 완료")

In [None]:
# 12. Training configuration
from trl import SFTTrainer, SFTConfig

training_args = SFTConfig(
    output_dir="./debi_qwen_vlm_lora",

    # Batch settings for an A100: effective batch size is 2 * 8 = 16.
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=8,

    # Schedule
    num_train_epochs=3,
    learning_rate=2e-4,
    warmup_ratio=0.05,
    lr_scheduler_type="cosine",

    # Optimization
    bf16=True,
    gradient_checkpointing=True,
    optim="adamw_8bit",

    # Logging and checkpoints; eval and save share the same step interval,
    # which load_best_model_at_end requires.
    logging_steps=10,
    eval_strategy="steps",
    eval_steps=100,
    save_strategy="steps",
    save_steps=100,
    save_total_limit=3,

    # Misc
    # Keep the "messages"/"image" columns: the custom collate function reads them.
    remove_unused_columns=False,
    # The custom collate function does all tokenisation and image encoding, so
    # SFTTrainer must not run its own text-only dataset preparation.
    dataset_kwargs={"skip_prepare_dataset": True},
    max_length=1024,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    report_to="none",
)

print("학습 설정 완료!")

In [None]:
# 13. Create the trainer
# Wires together the LoRA-wrapped model, the training arguments, both dataset
# splits and the custom collate function that builds multimodal batches.
trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    data_collator=collate_fn,
)

print("Trainer 준비 완료!")

In [None]:
# 14. Start training
banner = "=" * 50

print(banner)
print("학습 시작! 예상 시간: 1~2시간")
print(banner)

# Runs the full training loop configured in training_args.
trainer.train()

print(banner)
print("학습 완료!")
print(banner)

In [None]:
# 15. Save the model
# Saved under the mounted Drive folder, next to the dataset.
SAVE_PATH = f"{DRIVE_PATH}/debi_qwen_vlm_lora"

# Save the trainer's model and the processor into the same directory.
trainer.save_model(SAVE_PATH)
processor.save_pretrained(SAVE_PATH)

print(f"저장 완료: {SAVE_PATH}")

In [None]:
# 16. 테스트 함수
def ask_debi(image_path, question):
    """Ask the model one question about an image and print the answer.

    Args:
        image_path: Path of the image file to show the model.
        question: User question about the image.

    Returns:
        None. The question, the decoded answer and a separator are printed.
    """
    # The context manager closes the file as soon as the RGB copy exists.
    with Image.open(image_path) as source:
        image = source.convert("RGB")

    messages = [
        {"role": "system", "content": DEBI_SYSTEM},
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": question},
            ],
        },
    ]

    # add_generation_prompt=True so the rendered text ends where the
    # assistant reply should begin.
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    image_inputs, _ = process_vision_info(messages)

    inputs = processor(
        text=[text],
        images=image_inputs,
        return_tensors="pt",
        padding=True,
    ).to(model.device)

    # The model may still be in training mode after trainer.train(); eval()
    # disables dropout (including the LoRA dropout) for generation.
    model.eval()
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=200,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
        )

    # generate() returns prompt + completion; keep only the completion.
    prompt_length = inputs.input_ids.shape[1]
    output_ids = outputs[0][prompt_length:]
    response = processor.decode(output_ids, skip_special_tokens=True)

    print(f"Q: {question}")
    print(f"A: {response}")
    print("-"*40)

In [None]:
# 17. Run the tests
banner = "=" * 50
print(banner)
print("데비 테스트")
print(banner)

# (image path, question) pairs, in order: item lookup, character lookup,
# and a stat query meant to exercise tool use.
test_cases = [
    ("/content/emojis/items_graded/202503.png", "이 아이템 뭐야?"),
    ("/content/emojis/characters/Aya.png", "이 캐릭터 누구야?"),
    ("/content/emojis/items_graded/116409.png", "이 아이템 스탯 알려줘"),
]

for image_path, question in test_cases:
    ask_debi(image_path, question)

# 완료!

저장 위치: `Drive/eternal_return_vlm/debi_qwen_vlm_lora/`

## Discord 봇에서 사용
```python
from peft import PeftModel
from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor

# Base model
model = Qwen2_5_VLForConditionalGeneration.from_pretrained("Qwen/Qwen2.5-VL-3B-Instruct")

# Apply the LoRA adapter (expects a local copy of the saved adapter directory)
model = PeftModel.from_pretrained(model, "./debi_qwen_vlm_lora")
```