In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
from datasets import load_dataset, Dataset
import torch
from tqdm import tqdm
import bitsandbytes
import peft

In [2]:
from datasets import Dataset
import boto3
from io import BytesIO
import pandas as pd

# boto3 클라이언트 설정
client = boto3.client(
    's3',
    endpoint_url="http://artifact-store:9000",
    aws_access_key_id="minio",
    aws_secret_access_key="miniostorage"
)

# 방법 1: pandas 경유 (추천)
response = client.get_object(Bucket="instruction", Key="data/0.0.1v/agriculture.trainset.parquet")
parquet_data = response['Body'].read()

# pandas로 읽고 Dataset으로 변환
df = pd.read_parquet(BytesIO(parquet_data))
dataset = Dataset.from_pandas(df)

print(f"데이터 크기: {len(dataset)}")
print(f"컬럼: {dataset.column_names}")

데이터 크기: 18267
컬럼: ['QUESTION', 'ANSWER']


In [3]:
dataset

Dataset({
    features: ['QUESTION', 'ANSWER'],
    num_rows: 18267
})

In [4]:
from transformers import AutoModelForCausalLM, BitsAndBytesConfig
import torch

model_name = "Qwen/Qwen2.5-7B-Instruct"

# bf16을 사용할 수 있다면 권장 (3090Ti는 지원 O)
compute_dtype = torch.bfloat16

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",            # nf4는 QLoRA와 호환성 좋음
    bnb_4bit_compute_dtype=compute_dtype, # torch.bfloat16 또는 torch.float16
    bnb_4bit_use_double_quant=True
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",                    # multi-GPU or 자동 GPU 할당
    trust_remote_code=True
)

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token  


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [5]:
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

# QLoRA 설정
peft_config = LoraConfig(
    r=16,  # rank
    lora_alpha=32,  # scaling parameter
    target_modules=["q_proj", "v_proj", "k_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],  # Qwen2.5용
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM",
)

# 모델 준비
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, peft_config)

In [6]:
def format_agriculture_instruction(example):
    messages = [
        {
            "role": "system", 
            "content": "너는 농업 전문가야. 농업과 관련된 질문에 전문적이고 실용적인 답변을 제공해줘."
        },
        {
            "role": "user", 
            "content": example["QUESTION"]
        },
        {
            "role": "assistant", 
            "content": example["ANSWER"]
        }
    ]
    
    formatted = tokenizer.apply_chat_template(
        messages, 
        tokenize=False,
        add_generation_prompt=False
    )
    
    return {"text": formatted}


train_dataset = dataset.map(format_agriculture_instruction).remove_columns(['QUESTION', 'ANSWER'])
train_dataset["text"][0]

# train/eval 분할
train_test_split = train_dataset.train_test_split(test_size=0.1, seed=42)
train_dataset = train_test_split['train']
eval_dataset = train_test_split['test']

print(f"Train: {len(train_dataset)}, Eval: {len(eval_dataset)}")

Map:   0%|          | 0/18267 [00:00<?, ? examples/s]

Train: 16440, Eval: 1827


In [9]:
from transformers import EarlyStoppingCallback

training_args = TrainingArguments(
    output_dir="./qwen_outputs",
    per_device_train_batch_size=2,
    per_device_eval_batch_size=4,  # eval은 더 크게
    gradient_accumulation_steps=8,
    num_train_epochs=10,  # 넉넉하게 설정
    learning_rate=2e-5,
    lr_scheduler_type="cosine",
    warmup_ratio=0.1,
    
    # Evaluation 설정
    eval_strategy="steps",
    eval_steps=100,  # 100 step마다 평가
    save_strategy="steps",
    save_steps=100,
    
    # Early Stopping 설정
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    
    logging_steps=50,
    bf16=True,
    save_total_limit=3,  # 더 많이 저장
    report_to=["mlflow"],
    run_name="qwen-qlora-agriculture",
    label_names=["labels"],
    seed=42,
)

In [16]:
! pip install mlflow

Collecting mlflow
  Downloading mlflow-3.1.1-py3-none-any.whl (24.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.7/24.7 MB[0m [31m10.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting scikit-learn<2
  Downloading scikit_learn-1.7.1-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (9.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.7/9.7 MB[0m [31m11.4 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting alembic!=1.10.0,<2
  Downloading alembic-1.16.4-py3-none-any.whl (247 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m247.0/247.0 KB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
Collecting mlflow-skinny==3.1.1
  Downloading mlflow_skinny-3.1.1-py3-none-any.whl (1.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m11.1 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting gunicorn<24
  Downloading gunicorn-23.0.0-py3-none-any.whl (8

In [10]:
import mlflow
mlflow.set_tracking_uri("http://mlflow-server:5000")  # 또는 mlruns 디렉토리
mlflow.set_experiment("Qwen-Finetuning")

<Experiment: artifact_location='s3://mlflow/1', creation_time=1753066942916, experiment_id='1', last_update_time=1753066942916, lifecycle_stage='active', name='Qwen-Finetuning', tags={}>

In [11]:
from trl import SFTConfig, SFTTrainer

# SFTConfig로 설정 통합
sft_config = SFTConfig(
    output_dir="./qwen_outputs",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=8,
    num_train_epochs=3,
    learning_rate=2e-5,
    logging_steps=10,
    save_strategy="epoch",
    bf16=True,
    save_total_limit=2,
    report_to=["mlflow"],
    run_name="qwen-qlora-finetune",
    dataset_text_field="text",
    max_seq_length=2048,
)

trainer = SFTTrainer(
    model=model,
    train_dataset=train_dataset,
    args=sft_config,
    processing_class=tokenizer,
    peft_config=peft_config,
)

Adding EOS to train dataset:   0%|          | 0/16440 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/16440 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/16440 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [None]:
trainer.train()

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
  return fn(*args, **kwargs)


Step,Training Loss
10,2.6764
20,2.0426
30,1.5577
40,1.3005
50,1.154
60,1.0568
70,1.0313
80,1.0395
90,1.0328
100,0.9805


In [6]:
model

NameError: name 'model' is not defined