In [1]:
import torch
import json
import os
from pathlib import Path
from typing import Optional
from dataclasses import dataclass, field

import transformers
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    DataCollatorForSeq2Seq,
)
from datasets import load_dataset, Dataset
from peft import (
    get_peft_model,
    LoraConfig,
    TaskType,
    prepare_model_for_kbit_training,
)
import bitsandbytes as bnb

Exception in thread Thread-4 (_readerthread):
Traceback (most recent call last):
  File "c:\Users\helen\anaconda3\envs\LLMenv\lib\threading.py", line 1016, in _bootstrap_inner
    self.run()
  File "c:\Users\helen\anaconda3\envs\LLMenv\lib\site-packages\ipykernel\ipkernel.py", line 766, in run_closure
    _threading_Thread_run(self)
  File "c:\Users\helen\anaconda3\envs\LLMenv\lib\threading.py", line 953, in run
    self._target(*self._args, **self._kwargs)
  File "c:\Users\helen\anaconda3\envs\LLMenv\lib\subprocess.py", line 1515, in _readerthread
    buffer.append(fh.read())
  File "c:\Users\helen\anaconda3\envs\LLMenv\lib\codecs.py", line 322, in decode
    (result, consumed) = self._buffer_decode(data, self.errors, final)
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xc0 in position 6: invalid start byte


In [2]:
# ========== GPU 메모리 확인 ==========
print("GPU 정보:")
print(f"GPU Name: {torch.cuda.get_device_name(0)}")
print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.2f} GB")

GPU 정보:
GPU Name: NVIDIA GeForce RTX 4060
GPU Memory: 8.00 GB


In [3]:
# ========== 설정 ==========
@dataclass
class Config:
    # 모델 설정
    model_name: str = "Qwen/Qwen2.5-1.5B-Instruct"  
    # model_name: str = "meta-llama/Llama-2-7b-hf"  # 7B 모델 (16GB 이상)
    
    # 데이터 경로
    train_data_path: str = "HTP_data.jsonl"
    
    # 출력 경로
    output_dir: str = "./htp_lora_model"
    
    # 배치 크기 (RTX 4060 8GB용)
    per_device_train_batch_size: int = 2
    per_device_eval_batch_size: int = 4
    gradient_accumulation_steps: int = 4
    
    # 학습 설정
    num_train_epochs: int = 10
    learning_rate: float = 2e-4
    warmup_ratio: float = 0.03
    
    # 텍스트 길이
    max_seq_length: int = 512
    
    # LoRA 설정
    lora_r: int = 8
    lora_alpha: int = 32
    lora_dropout: float = 0.05
    
    # 8-bit 양자화 (메모리 절감)
    use_8bit: bool = True
    
    # 기타
    seed: int = 42
    fp16: bool = True  # Mixed precision training


# ========== 데이터 로딩 ==========
def load_jsonl_data(file_path: str) -> list:
    """JSONL 파일 읽기"""
    data = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            if line.strip():
                data.append(json.loads(line))
    return data


def prepare_dataset(data: list, tokenizer, config: Config) -> Dataset:
    """데이터셋 준비"""
    
    def formatting_func(examples):
        # input과 output을 합쳐서 학습 데이터 생성
        texts = []
        for inp, out in zip(examples['input'], examples['output']):
            text = f"HTP 해석 입력: {inp}\n\nHTP 해석 출력: {out}"
            texts.append(text)
        
        # 토큰화
        tokenized = tokenizer(
            texts,
            max_length=config.max_seq_length,
            padding="max_length",
            truncation=True,
            return_tensors=None,
        )
        
        # labels 설정 (input_ids와 동일하게)
        tokenized["labels"] = tokenized["input_ids"].copy()
        
        return tokenized
    
    # Dataset으로 변환
    dataset = Dataset.from_dict({
        'input': [item['input'] for item in data],
        'output': [item['output'] for item in data],
    })
    
    # 처리
    processed_dataset = dataset.map(
        formatting_func,
        batched=True,
        batch_size=100,
        remove_columns=['input', 'output'],
        desc="포매팅 중...",
    )
    
    return processed_dataset


# ========== 모델 로딩 및 LoRA 설정 ==========
def setup_model_and_tokenizer(config: Config):
    """모델과 토크나이저 설정"""
    
    print(f"모델 로딩: {config.model_name}")
    
    # 토크나이저 로딩
    tokenizer = AutoTokenizer.from_pretrained(config.model_name)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    
    # 모델 로딩 (8-bit 양자화)
    if config.use_8bit:
        model = AutoModelForCausalLM.from_pretrained(
            config.model_name,
            load_in_8bit=True,
            torch_dtype=torch.float16,
            device_map="auto",
        )
        
        # 8-bit 양자화 모델 준비
        model = prepare_model_for_kbit_training(
            model,
            use_gradient_checkpointing=True,
        )
    else:
        model = AutoModelForCausalLM.from_pretrained(
            config.model_name,
            torch_dtype=torch.float16,
            device_map="auto",
        )
    
    # LoRA 설정
    peft_config = LoraConfig(
        task_type=TaskType.CAUSAL_LM,
        r=config.lora_r,
        lora_alpha=config.lora_alpha,
        lora_dropout=config.lora_dropout,
        bias="none",
        target_modules=["q_proj", "v_proj"],  # 모델에 따라 수정 필요
    )
    
    # LoRA 적용
    model = get_peft_model(model, peft_config)
    
    # 학습 가능한 파라미터 확인
    model.print_trainable_parameters()
    
    return model, tokenizer

In [4]:
# ========== 메인 학습 함수 ==========
config = Config()

# 시드 설정
transformers.set_seed(config.seed)

# 출력 디렉토리 생성
Path(config.output_dir).mkdir(parents=True, exist_ok=True)

# 1. 데이터 로드
print("데이터 로딩 중...")
raw_data = load_jsonl_data(config.train_data_path)
print(f"로드된 샘플 수: {len(raw_data)}")

# 2. 모델, 토크나이저 설정
model, tokenizer = setup_model_and_tokenizer(config)

# 3. 데이터셋 준비
print("데이터셋 준비 중...")
dataset = prepare_dataset(raw_data, tokenizer, config)

# 학습/검증 분할 (9:1)
split_dataset = dataset.train_test_split(test_size=0.1, seed=config.seed)
train_dataset = split_dataset["train"]
eval_dataset = split_dataset["test"]

print(f"학습 샘플: {len(train_dataset)}, 검증 샘플: {len(eval_dataset)}")

# 4. 학습 설정
training_args = TrainingArguments(
    output_dir=config.output_dir,
    overwrite_output_dir=True,
    
    # 배치 크기
    per_device_train_batch_size=config.per_device_train_batch_size,
    per_device_eval_batch_size=config.per_device_eval_batch_size,
    gradient_accumulation_steps=config.gradient_accumulation_steps,
    
    # 에폭
    num_train_epochs=config.num_train_epochs,
    
    # 학습률
    learning_rate=config.learning_rate,
    warmup_ratio=config.warmup_ratio,
    
    # 옵티마이저
    optim="paged_adamw_32bit",  # 8-bit 최적화
    
    # 저장 및 로깅
    save_strategy="epoch",
    logging_steps=10,
    eval_strategy="epoch",
    
    # 계산 최적화
    fp16=config.fp16,
    gradient_checkpointing=True,
    max_grad_norm=1.0,
    
    # 기타
    seed=config.seed,
    report_to=["tensorboard"],
    logging_dir="./logs",
)

# 5. Trainer 설정
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=DataCollatorForSeq2Seq(tokenizer, pad_to_multiple_of=8),
)

# 6. 학습 시작
print("학습 시작...")
trainer.train()

# 7. 모델 저장
print("모델 저장 중...")
model.save_pretrained(config.output_dir)
tokenizer.save_pretrained(config.output_dir)

print(f"✓ 파인튜닝 완료: {config.output_dir}")



데이터 로딩 중...
로드된 샘플 수: 1453
모델 로딩: Qwen/Qwen2.5-1.5B-Instruct


`torch_dtype` is deprecated! Use `dtype` instead!
The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


trainable params: 1,089,536 || all params: 1,544,803,840 || trainable%: 0.0705
데이터셋 준비 중...


포매팅 중...:   0%|          | 0/1453 [00:00<?, ? examples/s]

학습 샘플: 1307, 검증 샘플: 146


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


학습 시작...




Epoch,Training Loss,Validation Loss
1,0.2109,0.2133
2,0.1888,0.204048
3,0.184,0.199951
4,0.1916,0.197915
5,0.1711,0.195824
6,0.1686,0.196182
7,0.1627,0.195154
8,0.1561,0.19605
9,0.1506,0.196372
10,0.17,0.196334




모델 저장 중...
✓ 파인튜닝 완료: ./htp_lora_model


In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import AutoPeftModelForCausalLM

# ========== 파인튜닝된 모델 로드 ==========
def load_finetuned_model(model_path: str):
    """파인튜닝된 모델 로드"""
    
    # LoRA 모델 로드
    model = AutoPeftModelForCausalLM.from_pretrained(
        model_path,
        torch_dtype=torch.float16,
        device_map="auto",
    )
    
    # 토크나이저 로드
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    
    return model, tokenizer


# ========== 추론 함수 ==========
def generate_htp_interpretation(model, tokenizer, htp_input: str, max_length: int = 256):
    """HTP 해석 생성"""
    
    # 입력 준비
    prompt = f"HTP 해석 입력: {htp_input}\n\nHTP 해석 출력:"
    
    # 토큰화
    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        truncation=True,
        max_length=512,
    ).to(model.device)
    
    # 생성
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_length=max_length,
            num_beams=4,
            temperature=0.7,
            top_p=0.9,
            do_sample=True,
            eos_token_id=tokenizer.eos_token_id,
        )
    
    # 디코딩
    response = tokenizer.decode(outputs, skip_special_tokens=True)
    
    # 출력 부분만 추출
    if "HTP 해석 출력:" in response:
        response = response.split("HTP 해석 출력:")[-1].strip()
    
    return response





: 

In [None]:
# ========== 사용 예시 ==========
# 모델 로드
model, tokenizer = load_finetuned_model("./htp_lora_model")

# HTP 해석 생성
htp_input = "나무가 크고 가지가 많으며 뿌리가 깊게 표현됨"

interpretation = generate_htp_interpretation(
    model,
    tokenizer,
    htp_input,
    max_length=256,
)

print(f"입력: {htp_input}")
print(f"해석: {interpretation}")