In [1]:
import torch
from unsloth import FastLanguageModel, is_bfloat16_supported
from trl import SFTTrainer
from transformers import TrainingArguments 
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


In [2]:
# Restrict this process to GPU 0, with devices numbered by PCI bus id.
# NOTE(review): these variables are normally read when CUDA is first
# initialised, and the import cell above has already imported torch and
# unsloth -- confirm the setting still takes effect, or run this cell first.
import os

os.environ.update(
    {
        "CUDA_DEVICE_ORDER": "PCI_BUS_ID",
        "CUDA_VISIBLE_DEVICES": "0",
    }
)

In [3]:
# Load the base model together with its tokenizer.
model, tokenizer = FastLanguageModel.from_pretrained(
    # Checkpoint identifier passed to the loader.
    model_name="yanolja/EEVE-Korean-Instruct-10.8B-v1.0",
    # Sequence length; the trainer below is configured with the same value.
    max_seq_length=2048,
    # Load the weights 4-bit quantised.
    load_in_4bit=True,
    # None leaves the dtype choice to the library.
    dtype=None,
)


==((====))==  Unsloth 2024.8: Fast Llama patching. Transformers = 4.44.2.
   \\   /|    GPU: NVIDIA GeForce RTX 3080 Ti. Max memory: 11.755 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.4.0+cu121. CUDA = 8.6. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.27.post2. FA2 = True]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


Loading checkpoint shards: 100%|██████████| 5/5 [00:04<00:00,  1.20it/s]


In [4]:
# Wrap the loaded model with LoRA adapters (PEFT configuration).
model = FastLanguageModel.get_peft_model(
    model,
    # Projection layers that receive adapters: attention first, then MLP.
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ],
    # Adapter rank and scaling.
    r=16,
    lora_alpha=16,
    # No dropout and no bias terms on the adapters.
    lora_dropout=0,
    bias="none",
    # Optional variants are switched off.
    use_rslora=False,
    loftq_config=None,
    # Fixed seed (same value as the training seed below).
    random_state=3407,
)

Unsloth 2024.8 patched 48 layers with 48 QKV layers, 48 O layers and 48 MLP layers.


In [5]:
# Load the conversation records from the local JSON Lines file as one split.
dataset = load_dataset(
    "json",
    data_files="./dataset/results.jsonl",
    split="train",
)

# End-of-sequence token string; format_conversation appends it to each example.
EOS_TOKEN = tokenizer.eos_token

def format_conversation(row, eos_token=None):
    """Render one dataset record as a single training string.

    The result has the shape ``"<Author>: <Input>\\nAI: <Response><eos>"``.

    Args:
        row: Mapping that provides the ``'Author'``, ``'Input'`` and
            ``'Response'`` keys.
        eos_token: Text appended after the response. When omitted, the
            notebook-level ``EOS_TOKEN`` is used, so existing one-argument
            callers behave exactly as before.

    Returns:
        The formatted conversation string.

    Raises:
        KeyError: If ``row`` lacks one of the three expected keys.
    """
    # Resolve the default lazily so the function no longer needs the global
    # when the caller supplies the token explicitly.
    if eos_token is None:
        eos_token = EOS_TOKEN
    return f"{row['Author']}: {row['Input']}\nAI: {row['Response']}{eos_token}"

# Attach the rendered training string to every record under the "text" key,
# which is the field name the trainer is configured to read.
dataset = dataset.map(
    lambda record: {"text": format_conversation(record)},
)

In [6]:
# Training configuration.
training_args = TrainingArguments(
    # 2 samples per device x 4 accumulation steps = 8 samples per optimizer step.
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    warmup_steps=5,
    # Training length is controlled by max_steps alone. A num_train_epochs=3
    # setting used to sit next to it, but the trainer reports that max_steps
    # overrides it (the recorded run covered about 6 epochs), so the dead,
    # misleading value was removed.
    max_steps=1500,
    learning_rate=2e-4,
    # Use bf16 when the GPU supports it, otherwise fall back to fp16.
    fp16=not is_bfloat16_supported(),
    bf16=is_bfloat16_supported(),
    # Log the loss at every step.
    logging_steps=1,
    optim="adamw_8bit",
    weight_decay=0.01,
    lr_scheduler_type="cosine",
    seed=3407,
    output_dir="outputs",
)


In [7]:

# Build the supervised fine-tuning trainer (training starts in the next cell).
trainer = SFTTrainer(
    # What to train.
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    # What to train on: the "text" column added to the dataset above.
    train_dataset=dataset,
    dataset_text_field="text",
    dataset_num_proc=2,
    # Sequence handling: same length as the model load, no example packing.
    max_seq_length=2048,
    packing=False,
)


max_steps is given, it will override any value given in num_train_epochs


In [8]:
# Run the fine-tuning loop. As the last expression of the cell, the returned
# TrainOutput (global step, training loss, metrics) is displayed below.
trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 2,024 | Num Epochs = 6
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 1,500
 "-____-"     Number of trainable parameters = 62,914,560
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Step,Training Loss
1,2.6676
2,3.0124
3,2.5444
4,2.5555
5,2.6045
6,2.8405
7,2.6355
8,2.5121
9,1.9251
10,2.0488


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


TrainOutput(global_step=1500, training_loss=0.6264957962607344, metrics={'train_runtime': 6343.3156, 'train_samples_per_second': 1.892, 'train_steps_per_second': 0.236, 'total_flos': 1.818184885559132e+17, 'train_loss': 0.6264957962607344, 'epoch': 5.928853754940711})

In [10]:
# Directory that receives both the trained model files and the tokenizer files.
save_directory = "./trained_model"

# Save the PEFT-wrapped model first, then the tokenizer, into the same
# directory. The tokenizer call stays last so the paths it returns are shown
# as the cell output.
model.save_pretrained(save_directory)
tokenizer.save_pretrained(save_directory)

('./trained_model/tokenizer_config.json',
 './trained_model/special_tokens_map.json',
 './trained_model/tokenizer.json')

# 모델 로드 및 사용

In [3]:
import torch
from transformers import AutoTokenizer

save_directory = "./trained_model"

# Reload the fine-tuned model and its tokenizer (returned as a tuple).
# The loader settings are spelled out to match the training-time load instead
# of relying on library defaults, so both sections of the notebook agree on
# sequence length, dtype selection and 4-bit quantisation.
# NOTE(review): FastLanguageModel comes from the import cell at the top of the
# notebook; that cell must have been run in this kernel first.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=save_directory,
    max_seq_length=2048,
    dtype=None,
    load_in_4bit=True,
)



==((====))==  Unsloth 2024.8: Fast Llama patching. Transformers = 4.44.2.
   \\   /|    GPU: NVIDIA GeForce RTX 3080 Ti. Max memory: 11.755 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.4.0+cu121. CUDA = 8.6. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.27.post2. FA2 = True]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


Loading checkpoint shards: 100%|██████████| 5/5 [00:04<00:00,  1.21it/s]
Unsloth 2024.8 patched 48 layers with 48 QKV layers, 48 O layers and 48 MLP layers.


In [4]:
from transformers import pipeline

# Switch the model to Unsloth's faster inference mode.
# The call patches `model` in place, so its return value is deliberately not
# assigned back: on library versions where the call returns nothing, rebinding
# would leave `model` as None and break the generate() call below.
FastLanguageModel.for_inference(model)

# Question used to smoke-test the fine-tuned model.
# NOTE(review): the training examples were rendered as
# "<Author>: <Input>\nAI: <Response>", while this prompt is the bare
# question -- confirm whether it should follow the same template.
prompt = "글쓰기에서 '디테일'의 중요성은 무엇인가요?"

# Tokenise the single-prompt batch into PyTorch tensors, truncated to at most
# 512 tokens, then move the encoded batch to the GPU.
inputs = tokenizer(
    [prompt],
    max_length=512,
    truncation=True,
    padding=True,
    return_tensors="pt",
)
inputs = inputs.to("cuda")

# Generate a continuation for the prompt with gradient tracking disabled.
with torch.no_grad():
    outputs = model.generate(
        **inputs,
        # Output size: at most 100 new tokens, one sequence per prompt.
        max_new_tokens=100,
        num_return_sequences=1,
        # Repetition control: penalise repeated tokens and forbid repeated bigrams.
        repetition_penalty=1.2,
        no_repeat_ngram_size=2,
        # Explicit end-of-sequence and padding token ids.
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.pad_token_id,
        # Reuse the key/value cache between decoding steps.
        use_cache=True,
    )

# Decode the generated token ids back into text, dropping special tokens.
generated_text_list = tokenizer.batch_decode(outputs, skip_special_tokens=True)

# Only one prompt was submitted, so the first entry is the full decoded text.
full_text = generated_text_list[0]

# Keep the segment that follows the first "AI:" marker. When the marker is
# missing, fall back to the whole decoded text; previously `generated_text`
# was left unassigned in that case, so the print below raised NameError (or
# silently showed a stale value from an earlier run of this cell).
if "AI:" in full_text:
    generated_text = full_text.split("AI:")[1].strip()
else:
    generated_text = full_text.strip()

print("AI:", generated_text)

AI: 디테일은 글에 깊이와 현실감을 더해줍니다. 구체적인 예시나 설명을 통해 독자가 더 잘 이해할 수 있도록 돕고, 글의 신뢰성을 높이는 데 기여합니다.
