In [1]:
import os
import torch
import pandas as pd
from datasets import Dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, PeftModel, prepare_model_for_kbit_training
from trl import SFTTrainer

In [None]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [3]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16
)
BASE_MODEL = "beomi/gemma-ko-7b"

# 모델 & 토크나이저 로드
model = AutoModelForCausalLM.from_pretrained(BASE_MODEL, quantization_config=bnb_config, device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

Loading checkpoint shards:   0%|          | 0/6 [00:00<?, ?it/s]

In [5]:
# ========== 1. 데이터 로드 및 전처리 ==========
df = pd.read_csv("./data/train.csv", encoding="utf-8-sig")

# 프롬프트 생성
def generate_prompt(example):
    prompt_list = []
    for i in range(len(example['input'])):
        prompt_list.append(r"""<bos><start_of_turn>user
글자 수와 띄어쓰기를 유지하여 리뷰를 난독화하세요:

{}<end_of_turn>
<start_of_turn>model
{}<end_of_turn><eos>""".format(example['output'][i], example['input'][i]))
    return prompt_list


In [6]:
df['prompt'] = generate_prompt(df)

In [7]:
df['prompt'][0]

'<bos><start_of_turn>user\n글자 수와 띄어쓰기를 유지하여 리뷰를 난독화하세요:\n\n별 한 개도 아깝다. 왜 사람들이 별 1개를 주는지 겪어본 사람으로서 말로 설명하자니 댓글로는 너무 길고... 아무튼 두 번 다시 가길 싫은 곳. 캠핑을 20여 년 다녀본 곳 중 제일 기분 나빴던 곳.<end_of_turn>\n<start_of_turn>model\n별 한 게토 았깝땀. 왜 싸람듯릭 펼 1캐를 쥰눈징 컥꺾폰 싸람믐롯섞 맒록 섧멍핥쟈닐 탯끎룐눈 녀뮤 퀼교... 야뭍툰 둠 변 닺씨 깍낄 싫훈 굣. 깸삥읊 20여 년 댜녁뵨 곧 중 쩨윌 귑푼 낙팠떤 곶.<end_of_turn><eos>'

In [8]:
df = df.drop(columns=['ID', 'input', 'output'])
df.head()

Unnamed: 0,prompt
0,<bos><start_of_turn>user\n글자 수와 띄어쓰기를 유지하여 리뷰를...
1,<bos><start_of_turn>user\n글자 수와 띄어쓰기를 유지하여 리뷰를...
2,<bos><start_of_turn>user\n글자 수와 띄어쓰기를 유지하여 리뷰를...
3,<bos><start_of_turn>user\n글자 수와 띄어쓰기를 유지하여 리뷰를...
4,<bos><start_of_turn>user\n글자 수와 띄어쓰기를 유지하여 리뷰를...


In [9]:
dataset = Dataset.from_pandas(df)
dataset = dataset.map(lambda samples: tokenizer(samples["prompt"], padding=True, truncation=True, max_length=1390, return_tensors="pt").to("cuda"), batched=True)

train_dataset = dataset
#train_test_split = dataset.train_test_split(test_size=0.1, seed=42)
#train_dataset = train_test_split['train']
#test_dataset = train_test_split['test']

Map:   0%|          | 0/11263 [00:00<?, ? examples/s]

In [10]:
train_dataset

Dataset({
    features: ['prompt', 'input_ids', 'attention_mask'],
    num_rows: 11263
})

In [11]:
# ========== 2. 환경 세팅 ==========
peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj", "k_proj", "o_proj", "gate_proj", "down_proj", "up_proj"],
    lora_dropout=0.05,
    task_type="CAUSAL_LM"
)
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, peft_config)

model.train()

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): GemmaForCausalLM(
      (model): GemmaModel(
        (embed_tokens): Embedding(256000, 3072, padding_idx=0)
        (layers): ModuleList(
          (0-27): 28 x GemmaDecoderLayer(
            (self_attn): GemmaAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=3072, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=3072, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
            

In [12]:
# ========== 3. Fine-tuning ==========
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=1,
    per_device_train_batch_size=2,
    #per_device_eval_batch_size=2,
    gradient_accumulation_steps=4,
    optim="paged_adamw_32bit",
    #eval_strategy="steps",
    #eval_steps=100, # 모델의 평가 주기
    logging_dir="./logs",
    logging_steps=100,
    warmup_steps=10, # 학습률 스케줄링
    logging_strategy="steps",
    learning_rate=2e-4,
    group_by_length=True,
    fp16=True,
)

trainer = SFTTrainer(
    model=model,
    train_dataset=train_dataset,
    #eval_dataset=test_dataset,
    args=training_args,
    peft_config=peft_config,
    processing_class=tokenizer,
    formatting_func=lambda x: x['input_ids']
)

model.config.use_cache = False
torch.cuda.empty_cache()
trainer.train()

  return fn(*args, **kwargs)


Step,Training Loss
100,1.5601
200,1.224
300,1.207
400,1.1804
500,1.1389
600,1.152
700,1.1474
800,1.1486
900,1.107
1000,1.1169


  return fn(*args, **kwargs)
  return fn(*args, **kwargs)


TrainOutput(global_step=1408, training_loss=1.167559806596149, metrics={'train_runtime': 13207.5681, 'train_samples_per_second': 0.853, 'train_steps_per_second': 0.107, 'total_flos': 6.999991643215565e+17, 'train_loss': 1.167559806596149, 'epoch': 1.0})

In [13]:
# ========== 4. 모델 저장 ==========
ADAPTER_MODEL = "lora_adapter_encoder"

trainer.model.save_pretrained(ADAPTER_MODEL)

In [14]:
model = AutoModelForCausalLM.from_pretrained(BASE_MODEL, device_map='auto', torch_dtype=torch.bfloat16)
model = PeftModel.from_pretrained(model, ADAPTER_MODEL, device_map='auto', torch_dtype=torch.bfloat16)

model.save_pretrained('gemma-ko-7b-finetuning-encoder')

Loading checkpoint shards:   0%|          | 0/6 [00:00<?, ?it/s]