In [None]:
!pip install -U datasets transformers trl peft bitsandbytes accelerate

Collecting datasets
  Downloading datasets-3.0.1-py3-none-any.whl.metadata (20 kB)
Collecting transformers
  Downloading transformers-4.45.1-py3-none-any.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.4/44.4 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting trl
  Downloading trl-0.11.1-py3-none-any.whl.metadata (12 kB)
Collecting peft
  Downloading peft-0.13.0-py3-none-any.whl.metadata (13 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.44.1-py3-none-manylinux_2_24_x86_64.whl.metadata (3.5 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0

## Gemma 2 Model Fine Tuning - Lora Model 학습

In [1]:
import torch
import pandas as pd
from datasets import Dataset, load_dataset
import transformers
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, pipeline, TrainingArguments
from peft import LoraConfig, PeftModel
from trl import SFTTrainer




In [2]:
mean_five_elements_hanja_prompt = pd.read_csv('./dataset/prompt/mean_five_elements_hanja_prompt.csv')
mean_five_elements_solving_prompt = pd.read_csv('./dataset/prompt/mean_five_elements_solving_prompt.csv')
korean_female_naming_prompt = pd.read_csv('./dataset/prompt/female_names_prompt_deduplication.csv')
korean_male_naming_prompt = pd.read_csv('./dataset/prompt/male_names_prompt_deduplication.csv')
korean_five_elements_naming_prompt = pd.read_csv('./dataset/prompt/mean_five_elements_name_prompt.csv')
name_meaning_prompt = pd.read_csv('./dataset/prompt/name_meaning_prompt.csv')

In [3]:
df = pd.concat([
    mean_five_elements_hanja_prompt,
    mean_five_elements_solving_prompt,
    korean_female_naming_prompt,
    korean_male_naming_prompt,
    korean_five_elements_naming_prompt,
    name_meaning_prompt
], ignore_index=True)

dataset = Dataset.from_pandas(df)

print(dataset)

Dataset({
    features: ['question', 'answer'],
    num_rows: 9314
})


In [4]:
def generate_prompt(dataset):
    prompt_list = []
    for i in range(len(dataset['question'])):
        prompt_list.append(r"""<bos><start_of_turn>user
{}<end_of_turn>
<start_of_turn>model
{}<end_of_turn><eos>""".format(dataset['question'][i], dataset['answer'][i]))
    return prompt_list

In [5]:
base_model = "rtzr/ko-gemma-2-9b-it"
lora_model = "./lora-model"

In [6]:
lora_config = LoraConfig(
    lora_alpha = 8,
    lora_dropout = 0.1,
    r=16,
    target_modules=["k_proj", "o_proj", "v_proj", "q_proj", "gate_proj", "up_proj", "down_proj"],
    bias="none",
    task_type="CAUSAL_LM",
)

bnb_config = BitsAndBytesConfig(load_in_8bit=True)

In [None]:
model = AutoModelForCausalLM.from_pretrained(base_model, quantization_config=bnb_config)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/852 [00:00<?, ?B/s]

`low_cpu_mem_usage` was None, now set to True since model is quantized.


model.safetensors.index.json:   0%|          | 0.00/39.1k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/10 [00:00<?, ?it/s]

model-00001-of-00010.safetensors:   0%|          | 0.00/1.92G [00:00<?, ?B/s]

model-00002-of-00010.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00003-of-00010.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00004-of-00010.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00005-of-00010.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00006-of-00010.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00007-of-00010.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00008-of-00010.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00009-of-00010.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00010-of-00010.safetensors:   0%|          | 0.00/705M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/10 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/168 [00:00<?, ?B/s]

In [None]:
tokenizer = AutoTokenizer.from_pretrained(base_model)
tokenizer.padding_side = 'right'

tokenizer_config.json:   0%|          | 0.00/40.5k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

In [None]:
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    formatting_func=generate_prompt,
    max_seq_length=512,
    args=TrainingArguments(
        output_dir="./outputs",
        num_train_epochs = 1,
        max_steps=3000,
        per_device_train_batch_size=1,
        gradient_accumulation_steps=4,
        optim="paged_adamw_8bit",
        warmup_steps=1,
        learning_rate=2e-4,
        fp16=True,
        logging_steps=100,
        push_to_hub=False,
        report_to='none',
    ),
    peft_config=lora_config
)


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


Map:   0%|          | 0/9314 [00:00<?, ? examples/s]

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
max_steps is given, it will override any value given in num_train_epochs


In [None]:
trainer.train()



Step,Training Loss
100,1.0092
200,0.4134
300,0.3668
400,0.3558
500,0.3324
600,0.3369
700,0.3292
800,0.322
900,0.3108
1000,0.3117




TrainOutput(global_step=3000, training_loss=0.30585841941833497, metrics={'train_runtime': 13000.621, 'train_samples_per_second': 0.923, 'train_steps_per_second': 0.231, 'total_flos': 8.911649571929395e+16, 'train_loss': 0.30585841941833497, 'epoch': 1.2883830792355595})

In [None]:
trainer.model.save_pretrained(lora_model)

## Base Model과 Lora Model 병합

In [None]:
base_model = "rtzr/ko-gemma-2-9b-it"
lora_model = "./lora-model"
fine_tuning_model = "./saju-naming"

In [None]:
model = AutoModelForCausalLM.from_pretrained(base_model, device_map='auto', torch_dtype=torch.float16)

In [None]:
model = PeftModel.from_pretrained(model, lora_model, device_map='auto', torch_dtype=torch.float16)

In [None]:
model = model.merge_and_unload(progressbar=True)

## huggingface_hub 저장 또는 local 저장

In [None]:
from huggingface_hub import interpreter_login

interpreter_login()

In [None]:
# hugginface_hub 저장
model.push_to_hub('5KLetsGo/saju-naming')
tokenizer.push_to_hub('5KLetsGo/saju-naming')

In [None]:
# fine-tuning model local 저장
model.save_pretrained(fine_tuning_model)