In [3]:
%%capture
%pip install accelerate peft bitsandbytes transformers trl

In [4]:
import os
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import LoraConfig
from trl import SFTTrainer

In [5]:
# Model from Hugging Face hub
base_model = "MLP-KTLim/llama-3-Korean-Bllossom-8B"

# New instruction dataset
finance_dataset = "./data/train.csv"

# Fine-tuned model
new_model = "llama-3-Korean-Bllossom-8B-fin"

In [6]:
dataset = load_dataset("csv", data_files=finance_dataset)

In [7]:
dataset

DatasetDict({
    train: Dataset({
        features: ['SAMPLE_ID', 'Source', 'Source_path', 'Question', 'Answer'],
        num_rows: 496
    })
})

In [8]:
print("Source: ", dataset["train"][0]["Source"])
print("Question: ", dataset["train"][0]["Question"])
print("Answer: ", dataset["train"][0]["Answer"])

Source:  1-1 2024 주요 재정통계 1권
Question:  2024년 중앙정부 재정체계는 어떻게 구성되어 있나요?
Answer:  2024년 중앙정부 재정체계는 예산(일반·특별회계)과 기금으로 구분되며, 2024년 기준으로 일반회계 1개, 특별회계 21개, 기금 68개로 구성되어 있습니다.


In [9]:
def formatting_prompts_func(example):
    output_texts = []
    for i in range(len(example['Question'])):
        text = f"### Question: {example['Question'][i]}\n ### Answer: {example['Answer'][i]}"
        output_texts.append(text)
    return {'text': output_texts}

# 데이터셋 전처리 및 'text' 필드 추가
formatted_dataset = dataset['train'].map(formatting_prompts_func, batched=True)

# 전처리된 데이터셋 확인
print(formatted_dataset.column_names)  # ['SAMPLE_ID', 'Source', 'Source_path', 'Question', 'Answer', 'text']

['SAMPLE_ID', 'Source', 'Source_path', 'Question', 'Answer', 'text']


In [10]:
formatted_dataset

Dataset({
    features: ['SAMPLE_ID', 'Source', 'Source_path', 'Question', 'Answer', 'text'],
    num_rows: 496
})

In [11]:
print("Source: ", formatted_dataset[0]["Source"])
print("Question: ", formatted_dataset[0]["Question"])
print("Answer: ", formatted_dataset[0]["Answer"])
print("text: ", formatted_dataset[0]["text"])

Source:  1-1 2024 주요 재정통계 1권
Question:  2024년 중앙정부 재정체계는 어떻게 구성되어 있나요?
Answer:  2024년 중앙정부 재정체계는 예산(일반·특별회계)과 기금으로 구분되며, 2024년 기준으로 일반회계 1개, 특별회계 21개, 기금 68개로 구성되어 있습니다.
text:  ### Question: 2024년 중앙정부 재정체계는 어떻게 구성되어 있나요?
 ### Answer: 2024년 중앙정부 재정체계는 예산(일반·특별회계)과 기금으로 구분되며, 2024년 기준으로 일반회계 1개, 특별회계 21개, 기금 68개로 구성되어 있습니다.


In [12]:
compute_dtype = getattr(torch, "float16")

quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=False,
)

In [13]:
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=quant_config,
    device_map={"": 0}
)
model.config.use_cache = False
model.config.pretraining_tp = 1

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [14]:
tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [15]:
peft_params = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.1,
    r=64,
    bias="none",
    task_type="CAUSAL_LM",
)

In [16]:
training_params = TrainingArguments(
    output_dir="./results",
    num_train_epochs=1,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=1,
    optim="paged_adamw_32bit",
    save_steps=25,
    logging_steps=25,
    learning_rate=2e-4,
    weight_decay=0.001,
    fp16=False,
    bf16=False,
    max_grad_norm=0.3,
    max_steps=-1,
    warmup_ratio=0.03,
    group_by_length=True,
    lr_scheduler_type="constant",
    report_to="tensorboard"
)

In [17]:
def formatting_prompts_func(example):
    output_texts = []
    for i in range(len(example['Question'])):
        text = f"### Question: {example['Question'][i]}\n ### Answer: {example['Answer'][i]}"
        output_texts.append(text)
    return output_texts

In [18]:
trainer = SFTTrainer(
    model=model,
    train_dataset=formatted_dataset,
    peft_config=peft_params,
    dataset_text_field="text",
    max_seq_length=None,
    tokenizer=tokenizer,
    args=training_params,
    packing=False,
)


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


Map:   0%|          | 0/496 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [19]:
import gc
gc.collect()
torch.cuda.empty_cache()

In [20]:
trainer.train()

OutOfMemoryError: CUDA out of memory. Tried to allocate 224.00 MiB. GPU 