<a href="https://colab.research.google.com/github/Ahnkyuwon504/AI-modeling/blob/main/BX-gpt-app/train.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 1. 환경 설정

In [1]:
!pip3 install -q -U transformers==4.38.2
!pip3 install -q -U datasets==2.18.0
!pip3 install -q -U bitsandbytes==0.42.0
!pip3 install -q -U peft==0.9.0
!pip3 install -q -U trl==0.7.11
!pip3 install -q -U accelerate==0.27.2
!pip3 install -q -U pandas

In [2]:
import torch
from datasets import Dataset, load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, pipeline, TrainingArguments
from peft import LoraConfig, PeftModel
from trl import SFTTrainer

In [3]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

# 2. Dataset

In [4]:
preprocessed_data_path = "/content/drive/MyDrive/AI-modeling/BX-gpt-app/CBP-Certi-instruction-data-preprocessed.jsonl"

import pandas as pd
import json

# 데이터 읽기
data = []
with open(preprocessed_data_path, "r", encoding="utf-8") as f:
    for line in f:
        json_line = json.loads(line)
        data.append({
            "question": json_line["question"],
            "answer": json_line["answer"]
        })

# 데이터셋 생성
df = pd.DataFrame(data)
dataset = Dataset.from_pandas(df)
# df.head()
dataset[0]

{'question': '우리나라 은행 IT 시스템의 발전 과정은 어떻게 되나요?',
 'answer': '1970년대 과목 별 온라인 시스템, 1980년대 종합 온라인 시스템, 1990년대 차세대 시스템, 그리고 포스트 차세대 시스템으로 진화해 왔습니다.'}

# 3. Pretrained Model load

In [5]:
PRE_TRAINED_MODEL = "beomi/gemma-ko-2b"

model = AutoModelForCausalLM.from_pretrained(PRE_TRAINED_MODEL, device_map={"":0})
tokenizer = AutoTokenizer.from_pretrained(PRE_TRAINED_MODEL, add_special_tokens=True)



config.json:   0%|          | 0.00/645 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/13.5k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.11k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/555 [00:00<?, ?B/s]

In [11]:
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=200
)

In [17]:
prompt = "잔액유형의 갱신 방법은 어떻게 관리되나요?"

outputs = pipe(
    prompt,
    do_sample=True,
    temperature=0.2,
    top_k=50,
    top_p=0.95,
    repetition_penalty=1.2,
    add_special_tokens=True
)

In [18]:
outputs[0].get("generated_text")

"잔액유형의 갱신 방법은 어떻게 관리되나요?\u200b \u200b이번에 소개해드릴 상품은 바로바로 '카페인 프리' 입니다. 카페인을 함유하지 않아서 안심하고 마실 수 있는 제품입니다.\u200b제가 이렇게 좋은 정보를 알려주고 싶었던 가장 큰 이유는, 저도 한때 카페인 때문에 고생했기 때문입니다. 그래서 다른 분들께도 도움이 되길 원했습니다!\u200b그럼 지금부터 카페인 프리에 대해 자세히 살펴보겠습니다~^^\u200b\u200b \u200b\u200b저희는 카페인 프리에 대한 설명과 함께 여러분들이 더욱더 편리하게 이용하시도록 다양한 기능들을 제공합니다. \u200b또한, 카페인 프리 관련해서 문의사항이나 상담 등 필요하신 부분들은 언제든지 연락 주세요. 친절하게 응대 해 드립니다 :)\u200b\u200b\u200b \u200b\u200b오"

> 런타임 재시작해 메모리 clean

# 4. Instruction Tuning

In [5]:
!nvidia-smi

Fri Jul 19 14:44:27 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   53C    P8              10W /  70W |      0MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

## 4.1. Instruction Prompt

In [6]:
def generate_prompt(example):
    output_texts = []
    for i in range(len(example['question'])):
        prompt = f"### Instruction: {example['question'][i]}\n\n### Response: {example['answer'][i]}<eos>"
        output_texts.append(prompt)
    return output_texts

In [7]:
print(generate_prompt(dataset[:1])[0])

### Instruction: 우리나라 은행 IT 시스템의 발전 과정은 어떻게 되나요?

### Response: 1970년대 과목 별 온라인 시스템, 1980년대 종합 온라인 시스템, 1990년대 차세대 시스템, 그리고 포스트 차세대 시스템으로 진화해 왔습니다.<eos>


## 4.2. QLoRA

In [8]:
lora_config = LoraConfig(
    r=6,
    target_modules=["q_proj", "o_proj", "k_proj", "v_proj", "gate_proj", "up_proj", "down_proj"],
    task_type="CAUSAL_LM",
)

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16
)

## 4.3. Train

In [9]:
BASE_MODEL = "beomi/gemma-ko-2b"
model = AutoModelForCausalLM.from_pretrained(BASE_MODEL, device_map="auto", quantization_config=bnb_config)
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, add_special_tokens=True)
tokenizer.padding_side = 'right'



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [10]:
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    max_seq_length=512,
    args=TrainingArguments(
        output_dir="outputs",
#        num_train_epochs = 1,
        max_steps=3000,
        per_device_train_batch_size=1,
        gradient_accumulation_steps=4,
        optim="paged_adamw_8bit",
        warmup_steps=0.03,
        learning_rate=2e-4,
        fp16=True,
        logging_steps=100,
        push_to_hub=False,
        report_to='none',
    ),
    peft_config=lora_config,
    formatting_func=generate_prompt,
)

Map:   0%|          | 0/5100 [00:00<?, ? examples/s]



In [11]:
trainer.train()

Step,Training Loss
100,1.7081
200,1.3649
300,1.28
400,1.1774
500,1.163
600,1.1154
700,1.0937
800,1.0494
900,1.0486
1000,1.0537




TrainOutput(global_step=3000, training_loss=0.9082574666341146, metrics={'train_runtime': 3943.8031, 'train_samples_per_second': 3.043, 'train_steps_per_second': 0.761, 'total_flos': 8574856465563648.0, 'train_loss': 0.9082574666341146, 'epoch': 2.35})

## 4.4 저장

In [12]:
!pwd

/content


In [13]:
ADAPTER_MODEL = "lora_adapter_it"

trainer.model.save_pretrained(ADAPTER_MODEL)



In [14]:
!ls -alh lora_adapter_it

total 29M
drwxr-xr-x 2 root root 4.0K Jul 19 15:51 .
drwxr-xr-x 1 root root 4.0K Jul 19 15:51 ..
-rw-r--r-- 1 root root  688 Jul 19 15:51 adapter_config.json
-rw-r--r-- 1 root root  29M Jul 19 15:51 adapter_model.safetensors
-rw-r--r-- 1 root root 5.0K Jul 19 15:51 README.md


In [16]:
model = AutoModelForCausalLM.from_pretrained(BASE_MODEL, device_map='auto', torch_dtype=torch.float16)
model = PeftModel.from_pretrained(model, ADAPTER_MODEL, device_map='auto', torch_dtype=torch.float16)

model = model.merge_and_unload()
model.save_pretrained('/content/drive/MyDrive/AI-modeling/BX-gpt-app/beomi-gemma-ko-2b-it-f16')



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [17]:
!ls -alh /content/drive/MyDrive/AI-modeling/BX-gpt-app/beomi-gemma-ko-2b-it-f16

total 4.7G
-rw------- 1 root root  637 Jul 19 15:59 config.json
-rw------- 1 root root  132 Jul 19 15:59 generation_config.json
-rw------- 1 root root 4.7G Jul 19 16:00 model-00001-of-00002.safetensors
-rw------- 1 root root  65M Jul 19 16:00 model-00002-of-00002.safetensors
-rw------- 1 root root  14K Jul 19 16:00 model.safetensors.index.json


# 5. Instruction model 추론

In [5]:
!nvidia-smi

Fri Jul 19 16:06:26 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   52C    P8              10W /  70W |      0MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [6]:
BASE_MODEL = "beomi/gemma-ko-2b"
FINETUNE_MODEL = "/content/drive/MyDrive/AI-modeling/BX-gpt-app/beomi-gemma-ko-2b-it-f16"

finetune_model = AutoModelForCausalLM.from_pretrained(FINETUNE_MODEL, device_map={"":0})
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, add_special_tokens=True)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



In [7]:
pipe_finetuned = pipeline(
    "text-generation",
    model=finetune_model,
    tokenizer=tokenizer,
    max_new_tokens=200
)

In [9]:
prompt = "잔액유형의 갱신 방법은 어떻게 관리되나요?"
formatted_prompt = f"### Response: {prompt}\n\n### Response:"

formatted_prompt

'### Response: 잔액유형의 갱신 방법은 어떻게 관리되나요?\n\n### Response:'

In [10]:
outputs = pipe_finetuned(
    formatted_prompt,
    do_sample=True,
    temperature=0.2,
    top_k=50,
    top_p=0.95,
    repetition_penalty=1.2,
    add_special_tokens=True
)
print(outputs[0]["generated_text"][len(formatted_prompt):])

 잔액유형의 갱신 방법을 잔액관리에서 관리합니다.
