## Setup
* target ~0.4% trainable params (LoRA r=32; the training run below reports 0.396%)

In [1]:
# Mount Google Drive at /content/drive; later cells read the CSV datasets from it
# and write the trained adapter, tokenizer and checkpoints back to it.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Colab environment setup: quantization backend, Hugging Face stack and datasets.
# NOTE(review): nothing is version-pinned and three packages are installed from the
# git main branch, so a later re-run may pull different (possibly incompatible) code.
!pip install -q -U bitsandbytes
!pip install -q -U git+https://github.com/huggingface/transformers.git
!pip install -q -U git+https://github.com/huggingface/peft.git
!pip install -q -U git+https://github.com/huggingface/accelerate.git
!pip install -q -U datasets

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.6/92.6 MB[0m [31m19.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m95.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m72.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for transformers (pyproject.toml) ... [?25l[?25hdone
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m251.2/251.2 kB[0m [31m4.5 MB/

In [1]:
# Single seed shared by every RNG-dependent step in the notebook.
GLOBAL_SEED = 42

import os
# NOTE: Python reads PYTHONHASHSEED at interpreter start-up, so setting it here only
# reaches child processes spawned later, not hashing inside the already-running kernel.
os.environ['PYTHONHASHSEED'] = str(GLOBAL_SEED)

import pandas as pd
import numpy as np
from numpy import random as np_rnd
import random as rnd
import re
import os
import pickle
import shutil
from datetime import datetime
from time import time
import gc
from itertools import product
import json

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import AdamW
import transformers
from transformers import get_polynomial_decay_schedule_with_warmup
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from transformers import EarlyStoppingCallback
from datasets import Dataset, DatasetDict
from torch.utils.data import DataLoader
from peft import prepare_model_for_kbit_training
from peft import LoraConfig
from peft import get_peft_model

# Prefer the GPU when one is visible to torch, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [2]:
def seed_everything(seed=42):
    """
    Seed every random number generator the notebook may use.

    Always seeds Python's `random` and NumPy's legacy global generator and exports
    PYTHONHASHSEED. CuPy, TensorFlow and PyTorch are seeded on a best-effort basis:
    a library that has not been imported under the expected name (`cupy`, `tf_rnd`,
    `torch`) is skipped silently, while any other failure is reported as a warning
    instead of being swallowed.

    Args:
        seed: integer seed applied to all generators.
    """
    import warnings

    def _best_effort(label, seeder):
        # NameError means the optional library is simply not imported in this notebook.
        try:
            seeder()
        except NameError:
            pass
        except Exception as exc:
            warnings.warn(f"seed_everything: could not seed {label}: {exc!r}")

    def _seed_torch():
        # Deterministic cuDNN kernels trade some speed for run-to-run reproducibility.
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True
        torch.manual_seed(seed)
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)

    os.environ['PYTHONHASHSEED'] = str(seed)
    # python random
    rnd.seed(seed)
    # numpy random
    np_rnd.seed(seed)
    # RAPIDS random
    _best_effort("cupy", lambda: cupy.random.seed(seed))
    # tf random
    _best_effort("tensorflow", lambda: tf_rnd.set_seed(seed))
    # pytorch random
    _best_effort("torch", _seed_torch)

def pickleIO(obj, src, op="r"):
    """
    Read or write a pickle file.

    Args:
        obj: object to serialise when op is "w"; returned untouched for an unknown op.
        src: path of the pickle file.
        op: "w" to dump `obj` into `src`, "r" to load and return the content of `src`.

    Returns:
        The unpickled object for "r", None for "w", and `obj` itself (after printing
        a message) for any other op.
    """
    if op not in ("w", "r"):
        print("unknown operation")
        return obj

    with open(src, op + "b") as handle:
        if op == "r":
            # pickle.load executes arbitrary code: only use on files you trust.
            return pickle.load(handle)
        pickle.dump(obj, handle)

def createFolder(directory):
    """
    Create `directory` (including missing parents) if it does not exist yet.

    Uses `exist_ok=True` instead of a separate existence check, so there is no
    window in which another process can create the folder between the check and
    the creation. An OSError (e.g. the path exists as a regular file, or is not
    writable) is reported with a printed message rather than raised.

    Args:
        directory: path of the folder to create.
    """
    try:
        os.makedirs(directory, exist_ok=True)
    except OSError:
        print('Error: Creating directory. ' + directory)

def findIdx(data_x, col_names):
    """
    Return the positions of the items of `data_x` that are contained in `col_names`.
    """
    positions = []
    for position, name in enumerate(data_x):
        if name in col_names:
            positions.append(int(position))
    return positions

def diff(first, second):
    """
    Return the items of `first` that do not occur in `second`, keeping the order
    (and duplicates) of `first`.
    """
    excluded = set(second)
    kept = []
    for item in first:
        if item not in excluded:
            kept.append(item)
    return kept

def get_trainable_parameters(model):
    """
    Count the parameters of `model` and how many of them are trainable.

    Args:
        model: any object exposing `parameters()` like a `torch.nn.Module`.

    Returns:
        dict with "all_param" (total element count), "trainable_params" (element
        count of parameters with `requires_grad=True`) and "trainable%" (their
        ratio in percent; 0.0 for a model without parameters).
    """
    # NOTE(review): counts are based on `numel()` of the stored tensors; for
    # quantized weights this is presumably the packed size, not the logical one.
    trainable_params = 0
    all_params = 0
    for param in model.parameters():
        count = param.numel()
        all_params += count
        if param.requires_grad:
            trainable_params += count
    # Guard the ratio so an empty model does not raise ZeroDivisionError.
    trainable_pct = 100 * trainable_params / all_params if all_params else 0.0
    return {"all_param": all_params, "trainable_params": trainable_params, "trainable%": trainable_pct}

In [3]:
class CFG:
    """Notebook-wide experiment settings, read as `CFG.<name>` by the cells below."""

    # --- run mode and data selection ---
    # debug: short run on a small sample of the data
    debug = False
    # suffix of the pre-processed FAQ csv file name
    dataset_version = "5주차_v3"
    # down-sample the second and third thirds of the FAQ file
    reduce_sample_size = True
    # hold out `validation_ratio` of the rows as a validation split
    validation_split = False
    validation_ratio = 0.1

    # --- base model ---
    model_id = "beomi/KoAlpaca-Polyglot-12.8B"

    # --- optimisation ---
    epochs = 10 if not debug else 2
    early_stopping_rounds = 10
    batch_size = 4
    # learning rate handed to the optimizer
    eta = 5e-4
    weight_decay = 1e-4

    # --- LoRA ---
    # Author's original note: r = 16/8/4 gives "about 20%/10%/5%" on the 12.8B model,
    # and r = 8/4/2 gives the same on the 1.3B model.
    # NOTE(review): those figures look like hundredths of a percent of trainable
    # parameters (i.e. ~0.2%/0.1%/0.05%) — confirm.
    max_lora_r = 32

    # --- sequence lengths (tokens) ---
    # padded/truncated length of tokenized samples
    max_seq_len = 256
    # budget of newly generated tokens at inference time
    gen_max_seq_len = 512

## Loading data

In [4]:
# Load the pre-processed FAQ pairs; the two columns are renamed to input/output
# and every cell is coerced to a stripped string.
df_qa = pd.read_csv(f"/content/drive/MyDrive/Colab Notebooks/dataset/prepdata/FNCSP_FAQ_전처리_{CFG.dataset_version}.csv", encoding="utf8")
df_qa.columns = ["input", "output"]
df_qa["input"] = df_qa["input"].astype("str").apply(lambda x: x.strip())
df_qa["output"] = df_qa["output"].astype("str").apply(lambda x: x.strip())

if CFG.reduce_sample_size:
    # Keep the first third untouched and at most 250 random rows from each of the
    # second and third thirds. Capping at the segment size avoids the ValueError
    # `.sample(250)` raises on a segment with fewer than 250 rows.
    # NOTE(review): rows beyond 3 * (len // 3) (up to two trailing rows) are dropped,
    # as in the original slicing — presumably the file is three equal segments.
    qa_third = len(df_qa) // 3
    qa_keep = min(250, qa_third)
    df_qa = pd.concat([
        df_qa.iloc[:qa_third],
        df_qa.iloc[qa_third:qa_third * 2].sample(qa_keep, random_state=1),
        df_qa.iloc[qa_third * 2:qa_third * 3].sample(qa_keep, random_state=2),
    ], axis=0).reset_index(drop=True)

if CFG.debug:
    # small fixed subset for quick end-to-end checks (capped at the available rows)
    df_qa = df_qa.sample(n=min(50, len(df_qa)), random_state=42).reset_index(drop=True)
else:
    # full shuffle with a fixed seed
    df_qa = df_qa.sample(frac=1, random_state=42).reset_index(drop=True)

df_qa

Unnamed: 0,input,output
0,워드클라우드 분석 결과가 무엇인가요?,워드클라우드란 해당 기업의 텍스트 데이터를 분석해서 기업 특성을 파악하는데 보통 사...
1,기업컨설팅보고서 서비스에는 동일 업종의 상위 기업 목록이 있나요?,"네, 기업컨설팅보고서 서비스는 동일 업종의 상위 기업 목록을 제공합니다."
2,가입한 컨설턴트 구독 서비스의 요금 내역을 어떻게 확인할 수 있을까요?,컨설턴트 구독 서비스를 이용 중인 계정의 등록된 이메일로 구독 서비스 결제 시 바로...
3,원하는 기업을 즐겨찾기에 추가할 수 있는 기능이 있을까요?,현재 지원되고 있지 않은 기능입니다.
4,기업 비교 기능,현재 지원되고 있지 않은 기능입니다.
...,...,...
1509,기업 경영진 리더십 및 역량 수준은 어떤 방식으로 평가 되나요?,기업 경영진의 리더십과 역량 평가는 다음과 같은 방식으로 이루어집니다.\n기업을 이...
1510,조회를 클릭했는데 페이지로 이동이 안됩니다.,페이지로 이동했을 때 오류가 발생하면 아래 내용의 해당사항이 있는지 확인해주시기 바...
1511,어떤 신용카드가 결제에 사용될 수 있는지 정보를 알 수 있을까요?,"결제가 가능한 카드는 [신한, BC, KB국민, 현대, 삼성, 롯데, 하나(구외환)..."
1512,검색 결과에 없는 기업은 어떻게 등록 요청할 수 있을까요? 기업 등록을 요청하고 싶어요.,해당 부분에 대해서는 고객 센터로 문의해주시기 바랍니다.


In [5]:
# Load the evaluation prompts; missing cells become the literal string "none",
# and both columns are coerced to stripped strings.
df_test = (
    pd.read_csv(f"/content/drive/MyDrive/Colab Notebooks/dataset/prepdata/test_prompt.csv", encoding="utf8")
    .fillna("none")
)
df_test.columns = ["input", "output"]
df_test = df_test.assign(
    input=df_test["input"].astype("str").apply(lambda text: text.strip()),
    output=df_test["output"].astype("str").apply(lambda text: text.strip()),
)
# plain list of the question strings
test_prompt = df_test["input"].tolist()
df_test

Unnamed: 0,input,output
0,세금계산서 신청 후 내역은 어떻게 알 수 있을까요?,빠른 처리를 위해 고객센터로 연락주시면 바로 도와드리겠습니다.
1,세금계산서 신청을 했는데 이메일이 안왔어요.,빠른 처리를 위해 고객센터로 연락주시면 바로 도와드리겠습니다.
2,업종을 선택했는데 통계정보 표시가 안돼요.,일시적인 현상일 가능성이 높기 때문에 페이지 새로고침을 권장드립니다.
3,PDF 버튼을 클릭했는데 저장이 안돼요.,일시적인 현상일 가능성이 높기 때문에 페이지 새로고침을 권장드립니다.\n문제가 해결...
4,동종업계 분석 결과 리스트를 알 수 있나요?,자세한 내용은 고객 센터로 문의 바랍니다.
5,외국인도 FNCSP 서비스를 이용할 수 있나요?,자세한 내용은 고객 센터로 문의 바랍니다.
6,컨설턴트 구독이란 뭐에요?,자세한 건 고객센터에 물어봐 주세요.
7,일반 가입하고 컨설턴트 구독하고 뭐가 달라요?,자세한 건 고객센터에 물어봐 주세요.
8,가장 높거나 낮은 평가지수를 가진 업종을 선택할 순 없나요?,"네, 현재 지원되지 않고 있는 기능입니다."
9,전체 업종별 통계를 볼 수 있나요?,"아니요, 현재 지원되지 않고 있는 기능입니다."


## Create dataset

In [6]:
# Keyword arguments shared by every tokenizer call in this notebook: each text is
# padded/truncated to CFG.max_seq_len tokens and returned as PyTorch tensors.
config_tokenizer = dict(
    max_length=CFG.max_seq_len,
    padding="max_length",
    truncation=True,
    return_token_type_ids=False,
    return_tensors="pt",
)

In [7]:
# Right-padded tokenizer for the training samples.
tokenizer = AutoTokenizer.from_pretrained(CFG.model_id, padding_side="right")

if CFG.validation_split:
    # Use a dedicated, seeded generator: the global NumPy RNG has not been seeded
    # at this point in the notebook, so a bare permutation would give a different
    # train/valid split on every run.
    shuffled_idx = np_rnd.RandomState(GLOBAL_SEED).permutation(len(df_qa))
    n_valid = int(len(df_qa) * CFG.validation_ratio)
    feature_container = DatasetDict({
        "train": Dataset.from_pandas(df_qa.iloc[shuffled_idx[n_valid:]]).with_format("torch"),
        "valid": Dataset.from_pandas(df_qa.iloc[shuffled_idx[:n_valid]]).with_format("torch"),
    })
else:
    feature_container = DatasetDict({
        "train": Dataset.from_pandas(df_qa).with_format("torch"),
    })

# 1) render each QA pair into the prompt template, ending with the end-of-text marker
feature_container = feature_container.map(lambda x: {'text': f"### 질문: {x['input']}\n\n### 답변: {x['output']}<|endoftext|>"}, remove_columns=feature_container["train"].column_names)
# 2) tokenize one sample at a time (the tokenizer returns tensors with a leading axis of size 1)
feature_container = feature_container.map(lambda samples: tokenizer(samples["text"], **config_tokenizer), remove_columns=feature_container["train"].column_names)
# 3) drop that leading axis so each row holds flat token sequences
feature_container = feature_container.map(lambda samples: {"input_ids": samples["input_ids"].squeeze(dim=0), "attention_mask": samples["attention_mask"].squeeze(dim=0)})

display(feature_container["train"])

Map:   0%|          | 0/1514 [00:00<?, ? examples/s]

Map:   0%|          | 0/1514 [00:00<?, ? examples/s]

Map:   0%|          | 0/1514 [00:00<?, ? examples/s]

Dataset({
    features: ['input_ids', 'attention_mask'],
    num_rows: 1514
})

In [8]:
# Rebind `tokenizer` to a left-padded instance so generated tokens directly
# follow the prompt. From here on every use of the global `tokenizer`
# (including the training and inference helpers) sees this left-padded one.
tokenizer = AutoTokenizer.from_pretrained(CFG.model_id, padding_side="left")

# Reload the evaluation prompts with the same clean-up as before.
df_test = (
    pd.read_csv(f"/content/drive/MyDrive/Colab Notebooks/dataset/prepdata/test_prompt.csv", encoding="utf8")
    .fillna("none")
)
df_test.columns = ["input", "output"]
df_test = df_test.assign(
    input=df_test["input"].astype("str").apply(lambda text: text.strip()),
    output=df_test["output"].astype("str").apply(lambda text: text.strip()),
)

test_ds = DatasetDict({"test": Dataset.from_pandas(df_test).with_format("torch")})

# prompt template without an answer: the model is expected to continue after "### 답변:"
test_ds = test_ds.map(
    lambda x: {'text': f"### 질문: {x['input']}\n\n### 답변:"},
    remove_columns=test_ds["test"].column_names,
)
# tokenize one sample at a time
test_ds = test_ds.map(
    lambda samples: tokenizer(samples["text"], **config_tokenizer),
    remove_columns=test_ds["test"].column_names,
)
# drop the leading axis of size 1 added by the tokenizer
test_ds = test_ds.map(
    lambda samples: {
        "input_ids": samples["input_ids"].squeeze(dim=0),
        "attention_mask": samples["attention_mask"].squeeze(dim=0),
    }
)

display(test_ds["test"])

Map:   0%|          | 0/60 [00:00<?, ? examples/s]

Map:   0%|          | 0/60 [00:00<?, ? examples/s]

Map:   0%|          | 0/60 [00:00<?, ? examples/s]

Dataset({
    features: ['input_ids', 'attention_mask'],
    num_rows: 60
})

## Define helper functions

In [9]:
def get_optimizer_params(model, eta, weight_decay):
    """
    Build the two optimizer parameter groups for `model`.

    Only parameters with `requires_grad=True` are included, so frozen weights are
    not registered with the optimizer. The optimizer would never update them anyway.

    Args:
        model: module exposing `named_parameters()`.
        eta: learning rate assigned to both groups.
        weight_decay: weight decay of the first group.

    Returns:
        A list of two dicts: the first holds the parameters that receive
        `weight_decay`; the second holds the parameters whose name contains one of
        the `no_decay` markers and uses a weight decay of 0.0. Either list of
        parameters may be empty.
    """
    # NOTE(review): the "LayerNorm.*" markers only match modules literally named
    # "LayerNorm"; verify against the parameter names of the model in use.
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    decay_params, no_decay_params = [], []
    for name, param in model.named_parameters():
        if not param.requires_grad:
            continue
        if any(marker in name for marker in no_decay):
            no_decay_params.append(param)
        else:
            decay_params.append(param)
    optimizer_parameters = [
        # apply weight decay
        {'params': decay_params, 'lr': eta, 'weight_decay': weight_decay},
        # don't apply weight decay to biases / LayerNorm parameters
        {'params': no_decay_params, 'lr': eta, 'weight_decay': 0.0},
    ]
    return optimizer_parameters

def get_scheduler(optimizer, num_warmup_steps, num_training_steps):
    """
    Create the learning-rate schedule used for fine-tuning: warm-up followed by a
    polynomial decay (power 0.5) towards a final learning rate of 1e-5.

    Args:
        optimizer: optimizer whose learning rate is scheduled.
        num_warmup_steps: number of warm-up steps.
        num_training_steps: total number of training steps.

    Returns:
        The scheduler built by `get_polynomial_decay_schedule_with_warmup`.
    """
    return get_polynomial_decay_schedule_with_warmup(
        optimizer,
        num_warmup_steps=num_warmup_steps,
        num_training_steps=num_training_steps,
        lr_end=1e-5,
        power=0.5,
    )

In [10]:
def do_training(params, feature_container, save_path="./tmp/"):
    """
    Fine-tune the quantized base model with LoRA and save the adapter and tokenizer.

    Relies on notebook globals: `CFG`, `GLOBAL_SEED`, `bnb_config` and `tokenizer`
    (whose pad token is set to its EOS token as a side effect).

    Args:
        params: dict with a "config_lora" entry whose items are forwarded to `LoraConfig`.
        feature_container: dataset dict with a "train" split, plus a "valid" split
            when `CFG.validation_split` is enabled.
        save_path: output prefix; "model/" and "tokenizer/" are appended by plain
            string concatenation, so it should end with a path separator.

    Returns:
        dict with "params_info" (parameter counts), "training_result" (return value
        of `Trainer.train()`), "training_runtime" (seconds) and
        "training_runtime_per_sample".
    """
    start_time = time()

    # config on lora for fine-tuning
    lora_config = LoraConfig(
        **params["config_lora"],
        target_modules=["query_key_value"],
        bias="none",
        task_type="CAUSAL_LM"
    )

    # initialize model with quantization
    model = AutoModelForCausalLM.from_pretrained(CFG.model_id, quantization_config=bnb_config)
    model = prepare_model_for_kbit_training(model)
    model.gradient_checkpointing_enable()

    # get model for fine-tuning with lora
    model = get_peft_model(model, lora_config)
    params_info = get_trainable_parameters(model)
    print(params_info)

    n_train = len(feature_container["train"])

    # config on training: arguments shared by both modes are declared once
    shared_args = dict(
        num_train_epochs=CFG.epochs,
        per_device_train_batch_size=CFG.batch_size,
        gradient_accumulation_steps=1,
        fp16=False,
        # log roughly once per epoch; never 0, which would be invalid for tiny datasets
        logging_steps=max(1, n_train // CFG.batch_size),
        output_dir="outputs",
        seed=GLOBAL_SEED,
    )
    if CFG.validation_split:
        # NOTE(review): newer transformers releases rename `evaluation_strategy`
        # to `eval_strategy` — confirm against the installed version.
        train_args = transformers.TrainingArguments(
            **shared_args,
            per_device_eval_batch_size=CFG.batch_size,
            do_eval=True,
            save_strategy="epoch",
            evaluation_strategy="epoch",
            metric_for_best_model="eval_loss",
            load_best_model_at_end=True,
        )
    else:
        train_args = transformers.TrainingArguments(**shared_args, do_eval=False)

    # config optimizer & scheduler
    optimizer_parameters = get_optimizer_params(
        model,
        eta=CFG.eta,
        weight_decay=CFG.weight_decay
    )
    optimizer = AdamW(optimizer_parameters, lr=CFG.eta, weight_decay=CFG.weight_decay)
    # the step count is an integer quantity; avoid handing a numpy float to the scheduler
    steps_per_epoch = int(np.ceil(n_train / CFG.batch_size))
    scheduler = get_scheduler(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=steps_per_epoch * CFG.epochs,
    )

    # needed for gpt-neo-x tokenizer
    # NOTE(review): with pad == eos, DataCollatorForLanguageModeling presumably masks
    # the trailing end-of-text label as padding too — confirm the model still learns to stop.
    tokenizer.pad_token = tokenizer.eos_token
    # silence the warnings
    model.config.use_cache = False

    trainer = transformers.Trainer(
        model=model,
        train_dataset=feature_container["train"],
        eval_dataset=feature_container["valid"] if CFG.validation_split else None,
        data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
        optimizers=(optimizer, scheduler),
        args=train_args
    )

    # training
    training_result = trainer.train()

    # save model
    createFolder(save_path)
    model.save_pretrained(save_path + "model/")
    tokenizer.save_pretrained(save_path + "tokenizer/")

    end_time = time()

    output = {
        "params_info": params_info,
        "training_result": training_result,
        "training_runtime": round(end_time - start_time, 3),
        "training_runtime_per_sample": round((end_time - start_time) / len(feature_container["train"]), 3),
    }

    # drop every local reference to the model before flushing the CUDA cache;
    # the trainer and optimizer also hold the model / its parameters
    del trainer, optimizer, scheduler, model
    gc.collect()
    torch.cuda.empty_cache()
    return output


def do_inference(params, model, prompt):
    """
    Generate a response for every tokenized prompt and time the run.

    Relies on notebook globals: `CFG`, `device` and `tokenizer`.

    Args:
        params: dict with "generator" (generation options, must contain "num_beams")
            and "max_tokens" (budget of newly generated tokens).
        model: model exposing `eval()`, `config` and `generate()`.
        prompt: sized dataset whose batches are dicts of tensors accepted by
            `model.generate`.

    Returns:
        dict with "response" (decoded sequences, one per prompt), "inference_runtime"
        (seconds) and "inference_runtime_per_sample" (0.0 for an empty `prompt`).
    """
    start_time = time()

    model.eval()
    model.config.use_cache = True
    gen_config = {
        **params["generator"],
        "max_new_tokens": params["max_tokens"],
        # early stopping is only meaningful for beam search
        "early_stopping": bool(params["generator"]["num_beams"] > 1),
        # NOTE(review): hard-coded end-of-text id — presumably equal to
        # tokenizer.eos_token_id for this model; confirm before switching models.
        "eos_token_id": 2,
    }

    # generating
    response = []
    for batch in DataLoader(prompt, batch_size=CFG.batch_size * 4, shuffle=False):
        batch = {key: tensor.to(device) for key, tensor in batch.items()}
        with torch.no_grad():
            gened = model.generate(
                **batch,
                **gen_config,
            )
        response.extend(tokenizer.batch_decode(gened, skip_special_tokens=True))

    elapsed = time() - start_time
    n_prompts = len(prompt)

    # decoding & return
    output = {
        "response": response,
        "inference_runtime": round(elapsed, 3),
        # guard against ZeroDivisionError when no prompt was supplied
        "inference_runtime_per_sample": round(elapsed / n_prompts, 3) if n_prompts else 0.0,
    }

    return output

## Training

In [11]:
# config on model for quantization
# `device_map` is not a BitsAndBytesConfig option (it is an argument of
# `from_pretrained`) and was silently ignored here, so it has been dropped.
bnb_config = BitsAndBytesConfig(
    # load the model weights in 4-bit precision
    load_in_4bit=True,
    # nested ("double") quantization: the quantization constants are quantized as well
    bnb_4bit_use_double_quant=True,
    # 4-bit data type used to store the weights
    bnb_4bit_quant_type="nf4",
    # data type used for computation
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# LoRA search space: every combination of the listed values is trained once.
# Only the dropout differs between debug and full runs.
# Wider grid used previously:
#     "r": [CFG.max_lora_r, CFG.max_lora_r//2, CFG.max_lora_r//4],
#     "lora_alpha": [32, 16, 8],
#     "lora_dropout": [1/2**2, 1/2**3, 1/2**4],
training_fixed_params = {}
training_search_space = {
    "config_lora": {
        "r": [CFG.max_lora_r],
        "lora_alpha": [32],
        "lora_dropout": [1/2**2] if CFG.debug else [1/2**4],
    }
}

# Decoding strategies covered by the generator search space:
# "greedy": {"num_beams": 1, "do_sample": False},
# "multinomial_greedy": {"num_beams": 1, "do_sample": True},
# "beam_search": {"num_beams": 5, "do_sample": False},
# "multinomial_beam_search": {"num_beams": 5, "do_sample": True},
inference_fixed_params = {
    "max_tokens": CFG.gen_max_seq_len,
}
inference_search_space = {
    "generator": {
        "num_beams": [1] if CFG.debug else [1, 5],
        "do_sample": [False] if CFG.debug else [False, True],
    }
}

# timestamp shared by all artefact folders of this run
time_name = datetime.now().strftime('%Y-%m-%d_%H-%M')

In [12]:
# Train one LoRA adapter per combination of the training search space and store
# the adapter, the parameters used and the training summary on Drive.
for idx, value in enumerate(product(*training_search_space["config_lora"].values())):
    # reseed before every case so the cases are comparable
    seed_everything(GLOBAL_SEED)
    architecture_name = f"koalpaca_12.8b_{CFG.dataset_version}_{time_name}_case{idx}"
    save_path = f"/content/drive/MyDrive/Colab Notebooks/architecture/{architecture_name}/"
    createFolder(save_path + "test_case/")
    params = training_fixed_params.copy()
    params["config_lora"] = dict(zip(training_search_space["config_lora"].keys(), value))
    # training
    output = do_training(params, feature_container, save_path)
    # save output
    for file_name, payload in (("search_params.json", params), ("training_output.json", output)):
        with open(save_path + file_name, "w", encoding="utf8") as f:
            json.dump(payload, f)
    # Inference over `inference_search_space` is currently disabled:
    # # load model for inference
    # model = AutoModelForCausalLM.from_pretrained(save_path + "model/", quantization_config=bnb_config)
    # for idx2, value2 in enumerate(product(*inference_search_space["generator"].values())):
    #     response_container = []
    #     params = inference_fixed_params.copy()
    #     params.update({"generator": {k: v for k, v in zip(inference_search_space["generator"].keys(), value2)}})
    #     # inference
    #     output = do_inference(params, model, test_ds["test"])
    #     response_container = {"prompt": test_prompt, "response": output["response"]}
    #     # save output
    #     # pd.DataFrame(response_container).to_csv(save_path + f"test_case/test_case{idx2}.csv", encoding="utf8", index=False)
    #     with open(save_path + f"test_case/test_case{idx2}.json", "w", encoding="utf8") as f:
    #         f.write(json.dumps(response_container))
    #     with open(save_path + f"test_case/inference_params{idx2}.json", "w", encoding="utf8") as f:
    #         f.write(json.dumps(params))
    #     with open(save_path + f"test_case/inference_output{idx2}.json", "w", encoding="utf8") as f:
    #         f.write(json.dumps({"inference_runtime": output["inference_runtime"], "inference_runtime_per_sample": output["inference_runtime_per_sample"]}))

    # del model
    # release whatever the finished case left behind before the next one starts
    gc.collect()
    torch.cuda.empty_cache()

Loading checkpoint shards:   0%|          | 0/28 [00:00<?, ?it/s]

{'all_param': 6627573760, 'trainable_params': 26214400, 'trainable%': 0.39553539423754375}


You're using a PreTrainedTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
378,1.1774
756,0.5284
1134,0.3623
1512,0.2908
1890,0.2539
2268,0.2308
2646,0.2116
3024,0.1969
3402,0.1837
3780,0.1693


In [13]:
import shutil
# Copy the Trainer checkpoints next to the saved adapter. `save_path` is the value
# left behind by the last iteration of the training loop. `dirs_exist_ok=True`
# lets this cell be re-run without failing on the already-copied folder.
shutil.copytree("./outputs", save_path + "checkpoints/", dirs_exist_ok=True)

'/content/drive/MyDrive/Colab Notebooks/architecture/koalpaca_12.8b_5주차_v3_2023-09-02_03-02_case0/checkpoints/'