In [None]:
# Mount Google Drive at /content/drive so later cells can read files stored there.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Check the GPU (prints the nvidia-smi report)
!nvidia-smi

In [None]:
!pip install -q -U bitsandbytes
!pip install -q -U git+https://github.com/huggingface/transformers.git
!pip install -q -U git+https://github.com/huggingface/peft.git
!pip install -q -U git+https://github.com/huggingface/accelerate.git
!pip install -q datasets

# 데이터셋

In [None]:
# Load the dataset from a JSON file on the mounted Drive
from datasets import load_dataset

data = load_dataset(
    path="json",
    data_files='/content/drive/MyDrive/Koalpaca/code_data.json',
)

In [None]:
# Preprocess the data: build the full training prompt for every example
def _build_training_text(example):
    """Return a dict with a 'text' field holding the prompt for one example.

    The prompt is a fixed instruction header, the example's 'instruction'
    as the question, and its 'output' as the answer, terminated by the
    literal '<|endoftext|>' marker.
    """
    instruction_header = '아래는 작업을 설명하는 명령어입니다. 요청을 적절히 완료하는 파이썬 코드를 작성해주세요.'
    return {
        'text': f"### 명령어: {instruction_header}\n\n### 질문: {example['instruction']}\n\n### 답변: {example['output']}<|endoftext|>"
    }


data = data.map(_build_training_text)

# 모델로드

In [None]:
# Load the tokenizer and the 4-bit quantized base model
import torch
from transformers import AutoTokenizer, pipeline, AutoModelForCausalLM, BitsAndBytesConfig

MODEL = 'beomi/KoAlpaca-Polyglot-5.8B'

tokenizer = AutoTokenizer.from_pretrained(MODEL)

# 4-bit loading: "nf4" quant type, double quantization on, bfloat16 compute dtype
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    load_in_4bit=True,
)

model = AutoModelForCausalLM.from_pretrained(
    MODEL,
    device_map={"":0},  # NOTE(review): presumably places the whole model on device 0 — confirm
    quantization_config=bnb_config,
)

# tokenize

In [None]:
# Tokenize only the 'text' column
def _tokenize_text_column(samples):
    """Run the tokenizer over the batch's 'text' entries and return its output."""
    return tokenizer(samples["text"])


data = data.map(_tokenize_text_column, batched=True)

# PEFT

In [None]:
# Prepare the model for low-bit (k-bit) training
from peft import prepare_model_for_kbit_training

# Gradient checkpointing is switched on first, then the model is handed to
# PEFT's preparation helper; `model` is rebound to the helper's return value.
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

In [None]:
def print_trainable_parameters(model):
    """
    Report how many of the model's parameters are trainable.

    Walks ``model.named_parameters()`` once, sizes each parameter with
    ``numel()``, and prints the trainable element count, the total element
    count, and the trainable share as a percentage. Returns nothing.
    """
    params = [p for _, p in model.named_parameters()]
    total = sum(p.numel() for p in params)
    trainable = sum(p.numel() for p in params if p.requires_grad)
    print(
        f"trainable params: {trainable} || all params: {total} || trainable%: {100 * trainable / total}"
    )

In [None]:
# Wrap the prepared model with LoRA adapters and report the trainable share
from peft import LoraConfig, get_peft_model

config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=["query_key_value"],  # only these modules receive adapters
    r=8,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
)

model = get_peft_model(model, config)
print_trainable_parameters(model)

# 학습하기

In [None]:
# Train the model
import transformers

# needed for gpt-neo-x tokenizer
tokenizer.pad_token = tokenizer.eos_token

training_args = transformers.TrainingArguments(
    output_dir="outputs",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=1,
    max_steps=50,  # tiny run: only 50 steps (about 4 minutes per the original author's note)
    learning_rate=1e-4,
    fp16=True,
    optim="paged_adamw_8bit",
    logging_steps=10,
)
causal_lm_collator = transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False)

trainer = transformers.Trainer(
    model=model,
    args=training_args,
    train_dataset=data["train"],
    data_collator=causal_lm_collator,
)
# Cache is turned off while training to silence warnings; re-enable it for inference.
model.config.use_cache = False
trainer.train()

In [None]:
# Switch the model to evaluation mode
model.eval()
model.config.use_cache = True  # re-enable the cache for inference (it was turned off before training)

# 답변 생성 test

In [None]:
# Answer generation helper
def gen(x, max_new_tokens=512):
    """Generate an answer for question ``x`` and print the decoded text.

    Args:
        x: Question text; it is inserted into the "### 질문:" slot of the
            same prompt template that was used to build the training text.
        max_new_tokens: Upper bound on the number of generated tokens.
            Defaults to 512, the value that was previously hard-coded.

    Returns:
        None. The first generated sequence is decoded and printed.
    """
    prompt = f"### 명령어: 아래는 작업을 설명하는 명령어입니다. 요청을 적절히 완료하는 파이썬 코드를 작성해주세요.\n\n### 질문: {x}\n\n### 답변:"
    # The tokenizer returns CPU tensors, but the model was loaded onto a GPU;
    # move the inputs to the model's device so generate() does not hit a
    # device mismatch.
    inputs = tokenizer(
        prompt,
        return_tensors='pt',
        return_token_type_ids=False,
    ).to(model.device)
    gened = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        early_stopping=True,
        do_sample=True,
        # NOTE(review): hard-coded EOS id; presumably equal to
        # tokenizer.eos_token_id for this tokenizer — confirm.
        eos_token_id=2,
    )
    print(tokenizer.decode(gened[0]))

In [None]:
# Smoke test: ask the fine-tuned model "What is Python?"
gen('Python이 뭐야?')

# 모델 저장 & huggingface에 업로드

In [None]:
# Log in to the Hugging Face Hub
import huggingface_hub

# NOTE(review): presumably prompts interactively for an access token — confirm
huggingface_hub.login()

In [None]:
# Upload the model
# Replace the '아이디' (user ID) part of the repo name with your own Hugging Face ID.
# NOTE(review): `model` is the object returned by get_peft_model, so this
# presumably uploads the LoRA adapter rather than the full base model — confirm.
model.push_to_hub('아이디/qlora-koalpaca-polyglot-5.8b-50step')