In [None]:
!pip install transformers
!pip install -q -U git+https://github.com/huggingface/accelerate.git
!pip install -q -U bitsandbytes
!pip install -q -U git+https://github.com/huggingface/peft.git
!pip install datasets

In [2]:
import transformers
import torch

from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import LoraConfig, AutoPeftModelForCausalLM, get_peft_model
from peft import IA3Model, IA3Config
from datasets import load_dataset

In [3]:
config = IA3Config(peft_type = "IA3", task_type = "CAUSAL_LM")

tokenizer = AutoTokenizer.from_pretrained('EleutherAI/polyglot-ko-1.3b')
model = AutoModelForCausalLM.from_pretrained('EleutherAI/polyglot-ko-1.3b', device_map = 'auto', torch_dtype = torch.float16)
ia3_model = get_peft_model(model, config)

Downloading (…)okenizer_config.json:   0%|          | 0.00/164 [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.65M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/185 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/640 [00:00<?, ?B/s]

Downloading (…)fetensors.index.json:   0%|          | 0.00/31.6k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading (…)of-00003.safetensors:   0%|          | 0.00/1.00G [00:00<?, ?B/s]

Downloading (…)of-00003.safetensors:   0%|          | 0.00/1.02G [00:00<?, ?B/s]

Downloading (…)of-00003.safetensors:   0%|          | 0.00/748M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading (…)neration_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

In [4]:
ia3_model

PeftModelForCausalLM(
  (base_model): IA3Model(
    (model): GPTNeoXForCausalLM(
      (gpt_neox): GPTNeoXModel(
        (embed_in): Embedding(30080, 2048)
        (emb_dropout): Dropout(p=0.0, inplace=False)
        (layers): ModuleList(
          (0-23): 24 x GPTNeoXLayer(
            (input_layernorm): LayerNorm((2048,), eps=1e-05, elementwise_affine=True)
            (post_attention_layernorm): LayerNorm((2048,), eps=1e-05, elementwise_affine=True)
            (post_attention_dropout): Dropout(p=0.0, inplace=False)
            (post_mlp_dropout): Dropout(p=0.0, inplace=False)
            (attention): GPTNeoXAttention(
              (rotary_emb): GPTNeoXRotaryEmbedding()
              (query_key_value): Linear(
                in_features=2048, out_features=6144, bias=True
                (ia3_l): ParameterDict(  (default): Parameter containing: [torch.cuda.FloatTensor of size 6144x1 (GPU 0)])
              )
              (dense): Linear(in_features=2048, out_features=2048, bias=Tr

In [5]:
data = load_dataset('beomi/KoAlpaca-v1.1a')

Downloading readme:   0%|          | 0.00/1.75k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/12.9M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/21155 [00:00<?, ? examples/s]

In [18]:
data = data.remove_columns(['url'])
data = data.map(lambda x: {"text" : f"### 질문:{x['instruction']}\n\n### 답변: {x['output']}<|endoftext|>"})
data = data.map(lambda x: tokenizer(x['text']), batched = True)

Map:   0%|          | 0/21155 [00:00<?, ? examples/s]

In [19]:
data

DatasetDict({
    train: Dataset({
        features: ['instruction', 'output', 'text', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 21155
    })
})

In [11]:
tokenizer.pad_token = tokenizer.eos_token
embedding_size = model.get_input_embeddings().weight.shape[0]
if len(tokenizer) > embedding_size:
    model.resize_token_embeddings(len(tokenizer))

In [21]:
trainer = transformers.Trainer(
    model = ia3_model,
    train_dataset = data['train'],
    args = transformers.TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 1,
        max_steps = 50,
        learning_rate = 6e-2,
        logging_steps = 1,
        output_dir = "/content/drive/MyDrive/ColabNotebooks"
    ),
    data_collator = transformers.DataCollatorForLanguageModeling(tokenizer, mlm = False)
)

trainer.train()

You're using a PreTrainedTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
1,2.375
2,2.6777
3,2.6816
4,2.582
5,2.3887
6,2.4902
7,2.9785
8,2.6797
9,2.6152
10,2.3359


TrainOutput(global_step=50, training_loss=2.36462890625, metrics={'train_runtime': 19.8598, 'train_samples_per_second': 5.035, 'train_steps_per_second': 2.518, 'total_flos': 209442631237632.0, 'train_loss': 2.36462890625, 'epoch': 0.0})

In [27]:
ans = ia3_model.generate(**tokenizer("### 질문: 몸에 단백질이 부족하면 나타나는 현상은?\n\n### 답변:", return_tensors = 'pt', return_token_type_ids = False).to('cuda'), max_new_tokens = 128)

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


In [28]:
tokenizer.decode(ans[0]).replace("### 질문: 몸에 단백질이 부족하면 나타나는 현상은?\n\n### 답변", "")

': 단백질이 부족하면 나타나는 현상은 다음과 같습니다. 1.: 근육이 줄어들고, 근육이 줄어들면 근육량이 줄어들고, 근육이 줄어들면 근육량이 줄어들고, 근육이 줄어들면 근육량이 줄어들고, 근육이 줄어들면 근육량이 줄어들고, 근육이 줄어들면 근육량이 줄어들고, 근육이 줄어들면 근육량이 줄어들고, 근육이 줄어들면 근육량이 줄어들고, 근육이 줄어들면 체중이 줄어들고, 체중이 줄어들면 체중이 줄어들고, 체중이 줄어들면 체중이 줄어들고, 체중이 줄어들면 체중이 줄어들고, 체중'