In [1]:
from transformers import Trainer, TrainingArguments, DataCollatorWithPadding
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from datasets import load_dataset, load_metric
import torch

import numpy as np
import evaluate

import os
# os.environ['CUDA_LAUNCH_BLOCKING'] = "1"
# Emulate a single-process "distributed" launch inside the notebook kernel.
# NOTE(review): presumably required so that DeepSpeed (enabled later through
# TrainingArguments) can initialise without an external launcher - confirm
# against the DeepSpeed integration docs.
os.environ.update({
    'MASTER_ADDR': 'localhost',
    'MASTER_PORT': '9994',  # modify if RuntimeError: Address already in use
    'RANK': '0',
    'LOCAL_RANK': '0',
    'WORLD_SIZE': '1',
})

2023-05-25 16:26:06.925681: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Load the "yelp_review_full" dataset; its "train" and "test" splits are
# indexed further down when the small subsets are built.
dataset = load_dataset("yelp_review_full")

Found cached dataset yelp_review_full (/home/bigster/.cache/huggingface/datasets/yelp_review_full/yelp_review_full/1.0.0/e8e18e19d7be9e75642fc66b198abadb116f73599ec89a69ba5dd8d1e57ba0bf)


  0%|          | 0/2 [00:00<?, ?it/s]

- 단 1번의 스텝으로 데이터를 처리하기 위해, `Datasets.map` method를 사용하자
- `batched=True`는 여러 example을 batch로 묶어 함수에 한 번에 넘기는 방식 (`multiprocessing`과는 다름; 병렬 처리는 `num_proc` 인자로 따로 지정)

In [3]:
# Tokenizer for the "bert-base-cased" checkpoint; the same checkpoint name is
# used when the classification model is loaded.
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

In [4]:
def tokenize_function(examples):
    """Tokenize the ``"text"`` field of ``examples`` with the notebook-level tokenizer.

    The tokenizer is called with ``padding="max_length"`` and
    ``truncation=True``; whatever it returns is handed back unchanged so the
    function can be used as the callback of ``dataset.map``.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, padding="max_length", truncation=True)
    return encoded

In [5]:
# Tokenize every split, handing examples to tokenize_function in batches.
tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Build 1000-example subsets of the train and test splits using the same
# shuffle seed (42) for both, so the subsets are reproducible.
small_train_dataset, small_eval_dataset = (
    tokenized_datasets[split].shuffle(seed=42).select(range(1000))
    for split in ("train", "test")
)

# data_collator = DataCollatorWithPadding(tokenizer = tokenizer)
# tokenized_datasets.set_format('torch')

Loading cached processed dataset at /home/bigster/.cache/huggingface/datasets/yelp_review_full/yelp_review_full/1.0.0/e8e18e19d7be9e75642fc66b198abadb116f73599ec89a69ba5dd8d1e57ba0bf/cache-64d6404193e8d0b4.arrow
Loading cached processed dataset at /home/bigster/.cache/huggingface/datasets/yelp_review_full/yelp_review_full/1.0.0/e8e18e19d7be9e75642fc66b198abadb116f73599ec89a69ba5dd8d1e57ba0bf/cache-32a25b1122eef73f.arrow
Loading cached shuffled indices for dataset at /home/bigster/.cache/huggingface/datasets/yelp_review_full/yelp_review_full/1.0.0/e8e18e19d7be9e75642fc66b198abadb116f73599ec89a69ba5dd8d1e57ba0bf/cache-c45c2b5ee50b85dc.arrow
Loading cached shuffled indices for dataset at /home/bigster/.cache/huggingface/datasets/yelp_review_full/yelp_review_full/1.0.0/e8e18e19d7be9e75642fc66b198abadb116f73599ec89a69ba5dd8d1e57ba0bf/cache-85ddd7c92ccad2ff.arrow


### Train
- HF의 `Trainer` API는 logging, gradient accumulation, mixed precision과 같은 다양한 학습 옵션을 지원

- 모델과 레이블 개수를 정확하게 명시해주기

In [6]:
# Load "bert-base-cased" with a sequence-classification head of 5 labels.
# NOTE(review): 5 presumably matches the number of rating classes in
# yelp_review_full - confirm against the dataset's label feature.
model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=5)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

- `TrainingArguments` 클래스 안에 모든 H-params를 포함시킬 수 있음

In [7]:
# Minimal TrainingArguments: only the output directory is specified here.
# NOTE(review): `training_args` is rebound by a later cell that passes the
# same output_dir plus further options, so this instance is not the one used
# for training.
training_args = TrainingArguments(output_dir = './test_trainer')

- `Trainer`는 학습 중에 모델 성능에 대해 자동적으로 평가해주지 않음
- 그래서 성능 지표를 계산해주는 함수를 `Trainer`의 `compute_metrics` 인자로 pass 시켜줘야함
- `Evaluate` 라이브러리는 간단한 accuracy 함수를 제공하며 `evaluate.load`를 통해 쉽게 불러올 수 있음

In [8]:
# Accuracy metric object obtained through evaluate.load; compute_metrics
# calls its .compute(...) method.
metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Score one evaluation pass with the notebook-level ``metric``.

    ``eval_pred`` is unpacked into ``(logits, labels)``. The predicted class
    for each row is the argmax of the logits over the last axis, and the
    result of ``metric.compute`` for those predictions against the labels is
    returned.
    """
    logits, references = eval_pred
    predicted_ids = np.argmax(logits, axis=-1)
    scores = metric.compute(predictions=predicted_ids, references=references)
    return scores

- `TrainingArguments`의 `evaluation_strategy` 파라미터를 `"epoch"`으로 지정하면 각 epoch의 끝마다 평가 지표를 report 받을 수 있음 (metrics 함수 자체는 `Trainer`의 `compute_metrics` 인자로 전달)

In [15]:
# Training configuration: evaluate at the end of every epoch, train in fp16
# through DeepSpeed, and limit how many checkpoints are retained on disk.
training_args = TrainingArguments(
    output_dir="./test_trainer",
    evaluation_strategy="epoch",
    # The DeepSpeed config file is not part of this notebook; the relative
    # path has to resolve from the kernel's working directory.
    deepspeed="ds_config_zero2.json",
    save_total_limit=1,
    fp16=True,
)

# Wire the model, the two 1000-example subsets and the metric callback into
# a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
    compute_metrics=compute_metrics,
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using cuda_amp half precision backend


### Fine-tune

In [16]:
# Run fine-tuning. As the last expression of the cell, the value returned by
# train() is displayed as the cell output.
trainer.train()

The following columns in the training set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.


[2023-05-25 16:49:07,047] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed info: version=0.9.2, git-hash=unknown, git-branch=unknown
[2023-05-25 16:49:07,078] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed Flops Profiler Enabled: False
Time to load cpu_adam op: 2.158184289932251 seconds
[2023-05-25 16:49:10,335] [INFO] [logging.py:96:log_dist] [Rank 0] Using DeepSpeed Optimizer param name adamw as basic optimizer
Adam Optimizer #2 is created with AVX2 arithmetic capability.
Config: alpha=0.000050, betas=(0.900000, 0.999000), weight_decay=0.000000, adam_w=1
[2023-05-25 16:49:10,341] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed Basic Optimizer = DeepSpeedCPUAdam
[2023-05-25 16:49:10,342] [INFO] [utils.py:54:is_zero_supported_optimizer] Checking ZeRO support for optimizer=DeepSpeedCPUAdam type=<class 'deepspeed.ops.adam.cpu_adam.DeepSpeedCPUAdam'>
[2023-05-25 16:49:10,342] [INFO] [logging.py:96:log_dist] [Rank 0] Creating torch.float16 ZeRO stage 2 optimizer
[2023-05-25 16:49:

Using /home/bigster/.cache/torch_extensions/py310_cu118 as PyTorch extensions root...
No modifications detected for re-loaded extension module cpu_adam, skipping build step...
Loading extension module cpu_adam...
Using /home/bigster/.cache/torch_extensions/py310_cu118 as PyTorch extensions root...
No modifications detected for re-loaded extension module utils, skipping build step...
Loading extension module utils...


Rank: 0 partition count [1] and sizes[(108314118, False)] 
[2023-05-25 16:49:10,947] [INFO] [utils.py:785:see_memory_usage] Before initializing optimizer states
[2023-05-25 16:49:10,948] [INFO] [utils.py:786:see_memory_usage] MA 0.66 GB         Max_MA 0.75 GB         CA 0.76 GB         Max_CA 1 GB 
[2023-05-25 16:49:10,949] [INFO] [utils.py:793:see_memory_usage] CPU Virtual Memory:  used = 10.69 GB, percent = 17.0%
[2023-05-25 16:49:11,392] [INFO] [utils.py:785:see_memory_usage] After initializing optimizer states
[2023-05-25 16:49:11,393] [INFO] [utils.py:786:see_memory_usage] MA 0.66 GB         Max_MA 0.66 GB         CA 0.76 GB         Max_CA 1 GB 
[2023-05-25 16:49:11,394] [INFO] [utils.py:793:see_memory_usage] CPU Virtual Memory:  used = 11.48 GB, percent = 18.3%
[2023-05-25 16:49:11,394] [INFO] [stage_1_and_2.py:489:__init__] optimizer state initialized
[2023-05-25 16:49:11,520] [INFO] [utils.py:785:see_memory_usage] After initializing ZeRO optimizer
[2023-05-25 16:49:11,520] [INF

Using /home/bigster/.cache/torch_extensions/py310_cu118 as PyTorch extensions root...
No modifications detected for re-loaded extension module utils, skipping build step...
Loading extension module utils...
***** Running training *****
  Num examples = 1000
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 375
  Number of trainable parameters = 108314117


Time to load utils op: 0.0018157958984375 seconds


  0%|          | 0/375 [00:00<?, ?it/s]

[2023-05-25 16:49:11,796] [INFO] [loss_scaler.py:188:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, but hysteresis is 2. Reducing hysteresis to 1
[2023-05-25 16:49:11,931] [INFO] [loss_scaler.py:181:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, reducing to 32768
[2023-05-25 16:49:13,200] [INFO] [loss_scaler.py:181:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 32768, reducing to 16384
[2023-05-25 16:49:14,663] [INFO] [logging.py:96:log_dist] [Rank 0] step=10, skipped=3, lr=[5e-05], mom=[[0.9, 0.999]]
[2023-05-25 16:49:14,664] [INFO] [timer.py:199:stop] epoch=0/micro_step=10/global_step=10, RunningAvgSamplesPerSec=23.975885802684157, CurrSamplesPerSec=22.51934495146732, MemAllocated=0.26GB, MaxMemAllocated=3.67GB
[2023-05-25 16:49:18,359] [INFO] [logging.py:96:log_dist] [Rank 0] step=20, skipped=3, lr=[5e-05], mom=[[0.9, 0.999]]
[2023-05-25 16:49:18,360] [INFO] [timer.py:199

The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 8


  0%|          | 0/125 [00:00<?, ?it/s]

{'eval_loss': 1.3935546875, 'eval_accuracy': 0.537, 'eval_runtime': 3.6843, 'eval_samples_per_second': 271.419, 'eval_steps_per_second': 33.927, 'epoch': 1.0}
[2023-05-25 16:50:03,204] [INFO] [logging.py:96:log_dist] [Rank 0] step=130, skipped=4, lr=[5e-05], mom=[[0.9, 0.999]]
[2023-05-25 16:50:03,205] [INFO] [timer.py:199:stop] epoch=0/micro_step=130/global_step=130, RunningAvgSamplesPerSec=22.041497142142894, CurrSamplesPerSec=22.20348009295776, MemAllocated=0.26GB, MaxMemAllocated=3.67GB
[2023-05-25 16:50:07,104] [INFO] [logging.py:96:log_dist] [Rank 0] step=140, skipped=4, lr=[5e-05], mom=[[0.9, 0.999]]
[2023-05-25 16:50:07,105] [INFO] [timer.py:199:stop] epoch=0/micro_step=140/global_step=140, RunningAvgSamplesPerSec=21.96298108863297, CurrSamplesPerSec=21.316013421914924, MemAllocated=0.26GB, MaxMemAllocated=3.67GB
[2023-05-25 16:50:11,698] [INFO] [logging.py:96:log_dist] [Rank 0] step=150, skipped=4, lr=[5e-05], mom=[[0.9, 0.999]]
[2023-05-25 16:50:11,699] [INFO] [timer.py:199:s

The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 8


  0%|          | 0/125 [00:00<?, ?it/s]

{'eval_loss': 1.9208984375, 'eval_accuracy': 0.528, 'eval_runtime': 3.6897, 'eval_samples_per_second': 271.027, 'eval_steps_per_second': 33.878, 'epoch': 2.0}
[2023-05-25 16:50:59,052] [INFO] [logging.py:96:log_dist] [Rank 0] step=260, skipped=4, lr=[5e-05], mom=[[0.9, 0.999]]
[2023-05-25 16:50:59,053] [INFO] [timer.py:199:stop] epoch=0/micro_step=260/global_step=260, RunningAvgSamplesPerSec=21.177279172926134, CurrSamplesPerSec=22.67217393713306, MemAllocated=0.26GB, MaxMemAllocated=3.67GB
[2023-05-25 16:51:01,862] [INFO] [loss_scaler.py:181:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 8192, reducing to 4096
[2023-05-25 16:51:02,592] [INFO] [logging.py:96:log_dist] [Rank 0] step=270, skipped=5, lr=[5e-05], mom=[[0.9, 0.999]]
[2023-05-25 16:51:02,593] [INFO] [timer.py:199:stop] epoch=0/micro_step=270/global_step=270, RunningAvgSamplesPerSec=21.24611443487586, CurrSamplesPerSec=22.741289813180074, MemAllocated=0.26GB, MaxMemAllocated=3.67GB
[2023-05-25

The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 8


  0%|          | 0/125 [00:00<?, ?it/s]



Training completed. Do not forget to share your model on huggingface.co/models =)




{'eval_loss': 2.4375, 'eval_accuracy': 0.497, 'eval_runtime': 3.6813, 'eval_samples_per_second': 271.64, 'eval_steps_per_second': 33.955, 'epoch': 3.0}
{'train_runtime': 154.2573, 'train_samples_per_second': 19.448, 'train_steps_per_second': 2.431, 'train_loss': 0.40657767740885414, 'epoch': 3.0}


TrainOutput(global_step=375, training_loss=0.40657767740885414, metrics={'train_runtime': 154.2573, 'train_samples_per_second': 19.448, 'train_steps_per_second': 2.431, 'train_loss': 0.40657767740885414, 'epoch': 3.0})

In [17]:
# Save the fine-tuned model into ./test_trainer (the same directory that was
# given as output_dir in TrainingArguments).
trainer.save_model('./test_trainer')

Saving model checkpoint to ./test_trainer
Configuration saved in ./test_trainer/config.json
Model weights saved in ./test_trainer/pytorch_model.bin


In [21]:
# Drop the notebook's references to the model and trainer, then ask PyTorch
# to release its cached CUDA memory.
del model, trainer
torch.cuda.empty_cache()