# 1주차 HomeWork
- Bucketing 이해 및 구현
- NSMC 학습.

### Load Data

In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"]="0"

In [2]:
import sys
from omegaconf import OmegaConf

from sklearn.metrics import accuracy_score, f1_score

import torch
from datasets import load_dataset
from transformers import (
    AutoTokenizer, 
    AutoModelForSequenceClassification, 
    EvalPrediction,
    Trainer,
    TrainingArguments,
)
from transformers.integrations import WandbCallback

sys.path.append('/root/nlp-with-transformers')
from src.data import NSMCDataModule



In [3]:
config = OmegaConf.load('conf/nsmc.yaml')
tokenizer = AutoTokenizer.from_pretrained(config.model.pretrained_model_name_or_path)
dm = NSMCDataModule(tokenizer=tokenizer, **OmegaConf.to_container(config.data))
dm.setup()

Found cached dataset nsmc (/root/.cache/huggingface/datasets/nsmc/default/1.1.0/bfd4729bf1a67114e5267e6916b9e4807010aeb238e4a3c2b95fbfa3a014b5f3)


  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/143 [00:00<?, ?ba/s]

Loading cached processed dataset at /root/.cache/huggingface/datasets/nsmc/default/1.1.0/bfd4729bf1a67114e5267e6916b9e4807010aeb238e4a3c2b95fbfa3a014b5f3/cache-2d03d9ec85550652.arrow


  0%|          | 0/8 [00:00<?, ?ba/s]

  0%|          | 0/143 [00:00<?, ?ba/s]

Loading cached processed dataset at /root/.cache/huggingface/datasets/nsmc/default/1.1.0/bfd4729bf1a67114e5267e6916b9e4807010aeb238e4a3c2b95fbfa3a014b5f3/cache-fa4cc0bdb164e8b3.arrow


  0%|          | 0/8 [00:00<?, ?ba/s]

In [4]:
model_base_dir = '/root/model_safari'
config.training.output_dir = f"{model_base_dir}/{config.model.pretrained_model_name_or_path}-finetuned-nsmc"
config.training.logging_steps = len(dm.ds['train']) // config.training.per_device_train_batch_size
training_args = TrainingArguments(**OmegaConf.to_container(config.training))

---

### Bucketing에 따른 배치별 label 분포 차이 확인.
* bucketing을 적용할 경우, 속도는 빨라지지만, 문장 길이에 따른 클래스 분포가 RandomSampling과 차이가 존재할 경우, 성능하락이 있을 수 있음.
* 그래서 RandomSampler와 LengthGroupSampler의 미니배치의 비율 차이를 확인하는 것이 도움이 될 것이다.

In [5]:
from torch.utils.data import DataLoader, RandomSampler
from torch.utils.data import RandomSampler
from transformers import DataCollatorWithPadding
from transformers.trainer_utils import seed_worker
from transformers.trainer_pt_utils import LengthGroupedSampler

from src.utils.dl_stats import aggregate_batch_label_counts

In [6]:
generator = torch.Generator()
generator.manual_seed(training_args.seed)
data_collator = DataCollatorWithPadding(tokenizer,return_tensors='np')

* length_group_sampler는 dataset의 index를 input의 길이를 기준으로 정렬하기 때문에 시간이 좀 걸림.

In [7]:
%%time
random_sampler = RandomSampler(dm.ds['train'], generator=generator)

CPU times: user 51 µs, sys: 4 µs, total: 55 µs
Wall time: 64.1 µs


In [8]:
%%time
length_group_sampler = LengthGroupedSampler(training_args.per_device_train_batch_size, dataset=dm.ds['train'], generator=generator)

CPU times: user 44.1 s, sys: 127 ms, total: 44.2 s
Wall time: 44.2 s


In [9]:
# random sampler
rs_dl = DataLoader(
            dm.ds['train'],
            batch_size=training_args.per_device_train_batch_size,
            sampler=random_sampler,
            collate_fn=data_collator,
            drop_last=True,
            num_workers=training_args.dataloader_num_workers,
            pin_memory=training_args.dataloader_pin_memory,
            worker_init_fn=seed_worker,
        )
# length group sampler
lg_dl = DataLoader(
            dm.ds['train'],
            batch_size=training_args.per_device_train_batch_size,
            sampler=length_group_sampler,
            collate_fn=data_collator,
            drop_last=True,
            num_workers=training_args.dataloader_num_workers,
            pin_memory=training_args.dataloader_pin_memory,
            worker_init_fn=seed_worker,
        )

In [10]:
from scipy.stats import ttest_ind

In [11]:
rs_df = aggregate_batch_label_counts(rs_dl, dm.id2label)
lg_df = aggregate_batch_label_counts(lg_dl, dm.id2label)

[2023-02-02 05:46:31.028: W smdistributed/modelparallel/torch/nn/predefined_hooks.py:75] Found unsupported HuggingFace version 4.25.1 for automated tensor parallelism. HuggingFace modules will not be automatically distributed. You can use smp.tp_register_with_module API to register desired modules for tensor parallelism, or directly instantiate an smp.nn.DistributedModule. Supported HuggingFace transformers versions for automated tensor parallelism: ['4.17.0', '4.20.1', '4.21.0']
[2023-02-02 05:46:31.116 ee7736cf838b:2818 INFO utils.py:27] RULE_JOB_STOP_SIGNAL_FILENAME: None
[2023-02-02 05:46:31.284 ee7736cf838b:2818 INFO profiler_config_parser.py:111] Unable to find config at /opt/ml/input/config/profilerconfig.json. Profiler is disabled.


You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `_

* 평균은 갖고 분산만 다름.

In [12]:
print('Random Sampling')
print(f"{rs_df['positive'].mean():.2f} ± {rs_df['positive'].std():.2f}")
print('LengthGroup Sampling')
print(f"{lg_df['positive'].mean():.2f} ± {lg_df['positive'].std():.2f}")

Random Sampling
31.91 ± 3.95
LengthGroup Sampling
31.91 ± 4.10


* 독립표본 t검증으로도 귀무가설 채택

In [13]:
stat, pv = ttest_ind(rs_df['positive'], lg_df['positive'])
print(f"p-value: {pv:.4f}")

p-value: 0.9941


---

### 모델 학습

In [14]:
def compute_metrics(pred:EvalPrediction):
    """Get EvalPrediction and Calculate the metrics"""
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    f1 = f1_score(labels, preds, average="weighted")
    acc = accuracy_score(labels, preds)
    return {"accuracy":acc, "f1":f1}

In [15]:
os.environ['WANDB_PROJECT'] = config.env['wandb']['WANDB_PROJECT']
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = AutoModelForSequenceClassification.from_pretrained(
    config.model.pretrained_model_name_or_path, 
    num_labels=dm.num_classes
).to(device)

Some weights of the model checkpoint at klue/bert-base were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized

In [16]:
trainer = Trainer(model=model, args=training_args,
                  compute_metrics=compute_metrics,
                  train_dataset=dm.ds['train'],
                  eval_dataset=dm.ds['validation'],
                  data_collator=dm.get_collate_fn(),
                  callbacks=[WandbCallback()]
                 )

In [17]:
print(training_args.group_by_length)

False


In [18]:
%%time
rs_model = trainer.train()

ERROR:wandb.jupyter:Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mdatalama[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.2767,0.243326,0.899867,0.899767
2,0.1876,0.228202,0.912267,0.912267
3,0.1342,0.252896,0.9136,0.91359


CPU times: user 27min 6s, sys: 10min 55s, total: 38min 1s
Wall time: 35min 53s


In [19]:
training_args.group_by_length = True

In [21]:
trainer = Trainer(model=model, args=training_args,
                  compute_metrics=compute_metrics,
                  train_dataset=dm.ds['train'],
                  eval_dataset=dm.ds['validation'],
                  data_collator=dm.get_collate_fn(),
                  callbacks=[WandbCallback()]
                 )

In [22]:
%%time
lg_model = trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.1558,0.2712,0.905467,0.90545
2,0.1027,0.303188,0.9064,0.906393
3,0.0644,0.369119,0.907067,0.907054


CPU times: user 13min 47s, sys: 2min 50s, total: 16min 37s
Wall time: 15min 25s


* 결과
  * 확실히 bucketing을 적용한 데이터가 더 빠르게 수렴하는 현상을 보임. (절반 이하)
  * 물론 실험을 한번만 돌려서 확언하기는 어렵지만, bucketing을 적용한 경우, Validation loss가 빠르게 exploding하는 현상을 보임.

---