# 다중 언어를 위한(multilingual) BERT

In [None]:
!pip install transformers
!pip install datasets

**다음 영화 리뷰 데이터 준비**

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

df = pd.read_csv('/content/drive/Othercomputers/내 컴퓨터/TextMining/data/daum_movie_review.csv')

# rating이 6보다 작으면 0 즉 부정, 6 이상이면 긍정으로 라벨 생성
y = [0 if rate < 6 else 1 for rate in df.rating]

# 데이터셋을 학습, 검증, 평가의 세 데이터셋으로 분리
X_train_val, X_test, y_train_val, y_test = train_test_split(df.review.tolist(), y, random_state=0)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, random_state=0)

print('Train set size:', len(X_train))
print('Validation set size:', len(X_val))
print('Test set size:', len(X_test))

Train set size: 8282
Validation set size: 2761
Test set size: 3682


**토크나이저 및 모델(분류기) 설정**

In [None]:
from transformers import BertTokenizer
from transformers import BertForSequenceClassification 

tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

# 토크나이징 예시
print(tokenizer.tokenize("안녕하세요. 반갑습니다."))
inputs = tokenizer("안녕하세요. 반갑습니다.")
print(inputs)

Downloading:   0%|          | 0.00/972k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/625 [00:00<?, ?B/s]

['안', '##녕', '##하', '##세', '##요', '.', '반', '##갑', '##습', '##니다', '.']
{'input_ids': [101, 9521, 118741, 35506, 24982, 48549, 119, 9321, 118610, 119081, 48345, 119, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [None]:
# 토크나이징
train_input = tokenizer(X_train, truncation=True, padding=True, return_tensors="pt")
val_input = tokenizer(X_val, truncation=True, padding=True, return_tensors="pt")
test_input = tokenizer(X_test, truncation=True, padding=True, return_tensors="pt")

In [None]:
# bert-base-multilingual-cased 사전학습모형으로부터 분류기 모형을 생성
model = BertForSequenceClassification.from_pretrained("bert-base-multilingual-cased")

Downloading:   0%|          | 0.00/681M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model ch

**데이터셋 변환**

In [None]:
import torch

class OurDataset(torch.utils.data.Dataset):
    def __init__(self, inputs, labels):
        self.inputs = inputs
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.inputs.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

In [None]:
train_dataset = OurDataset(train_input, y_train)
val_dataset = OurDataset(val_input, y_val)
test_dataset = OurDataset(test_input, y_test)

**정확도 함수 정의**

In [None]:
from datasets import load_metric

metric = load_metric("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

Downloading builder script:   0%|          | 0.00/1.65k [00:00<?, ?B/s]

**미세조정 학습**

In [None]:
from transformers import Trainer, TrainingArguments

# Trainer에서 사용할 하이퍼파라미터 지정
training_args = TrainingArguments(
    output_dir='./results',          # 모형 예측이나 체크포인트 출력 폴더, 반드시 필요함
    num_train_epochs=3,              # 학습 에포크 수
    evaluation_strategy="steps",      # eval_steps 마다 검증 데이터셋에 대한 평가 지표를 출력
    eval_steps = 500,
    per_device_train_batch_size=8,   # 학습에 사용할 배치 사이즈
    per_device_eval_batch_size=16,   # 평가에 사용할 배치 사이즈
    warmup_steps=200,                # 학습률 스케줄러의 warmup 구간 설정
    weight_decay=0.01,               # AdamW의 가중치 감쇠도
)

# Trainer 객체 생성
trainer = Trainer(
    model=model,                     # 학습할 모형
    args=training_args,              # 위에서 정의한 학습 매개변수
    train_dataset=train_dataset,     # 훈련 데이터셋
    eval_dataset=val_dataset,        # 검증 데이터셋
    compute_metrics=compute_metrics,
)

# 미세조정학습 실행
trainer.train()

***** Running training *****
  Num examples = 8282
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 3108
  if __name__ == '__main__':


Step,Training Loss,Validation Loss,Accuracy
500,0.5675,0.583939,0.729446
1000,0.5705,0.543302,0.769286
1500,0.5664,0.540187,0.769286
2000,0.5603,0.540476,0.769286
2500,0.5598,0.540115,0.770373
3000,0.5291,0.49553,0.775081


***** Running Evaluation *****
  Num examples = 2761
  Batch size = 16
Saving model checkpoint to ./results/checkpoint-500
Configuration saved in ./results/checkpoint-500/config.json
Model weights saved in ./results/checkpoint-500/pytorch_model.bin
  if __name__ == '__main__':
***** Running Evaluation *****
  Num examples = 2761
  Batch size = 16
Saving model checkpoint to ./results/checkpoint-1000
Configuration saved in ./results/checkpoint-1000/config.json
Model weights saved in ./results/checkpoint-1000/pytorch_model.bin
  if __name__ == '__main__':
***** Running Evaluation *****
  Num examples = 2761
  Batch size = 16
Saving model checkpoint to ./results/checkpoint-1500
Configuration saved in ./results/checkpoint-1500/config.json
Model weights saved in ./results/checkpoint-1500/pytorch_model.bin
  if __name__ == '__main__':
***** Running Evaluation *****
  Num examples = 2761
  Batch size = 16
Saving model checkpoint to ./results/checkpoint-2000
Configuration saved in ./results/che

TrainOutput(global_step=3108, training_loss=0.556299690727715, metrics={'train_runtime': 1939.146, 'train_samples_per_second': 12.813, 'train_steps_per_second': 1.603, 'total_flos': 4034713478410080.0, 'train_loss': 0.556299690727715, 'epoch': 3.0})

**모델 평가**

In [None]:
trainer.evaluate(eval_dataset=test_dataset)

***** Running Evaluation *****
  Num examples = 3682
  Batch size = 16
  if __name__ == '__main__':


{'epoch': 3.0,
 'eval_accuracy': 0.7705051602390005,
 'eval_loss': 0.49175286293029785,
 'eval_runtime': 76.1335,
 'eval_samples_per_second': 48.362,
 'eval_steps_per_second': 3.034}

# KcBERT에 대한 파이토치 기반 미세조정학습

In [None]:
# GPU 메모리 확보 (multilingual BERT을 실행했다면 필수)

del model
del trainer
torch.cuda.empty_cache()

**토크나이저 및 모델(분류기) 설정**

In [3]:
from transformers import BertTokenizer
from transformers import BertModel

tokenizer = BertTokenizer.from_pretrained('beomi/kcbert-base')

# 토크나이징 예시
print(tokenizer.tokenize("안녕하세요. 반갑습니다."))
inputs = tokenizer("안녕하세요. 반갑습니다.")
print(inputs)

['안녕', '##하세요', '.', '반', '##갑', '##습니다', '.']
{'input_ids': [2, 19017, 8482, 17, 1483, 4981, 8046, 17, 3], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [4]:
# 토크나이징
train_input = tokenizer(X_train, truncation=True, padding=True, return_tensors="pt")
val_input = tokenizer(X_val, truncation=True, padding=True, return_tensors="pt")
test_input = tokenizer(X_test, truncation=True, padding=True, return_tensors="pt")

In [5]:
# KcBERT 사전학습모형 로드
bert_model = BertModel.from_pretrained('beomi/kcbert-base')
bert_model.config

Some weights of the model checkpoint at beomi/kcbert-base were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BertConfig {
  "_name_or_path": "beomi/kcbert-base",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "directionality": "bidi",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 300,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "position_embedding_type": "absolute",
  "transformers_version": "4.20.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30000
}

**데이터셋 및 데이터로더 변환**

In [6]:
import torch
from torch.utils.data import DataLoader

class OurDataset(torch.utils.data.Dataset):
    def __init__(self, inputs, labels):
        self.inputs = inputs
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.inputs.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

In [7]:
# Dataset
train_dataset = OurDataset(train_input, y_train)
val_dataset = OurDataset(val_input, y_val)
test_dataset = OurDataset(test_input, y_test)

In [8]:
# DataLoader
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=8)
val_loader = DataLoader(val_dataset, batch_size=16)
test_loader = DataLoader(test_dataset, batch_size=16)

**모델 정의**

In [9]:
# BERT를 포함한 신경망 모형
class MyModel(torch.nn.Module):
    def __init__(self, pretrained_model, token_size, num_labels): 
        super(MyModel, self).__init__()
        self.token_size = token_size
        self.num_labels = num_labels
        self.pretrained_model = pretrained_model

        # 분류기 정의
        self.classifier = torch.nn.Linear(self.token_size, self.num_labels)

    def forward(self, inputs):
        # BERT 모형에 입력을 넣고 출력을 받음
        outputs = self.pretrained_model(**inputs)
        # BERT 출력에서 CLS 토큰에 해당하는 부분만 가져옴
        bert_clf_token = outputs.last_hidden_state[:,0,:]
        
        return self.classifier(bert_clf_token)

# token_size는 BERT 토큰과 동일
model = MyModel(bert_model, num_labels=2, token_size=bert_model.config.hidden_size)

**모델 학습**

In [10]:
from transformers import AdamW, get_linear_schedule_with_warmup
import torch.nn.functional as F
import time

# GPU 가속을 사용할 수 있으면 device를 cuda로 설정하고, 아니면 cpu로 설정
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

model.to(device)  # 모형을 GPU로 복사
model.train()     # 학습모드로 전환

# 옵티마이저를 트랜스포머가 제공하는 AdamW로 설정
optim = AdamW(model.parameters(), lr=5e-5, weight_decay=0.01) # 가중치 감쇠 설정
criterion = torch.nn.CrossEntropyLoss()    # 멀티클래스이므로 크로스 엔트로피를 손실함수로 사용

num_epochs = 3      # 학습 epoch를 3회로 설정
total_training_steps = num_epochs * len(train_loader)
# 학습 스케줄러 설정
scheduler = get_linear_schedule_with_warmup(optimizer=optim,
                                            num_training_steps=total_training_steps,
                                            num_warmup_steps=200)

start = time.time() # 시작시간 기록
eval_steps = 500
step = 0

model.train()     # 학습모드로 전환



MyModel(
  (pretrained_model): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30000, 768, padding_idx=0)
      (position_embeddings): Embedding(300, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affin

In [11]:
for epoch in range(num_epochs):
    train_loss = 0
    for batch in train_loader:
        optim.zero_grad()     # 그래디언트 초기화

        # 배치에서 label을 제외한 입력만 추출하여  GPU로 복사
        inputs = {k: v.to(device) for k, v in batch.items() if k != 'labels'} 
        labels = batch['labels'].to(device) # 배치에서 라벨을 추출하여 GPU로 복사
        outputs = model(inputs) # 모형으로 결과 예측

        # 두 클래스에 대해 예측하고 각각 비교해야 하므로 labels에 대해 원핫인코딩을 적용한 후에 손실을 게산
        loss = criterion(outputs, F.one_hot(labels, num_classes=2).float()) # loss 계산
        train_loss += loss

        loss.backward() # 그래디언트 계산
        optim.step()    # 가중치 업데이트
        scheduler.step() # 스케줄러 업데이트
        
        step += 1
        if step % eval_steps == 0:  # eval_steps 마다 경과한 시간과 loss를 출력
            with torch.no_grad():   # 학습 X (그래디언트 계산 X)
                val_loss = 0
                model.eval()        # 평가모드로 전환

                for batch in val_loader:
                    inputs = {k: v.to(device) for k, v in batch.items() if k != 'labels'}
                    labels = batch['labels'].to(device)
                    outputs = model(inputs)

                    loss = criterion(outputs, F.one_hot(labels, num_classes=2).float()) # loss 계산
                    val_loss += loss

                avg_val_loss = val_loss / len(val_loader)

            avg_train_loss = train_loss / eval_steps    # eval_steps의 평균 loss 계산
            
            elapsed = time.time() - start
            print('Step %d, elapsed time: %.2f, train loss: %.4f, validation loss: %.4f' 
                  % (step, elapsed, avg_train_loss, avg_val_loss))

  # Remove the CWD from sys.path while we load stuff.


Step 500, elapsed time: 215.10, train loss: 0.4338, validation loss: 0.3216
Step 1000, elapsed time: 431.69, train loss: 0.7569, validation loss: 0.2759
Step 1500, elapsed time: 647.87, train loss: 0.1343, validation loss: 0.3381
Step 2000, elapsed time: 864.52, train loss: 0.2860, validation loss: 0.3151
Step 2500, elapsed time: 1080.73, train loss: 0.0323, validation loss: 0.3729
Step 3000, elapsed time: 1297.50, train loss: 0.0570, validation loss: 0.3868


**모델 평가(테스트)**

In [12]:
from datasets import load_metric

metric= load_metric("accuracy")
model.eval()

for batch in test_loader:
    inputs = {k: v.to(device) for k, v in batch.items() if k != 'labels'}
    labels = batch['labels'].to(device)
    
    with torch.no_grad(): # 학습 X (그래디언트 계산 X)
        outputs = model(inputs)

    predictions = torch.argmax(outputs, dim=-1)

metric.compute(predictions=predictions, references=labels)

  # Remove the CWD from sys.path while we load stuff.


{'accuracy': 1.0}