# 사전 준비

In [None]:
!pip install transformers
!pip install datasets
!pip install sacremoses

In [2]:
import warnings

# 경고메세지 끄기
warnings.filterwarnings(action='ignore')

**GLUE의 SST-2 데이터 불러오기**

In [3]:
from datasets import load_dataset

datasets = load_dataset("glue", "sst2")



  0%|          | 0/3 [00:00<?, ?it/s]

In [4]:
datasets

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 872
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 1821
    })
})

In [5]:
# label 0: negative(부정) / 1: positive(긍정) / -1: test data (비공개)
print(datasets["train"][0])
print(datasets["validation"][0])
print(datasets["test"][0])

{'sentence': 'hide new secretions from the parental units ', 'label': 0, 'idx': 0}
{'sentence': "it 's a charming and often affecting journey . ", 'label': 1, 'idx': 0}
{'sentence': 'uneasy mishmash of styles and genres .', 'label': -1, 'idx': 0}


**XLM 모델과 토크나이저 불러오기**

In [6]:
from transformers import XLMTokenizer, XLMForSequenceClassification

tokenizer = XLMTokenizer.from_pretrained("xlm-mlm-en-2048")
model = XLMForSequenceClassification.from_pretrained("xlm-mlm-en-2048")

Some weights of the model checkpoint at xlm-mlm-en-2048 were not used when initializing XLMForSequenceClassification: ['pred_layer.proj.weight', 'pred_layer.proj.bias']
- This IS expected if you are initializing XLMForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMForSequenceClassification were not initialized from the model checkpoint at xlm-mlm-en-2048 and are newly initialized: ['sequence_summary.summary.weight', 'transformer.position_ids', 'sequence_summary.summary.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and infer

In [7]:
model.config

XLMConfig {
  "_name_or_path": "xlm-mlm-en-2048",
  "architectures": [
    "XLMWithLMHeadModel"
  ],
  "asm": false,
  "attention_dropout": 0.1,
  "bos_index": 0,
  "bos_token_id": 0,
  "causal": false,
  "dropout": 0.1,
  "emb_dim": 2048,
  "embed_init_std": 0.02209708691207961,
  "end_n_top": 5,
  "eos_index": 1,
  "gelu_activation": true,
  "init_std": 0.02,
  "is_encoder": true,
  "lang_id": 0,
  "layer_norm_eps": 1e-12,
  "mask_index": 5,
  "mask_token_id": 0,
  "max_position_embeddings": 512,
  "model_type": "xlm",
  "n_heads": 16,
  "n_langs": 1,
  "n_layers": 12,
  "pad_index": 2,
  "pad_token_id": 2,
  "sinusoidal_embeddings": false,
  "start_n_top": 5,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "first",
  "summary_use_proj": true,
  "transformers_version": "4.23.0",
  "unk_index": 3,
  "use_lang_emb": true,
  "vocab_size": 30145
}

# 데이터 구축

데이터 준비
train:validation:test = 7 : 1 : 2

In [8]:
from tqdm.auto import tqdm as tqdm_auto

In [9]:
ids = datasets['validation'].num_rows

In [10]:
train_sentence = [datasets['train']['sentence'][idx] for idx in tqdm_auto(range(0, ids*7))]
train_label = [datasets['train']['label'][idx] for idx in tqdm_auto(range(0, ids*7))]

  0%|          | 0/6104 [00:00<?, ?it/s]

  0%|          | 0/6104 [00:00<?, ?it/s]

In [11]:

val_sentence = [datasets['validation']['sentence'][idx] for idx in tqdm_auto(range(0, ids))]
val_label = [datasets['validation']['label'][idx] for idx in tqdm_auto(range(0, ids))]

  0%|          | 0/872 [00:00<?, ?it/s]

  0%|          | 0/872 [00:00<?, ?it/s]

In [12]:
# SST-2의 test data는 비공개이기 때문에 train data의 일부로 test data를 만든다
test_sentence = [datasets['train']['sentence'][idx] for idx in tqdm_auto(range(ids*7, ids*9))]
test_label = [datasets['train']['label'][idx] for idx in tqdm_auto(range(ids*7, ids*9))]

  0%|          | 0/1744 [00:00<?, ?it/s]

  0%|          | 0/1744 [00:00<?, ?it/s]

In [13]:
# 마지막 train data와 test data의 마지막과 처음이 중복인지 확인
print("last train data:", train_sentence[-1])
print("last test data:", test_sentence[0])

last train data: a moral 
last test data: that gives movies about ordinary folk a bad name 


**토크나이징**

In [14]:
# 패딩 채우기
train_input = tokenizer(train_sentence, padding=True, truncation=True, max_length=64, return_tensors="pt")
val_input = tokenizer(val_sentence, padding=True, truncation=True, max_length=64, return_tensors="pt")
test_input = tokenizer(test_sentence, padding=True, truncation=True, max_length=64, return_tensors="pt")

**데이터셋 변환**

In [15]:
import sys
import torch

In [16]:
class SSTDataset(torch.utils.data.Dataset):
    def __init__(self, inputs, labels):
        self.inputs = inputs
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.inputs.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

In [17]:
train_dataset = SSTDataset(train_input, train_label)
val_dataset = SSTDataset(val_input, val_label)
test_dataset = SSTDataset(test_input, test_label)

In [18]:
for n in range(3):
    print("train_dataset[",n,"]")
    print(train_dataset[n])

train_dataset[ 0 ]
{'input_ids': tensor([    0,  3870,    79,  9458,  1895,    40,    14, 20941,  1404,     1,
            2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
            2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
            2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
            2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
            2,     2,     2,     2,     2,     2]), 'token_type_ids': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0]), 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0]), 'labels': tensor(0)}
train_dataset[ 1 ]
{'input_ids': tensor([    0,  1710,    93, 16205,    15,    9

**데이터로더 정의**

In [19]:
from torch.utils.data import DataLoader

In [20]:
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=8)
val_loader = DataLoader(val_dataset, batch_size=8)
test_loader = DataLoader(test_dataset, batch_size=8)

In [21]:
# 데이터로더 확인
next(iter(train_loader))

{'input_ids': tensor([[    0,    48, 12380,    18,   224, 17785,  2329,     1,     2,     2,
              2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
              2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
              2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
              2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
              2,     2,     2,     2,     2,     2],
         [    0,   185,   109,    15,    55,    61,  1812,  5311,  8197,  6880,
           8659,    14,  2317,    15, 16096,   386,    17,  9675,    16,     1,
              2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
              2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
              2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
              2,     2,     2,     2,     2,     2],
         [    0,  2006,    29,    14,  2930,     1,     2,     2,     2,     2,
 

# 모델 학습

**모델 파라미터 설정**

In [22]:
import torch.nn
from transformers import get_linear_schedule_with_warmup

In [23]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

num_epochs = 5
learning_rate = 5e-4

optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
criterion = torch.nn.CrossEntropyLoss()

"""
# 학습 스케줄러 설정
scheduler = get_linear_schedule_with_warmup(optimizer=optimizer,
                                            num_training_steps=len(train_loader),
                                            num_warmup_steps=0)
"""

step = 0
eval_steps = 380        # 훈련 배치수의 약 절반

In [24]:
model.to(device)

XLMForSequenceClassification(
  (transformer): XLMModel(
    (position_embeddings): Embedding(512, 2048)
    (embeddings): Embedding(30145, 2048, padding_idx=2)
    (layer_norm_emb): LayerNorm((2048,), eps=1e-12, elementwise_affine=True)
    (attentions): ModuleList(
      (0): MultiHeadAttention(
        (q_lin): Linear(in_features=2048, out_features=2048, bias=True)
        (k_lin): Linear(in_features=2048, out_features=2048, bias=True)
        (v_lin): Linear(in_features=2048, out_features=2048, bias=True)
        (out_lin): Linear(in_features=2048, out_features=2048, bias=True)
      )
      (1): MultiHeadAttention(
        (q_lin): Linear(in_features=2048, out_features=2048, bias=True)
        (k_lin): Linear(in_features=2048, out_features=2048, bias=True)
        (v_lin): Linear(in_features=2048, out_features=2048, bias=True)
        (out_lin): Linear(in_features=2048, out_features=2048, bias=True)
      )
      (2): MultiHeadAttention(
        (q_lin): Linear(in_features=2048, o

**학습 진행**

In [25]:
import torch.nn.functional as F
from sklearn.metrics import accuracy_score

In [26]:
model.train()

for epoch in range(num_epochs):
    loss = 0
    train_loss = 0.0
    
    for batch in tqdm_auto(train_loader, mininterval=0.01, leave=True):
        optimizer.zero_grad()     # 그래디언트 초기화

        # 배치에서 label을 제외한 입력만 추출하여 GPU로 복사
        inputs = {k: v.to(device) for k, v in batch.items() if k != 'labels'} 
        labels = batch['labels'].to(device)     # 배치에서 라벨을 추출하여 GPU로 복사
        outputs = model(**inputs).logits    # 모형으로 결과 예측

        loss = criterion(outputs, F.one_hot(labels, num_classes=2).float())
        train_loss += loss
        
        loss.backward()
        optimizer.step()

        step += 1
        if step % eval_steps == 0:  # eval_steps 마다 loss를 출력
            with torch.no_grad():   # 학습 X (그래디언트 계산 X)
                val_loss = 0
                model.eval()        # 평가모드로 전환

                for val_batch in tqdm_auto(val_loader, mininterval=0.01, leave=True):

                    # 배치에서 label을 제외한 입력만 추출하여 GPU로 복사
                    inputs = {k: v.to(device) for k, v in batch.items() if k != 'labels'} 
                    labels = batch['labels'].to(device)     # 배치에서 라벨을 추출하여 GPU로 복사
                    outputs = model(**inputs).logits     # 모형으로 결과 예측

                    loss = criterion(outputs, F.one_hot(labels, num_classes=2).float())  
                    val_loss += loss

                avg_val_loss = val_loss / len(val_loader)

            val_accuracy = accuracy_score(labels.cpu(), outputs.argmax(dim=1).cpu())
            print('Step %d, validation loss: %.4f, accuracy_score: %.2f' % (step, avg_val_loss, val_accuracy))
            
        avg_train_loss = train_loss / len(train_loader)
    train_accuracy = accuracy_score(labels.cpu(), outputs.argmax(dim=1).cpu())
    print('epoch %d, train loss: %.4f, accuracy_score: %.2f \n' % (epoch, avg_train_loss, train_accuracy))

  0%|          | 0/763 [00:00<?, ?it/s]

  0%|          | 0/109 [00:00<?, ?it/s]

Step 380, validation loss: 0.7557, accuracy_score: 0.62


  0%|          | 0/109 [00:00<?, ?it/s]

Step 760, validation loss: 0.7077, accuracy_score: 0.38
epoch 0, train loss: 1.0308, accuracy_score: 0.50 



  0%|          | 0/763 [00:00<?, ?it/s]

  0%|          | 0/109 [00:00<?, ?it/s]

Step 1140, validation loss: 0.7077, accuracy_score: 0.25


  0%|          | 0/109 [00:00<?, ?it/s]

Step 1520, validation loss: 0.6150, accuracy_score: 0.88
epoch 1, train loss: 0.7014, accuracy_score: 0.75 



  0%|          | 0/763 [00:00<?, ?it/s]

  0%|          | 0/109 [00:00<?, ?it/s]

Step 1900, validation loss: 0.7317, accuracy_score: 0.38


  0%|          | 0/109 [00:00<?, ?it/s]

Step 2280, validation loss: 0.7306, accuracy_score: 0.25
epoch 2, train loss: 0.6907, accuracy_score: 0.50 



  0%|          | 0/763 [00:00<?, ?it/s]

  0%|          | 0/109 [00:00<?, ?it/s]

Step 2660, validation loss: 0.7003, accuracy_score: 0.38


  0%|          | 0/109 [00:00<?, ?it/s]

Step 3040, validation loss: 0.6462, accuracy_score: 0.75
epoch 3, train loss: 0.6890, accuracy_score: 0.62 



  0%|          | 0/763 [00:00<?, ?it/s]

  0%|          | 0/109 [00:00<?, ?it/s]

Step 3420, validation loss: 0.7252, accuracy_score: 0.25


  0%|          | 0/109 [00:00<?, ?it/s]

Step 3800, validation loss: 0.7226, accuracy_score: 0.38
epoch 4, train loss: 0.6893, accuracy_score: 0.62 



# 모델 테스트

SST-2의 테스트 데이터의 label은 모두 -1로 비공개 되어있다. (https://github.com/huggingface/datasets/issues/245) <br>
따라서 테스트 데이터를 훈련데이터에서 따로 분리하여 진행한다.

In [27]:
model.eval()

for batch in test_loader:
    inputs = {k: v.to(device) for k, v in batch.items() if k != 'labels'}
    labels = batch['labels'].to(device)
    
    with torch.no_grad():   # 학습 X (그래디언트 계산 X)
        outputs = model(**inputs).logits

    test_accuracy = accuracy_score(labels.cpu(), outputs.argmax(dim=1).cpu())

print('test accuracy_score: %.2f \n' % (test_accuracy))

test accuracy_score: 0.38 

