# ALBERT를 이용한 감정 분석
사전학습모델 : albert-base-v2<br>
데이터 : GLUE_SST-2 (Stanford Sentiment Treebank v2)

# 사전 준비

In [1]:
!pip install transformers
!pip install datasets
!pip install sentencepiece

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
import random
import numpy as np

# Nondeterministic 한 작업 피하기
RANDOM_SEED = 2022

random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

**GLUE의 SST-2 데이터 불러오기**

In [3]:
from datasets import load_dataset

datasets = load_dataset("glue", "sst2")

Reusing dataset glue (/root/.cache/huggingface/datasets/glue/sst2/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)


  0%|          | 0/3 [00:00<?, ?it/s]

In [4]:
datasets

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 872
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 1821
    })
})

In [5]:
# label 0: negative(부정)) / 1: positive(긍정) / -1: test data
print(datasets["train"][0])
print(datasets["validation"][0])
print(datasets["test"][0])

{'sentence': 'hide new secretions from the parental units ', 'label': 0, 'idx': 0}
{'sentence': "it 's a charming and often affecting journey . ", 'label': 1, 'idx': 0}
{'sentence': 'uneasy mishmash of styles and genres .', 'label': -1, 'idx': 0}


**ALBERT 모델과 토크나이저 불러오기**

In [6]:
from transformers import AlbertForSequenceClassification, AlbertTokenizer

tokenizer = AlbertTokenizer.from_pretrained("albert-base-v2")
albert_model = AlbertForSequenceClassification.from_pretrained("albert-base-v2")

Some weights of the model checkpoint at albert-base-v2 were not used when initializing AlbertForSequenceClassification: ['predictions.LayerNorm.bias', 'predictions.LayerNorm.weight', 'predictions.dense.weight', 'predictions.dense.bias', 'predictions.decoder.weight', 'predictions.decoder.bias', 'predictions.bias']
- This IS expected if you are initializing AlbertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing AlbertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.weight', 'classifier.bias']
You sho

In [7]:
albert_model.config

AlbertConfig {
  "_name_or_path": "albert-base-v2",
  "architectures": [
    "AlbertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0,
  "bos_token_id": 2,
  "classifier_dropout_prob": 0.1,
  "down_scale_factor": 1,
  "embedding_size": 128,
  "eos_token_id": 3,
  "gap_size": 0,
  "hidden_act": "gelu_new",
  "hidden_dropout_prob": 0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "inner_group_num": 1,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "albert",
  "net_structure_type": 0,
  "num_attention_heads": 12,
  "num_hidden_groups": 1,
  "num_hidden_layers": 12,
  "num_memory_blocks": 0,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.21.0",
  "type_vocab_size": 2,
  "vocab_size": 30000
}

# 토크나이징, 데이터 구축

**데이터 준비**

In [8]:
from tqdm import tqdm

In [9]:
# train data가 약 67000개로 너무 많아 20000개로 줄임
train_sentence = [datasets['train']['sentence'][idx] for idx in tqdm(range(0, 20000))]
train_label = [datasets['train']['label'][idx] for idx in tqdm(range(0, 20000))]

100%|██████████| 20000/20000 [17:42<00:00, 18.83it/s]
100%|██████████| 20000/20000 [06:49<00:00, 48.88it/s]


In [10]:
ids = datasets['validation'].num_rows
val_sentence = [datasets['validation']['sentence'][idx] for idx in range(0, ids)]
val_label = [datasets['validation']['label'][idx] for idx in range(0, ids)]

In [11]:
ids = datasets['test'].num_rows
test_sentence = [datasets['test']['sentence'][idx] for idx in range(0, ids)]
test_label = [datasets['test']['label'][idx] for idx in range(0, ids)]

In [12]:
# ALBERT모델에서 정한 입력 크기보다 크면 잘라내기, 패딩 채우기
train_input = tokenizer(train_sentence, truncation=True, padding=True, return_tensors="pt")
val_input = tokenizer(val_sentence, truncation=True, padding=True, return_tensors="pt")
test_input = tokenizer(test_sentence, truncation=True, padding=True, return_tensors="pt")

**데이터셋 변환**

In [13]:
import torch

In [14]:
class SSTDataset(torch.utils.data.Dataset):
    def __init__(self, inputs, labels):
        self.inputs = inputs
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.inputs.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

In [15]:
train_dataset = SSTDataset(train_input, train_label)
val_dataset = SSTDataset(val_input, val_label)
test_dataset = SSTDataset(test_input, test_label)

In [16]:
for n in range(3):
    print("train_dataset[",n,"]")
    print(train_dataset[n])

train_dataset[ 0 ]
{'input_ids': tensor([    2,  3077,    78, 27467,    18,    37,    14, 21207,  1398,     3,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0]), 'token_type_ids': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0]), 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 

  import sys


**데이터로더 정의**

In [17]:
from torch.utils.data import DataLoader

In [18]:
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=16)
val_loader = DataLoader(val_dataset, batch_size=16)
test_loader = DataLoader(test_dataset, batch_size=16)

In [19]:
# 데이터로더 확인
next(iter(train_loader))

  import sys


{'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         ...,
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]]),
 'input_ids': tensor([[    2,    26,  7468,  ...,     0,     0,     0],
         [    2,  3669,    20,  ...,     0,     0,     0],
         [    2, 11764,    20,  ...,     0,     0,     0],
         ...,
         [    2,    30,    48,  ...,     0,     0,     0],
         [    2, 10979,  4029,  ...,     0,     0,     0],
         [    2,  7952,    18,  ...,     0,     0,     0]]),
 'labels': tensor([1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1]),
 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         ...,
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0]])}

# 모델 학습
**모델 파라미터 설정**

In [20]:
import warnings

# 경고메세지 끄기
warnings.filterwarnings(action='ignore')

In [21]:
# Lamb optimizer를 사용하기 위함 (https://pytorch-optimizer.readthedocs.io/en/latest/_modules/torch_optimizer/lamb.html#Lamb)
!pip install torch_optimizer

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [22]:
import torch.nn
import torch.nn.functional as F
import torch_optimizer as optim

In [23]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

num_epochs = 5
learning_rate = 1e-5

optimizer = optim.Lamb(albert_model.parameters(), lr=learning_rate)     # 매우 요약하면 Lamb = LARS + Adam -> 배치가 작아서 효과적일지는 의문점
criterion = torch.nn.CrossEntropyLoss()

step = 0
eval_steps = 625        # 훈련 배치수의 절반

**학습 진행**

In [24]:
from tqdm.auto import tqdm as tqdm_auto
from sklearn.metrics import accuracy_score

In [25]:
albert_model.to(device)
albert_model.train()

AlbertForSequenceClassification(
  (albert): AlbertModel(
    (embeddings): AlbertEmbeddings(
      (word_embeddings): Embedding(30000, 128, padding_idx=0)
      (position_embeddings): Embedding(512, 128)
      (token_type_embeddings): Embedding(2, 128)
      (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0, inplace=False)
    )
    (encoder): AlbertTransformer(
      (embedding_hidden_mapping_in): Linear(in_features=128, out_features=768, bias=True)
      (albert_layer_groups): ModuleList(
        (0): AlbertLayerGroup(
          (albert_layers): ModuleList(
            (0): AlbertLayer(
              (full_layer_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
              (attention): AlbertAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768,

In [26]:
for epoch in range(num_epochs):
    loss = 0
    train_loss = 0.0
    
    for batch_idx, batch in enumerate(tqdm_auto(train_loader, mininterval=0.01, leave=True)):
        optimizer.zero_grad()     # 그래디언트 초기화

        # 배치에서 label을 제외한 입력만 추출하여 GPU로 복사
        inputs = {k: v.to(device) for k, v in batch.items() if k != 'labels'} 
        labels = batch['labels'].to(device)     # 배치에서 라벨을 추출하여 GPU로 복사
        outputs = albert_model(**inputs).logits    # 모형으로 결과 예측

        loss = criterion(outputs, F.one_hot(labels, num_classes=2).float())
        train_loss += loss
        
        loss.backward()
        optimizer.step()

        step += 1
        if step % eval_steps == 0:  # eval_steps 마다 loss를 출력
            with torch.no_grad():   # 학습 X (그래디언트 계산 X)
                val_loss = 0
                albert_model.eval()        # 평가모드로 전환

                for val_batch_idx, val_batch in enumerate(tqdm_auto(val_loader, mininterval=0.01, leave=True)):

                    # 배치에서 label을 제외한 입력만 추출하여 GPU로 복사
                    inputs = {k: v.to(device) for k, v in batch.items() if k != 'labels'} 
                    labels = batch['labels'].to(device)     # 배치에서 라벨을 추출하여 GPU로 복사
                    outputs = albert_model(**inputs).logits     # 모형으로 결과 예측

                    loss = criterion(outputs, F.one_hot(labels, num_classes=2).float())  
                    val_loss += loss

                avg_val_loss = val_loss / len(val_loader)

            val_accuracy = accuracy_score(labels.cpu(), outputs.argmax(dim=1).cpu())
            print('Step %d, validation loss: %.4f, accuracy_score: %.2f' % (step, avg_val_loss, val_accuracy))
            
        avg_train_loss = train_loss / len(train_loader)
    train_accuracy = accuracy_score(labels.cpu(), outputs.argmax(dim=1).cpu())
    print('epoch %d, train loss: %.4f, accuracy_score: %.2f \n' % (epoch, avg_train_loss, train_accuracy))

  0%|          | 0/1250 [00:00<?, ?it/s]

  0%|          | 0/55 [00:00<?, ?it/s]

Step 625, validation loss: 0.6996, accuracy_score: 0.44


  0%|          | 0/55 [00:00<?, ?it/s]

Step 1250, validation loss: 0.4648, accuracy_score: 0.88
epoch 0, train loss: 0.6347, accuracy_score: 0.88 



  0%|          | 0/1250 [00:00<?, ?it/s]

  0%|          | 0/55 [00:00<?, ?it/s]

Step 1875, validation loss: 0.4430, accuracy_score: 0.81


  0%|          | 0/55 [00:00<?, ?it/s]

Step 2500, validation loss: 0.1919, accuracy_score: 0.94
epoch 1, train loss: 0.3779, accuracy_score: 0.94 



  0%|          | 0/1250 [00:00<?, ?it/s]

  0%|          | 0/55 [00:00<?, ?it/s]

Step 3125, validation loss: 0.4048, accuracy_score: 0.81


  0%|          | 0/55 [00:00<?, ?it/s]

Step 3750, validation loss: 0.2732, accuracy_score: 0.88
epoch 2, train loss: 0.2972, accuracy_score: 0.88 



  0%|          | 0/1250 [00:00<?, ?it/s]

  0%|          | 0/55 [00:00<?, ?it/s]

Step 4375, validation loss: 0.2038, accuracy_score: 0.94


  0%|          | 0/55 [00:00<?, ?it/s]

Step 5000, validation loss: 0.1805, accuracy_score: 0.94
epoch 3, train loss: 0.2582, accuracy_score: 0.94 



  0%|          | 0/1250 [00:00<?, ?it/s]

  0%|          | 0/55 [00:00<?, ?it/s]

Step 5625, validation loss: 0.1020, accuracy_score: 1.00


  0%|          | 0/55 [00:00<?, ?it/s]

Step 6250, validation loss: 0.2046, accuracy_score: 0.94
epoch 4, train loss: 0.2315, accuracy_score: 0.94 

