In [17]:
import torch
import torch.nn as nn
import torch.optim as optim

import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel

import pandas as pd

from transformers import ElectraForSequenceClassification, ElectraTokenizer, ElectraConfig

In [16]:
model = ElectraForSequenceClassification.from_pretrained("monologg/koelectra-small-discriminator")
tokenizer = ElectraTokenizer.from_pretrained("monologg/koelectra-small-discriminator")

Some weights of the model checkpoint at monologg/koelectra-small-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at monologg/koelectra-small-discriminator and are newly initialized: ['cl

In [18]:
config = ElectraConfig.from_pretrained("monologg/koelectra-small-discriminator")

In [19]:
config

ElectraConfig {
  "architectures": [
    "ElectraForPreTraining"
  ],
  "attention_probs_dropout_prob": 0.1,
  "embedding_size": 128,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 256,
  "initializer_range": 0.02,
  "intermediate_size": 1024,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "electra",
  "num_attention_heads": 4,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "summary_activation": "gelu",
  "summary_last_dropout": 0.1,
  "summary_type": "first",
  "summary_use_proj": true,
  "transformers_version": "4.5.1",
  "type_vocab_size": 2,
  "vocab_size": 32200
}

In [4]:
nsmc_train = pd.read_csv('../../data/nsmc/ratings_train.txt', sep='\t', encoding='utf-8')
nsmc_test = pd.read_csv('../../data/nsmc/ratings_train.txt', sep='\t', encoding='utf-8')

In [60]:
nsmc_test['document'] = nsmc_test['document'].apply(str)

In [20]:
nsmc_train.head()

Unnamed: 0,id,document,label
0,9976970,아 더빙.. 진짜 짜증나네요 목소리,0
1,3819312,흠...포스터보고 초딩영화줄....오버연기조차 가볍지 않구나,1
2,10265843,너무재밓었다그래서보는것을추천한다,0
3,9045019,교도소 이야기구먼 ..솔직히 재미는 없다..평점 조정,0
4,6483659,사이몬페그의 익살스런 연기가 돋보였던 영화!스파이더맨에서 늙어보이기만 했던 커스틴 ...,1


In [5]:
train_encodings = tokenizer(list(map(str, nsmc_train['document'])), truncation=True, padding=True)
test_encodings = tokenizer(list(map(str, nsmc_test['document'])), truncation=True, padding=True)

In [6]:
test_encodings.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])

In [7]:
class NSMCDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
    
    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item
    
    def __len__(self):
        return len(self.labels)
    
train_dataset = NSMCDataset(train_encodings, nsmc_train['label'])
test_dataset = NSMCDataset(test_encodings, nsmc_test['label'])

In [8]:
test_dataset[0]

{'input_ids': tensor([    2,  3360, 28709,    18,    18, 12704, 29334,  5853, 11852, 21747,
             3,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,   

In [12]:
test_dataset[0].items()

dict_items([('input_ids', tensor([    2,  3360, 28709,    18,    18, 12704, 29334,  5853, 11852, 21747,
            3,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,   

In [25]:
outputs = model(**test_dataset[0])

ValueError: Wrong shape for input_ids (shape torch.Size([142])) or attention_mask (shape torch.Size([142]))

In [27]:
from torch.utils.data import DataLoader

In [49]:
loader = DataLoader(test_dataset, batch_size=36, shuffle=True)

In [50]:
x = next(iter(loader))

In [51]:
outputs = model(**x)

In [52]:
outputs.loss

tensor(0.6955, grad_fn=<NllLossBackward>)

In [58]:
a = 0.02345
f'{a:.3f}'

'0.023'

In [53]:
outputs.logits

tensor([[-0.0405, -0.0233],
        [-0.0479, -0.0150],
        [-0.0365, -0.0188],
        [-0.0572, -0.0190],
        [-0.0480, -0.0076],
        [-0.0407, -0.0236],
        [-0.0358, -0.0225],
        [-0.0429, -0.0068],
        [-0.0310, -0.0323],
        [-0.0379, -0.0382],
        [-0.0408, -0.0279],
        [-0.0365, -0.0196],
        [-0.0359, -0.0270],
        [-0.0499, -0.0172],
        [-0.0519, -0.0108],
        [-0.0477, -0.0145],
        [-0.0434, -0.0161],
        [-0.0392, -0.0292],
        [-0.0319, -0.0043],
        [-0.0342, -0.0355],
        [-0.0521, -0.0126],
        [-0.0401, -0.0231],
        [-0.0448, -0.0174],
        [-0.0344, -0.0073],
        [-0.0417, -0.0050],
        [-0.0343, -0.0275],
        [-0.0508, -0.0136],
        [-0.0414, -0.0230],
        [-0.0449, -0.0326],
        [-0.0340, -0.0262],
        [-0.0322, -0.0138],
        [-0.0429, -0.0157],
        [-0.0285, -0.0198],
        [-0.0491, -0.0135],
        [-0.0445, -0.0243],
        [-0.0398, -0

In [54]:
outputs.logits.argmax(axis=1)

tensor([1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [57]:
x['labels'].size(0)

36

In [43]:
x['labels']

tensor([1, 1, 0, 1, 1])

In [55]:
x['labels'].eq(outputs.logits.argmax(axis=1)).sum().item()

16

In [36]:
from transformers import DistilBertForSequenceClassification, Trainer, TrainingArguments

In [37]:
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,
)

<ParallelMode.NOT_DISTRIBUTED: 'not_distributed'>