In [17]:
%config IPCompleter.use_jedi=False

# Reference 

[여기](https://colab.research.google.com/drive/1JZ-pXlmgRIYHm8yPLYY68Q28l9OYAL6H?usp=sharing#scrollTo=WZKIQNjZwdn1)에서 가져온 내용

In [10]:
!pip install -q transformers
!rm ratings_test.txt
!rm ratings_train.txt
!wget -q https://raw.githubusercontent.com/e9t/nsmc/master/ratings_test.txt
!wget -q https://raw.githubusercontent.com/e9t/nsmc/master/ratings_train.txt

In [3]:
!head ratings_train.txt

id	document	label
9976970	아 더빙.. 진짜 짜증나네요 목소리	0
3819312	흠...포스터보고 초딩영화줄....오버연기조차 가볍지 않구나	1
10265843	너무재밓었다그래서보는것을추천한다	0
9045019	교도소 이야기구먼 ..솔직히 재미는 없다..평점 조정	0
6483659	사이몬페그의 익살스런 연기가 돋보였던 영화!스파이더맨에서 늙어보이기만 했던 커스틴 던스트가 너무나도 이뻐보였다	1
5403919	막 걸음마 뗀 3세부터 초등학교 1학년생인 8살용영화.ㅋㅋㅋ...별반개도 아까움.	0
7797314	원작의 긴장감을 제대로 살려내지못했다.	0
9443947	별 반개도 아깝다 욕나온다 이응경 길용우 연기생활이몇년인지..정말 발로해도 그것보단 낫겟다 납치.감금만반복반복..이드라마는 가족도없다 연기못하는사람만모엿네	0
7156791	액션이 없는데도 재미 있는 몇안되는 영화	1


In [1]:
import pandas as pd
import torch
from torch.nn import functional as F
from torch.utils.data import DataLoader, Dataset
from tqdm.notebook import tqdm
from transformers import AdamW, AutoTokenizer, ElectraForSequenceClassification

device = torch.device("cpu")

# Data

In [2]:
class NSMCDataset(Dataset):
    def __init__(self, csv_file):
        self.dataset = pd.read_csv(csv_file, sep="\t").dropna(axis=0)
        # 중복제거
        self.dataset.drop_duplicates(subset=["document"], inplace=True)
        self.tokenizer = AutoTokenizer.from_pretrained(
            "monologg/koelectra-small-v2-discriminator"
        )

        print(self.dataset.describe())

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        row = self.dataset.iloc[idx, 1:3].values
        text = row[0]
        y = row[1]

        inputs = self.tokenizer(
            text,
            return_tensors="pt",
            truncation=True,
            max_length=256,
            pad_to_max_length=True,
            add_special_tokens=True,
        )

        input_ids = inputs["input_ids"][0]
        attention_mask = inputs["attention_mask"][0]

        return input_ids, attention_mask, y


train_dataset = NSMCDataset("ratings_train.txt")
test_dataset = NSMCDataset("ratings_test.txt")

                 id          label
count  1.461820e+05  146182.000000
mean   6.779186e+06       0.498283
std    2.919223e+06       0.499999
min    3.300000e+01       0.000000
25%    4.814832e+06       0.000000
50%    7.581160e+06       0.000000
75%    9.274760e+06       1.000000
max    1.027815e+07       1.000000
                 id         label
count  4.915700e+04  49157.000000
mean   6.752945e+06      0.502695
std    2.937158e+06      0.499998
min    6.010000e+02      0.000000
25%    4.777143e+06      0.000000
50%    7.565415e+06      1.000000
75%    9.260204e+06      1.000000
max    1.027809e+07      1.000000


In [3]:
tokenizer = test_dataset.tokenizer
encoded_tokens = tokenizer.encode("내게 능력 주시는 자 안에서 내가 모든 것을 할 수 있느니라")
decoded_tokens = tokenizer.decode(encoded_tokens)

print("Vocab Size    :", len(tokenizer.vocab))
print("Encoded Tokens: ", encoded_tokens)
print("Decoded Tokens: ", decoded_tokens)

Vocab Size    : 32200
Encoded Tokens:  [2, 6745, 1065, 9710, 29950, 52, 124, 29951, 29962, 87, 29956, 650, 45, 29952, 70, 37, 24, 30341, 30041, 29991, 3]
Decoded Tokens:  [CLS] 내게 능력 주시는 자 안에서 내가 모든 것을 할 수 있느니라 [SEP]


# Model

In [6]:
model_name = "monologg/koelectra-base-v3-discriminator"
model = ElectraForSequenceClassification.from_pretrained(model_name).to(device)

Some weights of the model checkpoint at monologg/koelectra-base-v3-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense.weight']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at monologg/koelectra-base-v3-discriminator and are newly initialized: 

In [17]:
text, attention_mask, y = train_dataset[0]
output = model(
    text.unsqueeze(0).to(device),
    attention_mask=attention_mask.unsqueeze(0).to(device),
)
output["logits"]

tensor([[-0.0917,  0.1638]], grad_fn=<AddmmBackward0>)

# Training

In [18]:
epochs = 5
batch_size = 16

In [19]:
optimizer = AdamW(model.parameters(), lr=5e-6)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=True)

In [None]:
losses = []
accuracies = []

for i in range(epochs):
    total_loss = 0.0
    correct = 0
    total = 0
    batches = 0

    model.train()

    for input_ids_batch, attention_masks_batch, y_batch in tqdm(train_loader):
        optimizer.zero_grad()
        y_batch = y_batch.to(device)
        y_pred = model(
            input_ids_batch.to(device),
            attention_mask=attention_masks_batch.to(device),
        )[0]
        loss = F.cross_entropy(y_pred, y_batch)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

        _, predicted = torch.max(y_pred, 1)
        correct += (predicted == y_batch).sum()
        total += len(y_batch)

        batches += 1
        if batches % 100 == 0:
            print(
                "Batch Loss:", total_loss, "Accuracy:", correct.float() / total
            )

    losses.append(total_loss)
    accuracies.append(correct.float() / total)
    print("Train Loss:", total_loss, "Accuracy:", correct.float() / total)

  0%|          | 0/9137 [00:00<?, ?it/s]

# Evaluation

In [None]:
model.eval()

test_correct = 0
test_total = 0

for input_ids_batch, attention_masks_batch, y_batch in tqdm(test_loader):
  y_batch = y_batch.to(device)
  y_pred = model(input_ids_batch.to(device), attention_mask=attention_masks_batch.to(device))[0]
  _, predicted = torch.max(y_pred, 1)
  test_correct += (predicted == y_batch).sum()
  test_total += len(y_batch)

print("Accuracy:", test_correct.float() / test_total)