In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.20.1-py3-none-any.whl (4.4 MB)
[K     |████████████████████████████████| 4.4 MB 16.2 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.8.1-py3-none-any.whl (101 kB)
[K     |████████████████████████████████| 101 kB 12.6 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 61.2 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 59.6 MB/s 
Installing collected packages: pyyaml, tokenizers, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uninstalling 

In [None]:
import pandas as pd
import torch
from torch.nn import functional as F
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer, ElectraForSequenceClassification, AdamW
from tqdm.notebook import tqdm

In [None]:
# Use the GPU
# NOTE(review): CUDA is hard-coded with no CPU fallback; the later `.to(device)` calls
# presumably fail on a machine without a GPU — confirm before running elsewhere.
device = torch.device("cuda")

In [None]:
# Mount Google Drive at /content/drive; the dataset paths and the checkpoint path used
# in this notebook all live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Locations of the train / test TSV files on the mounted Drive
train_dataset, test_dataset = (
    '/content/drive/MyDrive/CP1/train_data_list.tsv',
    '/content/drive/MyDrive/CP1/test_data_list.tsv',
)

In [None]:
# Parse the tab-separated files into DataFrames. reset_index() turns the row index
# into a leading "index" column. The names are rebound from path strings to frames.
train_dataset = pd.read_csv(train_dataset, sep='\t').reset_index()
test_dataset = pd.read_csv(test_dataset, sep='\t').reset_index()

In [None]:
# Display the training frame (the rendered output is truncated for long frames)
train_dataset

Unnamed: 0,index,Sentence,Emotion
0,0,아 더빙.. 진짜 짜증나네요 목소리,0
1,1,흠...포스터보고 초딩영화줄....오버연기조차 가볍지 않구나,1
2,2,너무재밓었다그래서보는것을추천한다,0
3,3,교도소 이야기구먼 ..솔직히 재미는 없다..평점 조정,0
4,4,사이몬페그의 익살스런 연기가 돋보였던 영화!스파이더맨에서 늙어보이기만 했던 커스틴 ...,1
...,...,...,...
195597,195597,아무거나 다 추천해줘. 다 할 수 있을 것 같아!,1
195598,195598,너무많이 미안하고..,0
195599,195599,아니야. 다 내 잘못같아. 상사가 하는 말중에 틀린 말이 하나도 없더라.,0
195600,195600,몰라 그냥 걷고 있어.,0


In [None]:
# Dataset wrapper for the TSV-derived DataFrames
class NSMCDataset(Dataset):
  """Torch ``Dataset`` that tokenizes one text/label row of a DataFrame per item.

  Rows containing NaN, and rows whose ``Sentence`` repeats an earlier row, are
  removed at construction time. Each item is ``(input_ids, attention_mask, label)``
  where both tensors are padded or truncated to ``max_length`` tokens.
  """

  def __init__(self, csv_file,
               tokenizer_name="monologg/koelectra-small-v2-discriminator",
               max_length=128):
    """Clean the frame and load the tokenizer.

    Args:
      csv_file: DataFrame with a ``Sentence`` column. ``__getitem__`` reads the
        columns at positions 1 and 2 as the text and the label.
      tokenizer_name: checkpoint name passed to ``AutoTokenizer.from_pretrained``.
      max_length: number of tokens every encoded example is padded/truncated to.
    """
    # Some values are NaN: drop those rows, then drop repeated sentences.
    # Both calls return new frames, so the caller's DataFrame is never mutated
    # (this replaces the earlier copy() + inplace=True combination).
    self.dataset = csv_file.dropna(axis=0).drop_duplicates(subset=['Sentence'])
    self.max_length = max_length
    # NOTE(review): the default is the "-v2-" tokenizer, while the classifier in
    # this notebook is loaded from the "-v3-" checkpoint — confirm that the two
    # share a vocabulary before relying on the default.
    self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

  def __len__(self):
    """Return the number of rows left after cleaning."""
    return len(self.dataset)

  def __getitem__(self, idx):
    """Return ``(input_ids, attention_mask, label)`` for the row at position ``idx``."""
    # Positional slice: column 1 is read as the text, column 2 as the label.
    text, label = self.dataset.iloc[idx, 1:3].values

    encoded = self.tokenizer(
        text,
        return_tensors='pt',
        truncation=True,
        max_length=self.max_length,
        padding='max_length',  # replaces the deprecated pad_to_max_length=True
        add_special_tokens=True
        )

    # return_tensors='pt' yields a leading batch dimension of size 1; index it
    # away so that the DataLoader can add the real batch dimension.
    input_ids = encoded['input_ids'][0]
    attention_mask = encoded['attention_mask'][0]

    # Coerce the label to a plain int so batches collate to integer class ids
    # even when the label column was parsed with a non-integer dtype.
    return input_ids, attention_mask, int(label)

In [None]:
# Wrap the DataFrames in NSMCDataset. The names are rebound in place, so the wrap is
# guarded to keep this cell safe to re-run: a second pass would otherwise hand an
# NSMCDataset (not a DataFrame) to NSMCDataset.__init__.
if not isinstance(train_dataset, NSMCDataset):
  train_dataset = NSMCDataset(train_dataset)
if not isinstance(test_dataset, NSMCDataset):
  test_dataset = NSMCDataset(test_dataset)

NameError: ignored

In [None]:
# Number of test examples left after NaN / duplicate removal
len(test_dataset)

64344

# Create Model

In [None]:
# Load the "koelectra-small-v3-discriminator" checkpoint into a sequence-classification
# model and move it to `device`.
# NOTE(review): the tokenizers in this notebook are loaded from the "-v2-" checkpoint
# while this model is "-v3-" — confirm the two share a vocabulary.
model = ElectraForSequenceClassification.from_pretrained("monologg/koelectra-small-v3-discriminator").to(device)

# Smoke test: run a single example through the model
# text, attention_mask, y = train_dataset[0]
# model(text.unsqueeze(0).to(device), attention_mask=attention_mask.unsqueeze(0).to(device))

Downloading:   0%|          | 0.00/458 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/54.0M [00:00<?, ?B/s]

Some weights of the model checkpoint at monologg/koelectra-small-v3-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at monologg/koelectra-small-v3-discriminator and are newly initialized

In [None]:
# Inspect the model's layers
model

ElectraForSequenceClassification(
  (electra): ElectraModel(
    (embeddings): ElectraEmbeddings(
      (word_embeddings): Embedding(35000, 128, padding_idx=0)
      (position_embeddings): Embedding(512, 128)
      (token_type_embeddings): Embedding(2, 128)
      (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (embeddings_project): Linear(in_features=128, out_features=256, bias=True)
    (encoder): ElectraEncoder(
      (layer): ModuleList(
        (0): ElectraLayer(
          (attention): ElectraAttention(
            (self): ElectraSelfAttention(
              (query): Linear(in_features=256, out_features=256, bias=True)
              (key): Linear(in_features=256, out_features=256, bias=True)
              (value): Linear(in_features=256, out_features=256, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): ElectraSelfOutput(
              (dense): Linear(in_

# Training

In [None]:
# Training hyperparameters: passes over the training loader, and DataLoader batch size
epochs, batch_size = 15, 16

In [None]:
# Optimizer over all model parameters.
# NOTE(review): this `AdamW` is the one imported from `transformers`; that library
# steers users toward `torch.optim.AdamW` — its defaults differ, so confirm the
# hyperparameters before swapping.
optimizer = AdamW(model.parameters(), lr=5e-6)
# Wrap the datasets in torch DataLoaders that yield batched tensors
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
# Evaluation reuses the shared batch_size instead of a second hard-coded 16, and
# keeps a fixed order: shuffling does not change accuracy and only makes runs
# harder to compare.
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)



In [None]:
# Per-epoch history: summed batch loss and training accuracy (plain Python floats)
losses = []
accuracies = []

for epoch in range(epochs):
  total_loss = 0.0  # sum of batch losses in this epoch
  correct = 0       # correctly classified examples so far in this epoch
  total = 0         # examples seen so far in this epoch
  batches = 0

  model.train()

  for input_ids_batch, attention_masks_batch, y_batch in tqdm(train_loader):
    optimizer.zero_grad()  # clear gradients left over from the previous step
    y_batch = y_batch.to(device)
    # Element 0 of the model output holds the logits, used as cross_entropy input
    y_pred = model(input_ids_batch.to(device), attention_mask=attention_masks_batch.to(device))[0]
    loss = F.cross_entropy(y_pred, y_batch)
    loss.backward()  # backpropagate through autograd
    optimizer.step()

    total_loss += loss.item()  # .item() extracts the scalar value of the loss

    _, predicted = torch.max(y_pred, 1)  # index of the largest logit = predicted class
    # .item() turns the count into a Python int, so the running totals and the
    # stored history are numbers instead of CUDA tensors.
    correct += (predicted == y_batch).sum().item()
    total += len(y_batch)

    batches += 1
    if batches % 100 == 0:
      print("Batch Loss:", total_loss, "Accuracy:", correct / total)

  # Guard against an empty loader, where total stays 0
  epoch_accuracy = correct / total if total else 0.0
  losses.append(total_loss)
  accuracies.append(epoch_accuracy)
  print("Train Loss:", total_loss, "Accuracy:", epoch_accuracy)

  0%|          | 0/11981 [00:00<?, ?it/s]



Batch Loss: 68.95703834295273 Accuracy: tensor(0.5619, device='cuda:0')
Batch Loss: 137.1896352171898 Accuracy: tensor(0.5716, device='cuda:0')
Batch Loss: 204.6898169517517 Accuracy: tensor(0.5817, device='cuda:0')
Batch Loss: 272.4486157298088 Accuracy: tensor(0.5828, device='cuda:0')
Batch Loss: 339.93774062395096 Accuracy: tensor(0.5849, device='cuda:0')
Batch Loss: 408.047423183918 Accuracy: tensor(0.5826, device='cuda:0')
Batch Loss: 474.9283535480499 Accuracy: tensor(0.5856, device='cuda:0')
Batch Loss: 542.2018741965294 Accuracy: tensor(0.5854, device='cuda:0')
Batch Loss: 609.1678568720818 Accuracy: tensor(0.5854, device='cuda:0')
Batch Loss: 675.5555769205093 Accuracy: tensor(0.5869, device='cuda:0')
Batch Loss: 741.3608055710793 Accuracy: tensor(0.5902, device='cuda:0')
Batch Loss: 805.9149748086929 Accuracy: tensor(0.5948, device='cuda:0')
Batch Loss: 869.936675786972 Accuracy: tensor(0.5998, device='cuda:0')
Batch Loss: 933.1217163801193 Accuracy: tensor(0.6037, device='cu

  0%|          | 0/11981 [00:00<?, ?it/s]

Batch Loss: 48.50769418478012 Accuracy: tensor(0.7706, device='cuda:0')
Batch Loss: 98.1659245789051 Accuracy: tensor(0.7706, device='cuda:0')
Batch Loss: 145.0629670470953 Accuracy: tensor(0.7727, device='cuda:0')
Batch Loss: 192.38674622774124 Accuracy: tensor(0.7761, device='cuda:0')
Batch Loss: 238.24148063361645 Accuracy: tensor(0.7789, device='cuda:0')
Batch Loss: 282.0267390459776 Accuracy: tensor(0.7808, device='cuda:0')
Batch Loss: 329.8378836661577 Accuracy: tensor(0.7799, device='cuda:0')
Batch Loss: 378.4112515002489 Accuracy: tensor(0.7778, device='cuda:0')
Batch Loss: 426.0213608443737 Accuracy: tensor(0.7769, device='cuda:0')
Batch Loss: 474.24271509051323 Accuracy: tensor(0.7761, device='cuda:0')
Batch Loss: 520.8151273429394 Accuracy: tensor(0.7758, device='cuda:0')
Batch Loss: 566.7127343267202 Accuracy: tensor(0.7758, device='cuda:0')
Batch Loss: 614.8849482089281 Accuracy: tensor(0.7746, device='cuda:0')
Batch Loss: 658.6133403778076 Accuracy: tensor(0.7763, device=

  0%|          | 0/11981 [00:00<?, ?it/s]

Batch Loss: 41.85089512169361 Accuracy: tensor(0.8006, device='cuda:0')
Batch Loss: 82.20648010075092 Accuracy: tensor(0.8053, device='cuda:0')
Batch Loss: 121.89703585207462 Accuracy: tensor(0.8092, device='cuda:0')
Batch Loss: 161.2501922994852 Accuracy: tensor(0.8116, device='cuda:0')
Batch Loss: 202.9489460736513 Accuracy: tensor(0.8128, device='cuda:0')
Batch Loss: 245.35903829336166 Accuracy: tensor(0.8118, device='cuda:0')
Batch Loss: 288.5336513072252 Accuracy: tensor(0.8103, device='cuda:0')
Batch Loss: 329.5362588018179 Accuracy: tensor(0.8109, device='cuda:0')
Batch Loss: 371.96434949338436 Accuracy: tensor(0.8107, device='cuda:0')
Batch Loss: 413.6216291040182 Accuracy: tensor(0.8106, device='cuda:0')
Batch Loss: 455.69541051983833 Accuracy: tensor(0.8105, device='cuda:0')
Batch Loss: 497.83470763266087 Accuracy: tensor(0.8110, device='cuda:0')
Batch Loss: 539.4610134363174 Accuracy: tensor(0.8108, device='cuda:0')
Batch Loss: 582.7624102681875 Accuracy: tensor(0.8097, devi

  0%|          | 0/11981 [00:00<?, ?it/s]

Batch Loss: 36.69362586736679 Accuracy: tensor(0.8344, device='cuda:0')
Batch Loss: 75.73321920633316 Accuracy: tensor(0.8269, device='cuda:0')
Batch Loss: 116.24059653282166 Accuracy: tensor(0.8217, device='cuda:0')
Batch Loss: 152.84008565545082 Accuracy: tensor(0.8237, device='cuda:0')
Batch Loss: 192.92316372692585 Accuracy: tensor(0.8211, device='cuda:0')
Batch Loss: 229.10031206905842 Accuracy: tensor(0.8241, device='cuda:0')
Batch Loss: 268.4720537662506 Accuracy: tensor(0.8244, device='cuda:0')
Batch Loss: 307.4570430368185 Accuracy: tensor(0.8243, device='cuda:0')
Batch Loss: 345.1049274355173 Accuracy: tensor(0.8242, device='cuda:0')
Batch Loss: 383.3234711140394 Accuracy: tensor(0.8241, device='cuda:0')
Batch Loss: 423.05463495850563 Accuracy: tensor(0.8242, device='cuda:0')
Batch Loss: 459.7952401638031 Accuracy: tensor(0.8256, device='cuda:0')
Batch Loss: 498.2373085319996 Accuracy: tensor(0.8261, device='cuda:0')
Batch Loss: 538.5189434736967 Accuracy: tensor(0.8247, devi

  0%|          | 0/11981 [00:00<?, ?it/s]

Batch Loss: 36.15456825494766 Accuracy: tensor(0.8312, device='cuda:0')
Batch Loss: 75.99454985558987 Accuracy: tensor(0.8244, device='cuda:0')
Batch Loss: 111.87363959848881 Accuracy: tensor(0.8319, device='cuda:0')
Batch Loss: 150.6799670010805 Accuracy: tensor(0.8291, device='cuda:0')
Batch Loss: 187.0435977205634 Accuracy: tensor(0.8301, device='cuda:0')
Batch Loss: 223.0529159605503 Accuracy: tensor(0.8307, device='cuda:0')
Batch Loss: 259.0298815444112 Accuracy: tensor(0.8317, device='cuda:0')
Batch Loss: 297.27086248993874 Accuracy: tensor(0.8309, device='cuda:0')
Batch Loss: 331.37715271115303 Accuracy: tensor(0.8327, device='cuda:0')
Batch Loss: 367.32118962705135 Accuracy: tensor(0.8336, device='cuda:0')
Batch Loss: 402.80684350430965 Accuracy: tensor(0.8345, device='cuda:0')
Batch Loss: 437.3936033695936 Accuracy: tensor(0.8347, device='cuda:0')
Batch Loss: 472.39010056853294 Accuracy: tensor(0.8354, device='cuda:0')
Batch Loss: 508.32037100195885 Accuracy: tensor(0.8357, de

  0%|          | 0/11981 [00:00<?, ?it/s]

Batch Loss: 32.4802975282073 Accuracy: tensor(0.8625, device='cuda:0')
Batch Loss: 65.7001011222601 Accuracy: tensor(0.8581, device='cuda:0')
Batch Loss: 98.83156753331423 Accuracy: tensor(0.8569, device='cuda:0')
Batch Loss: 133.26945577561855 Accuracy: tensor(0.8559, device='cuda:0')
Batch Loss: 167.9684073626995 Accuracy: tensor(0.8518, device='cuda:0')
Batch Loss: 201.23635456711054 Accuracy: tensor(0.8522, device='cuda:0')
Batch Loss: 235.21391785889864 Accuracy: tensor(0.8507, device='cuda:0')
Batch Loss: 268.5214643329382 Accuracy: tensor(0.8512, device='cuda:0')
Batch Loss: 302.822709903121 Accuracy: tensor(0.8510, device='cuda:0')
Batch Loss: 337.6028174087405 Accuracy: tensor(0.8500, device='cuda:0')
Batch Loss: 371.2146325856447 Accuracy: tensor(0.8502, device='cuda:0')
Batch Loss: 407.7477786689997 Accuracy: tensor(0.8493, device='cuda:0')
Batch Loss: 443.3376810029149 Accuracy: tensor(0.8490, device='cuda:0')
Batch Loss: 476.5643731728196 Accuracy: tensor(0.8492, device='c

  0%|          | 0/11981 [00:00<?, ?it/s]

Batch Loss: 34.80073190480471 Accuracy: tensor(0.8525, device='cuda:0')
Batch Loss: 66.01036512106657 Accuracy: tensor(0.8559, device='cuda:0')
Batch Loss: 97.72830344736576 Accuracy: tensor(0.8573, device='cuda:0')
Batch Loss: 133.2782973498106 Accuracy: tensor(0.8537, device='cuda:0')
Batch Loss: 165.08310817927122 Accuracy: tensor(0.8563, device='cuda:0')
Batch Loss: 196.07149974256754 Accuracy: tensor(0.8580, device='cuda:0')
Batch Loss: 229.05635275691748 Accuracy: tensor(0.8572, device='cuda:0')
Batch Loss: 260.6540663167834 Accuracy: tensor(0.8590, device='cuda:0')
Batch Loss: 292.94709122925997 Accuracy: tensor(0.8586, device='cuda:0')
Batch Loss: 325.5077363178134 Accuracy: tensor(0.8582, device='cuda:0')
Batch Loss: 356.7293255254626 Accuracy: tensor(0.8586, device='cuda:0')
Batch Loss: 388.866964392364 Accuracy: tensor(0.8584, device='cuda:0')
Batch Loss: 423.3860305696726 Accuracy: tensor(0.8576, device='cuda:0')
Batch Loss: 455.59396919608116 Accuracy: tensor(0.8576, devic

  0%|          | 0/11981 [00:00<?, ?it/s]

Batch Loss: 28.60373479500413 Accuracy: tensor(0.8800, device='cuda:0')
Batch Loss: 61.25923341140151 Accuracy: tensor(0.8719, device='cuda:0')
Batch Loss: 91.63567990809679 Accuracy: tensor(0.8719, device='cuda:0')
Batch Loss: 120.77889355272055 Accuracy: tensor(0.8747, device='cuda:0')
Batch Loss: 150.83771254867315 Accuracy: tensor(0.8753, device='cuda:0')
Batch Loss: 180.3117288351059 Accuracy: tensor(0.8746, device='cuda:0')
Batch Loss: 211.86771497875452 Accuracy: tensor(0.8734, device='cuda:0')
Batch Loss: 242.5811455771327 Accuracy: tensor(0.8730, device='cuda:0')
Batch Loss: 275.4311555027962 Accuracy: tensor(0.8715, device='cuda:0')
Batch Loss: 307.1496892645955 Accuracy: tensor(0.8706, device='cuda:0')
Batch Loss: 340.6969313323498 Accuracy: tensor(0.8688, device='cuda:0')
Batch Loss: 369.3925525471568 Accuracy: tensor(0.8699, device='cuda:0')
Batch Loss: 401.66738753020763 Accuracy: tensor(0.8695, device='cuda:0')
Batch Loss: 432.25129233300686 Accuracy: tensor(0.8694, devi

  0%|          | 0/11981 [00:00<?, ?it/s]

Batch Loss: 34.16305724531412 Accuracy: tensor(0.8606, device='cuda:0')
Batch Loss: 64.57219719141722 Accuracy: tensor(0.8631, device='cuda:0')
Batch Loss: 91.71023278310895 Accuracy: tensor(0.8685, device='cuda:0')
Batch Loss: 121.18768088892102 Accuracy: tensor(0.8697, device='cuda:0')
Batch Loss: 151.1070873774588 Accuracy: tensor(0.8704, device='cuda:0')
Batch Loss: 182.6547939889133 Accuracy: tensor(0.8702, device='cuda:0')
Batch Loss: 213.61575961485505 Accuracy: tensor(0.8674, device='cuda:0')
Batch Loss: 243.5359238795936 Accuracy: tensor(0.8674, device='cuda:0')
Batch Loss: 273.1171214170754 Accuracy: tensor(0.8682, device='cuda:0')
Batch Loss: 303.8252807818353 Accuracy: tensor(0.8678, device='cuda:0')
Batch Loss: 335.2729844413698 Accuracy: tensor(0.8672, device='cuda:0')
Batch Loss: 366.96213610097766 Accuracy: tensor(0.8663, device='cuda:0')
Batch Loss: 398.2528527639806 Accuracy: tensor(0.8669, device='cuda:0')
Batch Loss: 428.6262920834124 Accuracy: tensor(0.8671, device

  0%|          | 0/11981 [00:00<?, ?it/s]

Batch Loss: 28.60540644824505 Accuracy: tensor(0.8825, device='cuda:0')
Batch Loss: 53.886341132223606 Accuracy: tensor(0.8872, device='cuda:0')
Batch Loss: 84.8382131755352 Accuracy: tensor(0.8806, device='cuda:0')
Batch Loss: 115.20419025421143 Accuracy: tensor(0.8775, device='cuda:0')
Batch Loss: 144.72750063985586 Accuracy: tensor(0.8770, device='cuda:0')
Batch Loss: 177.35124750435352 Accuracy: tensor(0.8739, device='cuda:0')
Batch Loss: 203.67567519843578 Accuracy: tensor(0.8765, device='cuda:0')
Batch Loss: 234.94676310569048 Accuracy: tensor(0.8742, device='cuda:0')
Batch Loss: 263.349758207798 Accuracy: tensor(0.8756, device='cuda:0')
Batch Loss: 290.71396420896053 Accuracy: tensor(0.8758, device='cuda:0')
Batch Loss: 320.29969868436456 Accuracy: tensor(0.8756, device='cuda:0')
Batch Loss: 347.93063102290034 Accuracy: tensor(0.8766, device='cuda:0')
Batch Loss: 378.2829364053905 Accuracy: tensor(0.8762, device='cuda:0')
Batch Loss: 406.2486221678555 Accuracy: tensor(0.8769, de

  0%|          | 0/11981 [00:00<?, ?it/s]

Batch Loss: 27.093577701598406 Accuracy: tensor(0.8900, device='cuda:0')
Batch Loss: 56.19898984208703 Accuracy: tensor(0.8819, device='cuda:0')
Batch Loss: 83.87115113064647 Accuracy: tensor(0.8802, device='cuda:0')
Batch Loss: 112.02123117819428 Accuracy: tensor(0.8809, device='cuda:0')
Batch Loss: 141.949569568038 Accuracy: tensor(0.8801, device='cuda:0')
Batch Loss: 169.56207285821438 Accuracy: tensor(0.8798, device='cuda:0')
Batch Loss: 199.55282144993544 Accuracy: tensor(0.8791, device='cuda:0')
Batch Loss: 225.37644411437213 Accuracy: tensor(0.8795, device='cuda:0')
Batch Loss: 253.33571389876306 Accuracy: tensor(0.8797, device='cuda:0')
Batch Loss: 281.2591667678207 Accuracy: tensor(0.8799, device='cuda:0')
Batch Loss: 308.97435329668224 Accuracy: tensor(0.8799, device='cuda:0')
Batch Loss: 340.13805043138564 Accuracy: tensor(0.8787, device='cuda:0')
Batch Loss: 369.2106142062694 Accuracy: tensor(0.8788, device='cuda:0')
Batch Loss: 399.6291764769703 Accuracy: tensor(0.8782, de

  0%|          | 0/11981 [00:00<?, ?it/s]

Batch Loss: 25.945560559630394 Accuracy: tensor(0.8881, device='cuda:0')
Batch Loss: 53.83846150711179 Accuracy: tensor(0.8897, device='cuda:0')
Batch Loss: 83.39953595027328 Accuracy: tensor(0.8854, device='cuda:0')
Batch Loss: 110.68544086441398 Accuracy: tensor(0.8866, device='cuda:0')
Batch Loss: 137.60887340083718 Accuracy: tensor(0.8874, device='cuda:0')
Batch Loss: 163.87114299461246 Accuracy: tensor(0.8883, device='cuda:0')
Batch Loss: 191.03611988946795 Accuracy: tensor(0.8892, device='cuda:0')
Batch Loss: 218.49512492120266 Accuracy: tensor(0.8877, device='cuda:0')
Batch Loss: 244.63349048048258 Accuracy: tensor(0.8874, device='cuda:0')
Batch Loss: 271.62876830250025 Accuracy: tensor(0.8875, device='cuda:0')
Batch Loss: 302.04725528135896 Accuracy: tensor(0.8866, device='cuda:0')
Batch Loss: 330.1598618142307 Accuracy: tensor(0.8857, device='cuda:0')
Batch Loss: 358.2238500714302 Accuracy: tensor(0.8859, device='cuda:0')
Batch Loss: 385.95601864904165 Accuracy: tensor(0.8856,

  0%|          | 0/11981 [00:00<?, ?it/s]

Batch Loss: 26.93352174386382 Accuracy: tensor(0.8931, device='cuda:0')
Batch Loss: 50.82635088637471 Accuracy: tensor(0.8959, device='cuda:0')
Batch Loss: 77.23876788839698 Accuracy: tensor(0.8921, device='cuda:0')
Batch Loss: 105.19618634134531 Accuracy: tensor(0.8902, device='cuda:0')
Batch Loss: 131.87160072475672 Accuracy: tensor(0.8894, device='cuda:0')
Batch Loss: 159.0840643197298 Accuracy: tensor(0.8895, device='cuda:0')
Batch Loss: 186.35359875112772 Accuracy: tensor(0.8894, device='cuda:0')
Batch Loss: 211.5952863842249 Accuracy: tensor(0.8899, device='cuda:0')
Batch Loss: 237.52002131938934 Accuracy: tensor(0.8903, device='cuda:0')
Batch Loss: 266.76930855959654 Accuracy: tensor(0.8885, device='cuda:0')
Batch Loss: 293.244351323694 Accuracy: tensor(0.8893, device='cuda:0')
Batch Loss: 320.68173817545176 Accuracy: tensor(0.8891, device='cuda:0')
Batch Loss: 348.47263388335705 Accuracy: tensor(0.8880, device='cuda:0')
Batch Loss: 375.30929328501225 Accuracy: tensor(0.8887, de

  0%|          | 0/11981 [00:00<?, ?it/s]

Batch Loss: 25.386487394571304 Accuracy: tensor(0.8844, device='cuda:0')
Batch Loss: 51.197056259959936 Accuracy: tensor(0.8881, device='cuda:0')
Batch Loss: 75.14919552952051 Accuracy: tensor(0.8919, device='cuda:0')
Batch Loss: 102.8801824375987 Accuracy: tensor(0.8902, device='cuda:0')
Batch Loss: 126.40207026712596 Accuracy: tensor(0.8926, device='cuda:0')
Batch Loss: 152.9582114648074 Accuracy: tensor(0.8920, device='cuda:0')
Batch Loss: 177.31912346370518 Accuracy: tensor(0.8931, device='cuda:0')
Batch Loss: 202.54679125361145 Accuracy: tensor(0.8934, device='cuda:0')
Batch Loss: 230.80554674379528 Accuracy: tensor(0.8921, device='cuda:0')
Batch Loss: 257.2173947189003 Accuracy: tensor(0.8914, device='cuda:0')
Batch Loss: 285.15139586664736 Accuracy: tensor(0.8914, device='cuda:0')
Batch Loss: 311.65830515138805 Accuracy: tensor(0.8908, device='cuda:0')
Batch Loss: 338.7040693555027 Accuracy: tensor(0.8900, device='cuda:0')
Batch Loss: 362.93740016780794 Accuracy: tensor(0.8908, 

  0%|          | 0/11981 [00:00<?, ?it/s]

Batch Loss: 26.807430058717728 Accuracy: tensor(0.8856, device='cuda:0')
Batch Loss: 52.04713016375899 Accuracy: tensor(0.8875, device='cuda:0')
Batch Loss: 78.7411715425551 Accuracy: tensor(0.8898, device='cuda:0')
Batch Loss: 104.52130349352956 Accuracy: tensor(0.8906, device='cuda:0')
Batch Loss: 128.0357349589467 Accuracy: tensor(0.8938, device='cuda:0')
Batch Loss: 155.61065002530813 Accuracy: tensor(0.8916, device='cuda:0')
Batch Loss: 182.97924599796534 Accuracy: tensor(0.8909, device='cuda:0')
Batch Loss: 209.1531876027584 Accuracy: tensor(0.8906, device='cuda:0')
Batch Loss: 232.50292764231563 Accuracy: tensor(0.8920, device='cuda:0')
Batch Loss: 261.0734705142677 Accuracy: tensor(0.8910, device='cuda:0')
Batch Loss: 285.96047216281295 Accuracy: tensor(0.8910, device='cuda:0')
Batch Loss: 311.30157401040196 Accuracy: tensor(0.8918, device='cuda:0')
Batch Loss: 338.6938149854541 Accuracy: tensor(0.8911, device='cuda:0')
Batch Loss: 366.8495841920376 Accuracy: tensor(0.8909, dev

In [None]:
# Per-epoch training history collected by the training loop: summed loss and accuracy
losses, accuracies

([6535.524240061641,
  5334.346693336964,
  4840.90824637562,
  4507.766692586243,
  4265.506487056613,
  4073.4613246694207,
  3908.9484866410494,
  3770.289063723758,
  3641.645820984617,
  3533.475737862289,
  3422.6501905899495,
  3329.503349598497,
  3236.4985523801297,
  3140.7263861633837,
  3048.0450954977423],
 [tensor(0.7206, device='cuda:0'),
  tensor(0.7913, device='cuda:0'),
  tensor(0.8157, device='cuda:0'),
  tensor(0.8310, device='cuda:0'),
  tensor(0.8415, device='cuda:0'),
  tensor(0.8503, device='cuda:0'),
  tensor(0.8588, device='cuda:0'),
  tensor(0.8644, device='cuda:0'),
  tensor(0.8696, device='cuda:0'),
  tensor(0.8738, device='cuda:0'),
  tensor(0.8787, device='cuda:0'),
  tensor(0.8830, device='cuda:0'),
  tensor(0.8858, device='cuda:0'),
  tensor(0.8902, device='cuda:0'),
  tensor(0.8938, device='cuda:0')])

# 테스트 데이터셋 정확도 확인하기

In [None]:
model.eval()  # switch to inference mode (the model contains Dropout layers)

test_correct = 0
test_total = 0

# Scoring needs no gradients; no_grad() skips autograd bookkeeping and saves memory.
with torch.no_grad():
  for input_ids_batch, attention_masks_batch, y_batch in tqdm(test_loader):
    y_batch = y_batch.to(device)
    y_pred = model(input_ids_batch.to(device), attention_mask=attention_masks_batch.to(device))[0]
    _, predicted = torch.max(y_pred, 1)
    # Accumulate as a Python int rather than a CUDA tensor
    test_correct += (predicted == y_batch).sum().item()
    test_total += len(y_batch)

# Guard against an empty loader, where test_total stays 0
test_accuracy = test_correct / test_total if test_total else 0.0
print("Accuracy:", test_accuracy)

  0%|          | 0/4022 [00:00<?, ?it/s]



Accuracy: tensor(0.8673, device='cuda:0')


In [None]:
# Save the model
# Only the state_dict is written, to model.pt inside the CP1 folder on the mounted Drive.
path = '/content/drive/MyDrive/CP1/'
torch.save(model.state_dict(), path + "model.pt")

# 테스팅

In [None]:
# Notebook-level tokenizer read by predict() for ad-hoc inference.
# NOTE(review): this is the "-v2-" checkpoint while the model is loaded from "-v3-" —
# confirm the pairing is intended.
tokenizer = AutoTokenizer.from_pretrained("monologg/koelectra-small-v2-discriminator")

In [None]:
# Load the state_dict stored in model.pt back into the in-memory `model`
path = '/content/drive/MyDrive/CP1/'
model.load_state_dict(torch.load(path + "model.pt"))

<All keys matched successfully>

In [None]:
def predict(text):
    """Print ``text`` together with the sentiment the model assigns to it.

    Relies on the notebook-level ``tokenizer``, ``model`` and ``device``.

    Args:
        text: a single sentence to classify.

    Returns:
        None. Prints the input text, then '감정: 긍정적' when class 1 is predicted
        and '감정: 부정적' otherwise.
    """
    encoded_review = tokenizer(
        text,
        return_tensors='pt',
        truncation=True,
        max_length=128,
        padding='max_length',  # replaces the deprecated pad_to_max_length=True
        add_special_tokens=True
    )
    input_ids = encoded_review['input_ids'].to(device)
    attention_mask = encoded_review['attention_mask'].to(device)

    # Inference only: make sure dropout is off and skip gradient tracking.
    model.eval()
    with torch.no_grad():
        output = model(input_ids, attention_mask=attention_mask)
    _, prediction = torch.max(output[0], dim=1)

    print('입력 텍스트: ' + text)
    # Compare the predicted class id itself. The previous check compared the tensor's
    # repr against "tensor([1], device='cuda:0')", which only matched on that device.
    if prediction.item() == 1:
        print('감정: 긍정적')
    else:
        print('감정: 부정적')

In [None]:
# Try predict() on a single sentence
predict('액션이 없는데도 재미 있는 몇안되는 영화')

입력 텍스트: 액션이 없는데도 재미 있는 몇안되는 영화
감정: 긍정적




In [None]:
# Sample sentences for ad-hoc testing.
# NOTE(review): `li` is not referenced by any other cell visible here — confirm whether it is still needed.
li = ['두사람 사면시키려면 우리나라 도둑놈들 다 사면시켜야됨',
 '유승민은 바보다.\n자신을픽밥한 박통을 사면하자고.\n이게 진심 큰정치다 .',
 '국민에게 고통을 주는 갈라치기정치에서 국민통합이라는 메세지. 오십보백보 같은 전.현정권의 행보에서의 형평성의 논리로 보여지네',
 '집권 여당의 꾀에 넘어가지마라. 어차피 고생한거 1년 더 고생하고 너희들이 정권잡고 사면시켜라. 그리고 현정부는 심판하고!!',
 '국민통합? 국민통합을 위해서라면 큰 죄를 지은 사람을 봐줘야 한다는 건가요? 동의할 수 없습니다.',
 '그래도 솔직하시네',
 '유승민이 항상 실패했던 이유',
 '하.....승민아  널 지지한걸 후회하게만드는 발언은하지마  \n그동안 가던데로  너만의길을  올바르게 가라',
 '절대 안된다.',
 '유승민 넌 정치 하지마라..국민들 마음도 못 읽으면서 무슨 정치를 한다고 나대는 거냐?..',
 '법지키며 사는 국민을 개 돼지 취급하는 사면! 법 안지키고 살면 편한거 모르는 사람없다! 국민통합? 웃기고 자빠졌네! 생계형 사면이라면 모를까 저런 사면은 개나 줘라',
 '승민이형도...대통보다는 시장도전이 현실적일듯...',
 '이새\n\n\n끼. 지가 배신때려서 박근혜대통령 탄핵시작됐었으면서...  이박쥐같은새\n\n\n\n끼는 절대안됨. 한번배반때린건 또 그짓함.',
 '유승민 요 OOO  내통했구나 서로 짜짝쿵이 잘 맞네... 한쪽에선 제안하고 한쪽에서 환영하고..',
 '유승민이랑 김무성이 탄핵시켰자나 저거도 완장놀이 재밌어해서 큰일임',
 '하여간 정치권이란',
 '질문도 좋고 먼저 말 거는 모습도 좋고..나토에서 좋은 모습 보여줘서 너무 좋습니다!',
 '잘하고 있네.',
 '정수기랑 혜경궁 보단 나음',
 '이왕 영부인 된 거 그냥 좋게 봐주자 방구석에 처박혀 아무것도 안하고 있는거 보다 영부인 위치에서 자기가 할수 있는 뭐라도 해보겠다고 하는게 좋아 보임',
 '세계에 내놓으니 우아하고 귀족적이다. 관상보는분이 귀한상이라고 하더니 바로 표나네요. ',
 '잘하고 있다. 어디에 내 놔도 손색없다. 차원이 다르다.']

In [None]:
# Sentences passed one by one to predict() in the timing cell
text = ['관종 김건희 방치하고 대통령실 꼴 보면 자업자득임',
'민좆에 병신 하나 더 들어가서 나쁠것 없지 ㅋㅋㅋㅋㅋ 그래봤자 이재명 꼬붕 하나 더 늘어날 뿐',
'이준석은 언론 어떻게 다루는지 잘 모르는거 같음.',
'윤석열이 대통령실이든 구태들이든 컨트롤 못 한다는게 맞는거 같다',
'이준석 아니었으면 국힘 지지도 나락갔을테고  이번 대선도 나락이었을텐데  꼭 이렇게 이준석을 힘들게 해야 함??   "당 대표 중심으로 협력해라!"  이 말 한마디 하기가 글케 어려움??   진짜 실망스럽다',
'실망스럽다',
'이준석 요즘 너무 잘하는거 같지 않음?',
'이준석 요즘 잘하는거 같음']

In [None]:
%%time
for i in text:
    print(predict(i))

입력 텍스트: 관종 김건희 방치하고 대통령실 꼴 보면 자업자득임
감정: 부정적
None
입력 텍스트: 민좆에 병신 하나 더 들어가서 나쁠것 없지 ㅋㅋㅋㅋㅋ 그래봤자 이재명 꼬붕 하나 더 늘어날 뿐
감정: 부정적
None
입력 텍스트: 이준석은 언론 어떻게 다루는지 잘 모르는거 같음.
감정: 부정적
None
입력 텍스트: 윤석열이 대통령실이든 구태들이든 컨트롤 못 한다는게 맞는거 같다
감정: 부정적
None
입력 텍스트: 이준석 아니었으면 국힘 지지도 나락갔을테고  이번 대선도 나락이었을텐데  꼭 이렇게 이준석을 힘들게 해야 함??   "당 대표 중심으로 협력해라!"  이 말 한마디 하기가 글케 어려움??   진짜 실망스럽다
감정: 부정적
None
입력 텍스트: 실망스럽다
감정: 부정적
None
입력 텍스트: 이준석 요즘 너무 잘하는거 같지 않음?
감정: 부정적
None
입력 텍스트: 이준석 요즘 잘하는거 같음
감정: 긍정적
None
CPU times: user 123 ms, sys: 7.93 ms, total: 131 ms
Wall time: 127 ms


