In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pwd
%cd drive/MyDrive/DIAL/

/content
/content/drive/MyDrive/DIAL


In [3]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.30.2-py3-none-any.whl (7.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m19.9 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m25.3 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m86.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m72.3 MB/s[0m eta [36m0:00:0

In [4]:
import tensorflow as tf
import torch

from transformers import BertTokenizer
from transformers import BertForSequenceClassification, AdamW, BertConfig
from transformers import get_linear_schedule_with_warmup
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

import pandas as pd
import numpy as np
import random
import time
import datetime

In [5]:
traindf = pd.read_csv('./nsmc_data/train.csv')
testdf = pd.read_csv('./nsmc_data/test.csv')

print(traindf.head())

   label  rating                                           document
0      1       5  린넨느낌의 재질이 좋았고 달기도 편했습니다 길이를 잘못재는 바람에 딱 3센치가 모자...
1      1       4                          재구매 밥맛이좋아 계속 재구매해서 먹고 있어요
2      0       1  썩은거를 밑에다두고 포장하셨네요 밑면으로 맞춰놓고 포장하신거 같은데 저는 박스를 밑...
3      0       1  고무로 된 덮개들은 전부다 헐렁거리고 아크릴?케이스는 독에 꽂으면 충전안됩니다.. ...
4      0       1          색이 광고색과 너무 달라요 핑크는 전혀 핑크 같지 않고 당근색은 칙칙합니다


In [6]:
len(traindf), len(testdf)

(197000, 3000)

In [8]:
sentences = traindf['document']

In [9]:
sentences = ["[CLS] " + str(sentence) + " [SEP]" for sentence in sentences]
sentences[:10]

['[CLS] 린넨느낌의 재질이 좋았고 달기도 편했습니다 길이를 잘못재는 바람에 딱 3센치가 모자라지만 그래도 예뻐요 [SEP]',
 '[CLS] 재구매 밥맛이좋아 계속 재구매해서 먹고 있어요 [SEP]',
 '[CLS] 썩은거를 밑에다두고 포장하셨네요 밑면으로 맞춰놓고 포장하신거 같은데 저는 박스를 밑으로 뜯는 버릇이 있어서 바로 눈에 들어왔네요ㅎ 잘먹겠습니다ㅎ [SEP]',
 '[CLS] 고무로 된 덮개들은 전부다 헐렁거리고 아크릴?케이스는 독에 꽂으면 충전안됩니다.. 완전비추요..돈날림 [SEP]',
 '[CLS] 색이 광고색과 너무 달라요 핑크는 전혀 핑크 같지 않고 당근색은 칙칙합니다 [SEP]',
 '[CLS] 너무오래걸림요. 하나는 못받았어요. [SEP]',
 '[CLS] 아직 사용전인데 빨리 써 보고 싶네요 [SEP]',
 '[CLS] 재구매 항상 잘 구매합니다! 잘 마실게요 [SEP]',
 '[CLS] 좋아요 나쁘지 않아요 ㅎㅎ 가볍고 가장다리 철심이 나름 모양도 잡아주고 편하게 사용하고 있어요 ㅎㅎ [SEP]',
 '[CLS] 냄새 안나고 이쁩니다 [SEP]']

In [10]:
labels = traindf['label'].values
labels

array([1, 1, 0, ..., 0, 1, 0])

In [11]:
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased', do_lower_case=False)
tokenized_texts = [tokenizer.tokenize(sent) for sent in sentences]

print (sentences[0])
print (tokenized_texts[0])

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

[CLS] 린넨느낌의 재질이 좋았고 달기도 편했습니다 길이를 잘못재는 바람에 딱 3센치가 모자라지만 그래도 예뻐요 [SEP]
['[CLS]', '린', '##넨', '##느', '##낌', '##의', '재', '##질', '##이', '좋', '##았고', '달', '##기도', '편', '##했', '##습', '##니다', '길', '##이를', '잘', '##못', '##재', '##는', '바', '##람', '##에', '딱', '3', '##센', '##치가', '모', '##자', '##라', '##지만', '그', '##래', '##도', '예', '##뻐', '##요', '[SEP]']


In [12]:
MAX_LEN = 128

input_ids = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts]
# 패딩
input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")

input_ids[0]

array([   101,   9240, 118736, 118760, 118713,  10459,   9659,  48599,
        10739,   9685,  77172,   9061,  27792,   9924, 119424, 119081,
        48345,   8934,  66623,   9654, 118940,  36210,  11018,   9318,
        61250,  10530,   9131,    124, 119044, 104504,   9283,  13764,
        17342,  28578,   8924,  37388,  12092,   9576, 119013,  48549,
          102,      0,      0,      0,      0,      0,      0,      0,
            0,      0,      0,      0,      0,      0,      0,      0,
            0,      0,      0,      0,      0,      0,      0,      0,
            0,      0,      0,      0,      0,      0,      0,      0,
            0,      0,      0,      0,      0,      0,      0,      0,
            0,      0,      0,      0,      0,      0,      0,      0,
            0,      0,      0,      0,      0,      0,      0,      0,
            0,      0,      0,      0,      0,      0,      0,      0,
            0,      0,      0,      0,      0,      0,      0,      0,
      

In [14]:
attention_masks = []

for seq in input_ids:
    seq_mask = [float(i>0) for i in seq]
    attention_masks.append(seq_mask)

print(attention_masks[0])

[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]


In [15]:

train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(input_ids,
                                                                                    labels,
                                                                                    random_state=42,
                                                                                    test_size=0.1)


train_masks, validation_masks, _, _ = train_test_split(attention_masks,
                                                       input_ids,
                                                       random_state=42,
                                                       test_size=0.1)

train_inputs = torch.tensor(train_inputs)
train_labels = torch.tensor(train_labels)
train_masks = torch.tensor(train_masks)
validation_inputs = torch.tensor(validation_inputs)
validation_labels = torch.tensor(validation_labels)
validation_masks = torch.tensor(validation_masks)

print(train_inputs[0])
print(train_labels[0])
print(train_masks[0])
print(validation_inputs[0])
print(validation_labels[0])
print(validation_masks[0])

tensor([   101,   9405,  37114, 118696,  12030, 119219,   9283,  81220,   8908,
        100372, 119424,  77884,  48549,    119,   9746,  25486,  68773, 118696,
         10622,   9405,  11664,   9495,  74311,    119,    102,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0, 

In [16]:
batch_size = 64

train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

test set

In [17]:
sentences = testdf['document']
sentences[:10]

0    마감이 안좋아요...실밥도 많고 바느질도 부족한 부분이 몇군데 있네요...교환받기 ...
1                              깨끗하게 잘 다듬어져 있어요. 맛도좋고요.
2                    재구매 배송빨라요 길냥이들이 잘먹어요~~ 대용량이라 좋네요~
3                           제품도 빨리 배송해주시고 꼼꼼하게 잘챙겨주셨어요
4                             기타 남 멋지고 예뻐요 여러 사은품도 좋아요
5    기존것보다 다리를 올려놓으면 푹빠지니깐 무서워서 안올라가요.ㅠㅠ 안고 올려놓으면 가...
6    상품은잘받았습니다 요청한거와 손잡이방향은다르게왔지만 설치는 잘했습니다. 좀더 신경을...
7    재구매 아기땜에 하루빨리 필요했는데 배송도 오래 걸렸는데 벨크로가 안왔어요 민원처리...
8                        좋네요. 사서 방전된 차에 점프 했는데 잘 됐습니다.
9                                  저렴하게 잘샀어요ㅎ 쓰던거라 좋아요
Name: document, dtype: object

In [18]:
sentences = ["[CLS] " + str(sentence) + " [SEP]" for sentence in sentences]
sentences[:10]

['[CLS] 마감이 안좋아요...실밥도 많고 바느질도 부족한 부분이 몇군데 있네요...교환받기 귀찮아서 그냥 씁니다 [SEP]',
 '[CLS] 깨끗하게 잘 다듬어져 있어요. 맛도좋고요. [SEP]',
 '[CLS] 재구매 배송빨라요 길냥이들이 잘먹어요~~ 대용량이라 좋네요~ [SEP]',
 '[CLS] 제품도 빨리 배송해주시고 꼼꼼하게 잘챙겨주셨어요 [SEP]',
 '[CLS] 기타 남 멋지고 예뻐요 여러 사은품도 좋아요 [SEP]',
 '[CLS] 기존것보다 다리를 올려놓으면 푹빠지니깐 무서워서 안올라가요.ㅠㅠ 안고 올려놓으면 가만히 있는데. 습관되면 괜찮아지겠죠.^^ [SEP]',
 '[CLS] 상품은잘받았습니다 요청한거와 손잡이방향은다르게왔지만 설치는 잘했습니다. 좀더 신경을 써주시는게 어떨까 싶네요 [SEP]',
 '[CLS] 재구매 아기땜에 하루빨리 필요했는데 배송도 오래 걸렸는데 벨크로가 안왔어요 민원처리도 답답하구요 서비스 엉망입니다 [SEP]',
 '[CLS] 좋네요. 사서 방전된 차에 점프 했는데 잘 됐습니다. [SEP]',
 '[CLS] 저렴하게 잘샀어요ㅎ 쓰던거라 좋아요 [SEP]']

In [19]:
labels = testdf['label'].values
labels

array([0, 1, 1, ..., 0, 1, 0])

In [20]:
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased', do_lower_case=False)
tokenized_texts = [tokenizer.tokenize(sent) for sent in sentences]

print (sentences[0])
print (tokenized_texts[0])

[CLS] 마감이 안좋아요...실밥도 많고 바느질도 부족한 부분이 몇군데 있네요...교환받기 귀찮아서 그냥 씁니다 [SEP]
['[CLS]', '마', '##감', '##이', '안', '##좋', '##아', '##요', '.', '.', '.', '실', '##밥', '##도', '많', '##고', '바', '##느', '##질', '##도', '부', '##족', '##한', '부', '##분', '##이', '몇', '##군', '##데', '있', '##네', '##요', '.', '.', '.', '교', '##환', '##받', '##기', '귀', '##찮', '##아', '##서', '그', '##냥', '[UNK]', '[SEP]']


In [21]:
MAX_LEN = 128

input_ids = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts]

input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")

input_ids[0]

array([   101,   9246, 105197,  10739,   9521, 119214,  16985,  48549,
          119,    119,    119,   9489, 118969,  12092,   9249,  11664,
         9318, 118760,  48599,  12092,   9365,  52560,  11102,   9365,
        37712,  10739,   9282,  17360,  28911,   9647,  77884,  48549,
          119,    119,    119,   8907,  51745, 118965,  12310,   8920,
       119250,  16985,  12424,   8924, 118729,    100,    102,      0,
            0,      0,      0,      0,      0,      0,      0,      0,
            0,      0,      0,      0,      0,      0,      0,      0,
            0,      0,      0,      0,      0,      0,      0,      0,
            0,      0,      0,      0,      0,      0,      0,      0,
            0,      0,      0,      0,      0,      0,      0,      0,
            0,      0,      0,      0,      0,      0,      0,      0,
            0,      0,      0,      0,      0,      0,      0,      0,
            0,      0,      0,      0,      0,      0,      0,      0,
      

In [22]:
attention_masks = []

for seq in input_ids:
    seq_mask = [float(i>0) for i in seq]
    attention_masks.append(seq_mask)

print(attention_masks[0])

[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]


In [23]:
test_inputs = torch.tensor(input_ids)
test_labels = torch.tensor(labels)
test_masks = torch.tensor(attention_masks)

print(test_inputs[0])
print(test_labels[0])
print(test_masks[0])

tensor([   101,   9246, 105197,  10739,   9521, 119214,  16985,  48549,    119,
           119,    119,   9489, 118969,  12092,   9249,  11664,   9318, 118760,
         48599,  12092,   9365,  52560,  11102,   9365,  37712,  10739,   9282,
         17360,  28911,   9647,  77884,  48549,    119,    119,    119,   8907,
         51745, 118965,  12310,   8920, 119250,  16985,  12424,   8924, 118729,
           100,    102,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0, 

In [24]:
batch_size = 64

test_data = TensorDataset(test_inputs, test_masks, test_labels)
test_sampler = RandomSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)

In [25]:
if torch.cuda.is_available():
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    device = torch.device("cpu")
    print('No GPU available, using the CPU instead.')

No GPU available, using the CPU instead.


모델

In [None]:
model = BertForSequenceClassification.from_pretrained("bert-base-multilingual-cased", num_labels=2)
model.cuda()

Downloading model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12

In [None]:
optimizer = AdamW(model.parameters(),
                  lr = 2e-5, # 학습률
                  eps = 1e-8 # 0으로 나누는 것을 방지하기 위한 epsilon 값
                )

epochs = 4

total_steps = len(train_dataloader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps = 0,
                                            num_training_steps = total_steps)



train

In [26]:
def flat_accuracy(preds, labels):

    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()

    return np.sum(pred_flat == labels_flat) / len(labels_flat)

In [27]:
def format_time(elapsed):

    elapsed_rounded = int(round((elapsed)))

    return str(datetime.timedelta(seconds=elapsed_rounded))

In [None]:
from tqdm import tqdm

seed_val = 42
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

model.zero_grad()

for epoch_i in range(0, epochs):

    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    t0 = time.time()
    total_loss = 0

    model.train()

    for step, batch in enumerate(tqdm(train_dataloader, desc="Training", leave=False)):

        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch

        outputs = model(b_input_ids,
                        token_type_ids=None,
                        attention_mask=b_input_mask,
                        labels=b_labels)

        loss = outputs[0]
        total_loss += loss.item()
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()
        model.zero_grad()

    avg_train_loss = total_loss / len(train_dataloader)

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epcoh took: {:}".format(format_time(time.time() - t0)))

    #### eval
    print("")
    print("Running Validation...")

    t0 = time.time()

    model.eval()

    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0

    for batch in tqdm(validation_dataloader, desc="Validation", leave=False):

        batch = tuple(t.to(device) for t in batch)

        b_input_ids, b_input_mask, b_labels = batch

        with torch.no_grad():
            outputs = model(b_input_ids,
                            token_type_ids=None,
                            attention_mask=b_input_mask)

        logits = outputs[0]

        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        tmp_eval_accuracy = flat_accuracy(logits, label_ids)
        eval_accuracy += tmp_eval_accuracy
        nb_eval_steps += 1

    print("  Accuracy: {0:.2f}".format(eval_accuracy/nb_eval_steps))
    print("  Validation took: {:}".format(format_time(time.time() - t0)))
    torch.save(model.state_dict(), 'bert_nsmc_label.pth')

print("")
print("Training complete!")


Training...





  Average training loss: 0.26
  Training epcoh took: 0:18:06

Running Validation...




  Accuracy: 0.92
  Validation took: 0:00:39

Training...





  Average training loss: 0.20
  Training epcoh took: 0:18:04

Running Validation...




  Accuracy: 0.92
  Validation took: 0:00:39

Training...





  Average training loss: 0.17
  Training epcoh took: 0:18:04

Running Validation...




  Accuracy: 0.92
  Validation took: 0:00:40

Training...





  Average training loss: 0.14
  Training epcoh took: 0:18:04

Running Validation...




  Accuracy: 0.92
  Validation took: 0:00:39

Training complete!


test

In [28]:
testmodel = BertForSequenceClassification.from_pretrained("bert-base-multilingual-cased", num_labels=2)
testmodel.cpu()
testmodel.load_state_dict(torch.load('bert_nsmc_label.pth', map_location = 'cpu'))

Downloading model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual

<All keys matched successfully>

In [30]:
from tqdm import tqdm
testmodel.eval()

eval_loss, eval_accuracy = 0, 0
nb_eval_steps, nb_eval_examples = 0, 0

for batch in tqdm(test_dataloader, desc="Test", leave=False):

    batch = tuple(t.to(device) for t in batch)
    b_input_ids, b_input_mask, b_labels = batch

    with torch.no_grad():
        outputs = testmodel(b_input_ids,
                        token_type_ids=None,
                        attention_mask=b_input_mask)

    logits = outputs[0]

    logits = logits.detach().cpu().numpy()
    label_ids = b_labels.to('cpu').numpy()

    tmp_eval_accuracy = flat_accuracy(logits, label_ids)
    eval_accuracy += tmp_eval_accuracy
    nb_eval_steps += 1

print("")
print(f"Accuracy: {eval_accuracy/nb_eval_steps}")

                                                     


Accuracy: 0.9226823708206687




내 데이터

In [31]:

def convert_input_data(sentences, tokenizer):

    tokenized_texts = [tokenizer.tokenize(sent) for sent in sentences]

    MAX_LEN = 128

    input_ids = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts]
    input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")

    attention_masks = []
    for seq in input_ids:
        seq_mask = [float(i>0) for i in seq]
        attention_masks.append(seq_mask)

    inputs = torch.tensor(input_ids)
    masks = torch.tensor(attention_masks)

    return inputs, masks

In [32]:

def test_sentences(sentences, model):

    model.eval()

    inputs, masks = convert_input_data(sentences, tokenizer)

    b_input_ids = inputs.cpu()
    b_input_mask = masks.cpu()

    with torch.no_grad():
        outputs = model(b_input_ids,
                        token_type_ids=None,
                        attention_mask=b_input_mask)

    logits = outputs[0]
    logits = logits.detach().cpu().numpy()

    return logits

In [33]:
logits = test_sentences(['너무 잘 쓰고 있어요 만족합니다'], testmodel)

print(np.argmax(logits))

1


In [34]:
logits = test_sentences(['주연배우가 아깝다. 총체적 난국...'], testmodel)

print(logits)
print(np.argmax(logits))

[[ 2.9987874 -3.0106373]]
0
