In [2]:
import pandas as pd

import sys
import torch
from torch.utils.data import Dataset, DataLoader
from torch.optim import Adam
import torch.nn.functional as F
import torch.utils.data as data_utils
import tensorflow as tf
from sklearn.metrics import classification_report

from pytorch_pretrained_bert.modeling import BertForSequenceClassification , BertForSequenceClassification, BertConfig ,BertForQuestionAnswering
from tokenization import BertTokenizer

from transformers import  AdamW
from transformers import get_linear_schedule_with_warmup
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from sklearn.model_selection import train_test_split
from tqdm import tqdm, trange

import pandas as pd
import numpy as np
import random
import time
import datetime

In [3]:
torch.cuda.is_available()

True

In [4]:
torch.cuda.get_device_name(0)

'Quadro RTX 8000'

In [5]:
torch.cuda.empty_cache()

In [6]:
train_df = pd.read_csv('./data/ratings_train.txt', sep='\t')
test_df = pd.read_csv('./data/ratings_test.txt', sep='\t')

In [7]:
train_df.dropna(inplace=True)
test_df.dropna(inplace=True)

train_df = train_df.sample(frac=0.5, random_state=999)
test_df = test_df.sample(frac=0.5, random_state=999)

In [8]:
sentences = train_df['document']

In [9]:
sentences = ["[CLS] " + str(sentence) + " [SEP]" for sentence in sentences]

In [10]:
labels = train_df['label'].values

In [11]:
tokenizer = BertTokenizer.from_pretrained("./vocab", do_lower_case=False)

In [12]:
tokenized_texts = [tokenizer.tokenize(sent) for sent in sentences]

In [14]:
pad_sequences = tf.keras.preprocessing.sequence.pad_sequences

In [15]:
# 입력 토큰의 최대 시퀀스 길이
MAX_LEN = 128

# 토큰을 숫자 인덱스로 변환
input_ids = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts]

# 문장을 MAX_LEN 길이에 맞게 자르고, 모자란 부분을 패딩 0으로 채움
input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")

input_ids[0]

array([   2,    9,   10, 1041,   15, 1006, 4320,  940,   19,   32,  559,
         16,  375,  295, 3018, 3784,    9, 1406,  534,   56, 2498,   18,
        479,  193, 1729,    7,    3,    9,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0])

In [16]:
# 어텐션 마스크 초기화
attention_masks = []

# 어텐션 마스크를 패딩이 아니면 1, 패딩이면 0으로 설정
# 패딩 부분은 BERT 모델에서 어텐션을 수행하지 않아 속도 향상
for seq in input_ids:
    seq_mask = [float(i>0) for i in seq]
    attention_masks.append(seq_mask)

print(attention_masks[0])

[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]


In [17]:
# 훈련셋과 검증셋으로 분리
train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(input_ids,
                                                                                    labels, 
                                                                                    random_state=0, 
                                                                                    test_size=0.1)

# 어텐션 마스크를 훈련셋과 검증셋으로 분리
train_masks, validation_masks, _, _ = train_test_split(attention_masks, 
                                                       input_ids,
                                                       random_state=0, 
                                                       test_size=0.1)



train_segment = np.zeros((len(train_inputs),128))
validation_segment = np.zeros((len(validation_inputs),128))

# 데이터를 파이토치의 텐서로 변환
train_inputs = torch.tensor(train_inputs,dtype=torch.long)
train_labels = torch.tensor(train_labels,dtype=torch.long)
train_masks = torch.tensor(train_masks,dtype=torch.long)
train_segment = torch.tensor(train_segment,dtype=torch.long)
validation_inputs = torch.tensor(validation_inputs,dtype=torch.long)
validation_labels = torch.tensor(validation_labels,dtype=torch.long)
validation_masks = torch.tensor(validation_masks,dtype=torch.long)
validation_segment = torch.tensor(validation_segment,dtype=torch.long)

In [18]:
# 배치 사이즈
batch_size = 128

# 파이토치의 DataLoader로 입력, 마스크, 라벨을 묶어 데이터 설정
# 학습시 배치 사이즈 만큼 데이터를 가져옴
train_data = TensorDataset(train_inputs, train_masks, train_segment ,train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

validation_data = TensorDataset(validation_inputs, validation_masks, validation_segment ,validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

In [19]:
device = torch.device("cuda")

In [20]:
model = BertForSequenceClassification.from_pretrained("./pretrained_model",num_labels = 2)

In [21]:
# 옵티마이저 설정
optimizer = AdamW(model.parameters(),
                  lr = 2e-5, # 학습률
                  eps = 1e-8 # 0으로 나누는 것을 방지하기 위한 epsilon 값
                )
# 에폭수
epochs = 1

# 총 훈련 스텝 : 배치반복 횟수 * 에폭
total_steps = len(train_dataloader) * epochs

# 학습률을 조금씩 감소시키는 스케줄러 생성
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0,
                                            num_training_steps = total_steps)

In [22]:
def accuracy(out, labels):
    outputs = np.argmax(out, axis=1)
    return np.sum(outputs == labels)

In [23]:
model.cuda()

model 구조

In [25]:
train_loss_set = []

model.train()
for _ in trange(epochs, desc="Epoch"):
    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0
    for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
        if torch.cuda.is_available():
            batch = tuple(t.to(device) for t in batch)
        input_ids, input_mask, segment_ids, label_ids = batch
        loss = model(input_ids, segment_ids, input_mask, label_ids)
        train_loss_set.append(loss.item())

        loss.backward()

        tr_loss += loss.item()
        nb_tr_examples += input_ids.size(0)
        nb_tr_steps += 1
        
        optimizer.step()
        optimizer.zero_grad()

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]
Iteration:   0%|          | 0/528 [00:00<?, ?it/s][A
Iteration:   0%|          | 1/528 [00:01<10:17,  1.17s/it][A
Iteration:   0%|          | 2/528 [00:02<10:09,  1.16s/it][A
Iteration:   1%|          | 3/528 [00:03<10:02,  1.15s/it][A
Iteration:   1%|          | 4/528 [00:04<09:57,  1.14s/it][A
Iteration:   1%|          | 5/528 [00:05<09:54,  1.14s/it][A
Iteration:   1%|          | 6/528 [00:06<09:50,  1.13s/it][A
Iteration:   1%|▏         | 7/528 [00:07<09:49,  1.13s/it][A
Iteration:   2%|▏         | 8/528 [00:09<09:47,  1.13s/it][A
Iteration:   2%|▏         | 9/528 [00:10<09:46,  1.13s/it][A
Iteration:   2%|▏         | 10/528 [00:11<09:46,  1.13s/it][A
Iteration:   2%|▏         | 11/528 [00:12<09:46,  1.13s/it][A
Iteration:   2%|▏         | 12/528 [00:13<09:45,  1.14s/it][A
Iteration:   2%|▏         | 13/528 [00:14<09:44,  1.14s/it][A
Iteration:   3%|▎         | 14/528 [00:15<09:43,  1.14s/it][A


학습과정 생략

In [26]:
y_pred = np.array([])
y_true = np.array([])

In [27]:
model.eval()
eval_loss, eval_accuracy = 0, 0
nb_eval_steps, nb_eval_examples = 0, 0
outputs = []

for input_ids, input_mask, segment_ids, label_ids in tqdm(validation_dataloader, desc="Evaluating"):
    input_ids = input_ids.to(device)
    input_mask = input_mask.to(device)
    segment_ids = segment_ids.to(device)
    label_ids = label_ids.to(device)

    with torch.no_grad():
        tmp_eval_loss = model(input_ids, segment_ids, input_mask, label_ids)
        logits = model(input_ids, segment_ids, input_mask)

    logits = logits.detach().cpu().numpy()
    label_ids = label_ids.to('cpu').numpy()

    current_out = np.argmax(logits, axis=1)

    tmp_eval_accuracy = accuracy(logits, label_ids)
    y_pred = np.concatenate((y_pred,np.argmax(logits, axis=1)),axis=None)
    y_true = np.concatenate((y_true,label_ids),axis=None)
    eval_loss += tmp_eval_loss.mean().item()
    eval_accuracy += tmp_eval_accuracy

    nb_eval_examples += input_ids.size(0)
    nb_eval_steps += 1


eval_loss = eval_loss / nb_eval_steps
eval_accuracy = eval_accuracy / nb_eval_examples

loss = tr_loss/nb_tr_steps

result = {'eval_loss': eval_loss,
          'eval_accuracy': eval_accuracy,
          'train_loss': loss}

Evaluating: 100%|██████████| 59/59 [00:41<00:00,  1.41it/s]


In [28]:
result

{'eval_loss': 0.26039724708613704,
 'eval_accuracy': 0.8946666666666667,
 'train_loss': 0.23427869711127697}

In [29]:
from sklearn.metrics import classification_report

In [30]:
target_names = ['Pos', 'Neg']

In [31]:
print(classification_report(y_true, y_pred, target_names=target_names))

              precision    recall  f1-score   support

         Pos       0.90      0.89      0.89      3737
         Neg       0.89      0.90      0.90      3763

    accuracy                           0.89      7500
   macro avg       0.89      0.89      0.89      7500
weighted avg       0.89      0.89      0.89      7500

