In [None]:
!pip install transformers

In [53]:
import tensorflow as tf
import torch

from transformers import BertTokenizer
from transformers import BertForSequenceClassification, AdamW, BertConfig
from transformers import get_linear_schedule_with_warmup
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from transformers import TFBertForSequenceClassification
from transformers import BertTokenizerFast


import pandas as pd
import numpy as np
import random
import time
import datetime
from google.colab import drive
from transformers import TextClassificationPipeline

In [54]:
# Mount Google Drive so the saved checkpoint below is reachable on a fresh
# runtime. NOTE(review): Colab is expected to report "already mounted" and
# carry on when the drive is mounted already - confirm on your runtime.
drive.mount('/content/drive')

# Directory that both the tokenizer and the classifier are loaded from.
MODEL_DIR = '/content/drive/MyDrive/model'

tokenizer = BertTokenizer.from_pretrained(MODEL_DIR)

# Put the model on the GPU when one exists and fall back to the CPU otherwise,
# instead of crashing on a hard-coded "cuda".
model = BertForSequenceClassification.from_pretrained(MODEL_DIR).to(
    "cuda" if torch.cuda.is_available() else "cpu"
)

In [None]:
import csv
import pandas as pd
# Load the labelled reports from the CSV on Google Drive.
data = pd.read_csv('/content/drive/MyDrive/TrainSet _1st.csv', encoding="utf-8")

# Append the conclusion text to the findings text for every row in one
# vectorised assignment. The previous row-by-row `data['Findings'][i] = ...`
# was chained assignment, which pandas warns about and which does not write
# back to `data` under copy-on-write.
# `map(str)` keeps the original `str(...)` semantics (a missing value becomes
# the literal text "nan"). Note the column name really ends with a newline.
data['Findings'] = data['Findings'].map(str) + data['Conclusion\n'].map(str)

# Shuffled copy used as the evaluation set; the seed makes the order repeatable.
data_shuffled = data.sample(frac=1, random_state=42).reset_index(drop=True)
test = data_shuffled[:]
data.head()


In [56]:
# Wrap every report in BERT's special tokens by hand, because the text is
# tokenised with `tokenizer.tokenize` below, which is given plain strings.
sentences = ["[CLS] " + str(sentence) + " [SEP]" for sentence in test.Findings]

# Label data
labels = test['AcuteInfarction'].values

# Word-piece tokenisation.
# NOTE(review): this rebinds the global `tokenizer` (previously loaded from the
# saved model directory) to the stock multilingual vocabulary - confirm that
# both share the same vocabulary.
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased', do_lower_case=False)
tokenized_texts = [tokenizer.tokenize(sent) for sent in sentences]

# Fix the sequence length, convert tokens to integer ids, then pad / truncate.
# NOTE(review): truncating="post" also cuts the trailing [SEP] off reports that
# exceed MAX_LEN; kept unchanged here - confirm it matches training.
MAX_LEN = 509
input_ids = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts]
input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")

# Attention mask: 1.0 where the id is greater than 0 and 0.0 elsewhere, built
# in one vectorised comparison instead of a per-token Python loop.
attention_masks = (input_ids > 0).astype("float32")

# Convert to PyTorch tensors
test_inputs = torch.tensor(input_ids)
test_labels = torch.tensor(labels)
test_masks = torch.tensor(attention_masks)

# Batch size and data loader.
# Evaluation iterates in dataset order (SequentialSampler) so that every run
# sees the same batches; random sampling brings nothing at test time.
batch_size = 16
test_data = TensorDataset(test_inputs, test_masks, test_labels)
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)

In [57]:
# Choose the compute device once: CUDA when PyTorch reports it, else the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Report which device was selected.
if device.type == "cuda":
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')

There are 1 GPU(s) available.
We will use the GPU: Tesla T4


In [58]:
def flat_accuracy(preds, labels):
    """Return the fraction of rows whose highest-scoring column equals the label.

    Args:
        preds: 2-D array-like of scores; the arg-max is taken along axis 1.
        labels: NumPy array of class ids; it is flattened before comparison.

    Returns:
        Number of matching positions divided by the number of labels.
    """
    predicted_classes = np.argmax(preds, axis=1).ravel()
    true_classes = labels.flatten()
    hits = predicted_classes == true_classes
    return np.sum(hits) / len(true_classes)
    
    
# Helper for displaying elapsed time
def format_time(elapsed):
    """Format a duration given in seconds as an `h:mm:ss` string.

    The value is rounded to whole seconds before formatting.
    """
    whole_seconds = int(round(elapsed))
    delta = datetime.timedelta(seconds=whole_seconds)
    return str(delta)


# Start the wall-clock timer
t0 = time.time()

# Switch the model to evaluation mode
model.eval()

# Running totals over the whole test set. Accuracy is computed as
# correct / total examples, not as a mean of per-batch accuracies, because the
# final batch can be smaller than the others and would otherwise be
# over-weighted.
eval_accuracy = 0
nb_eval_steps, nb_eval_examples = 0, 0
nb_correct = 0

# Iterate over the data loader one batch at a time
for step, batch in enumerate(test_dataloader):
    # Progress report every 100 batches
    if step % 100 == 0 and not step == 0:
        elapsed = format_time(time.time() - t0)
        print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(test_dataloader), elapsed))

    # Move the batch to the selected device
    batch = tuple(t.to(device) for t in batch)

    # Unpack the batch
    b_input_ids, b_input_mask, b_labels = batch

    # No gradients are needed for evaluation
    with torch.no_grad():
        # Forward pass
        outputs = model(b_input_ids,
                        token_type_ids=None,
                        attention_mask=b_input_mask)

    # No labels are passed to the model, so the first output element is taken
    # as the logits (not a loss)
    logits = outputs[0]

    # Move the data to the CPU
    logits = logits.detach().cpu().numpy()
    label_ids = b_labels.to('cpu').numpy()

    # Count correct predictions in this batch
    predictions = np.argmax(logits, axis=1).flatten()
    nb_correct += int(np.sum(predictions == label_ids.flatten()))
    nb_eval_examples += len(label_ids)
    nb_eval_steps += 1

# Guard against an empty data loader instead of dividing by zero
eval_accuracy = nb_correct / nb_eval_examples if nb_eval_examples else float("nan")

print("")
print("Accuracy: {0:.2f}".format(eval_accuracy))
print("Test took: {:}".format(format_time(time.time() - t0)))


  Batch   100  of    387.    Elapsed: 0:01:01.
  Batch   200  of    387.    Elapsed: 0:02:01.
  Batch   300  of    387.    Elapsed: 0:03:02.

Accuracy: 0.99
Test took: 0:03:55


In [66]:
from transformers import pipeline
#nlp_sentence_classif = pipeline('sentiment-analysis',model=model, tokenizer=tokenizer, device=0)
#https://raki-1203.github.io/boostcamp_ai_tech/week_9/03.-single-sentence-classification-based-BERT-train/

# Print-free helper: returns the predicted class index for one input text
def sentences_predict(sent, max_length=128):
    """Classify a single text with the global `model` and `tokenizer`.

    Args:
        sent: The text to classify. Intended for one text at a time, because
            the arg-max below is taken over the flattened logits.
        max_length: Maximum number of tokens kept by the tokenizer; longer
            inputs are truncated. Defaults to 128, the previously hard-coded
            value. NOTE(review): the batch evaluation in this notebook pads /
            truncates to 509 tokens, so pass a larger value here if the two
            results are meant to be comparable.

    Returns:
        Index of the largest logit, as returned by `np.argmax`.
    """
    model.eval()
    tokenized_sent = tokenizer(
            sent,
            return_tensors="pt",
            truncation=True,
            add_special_tokens=True,
            max_length=max_length
    )
    # Keep the returned object so the device move is explicit rather than
    # relying on `.to()` mutating in place.
    tokenized_sent = tokenized_sent.to(device)

    with torch.no_grad():  # disable gradient tracking
        outputs = model(
            input_ids=tokenized_sent['input_ids'],
            attention_mask=tokenized_sent['attention_mask'],
            token_type_ids=tokenized_sent['token_type_ids']
            )

    # No labels are passed, so the first output element is taken as the logits
    logits = outputs[0]
    logits = logits.detach().cpu().numpy()
    result = np.argmax(logits)
    return result


# Texts to classify, taken from the (unshuffled) frame
x = data.Findings

# One prediction per row, in row order
ans = [sentences_predict(x[row]) for row in range(len(x))]

# Row numbers where the prediction does not equal the stored label
rc = [row for row, predicted in enumerate(ans)
      if not predicted == data.AcuteInfarction[row]]

# Every remaining row is a correct prediction
ck = len(ans) - len(rc)

u = ck / len(ans)
print(round(u,9)) # accuracy


0.984975767


In [67]:
print(rc) # row indices of the misclassified samples



[81, 230, 238, 259, 265, 282, 294, 410, 442, 451, 479, 550, 558, 615, 817, 865, 902, 1030, 1113, 1192, 1426, 1508, 1546, 1622, 1662, 1786, 1829, 1909, 1957, 1972, 2014, 2099, 2306, 2327, 2368, 2394, 2531, 2569, 2634, 2844, 2858, 2938, 2948, 3029, 3104, 3150, 3168, 3253, 3269, 3413, 3583, 3632, 3698, 3735, 3767, 3788, 3793, 3812, 3937, 3957, 3969, 3980, 3982, 4031, 4079, 4142, 4144, 4175, 4255, 4307, 4391, 4430, 4482, 4529, 4629, 4675, 4720, 4723, 4900, 4984, 5056, 5106, 5193, 5361, 5390, 5649, 5661, 5826, 5920, 6004, 6048, 6109, 6115]
