# **SMS Analysis with pretrained BERT**

In [0]:
sample = list()
while True:
    sample.append([1])

In [0]:
!nvidia-smi

Mon Feb 24 07:50:55 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 440.48.02    Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   40C    P0    26W / 250W |      0MiB / 16280MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage      |
|  No ru

# 진행 순서

1. 학습 데이터 준비
2. 정규표현식 전처리
3. 토크나이징
4. tokens to ids
5. padding
6. 데이터 셋 분할
7. 어텐션 마스킹
8. 마스킹 분할
9. 텐서 변환
10. 모델링
11. 데이터 로더에 담기
12. cuda로 보내기
13. 데이터 학습 시작
14. 데스트 수행

# 학습 데이터 준비

In [0]:
import pandas as pd
df_data = pd.read_csv('/content/drive/My Drive/tabditor/FromZero/review_finance/Copy of 30000.csv', header=None)
df_data

# 정규표현식 전처리

In [0]:
import re
re_pat = re.compile('[^가-힇a-zA-Z0-9\s]')

In [0]:
re_pat.sub('', 'This is 테스트 문자열@#$%@%&*%^#423451. ')

'This is 테스트 문자열423451 '

In [0]:
df_data[0].apply(lambda x: '[CLS]'+x)

In [0]:
df_data.apply(lambda x: '[CLS]'+x[0], axis=1)

In [0]:
def my_function(x):
    return '[CLS]'+x

In [0]:
my_function('안녕하세요')

'[CLS]안녕하세요'

In [0]:
df_data[0].apply(my_function)

In [0]:
tmp = lambda x='안녕하세요': '[CLS]'+x
tmp('안녕하세요')

'[CLS]안녕하세요'

In [0]:
df_data[0] = df_data[0].apply(lambda x: re_pat.sub('', x))

In [0]:
df_data[0]

# 토크나이징

In [0]:
pip install transformers



In [0]:
import transformers
from transformers import BertTokenizer

In [0]:
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

In [0]:
tokenizer.tokenize('테스트 문장 입니다')

['테', '##스트', '문', '##장', '입', '##니다']

In [0]:
df_data[0] = df_data[0].apply(lambda x: tokenizer.tokenize(x))
df_data[0]

In [0]:
#uncased tokenizer
tokenizer_uncased = BertTokenizer.from_pretrained('bert-base-multilingual-uncased')

HBox(children=(IntProgress(value=0, description='Downloading', max=871891, style=ProgressStyle(description_wid…




In [0]:
print(tokenizer_uncased.tokenize('햇살론 사잇돌 채무통XXX 금융센터 2018년 현재'))

['해', '##ᆺ', '##사', '##ᆯ로', '##ᆫ', 'ᄉ', '##ᅡ이', '##ᆺ', '##도', '##ᆯ', 'ᄎ', '##ᅢ', '##무', '##통', '##xx', '##x', '그', '##ᆷ', '##유', '##ᆼ', '##세', '##ᆫ터', '2018년', '현재']


In [0]:
df_data[0].apply(lambda x: tokenizer_uncased.tokenize(x))

# tokens to ids

In [0]:
dir(tokenizer)

['SPECIAL_TOKENS_ATTRIBUTES',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__len__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_additional_special_tokens',
 '_bos_token',
 '_cls_token',
 '_convert_id_to_token',
 '_convert_token_to_id',
 '_convert_token_to_id_with_added_voc',
 '_eos_token',
 '_from_pretrained',
 '_mask_token',
 '_pad_token',
 '_pad_token_type_id',
 '_sep_token',
 '_tokenize',
 '_unk_token',
 'add_special_tokens',
 'add_tokens',
 'added_tokens_decoder',
 'added_tokens_encoder',
 'additional_special_tokens',
 'additional_special_tokens_ids',
 'all_special_ids',
 'all_special_tokens',
 'basic_tokenizer',
 'batch_encode_plus',
 'bos_token',
 'bos_token_id',
 'build_inputs_with_special_tokens'

In [0]:
import numpy as np
tokenizer.convert_tokens_to_ids(['근', '하', '신'])

[8926, 9952, 9487]

In [0]:
tokenizer.convert_ids_to_tokens([8926, 9952, 9487])

['근', '하', '신']

In [0]:
input_ids = [tokenizer.convert_tokens_to_ids(x) for x in df_data[0]]
print(input_ids)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [0]:
type(input_ids[0][0])

int

# 패딩

In [0]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [0]:
dir([])

['__add__',
 '__class__',
 '__contains__',
 '__delattr__',
 '__delitem__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__gt__',
 '__hash__',
 '__iadd__',
 '__imul__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__len__',
 '__lt__',
 '__mul__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__reversed__',
 '__rmul__',
 '__setattr__',
 '__setitem__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 'append',
 'clear',
 'copy',
 'count',
 'extend',
 'index',
 'insert',
 'pop',
 'remove',
 'reverse',
 'sort']

In [0]:
pad_sequences([[1, 2, 3, 4, 5, 6, 7]], maxlen=5, truncating='pre')

array([[3, 4, 5, 6, 7]], dtype=int32)

In [0]:
# 이전 방법
input_ids = [pad_sequences(x, dtype='long', maxlen=max_len, padding='post') for x in input_ids]
input_ids

ValueError: ignored

In [0]:
# 새로운 방법
max_len = 512
input_ids = pad_sequences(input_ids, maxlen=max_len, dtype='long', padding='post', truncating='post')
input_ids

array([[   185,  10410,   9272, ...,      0,      0,      0],
       [ 62885,   8888, 118617, ...,      0,      0,      0],
       [ 62885,   8888, 118617, ...,      0,      0,      0],
       ...,
       [  9964, 106249,  42769, ...,      0,      0,      0],
       [  9964, 106249,  42769, ...,  62885,  12674,  12674],
       [  9964, 106249,  42769, ...,  11903,  14867,   9659]])

In [0]:
print(type(input_ids[0]))
print(type(input_ids[0][0]))

<class 'numpy.ndarray'>
<class 'numpy.int64'>


# 데이터셋 분할

In [0]:
from sklearn.model_selection import train_test_split

In [0]:
labels = df_data[1].to_list()
print(labels)

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [0]:
train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(input_ids,
                                                                                    labels,
                                                                                    random_state=2018,
                                                                                    test_size=0.1)

In [0]:
dir(df_data[0])

['T',
 '_AXIS_ALIASES',
 '_AXIS_IALIASES',
 '_AXIS_LEN',
 '_AXIS_NAMES',
 '_AXIS_NUMBERS',
 '_AXIS_ORDERS',
 '_AXIS_REVERSED',
 '_HANDLED_TYPES',
 '__abs__',
 '__add__',
 '__and__',
 '__array__',
 '__array_priority__',
 '__array_ufunc__',
 '__array_wrap__',
 '__bool__',
 '__class__',
 '__contains__',
 '__copy__',
 '__deepcopy__',
 '__delattr__',
 '__delitem__',
 '__dict__',
 '__dir__',
 '__div__',
 '__divmod__',
 '__doc__',
 '__eq__',
 '__finalize__',
 '__float__',
 '__floordiv__',
 '__format__',
 '__ge__',
 '__getattr__',
 '__getattribute__',
 '__getitem__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__iadd__',
 '__iand__',
 '__ifloordiv__',
 '__imod__',
 '__imul__',
 '__init__',
 '__init_subclass__',
 '__int__',
 '__invert__',
 '__ior__',
 '__ipow__',
 '__isub__',
 '__iter__',
 '__itruediv__',
 '__ixor__',
 '__le__',
 '__len__',
 '__long__',
 '__lt__',
 '__matmul__',
 '__mod__',
 '__module__',
 '__mul__',
 '__ne__',
 '__neg__',
 '__new__',
 '__nonzero__',
 '__or__',
 '__pos__',
 '__p

In [0]:
print(train_labels)
print(validation_labels)

[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

# 어텐션 마스킹

In [0]:
attention_masks = list()

for id in input_ids:

    ls_id = list()
    for i in id:
        if i != 0:
            ls_id.append(1)
        else:
            ls_id.append(0)
    attention_masks.append(ls_id)

print(attention_masks)
print(len(attention_masks))

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



# 마스킹 분할

In [0]:
# train_masks, validation_masks, _, _ = train_test_split(attention_masks,
#                                                         [0 for _ in range(30344)],
#                                                         random_state=2018,
#                                                         test_size=0.1)

In [0]:
train_masks, validation_masks = train_test_split(attention_masks,
                                                  random_state=2018,
                                                  test_size=0.1)

In [0]:
print(train_masks)
print(validation_masks)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



# 텐서 변환

In [0]:
import torch

train_inputs = torch.tensor(train_inputs)
validation_inputs = torch.tensor(validation_inputs)
train_labels = torch.tensor(train_labels)
validation_labels = torch.tensor(validation_labels)
train_masks = torch.tensor(train_masks)
validation_masks = torch.tensor(validation_masks)

In [0]:
print(type(train_inputs))
print(train_inputs.type())

<class 'torch.Tensor'>
torch.LongTensor


# 모델링

In [0]:
from transformers import BertForSequenceClassification

In [0]:
model = BertForSequenceClassification.from_pretrained('bert-base-multilingual-cased')

HBox(children=(IntProgress(value=0, description='Downloading', max=569, style=ProgressStyle(description_width=…




HBox(children=(IntProgress(value=0, description='Downloading', max=714314041, style=ProgressStyle(description_…




In [0]:
# 모델 구조 출력 시 케라스에서는 model.summary()
model

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elemen

In [0]:
model_multiclass = BertForSequenceClassification.from_pretrained('bert-base-multilingual-cased', num_labels=10)

In [0]:
model_multiclass

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elemen

# 데이터 로더에 담기

In [0]:
#train_inputs, validation_inputs, train_labels, validation_labels

In [0]:
from torch.utils.data import TensorDataset, RandomSampler, DataLoader, SequentialSampler

In [0]:
batch_size = 4

train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

# cuda로 보내기

In [0]:
import torch
import torch.nn as nn
import torch.optim as optim

loss_function = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)

In [0]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

model.to(device)
loss_function.to(device)

CrossEntropyLoss()

# 데이터 학습 시작

In [0]:
for i in range(4):
    for idx, batch in enumerate(train_dataloader):
        batch = tuple(t.to('cuda') for t in batch)
        b_input_ids, b_input_masks, b_input_labels = batch
        
        prediction = model(b_input_ids, b_input_masks)

        loss = loss_function(prediction[0], b_input_labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        if (idx+1) % 100 == 0:
            print(idx+1, '번 데이터 학습 완료', loss)
    print(i+1, '번째 epoch 완료')

100 번 데이터 학습 완료 tensor(0.0595, device='cuda:0', grad_fn=<NllLossBackward>)
200 번 데이터 학습 완료 tensor(0.0277, device='cuda:0', grad_fn=<NllLossBackward>)
300 번 데이터 학습 완료 tensor(0.0271, device='cuda:0', grad_fn=<NllLossBackward>)
400 번 데이터 학습 완료 tensor(0.1178, device='cuda:0', grad_fn=<NllLossBackward>)
500 번 데이터 학습 완료 tensor(0.0173, device='cuda:0', grad_fn=<NllLossBackward>)
600 번 데이터 학습 완료 tensor(0.0197, device='cuda:0', grad_fn=<NllLossBackward>)
700 번 데이터 학습 완료 tensor(0.0066, device='cuda:0', grad_fn=<NllLossBackward>)
800 번 데이터 학습 완료 tensor(0.0545, device='cuda:0', grad_fn=<NllLossBackward>)
900 번 데이터 학습 완료 tensor(0.8113, device='cuda:0', grad_fn=<NllLossBackward>)
1000 번 데이터 학습 완료 tensor(0.2129, device='cuda:0', grad_fn=<NllLossBackward>)
1100 번 데이터 학습 완료 tensor(0.0293, device='cuda:0', grad_fn=<NllLossBackward>)
1200 번 데이터 학습 완료 tensor(0.0884, device='cuda:0', grad_fn=<NllLossBackward>)
1300 번 데이터 학습 완료 tensor(0.0843, device='cuda:0', grad_fn=<NllLossBackward>)
1400 번 데이터 학습 완료 tens

KeyboardInterrupt: ignored

In [0]:
pwd

'/content'

In [0]:
# save
savePath = "/content/drive/My Drive/tabditor/FromZero/review_finance/bert_fine_tuned_2eps"
torch.save(model.state_dict(), savePath)

In [0]:
# load
from transformers import BertForSequenceClassification

savePath = "/content/drive/My Drive/tabditor/FromZero/review_finance/bert_fine_tuned_2eps"

model = BertForSequenceClassification.from_pretrained('bert-base-multilingual-cased')
model.load_state_dict(torch.load(savePath))
model

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elemen

# 테스트 수행

In [0]:
!pip install transformers

import torch
import pandas as pd
import re
import transformers
from transformers import BertTokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

df_data = pd.read_csv('/content/drive/My Drive/tabditor/FromZero/review_finance/Copy of 30000.csv', header=None)
re_pat = re.compile('[^가-힇a-zA-Z0-9\s]')
df_data[0] = df_data[0].apply(lambda x: re_pat.sub('', x))

tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
df_data[0] = df_data[0].apply(lambda x: tokenizer.tokenize(x))
input_ids = [tokenizer.convert_tokens_to_ids(x) for x in df_data[0]]

labels = df_data[1].to_list()

max_len = 512
input_ids = pad_sequences(input_ids, maxlen=max_len, dtype='long', padding='post', truncating='post')

attention_masks = list()
for id in input_ids:
    ls_id = list()
    for i in id:
        if i != 0:
            ls_id.append(1)
        else:
            ls_id.append(0)
    attention_masks.append(ls_id)



In [0]:
with torch.no_grad():
    for _ in range(1):

        input_ids = torch.tensor(input_ids)
        labels = torch.tensor(labels)
        attention_masks = torch.tensor(attention_masks)
        
        prediction = model(input_ids, attention_masks)
        print(prediction)
        break
        correct = torch.argmax(prediction, 1) == y_test
        accuracy = correct.float().mean()
        print(accuracy.item())

  after removing the cwd from sys.path.
  """
  


In [0]:
with torch.no_grad():
    model.eval()
    for _ in range(1):
        input_value = ('우리 이따가 저녁이나 같이 먹을까? 아니면 커피 한잔 같이 해도 좋고!'
                        '몇시가 괜찮아? 우리가 왕십리로 갈까? 어디가 좋을까'
                        '그럼 어느정도 이야기 다 된거 같네, 그럼 내일 연락할게!')
        print('input message: ', input_value)
        
        #토크나이징
        input_value = tokenizer.tokenize(input_value)
        # print(input_value)
        
        #token_to_ids
        input_value = tokenizer.convert_tokens_to_ids(input_value)
        # print(input_value)

        #패딩
        input_value = pad_sequences([input_value], maxlen=512, dtype='long', padding='post', truncating='post')
        # print(input_value)
        
        #어텐션 마스킹
        attention_masks = list()
        for value in input_value:
            ls_value = list()
            for v in value:
                if int(v) != 0:
                    ls_value.append(1)
                else:
                    ls_value.append(0)
            attention_masks.append(ls_value)
        # print(attention_masks)

        #텐서변환
        input_value = torch.tensor(input_value)
        attention_masks = torch.tensor(attention_masks)

        #모델에 입력하여 출력값 확인하기
        prediction = model(input_value, attention_masks)
        
        if torch.argmax(prediction[0][0]) == 0:
            return_message = 'This is not a non-smishing text'
        else:
            return_message = 'This is a smishing text'

        print('예측 결과: ', return_message)
        

input message:  우리 이따가 저녁이나 같이 먹을까? 아니면 커피 한잔 같이 해도 좋고!몇시가 괜찮아? 우리가 왕십리로 갈까? 어디가 좋을까그럼 어느정도 이야기 다 된거 같네, 그럼 내일 연락할게!
예측 결과:  This is not a non-smishing text
