In [None]:
# Must exceed 14MiB (check the nvidia-smi output below)
!nvidia-smi

In [None]:
!pip3 install kobert-transformers
!pip install transformers==3.0.0
!pip install sklearn_crfsuite

In [27]:
from sklearn_crfsuite.metrics import flat_classification_report
from transformers import BertConfig, BertTokenizer, TFBertModel
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from kobert_transformers import get_tokenizer
from google.colab import drive 
from tqdm import tqdm,trange
import tensorflow as tf
import pandas as pd
import numpy as np
import logging
import os

In [None]:
# Working folder on Google Drive: the 'rsc' directory under 'My Drive' of the mount point used below.
savePath = '/content/check/My Drive/rsc'
# Mount Google Drive at /content/check (Colab).
drive.mount('/content/check') 
# Dataset base name, with a leading slash and no extension; it is concatenated to savePath when paths are built.
fname = '/dataset_v4'

In [5]:
# Create an 'rsc' folder on Drive and put the csv file into the preprocessed_data created there.
file_full_name = f"{savePath}{fname}.csv"
# Read the dataset and discard the 'Unnamed: 0' column in a single expression.
df_data = pd.read_csv(file_full_name).drop(columns='Unnamed: 0')

In [6]:
## Collect the distinct tag values present in the dataset
tag_list = df_data.Tag.unique()
# np.sort is ascending; reversing it yields descending order
tag_list = np.sort(tag_list)[::-1]

# tag -> integer id, starting at 1 (id 0 is reserved for 'Null' below)
label_map = {label: i + 1 for i, label in enumerate(tag_list)}
# integer id -> full tag (inverse of label_map)
idx2label = {i: w for w, i in label_map.items()}
idx2label[0] = 'Null'
num_labels = len(tag_list) + 1

# integer id -> tag with its prefix removed (the part after the first '-').
# Built as a NEW dict: the previous `idx2label_k = idx2label` only aliased the
# same object, so stripping the prefixes silently rewrote idx2label as well.
idx2label_k = {
  key: (value.split('-')[1] if '-' in value else value)
  for key, value in idx2label.items()
}

In [7]:
# Turn the rows of one sentence into a list of [word, tag] pairs.
agg_func = lambda s: [ [w,t] for w,t in zip(s["Word"].values.tolist(),s["Tag"].values.tolist())]

# Group into sentences FIRST (sort=False keeps the order of first appearance in the
# file), then split. Splitting the raw rows with shuffle=False, as was done before,
# could cut the sentence lying on the 80% boundary in two and put one fragment in
# each set.
sentences_grouped = df_data.groupby("Sentence #", sort=False).apply(agg_func)
x_train_grouped, x_test_grouped = train_test_split(sentences_grouped, test_size=0.20, shuffle=False)

# Row-level frames are still exposed under their original names; they now contain
# exactly the rows of the sentences assigned to each split.
in_train = df_data["Sentence #"].isin(x_train_grouped.index)
x_train, x_test = df_data[in_train], df_data[~in_train]

# Words of every sentence
x_train_sentences = [[pair[0] for pair in sent] for sent in x_train_grouped.values]
x_test_sentences = [[pair[0] for pair in sent] for sent in x_test_grouped.values]

# Tags of every sentence, aligned with the words above
x_train_tags = [[pair[1] for pair in sent] for sent in x_train_grouped.values]
x_test_tags = [[pair[1] for pair in sent] for sent in x_test_grouped.values]

In [None]:
''' BERT model settings '''
# Sequence length used when truncating tokens and padding the model inputs.
max_seq_length =128
# Label id that convert_to_input places before and after each sentence's labels
# (the positions of the special tokens added by the tokenizer).
pad_token_label_id = 0
# Name of the pretrained checkpoint used for the config, the tokenizer and the model.
BERT_MODEL="bert-base-multilingual-cased"
config = BertConfig.from_pretrained(BERT_MODEL)     # config needed to use the transformers BERT library
tokenizer = BertTokenizer.from_pretrained(BERT_MODEL,do_lower_case=False)

In [None]:
def convert_to_input(sentences,tags):
  """Convert word/tag sentences into the (unpadded) input format expected by BERT.

  Every word is split into sub-tokens with the module-level ``tokenizer`` and its
  tag is spread over those sub-tokens: the first sub-token keeps the word's tag;
  the remaining sub-tokens of a ``B...`` word receive the matching ``I...`` tag,
  and those of any other word repeat the word's own tag.

  Args:
    sentences: list of sentences, each a list of word strings.
    tags: list of tag lists, aligned one-to-one with ``sentences``.

  Returns:
    Tuple ``(input_id_list, token_type_id_list, attention_mask_list, label_id_list)``;
    each element holds one list per sentence. Sequences are truncated to
    ``max_seq_length`` (special tokens included) but not padded.

  Raises:
    KeyError: if a tag is missing from ``label_map``.
  """
  input_id_list, attention_mask_list, token_type_id_list = [], [], []
  label_id_list = []
  special_tokens_count = 2
  max_tokens = max_seq_length - special_tokens_count

  for words, word_tags in tqdm(zip(sentences, tags), total=len(tags)):
    tokens = []
    label_ids = []
    for word, label in zip(words, word_tags):
      word_tokens = tokenizer.tokenize(word)
      if not word_tokens:
        # The tokenizer produced nothing for this word. Emitting a label anyway
        # (as `[x] + [y] * -1` used to do) would shift every following label by
        # one position relative to its token.
        continue
      tokens.extend(word_tokens)

      first_id = label_map[label]
      if label.startswith("B"):
        # Continuation pieces of a B-xxx word are labelled I-xxx; fall back to the
        # B id when the dataset defines no matching I tag.
        rest_id = label_map.get("I" + label[1:], first_id)
      else:
        rest_id = first_id
      label_ids.extend([first_id] + [rest_id] * (len(word_tokens) - 1))

    # Leave room for the two special tokens added by encode_plus.
    if len(tokens) > max_tokens:
      tokens = tokens[:max_tokens]
      label_ids = label_ids[:max_tokens]

    inputs = tokenizer.encode_plus(tokens,add_special_tokens=True, max_length=max_seq_length)
    input_ids, token_type_ids = inputs["input_ids"], inputs["token_type_ids"]
    attention_masks = [1] * len(input_ids)
    # Surround the labels with the placeholder id for the two special-token positions.
    label_ids = [pad_token_label_id] + label_ids + [pad_token_label_id]

    input_id_list.append(input_ids)
    token_type_id_list.append(token_type_ids)
    attention_mask_list.append(attention_masks)
    label_id_list.append(label_ids)

  return input_id_list,token_type_id_list,attention_mask_list,label_id_list
# Only log messages of level ERROR and above.
logging.basicConfig(level=logging.ERROR)
''' 위 함수를 이용해 변환 작업 '''
# Convert the training and test sentences with convert_to_input.
# Each call returns, in order: input ids, token type ids, attention masks, label ids.
input_ids_train,token_ids_train,attention_masks_train,label_ids_train=convert_to_input(x_train_sentences,x_train_tags)
input_ids_test,token_ids_test,attention_masks_test,label_ids_test=convert_to_input(x_test_sentences,x_test_tags)

In [10]:
# Pad every feature list to the fixed BERT input length (max_seq_length) and
# one-hot encode the label ids.
from keras.utils import to_categorical


def _pad_post(sequences):
  """Post-pad / post-truncate ``sequences`` to ``max_seq_length`` with dtype "long"."""
  return pad_sequences(sequences, maxlen=max_seq_length, dtype="long", truncating="post", padding="post")


## Training data
input_ids_train = _pad_post(input_ids_train)
token_ids_train = _pad_post(token_ids_train)
attention_masks_train = _pad_post(attention_masks_train)
label_ids_train = _pad_post(label_ids_train)

## Test data
input_ids_test = _pad_post(input_ids_test)
token_ids_test = _pad_post(token_ids_test)
attention_masks_test = _pad_post(attention_masks_test)
label_ids_test = _pad_post(label_ids_test)

## One-hot encode the label ids, one matrix per sentence
label_ids_train2 = [to_categorical(row, num_classes=num_labels) for row in label_ids_train]
label_ids_test2 = [to_categorical(row, num_classes=num_labels) for row in label_ids_test]

In [None]:
# Neural network definition (currently "model 3"):
# BERT -> bidirectional LSTM -> per-token softmax over num_labels classes.

## Build the BERT layer
bert_model = TFBertModel.from_pretrained(BERT_MODEL, config=config)
bert_model.trainable = True     ## True when fine-tuning BERT

## Declare the three inputs BERT expects
input_ids = tf.keras.Input(shape=(max_seq_length,), dtype=tf.int32, name="input_ids")
attention_masks = tf.keras.Input(shape=(max_seq_length,), dtype=tf.int32, name="attention_masks")
token_type_ids = tf.keras.Input(shape=(max_seq_length,), dtype=tf.int32, name="token_type_ids")

## Start stacking the layers
sequence_output, pooled_output = bert_model(input_ids, attention_mask=attention_masks, token_type_ids=token_type_ids)

bi_lstm = tf.keras.layers.Bidirectional(
    tf.keras.layers.LSTM(64*4, return_sequences=True, recurrent_dropout=0.1)
)(sequence_output)
output = tf.keras.layers.TimeDistributed(
    tf.keras.layers.Dense(num_labels, activation="softmax")
)(bi_lstm)

# Keras binds a list of arrays to `inputs` by position. Every caller in this
# notebook passes [input ids, token type ids, attention masks], so the inputs must
# be declared in that same order. The previous order
# [input_ids, attention_masks, token_type_ids] fed the token type ids to BERT as
# the attention mask and the attention mask as the token type ids.
# NOTE(review): weight files produced with the previous wiring were trained on the
# swapped inputs — retrain before relying on them.
model = tf.keras.models.Model(inputs=[input_ids, token_type_ids, attention_masks], outputs=output)

model.compile(
    optimizer=tf.keras.optimizers.Adam(1e-5),
    loss="categorical_crossentropy",
    metrics=["accuracy"],
)

model.summary()

In [None]:
# Train for a single epoch, validating on the test arrays after the epoch.
train_inputs = [input_ids_train, token_ids_train, attention_masks_train]
valid_inputs = [input_ids_test, token_ids_test, attention_masks_test]

# NOTE(review): the Keras docs describe `use_multiprocessing` / `workers` as applying
# to generator or Sequence input only — confirm they are needed for array input.
fit_options = dict(batch_size=64, epochs=1, use_multiprocessing=True, workers=-1)

# A `callbacks=[checkpointer]` argument was commented out in the original call and
# stays disabled here.
history = model.fit(
    x=train_inputs,
    y=np.array(label_ids_train2),
    validation_data=(valid_inputs, np.array(label_ids_test2)),
    **fit_options
)

In [22]:
# Save the current weights to <savePath>/model<fname>.h5 (fname already starts with '/').
model.save_weights(savePath + '/model' + fname + '.h5')
# NOTE(review): this immediately replaces the in-memory weights with those stored in
# BERT_NER_v2.h5, so the following cells use that file's weights rather than the ones
# just saved — confirm this is intended.
model.load_weights(savePath + '/model/BERT_NER_v2.h5')

In [29]:
# Evaluation: predict on the test arrays and compare predicted vs. true label ids.
def _ids_to_tags(id_rows):
  """Map every label id in every row to its tag name via idx2label_k."""
  return [[idx2label_k[label_id] for label_id in row] for row in id_rows]


test_inputs = [input_ids_test, token_ids_test, attention_masks_test]
y_pred1 = model.predict(test_inputs)
# Most probable class per token
y_pred = np.argmax(y_pred1, axis=-1)
# Recover the true label ids from the one-hot targets
y_test_true = np.argmax(label_ids_test2, -1)

# Convert the indices to tags
y_pred2 = _ids_to_tags(y_pred)
y_test_true2 = _ids_to_tags(y_test_true)

In [None]:
# Build the classification report from the predicted and true tag sequences and print it.
report = flat_classification_report(y_pred=y_pred2, y_true=y_test_true2)
print(report)

In [44]:
# Minimum class probability a token needs before input2intentNer reports it as an entity.
cs_no = 0.7


def convert_sentences_to_input(sentences):
  """Build single-example BERT inputs for one raw sentence string.

  The text is prefixed with 'intent ', tokenized with the module-level
  ``tokenizer``, cut so that two special tokens still fit into
  ``max_seq_length``, and encoded.

  Args:
    sentences: the raw sentence (a single string).

  Returns:
    Tuple ``([input_ids], [token_type_ids], [attention_masks])``; each element is
    a one-item list holding the unpadded sequence.
  """
  special_tokens_count = 2
  limit = max_seq_length - special_tokens_count
  # Slicing is a no-op when the sentence already fits.
  tokens = list(tokenizer.tokenize('intent ' + sentences))[:limit]

  encoded = tokenizer.encode_plus(tokens, add_special_tokens=True, max_length=max_seq_length)
  ids = encoded["input_ids"]
  segment_ids = encoded["token_type_ids"]
  mask = [1] * len(ids)

  return [ids], [segment_ids], [mask]
def input2intentNer(sentence) :
  """Run the NER model on one sentence and return the detected entities.

  The sentence is encoded with ``convert_sentences_to_input``, padded to
  ``max_seq_length`` and passed to ``model``. Sub-tokens starting with "##" are
  glued back onto the word they continue. A word is reported when the label
  predicted for its FIRST sub-token has an id of 2 or more and that prediction's
  highest class probability is at least ``cs_no``.

  Args:
    sentence: the raw sentence (a single string).

  Returns:
    List of single-entry dicts ``{tag: word}`` in order of appearance; empty when
    nothing qualifies.
  """
  # Token ids that never contribute to a word.
  # NOTE(review): presumably the [CLS] / [SEP] / [PAD] ids of the BERT vocabulary —
  # verify against the tokenizer.
  skipped_ids = (101, 102, 0)
  # Smallest label id treated as an entity (the original test was `> 1.5`).
  min_entity_label_id = 2

  # Build the BERT inputs
  input_ids_sentence, token_ids_sentence, attention_masks_sentence = convert_sentences_to_input(sentence)
  # Pad to the fixed length the model expects
  input_ids_sentence = pad_sequences(input_ids_sentence,maxlen=max_seq_length,dtype="long",truncating="post",padding="post")
  token_ids_sentence = pad_sequences(token_ids_sentence,maxlen=max_seq_length,dtype="long",truncating="post",padding="post")
  attention_masks_sentence = pad_sequences(attention_masks_sentence,maxlen=max_seq_length,dtype="long",truncating="post",padding="post")
  bert_input = [input_ids_sentence, token_ids_sentence, attention_masks_sentence]

  ner_score = model.predict(bert_input)
  ner_output = np.argmax(ner_score, axis=-1)

  lists = []
  # State of the word currently being assembled. Initialised up front so that a
  # leading "##" piece can no longer hit an unbound variable.
  words = ''
  pending = False      # True when the current word must be reported once complete
  pending_tag = None

  for i, token_id in enumerate(input_ids_sentence[0]):
    if token_id in skipped_ids:
      continue
    word = tokenizer.decode(input_ids_sentence[0][i:i+1])
    if word[0:2] == "##":
      # Continuation piece: extend the current word.
      words += word[2:]
      continue

    # A new word starts here, so the previous one is complete: report it if tagged.
    if pending:
      lists.append({pending_tag: words})

    piece = ner_output[0][i]
    pending = bool(piece >= min_entity_label_id and max(ner_score[0][i]) >= cs_no)
    pending_tag = idx2label_k.get(piece) if pending else None
    words = word

  # Report the final word as well.
  if pending:
    lists.append({pending_tag: words})
  return lists
  

In [46]:
# Sample input sentences; each one is passed to input2intentNer by the loop that follows.
test_datas = [
  '2차 국가장학금',
  '헬로우',
  'ㄱㅐ',
  '2차 국가장학금 어떻게 되니?',
  ' 헬로우 2차 국가장학금',
  ' ㄱㅐ 2차 국가장학금',
  ' 2차 국가장학금 ㄱㅐ',
  '2차 국가장학금 지원자격',
  '지원자격 2차 국가장학금',
  '2차 국가장학금 지원자격 어떻게 되니?',
  '지원자격 2차 국가장학금 어떻게 되니?',
  'ㄱㅐ 2차 국가장학금 지원자격 어떻게 되니?',
  '2차 국가장학금 ㄱㅐ 지원자격 어떻게 되니?',
  '2차 국가장학금 지원자격 ㄱㅐ 어떻게 되니?',
  '2차 국가장학금 지원자격 어떻게 되니? ㄱㅐ',
  'ㄱㅐ 지원자격 2차 국가장학금 어떻게 되니?',
  '지원자격 ㄱㅐ 2차 국가장학금 어떻게 되니?',
  '지원자격 2차 국가장학금 ㄱㅐ 어떻게 되니?',
  '지원자격 2차 국가장학금 어떻게 되니? ㄱㅐ',
  '국가 장학금 언제 나와요?',
  '국장 금액이 얼마나 되나요?',
  '씨발.. 장학금 언제 나오냐',
  '아니 돈 얼마나 줌? 국장임',
  '높은 곳에서는 새가 날고 낮은 곳에서 물이 흐르니 내 성적은 바닥을 기며 혀를 낼름 거리는데 장학금 받을 수 있을까?'
]

In [None]:
# For every sample sentence: print the sentence, then the entities extracted from it,
# then a blank separator line.
for data in test_datas:
  print(data)
  print(input2intentNer(data))
  print()