In [40]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split

from tensorflow.keras.models import load_model
from tensorflow.keras import backend as K
from tensorflow.keras import Input, Model
from tensorflow.keras import optimizers
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.callbacks import EarlyStopping

from keras_bert import load_trained_model_from_checkpoint, load_vocabulary
from keras_bert import Tokenizer
from keras_bert import AdamWarmup, calc_train_steps

import tensorflow_hub as hub
import bert

from bert import BertModelLayer
import tensorflow as tf

import codecs
import shutil
import os
from tqdm import tqdm

In [2]:
# Read the stop-word file into memory: one list element per line of the file.
with open('D:/LikeLion/Code/Project2/Data/stopwords.txt', encoding='utf-8') as f:
    list_file = list(f)
# Strip only trailing newline characters; any other whitespace is left untouched.
stopwords = [entry.rstrip('\n') for entry in list_file]

In [3]:
# Load the spell-checked corpus and keep only the rows whose label is not 2.
df = (
    pd.read_csv('D:/LikeLion/Code/Project2/Data/spell_check.csv', encoding='utf-8')
    .loc[lambda frame: frame['label'].ne(2)]
)

In [25]:
# Shuffle and hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    df['sentence'],
    df['label'],
    test_size=0.2,
    shuffle=True,
    random_state=119,
)

# Put each split's sentences and labels back side by side, renumbering rows from 0.
train_set = pd.concat([x_train, y_train], axis='columns').reset_index(drop=True)
test_set = pd.concat([x_test, y_test], axis='columns').reset_index(drop=True)

In [34]:
# Directory of the pretrained checkpoint, and the three files read from it.
pretrained_path = 'D:/LikeLion/Code/Project2/BERT/multi_cased_L-12_H-768_A-12'
config_path, checkpoint_path, vocab_path = (
    os.path.join(pretrained_path, file_name)
    for file_name in ('bert_config.json', 'bert_model.ckpt', 'vocab.txt')
)

In [27]:
# Names of the text and label columns in the dataset frames.
DATA_COLUMN, LABEL_COLUMN = 'sentence', 'label'

# Sequence length handed to the tokenizer (max_len) and to the BERT loader (seq_len).
MAX_SEQ_LEN = 70

# Settings passed to model.fit.
BATCH_SIZE = 16
EPOCHS = 10

# Presumably the optimiser learning rate; confirm where it is consumed.
LR = 0.00001

In [6]:
# Map every vocabulary entry to an id taken from the dictionary size at insertion time.
token_dict = {}

with codecs.open(vocab_path, 'r', encoding='utf-8') as vocab_file:
    for raw_line in vocab_file:
        piece = raw_line.strip()
        # Entries containing an underscore lose every underscore and gain a "##" prefix.
        if "_" in piece:
            piece = "##" + piece.replace("_", "")
        # A repeated entry is overwritten with the current size rather than a new slot.
        token_dict[piece] = len(token_dict)

In [7]:
class inherit_Tokenizer(Tokenizer):
  """keras_bert ``Tokenizer`` with a customised ``_tokenize`` step.

  The override lower-cases the text when the tokenizer is uncased, isolates
  punctuation and CJK characters as standalone words, drops NUL, U+FFFD and
  control characters, and word-piece tokenizes every whitespace-separated word.
  NOTE(review): unlike the stock implementation this does no Unicode
  normalisation -- presumably intentional for the target language; confirm.
  """

  def _tokenize(self, text):
    """Split ``text`` into word-piece tokens.

    Args:
      text: The string to tokenize.

    Returns:
      A list with the tokens produced by ``self._word_piece_tokenize`` for each
      word, in order. The list is empty when ``text`` is empty or holds only
      whitespace or dropped characters.
    """
    if not self._cased:
      text = text.lower()

    # Collect the pieces and join once instead of growing a string per character.
    pieces = []
    for ch in text:
      if self._is_punctuation(ch) or self._is_cjk_character(ch):
        # Surround with spaces so the character becomes its own word.
        pieces.append(' ' + ch + ' ')
      elif self._is_space(ch):
        pieces.append(' ')
      elif ord(ch) == 0 or ord(ch) == 0xfffd or self._is_control(ch):
        continue
      else:
        pieces.append(ch)
    spaced = ''.join(pieces)

    # Created outside the character loop so that empty input (or input made only
    # of dropped characters) returns [] instead of raising UnboundLocalError.
    tokens = []
    for word in spaced.strip().split():
      tokens += self._word_piece_tokenize(word)
    return tokens

In [8]:
# The vocabulary comes from the *cased* multilingual checkpoint (multi_cased_L-12_H-768_A-12),
# so the tokenizer must not lower-case its input; keras_bert's default is cased=False.
tokenizer = inherit_Tokenizer(token_dict, cased=True)

In [30]:
def convert_data(data_df):
  """Encode every sentence of ``data_df`` into model inputs and collect the labels.

  Args:
    data_df: DataFrame with the text in ``DATA_COLUMN`` and the label in
      ``LABEL_COLUMN``. Rows are read in positional order, so the index does
      not need to be a fresh 0..n-1 range, and an empty frame is accepted.

  Returns:
    A pair ``([token_ids, segment_ids], targets)``. ``token_ids`` is the array
    of ids returned by ``tokenizer.encode(..., max_len=MAX_SEQ_LEN)``,
    ``segment_ids`` is an all-zero array of the same shape, and ``targets`` is
    the label column as an array.
  """
  indices = []
  # Iterate over the column values directly instead of looking rows up by label.
  for sentence in tqdm(data_df[DATA_COLUMN]):
    # encode() also returns segment ids; they are replaced by zeros below.
    ids, _ = tokenizer.encode(sentence, max_len=MAX_SEQ_LEN)
    indices.append(ids)

  indices = np.array(indices)
  targets = np.array(data_df[LABEL_COLUMN])
  return [indices, np.zeros_like(indices)], targets

def load_data(pandas_dataframe):
  """Prepare a DataFrame and convert it into model inputs and targets.

  Args:
    pandas_dataframe: DataFrame containing ``DATA_COLUMN`` and ``LABEL_COLUMN``.
      It is left unmodified.

  Returns:
    The ``(data_x, data_y)`` pair produced by ``convert_data``.
  """
  # Build a derived frame instead of overwriting a column on the caller's object,
  # and renumber rows from 0 so positional and label-based access agree.
  data_df = pandas_dataframe.assign(
      **{DATA_COLUMN: pandas_dataframe[DATA_COLUMN].astype(str)}
  ).reset_index(drop=True)

  data_x, data_y = convert_data(data_df)
  return data_x, data_y

In [31]:
# Encode the training split into model inputs (train_x) and label targets (train_y).
train_x, train_y = load_data(pandas_dataframe=train_set)

100%|██████████| 31931/31931 [00:02<00:00, 15603.03it/s]


In [32]:
def sentence_convert_data(data, verbose=True):
  """Encode raw sentences into model inputs (no labels).

  Args:
    data: Iterable of sentence strings, e.g. a list.
    verbose: When true (the default), print each sentence's tokens as returned
      by ``tokenizer.tokenize``. Pass ``False`` to keep the output quiet.

  Returns:
    ``[token_ids, segment_ids]`` where ``token_ids`` is the array of ids from
    ``tokenizer.encode(..., max_len=MAX_SEQ_LEN)`` and ``segment_ids`` is an
    all-zero array of the same shape.
  """
  indices = []
  for sentence in tqdm(data):
    if verbose:
      print(tokenizer.tokenize(sentence))
    # Use MAX_SEQ_LEN, the constant defined in this notebook; the previously
    # referenced SEQ_LEN does not exist and raised NameError.
    ids, _ = tokenizer.encode(sentence, max_len=MAX_SEQ_LEN)
    indices.append(ids)

  indices = np.array(indices)
  return [indices, np.zeros_like(indices)]

def sentence_load_data(sentences):
  """Convert raw sentences into model inputs.

  Args:
    sentences: The input sentences, given as a list.

  Returns:
    Whatever ``sentence_convert_data`` returns for ``sentences``.
  """
  return sentence_convert_data(sentences)

In [35]:
# Not referenced by any other visible cell.
layer_num = 12

# Load the pretrained checkpoint in training mode with trainable weights,
# using MAX_SEQ_LEN as the sequence length.
model = load_trained_model_from_checkpoint(
    config_path, checkpoint_path,
    training=True, trainable=True, seq_len=MAX_SEQ_LEN,
)

In [43]:
def get_bert_finetuning_model(model, learning_rate=None):
  """Attach a single-unit sigmoid head to a loaded BERT model and compile it.

  Args:
    model: The Keras model returned by ``load_trained_model_from_checkpoint``.
      Its first two inputs are reused, and the output of ``model.layers[-3]``
      feeds the new head.
    learning_rate: Learning rate for the Adam optimizer. When omitted, the
      notebook-level ``LR`` constant is used.

  Returns:
    A compiled ``tf.keras`` model using binary cross-entropy loss and the
    accuracy metric.
  """
  if learning_rate is None:
    learning_rate = LR

  inputs = model.inputs[:2]
  # Skip the last two layers and attach the new output layer to the one before them.
  dense = model.layers[-3].output

  outputs = tf.keras.layers.Dense(
      1,
      activation='sigmoid',
      kernel_initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02),
      name='real_output',
  )(dense)

  bert_model = tf.keras.models.Model(inputs, outputs)
  # `learning_rate` is the supported keyword; `lr` is a deprecated alias.
  optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
  bert_model.compile(
      optimizer=optimizer,
      loss='binary_crossentropy',
      metrics=['accuracy'],
  )

  return bert_model

In [44]:
# Build the classifier on top of the loaded BERT model, then train it on the encoded training split.
bert_model = get_bert_finetuning_model(model)
history = bert_model.fit(
    x=train_x,
    y=train_y,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    shuffle=True,
    verbose=1,
)

Epoch 1/10
  24/1996 [..............................] - ETA: 1:47:53 - loss: 0.5638 - accuracy: 0.8051

KeyboardInterrupt: 