# BERT 사전학습 모델을 만들기 위해 필요한 것들 로드

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
!pip install sentencepiece
!pip install bert-tensorflow
# !git clone https://github.com/google-research/bert

fatal: destination path 'bert' already exists and is not an empty directory.


In [3]:
import os
import sys
import json
import nltk
from pathlib import Path
import numpy as np
import pandas as pd
import re
import random
import logging
import tensorflow as tf
import sentencepiece as spm

from glob import glob
# from google.colab import auth, drive
from tensorflow.keras.utils import Progbar

sys.path.append("bert")

from bert import modeling, optimization, tokenization
from bert.run_pretraining import input_fn_builder, model_fn_builder

# auth.authenticate_user()




In [4]:
# Route the 'tensorflow' logger through one timestamped stream handler.
log = logging.getLogger('tensorflow')
log.setLevel(logging.INFO)

formatter = logging.Formatter('%(asctime)s :  %(message)s')
sh = logging.StreamHandler()
sh.setFormatter(formatter)
sh.setLevel(logging.INFO)
# Replace (rather than append to) whatever handlers were attached before.
log.handlers = [sh]

# Make only the first physical GPU visible to TensorFlow, if there is one.
gpus = tf.config.experimental.list_physical_devices('GPU')
if not gpus:
    print('No GPU detected')
else:
    try:
        tf.config.experimental.set_visible_devices(gpus[0], 'GPU')
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPU")
    except RuntimeError as e:
        # Visible devices must be set before GPUs have been initialized
        print(e)

1 Physical GPUs, 1 Logical GPU


# 학습데이터 로드

In [5]:
# Location of the competition data.
data_dir = Path('../data/dacon-novel-author-classification')

# Output folders (features, validation predictions, test predictions,
# submissions); each one is created if it does not exist yet.
dirs = [Path(p) for p in ('../build/feature', '../build/val', '../build/tst', '../build/sub')]
feature_dir, val_dir, tst_dir, sub_dir = dirs
for d in dirs:
    d.mkdir(parents=True, exist_ok=True)

# Input files inside data_dir.
sample_file = data_dir / 'sample_submission.csv'
tst_file = data_dir / 'test_x.csv'
trn_file = data_dir / 'train.csv'

# Experiment constants.
target_col = 'author'
n_fold, n_class = 5, 5
seed = 2020

In [6]:
# Identifiers from which this experiment's artefact names are derived.
algo_name, feature_name = 'mta', 'emb'
model_name = f'{algo_name}_{feature_name}'

# Prediction and submission files are keyed by model name, the feature file
# by feature name.
sub_file = sub_dir / f'{model_name}.csv'
p_tst_file = tst_dir / f'{model_name}.tst.csv'
p_val_file = val_dir / f'{model_name}.val.csv'
feature_file = feature_dir / f'{feature_name}.csv'

In [7]:
# Read the training CSV, using its first column as the index, and preview the
# first rows.
train = pd.read_csv(trn_file, index_col=0)
train.head()

Unnamed: 0_level_0,text,author
index,Unnamed: 1_level_1,Unnamed: 2_level_1
0,"He was almost choking. There was so much, so m...",3
1,"“Your sister asked for it, I suppose?”",2
2,"She was engaged one day as she walked, in per...",1
3,"The captain was in the porch, keeping himself ...",4
4,"“Have mercy, gentlemen!” odin flung up his han...",3


In [8]:
# Read the test CSV, using its first column as the index, and preview the
# first rows.
test = pd.read_csv(tst_file, index_col=0)
test.head()

Unnamed: 0_level_0,text
index,Unnamed: 1_level_1
0,“Not at all. I think she is one of the most ch...
1,"""No,"" replied he, with sudden consciousness, ""..."
2,As the lady had stated her intention of scream...
3,“And then suddenly in the silence I heard a so...
4,His conviction remained unchanged. So far as I...


In [9]:
def alpha_num(text):
    """Drop every character that is not an ASCII letter, a digit, or a space.

    Removed characters are deleted outright, not replaced by a space, so words
    separated only by punctuation or a newline end up joined together.
    """
    cleaned = re.sub(r'[^A-Za-z0-9 ]', '', text)
    return cleaned

In [10]:
# Lower-case the text and strip everything but letters, digits and spaces.
# NOTE: this overwrites train['text'], so the raw text shown by the earlier
# train.head() is no longer available from `train` after this cell.
train['text'] = train['text'].str.lower().apply(alpha_num)

In [11]:
# Dump the cleaned training texts to dataset.txt as UTF-8 strings; this file
# is the raw corpus read by the normalization step below.
np.savetxt('dataset.txt', train['text'], fmt='%s', encoding='utf-8')

In [12]:
# Tokenizer that extracts runs of word characters. The pattern is a raw string
# so that "\w" is not parsed as an (invalid) string escape sequence, which
# newer Python versions warn about.
regex_tokenizer = nltk.RegexpTokenizer(r"\w+")

def normalize_text(text):
  """Return `text` lower-cased and reduced to space-separated word tokens.

  The input is converted with str(), lower-cased, round-tripped through UTF-8
  (characters that cannot be encoded are dropped), then split with the
  module-level `regex_tokenizer` and re-joined with single spaces.
  """
  lowered = str(text).lower()
  # characters that cannot be encoded as UTF-8 are silently discarded
  encodable = lowered.encode("utf-8", "ignore").decode()
  # tokenizing and re-joining removes punctuation and collapses whitespace
  return " ".join(regex_tokenizer.tokenize(encodable))

def count_lines(filename, encoding="utf-8"):
  """Count the lines of a text file.

  Args:
    filename: path of the file to read.
    encoding: text encoding used to decode the file. Defaults to UTF-8, the
      encoding the notebook uses when writing and re-reading its corpus files,
      instead of the platform-dependent locale default.

  Returns:
    The number of lines in the file (0 for an empty file).
  """
  with open(filename, encoding=encoding) as fi:
    return sum(1 for _ in fi)

In [13]:
RAW_DATA_FPATH = "dataset.txt" #@param {type: "string"}
PRC_DATA_FPATH = "proc_dataset.txt" #@param {type: "string"}

# Normalize the raw corpus line by line into PRC_DATA_FPATH. The progress bar
# is sized to the number of input lines and advanced once per written line.
total_lines = count_lines(RAW_DATA_FPATH)
bar = Progbar(total_lines)

with open(RAW_DATA_FPATH, encoding="utf-8") as fi, \
     open(PRC_DATA_FPATH, "w", encoding="utf-8") as fo:
  for raw_line in fi:
    fo.write(normalize_text(raw_line) + "\n")
    bar.add(1)



In [14]:
MODEL_PREFIX = "tokenizer" #@param {type: "string"}
VOC_SIZE = 23465 #@param {type:"integer"}
SUBSAMPLE_SIZE = 12800000 #@param {type:"integer"}
NUM_PLACEHOLDERS = 256 #@param {type:"integer"}

# Flag template for the SentencePiece trainer. The trainer is asked for
# VOC_SIZE - NUM_PLACEHOLDERS pieces, leaving room in the final vocabulary
# for entries that are added to it afterwards.
spm_flags = ('--input={} --model_prefix={} '
             '--vocab_size={} --input_sentence_size={} '
             '--shuffle_input_sentence=true '
             '--bos_id=-1 --eos_id=-1')
SPM_COMMAND = spm_flags.format(
    PRC_DATA_FPATH,
    MODEL_PREFIX,
    VOC_SIZE - NUM_PLACEHOLDERS,
    SUBSAMPLE_SIZE,
)

spm.SentencePieceTrainer.Train(SPM_COMMAND)

In [15]:
def read_sentencepiece_vocab(filepath):
  """Return the pieces listed in a SentencePiece vocabulary file.

  Every line is split on tabs and only its first field is kept. The first
  line of the file (the <unk> entry) is left out of the result.
  """
  with open(filepath, encoding='utf-8') as fi:
    pieces = [row.split("\t")[0] for row in fi]
  # skip the first <unk> token
  return pieces[1:]

# Load the pieces learnt by SentencePiece from "<MODEL_PREFIX>.vocab" and
# report how many there are, plus ten randomly drawn examples.
# NOTE(review): no random.seed(...) call is visible before this point, so the
# printed sample presumably differs between runs — confirm if that matters.
snt_vocab = read_sentencepiece_vocab("{}.vocab".format(MODEL_PREFIX))
print("Learnt vocab size: {}".format(len(snt_vocab)))
print("Sample tokens: {}".format(random.sample(snt_vocab, 10)))

Learnt vocab size: 23208
Sample tokens: ['▁load', '▁murderi', '▁rampage', 'ourier', 'hip', '▁hobb', '▁relaps', '▁recurrence', '▁has', '▁leg']


In [16]:
def parse_sentencepiece_token(token):
    """Convert one SentencePiece piece to the "##" continuation convention.

    A piece that starts with "▁" is returned without that marker; any other
    piece is returned prefixed with "##". A piece consisting only of "▁"
    therefore becomes the empty string.
    """
    is_word_start = token.startswith("▁")
    return token[1:] if is_word_start else "##" + token

# Control symbols come first, followed by every learnt piece converted with
# parse_sentencepiece_token.
ctrl_symbols = ["[PAD]","[UNK]","[CLS]","[SEP]","[MASK]"]
bert_vocab = ctrl_symbols + [parse_sentencepiece_token(piece) for piece in snt_vocab]

In [17]:
# Spot-check ten randomly drawn entries of the converted vocabulary.
random.sample(bert_vocab,10)

['evenings',
 'hobble',
 'carcase',
 'mothers',
 'atrocit',
 '##utton',
 '##studi',
 'briton',
 'shov',
 '##eg']

In [18]:
# Append "[UNUSED_i]" placeholders until the vocabulary holds VOC_SIZE entries;
# nothing is appended when it is already that long (or longer).
n_missing = VOC_SIZE - len(bert_vocab)
bert_vocab.extend("[UNUSED_{}]".format(i) for i in range(n_missing))
print(len(bert_vocab))

23465


In [19]:
VOC_FNAME = "vocab.txt" #@param {type:"string"}

# Write the vocabulary one token per line. The encoding is pinned to UTF-8 so
# the file does not depend on the platform's locale default and matches the
# UTF-8 reads and writes used for the other corpus files in this notebook.
with open(VOC_FNAME, "w", encoding="utf-8") as fo:
  for token in bert_vocab:
    fo.write(token+"\n")

In [20]:
!mkdir ./shards
!split -a 4 -l 256000 -d $PRC_DATA_FPATH ./shards/shard_
!ls ./shards/

mkdir: cannot create directory ‘./shards’: File exists
shard_0000


In [21]:
# Settings for building the pre-training data. They are substituted into the
# create_pretraining_data.py command assembled in the following cell
# (--max_seq_length, --masked_lm_prob, --max_predictions_per_seq,
# --do_lower_case); PROCESSES is the xargs -P value and PRETRAINING_DIR the
# output folder for the .tfrecord files.
MAX_SEQ_LENGTH = 128 #@param {type:"integer"}
MASKED_LM_PROB = 0.15 #@param
MAX_PREDICTIONS = 20 #@param {type:"integer"}
DO_LOWER_CASE = True #@param {type:"boolean"}
PROCESSES = 4 #@param {type:"integer"}
PRETRAINING_DIR = "pretraining_data" #@param {type:"string"}


In [22]:
# NOTE: everything below is wrapped in a string literal, so none of it runs;
# the cell merely evaluates to that string. The wrapped code assembles an
# xargs pipeline that runs bert/create_pretraining_data.py once per file in
# ./shards and writes <PRETRAINING_DIR>/<shard>.tfrecord for each of them.
# The training-config cell further down globs PRETRAINING_DIR for "*tfrecord",
# so those files must already exist while this cell stays disabled.
"""
XARGS_CMD = ("ls ./shards/ | "
              "xargs -n 1 -P {} -I{} "
              "python3 bert/create_pretraining_data.py "
              "--input_file=./shards/{} "
              "--output_file={}/{}.tfrecord "
              "--vocab_file={} "
              "--do_lower_case={} "
              "--max_predictions_per_seq={} "
              "--max_seq_length={} "
              "--masked_lm_prob={} "
              "--random_seed=34 "
              "--dupe_factor=5")

XARGS_CMD = XARGS_CMD.format(PROCESSES, '{}', '{}', PRETRAINING_DIR, '{}', 
                              VOC_FNAME, DO_LOWER_CASE, 
                              MAX_PREDICTIONS, MAX_SEQ_LENGTH, MASKED_LM_PROB)

tf.io.gfile.mkdir(PRETRAINING_DIR)
!$XARGS_CMD
"""

'\nXARGS_CMD = ("ls ./shards/ | "\n              "xargs -n 1 -P {} -I{} "\n              "python3 bert/create_pretraining_data.py "\n              "--input_file=./shards/{} "\n              "--output_file={}/{}.tfrecord "\n              "--vocab_file={} "\n              "--do_lower_case={} "\n              "--max_predictions_per_seq={} "\n              "--max_seq_length={} "\n              "--masked_lm_prob={} "\n              "--random_seed=34 "\n              "--dupe_factor=5")\n\nXARGS_CMD = XARGS_CMD.format(PROCESSES, \'{}\', \'{}\', PRETRAINING_DIR, \'{}\', \n                              VOC_FNAME, DO_LOWER_CASE, \n                              MAX_PREDICTIONS, MAX_SEQ_LENGTH, MASKED_LM_PROB)\n\ntf.io.gfile.mkdir(PRETRAINING_DIR)\n!$XARGS_CMD\n'

In [23]:
# Folder that receives the BERT config, vocabulary and checkpoints.
# NOTE(review): BUCKET_NAME is only referenced by commented-out GCS path code
# in the visible cells — confirm whether it is still needed.
BUCKET_NAME = "test" #@param {type:"string"}
MODEL_DIR = "bert_model" #@param {type:"string"}
tf.io.gfile.mkdir(MODEL_DIR)

In [24]:
# Hyper-parameters of the small BERT model, written to bert_config.json.
# vocab_size is VOC_SIZE, the length the vocabulary list was padded to.
bert_mini_config = {
  "attention_probs_dropout_prob": 0.1,
  "directionality": "bidi",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size":256,
  "initializer_range": 0.02,
  "intermediate_size": 1024,
  "max_position_embeddings": 512,
  "num_attention_heads": 4,
  "num_hidden_layers": 4,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "type_vocab_size": 2,
  "vocab_size": VOC_SIZE
}

# Both files are written with an explicit UTF-8 encoding so that their content
# does not depend on the platform's locale default.
with open("{}/bert_config.json".format(MODEL_DIR), "w", encoding="utf-8") as fo:
  json.dump(bert_mini_config, fo, indent=2)

# Store a copy of the vocabulary (one token per line) next to the config.
with open("{}/{}".format(MODEL_DIR, VOC_FNAME), "w", encoding="utf-8") as fo:
  for token in bert_vocab:
    fo.write(token+"\n")

In [25]:
# Training configuration. MODEL_DIR, PRETRAINING_DIR, VOC_FNAME,
# MAX_PREDICTIONS, MAX_SEQ_LENGTH and MASKED_LM_PROB are re-declared here with
# the same values as in earlier cells; keep both places in sync when editing.
# BUCKET_NAME is only used by the commented-out GCS path logic below.
BUCKET_NAME = "beomi-blog-sample" #@param {type:"string"}
MODEL_DIR = "bert_model" #@param {type:"string"}
PRETRAINING_DIR = "pretraining_data" #@param {type:"string"}
VOC_FNAME = "vocab.txt" #@param {type:"string"}

# Input data pipeline config
TRAIN_BATCH_SIZE = 32 #@param {type:"integer"}
MAX_PREDICTIONS = 20 #@param {type:"integer"}
MAX_SEQ_LENGTH = 128 #@param {type:"integer"}
MASKED_LM_PROB = 0.15 #@param

# Training procedure config
EVAL_BATCH_SIZE = 16
LEARNING_RATE = 2e-5
TRAIN_STEPS = 1000000 #@param {type:"integer"}
SAVE_CHECKPOINTS_STEPS = 2500 #@param {type:"integer"}
# NUM_TPU_CORES = 8

# if BUCKET_NAME:
#   BUCKET_PATH = "gs://{}".format(BUCKET_NAME)
# else:
#   BUCKET_PATH = "."

# BERT_GCS_DIR = "{}/{}".format(BUCKET_PATH, MODEL_DIR)
# DATA_GCS_DIR = "{}/{}".format(BUCKET_PATH, PRETRAINING_DIR)

# Despite the "GCS" in their names, both directories are local relative paths.
BERT_GCS_DIR = "./{}".format(MODEL_DIR)
DATA_GCS_DIR = "./{}".format(PRETRAINING_DIR)

VOCAB_FILE = os.path.join(BERT_GCS_DIR, VOC_FNAME)
CONFIG_FILE = os.path.join(BERT_GCS_DIR, "bert_config.json")

# Most recent checkpoint found in the model directory, if any.
INIT_CHECKPOINT = tf.train.latest_checkpoint(BERT_GCS_DIR)

bert_config = modeling.BertConfig.from_json_file(CONFIG_FILE)
# Every file in the data directory whose name ends in "tfrecord".
input_files = tf.io.gfile.glob(os.path.join(DATA_GCS_DIR,'*tfrecord'))

log.info("Using checkpoint: {}".format(INIT_CHECKPOINT))
log.info("Using {} data shards".format(len(input_files)))

2020-11-30 07:29:08,903 :  From /tf/LunchPlay/pretraeind_bert_model/test_notebook/bert/modeling.py:93: The name tf.gfile.GFile is deprecated. Please use tf.io.gfile.GFile instead.

2020-11-30 07:29:08,905 :  Using checkpoint: ./bert_model/model.ckpt-1000000
2020-11-30 07:29:08,906 :  Using 1 data shards


In [26]:
# Build the pre-training model function from the BERT config, the initial
# checkpoint and the optimisation schedule, with TPU use switched off.
# NOTE(review): num_warmup_steps is hard-coded to 10 while num_train_steps is
# TRAIN_STEPS — confirm that such a short warm-up is intended.
model_fn = model_fn_builder(
      bert_config=bert_config,
      init_checkpoint=INIT_CHECKPOINT,
      learning_rate=LEARNING_RATE,
      num_train_steps=TRAIN_STEPS,
      num_warmup_steps=10,
      use_tpu=False,
      use_one_hot_embeddings=True)

# tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(TPU_ADDRESS)


# The estimator's model_dir is BERT_GCS_DIR, the same directory
# INIT_CHECKPOINT was looked up in.
run_config = tf.contrib.tpu.RunConfig(
    model_dir=BERT_GCS_DIR,
    save_checkpoints_steps=SAVE_CHECKPOINTS_STEPS,)

# TPUEstimator is used with use_tpu=False.
estimator = tf.contrib.tpu.TPUEstimator(
    use_tpu=False,
    model_fn=model_fn,
    config=run_config,
    train_batch_size=TRAIN_BATCH_SIZE,
    eval_batch_size=EVAL_BATCH_SIZE)
  
# Training input pipeline over the .tfrecord files found in DATA_GCS_DIR.
train_input_fn = input_fn_builder(
        input_files=input_files,
        max_seq_length=MAX_SEQ_LENGTH,
        max_predictions_per_seq=MAX_PREDICTIONS,
        is_training=True)

# Start training!
# max_steps is the same TRAIN_STEPS value given to model_fn_builder above.
estimator.train(input_fn=train_input_fn, max_steps=TRAIN_STEPS)

2020-11-30 07:29:08,959 :  
The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.

2020-11-30 07:29:08,961 :  Estimator's model_fn (<function model_fn_builder.<locals>.model_fn at 0x7f58fc5ad0d0>) includes params argument, but params are not passed to Estimator.
2020-11-30 07:29:08,964 :  Using config: {'_model_dir': './bert_model', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': 2500, '_save_checkpoints_secs': None, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': None, '_train_distri

<tensorflow_estimator.python.estimator.tpu.tpu_estimator.TPUEstimator at 0x7f58fc66c1d0>