In [1]:
# %load_ext tensorboard

In [40]:
import numpy as np
import tensorflow as tf
from copy import deepcopy

import os

import datetime

import time

import import_ipynb
from QBert import train_utils, models

from tqdm.notebook import tqdm



## 해당 파일은 bert.run_pretraining.run_bert_pretrain을 구현하는 것을 목표로 한다.

 - Parameter는 FLAG 형식에서 직접 정의해주는 방식으로 변경하고, Main에서 직접 정의하도록 한다.

In [42]:
from transformers import BertTokenizerFast
tokenizer_for_load = BertTokenizerFast.from_pretrained('./model/BertTokenizer-3000-32000-vocab.txt'
                                                   , strip_accents=False
                                                   , lowercase=False)

Calling BertTokenizerFast.from_pretrained() with the path to a single file or url is deprecated


## Bert_Config

In [43]:
# sub_model (core_model 필요 Config)

vocab_size = 32000 # 
hidden_size = 512 # Transformer hidden Layers
type_vocab_size = 2 #: The number of types that the 'type_ids' input can take.
num_layers = 3
num_attention_heads = 8
max_seq_length = 256 # 512
dropout_rate = .1
# attention_dropout_rate = .1
inner_dim = 512 * 4
# hidden_act = 'gelu'
initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02)

# Pretrain Model 필요 Config
max_predictions_per_seq = 40


## Input_Files

In [38]:
[ x for x in os.listdir('.') if x.startswith("Q_Bert") & x.endswith('.tfrecords') ] 

['Q_Bert00.tfrecords', 'Q_Bert01.tfrecords']

In [None]:
while len([ x for x in os.listdir('.') if x.startswith("Q_Bert") & x.endswith('.tfrecords') ] ) != 10 :
    time.sleep(600)
    
time.sleep(600)

In [55]:
filenames = ['./Q_Bert{}.tfrecords'.format(str(x).zfill(2)) for x in range(10)]
filenames = ['./Test_Examples3.tfrecords']
# Create a description of the features.
feature_description = {
    'input_ids': tf.io.FixedLenFeature([max_seq_length], tf.int64),
    'segment_ids': tf.io.FixedLenFeature([max_seq_length], tf.int64),
    'input_mask': tf.io.FixedLenFeature([max_seq_length], tf.int64),
    'masked_lm_positions': tf.io.FixedLenFeature([max_predictions_per_seq], tf.int64),
    'masked_lm_ids': tf.io.FixedLenFeature([max_predictions_per_seq], tf.int64),
    'masked_lm_weights': tf.io.FixedLenFeature([max_predictions_per_seq], tf.float32),
    'next_sentence_labels': tf.io.FixedLenFeature([1], tf.int64),
}

# keys = feature_description.keys()

In [56]:
def _parse_function(example_proto):
  # Parse the input `tf.train.Example` proto using the dictionary above.
  return tf.io.parse_single_example(example_proto, feature_description)

def _select_data_from_record(record):
    """Filter out features to use for pretraining."""
    x = {
        'input_ids': record['input_ids'],
        'input_mask': record['input_mask'],
        'segment_ids': record['segment_ids'],
        'masked_lm_positions': record['masked_lm_positions'],
        'masked_lm_ids': record['masked_lm_ids'],
        'masked_lm_weights': record['masked_lm_weights'],
    }
    if use_next_sentence_label:
        x['next_sentence_labels'] = record['next_sentence_labels']
    if use_position_id:
        x['position_ids'] = record['position_ids']

    # TODO(hongkuny): Remove the fake labels after migrating bert pretraining.
    if output_fake_labels:
        return (x, record['masked_lm_weights'])
    else:
        return x


In [57]:

BUFFER_SIZE = 10000

GLOBAL_BATCH_SIZE = 16
# BATCH_SIZE_PER_REPLICA = np.ceil(GLOBAL_BATCH_SIZE // strategy.num_replicas_in_sync)

use_next_sentence_label = True
output_fake_labels = True
use_position_id = False


In [58]:


train_dataset = tf.data.Dataset.from_tensor_slices(filenames)
train_dataset = train_dataset.interleave(tf.data.TFRecordDataset
                                         , cycle_length = tf.data.experimental.AUTOTUNE
                                         , num_parallel_calls = tf.data.experimental.AUTOTUNE)
dataset_inputs = train_dataset.map(_parse_function,
                                   num_parallel_calls=tf.data.experimental.AUTOTUNE) # String to Example
dataset_inputs_with_labels = dataset_inputs.map(_select_data_from_record,
                                                num_parallel_calls=tf.data.experimental.AUTOTUNE) # Example to InputData
## 본래대로라면 그냥 써도 되지만, 현재 Label이 없는 데이터이기 때문에
## max_predictions_per_seq 길이의 허위 정답 (Fake_y)를 삽입하는 mapping function이다.

dataset = dataset_inputs_with_labels
dataset = dataset.cache()
dataset = dataset.repeat()
dataset = dataset.shuffle(10000, reshuffle_each_iteration = True)
dataset = dataset.batch(GLOBAL_BATCH_SIZE)
dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)


## Traning Config

In [65]:
# callback

## model checkpoint
t = datetime.datetime.now().strftime("%Y%m%d%H%M%S")

checkpoint_dir = './training_checkpoints_{}'.format(t)
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

model_cp = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_prefix, save_weights_only=True),

## Learning Rate Print


callbacks = [
    model_cp
]

In [60]:
# optimizer

## Learning Rate Decay

# lr = 1e-4 warmup stage (step <= 10000)
# Decay linearly

init_lr = 1e-3
warmup_steps = 10000
num_train_steps = 1000000
end_lr = 0

lr_schedule = tf.keras.optimizers.schedules.PolynomialDecay(
      initial_learning_rate=init_lr,
      decay_steps=num_train_steps,
      end_learning_rate=end_lr)

lr_schedule = train_utils.WarmUp(
        initial_learning_rate=init_lr,
        decay_schedule_fn=lr_schedule,
        warmup_steps=warmup_steps)

optimizer = train_utils.AdamWeightDecay( 
    learning_rate=lr_schedule,
    weight_decay_rate=0.01,
    beta_1=0.9,
    beta_2=0.999,
    epsilon=1e-6,
    exclude_from_weight_decay=['LayerNorm', 'layer_norm', 'bias'])


# optimizer = tf.keras.optimizers.Adam(lr = init_lr, beta_1 = 0.9, beta_2 = 0.999, epsilon=1e-6)

In [61]:
# loss
def loss_fn(fake_y, losses, **unused_args) :
    
    return tf.reduce_mean(losses, axis = -1)
    

## Model Define

In [62]:
model, sub_model = models.get_bert_models_fn(vocab_size
                                         , hidden_size
                                         , type_vocab_size
                                         , num_layers
                                         , num_attention_heads
                                         , max_seq_length
                                         , max_predictions_per_seq
                                         , dropout_rate
                                         , inner_dim 
                                         , initializer)
model.compile(optimizer, loss=loss_fn)

# 훈련

In [None]:
hist = model.fit(dataset.repeat(),
                  epochs=2, #40,
                  callbacks=callbacks,
                  steps_per_epoch = 1)#100000)

Epoch 1/2
Epoch 2/2

In [32]:
print("======MLM TEST======")
inputs, labels = next(iter(dataset))
sub_res = model.layers[4]([inputs['input_ids'], inputs['input_mask'][:, tf.newaxis, tf.newaxis, :], inputs['segment_ids']])

layer_x = model.layers[6]
layer_x._output_type = 'predicitions'
output_logits = layer_x([sub_res['sequence_output'], tf.cast(inputs['masked_lm_positions'], dtype = tf.int32)])

prediction = tf.math.exp(output_logits)
print(tokenizer_for_load.convert_ids_to_tokens(tf.argmax(output_logits[0], axis = 1)))
print(tokenizer_for_load.convert_ids_to_tokens(inputs['masked_lm_ids'][0]))

lm_labels = inputs['masked_lm_ids'][0]
lm_output = output_logits[0]
lm_label_weights = inputs['masked_lm_weights'][0]

masked_lm_accuracy = tf.keras.metrics.sparse_categorical_accuracy(lm_labels, lm_output)
numerator = tf.reduce_sum(masked_lm_accuracy * lm_label_weights)
denominator = tf.reduce_sum(lm_label_weights) + 1e-5
masked_lm_accuracy = numerator / denominator

print("REAL_ACC : {}".format(masked_lm_accuracy))

['기자회', '##지게', ',', '전', '라고', 'william', '시나리오', '##는', '##하다가', 'william', '관한', '##권에', '오리지널', '##와', '그의', 'william', '단지', '##에', '(', '그의', '2012년', '퍼져', '널리', '"', '라는', '##있는', 'william', 'william', 'william', 'william', 'william', 'william', 'william', 'william', 'william', 'william', 'william', 'william', 'william', 'william']
['기자회', '##지게', '##et', '전', '라고', '깁', ')', '##는', '##들이', 'william', '##ance', 'ne', '##ir', '##와', '##작인', '##d', '단지', '##견', '(', '그의', '2012년', '퍼져', '널리', '"', '라는', '##있는', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]']
REAL_ACC : 0.6153843998908997
