In [19]:
# %load_ext tensorboard

In [20]:
import numpy as np
import tensorflow as tf
from copy import deepcopy

import os

import datetime

import import_ipynb
from QBert import train_utils, models

from tqdm.notebook import tqdm



## 해당 파일은 bert.run_pretraining.run_bert_pretrain을 구현하는 것을 목표로 한다.

 - Parameter는 FLAG 형식에서 직접 정의해주는 방식으로 변경하고, Main에서 직접 정의하도록 한다.

## Model Test 

In [3]:
# Configuration required by sub_model (the core encoder model).
# These values are passed positionally to models.get_bert_models_fn further below.

vocab_size = 32000 # vocabulary size handed to the model builder
hidden_size = 768 # Transformer hidden Layers
type_vocab_size = 12 #: The number of types that the 'type_ids' input can take.
num_layers = 12
num_attention_heads = 12
max_seq_length = 256 # 512 (alternative value); also the fixed length of the parsed token features
dropout_rate = .1
# attention_dropout_rate = .1
inner_dim = 3072 # presumably the feed-forward inner width -- TODO confirm against models.get_bert_models_fn
# hidden_act = 'gelu'
initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02)

# Configuration required by the pretraining model.
# Also the fixed length of the parsed masked_lm_* features.
max_predictions_per_seq = 40


In [4]:
# The model predicts its own loss regardless of the labels, so this loss
# function only averages the values the model produced.

def loss_fn(fake_y, losses, **unused_args):
    """Average the model-produced losses along the last axis.

    Args:
        fake_y: Placeholder labels; never read.
        losses: Loss values emitted by the model.
        **unused_args: Accepted for call compatibility and ignored.

    Returns:
        The result of ``tf.reduce_mean(losses, axis=-1)``.
    """
    # NOTE(review): the reduction runs over the last axis; confirm that this is
    # the batch axis for the shape the pretraining model actually outputs.
    averaged_loss = tf.reduce_mean(losses, axis=-1)
    return averaged_loss
    

In [5]:


# Schema used to decode each serialized tf.train.Example, built group by group.
# Token-level features: int64 vectors of length max_seq_length.
feature_description = {
    feature_name: tf.io.FixedLenFeature([max_seq_length], tf.int64)
    for feature_name in ('input_ids', 'segment_ids', 'input_mask')
}
# Masked-LM features: vectors of length max_predictions_per_seq.
feature_description.update({
    feature_name: tf.io.FixedLenFeature([max_predictions_per_seq], feature_dtype)
    for feature_name, feature_dtype in (
        ('masked_lm_positions', tf.int64),
        ('masked_lm_ids', tf.int64),
        ('masked_lm_weights', tf.float32),
    )
})
# One int64 next-sentence label per example.
feature_description['next_sentence_labels'] = tf.io.FixedLenFeature([1], tf.int64)

# keys = feature_description.keys()

def _parse_function(example_proto):
    """Decode one serialized ``tf.train.Example``.

    Args:
        example_proto: The serialized example to parse.

    Returns:
        The result of ``tf.io.parse_single_example`` using the module-level
        ``feature_description`` schema.
    """
    parsed_record = tf.io.parse_single_example(example_proto, feature_description)
    return parsed_record

In [6]:
def _select_data_from_record(record):
    """Filter out features to use for pretraining."""
    x = {
        'input_ids': record['input_ids'],
        'input_mask': record['input_mask'],
        'segment_ids': record['segment_ids'],
        'masked_lm_positions': record['masked_lm_positions'],
        'masked_lm_ids': record['masked_lm_ids'],
        'masked_lm_weights': record['masked_lm_weights'],
    }
    if use_next_sentence_label:
        x['next_sentence_labels'] = record['next_sentence_labels']
    if use_position_id:
        x['position_ids'] = record['position_ids']

    # TODO(hongkuny): Remove the fake labels after migrating bert pretraining.
    if output_fake_labels:
        return (x, record['masked_lm_weights'])
    else:
        return x


- dir(dataset)

*'apply'* : 데이터셋 전체에 Function을 적용할 때 사용

*as_numpy_iterator* : 데이터셋의 Element들을 Numpy array로 반환, list(dataset.as_numpy_iterator()) 이렇게 하면 거의 기존에 알던 데이터셋이 나옴

*batch* : BatchSize 크기만큼의 Dataset Iteration을 만듬, drop_remainder를 이용하여 마지막 batch가 batch_size크기가 안되면 Drop시킬 수 있음.

*cache* : Mapping 이후에, cache가 선언된다면 첫번째 Epoch를 하면서 처리했던 mapping을 유지하고 있습니다. 따라서 cache는 시간이 오래걸리지만 Memory를 많이 소비하지 않는 Function 뒤에 쓰면 좋습니다.

 'filter',

'from_generator',
 'from_tensor_slices',
 'from_tensors',
 'interleave',
 'list_files',
 'map',
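*prefetch* : 모델이 현재 Batch를 학습하는 동안 다음 Batch를 미리 준비하여 입력 대기 시간을 줄임. buffer_size에 tf.data.experimental.AUTOTUNE을 주면 크기를 자동으로 조정함.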
 'range',
 'reduce',
 'repeat',
 'shard',
 'shuffle',
 'skip',
 'take',
 'unbatch',
 'window',
 'with_options',
 'zip']

In [7]:
batch_size = 8  # number of examples per batch; used by dataset.batch(batch_size)
lr = 1e-3  # learning-rate setting; presumably intended for the optimizer -- verify usage

In [25]:
# Flags read by _select_data_from_record when it is mapped over the dataset.
use_next_sentence_label = True  # copy 'next_sentence_labels' into the model inputs
output_fake_labels = True       # return (features, fake_labels) tuples
use_position_id = False         # do not look up 'position_ids'

filenames = ['./Test_Examples.tfrecords']

# Turn the file names into a dataset and read the TFRecord files through
# interleave. AUTOTUNE is the named constant for the -1 that was hard-coded
# here before; it lets tf.data choose the cycle length.
train_dataset = tf.data.Dataset.from_tensor_slices(filenames).interleave(
    tf.data.TFRecordDataset,
    cycle_length=tf.data.experimental.AUTOTUNE,
)

# train_dataset = train_dataset.batch(1)

In [9]:
dataset_inputs = train_dataset.map(_parse_function) # serialized string -> parsed feature dict
dataset_inputs_with_labels = dataset_inputs.map(_select_data_from_record) # parsed record -> model input data
## Normally the parsed records could be used as they are, but this data has no labels,
## so this mapping function attaches a fake target (fake_y) of length max_predictions_per_seq.

In [10]:
# Input pipeline: cache the mapped records, shuffle (reshuffled on every
# iteration), batch, and prefetch with an automatically tuned buffer.
dataset = (
    dataset_inputs_with_labels
    .cache()
    .shuffle(10000, reshuffle_each_iteration=True)
    .batch(batch_size)
    .prefetch(tf.data.experimental.AUTOTUNE)
)

In [11]:
#####################################################################################################################
# models.get_bert_models_fn returns two models.
# model : the pretraining model
#  - Takes the inputs and computes everything up to the loss.
# sub_model : the BERT encoder
#  - Given input_ids, input_mask and segment_ids it returns the encoded result.
#  - outputs = ['sequence_output' : outputs of the last encoder layer
#               'hidden_states'   : outputs of every encoder layer
#               'pooled_output'   : a Dense layer applied to the first position of the last encoder layer ]
#####################################################################################################################
model, sub_model = models.get_bert_models_fn(
    vocab_size,
    hidden_size,
    type_vocab_size,
    num_layers,
    num_attention_heads,
    max_seq_length,
    max_predictions_per_seq,
    dropout_rate,
    inner_dim,
    initializer,
)

In [12]:

        
# Build the optimizer with the documented `learning_rate` argument; the `lr`
# keyword is only a deprecated alias and is rejected by newer Keras releases.
# The value comes from the notebook-level `lr` setting (1e-3) instead of a
# second hard-coded literal.
optimizer = tf.keras.optimizers.Adam(learning_rate=lr)

# loss_fn ignores the labels and averages the losses the model itself outputs.
model.compile(optimizer=optimizer, loss=loss_fn)



In [17]:

# Directory in which the training checkpoints are stored.
checkpoint_dir = './training_checkpoints'
# Checkpoint file name pattern; passed as `filepath` to ModelCheckpoint below.
# "{epoch}" is kept as a literal placeholder in this string.
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

# log_dir = "logs/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
# tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1, profile_batch=1)

# Step-wise schedule that lowers the learning rate as training progresses.
# Any custom function of the epoch index can be used here instead.
def decay(epoch):
    """Return the learning rate for the given epoch index.

    Args:
        epoch: Epoch index.

    Returns:
        1e-3 while ``epoch < 3``, 1e-4 while ``epoch < 7``, and 1e-5 afterwards.
    """
    # (exclusive upper bound, learning rate) pairs, checked in ascending order.
    stepped_rates = ((3, 1e-3), (7, 1e-4))
    for upper_bound, rate in stepped_rates:
        if epoch < upper_bound:
            return rate
    return 1e-5

# Callbacks handed to model.fit: save weights-only checkpoints under
# checkpoint_prefix, and apply the `decay` learning-rate schedule.
callbacks = [
    # tensorboard_callback,
    tf.keras.callbacks.ModelCheckpoint(
        filepath=checkpoint_prefix,
        save_weights_only=True,
    ),
    tf.keras.callbacks.LearningRateScheduler(decay),
]

In [18]:
# Train for 5 epochs of 10 steps each and keep the returned history object.
# NOTE(review): the dataset is not repeated; confirm it yields enough batches
# for epochs * steps_per_epoch steps.
hist = model.fit(
    dataset,
    steps_per_epoch=10,
    epochs=5,
    callbacks=callbacks,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


## Strategy

In [21]:
tf.config.list_physical_devices() # check which devices TensorFlow can see

[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'),
 PhysicalDevice(name='/physical_device:XLA_CPU:0', device_type='XLA_CPU'),
 PhysicalDevice(name='/physical_device:XLA_GPU:0', device_type='XLA_GPU')]

In [22]:
# Define the distribution strategy.

distribution_strategy = 'mirrored'  # alternative: 'tpu'
num_gpus = 0
all_reduce_alg = None

# Only the 'tpu' strategy gets an address (an empty string here); every other
# strategy passes None.
tpu_address = "" if distribution_strategy == 'tpu' else None

strategy = train_utils.get_distribution_strategy(
    distribution_strategy=distribution_strategy,
    num_gpus=num_gpus,
    all_reduce_alg=all_reduce_alg,
    tpu_address=tpu_address,
)

# Report how many replicas the strategy keeps in sync.
print('\n장치의 수: {}'.format(strategy.num_replicas_in_sync))

INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:CPU:0',)

장치의 수: 1


## Bert_Config

In [9]:
# Configuration required by sub_model (the core encoder model).
# These values are passed positionally to models.get_bert_models_fn in the model-definition cell.

vocab_size = 32000 # vocabulary size handed to the model builder
hidden_size = 768 # Transformer hidden Layers
type_vocab_size = 12 #: The number of types that the 'type_ids' input can take.
num_layers = 12
num_attention_heads = 12
max_seq_length = 256 # 512 (alternative value); also the fixed length of the parsed token features
dropout_rate = .1
# attention_dropout_rate = .1
inner_dim = 3072 # presumably the feed-forward inner width -- TODO confirm against models.get_bert_models_fn
# hidden_act = 'gelu'
initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02)

# Configuration required by the pretraining model.
# Also the fixed length of the parsed masked_lm_* features.
max_predictions_per_seq = 40


## Input_Files

In [23]:
filenames = ['./Test_Examples.tfrecords']

# Schema used to decode each serialized tf.train.Example, built group by group.
# Token-level features: int64 vectors of length max_seq_length.
feature_description = {
    feature_name: tf.io.FixedLenFeature([max_seq_length], tf.int64)
    for feature_name in ('input_ids', 'segment_ids', 'input_mask')
}
# Masked-LM features: vectors of length max_predictions_per_seq.
feature_description.update({
    feature_name: tf.io.FixedLenFeature([max_predictions_per_seq], feature_dtype)
    for feature_name, feature_dtype in (
        ('masked_lm_positions', tf.int64),
        ('masked_lm_ids', tf.int64),
        ('masked_lm_weights', tf.float32),
    )
})
# One int64 next-sentence label per example.
feature_description['next_sentence_labels'] = tf.io.FixedLenFeature([1], tf.int64)

# keys = feature_description.keys()

In [27]:
def _parse_function(example_proto):
    """Decode one serialized ``tf.train.Example``.

    Args:
        example_proto: The serialized example to parse.

    Returns:
        The result of ``tf.io.parse_single_example`` using the module-level
        ``feature_description`` schema.
    """
    parsed_record = tf.io.parse_single_example(example_proto, feature_description)
    return parsed_record

def _select_data_from_record(record):
    """Filter out features to use for pretraining."""
    x = {
        'input_ids': record['input_ids'],
        'input_mask': record['input_mask'],
        'segment_ids': record['segment_ids'],
        'masked_lm_positions': record['masked_lm_positions'],
        'masked_lm_ids': record['masked_lm_ids'],
        'masked_lm_weights': record['masked_lm_weights'],
    }
    if use_next_sentence_label:
        x['next_sentence_labels'] = record['next_sentence_labels']
    if use_position_id:
        x['position_ids'] = record['position_ids']

    # TODO(hongkuny): Remove the fake labels after migrating bert pretraining.
    if output_fake_labels:
        return (x, record['masked_lm_weights'])
    else:
        return x


In [28]:

# NOTE(review): presumably the shuffle buffer size; it is not referenced by any
# cell visible here -- confirm where it is meant to be used.
BUFFER_SIZE = 672

# The global batch size is the per-replica batch size multiplied by the number
# of replicas the distribution strategy keeps in sync.
BATCH_SIZE_PER_REPLICA = 32
GLOBAL_BATCH_SIZE = BATCH_SIZE_PER_REPLICA * strategy.num_replicas_in_sync

EPOCHS = 10

# Flags read by _select_data_from_record when it is mapped over the dataset.
use_next_sentence_label = True
output_fake_labels = True
use_position_id = False

batch_size = 8  # number of examples per batch; used by dataset.batch(batch_size)
lr = 1e-3  # learning-rate setting; presumably intended for the optimizer -- verify usage

# TFRecord files that make up the training data.
filenames = ['./Test_Examples.tfrecords']

In [14]:

with strategy.scope():

    # Turn the file names into a dataset and read the TFRecord files through
    # interleave. AUTOTUNE is the named constant for the -1 that was
    # hard-coded here before.
    train_dataset = tf.data.Dataset.from_tensor_slices(filenames)
    train_dataset = train_dataset.interleave(
        tf.data.TFRecordDataset,
        cycle_length=tf.data.experimental.AUTOTUNE)

    dataset_inputs = train_dataset.map(_parse_function)  # serialized string -> parsed feature dict
    dataset_inputs_with_labels = dataset_inputs.map(_select_data_from_record)  # parsed record -> model input data
    ## Normally the parsed records could be used as they are, but this data has no labels,
    ## so this mapping function attaches a fake target (fake_y) of length max_predictions_per_seq.

    dataset = dataset_inputs_with_labels
    dataset = dataset.cache()
    dataset = dataset.shuffle(10000, reshuffle_each_iteration=True)
    # NOTE(review): this batches with `batch_size`; confirm whether the
    # distributed run should use GLOBAL_BATCH_SIZE instead.
    dataset = dataset.batch(batch_size)
    dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)

    # Distribute the pipeline built above. The previous argument `a` was never
    # defined and raised NameError.
    train_dist_dataset = strategy.experimental_distribute_dataset(dataset)


## Model Define

In [10]:
# Build the pretraining model and its encoder inside the strategy scope.
with strategy.scope():
    model, sub_model = models.get_bert_models_fn(
        vocab_size,
        hidden_size,
        type_vocab_size,
        num_layers,
        num_attention_heads,
        max_seq_length,
        max_predictions_per_seq,
        dropout_rate,
        inner_dim,
        initializer,
    )

NameError: name 'strategy' is not defined

## Training Config

ValueError: in user code:

    C:\Users\LGCNS\AppData\Roaming\Python\Python37\site-packages\tensorflow\python\keras\engine\training.py:571 train_function  *
        outputs = self.distribute_strategy.run(
    C:\Users\LGCNS\AppData\Roaming\Python\Python37\site-packages\tensorflow\python\distribute\distribute_lib.py:951 run  **
        return self._extended.call_for_each_replica(fn, args=args, kwargs=kwargs)
    C:\Users\LGCNS\AppData\Roaming\Python\Python37\site-packages\tensorflow\python\distribute\distribute_lib.py:2290 call_for_each_replica
        return self._call_for_each_replica(fn, args, kwargs)
    C:\Users\LGCNS\AppData\Roaming\Python\Python37\site-packages\tensorflow\python\distribute\distribute_lib.py:2649 _call_for_each_replica
        return fn(*args, **kwargs)
    C:\Users\LGCNS\AppData\Roaming\Python\Python37\site-packages\tensorflow\python\keras\engine\training.py:541 train_step  **
        self.trainable_variables)
    C:\Users\LGCNS\AppData\Roaming\Python\Python37\site-packages\tensorflow\python\keras\engine\training.py:1804 _minimize
        trainable_variables))
    C:\Users\LGCNS\AppData\Roaming\Python\Python37\site-packages\tensorflow\python\keras\optimizer_v2\optimizer_v2.py:521 _aggregate_gradients
        filtered_grads_and_vars = _filter_grads(grads_and_vars)
    C:\Users\LGCNS\AppData\Roaming\Python\Python37\site-packages\tensorflow\python\keras\optimizer_v2\optimizer_v2.py:1219 _filter_grads
        ([v.name for _, v in grads_and_vars],))

    ValueError: No gradients provided for any variable: ['embedding_2/embeddings:0', 'position_embedding_1/embeddings:0', 'embedding_3/embeddings:0', 'layer_normalization_25/gamma:0', 'layer_normalization_25/beta:0', 'multi_head_attention_12/dense_72/kernel:0', 'multi_head_attention_12/dense_72/bias:0', 'multi_head_attention_12/dense_73/kernel:0', 'multi_head_attention_12/dense_73/bias:0', 'multi_head_attention_12/dense_74/kernel:0', 'multi_head_attention_12/dense_74/bias:0', 'multi_head_attention_12/dense_75/kernel:0', 'multi_head_attention_12/dense_75/bias:0', 'layer_normalization_26/gamma:0', 'layer_normalization_26/beta:0', 'dense_76/kernel:0', 'dense_76/bias:0', 'dense_77/kernel:0', 'dense_77/bias:0', 'layer_normalization_27/gamma:0', 'layer_normalization_27/beta:0', 'multi_head_attention_13/dense_78/kernel:0', 'multi_head_attention_13/dense_78/bias:0', 'multi_head_attention_13/dense_79/kernel:0', 'multi_head_attention_13/dense_79/bias:0', 'multi_head_attention_13/dense_80/kernel:0', 'multi_head_attention_13/dense_80/bias:0', 'multi_head_attention_13/dense_81/kernel:0', 'multi_head_attention_13/dense_81/bias:0', 'layer_normalization_28/gamma:0', 'layer_normalization_28/beta:0', 'dense_82/kernel:0', 'dense_82/bias:0', 'dense_83/kernel:0', 'dense_83/bias:0', 'layer_normalization_29/gamma:0', 'layer_normalization_29/beta:0', 'multi_head_attention_14/dense_84/kernel:0', 'multi_head_attention_14/dense_84/bias:0', 'multi_head_attention_14/dense_85/kernel:0', 'multi_head_attention_14/dense_85/bias:0', 'multi_head_attention_14/dense_86/kernel:0', 'multi_head_attention_14/dense_86/bias:0', 'multi_head_attention_14/dense_87/kernel:0', 'multi_head_attention_14/dense_87/bias:0', 'layer_normalization_30/gamma:0', 'layer_normalization_30/beta:0', 'dense_88/kernel:0', 'dense_88/bias:0', 'dense_89/kernel:0', 'dense_89/bias:0', 'layer_normalization_31/gamma:0', 'layer_normalization_31/beta:0', 'multi_head_attention_15/dense_90/kernel:0', 
'multi_head_attention_15/dense_90/bias:0', 'multi_head_attention_15/dense_91/kernel:0', 'multi_head_attention_15/dense_91/bias:0', 'multi_head_attention_15/dense_92/kernel:0', 'multi_head_attention_15/dense_92/bias:0', 'multi_head_attention_15/dense_93/kernel:0', 'multi_head_attention_15/dense_93/bias:0', 'layer_normalization_32/gamma:0', 'layer_normalization_32/beta:0', 'dense_94/kernel:0', 'dense_94/bias:0', 'dense_95/kernel:0', 'dense_95/bias:0', 'layer_normalization_33/gamma:0', 'layer_normalization_33/beta:0', 'multi_head_attention_16/dense_96/kernel:0', 'multi_head_attention_16/dense_96/bias:0', 'multi_head_attention_16/dense_97/kernel:0', 'multi_head_attention_16/dense_97/bias:0', 'multi_head_attention_16/dense_98/kernel:0', 'multi_head_attention_16/dense_98/bias:0', 'multi_head_attention_16/dense_99/kernel:0', 'multi_head_attention_16/dense_99/bias:0', 'layer_normalization_34/gamma:0', 'layer_normalization_34/beta:0', 'dense_100/kernel:0', 'dense_100/bias:0', 'dense_101/kernel:0', 'dense_101/bias:0', 'layer_normalization_35/gamma:0', 'layer_normalization_35/beta:0', 'multi_head_attention_17/dense_102/kernel:0', 'multi_head_attention_17/dense_102/bias:0', 'multi_head_attention_17/dense_103/kernel:0', 'multi_head_attention_17/dense_103/bias:0', 'multi_head_attention_17/dense_104/kernel:0', 'multi_head_attention_17/dense_104/bias:0', 'multi_head_attention_17/dense_105/kernel:0', 'multi_head_attention_17/dense_105/bias:0', 'layer_normalization_36/gamma:0', 'layer_normalization_36/beta:0', 'dense_106/kernel:0', 'dense_106/bias:0', 'dense_107/kernel:0', 'dense_107/bias:0', 'layer_normalization_37/gamma:0', 'layer_normalization_37/beta:0', 'multi_head_attention_18/dense_108/kernel:0', 'multi_head_attention_18/dense_108/bias:0', 'multi_head_attention_18/dense_109/kernel:0', 'multi_head_attention_18/dense_109/bias:0', 'multi_head_attention_18/dense_110/kernel:0', 'multi_head_attention_18/dense_110/bias:0', 'multi_head_attention_18/dense_111/kernel:0', 
'multi_head_attention_18/dense_111/bias:0', 'layer_normalization_38/gamma:0', 'layer_normalization_38/beta:0', 'dense_112/kernel:0', 'dense_112/bias:0', 'dense_113/kernel:0', 'dense_113/bias:0', 'layer_normalization_39/gamma:0', 'layer_normalization_39/beta:0', 'multi_head_attention_19/dense_114/kernel:0', 'multi_head_attention_19/dense_114/bias:0', 'multi_head_attention_19/dense_115/kernel:0', 'multi_head_attention_19/dense_115/bias:0', 'multi_head_attention_19/dense_116/kernel:0', 'multi_head_attention_19/dense_116/bias:0', 'multi_head_attention_19/dense_117/kernel:0', 'multi_head_attention_19/dense_117/bias:0', 'layer_normalization_40/gamma:0', 'layer_normalization_40/beta:0', 'dense_118/kernel:0', 'dense_118/bias:0', 'dense_119/kernel:0', 'dense_119/bias:0', 'layer_normalization_41/gamma:0', 'layer_normalization_41/beta:0', 'multi_head_attention_20/dense_120/kernel:0', 'multi_head_attention_20/dense_120/bias:0', 'multi_head_attention_20/dense_121/kernel:0', 'multi_head_attention_20/dense_121/bias:0', 'multi_head_attention_20/dense_122/kernel:0', 'multi_head_attention_20/dense_122/bias:0', 'multi_head_attention_20/dense_123/kernel:0', 'multi_head_attention_20/dense_123/bias:0', 'layer_normalization_42/gamma:0', 'layer_normalization_42/beta:0', 'dense_124/kernel:0', 'dense_124/bias:0', 'dense_125/kernel:0', 'dense_125/bias:0', 'layer_normalization_43/gamma:0', 'layer_normalization_43/beta:0', 'multi_head_attention_21/dense_126/kernel:0', 'multi_head_attention_21/dense_126/bias:0', 'multi_head_attention_21/dense_127/kernel:0', 'multi_head_attention_21/dense_127/bias:0', 'multi_head_attention_21/dense_128/kernel:0', 'multi_head_attention_21/dense_128/bias:0', 'multi_head_attention_21/dense_129/kernel:0', 'multi_head_attention_21/dense_129/bias:0', 'layer_normalization_44/gamma:0', 'layer_normalization_44/beta:0', 'dense_130/kernel:0', 'dense_130/bias:0', 'dense_131/kernel:0', 'dense_131/bias:0', 'layer_normalization_45/gamma:0', 'layer_normalization_45/beta:0', 
'multi_head_attention_22/dense_132/kernel:0', 'multi_head_attention_22/dense_132/bias:0', 'multi_head_attention_22/dense_133/kernel:0', 'multi_head_attention_22/dense_133/bias:0', 'multi_head_attention_22/dense_134/kernel:0', 'multi_head_attention_22/dense_134/bias:0', 'multi_head_attention_22/dense_135/kernel:0', 'multi_head_attention_22/dense_135/bias:0', 'layer_normalization_46/gamma:0', 'layer_normalization_46/beta:0', 'dense_136/kernel:0', 'dense_136/bias:0', 'dense_137/kernel:0', 'dense_137/bias:0', 'layer_normalization_47/gamma:0', 'layer_normalization_47/beta:0', 'multi_head_attention_23/dense_138/kernel:0', 'multi_head_attention_23/dense_138/bias:0', 'multi_head_attention_23/dense_139/kernel:0', 'multi_head_attention_23/dense_139/bias:0', 'multi_head_attention_23/dense_140/kernel:0', 'multi_head_attention_23/dense_140/bias:0', 'multi_head_attention_23/dense_141/kernel:0', 'multi_head_attention_23/dense_141/bias:0', 'layer_normalization_48/gamma:0', 'layer_normalization_48/beta:0', 'dense_142/kernel:0', 'dense_142/bias:0', 'dense_143/kernel:0', 'dense_143/bias:0', 'layer_normalization_49/gamma:0', 'layer_normalization_49/beta:0', 'pooler_layer_1/kernel:0', 'pooler_layer_1/bias:0', 'transform/bias:0', 'lm_layer_1/transform/dense/kernel:0', 'lm_layer_1/transform/dense/bias:0', 'lm_layer_1/transform/LayerNorm/gamma:0', 'lm_layer_1/transform/LayerNorm/beta:0', 'predictions/transform/logits_1/kernel:0', 'predictions/transform/logits_1/bias:0'].
